diff --git a/.licenserc.yaml b/.licenserc.yaml index 6e2b465590..804e6ef5ec 100644 --- a/.licenserc.yaml +++ b/.licenserc.yaml @@ -21,17 +21,28 @@ header: copyright-owner: Apache Software Foundation paths-ignore: - - '**/*.csv' - - '**/*.txt' - - '**/*.md' - - '**/*.pem' - - '**/*.sha256' - - '**/Cargo.lock' - - '**/target/**' - - '.gitattributes' - - '.github/ISSUE_TEMPLATE/**' - - '.gitmodules' - - 'DISCLAIMER' + # License and notice files + - 'licenses' - 'LICENSE' - 'NOTICE' - - 'licenses' + - 'DISCLAIMER' + + # Documentation and configuration files + - '**/*.md' + - '**/Cargo.lock' + - 'rust-toolchain' + - '**/*.lds' + + # Cryptographic and binary files for testing + - '**/*.pem' + - '**/*.key' + - '**/*.crt' + - '**/*.rsa' + - '**/*.der' + + # Third-party libraries - included in licenses directory + - 'ring-0.17.14' # LICENSE-ring.txt + +dependency: + files: + - 'Cargo.toml' \ No newline at end of file diff --git a/LICENSE b/LICENSE index 46ae475fa3..570d0c9ac8 100644 --- a/LICENSE +++ b/LICENSE @@ -206,3 +206,8 @@ This product bundles various third-party components under other open source licenses. This section summarizes those components and their licenses. See licenses/ for text of these licenses. + +Third-party components included in this distribution: + +- ring-0.17.14/ + License: See licenses/LICENSE-ring.txt \ No newline at end of file diff --git a/licenses/LICENSE-BoringSSL b/licenses/LICENSE-BoringSSL new file mode 100644 index 0000000000..8e6eb34e5b --- /dev/null +++ b/licenses/LICENSE-BoringSSL @@ -0,0 +1,272 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. 
For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + + +Licenses for support code +------------------------- + +Parts of the TLS test suite are under the Go license. This code is not included +in BoringSSL (i.e. libcrypto and libssl) when compiled, however, so +distributing code linked against BoringSSL does not trigger this license: + +Copyright (c) 2009 The Go Authors. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. 
nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +BoringSSL uses the Chromium test infrastructure to run a continuous build, +trybots etc. The scripts which manage this, and the script for generating build +metadata, are under the Chromium license. Distributing code linked against +BoringSSL does not trigger this license. + +Copyright 2015 The Chromium Authors. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file diff --git a/licenses/LICENSE-other-bits b/licenses/LICENSE-other-bits new file mode 100644 index 0000000000..20cfadb6ee --- /dev/null +++ b/licenses/LICENSE-other-bits @@ -0,0 +1,13 @@ +Copyright 2015-2025 Brian Smith. + +Permission to use, copy, modify, and/or distribute this software for any +purpose with or without fee is hereby granted, provided that the above +copyright notice and this permission notice appear in all copies. + +THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. \ No newline at end of file diff --git a/licenses/LICENSE-ring.txt b/licenses/LICENSE-ring.txt new file mode 100644 index 0000000000..963c394c44 --- /dev/null +++ b/licenses/LICENSE-ring.txt @@ -0,0 +1,9 @@ +*ring* uses an "ISC" license, like BoringSSL used to use, for new code +files. See LICENSE-other-bits for the text of that license. + +See LICENSE-BoringSSL for code that was sourced from BoringSSL under the +Apache 2.0 license. Some code that was sourced from BoringSSL under the ISC +license. In each case, the license info is at the top of the file. + +See src/polyfill/once_cell/LICENSE-APACHE and src/polyfill/once_cell/LICENSE-MIT +for the license to code that was sourced from the once_cell project. \ No newline at end of file diff --git a/ring-0.17.14/.cargo_vcs_info.json b/ring-0.17.14/.cargo_vcs_info.json new file mode 100644 index 0000000000..9361bf9c60 --- /dev/null +++ b/ring-0.17.14/.cargo_vcs_info.json @@ -0,0 +1,7 @@ +{ + "git": { + "sha1": "2723abbca9e83347d82b056d5b239c6604f786df", + "dirty": true + }, + "path_in_vcs": "" +} \ No newline at end of file diff --git a/ring-0.17.14/Cargo.toml b/ring-0.17.14/Cargo.toml new file mode 100644 index 0000000000..273383ece7 --- /dev/null +++ b/ring-0.17.14/Cargo.toml @@ -0,0 +1,231 @@ +[package] +build = "build.rs" +categories = ["cryptography", "no-std"] +description = "An experiment." +edition = "2021" +keywords = ["crypto", "cryptography", "rand", "ECC", "RSA"] +license = "Apache-2.0 AND ISC" +name = "ring" +repository = "https://github.com/briansmith/ring" + +# Keep in sync with .github/workflows/ci.yml ("MSRV") and see the MSRV note +# in cpu/arm.rs. +# 1.66 is required on x86/x86_64 for https://github.com/rust-lang/rust/pull/101861. +rust-version = "1.66.0" + +# Keep in sync with `links` below. +version = "0.17.14" + +# Keep in sync with `version` above. +# +# build.rs verifies that this equals "ring_core_{major}_{minor}_{patch}_{pre}" +# as keeping this in sync with the symbol prefixing is crucial for ensuring +# the safety of multiple versions of *ring* being used in a program. 
+links = "ring_core_0_17_14_" + +include = [ + "LICENSE", + "LICENSE-other-bits", + "LICENSE-BoringSSL", + "src/polyfill/once_cell/LICENSE-APACHE", + "src/polyfill/once_cell/LICENSE-MIT", + + "Cargo.toml", + + "pregenerated/*", + + "benches/*.rs", + "build.rs", + + "crypto/chacha/asm/chacha-armv4.pl", + "crypto/chacha/asm/chacha-armv8.pl", + "crypto/chacha/asm/chacha-x86.pl", + "crypto/chacha/asm/chacha-x86_64.pl", + "crypto/constant_time_test.c", + "crypto/cpu_intel.c", + "crypto/crypto.c", + "crypto/curve25519/asm/x25519-asm-arm.S", + "crypto/curve25519/curve25519.c", + "crypto/curve25519/curve25519_64_adx.c", + "crypto/curve25519/curve25519_tables.h", + "crypto/curve25519/internal.h", + "crypto/fipsmodule/aes/aes_nohw.c", + "crypto/fipsmodule/aes/asm/aes-gcm-avx2-x86_64.pl", + "crypto/fipsmodule/aes/asm/aesni-x86.pl", + "crypto/fipsmodule/aes/asm/aesni-gcm-x86_64.pl", + "crypto/fipsmodule/aes/asm/aesni-x86_64.pl", + "crypto/fipsmodule/aes/asm/aesv8-armx.pl", + "crypto/fipsmodule/aes/asm/aesv8-gcm-armv8.pl", + "crypto/fipsmodule/aes/asm/ghash-armv4.pl", + "crypto/fipsmodule/aes/asm/ghash-neon-armv8.pl", + "crypto/fipsmodule/aes/asm/ghash-x86.pl", + "crypto/fipsmodule/aes/asm/ghash-x86_64.pl", + "crypto/fipsmodule/aes/asm/ghashv8-armx.pl", + "crypto/fipsmodule/aes/asm/bsaes-armv7.pl", + "crypto/fipsmodule/aes/asm/bsaes-x86_64.pl", + "crypto/fipsmodule/aes/asm/vsaes-armv7.pl", + "crypto/fipsmodule/aes/asm/vpaes-armv7.pl", + "crypto/fipsmodule/aes/asm/vpaes-armv8.pl", + "crypto/fipsmodule/aes/asm/vpaes-x86.pl", + "crypto/fipsmodule/aes/asm/vpaes-x86_64.pl", + "crypto/fipsmodule/bn/asm/armv4-mont.pl", + "crypto/fipsmodule/bn/asm/armv8-mont.pl", + "crypto/fipsmodule/bn/asm/x86-mont.pl", + "crypto/fipsmodule/bn/asm/x86_64-mont.pl", + "crypto/fipsmodule/bn/asm/x86_64-mont5.pl", + "crypto/fipsmodule/bn/internal.h", + "crypto/fipsmodule/bn/montgomery.c", + "crypto/fipsmodule/bn/montgomery_inv.c", + "crypto/fipsmodule/ec/asm/p256-armv8-asm.pl", + "crypto/fipsmodule/ec/asm/p256-x86_64-asm.pl", + "crypto/fipsmodule/ec/ecp_nistz.c", + "crypto/fipsmodule/ec/ecp_nistz.h", + "crypto/fipsmodule/ec/ecp_nistz384.h", + "crypto/fipsmodule/ec/ecp_nistz384.inl", + "crypto/fipsmodule/ec/gfp_p256.c", + "crypto/fipsmodule/ec/gfp_p384.c", + "crypto/fipsmodule/ec/p256.c", + "crypto/fipsmodule/ec/p256-nistz-table.h", + "crypto/fipsmodule/ec/p256-nistz.c", + "crypto/fipsmodule/ec/p256-nistz.h", + "crypto/fipsmodule/ec/p256_shared.h", + "crypto/fipsmodule/ec/p256_table.h", + "crypto/fipsmodule/ec/util.h", + "crypto/fipsmodule/ecdsa/ecdsa_verify_tests.txt", + "crypto/fipsmodule/sha/asm/sha256-armv4.pl", + "crypto/fipsmodule/sha/asm/sha512-armv4.pl", + "crypto/fipsmodule/sha/asm/sha512-armv8.pl", + "crypto/fipsmodule/sha/asm/sha512-x86_64.pl", + "crypto/internal.h", + "crypto/limbs/limbs.c", + "crypto/limbs/limbs.h", + "crypto/limbs/limbs.inl", + "crypto/mem.c", + "crypto/perlasm/arm-xlate.pl", + "crypto/perlasm/x86asm.pl", + "crypto/perlasm/x86gas.pl", + "crypto/perlasm/x86nasm.pl", + "crypto/perlasm/x86_64-xlate.pl", + "crypto/poly1305/poly1305.c", + "crypto/poly1305/poly1305_arm.c", + "crypto/poly1305/poly1305_arm_asm.S", + "crypto/cipher/asm/chacha20_poly1305_armv8.pl", + "crypto/cipher/asm/chacha20_poly1305_x86_64.pl", + "examples/**/*.rs", + "include/ring-core/aes.h", + "include/ring-core/asm_base.h", + "include/ring-core/base.h", + "include/ring-core/check.h", + "include/ring-core/mem.h", + "include/ring-core/target.h", + "include/ring-core/type_check.h", + "src/**/*.rs", + "src/aead/poly1305_test.txt", + 
"src/data/alg-rsa-encryption.der", + "src/ec/curve25519/ed25519/ed25519_pkcs8_v2_template.der", + "src/ec/suite_b/ecdsa/ecPublicKey_p256_pkcs8_v1_template.der", + "src/ec/suite_b/ecdsa/ecPublicKey_p384_pkcs8_v1_template.der", + "src/rsa/signature_rsa_example_private_key.der", + "src/rsa/signature_rsa_example_public_key.der", + "tests/**/*.rs", + "tests/ecdsa_test_private_key_p256.p8", + "tests/ecdsa_test_public_key_p256.der", + "tests/ecdsa_test_public_key_p256_debug.txt", + "tests/ed25519_test_private_key.bin", + "tests/ed25519_test_private_key.p8", + "tests/ed25519_test_public_key.bin", + "tests/ed25519_test_public_key.der", + "tests/rsa_test_private_key_2048.p8", + "tests/rsa_test_public_key_2048.der", + "tests/rsa_test_public_key_2048_debug.txt", + "tests/rsa_test_public_modulus.bin", + "third_party/fiat/asm/fiat_curve25519_adx_mul.S", + "third_party/fiat/asm/fiat_curve25519_adx_square.S", + "third_party/fiat/curve25519_32.h", + "third_party/fiat/curve25519_64.h", + "third_party/fiat/curve25519_64_adx.h", + "third_party/fiat/curve25519_64_msvc.h", + "third_party/fiat/p256_32.h", + "third_party/fiat/p256_64.h", + "third_party/fiat/p256_64_msvc.h", + "third_party/fiat/LICENSE", +] + +[package.metadata.docs.rs] +all-features = true + +[lib] +name = "ring" + +[dependencies] +cfg-if = { version = "1.0.0", default-features = false } +untrusted = { version = "0.9" } + +[target.'cfg(not(target_os = "optee"))'.dependencies] +getrandom = { version = "0.2.10" } + +[target.'cfg(all(any(all(target_arch = "aarch64", target_endian = "little"), all(target_arch = "arm", target_endian = "little")), any(target_os = "android", target_os = "linux")))'.dependencies] +libc = { version = "0.2.148", default-features = false } + +[target.'cfg(all(all(target_arch = "aarch64", target_endian = "little"), target_vendor = "apple", any(target_os = "ios", target_os = "macos", target_os = "tvos", target_os = "visionos", target_os = "watchos")))'.dependencies] +libc = { version = "0.2.155", default-features = false } + +[target.'cfg(all(all(target_arch = "aarch64", target_endian = "little"), target_os = "windows"))'.dependencies] +windows-sys = { version = "0.52", features = ["Win32_Foundation", "Win32_System_Threading"] } + +[target.'cfg(target_os="optee")'.dependencies] +optee-utee = { version = "0.4.0" } + +[target.'cfg(all(target_arch = "wasm32", target_os = "unknown"))'.dev-dependencies] +wasm-bindgen-test = { version = "0.3.37", default-features = false, features = ["std"] } + +[target.'cfg(any(unix, windows, target_os = "wasi"))'.dev-dependencies] +libc = { version = "0.2.148", default-features = false } + +[build-dependencies] +cc = { version = "1.2.8", default-features = false } + +[features] +# These features are documented in the top-level module's documentation. 
+default = ["alloc", "dev_urandom_fallback"] +alloc = [] +dev_urandom_fallback = [] +less-safe-getrandom-custom-or-rdrand = [] +less-safe-getrandom-espidf = [] +slow_tests = [] +std = ["alloc"] +unstable-testing-arm-no-hw = [] +unstable-testing-arm-no-neon = [] +test_logging = [] +wasm32_unknown_unknown_js = ["getrandom/js"] + +# XXX: debug = false because of https://github.com/rust-lang/rust/issues/34122 + +[profile.bench] +opt-level = 3 +debug = false +rpath = false +lto = true +debug-assertions = false +codegen-units = 1 + +[profile.release] +opt-level = 3 +debug = false +rpath = false +lto = true +debug-assertions = false +codegen-units = 1 + +[workspace] +members = [ + # intentionally not a default member so that `cargo test` doesn't cause criterion.rs and all its + # dependencies to get built. + "bench", + + "cavp", +] +default-members = [ + ".", + "cavp" +] diff --git a/ring-0.17.14/LICENSE b/ring-0.17.14/LICENSE new file mode 100644 index 0000000000..2dac4d9d3f --- /dev/null +++ b/ring-0.17.14/LICENSE @@ -0,0 +1,9 @@ +*ring* uses an "ISC" license, like BoringSSL used to use, for new code +files. See LICENSE-other-bits for the text of that license. + +See LICENSE-BoringSSL for code that was sourced from BoringSSL under the +Apache 2.0 license. Some code that was sourced from BoringSSL under the ISC +license. In each case, the license info is at the top of the file. + +See src/polyfill/once_cell/LICENSE-APACHE and src/polyfill/once_cell/LICENSE-MIT +for the license to code that was sourced from the once_cell project. diff --git a/ring-0.17.14/LICENSE-BoringSSL b/ring-0.17.14/LICENSE-BoringSSL new file mode 100644 index 0000000000..a0f82a1f8e --- /dev/null +++ b/ring-0.17.14/LICENSE-BoringSSL @@ -0,0 +1,272 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). 
+ + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + + +Licenses for support code +------------------------- + +Parts of the TLS test suite are under the Go license. This code is not included +in BoringSSL (i.e. libcrypto and libssl) when compiled, however, so +distributing code linked against BoringSSL does not trigger this license: + +Copyright (c) 2009 The Go Authors. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. 
nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +BoringSSL uses the Chromium test infrastructure to run a continuous build, +trybots etc. The scripts which manage this, and the script for generating build +metadata, are under the Chromium license. Distributing code linked against +BoringSSL does not trigger this license. + +Copyright 2015 The Chromium Authors. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/ring-0.17.14/LICENSE-other-bits b/ring-0.17.14/LICENSE-other-bits new file mode 100644 index 0000000000..bccf030aec --- /dev/null +++ b/ring-0.17.14/LICENSE-other-bits @@ -0,0 +1,13 @@ +Copyright 2015-2025 Brian Smith. + +Permission to use, copy, modify, and/or distribute this software for any +purpose with or without fee is hereby granted, provided that the above +copyright notice and this permission notice appear in all copies. + +THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. diff --git a/ring-0.17.14/README.md b/ring-0.17.14/README.md new file mode 100644 index 0000000000..7ae6f6157e --- /dev/null +++ b/ring-0.17.14/README.md @@ -0,0 +1,54 @@ +THE SOFTWARE IS PROVIDED "AS IS" AND BRIAN SMITH AND THE AUTHORS DISCLAIM +ALL WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES +OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL BRIAN SMITH OR THE AUTHORS +BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY +DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN +AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + + +Most of the C and assembly language code in *ring* comes from BoringSSL. +BoringSSL is a fork of OpenSSL. This quote from the BoringSSL README.md +discouraging you from using it applies to this project: + +> BoringSSL is a fork of OpenSSL that is designed to meet Google's needs. +> +> Although BoringSSL is an open source project, it is not intended for general +> use, as OpenSSL is. We don't recommend that third parties depend upon it. + +This project was originally shared on GitHub in 2015 as an experiment. It was +put on crates.io shortly to help other people with their experiments. It is an +experiment. + + +Side Channels +------------- + +See [SIDE-CHANNELS.md](SIDE-CHANNELS.md) for important information regarding +the limitations of the side channel mitigations in this project. + + +Toolchains & Targets +-------------------- + +Be especially weary about using toolchains (C compilers, etc.) or targets +that aren't supported by other projects, especially BoringSSL. The further you +are from using the same version of Clang that Chrome uses, the more weary you +should be. + + +Bug Reporting +------------- + +For security vulnerabilities, see https://github.com/briansmith/ring/security/policy. + +Please report bugs that aren't security vulnerabilities either as pull requests or as issues in +[the issue tracker](https://github.com/briansmith/ring/issues). + + + +Release Notes +------------- +It is recommended that you review every commit in this project. Some +particularly noteworthy changes are noted in the [RELEASES.md](RELEASES.md). We could use some +help in making this better. diff --git a/ring-0.17.14/build.rs b/ring-0.17.14/build.rs new file mode 100644 index 0000000000..2441037315 --- /dev/null +++ b/ring-0.17.14/build.rs @@ -0,0 +1,1045 @@ +// Copyright 2015-2016 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +//! Build the non-Rust components. + +// It seems like it would be a good idea to use `log!` for logging, but it +// isn't worth having the external dependencies (one for the `log` crate, and +// another for the concrete logging implementation). Instead we use `eprintln!` +// to log everything to stderr. + +use std::{ + ffi::{OsStr, OsString}, + fs::{self, DirEntry}, + io::Write, + path::{Path, PathBuf}, + process::{Command, Stdio}, +}; + +mod env { + use std::ffi::OsString; + + /// Read an environment variable and tell Cargo that we depend on it. + /// + /// The name is static since we intend to only read a static set of environment + /// variables. + pub fn var_os(name: &'static str) -> Option { + println!("cargo:rerun-if-env-changed={}", name); + std::env::var_os(name) + } + + pub fn var(name: &'static str) -> Option { + var_os(name).and_then(|value| value.into_string().ok()) + } +} + +const X86: &str = "x86"; +const X86_64: &str = "x86_64"; +const AARCH64: &str = "aarch64"; +const ARM: &str = "arm"; +const WASM32: &str = "wasm32"; + +#[rustfmt::skip] +const RING_SRCS: &[(&[&str], &str)] = &[ + (&[], "crypto/curve25519/curve25519.c"), + (&[], "crypto/fipsmodule/aes/aes_nohw.c"), + (&[], "crypto/fipsmodule/bn/montgomery.c"), + (&[], "crypto/fipsmodule/bn/montgomery_inv.c"), + (&[], "crypto/fipsmodule/ec/ecp_nistz.c"), + (&[], "crypto/fipsmodule/ec/gfp_p256.c"), + (&[], "crypto/fipsmodule/ec/gfp_p384.c"), + (&[], "crypto/fipsmodule/ec/p256.c"), + (&[], "crypto/limbs/limbs.c"), + (&[], "crypto/mem.c"), + (&[], "crypto/poly1305/poly1305.c"), + + (&[ARM, X86_64, X86], "crypto/crypto.c"), + + (&[X86_64, X86], "crypto/cpu_intel.c"), + + (&[X86], "crypto/fipsmodule/aes/asm/aesni-x86.pl"), + (&[X86], "crypto/fipsmodule/aes/asm/ghash-x86.pl"), + (&[X86], "crypto/fipsmodule/aes/asm/vpaes-x86.pl"), + (&[X86], "crypto/fipsmodule/bn/asm/x86-mont.pl"), + (&[X86], "crypto/chacha/asm/chacha-x86.pl"), + + (&[X86_64], "crypto/chacha/asm/chacha-x86_64.pl"), + (&[X86_64], "crypto/curve25519/curve25519_64_adx.c"), + (&[X86_64], "crypto/fipsmodule/aes/asm/aes-gcm-avx2-x86_64.pl"), + (&[X86_64], "crypto/fipsmodule/aes/asm/aesni-gcm-x86_64.pl"), + (&[X86_64], "crypto/fipsmodule/aes/asm/aesni-x86_64.pl"), + (&[X86_64], "crypto/fipsmodule/aes/asm/ghash-x86_64.pl"), + (&[X86_64], "crypto/fipsmodule/aes/asm/vpaes-x86_64.pl"), + (&[X86_64], "crypto/fipsmodule/bn/asm/x86_64-mont.pl"), + (&[X86_64], "crypto/fipsmodule/bn/asm/x86_64-mont5.pl"), + (&[X86_64], "crypto/fipsmodule/ec/asm/p256-x86_64-asm.pl"), + (&[X86_64], SHA512_X86_64), + (&[X86_64], "crypto/cipher/asm/chacha20_poly1305_x86_64.pl"), + (&[X86_64], "third_party/fiat/asm/fiat_curve25519_adx_mul.S"), + (&[X86_64], "third_party/fiat/asm/fiat_curve25519_adx_square.S"), + + (&[AARCH64, X86_64], "crypto/fipsmodule/ec/p256-nistz.c"), + + (&[ARM], "crypto/fipsmodule/aes/asm/bsaes-armv7.pl"), + (&[ARM], "crypto/fipsmodule/aes/asm/ghash-armv4.pl"), + (&[ARM], "crypto/fipsmodule/aes/asm/vpaes-armv7.pl"), + (&[ARM], "crypto/fipsmodule/bn/asm/armv4-mont.pl"), + (&[ARM], "crypto/chacha/asm/chacha-armv4.pl"), + (&[ARM], "crypto/curve25519/asm/x25519-asm-arm.S"), + (&[ARM], "crypto/poly1305/poly1305_arm.c"), + (&[ARM], 
"crypto/poly1305/poly1305_arm_asm.S"), + (&[ARM], "crypto/fipsmodule/sha/asm/sha256-armv4.pl"), + (&[ARM], "crypto/fipsmodule/sha/asm/sha512-armv4.pl"), + + (&[AARCH64], "crypto/chacha/asm/chacha-armv8.pl"), + (&[AARCH64], "crypto/cipher/asm/chacha20_poly1305_armv8.pl"), + (&[AARCH64], "crypto/fipsmodule/aes/asm/aesv8-armx.pl"), + (&[AARCH64], "crypto/fipsmodule/aes/asm/aesv8-gcm-armv8.pl"), + (&[AARCH64], "crypto/fipsmodule/aes/asm/ghash-neon-armv8.pl"), + (&[AARCH64], "crypto/fipsmodule/aes/asm/ghashv8-armx.pl"), + (&[AARCH64], "crypto/fipsmodule/aes/asm/vpaes-armv8.pl"), + (&[AARCH64], "crypto/fipsmodule/bn/asm/armv8-mont.pl"), + (&[AARCH64], "crypto/fipsmodule/ec/asm/p256-armv8-asm.pl"), + (&[AARCH64], SHA512_ARMV8), +]; + +const SHA256_X86_64: &str = "crypto/fipsmodule/sha/asm/sha256-x86_64.pl"; +const SHA512_X86_64: &str = "crypto/fipsmodule/sha/asm/sha512-x86_64.pl"; + +const SHA256_ARMV8: &str = "crypto/fipsmodule/sha/asm/sha256-armv8.pl"; +const SHA512_ARMV8: &str = "crypto/fipsmodule/sha/asm/sha512-armv8.pl"; + +const RING_TEST_SRCS: &[&str] = &[("crypto/constant_time_test.c")]; + +const PREGENERATED: &str = "pregenerated"; + +fn cpp_flags(compiler: &cc::Tool) -> &'static [&'static str] { + if !compiler.is_like_msvc() { + static NON_MSVC_FLAGS: &[&str] = &[ + "-fvisibility=hidden", + "-std=c1x", // GCC 4.6 requires "c1x" instead of "c11" + "-Wall", + "-Wbad-function-cast", + "-Wcast-align", + "-Wcast-qual", + "-Wconversion", + "-Wmissing-field-initializers", + "-Wmissing-include-dirs", + "-Wnested-externs", + "-Wredundant-decls", + "-Wshadow", + "-Wsign-compare", + "-Wsign-conversion", + "-Wstrict-prototypes", + "-Wundef", + "-Wuninitialized", + ]; + NON_MSVC_FLAGS + } else { + static MSVC_FLAGS: &[&str] = &[ + "/Gy", // Enable function-level linking. + "/Zc:wchar_t", + "/Zc:forScope", + "/Zc:inline", + // Warnings. + "/Wall", + "/wd4127", // C4127: conditional expression is constant + "/wd4464", // C4464: relative include path contains '..' + "/wd4514", // C4514: : unreferenced inline function has be + "/wd4710", // C4710: function not inlined + "/wd4711", // C4711: function 'function' selected for inline expansion + "/wd4820", // C4820: : bytes padding added after + "/wd5045", /* C5045: Compiler will insert Spectre mitigation for memory load if + * /Qspectre switch specified */ + ]; + MSVC_FLAGS + } +} + +// None means "any OS" or "any target". The first match in sequence order is +// taken. +const ASM_TARGETS: &[AsmTarget] = &[ + AsmTarget { + oss: LINUX_ABI, + arch: AARCH64, + perlasm_format: "linux64", + }, + AsmTarget { + oss: LINUX_ABI, + arch: ARM, + perlasm_format: "linux32", + }, + AsmTarget { + oss: LINUX_ABI, + arch: X86, + perlasm_format: "elf", + }, + AsmTarget { + oss: LINUX_ABI, + arch: X86_64, + perlasm_format: "elf", + }, + AsmTarget { + oss: &["horizon"], + arch: ARM, + perlasm_format: "linux32", + }, + AsmTarget { + oss: APPLE_ABI, + arch: AARCH64, + perlasm_format: "ios64", + }, + AsmTarget { + oss: APPLE_ABI, + arch: X86_64, + perlasm_format: "macosx", + }, + AsmTarget { + oss: &[WINDOWS], + arch: X86, + perlasm_format: WIN32N, + }, + AsmTarget { + oss: &[WINDOWS], + arch: X86_64, + perlasm_format: NASM, + }, + AsmTarget { + oss: &[WINDOWS], + arch: AARCH64, + perlasm_format: "win64", + }, +]; + +struct AsmTarget { + /// Operating systems. + oss: &'static [&'static str], + + /// Architectures. + arch: &'static str, + + /// The PerlAsm format name. 
+ perlasm_format: &'static str, +} + +impl AsmTarget { + fn use_nasm(&self) -> bool { + [WIN32N, NASM].contains(&self.perlasm_format) + } +} + +/// Operating systems that have the same ABI as Linux on every architecture +/// mentioned in `ASM_TARGETS`. +const LINUX_ABI: &[&str] = &[ + "android", + "dragonfly", + "freebsd", + "fuchsia", + "haiku", + "hurd", + "illumos", + "netbsd", + "openbsd", + "linux", + "redox", + "solaris", + "optee", +]; + +const WIN32N: &str = "win32n"; +const NASM: &str = "nasm"; + +/// Operating systems that have the same ABI as macOS on every architecture +/// mentioned in `ASM_TARGETS`. +const APPLE_ABI: &[&str] = &["ios", "macos", "tvos", "visionos", "watchos"]; + +const WINDOWS: &str = "windows"; + +fn main() { + // Avoid assuming the working directory is the same is the $CARGO_MANIFEST_DIR so that toolchains + // which may assume other working directories can still build this code. + let c_root_dir = PathBuf::from( + env::var_os("CARGO_MANIFEST_DIR").expect("CARGO_MANIFEST_DIR should always be set"), + ); + + // Keep in sync with `core_name_and_version!` in prefixed.rs. + let core_name_and_version = [ + &env::var("CARGO_PKG_NAME").unwrap(), + "core", + &env::var("CARGO_PKG_VERSION_MAJOR").unwrap(), + &env::var("CARGO_PKG_VERSION_MINOR").unwrap(), + &env::var("CARGO_PKG_VERSION_PATCH").unwrap(), + &env::var("CARGO_PKG_VERSION_PRE").unwrap(), // Often empty + ] + .join("_"); + // Ensure `links` in Cargo.toml is consistent with the version. + assert_eq!( + &env::var("CARGO_MANIFEST_LINKS").unwrap(), + &core_name_and_version + ); + + const RING_PREGENERATE_ASM: &str = "RING_PREGENERATE_ASM"; + match env::var_os(RING_PREGENERATE_ASM).as_deref() { + Some(s) if s == "1" => { + pregenerate_asm_main(&c_root_dir, &core_name_and_version); + } + None => ring_build_rs_main(&c_root_dir, &core_name_and_version), + _ => { + panic!("${} has an invalid value", RING_PREGENERATE_ASM); + } + } +} + +fn ring_build_rs_main(c_root_dir: &Path, core_name_and_version: &str) { + let out_dir = env::var_os("OUT_DIR").unwrap(); + let out_dir = PathBuf::from(out_dir); + + let arch = env::var("CARGO_CFG_TARGET_ARCH").unwrap(); + let os = env::var("CARGO_CFG_TARGET_OS").unwrap(); + let env = env::var("CARGO_CFG_TARGET_ENV").unwrap(); + let endian = env::var("CARGO_CFG_TARGET_ENDIAN").unwrap(); + let is_little_endian = endian == "little"; + + let is_git = fs::metadata(c_root_dir.join(".git")).is_ok(); + + // Published builds are always built in release mode. + let is_debug = is_git && env::var("DEBUG").unwrap() != "false"; + + // During local development, force warnings in non-Rust code to be treated + // as errors. Since warnings are highly compiler-dependent and compilers + // don't maintain backward compatibility w.r.t. which warnings they issue, + // don't do this for packaged builds. + let force_warnings_into_errors = is_git; + + let target = Target { + arch, + os, + env, + is_debug, + force_warnings_into_errors, + }; + + let asm_target = if is_little_endian { + ASM_TARGETS.iter().find(|asm_target| { + asm_target.arch == target.arch && asm_target.oss.contains(&target.os.as_ref()) + }) + } else { + None + }; + + // If `.git` exists then assume this is the "local hacking" case where + // we want to make it easy to build *ring* using `cargo build`/`cargo test` + // without a prerequisite `package` step, at the cost of needing additional + // tools like `Perl` and/or `nasm`. 
+ // + // If `.git` doesn't exist then assume that this is a packaged build where + // we want to optimize for minimizing the build tools required: No Perl, + // no nasm, etc. + let generated_dir = if !is_git { + c_root_dir.join(PREGENERATED) + } else { + generate_sources_and_preassemble( + &out_dir, + asm_target.into_iter(), + c_root_dir, + core_name_and_version, + ); + out_dir.clone() + }; + + build_c_code( + asm_target, + &target, + &generated_dir, + c_root_dir, + &out_dir, + core_name_and_version, + ); + emit_rerun_if_changed() +} + +fn pregenerate_asm_main(c_root_dir: &Path, core_name_and_version: &str) { + let pregenerated = c_root_dir.join(PREGENERATED); + fs::create_dir(&pregenerated).unwrap(); + generate_sources_and_preassemble( + &pregenerated, + ASM_TARGETS.iter(), + c_root_dir, + core_name_and_version, + ); +} + +fn generate_sources_and_preassemble<'a>( + out_dir: &Path, + asm_targets: impl Iterator, + c_root_dir: &Path, + core_name_and_version: &str, +) { + generate_prefix_symbols_headers(out_dir, core_name_and_version).unwrap(); + + let perl_exe = get_perl_exe(); + + for asm_target in asm_targets { + let perlasm_src_dsts = perlasm_src_dsts(out_dir, asm_target); + perlasm(&perl_exe, &perlasm_src_dsts, asm_target, c_root_dir); + + if asm_target.use_nasm() { + // Package pregenerated object files in addition to pregenerated + // assembly language source files, so that the user doesn't need + // to install the assembler. + let srcs = asm_srcs(perlasm_src_dsts); + for src in srcs { + nasm(&src, asm_target.arch, out_dir, out_dir, c_root_dir); + } + } + } +} + +struct Target { + arch: String, + os: String, + env: String, + + /// Is this a debug build? This affects whether assertions might be enabled + /// in the C code. For packaged builds, this should always be `false`. + is_debug: bool, + + /// true: Force warnings to be treated as errors. + /// false: Use the default behavior (perhaps determined by `$CFLAGS`, etc.) + force_warnings_into_errors: bool, +} + +fn build_c_code( + asm_target: Option<&AsmTarget>, + target: &Target, + generated_dir: &Path, + c_root_dir: &Path, + out_dir: &Path, + core_name_and_version: &str, +) { + let (asm_srcs, obj_srcs) = if let Some(asm_target) = asm_target { + let perlasm_src_dsts = perlasm_src_dsts(generated_dir, asm_target); + + let asm_srcs = asm_srcs(perlasm_src_dsts); + + if asm_target.use_nasm() { + // Nasm was already used to generate the object files, so use them instead of + // assembling. + let obj_srcs = asm_srcs + .iter() + .map(|src| obj_path(generated_dir, src.as_path())) + .collect::>(); + (vec![], obj_srcs) + } else { + (asm_srcs, vec![]) + } + } else { + (vec![], vec![]) + }; + + let core_srcs = sources_for_arch(&target.arch) + .into_iter() + .filter(|p| !is_perlasm(p)) + .filter(|p| { + if let Some(extension) = p.extension() { + // We don't (and can't) use any .S on Windows since MSVC and NASM can't assemble + // them. + if extension == "S" + && (target.arch == X86_64 || target.arch == X86) + && target.os == WINDOWS + { + return false; + } + } + true + }) + .collect::>(); + + let test_srcs = RING_TEST_SRCS.iter().map(PathBuf::from).collect::>(); + + let libs = [ + ( + core_name_and_version, + &core_srcs[..], + &asm_srcs[..], + &obj_srcs[..], + ), + ( + &(String::from(core_name_and_version) + "_test"), + &test_srcs[..], + &[], + &[], + ), + ]; + + // XXX: Ideally, ring-test would only be built for `cargo test`, but Cargo + // can't do that yet. 
+ libs.iter() + .for_each(|&(lib_name, srcs, asm_srcs, obj_srcs)| { + let srcs = srcs.iter().chain(asm_srcs); + build_library( + target, + c_root_dir, + out_dir, + lib_name, + srcs, + generated_dir, + obj_srcs, + ) + }); + + println!( + "cargo:rustc-link-search=native={}", + out_dir.to_str().expect("Invalid path") + ); +} + +fn new_build(target: &Target, c_root_dir: &Path, include_dir: &Path) -> cc::Build { + let mut b = cc::Build::new(); + configure_cc(&mut b, target, c_root_dir, include_dir); + b +} + +fn build_library<'a>( + target: &Target, + c_root_dir: &Path, + out_dir: &Path, + lib_name: &str, + srcs: impl Iterator, + include_dir: &Path, + preassembled_objs: &[PathBuf], +) { + let mut c = new_build(target, c_root_dir, include_dir); + + // Compile all the (dirty) source files into object files. + srcs.for_each(|src| { + c.file(c_root_dir.join(src)); + }); + + preassembled_objs.iter().for_each(|obj| { + c.object(obj); + }); + + // Rebuild the library if necessary. + let lib_path = PathBuf::from(out_dir).join(format!("lib{}.a", lib_name)); + + // Handled below. + let _ = c.cargo_metadata(false); + + c.compile( + lib_path + .file_name() + .and_then(|f| f.to_str()) + .expect("No filename"), + ); + + // Link the library. This works even when the library doesn't need to be + // rebuilt. + println!("cargo:rustc-link-lib=static={}", lib_name); +} + +fn obj_path(out_dir: &Path, src: &Path) -> PathBuf { + let mut out_path = out_dir.join(src.file_name().unwrap()); + // To eliminate unnecessary conditional logic, use ".o" as the extension, + // even when the compiler (e.g. MSVC) would normally use something else + // (e.g. ".obj"). cc-rs seems to do the same. + assert!(out_path.set_extension("o")); + out_path +} + +fn configure_cc(c: &mut cc::Build, target: &Target, c_root_dir: &Path, include_dir: &Path) { + let compiler = c.get_compiler(); + // FIXME: On Windows AArch64 we currently must use Clang to compile C code + let compiler = if target.os == WINDOWS && target.arch == AARCH64 && !compiler.is_like_clang() { + let _ = c.compiler("clang"); + c.get_compiler() + } else { + compiler + }; + + let _ = c.include(c_root_dir.join("include")); + let _ = c.include(include_dir); + for f in cpp_flags(&compiler) { + let _ = c.flag(f); + } + + if APPLE_ABI.contains(&target.os.as_str()) { + // ``-gfull`` is required for Darwin's |-dead_strip|. + let _ = c.flag("-gfull"); + } else if !compiler.is_like_msvc() { + let _ = c.flag("-g3"); + }; + + if !target.is_debug { + let _ = c.define("NDEBUG", None); + } + + if target.arch == X86 { + let is_msvc_not_clang_cl = compiler.is_like_msvc() && !compiler.is_like_clang_cl(); + if !is_msvc_not_clang_cl { + let _ = c.flag("-msse2"); + } + } + + // Allow cross-compiling without a target sysroot for these targets. + if (target.arch == WASM32) + || (target.os == "linux" && target.env == "musl" && target.arch != X86_64) + { + // TODO: Expand this to non-clang compilers in 0.17.0 if practical. + if compiler.is_like_clang() { + let _ = c.flag("-nostdlibinc"); + let _ = c.define("RING_CORE_NOSTDLIBINC", "1"); + } + } + + if target.force_warnings_into_errors { + c.warnings_into_errors(true); + } +} + +fn nasm(file: &Path, arch: &str, include_dir: &Path, out_dir: &Path, c_root_dir: &Path) { + let out_file = obj_path(out_dir, file); + let oformat = match arch { + x if x == X86_64 => "win64", + x if x == X86 => "win32", + _ => panic!("unsupported arch: {}", arch), + }; + + // Nasm requires that the path end in a path separator. 
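+    // Append `std::path::MAIN_SEPARATOR` below so that the `-i` argument
+    // handed to nasm always ends with a separator, however `include_dir`
+    // was spelled.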
+ let mut include_dir = include_dir.as_os_str().to_os_string(); + include_dir.push(OsString::from(String::from(std::path::MAIN_SEPARATOR))); + + let mut c = Command::new("./target/tools/windows/nasm/nasm"); + let _ = c + .arg("-o") + .arg(out_file.to_str().expect("Invalid path")) + .arg("-f") + .arg(oformat) + .arg("-i") + .arg("include/") + .arg("-i") + .arg(include_dir) + .arg("-Xgnu") + .arg("-gcv8") + .arg(c_root_dir.join(file)); + run_command(c); +} + +fn run_command_with_args(command_name: &Path, args: &[OsString]) { + let mut cmd = Command::new(command_name); + let _ = cmd.args(args); + run_command(cmd) +} + +fn run_command(mut cmd: Command) { + eprintln!("running {:?}", cmd); + cmd.stderr(Stdio::inherit()); + let status = cmd.status().unwrap_or_else(|e| { + panic!("failed to execute [{:?}]: {}", cmd, e); + }); + if !status.success() { + panic!("execution failed"); + } +} + +fn sources_for_arch(arch: &str) -> Vec { + RING_SRCS + .iter() + .filter(|&&(archs, _)| archs.is_empty() || archs.contains(&arch)) + .map(|&(_, p)| PathBuf::from(p)) + .collect::>() +} + +fn perlasm_src_dsts(out_dir: &Path, asm_target: &AsmTarget) -> Vec<(PathBuf, PathBuf)> { + let srcs = sources_for_arch(asm_target.arch); + let mut src_dsts = srcs + .iter() + .filter(|p| is_perlasm(p)) + .map(|src| (src.clone(), asm_path(out_dir, src, asm_target))) + .collect::>(); + + // Some PerlAsm source files need to be run multiple times with different + // output paths. + { + // Appease the borrow checker. + let mut maybe_synthesize = |concrete, synthesized| { + let concrete_path = PathBuf::from(concrete); + if srcs.contains(&concrete_path) { + let synthesized_path = PathBuf::from(synthesized); + src_dsts.push(( + concrete_path, + asm_path(out_dir, &synthesized_path, asm_target), + )) + } + }; + maybe_synthesize(SHA512_X86_64, SHA256_X86_64); + maybe_synthesize(SHA512_ARMV8, SHA256_ARMV8); + } + + src_dsts +} + +fn asm_srcs(perlasm_src_dsts: Vec<(PathBuf, PathBuf)>) -> Vec { + perlasm_src_dsts + .into_iter() + .map(|(_src, dst)| dst) + .collect::>() +} + +fn is_perlasm(path: &Path) -> bool { + path.extension().unwrap().to_str().unwrap() == "pl" +} + +fn asm_path(out_dir: &Path, src: &Path, asm_target: &AsmTarget) -> PathBuf { + let src_stem = src.file_stem().expect("source file without basename"); + + let dst_stem = src_stem.to_str().unwrap(); + let dst_filename = format!("{}-{}", dst_stem, asm_target.perlasm_format); + let extension = if asm_target.use_nasm() { "asm" } else { "S" }; + out_dir.join(dst_filename).with_extension(extension) +} + +fn perlasm( + perl_exe: &Path, + src_dst: &[(PathBuf, PathBuf)], + asm_target: &AsmTarget, + c_root_dir: &Path, +) { + for (src, dst) in src_dst { + let mut args = vec![ + join_components_with_forward_slashes(&c_root_dir.join(src)), + asm_target.perlasm_format.into(), + ]; + if asm_target.arch == X86 { + args.push("-fPIC".into()); + } + // Work around PerlAsm issue for ARM and AAarch64 targets by replacing + // back slashes with forward slashes. 
+ args.push(join_components_with_forward_slashes(dst)); + run_command_with_args(perl_exe, &args); + } +} + +fn join_components_with_forward_slashes(path: &Path) -> OsString { + let parts = path.components().map(|c| c.as_os_str()).collect::>(); + parts.join(OsStr::new("/")) +} + +fn get_perl_exe() -> PathBuf { + get_command("PERL_EXECUTABLE", "perl") +} + +fn get_command(var: &'static str, default: &str) -> PathBuf { + PathBuf::from(env::var_os(var).unwrap_or_else(|| default.into())) +} + +// TODO: We should emit `cargo:rerun-if-changed-env` for the various +// environment variables that affect the build. +fn emit_rerun_if_changed() { + for path in &["crypto", "include", "third_party/fiat"] { + walk_dir(&PathBuf::from(path), &|entry| { + let path = entry.path(); + match path.extension().and_then(|ext| ext.to_str()) { + Some("c") | Some("S") | Some("h") | Some("inl") | Some("pl") | None => { + println!("cargo:rerun-if-changed={}", path.to_str().unwrap()); + } + _ => { + // Ignore other types of files. + } + } + }) + } +} + +fn walk_dir(dir: &Path, cb: &impl Fn(&DirEntry)) { + if dir.is_dir() { + for entry in fs::read_dir(dir).unwrap() { + let entry = entry.unwrap(); + let path = entry.path(); + if path.is_dir() { + walk_dir(&path, cb); + } else { + cb(&entry); + } + } + } +} + +/// Creates the necessary header files for symbol renaming. +/// +/// For simplicity, both non-Nasm- and Nasm- style headers are always +/// generated, even though local non-packaged builds need only one of them. +fn generate_prefix_symbols_headers( + out_dir: &Path, + core_name_and_version: &str, +) -> Result<(), std::io::Error> { + let prefix = &(String::from(core_name_and_version) + "_"); + + generate_prefix_symbols_header(out_dir, "prefix_symbols.h", '#', None, prefix)?; + + generate_prefix_symbols_header( + out_dir, + "prefix_symbols_asm.h", + '#', + Some("#if defined(__APPLE__)"), + prefix, + )?; + + generate_prefix_symbols_header( + out_dir, + "prefix_symbols_nasm.inc", + '%', + Some("%ifidn __OUTPUT_FORMAT__,win32"), + prefix, + )?; + + Ok(()) +} + +fn generate_prefix_symbols_header( + out_dir: &Path, + filename: &str, + pp: char, + prefix_condition: Option<&str>, + prefix: &str, +) -> Result<(), std::io::Error> { + let dir = out_dir.join("ring_core_generated"); + fs::create_dir_all(&dir)?; + + let path = dir.join(filename); + let mut file = fs::File::create(path)?; + + let filename_ident = filename.replace('.', "_").to_uppercase(); + writeln!( + file, + r#" +{pp}ifndef ring_core_generated_{filename_ident} +{pp}define ring_core_generated_{filename_ident} +"#, + pp = pp, + filename_ident = filename_ident + )?; + + if let Some(prefix_condition) = prefix_condition { + writeln!(file, "{}", prefix_condition)?; + writeln!(file, "{}", prefix_all_symbols(pp, "_", prefix))?; + writeln!(file, "{pp}else", pp = pp)?; + }; + writeln!(file, "{}", prefix_all_symbols(pp, "", prefix))?; + if prefix_condition.is_some() { + writeln!(file, "{pp}endif", pp = pp)? + } + + writeln!(file, "{pp}endif", pp = pp)?; + + Ok(()) +} + +fn prefix_all_symbols(pp: char, prefix_prefix: &str, prefix: &str) -> String { + // Rename some nistz256 assembly functions to match the names of their + // polyfills. 
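+    //
+    // The returned string is a list of `#define`/`%define` lines (`pp`
+    // selects the preprocessor character): renamed symbols map the old name
+    // to the new one, and every symbol in `SYMBOLS_TO_PREFIX` gets `prefix`
+    // prepended, so that symbols from different versions of this crate do
+    // not collide at link time.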
+ static SYMBOLS_TO_RENAME: &[(&str, &str)] = &[ + ("ecp_nistz256_point_double", "p256_point_double"), + ("ecp_nistz256_point_add", "p256_point_add"), + ("ecp_nistz256_point_add_affine", "p256_point_add_affine"), + ("ecp_nistz256_ord_mul_mont", "p256_scalar_mul_mont"), + ("ecp_nistz256_ord_sqr_mont", "p256_scalar_sqr_rep_mont"), + ("ecp_nistz256_mul_mont", "p256_mul_mont"), + ("ecp_nistz256_sqr_mont", "p256_sqr_mont"), + ]; + + static SYMBOLS_TO_PREFIX: &[&str] = &[ + "adx_bmi2_available", + "avx2_available", + "CRYPTO_memcmp", + "CRYPTO_poly1305_finish", + "CRYPTO_poly1305_finish_neon", + "CRYPTO_poly1305_init", + "CRYPTO_poly1305_init_neon", + "CRYPTO_poly1305_update", + "CRYPTO_poly1305_update_neon", + "ChaCha20_ctr32", + "ChaCha20_ctr32_avx2", + "ChaCha20_ctr32_neon", + "ChaCha20_ctr32_nohw", + "ChaCha20_ctr32_ssse3", + "ChaCha20_ctr32_ssse3_4x", + "LIMB_is_zero", + "LIMBS_add_mod", + "LIMBS_are_zero", + "LIMBS_equal", + "LIMBS_less_than", + "LIMBS_reduce_once", + "LIMBS_select_512_32", + "LIMBS_shl_mod", + "LIMBS_sub_mod", + "LIMBS_window5_split_window", + "LIMBS_window5_unsplit_window", + "LIMB_shr", + "OPENSSL_cpuid_setup", + "aes_gcm_dec_kernel", + "aes_gcm_dec_update_vaes_avx2", + "aes_gcm_enc_kernel", + "aes_gcm_enc_update_vaes_avx2", + "aes_hw_ctr32_encrypt_blocks", + "aes_hw_set_encrypt_key", + "aes_hw_set_encrypt_key_alt", + "aes_hw_set_encrypt_key_base", + "aes_nohw_ctr32_encrypt_blocks", + "aes_nohw_encrypt", + "aes_nohw_set_encrypt_key", + "aesni_gcm_decrypt", + "aesni_gcm_encrypt", + "bn_from_montgomery_in_place", + "bn_gather5", + "bn_mul_mont", + "bn_mul_mont_nohw", + "bn_mul4x_mont", + "bn_mulx4x_mont", + "bn_mul8x_mont_neon", + "bn_mul4x_mont_gather5", + "bn_mulx4x_mont_gather5", + "bn_neg_inv_mod_r_u64", + "bn_power5_nohw", + "bn_powerx5", + "bn_scatter5", + "bn_sqr8x_internal", + "bn_sqr8x_mont", + "bn_sqrx8x_internal", + "bsaes_ctr32_encrypt_blocks", + "bssl_constant_time_test_conditional_memcpy", + "bssl_constant_time_test_conditional_memxor", + "bssl_constant_time_test_main", + "chacha20_poly1305_open", + "chacha20_poly1305_open_avx2", + "chacha20_poly1305_open_sse41", + "chacha20_poly1305_seal", + "chacha20_poly1305_seal_avx2", + "chacha20_poly1305_seal_sse41", + "ecp_nistz256_mul_mont_adx", + "ecp_nistz256_mul_mont_nohw", + "ecp_nistz256_ord_mul_mont_adx", + "ecp_nistz256_ord_mul_mont_nohw", + "ecp_nistz256_ord_sqr_mont_adx", + "ecp_nistz256_ord_sqr_mont_nohw", + "ecp_nistz256_point_add_adx", + "ecp_nistz256_point_add_nohw", + "ecp_nistz256_point_add_affine_adx", + "ecp_nistz256_point_add_affine_nohw", + "ecp_nistz256_point_double_adx", + "ecp_nistz256_point_double_nohw", + "ecp_nistz256_select_w5_avx2", + "ecp_nistz256_select_w5_nohw", + "ecp_nistz256_select_w7_avx2", + "ecp_nistz256_select_w7_nohw", + "ecp_nistz256_sqr_mont_adx", + "ecp_nistz256_sqr_mont_nohw", + "fiat_curve25519_adx_mul", + "fiat_curve25519_adx_square", + "gcm_ghash_avx", + "gcm_ghash_clmul", + "gcm_ghash_neon", + "gcm_ghash_vpclmulqdq_avx2_1", + "gcm_gmult_clmul", + "gcm_gmult_neon", + "gcm_init_avx", + "gcm_init_clmul", + "gcm_init_neon", + "gcm_init_vpclmulqdq_avx2", + "k25519Precomp", + "limbs_mul_add_limb", + "little_endian_bytes_from_scalar", + "ecp_nistz256_neg", + "ecp_nistz256_select_w5", + "ecp_nistz256_select_w7", + "neon_available", + "p256_mul_mont", + "p256_point_add", + "p256_point_add_affine", + "p256_point_double", + "p256_point_mul", + "p256_point_mul_base", + "p256_point_mul_base_vartime", + "p256_scalar_mul_mont", + "p256_scalar_sqr_rep_mont", + "p256_sqr_mont", + 
"p384_elem_div_by_2", + "p384_elem_mul_mont", + "p384_elem_neg", + "p384_elem_sub", + "p384_point_add", + "p384_point_double", + "p384_point_mul", + "p384_scalar_mul_mont", + "openssl_poly1305_neon2_addmulmod", + "openssl_poly1305_neon2_blocks", + "sha256_block_data_order", + "sha256_block_data_order_avx", + "sha256_block_data_order_ssse3", + "sha256_block_data_order_hw", + "sha256_block_data_order_neon", + "sha256_block_data_order_nohw", + "sha512_block_data_order", + "sha512_block_data_order_avx", + "sha512_block_data_order_hw", + "sha512_block_data_order_neon", + "sha512_block_data_order_nohw", + "vpaes_ctr32_encrypt_blocks", + "vpaes_encrypt", + "vpaes_encrypt_key_to_bsaes", + "vpaes_set_encrypt_key", + "x25519_NEON", + "x25519_fe_invert", + "x25519_fe_isnegative", + "x25519_fe_mul_ttt", + "x25519_fe_neg", + "x25519_fe_tobytes", + "x25519_ge_double_scalarmult_vartime", + "x25519_ge_frombytes_vartime", + "x25519_ge_scalarmult_base", + "x25519_ge_scalarmult_base_adx", + "x25519_public_from_private_generic_masked", + "x25519_sc_mask", + "x25519_sc_muladd", + "x25519_sc_reduce", + "x25519_scalar_mult_adx", + "x25519_scalar_mult_generic_masked", + ]; + + let mut out = String::new(); + + for (old, new) in SYMBOLS_TO_RENAME { + let line = format!( + "{pp}define {prefix_prefix}{old} {prefix_prefix}{new}\n", + pp = pp, + prefix_prefix = prefix_prefix, + old = old, + new = new + ); + out += &line; + } + + for symbol in SYMBOLS_TO_PREFIX { + let line = format!( + "{pp}define {prefix_prefix}{symbol} {prefix_prefix}{prefix}{symbol}\n", + pp = pp, + prefix_prefix = prefix_prefix, + prefix = prefix, + symbol = symbol + ); + out += &line; + } + + out +} diff --git a/ring-0.17.14/crypto/chacha/asm/chacha-armv4.pl b/ring-0.17.14/crypto/chacha/asm/chacha-armv4.pl new file mode 100644 index 0000000000..700da12005 --- /dev/null +++ b/ring-0.17.14/crypto/chacha/asm/chacha-armv4.pl @@ -0,0 +1,1137 @@ +#! /usr/bin/env perl +# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. +# ==================================================================== +# +# December 2014 +# +# ChaCha20 for ARMv4. +# +# Performance in cycles per byte out of large buffer. 
+# +# IALU/gcc-4.4 1xNEON 3xNEON+1xIALU +# +# Cortex-A5 19.3(*)/+95% 21.8 14.1 +# Cortex-A8 10.5(*)/+160% 13.9 6.35 +# Cortex-A9 12.9(**)/+110% 14.3 6.50 +# Cortex-A15 11.0/+40% 16.0 5.00 +# Snapdragon S4 11.5/+125% 13.6 4.90 +# +# (*) most "favourable" result for aligned data on little-endian +# processor, result for misaligned data is 10-15% lower; +# (**) this result is a trade-off: it can be improved by 20%, +# but then Snapdragon S4 and Cortex-A8 results get +# 20-25% worse; + +$flavour = shift; +if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; } +else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} } + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; + *STDOUT=*OUT; +} else { + open OUT,">$output"; + *STDOUT=*OUT; +} + +sub AUTOLOAD() # thunk [simplified] x86-style perlasm +{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./; + my $arg = pop; + $arg = "#$arg" if ($arg*1 eq $arg); + $code .= "\t$opcode\t".join(',',@_,$arg)."\n"; +} + +my @x=map("r$_",(0..7,"x","x","x","x",12,"x",14,"x")); +my @t=map("r$_",(8..11)); + +sub ROUND { +my ($a0,$b0,$c0,$d0)=@_; +my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); +my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1)); +my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); +my $odd = $d0&1; +my ($xc,$xc_) = (@t[0..1]); +my ($xd,$xd_) = $odd ? (@t[2],@x[$d1]) : (@x[$d0],@t[2]); +my @ret; + + # Consider order in which variables are addressed by their + # index: + # + # a b c d + # + # 0 4 8 12 < even round + # 1 5 9 13 + # 2 6 10 14 + # 3 7 11 15 + # 0 5 10 15 < odd round + # 1 6 11 12 + # 2 7 8 13 + # 3 4 9 14 + # + # 'a', 'b' are permanently allocated in registers, @x[0..7], + # while 'c's and pair of 'd's are maintained in memory. If + # you observe 'c' column, you'll notice that pair of 'c's is + # invariant between rounds. This means that we have to reload + # them once per round, in the middle. This is why you'll see + # bunch of 'c' stores and loads in the middle, but none in + # the beginning or end. If you observe 'd' column, you'll + # notice that 15 and 13 are reused in next pair of rounds. + # This is why these two are chosen for offloading to memory, + # to make loads count more. 
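+	#
+	# Nothing is emitted here directly: each string pushed onto @ret is
+	# eval()ed by the caller (see the `foreach (&ROUND(...)) { eval; }`
+	# loops below), which lets the integer round be interleaved, one
+	# instruction at a time, with the NEON rounds in the 3xNEON+1xIALU path.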
+ push @ret,( + "&add (@x[$a0],@x[$a0],@x[$b0])", + "&mov ($xd,$xd,'ror#16')", + "&add (@x[$a1],@x[$a1],@x[$b1])", + "&mov ($xd_,$xd_,'ror#16')", + "&eor ($xd,$xd,@x[$a0],'ror#16')", + "&eor ($xd_,$xd_,@x[$a1],'ror#16')", + + "&add ($xc,$xc,$xd)", + "&mov (@x[$b0],@x[$b0],'ror#20')", + "&add ($xc_,$xc_,$xd_)", + "&mov (@x[$b1],@x[$b1],'ror#20')", + "&eor (@x[$b0],@x[$b0],$xc,'ror#20')", + "&eor (@x[$b1],@x[$b1],$xc_,'ror#20')", + + "&add (@x[$a0],@x[$a0],@x[$b0])", + "&mov ($xd,$xd,'ror#24')", + "&add (@x[$a1],@x[$a1],@x[$b1])", + "&mov ($xd_,$xd_,'ror#24')", + "&eor ($xd,$xd,@x[$a0],'ror#24')", + "&eor ($xd_,$xd_,@x[$a1],'ror#24')", + + "&add ($xc,$xc,$xd)", + "&mov (@x[$b0],@x[$b0],'ror#25')" ); + push @ret,( + "&str ($xd,'[sp,#4*(16+$d0)]')", + "&ldr ($xd,'[sp,#4*(16+$d2)]')" ) if ($odd); + push @ret,( + "&add ($xc_,$xc_,$xd_)", + "&mov (@x[$b1],@x[$b1],'ror#25')" ); + push @ret,( + "&str ($xd_,'[sp,#4*(16+$d1)]')", + "&ldr ($xd_,'[sp,#4*(16+$d3)]')" ) if (!$odd); + push @ret,( + "&eor (@x[$b0],@x[$b0],$xc,'ror#25')", + "&eor (@x[$b1],@x[$b1],$xc_,'ror#25')" ); + + $xd=@x[$d2] if (!$odd); + $xd_=@x[$d3] if ($odd); + push @ret,( + "&str ($xc,'[sp,#4*(16+$c0)]')", + "&ldr ($xc,'[sp,#4*(16+$c2)]')", + "&add (@x[$a2],@x[$a2],@x[$b2])", + "&mov ($xd,$xd,'ror#16')", + "&str ($xc_,'[sp,#4*(16+$c1)]')", + "&ldr ($xc_,'[sp,#4*(16+$c3)]')", + "&add (@x[$a3],@x[$a3],@x[$b3])", + "&mov ($xd_,$xd_,'ror#16')", + "&eor ($xd,$xd,@x[$a2],'ror#16')", + "&eor ($xd_,$xd_,@x[$a3],'ror#16')", + + "&add ($xc,$xc,$xd)", + "&mov (@x[$b2],@x[$b2],'ror#20')", + "&add ($xc_,$xc_,$xd_)", + "&mov (@x[$b3],@x[$b3],'ror#20')", + "&eor (@x[$b2],@x[$b2],$xc,'ror#20')", + "&eor (@x[$b3],@x[$b3],$xc_,'ror#20')", + + "&add (@x[$a2],@x[$a2],@x[$b2])", + "&mov ($xd,$xd,'ror#24')", + "&add (@x[$a3],@x[$a3],@x[$b3])", + "&mov ($xd_,$xd_,'ror#24')", + "&eor ($xd,$xd,@x[$a2],'ror#24')", + "&eor ($xd_,$xd_,@x[$a3],'ror#24')", + + "&add ($xc,$xc,$xd)", + "&mov (@x[$b2],@x[$b2],'ror#25')", + "&add ($xc_,$xc_,$xd_)", + "&mov (@x[$b3],@x[$b3],'ror#25')", + "&eor (@x[$b2],@x[$b2],$xc,'ror#25')", + "&eor (@x[$b3],@x[$b3],$xc_,'ror#25')" ); + + @ret; +} + +$code.=<<___; +@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both +@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions. 
+.arch armv7-a + +.text +#if defined(__thumb2__) || defined(__clang__) +.syntax unified +#endif +#if defined(__thumb2__) +.thumb +#else +.code 32 +#endif + +#if defined(__thumb2__) || defined(__clang__) +#define ldrhsb ldrbhs +#endif + +.align 5 +.Lsigma: +.long 0x61707865,0x3320646e,0x79622d32,0x6b206574 @ endian-neutral +.Lone: +.long 1,0,0,0 + +.globl ChaCha20_ctr32_nohw +.type ChaCha20_ctr32_nohw,%function +.align 5 +ChaCha20_ctr32_nohw: + ldr r12,[sp,#0] @ pull pointer to counter and nonce + stmdb sp!,{r0-r2,r4-r11,lr} + adr r14,.Lsigma + ldmia r12,{r4-r7} @ load counter and nonce + sub sp,sp,#4*(16) @ off-load area + stmdb sp!,{r4-r7} @ copy counter and nonce + ldmia r3,{r4-r11} @ load key + ldmia r14,{r0-r3} @ load sigma + stmdb sp!,{r4-r11} @ copy key + stmdb sp!,{r0-r3} @ copy sigma + str r10,[sp,#4*(16+10)] @ off-load "@x[10]" + str r11,[sp,#4*(16+11)] @ off-load "@x[11]" + b .Loop_outer_enter + +.align 4 +.Loop_outer: + ldmia sp,{r0-r9} @ load key material + str @t[3],[sp,#4*(32+2)] @ save len + str r12, [sp,#4*(32+1)] @ save inp + str r14, [sp,#4*(32+0)] @ save out +.Loop_outer_enter: + ldr @t[3], [sp,#4*(15)] + ldr @x[12],[sp,#4*(12)] @ modulo-scheduled load + ldr @t[2], [sp,#4*(13)] + ldr @x[14],[sp,#4*(14)] + str @t[3], [sp,#4*(16+15)] + mov @t[3],#10 + b .Loop + +.align 4 +.Loop: + subs @t[3],@t[3],#1 +___ + foreach (&ROUND(0, 4, 8,12)) { eval; } + foreach (&ROUND(0, 5,10,15)) { eval; } +$code.=<<___; + bne .Loop + + ldr @t[3],[sp,#4*(32+2)] @ load len + + str @t[0], [sp,#4*(16+8)] @ modulo-scheduled store + str @t[1], [sp,#4*(16+9)] + str @x[12],[sp,#4*(16+12)] + str @t[2], [sp,#4*(16+13)] + str @x[14],[sp,#4*(16+14)] + + @ at this point we have first half of 512-bit result in + @ @x[0-7] and second half at sp+4*(16+8) + + cmp @t[3],#64 @ done yet? +#ifdef __thumb2__ + itete lo +#endif + addlo r12,sp,#4*(0) @ shortcut or ... + ldrhs r12,[sp,#4*(32+1)] @ ... load inp + addlo r14,sp,#4*(0) @ shortcut or ... + ldrhs r14,[sp,#4*(32+0)] @ ... load out + + ldr @t[0],[sp,#4*(0)] @ load key material + ldr @t[1],[sp,#4*(1)] + +#if __ARM_ARCH>=6 || !defined(__ARMEB__) +# if __ARM_ARCH<7 + orr @t[2],r12,r14 + tst @t[2],#3 @ are input and output aligned? 
+ ldr @t[2],[sp,#4*(2)] + bne .Lunaligned + cmp @t[3],#64 @ restore flags +# else + ldr @t[2],[sp,#4*(2)] +# endif + ldr @t[3],[sp,#4*(3)] + + add @x[0],@x[0],@t[0] @ accumulate key material + add @x[1],@x[1],@t[1] +# ifdef __thumb2__ + itt hs +# endif + ldrhs @t[0],[r12],#16 @ load input + ldrhs @t[1],[r12,#-12] + + add @x[2],@x[2],@t[2] + add @x[3],@x[3],@t[3] +# ifdef __thumb2__ + itt hs +# endif + ldrhs @t[2],[r12,#-8] + ldrhs @t[3],[r12,#-4] +# if __ARM_ARCH>=6 && defined(__ARMEB__) + rev @x[0],@x[0] + rev @x[1],@x[1] + rev @x[2],@x[2] + rev @x[3],@x[3] +# endif +# ifdef __thumb2__ + itt hs +# endif + eorhs @x[0],@x[0],@t[0] @ xor with input + eorhs @x[1],@x[1],@t[1] + add @t[0],sp,#4*(4) + str @x[0],[r14],#16 @ store output +# ifdef __thumb2__ + itt hs +# endif + eorhs @x[2],@x[2],@t[2] + eorhs @x[3],@x[3],@t[3] + ldmia @t[0],{@t[0]-@t[3]} @ load key material + str @x[1],[r14,#-12] + str @x[2],[r14,#-8] + str @x[3],[r14,#-4] + + add @x[4],@x[4],@t[0] @ accumulate key material + add @x[5],@x[5],@t[1] +# ifdef __thumb2__ + itt hs +# endif + ldrhs @t[0],[r12],#16 @ load input + ldrhs @t[1],[r12,#-12] + add @x[6],@x[6],@t[2] + add @x[7],@x[7],@t[3] +# ifdef __thumb2__ + itt hs +# endif + ldrhs @t[2],[r12,#-8] + ldrhs @t[3],[r12,#-4] +# if __ARM_ARCH>=6 && defined(__ARMEB__) + rev @x[4],@x[4] + rev @x[5],@x[5] + rev @x[6],@x[6] + rev @x[7],@x[7] +# endif +# ifdef __thumb2__ + itt hs +# endif + eorhs @x[4],@x[4],@t[0] + eorhs @x[5],@x[5],@t[1] + add @t[0],sp,#4*(8) + str @x[4],[r14],#16 @ store output +# ifdef __thumb2__ + itt hs +# endif + eorhs @x[6],@x[6],@t[2] + eorhs @x[7],@x[7],@t[3] + str @x[5],[r14,#-12] + ldmia @t[0],{@t[0]-@t[3]} @ load key material + str @x[6],[r14,#-8] + add @x[0],sp,#4*(16+8) + str @x[7],[r14,#-4] + + ldmia @x[0],{@x[0]-@x[7]} @ load second half + + add @x[0],@x[0],@t[0] @ accumulate key material + add @x[1],@x[1],@t[1] +# ifdef __thumb2__ + itt hs +# endif + ldrhs @t[0],[r12],#16 @ load input + ldrhs @t[1],[r12,#-12] +# ifdef __thumb2__ + itt hi +# endif + strhi @t[2],[sp,#4*(16+10)] @ copy "@x[10]" while at it + strhi @t[3],[sp,#4*(16+11)] @ copy "@x[11]" while at it + add @x[2],@x[2],@t[2] + add @x[3],@x[3],@t[3] +# ifdef __thumb2__ + itt hs +# endif + ldrhs @t[2],[r12,#-8] + ldrhs @t[3],[r12,#-4] +# if __ARM_ARCH>=6 && defined(__ARMEB__) + rev @x[0],@x[0] + rev @x[1],@x[1] + rev @x[2],@x[2] + rev @x[3],@x[3] +# endif +# ifdef __thumb2__ + itt hs +# endif + eorhs @x[0],@x[0],@t[0] + eorhs @x[1],@x[1],@t[1] + add @t[0],sp,#4*(12) + str @x[0],[r14],#16 @ store output +# ifdef __thumb2__ + itt hs +# endif + eorhs @x[2],@x[2],@t[2] + eorhs @x[3],@x[3],@t[3] + str @x[1],[r14,#-12] + ldmia @t[0],{@t[0]-@t[3]} @ load key material + str @x[2],[r14,#-8] + str @x[3],[r14,#-4] + + add @x[4],@x[4],@t[0] @ accumulate key material + add @x[5],@x[5],@t[1] +# ifdef __thumb2__ + itt hi +# endif + addhi @t[0],@t[0],#1 @ next counter value + strhi @t[0],[sp,#4*(12)] @ save next counter value +# ifdef __thumb2__ + itt hs +# endif + ldrhs @t[0],[r12],#16 @ load input + ldrhs @t[1],[r12,#-12] + add @x[6],@x[6],@t[2] + add @x[7],@x[7],@t[3] +# ifdef __thumb2__ + itt hs +# endif + ldrhs @t[2],[r12,#-8] + ldrhs @t[3],[r12,#-4] +# if __ARM_ARCH>=6 && defined(__ARMEB__) + rev @x[4],@x[4] + rev @x[5],@x[5] + rev @x[6],@x[6] + rev @x[7],@x[7] +# endif +# ifdef __thumb2__ + itt hs +# endif + eorhs @x[4],@x[4],@t[0] + eorhs @x[5],@x[5],@t[1] +# ifdef __thumb2__ + it ne +# endif + ldrne @t[0],[sp,#4*(32+2)] @ re-load len +# ifdef __thumb2__ + itt hs +# endif + eorhs @x[6],@x[6],@t[2] + 
eorhs @x[7],@x[7],@t[3] + str @x[4],[r14],#16 @ store output + str @x[5],[r14,#-12] +# ifdef __thumb2__ + it hs +# endif + subhs @t[3],@t[0],#64 @ len-=64 + str @x[6],[r14,#-8] + str @x[7],[r14,#-4] + bhi .Loop_outer + + beq .Ldone +# if __ARM_ARCH<7 + b .Ltail + +.align 4 +.Lunaligned: @ unaligned endian-neutral path + cmp @t[3],#64 @ restore flags +# endif +#endif +#if __ARM_ARCH<7 + ldr @t[3],[sp,#4*(3)] +___ +for ($i=0;$i<16;$i+=4) { +my $j=$i&0x7; + +$code.=<<___ if ($i==4); + add @x[0],sp,#4*(16+8) +___ +$code.=<<___ if ($i==8); + ldmia @x[0],{@x[0]-@x[7]} @ load second half +# ifdef __thumb2__ + itt hi +# endif + strhi @t[2],[sp,#4*(16+10)] @ copy "@x[10]" + strhi @t[3],[sp,#4*(16+11)] @ copy "@x[11]" +___ +$code.=<<___; + add @x[$j+0],@x[$j+0],@t[0] @ accumulate key material +___ +$code.=<<___ if ($i==12); +# ifdef __thumb2__ + itt hi +# endif + addhi @t[0],@t[0],#1 @ next counter value + strhi @t[0],[sp,#4*(12)] @ save next counter value +___ +$code.=<<___; + add @x[$j+1],@x[$j+1],@t[1] + add @x[$j+2],@x[$j+2],@t[2] +# ifdef __thumb2__ + itete lo +# endif + eorlo @t[0],@t[0],@t[0] @ zero or ... + ldrhsb @t[0],[r12],#16 @ ... load input + eorlo @t[1],@t[1],@t[1] + ldrhsb @t[1],[r12,#-12] + + add @x[$j+3],@x[$j+3],@t[3] +# ifdef __thumb2__ + itete lo +# endif + eorlo @t[2],@t[2],@t[2] + ldrhsb @t[2],[r12,#-8] + eorlo @t[3],@t[3],@t[3] + ldrhsb @t[3],[r12,#-4] + + eor @x[$j+0],@t[0],@x[$j+0] @ xor with input (or zero) + eor @x[$j+1],@t[1],@x[$j+1] +# ifdef __thumb2__ + itt hs +# endif + ldrhsb @t[0],[r12,#-15] @ load more input + ldrhsb @t[1],[r12,#-11] + eor @x[$j+2],@t[2],@x[$j+2] + strb @x[$j+0],[r14],#16 @ store output + eor @x[$j+3],@t[3],@x[$j+3] +# ifdef __thumb2__ + itt hs +# endif + ldrhsb @t[2],[r12,#-7] + ldrhsb @t[3],[r12,#-3] + strb @x[$j+1],[r14,#-12] + eor @x[$j+0],@t[0],@x[$j+0],lsr#8 + strb @x[$j+2],[r14,#-8] + eor @x[$j+1],@t[1],@x[$j+1],lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb @t[0],[r12,#-14] @ load more input + ldrhsb @t[1],[r12,#-10] + strb @x[$j+3],[r14,#-4] + eor @x[$j+2],@t[2],@x[$j+2],lsr#8 + strb @x[$j+0],[r14,#-15] + eor @x[$j+3],@t[3],@x[$j+3],lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb @t[2],[r12,#-6] + ldrhsb @t[3],[r12,#-2] + strb @x[$j+1],[r14,#-11] + eor @x[$j+0],@t[0],@x[$j+0],lsr#8 + strb @x[$j+2],[r14,#-7] + eor @x[$j+1],@t[1],@x[$j+1],lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb @t[0],[r12,#-13] @ load more input + ldrhsb @t[1],[r12,#-9] + strb @x[$j+3],[r14,#-3] + eor @x[$j+2],@t[2],@x[$j+2],lsr#8 + strb @x[$j+0],[r14,#-14] + eor @x[$j+3],@t[3],@x[$j+3],lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb @t[2],[r12,#-5] + ldrhsb @t[3],[r12,#-1] + strb @x[$j+1],[r14,#-10] + strb @x[$j+2],[r14,#-6] + eor @x[$j+0],@t[0],@x[$j+0],lsr#8 + strb @x[$j+3],[r14,#-2] + eor @x[$j+1],@t[1],@x[$j+1],lsr#8 + strb @x[$j+0],[r14,#-13] + eor @x[$j+2],@t[2],@x[$j+2],lsr#8 + strb @x[$j+1],[r14,#-9] + eor @x[$j+3],@t[3],@x[$j+3],lsr#8 + strb @x[$j+2],[r14,#-5] + strb @x[$j+3],[r14,#-1] +___ +$code.=<<___ if ($i<12); + add @t[0],sp,#4*(4+$i) + ldmia @t[0],{@t[0]-@t[3]} @ load key material +___ +} +$code.=<<___; +# ifdef __thumb2__ + it ne +# endif + ldrne @t[0],[sp,#4*(32+2)] @ re-load len +# ifdef __thumb2__ + it hs +# endif + subhs @t[3],@t[0],#64 @ len-=64 + bhi .Loop_outer + + beq .Ldone +#endif + +.Ltail: + ldr r12,[sp,#4*(32+1)] @ load inp + add @t[1],sp,#4*(0) + ldr r14,[sp,#4*(32+0)] @ load out + +.Loop_tail: + ldrb @t[2],[@t[1]],#1 @ read buffer on stack + ldrb @t[3],[r12],#1 @ read input + subs @t[0],@t[0],#1 + eor 
@t[3],@t[3],@t[2] + strb @t[3],[r14],#1 @ store output + bne .Loop_tail + +.Ldone: + add sp,sp,#4*(32+3) + ldmia sp!,{r4-r11,pc} +.size ChaCha20_ctr32_nohw,.-ChaCha20_ctr32_nohw +___ + +{{{ +my ($a0,$b0,$c0,$d0,$a1,$b1,$c1,$d1,$a2,$b2,$c2,$d2,$t0,$t1,$t2,$t3) = + map("q$_",(0..15)); + +sub NEONROUND { +my $odd = pop; +my ($a,$b,$c,$d,$t)=@_; + + ( + "&vadd_i32 ($a,$a,$b)", + "&veor ($d,$d,$a)", + "&vrev32_16 ($d,$d)", # vrot ($d,16) + + "&vadd_i32 ($c,$c,$d)", + "&veor ($t,$b,$c)", + "&vshr_u32 ($b,$t,20)", + "&vsli_32 ($b,$t,12)", + + "&vadd_i32 ($a,$a,$b)", + "&veor ($t,$d,$a)", + "&vshr_u32 ($d,$t,24)", + "&vsli_32 ($d,$t,8)", + + "&vadd_i32 ($c,$c,$d)", + "&veor ($t,$b,$c)", + "&vshr_u32 ($b,$t,25)", + "&vsli_32 ($b,$t,7)", + + "&vext_8 ($c,$c,$c,8)", + "&vext_8 ($b,$b,$b,$odd?12:4)", + "&vext_8 ($d,$d,$d,$odd?4:12)" + ); +} + +$code.=<<___; +#if __ARM_MAX_ARCH__>=7 +.arch armv7-a +.fpu neon + +.globl ChaCha20_ctr32_neon +.type ChaCha20_ctr32_neon,%function +.align 5 +ChaCha20_ctr32_neon: + ldr r12,[sp,#0] @ pull pointer to counter and nonce + stmdb sp!,{r0-r2,r4-r11,lr} + adr r14,.Lsigma + vstmdb sp!,{d8-d15} @ ABI spec says so + stmdb sp!,{r0-r3} + + vld1.32 {$b0-$c0},[r3] @ load key + ldmia r3,{r4-r11} @ load key + + sub sp,sp,#4*(16+16) + vld1.32 {$d0},[r12] @ load counter and nonce + add r12,sp,#4*8 + ldmia r14,{r0-r3} @ load sigma + vld1.32 {$a0},[r14]! @ load sigma + vld1.32 {$t0},[r14] @ one + vst1.32 {$c0-$d0},[r12] @ copy 1/2key|counter|nonce + vst1.32 {$a0-$b0},[sp] @ copy sigma|1/2key + + str r10,[sp,#4*(16+10)] @ off-load "@x[10]" + str r11,[sp,#4*(16+11)] @ off-load "@x[11]" + vshl.i32 $t1#lo,$t0#lo,#1 @ two + vstr $t0#lo,[sp,#4*(16+0)] + vshl.i32 $t2#lo,$t0#lo,#2 @ four + vstr $t1#lo,[sp,#4*(16+2)] + vmov $a1,$a0 + vstr $t2#lo,[sp,#4*(16+4)] + vmov $a2,$a0 + vmov $b1,$b0 + vmov $b2,$b0 + b .Loop_neon_enter + +.align 4 +.Loop_neon_outer: + ldmia sp,{r0-r9} @ load key material + cmp @t[3],#64*2 @ if len<=64*2 + bls .Lbreak_neon @ switch to integer-only + vmov $a1,$a0 + str @t[3],[sp,#4*(32+2)] @ save len + vmov $a2,$a0 + str r12, [sp,#4*(32+1)] @ save inp + vmov $b1,$b0 + str r14, [sp,#4*(32+0)] @ save out + vmov $b2,$b0 +.Loop_neon_enter: + ldr @t[3], [sp,#4*(15)] + vadd.i32 $d1,$d0,$t0 @ counter+1 + ldr @x[12],[sp,#4*(12)] @ modulo-scheduled load + vmov $c1,$c0 + ldr @t[2], [sp,#4*(13)] + vmov $c2,$c0 + ldr @x[14],[sp,#4*(14)] + vadd.i32 $d2,$d1,$t0 @ counter+2 + str @t[3], [sp,#4*(16+15)] + mov @t[3],#10 + add @x[12],@x[12],#3 @ counter+3 + b .Loop_neon + +.align 4 +.Loop_neon: + subs @t[3],@t[3],#1 +___ + my @thread0=&NEONROUND($a0,$b0,$c0,$d0,$t0,0); + my @thread1=&NEONROUND($a1,$b1,$c1,$d1,$t1,0); + my @thread2=&NEONROUND($a2,$b2,$c2,$d2,$t2,0); + my @thread3=&ROUND(0,4,8,12); + + foreach (@thread0) { + eval; eval(shift(@thread3)); + eval(shift(@thread1)); eval(shift(@thread3)); + eval(shift(@thread2)); eval(shift(@thread3)); + } + + @thread0=&NEONROUND($a0,$b0,$c0,$d0,$t0,1); + @thread1=&NEONROUND($a1,$b1,$c1,$d1,$t1,1); + @thread2=&NEONROUND($a2,$b2,$c2,$d2,$t2,1); + @thread3=&ROUND(0,5,10,15); + + foreach (@thread0) { + eval; eval(shift(@thread3)); + eval(shift(@thread1)); eval(shift(@thread3)); + eval(shift(@thread2)); eval(shift(@thread3)); + } +$code.=<<___; + bne .Loop_neon + + add @t[3],sp,#32 + vld1.32 {$t0-$t1},[sp] @ load key material + vld1.32 {$t2-$t3},[@t[3]] + + ldr @t[3],[sp,#4*(32+2)] @ load len + + str @t[0], [sp,#4*(16+8)] @ modulo-scheduled store + str @t[1], [sp,#4*(16+9)] + str @x[12],[sp,#4*(16+12)] + str @t[2], [sp,#4*(16+13)] + str 
@x[14],[sp,#4*(16+14)] + + @ at this point we have first half of 512-bit result in + @ @x[0-7] and second half at sp+4*(16+8) + + ldr r12,[sp,#4*(32+1)] @ load inp + ldr r14,[sp,#4*(32+0)] @ load out + + vadd.i32 $a0,$a0,$t0 @ accumulate key material + vadd.i32 $a1,$a1,$t0 + vadd.i32 $a2,$a2,$t0 + vldr $t0#lo,[sp,#4*(16+0)] @ one + + vadd.i32 $b0,$b0,$t1 + vadd.i32 $b1,$b1,$t1 + vadd.i32 $b2,$b2,$t1 + vldr $t1#lo,[sp,#4*(16+2)] @ two + + vadd.i32 $c0,$c0,$t2 + vadd.i32 $c1,$c1,$t2 + vadd.i32 $c2,$c2,$t2 + vadd.i32 $d1#lo,$d1#lo,$t0#lo @ counter+1 + vadd.i32 $d2#lo,$d2#lo,$t1#lo @ counter+2 + + vadd.i32 $d0,$d0,$t3 + vadd.i32 $d1,$d1,$t3 + vadd.i32 $d2,$d2,$t3 + + cmp @t[3],#64*4 + blo .Ltail_neon + + vld1.8 {$t0-$t1},[r12]! @ load input + mov @t[3],sp + vld1.8 {$t2-$t3},[r12]! + veor $a0,$a0,$t0 @ xor with input + veor $b0,$b0,$t1 + vld1.8 {$t0-$t1},[r12]! + veor $c0,$c0,$t2 + veor $d0,$d0,$t3 + vld1.8 {$t2-$t3},[r12]! + + veor $a1,$a1,$t0 + vst1.8 {$a0-$b0},[r14]! @ store output + veor $b1,$b1,$t1 + vld1.8 {$t0-$t1},[r12]! + veor $c1,$c1,$t2 + vst1.8 {$c0-$d0},[r14]! + veor $d1,$d1,$t3 + vld1.8 {$t2-$t3},[r12]! + + veor $a2,$a2,$t0 + vld1.32 {$a0-$b0},[@t[3]]! @ load for next iteration + veor $t0#hi,$t0#hi,$t0#hi + vldr $t0#lo,[sp,#4*(16+4)] @ four + veor $b2,$b2,$t1 + vld1.32 {$c0-$d0},[@t[3]] + veor $c2,$c2,$t2 + vst1.8 {$a1-$b1},[r14]! + veor $d2,$d2,$t3 + vst1.8 {$c1-$d1},[r14]! + + vadd.i32 $d0#lo,$d0#lo,$t0#lo @ next counter value + vldr $t0#lo,[sp,#4*(16+0)] @ one + + ldmia sp,{@t[0]-@t[3]} @ load key material + add @x[0],@x[0],@t[0] @ accumulate key material + ldr @t[0],[r12],#16 @ load input + vst1.8 {$a2-$b2},[r14]! + add @x[1],@x[1],@t[1] + ldr @t[1],[r12,#-12] + vst1.8 {$c2-$d2},[r14]! + add @x[2],@x[2],@t[2] + ldr @t[2],[r12,#-8] + add @x[3],@x[3],@t[3] + ldr @t[3],[r12,#-4] +# ifdef __ARMEB__ + rev @x[0],@x[0] + rev @x[1],@x[1] + rev @x[2],@x[2] + rev @x[3],@x[3] +# endif + eor @x[0],@x[0],@t[0] @ xor with input + add @t[0],sp,#4*(4) + eor @x[1],@x[1],@t[1] + str @x[0],[r14],#16 @ store output + eor @x[2],@x[2],@t[2] + str @x[1],[r14,#-12] + eor @x[3],@x[3],@t[3] + ldmia @t[0],{@t[0]-@t[3]} @ load key material + str @x[2],[r14,#-8] + str @x[3],[r14,#-4] + + add @x[4],@x[4],@t[0] @ accumulate key material + ldr @t[0],[r12],#16 @ load input + add @x[5],@x[5],@t[1] + ldr @t[1],[r12,#-12] + add @x[6],@x[6],@t[2] + ldr @t[2],[r12,#-8] + add @x[7],@x[7],@t[3] + ldr @t[3],[r12,#-4] +# ifdef __ARMEB__ + rev @x[4],@x[4] + rev @x[5],@x[5] + rev @x[6],@x[6] + rev @x[7],@x[7] +# endif + eor @x[4],@x[4],@t[0] + add @t[0],sp,#4*(8) + eor @x[5],@x[5],@t[1] + str @x[4],[r14],#16 @ store output + eor @x[6],@x[6],@t[2] + str @x[5],[r14,#-12] + eor @x[7],@x[7],@t[3] + ldmia @t[0],{@t[0]-@t[3]} @ load key material + str @x[6],[r14,#-8] + add @x[0],sp,#4*(16+8) + str @x[7],[r14,#-4] + + ldmia @x[0],{@x[0]-@x[7]} @ load second half + + add @x[0],@x[0],@t[0] @ accumulate key material + ldr @t[0],[r12],#16 @ load input + add @x[1],@x[1],@t[1] + ldr @t[1],[r12,#-12] +# ifdef __thumb2__ + it hi +# endif + strhi @t[2],[sp,#4*(16+10)] @ copy "@x[10]" while at it + add @x[2],@x[2],@t[2] + ldr @t[2],[r12,#-8] +# ifdef __thumb2__ + it hi +# endif + strhi @t[3],[sp,#4*(16+11)] @ copy "@x[11]" while at it + add @x[3],@x[3],@t[3] + ldr @t[3],[r12,#-4] +# ifdef __ARMEB__ + rev @x[0],@x[0] + rev @x[1],@x[1] + rev @x[2],@x[2] + rev @x[3],@x[3] +# endif + eor @x[0],@x[0],@t[0] + add @t[0],sp,#4*(12) + eor @x[1],@x[1],@t[1] + str @x[0],[r14],#16 @ store output + eor @x[2],@x[2],@t[2] + str @x[1],[r14,#-12] + eor 
@x[3],@x[3],@t[3] + ldmia @t[0],{@t[0]-@t[3]} @ load key material + str @x[2],[r14,#-8] + str @x[3],[r14,#-4] + + add @x[4],@x[4],@t[0] @ accumulate key material + add @t[0],@t[0],#4 @ next counter value + add @x[5],@x[5],@t[1] + str @t[0],[sp,#4*(12)] @ save next counter value + ldr @t[0],[r12],#16 @ load input + add @x[6],@x[6],@t[2] + add @x[4],@x[4],#3 @ counter+3 + ldr @t[1],[r12,#-12] + add @x[7],@x[7],@t[3] + ldr @t[2],[r12,#-8] + ldr @t[3],[r12,#-4] +# ifdef __ARMEB__ + rev @x[4],@x[4] + rev @x[5],@x[5] + rev @x[6],@x[6] + rev @x[7],@x[7] +# endif + eor @x[4],@x[4],@t[0] +# ifdef __thumb2__ + it hi +# endif + ldrhi @t[0],[sp,#4*(32+2)] @ re-load len + eor @x[5],@x[5],@t[1] + eor @x[6],@x[6],@t[2] + str @x[4],[r14],#16 @ store output + eor @x[7],@x[7],@t[3] + str @x[5],[r14,#-12] + sub @t[3],@t[0],#64*4 @ len-=64*4 + str @x[6],[r14,#-8] + str @x[7],[r14,#-4] + bhi .Loop_neon_outer + + b .Ldone_neon + +.align 4 +.Lbreak_neon: + @ harmonize NEON and integer-only stack frames: load data + @ from NEON frame, but save to integer-only one; distance + @ between the two is 4*(32+4+16-32)=4*(20). + + str @t[3], [sp,#4*(20+32+2)] @ save len + add @t[3],sp,#4*(32+4) + str r12, [sp,#4*(20+32+1)] @ save inp + str r14, [sp,#4*(20+32+0)] @ save out + + ldr @x[12],[sp,#4*(16+10)] + ldr @x[14],[sp,#4*(16+11)] + vldmia @t[3],{d8-d15} @ fulfill ABI requirement + str @x[12],[sp,#4*(20+16+10)] @ copy "@x[10]" + str @x[14],[sp,#4*(20+16+11)] @ copy "@x[11]" + + ldr @t[3], [sp,#4*(15)] + ldr @x[12],[sp,#4*(12)] @ modulo-scheduled load + ldr @t[2], [sp,#4*(13)] + ldr @x[14],[sp,#4*(14)] + str @t[3], [sp,#4*(20+16+15)] + add @t[3],sp,#4*(20) + vst1.32 {$a0-$b0},[@t[3]]! @ copy key + add sp,sp,#4*(20) @ switch frame + vst1.32 {$c0-$d0},[@t[3]] + mov @t[3],#10 + b .Loop @ go integer-only + +.align 4 +.Ltail_neon: + cmp @t[3],#64*3 + bhs .L192_or_more_neon + cmp @t[3],#64*2 + bhs .L128_or_more_neon + cmp @t[3],#64*1 + bhs .L64_or_more_neon + + add @t[0],sp,#4*(8) + vst1.8 {$a0-$b0},[sp] + add @t[2],sp,#4*(0) + vst1.8 {$c0-$d0},[@t[0]] + b .Loop_tail_neon + +.align 4 +.L64_or_more_neon: + vld1.8 {$t0-$t1},[r12]! + vld1.8 {$t2-$t3},[r12]! + veor $a0,$a0,$t0 + veor $b0,$b0,$t1 + veor $c0,$c0,$t2 + veor $d0,$d0,$t3 + vst1.8 {$a0-$b0},[r14]! + vst1.8 {$c0-$d0},[r14]! + + beq .Ldone_neon + + add @t[0],sp,#4*(8) + vst1.8 {$a1-$b1},[sp] + add @t[2],sp,#4*(0) + vst1.8 {$c1-$d1},[@t[0]] + sub @t[3],@t[3],#64*1 @ len-=64*1 + b .Loop_tail_neon + +.align 4 +.L128_or_more_neon: + vld1.8 {$t0-$t1},[r12]! + vld1.8 {$t2-$t3},[r12]! + veor $a0,$a0,$t0 + veor $b0,$b0,$t1 + vld1.8 {$t0-$t1},[r12]! + veor $c0,$c0,$t2 + veor $d0,$d0,$t3 + vld1.8 {$t2-$t3},[r12]! + + veor $a1,$a1,$t0 + veor $b1,$b1,$t1 + vst1.8 {$a0-$b0},[r14]! + veor $c1,$c1,$t2 + vst1.8 {$c0-$d0},[r14]! + veor $d1,$d1,$t3 + vst1.8 {$a1-$b1},[r14]! + vst1.8 {$c1-$d1},[r14]! + + beq .Ldone_neon + + add @t[0],sp,#4*(8) + vst1.8 {$a2-$b2},[sp] + add @t[2],sp,#4*(0) + vst1.8 {$c2-$d2},[@t[0]] + sub @t[3],@t[3],#64*2 @ len-=64*2 + b .Loop_tail_neon + +.align 4 +.L192_or_more_neon: + vld1.8 {$t0-$t1},[r12]! + vld1.8 {$t2-$t3},[r12]! + veor $a0,$a0,$t0 + veor $b0,$b0,$t1 + vld1.8 {$t0-$t1},[r12]! + veor $c0,$c0,$t2 + veor $d0,$d0,$t3 + vld1.8 {$t2-$t3},[r12]! + + veor $a1,$a1,$t0 + veor $b1,$b1,$t1 + vld1.8 {$t0-$t1},[r12]! + veor $c1,$c1,$t2 + vst1.8 {$a0-$b0},[r14]! + veor $d1,$d1,$t3 + vld1.8 {$t2-$t3},[r12]! + + veor $a2,$a2,$t0 + vst1.8 {$c0-$d0},[r14]! + veor $b2,$b2,$t1 + vst1.8 {$a1-$b1},[r14]! + veor $c2,$c2,$t2 + vst1.8 {$c1-$d1},[r14]! 
+ veor $d2,$d2,$t3 + vst1.8 {$a2-$b2},[r14]! + vst1.8 {$c2-$d2},[r14]! + + beq .Ldone_neon + + ldmia sp,{@t[0]-@t[3]} @ load key material + add @x[0],@x[0],@t[0] @ accumulate key material + add @t[0],sp,#4*(4) + add @x[1],@x[1],@t[1] + add @x[2],@x[2],@t[2] + add @x[3],@x[3],@t[3] + ldmia @t[0],{@t[0]-@t[3]} @ load key material + + add @x[4],@x[4],@t[0] @ accumulate key material + add @t[0],sp,#4*(8) + add @x[5],@x[5],@t[1] + add @x[6],@x[6],@t[2] + add @x[7],@x[7],@t[3] + ldmia @t[0],{@t[0]-@t[3]} @ load key material +# ifdef __ARMEB__ + rev @x[0],@x[0] + rev @x[1],@x[1] + rev @x[2],@x[2] + rev @x[3],@x[3] + rev @x[4],@x[4] + rev @x[5],@x[5] + rev @x[6],@x[6] + rev @x[7],@x[7] +# endif + stmia sp,{@x[0]-@x[7]} + add @x[0],sp,#4*(16+8) + + ldmia @x[0],{@x[0]-@x[7]} @ load second half + + add @x[0],@x[0],@t[0] @ accumulate key material + add @t[0],sp,#4*(12) + add @x[1],@x[1],@t[1] + add @x[2],@x[2],@t[2] + add @x[3],@x[3],@t[3] + ldmia @t[0],{@t[0]-@t[3]} @ load key material + + add @x[4],@x[4],@t[0] @ accumulate key material + add @t[0],sp,#4*(8) + add @x[5],@x[5],@t[1] + add @x[4],@x[4],#3 @ counter+3 + add @x[6],@x[6],@t[2] + add @x[7],@x[7],@t[3] + ldr @t[3],[sp,#4*(32+2)] @ re-load len +# ifdef __ARMEB__ + rev @x[0],@x[0] + rev @x[1],@x[1] + rev @x[2],@x[2] + rev @x[3],@x[3] + rev @x[4],@x[4] + rev @x[5],@x[5] + rev @x[6],@x[6] + rev @x[7],@x[7] +# endif + stmia @t[0],{@x[0]-@x[7]} + add @t[2],sp,#4*(0) + sub @t[3],@t[3],#64*3 @ len-=64*3 + +.Loop_tail_neon: + ldrb @t[0],[@t[2]],#1 @ read buffer on stack + ldrb @t[1],[r12],#1 @ read input + subs @t[3],@t[3],#1 + eor @t[0],@t[0],@t[1] + strb @t[0],[r14],#1 @ store output + bne .Loop_tail_neon + +.Ldone_neon: + add sp,sp,#4*(32+4) + vldmia sp,{d8-d15} + add sp,sp,#4*(16+3) + ldmia sp!,{r4-r11,pc} +.size ChaCha20_ctr32_neon,.-ChaCha20_ctr32_neon +#endif +___ +}}} + +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/geo; + + s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo; + + print $_,"\n"; +} +close STDOUT or die "error closing STDOUT: $!"; diff --git a/ring-0.17.14/crypto/chacha/asm/chacha-armv8.pl b/ring-0.17.14/crypto/chacha/asm/chacha-armv8.pl new file mode 100644 index 0000000000..8f2fe30a7c --- /dev/null +++ b/ring-0.17.14/crypto/chacha/asm/chacha-armv8.pl @@ -0,0 +1,1130 @@ +#! /usr/bin/env perl +# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. +# ==================================================================== +# +# June 2015 +# +# ChaCha20 for ARMv8. +# +# Performance in cycles per byte out of large buffer. 
+# +# IALU/gcc-4.9 3xNEON+1xIALU 6xNEON+2xIALU +# +# Apple A7 5.50/+49% 3.33 1.70 +# Cortex-A53 8.40/+80% 4.72 4.72(*) +# Cortex-A57 8.06/+43% 4.90 4.43(**) +# Denver 4.50/+82% 2.63 2.67(*) +# X-Gene 9.50/+46% 8.82 8.89(*) +# Mongoose 8.00/+44% 3.64 3.25 +# Kryo 8.17/+50% 4.83 4.65 +# +# (*) it's expected that doubling interleave factor doesn't help +# all processors, only those with higher NEON latency and +# higher instruction issue rate; +# (**) expected improvement was actually higher; + +$flavour=shift; +$output=shift; + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or +die "can't locate arm-xlate.pl"; + +open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; +*STDOUT=*OUT; + +sub AUTOLOAD() # thunk [simplified] x86-style perlasm +{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./; + my $arg = pop; + $arg = "#$arg" if ($arg*1 eq $arg); + $code .= "\t$opcode\t".join(',',@_,$arg)."\n"; +} + +my ($out,$inp,$len,$key,$ctr) = map("x$_",(0..4)); + +my @x=map("x$_",(5..17,19..21)); +my @d=map("x$_",(22..28,30)); + +sub ROUND { +my ($a0,$b0,$c0,$d0)=@_; +my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); +my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1)); +my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); + + ( + "&add_32 (@x[$a0],@x[$a0],@x[$b0])", + "&add_32 (@x[$a1],@x[$a1],@x[$b1])", + "&add_32 (@x[$a2],@x[$a2],@x[$b2])", + "&add_32 (@x[$a3],@x[$a3],@x[$b3])", + "&eor_32 (@x[$d0],@x[$d0],@x[$a0])", + "&eor_32 (@x[$d1],@x[$d1],@x[$a1])", + "&eor_32 (@x[$d2],@x[$d2],@x[$a2])", + "&eor_32 (@x[$d3],@x[$d3],@x[$a3])", + "&ror_32 (@x[$d0],@x[$d0],16)", + "&ror_32 (@x[$d1],@x[$d1],16)", + "&ror_32 (@x[$d2],@x[$d2],16)", + "&ror_32 (@x[$d3],@x[$d3],16)", + + "&add_32 (@x[$c0],@x[$c0],@x[$d0])", + "&add_32 (@x[$c1],@x[$c1],@x[$d1])", + "&add_32 (@x[$c2],@x[$c2],@x[$d2])", + "&add_32 (@x[$c3],@x[$c3],@x[$d3])", + "&eor_32 (@x[$b0],@x[$b0],@x[$c0])", + "&eor_32 (@x[$b1],@x[$b1],@x[$c1])", + "&eor_32 (@x[$b2],@x[$b2],@x[$c2])", + "&eor_32 (@x[$b3],@x[$b3],@x[$c3])", + "&ror_32 (@x[$b0],@x[$b0],20)", + "&ror_32 (@x[$b1],@x[$b1],20)", + "&ror_32 (@x[$b2],@x[$b2],20)", + "&ror_32 (@x[$b3],@x[$b3],20)", + + "&add_32 (@x[$a0],@x[$a0],@x[$b0])", + "&add_32 (@x[$a1],@x[$a1],@x[$b1])", + "&add_32 (@x[$a2],@x[$a2],@x[$b2])", + "&add_32 (@x[$a3],@x[$a3],@x[$b3])", + "&eor_32 (@x[$d0],@x[$d0],@x[$a0])", + "&eor_32 (@x[$d1],@x[$d1],@x[$a1])", + "&eor_32 (@x[$d2],@x[$d2],@x[$a2])", + "&eor_32 (@x[$d3],@x[$d3],@x[$a3])", + "&ror_32 (@x[$d0],@x[$d0],24)", + "&ror_32 (@x[$d1],@x[$d1],24)", + "&ror_32 (@x[$d2],@x[$d2],24)", + "&ror_32 (@x[$d3],@x[$d3],24)", + + "&add_32 (@x[$c0],@x[$c0],@x[$d0])", + "&add_32 (@x[$c1],@x[$c1],@x[$d1])", + "&add_32 (@x[$c2],@x[$c2],@x[$d2])", + "&add_32 (@x[$c3],@x[$c3],@x[$d3])", + "&eor_32 (@x[$b0],@x[$b0],@x[$c0])", + "&eor_32 (@x[$b1],@x[$b1],@x[$c1])", + "&eor_32 (@x[$b2],@x[$b2],@x[$c2])", + "&eor_32 (@x[$b3],@x[$b3],@x[$c3])", + "&ror_32 (@x[$b0],@x[$b0],25)", + "&ror_32 (@x[$b1],@x[$b1],25)", + "&ror_32 (@x[$b2],@x[$b2],25)", + "&ror_32 (@x[$b3],@x[$b3],25)" + ); +} + +$code.=<<___; +.section .rodata + +.align 5 +.Lsigma: +.quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral +.Lone: +.long 1,0,0,0 +.asciz "ChaCha20 for ARMv8, CRYPTOGAMS by " + +.text + +.globl ChaCha20_ctr32_nohw +.type ChaCha20_ctr32_nohw,%function +.align 5 +ChaCha20_ctr32_nohw: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-96]! 
+ add x29,sp,#0 + + adrp @x[0],:pg_hi21:.Lsigma + add @x[0],@x[0],:lo12:.Lsigma + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#64 + + ldp @d[0],@d[1],[@x[0]] // load sigma + ldp @d[2],@d[3],[$key] // load key + ldp @d[4],@d[5],[$key,#16] + ldp @d[6],@d[7],[$ctr] // load counter +#ifdef __AARCH64EB__ + ror @d[2],@d[2],#32 + ror @d[3],@d[3],#32 + ror @d[4],@d[4],#32 + ror @d[5],@d[5],#32 + ror @d[6],@d[6],#32 + ror @d[7],@d[7],#32 +#endif + +.Loop_outer: + mov.32 @x[0],@d[0] // unpack key block + lsr @x[1],@d[0],#32 + mov.32 @x[2],@d[1] + lsr @x[3],@d[1],#32 + mov.32 @x[4],@d[2] + lsr @x[5],@d[2],#32 + mov.32 @x[6],@d[3] + lsr @x[7],@d[3],#32 + mov.32 @x[8],@d[4] + lsr @x[9],@d[4],#32 + mov.32 @x[10],@d[5] + lsr @x[11],@d[5],#32 + mov.32 @x[12],@d[6] + lsr @x[13],@d[6],#32 + mov.32 @x[14],@d[7] + lsr @x[15],@d[7],#32 + + mov $ctr,#10 + subs $len,$len,#64 +.Loop: + sub $ctr,$ctr,#1 +___ + foreach (&ROUND(0, 4, 8,12)) { eval; } + foreach (&ROUND(0, 5,10,15)) { eval; } +$code.=<<___; + cbnz $ctr,.Loop + + add.32 @x[0],@x[0],@d[0] // accumulate key block + add @x[1],@x[1],@d[0],lsr#32 + add.32 @x[2],@x[2],@d[1] + add @x[3],@x[3],@d[1],lsr#32 + add.32 @x[4],@x[4],@d[2] + add @x[5],@x[5],@d[2],lsr#32 + add.32 @x[6],@x[6],@d[3] + add @x[7],@x[7],@d[3],lsr#32 + add.32 @x[8],@x[8],@d[4] + add @x[9],@x[9],@d[4],lsr#32 + add.32 @x[10],@x[10],@d[5] + add @x[11],@x[11],@d[5],lsr#32 + add.32 @x[12],@x[12],@d[6] + add @x[13],@x[13],@d[6],lsr#32 + add.32 @x[14],@x[14],@d[7] + add @x[15],@x[15],@d[7],lsr#32 + + b.lo .Ltail + + add @x[0],@x[0],@x[1],lsl#32 // pack + add @x[2],@x[2],@x[3],lsl#32 + ldp @x[1],@x[3],[$inp,#0] // load input + add @x[4],@x[4],@x[5],lsl#32 + add @x[6],@x[6],@x[7],lsl#32 + ldp @x[5],@x[7],[$inp,#16] + add @x[8],@x[8],@x[9],lsl#32 + add @x[10],@x[10],@x[11],lsl#32 + ldp @x[9],@x[11],[$inp,#32] + add @x[12],@x[12],@x[13],lsl#32 + add @x[14],@x[14],@x[15],lsl#32 + ldp @x[13],@x[15],[$inp,#48] + add $inp,$inp,#64 +#ifdef __AARCH64EB__ + rev @x[0],@x[0] + rev @x[2],@x[2] + rev @x[4],@x[4] + rev @x[6],@x[6] + rev @x[8],@x[8] + rev @x[10],@x[10] + rev @x[12],@x[12] + rev @x[14],@x[14] +#endif + eor @x[0],@x[0],@x[1] + eor @x[2],@x[2],@x[3] + eor @x[4],@x[4],@x[5] + eor @x[6],@x[6],@x[7] + eor @x[8],@x[8],@x[9] + eor @x[10],@x[10],@x[11] + eor @x[12],@x[12],@x[13] + eor @x[14],@x[14],@x[15] + + stp @x[0],@x[2],[$out,#0] // store output + add @d[6],@d[6],#1 // increment counter + stp @x[4],@x[6],[$out,#16] + stp @x[8],@x[10],[$out,#32] + stp @x[12],@x[14],[$out,#48] + add $out,$out,#64 + + b.hi .Loop_outer + + ldp x19,x20,[x29,#16] + add sp,sp,#64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +.align 4 +.Ltail: + add $len,$len,#64 +.Less_than_64: + sub $out,$out,#1 + add $inp,$inp,$len + add $out,$out,$len + add $ctr,sp,$len + neg $len,$len + + add @x[0],@x[0],@x[1],lsl#32 // pack + add @x[2],@x[2],@x[3],lsl#32 + add @x[4],@x[4],@x[5],lsl#32 + add @x[6],@x[6],@x[7],lsl#32 + add @x[8],@x[8],@x[9],lsl#32 + add @x[10],@x[10],@x[11],lsl#32 + add @x[12],@x[12],@x[13],lsl#32 + add @x[14],@x[14],@x[15],lsl#32 +#ifdef __AARCH64EB__ + rev @x[0],@x[0] + rev @x[2],@x[2] + rev @x[4],@x[4] + rev @x[6],@x[6] + rev @x[8],@x[8] + rev @x[10],@x[10] + rev @x[12],@x[12] + rev @x[14],@x[14] +#endif + stp @x[0],@x[2],[sp,#0] + stp @x[4],@x[6],[sp,#16] + stp @x[8],@x[10],[sp,#32] + stp @x[12],@x[14],[sp,#48] + +.Loop_tail: 
+ ldrb w10,[$inp,$len] + ldrb w11,[$ctr,$len] + add $len,$len,#1 + eor w10,w10,w11 + strb w10,[$out,$len] + cbnz $len,.Loop_tail + + stp xzr,xzr,[sp,#0] + stp xzr,xzr,[sp,#16] + stp xzr,xzr,[sp,#32] + stp xzr,xzr,[sp,#48] + + ldp x19,x20,[x29,#16] + add sp,sp,#64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size ChaCha20_ctr32_nohw,.-ChaCha20_ctr32_nohw +___ + +{{{ +my ($A0,$B0,$C0,$D0,$A1,$B1,$C1,$D1,$A2,$B2,$C2,$D2,$T0,$T1,$T2,$T3) = + map("v$_.4s",(0..7,16..23)); +my (@K)=map("v$_.4s",(24..30)); +my $ONE="v31.4s"; + +sub NEONROUND { +my $odd = pop; +my ($a,$b,$c,$d,$t)=@_; + + ( + "&add ('$a','$a','$b')", + "&eor ('$d','$d','$a')", + "&rev32_16 ('$d','$d')", # vrot ($d,16) + + "&add ('$c','$c','$d')", + "&eor ('$t','$b','$c')", + "&ushr ('$b','$t',20)", + "&sli ('$b','$t',12)", + + "&add ('$a','$a','$b')", + "&eor ('$t','$d','$a')", + "&ushr ('$d','$t',24)", + "&sli ('$d','$t',8)", + + "&add ('$c','$c','$d')", + "&eor ('$t','$b','$c')", + "&ushr ('$b','$t',25)", + "&sli ('$b','$t',7)", + + "&ext ('$c','$c','$c',8)", + "&ext ('$d','$d','$d',$odd?4:12)", + "&ext ('$b','$b','$b',$odd?12:4)" + ); +} + +$code.=<<___; + +.globl ChaCha20_ctr32_neon +.type ChaCha20_ctr32_neon,%function +.align 5 +ChaCha20_ctr32_neon: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-96]! + add x29,sp,#0 + + adrp @x[0],:pg_hi21:.Lsigma + add @x[0],@x[0],:lo12:.Lsigma + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + cmp $len,#512 + b.hs .L512_or_more_neon + + sub sp,sp,#64 + + ldp @d[0],@d[1],[@x[0]] // load sigma + ld1 {@K[0]},[@x[0]],#16 + ldp @d[2],@d[3],[$key] // load key + ldp @d[4],@d[5],[$key,#16] + ld1 {@K[1],@K[2]},[$key] + ldp @d[6],@d[7],[$ctr] // load counter + ld1 {@K[3]},[$ctr] + ld1 {$ONE},[@x[0]] +#ifdef __AARCH64EB__ + rev64 @K[0],@K[0] + ror @d[2],@d[2],#32 + ror @d[3],@d[3],#32 + ror @d[4],@d[4],#32 + ror @d[5],@d[5],#32 + ror @d[6],@d[6],#32 + ror @d[7],@d[7],#32 +#endif + add @K[3],@K[3],$ONE // += 1 + add @K[4],@K[3],$ONE + add @K[5],@K[4],$ONE + shl $ONE,$ONE,#2 // 1 -> 4 + +.Loop_outer_neon: + mov.32 @x[0],@d[0] // unpack key block + lsr @x[1],@d[0],#32 + mov $A0,@K[0] + mov.32 @x[2],@d[1] + lsr @x[3],@d[1],#32 + mov $A1,@K[0] + mov.32 @x[4],@d[2] + lsr @x[5],@d[2],#32 + mov $A2,@K[0] + mov.32 @x[6],@d[3] + mov $B0,@K[1] + lsr @x[7],@d[3],#32 + mov $B1,@K[1] + mov.32 @x[8],@d[4] + mov $B2,@K[1] + lsr @x[9],@d[4],#32 + mov $D0,@K[3] + mov.32 @x[10],@d[5] + mov $D1,@K[4] + lsr @x[11],@d[5],#32 + mov $D2,@K[5] + mov.32 @x[12],@d[6] + mov $C0,@K[2] + lsr @x[13],@d[6],#32 + mov $C1,@K[2] + mov.32 @x[14],@d[7] + mov $C2,@K[2] + lsr @x[15],@d[7],#32 + + mov $ctr,#10 + subs $len,$len,#256 +.Loop_neon: + sub $ctr,$ctr,#1 +___ + my @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0); + my @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0); + my @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0); + my @thread3=&ROUND(0,4,8,12); + + foreach (@thread0) { + eval; eval(shift(@thread3)); + eval(shift(@thread1)); eval(shift(@thread3)); + eval(shift(@thread2)); eval(shift(@thread3)); + } + + @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1); + @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1); + @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1); + @thread3=&ROUND(0,5,10,15); + + foreach (@thread0) { + eval; eval(shift(@thread3)); + eval(shift(@thread1)); eval(shift(@thread3)); + eval(shift(@thread2)); eval(shift(@thread3)); + } +$code.=<<___; + cbnz $ctr,.Loop_neon + + 
add.32 @x[0],@x[0],@d[0] // accumulate key block + add $A0,$A0,@K[0] + add @x[1],@x[1],@d[0],lsr#32 + add $A1,$A1,@K[0] + add.32 @x[2],@x[2],@d[1] + add $A2,$A2,@K[0] + add @x[3],@x[3],@d[1],lsr#32 + add $C0,$C0,@K[2] + add.32 @x[4],@x[4],@d[2] + add $C1,$C1,@K[2] + add @x[5],@x[5],@d[2],lsr#32 + add $C2,$C2,@K[2] + add.32 @x[6],@x[6],@d[3] + add $D0,$D0,@K[3] + add @x[7],@x[7],@d[3],lsr#32 + add.32 @x[8],@x[8],@d[4] + add $D1,$D1,@K[4] + add @x[9],@x[9],@d[4],lsr#32 + add.32 @x[10],@x[10],@d[5] + add $D2,$D2,@K[5] + add @x[11],@x[11],@d[5],lsr#32 + add.32 @x[12],@x[12],@d[6] + add $B0,$B0,@K[1] + add @x[13],@x[13],@d[6],lsr#32 + add.32 @x[14],@x[14],@d[7] + add $B1,$B1,@K[1] + add @x[15],@x[15],@d[7],lsr#32 + add $B2,$B2,@K[1] + + b.lo .Ltail_neon + + add @x[0],@x[0],@x[1],lsl#32 // pack + add @x[2],@x[2],@x[3],lsl#32 + ldp @x[1],@x[3],[$inp,#0] // load input + add @x[4],@x[4],@x[5],lsl#32 + add @x[6],@x[6],@x[7],lsl#32 + ldp @x[5],@x[7],[$inp,#16] + add @x[8],@x[8],@x[9],lsl#32 + add @x[10],@x[10],@x[11],lsl#32 + ldp @x[9],@x[11],[$inp,#32] + add @x[12],@x[12],@x[13],lsl#32 + add @x[14],@x[14],@x[15],lsl#32 + ldp @x[13],@x[15],[$inp,#48] + add $inp,$inp,#64 +#ifdef __AARCH64EB__ + rev @x[0],@x[0] + rev @x[2],@x[2] + rev @x[4],@x[4] + rev @x[6],@x[6] + rev @x[8],@x[8] + rev @x[10],@x[10] + rev @x[12],@x[12] + rev @x[14],@x[14] +#endif + ld1.8 {$T0-$T3},[$inp],#64 + eor @x[0],@x[0],@x[1] + eor @x[2],@x[2],@x[3] + eor @x[4],@x[4],@x[5] + eor @x[6],@x[6],@x[7] + eor @x[8],@x[8],@x[9] + eor $A0,$A0,$T0 + eor @x[10],@x[10],@x[11] + eor $B0,$B0,$T1 + eor @x[12],@x[12],@x[13] + eor $C0,$C0,$T2 + eor @x[14],@x[14],@x[15] + eor $D0,$D0,$T3 + ld1.8 {$T0-$T3},[$inp],#64 + + stp @x[0],@x[2],[$out,#0] // store output + add @d[6],@d[6],#4 // increment counter + stp @x[4],@x[6],[$out,#16] + add @K[3],@K[3],$ONE // += 4 + stp @x[8],@x[10],[$out,#32] + add @K[4],@K[4],$ONE + stp @x[12],@x[14],[$out,#48] + add @K[5],@K[5],$ONE + add $out,$out,#64 + + st1.8 {$A0-$D0},[$out],#64 + ld1.8 {$A0-$D0},[$inp],#64 + + eor $A1,$A1,$T0 + eor $B1,$B1,$T1 + eor $C1,$C1,$T2 + eor $D1,$D1,$T3 + st1.8 {$A1-$D1},[$out],#64 + + eor $A2,$A2,$A0 + eor $B2,$B2,$B0 + eor $C2,$C2,$C0 + eor $D2,$D2,$D0 + st1.8 {$A2-$D2},[$out],#64 + + b.hi .Loop_outer_neon + + ldp x19,x20,[x29,#16] + add sp,sp,#64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +.Ltail_neon: + add $len,$len,#256 + cmp $len,#64 + b.lo .Less_than_64 + + add @x[0],@x[0],@x[1],lsl#32 // pack + add @x[2],@x[2],@x[3],lsl#32 + ldp @x[1],@x[3],[$inp,#0] // load input + add @x[4],@x[4],@x[5],lsl#32 + add @x[6],@x[6],@x[7],lsl#32 + ldp @x[5],@x[7],[$inp,#16] + add @x[8],@x[8],@x[9],lsl#32 + add @x[10],@x[10],@x[11],lsl#32 + ldp @x[9],@x[11],[$inp,#32] + add @x[12],@x[12],@x[13],lsl#32 + add @x[14],@x[14],@x[15],lsl#32 + ldp @x[13],@x[15],[$inp,#48] + add $inp,$inp,#64 +#ifdef __AARCH64EB__ + rev @x[0],@x[0] + rev @x[2],@x[2] + rev @x[4],@x[4] + rev @x[6],@x[6] + rev @x[8],@x[8] + rev @x[10],@x[10] + rev @x[12],@x[12] + rev @x[14],@x[14] +#endif + eor @x[0],@x[0],@x[1] + eor @x[2],@x[2],@x[3] + eor @x[4],@x[4],@x[5] + eor @x[6],@x[6],@x[7] + eor @x[8],@x[8],@x[9] + eor @x[10],@x[10],@x[11] + eor @x[12],@x[12],@x[13] + eor @x[14],@x[14],@x[15] + + stp @x[0],@x[2],[$out,#0] // store output + add @d[6],@d[6],#4 // increment counter + stp @x[4],@x[6],[$out,#16] + stp @x[8],@x[10],[$out,#32] + stp @x[12],@x[14],[$out,#48] + add $out,$out,#64 + b.eq .Ldone_neon + sub 
$len,$len,#64 + cmp $len,#64 + b.lo .Less_than_128 + + ld1.8 {$T0-$T3},[$inp],#64 + eor $A0,$A0,$T0 + eor $B0,$B0,$T1 + eor $C0,$C0,$T2 + eor $D0,$D0,$T3 + st1.8 {$A0-$D0},[$out],#64 + b.eq .Ldone_neon + sub $len,$len,#64 + cmp $len,#64 + b.lo .Less_than_192 + + ld1.8 {$T0-$T3},[$inp],#64 + eor $A1,$A1,$T0 + eor $B1,$B1,$T1 + eor $C1,$C1,$T2 + eor $D1,$D1,$T3 + st1.8 {$A1-$D1},[$out],#64 + b.eq .Ldone_neon + sub $len,$len,#64 + + st1.8 {$A2-$D2},[sp] + b .Last_neon + +.Less_than_128: + st1.8 {$A0-$D0},[sp] + b .Last_neon +.Less_than_192: + st1.8 {$A1-$D1},[sp] + b .Last_neon + +.align 4 +.Last_neon: + sub $out,$out,#1 + add $inp,$inp,$len + add $out,$out,$len + add $ctr,sp,$len + neg $len,$len + +.Loop_tail_neon: + ldrb w10,[$inp,$len] + ldrb w11,[$ctr,$len] + add $len,$len,#1 + eor w10,w10,w11 + strb w10,[$out,$len] + cbnz $len,.Loop_tail_neon + + stp xzr,xzr,[sp,#0] + stp xzr,xzr,[sp,#16] + stp xzr,xzr,[sp,#32] + stp xzr,xzr,[sp,#48] + +.Ldone_neon: + ldp x19,x20,[x29,#16] + add sp,sp,#64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size ChaCha20_ctr32_neon,.-ChaCha20_ctr32_neon +___ +{ +my ($T0,$T1,$T2,$T3,$T4,$T5)=@K; +my ($A0,$B0,$C0,$D0,$A1,$B1,$C1,$D1,$A2,$B2,$C2,$D2, + $A3,$B3,$C3,$D3,$A4,$B4,$C4,$D4,$A5,$B5,$C5,$D5) = map("v$_.4s",(0..23)); + +$code.=<<___; +.type ChaCha20_512_neon,%function +.align 5 +ChaCha20_512_neon: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-96]! + add x29,sp,#0 + + adrp @x[0],:pg_hi21:.Lsigma + add @x[0],@x[0],:lo12:.Lsigma + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + +.L512_or_more_neon: + sub sp,sp,#128+64 + + ldp @d[0],@d[1],[@x[0]] // load sigma + ld1 {@K[0]},[@x[0]],#16 + ldp @d[2],@d[3],[$key] // load key + ldp @d[4],@d[5],[$key,#16] + ld1 {@K[1],@K[2]},[$key] + ldp @d[6],@d[7],[$ctr] // load counter + ld1 {@K[3]},[$ctr] + ld1 {$ONE},[@x[0]] +#ifdef __AARCH64EB__ + rev64 @K[0],@K[0] + ror @d[2],@d[2],#32 + ror @d[3],@d[3],#32 + ror @d[4],@d[4],#32 + ror @d[5],@d[5],#32 + ror @d[6],@d[6],#32 + ror @d[7],@d[7],#32 +#endif + add @K[3],@K[3],$ONE // += 1 + stp @K[0],@K[1],[sp,#0] // off-load key block, invariant part + add @K[3],@K[3],$ONE // not typo + str @K[2],[sp,#32] + add @K[4],@K[3],$ONE + add @K[5],@K[4],$ONE + add @K[6],@K[5],$ONE + shl $ONE,$ONE,#2 // 1 -> 4 + + stp d8,d9,[sp,#128+0] // meet ABI requirements + stp d10,d11,[sp,#128+16] + stp d12,d13,[sp,#128+32] + stp d14,d15,[sp,#128+48] + + sub $len,$len,#512 // not typo + +.Loop_outer_512_neon: + mov $A0,@K[0] + mov $A1,@K[0] + mov $A2,@K[0] + mov $A3,@K[0] + mov $A4,@K[0] + mov $A5,@K[0] + mov $B0,@K[1] + mov.32 @x[0],@d[0] // unpack key block + mov $B1,@K[1] + lsr @x[1],@d[0],#32 + mov $B2,@K[1] + mov.32 @x[2],@d[1] + mov $B3,@K[1] + lsr @x[3],@d[1],#32 + mov $B4,@K[1] + mov.32 @x[4],@d[2] + mov $B5,@K[1] + lsr @x[5],@d[2],#32 + mov $D0,@K[3] + mov.32 @x[6],@d[3] + mov $D1,@K[4] + lsr @x[7],@d[3],#32 + mov $D2,@K[5] + mov.32 @x[8],@d[4] + mov $D3,@K[6] + lsr @x[9],@d[4],#32 + mov $C0,@K[2] + mov.32 @x[10],@d[5] + mov $C1,@K[2] + lsr @x[11],@d[5],#32 + add $D4,$D0,$ONE // +4 + mov.32 @x[12],@d[6] + add $D5,$D1,$ONE // +4 + lsr @x[13],@d[6],#32 + mov $C2,@K[2] + mov.32 @x[14],@d[7] + mov $C3,@K[2] + lsr @x[15],@d[7],#32 + mov $C4,@K[2] + stp @K[3],@K[4],[sp,#48] // off-load key block, variable part + mov $C5,@K[2] + str @K[5],[sp,#80] + + mov $ctr,#5 + subs $len,$len,#512 +.Loop_upper_neon: + sub 
$ctr,$ctr,#1 +___ + my @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0); + my @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0); + my @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0); + my @thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,0); + my @thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,0); + my @thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,0); + my @thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15)); + my $diff = ($#thread0+1)*6 - $#thread67 - 1; + my $i = 0; + + foreach (@thread0) { + eval; eval(shift(@thread67)); + eval(shift(@thread1)); eval(shift(@thread67)); + eval(shift(@thread2)); eval(shift(@thread67)); + eval(shift(@thread3)); eval(shift(@thread67)); + eval(shift(@thread4)); eval(shift(@thread67)); + eval(shift(@thread5)); eval(shift(@thread67)); + } + + @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1); + @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1); + @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1); + @thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,1); + @thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,1); + @thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,1); + @thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15)); + + foreach (@thread0) { + eval; eval(shift(@thread67)); + eval(shift(@thread1)); eval(shift(@thread67)); + eval(shift(@thread2)); eval(shift(@thread67)); + eval(shift(@thread3)); eval(shift(@thread67)); + eval(shift(@thread4)); eval(shift(@thread67)); + eval(shift(@thread5)); eval(shift(@thread67)); + } +$code.=<<___; + cbnz $ctr,.Loop_upper_neon + + add.32 @x[0],@x[0],@d[0] // accumulate key block + add @x[1],@x[1],@d[0],lsr#32 + add.32 @x[2],@x[2],@d[1] + add @x[3],@x[3],@d[1],lsr#32 + add.32 @x[4],@x[4],@d[2] + add @x[5],@x[5],@d[2],lsr#32 + add.32 @x[6],@x[6],@d[3] + add @x[7],@x[7],@d[3],lsr#32 + add.32 @x[8],@x[8],@d[4] + add @x[9],@x[9],@d[4],lsr#32 + add.32 @x[10],@x[10],@d[5] + add @x[11],@x[11],@d[5],lsr#32 + add.32 @x[12],@x[12],@d[6] + add @x[13],@x[13],@d[6],lsr#32 + add.32 @x[14],@x[14],@d[7] + add @x[15],@x[15],@d[7],lsr#32 + + add @x[0],@x[0],@x[1],lsl#32 // pack + add @x[2],@x[2],@x[3],lsl#32 + ldp @x[1],@x[3],[$inp,#0] // load input + add @x[4],@x[4],@x[5],lsl#32 + add @x[6],@x[6],@x[7],lsl#32 + ldp @x[5],@x[7],[$inp,#16] + add @x[8],@x[8],@x[9],lsl#32 + add @x[10],@x[10],@x[11],lsl#32 + ldp @x[9],@x[11],[$inp,#32] + add @x[12],@x[12],@x[13],lsl#32 + add @x[14],@x[14],@x[15],lsl#32 + ldp @x[13],@x[15],[$inp,#48] + add $inp,$inp,#64 +#ifdef __AARCH64EB__ + rev @x[0],@x[0] + rev @x[2],@x[2] + rev @x[4],@x[4] + rev @x[6],@x[6] + rev @x[8],@x[8] + rev @x[10],@x[10] + rev @x[12],@x[12] + rev @x[14],@x[14] +#endif + eor @x[0],@x[0],@x[1] + eor @x[2],@x[2],@x[3] + eor @x[4],@x[4],@x[5] + eor @x[6],@x[6],@x[7] + eor @x[8],@x[8],@x[9] + eor @x[10],@x[10],@x[11] + eor @x[12],@x[12],@x[13] + eor @x[14],@x[14],@x[15] + + stp @x[0],@x[2],[$out,#0] // store output + add @d[6],@d[6],#1 // increment counter + mov.32 @x[0],@d[0] // unpack key block + lsr @x[1],@d[0],#32 + stp @x[4],@x[6],[$out,#16] + mov.32 @x[2],@d[1] + lsr @x[3],@d[1],#32 + stp @x[8],@x[10],[$out,#32] + mov.32 @x[4],@d[2] + lsr @x[5],@d[2],#32 + stp @x[12],@x[14],[$out,#48] + add $out,$out,#64 + mov.32 @x[6],@d[3] + lsr @x[7],@d[3],#32 + mov.32 @x[8],@d[4] + lsr @x[9],@d[4],#32 + mov.32 @x[10],@d[5] + lsr @x[11],@d[5],#32 + mov.32 @x[12],@d[6] + lsr @x[13],@d[6],#32 + mov.32 @x[14],@d[7] + lsr @x[15],@d[7],#32 + + mov $ctr,#5 +.Loop_lower_neon: + sub $ctr,$ctr,#1 +___ + @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0); + @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0); + @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0); + @thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,0); + 
@thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,0); + @thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,0); + @thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15)); + + foreach (@thread0) { + eval; eval(shift(@thread67)); + eval(shift(@thread1)); eval(shift(@thread67)); + eval(shift(@thread2)); eval(shift(@thread67)); + eval(shift(@thread3)); eval(shift(@thread67)); + eval(shift(@thread4)); eval(shift(@thread67)); + eval(shift(@thread5)); eval(shift(@thread67)); + } + + @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1); + @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1); + @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1); + @thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,1); + @thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,1); + @thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,1); + @thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15)); + + foreach (@thread0) { + eval; eval(shift(@thread67)); + eval(shift(@thread1)); eval(shift(@thread67)); + eval(shift(@thread2)); eval(shift(@thread67)); + eval(shift(@thread3)); eval(shift(@thread67)); + eval(shift(@thread4)); eval(shift(@thread67)); + eval(shift(@thread5)); eval(shift(@thread67)); + } +$code.=<<___; + cbnz $ctr,.Loop_lower_neon + + add.32 @x[0],@x[0],@d[0] // accumulate key block + ldp @K[0],@K[1],[sp,#0] + add @x[1],@x[1],@d[0],lsr#32 + ldp @K[2],@K[3],[sp,#32] + add.32 @x[2],@x[2],@d[1] + ldp @K[4],@K[5],[sp,#64] + add @x[3],@x[3],@d[1],lsr#32 + add $A0,$A0,@K[0] + add.32 @x[4],@x[4],@d[2] + add $A1,$A1,@K[0] + add @x[5],@x[5],@d[2],lsr#32 + add $A2,$A2,@K[0] + add.32 @x[6],@x[6],@d[3] + add $A3,$A3,@K[0] + add @x[7],@x[7],@d[3],lsr#32 + add $A4,$A4,@K[0] + add.32 @x[8],@x[8],@d[4] + add $A5,$A5,@K[0] + add @x[9],@x[9],@d[4],lsr#32 + add $C0,$C0,@K[2] + add.32 @x[10],@x[10],@d[5] + add $C1,$C1,@K[2] + add @x[11],@x[11],@d[5],lsr#32 + add $C2,$C2,@K[2] + add.32 @x[12],@x[12],@d[6] + add $C3,$C3,@K[2] + add @x[13],@x[13],@d[6],lsr#32 + add $C4,$C4,@K[2] + add.32 @x[14],@x[14],@d[7] + add $C5,$C5,@K[2] + add @x[15],@x[15],@d[7],lsr#32 + add $D4,$D4,$ONE // +4 + add @x[0],@x[0],@x[1],lsl#32 // pack + add $D5,$D5,$ONE // +4 + add @x[2],@x[2],@x[3],lsl#32 + add $D0,$D0,@K[3] + ldp @x[1],@x[3],[$inp,#0] // load input + add $D1,$D1,@K[4] + add @x[4],@x[4],@x[5],lsl#32 + add $D2,$D2,@K[5] + add @x[6],@x[6],@x[7],lsl#32 + add $D3,$D3,@K[6] + ldp @x[5],@x[7],[$inp,#16] + add $D4,$D4,@K[3] + add @x[8],@x[8],@x[9],lsl#32 + add $D5,$D5,@K[4] + add @x[10],@x[10],@x[11],lsl#32 + add $B0,$B0,@K[1] + ldp @x[9],@x[11],[$inp,#32] + add $B1,$B1,@K[1] + add @x[12],@x[12],@x[13],lsl#32 + add $B2,$B2,@K[1] + add @x[14],@x[14],@x[15],lsl#32 + add $B3,$B3,@K[1] + ldp @x[13],@x[15],[$inp,#48] + add $B4,$B4,@K[1] + add $inp,$inp,#64 + add $B5,$B5,@K[1] + +#ifdef __AARCH64EB__ + rev @x[0],@x[0] + rev @x[2],@x[2] + rev @x[4],@x[4] + rev @x[6],@x[6] + rev @x[8],@x[8] + rev @x[10],@x[10] + rev @x[12],@x[12] + rev @x[14],@x[14] +#endif + ld1.8 {$T0-$T3},[$inp],#64 + eor @x[0],@x[0],@x[1] + eor @x[2],@x[2],@x[3] + eor @x[4],@x[4],@x[5] + eor @x[6],@x[6],@x[7] + eor @x[8],@x[8],@x[9] + eor $A0,$A0,$T0 + eor @x[10],@x[10],@x[11] + eor $B0,$B0,$T1 + eor @x[12],@x[12],@x[13] + eor $C0,$C0,$T2 + eor @x[14],@x[14],@x[15] + eor $D0,$D0,$T3 + ld1.8 {$T0-$T3},[$inp],#64 + + stp @x[0],@x[2],[$out,#0] // store output + add @d[6],@d[6],#7 // increment counter + stp @x[4],@x[6],[$out,#16] + stp @x[8],@x[10],[$out,#32] + stp @x[12],@x[14],[$out,#48] + add $out,$out,#64 + st1.8 {$A0-$D0},[$out],#64 + + ld1.8 {$A0-$D0},[$inp],#64 + eor $A1,$A1,$T0 + eor $B1,$B1,$T1 + eor $C1,$C1,$T2 + eor $D1,$D1,$T3 + st1.8 {$A1-$D1},[$out],#64 + + ld1.8 
{$A1-$D1},[$inp],#64 + eor $A2,$A2,$A0 + ldp @K[0],@K[1],[sp,#0] + eor $B2,$B2,$B0 + ldp @K[2],@K[3],[sp,#32] + eor $C2,$C2,$C0 + eor $D2,$D2,$D0 + st1.8 {$A2-$D2},[$out],#64 + + ld1.8 {$A2-$D2},[$inp],#64 + eor $A3,$A3,$A1 + eor $B3,$B3,$B1 + eor $C3,$C3,$C1 + eor $D3,$D3,$D1 + st1.8 {$A3-$D3},[$out],#64 + + ld1.8 {$A3-$D3},[$inp],#64 + eor $A4,$A4,$A2 + eor $B4,$B4,$B2 + eor $C4,$C4,$C2 + eor $D4,$D4,$D2 + st1.8 {$A4-$D4},[$out],#64 + + shl $A0,$ONE,#1 // 4 -> 8 + eor $A5,$A5,$A3 + eor $B5,$B5,$B3 + eor $C5,$C5,$C3 + eor $D5,$D5,$D3 + st1.8 {$A5-$D5},[$out],#64 + + add @K[3],@K[3],$A0 // += 8 + add @K[4],@K[4],$A0 + add @K[5],@K[5],$A0 + add @K[6],@K[6],$A0 + + b.hs .Loop_outer_512_neon + + adds $len,$len,#512 + ushr $A0,$ONE,#2 // 4 -> 1 + + ldp d8,d9,[sp,#128+0] // meet ABI requirements + ldp d10,d11,[sp,#128+16] + ldp d12,d13,[sp,#128+32] + ldp d14,d15,[sp,#128+48] + + stp @K[0],$ONE,[sp,#0] // wipe off-load area + stp @K[0],$ONE,[sp,#32] + stp @K[0],$ONE,[sp,#64] + + b.eq .Ldone_512_neon + + cmp $len,#192 + sub @K[3],@K[3],$A0 // -= 1 + sub @K[4],@K[4],$A0 + sub @K[5],@K[5],$A0 + add sp,sp,#128 + b.hs .Loop_outer_neon + + eor @K[1],@K[1],@K[1] + eor @K[2],@K[2],@K[2] + eor @K[3],@K[3],@K[3] + eor @K[4],@K[4],@K[4] + eor @K[5],@K[5],@K[5] + eor @K[6],@K[6],@K[6] + b .Loop_outer + +.Ldone_512_neon: + ldp x19,x20,[x29,#16] + add sp,sp,#128+64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size ChaCha20_512_neon,.-ChaCha20_512_neon +___ +} +}}} + +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/geo; + + (s/\b([a-z]+)\.32\b/$1/ and (s/x([0-9]+)/w$1/g or 1)) or + (m/\b(eor|ext|mov)\b/ and (s/\.4s/\.16b/g or 1)) or + (s/\b((?:ld|st)1)\.8\b/$1/ and (s/\.4s/\.16b/g or 1)) or + (m/\b(ld|st)[rp]\b/ and (s/v([0-9]+)\.4s/q$1/g or 1)) or + (s/\brev32\.16\b/rev32/ and (s/\.4s/\.8h/g or 1)); + + #s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo; + + print $_,"\n"; +} +close STDOUT or die "error closing STDOUT: $!"; # flush diff --git a/ring-0.17.14/crypto/chacha/asm/chacha-x86.pl b/ring-0.17.14/crypto/chacha/asm/chacha-x86.pl new file mode 100644 index 0000000000..06f8ad9f46 --- /dev/null +++ b/ring-0.17.14/crypto/chacha/asm/chacha-x86.pl @@ -0,0 +1,480 @@ +#! /usr/bin/env perl +# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. +# ==================================================================== +# +# January 2015 +# +# ChaCha20 for x86. +# +# Performance in cycles per byte out of large buffer. 
+# +# 1xIALU/gcc 4xSSSE3 +# Pentium 17.5/+80% +# PIII 14.2/+60% +# P4 18.6/+84% +# Core2 9.56/+89% 4.83 +# Westmere 9.50/+45% 3.35 +# Sandy Bridge 10.5/+47% 3.20 +# Haswell 8.15/+50% 2.83 +# Skylake 7.53/+22% 2.75 +# Silvermont 17.4/+36% 8.35 +# Goldmont 13.4/+40% 4.36 +# Sledgehammer 10.2/+54% +# Bulldozer 13.4/+50% 4.38(*) +# +# (*) Bulldozer actually executes 4xXOP code path that delivers 3.55; +# +# Modified from upstream OpenSSL to remove the XOP code. + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +push(@INC,"${dir}","${dir}../../perlasm"); +require "x86asm.pl"; + +$output=pop; +open STDOUT,">$output"; + +&asm_init($ARGV[0]); + +$xmm=$ymm=1; +$gasver=999; # enable everything + +$a="eax"; +($b,$b_)=("ebx","ebp"); +($c,$c_)=("ecx","esi"); +($d,$d_)=("edx","edi"); + +&static_label("ssse3_data"); +&static_label("pic_point"); + +if ($xmm) { +my ($xa,$xa_,$xb,$xb_,$xc,$xc_,$xd,$xd_)=map("xmm$_",(0..7)); +my ($out,$inp,$len)=("edi","esi","ecx"); + +sub QUARTERROUND_SSSE3 { +my ($ai,$bi,$ci,$di,$i)=@_; +my ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+1)&3),($ai,$bi,$ci,$di)); # next +my ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-1)&3),($ai,$bi,$ci,$di)); # previous + + # a b c d + # + # 0 4 8 12 < even round + # 1 5 9 13 + # 2 6 10 14 + # 3 7 11 15 + # 0 5 10 15 < odd round + # 1 6 11 12 + # 2 7 8 13 + # 3 4 9 14 + + if ($i==0) { + my $j=4; + ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-$j--)&3),($ap,$bp,$cp,$dp)); + } elsif ($i==3) { + my $j=0; + ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+$j++)&3),($an,$bn,$cn,$dn)); + } elsif ($i==4) { + my $j=4; + ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_+$j--)&3),($ap,$bp,$cp,$dp)); + } elsif ($i==7) { + my $j=0; + ($an,$bn,$cn,$dn)=map(($_&~3)+(($_-$j++)&3),($an,$bn,$cn,$dn)); + } + + #&paddd ($xa,$xb); # see elsewhere + #&pxor ($xd,$xa); # see elsewhere + &movdqa(&QWP(16*$cp-128,"ebx"),$xc_) if ($ai>0 && $ai<3); + &pshufb ($xd,&QWP(0,"eax")); # rot16 + &movdqa(&QWP(16*$bp-128,"ebx"),$xb_) if ($i!=0); + &paddd ($xc,$xd); + &movdqa($xc_,&QWP(16*$cn-128,"ebx")) if ($ai>0 && $ai<3); + &pxor ($xb,$xc); + &movdqa($xb_,&QWP(16*$bn-128,"ebx")) if ($i<7); + &movdqa ($xa_,$xb); # borrow as temporary + &pslld ($xb,12); + &psrld ($xa_,20); + &por ($xb,$xa_); + &movdqa($xa_,&QWP(16*$an-128,"ebx")); + &paddd ($xa,$xb); + &movdqa($xd_,&QWP(16*$dn-128,"ebx")) if ($di!=$dn); + &pxor ($xd,$xa); + &movdqa (&QWP(16*$ai-128,"ebx"),$xa); + &pshufb ($xd,&QWP(16,"eax")); # rot8 + &paddd ($xc,$xd); + &movdqa (&QWP(16*$di-128,"ebx"),$xd) if ($di!=$dn); + &movdqa ($xd_,$xd) if ($di==$dn); + &pxor ($xb,$xc); + &paddd ($xa_,$xb_) if ($i<7); # elsewhere + &movdqa ($xa,$xb); # borrow as temporary + &pslld ($xb,7); + &psrld ($xa,25); + &pxor ($xd_,$xa_) if ($i<7); # elsewhere + &por ($xb,$xa); + + ($xa,$xa_)=($xa_,$xa); + ($xb,$xb_)=($xb_,$xb); + ($xc,$xc_)=($xc_,$xc); + ($xd,$xd_)=($xd_,$xd); +} + +&function_begin("ChaCha20_ctr32_ssse3"); + &call (&label("pic_point")); +&set_label("pic_point"); + &blindpop("eax"); + + &mov ($out,&wparam(0)); + &mov ($inp,&wparam(1)); + &mov ($len,&wparam(2)); + &mov ("edx",&wparam(3)); # key + &mov ("ebx",&wparam(4)); # counter and nonce + + &mov ("ebp","esp"); + &stack_push (131); + &and ("esp",-64); + &mov (&DWP(512,"esp"),"ebp"); + + &lea ("eax",&DWP(&label("ssse3_data")."-". + &label("pic_point"),"eax")); + &movdqu ("xmm3",&QWP(0,"ebx")); # counter and nonce + +if (defined($gasver) && $gasver>=2.17) { # even though we encode + # pshufb manually, we + # handle only register + # operands, while this + # segment uses memory + # operand... 
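+	# Inputs of at least 256 bytes (64*4) take the four-block interleaved
+	# path below; shorter inputs branch to the single-block "1x" code
+	# further down.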
+ &cmp ($len,64*4); + &jb (&label("1x")); + + &mov (&DWP(512+4,"esp"),"edx"); # offload pointers + &mov (&DWP(512+8,"esp"),"ebx"); + &sub ($len,64*4); # bias len + &lea ("ebp",&DWP(256+128,"esp")); # size optimization + + &movdqu ("xmm7",&QWP(0,"edx")); # key + &pshufd ("xmm0","xmm3",0x00); + &pshufd ("xmm1","xmm3",0x55); + &pshufd ("xmm2","xmm3",0xaa); + &pshufd ("xmm3","xmm3",0xff); + &paddd ("xmm0",&QWP(16*3,"eax")); # fix counters + &pshufd ("xmm4","xmm7",0x00); + &pshufd ("xmm5","xmm7",0x55); + &psubd ("xmm0",&QWP(16*4,"eax")); + &pshufd ("xmm6","xmm7",0xaa); + &pshufd ("xmm7","xmm7",0xff); + &movdqa (&QWP(16*12-128,"ebp"),"xmm0"); + &movdqa (&QWP(16*13-128,"ebp"),"xmm1"); + &movdqa (&QWP(16*14-128,"ebp"),"xmm2"); + &movdqa (&QWP(16*15-128,"ebp"),"xmm3"); + &movdqu ("xmm3",&QWP(16,"edx")); # key + &movdqa (&QWP(16*4-128,"ebp"),"xmm4"); + &movdqa (&QWP(16*5-128,"ebp"),"xmm5"); + &movdqa (&QWP(16*6-128,"ebp"),"xmm6"); + &movdqa (&QWP(16*7-128,"ebp"),"xmm7"); + &movdqa ("xmm7",&QWP(16*2,"eax")); # sigma + &lea ("ebx",&DWP(128,"esp")); # size optimization + + &pshufd ("xmm0","xmm3",0x00); + &pshufd ("xmm1","xmm3",0x55); + &pshufd ("xmm2","xmm3",0xaa); + &pshufd ("xmm3","xmm3",0xff); + &pshufd ("xmm4","xmm7",0x00); + &pshufd ("xmm5","xmm7",0x55); + &pshufd ("xmm6","xmm7",0xaa); + &pshufd ("xmm7","xmm7",0xff); + &movdqa (&QWP(16*8-128,"ebp"),"xmm0"); + &movdqa (&QWP(16*9-128,"ebp"),"xmm1"); + &movdqa (&QWP(16*10-128,"ebp"),"xmm2"); + &movdqa (&QWP(16*11-128,"ebp"),"xmm3"); + &movdqa (&QWP(16*0-128,"ebp"),"xmm4"); + &movdqa (&QWP(16*1-128,"ebp"),"xmm5"); + &movdqa (&QWP(16*2-128,"ebp"),"xmm6"); + &movdqa (&QWP(16*3-128,"ebp"),"xmm7"); + + &lea ($inp,&DWP(128,$inp)); # size optimization + &lea ($out,&DWP(128,$out)); # size optimization + &jmp (&label("outer_loop")); + +&set_label("outer_loop",16); + #&movdqa ("xmm0",&QWP(16*0-128,"ebp")); # copy key material + &movdqa ("xmm1",&QWP(16*1-128,"ebp")); + &movdqa ("xmm2",&QWP(16*2-128,"ebp")); + &movdqa ("xmm3",&QWP(16*3-128,"ebp")); + #&movdqa ("xmm4",&QWP(16*4-128,"ebp")); + &movdqa ("xmm5",&QWP(16*5-128,"ebp")); + &movdqa ("xmm6",&QWP(16*6-128,"ebp")); + &movdqa ("xmm7",&QWP(16*7-128,"ebp")); + #&movdqa (&QWP(16*0-128,"ebx"),"xmm0"); + &movdqa (&QWP(16*1-128,"ebx"),"xmm1"); + &movdqa (&QWP(16*2-128,"ebx"),"xmm2"); + &movdqa (&QWP(16*3-128,"ebx"),"xmm3"); + #&movdqa (&QWP(16*4-128,"ebx"),"xmm4"); + &movdqa (&QWP(16*5-128,"ebx"),"xmm5"); + &movdqa (&QWP(16*6-128,"ebx"),"xmm6"); + &movdqa (&QWP(16*7-128,"ebx"),"xmm7"); + #&movdqa ("xmm0",&QWP(16*8-128,"ebp")); + #&movdqa ("xmm1",&QWP(16*9-128,"ebp")); + &movdqa ("xmm2",&QWP(16*10-128,"ebp")); + &movdqa ("xmm3",&QWP(16*11-128,"ebp")); + &movdqa ("xmm4",&QWP(16*12-128,"ebp")); + &movdqa ("xmm5",&QWP(16*13-128,"ebp")); + &movdqa ("xmm6",&QWP(16*14-128,"ebp")); + &movdqa ("xmm7",&QWP(16*15-128,"ebp")); + &paddd ("xmm4",&QWP(16*4,"eax")); # counter value + #&movdqa (&QWP(16*8-128,"ebx"),"xmm0"); + #&movdqa (&QWP(16*9-128,"ebx"),"xmm1"); + &movdqa (&QWP(16*10-128,"ebx"),"xmm2"); + &movdqa (&QWP(16*11-128,"ebx"),"xmm3"); + &movdqa (&QWP(16*12-128,"ebx"),"xmm4"); + &movdqa (&QWP(16*13-128,"ebx"),"xmm5"); + &movdqa (&QWP(16*14-128,"ebx"),"xmm6"); + &movdqa (&QWP(16*15-128,"ebx"),"xmm7"); + &movdqa (&QWP(16*12-128,"ebp"),"xmm4"); # save counter value + + &movdqa ($xa, &QWP(16*0-128,"ebp")); + &movdqa ($xd, "xmm4"); + &movdqa ($xb_,&QWP(16*4-128,"ebp")); + &movdqa ($xc, &QWP(16*8-128,"ebp")); + &movdqa ($xc_,&QWP(16*9-128,"ebp")); + + &mov ("edx",10); # loop counter + &nop (); + +&set_label("loop",16); + 
&paddd ($xa,$xb_); # elsewhere + &movdqa ($xb,$xb_); + &pxor ($xd,$xa); # elsewhere + &QUARTERROUND_SSSE3(0, 4, 8, 12, 0); + &QUARTERROUND_SSSE3(1, 5, 9, 13, 1); + &QUARTERROUND_SSSE3(2, 6,10, 14, 2); + &QUARTERROUND_SSSE3(3, 7,11, 15, 3); + &QUARTERROUND_SSSE3(0, 5,10, 15, 4); + &QUARTERROUND_SSSE3(1, 6,11, 12, 5); + &QUARTERROUND_SSSE3(2, 7, 8, 13, 6); + &QUARTERROUND_SSSE3(3, 4, 9, 14, 7); + &dec ("edx"); + &jnz (&label("loop")); + + &movdqa (&QWP(16*4-128,"ebx"),$xb_); + &movdqa (&QWP(16*8-128,"ebx"),$xc); + &movdqa (&QWP(16*9-128,"ebx"),$xc_); + &movdqa (&QWP(16*12-128,"ebx"),$xd); + &movdqa (&QWP(16*14-128,"ebx"),$xd_); + + my ($xa0,$xa1,$xa2,$xa3,$xt0,$xt1,$xt2,$xt3)=map("xmm$_",(0..7)); + + #&movdqa ($xa0,&QWP(16*0-128,"ebx")); # it's there + &movdqa ($xa1,&QWP(16*1-128,"ebx")); + &movdqa ($xa2,&QWP(16*2-128,"ebx")); + &movdqa ($xa3,&QWP(16*3-128,"ebx")); + + for($i=0;$i<256;$i+=64) { + &paddd ($xa0,&QWP($i+16*0-128,"ebp")); # accumulate key material + &paddd ($xa1,&QWP($i+16*1-128,"ebp")); + &paddd ($xa2,&QWP($i+16*2-128,"ebp")); + &paddd ($xa3,&QWP($i+16*3-128,"ebp")); + + &movdqa ($xt2,$xa0); # "de-interlace" data + &punpckldq ($xa0,$xa1); + &movdqa ($xt3,$xa2); + &punpckldq ($xa2,$xa3); + &punpckhdq ($xt2,$xa1); + &punpckhdq ($xt3,$xa3); + &movdqa ($xa1,$xa0); + &punpcklqdq ($xa0,$xa2); # "a0" + &movdqa ($xa3,$xt2); + &punpcklqdq ($xt2,$xt3); # "a2" + &punpckhqdq ($xa1,$xa2); # "a1" + &punpckhqdq ($xa3,$xt3); # "a3" + + #($xa2,$xt2)=($xt2,$xa2); + + &movdqu ($xt0,&QWP(64*0-128,$inp)); # load input + &movdqu ($xt1,&QWP(64*1-128,$inp)); + &movdqu ($xa2,&QWP(64*2-128,$inp)); + &movdqu ($xt3,&QWP(64*3-128,$inp)); + &lea ($inp,&QWP($i<192?16:(64*4-16*3),$inp)); + &pxor ($xt0,$xa0); + &movdqa ($xa0,&QWP($i+16*4-128,"ebx")) if ($i<192); + &pxor ($xt1,$xa1); + &movdqa ($xa1,&QWP($i+16*5-128,"ebx")) if ($i<192); + &pxor ($xt2,$xa2); + &movdqa ($xa2,&QWP($i+16*6-128,"ebx")) if ($i<192); + &pxor ($xt3,$xa3); + &movdqa ($xa3,&QWP($i+16*7-128,"ebx")) if ($i<192); + &movdqu (&QWP(64*0-128,$out),$xt0); # store output + &movdqu (&QWP(64*1-128,$out),$xt1); + &movdqu (&QWP(64*2-128,$out),$xt2); + &movdqu (&QWP(64*3-128,$out),$xt3); + &lea ($out,&QWP($i<192?16:(64*4-16*3),$out)); + } + &sub ($len,64*4); + &jnc (&label("outer_loop")); + + &add ($len,64*4); + &jz (&label("done")); + + &mov ("ebx",&DWP(512+8,"esp")); # restore pointers + &lea ($inp,&DWP(-128,$inp)); + &mov ("edx",&DWP(512+4,"esp")); + &lea ($out,&DWP(-128,$out)); + + &movd ("xmm2",&DWP(16*12-128,"ebp")); # counter value + &movdqu ("xmm3",&QWP(0,"ebx")); + &paddd ("xmm2",&QWP(16*6,"eax")); # +four + &pand ("xmm3",&QWP(16*7,"eax")); + &por ("xmm3","xmm2"); # counter value +} +{ +my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("xmm$_",(0..7)); + +sub SSSE3ROUND { # critical path is 20 "SIMD ticks" per round + &paddd ($a,$b); + &pxor ($d,$a); + &pshufb ($d,$rot16); + + &paddd ($c,$d); + &pxor ($b,$c); + &movdqa ($t,$b); + &psrld ($b,20); + &pslld ($t,12); + &por ($b,$t); + + &paddd ($a,$b); + &pxor ($d,$a); + &pshufb ($d,$rot24); + + &paddd ($c,$d); + &pxor ($b,$c); + &movdqa ($t,$b); + &psrld ($b,25); + &pslld ($t,7); + &por ($b,$t); +} + +&set_label("1x"); + &movdqa ($a,&QWP(16*2,"eax")); # sigma + &movdqu ($b,&QWP(0,"edx")); + &movdqu ($c,&QWP(16,"edx")); + #&movdqu ($d,&QWP(0,"ebx")); # already loaded + &movdqa ($rot16,&QWP(0,"eax")); + &movdqa ($rot24,&QWP(16,"eax")); + &mov (&DWP(16*3,"esp"),"ebp"); + + &movdqa (&QWP(16*0,"esp"),$a); + &movdqa (&QWP(16*1,"esp"),$b); + &movdqa (&QWP(16*2,"esp"),$c); + &movdqa (&QWP(16*3,"esp"),$d); + 
&mov ("edx",10); + &jmp (&label("loop1x")); + +&set_label("outer1x",16); + &movdqa ($d,&QWP(16*5,"eax")); # one + &movdqa ($a,&QWP(16*0,"esp")); + &movdqa ($b,&QWP(16*1,"esp")); + &movdqa ($c,&QWP(16*2,"esp")); + &paddd ($d,&QWP(16*3,"esp")); + &mov ("edx",10); + &movdqa (&QWP(16*3,"esp"),$d); + &jmp (&label("loop1x")); + +&set_label("loop1x",16); + &SSSE3ROUND(); + &pshufd ($c,$c,0b01001110); + &pshufd ($b,$b,0b00111001); + &pshufd ($d,$d,0b10010011); + &nop (); + + &SSSE3ROUND(); + &pshufd ($c,$c,0b01001110); + &pshufd ($b,$b,0b10010011); + &pshufd ($d,$d,0b00111001); + + &dec ("edx"); + &jnz (&label("loop1x")); + + &paddd ($a,&QWP(16*0,"esp")); + &paddd ($b,&QWP(16*1,"esp")); + &paddd ($c,&QWP(16*2,"esp")); + &paddd ($d,&QWP(16*3,"esp")); + + &cmp ($len,64); + &jb (&label("tail")); + + &movdqu ($t,&QWP(16*0,$inp)); + &movdqu ($t1,&QWP(16*1,$inp)); + &pxor ($a,$t); # xor with input + &movdqu ($t,&QWP(16*2,$inp)); + &pxor ($b,$t1); + &movdqu ($t1,&QWP(16*3,$inp)); + &pxor ($c,$t); + &pxor ($d,$t1); + &lea ($inp,&DWP(16*4,$inp)); # inp+=64 + + &movdqu (&QWP(16*0,$out),$a); # write output + &movdqu (&QWP(16*1,$out),$b); + &movdqu (&QWP(16*2,$out),$c); + &movdqu (&QWP(16*3,$out),$d); + &lea ($out,&DWP(16*4,$out)); # inp+=64 + + &sub ($len,64); + &jnz (&label("outer1x")); + + &jmp (&label("done")); + +&set_label("tail"); + &movdqa (&QWP(16*0,"esp"),$a); + &movdqa (&QWP(16*1,"esp"),$b); + &movdqa (&QWP(16*2,"esp"),$c); + &movdqa (&QWP(16*3,"esp"),$d); + + &xor ("eax","eax"); + &xor ("edx","edx"); + &xor ("ebp","ebp"); + +&set_label("tail_loop"); + &movb ("al",&BP(0,"esp","ebp")); + &movb ("dl",&BP(0,$inp,"ebp")); + &lea ("ebp",&DWP(1,"ebp")); + &xor ("al","dl"); + &movb (&BP(-1,$out,"ebp"),"al"); + &dec ($len); + &jnz (&label("tail_loop")); +} +&set_label("done"); + &mov ("esp",&DWP(512,"esp")); +&function_end("ChaCha20_ctr32_ssse3"); + +&align (64); +&set_label("ssse3_data"); +&data_byte(0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd); +&data_byte(0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe); +&data_word(0x61707865,0x3320646e,0x79622d32,0x6b206574); +&data_word(0,1,2,3); +&data_word(4,4,4,4); +&data_word(1,0,0,0); +&data_word(4,0,0,0); +&data_word(0,-1,-1,-1); +&align (64); +} +&asciz ("ChaCha20 for x86, CRYPTOGAMS by "); + +&asm_finish(); + +close STDOUT or die "error closing STDOUT: $!"; diff --git a/ring-0.17.14/crypto/chacha/asm/chacha-x86_64.pl b/ring-0.17.14/crypto/chacha/asm/chacha-x86_64.pl new file mode 100644 index 0000000000..002f369556 --- /dev/null +++ b/ring-0.17.14/crypto/chacha/asm/chacha-x86_64.pl @@ -0,0 +1,1854 @@ +#! /usr/bin/env perl +# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. +# ==================================================================== +# +# November 2014 +# +# ChaCha20 for x86_64. 
+# +# December 2016 +# +# Add AVX512F code path. +# +# Performance in cycles per byte out of large buffer. +# +# IALU/gcc 4.8(i) 1xSSSE3/SSE2 4xSSSE3 NxAVX(v) +# +# P4 9.48/+99% -/22.7(ii) - +# Core2 7.83/+55% 7.90/8.08 4.35 +# Westmere 7.19/+50% 5.60/6.70 3.00 +# Sandy Bridge 8.31/+42% 5.45/6.76 2.72 +# Ivy Bridge 6.71/+46% 5.40/6.49 2.41 +# Haswell 5.92/+43% 5.20/6.45 2.42 1.23 +# Skylake[-X] 5.87/+39% 4.70/- 2.31 1.19[0.57] +# Silvermont 12.0/+33% 7.75/7.40 7.03(iii) +# Knights L 11.7/- - 9.60(iii) 0.80 +# Goldmont 10.6/+17% 5.10/- 3.28 +# Sledgehammer 7.28/+52% -/14.2(ii) - +# Bulldozer 9.66/+28% 9.85/11.1 3.06(iv) +# Ryzen 5.96/+50% 5.19/- 2.40 2.09 +# VIA Nano 10.5/+46% 6.72/8.60 6.05 +# +# (i) compared to older gcc 3.x one can observe >2x improvement on +# most platforms; +# (ii) as it can be seen, SSE2 performance is too low on legacy +# processors; NxSSE2 results are naturally better, but not +# impressively better than IALU ones, which is why you won't +# find SSE2 code below; +# (iii) this is not optimal result for Atom because of MSROM +# limitations, SSE2 can do better, but gain is considered too +# low to justify the [maintenance] effort; +# (iv) Bulldozer actually executes 4xXOP code path that delivers 2.20; +# +# Modified from upstream OpenSSL to remove the XOP code. + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +$avx = 2; + +open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; +*STDOUT=*OUT; + +# input parameter block +($out,$inp,$len,$key,$counter)=("%rdi","%rsi","%rdx","%rcx","%r8"); + +$code.=<<___; +.text + +.section .rodata +.align 64 +.Lzero: +.long 0,0,0,0 +.Lone: +.long 1,0,0,0 +.Linc: +.long 0,1,2,3 +.Lfour: +.long 4,4,4,4 +.Lincy: +.long 0,2,4,6,1,3,5,7 +.Leight: +.long 8,8,8,8,8,8,8,8 +.Lrot16: +.byte 0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd +.Lrot24: +.byte 0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe +.Lsigma: +.asciz "expand 32-byte k" +.align 64 +.Lzeroz: +.long 0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0 +.Lfourz: +.long 4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0 +.Lincz: +.long 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 +.Lsixteen: +.long 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16 +.asciz "ChaCha20 for x86_64, CRYPTOGAMS by " +.text +___ + +sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm +{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; + my $arg = pop; + $arg = "\$$arg" if ($arg*1 eq $arg); + $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n"; +} + +@x=("%eax","%ebx","%ecx","%edx",map("%r${_}d",(8..11)), + "%nox","%nox","%nox","%nox",map("%r${_}d",(12..15))); +@t=("%esi","%edi"); + +sub ROUND { # critical path is 24 cycles per round +my ($a0,$b0,$c0,$d0)=@_; +my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); +my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1)); +my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); +my ($xc,$xc_)=map("\"$_\"",@t); +my @x=map("\"$_\"",@x); + + # Consider order in which variables are addressed by their + # index: + # + # a b c d + # + # 0 4 8 12 < even round + # 1 5 9 13 + # 2 6 10 14 + # 3 7 11 15 + # 0 5 10 15 < odd round + # 1 6 11 12 + # 2 7 8 13 + # 3 4 9 14 + # + # 'a', 'b' and 'd's are permanently allocated in 
registers, + # @x[0..7,12..15], while 'c's are maintained in memory. If + # you observe 'c' column, you'll notice that pair of 'c's is + # invariant between rounds. This means that we have to reload + # them once per round, in the middle. This is why you'll see + # bunch of 'c' stores and loads in the middle, but none in + # the beginning or end. + + # Normally instructions would be interleaved to favour in-order + # execution. Generally out-of-order cores manage it gracefully, + # but not this time for some reason. As in-order execution + # cores are dying breed, old Atom is the only one around, + # instructions are left uninterleaved. Besides, Atom is better + # off executing 1xSSSE3 code anyway... + + ( + "&add (@x[$a0],@x[$b0])", # Q1 + "&xor (@x[$d0],@x[$a0])", + "&rol (@x[$d0],16)", + "&add (@x[$a1],@x[$b1])", # Q2 + "&xor (@x[$d1],@x[$a1])", + "&rol (@x[$d1],16)", + + "&add ($xc,@x[$d0])", + "&xor (@x[$b0],$xc)", + "&rol (@x[$b0],12)", + "&add ($xc_,@x[$d1])", + "&xor (@x[$b1],$xc_)", + "&rol (@x[$b1],12)", + + "&add (@x[$a0],@x[$b0])", + "&xor (@x[$d0],@x[$a0])", + "&rol (@x[$d0],8)", + "&add (@x[$a1],@x[$b1])", + "&xor (@x[$d1],@x[$a1])", + "&rol (@x[$d1],8)", + + "&add ($xc,@x[$d0])", + "&xor (@x[$b0],$xc)", + "&rol (@x[$b0],7)", + "&add ($xc_,@x[$d1])", + "&xor (@x[$b1],$xc_)", + "&rol (@x[$b1],7)", + + "&mov (\"4*$c0(%rsp)\",$xc)", # reload pair of 'c's + "&mov (\"4*$c1(%rsp)\",$xc_)", + "&mov ($xc,\"4*$c2(%rsp)\")", + "&mov ($xc_,\"4*$c3(%rsp)\")", + + "&add (@x[$a2],@x[$b2])", # Q3 + "&xor (@x[$d2],@x[$a2])", + "&rol (@x[$d2],16)", + "&add (@x[$a3],@x[$b3])", # Q4 + "&xor (@x[$d3],@x[$a3])", + "&rol (@x[$d3],16)", + + "&add ($xc,@x[$d2])", + "&xor (@x[$b2],$xc)", + "&rol (@x[$b2],12)", + "&add ($xc_,@x[$d3])", + "&xor (@x[$b3],$xc_)", + "&rol (@x[$b3],12)", + + "&add (@x[$a2],@x[$b2])", + "&xor (@x[$d2],@x[$a2])", + "&rol (@x[$d2],8)", + "&add (@x[$a3],@x[$b3])", + "&xor (@x[$d3],@x[$a3])", + "&rol (@x[$d3],8)", + + "&add ($xc,@x[$d2])", + "&xor (@x[$b2],$xc)", + "&rol (@x[$b2],7)", + "&add ($xc_,@x[$d3])", + "&xor (@x[$b3],$xc_)", + "&rol (@x[$b3],7)" + ); +} + +######################################################################## +# Generic code path that handles all lengths on pre-SSSE3 processors. 
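+# The 16-word state is kept in general-purpose registers, with the 'c'
+# words shuttled through the stack as described in ROUND above; each
+# outer-loop iteration produces a single 64-byte block.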
+$code.=<<___; +.globl ChaCha20_ctr32_nohw +.type ChaCha20_ctr32_nohw,\@function,5 +.align 64 +ChaCha20_ctr32_nohw: +.cfi_startproc + _CET_ENDBR + push %rbx +.cfi_push rbx + push %rbp +.cfi_push rbp + push %r12 +.cfi_push r12 + push %r13 +.cfi_push r13 + push %r14 +.cfi_push r14 + push %r15 +.cfi_push r15 + sub \$64+24,%rsp +.cfi_adjust_cfa_offset `64+24` +.Lctr32_body: + + #movdqa .Lsigma(%rip),%xmm0 + movdqu ($key),%xmm1 + movdqu 16($key),%xmm2 + movdqu ($counter),%xmm3 + movdqa .Lone(%rip),%xmm4 + + #movdqa %xmm0,4*0(%rsp) # key[0] + movdqa %xmm1,4*4(%rsp) # key[1] + movdqa %xmm2,4*8(%rsp) # key[2] + movdqa %xmm3,4*12(%rsp) # key[3] + mov $len,%rbp # reassign $len + jmp .Loop_outer + +.align 32 +.Loop_outer: + mov \$0x61707865,@x[0] # 'expa' + mov \$0x3320646e,@x[1] # 'nd 3' + mov \$0x79622d32,@x[2] # '2-by' + mov \$0x6b206574,@x[3] # 'te k' + mov 4*4(%rsp),@x[4] + mov 4*5(%rsp),@x[5] + mov 4*6(%rsp),@x[6] + mov 4*7(%rsp),@x[7] + movd %xmm3,@x[12] + mov 4*13(%rsp),@x[13] + mov 4*14(%rsp),@x[14] + mov 4*15(%rsp),@x[15] + + mov %rbp,64+0(%rsp) # save len + mov \$10,%ebp + mov $inp,64+8(%rsp) # save inp + movq %xmm2,%rsi # "@x[8]" + mov $out,64+16(%rsp) # save out + mov %rsi,%rdi + shr \$32,%rdi # "@x[9]" + jmp .Loop + +.align 32 +.Loop: +___ + foreach (&ROUND (0, 4, 8,12)) { eval; } + foreach (&ROUND (0, 5,10,15)) { eval; } + &dec ("%ebp"); + &jnz (".Loop"); + +$code.=<<___; + mov @t[1],4*9(%rsp) # modulo-scheduled + mov @t[0],4*8(%rsp) + mov 64(%rsp),%rbp # load len + movdqa %xmm2,%xmm1 + mov 64+8(%rsp),$inp # load inp + paddd %xmm4,%xmm3 # increment counter + mov 64+16(%rsp),$out # load out + + add \$0x61707865,@x[0] # 'expa' + add \$0x3320646e,@x[1] # 'nd 3' + add \$0x79622d32,@x[2] # '2-by' + add \$0x6b206574,@x[3] # 'te k' + add 4*4(%rsp),@x[4] + add 4*5(%rsp),@x[5] + add 4*6(%rsp),@x[6] + add 4*7(%rsp),@x[7] + add 4*12(%rsp),@x[12] + add 4*13(%rsp),@x[13] + add 4*14(%rsp),@x[14] + add 4*15(%rsp),@x[15] + paddd 4*8(%rsp),%xmm1 + + cmp \$64,%rbp + jb .Ltail + + xor 4*0($inp),@x[0] # xor with input + xor 4*1($inp),@x[1] + xor 4*2($inp),@x[2] + xor 4*3($inp),@x[3] + xor 4*4($inp),@x[4] + xor 4*5($inp),@x[5] + xor 4*6($inp),@x[6] + xor 4*7($inp),@x[7] + movdqu 4*8($inp),%xmm0 + xor 4*12($inp),@x[12] + xor 4*13($inp),@x[13] + xor 4*14($inp),@x[14] + xor 4*15($inp),@x[15] + lea 4*16($inp),$inp # inp+=64 + pxor %xmm1,%xmm0 + + movdqa %xmm2,4*8(%rsp) + movd %xmm3,4*12(%rsp) + + mov @x[0],4*0($out) # write output + mov @x[1],4*1($out) + mov @x[2],4*2($out) + mov @x[3],4*3($out) + mov @x[4],4*4($out) + mov @x[5],4*5($out) + mov @x[6],4*6($out) + mov @x[7],4*7($out) + movdqu %xmm0,4*8($out) + mov @x[12],4*12($out) + mov @x[13],4*13($out) + mov @x[14],4*14($out) + mov @x[15],4*15($out) + lea 4*16($out),$out # out+=64 + + sub \$64,%rbp + jnz .Loop_outer + + jmp .Ldone + +.align 16 +.Ltail: + mov @x[0],4*0(%rsp) + mov @x[1],4*1(%rsp) + xor %rbx,%rbx + mov @x[2],4*2(%rsp) + mov @x[3],4*3(%rsp) + mov @x[4],4*4(%rsp) + mov @x[5],4*5(%rsp) + mov @x[6],4*6(%rsp) + mov @x[7],4*7(%rsp) + movdqa %xmm1,4*8(%rsp) + mov @x[12],4*12(%rsp) + mov @x[13],4*13(%rsp) + mov @x[14],4*14(%rsp) + mov @x[15],4*15(%rsp) + +.Loop_tail: + movzb ($inp,%rbx),%eax + movzb (%rsp,%rbx),%edx + lea 1(%rbx),%rbx + xor %edx,%eax + mov %al,-1($out,%rbx) + dec %rbp + jnz .Loop_tail + +.Ldone: + lea 64+24+48(%rsp),%rsi + mov -48(%rsi),%r15 +.cfi_restore r15 + mov -40(%rsi),%r14 +.cfi_restore r14 + mov -32(%rsi),%r13 +.cfi_restore r13 + mov -24(%rsi),%r12 +.cfi_restore r12 + mov -16(%rsi),%rbp +.cfi_restore rbp + mov -8(%rsi),%rbx 
+.cfi_restore rbx + lea (%rsi),%rsp +.cfi_adjust_cfa_offset `-64-24-48` +.Lno_data: + ret +.cfi_endproc +.size ChaCha20_ctr32_nohw,.-ChaCha20_ctr32_nohw +___ + +######################################################################## +# SSSE3 code path that handles longer messages. +{ +# assign variables to favor Atom front-end +my ($xd0,$xd1,$xd2,$xd3, $xt0,$xt1,$xt2,$xt3, + $xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3)=map("%xmm$_",(0..15)); +my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, + "%nox","%nox","%nox","%nox", $xd0,$xd1,$xd2,$xd3); + +sub SSSE3_lane_ROUND { +my ($a0,$b0,$c0,$d0)=@_; +my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); +my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1)); +my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); +my ($xc,$xc_,$t0,$t1)=map("\"$_\"",$xt0,$xt1,$xt2,$xt3); +my @x=map("\"$_\"",@xx); + + # Consider order in which variables are addressed by their + # index: + # + # a b c d + # + # 0 4 8 12 < even round + # 1 5 9 13 + # 2 6 10 14 + # 3 7 11 15 + # 0 5 10 15 < odd round + # 1 6 11 12 + # 2 7 8 13 + # 3 4 9 14 + # + # 'a', 'b' and 'd's are permanently allocated in registers, + # @x[0..7,12..15], while 'c's are maintained in memory. If + # you observe 'c' column, you'll notice that pair of 'c's is + # invariant between rounds. This means that we have to reload + # them once per round, in the middle. This is why you'll see + # bunch of 'c' stores and loads in the middle, but none in + # the beginning or end. + + ( + "&paddd (@x[$a0],@x[$b0])", # Q1 + "&paddd (@x[$a1],@x[$b1])", # Q2 + "&pxor (@x[$d0],@x[$a0])", + "&pxor (@x[$d1],@x[$a1])", + "&pshufb (@x[$d0],$t1)", + "&pshufb (@x[$d1],$t1)", + + "&paddd ($xc,@x[$d0])", + "&paddd ($xc_,@x[$d1])", + "&pxor (@x[$b0],$xc)", + "&pxor (@x[$b1],$xc_)", + "&movdqa ($t0,@x[$b0])", + "&pslld (@x[$b0],12)", + "&psrld ($t0,20)", + "&movdqa ($t1,@x[$b1])", + "&pslld (@x[$b1],12)", + "&por (@x[$b0],$t0)", + "&psrld ($t1,20)", + "&movdqa ($t0,'(%r11)')", # .Lrot24(%rip) + "&por (@x[$b1],$t1)", + + "&paddd (@x[$a0],@x[$b0])", + "&paddd (@x[$a1],@x[$b1])", + "&pxor (@x[$d0],@x[$a0])", + "&pxor (@x[$d1],@x[$a1])", + "&pshufb (@x[$d0],$t0)", + "&pshufb (@x[$d1],$t0)", + + "&paddd ($xc,@x[$d0])", + "&paddd ($xc_,@x[$d1])", + "&pxor (@x[$b0],$xc)", + "&pxor (@x[$b1],$xc_)", + "&movdqa ($t1,@x[$b0])", + "&pslld (@x[$b0],7)", + "&psrld ($t1,25)", + "&movdqa ($t0,@x[$b1])", + "&pslld (@x[$b1],7)", + "&por (@x[$b0],$t1)", + "&psrld ($t0,25)", + "&movdqa ($t1,'(%r10)')", # .Lrot16(%rip) + "&por (@x[$b1],$t0)", + + "&movdqa (\"`16*($c0-8)`(%rsp)\",$xc)", # reload pair of 'c's + "&movdqa (\"`16*($c1-8)`(%rsp)\",$xc_)", + "&movdqa ($xc,\"`16*($c2-8)`(%rsp)\")", + "&movdqa ($xc_,\"`16*($c3-8)`(%rsp)\")", + + "&paddd (@x[$a2],@x[$b2])", # Q3 + "&paddd (@x[$a3],@x[$b3])", # Q4 + "&pxor (@x[$d2],@x[$a2])", + "&pxor (@x[$d3],@x[$a3])", + "&pshufb (@x[$d2],$t1)", + "&pshufb (@x[$d3],$t1)", + + "&paddd ($xc,@x[$d2])", + "&paddd ($xc_,@x[$d3])", + "&pxor (@x[$b2],$xc)", + "&pxor (@x[$b3],$xc_)", + "&movdqa ($t0,@x[$b2])", + "&pslld (@x[$b2],12)", + "&psrld ($t0,20)", + "&movdqa ($t1,@x[$b3])", + "&pslld (@x[$b3],12)", + "&por (@x[$b2],$t0)", + "&psrld ($t1,20)", + "&movdqa ($t0,'(%r11)')", # .Lrot24(%rip) + "&por (@x[$b3],$t1)", + + "&paddd (@x[$a2],@x[$b2])", + "&paddd (@x[$a3],@x[$b3])", + "&pxor (@x[$d2],@x[$a2])", + "&pxor (@x[$d3],@x[$a3])", + "&pshufb (@x[$d2],$t0)", + "&pshufb (@x[$d3],$t0)", + + "&paddd ($xc,@x[$d2])", + "&paddd ($xc_,@x[$d3])", + "&pxor (@x[$b2],$xc)", + "&pxor 
(@x[$b3],$xc_)", + "&movdqa ($t1,@x[$b2])", + "&pslld (@x[$b2],7)", + "&psrld ($t1,25)", + "&movdqa ($t0,@x[$b3])", + "&pslld (@x[$b3],7)", + "&por (@x[$b2],$t1)", + "&psrld ($t0,25)", + "&movdqa ($t1,'(%r10)')", # .Lrot16(%rip) + "&por (@x[$b3],$t0)" + ); +} + +my $xframe = $win64 ? 0xa8 : 8; + +$code.=<<___; +.globl ChaCha20_ctr32_ssse3_4x +.type ChaCha20_ctr32_ssse3_4x,\@function,5 +.align 32 +ChaCha20_ctr32_ssse3_4x: +.cfi_startproc + _CET_ENDBR + mov %rsp,%r9 # frame pointer +.cfi_def_cfa_register r9 +___ +$code.=<<___; + sub \$0x140+$xframe,%rsp +___ + ################ stack layout + # +0x00 SIMD equivalent of @x[8-12] + # ... + # +0x40 constant copy of key[0-2] smashed by lanes + # ... + # +0x100 SIMD counters (with nonce smashed by lanes) + # ... + # +0x140 +$code.=<<___ if ($win64); + movaps %xmm6,-0xa8(%r9) + movaps %xmm7,-0x98(%r9) + movaps %xmm8,-0x88(%r9) + movaps %xmm9,-0x78(%r9) + movaps %xmm10,-0x68(%r9) + movaps %xmm11,-0x58(%r9) + movaps %xmm12,-0x48(%r9) + movaps %xmm13,-0x38(%r9) + movaps %xmm14,-0x28(%r9) + movaps %xmm15,-0x18(%r9) +.L4x_body: +___ +$code.=<<___; + movdqa .Lsigma(%rip),$xa3 # key[0] + movdqu ($key),$xb3 # key[1] + movdqu 16($key),$xt3 # key[2] + movdqu ($counter),$xd3 # key[3] + lea 0x100(%rsp),%rcx # size optimization + lea .Lrot16(%rip),%r10 + lea .Lrot24(%rip),%r11 + + pshufd \$0x00,$xa3,$xa0 # smash key by lanes... + pshufd \$0x55,$xa3,$xa1 + movdqa $xa0,0x40(%rsp) # ... and offload + pshufd \$0xaa,$xa3,$xa2 + movdqa $xa1,0x50(%rsp) + pshufd \$0xff,$xa3,$xa3 + movdqa $xa2,0x60(%rsp) + movdqa $xa3,0x70(%rsp) + + pshufd \$0x00,$xb3,$xb0 + pshufd \$0x55,$xb3,$xb1 + movdqa $xb0,0x80-0x100(%rcx) + pshufd \$0xaa,$xb3,$xb2 + movdqa $xb1,0x90-0x100(%rcx) + pshufd \$0xff,$xb3,$xb3 + movdqa $xb2,0xa0-0x100(%rcx) + movdqa $xb3,0xb0-0x100(%rcx) + + pshufd \$0x00,$xt3,$xt0 # "$xc0" + pshufd \$0x55,$xt3,$xt1 # "$xc1" + movdqa $xt0,0xc0-0x100(%rcx) + pshufd \$0xaa,$xt3,$xt2 # "$xc2" + movdqa $xt1,0xd0-0x100(%rcx) + pshufd \$0xff,$xt3,$xt3 # "$xc3" + movdqa $xt2,0xe0-0x100(%rcx) + movdqa $xt3,0xf0-0x100(%rcx) + + pshufd \$0x00,$xd3,$xd0 + pshufd \$0x55,$xd3,$xd1 + paddd .Linc(%rip),$xd0 # don't save counters yet + pshufd \$0xaa,$xd3,$xd2 + movdqa $xd1,0x110-0x100(%rcx) + pshufd \$0xff,$xd3,$xd3 + movdqa $xd2,0x120-0x100(%rcx) + movdqa $xd3,0x130-0x100(%rcx) + + jmp .Loop_enter4x + +.align 32 +.Loop_outer4x: + movdqa 0x40(%rsp),$xa0 # re-load smashed key + movdqa 0x50(%rsp),$xa1 + movdqa 0x60(%rsp),$xa2 + movdqa 0x70(%rsp),$xa3 + movdqa 0x80-0x100(%rcx),$xb0 + movdqa 0x90-0x100(%rcx),$xb1 + movdqa 0xa0-0x100(%rcx),$xb2 + movdqa 0xb0-0x100(%rcx),$xb3 + movdqa 0xc0-0x100(%rcx),$xt0 # "$xc0" + movdqa 0xd0-0x100(%rcx),$xt1 # "$xc1" + movdqa 0xe0-0x100(%rcx),$xt2 # "$xc2" + movdqa 0xf0-0x100(%rcx),$xt3 # "$xc3" + movdqa 0x100-0x100(%rcx),$xd0 + movdqa 0x110-0x100(%rcx),$xd1 + movdqa 0x120-0x100(%rcx),$xd2 + movdqa 0x130-0x100(%rcx),$xd3 + paddd .Lfour(%rip),$xd0 # next SIMD counters + +.Loop_enter4x: + movdqa $xt2,0x20(%rsp) # SIMD equivalent of "@x[10]" + movdqa $xt3,0x30(%rsp) # SIMD equivalent of "@x[11]" + movdqa (%r10),$xt3 # .Lrot16(%rip) + mov \$10,%eax + movdqa $xd0,0x100-0x100(%rcx) # save SIMD counters + jmp .Loop4x + +.align 32 +.Loop4x: +___ + foreach (&SSSE3_lane_ROUND(0, 4, 8,12)) { eval; } + foreach (&SSSE3_lane_ROUND(0, 5,10,15)) { eval; } +$code.=<<___; + dec %eax + jnz .Loop4x + + paddd 0x40(%rsp),$xa0 # accumulate key material + paddd 0x50(%rsp),$xa1 + paddd 0x60(%rsp),$xa2 + paddd 0x70(%rsp),$xa3 + + movdqa $xa0,$xt2 # "de-interlace" data + punpckldq 
$xa1,$xa0 + movdqa $xa2,$xt3 + punpckldq $xa3,$xa2 + punpckhdq $xa1,$xt2 + punpckhdq $xa3,$xt3 + movdqa $xa0,$xa1 + punpcklqdq $xa2,$xa0 # "a0" + movdqa $xt2,$xa3 + punpcklqdq $xt3,$xt2 # "a2" + punpckhqdq $xa2,$xa1 # "a1" + punpckhqdq $xt3,$xa3 # "a3" +___ + ($xa2,$xt2)=($xt2,$xa2); +$code.=<<___; + paddd 0x80-0x100(%rcx),$xb0 + paddd 0x90-0x100(%rcx),$xb1 + paddd 0xa0-0x100(%rcx),$xb2 + paddd 0xb0-0x100(%rcx),$xb3 + + movdqa $xa0,0x00(%rsp) # offload $xaN + movdqa $xa1,0x10(%rsp) + movdqa 0x20(%rsp),$xa0 # "xc2" + movdqa 0x30(%rsp),$xa1 # "xc3" + + movdqa $xb0,$xt2 + punpckldq $xb1,$xb0 + movdqa $xb2,$xt3 + punpckldq $xb3,$xb2 + punpckhdq $xb1,$xt2 + punpckhdq $xb3,$xt3 + movdqa $xb0,$xb1 + punpcklqdq $xb2,$xb0 # "b0" + movdqa $xt2,$xb3 + punpcklqdq $xt3,$xt2 # "b2" + punpckhqdq $xb2,$xb1 # "b1" + punpckhqdq $xt3,$xb3 # "b3" +___ + ($xb2,$xt2)=($xt2,$xb2); + my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1); +$code.=<<___; + paddd 0xc0-0x100(%rcx),$xc0 + paddd 0xd0-0x100(%rcx),$xc1 + paddd 0xe0-0x100(%rcx),$xc2 + paddd 0xf0-0x100(%rcx),$xc3 + + movdqa $xa2,0x20(%rsp) # keep offloading $xaN + movdqa $xa3,0x30(%rsp) + + movdqa $xc0,$xt2 + punpckldq $xc1,$xc0 + movdqa $xc2,$xt3 + punpckldq $xc3,$xc2 + punpckhdq $xc1,$xt2 + punpckhdq $xc3,$xt3 + movdqa $xc0,$xc1 + punpcklqdq $xc2,$xc0 # "c0" + movdqa $xt2,$xc3 + punpcklqdq $xt3,$xt2 # "c2" + punpckhqdq $xc2,$xc1 # "c1" + punpckhqdq $xt3,$xc3 # "c3" +___ + ($xc2,$xt2)=($xt2,$xc2); + ($xt0,$xt1)=($xa2,$xa3); # use $xaN as temporary +$code.=<<___; + paddd 0x100-0x100(%rcx),$xd0 + paddd 0x110-0x100(%rcx),$xd1 + paddd 0x120-0x100(%rcx),$xd2 + paddd 0x130-0x100(%rcx),$xd3 + + movdqa $xd0,$xt2 + punpckldq $xd1,$xd0 + movdqa $xd2,$xt3 + punpckldq $xd3,$xd2 + punpckhdq $xd1,$xt2 + punpckhdq $xd3,$xt3 + movdqa $xd0,$xd1 + punpcklqdq $xd2,$xd0 # "d0" + movdqa $xt2,$xd3 + punpcklqdq $xt3,$xt2 # "d2" + punpckhqdq $xd2,$xd1 # "d1" + punpckhqdq $xt3,$xd3 # "d3" +___ + ($xd2,$xt2)=($xt2,$xd2); +$code.=<<___; + cmp \$64*4,$len + jb .Ltail4x + + movdqu 0x00($inp),$xt0 # xor with input + movdqu 0x10($inp),$xt1 + movdqu 0x20($inp),$xt2 + movdqu 0x30($inp),$xt3 + pxor 0x00(%rsp),$xt0 # $xaN is offloaded, remember? + pxor $xb0,$xt1 + pxor $xc0,$xt2 + pxor $xd0,$xt3 + + movdqu $xt0,0x00($out) + movdqu 0x40($inp),$xt0 + movdqu $xt1,0x10($out) + movdqu 0x50($inp),$xt1 + movdqu $xt2,0x20($out) + movdqu 0x60($inp),$xt2 + movdqu $xt3,0x30($out) + movdqu 0x70($inp),$xt3 + lea 0x80($inp),$inp # size optimization + pxor 0x10(%rsp),$xt0 + pxor $xb1,$xt1 + pxor $xc1,$xt2 + pxor $xd1,$xt3 + + movdqu $xt0,0x40($out) + movdqu 0x00($inp),$xt0 + movdqu $xt1,0x50($out) + movdqu 0x10($inp),$xt1 + movdqu $xt2,0x60($out) + movdqu 0x20($inp),$xt2 + movdqu $xt3,0x70($out) + lea 0x80($out),$out # size optimization + movdqu 0x30($inp),$xt3 + pxor 0x20(%rsp),$xt0 + pxor $xb2,$xt1 + pxor $xc2,$xt2 + pxor $xd2,$xt3 + + movdqu $xt0,0x00($out) + movdqu 0x40($inp),$xt0 + movdqu $xt1,0x10($out) + movdqu 0x50($inp),$xt1 + movdqu $xt2,0x20($out) + movdqu 0x60($inp),$xt2 + movdqu $xt3,0x30($out) + movdqu 0x70($inp),$xt3 + lea 0x80($inp),$inp # inp+=64*4 + pxor 0x30(%rsp),$xt0 + pxor $xb3,$xt1 + pxor $xc3,$xt2 + pxor $xd3,$xt3 + movdqu $xt0,0x40($out) + movdqu $xt1,0x50($out) + movdqu $xt2,0x60($out) + movdqu $xt3,0x70($out) + lea 0x80($out),$out # out+=64*4 + + sub \$64*4,$len + jnz .Loop_outer4x + + jmp .Ldone4x + +.Ltail4x: + cmp \$192,$len + jae .L192_or_more4x + cmp \$128,$len + jae .L128_or_more4x + cmp \$64,$len + jae .L64_or_more4x + + #movdqa 0x00(%rsp),$xt0 # $xaN is offloaded, remember? 
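+	# Fewer than 64 bytes remain: the first 16 bytes of keystream block 0
+	# are already at 0(%rsp) from the offload above, so store the other
+	# three quarters next to them and let .Loop_tail4x below xor the
+	# stack copy with the input byte by byte.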
+ xor %r10,%r10 + #movdqa $xt0,0x00(%rsp) + movdqa $xb0,0x10(%rsp) + movdqa $xc0,0x20(%rsp) + movdqa $xd0,0x30(%rsp) + jmp .Loop_tail4x + +.align 32 +.L64_or_more4x: + movdqu 0x00($inp),$xt0 # xor with input + movdqu 0x10($inp),$xt1 + movdqu 0x20($inp),$xt2 + movdqu 0x30($inp),$xt3 + pxor 0x00(%rsp),$xt0 # $xaxN is offloaded, remember? + pxor $xb0,$xt1 + pxor $xc0,$xt2 + pxor $xd0,$xt3 + movdqu $xt0,0x00($out) + movdqu $xt1,0x10($out) + movdqu $xt2,0x20($out) + movdqu $xt3,0x30($out) + je .Ldone4x + + movdqa 0x10(%rsp),$xt0 # $xaN is offloaded, remember? + lea 0x40($inp),$inp # inp+=64*1 + xor %r10,%r10 + movdqa $xt0,0x00(%rsp) + movdqa $xb1,0x10(%rsp) + lea 0x40($out),$out # out+=64*1 + movdqa $xc1,0x20(%rsp) + sub \$64,$len # len-=64*1 + movdqa $xd1,0x30(%rsp) + jmp .Loop_tail4x + +.align 32 +.L128_or_more4x: + movdqu 0x00($inp),$xt0 # xor with input + movdqu 0x10($inp),$xt1 + movdqu 0x20($inp),$xt2 + movdqu 0x30($inp),$xt3 + pxor 0x00(%rsp),$xt0 # $xaN is offloaded, remember? + pxor $xb0,$xt1 + pxor $xc0,$xt2 + pxor $xd0,$xt3 + + movdqu $xt0,0x00($out) + movdqu 0x40($inp),$xt0 + movdqu $xt1,0x10($out) + movdqu 0x50($inp),$xt1 + movdqu $xt2,0x20($out) + movdqu 0x60($inp),$xt2 + movdqu $xt3,0x30($out) + movdqu 0x70($inp),$xt3 + pxor 0x10(%rsp),$xt0 + pxor $xb1,$xt1 + pxor $xc1,$xt2 + pxor $xd1,$xt3 + movdqu $xt0,0x40($out) + movdqu $xt1,0x50($out) + movdqu $xt2,0x60($out) + movdqu $xt3,0x70($out) + je .Ldone4x + + movdqa 0x20(%rsp),$xt0 # $xaN is offloaded, remember? + lea 0x80($inp),$inp # inp+=64*2 + xor %r10,%r10 + movdqa $xt0,0x00(%rsp) + movdqa $xb2,0x10(%rsp) + lea 0x80($out),$out # out+=64*2 + movdqa $xc2,0x20(%rsp) + sub \$128,$len # len-=64*2 + movdqa $xd2,0x30(%rsp) + jmp .Loop_tail4x + +.align 32 +.L192_or_more4x: + movdqu 0x00($inp),$xt0 # xor with input + movdqu 0x10($inp),$xt1 + movdqu 0x20($inp),$xt2 + movdqu 0x30($inp),$xt3 + pxor 0x00(%rsp),$xt0 # $xaN is offloaded, remember? + pxor $xb0,$xt1 + pxor $xc0,$xt2 + pxor $xd0,$xt3 + + movdqu $xt0,0x00($out) + movdqu 0x40($inp),$xt0 + movdqu $xt1,0x10($out) + movdqu 0x50($inp),$xt1 + movdqu $xt2,0x20($out) + movdqu 0x60($inp),$xt2 + movdqu $xt3,0x30($out) + movdqu 0x70($inp),$xt3 + lea 0x80($inp),$inp # size optimization + pxor 0x10(%rsp),$xt0 + pxor $xb1,$xt1 + pxor $xc1,$xt2 + pxor $xd1,$xt3 + + movdqu $xt0,0x40($out) + movdqu 0x00($inp),$xt0 + movdqu $xt1,0x50($out) + movdqu 0x10($inp),$xt1 + movdqu $xt2,0x60($out) + movdqu 0x20($inp),$xt2 + movdqu $xt3,0x70($out) + lea 0x80($out),$out # size optimization + movdqu 0x30($inp),$xt3 + pxor 0x20(%rsp),$xt0 + pxor $xb2,$xt1 + pxor $xc2,$xt2 + pxor $xd2,$xt3 + movdqu $xt0,0x00($out) + movdqu $xt1,0x10($out) + movdqu $xt2,0x20($out) + movdqu $xt3,0x30($out) + je .Ldone4x + + movdqa 0x30(%rsp),$xt0 # $xaN is offloaded, remember? 
+ lea 0x40($inp),$inp # inp+=64*3 + xor %r10,%r10 + movdqa $xt0,0x00(%rsp) + movdqa $xb3,0x10(%rsp) + lea 0x40($out),$out # out+=64*3 + movdqa $xc3,0x20(%rsp) + sub \$192,$len # len-=64*3 + movdqa $xd3,0x30(%rsp) + +.Loop_tail4x: + movzb ($inp,%r10),%eax + movzb (%rsp,%r10),%ecx + lea 1(%r10),%r10 + xor %ecx,%eax + mov %al,-1($out,%r10) + dec $len + jnz .Loop_tail4x + +.Ldone4x: +___ +$code.=<<___ if ($win64); + movaps -0xa8(%r9),%xmm6 + movaps -0x98(%r9),%xmm7 + movaps -0x88(%r9),%xmm8 + movaps -0x78(%r9),%xmm9 + movaps -0x68(%r9),%xmm10 + movaps -0x58(%r9),%xmm11 + movaps -0x48(%r9),%xmm12 + movaps -0x38(%r9),%xmm13 + movaps -0x28(%r9),%xmm14 + movaps -0x18(%r9),%xmm15 +___ +$code.=<<___; + lea (%r9),%rsp +.cfi_def_cfa_register rsp +.L4x_epilogue: + ret +.cfi_endproc +.size ChaCha20_ctr32_ssse3_4x,.-ChaCha20_ctr32_ssse3_4x +___ +} + +######################################################################## +# AVX2 code path +if ($avx>1) { +my ($xb0,$xb1,$xb2,$xb3, $xd0,$xd1,$xd2,$xd3, + $xa0,$xa1,$xa2,$xa3, $xt0,$xt1,$xt2,$xt3)=map("%ymm$_",(0..15)); +my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, + "%nox","%nox","%nox","%nox", $xd0,$xd1,$xd2,$xd3); + +sub AVX2_lane_ROUND { +my ($a0,$b0,$c0,$d0)=@_; +my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); +my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1)); +my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); +my ($xc,$xc_,$t0,$t1)=map("\"$_\"",$xt0,$xt1,$xt2,$xt3); +my @x=map("\"$_\"",@xx); + + # Consider order in which variables are addressed by their + # index: + # + # a b c d + # + # 0 4 8 12 < even round + # 1 5 9 13 + # 2 6 10 14 + # 3 7 11 15 + # 0 5 10 15 < odd round + # 1 6 11 12 + # 2 7 8 13 + # 3 4 9 14 + # + # 'a', 'b' and 'd's are permanently allocated in registers, + # @x[0..7,12..15], while 'c's are maintained in memory. If + # you observe 'c' column, you'll notice that pair of 'c's is + # invariant between rounds. This means that we have to reload + # them once per round, in the middle. This is why you'll see + # bunch of 'c' stores and loads in the middle, but none in + # the beginning or end. 
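+	# Same quarter-round schedule as the SSSE3 4x path, but on 256-bit ymm
+	# registers: each register holds one state word for eight independent
+	# blocks, so the 8x outer loop emits 512 bytes of keystream per
+	# iteration.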
+ + ( + "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", # Q1 + "&vpxor (@x[$d0],@x[$a0],@x[$d0])", + "&vpshufb (@x[$d0],@x[$d0],$t1)", + "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", # Q2 + "&vpxor (@x[$d1],@x[$a1],@x[$d1])", + "&vpshufb (@x[$d1],@x[$d1],$t1)", + + "&vpaddd ($xc,$xc,@x[$d0])", + "&vpxor (@x[$b0],$xc,@x[$b0])", + "&vpslld ($t0,@x[$b0],12)", + "&vpsrld (@x[$b0],@x[$b0],20)", + "&vpor (@x[$b0],$t0,@x[$b0])", + "&vbroadcasti128($t0,'(%r11)')", # .Lrot24(%rip) + "&vpaddd ($xc_,$xc_,@x[$d1])", + "&vpxor (@x[$b1],$xc_,@x[$b1])", + "&vpslld ($t1,@x[$b1],12)", + "&vpsrld (@x[$b1],@x[$b1],20)", + "&vpor (@x[$b1],$t1,@x[$b1])", + + "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", + "&vpxor (@x[$d0],@x[$a0],@x[$d0])", + "&vpshufb (@x[$d0],@x[$d0],$t0)", + "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", + "&vpxor (@x[$d1],@x[$a1],@x[$d1])", + "&vpshufb (@x[$d1],@x[$d1],$t0)", + + "&vpaddd ($xc,$xc,@x[$d0])", + "&vpxor (@x[$b0],$xc,@x[$b0])", + "&vpslld ($t1,@x[$b0],7)", + "&vpsrld (@x[$b0],@x[$b0],25)", + "&vpor (@x[$b0],$t1,@x[$b0])", + "&vbroadcasti128($t1,'(%r10)')", # .Lrot16(%rip) + "&vpaddd ($xc_,$xc_,@x[$d1])", + "&vpxor (@x[$b1],$xc_,@x[$b1])", + "&vpslld ($t0,@x[$b1],7)", + "&vpsrld (@x[$b1],@x[$b1],25)", + "&vpor (@x[$b1],$t0,@x[$b1])", + + "&vmovdqa (\"`32*($c0-8)`(%rsp)\",$xc)", # reload pair of 'c's + "&vmovdqa (\"`32*($c1-8)`(%rsp)\",$xc_)", + "&vmovdqa ($xc,\"`32*($c2-8)`(%rsp)\")", + "&vmovdqa ($xc_,\"`32*($c3-8)`(%rsp)\")", + + "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", # Q3 + "&vpxor (@x[$d2],@x[$a2],@x[$d2])", + "&vpshufb (@x[$d2],@x[$d2],$t1)", + "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", # Q4 + "&vpxor (@x[$d3],@x[$a3],@x[$d3])", + "&vpshufb (@x[$d3],@x[$d3],$t1)", + + "&vpaddd ($xc,$xc,@x[$d2])", + "&vpxor (@x[$b2],$xc,@x[$b2])", + "&vpslld ($t0,@x[$b2],12)", + "&vpsrld (@x[$b2],@x[$b2],20)", + "&vpor (@x[$b2],$t0,@x[$b2])", + "&vbroadcasti128($t0,'(%r11)')", # .Lrot24(%rip) + "&vpaddd ($xc_,$xc_,@x[$d3])", + "&vpxor (@x[$b3],$xc_,@x[$b3])", + "&vpslld ($t1,@x[$b3],12)", + "&vpsrld (@x[$b3],@x[$b3],20)", + "&vpor (@x[$b3],$t1,@x[$b3])", + + "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", + "&vpxor (@x[$d2],@x[$a2],@x[$d2])", + "&vpshufb (@x[$d2],@x[$d2],$t0)", + "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", + "&vpxor (@x[$d3],@x[$a3],@x[$d3])", + "&vpshufb (@x[$d3],@x[$d3],$t0)", + + "&vpaddd ($xc,$xc,@x[$d2])", + "&vpxor (@x[$b2],$xc,@x[$b2])", + "&vpslld ($t1,@x[$b2],7)", + "&vpsrld (@x[$b2],@x[$b2],25)", + "&vpor (@x[$b2],$t1,@x[$b2])", + "&vbroadcasti128($t1,'(%r10)')", # .Lrot16(%rip) + "&vpaddd ($xc_,$xc_,@x[$d3])", + "&vpxor (@x[$b3],$xc_,@x[$b3])", + "&vpslld ($t0,@x[$b3],7)", + "&vpsrld (@x[$b3],@x[$b3],25)", + "&vpor (@x[$b3],$t0,@x[$b3])" + ); +} + +my $xframe = $win64 ? 0xa8 : 8; + +$code.=<<___; +.globl ChaCha20_ctr32_avx2 +.type ChaCha20_ctr32_avx2,\@function,5 +.align 32 +ChaCha20_ctr32_avx2: +.cfi_startproc + _CET_ENDBR + mov %rsp,%r9 # frame register +.cfi_def_cfa_register r9 + sub \$0x280+$xframe,%rsp + and \$-32,%rsp +___ +$code.=<<___ if ($win64); + movaps %xmm6,-0xa8(%r9) + movaps %xmm7,-0x98(%r9) + movaps %xmm8,-0x88(%r9) + movaps %xmm9,-0x78(%r9) + movaps %xmm10,-0x68(%r9) + movaps %xmm11,-0x58(%r9) + movaps %xmm12,-0x48(%r9) + movaps %xmm13,-0x38(%r9) + movaps %xmm14,-0x28(%r9) + movaps %xmm15,-0x18(%r9) +.L8x_body: +___ +$code.=<<___; + vzeroupper + + ################ stack layout + # +0x00 SIMD equivalent of @x[8-12] + # ... + # +0x80 constant copy of key[0-2] smashed by lanes + # ... + # +0x200 SIMD counters (with nonce smashed by lanes) + # ... 
+ # +0x280 + + vbroadcasti128 .Lsigma(%rip),$xa3 # key[0] + vbroadcasti128 ($key),$xb3 # key[1] + vbroadcasti128 16($key),$xt3 # key[2] + vbroadcasti128 ($counter),$xd3 # key[3] + lea 0x100(%rsp),%rcx # size optimization + lea 0x200(%rsp),%rax # size optimization + lea .Lrot16(%rip),%r10 + lea .Lrot24(%rip),%r11 + + vpshufd \$0x00,$xa3,$xa0 # smash key by lanes... + vpshufd \$0x55,$xa3,$xa1 + vmovdqa $xa0,0x80-0x100(%rcx) # ... and offload + vpshufd \$0xaa,$xa3,$xa2 + vmovdqa $xa1,0xa0-0x100(%rcx) + vpshufd \$0xff,$xa3,$xa3 + vmovdqa $xa2,0xc0-0x100(%rcx) + vmovdqa $xa3,0xe0-0x100(%rcx) + + vpshufd \$0x00,$xb3,$xb0 + vpshufd \$0x55,$xb3,$xb1 + vmovdqa $xb0,0x100-0x100(%rcx) + vpshufd \$0xaa,$xb3,$xb2 + vmovdqa $xb1,0x120-0x100(%rcx) + vpshufd \$0xff,$xb3,$xb3 + vmovdqa $xb2,0x140-0x100(%rcx) + vmovdqa $xb3,0x160-0x100(%rcx) + + vpshufd \$0x00,$xt3,$xt0 # "xc0" + vpshufd \$0x55,$xt3,$xt1 # "xc1" + vmovdqa $xt0,0x180-0x200(%rax) + vpshufd \$0xaa,$xt3,$xt2 # "xc2" + vmovdqa $xt1,0x1a0-0x200(%rax) + vpshufd \$0xff,$xt3,$xt3 # "xc3" + vmovdqa $xt2,0x1c0-0x200(%rax) + vmovdqa $xt3,0x1e0-0x200(%rax) + + vpshufd \$0x00,$xd3,$xd0 + vpshufd \$0x55,$xd3,$xd1 + vpaddd .Lincy(%rip),$xd0,$xd0 # don't save counters yet + vpshufd \$0xaa,$xd3,$xd2 + vmovdqa $xd1,0x220-0x200(%rax) + vpshufd \$0xff,$xd3,$xd3 + vmovdqa $xd2,0x240-0x200(%rax) + vmovdqa $xd3,0x260-0x200(%rax) + + jmp .Loop_enter8x + +.align 32 +.Loop_outer8x: + vmovdqa 0x80-0x100(%rcx),$xa0 # re-load smashed key + vmovdqa 0xa0-0x100(%rcx),$xa1 + vmovdqa 0xc0-0x100(%rcx),$xa2 + vmovdqa 0xe0-0x100(%rcx),$xa3 + vmovdqa 0x100-0x100(%rcx),$xb0 + vmovdqa 0x120-0x100(%rcx),$xb1 + vmovdqa 0x140-0x100(%rcx),$xb2 + vmovdqa 0x160-0x100(%rcx),$xb3 + vmovdqa 0x180-0x200(%rax),$xt0 # "xc0" + vmovdqa 0x1a0-0x200(%rax),$xt1 # "xc1" + vmovdqa 0x1c0-0x200(%rax),$xt2 # "xc2" + vmovdqa 0x1e0-0x200(%rax),$xt3 # "xc3" + vmovdqa 0x200-0x200(%rax),$xd0 + vmovdqa 0x220-0x200(%rax),$xd1 + vmovdqa 0x240-0x200(%rax),$xd2 + vmovdqa 0x260-0x200(%rax),$xd3 + vpaddd .Leight(%rip),$xd0,$xd0 # next SIMD counters + +.Loop_enter8x: + vmovdqa $xt2,0x40(%rsp) # SIMD equivalent of "@x[10]" + vmovdqa $xt3,0x60(%rsp) # SIMD equivalent of "@x[11]" + vbroadcasti128 (%r10),$xt3 + vmovdqa $xd0,0x200-0x200(%rax) # save SIMD counters + mov \$10,%eax + jmp .Loop8x + +.align 32 +.Loop8x: +___ + foreach (&AVX2_lane_ROUND(0, 4, 8,12)) { eval; } + foreach (&AVX2_lane_ROUND(0, 5,10,15)) { eval; } +$code.=<<___; + dec %eax + jnz .Loop8x + + lea 0x200(%rsp),%rax # size optimization + vpaddd 0x80-0x100(%rcx),$xa0,$xa0 # accumulate key + vpaddd 0xa0-0x100(%rcx),$xa1,$xa1 + vpaddd 0xc0-0x100(%rcx),$xa2,$xa2 + vpaddd 0xe0-0x100(%rcx),$xa3,$xa3 + + vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data + vpunpckldq $xa3,$xa2,$xt3 + vpunpckhdq $xa1,$xa0,$xa0 + vpunpckhdq $xa3,$xa2,$xa2 + vpunpcklqdq $xt3,$xt2,$xa1 # "a0" + vpunpckhqdq $xt3,$xt2,$xt2 # "a1" + vpunpcklqdq $xa2,$xa0,$xa3 # "a2" + vpunpckhqdq $xa2,$xa0,$xa0 # "a3" +___ + ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2); +$code.=<<___; + vpaddd 0x100-0x100(%rcx),$xb0,$xb0 + vpaddd 0x120-0x100(%rcx),$xb1,$xb1 + vpaddd 0x140-0x100(%rcx),$xb2,$xb2 + vpaddd 0x160-0x100(%rcx),$xb3,$xb3 + + vpunpckldq $xb1,$xb0,$xt2 + vpunpckldq $xb3,$xb2,$xt3 + vpunpckhdq $xb1,$xb0,$xb0 + vpunpckhdq $xb3,$xb2,$xb2 + vpunpcklqdq $xt3,$xt2,$xb1 # "b0" + vpunpckhqdq $xt3,$xt2,$xt2 # "b1" + vpunpcklqdq $xb2,$xb0,$xb3 # "b2" + vpunpckhqdq $xb2,$xb0,$xb0 # "b3" +___ + ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2); +$code.=<<___; + vperm2i128 \$0x20,$xb0,$xa0,$xt3 
# "de-interlace" further + vperm2i128 \$0x31,$xb0,$xa0,$xb0 + vperm2i128 \$0x20,$xb1,$xa1,$xa0 + vperm2i128 \$0x31,$xb1,$xa1,$xb1 + vperm2i128 \$0x20,$xb2,$xa2,$xa1 + vperm2i128 \$0x31,$xb2,$xa2,$xb2 + vperm2i128 \$0x20,$xb3,$xa3,$xa2 + vperm2i128 \$0x31,$xb3,$xa3,$xb3 +___ + ($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3); + my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1); +$code.=<<___; + vmovdqa $xa0,0x00(%rsp) # offload $xaN + vmovdqa $xa1,0x20(%rsp) + vmovdqa 0x40(%rsp),$xc2 # $xa0 + vmovdqa 0x60(%rsp),$xc3 # $xa1 + + vpaddd 0x180-0x200(%rax),$xc0,$xc0 + vpaddd 0x1a0-0x200(%rax),$xc1,$xc1 + vpaddd 0x1c0-0x200(%rax),$xc2,$xc2 + vpaddd 0x1e0-0x200(%rax),$xc3,$xc3 + + vpunpckldq $xc1,$xc0,$xt2 + vpunpckldq $xc3,$xc2,$xt3 + vpunpckhdq $xc1,$xc0,$xc0 + vpunpckhdq $xc3,$xc2,$xc2 + vpunpcklqdq $xt3,$xt2,$xc1 # "c0" + vpunpckhqdq $xt3,$xt2,$xt2 # "c1" + vpunpcklqdq $xc2,$xc0,$xc3 # "c2" + vpunpckhqdq $xc2,$xc0,$xc0 # "c3" +___ + ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2); +$code.=<<___; + vpaddd 0x200-0x200(%rax),$xd0,$xd0 + vpaddd 0x220-0x200(%rax),$xd1,$xd1 + vpaddd 0x240-0x200(%rax),$xd2,$xd2 + vpaddd 0x260-0x200(%rax),$xd3,$xd3 + + vpunpckldq $xd1,$xd0,$xt2 + vpunpckldq $xd3,$xd2,$xt3 + vpunpckhdq $xd1,$xd0,$xd0 + vpunpckhdq $xd3,$xd2,$xd2 + vpunpcklqdq $xt3,$xt2,$xd1 # "d0" + vpunpckhqdq $xt3,$xt2,$xt2 # "d1" + vpunpcklqdq $xd2,$xd0,$xd3 # "d2" + vpunpckhqdq $xd2,$xd0,$xd0 # "d3" +___ + ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2); +$code.=<<___; + vperm2i128 \$0x20,$xd0,$xc0,$xt3 # "de-interlace" further + vperm2i128 \$0x31,$xd0,$xc0,$xd0 + vperm2i128 \$0x20,$xd1,$xc1,$xc0 + vperm2i128 \$0x31,$xd1,$xc1,$xd1 + vperm2i128 \$0x20,$xd2,$xc2,$xc1 + vperm2i128 \$0x31,$xd2,$xc2,$xd2 + vperm2i128 \$0x20,$xd3,$xc3,$xc2 + vperm2i128 \$0x31,$xd3,$xc3,$xd3 +___ + ($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3); + ($xb0,$xb1,$xb2,$xb3,$xc0,$xc1,$xc2,$xc3)= + ($xc0,$xc1,$xc2,$xc3,$xb0,$xb1,$xb2,$xb3); + ($xa0,$xa1)=($xt2,$xt3); +$code.=<<___; + vmovdqa 0x00(%rsp),$xa0 # $xaN was offloaded, remember? 
+ vmovdqa 0x20(%rsp),$xa1 + + cmp \$64*8,$len + jb .Ltail8x + + vpxor 0x00($inp),$xa0,$xa0 # xor with input + vpxor 0x20($inp),$xb0,$xb0 + vpxor 0x40($inp),$xc0,$xc0 + vpxor 0x60($inp),$xd0,$xd0 + lea 0x80($inp),$inp # size optimization + vmovdqu $xa0,0x00($out) + vmovdqu $xb0,0x20($out) + vmovdqu $xc0,0x40($out) + vmovdqu $xd0,0x60($out) + lea 0x80($out),$out # size optimization + + vpxor 0x00($inp),$xa1,$xa1 + vpxor 0x20($inp),$xb1,$xb1 + vpxor 0x40($inp),$xc1,$xc1 + vpxor 0x60($inp),$xd1,$xd1 + lea 0x80($inp),$inp # size optimization + vmovdqu $xa1,0x00($out) + vmovdqu $xb1,0x20($out) + vmovdqu $xc1,0x40($out) + vmovdqu $xd1,0x60($out) + lea 0x80($out),$out # size optimization + + vpxor 0x00($inp),$xa2,$xa2 + vpxor 0x20($inp),$xb2,$xb2 + vpxor 0x40($inp),$xc2,$xc2 + vpxor 0x60($inp),$xd2,$xd2 + lea 0x80($inp),$inp # size optimization + vmovdqu $xa2,0x00($out) + vmovdqu $xb2,0x20($out) + vmovdqu $xc2,0x40($out) + vmovdqu $xd2,0x60($out) + lea 0x80($out),$out # size optimization + + vpxor 0x00($inp),$xa3,$xa3 + vpxor 0x20($inp),$xb3,$xb3 + vpxor 0x40($inp),$xc3,$xc3 + vpxor 0x60($inp),$xd3,$xd3 + lea 0x80($inp),$inp # size optimization + vmovdqu $xa3,0x00($out) + vmovdqu $xb3,0x20($out) + vmovdqu $xc3,0x40($out) + vmovdqu $xd3,0x60($out) + lea 0x80($out),$out # size optimization + + sub \$64*8,$len + jnz .Loop_outer8x + + jmp .Ldone8x + +.Ltail8x: + cmp \$448,$len + jae .L448_or_more8x + cmp \$384,$len + jae .L384_or_more8x + cmp \$320,$len + jae .L320_or_more8x + cmp \$256,$len + jae .L256_or_more8x + cmp \$192,$len + jae .L192_or_more8x + cmp \$128,$len + jae .L128_or_more8x + cmp \$64,$len + jae .L64_or_more8x + + xor %r10,%r10 + vmovdqa $xa0,0x00(%rsp) + vmovdqa $xb0,0x20(%rsp) + jmp .Loop_tail8x + +.align 32 +.L64_or_more8x: + vpxor 0x00($inp),$xa0,$xa0 # xor with input + vpxor 0x20($inp),$xb0,$xb0 + vmovdqu $xa0,0x00($out) + vmovdqu $xb0,0x20($out) + je .Ldone8x + + lea 0x40($inp),$inp # inp+=64*1 + xor %r10,%r10 + vmovdqa $xc0,0x00(%rsp) + lea 0x40($out),$out # out+=64*1 + sub \$64,$len # len-=64*1 + vmovdqa $xd0,0x20(%rsp) + jmp .Loop_tail8x + +.align 32 +.L128_or_more8x: + vpxor 0x00($inp),$xa0,$xa0 # xor with input + vpxor 0x20($inp),$xb0,$xb0 + vpxor 0x40($inp),$xc0,$xc0 + vpxor 0x60($inp),$xd0,$xd0 + vmovdqu $xa0,0x00($out) + vmovdqu $xb0,0x20($out) + vmovdqu $xc0,0x40($out) + vmovdqu $xd0,0x60($out) + je .Ldone8x + + lea 0x80($inp),$inp # inp+=64*2 + xor %r10,%r10 + vmovdqa $xa1,0x00(%rsp) + lea 0x80($out),$out # out+=64*2 + sub \$128,$len # len-=64*2 + vmovdqa $xb1,0x20(%rsp) + jmp .Loop_tail8x + +.align 32 +.L192_or_more8x: + vpxor 0x00($inp),$xa0,$xa0 # xor with input + vpxor 0x20($inp),$xb0,$xb0 + vpxor 0x40($inp),$xc0,$xc0 + vpxor 0x60($inp),$xd0,$xd0 + vpxor 0x80($inp),$xa1,$xa1 + vpxor 0xa0($inp),$xb1,$xb1 + vmovdqu $xa0,0x00($out) + vmovdqu $xb0,0x20($out) + vmovdqu $xc0,0x40($out) + vmovdqu $xd0,0x60($out) + vmovdqu $xa1,0x80($out) + vmovdqu $xb1,0xa0($out) + je .Ldone8x + + lea 0xc0($inp),$inp # inp+=64*3 + xor %r10,%r10 + vmovdqa $xc1,0x00(%rsp) + lea 0xc0($out),$out # out+=64*3 + sub \$192,$len # len-=64*3 + vmovdqa $xd1,0x20(%rsp) + jmp .Loop_tail8x + +.align 32 +.L256_or_more8x: + vpxor 0x00($inp),$xa0,$xa0 # xor with input + vpxor 0x20($inp),$xb0,$xb0 + vpxor 0x40($inp),$xc0,$xc0 + vpxor 0x60($inp),$xd0,$xd0 + vpxor 0x80($inp),$xa1,$xa1 + vpxor 0xa0($inp),$xb1,$xb1 + vpxor 0xc0($inp),$xc1,$xc1 + vpxor 0xe0($inp),$xd1,$xd1 + vmovdqu $xa0,0x00($out) + vmovdqu $xb0,0x20($out) + vmovdqu $xc0,0x40($out) + vmovdqu $xd0,0x60($out) + vmovdqu $xa1,0x80($out) + vmovdqu 
$xb1,0xa0($out) + vmovdqu $xc1,0xc0($out) + vmovdqu $xd1,0xe0($out) + je .Ldone8x + + lea 0x100($inp),$inp # inp+=64*4 + xor %r10,%r10 + vmovdqa $xa2,0x00(%rsp) + lea 0x100($out),$out # out+=64*4 + sub \$256,$len # len-=64*4 + vmovdqa $xb2,0x20(%rsp) + jmp .Loop_tail8x + +.align 32 +.L320_or_more8x: + vpxor 0x00($inp),$xa0,$xa0 # xor with input + vpxor 0x20($inp),$xb0,$xb0 + vpxor 0x40($inp),$xc0,$xc0 + vpxor 0x60($inp),$xd0,$xd0 + vpxor 0x80($inp),$xa1,$xa1 + vpxor 0xa0($inp),$xb1,$xb1 + vpxor 0xc0($inp),$xc1,$xc1 + vpxor 0xe0($inp),$xd1,$xd1 + vpxor 0x100($inp),$xa2,$xa2 + vpxor 0x120($inp),$xb2,$xb2 + vmovdqu $xa0,0x00($out) + vmovdqu $xb0,0x20($out) + vmovdqu $xc0,0x40($out) + vmovdqu $xd0,0x60($out) + vmovdqu $xa1,0x80($out) + vmovdqu $xb1,0xa0($out) + vmovdqu $xc1,0xc0($out) + vmovdqu $xd1,0xe0($out) + vmovdqu $xa2,0x100($out) + vmovdqu $xb2,0x120($out) + je .Ldone8x + + lea 0x140($inp),$inp # inp+=64*5 + xor %r10,%r10 + vmovdqa $xc2,0x00(%rsp) + lea 0x140($out),$out # out+=64*5 + sub \$320,$len # len-=64*5 + vmovdqa $xd2,0x20(%rsp) + jmp .Loop_tail8x + +.align 32 +.L384_or_more8x: + vpxor 0x00($inp),$xa0,$xa0 # xor with input + vpxor 0x20($inp),$xb0,$xb0 + vpxor 0x40($inp),$xc0,$xc0 + vpxor 0x60($inp),$xd0,$xd0 + vpxor 0x80($inp),$xa1,$xa1 + vpxor 0xa0($inp),$xb1,$xb1 + vpxor 0xc0($inp),$xc1,$xc1 + vpxor 0xe0($inp),$xd1,$xd1 + vpxor 0x100($inp),$xa2,$xa2 + vpxor 0x120($inp),$xb2,$xb2 + vpxor 0x140($inp),$xc2,$xc2 + vpxor 0x160($inp),$xd2,$xd2 + vmovdqu $xa0,0x00($out) + vmovdqu $xb0,0x20($out) + vmovdqu $xc0,0x40($out) + vmovdqu $xd0,0x60($out) + vmovdqu $xa1,0x80($out) + vmovdqu $xb1,0xa0($out) + vmovdqu $xc1,0xc0($out) + vmovdqu $xd1,0xe0($out) + vmovdqu $xa2,0x100($out) + vmovdqu $xb2,0x120($out) + vmovdqu $xc2,0x140($out) + vmovdqu $xd2,0x160($out) + je .Ldone8x + + lea 0x180($inp),$inp # inp+=64*6 + xor %r10,%r10 + vmovdqa $xa3,0x00(%rsp) + lea 0x180($out),$out # out+=64*6 + sub \$384,$len # len-=64*6 + vmovdqa $xb3,0x20(%rsp) + jmp .Loop_tail8x + +.align 32 +.L448_or_more8x: + vpxor 0x00($inp),$xa0,$xa0 # xor with input + vpxor 0x20($inp),$xb0,$xb0 + vpxor 0x40($inp),$xc0,$xc0 + vpxor 0x60($inp),$xd0,$xd0 + vpxor 0x80($inp),$xa1,$xa1 + vpxor 0xa0($inp),$xb1,$xb1 + vpxor 0xc0($inp),$xc1,$xc1 + vpxor 0xe0($inp),$xd1,$xd1 + vpxor 0x100($inp),$xa2,$xa2 + vpxor 0x120($inp),$xb2,$xb2 + vpxor 0x140($inp),$xc2,$xc2 + vpxor 0x160($inp),$xd2,$xd2 + vpxor 0x180($inp),$xa3,$xa3 + vpxor 0x1a0($inp),$xb3,$xb3 + vmovdqu $xa0,0x00($out) + vmovdqu $xb0,0x20($out) + vmovdqu $xc0,0x40($out) + vmovdqu $xd0,0x60($out) + vmovdqu $xa1,0x80($out) + vmovdqu $xb1,0xa0($out) + vmovdqu $xc1,0xc0($out) + vmovdqu $xd1,0xe0($out) + vmovdqu $xa2,0x100($out) + vmovdqu $xb2,0x120($out) + vmovdqu $xc2,0x140($out) + vmovdqu $xd2,0x160($out) + vmovdqu $xa3,0x180($out) + vmovdqu $xb3,0x1a0($out) + je .Ldone8x + + lea 0x1c0($inp),$inp # inp+=64*7 + xor %r10,%r10 + vmovdqa $xc3,0x00(%rsp) + lea 0x1c0($out),$out # out+=64*7 + sub \$448,$len # len-=64*7 + vmovdqa $xd3,0x20(%rsp) + +.Loop_tail8x: + movzb ($inp,%r10),%eax + movzb (%rsp,%r10),%ecx + lea 1(%r10),%r10 + xor %ecx,%eax + mov %al,-1($out,%r10) + dec $len + jnz .Loop_tail8x + +.Ldone8x: + vzeroall +___ +$code.=<<___ if ($win64); + movaps -0xa8(%r9),%xmm6 + movaps -0x98(%r9),%xmm7 + movaps -0x88(%r9),%xmm8 + movaps -0x78(%r9),%xmm9 + movaps -0x68(%r9),%xmm10 + movaps -0x58(%r9),%xmm11 + movaps -0x48(%r9),%xmm12 + movaps -0x38(%r9),%xmm13 + movaps -0x28(%r9),%xmm14 + movaps -0x18(%r9),%xmm15 +___ +$code.=<<___; + lea (%r9),%rsp +.cfi_def_cfa_register rsp 
+.L8x_epilogue: + ret +.cfi_endproc +.size ChaCha20_ctr32_avx2,.-ChaCha20_ctr32_avx2 +___ +} + +######################################################################## +# AVX512 code paths were removed + +# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, +# CONTEXT *context,DISPATCHER_CONTEXT *disp) +if ($win64) { +$rec="%rcx"; +$frame="%rdx"; +$context="%r8"; +$disp="%r9"; + +$code.=<<___; +.extern __imp_RtlVirtualUnwind +.type se_handler,\@abi-omnipotent +.align 16 +se_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 120($context),%rax # pull context->Rax + mov 248($context),%rbx # pull context->Rip + + mov 8($disp),%rsi # disp->ImageBase + mov 56($disp),%r11 # disp->HandlerData + + lea .Lctr32_body(%rip),%r10 + cmp %r10,%rbx # context->Rip<.Lprologue + jb .Lcommon_seh_tail + + mov 152($context),%rax # pull context->Rsp + + lea .Lno_data(%rip),%r10 # epilogue label + cmp %r10,%rbx # context->Rip>=.Lepilogue + jae .Lcommon_seh_tail + + lea 64+24+48(%rax),%rax + + mov -8(%rax),%rbx + mov -16(%rax),%rbp + mov -24(%rax),%r12 + mov -32(%rax),%r13 + mov -40(%rax),%r14 + mov -48(%rax),%r15 + mov %rbx,144($context) # restore context->Rbx + mov %rbp,160($context) # restore context->Rbp + mov %r12,216($context) # restore context->R12 + mov %r13,224($context) # restore context->R13 + mov %r14,232($context) # restore context->R14 + mov %r15,240($context) # restore context->R14 + +.Lcommon_seh_tail: + mov 8(%rax),%rdi + mov 16(%rax),%rsi + mov %rax,152($context) # restore context->Rsp + mov %rsi,168($context) # restore context->Rsi + mov %rdi,176($context) # restore context->Rdi + + mov 40($disp),%rdi # disp->ContextRecord + mov $context,%rsi # context + mov \$154,%ecx # sizeof(CONTEXT) + .long 0xa548f3fc # cld; rep movsq + + mov $disp,%rsi + xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER + mov 8(%rsi),%rdx # arg2, disp->ImageBase + mov 0(%rsi),%r8 # arg3, disp->ControlPc + mov 16(%rsi),%r9 # arg4, disp->FunctionEntry + mov 40(%rsi),%r10 # disp->ContextRecord + lea 56(%rsi),%r11 # &disp->HandlerData + lea 24(%rsi),%r12 # &disp->EstablisherFrame + mov %r10,32(%rsp) # arg5 + mov %r11,40(%rsp) # arg6 + mov %r12,48(%rsp) # arg7 + mov %rcx,56(%rsp) # arg8, (NULL) + call *__imp_RtlVirtualUnwind(%rip) + + mov \$1,%eax # ExceptionContinueSearch + add \$64,%rsp + popfq + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + pop %rdi + pop %rsi + ret +.size se_handler,.-se_handler + +.type ssse3_handler,\@abi-omnipotent +.align 16 +ssse3_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 120($context),%rax # pull context->Rax + mov 248($context),%rbx # pull context->Rip + + mov 8($disp),%rsi # disp->ImageBase + mov 56($disp),%r11 # disp->HandlerData + + mov 0(%r11),%r10d # HandlerData[0] + lea (%rsi,%r10),%r10 # prologue label + cmp %r10,%rbx # context->RipR9 + + mov 4(%r11),%r10d # HandlerData[1] + lea (%rsi,%r10),%r10 # epilogue label + cmp %r10,%rbx # context->Rip>=epilogue label + jae .Lcommon_seh_tail + + lea -0x28(%rax),%rsi + lea 512($context),%rdi # &context.Xmm6 + mov \$4,%ecx + .long 0xa548f3fc # cld; rep movsq + + jmp .Lcommon_seh_tail +.size ssse3_handler,.-ssse3_handler + +.type full_handler,\@abi-omnipotent +.align 16 +full_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 120($context),%rax # pull context->Rax 
+ mov 248($context),%rbx # pull context->Rip + + mov 8($disp),%rsi # disp->ImageBase + mov 56($disp),%r11 # disp->HandlerData + + mov 0(%r11),%r10d # HandlerData[0] + lea (%rsi,%r10),%r10 # prologue label + cmp %r10,%rbx # context->RipR9 + + mov 4(%r11),%r10d # HandlerData[1] + lea (%rsi,%r10),%r10 # epilogue label + cmp %r10,%rbx # context->Rip>=epilogue label + jae .Lcommon_seh_tail + + lea -0xa8(%rax),%rsi + lea 512($context),%rdi # &context.Xmm6 + mov \$20,%ecx + .long 0xa548f3fc # cld; rep movsq + + jmp .Lcommon_seh_tail +.size full_handler,.-full_handler + +.section .pdata +.align 4 + .rva .LSEH_begin_ChaCha20_ctr32_nohw + .rva .LSEH_end_ChaCha20_ctr32_nohw + .rva .LSEH_info_ChaCha20_ctr32_nohw + + .rva .LSEH_begin_ChaCha20_ctr32_ssse3_4x + .rva .LSEH_end_ChaCha20_ctr32_ssse3_4x + .rva .LSEH_info_ChaCha20_ctr32_ssse3_4x +___ +$code.=<<___ if ($avx>1); + .rva .LSEH_begin_ChaCha20_ctr32_avx2 + .rva .LSEH_end_ChaCha20_ctr32_avx2 + .rva .LSEH_info_ChaCha20_ctr32_avx2 +___ +$code.=<<___; +.section .xdata +.align 8 +.LSEH_info_ChaCha20_ctr32_nohw: + .byte 9,0,0,0 + .rva se_handler + +.LSEH_info_ChaCha20_ctr32_ssse3_4x: + .byte 9,0,0,0 + .rva full_handler + .rva .L4x_body,.L4x_epilogue +___ +$code.=<<___ if ($avx>1); +.LSEH_info_ChaCha20_ctr32_avx2: + .byte 9,0,0,0 + .rva full_handler + .rva .L8x_body,.L8x_epilogue # HandlerData[] +___ +} + +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/ge; + + s/%x#%[yz]/%x/g; # "down-shift" + + print $_,"\n"; +} + +close STDOUT or die "error closing STDOUT: $!"; diff --git a/ring-0.17.14/crypto/cipher/asm/chacha20_poly1305_armv8.pl b/ring-0.17.14/crypto/cipher/asm/chacha20_poly1305_armv8.pl new file mode 100644 index 0000000000..06a35e067e --- /dev/null +++ b/ring-0.17.14/crypto/cipher/asm/chacha20_poly1305_armv8.pl @@ -0,0 +1,1643 @@ +#!/usr/bin/env perl + +# Copyright (c) 2020, CloudFlare Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +############################################################################## +# # +# Author: Vlad Krasnov # +# # +############################################################################## + +$flavour = shift; +while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or +die "can't locate arm-xlate.pl"; + +open OUT,"| \"$^X\" $xlate $flavour $output"; +*STDOUT=*OUT; + +my ($oup,$inp,$inl,$adp,$adl,$keyp,$itr1,$itr2) = ("x0","x1","x2","x3","x4","x5","x6","x7"); +my ($acc0,$acc1,$acc2) = map("x$_",(8..10)); +my ($t0,$t1,$t2,$t3) = map("x$_",(11..14)); +my ($one, $r0, $r1) = ("x15","x16","x17"); +my ($t0w) = $t0 =~ s/x/w/r; + +my ($A0,$A1,$A2,$A3,$A4,$B0,$B1,$B2,$B3,$B4,$C0,$C1,$C2,$C3,$C4,$D0,$D1,$D2,$D3,$D4) = map("v$_",(0..19)); +my ($T0,$T1,$T2,$T3) = map("v$_",(20..23)); + +my $CONSTS = "v24"; +my $INC = "v25"; +my $ROL8 = "v26"; +my $CLAMP = "v27"; + +my ($B_STORE, $C_STORE, $D_STORE) = map("v$_",(28..30)); + +my $S_STORE = $CLAMP; +my $LEN_STORE = "v31"; + +sub chacha_qr { +my ($a,$b,$c,$d,$t,$dir)=@_; +my ($shift_b,$shift_d) = $dir =~ /left/ ? ("#4","#12") : ("#12","#4"); +$code.=<<___; + add $a.4s, $a.4s, $b.4s + eor $d.16b, $d.16b, $a.16b + rev32 $d.8h, $d.8h + + add $c.4s, $c.4s, $d.4s + eor $b.16b, $b.16b, $c.16b + ushr $t.4s, $b.4s, #20 + sli $t.4s, $b.4s, #12 +___ + ($t,$b) = ($b,$t); +$code.=<<___; + add $a.4s, $a.4s, $b.4s + eor $d.16b, $d.16b, $a.16b + tbl $d.16b, {$d.16b}, $ROL8.16b + + add $c.4s, $c.4s, $d.4s + eor $b.16b, $b.16b, $c.16b + ushr $t.4s, $b.4s, #25 + sli $t.4s, $b.4s, #7 +___ + ($t,$b) = ($b,$t); +$code.=<<___; + ext $b.16b, $b.16b, $b.16b, $shift_b + ext $c.16b, $c.16b, $c.16b, #8 + ext $d.16b, $d.16b, $d.16b, $shift_d +___ +} + +sub poly_add { +my ($src)=@_; +$code.="ldp $t0, $t1, [$src], 16 + adds $acc0, $acc0, $t0 + adcs $acc1, $acc1, $t1 + adc $acc2, $acc2, $one\n"; +} + +sub poly_add_vec { +my ($src)=@_; +$code.="mov $t0, $src.d[0] + mov $t1, $src.d[1] + adds $acc0, $acc0, $t0 + adcs $acc1, $acc1, $t1 + adc $acc2, $acc2, $one\n"; +} + +sub poly_stage1 { +$code.="mul $t0, $acc0, $r0 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh $t1, $acc0, $r0 + mul $t2, $acc1, $r0 + umulh $t3, $acc1, $r0 + adds $t1, $t1, $t2 + mul $t2, $acc2, $r0 + adc $t2, $t2, $t3\n"; +} + +sub poly_stage2 { +$code.="mul $t3, $acc0, $r1 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh $acc0, $acc0, $r1 + adds $t1, $t1, $t3 + mul $t3, $acc1, $r1 + umulh $acc1, $acc1, $r1 + adcs $t3, $t3, $acc0 + mul $acc2, $acc2, $r1 + adc $acc2, $acc2, $acc1 + adds $t2, $t2, $t3 + adc $t3, $acc2, xzr\n"; +} + +# At the beginning of the reduce stage t = [t3:t2:t1:t0] is a product of +# r = [r1:r0] and acc = [acc2:acc1:acc0] +# r is 124 bits at most (due to clamping) and acc is 131 bits at most +# (acc2 is at most 4 before the addition and can be at most 6 when we add in +# the next block) therefore t is at most 255 bits big, and t3 is 63 bits. 
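+#
+# The fold relies on 2^130 = 5 (mod p) for p = 2^130 - 5: writing
+# t = lo + 2^130*hi, with lo = t mod 2^130 and hi = t >> 130, gives
+# t = lo + 5*hi = lo + 4*hi + hi (mod p), which is what the limb arithmetic
+# in poly_reduce_stage computes. As an illustrative sketch only (this
+# reference helper is ours, is never called by the generator, and only
+# assumes core Perl's Math::BigInt), the same fold on scalars is:
+use Math::BigInt;
+sub poly_reduce_ref {                             # documentation only
+    my ($t) = @_;                                 # t = r * acc, below 2^255
+    my $two130 = Math::BigInt->new(1)->blsft(130);
+    my $lo = $t->copy->bmod($two130);             # low 130 bits of t
+    my $hi = $t->copy->brsft(130);                # t >> 130
+    # congruent to t mod p but not fully reduced; the code keeps acc in this
+    # form and only does a final conditional subtraction of p at the very end
+    return $lo->badd($hi->copy->bmul(4))->badd($hi);
+}
+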
+sub poly_reduce_stage { +$code.="and $acc2, $t2, #3 // At this point acc2 is 2 bits at most (value of 3) + and $acc0, $t2, #-4 + extr $t2, $t3, $t2, #2 + adds $acc0, $acc0, $t0 + lsr $t0, $t3, #2 + adc $acc1, $t3, $t0 // No carry out since t0 is 61 bits and t3 is 63 bits + adds $acc0, $acc0, $t2 + adcs $acc1, $acc1, $t1 + adc $acc2, $acc2, xzr // At this point acc2 has the value of 4 at most \n"; +} + +sub poly_mul { + &poly_stage1(); + &poly_stage2(); + &poly_reduce_stage(); +} + +sub chacha_qr_x3 { +my ($dir)=@_; +my ($shift_b,$shift_d) = $dir =~ /left/ ? ("#4","#12") : ("#12","#4"); +$code.=<<___; + add $A0.4s, $A0.4s, $B0.4s + add $A1.4s, $A1.4s, $B1.4s + add $A2.4s, $A2.4s, $B2.4s + eor $D0.16b, $D0.16b, $A0.16b + eor $D1.16b, $D1.16b, $A1.16b + eor $D2.16b, $D2.16b, $A2.16b + rev32 $D0.8h, $D0.8h + rev32 $D1.8h, $D1.8h + rev32 $D2.8h, $D2.8h + + add $C0.4s, $C0.4s, $D0.4s + add $C1.4s, $C1.4s, $D1.4s + add $C2.4s, $C2.4s, $D2.4s + eor $B0.16b, $B0.16b, $C0.16b + eor $B1.16b, $B1.16b, $C1.16b + eor $B2.16b, $B2.16b, $C2.16b + ushr $T0.4s, $B0.4s, #20 + sli $T0.4s, $B0.4s, #12 + ushr $B0.4s, $B1.4s, #20 + sli $B0.4s, $B1.4s, #12 + ushr $B1.4s, $B2.4s, #20 + sli $B1.4s, $B2.4s, #12 + + add $A0.4s, $A0.4s, $T0.4s + add $A1.4s, $A1.4s, $B0.4s + add $A2.4s, $A2.4s, $B1.4s + eor $D0.16b, $D0.16b, $A0.16b + eor $D1.16b, $D1.16b, $A1.16b + eor $D2.16b, $D2.16b, $A2.16b + tbl $D0.16b, {$D0.16b}, $ROL8.16b + tbl $D1.16b, {$D1.16b}, $ROL8.16b + tbl $D2.16b, {$D2.16b}, $ROL8.16b + + add $C0.4s, $C0.4s, $D0.4s + add $C1.4s, $C1.4s, $D1.4s + add $C2.4s, $C2.4s, $D2.4s + eor $T0.16b, $T0.16b, $C0.16b + eor $B0.16b, $B0.16b, $C1.16b + eor $B1.16b, $B1.16b, $C2.16b + ushr $B2.4s, $B1.4s, #25 + sli $B2.4s, $B1.4s, #7 + ushr $B1.4s, $B0.4s, #25 + sli $B1.4s, $B0.4s, #7 + ushr $B0.4s, $T0.4s, #25 + sli $B0.4s, $T0.4s, #7 + + ext $B0.16b, $B0.16b, $B0.16b, $shift_b + ext $B1.16b, $B1.16b, $B1.16b, $shift_b + ext $B2.16b, $B2.16b, $B2.16b, $shift_b + + ext $C0.16b, $C0.16b, $C0.16b, #8 + ext $C1.16b, $C1.16b, $C1.16b, #8 + ext $C2.16b, $C2.16b, $C2.16b, #8 + + ext $D0.16b, $D0.16b, $D0.16b, $shift_d + ext $D1.16b, $D1.16b, $D1.16b, $shift_d + ext $D2.16b, $D2.16b, $D2.16b, $shift_d +___ +} + +# When preparing 5 ChaCha20 blocks in parallel, we operate on 4 blocks vertically as introduced by Andrew Moon +# the fifth block is done horizontally +sub chacha_qr_x5 { +my ($dir)=@_; +my ($a0,$a1,$a2,$a3) = $dir =~ /left/ ? ($A0,$A1,$A2,$A3) : ($A0,$A1,$A2,$A3); +my ($b0,$b1,$b2,$b3) = $dir =~ /left/ ? ($B0,$B1,$B2,$B3) : ($B1,$B2,$B3,$B0); +my ($c0,$c1,$c2,$c3) = $dir =~ /left/ ? ($C0,$C1,$C2,$C3) : ($C2,$C3,$C0,$C1); +my ($d0,$d1,$d2,$d3) = $dir =~ /left/ ? ($D0,$D1,$D2,$D3) : ($D3,$D0,$D1,$D2); +my ($shift_b,$shift_d) = $dir =~ /left/ ? 
("#4","#12") : ("#12","#4"); +$code.=<<___; + add $a0.4s, $a0.4s, $b0.4s + add $a1.4s, $a1.4s, $b1.4s + add $a2.4s, $a2.4s, $b2.4s + add $a3.4s, $a3.4s, $b3.4s + add $A4.4s, $A4.4s, $B4.4s + + eor $d0.16b, $d0.16b, $a0.16b + eor $d1.16b, $d1.16b, $a1.16b + eor $d2.16b, $d2.16b, $a2.16b + eor $d3.16b, $d3.16b, $a3.16b + eor $D4.16b, $D4.16b, $A4.16b + + rev32 $d0.8h, $d0.8h + rev32 $d1.8h, $d1.8h + rev32 $d2.8h, $d2.8h + rev32 $d3.8h, $d3.8h + rev32 $D4.8h, $D4.8h + + add $c0.4s, $c0.4s, $d0.4s + add $c1.4s, $c1.4s, $d1.4s + add $c2.4s, $c2.4s, $d2.4s + add $c3.4s, $c3.4s, $d3.4s + add $C4.4s, $C4.4s, $D4.4s + + eor $b0.16b, $b0.16b, $c0.16b + eor $b1.16b, $b1.16b, $c1.16b + eor $b2.16b, $b2.16b, $c2.16b + eor $b3.16b, $b3.16b, $c3.16b + eor $B4.16b, $B4.16b, $C4.16b + + ushr $T0.4s, $b0.4s, #20 + sli $T0.4s, $b0.4s, #12 + ushr $b0.4s, $b1.4s, #20 + sli $b0.4s, $b1.4s, #12 + ushr $b1.4s, $b2.4s, #20 + sli $b1.4s, $b2.4s, #12 + ushr $b2.4s, $b3.4s, #20 + sli $b2.4s, $b3.4s, #12 + ushr $b3.4s, $B4.4s, #20 + sli $b3.4s, $B4.4s, #12 + + add $a0.4s, $a0.4s, $T0.4s + add $a1.4s, $a1.4s, $b0.4s + add $a2.4s, $a2.4s, $b1.4s + add $a3.4s, $a3.4s, $b2.4s + add $A4.4s, $A4.4s, $b3.4s + + eor $d0.16b, $d0.16b, $a0.16b + eor $d1.16b, $d1.16b, $a1.16b + eor $d2.16b, $d2.16b, $a2.16b + eor $d3.16b, $d3.16b, $a3.16b + eor $D4.16b, $D4.16b, $A4.16b + + tbl $d0.16b, {$d0.16b}, $ROL8.16b + tbl $d1.16b, {$d1.16b}, $ROL8.16b + tbl $d2.16b, {$d2.16b}, $ROL8.16b + tbl $d3.16b, {$d3.16b}, $ROL8.16b + tbl $D4.16b, {$D4.16b}, $ROL8.16b + + add $c0.4s, $c0.4s, $d0.4s + add $c1.4s, $c1.4s, $d1.4s + add $c2.4s, $c2.4s, $d2.4s + add $c3.4s, $c3.4s, $d3.4s + add $C4.4s, $C4.4s, $D4.4s + + eor $T0.16b, $T0.16b, $c0.16b + eor $b0.16b, $b0.16b, $c1.16b + eor $b1.16b, $b1.16b, $c2.16b + eor $b2.16b, $b2.16b, $c3.16b + eor $b3.16b, $b3.16b, $C4.16b + + ushr $B4.4s, $b3.4s, #25 + sli $B4.4s, $b3.4s, #7 + ushr $b3.4s, $b2.4s, #25 + sli $b3.4s, $b2.4s, #7 + ushr $b2.4s, $b1.4s, #25 + sli $b2.4s, $b1.4s, #7 + ushr $b1.4s, $b0.4s, #25 + sli $b1.4s, $b0.4s, #7 + ushr $b0.4s, $T0.4s, #25 + sli $b0.4s, $T0.4s, #7 + + ext $B4.16b, $B4.16b, $B4.16b, $shift_b + ext $C4.16b, $C4.16b, $C4.16b, #8 + ext $D4.16b, $D4.16b, $D4.16b, $shift_d +___ +} + +{ +$code.=<<___; +.section .rodata + +.align 7 +.Lchacha20_consts: +.byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k' +.Linc: +.long 1,2,3,4 +.Lrol8: +.byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 +.Lclamp: +.quad 0x0FFFFFFC0FFFFFFF, 0x0FFFFFFC0FFFFFFC + +.text + +.type .Lpoly_hash_ad_internal,%function +.align 6 +.Lpoly_hash_ad_internal: + .cfi_startproc + cbnz $adl, .Lpoly_hash_intro + ret + +.Lpoly_hash_intro: + cmp $adl, #16 + b.lt .Lpoly_hash_ad_tail +___ + &poly_add($adp); + &poly_mul(); +$code.=<<___; + sub $adl, $adl, #16 + b .Lpoly_hash_ad_internal + +.Lpoly_hash_ad_tail: + cbz $adl, .Lpoly_hash_ad_ret + + eor $T0.16b, $T0.16b, $T0.16b // Use T0 to load the AAD + sub $adl, $adl, #1 + +.Lpoly_hash_tail_16_compose: + ext $T0.16b, $T0.16b, $T0.16b, #15 + ldrb $t0w, [$adp, $adl] + mov $T0.b[0], $t0w + subs $adl, $adl, #1 + b.ge .Lpoly_hash_tail_16_compose +___ + &poly_add_vec($T0); + &poly_mul(); +$code.=<<___; + +.Lpoly_hash_ad_ret: + ret + .cfi_endproc +.size .Lpoly_hash_ad_internal, .-.Lpoly_hash_ad_internal + +///////////////////////////////// +// +// void chacha20_poly1305_seal(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, union open_data *seal_data); +// +.globl chacha20_poly1305_seal +.type chacha20_poly1305_seal,%function +.align 6 
+chacha20_poly1305_seal: + AARCH64_SIGN_LINK_REGISTER +.cfi_startproc + stp x29, x30, [sp, #-80]! +.cfi_def_cfa_offset 80 +.cfi_offset w30, -72 +.cfi_offset w29, -80 + mov x29, sp + // We probably could do .cfi_def_cfa w29, 80 at this point, but since + // we don't actually use the frame pointer like that, it's probably not + // worth bothering. + stp d8, d9, [sp, #16] + stp d10, d11, [sp, #32] + stp d12, d13, [sp, #48] + stp d14, d15, [sp, #64] +.cfi_offset b15, -8 +.cfi_offset b14, -16 +.cfi_offset b13, -24 +.cfi_offset b12, -32 +.cfi_offset b11, -40 +.cfi_offset b10, -48 +.cfi_offset b9, -56 +.cfi_offset b8, -64 + + adrp $t0, :pg_hi21:.Lchacha20_consts + add $t0, $t0, :lo12:.Lchacha20_consts + + ld1 {$CONSTS.16b - $CLAMP.16b}, [$t0] // Load the CONSTS, INC, ROL8 and CLAMP values + ld1 {$B_STORE.16b - $D_STORE.16b}, [$keyp] + + mov $one, #1 // Prepare the Poly1305 state + mov $acc0, #0 + mov $acc1, #0 + mov $acc2, #0 + + ldr $t1, [$keyp, #56] // The total cipher text length includes extra_in_len + add $t1, $t1, $inl + mov $LEN_STORE.d[0], $adl // Store the input and aad lengths + mov $LEN_STORE.d[1], $t1 + + cmp $inl, #128 + b.le .Lseal_128 // Optimization for smaller buffers + + // Initially we prepare 5 ChaCha20 blocks. Four to encrypt up to 4 blocks (256 bytes) of plaintext, + // and one for the Poly1305 R and S keys. The first four blocks (A0-A3..D0-D3) are computed vertically, + // the fifth block (A4-D4) horizontally. + ld4r {$A0.4s-$A3.4s}, [$t0] + mov $A4.16b, $CONSTS.16b + + ld4r {$B0.4s-$B3.4s}, [$keyp], #16 + mov $B4.16b, $B_STORE.16b + + ld4r {$C0.4s-$C3.4s}, [$keyp], #16 + mov $C4.16b, $C_STORE.16b + + ld4r {$D0.4s-$D3.4s}, [$keyp] + add $D0.4s, $D0.4s, $INC.4s + mov $D4.16b, $D_STORE.16b + + sub $keyp, $keyp, #32 + + mov $itr1, #10 + +.align 5 +.Lseal_init_rounds: +___ + &chacha_qr_x5("left"); + &chacha_qr_x5("right"); +$code.=<<___; + subs $itr1, $itr1, #1 + b.hi .Lseal_init_rounds + + add $D0.4s, $D0.4s, $INC.4s + mov $t0, #4 + dup $T0.4s, $t0w + add $INC.4s, $INC.4s, $T0.4s + + zip1 $T0.4s, $A0.4s, $A1.4s + zip2 $T1.4s, $A0.4s, $A1.4s + zip1 $T2.4s, $A2.4s, $A3.4s + zip2 $T3.4s, $A2.4s, $A3.4s + + zip1 $A0.2d, $T0.2d, $T2.2d + zip2 $A1.2d, $T0.2d, $T2.2d + zip1 $A2.2d, $T1.2d, $T3.2d + zip2 $A3.2d, $T1.2d, $T3.2d + + zip1 $T0.4s, $B0.4s, $B1.4s + zip2 $T1.4s, $B0.4s, $B1.4s + zip1 $T2.4s, $B2.4s, $B3.4s + zip2 $T3.4s, $B2.4s, $B3.4s + + zip1 $B0.2d, $T0.2d, $T2.2d + zip2 $B1.2d, $T0.2d, $T2.2d + zip1 $B2.2d, $T1.2d, $T3.2d + zip2 $B3.2d, $T1.2d, $T3.2d + + zip1 $T0.4s, $C0.4s, $C1.4s + zip2 $T1.4s, $C0.4s, $C1.4s + zip1 $T2.4s, $C2.4s, $C3.4s + zip2 $T3.4s, $C2.4s, $C3.4s + + zip1 $C0.2d, $T0.2d, $T2.2d + zip2 $C1.2d, $T0.2d, $T2.2d + zip1 $C2.2d, $T1.2d, $T3.2d + zip2 $C3.2d, $T1.2d, $T3.2d + + zip1 $T0.4s, $D0.4s, $D1.4s + zip2 $T1.4s, $D0.4s, $D1.4s + zip1 $T2.4s, $D2.4s, $D3.4s + zip2 $T3.4s, $D2.4s, $D3.4s + + zip1 $D0.2d, $T0.2d, $T2.2d + zip2 $D1.2d, $T0.2d, $T2.2d + zip1 $D2.2d, $T1.2d, $T3.2d + zip2 $D3.2d, $T1.2d, $T3.2d + + add $A4.4s, $A4.4s, $CONSTS.4s + add $B4.4s, $B4.4s, $B_STORE.4s + and $A4.16b, $A4.16b, $CLAMP.16b + + add $A0.4s, $A0.4s, $CONSTS.4s + add $B0.4s, $B0.4s, $B_STORE.4s + add $C0.4s, $C0.4s, $C_STORE.4s + add $D0.4s, $D0.4s, $D_STORE.4s + + add $A1.4s, $A1.4s, $CONSTS.4s + add $B1.4s, $B1.4s, $B_STORE.4s + add $C1.4s, $C1.4s, $C_STORE.4s + add $D1.4s, $D1.4s, $D_STORE.4s + + add $A2.4s, $A2.4s, $CONSTS.4s + add $B2.4s, $B2.4s, $B_STORE.4s + add $C2.4s, $C2.4s, $C_STORE.4s + add $D2.4s, $D2.4s, $D_STORE.4s + + add $A3.4s, $A3.4s, $CONSTS.4s 
+ add $B3.4s, $B3.4s, $B_STORE.4s + add $C3.4s, $C3.4s, $C_STORE.4s + add $D3.4s, $D3.4s, $D_STORE.4s + + mov $r0, $A4.d[0] // Move the R key to GPRs + mov $r1, $A4.d[1] + mov $S_STORE.16b, $B4.16b // Store the S key + + bl .Lpoly_hash_ad_internal + + mov $adp, $oup + cmp $inl, #256 + b.le .Lseal_tail + + ld1 {$T0.16b - $T3.16b}, [$inp], #64 + eor $T0.16b, $T0.16b, $A0.16b + eor $T1.16b, $T1.16b, $B0.16b + eor $T2.16b, $T2.16b, $C0.16b + eor $T3.16b, $T3.16b, $D0.16b + st1 {$T0.16b - $T3.16b}, [$oup], #64 + + ld1 {$T0.16b - $T3.16b}, [$inp], #64 + eor $T0.16b, $T0.16b, $A1.16b + eor $T1.16b, $T1.16b, $B1.16b + eor $T2.16b, $T2.16b, $C1.16b + eor $T3.16b, $T3.16b, $D1.16b + st1 {$T0.16b - $T3.16b}, [$oup], #64 + + ld1 {$T0.16b - $T3.16b}, [$inp], #64 + eor $T0.16b, $T0.16b, $A2.16b + eor $T1.16b, $T1.16b, $B2.16b + eor $T2.16b, $T2.16b, $C2.16b + eor $T3.16b, $T3.16b, $D2.16b + st1 {$T0.16b - $T3.16b}, [$oup], #64 + + ld1 {$T0.16b - $T3.16b}, [$inp], #64 + eor $T0.16b, $T0.16b, $A3.16b + eor $T1.16b, $T1.16b, $B3.16b + eor $T2.16b, $T2.16b, $C3.16b + eor $T3.16b, $T3.16b, $D3.16b + st1 {$T0.16b - $T3.16b}, [$oup], #64 + + sub $inl, $inl, #256 + + mov $itr1, #4 // In the first run of the loop we need to hash 256 bytes, therefore we hash one block for the first 4 rounds + mov $itr2, #6 // and two blocks for the remaining 6, for a total of (1 * 4 + 2 * 6) * 16 = 256 + +.Lseal_main_loop: + adrp $t0, :pg_hi21:.Lchacha20_consts + add $t0, $t0, :lo12:.Lchacha20_consts + + ld4r {$A0.4s-$A3.4s}, [$t0] + mov $A4.16b, $CONSTS.16b + + ld4r {$B0.4s-$B3.4s}, [$keyp], #16 + mov $B4.16b, $B_STORE.16b + + ld4r {$C0.4s-$C3.4s}, [$keyp], #16 + mov $C4.16b, $C_STORE.16b + + ld4r {$D0.4s-$D3.4s}, [$keyp] + add $D0.4s, $D0.4s, $INC.4s + mov $D4.16b, $D_STORE.16b + + eor $T0.16b, $T0.16b, $T0.16b //zero + not $T1.16b, $T0.16b // -1 + sub $T1.4s, $INC.4s, $T1.4s // Add +1 + ext $T0.16b, $T1.16b, $T0.16b, #12 // Get the last element (counter) + add $D4.4s, $D4.4s, $T0.4s + + sub $keyp, $keyp, #32 +.align 5 +.Lseal_main_loop_rounds: +___ + &chacha_qr_x5("left"); + &poly_add($adp); + &poly_mul(); + &chacha_qr_x5("right"); +$code.=<<___; + subs $itr1, $itr1, #1 + b.ge .Lseal_main_loop_rounds +___ + &poly_add($adp); + &poly_mul(); +$code.=<<___; + subs $itr2, $itr2, #1 + b.gt .Lseal_main_loop_rounds + + eor $T0.16b, $T0.16b, $T0.16b //zero + not $T1.16b, $T0.16b // -1 + sub $T1.4s, $INC.4s, $T1.4s // Add +1 + ext $T0.16b, $T1.16b, $T0.16b, #12 // Get the last element (counter) + add $D4.4s, $D4.4s, $T0.4s + + add $D0.4s, $D0.4s, $INC.4s + mov $t0, #5 + dup $T0.4s, $t0w + add $INC.4s, $INC.4s, $T0.4s + + zip1 $T0.4s, $A0.4s, $A1.4s + zip2 $T1.4s, $A0.4s, $A1.4s + zip1 $T2.4s, $A2.4s, $A3.4s + zip2 $T3.4s, $A2.4s, $A3.4s + + zip1 $A0.2d, $T0.2d, $T2.2d + zip2 $A1.2d, $T0.2d, $T2.2d + zip1 $A2.2d, $T1.2d, $T3.2d + zip2 $A3.2d, $T1.2d, $T3.2d + + zip1 $T0.4s, $B0.4s, $B1.4s + zip2 $T1.4s, $B0.4s, $B1.4s + zip1 $T2.4s, $B2.4s, $B3.4s + zip2 $T3.4s, $B2.4s, $B3.4s + + zip1 $B0.2d, $T0.2d, $T2.2d + zip2 $B1.2d, $T0.2d, $T2.2d + zip1 $B2.2d, $T1.2d, $T3.2d + zip2 $B3.2d, $T1.2d, $T3.2d + + zip1 $T0.4s, $C0.4s, $C1.4s + zip2 $T1.4s, $C0.4s, $C1.4s + zip1 $T2.4s, $C2.4s, $C3.4s + zip2 $T3.4s, $C2.4s, $C3.4s + + zip1 $C0.2d, $T0.2d, $T2.2d + zip2 $C1.2d, $T0.2d, $T2.2d + zip1 $C2.2d, $T1.2d, $T3.2d + zip2 $C3.2d, $T1.2d, $T3.2d + + zip1 $T0.4s, $D0.4s, $D1.4s + zip2 $T1.4s, $D0.4s, $D1.4s + zip1 $T2.4s, $D2.4s, $D3.4s + zip2 $T3.4s, $D2.4s, $D3.4s + + zip1 $D0.2d, $T0.2d, $T2.2d + zip2 $D1.2d, $T0.2d, $T2.2d + zip1 $D2.2d, 
$T1.2d, $T3.2d + zip2 $D3.2d, $T1.2d, $T3.2d + + add $A0.4s, $A0.4s, $CONSTS.4s + add $B0.4s, $B0.4s, $B_STORE.4s + add $C0.4s, $C0.4s, $C_STORE.4s + add $D0.4s, $D0.4s, $D_STORE.4s + + add $A1.4s, $A1.4s, $CONSTS.4s + add $B1.4s, $B1.4s, $B_STORE.4s + add $C1.4s, $C1.4s, $C_STORE.4s + add $D1.4s, $D1.4s, $D_STORE.4s + + add $A2.4s, $A2.4s, $CONSTS.4s + add $B2.4s, $B2.4s, $B_STORE.4s + add $C2.4s, $C2.4s, $C_STORE.4s + add $D2.4s, $D2.4s, $D_STORE.4s + + add $A3.4s, $A3.4s, $CONSTS.4s + add $B3.4s, $B3.4s, $B_STORE.4s + add $C3.4s, $C3.4s, $C_STORE.4s + add $D3.4s, $D3.4s, $D_STORE.4s + + add $A4.4s, $A4.4s, $CONSTS.4s + add $B4.4s, $B4.4s, $B_STORE.4s + add $C4.4s, $C4.4s, $C_STORE.4s + add $D4.4s, $D4.4s, $D_STORE.4s + + cmp $inl, #320 + b.le .Lseal_tail + + ld1 {$T0.16b - $T3.16b}, [$inp], #64 + eor $T0.16b, $T0.16b, $A0.16b + eor $T1.16b, $T1.16b, $B0.16b + eor $T2.16b, $T2.16b, $C0.16b + eor $T3.16b, $T3.16b, $D0.16b + st1 {$T0.16b - $T3.16b}, [$oup], #64 + + ld1 {$T0.16b - $T3.16b}, [$inp], #64 + eor $T0.16b, $T0.16b, $A1.16b + eor $T1.16b, $T1.16b, $B1.16b + eor $T2.16b, $T2.16b, $C1.16b + eor $T3.16b, $T3.16b, $D1.16b + st1 {$T0.16b - $T3.16b}, [$oup], #64 + + ld1 {$T0.16b - $T3.16b}, [$inp], #64 + eor $T0.16b, $T0.16b, $A2.16b + eor $T1.16b, $T1.16b, $B2.16b + eor $T2.16b, $T2.16b, $C2.16b + eor $T3.16b, $T3.16b, $D2.16b + st1 {$T0.16b - $T3.16b}, [$oup], #64 + + ld1 {$T0.16b - $T3.16b}, [$inp], #64 + eor $T0.16b, $T0.16b, $A3.16b + eor $T1.16b, $T1.16b, $B3.16b + eor $T2.16b, $T2.16b, $C3.16b + eor $T3.16b, $T3.16b, $D3.16b + st1 {$T0.16b - $T3.16b}, [$oup], #64 + + ld1 {$T0.16b - $T3.16b}, [$inp], #64 + eor $T0.16b, $T0.16b, $A4.16b + eor $T1.16b, $T1.16b, $B4.16b + eor $T2.16b, $T2.16b, $C4.16b + eor $T3.16b, $T3.16b, $D4.16b + st1 {$T0.16b - $T3.16b}, [$oup], #64 + + sub $inl, $inl, #320 + + mov $itr1, #0 + mov $itr2, #10 // For the remainder of the loop we always hash and encrypt 320 bytes per iteration + + b .Lseal_main_loop + +.Lseal_tail: + // This part of the function handles the storage and authentication of the last [0,320) bytes + // We assume A0-A4 ... D0-D4 hold at least inl (320 max) bytes of the stream data. 
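+ // Three stages follow: whole 64-byte chunks are encrypted and hashed while
+ // the A/B/C/D register file is shifted down one column per iteration, then
+ // whole 16-byte blocks, and finally any remaining [1,16) bytes are composed
+ // into a padded block (padded with extra_in bytes when available, zeros
+ // otherwise) and hashed as a single Poly1305 block.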
+ cmp $inl, #64 + b.lt .Lseal_tail_64 + + // Store and authenticate 64B blocks per iteration + ld1 {$T0.16b - $T3.16b}, [$inp], #64 + + eor $T0.16b, $T0.16b, $A0.16b + eor $T1.16b, $T1.16b, $B0.16b + eor $T2.16b, $T2.16b, $C0.16b + eor $T3.16b, $T3.16b, $D0.16b +___ + &poly_add_vec($T0); + &poly_mul(); + &poly_add_vec($T1); + &poly_mul(); + &poly_add_vec($T2); + &poly_mul(); + &poly_add_vec($T3); + &poly_mul(); +$code.=<<___; + st1 {$T0.16b - $T3.16b}, [$oup], #64 + sub $inl, $inl, #64 + + // Shift the state left by 64 bytes for the next iteration of the loop + mov $A0.16b, $A1.16b + mov $B0.16b, $B1.16b + mov $C0.16b, $C1.16b + mov $D0.16b, $D1.16b + + mov $A1.16b, $A2.16b + mov $B1.16b, $B2.16b + mov $C1.16b, $C2.16b + mov $D1.16b, $D2.16b + + mov $A2.16b, $A3.16b + mov $B2.16b, $B3.16b + mov $C2.16b, $C3.16b + mov $D2.16b, $D3.16b + + mov $A3.16b, $A4.16b + mov $B3.16b, $B4.16b + mov $C3.16b, $C4.16b + mov $D3.16b, $D4.16b + + b .Lseal_tail + +.Lseal_tail_64: + ldp $adp, $adl, [$keyp, #48] // extra_in_len and extra_in_ptr + + // Here we handle the last [0,64) bytes of plaintext + cmp $inl, #16 + b.lt .Lseal_tail_16 + // Each iteration encrypt and authenticate a 16B block + ld1 {$T0.16b}, [$inp], #16 + eor $T0.16b, $T0.16b, $A0.16b +___ + &poly_add_vec($T0); + &poly_mul(); +$code.=<<___; + st1 {$T0.16b}, [$oup], #16 + + sub $inl, $inl, #16 + + // Shift the state left by 16 bytes for the next iteration of the loop + mov $A0.16b, $B0.16b + mov $B0.16b, $C0.16b + mov $C0.16b, $D0.16b + + b .Lseal_tail_64 + +.Lseal_tail_16: + // Here we handle the last [0,16) bytes of ciphertext that require a padded block + cbz $inl, .Lseal_hash_extra + + eor $T0.16b, $T0.16b, $T0.16b // Use T0 to load the plaintext/extra in + eor $T1.16b, $T1.16b, $T1.16b // Use T1 to generate an AND mask that will only mask the ciphertext bytes + not $T2.16b, $T0.16b + + mov $itr1, $inl + add $inp, $inp, $inl + + cbz $adl, .Lseal_tail_16_compose // No extra data to pad with, zero padding + + mov $itr2, #16 // We need to load some extra_in first for padding + sub $itr2, $itr2, $inl + cmp $adl, $itr2 + csel $itr2, $adl, $itr2, lt // Load the minimum of extra_in_len and the amount needed to fill the register + mov $t1, $itr2 + add $adp, $adp, $itr2 + sub $adl, $adl, $itr2 + +.Lseal_tail16_compose_extra_in: + ext $T0.16b, $T0.16b, $T0.16b, #15 + ldrb $t0w, [$adp, #-1]! + mov $T0.b[0], $t0w + subs $itr2, $itr2, #1 + b.gt .Lseal_tail16_compose_extra_in + + add $adp, $adp, $t1 + +.Lseal_tail_16_compose: + ext $T0.16b, $T0.16b, $T0.16b, #15 + ldrb $t0w, [$inp, #-1]! + mov $T0.b[0], $t0w + ext $T1.16b, $T2.16b, $T1.16b, #15 + subs $inl, $inl, #1 + b.gt .Lseal_tail_16_compose + + and $A0.16b, $A0.16b, $T1.16b + eor $T0.16b, $T0.16b, $A0.16b + mov $T1.16b, $T0.16b + +.Lseal_tail_16_store: + umov $t0w, $T0.b[0] + strb $t0w, [$oup], #1 + ext $T0.16b, $T0.16b, $T0.16b, #1 + subs $itr1, $itr1, #1 + b.gt .Lseal_tail_16_store + + // Hash in the final ct block concatenated with extra_in +___ + &poly_add_vec($T1); + &poly_mul(); +$code.=<<___; + +.Lseal_hash_extra: + cbz $adl, .Lseal_finalize + +.Lseal_hash_extra_loop: + cmp $adl, #16 + b.lt .Lseal_hash_extra_tail + ld1 {$T0.16b}, [$adp], #16 +___ + &poly_add_vec($T0); + &poly_mul(); +$code.=<<___; + sub $adl, $adl, #16 + b .Lseal_hash_extra_loop + +.Lseal_hash_extra_tail: + cbz $adl, .Lseal_finalize + eor $T0.16b, $T0.16b, $T0.16b // Use T0 to load the remaining extra ciphertext + add $adp, $adp, $adl + +.Lseal_hash_extra_load: + ext $T0.16b, $T0.16b, $T0.16b, #15 + ldrb $t0w, [$adp, #-1]! 
+ mov $T0.b[0], $t0w + subs $adl, $adl, #1 + b.gt .Lseal_hash_extra_load + + // Hash in the final padded extra_in blcok +___ + &poly_add_vec($T0); + &poly_mul(); +$code.=<<___; + +.Lseal_finalize: +___ + &poly_add_vec($LEN_STORE); + &poly_mul(); +$code.=<<___; + // Final reduction step + sub $t1, xzr, $one + orr $t2, xzr, #3 + subs $t0, $acc0, #-5 + sbcs $t1, $acc1, $t1 + sbcs $t2, $acc2, $t2 + csel $acc0, $t0, $acc0, cs + csel $acc1, $t1, $acc1, cs + csel $acc2, $t2, $acc2, cs +___ + &poly_add_vec($S_STORE); +$code.=<<___; + + stp $acc0, $acc1, [$keyp] + + ldp d8, d9, [sp, #16] + ldp d10, d11, [sp, #32] + ldp d12, d13, [sp, #48] + ldp d14, d15, [sp, #64] +.cfi_restore b15 +.cfi_restore b14 +.cfi_restore b13 +.cfi_restore b12 +.cfi_restore b11 +.cfi_restore b10 +.cfi_restore b9 +.cfi_restore b8 + ldp x29, x30, [sp], 80 +.cfi_restore w29 +.cfi_restore w30 +.cfi_def_cfa_offset 0 + AARCH64_VALIDATE_LINK_REGISTER + ret + +.Lseal_128: + // On some architectures preparing 5 blocks for small buffers is wasteful + eor $INC.16b, $INC.16b, $INC.16b + mov $t0, #1 + mov $INC.s[0], $t0w + mov $A0.16b, $CONSTS.16b + mov $A1.16b, $CONSTS.16b + mov $A2.16b, $CONSTS.16b + mov $B0.16b, $B_STORE.16b + mov $B1.16b, $B_STORE.16b + mov $B2.16b, $B_STORE.16b + mov $C0.16b, $C_STORE.16b + mov $C1.16b, $C_STORE.16b + mov $C2.16b, $C_STORE.16b + mov $D2.16b, $D_STORE.16b + add $D0.4s, $D2.4s, $INC.4s + add $D1.4s, $D0.4s, $INC.4s + + mov $itr1, #10 + +.Lseal_128_rounds: +___ + &chacha_qr_x3("left"); + &chacha_qr_x3("right"); +$code.=<<___; + subs $itr1, $itr1, #1 + b.hi .Lseal_128_rounds + + add $A0.4s, $A0.4s, $CONSTS.4s + add $A1.4s, $A1.4s, $CONSTS.4s + add $A2.4s, $A2.4s, $CONSTS.4s + + add $B0.4s, $B0.4s, $B_STORE.4s + add $B1.4s, $B1.4s, $B_STORE.4s + add $B2.4s, $B2.4s, $B_STORE.4s + + // Only the first 32 bytes of the third block (counter = 0) are needed, + // so skip updating $C2 and $D2. + add $C0.4s, $C0.4s, $C_STORE.4s + add $C1.4s, $C1.4s, $C_STORE.4s + + add $D_STORE.4s, $D_STORE.4s, $INC.4s + add $D0.4s, $D0.4s, $D_STORE.4s + add $D_STORE.4s, $D_STORE.4s, $INC.4s + add $D1.4s, $D1.4s, $D_STORE.4s + + and $A2.16b, $A2.16b, $CLAMP.16b + mov $r0, $A2.d[0] // Move the R key to GPRs + mov $r1, $A2.d[1] + mov $S_STORE.16b, $B2.16b // Store the S key + + bl .Lpoly_hash_ad_internal + b .Lseal_tail +.cfi_endproc +.size chacha20_poly1305_seal,.-chacha20_poly1305_seal + +///////////////////////////////// +// +// void chacha20_poly1305_open(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, union open_data *aead_data); +// +.globl chacha20_poly1305_open +.type chacha20_poly1305_open,%function +.align 6 +chacha20_poly1305_open: + AARCH64_SIGN_LINK_REGISTER +.cfi_startproc + stp x29, x30, [sp, #-80]! +.cfi_def_cfa_offset 80 +.cfi_offset w30, -72 +.cfi_offset w29, -80 + mov x29, sp + // We probably could do .cfi_def_cfa w29, 80 at this point, but since + // we don't actually use the frame pointer like that, it's probably not + // worth bothering. 
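+ // d8-d15 are the low halves of v8-v15, which are callee-saved under AAPCS64
+ // and clobbered by the vector code below, so spill them here and restore
+ // them before returning.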
+ stp d8, d9, [sp, #16] + stp d10, d11, [sp, #32] + stp d12, d13, [sp, #48] + stp d14, d15, [sp, #64] +.cfi_offset b15, -8 +.cfi_offset b14, -16 +.cfi_offset b13, -24 +.cfi_offset b12, -32 +.cfi_offset b11, -40 +.cfi_offset b10, -48 +.cfi_offset b9, -56 +.cfi_offset b8, -64 + + adrp $t0, :pg_hi21:.Lchacha20_consts + add $t0, $t0, :lo12:.Lchacha20_consts + + ld1 {$CONSTS.16b - $CLAMP.16b}, [$t0] // Load the CONSTS, INC, ROL8 and CLAMP values + ld1 {$B_STORE.16b - $D_STORE.16b}, [$keyp] + + mov $one, #1 // Prepare the Poly1305 state + mov $acc0, #0 + mov $acc1, #0 + mov $acc2, #0 + + mov $LEN_STORE.d[0], $adl // Store the input and aad lengths + mov $LEN_STORE.d[1], $inl + + cmp $inl, #128 + b.le .Lopen_128 // Optimization for smaller buffers + + // Initially we prepare a single ChaCha20 block for the Poly1305 R and S keys + mov $A0.16b, $CONSTS.16b + mov $B0.16b, $B_STORE.16b + mov $C0.16b, $C_STORE.16b + mov $D0.16b, $D_STORE.16b + + mov $itr1, #10 + +.align 5 +.Lopen_init_rounds: +___ + &chacha_qr($A0, $B0, $C0, $D0, $T0, "left"); + &chacha_qr($A0, $B0, $C0, $D0, $T0, "right"); +$code.=<<___; + subs $itr1, $itr1, #1 + b.hi .Lopen_init_rounds + + add $A0.4s, $A0.4s, $CONSTS.4s + add $B0.4s, $B0.4s, $B_STORE.4s + + and $A0.16b, $A0.16b, $CLAMP.16b + mov $r0, $A0.d[0] // Move the R key to GPRs + mov $r1, $A0.d[1] + mov $S_STORE.16b, $B0.16b // Store the S key + + bl .Lpoly_hash_ad_internal + +.Lopen_ad_done: + mov $adp, $inp + +// Each iteration of the loop hash 320 bytes, and prepare stream for 320 bytes +.Lopen_main_loop: + + cmp $inl, #192 + b.lt .Lopen_tail + + adrp $t0, :pg_hi21:.Lchacha20_consts + add $t0, $t0, :lo12:.Lchacha20_consts + + ld4r {$A0.4s-$A3.4s}, [$t0] + mov $A4.16b, $CONSTS.16b + + ld4r {$B0.4s-$B3.4s}, [$keyp], #16 + mov $B4.16b, $B_STORE.16b + + ld4r {$C0.4s-$C3.4s}, [$keyp], #16 + mov $C4.16b, $C_STORE.16b + + ld4r {$D0.4s-$D3.4s}, [$keyp] + sub $keyp, $keyp, #32 + add $D0.4s, $D0.4s, $INC.4s + mov $D4.16b, $D_STORE.16b + + eor $T0.16b, $T0.16b, $T0.16b //zero + not $T1.16b, $T0.16b // -1 + sub $T1.4s, $INC.4s, $T1.4s // Add +1 + ext $T0.16b, $T1.16b, $T0.16b, #12 // Get the last element (counter) + add $D4.4s, $D4.4s, $T0.4s + + lsr $adl, $inl, #4 // How many whole blocks we have to hash, will always be at least 12 + sub $adl, $adl, #10 + + mov $itr2, #10 + subs $itr1, $itr2, $adl + subs $itr1, $itr2, $adl // itr1 can be negative if we have more than 320 bytes to hash + csel $itr2, $itr2, $adl, le // if itr1 is zero or less, itr2 should be 10 to indicate all 10 rounds are full + + cbz $itr2, .Lopen_main_loop_rounds_short + +.align 5 +.Lopen_main_loop_rounds: +___ + &poly_add($adp); + &poly_mul(); +$code.=<<___; +.Lopen_main_loop_rounds_short: +___ + &chacha_qr_x5("left"); + &poly_add($adp); + &poly_mul(); + &chacha_qr_x5("right"); +$code.=<<___; + subs $itr2, $itr2, #1 + b.gt .Lopen_main_loop_rounds + subs $itr1, $itr1, #1 + b.ge .Lopen_main_loop_rounds_short +___ +$code.=<<___; + + eor $T0.16b, $T0.16b, $T0.16b //zero + not $T1.16b, $T0.16b // -1 + sub $T1.4s, $INC.4s, $T1.4s // Add +1 + ext $T0.16b, $T1.16b, $T0.16b, #12 // Get the last element (counter) + add $D4.4s, $D4.4s, $T0.4s + + add $D0.4s, $D0.4s, $INC.4s + mov $t0, #5 + dup $T0.4s, $t0w + add $INC.4s, $INC.4s, $T0.4s + + zip1 $T0.4s, $A0.4s, $A1.4s + zip2 $T1.4s, $A0.4s, $A1.4s + zip1 $T2.4s, $A2.4s, $A3.4s + zip2 $T3.4s, $A2.4s, $A3.4s + + zip1 $A0.2d, $T0.2d, $T2.2d + zip2 $A1.2d, $T0.2d, $T2.2d + zip1 $A2.2d, $T1.2d, $T3.2d + zip2 $A3.2d, $T1.2d, $T3.2d + + zip1 $T0.4s, $B0.4s, $B1.4s + zip2 $T1.4s, 
$B0.4s, $B1.4s + zip1 $T2.4s, $B2.4s, $B3.4s + zip2 $T3.4s, $B2.4s, $B3.4s + + zip1 $B0.2d, $T0.2d, $T2.2d + zip2 $B1.2d, $T0.2d, $T2.2d + zip1 $B2.2d, $T1.2d, $T3.2d + zip2 $B3.2d, $T1.2d, $T3.2d + + zip1 $T0.4s, $C0.4s, $C1.4s + zip2 $T1.4s, $C0.4s, $C1.4s + zip1 $T2.4s, $C2.4s, $C3.4s + zip2 $T3.4s, $C2.4s, $C3.4s + + zip1 $C0.2d, $T0.2d, $T2.2d + zip2 $C1.2d, $T0.2d, $T2.2d + zip1 $C2.2d, $T1.2d, $T3.2d + zip2 $C3.2d, $T1.2d, $T3.2d + + zip1 $T0.4s, $D0.4s, $D1.4s + zip2 $T1.4s, $D0.4s, $D1.4s + zip1 $T2.4s, $D2.4s, $D3.4s + zip2 $T3.4s, $D2.4s, $D3.4s + + zip1 $D0.2d, $T0.2d, $T2.2d + zip2 $D1.2d, $T0.2d, $T2.2d + zip1 $D2.2d, $T1.2d, $T3.2d + zip2 $D3.2d, $T1.2d, $T3.2d + + add $A0.4s, $A0.4s, $CONSTS.4s + add $B0.4s, $B0.4s, $B_STORE.4s + add $C0.4s, $C0.4s, $C_STORE.4s + add $D0.4s, $D0.4s, $D_STORE.4s + + add $A1.4s, $A1.4s, $CONSTS.4s + add $B1.4s, $B1.4s, $B_STORE.4s + add $C1.4s, $C1.4s, $C_STORE.4s + add $D1.4s, $D1.4s, $D_STORE.4s + + add $A2.4s, $A2.4s, $CONSTS.4s + add $B2.4s, $B2.4s, $B_STORE.4s + add $C2.4s, $C2.4s, $C_STORE.4s + add $D2.4s, $D2.4s, $D_STORE.4s + + add $A3.4s, $A3.4s, $CONSTS.4s + add $B3.4s, $B3.4s, $B_STORE.4s + add $C3.4s, $C3.4s, $C_STORE.4s + add $D3.4s, $D3.4s, $D_STORE.4s + + add $A4.4s, $A4.4s, $CONSTS.4s + add $B4.4s, $B4.4s, $B_STORE.4s + add $C4.4s, $C4.4s, $C_STORE.4s + add $D4.4s, $D4.4s, $D_STORE.4s + + // We can always safely store 192 bytes + ld1 {$T0.16b - $T3.16b}, [$inp], #64 + eor $T0.16b, $T0.16b, $A0.16b + eor $T1.16b, $T1.16b, $B0.16b + eor $T2.16b, $T2.16b, $C0.16b + eor $T3.16b, $T3.16b, $D0.16b + st1 {$T0.16b - $T3.16b}, [$oup], #64 + + ld1 {$T0.16b - $T3.16b}, [$inp], #64 + eor $T0.16b, $T0.16b, $A1.16b + eor $T1.16b, $T1.16b, $B1.16b + eor $T2.16b, $T2.16b, $C1.16b + eor $T3.16b, $T3.16b, $D1.16b + st1 {$T0.16b - $T3.16b}, [$oup], #64 + + ld1 {$T0.16b - $T3.16b}, [$inp], #64 + eor $T0.16b, $T0.16b, $A2.16b + eor $T1.16b, $T1.16b, $B2.16b + eor $T2.16b, $T2.16b, $C2.16b + eor $T3.16b, $T3.16b, $D2.16b + st1 {$T0.16b - $T3.16b}, [$oup], #64 + + sub $inl, $inl, #192 + + mov $A0.16b, $A3.16b + mov $B0.16b, $B3.16b + mov $C0.16b, $C3.16b + mov $D0.16b, $D3.16b + + cmp $inl, #64 + b.lt .Lopen_tail_64_store + + ld1 {$T0.16b - $T3.16b}, [$inp], #64 + eor $T0.16b, $T0.16b, $A3.16b + eor $T1.16b, $T1.16b, $B3.16b + eor $T2.16b, $T2.16b, $C3.16b + eor $T3.16b, $T3.16b, $D3.16b + st1 {$T0.16b - $T3.16b}, [$oup], #64 + + sub $inl, $inl, #64 + + mov $A0.16b, $A4.16b + mov $B0.16b, $B4.16b + mov $C0.16b, $C4.16b + mov $D0.16b, $D4.16b + + cmp $inl, #64 + b.lt .Lopen_tail_64_store + + ld1 {$T0.16b - $T3.16b}, [$inp], #64 + eor $T0.16b, $T0.16b, $A4.16b + eor $T1.16b, $T1.16b, $B4.16b + eor $T2.16b, $T2.16b, $C4.16b + eor $T3.16b, $T3.16b, $D4.16b + st1 {$T0.16b - $T3.16b}, [$oup], #64 + + sub $inl, $inl, #64 + b .Lopen_main_loop + +.Lopen_tail: + + cbz $inl, .Lopen_finalize + + lsr $adl, $inl, #4 // How many whole blocks we have to hash + + cmp $inl, #64 + b.le .Lopen_tail_64 + cmp $inl, #128 + b.le .Lopen_tail_128 + +.Lopen_tail_192: + // We need three more blocks + mov $A0.16b, $CONSTS.16b + mov $A1.16b, $CONSTS.16b + mov $A2.16b, $CONSTS.16b + mov $B0.16b, $B_STORE.16b + mov $B1.16b, $B_STORE.16b + mov $B2.16b, $B_STORE.16b + mov $C0.16b, $C_STORE.16b + mov $C1.16b, $C_STORE.16b + mov $C2.16b, $C_STORE.16b + mov $D0.16b, $D_STORE.16b + mov $D1.16b, $D_STORE.16b + mov $D2.16b, $D_STORE.16b + eor $T3.16b, $T3.16b, $T3.16b + eor $T1.16b, $T1.16b, $T1.16b + ins $T3.s[0], $INC.s[0] + ins $T1.d[0], $one + + add $T2.4s, $T3.4s, $T1.4s + add 
$T1.4s, $T2.4s, $T1.4s + + add $D0.4s, $D0.4s, $T1.4s + add $D1.4s, $D1.4s, $T3.4s + add $D2.4s, $D2.4s, $T2.4s + + mov $itr2, #10 + subs $itr1, $itr2, $adl // itr1 can be negative if we have more than 160 bytes to hash + csel $itr2, $itr2, $adl, le // if itr1 is zero or less, itr2 should be 10 to indicate all 10 rounds are hashing + sub $adl, $adl, $itr2 + + cbz $itr2, .Lopen_tail_192_rounds_no_hash + +.Lopen_tail_192_rounds: +___ + &poly_add($adp); + &poly_mul(); +$code.=<<___; +.Lopen_tail_192_rounds_no_hash: +___ + &chacha_qr_x3("left"); + &chacha_qr_x3("right"); +$code.=<<___; + subs $itr2, $itr2, #1 + b.gt .Lopen_tail_192_rounds + subs $itr1, $itr1, #1 + b.ge .Lopen_tail_192_rounds_no_hash + + // We hashed 160 bytes at most, may still have 32 bytes left +.Lopen_tail_192_hash: + cbz $adl, .Lopen_tail_192_hash_done +___ + &poly_add($adp); + &poly_mul(); +$code.=<<___; + sub $adl, $adl, #1 + b .Lopen_tail_192_hash + +.Lopen_tail_192_hash_done: + + add $A0.4s, $A0.4s, $CONSTS.4s + add $A1.4s, $A1.4s, $CONSTS.4s + add $A2.4s, $A2.4s, $CONSTS.4s + add $B0.4s, $B0.4s, $B_STORE.4s + add $B1.4s, $B1.4s, $B_STORE.4s + add $B2.4s, $B2.4s, $B_STORE.4s + add $C0.4s, $C0.4s, $C_STORE.4s + add $C1.4s, $C1.4s, $C_STORE.4s + add $C2.4s, $C2.4s, $C_STORE.4s + add $D0.4s, $D0.4s, $D_STORE.4s + add $D1.4s, $D1.4s, $D_STORE.4s + add $D2.4s, $D2.4s, $D_STORE.4s + + add $D0.4s, $D0.4s, $T1.4s + add $D1.4s, $D1.4s, $T3.4s + add $D2.4s, $D2.4s, $T2.4s + + ld1 {$T0.16b - $T3.16b}, [$inp], #64 + + eor $T0.16b, $T0.16b, $A1.16b + eor $T1.16b, $T1.16b, $B1.16b + eor $T2.16b, $T2.16b, $C1.16b + eor $T3.16b, $T3.16b, $D1.16b + + st1 {$T0.16b - $T3.16b}, [$oup], #64 + + ld1 {$T0.16b - $T3.16b}, [$inp], #64 + + eor $T0.16b, $T0.16b, $A2.16b + eor $T1.16b, $T1.16b, $B2.16b + eor $T2.16b, $T2.16b, $C2.16b + eor $T3.16b, $T3.16b, $D2.16b + + st1 {$T0.16b - $T3.16b}, [$oup], #64 + + sub $inl, $inl, #128 + b .Lopen_tail_64_store + +.Lopen_tail_128: + // We need two more blocks + mov $A0.16b, $CONSTS.16b + mov $A1.16b, $CONSTS.16b + mov $B0.16b, $B_STORE.16b + mov $B1.16b, $B_STORE.16b + mov $C0.16b, $C_STORE.16b + mov $C1.16b, $C_STORE.16b + mov $D0.16b, $D_STORE.16b + mov $D1.16b, $D_STORE.16b + eor $T3.16b, $T3.16b, $T3.16b + eor $T2.16b, $T2.16b, $T2.16b + ins $T3.s[0], $INC.s[0] + ins $T2.d[0], $one + add $T2.4s, $T2.4s, $T3.4s + + add $D0.4s, $D0.4s, $T2.4s + add $D1.4s, $D1.4s, $T3.4s + + mov $itr1, #10 + sub $itr1, $itr1, $adl + +.Lopen_tail_128_rounds: +___ + &chacha_qr($A0, $B0, $C0, $D0, $T0, "left"); + &chacha_qr($A1, $B1, $C1, $D1, $T0, "left"); + &chacha_qr($A0, $B0, $C0, $D0, $T0, "right"); + &chacha_qr($A1, $B1, $C1, $D1, $T0, "right"); +$code.=<<___; + subs $itr1, $itr1, #1 + b.gt .Lopen_tail_128_rounds + cbz $adl, .Lopen_tail_128_rounds_done + subs $adl, $adl, #1 +___ + &poly_add($adp); + &poly_mul(); +$code.=<<___; + b .Lopen_tail_128_rounds + +.Lopen_tail_128_rounds_done: + add $A0.4s, $A0.4s, $CONSTS.4s + add $A1.4s, $A1.4s, $CONSTS.4s + add $B0.4s, $B0.4s, $B_STORE.4s + add $B1.4s, $B1.4s, $B_STORE.4s + add $C0.4s, $C0.4s, $C_STORE.4s + add $C1.4s, $C1.4s, $C_STORE.4s + add $D0.4s, $D0.4s, $D_STORE.4s + add $D1.4s, $D1.4s, $D_STORE.4s + add $D0.4s, $D0.4s, $T2.4s + add $D1.4s, $D1.4s, $T3.4s + + ld1 {$T0.16b - $T3.16b}, [$inp], #64 + + eor $T0.16b, $T0.16b, $A1.16b + eor $T1.16b, $T1.16b, $B1.16b + eor $T2.16b, $T2.16b, $C1.16b + eor $T3.16b, $T3.16b, $D1.16b + + st1 {$T0.16b - $T3.16b}, [$oup], #64 + sub $inl, $inl, #64 + + b .Lopen_tail_64_store + +.Lopen_tail_64: + // We just need a single block + 
mov $A0.16b, $CONSTS.16b + mov $B0.16b, $B_STORE.16b + mov $C0.16b, $C_STORE.16b + mov $D0.16b, $D_STORE.16b + eor $T3.16b, $T3.16b, $T3.16b + ins $T3.s[0], $INC.s[0] + add $D0.4s, $D0.4s, $T3.4s + + mov $itr1, #10 + sub $itr1, $itr1, $adl + +.Lopen_tail_64_rounds: +___ + &chacha_qr($A0, $B0, $C0, $D0, $T0, "left"); + &chacha_qr($A0, $B0, $C0, $D0, $T0, "right"); +$code.=<<___; + subs $itr1, $itr1, #1 + b.gt .Lopen_tail_64_rounds + cbz $adl, .Lopen_tail_64_rounds_done + subs $adl, $adl, #1 +___ + &poly_add($adp); + &poly_mul(); +$code.=<<___; + b .Lopen_tail_64_rounds + +.Lopen_tail_64_rounds_done: + add $A0.4s, $A0.4s, $CONSTS.4s + add $B0.4s, $B0.4s, $B_STORE.4s + add $C0.4s, $C0.4s, $C_STORE.4s + add $D0.4s, $D0.4s, $D_STORE.4s + add $D0.4s, $D0.4s, $T3.4s + +.Lopen_tail_64_store: + cmp $inl, #16 + b.lt .Lopen_tail_16 + + ld1 {$T0.16b}, [$inp], #16 + eor $T0.16b, $T0.16b, $A0.16b + st1 {$T0.16b}, [$oup], #16 + mov $A0.16b, $B0.16b + mov $B0.16b, $C0.16b + mov $C0.16b, $D0.16b + sub $inl, $inl, #16 + b .Lopen_tail_64_store + +.Lopen_tail_16: + // Here we handle the last [0,16) bytes that require a padded block + cbz $inl, .Lopen_finalize + + eor $T0.16b, $T0.16b, $T0.16b // Use T0 to load the ciphertext + eor $T1.16b, $T1.16b, $T1.16b // Use T1 to generate an AND mask + not $T2.16b, $T0.16b + + add $itr2, $inp, $inl + mov $itr1, $inl + +.Lopen_tail_16_compose: + ext $T0.16b, $T0.16b, $T0.16b, #15 + ldrb $t0w, [$itr2, #-1]! + mov $T0.b[0], $t0w + ext $T1.16b, $T2.16b, $T1.16b, #15 + subs $inl, $inl, #1 + b.gt .Lopen_tail_16_compose + + and $T0.16b, $T0.16b, $T1.16b + // Hash in the final padded block +___ + &poly_add_vec($T0); + &poly_mul(); +$code.=<<___; + eor $T0.16b, $T0.16b, $A0.16b + +.Lopen_tail_16_store: + umov $t0w, $T0.b[0] + strb $t0w, [$oup], #1 + ext $T0.16b, $T0.16b, $T0.16b, #1 + subs $itr1, $itr1, #1 + b.gt .Lopen_tail_16_store + +.Lopen_finalize: +___ + &poly_add_vec($LEN_STORE); + &poly_mul(); +$code.=<<___; + // Final reduction step + sub $t1, xzr, $one + orr $t2, xzr, #3 + subs $t0, $acc0, #-5 + sbcs $t1, $acc1, $t1 + sbcs $t2, $acc2, $t2 + csel $acc0, $t0, $acc0, cs + csel $acc1, $t1, $acc1, cs + csel $acc2, $t2, $acc2, cs +___ + &poly_add_vec($S_STORE); +$code.=<<___; + + stp $acc0, $acc1, [$keyp] + + ldp d8, d9, [sp, #16] + ldp d10, d11, [sp, #32] + ldp d12, d13, [sp, #48] + ldp d14, d15, [sp, #64] +.cfi_restore b15 +.cfi_restore b14 +.cfi_restore b13 +.cfi_restore b12 +.cfi_restore b11 +.cfi_restore b10 +.cfi_restore b9 +.cfi_restore b8 + ldp x29, x30, [sp], 80 +.cfi_restore w29 +.cfi_restore w30 +.cfi_def_cfa_offset 0 + AARCH64_VALIDATE_LINK_REGISTER + ret + +.Lopen_128: + // On some architectures preparing 5 blocks for small buffers is wasteful + eor $INC.16b, $INC.16b, $INC.16b + mov $t0, #1 + mov $INC.s[0], $t0w + mov $A0.16b, $CONSTS.16b + mov $A1.16b, $CONSTS.16b + mov $A2.16b, $CONSTS.16b + mov $B0.16b, $B_STORE.16b + mov $B1.16b, $B_STORE.16b + mov $B2.16b, $B_STORE.16b + mov $C0.16b, $C_STORE.16b + mov $C1.16b, $C_STORE.16b + mov $C2.16b, $C_STORE.16b + mov $D2.16b, $D_STORE.16b + add $D0.4s, $D2.4s, $INC.4s + add $D1.4s, $D0.4s, $INC.4s + + mov $itr1, #10 + +.Lopen_128_rounds: +___ + &chacha_qr_x3("left"); + &chacha_qr_x3("right"); +$code.=<<___; + subs $itr1, $itr1, #1 + b.hi .Lopen_128_rounds + + add $A0.4s, $A0.4s, $CONSTS.4s + add $A1.4s, $A1.4s, $CONSTS.4s + add $A2.4s, $A2.4s, $CONSTS.4s + + add $B0.4s, $B0.4s, $B_STORE.4s + add $B1.4s, $B1.4s, $B_STORE.4s + add $B2.4s, $B2.4s, $B_STORE.4s + + add $C0.4s, $C0.4s, $C_STORE.4s + add $C1.4s, $C1.4s, 
$C_STORE.4s + + add $D_STORE.4s, $D_STORE.4s, $INC.4s + add $D0.4s, $D0.4s, $D_STORE.4s + add $D_STORE.4s, $D_STORE.4s, $INC.4s + add $D1.4s, $D1.4s, $D_STORE.4s + + and $A2.16b, $A2.16b, $CLAMP.16b + mov $r0, $A2.d[0] // Move the R key to GPRs + mov $r1, $A2.d[1] + mov $S_STORE.16b, $B2.16b // Store the S key + + bl .Lpoly_hash_ad_internal + +.Lopen_128_store: + cmp $inl, #64 + b.lt .Lopen_128_store_64 + + ld1 {$T0.16b - $T3.16b}, [$inp], #64 + +___ + &poly_add_vec($T0); + &poly_mul(); + &poly_add_vec($T1); + &poly_mul(); + &poly_add_vec($T2); + &poly_mul(); + &poly_add_vec($T3); + &poly_mul(); +$code.=<<___; + + eor $T0.16b, $T0.16b, $A0.16b + eor $T1.16b, $T1.16b, $B0.16b + eor $T2.16b, $T2.16b, $C0.16b + eor $T3.16b, $T3.16b, $D0.16b + + st1 {$T0.16b - $T3.16b}, [$oup], #64 + + sub $inl, $inl, #64 + + mov $A0.16b, $A1.16b + mov $B0.16b, $B1.16b + mov $C0.16b, $C1.16b + mov $D0.16b, $D1.16b + +.Lopen_128_store_64: + + lsr $adl, $inl, #4 + mov $adp, $inp + +.Lopen_128_hash_64: + cbz $adl, .Lopen_tail_64_store +___ + &poly_add($adp); + &poly_mul(); +$code.=<<___; + sub $adl, $adl, #1 + b .Lopen_128_hash_64 +.cfi_endproc +.size chacha20_poly1305_open,.-chacha20_poly1305_open +___ +} + +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/ge; + + print $_,"\n"; +} +close STDOUT or die "error closing STDOUT"; diff --git a/ring-0.17.14/crypto/cipher/asm/chacha20_poly1305_x86_64.pl b/ring-0.17.14/crypto/cipher/asm/chacha20_poly1305_x86_64.pl new file mode 100644 index 0000000000..1abc178090 --- /dev/null +++ b/ring-0.17.14/crypto/cipher/asm/chacha20_poly1305_x86_64.pl @@ -0,0 +1,2609 @@ +#!/usr/bin/env perl + +# Copyright (c) 2015, CloudFlare Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +############################################################################## +# # +# Author: Vlad Krasnov # +# # +############################################################################## + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; +*STDOUT=*OUT; + +$avx = 2; + +$code.=<<___; +.section .rodata +.align 64 +chacha20_poly1305_constants: +.Lchacha20_consts: +.byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k' +.byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k' +.Lrol8: +.byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 +.byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 +.Lrol16: +.byte 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13 +.byte 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13 +.Lavx2_init: +.long 0,0,0,0 +.Lsse_inc: +.long 1,0,0,0 +.Lavx2_inc: +.long 2,0,0,0,2,0,0,0 +.Lclamp: +.quad 0x0FFFFFFC0FFFFFFF, 0x0FFFFFFC0FFFFFFC +.quad 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF +.align 16 +.Land_masks: +.byte 0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff +.text +___ + +my ($oup,$inp,$inl,$adp,$keyp,$itr1,$itr2,$adl)=("%rdi","%rsi","%rbx","%rcx","%r9","%rcx","%r8","%r8"); +my ($acc0,$acc1,$acc2)=map("%r$_",(10..12)); +my ($t0,$t1,$t2,$t3)=("%r13","%r14","%r15","%r9"); +my ($A0,$A1,$A2,$A3,$B0,$B1,$B2,$B3,$C0,$C1,$C2,$C3,$D0,$D1,$D2,$D3)=map("%xmm$_",(0..15)); +my ($T0,$T1,$T2,$T3)=($A3,$B3,$C3,$D3); +my $xmm_storage = 0; +if ($win64) { + $xmm_storage = 10*16; +} +my $xmm_store="0*16(%rbp)"; +my $r_store="$xmm_storage+0*16(%rbp)"; +my $s_store="$xmm_storage+1*16(%rbp)"; +my $len_store="$xmm_storage+2*16(%rbp)"; +my $state1_store="$xmm_storage+3*16(%rbp)"; +my $state2_store="$xmm_storage+4*16(%rbp)"; +my $tmp_store="$xmm_storage+5*16(%rbp)"; +my $ctr0_store="$xmm_storage+6*16(%rbp)"; +my $ctr1_store="$xmm_storage+7*16(%rbp)"; +my $ctr2_store="$xmm_storage+8*16(%rbp)"; +my 
$ctr3_store="$xmm_storage+9*16(%rbp)"; + +sub chacha_qr { +my ($a,$b,$c,$d,$t,$dir)=@_; +$code.="movdqa $t, $tmp_store\n" if ($dir =~ /store/); +$code.="paddd $b, $a + pxor $a, $d + pshufb .Lrol16(%rip), $d + paddd $d, $c + pxor $c, $b + movdqa $b, $t + pslld \$12, $t + psrld \$20, $b + pxor $t, $b + paddd $b, $a + pxor $a, $d + pshufb .Lrol8(%rip), $d + paddd $d, $c + pxor $c, $b + movdqa $b, $t + pslld \$7, $t + psrld \$25, $b + pxor $t, $b\n"; +$code.="palignr \$4, $b, $b + palignr \$8, $c, $c + palignr \$12, $d, $d\n" if ($dir =~ /left/); +$code.="palignr \$12, $b, $b + palignr \$8, $c, $c + palignr \$4, $d, $d\n" if ($dir =~ /right/); +$code.="movdqa $tmp_store, $t\n" if ($dir =~ /load/); +} + +sub poly_add { +my ($src)=@_; +$code.="add 0+$src, $acc0 + adc 8+$src, $acc1 + adc \$1, $acc2\n"; +} + +sub poly_stage1 { +$code.="mov 0+$r_store, %rax + mov %rax, $t2 + mul $acc0 + mov %rax, $t0 + mov %rdx, $t1 + mov 0+$r_store, %rax + mul $acc1 + imulq $acc2, $t2 + add %rax, $t1 + adc %rdx, $t2\n"; +} + +sub poly_stage2 { +$code.="mov 8+$r_store, %rax + mov %rax, $t3 + mul $acc0 + add %rax, $t1 + adc \$0, %rdx + mov %rdx, $acc0 + mov 8+$r_store, %rax + mul $acc1 + add %rax, $t2 + adc \$0, %rdx\n"; +} + +sub poly_stage3 { +$code.="imulq $acc2, $t3 + add $acc0, $t2 + adc %rdx, $t3\n"; +} + +# At the beginning of the reduce stage t = [t3:t2:t1:t0] is a product of +# r = [r1:r0] and acc = [acc2:acc1:acc0] +# r is 124 bits at most (due to clamping) and acc is 131 bits at most +# (acc2 is at most 4 before the addition and can be at most 6 when we add in +# the next block) therefore t is at most 255 bits big, and t3 is 63 bits. +sub poly_reduce_stage { +$code.="mov $t0, $acc0 + mov $t1, $acc1 + mov $t2, $acc2 + and \$3, $acc2 # At this point acc2 is 2 bits at most (value of 3) + mov $t2, $t0 + and \$-4, $t0 + mov $t3, $t1 + shrd \$2, $t3, $t2 + shr \$2, $t3 + add $t0, $t2 + adc $t1, $t3 # No carry out since t3 is 61 bits and t1 is 63 bits + add $t2, $acc0 + adc $t3, $acc1 + adc \$0, $acc2\n"; # At this point acc2 has the value of 4 at most +} + +sub poly_mul { + &poly_stage1(); + &poly_stage2(); + &poly_stage3(); + &poly_reduce_stage(); +} + +sub prep_state { +my ($n)=@_; +$code.="movdqa .Lchacha20_consts(%rip), $A0 + movdqa $state1_store, $B0 + movdqa $state2_store, $C0\n"; +$code.="movdqa $A0, $A1 + movdqa $B0, $B1 + movdqa $C0, $C1\n" if ($n ge 2); +$code.="movdqa $A0, $A2 + movdqa $B0, $B2 + movdqa $C0, $C2\n" if ($n ge 3); +$code.="movdqa $A0, $A3 + movdqa $B0, $B3 + movdqa $C0, $C3\n" if ($n ge 4); +$code.="movdqa $ctr0_store, $D0 + paddd .Lsse_inc(%rip), $D0 + movdqa $D0, $ctr0_store\n" if ($n eq 1); +$code.="movdqa $ctr0_store, $D1 + paddd .Lsse_inc(%rip), $D1 + movdqa $D1, $D0 + paddd .Lsse_inc(%rip), $D0 + movdqa $D0, $ctr0_store + movdqa $D1, $ctr1_store\n" if ($n eq 2); +$code.="movdqa $ctr0_store, $D2 + paddd .Lsse_inc(%rip), $D2 + movdqa $D2, $D1 + paddd .Lsse_inc(%rip), $D1 + movdqa $D1, $D0 + paddd .Lsse_inc(%rip), $D0 + movdqa $D0, $ctr0_store + movdqa $D1, $ctr1_store + movdqa $D2, $ctr2_store\n" if ($n eq 3); +$code.="movdqa $ctr0_store, $D3 + paddd .Lsse_inc(%rip), $D3 + movdqa $D3, $D2 + paddd .Lsse_inc(%rip), $D2 + movdqa $D2, $D1 + paddd .Lsse_inc(%rip), $D1 + movdqa $D1, $D0 + paddd .Lsse_inc(%rip), $D0 + movdqa $D0, $ctr0_store + movdqa $D1, $ctr1_store + movdqa $D2, $ctr2_store + movdqa $D3, $ctr3_store\n" if ($n eq 4); +} + +sub finalize_state { +my ($n)=@_; +$code.="paddd .Lchacha20_consts(%rip), $A3 + paddd $state1_store, $B3 + paddd $state2_store, $C3 + paddd 
$ctr3_store, $D3\n" if ($n eq 4); +$code.="paddd .Lchacha20_consts(%rip), $A2 + paddd $state1_store, $B2 + paddd $state2_store, $C2 + paddd $ctr2_store, $D2\n" if ($n ge 3); +$code.="paddd .Lchacha20_consts(%rip), $A1 + paddd $state1_store, $B1 + paddd $state2_store, $C1 + paddd $ctr1_store, $D1\n" if ($n ge 2); +$code.="paddd .Lchacha20_consts(%rip), $A0 + paddd $state1_store, $B0 + paddd $state2_store, $C0 + paddd $ctr0_store, $D0\n"; +} + +sub xor_stream { +my ($A, $B, $C, $D, $offset)=@_; +$code.="movdqu 0*16 + $offset($inp), $A3 + movdqu 1*16 + $offset($inp), $B3 + movdqu 2*16 + $offset($inp), $C3 + movdqu 3*16 + $offset($inp), $D3 + pxor $A3, $A + pxor $B3, $B + pxor $C3, $C + pxor $D, $D3 + movdqu $A, 0*16 + $offset($oup) + movdqu $B, 1*16 + $offset($oup) + movdqu $C, 2*16 + $offset($oup) + movdqu $D3, 3*16 + $offset($oup)\n"; +} + +sub xor_stream_using_temp { +my ($A, $B, $C, $D, $offset, $temp)=@_; +$code.="movdqa $temp, $tmp_store + movdqu 0*16 + $offset($inp), $temp + pxor $A, $temp + movdqu $temp, 0*16 + $offset($oup) + movdqu 1*16 + $offset($inp), $temp + pxor $B, $temp + movdqu $temp, 1*16 + $offset($oup) + movdqu 2*16 + $offset($inp), $temp + pxor $C, $temp + movdqu $temp, 2*16 + $offset($oup) + movdqu 3*16 + $offset($inp), $temp + pxor $D, $temp + movdqu $temp, 3*16 + $offset($oup)\n"; +} + +sub gen_chacha_round { +my ($rot1, $rot2, $shift)=@_; +my $round=""; +$round.="movdqa $C0, $tmp_store\n" if ($rot1 eq 20); +$round.="movdqa $rot2, $C0 + paddd $B3, $A3 + paddd $B2, $A2 + paddd $B1, $A1 + paddd $B0, $A0 + pxor $A3, $D3 + pxor $A2, $D2 + pxor $A1, $D1 + pxor $A0, $D0 + pshufb $C0, $D3 + pshufb $C0, $D2 + pshufb $C0, $D1 + pshufb $C0, $D0 + movdqa $tmp_store, $C0 + paddd $D3, $C3 + paddd $D2, $C2 + paddd $D1, $C1 + paddd $D0, $C0 + pxor $C3, $B3 + pxor $C2, $B2 + pxor $C1, $B1 + pxor $C0, $B0 + movdqa $C0, $tmp_store + movdqa $B3, $C0 + psrld \$$rot1, $C0 + pslld \$32-$rot1, $B3 + pxor $C0, $B3 + movdqa $B2, $C0 + psrld \$$rot1, $C0 + pslld \$32-$rot1, $B2 + pxor $C0, $B2 + movdqa $B1, $C0 + psrld \$$rot1, $C0 + pslld \$32-$rot1, $B1 + pxor $C0, $B1 + movdqa $B0, $C0 + psrld \$$rot1, $C0 + pslld \$32-$rot1, $B0 + pxor $C0, $B0\n"; +($s1,$s2,$s3)=(4,8,12) if ($shift =~ /left/); +($s1,$s2,$s3)=(12,8,4) if ($shift =~ /right/); +$round.="movdqa $tmp_store, $C0 + palignr \$$s1, $B3, $B3 + palignr \$$s2, $C3, $C3 + palignr \$$s3, $D3, $D3 + palignr \$$s1, $B2, $B2 + palignr \$$s2, $C2, $C2 + palignr \$$s3, $D2, $D2 + palignr \$$s1, $B1, $B1 + palignr \$$s2, $C1, $C1 + palignr \$$s3, $D1, $D1 + palignr \$$s1, $B0, $B0 + palignr \$$s2, $C0, $C0 + palignr \$$s3, $D0, $D0\n" +if (($shift =~ /left/) || ($shift =~ /right/)); +return $round; +}; + +$chacha_body = &gen_chacha_round(20, ".Lrol16(%rip)") . + &gen_chacha_round(25, ".Lrol8(%rip)", "left") . + &gen_chacha_round(20, ".Lrol16(%rip)") . 
+ &gen_chacha_round(25, ".Lrol8(%rip)", "right"); + +my @loop_body = split /\n/, $chacha_body; + +sub emit_body { +my ($n)=@_; + for (my $i=0; $i < $n; $i++) { + $code=$code.shift(@loop_body)."\n"; + }; +} + +{ +################################################################################ +# void poly_hash_ad_internal(); +$code.=" +.type poly_hash_ad_internal,\@abi-omnipotent +.align 64 +poly_hash_ad_internal: +.cfi_startproc +.cfi_def_cfa rsp, 8 + xor $acc0, $acc0 + xor $acc1, $acc1 + xor $acc2, $acc2 + cmp \$13, $itr2 + jne .Lhash_ad_loop +.Lpoly_fast_tls_ad: + # Special treatment for the TLS case of 13 bytes + mov ($adp), $acc0 + mov 5($adp), $acc1 + shr \$24, $acc1 + mov \$1, $acc2\n"; + &poly_mul(); $code.=" + ret +.Lhash_ad_loop: + # Hash in 16 byte chunk + cmp \$16, $itr2 + jb .Lhash_ad_tail\n"; + &poly_add("0($adp)"); + &poly_mul(); $code.=" + lea 1*16($adp), $adp + sub \$16, $itr2 + jmp .Lhash_ad_loop +.Lhash_ad_tail: + cmp \$0, $itr2 + je .Lhash_ad_done + # Hash last < 16 byte tail + xor $t0, $t0 + xor $t1, $t1 + xor $t2, $t2 + add $itr2, $adp +.Lhash_ad_tail_loop: + shld \$8, $t0, $t1 + shl \$8, $t0 + movzxb -1($adp), $t2 + xor $t2, $t0 + dec $adp + dec $itr2 + jne .Lhash_ad_tail_loop + + add $t0, $acc0 + adc $t1, $acc1 + adc \$1, $acc2\n"; + &poly_mul(); $code.=" + # Finished AD +.Lhash_ad_done: + ret +.cfi_endproc +.size poly_hash_ad_internal, .-poly_hash_ad_internal\n"; +} + +{ +################################################################################ +# void chacha20_poly1305_open(uint8_t *out_plaintext, const uint8_t *ciphertext, +# size_t plaintext_len, const uint8_t *ad, +# size_t ad_len, +# union chacha20_poly1305_open_data *aead_data) +# +$code.=" +.globl chacha20_poly1305_open_sse41 +.type chacha20_poly1305_open_sse41,\@function,6 +.align 64 +chacha20_poly1305_open_sse41: +.cfi_startproc + _CET_ENDBR + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + # We write the calculated authenticator back to keyp at the end, so save + # the pointer on the stack too. 
+ push $keyp +.cfi_push $keyp + sub \$288 + $xmm_storage + 32, %rsp +.cfi_adjust_cfa_offset 288 + 32 + + lea 32(%rsp), %rbp + and \$-32, %rbp\n"; +$code.=" + movaps %xmm6,16*0+$xmm_store + movaps %xmm7,16*1+$xmm_store + movaps %xmm8,16*2+$xmm_store + movaps %xmm9,16*3+$xmm_store + movaps %xmm10,16*4+$xmm_store + movaps %xmm11,16*5+$xmm_store + movaps %xmm12,16*6+$xmm_store + movaps %xmm13,16*7+$xmm_store + movaps %xmm14,16*8+$xmm_store + movaps %xmm15,16*9+$xmm_store\n" if ($win64); +$code.=" + mov %rdx, $inl + mov $adl, 0+$len_store + mov $inl, 8+$len_store + + cmp \$128, $inl + jbe .Lopen_sse_128 + # For long buffers, prepare the poly key first + movdqa .Lchacha20_consts(%rip), $A0 + movdqu 0*16($keyp), $B0 + movdqu 1*16($keyp), $C0 + movdqu 2*16($keyp), $D0 + + movdqa $D0, $T1 + # Store on stack, to free keyp + movdqa $B0, $state1_store + movdqa $C0, $state2_store + movdqa $D0, $ctr0_store + mov \$10, $acc0 +.Lopen_sse_init_rounds:\n"; + &chacha_qr($A0,$B0,$C0,$D0,$T0,"left"); + &chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); $code.=" + dec $acc0 + jne .Lopen_sse_init_rounds + # A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded + paddd .Lchacha20_consts(%rip), $A0 + paddd $state1_store, $B0 + # Clamp and store the key + pand .Lclamp(%rip), $A0 + movdqa $A0, $r_store + movdqa $B0, $s_store + # Hash + mov $adl, $itr2 + call poly_hash_ad_internal +.Lopen_sse_main_loop: + cmp \$16*16, $inl + jb .Lopen_sse_tail + # Load state, increment counter blocks\n"; + &prep_state(4); $code.=" + # There are 10 ChaCha20 iterations of 2QR each, so for 6 iterations we + # hash 2 blocks, and for the remaining 4 only 1 block - for a total of 16 + mov \$4, $itr1 + mov $inp, $itr2 +.Lopen_sse_main_loop_rounds:\n"; + &emit_body(20); + &poly_add("0($itr2)"); $code.=" + lea 2*8($itr2), $itr2\n"; + &emit_body(20); + &poly_stage1(); + &emit_body(20); + &poly_stage2(); + &emit_body(20); + &poly_stage3(); + &emit_body(20); + &poly_reduce_stage(); + foreach $l (@loop_body) {$code.=$l."\n";} + @loop_body = split /\n/, $chacha_body; $code.=" + dec $itr1 + jge .Lopen_sse_main_loop_rounds\n"; + &poly_add("0($itr2)"); + &poly_mul(); $code.=" + lea 2*8($itr2), $itr2 + cmp \$-6, $itr1 + jg .Lopen_sse_main_loop_rounds\n"; + &finalize_state(4); + &xor_stream_using_temp($A3, $B3, $C3, $D3, "0*16", $D0); + &xor_stream($A2, $B2, $C2, $D2, "4*16"); + &xor_stream($A1, $B1, $C1, $D1, "8*16"); + &xor_stream($A0, $B0, $C0, $tmp_store, "12*16"); $code.=" + lea 16*16($inp), $inp + lea 16*16($oup), $oup + sub \$16*16, $inl + jmp .Lopen_sse_main_loop +.Lopen_sse_tail: + # Handle the various tail sizes efficiently + test $inl, $inl + jz .Lopen_sse_finalize + cmp \$12*16, $inl + ja .Lopen_sse_tail_256 + cmp \$8*16, $inl + ja .Lopen_sse_tail_192 + cmp \$4*16, $inl + ja .Lopen_sse_tail_128\n"; +############################################################################### + # At most 64 bytes are left + &prep_state(1); $code.=" + xor $itr2, $itr2 + mov $inl, $itr1 + cmp \$16, $itr1 + jb .Lopen_sse_tail_64_rounds +.Lopen_sse_tail_64_rounds_and_x1hash: \n"; + &poly_add("0($inp,$itr2)"); + &poly_mul(); $code.=" + sub \$16, $itr1 +.Lopen_sse_tail_64_rounds: + add \$16, $itr2\n"; + &chacha_qr($A0,$B0,$C0,$D0,$T0,"left"); + &chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); $code.=" + cmp \$16, $itr1 + jae .Lopen_sse_tail_64_rounds_and_x1hash + cmp \$10*16, $itr2 + jne .Lopen_sse_tail_64_rounds\n"; + &finalize_state(1); $code.=" + jmp .Lopen_sse_tail_64_dec_loop +############################################################################### 
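+ # The remaining tail paths mirror the 64-byte case above. As a rough
+ # worked example, a 100-byte tail takes the 128-byte path: the six full
+ # 16-byte ciphertext blocks are hashed while the ten ChaCha double-rounds
+ # run, the first 64 bytes are decrypted with one keystream block, and the
+ # last 36 bytes are left to .Lopen_sse_tail_64_dec_loop and the byte-wise
+ # tail handling below.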
+.Lopen_sse_tail_128:\n"; + # 65 - 128 bytes are left + &prep_state(2); $code.=" + mov $inl, $itr1 + and \$-16, $itr1 + xor $itr2, $itr2 +.Lopen_sse_tail_128_rounds_and_x1hash: \n"; + &poly_add("0($inp,$itr2)"); + &poly_mul(); $code.=" +.Lopen_sse_tail_128_rounds: + add \$16, $itr2\n"; + &chacha_qr($A0,$B0,$C0,$D0,$T0,"left"); + &chacha_qr($A1,$B1,$C1,$D1,$T0,"left"); + &chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); + &chacha_qr($A1,$B1,$C1,$D1,$T0,"right");$code.=" + cmp $itr1, $itr2 + jb .Lopen_sse_tail_128_rounds_and_x1hash + cmp \$10*16, $itr2 + jne .Lopen_sse_tail_128_rounds\n"; + &finalize_state(2); + &xor_stream($A1, $B1, $C1, $D1, "0*16"); $code.=" + sub \$4*16, $inl + lea 4*16($inp), $inp + lea 4*16($oup), $oup + jmp .Lopen_sse_tail_64_dec_loop +############################################################################### +.Lopen_sse_tail_192:\n"; + # 129 - 192 bytes are left + &prep_state(3); $code.=" + mov $inl, $itr1 + mov \$10*16, $itr2 + cmp \$10*16, $itr1 + cmovg $itr2, $itr1 + and \$-16, $itr1 + xor $itr2, $itr2 +.Lopen_sse_tail_192_rounds_and_x1hash: \n"; + &poly_add("0($inp,$itr2)"); + &poly_mul(); $code.=" +.Lopen_sse_tail_192_rounds: + add \$16, $itr2\n"; + &chacha_qr($A0,$B0,$C0,$D0,$T0,"left"); + &chacha_qr($A1,$B1,$C1,$D1,$T0,"left"); + &chacha_qr($A2,$B2,$C2,$D2,$T0,"left"); + &chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); + &chacha_qr($A1,$B1,$C1,$D1,$T0,"right"); + &chacha_qr($A2,$B2,$C2,$D2,$T0,"right"); $code.=" + cmp $itr1, $itr2 + jb .Lopen_sse_tail_192_rounds_and_x1hash + cmp \$10*16, $itr2 + jne .Lopen_sse_tail_192_rounds + cmp \$11*16, $inl + jb .Lopen_sse_tail_192_finish\n"; + &poly_add("10*16($inp)"); + &poly_mul(); $code.=" + cmp \$12*16, $inl + jb .Lopen_sse_tail_192_finish\n"; + &poly_add("11*16($inp)"); + &poly_mul(); $code.=" +.Lopen_sse_tail_192_finish: \n"; + &finalize_state(3); + &xor_stream($A2, $B2, $C2, $D2, "0*16"); + &xor_stream($A1, $B1, $C1, $D1, "4*16"); $code.=" + sub \$8*16, $inl + lea 8*16($inp), $inp + lea 8*16($oup), $oup + jmp .Lopen_sse_tail_64_dec_loop +############################################################################### +.Lopen_sse_tail_256:\n"; + # 193 - 255 bytes are left + &prep_state(4); $code.=" + xor $itr2, $itr2 +.Lopen_sse_tail_256_rounds_and_x1hash: \n"; + &poly_add("0($inp,$itr2)"); + &chacha_qr($A0,$B0,$C0,$D0,$C3,"store_left"); + &chacha_qr($A1,$B1,$C1,$D1,$C3,"left"); + &chacha_qr($A2,$B2,$C2,$D2,$C3,"left_load"); + &poly_stage1(); + &chacha_qr($A3,$B3,$C3,$D3,$C1,"store_left_load"); + &poly_stage2(); + &chacha_qr($A0,$B0,$C0,$D0,$C3,"store_right"); + &chacha_qr($A1,$B1,$C1,$D1,$C3,"right"); + &poly_stage3(); + &chacha_qr($A2,$B2,$C2,$D2,$C3,"right_load"); + &poly_reduce_stage(); + &chacha_qr($A3,$B3,$C3,$D3,$C1,"store_right_load"); $code.=" + add \$16, $itr2 + cmp \$10*16, $itr2 + jb .Lopen_sse_tail_256_rounds_and_x1hash + + mov $inl, $itr1 + and \$-16, $itr1 +.Lopen_sse_tail_256_hash: \n"; + &poly_add("0($inp,$itr2)"); + &poly_mul(); $code.=" + add \$16, $itr2 + cmp $itr1, $itr2 + jb .Lopen_sse_tail_256_hash\n"; + &finalize_state(4); + &xor_stream_using_temp($A3, $B3, $C3, $D3, "0*16", $D0); + &xor_stream($A2, $B2, $C2, $D2, "4*16"); + &xor_stream($A1, $B1, $C1, $D1, "8*16"); $code.=" + movdqa $tmp_store, $D0 + sub \$12*16, $inl + lea 12*16($inp), $inp + lea 12*16($oup), $oup +############################################################################### + # Decrypt the remaining data, 16B at a time, using existing stream +.Lopen_sse_tail_64_dec_loop: + cmp \$16, $inl + jb .Lopen_sse_tail_16_init + sub \$16, $inl 
+ movdqu ($inp), $T0 + pxor $T0, $A0 + movdqu $A0, ($oup) + lea 16($inp), $inp + lea 16($oup), $oup + movdqa $B0, $A0 + movdqa $C0, $B0 + movdqa $D0, $C0 + jmp .Lopen_sse_tail_64_dec_loop +.Lopen_sse_tail_16_init: + movdqa $A0, $A1 + + # Decrypt up to 16 bytes at the end. +.Lopen_sse_tail_16: + test $inl, $inl + jz .Lopen_sse_finalize + + # Read the final bytes into $T0. They need to be read in reverse order so + # that they end up in the correct order in $T0. + pxor $T0, $T0 + lea -1($inp,$inl), $inp + movq $inl, $itr2 +.Lopen_sse_tail_16_compose: + pslldq \$1, $T0 + pinsrb \$0, ($inp), $T0 + sub \$1, $inp + sub \$1, $itr2 + jnz .Lopen_sse_tail_16_compose + + movq $T0, $t0 + pextrq \$1, $T0, $t1 + # The final bytes of keystream are in $A1. + pxor $A1, $T0 + + # Copy the plaintext bytes out. +.Lopen_sse_tail_16_extract: + pextrb \$0, $T0, ($oup) + psrldq \$1, $T0 + add \$1, $oup + sub \$1, $inl + jne .Lopen_sse_tail_16_extract + + add $t0, $acc0 + adc $t1, $acc1 + adc \$1, $acc2\n"; + &poly_mul(); $code.=" + +.Lopen_sse_finalize:\n"; + &poly_add($len_store); + &poly_mul(); $code.=" + # Final reduce + mov $acc0, $t0 + mov $acc1, $t1 + mov $acc2, $t2 + sub \$-5, $acc0 + sbb \$-1, $acc1 + sbb \$3, $acc2 + cmovc $t0, $acc0 + cmovc $t1, $acc1 + cmovc $t2, $acc2 + # Add in s part of the key + add 0+$s_store, $acc0 + adc 8+$s_store, $acc1\n"; + +$code.=" + movaps 16*0+$xmm_store, %xmm6 + movaps 16*1+$xmm_store, %xmm7 + movaps 16*2+$xmm_store, %xmm8 + movaps 16*3+$xmm_store, %xmm9 + movaps 16*4+$xmm_store, %xmm10 + movaps 16*5+$xmm_store, %xmm11 + movaps 16*6+$xmm_store, %xmm12 + movaps 16*7+$xmm_store, %xmm13 + movaps 16*8+$xmm_store, %xmm14 + movaps 16*9+$xmm_store, %xmm15\n" if ($win64); +$code.=" +.cfi_remember_state + add \$288 + $xmm_storage + 32, %rsp +.cfi_adjust_cfa_offset -(288 + 32) + # The tag replaces the key on return + pop $keyp +.cfi_pop $keyp + mov $acc0, ($keyp) + mov $acc1, 8($keyp) + pop %r15 +.cfi_pop %r15 + pop %r14 +.cfi_pop %r14 + pop %r13 +.cfi_pop %r13 + pop %r12 +.cfi_pop %r12 + pop %rbx +.cfi_pop %rbx + pop %rbp +.cfi_pop %rbp + ret +############################################################################### +.Lopen_sse_128: +.cfi_restore_state + movdqu .Lchacha20_consts(%rip), $A0\nmovdqa $A0, $A1\nmovdqa $A0, $A2 + movdqu 0*16($keyp), $B0\nmovdqa $B0, $B1\nmovdqa $B0, $B2 + movdqu 1*16($keyp), $C0\nmovdqa $C0, $C1\nmovdqa $C0, $C2 + movdqu 2*16($keyp), $D0 + movdqa $D0, $D1\npaddd .Lsse_inc(%rip), $D1 + movdqa $D1, $D2\npaddd .Lsse_inc(%rip), $D2 + movdqa $B0, $T1\nmovdqa $C0, $T2\nmovdqa $D1, $T3 + mov \$10, $acc0 + +.Lopen_sse_128_rounds: \n"; + &chacha_qr($A0,$B0,$C0,$D0,$T0,"left"); + &chacha_qr($A1,$B1,$C1,$D1,$T0,"left"); + &chacha_qr($A2,$B2,$C2,$D2,$T0,"left"); + &chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); + &chacha_qr($A1,$B1,$C1,$D1,$T0,"right"); + &chacha_qr($A2,$B2,$C2,$D2,$T0,"right"); $code.=" + dec $acc0 + jnz .Lopen_sse_128_rounds + paddd .Lchacha20_consts(%rip), $A0 + paddd .Lchacha20_consts(%rip), $A1 + paddd .Lchacha20_consts(%rip), $A2 + paddd $T1, $B0\npaddd $T1, $B1\npaddd $T1, $B2 + paddd $T2, $C1\npaddd $T2, $C2 + paddd $T3, $D1 + paddd .Lsse_inc(%rip), $T3 + paddd $T3, $D2 + # Clamp and store the key + pand .Lclamp(%rip), $A0 + movdqa $A0, $r_store + movdqa $B0, $s_store + # Hash + mov $adl, $itr2 + call poly_hash_ad_internal +.Lopen_sse_128_xor_hash: + cmp \$16, $inl + jb .Lopen_sse_tail_16 + sub \$16, $inl\n"; + # Load for hashing + &poly_add("0*8($inp)"); $code.=" + # Load for decryption + movdqu 0*16($inp), $T0 + pxor $T0, $A1 + movdqu 
$A1, 0*16($oup) + lea 1*16($inp), $inp + lea 1*16($oup), $oup\n"; + &poly_mul(); $code.=" + # Shift the stream left + movdqa $B1, $A1 + movdqa $C1, $B1 + movdqa $D1, $C1 + movdqa $A2, $D1 + movdqa $B2, $A2 + movdqa $C2, $B2 + movdqa $D2, $C2 + jmp .Lopen_sse_128_xor_hash +.size chacha20_poly1305_open_sse41, .-chacha20_poly1305_open_sse41 +.cfi_endproc + +################################################################################ +################################################################################ +# void chacha20_poly1305_seal(uint8_t *out_ciphertext, const uint8_t *plaintext, +# size_t plaintext_len, const uint8_t *ad, +# size_t ad_len, +# union chacha20_poly1305_seal_data *data); +.globl chacha20_poly1305_seal_sse41 +.type chacha20_poly1305_seal_sse41,\@function,6 +.align 64 +chacha20_poly1305_seal_sse41: +.cfi_startproc + _CET_ENDBR + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 +# We write the calculated authenticator back to keyp at the end, so save +# the pointer on the stack too. + push $keyp +.cfi_push $keyp + sub \$288 + $xmm_storage + 32, %rsp +.cfi_adjust_cfa_offset 288 + 32 + lea 32(%rsp), %rbp + and \$-32, %rbp\n"; +$code.=" + movaps %xmm6,16*0+$xmm_store + movaps %xmm7,16*1+$xmm_store + movaps %xmm8,16*2+$xmm_store + movaps %xmm9,16*3+$xmm_store + movaps %xmm10,16*4+$xmm_store + movaps %xmm11,16*5+$xmm_store + movaps %xmm12,16*6+$xmm_store + movaps %xmm13,16*7+$xmm_store + movaps %xmm14,16*8+$xmm_store + movaps %xmm15,16*9+$xmm_store\n" if ($win64); +$code.=" + mov 56($keyp), $inl # extra_in_len + addq %rdx, $inl + mov $adl, 0+$len_store + mov $inl, 8+$len_store + mov %rdx, $inl + + cmp \$128, $inl + jbe .Lseal_sse_128 + # For longer buffers, prepare the poly key + some stream + movdqa .Lchacha20_consts(%rip), $A0 + movdqu 0*16($keyp), $B0 + movdqu 1*16($keyp), $C0 + movdqu 2*16($keyp), $D0 + + movdqa $A0, $A1 + movdqa $A0, $A2 + movdqa $A0, $A3 + movdqa $B0, $B1 + movdqa $B0, $B2 + movdqa $B0, $B3 + movdqa $C0, $C1 + movdqa $C0, $C2 + movdqa $C0, $C3 + movdqa $D0, $D3 + paddd .Lsse_inc(%rip), $D0 + movdqa $D0, $D2 + paddd .Lsse_inc(%rip), $D0 + movdqa $D0, $D1 + paddd .Lsse_inc(%rip), $D0 + # Store on stack + movdqa $B0, $state1_store + movdqa $C0, $state2_store + movdqa $D0, $ctr0_store + movdqa $D1, $ctr1_store + movdqa $D2, $ctr2_store + movdqa $D3, $ctr3_store + mov \$10, $acc0 +.Lseal_sse_init_rounds: \n"; + foreach $l (@loop_body) {$code.=$l."\n";} + @loop_body = split /\n/, $chacha_body; $code.=" + dec $acc0 + jnz .Lseal_sse_init_rounds\n"; + &finalize_state(4); $code.=" + # Clamp and store the key + pand .Lclamp(%rip), $A3 + movdqa $A3, $r_store + movdqa $B3, $s_store + # Hash + mov $adl, $itr2 + call poly_hash_ad_internal\n"; + &xor_stream($A2,$B2,$C2,$D2,"0*16"); + &xor_stream($A1,$B1,$C1,$D1,"4*16"); $code.=" + cmp \$12*16, $inl + ja .Lseal_sse_main_init + mov \$8*16, $itr1 + sub \$8*16, $inl + lea 8*16($inp), $inp + jmp .Lseal_sse_128_tail_hash +.Lseal_sse_main_init:\n"; + &xor_stream($A0, $B0, $C0, $D0, "8*16"); $code.=" + mov \$12*16, $itr1 + sub \$12*16, $inl + lea 12*16($inp), $inp + mov \$2, $itr1 + mov \$8, $itr2 + cmp \$4*16, $inl + jbe .Lseal_sse_tail_64 + cmp \$8*16, $inl + jbe .Lseal_sse_tail_128 + cmp \$12*16, $inl + jbe .Lseal_sse_tail_192 + +.Lseal_sse_main_loop: \n"; + # The main loop + &prep_state(4); $code.=" +.align 32 +.Lseal_sse_main_rounds: \n"; + &emit_body(20); + &poly_add("0($oup)"); + &emit_body(20); + 
&poly_stage1(); + &emit_body(20); + &poly_stage2(); + &emit_body(20); + &poly_stage3(); + &emit_body(20); + &poly_reduce_stage(); + foreach $l (@loop_body) {$code.=$l."\n";} + @loop_body = split /\n/, $chacha_body; $code.=" + lea 16($oup), $oup + dec $itr2 + jge .Lseal_sse_main_rounds\n"; + &poly_add("0*8($oup)"); + &poly_mul(); $code.=" + lea 16($oup), $oup + dec $itr1 + jg .Lseal_sse_main_rounds\n"; + + &finalize_state(4);$code.=" + movdqa $D2, $tmp_store\n"; + &xor_stream_using_temp($A3,$B3,$C3,$D3,0*16,$D2); $code.=" + movdqa $tmp_store, $D2\n"; + &xor_stream($A2,$B2,$C2,$D2, 4*16); + &xor_stream($A1,$B1,$C1,$D1, 8*16); $code.=" + cmp \$16*16, $inl + ja .Lseal_sse_main_loop_xor + + mov \$12*16, $itr1 + sub \$12*16, $inl + lea 12*16($inp), $inp + jmp .Lseal_sse_128_tail_hash +.Lseal_sse_main_loop_xor: \n"; + &xor_stream($A0,$B0,$C0,$D0,"12*16"); $code.=" + lea 16*16($inp), $inp + sub \$16*16, $inl + mov \$6, $itr1 + mov \$4, $itr2 + cmp \$12*16, $inl + jg .Lseal_sse_main_loop + mov $inl, $itr1 + test $inl, $inl + je .Lseal_sse_128_tail_hash + mov \$6, $itr1 + cmp \$8*16, $inl + ja .Lseal_sse_tail_192 + cmp \$4*16, $inl + ja .Lseal_sse_tail_128 +############################################################################### +.Lseal_sse_tail_64: \n"; + &prep_state(1); $code.=" +.Lseal_sse_tail_64_rounds_and_x2hash: \n"; + &poly_add("0($oup)"); + &poly_mul(); $code.=" + lea 16($oup), $oup +.Lseal_sse_tail_64_rounds_and_x1hash: \n"; + &chacha_qr($A0,$B0,$C0,$D0,$T0,"left"); + &chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); + &poly_add("0($oup)"); + &poly_mul(); $code.=" + lea 16($oup), $oup + dec $itr1 + jg .Lseal_sse_tail_64_rounds_and_x2hash + dec $itr2 + jge .Lseal_sse_tail_64_rounds_and_x1hash\n"; + &finalize_state(1); $code.=" + jmp .Lseal_sse_128_tail_xor +############################################################################### +.Lseal_sse_tail_128:\n"; + &prep_state(2); $code.=" +.Lseal_sse_tail_128_rounds_and_x2hash: \n"; + &poly_add("0($oup)"); + &poly_mul(); $code.=" + lea 16($oup), $oup +.Lseal_sse_tail_128_rounds_and_x1hash: \n"; + &chacha_qr($A0,$B0,$C0,$D0,$T0,"left"); + &chacha_qr($A1,$B1,$C1,$D1,$T0,"left"); + &poly_add("0($oup)"); + &poly_mul(); + &chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); + &chacha_qr($A1,$B1,$C1,$D1,$T0,"right"); $code.=" + lea 16($oup), $oup + dec $itr1 + jg .Lseal_sse_tail_128_rounds_and_x2hash + dec $itr2 + jge .Lseal_sse_tail_128_rounds_and_x1hash\n"; + &finalize_state(2); + &xor_stream($A1,$B1,$C1,$D1,0*16); $code.=" + mov \$4*16, $itr1 + sub \$4*16, $inl + lea 4*16($inp), $inp + jmp .Lseal_sse_128_tail_hash +############################################################################### +.Lseal_sse_tail_192:\n"; + &prep_state(3); $code.=" +.Lseal_sse_tail_192_rounds_and_x2hash: \n"; + &poly_add("0($oup)"); + &poly_mul(); $code.=" + lea 16($oup), $oup +.Lseal_sse_tail_192_rounds_and_x1hash: \n"; + &chacha_qr($A0,$B0,$C0,$D0,$T0,"left"); + &chacha_qr($A1,$B1,$C1,$D1,$T0,"left"); + &chacha_qr($A2,$B2,$C2,$D2,$T0,"left"); + &poly_add("0($oup)"); + &poly_mul(); + &chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); + &chacha_qr($A1,$B1,$C1,$D1,$T0,"right"); + &chacha_qr($A2,$B2,$C2,$D2,$T0,"right"); $code.=" + lea 16($oup), $oup + dec $itr1 + jg .Lseal_sse_tail_192_rounds_and_x2hash + dec $itr2 + jge .Lseal_sse_tail_192_rounds_and_x1hash\n"; + &finalize_state(3); + &xor_stream($A2,$B2,$C2,$D2,0*16); + &xor_stream($A1,$B1,$C1,$D1,4*16); $code.=" + mov \$8*16, $itr1 + sub \$8*16, $inl + lea 8*16($inp), $inp 
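+ # Fall through to the shared seal tail with $itr1 = 8*16, the number of
+ # ciphertext bytes just written at ($oup) that still need to be absorbed
+ # into the Poly1305 state. .Lseal_sse_128_tail_hash absorbs them 16 bytes
+ # at a time, after which .Lseal_sse_128_tail_xor encrypts whatever input
+ # remains using the leftover keystream block.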
+############################################################################### +.Lseal_sse_128_tail_hash: + cmp \$16, $itr1 + jb .Lseal_sse_128_tail_xor\n"; + &poly_add("0($oup)"); + &poly_mul(); $code.=" + sub \$16, $itr1 + lea 16($oup), $oup + jmp .Lseal_sse_128_tail_hash + +.Lseal_sse_128_tail_xor: + cmp \$16, $inl + jb .Lseal_sse_tail_16 + sub \$16, $inl + # Load for decryption + movdqu 0*16($inp), $T0 + pxor $T0, $A0 + movdqu $A0, 0*16($oup) + # Then hash + add 0*8($oup), $acc0 + adc 1*8($oup), $acc1 + adc \$1, $acc2 + lea 1*16($inp), $inp + lea 1*16($oup), $oup\n"; + &poly_mul(); $code.=" + # Shift the stream left + movdqa $B0, $A0 + movdqa $C0, $B0 + movdqa $D0, $C0 + movdqa $A1, $D0 + movdqa $B1, $A1 + movdqa $C1, $B1 + movdqa $D1, $C1 + jmp .Lseal_sse_128_tail_xor + +.Lseal_sse_tail_16: + test $inl, $inl + jz .Lprocess_blocks_of_extra_in + # We can only load the PT one byte at a time to avoid buffer overread + mov $inl, $itr2 + mov $inl, $itr1 + lea -1($inp,$inl), $inp + pxor $T3, $T3 +.Lseal_sse_tail_16_compose: + pslldq \$1, $T3 + pinsrb \$0, ($inp), $T3 + lea -1($inp), $inp + dec $itr1 + jne .Lseal_sse_tail_16_compose + + # XOR the keystream with the plaintext. + pxor $A0, $T3 + + # Write ciphertext out, byte-by-byte. + movq $inl, $itr1 + movdqu $T3, $A0 +.Lseal_sse_tail_16_extract: + pextrb \$0, $A0, ($oup) + psrldq \$1, $A0 + add \$1, $oup + sub \$1, $itr1 + jnz .Lseal_sse_tail_16_extract + + # $T3 contains the final (partial, non-empty) block of ciphertext which + # needs to be fed into the Poly1305 state. The right-most $inl bytes of it + # are valid. We need to fill it with extra_in bytes until full, or until we + # run out of bytes. + # + # $keyp points to the tag output, which is actually a struct with the + # extra_in pointer and length at offset 48. + movq 288 + $xmm_storage + 32(%rsp), $keyp + movq 56($keyp), $t1 # extra_in_len + movq 48($keyp), $t0 # extra_in + test $t1, $t1 + jz .Lprocess_partial_block # Common case: no bytes of extra_in + + movq \$16, $t2 + subq $inl, $t2 # 16-$inl is the number of bytes that fit into $T3. + cmpq $t2, $t1 # if extra_in_len < 16-$inl, only copy extra_in_len + # (note that AT&T syntax reverses the arguments) + jge .Lload_extra_in + movq $t1, $t2 + +.Lload_extra_in: + # $t2 contains the number of bytes of extra_in (pointed to by $t0) to load + # into $T3. They are loaded in reverse order. + leaq -1($t0,$t2), $inp + # Update extra_in and extra_in_len to reflect the bytes that are about to + # be read. + addq $t2, $t0 + subq $t2, $t1 + movq $t0, 48($keyp) + movq $t1, 56($keyp) + + # Update $itr2, which is used to select the mask later on, to reflect the + # extra bytes about to be added. + addq $t2, $itr2 + + # Load $t2 bytes of extra_in into $T2. + pxor $T2, $T2 +.Lload_extra_load_loop: + pslldq \$1, $T2 + pinsrb \$0, ($inp), $T2 + lea -1($inp), $inp + sub \$1, $t2 + jnz .Lload_extra_load_loop + + # Shift $T2 up the length of the remainder from the main encryption. Sadly, + # the shift for an XMM register has to be a constant, thus we loop to do + # this. + movq $inl, $t2 + +.Lload_extra_shift_loop: + pslldq \$1, $T2 + sub \$1, $t2 + jnz .Lload_extra_shift_loop + + # Mask $T3 (the remainder from the main encryption) so that superfluous + # bytes are zero. This means that the non-zero bytes in $T2 and $T3 are + # disjoint and so we can merge them with an OR. + lea .Land_masks(%rip), $t2 + shl \$4, $inl + pand -16($t2,$inl), $T3 + + # Merge $T2 into $T3, forming the remainder block. 
+ por $T2, $T3 + + # The block of ciphertext + extra_in is ready to be included in the + # Poly1305 state. + movq $T3, $t0 + pextrq \$1, $T3, $t1 + add $t0, $acc0 + adc $t1, $acc1 + adc \$1, $acc2\n"; + &poly_mul(); $code.=" + +.Lprocess_blocks_of_extra_in: + # There may be additional bytes of extra_in to process. + movq 288+32+$xmm_storage (%rsp), $keyp + movq 48($keyp), $inp # extra_in + movq 56($keyp), $itr2 # extra_in_len + movq $itr2, $itr1 + shr \$4, $itr2 # number of blocks + +.Lprocess_extra_hash_loop: + jz process_extra_in_trailer\n"; + &poly_add("0($inp)"); + &poly_mul(); $code.=" + leaq 16($inp), $inp + subq \$1, $itr2 + jmp .Lprocess_extra_hash_loop +process_extra_in_trailer: + andq \$15, $itr1 # remaining num bytes (<16) of extra_in + movq $itr1, $inl + jz .Ldo_length_block + leaq -1($inp,$itr1), $inp + +.Lprocess_extra_in_trailer_load: + pslldq \$1, $T3 + pinsrb \$0, ($inp), $T3 + lea -1($inp), $inp + sub \$1, $itr1 + jnz .Lprocess_extra_in_trailer_load + +.Lprocess_partial_block: + # $T3 contains $inl bytes of data to be fed into Poly1305. $inl != 0 + lea .Land_masks(%rip), $t2 + shl \$4, $inl + pand -16($t2,$inl), $T3 + movq $T3, $t0 + pextrq \$1, $T3, $t1 + add $t0, $acc0 + adc $t1, $acc1 + adc \$1, $acc2\n"; + &poly_mul(); $code.=" + +.Ldo_length_block:\n"; + &poly_add($len_store); + &poly_mul(); $code.=" + # Final reduce + mov $acc0, $t0 + mov $acc1, $t1 + mov $acc2, $t2 + sub \$-5, $acc0 + sbb \$-1, $acc1 + sbb \$3, $acc2 + cmovc $t0, $acc0 + cmovc $t1, $acc1 + cmovc $t2, $acc2 + # Add in s part of the key + add 0+$s_store, $acc0 + adc 8+$s_store, $acc1\n"; + +$code.=" + movaps 16*0+$xmm_store, %xmm6 + movaps 16*1+$xmm_store, %xmm7 + movaps 16*2+$xmm_store, %xmm8 + movaps 16*3+$xmm_store, %xmm9 + movaps 16*4+$xmm_store, %xmm10 + movaps 16*5+$xmm_store, %xmm11 + movaps 16*6+$xmm_store, %xmm12 + movaps 16*7+$xmm_store, %xmm13 + movaps 16*8+$xmm_store, %xmm14 + movaps 16*9+$xmm_store, %xmm15\n" if ($win64); +$code.=" +.cfi_remember_state + add \$288 + $xmm_storage + 32, %rsp +.cfi_adjust_cfa_offset -(288 + 32) + # The tag replaces the key on return + pop $keyp +.cfi_pop $keyp + mov $acc0, ($keyp) + mov $acc1, 8($keyp) + pop %r15 +.cfi_pop %r15 + pop %r14 +.cfi_pop %r14 + pop %r13 +.cfi_pop %r13 + pop %r12 +.cfi_pop %r12 + pop %rbx +.cfi_pop %rbx + pop %rbp +.cfi_pop %rbp + ret +################################################################################ +.Lseal_sse_128: +.cfi_restore_state + movdqu .Lchacha20_consts(%rip), $A0\nmovdqa $A0, $A1\nmovdqa $A0, $A2 + movdqu 0*16($keyp), $B0\nmovdqa $B0, $B1\nmovdqa $B0, $B2 + movdqu 1*16($keyp), $C0\nmovdqa $C0, $C1\nmovdqa $C0, $C2 + movdqu 2*16($keyp), $D2 + movdqa $D2, $D0\npaddd .Lsse_inc(%rip), $D0 + movdqa $D0, $D1\npaddd .Lsse_inc(%rip), $D1 + movdqa $B0, $T1\nmovdqa $C0, $T2\nmovdqa $D0, $T3 + mov \$10, $acc0 + +.Lseal_sse_128_rounds:\n"; + &chacha_qr($A0,$B0,$C0,$D0,$T0,"left"); + &chacha_qr($A1,$B1,$C1,$D1,$T0,"left"); + &chacha_qr($A2,$B2,$C2,$D2,$T0,"left"); + &chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); + &chacha_qr($A1,$B1,$C1,$D1,$T0,"right"); + &chacha_qr($A2,$B2,$C2,$D2,$T0,"right"); $code.=" + dec $acc0 + jnz .Lseal_sse_128_rounds + paddd .Lchacha20_consts(%rip), $A0 + paddd .Lchacha20_consts(%rip), $A1 + paddd .Lchacha20_consts(%rip), $A2 + paddd $T1, $B0\npaddd $T1, $B1\npaddd $T1, $B2 + paddd $T2, $C0\npaddd $T2, $C1 + paddd $T3, $D0 + paddd .Lsse_inc(%rip), $T3 + paddd $T3, $D1 + # Clamp and store the key + pand .Lclamp(%rip), $A2 + movdqa $A2, $r_store + movdqa $B2, $s_store + # Hash + mov %r8, $itr2 + 
call poly_hash_ad_internal + jmp .Lseal_sse_128_tail_xor +.size chacha20_poly1305_seal_sse41, .-chacha20_poly1305_seal_sse41 +.cfi_endproc\n"; +} + +if ($avx>1) { + +($A0,$A1,$A2,$A3,$B0,$B1,$B2,$B3,$C0,$C1,$C2,$C3,$D0,$D1,$D2,$D3)=map("%ymm$_",(0..15)); +my ($A0x,$A1x,$A2x,$A3x,$B0x,$B1x,$B2x,$B3x,$C0x,$C1x,$C2x,$C3x,$D0x,$D1x,$D2x,$D3x)=map("%xmm$_",(0..15)); +($T0,$T1,$T2,$T3)=($A3,$B3,$C3,$D3); +$state1_store="$xmm_storage+2*32(%rbp)"; +$state2_store="$xmm_storage+3*32(%rbp)"; +$tmp_store="$xmm_storage+4*32(%rbp)"; +$ctr0_store="$xmm_storage+5*32(%rbp)"; +$ctr1_store="$xmm_storage+6*32(%rbp)"; +$ctr2_store="$xmm_storage+7*32(%rbp)"; +$ctr3_store="$xmm_storage+8*32(%rbp)"; + +sub chacha_qr_avx2 { +my ($a,$b,$c,$d,$t,$dir)=@_; +$code.=<<___ if ($dir =~ /store/); + vmovdqa $t, $tmp_store +___ +$code.=<<___; + vpaddd $b, $a, $a + vpxor $a, $d, $d + vpshufb .Lrol16(%rip), $d, $d + vpaddd $d, $c, $c + vpxor $c, $b, $b + vpsrld \$20, $b, $t + vpslld \$12, $b, $b + vpxor $t, $b, $b + vpaddd $b, $a, $a + vpxor $a, $d, $d + vpshufb .Lrol8(%rip), $d, $d + vpaddd $d, $c, $c + vpxor $c, $b, $b + vpslld \$7, $b, $t + vpsrld \$25, $b, $b + vpxor $t, $b, $b +___ +$code.=<<___ if ($dir =~ /left/); + vpalignr \$12, $d, $d, $d + vpalignr \$8, $c, $c, $c + vpalignr \$4, $b, $b, $b +___ +$code.=<<___ if ($dir =~ /right/); + vpalignr \$4, $d, $d, $d + vpalignr \$8, $c, $c, $c + vpalignr \$12, $b, $b, $b +___ +$code.=<<___ if ($dir =~ /load/); + vmovdqa $tmp_store, $t +___ +} + +sub prep_state_avx2 { +my ($n)=@_; +$code.=<<___; + vmovdqa .Lchacha20_consts(%rip), $A0 + vmovdqa $state1_store, $B0 + vmovdqa $state2_store, $C0 +___ +$code.=<<___ if ($n ge 2); + vmovdqa $A0, $A1 + vmovdqa $B0, $B1 + vmovdqa $C0, $C1 +___ +$code.=<<___ if ($n ge 3); + vmovdqa $A0, $A2 + vmovdqa $B0, $B2 + vmovdqa $C0, $C2 +___ +$code.=<<___ if ($n ge 4); + vmovdqa $A0, $A3 + vmovdqa $B0, $B3 + vmovdqa $C0, $C3 +___ +$code.=<<___ if ($n eq 1); + vmovdqa .Lavx2_inc(%rip), $D0 + vpaddd $ctr0_store, $D0, $D0 + vmovdqa $D0, $ctr0_store +___ +$code.=<<___ if ($n eq 2); + vmovdqa .Lavx2_inc(%rip), $D0 + vpaddd $ctr0_store, $D0, $D1 + vpaddd $D1, $D0, $D0 + vmovdqa $D0, $ctr0_store + vmovdqa $D1, $ctr1_store +___ +$code.=<<___ if ($n eq 3); + vmovdqa .Lavx2_inc(%rip), $D0 + vpaddd $ctr0_store, $D0, $D2 + vpaddd $D2, $D0, $D1 + vpaddd $D1, $D0, $D0 + vmovdqa $D0, $ctr0_store + vmovdqa $D1, $ctr1_store + vmovdqa $D2, $ctr2_store +___ +$code.=<<___ if ($n eq 4); + vmovdqa .Lavx2_inc(%rip), $D0 + vpaddd $ctr0_store, $D0, $D3 + vpaddd $D3, $D0, $D2 + vpaddd $D2, $D0, $D1 + vpaddd $D1, $D0, $D0 + vmovdqa $D3, $ctr3_store + vmovdqa $D2, $ctr2_store + vmovdqa $D1, $ctr1_store + vmovdqa $D0, $ctr0_store +___ +} + +sub finalize_state_avx2 { +my ($n)=@_; +$code.=<<___ if ($n eq 4); + vpaddd .Lchacha20_consts(%rip), $A3, $A3 + vpaddd $state1_store, $B3, $B3 + vpaddd $state2_store, $C3, $C3 + vpaddd $ctr3_store, $D3, $D3 +___ +$code.=<<___ if ($n ge 3); + vpaddd .Lchacha20_consts(%rip), $A2, $A2 + vpaddd $state1_store, $B2, $B2 + vpaddd $state2_store, $C2, $C2 + vpaddd $ctr2_store, $D2, $D2 +___ +$code.=<<___ if ($n ge 2); + vpaddd .Lchacha20_consts(%rip), $A1, $A1 + vpaddd $state1_store, $B1, $B1 + vpaddd $state2_store, $C1, $C1 + vpaddd $ctr1_store, $D1, $D1 +___ +$code.=<<___; + vpaddd .Lchacha20_consts(%rip), $A0, $A0 + vpaddd $state1_store, $B0, $B0 + vpaddd $state2_store, $C0, $C0 + vpaddd $ctr0_store, $D0, $D0 +___ +} + +sub xor_stream_avx2 { +my ($A, $B, $C, $D, $offset, $hlp)=@_; +$code.=<<___; + vperm2i128 \$0x02, $A, $B, $hlp + vperm2i128 
\$0x13, $A, $B, $B + vperm2i128 \$0x02, $C, $D, $A + vperm2i128 \$0x13, $C, $D, $C + vpxor 0*32+$offset($inp), $hlp, $hlp + vpxor 1*32+$offset($inp), $A, $A + vpxor 2*32+$offset($inp), $B, $B + vpxor 3*32+$offset($inp), $C, $C + vmovdqu $hlp, 0*32+$offset($oup) + vmovdqu $A, 1*32+$offset($oup) + vmovdqu $B, 2*32+$offset($oup) + vmovdqu $C, 3*32+$offset($oup) +___ +} + +sub finish_stream_avx2 { +my ($A, $B, $C, $D, $hlp)=@_; +$code.=<<___; + vperm2i128 \$0x13, $A, $B, $hlp + vperm2i128 \$0x02, $A, $B, $A + vperm2i128 \$0x02, $C, $D, $B + vperm2i128 \$0x13, $C, $D, $D + vmovdqa $hlp, $C +___ +} + +sub poly_stage1_mulx { +$code.=<<___; + mov 0+$r_store, %rdx + mov %rdx, $t2 + mulx $acc0, $t0, $t1 + mulx $acc1, %rax, %rdx + imulq $acc2, $t2 + add %rax, $t1 + adc %rdx, $t2 +___ +} + +sub poly_stage2_mulx { +$code.=<<___; + mov 8+$r_store, %rdx + mulx $acc0, $acc0, %rax + add $acc0, $t1 + mulx $acc1, $acc1, $t3 + adc $acc1, $t2 + adc \$0, $t3 + imulq $acc2, %rdx +___ +} + +sub poly_stage3_mulx { +$code.=<<___; + add %rax, $t2 + adc %rdx, $t3 +___ +} + +sub poly_mul_mulx { + &poly_stage1_mulx(); + &poly_stage2_mulx(); + &poly_stage3_mulx(); + &poly_reduce_stage(); +} + +sub gen_chacha_round_avx2 { +my ($rot1, $rot2, $shift)=@_; +my $round=""; +$round=$round ."vmovdqa $C0, $tmp_store\n" if ($rot1 eq 20); +$round=$round ."vmovdqa $rot2, $C0 + vpaddd $B3, $A3, $A3 + vpaddd $B2, $A2, $A2 + vpaddd $B1, $A1, $A1 + vpaddd $B0, $A0, $A0 + vpxor $A3, $D3, $D3 + vpxor $A2, $D2, $D2 + vpxor $A1, $D1, $D1 + vpxor $A0, $D0, $D0 + vpshufb $C0, $D3, $D3 + vpshufb $C0, $D2, $D2 + vpshufb $C0, $D1, $D1 + vpshufb $C0, $D0, $D0 + vpaddd $D3, $C3, $C3 + vpaddd $D2, $C2, $C2 + vpaddd $D1, $C1, $C1 + vpaddd $tmp_store, $D0, $C0 + vpxor $C3, $B3, $B3 + vpxor $C2, $B2, $B2 + vpxor $C1, $B1, $B1 + vpxor $C0, $B0, $B0 + vmovdqa $C0, $tmp_store + vpsrld \$$rot1, $B3, $C0 + vpslld \$32-$rot1, $B3, $B3 + vpxor $C0, $B3, $B3 + vpsrld \$$rot1, $B2, $C0 + vpslld \$32-$rot1, $B2, $B2 + vpxor $C0, $B2, $B2 + vpsrld \$$rot1, $B1, $C0 + vpslld \$32-$rot1, $B1, $B1 + vpxor $C0, $B1, $B1 + vpsrld \$$rot1, $B0, $C0 + vpslld \$32-$rot1, $B0, $B0 + vpxor $C0, $B0, $B0\n"; +($s1,$s2,$s3)=(4,8,12) if ($shift =~ /left/); +($s1,$s2,$s3)=(12,8,4) if ($shift =~ /right/); +$round=$round ."vmovdqa $tmp_store, $C0 + vpalignr \$$s1, $B3, $B3, $B3 + vpalignr \$$s2, $C3, $C3, $C3 + vpalignr \$$s3, $D3, $D3, $D3 + vpalignr \$$s1, $B2, $B2, $B2 + vpalignr \$$s2, $C2, $C2, $C2 + vpalignr \$$s3, $D2, $D2, $D2 + vpalignr \$$s1, $B1, $B1, $B1 + vpalignr \$$s2, $C1, $C1, $C1 + vpalignr \$$s3, $D1, $D1, $D1 + vpalignr \$$s1, $B0, $B0, $B0 + vpalignr \$$s2, $C0, $C0, $C0 + vpalignr \$$s3, $D0, $D0, $D0\n" +if (($shift =~ /left/) || ($shift =~ /right/)); +return $round; +}; + +$chacha_body = &gen_chacha_round_avx2(20, ".Lrol16(%rip)") . + &gen_chacha_round_avx2(25, ".Lrol8(%rip)", "left") . + &gen_chacha_round_avx2(20, ".Lrol16(%rip)") . + &gen_chacha_round_avx2(25, ".Lrol8(%rip)", "right"); + +@loop_body = split /\n/, $chacha_body; + +$code.=" +############################################################################### +.globl chacha20_poly1305_open_avx2 +.type chacha20_poly1305_open_avx2,\@function,6 +.align 64 +chacha20_poly1305_open_avx2: +.cfi_startproc + _CET_ENDBR + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + # We write the calculated authenticator back to keyp at the end, so save + # the pointer on the stack too. 
+ push $keyp +.cfi_push $keyp + sub \$288 + $xmm_storage + 32, %rsp +.cfi_adjust_cfa_offset 288 + 32 + + lea 32(%rsp), %rbp + and \$-32, %rbp\n"; +$code.=" + movaps %xmm6,16*0+$xmm_store + movaps %xmm7,16*1+$xmm_store + movaps %xmm8,16*2+$xmm_store + movaps %xmm9,16*3+$xmm_store + movaps %xmm10,16*4+$xmm_store + movaps %xmm11,16*5+$xmm_store + movaps %xmm12,16*6+$xmm_store + movaps %xmm13,16*7+$xmm_store + movaps %xmm14,16*8+$xmm_store + movaps %xmm15,16*9+$xmm_store\n" if ($win64); +$code.=" + mov %rdx, $inl + mov $adl, 0+$len_store + mov $inl, 8+$len_store + + vzeroupper + vmovdqa .Lchacha20_consts(%rip), $A0 + vbroadcasti128 0*16($keyp), $B0 + vbroadcasti128 1*16($keyp), $C0 + vbroadcasti128 2*16($keyp), $D0 + vpaddd .Lavx2_init(%rip), $D0, $D0 + cmp \$6*32, $inl + jbe .Lopen_avx2_192 + cmp \$10*32, $inl + jbe .Lopen_avx2_320 + + vmovdqa $B0, $state1_store + vmovdqa $C0, $state2_store + vmovdqa $D0, $ctr0_store + mov \$10, $acc0 +.Lopen_avx2_init_rounds: \n"; + &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left"); + &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); $code.=" + dec $acc0 + jne .Lopen_avx2_init_rounds + vpaddd .Lchacha20_consts(%rip), $A0, $A0 + vpaddd $state1_store, $B0, $B0 + vpaddd $state2_store, $C0, $C0 + vpaddd $ctr0_store, $D0, $D0 + + vperm2i128 \$0x02, $A0, $B0, $T0 + # Clamp and store key + vpand .Lclamp(%rip), $T0, $T0 + vmovdqa $T0, $r_store + # Stream for the first 64 bytes + vperm2i128 \$0x13, $A0, $B0, $A0 + vperm2i128 \$0x13, $C0, $D0, $B0 + # Hash AD + first 64 bytes + mov $adl, $itr2 + call poly_hash_ad_internal + # Hash first 64 bytes + xor $itr1, $itr1 +.Lopen_avx2_init_hash: \n"; + &poly_add("0($inp,$itr1)"); + &poly_mul(); $code.=" + add \$16, $itr1 + cmp \$2*32, $itr1 + jne .Lopen_avx2_init_hash + # Decrypt first 64 bytes + vpxor 0*32($inp), $A0, $A0 + vpxor 1*32($inp), $B0, $B0 + # Store first 64 bytes of decrypted data + vmovdqu $A0, 0*32($oup) + vmovdqu $B0, 1*32($oup) + lea 2*32($inp), $inp + lea 2*32($oup), $oup + sub \$2*32, $inl +.Lopen_avx2_main_loop: + # Hash and decrypt 512 bytes each iteration + cmp \$16*32, $inl + jb .Lopen_avx2_main_loop_done\n"; + &prep_state_avx2(4); $code.=" + xor $itr1, $itr1 +.Lopen_avx2_main_loop_rounds: \n"; + &poly_add("0*8($inp,$itr1)"); + &emit_body(10); + &poly_stage1_mulx(); + &emit_body(9); + &poly_stage2_mulx(); + &emit_body(12); + &poly_stage3_mulx(); + &emit_body(10); + &poly_reduce_stage(); + &emit_body(9); + &poly_add("2*8($inp,$itr1)"); + &emit_body(8); + &poly_stage1_mulx(); + &emit_body(18); + &poly_stage2_mulx(); + &emit_body(18); + &poly_stage3_mulx(); + &emit_body(9); + &poly_reduce_stage(); + &emit_body(8); + &poly_add("4*8($inp,$itr1)"); $code.=" + lea 6*8($itr1), $itr1\n"; + &emit_body(18); + &poly_stage1_mulx(); + &emit_body(8); + &poly_stage2_mulx(); + &emit_body(8); + &poly_stage3_mulx(); + &emit_body(18); + &poly_reduce_stage(); + foreach $l (@loop_body) {$code.=$l."\n";} + @loop_body = split /\n/, $chacha_body; $code.=" + cmp \$10*6*8, $itr1 + jne .Lopen_avx2_main_loop_rounds\n"; + &finalize_state_avx2(4); $code.=" + vmovdqa $A0, $tmp_store\n"; + &poly_add("10*6*8($inp)"); + &xor_stream_avx2($A3, $B3, $C3, $D3, 0*32, $A0); $code.=" + vmovdqa $tmp_store, $A0\n"; + &poly_mul(); + &xor_stream_avx2($A2, $B2, $C2, $D2, 4*32, $A3); + &poly_add("10*6*8+2*8($inp)"); + &xor_stream_avx2($A1, $B1, $C1, $D1, 8*32, $A3); + &poly_mul(); + &xor_stream_avx2($A0, $B0, $C0, $D0, 12*32, $A3); $code.=" + lea 16*32($inp), $inp + lea 16*32($oup), $oup + sub \$16*32, $inl + jmp .Lopen_avx2_main_loop 
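+ # Each pass through the loop above consumes 16*32 = 512 bytes: the rounds
+ # loop runs ten ChaCha double-rounds over four two-block ymm states while
+ # absorbing three 16-byte Poly1305 blocks per double-round (30 blocks),
+ # and the final two blocks are absorbed while the keystream is XORed in.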
+.Lopen_avx2_main_loop_done: + test $inl, $inl + vzeroupper + je .Lopen_sse_finalize + + cmp \$12*32, $inl + ja .Lopen_avx2_tail_512 + cmp \$8*32, $inl + ja .Lopen_avx2_tail_384 + cmp \$4*32, $inl + ja .Lopen_avx2_tail_256\n"; +############################################################################### + # 1-128 bytes left + &prep_state_avx2(1); $code.=" + xor $itr2, $itr2 + mov $inl, $itr1 + and \$-16, $itr1 + test $itr1, $itr1 + je .Lopen_avx2_tail_128_rounds # Have nothing to hash +.Lopen_avx2_tail_128_rounds_and_x1hash: \n"; + &poly_add("0*8($inp,$itr2)"); + &poly_mul(); $code.=" +.Lopen_avx2_tail_128_rounds: + add \$16, $itr2\n"; + &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left"); + &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); $code.=" + cmp $itr1, $itr2 + jb .Lopen_avx2_tail_128_rounds_and_x1hash + cmp \$160, $itr2 + jne .Lopen_avx2_tail_128_rounds\n"; + &finalize_state_avx2(1); + &finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.=" + jmp .Lopen_avx2_tail_128_xor +############################################################################### +.Lopen_avx2_tail_256: \n"; + # 129-256 bytes left + &prep_state_avx2(2); $code.=" + mov $inl, $tmp_store + mov $inl, $itr1 + sub \$4*32, $itr1 + shr \$4, $itr1 + mov \$10, $itr2 + cmp \$10, $itr1 + cmovg $itr2, $itr1 + mov $inp, $inl + xor $itr2, $itr2 +.Lopen_avx2_tail_256_rounds_and_x1hash: \n"; + &poly_add("0*8($inl)"); + &poly_mul_mulx(); $code.=" + lea 16($inl), $inl +.Lopen_avx2_tail_256_rounds: \n"; + &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left"); + &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left"); $code.=" + inc $itr2\n"; + &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); + &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right"); + &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right"); $code.=" + cmp $itr1, $itr2 + jb .Lopen_avx2_tail_256_rounds_and_x1hash + cmp \$10, $itr2 + jne .Lopen_avx2_tail_256_rounds + mov $inl, $itr2 + sub $inp, $inl + mov $inl, $itr1 + mov $tmp_store, $inl +.Lopen_avx2_tail_256_hash: + add \$16, $itr1 + cmp $inl, $itr1 + jg .Lopen_avx2_tail_256_done\n"; + &poly_add("0*8($itr2)"); + &poly_mul_mulx(); $code.=" + lea 16($itr2), $itr2 + jmp .Lopen_avx2_tail_256_hash +.Lopen_avx2_tail_256_done: \n"; + &finalize_state_avx2(2); + &xor_stream_avx2($A1, $B1, $C1, $D1, 0*32, $T0); + &finish_stream_avx2($A0, $B0, $C0, $D0, $T0); $code.=" + lea 4*32($inp), $inp + lea 4*32($oup), $oup + sub \$4*32, $inl + jmp .Lopen_avx2_tail_128_xor +############################################################################### +.Lopen_avx2_tail_384: \n"; + # 257-383 bytes left + &prep_state_avx2(3); $code.=" + mov $inl, $tmp_store + mov $inl, $itr1 + sub \$8*32, $itr1 + shr \$4, $itr1 + add \$6, $itr1 + mov \$10, $itr2 + cmp \$10, $itr1 + cmovg $itr2, $itr1 + mov $inp, $inl + xor $itr2, $itr2 +.Lopen_avx2_tail_384_rounds_and_x2hash: \n"; + &poly_add("0*8($inl)"); + &poly_mul_mulx(); $code.=" + lea 16($inl), $inl +.Lopen_avx2_tail_384_rounds_and_x1hash: \n"; + &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"left"); + &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left"); + &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left"); + &poly_add("0*8($inl)"); + &poly_mul(); $code.=" + lea 16($inl), $inl + inc $itr2\n"; + &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right"); + &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right"); + &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); $code.=" + cmp $itr1, $itr2 + jb .Lopen_avx2_tail_384_rounds_and_x2hash + cmp \$10, $itr2 + jne .Lopen_avx2_tail_384_rounds_and_x1hash + mov $inl, $itr2 + sub $inp, $inl + mov $inl, $itr1 + mov $tmp_store, $inl +.Lopen_avx2_384_tail_hash: + add \$16, 
$itr1 + cmp $inl, $itr1 + jg .Lopen_avx2_384_tail_done\n"; + &poly_add("0*8($itr2)"); + &poly_mul_mulx(); $code.=" + lea 16($itr2), $itr2 + jmp .Lopen_avx2_384_tail_hash +.Lopen_avx2_384_tail_done: \n"; + &finalize_state_avx2(3); + &xor_stream_avx2($A2, $B2, $C2, $D2, 0*32, $T0); + &xor_stream_avx2($A1, $B1, $C1, $D1, 4*32, $T0); + &finish_stream_avx2($A0, $B0, $C0, $D0, $T0); $code.=" + lea 8*32($inp), $inp + lea 8*32($oup), $oup + sub \$8*32, $inl + jmp .Lopen_avx2_tail_128_xor +############################################################################### +.Lopen_avx2_tail_512: \n"; + # 384-512 bytes left + &prep_state_avx2(4); $code.=" + xor $itr1, $itr1 + mov $inp, $itr2 +.Lopen_avx2_tail_512_rounds_and_x2hash: \n"; + &poly_add("0*8($itr2)"); + &poly_mul(); $code.=" + lea 2*8($itr2), $itr2 +.Lopen_avx2_tail_512_rounds_and_x1hash: \n"; + &emit_body(37); + &poly_add("0*8($itr2)"); + &poly_mul_mulx(); + &emit_body(48); + &poly_add("2*8($itr2)"); + &poly_mul_mulx(); $code.=" + lea 4*8($itr2), $itr2\n"; + foreach $l (@loop_body) {$code.=$l."\n";} + @loop_body = split /\n/, $chacha_body; $code.=" + inc $itr1 + cmp \$4, $itr1 + jl .Lopen_avx2_tail_512_rounds_and_x2hash + cmp \$10, $itr1 + jne .Lopen_avx2_tail_512_rounds_and_x1hash + mov $inl, $itr1 + sub \$12*32, $itr1 + and \$-16, $itr1 +.Lopen_avx2_tail_512_hash: + test $itr1, $itr1 + je .Lopen_avx2_tail_512_done\n"; + &poly_add("0*8($itr2)"); + &poly_mul_mulx(); $code.=" + lea 2*8($itr2), $itr2 + sub \$2*8, $itr1 + jmp .Lopen_avx2_tail_512_hash +.Lopen_avx2_tail_512_done: \n"; + &finalize_state_avx2(4); $code.=" + vmovdqa $A0, $tmp_store\n"; + &xor_stream_avx2($A3, $B3, $C3, $D3, 0*32, $A0); $code.=" + vmovdqa $tmp_store, $A0\n"; + &xor_stream_avx2($A2, $B2, $C2, $D2, 4*32, $A3); + &xor_stream_avx2($A1, $B1, $C1, $D1, 8*32, $A3); + &finish_stream_avx2($A0, $B0, $C0, $D0, $A3); $code.=" + lea 12*32($inp), $inp + lea 12*32($oup), $oup + sub \$12*32, $inl +.Lopen_avx2_tail_128_xor: + cmp \$32, $inl + jb .Lopen_avx2_tail_32_xor + sub \$32, $inl + vpxor ($inp), $A0, $A0 + vmovdqu $A0, ($oup) + lea 1*32($inp), $inp + lea 1*32($oup), $oup + vmovdqa $B0, $A0 + vmovdqa $C0, $B0 + vmovdqa $D0, $C0 + jmp .Lopen_avx2_tail_128_xor +.Lopen_avx2_tail_32_xor: + cmp \$16, $inl + vmovdqa $A0x, $A1x + jb .Lopen_avx2_exit + sub \$16, $inl + #load for decryption + vpxor ($inp), $A0x, $A1x + vmovdqu $A1x, ($oup) + lea 1*16($inp), $inp + lea 1*16($oup), $oup + vperm2i128 \$0x11, $A0, $A0, $A0 + vmovdqa $A0x, $A1x +.Lopen_avx2_exit: + vzeroupper + jmp .Lopen_sse_tail_16 +############################################################################### +.Lopen_avx2_192: + vmovdqa $A0, $A1 + vmovdqa $A0, $A2 + vmovdqa $B0, $B1 + vmovdqa $B0, $B2 + vmovdqa $C0, $C1 + vmovdqa $C0, $C2 + vpaddd .Lavx2_inc(%rip), $D0, $D1 + vmovdqa $D0, $T2 + vmovdqa $D1, $T3 + mov \$10, $acc0 +.Lopen_avx2_192_rounds: \n"; + &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left"); + &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left"); + &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); + &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right"); $code.=" + dec $acc0 + jne .Lopen_avx2_192_rounds + vpaddd $A2, $A0, $A0 + vpaddd $A2, $A1, $A1 + vpaddd $B2, $B0, $B0 + vpaddd $B2, $B1, $B1 + vpaddd $C2, $C0, $C0 + vpaddd $C2, $C1, $C1 + vpaddd $T2, $D0, $D0 + vpaddd $T3, $D1, $D1 + vperm2i128 \$0x02, $A0, $B0, $T0 + # Clamp and store the key + vpand .Lclamp(%rip), $T0, $T0 + vmovdqa $T0, $r_store + # Stream for up to 192 bytes + vperm2i128 \$0x13, $A0, $B0, $A0 + vperm2i128 \$0x13, $C0, $D0, $B0 + vperm2i128 \$0x02, $A1, $B1, 
$C0 + vperm2i128 \$0x02, $C1, $D1, $D0 + vperm2i128 \$0x13, $A1, $B1, $A1 + vperm2i128 \$0x13, $C1, $D1, $B1 +.Lopen_avx2_short: + mov $adl, $itr2 + call poly_hash_ad_internal +.Lopen_avx2_short_hash_and_xor_loop: + cmp \$32, $inl + jb .Lopen_avx2_short_tail_32 + sub \$32, $inl\n"; + # Load + hash + &poly_add("0*8($inp)"); + &poly_mul(); + &poly_add("2*8($inp)"); + &poly_mul(); $code.=" + # Load + decrypt + vpxor ($inp), $A0, $A0 + vmovdqu $A0, ($oup) + lea 1*32($inp), $inp + lea 1*32($oup), $oup + # Shift stream + vmovdqa $B0, $A0 + vmovdqa $C0, $B0 + vmovdqa $D0, $C0 + vmovdqa $A1, $D0 + vmovdqa $B1, $A1 + vmovdqa $C1, $B1 + vmovdqa $D1, $C1 + vmovdqa $A2, $D1 + vmovdqa $B2, $A2 + jmp .Lopen_avx2_short_hash_and_xor_loop +.Lopen_avx2_short_tail_32: + cmp \$16, $inl + vmovdqa $A0x, $A1x + jb .Lopen_avx2_short_tail_32_exit + sub \$16, $inl\n"; + &poly_add("0*8($inp)"); + &poly_mul(); $code.=" + vpxor ($inp), $A0x, $A3x + vmovdqu $A3x, ($oup) + lea 1*16($inp), $inp + lea 1*16($oup), $oup + vextracti128 \$1, $A0, $A1x +.Lopen_avx2_short_tail_32_exit: + vzeroupper + jmp .Lopen_sse_tail_16 +############################################################################### +.Lopen_avx2_320: + vmovdqa $A0, $A1 + vmovdqa $A0, $A2 + vmovdqa $B0, $B1 + vmovdqa $B0, $B2 + vmovdqa $C0, $C1 + vmovdqa $C0, $C2 + vpaddd .Lavx2_inc(%rip), $D0, $D1 + vpaddd .Lavx2_inc(%rip), $D1, $D2 + vmovdqa $B0, $T1 + vmovdqa $C0, $T2 + vmovdqa $D0, $ctr0_store + vmovdqa $D1, $ctr1_store + vmovdqa $D2, $ctr2_store + mov \$10, $acc0 +.Lopen_avx2_320_rounds: \n"; + &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left"); + &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left"); + &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"left"); + &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); + &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right"); + &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right"); $code.=" + dec $acc0 + jne .Lopen_avx2_320_rounds + vpaddd .Lchacha20_consts(%rip), $A0, $A0 + vpaddd .Lchacha20_consts(%rip), $A1, $A1 + vpaddd .Lchacha20_consts(%rip), $A2, $A2 + vpaddd $T1, $B0, $B0 + vpaddd $T1, $B1, $B1 + vpaddd $T1, $B2, $B2 + vpaddd $T2, $C0, $C0 + vpaddd $T2, $C1, $C1 + vpaddd $T2, $C2, $C2 + vpaddd $ctr0_store, $D0, $D0 + vpaddd $ctr1_store, $D1, $D1 + vpaddd $ctr2_store, $D2, $D2 + vperm2i128 \$0x02, $A0, $B0, $T0 + # Clamp and store the key + vpand .Lclamp(%rip), $T0, $T0 + vmovdqa $T0, $r_store + # Stream for up to 320 bytes + vperm2i128 \$0x13, $A0, $B0, $A0 + vperm2i128 \$0x13, $C0, $D0, $B0 + vperm2i128 \$0x02, $A1, $B1, $C0 + vperm2i128 \$0x02, $C1, $D1, $D0 + vperm2i128 \$0x13, $A1, $B1, $A1 + vperm2i128 \$0x13, $C1, $D1, $B1 + vperm2i128 \$0x02, $A2, $B2, $C1 + vperm2i128 \$0x02, $C2, $D2, $D1 + vperm2i128 \$0x13, $A2, $B2, $A2 + vperm2i128 \$0x13, $C2, $D2, $B2 + jmp .Lopen_avx2_short +.size chacha20_poly1305_open_avx2, .-chacha20_poly1305_open_avx2 +.cfi_endproc +############################################################################### +############################################################################### +.globl chacha20_poly1305_seal_avx2 +.type chacha20_poly1305_seal_avx2,\@function,6 +.align 64 +chacha20_poly1305_seal_avx2: +.cfi_startproc + _CET_ENDBR + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 +# We write the calculated authenticator back to keyp at the end, so save +# the pointer on the stack too. 
+ push $keyp +.cfi_push $keyp + sub \$288 + $xmm_storage + 32, %rsp +.cfi_adjust_cfa_offset 288 + 32 + lea 32(%rsp), %rbp + and \$-32, %rbp\n"; +$code.=" + movaps %xmm6,16*0+$xmm_store + movaps %xmm7,16*1+$xmm_store + movaps %xmm8,16*2+$xmm_store + movaps %xmm9,16*3+$xmm_store + movaps %xmm10,16*4+$xmm_store + movaps %xmm11,16*5+$xmm_store + movaps %xmm12,16*6+$xmm_store + movaps %xmm13,16*7+$xmm_store + movaps %xmm14,16*8+$xmm_store + movaps %xmm15,16*9+$xmm_store\n" if ($win64); +$code.=" + mov 56($keyp), $inl # extra_in_len + addq %rdx, $inl + mov $adl, 0+$len_store + mov $inl, 8+$len_store + mov %rdx, $inl + + vzeroupper + vmovdqa .Lchacha20_consts(%rip), $A0 + vbroadcasti128 0*16($keyp), $B0 + vbroadcasti128 1*16($keyp), $C0 + vbroadcasti128 2*16($keyp), $D0 + vpaddd .Lavx2_init(%rip), $D0, $D0 + cmp \$6*32, $inl + jbe .Lseal_avx2_192 + cmp \$10*32, $inl + jbe .Lseal_avx2_320 + vmovdqa $A0, $A1 + vmovdqa $A0, $A2 + vmovdqa $A0, $A3 + vmovdqa $B0, $B1 + vmovdqa $B0, $B2 + vmovdqa $B0, $B3 + vmovdqa $B0, $state1_store + vmovdqa $C0, $C1 + vmovdqa $C0, $C2 + vmovdqa $C0, $C3 + vmovdqa $C0, $state2_store + vmovdqa $D0, $D3 + vpaddd .Lavx2_inc(%rip), $D3, $D2 + vpaddd .Lavx2_inc(%rip), $D2, $D1 + vpaddd .Lavx2_inc(%rip), $D1, $D0 + vmovdqa $D0, $ctr0_store + vmovdqa $D1, $ctr1_store + vmovdqa $D2, $ctr2_store + vmovdqa $D3, $ctr3_store + mov \$10, $acc0 +.Lseal_avx2_init_rounds: \n"; + foreach $l (@loop_body) {$code.=$l."\n";} + @loop_body = split /\n/, $chacha_body; $code.=" + dec $acc0 + jnz .Lseal_avx2_init_rounds\n"; + &finalize_state_avx2(4); $code.=" + vperm2i128 \$0x13, $C3, $D3, $C3 + vperm2i128 \$0x02, $A3, $B3, $D3 + vperm2i128 \$0x13, $A3, $B3, $A3 + vpand .Lclamp(%rip), $D3, $D3 + vmovdqa $D3, $r_store + mov $adl, $itr2 + call poly_hash_ad_internal + # Safely store 320 bytes (otherwise would handle with optimized call) + vpxor 0*32($inp), $A3, $A3 + vpxor 1*32($inp), $C3, $C3 + vmovdqu $A3, 0*32($oup) + vmovdqu $C3, 1*32($oup)\n"; + &xor_stream_avx2($A2,$B2,$C2,$D2,2*32,$T3); + &xor_stream_avx2($A1,$B1,$C1,$D1,6*32,$T3); + &finish_stream_avx2($A0,$B0,$C0,$D0,$T3); $code.=" + lea 10*32($inp), $inp + sub \$10*32, $inl + mov \$10*32, $itr1 + cmp \$4*32, $inl + jbe .Lseal_avx2_short_hash_remainder + vpxor 0*32($inp), $A0, $A0 + vpxor 1*32($inp), $B0, $B0 + vpxor 2*32($inp), $C0, $C0 + vpxor 3*32($inp), $D0, $D0 + vmovdqu $A0, 10*32($oup) + vmovdqu $B0, 11*32($oup) + vmovdqu $C0, 12*32($oup) + vmovdqu $D0, 13*32($oup) + lea 4*32($inp), $inp + sub \$4*32, $inl + mov \$8, $itr1 + mov \$2, $itr2 + cmp \$4*32, $inl + jbe .Lseal_avx2_tail_128 + cmp \$8*32, $inl + jbe .Lseal_avx2_tail_256 + cmp \$12*32, $inl + jbe .Lseal_avx2_tail_384 + cmp \$16*32, $inl + jbe .Lseal_avx2_tail_512\n"; + # We have 448 bytes to hash, but main loop hashes 512 bytes at a time - perform some rounds, before the main loop + &prep_state_avx2(4); + foreach $l (@loop_body) {$code.=$l."\n";} + @loop_body = split /\n/, $chacha_body; + &emit_body(41); + @loop_body = split /\n/, $chacha_body; $code.=" + sub \$16, $oup + mov \$9, $itr1 + jmp .Lseal_avx2_main_loop_rounds_entry +.align 32 +.Lseal_avx2_main_loop: \n"; + &prep_state_avx2(4); $code.=" + mov \$10, $itr1 +.align 32 +.Lseal_avx2_main_loop_rounds: \n"; + &poly_add("0*8($oup)"); + &emit_body(10); + &poly_stage1_mulx(); + &emit_body(9); + &poly_stage2_mulx(); + &emit_body(12); + &poly_stage3_mulx(); + &emit_body(10); + &poly_reduce_stage(); $code.=" +.Lseal_avx2_main_loop_rounds_entry: \n"; + &emit_body(9); + &poly_add("2*8($oup)"); + &emit_body(8); + 
&poly_stage1_mulx(); + &emit_body(18); + &poly_stage2_mulx(); + &emit_body(18); + &poly_stage3_mulx(); + &emit_body(9); + &poly_reduce_stage(); + &emit_body(8); + &poly_add("4*8($oup)"); $code.=" + lea 6*8($oup), $oup\n"; + &emit_body(18); + &poly_stage1_mulx(); + &emit_body(8); + &poly_stage2_mulx(); + &emit_body(8); + &poly_stage3_mulx(); + &emit_body(18); + &poly_reduce_stage(); + foreach $l (@loop_body) {$code.=$l."\n";} + @loop_body = split /\n/, $chacha_body; $code.=" + dec $itr1 + jne .Lseal_avx2_main_loop_rounds\n"; + &finalize_state_avx2(4); $code.=" + vmovdqa $A0, $tmp_store\n"; + &poly_add("0*8($oup)"); + &poly_mul_mulx(); + &poly_add("2*8($oup)"); + &poly_mul_mulx(); $code.=" + lea 4*8($oup), $oup\n"; + &xor_stream_avx2($A3, $B3, $C3, $D3, 0*32, $A0); $code.=" + vmovdqa $tmp_store, $A0\n"; + &xor_stream_avx2($A2, $B2, $C2, $D2, 4*32, $A3); + &xor_stream_avx2($A1, $B1, $C1, $D1, 8*32, $A3); + &xor_stream_avx2($A0, $B0, $C0, $D0, 12*32, $A3); $code.=" + lea 16*32($inp), $inp + sub \$16*32, $inl + cmp \$16*32, $inl + jg .Lseal_avx2_main_loop +\n"; + &poly_add("0*8($oup)"); + &poly_mul_mulx(); + &poly_add("2*8($oup)"); + &poly_mul_mulx(); $code.=" + lea 4*8($oup), $oup + mov \$10, $itr1 + xor $itr2, $itr2 + + cmp \$12*32, $inl + ja .Lseal_avx2_tail_512 + cmp \$8*32, $inl + ja .Lseal_avx2_tail_384 + cmp \$4*32, $inl + ja .Lseal_avx2_tail_256 +############################################################################### +.Lseal_avx2_tail_128:\n"; + &prep_state_avx2(1); $code.=" +.Lseal_avx2_tail_128_rounds_and_3xhash: \n"; + &poly_add("0($oup)"); + &poly_mul_mulx(); $code.=" + lea 2*8($oup), $oup +.Lseal_avx2_tail_128_rounds_and_2xhash: \n"; + &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left"); + &poly_add("0*8($oup)"); + &poly_mul_mulx(); + &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); + &poly_add("2*8($oup)"); + &poly_mul_mulx(); $code.=" + lea 4*8($oup), $oup + dec $itr1 + jg .Lseal_avx2_tail_128_rounds_and_3xhash + dec $itr2 + jge .Lseal_avx2_tail_128_rounds_and_2xhash\n"; + &finalize_state_avx2(1); + &finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.=" + jmp .Lseal_avx2_short_loop +############################################################################### +.Lseal_avx2_tail_256:\n"; + &prep_state_avx2(2); $code.=" +.Lseal_avx2_tail_256_rounds_and_3xhash: \n"; + &poly_add("0($oup)"); + &poly_mul(); $code.=" + lea 2*8($oup), $oup +.Lseal_avx2_tail_256_rounds_and_2xhash: \n"; + &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left"); + &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left"); + &poly_add("0*8($oup)"); + &poly_mul(); + &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); + &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right"); + &poly_add("2*8($oup)"); + &poly_mul(); $code.=" + lea 4*8($oup), $oup + dec $itr1 + jg .Lseal_avx2_tail_256_rounds_and_3xhash + dec $itr2 + jge .Lseal_avx2_tail_256_rounds_and_2xhash\n"; + &finalize_state_avx2(2); + &xor_stream_avx2($A1,$B1,$C1,$D1,0*32,$T0); + &finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.=" + mov \$4*32, $itr1 + lea 4*32($inp), $inp + sub \$4*32, $inl + jmp .Lseal_avx2_short_hash_remainder +############################################################################### +.Lseal_avx2_tail_384:\n"; + &prep_state_avx2(3); $code.=" +.Lseal_avx2_tail_384_rounds_and_3xhash: \n"; + &poly_add("0($oup)"); + &poly_mul(); $code.=" + lea 2*8($oup), $oup +.Lseal_avx2_tail_384_rounds_and_2xhash: \n"; + &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left"); + &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left"); + &poly_add("0*8($oup)"); + &poly_mul(); + &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"left"); 
+ &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); + &poly_add("2*8($oup)"); + &poly_mul(); + &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right"); + &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right"); $code.=" + lea 4*8($oup), $oup + dec $itr1 + jg .Lseal_avx2_tail_384_rounds_and_3xhash + dec $itr2 + jge .Lseal_avx2_tail_384_rounds_and_2xhash\n"; + &finalize_state_avx2(3); + &xor_stream_avx2($A2,$B2,$C2,$D2,0*32,$T0); + &xor_stream_avx2($A1,$B1,$C1,$D1,4*32,$T0); + &finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.=" + mov \$8*32, $itr1 + lea 8*32($inp), $inp + sub \$8*32, $inl + jmp .Lseal_avx2_short_hash_remainder +############################################################################### +.Lseal_avx2_tail_512:\n"; + &prep_state_avx2(4); $code.=" +.Lseal_avx2_tail_512_rounds_and_3xhash: \n"; + &poly_add("0($oup)"); + &poly_mul_mulx(); $code.=" + lea 2*8($oup), $oup +.Lseal_avx2_tail_512_rounds_and_2xhash: \n"; + &emit_body(20); + &poly_add("0*8($oup)"); + &emit_body(20); + &poly_stage1_mulx(); + &emit_body(20); + &poly_stage2_mulx(); + &emit_body(20); + &poly_stage3_mulx(); + &emit_body(20); + &poly_reduce_stage(); + &emit_body(20); + &poly_add("2*8($oup)"); + &emit_body(20); + &poly_stage1_mulx(); + &emit_body(20); + &poly_stage2_mulx(); + &emit_body(20); + &poly_stage3_mulx(); + &emit_body(20); + &poly_reduce_stage(); + foreach $l (@loop_body) {$code.=$l."\n";} + @loop_body = split /\n/, $chacha_body; $code.=" + lea 4*8($oup), $oup + dec $itr1 + jg .Lseal_avx2_tail_512_rounds_and_3xhash + dec $itr2 + jge .Lseal_avx2_tail_512_rounds_and_2xhash\n"; + &finalize_state_avx2(4); $code.=" + vmovdqa $A0, $tmp_store\n"; + &xor_stream_avx2($A3, $B3, $C3, $D3, 0*32, $A0); $code.=" + vmovdqa $tmp_store, $A0\n"; + &xor_stream_avx2($A2, $B2, $C2, $D2, 4*32, $A3); + &xor_stream_avx2($A1, $B1, $C1, $D1, 8*32, $A3); + &finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.=" + mov \$12*32, $itr1 + lea 12*32($inp), $inp + sub \$12*32, $inl + jmp .Lseal_avx2_short_hash_remainder +################################################################################ +.Lseal_avx2_320: + vmovdqa $A0, $A1 + vmovdqa $A0, $A2 + vmovdqa $B0, $B1 + vmovdqa $B0, $B2 + vmovdqa $C0, $C1 + vmovdqa $C0, $C2 + vpaddd .Lavx2_inc(%rip), $D0, $D1 + vpaddd .Lavx2_inc(%rip), $D1, $D2 + vmovdqa $B0, $T1 + vmovdqa $C0, $T2 + vmovdqa $D0, $ctr0_store + vmovdqa $D1, $ctr1_store + vmovdqa $D2, $ctr2_store + mov \$10, $acc0 +.Lseal_avx2_320_rounds: \n"; + &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left"); + &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left"); + &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"left"); + &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); + &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right"); + &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right"); $code.=" + dec $acc0 + jne .Lseal_avx2_320_rounds + vpaddd .Lchacha20_consts(%rip), $A0, $A0 + vpaddd .Lchacha20_consts(%rip), $A1, $A1 + vpaddd .Lchacha20_consts(%rip), $A2, $A2 + vpaddd $T1, $B0, $B0 + vpaddd $T1, $B1, $B1 + vpaddd $T1, $B2, $B2 + vpaddd $T2, $C0, $C0 + vpaddd $T2, $C1, $C1 + vpaddd $T2, $C2, $C2 + vpaddd $ctr0_store, $D0, $D0 + vpaddd $ctr1_store, $D1, $D1 + vpaddd $ctr2_store, $D2, $D2 + vperm2i128 \$0x02, $A0, $B0, $T0 + # Clamp and store the key + vpand .Lclamp(%rip), $T0, $T0 + vmovdqa $T0, $r_store + # Stream for up to 320 bytes + vperm2i128 \$0x13, $A0, $B0, $A0 + vperm2i128 \$0x13, $C0, $D0, $B0 + vperm2i128 \$0x02, $A1, $B1, $C0 + vperm2i128 \$0x02, $C1, $D1, $D0 + vperm2i128 \$0x13, $A1, $B1, $A1 + vperm2i128 \$0x13, $C1, $D1, $B1 + vperm2i128 \$0x02, $A2, $B2, $C1 + vperm2i128 \$0x02, $C2, $D2, 
$D1 + vperm2i128 \$0x13, $A2, $B2, $A2 + vperm2i128 \$0x13, $C2, $D2, $B2 + jmp .Lseal_avx2_short +################################################################################ +.Lseal_avx2_192: + vmovdqa $A0, $A1 + vmovdqa $A0, $A2 + vmovdqa $B0, $B1 + vmovdqa $B0, $B2 + vmovdqa $C0, $C1 + vmovdqa $C0, $C2 + vpaddd .Lavx2_inc(%rip), $D0, $D1 + vmovdqa $D0, $T2 + vmovdqa $D1, $T3 + mov \$10, $acc0 +.Lseal_avx2_192_rounds: \n"; + &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left"); + &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left"); + &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); + &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right"); $code.=" + dec $acc0 + jne .Lseal_avx2_192_rounds + vpaddd $A2, $A0, $A0 + vpaddd $A2, $A1, $A1 + vpaddd $B2, $B0, $B0 + vpaddd $B2, $B1, $B1 + vpaddd $C2, $C0, $C0 + vpaddd $C2, $C1, $C1 + vpaddd $T2, $D0, $D0 + vpaddd $T3, $D1, $D1 + vperm2i128 \$0x02, $A0, $B0, $T0 + # Clamp and store the key + vpand .Lclamp(%rip), $T0, $T0 + vmovdqa $T0, $r_store + # Stream for up to 192 bytes + vperm2i128 \$0x13, $A0, $B0, $A0 + vperm2i128 \$0x13, $C0, $D0, $B0 + vperm2i128 \$0x02, $A1, $B1, $C0 + vperm2i128 \$0x02, $C1, $D1, $D0 + vperm2i128 \$0x13, $A1, $B1, $A1 + vperm2i128 \$0x13, $C1, $D1, $B1 +.Lseal_avx2_short: + mov $adl, $itr2 + call poly_hash_ad_internal + xor $itr1, $itr1 +.Lseal_avx2_short_hash_remainder: + cmp \$16, $itr1 + jb .Lseal_avx2_short_loop\n"; + &poly_add("0($oup)"); + &poly_mul(); $code.=" + sub \$16, $itr1 + add \$16, $oup + jmp .Lseal_avx2_short_hash_remainder +.Lseal_avx2_short_loop: + cmp \$32, $inl + jb .Lseal_avx2_short_tail + sub \$32, $inl + # Encrypt + vpxor ($inp), $A0, $A0 + vmovdqu $A0, ($oup) + lea 1*32($inp), $inp + # Load + hash\n"; + &poly_add("0*8($oup)"); + &poly_mul(); + &poly_add("2*8($oup)"); + &poly_mul(); $code.=" + lea 1*32($oup), $oup + # Shift stream + vmovdqa $B0, $A0 + vmovdqa $C0, $B0 + vmovdqa $D0, $C0 + vmovdqa $A1, $D0 + vmovdqa $B1, $A1 + vmovdqa $C1, $B1 + vmovdqa $D1, $C1 + vmovdqa $A2, $D1 + vmovdqa $B2, $A2 + jmp .Lseal_avx2_short_loop +.Lseal_avx2_short_tail: + cmp \$16, $inl + jb .Lseal_avx2_exit + sub \$16, $inl + vpxor ($inp), $A0x, $A3x + vmovdqu $A3x, ($oup) + lea 1*16($inp), $inp\n"; + &poly_add("0*8($oup)"); + &poly_mul(); $code.=" + lea 1*16($oup), $oup + vextracti128 \$1, $A0, $A0x +.Lseal_avx2_exit: + vzeroupper + jmp .Lseal_sse_tail_16 +.cfi_endproc +.size chacha20_poly1305_seal_avx2, .-chacha20_poly1305_seal_avx2 +"; +} + +$code =~ s/\`([^\`]*)\`/eval $1/gem; + +print $code; + +close STDOUT or die "error closing STDOUT: $!"; diff --git a/ring-0.17.14/crypto/constant_time_test.c b/ring-0.17.14/crypto/constant_time_test.c new file mode 100644 index 0000000000..ed079f6a23 --- /dev/null +++ b/ring-0.17.14/crypto/constant_time_test.c @@ -0,0 +1,110 @@ +// Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
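The constant_time_test.c file added here exercises ring's branch-free word helpers (constant_time_is_zero_w, constant_time_eq_w, constant_time_select_w). As a rough standalone sketch of the mask-based idiom those helpers rely on (the names and definitions below are illustrative, not the ones from ring's internal.h):

#include <stdint.h>
#include <stdio.h>

typedef uintptr_t word_t;                      /* stand-in for crypto_word_t */
#define WORD_BITS (sizeof(word_t) * 8)

/* All-ones if the most significant bit of w is set, all-zeros otherwise. */
static word_t ct_msb(word_t w) { return (word_t)0 - (w >> (WORD_BITS - 1)); }

/* All-ones when a == 0, all-zeros otherwise, with no data-dependent branch. */
static word_t ct_is_zero(word_t a) { return ct_msb(~a & (a - (word_t)1)); }

/* All-ones when a == b, all-zeros otherwise. */
static word_t ct_eq(word_t a, word_t b) { return ct_is_zero(a ^ b); }

/* Returns a when mask is all-ones, b when mask is all-zeros. */
static word_t ct_select(word_t mask, word_t a, word_t b) {
  return (mask & a) | (~mask & b);
}

int main(void) {
  word_t a = 12345, b = 67890;
  printf("eq(a,a) selects a: %d\n", ct_select(ct_eq(a, a), a, b) == a);
  printf("eq(a,b) is false:  %d\n", ct_eq(a, b) == (word_t)0);
  return 0;
}

The tests that follow drive the real helpers across edge values (0, 1, SIZE_MAX/2, SIZE_MAX, ...) and count any result that is not exactly the all-ones or all-zeros mask as a failure.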
+ +#include "internal.h" + +int bssl_constant_time_test_main(void); + +static int test_binary_op_w(crypto_word_t (*op)(crypto_word_t a, crypto_word_t b), + crypto_word_t a, crypto_word_t b, int is_true) { + crypto_word_t c = op(a, b); + if (is_true && c != CONSTTIME_TRUE_W) { + return 1; + } else if (!is_true && c != CONSTTIME_FALSE_W) { + return 1; + } + return 0; +} + +static int test_is_zero_w(crypto_word_t a) { + crypto_word_t c = constant_time_is_zero_w(a); + if (a == 0 && c != CONSTTIME_TRUE_W) { + return 1; + } else if (a != 0 && c != CONSTTIME_FALSE_W) { + return 1; + } + + c = constant_time_is_nonzero_w(a); + if (a == 0 && c != CONSTTIME_FALSE_W) { + return 1; + } else if (a != 0 && c != CONSTTIME_TRUE_W) { + return 1; + } + + return 0; +} + +static int test_select_w(crypto_word_t a, crypto_word_t b) { + crypto_word_t selected = constant_time_select_w(CONSTTIME_TRUE_W, a, b); + if (selected != a) { + return 1; + } + selected = constant_time_select_w(CONSTTIME_FALSE_W, a, b); + if (selected != b) { + return 1; + } + return 0; +} + +static crypto_word_t test_values_w[] = { + 0, + 1, + 1024, + 12345, + 32000, +#if defined(OPENSSL_64_BIT) + 0xffffffff / 2 - 1, + 0xffffffff / 2, + 0xffffffff / 2 + 1, + 0xffffffff - 1, + 0xffffffff, +#endif + SIZE_MAX / 2 - 1, + SIZE_MAX / 2, + SIZE_MAX / 2 + 1, + SIZE_MAX - 1, + SIZE_MAX +}; + +int bssl_constant_time_test_main(void) { + int num_failed = 0; + + for (size_t i = 0; + i < sizeof(test_values_w) / sizeof(test_values_w[0]); ++i) { + crypto_word_t a = test_values_w[i]; + num_failed += test_is_zero_w(a); + for (size_t j = 0; + j < sizeof(test_values_w) / sizeof(test_values_w[0]); ++j) { + crypto_word_t b = test_values_w[j]; + num_failed += test_binary_op_w(&constant_time_eq_w, a, b, a == b); + num_failed += test_binary_op_w(&constant_time_eq_w, b, a, b == a); + num_failed += test_select_w(a, b); + } + } + + return num_failed == 0; +} + +// Exposes `constant_time_conditional_memcpy` to Rust for tests only. +void bssl_constant_time_test_conditional_memcpy(uint8_t dst[256], const uint8_t src[256], + crypto_word_t b) { + constant_time_conditional_memcpy(dst, src, 256, b); + } + +// Exposes `constant_time_conditional_memxor` to Rust for tests only. +void bssl_constant_time_test_conditional_memxor(uint8_t dst[256], + const uint8_t src[256], + crypto_word_t b) { + constant_time_conditional_memxor(dst, src, 256, b); +} diff --git a/ring-0.17.14/crypto/cpu_intel.c b/ring-0.17.14/crypto/cpu_intel.c new file mode 100644 index 0000000000..6e792b6ba4 --- /dev/null +++ b/ring-0.17.14/crypto/cpu_intel.c @@ -0,0 +1,198 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + + +#if !defined(OPENSSL_NO_ASM) && (defined(OPENSSL_X86) || defined(OPENSSL_X86_64)) + +#if defined(_MSC_VER) && !defined(__clang__) +#pragma warning(push, 3) +#include +#include +#pragma warning(pop) +#endif + +#include "internal.h" + + +// OPENSSL_cpuid runs the cpuid instruction. 
|leaf| is passed in as EAX and ECX +// is set to zero. It writes EAX, EBX, ECX, and EDX to |*out_eax| through +// |*out_edx|. +static void OPENSSL_cpuid(uint32_t *out_eax, uint32_t *out_ebx, + uint32_t *out_ecx, uint32_t *out_edx, uint32_t leaf) { +#if defined(_MSC_VER) && !defined(__clang__) + int tmp[4]; + __cpuid(tmp, (int)leaf); + *out_eax = (uint32_t)tmp[0]; + *out_ebx = (uint32_t)tmp[1]; + *out_ecx = (uint32_t)tmp[2]; + *out_edx = (uint32_t)tmp[3]; +#elif defined(__pic__) && defined(OPENSSL_32_BIT) + // Inline assembly may not clobber the PIC register. For 32-bit, this is EBX. + // See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=47602. + __asm__ volatile ( + "xor %%ecx, %%ecx\n" + "mov %%ebx, %%edi\n" + "cpuid\n" + "xchg %%edi, %%ebx\n" + : "=a"(*out_eax), "=D"(*out_ebx), "=c"(*out_ecx), "=d"(*out_edx) + : "a"(leaf) + ); +#else + __asm__ volatile ( + "xor %%ecx, %%ecx\n" + "cpuid\n" + : "=a"(*out_eax), "=b"(*out_ebx), "=c"(*out_ecx), "=d"(*out_edx) + : "a"(leaf) + ); +#endif +} + +// OPENSSL_xgetbv returns the value of an Intel Extended Control Register (XCR). +// Currently only XCR0 is defined by Intel so |xcr| should always be zero. +// +// See https://software.intel.com/en-us/articles/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family +static uint64_t OPENSSL_xgetbv(uint32_t xcr) { +#if defined(_MSC_VER) && !defined(__clang__) + return (uint64_t)_xgetbv(xcr); +#else + uint32_t eax, edx; + __asm__ volatile ("xgetbv" : "=a"(eax), "=d"(edx) : "c"(xcr)); + return (((uint64_t)edx) << 32) | eax; +#endif +} + +void OPENSSL_cpuid_setup(uint32_t OPENSSL_ia32cap_P[4]) { + // Determine the vendor and maximum input value. + uint32_t eax, ebx, ecx, edx; + OPENSSL_cpuid(&eax, &ebx, &ecx, &edx, 0); + + uint32_t num_ids = eax; + + int is_intel = ebx == 0x756e6547 /* Genu */ && + edx == 0x49656e69 /* ineI */ && + ecx == 0x6c65746e /* ntel */; + + uint32_t extended_features[2] = {0}; + if (num_ids >= 7) { + OPENSSL_cpuid(&eax, &ebx, &ecx, &edx, 7); + extended_features[0] = ebx; + extended_features[1] = ecx; + } + + OPENSSL_cpuid(&eax, &ebx, &ecx, &edx, 1); + + const uint32_t base_family = (eax >> 8) & 15; + const uint32_t base_model = (eax >> 4) & 15; + + uint32_t family = base_family; + uint32_t model = base_model; + if (base_family == 15) { + const uint32_t ext_family = (eax >> 20) & 255; + family += ext_family; + } + if (base_family == 6 || base_family == 15) { + const uint32_t ext_model = (eax >> 16) & 15; + model |= ext_model << 4; + } + + // Reserved bit #30 is repurposed to signal an Intel CPU. + if (is_intel) { + edx |= (1u << 30); + } else { + edx &= ~(1u << 30); + } + + uint64_t xcr0 = 0; + if (ecx & (1u << 27)) { + // XCR0 may only be queried if the OSXSAVE bit is set. + xcr0 = OPENSSL_xgetbv(0); + } + // See Intel manual, volume 1, section 14.3. + if ((xcr0 & 6) != 6) { + // YMM registers cannot be used. + ecx &= ~(1u << 28); // AVX + ecx &= ~(1u << 12); // FMA + ecx &= ~(1u << 11); // AMD XOP + extended_features[0] &= ~(1u << 5); // AVX2 + extended_features[1] &= ~(1u << 9); // VAES + extended_features[1] &= ~(1u << 10); // VPCLMULQDQ + } + // See Intel manual, volume 1, sections 15.2 ("Detection of AVX-512 Foundation + // Instructions") through 15.4 ("Detection of Intel AVX-512 Instruction Groups + // Operating at 256 and 128-bit Vector Lengths"). + if ((xcr0 & 0xe6) != 0xe6) { + // Without XCR0.111xx11x, no AVX512 feature can be used. 
This includes ZMM + // registers, masking, SIMD registers 16-31 (even if accessed as YMM or + // XMM), and EVEX-coded instructions (even on YMM or XMM). Even if only + // XCR0.ZMM_Hi256 is missing, it isn't valid to use AVX512 features on + // shorter vectors, since AVX512 ties everything to the availability of + // 512-bit vectors. See the above-mentioned sections of the Intel manual, + // which say that *all* these XCR0 bits must be checked even when just using + // 128-bit or 256-bit vectors, and also volume 2a section 2.7.11 ("#UD + // Equations for EVEX") which says that all EVEX-coded instructions raise an + // undefined-instruction exception if any of these XCR0 bits is zero. + // + // AVX10 fixes this by reorganizing the features that used to be part of + // "AVX512" and allowing them to be used independently of 512-bit support. + // TODO: add AVX10 detection. + extended_features[0] &= ~(1u << 16); // AVX512F + extended_features[0] &= ~(1u << 17); // AVX512DQ + extended_features[0] &= ~(1u << 21); // AVX512IFMA + extended_features[0] &= ~(1u << 26); // AVX512PF + extended_features[0] &= ~(1u << 27); // AVX512ER + extended_features[0] &= ~(1u << 28); // AVX512CD + extended_features[0] &= ~(1u << 30); // AVX512BW + extended_features[0] &= ~(1u << 31); // AVX512VL + extended_features[1] &= ~(1u << 1); // AVX512VBMI + extended_features[1] &= ~(1u << 6); // AVX512VBMI2 + extended_features[1] &= ~(1u << 11); // AVX512VNNI + extended_features[1] &= ~(1u << 12); // AVX512BITALG + extended_features[1] &= ~(1u << 14); // AVX512VPOPCNTDQ + } + + // Repurpose the bit for the removed MPX feature to indicate when using zmm + // registers should be avoided even when they are supported. (When set, AVX512 + // features can still be used, but only using ymm or xmm registers.) Skylake + // suffered from severe downclocking when zmm registers were used, which + // affected unrelated code running on the system, making zmm registers not too + // useful outside of benchmarks. The situation improved significantly by Ice + // Lake, but a small amount of downclocking remained. (See + // https://lore.kernel.org/linux-crypto/e8ce1146-3952-6977-1d0e-a22758e58914@intel.com/) + // We take a conservative approach of not allowing zmm registers until after + // Ice Lake and Tiger Lake, i.e. until Sapphire Rapids on the server side. + // + // AMD CPUs, which support AVX512 starting with Zen 4, have not been reported + // to have any downclocking problem when zmm registers are used. + if (is_intel && family == 6 && + (model == 85 || // Skylake, Cascade Lake, Cooper Lake (server) + model == 106 || // Ice Lake (server) + model == 108 || // Ice Lake (micro server) + model == 125 || // Ice Lake (client) + model == 126 || // Ice Lake (mobile) + model == 140 || // Tiger Lake (mobile) + model == 141)) { // Tiger Lake (client) + extended_features[0] |= 1u << 14; + } else { + extended_features[0] &= ~(1u << 14); + } + + OPENSSL_ia32cap_P[0] = edx; + OPENSSL_ia32cap_P[1] = ecx; + OPENSSL_ia32cap_P[2] = extended_features[0]; + OPENSSL_ia32cap_P[3] = extended_features[1]; +} + +#endif // !OPENSSL_NO_ASM && (OPENSSL_X86 || OPENSSL_X86_64) diff --git a/ring-0.17.14/crypto/crypto.c b/ring-0.17.14/crypto/crypto.c new file mode 100644 index 0000000000..153f97f821 --- /dev/null +++ b/ring-0.17.14/crypto/crypto.c @@ -0,0 +1,34 @@ +// Copyright 2014 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
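The detection code above only reports AVX, AVX2, and the AVX-512 groups as usable after confirming that the operating system actually saves the corresponding register state: CPUID.1:ECX bit 27 (OSXSAVE) gates an XGETBV read of XCR0, and the SSE+AVX state bits (mask 6) or the wider AVX-512 state bits (mask 0xe6) must all be set before the feature bits are kept. A minimal standalone check in the same spirit for the AVX case (GCC/Clang on x86-64; an illustration under those assumptions, not ring's detection code):

#include <stdint.h>
#include <stdio.h>
#include <cpuid.h>  /* GCC/Clang helper around the CPUID instruction */

int main(void) {
  unsigned eax, ebx, ecx, edx;
  if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx)) return 1;
  int osxsave = (ecx >> 27) & 1;  /* OS exposes XCR0 via XGETBV */
  int avx_cpu = (ecx >> 28) & 1;  /* CPU implements AVX */
  uint64_t xcr0 = 0;
  if (osxsave) {
    uint32_t lo, hi;
    __asm__ volatile("xgetbv" : "=a"(lo), "=d"(hi) : "c"(0));
    xcr0 = ((uint64_t)hi << 32) | lo;
  }
  /* YMM registers are usable only if the OS saves both SSE and AVX state. */
  int avx_usable = avx_cpu && osxsave && (xcr0 & 6) == 6;
  printf("AVX usable: %d\n", avx_usable);
  return 0;
}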
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +// Our assembly does not use the GOT to reference symbols, which means +// references to visible symbols will often require a TEXTREL. This is +// undesirable, so all assembly-referenced symbols should be hidden. CPU +// capabilities are the only such symbols defined in C. Explicitly hide them, +// rather than rely on being built with -fvisibility=hidden. +#if defined(OPENSSL_WINDOWS) +#define HIDDEN +#else +#define HIDDEN __attribute__((visibility("hidden"))) +#endif + +#if defined(OPENSSL_X86_64) +// These are declared as `AtomicU32` on the Rust side. +HIDDEN uint32_t avx2_available = 0; +HIDDEN uint32_t adx_bmi2_available = 0; +#elif defined(OPENSSL_ARM) +HIDDEN uint32_t neon_available = 0; +#endif diff --git a/ring-0.17.14/crypto/curve25519/asm/x25519-asm-arm.S b/ring-0.17.14/crypto/curve25519/asm/x25519-asm-arm.S new file mode 100644 index 0000000000..c085734102 --- /dev/null +++ b/ring-0.17.14/crypto/curve25519/asm/x25519-asm-arm.S @@ -0,0 +1,2124 @@ +// Copyright 2015 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/* This file is taken from crypto_scalarmult/curve25519/neon2/scalarmult.s in + * SUPERCOP 20141124 (http://bench.cr.yp.to/supercop.html). That code is public + * domain licensed but the standard Apache 2.0 license is included above to keep + * licensing simple. */ + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__ELF__) + +.fpu neon +.text +.align 4 + +.global x25519_NEON +.hidden x25519_NEON +.type x25519_NEON, %function +x25519_NEON: +vpush {q4,q5,q6,q7} +mov r12,sp +sub sp,sp,#736 +and sp,sp,#0xffffffe0 +strd r4,[sp,#0] +strd r6,[sp,#8] +strd r8,[sp,#16] +strd r10,[sp,#24] +str r12,[sp,#480] +str r14,[sp,#484] +mov r0,r0 +mov r1,r1 +mov r2,r2 +add r3,sp,#32 +ldr r4,=0 +ldr r5,=254 +vmov.i32 q0,#1 +vshr.u64 q1,q0,#7 +vshr.u64 q0,q0,#8 +vmov.i32 d4,#19 +vmov.i32 d5,#38 +add r6,sp,#512 +vst1.8 {d2-d3},[r6,: 128] +add r6,sp,#528 +vst1.8 {d0-d1},[r6,: 128] +add r6,sp,#544 +vst1.8 {d4-d5},[r6,: 128] +add r6,r3,#0 +vmov.i32 q2,#0 +vst1.8 {d4-d5},[r6,: 128]! +vst1.8 {d4-d5},[r6,: 128]! +vst1.8 d4,[r6,: 64] +add r6,r3,#0 +ldr r7,=960 +sub r7,r7,#2 +neg r7,r7 +sub r7,r7,r7,LSL #7 +str r7,[r6] +add r6,sp,#704 +vld1.8 {d4-d5},[r1]! +vld1.8 {d6-d7},[r1] +vst1.8 {d4-d5},[r6,: 128]! 
+vst1.8 {d6-d7},[r6,: 128] +sub r1,r6,#16 +ldrb r6,[r1] +and r6,r6,#248 +strb r6,[r1] +ldrb r6,[r1,#31] +and r6,r6,#127 +orr r6,r6,#64 +strb r6,[r1,#31] +vmov.i64 q2,#0xffffffff +vshr.u64 q3,q2,#7 +vshr.u64 q2,q2,#6 +vld1.8 {d8},[r2] +vld1.8 {d10},[r2] +add r2,r2,#6 +vld1.8 {d12},[r2] +vld1.8 {d14},[r2] +add r2,r2,#6 +vld1.8 {d16},[r2] +add r2,r2,#4 +vld1.8 {d18},[r2] +vld1.8 {d20},[r2] +add r2,r2,#6 +vld1.8 {d22},[r2] +add r2,r2,#2 +vld1.8 {d24},[r2] +vld1.8 {d26},[r2] +vshr.u64 q5,q5,#26 +vshr.u64 q6,q6,#3 +vshr.u64 q7,q7,#29 +vshr.u64 q8,q8,#6 +vshr.u64 q10,q10,#25 +vshr.u64 q11,q11,#3 +vshr.u64 q12,q12,#12 +vshr.u64 q13,q13,#38 +vand q4,q4,q2 +vand q6,q6,q2 +vand q8,q8,q2 +vand q10,q10,q2 +vand q2,q12,q2 +vand q5,q5,q3 +vand q7,q7,q3 +vand q9,q9,q3 +vand q11,q11,q3 +vand q3,q13,q3 +add r2,r3,#48 +vadd.i64 q12,q4,q1 +vadd.i64 q13,q10,q1 +vshr.s64 q12,q12,#26 +vshr.s64 q13,q13,#26 +vadd.i64 q5,q5,q12 +vshl.i64 q12,q12,#26 +vadd.i64 q14,q5,q0 +vadd.i64 q11,q11,q13 +vshl.i64 q13,q13,#26 +vadd.i64 q15,q11,q0 +vsub.i64 q4,q4,q12 +vshr.s64 q12,q14,#25 +vsub.i64 q10,q10,q13 +vshr.s64 q13,q15,#25 +vadd.i64 q6,q6,q12 +vshl.i64 q12,q12,#25 +vadd.i64 q14,q6,q1 +vadd.i64 q2,q2,q13 +vsub.i64 q5,q5,q12 +vshr.s64 q12,q14,#26 +vshl.i64 q13,q13,#25 +vadd.i64 q14,q2,q1 +vadd.i64 q7,q7,q12 +vshl.i64 q12,q12,#26 +vadd.i64 q15,q7,q0 +vsub.i64 q11,q11,q13 +vshr.s64 q13,q14,#26 +vsub.i64 q6,q6,q12 +vshr.s64 q12,q15,#25 +vadd.i64 q3,q3,q13 +vshl.i64 q13,q13,#26 +vadd.i64 q14,q3,q0 +vadd.i64 q8,q8,q12 +vshl.i64 q12,q12,#25 +vadd.i64 q15,q8,q1 +add r2,r2,#8 +vsub.i64 q2,q2,q13 +vshr.s64 q13,q14,#25 +vsub.i64 q7,q7,q12 +vshr.s64 q12,q15,#26 +vadd.i64 q14,q13,q13 +vadd.i64 q9,q9,q12 +vtrn.32 d12,d14 +vshl.i64 q12,q12,#26 +vtrn.32 d13,d15 +vadd.i64 q0,q9,q0 +vadd.i64 q4,q4,q14 +vst1.8 d12,[r2,: 64]! +vshl.i64 q6,q13,#4 +vsub.i64 q7,q8,q12 +vshr.s64 q0,q0,#25 +vadd.i64 q4,q4,q6 +vadd.i64 q6,q10,q0 +vshl.i64 q0,q0,#25 +vadd.i64 q8,q6,q1 +vadd.i64 q4,q4,q13 +vshl.i64 q10,q13,#25 +vadd.i64 q1,q4,q1 +vsub.i64 q0,q9,q0 +vshr.s64 q8,q8,#26 +vsub.i64 q3,q3,q10 +vtrn.32 d14,d0 +vshr.s64 q1,q1,#26 +vtrn.32 d15,d1 +vadd.i64 q0,q11,q8 +vst1.8 d14,[r2,: 64] +vshl.i64 q7,q8,#26 +vadd.i64 q5,q5,q1 +vtrn.32 d4,d6 +vshl.i64 q1,q1,#26 +vtrn.32 d5,d7 +vsub.i64 q3,q6,q7 +add r2,r2,#16 +vsub.i64 q1,q4,q1 +vst1.8 d4,[r2,: 64] +vtrn.32 d6,d0 +vtrn.32 d7,d1 +sub r2,r2,#8 +vtrn.32 d2,d10 +vtrn.32 d3,d11 +vst1.8 d6,[r2,: 64] +sub r2,r2,#24 +vst1.8 d2,[r2,: 64] +add r2,r3,#96 +vmov.i32 q0,#0 +vmov.i64 d2,#0xff +vmov.i64 d3,#0 +vshr.u32 q1,q1,#7 +vst1.8 {d2-d3},[r2,: 128]! +vst1.8 {d0-d1},[r2,: 128]! +vst1.8 d0,[r2,: 64] +add r2,r3,#144 +vmov.i32 q0,#0 +vst1.8 {d0-d1},[r2,: 128]! +vst1.8 {d0-d1},[r2,: 128]! +vst1.8 d0,[r2,: 64] +add r2,r3,#240 +vmov.i32 q0,#0 +vmov.i64 d2,#0xff +vmov.i64 d3,#0 +vshr.u32 q1,q1,#7 +vst1.8 {d2-d3},[r2,: 128]! +vst1.8 {d0-d1},[r2,: 128]! +vst1.8 d0,[r2,: 64] +add r2,r3,#48 +add r6,r3,#192 +vld1.8 {d0-d1},[r2,: 128]! +vld1.8 {d2-d3},[r2,: 128]! +vld1.8 {d4},[r2,: 64] +vst1.8 {d0-d1},[r6,: 128]! +vst1.8 {d2-d3},[r6,: 128]! +vst1.8 d4,[r6,: 64] +._mainloop: +mov r2,r5,LSR #3 +and r6,r5,#7 +ldrb r2,[r1,r2] +mov r2,r2,LSR r6 +and r2,r2,#1 +str r5,[sp,#488] +eor r4,r4,r2 +str r2,[sp,#492] +neg r2,r4 +add r4,r3,#96 +add r5,r3,#192 +add r6,r3,#144 +vld1.8 {d8-d9},[r4,: 128]! +add r7,r3,#240 +vld1.8 {d10-d11},[r5,: 128]! +veor q6,q4,q5 +vld1.8 {d14-d15},[r6,: 128]! +vdup.i32 q8,r2 +vld1.8 {d18-d19},[r7,: 128]! +veor q10,q7,q9 +vld1.8 {d22-d23},[r4,: 128]! +vand q6,q6,q8 +vld1.8 {d24-d25},[r5,: 128]! 
+vand q10,q10,q8 +vld1.8 {d26-d27},[r6,: 128]! +veor q4,q4,q6 +vld1.8 {d28-d29},[r7,: 128]! +veor q5,q5,q6 +vld1.8 {d0},[r4,: 64] +veor q6,q7,q10 +vld1.8 {d2},[r5,: 64] +veor q7,q9,q10 +vld1.8 {d4},[r6,: 64] +veor q9,q11,q12 +vld1.8 {d6},[r7,: 64] +veor q10,q0,q1 +sub r2,r4,#32 +vand q9,q9,q8 +sub r4,r5,#32 +vand q10,q10,q8 +sub r5,r6,#32 +veor q11,q11,q9 +sub r6,r7,#32 +veor q0,q0,q10 +veor q9,q12,q9 +veor q1,q1,q10 +veor q10,q13,q14 +veor q12,q2,q3 +vand q10,q10,q8 +vand q8,q12,q8 +veor q12,q13,q10 +veor q2,q2,q8 +veor q10,q14,q10 +veor q3,q3,q8 +vadd.i32 q8,q4,q6 +vsub.i32 q4,q4,q6 +vst1.8 {d16-d17},[r2,: 128]! +vadd.i32 q6,q11,q12 +vst1.8 {d8-d9},[r5,: 128]! +vsub.i32 q4,q11,q12 +vst1.8 {d12-d13},[r2,: 128]! +vadd.i32 q6,q0,q2 +vst1.8 {d8-d9},[r5,: 128]! +vsub.i32 q0,q0,q2 +vst1.8 d12,[r2,: 64] +vadd.i32 q2,q5,q7 +vst1.8 d0,[r5,: 64] +vsub.i32 q0,q5,q7 +vst1.8 {d4-d5},[r4,: 128]! +vadd.i32 q2,q9,q10 +vst1.8 {d0-d1},[r6,: 128]! +vsub.i32 q0,q9,q10 +vst1.8 {d4-d5},[r4,: 128]! +vadd.i32 q2,q1,q3 +vst1.8 {d0-d1},[r6,: 128]! +vsub.i32 q0,q1,q3 +vst1.8 d4,[r4,: 64] +vst1.8 d0,[r6,: 64] +add r2,sp,#544 +add r4,r3,#96 +add r5,r3,#144 +vld1.8 {d0-d1},[r2,: 128] +vld1.8 {d2-d3},[r4,: 128]! +vld1.8 {d4-d5},[r5,: 128]! +vzip.i32 q1,q2 +vld1.8 {d6-d7},[r4,: 128]! +vld1.8 {d8-d9},[r5,: 128]! +vshl.i32 q5,q1,#1 +vzip.i32 q3,q4 +vshl.i32 q6,q2,#1 +vld1.8 {d14},[r4,: 64] +vshl.i32 q8,q3,#1 +vld1.8 {d15},[r5,: 64] +vshl.i32 q9,q4,#1 +vmul.i32 d21,d7,d1 +vtrn.32 d14,d15 +vmul.i32 q11,q4,q0 +vmul.i32 q0,q7,q0 +vmull.s32 q12,d2,d2 +vmlal.s32 q12,d11,d1 +vmlal.s32 q12,d12,d0 +vmlal.s32 q12,d13,d23 +vmlal.s32 q12,d16,d22 +vmlal.s32 q12,d7,d21 +vmull.s32 q10,d2,d11 +vmlal.s32 q10,d4,d1 +vmlal.s32 q10,d13,d0 +vmlal.s32 q10,d6,d23 +vmlal.s32 q10,d17,d22 +vmull.s32 q13,d10,d4 +vmlal.s32 q13,d11,d3 +vmlal.s32 q13,d13,d1 +vmlal.s32 q13,d16,d0 +vmlal.s32 q13,d17,d23 +vmlal.s32 q13,d8,d22 +vmull.s32 q1,d10,d5 +vmlal.s32 q1,d11,d4 +vmlal.s32 q1,d6,d1 +vmlal.s32 q1,d17,d0 +vmlal.s32 q1,d8,d23 +vmull.s32 q14,d10,d6 +vmlal.s32 q14,d11,d13 +vmlal.s32 q14,d4,d4 +vmlal.s32 q14,d17,d1 +vmlal.s32 q14,d18,d0 +vmlal.s32 q14,d9,d23 +vmull.s32 q11,d10,d7 +vmlal.s32 q11,d11,d6 +vmlal.s32 q11,d12,d5 +vmlal.s32 q11,d8,d1 +vmlal.s32 q11,d19,d0 +vmull.s32 q15,d10,d8 +vmlal.s32 q15,d11,d17 +vmlal.s32 q15,d12,d6 +vmlal.s32 q15,d13,d5 +vmlal.s32 q15,d19,d1 +vmlal.s32 q15,d14,d0 +vmull.s32 q2,d10,d9 +vmlal.s32 q2,d11,d8 +vmlal.s32 q2,d12,d7 +vmlal.s32 q2,d13,d6 +vmlal.s32 q2,d14,d1 +vmull.s32 q0,d15,d1 +vmlal.s32 q0,d10,d14 +vmlal.s32 q0,d11,d19 +vmlal.s32 q0,d12,d8 +vmlal.s32 q0,d13,d17 +vmlal.s32 q0,d6,d6 +add r2,sp,#512 +vld1.8 {d18-d19},[r2,: 128] +vmull.s32 q3,d16,d7 +vmlal.s32 q3,d10,d15 +vmlal.s32 q3,d11,d14 +vmlal.s32 q3,d12,d9 +vmlal.s32 q3,d13,d8 +add r2,sp,#528 +vld1.8 {d8-d9},[r2,: 128] +vadd.i64 q5,q12,q9 +vadd.i64 q6,q15,q9 +vshr.s64 q5,q5,#26 +vshr.s64 q6,q6,#26 +vadd.i64 q7,q10,q5 +vshl.i64 q5,q5,#26 +vadd.i64 q8,q7,q4 +vadd.i64 q2,q2,q6 +vshl.i64 q6,q6,#26 +vadd.i64 q10,q2,q4 +vsub.i64 q5,q12,q5 +vshr.s64 q8,q8,#25 +vsub.i64 q6,q15,q6 +vshr.s64 q10,q10,#25 +vadd.i64 q12,q13,q8 +vshl.i64 q8,q8,#25 +vadd.i64 q13,q12,q9 +vadd.i64 q0,q0,q10 +vsub.i64 q7,q7,q8 +vshr.s64 q8,q13,#26 +vshl.i64 q10,q10,#25 +vadd.i64 q13,q0,q9 +vadd.i64 q1,q1,q8 +vshl.i64 q8,q8,#26 +vadd.i64 q15,q1,q4 +vsub.i64 q2,q2,q10 +vshr.s64 q10,q13,#26 +vsub.i64 q8,q12,q8 +vshr.s64 q12,q15,#25 +vadd.i64 q3,q3,q10 +vshl.i64 q10,q10,#26 +vadd.i64 q13,q3,q4 +vadd.i64 q14,q14,q12 +add r2,r3,#288 +vshl.i64 q12,q12,#25 +add r4,r3,#336 +vadd.i64 q15,q14,q9 +add 
r2,r2,#8 +vsub.i64 q0,q0,q10 +add r4,r4,#8 +vshr.s64 q10,q13,#25 +vsub.i64 q1,q1,q12 +vshr.s64 q12,q15,#26 +vadd.i64 q13,q10,q10 +vadd.i64 q11,q11,q12 +vtrn.32 d16,d2 +vshl.i64 q12,q12,#26 +vtrn.32 d17,d3 +vadd.i64 q1,q11,q4 +vadd.i64 q4,q5,q13 +vst1.8 d16,[r2,: 64]! +vshl.i64 q5,q10,#4 +vst1.8 d17,[r4,: 64]! +vsub.i64 q8,q14,q12 +vshr.s64 q1,q1,#25 +vadd.i64 q4,q4,q5 +vadd.i64 q5,q6,q1 +vshl.i64 q1,q1,#25 +vadd.i64 q6,q5,q9 +vadd.i64 q4,q4,q10 +vshl.i64 q10,q10,#25 +vadd.i64 q9,q4,q9 +vsub.i64 q1,q11,q1 +vshr.s64 q6,q6,#26 +vsub.i64 q3,q3,q10 +vtrn.32 d16,d2 +vshr.s64 q9,q9,#26 +vtrn.32 d17,d3 +vadd.i64 q1,q2,q6 +vst1.8 d16,[r2,: 64] +vshl.i64 q2,q6,#26 +vst1.8 d17,[r4,: 64] +vadd.i64 q6,q7,q9 +vtrn.32 d0,d6 +vshl.i64 q7,q9,#26 +vtrn.32 d1,d7 +vsub.i64 q2,q5,q2 +add r2,r2,#16 +vsub.i64 q3,q4,q7 +vst1.8 d0,[r2,: 64] +add r4,r4,#16 +vst1.8 d1,[r4,: 64] +vtrn.32 d4,d2 +vtrn.32 d5,d3 +sub r2,r2,#8 +sub r4,r4,#8 +vtrn.32 d6,d12 +vtrn.32 d7,d13 +vst1.8 d4,[r2,: 64] +vst1.8 d5,[r4,: 64] +sub r2,r2,#24 +sub r4,r4,#24 +vst1.8 d6,[r2,: 64] +vst1.8 d7,[r4,: 64] +add r2,r3,#240 +add r4,r3,#96 +vld1.8 {d0-d1},[r4,: 128]! +vld1.8 {d2-d3},[r4,: 128]! +vld1.8 {d4},[r4,: 64] +add r4,r3,#144 +vld1.8 {d6-d7},[r4,: 128]! +vtrn.32 q0,q3 +vld1.8 {d8-d9},[r4,: 128]! +vshl.i32 q5,q0,#4 +vtrn.32 q1,q4 +vshl.i32 q6,q3,#4 +vadd.i32 q5,q5,q0 +vadd.i32 q6,q6,q3 +vshl.i32 q7,q1,#4 +vld1.8 {d5},[r4,: 64] +vshl.i32 q8,q4,#4 +vtrn.32 d4,d5 +vadd.i32 q7,q7,q1 +vadd.i32 q8,q8,q4 +vld1.8 {d18-d19},[r2,: 128]! +vshl.i32 q10,q2,#4 +vld1.8 {d22-d23},[r2,: 128]! +vadd.i32 q10,q10,q2 +vld1.8 {d24},[r2,: 64] +vadd.i32 q5,q5,q0 +add r2,r3,#192 +vld1.8 {d26-d27},[r2,: 128]! +vadd.i32 q6,q6,q3 +vld1.8 {d28-d29},[r2,: 128]! +vadd.i32 q8,q8,q4 +vld1.8 {d25},[r2,: 64] +vadd.i32 q10,q10,q2 +vtrn.32 q9,q13 +vadd.i32 q7,q7,q1 +vadd.i32 q5,q5,q0 +vtrn.32 q11,q14 +vadd.i32 q6,q6,q3 +add r2,sp,#560 +vadd.i32 q10,q10,q2 +vtrn.32 d24,d25 +vst1.8 {d12-d13},[r2,: 128] +vshl.i32 q6,q13,#1 +add r2,sp,#576 +vst1.8 {d20-d21},[r2,: 128] +vshl.i32 q10,q14,#1 +add r2,sp,#592 +vst1.8 {d12-d13},[r2,: 128] +vshl.i32 q15,q12,#1 +vadd.i32 q8,q8,q4 +vext.32 d10,d31,d30,#0 +vadd.i32 q7,q7,q1 +add r2,sp,#608 +vst1.8 {d16-d17},[r2,: 128] +vmull.s32 q8,d18,d5 +vmlal.s32 q8,d26,d4 +vmlal.s32 q8,d19,d9 +vmlal.s32 q8,d27,d3 +vmlal.s32 q8,d22,d8 +vmlal.s32 q8,d28,d2 +vmlal.s32 q8,d23,d7 +vmlal.s32 q8,d29,d1 +vmlal.s32 q8,d24,d6 +vmlal.s32 q8,d25,d0 +add r2,sp,#624 +vst1.8 {d14-d15},[r2,: 128] +vmull.s32 q2,d18,d4 +vmlal.s32 q2,d12,d9 +vmlal.s32 q2,d13,d8 +vmlal.s32 q2,d19,d3 +vmlal.s32 q2,d22,d2 +vmlal.s32 q2,d23,d1 +vmlal.s32 q2,d24,d0 +add r2,sp,#640 +vst1.8 {d20-d21},[r2,: 128] +vmull.s32 q7,d18,d9 +vmlal.s32 q7,d26,d3 +vmlal.s32 q7,d19,d8 +vmlal.s32 q7,d27,d2 +vmlal.s32 q7,d22,d7 +vmlal.s32 q7,d28,d1 +vmlal.s32 q7,d23,d6 +vmlal.s32 q7,d29,d0 +add r2,sp,#656 +vst1.8 {d10-d11},[r2,: 128] +vmull.s32 q5,d18,d3 +vmlal.s32 q5,d19,d2 +vmlal.s32 q5,d22,d1 +vmlal.s32 q5,d23,d0 +vmlal.s32 q5,d12,d8 +add r2,sp,#672 +vst1.8 {d16-d17},[r2,: 128] +vmull.s32 q4,d18,d8 +vmlal.s32 q4,d26,d2 +vmlal.s32 q4,d19,d7 +vmlal.s32 q4,d27,d1 +vmlal.s32 q4,d22,d6 +vmlal.s32 q4,d28,d0 +vmull.s32 q8,d18,d7 +vmlal.s32 q8,d26,d1 +vmlal.s32 q8,d19,d6 +vmlal.s32 q8,d27,d0 +add r2,sp,#576 +vld1.8 {d20-d21},[r2,: 128] +vmlal.s32 q7,d24,d21 +vmlal.s32 q7,d25,d20 +vmlal.s32 q4,d23,d21 +vmlal.s32 q4,d29,d20 +vmlal.s32 q8,d22,d21 +vmlal.s32 q8,d28,d20 +vmlal.s32 q5,d24,d20 +add r2,sp,#576 +vst1.8 {d14-d15},[r2,: 128] +vmull.s32 q7,d18,d6 +vmlal.s32 q7,d26,d0 +add r2,sp,#656 +vld1.8 {d30-d31},[r2,: 
128] +vmlal.s32 q2,d30,d21 +vmlal.s32 q7,d19,d21 +vmlal.s32 q7,d27,d20 +add r2,sp,#624 +vld1.8 {d26-d27},[r2,: 128] +vmlal.s32 q4,d25,d27 +vmlal.s32 q8,d29,d27 +vmlal.s32 q8,d25,d26 +vmlal.s32 q7,d28,d27 +vmlal.s32 q7,d29,d26 +add r2,sp,#608 +vld1.8 {d28-d29},[r2,: 128] +vmlal.s32 q4,d24,d29 +vmlal.s32 q8,d23,d29 +vmlal.s32 q8,d24,d28 +vmlal.s32 q7,d22,d29 +vmlal.s32 q7,d23,d28 +add r2,sp,#608 +vst1.8 {d8-d9},[r2,: 128] +add r2,sp,#560 +vld1.8 {d8-d9},[r2,: 128] +vmlal.s32 q7,d24,d9 +vmlal.s32 q7,d25,d31 +vmull.s32 q1,d18,d2 +vmlal.s32 q1,d19,d1 +vmlal.s32 q1,d22,d0 +vmlal.s32 q1,d24,d27 +vmlal.s32 q1,d23,d20 +vmlal.s32 q1,d12,d7 +vmlal.s32 q1,d13,d6 +vmull.s32 q6,d18,d1 +vmlal.s32 q6,d19,d0 +vmlal.s32 q6,d23,d27 +vmlal.s32 q6,d22,d20 +vmlal.s32 q6,d24,d26 +vmull.s32 q0,d18,d0 +vmlal.s32 q0,d22,d27 +vmlal.s32 q0,d23,d26 +vmlal.s32 q0,d24,d31 +vmlal.s32 q0,d19,d20 +add r2,sp,#640 +vld1.8 {d18-d19},[r2,: 128] +vmlal.s32 q2,d18,d7 +vmlal.s32 q2,d19,d6 +vmlal.s32 q5,d18,d6 +vmlal.s32 q5,d19,d21 +vmlal.s32 q1,d18,d21 +vmlal.s32 q1,d19,d29 +vmlal.s32 q0,d18,d28 +vmlal.s32 q0,d19,d9 +vmlal.s32 q6,d18,d29 +vmlal.s32 q6,d19,d28 +add r2,sp,#592 +vld1.8 {d18-d19},[r2,: 128] +add r2,sp,#512 +vld1.8 {d22-d23},[r2,: 128] +vmlal.s32 q5,d19,d7 +vmlal.s32 q0,d18,d21 +vmlal.s32 q0,d19,d29 +vmlal.s32 q6,d18,d6 +add r2,sp,#528 +vld1.8 {d6-d7},[r2,: 128] +vmlal.s32 q6,d19,d21 +add r2,sp,#576 +vld1.8 {d18-d19},[r2,: 128] +vmlal.s32 q0,d30,d8 +add r2,sp,#672 +vld1.8 {d20-d21},[r2,: 128] +vmlal.s32 q5,d30,d29 +add r2,sp,#608 +vld1.8 {d24-d25},[r2,: 128] +vmlal.s32 q1,d30,d28 +vadd.i64 q13,q0,q11 +vadd.i64 q14,q5,q11 +vmlal.s32 q6,d30,d9 +vshr.s64 q4,q13,#26 +vshr.s64 q13,q14,#26 +vadd.i64 q7,q7,q4 +vshl.i64 q4,q4,#26 +vadd.i64 q14,q7,q3 +vadd.i64 q9,q9,q13 +vshl.i64 q13,q13,#26 +vadd.i64 q15,q9,q3 +vsub.i64 q0,q0,q4 +vshr.s64 q4,q14,#25 +vsub.i64 q5,q5,q13 +vshr.s64 q13,q15,#25 +vadd.i64 q6,q6,q4 +vshl.i64 q4,q4,#25 +vadd.i64 q14,q6,q11 +vadd.i64 q2,q2,q13 +vsub.i64 q4,q7,q4 +vshr.s64 q7,q14,#26 +vshl.i64 q13,q13,#25 +vadd.i64 q14,q2,q11 +vadd.i64 q8,q8,q7 +vshl.i64 q7,q7,#26 +vadd.i64 q15,q8,q3 +vsub.i64 q9,q9,q13 +vshr.s64 q13,q14,#26 +vsub.i64 q6,q6,q7 +vshr.s64 q7,q15,#25 +vadd.i64 q10,q10,q13 +vshl.i64 q13,q13,#26 +vadd.i64 q14,q10,q3 +vadd.i64 q1,q1,q7 +add r2,r3,#144 +vshl.i64 q7,q7,#25 +add r4,r3,#96 +vadd.i64 q15,q1,q11 +add r2,r2,#8 +vsub.i64 q2,q2,q13 +add r4,r4,#8 +vshr.s64 q13,q14,#25 +vsub.i64 q7,q8,q7 +vshr.s64 q8,q15,#26 +vadd.i64 q14,q13,q13 +vadd.i64 q12,q12,q8 +vtrn.32 d12,d14 +vshl.i64 q8,q8,#26 +vtrn.32 d13,d15 +vadd.i64 q3,q12,q3 +vadd.i64 q0,q0,q14 +vst1.8 d12,[r2,: 64]! +vshl.i64 q7,q13,#4 +vst1.8 d13,[r4,: 64]! +vsub.i64 q1,q1,q8 +vshr.s64 q3,q3,#25 +vadd.i64 q0,q0,q7 +vadd.i64 q5,q5,q3 +vshl.i64 q3,q3,#25 +vadd.i64 q6,q5,q11 +vadd.i64 q0,q0,q13 +vshl.i64 q7,q13,#25 +vadd.i64 q8,q0,q11 +vsub.i64 q3,q12,q3 +vshr.s64 q6,q6,#26 +vsub.i64 q7,q10,q7 +vtrn.32 d2,d6 +vshr.s64 q8,q8,#26 +vtrn.32 d3,d7 +vadd.i64 q3,q9,q6 +vst1.8 d2,[r2,: 64] +vshl.i64 q6,q6,#26 +vst1.8 d3,[r4,: 64] +vadd.i64 q1,q4,q8 +vtrn.32 d4,d14 +vshl.i64 q4,q8,#26 +vtrn.32 d5,d15 +vsub.i64 q5,q5,q6 +add r2,r2,#16 +vsub.i64 q0,q0,q4 +vst1.8 d4,[r2,: 64] +add r4,r4,#16 +vst1.8 d5,[r4,: 64] +vtrn.32 d10,d6 +vtrn.32 d11,d7 +sub r2,r2,#8 +sub r4,r4,#8 +vtrn.32 d0,d2 +vtrn.32 d1,d3 +vst1.8 d10,[r2,: 64] +vst1.8 d11,[r4,: 64] +sub r2,r2,#24 +sub r4,r4,#24 +vst1.8 d0,[r2,: 64] +vst1.8 d1,[r4,: 64] +add r2,r3,#288 +add r4,r3,#336 +vld1.8 {d0-d1},[r2,: 128]! +vld1.8 {d2-d3},[r4,: 128]! +vsub.i32 q0,q0,q1 +vld1.8 {d2-d3},[r2,: 128]! 
+vld1.8 {d4-d5},[r4,: 128]! +vsub.i32 q1,q1,q2 +add r5,r3,#240 +vld1.8 {d4},[r2,: 64] +vld1.8 {d6},[r4,: 64] +vsub.i32 q2,q2,q3 +vst1.8 {d0-d1},[r5,: 128]! +vst1.8 {d2-d3},[r5,: 128]! +vst1.8 d4,[r5,: 64] +add r2,r3,#144 +add r4,r3,#96 +add r5,r3,#144 +add r6,r3,#192 +vld1.8 {d0-d1},[r2,: 128]! +vld1.8 {d2-d3},[r4,: 128]! +vsub.i32 q2,q0,q1 +vadd.i32 q0,q0,q1 +vld1.8 {d2-d3},[r2,: 128]! +vld1.8 {d6-d7},[r4,: 128]! +vsub.i32 q4,q1,q3 +vadd.i32 q1,q1,q3 +vld1.8 {d6},[r2,: 64] +vld1.8 {d10},[r4,: 64] +vsub.i32 q6,q3,q5 +vadd.i32 q3,q3,q5 +vst1.8 {d4-d5},[r5,: 128]! +vst1.8 {d0-d1},[r6,: 128]! +vst1.8 {d8-d9},[r5,: 128]! +vst1.8 {d2-d3},[r6,: 128]! +vst1.8 d12,[r5,: 64] +vst1.8 d6,[r6,: 64] +add r2,r3,#0 +add r4,r3,#240 +vld1.8 {d0-d1},[r4,: 128]! +vld1.8 {d2-d3},[r4,: 128]! +vld1.8 {d4},[r4,: 64] +add r4,r3,#336 +vld1.8 {d6-d7},[r4,: 128]! +vtrn.32 q0,q3 +vld1.8 {d8-d9},[r4,: 128]! +vshl.i32 q5,q0,#4 +vtrn.32 q1,q4 +vshl.i32 q6,q3,#4 +vadd.i32 q5,q5,q0 +vadd.i32 q6,q6,q3 +vshl.i32 q7,q1,#4 +vld1.8 {d5},[r4,: 64] +vshl.i32 q8,q4,#4 +vtrn.32 d4,d5 +vadd.i32 q7,q7,q1 +vadd.i32 q8,q8,q4 +vld1.8 {d18-d19},[r2,: 128]! +vshl.i32 q10,q2,#4 +vld1.8 {d22-d23},[r2,: 128]! +vadd.i32 q10,q10,q2 +vld1.8 {d24},[r2,: 64] +vadd.i32 q5,q5,q0 +add r2,r3,#288 +vld1.8 {d26-d27},[r2,: 128]! +vadd.i32 q6,q6,q3 +vld1.8 {d28-d29},[r2,: 128]! +vadd.i32 q8,q8,q4 +vld1.8 {d25},[r2,: 64] +vadd.i32 q10,q10,q2 +vtrn.32 q9,q13 +vadd.i32 q7,q7,q1 +vadd.i32 q5,q5,q0 +vtrn.32 q11,q14 +vadd.i32 q6,q6,q3 +add r2,sp,#560 +vadd.i32 q10,q10,q2 +vtrn.32 d24,d25 +vst1.8 {d12-d13},[r2,: 128] +vshl.i32 q6,q13,#1 +add r2,sp,#576 +vst1.8 {d20-d21},[r2,: 128] +vshl.i32 q10,q14,#1 +add r2,sp,#592 +vst1.8 {d12-d13},[r2,: 128] +vshl.i32 q15,q12,#1 +vadd.i32 q8,q8,q4 +vext.32 d10,d31,d30,#0 +vadd.i32 q7,q7,q1 +add r2,sp,#608 +vst1.8 {d16-d17},[r2,: 128] +vmull.s32 q8,d18,d5 +vmlal.s32 q8,d26,d4 +vmlal.s32 q8,d19,d9 +vmlal.s32 q8,d27,d3 +vmlal.s32 q8,d22,d8 +vmlal.s32 q8,d28,d2 +vmlal.s32 q8,d23,d7 +vmlal.s32 q8,d29,d1 +vmlal.s32 q8,d24,d6 +vmlal.s32 q8,d25,d0 +add r2,sp,#624 +vst1.8 {d14-d15},[r2,: 128] +vmull.s32 q2,d18,d4 +vmlal.s32 q2,d12,d9 +vmlal.s32 q2,d13,d8 +vmlal.s32 q2,d19,d3 +vmlal.s32 q2,d22,d2 +vmlal.s32 q2,d23,d1 +vmlal.s32 q2,d24,d0 +add r2,sp,#640 +vst1.8 {d20-d21},[r2,: 128] +vmull.s32 q7,d18,d9 +vmlal.s32 q7,d26,d3 +vmlal.s32 q7,d19,d8 +vmlal.s32 q7,d27,d2 +vmlal.s32 q7,d22,d7 +vmlal.s32 q7,d28,d1 +vmlal.s32 q7,d23,d6 +vmlal.s32 q7,d29,d0 +add r2,sp,#656 +vst1.8 {d10-d11},[r2,: 128] +vmull.s32 q5,d18,d3 +vmlal.s32 q5,d19,d2 +vmlal.s32 q5,d22,d1 +vmlal.s32 q5,d23,d0 +vmlal.s32 q5,d12,d8 +add r2,sp,#672 +vst1.8 {d16-d17},[r2,: 128] +vmull.s32 q4,d18,d8 +vmlal.s32 q4,d26,d2 +vmlal.s32 q4,d19,d7 +vmlal.s32 q4,d27,d1 +vmlal.s32 q4,d22,d6 +vmlal.s32 q4,d28,d0 +vmull.s32 q8,d18,d7 +vmlal.s32 q8,d26,d1 +vmlal.s32 q8,d19,d6 +vmlal.s32 q8,d27,d0 +add r2,sp,#576 +vld1.8 {d20-d21},[r2,: 128] +vmlal.s32 q7,d24,d21 +vmlal.s32 q7,d25,d20 +vmlal.s32 q4,d23,d21 +vmlal.s32 q4,d29,d20 +vmlal.s32 q8,d22,d21 +vmlal.s32 q8,d28,d20 +vmlal.s32 q5,d24,d20 +add r2,sp,#576 +vst1.8 {d14-d15},[r2,: 128] +vmull.s32 q7,d18,d6 +vmlal.s32 q7,d26,d0 +add r2,sp,#656 +vld1.8 {d30-d31},[r2,: 128] +vmlal.s32 q2,d30,d21 +vmlal.s32 q7,d19,d21 +vmlal.s32 q7,d27,d20 +add r2,sp,#624 +vld1.8 {d26-d27},[r2,: 128] +vmlal.s32 q4,d25,d27 +vmlal.s32 q8,d29,d27 +vmlal.s32 q8,d25,d26 +vmlal.s32 q7,d28,d27 +vmlal.s32 q7,d29,d26 +add r2,sp,#608 +vld1.8 {d28-d29},[r2,: 128] +vmlal.s32 q4,d24,d29 +vmlal.s32 q8,d23,d29 +vmlal.s32 q8,d24,d28 +vmlal.s32 q7,d22,d29 +vmlal.s32 
q7,d23,d28 +add r2,sp,#608 +vst1.8 {d8-d9},[r2,: 128] +add r2,sp,#560 +vld1.8 {d8-d9},[r2,: 128] +vmlal.s32 q7,d24,d9 +vmlal.s32 q7,d25,d31 +vmull.s32 q1,d18,d2 +vmlal.s32 q1,d19,d1 +vmlal.s32 q1,d22,d0 +vmlal.s32 q1,d24,d27 +vmlal.s32 q1,d23,d20 +vmlal.s32 q1,d12,d7 +vmlal.s32 q1,d13,d6 +vmull.s32 q6,d18,d1 +vmlal.s32 q6,d19,d0 +vmlal.s32 q6,d23,d27 +vmlal.s32 q6,d22,d20 +vmlal.s32 q6,d24,d26 +vmull.s32 q0,d18,d0 +vmlal.s32 q0,d22,d27 +vmlal.s32 q0,d23,d26 +vmlal.s32 q0,d24,d31 +vmlal.s32 q0,d19,d20 +add r2,sp,#640 +vld1.8 {d18-d19},[r2,: 128] +vmlal.s32 q2,d18,d7 +vmlal.s32 q2,d19,d6 +vmlal.s32 q5,d18,d6 +vmlal.s32 q5,d19,d21 +vmlal.s32 q1,d18,d21 +vmlal.s32 q1,d19,d29 +vmlal.s32 q0,d18,d28 +vmlal.s32 q0,d19,d9 +vmlal.s32 q6,d18,d29 +vmlal.s32 q6,d19,d28 +add r2,sp,#592 +vld1.8 {d18-d19},[r2,: 128] +add r2,sp,#512 +vld1.8 {d22-d23},[r2,: 128] +vmlal.s32 q5,d19,d7 +vmlal.s32 q0,d18,d21 +vmlal.s32 q0,d19,d29 +vmlal.s32 q6,d18,d6 +add r2,sp,#528 +vld1.8 {d6-d7},[r2,: 128] +vmlal.s32 q6,d19,d21 +add r2,sp,#576 +vld1.8 {d18-d19},[r2,: 128] +vmlal.s32 q0,d30,d8 +add r2,sp,#672 +vld1.8 {d20-d21},[r2,: 128] +vmlal.s32 q5,d30,d29 +add r2,sp,#608 +vld1.8 {d24-d25},[r2,: 128] +vmlal.s32 q1,d30,d28 +vadd.i64 q13,q0,q11 +vadd.i64 q14,q5,q11 +vmlal.s32 q6,d30,d9 +vshr.s64 q4,q13,#26 +vshr.s64 q13,q14,#26 +vadd.i64 q7,q7,q4 +vshl.i64 q4,q4,#26 +vadd.i64 q14,q7,q3 +vadd.i64 q9,q9,q13 +vshl.i64 q13,q13,#26 +vadd.i64 q15,q9,q3 +vsub.i64 q0,q0,q4 +vshr.s64 q4,q14,#25 +vsub.i64 q5,q5,q13 +vshr.s64 q13,q15,#25 +vadd.i64 q6,q6,q4 +vshl.i64 q4,q4,#25 +vadd.i64 q14,q6,q11 +vadd.i64 q2,q2,q13 +vsub.i64 q4,q7,q4 +vshr.s64 q7,q14,#26 +vshl.i64 q13,q13,#25 +vadd.i64 q14,q2,q11 +vadd.i64 q8,q8,q7 +vshl.i64 q7,q7,#26 +vadd.i64 q15,q8,q3 +vsub.i64 q9,q9,q13 +vshr.s64 q13,q14,#26 +vsub.i64 q6,q6,q7 +vshr.s64 q7,q15,#25 +vadd.i64 q10,q10,q13 +vshl.i64 q13,q13,#26 +vadd.i64 q14,q10,q3 +vadd.i64 q1,q1,q7 +add r2,r3,#288 +vshl.i64 q7,q7,#25 +add r4,r3,#96 +vadd.i64 q15,q1,q11 +add r2,r2,#8 +vsub.i64 q2,q2,q13 +add r4,r4,#8 +vshr.s64 q13,q14,#25 +vsub.i64 q7,q8,q7 +vshr.s64 q8,q15,#26 +vadd.i64 q14,q13,q13 +vadd.i64 q12,q12,q8 +vtrn.32 d12,d14 +vshl.i64 q8,q8,#26 +vtrn.32 d13,d15 +vadd.i64 q3,q12,q3 +vadd.i64 q0,q0,q14 +vst1.8 d12,[r2,: 64]! +vshl.i64 q7,q13,#4 +vst1.8 d13,[r4,: 64]! +vsub.i64 q1,q1,q8 +vshr.s64 q3,q3,#25 +vadd.i64 q0,q0,q7 +vadd.i64 q5,q5,q3 +vshl.i64 q3,q3,#25 +vadd.i64 q6,q5,q11 +vadd.i64 q0,q0,q13 +vshl.i64 q7,q13,#25 +vadd.i64 q8,q0,q11 +vsub.i64 q3,q12,q3 +vshr.s64 q6,q6,#26 +vsub.i64 q7,q10,q7 +vtrn.32 d2,d6 +vshr.s64 q8,q8,#26 +vtrn.32 d3,d7 +vadd.i64 q3,q9,q6 +vst1.8 d2,[r2,: 64] +vshl.i64 q6,q6,#26 +vst1.8 d3,[r4,: 64] +vadd.i64 q1,q4,q8 +vtrn.32 d4,d14 +vshl.i64 q4,q8,#26 +vtrn.32 d5,d15 +vsub.i64 q5,q5,q6 +add r2,r2,#16 +vsub.i64 q0,q0,q4 +vst1.8 d4,[r2,: 64] +add r4,r4,#16 +vst1.8 d5,[r4,: 64] +vtrn.32 d10,d6 +vtrn.32 d11,d7 +sub r2,r2,#8 +sub r4,r4,#8 +vtrn.32 d0,d2 +vtrn.32 d1,d3 +vst1.8 d10,[r2,: 64] +vst1.8 d11,[r4,: 64] +sub r2,r2,#24 +sub r4,r4,#24 +vst1.8 d0,[r2,: 64] +vst1.8 d1,[r4,: 64] +add r2,sp,#544 +add r4,r3,#144 +add r5,r3,#192 +vld1.8 {d0-d1},[r2,: 128] +vld1.8 {d2-d3},[r4,: 128]! +vld1.8 {d4-d5},[r5,: 128]! +vzip.i32 q1,q2 +vld1.8 {d6-d7},[r4,: 128]! +vld1.8 {d8-d9},[r5,: 128]! 
+vshl.i32 q5,q1,#1 +vzip.i32 q3,q4 +vshl.i32 q6,q2,#1 +vld1.8 {d14},[r4,: 64] +vshl.i32 q8,q3,#1 +vld1.8 {d15},[r5,: 64] +vshl.i32 q9,q4,#1 +vmul.i32 d21,d7,d1 +vtrn.32 d14,d15 +vmul.i32 q11,q4,q0 +vmul.i32 q0,q7,q0 +vmull.s32 q12,d2,d2 +vmlal.s32 q12,d11,d1 +vmlal.s32 q12,d12,d0 +vmlal.s32 q12,d13,d23 +vmlal.s32 q12,d16,d22 +vmlal.s32 q12,d7,d21 +vmull.s32 q10,d2,d11 +vmlal.s32 q10,d4,d1 +vmlal.s32 q10,d13,d0 +vmlal.s32 q10,d6,d23 +vmlal.s32 q10,d17,d22 +vmull.s32 q13,d10,d4 +vmlal.s32 q13,d11,d3 +vmlal.s32 q13,d13,d1 +vmlal.s32 q13,d16,d0 +vmlal.s32 q13,d17,d23 +vmlal.s32 q13,d8,d22 +vmull.s32 q1,d10,d5 +vmlal.s32 q1,d11,d4 +vmlal.s32 q1,d6,d1 +vmlal.s32 q1,d17,d0 +vmlal.s32 q1,d8,d23 +vmull.s32 q14,d10,d6 +vmlal.s32 q14,d11,d13 +vmlal.s32 q14,d4,d4 +vmlal.s32 q14,d17,d1 +vmlal.s32 q14,d18,d0 +vmlal.s32 q14,d9,d23 +vmull.s32 q11,d10,d7 +vmlal.s32 q11,d11,d6 +vmlal.s32 q11,d12,d5 +vmlal.s32 q11,d8,d1 +vmlal.s32 q11,d19,d0 +vmull.s32 q15,d10,d8 +vmlal.s32 q15,d11,d17 +vmlal.s32 q15,d12,d6 +vmlal.s32 q15,d13,d5 +vmlal.s32 q15,d19,d1 +vmlal.s32 q15,d14,d0 +vmull.s32 q2,d10,d9 +vmlal.s32 q2,d11,d8 +vmlal.s32 q2,d12,d7 +vmlal.s32 q2,d13,d6 +vmlal.s32 q2,d14,d1 +vmull.s32 q0,d15,d1 +vmlal.s32 q0,d10,d14 +vmlal.s32 q0,d11,d19 +vmlal.s32 q0,d12,d8 +vmlal.s32 q0,d13,d17 +vmlal.s32 q0,d6,d6 +add r2,sp,#512 +vld1.8 {d18-d19},[r2,: 128] +vmull.s32 q3,d16,d7 +vmlal.s32 q3,d10,d15 +vmlal.s32 q3,d11,d14 +vmlal.s32 q3,d12,d9 +vmlal.s32 q3,d13,d8 +add r2,sp,#528 +vld1.8 {d8-d9},[r2,: 128] +vadd.i64 q5,q12,q9 +vadd.i64 q6,q15,q9 +vshr.s64 q5,q5,#26 +vshr.s64 q6,q6,#26 +vadd.i64 q7,q10,q5 +vshl.i64 q5,q5,#26 +vadd.i64 q8,q7,q4 +vadd.i64 q2,q2,q6 +vshl.i64 q6,q6,#26 +vadd.i64 q10,q2,q4 +vsub.i64 q5,q12,q5 +vshr.s64 q8,q8,#25 +vsub.i64 q6,q15,q6 +vshr.s64 q10,q10,#25 +vadd.i64 q12,q13,q8 +vshl.i64 q8,q8,#25 +vadd.i64 q13,q12,q9 +vadd.i64 q0,q0,q10 +vsub.i64 q7,q7,q8 +vshr.s64 q8,q13,#26 +vshl.i64 q10,q10,#25 +vadd.i64 q13,q0,q9 +vadd.i64 q1,q1,q8 +vshl.i64 q8,q8,#26 +vadd.i64 q15,q1,q4 +vsub.i64 q2,q2,q10 +vshr.s64 q10,q13,#26 +vsub.i64 q8,q12,q8 +vshr.s64 q12,q15,#25 +vadd.i64 q3,q3,q10 +vshl.i64 q10,q10,#26 +vadd.i64 q13,q3,q4 +vadd.i64 q14,q14,q12 +add r2,r3,#144 +vshl.i64 q12,q12,#25 +add r4,r3,#192 +vadd.i64 q15,q14,q9 +add r2,r2,#8 +vsub.i64 q0,q0,q10 +add r4,r4,#8 +vshr.s64 q10,q13,#25 +vsub.i64 q1,q1,q12 +vshr.s64 q12,q15,#26 +vadd.i64 q13,q10,q10 +vadd.i64 q11,q11,q12 +vtrn.32 d16,d2 +vshl.i64 q12,q12,#26 +vtrn.32 d17,d3 +vadd.i64 q1,q11,q4 +vadd.i64 q4,q5,q13 +vst1.8 d16,[r2,: 64]! +vshl.i64 q5,q10,#4 +vst1.8 d17,[r4,: 64]! +vsub.i64 q8,q14,q12 +vshr.s64 q1,q1,#25 +vadd.i64 q4,q4,q5 +vadd.i64 q5,q6,q1 +vshl.i64 q1,q1,#25 +vadd.i64 q6,q5,q9 +vadd.i64 q4,q4,q10 +vshl.i64 q10,q10,#25 +vadd.i64 q9,q4,q9 +vsub.i64 q1,q11,q1 +vshr.s64 q6,q6,#26 +vsub.i64 q3,q3,q10 +vtrn.32 d16,d2 +vshr.s64 q9,q9,#26 +vtrn.32 d17,d3 +vadd.i64 q1,q2,q6 +vst1.8 d16,[r2,: 64] +vshl.i64 q2,q6,#26 +vst1.8 d17,[r4,: 64] +vadd.i64 q6,q7,q9 +vtrn.32 d0,d6 +vshl.i64 q7,q9,#26 +vtrn.32 d1,d7 +vsub.i64 q2,q5,q2 +add r2,r2,#16 +vsub.i64 q3,q4,q7 +vst1.8 d0,[r2,: 64] +add r4,r4,#16 +vst1.8 d1,[r4,: 64] +vtrn.32 d4,d2 +vtrn.32 d5,d3 +sub r2,r2,#8 +sub r4,r4,#8 +vtrn.32 d6,d12 +vtrn.32 d7,d13 +vst1.8 d4,[r2,: 64] +vst1.8 d5,[r4,: 64] +sub r2,r2,#24 +sub r4,r4,#24 +vst1.8 d6,[r2,: 64] +vst1.8 d7,[r4,: 64] +add r2,r3,#336 +add r4,r3,#288 +vld1.8 {d0-d1},[r2,: 128]! +vld1.8 {d2-d3},[r4,: 128]! +vadd.i32 q0,q0,q1 +vld1.8 {d2-d3},[r2,: 128]! +vld1.8 {d4-d5},[r4,: 128]! 
+vadd.i32 q1,q1,q2 +add r5,r3,#288 +vld1.8 {d4},[r2,: 64] +vld1.8 {d6},[r4,: 64] +vadd.i32 q2,q2,q3 +vst1.8 {d0-d1},[r5,: 128]! +vst1.8 {d2-d3},[r5,: 128]! +vst1.8 d4,[r5,: 64] +add r2,r3,#48 +add r4,r3,#144 +vld1.8 {d0-d1},[r4,: 128]! +vld1.8 {d2-d3},[r4,: 128]! +vld1.8 {d4},[r4,: 64] +add r4,r3,#288 +vld1.8 {d6-d7},[r4,: 128]! +vtrn.32 q0,q3 +vld1.8 {d8-d9},[r4,: 128]! +vshl.i32 q5,q0,#4 +vtrn.32 q1,q4 +vshl.i32 q6,q3,#4 +vadd.i32 q5,q5,q0 +vadd.i32 q6,q6,q3 +vshl.i32 q7,q1,#4 +vld1.8 {d5},[r4,: 64] +vshl.i32 q8,q4,#4 +vtrn.32 d4,d5 +vadd.i32 q7,q7,q1 +vadd.i32 q8,q8,q4 +vld1.8 {d18-d19},[r2,: 128]! +vshl.i32 q10,q2,#4 +vld1.8 {d22-d23},[r2,: 128]! +vadd.i32 q10,q10,q2 +vld1.8 {d24},[r2,: 64] +vadd.i32 q5,q5,q0 +add r2,r3,#240 +vld1.8 {d26-d27},[r2,: 128]! +vadd.i32 q6,q6,q3 +vld1.8 {d28-d29},[r2,: 128]! +vadd.i32 q8,q8,q4 +vld1.8 {d25},[r2,: 64] +vadd.i32 q10,q10,q2 +vtrn.32 q9,q13 +vadd.i32 q7,q7,q1 +vadd.i32 q5,q5,q0 +vtrn.32 q11,q14 +vadd.i32 q6,q6,q3 +add r2,sp,#560 +vadd.i32 q10,q10,q2 +vtrn.32 d24,d25 +vst1.8 {d12-d13},[r2,: 128] +vshl.i32 q6,q13,#1 +add r2,sp,#576 +vst1.8 {d20-d21},[r2,: 128] +vshl.i32 q10,q14,#1 +add r2,sp,#592 +vst1.8 {d12-d13},[r2,: 128] +vshl.i32 q15,q12,#1 +vadd.i32 q8,q8,q4 +vext.32 d10,d31,d30,#0 +vadd.i32 q7,q7,q1 +add r2,sp,#608 +vst1.8 {d16-d17},[r2,: 128] +vmull.s32 q8,d18,d5 +vmlal.s32 q8,d26,d4 +vmlal.s32 q8,d19,d9 +vmlal.s32 q8,d27,d3 +vmlal.s32 q8,d22,d8 +vmlal.s32 q8,d28,d2 +vmlal.s32 q8,d23,d7 +vmlal.s32 q8,d29,d1 +vmlal.s32 q8,d24,d6 +vmlal.s32 q8,d25,d0 +add r2,sp,#624 +vst1.8 {d14-d15},[r2,: 128] +vmull.s32 q2,d18,d4 +vmlal.s32 q2,d12,d9 +vmlal.s32 q2,d13,d8 +vmlal.s32 q2,d19,d3 +vmlal.s32 q2,d22,d2 +vmlal.s32 q2,d23,d1 +vmlal.s32 q2,d24,d0 +add r2,sp,#640 +vst1.8 {d20-d21},[r2,: 128] +vmull.s32 q7,d18,d9 +vmlal.s32 q7,d26,d3 +vmlal.s32 q7,d19,d8 +vmlal.s32 q7,d27,d2 +vmlal.s32 q7,d22,d7 +vmlal.s32 q7,d28,d1 +vmlal.s32 q7,d23,d6 +vmlal.s32 q7,d29,d0 +add r2,sp,#656 +vst1.8 {d10-d11},[r2,: 128] +vmull.s32 q5,d18,d3 +vmlal.s32 q5,d19,d2 +vmlal.s32 q5,d22,d1 +vmlal.s32 q5,d23,d0 +vmlal.s32 q5,d12,d8 +add r2,sp,#672 +vst1.8 {d16-d17},[r2,: 128] +vmull.s32 q4,d18,d8 +vmlal.s32 q4,d26,d2 +vmlal.s32 q4,d19,d7 +vmlal.s32 q4,d27,d1 +vmlal.s32 q4,d22,d6 +vmlal.s32 q4,d28,d0 +vmull.s32 q8,d18,d7 +vmlal.s32 q8,d26,d1 +vmlal.s32 q8,d19,d6 +vmlal.s32 q8,d27,d0 +add r2,sp,#576 +vld1.8 {d20-d21},[r2,: 128] +vmlal.s32 q7,d24,d21 +vmlal.s32 q7,d25,d20 +vmlal.s32 q4,d23,d21 +vmlal.s32 q4,d29,d20 +vmlal.s32 q8,d22,d21 +vmlal.s32 q8,d28,d20 +vmlal.s32 q5,d24,d20 +add r2,sp,#576 +vst1.8 {d14-d15},[r2,: 128] +vmull.s32 q7,d18,d6 +vmlal.s32 q7,d26,d0 +add r2,sp,#656 +vld1.8 {d30-d31},[r2,: 128] +vmlal.s32 q2,d30,d21 +vmlal.s32 q7,d19,d21 +vmlal.s32 q7,d27,d20 +add r2,sp,#624 +vld1.8 {d26-d27},[r2,: 128] +vmlal.s32 q4,d25,d27 +vmlal.s32 q8,d29,d27 +vmlal.s32 q8,d25,d26 +vmlal.s32 q7,d28,d27 +vmlal.s32 q7,d29,d26 +add r2,sp,#608 +vld1.8 {d28-d29},[r2,: 128] +vmlal.s32 q4,d24,d29 +vmlal.s32 q8,d23,d29 +vmlal.s32 q8,d24,d28 +vmlal.s32 q7,d22,d29 +vmlal.s32 q7,d23,d28 +add r2,sp,#608 +vst1.8 {d8-d9},[r2,: 128] +add r2,sp,#560 +vld1.8 {d8-d9},[r2,: 128] +vmlal.s32 q7,d24,d9 +vmlal.s32 q7,d25,d31 +vmull.s32 q1,d18,d2 +vmlal.s32 q1,d19,d1 +vmlal.s32 q1,d22,d0 +vmlal.s32 q1,d24,d27 +vmlal.s32 q1,d23,d20 +vmlal.s32 q1,d12,d7 +vmlal.s32 q1,d13,d6 +vmull.s32 q6,d18,d1 +vmlal.s32 q6,d19,d0 +vmlal.s32 q6,d23,d27 +vmlal.s32 q6,d22,d20 +vmlal.s32 q6,d24,d26 +vmull.s32 q0,d18,d0 +vmlal.s32 q0,d22,d27 +vmlal.s32 q0,d23,d26 +vmlal.s32 q0,d24,d31 +vmlal.s32 q0,d19,d20 +add r2,sp,#640 
+vld1.8 {d18-d19},[r2,: 128] +vmlal.s32 q2,d18,d7 +vmlal.s32 q2,d19,d6 +vmlal.s32 q5,d18,d6 +vmlal.s32 q5,d19,d21 +vmlal.s32 q1,d18,d21 +vmlal.s32 q1,d19,d29 +vmlal.s32 q0,d18,d28 +vmlal.s32 q0,d19,d9 +vmlal.s32 q6,d18,d29 +vmlal.s32 q6,d19,d28 +add r2,sp,#592 +vld1.8 {d18-d19},[r2,: 128] +add r2,sp,#512 +vld1.8 {d22-d23},[r2,: 128] +vmlal.s32 q5,d19,d7 +vmlal.s32 q0,d18,d21 +vmlal.s32 q0,d19,d29 +vmlal.s32 q6,d18,d6 +add r2,sp,#528 +vld1.8 {d6-d7},[r2,: 128] +vmlal.s32 q6,d19,d21 +add r2,sp,#576 +vld1.8 {d18-d19},[r2,: 128] +vmlal.s32 q0,d30,d8 +add r2,sp,#672 +vld1.8 {d20-d21},[r2,: 128] +vmlal.s32 q5,d30,d29 +add r2,sp,#608 +vld1.8 {d24-d25},[r2,: 128] +vmlal.s32 q1,d30,d28 +vadd.i64 q13,q0,q11 +vadd.i64 q14,q5,q11 +vmlal.s32 q6,d30,d9 +vshr.s64 q4,q13,#26 +vshr.s64 q13,q14,#26 +vadd.i64 q7,q7,q4 +vshl.i64 q4,q4,#26 +vadd.i64 q14,q7,q3 +vadd.i64 q9,q9,q13 +vshl.i64 q13,q13,#26 +vadd.i64 q15,q9,q3 +vsub.i64 q0,q0,q4 +vshr.s64 q4,q14,#25 +vsub.i64 q5,q5,q13 +vshr.s64 q13,q15,#25 +vadd.i64 q6,q6,q4 +vshl.i64 q4,q4,#25 +vadd.i64 q14,q6,q11 +vadd.i64 q2,q2,q13 +vsub.i64 q4,q7,q4 +vshr.s64 q7,q14,#26 +vshl.i64 q13,q13,#25 +vadd.i64 q14,q2,q11 +vadd.i64 q8,q8,q7 +vshl.i64 q7,q7,#26 +vadd.i64 q15,q8,q3 +vsub.i64 q9,q9,q13 +vshr.s64 q13,q14,#26 +vsub.i64 q6,q6,q7 +vshr.s64 q7,q15,#25 +vadd.i64 q10,q10,q13 +vshl.i64 q13,q13,#26 +vadd.i64 q14,q10,q3 +vadd.i64 q1,q1,q7 +add r2,r3,#240 +vshl.i64 q7,q7,#25 +add r4,r3,#144 +vadd.i64 q15,q1,q11 +add r2,r2,#8 +vsub.i64 q2,q2,q13 +add r4,r4,#8 +vshr.s64 q13,q14,#25 +vsub.i64 q7,q8,q7 +vshr.s64 q8,q15,#26 +vadd.i64 q14,q13,q13 +vadd.i64 q12,q12,q8 +vtrn.32 d12,d14 +vshl.i64 q8,q8,#26 +vtrn.32 d13,d15 +vadd.i64 q3,q12,q3 +vadd.i64 q0,q0,q14 +vst1.8 d12,[r2,: 64]! +vshl.i64 q7,q13,#4 +vst1.8 d13,[r4,: 64]! +vsub.i64 q1,q1,q8 +vshr.s64 q3,q3,#25 +vadd.i64 q0,q0,q7 +vadd.i64 q5,q5,q3 +vshl.i64 q3,q3,#25 +vadd.i64 q6,q5,q11 +vadd.i64 q0,q0,q13 +vshl.i64 q7,q13,#25 +vadd.i64 q8,q0,q11 +vsub.i64 q3,q12,q3 +vshr.s64 q6,q6,#26 +vsub.i64 q7,q10,q7 +vtrn.32 d2,d6 +vshr.s64 q8,q8,#26 +vtrn.32 d3,d7 +vadd.i64 q3,q9,q6 +vst1.8 d2,[r2,: 64] +vshl.i64 q6,q6,#26 +vst1.8 d3,[r4,: 64] +vadd.i64 q1,q4,q8 +vtrn.32 d4,d14 +vshl.i64 q4,q8,#26 +vtrn.32 d5,d15 +vsub.i64 q5,q5,q6 +add r2,r2,#16 +vsub.i64 q0,q0,q4 +vst1.8 d4,[r2,: 64] +add r4,r4,#16 +vst1.8 d5,[r4,: 64] +vtrn.32 d10,d6 +vtrn.32 d11,d7 +sub r2,r2,#8 +sub r4,r4,#8 +vtrn.32 d0,d2 +vtrn.32 d1,d3 +vst1.8 d10,[r2,: 64] +vst1.8 d11,[r4,: 64] +sub r2,r2,#24 +sub r4,r4,#24 +vst1.8 d0,[r2,: 64] +vst1.8 d1,[r4,: 64] +ldr r2,[sp,#488] +ldr r4,[sp,#492] +subs r5,r2,#1 +bge ._mainloop +add r1,r3,#144 +add r2,r3,#336 +vld1.8 {d0-d1},[r1,: 128]! +vld1.8 {d2-d3},[r1,: 128]! +vld1.8 {d4},[r1,: 64] +vst1.8 {d0-d1},[r2,: 128]! +vst1.8 {d2-d3},[r2,: 128]! +vst1.8 d4,[r2,: 64] +ldr r1,=0 +._invertloop: +add r2,r3,#144 +ldr r4,=0 +ldr r5,=2 +cmp r1,#1 +ldreq r5,=1 +addeq r2,r3,#336 +addeq r4,r3,#48 +cmp r1,#2 +ldreq r5,=1 +addeq r2,r3,#48 +cmp r1,#3 +ldreq r5,=5 +addeq r4,r3,#336 +cmp r1,#4 +ldreq r5,=10 +cmp r1,#5 +ldreq r5,=20 +cmp r1,#6 +ldreq r5,=10 +addeq r2,r3,#336 +addeq r4,r3,#336 +cmp r1,#7 +ldreq r5,=50 +cmp r1,#8 +ldreq r5,=100 +cmp r1,#9 +ldreq r5,=50 +addeq r2,r3,#336 +cmp r1,#10 +ldreq r5,=5 +addeq r2,r3,#48 +cmp r1,#11 +ldreq r5,=0 +addeq r2,r3,#96 +add r6,r3,#144 +add r7,r3,#288 +vld1.8 {d0-d1},[r6,: 128]! +vld1.8 {d2-d3},[r6,: 128]! +vld1.8 {d4},[r6,: 64] +vst1.8 {d0-d1},[r7,: 128]! +vst1.8 {d2-d3},[r7,: 128]! 
+vst1.8 d4,[r7,: 64] +cmp r5,#0 +beq ._skipsquaringloop +._squaringloop: +add r6,r3,#288 +add r7,r3,#288 +add r8,r3,#288 +vmov.i32 q0,#19 +vmov.i32 q1,#0 +vmov.i32 q2,#1 +vzip.i32 q1,q2 +vld1.8 {d4-d5},[r7,: 128]! +vld1.8 {d6-d7},[r7,: 128]! +vld1.8 {d9},[r7,: 64] +vld1.8 {d10-d11},[r6,: 128]! +add r7,sp,#416 +vld1.8 {d12-d13},[r6,: 128]! +vmul.i32 q7,q2,q0 +vld1.8 {d8},[r6,: 64] +vext.32 d17,d11,d10,#1 +vmul.i32 q9,q3,q0 +vext.32 d16,d10,d8,#1 +vshl.u32 q10,q5,q1 +vext.32 d22,d14,d4,#1 +vext.32 d24,d18,d6,#1 +vshl.u32 q13,q6,q1 +vshl.u32 d28,d8,d2 +vrev64.i32 d22,d22 +vmul.i32 d1,d9,d1 +vrev64.i32 d24,d24 +vext.32 d29,d8,d13,#1 +vext.32 d0,d1,d9,#1 +vrev64.i32 d0,d0 +vext.32 d2,d9,d1,#1 +vext.32 d23,d15,d5,#1 +vmull.s32 q4,d20,d4 +vrev64.i32 d23,d23 +vmlal.s32 q4,d21,d1 +vrev64.i32 d2,d2 +vmlal.s32 q4,d26,d19 +vext.32 d3,d5,d15,#1 +vmlal.s32 q4,d27,d18 +vrev64.i32 d3,d3 +vmlal.s32 q4,d28,d15 +vext.32 d14,d12,d11,#1 +vmull.s32 q5,d16,d23 +vext.32 d15,d13,d12,#1 +vmlal.s32 q5,d17,d4 +vst1.8 d8,[r7,: 64]! +vmlal.s32 q5,d14,d1 +vext.32 d12,d9,d8,#0 +vmlal.s32 q5,d15,d19 +vmov.i64 d13,#0 +vmlal.s32 q5,d29,d18 +vext.32 d25,d19,d7,#1 +vmlal.s32 q6,d20,d5 +vrev64.i32 d25,d25 +vmlal.s32 q6,d21,d4 +vst1.8 d11,[r7,: 64]! +vmlal.s32 q6,d26,d1 +vext.32 d9,d10,d10,#0 +vmlal.s32 q6,d27,d19 +vmov.i64 d8,#0 +vmlal.s32 q6,d28,d18 +vmlal.s32 q4,d16,d24 +vmlal.s32 q4,d17,d5 +vmlal.s32 q4,d14,d4 +vst1.8 d12,[r7,: 64]! +vmlal.s32 q4,d15,d1 +vext.32 d10,d13,d12,#0 +vmlal.s32 q4,d29,d19 +vmov.i64 d11,#0 +vmlal.s32 q5,d20,d6 +vmlal.s32 q5,d21,d5 +vmlal.s32 q5,d26,d4 +vext.32 d13,d8,d8,#0 +vmlal.s32 q5,d27,d1 +vmov.i64 d12,#0 +vmlal.s32 q5,d28,d19 +vst1.8 d9,[r7,: 64]! +vmlal.s32 q6,d16,d25 +vmlal.s32 q6,d17,d6 +vst1.8 d10,[r7,: 64] +vmlal.s32 q6,d14,d5 +vext.32 d8,d11,d10,#0 +vmlal.s32 q6,d15,d4 +vmov.i64 d9,#0 +vmlal.s32 q6,d29,d1 +vmlal.s32 q4,d20,d7 +vmlal.s32 q4,d21,d6 +vmlal.s32 q4,d26,d5 +vext.32 d11,d12,d12,#0 +vmlal.s32 q4,d27,d4 +vmov.i64 d10,#0 +vmlal.s32 q4,d28,d1 +vmlal.s32 q5,d16,d0 +sub r6,r7,#32 +vmlal.s32 q5,d17,d7 +vmlal.s32 q5,d14,d6 +vext.32 d30,d9,d8,#0 +vmlal.s32 q5,d15,d5 +vld1.8 {d31},[r6,: 64]! +vmlal.s32 q5,d29,d4 +vmlal.s32 q15,d20,d0 +vext.32 d0,d6,d18,#1 +vmlal.s32 q15,d21,d25 +vrev64.i32 d0,d0 +vmlal.s32 q15,d26,d24 +vext.32 d1,d7,d19,#1 +vext.32 d7,d10,d10,#0 +vmlal.s32 q15,d27,d23 +vrev64.i32 d1,d1 +vld1.8 {d6},[r6,: 64] +vmlal.s32 q15,d28,d22 +vmlal.s32 q3,d16,d4 +add r6,r6,#24 +vmlal.s32 q3,d17,d2 +vext.32 d4,d31,d30,#0 +vmov d17,d11 +vmlal.s32 q3,d14,d1 +vext.32 d11,d13,d13,#0 +vext.32 d13,d30,d30,#0 +vmlal.s32 q3,d15,d0 +vext.32 d1,d8,d8,#0 +vmlal.s32 q3,d29,d3 +vld1.8 {d5},[r6,: 64] +sub r6,r6,#16 +vext.32 d10,d6,d6,#0 +vmov.i32 q1,#0xffffffff +vshl.i64 q4,q1,#25 +add r7,sp,#512 +vld1.8 {d14-d15},[r7,: 128] +vadd.i64 q9,q2,q7 +vshl.i64 q1,q1,#26 +vshr.s64 q10,q9,#26 +vld1.8 {d0},[r6,: 64]! +vadd.i64 q5,q5,q10 +vand q9,q9,q1 +vld1.8 {d16},[r6,: 64]! 
+add r6,sp,#528 +vld1.8 {d20-d21},[r6,: 128] +vadd.i64 q11,q5,q10 +vsub.i64 q2,q2,q9 +vshr.s64 q9,q11,#25 +vext.32 d12,d5,d4,#0 +vand q11,q11,q4 +vadd.i64 q0,q0,q9 +vmov d19,d7 +vadd.i64 q3,q0,q7 +vsub.i64 q5,q5,q11 +vshr.s64 q11,q3,#26 +vext.32 d18,d11,d10,#0 +vand q3,q3,q1 +vadd.i64 q8,q8,q11 +vadd.i64 q11,q8,q10 +vsub.i64 q0,q0,q3 +vshr.s64 q3,q11,#25 +vand q11,q11,q4 +vadd.i64 q3,q6,q3 +vadd.i64 q6,q3,q7 +vsub.i64 q8,q8,q11 +vshr.s64 q11,q6,#26 +vand q6,q6,q1 +vadd.i64 q9,q9,q11 +vadd.i64 d25,d19,d21 +vsub.i64 q3,q3,q6 +vshr.s64 d23,d25,#25 +vand q4,q12,q4 +vadd.i64 d21,d23,d23 +vshl.i64 d25,d23,#4 +vadd.i64 d21,d21,d23 +vadd.i64 d25,d25,d21 +vadd.i64 d4,d4,d25 +vzip.i32 q0,q8 +vadd.i64 d12,d4,d14 +add r6,r8,#8 +vst1.8 d0,[r6,: 64] +vsub.i64 d19,d19,d9 +add r6,r6,#16 +vst1.8 d16,[r6,: 64] +vshr.s64 d22,d12,#26 +vand q0,q6,q1 +vadd.i64 d10,d10,d22 +vzip.i32 q3,q9 +vsub.i64 d4,d4,d0 +sub r6,r6,#8 +vst1.8 d6,[r6,: 64] +add r6,r6,#16 +vst1.8 d18,[r6,: 64] +vzip.i32 q2,q5 +sub r6,r6,#32 +vst1.8 d4,[r6,: 64] +subs r5,r5,#1 +bhi ._squaringloop +._skipsquaringloop: +mov r2,r2 +add r5,r3,#288 +add r6,r3,#144 +vmov.i32 q0,#19 +vmov.i32 q1,#0 +vmov.i32 q2,#1 +vzip.i32 q1,q2 +vld1.8 {d4-d5},[r5,: 128]! +vld1.8 {d6-d7},[r5,: 128]! +vld1.8 {d9},[r5,: 64] +vld1.8 {d10-d11},[r2,: 128]! +add r5,sp,#416 +vld1.8 {d12-d13},[r2,: 128]! +vmul.i32 q7,q2,q0 +vld1.8 {d8},[r2,: 64] +vext.32 d17,d11,d10,#1 +vmul.i32 q9,q3,q0 +vext.32 d16,d10,d8,#1 +vshl.u32 q10,q5,q1 +vext.32 d22,d14,d4,#1 +vext.32 d24,d18,d6,#1 +vshl.u32 q13,q6,q1 +vshl.u32 d28,d8,d2 +vrev64.i32 d22,d22 +vmul.i32 d1,d9,d1 +vrev64.i32 d24,d24 +vext.32 d29,d8,d13,#1 +vext.32 d0,d1,d9,#1 +vrev64.i32 d0,d0 +vext.32 d2,d9,d1,#1 +vext.32 d23,d15,d5,#1 +vmull.s32 q4,d20,d4 +vrev64.i32 d23,d23 +vmlal.s32 q4,d21,d1 +vrev64.i32 d2,d2 +vmlal.s32 q4,d26,d19 +vext.32 d3,d5,d15,#1 +vmlal.s32 q4,d27,d18 +vrev64.i32 d3,d3 +vmlal.s32 q4,d28,d15 +vext.32 d14,d12,d11,#1 +vmull.s32 q5,d16,d23 +vext.32 d15,d13,d12,#1 +vmlal.s32 q5,d17,d4 +vst1.8 d8,[r5,: 64]! +vmlal.s32 q5,d14,d1 +vext.32 d12,d9,d8,#0 +vmlal.s32 q5,d15,d19 +vmov.i64 d13,#0 +vmlal.s32 q5,d29,d18 +vext.32 d25,d19,d7,#1 +vmlal.s32 q6,d20,d5 +vrev64.i32 d25,d25 +vmlal.s32 q6,d21,d4 +vst1.8 d11,[r5,: 64]! +vmlal.s32 q6,d26,d1 +vext.32 d9,d10,d10,#0 +vmlal.s32 q6,d27,d19 +vmov.i64 d8,#0 +vmlal.s32 q6,d28,d18 +vmlal.s32 q4,d16,d24 +vmlal.s32 q4,d17,d5 +vmlal.s32 q4,d14,d4 +vst1.8 d12,[r5,: 64]! +vmlal.s32 q4,d15,d1 +vext.32 d10,d13,d12,#0 +vmlal.s32 q4,d29,d19 +vmov.i64 d11,#0 +vmlal.s32 q5,d20,d6 +vmlal.s32 q5,d21,d5 +vmlal.s32 q5,d26,d4 +vext.32 d13,d8,d8,#0 +vmlal.s32 q5,d27,d1 +vmov.i64 d12,#0 +vmlal.s32 q5,d28,d19 +vst1.8 d9,[r5,: 64]! +vmlal.s32 q6,d16,d25 +vmlal.s32 q6,d17,d6 +vst1.8 d10,[r5,: 64] +vmlal.s32 q6,d14,d5 +vext.32 d8,d11,d10,#0 +vmlal.s32 q6,d15,d4 +vmov.i64 d9,#0 +vmlal.s32 q6,d29,d1 +vmlal.s32 q4,d20,d7 +vmlal.s32 q4,d21,d6 +vmlal.s32 q4,d26,d5 +vext.32 d11,d12,d12,#0 +vmlal.s32 q4,d27,d4 +vmov.i64 d10,#0 +vmlal.s32 q4,d28,d1 +vmlal.s32 q5,d16,d0 +sub r2,r5,#32 +vmlal.s32 q5,d17,d7 +vmlal.s32 q5,d14,d6 +vext.32 d30,d9,d8,#0 +vmlal.s32 q5,d15,d5 +vld1.8 {d31},[r2,: 64]! 
+vmlal.s32 q5,d29,d4 +vmlal.s32 q15,d20,d0 +vext.32 d0,d6,d18,#1 +vmlal.s32 q15,d21,d25 +vrev64.i32 d0,d0 +vmlal.s32 q15,d26,d24 +vext.32 d1,d7,d19,#1 +vext.32 d7,d10,d10,#0 +vmlal.s32 q15,d27,d23 +vrev64.i32 d1,d1 +vld1.8 {d6},[r2,: 64] +vmlal.s32 q15,d28,d22 +vmlal.s32 q3,d16,d4 +add r2,r2,#24 +vmlal.s32 q3,d17,d2 +vext.32 d4,d31,d30,#0 +vmov d17,d11 +vmlal.s32 q3,d14,d1 +vext.32 d11,d13,d13,#0 +vext.32 d13,d30,d30,#0 +vmlal.s32 q3,d15,d0 +vext.32 d1,d8,d8,#0 +vmlal.s32 q3,d29,d3 +vld1.8 {d5},[r2,: 64] +sub r2,r2,#16 +vext.32 d10,d6,d6,#0 +vmov.i32 q1,#0xffffffff +vshl.i64 q4,q1,#25 +add r5,sp,#512 +vld1.8 {d14-d15},[r5,: 128] +vadd.i64 q9,q2,q7 +vshl.i64 q1,q1,#26 +vshr.s64 q10,q9,#26 +vld1.8 {d0},[r2,: 64]! +vadd.i64 q5,q5,q10 +vand q9,q9,q1 +vld1.8 {d16},[r2,: 64]! +add r2,sp,#528 +vld1.8 {d20-d21},[r2,: 128] +vadd.i64 q11,q5,q10 +vsub.i64 q2,q2,q9 +vshr.s64 q9,q11,#25 +vext.32 d12,d5,d4,#0 +vand q11,q11,q4 +vadd.i64 q0,q0,q9 +vmov d19,d7 +vadd.i64 q3,q0,q7 +vsub.i64 q5,q5,q11 +vshr.s64 q11,q3,#26 +vext.32 d18,d11,d10,#0 +vand q3,q3,q1 +vadd.i64 q8,q8,q11 +vadd.i64 q11,q8,q10 +vsub.i64 q0,q0,q3 +vshr.s64 q3,q11,#25 +vand q11,q11,q4 +vadd.i64 q3,q6,q3 +vadd.i64 q6,q3,q7 +vsub.i64 q8,q8,q11 +vshr.s64 q11,q6,#26 +vand q6,q6,q1 +vadd.i64 q9,q9,q11 +vadd.i64 d25,d19,d21 +vsub.i64 q3,q3,q6 +vshr.s64 d23,d25,#25 +vand q4,q12,q4 +vadd.i64 d21,d23,d23 +vshl.i64 d25,d23,#4 +vadd.i64 d21,d21,d23 +vadd.i64 d25,d25,d21 +vadd.i64 d4,d4,d25 +vzip.i32 q0,q8 +vadd.i64 d12,d4,d14 +add r2,r6,#8 +vst1.8 d0,[r2,: 64] +vsub.i64 d19,d19,d9 +add r2,r2,#16 +vst1.8 d16,[r2,: 64] +vshr.s64 d22,d12,#26 +vand q0,q6,q1 +vadd.i64 d10,d10,d22 +vzip.i32 q3,q9 +vsub.i64 d4,d4,d0 +sub r2,r2,#8 +vst1.8 d6,[r2,: 64] +add r2,r2,#16 +vst1.8 d18,[r2,: 64] +vzip.i32 q2,q5 +sub r2,r2,#32 +vst1.8 d4,[r2,: 64] +cmp r4,#0 +beq ._skippostcopy +add r2,r3,#144 +mov r4,r4 +vld1.8 {d0-d1},[r2,: 128]! +vld1.8 {d2-d3},[r2,: 128]! +vld1.8 {d4},[r2,: 64] +vst1.8 {d0-d1},[r4,: 128]! +vst1.8 {d2-d3},[r4,: 128]! +vst1.8 d4,[r4,: 64] +._skippostcopy: +cmp r1,#1 +bne ._skipfinalcopy +add r2,r3,#288 +add r4,r3,#144 +vld1.8 {d0-d1},[r2,: 128]! +vld1.8 {d2-d3},[r2,: 128]! +vld1.8 {d4},[r2,: 64] +vst1.8 {d0-d1},[r4,: 128]! +vst1.8 {d2-d3},[r4,: 128]! 
+vst1.8 d4,[r4,: 64] +._skipfinalcopy: +add r1,r1,#1 +cmp r1,#12 +blo ._invertloop +add r1,r3,#144 +ldr r2,[r1],#4 +ldr r3,[r1],#4 +ldr r4,[r1],#4 +ldr r5,[r1],#4 +ldr r6,[r1],#4 +ldr r7,[r1],#4 +ldr r8,[r1],#4 +ldr r9,[r1],#4 +ldr r10,[r1],#4 +ldr r1,[r1] +add r11,r1,r1,LSL #4 +add r11,r11,r1,LSL #1 +add r11,r11,#16777216 +mov r11,r11,ASR #25 +add r11,r11,r2 +mov r11,r11,ASR #26 +add r11,r11,r3 +mov r11,r11,ASR #25 +add r11,r11,r4 +mov r11,r11,ASR #26 +add r11,r11,r5 +mov r11,r11,ASR #25 +add r11,r11,r6 +mov r11,r11,ASR #26 +add r11,r11,r7 +mov r11,r11,ASR #25 +add r11,r11,r8 +mov r11,r11,ASR #26 +add r11,r11,r9 +mov r11,r11,ASR #25 +add r11,r11,r10 +mov r11,r11,ASR #26 +add r11,r11,r1 +mov r11,r11,ASR #25 +add r2,r2,r11 +add r2,r2,r11,LSL #1 +add r2,r2,r11,LSL #4 +mov r11,r2,ASR #26 +add r3,r3,r11 +sub r2,r2,r11,LSL #26 +mov r11,r3,ASR #25 +add r4,r4,r11 +sub r3,r3,r11,LSL #25 +mov r11,r4,ASR #26 +add r5,r5,r11 +sub r4,r4,r11,LSL #26 +mov r11,r5,ASR #25 +add r6,r6,r11 +sub r5,r5,r11,LSL #25 +mov r11,r6,ASR #26 +add r7,r7,r11 +sub r6,r6,r11,LSL #26 +mov r11,r7,ASR #25 +add r8,r8,r11 +sub r7,r7,r11,LSL #25 +mov r11,r8,ASR #26 +add r9,r9,r11 +sub r8,r8,r11,LSL #26 +mov r11,r9,ASR #25 +add r10,r10,r11 +sub r9,r9,r11,LSL #25 +mov r11,r10,ASR #26 +add r1,r1,r11 +sub r10,r10,r11,LSL #26 +mov r11,r1,ASR #25 +sub r1,r1,r11,LSL #25 +add r2,r2,r3,LSL #26 +mov r3,r3,LSR #6 +add r3,r3,r4,LSL #19 +mov r4,r4,LSR #13 +add r4,r4,r5,LSL #13 +mov r5,r5,LSR #19 +add r5,r5,r6,LSL #6 +add r6,r7,r8,LSL #25 +mov r7,r8,LSR #7 +add r7,r7,r9,LSL #19 +mov r8,r9,LSR #13 +add r8,r8,r10,LSL #12 +mov r9,r10,LSR #20 +add r1,r9,r1,LSL #6 +str r2,[r0],#4 +str r3,[r0],#4 +str r4,[r0],#4 +str r5,[r0],#4 +str r6,[r0],#4 +str r7,[r0],#4 +str r8,[r0],#4 +str r1,[r0] +ldrd r4,[sp,#0] +ldrd r6,[sp,#8] +ldrd r8,[sp,#16] +ldrd r10,[sp,#24] +ldr r12,[sp,#480] +ldr r14,[sp,#484] +ldr r0,=0 +mov sp,r12 +vpop {q4,q5,q6,q7} +bx lr + +#endif /* !OPENSSL_NO_ASM && OPENSSL_ARM && __ELF__ */ diff --git a/ring-0.17.14/crypto/curve25519/curve25519.c b/ring-0.17.14/crypto/curve25519/curve25519.c new file mode 100644 index 0000000000..99d7d7fbb2 --- /dev/null +++ b/ring-0.17.14/crypto/curve25519/curve25519.c @@ -0,0 +1,1924 @@ +// Copyright 2020 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Some of this code is taken from the ref10 version of Ed25519 in SUPERCOP +// 20141124 (http://bench.cr.yp.to/supercop.html). That code is released as +// public domain. Other parts have been replaced to call into code generated by +// Fiat (https://github.com/mit-plv/fiat-crypto) in //third_party/fiat. +// +// The field functions are shared by Ed25519 and X25519 where possible. 
+ +#include + +#include "internal.h" +#include "../internal.h" + +#if defined(_MSC_VER) && !defined(__clang__) +// '=': conversion from 'int64_t' to 'int32_t', possible loss of data +#pragma warning(disable: 4242) +// '=': conversion from 'int32_t' to 'uint8_t', possible loss of data +#pragma warning(disable: 4244) +#endif + +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic ignored "-Wconversion" +#pragma GCC diagnostic ignored "-Wsign-conversion" +#endif + +#if defined(__GNUC__) && !defined(__clang__) +#pragma GCC diagnostic ignored "-Winline" +#endif + +// Various pre-computed constants. +#include "./curve25519_tables.h" + +#if defined(BORINGSSL_HAS_UINT128) +#if defined(__GNUC__) +#pragma GCC diagnostic ignored "-Wpedantic" +#endif +#include "../../third_party/fiat/curve25519_64.h" +#elif defined(OPENSSL_64_BIT) +#include "../../third_party/fiat/curve25519_64_msvc.h" +#else +#include "../../third_party/fiat/curve25519_32.h" +#endif + + +// Low-level intrinsic operations + +static uint64_t load_3(const uint8_t *in) { + uint64_t result; + result = (uint64_t)in[0]; + result |= ((uint64_t)in[1]) << 8; + result |= ((uint64_t)in[2]) << 16; + return result; +} + +static uint64_t load_4(const uint8_t *in) { + uint64_t result; + result = (uint64_t)in[0]; + result |= ((uint64_t)in[1]) << 8; + result |= ((uint64_t)in[2]) << 16; + result |= ((uint64_t)in[3]) << 24; + return result; +} + + +// Field operations. + +#if defined(OPENSSL_64_BIT) + +// assert_fe asserts that |f| satisfies bounds: +// +// [[0x0 ~> 0x8cccccccccccc], +// [0x0 ~> 0x8cccccccccccc], +// [0x0 ~> 0x8cccccccccccc], +// [0x0 ~> 0x8cccccccccccc], +// [0x0 ~> 0x8cccccccccccc]] +// +// See comments in curve25519_64.h for which functions use these bounds for +// inputs or outputs. +#define assert_fe(f) \ + do { \ + for (unsigned _assert_fe_i = 0; _assert_fe_i < 5; _assert_fe_i++) { \ + declassify_assert(f[_assert_fe_i] <= UINT64_C(0x8cccccccccccc)); \ + } \ + } while (0) + +// assert_fe_loose asserts that |f| satisfies bounds: +// +// [[0x0 ~> 0x1a666666666664], +// [0x0 ~> 0x1a666666666664], +// [0x0 ~> 0x1a666666666664], +// [0x0 ~> 0x1a666666666664], +// [0x0 ~> 0x1a666666666664]] +// +// See comments in curve25519_64.h for which functions use these bounds for +// inputs or outputs. +#define assert_fe_loose(f) \ + do { \ + for (unsigned _assert_fe_i = 0; _assert_fe_i < 5; _assert_fe_i++) { \ + declassify_assert(f[_assert_fe_i] <= UINT64_C(0x1a666666666664)); \ + } \ + } while (0) + +#else + +// assert_fe asserts that |f| satisfies bounds: +// +// [[0x0 ~> 0x4666666], [0x0 ~> 0x2333333], +// [0x0 ~> 0x4666666], [0x0 ~> 0x2333333], +// [0x0 ~> 0x4666666], [0x0 ~> 0x2333333], +// [0x0 ~> 0x4666666], [0x0 ~> 0x2333333], +// [0x0 ~> 0x4666666], [0x0 ~> 0x2333333]] +// +// See comments in curve25519_32.h for which functions use these bounds for +// inputs or outputs. +#define assert_fe(f) \ + do { \ + for (unsigned _assert_fe_i = 0; _assert_fe_i < 10; _assert_fe_i++) { \ + declassify_assert(f[_assert_fe_i] <= \ + ((_assert_fe_i & 1) ? 0x2333333u : 0x4666666u)); \ + } \ + } while (0) + +// assert_fe_loose asserts that |f| satisfies bounds: +// +// [[0x0 ~> 0xd333332], [0x0 ~> 0x6999999], +// [0x0 ~> 0xd333332], [0x0 ~> 0x6999999], +// [0x0 ~> 0xd333332], [0x0 ~> 0x6999999], +// [0x0 ~> 0xd333332], [0x0 ~> 0x6999999], +// [0x0 ~> 0xd333332], [0x0 ~> 0x6999999]] +// +// See comments in curve25519_32.h for which functions use these bounds for +// inputs or outputs. 
+#define assert_fe_loose(f) \ + do { \ + for (unsigned _assert_fe_i = 0; _assert_fe_i < 10; _assert_fe_i++) { \ + declassify_assert(f[_assert_fe_i] <= \ + ((_assert_fe_i & 1) ? 0x6999999u : 0xd333332u)); \ + } \ + } while (0) + +#endif // OPENSSL_64_BIT + +OPENSSL_STATIC_ASSERT(sizeof(fe) == sizeof(fe_limb_t) * FE_NUM_LIMBS, + "fe_limb_t[FE_NUM_LIMBS] is inconsistent with fe"); + +static void fe_frombytes_strict(fe *h, const uint8_t s[32]) { + // |fiat_25519_from_bytes| requires the top-most bit be clear. + declassify_assert((s[31] & 0x80) == 0); + fiat_25519_from_bytes(h->v, s); + assert_fe(h->v); +} + +static void fe_frombytes(fe *h, const uint8_t s[32]) { + uint8_t s_copy[32]; + OPENSSL_memcpy(s_copy, s, 32); + s_copy[31] &= 0x7f; + fe_frombytes_strict(h, s_copy); +} + +static void fe_tobytes(uint8_t s[32], const fe *f) { + assert_fe(f->v); + fiat_25519_to_bytes(s, f->v); +} + +// h = 0 +static void fe_0(fe *h) { + OPENSSL_memset(h, 0, sizeof(fe)); +} + +#if defined(OPENSSL_SMALL) + +static void fe_loose_0(fe_loose *h) { + OPENSSL_memset(h, 0, sizeof(fe_loose)); +} + +#endif + +// h = 1 +static void fe_1(fe *h) { + OPENSSL_memset(h, 0, sizeof(fe)); + h->v[0] = 1; +} + +#if defined(OPENSSL_SMALL) + +static void fe_loose_1(fe_loose *h) { + OPENSSL_memset(h, 0, sizeof(fe_loose)); + h->v[0] = 1; +} + +#endif + +// h = f + g +// Can overlap h with f or g. +static void fe_add(fe_loose *h, const fe *f, const fe *g) { + assert_fe(f->v); + assert_fe(g->v); + fiat_25519_add(h->v, f->v, g->v); + assert_fe_loose(h->v); +} + +// h = f - g +// Can overlap h with f or g. +static void fe_sub(fe_loose *h, const fe *f, const fe *g) { + assert_fe(f->v); + assert_fe(g->v); + fiat_25519_sub(h->v, f->v, g->v); + assert_fe_loose(h->v); +} + +static void fe_carry(fe *h, const fe_loose* f) { + assert_fe_loose(f->v); + fiat_25519_carry(h->v, f->v); + assert_fe(h->v); +} + +static void fe_mul_impl(fe_limb_t out[FE_NUM_LIMBS], + const fe_limb_t in1[FE_NUM_LIMBS], + const fe_limb_t in2[FE_NUM_LIMBS]) { + assert_fe_loose(in1); + assert_fe_loose(in2); + fiat_25519_carry_mul(out, in1, in2); + assert_fe(out); +} + +static void fe_mul_ltt(fe_loose *h, const fe *f, const fe *g) { + fe_mul_impl(h->v, f->v, g->v); +} + +#if defined(OPENSSL_SMALL) +static void fe_mul_llt(fe_loose *h, const fe_loose *f, const fe *g) { + fe_mul_impl(h->v, f->v, g->v); +} +#endif + +static void fe_mul_ttt(fe *h, const fe *f, const fe *g) { + fe_mul_impl(h->v, f->v, g->v); +} + +static void fe_mul_tlt(fe *h, const fe_loose *f, const fe *g) { + fe_mul_impl(h->v, f->v, g->v); +} + +static void fe_mul_ttl(fe *h, const fe *f, const fe_loose *g) { + fe_mul_impl(h->v, f->v, g->v); +} + +static void fe_mul_tll(fe *h, const fe_loose *f, const fe_loose *g) { + fe_mul_impl(h->v, f->v, g->v); +} + +static void fe_sq_tl(fe *h, const fe_loose *f) { + assert_fe_loose(f->v); + fiat_25519_carry_square(h->v, f->v); + assert_fe(h->v); +} + +static void fe_sq_tt(fe *h, const fe *f) { + assert_fe_loose(f->v); + fiat_25519_carry_square(h->v, f->v); + assert_fe(h->v); +} + +// Replace (f,g) with (g,f) if b == 1; +// replace (f,g) with (f,g) if b == 0. +// +// Preconditions: b in {0,1}. 
+static void fe_cswap(fe *f, fe *g, fe_limb_t b) { + b = 0-b; + for (unsigned i = 0; i < FE_NUM_LIMBS; i++) { + fe_limb_t x = f->v[i] ^ g->v[i]; + x &= b; + f->v[i] ^= x; + g->v[i] ^= x; + } +} + +static void fe_mul121666(fe *h, const fe_loose *f) { + assert_fe_loose(f->v); + fiat_25519_carry_scmul_121666(h->v, f->v); + assert_fe(h->v); +} + +// h = -f +static void fe_neg(fe_loose *h, const fe *f) { + assert_fe(f->v); + fiat_25519_opp(h->v, f->v); + assert_fe_loose(h->v); +} + +// Replace (f,g) with (g,g) if b == 1; +// replace (f,g) with (f,g) if b == 0. +// +// Preconditions: b in {0,1}. +static void fe_cmov(fe_loose *f, const fe_loose *g, fe_limb_t b) { + // TODO(davidben): Switch to fiat's calling convention, or ask fiat to emit a + // different one. + + b = 0-b; + for (unsigned i = 0; i < FE_NUM_LIMBS; i++) { + fe_limb_t x = f->v[i] ^ g->v[i]; + x &= b; + f->v[i] ^= x; + } +} + +// h = f +static void fe_copy(fe *h, const fe *f) { + fe_limbs_copy(h->v, f->v); +} + +static void fe_copy_lt(fe_loose *h, const fe *f) { + OPENSSL_STATIC_ASSERT(sizeof(fe_loose) == sizeof(fe), "fe and fe_loose mismatch"); + fe_limbs_copy(h->v, f->v); +} + +static void fe_loose_invert(fe *out, const fe_loose *z) { + fe t0; + fe t1; + fe t2; + fe t3; + int i; + + fe_sq_tl(&t0, z); + fe_sq_tt(&t1, &t0); + for (i = 1; i < 2; ++i) { + fe_sq_tt(&t1, &t1); + } + fe_mul_tlt(&t1, z, &t1); + fe_mul_ttt(&t0, &t0, &t1); + fe_sq_tt(&t2, &t0); + fe_mul_ttt(&t1, &t1, &t2); + fe_sq_tt(&t2, &t1); + for (i = 1; i < 5; ++i) { + fe_sq_tt(&t2, &t2); + } + fe_mul_ttt(&t1, &t2, &t1); + fe_sq_tt(&t2, &t1); + for (i = 1; i < 10; ++i) { + fe_sq_tt(&t2, &t2); + } + fe_mul_ttt(&t2, &t2, &t1); + fe_sq_tt(&t3, &t2); + for (i = 1; i < 20; ++i) { + fe_sq_tt(&t3, &t3); + } + fe_mul_ttt(&t2, &t3, &t2); + fe_sq_tt(&t2, &t2); + for (i = 1; i < 10; ++i) { + fe_sq_tt(&t2, &t2); + } + fe_mul_ttt(&t1, &t2, &t1); + fe_sq_tt(&t2, &t1); + for (i = 1; i < 50; ++i) { + fe_sq_tt(&t2, &t2); + } + fe_mul_ttt(&t2, &t2, &t1); + fe_sq_tt(&t3, &t2); + for (i = 1; i < 100; ++i) { + fe_sq_tt(&t3, &t3); + } + fe_mul_ttt(&t2, &t3, &t2); + fe_sq_tt(&t2, &t2); + for (i = 1; i < 50; ++i) { + fe_sq_tt(&t2, &t2); + } + fe_mul_ttt(&t1, &t2, &t1); + fe_sq_tt(&t1, &t1); + for (i = 1; i < 5; ++i) { + fe_sq_tt(&t1, &t1); + } + fe_mul_ttt(out, &t1, &t0); +} + +static void fe_invert(fe *out, const fe *z) { + fe_loose l; + fe_copy_lt(&l, z); + fe_loose_invert(out, &l); +} + +// return 0 if f == 0 +// return 1 if f != 0 +static int fe_isnonzero(const fe_loose *f) { + fe tight; + fe_carry(&tight, f); + uint8_t s[32]; + fe_tobytes(s, &tight); + + static const uint8_t zero[32] = {0}; + return CRYPTO_memcmp(s, zero, sizeof(zero)) != 0; +} + +// return 1 if f is in {1,3,5,...,q-2} +// return 0 if f is in {0,2,4,...,q-1} +static int fe_isnegative(const fe *f) { + uint8_t s[32]; + fe_tobytes(s, f); + return s[0] & 1; +} + +static void fe_sq2_tt(fe *h, const fe *f) { + // h = f^2 + fe_sq_tt(h, f); + + // h = h + h + fe_loose tmp; + fe_add(&tmp, h, h); + fe_carry(h, &tmp); +} + +static void fe_pow22523(fe *out, const fe *z) { + fe t0; + fe t1; + fe t2; + int i; + + fe_sq_tt(&t0, z); + fe_sq_tt(&t1, &t0); + for (i = 1; i < 2; ++i) { + fe_sq_tt(&t1, &t1); + } + fe_mul_ttt(&t1, z, &t1); + fe_mul_ttt(&t0, &t0, &t1); + fe_sq_tt(&t0, &t0); + fe_mul_ttt(&t0, &t1, &t0); + fe_sq_tt(&t1, &t0); + for (i = 1; i < 5; ++i) { + fe_sq_tt(&t1, &t1); + } + fe_mul_ttt(&t0, &t1, &t0); + fe_sq_tt(&t1, &t0); + for (i = 1; i < 10; ++i) { + fe_sq_tt(&t1, &t1); + } + fe_mul_ttt(&t1, &t1, &t0); + fe_sq_tt(&t2, 
&t1); + for (i = 1; i < 20; ++i) { + fe_sq_tt(&t2, &t2); + } + fe_mul_ttt(&t1, &t2, &t1); + fe_sq_tt(&t1, &t1); + for (i = 1; i < 10; ++i) { + fe_sq_tt(&t1, &t1); + } + fe_mul_ttt(&t0, &t1, &t0); + fe_sq_tt(&t1, &t0); + for (i = 1; i < 50; ++i) { + fe_sq_tt(&t1, &t1); + } + fe_mul_ttt(&t1, &t1, &t0); + fe_sq_tt(&t2, &t1); + for (i = 1; i < 100; ++i) { + fe_sq_tt(&t2, &t2); + } + fe_mul_ttt(&t1, &t2, &t1); + fe_sq_tt(&t1, &t1); + for (i = 1; i < 50; ++i) { + fe_sq_tt(&t1, &t1); + } + fe_mul_ttt(&t0, &t1, &t0); + fe_sq_tt(&t0, &t0); + for (i = 1; i < 2; ++i) { + fe_sq_tt(&t0, &t0); + } + fe_mul_ttt(out, &t0, z); +} + + +// Group operations. + +int x25519_ge_frombytes_vartime(ge_p3 *h, const uint8_t s[32]) { + fe u; + fe_loose v; + fe w; + fe vxx; + fe_loose check; + + fe_frombytes(&h->Y, s); + fe_1(&h->Z); + fe_sq_tt(&w, &h->Y); + fe_mul_ttt(&vxx, &w, &d); + fe_sub(&v, &w, &h->Z); // u = y^2-1 + fe_carry(&u, &v); + fe_add(&v, &vxx, &h->Z); // v = dy^2+1 + + fe_mul_ttl(&w, &u, &v); // w = u*v + fe_pow22523(&h->X, &w); // x = w^((q-5)/8) + fe_mul_ttt(&h->X, &h->X, &u); // x = u*w^((q-5)/8) + + fe_sq_tt(&vxx, &h->X); + fe_mul_ttl(&vxx, &vxx, &v); + fe_sub(&check, &vxx, &u); + if (fe_isnonzero(&check)) { + fe_add(&check, &vxx, &u); + if (fe_isnonzero(&check)) { + return 0; + } + fe_mul_ttt(&h->X, &h->X, &sqrtm1); + } + + if (fe_isnegative(&h->X) != (s[31] >> 7)) { + fe_loose t; + fe_neg(&t, &h->X); + fe_carry(&h->X, &t); + } + + fe_mul_ttt(&h->T, &h->X, &h->Y); + return 1; +} + +static void ge_p2_0(ge_p2 *h) { + fe_0(&h->X); + fe_1(&h->Y); + fe_1(&h->Z); +} + +static void ge_p3_0(ge_p3 *h) { + fe_0(&h->X); + fe_1(&h->Y); + fe_1(&h->Z); + fe_0(&h->T); +} + +#if defined(OPENSSL_SMALL) + +static void ge_precomp_0(ge_precomp *h) { + fe_loose_1(&h->yplusx); + fe_loose_1(&h->yminusx); + fe_loose_0(&h->xy2d); +} + +#endif + +// r = p +static void ge_p3_to_p2(ge_p2 *r, const ge_p3 *p) { + fe_copy(&r->X, &p->X); + fe_copy(&r->Y, &p->Y); + fe_copy(&r->Z, &p->Z); +} + +// r = p +static void x25519_ge_p3_to_cached(ge_cached *r, const ge_p3 *p) { + fe_add(&r->YplusX, &p->Y, &p->X); + fe_sub(&r->YminusX, &p->Y, &p->X); + fe_copy_lt(&r->Z, &p->Z); + fe_mul_ltt(&r->T2d, &p->T, &d2); +} + +// r = p +static void x25519_ge_p1p1_to_p2(ge_p2 *r, const ge_p1p1 *p) { + fe_mul_tll(&r->X, &p->X, &p->T); + fe_mul_tll(&r->Y, &p->Y, &p->Z); + fe_mul_tll(&r->Z, &p->Z, &p->T); +} + +// r = p +static void x25519_ge_p1p1_to_p3(ge_p3 *r, const ge_p1p1 *p) { + fe_mul_tll(&r->X, &p->X, &p->T); + fe_mul_tll(&r->Y, &p->Y, &p->Z); + fe_mul_tll(&r->Z, &p->Z, &p->T); + fe_mul_tll(&r->T, &p->X, &p->Y); +} + +// r = 2 * p +static void ge_p2_dbl(ge_p1p1 *r, const ge_p2 *p) { + fe trX, trZ, trT; + fe t0; + + fe_sq_tt(&trX, &p->X); + fe_sq_tt(&trZ, &p->Y); + fe_sq2_tt(&trT, &p->Z); + fe_add(&r->Y, &p->X, &p->Y); + fe_sq_tl(&t0, &r->Y); + + fe_add(&r->Y, &trZ, &trX); + fe_sub(&r->Z, &trZ, &trX); + fe_carry(&trZ, &r->Y); + fe_sub(&r->X, &t0, &trZ); + fe_carry(&trZ, &r->Z); + fe_sub(&r->T, &trT, &trZ); +} + +// r = 2 * p +static void ge_p3_dbl(ge_p1p1 *r, const ge_p3 *p) { + ge_p2 q; + ge_p3_to_p2(&q, p); + ge_p2_dbl(r, &q); +} + +// r = p + q +static void ge_madd(ge_p1p1 *r, const ge_p3 *p, const ge_precomp *q) { + fe trY, trZ, trT; + + fe_add(&r->X, &p->Y, &p->X); + fe_sub(&r->Y, &p->Y, &p->X); + fe_mul_tll(&trZ, &r->X, &q->yplusx); + fe_mul_tll(&trY, &r->Y, &q->yminusx); + fe_mul_tlt(&trT, &q->xy2d, &p->T); + fe_add(&r->T, &p->Z, &p->Z); + fe_sub(&r->X, &trZ, &trY); + fe_add(&r->Y, &trZ, &trY); + fe_carry(&trZ, &r->T); + fe_add(&r->Z, 
&trZ, &trT); + fe_sub(&r->T, &trZ, &trT); +} + +// r = p - q +static void ge_msub(ge_p1p1 *r, const ge_p3 *p, const ge_precomp *q) { + fe trY, trZ, trT; + + fe_add(&r->X, &p->Y, &p->X); + fe_sub(&r->Y, &p->Y, &p->X); + fe_mul_tll(&trZ, &r->X, &q->yminusx); + fe_mul_tll(&trY, &r->Y, &q->yplusx); + fe_mul_tlt(&trT, &q->xy2d, &p->T); + fe_add(&r->T, &p->Z, &p->Z); + fe_sub(&r->X, &trZ, &trY); + fe_add(&r->Y, &trZ, &trY); + fe_carry(&trZ, &r->T); + fe_sub(&r->Z, &trZ, &trT); + fe_add(&r->T, &trZ, &trT); +} + +// r = p + q +static void x25519_ge_add(ge_p1p1 *r, const ge_p3 *p, const ge_cached *q) { + fe trX, trY, trZ, trT; + + fe_add(&r->X, &p->Y, &p->X); + fe_sub(&r->Y, &p->Y, &p->X); + fe_mul_tll(&trZ, &r->X, &q->YplusX); + fe_mul_tll(&trY, &r->Y, &q->YminusX); + fe_mul_tlt(&trT, &q->T2d, &p->T); + fe_mul_ttl(&trX, &p->Z, &q->Z); + fe_add(&r->T, &trX, &trX); + fe_sub(&r->X, &trZ, &trY); + fe_add(&r->Y, &trZ, &trY); + fe_carry(&trZ, &r->T); + fe_add(&r->Z, &trZ, &trT); + fe_sub(&r->T, &trZ, &trT); +} + +// r = p - q +static void x25519_ge_sub(ge_p1p1 *r, const ge_p3 *p, const ge_cached *q) { + fe trX, trY, trZ, trT; + + fe_add(&r->X, &p->Y, &p->X); + fe_sub(&r->Y, &p->Y, &p->X); + fe_mul_tll(&trZ, &r->X, &q->YminusX); + fe_mul_tll(&trY, &r->Y, &q->YplusX); + fe_mul_tlt(&trT, &q->T2d, &p->T); + fe_mul_ttl(&trX, &p->Z, &q->Z); + fe_add(&r->T, &trX, &trX); + fe_sub(&r->X, &trZ, &trY); + fe_add(&r->Y, &trZ, &trY); + fe_carry(&trZ, &r->T); + fe_sub(&r->Z, &trZ, &trT); + fe_add(&r->T, &trZ, &trT); +} + +static void cmov(ge_precomp *t, const ge_precomp *u, uint8_t b) { + fe_cmov(&t->yplusx, &u->yplusx, b); + fe_cmov(&t->yminusx, &u->yminusx, b); + fe_cmov(&t->xy2d, &u->xy2d, b); +} + +#if defined(OPENSSL_SMALL) + +static void x25519_ge_scalarmult_small_precomp( + ge_p3 *h, const uint8_t a[32], const uint8_t precomp_table[15 * 2 * 32]) { + // precomp_table is first expanded into matching |ge_precomp| + // elements. + ge_precomp multiples[15]; + + unsigned i; + for (i = 0; i < 15; i++) { + // The precomputed table is assumed to already clear the top bit, so + // |fe_frombytes_strict| may be used directly. + const uint8_t *bytes = &precomp_table[i*(2 * 32)]; + fe x, y; + fe_frombytes_strict(&x, bytes); + fe_frombytes_strict(&y, bytes + 32); + + ge_precomp *out = &multiples[i]; + fe_add(&out->yplusx, &y, &x); + fe_sub(&out->yminusx, &y, &x); + fe_mul_ltt(&out->xy2d, &x, &y); + fe_mul_llt(&out->xy2d, &out->xy2d, &d2); + } + + // See the comment above |k25519SmallPrecomp| about the structure of the + // precomputed elements. This loop does 64 additions and 64 doublings to + // calculate the result. 
+ ge_p3_0(h); + + for (i = 63; i < 64; i--) { + unsigned j; + signed char index = 0; + + for (j = 0; j < 4; j++) { + const uint8_t bit = 1 & (a[(8 * j) + (i / 8)] >> (i & 7)); + index |= (bit << j); + } + + ge_precomp e; + ge_precomp_0(&e); + + for (j = 1; j < 16; j++) { + cmov(&e, &multiples[j-1], 1&constant_time_eq_w(index, j)); + } + + ge_cached cached; + ge_p1p1 r; + x25519_ge_p3_to_cached(&cached, h); + x25519_ge_add(&r, h, &cached); + x25519_ge_p1p1_to_p3(h, &r); + + ge_madd(&r, h, &e); + x25519_ge_p1p1_to_p3(h, &r); + } +} + +void x25519_ge_scalarmult_base(ge_p3 *h, const uint8_t a[32], int use_adx) { + (void)use_adx; + x25519_ge_scalarmult_small_precomp(h, a, k25519SmallPrecomp); +} + +#else + +static void table_select(ge_precomp *t, const int pos, const signed char b) { + uint8_t bnegative = constant_time_msb_w(b); + uint8_t babs = b - ((bnegative & b) << 1); + + uint8_t t_bytes[3][32] = { + {constant_time_is_zero_w(b) & 1}, {constant_time_is_zero_w(b) & 1}, {0}}; +#if defined(__clang__) // materialize for vectorization, 6% speedup + __asm__("" : "+m" (t_bytes) : /*no inputs*/); +#endif + OPENSSL_STATIC_ASSERT(sizeof(t_bytes) == sizeof(k25519Precomp[pos][0]), ""); + for (int i = 0; i < 8; i++) { + constant_time_conditional_memxor(t_bytes, k25519Precomp[pos][i], + sizeof(t_bytes), + constant_time_eq_w(babs, 1 + i)); + } + + fe yplusx, yminusx, xy2d; + fe_frombytes_strict(&yplusx, t_bytes[0]); + fe_frombytes_strict(&yminusx, t_bytes[1]); + fe_frombytes_strict(&xy2d, t_bytes[2]); + + fe_copy_lt(&t->yplusx, &yplusx); + fe_copy_lt(&t->yminusx, &yminusx); + fe_copy_lt(&t->xy2d, &xy2d); + + ge_precomp minust; + fe_copy_lt(&minust.yplusx, &yminusx); + fe_copy_lt(&minust.yminusx, &yplusx); + fe_neg(&minust.xy2d, &xy2d); + cmov(t, &minust, bnegative>>7); +} + +// h = a * B +// where a = a[0]+256*a[1]+...+256^31 a[31] +// B is the Ed25519 base point (x,4/5) with x positive. 
+// +// Preconditions: +// a[31] <= 127 +void x25519_ge_scalarmult_base(ge_p3 *h, const uint8_t a[32], int use_adx) { +#if defined(BORINGSSL_FE25519_ADX) + if (use_adx) { + uint8_t t[4][32]; + x25519_ge_scalarmult_base_adx(t, a); + fiat_25519_from_bytes(h->X.v, t[0]); + fiat_25519_from_bytes(h->Y.v, t[1]); + fiat_25519_from_bytes(h->Z.v, t[2]); + fiat_25519_from_bytes(h->T.v, t[3]); + return; + } +#else + (void)use_adx; +#endif + signed char e[64]; + signed char carry; + ge_p1p1 r; + ge_p2 s; + ge_precomp t; + int i; + + for (i = 0; i < 32; ++i) { + e[2 * i + 0] = (a[i] >> 0) & 15; + e[2 * i + 1] = (a[i] >> 4) & 15; + } + // each e[i] is between 0 and 15 + // e[63] is between 0 and 7 + + carry = 0; + for (i = 0; i < 63; ++i) { + e[i] += carry; + carry = e[i] + 8; + carry >>= 4; + e[i] -= carry << 4; + } + e[63] += carry; + // each e[i] is between -8 and 8 + + ge_p3_0(h); + for (i = 1; i < 64; i += 2) { + table_select(&t, i / 2, e[i]); + ge_madd(&r, h, &t); + x25519_ge_p1p1_to_p3(h, &r); + } + + ge_p3_dbl(&r, h); + x25519_ge_p1p1_to_p2(&s, &r); + ge_p2_dbl(&r, &s); + x25519_ge_p1p1_to_p2(&s, &r); + ge_p2_dbl(&r, &s); + x25519_ge_p1p1_to_p2(&s, &r); + ge_p2_dbl(&r, &s); + x25519_ge_p1p1_to_p3(h, &r); + + for (i = 0; i < 64; i += 2) { + table_select(&t, i / 2, e[i]); + ge_madd(&r, h, &t); + x25519_ge_p1p1_to_p3(h, &r); + } +} + +#endif + +static void slide(signed char *r, const uint8_t *a) { + int i; + int b; + int k; + + for (i = 0; i < 256; ++i) { + r[i] = 1 & (a[i >> 3] >> (i & 7)); + } + + for (i = 0; i < 256; ++i) { + if (r[i]) { + for (b = 1; b <= 6 && i + b < 256; ++b) { + if (r[i + b]) { + if (r[i] + (r[i + b] << b) <= 15) { + r[i] += r[i + b] << b; + r[i + b] = 0; + } else if (r[i] - (r[i + b] << b) >= -15) { + r[i] -= r[i + b] << b; + for (k = i + b; k < 256; ++k) { + if (!r[k]) { + r[k] = 1; + break; + } + r[k] = 0; + } + } else { + break; + } + } + } + } + } +} + +// r = a * A + b * B +// where a = a[0]+256*a[1]+...+256^31 a[31]. +// and b = b[0]+256*b[1]+...+256^31 b[31]. +// B is the Ed25519 base point (x,4/5) with x positive. 
+static void ge_double_scalarmult_vartime(ge_p2 *r, const uint8_t *a, + const ge_p3 *A, const uint8_t *b) { + signed char aslide[256]; + signed char bslide[256]; + ge_cached Ai[8]; // A,3A,5A,7A,9A,11A,13A,15A + ge_p1p1 t; + ge_p3 u; + ge_p3 A2; + int i; + + slide(aslide, a); + slide(bslide, b); + + x25519_ge_p3_to_cached(&Ai[0], A); + ge_p3_dbl(&t, A); + x25519_ge_p1p1_to_p3(&A2, &t); + x25519_ge_add(&t, &A2, &Ai[0]); + x25519_ge_p1p1_to_p3(&u, &t); + x25519_ge_p3_to_cached(&Ai[1], &u); + x25519_ge_add(&t, &A2, &Ai[1]); + x25519_ge_p1p1_to_p3(&u, &t); + x25519_ge_p3_to_cached(&Ai[2], &u); + x25519_ge_add(&t, &A2, &Ai[2]); + x25519_ge_p1p1_to_p3(&u, &t); + x25519_ge_p3_to_cached(&Ai[3], &u); + x25519_ge_add(&t, &A2, &Ai[3]); + x25519_ge_p1p1_to_p3(&u, &t); + x25519_ge_p3_to_cached(&Ai[4], &u); + x25519_ge_add(&t, &A2, &Ai[4]); + x25519_ge_p1p1_to_p3(&u, &t); + x25519_ge_p3_to_cached(&Ai[5], &u); + x25519_ge_add(&t, &A2, &Ai[5]); + x25519_ge_p1p1_to_p3(&u, &t); + x25519_ge_p3_to_cached(&Ai[6], &u); + x25519_ge_add(&t, &A2, &Ai[6]); + x25519_ge_p1p1_to_p3(&u, &t); + x25519_ge_p3_to_cached(&Ai[7], &u); + + ge_p2_0(r); + + for (i = 255; i >= 0; --i) { + if (aslide[i] || bslide[i]) { + break; + } + } + + for (; i >= 0; --i) { + ge_p2_dbl(&t, r); + + if (aslide[i] > 0) { + x25519_ge_p1p1_to_p3(&u, &t); + x25519_ge_add(&t, &u, &Ai[aslide[i] / 2]); + } else if (aslide[i] < 0) { + x25519_ge_p1p1_to_p3(&u, &t); + x25519_ge_sub(&t, &u, &Ai[(-aslide[i]) / 2]); + } + + if (bslide[i] > 0) { + x25519_ge_p1p1_to_p3(&u, &t); + ge_madd(&t, &u, &Bi[bslide[i] / 2]); + } else if (bslide[i] < 0) { + x25519_ge_p1p1_to_p3(&u, &t); + ge_msub(&t, &u, &Bi[(-bslide[i]) / 2]); + } + + x25519_ge_p1p1_to_p2(r, &t); + } +} + +// int64_lshift21 returns |a << 21| but is defined when shifting bits into the +// sign bit. This works around a language flaw in C. +static inline int64_t int64_lshift21(int64_t a) { + return (int64_t)((uint64_t)a << 21); +} + +// The set of scalars is \Z/l +// where l = 2^252 + 27742317777372353535851937790883648493. + +// Input: +// s[0]+256*s[1]+...+256^63*s[63] = s +// +// Output: +// s[0]+256*s[1]+...+256^31*s[31] = s mod l +// where l = 2^252 + 27742317777372353535851937790883648493. +// Overwrites s in place. 
+void x25519_sc_reduce(uint8_t s[64]) { + int64_t s0 = 2097151 & load_3(s); + int64_t s1 = 2097151 & (load_4(s + 2) >> 5); + int64_t s2 = 2097151 & (load_3(s + 5) >> 2); + int64_t s3 = 2097151 & (load_4(s + 7) >> 7); + int64_t s4 = 2097151 & (load_4(s + 10) >> 4); + int64_t s5 = 2097151 & (load_3(s + 13) >> 1); + int64_t s6 = 2097151 & (load_4(s + 15) >> 6); + int64_t s7 = 2097151 & (load_3(s + 18) >> 3); + int64_t s8 = 2097151 & load_3(s + 21); + int64_t s9 = 2097151 & (load_4(s + 23) >> 5); + int64_t s10 = 2097151 & (load_3(s + 26) >> 2); + int64_t s11 = 2097151 & (load_4(s + 28) >> 7); + int64_t s12 = 2097151 & (load_4(s + 31) >> 4); + int64_t s13 = 2097151 & (load_3(s + 34) >> 1); + int64_t s14 = 2097151 & (load_4(s + 36) >> 6); + int64_t s15 = 2097151 & (load_3(s + 39) >> 3); + int64_t s16 = 2097151 & load_3(s + 42); + int64_t s17 = 2097151 & (load_4(s + 44) >> 5); + int64_t s18 = 2097151 & (load_3(s + 47) >> 2); + int64_t s19 = 2097151 & (load_4(s + 49) >> 7); + int64_t s20 = 2097151 & (load_4(s + 52) >> 4); + int64_t s21 = 2097151 & (load_3(s + 55) >> 1); + int64_t s22 = 2097151 & (load_4(s + 57) >> 6); + int64_t s23 = (load_4(s + 60) >> 3); + int64_t carry0; + int64_t carry1; + int64_t carry2; + int64_t carry3; + int64_t carry4; + int64_t carry5; + int64_t carry6; + int64_t carry7; + int64_t carry8; + int64_t carry9; + int64_t carry10; + int64_t carry11; + int64_t carry12; + int64_t carry13; + int64_t carry14; + int64_t carry15; + int64_t carry16; + + s11 += s23 * 666643; + s12 += s23 * 470296; + s13 += s23 * 654183; + s14 -= s23 * 997805; + s15 += s23 * 136657; + s16 -= s23 * 683901; + s23 = 0; + + s10 += s22 * 666643; + s11 += s22 * 470296; + s12 += s22 * 654183; + s13 -= s22 * 997805; + s14 += s22 * 136657; + s15 -= s22 * 683901; + s22 = 0; + + s9 += s21 * 666643; + s10 += s21 * 470296; + s11 += s21 * 654183; + s12 -= s21 * 997805; + s13 += s21 * 136657; + s14 -= s21 * 683901; + s21 = 0; + + s8 += s20 * 666643; + s9 += s20 * 470296; + s10 += s20 * 654183; + s11 -= s20 * 997805; + s12 += s20 * 136657; + s13 -= s20 * 683901; + s20 = 0; + + s7 += s19 * 666643; + s8 += s19 * 470296; + s9 += s19 * 654183; + s10 -= s19 * 997805; + s11 += s19 * 136657; + s12 -= s19 * 683901; + s19 = 0; + + s6 += s18 * 666643; + s7 += s18 * 470296; + s8 += s18 * 654183; + s9 -= s18 * 997805; + s10 += s18 * 136657; + s11 -= s18 * 683901; + s18 = 0; + + carry6 = (s6 + (1 << 20)) >> 21; + s7 += carry6; + s6 -= int64_lshift21(carry6); + carry8 = (s8 + (1 << 20)) >> 21; + s9 += carry8; + s8 -= int64_lshift21(carry8); + carry10 = (s10 + (1 << 20)) >> 21; + s11 += carry10; + s10 -= int64_lshift21(carry10); + carry12 = (s12 + (1 << 20)) >> 21; + s13 += carry12; + s12 -= int64_lshift21(carry12); + carry14 = (s14 + (1 << 20)) >> 21; + s15 += carry14; + s14 -= int64_lshift21(carry14); + carry16 = (s16 + (1 << 20)) >> 21; + s17 += carry16; + s16 -= int64_lshift21(carry16); + + carry7 = (s7 + (1 << 20)) >> 21; + s8 += carry7; + s7 -= int64_lshift21(carry7); + carry9 = (s9 + (1 << 20)) >> 21; + s10 += carry9; + s9 -= int64_lshift21(carry9); + carry11 = (s11 + (1 << 20)) >> 21; + s12 += carry11; + s11 -= int64_lshift21(carry11); + carry13 = (s13 + (1 << 20)) >> 21; + s14 += carry13; + s13 -= int64_lshift21(carry13); + carry15 = (s15 + (1 << 20)) >> 21; + s16 += carry15; + s15 -= int64_lshift21(carry15); + + s5 += s17 * 666643; + s6 += s17 * 470296; + s7 += s17 * 654183; + s8 -= s17 * 997805; + s9 += s17 * 136657; + s10 -= s17 * 683901; + s17 = 0; + + s4 += s16 * 666643; + s5 += s16 * 470296; + s6 += s16 * 654183; + s7 
-= s16 * 997805; + s8 += s16 * 136657; + s9 -= s16 * 683901; + s16 = 0; + + s3 += s15 * 666643; + s4 += s15 * 470296; + s5 += s15 * 654183; + s6 -= s15 * 997805; + s7 += s15 * 136657; + s8 -= s15 * 683901; + s15 = 0; + + s2 += s14 * 666643; + s3 += s14 * 470296; + s4 += s14 * 654183; + s5 -= s14 * 997805; + s6 += s14 * 136657; + s7 -= s14 * 683901; + s14 = 0; + + s1 += s13 * 666643; + s2 += s13 * 470296; + s3 += s13 * 654183; + s4 -= s13 * 997805; + s5 += s13 * 136657; + s6 -= s13 * 683901; + s13 = 0; + + s0 += s12 * 666643; + s1 += s12 * 470296; + s2 += s12 * 654183; + s3 -= s12 * 997805; + s4 += s12 * 136657; + s5 -= s12 * 683901; + s12 = 0; + + carry0 = (s0 + (1 << 20)) >> 21; + s1 += carry0; + s0 -= int64_lshift21(carry0); + carry2 = (s2 + (1 << 20)) >> 21; + s3 += carry2; + s2 -= int64_lshift21(carry2); + carry4 = (s4 + (1 << 20)) >> 21; + s5 += carry4; + s4 -= int64_lshift21(carry4); + carry6 = (s6 + (1 << 20)) >> 21; + s7 += carry6; + s6 -= int64_lshift21(carry6); + carry8 = (s8 + (1 << 20)) >> 21; + s9 += carry8; + s8 -= int64_lshift21(carry8); + carry10 = (s10 + (1 << 20)) >> 21; + s11 += carry10; + s10 -= int64_lshift21(carry10); + + carry1 = (s1 + (1 << 20)) >> 21; + s2 += carry1; + s1 -= int64_lshift21(carry1); + carry3 = (s3 + (1 << 20)) >> 21; + s4 += carry3; + s3 -= int64_lshift21(carry3); + carry5 = (s5 + (1 << 20)) >> 21; + s6 += carry5; + s5 -= int64_lshift21(carry5); + carry7 = (s7 + (1 << 20)) >> 21; + s8 += carry7; + s7 -= int64_lshift21(carry7); + carry9 = (s9 + (1 << 20)) >> 21; + s10 += carry9; + s9 -= int64_lshift21(carry9); + carry11 = (s11 + (1 << 20)) >> 21; + s12 += carry11; + s11 -= int64_lshift21(carry11); + + s0 += s12 * 666643; + s1 += s12 * 470296; + s2 += s12 * 654183; + s3 -= s12 * 997805; + s4 += s12 * 136657; + s5 -= s12 * 683901; + s12 = 0; + + carry0 = s0 >> 21; + s1 += carry0; + s0 -= int64_lshift21(carry0); + carry1 = s1 >> 21; + s2 += carry1; + s1 -= int64_lshift21(carry1); + carry2 = s2 >> 21; + s3 += carry2; + s2 -= int64_lshift21(carry2); + carry3 = s3 >> 21; + s4 += carry3; + s3 -= int64_lshift21(carry3); + carry4 = s4 >> 21; + s5 += carry4; + s4 -= int64_lshift21(carry4); + carry5 = s5 >> 21; + s6 += carry5; + s5 -= int64_lshift21(carry5); + carry6 = s6 >> 21; + s7 += carry6; + s6 -= int64_lshift21(carry6); + carry7 = s7 >> 21; + s8 += carry7; + s7 -= int64_lshift21(carry7); + carry8 = s8 >> 21; + s9 += carry8; + s8 -= int64_lshift21(carry8); + carry9 = s9 >> 21; + s10 += carry9; + s9 -= int64_lshift21(carry9); + carry10 = s10 >> 21; + s11 += carry10; + s10 -= int64_lshift21(carry10); + carry11 = s11 >> 21; + s12 += carry11; + s11 -= int64_lshift21(carry11); + + s0 += s12 * 666643; + s1 += s12 * 470296; + s2 += s12 * 654183; + s3 -= s12 * 997805; + s4 += s12 * 136657; + s5 -= s12 * 683901; + s12 = 0; + + carry0 = s0 >> 21; + s1 += carry0; + s0 -= int64_lshift21(carry0); + carry1 = s1 >> 21; + s2 += carry1; + s1 -= int64_lshift21(carry1); + carry2 = s2 >> 21; + s3 += carry2; + s2 -= int64_lshift21(carry2); + carry3 = s3 >> 21; + s4 += carry3; + s3 -= int64_lshift21(carry3); + carry4 = s4 >> 21; + s5 += carry4; + s4 -= int64_lshift21(carry4); + carry5 = s5 >> 21; + s6 += carry5; + s5 -= int64_lshift21(carry5); + carry6 = s6 >> 21; + s7 += carry6; + s6 -= int64_lshift21(carry6); + carry7 = s7 >> 21; + s8 += carry7; + s7 -= int64_lshift21(carry7); + carry8 = s8 >> 21; + s9 += carry8; + s8 -= int64_lshift21(carry8); + carry9 = s9 >> 21; + s10 += carry9; + s9 -= int64_lshift21(carry9); + carry10 = s10 >> 21; + s11 += carry10; + s10 -= 
int64_lshift21(carry10); + + s[0] = s0 >> 0; + s[1] = s0 >> 8; + s[2] = (s0 >> 16) | (s1 << 5); + s[3] = s1 >> 3; + s[4] = s1 >> 11; + s[5] = (s1 >> 19) | (s2 << 2); + s[6] = s2 >> 6; + s[7] = (s2 >> 14) | (s3 << 7); + s[8] = s3 >> 1; + s[9] = s3 >> 9; + s[10] = (s3 >> 17) | (s4 << 4); + s[11] = s4 >> 4; + s[12] = s4 >> 12; + s[13] = (s4 >> 20) | (s5 << 1); + s[14] = s5 >> 7; + s[15] = (s5 >> 15) | (s6 << 6); + s[16] = s6 >> 2; + s[17] = s6 >> 10; + s[18] = (s6 >> 18) | (s7 << 3); + s[19] = s7 >> 5; + s[20] = s7 >> 13; + s[21] = s8 >> 0; + s[22] = s8 >> 8; + s[23] = (s8 >> 16) | (s9 << 5); + s[24] = s9 >> 3; + s[25] = s9 >> 11; + s[26] = (s9 >> 19) | (s10 << 2); + s[27] = s10 >> 6; + s[28] = (s10 >> 14) | (s11 << 7); + s[29] = s11 >> 1; + s[30] = s11 >> 9; + s[31] = s11 >> 17; +} + +// Input: +// a[0]+256*a[1]+...+256^31*a[31] = a +// b[0]+256*b[1]+...+256^31*b[31] = b +// c[0]+256*c[1]+...+256^31*c[31] = c +// +// Output: +// s[0]+256*s[1]+...+256^31*s[31] = (ab+c) mod l +// where l = 2^252 + 27742317777372353535851937790883648493. +static void sc_muladd(uint8_t *s, const uint8_t *a, const uint8_t *b, + const uint8_t *c) { + int64_t a0 = 2097151 & load_3(a); + int64_t a1 = 2097151 & (load_4(a + 2) >> 5); + int64_t a2 = 2097151 & (load_3(a + 5) >> 2); + int64_t a3 = 2097151 & (load_4(a + 7) >> 7); + int64_t a4 = 2097151 & (load_4(a + 10) >> 4); + int64_t a5 = 2097151 & (load_3(a + 13) >> 1); + int64_t a6 = 2097151 & (load_4(a + 15) >> 6); + int64_t a7 = 2097151 & (load_3(a + 18) >> 3); + int64_t a8 = 2097151 & load_3(a + 21); + int64_t a9 = 2097151 & (load_4(a + 23) >> 5); + int64_t a10 = 2097151 & (load_3(a + 26) >> 2); + int64_t a11 = (load_4(a + 28) >> 7); + int64_t b0 = 2097151 & load_3(b); + int64_t b1 = 2097151 & (load_4(b + 2) >> 5); + int64_t b2 = 2097151 & (load_3(b + 5) >> 2); + int64_t b3 = 2097151 & (load_4(b + 7) >> 7); + int64_t b4 = 2097151 & (load_4(b + 10) >> 4); + int64_t b5 = 2097151 & (load_3(b + 13) >> 1); + int64_t b6 = 2097151 & (load_4(b + 15) >> 6); + int64_t b7 = 2097151 & (load_3(b + 18) >> 3); + int64_t b8 = 2097151 & load_3(b + 21); + int64_t b9 = 2097151 & (load_4(b + 23) >> 5); + int64_t b10 = 2097151 & (load_3(b + 26) >> 2); + int64_t b11 = (load_4(b + 28) >> 7); + int64_t c0 = 2097151 & load_3(c); + int64_t c1 = 2097151 & (load_4(c + 2) >> 5); + int64_t c2 = 2097151 & (load_3(c + 5) >> 2); + int64_t c3 = 2097151 & (load_4(c + 7) >> 7); + int64_t c4 = 2097151 & (load_4(c + 10) >> 4); + int64_t c5 = 2097151 & (load_3(c + 13) >> 1); + int64_t c6 = 2097151 & (load_4(c + 15) >> 6); + int64_t c7 = 2097151 & (load_3(c + 18) >> 3); + int64_t c8 = 2097151 & load_3(c + 21); + int64_t c9 = 2097151 & (load_4(c + 23) >> 5); + int64_t c10 = 2097151 & (load_3(c + 26) >> 2); + int64_t c11 = (load_4(c + 28) >> 7); + int64_t s0; + int64_t s1; + int64_t s2; + int64_t s3; + int64_t s4; + int64_t s5; + int64_t s6; + int64_t s7; + int64_t s8; + int64_t s9; + int64_t s10; + int64_t s11; + int64_t s12; + int64_t s13; + int64_t s14; + int64_t s15; + int64_t s16; + int64_t s17; + int64_t s18; + int64_t s19; + int64_t s20; + int64_t s21; + int64_t s22; + int64_t s23; + int64_t carry0; + int64_t carry1; + int64_t carry2; + int64_t carry3; + int64_t carry4; + int64_t carry5; + int64_t carry6; + int64_t carry7; + int64_t carry8; + int64_t carry9; + int64_t carry10; + int64_t carry11; + int64_t carry12; + int64_t carry13; + int64_t carry14; + int64_t carry15; + int64_t carry16; + int64_t carry17; + int64_t carry18; + int64_t carry19; + int64_t carry20; + int64_t carry21; + int64_t 
carry22; + + s0 = c0 + a0 * b0; + s1 = c1 + a0 * b1 + a1 * b0; + s2 = c2 + a0 * b2 + a1 * b1 + a2 * b0; + s3 = c3 + a0 * b3 + a1 * b2 + a2 * b1 + a3 * b0; + s4 = c4 + a0 * b4 + a1 * b3 + a2 * b2 + a3 * b1 + a4 * b0; + s5 = c5 + a0 * b5 + a1 * b4 + a2 * b3 + a3 * b2 + a4 * b1 + a5 * b0; + s6 = c6 + a0 * b6 + a1 * b5 + a2 * b4 + a3 * b3 + a4 * b2 + a5 * b1 + a6 * b0; + s7 = c7 + a0 * b7 + a1 * b6 + a2 * b5 + a3 * b4 + a4 * b3 + a5 * b2 + + a6 * b1 + a7 * b0; + s8 = c8 + a0 * b8 + a1 * b7 + a2 * b6 + a3 * b5 + a4 * b4 + a5 * b3 + + a6 * b2 + a7 * b1 + a8 * b0; + s9 = c9 + a0 * b9 + a1 * b8 + a2 * b7 + a3 * b6 + a4 * b5 + a5 * b4 + + a6 * b3 + a7 * b2 + a8 * b1 + a9 * b0; + s10 = c10 + a0 * b10 + a1 * b9 + a2 * b8 + a3 * b7 + a4 * b6 + a5 * b5 + + a6 * b4 + a7 * b3 + a8 * b2 + a9 * b1 + a10 * b0; + s11 = c11 + a0 * b11 + a1 * b10 + a2 * b9 + a3 * b8 + a4 * b7 + a5 * b6 + + a6 * b5 + a7 * b4 + a8 * b3 + a9 * b2 + a10 * b1 + a11 * b0; + s12 = a1 * b11 + a2 * b10 + a3 * b9 + a4 * b8 + a5 * b7 + a6 * b6 + a7 * b5 + + a8 * b4 + a9 * b3 + a10 * b2 + a11 * b1; + s13 = a2 * b11 + a3 * b10 + a4 * b9 + a5 * b8 + a6 * b7 + a7 * b6 + a8 * b5 + + a9 * b4 + a10 * b3 + a11 * b2; + s14 = a3 * b11 + a4 * b10 + a5 * b9 + a6 * b8 + a7 * b7 + a8 * b6 + a9 * b5 + + a10 * b4 + a11 * b3; + s15 = a4 * b11 + a5 * b10 + a6 * b9 + a7 * b8 + a8 * b7 + a9 * b6 + a10 * b5 + + a11 * b4; + s16 = a5 * b11 + a6 * b10 + a7 * b9 + a8 * b8 + a9 * b7 + a10 * b6 + a11 * b5; + s17 = a6 * b11 + a7 * b10 + a8 * b9 + a9 * b8 + a10 * b7 + a11 * b6; + s18 = a7 * b11 + a8 * b10 + a9 * b9 + a10 * b8 + a11 * b7; + s19 = a8 * b11 + a9 * b10 + a10 * b9 + a11 * b8; + s20 = a9 * b11 + a10 * b10 + a11 * b9; + s21 = a10 * b11 + a11 * b10; + s22 = a11 * b11; + s23 = 0; + + carry0 = (s0 + (1 << 20)) >> 21; + s1 += carry0; + s0 -= int64_lshift21(carry0); + carry2 = (s2 + (1 << 20)) >> 21; + s3 += carry2; + s2 -= int64_lshift21(carry2); + carry4 = (s4 + (1 << 20)) >> 21; + s5 += carry4; + s4 -= int64_lshift21(carry4); + carry6 = (s6 + (1 << 20)) >> 21; + s7 += carry6; + s6 -= int64_lshift21(carry6); + carry8 = (s8 + (1 << 20)) >> 21; + s9 += carry8; + s8 -= int64_lshift21(carry8); + carry10 = (s10 + (1 << 20)) >> 21; + s11 += carry10; + s10 -= int64_lshift21(carry10); + carry12 = (s12 + (1 << 20)) >> 21; + s13 += carry12; + s12 -= int64_lshift21(carry12); + carry14 = (s14 + (1 << 20)) >> 21; + s15 += carry14; + s14 -= int64_lshift21(carry14); + carry16 = (s16 + (1 << 20)) >> 21; + s17 += carry16; + s16 -= int64_lshift21(carry16); + carry18 = (s18 + (1 << 20)) >> 21; + s19 += carry18; + s18 -= int64_lshift21(carry18); + carry20 = (s20 + (1 << 20)) >> 21; + s21 += carry20; + s20 -= int64_lshift21(carry20); + carry22 = (s22 + (1 << 20)) >> 21; + s23 += carry22; + s22 -= int64_lshift21(carry22); + + carry1 = (s1 + (1 << 20)) >> 21; + s2 += carry1; + s1 -= int64_lshift21(carry1); + carry3 = (s3 + (1 << 20)) >> 21; + s4 += carry3; + s3 -= int64_lshift21(carry3); + carry5 = (s5 + (1 << 20)) >> 21; + s6 += carry5; + s5 -= int64_lshift21(carry5); + carry7 = (s7 + (1 << 20)) >> 21; + s8 += carry7; + s7 -= int64_lshift21(carry7); + carry9 = (s9 + (1 << 20)) >> 21; + s10 += carry9; + s9 -= int64_lshift21(carry9); + carry11 = (s11 + (1 << 20)) >> 21; + s12 += carry11; + s11 -= int64_lshift21(carry11); + carry13 = (s13 + (1 << 20)) >> 21; + s14 += carry13; + s13 -= int64_lshift21(carry13); + carry15 = (s15 + (1 << 20)) >> 21; + s16 += carry15; + s15 -= int64_lshift21(carry15); + carry17 = (s17 + (1 << 20)) >> 21; + s18 += carry17; + s17 -= int64_lshift21(carry17); 
+ carry19 = (s19 + (1 << 20)) >> 21; + s20 += carry19; + s19 -= int64_lshift21(carry19); + carry21 = (s21 + (1 << 20)) >> 21; + s22 += carry21; + s21 -= int64_lshift21(carry21); + + s11 += s23 * 666643; + s12 += s23 * 470296; + s13 += s23 * 654183; + s14 -= s23 * 997805; + s15 += s23 * 136657; + s16 -= s23 * 683901; + s23 = 0; + + s10 += s22 * 666643; + s11 += s22 * 470296; + s12 += s22 * 654183; + s13 -= s22 * 997805; + s14 += s22 * 136657; + s15 -= s22 * 683901; + s22 = 0; + + s9 += s21 * 666643; + s10 += s21 * 470296; + s11 += s21 * 654183; + s12 -= s21 * 997805; + s13 += s21 * 136657; + s14 -= s21 * 683901; + s21 = 0; + + s8 += s20 * 666643; + s9 += s20 * 470296; + s10 += s20 * 654183; + s11 -= s20 * 997805; + s12 += s20 * 136657; + s13 -= s20 * 683901; + s20 = 0; + + s7 += s19 * 666643; + s8 += s19 * 470296; + s9 += s19 * 654183; + s10 -= s19 * 997805; + s11 += s19 * 136657; + s12 -= s19 * 683901; + s19 = 0; + + s6 += s18 * 666643; + s7 += s18 * 470296; + s8 += s18 * 654183; + s9 -= s18 * 997805; + s10 += s18 * 136657; + s11 -= s18 * 683901; + s18 = 0; + + carry6 = (s6 + (1 << 20)) >> 21; + s7 += carry6; + s6 -= int64_lshift21(carry6); + carry8 = (s8 + (1 << 20)) >> 21; + s9 += carry8; + s8 -= int64_lshift21(carry8); + carry10 = (s10 + (1 << 20)) >> 21; + s11 += carry10; + s10 -= int64_lshift21(carry10); + carry12 = (s12 + (1 << 20)) >> 21; + s13 += carry12; + s12 -= int64_lshift21(carry12); + carry14 = (s14 + (1 << 20)) >> 21; + s15 += carry14; + s14 -= int64_lshift21(carry14); + carry16 = (s16 + (1 << 20)) >> 21; + s17 += carry16; + s16 -= int64_lshift21(carry16); + + carry7 = (s7 + (1 << 20)) >> 21; + s8 += carry7; + s7 -= int64_lshift21(carry7); + carry9 = (s9 + (1 << 20)) >> 21; + s10 += carry9; + s9 -= int64_lshift21(carry9); + carry11 = (s11 + (1 << 20)) >> 21; + s12 += carry11; + s11 -= int64_lshift21(carry11); + carry13 = (s13 + (1 << 20)) >> 21; + s14 += carry13; + s13 -= int64_lshift21(carry13); + carry15 = (s15 + (1 << 20)) >> 21; + s16 += carry15; + s15 -= int64_lshift21(carry15); + + s5 += s17 * 666643; + s6 += s17 * 470296; + s7 += s17 * 654183; + s8 -= s17 * 997805; + s9 += s17 * 136657; + s10 -= s17 * 683901; + s17 = 0; + + s4 += s16 * 666643; + s5 += s16 * 470296; + s6 += s16 * 654183; + s7 -= s16 * 997805; + s8 += s16 * 136657; + s9 -= s16 * 683901; + s16 = 0; + + s3 += s15 * 666643; + s4 += s15 * 470296; + s5 += s15 * 654183; + s6 -= s15 * 997805; + s7 += s15 * 136657; + s8 -= s15 * 683901; + s15 = 0; + + s2 += s14 * 666643; + s3 += s14 * 470296; + s4 += s14 * 654183; + s5 -= s14 * 997805; + s6 += s14 * 136657; + s7 -= s14 * 683901; + s14 = 0; + + s1 += s13 * 666643; + s2 += s13 * 470296; + s3 += s13 * 654183; + s4 -= s13 * 997805; + s5 += s13 * 136657; + s6 -= s13 * 683901; + s13 = 0; + + s0 += s12 * 666643; + s1 += s12 * 470296; + s2 += s12 * 654183; + s3 -= s12 * 997805; + s4 += s12 * 136657; + s5 -= s12 * 683901; + s12 = 0; + + carry0 = (s0 + (1 << 20)) >> 21; + s1 += carry0; + s0 -= int64_lshift21(carry0); + carry2 = (s2 + (1 << 20)) >> 21; + s3 += carry2; + s2 -= int64_lshift21(carry2); + carry4 = (s4 + (1 << 20)) >> 21; + s5 += carry4; + s4 -= int64_lshift21(carry4); + carry6 = (s6 + (1 << 20)) >> 21; + s7 += carry6; + s6 -= int64_lshift21(carry6); + carry8 = (s8 + (1 << 20)) >> 21; + s9 += carry8; + s8 -= int64_lshift21(carry8); + carry10 = (s10 + (1 << 20)) >> 21; + s11 += carry10; + s10 -= int64_lshift21(carry10); + + carry1 = (s1 + (1 << 20)) >> 21; + s2 += carry1; + s1 -= int64_lshift21(carry1); + carry3 = (s3 + (1 << 20)) >> 21; + s4 += carry3; + s3 -= 
int64_lshift21(carry3); + carry5 = (s5 + (1 << 20)) >> 21; + s6 += carry5; + s5 -= int64_lshift21(carry5); + carry7 = (s7 + (1 << 20)) >> 21; + s8 += carry7; + s7 -= int64_lshift21(carry7); + carry9 = (s9 + (1 << 20)) >> 21; + s10 += carry9; + s9 -= int64_lshift21(carry9); + carry11 = (s11 + (1 << 20)) >> 21; + s12 += carry11; + s11 -= int64_lshift21(carry11); + + s0 += s12 * 666643; + s1 += s12 * 470296; + s2 += s12 * 654183; + s3 -= s12 * 997805; + s4 += s12 * 136657; + s5 -= s12 * 683901; + s12 = 0; + + carry0 = s0 >> 21; + s1 += carry0; + s0 -= int64_lshift21(carry0); + carry1 = s1 >> 21; + s2 += carry1; + s1 -= int64_lshift21(carry1); + carry2 = s2 >> 21; + s3 += carry2; + s2 -= int64_lshift21(carry2); + carry3 = s3 >> 21; + s4 += carry3; + s3 -= int64_lshift21(carry3); + carry4 = s4 >> 21; + s5 += carry4; + s4 -= int64_lshift21(carry4); + carry5 = s5 >> 21; + s6 += carry5; + s5 -= int64_lshift21(carry5); + carry6 = s6 >> 21; + s7 += carry6; + s6 -= int64_lshift21(carry6); + carry7 = s7 >> 21; + s8 += carry7; + s7 -= int64_lshift21(carry7); + carry8 = s8 >> 21; + s9 += carry8; + s8 -= int64_lshift21(carry8); + carry9 = s9 >> 21; + s10 += carry9; + s9 -= int64_lshift21(carry9); + carry10 = s10 >> 21; + s11 += carry10; + s10 -= int64_lshift21(carry10); + carry11 = s11 >> 21; + s12 += carry11; + s11 -= int64_lshift21(carry11); + + s0 += s12 * 666643; + s1 += s12 * 470296; + s2 += s12 * 654183; + s3 -= s12 * 997805; + s4 += s12 * 136657; + s5 -= s12 * 683901; + s12 = 0; + + carry0 = s0 >> 21; + s1 += carry0; + s0 -= int64_lshift21(carry0); + carry1 = s1 >> 21; + s2 += carry1; + s1 -= int64_lshift21(carry1); + carry2 = s2 >> 21; + s3 += carry2; + s2 -= int64_lshift21(carry2); + carry3 = s3 >> 21; + s4 += carry3; + s3 -= int64_lshift21(carry3); + carry4 = s4 >> 21; + s5 += carry4; + s4 -= int64_lshift21(carry4); + carry5 = s5 >> 21; + s6 += carry5; + s5 -= int64_lshift21(carry5); + carry6 = s6 >> 21; + s7 += carry6; + s6 -= int64_lshift21(carry6); + carry7 = s7 >> 21; + s8 += carry7; + s7 -= int64_lshift21(carry7); + carry8 = s8 >> 21; + s9 += carry8; + s8 -= int64_lshift21(carry8); + carry9 = s9 >> 21; + s10 += carry9; + s9 -= int64_lshift21(carry9); + carry10 = s10 >> 21; + s11 += carry10; + s10 -= int64_lshift21(carry10); + + s[0] = s0 >> 0; + s[1] = s0 >> 8; + s[2] = (s0 >> 16) | (s1 << 5); + s[3] = s1 >> 3; + s[4] = s1 >> 11; + s[5] = (s1 >> 19) | (s2 << 2); + s[6] = s2 >> 6; + s[7] = (s2 >> 14) | (s3 << 7); + s[8] = s3 >> 1; + s[9] = s3 >> 9; + s[10] = (s3 >> 17) | (s4 << 4); + s[11] = s4 >> 4; + s[12] = s4 >> 12; + s[13] = (s4 >> 20) | (s5 << 1); + s[14] = s5 >> 7; + s[15] = (s5 >> 15) | (s6 << 6); + s[16] = s6 >> 2; + s[17] = s6 >> 10; + s[18] = (s6 >> 18) | (s7 << 3); + s[19] = s7 >> 5; + s[20] = s7 >> 13; + s[21] = s8 >> 0; + s[22] = s8 >> 8; + s[23] = (s8 >> 16) | (s9 << 5); + s[24] = s9 >> 3; + s[25] = s9 >> 11; + s[26] = (s9 >> 19) | (s10 << 2); + s[27] = s10 >> 6; + s[28] = (s10 >> 14) | (s11 << 7); + s[29] = s11 >> 1; + s[30] = s11 >> 9; + s[31] = s11 >> 17; +} + + +void x25519_scalar_mult_generic_masked(uint8_t out[32], + const uint8_t scalar_masked[32], + const uint8_t point[32]) { + fe x1, x2, z2, x3, z3, tmp0, tmp1; + fe_loose x2l, z2l, x3l, tmp0l, tmp1l; + + uint8_t e[32]; + OPENSSL_memcpy(e, scalar_masked, 32); + // The following implementation was transcribed to Coq and proven to + // correspond to unary scalar multiplication in affine coordinates given that + // x1 != 0 is the x coordinate of some point on the curve. 
It was also checked + // in Coq that doing a ladderstep with x1 = x3 = 0 gives z2' = z3' = 0, and z2 + // = z3 = 0 gives z2' = z3' = 0. The statement was quantified over the + // underlying field, so it applies to Curve25519 itself and the quadratic + // twist of Curve25519. It was not proven in Coq that prime-field arithmetic + // correctly simulates extension-field arithmetic on prime-field values. + // The decoding of the byte array representation of e was not considered. + // Specification of Montgomery curves in affine coordinates: + // + // Proof that these form a group that is isomorphic to a Weierstrass curve: + // + // Coq transcription and correctness proof of the loop (where scalarbits=255): + // + // + // preconditions: 0 <= e < 2^255 (not necessarily e < order), fe_invert(0) = 0 + fe_frombytes(&x1, point); + fe_1(&x2); + fe_0(&z2); + fe_copy(&x3, &x1); + fe_1(&z3); + + unsigned swap = 0; + int pos; + for (pos = 254; pos >= 0; --pos) { + // loop invariant as of right before the test, for the case where x1 != 0: + // pos >= -1; if z2 = 0 then x2 is nonzero; if z3 = 0 then x3 is nonzero + // let r := e >> (pos+1) in the following equalities of projective points: + // to_xz (r*P) === if swap then (x3, z3) else (x2, z2) + // to_xz ((r+1)*P) === if swap then (x2, z2) else (x3, z3) + // x1 is the nonzero x coordinate of the nonzero point (r*P-(r+1)*P) + unsigned b = 1 & (e[pos / 8] >> (pos & 7)); + swap ^= b; + fe_cswap(&x2, &x3, swap); + fe_cswap(&z2, &z3, swap); + swap = b; + // Coq transcription of ladderstep formula (called from transcribed loop): + // + // + // x1 != 0 + // x1 = 0 + fe_sub(&tmp0l, &x3, &z3); + fe_sub(&tmp1l, &x2, &z2); + fe_add(&x2l, &x2, &z2); + fe_add(&z2l, &x3, &z3); + fe_mul_tll(&z3, &tmp0l, &x2l); + fe_mul_tll(&z2, &z2l, &tmp1l); + fe_sq_tl(&tmp0, &tmp1l); + fe_sq_tl(&tmp1, &x2l); + fe_add(&x3l, &z3, &z2); + fe_sub(&z2l, &z3, &z2); + fe_mul_ttt(&x2, &tmp1, &tmp0); + fe_sub(&tmp1l, &tmp1, &tmp0); + fe_sq_tl(&z2, &z2l); + fe_mul121666(&z3, &tmp1l); + fe_sq_tl(&x3, &x3l); + fe_add(&tmp0l, &tmp0, &z3); + fe_mul_ttt(&z3, &x1, &z2); + fe_mul_tll(&z2, &tmp1l, &tmp0l); + } + // here pos=-1, so r=e, so to_xz (e*P) === if swap then (x3, z3) else (x2, z2) + fe_cswap(&x2, &x3, swap); + fe_cswap(&z2, &z3, swap); + + fe_invert(&z2, &z2); + fe_mul_ttt(&x2, &x2, &z2); + fe_tobytes(out, &x2); +} + +void x25519_public_from_private_generic_masked(uint8_t out_public_value[32], + const uint8_t private_key_masked[32], + int use_adx) { + uint8_t e[32]; + OPENSSL_memcpy(e, private_key_masked, 32); + + ge_p3 A; + x25519_ge_scalarmult_base(&A, e, use_adx); + + // We only need the u-coordinate of the curve25519 point. The map is + // u=(y+1)/(1-y). Since y=Y/Z, this gives u=(Z+Y)/(Z-Y). 
+ fe_loose zplusy, zminusy; + fe zminusy_inv; + fe_add(&zplusy, &A.Z, &A.Y); + fe_sub(&zminusy, &A.Z, &A.Y); + fe_loose_invert(&zminusy_inv, &zminusy); + fe_mul_tlt(&zminusy_inv, &zplusy, &zminusy_inv); + fe_tobytes(out_public_value, &zminusy_inv); + CONSTTIME_DECLASSIFY(out_public_value, 32); +} + +void x25519_fe_invert(fe *out, const fe *z) { + fe_invert(out, z); +} + +uint8_t x25519_fe_isnegative(const fe *f) { + return (uint8_t)fe_isnegative(f); +} + +void x25519_fe_mul_ttt(fe *h, const fe *f, const fe *g) { + fe_mul_ttt(h, f, g); +} + +void x25519_fe_neg(fe *f) { + fe_loose t; + fe_neg(&t, f); + fe_carry(f, &t); +} + +void x25519_fe_tobytes(uint8_t s[32], const fe *h) { + fe_tobytes(s, h); +} + +void x25519_ge_double_scalarmult_vartime(ge_p2 *r, const uint8_t *a, + const ge_p3 *A, const uint8_t *b) { + ge_double_scalarmult_vartime(r, a, A, b); +} + +void x25519_sc_mask(uint8_t a[32]) { + a[0] &= 248; + a[31] &= 127; + a[31] |= 64; +} + +void x25519_sc_muladd(uint8_t *s, const uint8_t *a, const uint8_t *b, + const uint8_t *c) { + sc_muladd(s, a, b, c); +} diff --git a/ring-0.17.14/crypto/curve25519/curve25519_64_adx.c b/ring-0.17.14/crypto/curve25519/curve25519_64_adx.c new file mode 100644 index 0000000000..88964a9ddc --- /dev/null +++ b/ring-0.17.14/crypto/curve25519/curve25519_64_adx.c @@ -0,0 +1,23 @@ +// Copyright 2023 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "internal.h" +#if defined(BORINGSSL_FE25519_ADX) + +#pragma GCC diagnostic ignored "-Wconversion" +#pragma GCC diagnostic ignored "-Wpedantic" +#pragma GCC diagnostic ignored "-Wsign-conversion" + +#include "../../third_party/fiat/curve25519_64_adx.h" +#endif diff --git a/ring-0.17.14/crypto/curve25519/curve25519_tables.h b/ring-0.17.14/crypto/curve25519/curve25519_tables.h new file mode 100644 index 0000000000..72caa33783 --- /dev/null +++ b/ring-0.17.14/crypto/curve25519/curve25519_tables.h @@ -0,0 +1,3264 @@ +// Copyright 2020 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +// This file is generated from +// ./make_curve25519_tables.py > curve25519_tables.h + + +static const fe d = {{ +#if defined(OPENSSL_64_BIT) + 929955233495203, 466365720129213, 1662059464998953, 2033849074728123, + 1442794654840575 +#else + 56195235, 13857412, 51736253, 6949390, 114729, 24766616, 60832955, 30306712, + 48412415, 21499315 +#endif +}}; + +static const fe sqrtm1 = {{ +#if defined(OPENSSL_64_BIT) + 1718705420411056, 234908883556509, 2233514472574048, 2117202627021982, + 765476049583133 +#else + 34513072, 25610706, 9377949, 3500415, 12389472, 33281959, 41962654, + 31548777, 326685, 11406482 +#endif +}}; + +static const fe d2 = {{ +#if defined(OPENSSL_64_BIT) + 1859910466990425, 932731440258426, 1072319116312658, 1815898335770999, + 633789495995903 +#else + 45281625, 27714825, 36363642, 13898781, 229458, 15978800, 54557047, + 27058993, 29715967, 9444199 +#endif +}}; + +#if defined(OPENSSL_SMALL) + +// This block of code replaces the standard base-point table with a much smaller +// one. The standard table is 30,720 bytes while this one is just 960. +// +// This table contains 15 pairs of group elements, (x, y), where each field +// element is serialised with |fe_tobytes|. If |i| is the index of the group +// element then consider i+1 as a four-bit number: (i₀, i₁, i₂, i₃) (where i₀ +// is the most significant bit). The value of the group element is then: +// (i₀×2^192 + i₁×2^128 + i₂×2^64 + i₃)G, where G is the generator. +static const uint8_t k25519SmallPrecomp[15 * 2 * 32] = { + 0x1a, 0xd5, 0x25, 0x8f, 0x60, 0x2d, 0x56, 0xc9, 0xb2, 0xa7, 0x25, 0x95, + 0x60, 0xc7, 0x2c, 0x69, 0x5c, 0xdc, 0xd6, 0xfd, 0x31, 0xe2, 0xa4, 0xc0, + 0xfe, 0x53, 0x6e, 0xcd, 0xd3, 0x36, 0x69, 0x21, 0x58, 0x66, 0x66, 0x66, + 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, + 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, + 0x66, 0x66, 0x66, 0x66, 0x02, 0xa2, 0xed, 0xf4, 0x8f, 0x6b, 0x0b, 0x3e, + 0xeb, 0x35, 0x1a, 0xd5, 0x7e, 0xdb, 0x78, 0x00, 0x96, 0x8a, 0xa0, 0xb4, + 0xcf, 0x60, 0x4b, 0xd4, 0xd5, 0xf9, 0x2d, 0xbf, 0x88, 0xbd, 0x22, 0x62, + 0x13, 0x53, 0xe4, 0x82, 0x57, 0xfa, 0x1e, 0x8f, 0x06, 0x2b, 0x90, 0xba, + 0x08, 0xb6, 0x10, 0x54, 0x4f, 0x7c, 0x1b, 0x26, 0xed, 0xda, 0x6b, 0xdd, + 0x25, 0xd0, 0x4e, 0xea, 0x42, 0xbb, 0x25, 0x03, 0xa2, 0xfb, 0xcc, 0x61, + 0x67, 0x06, 0x70, 0x1a, 0xc4, 0x78, 0x3a, 0xff, 0x32, 0x62, 0xdd, 0x2c, + 0xab, 0x50, 0x19, 0x3b, 0xf2, 0x9b, 0x7d, 0xb8, 0xfd, 0x4f, 0x29, 0x9c, + 0xa7, 0x91, 0xba, 0x0e, 0x46, 0x5e, 0x51, 0xfe, 0x1d, 0xbf, 0xe5, 0xe5, + 0x9b, 0x95, 0x0d, 0x67, 0xf8, 0xd1, 0xb5, 0x5a, 0xa1, 0x93, 0x2c, 0xc3, + 0xde, 0x0e, 0x97, 0x85, 0x2d, 0x7f, 0xea, 0xab, 0x3e, 0x47, 0x30, 0x18, + 0x24, 0xe8, 0xb7, 0x60, 0xae, 0x47, 0x80, 0xfc, 0xe5, 0x23, 0xe7, 0xc2, + 0xc9, 0x85, 0xe6, 0x98, 0xa0, 0x29, 0x4e, 0xe1, 0x84, 0x39, 0x2d, 0x95, + 0x2c, 0xf3, 0x45, 0x3c, 0xff, 0xaf, 0x27, 0x4c, 0x6b, 0xa6, 0xf5, 0x4b, + 0x11, 0xbd, 0xba, 0x5b, 0x9e, 0xc4, 0xa4, 0x51, 0x1e, 0xbe, 0xd0, 0x90, + 0x3a, 0x9c, 0xc2, 0x26, 0xb6, 0x1e, 0xf1, 0x95, 0x7d, 0xc8, 0x6d, 0x52, + 0xe6, 0x99, 0x2c, 0x5f, 0x9a, 0x96, 0x0c, 0x68, 0x29, 0xfd, 0xe2, 0xfb, + 0xe6, 0xbc, 0xec, 0x31, 0x08, 0xec, 0xe6, 0xb0, 0x53, 0x60, 0xc3, 0x8c, + 0xbe, 0xc1, 0xb3, 0x8a, 0x8f, 0xe4, 0x88, 0x2b, 0x55, 0xe5, 0x64, 0x6e, + 0x9b, 0xd0, 0xaf, 0x7b, 0x64, 0x2a, 0x35, 0x25, 0x10, 0x52, 0xc5, 0x9e, + 0x58, 0x11, 0x39, 0x36, 0x45, 0x51, 0xb8, 0x39, 0x93, 0xfc, 0x9d, 0x6a, + 0xbe, 0x58, 0xcb, 0xa4, 0x0f, 0x51, 0x3c, 0x38, 0x05, 0xca, 0xab, 0x43, + 0x63, 0x0e, 0xf3, 0x8b, 0x41, 0xa6, 
0xf8, 0x9b, 0x53, 0x70, 0x80, 0x53, + 0x86, 0x5e, 0x8f, 0xe3, 0xc3, 0x0d, 0x18, 0xc8, 0x4b, 0x34, 0x1f, 0xd8, + 0x1d, 0xbc, 0xf2, 0x6d, 0x34, 0x3a, 0xbe, 0xdf, 0xd9, 0xf6, 0xf3, 0x89, + 0xa1, 0xe1, 0x94, 0x9f, 0x5d, 0x4c, 0x5d, 0xe9, 0xa1, 0x49, 0x92, 0xef, + 0x0e, 0x53, 0x81, 0x89, 0x58, 0x87, 0xa6, 0x37, 0xf1, 0xdd, 0x62, 0x60, + 0x63, 0x5a, 0x9d, 0x1b, 0x8c, 0xc6, 0x7d, 0x52, 0xea, 0x70, 0x09, 0x6a, + 0xe1, 0x32, 0xf3, 0x73, 0x21, 0x1f, 0x07, 0x7b, 0x7c, 0x9b, 0x49, 0xd8, + 0xc0, 0xf3, 0x25, 0x72, 0x6f, 0x9d, 0xed, 0x31, 0x67, 0x36, 0x36, 0x54, + 0x40, 0x92, 0x71, 0xe6, 0x11, 0x28, 0x11, 0xad, 0x93, 0x32, 0x85, 0x7b, + 0x3e, 0xb7, 0x3b, 0x49, 0x13, 0x1c, 0x07, 0xb0, 0x2e, 0x93, 0xaa, 0xfd, + 0xfd, 0x28, 0x47, 0x3d, 0x8d, 0xd2, 0xda, 0xc7, 0x44, 0xd6, 0x7a, 0xdb, + 0x26, 0x7d, 0x1d, 0xb8, 0xe1, 0xde, 0x9d, 0x7a, 0x7d, 0x17, 0x7e, 0x1c, + 0x37, 0x04, 0x8d, 0x2d, 0x7c, 0x5e, 0x18, 0x38, 0x1e, 0xaf, 0xc7, 0x1b, + 0x33, 0x48, 0x31, 0x00, 0x59, 0xf6, 0xf2, 0xca, 0x0f, 0x27, 0x1b, 0x63, + 0x12, 0x7e, 0x02, 0x1d, 0x49, 0xc0, 0x5d, 0x79, 0x87, 0xef, 0x5e, 0x7a, + 0x2f, 0x1f, 0x66, 0x55, 0xd8, 0x09, 0xd9, 0x61, 0x38, 0x68, 0xb0, 0x07, + 0xa3, 0xfc, 0xcc, 0x85, 0x10, 0x7f, 0x4c, 0x65, 0x65, 0xb3, 0xfa, 0xfa, + 0xa5, 0x53, 0x6f, 0xdb, 0x74, 0x4c, 0x56, 0x46, 0x03, 0xe2, 0xd5, 0x7a, + 0x29, 0x1c, 0xc6, 0x02, 0xbc, 0x59, 0xf2, 0x04, 0x75, 0x63, 0xc0, 0x84, + 0x2f, 0x60, 0x1c, 0x67, 0x76, 0xfd, 0x63, 0x86, 0xf3, 0xfa, 0xbf, 0xdc, + 0xd2, 0x2d, 0x90, 0x91, 0xbd, 0x33, 0xa9, 0xe5, 0x66, 0x0c, 0xda, 0x42, + 0x27, 0xca, 0xf4, 0x66, 0xc2, 0xec, 0x92, 0x14, 0x57, 0x06, 0x63, 0xd0, + 0x4d, 0x15, 0x06, 0xeb, 0x69, 0x58, 0x4f, 0x77, 0xc5, 0x8b, 0xc7, 0xf0, + 0x8e, 0xed, 0x64, 0xa0, 0xb3, 0x3c, 0x66, 0x71, 0xc6, 0x2d, 0xda, 0x0a, + 0x0d, 0xfe, 0x70, 0x27, 0x64, 0xf8, 0x27, 0xfa, 0xf6, 0x5f, 0x30, 0xa5, + 0x0d, 0x6c, 0xda, 0xf2, 0x62, 0x5e, 0x78, 0x47, 0xd3, 0x66, 0x00, 0x1c, + 0xfd, 0x56, 0x1f, 0x5d, 0x3f, 0x6f, 0xf4, 0x4c, 0xd8, 0xfd, 0x0e, 0x27, + 0xc9, 0x5c, 0x2b, 0xbc, 0xc0, 0xa4, 0xe7, 0x23, 0x29, 0x02, 0x9f, 0x31, + 0xd6, 0xe9, 0xd7, 0x96, 0xf4, 0xe0, 0x5e, 0x0b, 0x0e, 0x13, 0xee, 0x3c, + 0x09, 0xed, 0xf2, 0x3d, 0x76, 0x91, 0xc3, 0xa4, 0x97, 0xae, 0xd4, 0x87, + 0xd0, 0x5d, 0xf6, 0x18, 0x47, 0x1f, 0x1d, 0x67, 0xf2, 0xcf, 0x63, 0xa0, + 0x91, 0x27, 0xf8, 0x93, 0x45, 0x75, 0x23, 0x3f, 0xd1, 0xf1, 0xad, 0x23, + 0xdd, 0x64, 0x93, 0x96, 0x41, 0x70, 0x7f, 0xf7, 0xf5, 0xa9, 0x89, 0xa2, + 0x34, 0xb0, 0x8d, 0x1b, 0xae, 0x19, 0x15, 0x49, 0x58, 0x23, 0x6d, 0x87, + 0x15, 0x4f, 0x81, 0x76, 0xfb, 0x23, 0xb5, 0xea, 0xcf, 0xac, 0x54, 0x8d, + 0x4e, 0x42, 0x2f, 0xeb, 0x0f, 0x63, 0xdb, 0x68, 0x37, 0xa8, 0xcf, 0x8b, + 0xab, 0xf5, 0xa4, 0x6e, 0x96, 0x2a, 0xb2, 0xd6, 0xbe, 0x9e, 0xbd, 0x0d, + 0xb4, 0x42, 0xa9, 0xcf, 0x01, 0x83, 0x8a, 0x17, 0x47, 0x76, 0xc4, 0xc6, + 0x83, 0x04, 0x95, 0x0b, 0xfc, 0x11, 0xc9, 0x62, 0xb8, 0x0c, 0x76, 0x84, + 0xd9, 0xb9, 0x37, 0xfa, 0xfc, 0x7c, 0xc2, 0x6d, 0x58, 0x3e, 0xb3, 0x04, + 0xbb, 0x8c, 0x8f, 0x48, 0xbc, 0x91, 0x27, 0xcc, 0xf9, 0xb7, 0x22, 0x19, + 0x83, 0x2e, 0x09, 0xb5, 0x72, 0xd9, 0x54, 0x1c, 0x4d, 0xa1, 0xea, 0x0b, + 0xf1, 0xc6, 0x08, 0x72, 0x46, 0x87, 0x7a, 0x6e, 0x80, 0x56, 0x0a, 0x8a, + 0xc0, 0xdd, 0x11, 0x6b, 0xd6, 0xdd, 0x47, 0xdf, 0x10, 0xd9, 0xd8, 0xea, + 0x7c, 0xb0, 0x8f, 0x03, 0x00, 0x2e, 0xc1, 0x8f, 0x44, 0xa8, 0xd3, 0x30, + 0x06, 0x89, 0xa2, 0xf9, 0x34, 0xad, 0xdc, 0x03, 0x85, 0xed, 0x51, 0xa7, + 0x82, 0x9c, 0xe7, 0x5d, 0x52, 0x93, 0x0c, 0x32, 0x9a, 0x5b, 0xe1, 0xaa, + 0xca, 0xb8, 0x02, 0x6d, 0x3a, 0xd4, 0xb1, 0x3a, 0xf0, 0x5f, 0xbe, 0xb5, + 0x0d, 0x10, 0x6b, 0x38, 0x32, 0xac, 
0x76, 0x80, 0xbd, 0xca, 0x94, 0x71, + 0x7a, 0xf2, 0xc9, 0x35, 0x2a, 0xde, 0x9f, 0x42, 0x49, 0x18, 0x01, 0xab, + 0xbc, 0xef, 0x7c, 0x64, 0x3f, 0x58, 0x3d, 0x92, 0x59, 0xdb, 0x13, 0xdb, + 0x58, 0x6e, 0x0a, 0xe0, 0xb7, 0x91, 0x4a, 0x08, 0x20, 0xd6, 0x2e, 0x3c, + 0x45, 0xc9, 0x8b, 0x17, 0x79, 0xe7, 0xc7, 0x90, 0x99, 0x3a, 0x18, 0x25, +}; + +#else + +// k25519Precomp[i][j] = (j+1)*256^i*B +const uint8_t k25519Precomp[32][8][3][32] = { + { + { + {0x85, 0x3b, 0x8c, 0xf5, 0xc6, 0x93, 0xbc, 0x2f, 0x19, 0xe, 0x8c, + 0xfb, 0xc6, 0x2d, 0x93, 0xcf, 0xc2, 0x42, 0x3d, 0x64, 0x98, 0x48, + 0xb, 0x27, 0x65, 0xba, 0xd4, 0x33, 0x3a, 0x9d, 0xcf, 0x7}, + {0x3e, 0x91, 0x40, 0xd7, 0x5, 0x39, 0x10, 0x9d, 0xb3, 0xbe, 0x40, + 0xd1, 0x5, 0x9f, 0x39, 0xfd, 0x9, 0x8a, 0x8f, 0x68, 0x34, 0x84, + 0xc1, 0xa5, 0x67, 0x12, 0xf8, 0x98, 0x92, 0x2f, 0xfd, 0x44}, + {0x68, 0xaa, 0x7a, 0x87, 0x5, 0x12, 0xc9, 0xab, 0x9e, 0xc4, 0xaa, + 0xcc, 0x23, 0xe8, 0xd9, 0x26, 0x8c, 0x59, 0x43, 0xdd, 0xcb, 0x7d, + 0x1b, 0x5a, 0xa8, 0x65, 0xc, 0x9f, 0x68, 0x7b, 0x11, 0x6f}, + }, + { + {0xd7, 0x71, 0x3c, 0x93, 0xfc, 0xe7, 0x24, 0x92, 0xb5, 0xf5, 0xf, + 0x7a, 0x96, 0x9d, 0x46, 0x9f, 0x2, 0x7, 0xd6, 0xe1, 0x65, 0x9a, + 0xa6, 0x5a, 0x2e, 0x2e, 0x7d, 0xa8, 0x3f, 0x6, 0xc, 0x59}, + {0xa8, 0xd5, 0xb4, 0x42, 0x60, 0xa5, 0x99, 0x8a, 0xf6, 0xac, 0x60, + 0x4e, 0xc, 0x81, 0x2b, 0x8f, 0xaa, 0x37, 0x6e, 0xb1, 0x6b, 0x23, + 0x9e, 0xe0, 0x55, 0x25, 0xc9, 0x69, 0xa6, 0x95, 0xb5, 0x6b}, + {0x5f, 0x7a, 0x9b, 0xa5, 0xb3, 0xa8, 0xfa, 0x43, 0x78, 0xcf, 0x9a, + 0x5d, 0xdd, 0x6b, 0xc1, 0x36, 0x31, 0x6a, 0x3d, 0xb, 0x84, 0xa0, + 0xf, 0x50, 0x73, 0xb, 0xa5, 0x3e, 0xb1, 0xf5, 0x1a, 0x70}, + }, + { + {0x30, 0x97, 0xee, 0x4c, 0xa8, 0xb0, 0x25, 0xaf, 0x8a, 0x4b, 0x86, + 0xe8, 0x30, 0x84, 0x5a, 0x2, 0x32, 0x67, 0x1, 0x9f, 0x2, 0x50, + 0x1b, 0xc1, 0xf4, 0xf8, 0x80, 0x9a, 0x1b, 0x4e, 0x16, 0x7a}, + {0x65, 0xd2, 0xfc, 0xa4, 0xe8, 0x1f, 0x61, 0x56, 0x7d, 0xba, 0xc1, + 0xe5, 0xfd, 0x53, 0xd3, 0x3b, 0xbd, 0xd6, 0x4b, 0x21, 0x1a, 0xf3, + 0x31, 0x81, 0x62, 0xda, 0x5b, 0x55, 0x87, 0x15, 0xb9, 0x2a}, + {0x89, 0xd8, 0xd0, 0xd, 0x3f, 0x93, 0xae, 0x14, 0x62, 0xda, 0x35, + 0x1c, 0x22, 0x23, 0x94, 0x58, 0x4c, 0xdb, 0xf2, 0x8c, 0x45, 0xe5, + 0x70, 0xd1, 0xc6, 0xb4, 0xb9, 0x12, 0xaf, 0x26, 0x28, 0x5a}, + }, + { + {0x9f, 0x9, 0xfc, 0x8e, 0xb9, 0x51, 0x73, 0x28, 0x38, 0x25, 0xfd, + 0x7d, 0xf4, 0xc6, 0x65, 0x67, 0x65, 0x92, 0xa, 0xfb, 0x3d, 0x8d, + 0x34, 0xca, 0x27, 0x87, 0xe5, 0x21, 0x3, 0x91, 0xe, 0x68}, + {0xbf, 0x18, 0x68, 0x5, 0xa, 0x5, 0xfe, 0x95, 0xa9, 0xfa, 0x60, + 0x56, 0x71, 0x89, 0x7e, 0x32, 0x73, 0x50, 0xa0, 0x6, 0xcd, 0xe3, + 0xe8, 0xc3, 0x9a, 0xa4, 0x45, 0x74, 0x4c, 0x3f, 0x93, 0x27}, + {0x9, 0xff, 0x76, 0xc4, 0xe9, 0xfb, 0x13, 0x5a, 0x72, 0xc1, 0x5c, + 0x7b, 0x45, 0x39, 0x9e, 0x6e, 0x94, 0x44, 0x2b, 0x10, 0xf9, 0xdc, + 0xdb, 0x5d, 0x2b, 0x3e, 0x55, 0x63, 0xbf, 0xc, 0x9d, 0x7f}, + }, + { + {0x33, 0xbb, 0xa5, 0x8, 0x44, 0xbc, 0x12, 0xa2, 0x2, 0xed, 0x5e, + 0xc7, 0xc3, 0x48, 0x50, 0x8d, 0x44, 0xec, 0xbf, 0x5a, 0xc, 0xeb, + 0x1b, 0xdd, 0xeb, 0x6, 0xe2, 0x46, 0xf1, 0xcc, 0x45, 0x29}, + {0xba, 0xd6, 0x47, 0xa4, 0xc3, 0x82, 0x91, 0x7f, 0xb7, 0x29, 0x27, + 0x4b, 0xd1, 0x14, 0x0, 0xd5, 0x87, 0xa0, 0x64, 0xb8, 0x1c, 0xf1, + 0x3c, 0xe3, 0xf3, 0x55, 0x1b, 0xeb, 0x73, 0x7e, 0x4a, 0x15}, + {0x85, 0x82, 0x2a, 0x81, 0xf1, 0xdb, 0xbb, 0xbc, 0xfc, 0xd1, 0xbd, + 0xd0, 0x7, 0x8, 0xe, 0x27, 0x2d, 0xa7, 0xbd, 0x1b, 0xb, 0x67, + 0x1b, 0xb4, 0x9a, 0xb6, 0x3b, 0x6b, 0x69, 0xbe, 0xaa, 0x43}, + }, + { + {0x31, 0x71, 0x15, 0x77, 0xeb, 0xee, 0xc, 0x3a, 0x88, 0xaf, 0xc8, + 0x0, 0x89, 0x15, 0x27, 0x9b, 0x36, 
0xa7, 0x59, 0xda, 0x68, 0xb6, + 0x65, 0x80, 0xbd, 0x38, 0xcc, 0xa2, 0xb6, 0x7b, 0xe5, 0x51}, + {0xa4, 0x8c, 0x7d, 0x7b, 0xb6, 0x6, 0x98, 0x49, 0x39, 0x27, 0xd2, + 0x27, 0x84, 0xe2, 0x5b, 0x57, 0xb9, 0x53, 0x45, 0x20, 0xe7, 0x5c, + 0x8, 0xbb, 0x84, 0x78, 0x41, 0xae, 0x41, 0x4c, 0xb6, 0x38}, + {0x71, 0x4b, 0xea, 0x2, 0x67, 0x32, 0xac, 0x85, 0x1, 0xbb, 0xa1, + 0x41, 0x3, 0xe0, 0x70, 0xbe, 0x44, 0xc1, 0x3b, 0x8, 0x4b, 0xa2, + 0xe4, 0x53, 0xe3, 0x61, 0xd, 0x9f, 0x1a, 0xe9, 0xb8, 0x10}, + }, + { + {0xbf, 0xa3, 0x4e, 0x94, 0xd0, 0x5c, 0x1a, 0x6b, 0xd2, 0xc0, 0x9d, + 0xb3, 0x3a, 0x35, 0x70, 0x74, 0x49, 0x2e, 0x54, 0x28, 0x82, 0x52, + 0xb2, 0x71, 0x7e, 0x92, 0x3c, 0x28, 0x69, 0xea, 0x1b, 0x46}, + {0xb1, 0x21, 0x32, 0xaa, 0x9a, 0x2c, 0x6f, 0xba, 0xa7, 0x23, 0xba, + 0x3b, 0x53, 0x21, 0xa0, 0x6c, 0x3a, 0x2c, 0x19, 0x92, 0x4f, 0x76, + 0xea, 0x9d, 0xe0, 0x17, 0x53, 0x2e, 0x5d, 0xdd, 0x6e, 0x1d}, + {0xa2, 0xb3, 0xb8, 0x1, 0xc8, 0x6d, 0x83, 0xf1, 0x9a, 0xa4, 0x3e, + 0x5, 0x47, 0x5f, 0x3, 0xb3, 0xf3, 0xad, 0x77, 0x58, 0xba, 0x41, + 0x9c, 0x52, 0xa7, 0x90, 0xf, 0x6a, 0x1c, 0xbb, 0x9f, 0x7a}, + }, + { + {0x8f, 0x3e, 0xdd, 0x4, 0x66, 0x59, 0xb7, 0x59, 0x2c, 0x70, 0x88, + 0xe2, 0x77, 0x3, 0xb3, 0x6c, 0x23, 0xc3, 0xd9, 0x5e, 0x66, 0x9c, + 0x33, 0xb1, 0x2f, 0xe5, 0xbc, 0x61, 0x60, 0xe7, 0x15, 0x9}, + {0xd9, 0x34, 0x92, 0xf3, 0xed, 0x5d, 0xa7, 0xe2, 0xf9, 0x58, 0xb5, + 0xe1, 0x80, 0x76, 0x3d, 0x96, 0xfb, 0x23, 0x3c, 0x6e, 0xac, 0x41, + 0x27, 0x2c, 0xc3, 0x1, 0xe, 0x32, 0xa1, 0x24, 0x90, 0x3a}, + {0x1a, 0x91, 0xa2, 0xc9, 0xd9, 0xf5, 0xc1, 0xe7, 0xd7, 0xa7, 0xcc, + 0x8b, 0x78, 0x71, 0xa3, 0xb8, 0x32, 0x2a, 0xb6, 0xe, 0x19, 0x12, + 0x64, 0x63, 0x95, 0x4e, 0xcc, 0x2e, 0x5c, 0x7c, 0x90, 0x26}, + }, + }, + { + { + {0x1d, 0x9c, 0x2f, 0x63, 0xe, 0xdd, 0xcc, 0x2e, 0x15, 0x31, 0x89, + 0x76, 0x96, 0xb6, 0xd0, 0x51, 0x58, 0x7a, 0x63, 0xa8, 0x6b, 0xb7, + 0xdf, 0x52, 0x39, 0xef, 0xe, 0xa0, 0x49, 0x7d, 0xd3, 0x6d}, + {0x5e, 0x51, 0xaa, 0x49, 0x54, 0x63, 0x5b, 0xed, 0x3a, 0x82, 0xc6, + 0xb, 0x9f, 0xc4, 0x65, 0xa8, 0xc4, 0xd1, 0x42, 0x5b, 0xe9, 0x1f, + 0xc, 0x85, 0xb9, 0x15, 0xd3, 0x3, 0x6f, 0x6d, 0xd7, 0x30}, + {0xc7, 0xe4, 0x6, 0x21, 0x17, 0x44, 0x44, 0x6c, 0x69, 0x7f, 0x8d, + 0x92, 0x80, 0xd6, 0x53, 0xfb, 0x26, 0x3f, 0x4d, 0x69, 0xa4, 0x9e, + 0x73, 0xb4, 0xb0, 0x4b, 0x86, 0x2e, 0x11, 0x97, 0xc6, 0x10}, + }, + { + {0x5, 0xc8, 0x58, 0x83, 0xa0, 0x2a, 0xa6, 0xc, 0x47, 0x42, 0x20, + 0x7a, 0xe3, 0x4a, 0x3d, 0x6a, 0xdc, 0xed, 0x11, 0x3b, 0xa6, 0xd3, + 0x64, 0x74, 0xef, 0x6, 0x8, 0x55, 0xaf, 0x9b, 0xbf, 0x3}, + {0xde, 0x5f, 0xbe, 0x7d, 0x27, 0xc4, 0x93, 0x64, 0xa2, 0x7e, 0xad, + 0x19, 0xad, 0x4f, 0x5d, 0x26, 0x90, 0x45, 0x30, 0x46, 0xc8, 0xdf, + 0x0, 0xe, 0x9, 0xfe, 0x66, 0xed, 0xab, 0x1c, 0xe6, 0x25}, + {0x4, 0x66, 0x58, 0xcc, 0x28, 0xe1, 0x13, 0x3f, 0x7e, 0x74, 0x59, + 0xb4, 0xec, 0x73, 0x58, 0x6f, 0xf5, 0x68, 0x12, 0xcc, 0xed, 0x3d, + 0xb6, 0xa0, 0x2c, 0xe2, 0x86, 0x45, 0x63, 0x78, 0x6d, 0x56}, + }, + { + {0xd0, 0x2f, 0x5a, 0xc6, 0x85, 0x42, 0x5, 0xa1, 0xc3, 0x67, 0x16, + 0xf3, 0x2a, 0x11, 0x64, 0x6c, 0x58, 0xee, 0x1a, 0x73, 0x40, 0xe2, + 0xa, 0x68, 0x2a, 0xb2, 0x93, 0x47, 0xf3, 0xa5, 0xfb, 0x14}, + {0x34, 0x8, 0xc1, 0x9c, 0x9f, 0xa4, 0x37, 0x16, 0x51, 0xc4, 0x9b, + 0xa8, 0xd5, 0x56, 0x8e, 0xbc, 0xdb, 0xd2, 0x7f, 0x7f, 0xf, 0xec, + 0xb5, 0x1c, 0xd9, 0x35, 0xcc, 0x5e, 0xca, 0x5b, 0x97, 0x33}, + {0xd4, 0xf7, 0x85, 0x69, 0x16, 0x46, 0xd7, 0x3c, 0x57, 0x0, 0xc8, + 0xc9, 0x84, 0x5e, 0x3e, 0x59, 0x1e, 0x13, 0x61, 0x7b, 0xb6, 0xf2, + 0xc3, 0x2f, 0x6c, 0x52, 0xfc, 0x83, 0xea, 0x9c, 0x82, 0x14}, + }, + { + {0xb8, 0xec, 0x71, 0x4e, 0x2f, 
0xb, 0xe7, 0x21, 0xe3, 0x77, 0xa4, + 0x40, 0xb9, 0xdd, 0x56, 0xe6, 0x80, 0x4f, 0x1d, 0xce, 0xce, 0x56, + 0x65, 0xbf, 0x7e, 0x7b, 0x5d, 0x53, 0xc4, 0x3b, 0xfc, 0x5}, + {0xc2, 0x95, 0xdd, 0x97, 0x84, 0x7b, 0x43, 0xff, 0xa7, 0xb5, 0x4e, + 0xaa, 0x30, 0x4e, 0x74, 0x6c, 0x8b, 0xe8, 0x85, 0x3c, 0x61, 0x5d, + 0xc, 0x9e, 0x73, 0x81, 0x75, 0x5f, 0x1e, 0xc7, 0xd9, 0x2f}, + {0xdd, 0xde, 0xaf, 0x52, 0xae, 0xb3, 0xb8, 0x24, 0xcf, 0x30, 0x3b, + 0xed, 0x8c, 0x63, 0x95, 0x34, 0x95, 0x81, 0xbe, 0xa9, 0x83, 0xbc, + 0xa4, 0x33, 0x4, 0x1f, 0x65, 0x5c, 0x47, 0x67, 0x37, 0x37}, + }, + { + {0x90, 0x65, 0x24, 0x14, 0xcb, 0x95, 0x40, 0x63, 0x35, 0x55, 0xc1, + 0x16, 0x40, 0x14, 0x12, 0xef, 0x60, 0xbc, 0x10, 0x89, 0xc, 0x14, + 0x38, 0x9e, 0x8c, 0x7c, 0x90, 0x30, 0x57, 0x90, 0xf5, 0x6b}, + {0xd9, 0xad, 0xd1, 0x40, 0xfd, 0x99, 0xba, 0x2f, 0x27, 0xd0, 0xf4, + 0x96, 0x6f, 0x16, 0x7, 0xb3, 0xae, 0x3b, 0xf0, 0x15, 0x52, 0xf0, + 0x63, 0x43, 0x99, 0xf9, 0x18, 0x3b, 0x6c, 0xa5, 0xbe, 0x1f}, + {0x8a, 0x5b, 0x41, 0xe1, 0xf1, 0x78, 0xa7, 0xf, 0x7e, 0xa7, 0xc3, + 0xba, 0xf7, 0x9f, 0x40, 0x6, 0x50, 0x9a, 0xa2, 0x9a, 0xb8, 0xd7, + 0x52, 0x6f, 0x56, 0x5a, 0x63, 0x7a, 0xf6, 0x1c, 0x52, 0x2}, + }, + { + {0xe4, 0x5e, 0x2f, 0x77, 0x20, 0x67, 0x14, 0xb1, 0xce, 0x9a, 0x7, + 0x96, 0xb1, 0x94, 0xf8, 0xe8, 0x4a, 0x82, 0xac, 0x0, 0x4d, 0x22, + 0xf8, 0x4a, 0xc4, 0x6c, 0xcd, 0xf7, 0xd9, 0x53, 0x17, 0x0}, + {0x94, 0x52, 0x9d, 0xa, 0xb, 0xee, 0x3f, 0x51, 0x66, 0x5a, 0xdf, + 0xf, 0x5c, 0xe7, 0x98, 0x8f, 0xce, 0x7, 0xe1, 0xbf, 0x88, 0x86, + 0x61, 0xd4, 0xed, 0x2c, 0x38, 0x71, 0x7e, 0xa, 0xa0, 0x3f}, + {0x34, 0xdb, 0x3d, 0x96, 0x2d, 0x23, 0x69, 0x3c, 0x58, 0x38, 0x97, + 0xb4, 0xda, 0x87, 0xde, 0x1d, 0x85, 0xf2, 0x91, 0xa0, 0xf9, 0xd1, + 0xd7, 0xaa, 0xb6, 0xed, 0x48, 0xa0, 0x2f, 0xfe, 0xb5, 0x12}, + }, + { + {0x92, 0x1e, 0x6f, 0xad, 0x26, 0x7c, 0x2b, 0xdf, 0x13, 0x89, 0x4b, + 0x50, 0x23, 0xd3, 0x66, 0x4b, 0xc3, 0x8b, 0x1c, 0x75, 0xc0, 0x9d, + 0x40, 0x8c, 0xb8, 0xc7, 0x96, 0x7, 0xc2, 0x93, 0x7e, 0x6f}, + {0x4d, 0xe3, 0xfc, 0x96, 0xc4, 0xfb, 0xf0, 0x71, 0xed, 0x5b, 0xf3, + 0xad, 0x6b, 0x82, 0xb9, 0x73, 0x61, 0xc5, 0x28, 0xff, 0x61, 0x72, + 0x4, 0xd2, 0x6f, 0x20, 0xb1, 0x6f, 0xf9, 0x76, 0x9b, 0x74}, + {0x5, 0xae, 0xa6, 0xae, 0x4, 0xf6, 0x5a, 0x1f, 0x99, 0x9c, 0xe4, + 0xbe, 0xf1, 0x51, 0x23, 0xc1, 0x66, 0x6b, 0xff, 0xee, 0xb5, 0x8, + 0xa8, 0x61, 0x51, 0x21, 0xe0, 0x1, 0xf, 0xc1, 0xce, 0xf}, + }, + { + {0x45, 0x4e, 0x24, 0xc4, 0x9d, 0xd2, 0xf2, 0x3d, 0xa, 0xde, 0xd8, + 0x93, 0x74, 0xe, 0x2, 0x2b, 0x4d, 0x21, 0xc, 0x82, 0x7e, 0x6, + 0xc8, 0x6c, 0xa, 0xb9, 0xea, 0x6f, 0x16, 0x79, 0x37, 0x41}, + {0x44, 0x1e, 0xfe, 0x49, 0xa6, 0x58, 0x4d, 0x64, 0x7e, 0x77, 0xad, + 0x31, 0xa2, 0xae, 0xfc, 0x21, 0xd2, 0xd0, 0x7f, 0x88, 0x5a, 0x1c, + 0x44, 0x2, 0xf3, 0x11, 0xc5, 0x83, 0x71, 0xaa, 0x1, 0x49}, + {0xf0, 0xf8, 0x1a, 0x8c, 0x54, 0xb7, 0xb1, 0x8, 0xb4, 0x99, 0x62, + 0x24, 0x7c, 0x7a, 0xf, 0xce, 0x39, 0xd9, 0x6, 0x1e, 0xf9, 0xb0, + 0x60, 0xf7, 0x13, 0x12, 0x6d, 0x72, 0x7b, 0x88, 0xbb, 0x41}, + }, + }, + { + { + {0xae, 0x91, 0x66, 0x7c, 0x59, 0x4c, 0x23, 0x7e, 0xc8, 0xb4, 0x85, + 0xa, 0x3d, 0x9d, 0x88, 0x64, 0xe7, 0xfa, 0x4a, 0x35, 0xc, 0xc9, + 0xe2, 0xda, 0x1d, 0x9e, 0x6a, 0xc, 0x7, 0x1e, 0x87, 0xa}, + {0xbe, 0x46, 0x43, 0x74, 0x44, 0x7d, 0xe8, 0x40, 0x25, 0x2b, 0xb5, + 0x15, 0xd4, 0xda, 0x48, 0x1d, 0x3e, 0x60, 0x3b, 0xa1, 0x18, 0x8a, + 0x3a, 0x7c, 0xf7, 0xbd, 0xcd, 0x2f, 0xc1, 0x28, 0xb7, 0x4e}, + {0x89, 0x89, 0xbc, 0x4b, 0x99, 0xb5, 0x1, 0x33, 0x60, 0x42, 0xdd, + 0x5b, 0x3a, 0xae, 0x6b, 0x73, 0x3c, 0x9e, 0xd5, 0x19, 0xe2, 0xad, + 0x61, 0xd, 0x64, 0xd4, 0x85, 0x26, 
0xf, 0x30, 0xe7, 0x3e}, + }, + { + {0x18, 0x75, 0x1e, 0x84, 0x47, 0x79, 0xfa, 0x43, 0xd7, 0x46, 0x9c, + 0x63, 0x59, 0xfa, 0xc6, 0xe5, 0x74, 0x2b, 0x5, 0xe3, 0x1d, 0x5e, + 0x6, 0xa1, 0x30, 0x90, 0xb8, 0xcf, 0xa2, 0xc6, 0x47, 0x7d}, + {0xb7, 0xd6, 0x7d, 0x9e, 0xe4, 0x55, 0xd2, 0xf5, 0xac, 0x1e, 0xb, + 0x61, 0x5c, 0x11, 0x16, 0x80, 0xca, 0x87, 0xe1, 0x92, 0x5d, 0x97, + 0x99, 0x3c, 0xc2, 0x25, 0x91, 0x97, 0x62, 0x57, 0x81, 0x13}, + {0xe0, 0xd6, 0xf0, 0x8e, 0x14, 0xd0, 0xda, 0x3f, 0x3c, 0x6f, 0x54, + 0x91, 0x9a, 0x74, 0x3e, 0x9d, 0x57, 0x81, 0xbb, 0x26, 0x10, 0x62, + 0xec, 0x71, 0x80, 0xec, 0xc9, 0x34, 0x8d, 0xf5, 0x8c, 0x14}, + }, + { + {0x6d, 0x75, 0xe4, 0x9a, 0x7d, 0x2f, 0x57, 0xe2, 0x7f, 0x48, 0xf3, + 0x88, 0xbb, 0x45, 0xc3, 0x56, 0x8d, 0xa8, 0x60, 0x69, 0x6d, 0xb, + 0xd1, 0x9f, 0xb9, 0xa1, 0xae, 0x4e, 0xad, 0xeb, 0x8f, 0x27}, + {0x27, 0xf0, 0x34, 0x79, 0xf6, 0x92, 0xa4, 0x46, 0xa9, 0xa, 0x84, + 0xf6, 0xbe, 0x84, 0x99, 0x46, 0x54, 0x18, 0x61, 0x89, 0x2a, 0xbc, + 0xa1, 0x5c, 0xd4, 0xbb, 0x5d, 0xbd, 0x1e, 0xfa, 0xf2, 0x3f}, + {0x66, 0x39, 0x93, 0x8c, 0x1f, 0x68, 0xaa, 0xb1, 0x98, 0xc, 0x29, + 0x20, 0x9c, 0x94, 0x21, 0x8c, 0x52, 0x3c, 0x9d, 0x21, 0x91, 0x52, + 0x11, 0x39, 0x7b, 0x67, 0x9c, 0xfe, 0x2, 0xdd, 0x4, 0x41}, + }, + { + {0xb8, 0x6a, 0x9, 0xdb, 0x6, 0x4e, 0x21, 0x81, 0x35, 0x4f, 0xe4, + 0xc, 0xc9, 0xb6, 0xa8, 0x21, 0xf5, 0x2a, 0x9e, 0x40, 0x2a, 0xc1, + 0x24, 0x65, 0x81, 0xa4, 0xfc, 0x8e, 0xa4, 0xb5, 0x65, 0x1}, + {0x2a, 0x42, 0x24, 0x11, 0x5e, 0xbf, 0xb2, 0x72, 0xb5, 0x3a, 0xa3, + 0x98, 0x33, 0xc, 0xfa, 0xa1, 0x66, 0xb6, 0x52, 0xfa, 0x1, 0x61, + 0xcb, 0x94, 0xd5, 0x53, 0xaf, 0xaf, 0x0, 0x3b, 0x86, 0x2c}, + {0x76, 0x6a, 0x84, 0xa0, 0x74, 0xa4, 0x90, 0xf1, 0xc0, 0x7c, 0x2f, + 0xcd, 0x84, 0xf9, 0xef, 0x12, 0x8f, 0x2b, 0xaa, 0x58, 0x6, 0x29, + 0x5e, 0x69, 0xb8, 0xc8, 0xfe, 0xbf, 0xd9, 0x67, 0x1b, 0x59}, + }, + { + {0x5d, 0xb5, 0x18, 0x9f, 0x71, 0xb3, 0xb9, 0x99, 0x1e, 0x64, 0x8c, + 0xa1, 0xfa, 0xe5, 0x65, 0xe4, 0xed, 0x5, 0x9f, 0xc2, 0x36, 0x11, + 0x8, 0x61, 0x8b, 0x12, 0x30, 0x70, 0x86, 0x4f, 0x9b, 0x48}, + {0xfa, 0x9b, 0xb4, 0x80, 0x1c, 0xd, 0x2f, 0x31, 0x8a, 0xec, 0xf3, + 0xab, 0x5e, 0x51, 0x79, 0x59, 0x88, 0x1c, 0xf0, 0x9e, 0xc0, 0x33, + 0x70, 0x72, 0xcb, 0x7b, 0x8f, 0xca, 0xc7, 0x2e, 0xe0, 0x3d}, + {0xef, 0x92, 0xeb, 0x3a, 0x2d, 0x10, 0x32, 0xd2, 0x61, 0xa8, 0x16, + 0x61, 0xb4, 0x53, 0x62, 0xe1, 0x24, 0xaa, 0xb, 0x19, 0xe7, 0xab, + 0x7e, 0x3d, 0xbf, 0xbe, 0x6c, 0x49, 0xba, 0xfb, 0xf5, 0x49}, + }, + { + {0x2e, 0x57, 0x9c, 0x1e, 0x8c, 0x62, 0x5d, 0x15, 0x41, 0x47, 0x88, + 0xc5, 0xac, 0x86, 0x4d, 0x8a, 0xeb, 0x63, 0x57, 0x51, 0xf6, 0x52, + 0xa3, 0x91, 0x5b, 0x51, 0x67, 0x88, 0xc2, 0xa6, 0xa1, 0x6}, + {0xd4, 0xcf, 0x5b, 0x8a, 0x10, 0x9a, 0x94, 0x30, 0xeb, 0x73, 0x64, + 0xbc, 0x70, 0xdd, 0x40, 0xdc, 0x1c, 0xd, 0x7c, 0x30, 0xc1, 0x94, + 0xc2, 0x92, 0x74, 0x6e, 0xfa, 0xcb, 0x6d, 0xa8, 0x4, 0x56}, + {0xb6, 0x64, 0x17, 0x7c, 0xd4, 0xd1, 0x88, 0x72, 0x51, 0x8b, 0x41, + 0xe0, 0x40, 0x11, 0x54, 0x72, 0xd1, 0xf6, 0xac, 0x18, 0x60, 0x1a, + 0x3, 0x9f, 0xc6, 0x42, 0x27, 0xfe, 0x89, 0x9e, 0x98, 0x20}, + }, + { + {0x2e, 0xec, 0xea, 0x85, 0x8b, 0x27, 0x74, 0x16, 0xdf, 0x2b, 0xcb, + 0x7a, 0x7, 0xdc, 0x21, 0x56, 0x5a, 0xf4, 0xcb, 0x61, 0x16, 0x4c, + 0xa, 0x64, 0xd3, 0x95, 0x5, 0xf7, 0x50, 0x99, 0xb, 0x73}, + {0x7f, 0xcc, 0x2d, 0x3a, 0xfd, 0x77, 0x97, 0x49, 0x92, 0xd8, 0x4f, + 0xa5, 0x2c, 0x7c, 0x85, 0x32, 0xa0, 0xe3, 0x7, 0xd2, 0x64, 0xd8, + 0x79, 0xa2, 0x29, 0x7e, 0xa6, 0xc, 0x1d, 0xed, 0x3, 0x4}, + {0x52, 0xc5, 0x4e, 0x87, 0x35, 0x2d, 0x4b, 0xc9, 0x8d, 0x6f, 0x24, + 0x98, 0xcf, 0xc8, 0xe6, 0xc5, 0xce, 
0x35, 0xc0, 0x16, 0xfa, 0x46, + 0xcb, 0xf7, 0xcc, 0x3d, 0x30, 0x8, 0x43, 0x45, 0xd7, 0x5b}, + }, + { + {0x2a, 0x79, 0xe7, 0x15, 0x21, 0x93, 0xc4, 0x85, 0xc9, 0xdd, 0xcd, + 0xbd, 0xa2, 0x89, 0x4c, 0xc6, 0x62, 0xd7, 0xa3, 0xad, 0xa8, 0x3d, + 0x1e, 0x9d, 0x2c, 0xf8, 0x67, 0x30, 0x12, 0xdb, 0xb7, 0x5b}, + {0xc2, 0x4c, 0xb2, 0x28, 0x95, 0xd1, 0x9a, 0x7f, 0x81, 0xc1, 0x35, + 0x63, 0x65, 0x54, 0x6b, 0x7f, 0x36, 0x72, 0xc0, 0x4f, 0x6e, 0xb6, + 0xb8, 0x66, 0x83, 0xad, 0x80, 0x73, 0x0, 0x78, 0x3a, 0x13}, + {0xbe, 0x62, 0xca, 0xc6, 0x67, 0xf4, 0x61, 0x9, 0xee, 0x52, 0x19, + 0x21, 0xd6, 0x21, 0xec, 0x4, 0x70, 0x47, 0xd5, 0x9b, 0x77, 0x60, + 0x23, 0x18, 0xd2, 0xe0, 0xf0, 0x58, 0x6d, 0xca, 0xd, 0x74}, + }, + }, + { + { + {0x3c, 0x43, 0x78, 0x4, 0x57, 0x8c, 0x1a, 0x23, 0x9d, 0x43, 0x81, + 0xc2, 0xe, 0x27, 0xb5, 0xb7, 0x9f, 0x7, 0xd9, 0xe3, 0xea, 0x99, + 0xaa, 0xdb, 0xd9, 0x3, 0x2b, 0x6c, 0x25, 0xf5, 0x3, 0x2c}, + {0x4e, 0xce, 0xcf, 0x52, 0x7, 0xee, 0x48, 0xdf, 0xb7, 0x8, 0xec, + 0x6, 0xf3, 0xfa, 0xff, 0xc3, 0xc4, 0x59, 0x54, 0xb9, 0x2a, 0xb, + 0x71, 0x5, 0x8d, 0xa3, 0x3e, 0x96, 0xfa, 0x25, 0x1d, 0x16}, + {0x7d, 0xa4, 0x53, 0x7b, 0x75, 0x18, 0xf, 0x79, 0x79, 0x58, 0xc, + 0xcf, 0x30, 0x1, 0x7b, 0x30, 0xf9, 0xf7, 0x7e, 0x25, 0x77, 0x3d, + 0x90, 0x31, 0xaf, 0xbb, 0x96, 0xbd, 0xbd, 0x68, 0x94, 0x69}, + }, + { + {0x48, 0x19, 0xa9, 0x6a, 0xe6, 0x3d, 0xdd, 0xd8, 0xcc, 0xd2, 0xc0, + 0x2f, 0xc2, 0x64, 0x50, 0x48, 0x2f, 0xea, 0xfd, 0x34, 0x66, 0x24, + 0x48, 0x9b, 0x3a, 0x2e, 0x4a, 0x6c, 0x4e, 0x1c, 0x3e, 0x29}, + {0xcf, 0xfe, 0xda, 0xf4, 0x46, 0x2f, 0x1f, 0xbd, 0xf7, 0xd6, 0x7f, + 0xa4, 0x14, 0x1, 0xef, 0x7c, 0x7f, 0xb3, 0x47, 0x4a, 0xda, 0xfd, + 0x1f, 0xd3, 0x85, 0x57, 0x90, 0x73, 0xa4, 0x19, 0x52, 0x52}, + {0xe1, 0x12, 0x51, 0x92, 0x4b, 0x13, 0x6e, 0x37, 0xa0, 0x5d, 0xa1, + 0xdc, 0xb5, 0x78, 0x37, 0x70, 0x11, 0x31, 0x1c, 0x46, 0xaf, 0x89, + 0x45, 0xb0, 0x23, 0x28, 0x3, 0x7f, 0x44, 0x5c, 0x60, 0x5b}, + }, + { + {0x4c, 0xf0, 0xe7, 0xf0, 0xc6, 0xfe, 0xe9, 0x3b, 0x62, 0x49, 0xe3, + 0x75, 0x9e, 0x57, 0x6a, 0x86, 0x1a, 0xe6, 0x1d, 0x1e, 0x16, 0xef, + 0x42, 0x55, 0xd5, 0xbd, 0x5a, 0xcc, 0xf4, 0xfe, 0x12, 0x2f}, + {0x89, 0x7c, 0xc4, 0x20, 0x59, 0x80, 0x65, 0xb9, 0xcc, 0x8f, 0x3b, + 0x92, 0xc, 0x10, 0xf0, 0xe7, 0x77, 0xef, 0xe2, 0x2, 0x65, 0x25, + 0x1, 0x0, 0xee, 0xb3, 0xae, 0xa8, 0xce, 0x6d, 0xa7, 0x24}, + {0x40, 0xc7, 0xc0, 0xdf, 0xb2, 0x22, 0x45, 0xa, 0x7, 0xa4, 0xc9, + 0x40, 0x7f, 0x6e, 0xd0, 0x10, 0x68, 0xf6, 0xcf, 0x78, 0x41, 0x14, + 0xcf, 0xc6, 0x90, 0x37, 0xa4, 0x18, 0x25, 0x7b, 0x60, 0x5e}, + }, + { + {0x14, 0xcf, 0x96, 0xa5, 0x1c, 0x43, 0x2c, 0xa0, 0x0, 0xe4, 0xd3, + 0xae, 0x40, 0x2d, 0xc4, 0xe3, 0xdb, 0x26, 0xf, 0x2e, 0x80, 0x26, + 0x45, 0xd2, 0x68, 0x70, 0x45, 0x9e, 0x13, 0x33, 0x1f, 0x20}, + {0x18, 0x18, 0xdf, 0x6c, 0x8f, 0x1d, 0xb3, 0x58, 0xa2, 0x58, 0x62, + 0xc3, 0x4f, 0xa7, 0xcf, 0x35, 0x6e, 0x1d, 0xe6, 0x66, 0x4f, 0xff, + 0xb3, 0xe1, 0xf7, 0xd5, 0xcd, 0x6c, 0xab, 0xac, 0x67, 0x50}, + {0x51, 0x9d, 0x3, 0x8, 0x6b, 0x7f, 0x52, 0xfd, 0x6, 0x0, 0x7c, + 0x1, 0x64, 0x49, 0xb1, 0x18, 0xa8, 0xa4, 0x25, 0x2e, 0xb0, 0xe, + 0x22, 0xd5, 0x75, 0x3, 0x46, 0x62, 0x88, 0xba, 0x7c, 0x39}, + }, + { + {0xe7, 0x79, 0x13, 0xc8, 0xfb, 0xc3, 0x15, 0x78, 0xf1, 0x2a, 0xe1, + 0xdd, 0x20, 0x94, 0x61, 0xa6, 0xd5, 0xfd, 0xa8, 0x85, 0xf8, 0xc0, + 0xa9, 0xff, 0x52, 0xc2, 0xe1, 0xc1, 0x22, 0x40, 0x1b, 0x77}, + {0xb2, 0x59, 0x59, 0xf0, 0x93, 0x30, 0xc1, 0x30, 0x76, 0x79, 0xa9, + 0xe9, 0x8d, 0xa1, 0x3a, 0xe2, 0x26, 0x5e, 0x1d, 0x72, 0x91, 0xd4, + 0x2f, 0x22, 0x3a, 0x6c, 0x6e, 0x76, 0x20, 0xd3, 0x39, 0x23}, + {0xa7, 0x2f, 0x3a, 0x51, 0x86, 
0xd9, 0x7d, 0xd8, 0x8, 0xcf, 0xd4, + 0xf9, 0x71, 0x9b, 0xac, 0xf5, 0xb3, 0x83, 0xa2, 0x1e, 0x1b, 0xc3, + 0x6b, 0xd0, 0x76, 0x1a, 0x97, 0x19, 0x92, 0x18, 0x1a, 0x33}, + }, + { + {0xaf, 0x72, 0x75, 0x9d, 0x3a, 0x2f, 0x51, 0x26, 0x9e, 0x4a, 0x7, + 0x68, 0x88, 0xe2, 0xcb, 0x5b, 0xc4, 0xf7, 0x80, 0x11, 0xc1, 0xc1, + 0xed, 0x84, 0x7b, 0xa6, 0x49, 0xf6, 0x9f, 0x61, 0xc9, 0x1a}, + {0xc6, 0x80, 0x4f, 0xfb, 0x45, 0x6f, 0x16, 0xf5, 0xcf, 0x75, 0xc7, + 0x61, 0xde, 0xc7, 0x36, 0x9c, 0x1c, 0xd9, 0x41, 0x90, 0x1b, 0xe8, + 0xd4, 0xe3, 0x21, 0xfe, 0xbd, 0x83, 0x6b, 0x7c, 0x16, 0x31}, + {0x68, 0x10, 0x4b, 0x52, 0x42, 0x38, 0x2b, 0xf2, 0x87, 0xe9, 0x9c, + 0xee, 0x3b, 0x34, 0x68, 0x50, 0xc8, 0x50, 0x62, 0x4a, 0x84, 0x71, + 0x9d, 0xfc, 0x11, 0xb1, 0x8, 0x1f, 0x34, 0x36, 0x24, 0x61}, + }, + { + {0x38, 0x26, 0x2d, 0x1a, 0xe3, 0x49, 0x63, 0x8b, 0x35, 0xfd, 0xd3, + 0x9b, 0x0, 0xb7, 0xdf, 0x9d, 0xa4, 0x6b, 0xa0, 0xa3, 0xb8, 0xf1, + 0x8b, 0x7f, 0x45, 0x4, 0xd9, 0x78, 0x31, 0xaa, 0x22, 0x15}, + {0x8d, 0x89, 0x4e, 0x87, 0xdb, 0x41, 0x9d, 0xd9, 0x20, 0xdc, 0x7, + 0x6c, 0xf1, 0xa5, 0xfe, 0x9, 0xbc, 0x9b, 0xf, 0xd0, 0x67, 0x2c, + 0x3d, 0x79, 0x40, 0xff, 0x5e, 0x9e, 0x30, 0xe2, 0xeb, 0x46}, + {0x38, 0x49, 0x61, 0x69, 0x53, 0x2f, 0x38, 0x2c, 0x10, 0x6d, 0x2d, + 0xb7, 0x9a, 0x40, 0xfe, 0xda, 0x27, 0xf2, 0x46, 0xb6, 0x91, 0x33, + 0xc8, 0xe8, 0x6c, 0x30, 0x24, 0x5, 0xf5, 0x70, 0xfe, 0x45}, + }, + { + {0x91, 0x14, 0x95, 0xc8, 0x20, 0x49, 0xf2, 0x62, 0xa2, 0xc, 0x63, + 0x3f, 0xc8, 0x7, 0xf0, 0x5, 0xb8, 0xd4, 0xc9, 0xf5, 0xd2, 0x45, + 0xbb, 0x6f, 0x45, 0x22, 0x7a, 0xb5, 0x6d, 0x9f, 0x61, 0x16}, + {0x8c, 0xb, 0xc, 0x96, 0xa6, 0x75, 0x48, 0xda, 0x20, 0x2f, 0xe, + 0xef, 0x76, 0xd0, 0x68, 0x5b, 0xd4, 0x8f, 0xb, 0x3d, 0xcf, 0x51, + 0xfb, 0x7, 0xd4, 0x92, 0xe3, 0xa0, 0x23, 0x16, 0x8d, 0x42}, + {0xfd, 0x8, 0xa3, 0x1, 0x44, 0x4a, 0x4f, 0x8, 0xac, 0xca, 0xa5, + 0x76, 0xc3, 0x19, 0x22, 0xa8, 0x7d, 0xbc, 0xd1, 0x43, 0x46, 0xde, + 0xb8, 0xde, 0xc6, 0x38, 0xbd, 0x60, 0x2d, 0x59, 0x81, 0x1d}, + }, + }, + { + { + {0xe8, 0xc5, 0x85, 0x7b, 0x9f, 0xb6, 0x65, 0x87, 0xb2, 0xba, 0x68, + 0xd1, 0x8b, 0x67, 0xf0, 0x6f, 0x9b, 0xf, 0x33, 0x1d, 0x7c, 0xe7, + 0x70, 0x3a, 0x7c, 0x8e, 0xaf, 0xb0, 0x51, 0x6d, 0x5f, 0x3a}, + {0x5f, 0xac, 0xd, 0xa6, 0x56, 0x87, 0x36, 0x61, 0x57, 0xdc, 0xab, + 0xeb, 0x6a, 0x2f, 0xe0, 0x17, 0x7d, 0xf, 0xce, 0x4c, 0x2d, 0x3f, + 0x19, 0x7f, 0xf0, 0xdc, 0xec, 0x89, 0x77, 0x4a, 0x23, 0x20}, + {0x52, 0xb2, 0x78, 0x71, 0xb6, 0xd, 0xd2, 0x76, 0x60, 0xd1, 0x1e, + 0xd5, 0xf9, 0x34, 0x1c, 0x7, 0x70, 0x11, 0xe4, 0xb3, 0x20, 0x4a, + 0x2a, 0xf6, 0x66, 0xe3, 0xff, 0x3c, 0x35, 0x82, 0xd6, 0x7c}, + }, + { + {0xf3, 0xf4, 0xac, 0x68, 0x60, 0xcd, 0x65, 0xa6, 0xd3, 0xe3, 0xd7, + 0x3c, 0x18, 0x2d, 0xd9, 0x42, 0xd9, 0x25, 0x60, 0x33, 0x9d, 0x38, + 0x59, 0x57, 0xff, 0xd8, 0x2c, 0x2b, 0x3b, 0x25, 0xf0, 0x3e}, + {0xb6, 0xfa, 0x87, 0xd8, 0x5b, 0xa4, 0xe1, 0xb, 0x6e, 0x3b, 0x40, + 0xba, 0x32, 0x6a, 0x84, 0x2a, 0x0, 0x60, 0x6e, 0xe9, 0x12, 0x10, + 0x92, 0xd9, 0x43, 0x9, 0xdc, 0x3b, 0x86, 0xc8, 0x38, 0x28}, + {0x30, 0x50, 0x46, 0x4a, 0xcf, 0xb0, 0x6b, 0xd1, 0xab, 0x77, 0xc5, + 0x15, 0x41, 0x6b, 0x49, 0xfa, 0x9d, 0x41, 0xab, 0xf4, 0x8a, 0xae, + 0xcf, 0x82, 0x12, 0x28, 0xa8, 0x6, 0xa6, 0xb8, 0xdc, 0x21}, + }, + { + {0xba, 0x31, 0x77, 0xbe, 0xfa, 0x0, 0x8d, 0x9a, 0x89, 0x18, 0x9e, + 0x62, 0x7e, 0x60, 0x3, 0x82, 0x7f, 0xd9, 0xf3, 0x43, 0x37, 0x2, + 0xcc, 0xb2, 0x8b, 0x67, 0x6f, 0x6c, 0xbf, 0xd, 0x84, 0x5d}, + {0xc8, 0x9f, 0x9d, 0x8c, 0x46, 0x4, 0x60, 0x5c, 0xcb, 0xa3, 0x2a, + 0xd4, 0x6e, 0x9, 0x40, 0x25, 0x9c, 0x2f, 0xee, 0x12, 0x4c, 0x4d, + 0x5b, 0x12, 0xab, 
0x1d, 0xa3, 0x94, 0x81, 0xd0, 0xc3, 0xb}, + {0x8b, 0xe1, 0x9f, 0x30, 0xd, 0x38, 0x6e, 0x70, 0xc7, 0x65, 0xe1, + 0xb9, 0xa6, 0x2d, 0xb0, 0x6e, 0xab, 0x20, 0xae, 0x7d, 0x99, 0xba, + 0xbb, 0x57, 0xdd, 0x96, 0xc1, 0x2a, 0x23, 0x76, 0x42, 0x3a}, + }, + { + {0xcb, 0x7e, 0x44, 0xdb, 0x72, 0xc1, 0xf8, 0x3b, 0xbd, 0x2d, 0x28, + 0xc6, 0x1f, 0xc4, 0xcf, 0x5f, 0xfe, 0x15, 0xaa, 0x75, 0xc0, 0xff, + 0xac, 0x80, 0xf9, 0xa9, 0xe1, 0x24, 0xe8, 0xc9, 0x70, 0x7}, + {0xfa, 0x84, 0x70, 0x8a, 0x2c, 0x43, 0x42, 0x4b, 0x45, 0xe5, 0xb9, + 0xdf, 0xe3, 0x19, 0x8a, 0x89, 0x5d, 0xe4, 0x58, 0x9c, 0x21, 0x0, + 0x9f, 0xbe, 0xd1, 0xeb, 0x6d, 0xa1, 0xce, 0x77, 0xf1, 0x1f}, + {0xfd, 0xb5, 0xb5, 0x45, 0x9a, 0xd9, 0x61, 0xcf, 0x24, 0x79, 0x3a, + 0x1b, 0xe9, 0x84, 0x9, 0x86, 0x89, 0x3e, 0x3e, 0x30, 0x19, 0x9, + 0x30, 0xe7, 0x1e, 0xb, 0x50, 0x41, 0xfd, 0x64, 0xf2, 0x39}, + }, + { + {0xe1, 0x7b, 0x9, 0xfe, 0xab, 0x4a, 0x9b, 0xd1, 0x29, 0x19, 0xe0, + 0xdf, 0xe1, 0xfc, 0x6d, 0xa4, 0xff, 0xf1, 0xa6, 0x2c, 0x94, 0x8, + 0xc9, 0xc3, 0x4e, 0xf1, 0x35, 0x2c, 0x27, 0x21, 0xc6, 0x65}, + {0x9c, 0xe2, 0xe7, 0xdb, 0x17, 0x34, 0xad, 0xa7, 0x9c, 0x13, 0x9c, + 0x2b, 0x6a, 0x37, 0x94, 0xbd, 0xa9, 0x7b, 0x59, 0x93, 0x8e, 0x1b, + 0xe9, 0xa0, 0x40, 0x98, 0x88, 0x68, 0x34, 0xd7, 0x12, 0x17}, + {0xdd, 0x93, 0x31, 0xce, 0xf8, 0x89, 0x2b, 0xe7, 0xbb, 0xc0, 0x25, + 0xa1, 0x56, 0x33, 0x10, 0x4d, 0x83, 0xfe, 0x1c, 0x2e, 0x3d, 0xa9, + 0x19, 0x4, 0x72, 0xe2, 0x9c, 0xb1, 0xa, 0x80, 0xf9, 0x22}, + }, + { + {0xac, 0xfd, 0x6e, 0x9a, 0xdd, 0x9f, 0x2, 0x42, 0x41, 0x49, 0xa5, + 0x34, 0xbe, 0xce, 0x12, 0xb9, 0x7b, 0xf3, 0xbd, 0x87, 0xb9, 0x64, + 0xf, 0x64, 0xb4, 0xca, 0x98, 0x85, 0xd3, 0xa4, 0x71, 0x41}, + {0xcb, 0xf8, 0x9e, 0x3e, 0x8a, 0x36, 0x5a, 0x60, 0x15, 0x47, 0x50, + 0xa5, 0x22, 0xc0, 0xe9, 0xe3, 0x8f, 0x24, 0x24, 0x5f, 0xb0, 0x48, + 0x3d, 0x55, 0xe5, 0x26, 0x76, 0x64, 0xcd, 0x16, 0xf4, 0x13}, + {0x8c, 0x4c, 0xc9, 0x99, 0xaa, 0x58, 0x27, 0xfa, 0x7, 0xb8, 0x0, + 0xb0, 0x6f, 0x6f, 0x0, 0x23, 0x92, 0x53, 0xda, 0xad, 0xdd, 0x91, + 0xd2, 0xfb, 0xab, 0xd1, 0x4b, 0x57, 0xfa, 0x14, 0x82, 0x50}, + }, + { + {0xd6, 0x3, 0xd0, 0x53, 0xbb, 0x15, 0x1a, 0x46, 0x65, 0xc9, 0xf3, + 0xbc, 0x88, 0x28, 0x10, 0xb2, 0x5a, 0x3a, 0x68, 0x6c, 0x75, 0x76, + 0xc5, 0x27, 0x47, 0xb4, 0x6c, 0xc8, 0xa4, 0x58, 0x77, 0x3a}, + {0x4b, 0xfe, 0xd6, 0x3e, 0x15, 0x69, 0x2, 0xc2, 0xc4, 0x77, 0x1d, + 0x51, 0x39, 0x67, 0x5a, 0xa6, 0x94, 0xaf, 0x14, 0x2c, 0x46, 0x26, + 0xde, 0xcb, 0x4b, 0xa7, 0xab, 0x6f, 0xec, 0x60, 0xf9, 0x22}, + {0x76, 0x50, 0xae, 0x93, 0xf6, 0x11, 0x81, 0x54, 0xa6, 0x54, 0xfd, + 0x1d, 0xdf, 0x21, 0xae, 0x1d, 0x65, 0x5e, 0x11, 0xf3, 0x90, 0x8c, + 0x24, 0x12, 0x94, 0xf4, 0xe7, 0x8d, 0x5f, 0xd1, 0x9f, 0x5d}, + }, + { + {0x1e, 0x52, 0xd7, 0xee, 0x2a, 0x4d, 0x24, 0x3f, 0x15, 0x96, 0x2e, + 0x43, 0x28, 0x90, 0x3a, 0x8e, 0xd4, 0x16, 0x9c, 0x2e, 0x77, 0xba, + 0x64, 0xe1, 0xd8, 0x98, 0xeb, 0x47, 0xfa, 0x87, 0xc1, 0x3b}, + {0x7f, 0x72, 0x63, 0x6d, 0xd3, 0x8, 0x14, 0x3, 0x33, 0xb5, 0xc7, + 0xd7, 0xef, 0x9a, 0x37, 0x6a, 0x4b, 0xe2, 0xae, 0xcc, 0xc5, 0x8f, + 0xe1, 0xa9, 0xd3, 0xbe, 0x8f, 0x4f, 0x91, 0x35, 0x2f, 0x33}, + {0xc, 0xc2, 0x86, 0xea, 0x15, 0x1, 0x47, 0x6d, 0x25, 0xd1, 0x46, + 0x6c, 0xcb, 0xb7, 0x8a, 0x99, 0x88, 0x1, 0x66, 0x3a, 0xb5, 0x32, + 0x78, 0xd7, 0x3, 0xba, 0x6f, 0x90, 0xce, 0x81, 0xd, 0x45}, + }, + }, + { + { + {0x3f, 0x74, 0xae, 0x1c, 0x96, 0xd8, 0x74, 0xd0, 0xed, 0x63, 0x1c, + 0xee, 0xf5, 0x18, 0x6d, 0xf8, 0x29, 0xed, 0xf4, 0xe7, 0x5b, 0xc5, + 0xbd, 0x97, 0x8, 0xb1, 0x3a, 0x66, 0x79, 0xd2, 0xba, 0x4c}, + {0x75, 0x52, 0x20, 0xa6, 0xa1, 0xb6, 0x7b, 0x6e, 0x83, 0x8e, 0x3c, + 
0x41, 0xd7, 0x21, 0x4f, 0xaa, 0xb2, 0x5c, 0x8f, 0xe8, 0x55, 0xd1, + 0x56, 0x6f, 0xe1, 0x5b, 0x34, 0xa6, 0x4b, 0x5d, 0xe2, 0x2d}, + {0xcd, 0x1f, 0xd7, 0xa0, 0x24, 0x90, 0xd1, 0x80, 0xf8, 0x8a, 0x28, + 0xfb, 0xa, 0xc2, 0x25, 0xc5, 0x19, 0x64, 0x3a, 0x5f, 0x4b, 0x97, + 0xa3, 0xb1, 0x33, 0x72, 0x0, 0xe2, 0xef, 0xbc, 0x7f, 0x7d}, + }, + { + {0x94, 0x90, 0xc2, 0xf3, 0xc5, 0x5d, 0x7c, 0xcd, 0xab, 0x5, 0x91, + 0x2a, 0x9a, 0xa2, 0x81, 0xc7, 0x58, 0x30, 0x1c, 0x42, 0x36, 0x1d, + 0xc6, 0x80, 0xd7, 0xd4, 0xd8, 0xdc, 0x96, 0xd1, 0x9c, 0x4f}, + {0x1, 0x28, 0x6b, 0x26, 0x6a, 0x1e, 0xef, 0xfa, 0x16, 0x9f, 0x73, + 0xd5, 0xc4, 0x68, 0x6c, 0x86, 0x2c, 0x76, 0x3, 0x1b, 0xbc, 0x2f, + 0x8a, 0xf6, 0x8d, 0x5a, 0xb7, 0x87, 0x5e, 0x43, 0x75, 0x59}, + {0x68, 0x37, 0x7b, 0x6a, 0xd8, 0x97, 0x92, 0x19, 0x63, 0x7a, 0xd1, + 0x1a, 0x24, 0x58, 0xd0, 0xd0, 0x17, 0xc, 0x1c, 0x5c, 0xad, 0x9c, + 0x2, 0xba, 0x7, 0x3, 0x7a, 0x38, 0x84, 0xd0, 0xcd, 0x7c}, + }, + { + {0x93, 0xcc, 0x60, 0x67, 0x18, 0x84, 0xc, 0x9b, 0x99, 0x2a, 0xb3, + 0x1a, 0x7a, 0x0, 0xae, 0xcd, 0x18, 0xda, 0xb, 0x62, 0x86, 0xec, + 0x8d, 0xa8, 0x44, 0xca, 0x90, 0x81, 0x84, 0xca, 0x93, 0x35}, + {0x17, 0x4, 0x26, 0x6d, 0x2c, 0x42, 0xa6, 0xdc, 0xbd, 0x40, 0x82, + 0x94, 0x50, 0x3d, 0x15, 0xae, 0x77, 0xc6, 0x68, 0xfb, 0xb4, 0xc1, + 0xc0, 0xa9, 0x53, 0xcf, 0xd0, 0x61, 0xed, 0xd0, 0x8b, 0x42}, + {0xa7, 0x9a, 0x84, 0x5e, 0x9a, 0x18, 0x13, 0x92, 0xcd, 0xfa, 0xd8, + 0x65, 0x35, 0xc3, 0xd8, 0xd4, 0xd1, 0xbb, 0xfd, 0x53, 0x5b, 0x54, + 0x52, 0x8c, 0xe6, 0x63, 0x2d, 0xda, 0x8, 0x83, 0x39, 0x27}, + }, + { + {0x53, 0x24, 0x70, 0xa, 0x4c, 0xe, 0xa1, 0xb9, 0xde, 0x1b, 0x7d, + 0xd5, 0x66, 0x58, 0xa2, 0xf, 0xf7, 0xda, 0x27, 0xcd, 0xb5, 0xd9, + 0xb9, 0xff, 0xfd, 0x33, 0x2c, 0x49, 0x45, 0x29, 0x2c, 0x57}, + {0x13, 0xd4, 0x5e, 0x43, 0x28, 0x8d, 0xc3, 0x42, 0xc9, 0xcc, 0x78, + 0x32, 0x60, 0xf3, 0x50, 0xbd, 0xef, 0x3, 0xda, 0x79, 0x1a, 0xab, + 0x7, 0xbb, 0x55, 0x33, 0x8c, 0xbe, 0xae, 0x97, 0x95, 0x26}, + {0xbe, 0x30, 0xcd, 0xd6, 0x45, 0xc7, 0x7f, 0xc7, 0xfb, 0xae, 0xba, + 0xe3, 0xd3, 0xe8, 0xdf, 0xe4, 0xc, 0xda, 0x5d, 0xaa, 0x30, 0x88, + 0x2c, 0xa2, 0x80, 0xca, 0x5b, 0xc0, 0x98, 0x54, 0x98, 0x7f}, + }, + { + {0x63, 0x63, 0xbf, 0xf, 0x52, 0x15, 0x56, 0xd3, 0xa6, 0xfb, 0x4d, + 0xcf, 0x45, 0x5a, 0x4, 0x8, 0xc2, 0xa0, 0x3f, 0x87, 0xbc, 0x4f, + 0xc2, 0xee, 0xe7, 0x12, 0x9b, 0xd6, 0x3c, 0x65, 0xf2, 0x30}, + {0x17, 0xe1, 0xb, 0x9f, 0x88, 0xce, 0x49, 0x38, 0x88, 0xa2, 0x54, + 0x7b, 0x1b, 0xad, 0x5, 0x80, 0x1c, 0x92, 0xfc, 0x23, 0x9f, 0xc3, + 0xa3, 0x3d, 0x4, 0xf3, 0x31, 0xa, 0x47, 0xec, 0xc2, 0x76}, + {0x85, 0xc, 0xc1, 0xaa, 0x38, 0xc9, 0x8, 0x8a, 0xcb, 0x6b, 0x27, + 0xdb, 0x60, 0x9b, 0x17, 0x46, 0x70, 0xac, 0x6f, 0xe, 0x1e, 0xc0, + 0x20, 0xa9, 0xda, 0x73, 0x64, 0x59, 0xf1, 0x73, 0x12, 0x2f}, + }, + { + {0xc0, 0xb, 0xa7, 0x55, 0xd7, 0x8b, 0x48, 0x30, 0xe7, 0x42, 0xd4, + 0xf1, 0xa4, 0xb5, 0xd6, 0x6, 0x62, 0x61, 0x59, 0xbc, 0x9e, 0xa6, + 0xd1, 0xea, 0x84, 0xf7, 0xc5, 0xed, 0x97, 0x19, 0xac, 0x38}, + {0x11, 0x1e, 0xe0, 0x8a, 0x7c, 0xfc, 0x39, 0x47, 0x9f, 0xab, 0x6a, + 0x4a, 0x90, 0x74, 0x52, 0xfd, 0x2e, 0x8f, 0x72, 0x87, 0x82, 0x8a, + 0xd9, 0x41, 0xf2, 0x69, 0x5b, 0xd8, 0x2a, 0x57, 0x9e, 0x5d}, + {0x3b, 0xb1, 0x51, 0xa7, 0x17, 0xb5, 0x66, 0x6, 0x8c, 0x85, 0x9b, + 0x7e, 0x86, 0x6, 0x7d, 0x74, 0x49, 0xde, 0x4d, 0x45, 0x11, 0xc0, + 0xac, 0xac, 0x9c, 0xe6, 0xe9, 0xbf, 0x9c, 0xcd, 0xdf, 0x22}, + }, + { + {0xa1, 0xe0, 0x3b, 0x10, 0xb4, 0x59, 0xec, 0x56, 0x69, 0xf9, 0x59, + 0xd2, 0xec, 0xba, 0xe3, 0x2e, 0x32, 0xcd, 0xf5, 0x13, 0x94, 0xb2, + 0x7c, 0x79, 0x72, 0xe4, 0xcd, 0x24, 0x78, 0x87, 0xe9, 0xf}, + 
{0xd9, 0xc, 0xd, 0xc3, 0xe0, 0xd2, 0xdb, 0x8d, 0x33, 0x43, 0xbb, + 0xac, 0x5f, 0x66, 0x8e, 0xad, 0x1f, 0x96, 0x2a, 0x32, 0x8c, 0x25, + 0x6b, 0x8f, 0xc7, 0xc1, 0x48, 0x54, 0xc0, 0x16, 0x29, 0x6b}, + {0x3b, 0x91, 0xba, 0xa, 0xd1, 0x34, 0xdb, 0x7e, 0xe, 0xac, 0x6d, + 0x2e, 0x82, 0xcd, 0xa3, 0x4e, 0x15, 0xf8, 0x78, 0x65, 0xff, 0x3d, + 0x8, 0x66, 0x17, 0xa, 0xf0, 0x7f, 0x30, 0x3f, 0x30, 0x4c}, + }, + { + {0x0, 0x45, 0xd9, 0xd, 0x58, 0x3, 0xfc, 0x29, 0x93, 0xec, 0xbb, + 0x6f, 0xa4, 0x7a, 0xd2, 0xec, 0xf8, 0xa7, 0xe2, 0xc2, 0x5f, 0x15, + 0xa, 0x13, 0xd5, 0xa1, 0x6, 0xb7, 0x1a, 0x15, 0x6b, 0x41}, + {0x85, 0x8c, 0xb2, 0x17, 0xd6, 0x3b, 0xa, 0xd3, 0xea, 0x3b, 0x77, + 0x39, 0xb7, 0x77, 0xd3, 0xc5, 0xbf, 0x5c, 0x6a, 0x1e, 0x8c, 0xe7, + 0xc6, 0xc6, 0xc4, 0xb7, 0x2a, 0x8b, 0xf7, 0xb8, 0x61, 0xd}, + {0xb0, 0x36, 0xc1, 0xe9, 0xef, 0xd7, 0xa8, 0x56, 0x20, 0x4b, 0xe4, + 0x58, 0xcd, 0xe5, 0x7, 0xbd, 0xab, 0xe0, 0x57, 0x1b, 0xda, 0x2f, + 0xe6, 0xaf, 0xd2, 0xe8, 0x77, 0x42, 0xf7, 0x2a, 0x1a, 0x19}, + }, + }, + { + { + {0xfb, 0xe, 0x46, 0x4f, 0x43, 0x2b, 0xe6, 0x9f, 0xd6, 0x7, 0x36, + 0xa6, 0xd4, 0x3, 0xd3, 0xde, 0x24, 0xda, 0xa0, 0xb7, 0xe, 0x21, + 0x52, 0xf0, 0x93, 0x5b, 0x54, 0x0, 0xbe, 0x7d, 0x7e, 0x23}, + {0x31, 0x14, 0x3c, 0xc5, 0x4b, 0xf7, 0x16, 0xce, 0xde, 0xed, 0x72, + 0x20, 0xce, 0x25, 0x97, 0x2b, 0xe7, 0x3e, 0xb2, 0xb5, 0x6f, 0xc3, + 0xb9, 0xb8, 0x8, 0xc9, 0x5c, 0xb, 0x45, 0xe, 0x2e, 0x7e}, + {0x30, 0xb4, 0x1, 0x67, 0xed, 0x75, 0x35, 0x1, 0x10, 0xfd, 0xb, + 0x9f, 0xe6, 0x94, 0x10, 0x23, 0x22, 0x7f, 0xe4, 0x83, 0x15, 0xf, + 0x32, 0x75, 0xe3, 0x55, 0x11, 0xb1, 0x99, 0xa6, 0xaf, 0x71}, + }, + { + {0xd6, 0x50, 0x3b, 0x47, 0x1c, 0x3c, 0x42, 0xea, 0x10, 0xef, 0x38, + 0x3b, 0x1f, 0x7a, 0xe8, 0x51, 0x95, 0xbe, 0xc9, 0xb2, 0x5f, 0xbf, + 0x84, 0x9b, 0x1c, 0x9a, 0xf8, 0x78, 0xbc, 0x1f, 0x73, 0x0}, + {0x1d, 0xb6, 0x53, 0x39, 0x9b, 0x6f, 0xce, 0x65, 0xe6, 0x41, 0xa1, + 0xaf, 0xea, 0x39, 0x58, 0xc6, 0xfe, 0x59, 0xf7, 0xa9, 0xfd, 0x5f, + 0x43, 0xf, 0x8e, 0xc2, 0xb1, 0xc2, 0xe9, 0x42, 0x11, 0x2}, + {0x80, 0x18, 0xf8, 0x48, 0x18, 0xc7, 0x30, 0xe4, 0x19, 0xc1, 0xce, + 0x5e, 0x22, 0xc, 0x96, 0xbf, 0xe3, 0x15, 0xba, 0x6b, 0x83, 0xe0, + 0xda, 0xb6, 0x8, 0x58, 0xe1, 0x47, 0x33, 0x6f, 0x4d, 0x4c}, + }, + { + {0x70, 0x19, 0x8f, 0x98, 0xfc, 0xdd, 0xc, 0x2f, 0x1b, 0xf5, 0xb9, + 0xb0, 0x27, 0x62, 0x91, 0x6b, 0xbe, 0x76, 0x91, 0x77, 0xc4, 0xb6, + 0xc7, 0x6e, 0xa8, 0x9f, 0x8f, 0xa8, 0x0, 0x95, 0xbf, 0x38}, + {0xc9, 0x1f, 0x7d, 0xc1, 0xcf, 0xec, 0xf7, 0x18, 0x14, 0x3c, 0x40, + 0x51, 0xa6, 0xf5, 0x75, 0x6c, 0xdf, 0xc, 0xee, 0xf7, 0x2b, 0x71, + 0xde, 0xdb, 0x22, 0x7a, 0xe4, 0xa7, 0xaa, 0xdd, 0x3f, 0x19}, + {0x6f, 0x87, 0xe8, 0x37, 0x3c, 0xc9, 0xd2, 0x1f, 0x2c, 0x46, 0xd1, + 0x18, 0x5a, 0x1e, 0xf6, 0xa2, 0x76, 0x12, 0x24, 0x39, 0x82, 0xf5, + 0x80, 0x50, 0x69, 0x49, 0xd, 0xbf, 0x9e, 0xb9, 0x6f, 0x6a}, + }, + { + {0xc6, 0x23, 0xe4, 0xb6, 0xb5, 0x22, 0xb1, 0xee, 0x8e, 0xff, 0x86, + 0xf2, 0x10, 0x70, 0x9d, 0x93, 0x8c, 0x5d, 0xcf, 0x1d, 0x83, 0x2a, + 0xa9, 0x90, 0x10, 0xeb, 0xc5, 0x42, 0x9f, 0xda, 0x6f, 0x13}, + {0xeb, 0x55, 0x8, 0x56, 0xbb, 0xc1, 0x46, 0x6a, 0x9d, 0xf0, 0x93, + 0xf8, 0x38, 0xbb, 0x16, 0x24, 0xc1, 0xac, 0x71, 0x8f, 0x37, 0x11, + 0x1d, 0xd7, 0xea, 0x96, 0x18, 0xa3, 0x14, 0x69, 0xf7, 0x75}, + {0xd1, 0xbd, 0x5, 0xa3, 0xb1, 0xdf, 0x4c, 0xf9, 0x8, 0x2c, 0xf8, + 0x9f, 0x9d, 0x4b, 0x36, 0xf, 0x8a, 0x58, 0xbb, 0xc3, 0xa5, 0xd8, + 0x87, 0x2a, 0xba, 0xdc, 0xe8, 0xb, 0x51, 0x83, 0x21, 0x2}, + }, + { + {0x7f, 0x7a, 0x30, 0x43, 0x1, 0x71, 0x5a, 0x9d, 0x5f, 0xa4, 0x7d, + 0xc4, 0x9e, 0xde, 0x63, 0xb0, 0xd3, 0x7a, 0x92, 0xbe, 0x52, 
0xfe, + 0xbb, 0x22, 0x6c, 0x42, 0x40, 0xfd, 0x41, 0xc4, 0x87, 0x13}, + {0x14, 0x2d, 0xad, 0x5e, 0x38, 0x66, 0xf7, 0x4a, 0x30, 0x58, 0x7c, + 0xca, 0x80, 0xd8, 0x8e, 0xa0, 0x3d, 0x1e, 0x21, 0x10, 0xe6, 0xa6, + 0x13, 0xd, 0x3, 0x6c, 0x80, 0x7b, 0xe1, 0x1c, 0x7, 0x6a}, + {0xf8, 0x8a, 0x97, 0x87, 0xd1, 0xc3, 0xd3, 0xb5, 0x13, 0x44, 0xe, + 0x7f, 0x3d, 0x5a, 0x2b, 0x72, 0xa0, 0x7c, 0x47, 0xbb, 0x48, 0x48, + 0x7b, 0xd, 0x92, 0xdc, 0x1e, 0xaf, 0x6a, 0xb2, 0x71, 0x31}, + }, + { + {0xd1, 0x47, 0x8a, 0xb2, 0xd8, 0xb7, 0xd, 0xa6, 0xf1, 0xa4, 0x70, + 0x17, 0xd6, 0x14, 0xbf, 0xa6, 0x58, 0xbd, 0xdd, 0x53, 0x93, 0xf8, + 0xa1, 0xd4, 0xe9, 0x43, 0x42, 0x34, 0x63, 0x4a, 0x51, 0x6c}, + {0xa8, 0x4c, 0x56, 0x97, 0x90, 0x31, 0x2f, 0xa9, 0x19, 0xe1, 0x75, + 0x22, 0x4c, 0xb8, 0x7b, 0xff, 0x50, 0x51, 0x87, 0xa4, 0x37, 0xfe, + 0x55, 0x4f, 0x5a, 0x83, 0xf0, 0x3c, 0x87, 0xd4, 0x1f, 0x22}, + {0x41, 0x63, 0x15, 0x3a, 0x4f, 0x20, 0x22, 0x23, 0x2d, 0x3, 0xa, + 0xba, 0xe9, 0xe0, 0x73, 0xfb, 0xe, 0x3, 0xf, 0x41, 0x4c, 0xdd, + 0xe0, 0xfc, 0xaa, 0x4a, 0x92, 0xfb, 0x96, 0xa5, 0xda, 0x48}, + }, + { + {0x93, 0x97, 0x4c, 0xc8, 0x5d, 0x1d, 0xf6, 0x14, 0x6, 0x82, 0x41, + 0xef, 0xe3, 0xf9, 0x41, 0x99, 0xac, 0x77, 0x62, 0x34, 0x8f, 0xb8, + 0xf5, 0xcd, 0xa9, 0x79, 0x8a, 0xe, 0xfa, 0x37, 0xc8, 0x58}, + {0xc7, 0x9c, 0xa5, 0x5c, 0x66, 0x8e, 0xca, 0x6e, 0xa0, 0xac, 0x38, + 0x2e, 0x4b, 0x25, 0x47, 0xa8, 0xce, 0x17, 0x1e, 0xd2, 0x8, 0xc7, + 0xaf, 0x31, 0xf7, 0x4a, 0xd8, 0xca, 0xfc, 0xd6, 0x6d, 0x67}, + {0x58, 0x90, 0xfc, 0x96, 0x85, 0x68, 0xf9, 0xc, 0x1b, 0xa0, 0x56, + 0x7b, 0xf3, 0xbb, 0xdc, 0x1d, 0x6a, 0xd6, 0x35, 0x49, 0x7d, 0xe7, + 0xc2, 0xdc, 0xa, 0x7f, 0xa5, 0xc6, 0xf2, 0x73, 0x4f, 0x1c}, + }, + { + {0x84, 0x34, 0x7c, 0xfc, 0x6e, 0x70, 0x6e, 0xb3, 0x61, 0xcf, 0xc1, + 0xc3, 0xb4, 0xc9, 0xdf, 0x73, 0xe5, 0xc7, 0x1c, 0x78, 0xc9, 0x79, + 0x1d, 0xeb, 0x5c, 0x67, 0xaf, 0x7d, 0xdb, 0x9a, 0x45, 0x70}, + {0xbb, 0xa0, 0x5f, 0x30, 0xbd, 0x4f, 0x7a, 0xe, 0xad, 0x63, 0xc6, + 0x54, 0xe0, 0x4c, 0x9d, 0x82, 0x48, 0x38, 0xe3, 0x2f, 0x83, 0xc3, + 0x21, 0xf4, 0x42, 0x4c, 0xf6, 0x1b, 0xd, 0xc8, 0x5a, 0x79}, + {0xb3, 0x2b, 0xb4, 0x91, 0x49, 0xdb, 0x91, 0x1b, 0xca, 0xdc, 0x2, + 0x4b, 0x23, 0x96, 0x26, 0x57, 0xdc, 0x78, 0x8c, 0x1f, 0xe5, 0x9e, + 0xdf, 0x9f, 0xd3, 0x1f, 0xe2, 0x8c, 0x84, 0x62, 0xe1, 0x5f}, + }, + }, + { + { + {0x8, 0xb2, 0x7c, 0x5d, 0x2d, 0x85, 0x79, 0x28, 0xe7, 0xf2, 0x7d, + 0x68, 0x70, 0xdd, 0xde, 0xb8, 0x91, 0x78, 0x68, 0x21, 0xab, 0xff, + 0xb, 0xdc, 0x35, 0xaa, 0x7d, 0x67, 0x43, 0xc0, 0x44, 0x2b}, + {0x1a, 0x96, 0x94, 0xe1, 0x4f, 0x21, 0x59, 0x4e, 0x4f, 0xcd, 0x71, + 0xd, 0xc7, 0x7d, 0xbe, 0x49, 0x2d, 0xf2, 0x50, 0x3b, 0xd2, 0xcf, + 0x0, 0x93, 0x32, 0x72, 0x91, 0xfc, 0x46, 0xd4, 0x89, 0x47}, + {0x8e, 0xb7, 0x4e, 0x7, 0xab, 0x87, 0x1c, 0x1a, 0x67, 0xf4, 0xda, + 0x99, 0x8e, 0xd1, 0xc6, 0xfa, 0x67, 0x90, 0x4f, 0x48, 0xcd, 0xbb, + 0xac, 0x3e, 0xe4, 0xa4, 0xb9, 0x2b, 0xef, 0x2e, 0xc5, 0x60}, + }, + { + {0x11, 0x6d, 0xae, 0x7c, 0xc2, 0xc5, 0x2b, 0x70, 0xab, 0x8c, 0xa4, + 0x54, 0x9b, 0x69, 0xc7, 0x44, 0xb2, 0x2e, 0x49, 0xba, 0x56, 0x40, + 0xbc, 0xef, 0x6d, 0x67, 0xb6, 0xd9, 0x48, 0x72, 0xd7, 0x70}, + {0xf1, 0x8b, 0xfd, 0x3b, 0xbc, 0x89, 0x5d, 0xb, 0x1a, 0x55, 0xf3, + 0xc9, 0x37, 0x92, 0x6b, 0xb0, 0xf5, 0x28, 0x30, 0xd5, 0xb0, 0x16, + 0x4c, 0xe, 0xab, 0xca, 0xcf, 0x2c, 0x31, 0x9c, 0xbc, 0x10}, + {0x5b, 0xa0, 0xc2, 0x3e, 0x4b, 0xe8, 0x8a, 0xaa, 0xe0, 0x81, 0x17, + 0xed, 0xf4, 0x9e, 0x69, 0x98, 0xd1, 0x85, 0x8e, 0x70, 0xe4, 0x13, + 0x45, 0x79, 0x13, 0xf4, 0x76, 0xa9, 0xd3, 0x5b, 0x75, 0x63}, + }, + { + {0xb7, 0xac, 0xf1, 0x97, 0x18, 0x10, 0xc7, 
0x3d, 0xd8, 0xbb, 0x65, + 0xc1, 0x5e, 0x7d, 0xda, 0x5d, 0xf, 0x2, 0xa1, 0xf, 0x9c, 0x5b, + 0x8e, 0x50, 0x56, 0x2a, 0xc5, 0x37, 0x17, 0x75, 0x63, 0x27}, + {0x53, 0x8, 0xd1, 0x2a, 0x3e, 0xa0, 0x5f, 0xb5, 0x69, 0x35, 0xe6, + 0x9e, 0x90, 0x75, 0x6f, 0x35, 0x90, 0xb8, 0x69, 0xbe, 0xfd, 0xf1, + 0xf9, 0x9f, 0x84, 0x6f, 0xc1, 0x8b, 0xc4, 0xc1, 0x8c, 0xd}, + {0xa9, 0x19, 0xb4, 0x6e, 0xd3, 0x2, 0x94, 0x2, 0xa5, 0x60, 0xb4, + 0x77, 0x7e, 0x4e, 0xb4, 0xf0, 0x56, 0x49, 0x3c, 0xd4, 0x30, 0x62, + 0xa8, 0xcf, 0xe7, 0x66, 0xd1, 0x7a, 0x8a, 0xdd, 0xc2, 0x70}, + }, + { + {0x13, 0x7e, 0xed, 0xb8, 0x7d, 0x96, 0xd4, 0x91, 0x7a, 0x81, 0x76, + 0xd7, 0xa, 0x2f, 0x25, 0x74, 0x64, 0x25, 0x85, 0xd, 0xe0, 0x82, + 0x9, 0xe4, 0xe5, 0x3c, 0xa5, 0x16, 0x38, 0x61, 0xb8, 0x32}, + {0xe, 0xec, 0x6f, 0x9f, 0x50, 0x94, 0x61, 0x65, 0x8d, 0x51, 0xc6, + 0x46, 0xa9, 0x7e, 0x2e, 0xee, 0x5c, 0x9b, 0xe0, 0x67, 0xf3, 0xc1, + 0x33, 0x97, 0x95, 0x84, 0x94, 0x63, 0x63, 0xac, 0xf, 0x2e}, + {0x64, 0xcd, 0x48, 0xe4, 0xbe, 0xf7, 0xe7, 0x79, 0xd0, 0x86, 0x78, + 0x8, 0x67, 0x3a, 0xc8, 0x6a, 0x2e, 0xdb, 0xe4, 0xa0, 0xd9, 0xd4, + 0x9f, 0xf8, 0x41, 0x4f, 0x5a, 0x73, 0x5c, 0x21, 0x79, 0x41}, + }, + { + {0x34, 0xcd, 0x6b, 0x28, 0xb9, 0x33, 0xae, 0xe4, 0xdc, 0xd6, 0x9d, + 0x55, 0xb6, 0x7e, 0xef, 0xb7, 0x1f, 0x8e, 0xd3, 0xb3, 0x1f, 0x14, + 0x8b, 0x27, 0x86, 0xc2, 0x41, 0x22, 0x66, 0x85, 0xfa, 0x31}, + {0x2a, 0xed, 0xdc, 0xd7, 0xe7, 0x94, 0x70, 0x8c, 0x70, 0x9c, 0xd3, + 0x47, 0xc3, 0x8a, 0xfb, 0x97, 0x2, 0xd9, 0x6, 0xa9, 0x33, 0xe0, + 0x3b, 0xe1, 0x76, 0x9d, 0xd9, 0xc, 0xa3, 0x44, 0x3, 0x70}, + {0xf4, 0x22, 0x36, 0x2e, 0x42, 0x6c, 0x82, 0xaf, 0x2d, 0x50, 0x33, + 0x98, 0x87, 0x29, 0x20, 0xc1, 0x23, 0x91, 0x38, 0x2b, 0xe1, 0xb7, + 0xc1, 0x9b, 0x89, 0x24, 0x95, 0xa9, 0x12, 0x23, 0xbb, 0x24}, + }, + { + {0x6b, 0x5c, 0xf8, 0xf5, 0x2a, 0xc, 0xf8, 0x41, 0x94, 0x67, 0xfa, + 0x4, 0xc3, 0x84, 0x72, 0x68, 0xad, 0x1b, 0xba, 0xa3, 0x99, 0xdf, + 0x45, 0x89, 0x16, 0x5d, 0xeb, 0xff, 0xf9, 0x2a, 0x1d, 0xd}, + {0xc3, 0x67, 0xde, 0x32, 0x17, 0xed, 0xa8, 0xb1, 0x48, 0x49, 0x1b, + 0x46, 0x18, 0x94, 0xb4, 0x3c, 0xd2, 0xbc, 0xcf, 0x76, 0x43, 0x43, + 0xbd, 0x8e, 0x8, 0x80, 0x18, 0x1e, 0x87, 0x3e, 0xee, 0xf}, + {0xdf, 0x1e, 0x62, 0x32, 0xa1, 0x8a, 0xda, 0xa9, 0x79, 0x65, 0x22, + 0x59, 0xa1, 0x22, 0xb8, 0x30, 0x93, 0xc1, 0x9a, 0xa7, 0x7b, 0x19, + 0x4, 0x40, 0x76, 0x1d, 0x53, 0x18, 0x97, 0xd7, 0xac, 0x16}, + }, + { + {0xad, 0xb6, 0x87, 0x78, 0xc5, 0xc6, 0x59, 0xc9, 0xba, 0xfe, 0x90, + 0x5f, 0xad, 0x9e, 0xe1, 0x94, 0x4, 0xf5, 0x42, 0xa3, 0x62, 0x4e, + 0xe2, 0x16, 0x0, 0x17, 0x16, 0x18, 0x4b, 0xd3, 0x4e, 0x16}, + {0x3d, 0x1d, 0x9b, 0x2d, 0xaf, 0x72, 0xdf, 0x72, 0x5a, 0x24, 0x32, + 0xa4, 0x36, 0x2a, 0x46, 0x63, 0x37, 0x96, 0xb3, 0x16, 0x79, 0xa0, + 0xce, 0x3e, 0x9, 0x23, 0x30, 0xb9, 0xf6, 0xe, 0x3e, 0x12}, + {0x9a, 0xe6, 0x2f, 0x19, 0x4c, 0xd9, 0x7e, 0x48, 0x13, 0x15, 0x91, + 0x3a, 0xea, 0x2c, 0xae, 0x61, 0x27, 0xde, 0xa4, 0xb9, 0xd3, 0xf6, + 0x7b, 0x87, 0xeb, 0xf3, 0x73, 0x10, 0xc6, 0xf, 0xda, 0x78}, + }, + { + {0x94, 0x3a, 0xc, 0x68, 0xf1, 0x80, 0x9f, 0xa2, 0xe6, 0xe7, 0xe9, + 0x1a, 0x15, 0x7e, 0xf7, 0x71, 0x73, 0x79, 0x1, 0x48, 0x58, 0xf1, + 0x0, 0x11, 0xdd, 0x8d, 0xb3, 0x16, 0xb3, 0xa4, 0x4a, 0x5}, + {0x6a, 0xc6, 0x2b, 0xe5, 0x28, 0x5d, 0xf1, 0x5b, 0x8e, 0x1a, 0xf0, + 0x70, 0x18, 0xe3, 0x47, 0x2c, 0xdd, 0x8b, 0xc2, 0x6, 0xbc, 0xaf, + 0x19, 0x24, 0x3a, 0x17, 0x6b, 0x25, 0xeb, 0xde, 0x25, 0x2d}, + {0xb8, 0x7c, 0x26, 0x19, 0x8d, 0x46, 0xc8, 0xdf, 0xaf, 0x4d, 0xe5, + 0x66, 0x9c, 0x78, 0x28, 0xb, 0x17, 0xec, 0x6e, 0x66, 0x2a, 0x1d, + 0xeb, 0x2a, 0x60, 0xa7, 0x7d, 0xab, 0xa6, 0x10, 
0x46, 0x13}, + }, + }, + { + { + {0x15, 0xf5, 0xd1, 0x77, 0xe7, 0x65, 0x2a, 0xcd, 0xf1, 0x60, 0xaa, + 0x8f, 0x87, 0x91, 0x89, 0x54, 0xe5, 0x6, 0xbc, 0xda, 0xbc, 0x3b, + 0xb7, 0xb1, 0xfb, 0xc9, 0x7c, 0xa9, 0xcb, 0x78, 0x48, 0x65}, + {0xfe, 0xb0, 0xf6, 0x8d, 0xc7, 0x8e, 0x13, 0x51, 0x1b, 0xf5, 0x75, + 0xe5, 0x89, 0xda, 0x97, 0x53, 0xb9, 0xf1, 0x7a, 0x71, 0x1d, 0x7a, + 0x20, 0x9, 0x50, 0xd6, 0x20, 0x2b, 0xba, 0xfd, 0x2, 0x21}, + {0xa1, 0xe6, 0x5c, 0x5, 0x5, 0xe4, 0x9e, 0x96, 0x29, 0xad, 0x51, + 0x12, 0x68, 0xa7, 0xbc, 0x36, 0x15, 0xa4, 0x7d, 0xaa, 0x17, 0xf5, + 0x1a, 0x3a, 0xba, 0xb2, 0xec, 0x29, 0xdb, 0x25, 0xd7, 0xa}, + }, + { + {0x85, 0x6f, 0x5, 0x9b, 0xc, 0xbc, 0xc7, 0xfe, 0xd7, 0xff, 0xf5, + 0xe7, 0x68, 0x52, 0x7d, 0x53, 0xfa, 0xae, 0x12, 0x43, 0x62, 0xc6, + 0xaf, 0x77, 0xd9, 0x9f, 0x39, 0x2, 0x53, 0x5f, 0x67, 0x4f}, + {0x57, 0x24, 0x4e, 0x83, 0xb1, 0x67, 0x42, 0xdc, 0xc5, 0x1b, 0xce, + 0x70, 0xb5, 0x44, 0x75, 0xb6, 0xd7, 0x5e, 0xd1, 0xf7, 0xb, 0x7a, + 0xf0, 0x1a, 0x50, 0x36, 0xa0, 0x71, 0xfb, 0xcf, 0xef, 0x4a}, + {0x1e, 0x17, 0x15, 0x4, 0x36, 0x36, 0x2d, 0xc3, 0x3b, 0x48, 0x98, + 0x89, 0x11, 0xef, 0x2b, 0xcd, 0x10, 0x51, 0x94, 0xd0, 0xad, 0x6e, + 0xa, 0x87, 0x61, 0x65, 0xa8, 0xa2, 0x72, 0xbb, 0xcc, 0xb}, + }, + { + {0x96, 0x12, 0xfe, 0x50, 0x4c, 0x5e, 0x6d, 0x18, 0x7e, 0x9f, 0xe8, + 0xfe, 0x82, 0x7b, 0x39, 0xe0, 0xb0, 0x31, 0x70, 0x50, 0xc5, 0xf6, + 0xc7, 0x3b, 0xc2, 0x37, 0x8f, 0x10, 0x69, 0xfd, 0x78, 0x66}, + {0xc8, 0xa9, 0xb1, 0xea, 0x2f, 0x96, 0x5e, 0x18, 0xcd, 0x7d, 0x14, + 0x65, 0x35, 0xe6, 0xe7, 0x86, 0xf2, 0x6d, 0x5b, 0xbb, 0x31, 0xe0, + 0x92, 0xb0, 0x3e, 0xb7, 0xd6, 0x59, 0xab, 0xf0, 0x24, 0x40}, + {0xc2, 0x63, 0x68, 0x63, 0x31, 0xfa, 0x86, 0x15, 0xf2, 0x33, 0x2d, + 0x57, 0x48, 0x8c, 0xf6, 0x7, 0xfc, 0xae, 0x9e, 0x78, 0x9f, 0xcc, + 0x73, 0x4f, 0x1, 0x47, 0xad, 0x8e, 0x10, 0xe2, 0x42, 0x2d}, + }, + { + {0x93, 0x75, 0x53, 0xf, 0xd, 0x7b, 0x71, 0x21, 0x4c, 0x6, 0x1e, + 0x13, 0xb, 0x69, 0x4e, 0x91, 0x9f, 0xe0, 0x2a, 0x75, 0xae, 0x87, + 0xb6, 0x1b, 0x6e, 0x3c, 0x42, 0x9b, 0xa7, 0xf3, 0xb, 0x42}, + {0x9b, 0xd2, 0xdf, 0x94, 0x15, 0x13, 0xf5, 0x97, 0x6a, 0x4c, 0x3f, + 0x31, 0x5d, 0x98, 0x55, 0x61, 0x10, 0x50, 0x45, 0x8, 0x7, 0x3f, + 0xa1, 0xeb, 0x22, 0xd3, 0xd2, 0xb8, 0x8, 0x26, 0x6b, 0x67}, + {0x47, 0x2b, 0x5b, 0x1c, 0x65, 0xba, 0x38, 0x81, 0x80, 0x1b, 0x1b, + 0x31, 0xec, 0xb6, 0x71, 0x86, 0xb0, 0x35, 0x31, 0xbc, 0xb1, 0xc, + 0xff, 0x7b, 0xe0, 0xf1, 0xc, 0x9c, 0xfa, 0x2f, 0x5d, 0x74}, + }, + { + {0x6a, 0x4e, 0xd3, 0x21, 0x57, 0xdf, 0x36, 0x60, 0xd0, 0xb3, 0x7b, + 0x99, 0x27, 0x88, 0xdb, 0xb1, 0xfa, 0x6a, 0x75, 0xc8, 0xc3, 0x9, + 0xc2, 0xd3, 0x39, 0xc8, 0x1d, 0x4c, 0xe5, 0x5b, 0xe1, 0x6}, + {0xbd, 0xc8, 0xc9, 0x2b, 0x1e, 0x5a, 0x52, 0xbf, 0x81, 0x9d, 0x47, + 0x26, 0x8, 0x26, 0x5b, 0xea, 0xdb, 0x55, 0x1, 0xdf, 0xe, 0xc7, + 0x11, 0xd5, 0xd0, 0xf5, 0xc, 0x96, 0xeb, 0x3c, 0xe2, 0x1a}, + {0x4a, 0x99, 0x32, 0x19, 0x87, 0x5d, 0x72, 0x5b, 0xb0, 0xda, 0xb1, + 0xce, 0xb5, 0x1c, 0x35, 0x32, 0x5, 0xca, 0xb7, 0xda, 0x49, 0x15, + 0xc4, 0x7d, 0xf7, 0xc1, 0x8e, 0x27, 0x61, 0xd8, 0xde, 0x58}, + }, + { + {0xa8, 0xc9, 0xc2, 0xb6, 0xa8, 0x5b, 0xfb, 0x2d, 0x8c, 0x59, 0x2c, + 0xf5, 0x8e, 0xef, 0xee, 0x48, 0x73, 0x15, 0x2d, 0xf1, 0x7, 0x91, + 0x80, 0x33, 0xd8, 0x5b, 0x1d, 0x53, 0x6b, 0x69, 0xba, 0x8}, + {0x5c, 0xc5, 0x66, 0xf2, 0x93, 0x37, 0x17, 0xd8, 0x49, 0x4e, 0x45, + 0xcc, 0xc5, 0x76, 0xc9, 0xc8, 0xa8, 0xc3, 0x26, 0xbc, 0xf8, 0x82, + 0xe3, 0x5c, 0xf9, 0xf6, 0x85, 0x54, 0xe8, 0x9d, 0xf3, 0x2f}, + {0x7a, 0xc5, 0xef, 0xc3, 0xee, 0x3e, 0xed, 0x77, 0x11, 0x48, 0xff, + 0xd4, 0x17, 0x55, 0xe0, 0x4, 0xcb, 0x71, 
0xa6, 0xf1, 0x3f, 0x7a, + 0x3d, 0xea, 0x54, 0xfe, 0x7c, 0x94, 0xb4, 0x33, 0x6, 0x12}, + }, + { + {0xa, 0x10, 0x12, 0x49, 0x47, 0x31, 0xbd, 0x82, 0x6, 0xbe, 0x6f, + 0x7e, 0x6d, 0x7b, 0x23, 0xde, 0xc6, 0x79, 0xea, 0x11, 0x19, 0x76, + 0x1e, 0xe1, 0xde, 0x3b, 0x39, 0xcb, 0xe3, 0x3b, 0x43, 0x7}, + {0x42, 0x0, 0x61, 0x91, 0x78, 0x98, 0x94, 0xb, 0xe8, 0xfa, 0xeb, + 0xec, 0x3c, 0xb1, 0xe7, 0x4e, 0xc0, 0xa4, 0xf0, 0x94, 0x95, 0x73, + 0xbe, 0x70, 0x85, 0x91, 0xd5, 0xb4, 0x99, 0xa, 0xd3, 0x35}, + {0xf4, 0x97, 0xe9, 0x5c, 0xc0, 0x44, 0x79, 0xff, 0xa3, 0x51, 0x5c, + 0xb0, 0xe4, 0x3d, 0x5d, 0x57, 0x7c, 0x84, 0x76, 0x5a, 0xfd, 0x81, + 0x33, 0x58, 0x9f, 0xda, 0xf6, 0x7a, 0xde, 0x3e, 0x87, 0x2d}, + }, + { + {0x81, 0xf9, 0x5d, 0x4e, 0xe1, 0x2, 0x62, 0xaa, 0xf5, 0xe1, 0x15, + 0x50, 0x17, 0x59, 0xd, 0xa2, 0x6c, 0x1d, 0xe2, 0xba, 0xd3, 0x75, + 0xa2, 0x18, 0x53, 0x2, 0x60, 0x1, 0x8a, 0x61, 0x43, 0x5}, + {0x9, 0x34, 0x37, 0x43, 0x64, 0x31, 0x7a, 0x15, 0xd9, 0x81, 0xaa, + 0xf4, 0xee, 0xb7, 0xb8, 0xfa, 0x6, 0x48, 0xa6, 0xf5, 0xe6, 0xfe, + 0x93, 0xb0, 0xb6, 0xa7, 0x7f, 0x70, 0x54, 0x36, 0x77, 0x2e}, + {0xc1, 0x23, 0x4c, 0x97, 0xf4, 0xbd, 0xea, 0xd, 0x93, 0x46, 0xce, + 0x9d, 0x25, 0xa, 0x6f, 0xaa, 0x2c, 0xba, 0x9a, 0xa2, 0xb8, 0x2c, + 0x20, 0x4, 0xd, 0x96, 0x7, 0x2d, 0x36, 0x43, 0x14, 0x4b}, + }, + }, + { + { + {0xcb, 0x9c, 0x52, 0x1c, 0xe9, 0x54, 0x7c, 0x96, 0xfb, 0x35, 0xc6, + 0x64, 0x92, 0x26, 0xf6, 0x30, 0x65, 0x19, 0x12, 0x78, 0xf4, 0xaf, + 0x47, 0x27, 0x5c, 0x6f, 0xf6, 0xea, 0x18, 0x84, 0x3, 0x17}, + {0x7a, 0x1f, 0x6e, 0xb6, 0xc7, 0xb7, 0xc4, 0xcc, 0x7e, 0x2f, 0xc, + 0xf5, 0x25, 0x7e, 0x15, 0x44, 0x1c, 0xaf, 0x3e, 0x71, 0xfc, 0x6d, + 0xf0, 0x3e, 0xf7, 0x63, 0xda, 0x52, 0x67, 0x44, 0x2f, 0x58}, + {0xe4, 0x4c, 0x32, 0x20, 0xd3, 0x7b, 0x31, 0xc6, 0xc4, 0x8b, 0x48, + 0xa4, 0xe8, 0x42, 0x10, 0xa8, 0x64, 0x13, 0x5a, 0x4e, 0x8b, 0xf1, + 0x1e, 0xb2, 0xc9, 0x8d, 0xa2, 0xcd, 0x4b, 0x1c, 0x2a, 0xc}, + }, + { + {0x45, 0x69, 0xbd, 0x69, 0x48, 0x81, 0xc4, 0xed, 0x22, 0x8d, 0x1c, + 0xbe, 0x7d, 0x90, 0x6d, 0xd, 0xab, 0xc5, 0x5c, 0xd5, 0x12, 0xd2, + 0x3b, 0xc6, 0x83, 0xdc, 0x14, 0xa3, 0x30, 0x9b, 0x6a, 0x5a}, + {0x47, 0x4, 0x1f, 0x6f, 0xd0, 0xc7, 0x4d, 0xd2, 0x59, 0xc0, 0x87, + 0xdb, 0x3e, 0x9e, 0x26, 0xb2, 0x8f, 0xd2, 0xb2, 0xfb, 0x72, 0x2, + 0x5b, 0xd1, 0x77, 0x48, 0xf6, 0xc6, 0xd1, 0x8b, 0x55, 0x7c}, + {0x3d, 0x46, 0x96, 0xd3, 0x24, 0x15, 0xec, 0xd0, 0xf0, 0x24, 0x5a, + 0xc3, 0x8a, 0x62, 0xbb, 0x12, 0xa4, 0x5f, 0xbc, 0x1c, 0x79, 0x3a, + 0xc, 0xa5, 0xc3, 0xaf, 0xfb, 0xa, 0xca, 0xa5, 0x4, 0x4}, + }, + { + {0xd1, 0x6f, 0x41, 0x2a, 0x1b, 0x9e, 0xbc, 0x62, 0x8b, 0x59, 0x50, + 0xe3, 0x28, 0xf7, 0xc6, 0xb5, 0x67, 0x69, 0x5d, 0x3d, 0xd8, 0x3f, + 0x34, 0x4, 0x98, 0xee, 0xf8, 0xe7, 0x16, 0x75, 0x52, 0x39}, + {0xd6, 0x43, 0xa7, 0xa, 0x7, 0x40, 0x1f, 0x8c, 0xe8, 0x5e, 0x26, + 0x5b, 0xcb, 0xd0, 0xba, 0xcc, 0xde, 0xd2, 0x8f, 0x66, 0x6b, 0x4, + 0x4b, 0x57, 0x33, 0x96, 0xdd, 0xca, 0xfd, 0x5b, 0x39, 0x46}, + {0x9c, 0x9a, 0x5d, 0x1a, 0x2d, 0xdb, 0x7f, 0x11, 0x2a, 0x5c, 0x0, + 0xd1, 0xbc, 0x45, 0x77, 0x9c, 0xea, 0x6f, 0xd5, 0x54, 0xf1, 0xbe, + 0xd4, 0xef, 0x16, 0xd0, 0x22, 0xe8, 0x29, 0x9a, 0x57, 0x76}, + }, + { + {0xf2, 0x34, 0xb4, 0x52, 0x13, 0xb5, 0x3c, 0x33, 0xe1, 0x80, 0xde, + 0x93, 0x49, 0x28, 0x32, 0xd8, 0xce, 0x35, 0xd, 0x75, 0x87, 0x28, + 0x51, 0xb5, 0xc1, 0x77, 0x27, 0x2a, 0xbb, 0x14, 0xc5, 0x2}, + {0x17, 0x2a, 0xc0, 0x49, 0x7e, 0x8e, 0xb6, 0x45, 0x7f, 0xa3, 0xa9, + 0xbc, 0xa2, 0x51, 0xcd, 0x23, 0x1b, 0x4c, 0x22, 0xec, 0x11, 0x5f, + 0xd6, 0x3e, 0xb1, 0xbd, 0x5, 0x9e, 0xdc, 0x84, 0xa3, 0x43}, + {0x45, 0xb6, 0xf1, 0x8b, 0xda, 0xd5, 
0x4b, 0x68, 0x53, 0x4b, 0xb5, + 0xf6, 0x7e, 0xd3, 0x8b, 0xfb, 0x53, 0xd2, 0xb0, 0xa9, 0xd7, 0x16, + 0x39, 0x31, 0x59, 0x80, 0x54, 0x61, 0x9, 0x92, 0x60, 0x11}, + }, + { + {0xcd, 0x4d, 0x9b, 0x36, 0x16, 0x56, 0x38, 0x7a, 0x63, 0x35, 0x5c, + 0x65, 0xa7, 0x2c, 0xc0, 0x75, 0x21, 0x80, 0xf1, 0xd4, 0xf9, 0x1b, + 0xc2, 0x7d, 0x42, 0xe0, 0xe6, 0x91, 0x74, 0x7d, 0x63, 0x2f}, + {0xaa, 0xcf, 0xda, 0x29, 0x69, 0x16, 0x4d, 0xb4, 0x8f, 0x59, 0x13, + 0x84, 0x4c, 0x9f, 0x52, 0xda, 0x59, 0x55, 0x3d, 0x45, 0xca, 0x63, + 0xef, 0xe9, 0xb, 0x8e, 0x69, 0xc5, 0x5b, 0x12, 0x1e, 0x35}, + {0xbe, 0x7b, 0xf6, 0x1a, 0x46, 0x9b, 0xb4, 0xd4, 0x61, 0x89, 0xab, + 0xc8, 0x7a, 0x3, 0x3, 0xd6, 0xfb, 0x99, 0xa6, 0xf9, 0x9f, 0xe1, + 0xde, 0x71, 0x9a, 0x2a, 0xce, 0xe7, 0x6, 0x2d, 0x18, 0x7f}, + }, + { + {0x22, 0x75, 0x21, 0x8e, 0x72, 0x4b, 0x45, 0x9, 0xd8, 0xb8, 0x84, + 0xd4, 0xf4, 0xe8, 0x58, 0xaa, 0x3c, 0x90, 0x46, 0x7f, 0x4d, 0x25, + 0x58, 0xd3, 0x17, 0x52, 0x1c, 0x24, 0x43, 0xc0, 0xac, 0x44}, + {0xec, 0x68, 0x1, 0xab, 0x64, 0x8e, 0x7c, 0x7a, 0x43, 0xc5, 0xed, + 0x15, 0x55, 0x4a, 0x5a, 0xcb, 0xda, 0xe, 0xcd, 0x47, 0xd3, 0x19, + 0x55, 0x9, 0xb0, 0x93, 0x3e, 0x34, 0x8c, 0xac, 0xd4, 0x67}, + {0x77, 0x57, 0x7a, 0x4f, 0xbb, 0x6b, 0x7d, 0x1c, 0xe1, 0x13, 0x83, + 0x91, 0xd4, 0xfe, 0x35, 0x8b, 0x84, 0x46, 0x6b, 0xc9, 0xc6, 0xa1, + 0xdc, 0x4a, 0xbd, 0x71, 0xad, 0x12, 0x83, 0x1c, 0x6d, 0x55}, + }, + { + {0x21, 0xe8, 0x1b, 0xb1, 0x56, 0x67, 0xf0, 0x81, 0xdd, 0xf3, 0xa3, + 0x10, 0x23, 0xf8, 0xaf, 0xf, 0x5d, 0x46, 0x99, 0x6a, 0x55, 0xd0, + 0xb2, 0xf8, 0x5, 0x7f, 0x8c, 0xcc, 0x38, 0xbe, 0x7a, 0x9}, + {0x82, 0x39, 0x8d, 0xc, 0xe3, 0x40, 0xef, 0x17, 0x34, 0xfa, 0xa3, + 0x15, 0x3e, 0x7, 0xf7, 0x31, 0x6e, 0x64, 0x73, 0x7, 0xcb, 0xf3, + 0x21, 0x4f, 0xff, 0x4e, 0x82, 0x1d, 0x6d, 0x6c, 0x6c, 0x74}, + {0xa4, 0x2d, 0xa5, 0x7e, 0x87, 0xc9, 0x49, 0xc, 0x43, 0x1d, 0xdc, + 0x9b, 0x55, 0x69, 0x43, 0x4c, 0xd2, 0xeb, 0xcc, 0xf7, 0x9, 0x38, + 0x2c, 0x2, 0xbd, 0x84, 0xee, 0x4b, 0xa3, 0x14, 0x7e, 0x57}, + }, + { + {0x2b, 0xd7, 0x4d, 0xbd, 0xbe, 0xce, 0xfe, 0x94, 0x11, 0x22, 0xf, + 0x6, 0xda, 0x4f, 0x6a, 0xf4, 0xff, 0xd1, 0xc8, 0xc0, 0x77, 0x59, + 0x4a, 0x12, 0x95, 0x92, 0x0, 0xfb, 0xb8, 0x4, 0x53, 0x70}, + {0xa, 0x3b, 0xa7, 0x61, 0xac, 0x68, 0xe2, 0xf0, 0xf5, 0xa5, 0x91, + 0x37, 0x10, 0xfa, 0xfa, 0xf2, 0xe9, 0x0, 0x6d, 0x6b, 0x82, 0x3e, + 0xe1, 0xc1, 0x42, 0x8f, 0xd7, 0x6f, 0xe9, 0x7e, 0xfa, 0x60}, + {0xc6, 0x6e, 0x29, 0x4d, 0x35, 0x1d, 0x3d, 0xb6, 0xd8, 0x31, 0xad, + 0x5f, 0x3e, 0x5, 0xc3, 0xf3, 0xec, 0x42, 0xbd, 0xb4, 0x8c, 0x95, + 0xb, 0x67, 0xfd, 0x53, 0x63, 0xa1, 0xc, 0x8e, 0x39, 0x21}, + }, + }, + { + { + {0x1, 0x56, 0xb7, 0xb4, 0xf9, 0xaa, 0x98, 0x27, 0x72, 0xad, 0x8d, + 0x5c, 0x13, 0x72, 0xac, 0x5e, 0x23, 0xa0, 0xb7, 0x61, 0x61, 0xaa, + 0xce, 0xd2, 0x4e, 0x7d, 0x8f, 0xe9, 0x84, 0xb2, 0xbf, 0x1b}, + {0xf3, 0x33, 0x2b, 0x38, 0x8a, 0x5, 0xf5, 0x89, 0xb4, 0xc0, 0x48, + 0xad, 0xb, 0xba, 0xe2, 0x5a, 0x6e, 0xb3, 0x3d, 0xa5, 0x3, 0xb5, + 0x93, 0x8f, 0xe6, 0x32, 0xa2, 0x95, 0x9d, 0xed, 0xa3, 0x5a}, + {0x61, 0x65, 0xd9, 0xc7, 0xe9, 0x77, 0x67, 0x65, 0x36, 0x80, 0xc7, + 0x72, 0x54, 0x12, 0x2b, 0xcb, 0xee, 0x6e, 0x50, 0xd9, 0x99, 0x32, + 0x5, 0x65, 0xcc, 0x57, 0x89, 0x5e, 0x4e, 0xe1, 0x7, 0x4a}, + }, + { + {0x9b, 0xa4, 0x77, 0xc4, 0xcd, 0x58, 0xb, 0x24, 0x17, 0xf0, 0x47, + 0x64, 0xde, 0xda, 0x38, 0xfd, 0xad, 0x6a, 0xc8, 0xa7, 0x32, 0x8d, + 0x92, 0x19, 0x81, 0xa0, 0xaf, 0x84, 0xed, 0x7a, 0xaf, 0x50}, + {0x99, 0xf9, 0xd, 0x98, 0xcb, 0x12, 0xe4, 0x4e, 0x71, 0xc7, 0x6e, + 0x3c, 0x6f, 0xd7, 0x15, 0xa3, 0xfd, 0x77, 0x5c, 0x92, 0xde, 0xed, + 0xa5, 0xbb, 0x2, 0x34, 
0x31, 0x1d, 0x39, 0xac, 0xb, 0x3f}, + {0xe5, 0x5b, 0xf6, 0x15, 0x1, 0xde, 0x4f, 0x6e, 0xb2, 0x9, 0x61, + 0x21, 0x21, 0x26, 0x98, 0x29, 0xd9, 0xd6, 0xad, 0xb, 0x81, 0x5, + 0x2, 0x78, 0x6, 0xd0, 0xeb, 0xba, 0x16, 0xa3, 0x21, 0x19}, + }, + { + {0x8b, 0xc1, 0xf3, 0xd9, 0x9a, 0xad, 0x5a, 0xd7, 0x9c, 0xc1, 0xb1, + 0x60, 0xef, 0xe, 0x6a, 0x56, 0xd9, 0xe, 0x5c, 0x25, 0xac, 0xb, + 0x9a, 0x3e, 0xf5, 0xc7, 0x62, 0xa0, 0xec, 0x9d, 0x4, 0x7b}, + {0xfc, 0x70, 0xb8, 0xdf, 0x7e, 0x2f, 0x42, 0x89, 0xbd, 0xb3, 0x76, + 0x4f, 0xeb, 0x6b, 0x29, 0x2c, 0xf7, 0x4d, 0xc2, 0x36, 0xd4, 0xf1, + 0x38, 0x7, 0xb0, 0xae, 0x73, 0xe2, 0x41, 0xdf, 0x58, 0x64}, + {0x83, 0x44, 0x44, 0x35, 0x7a, 0xe3, 0xcb, 0xdc, 0x93, 0xbe, 0xed, + 0xf, 0x33, 0x79, 0x88, 0x75, 0x87, 0xdd, 0xc5, 0x12, 0xc3, 0x4, + 0x60, 0x78, 0x64, 0xe, 0x95, 0xc2, 0xcb, 0xdc, 0x93, 0x60}, + }, + { + {0x4b, 0x3, 0x84, 0x60, 0xbe, 0xee, 0xde, 0x6b, 0x54, 0xb8, 0xf, + 0x78, 0xb6, 0xc2, 0x99, 0x31, 0x95, 0x6, 0x2d, 0xb6, 0xab, 0x76, + 0x33, 0x97, 0x90, 0x7d, 0x64, 0x8b, 0xc9, 0x80, 0x31, 0x6e}, + {0x6d, 0x70, 0xe0, 0x85, 0x85, 0x9a, 0xf3, 0x1f, 0x33, 0x39, 0xe7, + 0xb3, 0xd8, 0xa5, 0xd0, 0x36, 0x3b, 0x45, 0x8f, 0x71, 0xe1, 0xf2, + 0xb9, 0x43, 0x7c, 0xa9, 0x27, 0x48, 0x8, 0xea, 0xd1, 0x57}, + {0x71, 0xb0, 0x28, 0xa1, 0xe7, 0xb6, 0x7a, 0xee, 0xaa, 0x8b, 0xa8, + 0x93, 0x6d, 0x59, 0xc1, 0xa4, 0x30, 0x61, 0x21, 0xb2, 0x82, 0xde, + 0xb4, 0xf7, 0x18, 0xbd, 0x97, 0xdd, 0x9d, 0x99, 0x3e, 0x36}, + }, + { + {0xc6, 0xae, 0x4b, 0xe2, 0xdc, 0x48, 0x18, 0x2f, 0x60, 0xaf, 0xbc, + 0xba, 0x55, 0x72, 0x9b, 0x76, 0x31, 0xe9, 0xef, 0x3c, 0x6e, 0x3c, + 0xcb, 0x90, 0x55, 0xb3, 0xf9, 0xc6, 0x9b, 0x97, 0x1f, 0x23}, + {0xc4, 0x1f, 0xee, 0x35, 0xc1, 0x43, 0xa8, 0x96, 0xcf, 0xc8, 0xe4, + 0x8, 0x55, 0xb3, 0x6e, 0x97, 0x30, 0xd3, 0x8c, 0xb5, 0x1, 0x68, + 0x2f, 0xb4, 0x2b, 0x5, 0x3a, 0x69, 0x78, 0x9b, 0xee, 0x48}, + {0xc6, 0xf3, 0x2a, 0xcc, 0x4b, 0xde, 0x31, 0x5c, 0x1f, 0x8d, 0x20, + 0xfe, 0x30, 0xb0, 0x4b, 0xb0, 0x66, 0xb4, 0x4f, 0xc1, 0x9, 0x70, + 0x8d, 0xb7, 0x13, 0x24, 0x79, 0x8, 0x9b, 0xfa, 0x9b, 0x7}, + }, + { + {0x45, 0x42, 0xd5, 0xa2, 0x80, 0xed, 0xc9, 0xf3, 0x52, 0x39, 0xf6, + 0x77, 0x78, 0x8b, 0xa0, 0xa, 0x75, 0x54, 0x8, 0xd1, 0x63, 0xac, + 0x6d, 0xd7, 0x6b, 0x63, 0x70, 0x94, 0x15, 0xfb, 0xf4, 0x1e}, + {0xf4, 0xd, 0x30, 0xda, 0x51, 0x3a, 0x90, 0xe3, 0xb0, 0x5a, 0xa9, + 0x3d, 0x23, 0x64, 0x39, 0x84, 0x80, 0x64, 0x35, 0xb, 0x2d, 0xf1, + 0x3c, 0xed, 0x94, 0x71, 0x81, 0x84, 0xf6, 0x77, 0x8c, 0x3}, + {0xec, 0x7b, 0x16, 0x5b, 0xe6, 0x5e, 0x4e, 0x85, 0xc2, 0xcd, 0xd0, + 0x96, 0x42, 0xa, 0x59, 0x59, 0x99, 0x21, 0x10, 0x98, 0x34, 0xdf, + 0xb2, 0x72, 0x56, 0xff, 0xb, 0x4a, 0x2a, 0xe9, 0x5e, 0x57}, + }, + { + {0x1, 0xd8, 0xa4, 0xa, 0x45, 0xbc, 0x46, 0x5d, 0xd8, 0xb9, 0x33, + 0xa5, 0x27, 0x12, 0xaf, 0xc3, 0xc2, 0x6, 0x89, 0x2b, 0x26, 0x3b, + 0x9e, 0x38, 0x1b, 0x58, 0x2f, 0x38, 0x7e, 0x1e, 0xa, 0x20}, + {0xcf, 0x2f, 0x18, 0x8a, 0x90, 0x80, 0xc0, 0xd4, 0xbd, 0x9d, 0x48, + 0x99, 0xc2, 0x70, 0xe1, 0x30, 0xde, 0x33, 0xf7, 0x52, 0x57, 0xbd, + 0xba, 0x5, 0x0, 0xfd, 0xd3, 0x2c, 0x11, 0xe7, 0xd4, 0x43}, + {0xc5, 0x3a, 0xf9, 0xea, 0x67, 0xb9, 0x8d, 0x51, 0xc0, 0x52, 0x66, + 0x5, 0x9b, 0x98, 0xbc, 0x71, 0xf5, 0x97, 0x71, 0x56, 0xd9, 0x85, + 0x2b, 0xfe, 0x38, 0x4e, 0x1e, 0x65, 0x52, 0xca, 0xe, 0x5}, + }, + { + {0xea, 0x68, 0xe6, 0x60, 0x76, 0x39, 0xac, 0x97, 0x97, 0xb4, 0x3a, + 0x15, 0xfe, 0xbb, 0x19, 0x9b, 0x9f, 0xa7, 0xec, 0x34, 0xb5, 0x79, + 0xb1, 0x4c, 0x57, 0xae, 0x31, 0xa1, 0x9f, 0xc0, 0x51, 0x61}, + {0x9c, 0xc, 0x3f, 0x45, 0xde, 0x1a, 0x43, 0xc3, 0x9b, 0x3b, 0x70, + 0xff, 0x5e, 0x4, 0xf5, 0xe9, 0x3d, 
0x7b, 0x84, 0xed, 0xc9, 0x7a, + 0xd9, 0xfc, 0xc6, 0xf4, 0x58, 0x1c, 0xc2, 0xe6, 0xe, 0x4b}, + {0x96, 0x5d, 0xf0, 0xfd, 0xd, 0x5c, 0xf5, 0x3a, 0x7a, 0xee, 0xb4, + 0x2a, 0xe0, 0x2e, 0x26, 0xdd, 0x9, 0x17, 0x17, 0x12, 0x87, 0xbb, + 0xb2, 0x11, 0xb, 0x3, 0xf, 0x80, 0xfa, 0x24, 0xef, 0x1f}, + }, + }, + { + { + {0x86, 0x6b, 0x97, 0x30, 0xf5, 0xaf, 0xd2, 0x22, 0x4, 0x46, 0xd2, + 0xc2, 0x6, 0xb8, 0x90, 0x8d, 0xe5, 0xba, 0xe5, 0x4d, 0x6c, 0x89, + 0xa1, 0xdc, 0x17, 0xc, 0x34, 0xc8, 0xe6, 0x5f, 0x0, 0x28}, + {0x96, 0x31, 0xa7, 0x1a, 0xfb, 0x53, 0xd6, 0x37, 0x18, 0x64, 0xd7, + 0x3f, 0x30, 0x95, 0x94, 0xf, 0xb2, 0x17, 0x3a, 0xfb, 0x9, 0xb, + 0x20, 0xad, 0x3e, 0x61, 0xc8, 0x2f, 0x29, 0x49, 0x4d, 0x54}, + {0x88, 0x86, 0x52, 0x34, 0x9f, 0xba, 0xef, 0x6a, 0xa1, 0x7d, 0x10, + 0x25, 0x94, 0xff, 0x1b, 0x5c, 0x36, 0x4b, 0xd9, 0x66, 0xcd, 0xbb, + 0x5b, 0xf7, 0xfa, 0x6d, 0x31, 0xf, 0x93, 0x72, 0xe4, 0x72}, + }, + { + {0x27, 0x76, 0x2a, 0xd3, 0x35, 0xf6, 0xf3, 0x7, 0xf0, 0x66, 0x65, + 0x5f, 0x86, 0x4d, 0xaa, 0x7a, 0x50, 0x44, 0xd0, 0x28, 0x97, 0xe7, + 0x85, 0x3c, 0x38, 0x64, 0xe0, 0xf, 0x0, 0x7f, 0xee, 0x1f}, + {0x4f, 0x8, 0x81, 0x97, 0x8c, 0x20, 0x95, 0x26, 0xe1, 0xe, 0x45, + 0x23, 0xb, 0x2a, 0x50, 0xb1, 0x2, 0xde, 0xef, 0x3, 0xa6, 0xae, + 0x9d, 0xfd, 0x4c, 0xa3, 0x33, 0x27, 0x8c, 0x2e, 0x9d, 0x5a}, + {0xe5, 0xf7, 0xdb, 0x3, 0xda, 0x5, 0x53, 0x76, 0xbd, 0xcd, 0x34, + 0x14, 0x49, 0xf2, 0xda, 0xa4, 0xec, 0x88, 0x4a, 0xd2, 0xcd, 0xd5, + 0x4a, 0x7b, 0x43, 0x5, 0x4, 0xee, 0x51, 0x40, 0xf9, 0x0}, + }, + { + {0x53, 0x97, 0xaf, 0x7, 0xbb, 0x93, 0xef, 0xd7, 0xa7, 0x66, 0xb7, + 0x3d, 0xcf, 0xd0, 0x3e, 0x58, 0xc5, 0x1e, 0xb, 0x6e, 0xbf, 0x98, + 0x69, 0xce, 0x52, 0x4, 0xd4, 0x5d, 0xd2, 0xff, 0xb7, 0x47}, + {0xb2, 0x30, 0xd3, 0xc3, 0x23, 0x6b, 0x35, 0x8d, 0x6, 0x1b, 0x47, + 0xb0, 0x9b, 0x8b, 0x1c, 0xf2, 0x3c, 0xb8, 0x42, 0x6e, 0x6c, 0x31, + 0x6c, 0xb3, 0xd, 0xb1, 0xea, 0x8b, 0x7e, 0x9c, 0xd7, 0x7}, + {0x12, 0xdd, 0x8, 0xbc, 0x9c, 0xfb, 0xfb, 0x87, 0x9b, 0xc2, 0xee, + 0xe1, 0x3a, 0x6b, 0x6, 0x8a, 0xbf, 0xc1, 0x1f, 0xdb, 0x2b, 0x24, + 0x57, 0xd, 0xb6, 0x4b, 0xa6, 0x5e, 0xa3, 0x20, 0x35, 0x1c}, + }, + { + {0x59, 0xc0, 0x6b, 0x21, 0x40, 0x6f, 0xa8, 0xcd, 0x7e, 0xd8, 0xbc, + 0x12, 0x1d, 0x23, 0xbb, 0x1f, 0x90, 0x9, 0xc7, 0x17, 0x9e, 0x6a, + 0x95, 0xb4, 0x55, 0x2e, 0xd1, 0x66, 0x3b, 0xc, 0x75, 0x38}, + {0x4a, 0xa3, 0xcb, 0xbc, 0xa6, 0x53, 0xd2, 0x80, 0x9b, 0x21, 0x38, + 0x38, 0xa1, 0xc3, 0x61, 0x3e, 0x96, 0xe3, 0x82, 0x98, 0x1, 0xb6, + 0xc3, 0x90, 0x6f, 0xe6, 0xe, 0x5d, 0x77, 0x5, 0x3d, 0x1c}, + {0x1a, 0xe5, 0x22, 0x94, 0x40, 0xf1, 0x2e, 0x69, 0x71, 0xf6, 0x5d, + 0x2b, 0x3c, 0xc7, 0xc0, 0xcb, 0x29, 0xe0, 0x4c, 0x74, 0xe7, 0x4f, + 0x1, 0x21, 0x7c, 0x48, 0x30, 0xd3, 0xc7, 0xe2, 0x21, 0x6}, + }, + { + {0xf3, 0xf0, 0xdb, 0xb0, 0x96, 0x17, 0xae, 0xb7, 0x96, 0xe1, 0x7c, + 0xe1, 0xb9, 0xaf, 0xdf, 0x54, 0xb4, 0xa3, 0xaa, 0xe9, 0x71, 0x30, + 0x92, 0x25, 0x9d, 0x2e, 0x0, 0xa1, 0x9c, 0x58, 0x8e, 0x5d}, + {0x8d, 0x83, 0x59, 0x82, 0xcc, 0x60, 0x98, 0xaf, 0xdc, 0x9a, 0x9f, + 0xc6, 0xc1, 0x48, 0xea, 0x90, 0x30, 0x1e, 0x58, 0x65, 0x37, 0x48, + 0x26, 0x65, 0xbc, 0xa5, 0xd3, 0x7b, 0x9, 0xd6, 0x7, 0x0}, + {0x4b, 0xa9, 0x42, 0x8, 0x95, 0x1d, 0xbf, 0xc0, 0x3e, 0x2e, 0x8f, + 0x58, 0x63, 0xc3, 0xd3, 0xb2, 0xef, 0xe2, 0x51, 0xbb, 0x38, 0x14, + 0x96, 0xa, 0x86, 0xbf, 0x1c, 0x3c, 0x78, 0xd7, 0x83, 0x15}, + }, + { + {0xc7, 0x28, 0x9d, 0xcc, 0x4, 0x47, 0x3, 0x90, 0x8f, 0xc5, 0x2c, + 0xf7, 0x9e, 0x67, 0x1b, 0x1d, 0x26, 0x87, 0x5b, 0xbe, 0x5f, 0x2b, + 0xe1, 0x16, 0xa, 0x58, 0xc5, 0x83, 0x4e, 0x6, 0x58, 0x49}, + {0xe1, 0x7a, 0xa2, 0x5d, 0xef, 0xa2, 0xee, 0xec, 
0x74, 0x1, 0x67, + 0x55, 0x14, 0x3a, 0x7c, 0x59, 0x7a, 0x16, 0x9, 0x66, 0x12, 0x2a, + 0xa6, 0xc9, 0x70, 0x8f, 0xed, 0x81, 0x2e, 0x5f, 0x2a, 0x25}, + {0xd, 0xe8, 0x66, 0x50, 0x26, 0x94, 0x28, 0xd, 0x6b, 0x8c, 0x7c, + 0x30, 0x85, 0xf7, 0xc3, 0xfc, 0xfd, 0x12, 0x11, 0xc, 0x78, 0xda, + 0x53, 0x1b, 0x88, 0xb3, 0x43, 0xd8, 0xb, 0x17, 0x9c, 0x7}, + }, + { + {0x56, 0xd0, 0xd5, 0xc0, 0x50, 0xcd, 0xd6, 0xcd, 0x3b, 0x57, 0x3, + 0xbb, 0x6d, 0x68, 0xf7, 0x9a, 0x48, 0xef, 0xc3, 0xf3, 0x3f, 0x72, + 0xa6, 0x3c, 0xcc, 0x8a, 0x7b, 0x31, 0xd7, 0xc0, 0x68, 0x67}, + {0xff, 0x6f, 0xfa, 0x64, 0xe4, 0xec, 0x6, 0x5, 0x23, 0xe5, 0x5, + 0x62, 0x1e, 0x43, 0xe3, 0xbe, 0x42, 0xea, 0xb8, 0x51, 0x24, 0x42, + 0x79, 0x35, 0x0, 0xfb, 0xc9, 0x4a, 0xe3, 0x5, 0xec, 0x6d}, + {0xb3, 0xc1, 0x55, 0xf1, 0xe5, 0x25, 0xb6, 0x94, 0x91, 0x7b, 0x7b, + 0x99, 0xa7, 0xf3, 0x7b, 0x41, 0x0, 0x26, 0x6b, 0x6d, 0xdc, 0xbd, + 0x2c, 0xc2, 0xf4, 0x52, 0xcd, 0xdd, 0x14, 0x5e, 0x44, 0x51}, + }, + { + {0x55, 0xa4, 0xbe, 0x2b, 0xab, 0x47, 0x31, 0x89, 0x29, 0x91, 0x7, + 0x92, 0x4f, 0xa2, 0x53, 0x8c, 0xa7, 0xf7, 0x30, 0xbe, 0x48, 0xf9, + 0x49, 0x4b, 0x3d, 0xd4, 0x4f, 0x6e, 0x8, 0x90, 0xe9, 0x12}, + {0x51, 0x49, 0x14, 0x3b, 0x4b, 0x2b, 0x50, 0x57, 0xb3, 0xbc, 0x4b, + 0x44, 0x6b, 0xff, 0x67, 0x8e, 0xdb, 0x85, 0x63, 0x16, 0x27, 0x69, + 0xbd, 0xb8, 0xc8, 0x95, 0x92, 0xe3, 0x31, 0x6f, 0x18, 0x13}, + {0x2e, 0xbb, 0xdf, 0x7f, 0xb3, 0x96, 0xc, 0xf1, 0xf9, 0xea, 0x1c, + 0x12, 0x5e, 0x93, 0x9a, 0x9f, 0x3f, 0x98, 0x5b, 0x3a, 0xc4, 0x36, + 0x11, 0xdf, 0xaf, 0x99, 0x3e, 0x5d, 0xf0, 0xe3, 0xb2, 0x77}, + }, + }, + { + { + {0xa4, 0xb0, 0xdd, 0x12, 0x9c, 0x63, 0x98, 0xd5, 0x6b, 0x86, 0x24, + 0xc0, 0x30, 0x9f, 0xd1, 0xa5, 0x60, 0xe4, 0xfc, 0x58, 0x3, 0x2f, + 0x7c, 0xd1, 0x8a, 0x5e, 0x9, 0x2e, 0x15, 0x95, 0xa1, 0x7}, + {0xde, 0xc4, 0x2e, 0x9c, 0xc5, 0xa9, 0x6f, 0x29, 0xcb, 0xf3, 0x84, + 0x4f, 0xbf, 0x61, 0x8b, 0xbc, 0x8, 0xf9, 0xa8, 0x17, 0xd9, 0x6, + 0x77, 0x1c, 0x5d, 0x25, 0xd3, 0x7a, 0xfc, 0x95, 0xb7, 0x63}, + {0xc8, 0x5f, 0x9e, 0x38, 0x2, 0x8f, 0x36, 0xa8, 0x3b, 0xe4, 0x8d, + 0xcf, 0x2, 0x3b, 0x43, 0x90, 0x43, 0x26, 0x41, 0xc5, 0x5d, 0xfd, + 0xa1, 0xaf, 0x37, 0x1, 0x2f, 0x3, 0x3d, 0xe8, 0x8f, 0x3e}, + }, + { + {0x3c, 0xd1, 0xef, 0xe8, 0x8d, 0x4c, 0x70, 0x8, 0x31, 0x37, 0xe0, + 0x33, 0x8e, 0x1a, 0xc5, 0xdf, 0xe3, 0xcd, 0x60, 0x12, 0xa5, 0x5d, + 0x9d, 0xa5, 0x86, 0x8c, 0x25, 0xa6, 0x99, 0x8, 0xd6, 0x22}, + {0x94, 0xa2, 0x70, 0x5, 0xb9, 0x15, 0x8b, 0x2f, 0x49, 0x45, 0x8, + 0x67, 0x70, 0x42, 0xf2, 0x94, 0x84, 0xfd, 0xbb, 0x61, 0xe1, 0x5a, + 0x1c, 0xde, 0x7, 0x40, 0xac, 0x7f, 0x79, 0x3b, 0xba, 0x75}, + {0x96, 0xd1, 0xcd, 0x70, 0xc0, 0xdb, 0x39, 0x62, 0x9a, 0x8a, 0x7d, + 0x6c, 0x8b, 0x8a, 0xfe, 0x60, 0x60, 0x12, 0x40, 0xeb, 0xbc, 0x47, + 0x88, 0xb3, 0x5e, 0x9e, 0x77, 0x87, 0x7b, 0xd0, 0x4, 0x9}, + }, + { + {0xb9, 0x40, 0xf9, 0x48, 0x66, 0x2d, 0x32, 0xf4, 0x39, 0xc, 0x2d, + 0xbd, 0xc, 0x2f, 0x95, 0x6, 0x31, 0xf9, 0x81, 0xa0, 0xad, 0x97, + 0x76, 0x16, 0x6c, 0x2a, 0xf7, 0xba, 0xce, 0xaa, 0x40, 0x62}, + {0x9c, 0x91, 0xba, 0xdd, 0xd4, 0x1f, 0xce, 0xb4, 0xaa, 0x8d, 0x4c, + 0xc7, 0x3e, 0xdb, 0x31, 0xcf, 0x51, 0xcc, 0x86, 0xad, 0x63, 0xcc, + 0x63, 0x2c, 0x7, 0xde, 0x1d, 0xbc, 0x3f, 0x14, 0xe2, 0x43}, + {0xa0, 0x95, 0xa2, 0x5b, 0x9c, 0x74, 0x34, 0xf8, 0x5a, 0xd2, 0x37, + 0xca, 0x5b, 0x7c, 0x94, 0xd6, 0x6a, 0x31, 0xc9, 0xe7, 0xa7, 0x3b, + 0xf1, 0x66, 0xac, 0xc, 0xb4, 0x8d, 0x23, 0xaf, 0xbd, 0x56}, + }, + { + {0xb2, 0x3b, 0x9d, 0xc1, 0x6c, 0xd3, 0x10, 0x13, 0xb9, 0x86, 0x23, + 0x62, 0xb7, 0x6b, 0x2a, 0x6, 0x5c, 0x4f, 0xa1, 0xd7, 0x91, 0x85, + 0x9b, 0x7c, 0x54, 0x57, 0x1e, 0x7e, 0x50, 
0x31, 0xaa, 0x3}, + {0xeb, 0x33, 0x35, 0xf5, 0xe3, 0xb9, 0x2a, 0x36, 0x40, 0x3d, 0xb9, + 0x6e, 0xd5, 0x68, 0x85, 0x33, 0x72, 0x55, 0x5a, 0x1d, 0x52, 0x14, + 0xe, 0x9e, 0x18, 0x13, 0x74, 0x83, 0x6d, 0xa8, 0x24, 0x1d}, + {0x1f, 0xce, 0xd4, 0xff, 0x48, 0x76, 0xec, 0xf4, 0x1c, 0x8c, 0xac, + 0x54, 0xf0, 0xea, 0x45, 0xe0, 0x7c, 0x35, 0x9, 0x1d, 0x82, 0x25, + 0xd2, 0x88, 0x59, 0x48, 0xeb, 0x9a, 0xdc, 0x61, 0xb2, 0x43}, + }, + { + {0x64, 0x13, 0x95, 0x6c, 0x8b, 0x3d, 0x51, 0x19, 0x7b, 0xf4, 0xb, + 0x0, 0x26, 0x71, 0xfe, 0x94, 0x67, 0x95, 0x4f, 0xd5, 0xdd, 0x10, + 0x8d, 0x2, 0x64, 0x9, 0x94, 0x42, 0xe2, 0xd5, 0xb4, 0x2}, + {0xbb, 0x79, 0xbb, 0x88, 0x19, 0x1e, 0x5b, 0xe5, 0x9d, 0x35, 0x7a, + 0xc1, 0x7d, 0xd0, 0x9e, 0xa0, 0x33, 0xea, 0x3d, 0x60, 0xe2, 0x2e, + 0x2c, 0xb0, 0xc2, 0x6b, 0x27, 0x5b, 0xcf, 0x55, 0x60, 0x32}, + {0xf2, 0x8d, 0xd1, 0x28, 0xcb, 0x55, 0xa1, 0xb4, 0x8, 0xe5, 0x6c, + 0x18, 0x46, 0x46, 0xcc, 0xea, 0x89, 0x43, 0x82, 0x6c, 0x93, 0xf4, + 0x9c, 0xc4, 0x10, 0x34, 0x5d, 0xae, 0x9, 0xc8, 0xa6, 0x27}, + }, + { + {0x54, 0x69, 0x3d, 0xc4, 0xa, 0x27, 0x2c, 0xcd, 0xb2, 0xca, 0x66, + 0x6a, 0x57, 0x3e, 0x4a, 0xdd, 0x6c, 0x3, 0xd7, 0x69, 0x24, 0x59, + 0xfa, 0x79, 0x99, 0x25, 0x8c, 0x3d, 0x60, 0x3, 0x15, 0x22}, + {0x88, 0xb1, 0xd, 0x1f, 0xcd, 0xeb, 0xa6, 0x8b, 0xe8, 0x5b, 0x5a, + 0x67, 0x3a, 0xd7, 0xd3, 0x37, 0x5a, 0x58, 0xf5, 0x15, 0xa3, 0xdf, + 0x2e, 0xf2, 0x7e, 0xa1, 0x60, 0xff, 0x74, 0x71, 0xb6, 0x2c}, + {0xd0, 0xe1, 0xb, 0x39, 0xf9, 0xcd, 0xee, 0x59, 0xf1, 0xe3, 0x8c, + 0x72, 0x44, 0x20, 0x42, 0xa9, 0xf4, 0xf0, 0x94, 0x7a, 0x66, 0x1c, + 0x89, 0x82, 0x36, 0xf4, 0x90, 0x38, 0xb7, 0xf4, 0x1d, 0x7b}, + }, + { + {0x8c, 0xf5, 0xf8, 0x7, 0x18, 0x22, 0x2e, 0x5f, 0xd4, 0x9, 0x94, + 0xd4, 0x9f, 0x5c, 0x55, 0xe3, 0x30, 0xa6, 0xb6, 0x1f, 0x8d, 0xa8, + 0xaa, 0xb2, 0x3d, 0xe0, 0x52, 0xd3, 0x45, 0x82, 0x69, 0x68}, + {0x24, 0xa2, 0xb2, 0xb3, 0xe0, 0xf2, 0x92, 0xe4, 0x60, 0x11, 0x55, + 0x2b, 0x6, 0x9e, 0x6c, 0x7c, 0xe, 0x7b, 0x7f, 0xd, 0xe2, 0x8f, + 0xeb, 0x15, 0x92, 0x59, 0xfc, 0x58, 0x26, 0xef, 0xfc, 0x61}, + {0x7a, 0x18, 0x18, 0x2a, 0x85, 0x5d, 0xb1, 0xdb, 0xd7, 0xac, 0xdd, + 0x86, 0xd3, 0xaa, 0xe4, 0xf3, 0x82, 0xc4, 0xf6, 0xf, 0x81, 0xe2, + 0xba, 0x44, 0xcf, 0x1, 0xaf, 0x3d, 0x47, 0x4c, 0xcf, 0x46}, + }, + { + {0x40, 0x81, 0x49, 0xf1, 0xa7, 0x6e, 0x3c, 0x21, 0x54, 0x48, 0x2b, + 0x39, 0xf8, 0x7e, 0x1e, 0x7c, 0xba, 0xce, 0x29, 0x56, 0x8c, 0xc3, + 0x88, 0x24, 0xbb, 0xc5, 0x8c, 0xd, 0xe5, 0xaa, 0x65, 0x10}, + {0xf9, 0xe5, 0xc4, 0x9e, 0xed, 0x25, 0x65, 0x42, 0x3, 0x33, 0x90, + 0x16, 0x1, 0xda, 0x5e, 0xe, 0xdc, 0xca, 0xe5, 0xcb, 0xf2, 0xa7, + 0xb1, 0x72, 0x40, 0x5f, 0xeb, 0x14, 0xcd, 0x7b, 0x38, 0x29}, + {0x57, 0xd, 0x20, 0xdf, 0x25, 0x45, 0x2c, 0x1c, 0x4a, 0x67, 0xca, + 0xbf, 0xd6, 0x2d, 0x3b, 0x5c, 0x30, 0x40, 0x83, 0xe1, 0xb1, 0xe7, + 0x7, 0xa, 0x16, 0xe7, 0x1c, 0x4f, 0xe6, 0x98, 0xa1, 0x69}, + }, + }, + { + { + {0xed, 0xca, 0xc5, 0xdc, 0x34, 0x44, 0x1, 0xe1, 0x33, 0xfb, 0x84, + 0x3c, 0x96, 0x5d, 0xed, 0x47, 0xe7, 0xa0, 0x86, 0xed, 0x76, 0x95, + 0x1, 0x70, 0xe4, 0xf9, 0x67, 0xd2, 0x7b, 0x69, 0xb2, 0x25}, + {0xbc, 0x78, 0x1a, 0xd9, 0xe0, 0xb2, 0x62, 0x90, 0x67, 0x96, 0x50, + 0xc8, 0x9c, 0x88, 0xc9, 0x47, 0xb8, 0x70, 0x50, 0x40, 0x66, 0x4a, + 0xf5, 0x9d, 0xbf, 0xa1, 0x93, 0x24, 0xa9, 0xe6, 0x69, 0x73}, + {0x64, 0x68, 0x98, 0x13, 0xfb, 0x3f, 0x67, 0x9d, 0xb8, 0xc7, 0x5d, + 0x41, 0xd9, 0xfb, 0xa5, 0x3c, 0x5e, 0x3b, 0x27, 0xdf, 0x3b, 0xcc, + 0x4e, 0xe0, 0xd2, 0x4c, 0x4e, 0xb5, 0x3d, 0x68, 0x20, 0x14}, + }, + { + {0xd0, 0x5a, 0xcc, 0xc1, 0x6f, 0xbb, 0xee, 0x34, 0x8b, 0xac, 0x46, + 0x96, 0xe9, 0xc, 0x1b, 0x6a, 
0x53, 0xde, 0x6b, 0xa6, 0x49, 0xda, + 0xb0, 0xd3, 0xc1, 0x81, 0xd0, 0x61, 0x41, 0x3b, 0xe8, 0x31}, + {0x97, 0xd1, 0x9d, 0x24, 0x1e, 0xbd, 0x78, 0xb4, 0x2, 0xc1, 0x58, + 0x5e, 0x0, 0x35, 0xc, 0x62, 0x5c, 0xac, 0xba, 0xcc, 0x2f, 0xd3, + 0x2, 0xfb, 0x2d, 0xa7, 0x8, 0xf5, 0xeb, 0x3b, 0xb6, 0x60}, + {0x4f, 0x2b, 0x6, 0x9e, 0x12, 0xc7, 0xe8, 0x97, 0xd8, 0xa, 0x32, + 0x29, 0x4f, 0x8f, 0xe4, 0x49, 0x3f, 0x68, 0x18, 0x6f, 0x4b, 0xe1, + 0xec, 0x5b, 0x17, 0x3, 0x55, 0x2d, 0xb6, 0x1e, 0xcf, 0x55}, + }, + { + {0x52, 0x8c, 0xf5, 0x7d, 0xe3, 0xb5, 0x76, 0x30, 0x36, 0xcc, 0x99, + 0xe7, 0xdd, 0xb9, 0x3a, 0xd7, 0x20, 0xee, 0x13, 0x49, 0xe3, 0x1c, + 0x83, 0xbd, 0x33, 0x1, 0xba, 0x62, 0xaa, 0xfb, 0x56, 0x1a}, + {0x58, 0x3d, 0xc2, 0x65, 0x10, 0x10, 0x79, 0x58, 0x9c, 0x81, 0x94, + 0x50, 0x6d, 0x8, 0x9d, 0x8b, 0xa7, 0x5f, 0xc5, 0x12, 0xa9, 0x2f, + 0x40, 0xe2, 0xd4, 0x91, 0x8, 0x57, 0x64, 0x65, 0x9a, 0x66}, + {0xec, 0xc9, 0x9d, 0x5c, 0x50, 0x6b, 0x3e, 0x94, 0x1a, 0x37, 0x7c, + 0xa7, 0xbb, 0x57, 0x25, 0x30, 0x51, 0x76, 0x34, 0x41, 0x56, 0xae, + 0x73, 0x98, 0x5c, 0x8a, 0xc5, 0x99, 0x67, 0x83, 0xc4, 0x13}, + }, + { + {0x80, 0xd0, 0x8b, 0x5d, 0x6a, 0xfb, 0xdc, 0xc4, 0x42, 0x48, 0x1a, + 0x57, 0xec, 0xc4, 0xeb, 0xde, 0x65, 0x53, 0xe5, 0xb8, 0x83, 0xe8, + 0xb2, 0xd4, 0x27, 0xb8, 0xe5, 0xc8, 0x7d, 0xc8, 0xbd, 0x50}, + {0xb9, 0xe1, 0xb3, 0x5a, 0x46, 0x5d, 0x3a, 0x42, 0x61, 0x3f, 0xf1, + 0xc7, 0x87, 0xc1, 0x13, 0xfc, 0xb6, 0xb9, 0xb5, 0xec, 0x64, 0x36, + 0xf8, 0x19, 0x7, 0xb6, 0x37, 0xa6, 0x93, 0xc, 0xf8, 0x66}, + {0x11, 0xe1, 0xdf, 0x6e, 0x83, 0x37, 0x6d, 0x60, 0xd9, 0xab, 0x11, + 0xf0, 0x15, 0x3e, 0x35, 0x32, 0x96, 0x3b, 0xb7, 0x25, 0xc3, 0x3a, + 0xb0, 0x64, 0xae, 0xd5, 0x5f, 0x72, 0x44, 0x64, 0xd5, 0x1d}, + }, + { + {0x9a, 0xc8, 0xba, 0x8, 0x0, 0xe6, 0x97, 0xc2, 0xe0, 0xc3, 0xe1, + 0xea, 0x11, 0xea, 0x4c, 0x7d, 0x7c, 0x97, 0xe7, 0x9f, 0xe1, 0x8b, + 0xe3, 0xf3, 0xcd, 0x5, 0xa3, 0x63, 0xf, 0x45, 0x3a, 0x3a}, + {0x7d, 0x12, 0x62, 0x33, 0xf8, 0x7f, 0xa4, 0x8f, 0x15, 0x7c, 0xcd, + 0x71, 0xc4, 0x6a, 0x9f, 0xbc, 0x8b, 0xc, 0x22, 0x49, 0x43, 0x45, + 0x71, 0x6e, 0x2e, 0x73, 0x9f, 0x21, 0x12, 0x59, 0x64, 0xe}, + {0x27, 0x46, 0x39, 0xd8, 0x31, 0x2f, 0x8f, 0x7, 0x10, 0xa5, 0x94, + 0xde, 0x83, 0x31, 0x9d, 0x38, 0x80, 0x6f, 0x99, 0x17, 0x6d, 0x6c, + 0xe3, 0xd1, 0x7b, 0xa8, 0xa9, 0x93, 0x93, 0x8d, 0x8c, 0x31}, + }, + { + {0x98, 0xd3, 0x1d, 0xab, 0x29, 0x9e, 0x66, 0x5d, 0x3b, 0x9e, 0x2d, + 0x34, 0x58, 0x16, 0x92, 0xfc, 0xcd, 0x73, 0x59, 0xf3, 0xfd, 0x1d, + 0x85, 0x55, 0xf6, 0xa, 0x95, 0x25, 0xc3, 0x41, 0x9a, 0x50}, + {0x19, 0xfe, 0xff, 0x2a, 0x3, 0x5d, 0x74, 0xf2, 0x66, 0xdb, 0x24, + 0x7f, 0x49, 0x3c, 0x9f, 0xc, 0xef, 0x98, 0x85, 0xba, 0xe3, 0xd3, + 0x98, 0xbc, 0x14, 0x53, 0x1d, 0x9a, 0x67, 0x7c, 0x4c, 0x22}, + {0xe9, 0x25, 0xf9, 0xa6, 0xdc, 0x6e, 0xc0, 0xbd, 0x33, 0x1f, 0x1b, + 0x64, 0xf4, 0xf3, 0x3e, 0x79, 0x89, 0x3e, 0x83, 0x9d, 0x80, 0x12, + 0xec, 0x82, 0x89, 0x13, 0xa1, 0x28, 0x23, 0xf0, 0xbf, 0x5}, + }, + { + {0xe4, 0x12, 0xc5, 0xd, 0xdd, 0xa0, 0x81, 0x68, 0xfe, 0xfa, 0xa5, + 0x44, 0xc8, 0xd, 0xe7, 0x4f, 0x40, 0x52, 0x4a, 0x8f, 0x6b, 0x8e, + 0x74, 0x1f, 0xea, 0xa3, 0x1, 0xee, 0xcd, 0x77, 0x62, 0x57}, + {0xb, 0xe0, 0xca, 0x23, 0x70, 0x13, 0x32, 0x36, 0x59, 0xcf, 0xac, + 0xd1, 0xa, 0xcf, 0x4a, 0x54, 0x88, 0x1c, 0x1a, 0xd2, 0x49, 0x10, + 0x74, 0x96, 0xa7, 0x44, 0x2a, 0xfa, 0xc3, 0x8c, 0xb, 0x78}, + {0x5f, 0x30, 0x4f, 0x23, 0xbc, 0x8a, 0xf3, 0x1e, 0x8, 0xde, 0x5, + 0x14, 0xbd, 0x7f, 0x57, 0x9a, 0xd, 0x2a, 0xe6, 0x34, 0x14, 0xa5, + 0x82, 0x5e, 0xa1, 0xb7, 0x71, 0x62, 0x72, 0x18, 0xf4, 0x5f}, + }, + { + {0x40, 0x95, 0xb6, 0x13, 0xe8, 
0x47, 0xdb, 0xe5, 0xe1, 0x10, 0x26, + 0x43, 0x3b, 0x2a, 0x5d, 0xf3, 0x76, 0x12, 0x78, 0x38, 0xe9, 0x26, + 0x1f, 0xac, 0x69, 0xcb, 0xa0, 0xa0, 0x8c, 0xdb, 0xd4, 0x29}, + {0x9d, 0xdb, 0x89, 0x17, 0xc, 0x8, 0x8e, 0x39, 0xf5, 0x78, 0xe7, + 0xf3, 0x25, 0x20, 0x60, 0xa7, 0x5d, 0x3, 0xbd, 0x6, 0x4c, 0x89, + 0x98, 0xfa, 0xbe, 0x66, 0xa9, 0x25, 0xdc, 0x3, 0x6a, 0x10}, + {0xd0, 0x53, 0x33, 0x33, 0xaf, 0xa, 0xad, 0xd9, 0xe5, 0x9, 0xd3, + 0xac, 0xa5, 0x9d, 0x66, 0x38, 0xf0, 0xf7, 0x88, 0xc8, 0x8a, 0x65, + 0x57, 0x3c, 0xfa, 0xbe, 0x2c, 0x5, 0x51, 0x8a, 0xb3, 0x4a}, + }, + }, + { + { + {0x9c, 0xc0, 0xdd, 0x5f, 0xef, 0xd1, 0xcf, 0xd6, 0xce, 0x5d, 0x57, + 0xf7, 0xfd, 0x3e, 0x2b, 0xe8, 0xc2, 0x34, 0x16, 0x20, 0x5d, 0x6b, + 0xd5, 0x25, 0x9b, 0x2b, 0xed, 0x4, 0xbb, 0xc6, 0x41, 0x30}, + {0x93, 0xd5, 0x68, 0x67, 0x25, 0x2b, 0x7c, 0xda, 0x13, 0xca, 0x22, + 0x44, 0x57, 0xc0, 0xc1, 0x98, 0x1d, 0xce, 0xa, 0xca, 0xd5, 0xb, + 0xa8, 0xf1, 0x90, 0xa6, 0x88, 0xc0, 0xad, 0xd1, 0xcd, 0x29}, + {0x48, 0xe1, 0x56, 0xd9, 0xf9, 0xf2, 0xf2, 0xf, 0x2e, 0x6b, 0x35, + 0x9f, 0x75, 0x97, 0xe7, 0xad, 0x5c, 0x2, 0x6c, 0x5f, 0xbb, 0x98, + 0x46, 0x1a, 0x7b, 0x9a, 0x4, 0x14, 0x68, 0xbd, 0x4b, 0x10}, + }, + { + {0x63, 0xf1, 0x7f, 0xd6, 0x5f, 0x9a, 0x5d, 0xa9, 0x81, 0x56, 0xc7, + 0x4c, 0x9d, 0xe6, 0x2b, 0xe9, 0x57, 0xf2, 0x20, 0xde, 0x4c, 0x2, + 0xf8, 0xb7, 0xf5, 0x2d, 0x7, 0xfb, 0x20, 0x2a, 0x4f, 0x20}, + {0x67, 0xed, 0xf1, 0x68, 0x31, 0xfd, 0xf0, 0x51, 0xc2, 0x3b, 0x6f, + 0xd8, 0xcd, 0x1d, 0x81, 0x2c, 0xde, 0xf2, 0xd2, 0x4, 0x43, 0x5c, + 0xdc, 0x44, 0x49, 0x71, 0x2a, 0x9, 0x57, 0xcc, 0xe8, 0x5b}, + {0x79, 0xb0, 0xeb, 0x30, 0x3d, 0x3b, 0x14, 0xc8, 0x30, 0x2e, 0x65, + 0xbd, 0x5a, 0x15, 0x89, 0x75, 0x31, 0x5c, 0x6d, 0x8f, 0x31, 0x3c, + 0x3c, 0x65, 0x1f, 0x16, 0x79, 0xc2, 0x17, 0xfb, 0x70, 0x25}, + }, + { + {0x5a, 0x24, 0xb8, 0xb, 0x55, 0xa9, 0x2e, 0x19, 0xd1, 0x50, 0x90, + 0x8f, 0xa8, 0xfb, 0xe6, 0xc8, 0x35, 0xc9, 0xa4, 0x88, 0x2d, 0xea, + 0x86, 0x79, 0x68, 0x86, 0x1, 0xde, 0x91, 0x5f, 0x1c, 0x24}, + {0x75, 0x15, 0xb6, 0x2c, 0x7f, 0x36, 0xfa, 0x3e, 0x6c, 0x2, 0xd6, + 0x1c, 0x76, 0x6f, 0xf9, 0xf5, 0x62, 0x25, 0xb5, 0x65, 0x2a, 0x14, + 0xc7, 0xe8, 0xcd, 0xa, 0x3, 0x53, 0xea, 0x65, 0xcb, 0x3d}, + {0xaa, 0x6c, 0xde, 0x40, 0x29, 0x17, 0xd8, 0x28, 0x3a, 0x73, 0xd9, + 0x22, 0xf0, 0x2c, 0xbf, 0x8f, 0xd1, 0x1, 0x5b, 0x23, 0xdd, 0xfc, + 0xd7, 0x16, 0xe5, 0xf0, 0xcd, 0x5f, 0xdd, 0xe, 0x42, 0x8}, + }, + { + {0xce, 0x10, 0xf4, 0x4, 0x4e, 0xc3, 0x58, 0x3, 0x85, 0x6, 0x6e, + 0x27, 0x5a, 0x5b, 0x13, 0xb6, 0x21, 0x15, 0xb9, 0xeb, 0xc7, 0x70, + 0x96, 0x5d, 0x9c, 0x88, 0xdb, 0x21, 0xf3, 0x54, 0xd6, 0x4}, + {0x4a, 0xfa, 0x62, 0x83, 0xab, 0x20, 0xff, 0xcd, 0x6e, 0x3e, 0x1a, + 0xe2, 0xd4, 0x18, 0xe1, 0x57, 0x2b, 0xe6, 0x39, 0xfc, 0x17, 0x96, + 0x17, 0xe3, 0xfd, 0x69, 0x17, 0xbc, 0xef, 0x53, 0x9a, 0xd}, + {0xd5, 0xb5, 0xbd, 0xdd, 0x16, 0xc1, 0x7d, 0x5e, 0x2d, 0xdd, 0xa5, + 0x8d, 0xb6, 0xde, 0x54, 0x29, 0x92, 0xa2, 0x34, 0x33, 0x17, 0x8, + 0xb6, 0x1c, 0xd7, 0x1a, 0x99, 0x18, 0x26, 0x4f, 0x7a, 0x4a}, + }, + { + {0x4b, 0x2a, 0x37, 0xaf, 0x91, 0xb2, 0xc3, 0x24, 0xf2, 0x47, 0x81, + 0x71, 0x70, 0x82, 0xda, 0x93, 0xf2, 0x9e, 0x89, 0x86, 0x64, 0x85, + 0x84, 0xdd, 0x33, 0xee, 0xe0, 0x23, 0x42, 0x31, 0x96, 0x4a}, + {0x95, 0x5f, 0xb1, 0x5f, 0x2, 0x18, 0xa7, 0xf4, 0x8f, 0x1b, 0x5c, + 0x6b, 0x34, 0x5f, 0xf6, 0x3d, 0x12, 0x11, 0xe0, 0x0, 0x85, 0xf0, + 0xfc, 0xcd, 0x48, 0x18, 0xd3, 0xdd, 0x4c, 0xc, 0xb5, 0x11}, + {0xd6, 0xff, 0xa4, 0x8, 0x44, 0x27, 0xe8, 0xa6, 0xd9, 0x76, 0x15, + 0x9c, 0x7e, 0x17, 0x8e, 0x73, 0xf2, 0xb3, 0x2, 0x3d, 0xb6, 0x48, + 0x33, 0x77, 0x51, 0xcc, 0x6b, 
0xce, 0x4d, 0xce, 0x4b, 0x4f}, + }, + { + {0x6f, 0xb, 0x9d, 0xc4, 0x6e, 0x61, 0xe2, 0x30, 0x17, 0x23, 0xec, + 0xca, 0x8f, 0x71, 0x56, 0xe4, 0xa6, 0x4f, 0x6b, 0xf2, 0x9b, 0x40, + 0xeb, 0x48, 0x37, 0x5f, 0x59, 0x61, 0xe5, 0xce, 0x42, 0x30}, + {0x84, 0x25, 0x24, 0xe2, 0x5a, 0xce, 0x1f, 0xa7, 0x9e, 0x8a, 0xf5, + 0x92, 0x56, 0x72, 0xea, 0x26, 0xf4, 0x3c, 0xea, 0x1c, 0xd7, 0x9, + 0x1a, 0xd2, 0xe6, 0x1, 0x1c, 0xb7, 0x14, 0xdd, 0xfc, 0x73}, + {0x41, 0xac, 0x9b, 0x44, 0x79, 0x70, 0x7e, 0x42, 0xa, 0x31, 0xe2, + 0xbc, 0x6d, 0xe3, 0x5a, 0x85, 0x7c, 0x1a, 0x84, 0x5f, 0x21, 0x76, + 0xae, 0x4c, 0xd6, 0xe1, 0x9c, 0x9a, 0xc, 0x74, 0x9e, 0x38}, + }, + { + {0x28, 0xac, 0xe, 0x57, 0xf6, 0x78, 0xbd, 0xc9, 0xe1, 0x9c, 0x91, + 0x27, 0x32, 0xb, 0x5b, 0xe5, 0xed, 0x91, 0x9b, 0xa1, 0xab, 0x3e, + 0xfc, 0x65, 0x90, 0x36, 0x26, 0xd6, 0xe5, 0x25, 0xc4, 0x25}, + {0xce, 0xb9, 0xdc, 0x34, 0xae, 0xb3, 0xfc, 0x64, 0xad, 0xd0, 0x48, + 0xe3, 0x23, 0x3, 0x50, 0x97, 0x1b, 0x38, 0xc6, 0x62, 0x7d, 0xf0, + 0xb3, 0x45, 0x88, 0x67, 0x5a, 0x46, 0x79, 0x53, 0x54, 0x61}, + {0x6e, 0xde, 0xd7, 0xf1, 0xa6, 0x6, 0x3e, 0x3f, 0x8, 0x23, 0x6, + 0x8e, 0x27, 0x76, 0xf9, 0x3e, 0x77, 0x6c, 0x8a, 0x4e, 0x26, 0xf6, + 0x14, 0x8c, 0x59, 0x47, 0x48, 0x15, 0x89, 0xa0, 0x39, 0x65}, + }, + { + {0x19, 0x4a, 0xbb, 0x14, 0xd4, 0xdb, 0xc4, 0xdd, 0x8e, 0x4f, 0x42, + 0x98, 0x3c, 0xbc, 0xb2, 0x19, 0x69, 0x71, 0xca, 0x36, 0xd7, 0x9f, + 0xa8, 0x48, 0x90, 0xbd, 0x19, 0xf0, 0xe, 0x32, 0x65, 0xf}, + {0x73, 0xf7, 0xd2, 0xc3, 0x74, 0x1f, 0xd2, 0xe9, 0x45, 0x68, 0xc4, + 0x25, 0x41, 0x54, 0x50, 0xc1, 0x33, 0x9e, 0xb9, 0xf9, 0xe8, 0x5c, + 0x4e, 0x62, 0x6c, 0x18, 0xcd, 0xc5, 0xaa, 0xe4, 0xc5, 0x11}, + {0xc6, 0xe0, 0xfd, 0xca, 0xb1, 0xd1, 0x86, 0xd4, 0x81, 0x51, 0x3b, + 0x16, 0xe3, 0xe6, 0x3f, 0x4f, 0x9a, 0x93, 0xf2, 0xfa, 0xd, 0xaf, + 0xa8, 0x59, 0x2a, 0x7, 0x33, 0xec, 0xbd, 0xc7, 0xab, 0x4c}, + }, + }, + { + { + {0x89, 0xd2, 0x78, 0x3f, 0x8f, 0x78, 0x8f, 0xc0, 0x9f, 0x4d, 0x40, + 0xa1, 0x2c, 0xa7, 0x30, 0xfe, 0x9d, 0xcc, 0x65, 0xcf, 0xfc, 0x8b, + 0x77, 0xf2, 0x21, 0x20, 0xcb, 0x5a, 0x16, 0x98, 0xe4, 0x7e}, + {0x2e, 0xa, 0x9c, 0x8, 0x24, 0x96, 0x9e, 0x23, 0x38, 0x47, 0xfe, + 0x3a, 0xc0, 0xc4, 0x48, 0xc7, 0x2a, 0xa1, 0x4f, 0x76, 0x2a, 0xed, + 0xdb, 0x17, 0x82, 0x85, 0x1c, 0x32, 0xf0, 0x93, 0x9b, 0x63}, + {0xc3, 0xa1, 0x11, 0x91, 0xe3, 0x8, 0xd5, 0x7b, 0x89, 0x74, 0x90, + 0x80, 0xd4, 0x90, 0x2b, 0x2b, 0x19, 0xfd, 0x72, 0xae, 0xc2, 0xae, + 0xd2, 0xe7, 0xa6, 0x2, 0xb6, 0x85, 0x3c, 0x49, 0xdf, 0xe}, + }, + { + {0x13, 0x41, 0x76, 0x84, 0xd2, 0xc4, 0x67, 0x67, 0x35, 0xf8, 0xf5, + 0xf7, 0x3f, 0x40, 0x90, 0xa0, 0xde, 0xbe, 0xe6, 0xca, 0xfa, 0xcf, + 0x8f, 0x1c, 0x69, 0xa3, 0xdf, 0xd1, 0x54, 0xc, 0xc0, 0x4}, + {0x68, 0x5a, 0x9b, 0x59, 0x58, 0x81, 0xcc, 0xae, 0xe, 0xe2, 0xad, + 0xeb, 0xf, 0x4f, 0x57, 0xea, 0x7, 0x7f, 0xb6, 0x22, 0x74, 0x1d, + 0xe4, 0x4f, 0xb4, 0x4f, 0x9d, 0x1, 0xe3, 0x92, 0x3b, 0x40}, + {0xf8, 0x5c, 0x46, 0x8b, 0x81, 0x2f, 0xc2, 0x4d, 0xf8, 0xef, 0x80, + 0x14, 0x5a, 0xf3, 0xa0, 0x71, 0x57, 0xd6, 0xc7, 0x4, 0xad, 0xbf, + 0xe8, 0xae, 0xf4, 0x76, 0x61, 0xb2, 0x2a, 0xb1, 0x5b, 0x35}, + }, + { + {0x18, 0x73, 0x8c, 0x5a, 0xc7, 0xda, 0x1, 0xa3, 0x11, 0xaa, 0xce, + 0xb3, 0x9d, 0x3, 0x90, 0xed, 0x2d, 0x3f, 0xae, 0x3b, 0xbf, 0x7c, + 0x7, 0x6f, 0x8e, 0xad, 0x52, 0xe0, 0xf8, 0xea, 0x18, 0x75}, + {0xf4, 0xbb, 0x93, 0x74, 0xcc, 0x64, 0x1e, 0xa7, 0xc3, 0xb0, 0xa3, + 0xec, 0xd9, 0x84, 0xbd, 0xe5, 0x85, 0xe7, 0x5, 0xfa, 0xc, 0xc5, + 0x6b, 0xa, 0x12, 0xc3, 0x2e, 0x18, 0x32, 0x81, 0x9b, 0xf}, + {0x32, 0x6c, 0x7f, 0x1b, 0xc4, 0x59, 0x88, 0xa4, 0x98, 0x32, 0x38, + 0xf4, 0xbc, 0x60, 0x2d, 
0xf, 0xd9, 0xd1, 0xb1, 0xc9, 0x29, 0xa9, + 0x15, 0x18, 0xc4, 0x55, 0x17, 0xbb, 0x1b, 0x87, 0xc3, 0x47}, + }, + { + {0xb0, 0x66, 0x50, 0xc8, 0x50, 0x5d, 0xe6, 0xfb, 0xb0, 0x99, 0xa2, + 0xb3, 0xb0, 0xc4, 0xec, 0x62, 0xe0, 0xe8, 0x1a, 0x44, 0xea, 0x54, + 0x37, 0xe5, 0x5f, 0x8d, 0xd4, 0xe8, 0x2c, 0xa0, 0xfe, 0x8}, + {0x48, 0x4f, 0xec, 0x71, 0x97, 0x53, 0x44, 0x51, 0x6e, 0x5d, 0x8c, + 0xc9, 0x7d, 0xb1, 0x5, 0xf8, 0x6b, 0xc6, 0xc3, 0x47, 0x1a, 0xc1, + 0x62, 0xf7, 0xdc, 0x99, 0x46, 0x76, 0x85, 0x9b, 0xb8, 0x0}, + {0xd0, 0xea, 0xde, 0x68, 0x76, 0xdd, 0x4d, 0x82, 0x23, 0x5d, 0x68, + 0x4b, 0x20, 0x45, 0x64, 0xc8, 0x65, 0xd6, 0x89, 0x5d, 0xcd, 0xcf, + 0x14, 0xb5, 0x37, 0xd5, 0x75, 0x4f, 0xa7, 0x29, 0x38, 0x47}, + }, + { + {0xc9, 0x2, 0x39, 0xad, 0x3a, 0x53, 0xd9, 0x23, 0x8f, 0x58, 0x3, + 0xef, 0xce, 0xdd, 0xc2, 0x64, 0xb4, 0x2f, 0xe1, 0xcf, 0x90, 0x73, + 0x25, 0x15, 0x90, 0xd3, 0xe4, 0x44, 0x4d, 0x8b, 0x66, 0x6c}, + {0x18, 0xc4, 0x79, 0x46, 0x75, 0xda, 0xd2, 0x82, 0xf0, 0x8d, 0x61, + 0xb2, 0xd8, 0xd7, 0x3b, 0xe6, 0xa, 0xeb, 0x47, 0xac, 0x24, 0xef, + 0x5e, 0x35, 0xb4, 0xc6, 0x33, 0x48, 0x4c, 0x68, 0x78, 0x20}, + {0xc, 0x82, 0x78, 0x7a, 0x21, 0xcf, 0x48, 0x3b, 0x97, 0x3e, 0x27, + 0x81, 0xb2, 0xa, 0x6a, 0xf7, 0x7b, 0xed, 0x8e, 0x8c, 0xa7, 0x65, + 0x6c, 0xa9, 0x3f, 0x43, 0x8a, 0x4f, 0x5, 0xa6, 0x11, 0x74}, + }, + { + {0xb4, 0x75, 0xb1, 0x18, 0x3d, 0xe5, 0x9a, 0x57, 0x2, 0xa1, 0x92, + 0xf3, 0x59, 0x31, 0x71, 0x68, 0xf5, 0x35, 0xef, 0x1e, 0xba, 0xec, + 0x55, 0x84, 0x8f, 0x39, 0x8c, 0x45, 0x72, 0xa8, 0xc9, 0x1e}, + {0x6d, 0xc8, 0x9d, 0xb9, 0x32, 0x9d, 0x65, 0x4d, 0x15, 0xf1, 0x3a, + 0x60, 0x75, 0xdc, 0x4c, 0x4, 0x88, 0xe4, 0xc2, 0xdc, 0x2c, 0x71, + 0x4c, 0xb3, 0xff, 0x34, 0x81, 0xfb, 0x74, 0x65, 0x13, 0x7c}, + {0x9b, 0x50, 0xa2, 0x0, 0xd4, 0xa4, 0xe6, 0xb8, 0xb4, 0x82, 0xc8, + 0xb, 0x2, 0xd7, 0x81, 0x9b, 0x61, 0x75, 0x95, 0xf1, 0x9b, 0xcc, + 0xe7, 0x57, 0x60, 0x64, 0xcd, 0xc7, 0xa5, 0x88, 0xdd, 0x3a}, + }, + { + {0x46, 0x30, 0x39, 0x59, 0xd4, 0x98, 0xc2, 0x85, 0xec, 0x59, 0xf6, + 0x5f, 0x98, 0x35, 0x7e, 0x8f, 0x3a, 0x6e, 0xf6, 0xf2, 0x2a, 0xa2, + 0x2c, 0x1d, 0x20, 0xa7, 0x6, 0xa4, 0x31, 0x11, 0xba, 0x61}, + {0xf2, 0xdc, 0x35, 0xb6, 0x70, 0x57, 0x89, 0xab, 0xbc, 0x1f, 0x6c, + 0xf6, 0x6c, 0xef, 0xdf, 0x2, 0x87, 0xd1, 0xb6, 0xbe, 0x68, 0x2, + 0x53, 0x85, 0x74, 0x9e, 0x87, 0xcc, 0xfc, 0x29, 0x99, 0x24}, + {0x29, 0x90, 0x95, 0x16, 0xf1, 0xa0, 0xd0, 0xa3, 0x89, 0xbd, 0x7e, + 0xba, 0x6c, 0x6b, 0x3b, 0x2, 0x7, 0x33, 0x78, 0x26, 0x3e, 0x5a, + 0xf1, 0x7b, 0xe7, 0xec, 0xd8, 0xbb, 0xc, 0x31, 0x20, 0x56}, + }, + { + {0xd6, 0x85, 0xe2, 0x77, 0xf4, 0xb5, 0x46, 0x66, 0x93, 0x61, 0x8f, + 0x6c, 0x67, 0xff, 0xe8, 0x40, 0xdd, 0x94, 0xb5, 0xab, 0x11, 0x73, + 0xec, 0xa6, 0x4d, 0xec, 0x8c, 0x65, 0xf3, 0x46, 0xc8, 0x7e}, + {0x43, 0xd6, 0x34, 0x49, 0x43, 0x93, 0x89, 0x52, 0xf5, 0x22, 0x12, + 0xa5, 0x6, 0xf8, 0xdb, 0xb9, 0x22, 0x1c, 0xf4, 0xc3, 0x8f, 0x87, + 0x6d, 0x8f, 0x30, 0x97, 0x9d, 0x4d, 0x2a, 0x6a, 0x67, 0x37}, + {0xc7, 0x2e, 0xa2, 0x1d, 0x3f, 0x8f, 0x5e, 0x9b, 0x13, 0xcd, 0x1, + 0x6c, 0x77, 0x1d, 0xf, 0x13, 0xb8, 0x9f, 0x98, 0xa2, 0xcf, 0x8f, + 0x4c, 0x21, 0xd5, 0x9d, 0x9b, 0x39, 0x23, 0xf7, 0xaa, 0x6d}, + }, + }, + { + { + {0xa2, 0x8e, 0xad, 0xac, 0xbf, 0x4, 0x3b, 0x58, 0x84, 0xe8, 0x8b, + 0x14, 0xe8, 0x43, 0xb7, 0x29, 0xdb, 0xc5, 0x10, 0x8, 0x3b, 0x58, + 0x1e, 0x2b, 0xaa, 0xbb, 0xb3, 0x8e, 0xe5, 0x49, 0x54, 0x2b}, + {0x47, 0xbe, 0x3d, 0xeb, 0x62, 0x75, 0x3a, 0x5f, 0xb8, 0xa0, 0xbd, + 0x8e, 0x54, 0x38, 0xea, 0xf7, 0x99, 0x72, 0x74, 0x45, 0x31, 0xe5, + 0xc3, 0x0, 0x51, 0xd5, 0x27, 0x16, 0xe7, 0xe9, 0x4, 0x13}, + {0xfe, 0x9c, 
0xdc, 0x6a, 0xd2, 0x14, 0x98, 0x78, 0xb, 0xdd, 0x48, + 0x8b, 0x3f, 0xab, 0x1b, 0x3c, 0xa, 0xc6, 0x79, 0xf9, 0xff, 0xe1, + 0xf, 0xda, 0x93, 0xd6, 0x2d, 0x7c, 0x2d, 0xde, 0x68, 0x44}, + }, + { + {0xce, 0x7, 0x63, 0xf8, 0xc6, 0xd8, 0x9a, 0x4b, 0x28, 0xc, 0x5d, + 0x43, 0x31, 0x35, 0x11, 0x21, 0x2c, 0x77, 0x7a, 0x65, 0xc5, 0x66, + 0xa8, 0xd4, 0x52, 0x73, 0x24, 0x63, 0x7e, 0x42, 0xa6, 0x5d}, + {0x9e, 0x46, 0x19, 0x94, 0x5e, 0x35, 0xbb, 0x51, 0x54, 0xc7, 0xdd, + 0x23, 0x4c, 0xdc, 0xe6, 0x33, 0x62, 0x99, 0x7f, 0x44, 0xd6, 0xb6, + 0xa5, 0x93, 0x63, 0xbd, 0x44, 0xfb, 0x6f, 0x7c, 0xce, 0x6c}, + {0xca, 0x22, 0xac, 0xde, 0x88, 0xc6, 0x94, 0x1a, 0xf8, 0x1f, 0xae, + 0xbb, 0xf7, 0x6e, 0x6, 0xb9, 0xf, 0x58, 0x59, 0x8d, 0x38, 0x8c, + 0xad, 0x88, 0xa8, 0x2c, 0x9f, 0xe7, 0xbf, 0x9a, 0xf2, 0x58}, + }, + { + {0xf6, 0xcd, 0xe, 0x71, 0xbf, 0x64, 0x5a, 0x4b, 0x3c, 0x29, 0x2c, + 0x46, 0x38, 0xe5, 0x4c, 0xb1, 0xb9, 0x3a, 0xb, 0xd5, 0x56, 0xd0, + 0x43, 0x36, 0x70, 0x48, 0x5b, 0x18, 0x24, 0x37, 0xf9, 0x6a}, + {0x68, 0x3e, 0xe7, 0x8d, 0xab, 0xcf, 0xe, 0xe9, 0xa5, 0x76, 0x7e, + 0x37, 0x9f, 0x6f, 0x3, 0x54, 0x82, 0x59, 0x1, 0xbe, 0xb, 0x5b, + 0x49, 0xf0, 0x36, 0x1e, 0xf4, 0xa7, 0xc4, 0x29, 0x76, 0x57}, + {0x88, 0xa8, 0xc6, 0x9, 0x45, 0x2, 0x20, 0x32, 0x73, 0x89, 0x55, + 0x4b, 0x13, 0x36, 0xe0, 0xd2, 0x9f, 0x28, 0x33, 0x3c, 0x23, 0x36, + 0xe2, 0x83, 0x8f, 0xc1, 0xae, 0xc, 0xbb, 0x25, 0x1f, 0x70}, + }, + { + {0x13, 0xc1, 0xbe, 0x7c, 0xd9, 0xf6, 0x18, 0x9d, 0xe4, 0xdb, 0xbf, + 0x74, 0xe6, 0x6, 0x4a, 0x84, 0xd6, 0x60, 0x4e, 0xac, 0x22, 0xb5, + 0xf5, 0x20, 0x51, 0x5e, 0x95, 0x50, 0xc0, 0x5b, 0xa, 0x72}, + {0xed, 0x6c, 0x61, 0xe4, 0xf8, 0xb0, 0xa8, 0xc3, 0x7d, 0xa8, 0x25, + 0x9e, 0xe, 0x66, 0x0, 0xf7, 0x9c, 0xa5, 0xbc, 0xf4, 0x1f, 0x6, + 0xe3, 0x61, 0xe9, 0xb, 0xc4, 0xbd, 0xbf, 0x92, 0xc, 0x2e}, + {0x35, 0x5a, 0x80, 0x9b, 0x43, 0x9, 0x3f, 0xc, 0xfc, 0xab, 0x42, + 0x62, 0x37, 0x8b, 0x4e, 0xe8, 0x46, 0x93, 0x22, 0x5c, 0xf3, 0x17, + 0x14, 0x69, 0xec, 0xf0, 0x4e, 0x14, 0xbb, 0x9c, 0x9b, 0xe}, + }, + { + {0xee, 0xbe, 0xb1, 0x5d, 0xd5, 0x9b, 0xee, 0x8d, 0xb9, 0x3f, 0x72, + 0xa, 0x37, 0xab, 0xc3, 0xc9, 0x91, 0xd7, 0x68, 0x1c, 0xbf, 0xf1, + 0xa8, 0x44, 0xde, 0x3c, 0xfd, 0x1c, 0x19, 0x44, 0x6d, 0x36}, + {0xad, 0x20, 0x57, 0xfb, 0x8f, 0xd4, 0xba, 0xfb, 0xe, 0xd, 0xf9, + 0xdb, 0x6b, 0x91, 0x81, 0xee, 0xbf, 0x43, 0x55, 0x63, 0x52, 0x31, + 0x81, 0xd4, 0xd8, 0x7b, 0x33, 0x3f, 0xeb, 0x4, 0x11, 0x22}, + {0x14, 0x8c, 0xbc, 0xf2, 0x43, 0x17, 0x3c, 0x9e, 0x3b, 0x6c, 0x85, + 0xb5, 0xfc, 0x26, 0xda, 0x2e, 0x97, 0xfb, 0xa7, 0x68, 0xe, 0x2f, + 0xb8, 0xcc, 0x44, 0x32, 0x59, 0xbc, 0xe6, 0xa4, 0x67, 0x41}, + }, + { + {0xee, 0x8f, 0xce, 0xf8, 0x65, 0x26, 0xbe, 0xc2, 0x2c, 0xd6, 0x80, + 0xe8, 0x14, 0xff, 0x67, 0xe9, 0xee, 0x4e, 0x36, 0x2f, 0x7e, 0x6e, + 0x2e, 0xf1, 0xf6, 0xd2, 0x7e, 0xcb, 0x70, 0x33, 0xb3, 0x34}, + {0x0, 0x27, 0xf6, 0x76, 0x28, 0x9d, 0x3b, 0x64, 0xeb, 0x68, 0x76, + 0xe, 0x40, 0x9d, 0x1d, 0x5d, 0x84, 0x6, 0xfc, 0x21, 0x3, 0x43, + 0x4b, 0x1b, 0x6a, 0x24, 0x55, 0x22, 0x7e, 0xbb, 0x38, 0x79}, + {0xcc, 0xd6, 0x81, 0x86, 0xee, 0x91, 0xc5, 0xcd, 0x53, 0xa7, 0x85, + 0xed, 0x9c, 0x10, 0x2, 0xce, 0x83, 0x88, 0x80, 0x58, 0xc1, 0x85, + 0x74, 0xed, 0xe4, 0x65, 0xfe, 0x2d, 0x6e, 0xfc, 0x76, 0x11}, + }, + { + {0xb8, 0xe, 0x77, 0x49, 0x89, 0xe2, 0x90, 0xdb, 0xa3, 0x40, 0xf4, + 0xac, 0x2a, 0xcc, 0xfb, 0x98, 0x9b, 0x87, 0xd7, 0xde, 0xfe, 0x4f, + 0x35, 0x21, 0xb6, 0x6, 0x69, 0xf2, 0x54, 0x3e, 0x6a, 0x1f}, + {0x9b, 0x61, 0x9c, 0x5b, 0xd0, 0x6c, 0xaf, 0xb4, 0x80, 0x84, 0xa5, + 0xb2, 0xf4, 0xc9, 0xdf, 0x2d, 0xc4, 0x4d, 0xe9, 0xeb, 0x2, 0xa5, + 0x4f, 0x3d, 
0x34, 0x5f, 0x7d, 0x67, 0x4c, 0x3a, 0xfc, 0x8}, + {0xea, 0x34, 0x7, 0xd3, 0x99, 0xc1, 0xa4, 0x60, 0xd6, 0x5c, 0x16, + 0x31, 0xb6, 0x85, 0xc0, 0x40, 0x95, 0x82, 0x59, 0xf7, 0x23, 0x3e, + 0x33, 0xe2, 0xd1, 0x0, 0xb9, 0x16, 0x1, 0xad, 0x2f, 0x4f}, + }, + { + {0x38, 0xb6, 0x3b, 0xb7, 0x1d, 0xd9, 0x2c, 0x96, 0x8, 0x9c, 0x12, + 0xfc, 0xaa, 0x77, 0x5, 0xe6, 0x89, 0x16, 0xb6, 0xf3, 0x39, 0x9b, + 0x61, 0x6f, 0x81, 0xee, 0x44, 0x29, 0x5f, 0x99, 0x51, 0x34}, + {0x54, 0x4e, 0xae, 0x94, 0x41, 0xb2, 0xbe, 0x44, 0x6c, 0xef, 0x57, + 0x18, 0x51, 0x1c, 0x54, 0x5f, 0x98, 0x4, 0x8d, 0x36, 0x2d, 0x6b, + 0x1e, 0xa6, 0xab, 0xf7, 0x2e, 0x97, 0xa4, 0x84, 0x54, 0x44}, + {0x7c, 0x7d, 0xea, 0x9f, 0xd0, 0xfc, 0x52, 0x91, 0xf6, 0x5c, 0x93, + 0xb0, 0x94, 0x6c, 0x81, 0x4a, 0x40, 0x5c, 0x28, 0x47, 0xaa, 0x9a, + 0x8e, 0x25, 0xb7, 0x93, 0x28, 0x4, 0xa6, 0x9c, 0xb8, 0x10}, + }, + }, + { + { + {0x6e, 0xf0, 0x45, 0x5a, 0xbe, 0x41, 0x39, 0x75, 0x65, 0x5f, 0x9c, + 0x6d, 0xed, 0xae, 0x7c, 0xd0, 0xb6, 0x51, 0xff, 0x72, 0x9c, 0x6b, + 0x77, 0x11, 0xa9, 0x4d, 0xd, 0xef, 0xd9, 0xd1, 0xd2, 0x17}, + {0x9c, 0x28, 0x18, 0x97, 0x49, 0x47, 0x59, 0x3d, 0x26, 0x3f, 0x53, + 0x24, 0xc5, 0xf8, 0xeb, 0x12, 0x15, 0xef, 0xc3, 0x14, 0xcb, 0xbf, + 0x62, 0x2, 0x8e, 0x51, 0xb7, 0x77, 0xd5, 0x78, 0xb8, 0x20}, + {0x6a, 0x3e, 0x3f, 0x7, 0x18, 0xaf, 0xf2, 0x27, 0x69, 0x10, 0x52, + 0xd7, 0x19, 0xe5, 0x3f, 0xfd, 0x22, 0x0, 0xa6, 0x3c, 0x2c, 0xb7, + 0xe3, 0x22, 0xa7, 0xc6, 0x65, 0xcc, 0x63, 0x4f, 0x21, 0x72}, + }, + { + {0xc9, 0x29, 0x3b, 0xf4, 0xb9, 0xb7, 0x9d, 0x1d, 0x75, 0x8f, 0x51, + 0x4f, 0x4a, 0x82, 0x5, 0xd6, 0xc4, 0x9d, 0x2f, 0x31, 0xbd, 0x72, + 0xc0, 0xf2, 0xb0, 0x45, 0x15, 0x5a, 0x85, 0xac, 0x24, 0x1f}, + {0x93, 0xa6, 0x7, 0x53, 0x40, 0x7f, 0xe3, 0xb4, 0x95, 0x67, 0x33, + 0x2f, 0xd7, 0x14, 0xa7, 0xab, 0x99, 0x10, 0x76, 0x73, 0xa7, 0xd0, + 0xfb, 0xd6, 0xc9, 0xcb, 0x71, 0x81, 0xc5, 0x48, 0xdf, 0x5f}, + {0xaa, 0x5, 0x95, 0x8e, 0x32, 0x8, 0xd6, 0x24, 0xee, 0x20, 0x14, + 0xc, 0xd1, 0xc1, 0x48, 0x47, 0xa2, 0x25, 0xfb, 0x6, 0x5c, 0xe4, + 0xff, 0xc7, 0xe6, 0x95, 0xe3, 0x2a, 0x9e, 0x73, 0xba, 0x0}, + }, + { + {0x26, 0xbb, 0x88, 0xea, 0xf5, 0x26, 0x44, 0xae, 0xfb, 0x3b, 0x97, + 0x84, 0xd9, 0x79, 0x6, 0x36, 0x50, 0x4e, 0x69, 0x26, 0xc, 0x3, + 0x9f, 0x5c, 0x26, 0xd2, 0x18, 0xd5, 0xe7, 0x7d, 0x29, 0x72}, + {0xd6, 0x90, 0x87, 0x5c, 0xde, 0x98, 0x2e, 0x59, 0xdf, 0xa2, 0xc2, + 0x45, 0xd3, 0xb7, 0xbf, 0xe5, 0x22, 0x99, 0xb4, 0xf9, 0x60, 0x3b, + 0x5a, 0x11, 0xf3, 0x78, 0xad, 0x67, 0x3e, 0x3a, 0x28, 0x3}, + {0x39, 0xb9, 0xc, 0xbe, 0xc7, 0x1d, 0x24, 0x48, 0x80, 0x30, 0x63, + 0x8b, 0x4d, 0x9b, 0xf1, 0x32, 0x8, 0x93, 0x28, 0x2, 0xd, 0xc9, + 0xdf, 0xd3, 0x45, 0x19, 0x27, 0x46, 0x68, 0x29, 0xe1, 0x5}, + }, + { + {0x50, 0x45, 0x2c, 0x24, 0xc8, 0xbb, 0xbf, 0xad, 0xd9, 0x81, 0x30, + 0xd0, 0xec, 0xc, 0xc8, 0xbc, 0x92, 0xdf, 0xc8, 0xf5, 0xa6, 0x66, + 0x35, 0x84, 0x4c, 0xce, 0x58, 0x82, 0xd3, 0x25, 0xcf, 0x78}, + {0x5a, 0x49, 0x9c, 0x2d, 0xb3, 0xee, 0x82, 0xba, 0x7c, 0xb9, 0x2b, + 0xf1, 0xfc, 0xc8, 0xef, 0xce, 0xe0, 0xd1, 0xb5, 0x93, 0xae, 0xab, + 0x2d, 0xb0, 0x9b, 0x8d, 0x69, 0x13, 0x9c, 0xc, 0xc0, 0x39}, + {0x68, 0x9d, 0x48, 0x31, 0x8e, 0x6b, 0xae, 0x15, 0x87, 0xf0, 0x2b, + 0x9c, 0xab, 0x1c, 0x85, 0xaa, 0x5, 0xfa, 0x4e, 0xf0, 0x97, 0x5a, + 0xa7, 0xc9, 0x32, 0xf8, 0x3f, 0x6b, 0x7, 0x52, 0x6b, 0x0}, + }, + { + {0x2d, 0x8, 0xce, 0xb9, 0x16, 0x7e, 0xcb, 0xf5, 0x29, 0xbc, 0x7a, + 0x41, 0x4c, 0xf1, 0x7, 0x34, 0xab, 0xa7, 0xf4, 0x2b, 0xce, 0x6b, + 0xb3, 0xd4, 0xce, 0x75, 0x9f, 0x1a, 0x56, 0xe9, 0xe2, 0x7d}, + {0x1c, 0x78, 0x95, 0x9d, 0xe1, 0xcf, 0xe0, 0x29, 0xe2, 0x10, 0x63, + 0x96, 
0x18, 0xdf, 0x81, 0xb6, 0x39, 0x6b, 0x51, 0x70, 0xd3, 0x39, + 0xdf, 0x57, 0x22, 0x61, 0xc7, 0x3b, 0x44, 0xe3, 0x57, 0x4d}, + {0xcb, 0x5e, 0xa5, 0xb6, 0xf4, 0xd4, 0x70, 0xde, 0x99, 0xdb, 0x85, + 0x5d, 0x7f, 0x52, 0x1, 0x48, 0x81, 0x9a, 0xee, 0xd3, 0x40, 0xc4, + 0xc9, 0xdb, 0xed, 0x29, 0x60, 0x1a, 0xaf, 0x90, 0x2a, 0x6b}, + }, + { + {0xa, 0xd8, 0xb2, 0x5b, 0x24, 0xf3, 0xeb, 0x77, 0x9b, 0x7, 0xb9, + 0x2f, 0x47, 0x1b, 0x30, 0xd8, 0x33, 0x73, 0xee, 0x4c, 0xf2, 0xe6, + 0x47, 0xc6, 0x9, 0x21, 0x6c, 0x27, 0xc8, 0x12, 0x58, 0x46}, + {0x97, 0x1e, 0xe6, 0x9a, 0xfc, 0xf4, 0x23, 0x69, 0xd1, 0x5f, 0x3f, + 0xe0, 0x1d, 0x28, 0x35, 0x57, 0x2d, 0xd1, 0xed, 0xe6, 0x43, 0xae, + 0x64, 0xa7, 0x4a, 0x3e, 0x2d, 0xd1, 0xe9, 0xf4, 0xd8, 0x5f}, + {0xd9, 0x62, 0x10, 0x2a, 0xb2, 0xbe, 0x43, 0x4d, 0x16, 0xdc, 0x31, + 0x38, 0x75, 0xfb, 0x65, 0x70, 0xd7, 0x68, 0x29, 0xde, 0x7b, 0x4a, + 0xd, 0x18, 0x90, 0x67, 0xb1, 0x1c, 0x2b, 0x2c, 0xb3, 0x5}, + }, + { + {0x95, 0x81, 0xd5, 0x7a, 0x2c, 0xa4, 0xfc, 0xf7, 0xcc, 0xf3, 0x33, + 0x43, 0x6e, 0x28, 0x14, 0x32, 0x9d, 0x97, 0xb, 0x34, 0xd, 0x9d, + 0xc2, 0xb6, 0xe1, 0x7, 0x73, 0x56, 0x48, 0x1a, 0x77, 0x31}, + {0xfd, 0xa8, 0x4d, 0xd2, 0xcc, 0x5e, 0xc0, 0xc8, 0x83, 0xef, 0xdf, + 0x5, 0xac, 0x1a, 0xcf, 0xa1, 0x61, 0xcd, 0xf9, 0x7d, 0xf2, 0xef, + 0xbe, 0xdb, 0x99, 0x1e, 0x47, 0x7b, 0xa3, 0x56, 0x55, 0x3b}, + {0x82, 0xd4, 0x4d, 0xe1, 0x24, 0xc5, 0xb0, 0x32, 0xb6, 0xa4, 0x2b, + 0x1a, 0x54, 0x51, 0xb3, 0xed, 0xf3, 0x5a, 0x2b, 0x28, 0x48, 0x60, + 0xd1, 0xa3, 0xeb, 0x36, 0x73, 0x7a, 0xd2, 0x79, 0xc0, 0x4f}, + }, + { + {0xd, 0xc5, 0x86, 0xc, 0x44, 0x8b, 0x34, 0xdc, 0x51, 0xe6, 0x94, + 0xcc, 0xc9, 0xcb, 0x37, 0x13, 0xb9, 0x3c, 0x3e, 0x64, 0x4d, 0xf7, + 0x22, 0x64, 0x8, 0xcd, 0xe3, 0xba, 0xc2, 0x70, 0x11, 0x24}, + {0x7f, 0x2f, 0xbf, 0x89, 0xb0, 0x38, 0xc9, 0x51, 0xa7, 0xe9, 0xdf, + 0x2, 0x65, 0xbd, 0x97, 0x24, 0x53, 0xe4, 0x80, 0x78, 0x9c, 0xc0, + 0xff, 0xff, 0x92, 0x8e, 0xf9, 0xca, 0xce, 0x67, 0x45, 0x12}, + {0xb4, 0x73, 0xc4, 0xa, 0x86, 0xab, 0xf9, 0x3f, 0x35, 0xe4, 0x13, + 0x1, 0xee, 0x1d, 0x91, 0xf0, 0xaf, 0xc4, 0xc6, 0xeb, 0x60, 0x50, + 0xe7, 0x4a, 0xd, 0x0, 0x87, 0x6c, 0x96, 0x12, 0x86, 0x3f}, + }, + }, + { + { + {0x13, 0x8d, 0x4, 0x36, 0xfa, 0xfc, 0x18, 0x9c, 0xdd, 0x9d, 0x89, + 0x73, 0xb3, 0x9d, 0x15, 0x29, 0xaa, 0xd0, 0x92, 0x9f, 0xb, 0x35, + 0x9f, 0xdc, 0xd4, 0x19, 0x8a, 0x87, 0xee, 0x7e, 0xf5, 0x26}, + {0xde, 0xd, 0x2a, 0x78, 0xc9, 0xc, 0x9a, 0x55, 0x85, 0x83, 0x71, + 0xea, 0xb2, 0xcd, 0x1d, 0x55, 0x8c, 0x23, 0xef, 0x31, 0x5b, 0x86, + 0x62, 0x7f, 0x3d, 0x61, 0x73, 0x79, 0x76, 0xa7, 0x4a, 0x50}, + {0xb1, 0xef, 0x87, 0x56, 0xd5, 0x2c, 0xab, 0xc, 0x7b, 0xf1, 0x7a, + 0x24, 0x62, 0xd1, 0x80, 0x51, 0x67, 0x24, 0x5a, 0x4f, 0x34, 0x5a, + 0xc1, 0x85, 0x69, 0x30, 0xba, 0x9d, 0x3d, 0x94, 0x41, 0x40}, + }, + { + {0xdd, 0xaa, 0x6c, 0xa2, 0x43, 0x77, 0x21, 0x4b, 0xce, 0xb7, 0x8a, + 0x64, 0x24, 0xb4, 0xa6, 0x47, 0xe3, 0xc9, 0xfb, 0x3, 0x7a, 0x4f, + 0x1d, 0xcb, 0x19, 0xd0, 0x0, 0x98, 0x42, 0x31, 0xd9, 0x12}, + {0x96, 0xcc, 0xeb, 0x43, 0xba, 0xee, 0xc0, 0xc3, 0xaf, 0x9c, 0xea, + 0x26, 0x9c, 0x9c, 0x74, 0x8d, 0xc6, 0xcc, 0x77, 0x1c, 0xee, 0x95, + 0xfa, 0xd9, 0xf, 0x34, 0x84, 0x76, 0xd9, 0xa1, 0x20, 0x14}, + {0x4f, 0x59, 0x37, 0xd3, 0x99, 0x77, 0xc6, 0x0, 0x7b, 0xa4, 0x3a, + 0xb2, 0x40, 0x51, 0x3c, 0x5e, 0x95, 0xf3, 0x5f, 0xe3, 0x54, 0x28, + 0x18, 0x44, 0x12, 0xa0, 0x59, 0x43, 0x31, 0x92, 0x4f, 0x1b}, + }, + { + {0xb1, 0x66, 0x98, 0xa4, 0x30, 0x30, 0xcf, 0x33, 0x59, 0x48, 0x5f, + 0x21, 0xd2, 0x73, 0x1f, 0x25, 0xf6, 0xf4, 0xde, 0x51, 0x40, 0xaa, + 0x82, 0xab, 0xf6, 0x23, 0x9a, 0x6f, 0xd5, 0x91, 0xf1, 
0x5f}, + {0x51, 0x9, 0x15, 0x89, 0x9d, 0x10, 0x5c, 0x3e, 0x6a, 0x69, 0xe9, + 0x2d, 0x91, 0xfa, 0xce, 0x39, 0x20, 0x30, 0x5f, 0x97, 0x3f, 0xe4, + 0xea, 0x20, 0xae, 0x2d, 0x13, 0x7f, 0x2a, 0x57, 0x9b, 0x23}, + {0x68, 0x90, 0x2d, 0xac, 0x33, 0xd4, 0x9e, 0x81, 0x23, 0x85, 0xc9, + 0x5f, 0x79, 0xab, 0x83, 0x28, 0x3d, 0xeb, 0x93, 0x55, 0x80, 0x72, + 0x45, 0xef, 0xcb, 0x36, 0x8f, 0x75, 0x6a, 0x52, 0xc, 0x2}, + }, + { + {0x89, 0xcc, 0x42, 0xf0, 0x59, 0xef, 0x31, 0xe9, 0xb6, 0x4b, 0x12, + 0x8e, 0x9d, 0x9c, 0x58, 0x2c, 0x97, 0x59, 0xc7, 0xae, 0x8a, 0xe1, + 0xc8, 0xad, 0xc, 0xc5, 0x2, 0x56, 0xa, 0xfe, 0x2c, 0x45}, + {0xbc, 0xdb, 0xd8, 0x9e, 0xf8, 0x34, 0x98, 0x77, 0x6c, 0xa4, 0x7c, + 0xdc, 0xf9, 0xaa, 0xf2, 0xc8, 0x74, 0xb0, 0xe1, 0xa3, 0xdc, 0x4c, + 0x52, 0xa9, 0x77, 0x38, 0x31, 0x15, 0x46, 0xcc, 0xaa, 0x2}, + {0xdf, 0x77, 0x78, 0x64, 0xa0, 0xf7, 0xa0, 0x86, 0x9f, 0x7c, 0x60, + 0xe, 0x27, 0x64, 0xc4, 0xbb, 0xc9, 0x11, 0xfb, 0xf1, 0x25, 0xea, + 0x17, 0xab, 0x7b, 0x87, 0x4b, 0x30, 0x7b, 0x7d, 0xfb, 0x4c}, + }, + { + {0x12, 0xef, 0x89, 0x97, 0xc2, 0x99, 0x86, 0xe2, 0xd, 0x19, 0x57, + 0xdf, 0x71, 0xcd, 0x6e, 0x2b, 0xd0, 0x70, 0xc9, 0xec, 0x57, 0xc8, + 0x43, 0xc3, 0xc5, 0x3a, 0x4d, 0x43, 0xbc, 0x4c, 0x1d, 0x5b}, + {0xfe, 0x75, 0x9b, 0xb8, 0x6c, 0x3d, 0xb4, 0x72, 0x80, 0xdc, 0x6a, + 0x9c, 0xd9, 0x94, 0xc6, 0x54, 0x9f, 0x4c, 0xe3, 0x3e, 0x37, 0xaa, + 0xc3, 0xb8, 0x64, 0x53, 0x7, 0x39, 0x2b, 0x62, 0xb4, 0x14}, + {0x26, 0x9f, 0xa, 0xcc, 0x15, 0x26, 0xfb, 0xb6, 0xe5, 0xcc, 0x8d, + 0xb8, 0x2b, 0xe, 0x4f, 0x3a, 0x5, 0xa7, 0x69, 0x33, 0x8b, 0x49, + 0x1, 0x13, 0xd1, 0x2d, 0x59, 0x58, 0x12, 0xf7, 0x98, 0x2f}, + }, + { + {0x1, 0xa7, 0x54, 0x4f, 0x44, 0xae, 0x12, 0x2e, 0xde, 0xd7, 0xcb, + 0xa9, 0xf0, 0x3e, 0xfe, 0xfc, 0xe0, 0x5d, 0x83, 0x75, 0xd, 0x89, + 0xbf, 0xce, 0x54, 0x45, 0x61, 0xe7, 0xe9, 0x62, 0x80, 0x1d}, + {0x56, 0x9e, 0xf, 0xb5, 0x4c, 0xa7, 0x94, 0xc, 0x20, 0x13, 0x8e, + 0x8e, 0xa9, 0xf4, 0x1f, 0x5b, 0x67, 0xf, 0x30, 0x82, 0x21, 0xcc, + 0x2a, 0x9a, 0xf9, 0xaa, 0x6, 0xd8, 0x49, 0xe2, 0x6a, 0x3a}, + {0x5a, 0x7c, 0x90, 0xa9, 0x85, 0xda, 0x7a, 0x65, 0x62, 0xf, 0xb9, + 0x91, 0xb5, 0xa8, 0xe, 0x1a, 0xe9, 0xb4, 0x34, 0xdf, 0xfb, 0x1d, + 0xe, 0x8d, 0xf3, 0x5f, 0xf2, 0xae, 0xe8, 0x8c, 0x8b, 0x29}, + }, + { + {0xde, 0x65, 0x21, 0xa, 0xea, 0x72, 0x7a, 0x83, 0xf6, 0x79, 0xcf, + 0xb, 0xb4, 0x7, 0xab, 0x3f, 0x70, 0xae, 0x38, 0x77, 0xc7, 0x36, + 0x16, 0x52, 0xdc, 0xd7, 0xa7, 0x3, 0x18, 0x27, 0xa6, 0x6b}, + {0xb2, 0xc, 0xf7, 0xef, 0x53, 0x79, 0x92, 0x2a, 0x76, 0x70, 0x15, + 0x79, 0x2a, 0xc9, 0x89, 0x4b, 0x6a, 0xcf, 0xa7, 0x30, 0x7a, 0x45, + 0x18, 0x94, 0x85, 0xe4, 0x5c, 0x4d, 0x40, 0xa8, 0xb8, 0x34}, + {0x35, 0x33, 0x69, 0x83, 0xb5, 0xec, 0x6e, 0xc2, 0xfd, 0xfe, 0xb5, + 0x63, 0xdf, 0x13, 0xa8, 0xd5, 0x73, 0x25, 0xb2, 0xa4, 0x9a, 0xaa, + 0x93, 0xa2, 0x6a, 0x1c, 0x5e, 0x46, 0xdd, 0x2b, 0xd6, 0x71}, + }, + { + {0xf5, 0x5e, 0xf7, 0xb1, 0xda, 0xb5, 0x2d, 0xcd, 0xf5, 0x65, 0xb0, + 0x16, 0xcf, 0x95, 0x7f, 0xd7, 0x85, 0xf0, 0x49, 0x3f, 0xea, 0x1f, + 0x57, 0x14, 0x3d, 0x2b, 0x2b, 0x26, 0x21, 0x36, 0x33, 0x1c}, + {0x80, 0xdf, 0x78, 0xd3, 0x28, 0xcc, 0x33, 0x65, 0xb4, 0xa4, 0xf, + 0xa, 0x79, 0x43, 0xdb, 0xf6, 0x5a, 0xda, 0x1, 0xf7, 0xf9, 0x5f, + 0x64, 0xe3, 0xa4, 0x2b, 0x17, 0xf3, 0x17, 0xf3, 0xd5, 0x74}, + {0x81, 0xca, 0xd9, 0x67, 0x54, 0xe5, 0x6f, 0xa8, 0x37, 0x8c, 0x29, + 0x2b, 0x75, 0x7c, 0x8b, 0x39, 0x3b, 0x62, 0xac, 0xe3, 0x92, 0x8, + 0x6d, 0xda, 0x8c, 0xd9, 0xe9, 0x47, 0x45, 0xcc, 0xeb, 0x4a}, + }, + }, + { + { + {0x10, 0xb6, 0x54, 0x73, 0x9e, 0x8d, 0x40, 0xb, 0x6e, 0x5b, 0xa8, + 0x5b, 0x53, 0x32, 0x6b, 0x80, 0x7, 0xa2, 0x58, 
0x4a, 0x3, 0x3a, + 0xe6, 0xdb, 0x2c, 0xdf, 0xa1, 0xc9, 0xdd, 0xd9, 0x3b, 0x17}, + {0xc9, 0x1, 0x6d, 0x27, 0x1b, 0x7, 0xf0, 0x12, 0x70, 0x8c, 0xc4, + 0x86, 0xc5, 0xba, 0xb8, 0xe7, 0xa9, 0xfb, 0xd6, 0x71, 0x9b, 0x12, + 0x8, 0x53, 0x92, 0xb7, 0x3d, 0x5a, 0xf9, 0xfb, 0x88, 0x5d}, + {0xdf, 0x72, 0x58, 0xfe, 0x1e, 0xf, 0x50, 0x2b, 0xc1, 0x18, 0x39, + 0xd4, 0x2e, 0x58, 0xd6, 0x58, 0xe0, 0x3a, 0x67, 0xc9, 0x8e, 0x27, + 0xed, 0xe6, 0x19, 0xa3, 0x9e, 0xb1, 0x13, 0xcd, 0xe1, 0x6}, + }, + { + {0x53, 0x3, 0x5b, 0x9e, 0x62, 0xaf, 0x2b, 0x47, 0x47, 0x4, 0x8d, + 0x27, 0x90, 0xb, 0xaa, 0x3b, 0x27, 0xbf, 0x43, 0x96, 0x46, 0x5f, + 0x78, 0xc, 0x13, 0x7b, 0x83, 0x8d, 0x1a, 0x6a, 0x3a, 0x7f}, + {0x23, 0x6f, 0x16, 0x6f, 0x51, 0xad, 0xd0, 0x40, 0xbe, 0x6a, 0xab, + 0x1f, 0x93, 0x32, 0x8e, 0x11, 0x8e, 0x8, 0x4d, 0xa0, 0x14, 0x5e, + 0xe3, 0x3f, 0x66, 0x62, 0xe1, 0x26, 0x35, 0x60, 0x80, 0x30}, + {0xb, 0x80, 0x3d, 0x5d, 0x39, 0x44, 0xe6, 0xf7, 0xf6, 0xed, 0x1, + 0xc9, 0x55, 0xd5, 0xa8, 0x95, 0x39, 0x63, 0x2c, 0x59, 0x30, 0x78, + 0xcd, 0x68, 0x7e, 0x30, 0x51, 0x2e, 0xed, 0xfd, 0xd0, 0x30}, + }, + { + {0x50, 0x47, 0xb8, 0x68, 0x1e, 0x97, 0xb4, 0x9c, 0xcf, 0xbb, 0x64, + 0x66, 0x29, 0x72, 0x95, 0xa0, 0x2b, 0x41, 0xfa, 0x72, 0x26, 0xe7, + 0x8d, 0x5c, 0xd9, 0x89, 0xc5, 0x51, 0x43, 0x8, 0x15, 0x46}, + {0xb3, 0x33, 0x12, 0xf2, 0x1a, 0x4d, 0x59, 0xe0, 0x9c, 0x4d, 0xcc, + 0xf0, 0x8e, 0xe7, 0xdb, 0x1b, 0x77, 0x9a, 0x49, 0x8f, 0x7f, 0x18, + 0x65, 0x69, 0x68, 0x98, 0x9, 0x2c, 0x20, 0x14, 0x92, 0xa}, + {0x2e, 0xa0, 0xb9, 0xae, 0xc0, 0x19, 0x90, 0xbc, 0xae, 0x4c, 0x3, + 0x16, 0xd, 0x11, 0xc7, 0x55, 0xec, 0x32, 0x99, 0x65, 0x1, 0xf5, + 0x6d, 0xe, 0xfe, 0x5d, 0xca, 0x95, 0x28, 0xd, 0xca, 0x3b}, + }, + { + {0xbf, 0x1, 0xcc, 0x9e, 0xb6, 0x8e, 0x68, 0x9c, 0x6f, 0x89, 0x44, + 0xa6, 0xad, 0x83, 0xbc, 0xf0, 0xe2, 0x9f, 0x7a, 0x5f, 0x5f, 0x95, + 0x2d, 0xca, 0x41, 0x82, 0xf2, 0x8d, 0x3, 0xb4, 0xa8, 0x4e}, + {0xa4, 0x62, 0x5d, 0x3c, 0xbc, 0x31, 0xf0, 0x40, 0x60, 0x7a, 0xf0, + 0xcf, 0x3e, 0x8b, 0xfc, 0x19, 0x45, 0xb5, 0xf, 0x13, 0xa2, 0x3d, + 0x18, 0x98, 0xcd, 0x13, 0x8f, 0xae, 0xdd, 0xde, 0x31, 0x56}, + {0x2, 0xd2, 0xca, 0xf1, 0xa, 0x46, 0xed, 0x2a, 0x83, 0xee, 0x8c, + 0xa4, 0x5, 0x53, 0x30, 0x46, 0x5f, 0x1a, 0xf1, 0x49, 0x45, 0x77, + 0x21, 0x91, 0x63, 0xa4, 0x2c, 0x54, 0x30, 0x9, 0xce, 0x24}, + }, + { + {0x85, 0xb, 0xf3, 0xfd, 0x55, 0xa1, 0xcf, 0x3f, 0xa4, 0x2e, 0x37, + 0x36, 0x8e, 0x16, 0xf7, 0xd2, 0x44, 0xf8, 0x92, 0x64, 0xde, 0x64, + 0xe0, 0xb2, 0x80, 0x42, 0x4f, 0x32, 0xa7, 0x28, 0x99, 0x54}, + {0x6, 0xc1, 0x6, 0xfd, 0xf5, 0x90, 0xe8, 0x1f, 0xf2, 0x10, 0x88, + 0x5d, 0x35, 0x68, 0xc4, 0xb5, 0x3e, 0xaf, 0x8c, 0x6e, 0xfe, 0x8, + 0x78, 0x82, 0x4b, 0xd7, 0x6, 0x8a, 0xc2, 0xe3, 0xd4, 0x41}, + {0x2e, 0x1a, 0xee, 0x63, 0xa7, 0x32, 0x6e, 0xf2, 0xea, 0xfd, 0x5f, + 0xd2, 0xb7, 0xe4, 0x91, 0xae, 0x69, 0x4d, 0x7f, 0xd1, 0x3b, 0xd3, + 0x3b, 0xbc, 0x6a, 0xff, 0xdc, 0xc0, 0xde, 0x66, 0x1b, 0x49}, + }, + { + {0xa1, 0x64, 0xda, 0xd0, 0x8e, 0x4a, 0xf0, 0x75, 0x4b, 0x28, 0xe2, + 0x67, 0xaf, 0x2c, 0x22, 0xed, 0xa4, 0x7b, 0x7b, 0x1f, 0x79, 0xa3, + 0x34, 0x82, 0x67, 0x8b, 0x1, 0xb7, 0xb0, 0xb8, 0xf6, 0x4c}, + {0xa7, 0x32, 0xea, 0xc7, 0x3d, 0xb1, 0xf5, 0x98, 0x98, 0xdb, 0x16, + 0x7e, 0xcc, 0xf8, 0xd5, 0xe3, 0x47, 0xd9, 0xf8, 0xcb, 0x52, 0xbf, + 0xa, 0xac, 0xac, 0xe4, 0x5e, 0xc8, 0xd0, 0x38, 0xf3, 0x8}, + {0xbd, 0x73, 0x1a, 0x99, 0x21, 0xa8, 0x83, 0xc3, 0x7a, 0xc, 0x32, + 0xdf, 0x1, 0xbc, 0x27, 0xab, 0x63, 0x70, 0x77, 0x84, 0x1b, 0x33, + 0x3d, 0xc1, 0x99, 0x8a, 0x7, 0xeb, 0x82, 0x4a, 0xd, 0x53}, + }, + { + {0x9e, 0xbf, 0x9a, 0x6c, 0x45, 0x73, 0x69, 0x6d, 0x80, 
0xa8, 0x0, + 0x49, 0xfc, 0xb2, 0x7f, 0x25, 0x50, 0xb8, 0xcf, 0xc8, 0x12, 0xf4, + 0xac, 0x2b, 0x5b, 0xbd, 0xbf, 0xc, 0xe0, 0xe7, 0xb3, 0xd}, + {0x25, 0x48, 0xf9, 0xe1, 0x30, 0x36, 0x4c, 0x0, 0x5a, 0x53, 0xab, + 0x8c, 0x26, 0x78, 0x2d, 0x7e, 0x8b, 0xff, 0x84, 0xcc, 0x23, 0x23, + 0x48, 0xc7, 0xb9, 0x70, 0x17, 0x10, 0x3f, 0x75, 0xea, 0x65}, + {0x63, 0x63, 0x9, 0xe2, 0x3e, 0xfc, 0x66, 0x3d, 0x6b, 0xcb, 0xb5, + 0x61, 0x7f, 0x2c, 0xd6, 0x81, 0x1a, 0x3b, 0x44, 0x13, 0x42, 0x4, + 0xbe, 0xf, 0xdb, 0xa1, 0xe1, 0x21, 0x19, 0xec, 0xa4, 0x2}, + }, + { + {0x5f, 0x79, 0xcf, 0xf1, 0x62, 0x61, 0xc8, 0xf5, 0xf2, 0x57, 0xee, + 0x26, 0x19, 0x86, 0x8c, 0x11, 0x78, 0x35, 0x6, 0x1c, 0x85, 0x24, + 0x21, 0x17, 0xcf, 0x7f, 0x6, 0xec, 0x5d, 0x2b, 0xd1, 0x36}, + {0xa2, 0xb8, 0x24, 0x3b, 0x9a, 0x25, 0xe6, 0x5c, 0xb8, 0xa0, 0xaf, + 0x45, 0xcc, 0x7a, 0x57, 0xb8, 0x37, 0x70, 0xa0, 0x8b, 0xe8, 0xe6, + 0xcb, 0xcc, 0xbf, 0x9, 0x78, 0x12, 0x51, 0x3c, 0x14, 0x3d}, + {0x57, 0x45, 0x15, 0x79, 0x91, 0x27, 0x6d, 0x12, 0xa, 0x3a, 0x78, + 0xfc, 0x5c, 0x8f, 0xe4, 0xd5, 0xac, 0x9b, 0x17, 0xdf, 0xe8, 0xb6, + 0xbd, 0x36, 0x59, 0x28, 0xa8, 0x5b, 0x88, 0x17, 0xf5, 0x2e}, + }, + }, + { + { + {0x51, 0x2f, 0x5b, 0x30, 0xfb, 0xbf, 0xee, 0x96, 0xb8, 0x96, 0x95, + 0x88, 0xad, 0x38, 0xf9, 0xd3, 0x25, 0xdd, 0xd5, 0x46, 0xc7, 0x2d, + 0xf5, 0xf0, 0x95, 0x0, 0x3a, 0xbb, 0x90, 0x82, 0x96, 0x57}, + {0xdc, 0xae, 0x58, 0x8c, 0x4e, 0x97, 0x37, 0x46, 0xa4, 0x41, 0xf0, + 0xab, 0xfb, 0x22, 0xef, 0xb9, 0x8a, 0x71, 0x80, 0xe9, 0x56, 0xd9, + 0x85, 0xe1, 0xa6, 0xa8, 0x43, 0xb1, 0xfa, 0x78, 0x1b, 0x2f}, + {0x1, 0xe1, 0x20, 0xa, 0x43, 0xb8, 0x1a, 0xf7, 0x47, 0xec, 0xf0, + 0x24, 0x8d, 0x65, 0x93, 0xf3, 0xd1, 0xee, 0xe2, 0x6e, 0xa8, 0x9, + 0x75, 0xcf, 0xe1, 0xa3, 0x2a, 0xdc, 0x35, 0x3e, 0xc4, 0x7d}, + }, + { + {0x18, 0x97, 0x3e, 0x27, 0x5c, 0x2a, 0x78, 0x5a, 0x94, 0xfd, 0x4e, + 0x5e, 0x99, 0xc6, 0x76, 0x35, 0x3e, 0x7d, 0x23, 0x1f, 0x5, 0xd8, + 0x2e, 0xf, 0x99, 0xa, 0xd5, 0x82, 0x1d, 0xb8, 0x4f, 0x4}, + {0xc3, 0xd9, 0x7d, 0x88, 0x65, 0x66, 0x96, 0x85, 0x55, 0x53, 0xb0, + 0x4b, 0x31, 0x9b, 0xf, 0xc9, 0xb1, 0x79, 0x20, 0xef, 0xf8, 0x8d, + 0xe0, 0xc6, 0x2f, 0xc1, 0x8c, 0x75, 0x16, 0x20, 0xf7, 0x7e}, + {0xd9, 0xe3, 0x7, 0xa9, 0xc5, 0x18, 0xdf, 0xc1, 0x59, 0x63, 0x4c, + 0xce, 0x1d, 0x37, 0xb3, 0x57, 0x49, 0xbb, 0x1, 0xb2, 0x34, 0x45, + 0x70, 0xca, 0x2e, 0xdd, 0x30, 0x9c, 0x3f, 0x82, 0x79, 0x7f}, + }, + { + {0xba, 0x87, 0xf5, 0x68, 0xf0, 0x1f, 0x9c, 0x6a, 0xde, 0xc8, 0x50, + 0x0, 0x4e, 0x89, 0x27, 0x8, 0xe7, 0x5b, 0xed, 0x7d, 0x55, 0x99, + 0xbf, 0x3c, 0xf0, 0xd6, 0x6, 0x1c, 0x43, 0xb0, 0xa9, 0x64}, + {0xe8, 0x13, 0xb5, 0xa3, 0x39, 0xd2, 0x34, 0x83, 0xd8, 0xa8, 0x1f, + 0xb9, 0xd4, 0x70, 0x36, 0xc1, 0x33, 0xbd, 0x90, 0xf5, 0x36, 0x41, + 0xb5, 0x12, 0xb4, 0xd9, 0x84, 0xd7, 0x73, 0x3, 0x4e, 0xa}, + {0x19, 0x29, 0x7d, 0x5b, 0xa1, 0xd6, 0xb3, 0x2e, 0x35, 0x82, 0x3a, + 0xd5, 0xa0, 0xf6, 0xb4, 0xb0, 0x47, 0x5d, 0xa4, 0x89, 0x43, 0xce, + 0x56, 0x71, 0x6c, 0x34, 0x18, 0xce, 0xa, 0x7d, 0x1a, 0x7}, + }, + { + {0x31, 0x44, 0xe1, 0x20, 0x52, 0x35, 0xc, 0xcc, 0x41, 0x51, 0xb1, + 0x9, 0x7, 0x95, 0x65, 0xd, 0x36, 0x5f, 0x9d, 0x20, 0x1b, 0x62, + 0xf5, 0x9a, 0xd3, 0x55, 0x77, 0x61, 0xf7, 0xbc, 0x69, 0x7c}, + {0xb, 0xba, 0x87, 0xc8, 0xaa, 0x2d, 0x7, 0xd3, 0xee, 0x62, 0xa5, + 0xbf, 0x5, 0x29, 0x26, 0x1, 0x8b, 0x76, 0xef, 0xc0, 0x2, 0x30, + 0x54, 0xcf, 0x9c, 0x7e, 0xea, 0x46, 0x71, 0xcc, 0x3b, 0x2c}, + {0x5f, 0x29, 0xe8, 0x4, 0xeb, 0xd7, 0xf0, 0x7, 0x7d, 0xf3, 0x50, + 0x2f, 0x25, 0x18, 0xdb, 0x10, 0xd7, 0x98, 0x17, 0x17, 0xa3, 0xa9, + 0x51, 0xe9, 0x1d, 0xa5, 0xac, 0x22, 0x73, 0x9a, 0x5a, 
0x6f}, + }, + { + {0xbe, 0x44, 0xd9, 0xa3, 0xeb, 0xd4, 0x29, 0xe7, 0x9e, 0xaf, 0x78, + 0x80, 0x40, 0x9, 0x9e, 0x8d, 0x3, 0x9c, 0x86, 0x47, 0x7a, 0x56, + 0x25, 0x45, 0x24, 0x3b, 0x8d, 0xee, 0x80, 0x96, 0xab, 0x2}, + {0xc5, 0xc6, 0x41, 0x2f, 0xc, 0x0, 0xa1, 0x8b, 0x9b, 0xfb, 0xfe, + 0xc, 0xc1, 0x79, 0x9f, 0xc4, 0x9f, 0x1c, 0xc5, 0x3c, 0x70, 0x47, + 0xfa, 0x4e, 0xca, 0xaf, 0x47, 0xe1, 0xa2, 0x21, 0x4e, 0x49}, + {0x9a, 0xd, 0xe5, 0xdd, 0x85, 0x8a, 0xa4, 0xef, 0x49, 0xa2, 0xb9, + 0xf, 0x4e, 0x22, 0x9a, 0x21, 0xd9, 0xf6, 0x1e, 0xd9, 0x1d, 0x1f, + 0x9, 0xfa, 0x34, 0xbb, 0x46, 0xea, 0xcb, 0x76, 0x5d, 0x6b}, + }, + { + {0x22, 0x25, 0x78, 0x1e, 0x17, 0x41, 0xf9, 0xe0, 0xd3, 0x36, 0x69, + 0x3, 0x74, 0xae, 0xe6, 0xf1, 0x46, 0xc7, 0xfc, 0xd0, 0xa2, 0x3e, + 0x8b, 0x40, 0x3e, 0x31, 0xdd, 0x3, 0x9c, 0x86, 0xfb, 0x16}, + {0x94, 0xd9, 0xc, 0xec, 0x6c, 0x55, 0x57, 0x88, 0xba, 0x1d, 0xd0, + 0x5c, 0x6f, 0xdc, 0x72, 0x64, 0x77, 0xb4, 0x42, 0x8f, 0x14, 0x69, + 0x1, 0xaf, 0x54, 0x73, 0x27, 0x85, 0xf6, 0x33, 0xe3, 0xa}, + {0x62, 0x9, 0xb6, 0x33, 0x97, 0x19, 0x8e, 0x28, 0x33, 0xe1, 0xab, + 0xd8, 0xb4, 0x72, 0xfc, 0x24, 0x3e, 0xd0, 0x91, 0x9, 0xed, 0xf7, + 0x11, 0x48, 0x75, 0xd0, 0x70, 0x8f, 0x8b, 0xe3, 0x81, 0x3f}, + }, + { + {0x24, 0xc8, 0x17, 0x5f, 0x35, 0x7f, 0xdb, 0xa, 0xa4, 0x99, 0x42, + 0xd7, 0xc3, 0x23, 0xb9, 0x74, 0xf7, 0xea, 0xf8, 0xcb, 0x8b, 0x3e, + 0x7c, 0xd5, 0x3d, 0xdc, 0xde, 0x4c, 0xd3, 0xe2, 0xd3, 0xa}, + {0xfe, 0xaf, 0xd9, 0x7e, 0xcc, 0xf, 0x91, 0x7f, 0x4b, 0x87, 0x65, + 0x24, 0xa1, 0xb8, 0x5c, 0x54, 0x4, 0x47, 0xc, 0x4b, 0xd2, 0x7e, + 0x39, 0xa8, 0x93, 0x9, 0xf5, 0x4, 0xc1, 0xf, 0x51, 0x50}, + {0x9d, 0x24, 0x6e, 0x33, 0xc5, 0xf, 0xc, 0x6f, 0xd9, 0xcf, 0x31, + 0xc3, 0x19, 0xde, 0x5e, 0x74, 0x1c, 0xfe, 0xee, 0x9, 0x0, 0xfd, + 0xd6, 0xf2, 0xbe, 0x1e, 0xfa, 0xf0, 0x8b, 0x15, 0x7c, 0x12}, + }, + { + {0x74, 0xb9, 0x51, 0xae, 0xc4, 0x8f, 0xa2, 0xde, 0x96, 0xfe, 0x4d, + 0x74, 0xd3, 0x73, 0x99, 0x1d, 0xa8, 0x48, 0x38, 0x87, 0xb, 0x68, + 0x40, 0x62, 0x95, 0xdf, 0x67, 0xd1, 0x79, 0x24, 0xd8, 0x4e}, + {0xa2, 0x79, 0x98, 0x2e, 0x42, 0x7c, 0x19, 0xf6, 0x47, 0x36, 0xca, + 0x52, 0xd4, 0xdd, 0x4a, 0xa4, 0xcb, 0xac, 0x4e, 0x4b, 0xc1, 0x3f, + 0x41, 0x9b, 0x68, 0x4f, 0xef, 0x7, 0x7d, 0xf8, 0x4e, 0x35}, + {0x75, 0xd9, 0xc5, 0x60, 0x22, 0xb5, 0xe3, 0xfe, 0xb8, 0xb0, 0x41, + 0xeb, 0xfc, 0x2e, 0x35, 0x50, 0x3c, 0x65, 0xf6, 0xa9, 0x30, 0xac, + 0x8, 0x88, 0x6d, 0x23, 0x39, 0x5, 0xd2, 0x92, 0x2d, 0x30}, + }, + }, + { + { + {0x77, 0xf1, 0xe0, 0xe4, 0xb6, 0x6f, 0xbc, 0x2d, 0x93, 0x6a, 0xbd, + 0xa4, 0x29, 0xbf, 0xe1, 0x4, 0xe8, 0xf6, 0x7a, 0x78, 0xd4, 0x66, + 0x19, 0x5e, 0x60, 0xd0, 0x26, 0xb4, 0x5e, 0x5f, 0xdc, 0xe}, + {0x3d, 0x28, 0xa4, 0xbc, 0xa2, 0xc1, 0x13, 0x78, 0xd9, 0x3d, 0x86, + 0xa1, 0x91, 0xf0, 0x62, 0xed, 0x86, 0xfa, 0x68, 0xc2, 0xb8, 0xbc, + 0xc7, 0xae, 0x4c, 0xae, 0x1c, 0x6f, 0xb7, 0xd3, 0xe5, 0x10}, + {0x67, 0x8e, 0xda, 0x53, 0xd6, 0xbf, 0x53, 0x54, 0x41, 0xf6, 0xa9, + 0x24, 0xec, 0x1e, 0xdc, 0xe9, 0x23, 0x8a, 0x57, 0x3, 0x3b, 0x26, + 0x87, 0xbf, 0x72, 0xba, 0x1c, 0x36, 0x51, 0x6c, 0xb4, 0x45}, + }, + { + {0xe4, 0xe3, 0x7f, 0x8a, 0xdd, 0x4d, 0x9d, 0xce, 0x30, 0xe, 0x62, + 0x76, 0x56, 0x64, 0x13, 0xab, 0x58, 0x99, 0xe, 0xb3, 0x7b, 0x4f, + 0x59, 0x4b, 0xdf, 0x29, 0x12, 0x32, 0xef, 0xa, 0x1c, 0x5c}, + {0xa1, 0x7f, 0x4f, 0x31, 0xbf, 0x2a, 0x40, 0xa9, 0x50, 0xf4, 0x8c, + 0x8e, 0xdc, 0xf1, 0x57, 0xe2, 0x84, 0xbe, 0xa8, 0x23, 0x4b, 0xd5, + 0xbb, 0x1d, 0x3b, 0x71, 0xcb, 0x6d, 0xa3, 0xbf, 0x77, 0x21}, + {0x8f, 0xdb, 0x79, 0xfa, 0xbc, 0x1b, 0x8, 0x37, 0xb3, 0x59, 0x5f, + 0xc2, 0x1e, 0x81, 0x48, 0x60, 0x87, 0x24, 0x83, 
0x9c, 0x65, 0x76, + 0x7a, 0x8, 0xbb, 0xb5, 0x8a, 0x7d, 0x38, 0x19, 0xe6, 0x4a}, + }, + { + {0x83, 0xfb, 0x5b, 0x98, 0x44, 0x7e, 0x11, 0x61, 0x36, 0x31, 0x96, + 0x71, 0x2a, 0x46, 0xe0, 0xfc, 0x4b, 0x90, 0x25, 0xd4, 0x48, 0x34, + 0xac, 0x83, 0x64, 0x3d, 0xa4, 0x5b, 0xbe, 0x5a, 0x68, 0x75}, + {0x2e, 0xa3, 0x44, 0x53, 0xaa, 0xf6, 0xdb, 0x8d, 0x78, 0x40, 0x1b, + 0xb4, 0xb4, 0xea, 0x88, 0x7d, 0x60, 0xd, 0x13, 0x4a, 0x97, 0xeb, + 0xb0, 0x5e, 0x3, 0x3e, 0xbf, 0x17, 0x1b, 0xd9, 0x0, 0x1a}, + {0xb2, 0xf2, 0x61, 0xeb, 0x33, 0x9, 0x96, 0x6e, 0x52, 0x49, 0xff, + 0xc9, 0xa8, 0xf, 0x3d, 0x54, 0x69, 0x65, 0xf6, 0x7a, 0x10, 0x75, + 0x72, 0xdf, 0xaa, 0xe6, 0xb0, 0x23, 0xb6, 0x29, 0x55, 0x13}, + }, + { + {0xfe, 0x83, 0x2e, 0xe2, 0xbc, 0x16, 0xc7, 0xf5, 0xc1, 0x85, 0x9, + 0xe8, 0x19, 0xeb, 0x2b, 0xb4, 0xae, 0x4a, 0x25, 0x14, 0x37, 0xa6, + 0x9d, 0xec, 0x13, 0xa6, 0x90, 0x15, 0x5, 0xea, 0x72, 0x59}, + {0x18, 0xd5, 0xd1, 0xad, 0xd7, 0xdb, 0xf0, 0x18, 0x11, 0x1f, 0xc1, + 0xcf, 0x88, 0x78, 0x9f, 0x97, 0x9b, 0x75, 0x14, 0x71, 0xf0, 0xe1, + 0x32, 0x87, 0x1, 0x3a, 0xca, 0x65, 0x1a, 0xb8, 0xb5, 0x79}, + {0x11, 0x78, 0x8f, 0xdc, 0x20, 0xac, 0xd4, 0xf, 0xa8, 0x4f, 0x4d, + 0xac, 0x94, 0xd2, 0x9a, 0x9a, 0x34, 0x4, 0x36, 0xb3, 0x64, 0x2d, + 0x1b, 0xc0, 0xdb, 0x3b, 0x5f, 0x90, 0x95, 0x9c, 0x7e, 0x4f}, + }, + { + {0xfe, 0x99, 0x52, 0x35, 0x3d, 0x44, 0xc8, 0x71, 0xd7, 0xea, 0xeb, + 0xdb, 0x1c, 0x3b, 0xcd, 0x8b, 0x66, 0x94, 0xa4, 0xf1, 0x9e, 0x49, + 0x92, 0x80, 0xc8, 0xad, 0x44, 0xa1, 0xc4, 0xee, 0x42, 0x19}, + {0x2e, 0x30, 0x81, 0x57, 0xbc, 0x4b, 0x67, 0x62, 0xf, 0xdc, 0xad, + 0x89, 0x39, 0xf, 0x52, 0xd8, 0xc6, 0xd9, 0xfb, 0x53, 0xae, 0x99, + 0x29, 0x8c, 0x4c, 0x8e, 0x63, 0x2e, 0xd9, 0x3a, 0x99, 0x31}, + {0x92, 0x49, 0x23, 0xae, 0x19, 0x53, 0xac, 0x7d, 0x92, 0x3e, 0xea, + 0xc, 0x91, 0x3d, 0x1b, 0x2c, 0x22, 0x11, 0x3c, 0x25, 0x94, 0xe4, + 0x3c, 0x55, 0x75, 0xca, 0xf9, 0x4e, 0x31, 0x65, 0xa, 0x2a}, + }, + { + {0x3a, 0x79, 0x1c, 0x3c, 0xcd, 0x1a, 0x36, 0xcf, 0x3b, 0xbc, 0x35, + 0x5a, 0xac, 0xbc, 0x9e, 0x2f, 0xab, 0xa6, 0xcd, 0xa8, 0xe9, 0x60, + 0xe8, 0x60, 0x13, 0x1a, 0xea, 0x6d, 0x9b, 0xc3, 0x5d, 0x5}, + {0xc2, 0x27, 0xf9, 0xf7, 0x7f, 0x93, 0xb7, 0x2d, 0x35, 0xa6, 0xd0, + 0x17, 0x6, 0x1f, 0x74, 0xdb, 0x76, 0xaf, 0x55, 0x11, 0xa2, 0xf3, + 0x82, 0x59, 0xed, 0x2d, 0x7c, 0x64, 0x18, 0xe2, 0xf6, 0x4c}, + {0xb6, 0x5b, 0x8d, 0xc2, 0x7c, 0x22, 0x19, 0xb1, 0xab, 0xff, 0x4d, + 0x77, 0xbc, 0x4e, 0xe2, 0x7, 0x89, 0x2c, 0xa3, 0xe4, 0xce, 0x78, + 0x3c, 0xa8, 0xb6, 0x24, 0xaa, 0x10, 0x77, 0x30, 0x1a, 0x12}, + }, + { + {0xc9, 0x83, 0x74, 0xc7, 0x3e, 0x71, 0x59, 0xd6, 0xaf, 0x96, 0x2b, + 0xb8, 0x77, 0xe0, 0xbf, 0x88, 0xd3, 0xbc, 0x97, 0x10, 0x23, 0x28, + 0x9e, 0x28, 0x9b, 0x3a, 0xed, 0x6c, 0x4a, 0xb9, 0x7b, 0x52}, + {0x97, 0x4a, 0x3, 0x9f, 0x5e, 0x5d, 0xdb, 0xe4, 0x2d, 0xbc, 0x34, + 0x30, 0x9, 0xfc, 0x53, 0xe1, 0xb1, 0xd3, 0x51, 0x95, 0x91, 0x46, + 0x5, 0x46, 0x2d, 0xe5, 0x40, 0x7a, 0x6c, 0xc7, 0x3f, 0x33}, + {0x2e, 0x48, 0x5b, 0x99, 0x2a, 0x99, 0x3d, 0x56, 0x1, 0x38, 0x38, + 0x6e, 0x7c, 0xd0, 0x5, 0x34, 0xe5, 0xd8, 0x64, 0x2f, 0xde, 0x35, + 0x50, 0x48, 0xf7, 0xa9, 0xa7, 0x20, 0x9b, 0x6, 0x89, 0x6b}, + }, + { + {0x77, 0xdb, 0xc7, 0xb5, 0x8c, 0xfa, 0x82, 0x40, 0x55, 0xc1, 0x34, + 0xc7, 0xf8, 0x86, 0x86, 0x6, 0x7e, 0xa5, 0xe7, 0xf6, 0xd9, 0xc8, + 0xe6, 0x29, 0xcf, 0x9b, 0x63, 0xa7, 0x8, 0xd3, 0x73, 0x4}, + {0xd, 0x22, 0x70, 0x62, 0x41, 0xa0, 0x2a, 0x81, 0x4e, 0x5b, 0x24, + 0xf9, 0xfa, 0x89, 0x5a, 0x99, 0x5, 0xef, 0x72, 0x50, 0xce, 0xc4, + 0xad, 0xff, 0x73, 0xeb, 0x73, 0xaa, 0x3, 0x21, 0xbc, 0x23}, + {0x5, 0x9e, 0x58, 0x3, 0x26, 0x79, 0xee, 0xca, 
0x92, 0xc4, 0xdc, + 0x46, 0x12, 0x42, 0x4b, 0x2b, 0x4f, 0xa9, 0x1, 0xe6, 0x74, 0xef, + 0xa1, 0x2, 0x1a, 0x34, 0x4, 0xde, 0xbf, 0x73, 0x2f, 0x10}, + }, + }, + { + { + {0x9a, 0x1c, 0x51, 0xb5, 0xe0, 0xda, 0xb4, 0xa2, 0x6, 0xff, 0xff, + 0x2b, 0x29, 0x60, 0xc8, 0x7a, 0x34, 0x42, 0x50, 0xf5, 0x5d, 0x37, + 0x1f, 0x98, 0x2d, 0xa1, 0x4e, 0xda, 0x25, 0xd7, 0x6b, 0x3f}, + {0xc6, 0x45, 0x57, 0x7f, 0xab, 0xb9, 0x18, 0xeb, 0x90, 0xc6, 0x87, + 0x57, 0xee, 0x8a, 0x3a, 0x2, 0xa9, 0xaf, 0xf7, 0x2d, 0xda, 0x12, + 0x27, 0xb7, 0x3d, 0x1, 0x5c, 0xea, 0x25, 0x7d, 0x59, 0x36}, + {0xac, 0x58, 0x60, 0x10, 0x7b, 0x8d, 0x4d, 0x73, 0x5f, 0x90, 0xc6, + 0x6f, 0x9e, 0x57, 0x40, 0xd9, 0x2d, 0x93, 0x2, 0x92, 0xf9, 0xf8, + 0x66, 0x64, 0xd0, 0xd6, 0x60, 0xda, 0x19, 0xcc, 0x7e, 0x7b}, + }, + { + {0x9b, 0xfa, 0x7c, 0xa7, 0x51, 0x4a, 0xae, 0x6d, 0x50, 0x86, 0xa3, + 0xe7, 0x54, 0x36, 0x26, 0x82, 0xdb, 0x82, 0x2d, 0x8f, 0xcd, 0xff, + 0xbb, 0x9, 0xba, 0xca, 0xf5, 0x1b, 0x66, 0xdc, 0xbe, 0x3}, + {0xd, 0x69, 0x5c, 0x69, 0x3c, 0x37, 0xc2, 0x78, 0x6e, 0x90, 0x42, + 0x6, 0x66, 0x2e, 0x25, 0xdd, 0xd2, 0x2b, 0xe1, 0x4a, 0x44, 0x44, + 0x1d, 0x95, 0x56, 0x39, 0x74, 0x1, 0x76, 0xad, 0x35, 0x42}, + {0xf5, 0x75, 0x89, 0x7, 0xd, 0xcb, 0x58, 0x62, 0x98, 0xf2, 0x89, + 0x91, 0x54, 0x42, 0x29, 0x49, 0xe4, 0x6e, 0xe3, 0xe2, 0x23, 0xb4, + 0xca, 0xa0, 0xa1, 0x66, 0xf0, 0xcd, 0xb0, 0xe2, 0x7c, 0xe}, + }, + { + {0xf9, 0x70, 0x4b, 0xd9, 0xdf, 0xfe, 0xa6, 0xfe, 0x2d, 0xba, 0xfc, + 0xc1, 0x51, 0xc0, 0x30, 0xf1, 0x89, 0xab, 0x2f, 0x7f, 0x7e, 0xd4, + 0x82, 0x48, 0xb5, 0xee, 0xec, 0x8a, 0x13, 0x56, 0x52, 0x61}, + {0xa3, 0x85, 0x8c, 0xc4, 0x3a, 0x64, 0x94, 0xc4, 0xad, 0x39, 0x61, + 0x3c, 0xf4, 0x1d, 0x36, 0xfd, 0x48, 0x4d, 0xe9, 0x3a, 0xdd, 0x17, + 0xdb, 0x9, 0x4a, 0x67, 0xb4, 0x8f, 0x5d, 0xa, 0x6e, 0x66}, + {0xd, 0xcb, 0x70, 0x48, 0x4e, 0xf6, 0xbb, 0x2a, 0x6b, 0x8b, 0x45, + 0xaa, 0xf0, 0xbc, 0x65, 0xcd, 0x5d, 0x98, 0xe8, 0x75, 0xba, 0x4e, + 0xbe, 0x9a, 0xe4, 0xde, 0x14, 0xd5, 0x10, 0xc8, 0xb, 0x7f}, + }, + { + {0xa0, 0x13, 0x72, 0x73, 0xad, 0x9d, 0xac, 0x83, 0x98, 0x2e, 0xf7, + 0x2e, 0xba, 0xf8, 0xf6, 0x9f, 0x57, 0x69, 0xec, 0x43, 0xdd, 0x2e, + 0x1e, 0x31, 0x75, 0xab, 0xc5, 0xde, 0x7d, 0x90, 0x3a, 0x1d}, + {0x6f, 0x13, 0xf4, 0x26, 0xa4, 0x6b, 0x0, 0xb9, 0x35, 0x30, 0xe0, + 0x57, 0x9e, 0x36, 0x67, 0x8d, 0x28, 0x3c, 0x46, 0x4f, 0xd9, 0xdf, + 0xc8, 0xcb, 0xf5, 0xdb, 0xee, 0xf8, 0xbc, 0x8d, 0x1f, 0xd}, + {0xdc, 0x81, 0xd0, 0x3e, 0x31, 0x93, 0x16, 0xba, 0x80, 0x34, 0x1b, + 0x85, 0xad, 0x9f, 0x32, 0x29, 0xcb, 0x21, 0x3, 0x3, 0x3c, 0x1, + 0x28, 0x1, 0xe3, 0xfd, 0x1b, 0xa3, 0x44, 0x1b, 0x1, 0x0}, + }, + { + {0x5c, 0xa7, 0xa, 0x6a, 0x69, 0x1f, 0x56, 0x16, 0x6a, 0xbd, 0x52, + 0x58, 0x5c, 0x72, 0xbf, 0xc1, 0xad, 0x66, 0x79, 0x9a, 0x7f, 0xdd, + 0xa8, 0x11, 0x26, 0x10, 0x85, 0xd2, 0xa2, 0x88, 0xd9, 0x63}, + {0xc, 0x6c, 0xc6, 0x3f, 0x6c, 0xa0, 0xdf, 0x3f, 0xd2, 0xd, 0xd6, + 0x4d, 0x8e, 0xe3, 0x40, 0x5d, 0x71, 0x4d, 0x8e, 0x26, 0x38, 0x8b, + 0xe3, 0x7a, 0xe1, 0x57, 0x83, 0x6e, 0x91, 0x8d, 0xc4, 0x3a}, + {0x2e, 0x23, 0xbd, 0xaf, 0x53, 0x7, 0x12, 0x0, 0x83, 0xf6, 0xd8, + 0xfd, 0xb8, 0xce, 0x2b, 0xe9, 0x91, 0x2b, 0xe7, 0x84, 0xb3, 0x69, + 0x16, 0xf8, 0x66, 0xa0, 0x68, 0x23, 0x2b, 0xd5, 0xfa, 0x33}, + }, + { + {0xe8, 0xcf, 0x22, 0xc4, 0xd0, 0xc8, 0x2c, 0x8d, 0xcb, 0x3a, 0xa1, + 0x5, 0x7b, 0x4f, 0x2b, 0x7, 0x6f, 0xa5, 0xf6, 0xec, 0xe6, 0xb6, + 0xfe, 0xa3, 0xe2, 0x71, 0xa, 0xb9, 0xcc, 0x55, 0xc3, 0x3c}, + {0x16, 0x1e, 0xe4, 0xc5, 0xc6, 0x49, 0x6, 0x54, 0x35, 0x77, 0x3f, + 0x33, 0x30, 0x64, 0xf8, 0xa, 0x46, 0xe7, 0x5, 0xf3, 0xd2, 0xfc, + 0xac, 0xb2, 0xa7, 0xdc, 0x56, 0xa2, 0x29, 
0xf4, 0xc0, 0x16}, + {0x31, 0x91, 0x3e, 0x90, 0x43, 0x94, 0xb6, 0xe9, 0xce, 0x37, 0x56, + 0x7a, 0xcb, 0x94, 0xa4, 0xb8, 0x44, 0x92, 0xba, 0xba, 0xa4, 0xd1, + 0x7c, 0xc8, 0x68, 0x75, 0xae, 0x6b, 0x42, 0xaf, 0x1e, 0x63}, + }, + { + {0xe8, 0xd, 0x70, 0xa3, 0xb9, 0x75, 0xd9, 0x47, 0x52, 0x5, 0xf8, + 0xe2, 0xfb, 0xc5, 0x80, 0x72, 0xe1, 0x5d, 0xe4, 0x32, 0x27, 0x8f, + 0x65, 0x53, 0xb5, 0x80, 0x5f, 0x66, 0x7f, 0x2c, 0x1f, 0x43}, + {0x9f, 0xfe, 0x66, 0xda, 0x10, 0x4, 0xe9, 0xb3, 0xa6, 0xe5, 0x16, + 0x6c, 0x52, 0x4b, 0xdd, 0x85, 0x83, 0xbf, 0xf9, 0x1e, 0x61, 0x97, + 0x3d, 0xbc, 0xb5, 0x19, 0xa9, 0x1e, 0x8b, 0x64, 0x99, 0x55}, + {0x19, 0x7b, 0x8f, 0x85, 0x44, 0x63, 0x2, 0xd6, 0x4a, 0x51, 0xea, + 0xa1, 0x2f, 0x35, 0xab, 0x14, 0xd7, 0xa9, 0x90, 0x20, 0x1a, 0x44, + 0x0, 0x89, 0x26, 0x3b, 0x25, 0x91, 0x5f, 0x71, 0x4, 0x7b}, + }, + { + {0xc6, 0xba, 0xe6, 0xc4, 0x80, 0xc2, 0x76, 0xb3, 0xb, 0x9b, 0x1d, + 0x6d, 0xdd, 0xd3, 0xe, 0x97, 0x44, 0xf9, 0xb, 0x45, 0x58, 0x95, + 0x9a, 0xb0, 0x23, 0xe2, 0xcd, 0x57, 0xfa, 0xac, 0xd0, 0x48}, + {0x43, 0xae, 0xf6, 0xac, 0x28, 0xbd, 0xed, 0x83, 0xb4, 0x7a, 0x5c, + 0x7d, 0x8b, 0x7c, 0x35, 0x86, 0x44, 0x2c, 0xeb, 0xb7, 0x69, 0x47, + 0x40, 0xc0, 0x3f, 0x58, 0xf6, 0xc2, 0xf5, 0x7b, 0xb3, 0x59}, + {0x71, 0xe6, 0xab, 0x7d, 0xe4, 0x26, 0xf, 0xb6, 0x37, 0x3a, 0x2f, + 0x62, 0x97, 0xa1, 0xd1, 0xf1, 0x94, 0x3, 0x96, 0xe9, 0x7e, 0xce, + 0x8, 0x42, 0xdb, 0x3b, 0x6d, 0x33, 0x91, 0x41, 0x23, 0x16}, + }, + }, + { + { + {0x40, 0x86, 0xf3, 0x1f, 0xd6, 0x9c, 0x49, 0xdd, 0xa0, 0x25, 0x36, + 0x6, 0xc3, 0x9b, 0xcd, 0x29, 0xc3, 0x3d, 0xd7, 0x3d, 0x2, 0xd8, + 0xe2, 0x51, 0x31, 0x92, 0x3b, 0x20, 0x7a, 0x70, 0x25, 0x4a}, + {0xf6, 0x7f, 0x26, 0xf6, 0xde, 0x99, 0xe4, 0xb9, 0x43, 0x8, 0x2c, + 0x74, 0x7b, 0xca, 0x72, 0x77, 0xb1, 0xf2, 0xa4, 0xe9, 0x3f, 0x15, + 0xa0, 0x23, 0x6, 0x50, 0xd0, 0xd5, 0xec, 0xdf, 0xdf, 0x2c}, + {0x6a, 0xed, 0xf6, 0x53, 0x8a, 0x66, 0xb7, 0x2a, 0xa1, 0x70, 0xd1, + 0x1d, 0x58, 0x42, 0x42, 0x30, 0x61, 0x1, 0xe2, 0x3a, 0x4c, 0x14, + 0x0, 0x40, 0xfc, 0x49, 0x8e, 0x24, 0x6d, 0x89, 0x21, 0x57}, + }, + { + {0x4e, 0xda, 0xd0, 0xa1, 0x91, 0x50, 0x5d, 0x28, 0x8, 0x3e, 0xfe, + 0xb5, 0xa7, 0x6f, 0xaa, 0x4b, 0xb3, 0x93, 0x93, 0xe1, 0x7c, 0x17, + 0xe5, 0x63, 0xfd, 0x30, 0xb0, 0xc4, 0xaf, 0x35, 0xc9, 0x3}, + {0xae, 0x1b, 0x18, 0xfd, 0x17, 0x55, 0x6e, 0xb, 0xb4, 0x63, 0xb9, + 0x2b, 0x9f, 0x62, 0x22, 0x90, 0x25, 0x46, 0x6, 0x32, 0xe9, 0xbc, + 0x9, 0x55, 0xda, 0x13, 0x3c, 0xf6, 0x74, 0xdd, 0x8e, 0x57}, + {0x3d, 0xc, 0x2b, 0x49, 0xc6, 0x76, 0x72, 0x99, 0xfc, 0x5, 0xe2, + 0xdf, 0xc4, 0xc2, 0xcc, 0x47, 0x3c, 0x3a, 0x62, 0xdd, 0x84, 0x9b, + 0xd2, 0xdc, 0xa2, 0xc7, 0x88, 0x2, 0x59, 0xab, 0xc2, 0x3e}, + }, + { + {0xcb, 0xd1, 0x32, 0xae, 0x9, 0x3a, 0x21, 0xa7, 0xd5, 0xc2, 0xf5, + 0x40, 0xdf, 0x87, 0x2b, 0xf, 0x29, 0xab, 0x1e, 0xe8, 0xc6, 0xa4, + 0xae, 0xb, 0x5e, 0xac, 0xdb, 0x6a, 0x6c, 0xf6, 0x1b, 0xe}, + {0xb9, 0x7b, 0xd8, 0xe4, 0x7b, 0xd2, 0xa0, 0xa1, 0xed, 0x1a, 0x39, + 0x61, 0xeb, 0x4d, 0x8b, 0xa9, 0x83, 0x9b, 0xcb, 0x73, 0xd0, 0xdd, + 0xa0, 0x99, 0xce, 0xca, 0xf, 0x20, 0x5a, 0xc2, 0xd5, 0x2d}, + {0x7e, 0x88, 0x2c, 0x79, 0xe9, 0xd5, 0xab, 0xe2, 0x5d, 0x6d, 0x92, + 0xcb, 0x18, 0x0, 0x2, 0x1a, 0x1e, 0x5f, 0xae, 0xba, 0xcd, 0x69, + 0xba, 0xbf, 0x5f, 0x8f, 0xe8, 0x5a, 0xb3, 0x48, 0x5, 0x73}, + }, + { + {0x34, 0xe3, 0xd6, 0xa1, 0x4b, 0x9, 0x5b, 0x80, 0x19, 0x3f, 0x35, + 0x9, 0x77, 0xf1, 0x3e, 0xbf, 0x2b, 0x70, 0x22, 0x6, 0xcb, 0x6, + 0x3f, 0x42, 0xdd, 0x45, 0x78, 0xd8, 0x77, 0x22, 0x5a, 0x58}, + {0xee, 0xb8, 0xa8, 0xcb, 0xa3, 0x51, 0x35, 0xc4, 0x16, 0x5f, 0x11, + 0xb2, 0x1d, 0x6f, 0xa2, 0x65, 0x50, 
0x38, 0x8c, 0xab, 0x52, 0x4f, + 0xf, 0x76, 0xca, 0xb8, 0x1d, 0x41, 0x3b, 0x44, 0x43, 0x30}, + {0x62, 0x89, 0xd4, 0x33, 0x82, 0x5f, 0x8a, 0xa1, 0x7f, 0x25, 0x78, + 0xec, 0xb5, 0xc4, 0x98, 0x66, 0xff, 0x41, 0x3e, 0x37, 0xa5, 0x6f, + 0x8e, 0xa7, 0x1f, 0x98, 0xef, 0x50, 0x89, 0x27, 0x56, 0x76}, + }, + { + {0x9d, 0xcf, 0x86, 0xea, 0xa3, 0x73, 0x70, 0xe1, 0xdc, 0x5f, 0x15, + 0x7, 0xb7, 0xfb, 0x8c, 0x3a, 0x8e, 0x8a, 0x83, 0x31, 0xfc, 0xe7, + 0x53, 0x48, 0x16, 0xf6, 0x13, 0xb6, 0x84, 0xf4, 0xbb, 0x28}, + {0xc0, 0xc8, 0x1f, 0xd5, 0x59, 0xcf, 0xc3, 0x38, 0xf2, 0xb6, 0x6, + 0x5, 0xfd, 0xd2, 0xed, 0x9b, 0x8f, 0xe, 0x57, 0xab, 0x9f, 0x10, + 0xbf, 0x26, 0xa6, 0x46, 0xb8, 0xc1, 0xa8, 0x60, 0x41, 0x3f}, + {0x7c, 0x6c, 0x13, 0x6f, 0x5c, 0x2f, 0x61, 0xf2, 0xbe, 0x11, 0xdd, + 0xf6, 0x7, 0xd1, 0xea, 0xaf, 0x33, 0x6f, 0xde, 0x13, 0xd2, 0x9a, + 0x7e, 0x52, 0x5d, 0xf7, 0x88, 0x81, 0x35, 0xcb, 0x79, 0x1e}, + }, + { + {0x81, 0x81, 0xe0, 0xf5, 0xd8, 0x53, 0xe9, 0x77, 0xd9, 0xde, 0x9d, + 0x29, 0x44, 0xc, 0xa5, 0x84, 0xe5, 0x25, 0x45, 0x86, 0xc, 0x2d, + 0x6c, 0xdc, 0xf4, 0xf2, 0xd1, 0x39, 0x2d, 0xb5, 0x8a, 0x47}, + {0xf1, 0xe3, 0xf7, 0xee, 0xc3, 0x36, 0x34, 0x1, 0xf8, 0x10, 0x9e, + 0xfe, 0x7f, 0x6a, 0x8b, 0x82, 0xfc, 0xde, 0xf9, 0xbc, 0xe5, 0x8, + 0xf9, 0x7f, 0x31, 0x38, 0x3b, 0x3a, 0x1b, 0x95, 0xd7, 0x65}, + {0x59, 0xd1, 0x52, 0x92, 0xd3, 0xa4, 0xa6, 0x66, 0x7, 0xc8, 0x1a, + 0x87, 0xbc, 0xe1, 0xdd, 0xe5, 0x6f, 0xc9, 0xc1, 0xa6, 0x40, 0x6b, + 0x2c, 0xb8, 0x14, 0x22, 0x21, 0x1a, 0x41, 0x7a, 0xd8, 0x16}, + }, + { + {0x83, 0x5, 0x4e, 0xd5, 0xe2, 0xd5, 0xa4, 0xfb, 0xfa, 0x99, 0xbd, + 0x2e, 0xd7, 0xaf, 0x1f, 0xe2, 0x8f, 0x77, 0xe9, 0x6e, 0x73, 0xc2, + 0x7a, 0x49, 0xde, 0x6d, 0x5a, 0x7a, 0x57, 0xb, 0x99, 0x1f}, + {0x15, 0x62, 0x6, 0x42, 0x5a, 0x7e, 0xbd, 0xb3, 0xc1, 0x24, 0x5a, + 0xc, 0xcd, 0xe3, 0x9b, 0x87, 0xb7, 0x94, 0xf9, 0xd6, 0xb1, 0x5d, + 0xc0, 0x57, 0xa6, 0x8c, 0xf3, 0x65, 0x81, 0x7c, 0xf8, 0x28}, + {0xd6, 0xf7, 0xe8, 0x1b, 0xad, 0x4e, 0x34, 0xa3, 0x8f, 0x79, 0xea, + 0xac, 0xeb, 0x50, 0x1e, 0x7d, 0x52, 0xe0, 0xd, 0x52, 0x9e, 0x56, + 0xc6, 0x77, 0x3e, 0x6d, 0x4d, 0x53, 0xe1, 0x2f, 0x88, 0x45}, + }, + { + {0xe4, 0x6f, 0x3c, 0x94, 0x29, 0x99, 0xac, 0xd8, 0xa2, 0x92, 0x83, + 0xa3, 0x61, 0xf1, 0xf9, 0xb5, 0xf3, 0x9a, 0xc8, 0xbe, 0x13, 0xdb, + 0x99, 0x26, 0x74, 0xf0, 0x5, 0xe4, 0x3c, 0x84, 0xcf, 0x7d}, + {0xd6, 0x83, 0x79, 0x75, 0x5d, 0x34, 0x69, 0x66, 0xa6, 0x11, 0xaa, + 0x17, 0x11, 0xed, 0xb6, 0x62, 0x8f, 0x12, 0x5e, 0x98, 0x57, 0x18, + 0xdd, 0x7d, 0xdd, 0xf6, 0x26, 0xf6, 0xb8, 0xe5, 0x8f, 0x68}, + {0xc0, 0x32, 0x47, 0x4a, 0x48, 0xd6, 0x90, 0x6c, 0x99, 0x32, 0x56, + 0xca, 0xfd, 0x43, 0x21, 0xd5, 0xe1, 0xc6, 0x5d, 0x91, 0xc3, 0x28, + 0xbe, 0xb3, 0x1b, 0x19, 0x27, 0x73, 0x7e, 0x68, 0x39, 0x67}, + }, + }, + { + { + {0xc0, 0x1a, 0xc, 0xc8, 0x9d, 0xcc, 0x6d, 0xa6, 0x36, 0xa4, 0x38, + 0x1b, 0xf4, 0x5c, 0xa0, 0x97, 0xc6, 0xd7, 0xdb, 0x95, 0xbe, 0xf3, + 0xeb, 0xa7, 0xab, 0x7d, 0x7e, 0x8d, 0xf6, 0xb8, 0xa0, 0x7d}, + {0xa6, 0x75, 0x56, 0x38, 0x14, 0x20, 0x78, 0xef, 0xe8, 0xa9, 0xfd, + 0xaa, 0x30, 0x9f, 0x64, 0xa2, 0xcb, 0xa8, 0xdf, 0x5c, 0x50, 0xeb, + 0xd1, 0x4c, 0xb3, 0xc0, 0x4d, 0x1d, 0xba, 0x5a, 0x11, 0x46}, + {0x76, 0xda, 0xb5, 0xc3, 0x53, 0x19, 0xf, 0xd4, 0x9b, 0x9e, 0x11, + 0x21, 0x73, 0x6f, 0xac, 0x1d, 0x60, 0x59, 0xb2, 0xfe, 0x21, 0x60, + 0xcc, 0x3, 0x4b, 0x4b, 0x67, 0x83, 0x7e, 0x88, 0x5f, 0x5a}, + }, + { + {0xb9, 0x43, 0xa6, 0xa0, 0xd3, 0x28, 0x96, 0x9e, 0x64, 0x20, 0xc3, + 0xe6, 0x0, 0xcb, 0xc3, 0xb5, 0x32, 0xec, 0x2d, 0x7c, 0x89, 0x2, + 0x53, 0x9b, 0xc, 0xc7, 0xd1, 0xd5, 0xe2, 0x7a, 0xe3, 0x43}, + {0x11, 0x3d, 0xa1, 
0x70, 0xcf, 0x1, 0x63, 0x8f, 0xc4, 0xd0, 0xd, + 0x35, 0x15, 0xb8, 0xce, 0xcf, 0x7e, 0xa4, 0xbc, 0xa4, 0xd4, 0x97, + 0x2, 0xf7, 0x34, 0x14, 0x4d, 0xe4, 0x56, 0xb6, 0x69, 0x36}, + {0x33, 0xe1, 0xa6, 0xed, 0x6, 0x3f, 0x7e, 0x38, 0xc0, 0x3a, 0xa1, + 0x99, 0x51, 0x1d, 0x30, 0x67, 0x11, 0x38, 0x26, 0x36, 0xf8, 0xd8, + 0x5a, 0xbd, 0xbe, 0xe9, 0xd5, 0x4f, 0xcd, 0xe6, 0x21, 0x6a}, + }, + { + {0xe3, 0xb2, 0x99, 0x66, 0x12, 0x29, 0x41, 0xef, 0x1, 0x13, 0x8d, + 0x70, 0x47, 0x8, 0xd3, 0x71, 0xbd, 0xb0, 0x82, 0x11, 0xd0, 0x32, + 0x54, 0x32, 0x36, 0x8b, 0x1e, 0x0, 0x7, 0x1b, 0x37, 0x45}, + {0x5f, 0xe6, 0x46, 0x30, 0xa, 0x17, 0xc6, 0xf1, 0x24, 0x35, 0xd2, + 0x0, 0x2a, 0x2a, 0x71, 0x58, 0x55, 0xb7, 0x82, 0x8c, 0x3c, 0xbd, + 0xdb, 0x69, 0x57, 0xff, 0x95, 0xa1, 0xf1, 0xf9, 0x6b, 0x58}, + {0xb, 0x79, 0xf8, 0x5e, 0x8d, 0x8, 0xdb, 0xa6, 0xe5, 0x37, 0x9, + 0x61, 0xdc, 0xf0, 0x78, 0x52, 0xb8, 0x6e, 0xa1, 0x61, 0xd2, 0x49, + 0x3, 0xac, 0x79, 0x21, 0xe5, 0x90, 0x37, 0xb0, 0xaf, 0xe}, + }, + { + {0x1d, 0xae, 0x75, 0xf, 0x5e, 0x80, 0x40, 0x51, 0x30, 0xcc, 0x62, + 0x26, 0xe3, 0xfb, 0x2, 0xec, 0x6d, 0x39, 0x92, 0xea, 0x1e, 0xdf, + 0xeb, 0x2c, 0xb3, 0x5b, 0x43, 0xc5, 0x44, 0x33, 0xae, 0x44}, + {0x2f, 0x4, 0x48, 0x37, 0xc1, 0x55, 0x5, 0x96, 0x11, 0xaa, 0xb, + 0x82, 0xe6, 0x41, 0x9a, 0x21, 0xc, 0x6d, 0x48, 0x73, 0x38, 0xf7, + 0x81, 0x1c, 0x61, 0xc6, 0x2, 0x5a, 0x67, 0xcc, 0x9a, 0x30}, + {0xee, 0x43, 0xa5, 0xbb, 0xb9, 0x89, 0xf2, 0x9c, 0x42, 0x71, 0xc9, + 0x5a, 0x9d, 0xe, 0x76, 0xf3, 0xaa, 0x60, 0x93, 0x4f, 0xc6, 0xe5, + 0x82, 0x1d, 0x8f, 0x67, 0x94, 0x7f, 0x1b, 0x22, 0xd5, 0x62}, + }, + { + {0x3c, 0x7a, 0xf7, 0x3a, 0x26, 0xd4, 0x85, 0x75, 0x4d, 0x14, 0xe9, + 0xfe, 0x11, 0x7b, 0xae, 0xdf, 0x3d, 0x19, 0xf7, 0x59, 0x80, 0x70, + 0x6, 0xa5, 0x37, 0x20, 0x92, 0x83, 0x53, 0x9a, 0xf2, 0x14}, + {0x6d, 0x93, 0xd0, 0x18, 0x9c, 0x29, 0x4c, 0x52, 0xc, 0x1a, 0xc, + 0x8a, 0x6c, 0xb5, 0x6b, 0xc8, 0x31, 0x86, 0x4a, 0xdb, 0x2e, 0x5, + 0x75, 0xa3, 0x62, 0x45, 0x75, 0xbc, 0xe4, 0xfd, 0xe, 0x5c}, + {0xf5, 0xd7, 0xb2, 0x25, 0xdc, 0x7e, 0x71, 0xdf, 0x40, 0x30, 0xb5, + 0x99, 0xdb, 0x70, 0xf9, 0x21, 0x62, 0x4c, 0xed, 0xc3, 0xb7, 0x34, + 0x92, 0xda, 0x3e, 0x9, 0xee, 0x7b, 0x5c, 0x36, 0x72, 0x5e}, + }, + { + {0x3e, 0xb3, 0x8, 0x2f, 0x6, 0x39, 0x93, 0x7d, 0xbe, 0x32, 0x9f, + 0xdf, 0xe5, 0x59, 0x96, 0x5b, 0xfd, 0xbd, 0x9e, 0x1f, 0xad, 0x3d, + 0xff, 0xac, 0xb7, 0x49, 0x73, 0xcb, 0x55, 0x5, 0xb2, 0x70}, + {0x7f, 0x21, 0x71, 0x45, 0x7, 0xfc, 0x5b, 0x57, 0x5b, 0xd9, 0x94, + 0x6, 0x5d, 0x67, 0x79, 0x37, 0x33, 0x1e, 0x19, 0xf4, 0xbb, 0x37, + 0xa, 0x9a, 0xbc, 0xea, 0xb4, 0x47, 0x4c, 0x10, 0xf1, 0x77}, + {0x4c, 0x2c, 0x11, 0x55, 0xc5, 0x13, 0x51, 0xbe, 0xcd, 0x1f, 0x88, + 0x9a, 0x3a, 0x42, 0x88, 0x66, 0x47, 0x3b, 0x50, 0x5e, 0x85, 0x77, + 0x66, 0x44, 0x4a, 0x40, 0x6, 0x4a, 0x8f, 0x39, 0x34, 0xe}, + }, + { + {0x28, 0x19, 0x4b, 0x3e, 0x9, 0xb, 0x93, 0x18, 0x40, 0xf6, 0xf3, + 0x73, 0xe, 0xe1, 0xe3, 0x7d, 0x6f, 0x5d, 0x39, 0x73, 0xda, 0x17, + 0x32, 0xf4, 0x3e, 0x9c, 0x37, 0xca, 0xd6, 0xde, 0x8a, 0x6f}, + {0xe8, 0xbd, 0xce, 0x3e, 0xd9, 0x22, 0x7d, 0xb6, 0x7, 0x2f, 0x82, + 0x27, 0x41, 0xe8, 0xb3, 0x9, 0x8d, 0x6d, 0x5b, 0xb0, 0x1f, 0xa6, + 0x3f, 0x74, 0x72, 0x23, 0x36, 0x8a, 0x36, 0x5, 0x54, 0x5e}, + {0x9a, 0xb2, 0xb7, 0xfd, 0x3d, 0x12, 0x40, 0xe3, 0x91, 0xb2, 0x1a, + 0xa2, 0xe1, 0x97, 0x7b, 0x48, 0x9e, 0x94, 0xe6, 0xfd, 0x2, 0x7d, + 0x96, 0xf9, 0x97, 0xde, 0xd3, 0xc8, 0x2e, 0xe7, 0xd, 0x78}, + }, + { + {0x72, 0x27, 0xf4, 0x0, 0xf3, 0xea, 0x1f, 0x67, 0xaa, 0x41, 0x8c, + 0x2a, 0x2a, 0xeb, 0x72, 0x8f, 0x92, 0x32, 0x37, 0x97, 0xd7, 0x7f, + 0xa1, 0x29, 0xa6, 0x87, 
0xb5, 0x32, 0xad, 0xc6, 0xef, 0x1d}, + {0xbc, 0xe7, 0x9a, 0x8, 0x45, 0x85, 0xe2, 0xa, 0x6, 0x4d, 0x7f, + 0x1c, 0xcf, 0xde, 0x8d, 0x38, 0xb8, 0x11, 0x48, 0xa, 0x51, 0x15, + 0xac, 0x38, 0xe4, 0x8c, 0x92, 0x71, 0xf6, 0x8b, 0xb2, 0xe}, + {0xa7, 0x95, 0x51, 0xef, 0x1a, 0xbe, 0x5b, 0xaf, 0xed, 0x15, 0x7b, + 0x91, 0x77, 0x12, 0x8c, 0x14, 0x2e, 0xda, 0xe5, 0x7a, 0xfb, 0xf7, + 0x91, 0x29, 0x67, 0x28, 0xdd, 0xf8, 0x1b, 0x20, 0x7d, 0x46}, + }, + }, + { + { + {0xa9, 0xe7, 0x7a, 0x56, 0xbd, 0xf4, 0x1e, 0xbc, 0xbd, 0x98, 0x44, + 0xd6, 0xb2, 0x4c, 0x62, 0x3f, 0xc8, 0x4e, 0x1f, 0x2c, 0xd2, 0x64, + 0x10, 0xe4, 0x1, 0x40, 0x38, 0xba, 0xa5, 0xc5, 0xf9, 0x2e}, + {0xad, 0x4f, 0xef, 0x74, 0x9a, 0x91, 0xfe, 0x95, 0xa2, 0x8, 0xa3, + 0xf6, 0xec, 0x7b, 0x82, 0x3a, 0x1, 0x7b, 0xa4, 0x9, 0xd3, 0x1, + 0x4e, 0x96, 0x97, 0xc7, 0xa3, 0x5b, 0x4f, 0x3c, 0xc4, 0x71}, + {0xcd, 0x74, 0x9e, 0xfa, 0xf6, 0x6d, 0xfd, 0xb6, 0x7a, 0x26, 0xaf, + 0xe4, 0xbc, 0x78, 0x82, 0xf1, 0xe, 0x99, 0xef, 0xf1, 0xd0, 0xb3, + 0x55, 0x82, 0x93, 0xf2, 0xc5, 0x90, 0xa3, 0x8c, 0x75, 0x5a}, + }, + { + {0x94, 0xdc, 0x61, 0x1d, 0x8b, 0x91, 0xe0, 0x8c, 0x66, 0x30, 0x81, + 0x9a, 0x46, 0x36, 0xed, 0x8d, 0xd3, 0xaa, 0xe8, 0xaf, 0x29, 0xa8, + 0xe6, 0xd4, 0x3f, 0xd4, 0x39, 0xf6, 0x27, 0x80, 0x73, 0xa}, + {0x95, 0x24, 0x46, 0xd9, 0x10, 0x27, 0xb7, 0xa2, 0x3, 0x50, 0x7d, + 0xd5, 0xd2, 0xc6, 0xa8, 0x3a, 0xca, 0x87, 0xb4, 0xa0, 0xbf, 0x0, + 0xd4, 0xe3, 0xec, 0x72, 0xeb, 0xb3, 0x44, 0xe2, 0xba, 0x2d}, + {0xcc, 0xe1, 0xff, 0x57, 0x2f, 0x4a, 0xf, 0x98, 0x43, 0x98, 0x83, + 0xe1, 0xd, 0xd, 0x67, 0x0, 0xfd, 0x15, 0xfb, 0x49, 0x4a, 0x3f, + 0x5c, 0x10, 0x9c, 0xa6, 0x26, 0x51, 0x63, 0xca, 0x98, 0x26}, + }, + { + {0xe, 0xd9, 0x3d, 0x5e, 0x2f, 0x70, 0x3d, 0x2e, 0x86, 0x53, 0xd2, + 0xe4, 0x18, 0x9, 0x3f, 0x9e, 0x6a, 0xa9, 0x4d, 0x2, 0xf6, 0x3e, + 0x77, 0x5e, 0x32, 0x33, 0xfa, 0x4a, 0xc, 0x4b, 0x0, 0x3c}, + {0x78, 0xba, 0xb0, 0x32, 0x88, 0x31, 0x65, 0xe7, 0x8b, 0xff, 0x5c, + 0x92, 0xf7, 0x31, 0x18, 0x38, 0xcc, 0x1f, 0x29, 0xa0, 0x91, 0x1b, + 0xa8, 0x8, 0x7, 0xeb, 0xca, 0x49, 0xcc, 0x3d, 0xb4, 0x1f}, + {0x2b, 0xb8, 0xf4, 0x6, 0xac, 0x46, 0xa9, 0x9a, 0xf3, 0xc4, 0x6, + 0xa8, 0xa5, 0x84, 0xa2, 0x1c, 0x87, 0x47, 0xcd, 0xc6, 0x5f, 0x26, + 0xd3, 0x3e, 0x17, 0xd2, 0x1f, 0xcd, 0x1, 0xfd, 0x43, 0x6b}, + }, + { + {0xf3, 0xe, 0x76, 0x3e, 0x58, 0x42, 0xc7, 0xb5, 0x90, 0xb9, 0xa, + 0xee, 0xb9, 0x52, 0xdc, 0x75, 0x3f, 0x92, 0x2b, 0x7, 0xc2, 0x27, + 0x14, 0xbf, 0xf0, 0xd9, 0xf0, 0x6f, 0x2d, 0xb, 0x42, 0x73}, + {0x44, 0xc5, 0x97, 0x46, 0x4b, 0x5d, 0xa7, 0xc7, 0xbf, 0xff, 0xf, + 0xdf, 0x48, 0xf8, 0xfd, 0x15, 0x5a, 0x78, 0x46, 0xaa, 0xeb, 0xb9, + 0x68, 0x28, 0x14, 0xf7, 0x52, 0x5b, 0x10, 0xd7, 0x68, 0x5a}, + {0x6, 0x1e, 0x85, 0x9e, 0xcb, 0xf6, 0x2c, 0xaf, 0xc4, 0x38, 0x22, + 0xc6, 0x13, 0x39, 0x59, 0x8f, 0x73, 0xf3, 0xfb, 0x99, 0x96, 0xb8, + 0x8a, 0xda, 0x9e, 0xbc, 0x34, 0xea, 0x2f, 0x63, 0xb5, 0x3d}, + }, + { + {0xd5, 0x25, 0x98, 0x82, 0xb1, 0x90, 0x49, 0x2e, 0x91, 0x89, 0x9a, + 0x3e, 0x87, 0xeb, 0xea, 0xed, 0xf8, 0x4a, 0x70, 0x4c, 0x39, 0x3d, + 0xf0, 0xee, 0xe, 0x2b, 0xdf, 0x95, 0xa4, 0x7e, 0x19, 0x59}, + {0xd8, 0xd9, 0x5d, 0xf7, 0x2b, 0xee, 0x6e, 0xf4, 0xa5, 0x59, 0x67, + 0x39, 0xf6, 0xb1, 0x17, 0xd, 0x73, 0x72, 0x9e, 0x49, 0x31, 0xd1, + 0xf2, 0x1b, 0x13, 0x5f, 0xd7, 0x49, 0xdf, 0x1a, 0x32, 0x4}, + {0xae, 0x5a, 0xe5, 0xe4, 0x19, 0x60, 0xe1, 0x4, 0xe9, 0x92, 0x2f, + 0x7e, 0x7a, 0x43, 0x7b, 0xe7, 0xa4, 0x9a, 0x15, 0x6f, 0xc1, 0x2d, + 0xce, 0xc7, 0xc0, 0xc, 0xd7, 0xf4, 0xc1, 0xfd, 0xea, 0x45}, + }, + { + {0xed, 0xb1, 0xcc, 0xcf, 0x24, 0x46, 0xe, 0xb6, 0x95, 0x3, 0x5c, + 0xbd, 0x92, 0xc2, 0xdb, 
0x59, 0xc9, 0x81, 0x4, 0xdc, 0x1d, 0x9d, + 0xa0, 0x31, 0x40, 0xd9, 0x56, 0x5d, 0xea, 0xce, 0x73, 0x3f}, + {0x2b, 0xd7, 0x45, 0x80, 0x85, 0x1, 0x84, 0x69, 0x51, 0x6, 0x2f, + 0xcf, 0xa2, 0xfa, 0x22, 0x4c, 0xc6, 0x2d, 0x22, 0x6b, 0x65, 0x36, + 0x1a, 0x94, 0xde, 0xda, 0x62, 0x3, 0xc8, 0xeb, 0x5e, 0x5a}, + {0xc6, 0x8d, 0x4e, 0xa, 0xd1, 0xbf, 0xa7, 0xb7, 0x39, 0xb3, 0xc9, + 0x44, 0x7e, 0x0, 0x57, 0xbe, 0xfa, 0xae, 0x57, 0x15, 0x7f, 0x20, + 0xc1, 0x60, 0xdb, 0x18, 0x62, 0x26, 0x91, 0x88, 0x5, 0x26}, + }, + { + {0x42, 0xe5, 0x76, 0xc6, 0x3c, 0x8e, 0x81, 0x4c, 0xad, 0xcc, 0xce, + 0x3, 0x93, 0x2c, 0x42, 0x5e, 0x8, 0x9f, 0x12, 0xb4, 0xca, 0xcc, + 0x7, 0xec, 0xb8, 0x43, 0x44, 0xb2, 0x10, 0xfa, 0xed, 0xd}, + {0x4, 0xff, 0x60, 0x83, 0xa6, 0x4, 0xf7, 0x59, 0xf4, 0xe6, 0x61, + 0x76, 0xde, 0x3f, 0xd9, 0xc3, 0x51, 0x35, 0x87, 0x12, 0x73, 0x2a, + 0x1b, 0x83, 0x57, 0x5d, 0x61, 0x4e, 0x2e, 0xc, 0xad, 0x54}, + {0x2a, 0x52, 0x2b, 0xb8, 0xd5, 0x67, 0x3b, 0xee, 0xeb, 0xc1, 0xa5, + 0x9f, 0x46, 0x63, 0xf1, 0x36, 0xd3, 0x9f, 0xc1, 0x6e, 0xf2, 0xd2, + 0xb4, 0xa5, 0x8, 0x94, 0x7a, 0xa7, 0xba, 0xb2, 0xec, 0x62}, + }, + { + {0x74, 0x28, 0xb6, 0xaf, 0x36, 0x28, 0x7, 0x92, 0xa5, 0x4, 0xe1, + 0x79, 0x85, 0x5e, 0xcd, 0x5f, 0x4a, 0xa1, 0x30, 0xc6, 0xad, 0x1, + 0xad, 0x5a, 0x98, 0x3f, 0x66, 0x75, 0x50, 0x3d, 0x91, 0x61}, + {0x3d, 0x2b, 0x15, 0x61, 0x52, 0x79, 0xed, 0xe5, 0xd1, 0xd7, 0xdd, + 0xe, 0x7d, 0x35, 0x62, 0x49, 0x71, 0x4c, 0x6b, 0xb9, 0xd0, 0xc8, + 0x82, 0x74, 0xbe, 0xd8, 0x66, 0xa9, 0x19, 0xf9, 0x59, 0x2e}, + {0xda, 0x31, 0x32, 0x1a, 0x36, 0x2d, 0xc6, 0xd, 0x70, 0x2, 0x20, + 0x94, 0x32, 0x58, 0x47, 0xfa, 0xce, 0x94, 0x95, 0x3f, 0x51, 0x1, + 0xd8, 0x2, 0x5c, 0x5d, 0xc0, 0x31, 0xa1, 0xc2, 0xdb, 0x3d}, + }, + }, + { + { + {0x14, 0xbb, 0x96, 0x27, 0xa2, 0x57, 0xaa, 0xf3, 0x21, 0xda, 0x7, + 0x9b, 0xb7, 0xba, 0x3a, 0x88, 0x1c, 0x39, 0xa0, 0x31, 0x18, 0xe2, + 0x4b, 0xe5, 0xf9, 0x5, 0x32, 0xd8, 0x38, 0xfb, 0xe7, 0x5e}, + {0x4b, 0xc5, 0x5e, 0xce, 0xf9, 0xf, 0xdc, 0x9a, 0xd, 0x13, 0x2f, + 0x8c, 0x6b, 0x2a, 0x9c, 0x3, 0x15, 0x95, 0xf8, 0xf0, 0xc7, 0x7, + 0x80, 0x2, 0x6b, 0xb3, 0x4, 0xac, 0x14, 0x83, 0x96, 0x78}, + {0x8e, 0x6a, 0x44, 0x41, 0xcb, 0xfd, 0x8d, 0x53, 0xf9, 0x37, 0x49, + 0x43, 0xa9, 0xfd, 0xac, 0xa5, 0x78, 0x8c, 0x3c, 0x26, 0x8d, 0x90, + 0xaf, 0x46, 0x9, 0xd, 0xca, 0x9b, 0x3c, 0x63, 0xd0, 0x61}, + }, + { + {0xdf, 0x73, 0xfc, 0xf8, 0xbc, 0x28, 0xa3, 0xad, 0xfc, 0x37, 0xf0, + 0xa6, 0x5d, 0x69, 0x84, 0xee, 0x9, 0xa9, 0xc2, 0x38, 0xdb, 0xb4, + 0x7f, 0x63, 0xdc, 0x7b, 0x6, 0xf8, 0x2d, 0xac, 0x23, 0x5b}, + {0x66, 0x25, 0xdb, 0xff, 0x35, 0x49, 0x74, 0x63, 0xbb, 0x68, 0xb, + 0x78, 0x89, 0x6b, 0xbd, 0xc5, 0x3, 0xec, 0x3e, 0x55, 0x80, 0x32, + 0x1b, 0x6f, 0xf5, 0xd7, 0xae, 0x47, 0xd8, 0x5f, 0x96, 0x6e}, + {0x7b, 0x52, 0x80, 0xee, 0x53, 0xb9, 0xd2, 0x9a, 0x8d, 0x6d, 0xde, + 0xfa, 0xaa, 0x19, 0x8f, 0xe8, 0xcf, 0x82, 0xe, 0x15, 0x4, 0x17, + 0x71, 0xe, 0xdc, 0xde, 0x95, 0xdd, 0xb9, 0xbb, 0xb9, 0x79}, + }, + { + {0x74, 0x73, 0x9f, 0x8e, 0xae, 0x7d, 0x99, 0xd1, 0x16, 0x8, 0xbb, + 0xcf, 0xf8, 0xa2, 0x32, 0xa0, 0xa, 0x5f, 0x44, 0x6d, 0x12, 0xba, + 0x6c, 0xcd, 0x34, 0xb8, 0xcc, 0xa, 0x46, 0x11, 0xa8, 0x1b}, + {0xc2, 0x26, 0x31, 0x6a, 0x40, 0x55, 0xb3, 0xeb, 0x93, 0xc3, 0xc8, + 0x68, 0xa8, 0x83, 0x63, 0xd2, 0x82, 0x7a, 0xb9, 0xe5, 0x29, 0x64, + 0xc, 0x6c, 0x47, 0x21, 0xfd, 0xc9, 0x58, 0xf1, 0x65, 0x50}, + {0x54, 0x99, 0x42, 0xc, 0xfb, 0x69, 0x81, 0x70, 0x67, 0xcf, 0x6e, + 0xd7, 0xac, 0x0, 0x46, 0xe1, 0xba, 0x45, 0xe6, 0x70, 0x8a, 0xb9, + 0xaa, 0x2e, 0xf2, 0xfa, 0xa4, 0x58, 0x9e, 0xf3, 0x81, 0x39}, + }, + { + {0xde, 0x6f, 0xe6, 0x6d, 0xa5, 
0xdf, 0x45, 0xc8, 0x3a, 0x48, 0x40, + 0x2c, 0x0, 0xa5, 0x52, 0xe1, 0x32, 0xf6, 0xb4, 0xc7, 0x63, 0xe1, + 0xd2, 0xe9, 0x65, 0x1b, 0xbc, 0xdc, 0x2e, 0x45, 0xf4, 0x30}, + {0x93, 0xa, 0x23, 0x59, 0x75, 0x8a, 0xfb, 0x18, 0x5d, 0xf4, 0xe6, + 0x60, 0x69, 0x8f, 0x16, 0x1d, 0xb5, 0x3c, 0xa9, 0x14, 0x45, 0xa9, + 0x85, 0x3a, 0xfd, 0xd0, 0xac, 0x5, 0x37, 0x8, 0xdc, 0x38}, + {0x40, 0x97, 0x75, 0xc5, 0x82, 0x27, 0x6d, 0x85, 0xcc, 0xbe, 0x9c, + 0xf9, 0x69, 0x45, 0x13, 0xfa, 0x71, 0x4e, 0xea, 0xc0, 0x73, 0xfc, + 0x44, 0x88, 0x69, 0x24, 0x3f, 0x59, 0x1a, 0x9a, 0x2d, 0x63}, + }, + { + {0xa7, 0x84, 0xc, 0xed, 0x11, 0xfd, 0x9, 0xbf, 0x3a, 0x69, 0x9f, + 0xd, 0x81, 0x71, 0xf0, 0x63, 0x79, 0x87, 0xcf, 0x57, 0x2d, 0x8c, + 0x90, 0x21, 0xa2, 0x4b, 0xf6, 0x8a, 0xf2, 0x7d, 0x5a, 0x3a}, + {0xa6, 0xcb, 0x7, 0xb8, 0x15, 0x6b, 0xbb, 0xf6, 0xd7, 0xf0, 0x54, + 0xbc, 0xdf, 0xc7, 0x23, 0x18, 0xb, 0x67, 0x29, 0x6e, 0x3, 0x97, + 0x1d, 0xbb, 0x57, 0x4a, 0xed, 0x47, 0x88, 0xf4, 0x24, 0xb}, + {0xc7, 0xea, 0x1b, 0x51, 0xbe, 0xd4, 0xda, 0xdc, 0xf2, 0xcc, 0x26, + 0xed, 0x75, 0x80, 0x53, 0xa4, 0x65, 0x9a, 0x5f, 0x0, 0x9f, 0xff, + 0x9c, 0xe1, 0x63, 0x1f, 0x48, 0x75, 0x44, 0xf7, 0xfc, 0x34}, + }, + { + {0x98, 0xaa, 0xcf, 0x78, 0xab, 0x1d, 0xbb, 0xa5, 0xf2, 0x72, 0xb, + 0x19, 0x67, 0xa2, 0xed, 0x5c, 0x8e, 0x60, 0x92, 0xa, 0x11, 0xc9, + 0x9, 0x93, 0xb0, 0x74, 0xb3, 0x2f, 0x4, 0xa3, 0x19, 0x1}, + {0xca, 0x67, 0x97, 0x78, 0x4c, 0xe0, 0x97, 0xc1, 0x7d, 0x46, 0xd9, + 0x38, 0xcb, 0x4d, 0x71, 0xb8, 0xa8, 0x5f, 0xf9, 0x83, 0x82, 0x88, + 0xde, 0x55, 0xf7, 0x63, 0xfa, 0x4d, 0x16, 0xdc, 0x3b, 0x3d}, + {0x7d, 0x17, 0xc2, 0xe8, 0x9c, 0xd8, 0xa2, 0x67, 0xc1, 0xd0, 0x95, + 0x68, 0xf6, 0xa5, 0x9d, 0x66, 0xb0, 0xa2, 0x82, 0xb2, 0xe5, 0x98, + 0x65, 0xf5, 0x73, 0xa, 0xe2, 0xed, 0xf1, 0x88, 0xc0, 0x56}, + }, + { + {0x2, 0x8f, 0xf3, 0x24, 0xac, 0x5f, 0x1b, 0x58, 0xbd, 0xc, 0xe3, + 0xba, 0xfe, 0xe9, 0xb, 0xa9, 0xf0, 0x92, 0xcf, 0x8a, 0x2, 0x69, + 0x21, 0x9a, 0x8f, 0x3, 0x59, 0x83, 0xa4, 0x7e, 0x8b, 0x3}, + {0x17, 0x6e, 0xa8, 0x10, 0x11, 0x3d, 0x6d, 0x33, 0xfa, 0xb2, 0x75, + 0xb, 0x32, 0x88, 0xf3, 0xd7, 0x88, 0x29, 0x7, 0x25, 0x76, 0x33, + 0x15, 0xf9, 0x87, 0x8b, 0x10, 0x99, 0x6b, 0x4c, 0x67, 0x9}, + {0xf8, 0x6f, 0x31, 0x99, 0x21, 0xf8, 0x4e, 0x9f, 0x4f, 0x8d, 0xa7, + 0xea, 0x82, 0xd2, 0x49, 0x2f, 0x74, 0x31, 0xef, 0x5a, 0xab, 0xa5, + 0x71, 0x9, 0x65, 0xeb, 0x69, 0x59, 0x2, 0x31, 0x5e, 0x6e}, + }, + { + {0x22, 0x62, 0x6, 0x63, 0xe, 0xfb, 0x4, 0x33, 0x3f, 0xba, 0xac, + 0x87, 0x89, 0x6, 0x35, 0xfb, 0xa3, 0x61, 0x10, 0x8c, 0x77, 0x24, + 0x19, 0xbd, 0x20, 0x86, 0x83, 0xd1, 0x43, 0xad, 0x58, 0x30}, + {0xfb, 0x93, 0xe5, 0x87, 0xf5, 0x62, 0x6c, 0xb1, 0x71, 0x3e, 0x5d, + 0xca, 0xde, 0xed, 0x99, 0x49, 0x6d, 0x3e, 0xcc, 0x14, 0xe0, 0xc1, + 0x91, 0xb4, 0xa8, 0xdb, 0xa8, 0x89, 0x47, 0x11, 0xf5, 0x8}, + {0xd0, 0x63, 0x76, 0xe5, 0xfd, 0xf, 0x3c, 0x32, 0x10, 0xa6, 0x2e, + 0xa2, 0x38, 0xdf, 0xc3, 0x5, 0x9a, 0x4f, 0x99, 0xac, 0xbd, 0x8a, + 0xc7, 0xbd, 0x99, 0xdc, 0xe3, 0xef, 0xa4, 0x9f, 0x54, 0x26}, + }, + }, + { + { + {0x6e, 0x66, 0x3f, 0xaf, 0x49, 0x85, 0x46, 0xdb, 0xa5, 0xe, 0x4a, + 0xf1, 0x4, 0xcf, 0x7f, 0xd7, 0x47, 0xc, 0xba, 0xa4, 0xf7, 0x3f, + 0xf2, 0x3d, 0x85, 0x3c, 0xce, 0x32, 0xe1, 0xdf, 0x10, 0x3a}, + {0xd6, 0xf9, 0x6b, 0x1e, 0x46, 0x5a, 0x1d, 0x74, 0x81, 0xa5, 0x77, + 0x77, 0xfc, 0xb3, 0x5, 0x23, 0xd9, 0xd3, 0x74, 0x64, 0xa2, 0x74, + 0x55, 0xd4, 0xff, 0xe0, 0x1, 0x64, 0xdc, 0xe1, 0x26, 0x19}, + {0xa0, 0xce, 0x17, 0xea, 0x8a, 0x4e, 0x7f, 0xe0, 0xfd, 0xc1, 0x1f, + 0x3a, 0x46, 0x15, 0xd5, 0x2f, 0xf1, 0xc0, 0xf2, 0x31, 0xfd, 0x22, + 0x53, 0x17, 0x15, 0x5d, 0x1e, 0x86, 
0x1d, 0xd0, 0xa1, 0x1f}, + }, + { + {0xab, 0x94, 0xdf, 0xd1, 0x0, 0xac, 0xdc, 0x38, 0xe9, 0xd, 0x8, + 0xd1, 0xdd, 0x2b, 0x71, 0x2e, 0x62, 0xe2, 0xd5, 0xfd, 0x3e, 0xe9, + 0x13, 0x7f, 0xe5, 0x1, 0x9a, 0xee, 0x18, 0xed, 0xfc, 0x73}, + {0x32, 0x98, 0x59, 0x7d, 0x94, 0x55, 0x80, 0xcc, 0x20, 0x55, 0xf1, + 0x37, 0xda, 0x56, 0x46, 0x1e, 0x20, 0x93, 0x5, 0x4e, 0x74, 0xf7, + 0xf6, 0x99, 0x33, 0xcf, 0x75, 0x6a, 0xbc, 0x63, 0x35, 0x77}, + {0xb3, 0x9c, 0x13, 0x63, 0x8, 0xe9, 0xb1, 0x6, 0xcd, 0x3e, 0xa0, + 0xc5, 0x67, 0xda, 0x93, 0xa4, 0x32, 0x89, 0x63, 0xad, 0xc8, 0xce, + 0x77, 0x8d, 0x44, 0x4f, 0x86, 0x1b, 0x70, 0x6b, 0x42, 0x1f}, + }, + { + {0x52, 0x25, 0xa1, 0x91, 0xc8, 0x35, 0x7e, 0xf1, 0x76, 0x9c, 0x5e, + 0x57, 0x53, 0x81, 0x6b, 0xb7, 0x3e, 0x72, 0x9b, 0xd, 0x6f, 0x40, + 0x83, 0xfa, 0x38, 0xe4, 0xa7, 0x3f, 0x1b, 0xbb, 0x76, 0xb}, + {0x1, 0x1c, 0x91, 0x41, 0x4c, 0x26, 0xc9, 0xef, 0x25, 0x2c, 0xa2, + 0x17, 0xb8, 0xb7, 0xa3, 0xf1, 0x47, 0x14, 0xf, 0xf3, 0x6b, 0xda, + 0x75, 0x58, 0x90, 0xb0, 0x31, 0x1d, 0x27, 0xf5, 0x1a, 0x4e}, + {0x9b, 0x93, 0x92, 0x7f, 0xf9, 0xc1, 0xb8, 0x8, 0x6e, 0xab, 0x44, + 0xd4, 0xcb, 0x71, 0x67, 0xbe, 0x17, 0x80, 0xbb, 0x99, 0x63, 0x64, + 0xe5, 0x22, 0x55, 0xa9, 0x72, 0xb7, 0x1e, 0xd6, 0x6d, 0x7b}, + }, + { + {0xc7, 0xd2, 0x1, 0xab, 0xf9, 0xab, 0x30, 0x57, 0x18, 0x3b, 0x14, + 0x40, 0xdc, 0x76, 0xfb, 0x16, 0x81, 0xb2, 0xcb, 0xa0, 0x65, 0xbe, + 0x6c, 0x86, 0xfe, 0x6a, 0xff, 0x9b, 0x65, 0x9b, 0xfa, 0x53}, + {0x92, 0x3d, 0xf3, 0x50, 0xe8, 0xc1, 0xad, 0xb7, 0xcf, 0xd5, 0x8c, + 0x60, 0x4f, 0xfa, 0x98, 0x79, 0xdb, 0x5b, 0xfc, 0x8d, 0xbd, 0x2d, + 0x96, 0xad, 0x4f, 0x2f, 0x1d, 0xaf, 0xce, 0x9b, 0x3e, 0x70}, + {0x55, 0x54, 0x88, 0x94, 0xe9, 0xc8, 0x14, 0x6c, 0xe5, 0xd4, 0xae, + 0x65, 0x66, 0x5d, 0x3a, 0x84, 0xf1, 0x5a, 0xd6, 0xbc, 0x3e, 0xb7, + 0x1b, 0x18, 0x50, 0x1f, 0xc6, 0xc4, 0xe5, 0x93, 0x8d, 0x39}, + }, + { + {0xf2, 0xe3, 0xe7, 0xd2, 0x60, 0x7c, 0x87, 0xc3, 0xb1, 0x8b, 0x82, + 0x30, 0xa0, 0xaa, 0x34, 0x3b, 0x38, 0xf1, 0x9e, 0x73, 0xe7, 0x26, + 0x3e, 0x28, 0x77, 0x5, 0xc3, 0x2, 0x90, 0x9c, 0x9c, 0x69}, + {0xf3, 0x48, 0xe2, 0x33, 0x67, 0xd1, 0x4b, 0x1c, 0x5f, 0xa, 0xbf, + 0x15, 0x87, 0x12, 0x9e, 0xbd, 0x76, 0x3, 0xb, 0xa1, 0xf0, 0x8c, + 0x3f, 0xd4, 0x13, 0x1b, 0x19, 0xdf, 0x5d, 0x9b, 0xb0, 0x53}, + {0xcc, 0xf1, 0x46, 0x59, 0x23, 0xa7, 0x6, 0xf3, 0x7d, 0xd9, 0xe5, + 0xcc, 0xb5, 0x18, 0x17, 0x92, 0x75, 0xe9, 0xb4, 0x81, 0x47, 0xd2, + 0xcd, 0x28, 0x7, 0xd9, 0xcd, 0x6f, 0xc, 0xf3, 0xca, 0x51}, + }, + { + {0xc7, 0x54, 0xac, 0x18, 0x9a, 0xf9, 0x7a, 0x73, 0xf, 0xb3, 0x1c, + 0xc5, 0xdc, 0x78, 0x33, 0x90, 0xc7, 0xc, 0xe1, 0x4c, 0x33, 0xbc, + 0x89, 0x2b, 0x9a, 0xe9, 0xf8, 0x89, 0xc1, 0x29, 0xae, 0x12}, + {0xa, 0xe0, 0x74, 0x76, 0x42, 0xa7, 0xb, 0xa6, 0xf3, 0x7b, 0x7a, + 0xa1, 0x70, 0x85, 0xe, 0x63, 0xcc, 0x24, 0x33, 0xcf, 0x3d, 0x56, + 0x58, 0x37, 0xaa, 0xfd, 0x83, 0x23, 0x29, 0xaa, 0x4, 0x55}, + {0xcf, 0x1, 0xd, 0x1f, 0xcb, 0xc0, 0x9e, 0xa9, 0xae, 0xf7, 0x34, + 0x3a, 0xcc, 0xef, 0xd1, 0xd, 0x22, 0x4e, 0x9c, 0xd0, 0x21, 0x75, + 0xca, 0x55, 0xea, 0xa5, 0xeb, 0x58, 0xe9, 0x4f, 0xd1, 0x5f}, + }, + { + {0x8e, 0xcb, 0x93, 0xbf, 0x5e, 0xfe, 0x42, 0x3c, 0x5f, 0x56, 0xd4, + 0x36, 0x51, 0xa8, 0xdf, 0xbe, 0xe8, 0x20, 0x42, 0x88, 0x9e, 0x85, + 0xf0, 0xe0, 0x28, 0xd1, 0x25, 0x7, 0x96, 0x3f, 0xd7, 0x7d}, + {0x2c, 0xab, 0x45, 0x28, 0xdf, 0x2d, 0xdc, 0xb5, 0x93, 0xe9, 0x7f, + 0xa, 0xb1, 0x91, 0x94, 0x6, 0x46, 0xe3, 0x2, 0x40, 0xd6, 0xf3, + 0xaa, 0x4d, 0xd1, 0x74, 0x64, 0x58, 0x6e, 0xf2, 0x3f, 0x9}, + {0x29, 0x98, 0x5, 0x68, 0xfe, 0x24, 0xd, 0xb1, 0xe5, 0x23, 0xaf, + 0xdb, 0x72, 0x6, 0x73, 0x75, 0x29, 0xac, 
0x57, 0xb4, 0x3a, 0x25, + 0x67, 0x13, 0xa4, 0x70, 0xb4, 0x86, 0xbc, 0xbc, 0x59, 0x2f}, + }, + { + {0x1, 0xc3, 0x91, 0xb6, 0x60, 0xd5, 0x41, 0x70, 0x1e, 0xe7, 0xd7, + 0xad, 0x3f, 0x1b, 0x20, 0x85, 0x85, 0x55, 0x33, 0x11, 0x63, 0xe1, + 0xc2, 0x16, 0xb1, 0x28, 0x8, 0x1, 0x3d, 0x5e, 0xa5, 0x2a}, + {0x5f, 0x13, 0x17, 0x99, 0x42, 0x7d, 0x84, 0x83, 0xd7, 0x3, 0x7d, + 0x56, 0x1f, 0x91, 0x1b, 0xad, 0xd1, 0xaa, 0x77, 0xbe, 0xd9, 0x48, + 0x77, 0x7e, 0x4a, 0xaf, 0x51, 0x2e, 0x2e, 0xb4, 0x58, 0x54}, + {0x4f, 0x44, 0x7, 0xc, 0xe6, 0x92, 0x51, 0xed, 0x10, 0x1d, 0x42, + 0x74, 0x2d, 0x4e, 0xc5, 0x42, 0x64, 0xc8, 0xb5, 0xfd, 0x82, 0x4c, + 0x2b, 0x35, 0x64, 0x86, 0x76, 0x8a, 0x4a, 0x0, 0xe9, 0x13}, + }, + }, + { + { + {0x7f, 0x87, 0x3b, 0x19, 0xc9, 0x0, 0x2e, 0xbb, 0x6b, 0x50, 0xdc, + 0xe0, 0x90, 0xa8, 0xe3, 0xec, 0x9f, 0x64, 0xde, 0x36, 0xc0, 0xb7, + 0xf3, 0xec, 0x1a, 0x9e, 0xde, 0x98, 0x8, 0x4, 0x46, 0x5f}, + {0xdb, 0xce, 0x2f, 0x83, 0x45, 0x88, 0x9d, 0x73, 0x63, 0xf8, 0x6b, + 0xae, 0xc9, 0xd6, 0x38, 0xfa, 0xf7, 0xfe, 0x4f, 0xb7, 0xca, 0xd, + 0xbc, 0x32, 0x5e, 0xe4, 0xbc, 0x14, 0x88, 0x7e, 0x93, 0x73}, + {0x8d, 0xf4, 0x7b, 0x29, 0x16, 0x71, 0x3, 0xb9, 0x34, 0x68, 0xf0, + 0xd4, 0x22, 0x3b, 0xd1, 0xa9, 0xc6, 0xbd, 0x96, 0x46, 0x57, 0x15, + 0x97, 0xe1, 0x35, 0xe8, 0xd5, 0x91, 0xe8, 0xa4, 0xf8, 0x2c}, + }, + { + {0xa2, 0x6b, 0xd0, 0x17, 0x7e, 0x48, 0xb5, 0x2c, 0x6b, 0x19, 0x50, + 0x39, 0x1c, 0x38, 0xd2, 0x24, 0x30, 0x8a, 0x97, 0x85, 0x81, 0x9c, + 0x65, 0xd7, 0xf6, 0xa4, 0xd6, 0x91, 0x28, 0x7f, 0x6f, 0x7a}, + {0x67, 0xf, 0x11, 0x7, 0x87, 0xfd, 0x93, 0x6d, 0x49, 0xb5, 0x38, + 0x7c, 0xd3, 0x9, 0x4c, 0xdd, 0x86, 0x6a, 0x73, 0xc2, 0x4c, 0x6a, + 0xb1, 0x7c, 0x9, 0x2a, 0x25, 0x58, 0x6e, 0xbd, 0x49, 0x20}, + {0x49, 0xef, 0x9a, 0x6a, 0x8d, 0xfd, 0x9, 0x7d, 0xb, 0xb9, 0x3d, + 0x5b, 0xbe, 0x60, 0xee, 0xf0, 0xd4, 0xbf, 0x9e, 0x51, 0x2c, 0xb5, + 0x21, 0x4c, 0x1d, 0x94, 0x45, 0xc5, 0xdf, 0xaa, 0x11, 0x60}, + }, + { + {0x90, 0xf8, 0xcb, 0x2, 0xc8, 0xd0, 0xde, 0x63, 0xaa, 0x6a, 0xff, + 0xd, 0xca, 0x98, 0xd0, 0xfb, 0x99, 0xed, 0xb6, 0xb9, 0xfd, 0xa, + 0x4d, 0x62, 0x1e, 0xb, 0x34, 0x79, 0xb7, 0x18, 0xce, 0x69}, + {0x3c, 0xf8, 0x95, 0xcf, 0x6d, 0x92, 0x67, 0x5f, 0x71, 0x90, 0x28, + 0x71, 0x61, 0x85, 0x7e, 0x7c, 0x5b, 0x7a, 0x8f, 0x99, 0xf3, 0xe7, + 0xa1, 0xd6, 0xe0, 0xf9, 0x62, 0xb, 0x1b, 0xcc, 0xc5, 0x6f}, + {0xcb, 0x79, 0x98, 0xb2, 0x28, 0x55, 0xef, 0xd1, 0x92, 0x90, 0x7e, + 0xd4, 0x3c, 0xae, 0x1a, 0xdd, 0x52, 0x23, 0x9f, 0x18, 0x42, 0x4, + 0x7e, 0x12, 0xf1, 0x1, 0x71, 0xe5, 0x3a, 0x6b, 0x59, 0x15}, + }, + { + {0xca, 0x24, 0x51, 0x7e, 0x16, 0x31, 0xff, 0x9, 0xdf, 0x45, 0xc7, + 0xd9, 0x8b, 0x15, 0xe4, 0xb, 0xe5, 0x56, 0xf5, 0x7e, 0x22, 0x7d, + 0x2b, 0x29, 0x38, 0xd1, 0xb6, 0xaf, 0x41, 0xe2, 0xa4, 0x3a}, + {0xa2, 0x79, 0x91, 0x3f, 0xd2, 0x39, 0x27, 0x46, 0xcf, 0xdd, 0xd6, + 0x97, 0x31, 0x12, 0x83, 0xff, 0x8a, 0x14, 0xf2, 0x53, 0xb5, 0xde, + 0x7, 0x13, 0xda, 0x4d, 0x5f, 0x7b, 0x68, 0x37, 0x22, 0xd}, + {0xf5, 0x5, 0x33, 0x2a, 0xbf, 0x38, 0xc1, 0x2c, 0xc3, 0x26, 0xe9, + 0xa2, 0x8f, 0x3f, 0x58, 0x48, 0xeb, 0xd2, 0x49, 0x55, 0xa2, 0xb1, + 0x3a, 0x8, 0x6c, 0xa3, 0x87, 0x46, 0x6e, 0xaa, 0xfc, 0x32}, + }, + { + {0xdf, 0xcc, 0x87, 0x27, 0x73, 0xa4, 0x7, 0x32, 0xf8, 0xe3, 0x13, + 0xf2, 0x8, 0x19, 0xe3, 0x17, 0x4e, 0x96, 0xd, 0xf6, 0xd7, 0xec, + 0xb2, 0xd5, 0xe9, 0xb, 0x60, 0xc2, 0x36, 0x63, 0x6f, 0x74}, + {0xf5, 0x9a, 0x7d, 0xc5, 0x8d, 0x6e, 0xc5, 0x7b, 0xf2, 0xbd, 0xf0, + 0x9d, 0xed, 0xd2, 0xb, 0x3e, 0xa3, 0xe4, 0xef, 0x22, 0xde, 0x14, + 0xc0, 0xaa, 0x5c, 0x6a, 0xbd, 0xfe, 0xce, 0xe9, 0x27, 0x46}, + {0x1c, 0x97, 0x6c, 0xab, 0x45, 0xf3, 
0x4a, 0x3f, 0x1f, 0x73, 0x43, + 0x99, 0x72, 0xeb, 0x88, 0xe2, 0x6d, 0x18, 0x44, 0x3, 0x8a, 0x6a, + 0x59, 0x33, 0x93, 0x62, 0xd6, 0x7e, 0x0, 0x17, 0x49, 0x7b}, + }, + { + {0xdd, 0xa2, 0x53, 0xdd, 0x28, 0x1b, 0x34, 0x54, 0x3f, 0xfc, 0x42, + 0xdf, 0x5b, 0x90, 0x17, 0xaa, 0xf4, 0xf8, 0xd2, 0x4d, 0xd9, 0x92, + 0xf5, 0xf, 0x7d, 0xd3, 0x8c, 0xe0, 0xf, 0x62, 0x3, 0x1d}, + {0x64, 0xb0, 0x84, 0xab, 0x5c, 0xfb, 0x85, 0x2d, 0x14, 0xbc, 0xf3, + 0x89, 0xd2, 0x10, 0x78, 0x49, 0xc, 0xce, 0x15, 0x7b, 0x44, 0xdc, + 0x6a, 0x47, 0x7b, 0xfd, 0x44, 0xf8, 0x76, 0xa3, 0x2b, 0x12}, + {0x54, 0xe5, 0xb4, 0xa2, 0xcd, 0x32, 0x2, 0xc2, 0x7f, 0x18, 0x5d, + 0x11, 0x42, 0xfd, 0xd0, 0x9e, 0xd9, 0x79, 0xd4, 0x7d, 0xbe, 0xb4, + 0xab, 0x2e, 0x4c, 0xec, 0x68, 0x2b, 0xf5, 0xb, 0xc7, 0x2}, + }, + { + {0xe1, 0x72, 0x8d, 0x45, 0xbf, 0x32, 0xe5, 0xac, 0xb5, 0x3c, 0xb7, + 0x7c, 0xe0, 0x68, 0xe7, 0x5b, 0xe7, 0xbd, 0x8b, 0xee, 0x94, 0x7d, + 0xcf, 0x56, 0x3, 0x3a, 0xb4, 0xfe, 0xe3, 0x97, 0x6, 0x6b}, + {0xbb, 0x2f, 0xb, 0x5d, 0x4b, 0xec, 0x87, 0xa2, 0xca, 0x82, 0x48, + 0x7, 0x90, 0x57, 0x5c, 0x41, 0x5c, 0x81, 0xd0, 0xc1, 0x1e, 0xa6, + 0x44, 0xe0, 0xe0, 0xf5, 0x9e, 0x40, 0xa, 0x4f, 0x33, 0x26}, + {0xc0, 0xa3, 0x62, 0xdf, 0x4a, 0xf0, 0xc8, 0xb6, 0x5d, 0xa4, 0x6d, + 0x7, 0xef, 0x0, 0xf0, 0x3e, 0xa9, 0xd2, 0xf0, 0x49, 0x58, 0xb9, + 0x9c, 0x9c, 0xae, 0x2f, 0x1b, 0x44, 0x43, 0x7f, 0xc3, 0x1c}, + }, + { + {0xb9, 0xae, 0xce, 0xc9, 0xf1, 0x56, 0x66, 0xd7, 0x6a, 0x65, 0xe5, + 0x18, 0xf8, 0x15, 0x5b, 0x1c, 0x34, 0x23, 0x4c, 0x84, 0x32, 0x28, + 0xe7, 0x26, 0x38, 0x68, 0x19, 0x2f, 0x77, 0x6f, 0x34, 0x3a}, + {0x4f, 0x32, 0xc7, 0x5c, 0x5a, 0x56, 0x8f, 0x50, 0x22, 0xa9, 0x6, + 0xe5, 0xc0, 0xc4, 0x61, 0xd0, 0x19, 0xac, 0x45, 0x5c, 0xdb, 0xab, + 0x18, 0xfb, 0x4a, 0x31, 0x80, 0x3, 0xc1, 0x9, 0x68, 0x6c}, + {0xc8, 0x6a, 0xda, 0xe2, 0x12, 0x51, 0xd5, 0xd2, 0xed, 0x51, 0xe8, + 0xb1, 0x31, 0x3, 0xbd, 0xe9, 0x62, 0x72, 0xc6, 0x8e, 0xdd, 0x46, + 0x7, 0x96, 0xd0, 0xc5, 0xf7, 0x6e, 0x9f, 0x1b, 0x91, 0x5}, + }, + }, + { + { + {0xef, 0xea, 0x2e, 0x51, 0xf3, 0xac, 0x49, 0x53, 0x49, 0xcb, 0xc1, + 0x1c, 0xd3, 0x41, 0xc1, 0x20, 0x8d, 0x68, 0x9a, 0xa9, 0x7, 0xc, + 0x18, 0x24, 0x17, 0x2d, 0x4b, 0xc6, 0xd1, 0xf9, 0x5e, 0x55}, + {0xbb, 0xe, 0xdf, 0xf5, 0x83, 0x99, 0x33, 0xc1, 0xac, 0x4c, 0x2c, + 0x51, 0x8f, 0x75, 0xf3, 0xc0, 0xe1, 0x98, 0xb3, 0xb, 0xa, 0x13, + 0xf1, 0x2c, 0x62, 0xc, 0x27, 0xaa, 0xf9, 0xec, 0x3c, 0x6b}, + {0x8, 0xbd, 0x73, 0x3b, 0xba, 0x70, 0xa7, 0x36, 0xc, 0xbf, 0xaf, + 0xa3, 0x8, 0xef, 0x4a, 0x62, 0xf2, 0x46, 0x9, 0xb4, 0x98, 0xff, + 0x37, 0x57, 0x9d, 0x74, 0x81, 0x33, 0xe1, 0x4d, 0x5f, 0x67}, + }, + { + {0x1d, 0xb3, 0xda, 0x3b, 0xd9, 0xf6, 0x2f, 0xa1, 0xfe, 0x2d, 0x65, + 0x9d, 0xf, 0xd8, 0x25, 0x7, 0x87, 0x94, 0xbe, 0x9a, 0xf3, 0x4f, + 0x9c, 0x1, 0x43, 0x3c, 0xcd, 0x82, 0xb8, 0x50, 0xf4, 0x60}, + {0xfc, 0x82, 0x17, 0x6b, 0x3, 0x52, 0x2c, 0xe, 0xb4, 0x83, 0xad, + 0x6c, 0x81, 0x6c, 0x81, 0x64, 0x3e, 0x7, 0x64, 0x69, 0xd9, 0xbd, + 0xdc, 0xd0, 0x20, 0xc5, 0x64, 0x1, 0xf7, 0x9d, 0xd9, 0x13}, + {0xca, 0xc0, 0xe5, 0x21, 0xc3, 0x5e, 0x4b, 0x1, 0xa2, 0xbf, 0x19, + 0xd7, 0xc9, 0x69, 0xcb, 0x4f, 0xa0, 0x23, 0x0, 0x75, 0x18, 0x1c, + 0x5f, 0x4e, 0x80, 0xac, 0xed, 0x55, 0x9e, 0xde, 0x6, 0x1c}, + }, + { + {0xaa, 0x69, 0x6d, 0xff, 0x40, 0x2b, 0xd5, 0xff, 0xbb, 0x49, 0x40, + 0xdc, 0x18, 0xb, 0x53, 0x34, 0x97, 0x98, 0x4d, 0xa3, 0x2f, 0x5c, + 0x4a, 0x5e, 0x2d, 0xba, 0x32, 0x7d, 0x8e, 0x6f, 0x9, 0x78}, + {0xe2, 0xc4, 0x3e, 0xa3, 0xd6, 0x7a, 0xf, 0x99, 0x8e, 0xe0, 0x2e, + 0xbe, 0x38, 0xf9, 0x8, 0x66, 0x15, 0x45, 0x28, 0x63, 0xc5, 0x43, + 0xa1, 0x9c, 0xd, 0xb6, 0x2d, 0xec, 
0x1f, 0x8a, 0xf3, 0x4c}, + {0xe7, 0x5c, 0xfa, 0xd, 0x65, 0xaa, 0xaa, 0xa0, 0x8c, 0x47, 0xb5, + 0x48, 0x2a, 0x9e, 0xc4, 0xf9, 0x5b, 0x72, 0x3, 0x70, 0x7d, 0xcc, + 0x9, 0x4f, 0xbe, 0x1a, 0x9, 0x26, 0x3a, 0xad, 0x3c, 0x37}, + }, + { + {0xad, 0xbb, 0xdd, 0x89, 0xfb, 0xa8, 0xbe, 0xf1, 0xcb, 0xae, 0xae, + 0x61, 0xbc, 0x2c, 0xcb, 0x3b, 0x9d, 0x8d, 0x9b, 0x1f, 0xbb, 0xa7, + 0x58, 0x8f, 0x86, 0xa6, 0x12, 0x51, 0xda, 0x7e, 0x54, 0x21}, + {0x7c, 0xf5, 0xc9, 0x82, 0x4d, 0x63, 0x94, 0xb2, 0x36, 0x45, 0x93, + 0x24, 0xe1, 0xfd, 0xcb, 0x1f, 0x5a, 0xdb, 0x8c, 0x41, 0xb3, 0x4d, + 0x9c, 0x9e, 0xfc, 0x19, 0x44, 0x45, 0xd9, 0xf3, 0x40, 0x0}, + {0xd3, 0x86, 0x59, 0xfd, 0x39, 0xe9, 0xfd, 0xde, 0xc, 0x38, 0xa, + 0x51, 0x89, 0x2c, 0x27, 0xf4, 0xb9, 0x19, 0x31, 0xbb, 0x7, 0xa4, + 0x2b, 0xb7, 0xf4, 0x4d, 0x25, 0x4a, 0x33, 0xa, 0x55, 0x63}, + }, + { + {0x49, 0x7b, 0x54, 0x72, 0x45, 0x58, 0xba, 0x9b, 0xe0, 0x8, 0xc4, + 0xe2, 0xfa, 0xc6, 0x5, 0xf3, 0x8d, 0xf1, 0x34, 0xc7, 0x69, 0xfa, + 0xe8, 0x60, 0x7a, 0x76, 0x7d, 0xaa, 0xaf, 0x2b, 0xa9, 0x39}, + {0x37, 0xcf, 0x69, 0xb5, 0xed, 0xd6, 0x7, 0x65, 0xe1, 0x2e, 0xa5, + 0xc, 0xb0, 0x29, 0x84, 0x17, 0x5d, 0xd6, 0x6b, 0xeb, 0x90, 0x0, + 0x7c, 0xea, 0x51, 0x8f, 0xf7, 0xda, 0xc7, 0x62, 0xea, 0x3e}, + {0x4e, 0x27, 0x93, 0xe6, 0x13, 0xc7, 0x24, 0x9d, 0x75, 0xd3, 0xdb, + 0x68, 0x77, 0x85, 0x63, 0x5f, 0x9a, 0xb3, 0x8a, 0xeb, 0x60, 0x55, + 0x52, 0x70, 0xcd, 0xc4, 0xc9, 0x65, 0x6, 0x6a, 0x43, 0x68}, + }, + { + {0x7c, 0x10, 0x20, 0xe8, 0x17, 0xd3, 0x56, 0x1e, 0x65, 0xe9, 0xa, + 0x84, 0x44, 0x68, 0x26, 0xc5, 0x7a, 0xfc, 0xf, 0x32, 0xc6, 0xa1, + 0xe0, 0xc1, 0x72, 0x14, 0x61, 0x91, 0x9c, 0x66, 0x73, 0x53}, + {0x27, 0x3f, 0x2f, 0x20, 0xe8, 0x35, 0x2, 0xbc, 0xb0, 0x75, 0xf9, + 0x64, 0xe2, 0x0, 0x5c, 0xc7, 0x16, 0x24, 0x8c, 0xa3, 0xd5, 0xe9, + 0xa4, 0x91, 0xf9, 0x89, 0xb7, 0x8a, 0xf6, 0xe7, 0xb6, 0x17}, + {0x57, 0x52, 0xe, 0x9a, 0xab, 0x14, 0x28, 0x5d, 0xfc, 0xb3, 0xca, + 0xc9, 0x84, 0x20, 0x8f, 0x90, 0xca, 0x1e, 0x2d, 0x5b, 0x88, 0xf5, + 0xca, 0xaf, 0x11, 0x7d, 0xf8, 0x78, 0xa6, 0xb5, 0xb4, 0x1c}, + }, + { + {0xe7, 0x7, 0xa0, 0xa2, 0x62, 0xaa, 0x74, 0x6b, 0xb1, 0xc7, 0x71, + 0xf0, 0xb0, 0xe0, 0x11, 0xf3, 0x23, 0xe2, 0xb, 0x0, 0x38, 0xe4, + 0x7, 0x57, 0xac, 0x6e, 0xef, 0x82, 0x2d, 0xfd, 0xc0, 0x2d}, + {0x6c, 0xfc, 0x4a, 0x39, 0x6b, 0xc0, 0x64, 0xb6, 0xb1, 0x5f, 0xda, + 0x98, 0x24, 0xde, 0x88, 0xc, 0x34, 0xd8, 0xca, 0x4b, 0x16, 0x3, + 0x8d, 0x4f, 0xa2, 0x34, 0x74, 0xde, 0x78, 0xca, 0xb, 0x33}, + {0x4e, 0x74, 0x19, 0x11, 0x84, 0xff, 0x2e, 0x98, 0x24, 0x47, 0x7, + 0x2b, 0x96, 0x5e, 0x69, 0xf9, 0xfb, 0x53, 0xc9, 0xbf, 0x4f, 0xc1, + 0x8a, 0xc5, 0xf5, 0x1c, 0x9f, 0x36, 0x1b, 0xbe, 0x31, 0x3c}, + }, + { + {0x72, 0x42, 0xcb, 0xf9, 0x93, 0xbc, 0x68, 0xc1, 0x98, 0xdb, 0xce, + 0xc7, 0x1f, 0x71, 0xb8, 0xae, 0x7a, 0x8d, 0xac, 0x34, 0xaa, 0x52, + 0xe, 0x7f, 0xbb, 0x55, 0x7d, 0x7e, 0x9, 0xc1, 0xce, 0x41}, + {0xee, 0x8a, 0x94, 0x8, 0x4d, 0x86, 0xf4, 0xb0, 0x6f, 0x1c, 0xba, + 0x91, 0xee, 0x19, 0xdc, 0x7, 0x58, 0xa1, 0xac, 0xa6, 0xae, 0xcd, + 0x75, 0x79, 0xbb, 0xd4, 0x62, 0x42, 0x13, 0x61, 0xb, 0x33}, + {0x8a, 0x80, 0x6d, 0xa2, 0xd7, 0x19, 0x96, 0xf7, 0x6d, 0x15, 0x9e, + 0x1d, 0x9e, 0xd4, 0x1f, 0xbb, 0x27, 0xdf, 0xa1, 0xdb, 0x6c, 0xc3, + 0xd7, 0x73, 0x7d, 0x77, 0x28, 0x1f, 0xd9, 0x4c, 0xb4, 0x26}, + }, + }, + { + { + {0x83, 0x3, 0x73, 0x62, 0x93, 0xf2, 0xb7, 0xe1, 0x2c, 0x8a, 0xca, + 0xeb, 0xff, 0x79, 0x52, 0x4b, 0x14, 0x13, 0xd4, 0xbf, 0x8a, 0x77, + 0xfc, 0xda, 0xf, 0x61, 0x72, 0x9c, 0x14, 0x10, 0xeb, 0x7d}, + {0x75, 0x74, 0x38, 0x8f, 0x47, 0x48, 0xf0, 0x51, 0x3c, 0xcb, 0xbe, + 0x9c, 0xf4, 0xbc, 0x5d, 0xb2, 
0x55, 0x20, 0x9f, 0xd9, 0x44, 0x12, + 0xab, 0x9a, 0xd6, 0xa5, 0x10, 0x1c, 0x6c, 0x9e, 0x70, 0x2c}, + {0x7a, 0xee, 0x66, 0x87, 0x6a, 0xaf, 0x62, 0xcb, 0xe, 0xcd, 0x53, + 0x55, 0x4, 0xec, 0xcb, 0x66, 0xb5, 0xe4, 0xb, 0xf, 0x38, 0x1, + 0x80, 0x58, 0xea, 0xe2, 0x2c, 0xf6, 0x9f, 0x8e, 0xe6, 0x8}, + }, + { + {0xf9, 0xf2, 0xb8, 0xa, 0xd5, 0x9, 0x2d, 0x2f, 0xdf, 0x23, 0x59, + 0xc5, 0x8d, 0x21, 0xb9, 0xac, 0xb9, 0x6c, 0x76, 0x73, 0x26, 0x34, + 0x8f, 0x4a, 0xf5, 0x19, 0xf7, 0x38, 0xd7, 0x3b, 0xb1, 0x4c}, + {0xad, 0x30, 0xc1, 0x4b, 0xa, 0x50, 0xad, 0x34, 0x9c, 0xd4, 0xb, + 0x3d, 0x49, 0xdb, 0x38, 0x8d, 0xbe, 0x89, 0xa, 0x50, 0x98, 0x3d, + 0x5c, 0xa2, 0x9, 0x3b, 0xba, 0xee, 0x87, 0x3f, 0x1f, 0x2f}, + {0x4a, 0xb6, 0x15, 0xe5, 0x75, 0x8c, 0x84, 0xf7, 0x38, 0x90, 0x4a, + 0xdb, 0xba, 0x1, 0x95, 0xa5, 0x50, 0x1b, 0x75, 0x3f, 0x3f, 0x31, + 0xd, 0xc2, 0xe8, 0x2e, 0xae, 0xc0, 0x53, 0xe3, 0xa1, 0x19}, + }, + { + {0xbd, 0xbd, 0x96, 0xd5, 0xcd, 0x72, 0x21, 0xb4, 0x40, 0xfc, 0xee, + 0x98, 0x43, 0x45, 0xe0, 0x93, 0xb5, 0x9, 0x41, 0xb4, 0x47, 0x53, + 0xb1, 0x9f, 0x34, 0xae, 0x66, 0x2, 0x99, 0xd3, 0x6b, 0x73}, + {0xc3, 0x5, 0xfa, 0xba, 0x60, 0x75, 0x1c, 0x7d, 0x61, 0x5e, 0xe5, + 0xc6, 0xa0, 0xa0, 0xe1, 0xb3, 0x73, 0x64, 0xd6, 0xc0, 0x18, 0x97, + 0x52, 0xe3, 0x86, 0x34, 0xc, 0xc2, 0x11, 0x6b, 0x54, 0x41}, + {0xb4, 0xb3, 0x34, 0x93, 0x50, 0x2d, 0x53, 0x85, 0x73, 0x65, 0x81, + 0x60, 0x4b, 0x11, 0xfd, 0x46, 0x75, 0x83, 0x5c, 0x42, 0x30, 0x5f, + 0x5f, 0xcc, 0x5c, 0xab, 0x7f, 0xb8, 0xa2, 0x95, 0x22, 0x41}, + }, + { + {0xc6, 0xea, 0x93, 0xe2, 0x61, 0x52, 0x65, 0x2e, 0xdb, 0xac, 0x33, + 0x21, 0x3, 0x92, 0x5a, 0x84, 0x6b, 0x99, 0x0, 0x79, 0xcb, 0x75, + 0x9, 0x46, 0x80, 0xdd, 0x5a, 0x19, 0x8d, 0xbb, 0x60, 0x7}, + {0xe9, 0xd6, 0x7e, 0xf5, 0x88, 0x9b, 0xc9, 0x19, 0x25, 0xc8, 0xf8, + 0x6d, 0x26, 0xcb, 0x93, 0x53, 0x73, 0xd2, 0xa, 0xb3, 0x13, 0x32, + 0xee, 0x5c, 0x34, 0x2e, 0x2d, 0xb5, 0xeb, 0x53, 0xe1, 0x14}, + {0x8a, 0x81, 0xe6, 0xcd, 0x17, 0x1a, 0x3e, 0x41, 0x84, 0xa0, 0x69, + 0xed, 0xa9, 0x6d, 0x15, 0x57, 0xb1, 0xcc, 0xca, 0x46, 0x8f, 0x26, + 0xbf, 0x2c, 0xf2, 0xc5, 0x3a, 0xc3, 0x9b, 0xbe, 0x34, 0x6b}, + }, + { + {0xd3, 0xf2, 0x71, 0x65, 0x65, 0x69, 0xfc, 0x11, 0x7a, 0x73, 0xe, + 0x53, 0x45, 0xe8, 0xc9, 0xc6, 0x35, 0x50, 0xfe, 0xd4, 0xa2, 0xe7, + 0x3a, 0xe3, 0xb, 0xd3, 0x6d, 0x2e, 0xb6, 0xc7, 0xb9, 0x1}, + {0xb2, 0xc0, 0x78, 0x3a, 0x64, 0x2f, 0xdf, 0xf3, 0x7c, 0x2, 0x2e, + 0xf2, 0x1e, 0x97, 0x3e, 0x4c, 0xa3, 0xb5, 0xc1, 0x49, 0x5e, 0x1c, + 0x7d, 0xec, 0x2d, 0xdd, 0x22, 0x9, 0x8f, 0xc1, 0x12, 0x20}, + {0x29, 0x9d, 0xc8, 0x5a, 0xe5, 0x55, 0xb, 0x88, 0x63, 0xa7, 0xa0, + 0x45, 0x1f, 0x24, 0x83, 0x14, 0x1f, 0x6c, 0xe7, 0xc2, 0xdf, 0xef, + 0x36, 0x3d, 0xe8, 0xad, 0x4b, 0x4e, 0x78, 0x5b, 0xaf, 0x8}, + }, + { + {0x4b, 0x2c, 0xcc, 0x89, 0xd2, 0x14, 0x73, 0xe2, 0x8d, 0x17, 0x87, + 0xa2, 0x11, 0xbd, 0xe4, 0x4b, 0xce, 0x64, 0x33, 0xfa, 0xd6, 0x28, + 0xd5, 0x18, 0x6e, 0x82, 0xd9, 0xaf, 0xd5, 0xc1, 0x23, 0x64}, + {0x33, 0x25, 0x1f, 0x88, 0xdc, 0x99, 0x34, 0x28, 0xb6, 0x23, 0x93, + 0x77, 0xda, 0x25, 0x5, 0x9d, 0xf4, 0x41, 0x34, 0x67, 0xfb, 0xdd, + 0x7a, 0x89, 0x8d, 0x16, 0x3a, 0x16, 0x71, 0x9d, 0xb7, 0x32}, + {0x6a, 0xb3, 0xfc, 0xed, 0xd9, 0xf8, 0x85, 0xcc, 0xf9, 0xe5, 0x46, + 0x37, 0x8f, 0xc2, 0xbc, 0x22, 0xcd, 0xd3, 0xe5, 0xf9, 0x38, 0xe3, + 0x9d, 0xe4, 0xcc, 0x2d, 0x3e, 0xc1, 0xfb, 0x5e, 0xa, 0x48}, + }, + { + {0x1f, 0x22, 0xce, 0x42, 0xe4, 0x4c, 0x61, 0xb6, 0x28, 0x39, 0x5, + 0x4c, 0xcc, 0x9d, 0x19, 0x6e, 0x3, 0xbe, 0x1c, 0xdc, 0xa4, 0xb4, + 0x3f, 0x66, 0x6, 0x8e, 0x1c, 0x69, 0x47, 0x1d, 0xb3, 0x24}, + {0x71, 0x20, 0x62, 0x1, 0xb, 0xe7, 
0x51, 0xb, 0xc5, 0xaf, 0x1d, + 0x8b, 0xcf, 0x5, 0xb5, 0x6, 0xcd, 0xab, 0x5a, 0xef, 0x61, 0xb0, + 0x6b, 0x2c, 0x31, 0xbf, 0xb7, 0xc, 0x60, 0x27, 0xaa, 0x47}, + {0xc3, 0xf8, 0x15, 0xc0, 0xed, 0x1e, 0x54, 0x2a, 0x7c, 0x3f, 0x69, + 0x7c, 0x7e, 0xfe, 0xa4, 0x11, 0xd6, 0x78, 0xa2, 0x4e, 0x13, 0x66, + 0xaf, 0xf0, 0x94, 0xa0, 0xdd, 0x14, 0x5d, 0x58, 0x5b, 0x54}, + }, + { + {0xe1, 0x21, 0xb3, 0xe3, 0xd0, 0xe4, 0x4, 0x62, 0x95, 0x1e, 0xff, + 0x28, 0x7a, 0x63, 0xaa, 0x3b, 0x9e, 0xbd, 0x99, 0x5b, 0xfd, 0xcf, + 0xc, 0xb, 0x71, 0xd0, 0xc8, 0x64, 0x3e, 0xdc, 0x22, 0x4d}, + {0xf, 0x3a, 0xd4, 0xa0, 0x5e, 0x27, 0xbf, 0x67, 0xbe, 0xee, 0x9b, + 0x8, 0x34, 0x8e, 0xe6, 0xad, 0x2e, 0xe7, 0x79, 0xd4, 0x4c, 0x13, + 0x89, 0x42, 0x54, 0x54, 0xba, 0x32, 0xc3, 0xf9, 0x62, 0xf}, + {0x39, 0x5f, 0x3b, 0xd6, 0x89, 0x65, 0xb4, 0xfc, 0x61, 0xcf, 0xcb, + 0x57, 0x3f, 0x6a, 0xae, 0x5c, 0x5, 0xfa, 0x3a, 0x95, 0xd2, 0xc2, + 0xba, 0xfe, 0x36, 0x14, 0x37, 0x36, 0x1a, 0xa0, 0xf, 0x1c}, + }, + }, + { + { + {0x50, 0x6a, 0x93, 0x8c, 0xe, 0x2b, 0x8, 0x69, 0xb6, 0xc5, 0xda, + 0xc1, 0x35, 0xa0, 0xc9, 0xf9, 0x34, 0xb6, 0xdf, 0xc4, 0x54, 0x3e, + 0xb7, 0x6f, 0x40, 0xc1, 0x2b, 0x1d, 0x9b, 0x41, 0x5, 0x40}, + {0xff, 0x3d, 0x94, 0x22, 0xb6, 0x4, 0xc6, 0xd2, 0xa0, 0xb3, 0xcf, + 0x44, 0xce, 0xbe, 0x8c, 0xbc, 0x78, 0x86, 0x80, 0x97, 0xf3, 0x4f, + 0x25, 0x5d, 0xbf, 0xa6, 0x1c, 0x3b, 0x4f, 0x61, 0xa3, 0xf}, + {0xf0, 0x82, 0xbe, 0xb9, 0xbd, 0xfe, 0x3, 0xa0, 0x90, 0xac, 0x44, + 0x3a, 0xaf, 0xc1, 0x89, 0x20, 0x8e, 0xfa, 0x54, 0x19, 0x91, 0x9f, + 0x49, 0xf8, 0x42, 0xab, 0x40, 0xef, 0x8a, 0x21, 0xba, 0x1f}, + }, + { + {0x94, 0x1, 0x7b, 0x3e, 0x4, 0x57, 0x3e, 0x4f, 0x7f, 0xaf, 0xda, + 0x8, 0xee, 0x3e, 0x1d, 0xa8, 0xf1, 0xde, 0xdc, 0x99, 0xab, 0xc6, + 0x39, 0xc8, 0xd5, 0x61, 0x77, 0xff, 0x13, 0x5d, 0x53, 0x6c}, + {0x3e, 0xf5, 0xc8, 0xfa, 0x48, 0x94, 0x54, 0xab, 0x41, 0x37, 0xa6, + 0x7b, 0x9a, 0xe8, 0xf6, 0x81, 0x1, 0x5e, 0x2b, 0x6c, 0x7d, 0x6c, + 0xfd, 0x74, 0x42, 0x6e, 0xc8, 0xa8, 0xca, 0x3a, 0x2e, 0x39}, + {0xaf, 0x35, 0x8a, 0x3e, 0xe9, 0x34, 0xbd, 0x4c, 0x16, 0xe8, 0x87, + 0x58, 0x44, 0x81, 0x7, 0x2e, 0xab, 0xb0, 0x9a, 0xf2, 0x76, 0x9c, + 0x31, 0x19, 0x3b, 0xc1, 0xa, 0xd5, 0xe4, 0x7f, 0xe1, 0x25}, + }, + { + {0xa7, 0x21, 0xf1, 0x76, 0xf5, 0x7f, 0x5f, 0x91, 0xe3, 0x87, 0xcd, + 0x2f, 0x27, 0x32, 0x4a, 0xc3, 0x26, 0xe5, 0x1b, 0x4d, 0xde, 0x2f, + 0xba, 0xcc, 0x9b, 0x89, 0x69, 0x89, 0x8f, 0x82, 0xba, 0x6b}, + {0x76, 0xf6, 0x4, 0x1e, 0xd7, 0x9b, 0x28, 0xa, 0x95, 0xf, 0x42, + 0xd6, 0x52, 0x1c, 0x8e, 0x20, 0xab, 0x1f, 0x69, 0x34, 0xb0, 0xd8, + 0x86, 0x51, 0x51, 0xb3, 0x9f, 0x2a, 0x44, 0x51, 0x57, 0x25}, + {0x1, 0x39, 0xfe, 0x90, 0x66, 0xbc, 0xd1, 0xe2, 0xd5, 0x7a, 0x99, + 0xa0, 0x18, 0x4a, 0xb5, 0x4c, 0xd4, 0x60, 0x84, 0xaf, 0x14, 0x69, + 0x1d, 0x97, 0xe4, 0x7b, 0x6b, 0x7f, 0x4f, 0x50, 0x9d, 0x55}, + }, + { + {0xfd, 0x66, 0xd2, 0xf6, 0xe7, 0x91, 0x48, 0x9c, 0x1b, 0x78, 0x7, + 0x3, 0x9b, 0xa1, 0x44, 0x7, 0x3b, 0xe2, 0x61, 0x60, 0x1d, 0x8f, + 0x38, 0x88, 0xe, 0xd5, 0x4b, 0x35, 0xa3, 0xa6, 0x3e, 0x12}, + {0xd5, 0x54, 0xeb, 0xb3, 0x78, 0x83, 0x73, 0xa7, 0x7c, 0x3c, 0x55, + 0xa5, 0x66, 0xd3, 0x69, 0x1d, 0xba, 0x0, 0x28, 0xf9, 0x62, 0xcf, + 0x26, 0xa, 0x17, 0x32, 0x7e, 0x80, 0xd5, 0x12, 0xab, 0x1}, + {0x96, 0x2d, 0xe3, 0x41, 0x90, 0x18, 0x8d, 0x11, 0x48, 0x58, 0x31, + 0xd8, 0xc2, 0xe3, 0xed, 0xb9, 0xd9, 0x45, 0x32, 0xd8, 0x71, 0x42, + 0xab, 0x1e, 0x54, 0xa1, 0x18, 0xc9, 0xe2, 0x61, 0x39, 0x4a}, + }, + { + {0x1e, 0x3f, 0x23, 0xf3, 0x44, 0xd6, 0x27, 0x3, 0x16, 0xf0, 0xfc, + 0x34, 0xe, 0x26, 0x9a, 0x49, 0x79, 0xb9, 0xda, 0xf2, 0x16, 0xa7, + 0xb5, 0x83, 0x1f, 0x11, 
0xd4, 0x9b, 0xad, 0xee, 0xac, 0x68}, + {0xa0, 0xbb, 0xe6, 0xf8, 0xe0, 0x3b, 0xdc, 0x71, 0xa, 0xe3, 0xff, + 0x7e, 0x34, 0xf8, 0xce, 0xd6, 0x6a, 0x47, 0x3a, 0xe1, 0x5f, 0x42, + 0x92, 0xa9, 0x63, 0xb7, 0x1d, 0xfb, 0xe3, 0xbc, 0xd6, 0x2c}, + {0x10, 0xc2, 0xd7, 0xf3, 0xe, 0xc9, 0xb4, 0x38, 0xc, 0x4, 0xad, + 0xb7, 0x24, 0x6e, 0x8e, 0x30, 0x23, 0x3e, 0xe7, 0xb7, 0xf1, 0xd9, + 0x60, 0x38, 0x97, 0xf5, 0x8, 0xb5, 0xd5, 0x60, 0x57, 0x59}, + }, + { + {0x90, 0x27, 0x2, 0xfd, 0xeb, 0xcb, 0x2a, 0x88, 0x60, 0x57, 0x11, + 0xc4, 0x5, 0x33, 0xaf, 0x89, 0xf4, 0x73, 0x34, 0x7d, 0xe3, 0x92, + 0xf4, 0x65, 0x2b, 0x5a, 0x51, 0x54, 0xdf, 0xc5, 0xb2, 0x2c}, + {0x97, 0x63, 0xaa, 0x4, 0xe1, 0xbf, 0x29, 0x61, 0xcb, 0xfc, 0xa7, + 0xa4, 0x8, 0x0, 0x96, 0x8f, 0x58, 0x94, 0x90, 0x7d, 0x89, 0xc0, + 0x8b, 0x3f, 0xa9, 0x91, 0xb2, 0xdc, 0x3e, 0xa4, 0x9f, 0x70}, + {0xca, 0x2a, 0xfd, 0x63, 0x8c, 0x5d, 0xa, 0xeb, 0xff, 0x4e, 0x69, + 0x2e, 0x66, 0xc1, 0x2b, 0xd2, 0x3a, 0xb0, 0xcb, 0xf8, 0x6e, 0xf3, + 0x23, 0x27, 0x1f, 0x13, 0xc8, 0xf0, 0xec, 0x29, 0xf0, 0x70}, + }, + { + {0xb9, 0xb0, 0x10, 0x5e, 0xaa, 0xaf, 0x6a, 0x2a, 0xa9, 0x1a, 0x4, + 0xef, 0x70, 0xa3, 0xf0, 0x78, 0x1f, 0xd6, 0x3a, 0xaa, 0x77, 0xfb, + 0x3e, 0x77, 0xe1, 0xd9, 0x4b, 0xa7, 0xa2, 0xa5, 0xec, 0x44}, + {0x33, 0x3e, 0xed, 0x2e, 0xb3, 0x7, 0x13, 0x46, 0xe7, 0x81, 0x55, + 0xa4, 0x33, 0x2f, 0x4, 0xae, 0x66, 0x3, 0x5f, 0x19, 0xd3, 0x49, + 0x44, 0xc9, 0x58, 0x48, 0x31, 0x6c, 0x8a, 0x5d, 0x7d, 0xb}, + {0x43, 0xd5, 0x95, 0x7b, 0x32, 0x48, 0xd4, 0x25, 0x1d, 0xf, 0x34, + 0xa3, 0x0, 0x83, 0xd3, 0x70, 0x2b, 0xc5, 0xe1, 0x60, 0x1c, 0x53, + 0x1c, 0xde, 0xe4, 0xe9, 0x7d, 0x2c, 0x51, 0x24, 0x22, 0x27}, + }, + { + {0xfc, 0x75, 0xa9, 0x42, 0x8a, 0xbb, 0x7b, 0xbf, 0x58, 0xa3, 0xad, + 0x96, 0x77, 0x39, 0x5c, 0x8c, 0x48, 0xaa, 0xed, 0xcd, 0x6f, 0xc7, + 0x7f, 0xe2, 0xa6, 0x20, 0xbc, 0xf6, 0xd7, 0x5f, 0x73, 0x19}, + {0x2e, 0x34, 0xc5, 0x49, 0xaf, 0x92, 0xbc, 0x1a, 0xd0, 0xfa, 0xe6, + 0xb2, 0x11, 0xd8, 0xee, 0xff, 0x29, 0x4e, 0xc8, 0xfc, 0x8d, 0x8c, + 0xa2, 0xef, 0x43, 0xc5, 0x4c, 0xa4, 0x18, 0xdf, 0xb5, 0x11}, + {0x66, 0x42, 0xc8, 0x42, 0xd0, 0x90, 0xab, 0xe3, 0x7e, 0x54, 0x19, + 0x7f, 0xf, 0x8e, 0x84, 0xeb, 0xb9, 0x97, 0xa4, 0x65, 0xd0, 0xa1, + 0x3, 0x25, 0x5f, 0x89, 0xdf, 0x91, 0x11, 0x91, 0xef, 0xf}, + }, + }, +}; + +#endif // OPENSSL_SMALL + +// Bi[i] = (2*i+1)*B +static const ge_precomp Bi[8] = { + { + {{ +#if defined(OPENSSL_64_BIT) + 1288382639258501, 245678601348599, 269427782077623, + 1462984067271730, 137412439391563 +#else + 25967493, 19198397, 29566455, 3660896, 54414519, 4014786, 27544626, + 21800161, 61029707, 2047604 +#endif + }}, + {{ +#if defined(OPENSSL_64_BIT) + 62697248952638, 204681361388450, 631292143396476, 338455783676468, + 1213667448819585 +#else + 54563134, 934261, 64385954, 3049989, 66381436, 9406985, 12720692, + 5043384, 19500929, 18085054 +#endif + }}, + {{ +#if defined(OPENSSL_64_BIT) + 301289933810280, 1259582250014073, 1422107436869536, + 796239922652654, 1953934009299142 +#else + 58370664, 4489569, 9688441, 18769238, 10184608, 21191052, 29287918, + 11864899, 42594502, 29115885 +#endif + }}, + }, + { + {{ +#if defined(OPENSSL_64_BIT) + 1601611775252272, 1720807796594148, 1132070835939856, + 1260455018889551, 2147779492816911 +#else + 15636272, 23865875, 24204772, 25642034, 616976, 16869170, 27787599, + 18782243, 28944399, 32004408 +#endif + }}, + {{ +#if defined(OPENSSL_64_BIT) + 316559037616741, 2177824224946892, 1459442586438991, + 1461528397712656, 751590696113597 +#else + 16568933, 4717097, 55552716, 32452109, 15682895, 21747389, 16354576, + 21778470, 
7689661, 11199574 +#endif + }}, + {{ +#if defined(OPENSSL_64_BIT) + 1850748884277385, 1200145853858453, 1068094770532492, + 672251375690438, 1586055907191707 +#else + 30464137, 27578307, 55329429, 17883566, 23220364, 15915852, 7512774, + 10017326, 49359771, 23634074 +#endif + }}, + }, + { + {{ +#if defined(OPENSSL_64_BIT) + 769950342298419, 132954430919746, 844085933195555, 974092374476333, + 726076285546016 +#else + 10861363, 11473154, 27284546, 1981175, 37044515, 12577860, 32867885, + 14515107, 51670560, 10819379 +#endif + }}, + {{ +#if defined(OPENSSL_64_BIT) + 425251763115706, 608463272472562, 442562545713235, 837766094556764, + 374555092627893 +#else + 4708026, 6336745, 20377586, 9066809, 55836755, 6594695, 41455196, + 12483687, 54440373, 5581305 +#endif + }}, + {{ +#if defined(OPENSSL_64_BIT) + 1086255230780037, 274979815921559, 1960002765731872, + 929474102396301, 1190409889297339 +#else + 19563141, 16186464, 37722007, 4097518, 10237984, 29206317, 28542349, + 13850243, 43430843, 17738489 +#endif + }}, + }, + { + {{ +#if defined(OPENSSL_64_BIT) + 665000864555967, 2065379846933859, 370231110385876, 350988370788628, + 1233371373142985 +#else + 5153727, 9909285, 1723747, 30776558, 30523604, 5516873, 19480852, + 5230134, 43156425, 18378665 +#endif + }}, + {{ +#if defined(OPENSSL_64_BIT) + 2019367628972465, 676711900706637, 110710997811333, + 1108646842542025, 517791959672113 +#else + 36839857, 30090922, 7665485, 10083793, 28475525, 1649722, 20654025, + 16520125, 30598449, 7715701 +#endif + }}, + {{ +#if defined(OPENSSL_64_BIT) + 965130719900578, 247011430587952, 526356006571389, 91986625355052, + 2157223321444601 +#else + 28881826, 14381568, 9657904, 3680757, 46927229, 7843315, 35708204, + 1370707, 29794553, 32145132 +#endif + }}, + }, + { + {{ +#if defined(OPENSSL_64_BIT) + 1802695059465007, 1664899123557221, 593559490740857, + 2160434469266659, 927570450755031 +#else + 44589871, 26862249, 14201701, 24808930, 43598457, 8844725, 18474211, + 32192982, 54046167, 13821876 +#endif + }}, + {{ +#if defined(OPENSSL_64_BIT) + 1725674970513508, 1933645953859181, 1542344539275782, + 1767788773573747, 1297447965928905 +#else + 60653668, 25714560, 3374701, 28813570, 40010246, 22982724, 31655027, + 26342105, 18853321, 19333481 +#endif + }}, + {{ +#if defined(OPENSSL_64_BIT) + 1381809363726107, 1430341051343062, 2061843536018959, + 1551778050872521, 2036394857967624 +#else + 4566811, 20590564, 38133974, 21313742, 59506191, 30723862, 58594505, + 23123294, 2207752, 30344648 +#endif + }}, + }, + { + {{ +#if defined(OPENSSL_64_BIT) + 1970894096313054, 528066325833207, 1619374932191227, + 2207306624415883, 1169170329061080 +#else + 41954014, 29368610, 29681143, 7868801, 60254203, 24130566, 54671499, + 32891431, 35997400, 17421995 +#endif + }}, + {{ +#if defined(OPENSSL_64_BIT) + 2070390218572616, 1458919061857835, 624171843017421, + 1055332792707765, 433987520732508 +#else + 25576264, 30851218, 7349803, 21739588, 16472781, 9300885, 3844789, + 15725684, 171356, 6466918 +#endif + }}, + {{ +#if defined(OPENSSL_64_BIT) + 893653801273833, 1168026499324677, 1242553501121234, + 1306366254304474, 1086752658510815 +#else + 23103977, 13316479, 9739013, 17404951, 817874, 18515490, 8965338, + 19466374, 36393951, 16193876 +#endif + }}, + }, + { + {{ +#if defined(OPENSSL_64_BIT) + 213454002618221, 939771523987438, 1159882208056014, 317388369627517, + 621213314200687 +#else + 33587053, 3180712, 64714734, 14003686, 50205390, 17283591, 17238397, + 4729455, 49034351, 9256799 +#endif + }}, + {{ +#if defined(OPENSSL_64_BIT) + 
1971678598905747, 338026507889165, 762398079972271, 655096486107477, + 42299032696322 +#else + 41926547, 29380300, 32336397, 5036987, 45872047, 11360616, 22616405, + 9761698, 47281666, 630304 +#endif + }}, + {{ +#if defined(OPENSSL_64_BIT) + 177130678690680, 1754759263300204, 1864311296286618, + 1180675631479880, 1292726903152791 +#else + 53388152, 2639452, 42871404, 26147950, 9494426, 27780403, 60554312, + 17593437, 64659607, 19263131 +#endif + }}, + }, + { + {{ +#if defined(OPENSSL_64_BIT) + 1913163449625248, 460779200291993, 2193883288642314, + 1008900146920800, 1721983679009502 +#else + 63957664, 28508356, 9282713, 6866145, 35201802, 32691408, 48168288, + 15033783, 25105118, 25659556 +#endif + }}, + {{ +#if defined(OPENSSL_64_BIT) + 1070401523076875, 1272492007800961, 1910153608563310, + 2075579521696771, 1191169788841221 +#else + 42782475, 15950225, 35307649, 18961608, 55446126, 28463506, 1573891, + 30928545, 2198789, 17749813 +#endif + }}, + {{ +#if defined(OPENSSL_64_BIT) + 692896803108118, 500174642072499, 2068223309439677, + 1162190621851337, 1426986007309901 +#else + 64009494, 10324966, 64867251, 7453182, 61661885, 30818928, 53296841, + 17317989, 34647629, 21263748 +#endif + }}, + }, +}; diff --git a/ring-0.17.14/crypto/curve25519/internal.h b/ring-0.17.14/crypto/curve25519/internal.h new file mode 100644 index 0000000000..d15a1b47a7 --- /dev/null +++ b/ring-0.17.14/crypto/curve25519/internal.h @@ -0,0 +1,135 @@ +// Copyright 2020 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef OPENSSL_HEADER_CURVE25519_INTERNAL_H +#define OPENSSL_HEADER_CURVE25519_INTERNAL_H + +#include + +#include "../internal.h" + + +#if defined(OPENSSL_ARM) && !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_APPLE) +#define BORINGSSL_X25519_NEON + +// x25519_NEON is defined in asm/x25519-arm.S. +void x25519_NEON(uint8_t out[32], const uint8_t scalar[32], + const uint8_t point[32]); +#endif + +#if !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_SMALL) && \ + defined(__GNUC__) && defined(__x86_64__) && !defined(OPENSSL_WINDOWS) +#define BORINGSSL_FE25519_ADX + +// fiat_curve25519_adx_mul is defined in +// third_party/fiat/asm/fiat_curve25519_adx_mul.S +void __attribute__((sysv_abi)) +fiat_curve25519_adx_mul(uint64_t out[4], const uint64_t in1[4], + const uint64_t in2[4]); + +// fiat_curve25519_adx_square is defined in +// third_party/fiat/asm/fiat_curve25519_adx_square.S +void __attribute__((sysv_abi)) +fiat_curve25519_adx_square(uint64_t out[4], const uint64_t in[4]); + +// x25519_scalar_mult_adx is defined in third_party/fiat/curve25519_64_adx.h +void x25519_scalar_mult_adx(uint8_t out[32], const uint8_t scalar[32], + const uint8_t point[32]); +void x25519_ge_scalarmult_base_adx(uint8_t h[4][32], const uint8_t a[32]); +#endif + +#if defined(OPENSSL_64_BIT) +// An element t, +// entries t[0]...t[4], represents the integer t[0]+2^51 t[1]+2^102 t[2]+2^153 +// t[3]+2^204 t[4]. +// fe limbs are bounded by 1.125*2^51. +// fe_loose limbs are bounded by 3.375*2^51. 
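+//
+// (Editor's illustrative note, not part of the upstream comment: in this
+// radix-2^51 representation the integer 1 is stored as t = {1, 0, 0, 0, 0}
+// and 2^51 + 7 as t = {7, 1, 0, 0, 0}. Adding two in-bounds fe values
+// limb-by-limb stays below the fe_loose bound above, which is why addition
+// and subtraction can return fe_loose without an immediate carry.)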
+typedef uint64_t fe_limb_t; +#define FE_NUM_LIMBS 5 +#else +// An element t, +// entries t[0]...t[9], represents the integer t[0]+2^26 t[1]+2^51 t[2]+2^77 +// t[3]+2^102 t[4]+...+2^230 t[9]. +// fe limbs are bounded by 1.125*2^26,1.125*2^25,1.125*2^26,1.125*2^25,etc. +// fe_loose limbs are bounded by 3.375*2^26,3.375*2^25,3.375*2^26,3.375*2^25,etc. +typedef uint32_t fe_limb_t; +#define FE_NUM_LIMBS 10 +#endif + +// fe means field element. Here the field is \Z/(2^255-19). +// Multiplication and carrying produce fe from fe_loose. +// Keep in sync with `Elem` and `ELEM_LIMBS` in curve25519/ops.rs. +typedef struct fe { fe_limb_t v[FE_NUM_LIMBS]; } fe; + +// Addition and subtraction produce fe_loose from (fe, fe). +// Keep in sync with `Elem` and `ELEM_LIMBS` in curve25519/ops.rs. +typedef struct fe_loose { fe_limb_t v[FE_NUM_LIMBS]; } fe_loose; + +static inline void fe_limbs_copy(fe_limb_t r[], const fe_limb_t a[]) { + for (size_t i = 0; i < FE_NUM_LIMBS; ++i) { + r[i] = a[i]; + } +} + +// ge means group element. +// +// Here the group is the set of pairs (x,y) of field elements (see fe.h) +// satisfying -x^2 + y^2 = 1 + d x^2y^2 +// where d = -121665/121666. +// +// Representations: +// ge_p2 (projective): (X:Y:Z) satisfying x=X/Z, y=Y/Z +// ge_p3 (extended): (X:Y:Z:T) satisfying x=X/Z, y=Y/Z, XY=ZT +// ge_p1p1 (completed): ((X:Z),(Y:T)) satisfying x=X/Z, y=Y/T +// ge_precomp (Duif): (y+x,y-x,2dxy) + +// Keep in sync with `Point` in curve25519/ops.rs. +typedef struct { + fe X; + fe Y; + fe Z; +} ge_p2; + + +// Keep in sync with `ExtPoint` in curve25519/ops.rs. +typedef struct { + fe X; + fe Y; + fe Z; + fe T; +} ge_p3; + +typedef struct { + fe_loose X; + fe_loose Y; + fe_loose Z; + fe_loose T; +} ge_p1p1; + +typedef struct { + fe_loose yplusx; + fe_loose yminusx; + fe_loose xy2d; +} ge_precomp; + +typedef struct { + fe_loose YplusX; + fe_loose YminusX; + fe_loose Z; + fe_loose T2d; +} ge_cached; + +extern const uint8_t k25519Precomp[32][8][3][32]; + +#endif // OPENSSL_HEADER_CURVE25519_INTERNAL_H diff --git a/ring-0.17.14/crypto/fipsmodule/aes/aes_nohw.c b/ring-0.17.14/crypto/fipsmodule/aes/aes_nohw.c new file mode 100644 index 0000000000..9530cbc9b2 --- /dev/null +++ b/ring-0.17.14/crypto/fipsmodule/aes/aes_nohw.c @@ -0,0 +1,881 @@ +/* Copyright (c) 2019, Google Inc. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ + +#include + +#include "../../internal.h" + +// This file contains a constant-time implementation of AES, bitsliced with +// 32-bit or 64-bit, operating on two-, four-, and eight-block +// batches, respectively. +// +// This implementation is based on the algorithms described in the following +// references: +// - https://bearssl.org/constanttime.html#aes +// - https://eprint.iacr.org/2009/129.pdf +// - https://eprint.iacr.org/2009/191.pdf + + +// Word operations. 
+// +// An aes_word_t is the word used for this AES implementation. Throughout this +// file, bits and bytes are ordered little-endian, though "left" and "right" +// shifts match the operations themselves, which makes them reversed in a +// little-endian, left-to-right reading. +// +// Eight |aes_word_t|s contain |AES_NOHW_BATCH_SIZE| blocks. The bits in an +// |aes_word_t| are divided into 16 consecutive groups of |AES_NOHW_BATCH_SIZE| +// bits each, each corresponding to a byte in an AES block in column-major +// order (AES's byte order). We refer to these as "logical bytes". Note, in the +// 32-bit and 64-bit implementations, they are smaller than a byte. (The +// contents of a logical byte will be described later.) +// +// MSVC does not support C bit operators on |__m128i|, so the wrapper functions +// |aes_nohw_and|, etc., should be used instead. Note |aes_nohw_shift_left| and +// |aes_nohw_shift_right| measure the shift in logical bytes. That is, the shift +// value ranges from 0 to 15 independent of |aes_word_t| and +// |AES_NOHW_BATCH_SIZE|. +// +// This ordering is different from https://eprint.iacr.org/2009/129.pdf, which +// uses row-major order. Matching the AES order was easier to reason about, and +// we do not have PSHUFB available to arbitrarily permute bytes. + +#if defined(OPENSSL_64_BIT) +typedef uint64_t aes_word_t; +#define AES_NOHW_WORD_SIZE 8 +#define AES_NOHW_BATCH_SIZE 4 +#define AES_NOHW_ROW0_MASK UINT64_C(0x000f000f000f000f) +#define AES_NOHW_ROW1_MASK UINT64_C(0x00f000f000f000f0) +#define AES_NOHW_ROW2_MASK UINT64_C(0x0f000f000f000f00) +#define AES_NOHW_ROW3_MASK UINT64_C(0xf000f000f000f000) +#else // !OPENSSL_64_BIT +typedef uint32_t aes_word_t; +#define AES_NOHW_WORD_SIZE 4 +#define AES_NOHW_BATCH_SIZE 2 +#define AES_NOHW_ROW0_MASK 0x03030303 +#define AES_NOHW_ROW1_MASK 0x0c0c0c0c +#define AES_NOHW_ROW2_MASK 0x30303030 +#define AES_NOHW_ROW3_MASK 0xc0c0c0c0 +#endif // OPENSSL_64_BIT + +static inline aes_word_t aes_nohw_and(aes_word_t a, aes_word_t b) { + return a & b; +} + +static inline aes_word_t aes_nohw_or(aes_word_t a, aes_word_t b) { + return a | b; +} + +static inline aes_word_t aes_nohw_xor(aes_word_t a, aes_word_t b) { + return a ^ b; +} + +static inline aes_word_t aes_nohw_not(aes_word_t a) { return ~a; } + +static inline aes_word_t aes_nohw_shift_left(aes_word_t a, aes_word_t i) { + return a << (i * AES_NOHW_BATCH_SIZE); +} + +static inline aes_word_t aes_nohw_shift_right(aes_word_t a, aes_word_t i) { + return a >> (i * AES_NOHW_BATCH_SIZE); +} + +OPENSSL_STATIC_ASSERT(AES_NOHW_BATCH_SIZE * 128 == 8 * 8 * sizeof(aes_word_t), + "batch size does not match word size"); +OPENSSL_STATIC_ASSERT(AES_NOHW_WORD_SIZE == sizeof(aes_word_t), + "AES_NOHW_WORD_SIZE is incorrect"); + + +// Block representations. +// +// This implementation uses three representations for AES blocks. First, the +// public API represents blocks as uint8_t[16] in the usual way. Second, most +// AES steps are evaluated in bitsliced form, stored in an |AES_NOHW_BATCH|. +// This stores |AES_NOHW_BATCH_SIZE| blocks in bitsliced order. For 64-bit words +// containing bitsliced blocks a, b, c, d, this would be as follows (vertical +// bars divide logical bytes): +// +// batch.w[0] = a0 b0 c0 d0 | a8 b8 c8 d8 | a16 b16 c16 d16 ... +// batch.w[1] = a1 b1 c1 d1 | a9 b9 c9 d9 | a17 b17 c17 d17 ... +// batch.w[2] = a2 b2 c2 d2 | a10 b10 c10 d10 | a18 b18 c18 d18 ... +// batch.w[3] = a3 b3 c3 d3 | a11 b11 c11 d11 | a19 b19 c19 d19 ... +// ... 
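+//
+// (Editor's illustrative note, not part of the upstream comment: in the
+// 32-bit build AES_NOHW_BATCH_SIZE is 2, so each logical byte is two bits
+// wide and a batch holds two blocks; the layout sketched above is otherwise
+// the same, with pairs a/b in place of the quadruples a/b/c/d.)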
+// +// Finally, an individual block may be stored as an intermediate form in an +// aes_word_t[AES_NOHW_BLOCK_WORDS]. In this form, we permute the bits in each +// block, so that block[0]'s ith logical byte contains least-significant +// |AES_NOHW_BATCH_SIZE| bits of byte i, block[1] contains the next group of +// |AES_NOHW_BATCH_SIZE| bits, and so on. We refer to this transformation as +// "compacting" the block. Note this is no-op with 128-bit words because then +// |AES_NOHW_BLOCK_WORDS| is one and |AES_NOHW_BATCH_SIZE| is eight. For 64-bit +// words, one block would be stored in two words: +// +// block[0] = a0 a1 a2 a3 | a8 a9 a10 a11 | a16 a17 a18 a19 ... +// block[1] = a4 a5 a6 a7 | a12 a13 a14 a15 | a20 a21 a22 a23 ... +// +// Observe that the distances between corresponding bits in bitsliced and +// compact bit orders match. If we line up corresponding words of each block, +// the bitsliced and compact representations may be converted by tranposing bits +// in corresponding logical bytes. Continuing the 64-bit example: +// +// block_a[0] = a0 a1 a2 a3 | a8 a9 a10 a11 | a16 a17 a18 a19 ... +// block_b[0] = b0 b1 b2 b3 | b8 b9 b10 b11 | b16 b17 b18 b19 ... +// block_c[0] = c0 c1 c2 c3 | c8 c9 c10 c11 | c16 c17 c18 c19 ... +// block_d[0] = d0 d1 d2 d3 | d8 d9 d10 d11 | d16 d17 d18 d19 ... +// +// batch.w[0] = a0 b0 c0 d0 | a8 b8 c8 d8 | a16 b16 c16 d16 ... +// batch.w[1] = a1 b1 c1 d1 | a9 b9 c9 d9 | a17 b17 c17 d17 ... +// batch.w[2] = a2 b2 c2 d2 | a10 b10 c10 d10 | a18 b18 c18 d18 ... +// batch.w[3] = a3 b3 c3 d3 | a11 b11 c11 d11 | a19 b19 c19 d19 ... +// +// Note also that bitwise operations and (logical) byte permutations on an +// |aes_word_t| work equally for the bitsliced and compact words. +// +// We use the compact form in the |AES_KEY| representation to save work +// inflating round keys into |AES_NOHW_BATCH|. The compact form also exists +// temporarily while moving blocks in or out of an |AES_NOHW_BATCH|, immediately +// before or after |aes_nohw_transpose|. + +#define AES_NOHW_BLOCK_WORDS (16 / sizeof(aes_word_t)) + +// An AES_NOHW_BATCH stores |AES_NOHW_BATCH_SIZE| blocks. Unless otherwise +// specified, it is in bitsliced form. +typedef struct { + aes_word_t w[8]; +} AES_NOHW_BATCH; + +// An AES_NOHW_SCHEDULE is an expanded bitsliced AES key schedule. It is +// suitable for encryption or decryption. It is as large as |AES_NOHW_BATCH| +// |AES_KEY|s so it should not be used as a long-term key representation. +typedef struct { + // keys is an array of batches, one for each round key. Each batch stores + // |AES_NOHW_BATCH_SIZE| copies of the round key in bitsliced form. + AES_NOHW_BATCH keys[AES_MAXNR + 1]; +} AES_NOHW_SCHEDULE; + +// aes_nohw_batch_set sets the |i|th block of |batch| to |in|. |batch| is in +// compact form. +static inline void aes_nohw_batch_set(AES_NOHW_BATCH *batch, + const aes_word_t in[AES_NOHW_BLOCK_WORDS], + size_t i) { + // Note the words are interleaved. The order comes from |aes_nohw_transpose|. + // If |i| is zero and this is the 64-bit implementation, in[0] contains bits + // 0-3 and in[1] contains bits 4-7. We place in[0] at w[0] and in[1] at + // w[4] so that bits 0 and 4 are in the correct position. (In general, bits + // along diagonals of |AES_NOHW_BATCH_SIZE| by |AES_NOHW_BATCH_SIZE| squares + // will be correctly placed.) 
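+ // (Editor's illustrative note: concretely, in the 64-bit build i = 1 places
+ // in[0] at w[1] and in[1] at w[5], which is exactly the interleaving that
+ // |aes_nohw_batch_get| below reads back out.)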
+  dev_assert_secret(i < AES_NOHW_BATCH_SIZE);
+#if defined(OPENSSL_64_BIT)
+  batch->w[i] = in[0];
+  batch->w[i + 4] = in[1];
+#else
+  batch->w[i] = in[0];
+  batch->w[i + 2] = in[1];
+  batch->w[i + 4] = in[2];
+  batch->w[i + 6] = in[3];
+#endif
+}
+
+// aes_nohw_batch_get writes the |i|th block of |batch| to |out|. |batch| is in
+// compact form.
+static inline void aes_nohw_batch_get(const AES_NOHW_BATCH *batch,
+                                      aes_word_t out[AES_NOHW_BLOCK_WORDS],
+                                      size_t i) {
+  dev_assert_secret(i < AES_NOHW_BATCH_SIZE);
+#if defined(OPENSSL_64_BIT)
+  out[0] = batch->w[i];
+  out[1] = batch->w[i + 4];
+#else
+  out[0] = batch->w[i];
+  out[1] = batch->w[i + 2];
+  out[2] = batch->w[i + 4];
+  out[3] = batch->w[i + 6];
+#endif
+}
+
+// aes_nohw_delta_swap returns |a| with bits |a & mask| and
+// |a & (mask << shift)| swapped. |mask| and |mask << shift| may not overlap.
+static inline aes_word_t aes_nohw_delta_swap(aes_word_t a, aes_word_t mask,
+                                             aes_word_t shift) {
+  // See
+  // https://reflectionsonsecurity.wordpress.com/2014/05/11/efficient-bit-permutation-using-delta-swaps/
+  aes_word_t b = (a ^ (a >> shift)) & mask;
+  return a ^ b ^ (b << shift);
+}
+
+// In the 32-bit and 64-bit implementations, a block spans multiple words.
+// |aes_nohw_compact_block| must permute bits across different words. First we
+// implement |aes_nohw_compact_word| which performs a smaller version of the
+// transformation which stays within a single word.
+//
+// These transformations are generalizations of the output of
+// http://programming.sirrida.de/calcperm.php on smaller inputs.
+#if defined(OPENSSL_64_BIT)
+static inline uint64_t aes_nohw_compact_word(uint64_t a) {
+#if defined(RING_BIG_ENDIAN)
+  a = CRYPTO_bswap8(a);
+#endif
+  // Numbering the 64/4 = 16 4-bit chunks, least to most significant, we swap
+  // quartets of those chunks:
+  //   0 1 2 3 | 4 5 6 7 | 8 9 10 11 | 12 13 14 15 =>
+  //   0 2 1 3 | 4 6 5 7 | 8 10 9 11 | 12 14 13 15
+  a = aes_nohw_delta_swap(a, UINT64_C(0x00f000f000f000f0), 4);
+  // Swap quartets of 8-bit chunks (still numbering by 4-bit chunks):
+  //   0 2 1 3 | 4 6 5 7 | 8 10 9 11 | 12 14 13 15 =>
+  //   0 2 4 6 | 1 3 5 7 | 8 10 12 14 | 9 11 13 15
+  a = aes_nohw_delta_swap(a, UINT64_C(0x0000ff000000ff00), 8);
+  // Swap quartets of 16-bit chunks (still numbering by 4-bit chunks):
+  //   0 2 4 6 | 1 3 5 7 | 8 10 12 14 | 9 11 13 15 =>
+  //   0 2 4 6 | 8 10 12 14 | 1 3 5 7 | 9 11 13 15
+  a = aes_nohw_delta_swap(a, UINT64_C(0x00000000ffff0000), 16);
+  return a;
+}
+
+static inline uint64_t aes_nohw_uncompact_word(uint64_t a) {
+  // Reverse the steps of |aes_nohw_compact_word|.
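+  // Each delta swap is its own inverse, so uncompacting simply applies the
+  // same three swaps in the opposite order.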
+  a = aes_nohw_delta_swap(a, UINT64_C(0x00000000ffff0000), 16);
+  a = aes_nohw_delta_swap(a, UINT64_C(0x0000ff000000ff00), 8);
+  a = aes_nohw_delta_swap(a, UINT64_C(0x00f000f000f000f0), 4);
+#if defined(RING_BIG_ENDIAN)
+  a = CRYPTO_bswap8(a);
+#endif
+  return a;
+}
+#else  // !OPENSSL_64_BIT
+static inline uint32_t aes_nohw_compact_word(uint32_t a) {
+#if defined(RING_BIG_ENDIAN)
+  a = CRYPTO_bswap4(a);
+#endif
+  // Numbering the 32/2 = 16 pairs of bits, least to most significant, we swap:
+  //   0 1 2 3 | 4 5 6 7 | 8 9 10 11 | 12 13 14 15 =>
+  //   0 4 2 6 | 1 5 3 7 | 8 12 10 14 | 9 13 11 15
+  // Note:  0x00cc = 0b0000_0000_1100_1100
+  //   0x00cc << 6 = 0b0011_0011_0000_0000
+  a = aes_nohw_delta_swap(a, 0x00cc00cc, 6);
+  // Now we swap groups of four bits (still numbering by pairs):
+  //   0 4 2 6 | 1 5 3 7 | 8 12 10 14 | 9 13 11 15 =>
+  //   0 4 8 12 | 1 5 9 13 | 2 6 10 14 | 3 7 11 15
+  // Note: 0x0000_f0f0 << 12 = 0x0f0f_0000
+  a = aes_nohw_delta_swap(a, 0x0000f0f0, 12);
+  return a;
+}
+
+static inline uint32_t aes_nohw_uncompact_word(uint32_t a) {
+  // Reverse the steps of |aes_nohw_compact_word|.
+  a = aes_nohw_delta_swap(a, 0x0000f0f0, 12);
+  a = aes_nohw_delta_swap(a, 0x00cc00cc, 6);
+#if defined(RING_BIG_ENDIAN)
+  a = CRYPTO_bswap4(a);
+#endif
+  return a;
+}
+
+static inline uint32_t aes_nohw_word_from_bytes(uint8_t a0, uint8_t a1,
+                                                uint8_t a2, uint8_t a3) {
+  return (uint32_t)a0 | ((uint32_t)a1 << 8) | ((uint32_t)a2 << 16) |
+         ((uint32_t)a3 << 24);
+}
+
+static inline uint8_t lo(uint32_t a) {
+  return (uint8_t)a;
+}
+
+#endif  // OPENSSL_64_BIT
+
+static inline void aes_nohw_compact_block(aes_word_t out[AES_NOHW_BLOCK_WORDS],
+                                          const uint8_t in[16]) {
+  OPENSSL_memcpy(out, in, 16);
+#if defined(OPENSSL_64_BIT)
+  uint64_t a0 = aes_nohw_compact_word(out[0]);
+  uint64_t a1 = aes_nohw_compact_word(out[1]);
+  out[0] = (a0 & UINT64_C(0x00000000ffffffff)) | (a1 << 32);
+  out[1] = (a1 & UINT64_C(0xffffffff00000000)) | (a0 >> 32);
+#else
+  uint32_t a0 = aes_nohw_compact_word(out[0]);
+  uint32_t a1 = aes_nohw_compact_word(out[1]);
+  uint32_t a2 = aes_nohw_compact_word(out[2]);
+  uint32_t a3 = aes_nohw_compact_word(out[3]);
+  // Note clang, when building for ARM Thumb2, will sometimes miscompile
+  // expressions such as (a0 & 0x0000ff00) << 8, particularly when building
+  // without optimizations. This bug was introduced in
+  // https://reviews.llvm.org/rL340261 and fixed in
+  // https://reviews.llvm.org/rL351310. The following is written to avoid this.
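+  // Gather byte k of a0..a3 into out[k]: each output word collects the same
+  // byte position from all four compacted words.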
+ out[0] = aes_nohw_word_from_bytes(lo(a0), lo(a1), lo(a2), lo(a3)); + out[1] = aes_nohw_word_from_bytes(lo(a0 >> 8), lo(a1 >> 8), lo(a2 >> 8), lo(a3 >> 8)); + out[2] = aes_nohw_word_from_bytes(lo(a0 >> 16), lo(a1 >> 16), lo(a2 >> 16), lo(a3 >> 16)); + out[3] = aes_nohw_word_from_bytes(lo(a0 >> 24), lo(a1 >> 24), lo(a2 >> 24), lo(a3 >> 24)); +#endif +} + +static inline void aes_nohw_uncompact_block( + uint8_t out[16], const aes_word_t in[AES_NOHW_BLOCK_WORDS]) { +#if defined(OPENSSL_64_BIT) + uint64_t a0 = in[0]; + uint64_t a1 = in[1]; + uint64_t b0 = + aes_nohw_uncompact_word((a0 & UINT64_C(0x00000000ffffffff)) | (a1 << 32)); + uint64_t b1 = + aes_nohw_uncompact_word((a1 & UINT64_C(0xffffffff00000000)) | (a0 >> 32)); + OPENSSL_memcpy(out, &b0, 8); + OPENSSL_memcpy(out + 8, &b1, 8); +#else + uint32_t a0 = in[0]; + uint32_t a1 = in[1]; + uint32_t a2 = in[2]; + uint32_t a3 = in[3]; + // Note clang, when building for ARM Thumb2, will sometimes miscompile + // expressions such as (a0 & 0x0000ff00) << 8, particularly when building + // without optimizations. This bug was introduced in + // https://reviews.llvm.org/rL340261 and fixed in + // https://reviews.llvm.org/rL351310. The following is written to avoid this. + uint32_t b0 = aes_nohw_word_from_bytes(lo(a0), lo(a1), lo(a2), lo(a3)); + uint32_t b1 = aes_nohw_word_from_bytes(lo(a0 >> 8), lo(a1 >> 8), lo(a2 >> 8), lo(a3 >> 8)); + uint32_t b2 = + aes_nohw_word_from_bytes(lo(a0 >> 16), lo(a1 >> 16), lo(a2 >> 16), lo(a3 >> 16)); + uint32_t b3 = + aes_nohw_word_from_bytes(lo(a0 >> 24), lo(a1 >> 24), lo(a2 >> 24), lo(a3 >> 24)); + b0 = aes_nohw_uncompact_word(b0); + b1 = aes_nohw_uncompact_word(b1); + b2 = aes_nohw_uncompact_word(b2); + b3 = aes_nohw_uncompact_word(b3); + OPENSSL_memcpy(out, &b0, 4); + OPENSSL_memcpy(out + 4, &b1, 4); + OPENSSL_memcpy(out + 8, &b2, 4); + OPENSSL_memcpy(out + 12, &b3, 4); +#endif +} + +// aes_nohw_swap_bits is a variation on a delta swap. It swaps the bits in +// |*a & (mask << shift)| with the bits in |*b & mask|. |mask| and +// |mask << shift| must not overlap. |mask| is specified as a |uint32_t|, but it +// is repeated to the full width of |aes_word_t|. +static inline void aes_nohw_swap_bits(aes_word_t *a, aes_word_t *b, + uint32_t mask, aes_word_t shift) { +#if defined(OPENSSL_64_BIT) + aes_word_t mask_w = (((uint64_t)mask) << 32) | mask; +#else + aes_word_t mask_w = mask; +#endif + // This is a variation on a delta swap. + aes_word_t swap = ((*a >> shift) ^ *b) & mask_w; + *a ^= swap << shift; + *b ^= swap; +} + +// aes_nohw_transpose converts |batch| to and from bitsliced form. It divides +// the 8 × word_size bits into AES_NOHW_BATCH_SIZE × AES_NOHW_BATCH_SIZE squares +// and transposes each square. +static void aes_nohw_transpose(AES_NOHW_BATCH *batch) { + // Swap bits with index 0 and 1 mod 2 (0x55 = 0b01010101). + aes_nohw_swap_bits(&batch->w[0], &batch->w[1], 0x55555555, 1); + aes_nohw_swap_bits(&batch->w[2], &batch->w[3], 0x55555555, 1); + aes_nohw_swap_bits(&batch->w[4], &batch->w[5], 0x55555555, 1); + aes_nohw_swap_bits(&batch->w[6], &batch->w[7], 0x55555555, 1); + +#if AES_NOHW_BATCH_SIZE >= 4 + // Swap bits with index 0-1 and 2-3 mod 4 (0x33 = 0b00110011). 
+  aes_nohw_swap_bits(&batch->w[0], &batch->w[2], 0x33333333, 2);
+  aes_nohw_swap_bits(&batch->w[1], &batch->w[3], 0x33333333, 2);
+  aes_nohw_swap_bits(&batch->w[4], &batch->w[6], 0x33333333, 2);
+  aes_nohw_swap_bits(&batch->w[5], &batch->w[7], 0x33333333, 2);
+#endif
+
+#if AES_NOHW_BATCH_SIZE >= 8
+  // Swap bits with index 0-3 and 4-7 mod 8 (0x0f = 0b00001111).
+  aes_nohw_swap_bits(&batch->w[0], &batch->w[4], 0x0f0f0f0f, 4);
+  aes_nohw_swap_bits(&batch->w[1], &batch->w[5], 0x0f0f0f0f, 4);
+  aes_nohw_swap_bits(&batch->w[2], &batch->w[6], 0x0f0f0f0f, 4);
+  aes_nohw_swap_bits(&batch->w[3], &batch->w[7], 0x0f0f0f0f, 4);
+#endif
+}
+
+// aes_nohw_to_batch initializes |out| with the |num_blocks| blocks from |in|.
+// |num_blocks| must be at most |AES_NOHW_BATCH_SIZE|.
+static void aes_nohw_to_batch(AES_NOHW_BATCH *out, const uint8_t *in,
+                              size_t num_blocks) {
+  // Don't leave unused blocks uninitialized.
+  OPENSSL_memset(out, 0, sizeof(AES_NOHW_BATCH));
+  debug_assert_nonsecret(num_blocks <= AES_NOHW_BATCH_SIZE);
+  for (size_t i = 0; i < num_blocks; i++) {
+    aes_word_t block[AES_NOHW_BLOCK_WORDS];
+    aes_nohw_compact_block(block, in + 16 * i);
+    aes_nohw_batch_set(out, block, i);
+  }
+
+  aes_nohw_transpose(out);
+}
+
+// aes_nohw_from_batch writes the first |num_blocks| blocks in |batch| to |out|.
+// |num_blocks| must be at most |AES_NOHW_BATCH_SIZE|.
+static void aes_nohw_from_batch(uint8_t *out, size_t num_blocks,
+                                const AES_NOHW_BATCH *batch) {
+  AES_NOHW_BATCH copy = *batch;
+  aes_nohw_transpose(&copy);
+
+  debug_assert_nonsecret(num_blocks <= AES_NOHW_BATCH_SIZE);
+  for (size_t i = 0; i < num_blocks; i++) {
+    aes_word_t block[AES_NOHW_BLOCK_WORDS];
+    aes_nohw_batch_get(&copy, block, i);
+    aes_nohw_uncompact_block(out + 16 * i, block);
+  }
+}
+
+
+// AES round steps.
+
+static void aes_nohw_add_round_key(AES_NOHW_BATCH *batch,
+                                   const AES_NOHW_BATCH *key) {
+  for (size_t i = 0; i < 8; i++) {
+    batch->w[i] = aes_nohw_xor(batch->w[i], key->w[i]);
+  }
+}
+
+static void aes_nohw_sub_bytes(AES_NOHW_BATCH *batch) {
+  // See https://eprint.iacr.org/2009/191.pdf, Appendix C.
+  aes_word_t x0 = batch->w[7];
+  aes_word_t x1 = batch->w[6];
+  aes_word_t x2 = batch->w[5];
+  aes_word_t x3 = batch->w[4];
+  aes_word_t x4 = batch->w[3];
+  aes_word_t x5 = batch->w[2];
+  aes_word_t x6 = batch->w[1];
+  aes_word_t x7 = batch->w[0];
+
+  // Figure 2, the top linear transformation.
+  aes_word_t y14 = aes_nohw_xor(x3, x5);
+  aes_word_t y13 = aes_nohw_xor(x0, x6);
+  aes_word_t y9 = aes_nohw_xor(x0, x3);
+  aes_word_t y8 = aes_nohw_xor(x0, x5);
+  aes_word_t t0 = aes_nohw_xor(x1, x2);
+  aes_word_t y1 = aes_nohw_xor(t0, x7);
+  aes_word_t y4 = aes_nohw_xor(y1, x3);
+  aes_word_t y12 = aes_nohw_xor(y13, y14);
+  aes_word_t y2 = aes_nohw_xor(y1, x0);
+  aes_word_t y5 = aes_nohw_xor(y1, x6);
+  aes_word_t y3 = aes_nohw_xor(y5, y8);
+  aes_word_t t1 = aes_nohw_xor(x4, y12);
+  aes_word_t y15 = aes_nohw_xor(t1, x5);
+  aes_word_t y20 = aes_nohw_xor(t1, x1);
+  aes_word_t y6 = aes_nohw_xor(y15, x7);
+  aes_word_t y10 = aes_nohw_xor(y15, t0);
+  aes_word_t y11 = aes_nohw_xor(y20, y9);
+  aes_word_t y7 = aes_nohw_xor(x7, y11);
+  aes_word_t y17 = aes_nohw_xor(y10, y11);
+  aes_word_t y19 = aes_nohw_xor(y10, y8);
+  aes_word_t y16 = aes_nohw_xor(t0, y11);
+  aes_word_t y21 = aes_nohw_xor(y13, y16);
+  aes_word_t y18 = aes_nohw_xor(x0, y16);
+
+  // Figure 3, the middle non-linear section.
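+  // All of the circuit's AND gates are in this section; the transformations
+  // before and after it use only XOR (plus a few NOTs at the very end).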
+ aes_word_t t2 = aes_nohw_and(y12, y15); + aes_word_t t3 = aes_nohw_and(y3, y6); + aes_word_t t4 = aes_nohw_xor(t3, t2); + aes_word_t t5 = aes_nohw_and(y4, x7); + aes_word_t t6 = aes_nohw_xor(t5, t2); + aes_word_t t7 = aes_nohw_and(y13, y16); + aes_word_t t8 = aes_nohw_and(y5, y1); + aes_word_t t9 = aes_nohw_xor(t8, t7); + aes_word_t t10 = aes_nohw_and(y2, y7); + aes_word_t t11 = aes_nohw_xor(t10, t7); + aes_word_t t12 = aes_nohw_and(y9, y11); + aes_word_t t13 = aes_nohw_and(y14, y17); + aes_word_t t14 = aes_nohw_xor(t13, t12); + aes_word_t t15 = aes_nohw_and(y8, y10); + aes_word_t t16 = aes_nohw_xor(t15, t12); + aes_word_t t17 = aes_nohw_xor(t4, t14); + aes_word_t t18 = aes_nohw_xor(t6, t16); + aes_word_t t19 = aes_nohw_xor(t9, t14); + aes_word_t t20 = aes_nohw_xor(t11, t16); + aes_word_t t21 = aes_nohw_xor(t17, y20); + aes_word_t t22 = aes_nohw_xor(t18, y19); + aes_word_t t23 = aes_nohw_xor(t19, y21); + aes_word_t t24 = aes_nohw_xor(t20, y18); + aes_word_t t25 = aes_nohw_xor(t21, t22); + aes_word_t t26 = aes_nohw_and(t21, t23); + aes_word_t t27 = aes_nohw_xor(t24, t26); + aes_word_t t28 = aes_nohw_and(t25, t27); + aes_word_t t29 = aes_nohw_xor(t28, t22); + aes_word_t t30 = aes_nohw_xor(t23, t24); + aes_word_t t31 = aes_nohw_xor(t22, t26); + aes_word_t t32 = aes_nohw_and(t31, t30); + aes_word_t t33 = aes_nohw_xor(t32, t24); + aes_word_t t34 = aes_nohw_xor(t23, t33); + aes_word_t t35 = aes_nohw_xor(t27, t33); + aes_word_t t36 = aes_nohw_and(t24, t35); + aes_word_t t37 = aes_nohw_xor(t36, t34); + aes_word_t t38 = aes_nohw_xor(t27, t36); + aes_word_t t39 = aes_nohw_and(t29, t38); + aes_word_t t40 = aes_nohw_xor(t25, t39); + aes_word_t t41 = aes_nohw_xor(t40, t37); + aes_word_t t42 = aes_nohw_xor(t29, t33); + aes_word_t t43 = aes_nohw_xor(t29, t40); + aes_word_t t44 = aes_nohw_xor(t33, t37); + aes_word_t t45 = aes_nohw_xor(t42, t41); + aes_word_t z0 = aes_nohw_and(t44, y15); + aes_word_t z1 = aes_nohw_and(t37, y6); + aes_word_t z2 = aes_nohw_and(t33, x7); + aes_word_t z3 = aes_nohw_and(t43, y16); + aes_word_t z4 = aes_nohw_and(t40, y1); + aes_word_t z5 = aes_nohw_and(t29, y7); + aes_word_t z6 = aes_nohw_and(t42, y11); + aes_word_t z7 = aes_nohw_and(t45, y17); + aes_word_t z8 = aes_nohw_and(t41, y10); + aes_word_t z9 = aes_nohw_and(t44, y12); + aes_word_t z10 = aes_nohw_and(t37, y3); + aes_word_t z11 = aes_nohw_and(t33, y4); + aes_word_t z12 = aes_nohw_and(t43, y13); + aes_word_t z13 = aes_nohw_and(t40, y5); + aes_word_t z14 = aes_nohw_and(t29, y2); + aes_word_t z15 = aes_nohw_and(t42, y9); + aes_word_t z16 = aes_nohw_and(t45, y14); + aes_word_t z17 = aes_nohw_and(t41, y8); + + // Figure 4, bottom linear transformation. 
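+  // Combine the shared products z0..z17 into the eight S-box output bits. The
+  // |aes_nohw_not|s account for the affine constant in the AES S-box.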
+ aes_word_t t46 = aes_nohw_xor(z15, z16); + aes_word_t t47 = aes_nohw_xor(z10, z11); + aes_word_t t48 = aes_nohw_xor(z5, z13); + aes_word_t t49 = aes_nohw_xor(z9, z10); + aes_word_t t50 = aes_nohw_xor(z2, z12); + aes_word_t t51 = aes_nohw_xor(z2, z5); + aes_word_t t52 = aes_nohw_xor(z7, z8); + aes_word_t t53 = aes_nohw_xor(z0, z3); + aes_word_t t54 = aes_nohw_xor(z6, z7); + aes_word_t t55 = aes_nohw_xor(z16, z17); + aes_word_t t56 = aes_nohw_xor(z12, t48); + aes_word_t t57 = aes_nohw_xor(t50, t53); + aes_word_t t58 = aes_nohw_xor(z4, t46); + aes_word_t t59 = aes_nohw_xor(z3, t54); + aes_word_t t60 = aes_nohw_xor(t46, t57); + aes_word_t t61 = aes_nohw_xor(z14, t57); + aes_word_t t62 = aes_nohw_xor(t52, t58); + aes_word_t t63 = aes_nohw_xor(t49, t58); + aes_word_t t64 = aes_nohw_xor(z4, t59); + aes_word_t t65 = aes_nohw_xor(t61, t62); + aes_word_t t66 = aes_nohw_xor(z1, t63); + aes_word_t s0 = aes_nohw_xor(t59, t63); + aes_word_t s6 = aes_nohw_xor(t56, aes_nohw_not(t62)); + aes_word_t s7 = aes_nohw_xor(t48, aes_nohw_not(t60)); + aes_word_t t67 = aes_nohw_xor(t64, t65); + aes_word_t s3 = aes_nohw_xor(t53, t66); + aes_word_t s4 = aes_nohw_xor(t51, t66); + aes_word_t s5 = aes_nohw_xor(t47, t65); + aes_word_t s1 = aes_nohw_xor(t64, aes_nohw_not(s3)); + aes_word_t s2 = aes_nohw_xor(t55, aes_nohw_not(t67)); + + batch->w[0] = s7; + batch->w[1] = s6; + batch->w[2] = s5; + batch->w[3] = s4; + batch->w[4] = s3; + batch->w[5] = s2; + batch->w[6] = s1; + batch->w[7] = s0; +} + +// aes_nohw_rotate_cols_right returns |v| with the columns in each row rotated +// to the right by |n|. This is a macro because |aes_nohw_shift_*| require +// constant shift counts in the SSE2 implementation. +#define aes_nohw_rotate_cols_right(/* aes_word_t */ v, /* const */ n) \ + (aes_nohw_or(aes_nohw_shift_right((v), (n)*4), \ + aes_nohw_shift_left((v), 16 - (n)*4))) + +static void aes_nohw_shift_rows(AES_NOHW_BATCH *batch) { + for (size_t i = 0; i < 8; i++) { + aes_word_t row0 = aes_nohw_and(batch->w[i], AES_NOHW_ROW0_MASK); + aes_word_t row1 = aes_nohw_and(batch->w[i], AES_NOHW_ROW1_MASK); + aes_word_t row2 = aes_nohw_and(batch->w[i], AES_NOHW_ROW2_MASK); + aes_word_t row3 = aes_nohw_and(batch->w[i], AES_NOHW_ROW3_MASK); + row1 = aes_nohw_rotate_cols_right(row1, 1); + row2 = aes_nohw_rotate_cols_right(row2, 2); + row3 = aes_nohw_rotate_cols_right(row3, 3); + batch->w[i] = aes_nohw_or(aes_nohw_or(row0, row1), aes_nohw_or(row2, row3)); + } +} + +// aes_nohw_rotate_rows_down returns |v| with the rows in each column rotated +// down by one. +static inline aes_word_t aes_nohw_rotate_rows_down(aes_word_t v) { +#if defined(OPENSSL_64_BIT) + return ((v >> 4) & UINT64_C(0x0fff0fff0fff0fff)) | + ((v << 12) & UINT64_C(0xf000f000f000f000)); +#else + return ((v >> 2) & 0x3f3f3f3f) | ((v << 6) & 0xc0c0c0c0); +#endif +} + +// aes_nohw_rotate_rows_twice returns |v| with the rows in each column rotated +// by two. +static inline aes_word_t aes_nohw_rotate_rows_twice(aes_word_t v) { +#if defined(OPENSSL_64_BIT) + return ((v >> 8) & UINT64_C(0x00ff00ff00ff00ff)) | + ((v << 8) & UINT64_C(0xff00ff00ff00ff00)); +#else + return ((v >> 4) & 0x0f0f0f0f) | ((v << 4) & 0xf0f0f0f0); +#endif +} + +static void aes_nohw_mix_columns(AES_NOHW_BATCH *batch) { + // See https://eprint.iacr.org/2009/129.pdf, section 4.4 and appendix A. 
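+  // In bitsliced form, each w[j] holds one bit plane of the state, so
+  // multiplying by x (xtime) moves data up one plane, with plane 7 feeding
+  // back into planes 0, 1, 3, and 4 (the 0x1b reduction).
+  // |aes_nohw_rotate_rows_down| aligns each byte with its neighbor in the
+  // column so the per-column sums can be formed with plain XORs.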
+ aes_word_t a0 = batch->w[0]; + aes_word_t a1 = batch->w[1]; + aes_word_t a2 = batch->w[2]; + aes_word_t a3 = batch->w[3]; + aes_word_t a4 = batch->w[4]; + aes_word_t a5 = batch->w[5]; + aes_word_t a6 = batch->w[6]; + aes_word_t a7 = batch->w[7]; + + aes_word_t r0 = aes_nohw_rotate_rows_down(a0); + aes_word_t a0_r0 = aes_nohw_xor(a0, r0); + aes_word_t r1 = aes_nohw_rotate_rows_down(a1); + aes_word_t a1_r1 = aes_nohw_xor(a1, r1); + aes_word_t r2 = aes_nohw_rotate_rows_down(a2); + aes_word_t a2_r2 = aes_nohw_xor(a2, r2); + aes_word_t r3 = aes_nohw_rotate_rows_down(a3); + aes_word_t a3_r3 = aes_nohw_xor(a3, r3); + aes_word_t r4 = aes_nohw_rotate_rows_down(a4); + aes_word_t a4_r4 = aes_nohw_xor(a4, r4); + aes_word_t r5 = aes_nohw_rotate_rows_down(a5); + aes_word_t a5_r5 = aes_nohw_xor(a5, r5); + aes_word_t r6 = aes_nohw_rotate_rows_down(a6); + aes_word_t a6_r6 = aes_nohw_xor(a6, r6); + aes_word_t r7 = aes_nohw_rotate_rows_down(a7); + aes_word_t a7_r7 = aes_nohw_xor(a7, r7); + + batch->w[0] = + aes_nohw_xor(aes_nohw_xor(a7_r7, r0), aes_nohw_rotate_rows_twice(a0_r0)); + batch->w[1] = + aes_nohw_xor(aes_nohw_xor(a0_r0, a7_r7), + aes_nohw_xor(r1, aes_nohw_rotate_rows_twice(a1_r1))); + batch->w[2] = + aes_nohw_xor(aes_nohw_xor(a1_r1, r2), aes_nohw_rotate_rows_twice(a2_r2)); + batch->w[3] = + aes_nohw_xor(aes_nohw_xor(a2_r2, a7_r7), + aes_nohw_xor(r3, aes_nohw_rotate_rows_twice(a3_r3))); + batch->w[4] = + aes_nohw_xor(aes_nohw_xor(a3_r3, a7_r7), + aes_nohw_xor(r4, aes_nohw_rotate_rows_twice(a4_r4))); + batch->w[5] = + aes_nohw_xor(aes_nohw_xor(a4_r4, r5), aes_nohw_rotate_rows_twice(a5_r5)); + batch->w[6] = + aes_nohw_xor(aes_nohw_xor(a5_r5, r6), aes_nohw_rotate_rows_twice(a6_r6)); + batch->w[7] = + aes_nohw_xor(aes_nohw_xor(a6_r6, r7), aes_nohw_rotate_rows_twice(a7_r7)); +} + +static void aes_nohw_encrypt_batch(const AES_NOHW_SCHEDULE *key, + size_t num_rounds, AES_NOHW_BATCH *batch) { + aes_nohw_add_round_key(batch, &key->keys[0]); + for (size_t i = 1; i < num_rounds; i++) { + aes_nohw_sub_bytes(batch); + aes_nohw_shift_rows(batch); + aes_nohw_mix_columns(batch); + aes_nohw_add_round_key(batch, &key->keys[i]); + } + aes_nohw_sub_bytes(batch); + aes_nohw_shift_rows(batch); + aes_nohw_add_round_key(batch, &key->keys[num_rounds]); +} + +// Key schedule. + +static void aes_nohw_expand_round_keys(AES_NOHW_SCHEDULE *out, + const AES_KEY *key) { + for (size_t i = 0; i <= key->rounds; i++) { + // Copy the round key into each block in the batch. + for (size_t j = 0; j < AES_NOHW_BATCH_SIZE; j++) { + aes_word_t tmp[AES_NOHW_BLOCK_WORDS]; + OPENSSL_memcpy(tmp, key->rd_key + 4 * i, 16); + aes_nohw_batch_set(&out->keys[i], tmp, j); + } + aes_nohw_transpose(&out->keys[i]); + } +} + +static const uint8_t aes_nohw_rcon[10] = {0x01, 0x02, 0x04, 0x08, 0x10, + 0x20, 0x40, 0x80, 0x1b, 0x36}; + +// aes_nohw_rcon_slice returns the |i|th group of |AES_NOHW_BATCH_SIZE| bits in +// |rcon|, stored in a |aes_word_t|. 
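+// On 64-bit words each slice is four bits wide, so |rcon| is split across
+// block[0] and block[1], mirroring how a compact block splits each byte.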
+static inline aes_word_t aes_nohw_rcon_slice(uint8_t rcon, size_t i) { + rcon = (rcon >> (i * AES_NOHW_BATCH_SIZE)) & ((1 << AES_NOHW_BATCH_SIZE) - 1); + return ((aes_word_t)rcon); +} + +static void aes_nohw_sub_block(aes_word_t out[AES_NOHW_BLOCK_WORDS], + const aes_word_t in[AES_NOHW_BLOCK_WORDS]) { + AES_NOHW_BATCH batch; + OPENSSL_memset(&batch, 0, sizeof(batch)); + aes_nohw_batch_set(&batch, in, 0); + aes_nohw_transpose(&batch); + aes_nohw_sub_bytes(&batch); + aes_nohw_transpose(&batch); + aes_nohw_batch_get(&batch, out, 0); +} + +static void aes_nohw_setup_key_128(AES_KEY *key, const uint8_t in[16]) { + key->rounds = 10; + + aes_word_t block[AES_NOHW_BLOCK_WORDS]; + aes_nohw_compact_block(block, in); + OPENSSL_memcpy(key->rd_key, block, 16); + + for (size_t i = 1; i <= 10; i++) { + aes_word_t sub[AES_NOHW_BLOCK_WORDS]; + aes_nohw_sub_block(sub, block); + uint8_t rcon = aes_nohw_rcon[i - 1]; + for (size_t j = 0; j < AES_NOHW_BLOCK_WORDS; j++) { + // Incorporate |rcon| and the transformed word into the first word. + block[j] = aes_nohw_xor(block[j], aes_nohw_rcon_slice(rcon, j)); + block[j] = aes_nohw_xor( + block[j], + aes_nohw_shift_right(aes_nohw_rotate_rows_down(sub[j]), 12)); + // Propagate to the remaining words. Note this is reordered from the usual + // formulation to avoid needing masks. + aes_word_t v = block[j]; + block[j] = aes_nohw_xor(block[j], aes_nohw_shift_left(v, 4)); + block[j] = aes_nohw_xor(block[j], aes_nohw_shift_left(v, 8)); + block[j] = aes_nohw_xor(block[j], aes_nohw_shift_left(v, 12)); + } + OPENSSL_memcpy(key->rd_key + 4 * i, block, 16); + } +} + +static void aes_nohw_setup_key_256(AES_KEY *key, const uint8_t in[32]) { + key->rounds = 14; + + // Each key schedule iteration produces two round keys. + aes_word_t block1[AES_NOHW_BLOCK_WORDS], block2[AES_NOHW_BLOCK_WORDS]; + aes_nohw_compact_block(block1, in); + OPENSSL_memcpy(key->rd_key, block1, 16); + + aes_nohw_compact_block(block2, in + 16); + OPENSSL_memcpy(key->rd_key + 4, block2, 16); + + for (size_t i = 2; i <= 14; i += 2) { + aes_word_t sub[AES_NOHW_BLOCK_WORDS]; + aes_nohw_sub_block(sub, block2); + uint8_t rcon = aes_nohw_rcon[i / 2 - 1]; + for (size_t j = 0; j < AES_NOHW_BLOCK_WORDS; j++) { + // Incorporate |rcon| and the transformed word into the first word. + block1[j] = aes_nohw_xor(block1[j], aes_nohw_rcon_slice(rcon, j)); + block1[j] = aes_nohw_xor( + block1[j], + aes_nohw_shift_right(aes_nohw_rotate_rows_down(sub[j]), 12)); + // Propagate to the remaining words. + aes_word_t v = block1[j]; + block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(v, 4)); + block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(v, 8)); + block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(v, 12)); + } + OPENSSL_memcpy(key->rd_key + 4 * i, block1, 16); + + if (i == 14) { + break; + } + + aes_nohw_sub_block(sub, block1); + for (size_t j = 0; j < AES_NOHW_BLOCK_WORDS; j++) { + // Incorporate the transformed word into the first word. + block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_right(sub[j], 12)); + // Propagate to the remaining words. + aes_word_t v = block2[j]; + block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_left(v, 4)); + block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_left(v, 8)); + block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_left(v, 12)); + } + OPENSSL_memcpy(key->rd_key + 4 * (i + 1), block2, 16); + } +} + + +// External API. 
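(Editor's note, not part of the patch: a minimal sketch of how a C caller might drive the entry points defined just below. It assumes only the AES_KEY type and the two declarations from this file; ring itself calls these functions from Rust.)

    #include <stdint.h>

    void encrypt_one_block(const uint8_t key_bytes[16], const uint8_t in[16],
                           uint8_t out[16]) {
      AES_KEY key;
      // aes_nohw_set_encrypt_key returns 0 on success; only 128- and 256-bit
      // keys are supported by this implementation.
      if (aes_nohw_set_encrypt_key(key_bytes, 128, &key) != 0) {
        return;
      }
      aes_nohw_encrypt(in, out, &key);
    }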
+ +int aes_nohw_set_encrypt_key(const uint8_t *key, unsigned bits, + AES_KEY *aeskey) { + switch (bits) { + case 128: + aes_nohw_setup_key_128(aeskey, key); + return 0; + case 256: + aes_nohw_setup_key_256(aeskey, key); + return 0; + } + return 1; +} + +void aes_nohw_encrypt(const uint8_t *in, uint8_t *out, const AES_KEY *key) { + AES_NOHW_SCHEDULE sched; + aes_nohw_expand_round_keys(&sched, key); + AES_NOHW_BATCH batch; + aes_nohw_to_batch(&batch, in, /*num_blocks=*/1); + aes_nohw_encrypt_batch(&sched, key->rounds, &batch); + aes_nohw_from_batch(out, /*num_blocks=*/1, &batch); +} + +static inline void aes_nohw_xor_block(uint8_t out[16], const uint8_t a[16], + const uint8_t b[16]) { + for (size_t i = 0; i < 16; i += sizeof(aes_word_t)) { + aes_word_t x, y; + OPENSSL_memcpy(&x, a + i, sizeof(aes_word_t)); + OPENSSL_memcpy(&y, b + i, sizeof(aes_word_t)); + x = aes_nohw_xor(x, y); + OPENSSL_memcpy(out + i, &x, sizeof(aes_word_t)); + } +} + +void aes_nohw_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out, + size_t blocks, const AES_KEY *key, + const uint8_t ivec[16]) { + if (blocks == 0) { + return; + } + + AES_NOHW_SCHEDULE sched; + aes_nohw_expand_round_keys(&sched, key); + + // Make |AES_NOHW_BATCH_SIZE| copies of |ivec|. + alignas(AES_NOHW_WORD_SIZE) uint8_t ivs[AES_NOHW_BATCH_SIZE * 16]; + alignas(AES_NOHW_WORD_SIZE) uint8_t enc_ivs[AES_NOHW_BATCH_SIZE * 16]; + for (size_t i = 0; i < AES_NOHW_BATCH_SIZE; i++) { + OPENSSL_memcpy(ivs + 16 * i, ivec, 16); + } + + uint32_t ctr = CRYPTO_load_u32_be(ivs + 12); + for (;;) { + // Update counters. + for (size_t i = 0; i < AES_NOHW_BATCH_SIZE; i++) { + CRYPTO_store_u32_be(ivs + 16 * i + 12, ctr + (uint32_t)i); + } + + size_t todo = blocks >= AES_NOHW_BATCH_SIZE ? AES_NOHW_BATCH_SIZE : blocks; + AES_NOHW_BATCH batch; + aes_nohw_to_batch(&batch, ivs, todo); + aes_nohw_encrypt_batch(&sched, key->rounds, &batch); + aes_nohw_from_batch(enc_ivs, todo, &batch); + + for (size_t i = 0; i < todo; i++) { + aes_nohw_xor_block(out + 16 * i, in + 16 * i, enc_ivs + 16 * i); + } + + blocks -= todo; + if (blocks == 0) { + break; + } + + in += 16 * AES_NOHW_BATCH_SIZE; + out += 16 * AES_NOHW_BATCH_SIZE; + ctr += AES_NOHW_BATCH_SIZE; + } +} diff --git a/ring-0.17.14/crypto/fipsmodule/aes/asm/aes-gcm-avx2-x86_64.pl b/ring-0.17.14/crypto/fipsmodule/aes/asm/aes-gcm-avx2-x86_64.pl new file mode 100644 index 0000000000..f4c546a70a --- /dev/null +++ b/ring-0.17.14/crypto/fipsmodule/aes/asm/aes-gcm-avx2-x86_64.pl @@ -0,0 +1,1038 @@ +#!/usr/bin/env perl +# Copyright 2024 The BoringSSL Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +#------------------------------------------------------------------------------ +# +# VAES and VPCLMULQDQ optimized AES-GCM for x86_64 (AVX2 version) +# +# This is similar to aes-gcm-avx10-x86_64.pl, but it uses AVX2 instead of AVX512 +# / AVX10. This means it can only use 16 vector registers instead of 32, the +# maximum vector length is 32 bytes, and some instructions such as vpternlogd +# and masked loads/stores are unavailable. 
However, it is able to run on CPUs +# that have VAES without AVX512 / AVX10, namely AMD Zen 3 (including "Milan" +# server processors) and some Intel client CPUs such as Alder Lake. +# +# This implementation also uses Karatsuba multiplication instead of schoolbook +# multiplication for GHASH in its main loop. This does not help much on Intel, +# but it improves performance by ~5% on AMD Zen 3 which is the main target for +# this implementation. Other factors weighing slightly in favor of Karatsuba +# multiplication in this implementation are the lower maximum vector length +# (which means there is space left in the Htable array to cache the halves of +# the key powers XOR'd together) and the unavailability of the vpternlogd +# instruction (which helped schoolbook a bit more than Karatsuba). + +use strict; + +my $flavour = shift; +my $output = shift; +if ( $flavour =~ /\./ ) { $output = $flavour; undef $flavour; } + +my $win64; +my @argregs; +if ( $flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/ ) { + $win64 = 1; + @argregs = ( "%rcx", "%rdx", "%r8", "%r9" ); +} +else { + $win64 = 0; + @argregs = ( "%rdi", "%rsi", "%rdx", "%rcx", "%r8", "%r9" ); +} + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; +my $dir = $1; +my $xlate; +( $xlate = "${dir}x86_64-xlate.pl" and -f $xlate ) + or ( $xlate = "${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate ) + or die "can't locate x86_64-xlate.pl"; + +open OUT, "| \"$^X\" \"$xlate\" $flavour \"$output\""; +*STDOUT = *OUT; + +my $g_cur_func_name; +my $g_cur_func_uses_seh; +my @g_cur_func_saved_gpregs; +my @g_cur_func_saved_xmmregs; + +sub _begin_func { + my ( $funcname, $uses_seh ) = @_; + $g_cur_func_name = $funcname; + $g_cur_func_uses_seh = $uses_seh; + @g_cur_func_saved_gpregs = (); + @g_cur_func_saved_xmmregs = (); + return <<___; +.globl $funcname +.type $funcname,\@abi-omnipotent +.align 32 +$funcname: + .cfi_startproc + @{[ $uses_seh ? ".seh_startproc" : "" ]} + _CET_ENDBR +___ +} + +# Push a list of general purpose registers onto the stack. +sub _save_gpregs { + my @gpregs = @_; + my $code = ""; + die "_save_gpregs requires uses_seh" unless $g_cur_func_uses_seh; + die "_save_gpregs can only be called once per function" + if @g_cur_func_saved_gpregs; + die "Order must be _save_gpregs, then _save_xmmregs" + if @g_cur_func_saved_xmmregs; + @g_cur_func_saved_gpregs = @gpregs; + for my $reg (@gpregs) { + $code .= "push $reg\n"; + if ($win64) { + $code .= ".seh_pushreg $reg\n"; + } + else { + $code .= ".cfi_push $reg\n"; + } + } + return $code; +} + +# Push a list of xmm registers onto the stack if the target is Windows. +sub _save_xmmregs { + my @xmmregs = @_; + my $num_xmmregs = scalar @xmmregs; + my $code = ""; + die "_save_xmmregs requires uses_seh" unless $g_cur_func_uses_seh; + die "_save_xmmregs can only be called once per function" + if @g_cur_func_saved_xmmregs; + if ( $win64 and $num_xmmregs > 0 ) { + @g_cur_func_saved_xmmregs = @xmmregs; + my $is_misaligned = ( scalar @g_cur_func_saved_gpregs ) % 2 == 0; + my $alloc_size = 16 * $num_xmmregs + ( $is_misaligned ? 8 : 0 ); + $code .= "sub \$$alloc_size, %rsp\n"; + $code .= ".seh_stackalloc $alloc_size\n"; + for my $i ( 0 .. $num_xmmregs - 1 ) { + my $reg_num = $xmmregs[$i]; + my $pos = 16 * $i; + $code .= "movdqa %xmm$reg_num, $pos(%rsp)\n"; + $code .= ".seh_savexmm %xmm$reg_num, $pos\n"; + } + } + return $code; +} + +sub _end_func { + my $code = ""; + + # Restore any xmm registers that were saved earlier. 
+ my $num_xmmregs = scalar @g_cur_func_saved_xmmregs; + if ( $win64 and $num_xmmregs > 0 ) { + my $need_alignment = ( scalar @g_cur_func_saved_gpregs ) % 2 == 0; + my $alloc_size = 16 * $num_xmmregs + ( $need_alignment ? 8 : 0 ); + for my $i ( 0 .. $num_xmmregs - 1 ) { + my $reg_num = $g_cur_func_saved_xmmregs[$i]; + my $pos = 16 * $i; + $code .= "movdqa $pos(%rsp), %xmm$reg_num\n"; + } + $code .= "add \$$alloc_size, %rsp\n"; + } + + # Restore any general purpose registers that were saved earlier. + for my $reg ( reverse @g_cur_func_saved_gpregs ) { + $code .= "pop $reg\n"; + if ( !$win64 ) { + $code .= ".cfi_pop $reg\n"; + } + } + + $code .= <<___; + ret + @{[ $g_cur_func_uses_seh ? ".seh_endproc" : "" ]} + .cfi_endproc + .size $g_cur_func_name, . - $g_cur_func_name +___ + return $code; +} + +my $code = <<___; +.section .rodata +.align 16 + + # A shuffle mask that reflects the bytes of 16-byte blocks +.Lbswap_mask: + .quad 0x08090a0b0c0d0e0f, 0x0001020304050607 + + # This is the GHASH reducing polynomial without its constant term, i.e. + # x^128 + x^7 + x^2 + x, represented using the backwards mapping + # between bits and polynomial coefficients. + # + # Alternatively, it can be interpreted as the naturally-ordered + # representation of the polynomial x^127 + x^126 + x^121 + 1, i.e. the + # "reversed" GHASH reducing polynomial without its x^128 term. +.Lgfpoly: + .quad 1, 0xc200000000000000 + + # Same as above, but with the (1 << 64) bit set. +.Lgfpoly_and_internal_carrybit: + .quad 1, 0xc200000000000001 + +.align 32 + # The below constants are used for incrementing the counter blocks. +.Lctr_pattern: + .quad 0, 0 + .quad 1, 0 +.Linc_2blocks: + .quad 2, 0 + .quad 2, 0 + +.text +___ + +# We use Htable[0..7] to store H^8 through H^1, and Htable[8..11] to store the +# 64-bit halves of the key powers XOR'd together (for Karatsuba multiplication) +# in the order 8,6,7,5,4,2,3,1. We do not use Htable[12..15]. +my $NUM_H_POWERS = 8; +my $OFFSETOFEND_H_POWERS = $NUM_H_POWERS * 16; +my $OFFSETOF_H_POWERS_XORED = $OFFSETOFEND_H_POWERS; + +# Offset to 'rounds' in AES_KEY struct +my $OFFSETOF_AES_ROUNDS = 240; + +# GHASH-multiply the 128-bit lanes of \a by the 128-bit lanes of \b and store +# the reduced products in \dst. Uses schoolbook multiplication. +sub _ghash_mul { + my ( $a, $b, $dst, $gfpoly, $t0, $t1, $t2 ) = @_; + return <<___; + vpclmulqdq \$0x00, $a, $b, $t0 # LO = a_L * b_L + vpclmulqdq \$0x01, $a, $b, $t1 # MI_0 = a_L * b_H + vpclmulqdq \$0x10, $a, $b, $t2 # MI_1 = a_H * b_L + vpxor $t2, $t1, $t1 # MI = MI_0 + MI_1 + vpclmulqdq \$0x01, $t0, $gfpoly, $t2 # LO_L*(x^63 + x^62 + x^57) + vpshufd \$0x4e, $t0, $t0 # Swap halves of LO + vpxor $t0, $t1, $t1 # Fold LO into MI (part 1) + vpxor $t2, $t1, $t1 # Fold LO into MI (part 2) + vpclmulqdq \$0x11, $a, $b, $dst # HI = a_H * b_H + vpclmulqdq \$0x01, $t1, $gfpoly, $t0 # MI_L*(x^63 + x^62 + x^57) + vpshufd \$0x4e, $t1, $t1 # Swap halves of MI + vpxor $t1, $dst, $dst # Fold MI into HI (part 1) + vpxor $t0, $dst, $dst # Fold MI into HI (part 2) +___ +} + +# void gcm_init_vpclmulqdq_avx2(u128 Htable[16], const uint64_t H[2]); +# +# Initialize |Htable| with powers of the GHASH subkey |H|. +# +# We use Htable[0..7] to store H^8 through H^1, and Htable[8..11] to store the +# 64-bit halves of the key powers XOR'd together (for Karatsuba multiplication) +# in the order 8,6,7,5,4,2,3,1. We do not use Htable[12..15]. +$code .= _begin_func "gcm_init_vpclmulqdq_avx2", 1; +{ + my ( $HTABLE, $H_PTR ) = @argregs[ 0 .. 
1 ]; + my ( $TMP0, $TMP0_XMM ) = ( "%ymm0", "%xmm0" ); + my ( $TMP1, $TMP1_XMM ) = ( "%ymm1", "%xmm1" ); + my ( $TMP2, $TMP2_XMM ) = ( "%ymm2", "%xmm2" ); + my ( $H_CUR, $H_CUR_XMM ) = ( "%ymm3", "%xmm3" ); + my ( $H_CUR2, $H_CUR2_XMM ) = ( "%ymm4", "%xmm4" ); + my ( $H_INC, $H_INC_XMM ) = ( "%ymm5", "%xmm5" ); + my ( $GFPOLY, $GFPOLY_XMM ) = ( "%ymm6", "%xmm6" ); + + $code .= <<___; + @{[ _save_xmmregs (6) ]} + .seh_endprologue + + # Load the byte-reflected hash subkey. BoringSSL provides it in + # byte-reflected form except the two halves are in the wrong order. + vpshufd \$0x4e, ($H_PTR), $H_CUR_XMM + + # Finish preprocessing the byte-reflected hash subkey by multiplying it by + # x^-1 ("standard" interpretation of polynomial coefficients) or + # equivalently x^1 (natural interpretation). This gets the key into a + # format that avoids having to bit-reflect the data blocks later. + vpshufd \$0xd3, $H_CUR_XMM, $TMP0_XMM + vpsrad \$31, $TMP0_XMM, $TMP0_XMM + vpaddq $H_CUR_XMM, $H_CUR_XMM, $H_CUR_XMM + vpand .Lgfpoly_and_internal_carrybit(%rip), $TMP0_XMM, $TMP0_XMM + vpxor $TMP0_XMM, $H_CUR_XMM, $H_CUR_XMM + + vbroadcasti128 .Lgfpoly(%rip), $GFPOLY + + # Square H^1 to get H^2. + @{[ _ghash_mul $H_CUR_XMM, $H_CUR_XMM, $H_INC_XMM, $GFPOLY_XMM, + $TMP0_XMM, $TMP1_XMM, $TMP2_XMM ]} + + # Create H_CUR = [H^2, H^1] and H_INC = [H^2, H^2]. + vinserti128 \$1, $H_CUR_XMM, $H_INC, $H_CUR + vinserti128 \$1, $H_INC_XMM, $H_INC, $H_INC + + # Compute H_CUR2 = [H^4, H^3]. + @{[ _ghash_mul $H_INC, $H_CUR, $H_CUR2, $GFPOLY, $TMP0, $TMP1, $TMP2 ]} + + # Store [H^2, H^1] and [H^4, H^3]. + vmovdqu $H_CUR, 3*32($HTABLE) + vmovdqu $H_CUR2, 2*32($HTABLE) + + # For Karatsuba multiplication: compute and store the two 64-bit halves of + # each key power XOR'd together. Order is 4,2,3,1. + vpunpcklqdq $H_CUR, $H_CUR2, $TMP0 + vpunpckhqdq $H_CUR, $H_CUR2, $TMP1 + vpxor $TMP1, $TMP0, $TMP0 + vmovdqu $TMP0, $OFFSETOF_H_POWERS_XORED+32($HTABLE) + + # Compute and store H_CUR = [H^6, H^5] and H_CUR2 = [H^8, H^7]. + @{[ _ghash_mul $H_INC, $H_CUR2, $H_CUR, $GFPOLY, $TMP0, $TMP1, $TMP2 ]} + @{[ _ghash_mul $H_INC, $H_CUR, $H_CUR2, $GFPOLY, $TMP0, $TMP1, $TMP2 ]} + vmovdqu $H_CUR, 1*32($HTABLE) + vmovdqu $H_CUR2, 0*32($HTABLE) + + # Again, compute and store the two 64-bit halves of each key power XOR'd + # together. Order is 8,6,7,5. + vpunpcklqdq $H_CUR, $H_CUR2, $TMP0 + vpunpckhqdq $H_CUR, $H_CUR2, $TMP1 + vpxor $TMP1, $TMP0, $TMP0 + vmovdqu $TMP0, $OFFSETOF_H_POWERS_XORED($HTABLE) + + vzeroupper +___ +} +$code .= _end_func; + +# Do one step of the GHASH update of four vectors of data blocks. +# $i: the step to do, 0 through 9 +# $ghashdata_ptr: pointer to the data blocks (ciphertext or AAD) +# $htable: pointer to the Htable for the key +# $bswap_mask: mask for reflecting the bytes of blocks +# $h_pow[2-1]_xored: XOR'd key powers cached from Htable +# $tmp[0-2]: temporary registers. $tmp[1-2] must be preserved across steps. 
+# $lo, $mi: working state for this macro that must be preserved across steps +# $ghash_acc: the GHASH accumulator (input/output) +sub _ghash_step_4x { + my ( + $i, $ghashdata_ptr, $htable, $bswap_mask, + $h_pow2_xored, $h_pow1_xored, $tmp0, $tmp0_xmm, + $tmp1, $tmp2, $lo, $mi, + $ghash_acc, $ghash_acc_xmm + ) = @_; + my ( $hi, $hi_xmm ) = ( $ghash_acc, $ghash_acc_xmm ); # alias + if ( $i == 0 ) { + return <<___; + # First vector + vmovdqu 0*32($ghashdata_ptr), $tmp1 + vpshufb $bswap_mask, $tmp1, $tmp1 + vmovdqu 0*32($htable), $tmp2 + vpxor $ghash_acc, $tmp1, $tmp1 + vpclmulqdq \$0x00, $tmp2, $tmp1, $lo + vpclmulqdq \$0x11, $tmp2, $tmp1, $hi + vpunpckhqdq $tmp1, $tmp1, $tmp0 + vpxor $tmp1, $tmp0, $tmp0 + vpclmulqdq \$0x00, $h_pow2_xored, $tmp0, $mi +___ + } + elsif ( $i == 1 ) { + return <<___; +___ + } + elsif ( $i == 2 ) { + return <<___; + # Second vector + vmovdqu 1*32($ghashdata_ptr), $tmp1 + vpshufb $bswap_mask, $tmp1, $tmp1 + vmovdqu 1*32($htable), $tmp2 + vpclmulqdq \$0x00, $tmp2, $tmp1, $tmp0 + vpxor $tmp0, $lo, $lo + vpclmulqdq \$0x11, $tmp2, $tmp1, $tmp0 + vpxor $tmp0, $hi, $hi + vpunpckhqdq $tmp1, $tmp1, $tmp0 + vpxor $tmp1, $tmp0, $tmp0 + vpclmulqdq \$0x10, $h_pow2_xored, $tmp0, $tmp0 + vpxor $tmp0, $mi, $mi +___ + } + elsif ( $i == 3 ) { + return <<___; + # Third vector + vmovdqu 2*32($ghashdata_ptr), $tmp1 + vpshufb $bswap_mask, $tmp1, $tmp1 + vmovdqu 2*32($htable), $tmp2 +___ + } + elsif ( $i == 4 ) { + return <<___; + vpclmulqdq \$0x00, $tmp2, $tmp1, $tmp0 + vpxor $tmp0, $lo, $lo + vpclmulqdq \$0x11, $tmp2, $tmp1, $tmp0 + vpxor $tmp0, $hi, $hi +___ + } + elsif ( $i == 5 ) { + return <<___; + vpunpckhqdq $tmp1, $tmp1, $tmp0 + vpxor $tmp1, $tmp0, $tmp0 + vpclmulqdq \$0x00, $h_pow1_xored, $tmp0, $tmp0 + vpxor $tmp0, $mi, $mi + + # Fourth vector + vmovdqu 3*32($ghashdata_ptr), $tmp1 + vpshufb $bswap_mask, $tmp1, $tmp1 +___ + } + elsif ( $i == 6 ) { + return <<___; + vmovdqu 3*32($htable), $tmp2 + vpclmulqdq \$0x00, $tmp2, $tmp1, $tmp0 + vpxor $tmp0, $lo, $lo + vpclmulqdq \$0x11, $tmp2, $tmp1, $tmp0 + vpxor $tmp0, $hi, $hi + vpunpckhqdq $tmp1, $tmp1, $tmp0 + vpxor $tmp1, $tmp0, $tmp0 + vpclmulqdq \$0x10, $h_pow1_xored, $tmp0, $tmp0 + vpxor $tmp0, $mi, $mi +___ + } + elsif ( $i == 7 ) { + return <<___; + # Finalize 'mi' following Karatsuba multiplication. + vpxor $lo, $mi, $mi + vpxor $hi, $mi, $mi + + # Fold lo into mi. + vbroadcasti128 .Lgfpoly(%rip), $tmp2 + vpclmulqdq \$0x01, $lo, $tmp2, $tmp0 + vpshufd \$0x4e, $lo, $lo + vpxor $lo, $mi, $mi + vpxor $tmp0, $mi, $mi +___ + } + elsif ( $i == 8 ) { + return <<___; + # Fold mi into hi. + vpclmulqdq \$0x01, $mi, $tmp2, $tmp0 + vpshufd \$0x4e, $mi, $mi + vpxor $mi, $hi, $hi + vpxor $tmp0, $hi, $hi +___ + } + elsif ( $i == 9 ) { + return <<___; + vextracti128 \$1, $hi, $tmp0_xmm + vpxor $tmp0_xmm, $hi_xmm, $ghash_acc_xmm +___ + } +} + +sub _ghash_4x { + my $code = ""; + for my $i ( 0 .. 9 ) { + $code .= _ghash_step_4x $i, @_; + } + return $code; +} + +# void gcm_ghash_vpclmulqdq_avx2(uint8_t Xi[16], const u128 Htable[16], +# const uint8_t *in, size_t len); +# +# Using the key |Htable|, update the GHASH accumulator |Xi| with the data given +# by |in| and |len|. |len| must be exactly 16. +$code .= _begin_func "gcm_ghash_vpclmulqdq_avx2_1", 1; +{ + # Function arguments + my ( $GHASH_ACC_PTR, $HTABLE, $AAD, $AADLEN ) = @argregs[ 0 .. 
3 ]; + + # Additional local variables + my ( $TMP0, $TMP0_XMM ) = ( "%ymm0", "%xmm0" ); + my ( $TMP1, $TMP1_XMM ) = ( "%ymm1", "%xmm1" ); + my ( $TMP2, $TMP2_XMM ) = ( "%ymm2", "%xmm2" ); + my ( $LO, $LO_XMM ) = ( "%ymm3", "%xmm3" ); + my ( $MI, $MI_XMM ) = ( "%ymm4", "%xmm4" ); + my ( $GHASH_ACC, $GHASH_ACC_XMM ) = ( "%ymm5", "%xmm5" ); + my ( $BSWAP_MASK, $BSWAP_MASK_XMM ) = ( "%ymm6", "%xmm6" ); + my ( $GFPOLY, $GFPOLY_XMM ) = ( "%ymm7", "%xmm7" ); + my $H_POW2_XORED = "%ymm8"; + my $H_POW1_XORED = "%ymm9"; + + $code .= <<___; + @{[ _save_xmmregs (6 .. 9) ]} + .seh_endprologue + + # Load the bswap_mask and gfpoly constants. Since AADLEN is usually small, + # usually only 128-bit vectors will be used. So as an optimization, don't + # broadcast these constants to both 128-bit lanes quite yet. + vmovdqu .Lbswap_mask(%rip), $BSWAP_MASK_XMM + vmovdqu .Lgfpoly(%rip), $GFPOLY_XMM + + # Load the GHASH accumulator. + vmovdqu ($GHASH_ACC_PTR), $GHASH_ACC_XMM + vpshufb $BSWAP_MASK_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM + + + # Update GHASH with the remaining 16-byte block if any. +.Lghash_lastblock: + vmovdqu ($AAD), $TMP0_XMM + vpshufb $BSWAP_MASK_XMM, $TMP0_XMM, $TMP0_XMM + vpxor $TMP0_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM + vmovdqu $OFFSETOFEND_H_POWERS-16($HTABLE), $TMP0_XMM + @{[ _ghash_mul $TMP0_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM, $GFPOLY_XMM, + $TMP1_XMM, $TMP2_XMM, $LO_XMM ]} + +.Lghash_done: + # Store the updated GHASH accumulator back to memory. + vpshufb $BSWAP_MASK_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM + vmovdqu $GHASH_ACC_XMM, ($GHASH_ACC_PTR) + + vzeroupper +___ +} +$code .= _end_func; + +sub _vaesenc_4x { + my ( $round_key, $aesdata0, $aesdata1, $aesdata2, $aesdata3 ) = @_; + return <<___; + vaesenc $round_key, $aesdata0, $aesdata0 + vaesenc $round_key, $aesdata1, $aesdata1 + vaesenc $round_key, $aesdata2, $aesdata2 + vaesenc $round_key, $aesdata3, $aesdata3 +___ +} + +sub _ctr_begin_4x { + my ( + $le_ctr, $bswap_mask, $rndkey0, $aesdata0, + $aesdata1, $aesdata2, $aesdata3, $tmp + ) = @_; + return <<___; + # Increment le_ctr four times to generate four vectors of little-endian + # counter blocks, swap each to big-endian, and store them in aesdata[0-3]. + vmovdqu .Linc_2blocks(%rip), $tmp + vpshufb $bswap_mask, $le_ctr, $aesdata0 + vpaddd $tmp, $le_ctr, $le_ctr + vpshufb $bswap_mask, $le_ctr, $aesdata1 + vpaddd $tmp, $le_ctr, $le_ctr + vpshufb $bswap_mask, $le_ctr, $aesdata2 + vpaddd $tmp, $le_ctr, $le_ctr + vpshufb $bswap_mask, $le_ctr, $aesdata3 + vpaddd $tmp, $le_ctr, $le_ctr + + # AES "round zero": XOR in the zero-th round key. + vpxor $rndkey0, $aesdata0, $aesdata0 + vpxor $rndkey0, $aesdata1, $aesdata1 + vpxor $rndkey0, $aesdata2, $aesdata2 + vpxor $rndkey0, $aesdata3, $aesdata3 +___ +} + +# Do the last AES round for four vectors of counter blocks, XOR four vectors of +# source data with the resulting keystream blocks, and write the result to the +# destination buffer. The implementation differs slightly as it takes advantage +# of the property vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a) to reduce +# latency, but it has the same effect. 
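(Editor's note, not part of the patch: the rewrite relies on AESENCLAST ending with an XOR by the round key, so vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a). A sketch of a check of that identity using the SSE AES-NI intrinsics, assuming a compiler invoked with -maes:)

    #include <emmintrin.h>
    #include <wmmintrin.h>

    // Returns 1 if the identity holds for the given inputs.
    int aesenclast_xor_identity(__m128i a, __m128i key, __m128i b) {
      __m128i lhs = _mm_xor_si128(_mm_aesenclast_si128(a, key), b);
      __m128i rhs = _mm_aesenclast_si128(a, _mm_xor_si128(key, b));
      return _mm_movemask_epi8(_mm_cmpeq_epi8(lhs, rhs)) == 0xFFFF;
    }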
+sub _aesenclast_and_xor_4x { + my ( + $src, $dst, $rndkeylast, $aesdata0, + $aesdata1, $aesdata2, $aesdata3, $t0, + $t1, $t2, $t3 + ) = @_; + return <<___; + vpxor 0*32($src), $rndkeylast, $t0 + vpxor 1*32($src), $rndkeylast, $t1 + vpxor 2*32($src), $rndkeylast, $t2 + vpxor 3*32($src), $rndkeylast, $t3 + vaesenclast $t0, $aesdata0, $aesdata0 + vaesenclast $t1, $aesdata1, $aesdata1 + vaesenclast $t2, $aesdata2, $aesdata2 + vaesenclast $t3, $aesdata3, $aesdata3 + vmovdqu $aesdata0, 0*32($dst) + vmovdqu $aesdata1, 1*32($dst) + vmovdqu $aesdata2, 2*32($dst) + vmovdqu $aesdata3, 3*32($dst) +___ +} + +my $g_update_macro_expansion_count = 0; + +# void aes_gcm_{enc,dec}_update_vaes_avx2(const uint8_t *in, uint8_t *out, +# size_t len, const AES_KEY *key, +# const uint8_t ivec[16], +# const u128 Htable[16], +# uint8_t Xi[16]); +# +# This macro generates a GCM encryption or decryption update function with the +# above prototype (with \enc selecting which one). The function computes the +# next portion of the CTR keystream, XOR's it with |len| bytes from |in|, and +# writes the resulting encrypted or decrypted data to |out|. It also updates +# the GHASH accumulator |Xi| using the next |len| ciphertext bytes. +# +# |len| must be a multiple of 16. The caller must do any buffering needed to +# ensure this. Both in-place and out-of-place en/decryption are supported. +# +# |ivec| must give the current counter in big-endian format. This function +# loads the counter from |ivec| and increments the loaded counter as needed, but +# it does *not* store the updated counter back to |ivec|. The caller must +# update |ivec| if any more data segments follow. Internally, only the low +# 32-bit word of the counter is incremented, following the GCM standard. +sub _aes_gcm_update { + my $local_label_suffix = "__func" . ++$g_update_macro_expansion_count; + my ($enc) = @_; + my $code = ""; + + # Function arguments + my ( $SRC, $DST, $DATALEN, $AESKEY, $BE_CTR_PTR, $HTABLE, $GHASH_ACC_PTR ) + = $win64 + ? ( @argregs[ 0 .. 3 ], "%rsi", "%rdi", "%r12" ) + : ( @argregs[ 0 .. 5 ], "%r12" ); + + # Additional local variables. + # %rax is used as a temporary register. BE_CTR_PTR is also available as a + # temporary register after the counter is loaded. + + # AES key length in bytes + my ( $AESKEYLEN, $AESKEYLEN64 ) = ( "%r10d", "%r10" ); + + # Pointer to the last AES round key for the chosen AES variant + my $RNDKEYLAST_PTR = "%r11"; + + # BSWAP_MASK is the shuffle mask for byte-reflecting 128-bit values + # using vpshufb, copied to all 128-bit lanes. + my ( $BSWAP_MASK, $BSWAP_MASK_XMM ) = ( "%ymm0", "%xmm0" ); + + # GHASH_ACC is the accumulator variable for GHASH. When fully reduced, + # only the lowest 128-bit lane can be nonzero. When not fully reduced, + # more than one lane may be used, and they need to be XOR'd together. + my ( $GHASH_ACC, $GHASH_ACC_XMM ) = ( "%ymm1", "%xmm1" ); + + # TMP[0-2] are temporary registers. + my ( $TMP0, $TMP0_XMM ) = ( "%ymm2", "%xmm2" ); + my ( $TMP1, $TMP1_XMM ) = ( "%ymm3", "%xmm3" ); + my ( $TMP2, $TMP2_XMM ) = ( "%ymm4", "%xmm4" ); + + # LO and MI are used to accumulate unreduced GHASH products. + my ( $LO, $LO_XMM ) = ( "%ymm5", "%xmm5" ); + my ( $MI, $MI_XMM ) = ( "%ymm6", "%xmm6" ); + + # Cached key powers from Htable + my ( $H_POW2_XORED, $H_POW2_XORED_XMM ) = ( "%ymm7", "%xmm7" ); + my ( $H_POW1_XORED, $H_POW1_XORED_XMM ) = ( "%ymm8", "%xmm8" ); + + # RNDKEY0 caches the zero-th round key, and RNDKEYLAST the last one. 
+ my $RNDKEY0 = "%ymm9"; + my $RNDKEYLAST = "%ymm10"; + + # LE_CTR contains the next set of little-endian counter blocks. + my $LE_CTR = "%ymm11"; + + # AESDATA[0-3] hold the counter blocks that are being encrypted by AES. + my ( $AESDATA0, $AESDATA0_XMM ) = ( "%ymm12", "%xmm12" ); + my ( $AESDATA1, $AESDATA1_XMM ) = ( "%ymm13", "%xmm13" ); + my ( $AESDATA2, $AESDATA2_XMM ) = ( "%ymm14", "%xmm14" ); + my ( $AESDATA3, $AESDATA3_XMM ) = ( "%ymm15", "%xmm15" ); + my @AESDATA = ( $AESDATA0, $AESDATA1, $AESDATA2, $AESDATA3 ); + + my @ghash_4x_args = ( + $enc ? $DST : $SRC, $HTABLE, $BSWAP_MASK, $H_POW2_XORED, + $H_POW1_XORED, $TMP0, $TMP0_XMM, $TMP1, + $TMP2, $LO, $MI, $GHASH_ACC, + $GHASH_ACC_XMM + ); + + if ($win64) { + $code .= <<___; + @{[ _save_gpregs $BE_CTR_PTR, $HTABLE, $GHASH_ACC_PTR ]} + mov 64(%rsp), $BE_CTR_PTR # arg5 + mov 72(%rsp), $HTABLE # arg6 + mov 80(%rsp), $GHASH_ACC_PTR # arg7 + @{[ _save_xmmregs (6 .. 15) ]} + .seh_endprologue +___ + } + else { + $code .= <<___; + @{[ _save_gpregs $GHASH_ACC_PTR ]} + mov 16(%rsp), $GHASH_ACC_PTR # arg7 +___ + } + + if ($enc) { + $code .= <<___; +#ifdef BORINGSSL_DISPATCH_TEST + .extern BORINGSSL_function_hit + movb \$1,BORINGSSL_function_hit+8(%rip) +#endif +___ + } + $code .= <<___; + vbroadcasti128 .Lbswap_mask(%rip), $BSWAP_MASK + + # Load the GHASH accumulator and the starting counter. + # BoringSSL passes these values in big endian format. + vmovdqu ($GHASH_ACC_PTR), $GHASH_ACC_XMM + vpshufb $BSWAP_MASK_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM + vbroadcasti128 ($BE_CTR_PTR), $LE_CTR + vpshufb $BSWAP_MASK, $LE_CTR, $LE_CTR + + # Load the AES key length in bytes. BoringSSL stores number of rounds + # minus 1, so convert using: AESKEYLEN = 4 * aeskey->rounds - 20. + movl $OFFSETOF_AES_ROUNDS($AESKEY), $AESKEYLEN + lea -20(,$AESKEYLEN,4), $AESKEYLEN + + # Make RNDKEYLAST_PTR point to the last AES round key. This is the + # round key with index 10, 12, or 14 for AES-128, AES-192, or AES-256 + # respectively. Then load the zero-th and last round keys. + lea 6*16($AESKEY,$AESKEYLEN64,4), $RNDKEYLAST_PTR + vbroadcasti128 ($AESKEY), $RNDKEY0 + vbroadcasti128 ($RNDKEYLAST_PTR), $RNDKEYLAST + + # Finish initializing LE_CTR by adding 1 to the second block. + vpaddd .Lctr_pattern(%rip), $LE_CTR, $LE_CTR + + # If there are at least 128 bytes of data, then continue into the loop that + # processes 128 bytes of data at a time. Otherwise skip it. + cmp \$127, $DATALEN + jbe .Lcrypt_loop_4x_done$local_label_suffix + + vmovdqu $OFFSETOF_H_POWERS_XORED($HTABLE), $H_POW2_XORED + vmovdqu $OFFSETOF_H_POWERS_XORED+32($HTABLE), $H_POW1_XORED +___ + + # Main loop: en/decrypt and hash 4 vectors (128 bytes) at a time. + + if ($enc) { + $code .= <<___; + # Encrypt the first 4 vectors of plaintext blocks. + @{[ _ctr_begin_4x $LE_CTR, $BSWAP_MASK, $RNDKEY0, @AESDATA, $TMP0 ]} + lea 16($AESKEY), %rax +.Lvaesenc_loop_first_4_vecs$local_label_suffix: + vbroadcasti128 (%rax), $TMP0 + @{[ _vaesenc_4x $TMP0, @AESDATA ]} + add \$16, %rax + cmp %rax, $RNDKEYLAST_PTR + jne .Lvaesenc_loop_first_4_vecs$local_label_suffix + @{[ _aesenclast_and_xor_4x $SRC, $DST, $RNDKEYLAST, @AESDATA, + $TMP0, $TMP1, $LO, $MI ]} + sub \$-128, $SRC # 128 is 4 bytes, -128 is 1 byte + add \$-128, $DATALEN + cmp \$127, $DATALEN + jbe .Lghash_last_ciphertext_4x$local_label_suffix +___ + } + + $code .= <<___; +.align 16 +.Lcrypt_loop_4x$local_label_suffix: + + # Start the AES encryption of the counter blocks. 
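+    # The round count depends on the key length, so the extra initial rounds
+    # are dispatched here: AES-256 runs two extra pairs of rounds and falls
+    # through into the AES-192 rounds, which fall through into the common
+    # AES-128 tail.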
+ @{[ _ctr_begin_4x $LE_CTR, $BSWAP_MASK, $RNDKEY0, @AESDATA, $TMP0 ]} + cmp \$24, $AESKEYLEN + jl .Laes128$local_label_suffix + je .Laes192$local_label_suffix + # AES-256 + vbroadcasti128 -13*16($RNDKEYLAST_PTR), $TMP0 + @{[ _vaesenc_4x $TMP0, @AESDATA ]} + vbroadcasti128 -12*16($RNDKEYLAST_PTR), $TMP0 + @{[ _vaesenc_4x $TMP0, @AESDATA ]} +.Laes192$local_label_suffix: + vbroadcasti128 -11*16($RNDKEYLAST_PTR), $TMP0 + @{[ _vaesenc_4x $TMP0, @AESDATA ]} + vbroadcasti128 -10*16($RNDKEYLAST_PTR), $TMP0 + @{[ _vaesenc_4x $TMP0, @AESDATA ]} +.Laes128$local_label_suffix: +___ + + # Prefetch the source data 512 bytes ahead into the L1 data cache, to + # improve performance when the hardware prefetcher is disabled. Assumes the + # L1 data cache line size is 64 bytes (de facto standard on x86_64). + $code .= "prefetcht0 512($SRC)\n"; + $code .= "prefetcht0 512+64($SRC)\n"; + + # Finish the AES encryption of the counter blocks in AESDATA[0-3], + # interleaved with the GHASH update of the ciphertext blocks. + for my $i ( reverse 1 .. 9 ) { + $code .= <<___; + @{[ _ghash_step_4x 9-$i, @ghash_4x_args ]} + vbroadcasti128 -$i*16($RNDKEYLAST_PTR), $TMP0 + @{[ _vaesenc_4x $TMP0, @AESDATA ]} +___ + } + $code .= <<___; + @{[ _ghash_step_4x 9, @ghash_4x_args ]} + + @{[ $enc ? "sub \$-128, $DST" : "" ]} # 128 is 4 bytes, -128 is 1 byte + @{[ _aesenclast_and_xor_4x $SRC, $DST, $RNDKEYLAST, @AESDATA, + $TMP0, $TMP1, $LO, $MI ]} + sub \$-128, $SRC + @{[ !$enc ? "sub \$-128, $DST" : "" ]} + add \$-128, $DATALEN + cmp \$127, $DATALEN + ja .Lcrypt_loop_4x$local_label_suffix +___ + + if ($enc) { + + # Update GHASH with the last set of ciphertext blocks. + $code .= <<___; +.Lghash_last_ciphertext_4x$local_label_suffix: + @{[ _ghash_4x @ghash_4x_args ]} + sub \$-128, $DST +___ + } + + my $POWERS_PTR = $BE_CTR_PTR; # BE_CTR_PTR is free to be reused. + my ( $HI, $HI_XMM ) = ( $H_POW2_XORED, $H_POW2_XORED_XMM ); # reuse + + $code .= <<___; +.Lcrypt_loop_4x_done$local_label_suffix: + # Check whether any data remains. + test $DATALEN, $DATALEN + jz .Ldone$local_label_suffix + + # DATALEN is in [16, 32, 48, 64, 80, 96, 112]. + + # Make POWERS_PTR point to the key powers [H^N, H^(N-1), ...] where N + # is the number of blocks that remain. + lea $OFFSETOFEND_H_POWERS($HTABLE), $POWERS_PTR + sub $DATALEN, $POWERS_PTR + + # Start collecting the unreduced GHASH intermediate value LO, MI, HI. + vpxor $LO_XMM, $LO_XMM, $LO_XMM + vpxor $MI_XMM, $MI_XMM, $MI_XMM + vpxor $HI_XMM, $HI_XMM, $HI_XMM + + cmp \$64, $DATALEN + jb .Llessthan64bytes$local_label_suffix + + # DATALEN is in [64, 80, 96, 112]. Encrypt two vectors of counter blocks. + vpshufb $BSWAP_MASK, $LE_CTR, $AESDATA0 + vpaddd .Linc_2blocks(%rip), $LE_CTR, $LE_CTR + vpshufb $BSWAP_MASK, $LE_CTR, $AESDATA1 + vpaddd .Linc_2blocks(%rip), $LE_CTR, $LE_CTR + vpxor $RNDKEY0, $AESDATA0, $AESDATA0 + vpxor $RNDKEY0, $AESDATA1, $AESDATA1 + lea 16($AESKEY), %rax +.Lvaesenc_loop_tail_1$local_label_suffix: + vbroadcasti128 (%rax), $TMP0 + vaesenc $TMP0, $AESDATA0, $AESDATA0 + vaesenc $TMP0, $AESDATA1, $AESDATA1 + add \$16, %rax + cmp %rax, $RNDKEYLAST_PTR + jne .Lvaesenc_loop_tail_1$local_label_suffix + vaesenclast $RNDKEYLAST, $AESDATA0, $AESDATA0 + vaesenclast $RNDKEYLAST, $AESDATA1, $AESDATA1 + + # XOR the data with the two vectors of keystream blocks. 
+ vmovdqu 0($SRC), $TMP0 + vmovdqu 32($SRC), $TMP1 + vpxor $TMP0, $AESDATA0, $AESDATA0 + vpxor $TMP1, $AESDATA1, $AESDATA1 + vmovdqu $AESDATA0, 0($DST) + vmovdqu $AESDATA1, 32($DST) + + # Update GHASH with two vectors of ciphertext blocks, without reducing. + vpshufb $BSWAP_MASK, @{[ $enc ? $AESDATA0 : $TMP0 ]}, $AESDATA0 + vpshufb $BSWAP_MASK, @{[ $enc ? $AESDATA1 : $TMP1 ]}, $AESDATA1 + vpxor $GHASH_ACC, $AESDATA0, $AESDATA0 + vmovdqu ($POWERS_PTR), $TMP0 + vmovdqu 32($POWERS_PTR), $TMP1 + vpclmulqdq \$0x00, $TMP0, $AESDATA0, $LO + vpclmulqdq \$0x01, $TMP0, $AESDATA0, $MI + vpclmulqdq \$0x10, $TMP0, $AESDATA0, $TMP2 + vpxor $TMP2, $MI, $MI + vpclmulqdq \$0x11, $TMP0, $AESDATA0, $HI + vpclmulqdq \$0x00, $TMP1, $AESDATA1, $TMP2 + vpxor $TMP2, $LO, $LO + vpclmulqdq \$0x01, $TMP1, $AESDATA1, $TMP2 + vpxor $TMP2, $MI, $MI + vpclmulqdq \$0x10, $TMP1, $AESDATA1, $TMP2 + vpxor $TMP2, $MI, $MI + vpclmulqdq \$0x11, $TMP1, $AESDATA1, $TMP2 + vpxor $TMP2, $HI, $HI + + add \$64, $POWERS_PTR + add \$64, $SRC + add \$64, $DST + sub \$64, $DATALEN + jz .Lreduce$local_label_suffix + + vpxor $GHASH_ACC_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM + + # DATALEN is in [16, 32, 48]. Encrypt two last vectors of counter blocks. +.Llessthan64bytes$local_label_suffix: + vpshufb $BSWAP_MASK, $LE_CTR, $AESDATA0 + vpaddd .Linc_2blocks(%rip), $LE_CTR, $LE_CTR + vpshufb $BSWAP_MASK, $LE_CTR, $AESDATA1 + vpxor $RNDKEY0, $AESDATA0, $AESDATA0 + vpxor $RNDKEY0, $AESDATA1, $AESDATA1 + lea 16($AESKEY), %rax +.Lvaesenc_loop_tail_2$local_label_suffix: + vbroadcasti128 (%rax), $TMP0 + vaesenc $TMP0, $AESDATA0, $AESDATA0 + vaesenc $TMP0, $AESDATA1, $AESDATA1 + add \$16, %rax + cmp %rax, $RNDKEYLAST_PTR + jne .Lvaesenc_loop_tail_2$local_label_suffix + vaesenclast $RNDKEYLAST, $AESDATA0, $AESDATA0 + vaesenclast $RNDKEYLAST, $AESDATA1, $AESDATA1 + + # XOR the remaining data with the keystream blocks, and update GHASH with + # the remaining ciphertext blocks without reducing. + + cmp \$32, $DATALEN + jb .Lxor_one_block$local_label_suffix + je .Lxor_two_blocks$local_label_suffix + +.Lxor_three_blocks$local_label_suffix: + vmovdqu 0($SRC), $TMP0 + vmovdqu 32($SRC), $TMP1_XMM + vpxor $TMP0, $AESDATA0, $AESDATA0 + vpxor $TMP1_XMM, $AESDATA1_XMM, $AESDATA1_XMM + vmovdqu $AESDATA0, 0($DST) + vmovdqu $AESDATA1_XMM, 32($DST) + + vpshufb $BSWAP_MASK, @{[ $enc ? $AESDATA0 : $TMP0 ]}, $AESDATA0 + vpshufb $BSWAP_MASK_XMM, @{[ $enc ? $AESDATA1_XMM : $TMP1_XMM ]}, $AESDATA1_XMM + vpxor $GHASH_ACC, $AESDATA0, $AESDATA0 + vmovdqu ($POWERS_PTR), $TMP0 + vmovdqu 32($POWERS_PTR), $TMP1_XMM + vpclmulqdq \$0x00, $TMP1_XMM, $AESDATA1_XMM, $TMP2_XMM + vpxor $TMP2, $LO, $LO + vpclmulqdq \$0x01, $TMP1_XMM, $AESDATA1_XMM, $TMP2_XMM + vpxor $TMP2, $MI, $MI + vpclmulqdq \$0x10, $TMP1_XMM, $AESDATA1_XMM, $TMP2_XMM + vpxor $TMP2, $MI, $MI + vpclmulqdq \$0x11, $TMP1_XMM, $AESDATA1_XMM, $TMP2_XMM + vpxor $TMP2, $HI, $HI + jmp .Lghash_mul_one_vec_unreduced$local_label_suffix + +.Lxor_two_blocks$local_label_suffix: + vmovdqu ($SRC), $TMP0 + vpxor $TMP0, $AESDATA0, $AESDATA0 + vmovdqu $AESDATA0, ($DST) + vpshufb $BSWAP_MASK, @{[ $enc ? $AESDATA0 : $TMP0 ]}, $AESDATA0 + vpxor $GHASH_ACC, $AESDATA0, $AESDATA0 + vmovdqu ($POWERS_PTR), $TMP0 + jmp .Lghash_mul_one_vec_unreduced$local_label_suffix + +.Lxor_one_block$local_label_suffix: + vmovdqu ($SRC), $TMP0_XMM + vpxor $TMP0_XMM, $AESDATA0_XMM, $AESDATA0_XMM + vmovdqu $AESDATA0_XMM, ($DST) + vpshufb $BSWAP_MASK_XMM, @{[ $enc ? 
$AESDATA0_XMM : $TMP0_XMM ]}, $AESDATA0_XMM + vpxor $GHASH_ACC_XMM, $AESDATA0_XMM, $AESDATA0_XMM + vmovdqu ($POWERS_PTR), $TMP0_XMM + +.Lghash_mul_one_vec_unreduced$local_label_suffix: + vpclmulqdq \$0x00, $TMP0, $AESDATA0, $TMP2 + vpxor $TMP2, $LO, $LO + vpclmulqdq \$0x01, $TMP0, $AESDATA0, $TMP2 + vpxor $TMP2, $MI, $MI + vpclmulqdq \$0x10, $TMP0, $AESDATA0, $TMP2 + vpxor $TMP2, $MI, $MI + vpclmulqdq \$0x11, $TMP0, $AESDATA0, $TMP2 + vpxor $TMP2, $HI, $HI + +.Lreduce$local_label_suffix: + # Finally, do the GHASH reduction. + vbroadcasti128 .Lgfpoly(%rip), $TMP0 + vpclmulqdq \$0x01, $LO, $TMP0, $TMP1 + vpshufd \$0x4e, $LO, $LO + vpxor $LO, $MI, $MI + vpxor $TMP1, $MI, $MI + vpclmulqdq \$0x01, $MI, $TMP0, $TMP1 + vpshufd \$0x4e, $MI, $MI + vpxor $MI, $HI, $HI + vpxor $TMP1, $HI, $HI + vextracti128 \$1, $HI, $GHASH_ACC_XMM + vpxor $HI_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM + +.Ldone$local_label_suffix: + # Store the updated GHASH accumulator back to memory. + vpshufb $BSWAP_MASK_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM + vmovdqu $GHASH_ACC_XMM, ($GHASH_ACC_PTR) + + vzeroupper +___ + return $code; +} + +$code .= _begin_func "aes_gcm_enc_update_vaes_avx2", 1; +$code .= _aes_gcm_update 1; +$code .= _end_func; + +$code .= _begin_func "aes_gcm_dec_update_vaes_avx2", 1; +$code .= _aes_gcm_update 0; +$code .= _end_func; + +sub filter_and_print { + # This function replaces AVX2 assembly instructions with their assembled forms, + # to allow the code to work on old versions of binutils (older than 2.30) that do + # not support these instructions. + my %asmMap = ( + 'vaesenc %ymm2, %ymm12, %ymm12' => '.byte 0xc4,0x62,0x1d,0xdc,0xe2', + 'vaesenc %ymm2, %ymm13, %ymm13' => '.byte 0xc4,0x62,0x15,0xdc,0xea', + 'vaesenc %ymm2, %ymm14, %ymm14' => '.byte 0xc4,0x62,0x0d,0xdc,0xf2', + 'vaesenc %ymm2, %ymm15, %ymm15' => '.byte 0xc4,0x62,0x05,0xdc,0xfa', + 'vaesenclast %ymm10, %ymm12, %ymm12' => '.byte 0xc4,0x42,0x1d,0xdd,0xe2', + 'vaesenclast %ymm10, %ymm13, %ymm13' => '.byte 0xc4,0x42,0x15,0xdd,0xea', + 'vaesenclast %ymm2, %ymm12, %ymm12' => '.byte 0xc4,0x62,0x1d,0xdd,0xe2', + 'vaesenclast %ymm3, %ymm13, %ymm13' => '.byte 0xc4,0x62,0x15,0xdd,0xeb', + 'vaesenclast %ymm5, %ymm14, %ymm14' => '.byte 0xc4,0x62,0x0d,0xdd,0xf5', + 'vaesenclast %ymm6, %ymm15, %ymm15' => '.byte 0xc4,0x62,0x05,0xdd,0xfe', + 'vpclmulqdq $0x00, %ymm2, %ymm12, %ymm4' => '.byte 0xc4,0xe3,0x1d,0x44,0xe2,0x00', + 'vpclmulqdq $0x00, %ymm2, %ymm12, %ymm5' => '.byte 0xc4,0xe3,0x1d,0x44,0xea,0x00', + 'vpclmulqdq $0x00, %ymm3, %ymm13, %ymm4' => '.byte 0xc4,0xe3,0x15,0x44,0xe3,0x00', + 'vpclmulqdq $0x00, %ymm4, %ymm3, %ymm2' => '.byte 0xc4,0xe3,0x65,0x44,0xd4,0x00', + 'vpclmulqdq $0x00, %ymm4, %ymm3, %ymm5' => '.byte 0xc4,0xe3,0x65,0x44,0xec,0x00', + 'vpclmulqdq $0x00, %ymm5, %ymm3, %ymm0' => '.byte 0xc4,0xe3,0x65,0x44,0xc5,0x00', + 'vpclmulqdq $0x00, %ymm5, %ymm4, %ymm0' => '.byte 0xc4,0xe3,0x5d,0x44,0xc5,0x00', + 'vpclmulqdq $0x00, %ymm7, %ymm2, %ymm6' => '.byte 0xc4,0xe3,0x6d,0x44,0xf7,0x00', + 'vpclmulqdq $0x00, %ymm8, %ymm2, %ymm2' => '.byte 0xc4,0xc3,0x6d,0x44,0xd0,0x00', + 'vpclmulqdq $0x01, %ymm0, %ymm6, %ymm2' => '.byte 0xc4,0xe3,0x4d,0x44,0xd0,0x01', + 'vpclmulqdq $0x01, %ymm1, %ymm6, %ymm0' => '.byte 0xc4,0xe3,0x4d,0x44,0xc1,0x01', + 'vpclmulqdq $0x01, %ymm2, %ymm12, %ymm4' => '.byte 0xc4,0xe3,0x1d,0x44,0xe2,0x01', + 'vpclmulqdq $0x01, %ymm2, %ymm12, %ymm6' => '.byte 0xc4,0xe3,0x1d,0x44,0xf2,0x01', + 'vpclmulqdq $0x01, %ymm3, %ymm13, %ymm4' => '.byte 0xc4,0xe3,0x15,0x44,0xe3,0x01', + 'vpclmulqdq $0x01, %ymm5, %ymm2, %ymm3' => '.byte 
0xc4,0xe3,0x6d,0x44,0xdd,0x01', + 'vpclmulqdq $0x01, %ymm5, %ymm3, %ymm1' => '.byte 0xc4,0xe3,0x65,0x44,0xcd,0x01', + 'vpclmulqdq $0x01, %ymm5, %ymm4, %ymm1' => '.byte 0xc4,0xe3,0x5d,0x44,0xcd,0x01', + 'vpclmulqdq $0x01, %ymm5, %ymm4, %ymm2' => '.byte 0xc4,0xe3,0x5d,0x44,0xd5,0x01', + 'vpclmulqdq $0x01, %ymm6, %ymm2, %ymm3' => '.byte 0xc4,0xe3,0x6d,0x44,0xde,0x01', + 'vpclmulqdq $0x01, %ymm6, %ymm4, %ymm2' => '.byte 0xc4,0xe3,0x5d,0x44,0xd6,0x01', + 'vpclmulqdq $0x10, %ymm2, %ymm12, %ymm4' => '.byte 0xc4,0xe3,0x1d,0x44,0xe2,0x10', + 'vpclmulqdq $0x10, %ymm3, %ymm13, %ymm4' => '.byte 0xc4,0xe3,0x15,0x44,0xe3,0x10', + 'vpclmulqdq $0x10, %ymm5, %ymm3, %ymm2' => '.byte 0xc4,0xe3,0x65,0x44,0xd5,0x10', + 'vpclmulqdq $0x10, %ymm5, %ymm4, %ymm2' => '.byte 0xc4,0xe3,0x5d,0x44,0xd5,0x10', + 'vpclmulqdq $0x10, %ymm7, %ymm2, %ymm2' => '.byte 0xc4,0xe3,0x6d,0x44,0xd7,0x10', + 'vpclmulqdq $0x10, %ymm8, %ymm2, %ymm2' => '.byte 0xc4,0xc3,0x6d,0x44,0xd0,0x10', + 'vpclmulqdq $0x11, %ymm2, %ymm12, %ymm4' => '.byte 0xc4,0xe3,0x1d,0x44,0xe2,0x11', + 'vpclmulqdq $0x11, %ymm2, %ymm12, %ymm7' => '.byte 0xc4,0xe3,0x1d,0x44,0xfa,0x11', + 'vpclmulqdq $0x11, %ymm3, %ymm13, %ymm4' => '.byte 0xc4,0xe3,0x15,0x44,0xe3,0x11', + 'vpclmulqdq $0x11, %ymm4, %ymm3, %ymm1' => '.byte 0xc4,0xe3,0x65,0x44,0xcc,0x11', + 'vpclmulqdq $0x11, %ymm4, %ymm3, %ymm2' => '.byte 0xc4,0xe3,0x65,0x44,0xd4,0x11', + 'vpclmulqdq $0x11, %ymm5, %ymm3, %ymm4' => '.byte 0xc4,0xe3,0x65,0x44,0xe5,0x11', + 'vpclmulqdq $0x11, %ymm5, %ymm4, %ymm3' => '.byte 0xc4,0xe3,0x5d,0x44,0xdd,0x11', + ); + for my $line (split("\n",$code)) { + my $trimmed; + $trimmed = $line; + $trimmed =~ s/^\s+//; + $trimmed =~ s/\s+(#.*)?$//; + if (exists $asmMap{$trimmed}) { + $line = $asmMap{$trimmed}; + } else { + if($trimmed =~ /(vpclmulqdq|vaes).*%[yz]mm/) { + die ("found instruction not supported under old binutils, please update asmMap with the results of running\n" . + 'find target -name "*aes-gcm-avx2*.o" -exec python3 crypto/fipsmodule/aes/asm/make-avx-map-for-old-binutils.py \{\} \; | LC_ALL=C sort | uniq'); + } + } + print $line,"\n"; + } +} + +filter_and_print(); + +close STDOUT or die "error closing STDOUT: $!"; +exit 0; diff --git a/ring-0.17.14/crypto/fipsmodule/aes/asm/aesni-gcm-x86_64.pl b/ring-0.17.14/crypto/fipsmodule/aes/asm/aesni-gcm-x86_64.pl new file mode 100644 index 0000000000..77c8bbec63 --- /dev/null +++ b/ring-0.17.14/crypto/fipsmodule/aes/asm/aesni-gcm-x86_64.pl @@ -0,0 +1,1120 @@ +#! /usr/bin/env perl +# Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. +# ==================================================================== +# +# +# AES-NI-CTR+GHASH stitch. 
+# +# February 2013 +# +# OpenSSL GCM implementation is organized in such way that its +# performance is rather close to the sum of its streamed components, +# in the context parallelized AES-NI CTR and modulo-scheduled +# PCLMULQDQ-enabled GHASH. Unfortunately, as no stitch implementation +# was observed to perform significantly better than the sum of the +# components on contemporary CPUs, the effort was deemed impossible to +# justify. This module is based on combination of Intel submissions, +# [1] and [2], with MOVBE twist suggested by Ilya Albrekht and Max +# Locktyukhin of Intel Corp. who verified that it reduces shuffles +# pressure with notable relative improvement, achieving 1.0 cycle per +# byte processed with 128-bit key on Haswell processor, 0.74 - on +# Broadwell, 0.63 - on Skylake... [Mentioned results are raw profiled +# measurements for favourable packet size, one divisible by 96. +# Applications using the EVP interface will observe a few percent +# worse performance.] +# +# Knights Landing processes 1 byte in 1.25 cycles (measured with EVP). +# +# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest +# [2] http://www.intel.com/content/dam/www/public/us/en/documents/software-support/enabling-high-performance-gcm.pdf + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +# |$avx| in ghash-x86_64.pl must be set to at least 1; otherwise tags will +# be computed incorrectly. +# +# In upstream, this is controlled by shelling out to the compiler to check +# versions, but BoringSSL is intended to be used with pre-generated perlasm +# output, so this isn't useful anyway. +# +# The upstream code uses the condition |$avx>1| even though no AVX2 +# instructions are used, because it assumes MOVBE is supported by the assembler +# if and only if AVX2 is also supported by the assembler; see +# https://marc.info/?l=openssl-dev&m=146567589526984&w=2. +$avx = 2; + +open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; +*STDOUT=*OUT; + +# See the comment above regarding why the condition is ($avx>1) when there are +# no AVX2 instructions being used. +if ($avx>1) {{{ + +# On Windows, only four parameters are passed in registers. The last two +# parameters will be manually loaded into %rdi and %rsi. +my ($inp, $out, $len, $key, $ivp, $Htable) = + $win64 ? ("%rcx", "%rdx", "%r8", "%r9", "%rdi", "%rsi") : + ("%rdi", "%rsi", "%rdx", "%rcx", "%r8", "%r9"); + +# The offset from %rbp to the Xip parameter. On Windows, all parameters have +# corresponding stack positions, not just ones passed on the stack. +# (0x40 = 6*8 + 0x10) +# +# Xip only needs to be accessed at the beginning and end of the function, and +# this function is short on registers, so we make it the last parameter for +# convenience. +my $Xip_offset = $win64 ? 
0x40 : 0x10; + +($Ii,$T1,$T2,$Hkey, + $Z0,$Z1,$Z2,$Z3,$Xi) = map("%xmm$_",(0..8)); + +($inout0,$inout1,$inout2,$inout3,$inout4,$inout5,$rndkey) = map("%xmm$_",(9..15)); + +($counter,$rounds,$const,$in0,$end0)=("%ebx","%r10d","%r11","%r14","%r15"); + +$code=<<___; +.text + +.type _aesni_ctr32_ghash_6x,\@abi-omnipotent +.align 32 +_aesni_ctr32_ghash_6x: +.cfi_startproc + vmovdqu 0x20($const),$T2 # borrow $T2, .Lone_msb + sub \$6,$len + vpxor $Z0,$Z0,$Z0 # $Z0 = 0 + vmovdqu 0x00-0x80($key),$rndkey + vpaddb $T2,$T1,$inout1 + vpaddb $T2,$inout1,$inout2 + vpaddb $T2,$inout2,$inout3 + vpaddb $T2,$inout3,$inout4 + vpaddb $T2,$inout4,$inout5 + vpxor $rndkey,$T1,$inout0 + vmovdqu $Z0,16+8(%rsp) # "$Z3" = 0 + jmp .Loop6x + +.align 32 +.Loop6x: + add \$`6<<24`,$counter + jc .Lhandle_ctr32 # discard $inout[1-5]? + vmovdqu 0x00-0x20($Htable),$Hkey # $Hkey^1 + vpaddb $T2,$inout5,$T1 # next counter value + vpxor $rndkey,$inout1,$inout1 + vpxor $rndkey,$inout2,$inout2 + +.Lresume_ctr32: + vmovdqu $T1,($ivp) # save next counter value + vpclmulqdq \$0x10,$Hkey,$Z3,$Z1 + vpxor $rndkey,$inout3,$inout3 + vmovups 0x10-0x80($key),$T2 # borrow $T2 for $rndkey + vpclmulqdq \$0x01,$Hkey,$Z3,$Z2 + + # At this point, the current block of 96 (0x60) bytes has already been + # loaded into registers. Concurrently with processing it, we want to + # load the next 96 bytes of input for the next round. Obviously, we can + # only do this if there are at least 96 more bytes of input beyond the + # input we're currently processing, or else we'd read past the end of + # the input buffer. Here, we set |%r12| to 96 if there are at least 96 + # bytes of input beyond the 96 bytes we're already processing, and we + # set |%r12| to 0 otherwise. In the case where we set |%r12| to 96, + # we'll read in the next block so that it is in registers for the next + # loop iteration. In the case where we set |%r12| to 0, we'll re-read + # the current block and then ignore what we re-read. + # + # At this point, |$in0| points to the current (already read into + # registers) block, and |$end0| points to 2*96 bytes before the end of + # the input. Thus, |$in0| > |$end0| means that we do not have the next + # 96-byte block to read in, and |$in0| <= |$end0| means we do. 
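+  # Concretely, the sequence below is branchless: "cmp $in0,$end0" computes
+  # |$end0| - |$in0|, so CF is clear exactly when |$in0| <= |$end0|; setnc
+  # then writes 1 to %r12b in that case and 0 otherwise, neg turns that into
+  # all-ones or all-zeroes, and masking with 0x60 leaves 96 or 0 in |%r12|.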
+ xor %r12,%r12 + cmp $in0,$end0 + + vaesenc $T2,$inout0,$inout0 + vmovdqu 0x30+8(%rsp),$Ii # I[4] + vpxor $rndkey,$inout4,$inout4 + vpclmulqdq \$0x00,$Hkey,$Z3,$T1 + vaesenc $T2,$inout1,$inout1 + vpxor $rndkey,$inout5,$inout5 + setnc %r12b + vpclmulqdq \$0x11,$Hkey,$Z3,$Z3 + vaesenc $T2,$inout2,$inout2 + vmovdqu 0x10-0x20($Htable),$Hkey # $Hkey^2 + neg %r12 + vaesenc $T2,$inout3,$inout3 + vpxor $Z1,$Z2,$Z2 + vpclmulqdq \$0x00,$Hkey,$Ii,$Z1 + vpxor $Z0,$Xi,$Xi # modulo-scheduled + vaesenc $T2,$inout4,$inout4 + vpxor $Z1,$T1,$Z0 + and \$0x60,%r12 + vmovups 0x20-0x80($key),$rndkey + vpclmulqdq \$0x10,$Hkey,$Ii,$T1 + vaesenc $T2,$inout5,$inout5 + + vpclmulqdq \$0x01,$Hkey,$Ii,$T2 + lea ($in0,%r12),$in0 + vaesenc $rndkey,$inout0,$inout0 + vpxor 16+8(%rsp),$Xi,$Xi # modulo-scheduled [vpxor $Z3,$Xi,$Xi] + vpclmulqdq \$0x11,$Hkey,$Ii,$Hkey + vmovdqu 0x40+8(%rsp),$Ii # I[3] + vaesenc $rndkey,$inout1,$inout1 + movbe 0x58($in0),%r13 + vaesenc $rndkey,$inout2,$inout2 + movbe 0x50($in0),%r12 + vaesenc $rndkey,$inout3,$inout3 + mov %r13,0x20+8(%rsp) + vaesenc $rndkey,$inout4,$inout4 + mov %r12,0x28+8(%rsp) + vmovdqu 0x30-0x20($Htable),$Z1 # borrow $Z1 for $Hkey^3 + vaesenc $rndkey,$inout5,$inout5 + + vmovups 0x30-0x80($key),$rndkey + vpxor $T1,$Z2,$Z2 + vpclmulqdq \$0x00,$Z1,$Ii,$T1 + vaesenc $rndkey,$inout0,$inout0 + vpxor $T2,$Z2,$Z2 + vpclmulqdq \$0x10,$Z1,$Ii,$T2 + vaesenc $rndkey,$inout1,$inout1 + vpxor $Hkey,$Z3,$Z3 + vpclmulqdq \$0x01,$Z1,$Ii,$Hkey + vaesenc $rndkey,$inout2,$inout2 + vpclmulqdq \$0x11,$Z1,$Ii,$Z1 + vmovdqu 0x50+8(%rsp),$Ii # I[2] + vaesenc $rndkey,$inout3,$inout3 + vaesenc $rndkey,$inout4,$inout4 + vpxor $T1,$Z0,$Z0 + vmovdqu 0x40-0x20($Htable),$T1 # borrow $T1 for $Hkey^4 + vaesenc $rndkey,$inout5,$inout5 + + vmovups 0x40-0x80($key),$rndkey + vpxor $T2,$Z2,$Z2 + vpclmulqdq \$0x00,$T1,$Ii,$T2 + vaesenc $rndkey,$inout0,$inout0 + vpxor $Hkey,$Z2,$Z2 + vpclmulqdq \$0x10,$T1,$Ii,$Hkey + vaesenc $rndkey,$inout1,$inout1 + movbe 0x48($in0),%r13 + vpxor $Z1,$Z3,$Z3 + vpclmulqdq \$0x01,$T1,$Ii,$Z1 + vaesenc $rndkey,$inout2,$inout2 + movbe 0x40($in0),%r12 + vpclmulqdq \$0x11,$T1,$Ii,$T1 + vmovdqu 0x60+8(%rsp),$Ii # I[1] + vaesenc $rndkey,$inout3,$inout3 + mov %r13,0x30+8(%rsp) + vaesenc $rndkey,$inout4,$inout4 + mov %r12,0x38+8(%rsp) + vpxor $T2,$Z0,$Z0 + vmovdqu 0x60-0x20($Htable),$T2 # borrow $T2 for $Hkey^5 + vaesenc $rndkey,$inout5,$inout5 + + vmovups 0x50-0x80($key),$rndkey + vpxor $Hkey,$Z2,$Z2 + vpclmulqdq \$0x00,$T2,$Ii,$Hkey + vaesenc $rndkey,$inout0,$inout0 + vpxor $Z1,$Z2,$Z2 + vpclmulqdq \$0x10,$T2,$Ii,$Z1 + vaesenc $rndkey,$inout1,$inout1 + movbe 0x38($in0),%r13 + vpxor $T1,$Z3,$Z3 + vpclmulqdq \$0x01,$T2,$Ii,$T1 + vpxor 0x70+8(%rsp),$Xi,$Xi # accumulate I[0] + vaesenc $rndkey,$inout2,$inout2 + movbe 0x30($in0),%r12 + vpclmulqdq \$0x11,$T2,$Ii,$T2 + vaesenc $rndkey,$inout3,$inout3 + mov %r13,0x40+8(%rsp) + vaesenc $rndkey,$inout4,$inout4 + mov %r12,0x48+8(%rsp) + vpxor $Hkey,$Z0,$Z0 + vmovdqu 0x70-0x20($Htable),$Hkey # $Hkey^6 + vaesenc $rndkey,$inout5,$inout5 + + vmovups 0x60-0x80($key),$rndkey + vpxor $Z1,$Z2,$Z2 + vpclmulqdq \$0x10,$Hkey,$Xi,$Z1 + vaesenc $rndkey,$inout0,$inout0 + vpxor $T1,$Z2,$Z2 + vpclmulqdq \$0x01,$Hkey,$Xi,$T1 + vaesenc $rndkey,$inout1,$inout1 + movbe 0x28($in0),%r13 + vpxor $T2,$Z3,$Z3 + vpclmulqdq \$0x00,$Hkey,$Xi,$T2 + vaesenc $rndkey,$inout2,$inout2 + movbe 0x20($in0),%r12 + vpclmulqdq \$0x11,$Hkey,$Xi,$Xi + vaesenc $rndkey,$inout3,$inout3 + mov %r13,0x50+8(%rsp) + vaesenc $rndkey,$inout4,$inout4 + mov %r12,0x58+8(%rsp) + vpxor $Z1,$Z2,$Z2 + 
vaesenc $rndkey,$inout5,$inout5 + vpxor $T1,$Z2,$Z2 + + vmovups 0x70-0x80($key),$rndkey + vpslldq \$8,$Z2,$Z1 + vpxor $T2,$Z0,$Z0 + vmovdqu 0x10($const),$Hkey # .Lpoly + + vaesenc $rndkey,$inout0,$inout0 + vpxor $Xi,$Z3,$Z3 + vaesenc $rndkey,$inout1,$inout1 + vpxor $Z1,$Z0,$Z0 + movbe 0x18($in0),%r13 + vaesenc $rndkey,$inout2,$inout2 + movbe 0x10($in0),%r12 + vpalignr \$8,$Z0,$Z0,$Ii # 1st phase + vpclmulqdq \$0x10,$Hkey,$Z0,$Z0 + mov %r13,0x60+8(%rsp) + vaesenc $rndkey,$inout3,$inout3 + mov %r12,0x68+8(%rsp) + vaesenc $rndkey,$inout4,$inout4 + vmovups 0x80-0x80($key),$T1 # borrow $T1 for $rndkey + vaesenc $rndkey,$inout5,$inout5 + + vaesenc $T1,$inout0,$inout0 + vmovups 0x90-0x80($key),$rndkey + vaesenc $T1,$inout1,$inout1 + vpsrldq \$8,$Z2,$Z2 + vaesenc $T1,$inout2,$inout2 + vpxor $Z2,$Z3,$Z3 + vaesenc $T1,$inout3,$inout3 + vpxor $Ii,$Z0,$Z0 + movbe 0x08($in0),%r13 + vaesenc $T1,$inout4,$inout4 + movbe 0x00($in0),%r12 + vaesenc $T1,$inout5,$inout5 + vmovups 0xa0-0x80($key),$T1 + cmp \$11,$rounds + jb .Lenc_tail # 128-bit key + + vaesenc $rndkey,$inout0,$inout0 + vaesenc $rndkey,$inout1,$inout1 + vaesenc $rndkey,$inout2,$inout2 + vaesenc $rndkey,$inout3,$inout3 + vaesenc $rndkey,$inout4,$inout4 + vaesenc $rndkey,$inout5,$inout5 + + vaesenc $T1,$inout0,$inout0 + vaesenc $T1,$inout1,$inout1 + vaesenc $T1,$inout2,$inout2 + vaesenc $T1,$inout3,$inout3 + vaesenc $T1,$inout4,$inout4 + vmovups 0xb0-0x80($key),$rndkey + vaesenc $T1,$inout5,$inout5 + vmovups 0xc0-0x80($key),$T1 + # 192-bit key support was removed. + + vaesenc $rndkey,$inout0,$inout0 + vaesenc $rndkey,$inout1,$inout1 + vaesenc $rndkey,$inout2,$inout2 + vaesenc $rndkey,$inout3,$inout3 + vaesenc $rndkey,$inout4,$inout4 + vaesenc $rndkey,$inout5,$inout5 + + vaesenc $T1,$inout0,$inout0 + vaesenc $T1,$inout1,$inout1 + vaesenc $T1,$inout2,$inout2 + vaesenc $T1,$inout3,$inout3 + vaesenc $T1,$inout4,$inout4 + vmovups 0xd0-0x80($key),$rndkey + vaesenc $T1,$inout5,$inout5 + vmovups 0xe0-0x80($key),$T1 + jmp .Lenc_tail # 256-bit key + +.align 32 +.Lhandle_ctr32: + vmovdqu ($const),$Ii # borrow $Ii for .Lbswap_mask + vpshufb $Ii,$T1,$Z2 # byte-swap counter + vmovdqu 0x30($const),$Z1 # borrow $Z1, .Ltwo_lsb + vpaddd 0x40($const),$Z2,$inout1 # .Lone_lsb + vpaddd $Z1,$Z2,$inout2 + vmovdqu 0x00-0x20($Htable),$Hkey # $Hkey^1 + vpaddd $Z1,$inout1,$inout3 + vpshufb $Ii,$inout1,$inout1 + vpaddd $Z1,$inout2,$inout4 + vpshufb $Ii,$inout2,$inout2 + vpxor $rndkey,$inout1,$inout1 + vpaddd $Z1,$inout3,$inout5 + vpshufb $Ii,$inout3,$inout3 + vpxor $rndkey,$inout2,$inout2 + vpaddd $Z1,$inout4,$T1 # byte-swapped next counter value + vpshufb $Ii,$inout4,$inout4 + vpshufb $Ii,$inout5,$inout5 + vpshufb $Ii,$T1,$T1 # next counter value + jmp .Lresume_ctr32 + +.align 32 +.Lenc_tail: + vaesenc $rndkey,$inout0,$inout0 + vmovdqu $Z3,16+8(%rsp) # postpone vpxor $Z3,$Xi,$Xi + vpalignr \$8,$Z0,$Z0,$Xi # 2nd phase + vaesenc $rndkey,$inout1,$inout1 + vpclmulqdq \$0x10,$Hkey,$Z0,$Z0 + vpxor 0x00($inp),$T1,$T2 + vaesenc $rndkey,$inout2,$inout2 + vpxor 0x10($inp),$T1,$Ii + vaesenc $rndkey,$inout3,$inout3 + vpxor 0x20($inp),$T1,$Z1 + vaesenc $rndkey,$inout4,$inout4 + vpxor 0x30($inp),$T1,$Z2 + vaesenc $rndkey,$inout5,$inout5 + vpxor 0x40($inp),$T1,$Z3 + vpxor 0x50($inp),$T1,$Hkey + vmovdqu ($ivp),$T1 # load next counter value + + vaesenclast $T2,$inout0,$inout0 + vmovdqu 0x20($const),$T2 # borrow $T2, .Lone_msb + vaesenclast $Ii,$inout1,$inout1 + vpaddb $T2,$T1,$Ii + mov %r13,0x70+8(%rsp) + lea 0x60($inp),$inp + # These two prefetches were added in BoringSSL. 
See change that added them. + prefetcht0 512($inp) # We use 96-byte block so prefetch 2 lines (128 bytes) + prefetcht0 576($inp) + vaesenclast $Z1,$inout2,$inout2 + vpaddb $T2,$Ii,$Z1 + mov %r12,0x78+8(%rsp) + lea 0x60($out),$out + vmovdqu 0x00-0x80($key),$rndkey + vaesenclast $Z2,$inout3,$inout3 + vpaddb $T2,$Z1,$Z2 + vaesenclast $Z3, $inout4,$inout4 + vpaddb $T2,$Z2,$Z3 + vaesenclast $Hkey,$inout5,$inout5 + vpaddb $T2,$Z3,$Hkey + + add \$0x60,%rax + sub \$0x6,$len + jc .L6x_done + + vmovups $inout0,-0x60($out) # save output + vpxor $rndkey,$T1,$inout0 + vmovups $inout1,-0x50($out) + vmovdqa $Ii,$inout1 # 0 latency + vmovups $inout2,-0x40($out) + vmovdqa $Z1,$inout2 # 0 latency + vmovups $inout3,-0x30($out) + vmovdqa $Z2,$inout3 # 0 latency + vmovups $inout4,-0x20($out) + vmovdqa $Z3,$inout4 # 0 latency + vmovups $inout5,-0x10($out) + vmovdqa $Hkey,$inout5 # 0 latency + vmovdqu 0x20+8(%rsp),$Z3 # I[5] + jmp .Loop6x + +.L6x_done: + vpxor 16+8(%rsp),$Xi,$Xi # modulo-scheduled + vpxor $Z0,$Xi,$Xi # modulo-scheduled + + ret +.cfi_endproc +.size _aesni_ctr32_ghash_6x,.-_aesni_ctr32_ghash_6x +___ +###################################################################### +# +# size_t aesni_gcm_[en|de]crypt(const void *inp, void *out, size_t len, +# const AES_KEY *key, unsigned char iv[16], const u128 Htbl[9], +# u128 *Xip); +$code.=<<___; +.globl aesni_gcm_decrypt +.type aesni_gcm_decrypt,\@abi-omnipotent +.align 32 +aesni_gcm_decrypt: +.cfi_startproc +.seh_startproc + _CET_ENDBR + xor %rax,%rax + + # We call |_aesni_ctr32_ghash_6x|, which requires at least 96 (0x60) + # bytes of input. + cmp \$0x60,$len # minimal accepted length + jb .Lgcm_dec_abort + + push %rbp +.cfi_push %rbp +.seh_pushreg %rbp + mov %rsp, %rbp # save stack pointer +.cfi_def_cfa_register %rbp + push %rbx +.cfi_push %rbx +.seh_pushreg %rbx + push %r12 +.cfi_push %r12 +.seh_pushreg %r12 + push %r13 +.cfi_push %r13 +.seh_pushreg %r13 + push %r14 +.cfi_push %r14 +.seh_pushreg %r14 + push %r15 +.cfi_push %r15 +.seh_pushreg %r15 +___ +if ($win64) { +$code.=<<___ + lea -0xa8(%rsp),%rsp # 8 extra bytes to align the stack +.seh_stackalloc 0xa8 +.seh_setframe %rbp, 0xa8+5*8 + # Load the last two parameters. These go into %rdi and %rsi, which are + # non-volatile on Windows, so stash them in the parameter stack area + # first. + mov %rdi, 0x10(%rbp) +.seh_savereg %rdi, 0xa8+5*8+0x10 + mov %rsi, 0x18(%rbp) +.seh_savereg %rsi, 0xa8+5*8+0x18 + mov 0x30(%rbp), $ivp + mov 0x38(%rbp), $Htable + # Save non-volatile XMM registers. 
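+  # %xmm6-%xmm15 are callee-saved under the Windows x64 calling convention,
+  # so they are spilled to the frame here and restored on exit; the matching
+  # .seh_savexmm directives record the slots for the unwinder.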
+ movaps %xmm6,-0xd0(%rbp) +.seh_savexmm %xmm6, 0xa8+5*8-0xd0 + movaps %xmm7,-0xc0(%rbp) +.seh_savexmm %xmm7, 0xa8+5*8-0xc0 + movaps %xmm8,-0xb0(%rbp) +.seh_savexmm %xmm8, 0xa8+5*8-0xb0 + movaps %xmm9,-0xa0(%rbp) +.seh_savexmm %xmm9, 0xa8+5*8-0xa0 + movaps %xmm10,-0x90(%rbp) +.seh_savexmm %xmm10, 0xa8+5*8-0x90 + movaps %xmm11,-0x80(%rbp) +.seh_savexmm %xmm11, 0xa8+5*8-0x80 + movaps %xmm12,-0x70(%rbp) +.seh_savexmm %xmm12, 0xa8+5*8-0x70 + movaps %xmm13,-0x60(%rbp) +.seh_savexmm %xmm13, 0xa8+5*8-0x60 + movaps %xmm14,-0x50(%rbp) +.seh_savexmm %xmm14, 0xa8+5*8-0x50 + movaps %xmm15,-0x40(%rbp) +.seh_savexmm %xmm15, 0xa8+5*8-0x40 +.seh_endprologue +___ +} +$code.=<<___; + vzeroupper + + mov $Xip_offset(%rbp), %r12 + vmovdqu ($ivp),$T1 # input counter value + add \$-128,%rsp + mov 12($ivp),$counter + lea .Lbswap_mask(%rip),$const + lea -0x80($key),$in0 # borrow $in0 + mov \$0xf80,$end0 # borrow $end0 + vmovdqu (%r12),$Xi # load Xi + and \$-128,%rsp # ensure stack alignment + vmovdqu ($const),$Ii # borrow $Ii for .Lbswap_mask + lea 0x80($key),$key # size optimization + lea 0x20($Htable),$Htable # size optimization + mov 0xf0-0x80($key),$rounds + vpshufb $Ii,$Xi,$Xi + + and $end0,$in0 + and %rsp,$end0 + sub $in0,$end0 + jc .Ldec_no_key_aliasing + cmp \$768,$end0 + jnc .Ldec_no_key_aliasing + sub $end0,%rsp # avoid aliasing with key +.Ldec_no_key_aliasing: + + vmovdqu 0x50($inp),$Z3 # I[5] + mov $inp,$in0 + vmovdqu 0x40($inp),$Z0 + + # |_aesni_ctr32_ghash_6x| requires |$end0| to point to 2*96 (0xc0) + # bytes before the end of the input. Note, in particular, that this is + # correct even if |$len| is not an even multiple of 96 or 16. XXX: This + # seems to require that |$inp| + |$len| >= 2*96 (0xc0); i.e. |$inp| must + # not be near the very beginning of the address space when |$len| < 2*96 + # (0xc0). 
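+  # In other words |$end0| = |$inp| + |$len| - 0xc0, so the main loop's
+  # "cmp $in0,$end0" keeps reading ahead only while at least 96 more bytes
+  # remain beyond the 96-byte block currently being processed.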
+ lea -0xc0($inp,$len),$end0 + + vmovdqu 0x30($inp),$Z1 + shr \$4,$len + xor %rax,%rax + vmovdqu 0x20($inp),$Z2 + vpshufb $Ii,$Z3,$Z3 # passed to _aesni_ctr32_ghash_6x + vmovdqu 0x10($inp),$T2 + vpshufb $Ii,$Z0,$Z0 + vmovdqu ($inp),$Hkey + vpshufb $Ii,$Z1,$Z1 + vmovdqu $Z0,0x30(%rsp) + vpshufb $Ii,$Z2,$Z2 + vmovdqu $Z1,0x40(%rsp) + vpshufb $Ii,$T2,$T2 + vmovdqu $Z2,0x50(%rsp) + vpshufb $Ii,$Hkey,$Hkey + vmovdqu $T2,0x60(%rsp) + vmovdqu $Hkey,0x70(%rsp) + + call _aesni_ctr32_ghash_6x + + mov $Xip_offset(%rbp), %r12 + vmovups $inout0,-0x60($out) # save output + vmovups $inout1,-0x50($out) + vmovups $inout2,-0x40($out) + vmovups $inout3,-0x30($out) + vmovups $inout4,-0x20($out) + vmovups $inout5,-0x10($out) + + vpshufb ($const),$Xi,$Xi # .Lbswap_mask + vmovdqu $Xi,(%r12) # output Xi + + vzeroupper +___ +$code.=<<___ if ($win64); + movaps -0xd0(%rbp),%xmm6 + movaps -0xc0(%rbp),%xmm7 + movaps -0xb0(%rbp),%xmm8 + movaps -0xa0(%rbp),%xmm9 + movaps -0x90(%rbp),%xmm10 + movaps -0x80(%rbp),%xmm11 + movaps -0x70(%rbp),%xmm12 + movaps -0x60(%rbp),%xmm13 + movaps -0x50(%rbp),%xmm14 + movaps -0x40(%rbp),%xmm15 + mov 0x10(%rbp),%rdi + mov 0x18(%rbp),%rsi +___ +$code.=<<___; + lea -0x28(%rbp), %rsp # restore %rsp to fixed allocation +.cfi_def_cfa %rsp, 0x38 + pop %r15 +.cfi_pop %r15 + pop %r14 +.cfi_pop %r14 + pop %r13 +.cfi_pop %r13 + pop %r12 +.cfi_pop %r12 + pop %rbx +.cfi_pop %rbx + pop %rbp +.cfi_pop %rbp +.Lgcm_dec_abort: + ret +.seh_endproc +.cfi_endproc +.size aesni_gcm_decrypt,.-aesni_gcm_decrypt +___ + +$code.=<<___; +.type _aesni_ctr32_6x,\@abi-omnipotent +.align 32 +_aesni_ctr32_6x: +.cfi_startproc + vmovdqu 0x00-0x80($key),$Z0 # borrow $Z0 for $rndkey + vmovdqu 0x20($const),$T2 # borrow $T2, .Lone_msb + lea -1($rounds),%r13 + vmovups 0x10-0x80($key),$rndkey + lea 0x20-0x80($key),%r12 + vpxor $Z0,$T1,$inout0 + add \$`6<<24`,$counter + jc .Lhandle_ctr32_2 + vpaddb $T2,$T1,$inout1 + vpaddb $T2,$inout1,$inout2 + vpxor $Z0,$inout1,$inout1 + vpaddb $T2,$inout2,$inout3 + vpxor $Z0,$inout2,$inout2 + vpaddb $T2,$inout3,$inout4 + vpxor $Z0,$inout3,$inout3 + vpaddb $T2,$inout4,$inout5 + vpxor $Z0,$inout4,$inout4 + vpaddb $T2,$inout5,$T1 + vpxor $Z0,$inout5,$inout5 + jmp .Loop_ctr32 + +.align 16 +.Loop_ctr32: + vaesenc $rndkey,$inout0,$inout0 + vaesenc $rndkey,$inout1,$inout1 + vaesenc $rndkey,$inout2,$inout2 + vaesenc $rndkey,$inout3,$inout3 + vaesenc $rndkey,$inout4,$inout4 + vaesenc $rndkey,$inout5,$inout5 + vmovups (%r12),$rndkey + lea 0x10(%r12),%r12 + dec %r13d + jnz .Loop_ctr32 + + vmovdqu (%r12),$Hkey # last round key + vaesenc $rndkey,$inout0,$inout0 + vpxor 0x00($inp),$Hkey,$Z0 + vaesenc $rndkey,$inout1,$inout1 + vpxor 0x10($inp),$Hkey,$Z1 + vaesenc $rndkey,$inout2,$inout2 + vpxor 0x20($inp),$Hkey,$Z2 + vaesenc $rndkey,$inout3,$inout3 + vpxor 0x30($inp),$Hkey,$Xi + vaesenc $rndkey,$inout4,$inout4 + vpxor 0x40($inp),$Hkey,$T2 + vaesenc $rndkey,$inout5,$inout5 + vpxor 0x50($inp),$Hkey,$Hkey + lea 0x60($inp),$inp + + vaesenclast $Z0,$inout0,$inout0 + vaesenclast $Z1,$inout1,$inout1 + vaesenclast $Z2,$inout2,$inout2 + vaesenclast $Xi,$inout3,$inout3 + vaesenclast $T2,$inout4,$inout4 + vaesenclast $Hkey,$inout5,$inout5 + vmovups $inout0,0x00($out) + vmovups $inout1,0x10($out) + vmovups $inout2,0x20($out) + vmovups $inout3,0x30($out) + vmovups $inout4,0x40($out) + vmovups $inout5,0x50($out) + lea 0x60($out),$out + + ret +.align 32 +.Lhandle_ctr32_2: + vpshufb $Ii,$T1,$Z2 # byte-swap counter + vmovdqu 0x30($const),$Z1 # borrow $Z1, .Ltwo_lsb + vpaddd 0x40($const),$Z2,$inout1 # .Lone_lsb + vpaddd 
$Z1,$Z2,$inout2 + vpaddd $Z1,$inout1,$inout3 + vpshufb $Ii,$inout1,$inout1 + vpaddd $Z1,$inout2,$inout4 + vpshufb $Ii,$inout2,$inout2 + vpxor $Z0,$inout1,$inout1 + vpaddd $Z1,$inout3,$inout5 + vpshufb $Ii,$inout3,$inout3 + vpxor $Z0,$inout2,$inout2 + vpaddd $Z1,$inout4,$T1 # byte-swapped next counter value + vpshufb $Ii,$inout4,$inout4 + vpxor $Z0,$inout3,$inout3 + vpshufb $Ii,$inout5,$inout5 + vpxor $Z0,$inout4,$inout4 + vpshufb $Ii,$T1,$T1 # next counter value + vpxor $Z0,$inout5,$inout5 + jmp .Loop_ctr32 +.cfi_endproc +.size _aesni_ctr32_6x,.-_aesni_ctr32_6x + +.globl aesni_gcm_encrypt +.type aesni_gcm_encrypt,\@abi-omnipotent +.align 32 +aesni_gcm_encrypt: +.cfi_startproc +.seh_startproc + _CET_ENDBR +#ifdef BORINGSSL_DISPATCH_TEST +.extern BORINGSSL_function_hit + movb \$1,BORINGSSL_function_hit+2(%rip) +#endif + xor %rax,%rax + + # We call |_aesni_ctr32_6x| twice, each call consuming 96 bytes of + # input. Then we call |_aesni_ctr32_ghash_6x|, which requires at + # least 96 more bytes of input. + cmp \$0x60*3,$len # minimal accepted length + jb .Lgcm_enc_abort + + push %rbp +.cfi_push %rbp +.seh_pushreg %rbp + mov %rsp, %rbp # save stack pointer +.cfi_def_cfa_register %rbp + push %rbx +.cfi_push %rbx +.seh_pushreg %rbx + push %r12 +.cfi_push %r12 +.seh_pushreg %r12 + push %r13 +.cfi_push %r13 +.seh_pushreg %r13 + push %r14 +.cfi_push %r14 +.seh_pushreg %r14 + push %r15 +.cfi_push %r15 +.seh_pushreg %r15 +___ +if ($win64) { +$code.=<<___ + lea -0xa8(%rsp),%rsp # 8 extra bytes to align the stack +.seh_stackalloc 0xa8 +.seh_setframe %rbp, 0xa8+5*8 + # Load the last two parameters. These go into %rdi and %rsi, which are + # non-volatile on Windows, so stash them in the parameter stack area + # first. + mov %rdi, 0x10(%rbp) +.seh_savereg %rdi, 0xa8+5*8+0x10 + mov %rsi, 0x18(%rbp) +.seh_savereg %rsi, 0xa8+5*8+0x18 + mov 0x30(%rbp), $ivp + mov 0x38(%rbp), $Htable + # Save non-volatile XMM registers. + movaps %xmm6,-0xd0(%rbp) +.seh_savexmm %xmm6, 0xa8+5*8-0xd0 + movaps %xmm7,-0xc0(%rbp) +.seh_savexmm %xmm7, 0xa8+5*8-0xc0 + movaps %xmm8,-0xb0(%rbp) +.seh_savexmm %xmm8, 0xa8+5*8-0xb0 + movaps %xmm9,-0xa0(%rbp) +.seh_savexmm %xmm9, 0xa8+5*8-0xa0 + movaps %xmm10,-0x90(%rbp) +.seh_savexmm %xmm10, 0xa8+5*8-0x90 + movaps %xmm11,-0x80(%rbp) +.seh_savexmm %xmm11, 0xa8+5*8-0x80 + movaps %xmm12,-0x70(%rbp) +.seh_savexmm %xmm12, 0xa8+5*8-0x70 + movaps %xmm13,-0x60(%rbp) +.seh_savexmm %xmm13, 0xa8+5*8-0x60 + movaps %xmm14,-0x50(%rbp) +.seh_savexmm %xmm14, 0xa8+5*8-0x50 + movaps %xmm15,-0x40(%rbp) +.seh_savexmm %xmm15, 0xa8+5*8-0x40 +.seh_endprologue +___ +} +$code.=<<___; + vzeroupper + + vmovdqu ($ivp),$T1 # input counter value + add \$-128,%rsp + mov 12($ivp),$counter + lea .Lbswap_mask(%rip),$const + lea -0x80($key),$in0 # borrow $in0 + mov \$0xf80,$end0 # borrow $end0 + lea 0x80($key),$key # size optimization + vmovdqu ($const),$Ii # borrow $Ii for .Lbswap_mask + and \$-128,%rsp # ensure stack alignment + mov 0xf0-0x80($key),$rounds + + and $end0,$in0 + and %rsp,$end0 + sub $in0,$end0 + jc .Lenc_no_key_aliasing + cmp \$768,$end0 + jnc .Lenc_no_key_aliasing + sub $end0,%rsp # avoid aliasing with key +.Lenc_no_key_aliasing: + + mov $out,$in0 + + # |_aesni_ctr32_ghash_6x| requires |$end0| to point to 2*96 (0xc0) + # bytes before the end of the input. Note, in particular, that this is + # correct even if |$len| is not an even multiple of 96 or 16. 
Unlike in + # the decryption case, there's no caveat that |$out| must not be near + # the very beginning of the address space, because we know that + # |$len| >= 3*96 from the check above, and so we know + # |$out| + |$len| >= 2*96 (0xc0). + lea -0xc0($out,$len),$end0 + + shr \$4,$len + + call _aesni_ctr32_6x + vpshufb $Ii,$inout0,$Xi # save bswapped output on stack + vpshufb $Ii,$inout1,$T2 + vmovdqu $Xi,0x70(%rsp) + vpshufb $Ii,$inout2,$Z0 + vmovdqu $T2,0x60(%rsp) + vpshufb $Ii,$inout3,$Z1 + vmovdqu $Z0,0x50(%rsp) + vpshufb $Ii,$inout4,$Z2 + vmovdqu $Z1,0x40(%rsp) + vpshufb $Ii,$inout5,$Z3 # passed to _aesni_ctr32_ghash_6x + vmovdqu $Z2,0x30(%rsp) + + call _aesni_ctr32_6x + + mov $Xip_offset(%rbp), %r12 + lea 0x20($Htable),$Htable # size optimization + vmovdqu (%r12),$Xi # load Xi + sub \$12,$len + mov \$0x60*2,%rax + vpshufb $Ii,$Xi,$Xi + + call _aesni_ctr32_ghash_6x + vmovdqu 0x20(%rsp),$Z3 # I[5] + vmovdqu ($const),$Ii # borrow $Ii for .Lbswap_mask + vmovdqu 0x00-0x20($Htable),$Hkey # $Hkey^1 + vpunpckhqdq $Z3,$Z3,$T1 + vmovdqu 0x20-0x20($Htable),$rndkey # borrow $rndkey for $HK + vmovups $inout0,-0x60($out) # save output + vpshufb $Ii,$inout0,$inout0 # but keep bswapped copy + vpxor $Z3,$T1,$T1 + vmovups $inout1,-0x50($out) + vpshufb $Ii,$inout1,$inout1 + vmovups $inout2,-0x40($out) + vpshufb $Ii,$inout2,$inout2 + vmovups $inout3,-0x30($out) + vpshufb $Ii,$inout3,$inout3 + vmovups $inout4,-0x20($out) + vpshufb $Ii,$inout4,$inout4 + vmovups $inout5,-0x10($out) + vpshufb $Ii,$inout5,$inout5 + vmovdqu $inout0,0x10(%rsp) # free $inout0 +___ +{ my ($HK,$T3)=($rndkey,$inout0); + +$code.=<<___; + vmovdqu 0x30(%rsp),$Z2 # I[4] + vmovdqu 0x10-0x20($Htable),$Ii # borrow $Ii for $Hkey^2 + vpunpckhqdq $Z2,$Z2,$T2 + vpclmulqdq \$0x00,$Hkey,$Z3,$Z1 + vpxor $Z2,$T2,$T2 + vpclmulqdq \$0x11,$Hkey,$Z3,$Z3 + vpclmulqdq \$0x00,$HK,$T1,$T1 + + vmovdqu 0x40(%rsp),$T3 # I[3] + vpclmulqdq \$0x00,$Ii,$Z2,$Z0 + vmovdqu 0x30-0x20($Htable),$Hkey # $Hkey^3 + vpxor $Z1,$Z0,$Z0 + vpunpckhqdq $T3,$T3,$Z1 + vpclmulqdq \$0x11,$Ii,$Z2,$Z2 + vpxor $T3,$Z1,$Z1 + vpxor $Z3,$Z2,$Z2 + vpclmulqdq \$0x10,$HK,$T2,$T2 + vmovdqu 0x50-0x20($Htable),$HK + vpxor $T1,$T2,$T2 + + vmovdqu 0x50(%rsp),$T1 # I[2] + vpclmulqdq \$0x00,$Hkey,$T3,$Z3 + vmovdqu 0x40-0x20($Htable),$Ii # borrow $Ii for $Hkey^4 + vpxor $Z0,$Z3,$Z3 + vpunpckhqdq $T1,$T1,$Z0 + vpclmulqdq \$0x11,$Hkey,$T3,$T3 + vpxor $T1,$Z0,$Z0 + vpxor $Z2,$T3,$T3 + vpclmulqdq \$0x00,$HK,$Z1,$Z1 + vpxor $T2,$Z1,$Z1 + + vmovdqu 0x60(%rsp),$T2 # I[1] + vpclmulqdq \$0x00,$Ii,$T1,$Z2 + vmovdqu 0x60-0x20($Htable),$Hkey # $Hkey^5 + vpxor $Z3,$Z2,$Z2 + vpunpckhqdq $T2,$T2,$Z3 + vpclmulqdq \$0x11,$Ii,$T1,$T1 + vpxor $T2,$Z3,$Z3 + vpxor $T3,$T1,$T1 + vpclmulqdq \$0x10,$HK,$Z0,$Z0 + vmovdqu 0x80-0x20($Htable),$HK + vpxor $Z1,$Z0,$Z0 + + vpxor 0x70(%rsp),$Xi,$Xi # accumulate I[0] + vpclmulqdq \$0x00,$Hkey,$T2,$Z1 + vmovdqu 0x70-0x20($Htable),$Ii # borrow $Ii for $Hkey^6 + vpunpckhqdq $Xi,$Xi,$T3 + vpxor $Z2,$Z1,$Z1 + vpclmulqdq \$0x11,$Hkey,$T2,$T2 + vpxor $Xi,$T3,$T3 + vpxor $T1,$T2,$T2 + vpclmulqdq \$0x00,$HK,$Z3,$Z3 + vpxor $Z0,$Z3,$Z0 + + vpclmulqdq \$0x00,$Ii,$Xi,$Z2 + vmovdqu 0x00-0x20($Htable),$Hkey # $Hkey^1 + vpunpckhqdq $inout5,$inout5,$T1 + vpclmulqdq \$0x11,$Ii,$Xi,$Xi + vpxor $inout5,$T1,$T1 + vpxor $Z1,$Z2,$Z1 + vpclmulqdq \$0x10,$HK,$T3,$T3 + vmovdqu 0x20-0x20($Htable),$HK + vpxor $T2,$Xi,$Z3 + vpxor $Z0,$T3,$Z2 + + vmovdqu 0x10-0x20($Htable),$Ii # borrow $Ii for $Hkey^2 + vpxor $Z1,$Z3,$T3 # aggregated Karatsuba post-processing + vpclmulqdq \$0x00,$Hkey,$inout5,$Z0 + vpxor 
$T3,$Z2,$Z2 + vpunpckhqdq $inout4,$inout4,$T2 + vpclmulqdq \$0x11,$Hkey,$inout5,$inout5 + vpxor $inout4,$T2,$T2 + vpslldq \$8,$Z2,$T3 + vpclmulqdq \$0x00,$HK,$T1,$T1 + vpxor $T3,$Z1,$Xi + vpsrldq \$8,$Z2,$Z2 + vpxor $Z2,$Z3,$Z3 + + vpclmulqdq \$0x00,$Ii,$inout4,$Z1 + vmovdqu 0x30-0x20($Htable),$Hkey # $Hkey^3 + vpxor $Z0,$Z1,$Z1 + vpunpckhqdq $inout3,$inout3,$T3 + vpclmulqdq \$0x11,$Ii,$inout4,$inout4 + vpxor $inout3,$T3,$T3 + vpxor $inout5,$inout4,$inout4 + vpalignr \$8,$Xi,$Xi,$inout5 # 1st phase + vpclmulqdq \$0x10,$HK,$T2,$T2 + vmovdqu 0x50-0x20($Htable),$HK + vpxor $T1,$T2,$T2 + + vpclmulqdq \$0x00,$Hkey,$inout3,$Z0 + vmovdqu 0x40-0x20($Htable),$Ii # borrow $Ii for $Hkey^4 + vpxor $Z1,$Z0,$Z0 + vpunpckhqdq $inout2,$inout2,$T1 + vpclmulqdq \$0x11,$Hkey,$inout3,$inout3 + vpxor $inout2,$T1,$T1 + vpxor $inout4,$inout3,$inout3 + vxorps 0x10(%rsp),$Z3,$Z3 # accumulate $inout0 + vpclmulqdq \$0x00,$HK,$T3,$T3 + vpxor $T2,$T3,$T3 + + vpclmulqdq \$0x10,0x10($const),$Xi,$Xi + vxorps $inout5,$Xi,$Xi + + vpclmulqdq \$0x00,$Ii,$inout2,$Z1 + vmovdqu 0x60-0x20($Htable),$Hkey # $Hkey^5 + vpxor $Z0,$Z1,$Z1 + vpunpckhqdq $inout1,$inout1,$T2 + vpclmulqdq \$0x11,$Ii,$inout2,$inout2 + vpxor $inout1,$T2,$T2 + vpalignr \$8,$Xi,$Xi,$inout5 # 2nd phase + vpxor $inout3,$inout2,$inout2 + vpclmulqdq \$0x10,$HK,$T1,$T1 + vmovdqu 0x80-0x20($Htable),$HK + vpxor $T3,$T1,$T1 + + vxorps $Z3,$inout5,$inout5 + vpclmulqdq \$0x10,0x10($const),$Xi,$Xi + vxorps $inout5,$Xi,$Xi + + vpclmulqdq \$0x00,$Hkey,$inout1,$Z0 + vmovdqu 0x70-0x20($Htable),$Ii # borrow $Ii for $Hkey^6 + vpxor $Z1,$Z0,$Z0 + vpunpckhqdq $Xi,$Xi,$T3 + vpclmulqdq \$0x11,$Hkey,$inout1,$inout1 + vpxor $Xi,$T3,$T3 + vpxor $inout2,$inout1,$inout1 + vpclmulqdq \$0x00,$HK,$T2,$T2 + vpxor $T1,$T2,$T2 + + vpclmulqdq \$0x00,$Ii,$Xi,$Z1 + vpclmulqdq \$0x11,$Ii,$Xi,$Z3 + vpxor $Z0,$Z1,$Z1 + vpclmulqdq \$0x10,$HK,$T3,$Z2 + vpxor $inout1,$Z3,$Z3 + vpxor $T2,$Z2,$Z2 + + vpxor $Z1,$Z3,$Z0 # aggregated Karatsuba post-processing + vpxor $Z0,$Z2,$Z2 + vpslldq \$8,$Z2,$T1 + vmovdqu 0x10($const),$Hkey # .Lpoly + vpsrldq \$8,$Z2,$Z2 + vpxor $T1,$Z1,$Xi + vpxor $Z2,$Z3,$Z3 + + vpalignr \$8,$Xi,$Xi,$T2 # 1st phase + vpclmulqdq \$0x10,$Hkey,$Xi,$Xi + vpxor $T2,$Xi,$Xi + + vpalignr \$8,$Xi,$Xi,$T2 # 2nd phase + vpclmulqdq \$0x10,$Hkey,$Xi,$Xi + vpxor $Z3,$T2,$T2 + vpxor $T2,$Xi,$Xi +___ +} +$code.=<<___; + mov $Xip_offset(%rbp), %r12 + vpshufb ($const),$Xi,$Xi # .Lbswap_mask + vmovdqu $Xi,(%r12) # output Xi + + vzeroupper +___ +$code.=<<___ if ($win64); + movaps -0xd0(%rbp),%xmm6 + movaps -0xc0(%rbp),%xmm7 + movaps -0xb0(%rbp),%xmm8 + movaps -0xa0(%rbp),%xmm9 + movaps -0x90(%rbp),%xmm10 + movaps -0x80(%rbp),%xmm11 + movaps -0x70(%rbp),%xmm12 + movaps -0x60(%rbp),%xmm13 + movaps -0x50(%rbp),%xmm14 + movaps -0x40(%rbp),%xmm15 + mov 0x10(%rbp),%rdi + mov 0x18(%rbp),%rsi +___ +$code.=<<___; + lea -0x28(%rbp), %rsp # restore %rsp to fixed allocation +.cfi_def_cfa %rsp, 0x38 + pop %r15 +.cfi_pop %r15 + pop %r14 +.cfi_pop %r14 + pop %r13 +.cfi_pop %r13 + pop %r12 +.cfi_pop %r12 + pop %rbx +.cfi_pop %rbx + pop %rbp +.cfi_pop %rbp +.Lgcm_enc_abort: + ret +.seh_endproc +.cfi_endproc +.size aesni_gcm_encrypt,.-aesni_gcm_encrypt +___ + +$code.=<<___; +.section .rodata +.align 64 +.Lbswap_mask: + .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 +.Lpoly: + .byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 +.Lone_msb: + .byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 +.Ltwo_lsb: + .byte 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +.Lone_lsb: + .byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +.asciz "AES-NI GCM module for x86_64, CRYPTOGAMS 
by " +.align 64 +.text +___ +}}} else {{{ +$code=<<___; # assembler is too old +.text + +.globl aesni_gcm_encrypt +.type aesni_gcm_encrypt,\@abi-omnipotent +aesni_gcm_encrypt: + _CET_ENDBR + xor %eax,%eax + ret +.size aesni_gcm_encrypt,.-aesni_gcm_encrypt + +.globl aesni_gcm_decrypt +.type aesni_gcm_decrypt,\@abi-omnipotent +aesni_gcm_decrypt: + _CET_ENDBR + xor %eax,%eax + ret +.size aesni_gcm_decrypt,.-aesni_gcm_decrypt +___ +}}} + +$code =~ s/\`([^\`]*)\`/eval($1)/gem; + +print $code; + +close STDOUT or die "error closing STDOUT: $!"; diff --git a/ring-0.17.14/crypto/fipsmodule/aes/asm/aesni-x86.pl b/ring-0.17.14/crypto/fipsmodule/aes/asm/aesni-x86.pl new file mode 100644 index 0000000000..8a7a15619d --- /dev/null +++ b/ring-0.17.14/crypto/fipsmodule/aes/asm/aesni-x86.pl @@ -0,0 +1,973 @@ +#! /usr/bin/env perl +# Copyright 2009-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. +# ==================================================================== +# +# This module implements support for Intel AES-NI extension. In +# OpenSSL context it's used with Intel engine, but can also be used as +# drop-in replacement for crypto/aes/asm/aes-586.pl [see below for +# details]. +# +# Performance. +# +# To start with see corresponding paragraph in aesni-x86_64.pl... +# Instead of filling table similar to one found there I've chosen to +# summarize *comparison* results for raw ECB, CTR and CBC benchmarks. +# The simplified table below represents 32-bit performance relative +# to 64-bit one in every given point. Ratios vary for different +# encryption modes, therefore interval values. +# +# 16-byte 64-byte 256-byte 1-KB 8-KB +# 53-67% 67-84% 91-94% 95-98% 97-99.5% +# +# Lower ratios for smaller block sizes are perfectly understandable, +# because function call overhead is higher in 32-bit mode. Largest +# 8-KB block performance is virtually same: 32-bit code is less than +# 1% slower for ECB, CBC and CCM, and ~3% slower otherwise. + +# January 2011 +# +# See aesni-x86_64.pl for details. Unlike x86_64 version this module +# interleaves at most 6 aes[enc|dec] instructions, because there are +# not enough registers for 8x interleave [which should be optimal for +# Sandy Bridge]. Actually, performance results for 6x interleave +# factor presented in aesni-x86_64.pl (except for CTR) are for this +# module. + +# April 2011 +# +# Add aesni_xts_[en|de]crypt. Westmere spends 1.50 cycles processing +# one byte out of 8KB with 128-bit key, Sandy Bridge - 1.09. + +# November 2015 +# +# Add aesni_ocb_[en|de]crypt. [Removed in BoringSSL] + +###################################################################### +# Current large-block performance in cycles per byte processed with +# 128-bit key (less is better). 
+# +# CBC en-/decrypt CTR XTS ECB OCB +# Westmere 3.77/1.37 1.37 1.52 1.27 +# * Bridge 5.07/0.98 0.99 1.09 0.91 1.10 +# Haswell 4.44/0.80 0.97 1.03 0.72 0.76 +# Skylake 2.68/0.65 0.65 0.66 0.64 0.66 +# Silvermont 5.77/3.56 3.67 4.03 3.46 4.03 +# Goldmont 3.84/1.39 1.39 1.63 1.31 1.70 +# Bulldozer 5.80/0.98 1.05 1.24 0.93 1.23 + +$PREFIX="aes_hw"; # if $PREFIX is set to "AES", the script + # generates drop-in replacement for + # crypto/aes/asm/aes-586.pl:-) +$AESNI_PREFIX="aes_hw"; +$inline=1; # inline _aesni_[en|de]crypt + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +push(@INC,"${dir}","${dir}../../../perlasm"); +require "x86asm.pl"; + +$output = pop; +open OUT,">$output"; +*STDOUT=*OUT; + +&asm_init($ARGV[0]); + +&preprocessor_ifdef("BORINGSSL_DISPATCH_TEST") +&external_label("BORINGSSL_function_hit"); +&preprocessor_endif(); +&static_label("key_const"); + +if ($PREFIX eq $AESNI_PREFIX) { $movekey=\&movups; } +else { $movekey=\&movups; } + +$len="eax"; +$rounds="ecx"; +$key="edx"; +$inp="esi"; +$out="edi"; +$rounds_="ebx"; # backup copy for $rounds +$key_="ebp"; # backup copy for $key + +$rndkey0="xmm0"; +$rndkey1="xmm1"; +$inout0="xmm2"; +$inout1="xmm3"; +$inout2="xmm4"; +$inout3="xmm5"; $in1="xmm5"; +$inout4="xmm6"; $in0="xmm6"; +$inout5="xmm7"; $ivec="xmm7"; + +# AESNI extension +sub aeskeygenassist +{ my($dst,$src,$imm)=@_; + if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/) + { &data_byte(0x66,0x0f,0x3a,0xdf,0xc0|($1<<3)|$2,$imm); } +} +sub aescommon +{ my($opcodelet,$dst,$src)=@_; + if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/) + { &data_byte(0x66,0x0f,0x38,$opcodelet,0xc0|($1<<3)|$2);} +} +sub aesimc { aescommon(0xdb,@_); } +sub aesenc { aescommon(0xdc,@_); } +sub aesenclast { aescommon(0xdd,@_); } + +# Inline version of internal aesni_[en|de]crypt1 +{ my $sn; +sub aesni_inline_generate1 +{ my ($p,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout)); + $sn++; + + &$movekey ($rndkey0,&QWP(0,$key)); + &$movekey ($rndkey1,&QWP(16,$key)); + &xorps ($ivec,$rndkey0) if (defined($ivec)); + &lea ($key,&DWP(32,$key)); + &xorps ($inout,$ivec) if (defined($ivec)); + &xorps ($inout,$rndkey0) if (!defined($ivec)); + &set_label("${p}1_loop_$sn"); + eval"&aes${p} ($inout,$rndkey1)"; + &dec ($rounds); + &$movekey ($rndkey1,&QWP(0,$key)); + &lea ($key,&DWP(16,$key)); + &jnz (&label("${p}1_loop_$sn")); + eval"&aes${p}last ($inout,$rndkey1)"; +}} + +sub aesni_generate1 # fully unrolled loop +{ my ($p,$inout)=@_; $inout=$inout0 if (!defined($inout)); + + &function_begin_B("_aesni_${p}rypt1"); + &movups ($rndkey0,&QWP(0,$key)); + &$movekey ($rndkey1,&QWP(0x10,$key)); + &xorps ($inout,$rndkey0); + &$movekey ($rndkey0,&QWP(0x20,$key)); + &lea ($key,&DWP(0x30,$key)); + &cmp ($rounds,11); + &jb (&label("${p}128")); + &lea ($key,&DWP(0x40,$key)); + # 192-bit key support was removed. + eval"&aes${p} ($inout,$rndkey1)"; + &$movekey ($rndkey1,&QWP(-0x40,$key)); + eval"&aes${p} ($inout,$rndkey0)"; + &$movekey ($rndkey0,&QWP(-0x30,$key)); + # 192-bit key support was removed. 
+ eval"&aes${p} ($inout,$rndkey1)"; + &$movekey ($rndkey1,&QWP(-0x20,$key)); + eval"&aes${p} ($inout,$rndkey0)"; + &$movekey ($rndkey0,&QWP(-0x10,$key)); + &set_label("${p}128"); + eval"&aes${p} ($inout,$rndkey1)"; + &$movekey ($rndkey1,&QWP(0,$key)); + eval"&aes${p} ($inout,$rndkey0)"; + &$movekey ($rndkey0,&QWP(0x10,$key)); + eval"&aes${p} ($inout,$rndkey1)"; + &$movekey ($rndkey1,&QWP(0x20,$key)); + eval"&aes${p} ($inout,$rndkey0)"; + &$movekey ($rndkey0,&QWP(0x30,$key)); + eval"&aes${p} ($inout,$rndkey1)"; + &$movekey ($rndkey1,&QWP(0x40,$key)); + eval"&aes${p} ($inout,$rndkey0)"; + &$movekey ($rndkey0,&QWP(0x50,$key)); + eval"&aes${p} ($inout,$rndkey1)"; + &$movekey ($rndkey1,&QWP(0x60,$key)); + eval"&aes${p} ($inout,$rndkey0)"; + &$movekey ($rndkey0,&QWP(0x70,$key)); + eval"&aes${p} ($inout,$rndkey1)"; + eval"&aes${p}last ($inout,$rndkey0)"; + &ret(); + &function_end_B("_aesni_${p}rypt1"); +} + + +# _aesni_[en|de]cryptN are private interfaces, N denotes interleave +# factor. Why 3x subroutine were originally used in loops? Even though +# aes[enc|dec] latency was originally 6, it could be scheduled only +# every *2nd* cycle. Thus 3x interleave was the one providing optimal +# utilization, i.e. when subroutine's throughput is virtually same as +# of non-interleaved subroutine [for number of input blocks up to 3]. +# This is why it originally made no sense to implement 2x subroutine. +# But times change and it became appropriate to spend extra 192 bytes +# on 2x subroutine on Atom Silvermont account. For processors that +# can schedule aes[enc|dec] every cycle optimal interleave factor +# equals to corresponding instructions latency. 8x is optimal for +# * Bridge, but it's unfeasible to accommodate such implementation +# in XMM registers addressable in 32-bit mode and therefore maximum +# of 6x is used instead... 
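+# (Rule of thumb: with one aes[enc|dec] issued per cycle, an interleave
+# factor equal to the instruction latency keeps the unit fully busy, since
+# each block's next round becomes ready just as its turn comes around again.)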
+ +sub aesni_generate2 +{ my $p=shift; + + &function_begin_B("_aesni_${p}rypt2"); + &$movekey ($rndkey0,&QWP(0,$key)); + &shl ($rounds,4); + &$movekey ($rndkey1,&QWP(16,$key)); + &xorps ($inout0,$rndkey0); + &pxor ($inout1,$rndkey0); + &$movekey ($rndkey0,&QWP(32,$key)); + &lea ($key,&DWP(32,$key,$rounds)); + &neg ($rounds); + &add ($rounds,16); + + &set_label("${p}2_loop"); + eval"&aes${p} ($inout0,$rndkey1)"; + eval"&aes${p} ($inout1,$rndkey1)"; + &$movekey ($rndkey1,&QWP(0,$key,$rounds)); + &add ($rounds,32); + eval"&aes${p} ($inout0,$rndkey0)"; + eval"&aes${p} ($inout1,$rndkey0)"; + &$movekey ($rndkey0,&QWP(-16,$key,$rounds)); + &jnz (&label("${p}2_loop")); + eval"&aes${p} ($inout0,$rndkey1)"; + eval"&aes${p} ($inout1,$rndkey1)"; + eval"&aes${p}last ($inout0,$rndkey0)"; + eval"&aes${p}last ($inout1,$rndkey0)"; + &ret(); + &function_end_B("_aesni_${p}rypt2"); +} + +sub aesni_generate3 +{ my $p=shift; + + &function_begin_B("_aesni_${p}rypt3"); + &$movekey ($rndkey0,&QWP(0,$key)); + &shl ($rounds,4); + &$movekey ($rndkey1,&QWP(16,$key)); + &xorps ($inout0,$rndkey0); + &pxor ($inout1,$rndkey0); + &pxor ($inout2,$rndkey0); + &$movekey ($rndkey0,&QWP(32,$key)); + &lea ($key,&DWP(32,$key,$rounds)); + &neg ($rounds); + &add ($rounds,16); + + &set_label("${p}3_loop"); + eval"&aes${p} ($inout0,$rndkey1)"; + eval"&aes${p} ($inout1,$rndkey1)"; + eval"&aes${p} ($inout2,$rndkey1)"; + &$movekey ($rndkey1,&QWP(0,$key,$rounds)); + &add ($rounds,32); + eval"&aes${p} ($inout0,$rndkey0)"; + eval"&aes${p} ($inout1,$rndkey0)"; + eval"&aes${p} ($inout2,$rndkey0)"; + &$movekey ($rndkey0,&QWP(-16,$key,$rounds)); + &jnz (&label("${p}3_loop")); + eval"&aes${p} ($inout0,$rndkey1)"; + eval"&aes${p} ($inout1,$rndkey1)"; + eval"&aes${p} ($inout2,$rndkey1)"; + eval"&aes${p}last ($inout0,$rndkey0)"; + eval"&aes${p}last ($inout1,$rndkey0)"; + eval"&aes${p}last ($inout2,$rndkey0)"; + &ret(); + &function_end_B("_aesni_${p}rypt3"); +} + +# 4x interleave is implemented to improve small block performance, +# most notably [and naturally] 4 block by ~30%. One can argue that one +# should have implemented 5x as well, but improvement would be <20%, +# so it's not worth it... 
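+# (As with the 2x and 3x variants, the subroutine below simply runs four
+# independent blocks through every round key in lockstep.)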
+sub aesni_generate4 +{ my $p=shift; + + &function_begin_B("_aesni_${p}rypt4"); + &$movekey ($rndkey0,&QWP(0,$key)); + &$movekey ($rndkey1,&QWP(16,$key)); + &shl ($rounds,4); + &xorps ($inout0,$rndkey0); + &pxor ($inout1,$rndkey0); + &pxor ($inout2,$rndkey0); + &pxor ($inout3,$rndkey0); + &$movekey ($rndkey0,&QWP(32,$key)); + &lea ($key,&DWP(32,$key,$rounds)); + &neg ($rounds); + &data_byte (0x0f,0x1f,0x40,0x00); + &add ($rounds,16); + + &set_label("${p}4_loop"); + eval"&aes${p} ($inout0,$rndkey1)"; + eval"&aes${p} ($inout1,$rndkey1)"; + eval"&aes${p} ($inout2,$rndkey1)"; + eval"&aes${p} ($inout3,$rndkey1)"; + &$movekey ($rndkey1,&QWP(0,$key,$rounds)); + &add ($rounds,32); + eval"&aes${p} ($inout0,$rndkey0)"; + eval"&aes${p} ($inout1,$rndkey0)"; + eval"&aes${p} ($inout2,$rndkey0)"; + eval"&aes${p} ($inout3,$rndkey0)"; + &$movekey ($rndkey0,&QWP(-16,$key,$rounds)); + &jnz (&label("${p}4_loop")); + + eval"&aes${p} ($inout0,$rndkey1)"; + eval"&aes${p} ($inout1,$rndkey1)"; + eval"&aes${p} ($inout2,$rndkey1)"; + eval"&aes${p} ($inout3,$rndkey1)"; + eval"&aes${p}last ($inout0,$rndkey0)"; + eval"&aes${p}last ($inout1,$rndkey0)"; + eval"&aes${p}last ($inout2,$rndkey0)"; + eval"&aes${p}last ($inout3,$rndkey0)"; + &ret(); + &function_end_B("_aesni_${p}rypt4"); +} + +sub aesni_generate6 +{ my $p=shift; + + &function_begin_B("_aesni_${p}rypt6"); + &static_label("_aesni_${p}rypt6_enter"); + &$movekey ($rndkey0,&QWP(0,$key)); + &shl ($rounds,4); + &$movekey ($rndkey1,&QWP(16,$key)); + &xorps ($inout0,$rndkey0); + &pxor ($inout1,$rndkey0); # pxor does better here + &pxor ($inout2,$rndkey0); + eval"&aes${p} ($inout0,$rndkey1)"; + &pxor ($inout3,$rndkey0); + &pxor ($inout4,$rndkey0); + eval"&aes${p} ($inout1,$rndkey1)"; + &lea ($key,&DWP(32,$key,$rounds)); + &neg ($rounds); + eval"&aes${p} ($inout2,$rndkey1)"; + &pxor ($inout5,$rndkey0); + &$movekey ($rndkey0,&QWP(0,$key,$rounds)); + &add ($rounds,16); + &jmp (&label("_aesni_${p}rypt6_inner")); + + &set_label("${p}6_loop",16); + eval"&aes${p} ($inout0,$rndkey1)"; + eval"&aes${p} ($inout1,$rndkey1)"; + eval"&aes${p} ($inout2,$rndkey1)"; + &set_label("_aesni_${p}rypt6_inner"); + eval"&aes${p} ($inout3,$rndkey1)"; + eval"&aes${p} ($inout4,$rndkey1)"; + eval"&aes${p} ($inout5,$rndkey1)"; + &set_label("_aesni_${p}rypt6_enter"); + &$movekey ($rndkey1,&QWP(0,$key,$rounds)); + &add ($rounds,32); + eval"&aes${p} ($inout0,$rndkey0)"; + eval"&aes${p} ($inout1,$rndkey0)"; + eval"&aes${p} ($inout2,$rndkey0)"; + eval"&aes${p} ($inout3,$rndkey0)"; + eval"&aes${p} ($inout4,$rndkey0)"; + eval"&aes${p} ($inout5,$rndkey0)"; + &$movekey ($rndkey0,&QWP(-16,$key,$rounds)); + &jnz (&label("${p}6_loop")); + + eval"&aes${p} ($inout0,$rndkey1)"; + eval"&aes${p} ($inout1,$rndkey1)"; + eval"&aes${p} ($inout2,$rndkey1)"; + eval"&aes${p} ($inout3,$rndkey1)"; + eval"&aes${p} ($inout4,$rndkey1)"; + eval"&aes${p} ($inout5,$rndkey1)"; + eval"&aes${p}last ($inout0,$rndkey0)"; + eval"&aes${p}last ($inout1,$rndkey0)"; + eval"&aes${p}last ($inout2,$rndkey0)"; + eval"&aes${p}last ($inout3,$rndkey0)"; + eval"&aes${p}last ($inout4,$rndkey0)"; + eval"&aes${p}last ($inout5,$rndkey0)"; + &ret(); + &function_end_B("_aesni_${p}rypt6"); +} +&aesni_generate2("enc") if ($PREFIX eq $AESNI_PREFIX); +&aesni_generate3("enc") if ($PREFIX eq $AESNI_PREFIX); +&aesni_generate4("enc") if ($PREFIX eq $AESNI_PREFIX); +&aesni_generate6("enc") if ($PREFIX eq $AESNI_PREFIX); + +if ($PREFIX eq $AESNI_PREFIX) { + +###################################################################### +# void aes_hw_ctr32_encrypt_blocks 
(const void *in, void *out, +# size_t blocks, const AES_KEY *key, +# const char *ivec); +# +# Handles only complete blocks, operates on 32-bit counter and +# does not update *ivec! (see crypto/modes/ctr128.c for details) +# +# stack layout: +# 0 pshufb mask +# 16 vector addend: 0,6,6,6 +# 32 counter-less ivec +# 48 1st triplet of counter vector +# 64 2nd triplet of counter vector +# 80 saved %esp + +&function_begin("${PREFIX}_ctr32_encrypt_blocks"); + &record_function_hit(0); + + &mov ($inp,&wparam(0)); + &mov ($out,&wparam(1)); + &mov ($len,&wparam(2)); + &mov ($key,&wparam(3)); + &mov ($rounds_,&wparam(4)); + &mov ($key_,"esp"); + &sub ("esp",88); + &and ("esp",-16); # align stack + &mov (&DWP(80,"esp"),$key_); + + &cmp ($len,1); + &je (&label("ctr32_one_shortcut")); + + &movdqu ($inout5,&QWP(0,$rounds_)); # load ivec + + # compose byte-swap control mask for pshufb on stack + &mov (&DWP(0,"esp"),0x0c0d0e0f); + &mov (&DWP(4,"esp"),0x08090a0b); + &mov (&DWP(8,"esp"),0x04050607); + &mov (&DWP(12,"esp"),0x00010203); + + # compose counter increment vector on stack + &mov ($rounds,6); + &xor ($key_,$key_); + &mov (&DWP(16,"esp"),$rounds); + &mov (&DWP(20,"esp"),$rounds); + &mov (&DWP(24,"esp"),$rounds); + &mov (&DWP(28,"esp"),$key_); + + &pextrd ($rounds_,$inout5,3); # pull 32-bit counter + &pinsrd ($inout5,$key_,3); # wipe 32-bit counter + + &mov ($rounds,&DWP(240,$key)); # key->rounds + + # compose 2 vectors of 3x32-bit counters + &bswap ($rounds_); + &pxor ($rndkey0,$rndkey0); + &pxor ($rndkey1,$rndkey1); + &movdqa ($inout0,&QWP(0,"esp")); # load byte-swap mask + &pinsrd ($rndkey0,$rounds_,0); + &lea ($key_,&DWP(3,$rounds_)); + &pinsrd ($rndkey1,$key_,0); + &inc ($rounds_); + &pinsrd ($rndkey0,$rounds_,1); + &inc ($key_); + &pinsrd ($rndkey1,$key_,1); + &inc ($rounds_); + &pinsrd ($rndkey0,$rounds_,2); + &inc ($key_); + &pinsrd ($rndkey1,$key_,2); + &movdqa (&QWP(48,"esp"),$rndkey0); # save 1st triplet + &pshufb ($rndkey0,$inout0); # byte swap + &movdqu ($inout4,&QWP(0,$key)); # key[0] + &movdqa (&QWP(64,"esp"),$rndkey1); # save 2nd triplet + &pshufb ($rndkey1,$inout0); # byte swap + + &pshufd ($inout0,$rndkey0,3<<6); # place counter to upper dword + &pshufd ($inout1,$rndkey0,2<<6); + &cmp ($len,6); + &jb (&label("ctr32_tail")); + &pxor ($inout5,$inout4); # counter-less ivec^key[0] + &shl ($rounds,4); + &mov ($rounds_,16); + &movdqa (&QWP(32,"esp"),$inout5); # save counter-less ivec^key[0] + &mov ($key_,$key); # backup $key + &sub ($rounds_,$rounds); # backup twisted $rounds + &lea ($key,&DWP(32,$key,$rounds)); + &sub ($len,6); + &jmp (&label("ctr32_loop6")); + +&set_label("ctr32_loop6",16); + # inlining _aesni_encrypt6's prologue gives ~6% improvement... 
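+  # The pshufd/pxor sequence below rebuilds the counter blocks for this
+  # iteration from the two byte-swapped counter triplets, merging in the
+  # counter-less ivec^key[0] saved at 32(%esp), which also performs the
+  # round-0 AddRoundKey.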
+ &pshufd ($inout2,$rndkey0,1<<6); + &movdqa ($rndkey0,&QWP(32,"esp")); # pull counter-less ivec + &pshufd ($inout3,$rndkey1,3<<6); + &pxor ($inout0,$rndkey0); # merge counter-less ivec + &pshufd ($inout4,$rndkey1,2<<6); + &pxor ($inout1,$rndkey0); + &pshufd ($inout5,$rndkey1,1<<6); + &$movekey ($rndkey1,&QWP(16,$key_)); + &pxor ($inout2,$rndkey0); + &pxor ($inout3,$rndkey0); + &aesenc ($inout0,$rndkey1); + &pxor ($inout4,$rndkey0); + &pxor ($inout5,$rndkey0); + &aesenc ($inout1,$rndkey1); + &$movekey ($rndkey0,&QWP(32,$key_)); + &mov ($rounds,$rounds_); + &aesenc ($inout2,$rndkey1); + &aesenc ($inout3,$rndkey1); + &aesenc ($inout4,$rndkey1); + &aesenc ($inout5,$rndkey1); + + &call (&label("_aesni_encrypt6_enter")); + + &movups ($rndkey1,&QWP(0,$inp)); + &movups ($rndkey0,&QWP(0x10,$inp)); + &xorps ($inout0,$rndkey1); + &movups ($rndkey1,&QWP(0x20,$inp)); + &xorps ($inout1,$rndkey0); + &movups (&QWP(0,$out),$inout0); + &movdqa ($rndkey0,&QWP(16,"esp")); # load increment + &xorps ($inout2,$rndkey1); + &movdqa ($rndkey1,&QWP(64,"esp")); # load 2nd triplet + &movups (&QWP(0x10,$out),$inout1); + &movups (&QWP(0x20,$out),$inout2); + + &paddd ($rndkey1,$rndkey0); # 2nd triplet increment + &paddd ($rndkey0,&QWP(48,"esp")); # 1st triplet increment + &movdqa ($inout0,&QWP(0,"esp")); # load byte swap mask + + &movups ($inout1,&QWP(0x30,$inp)); + &movups ($inout2,&QWP(0x40,$inp)); + &xorps ($inout3,$inout1); + &movups ($inout1,&QWP(0x50,$inp)); + &lea ($inp,&DWP(0x60,$inp)); + &movdqa (&QWP(48,"esp"),$rndkey0); # save 1st triplet + &pshufb ($rndkey0,$inout0); # byte swap + &xorps ($inout4,$inout2); + &movups (&QWP(0x30,$out),$inout3); + &xorps ($inout5,$inout1); + &movdqa (&QWP(64,"esp"),$rndkey1); # save 2nd triplet + &pshufb ($rndkey1,$inout0); # byte swap + &movups (&QWP(0x40,$out),$inout4); + &pshufd ($inout0,$rndkey0,3<<6); + &movups (&QWP(0x50,$out),$inout5); + &lea ($out,&DWP(0x60,$out)); + + &pshufd ($inout1,$rndkey0,2<<6); + &sub ($len,6); + &jnc (&label("ctr32_loop6")); + + &add ($len,6); + &jz (&label("ctr32_ret")); + &movdqu ($inout5,&QWP(0,$key_)); + &mov ($key,$key_); + &pxor ($inout5,&QWP(32,"esp")); # restore count-less ivec + &mov ($rounds,&DWP(240,$key_)); # restore $rounds + +&set_label("ctr32_tail"); + &por ($inout0,$inout5); + &cmp ($len,2); + &jb (&label("ctr32_one")); + + &pshufd ($inout2,$rndkey0,1<<6); + &por ($inout1,$inout5); + &je (&label("ctr32_two")); + + &pshufd ($inout3,$rndkey1,3<<6); + &por ($inout2,$inout5); + &cmp ($len,4); + &jb (&label("ctr32_three")); + + &pshufd ($inout4,$rndkey1,2<<6); + &por ($inout3,$inout5); + &je (&label("ctr32_four")); + + &por ($inout4,$inout5); + &call ("_aesni_encrypt6"); + &movups ($rndkey1,&QWP(0,$inp)); + &movups ($rndkey0,&QWP(0x10,$inp)); + &xorps ($inout0,$rndkey1); + &movups ($rndkey1,&QWP(0x20,$inp)); + &xorps ($inout1,$rndkey0); + &movups ($rndkey0,&QWP(0x30,$inp)); + &xorps ($inout2,$rndkey1); + &movups ($rndkey1,&QWP(0x40,$inp)); + &xorps ($inout3,$rndkey0); + &movups (&QWP(0,$out),$inout0); + &xorps ($inout4,$rndkey1); + &movups (&QWP(0x10,$out),$inout1); + &movups (&QWP(0x20,$out),$inout2); + &movups (&QWP(0x30,$out),$inout3); + &movups (&QWP(0x40,$out),$inout4); + &jmp (&label("ctr32_ret")); + +&set_label("ctr32_one_shortcut",16); + &movups ($inout0,&QWP(0,$rounds_)); # load ivec + &mov ($rounds,&DWP(240,$key)); + +&set_label("ctr32_one"); + if ($inline) + { &aesni_inline_generate1("enc"); } + else + { &call ("_aesni_encrypt1"); } + &movups ($in0,&QWP(0,$inp)); + &xorps ($in0,$inout0); + &movups (&QWP(0,$out),$in0); + 
&jmp (&label("ctr32_ret")); + +&set_label("ctr32_two",16); + &call ("_aesni_encrypt2"); + &movups ($inout3,&QWP(0,$inp)); + &movups ($inout4,&QWP(0x10,$inp)); + &xorps ($inout0,$inout3); + &xorps ($inout1,$inout4); + &movups (&QWP(0,$out),$inout0); + &movups (&QWP(0x10,$out),$inout1); + &jmp (&label("ctr32_ret")); + +&set_label("ctr32_three",16); + &call ("_aesni_encrypt3"); + &movups ($inout3,&QWP(0,$inp)); + &movups ($inout4,&QWP(0x10,$inp)); + &xorps ($inout0,$inout3); + &movups ($inout5,&QWP(0x20,$inp)); + &xorps ($inout1,$inout4); + &movups (&QWP(0,$out),$inout0); + &xorps ($inout2,$inout5); + &movups (&QWP(0x10,$out),$inout1); + &movups (&QWP(0x20,$out),$inout2); + &jmp (&label("ctr32_ret")); + +&set_label("ctr32_four",16); + &call ("_aesni_encrypt4"); + &movups ($inout4,&QWP(0,$inp)); + &movups ($inout5,&QWP(0x10,$inp)); + &movups ($rndkey1,&QWP(0x20,$inp)); + &xorps ($inout0,$inout4); + &movups ($rndkey0,&QWP(0x30,$inp)); + &xorps ($inout1,$inout5); + &movups (&QWP(0,$out),$inout0); + &xorps ($inout2,$rndkey1); + &movups (&QWP(0x10,$out),$inout1); + &xorps ($inout3,$rndkey0); + &movups (&QWP(0x20,$out),$inout2); + &movups (&QWP(0x30,$out),$inout3); + +&set_label("ctr32_ret"); + &pxor ("xmm0","xmm0"); # clear register bank + &pxor ("xmm1","xmm1"); + &pxor ("xmm2","xmm2"); + &pxor ("xmm3","xmm3"); + &pxor ("xmm4","xmm4"); + &movdqa (&QWP(32,"esp"),"xmm0"); # clear stack + &pxor ("xmm5","xmm5"); + &movdqa (&QWP(48,"esp"),"xmm0"); + &pxor ("xmm6","xmm6"); + &movdqa (&QWP(64,"esp"),"xmm0"); + &pxor ("xmm7","xmm7"); + &mov ("esp",&DWP(80,"esp")); +&function_end("${PREFIX}_ctr32_encrypt_blocks"); +} + +###################################################################### +# Mechanical port from aesni-x86_64.pl. + +# int $PREFIX_set_encrypt_key_base (const unsigned char *userKey, int bits, +# AES_KEY *key) +&function_begin_B("${PREFIX}_set_encrypt_key_base"); + &record_function_hit(3); + + &mov ("eax",&wparam(0)); + &mov ($rounds,&wparam(1)); + &mov ($key,&wparam(2)); + &push ("ebx"); + + &call (&label("pic")); +&set_label("pic"); + &blindpop("ebx"); + &lea ("ebx",&DWP(&label("key_const")."-".&label("pic"),"ebx")); + + &movups ("xmm0",&QWP(0,"eax")); # pull first 128 bits of *userKey + &xorps ("xmm4","xmm4"); # low dword of xmm4 is assumed 0 + &lea ($key,&DWP(16,$key)); + &cmp ($rounds,256); + &je (&label("14rounds")); + # 192-bit key support was removed. 
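+	# Only 128- and 256-bit keys are accepted: any other bit count in
+	# $rounds is rejected via bad_keybits below and returns -2.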
+ &cmp ($rounds,128); + &jne (&label("bad_keybits")); + +&set_label("10rounds",16); + &mov ($rounds,9); + &$movekey (&QWP(-16,$key),"xmm0"); # round 0 + &aeskeygenassist("xmm1","xmm0",0x01); # round 1 + &call (&label("key_128_cold")); + &aeskeygenassist("xmm1","xmm0",0x2); # round 2 + &call (&label("key_128")); + &aeskeygenassist("xmm1","xmm0",0x04); # round 3 + &call (&label("key_128")); + &aeskeygenassist("xmm1","xmm0",0x08); # round 4 + &call (&label("key_128")); + &aeskeygenassist("xmm1","xmm0",0x10); # round 5 + &call (&label("key_128")); + &aeskeygenassist("xmm1","xmm0",0x20); # round 6 + &call (&label("key_128")); + &aeskeygenassist("xmm1","xmm0",0x40); # round 7 + &call (&label("key_128")); + &aeskeygenassist("xmm1","xmm0",0x80); # round 8 + &call (&label("key_128")); + &aeskeygenassist("xmm1","xmm0",0x1b); # round 9 + &call (&label("key_128")); + &aeskeygenassist("xmm1","xmm0",0x36); # round 10 + &call (&label("key_128")); + &$movekey (&QWP(0,$key),"xmm0"); + &mov (&DWP(80,$key),$rounds); + + &jmp (&label("good_key")); + +&set_label("key_128",16); + &$movekey (&QWP(0,$key),"xmm0"); + &lea ($key,&DWP(16,$key)); +&set_label("key_128_cold"); + &shufps ("xmm4","xmm0",0b00010000); + &xorps ("xmm0","xmm4"); + &shufps ("xmm4","xmm0",0b10001100); + &xorps ("xmm0","xmm4"); + &shufps ("xmm1","xmm1",0b11111111); # critical path + &xorps ("xmm0","xmm1"); + &ret(); + +&set_label("14rounds",16); + &movups ("xmm2",&QWP(16,"eax")); # remaining half of *userKey + &lea ($key,&DWP(16,$key)); + + &mov ($rounds,13); + &$movekey (&QWP(-32,$key),"xmm0"); # round 0 + &$movekey (&QWP(-16,$key),"xmm2"); # round 1 + &aeskeygenassist("xmm1","xmm2",0x01); # round 2 + &call (&label("key_256a_cold")); + &aeskeygenassist("xmm1","xmm0",0x01); # round 3 + &call (&label("key_256b")); + &aeskeygenassist("xmm1","xmm2",0x02); # round 4 + &call (&label("key_256a")); + &aeskeygenassist("xmm1","xmm0",0x02); # round 5 + &call (&label("key_256b")); + &aeskeygenassist("xmm1","xmm2",0x04); # round 6 + &call (&label("key_256a")); + &aeskeygenassist("xmm1","xmm0",0x04); # round 7 + &call (&label("key_256b")); + &aeskeygenassist("xmm1","xmm2",0x08); # round 8 + &call (&label("key_256a")); + &aeskeygenassist("xmm1","xmm0",0x08); # round 9 + &call (&label("key_256b")); + &aeskeygenassist("xmm1","xmm2",0x10); # round 10 + &call (&label("key_256a")); + &aeskeygenassist("xmm1","xmm0",0x10); # round 11 + &call (&label("key_256b")); + &aeskeygenassist("xmm1","xmm2",0x20); # round 12 + &call (&label("key_256a")); + &aeskeygenassist("xmm1","xmm0",0x20); # round 13 + &call (&label("key_256b")); + &aeskeygenassist("xmm1","xmm2",0x40); # round 14 + &call (&label("key_256a")); + &$movekey (&QWP(0,$key),"xmm0"); + &mov (&DWP(16,$key),$rounds); + &xor ("eax","eax"); + + &jmp (&label("good_key")); + +&set_label("key_256a",16); + &$movekey (&QWP(0,$key),"xmm2"); + &lea ($key,&DWP(16,$key)); +&set_label("key_256a_cold"); + &shufps ("xmm4","xmm0",0b00010000); + &xorps ("xmm0","xmm4"); + &shufps ("xmm4","xmm0",0b10001100); + &xorps ("xmm0","xmm4"); + &shufps ("xmm1","xmm1",0b11111111); # critical path + &xorps ("xmm0","xmm1"); + &ret(); + +&set_label("key_256b",16); + &$movekey (&QWP(0,$key),"xmm0"); + &lea ($key,&DWP(16,$key)); + + &shufps ("xmm4","xmm2",0b00010000); + &xorps ("xmm2","xmm4"); + &shufps ("xmm4","xmm2",0b10001100); + &xorps ("xmm2","xmm4"); + &shufps ("xmm1","xmm1",0b10101010); # critical path + &xorps ("xmm2","xmm1"); + &ret(); + +&set_label("good_key"); + &pxor ("xmm0","xmm0"); + &pxor ("xmm1","xmm1"); + &pxor ("xmm2","xmm2"); + 
&pxor ("xmm3","xmm3"); + &pxor ("xmm4","xmm4"); + &pxor ("xmm5","xmm5"); + &xor ("eax","eax"); + &pop ("ebx"); + &ret (); + +&set_label("bad_keybits",4); + &pxor ("xmm0","xmm0"); + &mov ("eax",-2); + &pop ("ebx"); + &ret (); +&function_end_B("${PREFIX}_set_encrypt_key_base"); + +# int $PREFIX_set_encrypt_key_alt (const unsigned char *userKey, int bits, +# AES_KEY *key) +&function_begin_B("${PREFIX}_set_encrypt_key_alt"); + &record_function_hit(3); + + &mov ("eax",&wparam(0)); + &mov ($rounds,&wparam(1)); + &mov ($key,&wparam(2)); + &push ("ebx"); + + &call (&label("pic")); +&set_label("pic"); + &blindpop("ebx"); + &lea ("ebx",&DWP(&label("key_const")."-".&label("pic"),"ebx")); + + &movups ("xmm0",&QWP(0,"eax")); # pull first 128 bits of *userKey + &xorps ("xmm4","xmm4"); # low dword of xmm4 is assumed 0 + &lea ($key,&DWP(16,$key)); + &cmp ($rounds,256); + &je (&label("14rounds_alt")); + # 192-bit key support was removed. + &cmp ($rounds,128); + &jne (&label("bad_keybits")); + +&set_label("10rounds_alt",16); + &movdqa ("xmm5",&QWP(0x00,"ebx")); + &mov ($rounds,8); + &movdqa ("xmm4",&QWP(0x20,"ebx")); + &movdqa ("xmm2","xmm0"); + &movdqu (&QWP(-16,$key),"xmm0"); + +&set_label("loop_key128"); + &pshufb ("xmm0","xmm5"); + &aesenclast ("xmm0","xmm4"); + &pslld ("xmm4",1); + &lea ($key,&DWP(16,$key)); + + &movdqa ("xmm3","xmm2"); + &pslldq ("xmm2",4); + &pxor ("xmm3","xmm2"); + &pslldq ("xmm2",4); + &pxor ("xmm3","xmm2"); + &pslldq ("xmm2",4); + &pxor ("xmm2","xmm3"); + + &pxor ("xmm0","xmm2"); + &movdqu (&QWP(-16,$key),"xmm0"); + &movdqa ("xmm2","xmm0"); + + &dec ($rounds); + &jnz (&label("loop_key128")); + + &movdqa ("xmm4",&QWP(0x30,"ebx")); + + &pshufb ("xmm0","xmm5"); + &aesenclast ("xmm0","xmm4"); + &pslld ("xmm4",1); + + &movdqa ("xmm3","xmm2"); + &pslldq ("xmm2",4); + &pxor ("xmm3","xmm2"); + &pslldq ("xmm2",4); + &pxor ("xmm3","xmm2"); + &pslldq ("xmm2",4); + &pxor ("xmm2","xmm3"); + + &pxor ("xmm0","xmm2"); + &movdqu (&QWP(0,$key),"xmm0"); + + &movdqa ("xmm2","xmm0"); + &pshufb ("xmm0","xmm5"); + &aesenclast ("xmm0","xmm4"); + + &movdqa ("xmm3","xmm2"); + &pslldq ("xmm2",4); + &pxor ("xmm3","xmm2"); + &pslldq ("xmm2",4); + &pxor ("xmm3","xmm2"); + &pslldq ("xmm2",4); + &pxor ("xmm2","xmm3"); + + &pxor ("xmm0","xmm2"); + &movdqu (&QWP(16,$key),"xmm0"); + + &mov ($rounds,9); + &mov (&DWP(96,$key),$rounds); + + &jmp (&label("good_key")); + + # 192-bit key support was removed. 
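+	# The 256-bit path below applies the same pshufb/aesenclast recurrence
+	# alternately to the two halves of the user key (xmm0 and xmm2); seven
+	# passes of loop_key256 extend the two raw halves into the full
+	# 15-entry schedule.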
+ +&set_label("14rounds_alt",16); + &movups ("xmm2",&QWP(16,"eax")); # remaining half of *userKey + &lea ($key,&DWP(16,$key)); + &movdqa ("xmm5",&QWP(0x00,"ebx")); + &movdqa ("xmm4",&QWP(0x20,"ebx")); + &mov ($rounds,7); + &movdqu (&QWP(-32,$key),"xmm0"); + &movdqa ("xmm1","xmm2"); + &movdqu (&QWP(-16,$key),"xmm2"); + +&set_label("loop_key256"); + &pshufb ("xmm2","xmm5"); + &aesenclast ("xmm2","xmm4"); + + &movdqa ("xmm3","xmm0"); + &pslldq ("xmm0",4); + &pxor ("xmm3","xmm0"); + &pslldq ("xmm0",4); + &pxor ("xmm3","xmm0"); + &pslldq ("xmm0",4); + &pxor ("xmm0","xmm3"); + &pslld ("xmm4",1); + + &pxor ("xmm0","xmm2"); + &movdqu (&QWP(0,$key),"xmm0"); + + &dec ($rounds); + &jz (&label("done_key256")); + + &pshufd ("xmm2","xmm0",0xff); + &pxor ("xmm3","xmm3"); + &aesenclast ("xmm2","xmm3"); + + &movdqa ("xmm3","xmm1"); + &pslldq ("xmm1",4); + &pxor ("xmm3","xmm1"); + &pslldq ("xmm1",4); + &pxor ("xmm3","xmm1"); + &pslldq ("xmm1",4); + &pxor ("xmm1","xmm3"); + + &pxor ("xmm2","xmm1"); + &movdqu (&QWP(16,$key),"xmm2"); + &lea ($key,&DWP(32,$key)); + &movdqa ("xmm1","xmm2"); + &jmp (&label("loop_key256")); + +&set_label("done_key256"); + &mov ($rounds,13); + &mov (&DWP(16,$key),$rounds); + +&set_label("good_key"); + &pxor ("xmm0","xmm0"); + &pxor ("xmm1","xmm1"); + &pxor ("xmm2","xmm2"); + &pxor ("xmm3","xmm3"); + &pxor ("xmm4","xmm4"); + &pxor ("xmm5","xmm5"); + &xor ("eax","eax"); + &pop ("ebx"); + &ret (); + +&set_label("bad_keybits",4); + &pxor ("xmm0","xmm0"); + &mov ("eax",-2); + &pop ("ebx"); + &ret (); +&function_end_B("${PREFIX}_set_encrypt_key_alt"); + + +&set_label("key_const",64); +&data_word(0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d); +&data_word(0x04070605,0x04070605,0x04070605,0x04070605); +&data_word(1,1,1,1); +&data_word(0x1b,0x1b,0x1b,0x1b); +&asciz("AES for Intel AES-NI, CRYPTOGAMS by "); + +&asm_finish(); + +close STDOUT or die "error closing STDOUT: $!"; diff --git a/ring-0.17.14/crypto/fipsmodule/aes/asm/aesni-x86_64.pl b/ring-0.17.14/crypto/fipsmodule/aes/asm/aesni-x86_64.pl new file mode 100644 index 0000000000..94fd3a95ab --- /dev/null +++ b/ring-0.17.14/crypto/fipsmodule/aes/asm/aesni-x86_64.pl @@ -0,0 +1,1600 @@ +#! /usr/bin/env perl +# Copyright 2009-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. +# ==================================================================== +# +# This module implements support for Intel AES-NI extension. In +# OpenSSL context it's used with Intel engine, but can also be used as +# drop-in replacement for crypto/aes/asm/aes-x86_64.pl [see below for +# details]. +# +# Performance. +# +# Given aes(enc|dec) instructions' latency asymptotic performance for +# non-parallelizable modes such as CBC encrypt is 3.75 cycles per byte +# processed with 128-bit key. And given their throughput asymptotic +# performance for parallelizable modes is 1.25 cycles per byte. 
Being +# asymptotic limit it's not something you commonly achieve in reality, +# but how close does one get? Below are results collected for +# different modes and block sized. Pairs of numbers are for en-/ +# decryption. +# +# 16-byte 64-byte 256-byte 1-KB 8-KB +# ECB 4.25/4.25 1.38/1.38 1.28/1.28 1.26/1.26 1.26/1.26 +# CTR 5.42/5.42 1.92/1.92 1.44/1.44 1.28/1.28 1.26/1.26 +# CBC 4.38/4.43 4.15/1.43 4.07/1.32 4.07/1.29 4.06/1.28 +# CCM 5.66/9.42 4.42/5.41 4.16/4.40 4.09/4.15 4.06/4.07 +# OFB 5.42/5.42 4.64/4.64 4.44/4.44 4.39/4.39 4.38/4.38 +# CFB 5.73/5.85 5.56/5.62 5.48/5.56 5.47/5.55 5.47/5.55 +# +# ECB, CTR, CBC and CCM results are free from EVP overhead. This means +# that otherwise used 'openssl speed -evp aes-128-??? -engine aesni +# [-decrypt]' will exhibit 10-15% worse results for smaller blocks. +# The results were collected with specially crafted speed.c benchmark +# in order to compare them with results reported in "Intel Advanced +# Encryption Standard (AES) New Instruction Set" White Paper Revision +# 3.0 dated May 2010. All above results are consistently better. This +# module also provides better performance for block sizes smaller than +# 128 bytes in points *not* represented in the above table. +# +# Looking at the results for 8-KB buffer. +# +# CFB and OFB results are far from the limit, because implementation +# uses "generic" CRYPTO_[c|o]fb128_encrypt interfaces relying on +# single-block aesni_encrypt, which is not the most optimal way to go. +# CBC encrypt result is unexpectedly high and there is no documented +# explanation for it. Seemingly there is a small penalty for feeding +# the result back to AES unit the way it's done in CBC mode. There is +# nothing one can do and the result appears optimal. CCM result is +# identical to CBC, because CBC-MAC is essentially CBC encrypt without +# saving output. CCM CTR "stays invisible," because it's neatly +# interleaved wih CBC-MAC. This provides ~30% improvement over +# "straightforward" CCM implementation with CTR and CBC-MAC performed +# disjointly. Parallelizable modes practically achieve the theoretical +# limit. +# +# Looking at how results vary with buffer size. +# +# Curves are practically saturated at 1-KB buffer size. In most cases +# "256-byte" performance is >95%, and "64-byte" is ~90% of "8-KB" one. +# CTR curve doesn't follow this pattern and is "slowest" changing one +# with "256-byte" result being 87% of "8-KB." This is because overhead +# in CTR mode is most computationally intensive. Small-block CCM +# decrypt is slower than encrypt, because first CTR and last CBC-MAC +# iterations can't be interleaved. +# +# Results for 192- and 256-bit keys. +# +# EVP-free results were observed to scale perfectly with number of +# rounds for larger block sizes, i.e. 192-bit result being 10/12 times +# lower and 256-bit one - 10/14. Well, in CBC encrypt case differences +# are a tad smaller, because the above mentioned penalty biases all +# results by same constant value. In similar way function call +# overhead affects small-block performance, as well as OFB and CFB +# results. Differences are not large, most common coefficients are +# 10/11.7 and 10/13.4 (as opposite to 10/12.0 and 10/14.0), but one +# observe even 10/11.2 and 10/12.4 (CTR, OFB, CFB)... + +# January 2011 +# +# While Westmere processor features 6 cycles latency for aes[enc|dec] +# instructions, which can be scheduled every second cycle, Sandy +# Bridge spends 8 cycles per instruction, but it can schedule them +# every cycle. 
This means that code targeting Westmere would perform +# suboptimally on Sandy Bridge. Therefore this update. +# +# In addition, non-parallelizable CBC encrypt (as well as CCM) is +# optimized. Relative improvement might appear modest, 8% on Westmere, +# but in absolute terms it's 3.77 cycles per byte encrypted with +# 128-bit key on Westmere, and 5.07 - on Sandy Bridge. These numbers +# should be compared to asymptotic limits of 3.75 for Westmere and +# 5.00 for Sandy Bridge. Actually, the fact that they get this close +# to asymptotic limits is quite amazing. Indeed, the limit is +# calculated as latency times number of rounds, 10 for 128-bit key, +# and divided by 16, the number of bytes in block, or in other words +# it accounts *solely* for aesenc instructions. But there are extra +# instructions, and numbers so close to the asymptotic limits mean +# that it's as if it takes as little as *one* additional cycle to +# execute all of them. How is it possible? It is possible thanks to +# out-of-order execution logic, which manages to overlap post- +# processing of previous block, things like saving the output, with +# actual encryption of current block, as well as pre-processing of +# current block, things like fetching input and xor-ing it with +# 0-round element of the key schedule, with actual encryption of +# previous block. Keep this in mind... +# +# For parallelizable modes, such as ECB, CBC decrypt, CTR, higher +# performance is achieved by interleaving instructions working on +# independent blocks. In which case asymptotic limit for such modes +# can be obtained by dividing above mentioned numbers by AES +# instructions' interleave factor. Westmere can execute at most 3 +# instructions at a time, meaning that optimal interleave factor is 3, +# and that's where the "magic" number of 1.25 come from. "Optimal +# interleave factor" means that increase of interleave factor does +# not improve performance. The formula has proven to reflect reality +# pretty well on Westmere... Sandy Bridge on the other hand can +# execute up to 8 AES instructions at a time, so how does varying +# interleave factor affect the performance? Here is table for ECB +# (numbers are cycles per byte processed with 128-bit key): +# +# instruction interleave factor 3x 6x 8x +# theoretical asymptotic limit 1.67 0.83 0.625 +# measured performance for 8KB block 1.05 0.86 0.84 +# +# "as if" interleave factor 4.7x 5.8x 6.0x +# +# Further data for other parallelizable modes: +# +# CBC decrypt 1.16 0.93 0.74 +# CTR 1.14 0.91 0.74 +# +# Well, given 3x column it's probably inappropriate to call the limit +# asymptotic, if it can be surpassed, isn't it? What happens there? +# Rewind to CBC paragraph for the answer. Yes, out-of-order execution +# magic is responsible for this. Processor overlaps not only the +# additional instructions with AES ones, but even AES instructions +# processing adjacent triplets of independent blocks. In the 6x case +# additional instructions still claim disproportionally small amount +# of additional cycles, but in 8x case number of instructions must be +# a tad too high for out-of-order logic to cope with, and AES unit +# remains underutilized... As you can see 8x interleave is hardly +# justifiable, so there no need to feel bad that 32-bit aesni-x86.pl +# utilizes 6x interleave because of limited register bank capacity. +# +# Higher interleave factors do have negative impact on Westmere +# performance. 
While for ECB mode it's negligible ~1.5%, other +# parallelizables perform ~5% worse, which is outweighed by ~25% +# improvement on Sandy Bridge. To balance regression on Westmere +# CTR mode was implemented with 6x aesenc interleave factor. + +# April 2011 +# +# Add aesni_xts_[en|de]crypt. Westmere spends 1.25 cycles processing +# one byte out of 8KB with 128-bit key, Sandy Bridge - 0.90. Just like +# in CTR mode AES instruction interleave factor was chosen to be 6x. + +###################################################################### +# Current large-block performance in cycles per byte processed with +# 128-bit key (less is better). +# +# CBC en-/decrypt CTR XTS ECB OCB +# Westmere 3.77/1.25 1.25 1.25 1.26 +# * Bridge 5.07/0.74 0.75 0.90 0.85 0.98 +# Haswell 4.44/0.63 0.63 0.73 0.63 0.70 +# Skylake 2.62/0.63 0.63 0.63 0.63 +# Silvermont 5.75/3.54 3.56 4.12 3.87(*) 4.11 +# Knights L 2.54/0.77 0.78 0.85 - 1.50 +# Goldmont 3.82/1.26 1.26 1.29 1.29 1.50 +# Bulldozer 5.77/0.70 0.72 0.90 0.70 0.95 +# Ryzen 2.71/0.35 0.35 0.44 0.38 0.49 +# +# (*) Atom Silvermont ECB result is suboptimal because of penalties +# incurred by operations on %xmm8-15. As ECB is not considered +# critical, nothing was done to mitigate the problem. + +$PREFIX="aes_hw"; # if $PREFIX is set to "AES", the script + # generates drop-in replacement for + # crypto/aes/asm/aes-x86_64.pl:-) + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; +*STDOUT=*OUT; + +$movkey = $PREFIX eq "aes_hw" ? "movups" : "movups"; +@_4args=$win64? ("%rcx","%rdx","%r8", "%r9") : # Win64 order + ("%rdi","%rsi","%rdx","%rcx"); # Unix order + +$code=".text\n"; + +$rounds="%eax"; # input to and changed by aesni_[en|de]cryptN !!! +# this is natural Unix argument order for public $PREFIX_[ecb|cbc]_encrypt ... +$inp="%rdi"; +$out="%rsi"; +$len="%rdx"; +$key="%rcx"; # input to and changed by aesni_[en|de]cryptN !!! +$ivp="%r8"; # cbc, ctr, ... + +$rnds_="%r10d"; # backup copy for $rounds +$key_="%r11"; # backup copy for $key + +# %xmm register layout +$rndkey0="%xmm0"; $rndkey1="%xmm1"; +$inout0="%xmm2"; $inout1="%xmm3"; +$inout2="%xmm4"; $inout3="%xmm5"; +$inout4="%xmm6"; $inout5="%xmm7"; +$inout6="%xmm8"; $inout7="%xmm9"; + +$in2="%xmm6"; $in1="%xmm7"; # used in CBC decrypt, CTR, ... +$in0="%xmm8"; $iv="%xmm9"; + +# Inline version of internal aesni_[en|de]crypt1. +# +# Why folded loop? Because aes[enc|dec] is slow enough to accommodate +# cycles which take care of loop variables... +{ my $sn; +sub aesni_generate1 { +my ($p,$key,$rounds,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout)); +++$sn; +$code.=<<___; + $movkey ($key),$rndkey0 + $movkey 16($key),$rndkey1 +___ +$code.=<<___ if (defined($ivec)); + xorps $rndkey0,$ivec + lea 32($key),$key + xorps $ivec,$inout +___ +$code.=<<___ if (!defined($ivec)); + lea 32($key),$key + xorps $rndkey0,$inout +___ +$code.=<<___; +.Loop_${p}1_$sn: + aes${p} $rndkey1,$inout + dec $rounds + $movkey ($key),$rndkey1 + lea 16($key),$key + jnz .Loop_${p}1_$sn # loop body is 16 bytes + aes${p}last $rndkey1,$inout +___ +}} + +# _aesni_[en|de]cryptN are private interfaces, N denotes interleave +# factor. 
Why 3x subroutine were originally used in loops? Even though +# aes[enc|dec] latency was originally 6, it could be scheduled only +# every *2nd* cycle. Thus 3x interleave was the one providing optimal +# utilization, i.e. when subroutine's throughput is virtually same as +# of non-interleaved subroutine [for number of input blocks up to 3]. +# This is why it originally made no sense to implement 2x subroutine. +# But times change and it became appropriate to spend extra 192 bytes +# on 2x subroutine on Atom Silvermont account. For processors that +# can schedule aes[enc|dec] every cycle optimal interleave factor +# equals to corresponding instructions latency. 8x is optimal for +# * Bridge and "super-optimal" for other Intel CPUs... + +sub aesni_generate2 { +my $dir=shift; +# As already mentioned it takes in $key and $rounds, which are *not* +# preserved. $inout[0-1] is cipher/clear text... +$code.=<<___; +.type _aesni_${dir}rypt2,\@abi-omnipotent +.align 16 +_aesni_${dir}rypt2: +.cfi_startproc + $movkey ($key),$rndkey0 + shl \$4,$rounds + $movkey 16($key),$rndkey1 + xorps $rndkey0,$inout0 + xorps $rndkey0,$inout1 + $movkey 32($key),$rndkey0 + lea 32($key,$rounds),$key + neg %rax # $rounds + add \$16,%rax + +.L${dir}_loop2: + aes${dir} $rndkey1,$inout0 + aes${dir} $rndkey1,$inout1 + $movkey ($key,%rax),$rndkey1 + add \$32,%rax + aes${dir} $rndkey0,$inout0 + aes${dir} $rndkey0,$inout1 + $movkey -16($key,%rax),$rndkey0 + jnz .L${dir}_loop2 + + aes${dir} $rndkey1,$inout0 + aes${dir} $rndkey1,$inout1 + aes${dir}last $rndkey0,$inout0 + aes${dir}last $rndkey0,$inout1 + ret +.cfi_endproc +.size _aesni_${dir}rypt2,.-_aesni_${dir}rypt2 +___ +} +sub aesni_generate3 { +my $dir=shift; +# As already mentioned it takes in $key and $rounds, which are *not* +# preserved. $inout[0-2] is cipher/clear text... +$code.=<<___; +.type _aesni_${dir}rypt3,\@abi-omnipotent +.align 16 +_aesni_${dir}rypt3: +.cfi_startproc + $movkey ($key),$rndkey0 + shl \$4,$rounds + $movkey 16($key),$rndkey1 + xorps $rndkey0,$inout0 + xorps $rndkey0,$inout1 + xorps $rndkey0,$inout2 + $movkey 32($key),$rndkey0 + lea 32($key,$rounds),$key + neg %rax # $rounds + add \$16,%rax + +.L${dir}_loop3: + aes${dir} $rndkey1,$inout0 + aes${dir} $rndkey1,$inout1 + aes${dir} $rndkey1,$inout2 + $movkey ($key,%rax),$rndkey1 + add \$32,%rax + aes${dir} $rndkey0,$inout0 + aes${dir} $rndkey0,$inout1 + aes${dir} $rndkey0,$inout2 + $movkey -16($key,%rax),$rndkey0 + jnz .L${dir}_loop3 + + aes${dir} $rndkey1,$inout0 + aes${dir} $rndkey1,$inout1 + aes${dir} $rndkey1,$inout2 + aes${dir}last $rndkey0,$inout0 + aes${dir}last $rndkey0,$inout1 + aes${dir}last $rndkey0,$inout2 + ret +.cfi_endproc +.size _aesni_${dir}rypt3,.-_aesni_${dir}rypt3 +___ +} +# 4x interleave is implemented to improve small block performance, +# most notably [and naturally] 4 block by ~30%. One can argue that one +# should have implemented 5x as well, but improvement would be <20%, +# so it's not worth it... +sub aesni_generate4 { +my $dir=shift; +# As already mentioned it takes in $key and $rounds, which are *not* +# preserved. $inout[0-3] is cipher/clear text... 
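+# Like the other _aesni_[en|de]cryptN bodies, the round loop is driven by a
+# negative index into the key schedule: $rounds is scaled to bytes (shl 4)
+# and %rax counts up toward zero, so the same jnz-terminated loop serves
+# both 10- and 14-round keys.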
+$code.=<<___; +.type _aesni_${dir}rypt4,\@abi-omnipotent +.align 16 +_aesni_${dir}rypt4: +.cfi_startproc + $movkey ($key),$rndkey0 + shl \$4,$rounds + $movkey 16($key),$rndkey1 + xorps $rndkey0,$inout0 + xorps $rndkey0,$inout1 + xorps $rndkey0,$inout2 + xorps $rndkey0,$inout3 + $movkey 32($key),$rndkey0 + lea 32($key,$rounds),$key + neg %rax # $rounds + .byte 0x0f,0x1f,0x00 + add \$16,%rax + +.L${dir}_loop4: + aes${dir} $rndkey1,$inout0 + aes${dir} $rndkey1,$inout1 + aes${dir} $rndkey1,$inout2 + aes${dir} $rndkey1,$inout3 + $movkey ($key,%rax),$rndkey1 + add \$32,%rax + aes${dir} $rndkey0,$inout0 + aes${dir} $rndkey0,$inout1 + aes${dir} $rndkey0,$inout2 + aes${dir} $rndkey0,$inout3 + $movkey -16($key,%rax),$rndkey0 + jnz .L${dir}_loop4 + + aes${dir} $rndkey1,$inout0 + aes${dir} $rndkey1,$inout1 + aes${dir} $rndkey1,$inout2 + aes${dir} $rndkey1,$inout3 + aes${dir}last $rndkey0,$inout0 + aes${dir}last $rndkey0,$inout1 + aes${dir}last $rndkey0,$inout2 + aes${dir}last $rndkey0,$inout3 + ret +.cfi_endproc +.size _aesni_${dir}rypt4,.-_aesni_${dir}rypt4 +___ +} +sub aesni_generate6 { +my $dir=shift; +# As already mentioned it takes in $key and $rounds, which are *not* +# preserved. $inout[0-5] is cipher/clear text... +$code.=<<___; +.type _aesni_${dir}rypt6,\@abi-omnipotent +.align 16 +_aesni_${dir}rypt6: +.cfi_startproc + $movkey ($key),$rndkey0 + shl \$4,$rounds + $movkey 16($key),$rndkey1 + xorps $rndkey0,$inout0 + pxor $rndkey0,$inout1 + pxor $rndkey0,$inout2 + aes${dir} $rndkey1,$inout0 + lea 32($key,$rounds),$key + neg %rax # $rounds + aes${dir} $rndkey1,$inout1 + pxor $rndkey0,$inout3 + pxor $rndkey0,$inout4 + aes${dir} $rndkey1,$inout2 + pxor $rndkey0,$inout5 + $movkey ($key,%rax),$rndkey0 + add \$16,%rax + jmp .L${dir}_loop6_enter +.align 16 +.L${dir}_loop6: + aes${dir} $rndkey1,$inout0 + aes${dir} $rndkey1,$inout1 + aes${dir} $rndkey1,$inout2 +.L${dir}_loop6_enter: + aes${dir} $rndkey1,$inout3 + aes${dir} $rndkey1,$inout4 + aes${dir} $rndkey1,$inout5 + $movkey ($key,%rax),$rndkey1 + add \$32,%rax + aes${dir} $rndkey0,$inout0 + aes${dir} $rndkey0,$inout1 + aes${dir} $rndkey0,$inout2 + aes${dir} $rndkey0,$inout3 + aes${dir} $rndkey0,$inout4 + aes${dir} $rndkey0,$inout5 + $movkey -16($key,%rax),$rndkey0 + jnz .L${dir}_loop6 + + aes${dir} $rndkey1,$inout0 + aes${dir} $rndkey1,$inout1 + aes${dir} $rndkey1,$inout2 + aes${dir} $rndkey1,$inout3 + aes${dir} $rndkey1,$inout4 + aes${dir} $rndkey1,$inout5 + aes${dir}last $rndkey0,$inout0 + aes${dir}last $rndkey0,$inout1 + aes${dir}last $rndkey0,$inout2 + aes${dir}last $rndkey0,$inout3 + aes${dir}last $rndkey0,$inout4 + aes${dir}last $rndkey0,$inout5 + ret +.cfi_endproc +.size _aesni_${dir}rypt6,.-_aesni_${dir}rypt6 +___ +} +sub aesni_generate8 { +my $dir=shift; +# As already mentioned it takes in $key and $rounds, which are *not* +# preserved. $inout[0-7] is cipher/clear text... 
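+# 8x matches aes[enc|dec] latency on processors that can issue one AES
+# instruction per cycle (see the interleave discussion above) and is the
+# width used by the bulk CTR32 loop below.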
+$code.=<<___; +.type _aesni_${dir}rypt8,\@abi-omnipotent +.align 16 +_aesni_${dir}rypt8: +.cfi_startproc + $movkey ($key),$rndkey0 + shl \$4,$rounds + $movkey 16($key),$rndkey1 + xorps $rndkey0,$inout0 + xorps $rndkey0,$inout1 + pxor $rndkey0,$inout2 + pxor $rndkey0,$inout3 + pxor $rndkey0,$inout4 + lea 32($key,$rounds),$key + neg %rax # $rounds + aes${dir} $rndkey1,$inout0 + pxor $rndkey0,$inout5 + pxor $rndkey0,$inout6 + aes${dir} $rndkey1,$inout1 + pxor $rndkey0,$inout7 + $movkey ($key,%rax),$rndkey0 + add \$16,%rax + jmp .L${dir}_loop8_inner +.align 16 +.L${dir}_loop8: + aes${dir} $rndkey1,$inout0 + aes${dir} $rndkey1,$inout1 +.L${dir}_loop8_inner: + aes${dir} $rndkey1,$inout2 + aes${dir} $rndkey1,$inout3 + aes${dir} $rndkey1,$inout4 + aes${dir} $rndkey1,$inout5 + aes${dir} $rndkey1,$inout6 + aes${dir} $rndkey1,$inout7 +.L${dir}_loop8_enter: + $movkey ($key,%rax),$rndkey1 + add \$32,%rax + aes${dir} $rndkey0,$inout0 + aes${dir} $rndkey0,$inout1 + aes${dir} $rndkey0,$inout2 + aes${dir} $rndkey0,$inout3 + aes${dir} $rndkey0,$inout4 + aes${dir} $rndkey0,$inout5 + aes${dir} $rndkey0,$inout6 + aes${dir} $rndkey0,$inout7 + $movkey -16($key,%rax),$rndkey0 + jnz .L${dir}_loop8 + + aes${dir} $rndkey1,$inout0 + aes${dir} $rndkey1,$inout1 + aes${dir} $rndkey1,$inout2 + aes${dir} $rndkey1,$inout3 + aes${dir} $rndkey1,$inout4 + aes${dir} $rndkey1,$inout5 + aes${dir} $rndkey1,$inout6 + aes${dir} $rndkey1,$inout7 + aes${dir}last $rndkey0,$inout0 + aes${dir}last $rndkey0,$inout1 + aes${dir}last $rndkey0,$inout2 + aes${dir}last $rndkey0,$inout3 + aes${dir}last $rndkey0,$inout4 + aes${dir}last $rndkey0,$inout5 + aes${dir}last $rndkey0,$inout6 + aes${dir}last $rndkey0,$inout7 + ret +.cfi_endproc +.size _aesni_${dir}rypt8,.-_aesni_${dir}rypt8 +___ +} +&aesni_generate2("enc") if ($PREFIX eq "aes_hw"); +&aesni_generate3("enc") if ($PREFIX eq "aes_hw"); +&aesni_generate4("enc") if ($PREFIX eq "aes_hw"); +&aesni_generate6("enc") if ($PREFIX eq "aes_hw"); +&aesni_generate8("enc") if ($PREFIX eq "aes_hw"); + +if ($PREFIX eq "aes_hw") { +{ +###################################################################### +# void aesni_ctr32_encrypt_blocks (const void *in, void *out, +# size_t blocks, const AES_KEY *key, +# const char *ivec); +# +# Handles only complete blocks, operates on 32-bit counter and +# does not update *ivec! (see crypto/modes/ctr128.c for details) +# +# Overhaul based on suggestions from Shay Gueron and Vlad Krasnov, +# http://rt.openssl.org/Ticket/Display.html?id=3021&user=guest&pass=guest. +# Keywords are full unroll and modulo-schedule counter calculations +# with zero-round key xor. 
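+# Concretely, block i needs E_key(ivec[0..11] || BE32(ctr+i)).  The eight
+# counter blocks kept on the stack are stored already xor-ed with the
+# 0-round key, and only their last dword is refreshed each iteration with
+# bswap(ctr+i) ^ key[0][12..15], so per-block counter maintenance is just a
+# 32-bit bswap, xor and store.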
+{ +my ($in0,$in1,$in2,$in3,$in4,$in5)=map("%xmm$_",(10..15)); +my ($key0,$ctr)=("%ebp","${ivp}d"); +my $frame_size = 0x80 + ($win64?160:0); + +$code.=<<___; +.globl ${PREFIX}_ctr32_encrypt_blocks +.type ${PREFIX}_ctr32_encrypt_blocks,\@function,5 +.align 16 +${PREFIX}_ctr32_encrypt_blocks: +.cfi_startproc + _CET_ENDBR +#ifdef BORINGSSL_DISPATCH_TEST + movb \$1,BORINGSSL_function_hit(%rip) +#endif + cmp \$1,$len + jne .Lctr32_bulk + + # handle single block without allocating stack frame, + # useful when handling edges + movups ($ivp),$inout0 + movups ($inp),$inout1 + mov 240($key),%edx # key->rounds +___ + &aesni_generate1("enc",$key,"%edx"); +$code.=<<___; + pxor $rndkey0,$rndkey0 # clear register bank + pxor $rndkey1,$rndkey1 + xorps $inout1,$inout0 + pxor $inout1,$inout1 + movups $inout0,($out) + xorps $inout0,$inout0 + jmp .Lctr32_epilogue + +.align 16 +.Lctr32_bulk: + lea (%rsp),$key_ # use $key_ as frame pointer +.cfi_def_cfa_register $key_ + push %rbp +.cfi_push %rbp + sub \$$frame_size,%rsp + and \$-16,%rsp # Linux kernel stack can be incorrectly seeded +___ +$code.=<<___ if ($win64); + movaps %xmm6,-0xa8($key_) # offload everything + movaps %xmm7,-0x98($key_) + movaps %xmm8,-0x88($key_) + movaps %xmm9,-0x78($key_) + movaps %xmm10,-0x68($key_) + movaps %xmm11,-0x58($key_) + movaps %xmm12,-0x48($key_) + movaps %xmm13,-0x38($key_) + movaps %xmm14,-0x28($key_) + movaps %xmm15,-0x18($key_) +.Lctr32_body: +___ +$code.=<<___; + + # 8 16-byte words on top of stack are counter values + # xor-ed with zero-round key + + movdqu ($ivp),$inout0 + movdqu ($key),$rndkey0 + mov 12($ivp),$ctr # counter LSB + pxor $rndkey0,$inout0 + mov 12($key),$key0 # 0-round key LSB + movdqa $inout0,0x00(%rsp) # populate counter block + bswap $ctr + movdqa $inout0,$inout1 + movdqa $inout0,$inout2 + movdqa $inout0,$inout3 + movdqa $inout0,0x40(%rsp) + movdqa $inout0,0x50(%rsp) + movdqa $inout0,0x60(%rsp) + mov %rdx,%r10 # about to borrow %rdx + movdqa $inout0,0x70(%rsp) + + lea 1($ctr),%rax + lea 2($ctr),%rdx + bswap %eax + bswap %edx + xor $key0,%eax + xor $key0,%edx + pinsrd \$3,%eax,$inout1 + lea 3($ctr),%rax + movdqa $inout1,0x10(%rsp) + pinsrd \$3,%edx,$inout2 + bswap %eax + mov %r10,%rdx # restore %rdx + lea 4($ctr),%r10 + movdqa $inout2,0x20(%rsp) + xor $key0,%eax + bswap %r10d + pinsrd \$3,%eax,$inout3 + xor $key0,%r10d + movdqa $inout3,0x30(%rsp) + lea 5($ctr),%r9 + mov %r10d,0x40+12(%rsp) + bswap %r9d + lea 6($ctr),%r10 + mov 240($key),$rounds # key->rounds + xor $key0,%r9d + bswap %r10d + mov %r9d,0x50+12(%rsp) + xor $key0,%r10d + lea 7($ctr),%r9 + mov %r10d,0x60+12(%rsp) + bswap %r9d + xor $key0,%r9d + mov %r9d,0x70+12(%rsp) + + $movkey 0x10($key),$rndkey1 + + movdqa 0x40(%rsp),$inout4 + movdqa 0x50(%rsp),$inout5 + + cmp \$8,$len # $len is in blocks + jb .Lctr32_tail # short input if ($len<8) + + lea 0x80($key),$key # size optimization + sub \$8,$len # $len is biased by -8 + jmp .Lctr32_loop8 + +.align 32 +.Lctr32_loop8: + add \$8,$ctr # next counter value + movdqa 0x60(%rsp),$inout6 + aesenc $rndkey1,$inout0 + mov $ctr,%r9d + movdqa 0x70(%rsp),$inout7 + aesenc $rndkey1,$inout1 + bswap %r9d + $movkey 0x20-0x80($key),$rndkey0 + aesenc $rndkey1,$inout2 + xor $key0,%r9d + nop + aesenc $rndkey1,$inout3 + mov %r9d,0x00+12(%rsp) # store next counter value + lea 1($ctr),%r9 + aesenc $rndkey1,$inout4 + aesenc $rndkey1,$inout5 + aesenc $rndkey1,$inout6 + aesenc $rndkey1,$inout7 + $movkey 0x30-0x80($key),$rndkey1 +___ +for($i=2;$i<8;$i++) { +my $rndkeyx = ($i&1)?$rndkey1:$rndkey0; +$code.=<<___; + bswap %r9d + 
aesenc $rndkeyx,$inout0 + aesenc $rndkeyx,$inout1 + xor $key0,%r9d + .byte 0x66,0x90 + aesenc $rndkeyx,$inout2 + aesenc $rndkeyx,$inout3 + mov %r9d,`0x10*($i-1)`+12(%rsp) + lea $i($ctr),%r9 + aesenc $rndkeyx,$inout4 + aesenc $rndkeyx,$inout5 + aesenc $rndkeyx,$inout6 + aesenc $rndkeyx,$inout7 + $movkey `0x20+0x10*$i`-0x80($key),$rndkeyx +___ +} +$code.=<<___; + bswap %r9d + aesenc $rndkey0,$inout0 + aesenc $rndkey0,$inout1 + aesenc $rndkey0,$inout2 + xor $key0,%r9d + movdqu 0x00($inp),$in0 # start loading input + aesenc $rndkey0,$inout3 + mov %r9d,0x70+12(%rsp) + cmp \$11,$rounds + aesenc $rndkey0,$inout4 + aesenc $rndkey0,$inout5 + aesenc $rndkey0,$inout6 + aesenc $rndkey0,$inout7 + $movkey 0xa0-0x80($key),$rndkey0 + + jb .Lctr32_enc_done + + aesenc $rndkey1,$inout0 + aesenc $rndkey1,$inout1 + aesenc $rndkey1,$inout2 + aesenc $rndkey1,$inout3 + aesenc $rndkey1,$inout4 + aesenc $rndkey1,$inout5 + aesenc $rndkey1,$inout6 + aesenc $rndkey1,$inout7 + $movkey 0xb0-0x80($key),$rndkey1 + + aesenc $rndkey0,$inout0 + aesenc $rndkey0,$inout1 + aesenc $rndkey0,$inout2 + aesenc $rndkey0,$inout3 + aesenc $rndkey0,$inout4 + aesenc $rndkey0,$inout5 + aesenc $rndkey0,$inout6 + aesenc $rndkey0,$inout7 + $movkey 0xc0-0x80($key),$rndkey0 + # 192-bit key support was removed. + + aesenc $rndkey1,$inout0 + aesenc $rndkey1,$inout1 + aesenc $rndkey1,$inout2 + aesenc $rndkey1,$inout3 + aesenc $rndkey1,$inout4 + aesenc $rndkey1,$inout5 + aesenc $rndkey1,$inout6 + aesenc $rndkey1,$inout7 + $movkey 0xd0-0x80($key),$rndkey1 + + aesenc $rndkey0,$inout0 + aesenc $rndkey0,$inout1 + aesenc $rndkey0,$inout2 + aesenc $rndkey0,$inout3 + aesenc $rndkey0,$inout4 + aesenc $rndkey0,$inout5 + aesenc $rndkey0,$inout6 + aesenc $rndkey0,$inout7 + $movkey 0xe0-0x80($key),$rndkey0 + jmp .Lctr32_enc_done + +.align 16 +.Lctr32_enc_done: + movdqu 0x10($inp),$in1 + pxor $rndkey0,$in0 # input^=round[last] + movdqu 0x20($inp),$in2 + pxor $rndkey0,$in1 + movdqu 0x30($inp),$in3 + pxor $rndkey0,$in2 + movdqu 0x40($inp),$in4 + pxor $rndkey0,$in3 + movdqu 0x50($inp),$in5 + pxor $rndkey0,$in4 + prefetcht0 0x1c0($inp) # We process 128 bytes (8*16), so to prefetch 1 iteration + prefetcht0 0x200($inp) # We need to prefetch 2 64 byte lines + pxor $rndkey0,$in5 + aesenc $rndkey1,$inout0 + aesenc $rndkey1,$inout1 + aesenc $rndkey1,$inout2 + aesenc $rndkey1,$inout3 + aesenc $rndkey1,$inout4 + aesenc $rndkey1,$inout5 + aesenc $rndkey1,$inout6 + aesenc $rndkey1,$inout7 + movdqu 0x60($inp),$rndkey1 # borrow $rndkey1 for inp[6] + lea 0x80($inp),$inp # $inp+=8*16 + + aesenclast $in0,$inout0 # $inN is inp[N]^round[last] + pxor $rndkey0,$rndkey1 # borrowed $rndkey + movdqu 0x70-0x80($inp),$in0 + aesenclast $in1,$inout1 + pxor $rndkey0,$in0 + movdqa 0x00(%rsp),$in1 # load next counter block + aesenclast $in2,$inout2 + aesenclast $in3,$inout3 + movdqa 0x10(%rsp),$in2 + movdqa 0x20(%rsp),$in3 + aesenclast $in4,$inout4 + aesenclast $in5,$inout5 + movdqa 0x30(%rsp),$in4 + movdqa 0x40(%rsp),$in5 + aesenclast $rndkey1,$inout6 + movdqa 0x50(%rsp),$rndkey0 + $movkey 0x10-0x80($key),$rndkey1#real 1st-round key + aesenclast $in0,$inout7 + + movups $inout0,($out) # store 8 output blocks + movdqa $in1,$inout0 + movups $inout1,0x10($out) + movdqa $in2,$inout1 + movups $inout2,0x20($out) + movdqa $in3,$inout2 + movups $inout3,0x30($out) + movdqa $in4,$inout3 + movups $inout4,0x40($out) + movdqa $in5,$inout4 + movups $inout5,0x50($out) + movdqa $rndkey0,$inout5 + movups $inout6,0x60($out) + movups $inout7,0x70($out) + lea 0x80($out),$out # $out+=8*16 + + sub \$8,$len + jnc 
.Lctr32_loop8 # loop if $len-=8 didn't borrow + + add \$8,$len # restore real remaining $len + jz .Lctr32_done # done if ($len==0) + lea -0x80($key),$key + +.Lctr32_tail: + # note that at this point $inout0..5 are populated with + # counter values xor-ed with 0-round key + lea 16($key),$key + cmp \$4,$len + jb .Lctr32_loop3 + je .Lctr32_loop4 + + # if ($len>4) compute 7 E(counter) + shl \$4,$rounds + movdqa 0x60(%rsp),$inout6 + pxor $inout7,$inout7 + + $movkey 16($key),$rndkey0 + aesenc $rndkey1,$inout0 + aesenc $rndkey1,$inout1 + lea 32-16($key,$rounds),$key# prepare for .Lenc_loop8_enter + neg %rax + aesenc $rndkey1,$inout2 + add \$16,%rax # prepare for .Lenc_loop8_enter + movups ($inp),$in0 + aesenc $rndkey1,$inout3 + aesenc $rndkey1,$inout4 + movups 0x10($inp),$in1 # pre-load input + movups 0x20($inp),$in2 + aesenc $rndkey1,$inout5 + aesenc $rndkey1,$inout6 + + call .Lenc_loop8_enter + + movdqu 0x30($inp),$in3 + pxor $in0,$inout0 + movdqu 0x40($inp),$in0 + pxor $in1,$inout1 + movdqu $inout0,($out) # store output + pxor $in2,$inout2 + movdqu $inout1,0x10($out) + pxor $in3,$inout3 + movdqu $inout2,0x20($out) + pxor $in0,$inout4 + movdqu $inout3,0x30($out) + movdqu $inout4,0x40($out) + cmp \$6,$len + jb .Lctr32_done # $len was 5, stop store + + movups 0x50($inp),$in1 + xorps $in1,$inout5 + movups $inout5,0x50($out) + je .Lctr32_done # $len was 6, stop store + + movups 0x60($inp),$in2 + xorps $in2,$inout6 + movups $inout6,0x60($out) + jmp .Lctr32_done # $len was 7, stop store + +.align 32 +.Lctr32_loop4: + aesenc $rndkey1,$inout0 + lea 16($key),$key + dec $rounds + aesenc $rndkey1,$inout1 + aesenc $rndkey1,$inout2 + aesenc $rndkey1,$inout3 + $movkey ($key),$rndkey1 + jnz .Lctr32_loop4 + aesenclast $rndkey1,$inout0 + aesenclast $rndkey1,$inout1 + movups ($inp),$in0 # load input + movups 0x10($inp),$in1 + aesenclast $rndkey1,$inout2 + aesenclast $rndkey1,$inout3 + movups 0x20($inp),$in2 + movups 0x30($inp),$in3 + + xorps $in0,$inout0 + movups $inout0,($out) # store output + xorps $in1,$inout1 + movups $inout1,0x10($out) + pxor $in2,$inout2 + movdqu $inout2,0x20($out) + pxor $in3,$inout3 + movdqu $inout3,0x30($out) + jmp .Lctr32_done # $len was 4, stop store + +.align 32 +.Lctr32_loop3: + aesenc $rndkey1,$inout0 + lea 16($key),$key + dec $rounds + aesenc $rndkey1,$inout1 + aesenc $rndkey1,$inout2 + $movkey ($key),$rndkey1 + jnz .Lctr32_loop3 + aesenclast $rndkey1,$inout0 + aesenclast $rndkey1,$inout1 + aesenclast $rndkey1,$inout2 + + movups ($inp),$in0 # load input + xorps $in0,$inout0 + movups $inout0,($out) # store output + cmp \$2,$len + jb .Lctr32_done # $len was 1, stop store + + movups 0x10($inp),$in1 + xorps $in1,$inout1 + movups $inout1,0x10($out) + je .Lctr32_done # $len was 2, stop store + + movups 0x20($inp),$in2 + xorps $in2,$inout2 + movups $inout2,0x20($out) # $len was 3, stop store + +.Lctr32_done: + xorps %xmm0,%xmm0 # clear register bank + xor $key0,$key0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 +___ +$code.=<<___ if (!$win64); + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + movaps %xmm0,0x00(%rsp) # clear stack + pxor %xmm8,%xmm8 + movaps %xmm0,0x10(%rsp) + pxor %xmm9,%xmm9 + movaps %xmm0,0x20(%rsp) + pxor %xmm10,%xmm10 + movaps %xmm0,0x30(%rsp) + pxor %xmm11,%xmm11 + movaps %xmm0,0x40(%rsp) + pxor %xmm12,%xmm12 + movaps %xmm0,0x50(%rsp) + pxor %xmm13,%xmm13 + movaps %xmm0,0x60(%rsp) + pxor %xmm14,%xmm14 + movaps %xmm0,0x70(%rsp) + pxor %xmm15,%xmm15 +___ +$code.=<<___ if ($win64); + movaps -0xa8($key_),%xmm6 + movaps 
%xmm0,-0xa8($key_) # clear stack + movaps -0x98($key_),%xmm7 + movaps %xmm0,-0x98($key_) + movaps -0x88($key_),%xmm8 + movaps %xmm0,-0x88($key_) + movaps -0x78($key_),%xmm9 + movaps %xmm0,-0x78($key_) + movaps -0x68($key_),%xmm10 + movaps %xmm0,-0x68($key_) + movaps -0x58($key_),%xmm11 + movaps %xmm0,-0x58($key_) + movaps -0x48($key_),%xmm12 + movaps %xmm0,-0x48($key_) + movaps -0x38($key_),%xmm13 + movaps %xmm0,-0x38($key_) + movaps -0x28($key_),%xmm14 + movaps %xmm0,-0x28($key_) + movaps -0x18($key_),%xmm15 + movaps %xmm0,-0x18($key_) + movaps %xmm0,0x00(%rsp) + movaps %xmm0,0x10(%rsp) + movaps %xmm0,0x20(%rsp) + movaps %xmm0,0x30(%rsp) + movaps %xmm0,0x40(%rsp) + movaps %xmm0,0x50(%rsp) + movaps %xmm0,0x60(%rsp) + movaps %xmm0,0x70(%rsp) +___ +$code.=<<___; + mov -8($key_),%rbp +.cfi_restore %rbp + lea ($key_),%rsp +.cfi_def_cfa_register %rsp +.Lctr32_epilogue: + ret +.cfi_endproc +.size ${PREFIX}_ctr32_encrypt_blocks,.-${PREFIX}_ctr32_encrypt_blocks +___ +} }} + +{ my ($inp,$bits,$key) = @_4args; + $bits =~ s/%r/%e/; +# This is based on submission from Intel by +# Huang Ying +# Vinodh Gopal +# Kahraman Akdemir +# +# Aggressively optimized in respect to aeskeygenassist's critical path +# and is contained in %xmm0-5 to meet Win64 ABI requirement. +# +# int ${PREFIX}_set_encrypt_key(const unsigned char *inp, +# int bits, AES_KEY * const key); +# +# input: $inp user-supplied key +# $bits $inp length in bits +# $key pointer to key schedule +# output: %eax 0 denoting success, -1 or -2 - failure (see C) +# $bits rounds-1 (used in aesni_set_decrypt_key) +# *$key key schedule +# $key pointer to key schedule (used in +# aesni_set_decrypt_key) +# +# Subroutine is frame-less, which means that only volatile registers +# are used. Note that it's declared "abi-omnipotent", which means that +# amount of volatile registers is smaller on Windows. +# +# There are two variants of this function, one which uses aeskeygenassist +# ("base") and one which uses aesenclast + pshufb ("alt"). See aes/internal.h +# for details. +$code.=<<___; +.globl ${PREFIX}_set_encrypt_key_base +.type ${PREFIX}_set_encrypt_key_base,\@abi-omnipotent +.align 16 +${PREFIX}_set_encrypt_key_base: +.cfi_startproc +.seh_startproc + _CET_ENDBR +#ifdef BORINGSSL_DISPATCH_TEST + movb \$1,BORINGSSL_function_hit+3(%rip) +#endif + sub \$8,%rsp +.cfi_adjust_cfa_offset 8 +.seh_stackalloc 8 +.seh_endprologue + movups ($inp),%xmm0 # pull first 128 bits of *userKey + xorps %xmm4,%xmm4 # low dword of xmm4 is assumed 0 + lea 16($key),%rax # %rax is used as modifiable copy of $key + cmp \$256,$bits + je .L14rounds + # 192-bit key support was removed. 
+ + cmp \$128,$bits + jne .Lbad_keybits + +.L10rounds: + mov \$9,$bits # 10 rounds for 128-bit key + + $movkey %xmm0,($key) # round 0 + aeskeygenassist \$0x1,%xmm0,%xmm1 # round 1 + call .Lkey_expansion_128_cold + aeskeygenassist \$0x2,%xmm0,%xmm1 # round 2 + call .Lkey_expansion_128 + aeskeygenassist \$0x4,%xmm0,%xmm1 # round 3 + call .Lkey_expansion_128 + aeskeygenassist \$0x8,%xmm0,%xmm1 # round 4 + call .Lkey_expansion_128 + aeskeygenassist \$0x10,%xmm0,%xmm1 # round 5 + call .Lkey_expansion_128 + aeskeygenassist \$0x20,%xmm0,%xmm1 # round 6 + call .Lkey_expansion_128 + aeskeygenassist \$0x40,%xmm0,%xmm1 # round 7 + call .Lkey_expansion_128 + aeskeygenassist \$0x80,%xmm0,%xmm1 # round 8 + call .Lkey_expansion_128 + aeskeygenassist \$0x1b,%xmm0,%xmm1 # round 9 + call .Lkey_expansion_128 + aeskeygenassist \$0x36,%xmm0,%xmm1 # round 10 + call .Lkey_expansion_128 + $movkey %xmm0,(%rax) + mov $bits,80(%rax) # 240(%rdx) + xor %eax,%eax + jmp .Lenc_key_ret + + # 192-bit key support was removed. + +.align 16 +.L14rounds: + movups 16($inp),%xmm2 # remaining half of *userKey + mov \$13,$bits # 14 rounds for 256 + lea 16(%rax),%rax + + $movkey %xmm0,($key) # round 0 + $movkey %xmm2,16($key) # round 1 + aeskeygenassist \$0x1,%xmm2,%xmm1 # round 2 + call .Lkey_expansion_256a_cold + aeskeygenassist \$0x1,%xmm0,%xmm1 # round 3 + call .Lkey_expansion_256b + aeskeygenassist \$0x2,%xmm2,%xmm1 # round 4 + call .Lkey_expansion_256a + aeskeygenassist \$0x2,%xmm0,%xmm1 # round 5 + call .Lkey_expansion_256b + aeskeygenassist \$0x4,%xmm2,%xmm1 # round 6 + call .Lkey_expansion_256a + aeskeygenassist \$0x4,%xmm0,%xmm1 # round 7 + call .Lkey_expansion_256b + aeskeygenassist \$0x8,%xmm2,%xmm1 # round 8 + call .Lkey_expansion_256a + aeskeygenassist \$0x8,%xmm0,%xmm1 # round 9 + call .Lkey_expansion_256b + aeskeygenassist \$0x10,%xmm2,%xmm1 # round 10 + call .Lkey_expansion_256a + aeskeygenassist \$0x10,%xmm0,%xmm1 # round 11 + call .Lkey_expansion_256b + aeskeygenassist \$0x20,%xmm2,%xmm1 # round 12 + call .Lkey_expansion_256a + aeskeygenassist \$0x20,%xmm0,%xmm1 # round 13 + call .Lkey_expansion_256b + aeskeygenassist \$0x40,%xmm2,%xmm1 # round 14 + call .Lkey_expansion_256a + $movkey %xmm0,(%rax) + mov $bits,16(%rax) # 240(%rdx) + xor %rax,%rax + jmp .Lenc_key_ret + +.align 16 +.Lbad_keybits: + mov \$-2,%rax +.Lenc_key_ret: + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + add \$8,%rsp +.cfi_adjust_cfa_offset -8 + ret +.cfi_endproc +.seh_endproc + +.align 16 +.Lkey_expansion_128: +.cfi_startproc + $movkey %xmm0,(%rax) + lea 16(%rax),%rax +.Lkey_expansion_128_cold: + shufps \$0b00010000,%xmm0,%xmm4 + xorps %xmm4, %xmm0 + shufps \$0b10001100,%xmm0,%xmm4 + xorps %xmm4, %xmm0 + shufps \$0b11111111,%xmm1,%xmm1 # critical path + xorps %xmm1,%xmm0 + ret +.cfi_endproc + +.align 16 +.Lkey_expansion_256a: +.cfi_startproc + $movkey %xmm2,(%rax) + lea 16(%rax),%rax +.Lkey_expansion_256a_cold: + shufps \$0b00010000,%xmm0,%xmm4 + xorps %xmm4,%xmm0 + shufps \$0b10001100,%xmm0,%xmm4 + xorps %xmm4,%xmm0 + shufps \$0b11111111,%xmm1,%xmm1 # critical path + xorps %xmm1,%xmm0 + ret +.cfi_endproc + +.align 16 +.Lkey_expansion_256b: +.cfi_startproc + $movkey %xmm0,(%rax) + lea 16(%rax),%rax + + shufps \$0b00010000,%xmm2,%xmm4 + xorps %xmm4,%xmm2 + shufps \$0b10001100,%xmm2,%xmm4 + xorps %xmm4,%xmm2 + shufps \$0b10101010,%xmm1,%xmm1 # critical path + xorps %xmm1,%xmm2 + ret +.cfi_endproc +.size ${PREFIX}_set_encrypt_key_base,.-${PREFIX}_set_encrypt_key_base + +.globl 
${PREFIX}_set_encrypt_key_alt +.type ${PREFIX}_set_encrypt_key_alt,\@abi-omnipotent +.align 16 +${PREFIX}_set_encrypt_key_alt: +.cfi_startproc +.seh_startproc + _CET_ENDBR +#ifdef BORINGSSL_DISPATCH_TEST + movb \$1,BORINGSSL_function_hit+3(%rip) +#endif + sub \$8,%rsp +.cfi_adjust_cfa_offset 8 +.seh_stackalloc 8 +.seh_endprologue + movups ($inp),%xmm0 # pull first 128 bits of *userKey + xorps %xmm4,%xmm4 # low dword of xmm4 is assumed 0 + lea 16($key),%rax # %rax is used as modifiable copy of $key + cmp \$256,$bits + je .L14rounds_alt + # 192-bit key support was removed. + cmp \$128,$bits + jne .Lbad_keybits_alt + + mov \$9,$bits # 10 rounds for 128-bit key + movdqa .Lkey_rotate(%rip),%xmm5 + mov \$8,%r10d + movdqa .Lkey_rcon1(%rip),%xmm4 + movdqa %xmm0,%xmm2 + movdqu %xmm0,($key) + jmp .Loop_key128 + +.align 16 +.Loop_key128: + pshufb %xmm5,%xmm0 + aesenclast %xmm4,%xmm0 + pslld \$1,%xmm4 + lea 16(%rax),%rax + + movdqa %xmm2,%xmm3 + pslldq \$4,%xmm2 + pxor %xmm2,%xmm3 + pslldq \$4,%xmm2 + pxor %xmm2,%xmm3 + pslldq \$4,%xmm2 + pxor %xmm3,%xmm2 + + pxor %xmm2,%xmm0 + movdqu %xmm0,-16(%rax) + movdqa %xmm0,%xmm2 + + dec %r10d + jnz .Loop_key128 + + movdqa .Lkey_rcon1b(%rip),%xmm4 + + pshufb %xmm5,%xmm0 + aesenclast %xmm4,%xmm0 + pslld \$1,%xmm4 + + movdqa %xmm2,%xmm3 + pslldq \$4,%xmm2 + pxor %xmm2,%xmm3 + pslldq \$4,%xmm2 + pxor %xmm2,%xmm3 + pslldq \$4,%xmm2 + pxor %xmm3,%xmm2 + + pxor %xmm2,%xmm0 + movdqu %xmm0,(%rax) + + movdqa %xmm0,%xmm2 + pshufb %xmm5,%xmm0 + aesenclast %xmm4,%xmm0 + + movdqa %xmm2,%xmm3 + pslldq \$4,%xmm2 + pxor %xmm2,%xmm3 + pslldq \$4,%xmm2 + pxor %xmm2,%xmm3 + pslldq \$4,%xmm2 + pxor %xmm3,%xmm2 + + pxor %xmm2,%xmm0 + movdqu %xmm0,16(%rax) + + mov $bits,96(%rax) # 240($key) + xor %eax,%eax + jmp .Lenc_key_ret_alt + + # 192-bit key support was removed. 
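+	# In the 256-bit loop below, the round keys derived from the second
+	# key half use pshufd with immediate 0xff followed by aesenclast
+	# against an all-zero round key, i.e. SubWord() without the
+	# rotate/rcon step, as the AES-256 schedule requires for those words.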
+ +.align 16 +.L14rounds_alt: + movups 16($inp),%xmm2 # remaining half of *userKey + mov \$13,$bits # 14 rounds for 256 + lea 16(%rax),%rax + movdqa .Lkey_rotate(%rip),%xmm5 + movdqa .Lkey_rcon1(%rip),%xmm4 + mov \$7,%r10d + movdqu %xmm0,0($key) + movdqa %xmm2,%xmm1 + movdqu %xmm2,16($key) + jmp .Loop_key256 + +.align 16 +.Loop_key256: + pshufb %xmm5,%xmm2 + aesenclast %xmm4,%xmm2 + + movdqa %xmm0,%xmm3 + pslldq \$4,%xmm0 + pxor %xmm0,%xmm3 + pslldq \$4,%xmm0 + pxor %xmm0,%xmm3 + pslldq \$4,%xmm0 + pxor %xmm3,%xmm0 + pslld \$1,%xmm4 + + pxor %xmm2,%xmm0 + movdqu %xmm0,(%rax) + + dec %r10d + jz .Ldone_key256 + + pshufd \$0xff,%xmm0,%xmm2 + pxor %xmm3,%xmm3 + aesenclast %xmm3,%xmm2 + + movdqa %xmm1,%xmm3 + pslldq \$4,%xmm1 + pxor %xmm1,%xmm3 + pslldq \$4,%xmm1 + pxor %xmm1,%xmm3 + pslldq \$4,%xmm1 + pxor %xmm3,%xmm1 + + pxor %xmm1,%xmm2 + movdqu %xmm2,16(%rax) + lea 32(%rax),%rax + movdqa %xmm2,%xmm1 + + jmp .Loop_key256 + +.Ldone_key256: + mov $bits,16(%rax) # 240($key) + xor %eax,%eax + jmp .Lenc_key_ret_alt + +.align 16 +.Lbad_keybits_alt: + mov \$-2,%rax +.Lenc_key_ret_alt: + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + add \$8,%rsp +.cfi_adjust_cfa_offset -8 + ret +.cfi_endproc +.seh_endproc +.size ${PREFIX}_set_encrypt_key_alt,.-${PREFIX}_set_encrypt_key_alt +___ +} + +$code.=<<___; +.section .rodata +.align 64 +.Lbswap_mask: + .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 +.Lincrement32: + .long 6,6,6,0 +.Lincrement64: + .long 1,0,0,0 +.Lincrement1: + .byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 +.Lkey_rotate: + .long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d +.Lkey_rotate192: + .long 0x04070605,0x04070605,0x04070605,0x04070605 +.Lkey_rcon1: + .long 1,1,1,1 +.Lkey_rcon1b: + .long 0x1b,0x1b,0x1b,0x1b + +.asciz "AES for Intel AES-NI, CRYPTOGAMS by " +.align 64 +.text +___ + +# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, +# CONTEXT *context,DISPATCHER_CONTEXT *disp) +if ($win64) { +$rec="%rcx"; +$frame="%rdx"; +$context="%r8"; +$disp="%r9"; + +$code.=<<___; +.extern __imp_RtlVirtualUnwind +___ +$code.=<<___ if ($PREFIX eq "aes_hw"); +.type ctr_xts_se_handler,\@abi-omnipotent +.align 16 +ctr_xts_se_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 120($context),%rax # pull context->Rax + mov 248($context),%rbx # pull context->Rip + + mov 8($disp),%rsi # disp->ImageBase + mov 56($disp),%r11 # disp->HandlerData + + mov 0(%r11),%r10d # HandlerData[0] + lea (%rsi,%r10),%r10 # prologue lable + cmp %r10,%rbx # context->RipRsp + + mov 4(%r11),%r10d # HandlerData[1] + lea (%rsi,%r10),%r10 # epilogue label + cmp %r10,%rbx # context->Rip>=epilogue label + jae .Lcommon_seh_tail + + mov 208($context),%rax # pull context->R11 + + lea -0xa8(%rax),%rsi # %xmm save area + lea 512($context),%rdi # & context.Xmm6 + mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) + .long 0xa548f3fc # cld; rep movsq + + mov -8(%rax),%rbp # restore saved %rbp + mov %rbp,160($context) # restore context->Rbp + + +.Lcommon_seh_tail: + mov 8(%rax),%rdi + mov 16(%rax),%rsi + mov %rax,152($context) # restore context->Rsp + mov %rsi,168($context) # restore context->Rsi + mov %rdi,176($context) # restore context->Rdi + + mov 40($disp),%rdi # disp->ContextRecord + mov $context,%rsi # context + mov \$154,%ecx # sizeof(CONTEXT) + .long 0xa548f3fc # cld; rep movsq + + mov $disp,%rsi + xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER + mov 8(%rsi),%rdx # arg2, disp->ImageBase + 
mov 0(%rsi),%r8 # arg3, disp->ControlPc + mov 16(%rsi),%r9 # arg4, disp->FunctionEntry + mov 40(%rsi),%r10 # disp->ContextRecord + lea 56(%rsi),%r11 # &disp->HandlerData + lea 24(%rsi),%r12 # &disp->EstablisherFrame + mov %r10,32(%rsp) # arg5 + mov %r11,40(%rsp) # arg6 + mov %r12,48(%rsp) # arg7 + mov %rcx,56(%rsp) # arg8, (NULL) + call *__imp_RtlVirtualUnwind(%rip) + + mov \$1,%eax # ExceptionContinueSearch + add \$64,%rsp + popfq + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + pop %rdi + pop %rsi + ret +.size ctr_xts_se_handler,.-ctr_xts_se_handler + +.section .pdata +.align 4 +___ +$code.=<<___ if ($PREFIX eq "aes_hw"); + .rva .LSEH_begin_${PREFIX}_ctr32_encrypt_blocks + .rva .LSEH_end_${PREFIX}_ctr32_encrypt_blocks + .rva .LSEH_info_ctr32 +___ +$code.=<<___; +.section .xdata +.align 8 +___ +$code.=<<___ if ($PREFIX eq "aes_hw"); +.LSEH_info_ctr32: + .byte 9,0,0,0 + .rva ctr_xts_se_handler + .rva .Lctr32_body,.Lctr32_epilogue # HandlerData[] +___ +} + +sub rex { + local *opcode=shift; + my ($dst,$src)=@_; + my $rex=0; + + $rex|=0x04 if($dst>=8); + $rex|=0x01 if($src>=8); + push @opcode,$rex|0x40 if($rex); +} + +sub aesni { + my $line=shift; + my @opcode=(0x66); + + if ($line=~/(aeskeygenassist)\s+\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) { + rex(\@opcode,$4,$3); + push @opcode,0x0f,0x3a,0xdf; + push @opcode,0xc0|($3&7)|(($4&7)<<3); # ModR/M + my $c=$2; + push @opcode,$c=~/^0/?oct($c):$c; + return ".byte\t".join(',',@opcode); + } + elsif ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) { + my %opcodelet = ( + "aesimc" => 0xdb, + "aesenc" => 0xdc, "aesenclast" => 0xdd, + "aesdec" => 0xde, "aesdeclast" => 0xdf + ); + return undef if (!defined($opcodelet{$1})); + rex(\@opcode,$3,$2); + push @opcode,0x0f,0x38,$opcodelet{$1}; + push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M + return ".byte\t".join(',',@opcode); + } + elsif ($line=~/(aes[a-z]+)\s+([0x1-9a-fA-F]*)\(%rsp\),\s*%xmm([0-9]+)/) { + my %opcodelet = ( + "aesenc" => 0xdc, "aesenclast" => 0xdd, + "aesdec" => 0xde, "aesdeclast" => 0xdf + ); + return undef if (!defined($opcodelet{$1})); + my $off = $2; + push @opcode,0x44 if ($3>=8); + push @opcode,0x0f,0x38,$opcodelet{$1}; + push @opcode,0x44|(($3&7)<<3),0x24; # ModR/M + push @opcode,($off=~/^0/?oct($off):$off)&0xff; + return ".byte\t".join(',',@opcode); + } + return $line; +} + +sub movbe { + ".byte 0x0f,0x38,0xf1,0x44,0x24,".shift; +} + +$code =~ s/\`([^\`]*)\`/eval($1)/gem; +$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem; +#$code =~ s/\bmovbe\s+%eax/bswap %eax; mov %eax/gm; # debugging artefact +$code =~ s/\bmovbe\s+%eax,\s*([0-9]+)\(%rsp\)/movbe($1)/gem; + +print $code; + +close STDOUT or die "error closing STDOUT: $!"; diff --git a/ring-0.17.14/crypto/fipsmodule/aes/asm/aesv8-armx.pl b/ring-0.17.14/crypto/fipsmodule/aes/asm/aesv8-armx.pl new file mode 100644 index 0000000000..5dfb4b1d8d --- /dev/null +++ b/ring-0.17.14/crypto/fipsmodule/aes/asm/aesv8-armx.pl @@ -0,0 +1,587 @@ +#! /usr/bin/env perl +# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. +# ==================================================================== +# +# This module implements support for ARMv8 AES instructions. The +# module is endian-agnostic in sense that it supports both big- and +# little-endian cases. As does it support both 32- and 64-bit modes +# of operation. Latter is achieved by limiting amount of utilized +# registers to 16, which implies additional NEON load and integer +# instructions. This has no effect on mighty Apple A7, where results +# are literally equal to the theoretical estimates based on AES +# instruction latencies and issue rates. On Cortex-A53, an in-order +# execution core, this costs up to 10-15%, which is partially +# compensated by implementing dedicated code path for 128-bit +# CBC encrypt case. On Cortex-A57 parallelizable mode performance +# seems to be limited by sheer amount of NEON instructions... +# +# Performance in cycles per byte processed with 128-bit key: +# +# CBC enc CBC dec CTR +# Apple A7 2.39 1.20 1.20 +# Cortex-A53 1.32 1.29 1.46 +# Cortex-A57(*) 1.95 0.85 0.93 +# Denver 1.96 0.86 0.80 +# Mongoose 1.33 1.20 1.20 +# +# (*) original 3.64/1.34/1.32 results were for r0p0 revision +# and are still same even for updated module; + +$flavour = shift; +$output = shift; + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or +die "can't locate arm-xlate.pl"; + +open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; +*STDOUT=*OUT; + +$prefix="aes_hw"; + +$code=<<___; +#if __ARM_MAX_ARCH__>=7 +.text +___ +$code.=".arch armv8-a+crypto\n" if ($flavour =~ /64/); +$code.=<<___ if ($flavour !~ /64/); +.arch armv7-a // don't confuse not-so-latest binutils with argv8 :-) +.fpu neon +.code 32 +#undef __thumb2__ +___ + +# Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax, +# NEON is mostly 32-bit mnemonics, integer - mostly 64. Goal is to +# maintain both 32- and 64-bit codes within single module and +# transliterate common code to either flavour with regex vodoo. +# +{{{ +my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12"); +my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)= + $flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10)); + + +# On AArch64, put the data .rodata and use adrp + add for compatibility with +# execute-only memory. On AArch32, put it in .text and use adr. +$code.= ".section .rodata\n" if ($flavour =~ /64/); +$code.=<<___; +.align 5 +.Lrcon: +.long 0x01,0x01,0x01,0x01 +.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat +.long 0x1b,0x1b,0x1b,0x1b + +.text + +.globl ${prefix}_set_encrypt_key +.type ${prefix}_set_encrypt_key,%function +.align 5 +${prefix}_set_encrypt_key: +.Lenc_key: +___ +$code.=<<___ if ($flavour =~ /64/); + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + AARCH64_VALID_CALL_TARGET + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 +___ +$code.=<<___; + mov $ptr,#-2 + cmp $bits,#128 + b.lt .Lenc_key_abort + cmp $bits,#256 + b.gt .Lenc_key_abort + tst $bits,#0x3f + b.ne .Lenc_key_abort + +___ +$code.=<<___ if ($flavour =~ /64/); + adrp $ptr,:pg_hi21:.Lrcon + add $ptr,$ptr,:lo12:.Lrcon +___ +$code.=<<___ if ($flavour !~ /64/); + adr $ptr,.Lrcon +___ +$code.=<<___; + cmp $bits,#192 + + veor $zero,$zero,$zero + vld1.8 {$in0},[$inp],#16 + mov $bits,#8 // reuse $bits + vld1.32 {$rcon,$mask},[$ptr],#32 + + b.lt .Loop128 + // 192-bit key support was removed. + b .L256 + +.align 4 +.Loop128: + vtbl.8 $key,{$in0},$mask + vext.8 $tmp,$zero,$in0,#12 + vst1.32 {$in0},[$out],#16 + aese $key,$zero + subs $bits,$bits,#1 + + veor $in0,$in0,$tmp + vext.8 $tmp,$zero,$tmp,#12 + veor $in0,$in0,$tmp + vext.8 $tmp,$zero,$tmp,#12 + veor $key,$key,$rcon + veor $in0,$in0,$tmp + vshl.u8 $rcon,$rcon,#1 + veor $in0,$in0,$key + b.ne .Loop128 + + vld1.32 {$rcon},[$ptr] + + vtbl.8 $key,{$in0},$mask + vext.8 $tmp,$zero,$in0,#12 + vst1.32 {$in0},[$out],#16 + aese $key,$zero + + veor $in0,$in0,$tmp + vext.8 $tmp,$zero,$tmp,#12 + veor $in0,$in0,$tmp + vext.8 $tmp,$zero,$tmp,#12 + veor $key,$key,$rcon + veor $in0,$in0,$tmp + vshl.u8 $rcon,$rcon,#1 + veor $in0,$in0,$key + + vtbl.8 $key,{$in0},$mask + vext.8 $tmp,$zero,$in0,#12 + vst1.32 {$in0},[$out],#16 + aese $key,$zero + + veor $in0,$in0,$tmp + vext.8 $tmp,$zero,$tmp,#12 + veor $in0,$in0,$tmp + vext.8 $tmp,$zero,$tmp,#12 + veor $key,$key,$rcon + veor $in0,$in0,$tmp + veor $in0,$in0,$key + vst1.32 {$in0},[$out] + add $out,$out,#0x50 + + mov $rounds,#10 + b .Ldone + +// 192-bit key support was removed. + +.align 4 +.L256: + vld1.8 {$in1},[$inp] + mov $bits,#7 + mov $rounds,#14 + vst1.32 {$in0},[$out],#16 + +.Loop256: + vtbl.8 $key,{$in1},$mask + vext.8 $tmp,$zero,$in0,#12 + vst1.32 {$in1},[$out],#16 + aese $key,$zero + subs $bits,$bits,#1 + + veor $in0,$in0,$tmp + vext.8 $tmp,$zero,$tmp,#12 + veor $in0,$in0,$tmp + vext.8 $tmp,$zero,$tmp,#12 + veor $key,$key,$rcon + veor $in0,$in0,$tmp + vshl.u8 $rcon,$rcon,#1 + veor $in0,$in0,$key + vst1.32 {$in0},[$out],#16 + b.eq .Ldone + + vdup.32 $key,${in0}[3] // just splat + vext.8 $tmp,$zero,$in1,#12 + aese $key,$zero + + veor $in1,$in1,$tmp + vext.8 $tmp,$zero,$tmp,#12 + veor $in1,$in1,$tmp + vext.8 $tmp,$zero,$tmp,#12 + veor $in1,$in1,$tmp + + veor $in1,$in1,$key + b .Loop256 + +.Ldone: + str $rounds,[$out] + mov $ptr,#0 + +.Lenc_key_abort: + mov x0,$ptr // return value + `"ldr x29,[sp],#16" if ($flavour =~ /64/)` + ret +.size ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key +___ +}}} +{{{ +my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); +my ($rounds,$cnt,$key_)=("w5","w6","x7"); +my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12)); +my $step="x12"; # aliases with $tctr2 + +my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7)); +my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9)); + +my ($dat,$tmp)=($dat0,$tmp0); + +### q8-q15 preloaded key schedule + +$code.=<<___; +.globl ${prefix}_ctr32_encrypt_blocks +.type ${prefix}_ctr32_encrypt_blocks,%function +.align 5 +${prefix}_ctr32_encrypt_blocks: +___ +$code.=<<___ if ($flavour =~ /64/); + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + AARCH64_VALID_CALL_TARGET + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 +___ +$code.=<<___ if ($flavour !~ /64/); + mov ip,sp + stmdb sp!,{r4-r10,lr} + vstmdb sp!,{d8-d15} @ ABI specification says so + ldr r4, [ip] @ load remaining arg +___ +$code.=<<___; + ldr $rounds,[$key,#240] + + ldr $ctr, [$ivp, #12] + vld1.32 {$dat0},[$ivp] + + vld1.32 {q8-q9},[$key] // load key schedule... + sub $rounds,$rounds,#4 + mov $step,#16 + cmp $len,#2 + add $key_,$key,x5,lsl#4 // pointer to last 5 round keys + sub $rounds,$rounds,#2 + vld1.32 {q12-q13},[$key_],#32 + vld1.32 {q14-q15},[$key_],#32 + vld1.32 {$rndlast},[$key_] + add $key_,$key,#32 + mov $cnt,$rounds + cclr $step,lo + + // ARM Cortex-A57 and Cortex-A72 cores running in 32-bit mode are + // affected by silicon errata #1742098 [0] and #1655431 [1], + // respectively, where the second instruction of an aese/aesmc + // instruction pair may execute twice if an interrupt is taken right + // after the first instruction consumes an input register of which a + // single 32-bit lane has been updated the last time it was modified. + // + // This function uses a counter in one 32-bit lane. The vmov.32 lines + // could write to $dat1 and $dat2 directly, but that trips this bugs. + // We write to $ivec and copy to the final register as a workaround. + // + // [0] ARM-EPM-049219 v23 Cortex-A57 MPCore Software Developers Errata Notice + // [1] ARM-EPM-012079 v11.0 Cortex-A72 MPCore Software Developers Errata Notice +#ifndef __ARMEB__ + rev $ctr, $ctr +#endif + add $tctr1, $ctr, #1 + vorr $ivec,$dat0,$dat0 + rev $tctr1, $tctr1 + vmov.32 ${ivec}[3],$tctr1 + add $ctr, $ctr, #2 + vorr $dat1,$ivec,$ivec + b.ls .Lctr32_tail + rev $tctr2, $ctr + vmov.32 ${ivec}[3],$tctr2 + sub $len,$len,#3 // bias + vorr $dat2,$ivec,$ivec + b .Loop3x_ctr32 + +.align 4 +.Loop3x_ctr32: + aese $dat0,q8 + aesmc $dat0,$dat0 + aese $dat1,q8 + aesmc $dat1,$dat1 + aese $dat2,q8 + aesmc $dat2,$dat2 + vld1.32 {q8},[$key_],#16 + subs $cnt,$cnt,#2 + aese $dat0,q9 + aesmc $dat0,$dat0 + aese $dat1,q9 + aesmc $dat1,$dat1 + aese $dat2,q9 + aesmc $dat2,$dat2 + vld1.32 {q9},[$key_],#16 + b.gt .Loop3x_ctr32 + + aese $dat0,q8 + aesmc $tmp0,$dat0 + aese $dat1,q8 + aesmc $tmp1,$dat1 + vld1.8 {$in0},[$inp],#16 + add $tctr0,$ctr,#1 + aese $dat2,q8 + aesmc $dat2,$dat2 + vld1.8 {$in1},[$inp],#16 + rev $tctr0,$tctr0 + aese $tmp0,q9 + aesmc $tmp0,$tmp0 + aese $tmp1,q9 + aesmc $tmp1,$tmp1 + vld1.8 {$in2},[$inp],#16 + mov $key_,$key + aese $dat2,q9 + aesmc $tmp2,$dat2 + aese $tmp0,q12 + aesmc $tmp0,$tmp0 + aese $tmp1,q12 + aesmc $tmp1,$tmp1 + veor $in0,$in0,$rndlast + add $tctr1,$ctr,#2 + aese $tmp2,q12 + aesmc $tmp2,$tmp2 + veor $in1,$in1,$rndlast + add $ctr,$ctr,#3 + aese $tmp0,q13 + aesmc $tmp0,$tmp0 + aese $tmp1,q13 + aesmc $tmp1,$tmp1 + // Note the logic to update $dat0, $dat1, and $dat1 is written to work + // around a bug in ARM Cortex-A57 and Cortex-A72 cores running in + // 32-bit mode. See the comment above. 
+ veor $in2,$in2,$rndlast + vmov.32 ${ivec}[3], $tctr0 + aese $tmp2,q13 + aesmc $tmp2,$tmp2 + vorr $dat0,$ivec,$ivec + rev $tctr1,$tctr1 + aese $tmp0,q14 + aesmc $tmp0,$tmp0 + vmov.32 ${ivec}[3], $tctr1 + rev $tctr2,$ctr + aese $tmp1,q14 + aesmc $tmp1,$tmp1 + vorr $dat1,$ivec,$ivec + vmov.32 ${ivec}[3], $tctr2 + aese $tmp2,q14 + aesmc $tmp2,$tmp2 + vorr $dat2,$ivec,$ivec + subs $len,$len,#3 + aese $tmp0,q15 + aese $tmp1,q15 + aese $tmp2,q15 + + veor $in0,$in0,$tmp0 + vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] + vst1.8 {$in0},[$out],#16 + veor $in1,$in1,$tmp1 + mov $cnt,$rounds + vst1.8 {$in1},[$out],#16 + veor $in2,$in2,$tmp2 + vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] + vst1.8 {$in2},[$out],#16 + b.hs .Loop3x_ctr32 + + adds $len,$len,#3 + b.eq .Lctr32_done + cmp $len,#1 + mov $step,#16 + cclr $step,eq + +.Lctr32_tail: + aese $dat0,q8 + aesmc $dat0,$dat0 + aese $dat1,q8 + aesmc $dat1,$dat1 + vld1.32 {q8},[$key_],#16 + subs $cnt,$cnt,#2 + aese $dat0,q9 + aesmc $dat0,$dat0 + aese $dat1,q9 + aesmc $dat1,$dat1 + vld1.32 {q9},[$key_],#16 + b.gt .Lctr32_tail + + aese $dat0,q8 + aesmc $dat0,$dat0 + aese $dat1,q8 + aesmc $dat1,$dat1 + aese $dat0,q9 + aesmc $dat0,$dat0 + aese $dat1,q9 + aesmc $dat1,$dat1 + vld1.8 {$in0},[$inp],$step + aese $dat0,q12 + aesmc $dat0,$dat0 + aese $dat1,q12 + aesmc $dat1,$dat1 + vld1.8 {$in1},[$inp] + aese $dat0,q13 + aesmc $dat0,$dat0 + aese $dat1,q13 + aesmc $dat1,$dat1 + veor $in0,$in0,$rndlast + aese $dat0,q14 + aesmc $dat0,$dat0 + aese $dat1,q14 + aesmc $dat1,$dat1 + veor $in1,$in1,$rndlast + aese $dat0,q15 + aese $dat1,q15 + + cmp $len,#1 + veor $in0,$in0,$dat0 + veor $in1,$in1,$dat1 + vst1.8 {$in0},[$out],#16 + b.eq .Lctr32_done + vst1.8 {$in1},[$out] + +.Lctr32_done: +___ +$code.=<<___ if ($flavour !~ /64/); + vldmia sp!,{d8-d15} + ldmia sp!,{r4-r10,pc} +___ +$code.=<<___ if ($flavour =~ /64/); + ldr x29,[sp],#16 + ret +___ +$code.=<<___; +.size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks +___ +}}} +$code.=<<___; +#endif +___ +######################################## +if ($flavour =~ /64/) { ######## 64-bit code + my %opcode = ( + "aesd" => 0x4e285800, "aese" => 0x4e284800, + "aesimc"=> 0x4e287800, "aesmc" => 0x4e286800 ); + + local *unaes = sub { + my ($mnemonic,$arg)=@_; + + $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o && + sprintf ".inst\t0x%08x\t//%s %s", + $opcode{$mnemonic}|$1|($2<<5), + $mnemonic,$arg; + }; + + foreach(split("\n",$code)) { + s/\`([^\`]*)\`/eval($1)/geo; + + s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo; # old->new registers + s/@\s/\/\//o; # old->new style commentary + + #s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or + s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o or + s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel $2,$3,$2,$1/o or + s/vmov\.i8/movi/o or # fix up legacy mnemonics + s/vext\.8/ext/o or + s/vrev32\.8/rev32/o or + s/vtst\.8/cmtst/o or + s/vshr/ushr/o or + s/^(\s+)v/$1/o or # strip off v prefix + s/\bbx\s+lr\b/ret/o; + + # fix up remaining legacy suffixes + s/\.[ui]?8//o; + m/\],#8/o and s/\.16b/\.8b/go; + s/\.[ui]?32//o and s/\.16b/\.4s/go; + s/\.[ui]?64//o and s/\.16b/\.2d/go; + s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o; + + # Switch preprocessor checks to aarch64 versions. 
+ s/__ARME([BL])__/__AARCH64E$1__/go; + + print $_,"\n"; + } +} else { ######## 32-bit code + my %opcode = ( + "aesd" => 0xf3b00340, "aese" => 0xf3b00300, + "aesimc"=> 0xf3b003c0, "aesmc" => 0xf3b00380 ); + + local *unaes = sub { + my ($mnemonic,$arg)=@_; + + if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) { + my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19) + |(($2&7)<<1) |(($2&8)<<2); + # since ARMv7 instructions are always encoded little-endian. + # correct solution is to use .inst directive, but older + # assemblers don't implement it:-( + sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s", + $word&0xff,($word>>8)&0xff, + ($word>>16)&0xff,($word>>24)&0xff, + $mnemonic,$arg; + } + }; + + sub unvtbl { + my $arg=shift; + + $arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o && + sprintf "vtbl.8 d%d,{q%d},d%d\n\t". + "vtbl.8 d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1; + } + + sub unvdup32 { + my $arg=shift; + + $arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o && + sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1; + } + + sub unvmov32 { + my $arg=shift; + + $arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o && + sprintf "vmov.32 d%d[%d],%s",2*$1+($2>>1),$2&1,$3; + } + + foreach(split("\n",$code)) { + s/\`([^\`]*)\`/eval($1)/geo; + + s/\b[wx]([0-9]+)\b/r$1/go; # new->old registers + s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go; # new->old registers + s/\/\/\s?/@ /o; # new->old style commentary + + # fix up remaining new-style suffixes + s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo or + s/\],#[0-9]+/]!/o; + + s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or + s/cclr\s+([^,]+),\s*([a-z]+)/mov$2 $1,#0/o or + s/vtbl\.8\s+(.*)/unvtbl($1)/geo or + s/vdup\.32\s+(.*)/unvdup32($1)/geo or + s/vmov\.32\s+(.*)/unvmov32($1)/geo or + s/^(\s+)b\./$1b/o or + s/^(\s+)mov\./$1mov/o or + s/^(\s+)ret/$1bx\tlr/o; + + print $_,"\n"; + } +} + +close STDOUT or die "error closing STDOUT: $!"; diff --git a/ring-0.17.14/crypto/fipsmodule/aes/asm/aesv8-gcm-armv8.pl b/ring-0.17.14/crypto/fipsmodule/aes/asm/aesv8-gcm-armv8.pl new file mode 100644 index 0000000000..4f862a9ba5 --- /dev/null +++ b/ring-0.17.14/crypto/fipsmodule/aes/asm/aesv8-gcm-armv8.pl @@ -0,0 +1,1541 @@ +#! /usr/bin/env perl + +# Copyright (c) 2022, ARM Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#======================================================================== +# Written by Fangming Fang for the OpenSSL project, +# derived from https://github.com/ARM-software/AArch64cryptolib, original +# author Samuel Lee . 
+#======================================================================== +# +# Approach - assume we don't want to reload constants, so reserve ~half of +# vector register file for constants +# +# main loop to act on 4 16B blocks per iteration, and then do modulo of the +# accumulated intermediate hashes from the 4 blocks +# +# ____________________________________________________ +# | | +# | PRE | +# |____________________________________________________| +# | | | | +# | CTR block 4k+8 | AES block 4k+4 | GHASH block 4k+0 | +# |________________|________________|__________________| +# | | | | +# | CTR block 4k+9 | AES block 4k+5 | GHASH block 4k+1 | +# |________________|________________|__________________| +# | | | | +# | CTR block 4k+10| AES block 4k+6 | GHASH block 4k+2 | +# |________________|________________|__________________| +# | | | | +# | CTR block 4k+11| AES block 4k+7 | GHASH block 4k+3 | +# |________________|____(mostly)____|__________________| +# | | +# | MODULO | +# |____________________________________________________| +# +# PRE: Ensure previous generated intermediate hash is aligned and merged with +# result for GHASH 4k+0 +# +# EXT low_acc, low_acc, low_acc, #8 +# EOR res_curr (4k+0), res_curr (4k+0), low_acc +# +# CTR block: Increment and byte reverse counter in scalar registers and transfer +# to SIMD registers +# +# REV ctr32, rev_ctr32 +# ORR ctr64, constctr96_top32, ctr32, LSL #32 +# // Keeping this in scalar registers to free up space in SIMD RF +# INS ctr_next.d[0], constctr96_bottom64 +# INS ctr_next.d[1], ctr64X +# ADD rev_ctr32, #1 +# +# AES block: +# +# Do AES encryption/decryption on CTR block X and EOR it with input block X. +# Take 256 bytes key below for example. Doing small trick here of loading input +# in scalar registers, EORing with last key and then transferring Given we are +# very constrained in our ASIMD registers this is quite important +# +# Encrypt: +# LDR input_low, [ input_ptr ], #8 +# LDR input_high, [ input_ptr ], #8 +# EOR input_low, k14_low +# EOR input_high, k14_high +# INS res_curr.d[0], input_low +# INS res_curr.d[1], input_high +# AESE ctr_curr, k0; AESMC ctr_curr, ctr_curr +# AESE ctr_curr, k1; AESMC ctr_curr, ctr_curr +# AESE ctr_curr, k2; AESMC ctr_curr, ctr_curr +# AESE ctr_curr, k3; AESMC ctr_curr, ctr_curr +# AESE ctr_curr, k4; AESMC ctr_curr, ctr_curr +# AESE ctr_curr, k5; AESMC ctr_curr, ctr_curr +# AESE ctr_curr, k6; AESMC ctr_curr, ctr_curr +# AESE ctr_curr, k7; AESMC ctr_curr, ctr_curr +# AESE ctr_curr, k8; AESMC ctr_curr, ctr_curr +# AESE ctr_curr, k9; AESMC ctr_curr, ctr_curr +# AESE ctr_curr, k10; AESMC ctr_curr, ctr_curr +# AESE ctr_curr, k11; AESMC ctr_curr, ctr_curr +# AESE ctr_curr, k12; AESMC ctr_curr, ctr_curr +# AESE ctr_curr, k13 +# EOR res_curr, res_curr, ctr_curr +# ST1 { res_curr.16b }, [ output_ptr ], #16 +# +# Decrypt: +# AESE ctr_curr, k0; AESMC ctr_curr, ctr_curr +# AESE ctr_curr, k1; AESMC ctr_curr, ctr_curr +# AESE ctr_curr, k2; AESMC ctr_curr, ctr_curr +# AESE ctr_curr, k3; AESMC ctr_curr, ctr_curr +# AESE ctr_curr, k4; AESMC ctr_curr, ctr_curr +# AESE ctr_curr, k5; AESMC ctr_curr, ctr_curr +# AESE ctr_curr, k6; AESMC ctr_curr, ctr_curr +# AESE ctr_curr, k7; AESMC ctr_curr, ctr_curr +# AESE ctr_curr, k8; AESMC ctr_curr, ctr_curr +# AESE ctr_curr, k9; AESMC ctr_curr, ctr_curr +# AESE ctr_curr, k10; AESMC ctr_curr, ctr_curr +# AESE ctr_curr, k11; AESMC ctr_curr, ctr_curr +# AESE ctr_curr, k12; AESMC ctr_curr, ctr_curr +# AESE ctr_curr, k13 +# LDR res_curr, [ input_ptr ], #16 +# EOR res_curr, res_curr, ctr_curr 
+# MOV output_low, res_curr.d[0] +# MOV output_high, res_curr.d[1] +# EOR output_low, k14_low +# EOR output_high, k14_high +# STP output_low, output_high, [ output_ptr ], #16 +# +# GHASH block X: +# Do 128b karatsuba polynomial multiplication on block. We only have +# 64b->128b polynomial multipliers, naively that means we need to do 4 64b +# multiplies to generate a 128b. +# +# multiplication: +# Pmull(A,B) == (Pmull(Ah,Bh)<<128 | Pmull(Al,Bl)) ^ +# (Pmull(Ah,Bl) ^ Pmull(Al,Bh))<<64 +# +# The idea behind Karatsuba multiplication is that we can do just 3 64b +# multiplies: +# Pmull(A,B) == (Pmull(Ah,Bh)<<128 | Pmull(Al,Bl)) ^ +# (Pmull(Ah^Al,Bh^Bl) ^ Pmull(Ah,Bh) ^ +# Pmull(Al,Bl))<<64 +# +# There is some complication here because the bit order of GHASH's PMULL is +# reversed compared to elsewhere, so we are multiplying with "twisted" +# powers of H +# +# Note: We can PMULL directly into the acc_x in first GHASH of the loop +# +# Note: For scheduling big cores we want to split the processing to happen over +# two loop iterations - otherwise the critical path latency dominates the +# performance. +# +# This has a knock on effect on register pressure, so we have to be a bit +# more clever with our temporary registers than indicated here +# +# REV64 res_curr, res_curr +# INS t_m.d[0], res_curr.d[1] +# EOR t_m.8B, t_m.8B, res_curr.8B +# PMULL2 t_h, res_curr, HX +# PMULL t_l, res_curr, HX +# PMULL t_m, t_m, HX_k +# EOR acc_h, acc_h, t_h +# EOR acc_l, acc_l, t_l +# EOR acc_m, acc_m, t_m +# +# MODULO: take the partial accumulators (~representing sum of 256b +# multiplication results), from GHASH and do modulo reduction on them +# There is some complication here because the bit order of GHASH's +# PMULL is reversed compared to elsewhere, so we are doing modulo with +# a reversed constant +# +# EOR acc_m, acc_m, acc_h +# EOR acc_m, acc_m, acc_l // Finish off karatsuba processing +# PMULL t_mod, acc_h, mod_constant +# EXT acc_h, acc_h, acc_h, #8 +# EOR acc_m, acc_m, acc_h +# EOR acc_m, acc_m, t_mod +# PMULL acc_h, acc_m, mod_constant +# EXT acc_m, acc_m, acc_m, #8 +# EOR acc_l, acc_l, acc_h +# EOR acc_l, acc_l, acc_m +# +# This code was then modified to merge the AES-128-GCM, AES-192-GCM, and +# AES-256-GCM implementations into a single function to reduce size. We move the +# last two round keys into consistent registers across all sizes, as they're +# treated special. Then, after rounds 0 through 8, we added some branches to +# conditionally run rounds 9-10 (AES-192 + AES-256) and 11-12 (AES-256), before +# merging back into code which finishes up the last two rounds. +# +# There is a mostly decision to be made around how much parallel work goes +# before or after the conditional part. We attempted to preserve the original +# scheduling where possible, but it's possible other schedulings are more +# optimal with the current ordering. 
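+#
+# As a concrete sanity check of the Karatsuba identity above, here is a small
+# standalone Perl sketch (illustrative only, not part of the generated
+# assembly; the operand values and variable names are arbitrary). It models
+# PMULL as a carry-less multiply over GF(2)[x] on Math::BigInt values and
+# verifies that the 3-multiply Karatsuba form gives the same full-width
+# product of two 128-bit operands as the 4-multiply schoolbook form. It is
+# kept commented out so it has no effect on this script; copy it into its own
+# file to run it.
+#
+#   #!/usr/bin/env perl
+#   use strict;
+#   use warnings;
+#   use Math::BigInt;
+#
+#   # PMULL model: carry-less multiply of two 64-bit values,
+#   # yielding an up-to-127-bit product.
+#   sub clmul {
+#       my ($a, $b) = @_;
+#       my $acc = Math::BigInt->new(0);
+#       for my $i (0 .. 63) {
+#           $acc->bxor($b->copy->blsft($i)) if $a->copy->brsft($i)->band(1)->is_one();
+#       }
+#       return $acc;
+#   }
+#
+#   # Arbitrary 128-bit operands A = Ah||Al and B = Bh||Bl, split into 64-bit halves.
+#   my ($ah, $al) = map { Math::BigInt->from_hex($_) } qw(0x0123456789abcdef 0x0fedcba987654321);
+#   my ($bh, $bl) = map { Math::BigInt->from_hex($_) } qw(0xdeadbeefcafef00d 0x0badc0dedeadc0de);
+#
+#   # Schoolbook form: four 64x64 carry-less multiplies.
+#   my $school = clmul($ah, $bh)->blsft(128)
+#       ->bxor(clmul($al, $bl))
+#       ->bxor(clmul($ah, $bl)->bxor(clmul($al, $bh))->blsft(64));
+#
+#   # Karatsuba form: three multiplies, as used per GHASH block above.
+#   my $hi  = clmul($ah, $bh);
+#   my $lo  = clmul($al, $bl);
+#   my $mid = clmul($ah->copy->bxor($al), $bh->copy->bxor($bl));
+#   $mid->bxor($hi)->bxor($lo);                      # now equals Ah*Bl ^ Al*Bh
+#   my $karatsuba = $hi->copy->blsft(128)->bxor($lo)->bxor($mid->blsft(64));
+#
+#   print $school->bcmp($karatsuba) == 0 ? "identity holds\n" : "mismatch\n";
+#
+# In the kernels below the same trick appears as the PMULL/PMULL2 high and low
+# multiplies plus one PMULL on the XORed halves (the "mid" terms), followed by
+# the MODULO reduction described above.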
+ +$flavour = shift; +$output = shift; + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or +die "can't locate arm-xlate.pl"; + +open OUT,"| \"$^X\" $xlate $flavour $output"; +*STDOUT=*OUT; + +$code=<<___; +#if __ARM_MAX_ARCH__ >= 8 + +.arch armv8-a+crypto +.text +___ + +$input_ptr="x0"; #argument block +$bit_length="x1"; +$output_ptr="x2"; +$current_tag="x3"; +$Htable="x6"; +$counter="x16"; +$cc="x8"; + +{ +my ($end_input_ptr,$main_end_input_ptr,$input_l0,$input_h0)=map("x$_",(4..7)); +my ($input_l1,$input_h1,$input_l2,$input_h2,$input_l3,$input_h3)=map("x$_",(19..24)); +my ($output_l1,$output_h1,$output_l2,$output_h2,$output_l3,$output_h3)=map("x$_",(19..24)); +my ($output_l0,$output_h0)=map("x$_",(6..7)); + +# rkN_l and rkN_h store the final round key, which is handled slightly +# differently because it is EORed through general-purpose registers. +my $ctr32w="w9"; +my ($ctr32x,$ctr96_b64x,$ctr96_t32x,$rctr32x,$rkN_l,$rkN_h,$len)=map("x$_",(9..15)); +my ($ctr96_t32w,$rctr32w)=map("w$_",(11..12)); + +my $rounds="x17"; +my $roundsw="w17"; + +my ($ctr0b,$ctr1b,$ctr2b,$ctr3b,$res0b,$res1b,$res2b,$res3b)=map("v$_.16b",(0..7)); +my ($ctr0,$ctr1,$ctr2,$ctr3,$res0,$res1,$res2,$res3)=map("v$_",(0..7)); +my ($ctr0d,$ctr1d,$ctr2d,$ctr3d,$res0d,$res1d,$res2d,$res3d)=map("d$_",(0..7)); +my ($res0q,$res1q,$res2q,$res3q)=map("q$_",(4..7)); + +my ($acc_hb,$acc_mb,$acc_lb)=map("v$_.16b",(9..11)); +my ($acc_h,$acc_m,$acc_l)=map("v$_",(9..11)); +my ($acc_hd,$acc_md,$acc_ld)=map("d$_",(9..11)); + +my ($h1,$h2,$h3,$h4,$h12k,$h34k)=map("v$_",(12..17)); +my ($h1q,$h2q,$h3q,$h4q)=map("q$_",(12..15)); +my ($h1b,$h2b,$h3b,$h4b)=map("v$_.16b",(12..15)); + +my $t0="v8"; +my $t0d="d8"; +my $t1="v4"; +my $t1d="d4"; +my $t2="v8"; +my $t2d="d8"; +my $t3="v4"; +my $t3d="d4"; +my $t4="v4"; +my $t4d="d4"; +my $t5="v5"; +my $t5d="d5"; +my $t6="v8"; +my $t6d="d8"; +my $t7="v5"; +my $t7d="d5"; +my $t8="v6"; +my $t8d="d6"; +my $t9="v4"; +my $t9d="d4"; + +my ($ctr_t0,$ctr_t1,$ctr_t2,$ctr_t3)=map("v$_",(4..7)); +my ($ctr_t0d,$ctr_t1d,$ctr_t2d,$ctr_t3d)=map("d$_",(4..7)); +my ($ctr_t0b,$ctr_t1b,$ctr_t2b,$ctr_t3b)=map("v$_.16b",(4..7)); + +my $mod_constantd="d8"; +my $mod_constant="v8"; +my $mod_t="v7"; + +# rkNm1 stores the second-to-last round key, which is handled slightly +# differently because it uses plain AESE instead of an AESE + AESMC macro-op. +my ($rk0,$rk1,$rk2,$rk3,$rk4,$rk5,$rk6,$rk7,$rk8,$rk9,$rk10,$rk11,$rk12,$rkNm1)=map("v$_.16b",(18..31)); +my ($rk0q,$rk1q,$rk2q,$rk3q,$rk4q,$rk5q,$rk6q,$rk7q,$rk8q,$rk9q,$rk10q,$rk11q,$rk12q,$rkNm1q)=map("q$_",(18..31)); +my $rk2q1="v20.1q"; +my $rk3q1="v21.1q"; +my $rk4v="v22"; +my $rk4d="d22"; + +################################################################################ +# size_t aes_gcm_enc_kernel(const uint8_t *in, +# size_t len_bits, +# uint8_t *out, +# u64 *Xi, +# uint8_t ivec[16], +# const void *key, +# const void *Htable); +# +$code.=<<___; +.global aes_gcm_enc_kernel +.type aes_gcm_enc_kernel,%function +.align 4 +aes_gcm_enc_kernel: + AARCH64_SIGN_LINK_REGISTER + stp x29, x30, [sp, #-128]! 
+ mov x29, sp + stp x19, x20, [sp, #16] + mov $counter, x4 + mov $cc, x5 + stp x21, x22, [sp, #32] + stp x23, x24, [sp, #48] + stp d8, d9, [sp, #64] + stp d10, d11, [sp, #80] + stp d12, d13, [sp, #96] + stp d14, d15, [sp, #112] + ldr $roundsw, [$cc, #240] + add $input_l1, $cc, $rounds, lsl #4 // borrow input_l1 for last key + ldp $rkN_l, $rkN_h, [$input_l1] // load round N keys + ldr $rkNm1q, [$input_l1, #-16] // load round N-1 keys + add $end_input_ptr, $input_ptr, $bit_length, lsr #3 // end_input_ptr + lsr $main_end_input_ptr, $bit_length, #3 // byte_len + mov $len, $main_end_input_ptr + ldp $ctr96_b64x, $ctr96_t32x, [$counter] // ctr96_b64, ctr96_t32 + ld1 { $ctr0b}, [$counter] // special case vector load initial counter so we can start first AES block as quickly as possible + sub $main_end_input_ptr, $main_end_input_ptr, #1 // byte_len - 1 + ldr $rk0q, [$cc, #0] // load rk0 + and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 // number of bytes to be processed in main loop (at least 1 byte must be handled by tail) + ldr $rk7q, [$cc, #112] // load rk7 + add $main_end_input_ptr, $main_end_input_ptr, $input_ptr + lsr $rctr32x, $ctr96_t32x, #32 + fmov $ctr2d, $ctr96_b64x // CTR block 2 + orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w + rev $rctr32w, $rctr32w // rev_ctr32 + fmov $ctr1d, $ctr96_b64x // CTR block 1 + aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 0 + add $rctr32w, $rctr32w, #1 // increment rev_ctr32 + rev $ctr32w, $rctr32w // CTR block 1 + fmov $ctr3d, $ctr96_b64x // CTR block 3 + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 1 + add $rctr32w, $rctr32w, #1 // CTR block 1 + ldr $rk1q, [$cc, #16] // load rk1 + fmov $ctr1.d[1], $ctr32x // CTR block 1 + rev $ctr32w, $rctr32w // CTR block 2 + add $rctr32w, $rctr32w, #1 // CTR block 2 + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 2 + ldr $rk2q, [$cc, #32] // load rk2 + fmov $ctr2.d[1], $ctr32x // CTR block 2 + rev $ctr32w, $rctr32w // CTR block 3 + aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 1 + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 3 + fmov $ctr3.d[1], $ctr32x // CTR block 3 + aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 0 + ldr $rk3q, [$cc, #48] // load rk3 + aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 2 + ldr $rk6q, [$cc, #96] // load rk6 + aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 0 + ldr $rk5q, [$cc, #80] // load rk5 + aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 1 + ldr $h3q, [$Htable, #48] // load h3l | h3h + ext $h3b, $h3b, $h3b, #8 + aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 0 + aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 1 + ldr $rk4q, [$cc, #64] // load rk4 + aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 2 + ldr $h2q, [$Htable, #32] // load h2l | h2h + ext $h2b, $h2b, $h2b, #8 + aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 1 + ldr $rk12q, [$cc, #192] // load rk12 + aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 2 + ldr $h4q, [$Htable, #80] // load h4l | h4h + ext $h4b, $h4b, $h4b, #8 + aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 3 + ldr $rk11q, [$cc, #176] // load rk11 + aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 2 + ldr $rk8q, [$cc, #128] // load rk8 + aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 3 + add $rctr32w, $rctr32w, #1 // CTR block 3 + aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b // 
AES block 0 - round 3 + aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 3 + ld1 { $acc_lb}, [$current_tag] + ext $acc_lb, $acc_lb, $acc_lb, #8 + rev64 $acc_lb, $acc_lb + aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 4 + aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 4 + aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 4 + aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 4 + cmp $rounds, #12 // setup flags for AES-128/192/256 check + aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 5 + aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 5 + aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 5 + aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 5 + aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 6 + trn2 $h34k.2d, $h3.2d, $h4.2d // h4l | h3l + aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 6 + ldr $rk9q, [$cc, #144] // load rk9 + aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 6 + ldr $h1q, [$Htable] // load h1l | h1h + ext $h1b, $h1b, $h1b, #8 + aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 6 + ldr $rk10q, [$cc, #160] // load rk10 + aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 7 + trn1 $acc_h.2d, $h3.2d, $h4.2d // h4h | h3h + aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 7 + aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 7 + aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 7 + trn2 $h12k.2d, $h1.2d, $h2.2d // h2l | h1l + aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 8 + aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 8 + aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 8 + aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 8 + b.lt .Lenc_finish_first_blocks // branch if AES-128 + + aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 9 + aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 9 + aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 9 + aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 9 + aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 10 + aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 10 + aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 10 + aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 10 + b.eq .Lenc_finish_first_blocks // branch if AES-192 + + aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 11 + aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 11 + aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 11 + aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 11 + aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 12 + aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 12 + aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 12 + aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 12 + +.Lenc_finish_first_blocks: + cmp $input_ptr, $main_end_input_ptr // check if we have <= 4 blocks + eor $h34k.16b, $h34k.16b, $acc_h.16b // h4k | h3k + aese $ctr2b, $rkNm1 // AES block 2 - round N-1 + trn1 $t0.2d, $h1.2d, $h2.2d // h2h | h1h + aese $ctr1b, $rkNm1 // AES block 1 - round N-1 + aese $ctr0b, $rkNm1 // AES block 0 - round N-1 + aese $ctr3b, $rkNm1 // AES block 3 - round N-1 + eor $h12k.16b, 
$h12k.16b, $t0.16b // h2k | h1k + b.ge .Lenc_tail // handle tail + + ldp $input_l1, $input_h1, [$input_ptr, #16] // AES block 1 - load plaintext + rev $ctr32w, $rctr32w // CTR block 4 + ldp $input_l0, $input_h0, [$input_ptr, #0] // AES block 0 - load plaintext + ldp $input_l3, $input_h3, [$input_ptr, #48] // AES block 3 - load plaintext + ldp $input_l2, $input_h2, [$input_ptr, #32] // AES block 2 - load plaintext + add $input_ptr, $input_ptr, #64 // AES input_ptr update + eor $input_l1, $input_l1, $rkN_l // AES block 1 - round N low + eor $input_h1, $input_h1, $rkN_h // AES block 1 - round N high + fmov $ctr_t1d, $input_l1 // AES block 1 - mov low + eor $input_l0, $input_l0, $rkN_l // AES block 0 - round N low + eor $input_h0, $input_h0, $rkN_h // AES block 0 - round N high + eor $input_h3, $input_h3, $rkN_h // AES block 3 - round N high + fmov $ctr_t0d, $input_l0 // AES block 0 - mov low + cmp $input_ptr, $main_end_input_ptr // check if we have <= 8 blocks + fmov $ctr_t0.d[1], $input_h0 // AES block 0 - mov high + eor $input_l3, $input_l3, $rkN_l // AES block 3 - round N low + eor $input_l2, $input_l2, $rkN_l // AES block 2 - round N low + fmov $ctr_t1.d[1], $input_h1 // AES block 1 - mov high + fmov $ctr_t2d, $input_l2 // AES block 2 - mov low + add $rctr32w, $rctr32w, #1 // CTR block 4 + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 4 + fmov $ctr_t3d, $input_l3 // AES block 3 - mov low + eor $input_h2, $input_h2, $rkN_h // AES block 2 - round N high + fmov $ctr_t2.d[1], $input_h2 // AES block 2 - mov high + eor $res0b, $ctr_t0b, $ctr0b // AES block 0 - result + fmov $ctr0d, $ctr96_b64x // CTR block 4 + fmov $ctr0.d[1], $ctr32x // CTR block 4 + rev $ctr32w, $rctr32w // CTR block 5 + add $rctr32w, $rctr32w, #1 // CTR block 5 + eor $res1b, $ctr_t1b, $ctr1b // AES block 1 - result + fmov $ctr1d, $ctr96_b64x // CTR block 5 + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 5 + fmov $ctr1.d[1], $ctr32x // CTR block 5 + rev $ctr32w, $rctr32w // CTR block 6 + st1 { $res0b}, [$output_ptr], #16 // AES block 0 - store result + fmov $ctr_t3.d[1], $input_h3 // AES block 3 - mov high + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 6 + eor $res2b, $ctr_t2b, $ctr2b // AES block 2 - result + st1 { $res1b}, [$output_ptr], #16 // AES block 1 - store result + add $rctr32w, $rctr32w, #1 // CTR block 6 + fmov $ctr2d, $ctr96_b64x // CTR block 6 + fmov $ctr2.d[1], $ctr32x // CTR block 6 + st1 { $res2b}, [$output_ptr], #16 // AES block 2 - store result + rev $ctr32w, $rctr32w // CTR block 7 + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 7 + eor $res3b, $ctr_t3b, $ctr3b // AES block 3 - result + st1 { $res3b}, [$output_ptr], #16 // AES block 3 - store result + b.ge .Lenc_prepretail // do prepretail + +.Lenc_main_loop: // main loop start + aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 0 + rev64 $res0b, $res0b // GHASH block 4k (only t0 is free) + aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 0 + fmov $ctr3d, $ctr96_b64x // CTR block 4k+3 + aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 0 + ext $acc_lb, $acc_lb, $acc_lb, #8 // PRE 0 + aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 1 + fmov $ctr3.d[1], $ctr32x // CTR block 4k+3 + aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 1 + ldp $input_l3, $input_h3, [$input_ptr, #48] // AES block 4k+7 - load plaintext + aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 1 + ldp $input_l2, $input_h2, [$input_ptr, #32] 
// AES block 4k+6 - load plaintext + aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 2 + eor $res0b, $res0b, $acc_lb // PRE 1 + aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 2 + aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 0 + eor $input_l3, $input_l3, $rkN_l // AES block 4k+7 - round N low + aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 3 + mov $acc_md, $h34k.d[1] // GHASH block 4k - mid + pmull2 $acc_h.1q, $res0.2d, $h4.2d // GHASH block 4k - high + eor $input_h2, $input_h2, $rkN_h // AES block 4k+6 - round N high + mov $t0d, $res0.d[1] // GHASH block 4k - mid + aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 1 + rev64 $res1b, $res1b // GHASH block 4k+1 (t0 and t1 free) + aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 4 + pmull $acc_l.1q, $res0.1d, $h4.1d // GHASH block 4k - low + eor $t0.8b, $t0.8b, $res0.8b // GHASH block 4k - mid + aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 2 + aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 5 + rev64 $res3b, $res3b // GHASH block 4k+3 (t0, t1, t2 and t3 free) + pmull2 $t1.1q, $res1.2d, $h3.2d // GHASH block 4k+1 - high + pmull $acc_m.1q, $t0.1d, $acc_m.1d // GHASH block 4k - mid + rev64 $res2b, $res2b // GHASH block 4k+2 (t0, t1, and t2 free) + pmull $t2.1q, $res1.1d, $h3.1d // GHASH block 4k+1 - low + eor $acc_hb, $acc_hb, $t1.16b // GHASH block 4k+1 - high + mov $t3d, $res1.d[1] // GHASH block 4k+1 - mid + aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 3 + aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 2 + eor $acc_lb, $acc_lb, $t2.16b // GHASH block 4k+1 - low + aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 3 + aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 4 + mov $t6d, $res2.d[1] // GHASH block 4k+2 - mid + aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 3 + eor $t3.8b, $t3.8b, $res1.8b // GHASH block 4k+1 - mid + aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 4 + aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 6 + eor $t6.8b, $t6.8b, $res2.8b // GHASH block 4k+2 - mid + aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 4 + pmull $t3.1q, $t3.1d, $h34k.1d // GHASH block 4k+1 - mid + aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 7 + aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 5 + ins $t6.d[1], $t6.d[0] // GHASH block 4k+2 - mid + aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 5 + aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 8 + aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 5 + aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 6 + eor $acc_mb, $acc_mb, $t3.16b // GHASH block 4k+1 - mid + pmull2 $t4.1q, $res2.2d, $h2.2d // GHASH block 4k+2 - high + pmull $t5.1q, $res2.1d, $h2.1d // GHASH block 4k+2 - low + aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 7 + pmull $t8.1q, $res3.1d, $h1.1d // GHASH block 4k+3 - low + eor $acc_hb, $acc_hb, $t4.16b // GHASH block 4k+2 - high + aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 6 + ldp $input_l1, $input_h1, [$input_ptr, #16] // AES block 4k+5 - load plaintext + aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 8 + mov $t9d, $res3.d[1] // GHASH block 4k+3 - mid + aese $ctr2b, $rk6 \n aesmc 
$ctr2b, $ctr2b // AES block 4k+6 - round 6 + eor $acc_lb, $acc_lb, $t5.16b // GHASH block 4k+2 - low + pmull2 $t6.1q, $t6.2d, $h12k.2d // GHASH block 4k+2 - mid + pmull2 $t7.1q, $res3.2d, $h1.2d // GHASH block 4k+3 - high + eor $t9.8b, $t9.8b, $res3.8b // GHASH block 4k+3 - mid + aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 7 + eor $input_l1, $input_l1, $rkN_l // AES block 4k+5 - round N low + aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 8 + eor $acc_mb, $acc_mb, $t6.16b // GHASH block 4k+2 - mid + aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 7 + eor $input_l2, $input_l2, $rkN_l // AES block 4k+6 - round N low + aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 8 + movi $mod_constant.8b, #0xc2 + pmull $t9.1q, $t9.1d, $h12k.1d // GHASH block 4k+3 - mid + eor $acc_hb, $acc_hb, $t7.16b // GHASH block 4k+3 - high + cmp $rounds, #12 // setup flags for AES-128/192/256 check + fmov $ctr_t1d, $input_l1 // AES block 4k+5 - mov low + ldp $input_l0, $input_h0, [$input_ptr, #0] // AES block 4k+4 - load plaintext + b.lt .Lenc_main_loop_continue // branch if AES-128 + + aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 9 + aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 9 + aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 9 + aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 9 + aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 10 + aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 10 + aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 10 + aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 10 + b.eq .Lenc_main_loop_continue // branch if AES-192 + + aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 11 + aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 11 + aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 11 + aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 11 + aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 12 + aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 12 + aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 12 + aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 12 + +.Lenc_main_loop_continue: + shl $mod_constantd, $mod_constantd, #56 // mod_constant + eor $acc_lb, $acc_lb, $t8.16b // GHASH block 4k+3 - low + eor $acc_mb, $acc_mb, $t9.16b // GHASH block 4k+3 - mid + add $rctr32w, $rctr32w, #1 // CTR block 4k+3 + eor $t9.16b, $acc_lb, $acc_hb // MODULO - karatsuba tidy up + add $input_ptr, $input_ptr, #64 // AES input_ptr update + pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d // MODULO - top 64b align with mid + rev $ctr32w, $rctr32w // CTR block 4k+8 + ext $acc_hb, $acc_hb, $acc_hb, #8 // MODULO - other top alignment + eor $input_l0, $input_l0, $rkN_l // AES block 4k+4 - round N low + eor $acc_mb, $acc_mb, $t9.16b // MODULO - karatsuba tidy up + eor $input_h0, $input_h0, $rkN_h // AES block 4k+4 - round N high + fmov $ctr_t0d, $input_l0 // AES block 4k+4 - mov low + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 4k+8 + eor $mod_t.16b, $acc_hb, $mod_t.16b // MODULO - fold into mid + eor $input_h1, $input_h1, $rkN_h // AES block 4k+5 - round N high + eor $input_h3, $input_h3, $rkN_h // AES block 4k+7 - round N high + add $rctr32w, $rctr32w, #1 // CTR block 4k+8 + aese $ctr0b, $rkNm1 // AES block 4k+4 - 
round N-1 + fmov $ctr_t0.d[1], $input_h0 // AES block 4k+4 - mov high + eor $acc_mb, $acc_mb, $mod_t.16b // MODULO - fold into mid + fmov $ctr_t3d, $input_l3 // AES block 4k+7 - mov low + aese $ctr1b, $rkNm1 // AES block 4k+5 - round N-1 + fmov $ctr_t1.d[1], $input_h1 // AES block 4k+5 - mov high + fmov $ctr_t2d, $input_l2 // AES block 4k+6 - mov low + cmp $input_ptr, $main_end_input_ptr // LOOP CONTROL + fmov $ctr_t2.d[1], $input_h2 // AES block 4k+6 - mov high + pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d // MODULO - mid 64b align with low + eor $res0b, $ctr_t0b, $ctr0b // AES block 4k+4 - result + fmov $ctr0d, $ctr96_b64x // CTR block 4k+8 + fmov $ctr0.d[1], $ctr32x // CTR block 4k+8 + rev $ctr32w, $rctr32w // CTR block 4k+9 + add $rctr32w, $rctr32w, #1 // CTR block 4k+9 + eor $res1b, $ctr_t1b, $ctr1b // AES block 4k+5 - result + fmov $ctr1d, $ctr96_b64x // CTR block 4k+9 + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 4k+9 + fmov $ctr1.d[1], $ctr32x // CTR block 4k+9 + aese $ctr2b, $rkNm1 // AES block 4k+6 - round N-1 + rev $ctr32w, $rctr32w // CTR block 4k+10 + st1 { $res0b}, [$output_ptr], #16 // AES block 4k+4 - store result + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 4k+10 + eor $acc_lb, $acc_lb, $acc_hb // MODULO - fold into low + fmov $ctr_t3.d[1], $input_h3 // AES block 4k+7 - mov high + ext $acc_mb, $acc_mb, $acc_mb, #8 // MODULO - other mid alignment + st1 { $res1b}, [$output_ptr], #16 // AES block 4k+5 - store result + add $rctr32w, $rctr32w, #1 // CTR block 4k+10 + aese $ctr3b, $rkNm1 // AES block 4k+7 - round N-1 + eor $res2b, $ctr_t2b, $ctr2b // AES block 4k+6 - result + fmov $ctr2d, $ctr96_b64x // CTR block 4k+10 + st1 { $res2b}, [$output_ptr], #16 // AES block 4k+6 - store result + fmov $ctr2.d[1], $ctr32x // CTR block 4k+10 + rev $ctr32w, $rctr32w // CTR block 4k+11 + eor $acc_lb, $acc_lb, $acc_mb // MODULO - fold into low + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 4k+11 + eor $res3b, $ctr_t3b, $ctr3b // AES block 4k+7 - result + st1 { $res3b}, [$output_ptr], #16 // AES block 4k+7 - store result + b.lt .Lenc_main_loop + +.Lenc_prepretail: // PREPRETAIL + aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 0 + rev64 $res2b, $res2b // GHASH block 4k+2 (t0, t1, and t2 free) + aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 0 + fmov $ctr3d, $ctr96_b64x // CTR block 4k+3 + aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 0 + rev64 $res0b, $res0b // GHASH block 4k (only t0 is free) + fmov $ctr3.d[1], $ctr32x // CTR block 4k+3 + ext $acc_lb, $acc_lb, $acc_lb, #8 // PRE 0 + aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 1 + aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 1 + eor $res0b, $res0b, $acc_lb // PRE 1 + rev64 $res1b, $res1b // GHASH block 4k+1 (t0 and t1 free) + aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 2 + aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 0 + mov $acc_md, $h34k.d[1] // GHASH block 4k - mid + aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 1 + pmull $acc_l.1q, $res0.1d, $h4.1d // GHASH block 4k - low + mov $t0d, $res0.d[1] // GHASH block 4k - mid + pmull2 $acc_h.1q, $res0.2d, $h4.2d // GHASH block 4k - high + aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 3 + aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 2 + eor $t0.8b, $t0.8b, $res0.8b // GHASH block 4k - mid + aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b // AES 
block 4k+4 - round 2 + aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 1 + aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 3 + pmull $acc_m.1q, $t0.1d, $acc_m.1d // GHASH block 4k - mid + pmull2 $t1.1q, $res1.2d, $h3.2d // GHASH block 4k+1 - high + pmull $t2.1q, $res1.1d, $h3.1d // GHASH block 4k+1 - low + aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 2 + eor $acc_hb, $acc_hb, $t1.16b // GHASH block 4k+1 - high + mov $t3d, $res1.d[1] // GHASH block 4k+1 - mid + aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 3 + eor $acc_lb, $acc_lb, $t2.16b // GHASH block 4k+1 - low + aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 3 + eor $t3.8b, $t3.8b, $res1.8b // GHASH block 4k+1 - mid + mov $t6d, $res2.d[1] // GHASH block 4k+2 - mid + aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 4 + rev64 $res3b, $res3b // GHASH block 4k+3 (t0, t1, t2 and t3 free) + aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 4 + pmull $t3.1q, $t3.1d, $h34k.1d // GHASH block 4k+1 - mid + eor $t6.8b, $t6.8b, $res2.8b // GHASH block 4k+2 - mid + add $rctr32w, $rctr32w, #1 // CTR block 4k+3 + pmull $t5.1q, $res2.1d, $h2.1d // GHASH block 4k+2 - low + aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 5 + aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 4 + eor $acc_mb, $acc_mb, $t3.16b // GHASH block 4k+1 - mid + pmull2 $t4.1q, $res2.2d, $h2.2d // GHASH block 4k+2 - high + eor $acc_lb, $acc_lb, $t5.16b // GHASH block 4k+2 - low + ins $t6.d[1], $t6.d[0] // GHASH block 4k+2 - mid + aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 5 + eor $acc_hb, $acc_hb, $t4.16b // GHASH block 4k+2 - high + mov $t9d, $res3.d[1] // GHASH block 4k+3 - mid + aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 4 + pmull2 $t6.1q, $t6.2d, $h12k.2d // GHASH block 4k+2 - mid + eor $t9.8b, $t9.8b, $res3.8b // GHASH block 4k+3 - mid + pmull2 $t7.1q, $res3.2d, $h1.2d // GHASH block 4k+3 - high + aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 5 + pmull $t9.1q, $t9.1d, $h12k.1d // GHASH block 4k+3 - mid + eor $acc_mb, $acc_mb, $t6.16b // GHASH block 4k+2 - mid + aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 5 + aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 6 + aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 6 + aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 6 + movi $mod_constant.8b, #0xc2 + aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 6 + aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 7 + eor $acc_hb, $acc_hb, $t7.16b // GHASH block 4k+3 - high + aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 7 + aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 7 + shl $mod_constantd, $mod_constantd, #56 // mod_constant + aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 8 + eor $acc_mb, $acc_mb, $t9.16b // GHASH block 4k+3 - mid + pmull $t8.1q, $res3.1d, $h1.1d // GHASH block 4k+3 - low + aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 8 + cmp $rounds, #12 // setup flags for AES-128/192/256 check + aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 8 + eor $acc_lb, $acc_lb, $t8.16b // GHASH block 4k+3 - low + aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 7 + eor $acc_mb, $acc_mb, $acc_hb // karatsuba 
tidy up + aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 8 + pmull $t1.1q, $acc_h.1d, $mod_constant.1d + ext $acc_hb, $acc_hb, $acc_hb, #8 + eor $acc_mb, $acc_mb, $acc_lb + b.lt .Lenc_finish_prepretail // branch if AES-128 + + aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 9 + aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 9 + aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 9 + aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 9 + aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 10 + aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 10 + aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 10 + aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 10 + b.eq .Lenc_finish_prepretail // branch if AES-192 + + aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 11 + aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 11 + aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 11 + aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 11 + aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 12 + aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 12 + aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 12 + aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 12 + +.Lenc_finish_prepretail: + eor $acc_mb, $acc_mb, $t1.16b + eor $acc_mb, $acc_mb, $acc_hb + pmull $t1.1q, $acc_m.1d, $mod_constant.1d + ext $acc_mb, $acc_mb, $acc_mb, #8 + aese $ctr1b, $rkNm1 // AES block 4k+5 - round N-1 + eor $acc_lb, $acc_lb, $t1.16b + aese $ctr3b, $rkNm1 // AES block 4k+7 - round N-1 + aese $ctr0b, $rkNm1 // AES block 4k+4 - round N-1 + aese $ctr2b, $rkNm1 // AES block 4k+6 - round N-1 + eor $acc_lb, $acc_lb, $acc_mb + +.Lenc_tail: // TAIL + ext $t0.16b, $acc_lb, $acc_lb, #8 // prepare final partial tag + sub $main_end_input_ptr, $end_input_ptr, $input_ptr // main_end_input_ptr is number of bytes left to process + ldp $input_l0, $input_h0, [$input_ptr], #16 // AES block 4k+4 - load plaintext + eor $input_l0, $input_l0, $rkN_l // AES block 4k+4 - round N low + eor $input_h0, $input_h0, $rkN_h // AES block 4k+4 - round N high + cmp $main_end_input_ptr, #48 + fmov $ctr_t0d, $input_l0 // AES block 4k+4 - mov low + fmov $ctr_t0.d[1], $input_h0 // AES block 4k+4 - mov high + eor $res1b, $ctr_t0b, $ctr0b // AES block 4k+4 - result + b.gt .Lenc_blocks_more_than_3 + cmp $main_end_input_ptr, #32 + mov $ctr3b, $ctr2b + movi $acc_l.8b, #0 + movi $acc_h.8b, #0 + sub $rctr32w, $rctr32w, #1 + mov $ctr2b, $ctr1b + movi $acc_m.8b, #0 + b.gt .Lenc_blocks_more_than_2 + mov $ctr3b, $ctr1b + sub $rctr32w, $rctr32w, #1 + cmp $main_end_input_ptr, #16 + b.gt .Lenc_blocks_more_than_1 + sub $rctr32w, $rctr32w, #1 + b .Lenc_blocks_less_than_1 +.Lenc_blocks_more_than_3: // blocks left > 3 + st1 { $res1b}, [$output_ptr], #16 // AES final-3 block - store result + ldp $input_l0, $input_h0, [$input_ptr], #16 // AES final-2 block - load input low & high + rev64 $res0b, $res1b // GHASH final-3 block + eor $input_l0, $input_l0, $rkN_l // AES final-2 block - round N low + eor $res0b, $res0b, $t0.16b // feed in partial tag + eor $input_h0, $input_h0, $rkN_h // AES final-2 block - round N high + mov $rk4d, $res0.d[1] // GHASH final-3 block - mid + fmov $res1d, $input_l0 // AES final-2 block - mov low + fmov $res1.d[1], $input_h0 // AES final-2 block - mov 
high + eor $rk4v.8b, $rk4v.8b, $res0.8b // GHASH final-3 block - mid + movi $t0.8b, #0 // suppress further partial tag feed in + mov $acc_md, $h34k.d[1] // GHASH final-3 block - mid + pmull $acc_l.1q, $res0.1d, $h4.1d // GHASH final-3 block - low + pmull2 $acc_h.1q, $res0.2d, $h4.2d // GHASH final-3 block - high + pmull $acc_m.1q, $rk4v.1d, $acc_m.1d // GHASH final-3 block - mid + eor $res1b, $res1b, $ctr1b // AES final-2 block - result +.Lenc_blocks_more_than_2: // blocks left > 2 + st1 { $res1b}, [$output_ptr], #16 // AES final-2 block - store result + ldp $input_l0, $input_h0, [$input_ptr], #16 // AES final-1 block - load input low & high + rev64 $res0b, $res1b // GHASH final-2 block + eor $input_l0, $input_l0, $rkN_l // AES final-1 block - round N low + eor $res0b, $res0b, $t0.16b // feed in partial tag + fmov $res1d, $input_l0 // AES final-1 block - mov low + eor $input_h0, $input_h0, $rkN_h // AES final-1 block - round N high + fmov $res1.d[1], $input_h0 // AES final-1 block - mov high + movi $t0.8b, #0 // suppress further partial tag feed in + pmull2 $rk2q1, $res0.2d, $h3.2d // GHASH final-2 block - high + mov $rk4d, $res0.d[1] // GHASH final-2 block - mid + pmull $rk3q1, $res0.1d, $h3.1d // GHASH final-2 block - low + eor $rk4v.8b, $rk4v.8b, $res0.8b // GHASH final-2 block - mid + eor $res1b, $res1b, $ctr2b // AES final-1 block - result + eor $acc_hb, $acc_hb, $rk2 // GHASH final-2 block - high + pmull $rk4v.1q, $rk4v.1d, $h34k.1d // GHASH final-2 block - mid + eor $acc_lb, $acc_lb, $rk3 // GHASH final-2 block - low + eor $acc_mb, $acc_mb, $rk4v.16b // GHASH final-2 block - mid +.Lenc_blocks_more_than_1: // blocks left > 1 + st1 { $res1b}, [$output_ptr], #16 // AES final-1 block - store result + rev64 $res0b, $res1b // GHASH final-1 block + ldp $input_l0, $input_h0, [$input_ptr], #16 // AES final block - load input low & high + eor $res0b, $res0b, $t0.16b // feed in partial tag + movi $t0.8b, #0 // suppress further partial tag feed in + eor $input_l0, $input_l0, $rkN_l // AES final block - round N low + mov $rk4d, $res0.d[1] // GHASH final-1 block - mid + pmull2 $rk2q1, $res0.2d, $h2.2d // GHASH final-1 block - high + eor $input_h0, $input_h0, $rkN_h // AES final block - round N high + eor $rk4v.8b, $rk4v.8b, $res0.8b // GHASH final-1 block - mid + eor $acc_hb, $acc_hb, $rk2 // GHASH final-1 block - high + ins $rk4v.d[1], $rk4v.d[0] // GHASH final-1 block - mid + fmov $res1d, $input_l0 // AES final block - mov low + fmov $res1.d[1], $input_h0 // AES final block - mov high + pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d // GHASH final-1 block - mid + pmull $rk3q1, $res0.1d, $h2.1d // GHASH final-1 block - low + eor $res1b, $res1b, $ctr3b // AES final block - result + eor $acc_mb, $acc_mb, $rk4v.16b // GHASH final-1 block - mid + eor $acc_lb, $acc_lb, $rk3 // GHASH final-1 block - low +.Lenc_blocks_less_than_1: // blocks left <= 1 + and $bit_length, $bit_length, #127 // bit_length %= 128 + mvn $rkN_l, xzr // rkN_l = 0xffffffffffffffff + sub $bit_length, $bit_length, #128 // bit_length -= 128 + neg $bit_length, $bit_length // bit_length = 128 - #bits in input (in range [1,128]) + ld1 { $rk0}, [$output_ptr] // load existing bytes where the possibly partial last block is to be stored + mvn $rkN_h, xzr // rkN_h = 0xffffffffffffffff + and $bit_length, $bit_length, #127 // bit_length %= 128 + lsr $rkN_h, $rkN_h, $bit_length // rkN_h is mask for top 64b of last block + cmp $bit_length, #64 + csel $input_l0, $rkN_l, $rkN_h, lt + csel $input_h0, $rkN_h, xzr, lt + fmov $ctr0d, $input_l0 // ctr0b is mask 
for last block + fmov $ctr0.d[1], $input_h0 + and $res1b, $res1b, $ctr0b // possibly partial last block has zeroes in highest bits + rev64 $res0b, $res1b // GHASH final block + eor $res0b, $res0b, $t0.16b // feed in partial tag + bif $res1b, $rk0, $ctr0b // insert existing bytes in top end of result before storing + pmull2 $rk2q1, $res0.2d, $h1.2d // GHASH final block - high + mov $t0d, $res0.d[1] // GHASH final block - mid + rev $ctr32w, $rctr32w + pmull $rk3q1, $res0.1d, $h1.1d // GHASH final block - low + eor $acc_hb, $acc_hb, $rk2 // GHASH final block - high + eor $t0.8b, $t0.8b, $res0.8b // GHASH final block - mid + pmull $t0.1q, $t0.1d, $h12k.1d // GHASH final block - mid + eor $acc_lb, $acc_lb, $rk3 // GHASH final block - low + eor $acc_mb, $acc_mb, $t0.16b // GHASH final block - mid + movi $mod_constant.8b, #0xc2 + eor $t9.16b, $acc_lb, $acc_hb // MODULO - karatsuba tidy up + shl $mod_constantd, $mod_constantd, #56 // mod_constant + eor $acc_mb, $acc_mb, $t9.16b // MODULO - karatsuba tidy up + pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d // MODULO - top 64b align with mid + ext $acc_hb, $acc_hb, $acc_hb, #8 // MODULO - other top alignment + eor $acc_mb, $acc_mb, $mod_t.16b // MODULO - fold into mid + eor $acc_mb, $acc_mb, $acc_hb // MODULO - fold into mid + pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d // MODULO - mid 64b align with low + ext $acc_mb, $acc_mb, $acc_mb, #8 // MODULO - other mid alignment + str $ctr32w, [$counter, #12] // store the updated counter + st1 { $res1b}, [$output_ptr] // store all 16B + eor $acc_lb, $acc_lb, $acc_hb // MODULO - fold into low + eor $acc_lb, $acc_lb, $acc_mb // MODULO - fold into low + ext $acc_lb, $acc_lb, $acc_lb, #8 + rev64 $acc_lb, $acc_lb + mov x0, $len + st1 { $acc_l.16b }, [$current_tag] + ldp x19, x20, [sp, #16] + ldp x21, x22, [sp, #32] + ldp x23, x24, [sp, #48] + ldp d8, d9, [sp, #64] + ldp d10, d11, [sp, #80] + ldp d12, d13, [sp, #96] + ldp d14, d15, [sp, #112] + ldp x29, x30, [sp], #128 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size aes_gcm_enc_kernel,.-aes_gcm_enc_kernel +___ + +{ +my $t8="v4"; +my $t8d="d4"; +my $t9="v6"; +my $t9d="d6"; +################################################################################ +# size_t aes_gcm_dec_kernel(const uint8_t *in, +# size_t len_bits, +# uint8_t *out, +# u64 *Xi, +# uint8_t ivec[16], +# const void *key); +# +$code.=<<___; +.global aes_gcm_dec_kernel +.type aes_gcm_dec_kernel,%function +.align 4 +aes_gcm_dec_kernel: + AARCH64_SIGN_LINK_REGISTER + stp x29, x30, [sp, #-128]! 
+ mov x29, sp + stp x19, x20, [sp, #16] + mov $counter, x4 + mov $cc, x5 + stp x21, x22, [sp, #32] + stp x23, x24, [sp, #48] + stp d8, d9, [sp, #64] + stp d10, d11, [sp, #80] + stp d12, d13, [sp, #96] + stp d14, d15, [sp, #112] + ldr $roundsw, [$cc, #240] + add $input_l1, $cc, $rounds, lsl #4 // borrow input_l1 for last key + ldp $rkN_l, $rkN_h, [$input_l1] // load round N keys + ldr $rkNm1q, [$input_l1, #-16] // load round N-1 keys + lsr $main_end_input_ptr, $bit_length, #3 // byte_len + mov $len, $main_end_input_ptr + ldp $ctr96_b64x, $ctr96_t32x, [$counter] // ctr96_b64, ctr96_t32 + ldr $rk8q, [$cc, #128] // load rk8 + sub $main_end_input_ptr, $main_end_input_ptr, #1 // byte_len - 1 + ldr $rk7q, [$cc, #112] // load rk7 + and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 // number of bytes to be processed in main loop (at least 1 byte must be handled by tail) + add $end_input_ptr, $input_ptr, $bit_length, lsr #3 // end_input_ptr + ldr $rk6q, [$cc, #96] // load rk6 + lsr $rctr32x, $ctr96_t32x, #32 + ldr $rk5q, [$cc, #80] // load rk5 + orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w + ldr $rk3q, [$cc, #48] // load rk3 + add $main_end_input_ptr, $main_end_input_ptr, $input_ptr + rev $rctr32w, $rctr32w // rev_ctr32 + add $rctr32w, $rctr32w, #1 // increment rev_ctr32 + fmov $ctr3d, $ctr96_b64x // CTR block 3 + rev $ctr32w, $rctr32w // CTR block 1 + add $rctr32w, $rctr32w, #1 // CTR block 1 + fmov $ctr1d, $ctr96_b64x // CTR block 1 + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 1 + ld1 { $ctr0b}, [$counter] // special case vector load initial counter so we can start first AES block as quickly as possible + fmov $ctr1.d[1], $ctr32x // CTR block 1 + rev $ctr32w, $rctr32w // CTR block 2 + add $rctr32w, $rctr32w, #1 // CTR block 2 + fmov $ctr2d, $ctr96_b64x // CTR block 2 + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 2 + fmov $ctr2.d[1], $ctr32x // CTR block 2 + rev $ctr32w, $rctr32w // CTR block 3 + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 3 + ldr $rk0q, [$cc, #0] // load rk0 + fmov $ctr3.d[1], $ctr32x // CTR block 3 + add $rctr32w, $rctr32w, #1 // CTR block 3 + ldr $rk4q, [$cc, #64] // load rk4 + ldr $rk1q, [$cc, #16] // load rk1 + aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 0 + ldr $h3q, [$Htable, #48] // load h3l | h3h + ext $h3b, $h3b, $h3b, #8 + aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 0 + ldr $h4q, [$Htable, #80] // load h4l | h4h + ext $h4b, $h4b, $h4b, #8 + aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 0 + ldr $h2q, [$Htable, #32] // load h2l | h2h + ext $h2b, $h2b, $h2b, #8 + aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 0 + ldr $rk2q, [$cc, #32] // load rk2 + aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 1 + aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 1 + ld1 { $acc_lb}, [$current_tag] + ext $acc_lb, $acc_lb, $acc_lb, #8 + rev64 $acc_lb, $acc_lb + aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 1 + ldr $rk9q, [$cc, #144] // load rk9 + aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 1 + ldr $rk12q, [$cc, #192] // load rk12 + aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 2 + ldr $h1q, [$Htable] // load h1l | h1h + ext $h1b, $h1b, $h1b, #8 + aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 2 + ldr $rk10q, [$cc, #160] // load rk10 + aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 2 + aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b // AES block 
0 - round 3 + aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 2 + aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 3 + aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 4 + aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 3 + aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 3 + aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 4 + aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 4 + aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 4 + aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 5 + aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 5 + aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 5 + aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 5 + aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 6 + aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 6 + cmp $rounds, #12 // setup flags for AES-128/192/256 check + aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 6 + aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 6 + aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 7 + aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 7 + aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 7 + aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 8 + aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 7 + aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 8 + aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 8 + ldr $rk11q, [$cc, #176] // load rk11 + aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 8 + b.lt .Ldec_finish_first_blocks // branch if AES-128 + + aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 9 + aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 9 + aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 9 + aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 9 + aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 10 + aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 10 + aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 10 + aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 10 + b.eq .Ldec_finish_first_blocks // branch if AES-192 + + aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 11 + aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 11 + aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 11 + aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 11 + aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 12 + aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 12 + aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 12 + aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 12 + +.Ldec_finish_first_blocks: + cmp $input_ptr, $main_end_input_ptr // check if we have <= 4 blocks + trn1 $acc_h.2d, $h3.2d, $h4.2d // h4h | h3h + trn2 $h34k.2d, $h3.2d, $h4.2d // h4l | h3l + trn1 $t0.2d, $h1.2d, $h2.2d // h2h | h1h + trn2 $h12k.2d, $h1.2d, $h2.2d // h2l | h1l + eor $h34k.16b, $h34k.16b, $acc_h.16b // h4k | h3k + aese $ctr1b, $rkNm1 // AES block 1 - round N-1 + aese $ctr2b, $rkNm1 // AES block 2 - round N-1 + eor $h12k.16b, $h12k.16b, $t0.16b // h2k | h1k + aese $ctr3b, $rkNm1 // AES block 3 - round N-1 + aese $ctr0b, 
$rkNm1 // AES block 0 - round N-1 + b.ge .Ldec_tail // handle tail + + ldr $res0q, [$input_ptr, #0] // AES block 0 - load ciphertext + ldr $res1q, [$input_ptr, #16] // AES block 1 - load ciphertext + rev $ctr32w, $rctr32w // CTR block 4 + eor $ctr0b, $res0b, $ctr0b // AES block 0 - result + eor $ctr1b, $res1b, $ctr1b // AES block 1 - result + rev64 $res1b, $res1b // GHASH block 1 + ldr $res3q, [$input_ptr, #48] // AES block 3 - load ciphertext + mov $output_h0, $ctr0.d[1] // AES block 0 - mov high + mov $output_l0, $ctr0.d[0] // AES block 0 - mov low + rev64 $res0b, $res0b // GHASH block 0 + add $rctr32w, $rctr32w, #1 // CTR block 4 + fmov $ctr0d, $ctr96_b64x // CTR block 4 + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 4 + fmov $ctr0.d[1], $ctr32x // CTR block 4 + rev $ctr32w, $rctr32w // CTR block 5 + add $rctr32w, $rctr32w, #1 // CTR block 5 + mov $output_l1, $ctr1.d[0] // AES block 1 - mov low + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 5 + mov $output_h1, $ctr1.d[1] // AES block 1 - mov high + eor $output_h0, $output_h0, $rkN_h // AES block 0 - round N high + eor $output_l0, $output_l0, $rkN_l // AES block 0 - round N low + stp $output_l0, $output_h0, [$output_ptr], #16 // AES block 0 - store result + fmov $ctr1d, $ctr96_b64x // CTR block 5 + ldr $res2q, [$input_ptr, #32] // AES block 2 - load ciphertext + add $input_ptr, $input_ptr, #64 // AES input_ptr update + fmov $ctr1.d[1], $ctr32x // CTR block 5 + rev $ctr32w, $rctr32w // CTR block 6 + add $rctr32w, $rctr32w, #1 // CTR block 6 + eor $output_l1, $output_l1, $rkN_l // AES block 1 - round N low + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 6 + eor $output_h1, $output_h1, $rkN_h // AES block 1 - round N high + stp $output_l1, $output_h1, [$output_ptr], #16 // AES block 1 - store result + eor $ctr2b, $res2b, $ctr2b // AES block 2 - result + cmp $input_ptr, $main_end_input_ptr // check if we have <= 8 blocks + b.ge .Ldec_prepretail // do prepretail + +.Ldec_main_loop: // main loop start + mov $output_l2, $ctr2.d[0] // AES block 4k+2 - mov low + ext $acc_lb, $acc_lb, $acc_lb, #8 // PRE 0 + eor $ctr3b, $res3b, $ctr3b // AES block 4k+3 - result + aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 0 + mov $output_h2, $ctr2.d[1] // AES block 4k+2 - mov high + aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 0 + fmov $ctr2d, $ctr96_b64x // CTR block 4k+6 + fmov $ctr2.d[1], $ctr32x // CTR block 4k+6 + eor $res0b, $res0b, $acc_lb // PRE 1 + rev $ctr32w, $rctr32w // CTR block 4k+7 + aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 1 + mov $output_h3, $ctr3.d[1] // AES block 4k+3 - mov high + aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 1 + mov $output_l3, $ctr3.d[0] // AES block 4k+3 - mov low + pmull2 $acc_h.1q, $res0.2d, $h4.2d // GHASH block 4k - high + mov $t0d, $res0.d[1] // GHASH block 4k - mid + fmov $ctr3d, $ctr96_b64x // CTR block 4k+7 + aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 2 + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 4k+7 + aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 0 + fmov $ctr3.d[1], $ctr32x // CTR block 4k+7 + aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 2 + eor $t0.8b, $t0.8b, $res0.8b // GHASH block 4k - mid + aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 3 + eor $output_h2, $output_h2, $rkN_h // AES block 4k+2 - round N high + aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 1 + 
mov $acc_md, $h34k.d[1] // GHASH block 4k - mid + aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 3 + rev64 $res2b, $res2b // GHASH block 4k+2 + aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 0 + eor $output_l2, $output_l2, $rkN_l // AES block 4k+2 - round N low + aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 2 + stp $output_l2, $output_h2, [$output_ptr], #16 // AES block 4k+2 - store result + pmull $acc_l.1q, $res0.1d, $h4.1d // GHASH block 4k - low + pmull2 $t1.1q, $res1.2d, $h3.2d // GHASH block 4k+1 - high + aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 3 + rev64 $res3b, $res3b // GHASH block 4k+3 + pmull $acc_m.1q, $t0.1d, $acc_m.1d // GHASH block 4k - mid + eor $output_l3, $output_l3, $rkN_l // AES block 4k+3 - round N low + pmull $t2.1q, $res1.1d, $h3.1d // GHASH block 4k+1 - low + eor $output_h3, $output_h3, $rkN_h // AES block 4k+3 - round N high + eor $acc_hb, $acc_hb, $t1.16b // GHASH block 4k+1 - high + aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 4 + aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 1 + mov $t3d, $res1.d[1] // GHASH block 4k+1 - mid + aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 4 + eor $acc_lb, $acc_lb, $t2.16b // GHASH block 4k+1 - low + aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 5 + add $rctr32w, $rctr32w, #1 // CTR block 4k+7 + aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 2 + mov $t6d, $res2.d[1] // GHASH block 4k+2 - mid + aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 4 + eor $t3.8b, $t3.8b, $res1.8b // GHASH block 4k+1 - mid + pmull $t5.1q, $res2.1d, $h2.1d // GHASH block 4k+2 - low + aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 3 + eor $t6.8b, $t6.8b, $res2.8b // GHASH block 4k+2 - mid + aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 5 + aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 5 + eor $acc_lb, $acc_lb, $t5.16b // GHASH block 4k+2 - low + pmull $t3.1q, $t3.1d, $h34k.1d // GHASH block 4k+1 - mid + rev $ctr32w, $rctr32w // CTR block 4k+8 + aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 6 + ins $t6.d[1], $t6.d[0] // GHASH block 4k+2 - mid + aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 6 + add $rctr32w, $rctr32w, #1 // CTR block 4k+8 + aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 4 + aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 7 + eor $acc_mb, $acc_mb, $t3.16b // GHASH block 4k+1 - mid + aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 7 + pmull2 $t4.1q, $res2.2d, $h2.2d // GHASH block 4k+2 - high + mov $t9d, $res3.d[1] // GHASH block 4k+3 - mid + aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 5 + pmull2 $t6.1q, $t6.2d, $h12k.2d // GHASH block 4k+2 - mid + aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 8 + eor $acc_hb, $acc_hb, $t4.16b // GHASH block 4k+2 - high + aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 6 + pmull $t8.1q, $res3.1d, $h1.1d // GHASH block 4k+3 - low + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 4k+8 + eor $acc_mb, $acc_mb, $t6.16b // GHASH block 4k+2 - mid + pmull2 $t7.1q, $res3.2d, $h1.2d // GHASH block 4k+3 - high + cmp $rounds, #12 // setup flags for AES-128/192/256 check + eor $t9.8b, $t9.8b, $res3.8b // GHASH block 4k+3 - mid + aese $ctr1b, $rk8 \n aesmc $ctr1b, 
$ctr1b // AES block 4k+5 - round 8 + aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 6 + eor $acc_hb, $acc_hb, $t7.16b // GHASH block 4k+3 - high + pmull $t9.1q, $t9.1d, $h12k.1d // GHASH block 4k+3 - mid + movi $mod_constant.8b, #0xc2 + aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 7 + eor $acc_lb, $acc_lb, $t8.16b // GHASH block 4k+3 - low + aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 7 + shl $mod_constantd, $mod_constantd, #56 // mod_constant + aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 8 + eor $acc_mb, $acc_mb, $t9.16b // GHASH block 4k+3 - mid + aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 8 + b.lt .Ldec_main_loop_continue // branch if AES-128 + + aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 9 + aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 9 + aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 9 + aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 9 + aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 10 + aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 10 + aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 10 + aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 10 + b.eq .Ldec_main_loop_continue // branch if AES-192 + + aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 11 + aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 11 + aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 11 + aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 11 + aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 12 + aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 12 + aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 12 + aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 12 + +.Ldec_main_loop_continue: + pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d // MODULO - top 64b align with mid + eor $t9.16b, $acc_lb, $acc_hb // MODULO - karatsuba tidy up + ldr $res0q, [$input_ptr, #0] // AES block 4k+4 - load ciphertext + aese $ctr0b, $rkNm1 // AES block 4k+4 - round N-1 + ext $acc_hb, $acc_hb, $acc_hb, #8 // MODULO - other top alignment + eor $acc_mb, $acc_mb, $t9.16b // MODULO - karatsuba tidy up + ldr $res1q, [$input_ptr, #16] // AES block 4k+5 - load ciphertext + eor $ctr0b, $res0b, $ctr0b // AES block 4k+4 - result + stp $output_l3, $output_h3, [$output_ptr], #16 // AES block 4k+3 - store result + eor $acc_mb, $acc_mb, $mod_t.16b // MODULO - fold into mid + ldr $res3q, [$input_ptr, #48] // AES block 4k+7 - load ciphertext + ldr $res2q, [$input_ptr, #32] // AES block 4k+6 - load ciphertext + mov $output_h0, $ctr0.d[1] // AES block 4k+4 - mov high + eor $acc_mb, $acc_mb, $acc_hb // MODULO - fold into mid + aese $ctr1b, $rkNm1 // AES block 4k+5 - round N-1 + add $input_ptr, $input_ptr, #64 // AES input_ptr update + mov $output_l0, $ctr0.d[0] // AES block 4k+4 - mov low + fmov $ctr0d, $ctr96_b64x // CTR block 4k+8 + fmov $ctr0.d[1], $ctr32x // CTR block 4k+8 + pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d // MODULO - mid 64b align with low + eor $ctr1b, $res1b, $ctr1b // AES block 4k+5 - result + rev $ctr32w, $rctr32w // CTR block 4k+9 + aese $ctr2b, $rkNm1 // AES block 4k+6 - round N-1 + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 4k+9 + cmp $input_ptr, $main_end_input_ptr // LOOP 
CONTROL + add $rctr32w, $rctr32w, #1 // CTR block 4k+9 + eor $output_l0, $output_l0, $rkN_l // AES block 4k+4 - round N low + eor $output_h0, $output_h0, $rkN_h // AES block 4k+4 - round N high + mov $output_h1, $ctr1.d[1] // AES block 4k+5 - mov high + eor $ctr2b, $res2b, $ctr2b // AES block 4k+6 - result + eor $acc_lb, $acc_lb, $mod_constant.16b // MODULO - fold into low + mov $output_l1, $ctr1.d[0] // AES block 4k+5 - mov low + fmov $ctr1d, $ctr96_b64x // CTR block 4k+9 + ext $acc_mb, $acc_mb, $acc_mb, #8 // MODULO - other mid alignment + fmov $ctr1.d[1], $ctr32x // CTR block 4k+9 + rev $ctr32w, $rctr32w // CTR block 4k+10 + add $rctr32w, $rctr32w, #1 // CTR block 4k+10 + aese $ctr3b, $rkNm1 // AES block 4k+7 - round N-1 + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 4k+10 + rev64 $res1b, $res1b // GHASH block 4k+5 + eor $output_h1, $output_h1, $rkN_h // AES block 4k+5 - round N high + stp $output_l0, $output_h0, [$output_ptr], #16 // AES block 4k+4 - store result + eor $output_l1, $output_l1, $rkN_l // AES block 4k+5 - round N low + stp $output_l1, $output_h1, [$output_ptr], #16 // AES block 4k+5 - store result + rev64 $res0b, $res0b // GHASH block 4k+4 + eor $acc_lb, $acc_lb, $acc_mb // MODULO - fold into low + b.lt .Ldec_main_loop + +.Ldec_prepretail: // PREPRETAIL + ext $acc_lb, $acc_lb, $acc_lb, #8 // PRE 0 + mov $output_l2, $ctr2.d[0] // AES block 4k+2 - mov low + eor $ctr3b, $res3b, $ctr3b // AES block 4k+3 - result + aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 0 + mov $output_h2, $ctr2.d[1] // AES block 4k+2 - mov high + aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 0 + fmov $ctr2d, $ctr96_b64x // CTR block 4k+6 + fmov $ctr2.d[1], $ctr32x // CTR block 4k+6 + rev $ctr32w, $rctr32w // CTR block 4k+7 + eor $res0b, $res0b, $acc_lb // PRE 1 + rev64 $res2b, $res2b // GHASH block 4k+2 + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 4k+7 + mov $output_l3, $ctr3.d[0] // AES block 4k+3 - mov low + aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 1 + mov $output_h3, $ctr3.d[1] // AES block 4k+3 - mov high + pmull $acc_l.1q, $res0.1d, $h4.1d // GHASH block 4k - low + mov $t0d, $res0.d[1] // GHASH block 4k - mid + fmov $ctr3d, $ctr96_b64x // CTR block 4k+7 + pmull2 $acc_h.1q, $res0.2d, $h4.2d // GHASH block 4k - high + fmov $ctr3.d[1], $ctr32x // CTR block 4k+7 + aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 0 + mov $acc_md, $h34k.d[1] // GHASH block 4k - mid + aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 1 + eor $t0.8b, $t0.8b, $res0.8b // GHASH block 4k - mid + pmull2 $t1.1q, $res1.2d, $h3.2d // GHASH block 4k+1 - high + aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 1 + rev64 $res3b, $res3b // GHASH block 4k+3 + aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 0 + pmull $acc_m.1q, $t0.1d, $acc_m.1d // GHASH block 4k - mid + eor $acc_hb, $acc_hb, $t1.16b // GHASH block 4k+1 - high + pmull $t2.1q, $res1.1d, $h3.1d // GHASH block 4k+1 - low + aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 1 + mov $t3d, $res1.d[1] // GHASH block 4k+1 - mid + aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 2 + aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 2 + eor $acc_lb, $acc_lb, $t2.16b // GHASH block 4k+1 - low + aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 2 + aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 3 + mov $t6d, $res2.d[1] // 
GHASH block 4k+2 - mid + aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 2 + eor $t3.8b, $t3.8b, $res1.8b // GHASH block 4k+1 - mid + pmull $t5.1q, $res2.1d, $h2.1d // GHASH block 4k+2 - low + aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 4 + aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 3 + eor $t6.8b, $t6.8b, $res2.8b // GHASH block 4k+2 - mid + pmull $t3.1q, $t3.1d, $h34k.1d // GHASH block 4k+1 - mid + aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 5 + eor $acc_lb, $acc_lb, $t5.16b // GHASH block 4k+2 - low + aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 4 + pmull2 $t7.1q, $res3.2d, $h1.2d // GHASH block 4k+3 - high + eor $acc_mb, $acc_mb, $t3.16b // GHASH block 4k+1 - mid + pmull2 $t4.1q, $res2.2d, $h2.2d // GHASH block 4k+2 - high + aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 5 + ins $t6.d[1], $t6.d[0] // GHASH block 4k+2 - mid + aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 3 + aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 3 + eor $acc_hb, $acc_hb, $t4.16b // GHASH block 4k+2 - high + pmull $t8.1q, $res3.1d, $h1.1d // GHASH block 4k+3 - low + aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 4 + mov $t9d, $res3.d[1] // GHASH block 4k+3 - mid + aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 4 + pmull2 $t6.1q, $t6.2d, $h12k.2d // GHASH block 4k+2 - mid + aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 5 + eor $t9.8b, $t9.8b, $res3.8b // GHASH block 4k+3 - mid + aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 5 + aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 6 + eor $acc_mb, $acc_mb, $t6.16b // GHASH block 4k+2 - mid + aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 6 + aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 6 + movi $mod_constant.8b, #0xc2 + aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 6 + eor $acc_lb, $acc_lb, $t8.16b // GHASH block 4k+3 - low + pmull $t9.1q, $t9.1d, $h12k.1d // GHASH block 4k+3 - mid + aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 7 + cmp $rounds, #12 // setup flags for AES-128/192/256 check + eor $acc_hb, $acc_hb, $t7.16b // GHASH block 4k+3 - high + aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 7 + aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 7 + eor $acc_mb, $acc_mb, $t9.16b // GHASH block 4k+3 - mid + aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 8 + aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 7 + eor $t9.16b, $acc_lb, $acc_hb // MODULO - karatsuba tidy up + aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 8 + aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 8 + shl $mod_constantd, $mod_constantd, #56 // mod_constant + aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 8 + b.lt .Ldec_finish_prepretail // branch if AES-128 + + aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 9 + aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 9 + aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 9 + aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 9 + aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 10 + aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 10 + 
aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 10 + aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 10 + b.eq .Ldec_finish_prepretail // branch if AES-192 + + aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 11 + aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 11 + aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 11 + aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 12 + aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 11 + aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 12 + aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 12 + aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 12 + +.Ldec_finish_prepretail: + eor $acc_mb, $acc_mb, $t9.16b // MODULO - karatsuba tidy up + pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d // MODULO - top 64b align with mid + ext $acc_hb, $acc_hb, $acc_hb, #8 // MODULO - other top alignment + eor $acc_mb, $acc_mb, $mod_t.16b // MODULO - fold into mid + eor $output_h2, $output_h2, $rkN_h // AES block 4k+2 - round N high + eor $output_l3, $output_l3, $rkN_l // AES block 4k+3 - round N low + eor $acc_mb, $acc_mb, $acc_hb // MODULO - fold into mid + add $rctr32w, $rctr32w, #1 // CTR block 4k+7 + eor $output_l2, $output_l2, $rkN_l // AES block 4k+2 - round N low + pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d // MODULO - mid 64b align with low + eor $output_h3, $output_h3, $rkN_h // AES block 4k+3 - round N high + stp $output_l2, $output_h2, [$output_ptr], #16 // AES block 4k+2 - store result + ext $acc_mb, $acc_mb, $acc_mb, #8 // MODULO - other mid alignment + stp $output_l3, $output_h3, [$output_ptr], #16 // AES block 4k+3 - store result + + eor $acc_lb, $acc_lb, $mod_constant.16b // MODULO - fold into low + aese $ctr1b, $rkNm1 // AES block 4k+5 - round N-1 + aese $ctr0b, $rkNm1 // AES block 4k+4 - round N-1 + aese $ctr3b, $rkNm1 // AES block 4k+7 - round N-1 + aese $ctr2b, $rkNm1 // AES block 4k+6 - round N-1 + eor $acc_lb, $acc_lb, $acc_mb // MODULO - fold into low + +.Ldec_tail: // TAIL + sub $main_end_input_ptr, $end_input_ptr, $input_ptr // main_end_input_ptr is number of bytes left to process + ld1 { $res1b}, [$input_ptr], #16 // AES block 4k+4 - load ciphertext + eor $ctr0b, $res1b, $ctr0b // AES block 4k+4 - result + mov $output_l0, $ctr0.d[0] // AES block 4k+4 - mov low + mov $output_h0, $ctr0.d[1] // AES block 4k+4 - mov high + ext $t0.16b, $acc_lb, $acc_lb, #8 // prepare final partial tag + cmp $main_end_input_ptr, #48 + eor $output_l0, $output_l0, $rkN_l // AES block 4k+4 - round N low + eor $output_h0, $output_h0, $rkN_h // AES block 4k+4 - round N high + b.gt .Ldec_blocks_more_than_3 + sub $rctr32w, $rctr32w, #1 + mov $ctr3b, $ctr2b + movi $acc_m.8b, #0 + movi $acc_l.8b, #0 + cmp $main_end_input_ptr, #32 + movi $acc_h.8b, #0 + mov $ctr2b, $ctr1b + b.gt .Ldec_blocks_more_than_2 + sub $rctr32w, $rctr32w, #1 + mov $ctr3b, $ctr1b + cmp $main_end_input_ptr, #16 + b.gt .Ldec_blocks_more_than_1 + sub $rctr32w, $rctr32w, #1 + b .Ldec_blocks_less_than_1 +.Ldec_blocks_more_than_3: // blocks left > 3 + rev64 $res0b, $res1b // GHASH final-3 block + ld1 { $res1b}, [$input_ptr], #16 // AES final-2 block - load ciphertext + stp $output_l0, $output_h0, [$output_ptr], #16 // AES final-3 block - store result + mov $acc_md, $h34k.d[1] // GHASH final-3 block - mid + eor $res0b, $res0b, $t0.16b // feed in partial tag + eor $ctr0b, $res1b, $ctr1b // AES final-2 block - 
result + mov $rk4d, $res0.d[1] // GHASH final-3 block - mid + mov $output_l0, $ctr0.d[0] // AES final-2 block - mov low + mov $output_h0, $ctr0.d[1] // AES final-2 block - mov high + eor $rk4v.8b, $rk4v.8b, $res0.8b // GHASH final-3 block - mid + movi $t0.8b, #0 // suppress further partial tag feed in + pmull2 $acc_h.1q, $res0.2d, $h4.2d // GHASH final-3 block - high + pmull $acc_m.1q, $rk4v.1d, $acc_m.1d // GHASH final-3 block - mid + eor $output_l0, $output_l0, $rkN_l // AES final-2 block - round N low + pmull $acc_l.1q, $res0.1d, $h4.1d // GHASH final-3 block - low + eor $output_h0, $output_h0, $rkN_h // AES final-2 block - round N high +.Ldec_blocks_more_than_2: // blocks left > 2 + rev64 $res0b, $res1b // GHASH final-2 block + ld1 { $res1b}, [$input_ptr], #16 // AES final-1 block - load ciphertext + eor $res0b, $res0b, $t0.16b // feed in partial tag + stp $output_l0, $output_h0, [$output_ptr], #16 // AES final-2 block - store result + eor $ctr0b, $res1b, $ctr2b // AES final-1 block - result + mov $rk4d, $res0.d[1] // GHASH final-2 block - mid + pmull $rk3q1, $res0.1d, $h3.1d // GHASH final-2 block - low + pmull2 $rk2q1, $res0.2d, $h3.2d // GHASH final-2 block - high + eor $rk4v.8b, $rk4v.8b, $res0.8b // GHASH final-2 block - mid + mov $output_l0, $ctr0.d[0] // AES final-1 block - mov low + mov $output_h0, $ctr0.d[1] // AES final-1 block - mov high + eor $acc_lb, $acc_lb, $rk3 // GHASH final-2 block - low + movi $t0.8b, #0 // suppress further partial tag feed in + pmull $rk4v.1q, $rk4v.1d, $h34k.1d // GHASH final-2 block - mid + eor $acc_hb, $acc_hb, $rk2 // GHASH final-2 block - high + eor $output_l0, $output_l0, $rkN_l // AES final-1 block - round N low + eor $acc_mb, $acc_mb, $rk4v.16b // GHASH final-2 block - mid + eor $output_h0, $output_h0, $rkN_h // AES final-1 block - round N high +.Ldec_blocks_more_than_1: // blocks left > 1 + stp $output_l0, $output_h0, [$output_ptr], #16 // AES final-1 block - store result + rev64 $res0b, $res1b // GHASH final-1 block + ld1 { $res1b}, [$input_ptr], #16 // AES final block - load ciphertext + eor $res0b, $res0b, $t0.16b // feed in partial tag + movi $t0.8b, #0 // suppress further partial tag feed in + mov $rk4d, $res0.d[1] // GHASH final-1 block - mid + eor $ctr0b, $res1b, $ctr3b // AES final block - result + pmull2 $rk2q1, $res0.2d, $h2.2d // GHASH final-1 block - high + eor $rk4v.8b, $rk4v.8b, $res0.8b // GHASH final-1 block - mid + pmull $rk3q1, $res0.1d, $h2.1d // GHASH final-1 block - low + mov $output_l0, $ctr0.d[0] // AES final block - mov low + ins $rk4v.d[1], $rk4v.d[0] // GHASH final-1 block - mid + mov $output_h0, $ctr0.d[1] // AES final block - mov high + pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d // GHASH final-1 block - mid + eor $output_l0, $output_l0, $rkN_l // AES final block - round N low + eor $acc_lb, $acc_lb, $rk3 // GHASH final-1 block - low + eor $acc_hb, $acc_hb, $rk2 // GHASH final-1 block - high + eor $acc_mb, $acc_mb, $rk4v.16b // GHASH final-1 block - mid + eor $output_h0, $output_h0, $rkN_h // AES final block - round N high +.Ldec_blocks_less_than_1: // blocks left <= 1 + and $bit_length, $bit_length, #127 // bit_length %= 128 + mvn $rkN_h, xzr // rkN_h = 0xffffffffffffffff + sub $bit_length, $bit_length, #128 // bit_length -= 128 + mvn $rkN_l, xzr // rkN_l = 0xffffffffffffffff + ldp $end_input_ptr, $main_end_input_ptr, [$output_ptr] // load existing bytes we need to not overwrite + neg $bit_length, $bit_length // bit_length = 128 - #bits in input (in range [1,128]) + and $bit_length, $bit_length, #127 // bit_length %= 
128 + lsr $rkN_h, $rkN_h, $bit_length // rkN_h is mask for top 64b of last block + cmp $bit_length, #64 + csel $ctr32x, $rkN_l, $rkN_h, lt + csel $ctr96_b64x, $rkN_h, xzr, lt + fmov $ctr0d, $ctr32x // ctr0b is mask for last block + and $output_l0, $output_l0, $ctr32x + mov $ctr0.d[1], $ctr96_b64x + bic $end_input_ptr, $end_input_ptr, $ctr32x // mask out low existing bytes + rev $ctr32w, $rctr32w + bic $main_end_input_ptr, $main_end_input_ptr, $ctr96_b64x // mask out high existing bytes + orr $output_l0, $output_l0, $end_input_ptr + and $output_h0, $output_h0, $ctr96_b64x + orr $output_h0, $output_h0, $main_end_input_ptr + and $res1b, $res1b, $ctr0b // possibly partial last block has zeroes in highest bits + rev64 $res0b, $res1b // GHASH final block + eor $res0b, $res0b, $t0.16b // feed in partial tag + pmull $rk3q1, $res0.1d, $h1.1d // GHASH final block - low + mov $t0d, $res0.d[1] // GHASH final block - mid + eor $t0.8b, $t0.8b, $res0.8b // GHASH final block - mid + pmull2 $rk2q1, $res0.2d, $h1.2d // GHASH final block - high + pmull $t0.1q, $t0.1d, $h12k.1d // GHASH final block - mid + eor $acc_hb, $acc_hb, $rk2 // GHASH final block - high + eor $acc_lb, $acc_lb, $rk3 // GHASH final block - low + eor $acc_mb, $acc_mb, $t0.16b // GHASH final block - mid + movi $mod_constant.8b, #0xc2 + eor $t9.16b, $acc_lb, $acc_hb // MODULO - karatsuba tidy up + shl $mod_constantd, $mod_constantd, #56 // mod_constant + eor $acc_mb, $acc_mb, $t9.16b // MODULO - karatsuba tidy up + pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d // MODULO - top 64b align with mid + ext $acc_hb, $acc_hb, $acc_hb, #8 // MODULO - other top alignment + eor $acc_mb, $acc_mb, $mod_t.16b // MODULO - fold into mid + eor $acc_mb, $acc_mb, $acc_hb // MODULO - fold into mid + pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d // MODULO - mid 64b align with low + ext $acc_mb, $acc_mb, $acc_mb, #8 // MODULO - other mid alignment + eor $acc_lb, $acc_lb, $mod_constant.16b // MODULO - fold into low + stp $output_l0, $output_h0, [$output_ptr] + str $ctr32w, [$counter, #12] // store the updated counter + eor $acc_lb, $acc_lb, $acc_mb // MODULO - fold into low + ext $acc_lb, $acc_lb, $acc_lb, #8 + rev64 $acc_lb, $acc_lb + mov x0, $len + st1 { $acc_l.16b }, [$current_tag] + ldp x19, x20, [sp, #16] + ldp x21, x22, [sp, #32] + ldp x23, x24, [sp, #48] + ldp d8, d9, [sp, #64] + ldp d10, d11, [sp, #80] + ldp d12, d13, [sp, #96] + ldp d14, d15, [sp, #112] + ldp x29, x30, [sp], #128 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size aes_gcm_dec_kernel,.-aes_gcm_dec_kernel +___ +} +} + +$code.=<<___; +#endif +___ + +print $code; +close STDOUT or die "error closing STDOUT: $!"; # enforce flush diff --git a/ring-0.17.14/crypto/fipsmodule/aes/asm/bsaes-armv7.pl b/ring-0.17.14/crypto/fipsmodule/aes/asm/bsaes-armv7.pl new file mode 100644 index 0000000000..a7518f6e63 --- /dev/null +++ b/ring-0.17.14/crypto/fipsmodule/aes/asm/bsaes-armv7.pl @@ -0,0 +1,1144 @@ +#! /usr/bin/env perl +# Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + + +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. +# +# Specific modes and adaptation for Linux kernel by Ard Biesheuvel +# of Linaro. +# ==================================================================== + +# Bit-sliced AES for ARM NEON +# +# February 2012. +# +# This implementation is direct adaptation of bsaes-x86_64 module for +# ARM NEON. Except that this module is endian-neutral [in sense that +# it can be compiled for either endianness] by courtesy of vld1.8's +# neutrality. Initial version doesn't implement interface to OpenSSL, +# only low-level primitives and unsupported entry points, just enough +# to collect performance results, which for Cortex-A8 core are: +# +# encrypt 19.5 cycles per byte processed with 128-bit key +# decrypt 22.1 cycles per byte processed with 128-bit key +# key conv. 440 cycles per 128-bit key/0.18 of 8x block +# +# Snapdragon S4 encrypts byte in 17.6 cycles and decrypts in 19.7, +# which is [much] worse than anticipated (for further details see +# http://www.openssl.org/~appro/Snapdragon-S4.html). +# +# Cortex-A15 manages in 14.2/16.1 cycles [when integer-only code +# manages in 20.0 cycles]. +# +# When comparing to x86_64 results keep in mind that NEON unit is +# [mostly] single-issue and thus can't [fully] benefit from +# instruction-level parallelism. And when comparing to aes-armv4 +# results keep in mind key schedule conversion overhead (see +# bsaes-x86_64.pl for further details)... +# +# + +# April-August 2013 +# Add CBC, CTR and XTS subroutines and adapt for kernel use; courtesy of Ard. + +$flavour = shift; +if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; } +else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} } + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; + *STDOUT=*OUT; +} else { + open OUT,">$output"; + *STDOUT=*OUT; +} + +my ($inp,$out,$len,$key)=("r0","r1","r2","r3"); +my @XMM=map("q$_",(0..15)); + +{ +my ($key,$rounds,$const)=("r4","r5","r6"); + +sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; } +sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; } + +sub Sbox { +# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb +# output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb +my @b=@_[0..7]; +my @t=@_[8..11]; +my @s=@_[12..15]; + &InBasisChange (@b); + &Inv_GF256 (@b[6,5,0,3,7,1,4,2],@t,@s); + &OutBasisChange (@b[7,1,4,2,6,5,0,3]); +} + +sub InBasisChange { +# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb +# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb +my @b=@_[0..7]; +$code.=<<___; + veor @b[2], @b[2], @b[1] + veor @b[5], @b[5], @b[6] + veor @b[3], @b[3], @b[0] + veor @b[6], @b[6], @b[2] + veor @b[5], @b[5], @b[0] + + veor @b[6], @b[6], @b[3] + veor @b[3], @b[3], @b[7] + veor @b[7], @b[7], @b[5] + veor @b[3], @b[3], @b[4] + veor @b[4], @b[4], @b[5] + + veor @b[2], @b[2], @b[7] + veor @b[3], @b[3], @b[1] + veor @b[1], @b[1], @b[5] +___ +} + +sub OutBasisChange { +# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb +# output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb +my @b=@_[0..7]; +$code.=<<___; + veor @b[0], @b[0], @b[6] + veor @b[1], @b[1], 
@b[4] + veor @b[4], @b[4], @b[6] + veor @b[2], @b[2], @b[0] + veor @b[6], @b[6], @b[1] + + veor @b[1], @b[1], @b[5] + veor @b[5], @b[5], @b[3] + veor @b[3], @b[3], @b[7] + veor @b[7], @b[7], @b[5] + veor @b[2], @b[2], @b[5] + + veor @b[4], @b[4], @b[7] +___ +} + +sub InvSbox { +# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb +# output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb +my @b=@_[0..7]; +my @t=@_[8..11]; +my @s=@_[12..15]; + &InvInBasisChange (@b); + &Inv_GF256 (@b[5,1,2,6,3,7,0,4],@t,@s); + &InvOutBasisChange (@b[3,7,0,4,5,1,2,6]); +} + +sub InvInBasisChange { # OutBasisChange in reverse (with twist) +my @b=@_[5,1,2,6,3,7,0,4]; +$code.=<<___ + veor @b[1], @b[1], @b[7] + veor @b[4], @b[4], @b[7] + + veor @b[7], @b[7], @b[5] + veor @b[1], @b[1], @b[3] + veor @b[2], @b[2], @b[5] + veor @b[3], @b[3], @b[7] + + veor @b[6], @b[6], @b[1] + veor @b[2], @b[2], @b[0] + veor @b[5], @b[5], @b[3] + veor @b[4], @b[4], @b[6] + veor @b[0], @b[0], @b[6] + veor @b[1], @b[1], @b[4] +___ +} + +sub InvOutBasisChange { # InBasisChange in reverse +my @b=@_[2,5,7,3,6,1,0,4]; +$code.=<<___; + veor @b[1], @b[1], @b[5] + veor @b[2], @b[2], @b[7] + + veor @b[3], @b[3], @b[1] + veor @b[4], @b[4], @b[5] + veor @b[7], @b[7], @b[5] + veor @b[3], @b[3], @b[4] + veor @b[5], @b[5], @b[0] + veor @b[3], @b[3], @b[7] + veor @b[6], @b[6], @b[2] + veor @b[2], @b[2], @b[1] + veor @b[6], @b[6], @b[3] + + veor @b[3], @b[3], @b[0] + veor @b[5], @b[5], @b[6] +___ +} + +sub Mul_GF4 { +#;************************************************************* +#;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) * +#;************************************************************* +my ($x0,$x1,$y0,$y1,$t0,$t1)=@_; +$code.=<<___; + veor $t0, $y0, $y1 + vand $t0, $t0, $x0 + veor $x0, $x0, $x1 + vand $t1, $x1, $y0 + vand $x0, $x0, $y1 + veor $x1, $t1, $t0 + veor $x0, $x0, $t1 +___ +} + +sub Mul_GF4_N { # not used, see next subroutine +# multiply and scale by N +my ($x0,$x1,$y0,$y1,$t0)=@_; +$code.=<<___; + veor $t0, $y0, $y1 + vand $t0, $t0, $x0 + veor $x0, $x0, $x1 + vand $x1, $x1, $y0 + vand $x0, $x0, $y1 + veor $x1, $x1, $x0 + veor $x0, $x0, $t0 +___ +} + +sub Mul_GF4_N_GF4 { +# interleaved Mul_GF4_N and Mul_GF4 +my ($x0,$x1,$y0,$y1,$t0, + $x2,$x3,$y2,$y3,$t1)=@_; +$code.=<<___; + veor $t0, $y0, $y1 + veor $t1, $y2, $y3 + vand $t0, $t0, $x0 + vand $t1, $t1, $x2 + veor $x0, $x0, $x1 + veor $x2, $x2, $x3 + vand $x1, $x1, $y0 + vand $x3, $x3, $y2 + vand $x0, $x0, $y1 + vand $x2, $x2, $y3 + veor $x1, $x1, $x0 + veor $x2, $x2, $x3 + veor $x0, $x0, $t0 + veor $x3, $x3, $t1 +___ +} +sub Mul_GF16_2 { +my @x=@_[0..7]; +my @y=@_[8..11]; +my @t=@_[12..15]; +$code.=<<___; + veor @t[0], @x[0], @x[2] + veor @t[1], @x[1], @x[3] +___ + &Mul_GF4 (@x[0], @x[1], @y[0], @y[1], @t[2..3]); +$code.=<<___; + veor @y[0], @y[0], @y[2] + veor @y[1], @y[1], @y[3] +___ + Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3], + @x[2], @x[3], @y[2], @y[3], @t[2]); +$code.=<<___; + veor @x[0], @x[0], @t[0] + veor @x[2], @x[2], @t[0] + veor @x[1], @x[1], @t[1] + veor @x[3], @x[3], @t[1] + + veor @t[0], @x[4], @x[6] + veor @t[1], @x[5], @x[7] +___ + &Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3], + @x[6], @x[7], @y[2], @y[3], @t[2]); +$code.=<<___; + veor @y[0], @y[0], @y[2] + veor @y[1], @y[1], @y[3] +___ + &Mul_GF4 (@x[4], @x[5], @y[0], @y[1], @t[2..3]); +$code.=<<___; + veor @x[4], @x[4], @t[0] + veor @x[6], @x[6], @t[0] + veor @x[5], @x[5], @t[1] + veor @x[7], @x[7], @t[1] +___ +} +sub Inv_GF256 { 
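+# Note: this routine inverts every byte lane of the eight bit-sliced states
+# at once; GF(2^8) is handled through its tower-field view over GF(2^4) and
+# GF(2^2), which is why it is assembled from the Mul_GF4/Mul_GF16_2 helpers
+# above rather than from a lookup table.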
+#;******************************************************************** +#;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144) * +#;******************************************************************** +my @x=@_[0..7]; +my @t=@_[8..11]; +my @s=@_[12..15]; +# direct optimizations from hardware +$code.=<<___; + veor @t[3], @x[4], @x[6] + veor @t[2], @x[5], @x[7] + veor @t[1], @x[1], @x[3] + veor @s[1], @x[7], @x[6] + vmov @t[0], @t[2] + veor @s[0], @x[0], @x[2] + + vorr @t[2], @t[2], @t[1] + veor @s[3], @t[3], @t[0] + vand @s[2], @t[3], @s[0] + vorr @t[3], @t[3], @s[0] + veor @s[0], @s[0], @t[1] + vand @t[0], @t[0], @t[1] + veor @t[1], @x[3], @x[2] + vand @s[3], @s[3], @s[0] + vand @s[1], @s[1], @t[1] + veor @t[1], @x[4], @x[5] + veor @s[0], @x[1], @x[0] + veor @t[3], @t[3], @s[1] + veor @t[2], @t[2], @s[1] + vand @s[1], @t[1], @s[0] + vorr @t[1], @t[1], @s[0] + veor @t[3], @t[3], @s[3] + veor @t[0], @t[0], @s[1] + veor @t[2], @t[2], @s[2] + veor @t[1], @t[1], @s[3] + veor @t[0], @t[0], @s[2] + vand @s[0], @x[7], @x[3] + veor @t[1], @t[1], @s[2] + vand @s[1], @x[6], @x[2] + vand @s[2], @x[5], @x[1] + vorr @s[3], @x[4], @x[0] + veor @t[3], @t[3], @s[0] + veor @t[1], @t[1], @s[2] + veor @t[0], @t[0], @s[3] + veor @t[2], @t[2], @s[1] + + @ Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3 + + @ new smaller inversion + + vand @s[2], @t[3], @t[1] + vmov @s[0], @t[0] + + veor @s[1], @t[2], @s[2] + veor @s[3], @t[0], @s[2] + veor @s[2], @t[0], @s[2] @ @s[2]=@s[3] + + vbsl @s[1], @t[1], @t[0] + vbsl @s[3], @t[3], @t[2] + veor @t[3], @t[3], @t[2] + + vbsl @s[0], @s[1], @s[2] + vbsl @t[0], @s[2], @s[1] + + vand @s[2], @s[0], @s[3] + veor @t[1], @t[1], @t[0] + + veor @s[2], @s[2], @t[3] +___ +# output in s3, s2, s1, t1 + +# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3 + +# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3 + &Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]); + +### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb +} + +# AES linear components + +sub ShiftRows { +my @x=@_[0..7]; +my @t=@_[8..11]; +my $mask=pop; +$code.=<<___; + vldmia $key!, {@t[0]-@t[3]} + veor @t[0], @t[0], @x[0] + veor @t[1], @t[1], @x[1] + vtbl.8 `&Dlo(@x[0])`, {@t[0]}, `&Dlo($mask)` + vtbl.8 `&Dhi(@x[0])`, {@t[0]}, `&Dhi($mask)` + vldmia $key!, {@t[0]} + veor @t[2], @t[2], @x[2] + vtbl.8 `&Dlo(@x[1])`, {@t[1]}, `&Dlo($mask)` + vtbl.8 `&Dhi(@x[1])`, {@t[1]}, `&Dhi($mask)` + vldmia $key!, {@t[1]} + veor @t[3], @t[3], @x[3] + vtbl.8 `&Dlo(@x[2])`, {@t[2]}, `&Dlo($mask)` + vtbl.8 `&Dhi(@x[2])`, {@t[2]}, `&Dhi($mask)` + vldmia $key!, {@t[2]} + vtbl.8 `&Dlo(@x[3])`, {@t[3]}, `&Dlo($mask)` + vtbl.8 `&Dhi(@x[3])`, {@t[3]}, `&Dhi($mask)` + vldmia $key!, {@t[3]} + veor @t[0], @t[0], @x[4] + veor @t[1], @t[1], @x[5] + vtbl.8 `&Dlo(@x[4])`, {@t[0]}, `&Dlo($mask)` + vtbl.8 `&Dhi(@x[4])`, {@t[0]}, `&Dhi($mask)` + veor @t[2], @t[2], @x[6] + vtbl.8 `&Dlo(@x[5])`, {@t[1]}, `&Dlo($mask)` + vtbl.8 `&Dhi(@x[5])`, {@t[1]}, `&Dhi($mask)` + veor @t[3], @t[3], @x[7] + vtbl.8 `&Dlo(@x[6])`, {@t[2]}, `&Dlo($mask)` + vtbl.8 `&Dhi(@x[6])`, {@t[2]}, `&Dhi($mask)` + vtbl.8 `&Dlo(@x[7])`, {@t[3]}, `&Dlo($mask)` + vtbl.8 `&Dhi(@x[7])`, {@t[3]}, `&Dhi($mask)` +___ +} + +sub MixColumns { +# modified to emit output in order suitable for feeding back to aesenc[last] +my @x=@_[0..7]; +my @t=@_[8..15]; +my $inv=@_[16]; # optional +$code.=<<___; + vext.8 @t[0], @x[0], @x[0], #12 @ x0 <<< 32 + vext.8 @t[1], @x[1], @x[1], #12 + veor @x[0], @x[0], @t[0] @ x0 ^ (x0 <<< 32) + 
vext.8 @t[2], @x[2], @x[2], #12 + veor @x[1], @x[1], @t[1] + vext.8 @t[3], @x[3], @x[3], #12 + veor @x[2], @x[2], @t[2] + vext.8 @t[4], @x[4], @x[4], #12 + veor @x[3], @x[3], @t[3] + vext.8 @t[5], @x[5], @x[5], #12 + veor @x[4], @x[4], @t[4] + vext.8 @t[6], @x[6], @x[6], #12 + veor @x[5], @x[5], @t[5] + vext.8 @t[7], @x[7], @x[7], #12 + veor @x[6], @x[6], @t[6] + + veor @t[1], @t[1], @x[0] + veor @x[7], @x[7], @t[7] + vext.8 @x[0], @x[0], @x[0], #8 @ (x0 ^ (x0 <<< 32)) <<< 64) + veor @t[2], @t[2], @x[1] + veor @t[0], @t[0], @x[7] + veor @t[1], @t[1], @x[7] + vext.8 @x[1], @x[1], @x[1], #8 + veor @t[5], @t[5], @x[4] + veor @x[0], @x[0], @t[0] + veor @t[6], @t[6], @x[5] + veor @x[1], @x[1], @t[1] + vext.8 @t[0], @x[4], @x[4], #8 + veor @t[4], @t[4], @x[3] + vext.8 @t[1], @x[5], @x[5], #8 + veor @t[7], @t[7], @x[6] + vext.8 @x[4], @x[3], @x[3], #8 + veor @t[3], @t[3], @x[2] + vext.8 @x[5], @x[7], @x[7], #8 + veor @t[4], @t[4], @x[7] + vext.8 @x[3], @x[6], @x[6], #8 + veor @t[3], @t[3], @x[7] + vext.8 @x[6], @x[2], @x[2], #8 + veor @x[7], @t[1], @t[5] +___ +$code.=<<___ if (!$inv); + veor @x[2], @t[0], @t[4] + veor @x[4], @x[4], @t[3] + veor @x[5], @x[5], @t[7] + veor @x[3], @x[3], @t[6] + @ vmov @x[2], @t[0] + veor @x[6], @x[6], @t[2] + @ vmov @x[7], @t[1] +___ +$code.=<<___ if ($inv); + veor @t[3], @t[3], @x[4] + veor @x[5], @x[5], @t[7] + veor @x[2], @x[3], @t[6] + veor @x[3], @t[0], @t[4] + veor @x[4], @x[6], @t[2] + vmov @x[6], @t[3] + @ vmov @x[7], @t[1] +___ +} + +sub InvMixColumns_orig { +my @x=@_[0..7]; +my @t=@_[8..15]; + +$code.=<<___; + @ multiplication by 0x0e + vext.8 @t[7], @x[7], @x[7], #12 + vmov @t[2], @x[2] + veor @x[2], @x[2], @x[5] @ 2 5 + veor @x[7], @x[7], @x[5] @ 7 5 + vext.8 @t[0], @x[0], @x[0], #12 + vmov @t[5], @x[5] + veor @x[5], @x[5], @x[0] @ 5 0 [1] + veor @x[0], @x[0], @x[1] @ 0 1 + vext.8 @t[1], @x[1], @x[1], #12 + veor @x[1], @x[1], @x[2] @ 1 25 + veor @x[0], @x[0], @x[6] @ 01 6 [2] + vext.8 @t[3], @x[3], @x[3], #12 + veor @x[1], @x[1], @x[3] @ 125 3 [4] + veor @x[2], @x[2], @x[0] @ 25 016 [3] + veor @x[3], @x[3], @x[7] @ 3 75 + veor @x[7], @x[7], @x[6] @ 75 6 [0] + vext.8 @t[6], @x[6], @x[6], #12 + vmov @t[4], @x[4] + veor @x[6], @x[6], @x[4] @ 6 4 + veor @x[4], @x[4], @x[3] @ 4 375 [6] + veor @x[3], @x[3], @x[7] @ 375 756=36 + veor @x[6], @x[6], @t[5] @ 64 5 [7] + veor @x[3], @x[3], @t[2] @ 36 2 + vext.8 @t[5], @t[5], @t[5], #12 + veor @x[3], @x[3], @t[4] @ 362 4 [5] +___ + my @y = @x[7,5,0,2,1,3,4,6]; +$code.=<<___; + @ multiplication by 0x0b + veor @y[1], @y[1], @y[0] + veor @y[0], @y[0], @t[0] + vext.8 @t[2], @t[2], @t[2], #12 + veor @y[1], @y[1], @t[1] + veor @y[0], @y[0], @t[5] + vext.8 @t[4], @t[4], @t[4], #12 + veor @y[1], @y[1], @t[6] + veor @y[0], @y[0], @t[7] + veor @t[7], @t[7], @t[6] @ clobber t[7] + + veor @y[3], @y[3], @t[0] + veor @y[1], @y[1], @y[0] + vext.8 @t[0], @t[0], @t[0], #12 + veor @y[2], @y[2], @t[1] + veor @y[4], @y[4], @t[1] + vext.8 @t[1], @t[1], @t[1], #12 + veor @y[2], @y[2], @t[2] + veor @y[3], @y[3], @t[2] + veor @y[5], @y[5], @t[2] + veor @y[2], @y[2], @t[7] + vext.8 @t[2], @t[2], @t[2], #12 + veor @y[3], @y[3], @t[3] + veor @y[6], @y[6], @t[3] + veor @y[4], @y[4], @t[3] + veor @y[7], @y[7], @t[4] + vext.8 @t[3], @t[3], @t[3], #12 + veor @y[5], @y[5], @t[4] + veor @y[7], @y[7], @t[7] + veor @t[7], @t[7], @t[5] @ clobber t[7] even more + veor @y[3], @y[3], @t[5] + veor @y[4], @y[4], @t[4] + + veor @y[5], @y[5], @t[7] + vext.8 @t[4], @t[4], @t[4], #12 + veor @y[6], @y[6], @t[7] + veor @y[4], @y[4], @t[7] + + veor @t[7], @t[7], 
@t[5] + vext.8 @t[5], @t[5], @t[5], #12 + + @ multiplication by 0x0d + veor @y[4], @y[4], @y[7] + veor @t[7], @t[7], @t[6] @ restore t[7] + veor @y[7], @y[7], @t[4] + vext.8 @t[6], @t[6], @t[6], #12 + veor @y[2], @y[2], @t[0] + veor @y[7], @y[7], @t[5] + vext.8 @t[7], @t[7], @t[7], #12 + veor @y[2], @y[2], @t[2] + + veor @y[3], @y[3], @y[1] + veor @y[1], @y[1], @t[1] + veor @y[0], @y[0], @t[0] + veor @y[3], @y[3], @t[0] + veor @y[1], @y[1], @t[5] + veor @y[0], @y[0], @t[5] + vext.8 @t[0], @t[0], @t[0], #12 + veor @y[1], @y[1], @t[7] + veor @y[0], @y[0], @t[6] + veor @y[3], @y[3], @y[1] + veor @y[4], @y[4], @t[1] + vext.8 @t[1], @t[1], @t[1], #12 + + veor @y[7], @y[7], @t[7] + veor @y[4], @y[4], @t[2] + veor @y[5], @y[5], @t[2] + veor @y[2], @y[2], @t[6] + veor @t[6], @t[6], @t[3] @ clobber t[6] + vext.8 @t[2], @t[2], @t[2], #12 + veor @y[4], @y[4], @y[7] + veor @y[3], @y[3], @t[6] + + veor @y[6], @y[6], @t[6] + veor @y[5], @y[5], @t[5] + vext.8 @t[5], @t[5], @t[5], #12 + veor @y[6], @y[6], @t[4] + vext.8 @t[4], @t[4], @t[4], #12 + veor @y[5], @y[5], @t[6] + veor @y[6], @y[6], @t[7] + vext.8 @t[7], @t[7], @t[7], #12 + veor @t[6], @t[6], @t[3] @ restore t[6] + vext.8 @t[3], @t[3], @t[3], #12 + + @ multiplication by 0x09 + veor @y[4], @y[4], @y[1] + veor @t[1], @t[1], @y[1] @ t[1]=y[1] + veor @t[0], @t[0], @t[5] @ clobber t[0] + vext.8 @t[6], @t[6], @t[6], #12 + veor @t[1], @t[1], @t[5] + veor @y[3], @y[3], @t[0] + veor @t[0], @t[0], @y[0] @ t[0]=y[0] + veor @t[1], @t[1], @t[6] + veor @t[6], @t[6], @t[7] @ clobber t[6] + veor @y[4], @y[4], @t[1] + veor @y[7], @y[7], @t[4] + veor @y[6], @y[6], @t[3] + veor @y[5], @y[5], @t[2] + veor @t[4], @t[4], @y[4] @ t[4]=y[4] + veor @t[3], @t[3], @y[3] @ t[3]=y[3] + veor @t[5], @t[5], @y[5] @ t[5]=y[5] + veor @t[2], @t[2], @y[2] @ t[2]=y[2] + veor @t[3], @t[3], @t[7] + veor @XMM[5], @t[5], @t[6] + veor @XMM[6], @t[6], @y[6] @ t[6]=y[6] + veor @XMM[2], @t[2], @t[6] + veor @XMM[7], @t[7], @y[7] @ t[7]=y[7] + + vmov @XMM[0], @t[0] + vmov @XMM[1], @t[1] + @ vmov @XMM[2], @t[2] + vmov @XMM[3], @t[3] + vmov @XMM[4], @t[4] + @ vmov @XMM[5], @t[5] + @ vmov @XMM[6], @t[6] + @ vmov @XMM[7], @t[7] +___ +} + +sub InvMixColumns { +my @x=@_[0..7]; +my @t=@_[8..15]; + +# Thanks to Jussi Kivilinna for providing pointer to +# +# | 0e 0b 0d 09 | | 02 03 01 01 | | 05 00 04 00 | +# | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 | +# | 0d 09 0e 0b | | 01 01 02 03 | | 04 00 05 00 | +# | 0b 0d 09 0e | | 03 01 01 02 | | 00 04 00 05 | + +$code.=<<___; + @ multiplication by 0x05-0x00-0x04-0x00 + vext.8 @t[0], @x[0], @x[0], #8 + vext.8 @t[6], @x[6], @x[6], #8 + vext.8 @t[7], @x[7], @x[7], #8 + veor @t[0], @t[0], @x[0] + vext.8 @t[1], @x[1], @x[1], #8 + veor @t[6], @t[6], @x[6] + vext.8 @t[2], @x[2], @x[2], #8 + veor @t[7], @t[7], @x[7] + vext.8 @t[3], @x[3], @x[3], #8 + veor @t[1], @t[1], @x[1] + vext.8 @t[4], @x[4], @x[4], #8 + veor @t[2], @t[2], @x[2] + vext.8 @t[5], @x[5], @x[5], #8 + veor @t[3], @t[3], @x[3] + veor @t[4], @t[4], @x[4] + veor @t[5], @t[5], @x[5] + + veor @x[0], @x[0], @t[6] + veor @x[1], @x[1], @t[6] + veor @x[2], @x[2], @t[0] + veor @x[4], @x[4], @t[2] + veor @x[3], @x[3], @t[1] + veor @x[1], @x[1], @t[7] + veor @x[2], @x[2], @t[7] + veor @x[4], @x[4], @t[6] + veor @x[5], @x[5], @t[3] + veor @x[3], @x[3], @t[6] + veor @x[6], @x[6], @t[4] + veor @x[4], @x[4], @t[7] + veor @x[5], @x[5], @t[7] + veor @x[7], @x[7], @t[5] +___ + &MixColumns (@x,@t,1); # flipped 2<->3 and 4<->6 +} + +sub swapmove { +my ($a,$b,$n,$mask,$t)=@_; +$code.=<<___; + vshr.u64 $t, $b, #$n 
+ veor $t, $t, $a + vand $t, $t, $mask + veor $a, $a, $t + vshl.u64 $t, $t, #$n + veor $b, $b, $t +___ +} +sub swapmove2x { +my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_; +$code.=<<___; + vshr.u64 $t0, $b0, #$n + vshr.u64 $t1, $b1, #$n + veor $t0, $t0, $a0 + veor $t1, $t1, $a1 + vand $t0, $t0, $mask + vand $t1, $t1, $mask + veor $a0, $a0, $t0 + vshl.u64 $t0, $t0, #$n + veor $a1, $a1, $t1 + vshl.u64 $t1, $t1, #$n + veor $b0, $b0, $t0 + veor $b1, $b1, $t1 +___ +} + +sub bitslice { +my @x=reverse(@_[0..7]); +my ($t0,$t1,$t2,$t3)=@_[8..11]; +$code.=<<___; + vmov.i8 $t0,#0x55 @ compose .LBS0 + vmov.i8 $t1,#0x33 @ compose .LBS1 +___ + &swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3); + &swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3); +$code.=<<___; + vmov.i8 $t0,#0x0f @ compose .LBS2 +___ + &swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3); + &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3); + + &swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3); + &swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3); +} + +$code.=<<___; +#ifndef __KERNEL__ +# define VFP_ABI_PUSH vstmdb sp!,{d8-d15} +# define VFP_ABI_POP vldmia sp!,{d8-d15} +# define VFP_ABI_FRAME 0x40 +#else +# define VFP_ABI_PUSH +# define VFP_ABI_POP +# define VFP_ABI_FRAME 0 +# define BSAES_ASM_EXTENDED_KEY +# define __ARM_MAX_ARCH__ 7 +#endif + +#ifdef __thumb__ +# define adrl adr +#endif + +#if __ARM_MAX_ARCH__>=7 +.arch armv7-a +.fpu neon + +.text +.syntax unified @ ARMv7-capable assembler is expected to handle this +#if defined(__thumb2__) && !defined(__APPLE__) +.thumb +#else +.code 32 +# undef __thumb2__ +#endif + +.type _bsaes_const,%object +.align 6 +_bsaes_const: +.LM0ISR: @ InvShiftRows constants + .quad 0x0a0e0206070b0f03, 0x0004080c0d010509 +.LISR: + .quad 0x0504070602010003, 0x0f0e0d0c080b0a09 +.LISRM0: + .quad 0x01040b0e0205080f, 0x0306090c00070a0d +.LM0SR: @ ShiftRows constants + .quad 0x0a0e02060f03070b, 0x0004080c05090d01 +.LSR: + .quad 0x0504070600030201, 0x0f0e0d0c0a09080b +.LSRM0: + .quad 0x0304090e00050a0f, 0x01060b0c0207080d +.LM0: + .quad 0x02060a0e03070b0f, 0x0004080c0105090d +.LREVM0SR: + .quad 0x090d01050c000408, 0x03070b0f060a0e02 +.asciz "Bit-sliced AES for NEON, CRYPTOGAMS by " +.align 6 +.size _bsaes_const,.-_bsaes_const + +.type _bsaes_encrypt8,%function +.align 4 +_bsaes_encrypt8: + adr $const,. 
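+	@ $const now holds the address of _bsaes_encrypt8 itself; the
+	@ _bsaes_const table sits just above, so .LM0SR is reached either
+	@ directly with adr (Thumb-2/Apple) or by subtracting the fixed
+	@ offset #_bsaes_encrypt8-.LM0SR below.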
+ vldmia $key!, {@XMM[9]} @ round 0 key +#if defined(__thumb2__) || defined(__APPLE__) + adr $const,.LM0SR +#else + sub $const,$const,#_bsaes_encrypt8-.LM0SR +#endif + + vldmia $const!, {@XMM[8]} @ .LM0SR +_bsaes_encrypt8_alt: + veor @XMM[10], @XMM[0], @XMM[9] @ xor with round0 key + veor @XMM[11], @XMM[1], @XMM[9] + vtbl.8 `&Dlo(@XMM[0])`, {@XMM[10]}, `&Dlo(@XMM[8])` + vtbl.8 `&Dhi(@XMM[0])`, {@XMM[10]}, `&Dhi(@XMM[8])` + veor @XMM[12], @XMM[2], @XMM[9] + vtbl.8 `&Dlo(@XMM[1])`, {@XMM[11]}, `&Dlo(@XMM[8])` + vtbl.8 `&Dhi(@XMM[1])`, {@XMM[11]}, `&Dhi(@XMM[8])` + veor @XMM[13], @XMM[3], @XMM[9] + vtbl.8 `&Dlo(@XMM[2])`, {@XMM[12]}, `&Dlo(@XMM[8])` + vtbl.8 `&Dhi(@XMM[2])`, {@XMM[12]}, `&Dhi(@XMM[8])` + veor @XMM[14], @XMM[4], @XMM[9] + vtbl.8 `&Dlo(@XMM[3])`, {@XMM[13]}, `&Dlo(@XMM[8])` + vtbl.8 `&Dhi(@XMM[3])`, {@XMM[13]}, `&Dhi(@XMM[8])` + veor @XMM[15], @XMM[5], @XMM[9] + vtbl.8 `&Dlo(@XMM[4])`, {@XMM[14]}, `&Dlo(@XMM[8])` + vtbl.8 `&Dhi(@XMM[4])`, {@XMM[14]}, `&Dhi(@XMM[8])` + veor @XMM[10], @XMM[6], @XMM[9] + vtbl.8 `&Dlo(@XMM[5])`, {@XMM[15]}, `&Dlo(@XMM[8])` + vtbl.8 `&Dhi(@XMM[5])`, {@XMM[15]}, `&Dhi(@XMM[8])` + veor @XMM[11], @XMM[7], @XMM[9] + vtbl.8 `&Dlo(@XMM[6])`, {@XMM[10]}, `&Dlo(@XMM[8])` + vtbl.8 `&Dhi(@XMM[6])`, {@XMM[10]}, `&Dhi(@XMM[8])` + vtbl.8 `&Dlo(@XMM[7])`, {@XMM[11]}, `&Dlo(@XMM[8])` + vtbl.8 `&Dhi(@XMM[7])`, {@XMM[11]}, `&Dhi(@XMM[8])` +_bsaes_encrypt8_bitslice: +___ + &bitslice (@XMM[0..7, 8..11]); +$code.=<<___; + sub $rounds,$rounds,#1 + b .Lenc_sbox +.align 4 +.Lenc_loop: +___ + &ShiftRows (@XMM[0..7, 8..12]); +$code.=".Lenc_sbox:\n"; + &Sbox (@XMM[0..7, 8..15]); +$code.=<<___; + subs $rounds,$rounds,#1 + bcc .Lenc_done +___ + &MixColumns (@XMM[0,1,4,6,3,7,2,5, 8..15]); +$code.=<<___; + vldmia $const, {@XMM[12]} @ .LSR + ite eq @ Thumb2 thing, samity check in ARM + addeq $const,$const,#0x10 + bne .Lenc_loop + vldmia $const, {@XMM[12]} @ .LSRM0 + b .Lenc_loop +.align 4 +.Lenc_done: +___ + # output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb + &bitslice (@XMM[0,1,4,6,3,7,2,5, 8..11]); +$code.=<<___; + vldmia $key, {@XMM[8]} @ last round key + veor @XMM[4], @XMM[4], @XMM[8] + veor @XMM[6], @XMM[6], @XMM[8] + veor @XMM[3], @XMM[3], @XMM[8] + veor @XMM[7], @XMM[7], @XMM[8] + veor @XMM[2], @XMM[2], @XMM[8] + veor @XMM[5], @XMM[5], @XMM[8] + veor @XMM[0], @XMM[0], @XMM[8] + veor @XMM[1], @XMM[1], @XMM[8] + bx lr +.size _bsaes_encrypt8,.-_bsaes_encrypt8 +___ +} +{ +my ($out,$inp,$rounds,$const)=("r12","r4","r5","r6"); + +sub bitslice_key { +my @x=reverse(@_[0..7]); +my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12]; + + &swapmove (@x[0,1],1,$bs0,$t2,$t3); +$code.=<<___; + @ &swapmove(@x[2,3],1,$t0,$t2,$t3); + vmov @x[2], @x[0] + vmov @x[3], @x[1] +___ + #&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3); + + &swapmove2x (@x[0,2,1,3],2,$bs1,$t2,$t3); +$code.=<<___; + @ &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3); + vmov @x[4], @x[0] + vmov @x[6], @x[2] + vmov @x[5], @x[1] + vmov @x[7], @x[3] +___ + &swapmove2x (@x[0,4,1,5],4,$bs2,$t2,$t3); + &swapmove2x (@x[2,6,3,7],4,$bs2,$t2,$t3); +} + +$code.=<<___; +.type _bsaes_key_convert,%function +.align 4 +_bsaes_key_convert: + adr $const,. + vld1.8 {@XMM[7]}, [$inp]! @ load round 0 key +#if defined(__thumb2__) || defined(__APPLE__) + adr $const,.LM0 +#else + sub $const,$const,#_bsaes_key_convert-.LM0 +#endif + vld1.8 {@XMM[15]}, [$inp]! 
@ load round 1 key + + vmov.i8 @XMM[8], #0x01 @ bit masks + vmov.i8 @XMM[9], #0x02 + vmov.i8 @XMM[10], #0x04 + vmov.i8 @XMM[11], #0x08 + vmov.i8 @XMM[12], #0x10 + vmov.i8 @XMM[13], #0x20 + vldmia $const, {@XMM[14]} @ .LM0 + +#ifdef __ARMEL__ + vrev32.8 @XMM[7], @XMM[7] + vrev32.8 @XMM[15], @XMM[15] +#endif + sub $rounds,$rounds,#1 + vstmia $out!, {@XMM[7]} @ save round 0 key + b .Lkey_loop + +.align 4 +.Lkey_loop: + vtbl.8 `&Dlo(@XMM[7])`,{@XMM[15]},`&Dlo(@XMM[14])` + vtbl.8 `&Dhi(@XMM[7])`,{@XMM[15]},`&Dhi(@XMM[14])` + vmov.i8 @XMM[6], #0x40 + vmov.i8 @XMM[15], #0x80 + + vtst.8 @XMM[0], @XMM[7], @XMM[8] + vtst.8 @XMM[1], @XMM[7], @XMM[9] + vtst.8 @XMM[2], @XMM[7], @XMM[10] + vtst.8 @XMM[3], @XMM[7], @XMM[11] + vtst.8 @XMM[4], @XMM[7], @XMM[12] + vtst.8 @XMM[5], @XMM[7], @XMM[13] + vtst.8 @XMM[6], @XMM[7], @XMM[6] + vtst.8 @XMM[7], @XMM[7], @XMM[15] + vld1.8 {@XMM[15]}, [$inp]! @ load next round key + vmvn @XMM[0], @XMM[0] @ "pnot" + vmvn @XMM[1], @XMM[1] + vmvn @XMM[5], @XMM[5] + vmvn @XMM[6], @XMM[6] +#ifdef __ARMEL__ + vrev32.8 @XMM[15], @XMM[15] +#endif + subs $rounds,$rounds,#1 + vstmia $out!,{@XMM[0]-@XMM[7]} @ write bit-sliced round key + bne .Lkey_loop + + vmov.i8 @XMM[7],#0x63 @ compose .L63 + @ don't save last round key + bx lr +.size _bsaes_key_convert,.-_bsaes_key_convert +___ +} + +{ +my ($inp,$out,$len,$key, $ctr,$fp,$rounds)=(map("r$_",(0..3,8..10))); +my $const = "r6"; # shared with _bsaes_encrypt8_alt +my $keysched = "sp"; + +$code.=<<___; +.global bsaes_ctr32_encrypt_blocks +.type bsaes_ctr32_encrypt_blocks,%function +.align 5 +bsaes_ctr32_encrypt_blocks: + @ In OpenSSL, short inputs fall back to aes_nohw_* here. We patch this + @ out to retain a constant-time implementation. + mov ip, sp + stmdb sp!, {r4-r10, lr} + VFP_ABI_PUSH + ldr $ctr, [ip] @ ctr is 1st arg on the stack + sub sp, sp, #0x10 @ scratch space to carry over the ctr + mov $fp, sp @ save sp + + ldr $rounds, [$key, #240] @ get # of rounds +#ifndef BSAES_ASM_EXTENDED_KEY + @ allocate the key schedule on the stack + sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key + add r12, #`128-32` @ size of bit-sliced key schedule + + @ populate the key schedule + mov r4, $key @ pass key + mov r5, $rounds @ pass # of rounds + mov sp, r12 @ sp is $keysched + bl _bsaes_key_convert + veor @XMM[7],@XMM[7],@XMM[15] @ fix up last round key + vstmia r12, {@XMM[7]} @ save last round key + + vld1.8 {@XMM[0]}, [$ctr] @ load counter +#ifdef __APPLE__ + mov $ctr, #:lower16:(.LREVM0SR-.LM0) + add $ctr, $const, $ctr +#else + add $ctr, $const, #.LREVM0SR-.LM0 @ borrow $ctr +#endif + vldmia $keysched, {@XMM[4]} @ load round0 key +#else + ldr r12, [$key, #244] + eors r12, #1 + beq 0f + + @ populate the key schedule + str r12, [$key, #244] + mov r4, $key @ pass key + mov r5, $rounds @ pass # of rounds + add r12, $key, #248 @ pass key schedule + bl _bsaes_key_convert + veor @XMM[7],@XMM[7],@XMM[15] @ fix up last round key + vstmia r12, {@XMM[7]} @ save last round key + +.align 2 +0: add r12, $key, #248 + vld1.8 {@XMM[0]}, [$ctr] @ load counter + adrl $ctr, .LREVM0SR @ borrow $ctr + vldmia r12, {@XMM[4]} @ load round0 key + sub sp, #0x10 @ place for adjusted round0 key +#endif + + vmov.i32 @XMM[8],#1 @ compose 1<<96 + veor @XMM[9],@XMM[9],@XMM[9] + vrev32.8 @XMM[0],@XMM[0] + vext.8 @XMM[8],@XMM[9],@XMM[8],#4 + vrev32.8 @XMM[4],@XMM[4] + vadd.u32 @XMM[9],@XMM[8],@XMM[8] @ compose 2<<96 + vstmia $keysched, {@XMM[4]} @ save adjusted round0 key + b .Lctr_enc_loop + +.align 4 +.Lctr_enc_loop: + vadd.u32 @XMM[10], @XMM[8], @XMM[9] @ 
compose 3<<96 + vadd.u32 @XMM[1], @XMM[0], @XMM[8] @ +1 + vadd.u32 @XMM[2], @XMM[0], @XMM[9] @ +2 + vadd.u32 @XMM[3], @XMM[0], @XMM[10] @ +3 + vadd.u32 @XMM[4], @XMM[1], @XMM[10] + vadd.u32 @XMM[5], @XMM[2], @XMM[10] + vadd.u32 @XMM[6], @XMM[3], @XMM[10] + vadd.u32 @XMM[7], @XMM[4], @XMM[10] + vadd.u32 @XMM[10], @XMM[5], @XMM[10] @ next counter + + @ Borrow prologue from _bsaes_encrypt8 to use the opportunity + @ to flip byte order in 32-bit counter + + vldmia $keysched, {@XMM[9]} @ load round0 key +#ifndef BSAES_ASM_EXTENDED_KEY + add r4, $keysched, #0x10 @ pass next round key +#else + add r4, $key, #`248+16` +#endif + vldmia $ctr, {@XMM[8]} @ .LREVM0SR + mov r5, $rounds @ pass rounds + vstmia $fp, {@XMM[10]} @ save next counter +#ifdef __APPLE__ + mov $const, #:lower16:(.LREVM0SR-.LSR) + sub $const, $ctr, $const +#else + sub $const, $ctr, #.LREVM0SR-.LSR @ pass constants +#endif + + bl _bsaes_encrypt8_alt + + subs $len, $len, #8 + blo .Lctr_enc_loop_done + + vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ load input + vld1.8 {@XMM[10]-@XMM[11]}, [$inp]! + veor @XMM[0], @XMM[8] + veor @XMM[1], @XMM[9] + vld1.8 {@XMM[12]-@XMM[13]}, [$inp]! + veor @XMM[4], @XMM[10] + veor @XMM[6], @XMM[11] + vld1.8 {@XMM[14]-@XMM[15]}, [$inp]! + veor @XMM[3], @XMM[12] + vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output + veor @XMM[7], @XMM[13] + veor @XMM[2], @XMM[14] + vst1.8 {@XMM[4]}, [$out]! + veor @XMM[5], @XMM[15] + vst1.8 {@XMM[6]}, [$out]! + vmov.i32 @XMM[8], #1 @ compose 1<<96 + vst1.8 {@XMM[3]}, [$out]! + veor @XMM[9], @XMM[9], @XMM[9] + vst1.8 {@XMM[7]}, [$out]! + vext.8 @XMM[8], @XMM[9], @XMM[8], #4 + vst1.8 {@XMM[2]}, [$out]! + vadd.u32 @XMM[9],@XMM[8],@XMM[8] @ compose 2<<96 + vst1.8 {@XMM[5]}, [$out]! + vldmia $fp, {@XMM[0]} @ load counter + + bne .Lctr_enc_loop + b .Lctr_enc_done + +.align 4 +.Lctr_enc_loop_done: + add $len, $len, #8 + vld1.8 {@XMM[8]}, [$inp]! @ load input + veor @XMM[0], @XMM[8] + vst1.8 {@XMM[0]}, [$out]! @ write output + cmp $len, #2 + blo .Lctr_enc_done + vld1.8 {@XMM[9]}, [$inp]! + veor @XMM[1], @XMM[9] + vst1.8 {@XMM[1]}, [$out]! + beq .Lctr_enc_done + vld1.8 {@XMM[10]}, [$inp]! + veor @XMM[4], @XMM[10] + vst1.8 {@XMM[4]}, [$out]! + cmp $len, #4 + blo .Lctr_enc_done + vld1.8 {@XMM[11]}, [$inp]! + veor @XMM[6], @XMM[11] + vst1.8 {@XMM[6]}, [$out]! + beq .Lctr_enc_done + vld1.8 {@XMM[12]}, [$inp]! + veor @XMM[3], @XMM[12] + vst1.8 {@XMM[3]}, [$out]! + cmp $len, #6 + blo .Lctr_enc_done + vld1.8 {@XMM[13]}, [$inp]! + veor @XMM[7], @XMM[13] + vst1.8 {@XMM[7]}, [$out]! + beq .Lctr_enc_done + vld1.8 {@XMM[14]}, [$inp] + veor @XMM[2], @XMM[14] + vst1.8 {@XMM[2]}, [$out]! + +.Lctr_enc_done: + vmov.i32 q0, #0 + vmov.i32 q1, #0 +#ifndef BSAES_ASM_EXTENDED_KEY +.Lctr_enc_bzero: @ wipe key schedule [if any] + vstmia $keysched!, {q0-q1} + cmp $keysched, $fp + bne .Lctr_enc_bzero +#else + vstmia $keysched, {q0-q1} +#endif + + mov sp, $fp + add sp, #0x10 @ add sp,$fp,#0x10 is no good for thumb + VFP_ABI_POP + ldmia sp!, {r4-r10, pc} @ return + + @ OpenSSL contains aes_nohw_* fallback code here. We patch this + @ out to retain a constant-time implementation. 
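+	@ Note on structure (a non-normative summary of the code above): the main
+	@ path encrypts eight counter blocks per iteration; .Lctr_enc_loop_done
+	@ consumes a remaining 1-7 block tail one block at a time, and
+	@ .Lctr_enc_bzero wipes the stack-allocated bit-sliced key schedule before
+	@ the epilogue restores the callee-saved registers.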
+.size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks +___ +} +$code.=<<___; +#endif +___ + +$code =~ s/\`([^\`]*)\`/eval($1)/gem; + +open SELF,$0; +while() { + next if (/^#!/); + last if (!s/^#/@/ and !/^$/); + print; +} +close SELF; + +print $code; + +close STDOUT or die "error closing STDOUT: $!"; diff --git a/ring-0.17.14/crypto/fipsmodule/aes/asm/ghash-armv4.pl b/ring-0.17.14/crypto/fipsmodule/aes/asm/ghash-armv4.pl new file mode 100644 index 0000000000..c16ed377d3 --- /dev/null +++ b/ring-0.17.14/crypto/fipsmodule/aes/asm/ghash-armv4.pl @@ -0,0 +1,303 @@ +#! /usr/bin/env perl +# Copyright 2010-2018 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. +# ==================================================================== +# +# April 2010 +# +# The module implements "4-bit" GCM GHASH function and underlying +# single multiplication operation in GF(2^128). "4-bit" means that it +# uses 256 bytes per-key table [+32 bytes shared table]. There is no +# experimental performance data available yet. The only approximation +# that can be made at this point is based on code size. Inner loop is +# 32 instructions long and on single-issue core should execute in <40 +# cycles. Having verified that gcc 3.4 didn't unroll corresponding +# loop, this assembler loop body was found to be ~3x smaller than +# compiler-generated one... +# +# July 2010 +# +# Rescheduling for dual-issue pipeline resulted in 8.5% improvement on +# Cortex A8 core and ~25 cycles per processed byte (which was observed +# to be ~3 times faster than gcc-generated code:-) +# +# February 2011 +# +# Profiler-assisted and platform-specific optimization resulted in 7% +# improvement on Cortex A8 core and ~23.5 cycles per byte. +# +# March 2011 +# +# Add NEON implementation featuring polynomial multiplication, i.e. no +# lookup tables involved. On Cortex A8 it was measured to process one +# byte in 15 cycles or 55% faster than integer-only code. +# +# April 2014 +# +# Switch to multiplication algorithm suggested in paper referred +# below and combine it with reduction algorithm from x86 module. +# Performance improvement over previous version varies from 65% on +# Snapdragon S4 to 110% on Cortex A9. In absolute terms Cortex A8 +# processes one byte in 8.45 cycles, A9 - in 10.2, A15 - in 7.63, +# Snapdragon S4 - in 9.33. +# +# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software +# Polynomial Multiplication on ARM Processors using the NEON Engine. +# +# http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf + +# ==================================================================== +# Note about "528B" variant. 
In ARM case it makes lesser sense to +# implement it for following reasons: +# +# - performance improvement won't be anywhere near 50%, because 128- +# bit shift operation is neatly fused with 128-bit xor here, and +# "538B" variant would eliminate only 4-5 instructions out of 32 +# in the inner loop (meaning that estimated improvement is ~15%); +# - ARM-based systems are often embedded ones and extra memory +# consumption might be unappreciated (for so little improvement); +# +# Byte order [in]dependence. ========================================= +# +# Caller is expected to maintain specific *dword* order in Htable, +# namely with *least* significant dword of 128-bit value at *lower* +# address. This differs completely from C code and has everything to +# do with ldm instruction and order in which dwords are "consumed" by +# algorithm. *Byte* order within these dwords in turn is whatever +# *native* byte order on current platform. See gcm128.c for working +# example... + +# This file was patched in BoringSSL to remove the variable-time 4-bit +# implementation. + +$flavour = shift; +if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; } +else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} } + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; + *STDOUT=*OUT; +} else { + open OUT,">$output"; + *STDOUT=*OUT; +} + +$Xi="r0"; # argument block +$Htbl="r1"; +$inp="r2"; +$len="r3"; + +$code=<<___; +@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both +@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions. (ARMv8 PMULL +@ instructions are in aesv8-armx.pl.) 
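+@ Only the NEON gcm_init_neon, gcm_gmult_neon and gcm_ghash_neon routines are
+@ emitted below; the variable-time 4-bit table implementation was removed, as
+@ noted in the file header.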
+.arch armv7-a + +.text +#if defined(__thumb2__) || defined(__clang__) +.syntax unified +#define ldrplb ldrbpl +#define ldrneb ldrbne +#endif +#if defined(__thumb2__) +.thumb +#else +.code 32 +#endif +___ +{ +my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3)); +my ($t0,$t1,$t2,$t3)=map("q$_",(8..12)); +my ($Hlo,$Hhi,$Hhl,$k48,$k32,$k16)=map("d$_",(26..31)); + +sub clmul64x64 { +my ($r,$a,$b)=@_; +$code.=<<___; + vext.8 $t0#lo, $a, $a, #1 @ A1 + vmull.p8 $t0, $t0#lo, $b @ F = A1*B + vext.8 $r#lo, $b, $b, #1 @ B1 + vmull.p8 $r, $a, $r#lo @ E = A*B1 + vext.8 $t1#lo, $a, $a, #2 @ A2 + vmull.p8 $t1, $t1#lo, $b @ H = A2*B + vext.8 $t3#lo, $b, $b, #2 @ B2 + vmull.p8 $t3, $a, $t3#lo @ G = A*B2 + vext.8 $t2#lo, $a, $a, #3 @ A3 + veor $t0, $t0, $r @ L = E + F + vmull.p8 $t2, $t2#lo, $b @ J = A3*B + vext.8 $r#lo, $b, $b, #3 @ B3 + veor $t1, $t1, $t3 @ M = G + H + vmull.p8 $r, $a, $r#lo @ I = A*B3 + veor $t0#lo, $t0#lo, $t0#hi @ t0 = (L) (P0 + P1) << 8 + vand $t0#hi, $t0#hi, $k48 + vext.8 $t3#lo, $b, $b, #4 @ B4 + veor $t1#lo, $t1#lo, $t1#hi @ t1 = (M) (P2 + P3) << 16 + vand $t1#hi, $t1#hi, $k32 + vmull.p8 $t3, $a, $t3#lo @ K = A*B4 + veor $t2, $t2, $r @ N = I + J + veor $t0#lo, $t0#lo, $t0#hi + veor $t1#lo, $t1#lo, $t1#hi + veor $t2#lo, $t2#lo, $t2#hi @ t2 = (N) (P4 + P5) << 24 + vand $t2#hi, $t2#hi, $k16 + vext.8 $t0, $t0, $t0, #15 + veor $t3#lo, $t3#lo, $t3#hi @ t3 = (K) (P6 + P7) << 32 + vmov.i64 $t3#hi, #0 + vext.8 $t1, $t1, $t1, #14 + veor $t2#lo, $t2#lo, $t2#hi + vmull.p8 $r, $a, $b @ D = A*B + vext.8 $t3, $t3, $t3, #12 + vext.8 $t2, $t2, $t2, #13 + veor $t0, $t0, $t1 + veor $t2, $t2, $t3 + veor $r, $r, $t0 + veor $r, $r, $t2 +___ +} + +$code.=<<___; +#if __ARM_MAX_ARCH__>=7 +.arch armv7-a +.fpu neon + +.global gcm_init_neon +.type gcm_init_neon,%function +.align 4 +gcm_init_neon: + vld1.64 $IN#hi,[r1]! @ load H + vmov.i8 $t0,#0xe1 + vld1.64 $IN#lo,[r1] + vshl.i64 $t0#hi,#57 + vshr.u64 $t0#lo,#63 @ t0=0xc2....01 + vdup.8 $t1,$IN#hi[7] + vshr.u64 $Hlo,$IN#lo,#63 + vshr.s8 $t1,#7 @ broadcast carry bit + vshl.i64 $IN,$IN,#1 + vand $t0,$t0,$t1 + vorr $IN#hi,$Hlo @ H<<<=1 + veor $IN,$IN,$t0 @ twisted H + vstmia r0,{$IN} + + ret @ bx lr +.size gcm_init_neon,.-gcm_init_neon + +.global gcm_gmult_neon +.type gcm_gmult_neon,%function +.align 4 +gcm_gmult_neon: + vld1.64 $IN#hi,[$Xi]! @ load Xi + vld1.64 $IN#lo,[$Xi]! + vmov.i64 $k48,#0x0000ffffffffffff + vldmia $Htbl,{$Hlo-$Hhi} @ load twisted H + vmov.i64 $k32,#0x00000000ffffffff +#ifdef __ARMEL__ + vrev64.8 $IN,$IN +#endif + vmov.i64 $k16,#0x000000000000ffff + veor $Hhl,$Hlo,$Hhi @ Karatsuba pre-processing + mov $len,#16 + b .Lgmult_neon +.size gcm_gmult_neon,.-gcm_gmult_neon + +.global gcm_ghash_neon +.type gcm_ghash_neon,%function +.align 4 +gcm_ghash_neon: + vld1.64 $Xl#hi,[$Xi]! @ load Xi + vld1.64 $Xl#lo,[$Xi]! + vmov.i64 $k48,#0x0000ffffffffffff + vldmia $Htbl,{$Hlo-$Hhi} @ load twisted H + vmov.i64 $k32,#0x00000000ffffffff +#ifdef __ARMEL__ + vrev64.8 $Xl,$Xl +#endif + vmov.i64 $k16,#0x000000000000ffff + veor $Hhl,$Hlo,$Hhi @ Karatsuba pre-processing + +.Loop_neon: + vld1.64 $IN#hi,[$inp]! @ load inp + vld1.64 $IN#lo,[$inp]! 
+#ifdef __ARMEL__ + vrev64.8 $IN,$IN +#endif + veor $IN,$Xl @ inp^=Xi +.Lgmult_neon: +___ + &clmul64x64 ($Xl,$Hlo,"$IN#lo"); # H.lo·Xi.lo +$code.=<<___; + veor $IN#lo,$IN#lo,$IN#hi @ Karatsuba pre-processing +___ + &clmul64x64 ($Xm,$Hhl,"$IN#lo"); # (H.lo+H.hi)·(Xi.lo+Xi.hi) + &clmul64x64 ($Xh,$Hhi,"$IN#hi"); # H.hi·Xi.hi +$code.=<<___; + veor $Xm,$Xm,$Xl @ Karatsuba post-processing + veor $Xm,$Xm,$Xh + veor $Xl#hi,$Xl#hi,$Xm#lo + veor $Xh#lo,$Xh#lo,$Xm#hi @ Xh|Xl - 256-bit result + + @ equivalent of reduction_avx from ghash-x86_64.pl + vshl.i64 $t1,$Xl,#57 @ 1st phase + vshl.i64 $t2,$Xl,#62 + veor $t2,$t2,$t1 @ + vshl.i64 $t1,$Xl,#63 + veor $t2, $t2, $t1 @ + veor $Xl#hi,$Xl#hi,$t2#lo @ + veor $Xh#lo,$Xh#lo,$t2#hi + + vshr.u64 $t2,$Xl,#1 @ 2nd phase + veor $Xh,$Xh,$Xl + veor $Xl,$Xl,$t2 @ + vshr.u64 $t2,$t2,#6 + vshr.u64 $Xl,$Xl,#1 @ + veor $Xl,$Xl,$Xh @ + veor $Xl,$Xl,$t2 @ + + subs $len,#16 + bne .Loop_neon + +#ifdef __ARMEL__ + vrev64.8 $Xl,$Xl +#endif + sub $Xi,#16 + vst1.64 $Xl#hi,[$Xi]! @ write out Xi + vst1.64 $Xl#lo,[$Xi] + + ret @ bx lr +.size gcm_ghash_neon,.-gcm_ghash_neon +#endif +___ +} +$code.=<<___; +.asciz "GHASH for ARMv4/NEON, CRYPTOGAMS by " +.align 2 +___ + +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/geo; + + s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or + s/\bret\b/bx lr/go or + s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4 + + print $_,"\n"; +} +close STDOUT or die "error closing STDOUT: $!"; # enforce flush diff --git a/ring-0.17.14/crypto/fipsmodule/aes/asm/ghash-neon-armv8.pl b/ring-0.17.14/crypto/fipsmodule/aes/asm/ghash-neon-armv8.pl new file mode 100644 index 0000000000..b4ae1db8c9 --- /dev/null +++ b/ring-0.17.14/crypto/fipsmodule/aes/asm/ghash-neon-armv8.pl @@ -0,0 +1,297 @@ +#! /usr/bin/env perl +# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. +# ==================================================================== + +# This file was adapted to AArch64 from the 32-bit version in ghash-armv4.pl. It +# implements the multiplication algorithm described in: +# +# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software +# Polynomial Multiplication on ARM Processors using the NEON Engine. +# +# http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf +# +# The main distinction to keep in mind between 32-bit NEON and AArch64 SIMD is +# AArch64 cannot compute over the upper halves of SIMD registers. In 32-bit +# NEON, the low and high halves of the 128-bit register q0 are accessible as +# 64-bit registers d0 and d1, respectively. In AArch64, dN is the lower half of +# vN. Where the 32-bit version would use the upper half, this file must keep +# halves in separate registers. +# +# The other distinction is in syntax. 
32-bit NEON embeds lane information in the +# instruction name, while AArch64 uses suffixes on the registers. For instance, +# left-shifting 64-bit lanes of a SIMD register in 32-bit would be written: +# +# vshl.i64 q0, q0, #1 +# +# in 64-bit, it would be written: +# +# shl v0.2d, v0.2d, #1 +# +# See Programmer's Guide for ARMv8-A, section 7 for details. +# http://infocenter.arm.com/help/topic/com.arm.doc.den0024a/DEN0024A_v8_architecture_PG.pdf +# +# Finally, note the 8-bit and 64-bit polynomial multipliers in AArch64 differ +# only by suffix. pmull vR.8h, vA.8b, vB.8b multiplies eight 8-bit polynomials +# and is always available. pmull vR.1q, vA.1d, vB.1d multiplies a 64-bit +# polynomial and is conditioned on the PMULL extension. This file emulates the +# latter with the former. + +use strict; + +my $flavour = shift; +my $output; +if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; } +else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} } + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; + my $dir = $1; + my $xlate; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; + *STDOUT=*OUT; +} else { + open OUT,">$output"; + *STDOUT=*OUT; +} + +my ($Xi, $Htbl, $inp, $len) = map("x$_", (0..3)); # argument block +my ($Xl, $Xm, $Xh, $INlo, $INhi) = map("v$_", (0..4)); +my ($Hlo, $Hhi, $Hhl) = map("v$_", (5..7)); +# d8-d15 are callee-saved, so avoid v8-v15. AArch64 SIMD has plenty of registers +# to spare. +my ($t0, $t1, $t2, $t3) = map("v$_", (16..19)); +my ($t0l_t1l, $t0h_t1h, $t2l_t3l, $t2h_t3h) = map("v$_", (20..23)); +my ($k48_k32, $k16_k0) = map("v$_", (24..25)); + +my $code = ""; + +# clmul64x64 emits code which emulates pmull $r.1q, $a.1d, $b.1d. $r, $a, and $b +# must be distinct from $t* and $k*. $t* are clobbered by the emitted code. +sub clmul64x64 { +my ($r, $a, $b) = @_; +$code .= <<___; + ext $t0.8b, $a.8b, $a.8b, #1 // A1 + pmull $t0.8h, $t0.8b, $b.8b // F = A1*B + ext $r.8b, $b.8b, $b.8b, #1 // B1 + pmull $r.8h, $a.8b, $r.8b // E = A*B1 + ext $t1.8b, $a.8b, $a.8b, #2 // A2 + pmull $t1.8h, $t1.8b, $b.8b // H = A2*B + ext $t3.8b, $b.8b, $b.8b, #2 // B2 + pmull $t3.8h, $a.8b, $t3.8b // G = A*B2 + ext $t2.8b, $a.8b, $a.8b, #3 // A3 + eor $t0.16b, $t0.16b, $r.16b // L = E + F + pmull $t2.8h, $t2.8b, $b.8b // J = A3*B + ext $r.8b, $b.8b, $b.8b, #3 // B3 + eor $t1.16b, $t1.16b, $t3.16b // M = G + H + pmull $r.8h, $a.8b, $r.8b // I = A*B3 + + // Here we diverge from the 32-bit version. It computes the following + // (instructions reordered for clarity): + // + // veor \$t0#lo, \$t0#lo, \$t0#hi @ t0 = P0 + P1 (L) + // vand \$t0#hi, \$t0#hi, \$k48 + // veor \$t0#lo, \$t0#lo, \$t0#hi + // + // veor \$t1#lo, \$t1#lo, \$t1#hi @ t1 = P2 + P3 (M) + // vand \$t1#hi, \$t1#hi, \$k32 + // veor \$t1#lo, \$t1#lo, \$t1#hi + // + // veor \$t2#lo, \$t2#lo, \$t2#hi @ t2 = P4 + P5 (N) + // vand \$t2#hi, \$t2#hi, \$k16 + // veor \$t2#lo, \$t2#lo, \$t2#hi + // + // veor \$t3#lo, \$t3#lo, \$t3#hi @ t3 = P6 + P7 (K) + // vmov.i64 \$t3#hi, #0 + // + // \$kN is a mask with the bottom N bits set. AArch64 cannot compute on + // upper halves of SIMD registers, so we must split each half into + // separate registers. To compensate, we pair computations up and + // parallelize. 
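+	// For reference, once N = I + J and K = A*B4 have been formed below,
+	// the emulated 64x64 carry-less product is assembled (shift counts in
+	// bits, "+" denoting XOR) as
+	//
+	//   r = D + (L << 8) + (M << 16) + (N << 24) + (K << 32)
+	//
+	// with D = A*B, exactly as in the 32-bit original.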
+ + ext $t3.8b, $b.8b, $b.8b, #4 // B4 + eor $t2.16b, $t2.16b, $r.16b // N = I + J + pmull $t3.8h, $a.8b, $t3.8b // K = A*B4 + + // This can probably be scheduled more efficiently. For now, we just + // pair up independent instructions. + zip1 $t0l_t1l.2d, $t0.2d, $t1.2d + zip1 $t2l_t3l.2d, $t2.2d, $t3.2d + zip2 $t0h_t1h.2d, $t0.2d, $t1.2d + zip2 $t2h_t3h.2d, $t2.2d, $t3.2d + eor $t0l_t1l.16b, $t0l_t1l.16b, $t0h_t1h.16b + eor $t2l_t3l.16b, $t2l_t3l.16b, $t2h_t3h.16b + and $t0h_t1h.16b, $t0h_t1h.16b, $k48_k32.16b + and $t2h_t3h.16b, $t2h_t3h.16b, $k16_k0.16b + eor $t0l_t1l.16b, $t0l_t1l.16b, $t0h_t1h.16b + eor $t2l_t3l.16b, $t2l_t3l.16b, $t2h_t3h.16b + zip1 $t0.2d, $t0l_t1l.2d, $t0h_t1h.2d + zip1 $t2.2d, $t2l_t3l.2d, $t2h_t3h.2d + zip2 $t1.2d, $t0l_t1l.2d, $t0h_t1h.2d + zip2 $t3.2d, $t2l_t3l.2d, $t2h_t3h.2d + + ext $t0.16b, $t0.16b, $t0.16b, #15 // t0 = t0 << 8 + ext $t1.16b, $t1.16b, $t1.16b, #14 // t1 = t1 << 16 + pmull $r.8h, $a.8b, $b.8b // D = A*B + ext $t3.16b, $t3.16b, $t3.16b, #12 // t3 = t3 << 32 + ext $t2.16b, $t2.16b, $t2.16b, #13 // t2 = t2 << 24 + eor $t0.16b, $t0.16b, $t1.16b + eor $t2.16b, $t2.16b, $t3.16b + eor $r.16b, $r.16b, $t0.16b + eor $r.16b, $r.16b, $t2.16b +___ +} + +$code .= <<___; +.text + +.global gcm_init_neon +.type gcm_init_neon,%function +.align 4 +gcm_init_neon: + AARCH64_VALID_CALL_TARGET + // This function is adapted from gcm_init_v8. xC2 is t3. + ld1 {$t1.2d}, [x1] // load H + movi $t3.16b, #0xe1 + shl $t3.2d, $t3.2d, #57 // 0xc2.0 + ext $INlo.16b, $t1.16b, $t1.16b, #8 + ushr $t2.2d, $t3.2d, #63 + dup $t1.4s, $t1.s[1] + ext $t0.16b, $t2.16b, $t3.16b, #8 // t0=0xc2....01 + ushr $t2.2d, $INlo.2d, #63 + sshr $t1.4s, $t1.4s, #31 // broadcast carry bit + and $t2.16b, $t2.16b, $t0.16b + shl $INlo.2d, $INlo.2d, #1 + ext $t2.16b, $t2.16b, $t2.16b, #8 + and $t0.16b, $t0.16b, $t1.16b + orr $INlo.16b, $INlo.16b, $t2.16b // H<<<=1 + eor $Hlo.16b, $INlo.16b, $t0.16b // twisted H + st1 {$Hlo.2d}, [x0] // store Htable[0] + ret +.size gcm_init_neon,.-gcm_init_neon + +.global gcm_gmult_neon +.type gcm_gmult_neon,%function +.align 4 +gcm_gmult_neon: + AARCH64_VALID_CALL_TARGET + ld1 {$INlo.16b}, [$Xi] // load Xi + ld1 {$Hlo.1d}, [$Htbl], #8 // load twisted H + ld1 {$Hhi.1d}, [$Htbl] + adrp x9, :pg_hi21:.Lmasks // load constants + add x9, x9, :lo12:.Lmasks + ld1 {$k48_k32.2d, $k16_k0.2d}, [x9] + rev64 $INlo.16b, $INlo.16b // byteswap Xi + ext $INlo.16b, $INlo.16b, $INlo.16b, #8 + eor $Hhl.8b, $Hlo.8b, $Hhi.8b // Karatsuba pre-processing + + mov $len, #16 + b .Lgmult_neon +.size gcm_gmult_neon,.-gcm_gmult_neon + +.global gcm_ghash_neon +.type gcm_ghash_neon,%function +.align 4 +gcm_ghash_neon: + AARCH64_VALID_CALL_TARGET + ld1 {$Xl.16b}, [$Xi] // load Xi + ld1 {$Hlo.1d}, [$Htbl], #8 // load twisted H + ld1 {$Hhi.1d}, [$Htbl] + adrp x9, :pg_hi21:.Lmasks // load constants + add x9, x9, :lo12:.Lmasks + ld1 {$k48_k32.2d, $k16_k0.2d}, [x9] + rev64 $Xl.16b, $Xl.16b // byteswap Xi + ext $Xl.16b, $Xl.16b, $Xl.16b, #8 + eor $Hhl.8b, $Hlo.8b, $Hhi.8b // Karatsuba pre-processing + +.Loop_neon: + ld1 {$INlo.16b}, [$inp], #16 // load inp + rev64 $INlo.16b, $INlo.16b // byteswap inp + ext $INlo.16b, $INlo.16b, $INlo.16b, #8 + eor $INlo.16b, $INlo.16b, $Xl.16b // inp ^= Xi + +.Lgmult_neon: + // Split the input into $INlo and $INhi. (The upper halves are unused, + // so it is okay to leave them alone.) 
+ ins $INhi.d[0], $INlo.d[1] +___ +&clmul64x64 ($Xl, $Hlo, $INlo); # H.lo·Xi.lo +$code .= <<___; + eor $INlo.8b, $INlo.8b, $INhi.8b // Karatsuba pre-processing +___ +&clmul64x64 ($Xm, $Hhl, $INlo); # (H.lo+H.hi)·(Xi.lo+Xi.hi) +&clmul64x64 ($Xh, $Hhi, $INhi); # H.hi·Xi.hi +$code .= <<___; + ext $t0.16b, $Xl.16b, $Xh.16b, #8 + eor $Xm.16b, $Xm.16b, $Xl.16b // Karatsuba post-processing + eor $Xm.16b, $Xm.16b, $Xh.16b + eor $Xm.16b, $Xm.16b, $t0.16b // Xm overlaps Xh.lo and Xl.hi + ins $Xl.d[1], $Xm.d[0] // Xh|Xl - 256-bit result + // This is a no-op due to the ins instruction below. + // ins $Xh.d[0], $Xm.d[1] + + // equivalent of reduction_avx from ghash-x86_64.pl + shl $t1.2d, $Xl.2d, #57 // 1st phase + shl $t2.2d, $Xl.2d, #62 + eor $t2.16b, $t2.16b, $t1.16b // + shl $t1.2d, $Xl.2d, #63 + eor $t2.16b, $t2.16b, $t1.16b // + // Note Xm contains {Xl.d[1], Xh.d[0]}. + eor $t2.16b, $t2.16b, $Xm.16b + ins $Xl.d[1], $t2.d[0] // Xl.d[1] ^= t2.d[0] + ins $Xh.d[0], $t2.d[1] // Xh.d[0] ^= t2.d[1] + + ushr $t2.2d, $Xl.2d, #1 // 2nd phase + eor $Xh.16b, $Xh.16b,$Xl.16b + eor $Xl.16b, $Xl.16b,$t2.16b // + ushr $t2.2d, $t2.2d, #6 + ushr $Xl.2d, $Xl.2d, #1 // + eor $Xl.16b, $Xl.16b, $Xh.16b // + eor $Xl.16b, $Xl.16b, $t2.16b // + + subs $len, $len, #16 + bne .Loop_neon + + rev64 $Xl.16b, $Xl.16b // byteswap Xi and write + ext $Xl.16b, $Xl.16b, $Xl.16b, #8 + st1 {$Xl.16b}, [$Xi] + + ret +.size gcm_ghash_neon,.-gcm_ghash_neon + +.section .rodata +.align 4 +.Lmasks: +.quad 0x0000ffffffffffff // k48 +.quad 0x00000000ffffffff // k32 +.quad 0x000000000000ffff // k16 +.quad 0x0000000000000000 // k0 +.asciz "GHASH for ARMv8, derived from ARMv4 version by " +.align 2 +___ + +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/geo; + + print $_,"\n"; +} +close STDOUT or die "error closing STDOUT: $!"; # enforce flush diff --git a/ring-0.17.14/crypto/fipsmodule/aes/asm/ghash-x86.pl b/ring-0.17.14/crypto/fipsmodule/aes/asm/ghash-x86.pl new file mode 100644 index 0000000000..1cebb4b20e --- /dev/null +++ b/ring-0.17.14/crypto/fipsmodule/aes/asm/ghash-x86.pl @@ -0,0 +1,460 @@ +#! /usr/bin/env perl +# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. +# ==================================================================== +# +# March, May, June 2010 +# +# The module implements "4-bit" GCM GHASH function and underlying +# single multiplication operation in GF(2^128). "4-bit" means that it +# uses 256 bytes per-key table [+64/128 bytes fixed table]. It has two +# code paths: vanilla x86 and vanilla SSE. Former will be executed on +# 486 and Pentium, latter on all others. SSE GHASH features so called +# "528B" variant of "4-bit" method utilizing additional 256+16 bytes +# of per-key storage [+512 bytes shared table]. 
Performance results +# are for streamed GHASH subroutine and are expressed in cycles per +# processed byte, less is better: +# +# gcc 2.95.3(*) SSE assembler x86 assembler +# +# Pentium 105/111(**) - 50 +# PIII 68 /75 12.2 24 +# P4 125/125 17.8 84(***) +# Opteron 66 /70 10.1 30 +# Core2 54 /67 8.4 18 +# Atom 105/105 16.8 53 +# VIA Nano 69 /71 13.0 27 +# +# (*) gcc 3.4.x was observed to generate few percent slower code, +# which is one of reasons why 2.95.3 results were chosen, +# another reason is lack of 3.4.x results for older CPUs; +# comparison with SSE results is not completely fair, because C +# results are for vanilla "256B" implementation, while +# assembler results are for "528B";-) +# (**) second number is result for code compiled with -fPIC flag, +# which is actually more relevant, because assembler code is +# position-independent; +# (***) see comment in non-MMX routine for further details; +# +# To summarize, it's >2-5 times faster than gcc-generated code. To +# anchor it to something else SHA1 assembler processes one byte in +# ~7 cycles on contemporary x86 cores. As for choice of MMX/SSE +# in particular, see comment at the end of the file... + +# May 2010 +# +# Add PCLMULQDQ version performing at 2.10 cycles per processed byte. +# The question is how close is it to theoretical limit? The pclmulqdq +# instruction latency appears to be 14 cycles and there can't be more +# than 2 of them executing at any given time. This means that single +# Karatsuba multiplication would take 28 cycles *plus* few cycles for +# pre- and post-processing. Then multiplication has to be followed by +# modulo-reduction. Given that aggregated reduction method [see +# "Carry-less Multiplication and Its Usage for Computing the GCM Mode" +# white paper by Intel] allows you to perform reduction only once in +# a while we can assume that asymptotic performance can be estimated +# as (28+Tmod/Naggr)/16, where Tmod is time to perform reduction +# and Naggr is the aggregation factor. +# +# Before we proceed to this implementation let's have closer look at +# the best-performing code suggested by Intel in their white paper. +# By tracing inter-register dependencies Tmod is estimated as ~19 +# cycles and Naggr chosen by Intel is 4, resulting in 2.05 cycles per +# processed byte. As implied, this is quite optimistic estimate, +# because it does not account for Karatsuba pre- and post-processing, +# which for a single multiplication is ~5 cycles. Unfortunately Intel +# does not provide performance data for GHASH alone. But benchmarking +# AES_GCM_encrypt ripped out of Fig. 15 of the white paper with aadt +# alone resulted in 2.46 cycles per byte of out 16KB buffer. Note that +# the result accounts even for pre-computing of degrees of the hash +# key H, but its portion is negligible at 16KB buffer size. +# +# Moving on to the implementation in question. Tmod is estimated as +# ~13 cycles and Naggr is 2, giving asymptotic performance of ... +# 2.16. How is it possible that measured performance is better than +# optimistic theoretical estimate? There is one thing Intel failed +# to recognize. By serializing GHASH with CTR in same subroutine +# former's performance is really limited to above (Tmul + Tmod/Naggr) +# equation. But if GHASH procedure is detached, the modulo-reduction +# can be interleaved with Naggr-1 multiplications at instruction level +# and under ideal conditions even disappear from the equation. So that +# optimistic theoretical estimate for this implementation is ... +# 28/16=1.75, and not 2.16. 
Well, it's probably way too optimistic, +# at least for such small Naggr. I'd argue that (28+Tproc/Naggr), +# where Tproc is time required for Karatsuba pre- and post-processing, +# is more realistic estimate. In this case it gives ... 1.91 cycles. +# Or in other words, depending on how well we can interleave reduction +# and one of the two multiplications the performance should be between +# 1.91 and 2.16. As already mentioned, this implementation processes +# one byte out of 8KB buffer in 2.10 cycles, while x86_64 counterpart +# - in 2.02. x86_64 performance is better, because larger register +# bank allows to interleave reduction and multiplication better. +# +# Does it make sense to increase Naggr? To start with it's virtually +# impossible in 32-bit mode, because of limited register bank +# capacity. Otherwise improvement has to be weighed against slower +# setup, as well as code size and complexity increase. As even +# optimistic estimate doesn't promise 30% performance improvement, +# there are currently no plans to increase Naggr. +# +# Special thanks to David Woodhouse for providing access to a +# Westmere-based system on behalf of Intel Open Source Technology Centre. + +# January 2010 +# +# Tweaked to optimize transitions between integer and FP operations +# on same XMM register, PCLMULQDQ subroutine was measured to process +# one byte in 2.07 cycles on Sandy Bridge, and in 2.12 - on Westmere. +# The minor regression on Westmere is outweighed by ~15% improvement +# on Sandy Bridge. Strangely enough attempt to modify 64-bit code in +# similar manner resulted in almost 20% degradation on Sandy Bridge, +# where original 64-bit code processes one byte in 1.95 cycles. + +##################################################################### +# For reference, AMD Bulldozer processes one byte in 1.98 cycles in +# 32-bit mode and 1.89 in 64-bit. + +# February 2013 +# +# Overhaul: aggregate Karatsuba post-processing, improve ILP in +# reduction_alg9. Resulting performance is 1.96 cycles per byte on +# Westmere, 1.95 - on Sandy/Ivy Bridge, 1.76 - on Bulldozer. + +# This file was patched in BoringSSL to remove the variable-time 4-bit +# implementation. + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +push(@INC,"${dir}","${dir}../../../perlasm"); +require "x86asm.pl"; + +$output=pop; +open STDOUT,">$output"; + +&asm_init($ARGV[0]); + +$x86only=0; +$sse2=1; + +if (!$x86only) {{{ +if ($sse2) {{ +###################################################################### +# PCLMULQDQ version. + +$Xip="eax"; +$Htbl="edx"; +$const="ecx"; +$inp="esi"; +$len="ebx"; + +($Xi,$Xhi)=("xmm0","xmm1"); $Hkey="xmm2"; +($T1,$T2,$T3)=("xmm3","xmm4","xmm5"); +($Xn,$Xhn)=("xmm6","xmm7"); + +&static_label("bswap"); + +sub clmul64x64_T2 { # minimal "register" pressure +my ($Xhi,$Xi,$Hkey,$HK)=@_; + + &movdqa ($Xhi,$Xi); # + &pshufd ($T1,$Xi,0b01001110); + &pshufd ($T2,$Hkey,0b01001110) if (!defined($HK)); + &pxor ($T1,$Xi); # + &pxor ($T2,$Hkey) if (!defined($HK)); + $HK=$T2 if (!defined($HK)); + + &pclmulqdq ($Xi,$Hkey,0x00); ####### + &pclmulqdq ($Xhi,$Hkey,0x11); ####### + &pclmulqdq ($T1,$HK,0x00); ####### + &xorps ($T1,$Xi); # + &xorps ($T1,$Xhi); # + + &movdqa ($T2,$T1); # + &psrldq ($T1,8); + &pslldq ($T2,8); # + &pxor ($Xhi,$T1); + &pxor ($Xi,$T2); # +} + + +if (1) { # Algorithm 9 with <<1 twist. + # Reduction is shorter and uses only two + # temporary registers, which makes it better + # candidate for interleaving with 64x64 + # multiplication. 
Pre-modulo-scheduled loop + # was found to be ~20% faster than Algorithm 5 + # below. Algorithm 9 was therefore chosen for + # further optimization... + +sub reduction_alg9 { # 17/11 times faster than Intel version +my ($Xhi,$Xi) = @_; + + # 1st phase + &movdqa ($T2,$Xi); # + &movdqa ($T1,$Xi); + &psllq ($Xi,5); + &pxor ($T1,$Xi); # + &psllq ($Xi,1); + &pxor ($Xi,$T1); # + &psllq ($Xi,57); # + &movdqa ($T1,$Xi); # + &pslldq ($Xi,8); + &psrldq ($T1,8); # + &pxor ($Xi,$T2); + &pxor ($Xhi,$T1); # + + # 2nd phase + &movdqa ($T2,$Xi); + &psrlq ($Xi,1); + &pxor ($Xhi,$T2); # + &pxor ($T2,$Xi); + &psrlq ($Xi,5); + &pxor ($Xi,$T2); # + &psrlq ($Xi,1); # + &pxor ($Xi,$Xhi) # +} + +&function_begin_B("gcm_init_clmul"); + &mov ($Htbl,&wparam(0)); + &mov ($Xip,&wparam(1)); + + &call (&label("pic")); +&set_label("pic"); + &blindpop ($const); + &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const)); + + &movdqu ($Hkey,&QWP(0,$Xip)); + &pshufd ($Hkey,$Hkey,0b01001110);# dword swap + + # <<1 twist + &pshufd ($T2,$Hkey,0b11111111); # broadcast uppermost dword + &movdqa ($T1,$Hkey); + &psllq ($Hkey,1); + &pxor ($T3,$T3); # + &psrlq ($T1,63); + &pcmpgtd ($T3,$T2); # broadcast carry bit + &pslldq ($T1,8); + &por ($Hkey,$T1); # H<<=1 + + # magic reduction + &pand ($T3,&QWP(16,$const)); # 0x1c2_polynomial + &pxor ($Hkey,$T3); # if(carry) H^=0x1c2_polynomial + + # calculate H^2 + &movdqa ($Xi,$Hkey); + &clmul64x64_T2 ($Xhi,$Xi,$Hkey); + &reduction_alg9 ($Xhi,$Xi); + + &pshufd ($T1,$Hkey,0b01001110); + &pshufd ($T2,$Xi,0b01001110); + &pxor ($T1,$Hkey); # Karatsuba pre-processing + &movdqu (&QWP(0,$Htbl),$Hkey); # save H + &pxor ($T2,$Xi); # Karatsuba pre-processing + &movdqu (&QWP(16,$Htbl),$Xi); # save H^2 + &palignr ($T2,$T1,8); # low part is H.lo^H.hi + &movdqu (&QWP(32,$Htbl),$T2); # save Karatsuba "salt" + + &ret (); +&function_end_B("gcm_init_clmul"); + +&function_begin("gcm_ghash_clmul"); + &mov ($Xip,&wparam(0)); + &mov ($Htbl,&wparam(1)); + &mov ($inp,&wparam(2)); + &mov ($len,&wparam(3)); + + &call (&label("pic")); +&set_label("pic"); + &blindpop ($const); + &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const)); + + &movdqu ($Xi,&QWP(0,$Xip)); + &movdqa ($T3,&QWP(0,$const)); + &movdqu ($Hkey,&QWP(0,$Htbl)); + &pshufb ($Xi,$T3); + + &sub ($len,0x10); + &jz (&label("odd_tail")); + + ####### + # Xi+2 =[H*(Ii+1 + Xi+1)] mod P = + # [(H*Ii+1) + (H*Xi+1)] mod P = + # [(H*Ii+1) + H^2*(Ii+Xi)] mod P + # + &movdqu ($T1,&QWP(0,$inp)); # Ii + &movdqu ($Xn,&QWP(16,$inp)); # Ii+1 + &pshufb ($T1,$T3); + &pshufb ($Xn,$T3); + &movdqu ($T3,&QWP(32,$Htbl)); + &pxor ($Xi,$T1); # Ii+Xi + + &pshufd ($T1,$Xn,0b01001110); # H*Ii+1 + &movdqa ($Xhn,$Xn); + &pxor ($T1,$Xn); # + &lea ($inp,&DWP(32,$inp)); # i+=2 + + &pclmulqdq ($Xn,$Hkey,0x00); ####### + &pclmulqdq ($Xhn,$Hkey,0x11); ####### + &pclmulqdq ($T1,$T3,0x00); ####### + &movups ($Hkey,&QWP(16,$Htbl)); # load H^2 + &nop (); + + &sub ($len,0x20); + &jbe (&label("even_tail")); + &jmp (&label("mod_loop")); + +&set_label("mod_loop",32); + &pshufd ($T2,$Xi,0b01001110); # H^2*(Ii+Xi) + &movdqa ($Xhi,$Xi); + &pxor ($T2,$Xi); # + &nop (); + + &pclmulqdq ($Xi,$Hkey,0x00); ####### + &pclmulqdq ($Xhi,$Hkey,0x11); ####### + &pclmulqdq ($T2,$T3,0x10); ####### + &movups ($Hkey,&QWP(0,$Htbl)); # load H + + &xorps ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi) + &movdqa ($T3,&QWP(0,$const)); + &xorps ($Xhi,$Xhn); + &movdqu ($Xhn,&QWP(0,$inp)); # Ii + &pxor ($T1,$Xi); # aggregated Karatsuba post-processing + &movdqu ($Xn,&QWP(16,$inp)); # Ii+1 + &pxor ($T1,$Xhi); # + + &pshufb 
($Xhn,$T3); + &pxor ($T2,$T1); # + + &movdqa ($T1,$T2); # + &psrldq ($T2,8); + &pslldq ($T1,8); # + &pxor ($Xhi,$T2); + &pxor ($Xi,$T1); # + &pshufb ($Xn,$T3); + &pxor ($Xhi,$Xhn); # "Ii+Xi", consume early + + &movdqa ($Xhn,$Xn); #&clmul64x64_TX ($Xhn,$Xn,$Hkey); H*Ii+1 + &movdqa ($T2,$Xi); #&reduction_alg9($Xhi,$Xi); 1st phase + &movdqa ($T1,$Xi); + &psllq ($Xi,5); + &pxor ($T1,$Xi); # + &psllq ($Xi,1); + &pxor ($Xi,$T1); # + &pclmulqdq ($Xn,$Hkey,0x00); ####### + &movups ($T3,&QWP(32,$Htbl)); + &psllq ($Xi,57); # + &movdqa ($T1,$Xi); # + &pslldq ($Xi,8); + &psrldq ($T1,8); # + &pxor ($Xi,$T2); + &pxor ($Xhi,$T1); # + &pshufd ($T1,$Xhn,0b01001110); + &movdqa ($T2,$Xi); # 2nd phase + &psrlq ($Xi,1); + &pxor ($T1,$Xhn); + &pxor ($Xhi,$T2); # + &pclmulqdq ($Xhn,$Hkey,0x11); ####### + &movups ($Hkey,&QWP(16,$Htbl)); # load H^2 + &pxor ($T2,$Xi); + &psrlq ($Xi,5); + &pxor ($Xi,$T2); # + &psrlq ($Xi,1); # + &pxor ($Xi,$Xhi) # + &pclmulqdq ($T1,$T3,0x00); ####### + + &lea ($inp,&DWP(32,$inp)); + &sub ($len,0x20); + &ja (&label("mod_loop")); + +&set_label("even_tail"); + &pshufd ($T2,$Xi,0b01001110); # H^2*(Ii+Xi) + &movdqa ($Xhi,$Xi); + &pxor ($T2,$Xi); # + + &pclmulqdq ($Xi,$Hkey,0x00); ####### + &pclmulqdq ($Xhi,$Hkey,0x11); ####### + &pclmulqdq ($T2,$T3,0x10); ####### + &movdqa ($T3,&QWP(0,$const)); + + &xorps ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi) + &xorps ($Xhi,$Xhn); + &pxor ($T1,$Xi); # aggregated Karatsuba post-processing + &pxor ($T1,$Xhi); # + + &pxor ($T2,$T1); # + + &movdqa ($T1,$T2); # + &psrldq ($T2,8); + &pslldq ($T1,8); # + &pxor ($Xhi,$T2); + &pxor ($Xi,$T1); # + + &reduction_alg9 ($Xhi,$Xi); + + &test ($len,$len); + &jnz (&label("done")); + + &movups ($Hkey,&QWP(0,$Htbl)); # load H +&set_label("odd_tail"); + &movdqu ($T1,&QWP(0,$inp)); # Ii + &pshufb ($T1,$T3); + &pxor ($Xi,$T1); # Ii+Xi + + &clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H*(Ii+Xi) + &reduction_alg9 ($Xhi,$Xi); + +&set_label("done"); + &pshufb ($Xi,$T3); + &movdqu (&QWP(0,$Xip),$Xi); +&function_end("gcm_ghash_clmul"); + +} + +&set_label("bswap",64); + &data_byte(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); + &data_byte(1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2); # 0x1c2_polynomial +}} # $sse2 +}}} # !$x86only + +&asciz("GHASH for x86, CRYPTOGAMS by "); +&asm_finish(); + +close STDOUT or die "error closing STDOUT: $!"; + +# A question was risen about choice of vanilla MMX. Or rather why wasn't +# SSE2 chosen instead? In addition to the fact that MMX runs on legacy +# CPUs such as PIII, "4-bit" MMX version was observed to provide better +# performance than *corresponding* SSE2 one even on contemporary CPUs. +# SSE2 results were provided by Peter-Michael Hager. He maintains SSE2 +# implementation featuring full range of lookup-table sizes, but with +# per-invocation lookup table setup. Latter means that table size is +# chosen depending on how much data is to be hashed in every given call, +# more data - larger table. Best reported result for Core2 is ~4 cycles +# per processed byte out of 64KB block. This number accounts even for +# 64KB table setup overhead. As discussed in gcm128.c we choose to be +# more conservative in respect to lookup table sizes, but how do the +# results compare? Minimalistic "256B" MMX version delivers ~11 cycles +# on same platform. As also discussed in gcm128.c, next in line "8-bit +# Shoup's" or "4KB" method should deliver twice the performance of +# "256B" one, in other words not worse than ~6 cycles per byte. It +# should be also be noted that in SSE2 case improvement can be "super- +# linear," i.e. 
more than twice, mostly because >>8 maps to single +# instruction on SSE2 register. This is unlike "4-bit" case when >>4 +# maps to same amount of instructions in both MMX and SSE2 cases. +# Bottom line is that switch to SSE2 is considered to be justifiable +# only in case we choose to implement "8-bit" method... diff --git a/ring-0.17.14/crypto/fipsmodule/aes/asm/ghash-x86_64.pl b/ring-0.17.14/crypto/fipsmodule/aes/asm/ghash-x86_64.pl new file mode 100644 index 0000000000..1a228d75ab --- /dev/null +++ b/ring-0.17.14/crypto/fipsmodule/aes/asm/ghash-x86_64.pl @@ -0,0 +1,1266 @@ +#! /usr/bin/env perl +# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. +# ==================================================================== +# +# March, June 2010 +# +# The module implements "4-bit" GCM GHASH function and underlying +# single multiplication operation in GF(2^128). "4-bit" means that +# it uses 256 bytes per-key table [+128 bytes shared table]. GHASH +# function features so called "528B" variant utilizing additional +# 256+16 bytes of per-key storage [+512 bytes shared table]. +# Performance results are for this streamed GHASH subroutine and are +# expressed in cycles per processed byte, less is better: +# +# gcc 3.4.x(*) assembler +# +# P4 28.6 14.0 +100% +# Opteron 19.3 7.7 +150% +# Core2 17.8 8.1(**) +120% +# Atom 31.6 16.8 +88% +# VIA Nano 21.8 10.1 +115% +# +# (*) comparison is not completely fair, because C results are +# for vanilla "256B" implementation, while assembler results +# are for "528B";-) +# (**) it's mystery [to me] why Core2 result is not same as for +# Opteron; + +# May 2010 +# +# Add PCLMULQDQ version performing at 2.02 cycles per processed byte. +# See ghash-x86.pl for background information and details about coding +# techniques. +# +# Special thanks to David Woodhouse for providing access to a +# Westmere-based system on behalf of Intel Open Source Technology Centre. + +# December 2012 +# +# Overhaul: aggregate Karatsuba post-processing, improve ILP in +# reduction_alg9, increase reduction aggregate factor to 4x. As for +# the latter. ghash-x86.pl discusses that it makes lesser sense to +# increase aggregate factor. Then why increase here? Critical path +# consists of 3 independent pclmulqdq instructions, Karatsuba post- +# processing and reduction. "On top" of this we lay down aggregated +# multiplication operations, triplets of independent pclmulqdq's. As +# issue rate for pclmulqdq is limited, it makes lesser sense to +# aggregate more multiplications than it takes to perform remaining +# non-multiplication operations. 2x is near-optimal coefficient for +# contemporary Intel CPUs (therefore modest improvement coefficient), +# but not for Bulldozer. Latter is because logical SIMD operations +# are twice as slow in comparison to Intel, so that critical path is +# longer. 
A CPU with higher pclmulqdq issue rate would also benefit +# from higher aggregate factor... +# +# Westmere 1.78(+13%) +# Sandy Bridge 1.80(+8%) +# Ivy Bridge 1.80(+7%) +# Haswell 0.55(+93%) (if system doesn't support AVX) +# Broadwell 0.45(+110%)(if system doesn't support AVX) +# Skylake 0.44(+110%)(if system doesn't support AVX) +# Bulldozer 1.49(+27%) +# Silvermont 2.88(+13%) +# Knights L 2.12(-) (if system doesn't support AVX) +# Goldmont 1.08(+24%) + +# March 2013 +# +# ... 8x aggregate factor AVX code path is using reduction algorithm +# suggested by Shay Gueron[1]. Even though contemporary AVX-capable +# CPUs such as Sandy and Ivy Bridge can execute it, the code performs +# sub-optimally in comparison to above mentioned version. But thanks +# to Ilya Albrekht and Max Locktyukhin of Intel Corp. we knew that +# it performs in 0.41 cycles per byte on Haswell processor, in +# 0.29 on Broadwell, and in 0.36 on Skylake. +# +# Knights Landing achieves 1.09 cpb. +# +# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest + +# This file was patched in BoringSSL to remove the variable-time 4-bit +# implementation. + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +# See the notes about |$avx| in aesni-gcm-x86_64.pl; otherwise tags will be +# computed incorrectly. +# +# In upstream, this is controlled by shelling out to the compiler to check +# versions, but BoringSSL is intended to be used with pre-generated perlasm +# output, so this isn't useful anyway. +$avx = 1; + +open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; +*STDOUT=*OUT; + +$do4xaggr=1; + + +$code=<<___; +.text +___ + + +###################################################################### +# PCLMULQDQ version. + +@_4args=$win64? 
("%rcx","%rdx","%r8", "%r9") : # Win64 order + ("%rdi","%rsi","%rdx","%rcx"); # Unix order + +($Xi,$Xhi)=("%xmm0","%xmm1"); $Hkey="%xmm2"; +($T1,$T2,$T3)=("%xmm3","%xmm4","%xmm5"); + +sub clmul64x64_T2 { # minimal register pressure +my ($Xhi,$Xi,$Hkey,$HK)=@_; + +if (!defined($HK)) { $HK = $T2; +$code.=<<___; + movdqa $Xi,$Xhi # + pshufd \$0b01001110,$Xi,$T1 + pshufd \$0b01001110,$Hkey,$T2 + pxor $Xi,$T1 # + pxor $Hkey,$T2 +___ +} else { +$code.=<<___; + movdqa $Xi,$Xhi # + pshufd \$0b01001110,$Xi,$T1 + pxor $Xi,$T1 # +___ +} +$code.=<<___; + pclmulqdq \$0x00,$Hkey,$Xi ####### + pclmulqdq \$0x11,$Hkey,$Xhi ####### + pclmulqdq \$0x00,$HK,$T1 ####### + pxor $Xi,$T1 # + pxor $Xhi,$T1 # + + movdqa $T1,$T2 # + psrldq \$8,$T1 + pslldq \$8,$T2 # + pxor $T1,$Xhi + pxor $T2,$Xi # +___ +} + +sub reduction_alg9 { # 17/11 times faster than Intel version +my ($Xhi,$Xi) = @_; + +$code.=<<___; + # 1st phase + movdqa $Xi,$T2 # + movdqa $Xi,$T1 + psllq \$5,$Xi + pxor $Xi,$T1 # + psllq \$1,$Xi + pxor $T1,$Xi # + psllq \$57,$Xi # + movdqa $Xi,$T1 # + pslldq \$8,$Xi + psrldq \$8,$T1 # + pxor $T2,$Xi + pxor $T1,$Xhi # + + # 2nd phase + movdqa $Xi,$T2 + psrlq \$1,$Xi + pxor $T2,$Xhi # + pxor $Xi,$T2 + psrlq \$5,$Xi + pxor $T2,$Xi # + psrlq \$1,$Xi # + pxor $Xhi,$Xi # +___ +} + +{ my ($Htbl,$Xip)=@_4args; + my $HK="%xmm6"; + +$code.=<<___; +.globl gcm_init_clmul +.type gcm_init_clmul,\@abi-omnipotent +.align 16 +gcm_init_clmul: +.cfi_startproc +.seh_startproc + _CET_ENDBR +.L_init_clmul: +___ +$code.=<<___ if ($win64); + sub \$0x18,%rsp +.seh_stackalloc 0x18 + movaps %xmm6,(%rsp) +.seh_savexmm %xmm6, 0 +.seh_endprologue +___ +$code.=<<___; + movdqu ($Xip),$Hkey + pshufd \$0b01001110,$Hkey,$Hkey # dword swap + + # <<1 twist + pshufd \$0b11111111,$Hkey,$T2 # broadcast uppermost dword + movdqa $Hkey,$T1 + psllq \$1,$Hkey + pxor $T3,$T3 # + psrlq \$63,$T1 + pcmpgtd $T2,$T3 # broadcast carry bit + pslldq \$8,$T1 + por $T1,$Hkey # H<<=1 + + # magic reduction + pand .L0x1c2_polynomial(%rip),$T3 + pxor $T3,$Hkey # if(carry) H^=0x1c2_polynomial + + # calculate H^2 + pshufd \$0b01001110,$Hkey,$HK + movdqa $Hkey,$Xi + pxor $Hkey,$HK +___ + &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); + &reduction_alg9 ($Xhi,$Xi); +$code.=<<___; + pshufd \$0b01001110,$Hkey,$T1 + pshufd \$0b01001110,$Xi,$T2 + pxor $Hkey,$T1 # Karatsuba pre-processing + movdqu $Hkey,0x00($Htbl) # save H + pxor $Xi,$T2 # Karatsuba pre-processing + movdqu $Xi,0x10($Htbl) # save H^2 + palignr \$8,$T1,$T2 # low part is H.lo^H.hi... + movdqu $T2,0x20($Htbl) # save Karatsuba "salt" +___ +if ($do4xaggr) { + &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); # H^3 + &reduction_alg9 ($Xhi,$Xi); +$code.=<<___; + movdqa $Xi,$T3 +___ + &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); # H^4 + &reduction_alg9 ($Xhi,$Xi); +$code.=<<___; + pshufd \$0b01001110,$T3,$T1 + pshufd \$0b01001110,$Xi,$T2 + pxor $T3,$T1 # Karatsuba pre-processing + movdqu $T3,0x30($Htbl) # save H^3 + pxor $Xi,$T2 # Karatsuba pre-processing + movdqu $Xi,0x40($Htbl) # save H^4 + palignr \$8,$T1,$T2 # low part is H^3.lo^H^3.hi... 
+ movdqu $T2,0x50($Htbl) # save Karatsuba "salt" +___ +} +$code.=<<___ if ($win64); + movaps (%rsp),%xmm6 + lea 0x18(%rsp),%rsp +___ +$code.=<<___; + ret +.cfi_endproc +.seh_endproc +.size gcm_init_clmul,.-gcm_init_clmul +___ +} + + +{ my ($Xip,$Htbl,$inp,$len)=@_4args; + my ($Xln,$Xmn,$Xhn,$Hkey2,$HK) = map("%xmm$_",(3..7)); + my ($T1,$T2,$T3)=map("%xmm$_",(8..10)); + +$code.=<<___; +.globl gcm_ghash_clmul +.type gcm_ghash_clmul,\@abi-omnipotent +.align 32 +gcm_ghash_clmul: +.cfi_startproc +.seh_startproc + _CET_ENDBR +.L_ghash_clmul: +___ +$code.=<<___ if ($win64); + lea -0x88(%rsp),%rax + lea -0x20(%rax),%rsp +.seh_stackalloc 0x20+0x88 + movaps %xmm6,-0x20(%rax) +.seh_savexmm %xmm6, 0x20-0x20 + movaps %xmm7,-0x10(%rax) +.seh_savexmm %xmm7, 0x20-0x10 + movaps %xmm8,0(%rax) +.seh_savexmm %xmm8, 0x20+0 + movaps %xmm9,0x10(%rax) +.seh_savexmm %xmm9, 0x20+0x10 + movaps %xmm10,0x20(%rax) +.seh_savexmm %xmm10, 0x20+0x20 + movaps %xmm11,0x30(%rax) +.seh_savexmm %xmm11, 0x20+0x30 + movaps %xmm12,0x40(%rax) +.seh_savexmm %xmm12, 0x20+0x40 + movaps %xmm13,0x50(%rax) +.seh_savexmm %xmm13, 0x20+0x50 + movaps %xmm14,0x60(%rax) +.seh_savexmm %xmm14, 0x20+0x60 + movaps %xmm15,0x70(%rax) +.seh_savexmm %xmm15, 0x20+0x70 +.seh_endprologue +___ +$code.=<<___; + movdqa .Lbswap_mask(%rip),$T3 + + movdqu ($Xip),$Xi + movdqu ($Htbl),$Hkey + movdqu 0x20($Htbl),$HK + pshufb $T3,$Xi + + sub \$0x10,$len + jz .Lodd_tail + + movdqu 0x10($Htbl),$Hkey2 +___ +if ($do4xaggr) { +my ($Xl,$Xm,$Xh,$Hkey3,$Hkey4)=map("%xmm$_",(11..15)); + +$code.=<<___; + cmp \$0x30,$len + jb .Lskip4x + + sub \$0x30,$len + mov \$0xA040608020C0E000,%rax # ((7..0)·0xE0)&0xff + movdqu 0x30($Htbl),$Hkey3 + movdqu 0x40($Htbl),$Hkey4 + + ####### + # Xi+4 =[(H*Ii+3) + (H^2*Ii+2) + (H^3*Ii+1) + H^4*(Ii+Xi)] mod P + # + movdqu 0x30($inp),$Xln + movdqu 0x20($inp),$Xl + pshufb $T3,$Xln + pshufb $T3,$Xl + movdqa $Xln,$Xhn + pshufd \$0b01001110,$Xln,$Xmn + pxor $Xln,$Xmn + pclmulqdq \$0x00,$Hkey,$Xln + pclmulqdq \$0x11,$Hkey,$Xhn + pclmulqdq \$0x00,$HK,$Xmn + + movdqa $Xl,$Xh + pshufd \$0b01001110,$Xl,$Xm + pxor $Xl,$Xm + pclmulqdq \$0x00,$Hkey2,$Xl + pclmulqdq \$0x11,$Hkey2,$Xh + pclmulqdq \$0x10,$HK,$Xm + xorps $Xl,$Xln + xorps $Xh,$Xhn + movups 0x50($Htbl),$HK + xorps $Xm,$Xmn + + movdqu 0x10($inp),$Xl + movdqu 0($inp),$T1 + pshufb $T3,$Xl + pshufb $T3,$T1 + movdqa $Xl,$Xh + pshufd \$0b01001110,$Xl,$Xm + pxor $T1,$Xi + pxor $Xl,$Xm + pclmulqdq \$0x00,$Hkey3,$Xl + movdqa $Xi,$Xhi + pshufd \$0b01001110,$Xi,$T1 + pxor $Xi,$T1 + pclmulqdq \$0x11,$Hkey3,$Xh + pclmulqdq \$0x00,$HK,$Xm + xorps $Xl,$Xln + xorps $Xh,$Xhn + + lea 0x40($inp),$inp + sub \$0x40,$len + jc .Ltail4x + + jmp .Lmod4_loop +.align 32 +.Lmod4_loop: + pclmulqdq \$0x00,$Hkey4,$Xi + xorps $Xm,$Xmn + movdqu 0x30($inp),$Xl + pshufb $T3,$Xl + pclmulqdq \$0x11,$Hkey4,$Xhi + xorps $Xln,$Xi + movdqu 0x20($inp),$Xln + movdqa $Xl,$Xh + pclmulqdq \$0x10,$HK,$T1 + pshufd \$0b01001110,$Xl,$Xm + xorps $Xhn,$Xhi + pxor $Xl,$Xm + pshufb $T3,$Xln + movups 0x20($Htbl),$HK + xorps $Xmn,$T1 + pclmulqdq \$0x00,$Hkey,$Xl + pshufd \$0b01001110,$Xln,$Xmn + + pxor $Xi,$T1 # aggregated Karatsuba post-processing + movdqa $Xln,$Xhn + pxor $Xhi,$T1 # + pxor $Xln,$Xmn + movdqa $T1,$T2 # + pclmulqdq \$0x11,$Hkey,$Xh + pslldq \$8,$T1 + psrldq \$8,$T2 # + pxor $T1,$Xi + movdqa .L7_mask(%rip),$T1 + pxor $T2,$Xhi # + movq %rax,$T2 + + pand $Xi,$T1 # 1st phase + pshufb $T1,$T2 # + pxor $Xi,$T2 # + pclmulqdq \$0x00,$HK,$Xm + psllq \$57,$T2 # + movdqa $T2,$T1 # + pslldq \$8,$T2 + pclmulqdq \$0x00,$Hkey2,$Xln + psrldq \$8,$T1 # 
+ pxor $T2,$Xi + pxor $T1,$Xhi # + movdqu 0($inp),$T1 + + movdqa $Xi,$T2 # 2nd phase + psrlq \$1,$Xi + pclmulqdq \$0x11,$Hkey2,$Xhn + xorps $Xl,$Xln + movdqu 0x10($inp),$Xl + pshufb $T3,$Xl + pclmulqdq \$0x10,$HK,$Xmn + xorps $Xh,$Xhn + movups 0x50($Htbl),$HK + pshufb $T3,$T1 + pxor $T2,$Xhi # + pxor $Xi,$T2 + psrlq \$5,$Xi + + movdqa $Xl,$Xh + pxor $Xm,$Xmn + pshufd \$0b01001110,$Xl,$Xm + pxor $T2,$Xi # + pxor $T1,$Xhi + pxor $Xl,$Xm + pclmulqdq \$0x00,$Hkey3,$Xl + psrlq \$1,$Xi # + pxor $Xhi,$Xi # + movdqa $Xi,$Xhi + pclmulqdq \$0x11,$Hkey3,$Xh + xorps $Xl,$Xln + pshufd \$0b01001110,$Xi,$T1 + pxor $Xi,$T1 + + pclmulqdq \$0x00,$HK,$Xm + xorps $Xh,$Xhn + + lea 0x40($inp),$inp + sub \$0x40,$len + jnc .Lmod4_loop + +.Ltail4x: + pclmulqdq \$0x00,$Hkey4,$Xi + pclmulqdq \$0x11,$Hkey4,$Xhi + pclmulqdq \$0x10,$HK,$T1 + xorps $Xm,$Xmn + xorps $Xln,$Xi + xorps $Xhn,$Xhi + pxor $Xi,$Xhi # aggregated Karatsuba post-processing + pxor $Xmn,$T1 + + pxor $Xhi,$T1 # + pxor $Xi,$Xhi + + movdqa $T1,$T2 # + psrldq \$8,$T1 + pslldq \$8,$T2 # + pxor $T1,$Xhi + pxor $T2,$Xi # +___ + &reduction_alg9($Xhi,$Xi); +$code.=<<___; + add \$0x40,$len + jz .Ldone + movdqu 0x20($Htbl),$HK + sub \$0x10,$len + jz .Lodd_tail +.Lskip4x: +___ +} +$code.=<<___; + ####### + # Xi+2 =[H*(Ii+1 + Xi+1)] mod P = + # [(H*Ii+1) + (H*Xi+1)] mod P = + # [(H*Ii+1) + H^2*(Ii+Xi)] mod P + # + movdqu ($inp),$T1 # Ii + movdqu 16($inp),$Xln # Ii+1 + pshufb $T3,$T1 + pshufb $T3,$Xln + pxor $T1,$Xi # Ii+Xi + + movdqa $Xln,$Xhn + pshufd \$0b01001110,$Xln,$Xmn + pxor $Xln,$Xmn + pclmulqdq \$0x00,$Hkey,$Xln + pclmulqdq \$0x11,$Hkey,$Xhn + pclmulqdq \$0x00,$HK,$Xmn + + lea 32($inp),$inp # i+=2 + nop + sub \$0x20,$len + jbe .Leven_tail + nop + jmp .Lmod_loop + +.align 32 +.Lmod_loop: + movdqa $Xi,$Xhi + movdqa $Xmn,$T1 + pshufd \$0b01001110,$Xi,$Xmn # + pxor $Xi,$Xmn # + + pclmulqdq \$0x00,$Hkey2,$Xi + pclmulqdq \$0x11,$Hkey2,$Xhi + pclmulqdq \$0x10,$HK,$Xmn + + pxor $Xln,$Xi # (H*Ii+1) + H^2*(Ii+Xi) + pxor $Xhn,$Xhi + movdqu ($inp),$T2 # Ii + pxor $Xi,$T1 # aggregated Karatsuba post-processing + pshufb $T3,$T2 + movdqu 16($inp),$Xln # Ii+1 + + pxor $Xhi,$T1 + pxor $T2,$Xhi # "Ii+Xi", consume early + pxor $T1,$Xmn + pshufb $T3,$Xln + movdqa $Xmn,$T1 # + psrldq \$8,$T1 + pslldq \$8,$Xmn # + pxor $T1,$Xhi + pxor $Xmn,$Xi # + + movdqa $Xln,$Xhn # + + movdqa $Xi,$T2 # 1st phase + movdqa $Xi,$T1 + psllq \$5,$Xi + pxor $Xi,$T1 # + pclmulqdq \$0x00,$Hkey,$Xln ####### + psllq \$1,$Xi + pxor $T1,$Xi # + psllq \$57,$Xi # + movdqa $Xi,$T1 # + pslldq \$8,$Xi + psrldq \$8,$T1 # + pxor $T2,$Xi + pshufd \$0b01001110,$Xhn,$Xmn + pxor $T1,$Xhi # + pxor $Xhn,$Xmn # + + movdqa $Xi,$T2 # 2nd phase + psrlq \$1,$Xi + pclmulqdq \$0x11,$Hkey,$Xhn ####### + pxor $T2,$Xhi # + pxor $Xi,$T2 + psrlq \$5,$Xi + pxor $T2,$Xi # + lea 32($inp),$inp + psrlq \$1,$Xi # + pclmulqdq \$0x00,$HK,$Xmn ####### + pxor $Xhi,$Xi # + + sub \$0x20,$len + ja .Lmod_loop + +.Leven_tail: + movdqa $Xi,$Xhi + movdqa $Xmn,$T1 + pshufd \$0b01001110,$Xi,$Xmn # + pxor $Xi,$Xmn # + + pclmulqdq \$0x00,$Hkey2,$Xi + pclmulqdq \$0x11,$Hkey2,$Xhi + pclmulqdq \$0x10,$HK,$Xmn + + pxor $Xln,$Xi # (H*Ii+1) + H^2*(Ii+Xi) + pxor $Xhn,$Xhi + pxor $Xi,$T1 + pxor $Xhi,$T1 + pxor $T1,$Xmn + movdqa $Xmn,$T1 # + psrldq \$8,$T1 + pslldq \$8,$Xmn # + pxor $T1,$Xhi + pxor $Xmn,$Xi # +___ + &reduction_alg9 ($Xhi,$Xi); +$code.=<<___; + test $len,$len + jnz .Ldone + +.Lodd_tail: + movdqu ($inp),$T1 # Ii + pshufb $T3,$T1 + pxor $T1,$Xi # Ii+Xi +___ + &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); # H*(Ii+Xi) + &reduction_alg9 ($Xhi,$Xi); 
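+	# For reference, clmul64x64_T2 above is plain Karatsuba over GF(2)[x]:
+	# writing X = Xh*x^64 + Xl and H = Hh*x^64 + Hl,
+	#
+	#   X*H = Xh*Hh*x^128 + ((Xh+Xl)*(Hh+Hl) + Xh*Hh + Xl*Hl)*x^64 + Xl*Hl
+	#
+	# so three pclmulqdq suffice per multiplication, and reduction_alg9 folds
+	# the 256-bit product back modulo the GHASH polynomial
+	# x^128 + x^7 + x^2 + x + 1 (in the bit-reflected convention used here).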
+$code.=<<___; +.Ldone: + pshufb $T3,$Xi + movdqu $Xi,($Xip) +___ +$code.=<<___ if ($win64); + movaps (%rsp),%xmm6 + movaps 0x10(%rsp),%xmm7 + movaps 0x20(%rsp),%xmm8 + movaps 0x30(%rsp),%xmm9 + movaps 0x40(%rsp),%xmm10 + movaps 0x50(%rsp),%xmm11 + movaps 0x60(%rsp),%xmm12 + movaps 0x70(%rsp),%xmm13 + movaps 0x80(%rsp),%xmm14 + movaps 0x90(%rsp),%xmm15 + lea 0xa8(%rsp),%rsp +___ +$code.=<<___; + ret +.cfi_endproc +.seh_endproc +.size gcm_ghash_clmul,.-gcm_ghash_clmul +___ +} + +$code.=<<___; +.globl gcm_init_avx +.type gcm_init_avx,\@abi-omnipotent +.align 32 +gcm_init_avx: +.cfi_startproc +.seh_startproc + _CET_ENDBR +___ +if ($avx) { +my ($Htbl,$Xip)=@_4args; +my $HK="%xmm6"; + +$code.=<<___ if ($win64); + sub \$0x18,%rsp +.seh_stackalloc 0x18 + movaps %xmm6,(%rsp) +.seh_savexmm %xmm6, 0 +.seh_endprologue +___ +$code.=<<___; + vzeroupper + + vmovdqu ($Xip),$Hkey + vpshufd \$0b01001110,$Hkey,$Hkey # dword swap + + # <<1 twist + vpshufd \$0b11111111,$Hkey,$T2 # broadcast uppermost dword + vpsrlq \$63,$Hkey,$T1 + vpsllq \$1,$Hkey,$Hkey + vpxor $T3,$T3,$T3 # + vpcmpgtd $T2,$T3,$T3 # broadcast carry bit + vpslldq \$8,$T1,$T1 + vpor $T1,$Hkey,$Hkey # H<<=1 + + # magic reduction + vpand .L0x1c2_polynomial(%rip),$T3,$T3 + vpxor $T3,$Hkey,$Hkey # if(carry) H^=0x1c2_polynomial + + vpunpckhqdq $Hkey,$Hkey,$HK + vmovdqa $Hkey,$Xi + vpxor $Hkey,$HK,$HK + mov \$4,%r10 # up to H^8 + jmp .Linit_start_avx +___ + +sub clmul64x64_avx { +my ($Xhi,$Xi,$Hkey,$HK)=@_; + +if (!defined($HK)) { $HK = $T2; +$code.=<<___; + vpunpckhqdq $Xi,$Xi,$T1 + vpunpckhqdq $Hkey,$Hkey,$T2 + vpxor $Xi,$T1,$T1 # + vpxor $Hkey,$T2,$T2 +___ +} else { +$code.=<<___; + vpunpckhqdq $Xi,$Xi,$T1 + vpxor $Xi,$T1,$T1 # +___ +} +$code.=<<___; + vpclmulqdq \$0x11,$Hkey,$Xi,$Xhi ####### + vpclmulqdq \$0x00,$Hkey,$Xi,$Xi ####### + vpclmulqdq \$0x00,$HK,$T1,$T1 ####### + vpxor $Xi,$Xhi,$T2 # + vpxor $T2,$T1,$T1 # + + vpslldq \$8,$T1,$T2 # + vpsrldq \$8,$T1,$T1 + vpxor $T2,$Xi,$Xi # + vpxor $T1,$Xhi,$Xhi +___ +} + +sub reduction_avx { +my ($Xhi,$Xi) = @_; + +$code.=<<___; + vpsllq \$57,$Xi,$T1 # 1st phase + vpsllq \$62,$Xi,$T2 + vpxor $T1,$T2,$T2 # + vpsllq \$63,$Xi,$T1 + vpxor $T1,$T2,$T2 # + vpslldq \$8,$T2,$T1 # + vpsrldq \$8,$T2,$T2 + vpxor $T1,$Xi,$Xi # + vpxor $T2,$Xhi,$Xhi + + vpsrlq \$1,$Xi,$T2 # 2nd phase + vpxor $Xi,$Xhi,$Xhi + vpxor $T2,$Xi,$Xi # + vpsrlq \$5,$T2,$T2 + vpxor $T2,$Xi,$Xi # + vpsrlq \$1,$Xi,$Xi # + vpxor $Xhi,$Xi,$Xi # +___ +} + +$code.=<<___; +.align 32 +.Linit_loop_avx: + vpalignr \$8,$T1,$T2,$T3 # low part is H.lo^H.hi... 
+ vmovdqu $T3,-0x10($Htbl) # save Karatsuba "salt" +___ + &clmul64x64_avx ($Xhi,$Xi,$Hkey,$HK); # calculate H^3,5,7 + &reduction_avx ($Xhi,$Xi); +$code.=<<___; +.Linit_start_avx: + vmovdqa $Xi,$T3 +___ + &clmul64x64_avx ($Xhi,$Xi,$Hkey,$HK); # calculate H^2,4,6,8 + &reduction_avx ($Xhi,$Xi); +$code.=<<___; + vpshufd \$0b01001110,$T3,$T1 + vpshufd \$0b01001110,$Xi,$T2 + vpxor $T3,$T1,$T1 # Karatsuba pre-processing + vmovdqu $T3,0x00($Htbl) # save H^1,3,5,7 + vpxor $Xi,$T2,$T2 # Karatsuba pre-processing + vmovdqu $Xi,0x10($Htbl) # save H^2,4,6,8 + lea 0x30($Htbl),$Htbl + sub \$1,%r10 + jnz .Linit_loop_avx + + vpalignr \$8,$T2,$T1,$T3 # last "salt" is flipped + vmovdqu $T3,-0x10($Htbl) + + vzeroupper +___ +$code.=<<___ if ($win64); + movaps (%rsp),%xmm6 + lea 0x18(%rsp),%rsp +___ +$code.=<<___; + ret +.seh_endproc +.cfi_endproc +.size gcm_init_avx,.-gcm_init_avx +___ +} else { +$code.=<<___; + jmp .L_init_clmul +.size gcm_init_avx,.-gcm_init_avx +___ +} + +$code.=<<___; +.globl gcm_ghash_avx +.type gcm_ghash_avx,\@abi-omnipotent +.align 32 +gcm_ghash_avx: +.cfi_startproc +.seh_startproc + _CET_ENDBR +___ +if ($avx) { +my ($Xip,$Htbl,$inp,$len)=@_4args; +my ($Xlo,$Xhi,$Xmi, + $Zlo,$Zhi,$Zmi, + $Hkey,$HK,$T1,$T2, + $Xi,$Xo,$Tred,$bswap,$Ii,$Ij) = map("%xmm$_",(0..15)); + +$code.=<<___ if ($win64); + lea -0x88(%rsp),%rax + lea -0x20(%rax),%rsp +.seh_stackalloc 0x20+0x88 + movaps %xmm6,-0x20(%rax) +.seh_savexmm %xmm6, 0x20-0x20 + movaps %xmm7,-0x10(%rax) +.seh_savexmm %xmm7, 0x20-0x10 + movaps %xmm8,0(%rax) +.seh_savexmm %xmm8, 0x20+0 + movaps %xmm9,0x10(%rax) +.seh_savexmm %xmm9, 0x20+0x10 + movaps %xmm10,0x20(%rax) +.seh_savexmm %xmm10, 0x20+0x20 + movaps %xmm11,0x30(%rax) +.seh_savexmm %xmm11, 0x20+0x30 + movaps %xmm12,0x40(%rax) +.seh_savexmm %xmm12, 0x20+0x40 + movaps %xmm13,0x50(%rax) +.seh_savexmm %xmm13, 0x20+0x50 + movaps %xmm14,0x60(%rax) +.seh_savexmm %xmm14, 0x20+0x60 + movaps %xmm15,0x70(%rax) +.seh_savexmm %xmm15, 0x20+0x70 +.seh_endprologue +___ +$code.=<<___; + vzeroupper + + vmovdqu ($Xip),$Xi # load $Xi + lea .L0x1c2_polynomial(%rip),%r10 + lea 0x40($Htbl),$Htbl # size optimization + vmovdqu .Lbswap_mask(%rip),$bswap + vpshufb $bswap,$Xi,$Xi + cmp \$0x80,$len + jb .Lshort_avx + sub \$0x80,$len + + vmovdqu 0x70($inp),$Ii # I[7] + vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1 + vpshufb $bswap,$Ii,$Ii + vmovdqu 0x20-0x40($Htbl),$HK + + vpunpckhqdq $Ii,$Ii,$T2 + vmovdqu 0x60($inp),$Ij # I[6] + vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo + vpxor $Ii,$T2,$T2 + vpshufb $bswap,$Ij,$Ij + vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi + vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2 + vpunpckhqdq $Ij,$Ij,$T1 + vmovdqu 0x50($inp),$Ii # I[5] + vpclmulqdq \$0x00,$HK,$T2,$Xmi + vpxor $Ij,$T1,$T1 + + vpshufb $bswap,$Ii,$Ii + vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo + vpunpckhqdq $Ii,$Ii,$T2 + vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi + vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3 + vpxor $Ii,$T2,$T2 + vmovdqu 0x40($inp),$Ij # I[4] + vpclmulqdq \$0x10,$HK,$T1,$Zmi + vmovdqu 0x50-0x40($Htbl),$HK + + vpshufb $bswap,$Ij,$Ij + vpxor $Xlo,$Zlo,$Zlo + vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo + vpxor $Xhi,$Zhi,$Zhi + vpunpckhqdq $Ij,$Ij,$T1 + vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi + vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4 + vpxor $Xmi,$Zmi,$Zmi + vpclmulqdq \$0x00,$HK,$T2,$Xmi + vpxor $Ij,$T1,$T1 + + vmovdqu 0x30($inp),$Ii # I[3] + vpxor $Zlo,$Xlo,$Xlo + vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo + vpxor $Zhi,$Xhi,$Xhi + vpshufb $bswap,$Ii,$Ii + vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi + vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5 + vpxor $Zmi,$Xmi,$Xmi + vpunpckhqdq $Ii,$Ii,$T2 + 
vpclmulqdq \$0x10,$HK,$T1,$Zmi + vmovdqu 0x80-0x40($Htbl),$HK + vpxor $Ii,$T2,$T2 + + vmovdqu 0x20($inp),$Ij # I[2] + vpxor $Xlo,$Zlo,$Zlo + vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo + vpxor $Xhi,$Zhi,$Zhi + vpshufb $bswap,$Ij,$Ij + vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi + vmovdqu 0x70-0x40($Htbl),$Hkey # $Hkey^6 + vpxor $Xmi,$Zmi,$Zmi + vpunpckhqdq $Ij,$Ij,$T1 + vpclmulqdq \$0x00,$HK,$T2,$Xmi + vpxor $Ij,$T1,$T1 + + vmovdqu 0x10($inp),$Ii # I[1] + vpxor $Zlo,$Xlo,$Xlo + vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo + vpxor $Zhi,$Xhi,$Xhi + vpshufb $bswap,$Ii,$Ii + vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi + vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7 + vpxor $Zmi,$Xmi,$Xmi + vpunpckhqdq $Ii,$Ii,$T2 + vpclmulqdq \$0x10,$HK,$T1,$Zmi + vmovdqu 0xb0-0x40($Htbl),$HK + vpxor $Ii,$T2,$T2 + + vmovdqu ($inp),$Ij # I[0] + vpxor $Xlo,$Zlo,$Zlo + vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo + vpxor $Xhi,$Zhi,$Zhi + vpshufb $bswap,$Ij,$Ij + vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi + vmovdqu 0xa0-0x40($Htbl),$Hkey # $Hkey^8 + vpxor $Xmi,$Zmi,$Zmi + vpclmulqdq \$0x10,$HK,$T2,$Xmi + + lea 0x80($inp),$inp + cmp \$0x80,$len + jb .Ltail_avx + + vpxor $Xi,$Ij,$Ij # accumulate $Xi + sub \$0x80,$len + jmp .Loop8x_avx + +.align 32 +.Loop8x_avx: + vpunpckhqdq $Ij,$Ij,$T1 + vmovdqu 0x70($inp),$Ii # I[7] + vpxor $Xlo,$Zlo,$Zlo + vpxor $Ij,$T1,$T1 + vpclmulqdq \$0x00,$Hkey,$Ij,$Xi + vpshufb $bswap,$Ii,$Ii + vpxor $Xhi,$Zhi,$Zhi + vpclmulqdq \$0x11,$Hkey,$Ij,$Xo + vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1 + vpunpckhqdq $Ii,$Ii,$T2 + vpxor $Xmi,$Zmi,$Zmi + vpclmulqdq \$0x00,$HK,$T1,$Tred + vmovdqu 0x20-0x40($Htbl),$HK + vpxor $Ii,$T2,$T2 + + vmovdqu 0x60($inp),$Ij # I[6] + vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo + vpxor $Zlo,$Xi,$Xi # collect result + vpshufb $bswap,$Ij,$Ij + vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi + vxorps $Zhi,$Xo,$Xo + vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2 + vpunpckhqdq $Ij,$Ij,$T1 + vpclmulqdq \$0x00,$HK, $T2,$Xmi + vpxor $Zmi,$Tred,$Tred + vxorps $Ij,$T1,$T1 + + vmovdqu 0x50($inp),$Ii # I[5] + vpxor $Xi,$Tred,$Tred # aggregated Karatsuba post-processing + vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo + vpxor $Xo,$Tred,$Tred + vpslldq \$8,$Tred,$T2 + vpxor $Xlo,$Zlo,$Zlo + vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi + vpsrldq \$8,$Tred,$Tred + vpxor $T2, $Xi, $Xi + vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3 + vpshufb $bswap,$Ii,$Ii + vxorps $Tred,$Xo, $Xo + vpxor $Xhi,$Zhi,$Zhi + vpunpckhqdq $Ii,$Ii,$T2 + vpclmulqdq \$0x10,$HK, $T1,$Zmi + vmovdqu 0x50-0x40($Htbl),$HK + vpxor $Ii,$T2,$T2 + vpxor $Xmi,$Zmi,$Zmi + + vmovdqu 0x40($inp),$Ij # I[4] + vpalignr \$8,$Xi,$Xi,$Tred # 1st phase + vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo + vpshufb $bswap,$Ij,$Ij + vpxor $Zlo,$Xlo,$Xlo + vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi + vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4 + vpunpckhqdq $Ij,$Ij,$T1 + vpxor $Zhi,$Xhi,$Xhi + vpclmulqdq \$0x00,$HK, $T2,$Xmi + vxorps $Ij,$T1,$T1 + vpxor $Zmi,$Xmi,$Xmi + + vmovdqu 0x30($inp),$Ii # I[3] + vpclmulqdq \$0x10,(%r10),$Xi,$Xi + vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo + vpshufb $bswap,$Ii,$Ii + vpxor $Xlo,$Zlo,$Zlo + vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi + vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5 + vpunpckhqdq $Ii,$Ii,$T2 + vpxor $Xhi,$Zhi,$Zhi + vpclmulqdq \$0x10,$HK, $T1,$Zmi + vmovdqu 0x80-0x40($Htbl),$HK + vpxor $Ii,$T2,$T2 + vpxor $Xmi,$Zmi,$Zmi + + vmovdqu 0x20($inp),$Ij # I[2] + vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo + vpshufb $bswap,$Ij,$Ij + vpxor $Zlo,$Xlo,$Xlo + vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi + vmovdqu 0x70-0x40($Htbl),$Hkey # $Hkey^6 + vpunpckhqdq $Ij,$Ij,$T1 + vpxor $Zhi,$Xhi,$Xhi + vpclmulqdq \$0x00,$HK, $T2,$Xmi + vpxor $Ij,$T1,$T1 + vpxor $Zmi,$Xmi,$Xmi + vxorps $Tred,$Xi,$Xi + + 
vmovdqu 0x10($inp),$Ii # I[1] + vpalignr \$8,$Xi,$Xi,$Tred # 2nd phase + vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo + vpshufb $bswap,$Ii,$Ii + vpxor $Xlo,$Zlo,$Zlo + vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi + vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7 + vpclmulqdq \$0x10,(%r10),$Xi,$Xi + vxorps $Xo,$Tred,$Tred + vpunpckhqdq $Ii,$Ii,$T2 + vpxor $Xhi,$Zhi,$Zhi + vpclmulqdq \$0x10,$HK, $T1,$Zmi + vmovdqu 0xb0-0x40($Htbl),$HK + vpxor $Ii,$T2,$T2 + vpxor $Xmi,$Zmi,$Zmi + + vmovdqu ($inp),$Ij # I[0] + vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo + vpshufb $bswap,$Ij,$Ij + vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi + vmovdqu 0xa0-0x40($Htbl),$Hkey # $Hkey^8 + vpxor $Tred,$Ij,$Ij + vpclmulqdq \$0x10,$HK, $T2,$Xmi + vpxor $Xi,$Ij,$Ij # accumulate $Xi + + lea 0x80($inp),$inp + sub \$0x80,$len + jnc .Loop8x_avx + + add \$0x80,$len + jmp .Ltail_no_xor_avx + +.align 32 +.Lshort_avx: + vmovdqu -0x10($inp,$len),$Ii # very last word + lea ($inp,$len),$inp + vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1 + vmovdqu 0x20-0x40($Htbl),$HK + vpshufb $bswap,$Ii,$Ij + + vmovdqa $Xlo,$Zlo # subtle way to zero $Zlo, + vmovdqa $Xhi,$Zhi # $Zhi and + vmovdqa $Xmi,$Zmi # $Zmi + sub \$0x10,$len + jz .Ltail_avx + + vpunpckhqdq $Ij,$Ij,$T1 + vpxor $Xlo,$Zlo,$Zlo + vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo + vpxor $Ij,$T1,$T1 + vmovdqu -0x20($inp),$Ii + vpxor $Xhi,$Zhi,$Zhi + vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi + vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2 + vpshufb $bswap,$Ii,$Ij + vpxor $Xmi,$Zmi,$Zmi + vpclmulqdq \$0x00,$HK,$T1,$Xmi + vpsrldq \$8,$HK,$HK + sub \$0x10,$len + jz .Ltail_avx + + vpunpckhqdq $Ij,$Ij,$T1 + vpxor $Xlo,$Zlo,$Zlo + vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo + vpxor $Ij,$T1,$T1 + vmovdqu -0x30($inp),$Ii + vpxor $Xhi,$Zhi,$Zhi + vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi + vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3 + vpshufb $bswap,$Ii,$Ij + vpxor $Xmi,$Zmi,$Zmi + vpclmulqdq \$0x00,$HK,$T1,$Xmi + vmovdqu 0x50-0x40($Htbl),$HK + sub \$0x10,$len + jz .Ltail_avx + + vpunpckhqdq $Ij,$Ij,$T1 + vpxor $Xlo,$Zlo,$Zlo + vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo + vpxor $Ij,$T1,$T1 + vmovdqu -0x40($inp),$Ii + vpxor $Xhi,$Zhi,$Zhi + vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi + vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4 + vpshufb $bswap,$Ii,$Ij + vpxor $Xmi,$Zmi,$Zmi + vpclmulqdq \$0x00,$HK,$T1,$Xmi + vpsrldq \$8,$HK,$HK + sub \$0x10,$len + jz .Ltail_avx + + vpunpckhqdq $Ij,$Ij,$T1 + vpxor $Xlo,$Zlo,$Zlo + vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo + vpxor $Ij,$T1,$T1 + vmovdqu -0x50($inp),$Ii + vpxor $Xhi,$Zhi,$Zhi + vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi + vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5 + vpshufb $bswap,$Ii,$Ij + vpxor $Xmi,$Zmi,$Zmi + vpclmulqdq \$0x00,$HK,$T1,$Xmi + vmovdqu 0x80-0x40($Htbl),$HK + sub \$0x10,$len + jz .Ltail_avx + + vpunpckhqdq $Ij,$Ij,$T1 + vpxor $Xlo,$Zlo,$Zlo + vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo + vpxor $Ij,$T1,$T1 + vmovdqu -0x60($inp),$Ii + vpxor $Xhi,$Zhi,$Zhi + vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi + vmovdqu 0x70-0x40($Htbl),$Hkey # $Hkey^6 + vpshufb $bswap,$Ii,$Ij + vpxor $Xmi,$Zmi,$Zmi + vpclmulqdq \$0x00,$HK,$T1,$Xmi + vpsrldq \$8,$HK,$HK + sub \$0x10,$len + jz .Ltail_avx + + vpunpckhqdq $Ij,$Ij,$T1 + vpxor $Xlo,$Zlo,$Zlo + vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo + vpxor $Ij,$T1,$T1 + vmovdqu -0x70($inp),$Ii + vpxor $Xhi,$Zhi,$Zhi + vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi + vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7 + vpshufb $bswap,$Ii,$Ij + vpxor $Xmi,$Zmi,$Zmi + vpclmulqdq \$0x00,$HK,$T1,$Xmi + vmovq 0xb8-0x40($Htbl),$HK + sub \$0x10,$len + jmp .Ltail_avx + +.align 32 +.Ltail_avx: + vpxor $Xi,$Ij,$Ij # accumulate $Xi +.Ltail_no_xor_avx: + vpunpckhqdq $Ij,$Ij,$T1 + vpxor $Xlo,$Zlo,$Zlo + vpclmulqdq 
\$0x00,$Hkey,$Ij,$Xlo + vpxor $Ij,$T1,$T1 + vpxor $Xhi,$Zhi,$Zhi + vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi + vpxor $Xmi,$Zmi,$Zmi + vpclmulqdq \$0x00,$HK,$T1,$Xmi + + vmovdqu (%r10),$Tred + + vpxor $Xlo,$Zlo,$Xi + vpxor $Xhi,$Zhi,$Xo + vpxor $Xmi,$Zmi,$Zmi + + vpxor $Xi, $Zmi,$Zmi # aggregated Karatsuba post-processing + vpxor $Xo, $Zmi,$Zmi + vpslldq \$8, $Zmi,$T2 + vpsrldq \$8, $Zmi,$Zmi + vpxor $T2, $Xi, $Xi + vpxor $Zmi,$Xo, $Xo + + vpclmulqdq \$0x10,$Tred,$Xi,$T2 # 1st phase + vpalignr \$8,$Xi,$Xi,$Xi + vpxor $T2,$Xi,$Xi + + vpclmulqdq \$0x10,$Tred,$Xi,$T2 # 2nd phase + vpalignr \$8,$Xi,$Xi,$Xi + vpxor $Xo,$Xi,$Xi + vpxor $T2,$Xi,$Xi + + cmp \$0,$len + jne .Lshort_avx + + vpshufb $bswap,$Xi,$Xi + vmovdqu $Xi,($Xip) + vzeroupper +___ +$code.=<<___ if ($win64); + movaps (%rsp),%xmm6 + movaps 0x10(%rsp),%xmm7 + movaps 0x20(%rsp),%xmm8 + movaps 0x30(%rsp),%xmm9 + movaps 0x40(%rsp),%xmm10 + movaps 0x50(%rsp),%xmm11 + movaps 0x60(%rsp),%xmm12 + movaps 0x70(%rsp),%xmm13 + movaps 0x80(%rsp),%xmm14 + movaps 0x90(%rsp),%xmm15 + lea 0xa8(%rsp),%rsp +___ +$code.=<<___; + ret +.cfi_endproc +.seh_endproc +.size gcm_ghash_avx,.-gcm_ghash_avx +___ +} else { +$code.=<<___; + jmp .L_ghash_clmul +.size gcm_ghash_avx,.-gcm_ghash_avx +___ +} + +$code.=<<___; +.section .rodata +.align 64 +.Lbswap_mask: + .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 +.L0x1c2_polynomial: + .byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 +.L7_mask: + .long 7,0,7,0 +.align 64 + +.asciz "GHASH for x86_64, CRYPTOGAMS by " +.align 64 +.text +___ + +$code =~ s/\`([^\`]*)\`/eval($1)/gem; + +print $code; + +close STDOUT or die "error closing STDOUT: $!"; diff --git a/ring-0.17.14/crypto/fipsmodule/aes/asm/ghashv8-armx.pl b/ring-0.17.14/crypto/fipsmodule/aes/asm/ghashv8-armx.pl new file mode 100644 index 0000000000..1f1df1cf84 --- /dev/null +++ b/ring-0.17.14/crypto/fipsmodule/aes/asm/ghashv8-armx.pl @@ -0,0 +1,334 @@ +#! /usr/bin/env perl +# Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. +# ==================================================================== +# +# GHASH for ARMv8 Crypto Extension, 64-bit polynomial multiplication. +# +# June 2014 +# +# Initial version was developed in tight cooperation with Ard +# Biesheuvel of Linaro from bits-n-pieces from other assembly modules. +# Just like aesv8-armx.pl this module supports both AArch32 and +# AArch64 execution modes. +# +# July 2014 +# +# Implement 2x aggregated reduction [see ghash-x86.pl for background +# information]. +# +# November 2017 +# +# AArch64 register bank to "accommodate" 4x aggregated reduction and +# improve performance by 20-70% depending on processor. 
+# +# Current performance in cycles per processed byte: +# +# 64-bit PMULL 32-bit PMULL 32-bit NEON(*) +# Apple A7 0.58 0.92 5.62 +# Cortex-A53 0.85 1.01 8.39 +# Cortex-A57 0.73 1.17 7.61 +# Denver 0.51 0.65 6.02 +# Mongoose 0.65 1.10 8.06 +# Kryo 0.76 1.16 8.00 +# +# (*) presented for reference/comparison purposes; + +$flavour = shift; +$output = shift; + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or +die "can't locate arm-xlate.pl"; + +open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; +*STDOUT=*OUT; + +$Xi="x0"; # argument block +$Htbl="x1"; +$inp="x2"; +$len="x3"; + +$inc="x12"; + +{ +my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3)); +my ($t0,$t1,$t2,$xC2,$H,$Hhl,$H2)=map("q$_",(8..14)); + +$code=<<___; +#if __ARM_MAX_ARCH__>=7 +.text +___ +$code.=".arch armv8-a+crypto\n" if ($flavour =~ /64/); +$code.=<<___ if ($flavour !~ /64/); +.fpu neon +.code 32 +#undef __thumb2__ +___ + +################################################################################ +# void gcm_init_clmul(u128 Htable[16],const u64 H[2]); +# +# input: 128-bit H - secret parameter E(K,0^128) +# output: precomputed table filled with degrees of twisted H; +# H is twisted to handle reverse bitness of GHASH; +# only few of 16 slots of Htable[16] are used; +# data is opaque to outside world (which allows to +# optimize the code independently); +# +$code.=<<___; +.global gcm_init_clmul +.type gcm_init_clmul,%function +.align 4 +gcm_init_clmul: + AARCH64_VALID_CALL_TARGET + vld1.64 {$t1},[x1] @ load input H + vmov.i8 $xC2,#0xe1 + vshl.i64 $xC2,$xC2,#57 @ 0xc2.0 + vext.8 $IN,$t1,$t1,#8 + vshr.u64 $t2,$xC2,#63 + vdup.32 $t1,${t1}[1] + vext.8 $t0,$t2,$xC2,#8 @ t0=0xc2....01 + vshr.u64 $t2,$IN,#63 + vshr.s32 $t1,$t1,#31 @ broadcast carry bit + vand $t2,$t2,$t0 + vshl.i64 $IN,$IN,#1 + vext.8 $t2,$t2,$t2,#8 + vand $t0,$t0,$t1 + vorr $IN,$IN,$t2 @ H<<<=1 + veor $H,$IN,$t0 @ twisted H + vst1.64 {$H},[x0],#16 @ store Htable[0] + + @ calculate H^2 + vext.8 $t0,$H,$H,#8 @ Karatsuba pre-processing + vpmull.p64 $Xl,$H,$H + veor $t0,$t0,$H + vpmull2.p64 $Xh,$H,$H + vpmull.p64 $Xm,$t0,$t0 + + vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing + veor $t2,$Xl,$Xh + veor $Xm,$Xm,$t1 + veor $Xm,$Xm,$t2 + vpmull.p64 $t2,$Xl,$xC2 @ 1st phase + + vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result + vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl + veor $Xl,$Xm,$t2 + + vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase + vpmull.p64 $Xl,$Xl,$xC2 + veor $t2,$t2,$Xh + veor $H2,$Xl,$t2 + + vext.8 $t1,$H2,$H2,#8 @ Karatsuba pre-processing + veor $t1,$t1,$H2 + vext.8 $Hhl,$t0,$t1,#8 @ pack Karatsuba pre-processed + vst1.64 {$Hhl-$H2},[x0],#32 @ store Htable[1..2] +___ +if ($flavour =~ /64/) { +my ($t3,$Yl,$Ym,$Yh) = map("q$_",(4..7)); + +$code.=<<___; + @ calculate H^3 and H^4 + vpmull.p64 $Xl,$H, $H2 + vpmull.p64 $Yl,$H2,$H2 + vpmull2.p64 $Xh,$H, $H2 + vpmull2.p64 $Yh,$H2,$H2 + vpmull.p64 $Xm,$t0,$t1 + vpmull.p64 $Ym,$t1,$t1 + + vext.8 $t0,$Xl,$Xh,#8 @ Karatsuba post-processing + vext.8 $t1,$Yl,$Yh,#8 + veor $t2,$Xl,$Xh + veor $Xm,$Xm,$t0 + veor $t3,$Yl,$Yh + veor $Ym,$Ym,$t1 + veor $Xm,$Xm,$t2 + vpmull.p64 $t2,$Xl,$xC2 @ 1st phase + veor $Ym,$Ym,$t3 + vpmull.p64 $t3,$Yl,$xC2 + + vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result + vmov $Yh#lo,$Ym#hi + vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl + vmov $Ym#hi,$Yl#lo + veor $Xl,$Xm,$t2 + veor $Yl,$Ym,$t3 + + vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase + vext.8 $t3,$Yl,$Yl,#8 + vpmull.p64 $Xl,$Xl,$xC2 + vpmull.p64 $Yl,$Yl,$xC2 + veor $t2,$t2,$Xh + veor 
$t3,$t3,$Yh + veor $H, $Xl,$t2 @ H^3 + veor $H2,$Yl,$t3 @ H^4 + + vext.8 $t0,$H, $H,#8 @ Karatsuba pre-processing + vext.8 $t1,$H2,$H2,#8 + veor $t0,$t0,$H + veor $t1,$t1,$H2 + vext.8 $Hhl,$t0,$t1,#8 @ pack Karatsuba pre-processed + vst1.64 {$H-$H2},[x0] @ store Htable[3..5] +___ +} +$code.=<<___; + ret +.size gcm_init_clmul,.-gcm_init_clmul +___ +################################################################################ +# void gcm_gmult_clmul(u64 Xi[2],const u128 Htable[16]); +# +# input: Xi - current hash value; +# Htable - table precomputed in gcm_init_clmul; +# output: Xi - next hash value Xi; +# +$code.=<<___; +.global gcm_gmult_clmul +.type gcm_gmult_clmul,%function +.align 4 +gcm_gmult_clmul: + AARCH64_VALID_CALL_TARGET + vld1.64 {$t1},[$Xi] @ load Xi + vmov.i8 $xC2,#0xe1 + vld1.64 {$H-$Hhl},[$Htbl] @ load twisted H, ... + vshl.u64 $xC2,$xC2,#57 +#ifndef __ARMEB__ + vrev64.8 $t1,$t1 +#endif + vext.8 $IN,$t1,$t1,#8 + + vpmull.p64 $Xl,$H,$IN @ H.lo·Xi.lo + veor $t1,$t1,$IN @ Karatsuba pre-processing + vpmull2.p64 $Xh,$H,$IN @ H.hi·Xi.hi + vpmull.p64 $Xm,$Hhl,$t1 @ (H.lo+H.hi)·(Xi.lo+Xi.hi) + + vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing + veor $t2,$Xl,$Xh + veor $Xm,$Xm,$t1 + veor $Xm,$Xm,$t2 + vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction + + vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result + vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl + veor $Xl,$Xm,$t2 + + vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction + vpmull.p64 $Xl,$Xl,$xC2 + veor $t2,$t2,$Xh + veor $Xl,$Xl,$t2 + +#ifndef __ARMEB__ + vrev64.8 $Xl,$Xl +#endif + vext.8 $Xl,$Xl,$Xl,#8 + vst1.64 {$Xl},[$Xi] @ write out Xi + + ret +.size gcm_gmult_clmul,.-gcm_gmult_clmul +___ +} + +$code.=<<___; +.asciz "GHASH for ARMv8, CRYPTOGAMS by " +.align 2 +#endif +___ + +if ($flavour =~ /64/) { ######## 64-bit code + sub unvmov { + my $arg=shift; + + $arg =~ m/q([0-9]+)#(lo|hi),\s*q([0-9]+)#(lo|hi)/o && + sprintf "ins v%d.d[%d],v%d.d[%d]",$1<8?$1:$1+8,($2 eq "lo")?0:1, + $3<8?$3:$3+8,($4 eq "lo")?0:1; + } + foreach(split("\n",$code)) { + s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o or + s/vmov\.i8/movi/o or # fix up legacy mnemonics + s/vmov\s+(.*)/unvmov($1)/geo or + s/vext\.8/ext/o or + s/vshr\.s/sshr\.s/o or + s/vshr/ushr/o or + s/^(\s+)v/$1/o or # strip off v prefix + s/\bbx\s+lr\b/ret/o; + + s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo; # old->new registers + s/@\s/\/\//o; # old->new style commentary + + # fix up remaining legacy suffixes + s/\.[ui]?8(\s)/$1/o; + s/\.[uis]?32//o and s/\.16b/\.4s/go; + m/\.p64/o and s/\.16b/\.1q/o; # 1st pmull argument + m/l\.p64/o and s/\.16b/\.1d/go; # 2nd and 3rd pmull arguments + s/\.[uisp]?64//o and s/\.16b/\.2d/go; + s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o; + + # Switch preprocessor checks to aarch64 versions. + s/__ARME([BL])__/__AARCH64E$1__/go; + + print $_,"\n"; + } +} else { ######## 32-bit code + sub unvdup32 { + my $arg=shift; + + $arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o && + sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1; + } + sub unvpmullp64 { + my ($mnemonic,$arg)=@_; + + if ($arg =~ m/q([0-9]+),\s*q([0-9]+),\s*q([0-9]+)/o) { + my $word = 0xf2a00e00|(($1&7)<<13)|(($1&8)<<19) + |(($2&7)<<17)|(($2&8)<<4) + |(($3&7)<<1) |(($3&8)<<2); + $word |= 0x00010001 if ($mnemonic =~ "2"); + # since ARMv7 instructions are always encoded little-endian. 
+ # correct solution is to use .inst directive, but older + # assemblers don't implement it:-( + sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s", + $word&0xff,($word>>8)&0xff, + ($word>>16)&0xff,($word>>24)&0xff, + $mnemonic,$arg; + } + } + + foreach(split("\n",$code)) { + s/\b[wx]([0-9]+)\b/r$1/go; # new->old registers + s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go; # new->old registers + s/\/\/\s?/@ /o; # new->old style commentary + + # fix up remaining new-style suffixes + s/\],#[0-9]+/]!/o; + + s/cclr\s+([^,]+),\s*([a-z]+)/mov$2 $1,#0/o or + s/vdup\.32\s+(.*)/unvdup32($1)/geo or + s/v?(pmull2?)\.p64\s+(.*)/unvpmullp64($1,$2)/geo or + s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or + s/^(\s+)b\./$1b/o or + s/^(\s+)ret/$1bx\tlr/o; + + print $_,"\n"; + } +} + +close STDOUT or die "error closing STDOUT: $!"; # enforce flush diff --git a/ring-0.17.14/crypto/fipsmodule/aes/asm/vpaes-armv7.pl b/ring-0.17.14/crypto/fipsmodule/aes/asm/vpaes-armv7.pl new file mode 100644 index 0000000000..a46ad6c785 --- /dev/null +++ b/ring-0.17.14/crypto/fipsmodule/aes/asm/vpaes-armv7.pl @@ -0,0 +1,884 @@ +#! /usr/bin/env perl +# Copyright 2015-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +###################################################################### +## Constant-time SSSE3 AES core implementation. +## version 0.1 +## +## By Mike Hamburg (Stanford University), 2009 +## Public domain. +## +## For details see http://shiftleft.org/papers/vector_aes/ and +## http://crypto.stanford.edu/vpaes/. +## +###################################################################### +# Adapted from the original x86_64 version and 's ARMv8 +# version. +# +# armv7, aarch64, and x86_64 differ in several ways: +# +# * x86_64 SSSE3 instructions are two-address (destination operand is also a +# source), while NEON is three-address (destination operand is separate from +# two sources). +# +# * aarch64 has 32 SIMD registers available, while x86_64 and armv7 have 16. +# +# * x86_64 instructions can take memory references, while ARM is a load/store +# architecture. This means we sometimes need a spare register. +# +# * aarch64 and x86_64 have 128-bit byte shuffle instructions (tbl and pshufb), +# while armv7 only has a 64-bit byte shuffle (vtbl). +# +# This means this armv7 version must be a mix of both aarch64 and x86_64 +# implementations. armv7 and aarch64 have analogous SIMD instructions, so we +# base the instructions on aarch64. However, we cannot use aarch64's register +# allocation. x86_64's register count matches, but x86_64 is two-address. +# vpaes-armv8.pl already accounts for this in the comments, which use +# three-address AVX instructions instead of the original SSSE3 ones. We base +# register usage on these comments, which are preserved in this file. +# +# This means we do not use separate input and output registers as in aarch64 and +# cannot pin as many constants in the preheat functions. 
However, the load/store +# architecture means we must still deviate from x86_64 in places. +# +# Next, we account for the byte shuffle instructions. vtbl takes 64-bit source +# and destination and 128-bit table. Fortunately, armv7 also allows addressing +# upper and lower halves of each 128-bit register. The lower half of q{N} is +# d{2*N}. The upper half is d{2*N+1}. Instead of the following non-existent +# instruction, +# +# vtbl.8 q0, q1, q2 @ Index each of q2's 16 bytes into q1. Store in q0. +# +# we write: +# +# vtbl.8 d0, q1, d4 @ Index each of d4's 8 bytes into q1. Store in d0. +# vtbl.8 d1, q1, d5 @ Index each of d5's 8 bytes into q1. Store in d1. +# +# For readability, we write d0 and d1 as q0#lo and q0#hi, respectively and +# post-process before outputting. (This is adapted from ghash-armv4.pl.) Note, +# however, that destination (q0) and table (q1) registers may no longer match. +# We adjust the register usage from x86_64 to avoid this. (Unfortunately, the +# two-address pshufb always matched these operands, so this is common.) +# +# This file also runs against the limit of ARMv7's ADR pseudo-instruction. ADR +# expands to an ADD or SUB of the pc register to find an address. That immediate +# must fit in ARM's encoding scheme: 8 bits of constant and 4 bits of rotation. +# This means larger values must be more aligned. +# +# ARM additionally has two encodings, ARM and Thumb mode. Our assembly files may +# use either encoding (do we actually need to support this?). In ARM mode, the +# distances get large enough to require 16-byte alignment. Moving constants +# closer to their use resolves most of this, but common constants in +# _vpaes_consts are used by the whole file. Affected ADR instructions must be +# placed at 8 mod 16 (the pc register is 8 ahead). Instructions with this +# constraint have been commented. +# +# For details on ARM's immediate value encoding scheme, see +# https://alisdair.mcdiarmid.org/arm-immediate-value-encoding/ +# +# Finally, a summary of armv7 and aarch64 SIMD syntax differences: +# +# * armv7 prefixes SIMD instructions with 'v', while aarch64 does not. +# +# * armv7 SIMD registers are named like q0 (and d0 for the half-width ones). +# aarch64 names registers like v0, and denotes half-width operations in an +# instruction suffix (see below). +# +# * aarch64 embeds size and lane information in register suffixes. v0.16b is +# 16 bytes, v0.8h is eight u16s, v0.4s is four u32s, and v0.2d is two u64s. +# armv7 embeds the total size in the register name (see above) and the size of +# each element in an instruction suffix, which may look like vmov.i8, +# vshr.u8, or vtbl.8, depending on instruction. 
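+#
+# To make the q{N}#lo / q{N}#hi notation concrete, here is a small
+# illustrative expansion using the same registers as the example above.
+# The post-processing loop at the end of this file rewrites q{N}#lo to
+# d{2*N} and q{N}#hi to d{2*N+1}, so the pair
+#
+#   vtbl.8 q0#lo, {q1}, q2#lo
+#   vtbl.8 q0#hi, {q1}, q2#hi
+#
+# is printed as
+#
+#   vtbl.8 d0, {q1}, d4
+#   vtbl.8 d1, {q1}, d5
+#
+# i.e. one 128-bit byte shuffle expressed as two 64-bit vtbl instructions
+# sharing the 128-bit table register q1.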
+ +use strict; + +my $flavour = shift; +my $output; +while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; +my $dir=$1; +my $xlate; +( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or +die "can't locate arm-xlate.pl"; + +open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; +*STDOUT=*OUT; + +my $code = ""; + +$code.=<<___; +.syntax unified + +.arch armv7-a +.fpu neon + +#if defined(__thumb2__) +.thumb +#else +.code 32 +#endif + +.text + +.type _vpaes_consts,%object +.align 7 @ totally strategic alignment +_vpaes_consts: +.Lk_mc_forward: @ mc_forward + .quad 0x0407060500030201, 0x0C0F0E0D080B0A09 + .quad 0x080B0A0904070605, 0x000302010C0F0E0D + .quad 0x0C0F0E0D080B0A09, 0x0407060500030201 + .quad 0x000302010C0F0E0D, 0x080B0A0904070605 +.Lk_mc_backward:@ mc_backward + .quad 0x0605040702010003, 0x0E0D0C0F0A09080B + .quad 0x020100030E0D0C0F, 0x0A09080B06050407 + .quad 0x0E0D0C0F0A09080B, 0x0605040702010003 + .quad 0x0A09080B06050407, 0x020100030E0D0C0F +.Lk_sr: @ sr + .quad 0x0706050403020100, 0x0F0E0D0C0B0A0908 + .quad 0x030E09040F0A0500, 0x0B06010C07020D08 + .quad 0x0F060D040B020900, 0x070E050C030A0108 + .quad 0x0B0E0104070A0D00, 0x0306090C0F020508 + +@ +@ "Hot" constants +@ +.Lk_inv: @ inv, inva + .quad 0x0E05060F0D080180, 0x040703090A0B0C02 + .quad 0x01040A060F0B0780, 0x030D0E0C02050809 +.Lk_ipt: @ input transform (lo, hi) + .quad 0xC2B2E8985A2A7000, 0xCABAE09052227808 + .quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81 +.Lk_sbo: @ sbou, sbot + .quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878 + .quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA +.Lk_sb1: @ sb1u, sb1t + .quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF + .quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544 +.Lk_sb2: @ sb2u, sb2t + .quad 0x69EB88400AE12900, 0xC2A163C8AB82234A + .quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD + +.asciz "Vector Permutation AES for ARMv7 NEON, Mike Hamburg (Stanford University)" +.size _vpaes_consts,.-_vpaes_consts +.align 6 +___ + +{ +my ($inp,$out,$key) = map("r$_", (0..2)); + +my ($invlo,$invhi) = map("q$_", (10..11)); +my ($sb1u,$sb1t,$sb2u,$sb2t) = map("q$_", (12..15)); + +$code.=<<___; +@@ +@@ _aes_preheat +@@ +@@ Fills q9-q15 as specified below. +@@ +.type _vpaes_preheat,%function +.align 4 +_vpaes_preheat: + adr r10, .Lk_inv + vmov.i8 q9, #0x0f @ .Lk_s0F + vld1.64 {q10,q11}, [r10]! @ .Lk_inv + add r10, r10, #64 @ Skip .Lk_ipt, .Lk_sbo + vld1.64 {q12,q13}, [r10]! @ .Lk_sb1 + vld1.64 {q14,q15}, [r10] @ .Lk_sb2 + bx lr + +@@ +@@ _aes_encrypt_core +@@ +@@ AES-encrypt q0. +@@ +@@ Inputs: +@@ q0 = input +@@ q9-q15 as in _vpaes_preheat +@@ [$key] = scheduled keys +@@ +@@ Output in q0 +@@ Clobbers q1-q5, r8-r11 +@@ Preserves q6-q8 so you get some local vectors +@@ +@@ +.type _vpaes_encrypt_core,%function +.align 4 +_vpaes_encrypt_core: + mov r9, $key + ldr r8, [$key,#240] @ pull rounds + adr r11, .Lk_ipt + @ vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo + @ vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi + vld1.64 {q2, q3}, [r11] + adr r11, .Lk_mc_forward+16 + vld1.64 {q5}, [r9]! 
@ vmovdqu (%r9), %xmm5 # round0 key + vand q1, q0, q9 @ vpand %xmm9, %xmm0, %xmm1 + vshr.u8 q0, q0, #4 @ vpsrlb \$4, %xmm0, %xmm0 + vtbl.8 q1#lo, {q2}, q1#lo @ vpshufb %xmm1, %xmm2, %xmm1 + vtbl.8 q1#hi, {q2}, q1#hi + vtbl.8 q2#lo, {q3}, q0#lo @ vpshufb %xmm0, %xmm3, %xmm2 + vtbl.8 q2#hi, {q3}, q0#hi + veor q0, q1, q5 @ vpxor %xmm5, %xmm1, %xmm0 + veor q0, q0, q2 @ vpxor %xmm2, %xmm0, %xmm0 + + @ .Lenc_entry ends with a bnz instruction which is normally paired with + @ subs in .Lenc_loop. + tst r8, r8 + b .Lenc_entry + +.align 4 +.Lenc_loop: + @ middle of middle round + add r10, r11, #0x40 + vtbl.8 q4#lo, {$sb1t}, q2#lo @ vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u + vtbl.8 q4#hi, {$sb1t}, q2#hi + vld1.64 {q1}, [r11]! @ vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[] + vtbl.8 q0#lo, {$sb1u}, q3#lo @ vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t + vtbl.8 q0#hi, {$sb1u}, q3#hi + veor q4, q4, q5 @ vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + vtbl.8 q5#lo, {$sb2t}, q2#lo @ vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u + vtbl.8 q5#hi, {$sb2t}, q2#hi + veor q0, q0, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 0 = A + vtbl.8 q2#lo, {$sb2u}, q3#lo @ vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t + vtbl.8 q2#hi, {$sb2u}, q3#hi + vld1.64 {q4}, [r10] @ vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[] + vtbl.8 q3#lo, {q0}, q1#lo @ vpshufb %xmm1, %xmm0, %xmm3 # 0 = B + vtbl.8 q3#hi, {q0}, q1#hi + veor q2, q2, q5 @ vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A + @ Write to q5 instead of q0, so the table and destination registers do + @ not overlap. + vtbl.8 q5#lo, {q0}, q4#lo @ vpshufb %xmm4, %xmm0, %xmm0 # 3 = D + vtbl.8 q5#hi, {q0}, q4#hi + veor q3, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B + vtbl.8 q4#lo, {q3}, q1#lo @ vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C + vtbl.8 q4#hi, {q3}, q1#hi + @ Here we restore the original q0/q5 usage. + veor q0, q5, q3 @ vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D + and r11, r11, #~(1<<6) @ and \$0x30, %r11 # ... mod 4 + veor q0, q0, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D + subs r8, r8, #1 @ nr-- + +.Lenc_entry: + @ top of round + vand q1, q0, q9 @ vpand %xmm0, %xmm9, %xmm1 # 0 = k + vshr.u8 q0, q0, #4 @ vpsrlb \$4, %xmm0, %xmm0 # 1 = i + vtbl.8 q5#lo, {$invhi}, q1#lo @ vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k + vtbl.8 q5#hi, {$invhi}, q1#hi + veor q1, q1, q0 @ vpxor %xmm0, %xmm1, %xmm1 # 0 = j + vtbl.8 q3#lo, {$invlo}, q0#lo @ vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + vtbl.8 q3#hi, {$invlo}, q0#hi + vtbl.8 q4#lo, {$invlo}, q1#lo @ vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + vtbl.8 q4#hi, {$invlo}, q1#hi + veor q3, q3, q5 @ vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + veor q4, q4, q5 @ vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + vtbl.8 q2#lo, {$invlo}, q3#lo @ vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak + vtbl.8 q2#hi, {$invlo}, q3#hi + vtbl.8 q3#lo, {$invlo}, q4#lo @ vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak + vtbl.8 q3#hi, {$invlo}, q4#hi + veor q2, q2, q1 @ vpxor %xmm1, %xmm2, %xmm2 # 2 = io + veor q3, q3, q0 @ vpxor %xmm0, %xmm3, %xmm3 # 3 = jo + vld1.64 {q5}, [r9]! @ vmovdqu (%r9), %xmm5 + bne .Lenc_loop + + @ middle of last round + add r10, r11, #0x80 + + adr r11, .Lk_sbo + @ Read to q1 instead of q4, so the vtbl.8 instruction below does not + @ overlap table and destination registers. + vld1.64 {q1}, [r11]! 
@ vmovdqa -0x60(%r10), %xmm4 # 3 : sbou + vld1.64 {q0}, [r11] @ vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 + vtbl.8 q4#lo, {q1}, q2#lo @ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou + vtbl.8 q4#hi, {q1}, q2#hi + vld1.64 {q1}, [r10] @ vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[] + @ Write to q2 instead of q0 below, to avoid overlapping table and + @ destination registers. + vtbl.8 q2#lo, {q0}, q3#lo @ vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t + vtbl.8 q2#hi, {q0}, q3#hi + veor q4, q4, q5 @ vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + veor q2, q2, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 0 = A + @ Here we restore the original q0/q2 usage. + vtbl.8 q0#lo, {q2}, q1#lo @ vpshufb %xmm1, %xmm0, %xmm0 + vtbl.8 q0#hi, {q2}, q1#hi + bx lr +.size _vpaes_encrypt_core,.-_vpaes_encrypt_core +___ +} +{ +my ($inp,$bits,$out,$dir)=("r0","r1","r2","r3"); +my ($rcon,$s0F,$invlo,$invhi,$s63) = map("q$_",(8..12)); + +$code.=<<___; +@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ +@@ @@ +@@ AES key schedule @@ +@@ @@ +@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + +@ This function diverges from both x86_64 and armv7 in which constants are +@ pinned. x86_64 has a common preheat function for all operations. aarch64 +@ separates them because it has enough registers to pin nearly all constants. +@ armv7 does not have enough registers, but needing explicit loads and stores +@ also complicates using x86_64's register allocation directly. +@ +@ We pin some constants for convenience and leave q14 and q15 free to load +@ others on demand. + +@ +@ Key schedule constants +@ +.type _vpaes_key_consts,%object +.align 4 +_vpaes_key_consts: +.Lk_rcon: @ rcon + .quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81 + +.Lk_opt: @ output transform + .quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808 + .quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0 +.Lk_deskew: @ deskew tables: inverts the sbox's "skew" + .quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A + .quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77 +.size _vpaes_key_consts,.-_vpaes_key_consts + +.type _vpaes_key_preheat,%function +.align 4 +_vpaes_key_preheat: + adr r11, .Lk_rcon + vmov.i8 $s63, #0x5b @ .Lk_s63 + adr r10, .Lk_inv @ Must be aligned to 8 mod 16. + vmov.i8 $s0F, #0x0f @ .Lk_s0F + vld1.64 {$invlo,$invhi}, [r10] @ .Lk_inv + vld1.64 {$rcon}, [r11] @ .Lk_rcon + bx lr +.size _vpaes_key_preheat,.-_vpaes_key_preheat + +.type _vpaes_schedule_core,%function +.align 4 +_vpaes_schedule_core: + @ We only need to save lr, but ARM requires an 8-byte stack alignment, + @ so save an extra register. + stmdb sp!, {r3,lr} + + bl _vpaes_key_preheat @ load the tables + + adr r11, .Lk_ipt @ Must be aligned to 8 mod 16. + vld1.64 {q0}, [$inp]! @ vmovdqu (%rdi), %xmm0 # load key (unaligned) + + @ input transform + @ Use q4 here rather than q3 so .Lschedule_am_decrypting does not + @ overlap table and destination. + vmov q4, q0 @ vmovdqa %xmm0, %xmm3 + bl _vpaes_schedule_transform + adr r10, .Lk_sr @ Must be aligned to 8 mod 16. + vmov q7, q0 @ vmovdqa %xmm0, %xmm7 + + add r8, r8, r10 + + @ encrypting, output zeroth round key after transform + vst1.64 {q0}, [$out] @ vmovdqu %xmm0, (%rdx) + + @ *ring*: Decryption removed. + +.Lschedule_go: + cmp $bits, #192 @ cmp \$192, %esi + bhi .Lschedule_256 + @ 128: fall though + +@@ +@@ .schedule_128 +@@ +@@ 128-bit specific part of key schedule. +@@ +@@ This schedule is really simple, because all its parts +@@ are accomplished by the subroutines. 
+@@ +.Lschedule_128: + mov $inp, #10 @ mov \$10, %esi + +.Loop_schedule_128: + bl _vpaes_schedule_round + subs $inp, $inp, #1 @ dec %esi + beq .Lschedule_mangle_last + bl _vpaes_schedule_mangle @ write output + b .Loop_schedule_128 + +@@ +@@ .aes_schedule_256 +@@ +@@ 256-bit specific part of key schedule. +@@ +@@ The structure here is very similar to the 128-bit +@@ schedule, but with an additional "low side" in +@@ q6. The low side's rounds are the same as the +@@ high side's, except no rcon and no rotation. +@@ +.align 4 +.Lschedule_256: + vld1.64 {q0}, [$inp] @ vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned) + bl _vpaes_schedule_transform @ input transform + mov $inp, #7 @ mov \$7, %esi + +.Loop_schedule_256: + bl _vpaes_schedule_mangle @ output low result + vmov q6, q0 @ vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6 + + @ high round + bl _vpaes_schedule_round + subs $inp, $inp, #1 @ dec %esi + beq .Lschedule_mangle_last + bl _vpaes_schedule_mangle + + @ low round. swap xmm7 and xmm6 + vdup.32 q0, q0#hi[1] @ vpshufd \$0xFF, %xmm0, %xmm0 + vmov.i8 q4, #0 + vmov q5, q7 @ vmovdqa %xmm7, %xmm5 + vmov q7, q6 @ vmovdqa %xmm6, %xmm7 + bl _vpaes_schedule_low_round + vmov q7, q5 @ vmovdqa %xmm5, %xmm7 + + b .Loop_schedule_256 + +@@ +@@ .aes_schedule_mangle_last +@@ +@@ Mangler for last round of key schedule +@@ Mangles q0 +@@ when encrypting, outputs out(q0) ^ 63 +@@ when decrypting, outputs unskew(q0) +@@ +@@ Always called right before return... jumps to cleanup and exits +@@ +.align 4 +.Lschedule_mangle_last: + @ schedule last round key from xmm0 + adr r11, .Lk_deskew @ lea .Lk_deskew(%rip),%r11 # prepare to deskew + + @ encrypting + vld1.64 {q1}, [r8] @ vmovdqa (%r8,%r10),%xmm1 + adr r11, .Lk_opt @ lea .Lk_opt(%rip), %r11 # prepare to output transform + add $out, $out, #32 @ add \$32, %rdx + vmov q2, q0 + vtbl.8 q0#lo, {q2}, q1#lo @ vpshufb %xmm1, %xmm0, %xmm0 # output permute + vtbl.8 q0#hi, {q2}, q1#hi + +.Lschedule_mangle_last_dec: + sub $out, $out, #16 @ add \$-16, %rdx + veor q0, q0, $s63 @ vpxor .Lk_s63(%rip), %xmm0, %xmm0 + bl _vpaes_schedule_transform @ output transform + vst1.64 {q0}, [$out] @ vmovdqu %xmm0, (%rdx) # save last key + + @ cleanup + veor q0, q0, q0 @ vpxor %xmm0, %xmm0, %xmm0 + veor q1, q1, q1 @ vpxor %xmm1, %xmm1, %xmm1 + veor q2, q2, q2 @ vpxor %xmm2, %xmm2, %xmm2 + veor q3, q3, q3 @ vpxor %xmm3, %xmm3, %xmm3 + veor q4, q4, q4 @ vpxor %xmm4, %xmm4, %xmm4 + veor q5, q5, q5 @ vpxor %xmm5, %xmm5, %xmm5 + veor q6, q6, q6 @ vpxor %xmm6, %xmm6, %xmm6 + veor q7, q7, q7 @ vpxor %xmm7, %xmm7, %xmm7 + ldmia sp!, {r3,pc} @ return +.size _vpaes_schedule_core,.-_vpaes_schedule_core + +@@ +@@ .aes_schedule_round +@@ +@@ Runs one main round of the key schedule on q0, q7 +@@ +@@ Specifically, runs subbytes on the high dword of q0 +@@ then rotates it by one byte and xors into the low dword of +@@ q7. +@@ +@@ Adds rcon from low byte of q8, then rotates q8 for +@@ next rcon. +@@ +@@ Smears the dwords of q7 by xoring the low into the +@@ second low, result into third, result into highest. +@@ +@@ Returns results in q7 = q0. +@@ Clobbers q1-q4, r11. 
+@@ +.type _vpaes_schedule_round,%function +.align 4 +_vpaes_schedule_round: + @ extract rcon from xmm8 + vmov.i8 q4, #0 @ vpxor %xmm4, %xmm4, %xmm4 + vext.8 q1, $rcon, q4, #15 @ vpalignr \$15, %xmm8, %xmm4, %xmm1 + vext.8 $rcon, $rcon, $rcon, #15 @ vpalignr \$15, %xmm8, %xmm8, %xmm8 + veor q7, q7, q1 @ vpxor %xmm1, %xmm7, %xmm7 + + @ rotate + vdup.32 q0, q0#hi[1] @ vpshufd \$0xFF, %xmm0, %xmm0 + vext.8 q0, q0, q0, #1 @ vpalignr \$1, %xmm0, %xmm0, %xmm0 + + @ fall through... + + @ low round: same as high round, but no rotation and no rcon. +_vpaes_schedule_low_round: + @ The x86_64 version pins .Lk_sb1 in %xmm13 and .Lk_sb1+16 in %xmm12. + @ We pin other values in _vpaes_key_preheat, so load them now. + adr r11, .Lk_sb1 + vld1.64 {q14,q15}, [r11] + + @ smear xmm7 + vext.8 q1, q4, q7, #12 @ vpslldq \$4, %xmm7, %xmm1 + veor q7, q7, q1 @ vpxor %xmm1, %xmm7, %xmm7 + vext.8 q4, q4, q7, #8 @ vpslldq \$8, %xmm7, %xmm4 + + @ subbytes + vand q1, q0, $s0F @ vpand %xmm9, %xmm0, %xmm1 # 0 = k + vshr.u8 q0, q0, #4 @ vpsrlb \$4, %xmm0, %xmm0 # 1 = i + veor q7, q7, q4 @ vpxor %xmm4, %xmm7, %xmm7 + vtbl.8 q2#lo, {$invhi}, q1#lo @ vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k + vtbl.8 q2#hi, {$invhi}, q1#hi + veor q1, q1, q0 @ vpxor %xmm0, %xmm1, %xmm1 # 0 = j + vtbl.8 q3#lo, {$invlo}, q0#lo @ vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + vtbl.8 q3#hi, {$invlo}, q0#hi + veor q3, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + vtbl.8 q4#lo, {$invlo}, q1#lo @ vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + vtbl.8 q4#hi, {$invlo}, q1#hi + veor q7, q7, $s63 @ vpxor .Lk_s63(%rip), %xmm7, %xmm7 + vtbl.8 q3#lo, {$invlo}, q3#lo @ vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak + vtbl.8 q3#hi, {$invlo}, q3#hi + veor q4, q4, q2 @ vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + vtbl.8 q2#lo, {$invlo}, q4#lo @ vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak + vtbl.8 q2#hi, {$invlo}, q4#hi + veor q3, q3, q1 @ vpxor %xmm1, %xmm3, %xmm3 # 2 = io + veor q2, q2, q0 @ vpxor %xmm0, %xmm2, %xmm2 # 3 = jo + vtbl.8 q4#lo, {q15}, q3#lo @ vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou + vtbl.8 q4#hi, {q15}, q3#hi + vtbl.8 q1#lo, {q14}, q2#lo @ vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t + vtbl.8 q1#hi, {q14}, q2#hi + veor q1, q1, q4 @ vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output + + @ add in smeared stuff + veor q0, q1, q7 @ vpxor %xmm7, %xmm1, %xmm0 + veor q7, q1, q7 @ vmovdqa %xmm0, %xmm7 + bx lr +.size _vpaes_schedule_round,.-_vpaes_schedule_round + +@@ +@@ .aes_schedule_transform +@@ +@@ Linear-transform q0 according to tables at [r11] +@@ +@@ Requires that q9 = 0x0F0F... as in preheat +@@ Output in q0 +@@ Clobbers q1, q2, q14, q15 +@@ +.type _vpaes_schedule_transform,%function +.align 4 +_vpaes_schedule_transform: + vld1.64 {q14,q15}, [r11] @ vmovdqa (%r11), %xmm2 # lo + @ vmovdqa 16(%r11), %xmm1 # hi + vand q1, q0, $s0F @ vpand %xmm9, %xmm0, %xmm1 + vshr.u8 q0, q0, #4 @ vpsrlb \$4, %xmm0, %xmm0 + vtbl.8 q2#lo, {q14}, q1#lo @ vpshufb %xmm1, %xmm2, %xmm2 + vtbl.8 q2#hi, {q14}, q1#hi + vtbl.8 q0#lo, {q15}, q0#lo @ vpshufb %xmm0, %xmm1, %xmm0 + vtbl.8 q0#hi, {q15}, q0#hi + veor q0, q0, q2 @ vpxor %xmm2, %xmm0, %xmm0 + bx lr +.size _vpaes_schedule_transform,.-_vpaes_schedule_transform + +@@ +@@ .aes_schedule_mangle +@@ +@@ Mangles q0 from (basis-transformed) standard version +@@ to our version. 
+@@ +@@ On encrypt, +@@ xor with 0x63 +@@ multiply by circulant 0,1,1,1 +@@ apply shiftrows transform +@@ +@@ On decrypt, +@@ xor with 0x63 +@@ multiply by "inverse mixcolumns" circulant E,B,D,9 +@@ deskew +@@ apply shiftrows transform +@@ +@@ +@@ Writes out to [r2], and increments or decrements it +@@ Keeps track of round number mod 4 in r8 +@@ Preserves q0 +@@ Clobbers q1-q5 +@@ +.type _vpaes_schedule_mangle,%function +.align 4 +_vpaes_schedule_mangle: + tst $dir, $dir + vmov q4, q0 @ vmovdqa %xmm0, %xmm4 # save xmm0 for later + adr r11, .Lk_mc_forward @ Must be aligned to 8 mod 16. + vld1.64 {q5}, [r11] @ vmovdqa .Lk_mc_forward(%rip),%xmm5 + + @ encrypting + @ Write to q2 so we do not overlap table and destination below. + veor q2, q0, $s63 @ vpxor .Lk_s63(%rip), %xmm0, %xmm4 + add $out, $out, #16 @ add \$16, %rdx + vtbl.8 q4#lo, {q2}, q5#lo @ vpshufb %xmm5, %xmm4, %xmm4 + vtbl.8 q4#hi, {q2}, q5#hi + vtbl.8 q1#lo, {q4}, q5#lo @ vpshufb %xmm5, %xmm4, %xmm1 + vtbl.8 q1#hi, {q4}, q5#hi + vtbl.8 q3#lo, {q1}, q5#lo @ vpshufb %xmm5, %xmm1, %xmm3 + vtbl.8 q3#hi, {q1}, q5#hi + veor q4, q4, q1 @ vpxor %xmm1, %xmm4, %xmm4 + vld1.64 {q1}, [r8] @ vmovdqa (%r8,%r10), %xmm1 + veor q3, q3, q4 @ vpxor %xmm4, %xmm3, %xmm3 + +.Lschedule_mangle_both: + @ Write to q2 so table and destination do not overlap. + vtbl.8 q2#lo, {q3}, q1#lo @ vpshufb %xmm1, %xmm3, %xmm3 + vtbl.8 q2#hi, {q3}, q1#hi + add r8, r8, #64-16 @ add \$-16, %r8 + and r8, r8, #~(1<<6) @ and \$0x30, %r8 + vst1.64 {q2}, [$out] @ vmovdqu %xmm3, (%rdx) + bx lr +.size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle + +.globl vpaes_set_encrypt_key +.type vpaes_set_encrypt_key,%function +.align 4 +vpaes_set_encrypt_key: + stmdb sp!, {r7-r11, lr} + vstmdb sp!, {d8-d15} + + lsr r9, $bits, #5 @ shr \$5,%eax + add r9, r9, #5 @ \$5,%eax + str r9, [$out,#240] @ mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; + + mov $dir, #0 @ mov \$0,%ecx + mov r8, #0x30 @ mov \$0x30,%r8d + bl _vpaes_schedule_core + eor r0, r0, r0 + + vldmia sp!, {d8-d15} + ldmia sp!, {r7-r11, pc} @ return +.size vpaes_set_encrypt_key,.-vpaes_set_encrypt_key +___ +} + +{ +my ($out, $inp) = map("r$_", (0..1)); +my ($s0F, $s63, $s63_raw, $mc_forward) = map("q$_", (9..12)); + +$code .= <<___; + +@ Additional constants for converting to bsaes. +.type _vpaes_convert_consts,%object +.align 4 +_vpaes_convert_consts: +@ .Lk_opt_then_skew applies skew(opt(x)) XOR 0x63, where skew is the linear +@ transform in the AES S-box. 0x63 is incorporated into the low half of the +@ table. 
This was computed with the following script: +@ +@ def u64s_to_u128(x, y): +@ return x | (y << 64) +@ def u128_to_u64s(w): +@ return w & ((1<<64)-1), w >> 64 +@ def get_byte(w, i): +@ return (w >> (i*8)) & 0xff +@ def apply_table(table, b): +@ lo = b & 0xf +@ hi = b >> 4 +@ return get_byte(table[0], lo) ^ get_byte(table[1], hi) +@ def opt(b): +@ table = [ +@ u64s_to_u128(0xFF9F4929D6B66000, 0xF7974121DEBE6808), +@ u64s_to_u128(0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0), +@ ] +@ return apply_table(table, b) +@ def rot_byte(b, n): +@ return 0xff & ((b << n) | (b >> (8-n))) +@ def skew(x): +@ return (x ^ rot_byte(x, 1) ^ rot_byte(x, 2) ^ rot_byte(x, 3) ^ +@ rot_byte(x, 4)) +@ table = [0, 0] +@ for i in range(16): +@ table[0] |= (skew(opt(i)) ^ 0x63) << (i*8) +@ table[1] |= skew(opt(i<<4)) << (i*8) +@ print("\t.quad\t0x%016x, 0x%016x" % u128_to_u64s(table[0])) +@ print("\t.quad\t0x%016x, 0x%016x" % u128_to_u64s(table[1])) +.Lk_opt_then_skew: + .quad 0x9cb8436798bc4763, 0x6440bb9f6044bf9b + .quad 0x1f30062936192f00, 0xb49bad829db284ab + +@ void vpaes_encrypt_key_to_bsaes(AES_KEY *bsaes, const AES_KEY *vpaes); +.globl vpaes_encrypt_key_to_bsaes +.type vpaes_encrypt_key_to_bsaes,%function +.align 4 +vpaes_encrypt_key_to_bsaes: + stmdb sp!, {r11, lr} + + @ See _vpaes_schedule_core for the key schedule logic. In particular, + @ _vpaes_schedule_transform(.Lk_ipt) (section 2.2 of the paper), + @ _vpaes_schedule_mangle (section 4.3), and .Lschedule_mangle_last + @ contain the transformations not in the bsaes representation. This + @ function inverts those transforms. + @ + @ Note also that bsaes-armv7.pl expects aes-armv4.pl's key + @ representation, which does not match the other aes_nohw_* + @ implementations. The ARM aes_nohw_* stores each 32-bit word + @ byteswapped, as a convenience for (unsupported) big-endian ARM, at the + @ cost of extra REV and VREV32 operations in little-endian ARM. + + vmov.i8 $s0F, #0x0f @ Required by _vpaes_schedule_transform + adr r2, .Lk_mc_forward @ Must be aligned to 8 mod 16. + add r3, r2, 0x90 @ .Lk_sr+0x10-.Lk_mc_forward = 0x90 (Apple's toolchain doesn't support the expression) + + vld1.64 {$mc_forward}, [r2] + vmov.i8 $s63, #0x5b @ .Lk_s63 from vpaes-x86_64 + adr r11, .Lk_opt @ Must be aligned to 8 mod 16. + vmov.i8 $s63_raw, #0x63 @ .LK_s63 without .Lk_ipt applied + + @ vpaes stores one fewer round count than bsaes, but the number of keys + @ is the same. + ldr r2, [$inp,#240] + add r2, r2, #1 + str r2, [$out,#240] + + @ The first key is transformed with _vpaes_schedule_transform(.Lk_ipt). + @ Invert this with .Lk_opt. + vld1.64 {q0}, [$inp]! + bl _vpaes_schedule_transform + vrev32.8 q0, q0 + vst1.64 {q0}, [$out]! + + @ The middle keys have _vpaes_schedule_transform(.Lk_ipt) applied, + @ followed by _vpaes_schedule_mangle. _vpaes_schedule_mangle XORs 0x63, + @ multiplies by the circulant 0,1,1,1, then applies ShiftRows. +.Loop_enc_key_to_bsaes: + vld1.64 {q0}, [$inp]! + + @ Invert the ShiftRows step (see .Lschedule_mangle_both). Note we cycle + @ r3 in the opposite direction and start at .Lk_sr+0x10 instead of 0x30. + @ We use r3 rather than r8 to avoid a callee-saved register. + vld1.64 {q1}, [r3] + vtbl.8 q2#lo, {q0}, q1#lo + vtbl.8 q2#hi, {q0}, q1#hi + add r3, r3, #16 + and r3, r3, #~(1<<6) + vmov q0, q2 + + @ Handle the last key differently. + subs r2, r2, #1 + beq .Loop_enc_key_to_bsaes_last + + @ Multiply by the circulant. This is its own inverse. 
+ vtbl.8 q1#lo, {q0}, $mc_forward#lo + vtbl.8 q1#hi, {q0}, $mc_forward#hi + vmov q0, q1 + vtbl.8 q2#lo, {q1}, $mc_forward#lo + vtbl.8 q2#hi, {q1}, $mc_forward#hi + veor q0, q0, q2 + vtbl.8 q1#lo, {q2}, $mc_forward#lo + vtbl.8 q1#hi, {q2}, $mc_forward#hi + veor q0, q0, q1 + + @ XOR and finish. + veor q0, q0, $s63 + bl _vpaes_schedule_transform + vrev32.8 q0, q0 + vst1.64 {q0}, [$out]! + b .Loop_enc_key_to_bsaes + +.Loop_enc_key_to_bsaes_last: + @ The final key does not have a basis transform (note + @ .Lschedule_mangle_last inverts the original transform). It only XORs + @ 0x63 and applies ShiftRows. The latter was already inverted in the + @ loop. Note that, because we act on the original representation, we use + @ $s63_raw, not $s63. + veor q0, q0, $s63_raw + vrev32.8 q0, q0 + vst1.64 {q0}, [$out] + + @ Wipe registers which contained key material. + veor q0, q0, q0 + veor q1, q1, q1 + veor q2, q2, q2 + + ldmia sp!, {r11, pc} @ return +.size vpaes_encrypt_key_to_bsaes,.-vpaes_encrypt_key_to_bsaes +___ +} + +{ +# Register-passed parameters. +my ($inp, $out, $len, $key) = map("r$_", 0..3); +# Temporaries. _vpaes_encrypt_core already uses r8..r11, so overlap $ivec and +# $tmp. $ctr is r7 because it must be preserved across calls. +my ($ctr, $ivec, $tmp) = map("r$_", 7..9); + +# void vpaes_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out, size_t len, +# const AES_KEY *key, const uint8_t ivec[16]); +$code .= <<___; +.globl vpaes_ctr32_encrypt_blocks +.type vpaes_ctr32_encrypt_blocks,%function +.align 4 +vpaes_ctr32_encrypt_blocks: + mov ip, sp + stmdb sp!, {r7-r11, lr} + @ This function uses q4-q7 (d8-d15), which are callee-saved. + vstmdb sp!, {d8-d15} + + cmp $len, #0 + @ $ivec is passed on the stack. + ldr $ivec, [ip] + beq .Lctr32_done + + @ _vpaes_encrypt_core expects the key in r2, so swap $len and $key. + mov $tmp, $key + mov $key, $len + mov $len, $tmp +___ +my ($len, $key) = ($key, $len); +$code .= <<___; + + @ Load the IV and counter portion. + ldr $ctr, [$ivec, #12] + vld1.8 {q7}, [$ivec] + + bl _vpaes_preheat + rev $ctr, $ctr @ The counter is big-endian. + +.Lctr32_loop: + vmov q0, q7 + vld1.8 {q6}, [$inp]! @ Load input ahead of time + bl _vpaes_encrypt_core + veor q0, q0, q6 @ XOR input and result + vst1.8 {q0}, [$out]! + subs $len, $len, #1 + @ Update the counter. + add $ctr, $ctr, #1 + rev $tmp, $ctr + vmov.32 q7#hi[1], $tmp + bne .Lctr32_loop + +.Lctr32_done: + vldmia sp!, {d8-d15} + ldmia sp!, {r7-r11, pc} @ return +.size vpaes_ctr32_encrypt_blocks,.-vpaes_ctr32_encrypt_blocks +___ +} + +foreach (split("\n",$code)) { + s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo; + print $_,"\n"; +} + +close STDOUT or die "error closing STDOUT: $!"; diff --git a/ring-0.17.14/crypto/fipsmodule/aes/asm/vpaes-armv8.pl b/ring-0.17.14/crypto/fipsmodule/aes/asm/vpaes-armv8.pl new file mode 100644 index 0000000000..d5e4bce897 --- /dev/null +++ b/ring-0.17.14/crypto/fipsmodule/aes/asm/vpaes-armv8.pl @@ -0,0 +1,824 @@ +#! /usr/bin/env perl +# Copyright 2015-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + + +###################################################################### +## Constant-time SSSE3 AES core implementation. +## version 0.1 +## +## By Mike Hamburg (Stanford University), 2009 +## Public domain. +## +## For details see http://shiftleft.org/papers/vector_aes/ and +## http://crypto.stanford.edu/vpaes/. +## +###################################################################### +# ARMv8 NEON adaptation by +# +# Reason for undertaken effort is that there is at least one popular +# SoC based on Cortex-A53 that doesn't have crypto extensions. +# +# CBC enc ECB enc/dec(*) [bit-sliced enc/dec] +# Cortex-A53 21.5 18.1/20.6 [17.5/19.8 ] +# Cortex-A57 36.0(**) 20.4/24.9(**) [14.4/16.6 ] +# X-Gene 45.9(**) 45.8/57.7(**) [33.1/37.6(**) ] +# Denver(***) 16.6(**) 15.1/17.8(**) [8.80/9.93 ] +# Apple A7(***) 22.7(**) 10.9/14.3 [8.45/10.0 ] +# Mongoose(***) 26.3(**) 21.0/25.0(**) [13.3/16.8 ] +# +# (*) ECB denotes approximate result for parallelizable modes +# such as CBC decrypt, CTR, etc.; +# (**) these results are worse than scalar compiler-generated +# code, but it's constant-time and therefore preferred; +# (***) presented for reference/comparison purposes; + +$flavour = shift; +while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or +die "can't locate arm-xlate.pl"; + +open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; +*STDOUT=*OUT; + +$code.=<<___; +.section .rodata + +.type _vpaes_consts,%object +.align 7 // totally strategic alignment +_vpaes_consts: +.Lk_mc_forward: // mc_forward + .quad 0x0407060500030201, 0x0C0F0E0D080B0A09 + .quad 0x080B0A0904070605, 0x000302010C0F0E0D + .quad 0x0C0F0E0D080B0A09, 0x0407060500030201 + .quad 0x000302010C0F0E0D, 0x080B0A0904070605 +.Lk_mc_backward:// mc_backward + .quad 0x0605040702010003, 0x0E0D0C0F0A09080B + .quad 0x020100030E0D0C0F, 0x0A09080B06050407 + .quad 0x0E0D0C0F0A09080B, 0x0605040702010003 + .quad 0x0A09080B06050407, 0x020100030E0D0C0F +.Lk_sr: // sr + .quad 0x0706050403020100, 0x0F0E0D0C0B0A0908 + .quad 0x030E09040F0A0500, 0x0B06010C07020D08 + .quad 0x0F060D040B020900, 0x070E050C030A0108 + .quad 0x0B0E0104070A0D00, 0x0306090C0F020508 + +// +// "Hot" constants +// +.Lk_inv: // inv, inva + .quad 0x0E05060F0D080180, 0x040703090A0B0C02 + .quad 0x01040A060F0B0780, 0x030D0E0C02050809 +.Lk_ipt: // input transform (lo, hi) + .quad 0xC2B2E8985A2A7000, 0xCABAE09052227808 + .quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81 +.Lk_sbo: // sbou, sbot + .quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878 + .quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA +.Lk_sb1: // sb1u, sb1t + .quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF + .quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544 +.Lk_sb2: // sb2u, sb2t + .quad 0x69EB88400AE12900, 0xC2A163C8AB82234A + .quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD + +// +// Key schedule constants +// +.Lk_dksd: // decryption key schedule: invskew x*D + .quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9 + .quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E +.Lk_dksb: // decryption key schedule: invskew x*B + .quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99 + .quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8 +.Lk_dkse: // decryption key schedule: invskew x*E + 0x63 + .quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086 + .quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487 +.Lk_dks9: // decryption key schedule: invskew 
x*9 + .quad 0xB6116FC87ED9A700, 0x4AED933482255BFC + .quad 0x4576516227143300, 0x8BB89FACE9DAFDCE + +.Lk_rcon: // rcon + .quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81 + +.Lk_opt: // output transform + .quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808 + .quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0 +.Lk_deskew: // deskew tables: inverts the sbox's "skew" + .quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A + .quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77 + +.asciz "Vector Permutation AES for ARMv8, Mike Hamburg (Stanford University)" +.size _vpaes_consts,.-_vpaes_consts +.align 6 + +.text +___ + +{ +my ($inp,$out,$key) = map("x$_",(0..2)); + +my ($invlo,$invhi,$iptlo,$ipthi,$sbou,$sbot) = map("v$_.16b",(18..23)); +my ($sb1u,$sb1t,$sb2u,$sb2t) = map("v$_.16b",(24..27)); +my ($sb9u,$sb9t,$sbdu,$sbdt,$sbbu,$sbbt,$sbeu,$sbet)=map("v$_.16b",(24..31)); + +$code.=<<___; +## +## _aes_preheat +## +## Fills register %r10 -> .aes_consts (so you can -fPIC) +## and %xmm9-%xmm15 as specified below. +## +.type _vpaes_encrypt_preheat,%function +.align 4 +_vpaes_encrypt_preheat: + adrp x10, :pg_hi21:.Lk_inv + add x10, x10, :lo12:.Lk_inv + movi v17.16b, #0x0f + ld1 {v18.2d-v19.2d}, [x10],#32 // .Lk_inv + ld1 {v20.2d-v23.2d}, [x10],#64 // .Lk_ipt, .Lk_sbo + ld1 {v24.2d-v27.2d}, [x10] // .Lk_sb1, .Lk_sb2 + ret +.size _vpaes_encrypt_preheat,.-_vpaes_encrypt_preheat + +## +## _aes_encrypt_core +## +## AES-encrypt %xmm0. +## +## Inputs: +## %xmm0 = input +## %xmm9-%xmm15 as in _vpaes_preheat +## (%rdx) = scheduled keys +## +## Output in %xmm0 +## Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax +## Preserves %xmm6 - %xmm8 so you get some local vectors +## +## +.type _vpaes_encrypt_core,%function +.align 4 +_vpaes_encrypt_core: + mov x9, $key + ldr w8, [$key,#240] // pull rounds + adrp x11, :pg_hi21:.Lk_mc_forward+16 + add x11, x11, :lo12:.Lk_mc_forward+16 + // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo + ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key + and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 + ushr v0.16b, v7.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 + tbl v1.16b, {$iptlo}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1 + // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi + tbl v2.16b, {$ipthi}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2 + eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0 + eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 + b .Lenc_entry + +.align 4 +.Lenc_loop: + // middle of middle round + add x10, x11, #0x40 + tbl v4.16b, {$sb1t}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u + ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[] + tbl v0.16b, {$sb1u}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t + eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + tbl v5.16b, {$sb2t}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A + tbl v2.16b, {$sb2u}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t + ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[] + tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B + eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A + tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B + tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C + eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D + and x11, x11, #~(1<<6) // and \$0x30, %r11 # ... 
mod 4 + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D + sub w8, w8, #1 // nr-- + +.Lenc_entry: + // top of round + and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k + ushr v0.16b, v0.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 # 1 = i + tbl v5.16b, {$invhi}, v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k + eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j + tbl v3.16b, {$invlo}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + tbl v4.16b, {$invlo}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + tbl v2.16b, {$invlo}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak + tbl v3.16b, {$invlo}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak + eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io + eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo + ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5 + cbnz w8, .Lenc_loop + + // middle of last round + add x10, x11, #0x80 + // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo + // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 + tbl v4.16b, {$sbou}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou + ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[] + tbl v0.16b, {$sbot}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t + eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A + tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0 + ret +.size _vpaes_encrypt_core,.-_vpaes_encrypt_core + +.type _vpaes_encrypt_2x,%function +.align 4 +_vpaes_encrypt_2x: + mov x9, $key + ldr w8, [$key,#240] // pull rounds + adrp x11, :pg_hi21:.Lk_mc_forward+16 + add x11, x11, :lo12:.Lk_mc_forward+16 + // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo + ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key + and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 + ushr v0.16b, v14.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 + and v9.16b, v15.16b, v17.16b + ushr v8.16b, v15.16b, #4 + tbl v1.16b, {$iptlo}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1 + tbl v9.16b, {$iptlo}, v9.16b + // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi + tbl v2.16b, {$ipthi}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2 + tbl v10.16b, {$ipthi}, v8.16b + eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0 + eor v8.16b, v9.16b, v16.16b + eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 + eor v8.16b, v8.16b, v10.16b + b .Lenc_2x_entry + +.align 4 +.Lenc_2x_loop: + // middle of middle round + add x10, x11, #0x40 + tbl v4.16b, {$sb1t}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u + tbl v12.16b, {$sb1t}, v10.16b + ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[] + tbl v0.16b, {$sb1u}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t + tbl v8.16b, {$sb1u}, v11.16b + eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + eor v12.16b, v12.16b, v16.16b + tbl v5.16b, {$sb2t}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u + tbl v13.16b, {$sb2t}, v10.16b + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A + eor v8.16b, v8.16b, v12.16b + tbl v2.16b, {$sb2u}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t + tbl v10.16b, {$sb2u}, v11.16b + ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[] + tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B + tbl v11.16b, {v8.16b}, v1.16b + eor 
v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A + eor v10.16b, v10.16b, v13.16b + tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D + tbl v8.16b, {v8.16b}, v4.16b + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B + eor v11.16b, v11.16b, v10.16b + tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C + tbl v12.16b, {v11.16b},v1.16b + eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D + eor v8.16b, v8.16b, v11.16b + and x11, x11, #~(1<<6) // and \$0x30, %r11 # ... mod 4 + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D + eor v8.16b, v8.16b, v12.16b + sub w8, w8, #1 // nr-- + +.Lenc_2x_entry: + // top of round + and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k + ushr v0.16b, v0.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 # 1 = i + and v9.16b, v8.16b, v17.16b + ushr v8.16b, v8.16b, #4 + tbl v5.16b, {$invhi},v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k + tbl v13.16b, {$invhi},v9.16b + eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j + eor v9.16b, v9.16b, v8.16b + tbl v3.16b, {$invlo},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + tbl v11.16b, {$invlo},v8.16b + tbl v4.16b, {$invlo},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + tbl v12.16b, {$invlo},v9.16b + eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + eor v11.16b, v11.16b, v13.16b + eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + eor v12.16b, v12.16b, v13.16b + tbl v2.16b, {$invlo},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak + tbl v10.16b, {$invlo},v11.16b + tbl v3.16b, {$invlo},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak + tbl v11.16b, {$invlo},v12.16b + eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io + eor v10.16b, v10.16b, v9.16b + eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo + eor v11.16b, v11.16b, v8.16b + ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5 + cbnz w8, .Lenc_2x_loop + + // middle of last round + add x10, x11, #0x80 + // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo + // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 + tbl v4.16b, {$sbou}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou + tbl v12.16b, {$sbou}, v10.16b + ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[] + tbl v0.16b, {$sbot}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t + tbl v8.16b, {$sbot}, v11.16b + eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + eor v12.16b, v12.16b, v16.16b + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A + eor v8.16b, v8.16b, v12.16b + tbl v0.16b, {v0.16b},v1.16b // vpshufb %xmm1, %xmm0, %xmm0 + tbl v1.16b, {v8.16b},v1.16b + ret +.size _vpaes_encrypt_2x,.-_vpaes_encrypt_2x +___ +} +{ +my ($inp,$bits,$out,$dir)=("x0","w1","x2","w3"); +my ($invlo,$invhi,$iptlo,$ipthi,$rcon) = map("v$_.16b",(18..21,8)); + +$code.=<<___; +######################################################## +## ## +## AES key schedule ## +## ## +######################################################## +.type _vpaes_key_preheat,%function +.align 4 +_vpaes_key_preheat: + adrp x10, :pg_hi21:.Lk_inv + add x10, x10, :lo12:.Lk_inv + movi v16.16b, #0x5b // .Lk_s63 + adrp x11, :pg_hi21:.Lk_sb1 + add x11, x11, :lo12:.Lk_sb1 + movi v17.16b, #0x0f // .Lk_s0F + ld1 {v18.2d-v21.2d}, [x10] // .Lk_inv, .Lk_ipt + adrp x10, :pg_hi21:.Lk_dksd + add x10, x10, :lo12:.Lk_dksd + ld1 {v22.2d-v23.2d}, [x11] // .Lk_sb1 + adrp x11, :pg_hi21:.Lk_mc_forward + add x11, x11, 
:lo12:.Lk_mc_forward + ld1 {v24.2d-v27.2d}, [x10],#64 // .Lk_dksd, .Lk_dksb + ld1 {v28.2d-v31.2d}, [x10],#64 // .Lk_dkse, .Lk_dks9 + ld1 {v8.2d}, [x10] // .Lk_rcon + ld1 {v9.2d}, [x11] // .Lk_mc_forward[0] + ret +.size _vpaes_key_preheat,.-_vpaes_key_preheat + +.type _vpaes_schedule_core,%function +.align 4 +_vpaes_schedule_core: + AARCH64_SIGN_LINK_REGISTER + stp x29, x30, [sp,#-16]! + add x29,sp,#0 + + bl _vpaes_key_preheat // load the tables + + ld1 {v0.16b}, [$inp],#16 // vmovdqu (%rdi), %xmm0 # load key (unaligned) + + // input transform + mov v3.16b, v0.16b // vmovdqa %xmm0, %xmm3 + bl _vpaes_schedule_transform + mov v7.16b, v0.16b // vmovdqa %xmm0, %xmm7 + + adrp x10, :pg_hi21:.Lk_sr // lea .Lk_sr(%rip),%r10 + add x10, x10, :lo12:.Lk_sr + + add x8, x8, x10 + + // encrypting, output zeroth round key after transform + st1 {v0.2d}, [$out] // vmovdqu %xmm0, (%rdx) + + cmp $bits, #192 // cmp \$192, %esi + b.hi .Lschedule_256 + b.eq .Lschedule_192 + // 128: fall though + +## +## .schedule_128 +## +## 128-bit specific part of key schedule. +## +## This schedule is really simple, because all its parts +## are accomplished by the subroutines. +## +.Lschedule_128: + mov $inp, #10 // mov \$10, %esi + +.Loop_schedule_128: + sub $inp, $inp, #1 // dec %esi + bl _vpaes_schedule_round + cbz $inp, .Lschedule_mangle_last + bl _vpaes_schedule_mangle // write output + b .Loop_schedule_128 + +## +## .aes_schedule_192 +## +## 192-bit specific part of key schedule. +## +## The main body of this schedule is the same as the 128-bit +## schedule, but with more smearing. The long, high side is +## stored in %xmm7 as before, and the short, low side is in +## the high bits of %xmm6. +## +## This schedule is somewhat nastier, however, because each +## round produces 192 bits of key material, or 1.5 round keys. +## Therefore, on each cycle we do 2 rounds and produce 3 round +## keys. +## +.align 4 +.Lschedule_192: + sub $inp, $inp, #8 + ld1 {v0.16b}, [$inp] // vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned) + bl _vpaes_schedule_transform // input transform + mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save short part + eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4 # clear 4 + ins v6.d[0], v4.d[0] // vmovhlps %xmm4, %xmm6, %xmm6 # clobber low side with zeros + mov $inp, #4 // mov \$4, %esi + +.Loop_schedule_192: + sub $inp, $inp, #1 // dec %esi + bl _vpaes_schedule_round + ext v0.16b, v6.16b, v0.16b, #8 // vpalignr \$8,%xmm6,%xmm0,%xmm0 + bl _vpaes_schedule_mangle // save key n + bl _vpaes_schedule_192_smear + bl _vpaes_schedule_mangle // save key n+1 + bl _vpaes_schedule_round + cbz $inp, .Lschedule_mangle_last + bl _vpaes_schedule_mangle // save key n+2 + bl _vpaes_schedule_192_smear + b .Loop_schedule_192 + +## +## .aes_schedule_256 +## +## 256-bit specific part of key schedule. +## +## The structure here is very similar to the 128-bit +## schedule, but with an additional "low side" in +## %xmm6. The low side's rounds are the same as the +## high side's, except no rcon and no rotation. +## +.align 4 +.Lschedule_256: + ld1 {v0.16b}, [$inp] // vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned) + bl _vpaes_schedule_transform // input transform + mov $inp, #7 // mov \$7, %esi + +.Loop_schedule_256: + sub $inp, $inp, #1 // dec %esi + bl _vpaes_schedule_mangle // output low result + mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6 + + // high round + bl _vpaes_schedule_round + cbz $inp, .Lschedule_mangle_last + bl _vpaes_schedule_mangle + + // low round. 
swap xmm7 and xmm6 + dup v0.4s, v0.s[3] // vpshufd \$0xFF, %xmm0, %xmm0 + movi v4.16b, #0 + mov v5.16b, v7.16b // vmovdqa %xmm7, %xmm5 + mov v7.16b, v6.16b // vmovdqa %xmm6, %xmm7 + bl _vpaes_schedule_low_round + mov v7.16b, v5.16b // vmovdqa %xmm5, %xmm7 + + b .Loop_schedule_256 + +## +## .aes_schedule_mangle_last +## +## Mangler for last round of key schedule +## Mangles %xmm0 +## when encrypting, outputs out(%xmm0) ^ 63 +## when decrypting, outputs unskew(%xmm0) +## +## Always called right before return... jumps to cleanup and exits +## +.align 4 +.Lschedule_mangle_last: + // schedule last round key from xmm0 + adrp x11, :pg_hi21:.Lk_deskew // lea .Lk_deskew(%rip),%r11 # prepare to deskew + add x11, x11, :lo12:.Lk_deskew + + cbnz $dir, .Lschedule_mangle_last_dec + + // encrypting + ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10),%xmm1 + adrp x11, :pg_hi21:.Lk_opt // lea .Lk_opt(%rip), %r11 # prepare to output transform + add x11, x11, :lo12:.Lk_opt + add $out, $out, #32 // add \$32, %rdx + tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0 # output permute + +.Lschedule_mangle_last_dec: + ld1 {v20.2d-v21.2d}, [x11] // reload constants + sub $out, $out, #16 // add \$-16, %rdx + eor v0.16b, v0.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm0, %xmm0 + bl _vpaes_schedule_transform // output transform + st1 {v0.2d}, [$out] // vmovdqu %xmm0, (%rdx) # save last key + + // cleanup + eor v0.16b, v0.16b, v0.16b // vpxor %xmm0, %xmm0, %xmm0 + eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1 + eor v2.16b, v2.16b, v2.16b // vpxor %xmm2, %xmm2, %xmm2 + eor v3.16b, v3.16b, v3.16b // vpxor %xmm3, %xmm3, %xmm3 + eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4 + eor v5.16b, v5.16b, v5.16b // vpxor %xmm5, %xmm5, %xmm5 + eor v6.16b, v6.16b, v6.16b // vpxor %xmm6, %xmm6, %xmm6 + eor v7.16b, v7.16b, v7.16b // vpxor %xmm7, %xmm7, %xmm7 + ldp x29, x30, [sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size _vpaes_schedule_core,.-_vpaes_schedule_core + +## +## .aes_schedule_192_smear +## +## Smear the short, low side in the 192-bit key schedule. +## +## Inputs: +## %xmm7: high side, b a x y +## %xmm6: low side, d c 0 0 +## %xmm13: 0 +## +## Outputs: +## %xmm6: b+c+d b+c 0 0 +## %xmm0: b+c+d b+c b a +## +.type _vpaes_schedule_192_smear,%function +.align 4 +_vpaes_schedule_192_smear: + movi v1.16b, #0 + dup v0.4s, v7.s[3] + ins v1.s[3], v6.s[2] // vpshufd \$0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0 + ins v0.s[0], v7.s[2] // vpshufd \$0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a + eor v6.16b, v6.16b, v1.16b // vpxor %xmm1, %xmm6, %xmm6 # -> c+d c 0 0 + eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1 + eor v6.16b, v6.16b, v0.16b // vpxor %xmm0, %xmm6, %xmm6 # -> b+c+d b+c b a + mov v0.16b, v6.16b // vmovdqa %xmm6, %xmm0 + ins v6.d[0], v1.d[0] // vmovhlps %xmm1, %xmm6, %xmm6 # clobber low side with zeros + ret +.size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear + +## +## .aes_schedule_round +## +## Runs one main round of the key schedule on %xmm0, %xmm7 +## +## Specifically, runs subbytes on the high dword of %xmm0 +## then rotates it by one byte and xors into the low dword of +## %xmm7. +## +## Adds rcon from low byte of %xmm8, then rotates %xmm8 for +## next rcon. +## +## Smears the dwords of %xmm7 by xoring the low into the +## second low, result into third, result into highest. +## +## Returns results in %xmm7 = %xmm0. +## Clobbers %xmm1-%xmm4, %r11. 
+## +.type _vpaes_schedule_round,%function +.align 4 +_vpaes_schedule_round: + // extract rcon from xmm8 + movi v4.16b, #0 // vpxor %xmm4, %xmm4, %xmm4 + ext v1.16b, $rcon, v4.16b, #15 // vpalignr \$15, %xmm8, %xmm4, %xmm1 + ext $rcon, $rcon, $rcon, #15 // vpalignr \$15, %xmm8, %xmm8, %xmm8 + eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7 + + // rotate + dup v0.4s, v0.s[3] // vpshufd \$0xFF, %xmm0, %xmm0 + ext v0.16b, v0.16b, v0.16b, #1 // vpalignr \$1, %xmm0, %xmm0, %xmm0 + + // fall through... + + // low round: same as high round, but no rotation and no rcon. +_vpaes_schedule_low_round: + // smear xmm7 + ext v1.16b, v4.16b, v7.16b, #12 // vpslldq \$4, %xmm7, %xmm1 + eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7 + ext v4.16b, v4.16b, v7.16b, #8 // vpslldq \$8, %xmm7, %xmm4 + + // subbytes + and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k + ushr v0.16b, v0.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 # 1 = i + eor v7.16b, v7.16b, v4.16b // vpxor %xmm4, %xmm7, %xmm7 + tbl v2.16b, {$invhi}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k + eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j + tbl v3.16b, {$invlo}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + tbl v4.16b, {$invlo}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + eor v7.16b, v7.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm7, %xmm7 + tbl v3.16b, {$invlo}, v3.16b // vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak + eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + tbl v2.16b, {$invlo}, v4.16b // vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak + eor v3.16b, v3.16b, v1.16b // vpxor %xmm1, %xmm3, %xmm3 # 2 = io + eor v2.16b, v2.16b, v0.16b // vpxor %xmm0, %xmm2, %xmm2 # 3 = jo + tbl v4.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou + tbl v1.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t + eor v1.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output + + // add in smeared stuff + eor v0.16b, v1.16b, v7.16b // vpxor %xmm7, %xmm1, %xmm0 + eor v7.16b, v1.16b, v7.16b // vmovdqa %xmm0, %xmm7 + ret +.size _vpaes_schedule_round,.-_vpaes_schedule_round + +## +## .aes_schedule_transform +## +## Linear-transform %xmm0 according to tables at (%r11) +## +## Requires that %xmm9 = 0x0F0F... as in preheat +## Output in %xmm0 +## Clobbers %xmm1, %xmm2 +## +.type _vpaes_schedule_transform,%function +.align 4 +_vpaes_schedule_transform: + and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 + ushr v0.16b, v0.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 + // vmovdqa (%r11), %xmm2 # lo + tbl v2.16b, {$iptlo}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2 + // vmovdqa 16(%r11), %xmm1 # hi + tbl v0.16b, {$ipthi}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0 + eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 + ret +.size _vpaes_schedule_transform,.-_vpaes_schedule_transform + +## +## .aes_schedule_mangle +## +## Mangle xmm0 from (basis-transformed) standard version +## to our version. 
+## +## On encrypt, +## xor with 0x63 +## multiply by circulant 0,1,1,1 +## apply shiftrows transform +## +## On decrypt, +## xor with 0x63 +## multiply by "inverse mixcolumns" circulant E,B,D,9 +## deskew +## apply shiftrows transform +## +## +## Writes out to (%rdx), and increments or decrements it +## Keeps track of round number mod 4 in %r8 +## Preserves xmm0 +## Clobbers xmm1-xmm5 +## +.type _vpaes_schedule_mangle,%function +.align 4 +_vpaes_schedule_mangle: + mov v4.16b, v0.16b // vmovdqa %xmm0, %xmm4 # save xmm0 for later + // vmovdqa .Lk_mc_forward(%rip),%xmm5 + + // encrypting + eor v4.16b, v0.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm0, %xmm4 + add $out, $out, #16 // add \$16, %rdx + tbl v4.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm4 + tbl v1.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm1 + tbl v3.16b, {v1.16b}, v9.16b // vpshufb %xmm5, %xmm1, %xmm3 + eor v4.16b, v4.16b, v1.16b // vpxor %xmm1, %xmm4, %xmm4 + ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1 + eor v3.16b, v3.16b, v4.16b // vpxor %xmm4, %xmm3, %xmm3 + +.Lschedule_mangle_both: + tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 + add x8, x8, #48 // add \$-16, %r8 + and x8, x8, #~(1<<6) // and \$0x30, %r8 + st1 {v3.2d}, [$out] // vmovdqu %xmm3, (%rdx) + ret +.size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle + +.globl vpaes_set_encrypt_key +.type vpaes_set_encrypt_key,%function +.align 4 +vpaes_set_encrypt_key: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + stp d8,d9,[sp,#-16]! // ABI spec says so + + lsr w9, $bits, #5 // shr \$5,%eax + add w9, w9, #5 // \$5,%eax + str w9, [$out,#240] // mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; + + mov $dir, #0 // mov \$0,%ecx + mov x8, #0x30 // mov \$0x30,%r8d + bl _vpaes_schedule_core + eor x0, x0, x0 + + ldp d8,d9,[sp],#16 + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size vpaes_set_encrypt_key,.-vpaes_set_encrypt_key +___ +} +{ +my ($inp,$out,$len,$key,$ivec) = map("x$_",(0..4)); +my ($ctr, $ctr_tmp) = ("w6", "w7"); + +# void vpaes_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out, size_t len, +# const AES_KEY *key, const uint8_t ivec[16]); +$code.=<<___; +.globl vpaes_ctr32_encrypt_blocks +.type vpaes_ctr32_encrypt_blocks,%function +.align 4 +vpaes_ctr32_encrypt_blocks: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + stp d8,d9,[sp,#-16]! // ABI spec says so + stp d10,d11,[sp,#-16]! + stp d12,d13,[sp,#-16]! + stp d14,d15,[sp,#-16]! + + cbz $len, .Lctr32_done + + // Note, unlike the other functions, $len here is measured in blocks, + // not bytes. + mov x17, $len + mov x2, $key + + // Load the IV and counter portion. + ldr $ctr, [$ivec, #12] + ld1 {v7.16b}, [$ivec] + + bl _vpaes_encrypt_preheat + tst x17, #1 + rev $ctr, $ctr // The counter is big-endian. + b.eq .Lctr32_prep_loop + + // Handle one block so the remaining block count is even for + // _vpaes_encrypt_2x. + ld1 {v6.16b}, [$inp], #16 // Load input ahead of time + bl _vpaes_encrypt_core + eor v0.16b, v0.16b, v6.16b // XOR input and result + st1 {v0.16b}, [$out], #16 + subs x17, x17, #1 + // Update the counter. + add $ctr, $ctr, #1 + rev $ctr_tmp, $ctr + mov v7.s[3], $ctr_tmp + b.ls .Lctr32_done + +.Lctr32_prep_loop: + // _vpaes_encrypt_core takes its input from v7, while _vpaes_encrypt_2x + // uses v14 and v15. 
+ mov v15.16b, v7.16b + mov v14.16b, v7.16b + add $ctr, $ctr, #1 + rev $ctr_tmp, $ctr + mov v15.s[3], $ctr_tmp + +.Lctr32_loop: + ld1 {v6.16b,v7.16b}, [$inp], #32 // Load input ahead of time + bl _vpaes_encrypt_2x + eor v0.16b, v0.16b, v6.16b // XOR input and result + eor v1.16b, v1.16b, v7.16b // XOR input and result (#2) + st1 {v0.16b,v1.16b}, [$out], #32 + subs x17, x17, #2 + // Update the counter. + add $ctr_tmp, $ctr, #1 + add $ctr, $ctr, #2 + rev $ctr_tmp, $ctr_tmp + mov v14.s[3], $ctr_tmp + rev $ctr_tmp, $ctr + mov v15.s[3], $ctr_tmp + b.hi .Lctr32_loop + +.Lctr32_done: + ldp d14,d15,[sp],#16 + ldp d12,d13,[sp],#16 + ldp d10,d11,[sp],#16 + ldp d8,d9,[sp],#16 + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size vpaes_ctr32_encrypt_blocks,.-vpaes_ctr32_encrypt_blocks +___ +} + +print $code; + +close STDOUT or die "error closing STDOUT: $!"; diff --git a/ring-0.17.14/crypto/fipsmodule/aes/asm/vpaes-x86.pl b/ring-0.17.14/crypto/fipsmodule/aes/asm/vpaes-x86.pl new file mode 100644 index 0000000000..6410e7a752 --- /dev/null +++ b/ring-0.17.14/crypto/fipsmodule/aes/asm/vpaes-x86.pl @@ -0,0 +1,617 @@ +#! /usr/bin/env perl +# Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +###################################################################### +## Constant-time SSSE3 AES core implementation. +## version 0.1 +## +## By Mike Hamburg (Stanford University), 2009 +## Public domain. +## +## For details see http://shiftleft.org/papers/vector_aes/ and +## http://crypto.stanford.edu/vpaes/. + +###################################################################### +# September 2011. +# +# Port vpaes-x86_64.pl as 32-bit "almost" drop-in replacement for +# aes-586.pl. "Almost" refers to the fact that AES_cbc_encrypt +# doesn't handle partial vectors (doesn't have to if called from +# EVP only). "Drop-in" implies that this module doesn't share key +# schedule structure with the original nor does it make assumption +# about its alignment... +# +# Performance summary. aes-586.pl column lists large-block CBC +# encrypt/decrypt/with-hyper-threading-off(*) results in cycles per +# byte processed with 128-bit key, and vpaes-x86.pl column - [also +# large-block CBC] encrypt/decrypt. +# +# aes-586.pl vpaes-x86.pl +# +# Core 2(**) 28.1/41.4/18.3 21.9/25.2(***) +# Nehalem 27.9/40.4/18.1 10.2/11.9 +# Atom 70.7/92.1/60.1 61.1/75.4(***) +# Silvermont 45.4/62.9/24.1 49.2/61.1(***) +# +# (*) "Hyper-threading" in the context refers rather to cache shared +# among multiple cores, than to specifically Intel HTT. As vast +# majority of contemporary cores share cache, slower code path +# is common place. In other words "with-hyper-threading-off" +# results are presented mostly for reference purposes. +# +# (**) "Core 2" refers to initial 65nm design, a.k.a. Conroe. 
+# +# (***) Less impressive improvement on Core 2 and Atom is due to slow +# pshufb, yet it's respectable +28%/64% improvement on Core 2 +# and +15% on Atom (as implied, over "hyper-threading-safe" +# code path). +# +# + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +push(@INC,"${dir}","${dir}../../../perlasm"); +require "x86asm.pl"; + +$output = pop; +open OUT,">$output"; +*STDOUT=*OUT; + +&asm_init($ARGV[0]); + +$PREFIX="vpaes"; + +my ($round, $base, $magic, $key, $const, $inp, $out)= + ("eax", "ebx", "ecx", "edx","ebp", "esi","edi"); + +&preprocessor_ifdef("BORINGSSL_DISPATCH_TEST") +&external_label("BORINGSSL_function_hit"); +&preprocessor_endif(); +&static_label("_vpaes_consts"); +&static_label("_vpaes_schedule_low_round"); + +&set_label("_vpaes_consts",64); +$k_inv=-0x30; # inv, inva + &data_word(0x0D080180,0x0E05060F,0x0A0B0C02,0x04070309); + &data_word(0x0F0B0780,0x01040A06,0x02050809,0x030D0E0C); + +$k_s0F=-0x10; # s0F + &data_word(0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F); + +$k_ipt=0x00; # input transform (lo, hi) + &data_word(0x5A2A7000,0xC2B2E898,0x52227808,0xCABAE090); + &data_word(0x317C4D00,0x4C01307D,0xB0FDCC81,0xCD80B1FC); + +$k_sb1=0x20; # sb1u, sb1t + &data_word(0xCB503E00,0xB19BE18F,0x142AF544,0xA5DF7A6E); + &data_word(0xFAE22300,0x3618D415,0x0D2ED9EF,0x3BF7CCC1); +$k_sb2=0x40; # sb2u, sb2t + &data_word(0x0B712400,0xE27A93C6,0xBC982FCD,0x5EB7E955); + &data_word(0x0AE12900,0x69EB8840,0xAB82234A,0xC2A163C8); +$k_sbo=0x60; # sbou, sbot + &data_word(0x6FBDC700,0xD0D26D17,0xC502A878,0x15AABF7A); + &data_word(0x5FBB6A00,0xCFE474A5,0x412B35FA,0x8E1E90D1); + +$k_mc_forward=0x80; # mc_forward + &data_word(0x00030201,0x04070605,0x080B0A09,0x0C0F0E0D); + &data_word(0x04070605,0x080B0A09,0x0C0F0E0D,0x00030201); + &data_word(0x080B0A09,0x0C0F0E0D,0x00030201,0x04070605); + &data_word(0x0C0F0E0D,0x00030201,0x04070605,0x080B0A09); + +$k_mc_backward=0xc0; # mc_backward + &data_word(0x02010003,0x06050407,0x0A09080B,0x0E0D0C0F); + &data_word(0x0E0D0C0F,0x02010003,0x06050407,0x0A09080B); + &data_word(0x0A09080B,0x0E0D0C0F,0x02010003,0x06050407); + &data_word(0x06050407,0x0A09080B,0x0E0D0C0F,0x02010003); + +$k_sr=0x100; # sr + &data_word(0x03020100,0x07060504,0x0B0A0908,0x0F0E0D0C); + &data_word(0x0F0A0500,0x030E0904,0x07020D08,0x0B06010C); + &data_word(0x0B020900,0x0F060D04,0x030A0108,0x070E050C); + &data_word(0x070A0D00,0x0B0E0104,0x0F020508,0x0306090C); + +$k_rcon=0x140; # rcon + &data_word(0xAF9DEEB6,0x1F8391B9,0x4D7C7D81,0x702A9808); + +$k_s63=0x150; # s63: all equal to 0x63 transformed + &data_word(0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B); + +$k_opt=0x160; # output transform + &data_word(0xD6B66000,0xFF9F4929,0xDEBE6808,0xF7974121); + &data_word(0x50BCEC00,0x01EDBD51,0xB05C0CE0,0xE10D5DB1); + +$k_deskew=0x180; # deskew tables: inverts the sbox's "skew" + &data_word(0x47A4E300,0x07E4A340,0x5DBEF91A,0x1DFEB95A); + &data_word(0x83EA6900,0x5F36B5DC,0xF49D1E77,0x2841C2AB); + +&asciz ("Vector Permutation AES for x86/SSSE3, Mike Hamburg (Stanford University)"); +&align (64); + +&function_begin_B("_vpaes_preheat"); + &add ($const,&DWP(0,"esp")); + &movdqa ("xmm7",&QWP($k_inv,$const)); + &movdqa ("xmm6",&QWP($k_s0F,$const)); + &ret (); +&function_end_B("_vpaes_preheat"); + +## +## _aes_encrypt_core +## +## AES-encrypt %xmm0. 
+## +## Inputs: +## %xmm0 = input +## %xmm6-%xmm7 as in _vpaes_preheat +## (%edx) = scheduled keys +## +## Output in %xmm0 +## Clobbers %xmm1-%xmm5, %eax, %ebx, %ecx, %edx +## +## +&function_begin_B("_vpaes_encrypt_core"); + &mov ($magic,16); + &mov ($round,&DWP(240,$key)); + &movdqa ("xmm1","xmm6") + &movdqa ("xmm2",&QWP($k_ipt,$const)); + &pandn ("xmm1","xmm0"); + &pand ("xmm0","xmm6"); + &movdqu ("xmm5",&QWP(0,$key)); + &pshufb ("xmm2","xmm0"); + &movdqa ("xmm0",&QWP($k_ipt+16,$const)); + &pxor ("xmm2","xmm5"); + &psrld ("xmm1",4); + &add ($key,16); + &pshufb ("xmm0","xmm1"); + &lea ($base,&DWP($k_mc_backward,$const)); + &pxor ("xmm0","xmm2"); + &jmp (&label("enc_entry")); + + +&set_label("enc_loop",16); + # middle of middle round + &movdqa ("xmm4",&QWP($k_sb1,$const)); # 4 : sb1u + &movdqa ("xmm0",&QWP($k_sb1+16,$const));# 0 : sb1t + &pshufb ("xmm4","xmm2"); # 4 = sb1u + &pshufb ("xmm0","xmm3"); # 0 = sb1t + &pxor ("xmm4","xmm5"); # 4 = sb1u + k + &movdqa ("xmm5",&QWP($k_sb2,$const)); # 4 : sb2u + &pxor ("xmm0","xmm4"); # 0 = A + &movdqa ("xmm1",&QWP(-0x40,$base,$magic));# .Lk_mc_forward[] + &pshufb ("xmm5","xmm2"); # 4 = sb2u + &movdqa ("xmm2",&QWP($k_sb2+16,$const));# 2 : sb2t + &movdqa ("xmm4",&QWP(0,$base,$magic)); # .Lk_mc_backward[] + &pshufb ("xmm2","xmm3"); # 2 = sb2t + &movdqa ("xmm3","xmm0"); # 3 = A + &pxor ("xmm2","xmm5"); # 2 = 2A + &pshufb ("xmm0","xmm1"); # 0 = B + &add ($key,16); # next key + &pxor ("xmm0","xmm2"); # 0 = 2A+B + &pshufb ("xmm3","xmm4"); # 3 = D + &add ($magic,16); # next mc + &pxor ("xmm3","xmm0"); # 3 = 2A+B+D + &pshufb ("xmm0","xmm1"); # 0 = 2B+C + &and ($magic,0x30); # ... mod 4 + &sub ($round,1); # nr-- + &pxor ("xmm0","xmm3"); # 0 = 2A+3B+C+D + +&set_label("enc_entry"); + # top of round + &movdqa ("xmm1","xmm6"); # 1 : i + &movdqa ("xmm5",&QWP($k_inv+16,$const));# 2 : a/k + &pandn ("xmm1","xmm0"); # 1 = i<<4 + &psrld ("xmm1",4); # 1 = i + &pand ("xmm0","xmm6"); # 0 = k + &pshufb ("xmm5","xmm0"); # 2 = a/k + &movdqa ("xmm3","xmm7"); # 3 : 1/i + &pxor ("xmm0","xmm1"); # 0 = j + &pshufb ("xmm3","xmm1"); # 3 = 1/i + &movdqa ("xmm4","xmm7"); # 4 : 1/j + &pxor ("xmm3","xmm5"); # 3 = iak = 1/i + a/k + &pshufb ("xmm4","xmm0"); # 4 = 1/j + &movdqa ("xmm2","xmm7"); # 2 : 1/iak + &pxor ("xmm4","xmm5"); # 4 = jak = 1/j + a/k + &pshufb ("xmm2","xmm3"); # 2 = 1/iak + &movdqa ("xmm3","xmm7"); # 3 : 1/jak + &pxor ("xmm2","xmm0"); # 2 = io + &pshufb ("xmm3","xmm4"); # 3 = 1/jak + &movdqu ("xmm5",&QWP(0,$key)); + &pxor ("xmm3","xmm1"); # 3 = jo + &jnz (&label("enc_loop")); + + # middle of last round + &movdqa ("xmm4",&QWP($k_sbo,$const)); # 3 : sbou .Lk_sbo + &movdqa ("xmm0",&QWP($k_sbo+16,$const));# 3 : sbot .Lk_sbo+16 + &pshufb ("xmm4","xmm2"); # 4 = sbou + &pxor ("xmm4","xmm5"); # 4 = sb1u + k + &pshufb ("xmm0","xmm3"); # 0 = sb1t + &movdqa ("xmm1",&QWP(0x40,$base,$magic));# .Lk_sr[] + &pxor ("xmm0","xmm4"); # 0 = A + &pshufb ("xmm0","xmm1"); + &ret (); +&function_end_B("_vpaes_encrypt_core"); + +######################################################## +## ## +## AES key schedule ## +## ## +######################################################## +&function_begin_B("_vpaes_schedule_core"); + &add ($const,&DWP(0,"esp")); + &movdqu ("xmm0",&QWP(0,$inp)); # load key (unaligned) + &movdqa ("xmm2",&QWP($k_rcon,$const)); # load rcon + + # input transform + &movdqa ("xmm3","xmm0"); + &lea ($base,&DWP($k_ipt,$const)); + &movdqa (&QWP(4,"esp"),"xmm2"); # xmm8 + &call ("_vpaes_schedule_transform"); + &movdqa ("xmm7","xmm0"); + + &test ($out,$out); + &jnz 
(&label("schedule_am_decrypting")); + + # encrypting, output zeroth round key after transform + &movdqu (&QWP(0,$key),"xmm0"); + &jmp (&label("schedule_go")); + +&set_label("schedule_am_decrypting"); + # decrypting, output zeroth round key after shiftrows + &movdqa ("xmm1",&QWP($k_sr,$const,$magic)); + &pshufb ("xmm3","xmm1"); + &movdqu (&QWP(0,$key),"xmm3"); + &xor ($magic,0x30); + +&set_label("schedule_go"); + &cmp ($round,192); + &ja (&label("schedule_256")); + # 192-bit key support was removed. + # 128: fall though + +## +## .schedule_128 +## +## 128-bit specific part of key schedule. +## +## This schedule is really simple, because all its parts +## are accomplished by the subroutines. +## +&set_label("schedule_128"); + &mov ($round,10); + +&set_label("loop_schedule_128"); + &call ("_vpaes_schedule_round"); + &dec ($round); + &jz (&label("schedule_mangle_last")); + &call ("_vpaes_schedule_mangle"); # write output + &jmp (&label("loop_schedule_128")); + +## +## .aes_schedule_256 +## +## 256-bit specific part of key schedule. +## +## The structure here is very similar to the 128-bit +## schedule, but with an additional "low side" in +## %xmm6. The low side's rounds are the same as the +## high side's, except no rcon and no rotation. +## +&set_label("schedule_256",16); + &movdqu ("xmm0",&QWP(16,$inp)); # load key part 2 (unaligned) + &call ("_vpaes_schedule_transform"); # input transform + &mov ($round,7); + +&set_label("loop_schedule_256"); + &call ("_vpaes_schedule_mangle"); # output low result + &movdqa ("xmm6","xmm0"); # save cur_lo in xmm6 + + # high round + &call ("_vpaes_schedule_round"); + &dec ($round); + &jz (&label("schedule_mangle_last")); + &call ("_vpaes_schedule_mangle"); + + # low round. swap xmm7 and xmm6 + &pshufd ("xmm0","xmm0",0xFF); + &movdqa (&QWP(20,"esp"),"xmm7"); + &movdqa ("xmm7","xmm6"); + &call ("_vpaes_schedule_low_round"); + &movdqa ("xmm7",&QWP(20,"esp")); + + &jmp (&label("loop_schedule_256")); + +## +## .aes_schedule_mangle_last +## +## Mangler for last round of key schedule +## Mangles %xmm0 +## when encrypting, outputs out(%xmm0) ^ 63 +## when decrypting, outputs unskew(%xmm0) +## +## Always called right before return... jumps to cleanup and exits +## +&set_label("schedule_mangle_last",16); + # schedule last round key from xmm0 + &lea ($base,&DWP($k_deskew,$const)); + &test ($out,$out); + &jnz (&label("schedule_mangle_last_dec")); + + # encrypting + &movdqa ("xmm1",&QWP($k_sr,$const,$magic)); + &pshufb ("xmm0","xmm1"); # output permute + &lea ($base,&DWP($k_opt,$const)); # prepare to output transform + &add ($key,32); + +&set_label("schedule_mangle_last_dec"); + &add ($key,-16); + &pxor ("xmm0",&QWP($k_s63,$const)); + &call ("_vpaes_schedule_transform"); # output transform + &movdqu (&QWP(0,$key),"xmm0"); # save last key + + # cleanup + &pxor ("xmm0","xmm0"); + &pxor ("xmm1","xmm1"); + &pxor ("xmm2","xmm2"); + &pxor ("xmm3","xmm3"); + &pxor ("xmm4","xmm4"); + &pxor ("xmm5","xmm5"); + &pxor ("xmm6","xmm6"); + &pxor ("xmm7","xmm7"); + &ret (); +&function_end_B("_vpaes_schedule_core"); + +## +## .aes_schedule_round +## +## Runs one main round of the key schedule on %xmm0, %xmm7 +## +## Specifically, runs subbytes on the high dword of %xmm0 +## then rotates it by one byte and xors into the low dword of +## %xmm7. +## +## Adds rcon from low byte of %xmm8, then rotates %xmm8 for +## next rcon. +## +## Smears the dwords of %xmm7 by xoring the low into the +## second low, result into third, result into highest. +## +## Returns results in %xmm7 = %xmm0. 
+## Clobbers %xmm1-%xmm5. +## +&function_begin_B("_vpaes_schedule_round"); + # extract rcon from xmm8 + &movdqa ("xmm2",&QWP(8,"esp")); # xmm8 + &pxor ("xmm1","xmm1"); + &palignr("xmm1","xmm2",15); + &palignr("xmm2","xmm2",15); + &pxor ("xmm7","xmm1"); + + # rotate + &pshufd ("xmm0","xmm0",0xFF); + &palignr("xmm0","xmm0",1); + + # fall through... + &movdqa (&QWP(8,"esp"),"xmm2"); # xmm8 + + # low round: same as high round, but no rotation and no rcon. +&set_label("_vpaes_schedule_low_round"); + # smear xmm7 + &movdqa ("xmm1","xmm7"); + &pslldq ("xmm7",4); + &pxor ("xmm7","xmm1"); + &movdqa ("xmm1","xmm7"); + &pslldq ("xmm7",8); + &pxor ("xmm7","xmm1"); + &pxor ("xmm7",&QWP($k_s63,$const)); + + # subbyte + &movdqa ("xmm4",&QWP($k_s0F,$const)); + &movdqa ("xmm5",&QWP($k_inv,$const)); # 4 : 1/j + &movdqa ("xmm1","xmm4"); + &pandn ("xmm1","xmm0"); + &psrld ("xmm1",4); # 1 = i + &pand ("xmm0","xmm4"); # 0 = k + &movdqa ("xmm2",&QWP($k_inv+16,$const));# 2 : a/k + &pshufb ("xmm2","xmm0"); # 2 = a/k + &pxor ("xmm0","xmm1"); # 0 = j + &movdqa ("xmm3","xmm5"); # 3 : 1/i + &pshufb ("xmm3","xmm1"); # 3 = 1/i + &pxor ("xmm3","xmm2"); # 3 = iak = 1/i + a/k + &movdqa ("xmm4","xmm5"); # 4 : 1/j + &pshufb ("xmm4","xmm0"); # 4 = 1/j + &pxor ("xmm4","xmm2"); # 4 = jak = 1/j + a/k + &movdqa ("xmm2","xmm5"); # 2 : 1/iak + &pshufb ("xmm2","xmm3"); # 2 = 1/iak + &pxor ("xmm2","xmm0"); # 2 = io + &movdqa ("xmm3","xmm5"); # 3 : 1/jak + &pshufb ("xmm3","xmm4"); # 3 = 1/jak + &pxor ("xmm3","xmm1"); # 3 = jo + &movdqa ("xmm4",&QWP($k_sb1,$const)); # 4 : sbou + &pshufb ("xmm4","xmm2"); # 4 = sbou + &movdqa ("xmm0",&QWP($k_sb1+16,$const));# 0 : sbot + &pshufb ("xmm0","xmm3"); # 0 = sb1t + &pxor ("xmm0","xmm4"); # 0 = sbox output + + # add in smeared stuff + &pxor ("xmm0","xmm7"); + &movdqa ("xmm7","xmm0"); + &ret (); +&function_end_B("_vpaes_schedule_round"); + +## +## .aes_schedule_transform +## +## Linear-transform %xmm0 according to tables at (%ebx) +## +## Output in %xmm0 +## Clobbers %xmm1, %xmm2 +## +&function_begin_B("_vpaes_schedule_transform"); + &movdqa ("xmm2",&QWP($k_s0F,$const)); + &movdqa ("xmm1","xmm2"); + &pandn ("xmm1","xmm0"); + &psrld ("xmm1",4); + &pand ("xmm0","xmm2"); + &movdqa ("xmm2",&QWP(0,$base)); + &pshufb ("xmm2","xmm0"); + &movdqa ("xmm0",&QWP(16,$base)); + &pshufb ("xmm0","xmm1"); + &pxor ("xmm0","xmm2"); + &ret (); +&function_end_B("_vpaes_schedule_transform"); + +## +## .aes_schedule_mangle +## +## Mangle xmm0 from (basis-transformed) standard version +## to our version. 
+## +## On encrypt, +## xor with 0x63 +## multiply by circulant 0,1,1,1 +## apply shiftrows transform +## +## On decrypt, +## xor with 0x63 +## multiply by "inverse mixcolumns" circulant E,B,D,9 +## deskew +## apply shiftrows transform +## +## +## Writes out to (%edx), and increments or decrements it +## Keeps track of round number mod 4 in %ecx +## Preserves xmm0 +## Clobbers xmm1-xmm5 +## +&function_begin_B("_vpaes_schedule_mangle"); + &movdqa ("xmm4","xmm0"); # save xmm0 for later + &movdqa ("xmm5",&QWP($k_mc_forward,$const)); + &test ($out,$out); + &jnz (&label("schedule_mangle_dec")); + + # encrypting + &add ($key,16); + &pxor ("xmm4",&QWP($k_s63,$const)); + &pshufb ("xmm4","xmm5"); + &movdqa ("xmm3","xmm4"); + &pshufb ("xmm4","xmm5"); + &pxor ("xmm3","xmm4"); + &pshufb ("xmm4","xmm5"); + &pxor ("xmm3","xmm4"); + + &jmp (&label("schedule_mangle_both")); + +&set_label("schedule_mangle_dec",16); + # inverse mix columns + &movdqa ("xmm2",&QWP($k_s0F,$const)); + &lea ($inp,&DWP($k_dksd,$const)); + &movdqa ("xmm1","xmm2"); + &pandn ("xmm1","xmm4"); + &psrld ("xmm1",4); # 1 = hi + &pand ("xmm4","xmm2"); # 4 = lo + + &movdqa ("xmm2",&QWP(0,$inp)); + &pshufb ("xmm2","xmm4"); + &movdqa ("xmm3",&QWP(0x10,$inp)); + &pshufb ("xmm3","xmm1"); + &pxor ("xmm3","xmm2"); + &pshufb ("xmm3","xmm5"); + + &movdqa ("xmm2",&QWP(0x20,$inp)); + &pshufb ("xmm2","xmm4"); + &pxor ("xmm2","xmm3"); + &movdqa ("xmm3",&QWP(0x30,$inp)); + &pshufb ("xmm3","xmm1"); + &pxor ("xmm3","xmm2"); + &pshufb ("xmm3","xmm5"); + + &movdqa ("xmm2",&QWP(0x40,$inp)); + &pshufb ("xmm2","xmm4"); + &pxor ("xmm2","xmm3"); + &movdqa ("xmm3",&QWP(0x50,$inp)); + &pshufb ("xmm3","xmm1"); + &pxor ("xmm3","xmm2"); + &pshufb ("xmm3","xmm5"); + + &movdqa ("xmm2",&QWP(0x60,$inp)); + &pshufb ("xmm2","xmm4"); + &pxor ("xmm2","xmm3"); + &movdqa ("xmm3",&QWP(0x70,$inp)); + &pshufb ("xmm3","xmm1"); + &pxor ("xmm3","xmm2"); + + &add ($key,-16); + +&set_label("schedule_mangle_both"); + &movdqa ("xmm1",&QWP($k_sr,$const,$magic)); + &pshufb ("xmm3","xmm1"); + &add ($magic,-16); + &and ($magic,0x30); + &movdqu (&QWP(0,$key),"xmm3"); + &ret (); +&function_end_B("_vpaes_schedule_mangle"); + +# +# Interface to OpenSSL +# +&function_begin("${PREFIX}_set_encrypt_key"); + record_function_hit(5); + + &mov ($inp,&wparam(0)); # inp + &lea ($base,&DWP(-56,"esp")); + &mov ($round,&wparam(1)); # bits + &and ($base,-16); + &mov ($key,&wparam(2)); # key + &xchg ($base,"esp"); # alloca + &mov (&DWP(48,"esp"),$base); + + &mov ($base,$round); + &shr ($base,5); + &add ($base,5); + &mov (&DWP(240,$key),$base); # AES_KEY->rounds = nbits/32+5; + &mov ($magic,0x30); + &mov ($out,0); + + &lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point"))); + &call ("_vpaes_schedule_core"); +&set_label("pic_point"); + + &mov ("esp",&DWP(48,"esp")); + &xor ("eax","eax"); +&function_end("${PREFIX}_set_encrypt_key"); + +&function_begin("${PREFIX}_encrypt"); + record_function_hit(4); + + &lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point"))); + &call ("_vpaes_preheat"); +&set_label("pic_point"); + &mov ($inp,&wparam(0)); # inp + &lea ($base,&DWP(-56,"esp")); + &mov ($out,&wparam(1)); # out + &and ($base,-16); + &mov ($key,&wparam(2)); # key + &xchg ($base,"esp"); # alloca + &mov (&DWP(48,"esp"),$base); + + &movdqu ("xmm0",&QWP(0,$inp)); + &call ("_vpaes_encrypt_core"); + &movdqu (&QWP(0,$out),"xmm0"); + + &mov ("esp",&DWP(48,"esp")); +&function_end("${PREFIX}_encrypt"); + +&asm_finish(); + +close STDOUT or die "error closing STDOUT: $!"; diff --git 
a/ring-0.17.14/crypto/fipsmodule/aes/asm/vpaes-x86_64.pl b/ring-0.17.14/crypto/fipsmodule/aes/asm/vpaes-x86_64.pl new file mode 100644 index 0000000000..09c1ba37ba --- /dev/null +++ b/ring-0.17.14/crypto/fipsmodule/aes/asm/vpaes-x86_64.pl @@ -0,0 +1,1023 @@ +#! /usr/bin/env perl +# Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +###################################################################### +## Constant-time SSSE3 AES core implementation. +## version 0.1 +## +## By Mike Hamburg (Stanford University), 2009 +## Public domain. +## +## For details see http://shiftleft.org/papers/vector_aes/ and +## http://crypto.stanford.edu/vpaes/. + +###################################################################### +# September 2011. +# +# Interface to OpenSSL as "almost" drop-in replacement for +# aes-x86_64.pl. "Almost" refers to the fact that AES_cbc_encrypt +# doesn't handle partial vectors (doesn't have to if called from +# EVP only). "Drop-in" implies that this module doesn't share key +# schedule structure with the original nor does it make assumption +# about its alignment... +# +# Performance summary. aes-x86_64.pl column lists large-block CBC +# encrypt/decrypt/with-hyper-threading-off(*) results in cycles per +# byte processed with 128-bit key, and vpaes-x86_64.pl column - +# [also large-block CBC] encrypt/decrypt. +# +# aes-x86_64.pl vpaes-x86_64.pl +# +# Core 2(**) 29.6/41.1/14.3 21.9/25.2(***) +# Nehalem 29.6/40.3/14.6 10.0/11.8 +# Atom 57.3/74.2/32.1 60.9/77.2(***) +# Silvermont 52.7/64.0/19.5 48.8/60.8(***) +# Goldmont 38.9/49.0/17.8 10.6/12.6 +# +# (*) "Hyper-threading" in the context refers rather to cache shared +# among multiple cores, than to specifically Intel HTT. As vast +# majority of contemporary cores share cache, slower code path +# is common place. In other words "with-hyper-threading-off" +# results are presented mostly for reference purposes. +# +# (**) "Core 2" refers to initial 65nm design, a.k.a. Conroe. +# +# (***) Less impressive improvement on Core 2 and Atom is due to slow +# pshufb, yet it's respectable +36%/62% improvement on Core 2 +# (as implied, over "hyper-threading-safe" code path). +# +# + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; +*STDOUT=*OUT; + +$PREFIX="vpaes"; + +$code.=<<___; +.text + +## +## _aes_encrypt_core +## +## AES-encrypt %xmm0. 
+## +## Inputs: +## %xmm0 = input +## %xmm9-%xmm15 as in _vpaes_preheat +## (%rdx) = scheduled keys +## +## Output in %xmm0 +## Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax +## Preserves %xmm6 - %xmm8 so you get some local vectors +## +## +.type _vpaes_encrypt_core,\@abi-omnipotent +.align 16 +_vpaes_encrypt_core: +.cfi_startproc + mov %rdx, %r9 + mov \$16, %r11 + mov 240(%rdx),%eax + movdqa %xmm9, %xmm1 + movdqa .Lk_ipt(%rip), %xmm2 # iptlo + pandn %xmm0, %xmm1 + movdqu (%r9), %xmm5 # round0 key + psrld \$4, %xmm1 + pand %xmm9, %xmm0 + pshufb %xmm0, %xmm2 + movdqa .Lk_ipt+16(%rip), %xmm0 # ipthi + pshufb %xmm1, %xmm0 + pxor %xmm5, %xmm2 + add \$16, %r9 + pxor %xmm2, %xmm0 + lea .Lk_mc_backward(%rip),%r10 + jmp .Lenc_entry + +.align 16 +.Lenc_loop: + # middle of middle round + movdqa %xmm13, %xmm4 # 4 : sb1u + movdqa %xmm12, %xmm0 # 0 : sb1t + pshufb %xmm2, %xmm4 # 4 = sb1u + pshufb %xmm3, %xmm0 # 0 = sb1t + pxor %xmm5, %xmm4 # 4 = sb1u + k + movdqa %xmm15, %xmm5 # 4 : sb2u + pxor %xmm4, %xmm0 # 0 = A + movdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[] + pshufb %xmm2, %xmm5 # 4 = sb2u + movdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[] + movdqa %xmm14, %xmm2 # 2 : sb2t + pshufb %xmm3, %xmm2 # 2 = sb2t + movdqa %xmm0, %xmm3 # 3 = A + pxor %xmm5, %xmm2 # 2 = 2A + pshufb %xmm1, %xmm0 # 0 = B + add \$16, %r9 # next key + pxor %xmm2, %xmm0 # 0 = 2A+B + pshufb %xmm4, %xmm3 # 3 = D + add \$16, %r11 # next mc + pxor %xmm0, %xmm3 # 3 = 2A+B+D + pshufb %xmm1, %xmm0 # 0 = 2B+C + and \$0x30, %r11 # ... mod 4 + sub \$1,%rax # nr-- + pxor %xmm3, %xmm0 # 0 = 2A+3B+C+D + +.Lenc_entry: + # top of round + movdqa %xmm9, %xmm1 # 1 : i + movdqa %xmm11, %xmm5 # 2 : a/k + pandn %xmm0, %xmm1 # 1 = i<<4 + psrld \$4, %xmm1 # 1 = i + pand %xmm9, %xmm0 # 0 = k + pshufb %xmm0, %xmm5 # 2 = a/k + movdqa %xmm10, %xmm3 # 3 : 1/i + pxor %xmm1, %xmm0 # 0 = j + pshufb %xmm1, %xmm3 # 3 = 1/i + movdqa %xmm10, %xmm4 # 4 : 1/j + pxor %xmm5, %xmm3 # 3 = iak = 1/i + a/k + pshufb %xmm0, %xmm4 # 4 = 1/j + movdqa %xmm10, %xmm2 # 2 : 1/iak + pxor %xmm5, %xmm4 # 4 = jak = 1/j + a/k + pshufb %xmm3, %xmm2 # 2 = 1/iak + movdqa %xmm10, %xmm3 # 3 : 1/jak + pxor %xmm0, %xmm2 # 2 = io + pshufb %xmm4, %xmm3 # 3 = 1/jak + movdqu (%r9), %xmm5 + pxor %xmm1, %xmm3 # 3 = jo + jnz .Lenc_loop + + # middle of last round + movdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo + movdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 + pshufb %xmm2, %xmm4 # 4 = sbou + pxor %xmm5, %xmm4 # 4 = sb1u + k + pshufb %xmm3, %xmm0 # 0 = sb1t + movdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[] + pxor %xmm4, %xmm0 # 0 = A + pshufb %xmm1, %xmm0 + ret +.cfi_endproc +.size _vpaes_encrypt_core,.-_vpaes_encrypt_core + +## +## _aes_encrypt_core_2x +## +## AES-encrypt %xmm0 and %xmm6 in parallel. +## +## Inputs: +## %xmm0 and %xmm6 = input +## %xmm9 and %xmm10 as in _vpaes_preheat +## (%rdx) = scheduled keys +## +## Output in %xmm0 and %xmm6 +## Clobbers %xmm1-%xmm5, %xmm7, %xmm8, %xmm11-%xmm13, %r9, %r10, %r11, %rax +## Preserves %xmm14 and %xmm15 +## +## This function stitches two parallel instances of _vpaes_encrypt_core. x86_64 +## provides 16 XMM registers. _vpaes_encrypt_core computes over six registers +## (%xmm0-%xmm5) and additionally uses seven registers with preloaded constants +## from _vpaes_preheat (%xmm9-%xmm15). This does not quite fit two instances, +## so we spill some of %xmm9 through %xmm15 back to memory. We keep %xmm9 and +## %xmm10 in registers as these values are used several times in a row. The +## remainder are read once per round and are spilled to memory. 
This leaves two +## registers preserved for the caller. +## +## Thus, of the two _vpaes_encrypt_core instances, the first uses (%xmm0-%xmm5) +## as before. The second uses %xmm6-%xmm8,%xmm11-%xmm13. (Add 6 to %xmm2 and +## below. Add 8 to %xmm3 and up.) Instructions in the second instance are +## indented by one space. +## +## +.type _vpaes_encrypt_core_2x,\@abi-omnipotent +.align 16 +_vpaes_encrypt_core_2x: +.cfi_startproc + mov %rdx, %r9 + mov \$16, %r11 + mov 240(%rdx),%eax + movdqa %xmm9, %xmm1 + movdqa %xmm9, %xmm7 + movdqa .Lk_ipt(%rip), %xmm2 # iptlo + movdqa %xmm2, %xmm8 + pandn %xmm0, %xmm1 + pandn %xmm6, %xmm7 + movdqu (%r9), %xmm5 # round0 key + # Also use %xmm5 in the second instance. + psrld \$4, %xmm1 + psrld \$4, %xmm7 + pand %xmm9, %xmm0 + pand %xmm9, %xmm6 + pshufb %xmm0, %xmm2 + pshufb %xmm6, %xmm8 + movdqa .Lk_ipt+16(%rip), %xmm0 # ipthi + movdqa %xmm0, %xmm6 + pshufb %xmm1, %xmm0 + pshufb %xmm7, %xmm6 + pxor %xmm5, %xmm2 + pxor %xmm5, %xmm8 + add \$16, %r9 + pxor %xmm2, %xmm0 + pxor %xmm8, %xmm6 + lea .Lk_mc_backward(%rip),%r10 + jmp .Lenc2x_entry + +.align 16 +.Lenc2x_loop: + # middle of middle round + movdqa .Lk_sb1(%rip), %xmm4 # 4 : sb1u + movdqa .Lk_sb1+16(%rip),%xmm0 # 0 : sb1t + movdqa %xmm4, %xmm12 + movdqa %xmm0, %xmm6 + pshufb %xmm2, %xmm4 # 4 = sb1u + pshufb %xmm8, %xmm12 + pshufb %xmm3, %xmm0 # 0 = sb1t + pshufb %xmm11, %xmm6 + pxor %xmm5, %xmm4 # 4 = sb1u + k + pxor %xmm5, %xmm12 + movdqa .Lk_sb2(%rip), %xmm5 # 4 : sb2u + movdqa %xmm5, %xmm13 + pxor %xmm4, %xmm0 # 0 = A + pxor %xmm12, %xmm6 + movdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[] + # Also use %xmm1 in the second instance. + pshufb %xmm2, %xmm5 # 4 = sb2u + pshufb %xmm8, %xmm13 + movdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[] + # Also use %xmm4 in the second instance. + movdqa .Lk_sb2+16(%rip), %xmm2 # 2 : sb2t + movdqa %xmm2, %xmm8 + pshufb %xmm3, %xmm2 # 2 = sb2t + pshufb %xmm11, %xmm8 + movdqa %xmm0, %xmm3 # 3 = A + movdqa %xmm6, %xmm11 + pxor %xmm5, %xmm2 # 2 = 2A + pxor %xmm13, %xmm8 + pshufb %xmm1, %xmm0 # 0 = B + pshufb %xmm1, %xmm6 + add \$16, %r9 # next key + pxor %xmm2, %xmm0 # 0 = 2A+B + pxor %xmm8, %xmm6 + pshufb %xmm4, %xmm3 # 3 = D + pshufb %xmm4, %xmm11 + add \$16, %r11 # next mc + pxor %xmm0, %xmm3 # 3 = 2A+B+D + pxor %xmm6, %xmm11 + pshufb %xmm1, %xmm0 # 0 = 2B+C + pshufb %xmm1, %xmm6 + and \$0x30, %r11 # ... mod 4 + sub \$1,%rax # nr-- + pxor %xmm3, %xmm0 # 0 = 2A+3B+C+D + pxor %xmm11, %xmm6 + +.Lenc2x_entry: + # top of round + movdqa %xmm9, %xmm1 # 1 : i + movdqa %xmm9, %xmm7 + movdqa .Lk_inv+16(%rip), %xmm5 # 2 : a/k + movdqa %xmm5, %xmm13 + pandn %xmm0, %xmm1 # 1 = i<<4 + pandn %xmm6, %xmm7 + psrld \$4, %xmm1 # 1 = i + psrld \$4, %xmm7 + pand %xmm9, %xmm0 # 0 = k + pand %xmm9, %xmm6 + pshufb %xmm0, %xmm5 # 2 = a/k + pshufb %xmm6, %xmm13 + movdqa %xmm10, %xmm3 # 3 : 1/i + movdqa %xmm10, %xmm11 + pxor %xmm1, %xmm0 # 0 = j + pxor %xmm7, %xmm6 + pshufb %xmm1, %xmm3 # 3 = 1/i + pshufb %xmm7, %xmm11 + movdqa %xmm10, %xmm4 # 4 : 1/j + movdqa %xmm10, %xmm12 + pxor %xmm5, %xmm3 # 3 = iak = 1/i + a/k + pxor %xmm13, %xmm11 + pshufb %xmm0, %xmm4 # 4 = 1/j + pshufb %xmm6, %xmm12 + movdqa %xmm10, %xmm2 # 2 : 1/iak + movdqa %xmm10, %xmm8 + pxor %xmm5, %xmm4 # 4 = jak = 1/j + a/k + pxor %xmm13, %xmm12 + pshufb %xmm3, %xmm2 # 2 = 1/iak + pshufb %xmm11, %xmm8 + movdqa %xmm10, %xmm3 # 3 : 1/jak + movdqa %xmm10, %xmm11 + pxor %xmm0, %xmm2 # 2 = io + pxor %xmm6, %xmm8 + pshufb %xmm4, %xmm3 # 3 = 1/jak + pshufb %xmm12, %xmm11 + movdqu (%r9), %xmm5 + # Also use %xmm5 in the second instance. 
+ pxor %xmm1, %xmm3 # 3 = jo + pxor %xmm7, %xmm11 + jnz .Lenc2x_loop + + # middle of last round + movdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo + movdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 + movdqa %xmm4, %xmm12 + movdqa %xmm0, %xmm6 + pshufb %xmm2, %xmm4 # 4 = sbou + pshufb %xmm8, %xmm12 + pxor %xmm5, %xmm4 # 4 = sb1u + k + pxor %xmm5, %xmm12 + pshufb %xmm3, %xmm0 # 0 = sb1t + pshufb %xmm11, %xmm6 + movdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[] + # Also use %xmm1 in the second instance. + pxor %xmm4, %xmm0 # 0 = A + pxor %xmm12, %xmm6 + pshufb %xmm1, %xmm0 + pshufb %xmm1, %xmm6 + ret +.cfi_endproc +.size _vpaes_encrypt_core_2x,.-_vpaes_encrypt_core_2x + +######################################################## +## ## +## AES key schedule ## +## ## +######################################################## +.type _vpaes_schedule_core,\@abi-omnipotent +.align 16 +_vpaes_schedule_core: +.cfi_startproc + # rdi = key + # rsi = size in bits + # rdx = buffer + # rcx = direction. 0=encrypt, 1=decrypt + + call _vpaes_preheat # load the tables + movdqa .Lk_rcon(%rip), %xmm8 # load rcon + movdqu (%rdi), %xmm0 # load key (unaligned) + + # input transform + movdqa %xmm0, %xmm3 + lea .Lk_ipt(%rip), %r11 + call _vpaes_schedule_transform + movdqa %xmm0, %xmm7 + + lea .Lk_sr(%rip),%r10 + + # encrypting, output zeroth round key after transform + movdqu %xmm0, (%rdx) + +.Lschedule_go: + cmp \$192, %esi + ja .Lschedule_256 + # 192-bit key support was removed. + # 128: fall though + +## +## .schedule_128 +## +## 128-bit specific part of key schedule. +## +## This schedule is really simple, because all its parts +## are accomplished by the subroutines. +## +.Lschedule_128: + mov \$10, %esi + +.Loop_schedule_128: + call _vpaes_schedule_round + dec %rsi + jz .Lschedule_mangle_last + call _vpaes_schedule_mangle # write output + jmp .Loop_schedule_128 + +## +## .aes_schedule_256 +## +## 256-bit specific part of key schedule. +## +## The structure here is very similar to the 128-bit +## schedule, but with an additional "low side" in +## %xmm6. The low side's rounds are the same as the +## high side's, except no rcon and no rotation. +## +.align 16 +.Lschedule_256: + movdqu 16(%rdi),%xmm0 # load key part 2 (unaligned) + call _vpaes_schedule_transform # input transform + mov \$7, %esi + +.Loop_schedule_256: + call _vpaes_schedule_mangle # output low result + movdqa %xmm0, %xmm6 # save cur_lo in xmm6 + + # high round + call _vpaes_schedule_round + dec %rsi + jz .Lschedule_mangle_last + call _vpaes_schedule_mangle + + # low round. swap xmm7 and xmm6 + pshufd \$0xFF, %xmm0, %xmm0 + movdqa %xmm7, %xmm5 + movdqa %xmm6, %xmm7 + call _vpaes_schedule_low_round + movdqa %xmm5, %xmm7 + + jmp .Loop_schedule_256 + + +## +## .aes_schedule_mangle_last +## +## Mangler for last round of key schedule +## Mangles %xmm0 +## when encrypting, outputs out(%xmm0) ^ 63 +## when decrypting, outputs unskew(%xmm0) +## +## Always called right before return... 
jumps to cleanup and exits +## +.align 16 +.Lschedule_mangle_last: + # schedule last round key from xmm0 + lea .Lk_deskew(%rip),%r11 # prepare to deskew + + # encrypting + movdqa (%r8,%r10),%xmm1 + pshufb %xmm1, %xmm0 # output permute + lea .Lk_opt(%rip), %r11 # prepare to output transform + add \$32, %rdx + +.Lschedule_mangle_last_dec: + add \$-16, %rdx + pxor .Lk_s63(%rip), %xmm0 + call _vpaes_schedule_transform # output transform + movdqu %xmm0, (%rdx) # save last key + + # cleanup + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + ret +.cfi_endproc +.size _vpaes_schedule_core,.-_vpaes_schedule_core + +## +## .aes_schedule_round +## +## Runs one main round of the key schedule on %xmm0, %xmm7 +## +## Specifically, runs subbytes on the high dword of %xmm0 +## then rotates it by one byte and xors into the low dword of +## %xmm7. +## +## Adds rcon from low byte of %xmm8, then rotates %xmm8 for +## next rcon. +## +## Smears the dwords of %xmm7 by xoring the low into the +## second low, result into third, result into highest. +## +## Returns results in %xmm7 = %xmm0. +## Clobbers %xmm1-%xmm4, %r11. +## +.type _vpaes_schedule_round,\@abi-omnipotent +.align 16 +_vpaes_schedule_round: +.cfi_startproc + # extract rcon from xmm8 + pxor %xmm1, %xmm1 + palignr \$15, %xmm8, %xmm1 + palignr \$15, %xmm8, %xmm8 + pxor %xmm1, %xmm7 + + # rotate + pshufd \$0xFF, %xmm0, %xmm0 + palignr \$1, %xmm0, %xmm0 + + # fall through... + + # low round: same as high round, but no rotation and no rcon. +_vpaes_schedule_low_round: + # smear xmm7 + movdqa %xmm7, %xmm1 + pslldq \$4, %xmm7 + pxor %xmm1, %xmm7 + movdqa %xmm7, %xmm1 + pslldq \$8, %xmm7 + pxor %xmm1, %xmm7 + pxor .Lk_s63(%rip), %xmm7 + + # subbytes + movdqa %xmm9, %xmm1 + pandn %xmm0, %xmm1 + psrld \$4, %xmm1 # 1 = i + pand %xmm9, %xmm0 # 0 = k + movdqa %xmm11, %xmm2 # 2 : a/k + pshufb %xmm0, %xmm2 # 2 = a/k + pxor %xmm1, %xmm0 # 0 = j + movdqa %xmm10, %xmm3 # 3 : 1/i + pshufb %xmm1, %xmm3 # 3 = 1/i + pxor %xmm2, %xmm3 # 3 = iak = 1/i + a/k + movdqa %xmm10, %xmm4 # 4 : 1/j + pshufb %xmm0, %xmm4 # 4 = 1/j + pxor %xmm2, %xmm4 # 4 = jak = 1/j + a/k + movdqa %xmm10, %xmm2 # 2 : 1/iak + pshufb %xmm3, %xmm2 # 2 = 1/iak + pxor %xmm0, %xmm2 # 2 = io + movdqa %xmm10, %xmm3 # 3 : 1/jak + pshufb %xmm4, %xmm3 # 3 = 1/jak + pxor %xmm1, %xmm3 # 3 = jo + movdqa %xmm13, %xmm4 # 4 : sbou + pshufb %xmm2, %xmm4 # 4 = sbou + movdqa %xmm12, %xmm0 # 0 : sbot + pshufb %xmm3, %xmm0 # 0 = sb1t + pxor %xmm4, %xmm0 # 0 = sbox output + + # add in smeared stuff + pxor %xmm7, %xmm0 + movdqa %xmm0, %xmm7 + ret +.cfi_endproc +.size _vpaes_schedule_round,.-_vpaes_schedule_round + +## +## .aes_schedule_transform +## +## Linear-transform %xmm0 according to tables at (%r11) +## +## Requires that %xmm9 = 0x0F0F... as in preheat +## Output in %xmm0 +## Clobbers %xmm1, %xmm2 +## +.type _vpaes_schedule_transform,\@abi-omnipotent +.align 16 +_vpaes_schedule_transform: +.cfi_startproc + movdqa %xmm9, %xmm1 + pandn %xmm0, %xmm1 + psrld \$4, %xmm1 + pand %xmm9, %xmm0 + movdqa (%r11), %xmm2 # lo + pshufb %xmm0, %xmm2 + movdqa 16(%r11), %xmm0 # hi + pshufb %xmm1, %xmm0 + pxor %xmm2, %xmm0 + ret +.cfi_endproc +.size _vpaes_schedule_transform,.-_vpaes_schedule_transform + +## +## .aes_schedule_mangle +## +## Mangle xmm0 from (basis-transformed) standard version +## to our version. 
+## +## On encrypt, +## xor with 0x63 +## multiply by circulant 0,1,1,1 +## apply shiftrows transform +## +## On decrypt, +## xor with 0x63 +## multiply by "inverse mixcolumns" circulant E,B,D,9 +## deskew +## apply shiftrows transform +## +## +## Writes out to (%rdx), and increments or decrements it +## Keeps track of round number mod 4 in %r8 +## Preserves xmm0 +## Clobbers xmm1-xmm5 +## +.type _vpaes_schedule_mangle,\@abi-omnipotent +.align 16 +_vpaes_schedule_mangle: +.cfi_startproc + movdqa %xmm0, %xmm4 # save xmm0 for later + movdqa .Lk_mc_forward(%rip),%xmm5 + + # encrypting + add \$16, %rdx + pxor .Lk_s63(%rip),%xmm4 + pshufb %xmm5, %xmm4 + movdqa %xmm4, %xmm3 + pshufb %xmm5, %xmm4 + pxor %xmm4, %xmm3 + pshufb %xmm5, %xmm4 + pxor %xmm4, %xmm3 + +.Lschedule_mangle_both: + movdqa (%r8,%r10),%xmm1 + pshufb %xmm1,%xmm3 + add \$-16, %r8 + and \$0x30, %r8 + movdqu %xmm3, (%rdx) + ret +.cfi_endproc +.size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle + +# +# Interface to OpenSSL +# +.globl ${PREFIX}_set_encrypt_key +.type ${PREFIX}_set_encrypt_key,\@function,3 +.align 16 +${PREFIX}_set_encrypt_key: +.cfi_startproc + _CET_ENDBR +#ifdef BORINGSSL_DISPATCH_TEST +.extern BORINGSSL_function_hit + movb \$1, BORINGSSL_function_hit+5(%rip) +#endif + +___ +$code.=<<___ if ($win64); + lea -0xb8(%rsp),%rsp + movaps %xmm6,0x10(%rsp) + movaps %xmm7,0x20(%rsp) + movaps %xmm8,0x30(%rsp) + movaps %xmm9,0x40(%rsp) + movaps %xmm10,0x50(%rsp) + movaps %xmm11,0x60(%rsp) + movaps %xmm12,0x70(%rsp) + movaps %xmm13,0x80(%rsp) + movaps %xmm14,0x90(%rsp) + movaps %xmm15,0xa0(%rsp) +.Lenc_key_body: +___ +$code.=<<___; + mov %esi,%eax + shr \$5,%eax + add \$5,%eax + mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; + + mov \$0,%ecx + mov \$0x30,%r8d + call _vpaes_schedule_core +___ +$code.=<<___ if ($win64); + movaps 0x10(%rsp),%xmm6 + movaps 0x20(%rsp),%xmm7 + movaps 0x30(%rsp),%xmm8 + movaps 0x40(%rsp),%xmm9 + movaps 0x50(%rsp),%xmm10 + movaps 0x60(%rsp),%xmm11 + movaps 0x70(%rsp),%xmm12 + movaps 0x80(%rsp),%xmm13 + movaps 0x90(%rsp),%xmm14 + movaps 0xa0(%rsp),%xmm15 + lea 0xb8(%rsp),%rsp +.Lenc_key_epilogue: +___ +$code.=<<___; + xor %eax,%eax + ret +.cfi_endproc +.size ${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key +___ +{ +my ($inp,$out,$blocks,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx","%r8"); +# void vpaes_ctr32_encrypt_blocks(const uint8_t *inp, uint8_t *out, +# size_t blocks, const AES_KEY *key, +# const uint8_t ivp[16]); +$code.=<<___; +.globl ${PREFIX}_ctr32_encrypt_blocks +.type ${PREFIX}_ctr32_encrypt_blocks,\@function,5 +.align 16 +${PREFIX}_ctr32_encrypt_blocks: +.cfi_startproc + _CET_ENDBR + # _vpaes_encrypt_core and _vpaes_encrypt_core_2x expect the key in %rdx. + xchg $key, $blocks +___ +($blocks,$key)=($key,$blocks); +$code.=<<___; + test $blocks, $blocks + jz .Lctr32_abort +___ +$code.=<<___ if ($win64); + lea -0xb8(%rsp),%rsp + movaps %xmm6,0x10(%rsp) + movaps %xmm7,0x20(%rsp) + movaps %xmm8,0x30(%rsp) + movaps %xmm9,0x40(%rsp) + movaps %xmm10,0x50(%rsp) + movaps %xmm11,0x60(%rsp) + movaps %xmm12,0x70(%rsp) + movaps %xmm13,0x80(%rsp) + movaps %xmm14,0x90(%rsp) + movaps %xmm15,0xa0(%rsp) +.Lctr32_body: +___ +$code.=<<___; + movdqu ($ivp), %xmm0 # Load IV. + movdqa .Lctr_add_one(%rip), %xmm8 + sub $inp, $out # This allows only incrementing $inp. + call _vpaes_preheat + movdqa %xmm0, %xmm6 + pshufb .Lrev_ctr(%rip), %xmm6 + + test \$1, $blocks + jz .Lctr32_prep_loop + + # Handle one block so the remaining block count is even for + # _vpaes_encrypt_core_2x. 
+ movdqu ($inp), %xmm7 # Load input. + call _vpaes_encrypt_core + pxor %xmm7, %xmm0 + paddd %xmm8, %xmm6 + movdqu %xmm0, ($out,$inp) + sub \$1, $blocks + lea 16($inp), $inp + jz .Lctr32_done + +.Lctr32_prep_loop: + # _vpaes_encrypt_core_2x leaves only %xmm14 and %xmm15 as spare + # registers. We maintain two byte-swapped counters in them. + movdqa %xmm6, %xmm14 + movdqa %xmm6, %xmm15 + paddd %xmm8, %xmm15 + +.Lctr32_loop: + movdqa .Lrev_ctr(%rip), %xmm1 # Set up counters. + movdqa %xmm14, %xmm0 + movdqa %xmm15, %xmm6 + pshufb %xmm1, %xmm0 + pshufb %xmm1, %xmm6 + call _vpaes_encrypt_core_2x + movdqu ($inp), %xmm1 # Load input. + movdqu 16($inp), %xmm2 + movdqa .Lctr_add_two(%rip), %xmm3 + pxor %xmm1, %xmm0 # XOR input. + pxor %xmm2, %xmm6 + paddd %xmm3, %xmm14 # Increment counters. + paddd %xmm3, %xmm15 + movdqu %xmm0, ($out,$inp) # Write output. + movdqu %xmm6, 16($out,$inp) + sub \$2, $blocks # Advance loop. + lea 32($inp), $inp + jnz .Lctr32_loop + +.Lctr32_done: +___ +$code.=<<___ if ($win64); + movaps 0x10(%rsp),%xmm6 + movaps 0x20(%rsp),%xmm7 + movaps 0x30(%rsp),%xmm8 + movaps 0x40(%rsp),%xmm9 + movaps 0x50(%rsp),%xmm10 + movaps 0x60(%rsp),%xmm11 + movaps 0x70(%rsp),%xmm12 + movaps 0x80(%rsp),%xmm13 + movaps 0x90(%rsp),%xmm14 + movaps 0xa0(%rsp),%xmm15 + lea 0xb8(%rsp),%rsp +.Lctr32_epilogue: +___ +$code.=<<___; +.Lctr32_abort: + ret +.cfi_endproc +.size ${PREFIX}_ctr32_encrypt_blocks,.-${PREFIX}_ctr32_encrypt_blocks +___ +} +$code.=<<___; +## +## _aes_preheat +## +## Fills register %r10 -> .aes_consts (so you can -fPIC) +## and %xmm9-%xmm15 as specified below. +## +.type _vpaes_preheat,\@abi-omnipotent +.align 16 +_vpaes_preheat: +.cfi_startproc + lea .Lk_s0F(%rip), %r10 + movdqa -0x20(%r10), %xmm10 # .Lk_inv + movdqa -0x10(%r10), %xmm11 # .Lk_inv+16 + movdqa 0x00(%r10), %xmm9 # .Lk_s0F + movdqa 0x30(%r10), %xmm13 # .Lk_sb1 + movdqa 0x40(%r10), %xmm12 # .Lk_sb1+16 + movdqa 0x50(%r10), %xmm15 # .Lk_sb2 + movdqa 0x60(%r10), %xmm14 # .Lk_sb2+16 + ret +.cfi_endproc +.size _vpaes_preheat,.-_vpaes_preheat +######################################################## +## ## +## Constants ## +## ## +######################################################## +.type _vpaes_consts,\@object +.section .rodata +.align 64 +_vpaes_consts: +.Lk_inv: # inv, inva + .quad 0x0E05060F0D080180, 0x040703090A0B0C02 + .quad 0x01040A060F0B0780, 0x030D0E0C02050809 + +.Lk_s0F: # s0F + .quad 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F + +.Lk_ipt: # input transform (lo, hi) + .quad 0xC2B2E8985A2A7000, 0xCABAE09052227808 + .quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81 + +.Lk_sb1: # sb1u, sb1t + .quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544 + .quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF +.Lk_sb2: # sb2u, sb2t + .quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD + .quad 0x69EB88400AE12900, 0xC2A163C8AB82234A +.Lk_sbo: # sbou, sbot + .quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878 + .quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA + +.Lk_mc_forward: # mc_forward + .quad 0x0407060500030201, 0x0C0F0E0D080B0A09 + .quad 0x080B0A0904070605, 0x000302010C0F0E0D + .quad 0x0C0F0E0D080B0A09, 0x0407060500030201 + .quad 0x000302010C0F0E0D, 0x080B0A0904070605 + +.Lk_mc_backward:# mc_backward + .quad 0x0605040702010003, 0x0E0D0C0F0A09080B + .quad 0x020100030E0D0C0F, 0x0A09080B06050407 + .quad 0x0E0D0C0F0A09080B, 0x0605040702010003 + .quad 0x0A09080B06050407, 0x020100030E0D0C0F + +.Lk_sr: # sr + .quad 0x0706050403020100, 0x0F0E0D0C0B0A0908 + .quad 0x030E09040F0A0500, 0x0B06010C07020D08 + .quad 0x0F060D040B020900, 0x070E050C030A0108 + .quad 
0x0B0E0104070A0D00, 0x0306090C0F020508 + +.Lk_rcon: # rcon + .quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81 + +.Lk_s63: # s63: all equal to 0x63 transformed + .quad 0x5B5B5B5B5B5B5B5B, 0x5B5B5B5B5B5B5B5B + +.Lk_opt: # output transform + .quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808 + .quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0 + +.Lk_deskew: # deskew tables: inverts the sbox's "skew" + .quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A + .quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77 + +# .Lrev_ctr is a permutation which byte-swaps the counter portion of the IV. +.Lrev_ctr: + .quad 0x0706050403020100, 0x0c0d0e0f0b0a0908 +# .Lctr_add_* may be added to a byte-swapped xmm register to increment the +# counter. The register must be byte-swapped again to form the actual input. +.Lctr_add_one: + .quad 0x0000000000000000, 0x0000000100000000 +.Lctr_add_two: + .quad 0x0000000000000000, 0x0000000200000000 + +.asciz "Vector Permutation AES for x86_64/SSSE3, Mike Hamburg (Stanford University)" +.align 64 +.size _vpaes_consts,.-_vpaes_consts +.text +___ + +if ($win64) { +# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, +# CONTEXT *context,DISPATCHER_CONTEXT *disp) +$rec="%rcx"; +$frame="%rdx"; +$context="%r8"; +$disp="%r9"; + +$code.=<<___; +.extern __imp_RtlVirtualUnwind +.type se_handler,\@abi-omnipotent +.align 16 +se_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 120($context),%rax # pull context->Rax + mov 248($context),%rbx # pull context->Rip + + mov 8($disp),%rsi # disp->ImageBase + mov 56($disp),%r11 # disp->HandlerData + + mov 0(%r11),%r10d # HandlerData[0] + lea (%rsi,%r10),%r10 # prologue label + cmp %r10,%rbx # context->RipRsp + + mov 4(%r11),%r10d # HandlerData[1] + lea (%rsi,%r10),%r10 # epilogue label + cmp %r10,%rbx # context->Rip>=epilogue label + jae .Lin_prologue + + lea 16(%rax),%rsi # %xmm save area + lea 512($context),%rdi # &context.Xmm6 + mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) + .long 0xa548f3fc # cld; rep movsq + lea 0xb8(%rax),%rax # adjust stack pointer + +.Lin_prologue: + mov 8(%rax),%rdi + mov 16(%rax),%rsi + mov %rax,152($context) # restore context->Rsp + mov %rsi,168($context) # restore context->Rsi + mov %rdi,176($context) # restore context->Rdi + + mov 40($disp),%rdi # disp->ContextRecord + mov $context,%rsi # context + mov \$`1232/8`,%ecx # sizeof(CONTEXT) + .long 0xa548f3fc # cld; rep movsq + + mov $disp,%rsi + xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER + mov 8(%rsi),%rdx # arg2, disp->ImageBase + mov 0(%rsi),%r8 # arg3, disp->ControlPc + mov 16(%rsi),%r9 # arg4, disp->FunctionEntry + mov 40(%rsi),%r10 # disp->ContextRecord + lea 56(%rsi),%r11 # &disp->HandlerData + lea 24(%rsi),%r12 # &disp->EstablisherFrame + mov %r10,32(%rsp) # arg5 + mov %r11,40(%rsp) # arg6 + mov %r12,48(%rsp) # arg7 + mov %rcx,56(%rsp) # arg8, (NULL) + call *__imp_RtlVirtualUnwind(%rip) + + mov \$1,%eax # ExceptionContinueSearch + add \$64,%rsp + popfq + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + pop %rdi + pop %rsi + ret +.size se_handler,.-se_handler + +.section .pdata +.align 4 + .rva .LSEH_begin_${PREFIX}_set_encrypt_key + .rva .LSEH_end_${PREFIX}_set_encrypt_key + .rva .LSEH_info_${PREFIX}_set_encrypt_key + + .rva .LSEH_begin_${PREFIX}_ctr32_encrypt_blocks + .rva .LSEH_end_${PREFIX}_ctr32_encrypt_blocks + .rva .LSEH_info_${PREFIX}_ctr32_encrypt_blocks + +.section .xdata +.align 8 +.LSEH_info_${PREFIX}_set_encrypt_key: + .byte 9,0,0,0 + .rva se_handler + .rva 
.Lenc_key_body,.Lenc_key_epilogue # HandlerData[] +.LSEH_info_${PREFIX}_ctr32_encrypt_blocks: + .byte 9,0,0,0 + .rva se_handler + .rva .Lctr32_body,.Lctr32_epilogue # HandlerData[] +___ +} + +$code =~ s/\`([^\`]*)\`/eval($1)/gem; + +print $code; + +close STDOUT or die "error closing STDOUT: $!"; diff --git a/ring-0.17.14/crypto/fipsmodule/bn/asm/armv4-mont.pl b/ring-0.17.14/crypto/fipsmodule/bn/asm/armv4-mont.pl new file mode 100644 index 0000000000..acae4e582e --- /dev/null +++ b/ring-0.17.14/crypto/fipsmodule/bn/asm/armv4-mont.pl @@ -0,0 +1,739 @@ +#! /usr/bin/env perl +# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. +# ==================================================================== + +# January 2007. + +# Montgomery multiplication for ARMv4. +# +# Performance improvement naturally varies among CPU implementations +# and compilers. The code was observed to provide +65-35% improvement +# [depending on key length, less for longer keys] on ARM920T, and +# +115-80% on Intel IXP425. This is compared to pre-bn_mul_mont code +# base and compiler generated code with in-lined umull and even umlal +# instructions. The latter means that this code didn't really have an +# "advantage" of utilizing some "secret" instruction. +# +# The code is interoperable with Thumb ISA and is rather compact, less +# than 1/2KB. Windows CE port would be trivial, as it's exclusively +# about decorations, ABI and instruction syntax are identical. + +# November 2013 +# +# Add NEON code path, which handles lengths divisible by 8. RSA/DSA +# performance improvement on Cortex-A8 is ~45-100% depending on key +# length, more for longer keys. On Cortex-A15 the span is ~10-105%. +# On Snapdragon S4 improvement was measured to vary from ~70% to +# incredible ~380%, yes, 4.8x faster, for RSA4096 sign. But this is +# rather because original integer-only code seems to perform +# suboptimally on S4. Situation on Cortex-A9 is unfortunately +# different. It's being looked into, but the trouble is that +# performance for vectors longer than 256 bits is actually couple +# of percent worse than for integer-only code. The code is chosen +# for execution on all NEON-capable processors, because gain on +# others outweighs the marginal loss on Cortex-A9. + +# September 2015 +# +# Align Cortex-A9 performance with November 2013 improvements, i.e. +# NEON code is now ~20-105% faster than integer-only one on this +# processor. But this optimization further improved performance even +# on other processors: NEON code path is ~45-180% faster than original +# integer-only on Cortex-A8, ~10-210% on Cortex-A15, ~70-450% on +# Snapdragon S4. 
+ +$flavour = shift; +if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; } +else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} } + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; + *STDOUT=*OUT; +} else { + open OUT,">$output"; + *STDOUT=*OUT; +} + +$num="r0"; # starts as num argument, but holds &tp[num-1] +$ap="r1"; +$bp="r2"; $bi="r2"; $rp="r2"; +$np="r3"; +$tp="r4"; +$aj="r5"; +$nj="r6"; +$tj="r7"; +$n0="r8"; +########### # r9 is reserved by ELF as platform specific, e.g. TLS pointer +$alo="r10"; # sl, gcc uses it to keep @GOT +$ahi="r11"; # fp +$nlo="r12"; # ip +########### # r13 is stack pointer +$nhi="r14"; # lr +########### # r15 is program counter + +#### argument block layout relative to &tp[num-1], a.k.a. $num +$_rp="$num,#12*4"; +# ap permanently resides in r1 +$_bp="$num,#13*4"; +# np permanently resides in r3 +$_n0="$num,#14*4"; +$_num="$num,#15*4"; $_bpend=$_num; + +$code=<<___; +@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both +@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions. +.arch armv7-a + +.text +#if defined(__thumb2__) +.syntax unified +.thumb +#else +.code 32 +#endif + +.global bn_mul_mont_nohw +.type bn_mul_mont_nohw,%function + +.align 5 +bn_mul_mont_nohw: + ldr ip,[sp,#4] @ load num + stmdb sp!,{r0,r2} @ sp points at argument block + cmp ip,#2 + mov $num,ip @ load num +#ifdef __thumb2__ + ittt lt +#endif + movlt r0,#0 + addlt sp,sp,#2*4 + blt .Labrt + + stmdb sp!,{r4-r12,lr} @ save 10 registers + + mov $num,$num,lsl#2 @ rescale $num for byte count + sub sp,sp,$num @ alloca(4*num) + sub sp,sp,#4 @ +extra dword + sub $num,$num,#4 @ "num=num-1" + add $tp,$bp,$num @ &bp[num-1] + + add $num,sp,$num @ $num to point at &tp[num-1] + ldr $n0,[$_n0] @ &n0 + ldr $bi,[$bp] @ bp[0] + ldr $aj,[$ap],#4 @ ap[0],ap++ + ldr $nj,[$np],#4 @ np[0],np++ + ldr $n0,[$n0] @ *n0 + str $tp,[$_bpend] @ save &bp[num] + + umull $alo,$ahi,$aj,$bi @ ap[0]*bp[0] + str $n0,[$_n0] @ save n0 value + mul $n0,$alo,$n0 @ "tp[0]"*n0 + mov $nlo,#0 + umlal $alo,$nlo,$nj,$n0 @ np[0]*n0+"t[0]" + mov $tp,sp + +.L1st: + ldr $aj,[$ap],#4 @ ap[j],ap++ + mov $alo,$ahi + ldr $nj,[$np],#4 @ np[j],np++ + mov $ahi,#0 + umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[0] + mov $nhi,#0 + umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0 + adds $nlo,$nlo,$alo + str $nlo,[$tp],#4 @ tp[j-1]=,tp++ + adc $nlo,$nhi,#0 + cmp $tp,$num + bne .L1st + + adds $nlo,$nlo,$ahi + ldr $tp,[$_bp] @ restore bp + mov $nhi,#0 + ldr $n0,[$_n0] @ restore n0 + adc $nhi,$nhi,#0 + str $nlo,[$num] @ tp[num-1]= + mov $tj,sp + str $nhi,[$num,#4] @ tp[num]= + +.Louter: + sub $tj,$num,$tj @ "original" $num-1 value + sub $ap,$ap,$tj @ "rewind" ap to &ap[1] + ldr $bi,[$tp,#4]! 
@ *(++bp) + sub $np,$np,$tj @ "rewind" np to &np[1] + ldr $aj,[$ap,#-4] @ ap[0] + ldr $alo,[sp] @ tp[0] + ldr $nj,[$np,#-4] @ np[0] + ldr $tj,[sp,#4] @ tp[1] + + mov $ahi,#0 + umlal $alo,$ahi,$aj,$bi @ ap[0]*bp[i]+tp[0] + str $tp,[$_bp] @ save bp + mul $n0,$alo,$n0 + mov $nlo,#0 + umlal $alo,$nlo,$nj,$n0 @ np[0]*n0+"tp[0]" + mov $tp,sp + +.Linner: + ldr $aj,[$ap],#4 @ ap[j],ap++ + adds $alo,$ahi,$tj @ +=tp[j] + ldr $nj,[$np],#4 @ np[j],np++ + mov $ahi,#0 + umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[i] + mov $nhi,#0 + umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0 + adc $ahi,$ahi,#0 + ldr $tj,[$tp,#8] @ tp[j+1] + adds $nlo,$nlo,$alo + str $nlo,[$tp],#4 @ tp[j-1]=,tp++ + adc $nlo,$nhi,#0 + cmp $tp,$num + bne .Linner + + adds $nlo,$nlo,$ahi + mov $nhi,#0 + ldr $tp,[$_bp] @ restore bp + adc $nhi,$nhi,#0 + ldr $n0,[$_n0] @ restore n0 + adds $nlo,$nlo,$tj + ldr $tj,[$_bpend] @ restore &bp[num] + adc $nhi,$nhi,#0 + str $nlo,[$num] @ tp[num-1]= + str $nhi,[$num,#4] @ tp[num]= + + cmp $tp,$tj +#ifdef __thumb2__ + itt ne +#endif + movne $tj,sp + bne .Louter + + ldr $rp,[$_rp] @ pull rp + mov $aj,sp + add $num,$num,#4 @ $num to point at &tp[num] + sub $aj,$num,$aj @ "original" num value + mov $tp,sp @ "rewind" $tp + mov $ap,$tp @ "borrow" $ap + sub $np,$np,$aj @ "rewind" $np to &np[0] + + subs $tj,$tj,$tj @ "clear" carry flag +.Lsub: ldr $tj,[$tp],#4 + ldr $nj,[$np],#4 + sbcs $tj,$tj,$nj @ tp[j]-np[j] + str $tj,[$rp],#4 @ rp[j]= + teq $tp,$num @ preserve carry + bne .Lsub + sbcs $nhi,$nhi,#0 @ upmost carry + mov $tp,sp @ "rewind" $tp + sub $rp,$rp,$aj @ "rewind" $rp + +.Lcopy: ldr $tj,[$tp] @ conditional copy + ldr $aj,[$rp] + str sp,[$tp],#4 @ zap tp +#ifdef __thumb2__ + it cc +#endif + movcc $aj,$tj + str $aj,[$rp],#4 + teq $tp,$num @ preserve carry + bne .Lcopy + + mov sp,$num + add sp,sp,#4 @ skip over tp[num+1] + ldmia sp!,{r4-r12,lr} @ restore registers + add sp,sp,#2*4 @ skip over {r0,r2} + mov r0,#1 +.Labrt: +#if __ARM_ARCH>=5 + ret @ bx lr +#else + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet + bx lr @ interoperable with Thumb ISA:-) +#endif +.size bn_mul_mont_nohw,.-bn_mul_mont_nohw +___ +{ +my ($A0,$A1,$A2,$A3)=map("d$_",(0..3)); +my ($N0,$N1,$N2,$N3)=map("d$_",(4..7)); +my ($Z,$Temp)=("q4","q5"); +my @ACC=map("q$_",(6..13)); +my ($Bi,$Ni,$M0)=map("d$_",(28..31)); +my $zero="$Z#lo"; +my $temp="$Temp#lo"; + +my ($rptr,$aptr,$bptr,$nptr,$n0,$num)=map("r$_",(0..5)); +my ($tinptr,$toutptr,$inner,$outer,$bnptr)=map("r$_",(6..11)); + +$code.=<<___; +#if __ARM_MAX_ARCH__>=7 +.arch armv7-a +.fpu neon + +.global bn_mul8x_mont_neon +.type bn_mul8x_mont_neon,%function +.align 5 +bn_mul8x_mont_neon: + mov ip,sp + stmdb sp!,{r4-r11} + vstmdb sp!,{d8-d15} @ ABI specification says so + ldmia ip,{r4-r5} @ load rest of parameter block + mov ip,sp + + cmp $num,#8 + bhi .LNEON_8n + + @ special case for $num==8, everything is in register bank... + + vld1.32 {${Bi}[0]}, [$bptr,:32]! + veor $zero,$zero,$zero + sub $toutptr,sp,$num,lsl#4 + vld1.32 {$A0-$A3}, [$aptr]! @ can't specify :32 :-( + and $toutptr,$toutptr,#-64 + vld1.32 {${M0}[0]}, [$n0,:32] + mov sp,$toutptr @ alloca + vzip.16 $Bi,$zero + + vmull.u32 @ACC[0],$Bi,${A0}[0] + vmull.u32 @ACC[1],$Bi,${A0}[1] + vmull.u32 @ACC[2],$Bi,${A1}[0] + vshl.i64 $Ni,@ACC[0]#hi,#16 + vmull.u32 @ACC[3],$Bi,${A1}[1] + + vadd.u64 $Ni,$Ni,@ACC[0]#lo + veor $zero,$zero,$zero + vmul.u32 $Ni,$Ni,$M0 + + vmull.u32 @ACC[4],$Bi,${A2}[0] + vld1.32 {$N0-$N3}, [$nptr]! 
+ vmull.u32 @ACC[5],$Bi,${A2}[1] + vmull.u32 @ACC[6],$Bi,${A3}[0] + vzip.16 $Ni,$zero + vmull.u32 @ACC[7],$Bi,${A3}[1] + + vmlal.u32 @ACC[0],$Ni,${N0}[0] + sub $outer,$num,#1 + vmlal.u32 @ACC[1],$Ni,${N0}[1] + vmlal.u32 @ACC[2],$Ni,${N1}[0] + vmlal.u32 @ACC[3],$Ni,${N1}[1] + + vmlal.u32 @ACC[4],$Ni,${N2}[0] + vmov $Temp,@ACC[0] + vmlal.u32 @ACC[5],$Ni,${N2}[1] + vmov @ACC[0],@ACC[1] + vmlal.u32 @ACC[6],$Ni,${N3}[0] + vmov @ACC[1],@ACC[2] + vmlal.u32 @ACC[7],$Ni,${N3}[1] + vmov @ACC[2],@ACC[3] + vmov @ACC[3],@ACC[4] + vshr.u64 $temp,$temp,#16 + vmov @ACC[4],@ACC[5] + vmov @ACC[5],@ACC[6] + vadd.u64 $temp,$temp,$Temp#hi + vmov @ACC[6],@ACC[7] + veor @ACC[7],@ACC[7] + vshr.u64 $temp,$temp,#16 + + b .LNEON_outer8 + +.align 4 +.LNEON_outer8: + vld1.32 {${Bi}[0]}, [$bptr,:32]! + veor $zero,$zero,$zero + vzip.16 $Bi,$zero + vadd.u64 @ACC[0]#lo,@ACC[0]#lo,$temp + + vmlal.u32 @ACC[0],$Bi,${A0}[0] + vmlal.u32 @ACC[1],$Bi,${A0}[1] + vmlal.u32 @ACC[2],$Bi,${A1}[0] + vshl.i64 $Ni,@ACC[0]#hi,#16 + vmlal.u32 @ACC[3],$Bi,${A1}[1] + + vadd.u64 $Ni,$Ni,@ACC[0]#lo + veor $zero,$zero,$zero + subs $outer,$outer,#1 + vmul.u32 $Ni,$Ni,$M0 + + vmlal.u32 @ACC[4],$Bi,${A2}[0] + vmlal.u32 @ACC[5],$Bi,${A2}[1] + vmlal.u32 @ACC[6],$Bi,${A3}[0] + vzip.16 $Ni,$zero + vmlal.u32 @ACC[7],$Bi,${A3}[1] + + vmlal.u32 @ACC[0],$Ni,${N0}[0] + vmlal.u32 @ACC[1],$Ni,${N0}[1] + vmlal.u32 @ACC[2],$Ni,${N1}[0] + vmlal.u32 @ACC[3],$Ni,${N1}[1] + + vmlal.u32 @ACC[4],$Ni,${N2}[0] + vmov $Temp,@ACC[0] + vmlal.u32 @ACC[5],$Ni,${N2}[1] + vmov @ACC[0],@ACC[1] + vmlal.u32 @ACC[6],$Ni,${N3}[0] + vmov @ACC[1],@ACC[2] + vmlal.u32 @ACC[7],$Ni,${N3}[1] + vmov @ACC[2],@ACC[3] + vmov @ACC[3],@ACC[4] + vshr.u64 $temp,$temp,#16 + vmov @ACC[4],@ACC[5] + vmov @ACC[5],@ACC[6] + vadd.u64 $temp,$temp,$Temp#hi + vmov @ACC[6],@ACC[7] + veor @ACC[7],@ACC[7] + vshr.u64 $temp,$temp,#16 + + bne .LNEON_outer8 + + vadd.u64 @ACC[0]#lo,@ACC[0]#lo,$temp + mov $toutptr,sp + vshr.u64 $temp,@ACC[0]#lo,#16 + mov $inner,$num + vadd.u64 @ACC[0]#hi,@ACC[0]#hi,$temp + add $tinptr,sp,#96 + vshr.u64 $temp,@ACC[0]#hi,#16 + vzip.16 @ACC[0]#lo,@ACC[0]#hi + + b .LNEON_tail_entry + +.align 4 +.LNEON_8n: + veor @ACC[0],@ACC[0],@ACC[0] + sub $toutptr,sp,#128 + veor @ACC[1],@ACC[1],@ACC[1] + sub $toutptr,$toutptr,$num,lsl#4 + veor @ACC[2],@ACC[2],@ACC[2] + and $toutptr,$toutptr,#-64 + veor @ACC[3],@ACC[3],@ACC[3] + mov sp,$toutptr @ alloca + veor @ACC[4],@ACC[4],@ACC[4] + add $toutptr,$toutptr,#256 + veor @ACC[5],@ACC[5],@ACC[5] + sub $inner,$num,#8 + veor @ACC[6],@ACC[6],@ACC[6] + veor @ACC[7],@ACC[7],@ACC[7] + +.LNEON_8n_init: + vst1.64 {@ACC[0]-@ACC[1]},[$toutptr,:256]! + subs $inner,$inner,#8 + vst1.64 {@ACC[2]-@ACC[3]},[$toutptr,:256]! + vst1.64 {@ACC[4]-@ACC[5]},[$toutptr,:256]! + vst1.64 {@ACC[6]-@ACC[7]},[$toutptr,:256]! + bne .LNEON_8n_init + + add $tinptr,sp,#256 + vld1.32 {$A0-$A3},[$aptr]! + add $bnptr,sp,#8 + vld1.32 {${M0}[0]},[$n0,:32] + mov $outer,$num + b .LNEON_8n_outer + +.align 4 +.LNEON_8n_outer: + vld1.32 {${Bi}[0]},[$bptr,:32]! @ *b++ + veor $zero,$zero,$zero + vzip.16 $Bi,$zero + add $toutptr,sp,#128 + vld1.32 {$N0-$N3},[$nptr]! 
+ + vmlal.u32 @ACC[0],$Bi,${A0}[0] + vmlal.u32 @ACC[1],$Bi,${A0}[1] + veor $zero,$zero,$zero + vmlal.u32 @ACC[2],$Bi,${A1}[0] + vshl.i64 $Ni,@ACC[0]#hi,#16 + vmlal.u32 @ACC[3],$Bi,${A1}[1] + vadd.u64 $Ni,$Ni,@ACC[0]#lo + vmlal.u32 @ACC[4],$Bi,${A2}[0] + vmul.u32 $Ni,$Ni,$M0 + vmlal.u32 @ACC[5],$Bi,${A2}[1] + vst1.32 {$Bi},[sp,:64] @ put aside smashed b[8*i+0] + vmlal.u32 @ACC[6],$Bi,${A3}[0] + vzip.16 $Ni,$zero + vmlal.u32 @ACC[7],$Bi,${A3}[1] +___ +for ($i=0; $i<7;) { +$code.=<<___; + vld1.32 {${Bi}[0]},[$bptr,:32]! @ *b++ + vmlal.u32 @ACC[0],$Ni,${N0}[0] + veor $temp,$temp,$temp + vmlal.u32 @ACC[1],$Ni,${N0}[1] + vzip.16 $Bi,$temp + vmlal.u32 @ACC[2],$Ni,${N1}[0] + vshr.u64 @ACC[0]#lo,@ACC[0]#lo,#16 + vmlal.u32 @ACC[3],$Ni,${N1}[1] + vmlal.u32 @ACC[4],$Ni,${N2}[0] + vadd.u64 @ACC[0]#lo,@ACC[0]#lo,@ACC[0]#hi + vmlal.u32 @ACC[5],$Ni,${N2}[1] + vshr.u64 @ACC[0]#lo,@ACC[0]#lo,#16 + vmlal.u32 @ACC[6],$Ni,${N3}[0] + vmlal.u32 @ACC[7],$Ni,${N3}[1] + vadd.u64 @ACC[1]#lo,@ACC[1]#lo,@ACC[0]#lo + vst1.32 {$Ni},[$bnptr,:64]! @ put aside smashed m[8*i+$i] +___ + push(@ACC,shift(@ACC)); $i++; +$code.=<<___; + vmlal.u32 @ACC[0],$Bi,${A0}[0] + vld1.64 {@ACC[7]},[$tinptr,:128]! + vmlal.u32 @ACC[1],$Bi,${A0}[1] + veor $zero,$zero,$zero + vmlal.u32 @ACC[2],$Bi,${A1}[0] + vshl.i64 $Ni,@ACC[0]#hi,#16 + vmlal.u32 @ACC[3],$Bi,${A1}[1] + vadd.u64 $Ni,$Ni,@ACC[0]#lo + vmlal.u32 @ACC[4],$Bi,${A2}[0] + vmul.u32 $Ni,$Ni,$M0 + vmlal.u32 @ACC[5],$Bi,${A2}[1] + vst1.32 {$Bi},[$bnptr,:64]! @ put aside smashed b[8*i+$i] + vmlal.u32 @ACC[6],$Bi,${A3}[0] + vzip.16 $Ni,$zero + vmlal.u32 @ACC[7],$Bi,${A3}[1] +___ +} +$code.=<<___; + vld1.32 {$Bi},[sp,:64] @ pull smashed b[8*i+0] + vmlal.u32 @ACC[0],$Ni,${N0}[0] + vld1.32 {$A0-$A3},[$aptr]! + vmlal.u32 @ACC[1],$Ni,${N0}[1] + vmlal.u32 @ACC[2],$Ni,${N1}[0] + vshr.u64 @ACC[0]#lo,@ACC[0]#lo,#16 + vmlal.u32 @ACC[3],$Ni,${N1}[1] + vmlal.u32 @ACC[4],$Ni,${N2}[0] + vadd.u64 @ACC[0]#lo,@ACC[0]#lo,@ACC[0]#hi + vmlal.u32 @ACC[5],$Ni,${N2}[1] + vshr.u64 @ACC[0]#lo,@ACC[0]#lo,#16 + vmlal.u32 @ACC[6],$Ni,${N3}[0] + vmlal.u32 @ACC[7],$Ni,${N3}[1] + vadd.u64 @ACC[1]#lo,@ACC[1]#lo,@ACC[0]#lo + vst1.32 {$Ni},[$bnptr,:64] @ put aside smashed m[8*i+$i] + add $bnptr,sp,#8 @ rewind +___ + push(@ACC,shift(@ACC)); +$code.=<<___; + sub $inner,$num,#8 + b .LNEON_8n_inner + +.align 4 +.LNEON_8n_inner: + subs $inner,$inner,#8 + vmlal.u32 @ACC[0],$Bi,${A0}[0] + vld1.64 {@ACC[7]},[$tinptr,:128] + vmlal.u32 @ACC[1],$Bi,${A0}[1] + vld1.32 {$Ni},[$bnptr,:64]! @ pull smashed m[8*i+0] + vmlal.u32 @ACC[2],$Bi,${A1}[0] + vld1.32 {$N0-$N3},[$nptr]! + vmlal.u32 @ACC[3],$Bi,${A1}[1] + it ne + addne $tinptr,$tinptr,#16 @ don't advance in last iteration + vmlal.u32 @ACC[4],$Bi,${A2}[0] + vmlal.u32 @ACC[5],$Bi,${A2}[1] + vmlal.u32 @ACC[6],$Bi,${A3}[0] + vmlal.u32 @ACC[7],$Bi,${A3}[1] +___ +for ($i=1; $i<8; $i++) { +$code.=<<___; + vld1.32 {$Bi},[$bnptr,:64]! @ pull smashed b[8*i+$i] + vmlal.u32 @ACC[0],$Ni,${N0}[0] + vmlal.u32 @ACC[1],$Ni,${N0}[1] + vmlal.u32 @ACC[2],$Ni,${N1}[0] + vmlal.u32 @ACC[3],$Ni,${N1}[1] + vmlal.u32 @ACC[4],$Ni,${N2}[0] + vmlal.u32 @ACC[5],$Ni,${N2}[1] + vmlal.u32 @ACC[6],$Ni,${N3}[0] + vmlal.u32 @ACC[7],$Ni,${N3}[1] + vst1.64 {@ACC[0]},[$toutptr,:128]! +___ + push(@ACC,shift(@ACC)); +$code.=<<___; + vmlal.u32 @ACC[0],$Bi,${A0}[0] + vld1.64 {@ACC[7]},[$tinptr,:128] + vmlal.u32 @ACC[1],$Bi,${A0}[1] + vld1.32 {$Ni},[$bnptr,:64]! 
@ pull smashed m[8*i+$i] + vmlal.u32 @ACC[2],$Bi,${A1}[0] + it ne + addne $tinptr,$tinptr,#16 @ don't advance in last iteration + vmlal.u32 @ACC[3],$Bi,${A1}[1] + vmlal.u32 @ACC[4],$Bi,${A2}[0] + vmlal.u32 @ACC[5],$Bi,${A2}[1] + vmlal.u32 @ACC[6],$Bi,${A3}[0] + vmlal.u32 @ACC[7],$Bi,${A3}[1] +___ +} +$code.=<<___; + it eq + subeq $aptr,$aptr,$num,lsl#2 @ rewind + vmlal.u32 @ACC[0],$Ni,${N0}[0] + vld1.32 {$Bi},[sp,:64] @ pull smashed b[8*i+0] + vmlal.u32 @ACC[1],$Ni,${N0}[1] + vld1.32 {$A0-$A3},[$aptr]! + vmlal.u32 @ACC[2],$Ni,${N1}[0] + add $bnptr,sp,#8 @ rewind + vmlal.u32 @ACC[3],$Ni,${N1}[1] + vmlal.u32 @ACC[4],$Ni,${N2}[0] + vmlal.u32 @ACC[5],$Ni,${N2}[1] + vmlal.u32 @ACC[6],$Ni,${N3}[0] + vst1.64 {@ACC[0]},[$toutptr,:128]! + vmlal.u32 @ACC[7],$Ni,${N3}[1] + + bne .LNEON_8n_inner +___ + push(@ACC,shift(@ACC)); +$code.=<<___; + add $tinptr,sp,#128 + vst1.64 {@ACC[0]-@ACC[1]},[$toutptr,:256]! + veor q2,q2,q2 @ $N0-$N1 + vst1.64 {@ACC[2]-@ACC[3]},[$toutptr,:256]! + veor q3,q3,q3 @ $N2-$N3 + vst1.64 {@ACC[4]-@ACC[5]},[$toutptr,:256]! + vst1.64 {@ACC[6]},[$toutptr,:128] + + subs $outer,$outer,#8 + vld1.64 {@ACC[0]-@ACC[1]},[$tinptr,:256]! + vld1.64 {@ACC[2]-@ACC[3]},[$tinptr,:256]! + vld1.64 {@ACC[4]-@ACC[5]},[$tinptr,:256]! + vld1.64 {@ACC[6]-@ACC[7]},[$tinptr,:256]! + + itt ne + subne $nptr,$nptr,$num,lsl#2 @ rewind + bne .LNEON_8n_outer + + add $toutptr,sp,#128 + vst1.64 {q2-q3}, [sp,:256]! @ start wiping stack frame + vshr.u64 $temp,@ACC[0]#lo,#16 + vst1.64 {q2-q3},[sp,:256]! + vadd.u64 @ACC[0]#hi,@ACC[0]#hi,$temp + vst1.64 {q2-q3}, [sp,:256]! + vshr.u64 $temp,@ACC[0]#hi,#16 + vst1.64 {q2-q3}, [sp,:256]! + vzip.16 @ACC[0]#lo,@ACC[0]#hi + + mov $inner,$num + b .LNEON_tail_entry + +.align 4 +.LNEON_tail: + vadd.u64 @ACC[0]#lo,@ACC[0]#lo,$temp + vshr.u64 $temp,@ACC[0]#lo,#16 + vld1.64 {@ACC[2]-@ACC[3]}, [$tinptr, :256]! + vadd.u64 @ACC[0]#hi,@ACC[0]#hi,$temp + vld1.64 {@ACC[4]-@ACC[5]}, [$tinptr, :256]! + vshr.u64 $temp,@ACC[0]#hi,#16 + vld1.64 {@ACC[6]-@ACC[7]}, [$tinptr, :256]! + vzip.16 @ACC[0]#lo,@ACC[0]#hi + +.LNEON_tail_entry: +___ +for ($i=1; $i<8; $i++) { +$code.=<<___; + vadd.u64 @ACC[1]#lo,@ACC[1]#lo,$temp + vst1.32 {@ACC[0]#lo[0]}, [$toutptr, :32]! + vshr.u64 $temp,@ACC[1]#lo,#16 + vadd.u64 @ACC[1]#hi,@ACC[1]#hi,$temp + vshr.u64 $temp,@ACC[1]#hi,#16 + vzip.16 @ACC[1]#lo,@ACC[1]#hi +___ + push(@ACC,shift(@ACC)); +} + push(@ACC,shift(@ACC)); +$code.=<<___; + vld1.64 {@ACC[0]-@ACC[1]}, [$tinptr, :256]! + subs $inner,$inner,#8 + vst1.32 {@ACC[7]#lo[0]}, [$toutptr, :32]! + bne .LNEON_tail + + vst1.32 {${temp}[0]}, [$toutptr, :32] @ top-most bit + sub $nptr,$nptr,$num,lsl#2 @ rewind $nptr + subs $aptr,sp,#0 @ clear carry flag + add $bptr,sp,$num,lsl#2 + +.LNEON_sub: + ldmia $aptr!, {r4-r7} + ldmia $nptr!, {r8-r11} + sbcs r8, r4,r8 + sbcs r9, r5,r9 + sbcs r10,r6,r10 + sbcs r11,r7,r11 + teq $aptr,$bptr @ preserves carry + stmia $rptr!, {r8-r11} + bne .LNEON_sub + + ldr r10, [$aptr] @ load top-most bit + mov r11,sp + veor q0,q0,q0 + sub r11,$bptr,r11 @ this is num*4 + veor q1,q1,q1 + mov $aptr,sp + sub $rptr,$rptr,r11 @ rewind $rptr + mov $nptr,$bptr @ second 3/4th of frame + sbcs r10,r10,#0 @ result is carry flag + +.LNEON_copy_n_zap: + ldmia $aptr!, {r4-r7} + ldmia $rptr, {r8-r11} + it cc + movcc r8, r4 + vst1.64 {q0-q1}, [$nptr,:256]! @ wipe + itt cc + movcc r9, r5 + movcc r10,r6 + vst1.64 {q0-q1}, [$nptr,:256]! 
@ wipe + it cc + movcc r11,r7 + ldmia $aptr, {r4-r7} + stmia $rptr!, {r8-r11} + sub $aptr,$aptr,#16 + ldmia $rptr, {r8-r11} + it cc + movcc r8, r4 + vst1.64 {q0-q1}, [$aptr,:256]! @ wipe + itt cc + movcc r9, r5 + movcc r10,r6 + vst1.64 {q0-q1}, [$nptr,:256]! @ wipe + it cc + movcc r11,r7 + teq $aptr,$bptr @ preserves carry + stmia $rptr!, {r8-r11} + bne .LNEON_copy_n_zap + + mov sp,ip + vldmia sp!,{d8-d15} + ldmia sp!,{r4-r11} + ret @ bx lr +.size bn_mul8x_mont_neon,.-bn_mul8x_mont_neon +#endif +___ +} +$code.=<<___; +.asciz "Montgomery multiplication for ARMv4/NEON, CRYPTOGAMS by " +___ + +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/ge; + + s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/ge or + s/\bret\b/bx lr/g or + s/\bbx\s+lr\b/.word\t0xe12fff1e/g; # make it possible to compile with -march=armv4 + + print $_,"\n"; +} + +close STDOUT or die "error closing STDOUT: $!"; diff --git a/ring-0.17.14/crypto/fipsmodule/bn/asm/armv8-mont.pl b/ring-0.17.14/crypto/fipsmodule/bn/asm/armv8-mont.pl new file mode 100644 index 0000000000..0c32c8b147 --- /dev/null +++ b/ring-0.17.14/crypto/fipsmodule/bn/asm/armv8-mont.pl @@ -0,0 +1,1517 @@ +#! /usr/bin/env perl +# Copyright 2015-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. +# ==================================================================== + +# March 2015 +# +# "Teaser" Montgomery multiplication module for ARMv8. Needs more +# work. While it does improve RSA sign performance by 20-30% (less for +# longer keys) on most processors, for some reason RSA2048 is not +# faster and RSA4096 goes 15-20% slower on Cortex-A57. Multiplication +# instruction issue rate is limited on processor in question, meaning +# that dedicated squaring procedure is a must. Well, actually all +# contemporary AArch64 processors seem to have limited multiplication +# issue rate, i.e. they can't issue multiplication every cycle, which +# explains moderate improvement coefficients in comparison to +# compiler-generated code. Recall that compiler is instructed to use +# umulh and therefore uses same amount of multiplication instructions +# to do the job. Assembly's edge is to minimize number of "collateral" +# instructions and of course instruction scheduling. +# +# April 2015 +# +# Squaring procedure that handles lengths divisible by 8 improves +# RSA/DSA performance by 25-40-60% depending on processor and key +# length. Overall improvement coefficients are always positive in +# comparison to compiler-generated code. On Cortex-A57 improvement +# is still modest on longest key lengths, while others exhibit e.g. +# 50-70% improvement for RSA4096 sign. RSA2048 sign is ~25% faster +# on Cortex-A57 and ~60-100% faster on others. 
+ +$flavour = shift; +$output = shift; + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or +die "can't locate arm-xlate.pl"; + +open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; +*STDOUT=*OUT; + +($lo0,$hi0,$aj,$m0,$alo,$ahi, + $lo1,$hi1,$nj,$m1,$nlo,$nhi, + $ovf, $i,$j,$tp,$tj) = map("x$_",6..17,19..24); + +# void bn_mul_mont_nohw( +$rp="x0"; # BN_ULONG *rp, +$ap="x1"; # const BN_ULONG *ap, +$bp="x2"; # const BN_ULONG *bp, +$np="x3"; # const BN_ULONG *np, +$n0="x4"; # const BN_ULONG *n0, +$num="x5"; # size_t num); + +$code.=<<___; +.text + +.globl bn_mul_mont_nohw +.type bn_mul_mont_nohw,%function +.align 5 +bn_mul_mont_nohw: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-64]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + ldr $m0,[$bp],#8 // bp[0] + sub $tp,sp,$num,lsl#3 + ldp $hi0,$aj,[$ap],#16 // ap[0..1] + lsl $num,$num,#3 + ldr $n0,[$n0] // *n0 + and $tp,$tp,#-16 // ABI says so + ldp $hi1,$nj,[$np],#16 // np[0..1] + + mul $lo0,$hi0,$m0 // ap[0]*bp[0] + sub $j,$num,#16 // j=num-2 + umulh $hi0,$hi0,$m0 + mul $alo,$aj,$m0 // ap[1]*bp[0] + umulh $ahi,$aj,$m0 + + mul $m1,$lo0,$n0 // "tp[0]"*n0 + mov sp,$tp // alloca + + // (*) mul $lo1,$hi1,$m1 // np[0]*m1 + umulh $hi1,$hi1,$m1 + mul $nlo,$nj,$m1 // np[1]*m1 + // (*) adds $lo1,$lo1,$lo0 // discarded + // (*) As for removal of first multiplication and addition + // instructions. The outcome of first addition is + // guaranteed to be zero, which leaves two computationally + // significant outcomes: it either carries or not. Then + // question is when does it carry? Is there alternative + // way to deduce it? If you follow operations, you can + // observe that condition for carry is quite simple: + // $lo0 being non-zero. So that carry can be calculated + // by adding -1 to $lo0. That's what next instruction does. 
+ subs xzr,$lo0,#1 // (*) + umulh $nhi,$nj,$m1 + adc $hi1,$hi1,xzr + cbz $j,.L1st_skip + +.L1st: + ldr $aj,[$ap],#8 + adds $lo0,$alo,$hi0 + sub $j,$j,#8 // j-- + adc $hi0,$ahi,xzr + + ldr $nj,[$np],#8 + adds $lo1,$nlo,$hi1 + mul $alo,$aj,$m0 // ap[j]*bp[0] + adc $hi1,$nhi,xzr + umulh $ahi,$aj,$m0 + + adds $lo1,$lo1,$lo0 + mul $nlo,$nj,$m1 // np[j]*m1 + adc $hi1,$hi1,xzr + umulh $nhi,$nj,$m1 + str $lo1,[$tp],#8 // tp[j-1] + cbnz $j,.L1st + +.L1st_skip: + adds $lo0,$alo,$hi0 + sub $ap,$ap,$num // rewind $ap + adc $hi0,$ahi,xzr + + adds $lo1,$nlo,$hi1 + sub $np,$np,$num // rewind $np + adc $hi1,$nhi,xzr + + adds $lo1,$lo1,$lo0 + sub $i,$num,#8 // i=num-1 + adcs $hi1,$hi1,$hi0 + + adc $ovf,xzr,xzr // upmost overflow bit + stp $lo1,$hi1,[$tp] + +.Louter: + ldr $m0,[$bp],#8 // bp[i] + ldp $hi0,$aj,[$ap],#16 + ldr $tj,[sp] // tp[0] + add $tp,sp,#8 + + mul $lo0,$hi0,$m0 // ap[0]*bp[i] + sub $j,$num,#16 // j=num-2 + umulh $hi0,$hi0,$m0 + ldp $hi1,$nj,[$np],#16 + mul $alo,$aj,$m0 // ap[1]*bp[i] + adds $lo0,$lo0,$tj + umulh $ahi,$aj,$m0 + adc $hi0,$hi0,xzr + + mul $m1,$lo0,$n0 + sub $i,$i,#8 // i-- + + // (*) mul $lo1,$hi1,$m1 // np[0]*m1 + umulh $hi1,$hi1,$m1 + mul $nlo,$nj,$m1 // np[1]*m1 + // (*) adds $lo1,$lo1,$lo0 + subs xzr,$lo0,#1 // (*) + umulh $nhi,$nj,$m1 + cbz $j,.Linner_skip + +.Linner: + ldr $aj,[$ap],#8 + adc $hi1,$hi1,xzr + ldr $tj,[$tp],#8 // tp[j] + adds $lo0,$alo,$hi0 + sub $j,$j,#8 // j-- + adc $hi0,$ahi,xzr + + adds $lo1,$nlo,$hi1 + ldr $nj,[$np],#8 + adc $hi1,$nhi,xzr + + mul $alo,$aj,$m0 // ap[j]*bp[i] + adds $lo0,$lo0,$tj + umulh $ahi,$aj,$m0 + adc $hi0,$hi0,xzr + + mul $nlo,$nj,$m1 // np[j]*m1 + adds $lo1,$lo1,$lo0 + umulh $nhi,$nj,$m1 + str $lo1,[$tp,#-16] // tp[j-1] + cbnz $j,.Linner + +.Linner_skip: + ldr $tj,[$tp],#8 // tp[j] + adc $hi1,$hi1,xzr + adds $lo0,$alo,$hi0 + sub $ap,$ap,$num // rewind $ap + adc $hi0,$ahi,xzr + + adds $lo1,$nlo,$hi1 + sub $np,$np,$num // rewind $np + adcs $hi1,$nhi,$ovf + adc $ovf,xzr,xzr + + adds $lo0,$lo0,$tj + adc $hi0,$hi0,xzr + + adds $lo1,$lo1,$lo0 + adcs $hi1,$hi1,$hi0 + adc $ovf,$ovf,xzr // upmost overflow bit + stp $lo1,$hi1,[$tp,#-16] + + cbnz $i,.Louter + + // Final step. We see if result is larger than modulus, and + // if it is, subtract the modulus. But comparison implies + // subtraction. So we subtract modulus, see if it borrowed, + // and conditionally copy original value. + ldr $tj,[sp] // tp[0] + add $tp,sp,#8 + ldr $nj,[$np],#8 // np[0] + subs $j,$num,#8 // j=num-1 and clear borrow + mov $ap,$rp +.Lsub: + sbcs $aj,$tj,$nj // tp[j]-np[j] + ldr $tj,[$tp],#8 + sub $j,$j,#8 // j-- + ldr $nj,[$np],#8 + str $aj,[$ap],#8 // rp[j]=tp[j]-np[j] + cbnz $j,.Lsub + + sbcs $aj,$tj,$nj + sbcs $ovf,$ovf,xzr // did it borrow? + str $aj,[$ap],#8 // rp[num-1] + + ldr $tj,[sp] // tp[0] + add $tp,sp,#8 + ldr $aj,[$rp],#8 // rp[0] + sub $num,$num,#8 // num-- + nop +.Lcond_copy: + sub $num,$num,#8 // num-- + csel $nj,$tj,$aj,lo // did it borrow? + ldr $tj,[$tp],#8 + ldr $aj,[$rp],#8 + str xzr,[$tp,#-16] // wipe tp + str $nj,[$rp,#-16] + cbnz $num,.Lcond_copy + + csel $nj,$tj,$aj,lo + str xzr,[$tp,#-8] // wipe tp + str $nj,[$rp,#-8] + + ldp x19,x20,[x29,#16] + mov sp,x29 + ldp x21,x22,[x29,#32] + mov x0,#1 + ldp x23,x24,[x29,#48] + ldr x29,[sp],#64 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size bn_mul_mont_nohw,.-bn_mul_mont_nohw +___ +{ +######################################################################## +# Following is ARMv8 adaptation of sqrx8x_mont from x86_64-mont5 module. 
+ +my ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("x$_",(6..13)); +my ($t0,$t1,$t2,$t3)=map("x$_",(14..17)); +my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("x$_",(19..26)); +my ($cnt,$carry,$topmost)=("x27","x28","x30"); +my ($tp,$ap_end,$na0)=($bp,$np,$carry); + +$code.=<<___; +.globl bn_sqr8x_mont +.type bn_sqr8x_mont,%function +.align 5 +bn_sqr8x_mont: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp $rp,$np,[sp,#96] // offload rp and np + + ldp $a0,$a1,[$ap,#8*0] + ldp $a2,$a3,[$ap,#8*2] + ldp $a4,$a5,[$ap,#8*4] + ldp $a6,$a7,[$ap,#8*6] + + sub $tp,sp,$num,lsl#4 + lsl $num,$num,#3 + ldr $n0,[$n0] // *n0 + mov sp,$tp // alloca + sub $cnt,$num,#8*8 + b .Lsqr8x_zero_start + +.Lsqr8x_zero: + sub $cnt,$cnt,#8*8 + stp xzr,xzr,[$tp,#8*0] + stp xzr,xzr,[$tp,#8*2] + stp xzr,xzr,[$tp,#8*4] + stp xzr,xzr,[$tp,#8*6] +.Lsqr8x_zero_start: + stp xzr,xzr,[$tp,#8*8] + stp xzr,xzr,[$tp,#8*10] + stp xzr,xzr,[$tp,#8*12] + stp xzr,xzr,[$tp,#8*14] + add $tp,$tp,#8*16 + cbnz $cnt,.Lsqr8x_zero + + add $ap_end,$ap,$num + add $ap,$ap,#8*8 + mov $acc0,xzr + mov $acc1,xzr + mov $acc2,xzr + mov $acc3,xzr + mov $acc4,xzr + mov $acc5,xzr + mov $acc6,xzr + mov $acc7,xzr + mov $tp,sp + str $n0,[x29,#112] // offload n0 + + // Multiply everything but a[i]*a[i] +.align 4 +.Lsqr8x_outer_loop: + // a[1]a[0] (i) + // a[2]a[0] + // a[3]a[0] + // a[4]a[0] + // a[5]a[0] + // a[6]a[0] + // a[7]a[0] + // a[2]a[1] (ii) + // a[3]a[1] + // a[4]a[1] + // a[5]a[1] + // a[6]a[1] + // a[7]a[1] + // a[3]a[2] (iii) + // a[4]a[2] + // a[5]a[2] + // a[6]a[2] + // a[7]a[2] + // a[4]a[3] (iv) + // a[5]a[3] + // a[6]a[3] + // a[7]a[3] + // a[5]a[4] (v) + // a[6]a[4] + // a[7]a[4] + // a[6]a[5] (vi) + // a[7]a[5] + // a[7]a[6] (vii) + + mul $t0,$a1,$a0 // lo(a[1..7]*a[0]) (i) + mul $t1,$a2,$a0 + mul $t2,$a3,$a0 + mul $t3,$a4,$a0 + adds $acc1,$acc1,$t0 // t[1]+lo(a[1]*a[0]) + mul $t0,$a5,$a0 + adcs $acc2,$acc2,$t1 + mul $t1,$a6,$a0 + adcs $acc3,$acc3,$t2 + mul $t2,$a7,$a0 + adcs $acc4,$acc4,$t3 + umulh $t3,$a1,$a0 // hi(a[1..7]*a[0]) + adcs $acc5,$acc5,$t0 + umulh $t0,$a2,$a0 + adcs $acc6,$acc6,$t1 + umulh $t1,$a3,$a0 + adcs $acc7,$acc7,$t2 + umulh $t2,$a4,$a0 + stp $acc0,$acc1,[$tp],#8*2 // t[0..1] + adc $acc0,xzr,xzr // t[8] + adds $acc2,$acc2,$t3 // t[2]+lo(a[1]*a[0]) + umulh $t3,$a5,$a0 + adcs $acc3,$acc3,$t0 + umulh $t0,$a6,$a0 + adcs $acc4,$acc4,$t1 + umulh $t1,$a7,$a0 + adcs $acc5,$acc5,$t2 + mul $t2,$a2,$a1 // lo(a[2..7]*a[1]) (ii) + adcs $acc6,$acc6,$t3 + mul $t3,$a3,$a1 + adcs $acc7,$acc7,$t0 + mul $t0,$a4,$a1 + adc $acc0,$acc0,$t1 + + mul $t1,$a5,$a1 + adds $acc3,$acc3,$t2 + mul $t2,$a6,$a1 + adcs $acc4,$acc4,$t3 + mul $t3,$a7,$a1 + adcs $acc5,$acc5,$t0 + umulh $t0,$a2,$a1 // hi(a[2..7]*a[1]) + adcs $acc6,$acc6,$t1 + umulh $t1,$a3,$a1 + adcs $acc7,$acc7,$t2 + umulh $t2,$a4,$a1 + adcs $acc0,$acc0,$t3 + umulh $t3,$a5,$a1 + stp $acc2,$acc3,[$tp],#8*2 // t[2..3] + adc $acc1,xzr,xzr // t[9] + adds $acc4,$acc4,$t0 + umulh $t0,$a6,$a1 + adcs $acc5,$acc5,$t1 + umulh $t1,$a7,$a1 + adcs $acc6,$acc6,$t2 + mul $t2,$a3,$a2 // lo(a[3..7]*a[2]) (iii) + adcs $acc7,$acc7,$t3 + mul $t3,$a4,$a2 + adcs $acc0,$acc0,$t0 + mul $t0,$a5,$a2 + adc $acc1,$acc1,$t1 + + mul $t1,$a6,$a2 + adds $acc5,$acc5,$t2 + mul $t2,$a7,$a2 + adcs $acc6,$acc6,$t3 + umulh $t3,$a3,$a2 // hi(a[3..7]*a[2]) + adcs $acc7,$acc7,$t0 + umulh $t0,$a4,$a2 + adcs $acc0,$acc0,$t1 + umulh $t1,$a5,$a2 + adcs $acc1,$acc1,$t2 + umulh $t2,$a6,$a2 + 
stp $acc4,$acc5,[$tp],#8*2 // t[4..5] + adc $acc2,xzr,xzr // t[10] + adds $acc6,$acc6,$t3 + umulh $t3,$a7,$a2 + adcs $acc7,$acc7,$t0 + mul $t0,$a4,$a3 // lo(a[4..7]*a[3]) (iv) + adcs $acc0,$acc0,$t1 + mul $t1,$a5,$a3 + adcs $acc1,$acc1,$t2 + mul $t2,$a6,$a3 + adc $acc2,$acc2,$t3 + + mul $t3,$a7,$a3 + adds $acc7,$acc7,$t0 + umulh $t0,$a4,$a3 // hi(a[4..7]*a[3]) + adcs $acc0,$acc0,$t1 + umulh $t1,$a5,$a3 + adcs $acc1,$acc1,$t2 + umulh $t2,$a6,$a3 + adcs $acc2,$acc2,$t3 + umulh $t3,$a7,$a3 + stp $acc6,$acc7,[$tp],#8*2 // t[6..7] + adc $acc3,xzr,xzr // t[11] + adds $acc0,$acc0,$t0 + mul $t0,$a5,$a4 // lo(a[5..7]*a[4]) (v) + adcs $acc1,$acc1,$t1 + mul $t1,$a6,$a4 + adcs $acc2,$acc2,$t2 + mul $t2,$a7,$a4 + adc $acc3,$acc3,$t3 + + umulh $t3,$a5,$a4 // hi(a[5..7]*a[4]) + adds $acc1,$acc1,$t0 + umulh $t0,$a6,$a4 + adcs $acc2,$acc2,$t1 + umulh $t1,$a7,$a4 + adcs $acc3,$acc3,$t2 + mul $t2,$a6,$a5 // lo(a[6..7]*a[5]) (vi) + adc $acc4,xzr,xzr // t[12] + adds $acc2,$acc2,$t3 + mul $t3,$a7,$a5 + adcs $acc3,$acc3,$t0 + umulh $t0,$a6,$a5 // hi(a[6..7]*a[5]) + adc $acc4,$acc4,$t1 + + umulh $t1,$a7,$a5 + adds $acc3,$acc3,$t2 + mul $t2,$a7,$a6 // lo(a[7]*a[6]) (vii) + adcs $acc4,$acc4,$t3 + umulh $t3,$a7,$a6 // hi(a[7]*a[6]) + adc $acc5,xzr,xzr // t[13] + adds $acc4,$acc4,$t0 + sub $cnt,$ap_end,$ap // done yet? + adc $acc5,$acc5,$t1 + + adds $acc5,$acc5,$t2 + sub $t0,$ap_end,$num // rewinded ap + adc $acc6,xzr,xzr // t[14] + add $acc6,$acc6,$t3 + + cbz $cnt,.Lsqr8x_outer_break + + mov $n0,$a0 + ldp $a0,$a1,[$tp,#8*0] + ldp $a2,$a3,[$tp,#8*2] + ldp $a4,$a5,[$tp,#8*4] + ldp $a6,$a7,[$tp,#8*6] + adds $acc0,$acc0,$a0 + adcs $acc1,$acc1,$a1 + ldp $a0,$a1,[$ap,#8*0] + adcs $acc2,$acc2,$a2 + adcs $acc3,$acc3,$a3 + ldp $a2,$a3,[$ap,#8*2] + adcs $acc4,$acc4,$a4 + adcs $acc5,$acc5,$a5 + ldp $a4,$a5,[$ap,#8*4] + adcs $acc6,$acc6,$a6 + mov $rp,$ap + adcs $acc7,xzr,$a7 + ldp $a6,$a7,[$ap,#8*6] + add $ap,$ap,#8*8 + //adc $carry,xzr,xzr // moved below + mov $cnt,#-8*8 + + // a[8]a[0] + // a[9]a[0] + // a[a]a[0] + // a[b]a[0] + // a[c]a[0] + // a[d]a[0] + // a[e]a[0] + // a[f]a[0] + // a[8]a[1] + // a[f]a[1]........................ + // a[8]a[2] + // a[f]a[2]........................ + // a[8]a[3] + // a[f]a[3]........................ + // a[8]a[4] + // a[f]a[4]........................ + // a[8]a[5] + // a[f]a[5]........................ + // a[8]a[6] + // a[f]a[6]........................ + // a[8]a[7] + // a[f]a[7]........................ +.Lsqr8x_mul: + mul $t0,$a0,$n0 + adc $carry,xzr,xzr // carry bit, modulo-scheduled + mul $t1,$a1,$n0 + add $cnt,$cnt,#8 + mul $t2,$a2,$n0 + mul $t3,$a3,$n0 + adds $acc0,$acc0,$t0 + mul $t0,$a4,$n0 + adcs $acc1,$acc1,$t1 + mul $t1,$a5,$n0 + adcs $acc2,$acc2,$t2 + mul $t2,$a6,$n0 + adcs $acc3,$acc3,$t3 + mul $t3,$a7,$n0 + adcs $acc4,$acc4,$t0 + umulh $t0,$a0,$n0 + adcs $acc5,$acc5,$t1 + umulh $t1,$a1,$n0 + adcs $acc6,$acc6,$t2 + umulh $t2,$a2,$n0 + adcs $acc7,$acc7,$t3 + umulh $t3,$a3,$n0 + adc $carry,$carry,xzr + str $acc0,[$tp],#8 + adds $acc0,$acc1,$t0 + umulh $t0,$a4,$n0 + adcs $acc1,$acc2,$t1 + umulh $t1,$a5,$n0 + adcs $acc2,$acc3,$t2 + umulh $t2,$a6,$n0 + adcs $acc3,$acc4,$t3 + umulh $t3,$a7,$n0 + ldr $n0,[$rp,$cnt] + adcs $acc4,$acc5,$t0 + adcs $acc5,$acc6,$t1 + adcs $acc6,$acc7,$t2 + adcs $acc7,$carry,$t3 + //adc $carry,xzr,xzr // moved above + cbnz $cnt,.Lsqr8x_mul + // note that carry flag is guaranteed + // to be zero at this point + cmp $ap,$ap_end // done yet? 
+ b.eq .Lsqr8x_break + + ldp $a0,$a1,[$tp,#8*0] + ldp $a2,$a3,[$tp,#8*2] + ldp $a4,$a5,[$tp,#8*4] + ldp $a6,$a7,[$tp,#8*6] + adds $acc0,$acc0,$a0 + ldr $n0,[$rp,#-8*8] + adcs $acc1,$acc1,$a1 + ldp $a0,$a1,[$ap,#8*0] + adcs $acc2,$acc2,$a2 + adcs $acc3,$acc3,$a3 + ldp $a2,$a3,[$ap,#8*2] + adcs $acc4,$acc4,$a4 + adcs $acc5,$acc5,$a5 + ldp $a4,$a5,[$ap,#8*4] + adcs $acc6,$acc6,$a6 + mov $cnt,#-8*8 + adcs $acc7,$acc7,$a7 + ldp $a6,$a7,[$ap,#8*6] + add $ap,$ap,#8*8 + //adc $carry,xzr,xzr // moved above + b .Lsqr8x_mul + +.align 4 +.Lsqr8x_break: + ldp $a0,$a1,[$rp,#8*0] + add $ap,$rp,#8*8 + ldp $a2,$a3,[$rp,#8*2] + sub $t0,$ap_end,$ap // is it last iteration? + ldp $a4,$a5,[$rp,#8*4] + sub $t1,$tp,$t0 + ldp $a6,$a7,[$rp,#8*6] + cbz $t0,.Lsqr8x_outer_loop + + stp $acc0,$acc1,[$tp,#8*0] + ldp $acc0,$acc1,[$t1,#8*0] + stp $acc2,$acc3,[$tp,#8*2] + ldp $acc2,$acc3,[$t1,#8*2] + stp $acc4,$acc5,[$tp,#8*4] + ldp $acc4,$acc5,[$t1,#8*4] + stp $acc6,$acc7,[$tp,#8*6] + mov $tp,$t1 + ldp $acc6,$acc7,[$t1,#8*6] + b .Lsqr8x_outer_loop + +.align 4 +.Lsqr8x_outer_break: + // Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0] + ldp $a1,$a3,[$t0,#8*0] // recall that $t0 is &a[0] + ldp $t1,$t2,[sp,#8*1] + ldp $a5,$a7,[$t0,#8*2] + add $ap,$t0,#8*4 + ldp $t3,$t0,[sp,#8*3] + + stp $acc0,$acc1,[$tp,#8*0] + mul $acc0,$a1,$a1 + stp $acc2,$acc3,[$tp,#8*2] + umulh $a1,$a1,$a1 + stp $acc4,$acc5,[$tp,#8*4] + mul $a2,$a3,$a3 + stp $acc6,$acc7,[$tp,#8*6] + mov $tp,sp + umulh $a3,$a3,$a3 + adds $acc1,$a1,$t1,lsl#1 + extr $t1,$t2,$t1,#63 + sub $cnt,$num,#8*4 + +.Lsqr4x_shift_n_add: + adcs $acc2,$a2,$t1 + extr $t2,$t3,$t2,#63 + sub $cnt,$cnt,#8*4 + adcs $acc3,$a3,$t2 + ldp $t1,$t2,[$tp,#8*5] + mul $a4,$a5,$a5 + ldp $a1,$a3,[$ap],#8*2 + umulh $a5,$a5,$a5 + mul $a6,$a7,$a7 + umulh $a7,$a7,$a7 + extr $t3,$t0,$t3,#63 + stp $acc0,$acc1,[$tp,#8*0] + adcs $acc4,$a4,$t3 + extr $t0,$t1,$t0,#63 + stp $acc2,$acc3,[$tp,#8*2] + adcs $acc5,$a5,$t0 + ldp $t3,$t0,[$tp,#8*7] + extr $t1,$t2,$t1,#63 + adcs $acc6,$a6,$t1 + extr $t2,$t3,$t2,#63 + adcs $acc7,$a7,$t2 + ldp $t1,$t2,[$tp,#8*9] + mul $a0,$a1,$a1 + ldp $a5,$a7,[$ap],#8*2 + umulh $a1,$a1,$a1 + mul $a2,$a3,$a3 + umulh $a3,$a3,$a3 + stp $acc4,$acc5,[$tp,#8*4] + extr $t3,$t0,$t3,#63 + stp $acc6,$acc7,[$tp,#8*6] + add $tp,$tp,#8*8 + adcs $acc0,$a0,$t3 + extr $t0,$t1,$t0,#63 + adcs $acc1,$a1,$t0 + ldp $t3,$t0,[$tp,#8*3] + extr $t1,$t2,$t1,#63 + cbnz $cnt,.Lsqr4x_shift_n_add +___ +my ($np,$np_end)=($ap,$ap_end); +$code.=<<___; + ldp $np,$n0,[x29,#104] // pull np and n0 + + adcs $acc2,$a2,$t1 + extr $t2,$t3,$t2,#63 + adcs $acc3,$a3,$t2 + ldp $t1,$t2,[$tp,#8*5] + mul $a4,$a5,$a5 + umulh $a5,$a5,$a5 + stp $acc0,$acc1,[$tp,#8*0] + mul $a6,$a7,$a7 + umulh $a7,$a7,$a7 + stp $acc2,$acc3,[$tp,#8*2] + extr $t3,$t0,$t3,#63 + adcs $acc4,$a4,$t3 + extr $t0,$t1,$t0,#63 + ldp $acc0,$acc1,[sp,#8*0] + adcs $acc5,$a5,$t0 + extr $t1,$t2,$t1,#63 + ldp $a0,$a1,[$np,#8*0] + adcs $acc6,$a6,$t1 + extr $t2,xzr,$t2,#63 + ldp $a2,$a3,[$np,#8*2] + adc $acc7,$a7,$t2 + ldp $a4,$a5,[$np,#8*4] + + // Reduce by 512 bits per iteration + mul $na0,$n0,$acc0 // t[0]*n0 + ldp $a6,$a7,[$np,#8*6] + add $np_end,$np,$num + ldp $acc2,$acc3,[sp,#8*2] + stp $acc4,$acc5,[$tp,#8*4] + ldp $acc4,$acc5,[sp,#8*4] + stp $acc6,$acc7,[$tp,#8*6] + ldp $acc6,$acc7,[sp,#8*6] + add $np,$np,#8*8 + mov $topmost,xzr // initial top-most carry + mov $tp,sp + mov $cnt,#8 + +.Lsqr8x_reduction: + // (*) mul $t0,$a0,$na0 // lo(n[0-7])*lo(t[0]*n0) + mul $t1,$a1,$na0 + sub $cnt,$cnt,#1 + mul $t2,$a2,$na0 + str $na0,[$tp],#8 // put aside t[0]*n0 
for tail processing + mul $t3,$a3,$na0 + // (*) adds xzr,$acc0,$t0 + subs xzr,$acc0,#1 // (*) + mul $t0,$a4,$na0 + adcs $acc0,$acc1,$t1 + mul $t1,$a5,$na0 + adcs $acc1,$acc2,$t2 + mul $t2,$a6,$na0 + adcs $acc2,$acc3,$t3 + mul $t3,$a7,$na0 + adcs $acc3,$acc4,$t0 + umulh $t0,$a0,$na0 // hi(n[0-7])*lo(t[0]*n0) + adcs $acc4,$acc5,$t1 + umulh $t1,$a1,$na0 + adcs $acc5,$acc6,$t2 + umulh $t2,$a2,$na0 + adcs $acc6,$acc7,$t3 + umulh $t3,$a3,$na0 + adc $acc7,xzr,xzr + adds $acc0,$acc0,$t0 + umulh $t0,$a4,$na0 + adcs $acc1,$acc1,$t1 + umulh $t1,$a5,$na0 + adcs $acc2,$acc2,$t2 + umulh $t2,$a6,$na0 + adcs $acc3,$acc3,$t3 + umulh $t3,$a7,$na0 + mul $na0,$n0,$acc0 // next t[0]*n0 + adcs $acc4,$acc4,$t0 + adcs $acc5,$acc5,$t1 + adcs $acc6,$acc6,$t2 + adc $acc7,$acc7,$t3 + cbnz $cnt,.Lsqr8x_reduction + + ldp $t0,$t1,[$tp,#8*0] + ldp $t2,$t3,[$tp,#8*2] + mov $rp,$tp + sub $cnt,$np_end,$np // done yet? + adds $acc0,$acc0,$t0 + adcs $acc1,$acc1,$t1 + ldp $t0,$t1,[$tp,#8*4] + adcs $acc2,$acc2,$t2 + adcs $acc3,$acc3,$t3 + ldp $t2,$t3,[$tp,#8*6] + adcs $acc4,$acc4,$t0 + adcs $acc5,$acc5,$t1 + adcs $acc6,$acc6,$t2 + adcs $acc7,$acc7,$t3 + //adc $carry,xzr,xzr // moved below + cbz $cnt,.Lsqr8x8_post_condition + + ldr $n0,[$tp,#-8*8] + ldp $a0,$a1,[$np,#8*0] + ldp $a2,$a3,[$np,#8*2] + ldp $a4,$a5,[$np,#8*4] + mov $cnt,#-8*8 + ldp $a6,$a7,[$np,#8*6] + add $np,$np,#8*8 + +.Lsqr8x_tail: + mul $t0,$a0,$n0 + adc $carry,xzr,xzr // carry bit, modulo-scheduled + mul $t1,$a1,$n0 + add $cnt,$cnt,#8 + mul $t2,$a2,$n0 + mul $t3,$a3,$n0 + adds $acc0,$acc0,$t0 + mul $t0,$a4,$n0 + adcs $acc1,$acc1,$t1 + mul $t1,$a5,$n0 + adcs $acc2,$acc2,$t2 + mul $t2,$a6,$n0 + adcs $acc3,$acc3,$t3 + mul $t3,$a7,$n0 + adcs $acc4,$acc4,$t0 + umulh $t0,$a0,$n0 + adcs $acc5,$acc5,$t1 + umulh $t1,$a1,$n0 + adcs $acc6,$acc6,$t2 + umulh $t2,$a2,$n0 + adcs $acc7,$acc7,$t3 + umulh $t3,$a3,$n0 + adc $carry,$carry,xzr + str $acc0,[$tp],#8 + adds $acc0,$acc1,$t0 + umulh $t0,$a4,$n0 + adcs $acc1,$acc2,$t1 + umulh $t1,$a5,$n0 + adcs $acc2,$acc3,$t2 + umulh $t2,$a6,$n0 + adcs $acc3,$acc4,$t3 + umulh $t3,$a7,$n0 + ldr $n0,[$rp,$cnt] + adcs $acc4,$acc5,$t0 + adcs $acc5,$acc6,$t1 + adcs $acc6,$acc7,$t2 + adcs $acc7,$carry,$t3 + //adc $carry,xzr,xzr // moved above + cbnz $cnt,.Lsqr8x_tail + // note that carry flag is guaranteed + // to be zero at this point + ldp $a0,$a1,[$tp,#8*0] + sub $cnt,$np_end,$np // done yet? 
+ sub $t2,$np_end,$num // rewinded np + ldp $a2,$a3,[$tp,#8*2] + ldp $a4,$a5,[$tp,#8*4] + ldp $a6,$a7,[$tp,#8*6] + cbz $cnt,.Lsqr8x_tail_break + + ldr $n0,[$rp,#-8*8] + adds $acc0,$acc0,$a0 + adcs $acc1,$acc1,$a1 + ldp $a0,$a1,[$np,#8*0] + adcs $acc2,$acc2,$a2 + adcs $acc3,$acc3,$a3 + ldp $a2,$a3,[$np,#8*2] + adcs $acc4,$acc4,$a4 + adcs $acc5,$acc5,$a5 + ldp $a4,$a5,[$np,#8*4] + adcs $acc6,$acc6,$a6 + mov $cnt,#-8*8 + adcs $acc7,$acc7,$a7 + ldp $a6,$a7,[$np,#8*6] + add $np,$np,#8*8 + //adc $carry,xzr,xzr // moved above + b .Lsqr8x_tail + +.align 4 +.Lsqr8x_tail_break: + ldr $n0,[x29,#112] // pull n0 + add $cnt,$tp,#8*8 // end of current t[num] window + + subs xzr,$topmost,#1 // "move" top-most carry to carry bit + adcs $t0,$acc0,$a0 + adcs $t1,$acc1,$a1 + ldp $acc0,$acc1,[$rp,#8*0] + adcs $acc2,$acc2,$a2 + ldp $a0,$a1,[$t2,#8*0] // recall that $t2 is &n[0] + adcs $acc3,$acc3,$a3 + ldp $a2,$a3,[$t2,#8*2] + adcs $acc4,$acc4,$a4 + adcs $acc5,$acc5,$a5 + ldp $a4,$a5,[$t2,#8*4] + adcs $acc6,$acc6,$a6 + adcs $acc7,$acc7,$a7 + ldp $a6,$a7,[$t2,#8*6] + add $np,$t2,#8*8 + adc $topmost,xzr,xzr // top-most carry + mul $na0,$n0,$acc0 + stp $t0,$t1,[$tp,#8*0] + stp $acc2,$acc3,[$tp,#8*2] + ldp $acc2,$acc3,[$rp,#8*2] + stp $acc4,$acc5,[$tp,#8*4] + ldp $acc4,$acc5,[$rp,#8*4] + cmp $cnt,x29 // did we hit the bottom? + stp $acc6,$acc7,[$tp,#8*6] + mov $tp,$rp // slide the window + ldp $acc6,$acc7,[$rp,#8*6] + mov $cnt,#8 + b.ne .Lsqr8x_reduction + + // Final step. We see if result is larger than modulus, and + // if it is, subtract the modulus. But comparison implies + // subtraction. So we subtract modulus, see if it borrowed, + // and conditionally copy original value. + ldr $rp,[x29,#96] // pull rp + add $tp,$tp,#8*8 + subs $t0,$acc0,$a0 + sbcs $t1,$acc1,$a1 + sub $cnt,$num,#8*8 + mov $ap_end,$rp // $rp copy + +.Lsqr8x_sub: + sbcs $t2,$acc2,$a2 + ldp $a0,$a1,[$np,#8*0] + sbcs $t3,$acc3,$a3 + stp $t0,$t1,[$rp,#8*0] + sbcs $t0,$acc4,$a4 + ldp $a2,$a3,[$np,#8*2] + sbcs $t1,$acc5,$a5 + stp $t2,$t3,[$rp,#8*2] + sbcs $t2,$acc6,$a6 + ldp $a4,$a5,[$np,#8*4] + sbcs $t3,$acc7,$a7 + ldp $a6,$a7,[$np,#8*6] + add $np,$np,#8*8 + ldp $acc0,$acc1,[$tp,#8*0] + sub $cnt,$cnt,#8*8 + ldp $acc2,$acc3,[$tp,#8*2] + ldp $acc4,$acc5,[$tp,#8*4] + ldp $acc6,$acc7,[$tp,#8*6] + add $tp,$tp,#8*8 + stp $t0,$t1,[$rp,#8*4] + sbcs $t0,$acc0,$a0 + stp $t2,$t3,[$rp,#8*6] + add $rp,$rp,#8*8 + sbcs $t1,$acc1,$a1 + cbnz $cnt,.Lsqr8x_sub + + sbcs $t2,$acc2,$a2 + mov $tp,sp + add $ap,sp,$num + ldp $a0,$a1,[$ap_end,#8*0] + sbcs $t3,$acc3,$a3 + stp $t0,$t1,[$rp,#8*0] + sbcs $t0,$acc4,$a4 + ldp $a2,$a3,[$ap_end,#8*2] + sbcs $t1,$acc5,$a5 + stp $t2,$t3,[$rp,#8*2] + sbcs $t2,$acc6,$a6 + ldp $acc0,$acc1,[$ap,#8*0] + sbcs $t3,$acc7,$a7 + ldp $acc2,$acc3,[$ap,#8*2] + sbcs xzr,$topmost,xzr // did it borrow? 
+ ldr x30,[x29,#8] // pull return address + stp $t0,$t1,[$rp,#8*4] + stp $t2,$t3,[$rp,#8*6] + + sub $cnt,$num,#8*4 +.Lsqr4x_cond_copy: + sub $cnt,$cnt,#8*4 + csel $t0,$acc0,$a0,lo + stp xzr,xzr,[$tp,#8*0] + csel $t1,$acc1,$a1,lo + ldp $a0,$a1,[$ap_end,#8*4] + ldp $acc0,$acc1,[$ap,#8*4] + csel $t2,$acc2,$a2,lo + stp xzr,xzr,[$tp,#8*2] + add $tp,$tp,#8*4 + csel $t3,$acc3,$a3,lo + ldp $a2,$a3,[$ap_end,#8*6] + ldp $acc2,$acc3,[$ap,#8*6] + add $ap,$ap,#8*4 + stp $t0,$t1,[$ap_end,#8*0] + stp $t2,$t3,[$ap_end,#8*2] + add $ap_end,$ap_end,#8*4 + stp xzr,xzr,[$ap,#8*0] + stp xzr,xzr,[$ap,#8*2] + cbnz $cnt,.Lsqr4x_cond_copy + + csel $t0,$acc0,$a0,lo + stp xzr,xzr,[$tp,#8*0] + csel $t1,$acc1,$a1,lo + stp xzr,xzr,[$tp,#8*2] + csel $t2,$acc2,$a2,lo + csel $t3,$acc3,$a3,lo + stp $t0,$t1,[$ap_end,#8*0] + stp $t2,$t3,[$ap_end,#8*2] + + b .Lsqr8x_done + +.align 4 +.Lsqr8x8_post_condition: + adc $carry,xzr,xzr + ldr x30,[x29,#8] // pull return address + // $acc0-7,$carry hold result, $a0-7 hold modulus + subs $a0,$acc0,$a0 + ldr $ap,[x29,#96] // pull rp + sbcs $a1,$acc1,$a1 + stp xzr,xzr,[sp,#8*0] + sbcs $a2,$acc2,$a2 + stp xzr,xzr,[sp,#8*2] + sbcs $a3,$acc3,$a3 + stp xzr,xzr,[sp,#8*4] + sbcs $a4,$acc4,$a4 + stp xzr,xzr,[sp,#8*6] + sbcs $a5,$acc5,$a5 + stp xzr,xzr,[sp,#8*8] + sbcs $a6,$acc6,$a6 + stp xzr,xzr,[sp,#8*10] + sbcs $a7,$acc7,$a7 + stp xzr,xzr,[sp,#8*12] + sbcs $carry,$carry,xzr // did it borrow? + stp xzr,xzr,[sp,#8*14] + + // $a0-7 hold result-modulus + csel $a0,$acc0,$a0,lo + csel $a1,$acc1,$a1,lo + csel $a2,$acc2,$a2,lo + csel $a3,$acc3,$a3,lo + stp $a0,$a1,[$ap,#8*0] + csel $a4,$acc4,$a4,lo + csel $a5,$acc5,$a5,lo + stp $a2,$a3,[$ap,#8*2] + csel $a6,$acc6,$a6,lo + csel $a7,$acc7,$a7,lo + stp $a4,$a5,[$ap,#8*4] + stp $a6,$a7,[$ap,#8*6] + +.Lsqr8x_done: + ldp x19,x20,[x29,#16] + mov sp,x29 + ldp x21,x22,[x29,#32] + mov x0,#1 + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + // x30 is popped earlier + AARCH64_VALIDATE_LINK_REGISTER + ret +.size bn_sqr8x_mont,.-bn_sqr8x_mont +___ +} + +{ +######################################################################## +# Even though this might look as ARMv8 adaptation of mulx4x_mont from +# x86_64-mont5 module, it's different in sense that it performs +# reduction 256 bits at a time. + +my ($a0,$a1,$a2,$a3, + $t0,$t1,$t2,$t3, + $m0,$m1,$m2,$m3, + $acc0,$acc1,$acc2,$acc3,$acc4, + $bi,$mi,$tp,$ap_end,$cnt) = map("x$_",(6..17,19..28)); +my $bp_end=$rp; +my ($carry,$topmost) = ($rp,"x30"); + +$code.=<<___; +.globl bn_mul4x_mont +.type bn_mul4x_mont,%function +.align 5 +bn_mul4x_mont: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + sub $tp,sp,$num,lsl#3 + lsl $num,$num,#3 + ldr $n0,[$n0] // *n0 + sub sp,$tp,#8*4 // alloca + + add $t0,$bp,$num + add $ap_end,$ap,$num + stp $rp,$t0,[x29,#96] // offload rp and &b[num] + + ldr $bi,[$bp,#8*0] // b[0] + ldp $a0,$a1,[$ap,#8*0] // a[0..3] + ldp $a2,$a3,[$ap,#8*2] + add $ap,$ap,#8*4 + mov $acc0,xzr + mov $acc1,xzr + mov $acc2,xzr + mov $acc3,xzr + ldp $m0,$m1,[$np,#8*0] // n[0..3] + ldp $m2,$m3,[$np,#8*2] + adds $np,$np,#8*4 // clear carry bit + mov $carry,xzr + mov $cnt,#0 + mov $tp,sp + +.Loop_mul4x_1st_reduction: + mul $t0,$a0,$bi // lo(a[0..3]*b[0]) + adc $carry,$carry,xzr // modulo-scheduled + mul $t1,$a1,$bi + add $cnt,$cnt,#8 + mul $t2,$a2,$bi + and $cnt,$cnt,#31 + mul $t3,$a3,$bi + adds $acc0,$acc0,$t0 + umulh $t0,$a0,$bi // hi(a[0..3]*b[0]) + adcs $acc1,$acc1,$t1 + mul $mi,$acc0,$n0 // t[0]*n0 + adcs $acc2,$acc2,$t2 + umulh $t1,$a1,$bi + adcs $acc3,$acc3,$t3 + umulh $t2,$a2,$bi + adc $acc4,xzr,xzr + umulh $t3,$a3,$bi + ldr $bi,[$bp,$cnt] // next b[i] (or b[0]) + adds $acc1,$acc1,$t0 + // (*) mul $t0,$m0,$mi // lo(n[0..3]*t[0]*n0) + str $mi,[$tp],#8 // put aside t[0]*n0 for tail processing + adcs $acc2,$acc2,$t1 + mul $t1,$m1,$mi + adcs $acc3,$acc3,$t2 + mul $t2,$m2,$mi + adc $acc4,$acc4,$t3 // can't overflow + mul $t3,$m3,$mi + // (*) adds xzr,$acc0,$t0 + subs xzr,$acc0,#1 // (*) + umulh $t0,$m0,$mi // hi(n[0..3]*t[0]*n0) + adcs $acc0,$acc1,$t1 + umulh $t1,$m1,$mi + adcs $acc1,$acc2,$t2 + umulh $t2,$m2,$mi + adcs $acc2,$acc3,$t3 + umulh $t3,$m3,$mi + adcs $acc3,$acc4,$carry + adc $carry,xzr,xzr + adds $acc0,$acc0,$t0 + sub $t0,$ap_end,$ap + adcs $acc1,$acc1,$t1 + adcs $acc2,$acc2,$t2 + adcs $acc3,$acc3,$t3 + //adc $carry,$carry,xzr + cbnz $cnt,.Loop_mul4x_1st_reduction + + cbz $t0,.Lmul4x4_post_condition + + ldp $a0,$a1,[$ap,#8*0] // a[4..7] + ldp $a2,$a3,[$ap,#8*2] + add $ap,$ap,#8*4 + ldr $mi,[sp] // a[0]*n0 + ldp $m0,$m1,[$np,#8*0] // n[4..7] + ldp $m2,$m3,[$np,#8*2] + add $np,$np,#8*4 + +.Loop_mul4x_1st_tail: + mul $t0,$a0,$bi // lo(a[4..7]*b[i]) + adc $carry,$carry,xzr // modulo-scheduled + mul $t1,$a1,$bi + add $cnt,$cnt,#8 + mul $t2,$a2,$bi + and $cnt,$cnt,#31 + mul $t3,$a3,$bi + adds $acc0,$acc0,$t0 + umulh $t0,$a0,$bi // hi(a[4..7]*b[i]) + adcs $acc1,$acc1,$t1 + umulh $t1,$a1,$bi + adcs $acc2,$acc2,$t2 + umulh $t2,$a2,$bi + adcs $acc3,$acc3,$t3 + umulh $t3,$a3,$bi + adc $acc4,xzr,xzr + ldr $bi,[$bp,$cnt] // next b[i] (or b[0]) + adds $acc1,$acc1,$t0 + mul $t0,$m0,$mi // lo(n[4..7]*a[0]*n0) + adcs $acc2,$acc2,$t1 + mul $t1,$m1,$mi + adcs $acc3,$acc3,$t2 + mul $t2,$m2,$mi + adc $acc4,$acc4,$t3 // can't overflow + mul $t3,$m3,$mi + adds $acc0,$acc0,$t0 + umulh $t0,$m0,$mi // hi(n[4..7]*a[0]*n0) + adcs $acc1,$acc1,$t1 + umulh $t1,$m1,$mi + adcs $acc2,$acc2,$t2 + umulh $t2,$m2,$mi + adcs $acc3,$acc3,$t3 + adcs $acc4,$acc4,$carry + umulh $t3,$m3,$mi + adc $carry,xzr,xzr + ldr $mi,[sp,$cnt] // next t[0]*n0 + str $acc0,[$tp],#8 // result!!! + adds $acc0,$acc1,$t0 + sub $t0,$ap_end,$ap // done yet? + adcs $acc1,$acc2,$t1 + adcs $acc2,$acc3,$t2 + adcs $acc3,$acc4,$t3 + //adc $carry,$carry,xzr + cbnz $cnt,.Loop_mul4x_1st_tail + + sub $t1,$ap_end,$num // rewinded $ap + cbz $t0,.Lmul4x_proceed + + ldp $a0,$a1,[$ap,#8*0] + ldp $a2,$a3,[$ap,#8*2] + add $ap,$ap,#8*4 + ldp $m0,$m1,[$np,#8*0] + ldp $m2,$m3,[$np,#8*2] + add $np,$np,#8*4 + b .Loop_mul4x_1st_tail + +.align 5 +.Lmul4x_proceed: + ldr $bi,[$bp,#8*4]! 
// *++b + adc $topmost,$carry,xzr + ldp $a0,$a1,[$t1,#8*0] // a[0..3] + sub $np,$np,$num // rewind np + ldp $a2,$a3,[$t1,#8*2] + add $ap,$t1,#8*4 + + stp $acc0,$acc1,[$tp,#8*0] // result!!! + ldp $acc0,$acc1,[sp,#8*4] // t[0..3] + stp $acc2,$acc3,[$tp,#8*2] // result!!! + ldp $acc2,$acc3,[sp,#8*6] + + ldp $m0,$m1,[$np,#8*0] // n[0..3] + mov $tp,sp + ldp $m2,$m3,[$np,#8*2] + adds $np,$np,#8*4 // clear carry bit + mov $carry,xzr + +.align 4 +.Loop_mul4x_reduction: + mul $t0,$a0,$bi // lo(a[0..3]*b[4]) + adc $carry,$carry,xzr // modulo-scheduled + mul $t1,$a1,$bi + add $cnt,$cnt,#8 + mul $t2,$a2,$bi + and $cnt,$cnt,#31 + mul $t3,$a3,$bi + adds $acc0,$acc0,$t0 + umulh $t0,$a0,$bi // hi(a[0..3]*b[4]) + adcs $acc1,$acc1,$t1 + mul $mi,$acc0,$n0 // t[0]*n0 + adcs $acc2,$acc2,$t2 + umulh $t1,$a1,$bi + adcs $acc3,$acc3,$t3 + umulh $t2,$a2,$bi + adc $acc4,xzr,xzr + umulh $t3,$a3,$bi + ldr $bi,[$bp,$cnt] // next b[i] + adds $acc1,$acc1,$t0 + // (*) mul $t0,$m0,$mi + str $mi,[$tp],#8 // put aside t[0]*n0 for tail processing + adcs $acc2,$acc2,$t1 + mul $t1,$m1,$mi // lo(n[0..3]*t[0]*n0 + adcs $acc3,$acc3,$t2 + mul $t2,$m2,$mi + adc $acc4,$acc4,$t3 // can't overflow + mul $t3,$m3,$mi + // (*) adds xzr,$acc0,$t0 + subs xzr,$acc0,#1 // (*) + umulh $t0,$m0,$mi // hi(n[0..3]*t[0]*n0 + adcs $acc0,$acc1,$t1 + umulh $t1,$m1,$mi + adcs $acc1,$acc2,$t2 + umulh $t2,$m2,$mi + adcs $acc2,$acc3,$t3 + umulh $t3,$m3,$mi + adcs $acc3,$acc4,$carry + adc $carry,xzr,xzr + adds $acc0,$acc0,$t0 + adcs $acc1,$acc1,$t1 + adcs $acc2,$acc2,$t2 + adcs $acc3,$acc3,$t3 + //adc $carry,$carry,xzr + cbnz $cnt,.Loop_mul4x_reduction + + adc $carry,$carry,xzr + ldp $t0,$t1,[$tp,#8*4] // t[4..7] + ldp $t2,$t3,[$tp,#8*6] + ldp $a0,$a1,[$ap,#8*0] // a[4..7] + ldp $a2,$a3,[$ap,#8*2] + add $ap,$ap,#8*4 + adds $acc0,$acc0,$t0 + adcs $acc1,$acc1,$t1 + adcs $acc2,$acc2,$t2 + adcs $acc3,$acc3,$t3 + //adc $carry,$carry,xzr + + ldr $mi,[sp] // t[0]*n0 + ldp $m0,$m1,[$np,#8*0] // n[4..7] + ldp $m2,$m3,[$np,#8*2] + add $np,$np,#8*4 + +.align 4 +.Loop_mul4x_tail: + mul $t0,$a0,$bi // lo(a[4..7]*b[4]) + adc $carry,$carry,xzr // modulo-scheduled + mul $t1,$a1,$bi + add $cnt,$cnt,#8 + mul $t2,$a2,$bi + and $cnt,$cnt,#31 + mul $t3,$a3,$bi + adds $acc0,$acc0,$t0 + umulh $t0,$a0,$bi // hi(a[4..7]*b[4]) + adcs $acc1,$acc1,$t1 + umulh $t1,$a1,$bi + adcs $acc2,$acc2,$t2 + umulh $t2,$a2,$bi + adcs $acc3,$acc3,$t3 + umulh $t3,$a3,$bi + adc $acc4,xzr,xzr + ldr $bi,[$bp,$cnt] // next b[i] + adds $acc1,$acc1,$t0 + mul $t0,$m0,$mi // lo(n[4..7]*t[0]*n0) + adcs $acc2,$acc2,$t1 + mul $t1,$m1,$mi + adcs $acc3,$acc3,$t2 + mul $t2,$m2,$mi + adc $acc4,$acc4,$t3 // can't overflow + mul $t3,$m3,$mi + adds $acc0,$acc0,$t0 + umulh $t0,$m0,$mi // hi(n[4..7]*t[0]*n0) + adcs $acc1,$acc1,$t1 + umulh $t1,$m1,$mi + adcs $acc2,$acc2,$t2 + umulh $t2,$m2,$mi + adcs $acc3,$acc3,$t3 + umulh $t3,$m3,$mi + adcs $acc4,$acc4,$carry + ldr $mi,[sp,$cnt] // next a[0]*n0 + adc $carry,xzr,xzr + str $acc0,[$tp],#8 // result!!! + adds $acc0,$acc1,$t0 + sub $t0,$ap_end,$ap // done yet? + adcs $acc1,$acc2,$t1 + adcs $acc2,$acc3,$t2 + adcs $acc3,$acc4,$t3 + //adc $carry,$carry,xzr + cbnz $cnt,.Loop_mul4x_tail + + sub $t1,$np,$num // rewinded np? 
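
The lines marked (*) above record a small optimization: lo(n[0] * m), where m = t[0]*n0 mod 2^64, never has to be computed. With n0 = -n[0]^-1 mod 2^64, m is chosen precisely so that t[0] + lo(m*n[0]) wraps to zero, so the only thing that addition contributes is its carry, which is 1 exactly when t[0] is non-zero; `subs xzr,$acc0,#1` leaves the same value in the carry flag. A short C illustration of the identity, with illustrative names:

#include <assert.h>
#include <stdint.h>

/* Carry that the discarded addition t0 + lo(m*n_lo) would have produced.
 * Because m = t0*n0 mod 2^64 with n0 = -n_lo^-1 mod 2^64, the low 64 bits
 * of the sum are zero, so the carry is simply t0 != 0. */
static unsigned discarded_carry(uint64_t t0, uint64_t m, uint64_t n_lo) {
    uint64_t lo = m * n_lo;   /* low 64 bits of m * n[0] */
    uint64_t sum = t0 + lo;   /* wraps to 0 by the choice of m */
    assert(sum == 0);
    (void)sum;
    return t0 != 0;           /* equals the carry out of t0 + lo */
}
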
+ adc $carry,$carry,xzr + cbz $t0,.Loop_mul4x_break + + ldp $t0,$t1,[$tp,#8*4] + ldp $t2,$t3,[$tp,#8*6] + ldp $a0,$a1,[$ap,#8*0] + ldp $a2,$a3,[$ap,#8*2] + add $ap,$ap,#8*4 + adds $acc0,$acc0,$t0 + adcs $acc1,$acc1,$t1 + adcs $acc2,$acc2,$t2 + adcs $acc3,$acc3,$t3 + //adc $carry,$carry,xzr + ldp $m0,$m1,[$np,#8*0] + ldp $m2,$m3,[$np,#8*2] + add $np,$np,#8*4 + b .Loop_mul4x_tail + +.align 4 +.Loop_mul4x_break: + ldp $t2,$t3,[x29,#96] // pull rp and &b[num] + adds $acc0,$acc0,$topmost + add $bp,$bp,#8*4 // bp++ + adcs $acc1,$acc1,xzr + sub $ap,$ap,$num // rewind ap + adcs $acc2,$acc2,xzr + stp $acc0,$acc1,[$tp,#8*0] // result!!! + adcs $acc3,$acc3,xzr + ldp $acc0,$acc1,[sp,#8*4] // t[0..3] + adc $topmost,$carry,xzr + stp $acc2,$acc3,[$tp,#8*2] // result!!! + cmp $bp,$t3 // done yet? + ldp $acc2,$acc3,[sp,#8*6] + ldp $m0,$m1,[$t1,#8*0] // n[0..3] + ldp $m2,$m3,[$t1,#8*2] + add $np,$t1,#8*4 + b.eq .Lmul4x_post + + ldr $bi,[$bp] + ldp $a0,$a1,[$ap,#8*0] // a[0..3] + ldp $a2,$a3,[$ap,#8*2] + adds $ap,$ap,#8*4 // clear carry bit + mov $carry,xzr + mov $tp,sp + b .Loop_mul4x_reduction + +.align 4 +.Lmul4x_post: + // Final step. We see if result is larger than modulus, and + // if it is, subtract the modulus. But comparison implies + // subtraction. So we subtract modulus, see if it borrowed, + // and conditionally copy original value. + mov $rp,$t2 + mov $ap_end,$t2 // $rp copy + subs $t0,$acc0,$m0 + add $tp,sp,#8*8 + sbcs $t1,$acc1,$m1 + sub $cnt,$num,#8*4 + +.Lmul4x_sub: + sbcs $t2,$acc2,$m2 + ldp $m0,$m1,[$np,#8*0] + sub $cnt,$cnt,#8*4 + ldp $acc0,$acc1,[$tp,#8*0] + sbcs $t3,$acc3,$m3 + ldp $m2,$m3,[$np,#8*2] + add $np,$np,#8*4 + ldp $acc2,$acc3,[$tp,#8*2] + add $tp,$tp,#8*4 + stp $t0,$t1,[$rp,#8*0] + sbcs $t0,$acc0,$m0 + stp $t2,$t3,[$rp,#8*2] + add $rp,$rp,#8*4 + sbcs $t1,$acc1,$m1 + cbnz $cnt,.Lmul4x_sub + + sbcs $t2,$acc2,$m2 + mov $tp,sp + add $ap,sp,#8*4 + ldp $a0,$a1,[$ap_end,#8*0] + sbcs $t3,$acc3,$m3 + stp $t0,$t1,[$rp,#8*0] + ldp $a2,$a3,[$ap_end,#8*2] + stp $t2,$t3,[$rp,#8*2] + ldp $acc0,$acc1,[$ap,#8*0] + ldp $acc2,$acc3,[$ap,#8*2] + sbcs xzr,$topmost,xzr // did it borrow? + ldr x30,[x29,#8] // pull return address + + sub $cnt,$num,#8*4 +.Lmul4x_cond_copy: + sub $cnt,$cnt,#8*4 + csel $t0,$acc0,$a0,lo + stp xzr,xzr,[$tp,#8*0] + csel $t1,$acc1,$a1,lo + ldp $a0,$a1,[$ap_end,#8*4] + ldp $acc0,$acc1,[$ap,#8*4] + csel $t2,$acc2,$a2,lo + stp xzr,xzr,[$tp,#8*2] + add $tp,$tp,#8*4 + csel $t3,$acc3,$a3,lo + ldp $a2,$a3,[$ap_end,#8*6] + ldp $acc2,$acc3,[$ap,#8*6] + add $ap,$ap,#8*4 + stp $t0,$t1,[$ap_end,#8*0] + stp $t2,$t3,[$ap_end,#8*2] + add $ap_end,$ap_end,#8*4 + cbnz $cnt,.Lmul4x_cond_copy + + csel $t0,$acc0,$a0,lo + stp xzr,xzr,[$tp,#8*0] + csel $t1,$acc1,$a1,lo + stp xzr,xzr,[$tp,#8*2] + csel $t2,$acc2,$a2,lo + stp xzr,xzr,[$tp,#8*3] + csel $t3,$acc3,$a3,lo + stp xzr,xzr,[$tp,#8*4] + stp $t0,$t1,[$ap_end,#8*0] + stp $t2,$t3,[$ap_end,#8*2] + + b .Lmul4x_done + +.align 4 +.Lmul4x4_post_condition: + adc $carry,$carry,xzr + ldr $ap,[x29,#96] // pull rp + // $acc0-3,$carry hold result, $m0-7 hold modulus + subs $a0,$acc0,$m0 + ldr x30,[x29,#8] // pull return address + sbcs $a1,$acc1,$m1 + stp xzr,xzr,[sp,#8*0] + sbcs $a2,$acc2,$m2 + stp xzr,xzr,[sp,#8*2] + sbcs $a3,$acc3,$m3 + stp xzr,xzr,[sp,#8*4] + sbcs xzr,$carry,xzr // did it borrow? 
+ stp xzr,xzr,[sp,#8*6] + + // $a0-3 hold result-modulus + csel $a0,$acc0,$a0,lo + csel $a1,$acc1,$a1,lo + csel $a2,$acc2,$a2,lo + csel $a3,$acc3,$a3,lo + stp $a0,$a1,[$ap,#8*0] + stp $a2,$a3,[$ap,#8*2] + +.Lmul4x_done: + ldp x19,x20,[x29,#16] + mov sp,x29 + ldp x21,x22,[x29,#32] + mov x0,#1 + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + // x30 is popped earlier + AARCH64_VALIDATE_LINK_REGISTER + ret +.size bn_mul4x_mont,.-bn_mul4x_mont +___ +} +$code.=<<___; +.asciz "Montgomery Multiplication for ARMv8, CRYPTOGAMS by " +.align 4 +___ + +print $code; + +close STDOUT or die "error closing STDOUT: $!"; diff --git a/ring-0.17.14/crypto/fipsmodule/bn/asm/x86-mont.pl b/ring-0.17.14/crypto/fipsmodule/bn/asm/x86-mont.pl new file mode 100644 index 0000000000..afa09948bc --- /dev/null +++ b/ring-0.17.14/crypto/fipsmodule/bn/asm/x86-mont.pl @@ -0,0 +1,334 @@ +#! /usr/bin/env perl +# Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. +# ==================================================================== + +# October 2005 +# +# This is a "teaser" code, as it can be improved in several ways... +# First of all non-SSE2 path should be implemented (yes, for now it +# performs Montgomery multiplication/convolution only on SSE2-capable +# CPUs such as P4, others fall down to original code). Then inner loop +# can be unrolled and modulo-scheduled to improve ILP and possibly +# moved to 128-bit XMM register bank (though it would require input +# rearrangement and/or increase bus bandwidth utilization). Dedicated +# squaring procedure should give further performance improvement... +# Yet, for being draft, the code improves rsa512 *sign* benchmark by +# 110%(!), rsa1024 one - by 70% and rsa4096 - by 20%:-) + +# December 2006 +# +# Modulo-scheduling SSE2 loops results in further 15-20% improvement. +# Integer-only code [being equipped with dedicated squaring procedure] +# gives ~40% on rsa512 sign benchmark... + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +push(@INC,"${dir}","${dir}../../../perlasm"); +require "x86asm.pl"; + +$output = pop; +open STDOUT,">$output"; + +&asm_init($ARGV[0]); + +$sse2=1; + +&function_begin("bn_mul_mont"); + +$i="edx"; +$j="ecx"; +$ap="esi"; $tp="esi"; # overlapping variables!!! +$rp="edi"; $bp="edi"; # overlapping variables!!! 
+$np="ebp"; +$num="ebx"; + +$_num=&DWP(4*0,"esp"); # stack top layout +$_rp=&DWP(4*1,"esp"); +$_ap=&DWP(4*2,"esp"); +$_bp=&DWP(4*3,"esp"); +$_np=&DWP(4*4,"esp"); +$_n0=&DWP(4*5,"esp"); $_n0q=&QWP(4*5,"esp"); +$_sp=&DWP(4*6,"esp"); +$_bpend=&DWP(4*7,"esp"); +$frame=32; # size of above frame rounded up to 16n + + &xor ("eax","eax"); + &mov ("edi",&wparam(5)); # int num + + &lea ("esi",&wparam(0)); # put aside pointer to argument block + &lea ("edx",&wparam(1)); # load ap + &add ("edi",2); # extra two words on top of tp + &neg ("edi"); + &lea ("ebp",&DWP(-$frame,"esp","edi",4)); # future alloca($frame+4*(num+2)) + &neg ("edi"); + + # minimize cache contention by arranging 2K window between stack + # pointer and ap argument [np is also position sensitive vector, + # but it's assumed to be near ap, as it's allocated at ~same + # time]. + &mov ("eax","ebp"); + &sub ("eax","edx"); + &and ("eax",2047); + &sub ("ebp","eax"); # this aligns sp and ap modulo 2048 + + &xor ("edx","ebp"); + &and ("edx",2048); + &xor ("edx",2048); + &sub ("ebp","edx"); # this splits them apart modulo 4096 + + &and ("ebp",-64); # align to cache line + + # An OS-agnostic version of __chkstk. + # + # Some OSes (Windows) insist on stack being "wired" to + # physical memory in strictly sequential manner, i.e. if stack + # allocation spans two pages, then reference to farmost one can + # be punishable by SEGV. But page walking can do good even on + # other OSes, because it guarantees that villain thread hits + # the guard page before it can make damage to innocent one... + &mov ("eax","esp"); + &sub ("eax","ebp"); + &and ("eax",-4096); + &mov ("edx","esp"); # saved stack pointer! + &lea ("esp",&DWP(0,"ebp","eax")); + &mov ("eax",&DWP(0,"esp")); + &cmp ("esp","ebp"); + &ja (&label("page_walk")); + &jmp (&label("page_walk_done")); + +&set_label("page_walk",16); + &lea ("esp",&DWP(-4096,"esp")); + &mov ("eax",&DWP(0,"esp")); + &cmp ("esp","ebp"); + &ja (&label("page_walk")); +&set_label("page_walk_done"); + + ################################# load argument block... + &mov ("eax",&DWP(0*4,"esi"));# BN_ULONG *rp + &mov ("ebx",&DWP(1*4,"esi"));# const BN_ULONG *ap + &mov ("ecx",&DWP(2*4,"esi"));# const BN_ULONG *bp + &mov ("ebp",&DWP(3*4,"esi"));# const BN_ULONG *np + &mov ("esi",&DWP(4*4,"esi"));# const BN_ULONG *n0 + #&mov ("edi",&DWP(5*4,"esi"));# int num + + &mov ("esi",&DWP(0,"esi")); # pull n0[0] + &mov ($_rp,"eax"); # ... save a copy of argument block + &mov ($_ap,"ebx"); + &mov ($_bp,"ecx"); + &mov ($_np,"ebp"); + &mov ($_n0,"esi"); + &lea ($num,&DWP(-3,"edi")); # num=num-1 to assist modulo-scheduling + #&mov ($_num,$num); # redundant as $num is not reused + &mov ($_sp,"edx"); # saved stack pointer! 
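
The loop that follows (like the AArch64 and x86_64 variants alongside it) is word-by-word Montgomery multiplication: for each word of b, accumulate a*b[i] into the running value t, then add the multiple m*n that zeroes the low word of t so it can be shifted out, and finish with one conditional subtraction. A compact reference sketch in C with 32-bit limbs to match this file's SSE2 path; the function name and the flat scratch buffer are illustrative, not this routine's interface:

#include <stddef.h>
#include <stdint.h>

/* rp = ap * bp * 2^(-32*num) mod np.  tp is num+2 limbs of scratch,
 * zero-initialized by the caller.  n0 = -np[0]^-1 mod 2^32. */
static void mont_mul_ref(uint32_t *rp, const uint32_t *ap, const uint32_t *bp,
                         const uint32_t *np, uint32_t n0, size_t num,
                         uint32_t *tp) {
    for (size_t i = 0; i < num; i++) {
        /* t += a * b[i] */
        uint64_t carry = 0;
        for (size_t j = 0; j < num; j++) {
            uint64_t v = (uint64_t)ap[j] * bp[i] + tp[j] + carry;
            tp[j] = (uint32_t)v;
            carry = v >> 32;
        }
        uint64_t v = (uint64_t)tp[num] + carry;
        tp[num] = (uint32_t)v;
        tp[num + 1] = (uint32_t)(v >> 32);

        /* m zeroes the low limb of t + m*n, so t can shift right one limb */
        uint32_t m = (uint32_t)((uint64_t)tp[0] * n0);
        carry = ((uint64_t)m * np[0] + tp[0]) >> 32;    /* low word is 0 */
        for (size_t j = 1; j < num; j++) {
            uint64_t w = (uint64_t)m * np[j] + tp[j] + carry;
            tp[j - 1] = (uint32_t)w;                    /* shift as we go */
            carry = w >> 32;
        }
        v = (uint64_t)tp[num] + carry;
        tp[num - 1] = (uint32_t)v;
        tp[num] = tp[num + 1] + (uint32_t)(v >> 32);
    }
    /* final conditional subtraction, as in the earlier sketch */
    uint32_t borrow = 0;
    for (size_t j = 0; j < num; j++) {
        uint64_t d = (uint64_t)tp[j] - np[j] - borrow;
        rp[j] = (uint32_t)d;
        borrow = (uint32_t)(d >> 63);                   /* 1 iff it borrowed */
    }
    uint32_t keep = 0 - (uint32_t)(tp[num] >= borrow);  /* all-ones or zero */
    for (size_t j = 0; j < num; j++) {
        rp[j] = (rp[j] & keep) | (tp[j] & ~keep);
    }
}
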
+ +if($sse2) { +$acc0="mm0"; # mmx register bank layout +$acc1="mm1"; +$car0="mm2"; +$car1="mm3"; +$mul0="mm4"; +$mul1="mm5"; +$temp="mm6"; +$mask="mm7"; + + &mov ("eax",-1); + &movd ($mask,"eax"); # mask 32 lower bits + + &mov ($ap,$_ap); # load input pointers + &mov ($bp,$_bp); + &mov ($np,$_np); + + &xor ($i,$i); # i=0 + &xor ($j,$j); # j=0 + + &movd ($mul0,&DWP(0,$bp)); # bp[0] + &movd ($mul1,&DWP(0,$ap)); # ap[0] + &movd ($car1,&DWP(0,$np)); # np[0] + + &pmuludq($mul1,$mul0); # ap[0]*bp[0] + &movq ($car0,$mul1); + &movq ($acc0,$mul1); # I wish movd worked for + &pand ($acc0,$mask); # inter-register transfers + + &pmuludq($mul1,$_n0q); # *=n0 + + &pmuludq($car1,$mul1); # "t[0]"*np[0]*n0 + &paddq ($car1,$acc0); + + &movd ($acc1,&DWP(4,$np)); # np[1] + &movd ($acc0,&DWP(4,$ap)); # ap[1] + + &psrlq ($car0,32); + &psrlq ($car1,32); + + &inc ($j); # j++ +&set_label("1st",16); + &pmuludq($acc0,$mul0); # ap[j]*bp[0] + &pmuludq($acc1,$mul1); # np[j]*m1 + &paddq ($car0,$acc0); # +=c0 + &paddq ($car1,$acc1); # +=c1 + + &movq ($acc0,$car0); + &pand ($acc0,$mask); + &movd ($acc1,&DWP(4,$np,$j,4)); # np[j+1] + &paddq ($car1,$acc0); # +=ap[j]*bp[0]; + &movd ($acc0,&DWP(4,$ap,$j,4)); # ap[j+1] + &psrlq ($car0,32); + &movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[j-1]= + &psrlq ($car1,32); + + &lea ($j,&DWP(1,$j)); + &cmp ($j,$num); + &jl (&label("1st")); + + &pmuludq($acc0,$mul0); # ap[num-1]*bp[0] + &pmuludq($acc1,$mul1); # np[num-1]*m1 + &paddq ($car0,$acc0); # +=c0 + &paddq ($car1,$acc1); # +=c1 + + &movq ($acc0,$car0); + &pand ($acc0,$mask); + &paddq ($car1,$acc0); # +=ap[num-1]*bp[0]; + &movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[num-2]= + + &psrlq ($car0,32); + &psrlq ($car1,32); + + &paddq ($car1,$car0); + &movq (&QWP($frame,"esp",$num,4),$car1); # tp[num].tp[num-1] + + &inc ($i); # i++ +&set_label("outer"); + &xor ($j,$j); # j=0 + + &movd ($mul0,&DWP(0,$bp,$i,4)); # bp[i] + &movd ($mul1,&DWP(0,$ap)); # ap[0] + &movd ($temp,&DWP($frame,"esp")); # tp[0] + &movd ($car1,&DWP(0,$np)); # np[0] + &pmuludq($mul1,$mul0); # ap[0]*bp[i] + + &paddq ($mul1,$temp); # +=tp[0] + &movq ($acc0,$mul1); + &movq ($car0,$mul1); + &pand ($acc0,$mask); + + &pmuludq($mul1,$_n0q); # *=n0 + + &pmuludq($car1,$mul1); + &paddq ($car1,$acc0); + + &movd ($temp,&DWP($frame+4,"esp")); # tp[1] + &movd ($acc1,&DWP(4,$np)); # np[1] + &movd ($acc0,&DWP(4,$ap)); # ap[1] + + &psrlq ($car0,32); + &psrlq ($car1,32); + &paddq ($car0,$temp); # +=tp[1] + + &inc ($j); # j++ + &dec ($num); +&set_label("inner"); + &pmuludq($acc0,$mul0); # ap[j]*bp[i] + &pmuludq($acc1,$mul1); # np[j]*m1 + &paddq ($car0,$acc0); # +=c0 + &paddq ($car1,$acc1); # +=c1 + + &movq ($acc0,$car0); + &movd ($temp,&DWP($frame+4,"esp",$j,4));# tp[j+1] + &pand ($acc0,$mask); + &movd ($acc1,&DWP(4,$np,$j,4)); # np[j+1] + &paddq ($car1,$acc0); # +=ap[j]*bp[i]+tp[j] + &movd ($acc0,&DWP(4,$ap,$j,4)); # ap[j+1] + &psrlq ($car0,32); + &movd (&DWP($frame-4,"esp",$j,4),$car1);# tp[j-1]= + &psrlq ($car1,32); + &paddq ($car0,$temp); # +=tp[j+1] + + &dec ($num); + &lea ($j,&DWP(1,$j)); # j++ + &jnz (&label("inner")); + + &mov ($num,$j); + &pmuludq($acc0,$mul0); # ap[num-1]*bp[i] + &pmuludq($acc1,$mul1); # np[num-1]*m1 + &paddq ($car0,$acc0); # +=c0 + &paddq ($car1,$acc1); # +=c1 + + &movq ($acc0,$car0); + &pand ($acc0,$mask); + &paddq ($car1,$acc0); # +=ap[num-1]*bp[i]+tp[num-1] + &movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[num-2]= + &psrlq ($car0,32); + &psrlq ($car1,32); + + &movd ($temp,&DWP($frame+4,"esp",$num,4)); # += tp[num] + &paddq ($car1,$car0); + &paddq 
($car1,$temp); + &movq (&QWP($frame,"esp",$num,4),$car1); # tp[num].tp[num-1] + + &lea ($i,&DWP(1,$i)); # i++ + &cmp ($i,$num); + &jle (&label("outer")); + + &emms (); # done with mmx bank + &jmp (&label("common_tail")); +} + +&set_label("common_tail",16); + &mov ($np,$_np); # load modulus pointer + &mov ($rp,$_rp); # load result pointer + &lea ($tp,&DWP($frame,"esp")); # [$ap and $bp are zapped] + + &mov ("eax",&DWP(0,$tp)); # tp[0] + &mov ($j,$num); # j=num-1 + &xor ($i,$i); # i=0 and clear CF! + +&set_label("sub",16); + &sbb ("eax",&DWP(0,$np,$i,4)); + &mov (&DWP(0,$rp,$i,4),"eax"); # rp[i]=tp[i]-np[i] + &dec ($j); # doesn't affect CF! + &mov ("eax",&DWP(4,$tp,$i,4)); # tp[i+1] + &lea ($i,&DWP(1,$i)); # i++ + &jge (&label("sub")); + + &sbb ("eax",0); # handle upmost overflow bit + &mov ("edx",-1); + &xor ("edx","eax"); + &jmp (&label("copy")); + +&set_label("copy",16); # conditional copy + &mov ($tp,&DWP($frame,"esp",$num,4)); + &mov ($np,&DWP(0,$rp,$num,4)); + &mov (&DWP($frame,"esp",$num,4),$j); # zap temporary vector + &and ($tp,"eax"); + &and ($np,"edx"); + &or ($np,$tp); + &mov (&DWP(0,$rp,$num,4),$np); + &dec ($num); + &jge (&label("copy")); + + &mov ("esp",$_sp); # pull saved stack pointer + &mov ("eax",1); +&function_end("bn_mul_mont"); + +&asciz("Montgomery Multiplication for x86, CRYPTOGAMS by "); + +&asm_finish(); + +close STDOUT or die "error closing STDOUT: $!"; diff --git a/ring-0.17.14/crypto/fipsmodule/bn/asm/x86_64-mont.pl b/ring-0.17.14/crypto/fipsmodule/bn/asm/x86_64-mont.pl new file mode 100644 index 0000000000..acbcd31099 --- /dev/null +++ b/ring-0.17.14/crypto/fipsmodule/bn/asm/x86_64-mont.pl @@ -0,0 +1,1561 @@ +#! /usr/bin/env perl +# Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. +# ==================================================================== + +# October 2005. +# +# Montgomery multiplication routine for x86_64. While it gives modest +# 9% improvement of rsa4096 sign on Opteron, rsa512 sign runs more +# than twice, >2x, as fast. Most common rsa1024 sign is improved by +# respectful 50%. It remains to be seen if loop unrolling and +# dedicated squaring routine can provide further improvement... + +# July 2011. +# +# Add dedicated squaring procedure. Performance improvement varies +# from platform to platform, but in average it's ~5%/15%/25%/33% +# for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively. + +# August 2011. +# +# Unroll and modulo-schedule inner loops in such manner that they +# are "fallen through" for input lengths of 8, which is critical for +# 1024-bit RSA *sign*. Average performance improvement in comparison +# to *initial* version of this module from 2005 is ~0%/30%/40%/45% +# for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively. + +# June 2013. 
+# +# Optimize reduction in squaring procedure and improve 1024+-bit RSA +# sign performance by 10-16% on Intel Sandy Bridge and later +# (virtually same on non-Intel processors). + +# August 2013. +# +# Add MULX/ADOX/ADCX code path. + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; +*STDOUT=*OUT; + +# In upstream, this is controlled by shelling out to the compiler to check +# versions, but BoringSSL is intended to be used with pre-generated perlasm +# output, so this isn't useful anyway. +$addx = 1; + +# void bn_mul_mont_nohw( +$rp="%rdi"; # BN_ULONG *rp, +$ap="%rsi"; # const BN_ULONG *ap, +$bp="%rdx"; # const BN_ULONG *bp, +$np="%rcx"; # const BN_ULONG *np, +$n0="%r8"; # const BN_ULONG *n0, +# TODO(davidben): The code below treats $num as an int, but C passes in a +# size_t. +$num="%r9"; # size_t num); +$lo0="%r10"; +$hi0="%r11"; +$hi1="%r13"; +$i="%r14"; +$j="%r15"; +$m0="%rbx"; +$m1="%rbp"; + +$code=<<___; +.text + +.globl bn_mul_mont_nohw +.type bn_mul_mont_nohw,\@function,6 +.align 16 +bn_mul_mont_nohw: +.cfi_startproc + _CET_ENDBR + mov ${num}d,${num}d + mov %rsp,%rax +.cfi_def_cfa_register %rax + push %rbx +.cfi_push %rbx + push %rbp +.cfi_push %rbp + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + + neg $num + mov %rsp,%r11 + lea -16(%rsp,$num,8),%r10 # future alloca(8*(num+2)) + neg $num # restore $num + and \$-1024,%r10 # minimize TLB usage + + # An OS-agnostic version of __chkstk. + # + # Some OSes (Windows) insist on stack being "wired" to + # physical memory in strictly sequential manner, i.e. if stack + # allocation spans two pages, then reference to farmost one can + # be punishable by SEGV. But page walking can do good even on + # other OSes, because it guarantees that villain thread hits + # the guard page before it can make damage to innocent one... 
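
The comment above (the 32-bit file carries the same one) explains the page-walk loops that follow: before the stack pointer jumps down by more than one page, every intervening 4 KiB page is touched from the top down, so an OS guard page is always the next page to fault in. The access pattern, shown here against an ordinary buffer purely for illustration:

#include <stddef.h>

/* Touch one byte in every 4 KiB page of buf[0..len), highest page first,
 * mimicking the .Lmul_page_walk loops. */
static void page_walk(volatile unsigned char *buf, size_t len) {
    if (len == 0)
        return;
    size_t off = (len - 1) & ~(size_t)4095;   /* start of the last page */
    for (;;) {
        (void)buf[off];                       /* read and discard */
        if (off == 0)
            break;
        off -= 4096;
    }
}
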
+ sub %r10,%r11 + and \$-4096,%r11 + lea (%r10,%r11),%rsp + mov (%rsp),%r11 + cmp %r10,%rsp + ja .Lmul_page_walk + jmp .Lmul_page_walk_done + +.align 16 +.Lmul_page_walk: + lea -4096(%rsp),%rsp + mov (%rsp),%r11 + cmp %r10,%rsp + ja .Lmul_page_walk +.Lmul_page_walk_done: + + mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp +.cfi_cfa_expression %rsp+8,$num,8,mul,plus,deref,+8 +.Lmul_body: + mov $bp,%r12 # reassign $bp +___ + $bp="%r12"; +$code.=<<___; + mov ($n0),$n0 # pull n0[0] value + mov ($bp),$m0 # m0=bp[0] + mov ($ap),%rax + + xor $i,$i # i=0 + xor $j,$j # j=0 + + mov $n0,$m1 + mulq $m0 # ap[0]*bp[0] + mov %rax,$lo0 + mov ($np),%rax + + imulq $lo0,$m1 # "tp[0]"*n0 + mov %rdx,$hi0 + + mulq $m1 # np[0]*m1 + add %rax,$lo0 # discarded + mov 8($ap),%rax + adc \$0,%rdx + mov %rdx,$hi1 + + lea 1($j),$j # j++ + jmp .L1st_enter + +.align 16 +.L1st: + add %rax,$hi1 + mov ($ap,$j,8),%rax + adc \$0,%rdx + add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0] + mov $lo0,$hi0 + adc \$0,%rdx + mov $hi1,-16(%rsp,$j,8) # tp[j-1] + mov %rdx,$hi1 + +.L1st_enter: + mulq $m0 # ap[j]*bp[0] + add %rax,$hi0 + mov ($np,$j,8),%rax + adc \$0,%rdx + lea 1($j),$j # j++ + mov %rdx,$lo0 + + mulq $m1 # np[j]*m1 + cmp $num,$j + jne .L1st + + add %rax,$hi1 + mov ($ap),%rax # ap[0] + adc \$0,%rdx + add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0] + adc \$0,%rdx + mov $hi1,-16(%rsp,$j,8) # tp[j-1] + mov %rdx,$hi1 + mov $lo0,$hi0 + + xor %rdx,%rdx + add $hi0,$hi1 + adc \$0,%rdx + mov $hi1,-8(%rsp,$num,8) + mov %rdx,(%rsp,$num,8) # store upmost overflow bit + + lea 1($i),$i # i++ + jmp .Louter +.align 16 +.Louter: + mov ($bp,$i,8),$m0 # m0=bp[i] + xor $j,$j # j=0 + mov $n0,$m1 + mov (%rsp),$lo0 + mulq $m0 # ap[0]*bp[i] + add %rax,$lo0 # ap[0]*bp[i]+tp[0] + mov ($np),%rax + adc \$0,%rdx + + imulq $lo0,$m1 # tp[0]*n0 + mov %rdx,$hi0 + + mulq $m1 # np[0]*m1 + add %rax,$lo0 # discarded + mov 8($ap),%rax + adc \$0,%rdx + mov 8(%rsp),$lo0 # tp[1] + mov %rdx,$hi1 + + lea 1($j),$j # j++ + jmp .Linner_enter + +.align 16 +.Linner: + add %rax,$hi1 + mov ($ap,$j,8),%rax + adc \$0,%rdx + add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j] + mov (%rsp,$j,8),$lo0 + adc \$0,%rdx + mov $hi1,-16(%rsp,$j,8) # tp[j-1] + mov %rdx,$hi1 + +.Linner_enter: + mulq $m0 # ap[j]*bp[i] + add %rax,$hi0 + mov ($np,$j,8),%rax + adc \$0,%rdx + add $hi0,$lo0 # ap[j]*bp[i]+tp[j] + mov %rdx,$hi0 + adc \$0,$hi0 + lea 1($j),$j # j++ + + mulq $m1 # np[j]*m1 + cmp $num,$j + jne .Linner + + add %rax,$hi1 + mov ($ap),%rax # ap[0] + adc \$0,%rdx + add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j] + mov (%rsp,$j,8),$lo0 + adc \$0,%rdx + mov $hi1,-16(%rsp,$j,8) # tp[j-1] + mov %rdx,$hi1 + + xor %rdx,%rdx + add $hi0,$hi1 + adc \$0,%rdx + add $lo0,$hi1 # pull upmost overflow bit + adc \$0,%rdx + mov $hi1,-8(%rsp,$num,8) + mov %rdx,(%rsp,$num,8) # store upmost overflow bit + + lea 1($i),$i # i++ + cmp $num,$i + jb .Louter + + xor $i,$i # i=0 and clear CF! + mov (%rsp),%rax # tp[0] + mov $num,$j # j=num + +.align 16 +.Lsub: sbb ($np,$i,8),%rax + mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[i] + mov 8(%rsp,$i,8),%rax # tp[i+1] + lea 1($i),$i # i++ + dec $j # doesn't affect CF! 
+ jnz .Lsub + + sbb \$0,%rax # handle upmost overflow bit + mov \$-1,%rbx + xor %rax,%rbx # not %rax + xor $i,$i + mov $num,$j # j=num + +.Lcopy: # conditional copy + mov ($rp,$i,8),%rcx + mov (%rsp,$i,8),%rdx + and %rbx,%rcx + and %rax,%rdx + mov $num,(%rsp,$i,8) # zap temporary vector + or %rcx,%rdx + mov %rdx,($rp,$i,8) # rp[i]=tp[i] + lea 1($i),$i + sub \$1,$j + jnz .Lcopy + + mov 8(%rsp,$num,8),%rsi # restore %rsp +.cfi_def_cfa %rsi,8 + mov \$1,%rax + mov -48(%rsi),%r15 +.cfi_restore %r15 + mov -40(%rsi),%r14 +.cfi_restore %r14 + mov -32(%rsi),%r13 +.cfi_restore %r13 + mov -24(%rsi),%r12 +.cfi_restore %r12 + mov -16(%rsi),%rbp +.cfi_restore %rbp + mov -8(%rsi),%rbx +.cfi_restore %rbx + lea (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lmul_epilogue: + ret +.cfi_endproc +.size bn_mul_mont_nohw,.-bn_mul_mont_nohw +___ +{{{ +my @A=("%r10","%r11"); +my @N=("%r13","%rdi"); +$code.=<<___; +.globl bn_mul4x_mont +.type bn_mul4x_mont,\@function,6 +.align 16 +bn_mul4x_mont: +.cfi_startproc + _CET_ENDBR + mov ${num}d,${num}d + mov %rsp,%rax +.cfi_def_cfa_register %rax + push %rbx +.cfi_push %rbx + push %rbp +.cfi_push %rbp + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + + neg $num + mov %rsp,%r11 + lea -32(%rsp,$num,8),%r10 # future alloca(8*(num+4)) + neg $num # restore + and \$-1024,%r10 # minimize TLB usage + + sub %r10,%r11 + and \$-4096,%r11 + lea (%r10,%r11),%rsp + mov (%rsp),%r11 + cmp %r10,%rsp + ja .Lmul4x_page_walk + jmp .Lmul4x_page_walk_done + +.Lmul4x_page_walk: + lea -4096(%rsp),%rsp + mov (%rsp),%r11 + cmp %r10,%rsp + ja .Lmul4x_page_walk +.Lmul4x_page_walk_done: + + mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp +.cfi_cfa_expression %rsp+8,$num,8,mul,plus,deref,+8 +.Lmul4x_body: + mov $rp,16(%rsp,$num,8) # tp[num+2]=$rp + mov %rdx,%r12 # reassign $bp +___ + $bp="%r12"; +$code.=<<___; + mov ($n0),$n0 # pull n0[0] value + mov ($bp),$m0 # m0=bp[0] + mov ($ap),%rax + + xor $i,$i # i=0 + xor $j,$j # j=0 + + mov $n0,$m1 + mulq $m0 # ap[0]*bp[0] + mov %rax,$A[0] + mov ($np),%rax + + imulq $A[0],$m1 # "tp[0]"*n0 + mov %rdx,$A[1] + + mulq $m1 # np[0]*m1 + add %rax,$A[0] # discarded + mov 8($ap),%rax + adc \$0,%rdx + mov %rdx,$N[1] + + mulq $m0 + add %rax,$A[1] + mov 8($np),%rax + adc \$0,%rdx + mov %rdx,$A[0] + + mulq $m1 + add %rax,$N[1] + mov 16($ap),%rax + adc \$0,%rdx + add $A[1],$N[1] + lea 4($j),$j # j++ + adc \$0,%rdx + mov $N[1],(%rsp) + mov %rdx,$N[0] + jmp .L1st4x +.align 16 +.L1st4x: + mulq $m0 # ap[j]*bp[0] + add %rax,$A[0] + mov -16($np,$j,8),%rax + adc \$0,%rdx + mov %rdx,$A[1] + + mulq $m1 # np[j]*m1 + add %rax,$N[0] + mov -8($ap,$j,8),%rax + adc \$0,%rdx + add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] + adc \$0,%rdx + mov $N[0],-24(%rsp,$j,8) # tp[j-1] + mov %rdx,$N[1] + + mulq $m0 # ap[j]*bp[0] + add %rax,$A[1] + mov -8($np,$j,8),%rax + adc \$0,%rdx + mov %rdx,$A[0] + + mulq $m1 # np[j]*m1 + add %rax,$N[1] + mov ($ap,$j,8),%rax + adc \$0,%rdx + add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] + adc \$0,%rdx + mov $N[1],-16(%rsp,$j,8) # tp[j-1] + mov %rdx,$N[0] + + mulq $m0 # ap[j]*bp[0] + add %rax,$A[0] + mov ($np,$j,8),%rax + adc \$0,%rdx + mov %rdx,$A[1] + + mulq $m1 # np[j]*m1 + add %rax,$N[0] + mov 8($ap,$j,8),%rax + adc \$0,%rdx + add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] + adc \$0,%rdx + mov $N[0],-8(%rsp,$j,8) # tp[j-1] + mov %rdx,$N[1] + + mulq $m0 # ap[j]*bp[0] + add %rax,$A[1] + mov 8($np,$j,8),%rax + adc \$0,%rdx + lea 4($j),$j # j++ + mov %rdx,$A[0] + + mulq $m1 # np[j]*m1 + add %rax,$N[1] + mov -16($ap,$j,8),%rax + adc 
\$0,%rdx + add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] + adc \$0,%rdx + mov $N[1],-32(%rsp,$j,8) # tp[j-1] + mov %rdx,$N[0] + cmp $num,$j + jb .L1st4x + + mulq $m0 # ap[j]*bp[0] + add %rax,$A[0] + mov -16($np,$j,8),%rax + adc \$0,%rdx + mov %rdx,$A[1] + + mulq $m1 # np[j]*m1 + add %rax,$N[0] + mov -8($ap,$j,8),%rax + adc \$0,%rdx + add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] + adc \$0,%rdx + mov $N[0],-24(%rsp,$j,8) # tp[j-1] + mov %rdx,$N[1] + + mulq $m0 # ap[j]*bp[0] + add %rax,$A[1] + mov -8($np,$j,8),%rax + adc \$0,%rdx + mov %rdx,$A[0] + + mulq $m1 # np[j]*m1 + add %rax,$N[1] + mov ($ap),%rax # ap[0] + adc \$0,%rdx + add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] + adc \$0,%rdx + mov $N[1],-16(%rsp,$j,8) # tp[j-1] + mov %rdx,$N[0] + + xor $N[1],$N[1] + add $A[0],$N[0] + adc \$0,$N[1] + mov $N[0],-8(%rsp,$j,8) + mov $N[1],(%rsp,$j,8) # store upmost overflow bit + + lea 1($i),$i # i++ +.align 4 +.Louter4x: + mov ($bp,$i,8),$m0 # m0=bp[i] + xor $j,$j # j=0 + mov (%rsp),$A[0] + mov $n0,$m1 + mulq $m0 # ap[0]*bp[i] + add %rax,$A[0] # ap[0]*bp[i]+tp[0] + mov ($np),%rax + adc \$0,%rdx + + imulq $A[0],$m1 # tp[0]*n0 + mov %rdx,$A[1] + + mulq $m1 # np[0]*m1 + add %rax,$A[0] # "$N[0]", discarded + mov 8($ap),%rax + adc \$0,%rdx + mov %rdx,$N[1] + + mulq $m0 # ap[j]*bp[i] + add %rax,$A[1] + mov 8($np),%rax + adc \$0,%rdx + add 8(%rsp),$A[1] # +tp[1] + adc \$0,%rdx + mov %rdx,$A[0] + + mulq $m1 # np[j]*m1 + add %rax,$N[1] + mov 16($ap),%rax + adc \$0,%rdx + add $A[1],$N[1] # np[j]*m1+ap[j]*bp[i]+tp[j] + lea 4($j),$j # j+=2 + adc \$0,%rdx + mov $N[1],(%rsp) # tp[j-1] + mov %rdx,$N[0] + jmp .Linner4x +.align 16 +.Linner4x: + mulq $m0 # ap[j]*bp[i] + add %rax,$A[0] + mov -16($np,$j,8),%rax + adc \$0,%rdx + add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j] + adc \$0,%rdx + mov %rdx,$A[1] + + mulq $m1 # np[j]*m1 + add %rax,$N[0] + mov -8($ap,$j,8),%rax + adc \$0,%rdx + add $A[0],$N[0] + adc \$0,%rdx + mov $N[0],-24(%rsp,$j,8) # tp[j-1] + mov %rdx,$N[1] + + mulq $m0 # ap[j]*bp[i] + add %rax,$A[1] + mov -8($np,$j,8),%rax + adc \$0,%rdx + add -8(%rsp,$j,8),$A[1] + adc \$0,%rdx + mov %rdx,$A[0] + + mulq $m1 # np[j]*m1 + add %rax,$N[1] + mov ($ap,$j,8),%rax + adc \$0,%rdx + add $A[1],$N[1] + adc \$0,%rdx + mov $N[1],-16(%rsp,$j,8) # tp[j-1] + mov %rdx,$N[0] + + mulq $m0 # ap[j]*bp[i] + add %rax,$A[0] + mov ($np,$j,8),%rax + adc \$0,%rdx + add (%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j] + adc \$0,%rdx + mov %rdx,$A[1] + + mulq $m1 # np[j]*m1 + add %rax,$N[0] + mov 8($ap,$j,8),%rax + adc \$0,%rdx + add $A[0],$N[0] + adc \$0,%rdx + mov $N[0],-8(%rsp,$j,8) # tp[j-1] + mov %rdx,$N[1] + + mulq $m0 # ap[j]*bp[i] + add %rax,$A[1] + mov 8($np,$j,8),%rax + adc \$0,%rdx + add 8(%rsp,$j,8),$A[1] + adc \$0,%rdx + lea 4($j),$j # j++ + mov %rdx,$A[0] + + mulq $m1 # np[j]*m1 + add %rax,$N[1] + mov -16($ap,$j,8),%rax + adc \$0,%rdx + add $A[1],$N[1] + adc \$0,%rdx + mov $N[1],-32(%rsp,$j,8) # tp[j-1] + mov %rdx,$N[0] + cmp $num,$j + jb .Linner4x + + mulq $m0 # ap[j]*bp[i] + add %rax,$A[0] + mov -16($np,$j,8),%rax + adc \$0,%rdx + add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j] + adc \$0,%rdx + mov %rdx,$A[1] + + mulq $m1 # np[j]*m1 + add %rax,$N[0] + mov -8($ap,$j,8),%rax + adc \$0,%rdx + add $A[0],$N[0] + adc \$0,%rdx + mov $N[0],-24(%rsp,$j,8) # tp[j-1] + mov %rdx,$N[1] + + mulq $m0 # ap[j]*bp[i] + add %rax,$A[1] + mov -8($np,$j,8),%rax + adc \$0,%rdx + add -8(%rsp,$j,8),$A[1] + adc \$0,%rdx + lea 1($i),$i # i++ + mov %rdx,$A[0] + + mulq $m1 # np[j]*m1 + add %rax,$N[1] + mov ($ap),%rax # ap[0] + adc \$0,%rdx + add $A[1],$N[1] + adc \$0,%rdx + mov 
$N[1],-16(%rsp,$j,8) # tp[j-1] + mov %rdx,$N[0] + + xor $N[1],$N[1] + add $A[0],$N[0] + adc \$0,$N[1] + add (%rsp,$num,8),$N[0] # pull upmost overflow bit + adc \$0,$N[1] + mov $N[0],-8(%rsp,$j,8) + mov $N[1],(%rsp,$j,8) # store upmost overflow bit + + cmp $num,$i + jb .Louter4x +___ +{ +my @ri=("%rax","%rdx",$m0,$m1); +$code.=<<___; + mov 16(%rsp,$num,8),$rp # restore $rp + lea -4($num),$j + mov 0(%rsp),@ri[0] # tp[0] + mov 8(%rsp),@ri[1] # tp[1] + shr \$2,$j # j=num/4-1 + lea (%rsp),$ap # borrow ap for tp + xor $i,$i # i=0 and clear CF! + + sub 0($np),@ri[0] + mov 16($ap),@ri[2] # tp[2] + mov 24($ap),@ri[3] # tp[3] + sbb 8($np),@ri[1] + +.Lsub4x: + mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i] + mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i] + sbb 16($np,$i,8),@ri[2] + mov 32($ap,$i,8),@ri[0] # tp[i+1] + mov 40($ap,$i,8),@ri[1] + sbb 24($np,$i,8),@ri[3] + mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i] + mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i] + sbb 32($np,$i,8),@ri[0] + mov 48($ap,$i,8),@ri[2] + mov 56($ap,$i,8),@ri[3] + sbb 40($np,$i,8),@ri[1] + lea 4($i),$i # i++ + dec $j # doesn't affect CF! + jnz .Lsub4x + + mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i] + mov 32($ap,$i,8),@ri[0] # load overflow bit + sbb 16($np,$i,8),@ri[2] + mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i] + sbb 24($np,$i,8),@ri[3] + mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i] + + sbb \$0,@ri[0] # handle upmost overflow bit + mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i] + pxor %xmm0,%xmm0 + movq @ri[0],%xmm4 + pcmpeqd %xmm5,%xmm5 + pshufd \$0,%xmm4,%xmm4 + mov $num,$j + pxor %xmm4,%xmm5 + shr \$2,$j # j=num/4 + xor %eax,%eax # i=0 + + jmp .Lcopy4x +.align 16 +.Lcopy4x: # conditional copy + movdqa (%rsp,%rax),%xmm1 + movdqu ($rp,%rax),%xmm2 + pand %xmm4,%xmm1 + pand %xmm5,%xmm2 + movdqa 16(%rsp,%rax),%xmm3 + movdqa %xmm0,(%rsp,%rax) + por %xmm2,%xmm1 + movdqu 16($rp,%rax),%xmm2 + movdqu %xmm1,($rp,%rax) + pand %xmm4,%xmm3 + pand %xmm5,%xmm2 + movdqa %xmm0,16(%rsp,%rax) + por %xmm2,%xmm3 + movdqu %xmm3,16($rp,%rax) + lea 32(%rax),%rax + dec $j + jnz .Lcopy4x +___ +} +$code.=<<___; + mov 8(%rsp,$num,8),%rsi # restore %rsp +.cfi_def_cfa %rsi, 8 + mov \$1,%rax + mov -48(%rsi),%r15 +.cfi_restore %r15 + mov -40(%rsi),%r14 +.cfi_restore %r14 + mov -32(%rsi),%r13 +.cfi_restore %r13 + mov -24(%rsi),%r12 +.cfi_restore %r12 + mov -16(%rsi),%rbp +.cfi_restore %rbp + mov -8(%rsi),%rbx +.cfi_restore %rbx + lea (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lmul4x_epilogue: + ret +.cfi_endproc +.size bn_mul4x_mont,.-bn_mul4x_mont +___ +}}} + {{{ +###################################################################### +# void bn_sqr8x_mont( +my $rptr="%rdi"; # const BN_ULONG *rptr, +my $aptr="%rsi"; # const BN_ULONG *aptr, +my $mulx_adx_capable="%rdx"; # Different than upstream! 
+my $nptr="%rcx"; # const BN_ULONG *nptr, +my $n0 ="%r8"; # const BN_ULONG *n0); +my $num ="%r9"; # int num, has to be divisible by 8 + +my ($i,$j,$tptr)=("%rbp","%rcx",$rptr); +my @A0=("%r10","%r11"); +my @A1=("%r12","%r13"); +my ($a0,$a1,$ai)=("%r14","%r15","%rbx"); + +$code.=<<___ if ($addx); +.extern bn_sqrx8x_internal # see x86_64-mont5 module +___ +$code.=<<___; +.extern bn_sqr8x_internal # see x86_64-mont5 module + +.globl bn_sqr8x_mont +.type bn_sqr8x_mont,\@function,6 +.align 32 +bn_sqr8x_mont: +.cfi_startproc + _CET_ENDBR + mov ${num}d,${num}d + mov %rsp,%rax +.cfi_def_cfa_register %rax + push %rbx +.cfi_push %rbx + push %rbp +.cfi_push %rbp + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 +.Lsqr8x_prologue: + + mov ${num}d,%r10d + shl \$3,${num}d # convert $num to bytes + shl \$3+2,%r10 # 4*$num + neg $num + + ############################################################## + # ensure that stack frame doesn't alias with $aptr modulo + # 4096. this is done to allow memory disambiguation logic + # do its job. + # + lea -64(%rsp,$num,2),%r11 + mov %rsp,%rbp + mov ($n0),$n0 # *n0 + sub $aptr,%r11 + and \$4095,%r11 + cmp %r11,%r10 + jb .Lsqr8x_sp_alt + sub %r11,%rbp # align with $aptr + lea -64(%rbp,$num,2),%rbp # future alloca(frame+2*$num) + jmp .Lsqr8x_sp_done + +.align 32 +.Lsqr8x_sp_alt: + lea 4096-64(,$num,2),%r10 # 4096-frame-2*$num + lea -64(%rbp,$num,2),%rbp # future alloca(frame+2*$num) + sub %r10,%r11 + mov \$0,%r10 + cmovc %r10,%r11 + sub %r11,%rbp +.Lsqr8x_sp_done: + and \$-64,%rbp + mov %rsp,%r11 + sub %rbp,%r11 + and \$-4096,%r11 + lea (%rbp,%r11),%rsp + mov (%rsp),%r10 + cmp %rbp,%rsp + ja .Lsqr8x_page_walk + jmp .Lsqr8x_page_walk_done + +.align 16 +.Lsqr8x_page_walk: + lea -4096(%rsp),%rsp + mov (%rsp),%r10 + cmp %rbp,%rsp + ja .Lsqr8x_page_walk +.Lsqr8x_page_walk_done: + + mov $num,%r10 + neg $num + + mov $n0, 32(%rsp) + mov %rax, 40(%rsp) # save original %rsp +.cfi_cfa_expression %rsp+40,deref,+8 +.Lsqr8x_body: + + movq $nptr, %xmm2 # save pointer to modulus + pxor %xmm0,%xmm0 + movq $rptr,%xmm1 # save $rptr + movq %r10, %xmm3 # -$num +___ +$code.=<<___ if ($addx); + test $mulx_adx_capable,$mulx_adx_capable + jz .Lsqr8x_nox + + call bn_sqrx8x_internal # see x86_64-mont5 module + # %rax top-most carry + # %rbp nptr + # %rcx -8*num + # %r8 end of tp[2*num] + lea (%r8,%rcx),%rbx + mov %rcx,$num + mov %rcx,%rdx + movq %xmm1,$rptr + sar \$3+2,%rcx # %cf=0 + jmp .Lsqr8x_sub + +.align 32 +.Lsqr8x_nox: +___ +$code.=<<___; + call bn_sqr8x_internal # see x86_64-mont5 module + # %rax top-most carry + # %rbp nptr + # %r8 -8*num + # %rdi end of tp[2*num] + lea (%rdi,$num),%rbx + mov $num,%rcx + mov $num,%rdx + movq %xmm1,$rptr + sar \$3+2,%rcx # %cf=0 + jmp .Lsqr8x_sub + +.align 32 +.Lsqr8x_sub: + mov 8*0(%rbx),%r12 + mov 8*1(%rbx),%r13 + mov 8*2(%rbx),%r14 + mov 8*3(%rbx),%r15 + lea 8*4(%rbx),%rbx + sbb 8*0(%rbp),%r12 + sbb 8*1(%rbp),%r13 + sbb 8*2(%rbp),%r14 + sbb 8*3(%rbp),%r15 + lea 8*4(%rbp),%rbp + mov %r12,8*0($rptr) + mov %r13,8*1($rptr) + mov %r14,8*2($rptr) + mov %r15,8*3($rptr) + lea 8*4($rptr),$rptr + inc %rcx # preserves %cf + jnz .Lsqr8x_sub + + sbb \$0,%rax # top-most carry + lea (%rbx,$num),%rbx # rewind + lea ($rptr,$num),$rptr # rewind + + movq %rax,%xmm1 + pxor %xmm0,%xmm0 + pshufd \$0,%xmm1,%xmm1 + mov 40(%rsp),%rsi # restore %rsp +.cfi_def_cfa %rsi,8 + jmp .Lsqr8x_cond_copy + +.align 32 +.Lsqr8x_cond_copy: + movdqa 16*0(%rbx),%xmm2 + movdqa 16*1(%rbx),%xmm3 + lea 16*2(%rbx),%rbx + movdqu 
16*0($rptr),%xmm4 + movdqu 16*1($rptr),%xmm5 + lea 16*2($rptr),$rptr + movdqa %xmm0,-16*2(%rbx) # zero tp + movdqa %xmm0,-16*1(%rbx) + movdqa %xmm0,-16*2(%rbx,%rdx) + movdqa %xmm0,-16*1(%rbx,%rdx) + pcmpeqd %xmm1,%xmm0 + pand %xmm1,%xmm2 + pand %xmm1,%xmm3 + pand %xmm0,%xmm4 + pand %xmm0,%xmm5 + pxor %xmm0,%xmm0 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqu %xmm4,-16*2($rptr) + movdqu %xmm5,-16*1($rptr) + add \$32,$num + jnz .Lsqr8x_cond_copy + + mov \$1,%rax + mov -48(%rsi),%r15 +.cfi_restore %r15 + mov -40(%rsi),%r14 +.cfi_restore %r14 + mov -32(%rsi),%r13 +.cfi_restore %r13 + mov -24(%rsi),%r12 +.cfi_restore %r12 + mov -16(%rsi),%rbp +.cfi_restore %rbp + mov -8(%rsi),%rbx +.cfi_restore %rbx + lea (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lsqr8x_epilogue: + ret +.cfi_endproc +.size bn_sqr8x_mont,.-bn_sqr8x_mont +___ +}}} + +if ($addx) {{{ +my $bp="%rdx"; # original value + +$code.=<<___; +.globl bn_mulx4x_mont +.type bn_mulx4x_mont,\@function,6 +.align 32 +bn_mulx4x_mont: +.cfi_startproc + _CET_ENDBR + mov %rsp,%rax +.cfi_def_cfa_register %rax + push %rbx +.cfi_push %rbx + push %rbp +.cfi_push %rbp + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 +.Lmulx4x_prologue: + + shl \$3,${num}d # convert $num to bytes + xor %r10,%r10 + sub $num,%r10 # -$num + mov ($n0),$n0 # *n0 + lea -72(%rsp,%r10),%rbp # future alloca(frame+$num+8) + and \$-128,%rbp + mov %rsp,%r11 + sub %rbp,%r11 + and \$-4096,%r11 + lea (%rbp,%r11),%rsp + mov (%rsp),%r10 + cmp %rbp,%rsp + ja .Lmulx4x_page_walk + jmp .Lmulx4x_page_walk_done + +.align 16 +.Lmulx4x_page_walk: + lea -4096(%rsp),%rsp + mov (%rsp),%r10 + cmp %rbp,%rsp + ja .Lmulx4x_page_walk +.Lmulx4x_page_walk_done: + + lea ($bp,$num),%r10 + ############################################################## + # Stack layout + # +0 num + # +8 off-loaded &b[i] + # +16 end of b[num] + # +24 saved n0 + # +32 saved rp + # +40 saved %rsp + # +48 inner counter + # +56 + # +64 tmp[num+1] + # + mov $num,0(%rsp) # save $num + shr \$5,$num + mov %r10,16(%rsp) # end of b[num] + sub \$1,$num + mov $n0, 24(%rsp) # save *n0 + mov $rp, 32(%rsp) # save $rp + mov %rax,40(%rsp) # save original %rsp +.cfi_cfa_expression %rsp+40,deref,+8 + mov $num,48(%rsp) # inner counter + jmp .Lmulx4x_body + +.align 32 +.Lmulx4x_body: +___ +my ($aptr, $bptr, $nptr, $tptr, $mi, $bi, $zero, $num)= + ("%rsi","%rdi","%rcx","%rbx","%r8","%r9","%rbp","%rax"); +my $rptr=$bptr; +$code.=<<___; + lea 8($bp),$bptr + mov ($bp),%rdx # b[0], $bp==%rdx actually + lea 64+32(%rsp),$tptr + mov %rdx,$bi + + mulx 0*8($aptr),$mi,%rax # a[0]*b[0] + mulx 1*8($aptr),%r11,%r14 # a[1]*b[0] + add %rax,%r11 + mov $bptr,8(%rsp) # off-load &b[i] + mulx 2*8($aptr),%r12,%r13 # ... 
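
This is the MULX/ADCX/ADOX path: MULX produces a 128-bit product without touching flags, ADCX propagates a carry only through CF and ADOX only through OF, so the multiply-accumulate chain and the offset high-half chain can run interleaved without serializing on a single carry flag. A hedged C sketch of the idea using the BMI2/ADX intrinsics (compile with -mbmi2 -madx; whether the compiler really emits adcx/adox is up to it, and the helper below is illustrative, not this routine's structure):

#include <immintrin.h>

/* acc[0..4] += a[0..3] * b, assuming acc[4] == 0 on entry so the result
 * still fits in five limbs.  One carry chain accumulates the low halves
 * of the products, the other the high halves, one limb up. */
static void mac_4x1(unsigned long long acc[5], const unsigned long long a[4],
                    unsigned long long b) {
    unsigned char cf = 0, of = 0;
    for (int i = 0; i < 4; i++) {
        unsigned long long hi;
        unsigned long long lo = _mulx_u64(a[i], b, &hi);
        cf = _addcarryx_u64(cf, acc[i], lo, &acc[i]);          /* CF chain */
        of = _addcarryx_u64(of, acc[i + 1], hi, &acc[i + 1]);  /* OF chain */
    }
    /* fold the leftover low-chain carry; nothing overflows acc[4] given
     * the entry assumption */
    (void)_addcarry_u64(cf, acc[4], 0, &acc[4]);
    (void)of;
}
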
+ adc %r14,%r12 + adc \$0,%r13 + + mov $mi,$bptr # borrow $bptr + imulq 24(%rsp),$mi # "t[0]"*n0 + xor $zero,$zero # cf=0, of=0 + + mulx 3*8($aptr),%rax,%r14 + mov $mi,%rdx + lea 4*8($aptr),$aptr + adcx %rax,%r13 + adcx $zero,%r14 # cf=0 + + mulx 0*8($nptr),%rax,%r10 + adcx %rax,$bptr # discarded + adox %r11,%r10 + mulx 1*8($nptr),%rax,%r11 + adcx %rax,%r10 + adox %r12,%r11 + .byte 0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00 # mulx 2*8($nptr),%rax,%r12 + mov 48(%rsp),$bptr # counter value + mov %r10,-4*8($tptr) + adcx %rax,%r11 + adox %r13,%r12 + mulx 3*8($nptr),%rax,%r15 + mov $bi,%rdx + mov %r11,-3*8($tptr) + adcx %rax,%r12 + adox $zero,%r15 # of=0 + lea 4*8($nptr),$nptr + mov %r12,-2*8($tptr) + + jmp .Lmulx4x_1st + +.align 32 +.Lmulx4x_1st: + adcx $zero,%r15 # cf=0, modulo-scheduled + mulx 0*8($aptr),%r10,%rax # a[4]*b[0] + adcx %r14,%r10 + mulx 1*8($aptr),%r11,%r14 # a[5]*b[0] + adcx %rax,%r11 + mulx 2*8($aptr),%r12,%rax # ... + adcx %r14,%r12 + mulx 3*8($aptr),%r13,%r14 + .byte 0x67,0x67 + mov $mi,%rdx + adcx %rax,%r13 + adcx $zero,%r14 # cf=0 + lea 4*8($aptr),$aptr + lea 4*8($tptr),$tptr + + adox %r15,%r10 + mulx 0*8($nptr),%rax,%r15 + adcx %rax,%r10 + adox %r15,%r11 + mulx 1*8($nptr),%rax,%r15 + adcx %rax,%r11 + adox %r15,%r12 + mulx 2*8($nptr),%rax,%r15 + mov %r10,-5*8($tptr) + adcx %rax,%r12 + mov %r11,-4*8($tptr) + adox %r15,%r13 + mulx 3*8($nptr),%rax,%r15 + mov $bi,%rdx + mov %r12,-3*8($tptr) + adcx %rax,%r13 + adox $zero,%r15 + lea 4*8($nptr),$nptr + mov %r13,-2*8($tptr) + + dec $bptr # of=0, pass cf + jnz .Lmulx4x_1st + + mov 0(%rsp),$num # load num + mov 8(%rsp),$bptr # re-load &b[i] + adc $zero,%r15 # modulo-scheduled + add %r15,%r14 + sbb %r15,%r15 # top-most carry + mov %r14,-1*8($tptr) + jmp .Lmulx4x_outer + +.align 32 +.Lmulx4x_outer: + mov ($bptr),%rdx # b[i] + lea 8($bptr),$bptr # b++ + sub $num,$aptr # rewind $aptr + mov %r15,($tptr) # save top-most carry + lea 64+4*8(%rsp),$tptr + sub $num,$nptr # rewind $nptr + + mulx 0*8($aptr),$mi,%r11 # a[0]*b[i] + xor %ebp,%ebp # xor $zero,$zero # cf=0, of=0 + mov %rdx,$bi + mulx 1*8($aptr),%r14,%r12 # a[1]*b[i] + adox -4*8($tptr),$mi + adcx %r14,%r11 + mulx 2*8($aptr),%r15,%r13 # ... + adox -3*8($tptr),%r11 + adcx %r15,%r12 + adox -2*8($tptr),%r12 + adcx $zero,%r13 + adox $zero,%r13 + + mov $bptr,8(%rsp) # off-load &b[i] + mov $mi,%r15 + imulq 24(%rsp),$mi # "t[0]"*n0 + xor %ebp,%ebp # xor $zero,$zero # cf=0, of=0 + + mulx 3*8($aptr),%rax,%r14 + mov $mi,%rdx + adcx %rax,%r13 + adox -1*8($tptr),%r13 + adcx $zero,%r14 + lea 4*8($aptr),$aptr + adox $zero,%r14 + + mulx 0*8($nptr),%rax,%r10 + adcx %rax,%r15 # discarded + adox %r11,%r10 + mulx 1*8($nptr),%rax,%r11 + adcx %rax,%r10 + adox %r12,%r11 + mulx 2*8($nptr),%rax,%r12 + mov %r10,-4*8($tptr) + adcx %rax,%r11 + adox %r13,%r12 + mulx 3*8($nptr),%rax,%r15 + mov $bi,%rdx + mov %r11,-3*8($tptr) + lea 4*8($nptr),$nptr + adcx %rax,%r12 + adox $zero,%r15 # of=0 + mov 48(%rsp),$bptr # counter value + mov %r12,-2*8($tptr) + + jmp .Lmulx4x_inner + +.align 32 +.Lmulx4x_inner: + mulx 0*8($aptr),%r10,%rax # a[4]*b[i] + adcx $zero,%r15 # cf=0, modulo-scheduled + adox %r14,%r10 + mulx 1*8($aptr),%r11,%r14 # a[5]*b[i] + adcx 0*8($tptr),%r10 + adox %rax,%r11 + mulx 2*8($aptr),%r12,%rax # ... 
+ adcx 1*8($tptr),%r11 + adox %r14,%r12 + mulx 3*8($aptr),%r13,%r14 + mov $mi,%rdx + adcx 2*8($tptr),%r12 + adox %rax,%r13 + adcx 3*8($tptr),%r13 + adox $zero,%r14 # of=0 + lea 4*8($aptr),$aptr + lea 4*8($tptr),$tptr + adcx $zero,%r14 # cf=0 + + adox %r15,%r10 + mulx 0*8($nptr),%rax,%r15 + adcx %rax,%r10 + adox %r15,%r11 + mulx 1*8($nptr),%rax,%r15 + adcx %rax,%r11 + adox %r15,%r12 + mulx 2*8($nptr),%rax,%r15 + mov %r10,-5*8($tptr) + adcx %rax,%r12 + adox %r15,%r13 + mulx 3*8($nptr),%rax,%r15 + mov $bi,%rdx + mov %r11,-4*8($tptr) + mov %r12,-3*8($tptr) + adcx %rax,%r13 + adox $zero,%r15 + lea 4*8($nptr),$nptr + mov %r13,-2*8($tptr) + + dec $bptr # of=0, pass cf + jnz .Lmulx4x_inner + + mov 0(%rsp),$num # load num + mov 8(%rsp),$bptr # re-load &b[i] + adc $zero,%r15 # modulo-scheduled + sub 0*8($tptr),$zero # pull top-most carry + adc %r15,%r14 + sbb %r15,%r15 # top-most carry + mov %r14,-1*8($tptr) + + cmp 16(%rsp),$bptr + jne .Lmulx4x_outer + + lea 64(%rsp),$tptr + sub $num,$nptr # rewind $nptr + neg %r15 + mov $num,%rdx + shr \$3+2,$num # %cf=0 + mov 32(%rsp),$rptr # restore rp + jmp .Lmulx4x_sub + +.align 32 +.Lmulx4x_sub: + mov 8*0($tptr),%r11 + mov 8*1($tptr),%r12 + mov 8*2($tptr),%r13 + mov 8*3($tptr),%r14 + lea 8*4($tptr),$tptr + sbb 8*0($nptr),%r11 + sbb 8*1($nptr),%r12 + sbb 8*2($nptr),%r13 + sbb 8*3($nptr),%r14 + lea 8*4($nptr),$nptr + mov %r11,8*0($rptr) + mov %r12,8*1($rptr) + mov %r13,8*2($rptr) + mov %r14,8*3($rptr) + lea 8*4($rptr),$rptr + dec $num # preserves %cf + jnz .Lmulx4x_sub + + sbb \$0,%r15 # top-most carry + lea 64(%rsp),$tptr + sub %rdx,$rptr # rewind + + movq %r15,%xmm1 + pxor %xmm0,%xmm0 + pshufd \$0,%xmm1,%xmm1 + mov 40(%rsp),%rsi # restore %rsp +.cfi_def_cfa %rsi,8 + jmp .Lmulx4x_cond_copy + +.align 32 +.Lmulx4x_cond_copy: + movdqa 16*0($tptr),%xmm2 + movdqa 16*1($tptr),%xmm3 + lea 16*2($tptr),$tptr + movdqu 16*0($rptr),%xmm4 + movdqu 16*1($rptr),%xmm5 + lea 16*2($rptr),$rptr + movdqa %xmm0,-16*2($tptr) # zero tp + movdqa %xmm0,-16*1($tptr) + pcmpeqd %xmm1,%xmm0 + pand %xmm1,%xmm2 + pand %xmm1,%xmm3 + pand %xmm0,%xmm4 + pand %xmm0,%xmm5 + pxor %xmm0,%xmm0 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqu %xmm4,-16*2($rptr) + movdqu %xmm5,-16*1($rptr) + sub \$32,%rdx + jnz .Lmulx4x_cond_copy + + mov %rdx,($tptr) + + mov \$1,%rax + mov -48(%rsi),%r15 +.cfi_restore %r15 + mov -40(%rsi),%r14 +.cfi_restore %r14 + mov -32(%rsi),%r13 +.cfi_restore %r13 + mov -24(%rsi),%r12 +.cfi_restore %r12 + mov -16(%rsi),%rbp +.cfi_restore %rbp + mov -8(%rsi),%rbx +.cfi_restore %rbx + lea (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lmulx4x_epilogue: + ret +.cfi_endproc +.size bn_mulx4x_mont,.-bn_mulx4x_mont +___ +}}} +$code.=<<___; +.asciz "Montgomery Multiplication for x86_64, CRYPTOGAMS by " +.align 16 +___ + +# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, +# CONTEXT *context,DISPATCHER_CONTEXT *disp) +if ($win64) { +$rec="%rcx"; +$frame="%rdx"; +$context="%r8"; +$disp="%r9"; + +$code.=<<___; +.extern __imp_RtlVirtualUnwind +.type mul_handler,\@abi-omnipotent +.align 16 +mul_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 120($context),%rax # pull context->Rax + mov 248($context),%rbx # pull context->Rip + + mov 8($disp),%rsi # disp->ImageBase + mov 56($disp),%r11 # disp->HandlerData + + mov 0(%r11),%r10d # HandlerData[0] + lea (%rsi,%r10),%r10 # end of prologue label + cmp %r10,%rbx # context->RipRsp + + mov 4(%r11),%r10d # HandlerData[1] + lea (%rsi,%r10),%r10 # epilogue 
label + cmp %r10,%rbx # context->Rip>=epilogue label + jae .Lcommon_seh_tail + + mov 192($context),%r10 # pull $num + mov 8(%rax,%r10,8),%rax # pull saved stack pointer + + jmp .Lcommon_pop_regs +.size mul_handler,.-mul_handler + +.type sqr_handler,\@abi-omnipotent +.align 16 +sqr_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 120($context),%rax # pull context->Rax + mov 248($context),%rbx # pull context->Rip + + mov 8($disp),%rsi # disp->ImageBase + mov 56($disp),%r11 # disp->HandlerData + + mov 0(%r11),%r10d # HandlerData[0] + lea (%rsi,%r10),%r10 # end of prologue label + cmp %r10,%rbx # context->Rip<.Lsqr_prologue + jb .Lcommon_seh_tail + + mov 4(%r11),%r10d # HandlerData[1] + lea (%rsi,%r10),%r10 # body label + cmp %r10,%rbx # context->Rip<.Lsqr_body + jb .Lcommon_pop_regs + + mov 152($context),%rax # pull context->Rsp + + mov 8(%r11),%r10d # HandlerData[2] + lea (%rsi,%r10),%r10 # epilogue label + cmp %r10,%rbx # context->Rip>=.Lsqr_epilogue + jae .Lcommon_seh_tail + + mov 40(%rax),%rax # pull saved stack pointer + +.Lcommon_pop_regs: + mov -8(%rax),%rbx + mov -16(%rax),%rbp + mov -24(%rax),%r12 + mov -32(%rax),%r13 + mov -40(%rax),%r14 + mov -48(%rax),%r15 + mov %rbx,144($context) # restore context->Rbx + mov %rbp,160($context) # restore context->Rbp + mov %r12,216($context) # restore context->R12 + mov %r13,224($context) # restore context->R13 + mov %r14,232($context) # restore context->R14 + mov %r15,240($context) # restore context->R15 + +.Lcommon_seh_tail: + mov 8(%rax),%rdi + mov 16(%rax),%rsi + mov %rax,152($context) # restore context->Rsp + mov %rsi,168($context) # restore context->Rsi + mov %rdi,176($context) # restore context->Rdi + + mov 40($disp),%rdi # disp->ContextRecord + mov $context,%rsi # context + mov \$154,%ecx # sizeof(CONTEXT) + .long 0xa548f3fc # cld; rep movsq + + mov $disp,%rsi + xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER + mov 8(%rsi),%rdx # arg2, disp->ImageBase + mov 0(%rsi),%r8 # arg3, disp->ControlPc + mov 16(%rsi),%r9 # arg4, disp->FunctionEntry + mov 40(%rsi),%r10 # disp->ContextRecord + lea 56(%rsi),%r11 # &disp->HandlerData + lea 24(%rsi),%r12 # &disp->EstablisherFrame + mov %r10,32(%rsp) # arg5 + mov %r11,40(%rsp) # arg6 + mov %r12,48(%rsp) # arg7 + mov %rcx,56(%rsp) # arg8, (NULL) + call *__imp_RtlVirtualUnwind(%rip) + + mov \$1,%eax # ExceptionContinueSearch + add \$64,%rsp + popfq + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + pop %rdi + pop %rsi + ret +.size sqr_handler,.-sqr_handler + +.section .pdata +.align 4 + .rva .LSEH_begin_bn_mul_mont_nohw + .rva .LSEH_end_bn_mul_mont_nohw + .rva .LSEH_info_bn_mul_mont_nohw + + .rva .LSEH_begin_bn_mul4x_mont + .rva .LSEH_end_bn_mul4x_mont + .rva .LSEH_info_bn_mul4x_mont + + .rva .LSEH_begin_bn_sqr8x_mont + .rva .LSEH_end_bn_sqr8x_mont + .rva .LSEH_info_bn_sqr8x_mont +___ +$code.=<<___ if ($addx); + .rva .LSEH_begin_bn_mulx4x_mont + .rva .LSEH_end_bn_mulx4x_mont + .rva .LSEH_info_bn_mulx4x_mont +___ +$code.=<<___; +.section .xdata +.align 8 +.LSEH_info_bn_mul_mont_nohw: + .byte 9,0,0,0 + .rva mul_handler + .rva .Lmul_body,.Lmul_epilogue # HandlerData[] +.LSEH_info_bn_mul4x_mont: + .byte 9,0,0,0 + .rva mul_handler + .rva .Lmul4x_body,.Lmul4x_epilogue # HandlerData[] +.LSEH_info_bn_sqr8x_mont: + .byte 9,0,0,0 + .rva sqr_handler + .rva .Lsqr8x_prologue,.Lsqr8x_body,.Lsqr8x_epilogue # HandlerData[] +.align 8 +___ +$code.=<<___ if ($addx); +.LSEH_info_bn_mulx4x_mont: + .byte 9,0,0,0 + .rva sqr_handler + 
.rva .Lmulx4x_prologue,.Lmulx4x_body,.Lmulx4x_epilogue # HandlerData[] +.align 8 +___ +} + +print $code; +close STDOUT or die "error closing STDOUT: $!"; diff --git a/ring-0.17.14/crypto/fipsmodule/bn/asm/x86_64-mont5.pl b/ring-0.17.14/crypto/fipsmodule/bn/asm/x86_64-mont5.pl new file mode 100644 index 0000000000..e7628712a7 --- /dev/null +++ b/ring-0.17.14/crypto/fipsmodule/bn/asm/x86_64-mont5.pl @@ -0,0 +1,3352 @@ +#! /usr/bin/env perl +# Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. +# ==================================================================== + +# August 2011. +# +# Companion to x86_64-mont.pl that optimizes cache-timing attack +# countermeasures. The subroutines are produced by replacing bp[i] +# references in their x86_64-mont.pl counterparts with cache-neutral +# references to powers table computed in BN_mod_exp_mont_consttime. +# In addition subroutine that scatters elements of the powers table +# is implemented, so that scatter-/gathering can be tuned without +# bn_exp.c modifications. + +# August 2013. +# +# Add MULX/AD*X code paths and additional interfaces to optimize for +# branch prediction unit. For input lengths that are multiples of 8 +# the np argument is not just modulus value, but one interleaved +# with 0. This is to optimize post-condition... + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; +*STDOUT=*OUT; + +# In upstream, this is controlled by shelling out to the compiler to check +# versions, but BoringSSL is intended to be used with pre-generated perlasm +# output, so this isn't useful anyway. +$addx = 1; + +# int bn_mul_mont_gather5_nohw( +$rp="%rdi"; # BN_ULONG *rp, +$ap="%rsi"; # const BN_ULONG *ap, +$bp="%rdx"; # const BN_ULONG *bp, +$np="%rcx"; # const BN_ULONG *np, +$n0="%r8"; # const BN_ULONG *n0, +$num="%r9"; # int num, + # int idx); # 0 to 2^5-1, "index" in $bp holding + # pre-computed powers of a', interlaced + # in such manner that b[0] is $bp[idx], + # b[1] is [2^5+idx], etc. 
+$lo0="%r10"; +$hi0="%r11"; +$hi1="%r13"; +$i="%r14"; +$j="%r15"; +$m0="%rbx"; +$m1="%rbp"; + +$code=<<___; +.text + +___ +{{{ +my @A=("%r10","%r11"); +my @N=("%r13","%rdi"); +$code.=<<___; +.globl bn_mul4x_mont_gather5 +.type bn_mul4x_mont_gather5,\@function,6 +.align 32 +bn_mul4x_mont_gather5: +.cfi_startproc + _CET_ENDBR + .byte 0x67 + mov %rsp,%rax +.cfi_def_cfa_register %rax + push %rbx +.cfi_push %rbx + push %rbp +.cfi_push %rbp + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 +.Lmul4x_prologue: + + .byte 0x67 + # num is declared as an int, a 32-bit parameter, so the upper half is + # undefined. It is important that this write to ${num}, which zeros the + # upper half, predates the first access. + shl \$3,${num}d # convert $num to bytes + lea ($num,$num,2),%r10 # 3*$num in bytes + neg $num # -$num + + ############################################################## + # Ensure that stack frame doesn't alias with $rptr+3*$num + # modulo 4096, which covers ret[num], am[num] and n[num] + # (see bn_exp.c). This is done to allow memory disambiguation + # logic do its magic. [Extra [num] is allocated in order + # to align with bn_power5's frame, which is cleansed after + # completing exponentiation. Extra 256 bytes is for power mask + # calculated from 7th argument, the index.] + # + lea -320(%rsp,$num,2),%r11 + mov %rsp,%rbp + sub $rp,%r11 + and \$4095,%r11 + cmp %r11,%r10 + jb .Lmul4xsp_alt + sub %r11,%rbp # align with $rp + lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256) + jmp .Lmul4xsp_done + +.align 32 +.Lmul4xsp_alt: + lea 4096-320(,$num,2),%r10 + lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256) + sub %r10,%r11 + mov \$0,%r10 + cmovc %r10,%r11 + sub %r11,%rbp +.Lmul4xsp_done: + and \$-64,%rbp + mov %rsp,%r11 + sub %rbp,%r11 + and \$-4096,%r11 + lea (%rbp,%r11),%rsp + mov (%rsp),%r10 + cmp %rbp,%rsp + ja .Lmul4x_page_walk + jmp .Lmul4x_page_walk_done + +.Lmul4x_page_walk: + lea -4096(%rsp),%rsp + mov (%rsp),%r10 + cmp %rbp,%rsp + ja .Lmul4x_page_walk +.Lmul4x_page_walk_done: + + neg $num + + mov %rax,40(%rsp) +.cfi_cfa_expression %rsp+40,deref,+8 +.Lmul4x_body: + + call mul4x_internal + + mov 40(%rsp),%rsi # restore %rsp +.cfi_def_cfa %rsi,8 + mov \$1,%rax + + mov -48(%rsi),%r15 +.cfi_restore %r15 + mov -40(%rsi),%r14 +.cfi_restore %r14 + mov -32(%rsi),%r13 +.cfi_restore %r13 + mov -24(%rsi),%r12 +.cfi_restore %r12 + mov -16(%rsi),%rbp +.cfi_restore %rbp + mov -8(%rsi),%rbx +.cfi_restore %rbx + lea (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lmul4x_epilogue: + ret +.cfi_endproc +.size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5 + +.type mul4x_internal,\@abi-omnipotent +.align 32 +mul4x_internal: +.cfi_startproc + shl \$5,$num # $num was in bytes + movd `($win64?56:8)`(%rax),%xmm5 # load 7th argument, index + lea .Linc(%rip),%rax + lea 128(%rdx,$num),%r13 # end of powers table (+size optimization) + shr \$5,$num # restore $num +___ + $bp="%r12"; + $STRIDE=2**5*8; # 5 is "window size" + $tp=$i; +$code.=<<___; + movdqa 0(%rax),%xmm0 # 00000001000000010000000000000000 + movdqa 16(%rax),%xmm1 # 00000002000000020000000200000002 + lea 88-112(%rsp,$num),%r10 # place the mask after tp[num+1] (+ICache optimization) + lea 128(%rdx),$bp # size optimization + + pshufd \$0,%xmm5,%xmm5 # broadcast index + movdqa %xmm1,%xmm4 + .byte 0x67,0x67 + movdqa %xmm1,%xmm2 +___ +######################################################################## +# Calculate masks by comparing 0..31 to $idx and save result to stack. 
+# +# We compute sixteen 16-byte masks and store them on the stack. Mask i is stored +# in `16*i - 128`(%rax) and contains the comparisons for idx == 2*i and +# idx == 2*i + 1 in its lower and upper halves, respectively. Mask calculations +# are scheduled in groups of four. +$code.=<<___; + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 # compare to 1,0 + .byte 0x67 + movdqa %xmm4,%xmm3 +___ +for($i=0;$i<$STRIDE/16-4;$i+=4) { +$code.=<<___; + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 # compare to 3,2 + movdqa %xmm0,`16*($i+0)+112`(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 # compare to 5,4 + movdqa %xmm1,`16*($i+1)+112`(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 # compare to 7,6 + movdqa %xmm2,`16*($i+2)+112`(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,`16*($i+3)+112`(%r10) + movdqa %xmm4,%xmm3 +___ +} +$code.=<<___; # last iteration can be optimized + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,`16*($i+0)+112`(%r10) + + paddd %xmm2,%xmm3 + .byte 0x67 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,`16*($i+1)+112`(%r10) + + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,`16*($i+2)+112`(%r10) + pand `16*($i+0)-128`($bp),%xmm0 # while it's still in register + + pand `16*($i+1)-128`($bp),%xmm1 + pand `16*($i+2)-128`($bp),%xmm2 + movdqa %xmm3,`16*($i+3)+112`(%r10) + pand `16*($i+3)-128`($bp),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 +___ +for($i=0;$i<$STRIDE/16-4;$i+=4) { +$code.=<<___; + movdqa `16*($i+0)-128`($bp),%xmm4 + movdqa `16*($i+1)-128`($bp),%xmm5 + movdqa `16*($i+2)-128`($bp),%xmm2 + pand `16*($i+0)+112`(%r10),%xmm4 + movdqa `16*($i+3)-128`($bp),%xmm3 + pand `16*($i+1)+112`(%r10),%xmm5 + por %xmm4,%xmm0 + pand `16*($i+2)+112`(%r10),%xmm2 + por %xmm5,%xmm1 + pand `16*($i+3)+112`(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 +___ +} +$code.=<<___; + por %xmm1,%xmm0 + # Combine the upper and lower halves of %xmm0. + pshufd \$0x4e,%xmm0,%xmm1 # Swap upper and lower halves. 
+ por %xmm1,%xmm0 + lea $STRIDE($bp),$bp + movq %xmm0,$m0 # m0=bp[0] + + mov %r13,16+8(%rsp) # save end of b[num] + mov $rp, 56+8(%rsp) # save $rp + + mov ($n0),$n0 # pull n0[0] value + mov ($ap),%rax + lea ($ap,$num),$ap # end of a[num] + neg $num + + mov $n0,$m1 + mulq $m0 # ap[0]*bp[0] + mov %rax,$A[0] + mov ($np),%rax + + imulq $A[0],$m1 # "tp[0]"*n0 + lea 64+8(%rsp),$tp + mov %rdx,$A[1] + + mulq $m1 # np[0]*m1 + add %rax,$A[0] # discarded + mov 8($ap,$num),%rax + adc \$0,%rdx + mov %rdx,$N[1] + + mulq $m0 + add %rax,$A[1] + mov 8*1($np),%rax + adc \$0,%rdx + mov %rdx,$A[0] + + mulq $m1 + add %rax,$N[1] + mov 16($ap,$num),%rax + adc \$0,%rdx + add $A[1],$N[1] + lea 4*8($num),$j # j=4 + lea 8*4($np),$np + adc \$0,%rdx + mov $N[1],($tp) + mov %rdx,$N[0] + jmp .L1st4x + +.align 32 +.L1st4x: + mulq $m0 # ap[j]*bp[0] + add %rax,$A[0] + mov -8*2($np),%rax + lea 32($tp),$tp + adc \$0,%rdx + mov %rdx,$A[1] + + mulq $m1 # np[j]*m1 + add %rax,$N[0] + mov -8($ap,$j),%rax + adc \$0,%rdx + add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] + adc \$0,%rdx + mov $N[0],-24($tp) # tp[j-1] + mov %rdx,$N[1] + + mulq $m0 # ap[j]*bp[0] + add %rax,$A[1] + mov -8*1($np),%rax + adc \$0,%rdx + mov %rdx,$A[0] + + mulq $m1 # np[j]*m1 + add %rax,$N[1] + mov ($ap,$j),%rax + adc \$0,%rdx + add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] + adc \$0,%rdx + mov $N[1],-16($tp) # tp[j-1] + mov %rdx,$N[0] + + mulq $m0 # ap[j]*bp[0] + add %rax,$A[0] + mov 8*0($np),%rax + adc \$0,%rdx + mov %rdx,$A[1] + + mulq $m1 # np[j]*m1 + add %rax,$N[0] + mov 8($ap,$j),%rax + adc \$0,%rdx + add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] + adc \$0,%rdx + mov $N[0],-8($tp) # tp[j-1] + mov %rdx,$N[1] + + mulq $m0 # ap[j]*bp[0] + add %rax,$A[1] + mov 8*1($np),%rax + adc \$0,%rdx + mov %rdx,$A[0] + + mulq $m1 # np[j]*m1 + add %rax,$N[1] + mov 16($ap,$j),%rax + adc \$0,%rdx + add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] + lea 8*4($np),$np + adc \$0,%rdx + mov $N[1],($tp) # tp[j-1] + mov %rdx,$N[0] + + add \$32,$j # j+=4 + jnz .L1st4x + + mulq $m0 # ap[j]*bp[0] + add %rax,$A[0] + mov -8*2($np),%rax + lea 32($tp),$tp + adc \$0,%rdx + mov %rdx,$A[1] + + mulq $m1 # np[j]*m1 + add %rax,$N[0] + mov -8($ap),%rax + adc \$0,%rdx + add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] + adc \$0,%rdx + mov $N[0],-24($tp) # tp[j-1] + mov %rdx,$N[1] + + mulq $m0 # ap[j]*bp[0] + add %rax,$A[1] + mov -8*1($np),%rax + adc \$0,%rdx + mov %rdx,$A[0] + + mulq $m1 # np[j]*m1 + add %rax,$N[1] + mov ($ap,$num),%rax # ap[0] + adc \$0,%rdx + add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] + adc \$0,%rdx + mov $N[1],-16($tp) # tp[j-1] + mov %rdx,$N[0] + + lea ($np,$num),$np # rewind $np + + xor $N[1],$N[1] + add $A[0],$N[0] + adc \$0,$N[1] + mov $N[0],-8($tp) + + jmp .Louter4x + +.align 32 +.Louter4x: + lea 16+128($tp),%rdx # where 256-byte mask is (+size optimization) + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 +___ +for($i=0;$i<$STRIDE/16;$i+=4) { +$code.=<<___; + movdqa `16*($i+0)-128`($bp),%xmm0 + movdqa `16*($i+1)-128`($bp),%xmm1 + movdqa `16*($i+2)-128`($bp),%xmm2 + movdqa `16*($i+3)-128`($bp),%xmm3 + pand `16*($i+0)-128`(%rdx),%xmm0 + pand `16*($i+1)-128`(%rdx),%xmm1 + por %xmm0,%xmm4 + pand `16*($i+2)-128`(%rdx),%xmm2 + por %xmm1,%xmm5 + pand `16*($i+3)-128`(%rdx),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 +___ +} +$code.=<<___; + por %xmm5,%xmm4 + # Combine the upper and lower halves of %xmm4 as %xmm0. + pshufd \$0x4e,%xmm4,%xmm0 # Swap upper and lower halves. 
+ por %xmm4,%xmm0 + lea $STRIDE($bp),$bp + movq %xmm0,$m0 # m0=bp[i] + + mov ($tp,$num),$A[0] + mov $n0,$m1 + mulq $m0 # ap[0]*bp[i] + add %rax,$A[0] # ap[0]*bp[i]+tp[0] + mov ($np),%rax + adc \$0,%rdx + + imulq $A[0],$m1 # tp[0]*n0 + mov %rdx,$A[1] + mov $N[1],($tp) # store upmost overflow bit + + lea ($tp,$num),$tp # rewind $tp + + mulq $m1 # np[0]*m1 + add %rax,$A[0] # "$N[0]", discarded + mov 8($ap,$num),%rax + adc \$0,%rdx + mov %rdx,$N[1] + + mulq $m0 # ap[j]*bp[i] + add %rax,$A[1] + mov 8*1($np),%rax + adc \$0,%rdx + add 8($tp),$A[1] # +tp[1] + adc \$0,%rdx + mov %rdx,$A[0] + + mulq $m1 # np[j]*m1 + add %rax,$N[1] + mov 16($ap,$num),%rax + adc \$0,%rdx + add $A[1],$N[1] # np[j]*m1+ap[j]*bp[i]+tp[j] + lea 4*8($num),$j # j=4 + lea 8*4($np),$np + adc \$0,%rdx + mov %rdx,$N[0] + jmp .Linner4x + +.align 32 +.Linner4x: + mulq $m0 # ap[j]*bp[i] + add %rax,$A[0] + mov -8*2($np),%rax + adc \$0,%rdx + add 16($tp),$A[0] # ap[j]*bp[i]+tp[j] + lea 32($tp),$tp + adc \$0,%rdx + mov %rdx,$A[1] + + mulq $m1 # np[j]*m1 + add %rax,$N[0] + mov -8($ap,$j),%rax + adc \$0,%rdx + add $A[0],$N[0] + adc \$0,%rdx + mov $N[1],-32($tp) # tp[j-1] + mov %rdx,$N[1] + + mulq $m0 # ap[j]*bp[i] + add %rax,$A[1] + mov -8*1($np),%rax + adc \$0,%rdx + add -8($tp),$A[1] + adc \$0,%rdx + mov %rdx,$A[0] + + mulq $m1 # np[j]*m1 + add %rax,$N[1] + mov ($ap,$j),%rax + adc \$0,%rdx + add $A[1],$N[1] + adc \$0,%rdx + mov $N[0],-24($tp) # tp[j-1] + mov %rdx,$N[0] + + mulq $m0 # ap[j]*bp[i] + add %rax,$A[0] + mov 8*0($np),%rax + adc \$0,%rdx + add ($tp),$A[0] # ap[j]*bp[i]+tp[j] + adc \$0,%rdx + mov %rdx,$A[1] + + mulq $m1 # np[j]*m1 + add %rax,$N[0] + mov 8($ap,$j),%rax + adc \$0,%rdx + add $A[0],$N[0] + adc \$0,%rdx + mov $N[1],-16($tp) # tp[j-1] + mov %rdx,$N[1] + + mulq $m0 # ap[j]*bp[i] + add %rax,$A[1] + mov 8*1($np),%rax + adc \$0,%rdx + add 8($tp),$A[1] + adc \$0,%rdx + mov %rdx,$A[0] + + mulq $m1 # np[j]*m1 + add %rax,$N[1] + mov 16($ap,$j),%rax + adc \$0,%rdx + add $A[1],$N[1] + lea 8*4($np),$np + adc \$0,%rdx + mov $N[0],-8($tp) # tp[j-1] + mov %rdx,$N[0] + + add \$32,$j # j+=4 + jnz .Linner4x + + mulq $m0 # ap[j]*bp[i] + add %rax,$A[0] + mov -8*2($np),%rax + adc \$0,%rdx + add 16($tp),$A[0] # ap[j]*bp[i]+tp[j] + lea 32($tp),$tp + adc \$0,%rdx + mov %rdx,$A[1] + + mulq $m1 # np[j]*m1 + add %rax,$N[0] + mov -8($ap),%rax + adc \$0,%rdx + add $A[0],$N[0] + adc \$0,%rdx + mov $N[1],-32($tp) # tp[j-1] + mov %rdx,$N[1] + + mulq $m0 # ap[j]*bp[i] + add %rax,$A[1] + mov $m1,%rax + mov -8*1($np),$m1 + adc \$0,%rdx + add -8($tp),$A[1] + adc \$0,%rdx + mov %rdx,$A[0] + + mulq $m1 # np[j]*m1 + add %rax,$N[1] + mov ($ap,$num),%rax # ap[0] + adc \$0,%rdx + add $A[1],$N[1] + adc \$0,%rdx + mov $N[0],-24($tp) # tp[j-1] + mov %rdx,$N[0] + + mov $N[1],-16($tp) # tp[j-1] + lea ($np,$num),$np # rewind $np + + xor $N[1],$N[1] + add $A[0],$N[0] + adc \$0,$N[1] + add ($tp),$N[0] # pull upmost overflow bit + adc \$0,$N[1] # upmost overflow bit + mov $N[0],-8($tp) + + cmp 16+8(%rsp),$bp + jb .Louter4x +___ +if (1) { +$code.=<<___; + xor %rax,%rax + sub $N[0],$m1 # compare top-most words + adc $j,$j # $j is zero + or $j,$N[1] + sub $N[1],%rax # %rax=-$N[1] + lea ($tp,$num),%rbx # tptr in .sqr4x_sub + mov ($np),%r12 + lea ($np),%rbp # nptr in .sqr4x_sub + mov %r9,%rcx + sar \$3+2,%rcx + mov 56+8(%rsp),%rdi # rptr in .sqr4x_sub + dec %r12 # so that after 'not' we get -n[0] + xor %r10,%r10 + mov 8*1(%rbp),%r13 + mov 8*2(%rbp),%r14 + mov 8*3(%rbp),%r15 + jmp .Lsqr4x_sub_entry +___ +} +$code.=<<___; +.cfi_endproc +.size 
mul4x_internal,.-mul4x_internal +___ +}}} + {{{ +###################################################################### +# void bn_power5_nohw( +my $rptr="%rdi"; # BN_ULONG *rptr, +my $aptr="%rsi"; # const BN_ULONG *aptr, +my $bptr="%rdx"; # const void *table, +my $nptr="%rcx"; # const BN_ULONG *nptr, +my $n0 ="%r8"; # const BN_ULONG *n0); +my $num ="%r9"; # int num, has to be divisible by 8 + # int pwr + +my ($i,$j,$tptr)=("%rbp","%rcx",$rptr); +my @A0=("%r10","%r11"); +my @A1=("%r12","%r13"); +my ($a0,$a1,$ai)=("%r14","%r15","%rbx"); + +$code.=<<___; +.globl bn_power5_nohw +.type bn_power5_nohw,\@function,6 +.align 32 +bn_power5_nohw: +.cfi_startproc + _CET_ENDBR + mov %rsp,%rax +.cfi_def_cfa_register %rax + push %rbx +.cfi_push %rbx + push %rbp +.cfi_push %rbp + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 +.Lpower5_prologue: + + # num is declared as an int, a 32-bit parameter, so the upper half is + # undefined. It is important that this write to ${num}, which zeros the + # upper half, come before the first access. + shl \$3,${num}d # convert $num to bytes + lea ($num,$num,2),%r10d # 3*$num + neg $num + mov ($n0),$n0 # *n0 + + ############################################################## + # Ensure that stack frame doesn't alias with $rptr+3*$num + # modulo 4096, which covers ret[num], am[num] and n[num] + # (see bn_exp.c). This is done to allow memory disambiguation + # logic do its magic. [Extra 256 bytes is for power mask + # calculated from 7th argument, the index.] + # + lea -320(%rsp,$num,2),%r11 + mov %rsp,%rbp + sub $rptr,%r11 + and \$4095,%r11 + cmp %r11,%r10 + jb .Lpwr_sp_alt + sub %r11,%rbp # align with $aptr + lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256) + jmp .Lpwr_sp_done + +.align 32 +.Lpwr_sp_alt: + lea 4096-320(,$num,2),%r10 + lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256) + sub %r10,%r11 + mov \$0,%r10 + cmovc %r10,%r11 + sub %r11,%rbp +.Lpwr_sp_done: + and \$-64,%rbp + mov %rsp,%r11 + sub %rbp,%r11 + and \$-4096,%r11 + lea (%rbp,%r11),%rsp + mov (%rsp),%r10 + cmp %rbp,%rsp + ja .Lpwr_page_walk + jmp .Lpwr_page_walk_done + +.Lpwr_page_walk: + lea -4096(%rsp),%rsp + mov (%rsp),%r10 + cmp %rbp,%rsp + ja .Lpwr_page_walk +.Lpwr_page_walk_done: + + mov $num,%r10 + neg $num + + ############################################################## + # Stack layout + # + # +0 saved $num, used in reduction section + # +8 &t[2*$num], used in reduction section + # +32 saved *n0 + # +40 saved %rsp + # +48 t[2*$num] + # + mov $n0, 32(%rsp) + mov %rax, 40(%rsp) # save original %rsp +.cfi_cfa_expression %rsp+40,deref,+8 +.Lpower5_body: + movq $rptr,%xmm1 # save $rptr, used in sqr8x + movq $nptr,%xmm2 # save $nptr + movq %r10, %xmm3 # -$num, used in sqr8x + movq $bptr,%xmm4 + + call __bn_sqr8x_internal + call __bn_post4x_internal + call __bn_sqr8x_internal + call __bn_post4x_internal + call __bn_sqr8x_internal + call __bn_post4x_internal + call __bn_sqr8x_internal + call __bn_post4x_internal + call __bn_sqr8x_internal + call __bn_post4x_internal + + movq %xmm2,$nptr + movq %xmm4,$bptr + mov $aptr,$rptr + mov 40(%rsp),%rax + lea 32(%rsp),$n0 + + call mul4x_internal + + mov 40(%rsp),%rsi # restore %rsp +.cfi_def_cfa %rsi,8 + mov \$1,%rax + mov -48(%rsi),%r15 +.cfi_restore %r15 + mov -40(%rsi),%r14 +.cfi_restore %r14 + mov -32(%rsi),%r13 +.cfi_restore %r13 + mov -24(%rsi),%r12 +.cfi_restore %r12 + mov -16(%rsi),%rbp +.cfi_restore %rbp + mov -8(%rsi),%rbx +.cfi_restore %rbx + lea (%rsi),%rsp 
+.cfi_def_cfa_register %rsp +.Lpower5_epilogue: + ret +.cfi_endproc +.size bn_power5_nohw,.-bn_power5_nohw + +.globl bn_sqr8x_internal +.hidden bn_sqr8x_internal +.type bn_sqr8x_internal,\@abi-omnipotent +.align 32 +bn_sqr8x_internal: +__bn_sqr8x_internal: +.cfi_startproc + _CET_ENDBR + ############################################################## + # Squaring part: + # + # a) multiply-n-add everything but a[i]*a[i]; + # b) shift result of a) by 1 to the left and accumulate + # a[i]*a[i] products; + # + ############################################################## + # a[1]a[0] + # a[2]a[0] + # a[3]a[0] + # a[2]a[1] + # a[4]a[0] + # a[3]a[1] + # a[5]a[0] + # a[4]a[1] + # a[3]a[2] + # a[6]a[0] + # a[5]a[1] + # a[4]a[2] + # a[7]a[0] + # a[6]a[1] + # a[5]a[2] + # a[4]a[3] + # a[7]a[1] + # a[6]a[2] + # a[5]a[3] + # a[7]a[2] + # a[6]a[3] + # a[5]a[4] + # a[7]a[3] + # a[6]a[4] + # a[7]a[4] + # a[6]a[5] + # a[7]a[5] + # a[7]a[6] + # a[1]a[0] + # a[2]a[0] + # a[3]a[0] + # a[4]a[0] + # a[5]a[0] + # a[6]a[0] + # a[7]a[0] + # a[2]a[1] + # a[3]a[1] + # a[4]a[1] + # a[5]a[1] + # a[6]a[1] + # a[7]a[1] + # a[3]a[2] + # a[4]a[2] + # a[5]a[2] + # a[6]a[2] + # a[7]a[2] + # a[4]a[3] + # a[5]a[3] + # a[6]a[3] + # a[7]a[3] + # a[5]a[4] + # a[6]a[4] + # a[7]a[4] + # a[6]a[5] + # a[7]a[5] + # a[7]a[6] + # a[0]a[0] + # a[1]a[1] + # a[2]a[2] + # a[3]a[3] + # a[4]a[4] + # a[5]a[5] + # a[6]a[6] + # a[7]a[7] + + lea 32(%r10),$i # $i=-($num-32) + lea ($aptr,$num),$aptr # end of a[] buffer, ($aptr,$i)=&ap[2] + + mov $num,$j # $j=$num + + # comments apply to $num==8 case + mov -32($aptr,$i),$a0 # a[0] + lea 48+8(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num] + mov -24($aptr,$i),%rax # a[1] + lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"] + mov -16($aptr,$i),$ai # a[2] + mov %rax,$a1 + + mul $a0 # a[1]*a[0] + mov %rax,$A0[0] # a[1]*a[0] + mov $ai,%rax # a[2] + mov %rdx,$A0[1] + mov $A0[0],-24($tptr,$i) # t[1] + + mul $a0 # a[2]*a[0] + add %rax,$A0[1] + mov $ai,%rax + adc \$0,%rdx + mov $A0[1],-16($tptr,$i) # t[2] + mov %rdx,$A0[0] + + + mov -8($aptr,$i),$ai # a[3] + mul $a1 # a[2]*a[1] + mov %rax,$A1[0] # a[2]*a[1]+t[3] + mov $ai,%rax + mov %rdx,$A1[1] + + lea ($i),$j + mul $a0 # a[3]*a[0] + add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3] + mov $ai,%rax + mov %rdx,$A0[1] + adc \$0,$A0[1] + add $A1[0],$A0[0] + adc \$0,$A0[1] + mov $A0[0],-8($tptr,$j) # t[3] + jmp .Lsqr4x_1st + +.align 32 +.Lsqr4x_1st: + mov ($aptr,$j),$ai # a[4] + mul $a1 # a[3]*a[1] + add %rax,$A1[1] # a[3]*a[1]+t[4] + mov $ai,%rax + mov %rdx,$A1[0] + adc \$0,$A1[0] + + mul $a0 # a[4]*a[0] + add %rax,$A0[1] # a[4]*a[0]+a[3]*a[1]+t[4] + mov $ai,%rax # a[3] + mov 8($aptr,$j),$ai # a[5] + mov %rdx,$A0[0] + adc \$0,$A0[0] + add $A1[1],$A0[1] + adc \$0,$A0[0] + + + mul $a1 # a[4]*a[3] + add %rax,$A1[0] # a[4]*a[3]+t[5] + mov $ai,%rax + mov $A0[1],($tptr,$j) # t[4] + mov %rdx,$A1[1] + adc \$0,$A1[1] + + mul $a0 # a[5]*a[2] + add %rax,$A0[0] # a[5]*a[2]+a[4]*a[3]+t[5] + mov $ai,%rax + mov 16($aptr,$j),$ai # a[6] + mov %rdx,$A0[1] + adc \$0,$A0[1] + add $A1[0],$A0[0] + adc \$0,$A0[1] + + mul $a1 # a[5]*a[3] + add %rax,$A1[1] # a[5]*a[3]+t[6] + mov $ai,%rax + mov $A0[0],8($tptr,$j) # t[5] + mov %rdx,$A1[0] + adc \$0,$A1[0] + + mul $a0 # a[6]*a[2] + add %rax,$A0[1] # a[6]*a[2]+a[5]*a[3]+t[6] + mov $ai,%rax # a[3] + mov 24($aptr,$j),$ai # a[7] + mov %rdx,$A0[0] + adc \$0,$A0[0] + add $A1[1],$A0[1] + adc \$0,$A0[0] + + + mul $a1 # a[6]*a[5] + add %rax,$A1[0] # a[6]*a[5]+t[7] + mov $ai,%rax + mov $A0[1],16($tptr,$j) # t[6] + mov %rdx,$A1[1] + adc 
\$0,$A1[1] + lea 32($j),$j + + mul $a0 # a[7]*a[4] + add %rax,$A0[0] # a[7]*a[4]+a[6]*a[5]+t[6] + mov $ai,%rax + mov %rdx,$A0[1] + adc \$0,$A0[1] + add $A1[0],$A0[0] + adc \$0,$A0[1] + mov $A0[0],-8($tptr,$j) # t[7] + + cmp \$0,$j + jne .Lsqr4x_1st + + mul $a1 # a[7]*a[5] + add %rax,$A1[1] + lea 16($i),$i + adc \$0,%rdx + add $A0[1],$A1[1] + adc \$0,%rdx + + mov $A1[1],($tptr) # t[8] + mov %rdx,$A1[0] + mov %rdx,8($tptr) # t[9] + jmp .Lsqr4x_outer + +.align 32 +.Lsqr4x_outer: # comments apply to $num==6 case + mov -32($aptr,$i),$a0 # a[0] + lea 48+8(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num] + mov -24($aptr,$i),%rax # a[1] + lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"] + mov -16($aptr,$i),$ai # a[2] + mov %rax,$a1 + + mul $a0 # a[1]*a[0] + mov -24($tptr,$i),$A0[0] # t[1] + add %rax,$A0[0] # a[1]*a[0]+t[1] + mov $ai,%rax # a[2] + adc \$0,%rdx + mov $A0[0],-24($tptr,$i) # t[1] + mov %rdx,$A0[1] + + mul $a0 # a[2]*a[0] + add %rax,$A0[1] + mov $ai,%rax + adc \$0,%rdx + add -16($tptr,$i),$A0[1] # a[2]*a[0]+t[2] + mov %rdx,$A0[0] + adc \$0,$A0[0] + mov $A0[1],-16($tptr,$i) # t[2] + + xor $A1[0],$A1[0] + + mov -8($aptr,$i),$ai # a[3] + mul $a1 # a[2]*a[1] + add %rax,$A1[0] # a[2]*a[1]+t[3] + mov $ai,%rax + adc \$0,%rdx + add -8($tptr,$i),$A1[0] + mov %rdx,$A1[1] + adc \$0,$A1[1] + + mul $a0 # a[3]*a[0] + add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3] + mov $ai,%rax + adc \$0,%rdx + add $A1[0],$A0[0] + mov %rdx,$A0[1] + adc \$0,$A0[1] + mov $A0[0],-8($tptr,$i) # t[3] + + lea ($i),$j + jmp .Lsqr4x_inner + +.align 32 +.Lsqr4x_inner: + mov ($aptr,$j),$ai # a[4] + mul $a1 # a[3]*a[1] + add %rax,$A1[1] # a[3]*a[1]+t[4] + mov $ai,%rax + mov %rdx,$A1[0] + adc \$0,$A1[0] + add ($tptr,$j),$A1[1] + adc \$0,$A1[0] + + .byte 0x67 + mul $a0 # a[4]*a[0] + add %rax,$A0[1] # a[4]*a[0]+a[3]*a[1]+t[4] + mov $ai,%rax # a[3] + mov 8($aptr,$j),$ai # a[5] + mov %rdx,$A0[0] + adc \$0,$A0[0] + add $A1[1],$A0[1] + adc \$0,$A0[0] + + mul $a1 # a[4]*a[3] + add %rax,$A1[0] # a[4]*a[3]+t[5] + mov $A0[1],($tptr,$j) # t[4] + mov $ai,%rax + mov %rdx,$A1[1] + adc \$0,$A1[1] + add 8($tptr,$j),$A1[0] + lea 16($j),$j # j++ + adc \$0,$A1[1] + + mul $a0 # a[5]*a[2] + add %rax,$A0[0] # a[5]*a[2]+a[4]*a[3]+t[5] + mov $ai,%rax + adc \$0,%rdx + add $A1[0],$A0[0] + mov %rdx,$A0[1] + adc \$0,$A0[1] + mov $A0[0],-8($tptr,$j) # t[5], "preloaded t[1]" below + + cmp \$0,$j + jne .Lsqr4x_inner + + .byte 0x67 + mul $a1 # a[5]*a[3] + add %rax,$A1[1] + adc \$0,%rdx + add $A0[1],$A1[1] + adc \$0,%rdx + + mov $A1[1],($tptr) # t[6], "preloaded t[2]" below + mov %rdx,$A1[0] + mov %rdx,8($tptr) # t[7], "preloaded t[3]" below + + add \$16,$i + jnz .Lsqr4x_outer + + # comments apply to $num==4 case + mov -32($aptr),$a0 # a[0] + lea 48+8(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num] + mov -24($aptr),%rax # a[1] + lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"] + mov -16($aptr),$ai # a[2] + mov %rax,$a1 + + mul $a0 # a[1]*a[0] + add %rax,$A0[0] # a[1]*a[0]+t[1], preloaded t[1] + mov $ai,%rax # a[2] + mov %rdx,$A0[1] + adc \$0,$A0[1] + + mul $a0 # a[2]*a[0] + add %rax,$A0[1] + mov $ai,%rax + mov $A0[0],-24($tptr) # t[1] + mov %rdx,$A0[0] + adc \$0,$A0[0] + add $A1[1],$A0[1] # a[2]*a[0]+t[2], preloaded t[2] + mov -8($aptr),$ai # a[3] + adc \$0,$A0[0] + + mul $a1 # a[2]*a[1] + add %rax,$A1[0] # a[2]*a[1]+t[3], preloaded t[3] + mov $ai,%rax + mov $A0[1],-16($tptr) # t[2] + mov %rdx,$A1[1] + adc \$0,$A1[1] + + mul $a0 # a[3]*a[0] + add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3] + mov $ai,%rax + mov %rdx,$A0[1] + adc 
\$0,$A0[1] + add $A1[0],$A0[0] + adc \$0,$A0[1] + mov $A0[0],-8($tptr) # t[3] + + mul $a1 # a[3]*a[1] + add %rax,$A1[1] + mov -16($aptr),%rax # a[2] + adc \$0,%rdx + add $A0[1],$A1[1] + adc \$0,%rdx + + mov $A1[1],($tptr) # t[4] + mov %rdx,$A1[0] + mov %rdx,8($tptr) # t[5] + + mul $ai # a[2]*a[3] +___ +{ +my ($shift,$carry)=($a0,$a1); +my @S=(@A1,$ai,$n0); +$code.=<<___; + add \$16,$i + xor $shift,$shift + sub $num,$i # $i=16-$num + xor $carry,$carry + + add $A1[0],%rax # t[5] + adc \$0,%rdx + mov %rax,8($tptr) # t[5] + mov %rdx,16($tptr) # t[6] + mov $carry,24($tptr) # t[7] + + mov -16($aptr,$i),%rax # a[0] + lea 48+8(%rsp),$tptr + xor $A0[0],$A0[0] # t[0] + mov 8($tptr),$A0[1] # t[1] + + lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift + shr \$63,$A0[0] + lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | + shr \$63,$A0[1] + or $A0[0],$S[1] # | t[2*i]>>63 + mov 16($tptr),$A0[0] # t[2*i+2] # prefetch + mov $A0[1],$shift # shift=t[2*i+1]>>63 + mul %rax # a[i]*a[i] + neg $carry # mov $carry,cf + mov 24($tptr),$A0[1] # t[2*i+2+1] # prefetch + adc %rax,$S[0] + mov -8($aptr,$i),%rax # a[i+1] # prefetch + mov $S[0],($tptr) + adc %rdx,$S[1] + + lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift + mov $S[1],8($tptr) + sbb $carry,$carry # mov cf,$carry + shr \$63,$A0[0] + lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | + shr \$63,$A0[1] + or $A0[0],$S[3] # | t[2*i]>>63 + mov 32($tptr),$A0[0] # t[2*i+2] # prefetch + mov $A0[1],$shift # shift=t[2*i+1]>>63 + mul %rax # a[i]*a[i] + neg $carry # mov $carry,cf + mov 40($tptr),$A0[1] # t[2*i+2+1] # prefetch + adc %rax,$S[2] + mov 0($aptr,$i),%rax # a[i+1] # prefetch + mov $S[2],16($tptr) + adc %rdx,$S[3] + lea 16($i),$i + mov $S[3],24($tptr) + sbb $carry,$carry # mov cf,$carry + lea 64($tptr),$tptr + jmp .Lsqr4x_shift_n_add + +.align 32 +.Lsqr4x_shift_n_add: + lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift + shr \$63,$A0[0] + lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | + shr \$63,$A0[1] + or $A0[0],$S[1] # | t[2*i]>>63 + mov -16($tptr),$A0[0] # t[2*i+2] # prefetch + mov $A0[1],$shift # shift=t[2*i+1]>>63 + mul %rax # a[i]*a[i] + neg $carry # mov $carry,cf + mov -8($tptr),$A0[1] # t[2*i+2+1] # prefetch + adc %rax,$S[0] + mov -8($aptr,$i),%rax # a[i+1] # prefetch + mov $S[0],-32($tptr) + adc %rdx,$S[1] + + lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift + mov $S[1],-24($tptr) + sbb $carry,$carry # mov cf,$carry + shr \$63,$A0[0] + lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | + shr \$63,$A0[1] + or $A0[0],$S[3] # | t[2*i]>>63 + mov 0($tptr),$A0[0] # t[2*i+2] # prefetch + mov $A0[1],$shift # shift=t[2*i+1]>>63 + mul %rax # a[i]*a[i] + neg $carry # mov $carry,cf + mov 8($tptr),$A0[1] # t[2*i+2+1] # prefetch + adc %rax,$S[2] + mov 0($aptr,$i),%rax # a[i+1] # prefetch + mov $S[2],-16($tptr) + adc %rdx,$S[3] + + lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift + mov $S[3],-8($tptr) + sbb $carry,$carry # mov cf,$carry + shr \$63,$A0[0] + lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | + shr \$63,$A0[1] + or $A0[0],$S[1] # | t[2*i]>>63 + mov 16($tptr),$A0[0] # t[2*i+2] # prefetch + mov $A0[1],$shift # shift=t[2*i+1]>>63 + mul %rax # a[i]*a[i] + neg $carry # mov $carry,cf + mov 24($tptr),$A0[1] # t[2*i+2+1] # prefetch + adc %rax,$S[0] + mov 8($aptr,$i),%rax # a[i+1] # prefetch + mov $S[0],0($tptr) + adc %rdx,$S[1] + + lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift + mov $S[1],8($tptr) + sbb $carry,$carry # mov cf,$carry + shr \$63,$A0[0] + lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | + shr \$63,$A0[1] + or $A0[0],$S[3] # | t[2*i]>>63 + mov 32($tptr),$A0[0] # t[2*i+2] # prefetch + mov $A0[1],$shift # 
shift=t[2*i+1]>>63 + mul %rax # a[i]*a[i] + neg $carry # mov $carry,cf + mov 40($tptr),$A0[1] # t[2*i+2+1] # prefetch + adc %rax,$S[2] + mov 16($aptr,$i),%rax # a[i+1] # prefetch + mov $S[2],16($tptr) + adc %rdx,$S[3] + mov $S[3],24($tptr) + sbb $carry,$carry # mov cf,$carry + lea 64($tptr),$tptr + add \$32,$i + jnz .Lsqr4x_shift_n_add + + lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift + .byte 0x67 + shr \$63,$A0[0] + lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | + shr \$63,$A0[1] + or $A0[0],$S[1] # | t[2*i]>>63 + mov -16($tptr),$A0[0] # t[2*i+2] # prefetch + mov $A0[1],$shift # shift=t[2*i+1]>>63 + mul %rax # a[i]*a[i] + neg $carry # mov $carry,cf + mov -8($tptr),$A0[1] # t[2*i+2+1] # prefetch + adc %rax,$S[0] + mov -8($aptr),%rax # a[i+1] # prefetch + mov $S[0],-32($tptr) + adc %rdx,$S[1] + + lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1|shift + mov $S[1],-24($tptr) + sbb $carry,$carry # mov cf,$carry + shr \$63,$A0[0] + lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | + shr \$63,$A0[1] + or $A0[0],$S[3] # | t[2*i]>>63 + mul %rax # a[i]*a[i] + neg $carry # mov $carry,cf + adc %rax,$S[2] + adc %rdx,$S[3] + mov $S[2],-16($tptr) + mov $S[3],-8($tptr) +___ +} +###################################################################### +# Montgomery reduction part, "word-by-word" algorithm. +# +# This new path is inspired by multiple submissions from Intel, by +# Shay Gueron, Vlad Krasnov, Erdinc Ozturk, James Guilford, +# Vinodh Gopal... +{ +my ($nptr,$tptr,$carry,$m0)=("%rbp","%rdi","%rsi","%rbx"); + +$code.=<<___; + movq %xmm2,$nptr +__bn_sqr8x_reduction: + xor %rax,%rax + lea ($nptr,$num),%rcx # end of n[] + lea 48+8(%rsp,$num,2),%rdx # end of t[] buffer + mov %rcx,0+8(%rsp) + lea 48+8(%rsp,$num),$tptr # end of initial t[] window + mov %rdx,8+8(%rsp) + neg $num + jmp .L8x_reduction_loop + +.align 32 +.L8x_reduction_loop: + lea ($tptr,$num),$tptr # start of current t[] window + .byte 0x66 + mov 8*0($tptr),$m0 + mov 8*1($tptr),%r9 + mov 8*2($tptr),%r10 + mov 8*3($tptr),%r11 + mov 8*4($tptr),%r12 + mov 8*5($tptr),%r13 + mov 8*6($tptr),%r14 + mov 8*7($tptr),%r15 + mov %rax,(%rdx) # store top-most carry bit + lea 8*8($tptr),$tptr + + .byte 0x67 + mov $m0,%r8 + imulq 32+8(%rsp),$m0 # n0*a[0] + mov 8*0($nptr),%rax # n[0] + mov \$8,%ecx + jmp .L8x_reduce + +.align 32 +.L8x_reduce: + mulq $m0 + mov 8*1($nptr),%rax # n[1] + neg %r8 + mov %rdx,%r8 + adc \$0,%r8 + + mulq $m0 + add %rax,%r9 + mov 8*2($nptr),%rax + adc \$0,%rdx + add %r9,%r8 + mov $m0,48-8+8(%rsp,%rcx,8) # put aside n0*a[i] + mov %rdx,%r9 + adc \$0,%r9 + + mulq $m0 + add %rax,%r10 + mov 8*3($nptr),%rax + adc \$0,%rdx + add %r10,%r9 + mov 32+8(%rsp),$carry # pull n0, borrow $carry + mov %rdx,%r10 + adc \$0,%r10 + + mulq $m0 + add %rax,%r11 + mov 8*4($nptr),%rax + adc \$0,%rdx + imulq %r8,$carry # modulo-scheduled + add %r11,%r10 + mov %rdx,%r11 + adc \$0,%r11 + + mulq $m0 + add %rax,%r12 + mov 8*5($nptr),%rax + adc \$0,%rdx + add %r12,%r11 + mov %rdx,%r12 + adc \$0,%r12 + + mulq $m0 + add %rax,%r13 + mov 8*6($nptr),%rax + adc \$0,%rdx + add %r13,%r12 + mov %rdx,%r13 + adc \$0,%r13 + + mulq $m0 + add %rax,%r14 + mov 8*7($nptr),%rax + adc \$0,%rdx + add %r14,%r13 + mov %rdx,%r14 + adc \$0,%r14 + + mulq $m0 + mov $carry,$m0 # n0*a[i] + add %rax,%r15 + mov 8*0($nptr),%rax # n[0] + adc \$0,%rdx + add %r15,%r14 + mov %rdx,%r15 + adc \$0,%r15 + + dec %ecx + jnz .L8x_reduce + + lea 8*8($nptr),$nptr + xor %rax,%rax + mov 8+8(%rsp),%rdx # pull end of t[] + cmp 0+8(%rsp),$nptr # end of n[]? 
+ jae .L8x_no_tail + + .byte 0x66 + add 8*0($tptr),%r8 + adc 8*1($tptr),%r9 + adc 8*2($tptr),%r10 + adc 8*3($tptr),%r11 + adc 8*4($tptr),%r12 + adc 8*5($tptr),%r13 + adc 8*6($tptr),%r14 + adc 8*7($tptr),%r15 + sbb $carry,$carry # top carry + + mov 48+56+8(%rsp),$m0 # pull n0*a[0] + mov \$8,%ecx + mov 8*0($nptr),%rax + jmp .L8x_tail + +.align 32 +.L8x_tail: + mulq $m0 + add %rax,%r8 + mov 8*1($nptr),%rax + mov %r8,($tptr) # save result + mov %rdx,%r8 + adc \$0,%r8 + + mulq $m0 + add %rax,%r9 + mov 8*2($nptr),%rax + adc \$0,%rdx + add %r9,%r8 + lea 8($tptr),$tptr # $tptr++ + mov %rdx,%r9 + adc \$0,%r9 + + mulq $m0 + add %rax,%r10 + mov 8*3($nptr),%rax + adc \$0,%rdx + add %r10,%r9 + mov %rdx,%r10 + adc \$0,%r10 + + mulq $m0 + add %rax,%r11 + mov 8*4($nptr),%rax + adc \$0,%rdx + add %r11,%r10 + mov %rdx,%r11 + adc \$0,%r11 + + mulq $m0 + add %rax,%r12 + mov 8*5($nptr),%rax + adc \$0,%rdx + add %r12,%r11 + mov %rdx,%r12 + adc \$0,%r12 + + mulq $m0 + add %rax,%r13 + mov 8*6($nptr),%rax + adc \$0,%rdx + add %r13,%r12 + mov %rdx,%r13 + adc \$0,%r13 + + mulq $m0 + add %rax,%r14 + mov 8*7($nptr),%rax + adc \$0,%rdx + add %r14,%r13 + mov %rdx,%r14 + adc \$0,%r14 + + mulq $m0 + mov 48-16+8(%rsp,%rcx,8),$m0# pull n0*a[i] + add %rax,%r15 + adc \$0,%rdx + add %r15,%r14 + mov 8*0($nptr),%rax # pull n[0] + mov %rdx,%r15 + adc \$0,%r15 + + dec %ecx + jnz .L8x_tail + + lea 8*8($nptr),$nptr + mov 8+8(%rsp),%rdx # pull end of t[] + cmp 0+8(%rsp),$nptr # end of n[]? + jae .L8x_tail_done # break out of loop + + mov 48+56+8(%rsp),$m0 # pull n0*a[0] + neg $carry + mov 8*0($nptr),%rax # pull n[0] + adc 8*0($tptr),%r8 + adc 8*1($tptr),%r9 + adc 8*2($tptr),%r10 + adc 8*3($tptr),%r11 + adc 8*4($tptr),%r12 + adc 8*5($tptr),%r13 + adc 8*6($tptr),%r14 + adc 8*7($tptr),%r15 + sbb $carry,$carry # top carry + + mov \$8,%ecx + jmp .L8x_tail + +.align 32 +.L8x_tail_done: + xor %rax,%rax + add (%rdx),%r8 # can this overflow? + adc \$0,%r9 + adc \$0,%r10 + adc \$0,%r11 + adc \$0,%r12 + adc \$0,%r13 + adc \$0,%r14 + adc \$0,%r15 + adc \$0,%rax + + neg $carry +.L8x_no_tail: + adc 8*0($tptr),%r8 + adc 8*1($tptr),%r9 + adc 8*2($tptr),%r10 + adc 8*3($tptr),%r11 + adc 8*4($tptr),%r12 + adc 8*5($tptr),%r13 + adc 8*6($tptr),%r14 + adc 8*7($tptr),%r15 + adc \$0,%rax # top-most carry + mov -8($nptr),%rcx # np[num-1] + xor $carry,$carry + + movq %xmm2,$nptr # restore $nptr + + mov %r8,8*0($tptr) # store top 512 bits + mov %r9,8*1($tptr) + movq %xmm3,$num # $num is %r9, can't be moved upwards + mov %r10,8*2($tptr) + mov %r11,8*3($tptr) + mov %r12,8*4($tptr) + mov %r13,8*5($tptr) + mov %r14,8*6($tptr) + mov %r15,8*7($tptr) + lea 8*8($tptr),$tptr + + cmp %rdx,$tptr # end of t[]? 
+ jb .L8x_reduction_loop + ret +.cfi_endproc +.size bn_sqr8x_internal,.-bn_sqr8x_internal +___ +} +############################################################## +# Post-condition, 4x unrolled +# +{ +my ($tptr,$nptr)=("%rbx","%rbp"); +$code.=<<___; +.type __bn_post4x_internal,\@abi-omnipotent +.align 32 +__bn_post4x_internal: +.cfi_startproc + mov 8*0($nptr),%r12 + lea (%rdi,$num),$tptr # %rdi was $tptr above + mov $num,%rcx + movq %xmm1,$rptr # restore $rptr + neg %rax + movq %xmm1,$aptr # prepare for back-to-back call + sar \$3+2,%rcx + dec %r12 # so that after 'not' we get -n[0] + xor %r10,%r10 + mov 8*1($nptr),%r13 + mov 8*2($nptr),%r14 + mov 8*3($nptr),%r15 + jmp .Lsqr4x_sub_entry + +.align 16 +.Lsqr4x_sub: + mov 8*0($nptr),%r12 + mov 8*1($nptr),%r13 + mov 8*2($nptr),%r14 + mov 8*3($nptr),%r15 +.Lsqr4x_sub_entry: + lea 8*4($nptr),$nptr + not %r12 + not %r13 + not %r14 + not %r15 + and %rax,%r12 + and %rax,%r13 + and %rax,%r14 + and %rax,%r15 + + neg %r10 # mov %r10,%cf + adc 8*0($tptr),%r12 + adc 8*1($tptr),%r13 + adc 8*2($tptr),%r14 + adc 8*3($tptr),%r15 + mov %r12,8*0($rptr) + lea 8*4($tptr),$tptr + mov %r13,8*1($rptr) + sbb %r10,%r10 # mov %cf,%r10 + mov %r14,8*2($rptr) + mov %r15,8*3($rptr) + lea 8*4($rptr),$rptr + + inc %rcx # pass %cf + jnz .Lsqr4x_sub + + mov $num,%r10 # prepare for back-to-back call + neg $num # restore $num + ret +.cfi_endproc +.size __bn_post4x_internal,.-__bn_post4x_internal +___ +} +}}} + +if ($addx) {{{ +my $bp="%rdx"; # restore original value + +$code.=<<___; +.globl bn_mulx4x_mont_gather5 +.type bn_mulx4x_mont_gather5,\@function,6 +.align 32 +bn_mulx4x_mont_gather5: +.cfi_startproc + _CET_ENDBR + mov %rsp,%rax +.cfi_def_cfa_register %rax + push %rbx +.cfi_push %rbx + push %rbp +.cfi_push %rbp + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 +.Lmulx4x_prologue: + + # num is declared as an int, a 32-bit parameter, so the upper half is + # undefined. It is important that this write to ${num}, which zeros the + # upper half, predates the first access. + shl \$3,${num}d # convert $num to bytes + lea ($num,$num,2),%r10 # 3*$num in bytes + neg $num # -$num + mov ($n0),$n0 # *n0 + + ############################################################## + # Ensure that stack frame doesn't alias with $rptr+3*$num + # modulo 4096, which covers ret[num], am[num] and n[num] + # (see bn_exp.c). This is done to allow memory disambiguation + # logic do its magic. [Extra [num] is allocated in order + # to align with bn_power5's frame, which is cleansed after + # completing exponentiation. Extra 256 bytes is for power mask + # calculated from 7th argument, the index.] 
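+	# Without this adjustment a stack word and a caller buffer word could
+	# share the same offset within a 4KB page; such 4K aliasing can make
+	# the CPU's memory-disambiguation logic assume a false dependence
+	# between the stores and loads below and stall the loads.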
+ # + lea -320(%rsp,$num,2),%r11 + mov %rsp,%rbp + sub $rp,%r11 + and \$4095,%r11 + cmp %r11,%r10 + jb .Lmulx4xsp_alt + sub %r11,%rbp # align with $aptr + lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256) + jmp .Lmulx4xsp_done + +.Lmulx4xsp_alt: + lea 4096-320(,$num,2),%r10 + lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256) + sub %r10,%r11 + mov \$0,%r10 + cmovc %r10,%r11 + sub %r11,%rbp +.Lmulx4xsp_done: + and \$-64,%rbp # ensure alignment + mov %rsp,%r11 + sub %rbp,%r11 + and \$-4096,%r11 + lea (%rbp,%r11),%rsp + mov (%rsp),%r10 + cmp %rbp,%rsp + ja .Lmulx4x_page_walk + jmp .Lmulx4x_page_walk_done + +.Lmulx4x_page_walk: + lea -4096(%rsp),%rsp + mov (%rsp),%r10 + cmp %rbp,%rsp + ja .Lmulx4x_page_walk +.Lmulx4x_page_walk_done: + + ############################################################## + # Stack layout + # +0 -num + # +8 off-loaded &b[i] + # +16 end of b[num] + # +24 inner counter + # +32 saved n0 + # +40 saved %rsp + # +48 + # +56 saved rp + # +64 tmp[num+1] + # + mov $n0, 32(%rsp) # save *n0 + mov %rax,40(%rsp) # save original %rsp +.cfi_cfa_expression %rsp+40,deref,+8 +.Lmulx4x_body: + call mulx4x_internal + + mov 40(%rsp),%rsi # restore %rsp +.cfi_def_cfa %rsi,8 + mov \$1,%rax + + mov -48(%rsi),%r15 +.cfi_restore %r15 + mov -40(%rsi),%r14 +.cfi_restore %r14 + mov -32(%rsi),%r13 +.cfi_restore %r13 + mov -24(%rsi),%r12 +.cfi_restore %r12 + mov -16(%rsi),%rbp +.cfi_restore %rbp + mov -8(%rsi),%rbx +.cfi_restore %rbx + lea (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lmulx4x_epilogue: + ret +.cfi_endproc +.size bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5 + +.type mulx4x_internal,\@abi-omnipotent +.align 32 +mulx4x_internal: +.cfi_startproc + mov $num,8(%rsp) # save -$num (it was in bytes) + mov $num,%r10 + neg $num # restore $num + shl \$5,$num + neg %r10 # restore $num + lea 128($bp,$num),%r13 # end of powers table (+size optimization) + shr \$5+5,$num + movd `($win64?56:8)`(%rax),%xmm5 # load 7th argument + sub \$1,$num + lea .Linc(%rip),%rax + mov %r13,16+8(%rsp) # end of b[num] + mov $num,24+8(%rsp) # inner counter + mov $rp, 56+8(%rsp) # save $rp +___ +my ($aptr, $bptr, $nptr, $tptr, $mi, $bi, $zero, $num)= + ("%rsi","%rdi","%rcx","%rbx","%r8","%r9","%rbp","%rax"); +my $rptr=$bptr; +my $STRIDE=2**5*8; # 5 is "window size" +$code.=<<___; + movdqa 0(%rax),%xmm0 # 00000001000000010000000000000000 + movdqa 16(%rax),%xmm1 # 00000002000000020000000200000002 + lea 88-112(%rsp,%r10),%r10 # place the mask after tp[num+1] (+ICache optimization) + lea 128($bp),$bptr # size optimization + + pshufd \$0,%xmm5,%xmm5 # broadcast index + movdqa %xmm1,%xmm4 + .byte 0x67 + movdqa %xmm1,%xmm2 +___ +######################################################################## +# Calculate masks by comparing 0..31 to $idx and save result to stack. +# +# We compute sixteen 16-byte masks and store them on the stack. Mask i is stored +# in `16*i - 128`(%rax) and contains the comparisons for idx == 2*i and +# idx == 2*i + 1 in its lower and upper halves, respectively. Mask calculations +# are scheduled in groups of four. 
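+# The gather code below then ANDs each 16-byte pair of table words with its
+# mask and ORs everything together, so the selected power is extracted with a
+# memory access pattern that does not depend on $idx; this is the cache-timing
+# countermeasure described in the header of this file.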
+$code.=<<___; + .byte 0x67 + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 # compare to 1,0 + movdqa %xmm4,%xmm3 +___ +for($i=0;$i<$STRIDE/16-4;$i+=4) { +$code.=<<___; + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 # compare to 3,2 + movdqa %xmm0,`16*($i+0)+112`(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 # compare to 5,4 + movdqa %xmm1,`16*($i+1)+112`(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 # compare to 7,6 + movdqa %xmm2,`16*($i+2)+112`(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,`16*($i+3)+112`(%r10) + movdqa %xmm4,%xmm3 +___ +} +$code.=<<___; # last iteration can be optimized + .byte 0x67 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,`16*($i+0)+112`(%r10) + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,`16*($i+1)+112`(%r10) + + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,`16*($i+2)+112`(%r10) + + pand `16*($i+0)-128`($bptr),%xmm0 # while it's still in register + pand `16*($i+1)-128`($bptr),%xmm1 + pand `16*($i+2)-128`($bptr),%xmm2 + movdqa %xmm3,`16*($i+3)+112`(%r10) + pand `16*($i+3)-128`($bptr),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 +___ +for($i=0;$i<$STRIDE/16-4;$i+=4) { +$code.=<<___; + movdqa `16*($i+0)-128`($bptr),%xmm4 + movdqa `16*($i+1)-128`($bptr),%xmm5 + movdqa `16*($i+2)-128`($bptr),%xmm2 + pand `16*($i+0)+112`(%r10),%xmm4 + movdqa `16*($i+3)-128`($bptr),%xmm3 + pand `16*($i+1)+112`(%r10),%xmm5 + por %xmm4,%xmm0 + pand `16*($i+2)+112`(%r10),%xmm2 + por %xmm5,%xmm1 + pand `16*($i+3)+112`(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 +___ +} +$code.=<<___; + pxor %xmm1,%xmm0 + # Combine the upper and lower halves of %xmm0. + pshufd \$0x4e,%xmm0,%xmm1 # Swap upper and lower halves. + por %xmm1,%xmm0 + lea $STRIDE($bptr),$bptr + movq %xmm0,%rdx # bp[0] + lea 64+8*4+8(%rsp),$tptr + + mov %rdx,$bi + mulx 0*8($aptr),$mi,%rax # a[0]*b[0] + mulx 1*8($aptr),%r11,%r12 # a[1]*b[0] + add %rax,%r11 + mulx 2*8($aptr),%rax,%r13 # ... + adc %rax,%r12 + adc \$0,%r13 + mulx 3*8($aptr),%rax,%r14 + + mov $mi,%r15 + imulq 32+8(%rsp),$mi # "t[0]"*n0 + xor $zero,$zero # cf=0, of=0 + mov $mi,%rdx + + mov $bptr,8+8(%rsp) # off-load &b[i] + + lea 4*8($aptr),$aptr + adcx %rax,%r13 + adcx $zero,%r14 # cf=0 + + mulx 0*8($nptr),%rax,%r10 + adcx %rax,%r15 # discarded + adox %r11,%r10 + mulx 1*8($nptr),%rax,%r11 + adcx %rax,%r10 + adox %r12,%r11 + mulx 2*8($nptr),%rax,%r12 + mov 24+8(%rsp),$bptr # counter value + mov %r10,-8*4($tptr) + adcx %rax,%r11 + adox %r13,%r12 + mulx 3*8($nptr),%rax,%r15 + mov $bi,%rdx + mov %r11,-8*3($tptr) + adcx %rax,%r12 + adox $zero,%r15 # of=0 + lea 4*8($nptr),$nptr + mov %r12,-8*2($tptr) + jmp .Lmulx4x_1st + +.align 32 +.Lmulx4x_1st: + adcx $zero,%r15 # cf=0, modulo-scheduled + mulx 0*8($aptr),%r10,%rax # a[4]*b[0] + adcx %r14,%r10 + mulx 1*8($aptr),%r11,%r14 # a[5]*b[0] + adcx %rax,%r11 + mulx 2*8($aptr),%r12,%rax # ... 
+ adcx %r14,%r12 + mulx 3*8($aptr),%r13,%r14 + .byte 0x67,0x67 + mov $mi,%rdx + adcx %rax,%r13 + adcx $zero,%r14 # cf=0 + lea 4*8($aptr),$aptr + lea 4*8($tptr),$tptr + + adox %r15,%r10 + mulx 0*8($nptr),%rax,%r15 + adcx %rax,%r10 + adox %r15,%r11 + mulx 1*8($nptr),%rax,%r15 + adcx %rax,%r11 + adox %r15,%r12 + mulx 2*8($nptr),%rax,%r15 + mov %r10,-5*8($tptr) + adcx %rax,%r12 + mov %r11,-4*8($tptr) + adox %r15,%r13 + mulx 3*8($nptr),%rax,%r15 + mov $bi,%rdx + mov %r12,-3*8($tptr) + adcx %rax,%r13 + adox $zero,%r15 + lea 4*8($nptr),$nptr + mov %r13,-2*8($tptr) + + dec $bptr # of=0, pass cf + jnz .Lmulx4x_1st + + mov 8(%rsp),$num # load -num + adc $zero,%r15 # modulo-scheduled + lea ($aptr,$num),$aptr # rewind $aptr + add %r15,%r14 + mov 8+8(%rsp),$bptr # re-load &b[i] + adc $zero,$zero # top-most carry + mov %r14,-1*8($tptr) + jmp .Lmulx4x_outer + +.align 32 +.Lmulx4x_outer: + lea 16-256($tptr),%r10 # where 256-byte mask is (+density control) + pxor %xmm4,%xmm4 + .byte 0x67,0x67 + pxor %xmm5,%xmm5 +___ +for($i=0;$i<$STRIDE/16;$i+=4) { +$code.=<<___; + movdqa `16*($i+0)-128`($bptr),%xmm0 + movdqa `16*($i+1)-128`($bptr),%xmm1 + movdqa `16*($i+2)-128`($bptr),%xmm2 + pand `16*($i+0)+256`(%r10),%xmm0 + movdqa `16*($i+3)-128`($bptr),%xmm3 + pand `16*($i+1)+256`(%r10),%xmm1 + por %xmm0,%xmm4 + pand `16*($i+2)+256`(%r10),%xmm2 + por %xmm1,%xmm5 + pand `16*($i+3)+256`(%r10),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 +___ +} +$code.=<<___; + por %xmm5,%xmm4 + # Combine the upper and lower halves of %xmm4 as %xmm0. + pshufd \$0x4e,%xmm4,%xmm0 # Swap upper and lower halves. + por %xmm4,%xmm0 + lea $STRIDE($bptr),$bptr + movq %xmm0,%rdx # m0=bp[i] + + mov $zero,($tptr) # save top-most carry + lea 4*8($tptr,$num),$tptr # rewind $tptr + mulx 0*8($aptr),$mi,%r11 # a[0]*b[i] + xor $zero,$zero # cf=0, of=0 + mov %rdx,$bi + mulx 1*8($aptr),%r14,%r12 # a[1]*b[i] + adox -4*8($tptr),$mi # +t[0] + adcx %r14,%r11 + mulx 2*8($aptr),%r15,%r13 # ... + adox -3*8($tptr),%r11 + adcx %r15,%r12 + mulx 3*8($aptr),%rdx,%r14 + adox -2*8($tptr),%r12 + adcx %rdx,%r13 + lea ($nptr,$num),$nptr # rewind $nptr + lea 4*8($aptr),$aptr + adox -1*8($tptr),%r13 + adcx $zero,%r14 + adox $zero,%r14 + + mov $mi,%r15 + imulq 32+8(%rsp),$mi # "t[0]"*n0 + + mov $mi,%rdx + xor $zero,$zero # cf=0, of=0 + mov $bptr,8+8(%rsp) # off-load &b[i] + + mulx 0*8($nptr),%rax,%r10 + adcx %rax,%r15 # discarded + adox %r11,%r10 + mulx 1*8($nptr),%rax,%r11 + adcx %rax,%r10 + adox %r12,%r11 + mulx 2*8($nptr),%rax,%r12 + adcx %rax,%r11 + adox %r13,%r12 + mulx 3*8($nptr),%rax,%r15 + mov $bi,%rdx + mov 24+8(%rsp),$bptr # counter value + mov %r10,-8*4($tptr) + adcx %rax,%r12 + mov %r11,-8*3($tptr) + adox $zero,%r15 # of=0 + mov %r12,-8*2($tptr) + lea 4*8($nptr),$nptr + jmp .Lmulx4x_inner + +.align 32 +.Lmulx4x_inner: + mulx 0*8($aptr),%r10,%rax # a[4]*b[i] + adcx $zero,%r15 # cf=0, modulo-scheduled + adox %r14,%r10 + mulx 1*8($aptr),%r11,%r14 # a[5]*b[i] + adcx 0*8($tptr),%r10 + adox %rax,%r11 + mulx 2*8($aptr),%r12,%rax # ... 
+ adcx 1*8($tptr),%r11 + adox %r14,%r12 + mulx 3*8($aptr),%r13,%r14 + mov $mi,%rdx + adcx 2*8($tptr),%r12 + adox %rax,%r13 + adcx 3*8($tptr),%r13 + adox $zero,%r14 # of=0 + lea 4*8($aptr),$aptr + lea 4*8($tptr),$tptr + adcx $zero,%r14 # cf=0 + + adox %r15,%r10 + mulx 0*8($nptr),%rax,%r15 + adcx %rax,%r10 + adox %r15,%r11 + mulx 1*8($nptr),%rax,%r15 + adcx %rax,%r11 + adox %r15,%r12 + mulx 2*8($nptr),%rax,%r15 + mov %r10,-5*8($tptr) + adcx %rax,%r12 + adox %r15,%r13 + mov %r11,-4*8($tptr) + mulx 3*8($nptr),%rax,%r15 + mov $bi,%rdx + lea 4*8($nptr),$nptr + mov %r12,-3*8($tptr) + adcx %rax,%r13 + adox $zero,%r15 + mov %r13,-2*8($tptr) + + dec $bptr # of=0, pass cf + jnz .Lmulx4x_inner + + mov 0+8(%rsp),$num # load -num + adc $zero,%r15 # modulo-scheduled + sub 0*8($tptr),$bptr # pull top-most carry to %cf + mov 8+8(%rsp),$bptr # re-load &b[i] + mov 16+8(%rsp),%r10 + adc %r15,%r14 + lea ($aptr,$num),$aptr # rewind $aptr + adc $zero,$zero # top-most carry + mov %r14,-1*8($tptr) + + cmp %r10,$bptr + jb .Lmulx4x_outer + + mov -8($nptr),%r10 + mov $zero,%r8 + mov ($nptr,$num),%r12 + lea ($nptr,$num),%rbp # rewind $nptr + mov $num,%rcx + lea ($tptr,$num),%rdi # rewind $tptr + xor %eax,%eax + xor %r15,%r15 + sub %r14,%r10 # compare top-most words + adc %r15,%r15 + or %r15,%r8 + sar \$3+2,%rcx + sub %r8,%rax # %rax=-%r8 + mov 56+8(%rsp),%rdx # restore rp + dec %r12 # so that after 'not' we get -n[0] + mov 8*1(%rbp),%r13 + xor %r8,%r8 + mov 8*2(%rbp),%r14 + mov 8*3(%rbp),%r15 + jmp .Lsqrx4x_sub_entry # common post-condition +.cfi_endproc +.size mulx4x_internal,.-mulx4x_internal +___ +} { +###################################################################### +# void bn_powerx5( +my $rptr="%rdi"; # BN_ULONG *rptr, +my $aptr="%rsi"; # const BN_ULONG *aptr, +my $bptr="%rdx"; # const void *table, +my $nptr="%rcx"; # const BN_ULONG *nptr, +my $n0 ="%r8"; # const BN_ULONG *n0); +my $num ="%r9"; # int num, has to be divisible by 8 + # int pwr); + +my ($i,$j,$tptr)=("%rbp","%rcx",$rptr); +my @A0=("%r10","%r11"); +my @A1=("%r12","%r13"); +my ($a0,$a1,$ai)=("%r14","%r15","%rbx"); + +$code.=<<___; +.globl bn_powerx5 +.type bn_powerx5,\@function,6 +.align 32 +bn_powerx5: +.cfi_startproc + _CET_ENDBR + mov %rsp,%rax +.cfi_def_cfa_register %rax + push %rbx +.cfi_push %rbx + push %rbp +.cfi_push %rbp + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 +.Lpowerx5_prologue: + + # num is declared as an int, a 32-bit parameter, so the upper half is + # undefined. It is important that this write to ${num}, which zeros the + # upper half, predates the first access. + shl \$3,${num}d # convert $num to bytes + lea ($num,$num,2),%r10 # 3*$num in bytes + neg $num + mov ($n0),$n0 # *n0 + + ############################################################## + # Ensure that stack frame doesn't alias with $rptr+3*$num + # modulo 4096, which covers ret[num], am[num] and n[num] + # (see bn_exp.c). This is done to allow memory disambiguation + # logic do its magic. [Extra 256 bytes is for power mask + # calculated from 7th argument, the index.] 
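+	# The body below mirrors bn_power5_nohw: five back-to-back Montgomery
+	# squarings (__bn_sqrx8x_internal + __bn_postx4x_internal) followed by
+	# one Montgomery multiplication by the gathered power (mulx4x_internal),
+	# only built on the MULX/ADCX/ADOX code paths mentioned in the header.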
+ # + lea -320(%rsp,$num,2),%r11 + mov %rsp,%rbp + sub $rptr,%r11 + and \$4095,%r11 + cmp %r11,%r10 + jb .Lpwrx_sp_alt + sub %r11,%rbp # align with $aptr + lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256) + jmp .Lpwrx_sp_done + +.align 32 +.Lpwrx_sp_alt: + lea 4096-320(,$num,2),%r10 + lea -320(%rbp,$num,2),%rbp # alloca(frame+2*$num*8+256) + sub %r10,%r11 + mov \$0,%r10 + cmovc %r10,%r11 + sub %r11,%rbp +.Lpwrx_sp_done: + and \$-64,%rbp + mov %rsp,%r11 + sub %rbp,%r11 + and \$-4096,%r11 + lea (%rbp,%r11),%rsp + mov (%rsp),%r10 + cmp %rbp,%rsp + ja .Lpwrx_page_walk + jmp .Lpwrx_page_walk_done + +.Lpwrx_page_walk: + lea -4096(%rsp),%rsp + mov (%rsp),%r10 + cmp %rbp,%rsp + ja .Lpwrx_page_walk +.Lpwrx_page_walk_done: + + mov $num,%r10 + neg $num + + ############################################################## + # Stack layout + # + # +0 saved $num, used in reduction section + # +8 &t[2*$num], used in reduction section + # +16 intermediate carry bit + # +24 top-most carry bit, used in reduction section + # +32 saved *n0 + # +40 saved %rsp + # +48 t[2*$num] + # + pxor %xmm0,%xmm0 + movq $rptr,%xmm1 # save $rptr + movq $nptr,%xmm2 # save $nptr + movq %r10, %xmm3 # -$num + movq $bptr,%xmm4 + mov $n0, 32(%rsp) + mov %rax, 40(%rsp) # save original %rsp +.cfi_cfa_expression %rsp+40,deref,+8 +.Lpowerx5_body: + + call __bn_sqrx8x_internal + call __bn_postx4x_internal + call __bn_sqrx8x_internal + call __bn_postx4x_internal + call __bn_sqrx8x_internal + call __bn_postx4x_internal + call __bn_sqrx8x_internal + call __bn_postx4x_internal + call __bn_sqrx8x_internal + call __bn_postx4x_internal + + mov %r10,$num # -num + mov $aptr,$rptr + movq %xmm2,$nptr + movq %xmm4,$bptr + mov 40(%rsp),%rax + + call mulx4x_internal + + mov 40(%rsp),%rsi # restore %rsp +.cfi_def_cfa %rsi,8 + mov \$1,%rax + + mov -48(%rsi),%r15 +.cfi_restore %r15 + mov -40(%rsi),%r14 +.cfi_restore %r14 + mov -32(%rsi),%r13 +.cfi_restore %r13 + mov -24(%rsi),%r12 +.cfi_restore %r12 + mov -16(%rsi),%rbp +.cfi_restore %rbp + mov -8(%rsi),%rbx +.cfi_restore %rbx + lea (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lpowerx5_epilogue: + ret +.cfi_endproc +.size bn_powerx5,.-bn_powerx5 + +.globl bn_sqrx8x_internal +.hidden bn_sqrx8x_internal +.type bn_sqrx8x_internal,\@abi-omnipotent +.align 32 +bn_sqrx8x_internal: +__bn_sqrx8x_internal: +.cfi_startproc + _CET_ENDBR + ################################################################## + # Squaring part: + # + # a) multiply-n-add everything but a[i]*a[i]; + # b) shift result of a) by 1 to the left and accumulate + # a[i]*a[i] products; + # + ################################################################## + # a[7]a[7]a[6]a[6]a[5]a[5]a[4]a[4]a[3]a[3]a[2]a[2]a[1]a[1]a[0]a[0] + # a[1]a[0] + # a[2]a[0] + # a[3]a[0] + # a[2]a[1] + # a[3]a[1] + # a[3]a[2] + # + # a[4]a[0] + # a[5]a[0] + # a[6]a[0] + # a[7]a[0] + # a[4]a[1] + # a[5]a[1] + # a[6]a[1] + # a[7]a[1] + # a[4]a[2] + # a[5]a[2] + # a[6]a[2] + # a[7]a[2] + # a[4]a[3] + # a[5]a[3] + # a[6]a[3] + # a[7]a[3] + # + # a[5]a[4] + # a[6]a[4] + # a[7]a[4] + # a[6]a[5] + # a[7]a[5] + # a[7]a[6] + # a[7]a[7]a[6]a[6]a[5]a[5]a[4]a[4]a[3]a[3]a[2]a[2]a[1]a[1]a[0]a[0] +___ +{ +my ($zero,$carry)=("%rbp","%rcx"); +my $aaptr=$zero; +$code.=<<___; + lea 48+8(%rsp),$tptr + lea ($aptr,$num),$aaptr + mov $num,0+8(%rsp) # save $num + mov $aaptr,8+8(%rsp) # save end of $aptr + jmp .Lsqr8x_zero_start + +.align 32 +.byte 0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00 +.Lsqrx8x_zero: + .byte 0x3e + movdqa %xmm0,0*8($tptr) + movdqa 
%xmm0,2*8($tptr) + movdqa %xmm0,4*8($tptr) + movdqa %xmm0,6*8($tptr) +.Lsqr8x_zero_start: # aligned at 32 + movdqa %xmm0,8*8($tptr) + movdqa %xmm0,10*8($tptr) + movdqa %xmm0,12*8($tptr) + movdqa %xmm0,14*8($tptr) + lea 16*8($tptr),$tptr + sub \$64,$num + jnz .Lsqrx8x_zero + + mov 0*8($aptr),%rdx # a[0], modulo-scheduled + #xor %r9,%r9 # t[1], ex-$num, zero already + xor %r10,%r10 + xor %r11,%r11 + xor %r12,%r12 + xor %r13,%r13 + xor %r14,%r14 + xor %r15,%r15 + lea 48+8(%rsp),$tptr + xor $zero,$zero # cf=0, cf=0 + jmp .Lsqrx8x_outer_loop + +.align 32 +.Lsqrx8x_outer_loop: + mulx 1*8($aptr),%r8,%rax # a[1]*a[0] + adcx %r9,%r8 # a[1]*a[0]+=t[1] + adox %rax,%r10 + mulx 2*8($aptr),%r9,%rax # a[2]*a[0] + adcx %r10,%r9 + adox %rax,%r11 + .byte 0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00 # mulx 3*8($aptr),%r10,%rax # ... + adcx %r11,%r10 + adox %rax,%r12 + .byte 0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00 # mulx 4*8($aptr),%r11,%rax + adcx %r12,%r11 + adox %rax,%r13 + mulx 5*8($aptr),%r12,%rax + adcx %r13,%r12 + adox %rax,%r14 + mulx 6*8($aptr),%r13,%rax + adcx %r14,%r13 + adox %r15,%rax + mulx 7*8($aptr),%r14,%r15 + mov 1*8($aptr),%rdx # a[1] + adcx %rax,%r14 + adox $zero,%r15 + adc 8*8($tptr),%r15 + mov %r8,1*8($tptr) # t[1] + mov %r9,2*8($tptr) # t[2] + sbb $carry,$carry # mov %cf,$carry + xor $zero,$zero # cf=0, of=0 + + + mulx 2*8($aptr),%r8,%rbx # a[2]*a[1] + mulx 3*8($aptr),%r9,%rax # a[3]*a[1] + adcx %r10,%r8 + adox %rbx,%r9 + mulx 4*8($aptr),%r10,%rbx # ... + adcx %r11,%r9 + adox %rax,%r10 + .byte 0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00 # mulx 5*8($aptr),%r11,%rax + adcx %r12,%r10 + adox %rbx,%r11 + .byte 0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 6*8($aptr),%r12,%rbx + adcx %r13,%r11 + adox %r14,%r12 + .byte 0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00 # mulx 7*8($aptr),%r13,%r14 + mov 2*8($aptr),%rdx # a[2] + adcx %rax,%r12 + adox %rbx,%r13 + adcx %r15,%r13 + adox $zero,%r14 # of=0 + adcx $zero,%r14 # cf=0 + + mov %r8,3*8($tptr) # t[3] + mov %r9,4*8($tptr) # t[4] + + mulx 3*8($aptr),%r8,%rbx # a[3]*a[2] + mulx 4*8($aptr),%r9,%rax # a[4]*a[2] + adcx %r10,%r8 + adox %rbx,%r9 + mulx 5*8($aptr),%r10,%rbx # ... + adcx %r11,%r9 + adox %rax,%r10 + .byte 0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00 # mulx 6*8($aptr),%r11,%rax + adcx %r12,%r10 + adox %r13,%r11 + .byte 0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00 # mulx 7*8($aptr),%r12,%r13 + .byte 0x3e + mov 3*8($aptr),%rdx # a[3] + adcx %rbx,%r11 + adox %rax,%r12 + adcx %r14,%r12 + mov %r8,5*8($tptr) # t[5] + mov %r9,6*8($tptr) # t[6] + mulx 4*8($aptr),%r8,%rax # a[4]*a[3] + adox $zero,%r13 # of=0 + adcx $zero,%r13 # cf=0 + + mulx 5*8($aptr),%r9,%rbx # a[5]*a[3] + adcx %r10,%r8 + adox %rax,%r9 + mulx 6*8($aptr),%r10,%rax # ... 
+ adcx %r11,%r9 + adox %r12,%r10 + mulx 7*8($aptr),%r11,%r12 + mov 4*8($aptr),%rdx # a[4] + mov 5*8($aptr),%r14 # a[5] + adcx %rbx,%r10 + adox %rax,%r11 + mov 6*8($aptr),%r15 # a[6] + adcx %r13,%r11 + adox $zero,%r12 # of=0 + adcx $zero,%r12 # cf=0 + + mov %r8,7*8($tptr) # t[7] + mov %r9,8*8($tptr) # t[8] + + mulx %r14,%r9,%rax # a[5]*a[4] + mov 7*8($aptr),%r8 # a[7] + adcx %r10,%r9 + mulx %r15,%r10,%rbx # a[6]*a[4] + adox %rax,%r10 + adcx %r11,%r10 + mulx %r8,%r11,%rax # a[7]*a[4] + mov %r14,%rdx # a[5] + adox %rbx,%r11 + adcx %r12,%r11 + #adox $zero,%rax # of=0 + adcx $zero,%rax # cf=0 + + mulx %r15,%r14,%rbx # a[6]*a[5] + mulx %r8,%r12,%r13 # a[7]*a[5] + mov %r15,%rdx # a[6] + lea 8*8($aptr),$aptr + adcx %r14,%r11 + adox %rbx,%r12 + adcx %rax,%r12 + adox $zero,%r13 + + .byte 0x67,0x67 + mulx %r8,%r8,%r14 # a[7]*a[6] + adcx %r8,%r13 + adcx $zero,%r14 + + cmp 8+8(%rsp),$aptr + je .Lsqrx8x_outer_break + + neg $carry # mov $carry,%cf + mov \$-8,%rcx + mov $zero,%r15 + mov 8*8($tptr),%r8 + adcx 9*8($tptr),%r9 # +=t[9] + adcx 10*8($tptr),%r10 # ... + adcx 11*8($tptr),%r11 + adc 12*8($tptr),%r12 + adc 13*8($tptr),%r13 + adc 14*8($tptr),%r14 + adc 15*8($tptr),%r15 + lea ($aptr),$aaptr + lea 2*64($tptr),$tptr + sbb %rax,%rax # mov %cf,$carry + + mov -64($aptr),%rdx # a[0] + mov %rax,16+8(%rsp) # offload $carry + mov $tptr,24+8(%rsp) + + #lea 8*8($tptr),$tptr # see 2*8*8($tptr) above + xor %eax,%eax # cf=0, of=0 + jmp .Lsqrx8x_loop + +.align 32 +.Lsqrx8x_loop: + mov %r8,%rbx + mulx 0*8($aaptr),%rax,%r8 # a[8]*a[i] + adcx %rax,%rbx # +=t[8] + adox %r9,%r8 + + mulx 1*8($aaptr),%rax,%r9 # ... + adcx %rax,%r8 + adox %r10,%r9 + + mulx 2*8($aaptr),%rax,%r10 + adcx %rax,%r9 + adox %r11,%r10 + + mulx 3*8($aaptr),%rax,%r11 + adcx %rax,%r10 + adox %r12,%r11 + + .byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 4*8($aaptr),%rax,%r12 + adcx %rax,%r11 + adox %r13,%r12 + + mulx 5*8($aaptr),%rax,%r13 + adcx %rax,%r12 + adox %r14,%r13 + + mulx 6*8($aaptr),%rax,%r14 + mov %rbx,($tptr,%rcx,8) # store t[8+i] + mov \$0,%ebx + adcx %rax,%r13 + adox %r15,%r14 + + .byte 0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00 # mulx 7*8($aaptr),%rax,%r15 + mov 8($aptr,%rcx,8),%rdx # a[i] + adcx %rax,%r14 + adox %rbx,%r15 # %rbx is 0, of=0 + adcx %rbx,%r15 # cf=0 + + .byte 0x67 + inc %rcx # of=0 + jnz .Lsqrx8x_loop + + lea 8*8($aaptr),$aaptr + mov \$-8,%rcx + cmp 8+8(%rsp),$aaptr # done? 
+ je .Lsqrx8x_break + + sub 16+8(%rsp),%rbx # mov 16(%rsp),%cf + .byte 0x66 + mov -64($aptr),%rdx + adcx 0*8($tptr),%r8 + adcx 1*8($tptr),%r9 + adc 2*8($tptr),%r10 + adc 3*8($tptr),%r11 + adc 4*8($tptr),%r12 + adc 5*8($tptr),%r13 + adc 6*8($tptr),%r14 + adc 7*8($tptr),%r15 + lea 8*8($tptr),$tptr + .byte 0x67 + sbb %rax,%rax # mov %cf,%rax + xor %ebx,%ebx # cf=0, of=0 + mov %rax,16+8(%rsp) # offload carry + jmp .Lsqrx8x_loop + +.align 32 +.Lsqrx8x_break: + xor $zero,$zero + sub 16+8(%rsp),%rbx # mov 16(%rsp),%cf + adcx $zero,%r8 + mov 24+8(%rsp),$carry # initial $tptr, borrow $carry + adcx $zero,%r9 + mov 0*8($aptr),%rdx # a[8], modulo-scheduled + adc \$0,%r10 + mov %r8,0*8($tptr) + adc \$0,%r11 + adc \$0,%r12 + adc \$0,%r13 + adc \$0,%r14 + adc \$0,%r15 + cmp $carry,$tptr # cf=0, of=0 + je .Lsqrx8x_outer_loop + + mov %r9,1*8($tptr) + mov 1*8($carry),%r9 + mov %r10,2*8($tptr) + mov 2*8($carry),%r10 + mov %r11,3*8($tptr) + mov 3*8($carry),%r11 + mov %r12,4*8($tptr) + mov 4*8($carry),%r12 + mov %r13,5*8($tptr) + mov 5*8($carry),%r13 + mov %r14,6*8($tptr) + mov 6*8($carry),%r14 + mov %r15,7*8($tptr) + mov 7*8($carry),%r15 + mov $carry,$tptr + jmp .Lsqrx8x_outer_loop + +.align 32 +.Lsqrx8x_outer_break: + mov %r9,9*8($tptr) # t[9] + movq %xmm3,%rcx # -$num + mov %r10,10*8($tptr) # ... + mov %r11,11*8($tptr) + mov %r12,12*8($tptr) + mov %r13,13*8($tptr) + mov %r14,14*8($tptr) +___ +} { +my $i="%rcx"; +$code.=<<___; + lea 48+8(%rsp),$tptr + mov ($aptr,$i),%rdx # a[0] + + mov 8($tptr),$A0[1] # t[1] + xor $A0[0],$A0[0] # t[0], of=0, cf=0 + mov 0+8(%rsp),$num # restore $num + adox $A0[1],$A0[1] + mov 16($tptr),$A1[0] # t[2] # prefetch + mov 24($tptr),$A1[1] # t[3] # prefetch + #jmp .Lsqrx4x_shift_n_add # happens to be aligned + +.align 32 +.Lsqrx4x_shift_n_add: + mulx %rdx,%rax,%rbx + adox $A1[0],$A1[0] + adcx $A0[0],%rax + .byte 0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00 # mov 8($aptr,$i),%rdx # a[i+1] # prefetch + .byte 0x4c,0x8b,0x97,0x20,0x00,0x00,0x00 # mov 32($tptr),$A0[0] # t[2*i+4] # prefetch + adox $A1[1],$A1[1] + adcx $A0[1],%rbx + mov 40($tptr),$A0[1] # t[2*i+4+1] # prefetch + mov %rax,0($tptr) + mov %rbx,8($tptr) + + mulx %rdx,%rax,%rbx + adox $A0[0],$A0[0] + adcx $A1[0],%rax + mov 16($aptr,$i),%rdx # a[i+2] # prefetch + mov 48($tptr),$A1[0] # t[2*i+6] # prefetch + adox $A0[1],$A0[1] + adcx $A1[1],%rbx + mov 56($tptr),$A1[1] # t[2*i+6+1] # prefetch + mov %rax,16($tptr) + mov %rbx,24($tptr) + + mulx %rdx,%rax,%rbx + adox $A1[0],$A1[0] + adcx $A0[0],%rax + mov 24($aptr,$i),%rdx # a[i+3] # prefetch + lea 32($i),$i + mov 64($tptr),$A0[0] # t[2*i+8] # prefetch + adox $A1[1],$A1[1] + adcx $A0[1],%rbx + mov 72($tptr),$A0[1] # t[2*i+8+1] # prefetch + mov %rax,32($tptr) + mov %rbx,40($tptr) + + mulx %rdx,%rax,%rbx + adox $A0[0],$A0[0] + adcx $A1[0],%rax + jrcxz .Lsqrx4x_shift_n_add_break + .byte 0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00 # mov 0($aptr,$i),%rdx # a[i+4] # prefetch + adox $A0[1],$A0[1] + adcx $A1[1],%rbx + mov 80($tptr),$A1[0] # t[2*i+10] # prefetch + mov 88($tptr),$A1[1] # t[2*i+10+1] # prefetch + mov %rax,48($tptr) + mov %rbx,56($tptr) + lea 64($tptr),$tptr + nop + jmp .Lsqrx4x_shift_n_add + +.align 32 +.Lsqrx4x_shift_n_add_break: + adcx $A1[1],%rbx + mov %rax,48($tptr) + mov %rbx,56($tptr) + lea 64($tptr),$tptr # end of t[] buffer +___ +} +###################################################################### +# Montgomery reduction part, "word-by-word" algorithm. 
+# +# This new path is inspired by multiple submissions from Intel, by +# Shay Gueron, Vlad Krasnov, Erdinc Ozturk, James Guilford, +# Vinodh Gopal... +{ +my ($nptr,$carry,$m0)=("%rbp","%rsi","%rdx"); + +$code.=<<___; + movq %xmm2,$nptr +__bn_sqrx8x_reduction: + xor %eax,%eax # initial top-most carry bit + mov 32+8(%rsp),%rbx # n0 + mov 48+8(%rsp),%rdx # "%r8", 8*0($tptr) + lea -8*8($nptr,$num),%rcx # end of n[] + #lea 48+8(%rsp,$num,2),$tptr # end of t[] buffer + mov %rcx, 0+8(%rsp) # save end of n[] + mov $tptr,8+8(%rsp) # save end of t[] + + lea 48+8(%rsp),$tptr # initial t[] window + jmp .Lsqrx8x_reduction_loop + +.align 32 +.Lsqrx8x_reduction_loop: + mov 8*1($tptr),%r9 + mov 8*2($tptr),%r10 + mov 8*3($tptr),%r11 + mov 8*4($tptr),%r12 + mov %rdx,%r8 + imulq %rbx,%rdx # n0*a[i] + mov 8*5($tptr),%r13 + mov 8*6($tptr),%r14 + mov 8*7($tptr),%r15 + mov %rax,24+8(%rsp) # store top-most carry bit + + lea 8*8($tptr),$tptr + xor $carry,$carry # cf=0,of=0 + mov \$-8,%rcx + jmp .Lsqrx8x_reduce + +.align 32 +.Lsqrx8x_reduce: + mov %r8, %rbx + mulx 8*0($nptr),%rax,%r8 # n[0] + adcx %rbx,%rax # discarded + adox %r9,%r8 + + mulx 8*1($nptr),%rbx,%r9 # n[1] + adcx %rbx,%r8 + adox %r10,%r9 + + mulx 8*2($nptr),%rbx,%r10 + adcx %rbx,%r9 + adox %r11,%r10 + + mulx 8*3($nptr),%rbx,%r11 + adcx %rbx,%r10 + adox %r12,%r11 + + .byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 8*4($nptr),%rbx,%r12 + mov %rdx,%rax + mov %r8,%rdx + adcx %rbx,%r11 + adox %r13,%r12 + + mulx 32+8(%rsp),%rbx,%rdx # %rdx discarded + mov %rax,%rdx + mov %rax,64+48+8(%rsp,%rcx,8) # put aside n0*a[i] + + mulx 8*5($nptr),%rax,%r13 + adcx %rax,%r12 + adox %r14,%r13 + + mulx 8*6($nptr),%rax,%r14 + adcx %rax,%r13 + adox %r15,%r14 + + mulx 8*7($nptr),%rax,%r15 + mov %rbx,%rdx + adcx %rax,%r14 + adox $carry,%r15 # $carry is 0 + adcx $carry,%r15 # cf=0 + + .byte 0x67,0x67,0x67 + inc %rcx # of=0 + jnz .Lsqrx8x_reduce + + mov $carry,%rax # xor %rax,%rax + cmp 0+8(%rsp),$nptr # end of n[]? + jae .Lsqrx8x_no_tail + + mov 48+8(%rsp),%rdx # pull n0*a[0] + add 8*0($tptr),%r8 + lea 8*8($nptr),$nptr + mov \$-8,%rcx + adcx 8*1($tptr),%r9 + adcx 8*2($tptr),%r10 + adc 8*3($tptr),%r11 + adc 8*4($tptr),%r12 + adc 8*5($tptr),%r13 + adc 8*6($tptr),%r14 + adc 8*7($tptr),%r15 + lea 8*8($tptr),$tptr + sbb %rax,%rax # top carry + + xor $carry,$carry # of=0, cf=0 + mov %rax,16+8(%rsp) + jmp .Lsqrx8x_tail + +.align 32 +.Lsqrx8x_tail: + mov %r8,%rbx + mulx 8*0($nptr),%rax,%r8 + adcx %rax,%rbx + adox %r9,%r8 + + mulx 8*1($nptr),%rax,%r9 + adcx %rax,%r8 + adox %r10,%r9 + + mulx 8*2($nptr),%rax,%r10 + adcx %rax,%r9 + adox %r11,%r10 + + mulx 8*3($nptr),%rax,%r11 + adcx %rax,%r10 + adox %r12,%r11 + + .byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 8*4($nptr),%rax,%r12 + adcx %rax,%r11 + adox %r13,%r12 + + mulx 8*5($nptr),%rax,%r13 + adcx %rax,%r12 + adox %r14,%r13 + + mulx 8*6($nptr),%rax,%r14 + adcx %rax,%r13 + adox %r15,%r14 + + mulx 8*7($nptr),%rax,%r15 + mov 72+48+8(%rsp,%rcx,8),%rdx # pull n0*a[i] + adcx %rax,%r14 + adox $carry,%r15 + mov %rbx,($tptr,%rcx,8) # save result + mov %r8,%rbx + adcx $carry,%r15 # cf=0 + + inc %rcx # of=0 + jnz .Lsqrx8x_tail + + cmp 0+8(%rsp),$nptr # end of n[]? 
+ jae .Lsqrx8x_tail_done # break out of loop + + sub 16+8(%rsp),$carry # mov 16(%rsp),%cf + mov 48+8(%rsp),%rdx # pull n0*a[0] + lea 8*8($nptr),$nptr + adc 8*0($tptr),%r8 + adc 8*1($tptr),%r9 + adc 8*2($tptr),%r10 + adc 8*3($tptr),%r11 + adc 8*4($tptr),%r12 + adc 8*5($tptr),%r13 + adc 8*6($tptr),%r14 + adc 8*7($tptr),%r15 + lea 8*8($tptr),$tptr + sbb %rax,%rax + sub \$8,%rcx # mov \$-8,%rcx + + xor $carry,$carry # of=0, cf=0 + mov %rax,16+8(%rsp) + jmp .Lsqrx8x_tail + +.align 32 +.Lsqrx8x_tail_done: + xor %rax,%rax + add 24+8(%rsp),%r8 # can this overflow? + adc \$0,%r9 + adc \$0,%r10 + adc \$0,%r11 + adc \$0,%r12 + adc \$0,%r13 + adc \$0,%r14 + adc \$0,%r15 + adc \$0,%rax + + sub 16+8(%rsp),$carry # mov 16(%rsp),%cf +.Lsqrx8x_no_tail: # %cf is 0 if jumped here + adc 8*0($tptr),%r8 + movq %xmm3,%rcx + adc 8*1($tptr),%r9 + mov 8*7($nptr),$carry + movq %xmm2,$nptr # restore $nptr + adc 8*2($tptr),%r10 + adc 8*3($tptr),%r11 + adc 8*4($tptr),%r12 + adc 8*5($tptr),%r13 + adc 8*6($tptr),%r14 + adc 8*7($tptr),%r15 + adc \$0,%rax # top-most carry + + mov 32+8(%rsp),%rbx # n0 + mov 8*8($tptr,%rcx),%rdx # modulo-scheduled "%r8" + + mov %r8,8*0($tptr) # store top 512 bits + lea 8*8($tptr),%r8 # borrow %r8 + mov %r9,8*1($tptr) + mov %r10,8*2($tptr) + mov %r11,8*3($tptr) + mov %r12,8*4($tptr) + mov %r13,8*5($tptr) + mov %r14,8*6($tptr) + mov %r15,8*7($tptr) + + lea 8*8($tptr,%rcx),$tptr # start of current t[] window + cmp 8+8(%rsp),%r8 # end of t[]? + jb .Lsqrx8x_reduction_loop + ret +.cfi_endproc +.size bn_sqrx8x_internal,.-bn_sqrx8x_internal +___ +} +############################################################## +# Post-condition, 4x unrolled +# +{ +my ($rptr,$nptr)=("%rdx","%rbp"); +$code.=<<___; +.align 32 +.type __bn_postx4x_internal,\@abi-omnipotent +__bn_postx4x_internal: +.cfi_startproc + mov 8*0($nptr),%r12 + mov %rcx,%r10 # -$num + mov %rcx,%r9 # -$num + neg %rax + sar \$3+2,%rcx + #lea 48+8(%rsp,%r9),$tptr + movq %xmm1,$rptr # restore $rptr + movq %xmm1,$aptr # prepare for back-to-back call + dec %r12 # so that after 'not' we get -n[0] + mov 8*1($nptr),%r13 + xor %r8,%r8 + mov 8*2($nptr),%r14 + mov 8*3($nptr),%r15 + jmp .Lsqrx4x_sub_entry + +.align 16 +.Lsqrx4x_sub: + mov 8*0($nptr),%r12 + mov 8*1($nptr),%r13 + mov 8*2($nptr),%r14 + mov 8*3($nptr),%r15 +.Lsqrx4x_sub_entry: + andn %rax,%r12,%r12 + lea 8*4($nptr),$nptr + andn %rax,%r13,%r13 + andn %rax,%r14,%r14 + andn %rax,%r15,%r15 + + neg %r8 # mov %r8,%cf + adc 8*0($tptr),%r12 + adc 8*1($tptr),%r13 + adc 8*2($tptr),%r14 + adc 8*3($tptr),%r15 + mov %r12,8*0($rptr) + lea 8*4($tptr),$tptr + mov %r13,8*1($rptr) + sbb %r8,%r8 # mov %cf,%r8 + mov %r14,8*2($rptr) + mov %r15,8*3($rptr) + lea 8*4($rptr),$rptr + + inc %rcx + jnz .Lsqrx4x_sub + + neg %r9 # restore $num + + ret +.cfi_endproc +.size __bn_postx4x_internal,.-__bn_postx4x_internal +___ +} +}}} +{ +my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%edx","%r8", "%r9d") : # Win64 order + ("%rdi","%esi","%rdx","%ecx"); # Unix order +my $out=$inp; +my $STRIDE=2**5*8; +my $N=$STRIDE/4; + +$code.=<<___; +.globl bn_scatter5 +.type bn_scatter5,\@abi-omnipotent +.align 16 +bn_scatter5: +.cfi_startproc + _CET_ENDBR + cmp \$0, $num + jz .Lscatter_epilogue + + # $tbl stores 32 entries, t0 through t31. Each entry has $num words. + # They are interleaved in memory as follows: + # + # t0[0] t1[0] t2[0] ... t31[0] + # t0[1] t1[1] t2[1] ... t31[1] + # ... + # t0[$num-1] t1[$num-1] t2[$num-1] ... 
t31[$num-1] + + lea ($tbl,$idx,8),$tbl +.Lscatter: + mov ($inp),%rax + lea 8($inp),$inp + mov %rax,($tbl) + lea 32*8($tbl),$tbl + sub \$1,$num + jnz .Lscatter +.Lscatter_epilogue: + ret +.cfi_endproc +.size bn_scatter5,.-bn_scatter5 + +.globl bn_gather5 +.type bn_gather5,\@abi-omnipotent +.align 32 +bn_gather5: +.cfi_startproc +.LSEH_begin_bn_gather5: # Win64 thing, but harmless in other cases + _CET_ENDBR + # I can't trust assembler to use specific encoding:-( + .byte 0x4c,0x8d,0x14,0x24 #lea (%rsp),%r10 +.cfi_def_cfa_register %r10 + .byte 0x48,0x81,0xec,0x08,0x01,0x00,0x00 #sub $0x108,%rsp + lea .Linc(%rip),%rax + and \$-16,%rsp # shouldn't be formally required + + movd $idx,%xmm5 + movdqa 0(%rax),%xmm0 # 00000001000000010000000000000000 + movdqa 16(%rax),%xmm1 # 00000002000000020000000200000002 + lea 128($tbl),%r11 # size optimization + lea 128(%rsp),%rax # size optimization + + pshufd \$0,%xmm5,%xmm5 # broadcast $idx + movdqa %xmm1,%xmm4 + movdqa %xmm1,%xmm2 +___ +######################################################################## +# Calculate masks by comparing 0..31 to $idx and save result to stack. +# +# We compute sixteen 16-byte masks and store them on the stack. Mask i is stored +# in `16*i - 128`(%rax) and contains the comparisons for idx == 2*i and +# idx == 2*i + 1 in its lower and upper halves, respectively. Mask calculations +# are scheduled in groups of four. +for($i=0;$i<$STRIDE/16;$i+=4) { +$code.=<<___; + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 # compare to 1,0 +___ +$code.=<<___ if ($i); + movdqa %xmm3,`16*($i-1)-128`(%rax) +___ +$code.=<<___; + movdqa %xmm4,%xmm3 + + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 # compare to 3,2 + movdqa %xmm0,`16*($i+0)-128`(%rax) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 # compare to 5,4 + movdqa %xmm1,`16*($i+1)-128`(%rax) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 # compare to 7,6 + movdqa %xmm2,`16*($i+2)-128`(%rax) + movdqa %xmm4,%xmm2 +___ +} +$code.=<<___; + movdqa %xmm3,`16*($i-1)-128`(%rax) + jmp .Lgather + +.align 32 +.Lgather: + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 +___ +for($i=0;$i<$STRIDE/16;$i+=4) { +# Combine the masks with the corresponding table entries to select the correct +# entry. +$code.=<<___; + movdqa `16*($i+0)-128`(%r11),%xmm0 + movdqa `16*($i+1)-128`(%r11),%xmm1 + movdqa `16*($i+2)-128`(%r11),%xmm2 + pand `16*($i+0)-128`(%rax),%xmm0 + movdqa `16*($i+3)-128`(%r11),%xmm3 + pand `16*($i+1)-128`(%rax),%xmm1 + por %xmm0,%xmm4 + pand `16*($i+2)-128`(%rax),%xmm2 + por %xmm1,%xmm5 + pand `16*($i+3)-128`(%rax),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 +___ +} +$code.=<<___; + por %xmm5,%xmm4 + lea $STRIDE(%r11),%r11 + # Combine the upper and lower halves of %xmm0. + pshufd \$0x4e,%xmm4,%xmm0 # Swap upper and lower halves. 
+ por %xmm4,%xmm0
+ movq %xmm0,($out) # m0=bp[0]
+ lea 8($out),$out
+ sub \$1,$num
+ jnz .Lgather
+
+ lea (%r10),%rsp
+.cfi_def_cfa_register %rsp
+ ret
+.LSEH_end_bn_gather5:
+.cfi_endproc
+.size bn_gather5,.-bn_gather5
+___
+}
+$code.=<<___;
+.section .rodata
+.align 64
+.Linc:
+ .long 0,0, 1,1
+ .long 2,2, 2,2
+.asciz "Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by "
+.text
+___
+
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+# CONTEXT *context,DISPATCHER_CONTEXT *disp)
+if ($win64) {
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___;
+.extern __imp_RtlVirtualUnwind
+.type mul_handler,\@abi-omnipotent
+.align 16
+mul_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ mov 8($disp),%rsi # disp->ImageBase
+ mov 56($disp),%r11 # disp->HandlerData
+
+ mov 0(%r11),%r10d # HandlerData[0]
+ lea (%rsi,%r10),%r10 # end of prologue label
+ cmp %r10,%rbx # context->Rip<end of prologue label
+ jb .Lcommon_seh_tail
+
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # beginning of body label
+ cmp %r10,%rbx # context->Rip<body label
+ jb .Lcommon_pop_regs
+
+ mov 152($context),%rax # pull context->Rsp
+
+ mov 8(%r11),%r10d # HandlerData[2]
+ lea (%rsi,%r10),%r10 # epilogue label
+ cmp %r10,%rbx # context->Rip>=epilogue label
+ jae .Lcommon_seh_tail
+
+ lea .Lmul4x_epilogue(%rip),%r10 # *ring*: hacked for deletion of _nohw
+ cmp %r10,%rbx
+ ja .Lbody_40
+
+ mov 192($context),%r10 # pull $num
+ mov 8(%rax,%r10,8),%rax # pull saved stack pointer
+
+ jmp .Lcommon_pop_regs
+
+.Lbody_40:
+ mov 40(%rax),%rax # pull saved stack pointer
+.Lcommon_pop_regs:
+ mov -8(%rax),%rbx
+ mov -16(%rax),%rbp
+ mov -24(%rax),%r12
+ mov -32(%rax),%r13
+ mov -40(%rax),%r14
+ mov -48(%rax),%r15
+ mov %rbx,144($context) # restore context->Rbx
+ mov %rbp,160($context) # restore context->Rbp
+ mov %r12,216($context) # restore context->R12
+ mov %r13,224($context) # restore context->R13
+ mov %r14,232($context) # restore context->R14
+ mov %r15,240($context) # restore context->R15
+
+.Lcommon_seh_tail:
+ mov 8(%rax),%rdi
+ mov 16(%rax),%rsi
+ mov %rax,152($context) # restore context->Rsp
+ mov %rsi,168($context) # restore context->Rsi
+ mov %rdi,176($context) # restore context->Rdi
+
+ mov 40($disp),%rdi # disp->ContextRecord
+ mov $context,%rsi # context
+ mov \$154,%ecx # sizeof(CONTEXT)
+ .long 0xa548f3fc # cld; rep movsq
+
+ mov $disp,%rsi
+ xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
+ mov 8(%rsi),%rdx # arg2, disp->ImageBase
+ mov 0(%rsi),%r8 # arg3, disp->ControlPc
+ mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
+ mov 40(%rsi),%r10 # disp->ContextRecord
+ lea 56(%rsi),%r11 # &disp->HandlerData
+ lea 24(%rsi),%r12 # &disp->EstablisherFrame
+ mov %r10,32(%rsp) # arg5
+ mov %r11,40(%rsp) # arg6
+ mov %r12,48(%rsp) # arg7
+ mov %rcx,56(%rsp) # arg8, (NULL)
+ call *__imp_RtlVirtualUnwind(%rip)
+
+ mov \$1,%eax # ExceptionContinueSearch
+ add \$64,%rsp
+ popfq
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ pop %rdi
+ pop %rsi
+ ret
+.size mul_handler,.-mul_handler
+
+.section .pdata
+.align 4
+ .rva .LSEH_begin_bn_mul4x_mont_gather5
+ .rva .LSEH_end_bn_mul4x_mont_gather5
+ .rva .LSEH_info_bn_mul4x_mont_gather5
+
+ .rva .LSEH_begin_bn_power5_nohw
+ .rva .LSEH_end_bn_power5_nohw
+ .rva .LSEH_info_bn_power5_nohw
+___
+$code.=<<___ if ($addx);
+ .rva .LSEH_begin_bn_mulx4x_mont_gather5
+ .rva .LSEH_end_bn_mulx4x_mont_gather5
+ .rva .LSEH_info_bn_mulx4x_mont_gather5
+
+ .rva .LSEH_begin_bn_powerx5
+ .rva .LSEH_end_bn_powerx5
+ .rva .LSEH_info_bn_powerx5
+___
+$code.=<<___;
+ .rva 
.LSEH_begin_bn_gather5
+ .rva .LSEH_end_bn_gather5
+ .rva .LSEH_info_bn_gather5
+
+.section .xdata
+.align 8
+.LSEH_info_bn_mul4x_mont_gather5:
+ .byte 9,0,0,0
+ .rva mul_handler
+ .rva .Lmul4x_prologue,.Lmul4x_body,.Lmul4x_epilogue # HandlerData[]
+.align 8
+.LSEH_info_bn_power5_nohw:
+ .byte 9,0,0,0
+ .rva mul_handler
+ .rva .Lpower5_prologue,.Lpower5_body,.Lpower5_epilogue # HandlerData[]
+___
+$code.=<<___ if ($addx);
+.align 8
+.LSEH_info_bn_mulx4x_mont_gather5:
+ .byte 9,0,0,0
+ .rva mul_handler
+ .rva .Lmulx4x_prologue,.Lmulx4x_body,.Lmulx4x_epilogue # HandlerData[]
+.align 8
+.LSEH_info_bn_powerx5:
+ .byte 9,0,0,0
+ .rva mul_handler
+ .rva .Lpowerx5_prologue,.Lpowerx5_body,.Lpowerx5_epilogue # HandlerData[]
+___
+$code.=<<___;
+.align 8
+.LSEH_info_bn_gather5:
+ .byte 0x01,0x0b,0x03,0x0a
+ .byte 0x0b,0x01,0x21,0x00 # sub rsp,0x108
+ .byte 0x04,0xa3,0x00,0x00 # lea r10,(rsp)
+.align 8
+___
+}
+
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+
+print $code;
+close STDOUT or die "error closing STDOUT: $!";
diff --git a/ring-0.17.14/crypto/fipsmodule/bn/internal.h b/ring-0.17.14/crypto/fipsmodule/bn/internal.h
new file mode 100644
index 0000000000..f59b5af080
--- /dev/null
+++ b/ring-0.17.14/crypto/fipsmodule/bn/internal.h
@@ -0,0 +1,154 @@
+// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved.
+// Copyright (c) 2002, Oracle and/or its affiliates. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef OPENSSL_HEADER_BN_INTERNAL_H
+#define OPENSSL_HEADER_BN_INTERNAL_H
+
+#include 
+
+#if defined(OPENSSL_X86_64) && defined(_MSC_VER) && !defined(__clang__)
+#pragma warning(push, 3)
+#include <intrin.h>
+#pragma warning(pop)
+#pragma intrinsic(_umul128)
+#endif
+
+#include "../../internal.h"
+
+typedef crypto_word_t BN_ULONG;
+
+#if defined(OPENSSL_64_BIT)
+
+#if defined(BORINGSSL_HAS_UINT128)
+// MSVC doesn't support two-word integers on 64-bit.
+#define BN_ULLONG uint128_t
+#endif
+
+#define BN_BITS2 64
+#define BN_MONT_CTX_N0_LIMBS 1
+#define BN_MONT_CTX_N0(hi, lo) TOBN(hi, lo), 0
+#define TOBN(hi, lo) ((BN_ULONG)(hi) << 32 | (lo))
+
+#elif defined(OPENSSL_32_BIT)
+
+#define BN_ULLONG uint64_t
+#define BN_BITS2 32
+// On some 32-bit platforms, Montgomery multiplication is done using 64-bit
+// arithmetic with SIMD instructions. On such platforms, |BN_MONT_CTX::n0|
+// needs to be two words long. Only certain 32-bit platforms actually make use
+// of n0[1] and a shorter R value would suffice for the others. However,
+// currently only the assembly files know which is which.
+#define BN_MONT_CTX_N0_LIMBS 2
+#define BN_MONT_CTX_N0(hi, lo) TOBN(hi, lo)
+#define TOBN(hi, lo) (lo), (hi)
+
+#else
+#error "Must define either OPENSSL_32_BIT or OPENSSL_64_BIT"
+#endif
+
+
+
+// BN_MONTGOMERY_MAX_WORDS is the maximum number of words allowed in a |BIGNUM|
+// used with Montgomery reduction. Ideally this limit would be applied to all
+// |BIGNUM|s, in |bn_wexpand|, but the exactfloat library needs to create 8 MiB
+// values for other operations. 
+// #define BN_MONTGOMERY_MAX_WORDS (8 * 1024 / sizeof(BN_ULONG)) + +// bn_mul_mont writes |ap| * |bp| mod |np| to |rp|, each |num| words +// long. Inputs and outputs are in Montgomery form. |n0| is a pointer to +// an |N0|. +// +// If at least one of |ap| or |bp| is fully reduced, |rp| will be fully reduced. +// If neither is fully-reduced, the output may not be either. +// +// This function allocates |num| words on the stack, so |num| should be at most +// |BN_MONTGOMERY_MAX_WORDS|. +// +// TODO(davidben): The x86_64 implementation expects a 32-bit input and masks +// off upper bits. The aarch64 implementation expects a 64-bit input and does +// not. |size_t| is the safer option but not strictly correct for x86_64. But +// the |BN_MONTGOMERY_MAX_WORDS| bound makes this moot. +// +// See also discussion in |ToWord| in abi_test.h for notes on smaller-than-word +// inputs. +// +// |num| must be at least 4, at least on x86. +// +// In other forks, |bn_mul_mont| returns an |int| indicating whether it +// actually did the multiplication. All our implementations always do the +// multiplication, and forcing callers to deal with the possibility of it +// failing just leads to further problems. +OPENSSL_STATIC_ASSERT(sizeof(int) == sizeof(size_t) || + (sizeof(int) == 4 && sizeof(size_t) == 8), + "int and size_t ABI mismatch"); +#if defined(OPENSSL_X86_64) +void bn_mul_mont_nohw(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, + const BN_ULONG *np, const BN_ULONG *n0, size_t num); +static inline void bn_mul_mont_small( + BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, + const BN_ULONG *np, const BN_ULONG *n0, size_t num) { + bn_mul_mont_nohw(rp, ap, bp, np, n0, num); +} +#elif defined(OPENSSL_AARCH64) +void bn_mul_mont_nohw(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, + const BN_ULONG *np, const BN_ULONG *n0, size_t num); +static inline void bn_mul_mont_small( + BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, + const BN_ULONG *np, const BN_ULONG *n0, size_t num) { + // No point in optimizing for P-256 because P-256 doesn't call into + // this on AArch64. + bn_mul_mont_nohw(rp, ap, bp, np, n0, num); +} +#elif defined(OPENSSL_ARM) +void bn_mul8x_mont_neon(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, + const BN_ULONG *np, const BN_ULONG *n0, size_t num); +void bn_mul_mont_nohw(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, + const BN_ULONG *np, const BN_ULONG *n0, size_t num); +static inline void bn_mul_mont_small( + BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, + const BN_ULONG *np, const BN_ULONG *n0, size_t num) { + // Approximate what `bn_mul_mont` did so that the NEON version for P-256 + // when practical. + if (num == 8) { + // XXX: This should not be accessing `neon_available` directly. 
+ if (neon_available) { + bn_mul8x_mont_neon(rp, ap, bp, np, n0, num); + return; + } + } + bn_mul_mont_nohw(rp, ap, bp, np, n0, num); +} +#else +void bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, + const BN_ULONG *np, const BN_ULONG *n0, size_t num); +static inline void bn_mul_mont_small( + BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, + const BN_ULONG *np, const BN_ULONG *n0, size_t num) { + bn_mul_mont(rp, ap, bp, np, n0, num); +} +#endif + +static inline void bn_umult_lohi(BN_ULONG *low_out, BN_ULONG *high_out, + BN_ULONG a, BN_ULONG b) { +#if defined(OPENSSL_X86_64) && defined(_MSC_VER) && !defined(__clang__) + *low_out = _umul128(a, b, high_out); +#else + BN_ULLONG result = (BN_ULLONG)a * b; + *low_out = (BN_ULONG)result; + *high_out = (BN_ULONG)(result >> BN_BITS2); +#endif +} + +#endif // OPENSSL_HEADER_BN_INTERNAL_H diff --git a/ring-0.17.14/crypto/fipsmodule/bn/montgomery.c b/ring-0.17.14/crypto/fipsmodule/bn/montgomery.c new file mode 100644 index 0000000000..07e757d32e --- /dev/null +++ b/ring-0.17.14/crypto/fipsmodule/bn/montgomery.c @@ -0,0 +1,64 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "internal.h" +#include "../../internal.h" + +#include "../../limbs/limbs.h" +#include "../../limbs/limbs.inl" + +OPENSSL_STATIC_ASSERT(BN_MONT_CTX_N0_LIMBS == 1 || BN_MONT_CTX_N0_LIMBS == 2, + "BN_MONT_CTX_N0_LIMBS value is invalid"); +OPENSSL_STATIC_ASSERT( + sizeof(BN_ULONG) * BN_MONT_CTX_N0_LIMBS == sizeof(uint64_t), + "uint64_t is insufficient precision for n0"); + +int bn_from_montgomery_in_place(BN_ULONG r[], size_t num_r, BN_ULONG a[], + size_t num_a, const BN_ULONG n[], + size_t num_n, + const BN_ULONG n0_[BN_MONT_CTX_N0_LIMBS]) { + if (num_n == 0 || num_r != num_n || num_a != 2 * num_n) { + return 0; + } + + // Add multiples of |n| to |r| until R = 2^(nl * BN_BITS2) divides it. On + // input, we had |r| < |n| * R, so now |r| < 2 * |n| * R. Note that |r| + // includes |carry| which is stored separately. + BN_ULONG n0 = n0_[0]; + BN_ULONG carry = 0; + for (size_t i = 0; i < num_n; i++) { + BN_ULONG v = limbs_mul_add_limb(a + i, n, a[i] * n0, num_n); + v += carry + a[i + num_n]; + carry |= (v != a[i + num_n]); + carry &= (v <= a[i + num_n]); + a[i + num_n] = v; + } + + // Shift |num_n| words to divide by R. We have |a| < 2 * |n|. Note that |a| + // includes |carry| which is stored separately. + a += num_n; + + // |a| thus requires at most one additional subtraction |n| to be reduced. + // Subtract |n| and select the answer in constant time. + BN_ULONG v = limbs_sub(r, a, n, num_n) - carry; + // |v| is one if |a| - |n| underflowed or zero if it did not. Note |v| cannot + // be -1. That would imply the subtraction did not fit in |num_n| words, and + // we know at most one subtraction is needed. 
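+  // Stretch the 0/1 value in |v| into an all-zeros or all-ones mask so the
+  // constant-time select below can consume it.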
+ v = 0u - v; + for (size_t i = 0; i < num_n; i++) { + r[i] = constant_time_select_w(v, a[i], r[i]); + a[i] = 0; + } + return 1; +} diff --git a/ring-0.17.14/crypto/fipsmodule/bn/montgomery_inv.c b/ring-0.17.14/crypto/fipsmodule/bn/montgomery_inv.c new file mode 100644 index 0000000000..070cdc1c35 --- /dev/null +++ b/ring-0.17.14/crypto/fipsmodule/bn/montgomery_inv.c @@ -0,0 +1,105 @@ +/* Copyright 2016 Brian Smith. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ + +#include "internal.h" +#include "../../internal.h" + + +OPENSSL_STATIC_ASSERT(BN_MONT_CTX_N0_LIMBS == 1 || BN_MONT_CTX_N0_LIMBS == 2, + "BN_MONT_CTX_N0_LIMBS value is invalid"); +OPENSSL_STATIC_ASSERT(sizeof(BN_ULONG) * BN_MONT_CTX_N0_LIMBS == sizeof(uint64_t), + "uint64_t is insufficient precision for n0"); + +// LG_LITTLE_R is log_2(r). +#define LG_LITTLE_R (BN_MONT_CTX_N0_LIMBS * BN_BITS2) + +// bn_neg_inv_r_mod_n_u64 calculates the -1/n mod r; i.e. it calculates |v| +// such that u*r - v*n == 1. |r| is the constant defined in |bn_mont_n0|. |n| +// must be odd. +// +// This is derived from |xbinGCD| in Henry S. Warren, Jr.'s "Montgomery +// Multiplication" (http://www.hackersdelight.org/MontgomeryMultiplication.pdf). +// It is very similar to the MODULAR-INVERSE function in Stephen R. Dussé's and +// Burton S. Kaliski Jr.'s "A Cryptographic Library for the Motorola DSP56000" +// (http://link.springer.com/chapter/10.1007%2F3-540-46877-3_21). +// +// This is inspired by Joppe W. Bos's "Constant Time Modular Inversion" +// (http://www.joppebos.com/files/CTInversion.pdf) so that the inversion is +// constant-time with respect to |n|. We assume uint64_t additions, +// subtractions, shifts, and bitwise operations are all constant time, which +// may be a large leap of faith on 32-bit targets. We avoid division and +// multiplication, which tend to be the most problematic in terms of timing +// leaks. +// +// Most GCD implementations return values such that |u*r + v*n == 1|, so the +// caller would have to negate the resultant |v| for the purpose of Montgomery +// multiplication. This implementation does the negation implicitly by doing +// the computations as a difference instead of a sum. +uint64_t bn_neg_inv_mod_r_u64(uint64_t n) { + dev_assert_secret(n % 2 == 1); + + // alpha == 2**(lg r - 1) == r / 2. + static const uint64_t alpha = UINT64_C(1) << (LG_LITTLE_R - 1); + + const uint64_t beta = n; + + uint64_t u = 1; + uint64_t v = 0; + + // The invariant maintained from here on is: + // 2**(lg r - i) == u*2*alpha - v*beta. + for (size_t i = 0; i < LG_LITTLE_R; ++i) { +#if BN_BITS2 == 64 && defined(BN_ULLONG) + dev_assert_secret((BN_ULLONG)(1) << (LG_LITTLE_R - i) == + ((BN_ULLONG)u * 2 * alpha) - ((BN_ULLONG)v * beta)); +#endif + + // Delete a common factor of 2 in u and v if |u| is even. 
Otherwise, set + // |u = (u + beta) / 2| and |v = (v / 2) + alpha|. + + uint64_t u_is_odd = UINT64_C(0) - (u & 1); // Either 0xff..ff or 0. + + // The addition can overflow, so use Dietz's method for it. + // + // Dietz calculates (x+y)/2 by (x xor y)>>1 + x&y. This is valid for all + // (unsigned) x and y, even when x+y overflows. Evidence for 32-bit values + // (embedded in 64 bits to so that overflow can be ignored): + // + // (declare-fun x () (_ BitVec 64)) + // (declare-fun y () (_ BitVec 64)) + // (assert (let ( + // (one (_ bv1 64)) + // (thirtyTwo (_ bv32 64))) + // (and + // (bvult x (bvshl one thirtyTwo)) + // (bvult y (bvshl one thirtyTwo)) + // (not (= + // (bvadd (bvlshr (bvxor x y) one) (bvand x y)) + // (bvlshr (bvadd x y) one))) + // ))) + // (check-sat) + uint64_t beta_if_u_is_odd = beta & u_is_odd; // Either |beta| or 0. + u = ((u ^ beta_if_u_is_odd) >> 1) + (u & beta_if_u_is_odd); + + uint64_t alpha_if_u_is_odd = alpha & u_is_odd; /* Either |alpha| or 0. */ + v = (v >> 1) + alpha_if_u_is_odd; + } + + // The invariant now shows that u*r - v*n == 1 since r == 2 * alpha. +#if BN_BITS2 == 64 && defined(BN_ULLONG) + declassify_assert(1 == ((BN_ULLONG)u * 2 * alpha) - ((BN_ULLONG)v * beta)); +#endif + + return v; +} diff --git a/ring-0.17.14/crypto/fipsmodule/ec/asm/p256-armv8-asm.pl b/ring-0.17.14/crypto/fipsmodule/ec/asm/p256-armv8-asm.pl new file mode 100644 index 0000000000..720b3b489c --- /dev/null +++ b/ring-0.17.14/crypto/fipsmodule/ec/asm/p256-armv8-asm.pl @@ -0,0 +1,1567 @@ +#! /usr/bin/env perl +# Copyright 2015-2020 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. +# ==================================================================== +# +# ECP_NISTZ256 module for ARMv8. +# +# February 2015. +# +# Original ECP_NISTZ256 submission targeting x86_64 is detailed in +# http://eprint.iacr.org/2013/816. +# +# with/without -DECP_NISTZ256_ASM +# Apple A7 +190-360% +# Cortex-A53 +190-400% +# Cortex-A57 +190-350% +# Denver +230-400% +# +# Ranges denote minimum and maximum improvement coefficients depending +# on benchmark. Lower coefficients are for ECDSA sign, server-side +# operation. Keep in mind that +400% means 5x improvement. + +# The first two arguments should always be the flavour and output file path. +if ($#ARGV < 1) { die "Not enough arguments provided. 
+ Two arguments are necessary: the flavour and the output file path."; } + +$flavour = shift; +$output = shift; + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or +die "can't locate arm-xlate.pl"; + +open OUT,"| \"$^X\" $xlate $flavour $output"; +*STDOUT=*OUT; + +{ +my ($rp,$ap,$bp,$bi,$a0,$a1,$a2,$a3,$t0,$t1,$t2,$t3,$poly1,$poly3, + $acc0,$acc1,$acc2,$acc3,$acc4,$acc5) = + map("x$_",(0..17,19,20)); + +my ($acc6,$acc7)=($ap,$bp); # used in __ecp_nistz256_sqr_mont + +$code.=<<___; +.section .rodata +.align 5 +.Lpoly: +.quad 0xffffffffffffffff,0x00000000ffffffff,0x0000000000000000,0xffffffff00000001 +.LRR: // 2^512 mod P precomputed for NIST P256 polynomial +.quad 0x0000000000000003,0xfffffffbffffffff,0xfffffffffffffffe,0x00000004fffffffd +.Lone_mont: +.quad 0x0000000000000001,0xffffffff00000000,0xffffffffffffffff,0x00000000fffffffe +.Lone: +.quad 1,0,0,0 +.Lord: +.quad 0xf3b9cac2fc632551,0xbce6faada7179e84,0xffffffffffffffff,0xffffffff00000000 +.LordK: +.quad 0xccd1c8aaee00bc4f +.asciz "ECP_NISTZ256 for ARMv8, CRYPTOGAMS by " +.text + +// void ecp_nistz256_mul_mont(BN_ULONG x0[4],const BN_ULONG x1[4], +// const BN_ULONG x2[4]); +.globl ecp_nistz256_mul_mont +.type ecp_nistz256_mul_mont,%function +.align 4 +ecp_nistz256_mul_mont: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-32]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + + ldr $bi,[$bp] // bp[0] + ldp $a0,$a1,[$ap] + ldp $a2,$a3,[$ap,#16] + adrp $poly3,:pg_hi21:.Lpoly + add $poly3,$poly3,:lo12:.Lpoly + ldr $poly1,[$poly3,#8] + ldr $poly3,[$poly3,#24] + + bl __ecp_nistz256_mul_mont + + ldp x19,x20,[sp,#16] + ldp x29,x30,[sp],#32 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont + +// void ecp_nistz256_sqr_mont(BN_ULONG x0[4],const BN_ULONG x1[4]); +.globl ecp_nistz256_sqr_mont +.type ecp_nistz256_sqr_mont,%function +.align 4 +ecp_nistz256_sqr_mont: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-32]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + + ldp $a0,$a1,[$ap] + ldp $a2,$a3,[$ap,#16] + adrp $poly3,:pg_hi21:.Lpoly + add $poly3,$poly3,:lo12:.Lpoly + ldr $poly1,[$poly3,#8] + ldr $poly3,[$poly3,#24] + + bl __ecp_nistz256_sqr_mont + + ldp x19,x20,[sp,#16] + ldp x29,x30,[sp],#32 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont + +// void ecp_nistz256_neg(BN_ULONG x0[4],const BN_ULONG x1[4]); +.globl ecp_nistz256_neg +.type ecp_nistz256_neg,%function +.align 4 +ecp_nistz256_neg: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 + + mov $bp,$ap + mov $acc0,xzr // a = 0 + mov $acc1,xzr + mov $acc2,xzr + mov $acc3,xzr + adrp $poly3,:pg_hi21:.Lpoly + add $poly3,$poly3,:lo12:.Lpoly + ldr $poly1,[$poly3,#8] + ldr $poly3,[$poly3,#24] + + bl __ecp_nistz256_sub_from + + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size ecp_nistz256_neg,.-ecp_nistz256_neg + +// note that __ecp_nistz256_mul_mont expects a[0-3] input pre-loaded +// to $a0-$a3 and b[0] - to $bi +.type __ecp_nistz256_mul_mont,%function +.align 4 +__ecp_nistz256_mul_mont: + mul $acc0,$a0,$bi // a[0]*b[0] + umulh $t0,$a0,$bi + + mul $acc1,$a1,$bi // a[1]*b[0] + umulh $t1,$a1,$bi + + mul $acc2,$a2,$bi // a[2]*b[0] + umulh $t2,$a2,$bi + + mul $acc3,$a3,$bi // a[3]*b[0] + umulh $t3,$a3,$bi + ldr $bi,[$bp,#8] // b[1] + + adds $acc1,$acc1,$t0 // accumulate high parts of multiplication + lsl $t0,$acc0,#32 + adcs $acc2,$acc2,$t1 + lsr $t1,$acc0,#32 + adcs $acc3,$acc3,$t2 + adc $acc4,xzr,$t3 + mov $acc5,xzr +___ +for($i=1;$i<4;$i++) { + # Reduction iteration is normally performed by accumulating + # result of multiplication of modulus by "magic" digit [and + # omitting least significant word, which is guaranteed to + # be 0], but thanks to special form of modulus and "magic" + # digit being equal to least significant word, it can be + # performed with additions and subtractions alone. Indeed: + # + # ffff0001.00000000.0000ffff.ffffffff + # * abcdefgh + # + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh + # + # Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we + # rewrite above as: + # + # xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh + # + abcdefgh.abcdefgh.0000abcd.efgh0000.00000000 + # - 0000abcd.efgh0000.00000000.00000000.abcdefgh + # + # or marking redundant operations: + # + # xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.-------- + # + abcdefgh.abcdefgh.0000abcd.efgh0000.-------- + # - 0000abcd.efgh0000.--------.--------.-------- + +$code.=<<___; + subs $t2,$acc0,$t0 // "*0xffff0001" + sbc $t3,$acc0,$t1 + adds $acc0,$acc1,$t0 // +=acc[0]<<96 and omit acc[0] + mul $t0,$a0,$bi // lo(a[0]*b[i]) + adcs $acc1,$acc2,$t1 + mul $t1,$a1,$bi // lo(a[1]*b[i]) + adcs $acc2,$acc3,$t2 // +=acc[0]*0xffff0001 + mul $t2,$a2,$bi // lo(a[2]*b[i]) + adcs $acc3,$acc4,$t3 + mul $t3,$a3,$bi // lo(a[3]*b[i]) + adc $acc4,$acc5,xzr + + adds $acc0,$acc0,$t0 // accumulate low parts of multiplication + umulh $t0,$a0,$bi // hi(a[0]*b[i]) + adcs $acc1,$acc1,$t1 + umulh $t1,$a1,$bi // hi(a[1]*b[i]) + adcs $acc2,$acc2,$t2 + umulh $t2,$a2,$bi // hi(a[2]*b[i]) + adcs $acc3,$acc3,$t3 + umulh $t3,$a3,$bi // hi(a[3]*b[i]) + adc $acc4,$acc4,xzr +___ +$code.=<<___ if ($i<3); + ldr $bi,[$bp,#8*($i+1)] // b[$i+1] +___ +$code.=<<___; + adds $acc1,$acc1,$t0 // accumulate high parts of multiplication + lsl $t0,$acc0,#32 + adcs $acc2,$acc2,$t1 + lsr $t1,$acc0,#32 + adcs $acc3,$acc3,$t2 + adcs $acc4,$acc4,$t3 + adc $acc5,xzr,xzr +___ +} +$code.=<<___; + // last reduction + subs $t2,$acc0,$t0 // "*0xffff0001" + sbc $t3,$acc0,$t1 + adds $acc0,$acc1,$t0 // +=acc[0]<<96 and omit acc[0] + adcs $acc1,$acc2,$t1 + adcs $acc2,$acc3,$t2 // +=acc[0]*0xffff0001 + adcs $acc3,$acc4,$t3 + adc $acc4,$acc5,xzr + + adds $t0,$acc0,#1 // subs $t0,$acc0,#-1 // tmp = ret-modulus + sbcs $t1,$acc1,$poly1 + sbcs $t2,$acc2,xzr + sbcs $t3,$acc3,$poly3 + sbcs xzr,$acc4,xzr // did it borrow? + + csel $acc0,$acc0,$t0,lo // ret = borrow ? 
ret : ret-modulus + csel $acc1,$acc1,$t1,lo + csel $acc2,$acc2,$t2,lo + stp $acc0,$acc1,[$rp] + csel $acc3,$acc3,$t3,lo + stp $acc2,$acc3,[$rp,#16] + + ret +.size __ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont + +// note that __ecp_nistz256_sqr_mont expects a[0-3] input pre-loaded +// to $a0-$a3 +.type __ecp_nistz256_sqr_mont,%function +.align 4 +__ecp_nistz256_sqr_mont: + // | | | | | |a1*a0| | + // | | | | |a2*a0| | | + // | |a3*a2|a3*a0| | | | + // | | | |a2*a1| | | | + // | | |a3*a1| | | | | + // *| | | | | | | | 2| + // +|a3*a3|a2*a2|a1*a1|a0*a0| + // |--+--+--+--+--+--+--+--| + // |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx + // + // "can't overflow" below mark carrying into high part of + // multiplication result, which can't overflow, because it + // can never be all ones. + + mul $acc1,$a1,$a0 // a[1]*a[0] + umulh $t1,$a1,$a0 + mul $acc2,$a2,$a0 // a[2]*a[0] + umulh $t2,$a2,$a0 + mul $acc3,$a3,$a0 // a[3]*a[0] + umulh $acc4,$a3,$a0 + + adds $acc2,$acc2,$t1 // accumulate high parts of multiplication + mul $t0,$a2,$a1 // a[2]*a[1] + umulh $t1,$a2,$a1 + adcs $acc3,$acc3,$t2 + mul $t2,$a3,$a1 // a[3]*a[1] + umulh $t3,$a3,$a1 + adc $acc4,$acc4,xzr // can't overflow + + mul $acc5,$a3,$a2 // a[3]*a[2] + umulh $acc6,$a3,$a2 + + adds $t1,$t1,$t2 // accumulate high parts of multiplication + mul $acc0,$a0,$a0 // a[0]*a[0] + adc $t2,$t3,xzr // can't overflow + + adds $acc3,$acc3,$t0 // accumulate low parts of multiplication + umulh $a0,$a0,$a0 + adcs $acc4,$acc4,$t1 + mul $t1,$a1,$a1 // a[1]*a[1] + adcs $acc5,$acc5,$t2 + umulh $a1,$a1,$a1 + adc $acc6,$acc6,xzr // can't overflow + + adds $acc1,$acc1,$acc1 // acc[1-6]*=2 + mul $t2,$a2,$a2 // a[2]*a[2] + adcs $acc2,$acc2,$acc2 + umulh $a2,$a2,$a2 + adcs $acc3,$acc3,$acc3 + mul $t3,$a3,$a3 // a[3]*a[3] + adcs $acc4,$acc4,$acc4 + umulh $a3,$a3,$a3 + adcs $acc5,$acc5,$acc5 + adcs $acc6,$acc6,$acc6 + adc $acc7,xzr,xzr + + adds $acc1,$acc1,$a0 // +a[i]*a[i] + adcs $acc2,$acc2,$t1 + adcs $acc3,$acc3,$a1 + adcs $acc4,$acc4,$t2 + adcs $acc5,$acc5,$a2 + lsl $t0,$acc0,#32 + adcs $acc6,$acc6,$t3 + lsr $t1,$acc0,#32 + adc $acc7,$acc7,$a3 +___ +for($i=0;$i<3;$i++) { # reductions, see commentary in + # multiplication for details +$code.=<<___; + subs $t2,$acc0,$t0 // "*0xffff0001" + sbc $t3,$acc0,$t1 + adds $acc0,$acc1,$t0 // +=acc[0]<<96 and omit acc[0] + adcs $acc1,$acc2,$t1 + lsl $t0,$acc0,#32 + adcs $acc2,$acc3,$t2 // +=acc[0]*0xffff0001 + lsr $t1,$acc0,#32 + adc $acc3,$t3,xzr // can't overflow +___ +} +$code.=<<___; + subs $t2,$acc0,$t0 // "*0xffff0001" + sbc $t3,$acc0,$t1 + adds $acc0,$acc1,$t0 // +=acc[0]<<96 and omit acc[0] + adcs $acc1,$acc2,$t1 + adcs $acc2,$acc3,$t2 // +=acc[0]*0xffff0001 + adc $acc3,$t3,xzr // can't overflow + + adds $acc0,$acc0,$acc4 // accumulate upper half + adcs $acc1,$acc1,$acc5 + adcs $acc2,$acc2,$acc6 + adcs $acc3,$acc3,$acc7 + adc $acc4,xzr,xzr + + adds $t0,$acc0,#1 // subs $t0,$acc0,#-1 // tmp = ret-modulus + sbcs $t1,$acc1,$poly1 + sbcs $t2,$acc2,xzr + sbcs $t3,$acc3,$poly3 + sbcs xzr,$acc4,xzr // did it borrow? + + csel $acc0,$acc0,$t0,lo // ret = borrow ? ret : ret-modulus + csel $acc1,$acc1,$t1,lo + csel $acc2,$acc2,$t2,lo + stp $acc0,$acc1,[$rp] + csel $acc3,$acc3,$t3,lo + stp $acc2,$acc3,[$rp,#16] + + ret +.size __ecp_nistz256_sqr_mont,.-__ecp_nistz256_sqr_mont + +// Note that __ecp_nistz256_add_to expects both input vectors pre-loaded to +// $a0-$a3 and $t0-$t3. This is done because it's used in multiple +// contexts, e.g. in multiplication by 2 and 3... 
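+// The reduction below is branch-free: after the add, a trial subtraction of
+// the modulus is computed and csel picks the reduced or unreduced result
+// based on the final borrow.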
+.type __ecp_nistz256_add_to,%function +.align 4 +__ecp_nistz256_add_to: + adds $acc0,$acc0,$t0 // ret = a+b + adcs $acc1,$acc1,$t1 + adcs $acc2,$acc2,$t2 + adcs $acc3,$acc3,$t3 + adc $ap,xzr,xzr // zap $ap + + adds $t0,$acc0,#1 // subs $t0,$a0,#-1 // tmp = ret-modulus + sbcs $t1,$acc1,$poly1 + sbcs $t2,$acc2,xzr + sbcs $t3,$acc3,$poly3 + sbcs xzr,$ap,xzr // did subtraction borrow? + + csel $acc0,$acc0,$t0,lo // ret = borrow ? ret : ret-modulus + csel $acc1,$acc1,$t1,lo + csel $acc2,$acc2,$t2,lo + stp $acc0,$acc1,[$rp] + csel $acc3,$acc3,$t3,lo + stp $acc2,$acc3,[$rp,#16] + + ret +.size __ecp_nistz256_add_to,.-__ecp_nistz256_add_to + +.type __ecp_nistz256_sub_from,%function +.align 4 +__ecp_nistz256_sub_from: + ldp $t0,$t1,[$bp] + ldp $t2,$t3,[$bp,#16] + subs $acc0,$acc0,$t0 // ret = a-b + sbcs $acc1,$acc1,$t1 + sbcs $acc2,$acc2,$t2 + sbcs $acc3,$acc3,$t3 + sbc $ap,xzr,xzr // zap $ap + + subs $t0,$acc0,#1 // adds $t0,$a0,#-1 // tmp = ret+modulus + adcs $t1,$acc1,$poly1 + adcs $t2,$acc2,xzr + adc $t3,$acc3,$poly3 + cmp $ap,xzr // did subtraction borrow? + + csel $acc0,$acc0,$t0,eq // ret = borrow ? ret+modulus : ret + csel $acc1,$acc1,$t1,eq + csel $acc2,$acc2,$t2,eq + stp $acc0,$acc1,[$rp] + csel $acc3,$acc3,$t3,eq + stp $acc2,$acc3,[$rp,#16] + + ret +.size __ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from + +.type __ecp_nistz256_sub_morf,%function +.align 4 +__ecp_nistz256_sub_morf: + ldp $t0,$t1,[$bp] + ldp $t2,$t3,[$bp,#16] + subs $acc0,$t0,$acc0 // ret = b-a + sbcs $acc1,$t1,$acc1 + sbcs $acc2,$t2,$acc2 + sbcs $acc3,$t3,$acc3 + sbc $ap,xzr,xzr // zap $ap + + subs $t0,$acc0,#1 // adds $t0,$a0,#-1 // tmp = ret+modulus + adcs $t1,$acc1,$poly1 + adcs $t2,$acc2,xzr + adc $t3,$acc3,$poly3 + cmp $ap,xzr // did subtraction borrow? + + csel $acc0,$acc0,$t0,eq // ret = borrow ? ret+modulus : ret + csel $acc1,$acc1,$t1,eq + csel $acc2,$acc2,$t2,eq + stp $acc0,$acc1,[$rp] + csel $acc3,$acc3,$t3,eq + stp $acc2,$acc3,[$rp,#16] + + ret +.size __ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf + +.type __ecp_nistz256_div_by_2,%function +.align 4 +__ecp_nistz256_div_by_2: + subs $t0,$acc0,#1 // adds $t0,$a0,#-1 // tmp = a+modulus + adcs $t1,$acc1,$poly1 + adcs $t2,$acc2,xzr + adcs $t3,$acc3,$poly3 + adc $ap,xzr,xzr // zap $ap + tst $acc0,#1 // is a even? + + csel $acc0,$acc0,$t0,eq // ret = even ? a : a+modulus + csel $acc1,$acc1,$t1,eq + csel $acc2,$acc2,$t2,eq + csel $acc3,$acc3,$t3,eq + csel $ap,xzr,$ap,eq + + lsr $acc0,$acc0,#1 // ret >>= 1 + orr $acc0,$acc0,$acc1,lsl#63 + lsr $acc1,$acc1,#1 + orr $acc1,$acc1,$acc2,lsl#63 + lsr $acc2,$acc2,#1 + orr $acc2,$acc2,$acc3,lsl#63 + lsr $acc3,$acc3,#1 + stp $acc0,$acc1,[$rp] + orr $acc3,$acc3,$ap,lsl#63 + stp $acc2,$acc3,[$rp,#16] + + ret +.size __ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2 +___ +######################################################################## +# following subroutines are "literal" implementation of those found in +# ecp_nistz256.c +# +######################################################################## +# void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp); +# +{ +my ($S,$M,$Zsqr,$tmp0)=map(32*$_,(0..3)); +# above map() describes stack layout with 4 temporary +# 256-bit vectors on top. +my ($rp_real,$ap_real) = map("x$_",(21,22)); + +$code.=<<___; +.globl ecp_nistz256_point_double +.type ecp_nistz256_point_double,%function +.align 5 +ecp_nistz256_point_double: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-96]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + sub sp,sp,#32*4 + +.Ldouble_shortcut: + ldp $acc0,$acc1,[$ap,#32] + mov $rp_real,$rp + ldp $acc2,$acc3,[$ap,#48] + mov $ap_real,$ap + adrp $poly3,:pg_hi21:.Lpoly + add $poly3,$poly3,:lo12:.Lpoly + ldr $poly1,[$poly3,#8] + mov $t0,$acc0 + ldr $poly3,[$poly3,#24] + mov $t1,$acc1 + ldp $a0,$a1,[$ap_real,#64] // forward load for p256_sqr_mont + mov $t2,$acc2 + mov $t3,$acc3 + ldp $a2,$a3,[$ap_real,#64+16] + add $rp,sp,#$S + bl __ecp_nistz256_add_to // p256_mul_by_2(S, in_y); + + add $rp,sp,#$Zsqr + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Zsqr, in_z); + + ldp $t0,$t1,[$ap_real] + ldp $t2,$t3,[$ap_real,#16] + mov $a0,$acc0 // put Zsqr aside for p256_sub + mov $a1,$acc1 + mov $a2,$acc2 + mov $a3,$acc3 + add $rp,sp,#$M + bl __ecp_nistz256_add_to // p256_add(M, Zsqr, in_x); + + add $bp,$ap_real,#0 + mov $acc0,$a0 // restore Zsqr + mov $acc1,$a1 + ldp $a0,$a1,[sp,#$S] // forward load for p256_sqr_mont + mov $acc2,$a2 + mov $acc3,$a3 + ldp $a2,$a3,[sp,#$S+16] + add $rp,sp,#$Zsqr + bl __ecp_nistz256_sub_morf // p256_sub(Zsqr, in_x, Zsqr); + + add $rp,sp,#$S + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(S, S); + + ldr $bi,[$ap_real,#32] + ldp $a0,$a1,[$ap_real,#64] + ldp $a2,$a3,[$ap_real,#64+16] + add $bp,$ap_real,#32 + add $rp,sp,#$tmp0 + bl __ecp_nistz256_mul_mont // p256_mul_mont(tmp0, in_z, in_y); + + mov $t0,$acc0 + mov $t1,$acc1 + ldp $a0,$a1,[sp,#$S] // forward load for p256_sqr_mont + mov $t2,$acc2 + mov $t3,$acc3 + ldp $a2,$a3,[sp,#$S+16] + add $rp,$rp_real,#64 + bl __ecp_nistz256_add_to // p256_mul_by_2(res_z, tmp0); + + add $rp,sp,#$tmp0 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(tmp0, S); + + ldr $bi,[sp,#$Zsqr] // forward load for p256_mul_mont + ldp $a0,$a1,[sp,#$M] + ldp $a2,$a3,[sp,#$M+16] + add $rp,$rp_real,#32 + bl __ecp_nistz256_div_by_2 // p256_div_by_2(res_y, tmp0); + + add $bp,sp,#$Zsqr + add $rp,sp,#$M + bl __ecp_nistz256_mul_mont // p256_mul_mont(M, M, Zsqr); + + mov $t0,$acc0 // duplicate M + mov $t1,$acc1 + mov $t2,$acc2 + mov $t3,$acc3 + mov $a0,$acc0 // put M aside + mov $a1,$acc1 + mov $a2,$acc2 + mov $a3,$acc3 + add $rp,sp,#$M + bl __ecp_nistz256_add_to + mov $t0,$a0 // restore M + mov $t1,$a1 + ldr $bi,[$ap_real] // forward load for p256_mul_mont + mov $t2,$a2 + ldp $a0,$a1,[sp,#$S] + mov $t3,$a3 + ldp $a2,$a3,[sp,#$S+16] + bl __ecp_nistz256_add_to // p256_mul_by_3(M, M); + + add $bp,$ap_real,#0 + add $rp,sp,#$S + bl __ecp_nistz256_mul_mont // p256_mul_mont(S, S, in_x); + + mov $t0,$acc0 + mov $t1,$acc1 + ldp $a0,$a1,[sp,#$M] // forward load for p256_sqr_mont + mov $t2,$acc2 + mov $t3,$acc3 + ldp $a2,$a3,[sp,#$M+16] + add $rp,sp,#$tmp0 + bl __ecp_nistz256_add_to // p256_mul_by_2(tmp0, S); + + add $rp,$rp_real,#0 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(res_x, M); + + add $bp,sp,#$tmp0 + bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, tmp0); + + add $bp,sp,#$S + add $rp,sp,#$S + bl __ecp_nistz256_sub_morf // p256_sub(S, S, res_x); + + ldr $bi,[sp,#$M] + mov $a0,$acc0 // copy S + mov $a1,$acc1 + mov $a2,$acc2 + mov $a3,$acc3 + add $bp,sp,#$M + bl __ecp_nistz256_mul_mont // p256_mul_mont(S, S, M); + + add $bp,$rp_real,#32 + add $rp,$rp_real,#32 + bl __ecp_nistz256_sub_from // p256_sub(res_y, S, res_y); + + add sp,x29,#0 // destroy frame + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x29,x30,[sp],#96 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size ecp_nistz256_point_double,.-ecp_nistz256_point_double +___ +} + +######################################################################## +# void 
ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1, +# const P256_POINT *in2); +{ +my ($res_x,$res_y,$res_z, + $H,$Hsqr,$R,$Rsqr,$Hcub, + $U1,$U2,$S1,$S2)=map(32*$_,(0..11)); +my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr); +# above map() describes stack layout with 12 temporary +# 256-bit vectors on top. +my ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp0,$temp1,$temp2)=map("x$_",(21..28)); + +$code.=<<___; +.globl ecp_nistz256_point_add +.type ecp_nistz256_point_add,%function +.align 5 +ecp_nistz256_point_add: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-96]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#32*12 + + ldp $a0,$a1,[$bp,#64] // in2_z + ldp $a2,$a3,[$bp,#64+16] + mov $rp_real,$rp + mov $ap_real,$ap + mov $bp_real,$bp + adrp $poly3,:pg_hi21:.Lpoly + add $poly3,$poly3,:lo12:.Lpoly + ldr $poly1,[$poly3,#8] + ldr $poly3,[$poly3,#24] + orr $t0,$a0,$a1 + orr $t2,$a2,$a3 + orr $in2infty,$t0,$t2 + cmp $in2infty,#0 + csetm $in2infty,ne // ~in2infty + add $rp,sp,#$Z2sqr + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z2sqr, in2_z); + + ldp $a0,$a1,[$ap_real,#64] // in1_z + ldp $a2,$a3,[$ap_real,#64+16] + orr $t0,$a0,$a1 + orr $t2,$a2,$a3 + orr $in1infty,$t0,$t2 + cmp $in1infty,#0 + csetm $in1infty,ne // ~in1infty + add $rp,sp,#$Z1sqr + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z1sqr, in1_z); + + ldr $bi,[$bp_real,#64] + ldp $a0,$a1,[sp,#$Z2sqr] + ldp $a2,$a3,[sp,#$Z2sqr+16] + add $bp,$bp_real,#64 + add $rp,sp,#$S1 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S1, Z2sqr, in2_z); + + ldr $bi,[$ap_real,#64] + ldp $a0,$a1,[sp,#$Z1sqr] + ldp $a2,$a3,[sp,#$Z1sqr+16] + add $bp,$ap_real,#64 + add $rp,sp,#$S2 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, Z1sqr, in1_z); + + ldr $bi,[$ap_real,#32] + ldp $a0,$a1,[sp,#$S1] + ldp $a2,$a3,[sp,#$S1+16] + add $bp,$ap_real,#32 + add $rp,sp,#$S1 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S1, S1, in1_y); + + ldr $bi,[$bp_real,#32] + ldp $a0,$a1,[sp,#$S2] + ldp $a2,$a3,[sp,#$S2+16] + add $bp,$bp_real,#32 + add $rp,sp,#$S2 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S2, in2_y); + + add $bp,sp,#$S1 + ldr $bi,[sp,#$Z2sqr] // forward load for p256_mul_mont + ldp $a0,$a1,[$ap_real] + ldp $a2,$a3,[$ap_real,#16] + add $rp,sp,#$R + bl __ecp_nistz256_sub_from // p256_sub(R, S2, S1); + + orr $acc0,$acc0,$acc1 // see if result is zero + orr $acc2,$acc2,$acc3 + orr $temp0,$acc0,$acc2 // ~is_equal(S1,S2) + + add $bp,sp,#$Z2sqr + add $rp,sp,#$U1 + bl __ecp_nistz256_mul_mont // p256_mul_mont(U1, in1_x, Z2sqr); + + ldr $bi,[sp,#$Z1sqr] + ldp $a0,$a1,[$bp_real] + ldp $a2,$a3,[$bp_real,#16] + add $bp,sp,#$Z1sqr + add $rp,sp,#$U2 + bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, in2_x, Z1sqr); + + add $bp,sp,#$U1 + ldp $a0,$a1,[sp,#$R] // forward load for p256_sqr_mont + ldp $a2,$a3,[sp,#$R+16] + add $rp,sp,#$H + bl __ecp_nistz256_sub_from // p256_sub(H, U2, U1); + + orr $acc0,$acc0,$acc1 // see if result is zero + orr $acc2,$acc2,$acc3 + orr $acc0,$acc0,$acc2 // ~is_equal(U1,U2) + + mvn $temp1,$in1infty // -1/0 -> 0/-1 + mvn $temp2,$in2infty // -1/0 -> 0/-1 + orr $acc0,$acc0,$temp1 + orr $acc0,$acc0,$temp2 + orr $acc0,$acc0,$temp0 + cbnz $acc0,.Ladd_proceed // if(~is_equal(U1,U2) | in1infty | in2infty | ~is_equal(S1,S2)) + +.Ladd_double: + mov $ap,$ap_real + mov $rp,$rp_real + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + add sp,sp,#256 // #256 is from #32*(12-4). 
difference in stack frames + b .Ldouble_shortcut + +.align 4 +.Ladd_proceed: + add $rp,sp,#$Rsqr + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Rsqr, R); + + ldr $bi,[$ap_real,#64] + ldp $a0,$a1,[sp,#$H] + ldp $a2,$a3,[sp,#$H+16] + add $bp,$ap_real,#64 + add $rp,sp,#$res_z + bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, H, in1_z); + + ldp $a0,$a1,[sp,#$H] + ldp $a2,$a3,[sp,#$H+16] + add $rp,sp,#$Hsqr + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Hsqr, H); + + ldr $bi,[$bp_real,#64] + ldp $a0,$a1,[sp,#$res_z] + ldp $a2,$a3,[sp,#$res_z+16] + add $bp,$bp_real,#64 + add $rp,sp,#$res_z + bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, res_z, in2_z); + + ldr $bi,[sp,#$H] + ldp $a0,$a1,[sp,#$Hsqr] + ldp $a2,$a3,[sp,#$Hsqr+16] + add $bp,sp,#$H + add $rp,sp,#$Hcub + bl __ecp_nistz256_mul_mont // p256_mul_mont(Hcub, Hsqr, H); + + ldr $bi,[sp,#$Hsqr] + ldp $a0,$a1,[sp,#$U1] + ldp $a2,$a3,[sp,#$U1+16] + add $bp,sp,#$Hsqr + add $rp,sp,#$U2 + bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, U1, Hsqr); + + mov $t0,$acc0 + mov $t1,$acc1 + mov $t2,$acc2 + mov $t3,$acc3 + add $rp,sp,#$Hsqr + bl __ecp_nistz256_add_to // p256_mul_by_2(Hsqr, U2); + + add $bp,sp,#$Rsqr + add $rp,sp,#$res_x + bl __ecp_nistz256_sub_morf // p256_sub(res_x, Rsqr, Hsqr); + + add $bp,sp,#$Hcub + bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, Hcub); + + add $bp,sp,#$U2 + ldr $bi,[sp,#$Hcub] // forward load for p256_mul_mont + ldp $a0,$a1,[sp,#$S1] + ldp $a2,$a3,[sp,#$S1+16] + add $rp,sp,#$res_y + bl __ecp_nistz256_sub_morf // p256_sub(res_y, U2, res_x); + + add $bp,sp,#$Hcub + add $rp,sp,#$S2 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S1, Hcub); + + ldr $bi,[sp,#$R] + ldp $a0,$a1,[sp,#$res_y] + ldp $a2,$a3,[sp,#$res_y+16] + add $bp,sp,#$R + add $rp,sp,#$res_y + bl __ecp_nistz256_mul_mont // p256_mul_mont(res_y, res_y, R); + + add $bp,sp,#$S2 + bl __ecp_nistz256_sub_from // p256_sub(res_y, res_y, S2); + + ldp $a0,$a1,[sp,#$res_x] // res + ldp $a2,$a3,[sp,#$res_x+16] + ldp $t0,$t1,[$bp_real] // in2 + ldp $t2,$t3,[$bp_real,#16] +___ +for($i=0;$i<64;$i+=32) { # conditional moves +$code.=<<___; + ldp $acc0,$acc1,[$ap_real,#$i] // in1 + cmp $in1infty,#0 // ~$in1intfy, remember? + ldp $acc2,$acc3,[$ap_real,#$i+16] + csel $t0,$a0,$t0,ne + csel $t1,$a1,$t1,ne + ldp $a0,$a1,[sp,#$res_x+$i+32] // res + csel $t2,$a2,$t2,ne + csel $t3,$a3,$t3,ne + cmp $in2infty,#0 // ~$in2intfy, remember? + ldp $a2,$a3,[sp,#$res_x+$i+48] + csel $acc0,$t0,$acc0,ne + csel $acc1,$t1,$acc1,ne + ldp $t0,$t1,[$bp_real,#$i+32] // in2 + csel $acc2,$t2,$acc2,ne + csel $acc3,$t3,$acc3,ne + ldp $t2,$t3,[$bp_real,#$i+48] + stp $acc0,$acc1,[$rp_real,#$i] + stp $acc2,$acc3,[$rp_real,#$i+16] +___ +} +$code.=<<___; + ldp $acc0,$acc1,[$ap_real,#$i] // in1 + cmp $in1infty,#0 // ~$in1intfy, remember? + ldp $acc2,$acc3,[$ap_real,#$i+16] + csel $t0,$a0,$t0,ne + csel $t1,$a1,$t1,ne + csel $t2,$a2,$t2,ne + csel $t3,$a3,$t3,ne + cmp $in2infty,#0 // ~$in2intfy, remember? 
+ csel $acc0,$t0,$acc0,ne + csel $acc1,$t1,$acc1,ne + csel $acc2,$t2,$acc2,ne + csel $acc3,$t3,$acc3,ne + stp $acc0,$acc1,[$rp_real,#$i] + stp $acc2,$acc3,[$rp_real,#$i+16] + +.Ladd_done: + add sp,x29,#0 // destroy frame + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size ecp_nistz256_point_add,.-ecp_nistz256_point_add +___ +} + +######################################################################## +# void ecp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1, +# const P256_POINT_AFFINE *in2); +{ +my ($res_x,$res_y,$res_z, + $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(32*$_,(0..9)); +my $Z1sqr = $S2; +# above map() describes stack layout with 10 temporary +# 256-bit vectors on top. +my ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp)=map("x$_",(21..26)); + +$code.=<<___; +.globl ecp_nistz256_point_add_affine +.type ecp_nistz256_point_add_affine,%function +.align 5 +ecp_nistz256_point_add_affine: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-80]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + sub sp,sp,#32*10 + + mov $rp_real,$rp + mov $ap_real,$ap + mov $bp_real,$bp + adrp $poly3,:pg_hi21:.Lpoly + add $poly3,$poly3,:lo12:.Lpoly + ldr $poly1,[$poly3,#8] + ldr $poly3,[$poly3,#24] + + ldp $a0,$a1,[$ap,#64] // in1_z + ldp $a2,$a3,[$ap,#64+16] + orr $t0,$a0,$a1 + orr $t2,$a2,$a3 + orr $in1infty,$t0,$t2 + cmp $in1infty,#0 + csetm $in1infty,ne // ~in1infty + + ldp $acc0,$acc1,[$bp] // in2_x + ldp $acc2,$acc3,[$bp,#16] + ldp $t0,$t1,[$bp,#32] // in2_y + ldp $t2,$t3,[$bp,#48] + orr $acc0,$acc0,$acc1 + orr $acc2,$acc2,$acc3 + orr $t0,$t0,$t1 + orr $t2,$t2,$t3 + orr $acc0,$acc0,$acc2 + orr $t0,$t0,$t2 + orr $in2infty,$acc0,$t0 + cmp $in2infty,#0 + csetm $in2infty,ne // ~in2infty + + add $rp,sp,#$Z1sqr + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z1sqr, in1_z); + + mov $a0,$acc0 + mov $a1,$acc1 + mov $a2,$acc2 + mov $a3,$acc3 + ldr $bi,[$bp_real] + add $bp,$bp_real,#0 + add $rp,sp,#$U2 + bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, Z1sqr, in2_x); + + add $bp,$ap_real,#0 + ldr $bi,[$ap_real,#64] // forward load for p256_mul_mont + ldp $a0,$a1,[sp,#$Z1sqr] + ldp $a2,$a3,[sp,#$Z1sqr+16] + add $rp,sp,#$H + bl __ecp_nistz256_sub_from // p256_sub(H, U2, in1_x); + + add $bp,$ap_real,#64 + add $rp,sp,#$S2 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, Z1sqr, in1_z); + + ldr $bi,[$ap_real,#64] + ldp $a0,$a1,[sp,#$H] + ldp $a2,$a3,[sp,#$H+16] + add $bp,$ap_real,#64 + add $rp,sp,#$res_z + bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, H, in1_z); + + ldr $bi,[$bp_real,#32] + ldp $a0,$a1,[sp,#$S2] + ldp $a2,$a3,[sp,#$S2+16] + add $bp,$bp_real,#32 + add $rp,sp,#$S2 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S2, in2_y); + + add $bp,$ap_real,#32 + ldp $a0,$a1,[sp,#$H] // forward load for p256_sqr_mont + ldp $a2,$a3,[sp,#$H+16] + add $rp,sp,#$R + bl __ecp_nistz256_sub_from // p256_sub(R, S2, in1_y); + + add $rp,sp,#$Hsqr + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Hsqr, H); + + ldp $a0,$a1,[sp,#$R] + ldp $a2,$a3,[sp,#$R+16] + add $rp,sp,#$Rsqr + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Rsqr, R); + + ldr $bi,[sp,#$H] + ldp $a0,$a1,[sp,#$Hsqr] + ldp $a2,$a3,[sp,#$Hsqr+16] + add $bp,sp,#$H + add $rp,sp,#$Hcub + bl __ecp_nistz256_mul_mont // p256_mul_mont(Hcub, Hsqr, H); + + ldr $bi,[$ap_real] + ldp $a0,$a1,[sp,#$Hsqr] + ldp $a2,$a3,[sp,#$Hsqr+16] + add $bp,$ap_real,#0 + add 
$rp,sp,#$U2 + bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, in1_x, Hsqr); + + mov $t0,$acc0 + mov $t1,$acc1 + mov $t2,$acc2 + mov $t3,$acc3 + add $rp,sp,#$Hsqr + bl __ecp_nistz256_add_to // p256_mul_by_2(Hsqr, U2); + + add $bp,sp,#$Rsqr + add $rp,sp,#$res_x + bl __ecp_nistz256_sub_morf // p256_sub(res_x, Rsqr, Hsqr); + + add $bp,sp,#$Hcub + bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, Hcub); + + add $bp,sp,#$U2 + ldr $bi,[$ap_real,#32] // forward load for p256_mul_mont + ldp $a0,$a1,[sp,#$Hcub] + ldp $a2,$a3,[sp,#$Hcub+16] + add $rp,sp,#$res_y + bl __ecp_nistz256_sub_morf // p256_sub(res_y, U2, res_x); + + add $bp,$ap_real,#32 + add $rp,sp,#$S2 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, in1_y, Hcub); + + ldr $bi,[sp,#$R] + ldp $a0,$a1,[sp,#$res_y] + ldp $a2,$a3,[sp,#$res_y+16] + add $bp,sp,#$R + add $rp,sp,#$res_y + bl __ecp_nistz256_mul_mont // p256_mul_mont(res_y, res_y, R); + + add $bp,sp,#$S2 + bl __ecp_nistz256_sub_from // p256_sub(res_y, res_y, S2); + + ldp $a0,$a1,[sp,#$res_x] // res + ldp $a2,$a3,[sp,#$res_x+16] + ldp $t0,$t1,[$bp_real] // in2 + ldp $t2,$t3,[$bp_real,#16] +___ +for($i=0;$i<64;$i+=32) { # conditional moves +$code.=<<___; + ldp $acc0,$acc1,[$ap_real,#$i] // in1 + cmp $in1infty,#0 // ~$in1intfy, remember? + ldp $acc2,$acc3,[$ap_real,#$i+16] + csel $t0,$a0,$t0,ne + csel $t1,$a1,$t1,ne + ldp $a0,$a1,[sp,#$res_x+$i+32] // res + csel $t2,$a2,$t2,ne + csel $t3,$a3,$t3,ne + cmp $in2infty,#0 // ~$in2intfy, remember? + ldp $a2,$a3,[sp,#$res_x+$i+48] + csel $acc0,$t0,$acc0,ne + csel $acc1,$t1,$acc1,ne + ldp $t0,$t1,[$bp_real,#$i+32] // in2 + csel $acc2,$t2,$acc2,ne + csel $acc3,$t3,$acc3,ne + ldp $t2,$t3,[$bp_real,#$i+48] + stp $acc0,$acc1,[$rp_real,#$i] + stp $acc2,$acc3,[$rp_real,#$i+16] +___ +$code.=<<___ if ($i == 0); + adrp $bp_real,:pg_hi21:.Lone_mont-64 + add $bp_real,$bp_real,:lo12:.Lone_mont-64 +___ +} +$code.=<<___; + ldp $acc0,$acc1,[$ap_real,#$i] // in1 + cmp $in1infty,#0 // ~$in1intfy, remember? + ldp $acc2,$acc3,[$ap_real,#$i+16] + csel $t0,$a0,$t0,ne + csel $t1,$a1,$t1,ne + csel $t2,$a2,$t2,ne + csel $t3,$a3,$t3,ne + cmp $in2infty,#0 // ~$in2intfy, remember? + csel $acc0,$t0,$acc0,ne + csel $acc1,$t1,$acc1,ne + csel $acc2,$t2,$acc2,ne + csel $acc3,$t3,$acc3,ne + stp $acc0,$acc1,[$rp_real,#$i] + stp $acc2,$acc3,[$rp_real,#$i+16] + + add sp,x29,#0 // destroy frame + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x29,x30,[sp],#80 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine +___ +} +if (1) { +my ($ord0,$ord1) = ($poly1,$poly3); +my ($ord2,$ord3,$ordk,$t4) = map("x$_",(21..24)); +my $acc7 = $bi; + +$code.=<<___; +//////////////////////////////////////////////////////////////////////// +// void ecp_nistz256_ord_mul_mont(uint64_t res[4], uint64_t a[4], +// uint64_t b[4]); +.globl ecp_nistz256_ord_mul_mont +.type ecp_nistz256_ord_mul_mont,%function +.align 4 +ecp_nistz256_ord_mul_mont: + AARCH64_VALID_CALL_TARGET + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + stp x29,x30,[sp,#-64]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + adrp $ordk,:pg_hi21:.Lord + add $ordk,$ordk,:lo12:.Lord + ldr $bi,[$bp] // bp[0] + ldp $a0,$a1,[$ap] + ldp $a2,$a3,[$ap,#16] + + ldp $ord0,$ord1,[$ordk,#0] + ldp $ord2,$ord3,[$ordk,#16] + ldr $ordk,[$ordk,#32] + + mul $acc0,$a0,$bi // a[0]*b[0] + umulh $t0,$a0,$bi + + mul $acc1,$a1,$bi // a[1]*b[0] + umulh $t1,$a1,$bi + + mul $acc2,$a2,$bi // a[2]*b[0] + umulh $t2,$a2,$bi + + mul $acc3,$a3,$bi // a[3]*b[0] + umulh $acc4,$a3,$bi + + mul $t4,$acc0,$ordk + + adds $acc1,$acc1,$t0 // accumulate high parts of multiplication + adcs $acc2,$acc2,$t1 + adcs $acc3,$acc3,$t2 + adc $acc4,$acc4,xzr + mov $acc5,xzr +___ +for ($i=1;$i<4;$i++) { + ################################################################ + # ffff0000.ffffffff.yyyyyyyy.zzzzzzzz + # * abcdefgh + # + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx + # + # Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we + # rewrite above as: + # + # xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx + # - 0000abcd.efgh0000.abcdefgh.00000000.00000000 + # + abcdefgh.abcdefgh.yzayzbyz.cyzdyzey.zfyzgyzh +$code.=<<___; + ldr $bi,[$bp,#8*$i] // b[i] + + lsl $t0,$t4,#32 + subs $acc2,$acc2,$t4 + lsr $t1,$t4,#32 + sbcs $acc3,$acc3,$t0 + sbcs $acc4,$acc4,$t1 + sbc $acc5,$acc5,xzr + + subs xzr,$acc0,#1 + umulh $t1,$ord0,$t4 + mul $t2,$ord1,$t4 + umulh $t3,$ord1,$t4 + + adcs $t2,$t2,$t1 + mul $t0,$a0,$bi + adc $t3,$t3,xzr + mul $t1,$a1,$bi + + adds $acc0,$acc1,$t2 + mul $t2,$a2,$bi + adcs $acc1,$acc2,$t3 + mul $t3,$a3,$bi + adcs $acc2,$acc3,$t4 + adcs $acc3,$acc4,$t4 + adc $acc4,$acc5,xzr + + adds $acc0,$acc0,$t0 // accumulate low parts + umulh $t0,$a0,$bi + adcs $acc1,$acc1,$t1 + umulh $t1,$a1,$bi + adcs $acc2,$acc2,$t2 + umulh $t2,$a2,$bi + adcs $acc3,$acc3,$t3 + umulh $t3,$a3,$bi + adc $acc4,$acc4,xzr + mul $t4,$acc0,$ordk + adds $acc1,$acc1,$t0 // accumulate high parts + adcs $acc2,$acc2,$t1 + adcs $acc3,$acc3,$t2 + adcs $acc4,$acc4,$t3 + adc $acc5,xzr,xzr +___ +} +$code.=<<___; + lsl $t0,$t4,#32 // last reduction + subs $acc2,$acc2,$t4 + lsr $t1,$t4,#32 + sbcs $acc3,$acc3,$t0 + sbcs $acc4,$acc4,$t1 + sbc $acc5,$acc5,xzr + + subs xzr,$acc0,#1 + umulh $t1,$ord0,$t4 + mul $t2,$ord1,$t4 + umulh $t3,$ord1,$t4 + + adcs $t2,$t2,$t1 + adc $t3,$t3,xzr + + adds $acc0,$acc1,$t2 + adcs $acc1,$acc2,$t3 + adcs $acc2,$acc3,$t4 + adcs $acc3,$acc4,$t4 + adc $acc4,$acc5,xzr + + subs $t0,$acc0,$ord0 // ret -= modulus + sbcs $t1,$acc1,$ord1 + sbcs $t2,$acc2,$ord2 + sbcs $t3,$acc3,$ord3 + sbcs xzr,$acc4,xzr + + csel $acc0,$acc0,$t0,lo // ret = borrow ? ret : ret-modulus + csel $acc1,$acc1,$t1,lo + csel $acc2,$acc2,$t2,lo + stp $acc0,$acc1,[$rp] + csel $acc3,$acc3,$t3,lo + stp $acc2,$acc3,[$rp,#16] + + ldp x19,x20,[sp,#16] + ldp x21,x22,[sp,#32] + ldp x23,x24,[sp,#48] + ldr x29,[sp],#64 + ret +.size ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont + +//////////////////////////////////////////////////////////////////////// +// void ecp_nistz256_ord_sqr_mont(uint64_t res[4], uint64_t a[4], +// uint64_t rep); +.globl ecp_nistz256_ord_sqr_mont +.type ecp_nistz256_ord_sqr_mont,%function +.align 4 +ecp_nistz256_ord_sqr_mont: + AARCH64_VALID_CALL_TARGET + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + stp x29,x30,[sp,#-64]! 
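+ // Note: res = a^(2^rep) in the Montgomery domain. The loop below performs
+ // `rep` consecutive Montgomery squarings modulo ord(P-256), feeding each
+ // result straight back in as the next input.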
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + adrp $ordk,:pg_hi21:.Lord + add $ordk,$ordk,:lo12:.Lord + ldp $a0,$a1,[$ap] + ldp $a2,$a3,[$ap,#16] + + ldp $ord0,$ord1,[$ordk,#0] + ldp $ord2,$ord3,[$ordk,#16] + ldr $ordk,[$ordk,#32] + b .Loop_ord_sqr + +.align 4 +.Loop_ord_sqr: + sub $bp,$bp,#1 + //////////////////////////////////////////////////////////////// + // | | | | | |a1*a0| | + // | | | | |a2*a0| | | + // | |a3*a2|a3*a0| | | | + // | | | |a2*a1| | | | + // | | |a3*a1| | | | | + // *| | | | | | | | 2| + // +|a3*a3|a2*a2|a1*a1|a0*a0| + // |--+--+--+--+--+--+--+--| + // |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx + // + // "can't overflow" below mark carrying into high part of + // multiplication result, which can't overflow, because it + // can never be all ones. + + mul $acc1,$a1,$a0 // a[1]*a[0] + umulh $t1,$a1,$a0 + mul $acc2,$a2,$a0 // a[2]*a[0] + umulh $t2,$a2,$a0 + mul $acc3,$a3,$a0 // a[3]*a[0] + umulh $acc4,$a3,$a0 + + adds $acc2,$acc2,$t1 // accumulate high parts of multiplication + mul $t0,$a2,$a1 // a[2]*a[1] + umulh $t1,$a2,$a1 + adcs $acc3,$acc3,$t2 + mul $t2,$a3,$a1 // a[3]*a[1] + umulh $t3,$a3,$a1 + adc $acc4,$acc4,xzr // can't overflow + + mul $acc5,$a3,$a2 // a[3]*a[2] + umulh $acc6,$a3,$a2 + + adds $t1,$t1,$t2 // accumulate high parts of multiplication + mul $acc0,$a0,$a0 // a[0]*a[0] + adc $t2,$t3,xzr // can't overflow + + adds $acc3,$acc3,$t0 // accumulate low parts of multiplication + umulh $a0,$a0,$a0 + adcs $acc4,$acc4,$t1 + mul $t1,$a1,$a1 // a[1]*a[1] + adcs $acc5,$acc5,$t2 + umulh $a1,$a1,$a1 + adc $acc6,$acc6,xzr // can't overflow + + adds $acc1,$acc1,$acc1 // acc[1-6]*=2 + mul $t2,$a2,$a2 // a[2]*a[2] + adcs $acc2,$acc2,$acc2 + umulh $a2,$a2,$a2 + adcs $acc3,$acc3,$acc3 + mul $t3,$a3,$a3 // a[3]*a[3] + adcs $acc4,$acc4,$acc4 + umulh $a3,$a3,$a3 + adcs $acc5,$acc5,$acc5 + adcs $acc6,$acc6,$acc6 + adc $acc7,xzr,xzr + + adds $acc1,$acc1,$a0 // +a[i]*a[i] + mul $t4,$acc0,$ordk + adcs $acc2,$acc2,$t1 + adcs $acc3,$acc3,$a1 + adcs $acc4,$acc4,$t2 + adcs $acc5,$acc5,$a2 + adcs $acc6,$acc6,$t3 + adc $acc7,$acc7,$a3 +___ +for($i=0; $i<4; $i++) { # reductions +$code.=<<___; + subs xzr,$acc0,#1 + umulh $t1,$ord0,$t4 + mul $t2,$ord1,$t4 + umulh $t3,$ord1,$t4 + + adcs $t2,$t2,$t1 + adc $t3,$t3,xzr + + adds $acc0,$acc1,$t2 + adcs $acc1,$acc2,$t3 + adcs $acc2,$acc3,$t4 + adc $acc3,xzr,$t4 // can't overflow +___ +$code.=<<___ if ($i<3); + mul $t3,$acc0,$ordk +___ +$code.=<<___; + lsl $t0,$t4,#32 + subs $acc1,$acc1,$t4 + lsr $t1,$t4,#32 + sbcs $acc2,$acc2,$t0 + sbc $acc3,$acc3,$t1 // can't borrow +___ + ($t3,$t4) = ($t4,$t3); +} +$code.=<<___; + adds $acc0,$acc0,$acc4 // accumulate upper half + adcs $acc1,$acc1,$acc5 + adcs $acc2,$acc2,$acc6 + adcs $acc3,$acc3,$acc7 + adc $acc4,xzr,xzr + + subs $t0,$acc0,$ord0 // ret -= modulus + sbcs $t1,$acc1,$ord1 + sbcs $t2,$acc2,$ord2 + sbcs $t3,$acc3,$ord3 + sbcs xzr,$acc4,xzr + + csel $a0,$acc0,$t0,lo // ret = borrow ? 
ret : ret-modulus + csel $a1,$acc1,$t1,lo + csel $a2,$acc2,$t2,lo + csel $a3,$acc3,$t3,lo + + cbnz $bp,.Loop_ord_sqr + + stp $a0,$a1,[$rp] + stp $a2,$a3,[$rp,#16] + + ldp x19,x20,[sp,#16] + ldp x21,x22,[sp,#32] + ldp x23,x24,[sp,#48] + ldr x29,[sp],#64 + ret +.size ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont +___ +} } + +######################################################################## +# select subroutines +# These select functions are similar to those in p256-x86_64-asm.pl +# They load all points in the lookup table +# keeping in the output only the one corresponding to the input index. +{ +my ($val,$in_t)=map("x$_",(0..1)); +my ($index)=("w2"); +my ($Idx_ctr,$Val_in, $Mask_64)=("w9", "x10", "x11"); +my ($Mask)=("v3"); +my ($Ra,$Rb,$Rc,$Rd,$Re,$Rf)=map("v$_",(16..21)); +my ($T0a,$T0b,$T0c,$T0d,$T0e,$T0f)=map("v$_",(22..27)); +$code.=<<___; +//////////////////////////////////////////////////////////////////////// +// void ecp_nistz256_select_w5(uint64_t *val, uint64_t *in_t, int index); +.globl ecp_nistz256_select_w5 +.type ecp_nistz256_select_w5,%function +.align 4 +ecp_nistz256_select_w5: + AARCH64_VALID_CALL_TARGET + + // $Val_in := $val + // $Idx_ctr := 0; loop counter and incremented internal index + mov $Val_in, $val + mov $Idx_ctr, #0 + + // [$Ra-$Rf] := 0 + movi $Ra.16b, #0 + movi $Rb.16b, #0 + movi $Rc.16b, #0 + movi $Rd.16b, #0 + movi $Re.16b, #0 + movi $Rf.16b, #0 + +.Lselect_w5_loop: + // Loop 16 times. + + // Increment index (loop counter); tested at the end of the loop + add $Idx_ctr, $Idx_ctr, #1 + + // [$T0a-$T0f] := Load a (3*256-bit = 6*128-bit) table entry starting at $in_t + // and advance $in_t to point to the next entry + ld1 {$T0a.2d, $T0b.2d, $T0c.2d, $T0d.2d}, [$in_t],#64 + + // $Mask_64 := ($Idx_ctr == $index)? All 1s : All 0s + cmp $Idx_ctr, $index + csetm $Mask_64, eq + + // continue loading ... + ld1 {$T0e.2d, $T0f.2d}, [$in_t],#32 + + // duplicate mask_64 into Mask (all 0s or all 1s) + dup $Mask.2d, $Mask_64 + + // [$Ra-$Rd] := (Mask == all 1s)? [$T0a-$T0d] : [$Ra-$Rd] + // i.e., values in output registers will remain the same if $Idx_ctr != $index + bit $Ra.16b, $T0a.16b, $Mask.16b + bit $Rb.16b, $T0b.16b, $Mask.16b + + bit $Rc.16b, $T0c.16b, $Mask.16b + bit $Rd.16b, $T0d.16b, $Mask.16b + + bit $Re.16b, $T0e.16b, $Mask.16b + bit $Rf.16b, $T0f.16b, $Mask.16b + + // If bit #4 is not 0 (i.e. idx_ctr < 16) loop back + tbz $Idx_ctr, #4, .Lselect_w5_loop + + // Write [$Ra-$Rf] to memory at the output pointer + st1 {$Ra.2d, $Rb.2d, $Rc.2d, $Rd.2d}, [$Val_in],#64 + st1 {$Re.2d, $Rf.2d}, [$Val_in] + + ret +.size ecp_nistz256_select_w5,.-ecp_nistz256_select_w5 + + +//////////////////////////////////////////////////////////////////////// +// void ecp_nistz256_select_w7(uint64_t *val, uint64_t *in_t, int index); +.globl ecp_nistz256_select_w7 +.type ecp_nistz256_select_w7,%function +.align 4 +ecp_nistz256_select_w7: + AARCH64_VALID_CALL_TARGET + + // $Idx_ctr := 0; loop counter and incremented internal index + mov $Idx_ctr, #0 + + // [$Ra-$Rf] := 0 + movi $Ra.16b, #0 + movi $Rb.16b, #0 + movi $Rc.16b, #0 + movi $Rd.16b, #0 + +.Lselect_w7_loop: + // Loop 64 times. + + // Increment index (loop counter); tested at the end of the loop + add $Idx_ctr, $Idx_ctr, #1 + + // [$T0a-$T0d] := Load a (2*256-bit = 4*128-bit) table entry starting at $in_t + // and advance $in_t to point to the next entry + ld1 {$T0a.2d, $T0b.2d, $T0c.2d, $T0d.2d}, [$in_t],#64 + + // $Mask_64 := ($Idx_ctr == $index)? 
All 1s : All 0s + cmp $Idx_ctr, $index + csetm $Mask_64, eq + + // duplicate mask_64 into Mask (all 0s or all 1s) + dup $Mask.2d, $Mask_64 + + // [$Ra-$Rd] := (Mask == all 1s)? [$T0a-$T0d] : [$Ra-$Rd] + // i.e., values in output registers will remain the same if $Idx_ctr != $index + bit $Ra.16b, $T0a.16b, $Mask.16b + bit $Rb.16b, $T0b.16b, $Mask.16b + + bit $Rc.16b, $T0c.16b, $Mask.16b + bit $Rd.16b, $T0d.16b, $Mask.16b + + // If bit #6 is not 0 (i.e. idx_ctr < 64) loop back + tbz $Idx_ctr, #6, .Lselect_w7_loop + + // Write [$Ra-$Rd] to memory at the output pointer + st1 {$Ra.2d, $Rb.2d, $Rc.2d, $Rd.2d}, [$val] + + ret +.size ecp_nistz256_select_w7,.-ecp_nistz256_select_w7 +___ +} + +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/ge; + + print $_,"\n"; +} +close STDOUT or die "error closing STDOUT: $!"; # enforce flush diff --git a/ring-0.17.14/crypto/fipsmodule/ec/asm/p256-x86_64-asm.pl b/ring-0.17.14/crypto/fipsmodule/ec/asm/p256-x86_64-asm.pl new file mode 100644 index 0000000000..87d9852695 --- /dev/null +++ b/ring-0.17.14/crypto/fipsmodule/ec/asm/p256-x86_64-asm.pl @@ -0,0 +1,4144 @@ +#! /usr/bin/env perl +# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved. +# Copyright (c) 2014, Intel Corporation. All Rights Reserved. +# Copyright (c) 2015 CloudFlare, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1, 3) +# (1) Intel Corporation, Israel Development Center, Haifa, Israel +# (2) University of Haifa, Israel +# (3) CloudFlare, Inc. +# +# Reference: +# S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with +# 256 Bit Primes" + +# Further optimization by : +# +# this/original with/without -DECP_NISTZ256_ASM(*) +# Opteron +15-49% +150-195% +# Bulldozer +18-45% +175-240% +# P4 +24-46% +100-150% +# Westmere +18-34% +87-160% +# Sandy Bridge +14-35% +120-185% +# Ivy Bridge +11-35% +125-180% +# Haswell +10-37% +160-200% +# Broadwell +24-58% +210-270% +# Atom +20-50% +180-240% +# VIA Nano +50-160% +480-480% +# +# (*) "without -DECP_NISTZ256_ASM" refers to build with +# "enable-ec_nistp_64_gcc_128"; +# +# Ranges denote minimum and maximum improvement coefficients depending +# on benchmark. In "this/original" column lower coefficient is for +# ECDSA sign, while in "with/without" - for ECDH key agreement, and +# higher - for ECDSA sign, relatively fastest server-side operation. +# Keep in mind that +100% means 2x improvement. 
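+#
+# Note on the build plumbing below: like the armv8 script earlier in this
+# patch, the generated code is piped through a perlasm translator
+# (x86_64-xlate.pl, located via $xlate) which emits the assembler dialect
+# selected by $flavour; nasm/masm/mingw64 flavours also set $win64 so the
+# Windows-specific XMM save/restore sequences are included.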
+ +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; +*STDOUT=*OUT; + +$avx = 2; +$addx = 1; + +$code.=<<___; +.text + +# The polynomial +.section .rodata +.align 64 +.Lpoly: +.quad 0xffffffffffffffff, 0x00000000ffffffff, 0x0000000000000000, 0xffffffff00000001 + +.LOne: +.long 1,1,1,1,1,1,1,1 +.LTwo: +.long 2,2,2,2,2,2,2,2 +.LThree: +.long 3,3,3,3,3,3,3,3 +.LONE_mont: +.quad 0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe + +# Constants for computations modulo ord(p256) +.Lord: +.quad 0xf3b9cac2fc632551, 0xbce6faada7179e84, 0xffffffffffffffff, 0xffffffff00000000 +.LordK: +.quad 0xccd1c8aaee00bc4f +.text +___ + +{ +my ($a0,$a1,$a2,$a3)=map("%r$_",(8..11)); +my ($t0,$t1,$t2,$t3,$t4)=("%rax","%rdx","%rcx","%r12","%r13"); +my ($r_ptr,$a_ptr,$b_ptr)=("%rdi","%rsi","%rdx"); + +$code.=<<___; + +################################################################################ +# void ecp_nistz256_neg(uint64_t res[4], uint64_t a[4]); +.globl ecp_nistz256_neg +.type ecp_nistz256_neg,\@function,2 +.align 32 +ecp_nistz256_neg: +.cfi_startproc + _CET_ENDBR + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 +.Lneg_body: + + xor $a0, $a0 + xor $a1, $a1 + xor $a2, $a2 + xor $a3, $a3 + xor $t4, $t4 + + sub 8*0($a_ptr), $a0 + sbb 8*1($a_ptr), $a1 + sbb 8*2($a_ptr), $a2 + mov $a0, $t0 + sbb 8*3($a_ptr), $a3 + lea .Lpoly(%rip), $a_ptr + mov $a1, $t1 + sbb \$0, $t4 + + add 8*0($a_ptr), $a0 + mov $a2, $t2 + adc 8*1($a_ptr), $a1 + adc 8*2($a_ptr), $a2 + mov $a3, $t3 + adc 8*3($a_ptr), $a3 + test $t4, $t4 + + cmovz $t0, $a0 + cmovz $t1, $a1 + mov $a0, 8*0($r_ptr) + cmovz $t2, $a2 + mov $a1, 8*1($r_ptr) + cmovz $t3, $a3 + mov $a2, 8*2($r_ptr) + mov $a3, 8*3($r_ptr) + + mov 0(%rsp),%r13 +.cfi_restore %r13 + mov 8(%rsp),%r12 +.cfi_restore %r12 + lea 16(%rsp),%rsp +.cfi_adjust_cfa_offset -16 +.Lneg_epilogue: + ret +.cfi_endproc +.size ecp_nistz256_neg,.-ecp_nistz256_neg +___ +} +{ +my ($r_ptr,$a_ptr,$b_org,$b_ptr)=("%rdi","%rsi","%rdx","%rbx"); +my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("%r$_",(8..15)); +my ($t0,$t1,$t2,$t3,$t4)=("%rcx","%rbp","%rbx","%rdx","%rax"); +my ($poly1,$poly3)=($acc6,$acc7); + +$code.=<<___; +################################################################################ +# void ecp_nistz256_ord_mul_mont( +# uint64_t res[4], +# uint64_t a[4], +# uint64_t b[4]); + +.globl ecp_nistz256_ord_mul_mont_nohw +.type ecp_nistz256_ord_mul_mont_nohw,\@function,3 +.align 32 +ecp_nistz256_ord_mul_mont_nohw: +.cfi_startproc + _CET_ENDBR + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 +.Lord_mul_body: + + mov 8*0($b_org), %rax + mov $b_org, $b_ptr + lea .Lord(%rip), %r14 + mov .LordK(%rip), %r15 + + ################################# * b[0] + mov %rax, $t0 + mulq 8*0($a_ptr) + mov %rax, $acc0 + mov $t0, %rax + mov %rdx, $acc1 + + mulq 8*1($a_ptr) + add %rax, $acc1 + mov $t0, %rax + adc \$0, %rdx + mov %rdx, $acc2 + + mulq 8*2($a_ptr) + add %rax, $acc2 + mov $t0, %rax + adc \$0, %rdx + + mov $acc0, $acc5 + imulq %r15,$acc0 + + mov %rdx, $acc3 + mulq 
8*3($a_ptr) + add %rax, $acc3 + mov $acc0, %rax + adc \$0, %rdx + mov %rdx, $acc4 + + ################################# First reduction step + mulq 8*0(%r14) + mov $acc0, $t1 + add %rax, $acc5 # guaranteed to be zero + mov $acc0, %rax + adc \$0, %rdx + mov %rdx, $t0 + + sub $acc0, $acc2 + sbb \$0, $acc0 # can't borrow + + mulq 8*1(%r14) + add $t0, $acc1 + adc \$0, %rdx + add %rax, $acc1 + mov $t1, %rax + adc %rdx, $acc2 + mov $t1, %rdx + adc \$0, $acc0 # can't overflow + + shl \$32, %rax + shr \$32, %rdx + sub %rax, $acc3 + mov 8*1($b_ptr), %rax + sbb %rdx, $t1 # can't borrow + + add $acc0, $acc3 + adc $t1, $acc4 + adc \$0, $acc5 + + ################################# * b[1] + mov %rax, $t0 + mulq 8*0($a_ptr) + add %rax, $acc1 + mov $t0, %rax + adc \$0, %rdx + mov %rdx, $t1 + + mulq 8*1($a_ptr) + add $t1, $acc2 + adc \$0, %rdx + add %rax, $acc2 + mov $t0, %rax + adc \$0, %rdx + mov %rdx, $t1 + + mulq 8*2($a_ptr) + add $t1, $acc3 + adc \$0, %rdx + add %rax, $acc3 + mov $t0, %rax + adc \$0, %rdx + + mov $acc1, $t0 + imulq %r15, $acc1 + + mov %rdx, $t1 + mulq 8*3($a_ptr) + add $t1, $acc4 + adc \$0, %rdx + xor $acc0, $acc0 + add %rax, $acc4 + mov $acc1, %rax + adc %rdx, $acc5 + adc \$0, $acc0 + + ################################# Second reduction step + mulq 8*0(%r14) + mov $acc1, $t1 + add %rax, $t0 # guaranteed to be zero + mov $acc1, %rax + adc %rdx, $t0 + + sub $acc1, $acc3 + sbb \$0, $acc1 # can't borrow + + mulq 8*1(%r14) + add $t0, $acc2 + adc \$0, %rdx + add %rax, $acc2 + mov $t1, %rax + adc %rdx, $acc3 + mov $t1, %rdx + adc \$0, $acc1 # can't overflow + + shl \$32, %rax + shr \$32, %rdx + sub %rax, $acc4 + mov 8*2($b_ptr), %rax + sbb %rdx, $t1 # can't borrow + + add $acc1, $acc4 + adc $t1, $acc5 + adc \$0, $acc0 + + ################################## * b[2] + mov %rax, $t0 + mulq 8*0($a_ptr) + add %rax, $acc2 + mov $t0, %rax + adc \$0, %rdx + mov %rdx, $t1 + + mulq 8*1($a_ptr) + add $t1, $acc3 + adc \$0, %rdx + add %rax, $acc3 + mov $t0, %rax + adc \$0, %rdx + mov %rdx, $t1 + + mulq 8*2($a_ptr) + add $t1, $acc4 + adc \$0, %rdx + add %rax, $acc4 + mov $t0, %rax + adc \$0, %rdx + + mov $acc2, $t0 + imulq %r15, $acc2 + + mov %rdx, $t1 + mulq 8*3($a_ptr) + add $t1, $acc5 + adc \$0, %rdx + xor $acc1, $acc1 + add %rax, $acc5 + mov $acc2, %rax + adc %rdx, $acc0 + adc \$0, $acc1 + + ################################# Third reduction step + mulq 8*0(%r14) + mov $acc2, $t1 + add %rax, $t0 # guaranteed to be zero + mov $acc2, %rax + adc %rdx, $t0 + + sub $acc2, $acc4 + sbb \$0, $acc2 # can't borrow + + mulq 8*1(%r14) + add $t0, $acc3 + adc \$0, %rdx + add %rax, $acc3 + mov $t1, %rax + adc %rdx, $acc4 + mov $t1, %rdx + adc \$0, $acc2 # can't overflow + + shl \$32, %rax + shr \$32, %rdx + sub %rax, $acc5 + mov 8*3($b_ptr), %rax + sbb %rdx, $t1 # can't borrow + + add $acc2, $acc5 + adc $t1, $acc0 + adc \$0, $acc1 + + ################################# * b[3] + mov %rax, $t0 + mulq 8*0($a_ptr) + add %rax, $acc3 + mov $t0, %rax + adc \$0, %rdx + mov %rdx, $t1 + + mulq 8*1($a_ptr) + add $t1, $acc4 + adc \$0, %rdx + add %rax, $acc4 + mov $t0, %rax + adc \$0, %rdx + mov %rdx, $t1 + + mulq 8*2($a_ptr) + add $t1, $acc5 + adc \$0, %rdx + add %rax, $acc5 + mov $t0, %rax + adc \$0, %rdx + + mov $acc3, $t0 + imulq %r15, $acc3 + + mov %rdx, $t1 + mulq 8*3($a_ptr) + add $t1, $acc0 + adc \$0, %rdx + xor $acc2, $acc2 + add %rax, $acc0 + mov $acc3, %rax + adc %rdx, $acc1 + adc \$0, $acc2 + + ################################# Last reduction step + mulq 8*0(%r14) + mov $acc3, $t1 + add %rax, $t0 # guaranteed to be zero 
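+ # ($acc3 was set to $t0*.LordK by the imulq above; .LordK*ord[0] == -1
+ #  mod 2^64, so $t0 plus the low 64 bits of $acc3*ord[0] wraps to exactly
+ #  zero and only the carry propagates into the next limb)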
+ mov $acc3, %rax + adc %rdx, $t0 + + sub $acc3, $acc5 + sbb \$0, $acc3 # can't borrow + + mulq 8*1(%r14) + add $t0, $acc4 + adc \$0, %rdx + add %rax, $acc4 + mov $t1, %rax + adc %rdx, $acc5 + mov $t1, %rdx + adc \$0, $acc3 # can't overflow + + shl \$32, %rax + shr \$32, %rdx + sub %rax, $acc0 + sbb %rdx, $t1 # can't borrow + + add $acc3, $acc0 + adc $t1, $acc1 + adc \$0, $acc2 + + ################################# Subtract ord + mov $acc4, $a_ptr + sub 8*0(%r14), $acc4 + mov $acc5, $acc3 + sbb 8*1(%r14), $acc5 + mov $acc0, $t0 + sbb 8*2(%r14), $acc0 + mov $acc1, $t1 + sbb 8*3(%r14), $acc1 + sbb \$0, $acc2 + + cmovc $a_ptr, $acc4 + cmovc $acc3, $acc5 + cmovc $t0, $acc0 + cmovc $t1, $acc1 + + mov $acc4, 8*0($r_ptr) + mov $acc5, 8*1($r_ptr) + mov $acc0, 8*2($r_ptr) + mov $acc1, 8*3($r_ptr) + + mov 0(%rsp),%r15 +.cfi_restore %r15 + mov 8(%rsp),%r14 +.cfi_restore %r14 + mov 16(%rsp),%r13 +.cfi_restore %r13 + mov 24(%rsp),%r12 +.cfi_restore %r12 + mov 32(%rsp),%rbx +.cfi_restore %rbx + mov 40(%rsp),%rbp +.cfi_restore %rbp + lea 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lord_mul_epilogue: + ret +.cfi_endproc +.size ecp_nistz256_ord_mul_mont_nohw,.-ecp_nistz256_ord_mul_mont_nohw + +################################################################################ +# void ecp_nistz256_ord_sqr_mont( +# uint64_t res[4], +# uint64_t a[4], +# uint64_t rep); + +.globl ecp_nistz256_ord_sqr_mont_nohw +.type ecp_nistz256_ord_sqr_mont_nohw,\@function,3 +.align 32 +ecp_nistz256_ord_sqr_mont_nohw: +.cfi_startproc + _CET_ENDBR + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 +.Lord_sqr_body: + + mov 8*0($a_ptr), $acc0 + mov 8*1($a_ptr), %rax + mov 8*2($a_ptr), $acc6 + mov 8*3($a_ptr), $acc7 + lea .Lord(%rip), $a_ptr # pointer to modulus + mov $b_org, $b_ptr + jmp .Loop_ord_sqr + +.align 32 +.Loop_ord_sqr: + ################################# a[1:] * a[0] + mov %rax, $t1 # put aside a[1] + mul $acc0 # a[1] * a[0] + mov %rax, $acc1 + movq $t1, %xmm1 # offload a[1] + mov $acc6, %rax + mov %rdx, $acc2 + + mul $acc0 # a[2] * a[0] + add %rax, $acc2 + mov $acc7, %rax + movq $acc6, %xmm2 # offload a[2] + adc \$0, %rdx + mov %rdx, $acc3 + + mul $acc0 # a[3] * a[0] + add %rax, $acc3 + mov $acc7, %rax + movq $acc7, %xmm3 # offload a[3] + adc \$0, %rdx + mov %rdx, $acc4 + + ################################# a[3] * a[2] + mul $acc6 # a[3] * a[2] + mov %rax, $acc5 + mov $acc6, %rax + mov %rdx, $acc6 + + ################################# a[2:] * a[1] + mul $t1 # a[2] * a[1] + add %rax, $acc3 + mov $acc7, %rax + adc \$0, %rdx + mov %rdx, $acc7 + + mul $t1 # a[3] * a[1] + add %rax, $acc4 + adc \$0, %rdx + + add $acc7, $acc4 + adc %rdx, $acc5 + adc \$0, $acc6 # can't overflow + + ################################# *2 + xor $acc7, $acc7 + mov $acc0, %rax + add $acc1, $acc1 + adc $acc2, $acc2 + adc $acc3, $acc3 + adc $acc4, $acc4 + adc $acc5, $acc5 + adc $acc6, $acc6 + adc \$0, $acc7 + + ################################# Missing products + mul %rax # a[0] * a[0] + mov %rax, $acc0 + movq %xmm1, %rax + mov %rdx, $t1 + + mul %rax # a[1] * a[1] + add $t1, $acc1 + adc %rax, $acc2 + movq %xmm2, %rax + adc \$0, %rdx + mov %rdx, $t1 + + mul %rax # a[2] * a[2] + add $t1, $acc3 + adc %rax, $acc4 + movq %xmm3, %rax + adc \$0, %rdx + mov %rdx, $t1 + + mov $acc0, $t0 + imulq 8*4($a_ptr), $acc0 # *= .LordK + + mul %rax # a[3] * a[3] + add $t1, $acc5 + adc %rax, $acc6 + mov 8*0($a_ptr), %rax # modulus[0] + adc %rdx, $acc7 # 
can't overflow + + ################################# First reduction step + mul $acc0 + mov $acc0, $t1 + add %rax, $t0 # guaranteed to be zero + mov 8*1($a_ptr), %rax # modulus[1] + adc %rdx, $t0 + + sub $acc0, $acc2 + sbb \$0, $t1 # can't borrow + + mul $acc0 + add $t0, $acc1 + adc \$0, %rdx + add %rax, $acc1 + mov $acc0, %rax + adc %rdx, $acc2 + mov $acc0, %rdx + adc \$0, $t1 # can't overflow + + mov $acc1, $t0 + imulq 8*4($a_ptr), $acc1 # *= .LordK + + shl \$32, %rax + shr \$32, %rdx + sub %rax, $acc3 + mov 8*0($a_ptr), %rax + sbb %rdx, $acc0 # can't borrow + + add $t1, $acc3 + adc \$0, $acc0 # can't overflow + + ################################# Second reduction step + mul $acc1 + mov $acc1, $t1 + add %rax, $t0 # guaranteed to be zero + mov 8*1($a_ptr), %rax + adc %rdx, $t0 + + sub $acc1, $acc3 + sbb \$0, $t1 # can't borrow + + mul $acc1 + add $t0, $acc2 + adc \$0, %rdx + add %rax, $acc2 + mov $acc1, %rax + adc %rdx, $acc3 + mov $acc1, %rdx + adc \$0, $t1 # can't overflow + + mov $acc2, $t0 + imulq 8*4($a_ptr), $acc2 # *= .LordK + + shl \$32, %rax + shr \$32, %rdx + sub %rax, $acc0 + mov 8*0($a_ptr), %rax + sbb %rdx, $acc1 # can't borrow + + add $t1, $acc0 + adc \$0, $acc1 # can't overflow + + ################################# Third reduction step + mul $acc2 + mov $acc2, $t1 + add %rax, $t0 # guaranteed to be zero + mov 8*1($a_ptr), %rax + adc %rdx, $t0 + + sub $acc2, $acc0 + sbb \$0, $t1 # can't borrow + + mul $acc2 + add $t0, $acc3 + adc \$0, %rdx + add %rax, $acc3 + mov $acc2, %rax + adc %rdx, $acc0 + mov $acc2, %rdx + adc \$0, $t1 # can't overflow + + mov $acc3, $t0 + imulq 8*4($a_ptr), $acc3 # *= .LordK + + shl \$32, %rax + shr \$32, %rdx + sub %rax, $acc1 + mov 8*0($a_ptr), %rax + sbb %rdx, $acc2 # can't borrow + + add $t1, $acc1 + adc \$0, $acc2 # can't overflow + + ################################# Last reduction step + mul $acc3 + mov $acc3, $t1 + add %rax, $t0 # guaranteed to be zero + mov 8*1($a_ptr), %rax + adc %rdx, $t0 + + sub $acc3, $acc1 + sbb \$0, $t1 # can't borrow + + mul $acc3 + add $t0, $acc0 + adc \$0, %rdx + add %rax, $acc0 + mov $acc3, %rax + adc %rdx, $acc1 + mov $acc3, %rdx + adc \$0, $t1 # can't overflow + + shl \$32, %rax + shr \$32, %rdx + sub %rax, $acc2 + sbb %rdx, $acc3 # can't borrow + + add $t1, $acc2 + adc \$0, $acc3 # can't overflow + + ################################# Add bits [511:256] of the sqr result + xor %rdx, %rdx + add $acc4, $acc0 + adc $acc5, $acc1 + mov $acc0, $acc4 + adc $acc6, $acc2 + adc $acc7, $acc3 + mov $acc1, %rax + adc \$0, %rdx + + ################################# Compare to modulus + sub 8*0($a_ptr), $acc0 + mov $acc2, $acc6 + sbb 8*1($a_ptr), $acc1 + sbb 8*2($a_ptr), $acc2 + mov $acc3, $acc7 + sbb 8*3($a_ptr), $acc3 + sbb \$0, %rdx + + cmovc $acc4, $acc0 + cmovnc $acc1, %rax + cmovnc $acc2, $acc6 + cmovnc $acc3, $acc7 + + dec $b_ptr + jnz .Loop_ord_sqr + + mov $acc0, 8*0($r_ptr) + mov %rax, 8*1($r_ptr) + pxor %xmm1, %xmm1 + mov $acc6, 8*2($r_ptr) + pxor %xmm2, %xmm2 + mov $acc7, 8*3($r_ptr) + pxor %xmm3, %xmm3 + + mov 0(%rsp),%r15 +.cfi_restore %r15 + mov 8(%rsp),%r14 +.cfi_restore %r14 + mov 16(%rsp),%r13 +.cfi_restore %r13 + mov 24(%rsp),%r12 +.cfi_restore %r12 + mov 32(%rsp),%rbx +.cfi_restore %rbx + mov 40(%rsp),%rbp +.cfi_restore %rbp + lea 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lord_sqr_epilogue: + ret +.cfi_endproc +.size ecp_nistz256_ord_sqr_mont_nohw,.-ecp_nistz256_ord_sqr_mont_nohw +___ + +$code.=<<___ if ($addx); +################################################################################ +.globl 
ecp_nistz256_ord_mul_mont_adx +.type ecp_nistz256_ord_mul_mont_adx,\@function,3 +.align 32 +ecp_nistz256_ord_mul_mont_adx: +.cfi_startproc +.Lecp_nistz256_ord_mul_mont_adx: + _CET_ENDBR + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 +.Lord_mulx_body: + + mov $b_org, $b_ptr + mov 8*0($b_org), %rdx + mov 8*0($a_ptr), $acc1 + mov 8*1($a_ptr), $acc2 + mov 8*2($a_ptr), $acc3 + mov 8*3($a_ptr), $acc4 + lea -128($a_ptr), $a_ptr # control u-op density + lea .Lord-128(%rip), %r14 + mov .LordK(%rip), %r15 + + ################################# Multiply by b[0] + mulx $acc1, $acc0, $acc1 + mulx $acc2, $t0, $acc2 + mulx $acc3, $t1, $acc3 + add $t0, $acc1 + mulx $acc4, $t0, $acc4 + mov $acc0, %rdx + mulx %r15, %rdx, %rax + adc $t1, $acc2 + adc $t0, $acc3 + adc \$0, $acc4 + + ################################# reduction + xor $acc5, $acc5 # $acc5=0, cf=0, of=0 + mulx 8*0+128(%r14), $t0, $t1 + adcx $t0, $acc0 # guaranteed to be zero + adox $t1, $acc1 + + mulx 8*1+128(%r14), $t0, $t1 + adcx $t0, $acc1 + adox $t1, $acc2 + + mulx 8*2+128(%r14), $t0, $t1 + adcx $t0, $acc2 + adox $t1, $acc3 + + mulx 8*3+128(%r14), $t0, $t1 + mov 8*1($b_ptr), %rdx + adcx $t0, $acc3 + adox $t1, $acc4 + adcx $acc0, $acc4 + adox $acc0, $acc5 + adc \$0, $acc5 # cf=0, of=0 + + ################################# Multiply by b[1] + mulx 8*0+128($a_ptr), $t0, $t1 + adcx $t0, $acc1 + adox $t1, $acc2 + + mulx 8*1+128($a_ptr), $t0, $t1 + adcx $t0, $acc2 + adox $t1, $acc3 + + mulx 8*2+128($a_ptr), $t0, $t1 + adcx $t0, $acc3 + adox $t1, $acc4 + + mulx 8*3+128($a_ptr), $t0, $t1 + mov $acc1, %rdx + mulx %r15, %rdx, %rax + adcx $t0, $acc4 + adox $t1, $acc5 + + adcx $acc0, $acc5 + adox $acc0, $acc0 + adc \$0, $acc0 # cf=0, of=0 + + ################################# reduction + mulx 8*0+128(%r14), $t0, $t1 + adcx $t0, $acc1 # guaranteed to be zero + adox $t1, $acc2 + + mulx 8*1+128(%r14), $t0, $t1 + adcx $t0, $acc2 + adox $t1, $acc3 + + mulx 8*2+128(%r14), $t0, $t1 + adcx $t0, $acc3 + adox $t1, $acc4 + + mulx 8*3+128(%r14), $t0, $t1 + mov 8*2($b_ptr), %rdx + adcx $t0, $acc4 + adox $t1, $acc5 + adcx $acc1, $acc5 + adox $acc1, $acc0 + adc \$0, $acc0 # cf=0, of=0 + + ################################# Multiply by b[2] + mulx 8*0+128($a_ptr), $t0, $t1 + adcx $t0, $acc2 + adox $t1, $acc3 + + mulx 8*1+128($a_ptr), $t0, $t1 + adcx $t0, $acc3 + adox $t1, $acc4 + + mulx 8*2+128($a_ptr), $t0, $t1 + adcx $t0, $acc4 + adox $t1, $acc5 + + mulx 8*3+128($a_ptr), $t0, $t1 + mov $acc2, %rdx + mulx %r15, %rdx, %rax + adcx $t0, $acc5 + adox $t1, $acc0 + + adcx $acc1, $acc0 + adox $acc1, $acc1 + adc \$0, $acc1 # cf=0, of=0 + + ################################# reduction + mulx 8*0+128(%r14), $t0, $t1 + adcx $t0, $acc2 # guaranteed to be zero + adox $t1, $acc3 + + mulx 8*1+128(%r14), $t0, $t1 + adcx $t0, $acc3 + adox $t1, $acc4 + + mulx 8*2+128(%r14), $t0, $t1 + adcx $t0, $acc4 + adox $t1, $acc5 + + mulx 8*3+128(%r14), $t0, $t1 + mov 8*3($b_ptr), %rdx + adcx $t0, $acc5 + adox $t1, $acc0 + adcx $acc2, $acc0 + adox $acc2, $acc1 + adc \$0, $acc1 # cf=0, of=0 + + ################################# Multiply by b[3] + mulx 8*0+128($a_ptr), $t0, $t1 + adcx $t0, $acc3 + adox $t1, $acc4 + + mulx 8*1+128($a_ptr), $t0, $t1 + adcx $t0, $acc4 + adox $t1, $acc5 + + mulx 8*2+128($a_ptr), $t0, $t1 + adcx $t0, $acc5 + adox $t1, $acc0 + + mulx 8*3+128($a_ptr), $t0, $t1 + mov $acc3, %rdx + mulx %r15, %rdx, %rax + adcx $t0, $acc0 + adox $t1, $acc1 + + adcx $acc2, $acc1 + 
adox $acc2, $acc2 + adc \$0, $acc2 # cf=0, of=0 + + ################################# reduction + mulx 8*0+128(%r14), $t0, $t1 + adcx $t0, $acc3 # guranteed to be zero + adox $t1, $acc4 + + mulx 8*1+128(%r14), $t0, $t1 + adcx $t0, $acc4 + adox $t1, $acc5 + + mulx 8*2+128(%r14), $t0, $t1 + adcx $t0, $acc5 + adox $t1, $acc0 + + mulx 8*3+128(%r14), $t0, $t1 + lea 128(%r14),%r14 + mov $acc4, $t2 + adcx $t0, $acc0 + adox $t1, $acc1 + mov $acc5, $t3 + adcx $acc3, $acc1 + adox $acc3, $acc2 + adc \$0, $acc2 + + ################################# + # Branch-less conditional subtraction of P + mov $acc0, $t0 + sub 8*0(%r14), $acc4 + sbb 8*1(%r14), $acc5 + sbb 8*2(%r14), $acc0 + mov $acc1, $t1 + sbb 8*3(%r14), $acc1 + sbb \$0, $acc2 + + cmovc $t2, $acc4 + cmovc $t3, $acc5 + cmovc $t0, $acc0 + cmovc $t1, $acc1 + + mov $acc4, 8*0($r_ptr) + mov $acc5, 8*1($r_ptr) + mov $acc0, 8*2($r_ptr) + mov $acc1, 8*3($r_ptr) + + mov 0(%rsp),%r15 +.cfi_restore %r15 + mov 8(%rsp),%r14 +.cfi_restore %r14 + mov 16(%rsp),%r13 +.cfi_restore %r13 + mov 24(%rsp),%r12 +.cfi_restore %r12 + mov 32(%rsp),%rbx +.cfi_restore %rbx + mov 40(%rsp),%rbp +.cfi_restore %rbp + lea 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lord_mulx_epilogue: + ret +.cfi_endproc +.size ecp_nistz256_ord_mul_mont_adx,.-ecp_nistz256_ord_mul_mont_adx + +.globl ecp_nistz256_ord_sqr_mont_adx +.type ecp_nistz256_ord_sqr_mont_adx,\@function,3 +.align 32 +ecp_nistz256_ord_sqr_mont_adx: +.cfi_startproc + _CET_ENDBR +.Lecp_nistz256_ord_sqr_mont_adx: + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 +.Lord_sqrx_body: + + mov $b_org, $b_ptr + mov 8*0($a_ptr), %rdx + mov 8*1($a_ptr), $acc6 + mov 8*2($a_ptr), $acc7 + mov 8*3($a_ptr), $acc0 + lea .Lord(%rip), $a_ptr + jmp .Loop_ord_sqrx + +.align 32 +.Loop_ord_sqrx: + mulx $acc6, $acc1, $acc2 # a[0]*a[1] + mulx $acc7, $t0, $acc3 # a[0]*a[2] + mov %rdx, %rax # offload a[0] + movq $acc6, %xmm1 # offload a[1] + mulx $acc0, $t1, $acc4 # a[0]*a[3] + mov $acc6, %rdx + add $t0, $acc2 + movq $acc7, %xmm2 # offload a[2] + adc $t1, $acc3 + adc \$0, $acc4 + xor $acc5, $acc5 # $acc5=0,cf=0,of=0 + ################################# + mulx $acc7, $t0, $t1 # a[1]*a[2] + adcx $t0, $acc3 + adox $t1, $acc4 + + mulx $acc0, $t0, $t1 # a[1]*a[3] + mov $acc7, %rdx + adcx $t0, $acc4 + adox $t1, $acc5 + adc \$0, $acc5 + ################################# + mulx $acc0, $t0, $acc6 # a[2]*a[3] + mov %rax, %rdx + movq $acc0, %xmm3 # offload a[3] + xor $acc7, $acc7 # $acc7=0,cf=0,of=0 + adcx $acc1, $acc1 # acc1:6<<1 + adox $t0, $acc5 + adcx $acc2, $acc2 + adox $acc7, $acc6 # of=0 + + ################################# a[i]*a[i] + mulx %rdx, $acc0, $t1 + movq %xmm1, %rdx + adcx $acc3, $acc3 + adox $t1, $acc1 + adcx $acc4, $acc4 + mulx %rdx, $t0, $t4 + movq %xmm2, %rdx + adcx $acc5, $acc5 + adox $t0, $acc2 + adcx $acc6, $acc6 + mulx %rdx, $t0, $t1 + .byte 0x67 + movq %xmm3, %rdx + adox $t4, $acc3 + adcx $acc7, $acc7 + adox $t0, $acc4 + adox $t1, $acc5 + mulx %rdx, $t0, $t4 + adox $t0, $acc6 + adox $t4, $acc7 + + ################################# reduction + mov $acc0, %rdx + mulx 8*4($a_ptr), %rdx, $t0 + + xor %rax, %rax # cf=0, of=0 + mulx 8*0($a_ptr), $t0, $t1 + adcx $t0, $acc0 # guaranteed to be zero + adox $t1, $acc1 + mulx 8*1($a_ptr), $t0, $t1 + adcx $t0, $acc1 + adox $t1, $acc2 + mulx 8*2($a_ptr), $t0, $t1 + adcx $t0, $acc2 + adox $t1, $acc3 + mulx 8*3($a_ptr), $t0, $t1 + adcx $t0, $acc3 + adox $t1, $acc0 # of=0 + adcx %rax, 
$acc0 # cf=0 + + ################################# + mov $acc1, %rdx + mulx 8*4($a_ptr), %rdx, $t0 + + mulx 8*0($a_ptr), $t0, $t1 + adox $t0, $acc1 # guaranteed to be zero + adcx $t1, $acc2 + mulx 8*1($a_ptr), $t0, $t1 + adox $t0, $acc2 + adcx $t1, $acc3 + mulx 8*2($a_ptr), $t0, $t1 + adox $t0, $acc3 + adcx $t1, $acc0 + mulx 8*3($a_ptr), $t0, $t1 + adox $t0, $acc0 + adcx $t1, $acc1 # cf=0 + adox %rax, $acc1 # of=0 + + ################################# + mov $acc2, %rdx + mulx 8*4($a_ptr), %rdx, $t0 + + mulx 8*0($a_ptr), $t0, $t1 + adcx $t0, $acc2 # guaranteed to be zero + adox $t1, $acc3 + mulx 8*1($a_ptr), $t0, $t1 + adcx $t0, $acc3 + adox $t1, $acc0 + mulx 8*2($a_ptr), $t0, $t1 + adcx $t0, $acc0 + adox $t1, $acc1 + mulx 8*3($a_ptr), $t0, $t1 + adcx $t0, $acc1 + adox $t1, $acc2 # of=0 + adcx %rax, $acc2 # cf=0 + + ################################# + mov $acc3, %rdx + mulx 8*4($a_ptr), %rdx, $t0 + + mulx 8*0($a_ptr), $t0, $t1 + adox $t0, $acc3 # guaranteed to be zero + adcx $t1, $acc0 + mulx 8*1($a_ptr), $t0, $t1 + adox $t0, $acc0 + adcx $t1, $acc1 + mulx 8*2($a_ptr), $t0, $t1 + adox $t0, $acc1 + adcx $t1, $acc2 + mulx 8*3($a_ptr), $t0, $t1 + adox $t0, $acc2 + adcx $t1, $acc3 + adox %rax, $acc3 + + ################################# accumulate upper half + add $acc0, $acc4 # add $acc4, $acc0 + adc $acc5, $acc1 + mov $acc4, %rdx + adc $acc6, $acc2 + adc $acc7, $acc3 + mov $acc1, $acc6 + adc \$0, %rax + + ################################# compare to modulus + sub 8*0($a_ptr), $acc4 + mov $acc2, $acc7 + sbb 8*1($a_ptr), $acc1 + sbb 8*2($a_ptr), $acc2 + mov $acc3, $acc0 + sbb 8*3($a_ptr), $acc3 + sbb \$0, %rax + + cmovnc $acc4, %rdx + cmovnc $acc1, $acc6 + cmovnc $acc2, $acc7 + cmovnc $acc3, $acc0 + + dec $b_ptr + jnz .Loop_ord_sqrx + + mov %rdx, 8*0($r_ptr) + mov $acc6, 8*1($r_ptr) + pxor %xmm1, %xmm1 + mov $acc7, 8*2($r_ptr) + pxor %xmm2, %xmm2 + mov $acc0, 8*3($r_ptr) + pxor %xmm3, %xmm3 + + mov 0(%rsp),%r15 +.cfi_restore %r15 + mov 8(%rsp),%r14 +.cfi_restore %r14 + mov 16(%rsp),%r13 +.cfi_restore %r13 + mov 24(%rsp),%r12 +.cfi_restore %r12 + mov 32(%rsp),%rbx +.cfi_restore %rbx + mov 40(%rsp),%rbp +.cfi_restore %rbp + lea 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lord_sqrx_epilogue: + ret +.cfi_endproc +.size ecp_nistz256_ord_sqr_mont_adx,.-ecp_nistz256_ord_sqr_mont_adx +___ + +$code.=<<___; +################################################################################ +# void ecp_nistz256_mul_mont( +# uint64_t res[4], +# uint64_t a[4], +# uint64_t b[4]); + +.globl ecp_nistz256_mul_mont_nohw +.type ecp_nistz256_mul_mont_nohw,\@function,3 +.align 32 +ecp_nistz256_mul_mont_nohw: +.cfi_startproc + _CET_ENDBR + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 +.Lmul_body: + mov $b_org, $b_ptr + mov 8*0($b_org), %rax + mov 8*0($a_ptr), $acc1 + mov 8*1($a_ptr), $acc2 + mov 8*2($a_ptr), $acc3 + mov 8*3($a_ptr), $acc4 + + call __ecp_nistz256_mul_montq + + mov 0(%rsp),%r15 +.cfi_restore %r15 + mov 8(%rsp),%r14 +.cfi_restore %r14 + mov 16(%rsp),%r13 +.cfi_restore %r13 + mov 24(%rsp),%r12 +.cfi_restore %r12 + mov 32(%rsp),%rbx +.cfi_restore %rbx + mov 40(%rsp),%rbp +.cfi_restore %rbp + lea 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lmul_epilogue: + ret +.cfi_endproc +.size ecp_nistz256_mul_mont_nohw,.-ecp_nistz256_mul_mont_nohw + +.type __ecp_nistz256_mul_montq,\@abi-omnipotent +.align 32 +__ecp_nistz256_mul_montq: +.cfi_startproc + 
######################################################################## + # Multiply a by b[0] + mov %rax, $t1 + mulq $acc1 + mov .Lpoly+8*1(%rip),$poly1 + mov %rax, $acc0 + mov $t1, %rax + mov %rdx, $acc1 + + mulq $acc2 + mov .Lpoly+8*3(%rip),$poly3 + add %rax, $acc1 + mov $t1, %rax + adc \$0, %rdx + mov %rdx, $acc2 + + mulq $acc3 + add %rax, $acc2 + mov $t1, %rax + adc \$0, %rdx + mov %rdx, $acc3 + + mulq $acc4 + add %rax, $acc3 + mov $acc0, %rax + adc \$0, %rdx + xor $acc5, $acc5 + mov %rdx, $acc4 + + ######################################################################## + # First reduction step + # Basically now we want to multiply acc[0] by p256, + # and add the result to the acc. + # Due to the special form of p256 we do some optimizations + # + # acc[0] x p256[0..1] = acc[0] x 2^96 - acc[0] + # then we add acc[0] and get acc[0] x 2^96 + + mov $acc0, $t1 + shl \$32, $acc0 + mulq $poly3 + shr \$32, $t1 + add $acc0, $acc1 # +=acc[0]<<96 + adc $t1, $acc2 + adc %rax, $acc3 + mov 8*1($b_ptr), %rax + adc %rdx, $acc4 + adc \$0, $acc5 + xor $acc0, $acc0 + + ######################################################################## + # Multiply by b[1] + mov %rax, $t1 + mulq 8*0($a_ptr) + add %rax, $acc1 + mov $t1, %rax + adc \$0, %rdx + mov %rdx, $t0 + + mulq 8*1($a_ptr) + add $t0, $acc2 + adc \$0, %rdx + add %rax, $acc2 + mov $t1, %rax + adc \$0, %rdx + mov %rdx, $t0 + + mulq 8*2($a_ptr) + add $t0, $acc3 + adc \$0, %rdx + add %rax, $acc3 + mov $t1, %rax + adc \$0, %rdx + mov %rdx, $t0 + + mulq 8*3($a_ptr) + add $t0, $acc4 + adc \$0, %rdx + add %rax, $acc4 + mov $acc1, %rax + adc %rdx, $acc5 + adc \$0, $acc0 + + ######################################################################## + # Second reduction step + mov $acc1, $t1 + shl \$32, $acc1 + mulq $poly3 + shr \$32, $t1 + add $acc1, $acc2 + adc $t1, $acc3 + adc %rax, $acc4 + mov 8*2($b_ptr), %rax + adc %rdx, $acc5 + adc \$0, $acc0 + xor $acc1, $acc1 + + ######################################################################## + # Multiply by b[2] + mov %rax, $t1 + mulq 8*0($a_ptr) + add %rax, $acc2 + mov $t1, %rax + adc \$0, %rdx + mov %rdx, $t0 + + mulq 8*1($a_ptr) + add $t0, $acc3 + adc \$0, %rdx + add %rax, $acc3 + mov $t1, %rax + adc \$0, %rdx + mov %rdx, $t0 + + mulq 8*2($a_ptr) + add $t0, $acc4 + adc \$0, %rdx + add %rax, $acc4 + mov $t1, %rax + adc \$0, %rdx + mov %rdx, $t0 + + mulq 8*3($a_ptr) + add $t0, $acc5 + adc \$0, %rdx + add %rax, $acc5 + mov $acc2, %rax + adc %rdx, $acc0 + adc \$0, $acc1 + + ######################################################################## + # Third reduction step + mov $acc2, $t1 + shl \$32, $acc2 + mulq $poly3 + shr \$32, $t1 + add $acc2, $acc3 + adc $t1, $acc4 + adc %rax, $acc5 + mov 8*3($b_ptr), %rax + adc %rdx, $acc0 + adc \$0, $acc1 + xor $acc2, $acc2 + + ######################################################################## + # Multiply by b[3] + mov %rax, $t1 + mulq 8*0($a_ptr) + add %rax, $acc3 + mov $t1, %rax + adc \$0, %rdx + mov %rdx, $t0 + + mulq 8*1($a_ptr) + add $t0, $acc4 + adc \$0, %rdx + add %rax, $acc4 + mov $t1, %rax + adc \$0, %rdx + mov %rdx, $t0 + + mulq 8*2($a_ptr) + add $t0, $acc5 + adc \$0, %rdx + add %rax, $acc5 + mov $t1, %rax + adc \$0, %rdx + mov %rdx, $t0 + + mulq 8*3($a_ptr) + add $t0, $acc0 + adc \$0, %rdx + add %rax, $acc0 + mov $acc3, %rax + adc %rdx, $acc1 + adc \$0, $acc2 + + ######################################################################## + # Final reduction step + mov $acc3, $t1 + shl \$32, $acc3 + mulq $poly3 + shr \$32, $t1 + add $acc3, $acc4 + adc 
$t1, $acc5 + mov $acc4, $t0 + adc %rax, $acc0 + adc %rdx, $acc1 + mov $acc5, $t1 + adc \$0, $acc2 + + ######################################################################## + # Branch-less conditional subtraction of P + sub \$-1, $acc4 # .Lpoly[0] + mov $acc0, $t2 + sbb $poly1, $acc5 # .Lpoly[1] + sbb \$0, $acc0 # .Lpoly[2] + mov $acc1, $t3 + sbb $poly3, $acc1 # .Lpoly[3] + sbb \$0, $acc2 + + cmovc $t0, $acc4 + cmovc $t1, $acc5 + mov $acc4, 8*0($r_ptr) + cmovc $t2, $acc0 + mov $acc5, 8*1($r_ptr) + cmovc $t3, $acc1 + mov $acc0, 8*2($r_ptr) + mov $acc1, 8*3($r_ptr) + + ret +.cfi_endproc +.size __ecp_nistz256_mul_montq,.-__ecp_nistz256_mul_montq + +################################################################################ +# void ecp_nistz256_sqr_mont( +# uint64_t res[4], +# uint64_t a[4]); + +# we optimize the square according to S.Gueron and V.Krasnov, +# "Speeding up Big-Number Squaring" +.globl ecp_nistz256_sqr_mont_nohw +.type ecp_nistz256_sqr_mont_nohw,\@function,2 +.align 32 +ecp_nistz256_sqr_mont_nohw: +.cfi_startproc + _CET_ENDBR + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 +.Lsqr_body: + mov 8*0($a_ptr), %rax + mov 8*1($a_ptr), $acc6 + mov 8*2($a_ptr), $acc7 + mov 8*3($a_ptr), $acc0 + + call __ecp_nistz256_sqr_montq + + mov 0(%rsp),%r15 +.cfi_restore %r15 + mov 8(%rsp),%r14 +.cfi_restore %r14 + mov 16(%rsp),%r13 +.cfi_restore %r13 + mov 24(%rsp),%r12 +.cfi_restore %r12 + mov 32(%rsp),%rbx +.cfi_restore %rbx + mov 40(%rsp),%rbp +.cfi_restore %rbp + lea 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lsqr_epilogue: + ret +.cfi_endproc +.size ecp_nistz256_sqr_mont_nohw,.-ecp_nistz256_sqr_mont_nohw + +.type __ecp_nistz256_sqr_montq,\@abi-omnipotent +.align 32 +__ecp_nistz256_sqr_montq: +.cfi_startproc + mov %rax, $acc5 + mulq $acc6 # a[1]*a[0] + mov %rax, $acc1 + mov $acc7, %rax + mov %rdx, $acc2 + + mulq $acc5 # a[0]*a[2] + add %rax, $acc2 + mov $acc0, %rax + adc \$0, %rdx + mov %rdx, $acc3 + + mulq $acc5 # a[0]*a[3] + add %rax, $acc3 + mov $acc7, %rax + adc \$0, %rdx + mov %rdx, $acc4 + + ################################# + mulq $acc6 # a[1]*a[2] + add %rax, $acc3 + mov $acc0, %rax + adc \$0, %rdx + mov %rdx, $t1 + + mulq $acc6 # a[1]*a[3] + add %rax, $acc4 + mov $acc0, %rax + adc \$0, %rdx + add $t1, $acc4 + mov %rdx, $acc5 + adc \$0, $acc5 + + ################################# + mulq $acc7 # a[2]*a[3] + xor $acc7, $acc7 + add %rax, $acc5 + mov 8*0($a_ptr), %rax + mov %rdx, $acc6 + adc \$0, $acc6 + + add $acc1, $acc1 # acc1:6<<1 + adc $acc2, $acc2 + adc $acc3, $acc3 + adc $acc4, $acc4 + adc $acc5, $acc5 + adc $acc6, $acc6 + adc \$0, $acc7 + + mulq %rax + mov %rax, $acc0 + mov 8*1($a_ptr), %rax + mov %rdx, $t0 + + mulq %rax + add $t0, $acc1 + adc %rax, $acc2 + mov 8*2($a_ptr), %rax + adc \$0, %rdx + mov %rdx, $t0 + + mulq %rax + add $t0, $acc3 + adc %rax, $acc4 + mov 8*3($a_ptr), %rax + adc \$0, %rdx + mov %rdx, $t0 + + mulq %rax + add $t0, $acc5 + adc %rax, $acc6 + mov $acc0, %rax + adc %rdx, $acc7 + + mov .Lpoly+8*1(%rip), $a_ptr + mov .Lpoly+8*3(%rip), $t1 + + ########################################## + # Now the reduction + # First iteration + mov $acc0, $t0 + shl \$32, $acc0 + mulq $t1 + shr \$32, $t0 + add $acc0, $acc1 # +=acc[0]<<96 + adc $t0, $acc2 + adc %rax, $acc3 + mov $acc1, %rax + adc \$0, %rdx + + ########################################## + # Second iteration + mov $acc1, $t0 + shl \$32, $acc1 + mov %rdx, $acc0 + mulq $t1 + shr \$32, $t0 + add 
$acc1, $acc2 + adc $t0, $acc3 + adc %rax, $acc0 + mov $acc2, %rax + adc \$0, %rdx + + ########################################## + # Third iteration + mov $acc2, $t0 + shl \$32, $acc2 + mov %rdx, $acc1 + mulq $t1 + shr \$32, $t0 + add $acc2, $acc3 + adc $t0, $acc0 + adc %rax, $acc1 + mov $acc3, %rax + adc \$0, %rdx + + ########################################### + # Last iteration + mov $acc3, $t0 + shl \$32, $acc3 + mov %rdx, $acc2 + mulq $t1 + shr \$32, $t0 + add $acc3, $acc0 + adc $t0, $acc1 + adc %rax, $acc2 + adc \$0, %rdx + xor $acc3, $acc3 + + ############################################ + # Add the rest of the acc + add $acc0, $acc4 + adc $acc1, $acc5 + mov $acc4, $acc0 + adc $acc2, $acc6 + adc %rdx, $acc7 + mov $acc5, $acc1 + adc \$0, $acc3 + + sub \$-1, $acc4 # .Lpoly[0] + mov $acc6, $acc2 + sbb $a_ptr, $acc5 # .Lpoly[1] + sbb \$0, $acc6 # .Lpoly[2] + mov $acc7, $t0 + sbb $t1, $acc7 # .Lpoly[3] + sbb \$0, $acc3 + + cmovc $acc0, $acc4 + cmovc $acc1, $acc5 + mov $acc4, 8*0($r_ptr) + cmovc $acc2, $acc6 + mov $acc5, 8*1($r_ptr) + cmovc $t0, $acc7 + mov $acc6, 8*2($r_ptr) + mov $acc7, 8*3($r_ptr) + + ret +.cfi_endproc +.size __ecp_nistz256_sqr_montq,.-__ecp_nistz256_sqr_montq +___ + +if ($addx) { +$code.=<<___; +.globl ecp_nistz256_mul_mont_adx +.type ecp_nistz256_mul_mont_adx,\@function,3 +.align 32 +ecp_nistz256_mul_mont_adx: +.cfi_startproc + _CET_ENDBR + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 +.Lmulx_body: + mov $b_org, $b_ptr + mov 8*0($b_org), %rdx + mov 8*0($a_ptr), $acc1 + mov 8*1($a_ptr), $acc2 + mov 8*2($a_ptr), $acc3 + mov 8*3($a_ptr), $acc4 + lea -128($a_ptr), $a_ptr # control u-op density + + call __ecp_nistz256_mul_montx + + mov 0(%rsp),%r15 +.cfi_restore %r15 + mov 8(%rsp),%r14 +.cfi_restore %r14 + mov 16(%rsp),%r13 +.cfi_restore %r13 + mov 24(%rsp),%r12 +.cfi_restore %r12 + mov 32(%rsp),%rbx +.cfi_restore %rbx + mov 40(%rsp),%rbp +.cfi_restore %rbp + lea 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lmulx_epilogue: + ret +.cfi_endproc +.size ecp_nistz256_mul_mont_adx,.-ecp_nistz256_mul_mont_adx + +.type __ecp_nistz256_mul_montx,\@abi-omnipotent +.align 32 +__ecp_nistz256_mul_montx: +.cfi_startproc + ######################################################################## + # Multiply by b[0] + mulx $acc1, $acc0, $acc1 + mulx $acc2, $t0, $acc2 + mov \$32, $poly1 + xor $acc5, $acc5 # cf=0 + mulx $acc3, $t1, $acc3 + mov .Lpoly+8*3(%rip), $poly3 + adc $t0, $acc1 + mulx $acc4, $t0, $acc4 + mov $acc0, %rdx + adc $t1, $acc2 + shlx $poly1,$acc0,$t1 + adc $t0, $acc3 + shrx $poly1,$acc0,$t0 + adc \$0, $acc4 + + ######################################################################## + # First reduction step + add $t1, $acc1 + adc $t0, $acc2 + + mulx $poly3, $t0, $t1 + mov 8*1($b_ptr), %rdx + adc $t0, $acc3 + adc $t1, $acc4 + adc \$0, $acc5 + xor $acc0, $acc0 # $acc0=0,cf=0,of=0 + + ######################################################################## + # Multiply by b[1] + mulx 8*0+128($a_ptr), $t0, $t1 + adcx $t0, $acc1 + adox $t1, $acc2 + + mulx 8*1+128($a_ptr), $t0, $t1 + adcx $t0, $acc2 + adox $t1, $acc3 + + mulx 8*2+128($a_ptr), $t0, $t1 + adcx $t0, $acc3 + adox $t1, $acc4 + + mulx 8*3+128($a_ptr), $t0, $t1 + mov $acc1, %rdx + adcx $t0, $acc4 + shlx $poly1, $acc1, $t0 + adox $t1, $acc5 + shrx $poly1, $acc1, $t1 + + adcx $acc0, $acc5 + adox $acc0, $acc0 + adc \$0, $acc0 + + ######################################################################## + # 
Second reduction step + add $t0, $acc2 + adc $t1, $acc3 + + mulx $poly3, $t0, $t1 + mov 8*2($b_ptr), %rdx + adc $t0, $acc4 + adc $t1, $acc5 + adc \$0, $acc0 + xor $acc1 ,$acc1 # $acc1=0,cf=0,of=0 + + ######################################################################## + # Multiply by b[2] + mulx 8*0+128($a_ptr), $t0, $t1 + adcx $t0, $acc2 + adox $t1, $acc3 + + mulx 8*1+128($a_ptr), $t0, $t1 + adcx $t0, $acc3 + adox $t1, $acc4 + + mulx 8*2+128($a_ptr), $t0, $t1 + adcx $t0, $acc4 + adox $t1, $acc5 + + mulx 8*3+128($a_ptr), $t0, $t1 + mov $acc2, %rdx + adcx $t0, $acc5 + shlx $poly1, $acc2, $t0 + adox $t1, $acc0 + shrx $poly1, $acc2, $t1 + + adcx $acc1, $acc0 + adox $acc1, $acc1 + adc \$0, $acc1 + + ######################################################################## + # Third reduction step + add $t0, $acc3 + adc $t1, $acc4 + + mulx $poly3, $t0, $t1 + mov 8*3($b_ptr), %rdx + adc $t0, $acc5 + adc $t1, $acc0 + adc \$0, $acc1 + xor $acc2, $acc2 # $acc2=0,cf=0,of=0 + + ######################################################################## + # Multiply by b[3] + mulx 8*0+128($a_ptr), $t0, $t1 + adcx $t0, $acc3 + adox $t1, $acc4 + + mulx 8*1+128($a_ptr), $t0, $t1 + adcx $t0, $acc4 + adox $t1, $acc5 + + mulx 8*2+128($a_ptr), $t0, $t1 + adcx $t0, $acc5 + adox $t1, $acc0 + + mulx 8*3+128($a_ptr), $t0, $t1 + mov $acc3, %rdx + adcx $t0, $acc0 + shlx $poly1, $acc3, $t0 + adox $t1, $acc1 + shrx $poly1, $acc3, $t1 + + adcx $acc2, $acc1 + adox $acc2, $acc2 + adc \$0, $acc2 + + ######################################################################## + # Fourth reduction step + add $t0, $acc4 + adc $t1, $acc5 + + mulx $poly3, $t0, $t1 + mov $acc4, $t2 + mov .Lpoly+8*1(%rip), $poly1 + adc $t0, $acc0 + mov $acc5, $t3 + adc $t1, $acc1 + adc \$0, $acc2 + + ######################################################################## + # Branch-less conditional subtraction of P + xor %eax, %eax + mov $acc0, $t0 + sbb \$-1, $acc4 # .Lpoly[0] + sbb $poly1, $acc5 # .Lpoly[1] + sbb \$0, $acc0 # .Lpoly[2] + mov $acc1, $t1 + sbb $poly3, $acc1 # .Lpoly[3] + sbb \$0, $acc2 + + cmovc $t2, $acc4 + cmovc $t3, $acc5 + mov $acc4, 8*0($r_ptr) + cmovc $t0, $acc0 + mov $acc5, 8*1($r_ptr) + cmovc $t1, $acc1 + mov $acc0, 8*2($r_ptr) + mov $acc1, 8*3($r_ptr) + + ret +.cfi_endproc +.size __ecp_nistz256_mul_montx,.-__ecp_nistz256_mul_montx + +.globl ecp_nistz256_sqr_mont_adx +.type ecp_nistz256_sqr_mont_adx,\@function,2 +.align 32 +ecp_nistz256_sqr_mont_adx: +.cfi_startproc + _CET_ENDBR + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 +.Lsqrx_body: + mov 8*0($a_ptr), %rdx + mov 8*1($a_ptr), $acc6 + mov 8*2($a_ptr), $acc7 + mov 8*3($a_ptr), $acc0 + lea -128($a_ptr), $a_ptr # control u-op density + + call __ecp_nistz256_sqr_montx + + mov 0(%rsp),%r15 +.cfi_restore %r15 + mov 8(%rsp),%r14 +.cfi_restore %r14 + mov 16(%rsp),%r13 +.cfi_restore %r13 + mov 24(%rsp),%r12 +.cfi_restore %r12 + mov 32(%rsp),%rbx +.cfi_restore %rbx + mov 40(%rsp),%rbp +.cfi_restore %rbp + lea 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lsqrx_epilogue: + ret +.cfi_endproc +.size ecp_nistz256_sqr_mont_adx,.-ecp_nistz256_sqr_mont_adx + +.type __ecp_nistz256_sqr_montx,\@abi-omnipotent +.align 32 +__ecp_nistz256_sqr_montx: +.cfi_startproc + mulx $acc6, $acc1, $acc2 # a[0]*a[1] + mulx $acc7, $t0, $acc3 # a[0]*a[2] + xor %eax, %eax + adc $t0, $acc2 + mulx $acc0, $t1, $acc4 # a[0]*a[3] + mov $acc6, %rdx + adc $t1, $acc3 + adc \$0, $acc4 + xor $acc5, 
$acc5 # $acc5=0,cf=0,of=0 + + ################################# + mulx $acc7, $t0, $t1 # a[1]*a[2] + adcx $t0, $acc3 + adox $t1, $acc4 + + mulx $acc0, $t0, $t1 # a[1]*a[3] + mov $acc7, %rdx + adcx $t0, $acc4 + adox $t1, $acc5 + adc \$0, $acc5 + + ################################# + mulx $acc0, $t0, $acc6 # a[2]*a[3] + mov 8*0+128($a_ptr), %rdx + xor $acc7, $acc7 # $acc7=0,cf=0,of=0 + adcx $acc1, $acc1 # acc1:6<<1 + adox $t0, $acc5 + adcx $acc2, $acc2 + adox $acc7, $acc6 # of=0 + + mulx %rdx, $acc0, $t1 + mov 8*1+128($a_ptr), %rdx + adcx $acc3, $acc3 + adox $t1, $acc1 + adcx $acc4, $acc4 + mulx %rdx, $t0, $t4 + mov 8*2+128($a_ptr), %rdx + adcx $acc5, $acc5 + adox $t0, $acc2 + adcx $acc6, $acc6 + .byte 0x67 + mulx %rdx, $t0, $t1 + mov 8*3+128($a_ptr), %rdx + adox $t4, $acc3 + adcx $acc7, $acc7 + adox $t0, $acc4 + mov \$32, $a_ptr + adox $t1, $acc5 + .byte 0x67,0x67 + mulx %rdx, $t0, $t4 + mov .Lpoly+8*3(%rip), %rdx + adox $t0, $acc6 + shlx $a_ptr, $acc0, $t0 + adox $t4, $acc7 + shrx $a_ptr, $acc0, $t4 + mov %rdx,$t1 + + # reduction step 1 + add $t0, $acc1 + adc $t4, $acc2 + + mulx $acc0, $t0, $acc0 + adc $t0, $acc3 + shlx $a_ptr, $acc1, $t0 + adc \$0, $acc0 + shrx $a_ptr, $acc1, $t4 + + # reduction step 2 + add $t0, $acc2 + adc $t4, $acc3 + + mulx $acc1, $t0, $acc1 + adc $t0, $acc0 + shlx $a_ptr, $acc2, $t0 + adc \$0, $acc1 + shrx $a_ptr, $acc2, $t4 + + # reduction step 3 + add $t0, $acc3 + adc $t4, $acc0 + + mulx $acc2, $t0, $acc2 + adc $t0, $acc1 + shlx $a_ptr, $acc3, $t0 + adc \$0, $acc2 + shrx $a_ptr, $acc3, $t4 + + # reduction step 4 + add $t0, $acc0 + adc $t4, $acc1 + + mulx $acc3, $t0, $acc3 + adc $t0, $acc2 + adc \$0, $acc3 + + xor $t3, $t3 + add $acc0, $acc4 # accumulate upper half + mov .Lpoly+8*1(%rip), $a_ptr + adc $acc1, $acc5 + mov $acc4, $acc0 + adc $acc2, $acc6 + adc $acc3, $acc7 + mov $acc5, $acc1 + adc \$0, $t3 + + sub \$-1, $acc4 # .Lpoly[0] + mov $acc6, $acc2 + sbb $a_ptr, $acc5 # .Lpoly[1] + sbb \$0, $acc6 # .Lpoly[2] + mov $acc7, $acc3 + sbb $t1, $acc7 # .Lpoly[3] + sbb \$0, $t3 + + cmovc $acc0, $acc4 + cmovc $acc1, $acc5 + mov $acc4, 8*0($r_ptr) + cmovc $acc2, $acc6 + mov $acc5, 8*1($r_ptr) + cmovc $acc3, $acc7 + mov $acc6, 8*2($r_ptr) + mov $acc7, 8*3($r_ptr) + + ret +.cfi_endproc +.size __ecp_nistz256_sqr_montx,.-__ecp_nistz256_sqr_montx +___ +} +} +{ +my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx"); +my ($ONE,$INDEX,$Ra,$Rb,$Rc,$Rd,$Re,$Rf)=map("%xmm$_",(0..7)); +my ($M0,$T0a,$T0b,$T0c,$T0d,$T0e,$T0f,$TMP0)=map("%xmm$_",(8..15)); +my ($M1,$T2a,$T2b,$TMP2,$M2,$T2a,$T2b,$TMP2)=map("%xmm$_",(8..15)); + +$code.=<<___; +################################################################################ +# void ecp_nistz256_select_w5_nohw(uint64_t *val, uint64_t *in_t, int index); +.globl ecp_nistz256_select_w5_nohw +.type ecp_nistz256_select_w5_nohw,\@abi-omnipotent +.align 32 +ecp_nistz256_select_w5_nohw: +.cfi_startproc + _CET_ENDBR +___ +$code.=<<___ if ($win64); + lea -0x88(%rsp), %rax +.LSEH_begin_ecp_nistz256_select_w5_nohw: + .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax), %rsp + .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6, -0x20(%rax) + .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7, -0x10(%rax) + .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8, 0(%rax) + .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9, 0x10(%rax) + .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10, 0x20(%rax) + .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11, 0x30(%rax) + .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12, 0x40(%rax) + .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13, 0x50(%rax) + .byte 
0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14, 0x60(%rax) + .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15, 0x70(%rax) +___ +$code.=<<___; + movdqa .LOne(%rip), $ONE + movd $index, $INDEX + + pxor $Ra, $Ra + pxor $Rb, $Rb + pxor $Rc, $Rc + pxor $Rd, $Rd + pxor $Re, $Re + pxor $Rf, $Rf + + movdqa $ONE, $M0 + pshufd \$0, $INDEX, $INDEX + + mov \$16, %rax +.Lselect_loop_sse_w5: + + movdqa $M0, $TMP0 + paddd $ONE, $M0 + pcmpeqd $INDEX, $TMP0 + + movdqa 16*0($in_t), $T0a + movdqa 16*1($in_t), $T0b + movdqa 16*2($in_t), $T0c + movdqa 16*3($in_t), $T0d + movdqa 16*4($in_t), $T0e + movdqa 16*5($in_t), $T0f + lea 16*6($in_t), $in_t + + pand $TMP0, $T0a + pand $TMP0, $T0b + por $T0a, $Ra + pand $TMP0, $T0c + por $T0b, $Rb + pand $TMP0, $T0d + por $T0c, $Rc + pand $TMP0, $T0e + por $T0d, $Rd + pand $TMP0, $T0f + por $T0e, $Re + por $T0f, $Rf + + dec %rax + jnz .Lselect_loop_sse_w5 + + movdqu $Ra, 16*0($val) + movdqu $Rb, 16*1($val) + movdqu $Rc, 16*2($val) + movdqu $Rd, 16*3($val) + movdqu $Re, 16*4($val) + movdqu $Rf, 16*5($val) +___ +$code.=<<___ if ($win64); + movaps (%rsp), %xmm6 + movaps 0x10(%rsp), %xmm7 + movaps 0x20(%rsp), %xmm8 + movaps 0x30(%rsp), %xmm9 + movaps 0x40(%rsp), %xmm10 + movaps 0x50(%rsp), %xmm11 + movaps 0x60(%rsp), %xmm12 + movaps 0x70(%rsp), %xmm13 + movaps 0x80(%rsp), %xmm14 + movaps 0x90(%rsp), %xmm15 + lea 0xa8(%rsp), %rsp +___ +$code.=<<___; + ret +.cfi_endproc +.LSEH_end_ecp_nistz256_select_w5_nohw: +.size ecp_nistz256_select_w5_nohw,.-ecp_nistz256_select_w5_nohw + +################################################################################ +# void ecp_nistz256_select_w7_nohw(uint64_t *val, uint64_t *in_t, int index); +.globl ecp_nistz256_select_w7_nohw +.type ecp_nistz256_select_w7_nohw,\@abi-omnipotent +.align 32 +ecp_nistz256_select_w7_nohw: +.cfi_startproc + _CET_ENDBR +___ +$code.=<<___ if ($win64); + lea -0x88(%rsp), %rax +.LSEH_begin_ecp_nistz256_select_w7_nohw: + .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax), %rsp + .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6, -0x20(%rax) + .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7, -0x10(%rax) + .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8, 0(%rax) + .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9, 0x10(%rax) + .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10, 0x20(%rax) + .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11, 0x30(%rax) + .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12, 0x40(%rax) + .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13, 0x50(%rax) + .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14, 0x60(%rax) + .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15, 0x70(%rax) +___ +$code.=<<___; + movdqa .LOne(%rip), $M0 + movd $index, $INDEX + + pxor $Ra, $Ra + pxor $Rb, $Rb + pxor $Rc, $Rc + pxor $Rd, $Rd + + movdqa $M0, $ONE + pshufd \$0, $INDEX, $INDEX + mov \$64, %rax + +.Lselect_loop_sse_w7: + movdqa $M0, $TMP0 + paddd $ONE, $M0 + movdqa 16*0($in_t), $T0a + movdqa 16*1($in_t), $T0b + pcmpeqd $INDEX, $TMP0 + movdqa 16*2($in_t), $T0c + movdqa 16*3($in_t), $T0d + lea 16*4($in_t), $in_t + + pand $TMP0, $T0a + pand $TMP0, $T0b + por $T0a, $Ra + pand $TMP0, $T0c + por $T0b, $Rb + pand $TMP0, $T0d + por $T0c, $Rc + prefetcht0 255($in_t) + por $T0d, $Rd + + dec %rax + jnz .Lselect_loop_sse_w7 + + movdqu $Ra, 16*0($val) + movdqu $Rb, 16*1($val) + movdqu $Rc, 16*2($val) + movdqu $Rd, 16*3($val) +___ +$code.=<<___ if ($win64); + movaps (%rsp), %xmm6 + movaps 0x10(%rsp), %xmm7 + movaps 0x20(%rsp), %xmm8 + movaps 0x30(%rsp), %xmm9 + movaps 0x40(%rsp), %xmm10 + movaps 0x50(%rsp), %xmm11 + movaps 0x60(%rsp), %xmm12 + movaps 0x70(%rsp), %xmm13 + movaps 0x80(%rsp), %xmm14 
+ movaps 0x90(%rsp), %xmm15 + lea 0xa8(%rsp), %rsp +___ +$code.=<<___; + ret +.cfi_endproc +.LSEH_end_ecp_nistz256_select_w7_nohw: +.size ecp_nistz256_select_w7_nohw,.-ecp_nistz256_select_w7_nohw +___ +} +if ($avx>1) { +my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx"); +my ($TWO,$INDEX,$Ra,$Rb,$Rc)=map("%ymm$_",(0..4)); +my ($M0,$T0a,$T0b,$T0c,$TMP0)=map("%ymm$_",(5..9)); +my ($M1,$T1a,$T1b,$T1c,$TMP1)=map("%ymm$_",(10..14)); + +$code.=<<___; +################################################################################ +# void ecp_nistz256_select_w5_avx2(uint64_t *val, uint64_t *in_t, int index); +.globl ecp_nistz256_select_w5_avx2 +.type ecp_nistz256_select_w5_avx2,\@abi-omnipotent +.align 32 +ecp_nistz256_select_w5_avx2: +.cfi_startproc + _CET_ENDBR + vzeroupper +___ +$code.=<<___ if ($win64); + lea -0x88(%rsp), %rax + mov %rsp,%r11 +.LSEH_begin_ecp_nistz256_select_w5_avx2: + .byte 0x48,0x8d,0x60,0xe0 # lea -0x20(%rax), %rsp + .byte 0xc5,0xf8,0x29,0x70,0xe0 # vmovaps %xmm6, -0x20(%rax) + .byte 0xc5,0xf8,0x29,0x78,0xf0 # vmovaps %xmm7, -0x10(%rax) + .byte 0xc5,0x78,0x29,0x40,0x00 # vmovaps %xmm8, 8(%rax) + .byte 0xc5,0x78,0x29,0x48,0x10 # vmovaps %xmm9, 0x10(%rax) + .byte 0xc5,0x78,0x29,0x50,0x20 # vmovaps %xmm10, 0x20(%rax) + .byte 0xc5,0x78,0x29,0x58,0x30 # vmovaps %xmm11, 0x30(%rax) + .byte 0xc5,0x78,0x29,0x60,0x40 # vmovaps %xmm12, 0x40(%rax) + .byte 0xc5,0x78,0x29,0x68,0x50 # vmovaps %xmm13, 0x50(%rax) + .byte 0xc5,0x78,0x29,0x70,0x60 # vmovaps %xmm14, 0x60(%rax) + .byte 0xc5,0x78,0x29,0x78,0x70 # vmovaps %xmm15, 0x70(%rax) +___ +$code.=<<___; + vmovdqa .LTwo(%rip), $TWO + + vpxor $Ra, $Ra, $Ra + vpxor $Rb, $Rb, $Rb + vpxor $Rc, $Rc, $Rc + + vmovdqa .LOne(%rip), $M0 + vmovdqa .LTwo(%rip), $M1 + + vmovd $index, %xmm1 + vpermd $INDEX, $Ra, $INDEX + + mov \$8, %rax +.Lselect_loop_avx2_w5: + + vmovdqa 32*0($in_t), $T0a + vmovdqa 32*1($in_t), $T0b + vmovdqa 32*2($in_t), $T0c + + vmovdqa 32*3($in_t), $T1a + vmovdqa 32*4($in_t), $T1b + vmovdqa 32*5($in_t), $T1c + + vpcmpeqd $INDEX, $M0, $TMP0 + vpcmpeqd $INDEX, $M1, $TMP1 + + vpaddd $TWO, $M0, $M0 + vpaddd $TWO, $M1, $M1 + lea 32*6($in_t), $in_t + + vpand $TMP0, $T0a, $T0a + vpand $TMP0, $T0b, $T0b + vpand $TMP0, $T0c, $T0c + vpand $TMP1, $T1a, $T1a + vpand $TMP1, $T1b, $T1b + vpand $TMP1, $T1c, $T1c + + vpxor $T0a, $Ra, $Ra + vpxor $T0b, $Rb, $Rb + vpxor $T0c, $Rc, $Rc + vpxor $T1a, $Ra, $Ra + vpxor $T1b, $Rb, $Rb + vpxor $T1c, $Rc, $Rc + + dec %rax + jnz .Lselect_loop_avx2_w5 + + vmovdqu $Ra, 32*0($val) + vmovdqu $Rb, 32*1($val) + vmovdqu $Rc, 32*2($val) + vzeroupper +___ +$code.=<<___ if ($win64); + movaps (%rsp), %xmm6 + movaps 0x10(%rsp), %xmm7 + movaps 0x20(%rsp), %xmm8 + movaps 0x30(%rsp), %xmm9 + movaps 0x40(%rsp), %xmm10 + movaps 0x50(%rsp), %xmm11 + movaps 0x60(%rsp), %xmm12 + movaps 0x70(%rsp), %xmm13 + movaps 0x80(%rsp), %xmm14 + movaps 0x90(%rsp), %xmm15 + lea (%r11), %rsp +___ +$code.=<<___; + ret +.cfi_endproc +.LSEH_end_ecp_nistz256_select_w5_avx2: +.size ecp_nistz256_select_w5_avx2,.-ecp_nistz256_select_w5_avx2 +___ +} +if ($avx>1) { +my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx"); +my ($THREE,$INDEX,$Ra,$Rb)=map("%ymm$_",(0..3)); +my ($M0,$T0a,$T0b,$TMP0)=map("%ymm$_",(4..7)); +my ($M1,$T1a,$T1b,$TMP1)=map("%ymm$_",(8..11)); +my ($M2,$T2a,$T2b,$TMP2)=map("%ymm$_",(12..15)); + +$code.=<<___; + +################################################################################ +# void ecp_nistz256_select_w7_avx2(uint64_t *val, uint64_t *in_t, int index); +.globl 
ecp_nistz256_select_w7_avx2 +.type ecp_nistz256_select_w7_avx2,\@abi-omnipotent +.align 32 +ecp_nistz256_select_w7_avx2: +.cfi_startproc + _CET_ENDBR + vzeroupper +___ +$code.=<<___ if ($win64); + mov %rsp,%r11 + lea -0x88(%rsp), %rax +.LSEH_begin_ecp_nistz256_select_w7_avx2: + .byte 0x48,0x8d,0x60,0xe0 # lea -0x20(%rax), %rsp + .byte 0xc5,0xf8,0x29,0x70,0xe0 # vmovaps %xmm6, -0x20(%rax) + .byte 0xc5,0xf8,0x29,0x78,0xf0 # vmovaps %xmm7, -0x10(%rax) + .byte 0xc5,0x78,0x29,0x40,0x00 # vmovaps %xmm8, 8(%rax) + .byte 0xc5,0x78,0x29,0x48,0x10 # vmovaps %xmm9, 0x10(%rax) + .byte 0xc5,0x78,0x29,0x50,0x20 # vmovaps %xmm10, 0x20(%rax) + .byte 0xc5,0x78,0x29,0x58,0x30 # vmovaps %xmm11, 0x30(%rax) + .byte 0xc5,0x78,0x29,0x60,0x40 # vmovaps %xmm12, 0x40(%rax) + .byte 0xc5,0x78,0x29,0x68,0x50 # vmovaps %xmm13, 0x50(%rax) + .byte 0xc5,0x78,0x29,0x70,0x60 # vmovaps %xmm14, 0x60(%rax) + .byte 0xc5,0x78,0x29,0x78,0x70 # vmovaps %xmm15, 0x70(%rax) +___ +$code.=<<___; + vmovdqa .LThree(%rip), $THREE + + vpxor $Ra, $Ra, $Ra + vpxor $Rb, $Rb, $Rb + + vmovdqa .LOne(%rip), $M0 + vmovdqa .LTwo(%rip), $M1 + vmovdqa .LThree(%rip), $M2 + + vmovd $index, %xmm1 + vpermd $INDEX, $Ra, $INDEX + # Skip index = 0, because it is implicitly the point at infinity + + mov \$21, %rax +.Lselect_loop_avx2_w7: + + vmovdqa 32*0($in_t), $T0a + vmovdqa 32*1($in_t), $T0b + + vmovdqa 32*2($in_t), $T1a + vmovdqa 32*3($in_t), $T1b + + vmovdqa 32*4($in_t), $T2a + vmovdqa 32*5($in_t), $T2b + + vpcmpeqd $INDEX, $M0, $TMP0 + vpcmpeqd $INDEX, $M1, $TMP1 + vpcmpeqd $INDEX, $M2, $TMP2 + + vpaddd $THREE, $M0, $M0 + vpaddd $THREE, $M1, $M1 + vpaddd $THREE, $M2, $M2 + lea 32*6($in_t), $in_t + + vpand $TMP0, $T0a, $T0a + vpand $TMP0, $T0b, $T0b + vpand $TMP1, $T1a, $T1a + vpand $TMP1, $T1b, $T1b + vpand $TMP2, $T2a, $T2a + vpand $TMP2, $T2b, $T2b + + vpxor $T0a, $Ra, $Ra + vpxor $T0b, $Rb, $Rb + vpxor $T1a, $Ra, $Ra + vpxor $T1b, $Rb, $Rb + vpxor $T2a, $Ra, $Ra + vpxor $T2b, $Rb, $Rb + + dec %rax + jnz .Lselect_loop_avx2_w7 + + + vmovdqa 32*0($in_t), $T0a + vmovdqa 32*1($in_t), $T0b + + vpcmpeqd $INDEX, $M0, $TMP0 + + vpand $TMP0, $T0a, $T0a + vpand $TMP0, $T0b, $T0b + + vpxor $T0a, $Ra, $Ra + vpxor $T0b, $Rb, $Rb + + vmovdqu $Ra, 32*0($val) + vmovdqu $Rb, 32*1($val) + vzeroupper +___ +$code.=<<___ if ($win64); + movaps (%rsp), %xmm6 + movaps 0x10(%rsp), %xmm7 + movaps 0x20(%rsp), %xmm8 + movaps 0x30(%rsp), %xmm9 + movaps 0x40(%rsp), %xmm10 + movaps 0x50(%rsp), %xmm11 + movaps 0x60(%rsp), %xmm12 + movaps 0x70(%rsp), %xmm13 + movaps 0x80(%rsp), %xmm14 + movaps 0x90(%rsp), %xmm15 + lea (%r11), %rsp +___ +$code.=<<___; + ret +.cfi_endproc +.LSEH_end_ecp_nistz256_select_w7_avx2: +.size ecp_nistz256_select_w7_avx2,.-ecp_nistz256_select_w7_avx2 +___ +} +{{{ +######################################################################## +# This block implements higher level point_double, point_add and +# point_add_affine. The key to performance in this case is to allow +# out-of-order execution logic to overlap computations from next step +# with tail processing from current step. By using tailored calling +# sequence we minimize inter-step overhead to give processor better +# shot at overlapping operations... +# +# You will notice that input data is copied to stack. Trouble is that +# there are no registers to spare for holding original pointers and +# reloading them, pointers, would create undesired dependencies on +# effective addresses calculation paths. In other words it's too done +# to favour out-of-order execution logic. 
+# + +my ($r_ptr,$a_ptr,$b_org,$b_ptr)=("%rdi","%rsi","%rdx","%rbx"); +my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("%r$_",(8..15)); +my ($t0,$t1,$t2,$t3,$t4)=("%rax","%rbp","%rcx",$acc4,$acc4); +my ($poly1,$poly3)=($acc6,$acc7); + +sub load_for_mul () { +my ($a,$b,$src0) = @_; +my $bias = $src0 eq "%rax" ? 0 : -128; + +" mov $b, $src0 + lea $b, $b_ptr + mov 8*0+$a, $acc1 + mov 8*1+$a, $acc2 + lea $bias+$a, $a_ptr + mov 8*2+$a, $acc3 + mov 8*3+$a, $acc4" +} + +sub load_for_sqr () { +my ($a,$src0) = @_; +my $bias = $src0 eq "%rax" ? 0 : -128; + +" mov 8*0+$a, $src0 + mov 8*1+$a, $acc6 + lea $bias+$a, $a_ptr + mov 8*2+$a, $acc7 + mov 8*3+$a, $acc0" +} + + { +######################################################################## +# operate in 4-5-0-1 "name space" that matches multiplication output +# +my ($a0,$a1,$a2,$a3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3); + +$code.=<<___; +.type __ecp_nistz256_add_toq,\@abi-omnipotent +.align 32 +__ecp_nistz256_add_toq: +.cfi_startproc + xor $t4,$t4 + add 8*0($b_ptr), $a0 + adc 8*1($b_ptr), $a1 + mov $a0, $t0 + adc 8*2($b_ptr), $a2 + adc 8*3($b_ptr), $a3 + mov $a1, $t1 + adc \$0, $t4 + + sub \$-1, $a0 + mov $a2, $t2 + sbb $poly1, $a1 + sbb \$0, $a2 + mov $a3, $t3 + sbb $poly3, $a3 + sbb \$0, $t4 + + cmovc $t0, $a0 + cmovc $t1, $a1 + mov $a0, 8*0($r_ptr) + cmovc $t2, $a2 + mov $a1, 8*1($r_ptr) + cmovc $t3, $a3 + mov $a2, 8*2($r_ptr) + mov $a3, 8*3($r_ptr) + + ret +.cfi_endproc +.size __ecp_nistz256_add_toq,.-__ecp_nistz256_add_toq + +.type __ecp_nistz256_sub_fromq,\@abi-omnipotent +.align 32 +__ecp_nistz256_sub_fromq: +.cfi_startproc + sub 8*0($b_ptr), $a0 + sbb 8*1($b_ptr), $a1 + mov $a0, $t0 + sbb 8*2($b_ptr), $a2 + sbb 8*3($b_ptr), $a3 + mov $a1, $t1 + sbb $t4, $t4 + + add \$-1, $a0 + mov $a2, $t2 + adc $poly1, $a1 + adc \$0, $a2 + mov $a3, $t3 + adc $poly3, $a3 + test $t4, $t4 + + cmovz $t0, $a0 + cmovz $t1, $a1 + mov $a0, 8*0($r_ptr) + cmovz $t2, $a2 + mov $a1, 8*1($r_ptr) + cmovz $t3, $a3 + mov $a2, 8*2($r_ptr) + mov $a3, 8*3($r_ptr) + + ret +.cfi_endproc +.size __ecp_nistz256_sub_fromq,.-__ecp_nistz256_sub_fromq + +.type __ecp_nistz256_subq,\@abi-omnipotent +.align 32 +__ecp_nistz256_subq: +.cfi_startproc + sub $a0, $t0 + sbb $a1, $t1 + mov $t0, $a0 + sbb $a2, $t2 + sbb $a3, $t3 + mov $t1, $a1 + sbb $t4, $t4 + + add \$-1, $t0 + mov $t2, $a2 + adc $poly1, $t1 + adc \$0, $t2 + mov $t3, $a3 + adc $poly3, $t3 + test $t4, $t4 + + cmovnz $t0, $a0 + cmovnz $t1, $a1 + cmovnz $t2, $a2 + cmovnz $t3, $a3 + + ret +.cfi_endproc +.size __ecp_nistz256_subq,.-__ecp_nistz256_subq + +.type __ecp_nistz256_mul_by_2q,\@abi-omnipotent +.align 32 +__ecp_nistz256_mul_by_2q: +.cfi_startproc + xor $t4, $t4 + add $a0, $a0 # a0:a3+a0:a3 + adc $a1, $a1 + mov $a0, $t0 + adc $a2, $a2 + adc $a3, $a3 + mov $a1, $t1 + adc \$0, $t4 + + sub \$-1, $a0 + mov $a2, $t2 + sbb $poly1, $a1 + sbb \$0, $a2 + mov $a3, $t3 + sbb $poly3, $a3 + sbb \$0, $t4 + + cmovc $t0, $a0 + cmovc $t1, $a1 + mov $a0, 8*0($r_ptr) + cmovc $t2, $a2 + mov $a1, 8*1($r_ptr) + cmovc $t3, $a3 + mov $a2, 8*2($r_ptr) + mov $a3, 8*3($r_ptr) + + ret +.cfi_endproc +.size __ecp_nistz256_mul_by_2q,.-__ecp_nistz256_mul_by_2q +___ + } +sub gen_double () { + my $x = shift; + my ($src0,$sfx,$bias); + my ($S,$M,$Zsqr,$in_x,$tmp0)=map(32*$_,(0..4)); + + if ($x ne "x") { + $src0 = "%rax"; + $sfx = "_nohw"; + $bias = 0; + } else { + $src0 = "%rdx"; + $sfx = "_adx"; + $bias = 128; + } +$code.=<<___; +.globl ecp_nistz256_point_double$sfx +.type ecp_nistz256_point_double$sfx,\@function,2 +.align 32 
+ecp_nistz256_point_double$sfx: +.cfi_startproc + _CET_ENDBR + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$32*5+8, %rsp +.cfi_adjust_cfa_offset 32*5+8 +.Lpoint_double${x}_body: + +.Lpoint_double_shortcut$x: + movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr.x + mov $a_ptr, $b_ptr # backup copy + movdqu 0x10($a_ptr), %xmm1 + mov 0x20+8*0($a_ptr), $acc4 # load in_y in "5-4-0-1" order + mov 0x20+8*1($a_ptr), $acc5 + mov 0x20+8*2($a_ptr), $acc0 + mov 0x20+8*3($a_ptr), $acc1 + mov .Lpoly+8*1(%rip), $poly1 + mov .Lpoly+8*3(%rip), $poly3 + movdqa %xmm0, $in_x(%rsp) + movdqa %xmm1, $in_x+0x10(%rsp) + lea 0x20($r_ptr), $acc2 + lea 0x40($r_ptr), $acc3 + movq $r_ptr, %xmm0 + movq $acc2, %xmm1 + movq $acc3, %xmm2 + + lea $S(%rsp), $r_ptr + call __ecp_nistz256_mul_by_2$x # p256_mul_by_2(S, in_y); + + mov 0x40+8*0($a_ptr), $src0 + mov 0x40+8*1($a_ptr), $acc6 + mov 0x40+8*2($a_ptr), $acc7 + mov 0x40+8*3($a_ptr), $acc0 + lea 0x40-$bias($a_ptr), $a_ptr + lea $Zsqr(%rsp), $r_ptr + call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Zsqr, in_z); + + `&load_for_sqr("$S(%rsp)", "$src0")` + lea $S(%rsp), $r_ptr + call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(S, S); + + mov 0x20($b_ptr), $src0 # $b_ptr is still valid + mov 0x40+8*0($b_ptr), $acc1 + mov 0x40+8*1($b_ptr), $acc2 + mov 0x40+8*2($b_ptr), $acc3 + mov 0x40+8*3($b_ptr), $acc4 + lea 0x40-$bias($b_ptr), $a_ptr + lea 0x20($b_ptr), $b_ptr + movq %xmm2, $r_ptr + call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, in_z, in_y); + call __ecp_nistz256_mul_by_2$x # p256_mul_by_2(res_z, res_z); + + mov $in_x+8*0(%rsp), $acc4 # "5-4-0-1" order + mov $in_x+8*1(%rsp), $acc5 + lea $Zsqr(%rsp), $b_ptr + mov $in_x+8*2(%rsp), $acc0 + mov $in_x+8*3(%rsp), $acc1 + lea $M(%rsp), $r_ptr + call __ecp_nistz256_add_to$x # p256_add(M, in_x, Zsqr); + + mov $in_x+8*0(%rsp), $acc4 # "5-4-0-1" order + mov $in_x+8*1(%rsp), $acc5 + lea $Zsqr(%rsp), $b_ptr + mov $in_x+8*2(%rsp), $acc0 + mov $in_x+8*3(%rsp), $acc1 + lea $Zsqr(%rsp), $r_ptr + call __ecp_nistz256_sub_from$x # p256_sub(Zsqr, in_x, Zsqr); + + `&load_for_sqr("$S(%rsp)", "$src0")` + movq %xmm1, $r_ptr + call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(res_y, S); +___ +{ +######## ecp_nistz256_div_by_2(res_y, res_y); ########################## +# operate in 4-5-6-7 "name space" that matches squaring output +# +my ($poly1,$poly3)=($a_ptr,$t1); +my ($a0,$a1,$a2,$a3,$t3,$t4,$t1)=($acc4,$acc5,$acc6,$acc7,$acc0,$acc1,$acc2); + +$code.=<<___; + xor $t4, $t4 + mov $a0, $t0 + add \$-1, $a0 + mov $a1, $t1 + adc $poly1, $a1 + mov $a2, $t2 + adc \$0, $a2 + mov $a3, $t3 + adc $poly3, $a3 + adc \$0, $t4 + xor $a_ptr, $a_ptr # borrow $a_ptr + test \$1, $t0 + + cmovz $t0, $a0 + cmovz $t1, $a1 + cmovz $t2, $a2 + cmovz $t3, $a3 + cmovz $a_ptr, $t4 + + mov $a1, $t0 # a0:a3>>1 + shr \$1, $a0 + shl \$63, $t0 + mov $a2, $t1 + shr \$1, $a1 + or $t0, $a0 + shl \$63, $t1 + mov $a3, $t2 + shr \$1, $a2 + or $t1, $a1 + shl \$63, $t2 + mov $a0, 8*0($r_ptr) + shr \$1, $a3 + mov $a1, 8*1($r_ptr) + shl \$63, $t4 + or $t2, $a2 + or $t4, $a3 + mov $a2, 8*2($r_ptr) + mov $a3, 8*3($r_ptr) +___ +} +$code.=<<___; + `&load_for_mul("$M(%rsp)", "$Zsqr(%rsp)", "$src0")` + lea $M(%rsp), $r_ptr + call __ecp_nistz256_mul_mont$x # p256_mul_mont(M, M, Zsqr); + + lea $tmp0(%rsp), $r_ptr + call __ecp_nistz256_mul_by_2$x + + lea $M(%rsp), $b_ptr + lea $M(%rsp), $r_ptr + call __ecp_nistz256_add_to$x # p256_mul_by_3(M, M); + + 
`&load_for_mul("$S(%rsp)", "$in_x(%rsp)", "$src0")` + lea $S(%rsp), $r_ptr + call __ecp_nistz256_mul_mont$x # p256_mul_mont(S, S, in_x); + + lea $tmp0(%rsp), $r_ptr + call __ecp_nistz256_mul_by_2$x # p256_mul_by_2(tmp0, S); + + `&load_for_sqr("$M(%rsp)", "$src0")` + movq %xmm0, $r_ptr + call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(res_x, M); + + lea $tmp0(%rsp), $b_ptr + mov $acc6, $acc0 # harmonize sqr output and sub input + mov $acc7, $acc1 + mov $a_ptr, $poly1 + mov $t1, $poly3 + call __ecp_nistz256_sub_from$x # p256_sub(res_x, res_x, tmp0); + + mov $S+8*0(%rsp), $t0 + mov $S+8*1(%rsp), $t1 + mov $S+8*2(%rsp), $t2 + mov $S+8*3(%rsp), $acc2 # "4-5-0-1" order + lea $S(%rsp), $r_ptr + call __ecp_nistz256_sub$x # p256_sub(S, S, res_x); + + mov $M(%rsp), $src0 + lea $M(%rsp), $b_ptr + mov $acc4, $acc6 # harmonize sub output and mul input + xor %ecx, %ecx + mov $acc4, $S+8*0(%rsp) # have to save:-( + mov $acc5, $acc2 + mov $acc5, $S+8*1(%rsp) + cmovz $acc0, $acc3 + mov $acc0, $S+8*2(%rsp) + lea $S-$bias(%rsp), $a_ptr + cmovz $acc1, $acc4 + mov $acc1, $S+8*3(%rsp) + mov $acc6, $acc1 + lea $S(%rsp), $r_ptr + call __ecp_nistz256_mul_mont$x # p256_mul_mont(S, S, M); + + movq %xmm1, $b_ptr + movq %xmm1, $r_ptr + call __ecp_nistz256_sub_from$x # p256_sub(res_y, S, res_y); + + lea 32*5+56(%rsp), %rsi +.cfi_def_cfa %rsi,8 + mov -48(%rsi),%r15 +.cfi_restore %r15 + mov -40(%rsi),%r14 +.cfi_restore %r14 + mov -32(%rsi),%r13 +.cfi_restore %r13 + mov -24(%rsi),%r12 +.cfi_restore %r12 + mov -16(%rsi),%rbx +.cfi_restore %rbx + mov -8(%rsi),%rbp +.cfi_restore %rbp + lea (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lpoint_double${x}_epilogue: + ret +.cfi_endproc +.size ecp_nistz256_point_double$sfx,.-ecp_nistz256_point_double$sfx +___ +} +&gen_double("q"); + +sub gen_add () { + my $x = shift; + my ($src0,$sfx,$bias); + my ($H,$Hsqr,$R,$Rsqr,$Hcub, + $U1,$U2,$S1,$S2, + $res_x,$res_y,$res_z, + $in1_x,$in1_y,$in1_z, + $in2_x,$in2_y,$in2_z)=map(32*$_,(0..17)); + my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr); + + if ($x ne "x") { + $src0 = "%rax"; + $sfx = "_nohw"; + $bias = 0; + } else { + $src0 = "%rdx"; + $sfx = "_adx"; + $bias = 128; + } +$code.=<<___; +.globl ecp_nistz256_point_add$sfx +.type ecp_nistz256_point_add$sfx,\@function,3 +.align 32 +ecp_nistz256_point_add$sfx: +.cfi_startproc + _CET_ENDBR + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$32*18+8, %rsp +.cfi_adjust_cfa_offset 32*18+8 +.Lpoint_add${x}_body: + + movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr + movdqu 0x10($a_ptr), %xmm1 + movdqu 0x20($a_ptr), %xmm2 + movdqu 0x30($a_ptr), %xmm3 + movdqu 0x40($a_ptr), %xmm4 + movdqu 0x50($a_ptr), %xmm5 + mov $a_ptr, $b_ptr # reassign + mov $b_org, $a_ptr # reassign + movdqa %xmm0, $in1_x(%rsp) + movdqa %xmm1, $in1_x+0x10(%rsp) + movdqa %xmm2, $in1_y(%rsp) + movdqa %xmm3, $in1_y+0x10(%rsp) + movdqa %xmm4, $in1_z(%rsp) + movdqa %xmm5, $in1_z+0x10(%rsp) + por %xmm4, %xmm5 + + movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$b_ptr + pshufd \$0xb1, %xmm5, %xmm3 + movdqu 0x10($a_ptr), %xmm1 + movdqu 0x20($a_ptr), %xmm2 + por %xmm3, %xmm5 + movdqu 0x30($a_ptr), %xmm3 + mov 0x40+8*0($a_ptr), $src0 # load original in2_z + mov 0x40+8*1($a_ptr), $acc6 + mov 0x40+8*2($a_ptr), $acc7 + mov 0x40+8*3($a_ptr), $acc0 + movdqa %xmm0, $in2_x(%rsp) + pshufd \$0x1e, %xmm5, %xmm4 + movdqa %xmm1, $in2_x+0x10(%rsp) + movdqu 0x40($a_ptr),%xmm0 # in2_z again + movdqu 0x50($a_ptr),%xmm1 + movdqa %xmm2, 
$in2_y(%rsp) + movdqa %xmm3, $in2_y+0x10(%rsp) + por %xmm4, %xmm5 + pxor %xmm4, %xmm4 + por %xmm0, %xmm1 + movq $r_ptr, %xmm0 # save $r_ptr + + lea 0x40-$bias($a_ptr), $a_ptr # $a_ptr is still valid + mov $src0, $in2_z+8*0(%rsp) # make in2_z copy + mov $acc6, $in2_z+8*1(%rsp) + mov $acc7, $in2_z+8*2(%rsp) + mov $acc0, $in2_z+8*3(%rsp) + lea $Z2sqr(%rsp), $r_ptr # Z2^2 + call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Z2sqr, in2_z); + + pcmpeqd %xmm4, %xmm5 + pshufd \$0xb1, %xmm1, %xmm4 + por %xmm1, %xmm4 + pshufd \$0, %xmm5, %xmm5 # in1infty + pshufd \$0x1e, %xmm4, %xmm3 + por %xmm3, %xmm4 + pxor %xmm3, %xmm3 + pcmpeqd %xmm3, %xmm4 + pshufd \$0, %xmm4, %xmm4 # in2infty + mov 0x40+8*0($b_ptr), $src0 # load original in1_z + mov 0x40+8*1($b_ptr), $acc6 + mov 0x40+8*2($b_ptr), $acc7 + mov 0x40+8*3($b_ptr), $acc0 + movq $b_ptr, %xmm1 + + lea 0x40-$bias($b_ptr), $a_ptr + lea $Z1sqr(%rsp), $r_ptr # Z1^2 + call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Z1sqr, in1_z); + + `&load_for_mul("$Z2sqr(%rsp)", "$in2_z(%rsp)", "$src0")` + lea $S1(%rsp), $r_ptr # S1 = Z2^3 + call __ecp_nistz256_mul_mont$x # p256_mul_mont(S1, Z2sqr, in2_z); + + `&load_for_mul("$Z1sqr(%rsp)", "$in1_z(%rsp)", "$src0")` + lea $S2(%rsp), $r_ptr # S2 = Z1^3 + call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, Z1sqr, in1_z); + + `&load_for_mul("$S1(%rsp)", "$in1_y(%rsp)", "$src0")` + lea $S1(%rsp), $r_ptr # S1 = Y1*Z2^3 + call __ecp_nistz256_mul_mont$x # p256_mul_mont(S1, S1, in1_y); + + `&load_for_mul("$S2(%rsp)", "$in2_y(%rsp)", "$src0")` + lea $S2(%rsp), $r_ptr # S2 = Y2*Z1^3 + call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, S2, in2_y); + + lea $S1(%rsp), $b_ptr + lea $R(%rsp), $r_ptr # R = S2 - S1 + call __ecp_nistz256_sub_from$x # p256_sub(R, S2, S1); + + or $acc5, $acc4 # see if result is zero + movdqa %xmm4, %xmm2 + or $acc0, $acc4 + or $acc1, $acc4 + por %xmm5, %xmm2 # in1infty || in2infty + movq $acc4, %xmm3 + + `&load_for_mul("$Z2sqr(%rsp)", "$in1_x(%rsp)", "$src0")` + lea $U1(%rsp), $r_ptr # U1 = X1*Z2^2 + call __ecp_nistz256_mul_mont$x # p256_mul_mont(U1, in1_x, Z2sqr); + + `&load_for_mul("$Z1sqr(%rsp)", "$in2_x(%rsp)", "$src0")` + lea $U2(%rsp), $r_ptr # U2 = X2*Z1^2 + call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, in2_x, Z1sqr); + + lea $U1(%rsp), $b_ptr + lea $H(%rsp), $r_ptr # H = U2 - U1 + call __ecp_nistz256_sub_from$x # p256_sub(H, U2, U1); + + or $acc5, $acc4 # see if result is zero + or $acc0, $acc4 + or $acc1, $acc4 # !is_equal(U1, U2) + + movq %xmm2, $acc0 + movq %xmm3, $acc1 + or $acc0, $acc4 + .byte 0x3e # predict taken + jnz .Ladd_proceed$x # !is_equal(U1, U2) || in1infty || in2infty + + # We now know A = B or A = -B and neither is infinity. Compare the + # y-coordinates via S1 and S2. + test $acc1, $acc1 + jz .Ladd_double$x # is_equal(S1, S2) + + # A = -B, so the result is infinity. + # + # TODO(davidben): Does .Ladd_proceed handle this case? It seems to, in + # which case we should eliminate this special-case and simplify the + # timing analysis. 
+ movq %xmm0, $r_ptr # restore $r_ptr + pxor %xmm0, %xmm0 + movdqu %xmm0, 0x00($r_ptr) + movdqu %xmm0, 0x10($r_ptr) + movdqu %xmm0, 0x20($r_ptr) + movdqu %xmm0, 0x30($r_ptr) + movdqu %xmm0, 0x40($r_ptr) + movdqu %xmm0, 0x50($r_ptr) + jmp .Ladd_done$x + +.align 32 +.Ladd_double$x: + movq %xmm1, $a_ptr # restore $a_ptr + movq %xmm0, $r_ptr # restore $r_ptr + add \$`32*(18-5)`, %rsp # difference in frame sizes +.cfi_adjust_cfa_offset `-32*(18-5)` + jmp .Lpoint_double_shortcut$x +.cfi_adjust_cfa_offset `32*(18-5)` + +.align 32 +.Ladd_proceed$x: + `&load_for_sqr("$R(%rsp)", "$src0")` + lea $Rsqr(%rsp), $r_ptr # R^2 + call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Rsqr, R); + + `&load_for_mul("$H(%rsp)", "$in1_z(%rsp)", "$src0")` + lea $res_z(%rsp), $r_ptr # Z3 = H*Z1*Z2 + call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, H, in1_z); + + `&load_for_sqr("$H(%rsp)", "$src0")` + lea $Hsqr(%rsp), $r_ptr # H^2 + call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Hsqr, H); + + `&load_for_mul("$res_z(%rsp)", "$in2_z(%rsp)", "$src0")` + lea $res_z(%rsp), $r_ptr # Z3 = H*Z1*Z2 + call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, res_z, in2_z); + + `&load_for_mul("$Hsqr(%rsp)", "$H(%rsp)", "$src0")` + lea $Hcub(%rsp), $r_ptr # H^3 + call __ecp_nistz256_mul_mont$x # p256_mul_mont(Hcub, Hsqr, H); + + `&load_for_mul("$Hsqr(%rsp)", "$U1(%rsp)", "$src0")` + lea $U2(%rsp), $r_ptr # U1*H^2 + call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, U1, Hsqr); +___ +{ +####################################################################### +# operate in 4-5-0-1 "name space" that matches multiplication output +# +my ($acc0,$acc1,$acc2,$acc3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3); +my ($poly1, $poly3)=($acc6,$acc7); + +$code.=<<___; + #lea $U2(%rsp), $a_ptr + #lea $Hsqr(%rsp), $r_ptr # 2*U1*H^2 + #call __ecp_nistz256_mul_by_2 # ecp_nistz256_mul_by_2(Hsqr, U2); + + xor $t4, $t4 + add $acc0, $acc0 # a0:a3+a0:a3 + lea $Rsqr(%rsp), $a_ptr + adc $acc1, $acc1 + mov $acc0, $t0 + adc $acc2, $acc2 + adc $acc3, $acc3 + mov $acc1, $t1 + adc \$0, $t4 + + sub \$-1, $acc0 + mov $acc2, $t2 + sbb $poly1, $acc1 + sbb \$0, $acc2 + mov $acc3, $t3 + sbb $poly3, $acc3 + sbb \$0, $t4 + + cmovc $t0, $acc0 + mov 8*0($a_ptr), $t0 + cmovc $t1, $acc1 + mov 8*1($a_ptr), $t1 + cmovc $t2, $acc2 + mov 8*2($a_ptr), $t2 + cmovc $t3, $acc3 + mov 8*3($a_ptr), $t3 + + call __ecp_nistz256_sub$x # p256_sub(res_x, Rsqr, Hsqr); + + lea $Hcub(%rsp), $b_ptr + lea $res_x(%rsp), $r_ptr + call __ecp_nistz256_sub_from$x # p256_sub(res_x, res_x, Hcub); + + mov $U2+8*0(%rsp), $t0 + mov $U2+8*1(%rsp), $t1 + mov $U2+8*2(%rsp), $t2 + mov $U2+8*3(%rsp), $t3 + lea $res_y(%rsp), $r_ptr + + call __ecp_nistz256_sub$x # p256_sub(res_y, U2, res_x); + + mov $acc0, 8*0($r_ptr) # save the result, as + mov $acc1, 8*1($r_ptr) # __ecp_nistz256_sub doesn't + mov $acc2, 8*2($r_ptr) + mov $acc3, 8*3($r_ptr) +___ +} +$code.=<<___; + `&load_for_mul("$S1(%rsp)", "$Hcub(%rsp)", "$src0")` + lea $S2(%rsp), $r_ptr + call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, S1, Hcub); + + `&load_for_mul("$R(%rsp)", "$res_y(%rsp)", "$src0")` + lea $res_y(%rsp), $r_ptr + call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_y, R, res_y); + + lea $S2(%rsp), $b_ptr + lea $res_y(%rsp), $r_ptr + call __ecp_nistz256_sub_from$x # p256_sub(res_y, res_y, S2); + + movq %xmm0, $r_ptr # restore $r_ptr + + movdqa %xmm5, %xmm0 # copy_conditional(res_z, in2_z, in1infty); + movdqa %xmm5, %xmm1 + pandn $res_z(%rsp), %xmm0 + movdqa %xmm5, %xmm2 + pandn $res_z+0x10(%rsp), %xmm1 + movdqa %xmm5, %xmm3 + pand 
$in2_z(%rsp), %xmm2 + pand $in2_z+0x10(%rsp), %xmm3 + por %xmm0, %xmm2 + por %xmm1, %xmm3 + + movdqa %xmm4, %xmm0 # copy_conditional(res_z, in1_z, in2infty); + movdqa %xmm4, %xmm1 + pandn %xmm2, %xmm0 + movdqa %xmm4, %xmm2 + pandn %xmm3, %xmm1 + movdqa %xmm4, %xmm3 + pand $in1_z(%rsp), %xmm2 + pand $in1_z+0x10(%rsp), %xmm3 + por %xmm0, %xmm2 + por %xmm1, %xmm3 + movdqu %xmm2, 0x40($r_ptr) + movdqu %xmm3, 0x50($r_ptr) + + movdqa %xmm5, %xmm0 # copy_conditional(res_x, in2_x, in1infty); + movdqa %xmm5, %xmm1 + pandn $res_x(%rsp), %xmm0 + movdqa %xmm5, %xmm2 + pandn $res_x+0x10(%rsp), %xmm1 + movdqa %xmm5, %xmm3 + pand $in2_x(%rsp), %xmm2 + pand $in2_x+0x10(%rsp), %xmm3 + por %xmm0, %xmm2 + por %xmm1, %xmm3 + + movdqa %xmm4, %xmm0 # copy_conditional(res_x, in1_x, in2infty); + movdqa %xmm4, %xmm1 + pandn %xmm2, %xmm0 + movdqa %xmm4, %xmm2 + pandn %xmm3, %xmm1 + movdqa %xmm4, %xmm3 + pand $in1_x(%rsp), %xmm2 + pand $in1_x+0x10(%rsp), %xmm3 + por %xmm0, %xmm2 + por %xmm1, %xmm3 + movdqu %xmm2, 0x00($r_ptr) + movdqu %xmm3, 0x10($r_ptr) + + movdqa %xmm5, %xmm0 # copy_conditional(res_y, in2_y, in1infty); + movdqa %xmm5, %xmm1 + pandn $res_y(%rsp), %xmm0 + movdqa %xmm5, %xmm2 + pandn $res_y+0x10(%rsp), %xmm1 + movdqa %xmm5, %xmm3 + pand $in2_y(%rsp), %xmm2 + pand $in2_y+0x10(%rsp), %xmm3 + por %xmm0, %xmm2 + por %xmm1, %xmm3 + + movdqa %xmm4, %xmm0 # copy_conditional(res_y, in1_y, in2infty); + movdqa %xmm4, %xmm1 + pandn %xmm2, %xmm0 + movdqa %xmm4, %xmm2 + pandn %xmm3, %xmm1 + movdqa %xmm4, %xmm3 + pand $in1_y(%rsp), %xmm2 + pand $in1_y+0x10(%rsp), %xmm3 + por %xmm0, %xmm2 + por %xmm1, %xmm3 + movdqu %xmm2, 0x20($r_ptr) + movdqu %xmm3, 0x30($r_ptr) + +.Ladd_done$x: + lea 32*18+56(%rsp), %rsi +.cfi_def_cfa %rsi,8 + mov -48(%rsi),%r15 +.cfi_restore %r15 + mov -40(%rsi),%r14 +.cfi_restore %r14 + mov -32(%rsi),%r13 +.cfi_restore %r13 + mov -24(%rsi),%r12 +.cfi_restore %r12 + mov -16(%rsi),%rbx +.cfi_restore %rbx + mov -8(%rsi),%rbp +.cfi_restore %rbp + lea (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lpoint_add${x}_epilogue: + ret +.cfi_endproc +.size ecp_nistz256_point_add$sfx,.-ecp_nistz256_point_add$sfx +___ +} +&gen_add("q"); + +sub gen_add_affine () { + my $x = shift; + my ($src0,$sfx,$bias); + my ($U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr, + $res_x,$res_y,$res_z, + $in1_x,$in1_y,$in1_z, + $in2_x,$in2_y)=map(32*$_,(0..14)); + my $Z1sqr = $S2; + + if ($x ne "x") { + $src0 = "%rax"; + $sfx = "_nohw"; + $bias = 0; + } else { + $src0 = "%rdx"; + $sfx = "_adx"; + $bias = 128; + } +$code.=<<___; +.globl ecp_nistz256_point_add_affine$sfx +.type ecp_nistz256_point_add_affine$sfx,\@function,3 +.align 32 +ecp_nistz256_point_add_affine$sfx: +.cfi_startproc + _CET_ENDBR + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$32*15+8, %rsp +.cfi_adjust_cfa_offset 32*15+8 +.Ladd_affine${x}_body: + + movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr + mov $b_org, $b_ptr # reassign + movdqu 0x10($a_ptr), %xmm1 + movdqu 0x20($a_ptr), %xmm2 + movdqu 0x30($a_ptr), %xmm3 + movdqu 0x40($a_ptr), %xmm4 + movdqu 0x50($a_ptr), %xmm5 + mov 0x40+8*0($a_ptr), $src0 # load original in1_z + mov 0x40+8*1($a_ptr), $acc6 + mov 0x40+8*2($a_ptr), $acc7 + mov 0x40+8*3($a_ptr), $acc0 + movdqa %xmm0, $in1_x(%rsp) + movdqa %xmm1, $in1_x+0x10(%rsp) + movdqa %xmm2, $in1_y(%rsp) + movdqa %xmm3, $in1_y+0x10(%rsp) + movdqa %xmm4, $in1_z(%rsp) + movdqa %xmm5, $in1_z+0x10(%rsp) + por %xmm4, %xmm5 + + movdqu 0x00($b_ptr), %xmm0 # 
copy *(P256_POINT_AFFINE *)$b_ptr + pshufd \$0xb1, %xmm5, %xmm3 + movdqu 0x10($b_ptr), %xmm1 + movdqu 0x20($b_ptr), %xmm2 + por %xmm3, %xmm5 + movdqu 0x30($b_ptr), %xmm3 + movdqa %xmm0, $in2_x(%rsp) + pshufd \$0x1e, %xmm5, %xmm4 + movdqa %xmm1, $in2_x+0x10(%rsp) + por %xmm0, %xmm1 + movq $r_ptr, %xmm0 # save $r_ptr + movdqa %xmm2, $in2_y(%rsp) + movdqa %xmm3, $in2_y+0x10(%rsp) + por %xmm2, %xmm3 + por %xmm4, %xmm5 + pxor %xmm4, %xmm4 + por %xmm1, %xmm3 + + lea 0x40-$bias($a_ptr), $a_ptr # $a_ptr is still valid + lea $Z1sqr(%rsp), $r_ptr # Z1^2 + call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Z1sqr, in1_z); + + pcmpeqd %xmm4, %xmm5 + pshufd \$0xb1, %xmm3, %xmm4 + mov 0x00($b_ptr), $src0 # $b_ptr is still valid + #lea 0x00($b_ptr), $b_ptr + mov $acc4, $acc1 # harmonize sqr output and mul input + por %xmm3, %xmm4 + pshufd \$0, %xmm5, %xmm5 # in1infty + pshufd \$0x1e, %xmm4, %xmm3 + mov $acc5, $acc2 + por %xmm3, %xmm4 + pxor %xmm3, %xmm3 + mov $acc6, $acc3 + pcmpeqd %xmm3, %xmm4 + pshufd \$0, %xmm4, %xmm4 # in2infty + + lea $Z1sqr-$bias(%rsp), $a_ptr + mov $acc7, $acc4 + lea $U2(%rsp), $r_ptr # U2 = X2*Z1^2 + call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, Z1sqr, in2_x); + + lea $in1_x(%rsp), $b_ptr + lea $H(%rsp), $r_ptr # H = U2 - U1 + call __ecp_nistz256_sub_from$x # p256_sub(H, U2, in1_x); + + `&load_for_mul("$Z1sqr(%rsp)", "$in1_z(%rsp)", "$src0")` + lea $S2(%rsp), $r_ptr # S2 = Z1^3 + call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, Z1sqr, in1_z); + + `&load_for_mul("$H(%rsp)", "$in1_z(%rsp)", "$src0")` + lea $res_z(%rsp), $r_ptr # Z3 = H*Z1*Z2 + call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, H, in1_z); + + `&load_for_mul("$S2(%rsp)", "$in2_y(%rsp)", "$src0")` + lea $S2(%rsp), $r_ptr # S2 = Y2*Z1^3 + call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, S2, in2_y); + + lea $in1_y(%rsp), $b_ptr + lea $R(%rsp), $r_ptr # R = S2 - S1 + call __ecp_nistz256_sub_from$x # p256_sub(R, S2, in1_y); + + `&load_for_sqr("$H(%rsp)", "$src0")` + lea $Hsqr(%rsp), $r_ptr # H^2 + call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Hsqr, H); + + `&load_for_sqr("$R(%rsp)", "$src0")` + lea $Rsqr(%rsp), $r_ptr # R^2 + call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Rsqr, R); + + `&load_for_mul("$H(%rsp)", "$Hsqr(%rsp)", "$src0")` + lea $Hcub(%rsp), $r_ptr # H^3 + call __ecp_nistz256_mul_mont$x # p256_mul_mont(Hcub, Hsqr, H); + + `&load_for_mul("$Hsqr(%rsp)", "$in1_x(%rsp)", "$src0")` + lea $U2(%rsp), $r_ptr # U1*H^2 + call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, in1_x, Hsqr); +___ +{ +####################################################################### +# operate in 4-5-0-1 "name space" that matches multiplication output +# +my ($acc0,$acc1,$acc2,$acc3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3); +my ($poly1, $poly3)=($acc6,$acc7); + +$code.=<<___; + #lea $U2(%rsp), $a_ptr + #lea $Hsqr(%rsp), $r_ptr # 2*U1*H^2 + #call __ecp_nistz256_mul_by_2 # ecp_nistz256_mul_by_2(Hsqr, U2); + + xor $t4, $t4 + add $acc0, $acc0 # a0:a3+a0:a3 + lea $Rsqr(%rsp), $a_ptr + adc $acc1, $acc1 + mov $acc0, $t0 + adc $acc2, $acc2 + adc $acc3, $acc3 + mov $acc1, $t1 + adc \$0, $t4 + + sub \$-1, $acc0 + mov $acc2, $t2 + sbb $poly1, $acc1 + sbb \$0, $acc2 + mov $acc3, $t3 + sbb $poly3, $acc3 + sbb \$0, $t4 + + cmovc $t0, $acc0 + mov 8*0($a_ptr), $t0 + cmovc $t1, $acc1 + mov 8*1($a_ptr), $t1 + cmovc $t2, $acc2 + mov 8*2($a_ptr), $t2 + cmovc $t3, $acc3 + mov 8*3($a_ptr), $t3 + + call __ecp_nistz256_sub$x # p256_sub(res_x, Rsqr, Hsqr); + + lea $Hcub(%rsp), $b_ptr + lea $res_x(%rsp), $r_ptr + call __ecp_nistz256_sub_from$x # 
p256_sub(res_x, res_x, Hcub); + + mov $U2+8*0(%rsp), $t0 + mov $U2+8*1(%rsp), $t1 + mov $U2+8*2(%rsp), $t2 + mov $U2+8*3(%rsp), $t3 + lea $H(%rsp), $r_ptr + + call __ecp_nistz256_sub$x # p256_sub(H, U2, res_x); + + mov $acc0, 8*0($r_ptr) # save the result, as + mov $acc1, 8*1($r_ptr) # __ecp_nistz256_sub doesn't + mov $acc2, 8*2($r_ptr) + mov $acc3, 8*3($r_ptr) +___ +} +$code.=<<___; + `&load_for_mul("$Hcub(%rsp)", "$in1_y(%rsp)", "$src0")` + lea $S2(%rsp), $r_ptr + call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, Hcub, in1_y); + + `&load_for_mul("$H(%rsp)", "$R(%rsp)", "$src0")` + lea $H(%rsp), $r_ptr + call __ecp_nistz256_mul_mont$x # p256_mul_mont(H, H, R); + + lea $S2(%rsp), $b_ptr + lea $res_y(%rsp), $r_ptr + call __ecp_nistz256_sub_from$x # p256_sub(res_y, H, S2); + + movq %xmm0, $r_ptr # restore $r_ptr + + movdqa %xmm5, %xmm0 # copy_conditional(res_z, ONE, in1infty); + movdqa %xmm5, %xmm1 + pandn $res_z(%rsp), %xmm0 + movdqa %xmm5, %xmm2 + pandn $res_z+0x10(%rsp), %xmm1 + movdqa %xmm5, %xmm3 + pand .LONE_mont(%rip), %xmm2 + pand .LONE_mont+0x10(%rip), %xmm3 + por %xmm0, %xmm2 + por %xmm1, %xmm3 + + movdqa %xmm4, %xmm0 # copy_conditional(res_z, in1_z, in2infty); + movdqa %xmm4, %xmm1 + pandn %xmm2, %xmm0 + movdqa %xmm4, %xmm2 + pandn %xmm3, %xmm1 + movdqa %xmm4, %xmm3 + pand $in1_z(%rsp), %xmm2 + pand $in1_z+0x10(%rsp), %xmm3 + por %xmm0, %xmm2 + por %xmm1, %xmm3 + movdqu %xmm2, 0x40($r_ptr) + movdqu %xmm3, 0x50($r_ptr) + + movdqa %xmm5, %xmm0 # copy_conditional(res_x, in2_x, in1infty); + movdqa %xmm5, %xmm1 + pandn $res_x(%rsp), %xmm0 + movdqa %xmm5, %xmm2 + pandn $res_x+0x10(%rsp), %xmm1 + movdqa %xmm5, %xmm3 + pand $in2_x(%rsp), %xmm2 + pand $in2_x+0x10(%rsp), %xmm3 + por %xmm0, %xmm2 + por %xmm1, %xmm3 + + movdqa %xmm4, %xmm0 # copy_conditional(res_x, in1_x, in2infty); + movdqa %xmm4, %xmm1 + pandn %xmm2, %xmm0 + movdqa %xmm4, %xmm2 + pandn %xmm3, %xmm1 + movdqa %xmm4, %xmm3 + pand $in1_x(%rsp), %xmm2 + pand $in1_x+0x10(%rsp), %xmm3 + por %xmm0, %xmm2 + por %xmm1, %xmm3 + movdqu %xmm2, 0x00($r_ptr) + movdqu %xmm3, 0x10($r_ptr) + + movdqa %xmm5, %xmm0 # copy_conditional(res_y, in2_y, in1infty); + movdqa %xmm5, %xmm1 + pandn $res_y(%rsp), %xmm0 + movdqa %xmm5, %xmm2 + pandn $res_y+0x10(%rsp), %xmm1 + movdqa %xmm5, %xmm3 + pand $in2_y(%rsp), %xmm2 + pand $in2_y+0x10(%rsp), %xmm3 + por %xmm0, %xmm2 + por %xmm1, %xmm3 + + movdqa %xmm4, %xmm0 # copy_conditional(res_y, in1_y, in2infty); + movdqa %xmm4, %xmm1 + pandn %xmm2, %xmm0 + movdqa %xmm4, %xmm2 + pandn %xmm3, %xmm1 + movdqa %xmm4, %xmm3 + pand $in1_y(%rsp), %xmm2 + pand $in1_y+0x10(%rsp), %xmm3 + por %xmm0, %xmm2 + por %xmm1, %xmm3 + movdqu %xmm2, 0x20($r_ptr) + movdqu %xmm3, 0x30($r_ptr) + + lea 32*15+56(%rsp), %rsi +.cfi_def_cfa %rsi,8 + mov -48(%rsi),%r15 +.cfi_restore %r15 + mov -40(%rsi),%r14 +.cfi_restore %r14 + mov -32(%rsi),%r13 +.cfi_restore %r13 + mov -24(%rsi),%r12 +.cfi_restore %r12 + mov -16(%rsi),%rbx +.cfi_restore %rbx + mov -8(%rsi),%rbp +.cfi_restore %rbp + lea (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Ladd_affine${x}_epilogue: + ret +.cfi_endproc +.size ecp_nistz256_point_add_affine$sfx,.-ecp_nistz256_point_add_affine$sfx +___ +} +&gen_add_affine("q"); + +######################################################################## +# AD*X magic +# +if ($addx) { { +######################################################################## +# operate in 4-5-0-1 "name space" that matches multiplication output +# +my ($a0,$a1,$a2,$a3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3); + +$code.=<<___; +.type 
__ecp_nistz256_add_tox,\@abi-omnipotent +.align 32 +__ecp_nistz256_add_tox: +.cfi_startproc + xor $t4, $t4 + adc 8*0($b_ptr), $a0 + adc 8*1($b_ptr), $a1 + mov $a0, $t0 + adc 8*2($b_ptr), $a2 + adc 8*3($b_ptr), $a3 + mov $a1, $t1 + adc \$0, $t4 + + xor $t3, $t3 + sbb \$-1, $a0 + mov $a2, $t2 + sbb $poly1, $a1 + sbb \$0, $a2 + mov $a3, $t3 + sbb $poly3, $a3 + sbb \$0, $t4 + + cmovc $t0, $a0 + cmovc $t1, $a1 + mov $a0, 8*0($r_ptr) + cmovc $t2, $a2 + mov $a1, 8*1($r_ptr) + cmovc $t3, $a3 + mov $a2, 8*2($r_ptr) + mov $a3, 8*3($r_ptr) + + ret +.cfi_endproc +.size __ecp_nistz256_add_tox,.-__ecp_nistz256_add_tox + +.type __ecp_nistz256_sub_fromx,\@abi-omnipotent +.align 32 +__ecp_nistz256_sub_fromx: +.cfi_startproc + xor $t4, $t4 + sbb 8*0($b_ptr), $a0 + sbb 8*1($b_ptr), $a1 + mov $a0, $t0 + sbb 8*2($b_ptr), $a2 + sbb 8*3($b_ptr), $a3 + mov $a1, $t1 + sbb \$0, $t4 + + xor $t3, $t3 + adc \$-1, $a0 + mov $a2, $t2 + adc $poly1, $a1 + adc \$0, $a2 + mov $a3, $t3 + adc $poly3, $a3 + + bt \$0, $t4 + cmovnc $t0, $a0 + cmovnc $t1, $a1 + mov $a0, 8*0($r_ptr) + cmovnc $t2, $a2 + mov $a1, 8*1($r_ptr) + cmovnc $t3, $a3 + mov $a2, 8*2($r_ptr) + mov $a3, 8*3($r_ptr) + + ret +.cfi_endproc +.size __ecp_nistz256_sub_fromx,.-__ecp_nistz256_sub_fromx + +.type __ecp_nistz256_subx,\@abi-omnipotent +.align 32 +__ecp_nistz256_subx: +.cfi_startproc + xor $t4, $t4 + sbb $a0, $t0 + sbb $a1, $t1 + mov $t0, $a0 + sbb $a2, $t2 + sbb $a3, $t3 + mov $t1, $a1 + sbb \$0, $t4 + + xor $a3 ,$a3 + adc \$-1, $t0 + mov $t2, $a2 + adc $poly1, $t1 + adc \$0, $t2 + mov $t3, $a3 + adc $poly3, $t3 + + bt \$0, $t4 + cmovc $t0, $a0 + cmovc $t1, $a1 + cmovc $t2, $a2 + cmovc $t3, $a3 + + ret +.cfi_endproc +.size __ecp_nistz256_subx,.-__ecp_nistz256_subx + +.type __ecp_nistz256_mul_by_2x,\@abi-omnipotent +.align 32 +__ecp_nistz256_mul_by_2x: +.cfi_startproc + xor $t4, $t4 + adc $a0, $a0 # a0:a3+a0:a3 + adc $a1, $a1 + mov $a0, $t0 + adc $a2, $a2 + adc $a3, $a3 + mov $a1, $t1 + adc \$0, $t4 + + xor $t3, $t3 + sbb \$-1, $a0 + mov $a2, $t2 + sbb $poly1, $a1 + sbb \$0, $a2 + mov $a3, $t3 + sbb $poly3, $a3 + sbb \$0, $t4 + + cmovc $t0, $a0 + cmovc $t1, $a1 + mov $a0, 8*0($r_ptr) + cmovc $t2, $a2 + mov $a1, 8*1($r_ptr) + cmovc $t3, $a3 + mov $a2, 8*2($r_ptr) + mov $a3, 8*3($r_ptr) + + ret +.cfi_endproc +.size __ecp_nistz256_mul_by_2x,.-__ecp_nistz256_mul_by_2x +___ + } +&gen_double("x"); +&gen_add("x"); +&gen_add_affine("x"); +} +}}} + +# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, +# CONTEXT *context,DISPATCHER_CONTEXT *disp) +if ($win64) { +$rec="%rcx"; +$frame="%rdx"; +$context="%r8"; +$disp="%r9"; + +$code.=<<___; +.extern __imp_RtlVirtualUnwind + +.type short_handler,\@abi-omnipotent +.align 16 +short_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 120($context),%rax # pull context->Rax + mov 248($context),%rbx # pull context->Rip + + mov 8($disp),%rsi # disp->ImageBase + mov 56($disp),%r11 # disp->HandlerData + + mov 0(%r11),%r10d # HandlerData[0] + lea (%rsi,%r10),%r10 # end of prologue label + cmp %r10,%rbx # context->RipRsp + + mov 4(%r11),%r10d # HandlerData[1] + lea (%rsi,%r10),%r10 # epilogue label + cmp %r10,%rbx # context->Rip>=epilogue label + jae .Lcommon_seh_tail + + lea 16(%rax),%rax + + mov -8(%rax),%r12 + mov -16(%rax),%r13 + mov %r12,216($context) # restore context->R12 + mov %r13,224($context) # restore context->R13 + + jmp .Lcommon_seh_tail +.size short_handler,.-short_handler + +.type full_handler,\@abi-omnipotent 
+.align 16 +full_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 120($context),%rax # pull context->Rax + mov 248($context),%rbx # pull context->Rip + + mov 8($disp),%rsi # disp->ImageBase + mov 56($disp),%r11 # disp->HandlerData + + mov 0(%r11),%r10d # HandlerData[0] + lea (%rsi,%r10),%r10 # end of prologue label + cmp %r10,%rbx # context->RipRsp + + mov 4(%r11),%r10d # HandlerData[1] + lea (%rsi,%r10),%r10 # epilogue label + cmp %r10,%rbx # context->Rip>=epilogue label + jae .Lcommon_seh_tail + + mov 8(%r11),%r10d # HandlerData[2] + lea (%rax,%r10),%rax + + mov -8(%rax),%rbp + mov -16(%rax),%rbx + mov -24(%rax),%r12 + mov -32(%rax),%r13 + mov -40(%rax),%r14 + mov -48(%rax),%r15 + mov %rbx,144($context) # restore context->Rbx + mov %rbp,160($context) # restore context->Rbp + mov %r12,216($context) # restore context->R12 + mov %r13,224($context) # restore context->R13 + mov %r14,232($context) # restore context->R14 + mov %r15,240($context) # restore context->R15 + +.Lcommon_seh_tail: + mov 8(%rax),%rdi + mov 16(%rax),%rsi + mov %rax,152($context) # restore context->Rsp + mov %rsi,168($context) # restore context->Rsi + mov %rdi,176($context) # restore context->Rdi + + mov 40($disp),%rdi # disp->ContextRecord + mov $context,%rsi # context + mov \$154,%ecx # sizeof(CONTEXT) + .long 0xa548f3fc # cld; rep movsq + + mov $disp,%rsi + xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER + mov 8(%rsi),%rdx # arg2, disp->ImageBase + mov 0(%rsi),%r8 # arg3, disp->ControlPc + mov 16(%rsi),%r9 # arg4, disp->FunctionEntry + mov 40(%rsi),%r10 # disp->ContextRecord + lea 56(%rsi),%r11 # &disp->HandlerData + lea 24(%rsi),%r12 # &disp->EstablisherFrame + mov %r10,32(%rsp) # arg5 + mov %r11,40(%rsp) # arg6 + mov %r12,48(%rsp) # arg7 + mov %rcx,56(%rsp) # arg8, (NULL) + call *__imp_RtlVirtualUnwind(%rip) + + mov \$1,%eax # ExceptionContinueSearch + add \$64,%rsp + popfq + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + pop %rdi + pop %rsi + ret +.size full_handler,.-full_handler + +.section .pdata +.align 4 + .rva .LSEH_begin_ecp_nistz256_neg + .rva .LSEH_end_ecp_nistz256_neg + .rva .LSEH_info_ecp_nistz256_neg + + .rva .LSEH_begin_ecp_nistz256_ord_mul_mont_nohw + .rva .LSEH_end_ecp_nistz256_ord_mul_mont_nohw + .rva .LSEH_info_ecp_nistz256_ord_mul_mont_nohw + + .rva .LSEH_begin_ecp_nistz256_ord_sqr_mont_nohw + .rva .LSEH_end_ecp_nistz256_ord_sqr_mont_nohw + .rva .LSEH_info_ecp_nistz256_ord_sqr_mont_nohw +___ +$code.=<<___ if ($addx); + .rva .LSEH_begin_ecp_nistz256_ord_mul_mont_adx + .rva .LSEH_end_ecp_nistz256_ord_mul_mont_adx + .rva .LSEH_info_ecp_nistz256_ord_mul_mont_adx + + .rva .LSEH_begin_ecp_nistz256_ord_sqr_mont_adx + .rva .LSEH_end_ecp_nistz256_ord_sqr_mont_adx + .rva .LSEH_info_ecp_nistz256_ord_sqr_mont_adx +___ +$code.=<<___; + .rva .LSEH_begin_ecp_nistz256_mul_mont_nohw + .rva .LSEH_end_ecp_nistz256_mul_mont_nohw + .rva .LSEH_info_ecp_nistz256_mul_mont_nohw + + .rva .LSEH_begin_ecp_nistz256_sqr_mont_nohw + .rva .LSEH_end_ecp_nistz256_sqr_mont_nohw + .rva .LSEH_info_ecp_nistz256_sqr_mont_nohw +___ +$code.=<<___ if ($addx); + .rva .LSEH_begin_ecp_nistz256_mul_mont_adx + .rva .LSEH_end_ecp_nistz256_mul_mont_adx + .rva .LSEH_info_ecp_nistz256_mul_mont_adx + + .rva .LSEH_begin_ecp_nistz256_sqr_mont_adx + .rva .LSEH_end_ecp_nistz256_sqr_mont_adx + .rva .LSEH_info_ecp_nistz256_sqr_mont_adx +___ +$code.=<<___; + .rva .LSEH_begin_ecp_nistz256_select_w5_nohw + .rva .LSEH_end_ecp_nistz256_select_w5_nohw + .rva 
.LSEH_info_ecp_nistz256_select_wX_nohw + + .rva .LSEH_begin_ecp_nistz256_select_w7_nohw + .rva .LSEH_end_ecp_nistz256_select_w7_nohw + .rva .LSEH_info_ecp_nistz256_select_wX_nohw +___ +$code.=<<___ if ($avx>1); + .rva .LSEH_begin_ecp_nistz256_select_w5_avx2 + .rva .LSEH_end_ecp_nistz256_select_w5_avx2 + .rva .LSEH_info_ecp_nistz256_select_wX_avx2 + + .rva .LSEH_begin_ecp_nistz256_select_w7_avx2 + .rva .LSEH_end_ecp_nistz256_select_w7_avx2 + .rva .LSEH_info_ecp_nistz256_select_wX_avx2 +___ +$code.=<<___; + .rva .LSEH_begin_ecp_nistz256_point_double_nohw + .rva .LSEH_end_ecp_nistz256_point_double_nohw + .rva .LSEH_info_ecp_nistz256_point_double_nohw + + .rva .LSEH_begin_ecp_nistz256_point_add_nohw + .rva .LSEH_end_ecp_nistz256_point_add_nohw + .rva .LSEH_info_ecp_nistz256_point_add_nohw + + .rva .LSEH_begin_ecp_nistz256_point_add_affine_nohw + .rva .LSEH_end_ecp_nistz256_point_add_affine_nohw + .rva .LSEH_info_ecp_nistz256_point_add_affine_nohw +___ +$code.=<<___ if ($addx); + .rva .LSEH_begin_ecp_nistz256_point_double_adx + .rva .LSEH_end_ecp_nistz256_point_double_adx + .rva .LSEH_info_ecp_nistz256_point_double_adx + + .rva .LSEH_begin_ecp_nistz256_point_add_adx + .rva .LSEH_end_ecp_nistz256_point_add_adx + .rva .LSEH_info_ecp_nistz256_point_add_adx + + .rva .LSEH_begin_ecp_nistz256_point_add_affine_adx + .rva .LSEH_end_ecp_nistz256_point_add_affine_adx + .rva .LSEH_info_ecp_nistz256_point_add_affine_adx +___ +$code.=<<___; + +.section .xdata +.align 8 +.LSEH_info_ecp_nistz256_neg: + .byte 9,0,0,0 + .rva short_handler + .rva .Lneg_body,.Lneg_epilogue # HandlerData[] +.LSEH_info_ecp_nistz256_ord_mul_mont_nohw: + .byte 9,0,0,0 + .rva full_handler + .rva .Lord_mul_body,.Lord_mul_epilogue # HandlerData[] + .long 48,0 +.LSEH_info_ecp_nistz256_ord_sqr_mont_nohw: + .byte 9,0,0,0 + .rva full_handler + .rva .Lord_sqr_body,.Lord_sqr_epilogue # HandlerData[] + .long 48,0 +___ +$code.=<<___ if ($addx); +.LSEH_info_ecp_nistz256_ord_mul_mont_adx: + .byte 9,0,0,0 + .rva full_handler + .rva .Lord_mulx_body,.Lord_mulx_epilogue # HandlerData[] + .long 48,0 +.LSEH_info_ecp_nistz256_ord_sqr_mont_adx: + .byte 9,0,0,0 + .rva full_handler + .rva .Lord_sqrx_body,.Lord_sqrx_epilogue # HandlerData[] + .long 48,0 +___ +$code.=<<___; +.LSEH_info_ecp_nistz256_mul_mont_nohw: + .byte 9,0,0,0 + .rva full_handler + .rva .Lmul_body,.Lmul_epilogue # HandlerData[] + .long 48,0 +.LSEH_info_ecp_nistz256_sqr_mont_nohw: + .byte 9,0,0,0 + .rva full_handler + .rva .Lsqr_body,.Lsqr_epilogue # HandlerData[] + .long 48,0 +___ +$code.=<<___ if ($addx); +.LSEH_info_ecp_nistz256_mul_mont_adx: + .byte 9,0,0,0 + .rva full_handler + .rva .Lmulx_body,.Lmulx_epilogue # HandlerData[] + .long 48,0 +.LSEH_info_ecp_nistz256_sqr_mont_adx: + .byte 9,0,0,0 + .rva full_handler + .rva .Lsqrx_body,.Lsqrx_epilogue # HandlerData[] + .long 48,0 +___ +$code.=<<___; +.LSEH_info_ecp_nistz256_select_wX_nohw: + .byte 0x01,0x33,0x16,0x00 + .byte 0x33,0xf8,0x09,0x00 #movaps 0x90(rsp),xmm15 + .byte 0x2e,0xe8,0x08,0x00 #movaps 0x80(rsp),xmm14 + .byte 0x29,0xd8,0x07,0x00 #movaps 0x70(rsp),xmm13 + .byte 0x24,0xc8,0x06,0x00 #movaps 0x60(rsp),xmm12 + .byte 0x1f,0xb8,0x05,0x00 #movaps 0x50(rsp),xmm11 + .byte 0x1a,0xa8,0x04,0x00 #movaps 0x40(rsp),xmm10 + .byte 0x15,0x98,0x03,0x00 #movaps 0x30(rsp),xmm9 + .byte 0x10,0x88,0x02,0x00 #movaps 0x20(rsp),xmm8 + .byte 0x0c,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7 + .byte 0x08,0x68,0x00,0x00 #movaps 0x00(rsp),xmm6 + .byte 0x04,0x01,0x15,0x00 #sub rsp,0xa8 + .align 8 +___ +$code.=<<___ if ($avx>1); 
+.LSEH_info_ecp_nistz256_select_wX_avx2: + .byte 0x01,0x36,0x17,0x0b + .byte 0x36,0xf8,0x09,0x00 # vmovaps 0x90(rsp),xmm15 + .byte 0x31,0xe8,0x08,0x00 # vmovaps 0x80(rsp),xmm14 + .byte 0x2c,0xd8,0x07,0x00 # vmovaps 0x70(rsp),xmm13 + .byte 0x27,0xc8,0x06,0x00 # vmovaps 0x60(rsp),xmm12 + .byte 0x22,0xb8,0x05,0x00 # vmovaps 0x50(rsp),xmm11 + .byte 0x1d,0xa8,0x04,0x00 # vmovaps 0x40(rsp),xmm10 + .byte 0x18,0x98,0x03,0x00 # vmovaps 0x30(rsp),xmm9 + .byte 0x13,0x88,0x02,0x00 # vmovaps 0x20(rsp),xmm8 + .byte 0x0e,0x78,0x01,0x00 # vmovaps 0x10(rsp),xmm7 + .byte 0x09,0x68,0x00,0x00 # vmovaps 0x00(rsp),xmm6 + .byte 0x04,0x01,0x15,0x00 # sub rsp,0xa8 + .byte 0x00,0xb3,0x00,0x00 # set_frame r11 + .align 8 +___ +$code.=<<___; +.LSEH_info_ecp_nistz256_point_double_nohw: + .byte 9,0,0,0 + .rva full_handler + .rva .Lpoint_doubleq_body,.Lpoint_doubleq_epilogue # HandlerData[] + .long 32*5+56,0 +.LSEH_info_ecp_nistz256_point_add_nohw: + .byte 9,0,0,0 + .rva full_handler + .rva .Lpoint_addq_body,.Lpoint_addq_epilogue # HandlerData[] + .long 32*18+56,0 +.LSEH_info_ecp_nistz256_point_add_affine_nohw: + .byte 9,0,0,0 + .rva full_handler + .rva .Ladd_affineq_body,.Ladd_affineq_epilogue # HandlerData[] + .long 32*15+56,0 +___ +$code.=<<___ if ($addx); +.align 8 +.LSEH_info_ecp_nistz256_point_double_adx: + .byte 9,0,0,0 + .rva full_handler + .rva .Lpoint_doublex_body,.Lpoint_doublex_epilogue # HandlerData[] + .long 32*5+56,0 +.LSEH_info_ecp_nistz256_point_add_adx: + .byte 9,0,0,0 + .rva full_handler + .rva .Lpoint_addx_body,.Lpoint_addx_epilogue # HandlerData[] + .long 32*18+56,0 +.LSEH_info_ecp_nistz256_point_add_affine_adx: + .byte 9,0,0,0 + .rva full_handler + .rva .Ladd_affinex_body,.Ladd_affinex_epilogue # HandlerData[] + .long 32*15+56,0 +___ +} + +$code =~ s/\`([^\`]*)\`/eval $1/gem; +print $code; +close STDOUT or die "error closing STDOUT: $!"; diff --git a/ring-0.17.14/crypto/fipsmodule/ec/ecp_nistz.c b/ring-0.17.14/crypto/fipsmodule/ec/ecp_nistz.c new file mode 100644 index 0000000000..1811e28374 --- /dev/null +++ b/ring-0.17.14/crypto/fipsmodule/ec/ecp_nistz.c @@ -0,0 +1,52 @@ +/* Copyright (c) 2014, Intel Corporation. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ + +#include "ecp_nistz.h" + +#if defined(__GNUC__) +#pragma GCC diagnostic ignored "-Wconversion" +#endif + +/* Fills |str| with the bytewise little-endian encoding of |scalar|, where + * |scalar| has |num_limbs| limbs. |str| is padded with zeros at the end up + * to |str_len| bytes. Actually, |str_len| must be exactly one byte more than + * needed to encode |num_limbs| losslessly, so that there is an extra byte at + * the end. The extra byte is useful because the caller will be breaking |str| + * up into windows of a number of bits (5 or 7) that isn't divisible by 8, and + * so it is useful for it to be able to read an extra zero byte. 
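+ *
+ * As an added, hypothetical illustration (not part of the upstream comment):
+ * with 64-bit limbs, |num_limbs| == 1 and scalar[0] == 0x0123456789abcdef,
+ * the caller passes |str_len| == 9 and |str| receives the bytes
+ * ef cd ab 89 67 45 23 01 00, where the trailing 00 is the extra zero byte
+ * described above.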
*/ +void little_endian_bytes_from_scalar(uint8_t str[], size_t str_len, + const Limb scalar[], + size_t num_limbs) { + debug_assert_nonsecret(str_len == (num_limbs * sizeof(Limb)) + 1); + + size_t i; + for (i = 0; i < num_limbs * sizeof(Limb); i += sizeof(Limb)) { + Limb d = scalar[i / sizeof(Limb)]; + + str[i + 0] = d & 0xff; + str[i + 1] = (d >> 8) & 0xff; + str[i + 2] = (d >> 16) & 0xff; + str[i + 3] = (d >>= 24) & 0xff; + if (sizeof(Limb) == 8) { + d >>= 8; + str[i + 4] = d & 0xff; + str[i + 5] = (d >> 8) & 0xff; + str[i + 6] = (d >> 16) & 0xff; + str[i + 7] = (d >> 24) & 0xff; + } + } + for (; i < str_len; i++) { + str[i] = 0; + } +} diff --git a/ring-0.17.14/crypto/fipsmodule/ec/ecp_nistz.h b/ring-0.17.14/crypto/fipsmodule/ec/ecp_nistz.h new file mode 100644 index 0000000000..3c04c475c6 --- /dev/null +++ b/ring-0.17.14/crypto/fipsmodule/ec/ecp_nistz.h @@ -0,0 +1,274 @@ +/* Copyright (c) 2015, Google Inc. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ + +#ifndef OPENSSL_HEADER_EC_ECP_NISTZ_H +#define OPENSSL_HEADER_EC_ECP_NISTZ_H + +#include + +#include "../../limbs/limbs.h" + +#if defined(__GNUC__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wconversion" +#pragma GCC diagnostic ignored "-Wsign-conversion" +#endif + +// This function looks at `w + 1` scalar bits (`w` current, 1 adjacent less +// significant bit), and recodes them into a signed digit for use in fast point +// multiplication: the use of signed rather than unsigned digits means that +// fewer points need to be precomputed, given that point inversion is easy (a +// precomputed point dP makes -dP available as well). +// +// BACKGROUND: +// +// Signed digits for multiplication were introduced by Booth ("A signed binary +// multiplication technique", Quart. Journ. Mech. and Applied Math., vol. IV, +// pt. 2 (1951), pp. 236-240), in that case for multiplication of integers. +// Booth's original encoding did not generally improve the density of nonzero +// digits over the binary representation, and was merely meant to simplify the +// handling of signed factors given in two's complement; but it has since been +// shown to be the basis of various signed-digit representations that do have +// further advantages, including the wNAF, using the following general +// approach: +// +// (1) Given a binary representation +// +// b_k ... b_2 b_1 b_0, +// +// of a nonnegative integer (b_k in {0, 1}), rewrite it in digits 0, 1, -1 +// by using bit-wise subtraction as follows: +// +// b_k b_(k-1) ... b_2 b_1 b_0 +// - b_k ... b_3 b_2 b_1 b_0 +// ----------------------------------------- +// s_(k+1) s_k ... s_3 s_2 s_1 s_0 +// +// A left-shift followed by subtraction of the original value yields a new +// representation of the same value, using signed bits s_i = b_(i-1) - b_i. 
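+//
+// (Added worked example, not part of the upstream comment: for the value
+// 7 = 0111 in binary, this rewriting gives s_3 = 1, s_2 = s_1 = 0 and
+// s_0 = -1, so that 7 = 2^3 - 2^0; note that the nonzero signed bits
+// alternate in sign, as observed below.)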
+// This representation from Booth's paper has since appeared in the +// literature under a variety of different names including "reversed binary +// form", "alternating greedy expansion", "mutual opposite form", and +// "sign-alternating {+-1}-representation". +// +// An interesting property is that among the nonzero bits, values 1 and -1 +// strictly alternate. +// +// (2) Various window schemes can be applied to the Booth representation of +// integers: for example, right-to-left sliding windows yield the wNAF +// (a signed-digit encoding independently discovered by various researchers +// in the 1990s), and left-to-right sliding windows yield a left-to-right +// equivalent of the wNAF (independently discovered by various researchers +// around 2004). +// +// To prevent leaking information through side channels in point multiplication, +// we need to recode the given integer into a regular pattern: sliding windows +// as in wNAFs won't do, we need their fixed-window equivalent -- which is a few +// decades older: we'll be using the so-called "modified Booth encoding" due to +// MacSorley ("High-speed arithmetic in binary computers", Proc. IRE, vol. 49 +// (1961), pp. 67-91), in a radix-2**w setting. That is, we always combine `w` +// signed bits into a signed digit, e.g. (for `w == 5`): +// +// s_(5j + 4) s_(5j + 3) s_(5j + 2) s_(5j + 1) s_(5j) +// +// The sign-alternating property implies that the resulting digit values are +// integers from `-2**(w-1)` to `2**(w-1)`, e.g. -16 to 16 for `w == 5`. +// +// Of course, we don't actually need to compute the signed digits s_i as an +// intermediate step (that's just a nice way to see how this scheme relates +// to the wNAF): a direct computation obtains the recoded digit from the +// six bits b_(5j + 4) ... b_(5j - 1). +// +// This function takes those `w` bits as an integer (e.g. 0 .. 63), writing the +// recoded digit to *sign (0 for positive, 1 for negative) and *digit (absolute +// value, in the range 0 .. 2**(w-1). Note that this integer essentially provides +// the input bits "shifted to the left" by one position: for example, the input +// to compute the least significant recoded digit, given that there's no bit +// b_-1, has to be b_4 b_3 b_2 b_1 b_0 0. +// +// DOUBLING CASE: +// +// Point addition formulas for short Weierstrass curves are often incomplete. +// Edge cases such as P + P or P + ∞ must be handled separately. This +// complicates constant-time requirements. P + ∞ cannot be avoided (any window +// may be zero) and is handled with constant-time selects. P + P (where P is not +// ∞) usually is not. Instead, windowing strategies are chosen to avoid this +// case. Whether this happens depends on the group order. +// +// Let w be the window width (in this function, w = 5). The non-trivial doubling +// case in single-point scalar multiplication may occur if and only if the +// 2^(w-1) bit of the group order is zero. +// +// Note the above only holds if the scalar is fully reduced and the group order +// is a prime that is much larger than 2^w. It also only holds when windows +// are applied from most significant to least significant, doubling between each +// window. It does not apply to more complex table strategies such as +// |EC_nistz256_method|. +// +// PROOF: +// +// Let n be the group order. Let l be the number of bits needed to represent n. +// Assume there exists some 0 <= k < n such that signed w-bit windowed +// multiplication hits the doubling case. 
+// +// Windowed multiplication consists of iterating over groups of s_i (defined +// above based on k's binary representation) from most to least significant. At +// iteration i (for i = ..., 3w, 2w, w, 0, starting from the most significant +// window), we: +// +// 1. Double the accumulator A, w times. Let A_i be the value of A at this +// point. +// +// 2. Set A to T_i + A_i, where T_i is a precomputed multiple of P +// corresponding to the window s_(i+w-1) ... s_i. +// +// Let j be the index such that A_j = T_j ≠ ∞. Looking at A_i and T_i as +// multiples of P, define a_i and t_i to be scalar coefficients of A_i and T_i. +// Thus a_j = t_j ≠ 0 (mod n). Note a_i and t_i may not be reduced mod n. t_i is +// the value of the w signed bits s_(i+w-1) ... s_i. a_i is computed as a_i = +// 2^w * (a_(i+w) + t_(i+w)). +// +// t_i is bounded by -2^(w-1) <= t_i <= 2^(w-1). Additionally, we may write it +// in terms of unsigned bits b_i. t_i consists of signed bits s_(i+w-1) ... s_i. +// This is computed as: +// +// b_(i+w-2) b_(i+w-3) ... b_i b_(i-1) +// - b_(i+w-1) b_(i+w-2) ... b_(i+1) b_i +// -------------------------------------------- +// t_i = s_(i+w-1) s_(i+w-2) ... s_(i+1) s_i +// +// Observe that b_(i+w-2) through b_i occur in both terms. Let x be the integer +// represented by that bit string, i.e. 2^(w-2)*b_(i+w-2) + ... + b_i. +// +// t_i = (2*x + b_(i-1)) - (2^(w-1)*b_(i+w-1) + x) +// = x - 2^(w-1)*b_(i+w-1) + b_(i-1) +// +// Or, using C notation for bit operations: +// +// t_i = (k>>i) & ((1<<(w-1)) - 1) - (k>>i) & (1<<(w-1)) + (k>>(i-1)) & 1 +// +// Note b_(i-1) is added in left-shifted by one (or doubled) from its place. +// This is compensated by t_(i-w)'s subtraction term. Thus, a_i may be computed +// by adding b_l b_(l-1) ... b_(i+1) b_i and an extra copy of b_(i-1). In C +// notation, this is: +// +// a_i = (k>>(i+w)) << w + ((k>>(i+w-1)) & 1) << w +// +// Observe that, while t_i may be positive or negative, a_i is bounded by +// 0 <= a_i < n + 2^w. Additionally, a_i can only be zero if b_(i+w-1) and up +// are all zero. (Note this implies a non-trivial P + (-P) is unreachable for +// all groups. That would imply the subsequent a_i is zero, which means all +// terms thus far were zero.) +// +// Returning to our doubling position, we have a_j = t_j (mod n). We now +// determine the value of a_j - t_j, which must be divisible by n. Our bounds on +// a_j and t_j imply a_j - t_j is 0 or n. If it is 0, a_j = t_j. However, 2^w +// divides a_j and -2^(w-1) <= t_j <= 2^(w-1), so this can only happen if +// a_j = t_j = 0, which is a trivial doubling. Therefore, a_j - t_j = n. +// +// Now we determine j. Suppose j > 0. w divides j, so j >= w. Then, +// +// n = a_j - t_j = (k>>(j+w)) << w + ((k>>(j+w-1)) & 1) << w - t_j +// <= k/2^j + 2^w - t_j +// < n/2^w + 2^w + 2^(w-1) +// +// n is much larger than 2^w, so this is impossible. Thus, j = 0: only the final +// addition may hit the doubling case. +// +// Finally, we consider bit patterns for n and k. Divide k into k_H + k_M + k_L +// such that k_H is the contribution from b_(l-1) .. b_w, k_M is the +// contribution from b_(w-1), and k_L is the contribution from b_(w-2) ... b_0. +// That is: +// +// - 2^w divides k_H +// - k_M is 0 or 2^(w-1) +// - 0 <= k_L < 2^(w-1) +// +// Divide n into n_H + n_M + n_L similarly. 
We thus have: +// +// t_0 = (k>>0) & ((1<<(w-1)) - 1) - (k>>0) & (1<<(w-1)) + (k>>(0-1)) & 1 +// = k & ((1<<(w-1)) - 1) - k & (1<<(w-1)) +// = k_L - k_M +// +// a_0 = (k>>(0+w)) << w + ((k>>(0+w-1)) & 1) << w +// = (k>>w) << w + ((k>>(w-1)) & 1) << w +// = k_H + 2*k_M +// +// n = a_0 - t_0 +// n_H + n_M + n_L = (k_H + 2*k_M) - (k_L - k_M) +// = k_H + 3*k_M - k_L +// +// k_H - k_L < k and k < n, so k_H - k_L ≠ n. Therefore k_M is not 0 and must be +// 2^(w-1). Now we consider k_H and n_H. We know k_H <= n_H. Suppose k_H = n_H. +// Then, +// +// n_M + n_L = 3*(2^(w-1)) - k_L +// > 3*(2^(w-1)) - 2^(w-1) +// = 2^w +// +// Contradiction (n_M + n_L is the bottom w bits of n). Thus k_H < n_H. Suppose +// k_H < n_H - 2*2^w. Then, +// +// n_H + n_M + n_L = k_H + 3*(2^(w-1)) - k_L +// < n_H - 2*2^w + 3*(2^(w-1)) - k_L +// n_M + n_L < -2^(w-1) - k_L +// +// Contradiction. Thus, k_H = n_H - 2^w. (Note 2^w divides n_H and k_H.) Thus, +// +// n_H + n_M + n_L = k_H + 3*(2^(w-1)) - k_L +// = n_H - 2^w + 3*(2^(w-1)) - k_L +// n_M + n_L = 2^(w-1) - k_L +// <= 2^(w-1) +// +// Equality would mean 2^(w-1) divides n, which is impossible if n is prime. +// Thus n_M + n_L < 2^(w-1), so n_M is zero, proving our condition. +// +// This proof constructs k, so, to show the converse, let k_H = n_H - 2^w, +// k_M = 2^(w-1), k_L = 2^(w-1) - n_L. This will result in a non-trivial point +// doubling in the final addition and is the only such scalar. +// +// COMMON CURVES: +// +// The group orders for common curves end in the following bit patterns: +// +// P-521: ...00001001; w = 4 is okay +// P-384: ...01110011; w = 2, 5, 6, 7 are okay +// P-256: ...01010001; w = 5, 7 are okay +// P-224: ...00111101; w = 3, 4, 5, 6 are okay +static inline void booth_recode(crypto_word_t *is_negative, crypto_word_t *digit, + crypto_word_t in, crypto_word_t w) { + debug_assert_nonsecret(w >= 2); + debug_assert_nonsecret(w <= 7); + + // Set all bits of `s` to MSB(in), similar to |constant_time_msb_s|, + // but 'in' seen as (`w+1`)-bit value. + crypto_word_t s = ~((in >> w) - 1); + crypto_word_t d; + d = ((crypto_word_t)1u << (w + 1)) - in - 1; + d = (d & s) | (in & ~s); + d = (d >> 1) + (d & 1); + + *is_negative = constant_time_is_nonzero_w(s & 1); + *digit = d; +} + +#if defined(__GNUC__) +#pragma GCC diagnostic pop +#endif + +void little_endian_bytes_from_scalar(uint8_t str[], size_t str_len, + const Limb scalar[], + size_t num_limbs); + +#endif // OPENSSL_HEADER_EC_ECP_NISTZ_H diff --git a/ring-0.17.14/crypto/fipsmodule/ec/ecp_nistz384.h b/ring-0.17.14/crypto/fipsmodule/ec/ecp_nistz384.h new file mode 100644 index 0000000000..ca87e60721 --- /dev/null +++ b/ring-0.17.14/crypto/fipsmodule/ec/ecp_nistz384.h @@ -0,0 +1,34 @@ +/* Copyright (c) 2014, Intel Corporation. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
*/ + +#ifndef OPENSSL_HEADER_EC_ECP_NISTZ384_H +#define OPENSSL_HEADER_EC_ECP_NISTZ384_H + +#include "../../limbs/limbs.h" + +#define P384_LIMBS (384u / LIMB_BITS) + +typedef struct { + Limb X[P384_LIMBS]; + Limb Y[P384_LIMBS]; + Limb Z[P384_LIMBS]; +} P384_POINT; + +typedef struct { + Limb X[P384_LIMBS]; + Limb Y[P384_LIMBS]; +} P384_POINT_AFFINE; + + +#endif // OPENSSL_HEADER_EC_ECP_NISTZ384_H diff --git a/ring-0.17.14/crypto/fipsmodule/ec/ecp_nistz384.inl b/ring-0.17.14/crypto/fipsmodule/ec/ecp_nistz384.inl new file mode 100644 index 0000000000..ae28f97ae5 --- /dev/null +++ b/ring-0.17.14/crypto/fipsmodule/ec/ecp_nistz384.inl @@ -0,0 +1,300 @@ +/* Copyright (c) 2014, Intel Corporation. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ + +/* Developers and authors: + * Shay Gueron (1, 2), and Vlad Krasnov (1) + * (1) Intel Corporation, Israel Development Center + * (2) University of Haifa + * Reference: + * Shay Gueron and Vlad Krasnov + * "Fast Prime Field Elliptic Curve Cryptography with 256 Bit Primes" + * http://eprint.iacr.org/2013/816 */ + +#include "ecp_nistz.h" + +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wsign-conversion" +#endif + +/* Point double: r = 2*a */ +static void nistz384_point_double(P384_POINT *r, const P384_POINT *a) { + BN_ULONG S[P384_LIMBS]; + BN_ULONG M[P384_LIMBS]; + BN_ULONG Zsqr[P384_LIMBS]; + BN_ULONG tmp0[P384_LIMBS]; + + const BN_ULONG *in_x = a->X; + const BN_ULONG *in_y = a->Y; + const BN_ULONG *in_z = a->Z; + + BN_ULONG *res_x = r->X; + BN_ULONG *res_y = r->Y; + BN_ULONG *res_z = r->Z; + + elem_mul_by_2(S, in_y); + + elem_sqr_mont(Zsqr, in_z); + + elem_sqr_mont(S, S); + + elem_mul_mont(res_z, in_z, in_y); + elem_mul_by_2(res_z, res_z); + + elem_add(M, in_x, Zsqr); + elem_sub(Zsqr, in_x, Zsqr); + + elem_sqr_mont(res_y, S); + elem_div_by_2(res_y, res_y); + + elem_mul_mont(M, M, Zsqr); + elem_mul_by_3(M, M); + + elem_mul_mont(S, S, in_x); + elem_mul_by_2(tmp0, S); + + elem_sqr_mont(res_x, M); + + elem_sub(res_x, res_x, tmp0); + elem_sub(S, S, res_x); + + elem_mul_mont(S, S, M); + elem_sub(res_y, S, res_y); +} + +/* Point addition: r = a+b */ +static void nistz384_point_add(P384_POINT *r, const P384_POINT *a, + const P384_POINT *b) { + BN_ULONG U2[P384_LIMBS], S2[P384_LIMBS]; + BN_ULONG U1[P384_LIMBS], S1[P384_LIMBS]; + BN_ULONG Z1sqr[P384_LIMBS]; + BN_ULONG Z2sqr[P384_LIMBS]; + BN_ULONG H[P384_LIMBS], R[P384_LIMBS]; + BN_ULONG Hsqr[P384_LIMBS]; + BN_ULONG Rsqr[P384_LIMBS]; + BN_ULONG Hcub[P384_LIMBS]; + + BN_ULONG res_x[P384_LIMBS]; + BN_ULONG res_y[P384_LIMBS]; + BN_ULONG res_z[P384_LIMBS]; + + const BN_ULONG *in1_x = a->X; + const BN_ULONG *in1_y = a->Y; + const BN_ULONG *in1_z = a->Z; + + const BN_ULONG *in2_x = b->X; + const BN_ULONG *in2_y = b->Y; + const BN_ULONG *in2_z = b->Z; + + BN_ULONG in1infty = 
is_zero(a->Z); + BN_ULONG in2infty = is_zero(b->Z); + + elem_sqr_mont(Z2sqr, in2_z); /* Z2^2 */ + elem_sqr_mont(Z1sqr, in1_z); /* Z1^2 */ + + elem_mul_mont(S1, Z2sqr, in2_z); /* S1 = Z2^3 */ + elem_mul_mont(S2, Z1sqr, in1_z); /* S2 = Z1^3 */ + + elem_mul_mont(S1, S1, in1_y); /* S1 = Y1*Z2^3 */ + elem_mul_mont(S2, S2, in2_y); /* S2 = Y2*Z1^3 */ + elem_sub(R, S2, S1); /* R = S2 - S1 */ + + elem_mul_mont(U1, in1_x, Z2sqr); /* U1 = X1*Z2^2 */ + elem_mul_mont(U2, in2_x, Z1sqr); /* U2 = X2*Z1^2 */ + elem_sub(H, U2, U1); /* H = U2 - U1 */ + + BN_ULONG is_exceptional = is_equal(U1, U2) & ~in1infty & ~in2infty; + if (is_exceptional) { + if (is_equal(S1, S2)) { + nistz384_point_double(r, a); + } else { + limbs_zero(r->X, P384_LIMBS); + limbs_zero(r->Y, P384_LIMBS); + limbs_zero(r->Z, P384_LIMBS); + } + return; + } + + elem_sqr_mont(Rsqr, R); /* R^2 */ + elem_mul_mont(res_z, H, in1_z); /* Z3 = H*Z1*Z2 */ + elem_sqr_mont(Hsqr, H); /* H^2 */ + elem_mul_mont(res_z, res_z, in2_z); /* Z3 = H*Z1*Z2 */ + elem_mul_mont(Hcub, Hsqr, H); /* H^3 */ + + elem_mul_mont(U2, U1, Hsqr); /* U1*H^2 */ + elem_mul_by_2(Hsqr, U2); /* 2*U1*H^2 */ + + elem_sub(res_x, Rsqr, Hsqr); + elem_sub(res_x, res_x, Hcub); + + elem_sub(res_y, U2, res_x); + + elem_mul_mont(S2, S1, Hcub); + elem_mul_mont(res_y, R, res_y); + elem_sub(res_y, res_y, S2); + + copy_conditional(res_x, in2_x, in1infty); + copy_conditional(res_y, in2_y, in1infty); + copy_conditional(res_z, in2_z, in1infty); + + copy_conditional(res_x, in1_x, in2infty); + copy_conditional(res_y, in1_y, in2infty); + copy_conditional(res_z, in1_z, in2infty); + + limbs_copy(r->X, res_x, P384_LIMBS); + limbs_copy(r->Y, res_y, P384_LIMBS); + limbs_copy(r->Z, res_z, P384_LIMBS); +} + +static void add_precomputed_w5(P384_POINT *r, crypto_word_t wvalue, + const P384_POINT table[16]) { + crypto_word_t recoded_is_negative; + crypto_word_t recoded; + booth_recode(&recoded_is_negative, &recoded, wvalue, 5); + + alignas(64) P384_POINT h; + p384_point_select_w5(&h, table, recoded); + + alignas(64) BN_ULONG tmp[P384_LIMBS]; + p384_elem_neg(tmp, h.Y); + copy_conditional(h.Y, tmp, recoded_is_negative); + + nistz384_point_add(r, r, &h); +} + +/* r = p * p_scalar */ +static void nistz384_point_mul(P384_POINT *r, + const BN_ULONG p_scalar[P384_LIMBS], + const Limb p_x[P384_LIMBS], + const Limb p_y[P384_LIMBS]) { + static const size_t kWindowSize = 5; + static const crypto_word_t kMask = (1 << (5 /* kWindowSize */ + 1)) - 1; + + uint8_t p_str[(P384_LIMBS * sizeof(Limb)) + 1]; + little_endian_bytes_from_scalar(p_str, sizeof(p_str) / sizeof(p_str[0]), + p_scalar, P384_LIMBS); + + /* A |P384_POINT| is (3 * 48) = 144 bytes, and the 64-byte alignment should + * add no more than 63 bytes of overhead. Thus, |table| should require + * ~2367 ((144 * 16) + 63) bytes of stack space. */ + alignas(64) P384_POINT table[16]; + + /* table[0] is implicitly (0,0,0) (the point at infinity), therefore it is + * not stored. All other values are actually stored with an offset of -1 in + * table. 
*/ + P384_POINT *row = table; + + limbs_copy(row[1 - 1].X, p_x, P384_LIMBS); + limbs_copy(row[1 - 1].Y, p_y, P384_LIMBS); + limbs_copy(row[1 - 1].Z, ONE, P384_LIMBS); + + nistz384_point_double(&row[2 - 1], &row[1 - 1]); + nistz384_point_add(&row[3 - 1], &row[2 - 1], &row[1 - 1]); + nistz384_point_double(&row[4 - 1], &row[2 - 1]); + nistz384_point_double(&row[6 - 1], &row[3 - 1]); + nistz384_point_double(&row[8 - 1], &row[4 - 1]); + nistz384_point_double(&row[12 - 1], &row[6 - 1]); + nistz384_point_add(&row[5 - 1], &row[4 - 1], &row[1 - 1]); + nistz384_point_add(&row[7 - 1], &row[6 - 1], &row[1 - 1]); + nistz384_point_add(&row[9 - 1], &row[8 - 1], &row[1 - 1]); + nistz384_point_add(&row[13 - 1], &row[12 - 1], &row[1 - 1]); + nistz384_point_double(&row[14 - 1], &row[7 - 1]); + nistz384_point_double(&row[10 - 1], &row[5 - 1]); + nistz384_point_add(&row[15 - 1], &row[14 - 1], &row[1 - 1]); + nistz384_point_add(&row[11 - 1], &row[10 - 1], &row[1 - 1]); + nistz384_point_double(&row[16 - 1], &row[8 - 1]); + + static const size_t START_INDEX = 384 - 4; + size_t index = START_INDEX; + + BN_ULONG recoded_is_negative; + crypto_word_t recoded; + + crypto_word_t wvalue = p_str[(index - 1) / 8]; + wvalue = (wvalue >> ((index - 1) % 8)) & kMask; + + booth_recode(&recoded_is_negative, &recoded, wvalue, 5); + dev_assert_secret(!recoded_is_negative); + + p384_point_select_w5(r, table, recoded); + + while (index >= kWindowSize) { + if (index != START_INDEX) { + size_t off = (index - 1) / 8; + + wvalue = p_str[off] | p_str[off + 1] << 8; + wvalue = (wvalue >> ((index - 1) % 8)) & kMask; + add_precomputed_w5(r, wvalue, table); + } + + index -= kWindowSize; + + nistz384_point_double(r, r); + nistz384_point_double(r, r); + nistz384_point_double(r, r); + nistz384_point_double(r, r); + nistz384_point_double(r, r); + } + + /* Final window */ + wvalue = p_str[0]; + wvalue = (wvalue << 1) & kMask; + add_precomputed_w5(r, wvalue, table); +} + +void p384_point_double(Limb r[3][P384_LIMBS], const Limb a[3][P384_LIMBS]) +{ + P384_POINT t; + limbs_copy(t.X, a[0], P384_LIMBS); + limbs_copy(t.Y, a[1], P384_LIMBS); + limbs_copy(t.Z, a[2], P384_LIMBS); + nistz384_point_double(&t, &t); + limbs_copy(r[0], t.X, P384_LIMBS); + limbs_copy(r[1], t.Y, P384_LIMBS); + limbs_copy(r[2], t.Z, P384_LIMBS); +} + +void p384_point_add(Limb r[3][P384_LIMBS], + const Limb a[3][P384_LIMBS], + const Limb b[3][P384_LIMBS]) +{ + P384_POINT t1; + limbs_copy(t1.X, a[0], P384_LIMBS); + limbs_copy(t1.Y, a[1], P384_LIMBS); + limbs_copy(t1.Z, a[2], P384_LIMBS); + + P384_POINT t2; + limbs_copy(t2.X, b[0], P384_LIMBS); + limbs_copy(t2.Y, b[1], P384_LIMBS); + limbs_copy(t2.Z, b[2], P384_LIMBS); + + nistz384_point_add(&t1, &t1, &t2); + + limbs_copy(r[0], t1.X, P384_LIMBS); + limbs_copy(r[1], t1.Y, P384_LIMBS); + limbs_copy(r[2], t1.Z, P384_LIMBS); +} + +void p384_point_mul(Limb r[3][P384_LIMBS], const BN_ULONG p_scalar[P384_LIMBS], + const Limb p_x[P384_LIMBS], const Limb p_y[P384_LIMBS]) { + alignas(64) P384_POINT acc; + nistz384_point_mul(&acc, p_scalar, p_x, p_y); + limbs_copy(r[0], acc.X, P384_LIMBS); + limbs_copy(r[1], acc.Y, P384_LIMBS); + limbs_copy(r[2], acc.Z, P384_LIMBS); +} + +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic pop +#endif diff --git a/ring-0.17.14/crypto/fipsmodule/ec/gfp_p256.c b/ring-0.17.14/crypto/fipsmodule/ec/gfp_p256.c new file mode 100644 index 0000000000..2aa0ae2ce2 --- /dev/null +++ b/ring-0.17.14/crypto/fipsmodule/ec/gfp_p256.c @@ -0,0 +1,54 @@ +/* Copyright 2016 Brian Smith. 
+ * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ + +#include "./p256_shared.h" + +#include "../../limbs/limbs.h" + +#if !defined(OPENSSL_USE_NISTZ256) + +typedef Limb ScalarMont[P256_LIMBS]; +typedef Limb Scalar[P256_LIMBS]; + +#include "../bn/internal.h" + +static const BN_ULONG N[P256_LIMBS] = { +#if defined(OPENSSL_64_BIT) + 0xf3b9cac2fc632551, 0xbce6faada7179e84, 0xffffffffffffffff, 0xffffffff00000000 +#else + 0xfc632551, 0xf3b9cac2, 0xa7179e84, 0xbce6faad, 0xffffffff, 0xffffffff, 0, + 0xffffffff +#endif +}; + +static const BN_ULONG N_N0[] = { + BN_MONT_CTX_N0(0xccd1c8aa, 0xee00bc4f) +}; + +void p256_scalar_mul_mont(ScalarMont r, const ScalarMont a, + const ScalarMont b) { + /* XXX: Inefficient. TODO: optimize with dedicated multiplication routine. */ + bn_mul_mont_small(r, a, b, N, N_N0, P256_LIMBS); +} + +/* XXX: Inefficient. TODO: optimize with dedicated squaring routine. */ +void p256_scalar_sqr_rep_mont(ScalarMont r, const ScalarMont a, Limb rep) { + dev_assert_secret(rep >= 1); + p256_scalar_mul_mont(r, a, a); + for (Limb i = 1; i < rep; ++i) { + p256_scalar_mul_mont(r, r, r); + } +} + +#endif diff --git a/ring-0.17.14/crypto/fipsmodule/ec/gfp_p384.c b/ring-0.17.14/crypto/fipsmodule/ec/gfp_p384.c new file mode 100644 index 0000000000..f9a66faa2a --- /dev/null +++ b/ring-0.17.14/crypto/fipsmodule/ec/gfp_p384.c @@ -0,0 +1,246 @@ +/* Copyright 2016 Brian Smith. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ + +#include "../../limbs/limbs.h" + +#include "ecp_nistz384.h" +#include "../bn/internal.h" +#include "../../internal.h" + +#include "../../limbs/limbs.inl" + + /* XXX: Here we assume that the conversion from |Carry| to |Limb| is + * constant-time, but we haven't verified that assumption. TODO: Fix it so + * we don't need to make that assumption. 
*/ + + +typedef Limb Elem[P384_LIMBS]; +typedef Limb ScalarMont[P384_LIMBS]; +typedef Limb Scalar[P384_LIMBS]; + +static const BN_ULONG Q[P384_LIMBS] = { +#if defined(OPENSSL_64_BIT) + 0xffffffff, 0xffffffff00000000, 0xfffffffffffffffe, 0xffffffffffffffff, + 0xffffffffffffffff, 0xffffffffffffffff +#else + 0xffffffff, 0, 0, 0xffffffff, 0xfffffffe, 0xffffffff, 0xffffffff, 0xffffffff, + 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff +#endif +}; + +static const BN_ULONG N[P384_LIMBS] = { +#if defined(OPENSSL_64_BIT) + 0xecec196accc52973, 0x581a0db248b0a77a, 0xc7634d81f4372ddf, 0xffffffffffffffff, + 0xffffffffffffffff, 0xffffffffffffffff +#else + 0xccc52973, 0xecec196a, 0x48b0a77a, 0x581a0db2, 0xf4372ddf, 0xc7634d81, + 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff +#endif +}; + +static const BN_ULONG ONE[P384_LIMBS] = { +#if defined(OPENSSL_64_BIT) + 0xffffffff00000001, 0xffffffff, 1, 0, 0 +#else + 1, 0xffffffff, 0xffffffff, 0, 1, 0, 0, 0, 0, 0 +#endif +}; + +static const Elem Q_PLUS_1_SHR_1 = { +#if defined(OPENSSL_64_BIT) + 0x80000000, 0x7fffffff80000000, 0xffffffffffffffff, 0xffffffffffffffff, + 0xffffffffffffffff, 0x7fffffffffffffff +#else + 0x80000000, 0, 0x80000000, 0x7fffffff, 0xffffffff, 0xffffffff, 0xffffffff, + 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x7fffffff +#endif +}; + +static const BN_ULONG Q_N0[] = { + BN_MONT_CTX_N0(1, 1) +}; + +static const BN_ULONG N_N0[] = { + BN_MONT_CTX_N0(0x6ed46089, 0xe88fdc45) +}; + +/* XXX: MSVC for x86 warns when it fails to inline these functions it should + * probably inline. */ +#if defined(_MSC_VER) && !defined(__clang__) && defined(OPENSSL_X86) +#define INLINE_IF_POSSIBLE __forceinline +#else +#define INLINE_IF_POSSIBLE inline +#endif + +static inline Limb is_equal(const Elem a, const Elem b) { + return LIMBS_equal(a, b, P384_LIMBS); +} + +static inline Limb is_zero(const BN_ULONG a[P384_LIMBS]) { + return LIMBS_are_zero(a, P384_LIMBS); +} + +static inline void copy_conditional(Elem r, const Elem a, + const Limb condition) { + for (size_t i = 0; i < P384_LIMBS; ++i) { + r[i] = constant_time_select_w(condition, a[i], r[i]); + } +} + + +static inline void elem_add(Elem r, const Elem a, const Elem b) { + LIMBS_add_mod(r, a, b, Q, P384_LIMBS); +} + +static inline void elem_sub(Elem r, const Elem a, const Elem b) { + LIMBS_sub_mod(r, a, b, Q, P384_LIMBS); +} + +static void elem_div_by_2(Elem r, const Elem a) { + /* Consider the case where `a` is even. Then we can shift `a` right one bit + * and the result will still be valid because we didn't lose any bits and so + * `(a >> 1) * 2 == a (mod q)`, which is the invariant we must satisfy. + * + * The remainder of this comment is considering the case where `a` is odd. + * + * Since `a` is odd, it isn't the case that `(a >> 1) * 2 == a (mod q)` + * because the lowest bit is lost during the shift. For example, consider: + * + * ```python + * q = 2**384 - 2**128 - 2**96 + 2**32 - 1 + * a = 2**383 + * two_a = a * 2 % q + * assert two_a == 0x100000000ffffffffffffffff00000001 + * ``` + * + * Notice there how `(2 * a) % q` wrapped around to a smaller odd value. When + * we divide `two_a` by two (mod q), we need to get the value `2**383`, which + * we obviously can't get with just a right shift. + * + * `q` is odd, and `a` is odd, so `a + q` is even. We could calculate + * `(a + q) >> 1` and then reduce it mod `q`. However, then we would have to + * keep track of an extra most significant bit. We can avoid that by instead + * calculating `(a >> 1) + ((q + 1) >> 1)`. 
The `1` in `q + 1` is the least + * significant bit of `a`. `q + 1` is even, which means it can be shifted + * without losing any bits. Since `q` is odd, `q - 1` is even, so the largest + * odd field element is `q - 2`. Thus we know that `a <= q - 2`. We know + * `(q + 1) >> 1` is `(q + 1) / 2` since (`q + 1`) is even. The value of + * `a >> 1` is `(a - 1)/2` since the shift will drop the least significant + * bit of `a`, which is 1. Thus: + * + * sum = ((q + 1) >> 1) + (a >> 1) + * sum = (q + 1)/2 + (a >> 1) (substituting (q + 1)/2) + * <= (q + 1)/2 + (q - 2 - 1)/2 (substituting a <= q - 2) + * <= (q + 1)/2 + (q - 3)/2 (simplifying) + * <= (q + 1 + q - 3)/2 (factoring out the common divisor) + * <= (2q - 2)/2 (simplifying) + * <= q - 1 (simplifying) + * + * Thus, no reduction of the sum mod `q` is necessary. */ + + Limb is_odd = constant_time_is_nonzero_w(a[0] & 1); + + /* r = a >> 1. */ + Limb carry = a[P384_LIMBS - 1] & 1; + r[P384_LIMBS - 1] = a[P384_LIMBS - 1] >> 1; + for (size_t i = 1; i < P384_LIMBS; ++i) { + Limb new_carry = a[P384_LIMBS - i - 1]; + r[P384_LIMBS - i - 1] = + (a[P384_LIMBS - i - 1] >> 1) | (carry << (LIMB_BITS - 1)); + carry = new_carry; + } + + Elem adjusted; + BN_ULONG carry2 = limbs_add(adjusted, r, Q_PLUS_1_SHR_1, P384_LIMBS); + dev_assert_secret(carry2 == 0); + (void)carry2; + copy_conditional(r, adjusted, is_odd); +} + +static inline void elem_mul_mont(Elem r, const Elem a, const Elem b) { + /* XXX: Not (clearly) constant-time; inefficient.*/ + bn_mul_mont_small(r, a, b, Q, Q_N0, P384_LIMBS); +} + +static inline void elem_mul_by_2(Elem r, const Elem a) { + LIMBS_shl_mod(r, a, Q, P384_LIMBS); +} + +static INLINE_IF_POSSIBLE void elem_mul_by_3(Elem r, const Elem a) { + /* XXX: inefficient. TODO: Replace with an integrated shift + add. */ + Elem doubled; + elem_add(doubled, a, a); + elem_add(r, doubled, a); +} + +static inline void elem_sqr_mont(Elem r, const Elem a) { + /* XXX: Inefficient. TODO: Add a dedicated squaring routine. */ + elem_mul_mont(r, a, a); +} + +void p384_elem_sub(Elem r, const Elem a, const Elem b) { + elem_sub(r, a, b); +} + +void p384_elem_div_by_2(Elem r, const Elem a) { + elem_div_by_2(r, a); +} + +void p384_elem_mul_mont(Elem r, const Elem a, const Elem b) { + elem_mul_mont(r, a, b); +} + +void p384_elem_neg(Elem r, const Elem a) { + Limb is_zero = LIMBS_are_zero(a, P384_LIMBS); + Carry borrow = limbs_sub(r, Q, a, P384_LIMBS); + dev_assert_secret(borrow == 0); + (void)borrow; + for (size_t i = 0; i < P384_LIMBS; ++i) { + r[i] = constant_time_select_w(is_zero, 0, r[i]); + } +} + + +void p384_scalar_mul_mont(ScalarMont r, const ScalarMont a, + const ScalarMont b) { + /* XXX: Inefficient. TODO: Add dedicated multiplication routine. */ + bn_mul_mont_small(r, a, b, N, N_N0, P384_LIMBS); +} + + +/* TODO(perf): Optimize this. */ + +static void p384_point_select_w5(P384_POINT *out, + const P384_POINT table[16], size_t index) { + Elem x; limbs_zero(x, P384_LIMBS); + Elem y; limbs_zero(y, P384_LIMBS); + Elem z; limbs_zero(z, P384_LIMBS); + + // TODO: Rewrite in terms of |limbs_select|. 
+ for (size_t i = 0; i < 16; ++i) { + crypto_word_t equal = constant_time_eq_w(index, (crypto_word_t)i + 1); + for (size_t j = 0; j < P384_LIMBS; ++j) { + x[j] = constant_time_select_w(equal, table[i].X[j], x[j]); + y[j] = constant_time_select_w(equal, table[i].Y[j], y[j]); + z[j] = constant_time_select_w(equal, table[i].Z[j], z[j]); + } + } + + limbs_copy(out->X, x, P384_LIMBS); + limbs_copy(out->Y, y, P384_LIMBS); + limbs_copy(out->Z, z, P384_LIMBS); +} + + +#include "ecp_nistz384.inl" diff --git a/ring-0.17.14/crypto/fipsmodule/ec/p256-nistz-table.h b/ring-0.17.14/crypto/fipsmodule/ec/p256-nistz-table.h new file mode 100644 index 0000000000..03bf782f48 --- /dev/null +++ b/ring-0.17.14/crypto/fipsmodule/ec/p256-nistz-table.h @@ -0,0 +1,9502 @@ +// Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved. +// Copyright (c) 2015, Intel Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// This is the precomputed constant time access table for the code in +// p256-nistz.c, for the default generator. The table consists of 37 +// subtables, each subtable contains 64 affine points. The affine points are +// encoded as eight uint64's, four for the x coordinate and four for the y. +// Both values are in little-endian order. There are 37 tables because a +// signed, 6-bit wNAF form of the scalar is used and ceil(256/(6 + 1)) = 37. +// Within each table there are 64 values because the 6-bit wNAF value can take +// 64 values, ignoring the sign bit, which is implemented by performing a +// negation of the affine point when required. We would like to align it to 2MB +// in order to increase the chances of using a large page but that appears to +// lead to invalid ELF files being produced. + +// This file is generated by make_tables.go. 
+ +static const alignas(4096) PRECOMP256_ROW ecp_nistz256_precomputed[37] = { + {{{TOBN(0x79e730d4, 0x18a9143c), TOBN(0x75ba95fc, 0x5fedb601), + TOBN(0x79fb732b, 0x77622510), TOBN(0x18905f76, 0xa53755c6)}, + {TOBN(0xddf25357, 0xce95560a), TOBN(0x8b4ab8e4, 0xba19e45c), + TOBN(0xd2e88688, 0xdd21f325), TOBN(0x8571ff18, 0x25885d85)}}, + {{TOBN(0x850046d4, 0x10ddd64d), TOBN(0xaa6ae3c1, 0xa433827d), + TOBN(0x73220503, 0x8d1490d9), TOBN(0xf6bb32e4, 0x3dcf3a3b)}, + {TOBN(0x2f3648d3, 0x61bee1a5), TOBN(0x152cd7cb, 0xeb236ff8), + TOBN(0x19a8fb0e, 0x92042dbe), TOBN(0x78c57751, 0x0a5b8a3b)}}, + {{TOBN(0xffac3f90, 0x4eebc127), TOBN(0xb027f84a, 0x087d81fb), + TOBN(0x66ad77dd, 0x87cbbc98), TOBN(0x26936a3f, 0xb6ff747e)}, + {TOBN(0xb04c5c1f, 0xc983a7eb), TOBN(0x583e47ad, 0x0861fe1a), + TOBN(0x78820831, 0x1a2ee98e), TOBN(0xd5f06a29, 0xe587cc07)}}, + {{TOBN(0x74b0b50d, 0x46918dcc), TOBN(0x4650a6ed, 0xc623c173), + TOBN(0x0cdaacac, 0xe8100af2), TOBN(0x577362f5, 0x41b0176b)}, + {TOBN(0x2d96f24c, 0xe4cbaba6), TOBN(0x17628471, 0xfad6f447), + TOBN(0x6b6c36de, 0xe5ddd22e), TOBN(0x84b14c39, 0x4c5ab863)}}, + {{TOBN(0xbe1b8aae, 0xc45c61f5), TOBN(0x90ec649a, 0x94b9537d), + TOBN(0x941cb5aa, 0xd076c20c), TOBN(0xc9079605, 0x890523c8)}, + {TOBN(0xeb309b4a, 0xe7ba4f10), TOBN(0x73c568ef, 0xe5eb882b), + TOBN(0x3540a987, 0x7e7a1f68), TOBN(0x73a076bb, 0x2dd1e916)}}, + {{TOBN(0x40394737, 0x3e77664a), TOBN(0x55ae744f, 0x346cee3e), + TOBN(0xd50a961a, 0x5b17a3ad), TOBN(0x13074b59, 0x54213673)}, + {TOBN(0x93d36220, 0xd377e44b), TOBN(0x299c2b53, 0xadff14b5), + TOBN(0xf424d44c, 0xef639f11), TOBN(0xa4c9916d, 0x4a07f75f)}}, + {{TOBN(0x0746354e, 0xa0173b4f), TOBN(0x2bd20213, 0xd23c00f7), + TOBN(0xf43eaab5, 0x0c23bb08), TOBN(0x13ba5119, 0xc3123e03)}, + {TOBN(0x2847d030, 0x3f5b9d4d), TOBN(0x6742f2f2, 0x5da67bdd), + TOBN(0xef933bdc, 0x77c94195), TOBN(0xeaedd915, 0x6e240867)}}, + {{TOBN(0x27f14cd1, 0x9499a78f), TOBN(0x462ab5c5, 0x6f9b3455), + TOBN(0x8f90f02a, 0xf02cfc6b), TOBN(0xb763891e, 0xb265230d)}, + {TOBN(0xf59da3a9, 0x532d4977), TOBN(0x21e3327d, 0xcf9eba15), + TOBN(0x123c7b84, 0xbe60bbf0), TOBN(0x56ec12f2, 0x7706df76)}}, + {{TOBN(0x75c96e8f, 0x264e20e8), TOBN(0xabe6bfed, 0x59a7a841), + TOBN(0x2cc09c04, 0x44c8eb00), TOBN(0xe05b3080, 0xf0c4e16b)}, + {TOBN(0x1eb7777a, 0xa45f3314), TOBN(0x56af7bed, 0xce5d45e3), + TOBN(0x2b6e019a, 0x88b12f1a), TOBN(0x086659cd, 0xfd835f9b)}}, + {{TOBN(0x2c18dbd1, 0x9dc21ec8), TOBN(0x98f9868a, 0x0fcf8139), + TOBN(0x737d2cd6, 0x48250b49), TOBN(0xcc61c947, 0x24b3428f)}, + {TOBN(0x0c2b4078, 0x80dd9e76), TOBN(0xc43a8991, 0x383fbe08), + TOBN(0x5f7d2d65, 0x779be5d2), TOBN(0x78719a54, 0xeb3b4ab5)}}, + {{TOBN(0xea7d260a, 0x6245e404), TOBN(0x9de40795, 0x6e7fdfe0), + TOBN(0x1ff3a415, 0x8dac1ab5), TOBN(0x3e7090f1, 0x649c9073)}, + {TOBN(0x1a768561, 0x2b944e88), TOBN(0x250f939e, 0xe57f61c8), + TOBN(0x0c0daa89, 0x1ead643d), TOBN(0x68930023, 0xe125b88e)}}, + {{TOBN(0x04b71aa7, 0xd2697768), TOBN(0xabdedef5, 0xca345a33), + TOBN(0x2409d29d, 0xee37385e), TOBN(0x4ee1df77, 0xcb83e156)}, + {TOBN(0x0cac12d9, 0x1cbb5b43), TOBN(0x170ed2f6, 0xca895637), + TOBN(0x28228cfa, 0x8ade6d66), TOBN(0x7ff57c95, 0x53238aca)}}, + {{TOBN(0xccc42563, 0x4b2ed709), TOBN(0x0e356769, 0x856fd30d), + TOBN(0xbcbcd43f, 0x559e9811), TOBN(0x738477ac, 0x5395b759)}, + {TOBN(0x35752b90, 0xc00ee17f), TOBN(0x68748390, 0x742ed2e3), + TOBN(0x7cd06422, 0xbd1f5bc1), TOBN(0xfbc08769, 0xc9e7b797)}}, + {{TOBN(0xa242a35b, 0xb0cf664a), TOBN(0x126e48f7, 0x7f9707e3), + TOBN(0x1717bf54, 0xc6832660), TOBN(0xfaae7332, 0xfd12c72e)}, + {TOBN(0x27b52db7, 0x995d586b), 
TOBN(0xbe29569e, 0x832237c2), + TOBN(0xe8e4193e, 0x2a65e7db), TOBN(0x152706dc, 0x2eaa1bbb)}}, + {{TOBN(0x72bcd8b7, 0xbc60055b), TOBN(0x03cc23ee, 0x56e27e4b), + TOBN(0xee337424, 0xe4819370), TOBN(0xe2aa0e43, 0x0ad3da09)}, + {TOBN(0x40b8524f, 0x6383c45d), TOBN(0xd7663554, 0x42a41b25), + TOBN(0x64efa6de, 0x778a4797), TOBN(0x2042170a, 0x7079adf4)}}, + {{TOBN(0x808b0b65, 0x0bc6fb80), TOBN(0x5882e075, 0x3ffe2e6b), + TOBN(0xd5ef2f7c, 0x2c83f549), TOBN(0x54d63c80, 0x9103b723)}, + {TOBN(0xf2f11bd6, 0x52a23f9b), TOBN(0x3670c319, 0x4b0b6587), + TOBN(0x55c4623b, 0xb1580e9e), TOBN(0x64edf7b2, 0x01efe220)}}, + {{TOBN(0x97091dcb, 0xd53c5c9d), TOBN(0xf17624b6, 0xac0a177b), + TOBN(0xb0f13975, 0x2cfe2dff), TOBN(0xc1a35c0a, 0x6c7a574e)}, + {TOBN(0x227d3146, 0x93e79987), TOBN(0x0575bf30, 0xe89cb80e), + TOBN(0x2f4e247f, 0x0d1883bb), TOBN(0xebd51226, 0x3274c3d0)}}, + {{TOBN(0x5f3e51c8, 0x56ada97a), TOBN(0x4afc964d, 0x8f8b403e), + TOBN(0xa6f247ab, 0x412e2979), TOBN(0x675abd1b, 0x6f80ebda)}, + {TOBN(0x66a2bd72, 0x5e485a1d), TOBN(0x4b2a5caf, 0x8f4f0b3c), + TOBN(0x2626927f, 0x1b847bba), TOBN(0x6c6fc7d9, 0x0502394d)}}, + {{TOBN(0xfea912ba, 0xa5659ae8), TOBN(0x68363aba, 0x25e1a16e), + TOBN(0xb8842277, 0x752c41ac), TOBN(0xfe545c28, 0x2897c3fc)}, + {TOBN(0x2d36e9e7, 0xdc4c696b), TOBN(0x5806244a, 0xfba977c5), + TOBN(0x85665e9b, 0xe39508c1), TOBN(0xf720ee25, 0x6d12597b)}}, + {{TOBN(0x8a979129, 0xd2337a31), TOBN(0x5916868f, 0x0f862bdc), + TOBN(0x048099d9, 0x5dd283ba), TOBN(0xe2d1eeb6, 0xfe5bfb4e)}, + {TOBN(0x82ef1c41, 0x7884005d), TOBN(0xa2d4ec17, 0xffffcbae), + TOBN(0x9161c53f, 0x8aa95e66), TOBN(0x5ee104e1, 0xc5fee0d0)}}, + {{TOBN(0x562e4cec, 0xc135b208), TOBN(0x74e1b265, 0x4783f47d), + TOBN(0x6d2a506c, 0x5a3f3b30), TOBN(0xecead9f4, 0xc16762fc)}, + {TOBN(0xf29dd4b2, 0xe286e5b9), TOBN(0x1b0fadc0, 0x83bb3c61), + TOBN(0x7a75023e, 0x7fac29a4), TOBN(0xc086d5f1, 0xc9477fa3)}}, + {{TOBN(0x0fc61135, 0x2f6f3076), TOBN(0xc99ffa23, 0xe3912a9a), + TOBN(0x6a0b0685, 0xd2f8ba3d), TOBN(0xfdc777e8, 0xe93358a4)}, + {TOBN(0x94a787bb, 0x35415f04), TOBN(0x640c2d6a, 0x4d23fea4), + TOBN(0x9de917da, 0x153a35b5), TOBN(0x793e8d07, 0x5d5cd074)}}, + {{TOBN(0xf4f87653, 0x2de45068), TOBN(0x37c7a7e8, 0x9e2e1f6e), + TOBN(0xd0825fa2, 0xa3584069), TOBN(0xaf2cea7c, 0x1727bf42)}, + {TOBN(0x0360a4fb, 0x9e4785a9), TOBN(0xe5fda49c, 0x27299f4a), + TOBN(0x48068e13, 0x71ac2f71), TOBN(0x83d0687b, 0x9077666f)}}, + {{TOBN(0x6d3883b2, 0x15d02819), TOBN(0x6d0d7550, 0x40dd9a35), + TOBN(0x61d7cbf9, 0x1d2b469f), TOBN(0xf97b232f, 0x2efc3115)}, + {TOBN(0xa551d750, 0xb24bcbc7), TOBN(0x11ea4949, 0x88a1e356), + TOBN(0x7669f031, 0x93cb7501), TOBN(0x595dc55e, 0xca737b8a)}}, + {{TOBN(0xa4a319ac, 0xd837879f), TOBN(0x6fc1b49e, 0xed6b67b0), + TOBN(0xe3959933, 0x32f1f3af), TOBN(0x966742eb, 0x65432a2e)}, + {TOBN(0x4b8dc9fe, 0xb4966228), TOBN(0x96cc6312, 0x43f43950), + TOBN(0x12068859, 0xc9b731ee), TOBN(0x7b948dc3, 0x56f79968)}}, + {{TOBN(0x61e4ad32, 0xed1f8008), TOBN(0xe6c9267a, 0xd8b17538), + TOBN(0x1ac7c5eb, 0x857ff6fb), TOBN(0x994baaa8, 0x55f2fb10)}, + {TOBN(0x84cf14e1, 0x1d248018), TOBN(0x5a39898b, 0x628ac508), + TOBN(0x14fde97b, 0x5fa944f5), TOBN(0xed178030, 0xd12e5ac7)}}, + {{TOBN(0x042c2af4, 0x97e2feb4), TOBN(0xd36a42d7, 0xaebf7313), + TOBN(0x49d2c9eb, 0x084ffdd7), TOBN(0x9f8aa54b, 0x2ef7c76a)}, + {TOBN(0x9200b7ba, 0x09895e70), TOBN(0x3bd0c66f, 0xddb7fb58), + TOBN(0x2d97d108, 0x78eb4cbb), TOBN(0x2d431068, 0xd84bde31)}}, + {{TOBN(0x4b523eb7, 0x172ccd1f), TOBN(0x7323cb28, 0x30a6a892), + TOBN(0x97082ec0, 0xcfe153eb), TOBN(0xe97f6b6a, 0xf2aadb97)}, + {TOBN(0x1d3d393e, 
0xd1a83da1), TOBN(0xa6a7f9c7, 0x804b2a68), + TOBN(0x4a688b48, 0x2d0cb71e), TOBN(0xa9b4cc5f, 0x40585278)}}, + {{TOBN(0x5e5db46a, 0xcb66e132), TOBN(0xf1be963a, 0x0d925880), + TOBN(0x944a7027, 0x0317b9e2), TOBN(0xe266f959, 0x48603d48)}, + {TOBN(0x98db6673, 0x5c208899), TOBN(0x90472447, 0xa2fb18a3), + TOBN(0x8a966939, 0x777c619f), TOBN(0x3798142a, 0x2a3be21b)}}, + {{TOBN(0xb4241cb1, 0x3298b343), TOBN(0xa3a14e49, 0xb44f65a1), + TOBN(0xc5f4d6cd, 0x3ac77acd), TOBN(0xd0288cb5, 0x52b6fc3c)}, + {TOBN(0xd5cc8c2f, 0x1c040abc), TOBN(0xb675511e, 0x06bf9b4a), + TOBN(0xd667da37, 0x9b3aa441), TOBN(0x460d45ce, 0x51601f72)}}, + {{TOBN(0xe2f73c69, 0x6755ff89), TOBN(0xdd3cf7e7, 0x473017e6), + TOBN(0x8ef5689d, 0x3cf7600d), TOBN(0x948dc4f8, 0xb1fc87b4)}, + {TOBN(0xd9e9fe81, 0x4ea53299), TOBN(0x2d921ca2, 0x98eb6028), + TOBN(0xfaecedfd, 0x0c9803fc), TOBN(0xf38ae891, 0x4d7b4745)}}, + {{TOBN(0xd8c5fccf, 0xc5e3a3d8), TOBN(0xbefd904c, 0x4079dfbf), + TOBN(0xbc6d6a58, 0xfead0197), TOBN(0x39227077, 0x695532a4)}, + {TOBN(0x09e23e6d, 0xdbef42f5), TOBN(0x7e449b64, 0x480a9908), + TOBN(0x7b969c1a, 0xad9a2e40), TOBN(0x6231d792, 0x9591c2a4)}}, + {{TOBN(0x87151456, 0x0f664534), TOBN(0x85ceae7c, 0x4b68f103), + TOBN(0xac09c4ae, 0x65578ab9), TOBN(0x33ec6868, 0xf044b10c)}, + {TOBN(0x6ac4832b, 0x3a8ec1f1), TOBN(0x5509d128, 0x5847d5ef), + TOBN(0xf909604f, 0x763f1574), TOBN(0xb16c4303, 0xc32f63c4)}}, + {{TOBN(0xb6ab2014, 0x7ca23cd3), TOBN(0xcaa7a5c6, 0xa391849d), + TOBN(0x5b0673a3, 0x75678d94), TOBN(0xc982ddd4, 0xdd303e64)}, + {TOBN(0xfd7b000b, 0x5db6f971), TOBN(0xbba2cb1f, 0x6f876f92), + TOBN(0xc77332a3, 0x3c569426), TOBN(0xa159100c, 0x570d74f8)}}, + {{TOBN(0xfd16847f, 0xdec67ef5), TOBN(0x742ee464, 0x233e76b7), + TOBN(0x0b8e4134, 0xefc2b4c8), TOBN(0xca640b86, 0x42a3e521)}, + {TOBN(0x653a0190, 0x8ceb6aa9), TOBN(0x313c300c, 0x547852d5), + TOBN(0x24e4ab12, 0x6b237af7), TOBN(0x2ba90162, 0x8bb47af8)}}, + {{TOBN(0x3d5e58d6, 0xa8219bb7), TOBN(0xc691d0bd, 0x1b06c57f), + TOBN(0x0ae4cb10, 0xd257576e), TOBN(0x3569656c, 0xd54a3dc3)}, + {TOBN(0xe5ebaebd, 0x94cda03a), TOBN(0x934e82d3, 0x162bfe13), + TOBN(0x450ac0ba, 0xe251a0c6), TOBN(0x480b9e11, 0xdd6da526)}}, + {{TOBN(0x00467bc5, 0x8cce08b5), TOBN(0xb636458c, 0x7f178d55), + TOBN(0xc5748bae, 0xa677d806), TOBN(0x2763a387, 0xdfa394eb)}, + {TOBN(0xa12b448a, 0x7d3cebb6), TOBN(0xe7adda3e, 0x6f20d850), + TOBN(0xf63ebce5, 0x1558462c), TOBN(0x58b36143, 0x620088a8)}}, + {{TOBN(0x8a2cc3ca, 0x4d63c0ee), TOBN(0x51233117, 0x0fe948ce), + TOBN(0x7463fd85, 0x222ef33b), TOBN(0xadf0c7dc, 0x7c603d6c)}, + {TOBN(0x0ec32d3b, 0xfe7765e5), TOBN(0xccaab359, 0xbf380409), + TOBN(0xbdaa84d6, 0x8e59319c), TOBN(0xd9a4c280, 0x9c80c34d)}}, + {{TOBN(0xa9d89488, 0xa059c142), TOBN(0x6f5ae714, 0xff0b9346), + TOBN(0x068f237d, 0x16fb3664), TOBN(0x5853e4c4, 0x363186ac)}, + {TOBN(0xe2d87d23, 0x63c52f98), TOBN(0x2ec4a766, 0x81828876), + TOBN(0x47b864fa, 0xe14e7b1c), TOBN(0x0c0bc0e5, 0x69192408)}}, + {{TOBN(0xe4d7681d, 0xb82e9f3e), TOBN(0x83200f0b, 0xdf25e13c), + TOBN(0x8909984c, 0x66f27280), TOBN(0x462d7b00, 0x75f73227)}, + {TOBN(0xd90ba188, 0xf2651798), TOBN(0x74c6e18c, 0x36ab1c34), + TOBN(0xab256ea3, 0x5ef54359), TOBN(0x03466612, 0xd1aa702f)}}, + {{TOBN(0x624d6049, 0x2ed22e91), TOBN(0x6fdfe0b5, 0x6f072822), + TOBN(0xeeca1115, 0x39ce2271), TOBN(0x98100a4f, 0xdb01614f)}, + {TOBN(0xb6b0daa2, 0xa35c628f), TOBN(0xb6f94d2e, 0xc87e9a47), + TOBN(0xc6773259, 0x1d57d9ce), TOBN(0xf70bfeec, 0x03884a7b)}}, + {{TOBN(0x5fb35ccf, 0xed2bad01), TOBN(0xa155cbe3, 0x1da6a5c7), + TOBN(0xc2e2594c, 0x30a92f8f), TOBN(0x649c89ce, 0x5bfafe43)}, + 
{TOBN(0xd158667d, 0xe9ff257a), TOBN(0x9b359611, 0xf32c50ae), + TOBN(0x4b00b20b, 0x906014cf), TOBN(0xf3a8cfe3, 0x89bc7d3d)}}, + {{TOBN(0x4ff23ffd, 0x248a7d06), TOBN(0x80c5bfb4, 0x878873fa), + TOBN(0xb7d9ad90, 0x05745981), TOBN(0x179c85db, 0x3db01994)}, + {TOBN(0xba41b062, 0x61a6966c), TOBN(0x4d82d052, 0xeadce5a8), + TOBN(0x9e91cd3b, 0xa5e6a318), TOBN(0x47795f4f, 0x95b2dda0)}}, + {{TOBN(0xecfd7c1f, 0xd55a897c), TOBN(0x009194ab, 0xb29110fb), + TOBN(0x5f0e2046, 0xe381d3b0), TOBN(0x5f3425f6, 0xa98dd291)}, + {TOBN(0xbfa06687, 0x730d50da), TOBN(0x0423446c, 0x4b083b7f), + TOBN(0x397a247d, 0xd69d3417), TOBN(0xeb629f90, 0x387ba42a)}}, + {{TOBN(0x1ee426cc, 0xd5cd79bf), TOBN(0x0032940b, 0x946c6e18), + TOBN(0x1b1e8ae0, 0x57477f58), TOBN(0xe94f7d34, 0x6d823278)}, + {TOBN(0xc747cb96, 0x782ba21a), TOBN(0xc5254469, 0xf72b33a5), + TOBN(0x772ef6de, 0xc7f80c81), TOBN(0xd73acbfe, 0x2cd9e6b5)}}, + {{TOBN(0x4075b5b1, 0x49ee90d9), TOBN(0x785c339a, 0xa06e9eba), + TOBN(0xa1030d5b, 0xabf825e0), TOBN(0xcec684c3, 0xa42931dc)}, + {TOBN(0x42ab62c9, 0xc1586e63), TOBN(0x45431d66, 0x5ab43f2b), + TOBN(0x57c8b2c0, 0x55f7835d), TOBN(0x033da338, 0xc1b7f865)}}, + {{TOBN(0x283c7513, 0xcaa76097), TOBN(0x0a624fa9, 0x36c83906), + TOBN(0x6b20afec, 0x715af2c7), TOBN(0x4b969974, 0xeba78bfd)}, + {TOBN(0x220755cc, 0xd921d60e), TOBN(0x9b944e10, 0x7baeca13), + TOBN(0x04819d51, 0x5ded93d4), TOBN(0x9bbff86e, 0x6dddfd27)}}, + {{TOBN(0x6b344130, 0x77adc612), TOBN(0xa7496529, 0xbbd803a0), + TOBN(0x1a1baaa7, 0x6d8805bd), TOBN(0xc8403902, 0x470343ad)}, + {TOBN(0x39f59f66, 0x175adff1), TOBN(0x0b26d7fb, 0xb7d8c5b7), + TOBN(0xa875f5ce, 0x529d75e3), TOBN(0x85efc7e9, 0x41325cc2)}}, + {{TOBN(0x21950b42, 0x1ff6acd3), TOBN(0xffe70484, 0x53dc6909), + TOBN(0xff4cd0b2, 0x28766127), TOBN(0xabdbe608, 0x4fb7db2b)}, + {TOBN(0x837c9228, 0x5e1109e8), TOBN(0x26147d27, 0xf4645b5a), + TOBN(0x4d78f592, 0xf7818ed8), TOBN(0xd394077e, 0xf247fa36)}}, + {{TOBN(0x0fb9c2d0, 0x488c171a), TOBN(0xa78bfbaa, 0x13685278), + TOBN(0xedfbe268, 0xd5b1fa6a), TOBN(0x0dceb8db, 0x2b7eaba7)}, + {TOBN(0xbf9e8089, 0x9ae2b710), TOBN(0xefde7ae6, 0xa4449c96), + TOBN(0x43b7716b, 0xcc143a46), TOBN(0xd7d34194, 0xc3628c13)}}, + {{TOBN(0x508cec1c, 0x3b3f64c9), TOBN(0xe20bc0ba, 0x1e5edf3f), + TOBN(0xda1deb85, 0x2f4318d4), TOBN(0xd20ebe0d, 0x5c3fa443)}, + {TOBN(0x370b4ea7, 0x73241ea3), TOBN(0x61f1511c, 0x5e1a5f65), + TOBN(0x99a5e23d, 0x82681c62), TOBN(0xd731e383, 0xa2f54c2d)}}, + {{TOBN(0x2692f36e, 0x83445904), TOBN(0x2e0ec469, 0xaf45f9c0), + TOBN(0x905a3201, 0xc67528b7), TOBN(0x88f77f34, 0xd0e5e542)}, + {TOBN(0xf67a8d29, 0x5864687c), TOBN(0x23b92eae, 0x22df3562), + TOBN(0x5c27014b, 0x9bbec39e), TOBN(0x7ef2f226, 0x9c0f0f8d)}}, + {{TOBN(0x97359638, 0x546c4d8d), TOBN(0x5f9c3fc4, 0x92f24679), + TOBN(0x912e8bed, 0xa8c8acd9), TOBN(0xec3a318d, 0x306634b0)}, + {TOBN(0x80167f41, 0xc31cb264), TOBN(0x3db82f6f, 0x522113f2), + TOBN(0xb155bcd2, 0xdcafe197), TOBN(0xfba1da59, 0x43465283)}}, + {{TOBN(0xa0425b8e, 0xb212cf53), TOBN(0x4f2e512e, 0xf8557c5f), + TOBN(0xc1286ff9, 0x25c4d56c), TOBN(0xbb8a0fea, 0xee26c851)}, + {TOBN(0xc28f70d2, 0xe7d6107e), TOBN(0x7ee0c444, 0xe76265aa), + TOBN(0x3df277a4, 0x1d1936b1), TOBN(0x1a556e3f, 0xea9595eb)}}, + {{TOBN(0x258bbbf9, 0xe7305683), TOBN(0x31eea5bf, 0x07ef5be6), + TOBN(0x0deb0e4a, 0x46c814c1), TOBN(0x5cee8449, 0xa7b730dd)}, + {TOBN(0xeab495c5, 0xa0182bde), TOBN(0xee759f87, 0x9e27a6b4), + TOBN(0xc2cf6a68, 0x80e518ca), TOBN(0x25e8013f, 0xf14cf3f4)}}, + {{TOBN(0x8fc44140, 0x7e8d7a14), TOBN(0xbb1ff3ca, 0x9556f36a), + TOBN(0x6a844385, 0x14600044), TOBN(0xba3f0c4a, 0x7451ae63)}, 
+ {TOBN(0xdfcac25b, 0x1f9af32a), TOBN(0x01e0db86, 0xb1f2214b), + TOBN(0x4e9a5bc2, 0xa4b596ac), TOBN(0x83927681, 0x026c2c08)}}, + {{TOBN(0x3ec832e7, 0x7acaca28), TOBN(0x1bfeea57, 0xc7385b29), + TOBN(0x068212e3, 0xfd1eaf38), TOBN(0xc1329830, 0x6acf8ccc)}, + {TOBN(0xb909f2db, 0x2aac9e59), TOBN(0x5748060d, 0xb661782a), + TOBN(0xc5ab2632, 0xc79b7a01), TOBN(0xda44c6c6, 0x00017626)}}, + {{TOBN(0xf26c00e8, 0xa7ea82f0), TOBN(0x99cac80d, 0xe4299aaf), + TOBN(0xd66fe3b6, 0x7ed78be1), TOBN(0x305f725f, 0x648d02cd)}, + {TOBN(0x33ed1bc4, 0x623fb21b), TOBN(0xfa70533e, 0x7a6319ad), + TOBN(0x17ab562d, 0xbe5ffb3e), TOBN(0x06374994, 0x56674741)}}, + {{TOBN(0x69d44ed6, 0x5c46aa8e), TOBN(0x2100d5d3, 0xa8d063d1), + TOBN(0xcb9727ea, 0xa2d17c36), TOBN(0x4c2bab1b, 0x8add53b7)}, + {TOBN(0xa084e90c, 0x15426704), TOBN(0x778afcd3, 0xa837ebea), + TOBN(0x6651f701, 0x7ce477f8), TOBN(0xa0624998, 0x46fb7a8b)}}, + {{TOBN(0xdc1e6828, 0xed8a6e19), TOBN(0x33fc2336, 0x4189d9c7), + TOBN(0x026f8fe2, 0x671c39bc), TOBN(0xd40c4ccd, 0xbc6f9915)}, + {TOBN(0xafa135bb, 0xf80e75ca), TOBN(0x12c651a0, 0x22adff2c), + TOBN(0xc40a04bd, 0x4f51ad96), TOBN(0x04820109, 0xbbe4e832)}}, + {{TOBN(0x3667eb1a, 0x7f4c04cc), TOBN(0x59556621, 0xa9404f84), + TOBN(0x71cdf653, 0x7eceb50a), TOBN(0x994a44a6, 0x9b8335fa)}, + {TOBN(0xd7faf819, 0xdbeb9b69), TOBN(0x473c5680, 0xeed4350d), + TOBN(0xb6658466, 0xda44bba2), TOBN(0x0d1bc780, 0x872bdbf3)}}, + {{TOBN(0xe535f175, 0xa1962f91), TOBN(0x6ed7e061, 0xed58f5a7), + TOBN(0x177aa4c0, 0x2089a233), TOBN(0x0dbcb03a, 0xe539b413)}, + {TOBN(0xe3dc424e, 0xbb32e38e), TOBN(0x6472e5ef, 0x6806701e), + TOBN(0xdd47ff98, 0x814be9ee), TOBN(0x6b60cfff, 0x35ace009)}}, + {{TOBN(0xb8d3d931, 0x9ff91fe5), TOBN(0x039c4800, 0xf0518eed), + TOBN(0x95c37632, 0x9182cb26), TOBN(0x0763a434, 0x82fc568d)}, + {TOBN(0x707c04d5, 0x383e76ba), TOBN(0xac98b930, 0x824e8197), + TOBN(0x92bf7c8f, 0x91230de0), TOBN(0x90876a01, 0x40959b70)}}, + {{TOBN(0xdb6d96f3, 0x05968b80), TOBN(0x380a0913, 0x089f73b9), + TOBN(0x7da70b83, 0xc2c61e01), TOBN(0x95fb8394, 0x569b38c7)}, + {TOBN(0x9a3c6512, 0x80edfe2f), TOBN(0x8f726bb9, 0x8faeaf82), + TOBN(0x8010a4a0, 0x78424bf8), TOBN(0x29672044, 0x0e844970)}}}, + {{{TOBN(0x63c5cb81, 0x7a2ad62a), TOBN(0x7ef2b6b9, 0xac62ff54), + TOBN(0x3749bba4, 0xb3ad9db5), TOBN(0xad311f2c, 0x46d5a617)}, + {TOBN(0xb77a8087, 0xc2ff3b6d), TOBN(0xb46feaf3, 0x367834ff), + TOBN(0xf8aa266d, 0x75d6b138), TOBN(0xfa38d320, 0xec008188)}}, + {{TOBN(0x486d8ffa, 0x696946fc), TOBN(0x50fbc6d8, 0xb9cba56d), + TOBN(0x7e3d423e, 0x90f35a15), TOBN(0x7c3da195, 0xc0dd962c)}, + {TOBN(0xe673fdb0, 0x3cfd5d8b), TOBN(0x0704b7c2, 0x889dfca5), + TOBN(0xf6ce581f, 0xf52305aa), TOBN(0x399d49eb, 0x914d5e53)}}, + {{TOBN(0x380a496d, 0x6ec293cd), TOBN(0x733dbda7, 0x8e7051f5), + TOBN(0x037e388d, 0xb849140a), TOBN(0xee4b32b0, 0x5946dbf6)}, + {TOBN(0xb1c4fda9, 0xcae368d1), TOBN(0x5001a7b0, 0xfdb0b2f3), + TOBN(0x6df59374, 0x2e3ac46e), TOBN(0x4af675f2, 0x39b3e656)}}, + {{TOBN(0x44e38110, 0x39949296), TOBN(0x5b63827b, 0x361db1b5), + TOBN(0x3e5323ed, 0x206eaff5), TOBN(0x942370d2, 0xc21f4290)}, + {TOBN(0xf2caaf2e, 0xe0d985a1), TOBN(0x192cc64b, 0x7239846d), + TOBN(0x7c0b8f47, 0xae6312f8), TOBN(0x7dc61f91, 0x96620108)}}, + {{TOBN(0xb830fb5b, 0xc2da7de9), TOBN(0xd0e643df, 0x0ff8d3be), + TOBN(0x31ee77ba, 0x188a9641), TOBN(0x4e8aa3aa, 0xbcf6d502)}, + {TOBN(0xf9fb6532, 0x9a49110f), TOBN(0xd18317f6, 0x2dd6b220), + TOBN(0x7e3ced41, 0x52c3ea5a), TOBN(0x0d296a14, 0x7d579c4a)}}, + {{TOBN(0x35d6a53e, 0xed4c3717), TOBN(0x9f8240cf, 0x3d0ed2a3), + TOBN(0x8c0d4d05, 0xe5543aa5), TOBN(0x45d5bbfb, 
0xdd33b4b4)}, + {TOBN(0xfa04cc73, 0x137fd28e), TOBN(0x862ac6ef, 0xc73b3ffd), + TOBN(0x403ff9f5, 0x31f51ef2), TOBN(0x34d5e0fc, 0xbc73f5a2)}}, + {{TOBN(0xf2526820, 0x08913f4f), TOBN(0xea20ed61, 0xeac93d95), + TOBN(0x51ed38b4, 0x6ca6b26c), TOBN(0x8662dcbc, 0xea4327b0)}, + {TOBN(0x6daf295c, 0x725d2aaa), TOBN(0xbad2752f, 0x8e52dcda), + TOBN(0x2210e721, 0x0b17dacc), TOBN(0xa37f7912, 0xd51e8232)}}, + {{TOBN(0x4f7081e1, 0x44cc3add), TOBN(0xd5ffa1d6, 0x87be82cf), + TOBN(0x89890b6c, 0x0edd6472), TOBN(0xada26e1a, 0x3ed17863)}, + {TOBN(0x276f2715, 0x63483caa), TOBN(0xe6924cd9, 0x2f6077fd), + TOBN(0x05a7fe98, 0x0a466e3c), TOBN(0xf1c794b0, 0xb1902d1f)}}, + {{TOBN(0xe5213688, 0x82a8042c), TOBN(0xd931cfaf, 0xcd278298), + TOBN(0x069a0ae0, 0xf597a740), TOBN(0x0adbb3f3, 0xeb59107c)}, + {TOBN(0x983e951e, 0x5eaa8eb8), TOBN(0xe663a8b5, 0x11b48e78), + TOBN(0x1631cc0d, 0x8a03f2c5), TOBN(0x7577c11e, 0x11e271e2)}}, + {{TOBN(0x33b2385c, 0x08369a90), TOBN(0x2990c59b, 0x190eb4f8), + TOBN(0x819a6145, 0xc68eac80), TOBN(0x7a786d62, 0x2ec4a014)}, + {TOBN(0x33faadbe, 0x20ac3a8d), TOBN(0x31a21781, 0x5aba2d30), + TOBN(0x209d2742, 0xdba4f565), TOBN(0xdb2ce9e3, 0x55aa0fbb)}}, + {{TOBN(0x8cef334b, 0x168984df), TOBN(0xe81dce17, 0x33879638), + TOBN(0xf6e6949c, 0x263720f0), TOBN(0x5c56feaf, 0xf593cbec)}, + {TOBN(0x8bff5601, 0xfde58c84), TOBN(0x74e24117, 0x2eccb314), + TOBN(0xbcf01b61, 0x4c9a8a78), TOBN(0xa233e35e, 0x544c9868)}}, + {{TOBN(0xb3156bf3, 0x8bd7aff1), TOBN(0x1b5ee4cb, 0x1d81b146), + TOBN(0x7ba1ac41, 0xd628a915), TOBN(0x8f3a8f9c, 0xfd89699e)}, + {TOBN(0x7329b9c9, 0xa0748be7), TOBN(0x1d391c95, 0xa92e621f), + TOBN(0xe51e6b21, 0x4d10a837), TOBN(0xd255f53a, 0x4947b435)}}, + {{TOBN(0x07669e04, 0xf1788ee3), TOBN(0xc14f27af, 0xa86938a2), + TOBN(0x8b47a334, 0xe93a01c0), TOBN(0xff627438, 0xd9366808)}, + {TOBN(0x7a0985d8, 0xca2a5965), TOBN(0x3d9a5542, 0xd6e9b9b3), + TOBN(0xc23eb80b, 0x4cf972e8), TOBN(0x5c1c33bb, 0x4fdf72fd)}}, + {{TOBN(0x0c4a58d4, 0x74a86108), TOBN(0xf8048a8f, 0xee4c5d90), + TOBN(0xe3c7c924, 0xe86d4c80), TOBN(0x28c889de, 0x056a1e60)}, + {TOBN(0x57e2662e, 0xb214a040), TOBN(0xe8c48e98, 0x37e10347), + TOBN(0x87742862, 0x80ac748a), TOBN(0xf1c24022, 0x186b06f2)}}, + {{TOBN(0xac2dd4c3, 0x5f74040a), TOBN(0x409aeb71, 0xfceac957), + TOBN(0x4fbad782, 0x55c4ec23), TOBN(0xb359ed61, 0x8a7b76ec)}, + {TOBN(0x12744926, 0xed6f4a60), TOBN(0xe21e8d7f, 0x4b912de3), + TOBN(0xe2575a59, 0xfc705a59), TOBN(0x72f1d4de, 0xed2dbc0e)}}, + {{TOBN(0x3d2b24b9, 0xeb7926b8), TOBN(0xbff88cb3, 0xcdbe5509), + TOBN(0xd0f399af, 0xe4dd640b), TOBN(0x3c5fe130, 0x2f76ed45)}, + {TOBN(0x6f3562f4, 0x3764fb3d), TOBN(0x7b5af318, 0x3151b62d), + TOBN(0xd5bd0bc7, 0xd79ce5f3), TOBN(0xfdaf6b20, 0xec66890f)}}, + {{TOBN(0x735c67ec, 0x6063540c), TOBN(0x50b259c2, 0xe5f9cb8f), + TOBN(0xb8734f9a, 0x3f99c6ab), TOBN(0xf8cc13d5, 0xa3a7bc85)}, + {TOBN(0x80c1b305, 0xc5217659), TOBN(0xfe5364d4, 0x4ec12a54), + TOBN(0xbd87045e, 0x681345fe), TOBN(0x7f8efeb1, 0x582f897f)}}, + {{TOBN(0xe8cbf1e5, 0xd5923359), TOBN(0xdb0cea9d, 0x539b9fb0), + TOBN(0x0c5b34cf, 0x49859b98), TOBN(0x5e583c56, 0xa4403cc6)}, + {TOBN(0x11fc1a2d, 0xd48185b7), TOBN(0xc93fbc7e, 0x6e521787), + TOBN(0x47e7a058, 0x05105b8b), TOBN(0x7b4d4d58, 0xdb8260c8)}}, + {{TOBN(0xe33930b0, 0x46eb842a), TOBN(0x8e844a9a, 0x7bdae56d), + TOBN(0x34ef3a9e, 0x13f7fdfc), TOBN(0xb3768f82, 0x636ca176)}, + {TOBN(0x2821f4e0, 0x4e09e61c), TOBN(0x414dc3a1, 0xa0c7cddc), + TOBN(0xd5379437, 0x54945fcd), TOBN(0x151b6eef, 0xb3555ff1)}}, + {{TOBN(0xb31bd613, 0x6339c083), TOBN(0x39ff8155, 0xdfb64701), + TOBN(0x7c3388d2, 0xe29604ab), 
TOBN(0x1e19084b, 0xa6b10442)}, + {TOBN(0x17cf54c0, 0xeccd47ef), TOBN(0x89693385, 0x4a5dfb30), + TOBN(0x69d023fb, 0x47daf9f6), TOBN(0x9222840b, 0x7d91d959)}}, + {{TOBN(0x439108f5, 0x803bac62), TOBN(0x0b7dd91d, 0x379bd45f), + TOBN(0xd651e827, 0xca63c581), TOBN(0x5c5d75f6, 0x509c104f)}, + {TOBN(0x7d5fc738, 0x1f2dc308), TOBN(0x20faa7bf, 0xd98454be), + TOBN(0x95374bee, 0xa517b031), TOBN(0xf036b9b1, 0x642692ac)}}, + {{TOBN(0xc5106109, 0x39842194), TOBN(0xb7e2353e, 0x49d05295), + TOBN(0xfc8c1d5c, 0xefb42ee0), TOBN(0xe04884eb, 0x08ce811c)}, + {TOBN(0xf1f75d81, 0x7419f40e), TOBN(0x5b0ac162, 0xa995c241), + TOBN(0x120921bb, 0xc4c55646), TOBN(0x713520c2, 0x8d33cf97)}}, + {{TOBN(0xb4a65a5c, 0xe98c5100), TOBN(0x6cec871d, 0x2ddd0f5a), + TOBN(0x251f0b7f, 0x9ba2e78b), TOBN(0x224a8434, 0xce3a2a5f)}, + {TOBN(0x26827f61, 0x25f5c46f), TOBN(0x6a22bedc, 0x48545ec0), + TOBN(0x25ae5fa0, 0xb1bb5cdc), TOBN(0xd693682f, 0xfcb9b98f)}}, + {{TOBN(0x32027fe8, 0x91e5d7d3), TOBN(0xf14b7d17, 0x73a07678), + TOBN(0xf88497b3, 0xc0dfdd61), TOBN(0xf7c2eec0, 0x2a8c4f48)}, + {TOBN(0xaa5573f4, 0x3756e621), TOBN(0xc013a240, 0x1825b948), + TOBN(0x1c03b345, 0x63878572), TOBN(0xa0472bea, 0x653a4184)}}, + {{TOBN(0xf4222e27, 0x0ac69a80), TOBN(0x34096d25, 0xf51e54f6), + TOBN(0x00a648cb, 0x8fffa591), TOBN(0x4e87acdc, 0x69b6527f)}, + {TOBN(0x0575e037, 0xe285ccb4), TOBN(0x188089e4, 0x50ddcf52), + TOBN(0xaa96c9a8, 0x870ff719), TOBN(0x74a56cd8, 0x1fc7e369)}}, + {{TOBN(0x41d04ee2, 0x1726931a), TOBN(0x0bbbb2c8, 0x3660ecfd), + TOBN(0xa6ef6de5, 0x24818e18), TOBN(0xe421cc51, 0xe7d57887)}, + {TOBN(0xf127d208, 0xbea87be6), TOBN(0x16a475d3, 0xb1cdd682), + TOBN(0x9db1b684, 0x439b63f7), TOBN(0x5359b3db, 0xf0f113b6)}}, + {{TOBN(0xdfccf1de, 0x8bf06e31), TOBN(0x1fdf8f44, 0xdd383901), + TOBN(0x10775cad, 0x5017e7d2), TOBN(0xdfc3a597, 0x58d11eef)}, + {TOBN(0x6ec9c8a0, 0xb1ecff10), TOBN(0xee6ed6cc, 0x28400549), + TOBN(0xb5ad7bae, 0x1b4f8d73), TOBN(0x61b4f11d, 0xe00aaab9)}}, + {{TOBN(0x7b32d69b, 0xd4eff2d7), TOBN(0x88ae6771, 0x4288b60f), + TOBN(0x159461b4, 0x37a1e723), TOBN(0x1f3d4789, 0x570aae8c)}, + {TOBN(0x869118c0, 0x7f9871da), TOBN(0x35fbda78, 0xf635e278), + TOBN(0x738f3641, 0xe1541dac), TOBN(0x6794b13a, 0xc0dae45f)}}, + {{TOBN(0x065064ac, 0x09cc0917), TOBN(0x27c53729, 0xc68540fd), + TOBN(0x0d2d4c8e, 0xef227671), TOBN(0xd23a9f80, 0xa1785a04)}, + {TOBN(0x98c59528, 0x52650359), TOBN(0xfa09ad01, 0x74a1acad), + TOBN(0x082d5a29, 0x0b55bf5c), TOBN(0xa40f1c67, 0x419b8084)}}, + {{TOBN(0x3a5c752e, 0xdcc18770), TOBN(0x4baf1f2f, 0x8825c3a5), + TOBN(0xebd63f74, 0x21b153ed), TOBN(0xa2383e47, 0xb2f64723)}, + {TOBN(0xe7bf620a, 0x2646d19a), TOBN(0x56cb44ec, 0x03c83ffd), + TOBN(0xaf7267c9, 0x4f6be9f1), TOBN(0x8b2dfd7b, 0xc06bb5e9)}}, + {{TOBN(0xb87072f2, 0xa672c5c7), TOBN(0xeacb11c8, 0x0d53c5e2), + TOBN(0x22dac29d, 0xff435932), TOBN(0x37bdb99d, 0x4408693c)}, + {TOBN(0xf6e62fb6, 0x2899c20f), TOBN(0x3535d512, 0x447ece24), + TOBN(0xfbdc6b88, 0xff577ce3), TOBN(0x726693bd, 0x190575f2)}}, + {{TOBN(0x6772b0e5, 0xab4b35a2), TOBN(0x1d8b6001, 0xf5eeaacf), + TOBN(0x728f7ce4, 0x795b9580), TOBN(0x4a20ed2a, 0x41fb81da)}, + {TOBN(0x9f685cd4, 0x4fec01e6), TOBN(0x3ed7ddcc, 0xa7ff50ad), + TOBN(0x460fd264, 0x0c2d97fd), TOBN(0x3a241426, 0xeb82f4f9)}}, + {{TOBN(0x17d1df2c, 0x6a8ea820), TOBN(0xb2b50d3b, 0xf22cc254), + TOBN(0x03856cba, 0xb7291426), TOBN(0x87fd26ae, 0x04f5ee39)}, + {TOBN(0x9cb696cc, 0x02bee4ba), TOBN(0x53121804, 0x06820fd6), + TOBN(0xa5dfc269, 0x0212e985), TOBN(0x666f7ffa, 0x160f9a09)}}, + {{TOBN(0xc503cd33, 0xbccd9617), TOBN(0x365dede4, 0xba7730a3), + TOBN(0x798c6355, 
0x5ddb0786), TOBN(0xa6c3200e, 0xfc9cd3bc)}, + {TOBN(0x060ffb2c, 0xe5e35efd), TOBN(0x99a4e25b, 0x5555a1c1), + TOBN(0x11d95375, 0xf70b3751), TOBN(0x0a57354a, 0x160e1bf6)}}, + {{TOBN(0xecb3ae4b, 0xf8e4b065), TOBN(0x07a834c4, 0x2e53022b), + TOBN(0x1cd300b3, 0x8692ed96), TOBN(0x16a6f792, 0x61ee14ec)}, + {TOBN(0x8f1063c6, 0x6a8649ed), TOBN(0xfbcdfcfe, 0x869f3e14), + TOBN(0x2cfb97c1, 0x00a7b3ec), TOBN(0xcea49b3c, 0x7130c2f1)}}, + {{TOBN(0x462d044f, 0xe9d96488), TOBN(0x4b53d52e, 0x8182a0c1), + TOBN(0x84b6ddd3, 0x0391e9e9), TOBN(0x80ab7b48, 0xb1741a09)}, + {TOBN(0xec0e15d4, 0x27d3317f), TOBN(0x8dfc1ddb, 0x1a64671e), + TOBN(0x93cc5d5f, 0xd49c5b92), TOBN(0xc995d53d, 0x3674a331)}}, + {{TOBN(0x302e41ec, 0x090090ae), TOBN(0x2278a0cc, 0xedb06830), + TOBN(0x1d025932, 0xfbc99690), TOBN(0x0c32fbd2, 0xb80d68da)}, + {TOBN(0xd79146da, 0xf341a6c1), TOBN(0xae0ba139, 0x1bef68a0), + TOBN(0xc6b8a563, 0x8d774b3a), TOBN(0x1cf307bd, 0x880ba4d7)}}, + {{TOBN(0xc033bdc7, 0x19803511), TOBN(0xa9f97b3b, 0x8888c3be), + TOBN(0x3d68aebc, 0x85c6d05e), TOBN(0xc3b88a9d, 0x193919eb)}, + {TOBN(0x2d300748, 0xc48b0ee3), TOBN(0x7506bc7c, 0x07a746c1), + TOBN(0xfc48437c, 0x6e6d57f3), TOBN(0x5bd71587, 0xcfeaa91a)}}, + {{TOBN(0xa4ed0408, 0xc1bc5225), TOBN(0xd0b946db, 0x2719226d), + TOBN(0x109ecd62, 0x758d2d43), TOBN(0x75c8485a, 0x2751759b)}, + {TOBN(0xb0b75f49, 0x9ce4177a), TOBN(0x4fa61a1e, 0x79c10c3d), + TOBN(0xc062d300, 0xa167fcd7), TOBN(0x4df3874c, 0x750f0fa8)}}, + {{TOBN(0x29ae2cf9, 0x83dfedc9), TOBN(0xf8437134, 0x8d87631a), + TOBN(0xaf571711, 0x7429c8d2), TOBN(0x18d15867, 0x146d9272)}, + {TOBN(0x83053ecf, 0x69769bb7), TOBN(0xc55eb856, 0xc479ab82), + TOBN(0x5ef7791c, 0x21b0f4b2), TOBN(0xaa5956ba, 0x3d491525)}}, + {{TOBN(0x407a96c2, 0x9fe20eba), TOBN(0xf27168bb, 0xe52a5ad3), + TOBN(0x43b60ab3, 0xbf1d9d89), TOBN(0xe45c51ef, 0x710e727a)}, + {TOBN(0xdfca5276, 0x099b4221), TOBN(0x8dc6407c, 0x2557a159), + TOBN(0x0ead8335, 0x91035895), TOBN(0x0a9db957, 0x9c55dc32)}}, + {{TOBN(0xe40736d3, 0xdf61bc76), TOBN(0x13a619c0, 0x3f778cdb), + TOBN(0x6dd921a4, 0xc56ea28f), TOBN(0x76a52433, 0x2fa647b4)}, + {TOBN(0x23591891, 0xac5bdc5d), TOBN(0xff4a1a72, 0xbac7dc01), + TOBN(0x9905e261, 0x62df8453), TOBN(0x3ac045df, 0xe63b265f)}}, + {{TOBN(0x8a3f341b, 0xad53dba7), TOBN(0x8ec269cc, 0x837b625a), + TOBN(0xd71a2782, 0x3ae31189), TOBN(0x8fb4f9a3, 0x55e96120)}, + {TOBN(0x804af823, 0xff9875cf), TOBN(0x23224f57, 0x5d442a9b), + TOBN(0x1c4d3b9e, 0xecc62679), TOBN(0x91da22fb, 0xa0e7ddb1)}}, + {{TOBN(0xa370324d, 0x6c04a661), TOBN(0x9710d3b6, 0x5e376d17), + TOBN(0xed8c98f0, 0x3044e357), TOBN(0xc364ebbe, 0x6422701c)}, + {TOBN(0x347f5d51, 0x7733d61c), TOBN(0xd55644b9, 0xcea826c3), + TOBN(0x80c6e0ad, 0x55a25548), TOBN(0x0aa7641d, 0x844220a7)}}, + {{TOBN(0x1438ec81, 0x31810660), TOBN(0x9dfa6507, 0xde4b4043), + TOBN(0x10b515d8, 0xcc3e0273), TOBN(0x1b6066dd, 0x28d8cfb2)}, + {TOBN(0xd3b04591, 0x9c9efebd), TOBN(0x425d4bdf, 0xa21c1ff4), + TOBN(0x5fe5af19, 0xd57607d3), TOBN(0xbbf773f7, 0x54481084)}}, + {{TOBN(0x8435bd69, 0x94b03ed1), TOBN(0xd9ad1de3, 0x634cc546), + TOBN(0x2cf423fc, 0x00e420ca), TOBN(0xeed26d80, 0xa03096dd)}, + {TOBN(0xd7f60be7, 0xa4db09d2), TOBN(0xf47f569d, 0x960622f7), + TOBN(0xe5925fd7, 0x7296c729), TOBN(0xeff2db26, 0x26ca2715)}}, + {{TOBN(0xa6fcd014, 0xb913e759), TOBN(0x53da4786, 0x8ff4de93), + TOBN(0x14616d79, 0xc32068e1), TOBN(0xb187d664, 0xccdf352e)}, + {TOBN(0xf7afb650, 0x1dc90b59), TOBN(0x8170e943, 0x7daa1b26), + TOBN(0xc8e3bdd8, 0x700c0a84), TOBN(0x6e8d345f, 0x6482bdfa)}}, + {{TOBN(0x84cfbfa1, 0xc5c5ea50), TOBN(0xd3baf14c, 0x67960681), + 
TOBN(0x26398403, 0x0dd50942), TOBN(0xe4b7839c, 0x4716a663)}, + {TOBN(0xd5f1f794, 0xe7de6dc0), TOBN(0x5cd0f4d4, 0x622aa7ce), + TOBN(0x5295f3f1, 0x59acfeec), TOBN(0x8d933552, 0x953e0607)}}, + {{TOBN(0xc7db8ec5, 0x776c5722), TOBN(0xdc467e62, 0x2b5f290c), + TOBN(0xd4297e70, 0x4ff425a9), TOBN(0x4be924c1, 0x0cf7bb72)}, + {TOBN(0x0d5dc5ae, 0xa1892131), TOBN(0x8bf8a8e3, 0xa705c992), + TOBN(0x73a0b064, 0x7a305ac5), TOBN(0x00c9ca4e, 0x9a8c77a8)}}, + {{TOBN(0x5dfee80f, 0x83774bdd), TOBN(0x63131602, 0x85734485), + TOBN(0xa1b524ae, 0x914a69a9), TOBN(0xebc2ffaf, 0xd4e300d7)}, + {TOBN(0x52c93db7, 0x7cfa46a5), TOBN(0x71e6161f, 0x21653b50), + TOBN(0x3574fc57, 0xa4bc580a), TOBN(0xc09015dd, 0xe1bc1253)}}, + {{TOBN(0x4b7b47b2, 0xd174d7aa), TOBN(0x4072d8e8, 0xf3a15d04), + TOBN(0xeeb7d47f, 0xd6fa07ed), TOBN(0x6f2b9ff9, 0xedbdafb1)}, + {TOBN(0x18c51615, 0x3760fe8a), TOBN(0x7a96e6bf, 0xf06c6c13), + TOBN(0x4d7a0410, 0x0ea2d071), TOBN(0xa1914e9b, 0x0be2a5ce)}}, + {{TOBN(0x5726e357, 0xd8a3c5cf), TOBN(0x1197ecc3, 0x2abb2b13), + TOBN(0x6c0d7f7f, 0x31ae88dd), TOBN(0x15b20d1a, 0xfdbb3efe)}, + {TOBN(0xcd06aa26, 0x70584039), TOBN(0x2277c969, 0xa7dc9747), + TOBN(0xbca69587, 0x7855d815), TOBN(0x899ea238, 0x5188b32a)}}, + {{TOBN(0x37d9228b, 0x760c1c9d), TOBN(0xc7efbb11, 0x9b5c18da), + TOBN(0x7f0d1bc8, 0x19f6dbc5), TOBN(0x4875384b, 0x07e6905b)}, + {TOBN(0xc7c50baa, 0x3ba8cd86), TOBN(0xb0ce40fb, 0xc2905de0), + TOBN(0x70840673, 0x7a231952), TOBN(0xa912a262, 0xcf43de26)}}, + {{TOBN(0x9c38ddcc, 0xeb5b76c1), TOBN(0x746f5285, 0x26fc0ab4), + TOBN(0x52a63a50, 0xd62c269f), TOBN(0x60049c55, 0x99458621)}, + {TOBN(0xe7f48f82, 0x3c2f7c9e), TOBN(0x6bd99043, 0x917d5cf3), + TOBN(0xeb1317a8, 0x8701f469), TOBN(0xbd3fe2ed, 0x9a449fe0)}}, + {{TOBN(0x421e79ca, 0x12ef3d36), TOBN(0x9ee3c36c, 0x3e7ea5de), + TOBN(0xe48198b5, 0xcdff36f7), TOBN(0xaff4f967, 0xc6b82228)}, + {TOBN(0x15e19dd0, 0xc47adb7e), TOBN(0x45699b23, 0x032e7dfa), + TOBN(0x40680c8b, 0x1fae026a), TOBN(0x5a347a48, 0x550dbf4d)}}, + {{TOBN(0xe652533b, 0x3cef0d7d), TOBN(0xd94f7b18, 0x2bbb4381), + TOBN(0x838752be, 0x0e80f500), TOBN(0x8e6e2488, 0x9e9c9bfb)}, + {TOBN(0xc9751697, 0x16caca6a), TOBN(0x866c49d8, 0x38531ad9), + TOBN(0xc917e239, 0x7151ade1), TOBN(0x2d016ec1, 0x6037c407)}}, + {{TOBN(0xa407ccc9, 0x00eac3f9), TOBN(0x835f6280, 0xe2ed4748), + TOBN(0xcc54c347, 0x1cc98e0d), TOBN(0x0e969937, 0xdcb572eb)}, + {TOBN(0x1b16c8e8, 0x8f30c9cb), TOBN(0xa606ae75, 0x373c4661), + TOBN(0x47aa689b, 0x35502cab), TOBN(0xf89014ae, 0x4d9bb64f)}}, + {{TOBN(0x202f6a9c, 0x31c71f7b), TOBN(0x01f95aa3, 0x296ffe5c), + TOBN(0x5fc06014, 0x53cec3a3), TOBN(0xeb991237, 0x5f498a45)}, + {TOBN(0xae9a935e, 0x5d91ba87), TOBN(0xc6ac6281, 0x0b564a19), + TOBN(0x8a8fe81c, 0x3bd44e69), TOBN(0x7c8b467f, 0x9dd11d45)}}, + {{TOBN(0xf772251f, 0xea5b8e69), TOBN(0xaeecb3bd, 0xc5b75fbc), + TOBN(0x1aca3331, 0x887ff0e5), TOBN(0xbe5d49ff, 0x19f0a131)}, + {TOBN(0x582c13aa, 0xe5c8646f), TOBN(0xdbaa12e8, 0x20e19980), + TOBN(0x8f40f31a, 0xf7abbd94), TOBN(0x1f13f5a8, 0x1dfc7663)}}, + {{TOBN(0x5d81f1ee, 0xaceb4fc0), TOBN(0x36256002, 0x5e6f0f42), + TOBN(0x4b67d6d7, 0x751370c8), TOBN(0x2608b698, 0x03e80589)}, + {TOBN(0xcfc0d2fc, 0x05268301), TOBN(0xa6943d39, 0x40309212), + TOBN(0x192a90c2, 0x1fd0e1c2), TOBN(0xb209f113, 0x37f1dc76)}}, + {{TOBN(0xefcc5e06, 0x97bf1298), TOBN(0xcbdb6730, 0x219d639e), + TOBN(0xd009c116, 0xb81e8c6f), TOBN(0xa3ffdde3, 0x1a7ce2e5)}, + {TOBN(0xc53fbaaa, 0xa914d3ba), TOBN(0x836d500f, 0x88df85ee), + TOBN(0xd98dc71b, 0x66ee0751), TOBN(0x5a3d7005, 0x714516fd)}}, + {{TOBN(0x21d3634d, 0x39eedbba), TOBN(0x35cd2e68, 0x0455a46d), 
+ TOBN(0xc8cafe65, 0xf9d7eb0c), TOBN(0xbda3ce9e, 0x00cefb3e)}, + {TOBN(0xddc17a60, 0x2c9cf7a4), TOBN(0x01572ee4, 0x7bcb8773), + TOBN(0xa92b2b01, 0x8c7548df), TOBN(0x732fd309, 0xa84600e3)}}, + {{TOBN(0xe22109c7, 0x16543a40), TOBN(0x9acafd36, 0xfede3c6c), + TOBN(0xfb206852, 0x6824e614), TOBN(0x2a4544a9, 0xda25dca0)}, + {TOBN(0x25985262, 0x91d60b06), TOBN(0x281b7be9, 0x28753545), + TOBN(0xec667b1a, 0x90f13b27), TOBN(0x33a83aff, 0x940e2eb4)}}, + {{TOBN(0x80009862, 0xd5d721d5), TOBN(0x0c3357a3, 0x5bd3a182), + TOBN(0x27f3a83b, 0x7aa2cda4), TOBN(0xb58ae74e, 0xf6f83085)}, + {TOBN(0x2a911a81, 0x2e6dad6b), TOBN(0xde286051, 0xf43d6c5b), + TOBN(0x4bdccc41, 0xf996c4d8), TOBN(0xe7312ec0, 0x0ae1e24e)}}}, + {{{TOBN(0xf8d112e7, 0x6e6485b3), TOBN(0x4d3e24db, 0x771c52f8), + TOBN(0x48e3ee41, 0x684a2f6d), TOBN(0x7161957d, 0x21d95551)}, + {TOBN(0x19631283, 0xcdb12a6c), TOBN(0xbf3fa882, 0x2e50e164), + TOBN(0xf6254b63, 0x3166cc73), TOBN(0x3aefa7ae, 0xaee8cc38)}}, + {{TOBN(0x79b0fe62, 0x3b36f9fd), TOBN(0x26543b23, 0xfde19fc0), + TOBN(0x136e64a0, 0x958482ef), TOBN(0x23f63771, 0x9b095825)}, + {TOBN(0x14cfd596, 0xb6a1142e), TOBN(0x5ea6aac6, 0x335aac0b), + TOBN(0x86a0e8bd, 0xf3081dd5), TOBN(0x5fb89d79, 0x003dc12a)}}, + {{TOBN(0xf615c33a, 0xf72e34d4), TOBN(0x0bd9ea40, 0x110eec35), + TOBN(0x1c12bc5b, 0xc1dea34e), TOBN(0x686584c9, 0x49ae4699)}, + {TOBN(0x13ad95d3, 0x8c97b942), TOBN(0x4609561a, 0x4e5c7562), + TOBN(0x9e94a4ae, 0xf2737f89), TOBN(0xf57594c6, 0x371c78b6)}}, + {{TOBN(0x0f0165fc, 0xe3779ee3), TOBN(0xe00e7f9d, 0xbd495d9e), + TOBN(0x1fa4efa2, 0x20284e7a), TOBN(0x4564bade, 0x47ac6219)}, + {TOBN(0x90e6312a, 0xc4708e8e), TOBN(0x4f5725fb, 0xa71e9adf), + TOBN(0xe95f55ae, 0x3d684b9f), TOBN(0x47f7ccb1, 0x1e94b415)}}, + {{TOBN(0x7322851b, 0x8d946581), TOBN(0xf0d13133, 0xbdf4a012), + TOBN(0xa3510f69, 0x6584dae0), TOBN(0x03a7c171, 0x3c9f6c6d)}, + {TOBN(0x5be97f38, 0xe475381a), TOBN(0xca1ba422, 0x85823334), + TOBN(0xf83cc5c7, 0x0be17dda), TOBN(0x158b1494, 0x0b918c0f)}}, + {{TOBN(0xda3a77e5, 0x522e6b69), TOBN(0x69c908c3, 0xbbcd6c18), + TOBN(0x1f1b9e48, 0xd924fd56), TOBN(0x37c64e36, 0xaa4bb3f7)}, + {TOBN(0x5a4fdbdf, 0xee478d7d), TOBN(0xba75c8bc, 0x0193f7a0), + TOBN(0x84bc1e84, 0x56cd16df), TOBN(0x1fb08f08, 0x46fad151)}}, + {{TOBN(0x8a7cabf9, 0x842e9f30), TOBN(0xa331d4bf, 0x5eab83af), + TOBN(0xd272cfba, 0x017f2a6a), TOBN(0x27560abc, 0x83aba0e3)}, + {TOBN(0x94b83387, 0x0e3a6b75), TOBN(0x25c6aea2, 0x6b9f50f5), + TOBN(0x803d691d, 0xb5fdf6d0), TOBN(0x03b77509, 0xe6333514)}}, + {{TOBN(0x36178903, 0x61a341c1), TOBN(0x3604dc60, 0x0cfd6142), + TOBN(0x022295eb, 0x8533316c), TOBN(0x3dbde4ac, 0x44af2922)}, + {TOBN(0x898afc5d, 0x1c7eef69), TOBN(0x58896805, 0xd14f4fa1), + TOBN(0x05002160, 0x203c21ca), TOBN(0x6f0d1f30, 0x40ef730b)}}, + {{TOBN(0x8e8c44d4, 0x196224f8), TOBN(0x75a4ab95, 0x374d079d), + TOBN(0x79085ecc, 0x7d48f123), TOBN(0x56f04d31, 0x1bf65ad8)}, + {TOBN(0xe220bf1c, 0xbda602b2), TOBN(0x73ee1742, 0xf9612c69), + TOBN(0x76008fc8, 0x084fd06b), TOBN(0x4000ef9f, 0xf11380d1)}}, + {{TOBN(0x48201b4b, 0x12cfe297), TOBN(0x3eee129c, 0x292f74e5), + TOBN(0xe1fe114e, 0xc9e874e8), TOBN(0x899b055c, 0x92c5fc41)}, + {TOBN(0x4e477a64, 0x3a39c8cf), TOBN(0x82f09efe, 0x78963cc9), + TOBN(0x6fd3fd8f, 0xd333f863), TOBN(0x85132b2a, 0xdc949c63)}}, + {{TOBN(0x7e06a3ab, 0x516eb17b), TOBN(0x73bec06f, 0xd2c7372b), + TOBN(0xe4f74f55, 0xba896da6), TOBN(0xbb4afef8, 0x8e9eb40f)}, + {TOBN(0x2d75bec8, 0xe61d66b0), TOBN(0x02bda4b4, 0xef29300b), + TOBN(0x8bbaa8de, 0x026baa5a), TOBN(0xff54befd, 0xa07f4440)}}, + {{TOBN(0xbd9b8b1d, 0xbe7a2af3), TOBN(0xec51caa9, 
0x4fb74a72), + TOBN(0xb9937a4b, 0x63879697), TOBN(0x7c9a9d20, 0xec2687d5)}, + {TOBN(0x1773e44f, 0x6ef5f014), TOBN(0x8abcf412, 0xe90c6900), + TOBN(0x387bd022, 0x8142161e), TOBN(0x50393755, 0xfcb6ff2a)}}, + {{TOBN(0x9813fd56, 0xed6def63), TOBN(0x53cf6482, 0x7d53106c), + TOBN(0x991a35bd, 0x431f7ac1), TOBN(0xf1e274dd, 0x63e65faf)}, + {TOBN(0xf63ffa3c, 0x44cc7880), TOBN(0x411a426b, 0x7c256981), + TOBN(0xb698b9fd, 0x93a420e0), TOBN(0x89fdddc0, 0xae53f8fe)}}, + {{TOBN(0x766e0722, 0x32398baa), TOBN(0x205fee42, 0x5cfca031), + TOBN(0xa49f5341, 0x7a029cf2), TOBN(0xa88c68b8, 0x4023890d)}, + {TOBN(0xbc275041, 0x7337aaa8), TOBN(0x9ed364ad, 0x0eb384f4), + TOBN(0xe0816f85, 0x29aba92f), TOBN(0x2e9e1941, 0x04e38a88)}}, + {{TOBN(0x57eef44a, 0x3dafd2d5), TOBN(0x35d1fae5, 0x97ed98d8), + TOBN(0x50628c09, 0x2307f9b1), TOBN(0x09d84aae, 0xd6cba5c6)}, + {TOBN(0x67071bc7, 0x88aaa691), TOBN(0x2dea57a9, 0xafe6cb03), + TOBN(0xdfe11bb4, 0x3d78ac01), TOBN(0x7286418c, 0x7fd7aa51)}}, + {{TOBN(0xfabf7709, 0x77f7195a), TOBN(0x8ec86167, 0xadeb838f), + TOBN(0xea1285a8, 0xbb4f012d), TOBN(0xd6883503, 0x9a3eab3f)}, + {TOBN(0xee5d24f8, 0x309004c2), TOBN(0xa96e4b76, 0x13ffe95e), + TOBN(0x0cdffe12, 0xbd223ea4), TOBN(0x8f5c2ee5, 0xb6739a53)}}, + {{TOBN(0x5cb4aaa5, 0xdd968198), TOBN(0xfa131c52, 0x72413a6c), + TOBN(0x53d46a90, 0x9536d903), TOBN(0xb270f0d3, 0x48606d8e)}, + {TOBN(0x518c7564, 0xa053a3bc), TOBN(0x088254b7, 0x1a86caef), + TOBN(0xb3ba8cb4, 0x0ab5efd0), TOBN(0x5c59900e, 0x4605945d)}}, + {{TOBN(0xecace1dd, 0xa1887395), TOBN(0x40960f36, 0x932a65de), + TOBN(0x9611ff5c, 0x3aa95529), TOBN(0xc58215b0, 0x7c1e5a36)}, + {TOBN(0xd48c9b58, 0xf0e1a524), TOBN(0xb406856b, 0xf590dfb8), + TOBN(0xc7605e04, 0x9cd95662), TOBN(0x0dd036ee, 0xa33ecf82)}}, + {{TOBN(0xa50171ac, 0xc33156b3), TOBN(0xf09d24ea, 0x4a80172e), + TOBN(0x4e1f72c6, 0x76dc8eef), TOBN(0xe60caadc, 0x5e3d44ee)}, + {TOBN(0x006ef8a6, 0x979b1d8f), TOBN(0x60908a1c, 0x97788d26), + TOBN(0x6e08f95b, 0x266feec0), TOBN(0x618427c2, 0x22e8c94e)}}, + {{TOBN(0x3d613339, 0x59145a65), TOBN(0xcd9bc368, 0xfa406337), + TOBN(0x82d11be3, 0x2d8a52a0), TOBN(0xf6877b27, 0x97a1c590)}, + {TOBN(0x837a819b, 0xf5cbdb25), TOBN(0x2a4fd1d8, 0xde090249), + TOBN(0x622a7de7, 0x74990e5f), TOBN(0x840fa5a0, 0x7945511b)}}, + {{TOBN(0x30b974be, 0x6558842d), TOBN(0x70df8c64, 0x17f3d0a6), + TOBN(0x7c803520, 0x7542e46d), TOBN(0x7251fe7f, 0xe4ecc823)}, + {TOBN(0xe59134cb, 0x5e9aac9a), TOBN(0x11bb0934, 0xf0045d71), + TOBN(0x53e5d9b5, 0xdbcb1d4e), TOBN(0x8d97a905, 0x92defc91)}}, + {{TOBN(0xfe289327, 0x7946d3f9), TOBN(0xe132bd24, 0x07472273), + TOBN(0xeeeb510c, 0x1eb6ae86), TOBN(0x777708c5, 0xf0595067)}, + {TOBN(0x18e2c8cd, 0x1297029e), TOBN(0x2c61095c, 0xbbf9305e), + TOBN(0xe466c258, 0x6b85d6d9), TOBN(0x8ac06c36, 0xda1ea530)}}, + {{TOBN(0xa365dc39, 0xa1304668), TOBN(0xe4a9c885, 0x07f89606), + TOBN(0x65a4898f, 0xacc7228d), TOBN(0x3e2347ff, 0x84ca8303)}, + {TOBN(0xa5f6fb77, 0xea7d23a3), TOBN(0x2fac257d, 0x672a71cd), + TOBN(0x6908bef8, 0x7e6a44d3), TOBN(0x8ff87566, 0x891d3d7a)}}, + {{TOBN(0xe58e90b3, 0x6b0cf82e), TOBN(0x6438d246, 0x2615b5e7), + TOBN(0x07b1f8fc, 0x669c145a), TOBN(0xb0d8b2da, 0x36f1e1cb)}, + {TOBN(0x54d5dadb, 0xd9184c4d), TOBN(0x3dbb18d5, 0xf93d9976), + TOBN(0x0a3e0f56, 0xd1147d47), TOBN(0x2afa8c8d, 0xa0a48609)}}, + {{TOBN(0x275353e8, 0xbc36742c), TOBN(0x898f427e, 0xeea0ed90), + TOBN(0x26f4947e, 0x3e477b00), TOBN(0x8ad8848a, 0x308741e3)}, + {TOBN(0x6c703c38, 0xd74a2a46), TOBN(0x5e3e05a9, 0x9ba17ba2), + TOBN(0xc1fa6f66, 0x4ab9a9e4), TOBN(0x474a2d9a, 0x3841d6ec)}}, + {{TOBN(0x871239ad, 0x653ae326), 
TOBN(0x14bcf72a, 0xa74cbb43), + TOBN(0x8737650e, 0x20d4c083), TOBN(0x3df86536, 0x110ed4af)}, + {TOBN(0xd2d86fe7, 0xb53ca555), TOBN(0x688cb00d, 0xabd5d538), + TOBN(0xcf81bda3, 0x1ad38468), TOBN(0x7ccfe3cc, 0xf01167b6)}}, + {{TOBN(0xcf4f47e0, 0x6c4c1fe6), TOBN(0x557e1f1a, 0x298bbb79), + TOBN(0xf93b974f, 0x30d45a14), TOBN(0x174a1d2d, 0x0baf97c4)}, + {TOBN(0x7a003b30, 0xc51fbf53), TOBN(0xd8940991, 0xee68b225), + TOBN(0x5b0aa7b7, 0x1c0f4173), TOBN(0x975797c9, 0xa20a7153)}}, + {{TOBN(0x26e08c07, 0xe3533d77), TOBN(0xd7222e6a, 0x2e341c99), + TOBN(0x9d60ec3d, 0x8d2dc4ed), TOBN(0xbdfe0d8f, 0x7c476cf8)}, + {TOBN(0x1fe59ab6, 0x1d056605), TOBN(0xa9ea9df6, 0x86a8551f), + TOBN(0x8489941e, 0x47fb8d8c), TOBN(0xfeb874eb, 0x4a7f1b10)}}, + {{TOBN(0xfe5fea86, 0x7ee0d98f), TOBN(0x201ad34b, 0xdbf61864), + TOBN(0x45d8fe47, 0x37c031d4), TOBN(0xd5f49fae, 0x795f0822)}, + {TOBN(0xdb0fb291, 0xc7f4a40c), TOBN(0x2e69d9c1, 0x730ddd92), + TOBN(0x754e1054, 0x49d76987), TOBN(0x8a24911d, 0x7662db87)}}, + {{TOBN(0x61fc1810, 0x60a71676), TOBN(0xe852d1a8, 0xf66a8ad1), + TOBN(0x172bbd65, 0x6417231e), TOBN(0x0d6de7bd, 0x3babb11f)}, + {TOBN(0x6fde6f88, 0xc8e347f8), TOBN(0x1c587547, 0x9bd99cc3), + TOBN(0x78e54ed0, 0x34076950), TOBN(0x97f0f334, 0x796e83ba)}}, + {{TOBN(0xe4dbe1ce, 0x4924867a), TOBN(0xbd5f51b0, 0x60b84917), + TOBN(0x37530040, 0x3cb09a79), TOBN(0xdb3fe0f8, 0xff1743d8)}, + {TOBN(0xed7894d8, 0x556fa9db), TOBN(0xfa262169, 0x23412fbf), + TOBN(0x563be0db, 0xba7b9291), TOBN(0x6ca8b8c0, 0x0c9fb234)}}, + {{TOBN(0xed406aa9, 0xbd763802), TOBN(0xc21486a0, 0x65303da1), + TOBN(0x61ae291e, 0xc7e62ec4), TOBN(0x622a0492, 0xdf99333e)}, + {TOBN(0x7fd80c9d, 0xbb7a8ee0), TOBN(0xdc2ed3bc, 0x6c01aedb), + TOBN(0x35c35a12, 0x08be74ec), TOBN(0xd540cb1a, 0x469f671f)}}, + {{TOBN(0xd16ced4e, 0xcf84f6c7), TOBN(0x8561fb9c, 0x2d090f43), + TOBN(0x7e693d79, 0x6f239db4), TOBN(0xa736f928, 0x77bd0d94)}, + {TOBN(0x07b4d929, 0x2c1950ee), TOBN(0xda177543, 0x56dc11b3), + TOBN(0xa5dfbbaa, 0x7a6a878e), TOBN(0x1c70cb29, 0x4decb08a)}}, + {{TOBN(0xfba28c8b, 0x6f0f7c50), TOBN(0xa8eba2b8, 0x854dcc6d), + TOBN(0x5ff8e89a, 0x36b78642), TOBN(0x070c1c8e, 0xf6873adf)}, + {TOBN(0xbbd3c371, 0x6484d2e4), TOBN(0xfb78318f, 0x0d414129), + TOBN(0x2621a39c, 0x6ad93b0b), TOBN(0x979d74c2, 0xa9e917f7)}}, + {{TOBN(0xfc195647, 0x61fb0428), TOBN(0x4d78954a, 0xbee624d4), + TOBN(0xb94896e0, 0xb8ae86fd), TOBN(0x6667ac0c, 0xc91c8b13)}, + {TOBN(0x9f180512, 0x43bcf832), TOBN(0xfbadf8b7, 0xa0010137), + TOBN(0xc69b4089, 0xb3ba8aa7), TOBN(0xfac4bacd, 0xe687ce85)}}, + {{TOBN(0x9164088d, 0x977eab40), TOBN(0x51f4c5b6, 0x2760b390), + TOBN(0xd238238f, 0x340dd553), TOBN(0x358566c3, 0xdb1d31c9)}, + {TOBN(0x3a5ad69e, 0x5068f5ff), TOBN(0xf31435fc, 0xdaff6b06), + TOBN(0xae549a5b, 0xd6debff0), TOBN(0x59e5f0b7, 0x75e01331)}}, + {{TOBN(0x5d492fb8, 0x98559acf), TOBN(0x96018c2e, 0x4db79b50), + TOBN(0x55f4a48f, 0x609f66aa), TOBN(0x1943b3af, 0x4900a14f)}, + {TOBN(0xc22496df, 0x15a40d39), TOBN(0xb2a44684, 0x4c20f7c5), + TOBN(0x76a35afa, 0x3b98404c), TOBN(0xbec75725, 0xff5d1b77)}}, + {{TOBN(0xb67aa163, 0xbea06444), TOBN(0x27e95bb2, 0xf724b6f2), + TOBN(0x3c20e3e9, 0xd238c8ab), TOBN(0x1213754e, 0xddd6ae17)}, + {TOBN(0x8c431020, 0x716e0f74), TOBN(0x6679c82e, 0xffc095c2), + TOBN(0x2eb3adf4, 0xd0ac2932), TOBN(0x2cc970d3, 0x01bb7a76)}}, + {{TOBN(0x70c71f2f, 0x740f0e66), TOBN(0x545c616b, 0x2b6b23cc), + TOBN(0x4528cfcb, 0xb40a8bd7), TOBN(0xff839633, 0x2ab27722)}, + {TOBN(0x049127d9, 0x025ac99a), TOBN(0xd314d4a0, 0x2b63e33b), + TOBN(0xc8c310e7, 0x28d84519), TOBN(0x0fcb8983, 0xb3bc84ba)}}, + {{TOBN(0x2cc52261, 
0x38634818), TOBN(0x501814f4, 0xb44c2e0b), + TOBN(0xf7e181aa, 0x54dfdba3), TOBN(0xcfd58ff0, 0xe759718c)}, + {TOBN(0xf90cdb14, 0xd3b507a8), TOBN(0x57bd478e, 0xc50bdad8), + TOBN(0x29c197e2, 0x50e5f9aa), TOBN(0x4db6eef8, 0xe40bc855)}}, + {{TOBN(0x2cc8f21a, 0xd1fc0654), TOBN(0xc71cc963, 0x81269d73), + TOBN(0xecfbb204, 0x077f49f9), TOBN(0xdde92571, 0xca56b793)}, + {TOBN(0x9abed6a3, 0xf97ad8f7), TOBN(0xe6c19d3f, 0x924de3bd), + TOBN(0x8dce92f4, 0xa140a800), TOBN(0x85f44d1e, 0x1337af07)}}, + {{TOBN(0x5953c08b, 0x09d64c52), TOBN(0xa1b5e49f, 0xf5df9749), + TOBN(0x336a8fb8, 0x52735f7d), TOBN(0xb332b6db, 0x9add676b)}, + {TOBN(0x558b88a0, 0xb4511aa4), TOBN(0x09788752, 0xdbd5cc55), + TOBN(0x16b43b9c, 0xd8cd52bd), TOBN(0x7f0bc5a0, 0xc2a2696b)}}, + {{TOBN(0x146e12d4, 0xc11f61ef), TOBN(0x9ce10754, 0x3a83e79e), + TOBN(0x08ec73d9, 0x6cbfca15), TOBN(0x09ff29ad, 0x5b49653f)}, + {TOBN(0xe31b72bd, 0xe7da946e), TOBN(0xebf9eb3b, 0xee80a4f2), + TOBN(0xd1aabd08, 0x17598ce4), TOBN(0x18b5fef4, 0x53f37e80)}}, + {{TOBN(0xd5d5cdd3, 0x5958cd79), TOBN(0x3580a1b5, 0x1d373114), + TOBN(0xa36e4c91, 0xfa935726), TOBN(0xa38c534d, 0xef20d760)}, + {TOBN(0x7088e40a, 0x2ff5845b), TOBN(0xe5bb40bd, 0xbd78177f), + TOBN(0x4f06a7a8, 0x857f9920), TOBN(0xe3cc3e50, 0xe968f05d)}}, + {{TOBN(0x1d68b7fe, 0xe5682d26), TOBN(0x5206f76f, 0xaec7f87c), + TOBN(0x41110530, 0x041951ab), TOBN(0x58ec52c1, 0xd4b5a71a)}, + {TOBN(0xf3488f99, 0x0f75cf9a), TOBN(0xf411951f, 0xba82d0d5), + TOBN(0x27ee75be, 0x618895ab), TOBN(0xeae060d4, 0x6d8aab14)}}, + {{TOBN(0x9ae1df73, 0x7fb54dc2), TOBN(0x1f3e391b, 0x25963649), + TOBN(0x242ec32a, 0xfe055081), TOBN(0x5bd450ef, 0x8491c9bd)}, + {TOBN(0x367efc67, 0x981eb389), TOBN(0xed7e1928, 0x3a0550d5), + TOBN(0x362e776b, 0xab3ce75c), TOBN(0xe890e308, 0x1f24c523)}}, + {{TOBN(0xb961b682, 0xfeccef76), TOBN(0x8b8e11f5, 0x8bba6d92), + TOBN(0x8f2ccc4c, 0x2b2375c4), TOBN(0x0d7f7a52, 0xe2f86cfa)}, + {TOBN(0xfd94d30a, 0x9efe5633), TOBN(0x2d8d246b, 0x5451f934), + TOBN(0x2234c6e3, 0x244e6a00), TOBN(0xde2b5b0d, 0xddec8c50)}}, + {{TOBN(0x2ce53c5a, 0xbf776f5b), TOBN(0x6f724071, 0x60357b05), + TOBN(0xb2593717, 0x71bf3f7a), TOBN(0x87d2501c, 0x440c4a9f)}, + {TOBN(0x440552e1, 0x87b05340), TOBN(0xb7bf7cc8, 0x21624c32), + TOBN(0x4155a6ce, 0x22facddb), TOBN(0x5a4228cb, 0x889837ef)}}, + {{TOBN(0xef87d6d6, 0xfd4fd671), TOBN(0xa233687e, 0xc2daa10e), + TOBN(0x75622244, 0x03c0eb96), TOBN(0x7632d184, 0x8bf19be6)}, + {TOBN(0x05d0f8e9, 0x40735ff4), TOBN(0x3a3e6e13, 0xc00931f1), + TOBN(0x31ccde6a, 0xdafe3f18), TOBN(0xf381366a, 0xcfe51207)}}, + {{TOBN(0x24c222a9, 0x60167d92), TOBN(0x62f9d6f8, 0x7529f18c), + TOBN(0x412397c0, 0x0353b114), TOBN(0x334d89dc, 0xef808043)}, + {TOBN(0xd9ec63ba, 0x2a4383ce), TOBN(0xcec8e937, 0x5cf92ba0), + TOBN(0xfb8b4288, 0xc8be74c0), TOBN(0x67d6912f, 0x105d4391)}}, + {{TOBN(0x7b996c46, 0x1b913149), TOBN(0x36aae2ef, 0x3a4e02da), + TOBN(0xb68aa003, 0x972de594), TOBN(0x284ec70d, 0x4ec6d545)}, + {TOBN(0xf3d2b2d0, 0x61391d54), TOBN(0x69c5d5d6, 0xfe114e92), + TOBN(0xbe0f00b5, 0xb4482dff), TOBN(0xe1596fa5, 0xf5bf33c5)}}, + {{TOBN(0x10595b56, 0x96a71cba), TOBN(0x944938b2, 0xfdcadeb7), + TOBN(0xa282da4c, 0xfccd8471), TOBN(0x98ec05f3, 0x0d37bfe1)}, + {TOBN(0xe171ce1b, 0x0698304a), TOBN(0x2d691444, 0x21bdf79b), + TOBN(0xd0cd3b74, 0x1b21dec1), TOBN(0x712ecd8b, 0x16a15f71)}}, + {{TOBN(0x8d4c00a7, 0x00fd56e1), TOBN(0x02ec9692, 0xf9527c18), + TOBN(0x21c44937, 0x4a3e42e1), TOBN(0x9176fbab, 0x1392ae0a)}, + {TOBN(0x8726f1ba, 0x44b7b618), TOBN(0xb4d7aae9, 0xf1de491c), + TOBN(0xf91df7b9, 0x07b582c0), TOBN(0x7e116c30, 0xef60aa3a)}}, + 
{{TOBN(0x99270f81, 0x466265d7), TOBN(0xb15b6fe2, 0x4df7adf0), + TOBN(0xfe33b2d3, 0xf9738f7f), TOBN(0x48553ab9, 0xd6d70f95)}, + {TOBN(0x2cc72ac8, 0xc21e94db), TOBN(0x795ac38d, 0xbdc0bbee), + TOBN(0x0a1be449, 0x2e40478f), TOBN(0x81bd3394, 0x052bde55)}}, + {{TOBN(0x63c8dbe9, 0x56b3c4f2), TOBN(0x017a99cf, 0x904177cc), + TOBN(0x947bbddb, 0x4d010fc1), TOBN(0xacf9b00b, 0xbb2c9b21)}, + {TOBN(0x2970bc8d, 0x47173611), TOBN(0x1a4cbe08, 0xac7d756f), + TOBN(0x06d9f4aa, 0x67d541a2), TOBN(0xa3e8b689, 0x59c2cf44)}}, + {{TOBN(0xaad066da, 0x4d88f1dd), TOBN(0xc604f165, 0x7ad35dea), + TOBN(0x7edc0720, 0x4478ca67), TOBN(0xa10dfae0, 0xba02ce06)}, + {TOBN(0xeceb1c76, 0xaf36f4e4), TOBN(0x994b2292, 0xaf3f8f48), + TOBN(0xbf9ed77b, 0x77c8a68c), TOBN(0x74f544ea, 0x51744c9d)}}, + {{TOBN(0x82d05bb9, 0x8113a757), TOBN(0x4ef2d2b4, 0x8a9885e4), + TOBN(0x1e332be5, 0x1aa7865f), TOBN(0x22b76b18, 0x290d1a52)}, + {TOBN(0x308a2310, 0x44351683), TOBN(0x9d861896, 0xa3f22840), + TOBN(0x5959ddcd, 0x841ed947), TOBN(0x0def0c94, 0x154b73bf)}}, + {{TOBN(0xf0105417, 0x4c7c15e0), TOBN(0x539bfb02, 0x3a277c32), + TOBN(0xe699268e, 0xf9dccf5f), TOBN(0x9f5796a5, 0x0247a3bd)}, + {TOBN(0x8b839de8, 0x4f157269), TOBN(0xc825c1e5, 0x7a30196b), + TOBN(0x6ef0aabc, 0xdc8a5a91), TOBN(0xf4a8ce6c, 0x498b7fe6)}}, + {{TOBN(0x1cce35a7, 0x70cbac78), TOBN(0x83488e9b, 0xf6b23958), + TOBN(0x0341a070, 0xd76cb011), TOBN(0xda6c9d06, 0xae1b2658)}, + {TOBN(0xb701fb30, 0xdd648c52), TOBN(0x994ca02c, 0x52fb9fd1), + TOBN(0x06933117, 0x6f563086), TOBN(0x3d2b8100, 0x17856bab)}}, + {{TOBN(0xe89f48c8, 0x5963a46e), TOBN(0x658ab875, 0xa99e61c7), + TOBN(0x6e296f87, 0x4b8517b4), TOBN(0x36c4fcdc, 0xfc1bc656)}, + {TOBN(0xde5227a1, 0xa3906def), TOBN(0x9fe95f57, 0x62418945), + TOBN(0x20c91e81, 0xfdd96cde), TOBN(0x5adbe47e, 0xda4480de)}}, + {{TOBN(0xa009370f, 0x396de2b6), TOBN(0x98583d4b, 0xf0ecc7bd), + TOBN(0xf44f6b57, 0xe51d0672), TOBN(0x03d6b078, 0x556b1984)}, + {TOBN(0x27dbdd93, 0xb0b64912), TOBN(0x9b3a3434, 0x15687b09), + TOBN(0x0dba6461, 0x51ec20a9), TOBN(0xec93db7f, 0xff28187c)}}, + {{TOBN(0x00ff8c24, 0x66e48bdd), TOBN(0x2514f2f9, 0x11ccd78e), + TOBN(0xeba11f4f, 0xe1250603), TOBN(0x8a22cd41, 0x243fa156)}, + {TOBN(0xa4e58df4, 0xb283e4c6), TOBN(0x78c29859, 0x8b39783f), + TOBN(0x5235aee2, 0xa5259809), TOBN(0xc16284b5, 0x0e0227dd)}}, + {{TOBN(0xa5f57916, 0x1338830d), TOBN(0x6d4b8a6b, 0xd2123fca), + TOBN(0x236ea68a, 0xf9c546f8), TOBN(0xc1d36873, 0xfa608d36)}, + {TOBN(0xcd76e495, 0x8d436d13), TOBN(0xd4d9c221, 0x8fb080af), + TOBN(0x665c1728, 0xe8ad3fb5), TOBN(0xcf1ebe4d, 0xb3d572e0)}}, + {{TOBN(0xa7a8746a, 0x584c5e20), TOBN(0x267e4ea1, 0xb9dc7035), + TOBN(0x593a15cf, 0xb9548c9b), TOBN(0x5e6e2135, 0x4bd012f3)}, + {TOBN(0xdf31cc6a, 0x8c8f936e), TOBN(0x8af84d04, 0xb5c241dc), + TOBN(0x63990a6f, 0x345efb86), TOBN(0x6fef4e61, 0xb9b962cb)}}}, + {{{TOBN(0xf6368f09, 0x25722608), TOBN(0x131260db, 0x131cf5c6), + TOBN(0x40eb353b, 0xfab4f7ac), TOBN(0x85c78880, 0x37eee829)}, + {TOBN(0x4c1581ff, 0xc3bdf24e), TOBN(0x5bff75cb, 0xf5c3c5a8), + TOBN(0x35e8c83f, 0xa14e6f40), TOBN(0xb81d1c0f, 0x0295e0ca)}}, + {{TOBN(0xfcde7cc8, 0xf43a730f), TOBN(0xe89b6f3c, 0x33ab590e), + TOBN(0xc823f529, 0xad03240b), TOBN(0x82b79afe, 0x98bea5db)}, + {TOBN(0x568f2856, 0x962fe5de), TOBN(0x0c590adb, 0x60c591f3), + TOBN(0x1fc74a14, 0x4a28a858), TOBN(0x3b662498, 0xb3203f4c)}}, + {{TOBN(0x91e3cf0d, 0x6c39765a), TOBN(0xa2db3acd, 0xac3cca0b), + TOBN(0x288f2f08, 0xcb953b50), TOBN(0x2414582c, 0xcf43cf1a)}, + {TOBN(0x8dec8bbc, 0x60eee9a8), TOBN(0x54c79f02, 0x729aa042), + TOBN(0xd81cd5ec, 0x6532f5d5), TOBN(0xa672303a, 
0xcf82e15f)}}, + {{TOBN(0x376aafa8, 0x719c0563), TOBN(0xcd8ad2dc, 0xbc5fc79f), + TOBN(0x303fdb9f, 0xcb750cd3), TOBN(0x14ff052f, 0x4418b08e)}, + {TOBN(0xf75084cf, 0x3e2d6520), TOBN(0x7ebdf0f8, 0x144ed509), + TOBN(0xf43bf0f2, 0xd3f25b98), TOBN(0x86ad71cf, 0xa354d837)}}, + {{TOBN(0xb827fe92, 0x26f43572), TOBN(0xdfd3ab5b, 0x5d824758), + TOBN(0x315dd23a, 0x539094c1), TOBN(0x85c0e37a, 0x66623d68)}, + {TOBN(0x575c7972, 0x7be19ae0), TOBN(0x616a3396, 0xdf0d36b5), + TOBN(0xa1ebb3c8, 0x26b1ff7e), TOBN(0x635b9485, 0x140ad453)}}, + {{TOBN(0x92bf3cda, 0xda430c0b), TOBN(0x4702850e, 0x3a96dac6), + TOBN(0xc91cf0a5, 0x15ac326a), TOBN(0x95de4f49, 0xab8c25e4)}, + {TOBN(0xb01bad09, 0xe265c17c), TOBN(0x24e45464, 0x087b3881), + TOBN(0xd43e583c, 0xe1fac5ca), TOBN(0xe17cb318, 0x6ead97a6)}}, + {{TOBN(0x6cc39243, 0x74dcec46), TOBN(0x33cfc02d, 0x54c2b73f), + TOBN(0x82917844, 0xf26cd99c), TOBN(0x8819dd95, 0xd1773f89)}, + {TOBN(0x09572aa6, 0x0871f427), TOBN(0x8e0cf365, 0xf6f01c34), + TOBN(0x7fa52988, 0xbff1f5af), TOBN(0x4eb357ea, 0xe75e8e50)}}, + {{TOBN(0xd9d0c8c4, 0x868af75d), TOBN(0xd7325cff, 0x45c8c7ea), + TOBN(0xab471996, 0xcc81ecb0), TOBN(0xff5d55f3, 0x611824ed)}, + {TOBN(0xbe314541, 0x1977a0ee), TOBN(0x5085c4c5, 0x722038c6), + TOBN(0x2d5335bf, 0xf94bb495), TOBN(0x894ad8a6, 0xc8e2a082)}}, + {{TOBN(0x5c3e2341, 0xada35438), TOBN(0xf4a9fc89, 0x049b8c4e), + TOBN(0xbeeb355a, 0x9f17cf34), TOBN(0x3f311e0e, 0x6c91fe10)}, + {TOBN(0xc2d20038, 0x92ab9891), TOBN(0x257bdcc1, 0x3e8ce9a9), + TOBN(0x1b2d9789, 0x88c53bee), TOBN(0x927ce89a, 0xcdba143a)}}, + {{TOBN(0xb0a32cca, 0x523db280), TOBN(0x5c889f8a, 0x50d43783), + TOBN(0x503e04b3, 0x4897d16f), TOBN(0x8cdb6e78, 0x08f5f2e8)}, + {TOBN(0x6ab91cf0, 0x179c8e74), TOBN(0xd8874e52, 0x48211d60), + TOBN(0xf948d4d5, 0xea851200), TOBN(0x4076d41e, 0xe6f9840a)}}, + {{TOBN(0xc20e263c, 0x47b517ea), TOBN(0x79a448fd, 0x30685e5e), + TOBN(0xe55f6f78, 0xf90631a0), TOBN(0x88a790b1, 0xa79e6346)}, + {TOBN(0x62160c7d, 0x80969fe8), TOBN(0x54f92fd4, 0x41491bb9), + TOBN(0xa6645c23, 0x5c957526), TOBN(0xf44cc5ae, 0xbea3ce7b)}}, + {{TOBN(0xf7628327, 0x8b1e68b7), TOBN(0xc731ad7a, 0x303f29d3), + TOBN(0xfe5a9ca9, 0x57d03ecb), TOBN(0x96c0d50c, 0x41bc97a7)}, + {TOBN(0xc4669fe7, 0x9b4f7f24), TOBN(0xfdd781d8, 0x3d9967ef), + TOBN(0x7892c7c3, 0x5d2c208d), TOBN(0x8bf64f7c, 0xae545cb3)}}, + {{TOBN(0xc01f862c, 0x467be912), TOBN(0xf4c85ee9, 0xc73d30cc), + TOBN(0x1fa6f4be, 0x6ab83ec7), TOBN(0xa07a3c1c, 0x4e3e3cf9)}, + {TOBN(0x87f8ef45, 0x0c00beb3), TOBN(0x30e2c2b3, 0x000d4c3e), + TOBN(0x1aa00b94, 0xfe08bf5b), TOBN(0x32c133aa, 0x9224ef52)}}, + {{TOBN(0x38df16bb, 0x32e5685d), TOBN(0x68a9e069, 0x58e6f544), + TOBN(0x495aaff7, 0xcdc5ebc6), TOBN(0xf894a645, 0x378b135f)}, + {TOBN(0xf316350a, 0x09e27ecf), TOBN(0xeced201e, 0x58f7179d), + TOBN(0x2eec273c, 0xe97861ba), TOBN(0x47ec2cae, 0xd693be2e)}}, + {{TOBN(0xfa4c97c4, 0xf68367ce), TOBN(0xe4f47d0b, 0xbe5a5755), + TOBN(0x17de815d, 0xb298a979), TOBN(0xd7eca659, 0xc177dc7d)}, + {TOBN(0x20fdbb71, 0x49ded0a3), TOBN(0x4cb2aad4, 0xfb34d3c5), + TOBN(0x2cf31d28, 0x60858a33), TOBN(0x3b6873ef, 0xa24aa40f)}}, + {{TOBN(0x540234b2, 0x2c11bb37), TOBN(0x2d0366dd, 0xed4c74a3), + TOBN(0xf9a968da, 0xeec5f25d), TOBN(0x36601068, 0x67b63142)}, + {TOBN(0x07cd6d2c, 0x68d7b6d4), TOBN(0xa8f74f09, 0x0c842942), + TOBN(0xe2751404, 0x7768b1ee), TOBN(0x4b5f7e89, 0xfe62aee4)}}, + {{TOBN(0xc6a77177, 0x89070d26), TOBN(0xa1f28e4e, 0xdd1c8bc7), + TOBN(0xea5f4f06, 0x469e1f17), TOBN(0x78fc242a, 0xfbdb78e0)}, + {TOBN(0xc9c7c592, 0x8b0588f1), TOBN(0xb6b7a0fd, 0x1535921e), + TOBN(0xcc5bdb91, 0xbde5ae35), 
TOBN(0xb42c485e, 0x12ff1864)}}, + {{TOBN(0xa1113e13, 0xdbab98aa), TOBN(0xde9d469b, 0xa17b1024), + TOBN(0x23f48b37, 0xc0462d3a), TOBN(0x3752e537, 0x7c5c078d)}, + {TOBN(0xe3a86add, 0x15544eb9), TOBN(0xf013aea7, 0x80fba279), + TOBN(0x8b5bb76c, 0xf22001b5), TOBN(0xe617ba14, 0xf02891ab)}}, + {{TOBN(0xd39182a6, 0x936219d3), TOBN(0x5ce1f194, 0xae51cb19), + TOBN(0xc78f8598, 0xbf07a74c), TOBN(0x6d7158f2, 0x22cbf1bc)}, + {TOBN(0x3b846b21, 0xe300ce18), TOBN(0x35fba630, 0x2d11275d), + TOBN(0x5fe25c36, 0xa0239b9b), TOBN(0xd8beb35d, 0xdf05d940)}}, + {{TOBN(0x4db02bb0, 0x1f7e320d), TOBN(0x0641c364, 0x6da320ea), + TOBN(0x6d95fa5d, 0x821389a3), TOBN(0x92699748, 0x8fcd8e3d)}, + {TOBN(0x316fef17, 0xceb6c143), TOBN(0x67fcb841, 0xd933762b), + TOBN(0xbb837e35, 0x118b17f8), TOBN(0x4b92552f, 0x9fd24821)}}, + {{TOBN(0xae6bc70e, 0x46aca793), TOBN(0x1cf0b0e4, 0xe579311b), + TOBN(0x8dc631be, 0x5802f716), TOBN(0x099bdc6f, 0xbddbee4d)}, + {TOBN(0xcc352bb2, 0x0caf8b05), TOBN(0xf74d505a, 0x72d63df2), + TOBN(0xb9876d4b, 0x91c4f408), TOBN(0x1ce18473, 0x9e229b2d)}}, + {{TOBN(0x49507597, 0x83abdb4a), TOBN(0x850fbcb6, 0xdee84b18), + TOBN(0x6325236e, 0x609e67dc), TOBN(0x04d831d9, 0x9336c6d8)}, + {TOBN(0x8deaae3b, 0xfa12d45d), TOBN(0xe425f8ce, 0x4746e246), + TOBN(0x8004c175, 0x24f5f31e), TOBN(0xaca16d8f, 0xad62c3b7)}}, + {{TOBN(0x0dc15a6a, 0x9152f934), TOBN(0xf1235e5d, 0xed0e12c1), + TOBN(0xc33c06ec, 0xda477dac), TOBN(0x76be8732, 0xb2ea0006)}, + {TOBN(0xcf3f7831, 0x0c0cd313), TOBN(0x3c524553, 0xa614260d), + TOBN(0x31a756f8, 0xcab22d15), TOBN(0x03ee10d1, 0x77827a20)}}, + {{TOBN(0xd1e059b2, 0x1994ef20), TOBN(0x2a653b69, 0x638ae318), + TOBN(0x70d5eb58, 0x2f699010), TOBN(0x279739f7, 0x09f5f84a)}, + {TOBN(0x5da4663c, 0x8b799336), TOBN(0xfdfdf14d, 0x203c37eb), + TOBN(0x32d8a9dc, 0xa1dbfb2d), TOBN(0xab40cff0, 0x77d48f9b)}}, + {{TOBN(0xc018b383, 0xd20b42d5), TOBN(0xf9a810ef, 0x9f78845f), + TOBN(0x40af3753, 0xbdba9df0), TOBN(0xb90bdcfc, 0x131dfdf9)}, + {TOBN(0x18720591, 0xf01ab782), TOBN(0xc823f211, 0x6af12a88), + TOBN(0xa51b80f3, 0x0dc14401), TOBN(0xde248f77, 0xfb2dfbe3)}}, + {{TOBN(0xef5a44e5, 0x0cafe751), TOBN(0x73997c9c, 0xd4dcd221), + TOBN(0x32fd86d1, 0xde854024), TOBN(0xd5b53adc, 0xa09b84bb)}, + {TOBN(0x008d7a11, 0xdcedd8d1), TOBN(0x406bd1c8, 0x74b32c84), + TOBN(0x5d4472ff, 0x05dde8b1), TOBN(0x2e25f2cd, 0xfce2b32f)}}, + {{TOBN(0xbec0dd5e, 0x29dfc254), TOBN(0x4455fcf6, 0x2b98b267), + TOBN(0x0b4d43a5, 0xc72df2ad), TOBN(0xea70e6be, 0x48a75397)}, + {TOBN(0x2aad6169, 0x5820f3bf), TOBN(0xf410d2dd, 0x9e37f68f), + TOBN(0x70fb7dba, 0x7be5ac83), TOBN(0x636bb645, 0x36ec3eec)}}, + {{TOBN(0x27104ea3, 0x9754e21c), TOBN(0xbc87a3e6, 0x8d63c373), + TOBN(0x483351d7, 0x4109db9a), TOBN(0x0fa724e3, 0x60134da7)}, + {TOBN(0x9ff44c29, 0xb0720b16), TOBN(0x2dd0cf13, 0x06aceead), + TOBN(0x5942758c, 0xe26929a6), TOBN(0x96c5db92, 0xb766a92b)}}, + {{TOBN(0xcec7d4c0, 0x5f18395e), TOBN(0xd3f22744, 0x1f80d032), + TOBN(0x7a68b37a, 0xcb86075b), TOBN(0x074764dd, 0xafef92db)}, + {TOBN(0xded1e950, 0x7bc7f389), TOBN(0xc580c850, 0xb9756460), + TOBN(0xaeeec2a4, 0x7da48157), TOBN(0x3f0b4e7f, 0x82c587b3)}}, + {{TOBN(0x231c6de8, 0xa9f19c53), TOBN(0x5717bd73, 0x6974e34e), + TOBN(0xd9e1d216, 0xf1508fa9), TOBN(0x9f112361, 0xdadaa124)}, + {TOBN(0x80145e31, 0x823b7348), TOBN(0x4dd8f0d5, 0xac634069), + TOBN(0xe3d82fc7, 0x2297c258), TOBN(0x276fcfee, 0x9cee7431)}}, + {{TOBN(0x8eb61b5e, 0x2bc0aea9), TOBN(0x4f668fd5, 0xde329431), + TOBN(0x03a32ab1, 0x38e4b87e), TOBN(0xe1374517, 0x73d0ef0b)}, + {TOBN(0x1a46f7e6, 0x853ac983), TOBN(0xc3bdf42e, 0x68e78a57), + TOBN(0xacf20785, 
0x2ea96dd1), TOBN(0xa10649b9, 0xf1638460)}}, + {{TOBN(0xf2369f0b, 0x879fbbed), TOBN(0x0ff0ae86, 0xda9d1869), + TOBN(0x5251d759, 0x56766f45), TOBN(0x4984d8c0, 0x2be8d0fc)}, + {TOBN(0x7ecc95a6, 0xd21008f0), TOBN(0x29bd54a0, 0x3a1a1c49), + TOBN(0xab9828c5, 0xd26c50f3), TOBN(0x32c0087c, 0x51d0d251)}}, + {{TOBN(0x9bac3ce6, 0x0c1cdb26), TOBN(0xcd94d947, 0x557ca205), + TOBN(0x1b1bd598, 0x9db1fdcd), TOBN(0x0eda0108, 0xa3d8b149)}, + {TOBN(0x95066610, 0x56152fcc), TOBN(0xc2f037e6, 0xe7192b33), + TOBN(0xdeffb41a, 0xc92e05a4), TOBN(0x1105f6c2, 0xc2f6c62e)}}, + {{TOBN(0x68e73500, 0x8733913c), TOBN(0xcce86163, 0x3f3adc40), + TOBN(0xf407a942, 0x38a278e9), TOBN(0xd13c1b9d, 0x2ab21292)}, + {TOBN(0x93ed7ec7, 0x1c74cf5c), TOBN(0x8887dc48, 0xf1a4c1b4), + TOBN(0x3830ff30, 0x4b3a11f1), TOBN(0x358c5a3c, 0x58937cb6)}}, + {{TOBN(0x027dc404, 0x89022829), TOBN(0x40e93977, 0x3b798f79), + TOBN(0x90ad3337, 0x38be6ead), TOBN(0x9c23f6bc, 0xf34c0a5d)}, + {TOBN(0xd1711a35, 0xfbffd8bb), TOBN(0x60fcfb49, 0x1949d3dd), + TOBN(0x09c8ef4b, 0x7825d93a), TOBN(0x24233cff, 0xa0a8c968)}}, + {{TOBN(0x67ade46c, 0xe6d982af), TOBN(0xebb6bf3e, 0xe7544d7c), + TOBN(0xd6b9ba76, 0x3d8bd087), TOBN(0x46fe382d, 0x4dc61280)}, + {TOBN(0xbd39a7e8, 0xb5bdbd75), TOBN(0xab381331, 0xb8f228fe), + TOBN(0x0709a77c, 0xce1c4300), TOBN(0x6a247e56, 0xf337ceac)}}, + {{TOBN(0x8f34f21b, 0x636288be), TOBN(0x9dfdca74, 0xc8a7c305), + TOBN(0x6decfd1b, 0xea919e04), TOBN(0xcdf2688d, 0x8e1991f8)}, + {TOBN(0xe607df44, 0xd0f8a67e), TOBN(0xd985df4b, 0x0b58d010), + TOBN(0x57f834c5, 0x0c24f8f4), TOBN(0xe976ef56, 0xa0bf01ae)}}, + {{TOBN(0x536395ac, 0xa1c32373), TOBN(0x351027aa, 0x734c0a13), + TOBN(0xd2f1b5d6, 0x5e6bd5bc), TOBN(0x2b539e24, 0x223debed)}, + {TOBN(0xd4994cec, 0x0eaa1d71), TOBN(0x2a83381d, 0x661dcf65), + TOBN(0x5f1aed2f, 0x7b54c740), TOBN(0x0bea3fa5, 0xd6dda5ee)}}, + {{TOBN(0x9d4fb684, 0x36cc6134), TOBN(0x8eb9bbf3, 0xc0a443dd), + TOBN(0xfc500e2e, 0x383b7d2a), TOBN(0x7aad621c, 0x5b775257)}, + {TOBN(0x69284d74, 0x0a8f7cc0), TOBN(0xe820c2ce, 0x07562d65), + TOBN(0xbf9531b9, 0x499758ee), TOBN(0x73e95ca5, 0x6ee0cc2d)}}, + {{TOBN(0xf61790ab, 0xfbaf50a5), TOBN(0xdf55e76b, 0x684e0750), + TOBN(0xec516da7, 0xf176b005), TOBN(0x575553bb, 0x7a2dddc7)}, + {TOBN(0x37c87ca3, 0x553afa73), TOBN(0x315f3ffc, 0x4d55c251), + TOBN(0xe846442a, 0xaf3e5d35), TOBN(0x61b91149, 0x6495ff28)}}, + {{TOBN(0x23cc95d3, 0xfa326dc3), TOBN(0x1df4da1f, 0x18fc2cea), + TOBN(0x24bf9adc, 0xd0a37d59), TOBN(0xb6710053, 0x320d6e1e)}, + {TOBN(0x96f9667e, 0x618344d1), TOBN(0xcc7ce042, 0xa06445af), + TOBN(0xa02d8514, 0xd68dbc3a), TOBN(0x4ea109e4, 0x280b5a5b)}}, + {{TOBN(0x5741a7ac, 0xb40961bf), TOBN(0x4ada5937, 0x6aa56bfa), + TOBN(0x7feb9145, 0x02b765d1), TOBN(0x561e97be, 0xe6ad1582)}, + {TOBN(0xbbc4a5b6, 0xda3982f5), TOBN(0x0c2659ed, 0xb546f468), + TOBN(0xb8e7e6aa, 0x59612d20), TOBN(0xd83dfe20, 0xac19e8e0)}}, + {{TOBN(0x8530c45f, 0xb835398c), TOBN(0x6106a8bf, 0xb38a41c2), + TOBN(0x21e8f9a6, 0x35f5dcdb), TOBN(0x39707137, 0xcae498ed)}, + {TOBN(0x70c23834, 0xd8249f00), TOBN(0x9f14b58f, 0xab2537a0), + TOBN(0xd043c365, 0x5f61c0c2), TOBN(0xdc5926d6, 0x09a194a7)}}, + {{TOBN(0xddec0339, 0x8e77738a), TOBN(0xd07a63ef, 0xfba46426), + TOBN(0x2e58e79c, 0xee7f6e86), TOBN(0xe59b0459, 0xff32d241)}, + {TOBN(0xc5ec84e5, 0x20fa0338), TOBN(0x97939ac8, 0xeaff5ace), + TOBN(0x0310a4e3, 0xb4a38313), TOBN(0x9115fba2, 0x8f9d9885)}}, + {{TOBN(0x8dd710c2, 0x5fadf8c3), TOBN(0x66be38a2, 0xce19c0e2), + TOBN(0xd42a279c, 0x4cfe5022), TOBN(0x597bb530, 0x0e24e1b8)}, + {TOBN(0x3cde86b7, 0xc153ca7f), TOBN(0xa8d30fb3, 0x707d63bd), + 
TOBN(0xac905f92, 0xbd60d21e), TOBN(0x98e7ffb6, 0x7b9a54ab)}}, + {{TOBN(0xd7147df8, 0xe9726a30), TOBN(0xb5e216ff, 0xafce3533), + TOBN(0xb550b799, 0x2ff1ec40), TOBN(0x6b613b87, 0xa1e953fd)}, + {TOBN(0x87b88dba, 0x792d5610), TOBN(0x2ee1270a, 0xa190fbe1), + TOBN(0x02f4e2dc, 0x2ef581da), TOBN(0x016530e4, 0xeff82a95)}}, + {{TOBN(0xcbb93dfd, 0x8fd6ee89), TOBN(0x16d3d986, 0x46848fff), + TOBN(0x600eff24, 0x1da47adf), TOBN(0x1b9754a0, 0x0ad47a71)}, + {TOBN(0x8f9266df, 0x70c33b98), TOBN(0xaadc87ae, 0xdf34186e), + TOBN(0x0d2ce8e1, 0x4ad24132), TOBN(0x8a47cbfc, 0x19946eba)}}, + {{TOBN(0x47feeb66, 0x62b5f3af), TOBN(0xcefab561, 0x0abb3734), + TOBN(0x449de60e, 0x19f35cb1), TOBN(0x39f8db14, 0x157f0eb9)}, + {TOBN(0xffaecc5b, 0x3c61bfd6), TOBN(0xa5a4d41d, 0x41216703), + TOBN(0x7f8fabed, 0x224e1cc2), TOBN(0x0d5a8186, 0x871ad953)}}, + {{TOBN(0xf10774f7, 0xd22da9a9), TOBN(0x45b8a678, 0xcc8a9b0d), + TOBN(0xd9c2e722, 0xbdc32cff), TOBN(0xbf71b5f5, 0x337202a5)}, + {TOBN(0x95c57f2f, 0x69fc4db9), TOBN(0xb6dad34c, 0x765d01e1), + TOBN(0x7e0bd13f, 0xcb904635), TOBN(0x61751253, 0x763a588c)}}, + {{TOBN(0xd85c2997, 0x81af2c2d), TOBN(0xc0f7d9c4, 0x81b9d7da), + TOBN(0x838a34ae, 0x08533e8d), TOBN(0x15c4cb08, 0x311d8311)}, + {TOBN(0x97f83285, 0x8e121e14), TOBN(0xeea7dc1e, 0x85000a5f), + TOBN(0x0c6059b6, 0x5d256274), TOBN(0xec9beace, 0xb95075c0)}}, + {{TOBN(0x173daad7, 0x1df97828), TOBN(0xbf851cb5, 0xa8937877), + TOBN(0xb083c594, 0x01646f3c), TOBN(0x3bad30cf, 0x50c6d352)}, + {TOBN(0xfeb2b202, 0x496bbcea), TOBN(0x3cf9fd4f, 0x18a1e8ba), + TOBN(0xd26de7ff, 0x1c066029), TOBN(0x39c81e9e, 0x4e9ed4f8)}}, + {{TOBN(0xd8be0cb9, 0x7b390d35), TOBN(0x01df2bbd, 0x964aab27), + TOBN(0x3e8c1a65, 0xc3ef64f8), TOBN(0x567291d1, 0x716ed1dd)}, + {TOBN(0x95499c6c, 0x5f5406d3), TOBN(0x71fdda39, 0x5ba8e23f), + TOBN(0xcfeb320e, 0xd5096ece), TOBN(0xbe7ba92b, 0xca66dd16)}}, + {{TOBN(0x4608d36b, 0xc6fb5a7d), TOBN(0xe3eea15a, 0x6d2dd0e0), + TOBN(0x75b0a3eb, 0x8f97a36a), TOBN(0xf59814cc, 0x1c83de1e)}, + {TOBN(0x56c9c5b0, 0x1c33c23f), TOBN(0xa96c1da4, 0x6faa4136), + TOBN(0x46bf2074, 0xde316551), TOBN(0x3b866e7b, 0x1f756c8f)}}, + {{TOBN(0x727727d8, 0x1495ed6b), TOBN(0xb2394243, 0xb682dce7), + TOBN(0x8ab8454e, 0x758610f3), TOBN(0xc243ce84, 0x857d72a4)}, + {TOBN(0x7b320d71, 0xdbbf370f), TOBN(0xff9afa37, 0x78e0f7ca), + TOBN(0x0119d1e0, 0xea7b523f), TOBN(0xb997f8cb, 0x058c7d42)}}, + {{TOBN(0x285bcd2a, 0x37bbb184), TOBN(0x51dcec49, 0xa45d1fa6), + TOBN(0x6ade3b64, 0xe29634cb), TOBN(0x080c94a7, 0x26b86ef1)}, + {TOBN(0xba583db1, 0x2283fbe3), TOBN(0x902bddc8, 0x5a9315ed), + TOBN(0x07c1ccb3, 0x86964bec), TOBN(0x78f4eacf, 0xb6258301)}}, + {{TOBN(0x4bdf3a49, 0x56f90823), TOBN(0xba0f5080, 0x741d777b), + TOBN(0x091d71c3, 0xf38bf760), TOBN(0x9633d50f, 0x9b625b02)}, + {TOBN(0x03ecb743, 0xb8c9de61), TOBN(0xb4751254, 0x5de74720), + TOBN(0x9f9defc9, 0x74ce1cb2), TOBN(0x774a4f6a, 0x00bd32ef)}}, + {{TOBN(0xaca385f7, 0x73848f22), TOBN(0x53dad716, 0xf3f8558e), + TOBN(0xab7b34b0, 0x93c471f9), TOBN(0xf530e069, 0x19644bc7)}, + {TOBN(0x3d9fb1ff, 0xdd59d31a), TOBN(0x4382e0df, 0x08daa795), + TOBN(0x165c6f4b, 0xd5cc88d7), TOBN(0xeaa392d5, 0x4a18c900)}}, + {{TOBN(0x94203c67, 0x648024ee), TOBN(0x188763f2, 0x8c2fabcd), + TOBN(0xa80f87ac, 0xbbaec835), TOBN(0x632c96e0, 0xf29d8d54)}, + {TOBN(0x29b0a60e, 0x4c00a95e), TOBN(0x2ef17f40, 0xe011e9fa), + TOBN(0xf6c0e1d1, 0x15b77223), TOBN(0xaaec2c62, 0x14b04e32)}}, + {{TOBN(0xd35688d8, 0x3d84e58c), TOBN(0x2af5094c, 0x958571db), + TOBN(0x4fff7e19, 0x760682a6), TOBN(0x4cb27077, 0xe39a407c)}, + {TOBN(0x0f59c547, 0x4ff0e321), TOBN(0x169f34a6, 0x1b34c8ff), 
+ TOBN(0x2bff1096, 0x52bc1ba7), TOBN(0xa25423b7, 0x83583544)}}, + {{TOBN(0x5d55d5d5, 0x0ac8b782), TOBN(0xff6622ec, 0x2db3c892), + TOBN(0x48fce741, 0x6b8bb642), TOBN(0x31d6998c, 0x69d7e3dc)}, + {TOBN(0xdbaf8004, 0xcadcaed0), TOBN(0x801b0142, 0xd81d053c), + TOBN(0x94b189fc, 0x59630ec6), TOBN(0x120e9934, 0xaf762c8e)}}, + {{TOBN(0x53a29aa4, 0xfdc6a404), TOBN(0x19d8e01e, 0xa1909948), + TOBN(0x3cfcabf1, 0xd7e89681), TOBN(0x3321a50d, 0x4e132d37)}, + {TOBN(0xd0496863, 0xe9a86111), TOBN(0x8c0cde61, 0x06a3bc65), + TOBN(0xaf866c49, 0xfc9f8eef), TOBN(0x2066350e, 0xff7f5141)}}, + {{TOBN(0x4f8a4689, 0xe56ddfbd), TOBN(0xea1b0c07, 0xfe32983a), + TOBN(0x2b317462, 0x873cb8cb), TOBN(0x658deddc, 0x2d93229f)}, + {TOBN(0x65efaf4d, 0x0f64ef58), TOBN(0xfe43287d, 0x730cc7a8), + TOBN(0xaebc0c72, 0x3d047d70), TOBN(0x92efa539, 0xd92d26c9)}}, + {{TOBN(0x06e78457, 0x94b56526), TOBN(0x415cb80f, 0x0961002d), + TOBN(0x89e5c565, 0x76dcb10f), TOBN(0x8bbb6982, 0xff9259fe)}, + {TOBN(0x4fe8795b, 0x9abc2668), TOBN(0xb5d4f534, 0x1e678fb1), + TOBN(0x6601f3be, 0x7b7da2b9), TOBN(0x98da59e2, 0xa13d6805)}}, + {{TOBN(0x190d8ea6, 0x01799a52), TOBN(0xa20cec41, 0xb86d2952), + TOBN(0x3062ffb2, 0x7fff2a7c), TOBN(0x741b32e5, 0x79f19d37)}, + {TOBN(0xf80d8181, 0x4eb57d47), TOBN(0x7a2d0ed4, 0x16aef06b), + TOBN(0x09735fb0, 0x1cecb588), TOBN(0x1641caaa, 0xc6061f5b)}}}, + {{{TOBN(0x7f99824f, 0x20151427), TOBN(0x206828b6, 0x92430206), + TOBN(0xaa9097d7, 0xe1112357), TOBN(0xacf9a2f2, 0x09e414ec)}, + {TOBN(0xdbdac9da, 0x27915356), TOBN(0x7e0734b7, 0x001efee3), + TOBN(0x54fab5bb, 0xd2b288e2), TOBN(0x4c630fc4, 0xf62dd09c)}}, + {{TOBN(0x8537107a, 0x1ac2703b), TOBN(0xb49258d8, 0x6bc857b5), + TOBN(0x57df14de, 0xbcdaccd1), TOBN(0x24ab68d7, 0xc4ae8529)}, + {TOBN(0x7ed8b5d4, 0x734e59d0), TOBN(0x5f8740c8, 0xc495cc80), + TOBN(0x84aedd5a, 0x291db9b3), TOBN(0x80b360f8, 0x4fb995be)}}, + {{TOBN(0xae915f5d, 0x5fa067d1), TOBN(0x4134b57f, 0x9668960c), + TOBN(0xbd3656d6, 0xa48edaac), TOBN(0xdac1e3e4, 0xfc1d7436)}, + {TOBN(0x674ff869, 0xd81fbb26), TOBN(0x449ed3ec, 0xb26c33d4), + TOBN(0x85138705, 0xd94203e8), TOBN(0xccde538b, 0xbeeb6f4a)}}, + {{TOBN(0x55d5c68d, 0xa61a76fa), TOBN(0x598b441d, 0xca1554dc), + TOBN(0xd39923b9, 0x773b279c), TOBN(0x33331d3c, 0x36bf9efc)}, + {TOBN(0x2d4c848e, 0x298de399), TOBN(0xcfdb8e77, 0xa1a27f56), + TOBN(0x94c855ea, 0x57b8ab70), TOBN(0xdcdb9dae, 0x6f7879ba)}}, + {{TOBN(0x7bdff8c2, 0x019f2a59), TOBN(0xb3ce5bb3, 0xcb4fbc74), + TOBN(0xea907f68, 0x8a9173dd), TOBN(0x6cd3d0d3, 0x95a75439)}, + {TOBN(0x92ecc4d6, 0xefed021c), TOBN(0x09a9f9b0, 0x6a77339a), + TOBN(0x87ca6b15, 0x7188c64a), TOBN(0x10c29968, 0x44899158)}}, + {{TOBN(0x5859a229, 0xed6e82ef), TOBN(0x16f338e3, 0x65ebaf4e), + TOBN(0x0cd31387, 0x5ead67ae), TOBN(0x1c73d228, 0x54ef0bb4)}, + {TOBN(0x4cb55131, 0x74a5c8c7), TOBN(0x01cd2970, 0x7f69ad6a), + TOBN(0xa04d00dd, 0xe966f87e), TOBN(0xd96fe447, 0x0b7b0321)}}, + {{TOBN(0x342ac06e, 0x88fbd381), TOBN(0x02cd4a84, 0x5c35a493), + TOBN(0xe8fa89de, 0x54f1bbcd), TOBN(0x341d6367, 0x2575ed4c)}, + {TOBN(0xebe357fb, 0xd238202b), TOBN(0x600b4d1a, 0xa984ead9), + TOBN(0xc35c9f44, 0x52436ea0), TOBN(0x96fe0a39, 0xa370751b)}}, + {{TOBN(0x4c4f0736, 0x7f636a38), TOBN(0x9f943fb7, 0x0e76d5cb), + TOBN(0xb03510ba, 0xa8b68b8b), TOBN(0xc246780a, 0x9ed07a1f)}, + {TOBN(0x3c051415, 0x6d549fc2), TOBN(0xc2953f31, 0x607781ca), + TOBN(0x955e2c69, 0xd8d95413), TOBN(0xb300fadc, 0x7bd282e3)}}, + {{TOBN(0x81fe7b50, 0x87e9189f), TOBN(0xdb17375c, 0xf42dda27), + TOBN(0x22f7d896, 0xcf0a5904), TOBN(0xa0e57c5a, 0xebe348e6)}, + {TOBN(0xa61011d3, 0xf40e3c80), TOBN(0xb1189321, 
0x8db705c5), + TOBN(0x4ed9309e, 0x50fedec3), TOBN(0xdcf14a10, 0x4d6d5c1d)}}, + {{TOBN(0x056c265b, 0x55691342), TOBN(0xe8e08504, 0x91049dc7), + TOBN(0x131329f5, 0xc9bae20a), TOBN(0x96c8b3e8, 0xd9dccdb4)}, + {TOBN(0x8c5ff838, 0xfb4ee6b4), TOBN(0xfc5a9aeb, 0x41e8ccf0), + TOBN(0x7417b764, 0xfae050c6), TOBN(0x0953c3d7, 0x00452080)}}, + {{TOBN(0x21372682, 0x38dfe7e8), TOBN(0xea417e15, 0x2bb79d4b), + TOBN(0x59641f1c, 0x76e7cf2d), TOBN(0x271e3059, 0xea0bcfcc)}, + {TOBN(0x624c7dfd, 0x7253ecbd), TOBN(0x2f552e25, 0x4fca6186), + TOBN(0xcbf84ecd, 0x4d866e9c), TOBN(0x73967709, 0xf68d4610)}}, + {{TOBN(0xa14b1163, 0xc27901b4), TOBN(0xfd9236e0, 0x899b8bf3), + TOBN(0x42b091ec, 0xcbc6da0a), TOBN(0xbb1dac6f, 0x5ad1d297)}, + {TOBN(0x80e61d53, 0xa91cf76e), TOBN(0x4110a412, 0xd31f1ee7), + TOBN(0x2d87c3ba, 0x13efcf77), TOBN(0x1f374bb4, 0xdf450d76)}}, + {{TOBN(0x5e78e2f2, 0x0d188dab), TOBN(0xe3968ed0, 0xf4b885ef), + TOBN(0x46c0568e, 0x7314570f), TOBN(0x31616338, 0x01170521)}, + {TOBN(0x18e1e7e2, 0x4f0c8afe), TOBN(0x4caa75ff, 0xdeea78da), + TOBN(0x82db67f2, 0x7c5d8a51), TOBN(0x36a44d86, 0x6f505370)}}, + {{TOBN(0xd72c5bda, 0x0333974f), TOBN(0x5db516ae, 0x27a70146), + TOBN(0x34705281, 0x210ef921), TOBN(0xbff17a8f, 0x0c9c38e5)}, + {TOBN(0x78f4814e, 0x12476da1), TOBN(0xc1e16613, 0x33c16980), + TOBN(0x9e5b386f, 0x424d4bca), TOBN(0x4c274e87, 0xc85740de)}}, + {{TOBN(0xb6a9b88d, 0x6c2f5226), TOBN(0x14d1b944, 0x550d7ca8), + TOBN(0x580c85fc, 0x1fc41709), TOBN(0xc1da368b, 0x54c6d519)}, + {TOBN(0x2b0785ce, 0xd5113cf7), TOBN(0x0670f633, 0x5a34708f), + TOBN(0x46e23767, 0x15cc3f88), TOBN(0x1b480cfa, 0x50c72c8f)}}, + {{TOBN(0x20288602, 0x4147519a), TOBN(0xd0981eac, 0x26b372f0), + TOBN(0xa9d4a7ca, 0xa785ebc8), TOBN(0xd953c50d, 0xdbdf58e9)}, + {TOBN(0x9d6361cc, 0xfd590f8f), TOBN(0x72e9626b, 0x44e6c917), + TOBN(0x7fd96110, 0x22eb64cf), TOBN(0x863ebb7e, 0x9eb288f3)}}, + {{TOBN(0x6e6ab761, 0x6aca8ee7), TOBN(0x97d10b39, 0xd7b40358), + TOBN(0x1687d377, 0x1e5feb0d), TOBN(0xc83e50e4, 0x8265a27a)}, + {TOBN(0x8f75a9fe, 0xc954b313), TOBN(0xcc2e8f47, 0x310d1f61), + TOBN(0xf5ba81c5, 0x6557d0e0), TOBN(0x25f9680c, 0x3eaf6207)}}, + {{TOBN(0xf95c6609, 0x4354080b), TOBN(0x5225bfa5, 0x7bf2fe1c), + TOBN(0xc5c004e2, 0x5c7d98fa), TOBN(0x3561bf1c, 0x019aaf60)}, + {TOBN(0x5e6f9f17, 0xba151474), TOBN(0xdec2f934, 0xb04f6eca), + TOBN(0x64e368a1, 0x269acb1e), TOBN(0x1332d9e4, 0x0cdda493)}}, + {{TOBN(0x60d6cf69, 0xdf23de05), TOBN(0x66d17da2, 0x009339a0), + TOBN(0x9fcac985, 0x0a693923), TOBN(0xbcf057fc, 0xed7c6a6d)}, + {TOBN(0xc3c5c8c5, 0xf0b5662c), TOBN(0x25318dd8, 0xdcba4f24), + TOBN(0x60e8cb75, 0x082b69ff), TOBN(0x7c23b3ee, 0x1e728c01)}}, + {{TOBN(0x15e10a0a, 0x097e4403), TOBN(0xcb3d0a86, 0x19854665), + TOBN(0x88d8e211, 0xd67d4826), TOBN(0xb39af66e, 0x0b9d2839)}, + {TOBN(0xa5f94588, 0xbd475ca8), TOBN(0xe06b7966, 0xc077b80b), + TOBN(0xfedb1485, 0xda27c26c), TOBN(0xd290d33a, 0xfe0fd5e0)}}, + {{TOBN(0xa40bcc47, 0xf34fb0fa), TOBN(0xb4760cc8, 0x1fb1ab09), + TOBN(0x8fca0993, 0xa273bfe3), TOBN(0x13e4fe07, 0xf70b213c)}, + {TOBN(0x3bcdb992, 0xfdb05163), TOBN(0x8c484b11, 0x0c2b19b6), + TOBN(0x1acb815f, 0xaaf2e3e2), TOBN(0xc6905935, 0xb89ff1b4)}}, + {{TOBN(0xb2ad6f9d, 0x586e74e1), TOBN(0x488883ad, 0x67b80484), + TOBN(0x758aa2c7, 0x369c3ddb), TOBN(0x8ab74e69, 0x9f9afd31)}, + {TOBN(0x10fc2d28, 0x5e21beb1), TOBN(0x3484518a, 0x318c42f9), + TOBN(0x377427dc, 0x53cf40c3), TOBN(0x9de0781a, 0x391bc1d9)}}, + {{TOBN(0x8faee858, 0x693807e1), TOBN(0xa3865327, 0x4e81ccc7), + TOBN(0x02c30ff2, 0x6f835b84), TOBN(0xb604437b, 0x0d3d38d4)}, + {TOBN(0xb3fc8a98, 0x5ca1823d), 
TOBN(0xb82f7ec9, 0x03be0324), + TOBN(0xee36d761, 0xcf684a33), TOBN(0x5a01df0e, 0x9f29bf7d)}}, + {{TOBN(0x686202f3, 0x1306583d), TOBN(0x05b10da0, 0x437c622e), + TOBN(0xbf9aaa0f, 0x076a7bc8), TOBN(0x25e94efb, 0x8f8f4e43)}, + {TOBN(0x8a35c9b7, 0xfa3dc26d), TOBN(0xe0e5fb93, 0x96ff03c5), + TOBN(0xa77e3843, 0xebc394ce), TOBN(0xcede6595, 0x8361de60)}}, + {{TOBN(0xd27c22f6, 0xa1993545), TOBN(0xab01cc36, 0x24d671ba), + TOBN(0x63fa2877, 0xa169c28e), TOBN(0x925ef904, 0x2eb08376)}, + {TOBN(0x3b2fa3cf, 0x53aa0b32), TOBN(0xb27beb5b, 0x71c49d7a), + TOBN(0xb60e1834, 0xd105e27f), TOBN(0xd6089788, 0x4f68570d)}}, + {{TOBN(0x23094ce0, 0xd6fbc2ac), TOBN(0x738037a1, 0x815ff551), + TOBN(0xda73b1bb, 0x6bef119c), TOBN(0xdcf6c430, 0xeef506ba)}, + {TOBN(0x00e4fe7b, 0xe3ef104a), TOBN(0xebdd9a2c, 0x0a065628), + TOBN(0x853a81c3, 0x8792043e), TOBN(0x22ad6ece, 0xb3b59108)}}, + {{TOBN(0x9fb813c0, 0x39cd297d), TOBN(0x8ec7e16e, 0x05bda5d9), + TOBN(0x2834797c, 0x0d104b96), TOBN(0xcc11a2e7, 0x7c511510)}, + {TOBN(0x96ca5a53, 0x96ee6380), TOBN(0x054c8655, 0xcea38742), + TOBN(0xb5946852, 0xd54dfa7d), TOBN(0x97c422e7, 0x1f4ab207)}}, + {{TOBN(0xbf907509, 0x0c22b540), TOBN(0x2cde42aa, 0xb7c267d4), + TOBN(0xba18f9ed, 0x5ab0d693), TOBN(0x3ba62aa6, 0x6e4660d9)}, + {TOBN(0xb24bf97b, 0xab9ea96a), TOBN(0x5d039642, 0xe3b60e32), + TOBN(0x4e6a4506, 0x7c4d9bd5), TOBN(0x666c5b9e, 0x7ed4a6a4)}}, + {{TOBN(0xfa3fdcd9, 0x8edbd7cc), TOBN(0x4660bb87, 0xc6ccd753), + TOBN(0x9ae90820, 0x21e6b64f), TOBN(0x8a56a713, 0xb36bfb3f)}, + {TOBN(0xabfce096, 0x5726d47f), TOBN(0x9eed01b2, 0x0b1a9a7f), + TOBN(0x30e9cad4, 0x4eb74a37), TOBN(0x7b2524cc, 0x53e9666d)}}, + {{TOBN(0x6a29683b, 0x8f4b002f), TOBN(0xc2200d7a, 0x41f4fc20), + TOBN(0xcf3af47a, 0x3a338acc), TOBN(0x6539a4fb, 0xe7128975)}, + {TOBN(0xcec31c14, 0xc33c7fcf), TOBN(0x7eb6799b, 0xc7be322b), + TOBN(0x119ef4e9, 0x6646f623), TOBN(0x7b7a26a5, 0x54d7299b)}}, + {{TOBN(0xcb37f08d, 0x403f46f2), TOBN(0x94b8fc43, 0x1a0ec0c7), + TOBN(0xbb8514e3, 0xc332142f), TOBN(0xf3ed2c33, 0xe80d2a7a)}, + {TOBN(0x8d2080af, 0xb639126c), TOBN(0xf7b6be60, 0xe3553ade), + TOBN(0x3950aa9f, 0x1c7e2b09), TOBN(0x847ff958, 0x6410f02b)}}, + {{TOBN(0x877b7cf5, 0x678a31b0), TOBN(0xd50301ae, 0x3998b620), + TOBN(0x734257c5, 0xc00fb396), TOBN(0xf9fb18a0, 0x04e672a6)}, + {TOBN(0xff8bd8eb, 0xe8758851), TOBN(0x1e64e4c6, 0x5d99ba44), + TOBN(0x4b8eaedf, 0x7dfd93b7), TOBN(0xba2f2a98, 0x04e76b8c)}}, + {{TOBN(0x7d790cba, 0xe8053433), TOBN(0xc8e725a0, 0x3d2c9585), + TOBN(0x58c5c476, 0xcdd8f5ed), TOBN(0xd106b952, 0xefa9fe1d)}, + {TOBN(0x3c5c775b, 0x0eff13a9), TOBN(0x242442ba, 0xe057b930), + TOBN(0xe9f458d4, 0xc9b70cbd), TOBN(0x69b71448, 0xa3cdb89a)}}, + {{TOBN(0x41ee46f6, 0x0e2ed742), TOBN(0x573f1045, 0x40067493), + TOBN(0xb1e154ff, 0x9d54c304), TOBN(0x2ad0436a, 0x8d3a7502)}, + {TOBN(0xee4aaa2d, 0x431a8121), TOBN(0xcd38b3ab, 0x886f11ed), + TOBN(0x57d49ea6, 0x034a0eb7), TOBN(0xd2b773bd, 0xf7e85e58)}}, + {{TOBN(0x4a559ac4, 0x9b5c1f14), TOBN(0xc444be1a, 0x3e54df2b), + TOBN(0x13aad704, 0xeda41891), TOBN(0xcd927bec, 0x5eb5c788)}, + {TOBN(0xeb3c8516, 0xe48c8a34), TOBN(0x1b7ac812, 0x4b546669), + TOBN(0x1815f896, 0x594df8ec), TOBN(0x87c6a79c, 0x79227865)}}, + {{TOBN(0xae02a2f0, 0x9b56ddbd), TOBN(0x1339b5ac, 0x8a2f1cf3), + TOBN(0xf2b569c7, 0x839dff0d), TOBN(0xb0b9e864, 0xfee9a43d)}, + {TOBN(0x4ff8ca41, 0x77bb064e), TOBN(0x145a2812, 0xfd249f63), + TOBN(0x3ab7beac, 0xf86f689a), TOBN(0x9bafec27, 0x01d35f5e)}}, + {{TOBN(0x28054c65, 0x4265aa91), TOBN(0xa4b18304, 0x035efe42), + TOBN(0x6887b0e6, 0x9639dec7), TOBN(0xf4b8f6ad, 0x3d52aea5)}, + {TOBN(0xfb9293cc, 
0x971a8a13), TOBN(0x3f159e5d, 0x4c934d07), + TOBN(0x2c50e9b1, 0x09acbc29), TOBN(0x08eb65e6, 0x7154d129)}}, + {{TOBN(0x4feff589, 0x30b75c3e), TOBN(0x0bb82fe2, 0x94491c93), + TOBN(0xd8ac377a, 0x89af62bb), TOBN(0xd7b51490, 0x9685e49f)}, + {TOBN(0xabca9a7b, 0x04497f19), TOBN(0x1b35ed0a, 0x1a7ad13f), + TOBN(0x6b601e21, 0x3ec86ed6), TOBN(0xda91fcb9, 0xce0c76f1)}}, + {{TOBN(0x9e28507b, 0xd7ab27e1), TOBN(0x7c19a555, 0x63945b7b), + TOBN(0x6b43f0a1, 0xaafc9827), TOBN(0x443b4fbd, 0x3aa55b91)}, + {TOBN(0x962b2e65, 0x6962c88f), TOBN(0x139da8d4, 0xce0db0ca), + TOBN(0xb93f05dd, 0x1b8d6c4f), TOBN(0x779cdff7, 0x180b9824)}}, + {{TOBN(0xbba23fdd, 0xae57c7b7), TOBN(0x345342f2, 0x1b932522), + TOBN(0xfd9c80fe, 0x556d4aa3), TOBN(0xa03907ba, 0x6525bb61)}, + {TOBN(0x38b010e1, 0xff218933), TOBN(0xc066b654, 0xaa52117b), + TOBN(0x8e141920, 0x94f2e6ea), TOBN(0x66a27dca, 0x0d32f2b2)}}, + {{TOBN(0x69c7f993, 0x048b3717), TOBN(0xbf5a989a, 0xb178ae1c), + TOBN(0x49fa9058, 0x564f1d6b), TOBN(0x27ec6e15, 0xd31fde4e)}, + {TOBN(0x4cce0373, 0x7276e7fc), TOBN(0x64086d79, 0x89d6bf02), + TOBN(0x5a72f046, 0x4ccdd979), TOBN(0x909c3566, 0x47775631)}}, + {{TOBN(0x1c07bc6b, 0x75dd7125), TOBN(0xb4c6bc97, 0x87a0428d), + TOBN(0x507ece52, 0xfdeb6b9d), TOBN(0xfca56512, 0xb2c95432)}, + {TOBN(0x15d97181, 0xd0e8bd06), TOBN(0x384dd317, 0xc6bb46ea), + TOBN(0x5441ea20, 0x3952b624), TOBN(0xbcf70dee, 0x4e7dc2fb)}}, + {{TOBN(0x372b016e, 0x6628e8c3), TOBN(0x07a0d667, 0xb60a7522), + TOBN(0xcf05751b, 0x0a344ee2), TOBN(0x0ec09a48, 0x118bdeec)}, + {TOBN(0x6e4b3d4e, 0xd83dce46), TOBN(0x43a6316d, 0x99d2fc6e), + TOBN(0xa99d8989, 0x56cf044c), TOBN(0x7c7f4454, 0xae3e5fb7)}}, + {{TOBN(0xb2e6b121, 0xfbabbe92), TOBN(0x281850fb, 0xe1330076), + TOBN(0x093581ec, 0x97890015), TOBN(0x69b1dded, 0x75ff77f5)}, + {TOBN(0x7cf0b18f, 0xab105105), TOBN(0x953ced31, 0xa89ccfef), + TOBN(0x3151f85f, 0xeb914009), TOBN(0x3c9f1b87, 0x88ed48ad)}}, + {{TOBN(0xc9aba1a1, 0x4a7eadcb), TOBN(0x928e7501, 0x522e71cf), + TOBN(0xeaede727, 0x3a2e4f83), TOBN(0x467e10d1, 0x1ce3bbd3)}, + {TOBN(0xf3442ac3, 0xb955dcf0), TOBN(0xba96307d, 0xd3d5e527), + TOBN(0xf763a10e, 0xfd77f474), TOBN(0x5d744bd0, 0x6a6e1ff0)}}, + {{TOBN(0xd287282a, 0xa777899e), TOBN(0xe20eda8f, 0xd03f3cde), + TOBN(0x6a7e75bb, 0x50b07d31), TOBN(0x0b7e2a94, 0x6f379de4)}, + {TOBN(0x31cb64ad, 0x19f593cf), TOBN(0x7b1a9e4f, 0x1e76ef1d), + TOBN(0xe18c9c9d, 0xb62d609c), TOBN(0x439bad6d, 0xe779a650)}}, + {{TOBN(0x219d9066, 0xe032f144), TOBN(0x1db632b8, 0xe8b2ec6a), + TOBN(0xff0d0fd4, 0xfda12f78), TOBN(0x56fb4c2d, 0x2a25d265)}, + {TOBN(0x5f4e2ee1, 0x255a03f1), TOBN(0x61cd6af2, 0xe96af176), + TOBN(0xe0317ba8, 0xd068bc97), TOBN(0x927d6bab, 0x264b988e)}}, + {{TOBN(0xa18f07e0, 0xe90fb21e), TOBN(0x00fd2b80, 0xbba7fca1), + TOBN(0x20387f27, 0x95cd67b5), TOBN(0x5b89a4e7, 0xd39707f7)}, + {TOBN(0x8f83ad3f, 0x894407ce), TOBN(0xa0025b94, 0x6c226132), + TOBN(0xc79563c7, 0xf906c13b), TOBN(0x5f548f31, 0x4e7bb025)}}, + {{TOBN(0x2b4c6b8f, 0xeac6d113), TOBN(0xa67e3f9c, 0x0e813c76), + TOBN(0x3982717c, 0x3fe1f4b9), TOBN(0x58865819, 0x26d8050e)}, + {TOBN(0x99f3640c, 0xf7f06f20), TOBN(0xdc610216, 0x2a66ebc2), + TOBN(0x52f2c175, 0x767a1e08), TOBN(0x05660e1a, 0x5999871b)}}, + {{TOBN(0x6b0f1762, 0x6d3c4693), TOBN(0xf0e7d627, 0x37ed7bea), + TOBN(0xc51758c7, 0xb75b226d), TOBN(0x40a88628, 0x1f91613b)}, + {TOBN(0x889dbaa7, 0xbbb38ce0), TOBN(0xe0404b65, 0xbddcad81), + TOBN(0xfebccd3a, 0x8bc9671f), TOBN(0xfbf9a357, 0xee1f5375)}}, + {{TOBN(0x5dc169b0, 0x28f33398), TOBN(0xb07ec11d, 0x72e90f65), + TOBN(0xae7f3b4a, 0xfaab1eb1), TOBN(0xd970195e, 0x5f17538a)}, + 
{TOBN(0x52b05cbe, 0x0181e640), TOBN(0xf5debd62, 0x2643313d), + TOBN(0x76148154, 0x5df31f82), TOBN(0x23e03b33, 0x3a9e13c5)}}, + {{TOBN(0xff758949, 0x4fde0c1f), TOBN(0xbf8a1abe, 0xe5b6ec20), + TOBN(0x702278fb, 0x87e1db6c), TOBN(0xc447ad7a, 0x35ed658f)}, + {TOBN(0x48d4aa38, 0x03d0ccf2), TOBN(0x80acb338, 0x819a7c03), + TOBN(0x9bc7c89e, 0x6e17cecc), TOBN(0x46736b8b, 0x03be1d82)}}, + {{TOBN(0xd65d7b60, 0xc0432f96), TOBN(0xddebe7a3, 0xdeb5442f), + TOBN(0x79a25307, 0x7dff69a2), TOBN(0x37a56d94, 0x02cf3122)}, + {TOBN(0x8bab8aed, 0xf2350d0a), TOBN(0x13c3f276, 0x037b0d9a), + TOBN(0xc664957c, 0x44c65cae), TOBN(0x88b44089, 0xc2e71a88)}}, + {{TOBN(0xdb88e5a3, 0x5cb02664), TOBN(0x5d4c0bf1, 0x8686c72e), + TOBN(0xea3d9b62, 0xa682d53e), TOBN(0x9b605ef4, 0x0b2ad431)}, + {TOBN(0x71bac202, 0xc69645d0), TOBN(0xa115f03a, 0x6a1b66e7), + TOBN(0xfe2c563a, 0x158f4dc4), TOBN(0xf715b3a0, 0x4d12a78c)}}, + {{TOBN(0x8f7f0a48, 0xd413213a), TOBN(0x2035806d, 0xc04becdb), + TOBN(0xecd34a99, 0x5d8587f5), TOBN(0x4d8c3079, 0x9f6d3a71)}, + {TOBN(0x1b2a2a67, 0x8d95a8f6), TOBN(0xc58c9d7d, 0xf2110d0d), + TOBN(0xdeee81d5, 0xcf8fba3f), TOBN(0xa42be3c0, 0x0c7cdf68)}}, + {{TOBN(0x2126f742, 0xd43b5eaa), TOBN(0x054a0766, 0xdfa59b85), + TOBN(0x9d0d5e36, 0x126bfd45), TOBN(0xa1f8fbd7, 0x384f8a8f)}, + {TOBN(0x317680f5, 0xd563fccc), TOBN(0x48ca5055, 0xf280a928), + TOBN(0xe00b81b2, 0x27b578cf), TOBN(0x10aad918, 0x2994a514)}}, + {{TOBN(0xd9e07b62, 0xb7bdc953), TOBN(0x9f0f6ff2, 0x5bc086dd), + TOBN(0x09d1ccff, 0x655eee77), TOBN(0x45475f79, 0x5bef7df1)}, + {TOBN(0x3faa28fa, 0x86f702cc), TOBN(0x92e60905, 0x0f021f07), + TOBN(0xe9e62968, 0x7f8fa8c6), TOBN(0xbd71419a, 0xf036ea2c)}}, + {{TOBN(0x171ee1cc, 0x6028da9a), TOBN(0x5352fe1a, 0xc251f573), + TOBN(0xf8ff236e, 0x3fa997f4), TOBN(0xd831b6c9, 0xa5749d5f)}, + {TOBN(0x7c872e1d, 0xe350e2c2), TOBN(0xc56240d9, 0x1e0ce403), + TOBN(0xf9deb077, 0x6974f5cb), TOBN(0x7d50ba87, 0x961c3728)}}, + {{TOBN(0xd6f89426, 0x5a3a2518), TOBN(0xcf817799, 0xc6303d43), + TOBN(0x510a0471, 0x619e5696), TOBN(0xab049ff6, 0x3a5e307b)}, + {TOBN(0xe4cdf9b0, 0xfeb13ec7), TOBN(0xd5e97117, 0x9d8ff90c), + TOBN(0xf6f64d06, 0x9afa96af), TOBN(0x00d0bf5e, 0x9d2012a2)}}, + {{TOBN(0xe63f301f, 0x358bcdc0), TOBN(0x07689e99, 0x0a9d47f8), + TOBN(0x1f689e2f, 0x4f43d43a), TOBN(0x4d542a16, 0x90920904)}, + {TOBN(0xaea293d5, 0x9ca0a707), TOBN(0xd061fe45, 0x8ac68065), + TOBN(0x1033bf1b, 0x0090008c), TOBN(0x29749558, 0xc08a6db6)}}, + {{TOBN(0x74b5fc59, 0xc1d5d034), TOBN(0xf712e9f6, 0x67e215e0), + TOBN(0xfd520cbd, 0x860200e6), TOBN(0x0229acb4, 0x3ea22588)}, + {TOBN(0x9cd1e14c, 0xfff0c82e), TOBN(0x87684b62, 0x59c69e73), + TOBN(0xda85e61c, 0x96ccb989), TOBN(0x2d5dbb02, 0xa3d06493)}}, + {{TOBN(0xf22ad33a, 0xe86b173c), TOBN(0xe8e41ea5, 0xa79ff0e3), + TOBN(0x01d2d725, 0xdd0d0c10), TOBN(0x31f39088, 0x032d28f9)}, + {TOBN(0x7b3f71e1, 0x7829839e), TOBN(0x0cf691b4, 0x4502ae58), + TOBN(0xef658dbd, 0xbefc6115), TOBN(0xa5cd6ee5, 0xb3ab5314)}}, + {{TOBN(0x206c8d7b, 0x5f1d2347), TOBN(0x794645ba, 0x4cc2253a), + TOBN(0xd517d8ff, 0x58389e08), TOBN(0x4fa20dee, 0x9f847288)}, + {TOBN(0xeba072d8, 0xd797770a), TOBN(0x7360c91d, 0xbf429e26), + TOBN(0x7200a3b3, 0x80af8279), TOBN(0x6a1c9150, 0x82dadce3)}}, + {{TOBN(0x0ee6d3a7, 0xc35d8794), TOBN(0x042e6558, 0x0356bae5), + TOBN(0x9f59698d, 0x643322fd), TOBN(0x9379ae15, 0x50a61967)}, + {TOBN(0x64b9ae62, 0xfcc9981e), TOBN(0xaed3d631, 0x6d2934c6), + TOBN(0x2454b302, 0x5e4e65eb), TOBN(0xab09f647, 0xf9950428)}}}, + {{{TOBN(0xb2083a12, 0x22248acc), TOBN(0x1f6ec0ef, 0x3264e366), + TOBN(0x5659b704, 0x5afdee28), TOBN(0x7a823a40, 
0xe6430bb5)}, + {TOBN(0x24592a04, 0xe1900a79), TOBN(0xcde09d4a, 0xc9ee6576), + TOBN(0x52b6463f, 0x4b5ea54a), TOBN(0x1efe9ed3, 0xd3ca65a7)}}, + {{TOBN(0xe27a6dbe, 0x305406dd), TOBN(0x8eb7dc7f, 0xdd5d1957), + TOBN(0xf54a6876, 0x387d4d8f), TOBN(0x9c479409, 0xc7762de4)}, + {TOBN(0xbe4d5b5d, 0x99b30778), TOBN(0x25380c56, 0x6e793682), + TOBN(0x602d37f3, 0xdac740e3), TOBN(0x140deabe, 0x1566e4ae)}}, + {{TOBN(0x4481d067, 0xafd32acf), TOBN(0xd8f0fcca, 0xe1f71ccf), + TOBN(0xd208dd0c, 0xb596f2da), TOBN(0xd049d730, 0x9aad93f9)}, + {TOBN(0xc79f263d, 0x42ab580e), TOBN(0x09411bb1, 0x23f707b4), + TOBN(0x8cfde1ff, 0x835e0eda), TOBN(0x72707490, 0x90f03402)}}, + {{TOBN(0xeaee6126, 0xc49a861e), TOBN(0x024f3b65, 0xe14f0d06), + TOBN(0x51a3f1e8, 0xc69bfc17), TOBN(0xc3c3a8e9, 0xa7686381)}, + {TOBN(0x3400752c, 0xb103d4c8), TOBN(0x02bc4613, 0x9218b36b), + TOBN(0xc67f75eb, 0x7651504a), TOBN(0xd6848b56, 0xd02aebfa)}}, + {{TOBN(0xbd9802e6, 0xc30fa92b), TOBN(0x5a70d96d, 0x9a552784), + TOBN(0x9085c4ea, 0x3f83169b), TOBN(0xfa9423bb, 0x06908228)}, + {TOBN(0x2ffebe12, 0xfe97a5b9), TOBN(0x85da6049, 0x71b99118), + TOBN(0x9cbc2f7f, 0x63178846), TOBN(0xfd96bc70, 0x9153218e)}}, + {{TOBN(0x958381db, 0x1782269b), TOBN(0xae34bf79, 0x2597e550), + TOBN(0xbb5c6064, 0x5f385153), TOBN(0x6f0e96af, 0xe3088048)}, + {TOBN(0xbf6a0215, 0x77884456), TOBN(0xb3b5688c, 0x69310ea7), + TOBN(0x17c94295, 0x04fad2de), TOBN(0xe020f0e5, 0x17896d4d)}}, + {{TOBN(0x730ba0ab, 0x0976505f), TOBN(0x567f6813, 0x095e2ec5), + TOBN(0x47062010, 0x6331ab71), TOBN(0x72cfa977, 0x41d22b9f)}, + {TOBN(0x33e55ead, 0x8a2373da), TOBN(0xa8d0d5f4, 0x7ba45a68), + TOBN(0xba1d8f9c, 0x03029d15), TOBN(0x8f34f1cc, 0xfc55b9f3)}}, + {{TOBN(0xcca4428d, 0xbbe5a1a9), TOBN(0x8187fd5f, 0x3126bd67), + TOBN(0x0036973a, 0x48105826), TOBN(0xa39b6663, 0xb8bd61a0)}, + {TOBN(0x6d42deef, 0x2d65a808), TOBN(0x4969044f, 0x94636b19), + TOBN(0xf611ee47, 0xdd5d564c), TOBN(0x7b2f3a49, 0xd2873077)}}, + {{TOBN(0x94157d45, 0x300eb294), TOBN(0x2b2a656e, 0x169c1494), + TOBN(0xc000dd76, 0xd3a47aa9), TOBN(0xa2864e4f, 0xa6243ea4)}, + {TOBN(0x82716c47, 0xdb89842e), TOBN(0x12dfd7d7, 0x61479fb7), + TOBN(0x3b9a2c56, 0xe0b2f6dc), TOBN(0x46be862a, 0xd7f85d67)}}, + {{TOBN(0x03b0d8dd, 0x0f82b214), TOBN(0x460c34f9, 0xf103cbc6), + TOBN(0xf32e5c03, 0x18d79e19), TOBN(0x8b8888ba, 0xa84117f8)}, + {TOBN(0x8f3c37dc, 0xc0722677), TOBN(0x10d21be9, 0x1c1c0f27), + TOBN(0xd47c8468, 0xe0f7a0c6), TOBN(0x9bf02213, 0xadecc0e0)}}, + {{TOBN(0x0baa7d12, 0x42b48b99), TOBN(0x1bcb665d, 0x48424096), + TOBN(0x8b847cd6, 0xebfb5cfb), TOBN(0x87c2ae56, 0x9ad4d10d)}, + {TOBN(0xf1cbb122, 0x0de36726), TOBN(0xe7043c68, 0x3fdfbd21), + TOBN(0x4bd0826a, 0x4e79d460), TOBN(0x11f5e598, 0x4bd1a2cb)}}, + {{TOBN(0x97554160, 0xb7fe7b6e), TOBN(0x7d16189a, 0x400a3fb2), + TOBN(0xd73e9bea, 0xe328ca1e), TOBN(0x0dd04b97, 0xe793d8cc)}, + {TOBN(0xa9c83c9b, 0x506db8cc), TOBN(0x5cd47aae, 0xcf38814c), + TOBN(0x26fc430d, 0xb64b45e6), TOBN(0x079b5499, 0xd818ea84)}}, + {{TOBN(0xebb01102, 0xc1c24a3b), TOBN(0xca24e568, 0x1c161c1a), + TOBN(0x103eea69, 0x36f00a4a), TOBN(0x9ad76ee8, 0x76176c7b)}, + {TOBN(0x97451fc2, 0x538e0ff7), TOBN(0x94f89809, 0x6604b3b0), + TOBN(0x6311436e, 0x3249cfd7), TOBN(0x27b4a7bd, 0x41224f69)}}, + {{TOBN(0x03b5d21a, 0xe0ac2941), TOBN(0x279b0254, 0xc2d31937), + TOBN(0x3307c052, 0xcac992d0), TOBN(0x6aa7cb92, 0xefa8b1f3)}, + {TOBN(0x5a182580, 0x0d37c7a5), TOBN(0x13380c37, 0x342d5422), + TOBN(0x92ac2d66, 0xd5d2ef92), TOBN(0x035a70c9, 0x030c63c6)}}, + {{TOBN(0xc16025dd, 0x4ce4f152), TOBN(0x1f419a71, 0xf9df7c06), + TOBN(0x6d5b2214, 0x91e4bb14), 
TOBN(0xfc43c6cc, 0x839fb4ce)}, + {TOBN(0x49f06591, 0x925d6b2d), TOBN(0x4b37d9d3, 0x62186598), + TOBN(0x8c54a971, 0xd01b1629), TOBN(0xe1a9c29f, 0x51d50e05)}}, + {{TOBN(0x5109b785, 0x71ba1861), TOBN(0x48b22d5c, 0xd0c8f93d), + TOBN(0xe8fa84a7, 0x8633bb93), TOBN(0x53fba6ba, 0x5aebbd08)}, + {TOBN(0x7ff27df3, 0xe5eea7d8), TOBN(0x521c8796, 0x68ca7158), + TOBN(0xb9d5133b, 0xce6f1a05), TOBN(0x2d50cd53, 0xfd0ebee4)}}, + {{TOBN(0xc82115d6, 0xc5a3ef16), TOBN(0x993eff9d, 0xba079221), + TOBN(0xe4da2c5e, 0x4b5da81c), TOBN(0x9a89dbdb, 0x8033fd85)}, + {TOBN(0x60819ebf, 0x2b892891), TOBN(0x53902b21, 0x5d14a4d5), + TOBN(0x6ac35051, 0xd7fda421), TOBN(0xcc6ab885, 0x61c83284)}}, + {{TOBN(0x14eba133, 0xf74cff17), TOBN(0x240aaa03, 0xecb813f2), + TOBN(0xcfbb6540, 0x6f665bee), TOBN(0x084b1fe4, 0xa425ad73)}, + {TOBN(0x009d5d16, 0xd081f6a6), TOBN(0x35304fe8, 0xeef82c90), + TOBN(0xf20346d5, 0xaa9eaa22), TOBN(0x0ada9f07, 0xac1c91e3)}}, + {{TOBN(0xa6e21678, 0x968a6144), TOBN(0x54c1f77c, 0x07b31a1e), + TOBN(0xd6bb787e, 0x5781fbe1), TOBN(0x61bd2ee0, 0xe31f1c4a)}, + {TOBN(0xf25aa1e9, 0x781105fc), TOBN(0x9cf2971f, 0x7b2f8e80), + TOBN(0x26d15412, 0xcdff919b), TOBN(0x01db4ebe, 0x34bc896e)}}, + {{TOBN(0x7d9b3e23, 0xb40df1cf), TOBN(0x59337373, 0x94e971b4), + TOBN(0xbf57bd14, 0x669cf921), TOBN(0x865daedf, 0x0c1a1064)}, + {TOBN(0x3eb70bd3, 0x83279125), TOBN(0xbc3d5b9f, 0x34ecdaab), + TOBN(0x91e3ed7e, 0x5f755caf), TOBN(0x49699f54, 0xd41e6f02)}}, + {{TOBN(0x185770e1, 0xd4a7a15b), TOBN(0x08f3587a, 0xeaac87e7), + TOBN(0x352018db, 0x473133ea), TOBN(0x674ce719, 0x04fd30fc)}, + {TOBN(0x7b8d9835, 0x088b3e0e), TOBN(0x7a0356a9, 0x5d0d47a1), + TOBN(0x9d9e7659, 0x6474a3c4), TOBN(0x61ea48a7, 0xff66966c)}}, + {{TOBN(0x30417758, 0x0f3e4834), TOBN(0xfdbb21c2, 0x17a9afcb), + TOBN(0x756fa17f, 0x2f9a67b3), TOBN(0x2a6b2421, 0xa245c1a8)}, + {TOBN(0x64be2794, 0x4af02291), TOBN(0xade465c6, 0x2a5804fe), + TOBN(0x8dffbd39, 0xa6f08fd7), TOBN(0xc4efa84c, 0xaa14403b)}}, + {{TOBN(0xa1b91b2a, 0x442b0f5c), TOBN(0xb748e317, 0xcf997736), + TOBN(0x8d1b62bf, 0xcee90e16), TOBN(0x907ae271, 0x0b2078c0)}, + {TOBN(0xdf31534b, 0x0c9bcddd), TOBN(0x043fb054, 0x39adce83), + TOBN(0x99031043, 0xd826846a), TOBN(0x61a9c0d6, 0xb144f393)}}, + {{TOBN(0xdab48046, 0x47718427), TOBN(0xdf17ff9b, 0x6e830f8b), + TOBN(0x408d7ee8, 0xe49a1347), TOBN(0x6ac71e23, 0x91c1d4ae)}, + {TOBN(0xc8cbb9fd, 0x1defd73c), TOBN(0x19840657, 0xbbbbfec5), + TOBN(0x39db1cb5, 0x9e7ef8ea), TOBN(0x78aa8296, 0x64105f30)}}, + {{TOBN(0xa3d9b7f0, 0xa3738c29), TOBN(0x0a2f235a, 0xbc3250a3), + TOBN(0x55e506f6, 0x445e4caf), TOBN(0x0974f73d, 0x33475f7a)}, + {TOBN(0xd37dbba3, 0x5ba2f5a8), TOBN(0x542c6e63, 0x6af40066), + TOBN(0x26d99b53, 0xc5d73e2c), TOBN(0x06060d7d, 0x6c3ca33e)}}, + {{TOBN(0xcdbef1c2, 0x065fef4a), TOBN(0x77e60f7d, 0xfd5b92e3), + TOBN(0xd7c549f0, 0x26708350), TOBN(0x201b3ad0, 0x34f121bf)}, + {TOBN(0x5fcac2a1, 0x0334fc14), TOBN(0x8a9a9e09, 0x344552f6), + TOBN(0x7dd8a1d3, 0x97653082), TOBN(0x5fc0738f, 0x79d4f289)}}, + {{TOBN(0x787d244d, 0x17d2d8c3), TOBN(0xeffc6345, 0x70830684), + TOBN(0x5ddb96dd, 0xe4f73ae5), TOBN(0x8efb14b1, 0x172549a5)}, + {TOBN(0x6eb73eee, 0x2245ae7a), TOBN(0xbca4061e, 0xea11f13e), + TOBN(0xb577421d, 0x30b01f5d), TOBN(0xaa688b24, 0x782e152c)}}, + {{TOBN(0x67608e71, 0xbd3502ba), TOBN(0x4ef41f24, 0xb4de75a0), + TOBN(0xb08dde5e, 0xfd6125e5), TOBN(0xde484825, 0xa409543f)}, + {TOBN(0x1f198d98, 0x65cc2295), TOBN(0x428a3771, 0x6e0edfa2), + TOBN(0x4f9697a2, 0xadf35fc7), TOBN(0x01a43c79, 0xf7cac3c7)}}, + {{TOBN(0xb05d7059, 0x0fd3659a), TOBN(0x8927f30c, 0xbb7f2d9a), + TOBN(0x4023d1ac, 
0x8cf984d3), TOBN(0x32125ed3, 0x02897a45)}, + {TOBN(0xfb572dad, 0x3d414205), TOBN(0x73000ef2, 0xe3fa82a9), + TOBN(0x4c0868e9, 0xf10a5581), TOBN(0x5b61fc67, 0x6b0b3ca5)}}, + {{TOBN(0xc1258d5b, 0x7cae440c), TOBN(0x21c08b41, 0x402b7531), + TOBN(0xf61a8955, 0xde932321), TOBN(0x3568faf8, 0x2d1408af)}, + {TOBN(0x71b15e99, 0x9ecf965b), TOBN(0xf14ed248, 0xe917276f), + TOBN(0xc6f4caa1, 0x820cf9e2), TOBN(0x681b20b2, 0x18d83c7e)}}, + {{TOBN(0x6cde738d, 0xc6c01120), TOBN(0x71db0813, 0xae70e0db), + TOBN(0x95fc0644, 0x74afe18c), TOBN(0x34619053, 0x129e2be7)}, + {TOBN(0x80615cea, 0xdb2a3b15), TOBN(0x0a49a19e, 0xdb4c7073), + TOBN(0x0e1b84c8, 0x8fd2d367), TOBN(0xd74bf462, 0x033fb8aa)}}, + {{TOBN(0x889f6d65, 0x533ef217), TOBN(0x7158c7e4, 0xc3ca2e87), + TOBN(0xfb670dfb, 0xdc2b4167), TOBN(0x75910a01, 0x844c257f)}, + {TOBN(0xf336bf07, 0xcf88577d), TOBN(0x22245250, 0xe45e2ace), + TOBN(0x2ed92e8d, 0x7ca23d85), TOBN(0x29f8be4c, 0x2b812f58)}}, + {{TOBN(0xdd9ebaa7, 0x076fe12b), TOBN(0x3f2400cb, 0xae1537f9), + TOBN(0x1aa93528, 0x17bdfb46), TOBN(0xc0f98430, 0x67883b41)}, + {TOBN(0x5590ede1, 0x0170911d), TOBN(0x7562f5bb, 0x34d4b17f), + TOBN(0xe1fa1df2, 0x1826b8d2), TOBN(0xb40b796a, 0x6bd80d59)}}, + {{TOBN(0xd65bf197, 0x3467ba92), TOBN(0x8c9b46db, 0xf70954b0), + TOBN(0x97c8a0f3, 0x0e78f15d), TOBN(0xa8f3a69a, 0x85a4c961)}, + {TOBN(0x4242660f, 0x61e4ce9b), TOBN(0xbf06aab3, 0x6ea6790c), + TOBN(0xc6706f8e, 0xec986416), TOBN(0x9e56dec1, 0x9a9fc225)}}, + {{TOBN(0x527c46f4, 0x9a9898d9), TOBN(0xd799e77b, 0x5633cdef), + TOBN(0x24eacc16, 0x7d9e4297), TOBN(0xabb61cea, 0x6b1cb734)}, + {TOBN(0xbee2e8a7, 0xf778443c), TOBN(0x3bb42bf1, 0x29de2fe6), + TOBN(0xcbed86a1, 0x3003bb6f), TOBN(0xd3918e6c, 0xd781cdf6)}}, + {{TOBN(0x4bee3271, 0x9a5103f1), TOBN(0x5243efc6, 0xf50eac06), + TOBN(0xb8e122cb, 0x6adcc119), TOBN(0x1b7faa84, 0xc0b80a08)}, + {TOBN(0x32c3d1bd, 0x6dfcd08c), TOBN(0x129dec4e, 0x0be427de), + TOBN(0x98ab679c, 0x1d263c83), TOBN(0xafc83cb7, 0xcef64eff)}}, + {{TOBN(0x85eb6088, 0x2fa6be76), TOBN(0x892585fb, 0x1328cbfe), + TOBN(0xc154d3ed, 0xcf618dda), TOBN(0xc44f601b, 0x3abaf26e)}, + {TOBN(0x7bf57d0b, 0x2be1fdfd), TOBN(0xa833bd2d, 0x21137fee), + TOBN(0x9353af36, 0x2db591a8), TOBN(0xc76f26dc, 0x5562a056)}}, + {{TOBN(0x1d87e47d, 0x3fdf5a51), TOBN(0x7afb5f93, 0x55c9cab0), + TOBN(0x91bbf58f, 0x89e0586e), TOBN(0x7c72c018, 0x0d843709)}, + {TOBN(0xa9a5aafb, 0x99b5c3dc), TOBN(0xa48a0f1d, 0x3844aeb0), + TOBN(0x7178b7dd, 0xb667e482), TOBN(0x453985e9, 0x6e23a59a)}}, + {{TOBN(0x4a54c860, 0x01b25dd8), TOBN(0x0dd37f48, 0xfb897c8a), + TOBN(0x5f8aa610, 0x0ea90cd9), TOBN(0xc8892c68, 0x16d5830d)}, + {TOBN(0xeb4befc0, 0xef514ca5), TOBN(0x478eb679, 0xe72c9ee6), + TOBN(0x9bca20da, 0xdbc40d5f), TOBN(0xf015de21, 0xdde4f64a)}}, + {{TOBN(0xaa6a4de0, 0xeaf4b8a5), TOBN(0x68cfd9ca, 0x4bc60e32), + TOBN(0x668a4b01, 0x7fd15e70), TOBN(0xd9f0694a, 0xf27dc09d)}, + {TOBN(0xf6c3cad5, 0xba708bcd), TOBN(0x5cd2ba69, 0x5bb95c2a), + TOBN(0xaa28c1d3, 0x33c0a58f), TOBN(0x23e274e3, 0xabc77870)}}, + {{TOBN(0x44c3692d, 0xdfd20a4a), TOBN(0x091c5fd3, 0x81a66653), + TOBN(0x6c0bb691, 0x09a0757d), TOBN(0x9072e8b9, 0x667343ea)}, + {TOBN(0x31d40eb0, 0x80848bec), TOBN(0x95bd480a, 0x79fd36cc), + TOBN(0x01a77c61, 0x65ed43f5), TOBN(0xafccd127, 0x2e0d40bf)}}, + {{TOBN(0xeccfc82d, 0x1cc1884b), TOBN(0xc85ac201, 0x5d4753b4), + TOBN(0xc7a6caac, 0x658e099f), TOBN(0xcf46369e, 0x04b27390)}, + {TOBN(0xe2e7d049, 0x506467ea), TOBN(0x481b63a2, 0x37cdeccc), + TOBN(0x4029abd8, 0xed80143a), TOBN(0x28bfe3c7, 0xbcb00b88)}}, + {{TOBN(0x3bec1009, 0x0643d84a), TOBN(0x885f3668, 0xabd11041), + 
TOBN(0xdb02432c, 0xf83a34d6), TOBN(0x32f7b360, 0x719ceebe)}, + {TOBN(0xf06c7837, 0xdad1fe7a), TOBN(0x60a157a9, 0x5441a0b0), + TOBN(0x704970e9, 0xe2d47550), TOBN(0xcd2bd553, 0x271b9020)}}, + {{TOBN(0xff57f82f, 0x33e24a0b), TOBN(0x9cbee23f, 0xf2565079), + TOBN(0x16353427, 0xeb5f5825), TOBN(0x276feec4, 0xe948d662)}, + {TOBN(0xd1b62bc6, 0xda10032b), TOBN(0x718351dd, 0xf0e72a53), + TOBN(0x93452076, 0x2420e7ba), TOBN(0x96368fff, 0x3a00118d)}}, + {{TOBN(0x00ce2d26, 0x150a49e4), TOBN(0x0c28b636, 0x3f04706b), + TOBN(0xbad65a46, 0x58b196d0), TOBN(0x6c8455fc, 0xec9f8b7c)}, + {TOBN(0xe90c895f, 0x2d71867e), TOBN(0x5c0be31b, 0xedf9f38c), + TOBN(0x2a37a15e, 0xd8f6ec04), TOBN(0x239639e7, 0x8cd85251)}}, + {{TOBN(0xd8975315, 0x9c7c4c6b), TOBN(0x603aa3c0, 0xd7409af7), + TOBN(0xb8d53d0c, 0x007132fb), TOBN(0x68d12af7, 0xa6849238)}, + {TOBN(0xbe0607e7, 0xbf5d9279), TOBN(0x9aa50055, 0xaada74ce), + TOBN(0xe81079cb, 0xba7e8ccb), TOBN(0x610c71d1, 0xa5f4ff5e)}}, + {{TOBN(0x9e2ee1a7, 0x5aa07093), TOBN(0xca84004b, 0xa75da47c), + TOBN(0x074d3951, 0x3de75401), TOBN(0xf938f756, 0xbb311592)}, + {TOBN(0x96197618, 0x00a43421), TOBN(0x39a25362, 0x07bc78c8), + TOBN(0x278f710a, 0x0a171276), TOBN(0xb28446ea, 0x8d1a8f08)}}, + {{TOBN(0x184781bf, 0xe3b6a661), TOBN(0x7751cb1d, 0xe6d279f7), + TOBN(0xf8ff95d6, 0xc59eb662), TOBN(0x186d90b7, 0x58d3dea7)}, + {TOBN(0x0e4bb6c1, 0xdfb4f754), TOBN(0x5c5cf56b, 0x2b2801dc), + TOBN(0xc561e452, 0x1f54564d), TOBN(0xb4fb8c60, 0xf0dd7f13)}}, + {{TOBN(0xf8849630, 0x33ff98c7), TOBN(0x9619fffa, 0xcf17769c), + TOBN(0xf8090bf6, 0x1bfdd80a), TOBN(0x14d9a149, 0x422cfe63)}, + {TOBN(0xb354c360, 0x6f6df9ea), TOBN(0xdbcf770d, 0x218f17ea), + TOBN(0x207db7c8, 0x79eb3480), TOBN(0x213dbda8, 0x559b6a26)}}, + {{TOBN(0xac4c200b, 0x29fc81b3), TOBN(0xebc3e09f, 0x171d87c1), + TOBN(0x91799530, 0x1481aa9e), TOBN(0x051b92e1, 0x92e114fa)}, + {TOBN(0xdf8f92e9, 0xecb5537f), TOBN(0x44b1b2cc, 0x290c7483), + TOBN(0xa711455a, 0x2adeb016), TOBN(0x964b6856, 0x81a10c2c)}}, + {{TOBN(0x4f159d99, 0xcec03623), TOBN(0x05532225, 0xef3271ea), + TOBN(0xb231bea3, 0xc5ee4849), TOBN(0x57a54f50, 0x7094f103)}, + {TOBN(0x3e2d421d, 0x9598b352), TOBN(0xe865a49c, 0x67412ab4), + TOBN(0xd2998a25, 0x1cc3a912), TOBN(0x5d092808, 0x0c74d65d)}}, + {{TOBN(0x73f45908, 0x4088567a), TOBN(0xeb6b280e, 0x1f214a61), + TOBN(0x8c9adc34, 0xcaf0c13d), TOBN(0x39d12938, 0xf561fb80)}, + {TOBN(0xb2dc3a5e, 0xbc6edfb4), TOBN(0x7485b1b1, 0xfe4d210e), + TOBN(0x062e0400, 0xe186ae72), TOBN(0x91e32d5c, 0x6eeb3b88)}}, + {{TOBN(0x6df574d7, 0x4be59224), TOBN(0xebc88ccc, 0x716d55f3), + TOBN(0x26c2e6d0, 0xcad6ed33), TOBN(0xc6e21e7d, 0x0d3e8b10)}, + {TOBN(0x2cc5840e, 0x5bcc36bb), TOBN(0x9292445e, 0x7da74f69), + TOBN(0x8be8d321, 0x4e5193a8), TOBN(0x3ec23629, 0x8df06413)}}, + {{TOBN(0xc7e9ae85, 0xb134defa), TOBN(0x6073b1d0, 0x1bb2d475), + TOBN(0xb9ad615e, 0x2863c00d), TOBN(0x9e29493d, 0x525f4ac4)}, + {TOBN(0xc32b1dea, 0x4e9acf4f), TOBN(0x3e1f01c8, 0xa50db88d), + TOBN(0xb05d70ea, 0x04da916c), TOBN(0x714b0d0a, 0xd865803e)}}, + {{TOBN(0x4bd493fc, 0x9920cb5e), TOBN(0x5b44b1f7, 0x92c7a3ac), + TOBN(0xa2a77293, 0xbcec9235), TOBN(0x5ee06e87, 0xcd378553)}, + {TOBN(0xceff8173, 0xda621607), TOBN(0x2bb03e4c, 0x99f5d290), + TOBN(0x2945106a, 0xa6f734ac), TOBN(0xb5056604, 0xd25c4732)}}, + {{TOBN(0x5945920c, 0xe079afee), TOBN(0x686e17a0, 0x6789831f), + TOBN(0x5966bee8, 0xb74a5ae5), TOBN(0x38a673a2, 0x1e258d46)}, + {TOBN(0xbd1cc1f2, 0x83141c95), TOBN(0x3b2ecf4f, 0x0e96e486), + TOBN(0xcd3aa896, 0x74e5fc78), TOBN(0x415ec10c, 0x2482fa7a)}}, + {{TOBN(0x15234419, 0x80503380), TOBN(0x513d917a, 0xd314b392), 
+ TOBN(0xb0b52f4e, 0x63caecae), TOBN(0x07bf22ad, 0x2dc7780b)}, + {TOBN(0xe761e8a1, 0xe4306839), TOBN(0x1b3be962, 0x5dd7feaa), + TOBN(0x4fe728de, 0x74c778f1), TOBN(0xf1fa0bda, 0x5e0070f6)}}, + {{TOBN(0x85205a31, 0x6ec3f510), TOBN(0x2c7e4a14, 0xd2980475), + TOBN(0xde3c19c0, 0x6f30ebfd), TOBN(0xdb1c1f38, 0xd4b7e644)}, + {TOBN(0xfe291a75, 0x5dce364a), TOBN(0xb7b22a3c, 0x058f5be3), + TOBN(0x2cd2c302, 0x37fea38c), TOBN(0x2930967a, 0x2e17be17)}}, + {{TOBN(0x87f009de, 0x0c061c65), TOBN(0xcb014aac, 0xedc6ed44), + TOBN(0x49bd1cb4, 0x3bafb1eb), TOBN(0x81bd8b5c, 0x282d3688)}, + {TOBN(0x1cdab87e, 0xf01a17af), TOBN(0x21f37ac4, 0xe710063b), + TOBN(0x5a6c5676, 0x42fc8193), TOBN(0xf4753e70, 0x56a6015c)}}, + {{TOBN(0x020f795e, 0xa15b0a44), TOBN(0x8f37c8d7, 0x8958a958), + TOBN(0x63b7e89b, 0xa4b675b5), TOBN(0xb4fb0c0c, 0x0fc31aea)}, + {TOBN(0xed95e639, 0xa7ff1f2e), TOBN(0x9880f5a3, 0x619614fb), + TOBN(0xdeb6ff02, 0x947151ab), TOBN(0x5bc5118c, 0xa868dcdb)}}, + {{TOBN(0xd8da2055, 0x4c20cea5), TOBN(0xcac2776e, 0x14c4d69a), + TOBN(0xcccb22c1, 0x622d599b), TOBN(0xa4ddb653, 0x68a9bb50)}, + {TOBN(0x2c4ff151, 0x1b4941b4), TOBN(0xe1ff19b4, 0x6efba588), + TOBN(0x35034363, 0xc48345e0), TOBN(0x45542e3d, 0x1e29dfc4)}}, + {{TOBN(0xf197cb91, 0x349f7aed), TOBN(0x3b2b5a00, 0x8fca8420), + TOBN(0x7c175ee8, 0x23aaf6d8), TOBN(0x54dcf421, 0x35af32b6)}, + {TOBN(0x0ba14307, 0x27d6561e), TOBN(0x879d5ee4, 0xd175b1e2), + TOBN(0xc7c43673, 0x99807db5), TOBN(0x77a54455, 0x9cd55bcd)}}, + {{TOBN(0xe6c2ff13, 0x0105c072), TOBN(0x18f7a99f, 0x8dda7da4), + TOBN(0x4c301820, 0x0e2d35c1), TOBN(0x06a53ca0, 0xd9cc6c82)}, + {TOBN(0xaa21cc1e, 0xf1aa1d9e), TOBN(0x32414334, 0x4a75b1e8), + TOBN(0x2a6d1328, 0x0ebe9fdc), TOBN(0x16bd173f, 0x98a4755a)}}, + {{TOBN(0xfbb9b245, 0x2133ffd9), TOBN(0x39a8b2f1, 0x830f1a20), + TOBN(0x484bc97d, 0xd5a1f52a), TOBN(0xd6aebf56, 0xa40eddf8)}, + {TOBN(0x32257acb, 0x76ccdac6), TOBN(0xaf4d36ec, 0x1586ff27), + TOBN(0x8eaa8863, 0xf8de7dd1), TOBN(0x0045d5cf, 0x88647c16)}}}, + {{{TOBN(0xa6f3d574, 0xc005979d), TOBN(0xc2072b42, 0x6a40e350), + TOBN(0xfca5c156, 0x8de2ecf9), TOBN(0xa8c8bf5b, 0xa515344e)}, + {TOBN(0x97aee555, 0x114df14a), TOBN(0xd4374a4d, 0xfdc5ec6b), + TOBN(0x754cc28f, 0x2ca85418), TOBN(0x71cb9e27, 0xd3c41f78)}}, + {{TOBN(0x89105079, 0x03605c39), TOBN(0xf0843d9e, 0xa142c96c), + TOBN(0xf3744934, 0x16923684), TOBN(0x732caa2f, 0xfa0a2893)}, + {TOBN(0xb2e8c270, 0x61160170), TOBN(0xc32788cc, 0x437fbaa3), + TOBN(0x39cd818e, 0xa6eda3ac), TOBN(0xe2e94239, 0x9e2b2e07)}}, + {{TOBN(0x6967d39b, 0x0260e52a), TOBN(0xd42585cc, 0x90653325), + TOBN(0x0d9bd605, 0x21ca7954), TOBN(0x4fa20877, 0x81ed57b3)}, + {TOBN(0x60c1eff8, 0xe34a0bbe), TOBN(0x56b0040c, 0x84f6ef64), + TOBN(0x28be2b24, 0xb1af8483), TOBN(0xb2278163, 0xf5531614)}}, + {{TOBN(0x8df27545, 0x5922ac1c), TOBN(0xa7b3ef5c, 0xa52b3f63), + TOBN(0x8e77b214, 0x71de57c4), TOBN(0x31682c10, 0x834c008b)}, + {TOBN(0xc76824f0, 0x4bd55d31), TOBN(0xb6d1c086, 0x17b61c71), + TOBN(0x31db0903, 0xc2a5089d), TOBN(0x9c092172, 0x184e5d3f)}}, + {{TOBN(0xdd7ced5b, 0xc00cc638), TOBN(0x1a2015eb, 0x61278fc2), + TOBN(0x2e8e5288, 0x6a37f8d6), TOBN(0xc457786f, 0xe79933ad)}, + {TOBN(0xb3fe4cce, 0x2c51211a), TOBN(0xad9b10b2, 0x24c20498), + TOBN(0x90d87a4f, 0xd28db5e5), TOBN(0x698cd105, 0x3aca2fc3)}}, + {{TOBN(0x4f112d07, 0xe91b536d), TOBN(0xceb982f2, 0x9eba09d6), + TOBN(0x3c157b2c, 0x197c396f), TOBN(0xe23c2d41, 0x7b66eb24)}, + {TOBN(0x480c57d9, 0x3f330d37), TOBN(0xb3a4c8a1, 0x79108deb), + TOBN(0x702388de, 0xcb199ce5), TOBN(0x0b019211, 0xb944a8d4)}}, + {{TOBN(0x24f2a692, 0x840bb336), TOBN(0x7c353bdc, 
0xa669fa7b), + TOBN(0xda20d6fc, 0xdec9c300), TOBN(0x625fbe2f, 0xa13a4f17)}, + {TOBN(0xa2b1b61a, 0xdbc17328), TOBN(0x008965bf, 0xa9515621), + TOBN(0x49690939, 0xc620ff46), TOBN(0x182dd27d, 0x8717e91c)}}, + {{TOBN(0x5ace5035, 0xea6c3997), TOBN(0x54259aaa, 0xc2610bef), + TOBN(0xef18bb3f, 0x3c80dd39), TOBN(0x6910b95b, 0x5fc3fa39)}, + {TOBN(0xfce2f510, 0x43e09aee), TOBN(0xced56c9f, 0xa7675665), + TOBN(0x10e265ac, 0xd872db61), TOBN(0x6982812e, 0xae9fce69)}}, + {{TOBN(0x29be11c6, 0xce800998), TOBN(0x72bb1752, 0xb90360d9), + TOBN(0x2c193197, 0x5a4ad590), TOBN(0x2ba2f548, 0x9fc1dbc0)}, + {TOBN(0x7fe4eebb, 0xe490ebe0), TOBN(0x12a0a4cd, 0x7fae11c0), + TOBN(0x7197cf81, 0xe903ba37), TOBN(0xcf7d4aa8, 0xde1c6dd8)}}, + {{TOBN(0x92af6bf4, 0x3fd5684c), TOBN(0x2b26eecf, 0x80360aa1), + TOBN(0xbd960f30, 0x00546a82), TOBN(0x407b3c43, 0xf59ad8fe)}, + {TOBN(0x86cae5fe, 0x249c82ba), TOBN(0x9e0faec7, 0x2463744c), + TOBN(0x87f551e8, 0x94916272), TOBN(0x033f9344, 0x6ceb0615)}}, + {{TOBN(0x1e5eb0d1, 0x8be82e84), TOBN(0x89967f0e, 0x7a582fef), + TOBN(0xbcf687d5, 0xa6e921fa), TOBN(0xdfee4cf3, 0xd37a09ba)}, + {TOBN(0x94f06965, 0xb493c465), TOBN(0x638b9a1c, 0x7635c030), + TOBN(0x76667864, 0x66f05e9f), TOBN(0xccaf6808, 0xc04da725)}}, + {{TOBN(0xca2eb690, 0x768fccfc), TOBN(0xf402d37d, 0xb835b362), + TOBN(0x0efac0d0, 0xe2fdfcce), TOBN(0xefc9cdef, 0xb638d990)}, + {TOBN(0x2af12b72, 0xd1669a8b), TOBN(0x33c536bc, 0x5774ccbd), + TOBN(0x30b21909, 0xfb34870e), TOBN(0xc38fa2f7, 0x7df25aca)}}, + {{TOBN(0x74c5f02b, 0xbf81f3f5), TOBN(0x0525a5ae, 0xaf7e4581), + TOBN(0x88d2aaba, 0x433c54ae), TOBN(0xed9775db, 0x806a56c5)}, + {TOBN(0xd320738a, 0xc0edb37d), TOBN(0x25fdb6ee, 0x66cc1f51), + TOBN(0xac661d17, 0x10600d76), TOBN(0x931ec1f3, 0xbdd1ed76)}}, + {{TOBN(0x65c11d62, 0x19ee43f1), TOBN(0x5cd57c3e, 0x60829d97), + TOBN(0xd26c91a3, 0x984be6e8), TOBN(0xf08d9309, 0x8b0c53bd)}, + {TOBN(0x94bc9e5b, 0xc016e4ea), TOBN(0xd3916839, 0x11d43d2b), + TOBN(0x886c5ad7, 0x73701155), TOBN(0xe0377626, 0x20b00715)}}, + {{TOBN(0x7f01c9ec, 0xaa80ba59), TOBN(0x3083411a, 0x68538e51), + TOBN(0x970370f1, 0xe88128af), TOBN(0x625cc3db, 0x91dec14b)}, + {TOBN(0xfef9666c, 0x01ac3107), TOBN(0xb2a8d577, 0xd5057ac3), + TOBN(0xb0f26299, 0x92be5df7), TOBN(0xf579c8e5, 0x00353924)}}, + {{TOBN(0xb8fa3d93, 0x1341ed7a), TOBN(0x4223272c, 0xa7b59d49), + TOBN(0x3dcb1947, 0x83b8c4a4), TOBN(0x4e413c01, 0xed1302e4)}, + {TOBN(0x6d999127, 0xe17e44ce), TOBN(0xee86bf75, 0x33b3adfb), + TOBN(0xf6902fe6, 0x25aa96ca), TOBN(0xb73540e4, 0xe5aae47d)}}, + {{TOBN(0x32801d7b, 0x1b4a158c), TOBN(0xe571c99e, 0x27e2a369), + TOBN(0x40cb76c0, 0x10d9f197), TOBN(0xc308c289, 0x3167c0ae)}, + {TOBN(0xa6ef9dd3, 0xeb7958f2), TOBN(0xa7226dfc, 0x300879b1), + TOBN(0x6cd0b362, 0x7edf0636), TOBN(0x4efbce6c, 0x7bc37eed)}}, + {{TOBN(0x75f92a05, 0x8d699021), TOBN(0x586d4c79, 0x772566e3), + TOBN(0x378ca5f1, 0x761ad23a), TOBN(0x650d86fc, 0x1465a8ac)}, + {TOBN(0x7a4ed457, 0x842ba251), TOBN(0x6b65e3e6, 0x42234933), + TOBN(0xaf1543b7, 0x31aad657), TOBN(0xa4cefe98, 0xcbfec369)}}, + {{TOBN(0xb587da90, 0x9f47befb), TOBN(0x6562e9fb, 0x41312d13), + TOBN(0xa691ea59, 0xeff1cefe), TOBN(0xcc30477a, 0x05fc4cf6)}, + {TOBN(0xa1632461, 0x0b0ffd3d), TOBN(0xa1f16f3b, 0x5b355956), + TOBN(0x5b148d53, 0x4224ec24), TOBN(0xdc834e7b, 0xf977012a)}}, + {{TOBN(0x7bfc5e75, 0xb2c69dbc), TOBN(0x3aa77a29, 0x03c3da6c), + TOBN(0xde0df03c, 0xca910271), TOBN(0xcbd5ca4a, 0x7806dc55)}, + {TOBN(0xe1ca5807, 0x6db476cb), TOBN(0xfde15d62, 0x5f37a31e), + TOBN(0xf49af520, 0xf41af416), TOBN(0x96c5c5b1, 0x7d342db5)}}, + {{TOBN(0x155c43b7, 0xeb4ceb9b), 
TOBN(0x2e993010, 0x4e77371a), + TOBN(0x1d2987da, 0x675d43af), TOBN(0xef2bc1c0, 0x8599fd72)}, + {TOBN(0x96894b7b, 0x9342f6b2), TOBN(0x201eadf2, 0x7c8e71f0), + TOBN(0xf3479d9f, 0x4a1f3efc), TOBN(0xe0f8a742, 0x702a9704)}}, + {{TOBN(0xeafd44b6, 0xb3eba40c), TOBN(0xf9739f29, 0xc1c1e0d0), + TOBN(0x0091471a, 0x619d505e), TOBN(0xc15f9c96, 0x9d7c263e)}, + {TOBN(0x5be47285, 0x83afbe33), TOBN(0xa3b6d6af, 0x04f1e092), + TOBN(0xe76526b9, 0x751a9d11), TOBN(0x2ec5b26d, 0x9a4ae4d2)}}, + {{TOBN(0xeb66f4d9, 0x02f6fb8d), TOBN(0x4063c561, 0x96912164), + TOBN(0xeb7050c1, 0x80ef3000), TOBN(0x288d1c33, 0xeaa5b3f0)}, + {TOBN(0xe87c68d6, 0x07806fd8), TOBN(0xb2f7f9d5, 0x4bbbf50f), + TOBN(0x25972f3a, 0xac8d6627), TOBN(0xf8547774, 0x10e8c13b)}}, + {{TOBN(0xcc50ef6c, 0x872b4a60), TOBN(0xab2a34a4, 0x4613521b), + TOBN(0x39c5c190, 0x983e15d1), TOBN(0x61dde5df, 0x59905512)}, + {TOBN(0xe417f621, 0x9f2275f3), TOBN(0x0750c8b6, 0x451d894b), + TOBN(0x75b04ab9, 0x78b0bdaa), TOBN(0x3bfd9fd4, 0x458589bd)}}, + {{TOBN(0xf1013e30, 0xee9120b6), TOBN(0x2b51af93, 0x23a4743e), + TOBN(0xea96ffae, 0x48d14d9e), TOBN(0x71dc0dbe, 0x698a1d32)}, + {TOBN(0x914962d2, 0x0180cca4), TOBN(0x1ae60677, 0xc3568963), + TOBN(0x8cf227b1, 0x437bc444), TOBN(0xc650c83b, 0xc9962c7a)}}, + {{TOBN(0x23c2c7dd, 0xfe7ccfc4), TOBN(0xf925c89d, 0x1b929d48), + TOBN(0x4460f74b, 0x06783c33), TOBN(0xac2c8d49, 0xa590475a)}, + {TOBN(0xfb40b407, 0xb807bba0), TOBN(0x9d1e362d, 0x69ff8f3a), + TOBN(0xa33e9681, 0xcbef64a4), TOBN(0x67ece5fa, 0x332fb4b2)}}, + {{TOBN(0x6900a99b, 0x739f10e3), TOBN(0xc3341ca9, 0xff525925), + TOBN(0xee18a626, 0xa9e2d041), TOBN(0xa5a83685, 0x29580ddd)}, + {TOBN(0xf3470c81, 0x9d7de3cd), TOBN(0xedf02586, 0x2062cf9c), + TOBN(0xf43522fa, 0xc010edb0), TOBN(0x30314135, 0x13a4b1ae)}}, + {{TOBN(0xc792e02a, 0xdb22b94b), TOBN(0x993d8ae9, 0xa1eaa45b), + TOBN(0x8aad6cd3, 0xcd1e1c63), TOBN(0x89529ca7, 0xc5ce688a)}, + {TOBN(0x2ccee3aa, 0xe572a253), TOBN(0xe02b6438, 0x02a21efb), + TOBN(0xa7091b6e, 0xc9430358), TOBN(0x06d1b1fa, 0x9d7db504)}}, + {{TOBN(0x58846d32, 0xc4744733), TOBN(0x40517c71, 0x379f9e34), + TOBN(0x2f65655f, 0x130ef6ca), TOBN(0x526e4488, 0xf1f3503f)}, + {TOBN(0x8467bd17, 0x7ee4a976), TOBN(0x1d9dc913, 0x921363d1), + TOBN(0xd8d24c33, 0xb069e041), TOBN(0x5eb5da0a, 0x2cdf7f51)}}, + {{TOBN(0x1c0f3cb1, 0x197b994f), TOBN(0x3c95a6c5, 0x2843eae9), + TOBN(0x7766ffc9, 0xa6097ea5), TOBN(0x7bea4093, 0xd723b867)}, + {TOBN(0xb48e1f73, 0x4db378f9), TOBN(0x70025b00, 0xe37b77ac), + TOBN(0x943dc8e7, 0xaf24ad46), TOBN(0xb98a15ac, 0x16d00a85)}}, + {{TOBN(0x3adc38ba, 0x2743b004), TOBN(0xb1c7f4f7, 0x334415ee), + TOBN(0xea43df8f, 0x1e62d05a), TOBN(0x32618905, 0x9d76a3b6)}, + {TOBN(0x2fbd0bb5, 0xa23a0f46), TOBN(0x5bc971db, 0x6a01918c), + TOBN(0x7801d94a, 0xb4743f94), TOBN(0xb94df65e, 0x676ae22b)}}, + {{TOBN(0xaafcbfab, 0xaf95894c), TOBN(0x7b9bdc07, 0x276b2241), + TOBN(0xeaf98362, 0x5bdda48b), TOBN(0x5977faf2, 0xa3fcb4df)}, + {TOBN(0xbed042ef, 0x052c4b5b), TOBN(0x9fe87f71, 0x067591f0), + TOBN(0xc89c73ca, 0x22f24ec7), TOBN(0x7d37fa9e, 0xe64a9f1b)}}, + {{TOBN(0x2710841a, 0x15562627), TOBN(0x2c01a613, 0xc243b034), + TOBN(0x1d135c56, 0x2bc68609), TOBN(0xc2ca1715, 0x8b03f1f6)}, + {TOBN(0xc9966c2d, 0x3eb81d82), TOBN(0xc02abf4a, 0x8f6df13e), + TOBN(0x77b34bd7, 0x8f72b43b), TOBN(0xaff6218f, 0x360c82b0)}}, + {{TOBN(0x0aa5726c, 0x8d55b9d2), TOBN(0xdc0adbe9, 0x99e9bffb), + TOBN(0x9097549c, 0xefb9e72a), TOBN(0x16755712, 0x9dfb3111)}, + {TOBN(0xdd8bf984, 0xf26847f9), TOBN(0xbcb8e387, 0xdfb30cb7), + TOBN(0xc1fd32a7, 0x5171ef9c), TOBN(0x977f3fc7, 0x389b363f)}}, + {{TOBN(0x116eaf2b, 
0xf4babda0), TOBN(0xfeab68bd, 0xf7113c8e), + TOBN(0xd1e3f064, 0xb7def526), TOBN(0x1ac30885, 0xe0b3fa02)}, + {TOBN(0x1c5a6e7b, 0x40142d9d), TOBN(0x839b5603, 0x30921c0b), + TOBN(0x48f301fa, 0x36a116a3), TOBN(0x380e1107, 0xcfd9ee6d)}}, + {{TOBN(0x7945ead8, 0x58854be1), TOBN(0x4111c12e, 0xcbd4d49d), + TOBN(0xece3b1ec, 0x3a29c2ef), TOBN(0x6356d404, 0x8d3616f5)}, + {TOBN(0x9f0d6a8f, 0x594d320e), TOBN(0x0989316d, 0xf651ccd2), + TOBN(0x6c32117a, 0x0f8fdde4), TOBN(0x9abe5cc5, 0xa26a9bbc)}}, + {{TOBN(0xcff560fb, 0x9723f671), TOBN(0x21b2a12d, 0x7f3d593c), + TOBN(0xe4cb18da, 0x24ba0696), TOBN(0x186e2220, 0xc3543384)}, + {TOBN(0x722f64e0, 0x88312c29), TOBN(0x94282a99, 0x17dc7752), + TOBN(0x62467bbf, 0x5a85ee89), TOBN(0xf435c650, 0xf10076a0)}}, + {{TOBN(0xc9ff1539, 0x43b3a50b), TOBN(0x7132130c, 0x1a53efbc), + TOBN(0x31bfe063, 0xf7b0c5b7), TOBN(0xb0179a7d, 0x4ea994cc)}, + {TOBN(0x12d064b3, 0xc85f455b), TOBN(0x47259328, 0x8f6e0062), + TOBN(0xf64e590b, 0xb875d6d9), TOBN(0x22dd6225, 0xad92bcc7)}}, + {{TOBN(0xb658038e, 0xb9c3bd6d), TOBN(0x00cdb0d6, 0xfbba27c8), + TOBN(0x0c681337, 0x1062c45d), TOBN(0xd8515b8c, 0x2d33407d)}, + {TOBN(0xcb8f699e, 0x8cbb5ecf), TOBN(0x8c4347f8, 0xc608d7d8), + TOBN(0x2c11850a, 0xbb3e00db), TOBN(0x20a8dafd, 0xecb49d19)}}, + {{TOBN(0xbd781480, 0x45ee2f40), TOBN(0x75e354af, 0x416b60cf), + TOBN(0xde0b58a1, 0x8d49a8c4), TOBN(0xe40e94e2, 0xfa359536)}, + {TOBN(0xbd4fa59f, 0x62accd76), TOBN(0x05cf466a, 0x8c762837), + TOBN(0xb5abda99, 0x448c277b), TOBN(0x5a9e01bf, 0x48b13740)}}, + {{TOBN(0x9d457798, 0x326aad8d), TOBN(0xbdef4954, 0xc396f7e7), + TOBN(0x6fb274a2, 0xc253e292), TOBN(0x2800bf0a, 0x1cfe53e7)}, + {TOBN(0x22426d31, 0x44438fd4), TOBN(0xef233923, 0x5e259f9a), + TOBN(0x4188503c, 0x03f66264), TOBN(0x9e5e7f13, 0x7f9fdfab)}}, + {{TOBN(0x565eb76c, 0x5fcc1aba), TOBN(0xea632548, 0x59b5bff8), + TOBN(0x5587c087, 0xaab6d3fa), TOBN(0x92b639ea, 0x6ce39c1b)}, + {TOBN(0x0706e782, 0x953b135c), TOBN(0x7308912e, 0x425268ef), + TOBN(0x599e92c7, 0x090e7469), TOBN(0x83b90f52, 0x9bc35e75)}}, + {{TOBN(0x4750b3d0, 0x244975b3), TOBN(0xf3a44358, 0x11965d72), + TOBN(0x179c6774, 0x9c8dc751), TOBN(0xff18cdfe, 0xd23d9ff0)}, + {TOBN(0xc4013833, 0x2028e247), TOBN(0x96e280e2, 0xf3bfbc79), + TOBN(0xf60417bd, 0xd0880a84), TOBN(0x263c9f3d, 0x2a568151)}}, + {{TOBN(0x36be15b3, 0x2d2ce811), TOBN(0x846dc0c2, 0xf8291d21), + TOBN(0x5cfa0ecb, 0x789fcfdb), TOBN(0x45a0beed, 0xd7535b9a)}, + {TOBN(0xec8e9f07, 0x96d69af1), TOBN(0x31a7c5b8, 0x599ab6dc), + TOBN(0xd36d45ef, 0xf9e2e09f), TOBN(0x3cf49ef1, 0xdcee954b)}}, + {{TOBN(0x6be34cf3, 0x086cff9b), TOBN(0x88dbd491, 0x39a3360f), + TOBN(0x1e96b8cc, 0x0dbfbd1d), TOBN(0xc1e5f7bf, 0xcb7e2552)}, + {TOBN(0x0547b214, 0x28819d98), TOBN(0xc770dd9c, 0x7aea9dcb), + TOBN(0xaef0d4c7, 0x041d68c8), TOBN(0xcc2b9818, 0x13cb9ba8)}}, + {{TOBN(0x7fc7bc76, 0xfe86c607), TOBN(0x6b7b9337, 0x502a9a95), + TOBN(0x1948dc27, 0xd14dab63), TOBN(0x249dd198, 0xdae047be)}, + {TOBN(0xe8356584, 0xa981a202), TOBN(0x3531dd18, 0x3a893387), + TOBN(0x1be11f90, 0xc85c7209), TOBN(0x93d2fe1e, 0xe2a52b5a)}}, + {{TOBN(0x8225bfe2, 0xec6d6b97), TOBN(0x9cf6d6f4, 0xbd0aa5de), + TOBN(0x911459cb, 0x54779f5f), TOBN(0x5649cddb, 0x86aeb1f3)}, + {TOBN(0x32133579, 0x3f26ce5a), TOBN(0xc289a102, 0x550f431e), + TOBN(0x559dcfda, 0x73b84c6f), TOBN(0x84973819, 0xee3ac4d7)}}, + {{TOBN(0xb51e55e6, 0xf2606a82), TOBN(0xe25f7061, 0x90f2fb57), + TOBN(0xacef6c2a, 0xb1a4e37c), TOBN(0x864e359d, 0x5dcf2706)}, + {TOBN(0x479e6b18, 0x7ce57316), TOBN(0x2cab2500, 0x3a96b23d), + TOBN(0xed489862, 0x8ef16df7), TOBN(0x2056538c, 0xef3758b5)}}, + 
{{TOBN(0xa7df865e, 0xf15d3101), TOBN(0x80c5533a, 0x61b553d7), + TOBN(0x366e1997, 0x4ed14294), TOBN(0x6620741f, 0xb3c0bcd6)}, + {TOBN(0x21d1d9c4, 0xedc45418), TOBN(0x005b859e, 0xc1cc4a9d), + TOBN(0xdf01f630, 0xa1c462f0), TOBN(0x15d06cf3, 0xf26820c7)}}, + {{TOBN(0x9f7f24ee, 0x3484be47), TOBN(0x2ff33e96, 0x4a0c902f), + TOBN(0x00bdf457, 0x5a0bc453), TOBN(0x2378dfaf, 0x1aa238db)}, + {TOBN(0x272420ec, 0x856720f2), TOBN(0x2ad9d95b, 0x96797291), + TOBN(0xd1242cc6, 0x768a1558), TOBN(0x2e287f8b, 0x5cc86aa8)}}, + {{TOBN(0x796873d0, 0x990cecaa), TOBN(0xade55f81, 0x675d4080), + TOBN(0x2645eea3, 0x21f0cd84), TOBN(0x7a1efa0f, 0xb4e17d02)}, + {TOBN(0xf6858420, 0x037cc061), TOBN(0x682e05f0, 0xd5d43e12), + TOBN(0x59c36994, 0x27218710), TOBN(0x85cbba4d, 0x3f7cd2fc)}}, + {{TOBN(0x726f9729, 0x7a3cd22a), TOBN(0x9f8cd5dc, 0x4a628397), + TOBN(0x17b93ab9, 0xc23165ed), TOBN(0xff5f5dbf, 0x122823d4)}, + {TOBN(0xc1e4e4b5, 0x654a446d), TOBN(0xd1a9496f, 0x677257ba), + TOBN(0x6387ba94, 0xde766a56), TOBN(0x23608bc8, 0x521ec74a)}}, + {{TOBN(0x16a522d7, 0x6688c4d4), TOBN(0x9d6b4282, 0x07373abd), + TOBN(0xa62f07ac, 0xb42efaa3), TOBN(0xf73e00f7, 0xe3b90180)}, + {TOBN(0x36175fec, 0x49421c3e), TOBN(0xc4e44f9b, 0x3dcf2678), + TOBN(0x76df436b, 0x7220f09f), TOBN(0x172755fb, 0x3aa8b6cf)}}, + {{TOBN(0xbab89d57, 0x446139cc), TOBN(0x0a0a6e02, 0x5fe0208f), + TOBN(0xcdbb63e2, 0x11e5d399), TOBN(0x33ecaa12, 0xa8977f0b)}, + {TOBN(0x59598b21, 0xf7c42664), TOBN(0xb3e91b32, 0xab65d08a), + TOBN(0x035822ee, 0xf4502526), TOBN(0x1dcf0176, 0x720a82a9)}}, + {{TOBN(0x50f8598f, 0x3d589e02), TOBN(0xdf0478ff, 0xb1d63d2c), + TOBN(0x8b8068bd, 0x1571cd07), TOBN(0x30c3aa4f, 0xd79670cd)}, + {TOBN(0x25e8fd4b, 0x941ade7f), TOBN(0x3d1debdc, 0x32790011), + TOBN(0x65b6dcbd, 0x3a3f9ff0), TOBN(0x282736a4, 0x793de69c)}}, + {{TOBN(0xef69a0c3, 0xd41d3bd3), TOBN(0xb533b8c9, 0x07a26bde), + TOBN(0xe2801d97, 0xdb2edf9f), TOBN(0xdc4a8269, 0xe1877af0)}, + {TOBN(0x6c1c5851, 0x3d590dbe), TOBN(0x84632f6b, 0xee4e9357), + TOBN(0xd36d36b7, 0x79b33374), TOBN(0xb46833e3, 0x9bbca2e6)}}, + {{TOBN(0x37893913, 0xf7fc0586), TOBN(0x385315f7, 0x66bf4719), + TOBN(0x72c56293, 0xb31855dc), TOBN(0xd1416d4e, 0x849061fe)}, + {TOBN(0xbeb3ab78, 0x51047213), TOBN(0x447f6e61, 0xf040c996), + TOBN(0xd06d310d, 0x638b1d0c), TOBN(0xe28a413f, 0xbad1522e)}}, + {{TOBN(0x685a76cb, 0x82003f86), TOBN(0x610d07f7, 0x0bcdbca3), + TOBN(0x6ff66021, 0x9ca4c455), TOBN(0x7df39b87, 0xcea10eec)}, + {TOBN(0xb9255f96, 0xe22db218), TOBN(0x8cc6d9eb, 0x08a34c44), + TOBN(0xcd4ffb86, 0x859f9276), TOBN(0x8fa15eb2, 0x50d07335)}}, + {{TOBN(0xdf553845, 0xcf2c24b5), TOBN(0x89f66a9f, 0x52f9c3ba), + TOBN(0x8f22b5b9, 0xe4a7ceb3), TOBN(0xaffef809, 0x0e134686)}, + {TOBN(0x3e53e1c6, 0x8eb8fac2), TOBN(0x93c1e4eb, 0x28aec98e), + TOBN(0xb6b91ec5, 0x32a43bcb), TOBN(0x2dbfa947, 0xb2d74a51)}}, + {{TOBN(0xe065d190, 0xca84bad7), TOBN(0xfb13919f, 0xad58e65c), + TOBN(0x3c41718b, 0xf1cb6e31), TOBN(0x688969f0, 0x06d05c3f)}, + {TOBN(0xd4f94ce7, 0x21264d45), TOBN(0xfdfb65e9, 0x7367532b), + TOBN(0x5b1be8b1, 0x0945a39d), TOBN(0x229f789c, 0x2b8baf3b)}}, + {{TOBN(0xd8f41f3e, 0x6f49f15d), TOBN(0x678ce828, 0x907f0792), + TOBN(0xc69ace82, 0xfca6e867), TOBN(0x106451ae, 0xd01dcc89)}, + {TOBN(0x1bb4f7f0, 0x19fc32d2), TOBN(0x64633dfc, 0xb00c52d2), + TOBN(0x8f13549a, 0xad9ea445), TOBN(0x99a3bf50, 0xfb323705)}}, + {{TOBN(0x0c9625a2, 0x534d4dbc), TOBN(0x45b8f1d1, 0xc2a2fea3), + TOBN(0x76ec21a1, 0xa530fc1a), TOBN(0x4bac9c2a, 0x9e5bd734)}, + {TOBN(0x5996d76a, 0x7b4e3587), TOBN(0x0045cdee, 0x1182d9e3), + TOBN(0x1aee24b9, 0x1207f13d), TOBN(0x66452e97, 0x97345a41)}}, 
+ {{TOBN(0x16e5b054, 0x9f950cd0), TOBN(0x9cc72fb1, 0xd7fdd075), + TOBN(0x6edd61e7, 0x66249663), TOBN(0xde4caa4d, 0xf043cccb)}, + {TOBN(0x11b1f57a, 0x55c7ac17), TOBN(0x779cbd44, 0x1a85e24d), + TOBN(0x78030f86, 0xe46081e7), TOBN(0xfd4a6032, 0x8e20f643)}}, + {{TOBN(0xcc7a6488, 0x0a750c0f), TOBN(0x39bacfe3, 0x4e548e83), + TOBN(0x3d418c76, 0x0c110f05), TOBN(0x3e4daa4c, 0xb1f11588)}, + {TOBN(0x2733e7b5, 0x5ffc69ff), TOBN(0x46f147bc, 0x92053127), + TOBN(0x885b2434, 0xd722df94), TOBN(0x6a444f65, 0xe6fc6b7c)}}}, + {{{TOBN(0x7a1a465a, 0xc3f16ea8), TOBN(0x115a461d, 0xb2f1d11c), + TOBN(0x4767dd95, 0x6c68a172), TOBN(0x3392f2eb, 0xd13a4698)}, + {TOBN(0xc7a99ccd, 0xe526cdc7), TOBN(0x8e537fdc, 0x22292b81), + TOBN(0x76d8cf69, 0xa6d39198), TOBN(0xffc5ff43, 0x2446852d)}}, + {{TOBN(0x97b14f7e, 0xa90567e6), TOBN(0x513257b7, 0xb6ae5cb7), + TOBN(0x85454a3c, 0x9f10903d), TOBN(0xd8d2c9ad, 0x69bc3724)}, + {TOBN(0x38da9324, 0x6b29cb44), TOBN(0xb540a21d, 0x77c8cbac), + TOBN(0x9bbfe435, 0x01918e42), TOBN(0xfffa707a, 0x56c3614e)}}, + {{TOBN(0x0ce4e3f1, 0xd4e353b7), TOBN(0x062d8a14, 0xef46b0a0), + TOBN(0x6408d5ab, 0x574b73fd), TOBN(0xbc41d1c9, 0xd3273ffd)}, + {TOBN(0x3538e1e7, 0x6be77800), TOBN(0x71fe8b37, 0xc5655031), + TOBN(0x1cd91621, 0x6b9b331a), TOBN(0xad825d0b, 0xbb388f73)}}, + {{TOBN(0x56c2e05b, 0x1cb76219), TOBN(0x0ec0bf91, 0x71567e7e), + TOBN(0xe7076f86, 0x61c4c910), TOBN(0xd67b085b, 0xbabc04d9)}, + {TOBN(0x9fb90459, 0x5e93a96a), TOBN(0x7526c1ea, 0xfbdc249a), + TOBN(0x0d44d367, 0xecdd0bb7), TOBN(0x95399917, 0x9dc0d695)}}, + {{TOBN(0x61360ee9, 0x9e240d18), TOBN(0x057cdcac, 0xb4b94466), + TOBN(0xe7667cd1, 0x2fe5325c), TOBN(0x1fa297b5, 0x21974e3b)}, + {TOBN(0xfa4081e7, 0xdb083d76), TOBN(0x31993be6, 0xf206bd15), + TOBN(0x8949269b, 0x14c19f8c), TOBN(0x21468d72, 0xa9d92357)}}, + {{TOBN(0x2ccbc583, 0xa4c506ec), TOBN(0x957ed188, 0xd1acfe97), + TOBN(0x8baed833, 0x12f1aea2), TOBN(0xef2a6cb4, 0x8325362d)}, + {TOBN(0x130dde42, 0x8e195c43), TOBN(0xc842025a, 0x0e6050c6), + TOBN(0x2da972a7, 0x08686a5d), TOBN(0xb52999a1, 0xe508b4a8)}}, + {{TOBN(0xd9f090b9, 0x10a5a8bd), TOBN(0xca91d249, 0x096864da), + TOBN(0x8e6a93be, 0x3f67dbc1), TOBN(0xacae6fba, 0xf5f4764c)}, + {TOBN(0x1563c6e0, 0xd21411a0), TOBN(0x28fa787f, 0xda0a4ad8), + TOBN(0xd524491c, 0x908c8030), TOBN(0x1257ba0e, 0x4c795f07)}}, + {{TOBN(0x83f49167, 0xceca9754), TOBN(0x426d2cf6, 0x4b7939a0), + TOBN(0x2555e355, 0x723fd0bf), TOBN(0xa96e6d06, 0xc4f144e2)}, + {TOBN(0x4768a8dd, 0x87880e61), TOBN(0x15543815, 0xe508e4d5), + TOBN(0x09d7e772, 0xb1b65e15), TOBN(0x63439dd6, 0xac302fa0)}}, + {{TOBN(0xb93f802f, 0xc14e35c2), TOBN(0x71735b7c, 0x4341333c), + TOBN(0x03a25104, 0x16d4f362), TOBN(0x3f4d069b, 0xbf433c8e)}, + {TOBN(0x0d83ae01, 0xf78f5a7c), TOBN(0x50a8ffbe, 0x7c4eed07), + TOBN(0xc74f8906, 0x76e10f83), TOBN(0x7d080966, 0x9ddaf8e1)}}, + {{TOBN(0xb11df8e1, 0x698e04cc), TOBN(0x877be203, 0x169005c8), + TOBN(0x32749e8c, 0x4f3c6179), TOBN(0x2dbc9d0a, 0x7853fc05)}, + {TOBN(0x187d4f93, 0x9454d937), TOBN(0xe682ce9d, 0xb4800e1b), + TOBN(0xa9129ad8, 0x165e68e8), TOBN(0x0fe29735, 0xbe7f785b)}}, + {{TOBN(0x5303f40c, 0x5b9e02b7), TOBN(0xa37c9692, 0x35ee04e8), + TOBN(0x5f46cc20, 0x34d6632b), TOBN(0x55ef72b2, 0x96ac545b)}, + {TOBN(0xabec5c1f, 0x7b91b062), TOBN(0x0a79e1c7, 0xbb33e821), + TOBN(0xbb04b428, 0x3a9f4117), TOBN(0x0de1f28f, 0xfd2a475a)}}, + {{TOBN(0x31019ccf, 0x3a4434b4), TOBN(0xa3458111, 0x1a7954dc), + TOBN(0xa9dac80d, 0xe34972a7), TOBN(0xb043d054, 0x74f6b8dd)}, + {TOBN(0x021c319e, 0x11137b1a), TOBN(0x00a754ce, 0xed5cc03f), + TOBN(0x0aa2c794, 0xcbea5ad4), TOBN(0x093e67f4, 
0x70c015b6)}}, + {{TOBN(0x72cdfee9, 0xc97e3f6b), TOBN(0xc10bcab4, 0xb6da7461), + TOBN(0x3b02d2fc, 0xb59806b9), TOBN(0x85185e89, 0xa1de6f47)}, + {TOBN(0x39e6931f, 0x0eb6c4d4), TOBN(0x4d4440bd, 0xd4fa5b04), + TOBN(0x5418786e, 0x34be7eb8), TOBN(0x6380e521, 0x9d7259bc)}}, + {{TOBN(0x20ac0351, 0xd598d710), TOBN(0x272c4166, 0xcb3a4da4), + TOBN(0xdb82fe1a, 0xca71de1f), TOBN(0x746e79f2, 0xd8f54b0f)}, + {TOBN(0x6e7fc736, 0x4b573e9b), TOBN(0x75d03f46, 0xfd4b5040), + TOBN(0x5c1cc36d, 0x0b98d87b), TOBN(0x513ba3f1, 0x1f472da1)}}, + {{TOBN(0x79d0af26, 0xabb177dd), TOBN(0xf82ab568, 0x7891d564), + TOBN(0x2b6768a9, 0x72232173), TOBN(0xefbb3bb0, 0x8c1f6619)}, + {TOBN(0xb29c11db, 0xa6d18358), TOBN(0x519e2797, 0xb0916d3a), + TOBN(0xd4dc18f0, 0x9188e290), TOBN(0x648e86e3, 0x98b0ca7f)}}, + {{TOBN(0x859d3145, 0x983c38b5), TOBN(0xb14f176c, 0x637abc8b), + TOBN(0x2793fb9d, 0xcaff7be6), TOBN(0xebe5a55f, 0x35a66a5a)}, + {TOBN(0x7cec1dcd, 0x9f87dc59), TOBN(0x7c595cd3, 0xfbdbf560), + TOBN(0x5b543b22, 0x26eb3257), TOBN(0x69080646, 0xc4c935fd)}}, + {{TOBN(0x7f2e4403, 0x81e9ede3), TOBN(0x243c3894, 0xcaf6df0a), + TOBN(0x7c605bb1, 0x1c073b11), TOBN(0xcd06a541, 0xba6a4a62)}, + {TOBN(0x29168949, 0x49d4e2e5), TOBN(0x33649d07, 0x4af66880), + TOBN(0xbfc0c885, 0xe9a85035), TOBN(0xb4e52113, 0xfc410f4b)}}, + {{TOBN(0xdca3b706, 0x78a6513b), TOBN(0x92ea4a2a, 0x9edb1943), + TOBN(0x02642216, 0xdb6e2dd8), TOBN(0x9b45d0b4, 0x9fd57894)}, + {TOBN(0x114e70db, 0xc69d11ae), TOBN(0x1477dd19, 0x4c57595f), + TOBN(0xbc2208b4, 0xec77c272), TOBN(0x95c5b4d7, 0xdb68f59c)}}, + {{TOBN(0xb8c4fc63, 0x42e532b7), TOBN(0x386ba422, 0x9ae35290), + TOBN(0xfb5dda42, 0xd201ecbc), TOBN(0x2353dc8b, 0xa0e38fd6)}, + {TOBN(0x9a0b85ea, 0x68f7e978), TOBN(0x96ec5682, 0x2ad6d11f), + TOBN(0x5e279d6c, 0xe5f6886d), TOBN(0xd3fe03cd, 0x3cb1914d)}}, + {{TOBN(0xfe541fa4, 0x7ea67c77), TOBN(0x952bd2af, 0xe3ea810c), + TOBN(0x791fef56, 0x8d01d374), TOBN(0xa3a1c621, 0x0f11336e)}, + {TOBN(0x5ad0d5a9, 0xc7ec6d79), TOBN(0xff7038af, 0x3225c342), + TOBN(0x003c6689, 0xbc69601b), TOBN(0x25059bc7, 0x45e8747d)}}, + {{TOBN(0xfa4965b2, 0xf2086fbf), TOBN(0xf6840ea6, 0x86916078), + TOBN(0xd7ac7620, 0x70081d6c), TOBN(0xe600da31, 0xb5328645)}, + {TOBN(0x01916f63, 0x529b8a80), TOBN(0xe80e4858, 0x2d7d6f3e), + TOBN(0x29eb0fe8, 0xd664ca7c), TOBN(0xf017637b, 0xe7b43b0c)}}, + {{TOBN(0x9a75c806, 0x76cb2566), TOBN(0x8f76acb1, 0xb24892d9), + TOBN(0x7ae7b9cc, 0x1f08fe45), TOBN(0x19ef7329, 0x6a4907d8)}, + {TOBN(0x2db4ab71, 0x5f228bf0), TOBN(0xf3cdea39, 0x817032d7), + TOBN(0x0b1f482e, 0xdcabe3c0), TOBN(0x3baf76b4, 0xbb86325c)}}, + {{TOBN(0xd49065e0, 0x10089465), TOBN(0x3bab5d29, 0x8e77c596), + TOBN(0x7636c3a6, 0x193dbd95), TOBN(0xdef5d294, 0xb246e499)}, + {TOBN(0xb22c58b9, 0x286b2475), TOBN(0xa0b93939, 0xcd80862b), + TOBN(0x3002c83a, 0xf0992388), TOBN(0x6de01f9b, 0xeacbe14c)}}, + {{TOBN(0x6aac688e, 0xadd70482), TOBN(0x708de92a, 0x7b4a4e8a), + TOBN(0x75b6dd73, 0x758a6eef), TOBN(0xea4bf352, 0x725b3c43)}, + {TOBN(0x10041f2c, 0x87912868), TOBN(0xb1b1be95, 0xef09297a), + TOBN(0x19ae23c5, 0xa9f3860a), TOBN(0xc4f0f839, 0x515dcf4b)}}, + {{TOBN(0x3c7ecca3, 0x97f6306a), TOBN(0x744c44ae, 0x68a3a4b0), + TOBN(0x69cd13a0, 0xb3a1d8a2), TOBN(0x7cad0a1e, 0x5256b578)}, + {TOBN(0xea653fcd, 0x33791d9e), TOBN(0x9cc2a05d, 0x74b2e05f), + TOBN(0x73b391dc, 0xfd7affa2), TOBN(0xddb7091e, 0xb6b05442)}}, + {{TOBN(0xc71e27bf, 0x8538a5c6), TOBN(0x195c63dd, 0x89abff17), + TOBN(0xfd315285, 0x1b71e3da), TOBN(0x9cbdfda7, 0xfa680fa0)}, + {TOBN(0x9db876ca, 0x849d7eab), TOBN(0xebe2764b, 0x3c273271), + TOBN(0x663357e3, 0xf208dcea), 
TOBN(0x8c5bd833, 0x565b1b70)}}, + {{TOBN(0xccc3b4f5, 0x9837fc0d), TOBN(0x9b641ba8, 0xa79cf00f), + TOBN(0x7428243d, 0xdfdf3990), TOBN(0x83a594c4, 0x020786b1)}, + {TOBN(0xb712451a, 0x526c4502), TOBN(0x9d39438e, 0x6adb3f93), + TOBN(0xfdb261e3, 0xe9ff0ccd), TOBN(0x80344e3c, 0xe07af4c3)}}, + {{TOBN(0x75900d7c, 0x2fa4f126), TOBN(0x08a3b865, 0x5c99a232), + TOBN(0x2478b6bf, 0xdb25e0c3), TOBN(0x482cc2c2, 0x71db2edf)}, + {TOBN(0x37df7e64, 0x5f321bb8), TOBN(0x8a93821b, 0x9a8005b4), + TOBN(0x3fa2f10c, 0xcc8c1958), TOBN(0x0d332218, 0x2c269d0a)}}, + {{TOBN(0x20ab8119, 0xe246b0e6), TOBN(0xb39781e4, 0xd349fd17), + TOBN(0xd293231e, 0xb31aa100), TOBN(0x4b779c97, 0xbb032168)}, + {TOBN(0x4b3f19e1, 0xc8470500), TOBN(0x45b7efe9, 0x0c4c869d), + TOBN(0xdb84f38a, 0xa1a6bbcc), TOBN(0x3b59cb15, 0xb2fddbc1)}}, + {{TOBN(0xba5514df, 0x3fd165e8), TOBN(0x499fd6a9, 0x061f8811), + TOBN(0x72cd1fe0, 0xbfef9f00), TOBN(0x120a4bb9, 0x79ad7e8a)}, + {TOBN(0xf2ffd095, 0x5f4a5ac5), TOBN(0xcfd174f1, 0x95a7a2f0), + TOBN(0xd42301ba, 0x9d17baf1), TOBN(0xd2fa487a, 0x77f22089)}}, + {{TOBN(0x9cb09efe, 0xb1dc77e1), TOBN(0xe9566939, 0x21c99682), + TOBN(0x8c546901, 0x6c6067bb), TOBN(0xfd378574, 0x61c24456)}, + {TOBN(0x2b6a6cbe, 0x81796b33), TOBN(0x62d550f6, 0x58e87f8b), + TOBN(0x1b763e1c, 0x7f1b01b4), TOBN(0x4b93cfea, 0x1b1b5e12)}}, + {{TOBN(0xb9345238, 0x1d531696), TOBN(0x57201c00, 0x88cdde69), + TOBN(0xdde92251, 0x9a86afc7), TOBN(0xe3043895, 0xbd35cea8)}, + {TOBN(0x7608c1e1, 0x8555970d), TOBN(0x8267dfa9, 0x2535935e), + TOBN(0xd4c60a57, 0x322ea38b), TOBN(0xe0bf7977, 0x804ef8b5)}}, + {{TOBN(0x1a0dab28, 0xc06fece4), TOBN(0xd405991e, 0x94e7b49d), + TOBN(0xc542b6d2, 0x706dab28), TOBN(0xcb228da3, 0xa91618fb)}, + {TOBN(0x224e4164, 0x107d1cea), TOBN(0xeb9fdab3, 0xd0f5d8f1), + TOBN(0xc02ba386, 0x0d6e41cd), TOBN(0x676a72c5, 0x9b1f7146)}}, + {{TOBN(0xffd6dd98, 0x4d6cb00b), TOBN(0xcef9c5ca, 0xde2e8d7c), + TOBN(0xa1bbf5d7, 0x641c7936), TOBN(0x1b95b230, 0xee8f772e)}, + {TOBN(0xf765a92e, 0xe8ac25b1), TOBN(0xceb04cfc, 0x3a18b7c6), + TOBN(0x27944cef, 0x0acc8966), TOBN(0xcbb3c957, 0x434c1004)}}, + {{TOBN(0x9c9971a1, 0xa43ff93c), TOBN(0x5bc2db17, 0xa1e358a9), + TOBN(0x45b4862e, 0xa8d9bc82), TOBN(0x70ebfbfb, 0x2201e052)}, + {TOBN(0xafdf64c7, 0x92871591), TOBN(0xea5bcae6, 0xb42d0219), + TOBN(0xde536c55, 0x2ad8f03c), TOBN(0xcd6c3f4d, 0xa76aa33c)}}, + {{TOBN(0xbeb5f623, 0x0bca6de3), TOBN(0xdd20dd99, 0xb1e706fd), + TOBN(0x90b3ff9d, 0xac9059d4), TOBN(0x2d7b2902, 0x7ccccc4e)}, + {TOBN(0x8a090a59, 0xce98840f), TOBN(0xa5d947e0, 0x8410680a), + TOBN(0x49ae346a, 0x923379a5), TOBN(0x7dbc84f9, 0xb28a3156)}}, + {{TOBN(0xfd40d916, 0x54a1aff2), TOBN(0xabf318ba, 0x3a78fb9b), + TOBN(0x50152ed8, 0x3029f95e), TOBN(0x9fc1dd77, 0xc58ad7fa)}, + {TOBN(0x5fa57915, 0x13595c17), TOBN(0xb9504668, 0x8f62b3a9), + TOBN(0x907b5b24, 0xff3055b0), TOBN(0x2e995e35, 0x9a84f125)}}, + {{TOBN(0x87dacf69, 0x7e9bbcfb), TOBN(0x95d0c1d6, 0xe86d96e3), + TOBN(0x65726e3c, 0x2d95a75c), TOBN(0x2c3c9001, 0xacd27f21)}, + {TOBN(0x1deab561, 0x6c973f57), TOBN(0x108b7e2c, 0xa5221643), + TOBN(0x5fee9859, 0xc4ef79d4), TOBN(0xbd62b88a, 0x40d4b8c6)}}, + {{TOBN(0xb4dd29c4, 0x197c75d6), TOBN(0x266a6df2, 0xb7076feb), + TOBN(0x9512d0ea, 0x4bf2df11), TOBN(0x1320c24f, 0x6b0cc9ec)}, + {TOBN(0x6bb1e0e1, 0x01a59596), TOBN(0x8317c5bb, 0xeff9aaac), + TOBN(0x65bb405e, 0x385aa6c9), TOBN(0x613439c1, 0x8f07988f)}}, + {{TOBN(0xd730049f, 0x16a66e91), TOBN(0xe97f2820, 0xfa1b0e0d), + TOBN(0x4131e003, 0x304c28ea), TOBN(0x820ab732, 0x526bac62)}, + {TOBN(0xb2ac9ef9, 0x28714423), TOBN(0x54ecfffa, 0xadb10cb2), + TOBN(0x8781476e, 
0xf886a4cc), TOBN(0x4b2c87b5, 0xdb2f8d49)}}, + {{TOBN(0xe857cd20, 0x0a44295d), TOBN(0x707d7d21, 0x58c6b044), + TOBN(0xae8521f9, 0xf596757c), TOBN(0x87448f03, 0x67b2b714)}, + {TOBN(0x13a9bc45, 0x5ebcd58d), TOBN(0x79bcced9, 0x9122d3c1), + TOBN(0x3c644247, 0x9e076642), TOBN(0x0cf22778, 0x2df4767d)}}, + {{TOBN(0x5e61aee4, 0x71d444b6), TOBN(0x211236bf, 0xc5084a1d), + TOBN(0x7e15bc9a, 0x4fd3eaf6), TOBN(0x68df2c34, 0xab622bf5)}, + {TOBN(0x9e674f0f, 0x59bf4f36), TOBN(0xf883669b, 0xd7f34d73), + TOBN(0xc48ac1b8, 0x31497b1d), TOBN(0x323b925d, 0x5106703b)}}, + {{TOBN(0x22156f42, 0x74082008), TOBN(0xeffc521a, 0xc8482bcb), + TOBN(0x5c6831bf, 0x12173479), TOBN(0xcaa2528f, 0xc4739490)}, + {TOBN(0x84d2102a, 0x8f1b3c4d), TOBN(0xcf64dfc1, 0x2d9bec0d), + TOBN(0x433febad, 0x78a546ef), TOBN(0x1f621ec3, 0x7b73cef1)}}, + {{TOBN(0x6aecd627, 0x37338615), TOBN(0x162082ab, 0x01d8edf6), + TOBN(0x833a8119, 0x19e86b66), TOBN(0x6023a251, 0xd299b5db)}, + {TOBN(0xf5bb0c3a, 0xbbf04b89), TOBN(0x6735eb69, 0xae749a44), + TOBN(0xd0e058c5, 0x4713de3b), TOBN(0xfdf2593e, 0x2c3d4ccd)}}, + {{TOBN(0x1b8f414e, 0xfdd23667), TOBN(0xdd52aaca, 0xfa2015ee), + TOBN(0x3e31b517, 0xbd9625ff), TOBN(0x5ec9322d, 0x8db5918c)}, + {TOBN(0xbc73ac85, 0xa96f5294), TOBN(0x82aa5bf3, 0x61a0666a), + TOBN(0x49755810, 0xbf08ac42), TOBN(0xd21cdfd5, 0x891cedfc)}}, + {{TOBN(0x918cb57b, 0x67f8be10), TOBN(0x365d1a7c, 0x56ffa726), + TOBN(0x2435c504, 0x6532de93), TOBN(0xc0fc5e10, 0x2674cd02)}, + {TOBN(0x6e51fcf8, 0x9cbbb142), TOBN(0x1d436e5a, 0xafc50692), + TOBN(0x766bffff, 0x3fbcae22), TOBN(0x3148c2fd, 0xfd55d3b8)}}, + {{TOBN(0x52c7fdc9, 0x233222fa), TOBN(0x89ff1092, 0xe419fb6b), + TOBN(0x3cd6db99, 0x25254977), TOBN(0x2e85a161, 0x1cf12ca7)}, + {TOBN(0xadd2547c, 0xdc810bc9), TOBN(0xea3f458f, 0x9d257c22), + TOBN(0x642c1fbe, 0x27d6b19b), TOBN(0xed07e6b5, 0x140481a6)}}, + {{TOBN(0x6ada1d42, 0x86d2e0f8), TOBN(0xe5920122, 0x0e8a9fd5), + TOBN(0x02c936af, 0x708c1b49), TOBN(0x60f30fee, 0x2b4bfaff)}, + {TOBN(0x6637ad06, 0x858e6a61), TOBN(0xce4c7767, 0x3fd374d0), + TOBN(0x39d54b2d, 0x7188defb), TOBN(0xa8c9d250, 0xf56a6b66)}}, + {{TOBN(0x58fc0f5e, 0xb24fe1dc), TOBN(0x9eaf9dee, 0x6b73f24c), + TOBN(0xa90d588b, 0x33650705), TOBN(0xde5b62c5, 0xaf2ec729)}, + {TOBN(0x5c72cfae, 0xd3c2b36e), TOBN(0x868c19d5, 0x034435da), + TOBN(0x88605f93, 0xe17ee145), TOBN(0xaa60c4ee, 0x77a5d5b1)}}, + {{TOBN(0xbcf5bfd2, 0x3b60c472), TOBN(0xaf4ef13c, 0xeb1d3049), + TOBN(0x373f44fc, 0xe13895c9), TOBN(0xf29b382f, 0x0cbc9822)}, + {TOBN(0x1bfcb853, 0x73efaef6), TOBN(0xcf56ac9c, 0xa8c96f40), + TOBN(0xd7adf109, 0x7a191e24), TOBN(0x98035f44, 0xbf8a8dc2)}}, + {{TOBN(0xf40a71b9, 0x1e750c84), TOBN(0xc57f7b0c, 0x5dc6c469), + TOBN(0x49a0e79c, 0x6fbc19c1), TOBN(0x6b0f5889, 0xa48ebdb8)}, + {TOBN(0x5d3fd084, 0xa07c4e9f), TOBN(0xc3830111, 0xab27de14), + TOBN(0x0e4929fe, 0x33e08dcc), TOBN(0xf4a5ad24, 0x40bb73a3)}}, + {{TOBN(0xde86c2bf, 0x490f97ca), TOBN(0x288f09c6, 0x67a1ce18), + TOBN(0x364bb886, 0x1844478d), TOBN(0x7840fa42, 0xceedb040)}, + {TOBN(0x1269fdd2, 0x5a631b37), TOBN(0x94761f1e, 0xa47c8b7d), + TOBN(0xfc0c2e17, 0x481c6266), TOBN(0x85e16ea2, 0x3daa5fa7)}}, + {{TOBN(0xccd86033, 0x92491048), TOBN(0x0c2f6963, 0xf4d402d7), + TOBN(0x6336f7df, 0xdf6a865c), TOBN(0x0a2a463c, 0xb5c02a87)}, + {TOBN(0xb0e29be7, 0xbf2f12ee), TOBN(0xf0a22002, 0x66bad988), + TOBN(0x27f87e03, 0x9123c1d7), TOBN(0x21669c55, 0x328a8c98)}}, + {{TOBN(0x186b9803, 0x92f14529), TOBN(0xd3d056cc, 0x63954df3), + TOBN(0x2f03fd58, 0x175a46f6), TOBN(0x63e34ebe, 0x11558558)}, + {TOBN(0xe13fedee, 0x5b80cfa5), TOBN(0xe872a120, 0xd401dbd1), + 
TOBN(0x52657616, 0xe8a9d667), TOBN(0xbc8da4b6, 0xe08d6693)}}, + {{TOBN(0x370fb9bb, 0x1b703e75), TOBN(0x6773b186, 0xd4338363), + TOBN(0x18dad378, 0xecef7bff), TOBN(0xaac787ed, 0x995677da)}, + {TOBN(0x4801ea8b, 0x0437164b), TOBN(0xf430ad20, 0x73fe795e), + TOBN(0xb164154d, 0x8ee5eb73), TOBN(0x0884ecd8, 0x108f7c0e)}}, + {{TOBN(0x0e6ec096, 0x5f520698), TOBN(0x640631fe, 0x44f7b8d9), + TOBN(0x92fd34fc, 0xa35a68b9), TOBN(0x9c5a4b66, 0x4d40cf4e)}, + {TOBN(0x949454bf, 0x80b6783d), TOBN(0x80e701fe, 0x3a320a10), + TOBN(0x8d1a564a, 0x1a0a39b2), TOBN(0x1436d53d, 0x320587db)}}, + {{TOBN(0xf5096e6d, 0x6556c362), TOBN(0xbc23a3c0, 0xe2455d7e), + TOBN(0x3a7aee54, 0x807230f9), TOBN(0x9ba1cfa6, 0x22ae82fd)}, + {TOBN(0x833a057a, 0x99c5d706), TOBN(0x8be85f4b, 0x842315c9), + TOBN(0xd083179a, 0x66a72f12), TOBN(0x2fc77d5d, 0xcdcc73cd)}}, + {{TOBN(0x22b88a80, 0x5616ee30), TOBN(0xfb09548f, 0xe7ab1083), + TOBN(0x8ad6ab0d, 0x511270cd), TOBN(0x61f6c57a, 0x6924d9ab)}, + {TOBN(0xa0f7bf72, 0x90aecb08), TOBN(0x849f87c9, 0x0df784a4), + TOBN(0x27c79c15, 0xcfaf1d03), TOBN(0xbbf9f675, 0xc463face)}}, + {{TOBN(0x91502c65, 0x765ba543), TOBN(0x18ce3cac, 0x42ea60dd), + TOBN(0xe5cee6ac, 0x6e43ecb3), TOBN(0x63e4e910, 0x68f2aeeb)}, + {TOBN(0x26234fa3, 0xc85932ee), TOBN(0x96883e8b, 0x4c90c44d), + TOBN(0x29b9e738, 0xa18a50f6), TOBN(0xbfc62b2a, 0x3f0420df)}}, + {{TOBN(0xd22a7d90, 0x6d3e1fa9), TOBN(0x17115618, 0xfe05b8a3), + TOBN(0x2a0c9926, 0xbb2b9c01), TOBN(0xc739fcc6, 0xe07e76a2)}, + {TOBN(0x540e9157, 0x165e439a), TOBN(0x06353a62, 0x6a9063d8), + TOBN(0x84d95594, 0x61e927a3), TOBN(0x013b9b26, 0xe2e0be7f)}}, + {{TOBN(0x4feaec3b, 0x973497f1), TOBN(0x15c0f94e, 0x093ebc2d), + TOBN(0x6af5f227, 0x33af0583), TOBN(0x0c2af206, 0xc61f3340)}, + {TOBN(0xd25dbdf1, 0x4457397c), TOBN(0x2e8ed017, 0xcabcbae0), + TOBN(0xe3010938, 0xc2815306), TOBN(0xbaa99337, 0xe8c6cd68)}}, + {{TOBN(0x08513182, 0x3b0ec7de), TOBN(0x1e1b822b, 0x58df05df), + TOBN(0x5c14842f, 0xa5c3b683), TOBN(0x98fe977e, 0x3eba34ce)}, + {TOBN(0xfd2316c2, 0x0d5e8873), TOBN(0xe48d839a, 0xbd0d427d), + TOBN(0x495b2218, 0x623fc961), TOBN(0x24ee56e7, 0xb46fba5e)}}, + {{TOBN(0x9184a55b, 0x91e4de58), TOBN(0xa7488ca5, 0xdfdea288), + TOBN(0xa723862e, 0xa8dcc943), TOBN(0x92d762b2, 0x849dc0fc)}, + {TOBN(0x3c444a12, 0x091ff4a9), TOBN(0x581113fa, 0x0cada274), + TOBN(0xb9de0a45, 0x30d8eae2), TOBN(0x5e0fcd85, 0xdf6b41ea)}}, + {{TOBN(0x6233ea68, 0xc094dbb5), TOBN(0xb77d062e, 0xd968d410), + TOBN(0x3e719bbc, 0x58b3002d), TOBN(0x68e7dd3d, 0x3dc49d58)}, + {TOBN(0x8d825740, 0x013a5e58), TOBN(0x21311747, 0x3c9e3c1b), + TOBN(0x0cb0a2a7, 0x7c99b6ab), TOBN(0x5c48a3b3, 0xc2f888f2)}}}, + {{{TOBN(0xc7913e91, 0x991724f3), TOBN(0x5eda799c, 0x39cbd686), + TOBN(0xddb595c7, 0x63d4fc1e), TOBN(0x6b63b80b, 0xac4fed54)}, + {TOBN(0x6ea0fc69, 0x7e5fb516), TOBN(0x737708ba, 0xd0f1c964), + TOBN(0x9628745f, 0x11a92ca5), TOBN(0x61f37958, 0x9a86967a)}}, + {{TOBN(0x9af39b2c, 0xaa665072), TOBN(0x78322fa4, 0xefd324ef), + TOBN(0x3d153394, 0xc327bd31), TOBN(0x81d5f271, 0x3129dab0)}, + {TOBN(0xc72e0c42, 0xf48027f5), TOBN(0xaa40cdbc, 0x8536e717), + TOBN(0xf45a657a, 0x2d369d0f), TOBN(0xb03bbfc4, 0xea7f74e6)}}, + {{TOBN(0x46a8c418, 0x0d738ded), TOBN(0x6f1a5bb0, 0xe0de5729), + TOBN(0xf10230b9, 0x8ba81675), TOBN(0x32c6f30c, 0x112b33d4)}, + {TOBN(0x7559129d, 0xd8fffb62), TOBN(0x6a281b47, 0xb459bf05), + TOBN(0x77c1bd3a, 0xfa3b6776), TOBN(0x0709b380, 0x7829973a)}}, + {{TOBN(0x8c26b232, 0xa3326505), TOBN(0x38d69272, 0xee1d41bf), + TOBN(0x0459453e, 0xffe32afa), TOBN(0xce8143ad, 0x7cb3ea87)}, + {TOBN(0x932ec1fa, 0x7e6ab666), TOBN(0x6cd2d230, 
0x22286264), + TOBN(0x459a46fe, 0x6736f8ed), TOBN(0x50bf0d00, 0x9eca85bb)}}, + {{TOBN(0x0b825852, 0x877a21ec), TOBN(0x300414a7, 0x0f537a94), + TOBN(0x3f1cba40, 0x21a9a6a2), TOBN(0x50824eee, 0x76943c00)}, + {TOBN(0xa0dbfcec, 0xf83cba5d), TOBN(0xf9538148, 0x93b4f3c0), + TOBN(0x61744162, 0x48f24dd7), TOBN(0x5322d64d, 0xe4fb09dd)}}, + {{TOBN(0x57447384, 0x3d9325f3), TOBN(0xa9bef2d0, 0xf371cb84), + TOBN(0x77d2188b, 0xa61e36c5), TOBN(0xbbd6a7d7, 0xc602df72)}, + {TOBN(0xba3aa902, 0x8f61bc0b), TOBN(0xf49085ed, 0x6ed0b6a1), + TOBN(0x8bc625d6, 0xae6e8298), TOBN(0x832b0b1d, 0xa2e9c01d)}}, + {{TOBN(0xa337c447, 0xf1f0ced1), TOBN(0x800cc793, 0x9492dd2b), + TOBN(0x4b93151d, 0xbea08efa), TOBN(0x820cf3f8, 0xde0a741e)}, + {TOBN(0xff1982dc, 0x1c0f7d13), TOBN(0xef921960, 0x84dde6ca), + TOBN(0x1ad7d972, 0x45f96ee3), TOBN(0x319c8dbe, 0x29dea0c7)}}, + {{TOBN(0xd3ea3871, 0x7b82b99b), TOBN(0x75922d4d, 0x470eb624), + TOBN(0x8f66ec54, 0x3b95d466), TOBN(0x66e673cc, 0xbee1e346)}, + {TOBN(0x6afe67c4, 0xb5f2b89a), TOBN(0x3de9c1e6, 0x290e5cd3), + TOBN(0x8c278bb6, 0x310a2ada), TOBN(0x420fa384, 0x0bdb323b)}}, + {{TOBN(0x0ae1d63b, 0x0eb919b0), TOBN(0xd74ee51d, 0xa74b9620), + TOBN(0x395458d0, 0xa674290c), TOBN(0x324c930f, 0x4620a510)}, + {TOBN(0x2d1f4d19, 0xfbac27d4), TOBN(0x4086e8ca, 0x9bedeeac), + TOBN(0x0cdd211b, 0x9b679ab8), TOBN(0x5970167d, 0x7090fec4)}}, + {{TOBN(0x3420f2c9, 0xfaf1fc63), TOBN(0x616d333a, 0x328c8bb4), + TOBN(0x7d65364c, 0x57f1fe4a), TOBN(0x9343e877, 0x55e5c73a)}, + {TOBN(0x5795176b, 0xe970e78c), TOBN(0xa36ccebf, 0x60533627), + TOBN(0xfc7c7380, 0x09cdfc1b), TOBN(0xb39a2afe, 0xb3fec326)}}, + {{TOBN(0xb7ff1ba1, 0x6224408a), TOBN(0xcc856e92, 0x247cfc5e), + TOBN(0x01f102e7, 0xc18bc493), TOBN(0x4613ab74, 0x2091c727)}, + {TOBN(0xaa25e89c, 0xc420bf2b), TOBN(0x00a53176, 0x90337ec2), + TOBN(0xd2be9f43, 0x7d025fc7), TOBN(0x3316fb85, 0x6e6fe3dc)}}, + {{TOBN(0x27520af5, 0x9ac50814), TOBN(0xfdf95e78, 0x9a8e4223), + TOBN(0xb7e7df2a, 0x56bec5a0), TOBN(0xf7022f7d, 0xdf159e5d)}, + {TOBN(0x93eeeab1, 0xcac1fe8f), TOBN(0x8040188c, 0x37451168), + TOBN(0x7ee8aa8a, 0xd967dce6), TOBN(0xfa0e79e7, 0x3abc9299)}}, + {{TOBN(0x67332cfc, 0x2064cfd1), TOBN(0x339c31de, 0xb0651934), + TOBN(0x719b28d5, 0x2a3bcbea), TOBN(0xee74c82b, 0x9d6ae5c6)}, + {TOBN(0x0927d05e, 0xbaf28ee6), TOBN(0x82cecf2c, 0x9d719028), + TOBN(0x0b0d353e, 0xddb30289), TOBN(0xfe4bb977, 0xfddb2e29)}}, + {{TOBN(0xbb5bb990, 0x640bfd9e), TOBN(0xd226e277, 0x82f62108), + TOBN(0x4bf00985, 0x02ffdd56), TOBN(0x7756758a, 0x2ca1b1b5)}, + {TOBN(0xc32b62a3, 0x5285fe91), TOBN(0xedbc546a, 0x8c9cd140), + TOBN(0x1e47a013, 0xaf5cb008), TOBN(0xbca7e720, 0x073ce8f2)}}, + {{TOBN(0xe10b2ab8, 0x17a91cae), TOBN(0xb89aab65, 0x08e27f63), + TOBN(0x7b3074a7, 0xdba3ddf9), TOBN(0x1c20ce09, 0x330c2972)}, + {TOBN(0x6b9917b4, 0x5fcf7e33), TOBN(0xe6793743, 0x945ceb42), + TOBN(0x18fc2215, 0x5c633d19), TOBN(0xad1adb3c, 0xc7485474)}}, + {{TOBN(0x646f9679, 0x6424c49b), TOBN(0xf888dfe8, 0x67c241c9), + TOBN(0xe12d4b93, 0x24f68b49), TOBN(0x9a6b62d8, 0xa571df20)}, + {TOBN(0x81b4b26d, 0x179483cb), TOBN(0x666f9632, 0x9511fae2), + TOBN(0xd281b3e4, 0xd53aa51f), TOBN(0x7f96a765, 0x7f3dbd16)}}, + {{TOBN(0xa7f8b5bf, 0x074a30ce), TOBN(0xd7f52107, 0x005a32e6), + TOBN(0x6f9e0907, 0x50237ed4), TOBN(0x2f21da47, 0x8096fa2b)}, + {TOBN(0xf3e19cb4, 0xeec863a0), TOBN(0xd18f77fd, 0x9527620a), + TOBN(0x9505c81c, 0x407c1cf8), TOBN(0x9998db4e, 0x1b6ec284)}}, + {{TOBN(0x7e3389e5, 0xc247d44d), TOBN(0x12507141, 0x3f4f3d80), + TOBN(0xd4ba0110, 0x4a78a6c7), TOBN(0x312874a0, 0x767720be)}, + {TOBN(0xded059a6, 0x75944370), 
TOBN(0xd6123d90, 0x3b2c0bdd), + TOBN(0xa56b717b, 0x51c108e3), TOBN(0x9bb7940e, 0x070623e9)}}, + {{TOBN(0x794e2d59, 0x84ac066c), TOBN(0xf5954a92, 0xe68c69a0), + TOBN(0x28c52458, 0x4fd99dcc), TOBN(0x60e639fc, 0xb1012517)}, + {TOBN(0xc2e60125, 0x7de79248), TOBN(0xe9ef6404, 0xf12fc6d7), + TOBN(0x4c4f2808, 0x2a3b5d32), TOBN(0x865ad32e, 0xc768eb8a)}}, + {{TOBN(0xac02331b, 0x13fb70b6), TOBN(0x037b44c1, 0x95599b27), + TOBN(0x1a860fc4, 0x60bd082c), TOBN(0xa2e25745, 0xc980cd01)}, + {TOBN(0xee3387a8, 0x1da0263e), TOBN(0x931bfb95, 0x2d10f3d6), + TOBN(0x5b687270, 0xa1f24a32), TOBN(0xf140e65d, 0xca494b86)}}, + {{TOBN(0x4f4ddf91, 0xb2f1ac7a), TOBN(0xf99eaabb, 0x760fee27), + TOBN(0x57f4008a, 0x49c228e5), TOBN(0x090be440, 0x1cf713bb)}, + {TOBN(0xac91fbe4, 0x5004f022), TOBN(0xd838c2c2, 0x569e1af6), + TOBN(0xd6c7d20b, 0x0f1daaa5), TOBN(0xaa063ac1, 0x1bbb02c0)}}, + {{TOBN(0x0938a422, 0x59558a78), TOBN(0x5343c669, 0x8435da2f), + TOBN(0x96f67b18, 0x034410dc), TOBN(0x7cc1e424, 0x84510804)}, + {TOBN(0x86a1543f, 0x16dfbb7d), TOBN(0x921fa942, 0x5b5bd592), + TOBN(0x9dcccb6e, 0xb33dd03c), TOBN(0x8581ddd9, 0xb843f51e)}}, + {{TOBN(0x54935fcb, 0x81d73c9e), TOBN(0x6d07e979, 0x0a5e97ab), + TOBN(0x4dc7b30a, 0xcf3a6bab), TOBN(0x147ab1f3, 0x170bee11)}, + {TOBN(0x0aaf8e3d, 0x9fafdee4), TOBN(0xfab3dbcb, 0x538a8b95), + TOBN(0x405df4b3, 0x6ef13871), TOBN(0xf1f4e9cb, 0x088d5a49)}}, + {{TOBN(0x9bcd24d3, 0x66b33f1d), TOBN(0x3b97b820, 0x5ce445c0), + TOBN(0xe2926549, 0xba93ff61), TOBN(0xd9c341ce, 0x4dafe616)}, + {TOBN(0xfb30a76e, 0x16efb6f3), TOBN(0xdf24b8ca, 0x605b953c), + TOBN(0x8bd52afe, 0xc2fffb9f), TOBN(0xbbac5ff7, 0xe19d0b96)}}, + {{TOBN(0x43c01b87, 0x459afccd), TOBN(0x6bd45143, 0xb7432652), + TOBN(0x84734530, 0x55b5d78e), TOBN(0x81088fdb, 0x1554ba7d)}, + {TOBN(0xada0a52c, 0x1e269375), TOBN(0xf9f037c4, 0x2dc5ec10), + TOBN(0xc0660607, 0x94bfbc11), TOBN(0xc0a630bb, 0xc9c40d2f)}}, + {{TOBN(0x5efc797e, 0xab64c31e), TOBN(0xffdb1dab, 0x74507144), + TOBN(0xf6124287, 0x1ca6790c), TOBN(0xe9609d81, 0xe69bf1bf)}, + {TOBN(0xdb898595, 0x00d24fc9), TOBN(0x9c750333, 0xe51fb417), + TOBN(0x51830a91, 0xfef7bbde), TOBN(0x0ce67dc8, 0x945f585c)}}, + {{TOBN(0x9a730ed4, 0x4763eb50), TOBN(0x24a0e221, 0xc1ab0d66), + TOBN(0x643b6393, 0x648748f3), TOBN(0x1982daa1, 0x6d3c6291)}, + {TOBN(0x6f00a9f7, 0x8bbc5549), TOBN(0x7a1783e1, 0x7f36384e), + TOBN(0xe8346323, 0xde977f50), TOBN(0x91ab688d, 0xb245502a)}}, + {{TOBN(0x331ab6b5, 0x6d0bdd66), TOBN(0x0a6ef32e, 0x64b71229), + TOBN(0x1028150e, 0xfe7c352f), TOBN(0x27e04350, 0xce7b39d3)}, + {TOBN(0x2a3c8acd, 0xc1070c82), TOBN(0xfb2034d3, 0x80c9feef), + TOBN(0x2d729621, 0x709f3729), TOBN(0x8df290bf, 0x62cb4549)}}, + {{TOBN(0x02f99f33, 0xfc2e4326), TOBN(0x3b30076d, 0x5eddf032), + TOBN(0xbb21f8cf, 0x0c652fb5), TOBN(0x314fb49e, 0xed91cf7b)}, + {TOBN(0xa013eca5, 0x2f700750), TOBN(0x2b9e3c23, 0x712a4575), + TOBN(0xe5355557, 0xaf30fbb0), TOBN(0x1ada3516, 0x7c77e771)}}, + {{TOBN(0x45f6ecb2, 0x7b135670), TOBN(0xe85d19df, 0x7cfc202e), + TOBN(0x0f1b50c7, 0x58d1be9f), TOBN(0x5ebf2c0a, 0xead2e344)}, + {TOBN(0x1531fe4e, 0xabc199c9), TOBN(0xc7032592, 0x56bab0ae), + TOBN(0x16ab2e48, 0x6c1fec54), TOBN(0x0f87fda8, 0x04280188)}}, + {{TOBN(0xdc9f46fc, 0x609e4a74), TOBN(0x2a44a143, 0xba667f91), + TOBN(0xbc3d8b95, 0xb4d83436), TOBN(0xa01e4bd0, 0xc7bd2958)}, + {TOBN(0x7b182932, 0x73483c90), TOBN(0xa79c6aa1, 0xa7c7b598), + TOBN(0xbf3983c6, 0xeaaac07e), TOBN(0x8f18181e, 0x96e0d4e6)}}, + {{TOBN(0x8553d37c, 0x051af62b), TOBN(0xe9a998eb, 0x0bf94496), + TOBN(0xe0844f9f, 0xb0d59aa1), TOBN(0x983fd558, 0xe6afb813)}, + {TOBN(0x9670c0ca, 
0x65d69804), TOBN(0x732b22de, 0x6ea5ff2d), + TOBN(0xd7640ba9, 0x5fd8623b), TOBN(0x9f619163, 0xa6351782)}}, + {{TOBN(0x0bfc27ee, 0xacee5043), TOBN(0xae419e73, 0x2eb10f02), + TOBN(0x19c028d1, 0x8943fb05), TOBN(0x71f01cf7, 0xff13aa2a)}, + {TOBN(0x7790737e, 0x8887a132), TOBN(0x67513309, 0x66318410), + TOBN(0x9819e8a3, 0x7ddb795e), TOBN(0xfecb8ef5, 0xdad100b2)}}, + {{TOBN(0x59f74a22, 0x3021926a), TOBN(0xb7c28a49, 0x6f9b4c1c), + TOBN(0xed1a733f, 0x912ad0ab), TOBN(0x42a910af, 0x01a5659c)}, + {TOBN(0x3842c6e0, 0x7bd68cab), TOBN(0x2b57fa38, 0x76d70ac8), + TOBN(0x8a6707a8, 0x3c53aaeb), TOBN(0x62c1c510, 0x65b4db18)}}, + {{TOBN(0x8de2c1fb, 0xb2d09dc7), TOBN(0xc3dfed12, 0x266bd23b), + TOBN(0x927d039b, 0xd5b27db6), TOBN(0x2fb2f0f1, 0x103243da)}, + {TOBN(0xf855a07b, 0x80be7399), TOBN(0xed9327ce, 0x1f9f27a8), + TOBN(0xa0bd99c7, 0x729bdef7), TOBN(0x2b67125e, 0x28250d88)}}, + {{TOBN(0x784b26e8, 0x8670ced7), TOBN(0xe3dfe41f, 0xc31bd3b4), + TOBN(0x9e353a06, 0xbcc85cbc), TOBN(0x302e2909, 0x60178a9d)}, + {TOBN(0x860abf11, 0xa6eac16e), TOBN(0x76447000, 0xaa2b3aac), + TOBN(0x46ff9d19, 0x850afdab), TOBN(0x35bdd6a5, 0xfdb2d4c1)}}, + {{TOBN(0xe82594b0, 0x7e5c9ce9), TOBN(0x0f379e53, 0x20af346e), + TOBN(0x608b31e3, 0xbc65ad4a), TOBN(0x710c6b12, 0x267c4826)}, + {TOBN(0x51c966f9, 0x71954cf1), TOBN(0xb1cec793, 0x0d0aa215), + TOBN(0x1f155989, 0x86bd23a8), TOBN(0xae2ff99c, 0xf9452e86)}}, + {{TOBN(0xd8dd953c, 0x340ceaa2), TOBN(0x26355275, 0x2e2e9333), + TOBN(0x15d4e5f9, 0x8586f06d), TOBN(0xd6bf94a8, 0xf7cab546)}, + {TOBN(0x33c59a0a, 0xb76a9af0), TOBN(0x52740ab3, 0xba095af7), + TOBN(0xc444de8a, 0x24389ca0), TOBN(0xcc6f9863, 0x706da0cb)}}, + {{TOBN(0xb5a741a7, 0x6b2515cf), TOBN(0x71c41601, 0x9585c749), + TOBN(0x78350d4f, 0xe683de97), TOBN(0x31d61524, 0x63d0b5f5)}, + {TOBN(0x7a0cc5e1, 0xfbce090b), TOBN(0xaac927ed, 0xfbcb2a5b), + TOBN(0xe920de49, 0x20d84c35), TOBN(0x8c06a0b6, 0x22b4de26)}}, + {{TOBN(0xd34dd58b, 0xafe7ddf3), TOBN(0x55851fed, 0xc1e6e55b), + TOBN(0xd1395616, 0x960696e7), TOBN(0x940304b2, 0x5f22705f)}, + {TOBN(0x6f43f861, 0xb0a2a860), TOBN(0xcf121282, 0x0e7cc981), + TOBN(0x12186212, 0x0ab64a96), TOBN(0x09215b9a, 0xb789383c)}}, + {{TOBN(0x311eb305, 0x37387c09), TOBN(0xc5832fce, 0xf03ee760), + TOBN(0x30358f58, 0x32f7ea19), TOBN(0xe01d3c34, 0x91d53551)}, + {TOBN(0x1ca5ee41, 0xda48ea80), TOBN(0x34e71e8e, 0xcf4fa4c1), + TOBN(0x312abd25, 0x7af1e1c7), TOBN(0xe3afcdeb, 0x2153f4a5)}}, + {{TOBN(0x9d5c84d7, 0x00235e9a), TOBN(0x0308d3f4, 0x8c4c836f), + TOBN(0xc0a66b04, 0x89332de5), TOBN(0x610dd399, 0x89e566ef)}, + {TOBN(0xf8eea460, 0xd1ac1635), TOBN(0x84cbb3fb, 0x20a2c0df), + TOBN(0x40afb488, 0xe74a48c5), TOBN(0x29738198, 0xd326b150)}}, + {{TOBN(0x2a17747f, 0xa6d74081), TOBN(0x60ea4c05, 0x55a26214), + TOBN(0x53514bb4, 0x1f88c5fe), TOBN(0xedd64567, 0x7e83426c)}, + {TOBN(0xd5d6cbec, 0x96460b25), TOBN(0xa12fd0ce, 0x68dc115e), + TOBN(0xc5bc3ed2, 0x697840ea), TOBN(0x969876a8, 0xa6331e31)}}, + {{TOBN(0x60c36217, 0x472ff580), TOBN(0xf4229705, 0x4ad41393), + TOBN(0x4bd99ef0, 0xa03b8b92), TOBN(0x501c7317, 0xc144f4f6)}, + {TOBN(0x159009b3, 0x18464945), TOBN(0x6d5e594c, 0x74c5c6be), + TOBN(0x2d587011, 0x321a3660), TOBN(0xd1e184b1, 0x3898d022)}}, + {{TOBN(0x5ba04752, 0x4c6a7e04), TOBN(0x47fa1e2b, 0x45550b65), + TOBN(0x9419daf0, 0x48c0a9a5), TOBN(0x66362953, 0x7c243236)}, + {TOBN(0xcd0744b1, 0x5cb12a88), TOBN(0x561b6f9a, 0x2b646188), + TOBN(0x599415a5, 0x66c2c0c0), TOBN(0xbe3f0859, 0x0f83f09a)}}, + {{TOBN(0x9141c5be, 0xb92041b8), TOBN(0x01ae38c7, 0x26477d0d), + TOBN(0xca8b71f3, 0xd12c7a94), TOBN(0xfab5b31f, 0x765c70db)}, + 
{TOBN(0x76ae7492, 0x487443e9), TOBN(0x8595a310, 0x990d1349), + TOBN(0xf8dbeda8, 0x7d460a37), TOBN(0x7f7ad082, 0x1e45a38f)}}, + {{TOBN(0xed1d4db6, 0x1059705a), TOBN(0xa3dd492a, 0xe6b9c697), + TOBN(0x4b92ee3a, 0x6eb38bd5), TOBN(0xbab2609d, 0x67cc0bb7)}, + {TOBN(0x7fc4fe89, 0x6e70ee82), TOBN(0xeff2c56e, 0x13e6b7e3), + TOBN(0x9b18959e, 0x34d26fca), TOBN(0x2517ab66, 0x889d6b45)}}, + {{TOBN(0xf167b4e0, 0xbdefdd4f), TOBN(0x69958465, 0xf366e401), + TOBN(0x5aa368ab, 0xa73bbec0), TOBN(0x12148709, 0x7b240c21)}, + {TOBN(0x378c3233, 0x18969006), TOBN(0xcb4d73ce, 0xe1fe53d1), + TOBN(0x5f50a80e, 0x130c4361), TOBN(0xd67f5951, 0x7ef5212b)}}, + {{TOBN(0xf145e21e, 0x9e70c72e), TOBN(0xb2e52e29, 0x5566d2fb), + TOBN(0x44eaba4a, 0x032397f5), TOBN(0x5e56937b, 0x7e31a7de)}, + {TOBN(0x68dcf517, 0x456c61e1), TOBN(0xbc2e954a, 0xa8b0a388), + TOBN(0xe3552fa7, 0x60a8b755), TOBN(0x03442dae, 0x73ad0cde)}}, + {{TOBN(0x37ffe747, 0xceb26210), TOBN(0x983545e8, 0x787baef9), + TOBN(0x8b8c8535, 0x86a3de31), TOBN(0xc621dbcb, 0xfacd46db)}, + {TOBN(0x82e442e9, 0x59266fbb), TOBN(0xa3514c37, 0x339d471c), + TOBN(0x3a11b771, 0x62cdad96), TOBN(0xf0cb3b3c, 0xecf9bdf0)}}, + {{TOBN(0x3fcbdbce, 0x478e2135), TOBN(0x7547b5cf, 0xbda35342), + TOBN(0xa97e81f1, 0x8a677af6), TOBN(0xc8c2bf83, 0x28817987)}, + {TOBN(0xdf07eaaf, 0x45580985), TOBN(0xc68d1f05, 0xc93b45cb), + TOBN(0x106aa2fe, 0xc77b4cac), TOBN(0x4c1d8afc, 0x04a7ae86)}}, + {{TOBN(0xdb41c3fd, 0x9eb45ab2), TOBN(0x5b234b5b, 0xd4b22e74), + TOBN(0xda253dec, 0xf215958a), TOBN(0x67e0606e, 0xa04edfa0)}, + {TOBN(0xabbbf070, 0xef751b11), TOBN(0xf352f175, 0xf6f06dce), + TOBN(0xdfc4b6af, 0x6839f6b4), TOBN(0x53ddf9a8, 0x9959848e)}}, + {{TOBN(0xda49c379, 0xc21520b0), TOBN(0x90864ff0, 0xdbd5d1b6), + TOBN(0x2f055d23, 0x5f49c7f7), TOBN(0xe51e4e6a, 0xa796b2d8)}, + {TOBN(0xc361a67f, 0x5c9dc340), TOBN(0x5ad53c37, 0xbca7c620), + TOBN(0xda1d6588, 0x32c756d0), TOBN(0xad60d911, 0x8bb67e13)}}, + {{TOBN(0xd6c47bdf, 0x0eeec8c6), TOBN(0x4a27fec1, 0x078a1821), + TOBN(0x081f7415, 0xc3099524), TOBN(0x8effdf0b, 0x82cd8060)}, + {TOBN(0xdb70ec1c, 0x65842df8), TOBN(0x8821b358, 0xd319a901), + TOBN(0x72ee56ee, 0xde42b529), TOBN(0x5bb39592, 0x236e4286)}}, + {{TOBN(0xd1183316, 0xfd6f7140), TOBN(0xf9fadb5b, 0xbd8e81f7), + TOBN(0x701d5e0c, 0x5a02d962), TOBN(0xfdee4dbf, 0x1b601324)}, + {TOBN(0xbed17407, 0x35d7620e), TOBN(0x04e3c2c3, 0xf48c0012), + TOBN(0x9ee29da7, 0x3455449a), TOBN(0x562cdef4, 0x91a836c4)}}, + {{TOBN(0x8f682a5f, 0x47701097), TOBN(0x617125d8, 0xff88d0c2), + TOBN(0x948fda24, 0x57bb86dd), TOBN(0x348abb8f, 0x289f7286)}, + {TOBN(0xeb10eab5, 0x99d94bbd), TOBN(0xd51ba28e, 0x4684d160), + TOBN(0xabe0e51c, 0x30c8f41a), TOBN(0x66588b45, 0x13254f4a)}}, + {{TOBN(0x147ebf01, 0xfad097a5), TOBN(0x49883ea8, 0x610e815d), + TOBN(0xe44d60ba, 0x8a11de56), TOBN(0xa970de6e, 0x827a7a6d)}, + {TOBN(0x2be41424, 0x5e17fc19), TOBN(0xd833c657, 0x01214057), + TOBN(0x1375813b, 0x363e723f), TOBN(0x6820bb88, 0xe6a52e9b)}}, + {{TOBN(0x7e7f6970, 0xd875d56a), TOBN(0xd6a0a9ac, 0x51fbf6bf), + TOBN(0x54ba8790, 0xa3083c12), TOBN(0xebaeb23d, 0x6ae7eb64)}, + {TOBN(0xa8685c3a, 0xb99a907a), TOBN(0xf1e74550, 0x026bf40b), + TOBN(0x7b73a027, 0xc802cd9e), TOBN(0x9a8a927c, 0x4fef4635)}}, + {{TOBN(0xe1b6f60c, 0x08191224), TOBN(0xc4126ebb, 0xde4ec091), + TOBN(0xe1dff4dc, 0x4ae38d84), TOBN(0xde3f57db, 0x4f2ef985)}, + {TOBN(0x34964337, 0xd446a1dd), TOBN(0x7bf217a0, 0x859e77f6), + TOBN(0x8ff10527, 0x8e1d13f5), TOBN(0xa304ef03, 0x74eeae27)}}, + {{TOBN(0xfc6f5e47, 0xd19dfa5a), TOBN(0xdb007de3, 0x7fad982b), + TOBN(0x28205ad1, 0x613715f5), TOBN(0x251e6729, 0x7889529e)}, 
+ {TOBN(0x72705184, 0x1ae98e78), TOBN(0xf818537d, 0x271cac32), + TOBN(0xc8a15b7e, 0xb7f410f5), TOBN(0xc474356f, 0x81f62393)}}, + {{TOBN(0x92dbdc5a, 0xc242316b), TOBN(0xabe060ac, 0xdbf4aff5), + TOBN(0x6e8c38fe, 0x909a8ec6), TOBN(0x43e514e5, 0x6116cb94)}, + {TOBN(0x2078fa38, 0x07d784f9), TOBN(0x1161a880, 0xf4b5b357), + TOBN(0x5283ce79, 0x13adea3d), TOBN(0x0756c3e6, 0xcc6a910b)}}, + {{TOBN(0x60bcfe01, 0xaaa79697), TOBN(0x04a73b29, 0x56391db1), + TOBN(0xdd8dad47, 0x189b45a0), TOBN(0xbfac0dd0, 0x48d5b8d9)}, + {TOBN(0x34ab3af5, 0x7d3d2ec2), TOBN(0x6fa2fc2d, 0x207bd3af), + TOBN(0x9ff40092, 0x66550ded), TOBN(0x719b3e87, 0x1fd5b913)}}, + {{TOBN(0xa573a496, 0x6d17fbc7), TOBN(0x0cd1a70a, 0x73d2b24e), + TOBN(0x34e2c5ca, 0xb2676937), TOBN(0xe7050b06, 0xbf669f21)}, + {TOBN(0xfbe948b6, 0x1ede9046), TOBN(0xa0530051, 0x97662659), + TOBN(0x58cbd4ed, 0xf10124c5), TOBN(0xde2646e4, 0xdd6c06c8)}}, + {{TOBN(0x332f8108, 0x8cad38c0), TOBN(0x471b7e90, 0x6bd68ae2), + TOBN(0x56ac3fb2, 0x0d8e27a3), TOBN(0xb54660db, 0x136b4b0d)}, + {TOBN(0x123a1e11, 0xa6fd8de4), TOBN(0x44dbffea, 0xa37799ef), + TOBN(0x4540b977, 0xce6ac17c), TOBN(0x495173a8, 0xaf60acef)}}}, + {{{TOBN(0x9ebb284d, 0x391c2a82), TOBN(0xbcdd4863, 0x158308e8), + TOBN(0x006f16ec, 0x83f1edca), TOBN(0xa13e2c37, 0x695dc6c8)}, + {TOBN(0x2ab756f0, 0x4a057a87), TOBN(0xa8765500, 0xa6b48f98), + TOBN(0x4252face, 0x68651c44), TOBN(0xa52b540b, 0xe1765e02)}}, + {{TOBN(0x4f922fc5, 0x16a0d2bb), TOBN(0x0d5cc16c, 0x1a623499), + TOBN(0x9241cf3a, 0x57c62c8b), TOBN(0x2f5e6961, 0xfd1b667f)}, + {TOBN(0x5c15c70b, 0xf5a01797), TOBN(0x3d20b44d, 0x60956192), + TOBN(0x04911b37, 0x071fdb52), TOBN(0xf648f916, 0x8d6f0f7b)}}, + {{TOBN(0x6dc1acaf, 0xe60b7cf7), TOBN(0x25860a50, 0x84a9d869), + TOBN(0x56fc6f09, 0xe7ba8ac4), TOBN(0x828c5bd0, 0x6148d29e)}, + {TOBN(0xac6b435e, 0xdc55ae5f), TOBN(0xa527f56c, 0xc0117411), + TOBN(0x94d5045e, 0xfd24342c), TOBN(0x2c4c0a35, 0x70b67c0d)}}, + {{TOBN(0x027cc8b8, 0xfac61d9a), TOBN(0x7d25e062, 0xe3c6fe8a), + TOBN(0xe08805bf, 0xe5bff503), TOBN(0x13271e6c, 0x6ff632f7)}, + {TOBN(0x55dca6c0, 0x232f76a5), TOBN(0x8957c32d, 0x701ef426), + TOBN(0xee728bcb, 0xa10a5178), TOBN(0x5ea60411, 0xb62c5173)}}, + {{TOBN(0xfc4e964e, 0xd0b8892b), TOBN(0x9ea17683, 0x9301bb74), + TOBN(0x6265c5ae, 0xfcc48626), TOBN(0xe60cf82e, 0xbb3e9102)}, + {TOBN(0x57adf797, 0xd4df5531), TOBN(0x235b59a1, 0x8deeefe2), + TOBN(0x60adcf58, 0x3f306eb1), TOBN(0x105c2753, 0x3d09492d)}}, + {{TOBN(0x4090914b, 0xb5def996), TOBN(0x1cb69c83, 0x233dd1e7), + TOBN(0xc1e9c1d3, 0x9b3d5e76), TOBN(0x1f3338ed, 0xfccf6012)}, + {TOBN(0xb1e95d0d, 0x2f5378a8), TOBN(0xacf4c2c7, 0x2f00cd21), + TOBN(0x6e984240, 0xeb5fe290), TOBN(0xd66c038d, 0x248088ae)}}, + {{TOBN(0x804d264a, 0xf94d70cf), TOBN(0xbdb802ef, 0x7314bf7e), + TOBN(0x8fb54de2, 0x4333ed02), TOBN(0x740461e0, 0x285635d9)}, + {TOBN(0x4113b2c8, 0x365e9383), TOBN(0xea762c83, 0x3fdef652), + TOBN(0x4eec6e2e, 0x47b956c1), TOBN(0xa3d814be, 0x65620fa4)}}, + {{TOBN(0x9ad5462b, 0xb4d8bc50), TOBN(0x181c0b16, 0xa9195770), + TOBN(0xebd4fe1c, 0x78412a68), TOBN(0xae0341bc, 0xc0dff48c)}, + {TOBN(0xb6bc45cf, 0x7003e866), TOBN(0xf11a6dea, 0x8a24a41b), + TOBN(0x5407151a, 0xd04c24c2), TOBN(0x62c9d27d, 0xda5b7b68)}}, + {{TOBN(0x2e964235, 0x88cceff6), TOBN(0x8594c54f, 0x8b07ed69), + TOBN(0x1578e73c, 0xc84d0d0d), TOBN(0x7b4e1055, 0xff532868)}, + {TOBN(0xa348c0d5, 0xb5ec995a), TOBN(0xbf4b9d55, 0x14289a54), + TOBN(0x9ba155a6, 0x58fbd777), TOBN(0x186ed7a8, 0x1a84491d)}}, + {{TOBN(0xd4992b30, 0x614c0900), TOBN(0xda98d121, 0xbd00c24b), + TOBN(0x7f534dc8, 0x7ec4bfa1), TOBN(0x4a5ff674, 
0x37dc34bc)}, + {TOBN(0x68c196b8, 0x1d7ea1d7), TOBN(0x38cf2893, 0x80a6d208), + TOBN(0xfd56cd09, 0xe3cbbd6e), TOBN(0xec72e27e, 0x4205a5b6)}}, + {{TOBN(0x15ea68f5, 0xa44f77f7), TOBN(0x7aa5f9fd, 0xb43c52bc), + TOBN(0x86ff676f, 0x94f0e609), TOBN(0xa4cde963, 0x2e2d432b)}, + {TOBN(0x8cafa0c0, 0xeee470af), TOBN(0x84137d0e, 0x8a3f5ec8), + TOBN(0xebb40411, 0xfaa31231), TOBN(0xa239c13f, 0x6f7f7ccf)}}, + {{TOBN(0x32865719, 0xa8afd30b), TOBN(0x86798328, 0x8a826dce), + TOBN(0xdf04e891, 0xc4a8fbe0), TOBN(0xbb6b6e1b, 0xebf56ad3)}, + {TOBN(0x0a695b11, 0x471f1ff0), TOBN(0xd76c3389, 0xbe15baf0), + TOBN(0x018edb95, 0xbe96c43e), TOBN(0xf2beaaf4, 0x90794158)}}, + {{TOBN(0x152db09e, 0xc3076a27), TOBN(0x5e82908e, 0xe416545d), + TOBN(0xa2c41272, 0x356d6f2e), TOBN(0xdc9c9642, 0x31fd74e1)}, + {TOBN(0x66ceb88d, 0x519bf615), TOBN(0xe29ecd76, 0x05a2274e), + TOBN(0x3a0473c4, 0xbf5e2fa0), TOBN(0x6b6eb671, 0x64284e67)}}, + {{TOBN(0xe8b97932, 0xb88756dd), TOBN(0xed4e8652, 0xf17e3e61), + TOBN(0xc2dd1499, 0x3ee1c4a4), TOBN(0xc0aaee17, 0x597f8c0e)}, + {TOBN(0x15c4edb9, 0x6c168af3), TOBN(0x6563c7bf, 0xb39ae875), + TOBN(0xadfadb6f, 0x20adb436), TOBN(0xad55e8c9, 0x9a042ac0)}}, + {{TOBN(0x975a1ed8, 0xb76da1f5), TOBN(0x10dfa466, 0xa58acb94), + TOBN(0x8dd7f7e3, 0xac060282), TOBN(0x6813e66a, 0x572a051e)}, + {TOBN(0xb4ccae1e, 0x350cb901), TOBN(0xb653d656, 0x50cb7822), + TOBN(0x42484710, 0xdfab3b87), TOBN(0xcd7ee537, 0x9b670fd0)}}, + {{TOBN(0x0a50b12e, 0x523b8bf6), TOBN(0x8009eb5b, 0x8f910c1b), + TOBN(0xf535af82, 0x4a167588), TOBN(0x0f835f9c, 0xfb2a2abd)}, + {TOBN(0xf59b2931, 0x2afceb62), TOBN(0xc797df2a, 0x169d383f), + TOBN(0xeb3f5fb0, 0x66ac02b0), TOBN(0x029d4c6f, 0xdaa2d0ca)}}, + {{TOBN(0xd4059bc1, 0xafab4bc5), TOBN(0x833f5c6f, 0x56783247), + TOBN(0xb5346630, 0x8d2d3605), TOBN(0x83387891, 0xd34d8433)}, + {TOBN(0xd973b30f, 0xadd9419a), TOBN(0xbcca1099, 0xafe3fce8), + TOBN(0x08178315, 0x0809aac6), TOBN(0x01b7f21a, 0x540f0f11)}}, + {{TOBN(0x65c29219, 0x909523c8), TOBN(0xa62f648f, 0xa3a1c741), + TOBN(0x88598d4f, 0x60c9e55a), TOBN(0xbce9141b, 0x0e4f347a)}, + {TOBN(0x9af97d84, 0x35f9b988), TOBN(0x0210da62, 0x320475b6), + TOBN(0x3c076e22, 0x9191476c), TOBN(0x7520dbd9, 0x44fc7834)}}, + {{TOBN(0x6a6b2cfe, 0xc1ab1bbd), TOBN(0xef8a65be, 0xdc650938), + TOBN(0x72855540, 0x805d7bc4), TOBN(0xda389396, 0xed11fdfd)}, + {TOBN(0xa9d5bd36, 0x74660876), TOBN(0x11d67c54, 0xb45dff35), + TOBN(0x6af7d148, 0xa4f5da94), TOBN(0xbb8d4c3f, 0xc0bbeb31)}}, + {{TOBN(0x87a7ebd1, 0xe0a1b12a), TOBN(0x1e4ef88d, 0x770ba95f), + TOBN(0x8c33345c, 0xdc2ae9cb), TOBN(0xcecf1276, 0x01cc8403)}, + {TOBN(0x687c012e, 0x1b39b80f), TOBN(0xfd90d0ad, 0x35c33ba4), + TOBN(0xa3ef5a67, 0x5c9661c2), TOBN(0x368fc88e, 0xe017429e)}}, + {{TOBN(0xd30c6761, 0x196a2fa2), TOBN(0x931b9817, 0xbd5b312e), + TOBN(0xba01000c, 0x72f54a31), TOBN(0xa203d2c8, 0x66eaa541)}, + {TOBN(0xf2abdee0, 0x98939db3), TOBN(0xe37d6c2c, 0x3e606c02), + TOBN(0xf2921574, 0x521ff643), TOBN(0x2781b3c4, 0xd7e2fca3)}}, + {{TOBN(0x664300b0, 0x7850ec06), TOBN(0xac5a38b9, 0x7d3a10cf), + TOBN(0x9233188d, 0xe34ab39d), TOBN(0xe77057e4, 0x5072cbb9)}, + {TOBN(0xbcf0c042, 0xb59e78df), TOBN(0x4cfc91e8, 0x1d97de52), + TOBN(0x4661a26c, 0x3ee0ca4a), TOBN(0x5620a4c1, 0xfb8507bc)}}, + {{TOBN(0x4b44d4aa, 0x049f842c), TOBN(0xceabc5d5, 0x1540e82b), + TOBN(0x306710fd, 0x15c6f156), TOBN(0xbe5ae52b, 0x63db1d72)}, + {TOBN(0x06f1e7e6, 0x334957f1), TOBN(0x57e388f0, 0x31144a70), + TOBN(0xfb69bb2f, 0xdf96447b), TOBN(0x0f78ebd3, 0x73e38a12)}}, + {{TOBN(0xb8222605, 0x2b7ce542), TOBN(0xe6d4ce99, 0x7472bde1), + TOBN(0x53e16ebe, 0x09d2f4da), 
TOBN(0x180ff42e, 0x53b92b2e)}, + {TOBN(0xc59bcc02, 0x2c34a1c6), TOBN(0x3803d6f9, 0x422c46c2), + TOBN(0x18aff74f, 0x5c14a8a2), TOBN(0x55aebf80, 0x10a08b28)}}, + {{TOBN(0x66097d58, 0x7135593f), TOBN(0x32e6eff7, 0x2be570cd), + TOBN(0x584e6a10, 0x2a8c860d), TOBN(0xcd185890, 0xa2eb4163)}, + {TOBN(0x7ceae99d, 0x6d97e134), TOBN(0xd42c6b70, 0xdd8447ce), + TOBN(0x59ddbb4a, 0xb8c50273), TOBN(0x03c612df, 0x3cf34e1e)}}, + {{TOBN(0x84b9ca15, 0x04b6c5a0), TOBN(0x35216f39, 0x18f0e3a3), + TOBN(0x3ec2d2bc, 0xbd986c00), TOBN(0x8bf546d9, 0xd19228fe)}, + {TOBN(0xd1c655a4, 0x4cd623c3), TOBN(0x366ce718, 0x502b8e5a), + TOBN(0x2cfc84b4, 0xeea0bfe7), TOBN(0xe01d5cee, 0xcf443e8e)}}, + {{TOBN(0x8ec045d9, 0x036520f8), TOBN(0xdfb3c3d1, 0x92d40e98), + TOBN(0x0bac4cce, 0xcc559a04), TOBN(0x35eccae5, 0x240ea6b1)}, + {TOBN(0x180b32db, 0xf8a5a0ac), TOBN(0x547972a5, 0xeb699700), + TOBN(0xa3765801, 0xca26bca0), TOBN(0x57e09d0e, 0xa647f25a)}}, + {{TOBN(0xb956970e, 0x2fdd23cc), TOBN(0xb80288bc, 0x5682e971), + TOBN(0xe6e6d91e, 0x9ae86ebc), TOBN(0x0564c83f, 0x8c9f1939)}, + {TOBN(0x551932a2, 0x39560368), TOBN(0xe893752b, 0x049c28e2), + TOBN(0x0b03cee5, 0xa6a158c3), TOBN(0xe12d656b, 0x04964263)}}, + {{TOBN(0x4b47554e, 0x63e3bc1d), TOBN(0xc719b6a2, 0x45044ff7), + TOBN(0x4f24d30a, 0xe48daa07), TOBN(0xa3f37556, 0xc8c1edc3)}, + {TOBN(0x9a47bf76, 0x0700d360), TOBN(0xbb1a1824, 0x822ae4e2), + TOBN(0x22e275a3, 0x89f1fb4c), TOBN(0x72b1aa23, 0x9968c5f5)}}, + {{TOBN(0xa75feaca, 0xbe063f64), TOBN(0x9b392f43, 0xbce47a09), + TOBN(0xd4241509, 0x1ad07aca), TOBN(0x4b0c591b, 0x8d26cd0f)}, + {TOBN(0x2d42ddfd, 0x92f1169a), TOBN(0x63aeb1ac, 0x4cbf2392), + TOBN(0x1de9e877, 0x0691a2af), TOBN(0xebe79af7, 0xd98021da)}}, + {{TOBN(0xcfdf2a4e, 0x40e50acf), TOBN(0xf0a98ad7, 0xaf01d665), + TOBN(0xefb640bf, 0x1831be1f), TOBN(0x6fe8bd2f, 0x80e9ada0)}, + {TOBN(0x94c103a1, 0x6cafbc91), TOBN(0x170f8759, 0x8308e08c), + TOBN(0x5de2d2ab, 0x9780ff4f), TOBN(0x666466bc, 0x45b201f2)}}, + {{TOBN(0x58af2010, 0xf5b343bc), TOBN(0x0f2e400a, 0xf2f142fe), + TOBN(0x3483bfde, 0xa85f4bdf), TOBN(0xf0b1d093, 0x03bfeaa9)}, + {TOBN(0x2ea01b95, 0xc7081603), TOBN(0xe943e4c9, 0x3dba1097), + TOBN(0x47be92ad, 0xb438f3a6), TOBN(0x00bb7742, 0xe5bf6636)}}, + {{TOBN(0x136b7083, 0x824297b4), TOBN(0x9d0e5580, 0x5584455f), + TOBN(0xab48cedc, 0xf1c7d69e), TOBN(0x53a9e481, 0x2a256e76)}, + {TOBN(0x0402b0e0, 0x65eb2413), TOBN(0xdadbbb84, 0x8fc407a7), + TOBN(0xa65cd5a4, 0x8d7f5492), TOBN(0x21d44293, 0x74bae294)}}, + {{TOBN(0x66917ce6, 0x3b5f1cc4), TOBN(0x37ae52ea, 0xce872e62), + TOBN(0xbb087b72, 0x2905f244), TOBN(0x12077086, 0x1e6af74f)}, + {TOBN(0x4b644e49, 0x1058edea), TOBN(0x827510e3, 0xb638ca1d), + TOBN(0x8cf2b704, 0x6038591c), TOBN(0xffc8b47a, 0xfe635063)}}, + {{TOBN(0x3ae220e6, 0x1b4d5e63), TOBN(0xbd864742, 0x9d961b4b), + TOBN(0x610c107e, 0x9bd16bed), TOBN(0x4270352a, 0x1127147b)}, + {TOBN(0x7d17ffe6, 0x64cfc50e), TOBN(0x50dee01a, 0x1e36cb42), + TOBN(0x068a7622, 0x35dc5f9a), TOBN(0x9a08d536, 0xdf53f62c)}}, + {{TOBN(0x4ed71457, 0x6be5f7de), TOBN(0xd93006f8, 0xc2263c9e), + TOBN(0xe073694c, 0xcacacb36), TOBN(0x2ff7a5b4, 0x3ae118ab)}, + {TOBN(0x3cce53f1, 0xcd871236), TOBN(0xf156a39d, 0xc2aa6d52), + TOBN(0x9cc5f271, 0xb198d76d), TOBN(0xbc615b6f, 0x81383d39)}}, + {{TOBN(0xa54538e8, 0xde3eee6b), TOBN(0x58c77538, 0xab910d91), + TOBN(0x31e5bdbc, 0x58d278bd), TOBN(0x3cde4adf, 0xb963acae)}, + {TOBN(0xb1881fd2, 0x5302169c), TOBN(0x8ca60fa0, 0xa989ed8b), + TOBN(0xa1999458, 0xff96a0ee), TOBN(0xc1141f03, 0xac6c283d)}}, + {{TOBN(0x7677408d, 0x6dfafed3), TOBN(0x33a01653, 0x39661588), + TOBN(0x3c9c15ec, 
0x0b726fa0), TOBN(0x090cfd93, 0x6c9b56da)}, + {TOBN(0xe34f4bae, 0xa3c40af5), TOBN(0x3469eadb, 0xd21129f1), + TOBN(0xcc51674a, 0x1e207ce8), TOBN(0x1e293b24, 0xc83b1ef9)}}, + {{TOBN(0x17173d13, 0x1e6c0bb4), TOBN(0x19004695, 0x90776d35), + TOBN(0xe7980e34, 0x6de6f922), TOBN(0x873554cb, 0xf4dd9a22)}, + {TOBN(0x0316c627, 0xcbf18a51), TOBN(0x4d93651b, 0x3032c081), + TOBN(0x207f2771, 0x3946834d), TOBN(0x2c08d7b4, 0x30cdbf80)}}, + {{TOBN(0x137a4fb4, 0x86df2a61), TOBN(0xa1ed9c07, 0xecf7b4a2), + TOBN(0xb2e460e2, 0x7bd042ff), TOBN(0xb7f5e2fa, 0x5f62f5ec)}, + {TOBN(0x7aa6ec6b, 0xcc2423b7), TOBN(0x75ce0a7f, 0xba63eea7), + TOBN(0x67a45fb1, 0xf250a6e1), TOBN(0x93bc919c, 0xe53cdc9f)}}, + {{TOBN(0x9271f56f, 0x871942df), TOBN(0x2372ff6f, 0x7859ad66), + TOBN(0x5f4c2b96, 0x33cb1a78), TOBN(0xe3e29101, 0x5838aa83)}, + {TOBN(0xa7ed1611, 0xe4e8110c), TOBN(0x2a2d70d5, 0x330198ce), + TOBN(0xbdf132e8, 0x6720efe0), TOBN(0xe61a8962, 0x66a471bf)}}, + {{TOBN(0x796d3a85, 0x825808bd), TOBN(0x51dc3cb7, 0x3fd6e902), + TOBN(0x643c768a, 0x916219d1), TOBN(0x36cd7685, 0xa2ad7d32)}, + {TOBN(0xe3db9d05, 0xb22922a4), TOBN(0x6494c87e, 0xdba29660), + TOBN(0xf0ac91df, 0xbcd2ebc7), TOBN(0x4deb57a0, 0x45107f8d)}}, + {{TOBN(0x42271f59, 0xc3d12a73), TOBN(0x5f71687c, 0xa5c2c51d), + TOBN(0xcb1f50c6, 0x05797bcb), TOBN(0x29ed0ed9, 0xd6d34eb0)}, + {TOBN(0xe5fe5b47, 0x4683c2eb), TOBN(0x4956eeb5, 0x97447c46), + TOBN(0x5b163a43, 0x71207167), TOBN(0x93fa2fed, 0x0248c5ef)}}, + {{TOBN(0x67930af2, 0x31f63950), TOBN(0xa77797c1, 0x14caa2c9), + TOBN(0x526e80ee, 0x27ac7e62), TOBN(0xe1e6e626, 0x58b28aec)}, + {TOBN(0x636178b0, 0xb3c9fef0), TOBN(0xaf7752e0, 0x6d5f90be), + TOBN(0x94ecaf18, 0xeece51cf), TOBN(0x2864d0ed, 0xca806e1f)}}, + {{TOBN(0x6de2e383, 0x97c69134), TOBN(0x5a42c316, 0xeb291293), + TOBN(0xc7779219, 0x6a60bae0), TOBN(0xa24de346, 0x6b7599d1)}, + {TOBN(0x49d374aa, 0xb75d4941), TOBN(0x98900586, 0x2d501ff0), + TOBN(0x9f16d40e, 0xeb7974cf), TOBN(0x1033860b, 0xcdd8c115)}}, + {{TOBN(0xb6c69ac8, 0x2094cec3), TOBN(0x9976fb88, 0x403b770c), + TOBN(0x1dea026c, 0x4859590d), TOBN(0xb6acbb46, 0x8562d1fd)}, + {TOBN(0x7cd6c461, 0x44569d85), TOBN(0xc3190a36, 0x97f0891d), + TOBN(0xc6f53195, 0x48d5a17d), TOBN(0x7d919966, 0xd749abc8)}}, + {{TOBN(0x65104837, 0xdd1c8a20), TOBN(0x7e5410c8, 0x2f683419), + TOBN(0x958c3ca8, 0xbe94022e), TOBN(0x605c3197, 0x6145dac2)}, + {TOBN(0x3fc07501, 0x01683d54), TOBN(0x1d7127c5, 0x595b1234), + TOBN(0x10b8f87c, 0x9481277f), TOBN(0x677db2a8, 0xe65a1adb)}}, + {{TOBN(0xec2fccaa, 0xddce3345), TOBN(0x2a6811b7, 0x012a4350), + TOBN(0x96760ff1, 0xac598bdc), TOBN(0x054d652a, 0xd1bf4128)}, + {TOBN(0x0a1151d4, 0x92a21005), TOBN(0xad7f3971, 0x33110fdf), + TOBN(0x8c95928c, 0x1960100f), TOBN(0x6c91c825, 0x7bf03362)}}, + {{TOBN(0xc8c8b2a2, 0xce309f06), TOBN(0xfdb27b59, 0xca27204b), + TOBN(0xd223eaa5, 0x0848e32e), TOBN(0xb93e4b2e, 0xe7bfaf1e)}, + {TOBN(0xc5308ae6, 0x44aa3ded), TOBN(0x317a666a, 0xc015d573), + TOBN(0xc888ce23, 0x1a979707), TOBN(0xf141c1e6, 0x0d5c4958)}}, + {{TOBN(0xb53b7de5, 0x61906373), TOBN(0x858dbade, 0xeb999595), + TOBN(0x8cbb47b2, 0xa59e5c36), TOBN(0x660318b3, 0xdcf4e842)}, + {TOBN(0xbd161ccd, 0x12ba4b7a), TOBN(0xf399daab, 0xf8c8282a), + TOBN(0x1587633a, 0xeeb2130d), TOBN(0xa465311a, 0xda38dd7d)}}, + {{TOBN(0x5f75eec8, 0x64d3779b), TOBN(0x3c5d0476, 0xad64c171), + TOBN(0x87410371, 0x2a914428), TOBN(0x8096a891, 0x90e2fc29)}, + {TOBN(0xd3d2ae9d, 0x23b3ebc2), TOBN(0x90bdd6db, 0xa580cfd6), + TOBN(0x52dbb7f3, 0xc5b01f6c), TOBN(0xe68eded4, 0xe102a2dc)}}, + {{TOBN(0x17785b77, 0x99eb6df0), TOBN(0x26c3cc51, 0x7386b779), + 
TOBN(0x345ed988, 0x6417a48e), TOBN(0xe990b4e4, 0x07d6ef31)}, + {TOBN(0x0f456b7e, 0x2586abba), TOBN(0x239ca6a5, 0x59c96e9a), + TOBN(0xe327459c, 0xe2eb4206), TOBN(0x3a4c3313, 0xa002b90a)}}, + {{TOBN(0x2a114806, 0xf6a3f6fb), TOBN(0xad5cad2f, 0x85c251dd), + TOBN(0x92c1f613, 0xf5a784d3), TOBN(0xec7bfacf, 0x349766d5)}, + {TOBN(0x04b3cd33, 0x3e23cb3b), TOBN(0x3979fe84, 0xc5a64b2d), + TOBN(0x192e2720, 0x7e589106), TOBN(0xa60c43d1, 0xa15b527f)}}, + {{TOBN(0x2dae9082, 0xbe7cf3a6), TOBN(0xcc86ba92, 0xbc967274), + TOBN(0xf28a2ce8, 0xaea0a8a9), TOBN(0x404ca6d9, 0x6ee988b3)}, + {TOBN(0xfd7e9c5d, 0x005921b8), TOBN(0xf56297f1, 0x44e79bf9), + TOBN(0xa163b460, 0x0d75ddc2), TOBN(0x30b23616, 0xa1f2be87)}}, + {{TOBN(0x4b070d21, 0xbfe50e2b), TOBN(0x7ef8cfd0, 0xe1bfede1), + TOBN(0xadba0011, 0x2aac4ae0), TOBN(0x2a3e7d01, 0xb9ebd033)}, + {TOBN(0x995277ec, 0xe38d9d1c), TOBN(0xb500249e, 0x9c5d2de3), + TOBN(0x8912b820, 0xf13ca8c9), TOBN(0xc8798114, 0x877793af)}}, + {{TOBN(0x19e6125d, 0xec3f1dec), TOBN(0x07b1f040, 0x911178da), + TOBN(0xd93ededa, 0x904a6738), TOBN(0x55187a5a, 0x0bebedcd)}, + {TOBN(0xf7d04722, 0xeb329d41), TOBN(0xf449099e, 0xf170b391), + TOBN(0xfd317a69, 0xca99f828), TOBN(0x50c3db2b, 0x34a4976d)}}, + {{TOBN(0xe9ba7784, 0x3757b392), TOBN(0x326caefd, 0xaa3ca05a), + TOBN(0x78e5293b, 0xf1e593d4), TOBN(0x7842a937, 0x0d98fd13)}, + {TOBN(0xe694bf96, 0x5f96b10d), TOBN(0x373a9df6, 0x06a8cd05), + TOBN(0x997d1e51, 0xe8f0c7fc), TOBN(0x1d019790, 0x63fd972e)}}, + {{TOBN(0x0064d858, 0x5499fb32), TOBN(0x7b67bad9, 0x77a8aeb7), + TOBN(0x1d3eb977, 0x2d08eec5), TOBN(0x5fc047a6, 0xcbabae1d)}, + {TOBN(0x0577d159, 0xe54a64bb), TOBN(0x8862201b, 0xc43497e4), + TOBN(0xad6b4e28, 0x2ce0608d), TOBN(0x8b687b7d, 0x0b167aac)}}, + {{TOBN(0x6ed4d367, 0x8b2ecfa9), TOBN(0x24dfe62d, 0xa90c3c38), + TOBN(0xa1862e10, 0x3fe5c42b), TOBN(0x1ca73dca, 0xd5732a9f)}, + {TOBN(0x35f038b7, 0x76bb87ad), TOBN(0x674976ab, 0xf242b81f), + TOBN(0x4f2bde7e, 0xb0fd90cd), TOBN(0x6efc172e, 0xa7fdf092)}}, + {{TOBN(0x3806b69b, 0x92222f1f), TOBN(0x5a2459ca, 0x6cf7ae70), + TOBN(0x6789f69c, 0xa85217ee), TOBN(0x5f232b5e, 0xe3dc85ac)}, + {TOBN(0x660e3ec5, 0x48e9e516), TOBN(0x124b4e47, 0x3197eb31), + TOBN(0x10a0cb13, 0xaafcca23), TOBN(0x7bd63ba4, 0x8213224f)}}, + {{TOBN(0xaffad7cc, 0x290a7f4f), TOBN(0x6b409c9e, 0x0286b461), + TOBN(0x58ab809f, 0xffa407af), TOBN(0xc3122eed, 0xc68ac073)}, + {TOBN(0x17bf9e50, 0x4ef24d7e), TOBN(0x5d929794, 0x3e2a5811), + TOBN(0x519bc867, 0x02902e01), TOBN(0x76bba5da, 0x39c8a851)}}, + {{TOBN(0xe9f9669c, 0xda94951e), TOBN(0x4b6af58d, 0x66b8d418), + TOBN(0xfa321074, 0x17d426a4), TOBN(0xc78e66a9, 0x9dde6027)}, + {TOBN(0x0516c083, 0x4a53b964), TOBN(0xfc659d38, 0xff602330), + TOBN(0x0ab55e5c, 0x58c5c897), TOBN(0x985099b2, 0x838bc5df)}}, + {{TOBN(0x061d9efc, 0xc52fc238), TOBN(0x712b2728, 0x6ac1da3f), + TOBN(0xfb658149, 0x9283fe08), TOBN(0x4954ac94, 0xb8aaa2f7)}, + {TOBN(0x85c0ada4, 0x7fb2e74f), TOBN(0xee8ba98e, 0xb89926b0), + TOBN(0xe4f9d37d, 0x23d1af5b), TOBN(0x14ccdbf9, 0xba9b015e)}}, + {{TOBN(0xb674481b, 0x7bfe7178), TOBN(0x4e1debae, 0x65405868), + TOBN(0x061b2821, 0xc48c867d), TOBN(0x69c15b35, 0x513b30ea)}, + {TOBN(0x3b4a1666, 0x36871088), TOBN(0xe5e29f5d, 0x1220b1ff), + TOBN(0x4b82bb35, 0x233d9f4d), TOBN(0x4e076333, 0x18cdc675)}}}, + {{{TOBN(0x0d53f5c7, 0xa3e6fced), TOBN(0xe8cbbdd5, 0xf45fbdeb), + TOBN(0xf85c01df, 0x13339a70), TOBN(0x0ff71880, 0x142ceb81)}, + {TOBN(0x4c4e8774, 0xbd70437a), TOBN(0x5fb32891, 0xba0bda6a), + TOBN(0x1cdbebd2, 0xf18bd26e), TOBN(0x2f9526f1, 0x03a9d522)}}, + {{TOBN(0x40ce3051, 0x92c4d684), TOBN(0x8b04d725, 
0x7612efcd), + TOBN(0xb9dcda36, 0x6f9cae20), TOBN(0x0edc4d24, 0xf058856c)}, + {TOBN(0x64f2e6bf, 0x85427900), TOBN(0x3de81295, 0xdc09dfea), + TOBN(0xd41b4487, 0x379bf26c), TOBN(0x50b62c6d, 0x6df135a9)}}, + {{TOBN(0xd4f8e3b4, 0xc72dfe67), TOBN(0xc416b0f6, 0x90e19fdf), + TOBN(0x18b9098d, 0x4c13bd35), TOBN(0xac11118a, 0x15b8cb9e)}, + {TOBN(0xf598a318, 0xf0062841), TOBN(0xbfe0602f, 0x89f356f4), + TOBN(0x7ae3637e, 0x30177a0c), TOBN(0x34097747, 0x61136537)}}, + {{TOBN(0x0db2fb5e, 0xd005832a), TOBN(0x5f5efd3b, 0x91042e4f), + TOBN(0x8c4ffdc6, 0xed70f8ca), TOBN(0xe4645d0b, 0xb52da9cc)}, + {TOBN(0x9596f58b, 0xc9001d1f), TOBN(0x52c8f0bc, 0x4e117205), + TOBN(0xfd4aa0d2, 0xe398a084), TOBN(0x815bfe3a, 0x104f49de)}}, + {{TOBN(0x97e5443f, 0x23885e5f), TOBN(0xf72f8f99, 0xe8433aab), + TOBN(0xbd00b154, 0xe4d4e604), TOBN(0xd0b35e6a, 0xe5e173ff)}, + {TOBN(0x57b2a048, 0x9164722d), TOBN(0x3e3c665b, 0x88761ec8), + TOBN(0x6bdd1397, 0x3da83832), TOBN(0x3c8b1a1e, 0x73dafe3b)}}, + {{TOBN(0x4497ace6, 0x54317cac), TOBN(0xbe600ab9, 0x521771b3), + TOBN(0xb42e409e, 0xb0dfe8b8), TOBN(0x386a67d7, 0x3942310f)}, + {TOBN(0x25548d8d, 0x4431cc28), TOBN(0xa7cff142, 0x985dc524), + TOBN(0x4d60f5a1, 0x93c4be32), TOBN(0x83ebd5c8, 0xd071c6e1)}}, + {{TOBN(0xba3a80a7, 0xb1fd2b0b), TOBN(0x9b3ad396, 0x5bec33e8), + TOBN(0xb3868d61, 0x79743fb3), TOBN(0xcfd169fc, 0xfdb462fa)}, + {TOBN(0xd3b499d7, 0x9ce0a6af), TOBN(0x55dc1cf1, 0xe42d3ff8), + TOBN(0x04fb9e6c, 0xc6c3e1b2), TOBN(0x47e6961d, 0x6f69a474)}}, + {{TOBN(0x54eb3acc, 0xe548b37b), TOBN(0xb38e7542, 0x84d40549), + TOBN(0x8c3daa51, 0x7b341b4f), TOBN(0x2f6928ec, 0x690bf7fa)}, + {TOBN(0x0496b323, 0x86ce6c41), TOBN(0x01be1c55, 0x10adadcd), + TOBN(0xc04e67e7, 0x4bb5faf9), TOBN(0x3cbaf678, 0xe15c9985)}}, + {{TOBN(0x8cd12145, 0x50ca4247), TOBN(0xba1aa47a, 0xe7dd30aa), + TOBN(0x2f81ddf1, 0xe58fee24), TOBN(0x03452936, 0xeec9b0e8)}, + {TOBN(0x8bdc3b81, 0x243aea96), TOBN(0x9a2919af, 0x15c3d0e5), + TOBN(0x9ea640ec, 0x10948361), TOBN(0x5ac86d5b, 0x6e0bcccf)}}, + {{TOBN(0xf892d918, 0xc36cf440), TOBN(0xaed3e837, 0xc939719c), + TOBN(0xb07b08d2, 0xc0218b64), TOBN(0x6f1bcbba, 0xce9790dd)}, + {TOBN(0x4a84d6ed, 0x60919b8e), TOBN(0xd8900791, 0x8ac1f9eb), + TOBN(0xf84941aa, 0x0dd5daef), TOBN(0xb22fe40a, 0x67fd62c5)}}, + {{TOBN(0x97e15ba2, 0x157f2db3), TOBN(0xbda2fc8f, 0x8e28ca9c), + TOBN(0x5d050da4, 0x37b9f454), TOBN(0x3d57eb57, 0x2379d72e)}, + {TOBN(0xe9b5eba2, 0xfb5ee997), TOBN(0x01648ca2, 0xe11538ca), + TOBN(0x32bb76f6, 0xf6327974), TOBN(0x338f14b8, 0xff3f4bb7)}}, + {{TOBN(0x524d226a, 0xd7ab9a2d), TOBN(0x9c00090d, 0x7dfae958), + TOBN(0x0ba5f539, 0x8751d8c2), TOBN(0x8afcbcdd, 0x3ab8262d)}, + {TOBN(0x57392729, 0xe99d043b), TOBN(0xef51263b, 0xaebc943a), + TOBN(0x9feace93, 0x20862935), TOBN(0x639efc03, 0xb06c817b)}}, + {{TOBN(0x1fe054b3, 0x66b4be7a), TOBN(0x3f25a9de, 0x84a37a1e), + TOBN(0xf39ef1ad, 0x78d75cd9), TOBN(0xd7b58f49, 0x5062c1b5)}, + {TOBN(0x6f74f9a9, 0xff563436), TOBN(0xf718ff29, 0xe8af51e7), + TOBN(0x5234d313, 0x15e97fec), TOBN(0xb6a8e2b1, 0x292f1c0a)}}, + {{TOBN(0xa7f53aa8, 0x327720c1), TOBN(0x956ca322, 0xba092cc8), + TOBN(0x8f03d64a, 0x28746c4d), TOBN(0x51fe1782, 0x66d0d392)}, + {TOBN(0xd19b34db, 0x3c832c80), TOBN(0x60dccc5c, 0x6da2e3b4), + TOBN(0x245dd62e, 0x0a104ccc), TOBN(0xa7ab1de1, 0x620b21fd)}}, + {{TOBN(0xb293ae0b, 0x3893d123), TOBN(0xf7b75783, 0xb15ee71c), + TOBN(0x5aa3c614, 0x42a9468b), TOBN(0xd686123c, 0xdb15d744)}, + {TOBN(0x8c616891, 0xa7ab4116), TOBN(0x6fcd72c8, 0xa4e6a459), + TOBN(0xac219110, 0x77e5fad7), TOBN(0xfb6a20e7, 0x704fa46b)}}, + {{TOBN(0xe839be7d, 0x341d81dc), 
TOBN(0xcddb6889, 0x32148379), + TOBN(0xda6211a1, 0xf7026ead), TOBN(0xf3b2575f, 0xf4d1cc5e)}, + {TOBN(0x40cfc8f6, 0xa7a73ae6), TOBN(0x83879a5e, 0x61d5b483), + TOBN(0xc5acb1ed, 0x41a50ebc), TOBN(0x59a60cc8, 0x3c07d8fa)}}, + {{TOBN(0x1b73bdce, 0xb1876262), TOBN(0x2b0d79f0, 0x12af4ee9), + TOBN(0x8bcf3b0b, 0xd46e1d07), TOBN(0x17d6af9d, 0xe45d152f)}, + {TOBN(0x73520461, 0x6d736451), TOBN(0x43cbbd97, 0x56b0bf5a), + TOBN(0xb0833a5b, 0xd5999b9d), TOBN(0x702614f0, 0xeb72e398)}}, + {{TOBN(0x0aadf01a, 0x59c3e9f8), TOBN(0x40200e77, 0xce6b3d16), + TOBN(0xda22bdd3, 0xdeddafad), TOBN(0x76dedaf4, 0x310d72e1)}, + {TOBN(0x49ef807c, 0x4bc2e88f), TOBN(0x6ba81291, 0x146dd5a5), + TOBN(0xa1a4077a, 0x7d8d59e9), TOBN(0x87b6a2e7, 0x802db349)}}, + {{TOBN(0xd5679997, 0x1b4e598e), TOBN(0xf499ef1f, 0x06fe4b1d), + TOBN(0x3978d3ae, 0xfcb267c5), TOBN(0xb582b557, 0x235786d0)}, + {TOBN(0x32b3b2ca, 0x1715cb07), TOBN(0x4c3de6a2, 0x8480241d), + TOBN(0x63b5ffed, 0xcb571ecd), TOBN(0xeaf53900, 0xed2fe9a9)}}, + {{TOBN(0xdec98d4a, 0xc3b81990), TOBN(0x1cb83722, 0x9e0cc8fe), + TOBN(0xfe0b0491, 0xd2b427b9), TOBN(0x0f2386ac, 0xe983a66c)}, + {TOBN(0x930c4d1e, 0xb3291213), TOBN(0xa2f82b2e, 0x59a62ae4), + TOBN(0x77233853, 0xf93e89e3), TOBN(0x7f8063ac, 0x11777c7f)}}, + {{TOBN(0xff0eb567, 0x59ad2877), TOBN(0x6f454642, 0x9865c754), + TOBN(0xe6fe701a, 0x236e9a84), TOBN(0xc586ef16, 0x06e40fc3)}, + {TOBN(0x3f62b6e0, 0x24bafad9), TOBN(0xc8b42bd2, 0x64da906a), + TOBN(0xc98e1eb4, 0xda3276a0), TOBN(0x30d0e5fc, 0x06cbf852)}}, + {{TOBN(0x1b6b2ae1, 0xe8b4dfd4), TOBN(0xd754d5c7, 0x8301cbac), + TOBN(0x66097629, 0x112a39ac), TOBN(0xf86b5999, 0x93ba4ab9)}, + {TOBN(0x26c9dea7, 0x99f9d581), TOBN(0x0473b1a8, 0xc2fafeaa), + TOBN(0x1469af55, 0x3b2505a5), TOBN(0x227d16d7, 0xd6a43323)}}, + {{TOBN(0x3316f73c, 0xad3d97f9), TOBN(0x52bf3bb5, 0x1f137455), + TOBN(0x953eafeb, 0x09954e7c), TOBN(0xa721dfed, 0xdd732411)}, + {TOBN(0xb4929821, 0x141d4579), TOBN(0x3411321c, 0xaa3bd435), + TOBN(0xafb355aa, 0x17fa6015), TOBN(0xb4e7ef4a, 0x18e42f0e)}}, + {{TOBN(0x604ac97c, 0x59371000), TOBN(0xe1c48c70, 0x7f759c18), + TOBN(0x3f62ecc5, 0xa5db6b65), TOBN(0x0a78b173, 0x38a21495)}, + {TOBN(0x6be1819d, 0xbcc8ad94), TOBN(0x70dc04f6, 0xd89c3400), + TOBN(0x462557b4, 0xa6b4840a), TOBN(0x544c6ade, 0x60bd21c0)}}, + {{TOBN(0x6a00f24e, 0x907a544b), TOBN(0xa7520dcb, 0x313da210), + TOBN(0xfe939b75, 0x11e4994b), TOBN(0x918b6ba6, 0xbc275d70)}, + {TOBN(0xd3e5e0fc, 0x644be892), TOBN(0x707a9816, 0xfdaf6c42), + TOBN(0x60145567, 0xf15c13fe), TOBN(0x4818ebaa, 0xe130a54a)}}, + {{TOBN(0x28aad3ad, 0x58d2f767), TOBN(0xdc5267fd, 0xd7e7c773), + TOBN(0x4919cc88, 0xc3afcc98), TOBN(0xaa2e6ab0, 0x2db8cd4b)}, + {TOBN(0xd46fec04, 0xd0c63eaa), TOBN(0xa1cb92c5, 0x19ffa832), + TOBN(0x678dd178, 0xe43a631f), TOBN(0xfb5ae1cd, 0x3dc788b3)}}, + {{TOBN(0x68b4fb90, 0x6e77de04), TOBN(0x7992bcf0, 0xf06dbb97), + TOBN(0x896e6a13, 0xc417c01d), TOBN(0x8d96332c, 0xb956be01)}, + {TOBN(0x902fc93a, 0x413aa2b9), TOBN(0x99a4d915, 0xfc98c8a5), + TOBN(0x52c29407, 0x565f1137), TOBN(0x4072690f, 0x21e4f281)}}, + {{TOBN(0x36e607cf, 0x02ff6072), TOBN(0xa47d2ca9, 0x8ad98cdc), + TOBN(0xbf471d1e, 0xf5f56609), TOBN(0xbcf86623, 0xf264ada0)}, + {TOBN(0xb70c0687, 0xaa9e5cb6), TOBN(0xc98124f2, 0x17401c6c), + TOBN(0x8189635f, 0xd4a61435), TOBN(0xd28fb8af, 0xa9d98ea6)}}, + {{TOBN(0xb9a67c2a, 0x40c251f8), TOBN(0x88cd5d87, 0xa2da44be), + TOBN(0x437deb96, 0xe09b5423), TOBN(0x150467db, 0x64287dc1)}, + {TOBN(0xe161debb, 0xcdabb839), TOBN(0xa79e9742, 0xf1839a3e), + TOBN(0xbb8dd3c2, 0x652d202b), TOBN(0x7b3e67f7, 0xe9f97d96)}}, + {{TOBN(0x5aa5d78f, 
0xb1cb6ac9), TOBN(0xffa13e8e, 0xca1d0d45), + TOBN(0x369295dd, 0x2ba5bf95), TOBN(0xd68bd1f8, 0x39aff05e)}, + {TOBN(0xaf0d86f9, 0x26d783f2), TOBN(0x543a59b3, 0xfc3aafc1), + TOBN(0x3fcf81d2, 0x7b7da97c), TOBN(0xc990a056, 0xd25dee46)}}, + {{TOBN(0x3e6775b8, 0x519cce2c), TOBN(0xfc9af71f, 0xae13d863), + TOBN(0x774a4a6f, 0x47c1605c), TOBN(0x46ba4245, 0x2fd205e8)}, + {TOBN(0xa06feea4, 0xd3fd524d), TOBN(0x1e724641, 0x6de1acc2), + TOBN(0xf53816f1, 0x334e2b42), TOBN(0x49e5918e, 0x922f0024)}}, + {{TOBN(0x439530b6, 0x65c7322d), TOBN(0xcf12cc01, 0xb3c1b3fb), + TOBN(0xc70b0186, 0x0172f685), TOBN(0xb915ee22, 0x1b58391d)}, + {TOBN(0x9afdf03b, 0xa317db24), TOBN(0x87dec659, 0x17b8ffc4), + TOBN(0x7f46597b, 0xe4d3d050), TOBN(0x80a1c1ed, 0x006500e7)}}, + {{TOBN(0x84902a96, 0x78bf030e), TOBN(0xfb5e9c9a, 0x50560148), + TOBN(0x6dae0a92, 0x63362426), TOBN(0xdcaeecf4, 0xa9e30c40)}, + {TOBN(0xc0d887bb, 0x518d0c6b), TOBN(0x99181152, 0xcb985b9d), + TOBN(0xad186898, 0xef7bc381), TOBN(0x18168ffb, 0x9ee46201)}}, + {{TOBN(0x9a04cdaa, 0x2502753c), TOBN(0xbb279e26, 0x51407c41), + TOBN(0xeacb03aa, 0xf23564e5), TOBN(0x18336582, 0x71e61016)}, + {TOBN(0x8684b8c4, 0xeb809877), TOBN(0xb336e18d, 0xea0e672e), + TOBN(0xefb601f0, 0x34ee5867), TOBN(0x2733edbe, 0x1341cfd1)}}, + {{TOBN(0xb15e809a, 0x26025c3c), TOBN(0xe6e981a6, 0x9350df88), + TOBN(0x92376237, 0x8502fd8e), TOBN(0x4791f216, 0x0c12be9b)}, + {TOBN(0xb7256789, 0x25f02425), TOBN(0xec863194, 0x7a974443), + TOBN(0x7c0ce882, 0xfb41cc52), TOBN(0xc266ff7e, 0xf25c07f2)}}, + {{TOBN(0x3d4da8c3, 0x017025f3), TOBN(0xefcf628c, 0xfb9579b4), + TOBN(0x5c4d0016, 0x1f3716ec), TOBN(0x9c27ebc4, 0x6801116e)}, + {TOBN(0x5eba0ea1, 0x1da1767e), TOBN(0xfe151452, 0x47004c57), + TOBN(0x3ace6df6, 0x8c2373b7), TOBN(0x75c3dffe, 0x5dbc37ac)}}, + {{TOBN(0x3dc32a73, 0xddc925fc), TOBN(0xb679c841, 0x2f65ee0b), + TOBN(0x715a3295, 0x451cbfeb), TOBN(0xd9889768, 0xf76e9a29)}, + {TOBN(0xec20ce7f, 0xb28ad247), TOBN(0xe99146c4, 0x00894d79), + TOBN(0x71457d7c, 0x9f5e3ea7), TOBN(0x097b2662, 0x38030031)}}, + {{TOBN(0xdb7f6ae6, 0xcf9f82a8), TOBN(0x319decb9, 0x438f473a), + TOBN(0xa63ab386, 0x283856c3), TOBN(0x13e3172f, 0xb06a361b)}, + {TOBN(0x2959f8dc, 0x7d5a006c), TOBN(0x2dbc27c6, 0x75fba752), + TOBN(0xc1227ab2, 0x87c22c9e), TOBN(0x06f61f75, 0x71a268b2)}}, + {{TOBN(0x1b6bb971, 0x04779ce2), TOBN(0xaca83812, 0x0aadcb1d), + TOBN(0x297ae0bc, 0xaeaab2d5), TOBN(0xa5c14ee7, 0x5bfb9f13)}, + {TOBN(0xaa00c583, 0xf17a62c7), TOBN(0x39eb962c, 0x173759f6), + TOBN(0x1eeba1d4, 0x86c9a88f), TOBN(0x0ab6c37a, 0xdf016c5e)}}, + {{TOBN(0xa2a147db, 0xa28a0749), TOBN(0x246c20d6, 0xee519165), + TOBN(0x5068d1b1, 0xd3810715), TOBN(0xb1e7018c, 0x748160b9)}, + {TOBN(0x03f5b1fa, 0xf380ff62), TOBN(0xef7fb1dd, 0xf3cb2c1e), + TOBN(0xeab539a8, 0xfc91a7da), TOBN(0x83ddb707, 0xf3f9b561)}}, + {{TOBN(0xc550e211, 0xfe7df7a4), TOBN(0xa7cd07f2, 0x063f6f40), + TOBN(0xb0de3635, 0x2976879c), TOBN(0xb5f83f85, 0xe55741da)}, + {TOBN(0x4ea9d25e, 0xf3d8ac3d), TOBN(0x6fe2066f, 0x62819f02), + TOBN(0x4ab2b9c2, 0xcef4a564), TOBN(0x1e155d96, 0x5ffa2de3)}}, + {{TOBN(0x0eb0a19b, 0xc3a72d00), TOBN(0x4037665b, 0x8513c31b), + TOBN(0x2fb2b6bf, 0x04c64637), TOBN(0x45c34d6e, 0x08cdc639)}, + {TOBN(0x56f1e10f, 0xf01fd796), TOBN(0x4dfb8101, 0xfe3667b8), + TOBN(0xe0eda253, 0x9021d0c0), TOBN(0x7a94e9ff, 0x8a06c6ab)}}, + {{TOBN(0x2d3bb0d9, 0xbb9aa882), TOBN(0xea20e4e5, 0xec05fd10), + TOBN(0xed7eeb5f, 0x1a1ca64e), TOBN(0x2fa6b43c, 0xc6327cbd)}, + {TOBN(0xb577e3cf, 0x3aa91121), TOBN(0x8c6bd5ea, 0x3a34079b), + TOBN(0xd7e5ba39, 0x60e02fc0), TOBN(0xf16dd2c3, 0x90141bf8)}}, + 
{{TOBN(0xb57276d9, 0x80101b98), TOBN(0x760883fd, 0xb82f0f66), + TOBN(0x89d7de75, 0x4bc3eff3), TOBN(0x03b60643, 0x5dc2ab40)}, + {TOBN(0xcd6e53df, 0xe05beeac), TOBN(0xf2f1e862, 0xbc3325cd), + TOBN(0xdd0f7921, 0x774f03c3), TOBN(0x97ca7221, 0x4552cc1b)}}, + {{TOBN(0x5a0d6afe, 0x1cd19f72), TOBN(0xa20915dc, 0xf183fbeb), + TOBN(0x9fda4b40, 0x832c403c), TOBN(0x32738edd, 0xbe425442)}, + {TOBN(0x469a1df6, 0xb5eccf1a), TOBN(0x4b5aff42, 0x28bbe1f0), + TOBN(0x31359d7f, 0x570dfc93), TOBN(0xa18be235, 0xf0088628)}}, + {{TOBN(0xa5b30fba, 0xb00ed3a9), TOBN(0x34c61374, 0x73cdf8be), + TOBN(0x2c5c5f46, 0xabc56797), TOBN(0x5cecf93d, 0xb82a8ae2)}, + {TOBN(0x7d3dbe41, 0xa968fbf0), TOBN(0xd23d4583, 0x1a5c7f3d), + TOBN(0xf28f69a0, 0xc087a9c7), TOBN(0xc2d75471, 0x474471ca)}}, + {{TOBN(0x36ec9f4a, 0x4eb732ec), TOBN(0x6c943bbd, 0xb1ca6bed), + TOBN(0xd64535e1, 0xf2457892), TOBN(0x8b84a8ea, 0xf7e2ac06)}, + {TOBN(0xe0936cd3, 0x2499dd5f), TOBN(0x12053d7e, 0x0ed04e57), + TOBN(0x4bdd0076, 0xe4305d9d), TOBN(0x34a527b9, 0x1f67f0a2)}}, + {{TOBN(0xe79a4af0, 0x9cec46ea), TOBN(0xb15347a1, 0x658b9bc7), + TOBN(0x6bd2796f, 0x35af2f75), TOBN(0xac957990, 0x4051c435)}, + {TOBN(0x2669dda3, 0xc33a655d), TOBN(0x5d503c2e, 0x88514aa3), + TOBN(0xdfa11337, 0x3753dd41), TOBN(0x3f054673, 0x0b754f78)}}, + {{TOBN(0xbf185677, 0x496125bd), TOBN(0xfb0023c8, 0x3775006c), + TOBN(0xfa0f072f, 0x3a037899), TOBN(0x4222b6eb, 0x0e4aea57)}, + {TOBN(0x3dde5e76, 0x7866d25a), TOBN(0xb6eb04f8, 0x4837aa6f), + TOBN(0x5315591a, 0x2cf1cdb8), TOBN(0x6dfb4f41, 0x2d4e683c)}}, + {{TOBN(0x7e923ea4, 0x48ee1f3a), TOBN(0x9604d9f7, 0x05a2afd5), + TOBN(0xbe1d4a33, 0x40ea4948), TOBN(0x5b45f1f4, 0xb44cbd2f)}, + {TOBN(0x5faf8376, 0x4acc757e), TOBN(0xa7cf9ab8, 0x63d68ff7), + TOBN(0x8ad62f69, 0xdf0e404b), TOBN(0xd65f33c2, 0x12bdafdf)}}, + {{TOBN(0xc365de15, 0xa377b14e), TOBN(0x6bf5463b, 0x8e39f60c), + TOBN(0x62030d2d, 0x2ce68148), TOBN(0xd95867ef, 0xe6f843a8)}, + {TOBN(0xd39a0244, 0xef5ab017), TOBN(0x0bd2d8c1, 0x4ab55d12), + TOBN(0xc9503db3, 0x41639169), TOBN(0x2d4e25b0, 0xf7660c8a)}}, + {{TOBN(0x760cb3b5, 0xe224c5d7), TOBN(0xfa3baf8c, 0x68616919), + TOBN(0x9fbca113, 0x8d142552), TOBN(0x1ab18bf1, 0x7669ebf5)}, + {TOBN(0x55e6f53e, 0x9bdf25dd), TOBN(0x04cc0bf3, 0xcb6cd154), + TOBN(0x595bef49, 0x95e89080), TOBN(0xfe9459a8, 0x104a9ac1)}}, + {{TOBN(0xad2d89ca, 0xcce9bb32), TOBN(0xddea65e1, 0xf7de8285), + TOBN(0x62ed8c35, 0xb351bd4b), TOBN(0x4150ff36, 0x0c0e19a7)}, + {TOBN(0x86e3c801, 0x345f4e47), TOBN(0x3bf21f71, 0x203a266c), + TOBN(0x7ae110d4, 0x855b1f13), TOBN(0x5d6aaf6a, 0x07262517)}}, + {{TOBN(0x1e0f12e1, 0x813d28f1), TOBN(0x6000e11d, 0x7ad7a523), + TOBN(0xc7d8deef, 0xc744a17b), TOBN(0x1e990b48, 0x14c05a00)}, + {TOBN(0x68fddaee, 0x93e976d5), TOBN(0x696241d1, 0x46610d63), + TOBN(0xb204e7c3, 0x893dda88), TOBN(0x8bccfa65, 0x6a3a6946)}}, + {{TOBN(0xb59425b4, 0xc5cd1411), TOBN(0x701b4042, 0xff3658b1), + TOBN(0xe3e56bca, 0x4784cf93), TOBN(0x27de5f15, 0x8fe68d60)}, + {TOBN(0x4ab9cfce, 0xf8d53f19), TOBN(0xddb10311, 0xa40a730d), + TOBN(0x6fa73cd1, 0x4eee0a8a), TOBN(0xfd548748, 0x5249719d)}}, + {{TOBN(0x49d66316, 0xa8123ef0), TOBN(0x73c32db4, 0xe7f95438), + TOBN(0x2e2ed209, 0x0d9e7854), TOBN(0xf98a9329, 0x9d9f0507)}, + {TOBN(0xc5d33cf6, 0x0c6aa20a), TOBN(0x9a32ba14, 0x75279bb2), + TOBN(0x7e3202cb, 0x774a7307), TOBN(0x64ed4bc4, 0xe8c42dbd)}}, + {{TOBN(0xc20f1a06, 0xd4caed0d), TOBN(0xb8021407, 0x171d22b3), + TOBN(0xd426ca04, 0xd13268d7), TOBN(0x92377007, 0x25f4d126)}, + {TOBN(0x4204cbc3, 0x71f21a85), TOBN(0x18461b7a, 0xf82369ba), + TOBN(0xc0c07d31, 0x3fc858f9), TOBN(0x5deb5a50, 0xe2bab569)}}, 
+ {{TOBN(0xd5959d46, 0xd5eea89e), TOBN(0xfdff8424, 0x08437f4b), + TOBN(0xf21071e4, 0x3cfe254f), TOBN(0x72417696, 0x95468321)}, + {TOBN(0x5d8288b9, 0x102cae3e), TOBN(0x2d143e3d, 0xf1965dff), + TOBN(0x00c9a376, 0xa078d847), TOBN(0x6fc0da31, 0x26028731)}}, + {{TOBN(0xa2baeadf, 0xe45083a2), TOBN(0x66bc7218, 0x5e5b4bcd), + TOBN(0x2c826442, 0xd04b8e7f), TOBN(0xc19f5451, 0x6c4b586b)}, + {TOBN(0x60182c49, 0x5b7eeed5), TOBN(0xd9954ecd, 0x7aa9dfa1), + TOBN(0xa403a8ec, 0xc73884ad), TOBN(0x7fb17de2, 0x9bb39041)}}, + {{TOBN(0x694b64c5, 0xabb020e8), TOBN(0x3d18c184, 0x19c4eec7), + TOBN(0x9c4673ef, 0x1c4793e5), TOBN(0xc7b8aeb5, 0x056092e6)}, + {TOBN(0x3aa1ca43, 0xf0f8c16b), TOBN(0x224ed5ec, 0xd679b2f6), + TOBN(0x0d56eeaf, 0x55a205c9), TOBN(0xbfe115ba, 0x4b8e028b)}}, + {{TOBN(0x97e60849, 0x3927f4fe), TOBN(0xf91fbf94, 0x759aa7c5), + TOBN(0x985af769, 0x6be90a51), TOBN(0xc1277b78, 0x78ccb823)}, + {TOBN(0x395b656e, 0xe7a75952), TOBN(0x00df7de0, 0x928da5f5), + TOBN(0x09c23175, 0x4ca4454f), TOBN(0x4ec971f4, 0x7aa2d3c1)}}, + {{TOBN(0x45c3c507, 0xe75d9ccc), TOBN(0x63b7be8a, 0x3dc90306), + TOBN(0x37e09c66, 0x5db44bdc), TOBN(0x50d60da1, 0x6841c6a2)}, + {TOBN(0x6f9b65ee, 0x08df1b12), TOBN(0x38734879, 0x7ff089df), + TOBN(0x9c331a66, 0x3fe8013d), TOBN(0x017f5de9, 0x5f42fcc8)}}, + {{TOBN(0x43077866, 0xe8e57567), TOBN(0xc9f781ce, 0xf9fcdb18), + TOBN(0x38131dda, 0x9b12e174), TOBN(0x25d84aa3, 0x8a03752a)}, + {TOBN(0x45e09e09, 0x4d0c0ce2), TOBN(0x1564008b, 0x92bebba5), + TOBN(0xf7e8ad31, 0xa87284c7), TOBN(0xb7c4b46c, 0x97e7bbaa)}}, + {{TOBN(0x3e22a7b3, 0x97acf4ec), TOBN(0x0426c400, 0x5ea8b640), + TOBN(0x5e3295a6, 0x4e969285), TOBN(0x22aabc59, 0xa6a45670)}, + {TOBN(0xb929714c, 0x5f5942bc), TOBN(0x9a6168bd, 0xfa3182ed), + TOBN(0x2216a665, 0x104152ba), TOBN(0x46908d03, 0xb6926368)}}}, + {{{TOBN(0xa9f5d874, 0x5a1251fb), TOBN(0x967747a8, 0xc72725c7), + TOBN(0x195c33e5, 0x31ffe89e), TOBN(0x609d210f, 0xe964935e)}, + {TOBN(0xcafd6ca8, 0x2fe12227), TOBN(0xaf9b5b96, 0x0426469d), + TOBN(0x2e9ee04c, 0x5693183c), TOBN(0x1084a333, 0xc8146fef)}}, + {{TOBN(0x96649933, 0xaed1d1f7), TOBN(0x566eaff3, 0x50563090), + TOBN(0x345057f0, 0xad2e39cf), TOBN(0x148ff65b, 0x1f832124)}, + {TOBN(0x042e89d4, 0xcf94cf0d), TOBN(0x319bec84, 0x520c58b3), + TOBN(0x2a267626, 0x5361aa0d), TOBN(0xc86fa302, 0x8fbc87ad)}}, + {{TOBN(0xfc83d2ab, 0x5c8b06d5), TOBN(0xb1a785a2, 0xfe4eac46), + TOBN(0xb99315bc, 0x846f7779), TOBN(0xcf31d816, 0xef9ea505)}, + {TOBN(0x2391fe6a, 0x15d7dc85), TOBN(0x2f132b04, 0xb4016b33), + TOBN(0x29547fe3, 0x181cb4c7), TOBN(0xdb66d8a6, 0x650155a1)}}, + {{TOBN(0x6b66d7e1, 0xadc1696f), TOBN(0x98ebe593, 0x0acd72d0), + TOBN(0x65f24550, 0xcc1b7435), TOBN(0xce231393, 0xb4b9a5ec)}, + {TOBN(0x234a22d4, 0xdb067df9), TOBN(0x98dda095, 0xcaff9b00), + TOBN(0x1bbc75a0, 0x6100c9c1), TOBN(0x1560a9c8, 0x939cf695)}}, + {{TOBN(0xcf006d3e, 0x99e0925f), TOBN(0x2dd74a96, 0x6322375a), + TOBN(0xc58b446a, 0xb56af5ba), TOBN(0x50292683, 0xe0b9b4f1)}, + {TOBN(0xe2c34cb4, 0x1aeaffa3), TOBN(0x8b17203f, 0x9b9587c1), + TOBN(0x6d559207, 0xead1350c), TOBN(0x2b66a215, 0xfb7f9604)}}, + {{TOBN(0x0850325e, 0xfe51bf74), TOBN(0x9c4f579e, 0x5e460094), + TOBN(0x5c87b92a, 0x76da2f25), TOBN(0x889de4e0, 0x6febef33)}, + {TOBN(0x6900ec06, 0x646083ce), TOBN(0xbe2a0335, 0xbfe12773), + TOBN(0xadd1da35, 0xc5344110), TOBN(0x757568b7, 0xb802cd20)}}, + {{TOBN(0x75559779, 0x00f7e6c8), TOBN(0x38e8b94f, 0x0facd2f0), + TOBN(0xfea1f3af, 0x03fde375), TOBN(0x5e11a1d8, 0x75881dfc)}, + {TOBN(0xb3a6b02e, 0xc1e2f2ef), TOBN(0x193d2bbb, 0xc605a6c5), + TOBN(0x325ffeee, 0x339a0b2d), TOBN(0x27b6a724, 
0x9e0c8846)}}, + {{TOBN(0xe4050f1c, 0xf1c367ca), TOBN(0x9bc85a9b, 0xc90fbc7d), + TOBN(0xa373c4a2, 0xe1a11032), TOBN(0xb64232b7, 0xad0393a9)}, + {TOBN(0xf5577eb0, 0x167dad29), TOBN(0x1604f301, 0x94b78ab2), + TOBN(0x0baa94af, 0xe829348b), TOBN(0x77fbd8dd, 0x41654342)}}, + {{TOBN(0xdab50ea5, 0xb964e39a), TOBN(0xd4c29e3c, 0xd0d3c76e), + TOBN(0x80dae67c, 0x56d11964), TOBN(0x7307a8bf, 0xe5ffcc2f)}, + {TOBN(0x65bbc1aa, 0x91708c3b), TOBN(0xa151e62c, 0x28bf0eeb), + TOBN(0x6cb53381, 0x6fa34db7), TOBN(0x5139e05c, 0xa29403a8)}}, + {{TOBN(0x6ff651b4, 0x94a7cd2e), TOBN(0x5671ffd1, 0x0699336c), + TOBN(0x6f5fd2cc, 0x979a896a), TOBN(0x11e893a8, 0xd8148cef)}, + {TOBN(0x988906a1, 0x65cf7b10), TOBN(0x81b67178, 0xc50d8485), + TOBN(0x7c0deb35, 0x8a35b3de), TOBN(0x423ac855, 0xc1d29799)}}, + {{TOBN(0xaf580d87, 0xdac50b74), TOBN(0x28b2b89f, 0x5869734c), + TOBN(0x99a3b936, 0x874e28fb), TOBN(0xbb2c9190, 0x25f3f73a)}, + {TOBN(0x199f6918, 0x84a9d5b7), TOBN(0x7ebe2325, 0x7e770374), + TOBN(0xf442e107, 0x0738efe2), TOBN(0xcf9f3f56, 0xcf9082d2)}}, + {{TOBN(0x719f69e1, 0x09618708), TOBN(0xcc9e8364, 0xc183f9b1), + TOBN(0xec203a95, 0x366a21af), TOBN(0x6aec5d6d, 0x068b141f)}, + {TOBN(0xee2df78a, 0x994f04e9), TOBN(0xb39ccae8, 0x271245b0), + TOBN(0xb875a4a9, 0x97e43f4f), TOBN(0x507dfe11, 0xdb2cea98)}}, + {{TOBN(0x4fbf81cb, 0x489b03e9), TOBN(0xdb86ec5b, 0x6ec414fa), + TOBN(0xfad444f9, 0xf51b3ae5), TOBN(0xca7d33d6, 0x1914e3fe)}, + {TOBN(0xa9c32f5c, 0x0ae6c4d0), TOBN(0xa9ca1d1e, 0x73969568), + TOBN(0x98043c31, 0x1aa7467e), TOBN(0xe832e75c, 0xe21b5ac6)}}, + {{TOBN(0x314b7aea, 0x5232123d), TOBN(0x08307c8c, 0x65ae86db), + TOBN(0x06e7165c, 0xaa4668ed), TOBN(0xb170458b, 0xb4d3ec39)}, + {TOBN(0x4d2e3ec6, 0xc19bb986), TOBN(0xc5f34846, 0xae0304ed), + TOBN(0x917695a0, 0x6c9f9722), TOBN(0x6c7f7317, 0x4cab1c0a)}}, + {{TOBN(0x6295940e, 0x9d6d2e8b), TOBN(0xd318b8c1, 0x549f7c97), + TOBN(0x22453204, 0x97713885), TOBN(0x468d834b, 0xa8a440fe)}, + {TOBN(0xd81fe5b2, 0xbfba796e), TOBN(0x152364db, 0x6d71f116), + TOBN(0xbb8c7c59, 0xb5b66e53), TOBN(0x0b12c61b, 0x2641a192)}}, + {{TOBN(0x31f14802, 0xfcf0a7fd), TOBN(0x42fd0789, 0x5488b01e), + TOBN(0x71d78d6d, 0x9952b498), TOBN(0x8eb572d9, 0x07ac5201)}, + {TOBN(0xe0a2a44c, 0x4d194a88), TOBN(0xd2b63fd9, 0xba017e66), + TOBN(0x78efc6c8, 0xf888aefc), TOBN(0xb76f6bda, 0x4a881a11)}}, + {{TOBN(0x187f314b, 0xb46c2397), TOBN(0x004cf566, 0x5ded2819), + TOBN(0xa9ea5704, 0x38764d34), TOBN(0xbba45217, 0x78084709)}, + {TOBN(0x06474571, 0x1171121e), TOBN(0xad7b7eb1, 0xe7c9b671), + TOBN(0xdacfbc40, 0x730f7507), TOBN(0x178cd8c6, 0xc7ad7bd1)}}, + {{TOBN(0xbf0be101, 0xb2a67238), TOBN(0x3556d367, 0xaf9c14f2), + TOBN(0x104b7831, 0xa5662075), TOBN(0x58ca59bb, 0x79d9e60a)}, + {TOBN(0x4bc45392, 0xa569a73b), TOBN(0x517a52e8, 0x5698f6c9), + TOBN(0x85643da5, 0xaeadd755), TOBN(0x1aed0cd5, 0x2a581b84)}}, + {{TOBN(0xb9b4ff84, 0x80af1372), TOBN(0x244c3113, 0xf1ba5d1f), + TOBN(0x2a5dacbe, 0xf5f98d31), TOBN(0x2c3323e8, 0x4375bc2a)}, + {TOBN(0x17a3ab4a, 0x5594b1dd), TOBN(0xa1928bfb, 0xceb4797e), + TOBN(0xe83af245, 0xe4886a19), TOBN(0x8979d546, 0x72b5a74a)}}, + {{TOBN(0xa0f726bc, 0x19f9e967), TOBN(0xd9d03152, 0xe8fbbf4e), + TOBN(0xcfd6f51d, 0xb7707d40), TOBN(0x633084d9, 0x63f6e6e0)}, + {TOBN(0xedcd9cdc, 0x55667eaf), TOBN(0x73b7f92b, 0x2e44d56f), + TOBN(0xfb2e39b6, 0x4e962b14), TOBN(0x7d408f6e, 0xf671fcbf)}}, + {{TOBN(0xcc634ddc, 0x164a89bb), TOBN(0x74a42bb2, 0x3ef3bd05), + TOBN(0x1280dbb2, 0x428decbb), TOBN(0x6103f6bb, 0x402c8596)}, + {TOBN(0xfa2bf581, 0x355a5752), TOBN(0x562f96a8, 0x00946674), + TOBN(0x4e4ca16d, 0x6da0223b), 
TOBN(0xfe47819f, 0x28d3aa25)}}, + {{TOBN(0x9eea3075, 0xf8dfcf8a), TOBN(0xa284f0aa, 0x95669825), + TOBN(0xb3fca250, 0x867d3fd8), TOBN(0x20757b5f, 0x269d691e)}, + {TOBN(0xf2c24020, 0x93b8a5de), TOBN(0xd3f93359, 0xebc06da6), + TOBN(0x1178293e, 0xb2739c33), TOBN(0xd2a3e770, 0xbcd686e5)}}, + {{TOBN(0xa76f49f4, 0xcd941534), TOBN(0x0d37406b, 0xe3c71c0e), + TOBN(0x172d9397, 0x3b97f7e3), TOBN(0xec17e239, 0xbd7fd0de)}, + {TOBN(0xe3290551, 0x6f496ba2), TOBN(0x6a693172, 0x36ad50e7), + TOBN(0xc4e539a2, 0x83e7eff5), TOBN(0x752737e7, 0x18e1b4cf)}}, + {{TOBN(0xa2f7932c, 0x68af43ee), TOBN(0x5502468e, 0x703d00bd), + TOBN(0xe5dc978f, 0x2fb061f5), TOBN(0xc9a1904a, 0x28c815ad)}, + {TOBN(0xd3af538d, 0x470c56a4), TOBN(0x159abc5f, 0x193d8ced), + TOBN(0x2a37245f, 0x20108ef3), TOBN(0xfa17081e, 0x223f7178)}}, + {{TOBN(0x27b0fb2b, 0x10c8c0f5), TOBN(0x2102c3ea, 0x40650547), + TOBN(0x594564df, 0x8ac3bfa7), TOBN(0x98102033, 0x509dad96)}, + {TOBN(0x6989643f, 0xf1d18a13), TOBN(0x35eebd91, 0xd7fc5af0), + TOBN(0x078d096a, 0xfaeaafd8), TOBN(0xb7a89341, 0xdef3de98)}}, + {{TOBN(0x2a206e8d, 0xecf2a73a), TOBN(0x066a6397, 0x8e551994), + TOBN(0x3a6a088a, 0xb98d53a2), TOBN(0x0ce7c67c, 0x2d1124aa)}, + {TOBN(0x48cec671, 0x759a113c), TOBN(0xe3b373d3, 0x4f6f67fa), + TOBN(0x5455d479, 0xfd36727b), TOBN(0xe5a428ee, 0xa13c0d81)}}, + {{TOBN(0xb853dbc8, 0x1c86682b), TOBN(0xb78d2727, 0xb8d02b2a), + TOBN(0xaaf69bed, 0x8ebc329a), TOBN(0xdb6b40b3, 0x293b2148)}, + {TOBN(0xe42ea77d, 0xb8c4961f), TOBN(0xb1a12f7c, 0x20e5e0ab), + TOBN(0xa0ec5274, 0x79e8b05e), TOBN(0x68027391, 0xfab60a80)}}, + {{TOBN(0x6bfeea5f, 0x16b1bd5e), TOBN(0xf957e420, 0x4de30ad3), + TOBN(0xcbaf664e, 0x6a353b9e), TOBN(0x5c873312, 0x26d14feb)}, + {TOBN(0x4e87f98c, 0xb65f57cb), TOBN(0xdb60a621, 0x5e0cdd41), + TOBN(0x67c16865, 0xa6881440), TOBN(0x1093ef1a, 0x46ab52aa)}}, + {{TOBN(0xc095afb5, 0x3f4ece64), TOBN(0x6a6bb02e, 0x7604551a), + TOBN(0x55d44b4e, 0x0b26b8cd), TOBN(0xe5f9a999, 0xf971268a)}, + {TOBN(0xc08ec425, 0x11a7de84), TOBN(0x83568095, 0xfda469dd), + TOBN(0x737bfba1, 0x6c6c90a2), TOBN(0x1cb9c4a0, 0xbe229831)}}, + {{TOBN(0x93bccbba, 0xbb2eec64), TOBN(0xa0c23b64, 0xda03adbe), + TOBN(0x5f7aa00a, 0xe0e86ac4), TOBN(0x470b941e, 0xfc1401e6)}, + {TOBN(0x5ad8d679, 0x9df43574), TOBN(0x4ccfb8a9, 0x0f65d810), + TOBN(0x1bce80e3, 0xaa7fbd81), TOBN(0x273291ad, 0x9508d20a)}}, + {{TOBN(0xf5c4b46b, 0x42a92806), TOBN(0x810684ec, 0xa86ab44a), + TOBN(0x4591640b, 0xca0bc9f8), TOBN(0xb5efcdfc, 0x5c4b6054)}, + {TOBN(0x16fc8907, 0x6e9edd12), TOBN(0xe29d0b50, 0xd4d792f9), + TOBN(0xa45fd01c, 0x9b03116d), TOBN(0x85035235, 0xc81765a4)}}, + {{TOBN(0x1fe2a9b2, 0xb4b4b67c), TOBN(0xc1d10df0, 0xe8020604), + TOBN(0x9d64abfc, 0xbc8058d8), TOBN(0x8943b9b2, 0x712a0fbb)}, + {TOBN(0x90eed914, 0x3b3def04), TOBN(0x85ab3aa2, 0x4ce775ff), + TOBN(0x605fd4ca, 0x7bbc9040), TOBN(0x8b34a564, 0xe2c75dfb)}}, + {{TOBN(0x41ffc94a, 0x10358560), TOBN(0x2d8a5072, 0x9e5c28aa), + TOBN(0xe915a0fc, 0x4cc7eb15), TOBN(0xe9efab05, 0x8f6d0f5d)}, + {TOBN(0xdbab47a9, 0xd19e9b91), TOBN(0x8cfed745, 0x0276154c), + TOBN(0x154357ae, 0x2cfede0d), TOBN(0x520630df, 0x19f5a4ef)}}, + {{TOBN(0x25759f7c, 0xe382360f), TOBN(0xb6db05c9, 0x88bf5857), + TOBN(0x2917d61d, 0x6c58d46c), TOBN(0x14f8e491, 0xfd20cb7a)}, + {TOBN(0xb68a727a, 0x11c20340), TOBN(0x0386f86f, 0xaf7ccbb6), + TOBN(0x5c8bc6cc, 0xfee09a20), TOBN(0x7d76ff4a, 0xbb7eea35)}}, + {{TOBN(0xa7bdebe7, 0xdb15be7a), TOBN(0x67a08054, 0xd89f0302), + TOBN(0x56bf0ea9, 0xc1193364), TOBN(0xc8244467, 0x62837ebe)}, + {TOBN(0x32bd8e8b, 0x20d841b8), TOBN(0x127a0548, 0xdbb8a54f), + TOBN(0x83dd4ca6, 
0x63b20236), TOBN(0x87714718, 0x203491fa)}}, + {{TOBN(0x4dabcaaa, 0xaa8a5288), TOBN(0x91cc0c8a, 0xaf23a1c9), + TOBN(0x34c72c6a, 0x3f220e0c), TOBN(0xbcc20bdf, 0x1232144a)}, + {TOBN(0x6e2f42da, 0xa20ede1b), TOBN(0xc441f00c, 0x74a00515), + TOBN(0xbf46a5b6, 0x734b8c4b), TOBN(0x57409503, 0x7b56c9a4)}}, + {{TOBN(0x9f735261, 0xe4585d45), TOBN(0x9231faed, 0x6734e642), + TOBN(0x1158a176, 0xbe70ee6c), TOBN(0x35f1068d, 0x7c3501bf)}, + {TOBN(0x6beef900, 0xa2d26115), TOBN(0x649406f2, 0xef0afee3), + TOBN(0x3f43a60a, 0xbc2420a1), TOBN(0x509002a7, 0xd5aee4ac)}}, + {{TOBN(0xb46836a5, 0x3ff3571b), TOBN(0x24f98b78, 0x837927c1), + TOBN(0x6254256a, 0x4533c716), TOBN(0xf27abb0b, 0xd07ee196)}, + {TOBN(0xd7cf64fc, 0x5c6d5bfd), TOBN(0x6915c751, 0xf0cd7a77), + TOBN(0xd9f59012, 0x8798f534), TOBN(0x772b0da8, 0xf81d8b5f)}}, + {{TOBN(0x1244260c, 0x2e03fa69), TOBN(0x36cf0e3a, 0x3be1a374), + TOBN(0x6e7c1633, 0xef06b960), TOBN(0xa71a4c55, 0x671f90f6)}, + {TOBN(0x7a941251, 0x33c673db), TOBN(0xc0bea510, 0x73e8c131), + TOBN(0x61a8a699, 0xd4f6c734), TOBN(0x25e78c88, 0x341ed001)}}, + {{TOBN(0x5c18acf8, 0x8e2f7d90), TOBN(0xfdbf33d7, 0x77be32cd), + TOBN(0x0a085cd7, 0xd2eb5ee9), TOBN(0x2d702cfb, 0xb3201115)}, + {TOBN(0xb6e0ebdb, 0x85c88ce8), TOBN(0x23a3ce3c, 0x1e01d617), + TOBN(0x3041618e, 0x567333ac), TOBN(0x9dd0fd8f, 0x157edb6b)}}, + {{TOBN(0x27f74702, 0xb57872b8), TOBN(0x2ef26b4f, 0x657d5fe1), + TOBN(0x95426f0a, 0x57cf3d40), TOBN(0x847e2ad1, 0x65a6067a)}, + {TOBN(0xd474d9a0, 0x09996a74), TOBN(0x16a56acd, 0x2a26115c), + TOBN(0x02a615c3, 0xd16f4d43), TOBN(0xcc3fc965, 0xaadb85b7)}}, + {{TOBN(0x386bda73, 0xce07d1b0), TOBN(0xd82910c2, 0x58ad4178), + TOBN(0x124f82cf, 0xcd2617f4), TOBN(0xcc2f5e8d, 0xef691770)}, + {TOBN(0x82702550, 0xb8c30ccc), TOBN(0x7b856aea, 0x1a8e575a), + TOBN(0xbb822fef, 0xb1ab9459), TOBN(0x085928bc, 0xec24e38e)}}, + {{TOBN(0x5d0402ec, 0xba8f4b4d), TOBN(0xc07cd4ba, 0x00b4d58b), + TOBN(0x5d8dffd5, 0x29227e7a), TOBN(0x61d44d0c, 0x31bf386f)}, + {TOBN(0xe486dc2b, 0x135e6f4d), TOBN(0x680962eb, 0xe79410ef), + TOBN(0xa61bd343, 0xf10088b5), TOBN(0x6aa76076, 0xe2e28686)}}, + {{TOBN(0x80463d11, 0x8fb98871), TOBN(0xcb26f5c3, 0xbbc76aff), + TOBN(0xd4ab8edd, 0xfbe03614), TOBN(0xc8eb579b, 0xc0cf2dee)}, + {TOBN(0xcc004c15, 0xc93bae41), TOBN(0x46fbae5d, 0x3aeca3b2), + TOBN(0x671235cf, 0x0f1e9ab1), TOBN(0xadfba934, 0x9ec285c1)}}, + {{TOBN(0x88ded013, 0xf216c980), TOBN(0xc8ac4fb8, 0xf79e0bc1), + TOBN(0xa29b89c6, 0xfb97a237), TOBN(0xb697b780, 0x9922d8e7)}, + {TOBN(0x3142c639, 0xddb945b5), TOBN(0x447b06c7, 0xe094c3a9), + TOBN(0xcdcb3642, 0x72266c90), TOBN(0x633aad08, 0xa9385046)}}, + {{TOBN(0xa36c936b, 0xb57c6477), TOBN(0x871f8b64, 0xe94dbcc6), + TOBN(0x28d0fb62, 0xa591a67b), TOBN(0x9d40e081, 0xc1d926f5)}, + {TOBN(0x3111eaf6, 0xf2d84b5a), TOBN(0x228993f9, 0xa565b644), + TOBN(0x0ccbf592, 0x2c83188b), TOBN(0xf87b30ab, 0x3df3e197)}}, + {{TOBN(0xb8658b31, 0x7642bca8), TOBN(0x1a032d7f, 0x52800f17), + TOBN(0x051dcae5, 0x79bf9445), TOBN(0xeba6b8ee, 0x54a2e253)}, + {TOBN(0x5c8b9cad, 0xd4485692), TOBN(0x84bda40e, 0x8986e9be), + TOBN(0xd16d16a4, 0x2f0db448), TOBN(0x8ec80050, 0xa14d4188)}}, + {{TOBN(0xb2b26107, 0x98fa7aaa), TOBN(0x41209ee4, 0xf073aa4e), + TOBN(0xf1570359, 0xf2d6b19b), TOBN(0xcbe6868c, 0xfc577caf)}, + {TOBN(0x186c4bdc, 0x32c04dd3), TOBN(0xa6c35fae, 0xcfeee397), + TOBN(0xb4a1b312, 0xf086c0cf), TOBN(0xe0a5ccc6, 0xd9461fe2)}}, + {{TOBN(0xc32278aa, 0x1536189f), TOBN(0x1126c55f, 0xba6df571), + TOBN(0x0f71a602, 0xb194560e), TOBN(0x8b2d7405, 0x324bd6e1)}, + {TOBN(0x8481939e, 0x3738be71), TOBN(0xb5090b1a, 0x1a4d97a9), + 
TOBN(0x116c65a3, 0xf05ba915), TOBN(0x21863ad3, 0xaae448aa)}}, + {{TOBN(0xd24e2679, 0xa7aae5d3), TOBN(0x7076013d, 0x0de5c1c4), + TOBN(0x2d50f8ba, 0xbb05b629), TOBN(0x73c1abe2, 0x6e66efbb)}, + {TOBN(0xefd4b422, 0xf2488af7), TOBN(0xe4105d02, 0x663ba575), + TOBN(0x7eb60a8b, 0x53a69457), TOBN(0x62210008, 0xc945973b)}}, + {{TOBN(0xfb255478, 0x77a50ec6), TOBN(0xbf0392f7, 0x0a37a72c), + TOBN(0xa0a7a19c, 0x4be18e7a), TOBN(0x90d8ea16, 0x25b1e0af)}, + {TOBN(0x7582a293, 0xef953f57), TOBN(0x90a64d05, 0xbdc5465a), + TOBN(0xca79c497, 0xe2510717), TOBN(0x560dbb7c, 0x18cb641f)}}, + {{TOBN(0x1d8e3286, 0x4b66abfb), TOBN(0xd26f52e5, 0x59030900), + TOBN(0x1ee3f643, 0x5584941a), TOBN(0x6d3b3730, 0x569f5958)}, + {TOBN(0x9ff2a62f, 0x4789dba5), TOBN(0x91fcb815, 0x72b5c9b7), + TOBN(0xf446cb7d, 0x6c8f9a0e), TOBN(0x48f625c1, 0x39b7ecb5)}}, + {{TOBN(0xbabae801, 0x1c6219b8), TOBN(0xe7a562d9, 0x28ac2f23), + TOBN(0xe1b48732, 0x26e20588), TOBN(0x06ee1cad, 0x775af051)}, + {TOBN(0xda29ae43, 0xfaff79f7), TOBN(0xc141a412, 0x652ee9e0), + TOBN(0x1e127f6f, 0x195f4bd0), TOBN(0x29c6ab4f, 0x072f34f8)}}, + {{TOBN(0x7b7c1477, 0x30448112), TOBN(0x82b51af1, 0xe4a38656), + TOBN(0x2bf2028a, 0x2f315010), TOBN(0xc9a4a01f, 0x6ea88cd4)}, + {TOBN(0xf63e95d8, 0x257e5818), TOBN(0xdd8efa10, 0xb4519b16), + TOBN(0xed8973e0, 0x0da910bf), TOBN(0xed49d077, 0x5c0fe4a9)}}, + {{TOBN(0xac3aac5e, 0xb7caee1e), TOBN(0x1033898d, 0xa7f4da57), + TOBN(0x42145c0e, 0x5c6669b9), TOBN(0x42daa688, 0xc1aa2aa0)}, + {TOBN(0x629cc15c, 0x1a1d885a), TOBN(0x25572ec0, 0xf4b76817), + TOBN(0x8312e435, 0x9c8f8f28), TOBN(0x8107f8cd, 0x81965490)}}, + {{TOBN(0x516ff3a3, 0x6fa6110c), TOBN(0x74fb1eb1, 0xfb93561f), + TOBN(0x6c0c9047, 0x8457522b), TOBN(0xcfd32104, 0x6bb8bdc6)}, + {TOBN(0x2d6884a2, 0xcc80ad57), TOBN(0x7c27fc35, 0x86a9b637), + TOBN(0x3461baed, 0xadf4e8cd), TOBN(0x1d56251a, 0x617242f0)}}, + {{TOBN(0x0b80d209, 0xc955bef4), TOBN(0xdf02cad2, 0x06adb047), + TOBN(0xf0d7cb91, 0x5ec74fee), TOBN(0xd2503375, 0x1111ba44)}, + {TOBN(0x9671755e, 0xdf53cb36), TOBN(0x54dcb612, 0x3368551b), + TOBN(0x66d69aac, 0xc8a025a4), TOBN(0x6be946c6, 0xe77ef445)}}, + {{TOBN(0x719946d1, 0xa995e094), TOBN(0x65e848f6, 0xe51e04d8), + TOBN(0xe62f3300, 0x6a1e3113), TOBN(0x1541c7c1, 0x501de503)}, + {TOBN(0x4daac9fa, 0xf4acfade), TOBN(0x0e585897, 0x44cd0b71), + TOBN(0x544fd869, 0x0a51cd77), TOBN(0x60fc20ed, 0x0031016d)}}, + {{TOBN(0x58b404ec, 0xa4276867), TOBN(0x46f6c3cc, 0x34f34993), + TOBN(0x477ca007, 0xc636e5bd), TOBN(0x8018f5e5, 0x7c458b47)}, + {TOBN(0xa1202270, 0xe47b668f), TOBN(0xcef48ccd, 0xee14f203), + TOBN(0x23f98bae, 0x62ff9b4d), TOBN(0x55acc035, 0xc589eddd)}}, + {{TOBN(0x3fe712af, 0x64db4444), TOBN(0x19e9d634, 0xbecdd480), + TOBN(0xe08bc047, 0xa930978a), TOBN(0x2dbf24ec, 0xa1280733)}, + {TOBN(0x3c0ae38c, 0x2cd706b2), TOBN(0x5b012a5b, 0x359017b9), + TOBN(0x3943c38c, 0x72e0f5ae), TOBN(0x786167ea, 0x57176fa3)}}, + {{TOBN(0xe5f9897d, 0x594881dc), TOBN(0x6b5efad8, 0xcfb820c1), + TOBN(0xb2179093, 0xd55018de), TOBN(0x39ad7d32, 0x0bac56ce)}, + {TOBN(0xb55122e0, 0x2cfc0e81), TOBN(0x117c4661, 0xf6d89daa), + TOBN(0x362d01e1, 0xcb64fa09), TOBN(0x6a309b4e, 0x3e9c4ddd)}}, + {{TOBN(0xfa979fb7, 0xabea49b1), TOBN(0xb4b1d27d, 0x10e2c6c5), + TOBN(0xbd61c2c4, 0x23afde7a), TOBN(0xeb6614f8, 0x9786d358)}, + {TOBN(0x4a5d816b, 0x7f6f7459), TOBN(0xe431a44f, 0x09360e7b), + TOBN(0x8c27a032, 0xc309914c), TOBN(0xcea5d68a, 0xcaede3d8)}}, + {{TOBN(0x3668f665, 0x3a0a3f95), TOBN(0x89369416, 0x7ceba27b), + TOBN(0x89981fad, 0xe4728fe9), TOBN(0x7102c8a0, 0x8a093562)}, + {TOBN(0xbb80310e, 0x235d21c8), TOBN(0x505e55d1, 0xbefb7f7b), 
+ TOBN(0xa0a90811, 0x12958a67), TOBN(0xd67e106a, 0x4d851fef)}}, + {{TOBN(0xb84011a9, 0x431dd80e), TOBN(0xeb7c7cca, 0x73306cd9), + TOBN(0x20fadd29, 0xd1b3b730), TOBN(0x83858b5b, 0xfe37b3d3)}, + {TOBN(0xbf4cd193, 0xb6251d5c), TOBN(0x1cca1fd3, 0x1352d952), + TOBN(0xc66157a4, 0x90fbc051), TOBN(0x7990a638, 0x89b98636)}}}, + {{{TOBN(0xe5aa692a, 0x87dec0e1), TOBN(0x010ded8d, 0xf7b39d00), + TOBN(0x7b1b80c8, 0x54cfa0b5), TOBN(0x66beb876, 0xa0f8ea28)}, + {TOBN(0x50d7f531, 0x3476cd0e), TOBN(0xa63d0e65, 0xb08d3949), + TOBN(0x1a09eea9, 0x53479fc6), TOBN(0x82ae9891, 0xf499e742)}}, + {{TOBN(0xab58b910, 0x5ca7d866), TOBN(0x582967e2, 0x3adb3b34), + TOBN(0x89ae4447, 0xcceac0bc), TOBN(0x919c667c, 0x7bf56af5)}, + {TOBN(0x9aec17b1, 0x60f5dcd7), TOBN(0xec697b9f, 0xddcaadbc), + TOBN(0x0b98f341, 0x463467f5), TOBN(0xb187f1f7, 0xa967132f)}}, + {{TOBN(0x90fe7a1d, 0x214aeb18), TOBN(0x1506af3c, 0x741432f7), + TOBN(0xbb5565f9, 0xe591a0c4), TOBN(0x10d41a77, 0xb44f1bc3)}, + {TOBN(0xa09d65e4, 0xa84bde96), TOBN(0x42f060d8, 0xf20a6a1c), + TOBN(0x652a3bfd, 0xf27f9ce7), TOBN(0xb6bdb65c, 0x3b3d739f)}}, + {{TOBN(0xeb5ddcb6, 0xec7fae9f), TOBN(0x995f2714, 0xefb66e5a), + TOBN(0xdee95d8e, 0x69445d52), TOBN(0x1b6c2d46, 0x09e27620)}, + {TOBN(0x32621c31, 0x8129d716), TOBN(0xb03909f1, 0x0958c1aa), + TOBN(0x8c468ef9, 0x1af4af63), TOBN(0x162c429f, 0xfba5cdf6)}}, + {{TOBN(0x2f682343, 0x753b9371), TOBN(0x29cab45a, 0x5f1f9cd7), + TOBN(0x571623ab, 0xb245db96), TOBN(0xc507db09, 0x3fd79999)}, + {TOBN(0x4e2ef652, 0xaf036c32), TOBN(0x86f0cc78, 0x05018e5c), + TOBN(0xc10a73d4, 0xab8be350), TOBN(0x6519b397, 0x7e826327)}}, + {{TOBN(0xe8cb5eef, 0x9c053df7), TOBN(0x8de25b37, 0xb300ea6f), + TOBN(0xdb03fa92, 0xc849cffb), TOBN(0x242e43a7, 0xe84169bb)}, + {TOBN(0xe4fa51f4, 0xdd6f958e), TOBN(0x6925a77f, 0xf4445a8d), + TOBN(0xe6e72a50, 0xe90d8949), TOBN(0xc66648e3, 0x2b1f6390)}}, + {{TOBN(0xb2ab1957, 0x173e460c), TOBN(0x1bbbce75, 0x30704590), + TOBN(0xc0a90dbd, 0xdb1c7162), TOBN(0x505e399e, 0x15cdd65d)}, + {TOBN(0x68434dcb, 0x57797ab7), TOBN(0x60ad35ba, 0x6a2ca8e8), + TOBN(0x4bfdb1e0, 0xde3336c1), TOBN(0xbbef99eb, 0xd8b39015)}}, + {{TOBN(0x6c3b96f3, 0x1711ebec), TOBN(0x2da40f1f, 0xce98fdc4), + TOBN(0xb99774d3, 0x57b4411f), TOBN(0x87c8bdf4, 0x15b65bb6)}, + {TOBN(0xda3a89e3, 0xc2eef12d), TOBN(0xde95bb9b, 0x3c7471f3), + TOBN(0x600f225b, 0xd812c594), TOBN(0x54907c5d, 0x2b75a56b)}}, + {{TOBN(0xa93cc5f0, 0x8db60e35), TOBN(0x743e3cd6, 0xfa833319), + TOBN(0x7dad5c41, 0xf81683c9), TOBN(0x70c1e7d9, 0x9c34107e)}, + {TOBN(0x0edc4a39, 0xa6be0907), TOBN(0x36d47035, 0x86d0b7d3), + TOBN(0x8c76da03, 0x272bfa60), TOBN(0x0b4a07ea, 0x0f08a414)}}, + {{TOBN(0x699e4d29, 0x45c1dd53), TOBN(0xcadc5898, 0x231debb5), + TOBN(0xdf49fcc7, 0xa77f00e0), TOBN(0x93057bbf, 0xa73e5a0e)}, + {TOBN(0x2f8b7ecd, 0x027a4cd1), TOBN(0x114734b3, 0xc614011a), + TOBN(0xe7a01db7, 0x67677c68), TOBN(0x89d9be5e, 0x7e273f4f)}}, + {{TOBN(0xd225cb2e, 0x089808ef), TOBN(0xf1f7a27d, 0xd59e4107), + TOBN(0x53afc761, 0x8211b9c9), TOBN(0x0361bc67, 0xe6819159)}, + {TOBN(0x2a865d0b, 0x7f071426), TOBN(0x6a3c1810, 0xe7072567), + TOBN(0x3e3bca1e, 0x0d6bcabd), TOBN(0xa1b02bc1, 0x408591bc)}}, + {{TOBN(0xe0deee59, 0x31fba239), TOBN(0xf47424d3, 0x98bd91d1), + TOBN(0x0f8886f4, 0x071a3c1d), TOBN(0x3f7d41e8, 0xa819233b)}, + {TOBN(0x708623c2, 0xcf6eb998), TOBN(0x86bb49af, 0x609a287f), + TOBN(0x942bb249, 0x63c90762), TOBN(0x0ef6eea5, 0x55a9654b)}}, + {{TOBN(0x5f6d2d72, 0x36f5defe), TOBN(0xfa9922dc, 0x56f99176), + TOBN(0x6c8c5ece, 0xf78ce0c7), TOBN(0x7b44589d, 0xbe09b55e)}, + {TOBN(0xe11b3bca, 0x9ea83770), TOBN(0xd7fa2c7f, 
0x2ab71547), + TOBN(0x2a3dd6fa, 0x2a1ddcc0), TOBN(0x09acb430, 0x5a7b7707)}}, + {{TOBN(0x4add4a2e, 0x649d4e57), TOBN(0xcd53a2b0, 0x1917526e), + TOBN(0xc5262330, 0x20b44ac4), TOBN(0x4028746a, 0xbaa2c31d)}, + {TOBN(0x51318390, 0x64291d4c), TOBN(0xbf48f151, 0xee5ad909), + TOBN(0xcce57f59, 0x7b185681), TOBN(0x7c3ac1b0, 0x4854d442)}}, + {{TOBN(0x65587dc3, 0xc093c171), TOBN(0xae7acb24, 0x24f42b65), + TOBN(0x5a338adb, 0x955996cb), TOBN(0xc8e65675, 0x6051f91b)}, + {TOBN(0x66711fba, 0x28b8d0b1), TOBN(0x15d74137, 0xb6c10a90), + TOBN(0x70cdd7eb, 0x3a232a80), TOBN(0xc9e2f07f, 0x6191ed24)}}, + {{TOBN(0xa80d1db6, 0xf79588c0), TOBN(0xfa52fc69, 0xb55768cc), + TOBN(0x0b4df1ae, 0x7f54438a), TOBN(0x0cadd1a7, 0xf9b46a4f)}, + {TOBN(0xb40ea6b3, 0x1803dd6f), TOBN(0x488e4fa5, 0x55eaae35), + TOBN(0x9f047d55, 0x382e4e16), TOBN(0xc9b5b7e0, 0x2f6e0c98)}}, + {{TOBN(0x6b1bd2d3, 0x95762649), TOBN(0xa9604ee7, 0xc7aea3f6), + TOBN(0x3646ff27, 0x6dc6f896), TOBN(0x9bf0e7f5, 0x2860bad1)}, + {TOBN(0x2d92c821, 0x7cb44b92), TOBN(0xa2f5ce63, 0xaea9c182), + TOBN(0xd0a2afb1, 0x9154a5fd), TOBN(0x482e474c, 0x95801da6)}}, + {{TOBN(0xc19972d0, 0xb611c24b), TOBN(0x1d468e65, 0x60a8f351), + TOBN(0xeb758069, 0x7bcf6421), TOBN(0xec9dd0ee, 0x88fbc491)}, + {TOBN(0x5b59d2bf, 0x956c2e32), TOBN(0x73dc6864, 0xdcddf94e), + TOBN(0xfd5e2321, 0xbcee7665), TOBN(0xa7b4f8ef, 0x5e9a06c4)}}, + {{TOBN(0xfba918dd, 0x7280f855), TOBN(0xbbaac260, 0x8baec688), + TOBN(0xa3b3f00f, 0x33400f42), TOBN(0x3d2dba29, 0x66f2e6e4)}, + {TOBN(0xb6f71a94, 0x98509375), TOBN(0x8f33031f, 0xcea423cc), + TOBN(0x009b8dd0, 0x4807e6fb), TOBN(0x5163cfe5, 0x5cdb954c)}}, + {{TOBN(0x03cc8f17, 0xcf41c6e8), TOBN(0xf1f03c2a, 0x037b925c), + TOBN(0xc39c19cc, 0x66d2427c), TOBN(0x823d24ba, 0x7b6c18e4)}, + {TOBN(0x32ef9013, 0x901f0b4f), TOBN(0x684360f1, 0xf8941c2e), + TOBN(0x0ebaff52, 0x2c28092e), TOBN(0x7891e4e3, 0x256c932f)}}, + {{TOBN(0x51264319, 0xac445e3d), TOBN(0x553432e7, 0x8ea74381), + TOBN(0xe6eeaa69, 0x67e9c50a), TOBN(0x27ced284, 0x62e628c7)}, + {TOBN(0x3f96d375, 0x7a4afa57), TOBN(0xde0a14c3, 0xe484c150), + TOBN(0x364a24eb, 0x38bd9923), TOBN(0x1df18da0, 0xe5177422)}}, + {{TOBN(0x174e8f82, 0xd8d38a9b), TOBN(0x2e97c600, 0xe7de1391), + TOBN(0xc5709850, 0xa1c175dd), TOBN(0x969041a0, 0x32ae5035)}, + {TOBN(0xcbfd533b, 0x76a2086b), TOBN(0xd6bba71b, 0xd7c2e8fe), + TOBN(0xb2d58ee6, 0x099dfb67), TOBN(0x3a8b342d, 0x064a85d9)}}, + {{TOBN(0x3bc07649, 0x522f9be3), TOBN(0x690c075b, 0xdf1f49a8), + TOBN(0x80e1aee8, 0x3854ec42), TOBN(0x2a7dbf44, 0x17689dc7)}, + {TOBN(0xc004fc0e, 0x3faf4078), TOBN(0xb2f02e9e, 0xdf11862c), + TOBN(0xf10a5e0f, 0xa0a1b7b3), TOBN(0x30aca623, 0x8936ec80)}}, + {{TOBN(0xf83cbf05, 0x02f40d9a), TOBN(0x4681c468, 0x2c318a4d), + TOBN(0x98575618, 0x0e9c2674), TOBN(0xbe79d046, 0x1847092e)}, + {TOBN(0xaf1e480a, 0x78bd01e0), TOBN(0x6dd359e4, 0x72a51db9), + TOBN(0x62ce3821, 0xe3afbab6), TOBN(0xc5cee5b6, 0x17733199)}}, + {{TOBN(0xe08b30d4, 0x6ffd9fbb), TOBN(0x6e5bc699, 0x36c610b7), + TOBN(0xf343cff2, 0x9ce262cf), TOBN(0xca2e4e35, 0x68b914c1)}, + {TOBN(0x011d64c0, 0x16de36c5), TOBN(0xe0b10fdd, 0x42e2b829), + TOBN(0x78942981, 0x6685aaf8), TOBN(0xe7511708, 0x230ede97)}}, + {{TOBN(0x671ed8fc, 0x3b922bf8), TOBN(0xe4d8c0a0, 0x4c29b133), + TOBN(0x87eb1239, 0x3b6e99c4), TOBN(0xaff3974c, 0x8793beba)}, + {TOBN(0x03749405, 0x2c18df9b), TOBN(0xc5c3a293, 0x91007139), + TOBN(0x6a77234f, 0xe37a0b95), TOBN(0x02c29a21, 0xb661c96b)}}, + {{TOBN(0xc3aaf1d6, 0x141ecf61), TOBN(0x9195509e, 0x3bb22f53), + TOBN(0x29597404, 0x22d51357), TOBN(0x1b083822, 0x537bed60)}, + {TOBN(0xcd7d6e35, 0xe07289f0), 
TOBN(0x1f94c48c, 0x6dd86eff), + TOBN(0xc8bb1f82, 0xeb0f9cfa), TOBN(0x9ee0b7e6, 0x1b2eb97d)}}, + {{TOBN(0x5a52fe2e, 0x34d74e31), TOBN(0xa352c310, 0x3bf79ab6), + TOBN(0x97ff6c5a, 0xabfeeb8f), TOBN(0xbfbe8fef, 0xf5c97305)}, + {TOBN(0xd6081ce6, 0xa7904608), TOBN(0x1f812f3a, 0xc4fca249), + TOBN(0x9b24bc9a, 0xb9e5e200), TOBN(0x91022c67, 0x38012ee8)}}, + {{TOBN(0xe83d9c5d, 0x30a713a1), TOBN(0x4876e3f0, 0x84ef0f93), + TOBN(0xc9777029, 0xc1fbf928), TOBN(0xef7a6bb3, 0xbce7d2a4)}, + {TOBN(0xb8067228, 0xdfa2a659), TOBN(0xd5cd3398, 0xd877a48f), + TOBN(0xbea4fd8f, 0x025d0f3f), TOBN(0xd67d2e35, 0x2eae7c2b)}}, + {{TOBN(0x184de7d7, 0xcc5f4394), TOBN(0xb5551b5c, 0x4536e142), + TOBN(0x2e89b212, 0xd34aa60a), TOBN(0x14a96fea, 0xf50051d5)}, + {TOBN(0x4e21ef74, 0x0d12bb0b), TOBN(0xc522f020, 0x60b9677e), + TOBN(0x8b12e467, 0x2df7731d), TOBN(0x39f80382, 0x7b326d31)}}, + {{TOBN(0xdfb8630c, 0x39024a94), TOBN(0xaacb96a8, 0x97319452), + TOBN(0xd68a3961, 0xeda3867c), TOBN(0x0c58e2b0, 0x77c4ffca)}, + {TOBN(0x3d545d63, 0x4da919fa), TOBN(0xef79b69a, 0xf15e2289), + TOBN(0x54bc3d3d, 0x808bab10), TOBN(0xc8ab3007, 0x45f82c37)}}, + {{TOBN(0xc12738b6, 0x7c4a658a), TOBN(0xb3c47639, 0x40e72182), + TOBN(0x3b77be46, 0x8798e44f), TOBN(0xdc047df2, 0x17a7f85f)}, + {TOBN(0x2439d4c5, 0x5e59d92d), TOBN(0xcedca475, 0xe8e64d8d), + TOBN(0xa724cd0d, 0x87ca9b16), TOBN(0x35e4fd59, 0xa5540dfe)}}, + {{TOBN(0xf8c1ff18, 0xe4bcf6b1), TOBN(0x856d6285, 0x295018fa), + TOBN(0x433f665c, 0x3263c949), TOBN(0xa6a76dd6, 0xa1f21409)}, + {TOBN(0x17d32334, 0xcc7b4f79), TOBN(0xa1d03122, 0x06720e4a), + TOBN(0xadb6661d, 0x81d9bed5), TOBN(0xf0d6fb02, 0x11db15d1)}}, + {{TOBN(0x7fd11ad5, 0x1fb747d2), TOBN(0xab50f959, 0x3033762b), + TOBN(0x2a7e711b, 0xfbefaf5a), TOBN(0xc7393278, 0x3fef2bbf)}, + {TOBN(0xe29fa244, 0x0df6f9be), TOBN(0x9092757b, 0x71efd215), + TOBN(0xee60e311, 0x4f3d6fd9), TOBN(0x338542d4, 0x0acfb78b)}}, + {{TOBN(0x44a23f08, 0x38961a0f), TOBN(0x1426eade, 0x986987ca), + TOBN(0x36e6ee2e, 0x4a863cc6), TOBN(0x48059420, 0x628b8b79)}, + {TOBN(0x30303ad8, 0x7396e1de), TOBN(0x5c8bdc48, 0x38c5aad1), + TOBN(0x3e40e11f, 0x5c8f5066), TOBN(0xabd6e768, 0x8d246bbd)}}, + {{TOBN(0x68aa40bb, 0x23330a01), TOBN(0xd23f5ee4, 0xc34eafa0), + TOBN(0x3bbee315, 0x5de02c21), TOBN(0x18dd4397, 0xd1d8dd06)}, + {TOBN(0x3ba1939a, 0x122d7b44), TOBN(0xe6d3b40a, 0xa33870d6), + TOBN(0x8e620f70, 0x1c4fe3f8), TOBN(0xf6bba1a5, 0xd3a50cbf)}}, + {{TOBN(0x4a78bde5, 0xcfc0aee0), TOBN(0x847edc46, 0xc08c50bd), + TOBN(0xbaa2439c, 0xad63c9b2), TOBN(0xceb4a728, 0x10fc2acb)}, + {TOBN(0xa419e40e, 0x26da033d), TOBN(0x6cc3889d, 0x03e02683), + TOBN(0x1cd28559, 0xfdccf725), TOBN(0x0fd7e0f1, 0x8d13d208)}}, + {{TOBN(0x01b9733b, 0x1f0df9d4), TOBN(0x8cc2c5f3, 0xa2b5e4f3), + TOBN(0x43053bfa, 0x3a304fd4), TOBN(0x8e87665c, 0x0a9f1aa7)}, + {TOBN(0x087f29ec, 0xd73dc965), TOBN(0x15ace455, 0x3e9023db), + TOBN(0x2370e309, 0x2bce28b4), TOBN(0xf9723442, 0xb6b1e84a)}}, + {{TOBN(0xbeee662e, 0xb72d9f26), TOBN(0xb19396de, 0xf0e47109), + TOBN(0x85b1fa73, 0xe13289d0), TOBN(0x436cf77e, 0x54e58e32)}, + {TOBN(0x0ec833b3, 0xe990ef77), TOBN(0x7373e3ed, 0x1b11fc25), + TOBN(0xbe0eda87, 0x0fc332ce), TOBN(0xced04970, 0x8d7ea856)}}, + {{TOBN(0xf85ff785, 0x7e977ca0), TOBN(0xb66ee8da, 0xdfdd5d2b), + TOBN(0xf5e37950, 0x905af461), TOBN(0x587b9090, 0x966d487c)}, + {TOBN(0x6a198a1b, 0x32ba0127), TOBN(0xa7720e07, 0x141615ac), + TOBN(0xa23f3499, 0x996ef2f2), TOBN(0xef5f64b4, 0x470bcb3d)}}, + {{TOBN(0xa526a962, 0x92b8c559), TOBN(0x0c14aac0, 0x69740a0f), + TOBN(0x0d41a9e3, 0xa6bdc0a5), TOBN(0x97d52106, 0x9c48aef4)}, + {TOBN(0xcf16bd30, 
0x3e7c253b), TOBN(0xcc834b1a, 0x47fdedc1), + TOBN(0x7362c6e5, 0x373aab2e), TOBN(0x264ed85e, 0xc5f590ff)}}, + {{TOBN(0x7a46d9c0, 0x66d41870), TOBN(0xa50c20b1, 0x4787ba09), + TOBN(0x185e7e51, 0xe3d44635), TOBN(0xb3b3e080, 0x31e2d8dc)}, + {TOBN(0xbed1e558, 0xa179e9d9), TOBN(0x2daa3f79, 0x74a76781), + TOBN(0x4372baf2, 0x3a40864f), TOBN(0x46900c54, 0x4fe75cb5)}}, + {{TOBN(0xb95f171e, 0xf76765d0), TOBN(0x4ad726d2, 0x95c87502), + TOBN(0x2ec769da, 0x4d7c99bd), TOBN(0x5e2ddd19, 0xc36cdfa8)}, + {TOBN(0xc22117fc, 0xa93e6dea), TOBN(0xe8a2583b, 0x93771123), + TOBN(0xbe2f6089, 0xfa08a3a2), TOBN(0x4809d5ed, 0x8f0e1112)}}, + {{TOBN(0x3b414aa3, 0xda7a095e), TOBN(0x9049acf1, 0x26f5aadd), + TOBN(0x78d46a4d, 0x6be8b84a), TOBN(0xd66b1963, 0xb732b9b3)}, + {TOBN(0x5c2ac2a0, 0xde6e9555), TOBN(0xcf52d098, 0xb5bd8770), + TOBN(0x15a15fa6, 0x0fd28921), TOBN(0x56ccb81e, 0x8b27536d)}}, + {{TOBN(0x0f0d8ab8, 0x9f4ccbb8), TOBN(0xed5f44d2, 0xdb221729), + TOBN(0x43141988, 0x00bed10c), TOBN(0xc94348a4, 0x1d735b8b)}, + {TOBN(0x79f3e9c4, 0x29ef8479), TOBN(0x4c13a4e3, 0x614c693f), + TOBN(0x32c9af56, 0x8e143a14), TOBN(0xbc517799, 0xe29ac5c4)}}, + {{TOBN(0x05e17992, 0x2774856f), TOBN(0x6e52fb05, 0x6c1bf55f), + TOBN(0xaeda4225, 0xe4f19e16), TOBN(0x70f4728a, 0xaf5ccb26)}, + {TOBN(0x5d2118d1, 0xb2947f22), TOBN(0xc827ea16, 0x281d6fb9), + TOBN(0x8412328d, 0x8cf0eabd), TOBN(0x45ee9fb2, 0x03ef9dcf)}}, + {{TOBN(0x8e700421, 0xbb937d63), TOBN(0xdf8ff2d5, 0xcc4b37a6), + TOBN(0xa4c0d5b2, 0x5ced7b68), TOBN(0x6537c1ef, 0xc7308f59)}, + {TOBN(0x25ce6a26, 0x3b37f8e8), TOBN(0x170e9a9b, 0xdeebc6ce), + TOBN(0xdd037952, 0x8728d72c), TOBN(0x445b0e55, 0x850154bc)}}, + {{TOBN(0x4b7d0e06, 0x83a7337b), TOBN(0x1e3416d4, 0xffecf249), + TOBN(0x24840eff, 0x66a2b71f), TOBN(0xd0d9a50a, 0xb37cc26d)}, + {TOBN(0xe2198150, 0x6fe28ef7), TOBN(0x3cc5ef16, 0x23324c7f), + TOBN(0x220f3455, 0x769b5263), TOBN(0xe2ade2f1, 0xa10bf475)}}, + {{TOBN(0x28cd20fa, 0x458d3671), TOBN(0x1549722c, 0x2dc4847b), + TOBN(0x6dd01e55, 0x591941e3), TOBN(0x0e6fbcea, 0x27128ccb)}, + {TOBN(0xae1a1e6b, 0x3bef0262), TOBN(0xfa8c472c, 0x8f54e103), + TOBN(0x7539c0a8, 0x72c052ec), TOBN(0xd7b27369, 0x5a3490e9)}}, + {{TOBN(0x143fe1f1, 0x71684349), TOBN(0x36b4722e, 0x32e19b97), + TOBN(0xdc059227, 0x90980aff), TOBN(0x175c9c88, 0x9e13d674)}, + {TOBN(0xa7de5b22, 0x6e6bfdb1), TOBN(0x5ea5b7b2, 0xbedb4b46), + TOBN(0xd5570191, 0xd34a6e44), TOBN(0xfcf60d2e, 0xa24ff7e6)}}, + {{TOBN(0x614a392d, 0x677819e1), TOBN(0x7be74c7e, 0xaa5a29e8), + TOBN(0xab50fece, 0x63c85f3f), TOBN(0xaca2e2a9, 0x46cab337)}, + {TOBN(0x7f700388, 0x122a6fe3), TOBN(0xdb69f703, 0x882a04a8), + TOBN(0x9a77935d, 0xcf7aed57), TOBN(0xdf16207c, 0x8d91c86f)}}, + {{TOBN(0x2fca49ab, 0x63ed9998), TOBN(0xa3125c44, 0xa77ddf96), + TOBN(0x05dd8a86, 0x24344072), TOBN(0xa023dda2, 0xfec3fb56)}, + {TOBN(0x421b41fc, 0x0c743032), TOBN(0x4f2120c1, 0x5e438639), + TOBN(0xfb7cae51, 0xc83c1b07), TOBN(0xb2370caa, 0xcac2171a)}}, + {{TOBN(0x2eb2d962, 0x6cc820fb), TOBN(0x59feee5c, 0xb85a44bf), + TOBN(0x94620fca, 0x5b6598f0), TOBN(0x6b922cae, 0x7e314051)}, + {TOBN(0xff8745ad, 0x106bed4e), TOBN(0x546e71f5, 0xdfa1e9ab), + TOBN(0x935c1e48, 0x1ec29487), TOBN(0x9509216c, 0x4d936530)}}, + {{TOBN(0xc7ca3067, 0x85c9a2db), TOBN(0xd6ae5152, 0x6be8606f), + TOBN(0x09dbcae6, 0xe14c651d), TOBN(0xc9536e23, 0x9bc32f96)}, + {TOBN(0xa90535a9, 0x34521b03), TOBN(0xf39c526c, 0x878756ff), + TOBN(0x383172ec, 0x8aedf03c), TOBN(0x20a8075e, 0xefe0c034)}}, + {{TOBN(0xf22f9c62, 0x64026422), TOBN(0x8dd10780, 0x24b9d076), + TOBN(0x944c742a, 0x3bef2950), TOBN(0x55b9502e, 0x88a2b00b)}, + 
{TOBN(0xa59e14b4, 0x86a09817), TOBN(0xa39dd3ac, 0x47bb4071), + TOBN(0x55137f66, 0x3be0592f), TOBN(0x07fcafd4, 0xc9e63f5b)}}, + {{TOBN(0x963652ee, 0x346eb226), TOBN(0x7dfab085, 0xec2facb7), + TOBN(0x273bf2b8, 0x691add26), TOBN(0x30d74540, 0xf2b46c44)}, + {TOBN(0x05e8e73e, 0xf2c2d065), TOBN(0xff9b8a00, 0xd42eeac9), + TOBN(0x2fcbd205, 0x97209d22), TOBN(0xeb740ffa, 0xde14ea2c)}}, + {{TOBN(0xc71ff913, 0xa8aef518), TOBN(0x7bfc74bb, 0xfff4cfa2), + TOBN(0x1716680c, 0xb6b36048), TOBN(0x121b2cce, 0x9ef79af1)}, + {TOBN(0xbff3c836, 0xa01eb3d3), TOBN(0x50eb1c6a, 0x5f79077b), + TOBN(0xa48c32d6, 0xa004bbcf), TOBN(0x47a59316, 0x7d64f61d)}}, + {{TOBN(0x6068147f, 0x93102016), TOBN(0x12c5f654, 0x94d12576), + TOBN(0xefb071a7, 0xc9bc6b91), TOBN(0x7c2da0c5, 0x6e23ea95)}, + {TOBN(0xf4fd45b6, 0xd4a1dd5d), TOBN(0x3e7ad9b6, 0x9122b13c), + TOBN(0x342ca118, 0xe6f57a48), TOBN(0x1c2e94a7, 0x06f8288f)}}, + {{TOBN(0x99e68f07, 0x5a97d231), TOBN(0x7c80de97, 0x4d838758), + TOBN(0xbce0f5d0, 0x05872727), TOBN(0xbe5d95c2, 0x19c4d016)}, + {TOBN(0x921d5cb1, 0x9c2492ee), TOBN(0x42192dc1, 0x404d6fb3), + TOBN(0x4c84dcd1, 0x32f988d3), TOBN(0xde26d61f, 0xa17b8e85)}}, + {{TOBN(0xc466dcb6, 0x137c7408), TOBN(0x9a38d7b6, 0x36a266da), + TOBN(0x7ef5cb06, 0x83bebf1b), TOBN(0xe5cdcbbf, 0x0fd014e3)}, + {TOBN(0x30aa376d, 0xf65965a0), TOBN(0x60fe88c2, 0xebb3e95e), + TOBN(0x33fd0b61, 0x66ee6f20), TOBN(0x8827dcdb, 0x3f41f0a0)}}, + {{TOBN(0xbf8a9d24, 0x0c56c690), TOBN(0x40265dad, 0xddb7641d), + TOBN(0x522b05bf, 0x3a6b662b), TOBN(0x466d1dfe, 0xb1478c9b)}, + {TOBN(0xaa616962, 0x1484469b), TOBN(0x0db60549, 0x02df8f9f), + TOBN(0xc37bca02, 0x3cb8bf51), TOBN(0x5effe346, 0x21371ce8)}}, + {{TOBN(0xe8f65264, 0xff112c32), TOBN(0x8a9c736d, 0x7b971fb2), + TOBN(0xa4f19470, 0x7b75080d), TOBN(0xfc3f2c5a, 0x8839c59b)}, + {TOBN(0x1d6c777e, 0x5aeb49c2), TOBN(0xf3db034d, 0xda1addfe), + TOBN(0xd76fee5a, 0x5535affc), TOBN(0x0853ac70, 0xb92251fd)}}, + {{TOBN(0x37e3d594, 0x8b2a29d5), TOBN(0x28f1f457, 0x4de00ddb), + TOBN(0x8083c1b5, 0xf42c328b), TOBN(0xd8ef1d8f, 0xe493c73b)}, + {TOBN(0x96fb6260, 0x41dc61bd), TOBN(0xf74e8a9d, 0x27ee2f8a), + TOBN(0x7c605a80, 0x2c946a5d), TOBN(0xeed48d65, 0x3839ccfd)}}, + {{TOBN(0x9894344f, 0x3a29467a), TOBN(0xde81e949, 0xc51eba6d), + TOBN(0xdaea066b, 0xa5e5c2f2), TOBN(0x3fc8a614, 0x08c8c7b3)}, + {TOBN(0x7adff88f, 0x06d0de9f), TOBN(0xbbc11cf5, 0x3b75ce0a), + TOBN(0x9fbb7acc, 0xfbbc87d5), TOBN(0xa1458e26, 0x7badfde2)}}}, + {{{TOBN(0x1cb43668, 0xe039c256), TOBN(0x5f26fb8b, 0x7c17fd5d), + TOBN(0xeee426af, 0x79aa062b), TOBN(0x072002d0, 0xd78fbf04)}, + {TOBN(0x4c9ca237, 0xe84fb7e3), TOBN(0xb401d8a1, 0x0c82133d), + TOBN(0xaaa52592, 0x6d7e4181), TOBN(0xe9430833, 0x73dbb152)}}, + {{TOBN(0xf92dda31, 0xbe24319a), TOBN(0x03f7d28b, 0xe095a8e7), + TOBN(0xa52fe840, 0x98782185), TOBN(0x276ddafe, 0x29c24dbc)}, + {TOBN(0x80cd5496, 0x1d7a64eb), TOBN(0xe4360889, 0x7f1dbe42), + TOBN(0x2f81a877, 0x8438d2d5), TOBN(0x7e4d52a8, 0x85169036)}}, + {{TOBN(0x19e3d5b1, 0x1d59715d), TOBN(0xc7eaa762, 0xd788983e), + TOBN(0xe5a730b0, 0xabf1f248), TOBN(0xfbab8084, 0xfae3fd83)}, + {TOBN(0x65e50d21, 0x53765b2f), TOBN(0xbdd4e083, 0xfa127f3d), + TOBN(0x9cf3c074, 0x397b1b10), TOBN(0x59f8090c, 0xb1b59fd3)}}, + {{TOBN(0x7b15fd9d, 0x615faa8f), TOBN(0x8fa1eb40, 0x968554ed), + TOBN(0x7bb4447e, 0x7aa44882), TOBN(0x2bb2d0d1, 0x029fff32)}, + {TOBN(0x075e2a64, 0x6caa6d2f), TOBN(0x8eb879de, 0x22e7351b), + TOBN(0xbcd5624e, 0x9a506c62), TOBN(0x218eaef0, 0xa87e24dc)}}, + {{TOBN(0x37e56847, 0x44ddfa35), TOBN(0x9ccfc5c5, 0xdab3f747), + TOBN(0x9ac1df3f, 0x1ee96cf4), TOBN(0x0c0571a1, 
0x3b480b8f)}, + {TOBN(0x2fbeb3d5, 0x4b3a7b3c), TOBN(0x35c03669, 0x5dcdbb99), + TOBN(0x52a0f5dc, 0xb2415b3a), TOBN(0xd57759b4, 0x4413ed9a)}}, + {{TOBN(0x1fe647d8, 0x3d30a2c5), TOBN(0x0857f77e, 0xf78a81dc), + TOBN(0x11d5a334, 0x131a4a9b), TOBN(0xc0a94af9, 0x29d393f5)}, + {TOBN(0xbc3a5c0b, 0xdaa6ec1a), TOBN(0xba9fe493, 0x88d2d7ed), + TOBN(0xbb4335b4, 0xbb614797), TOBN(0x991c4d68, 0x72f83533)}}, + {{TOBN(0x53258c28, 0xd2f01cb3), TOBN(0x93d6eaa3, 0xd75db0b1), + TOBN(0x419a2b0d, 0xe87d0db4), TOBN(0xa1e48f03, 0xd8fe8493)}, + {TOBN(0xf747faf6, 0xc508b23a), TOBN(0xf137571a, 0x35d53549), + TOBN(0x9f5e58e2, 0xfcf9b838), TOBN(0xc7186cee, 0xa7fd3cf5)}}, + {{TOBN(0x77b868ce, 0xe978a1d3), TOBN(0xe3a68b33, 0x7ab92d04), + TOBN(0x51029794, 0x87a5b862), TOBN(0x5f0606c3, 0x3a61d41d)}, + {TOBN(0x2814be27, 0x6f9326f1), TOBN(0x2f521c14, 0xc6fe3c2e), + TOBN(0x17464d7d, 0xacdf7351), TOBN(0x10f5f9d3, 0x777f7e44)}}, + {{TOBN(0xce8e616b, 0x269fb37d), TOBN(0xaaf73804, 0x7de62de5), + TOBN(0xaba11175, 0x4fdd4153), TOBN(0x515759ba, 0x3770b49b)}, + {TOBN(0x8b09ebf8, 0xaa423a61), TOBN(0x592245a1, 0xcd41fb92), + TOBN(0x1cba8ec1, 0x9b4c8936), TOBN(0xa87e91e3, 0xaf36710e)}}, + {{TOBN(0x1fd84ce4, 0x3d34a2e3), TOBN(0xee3759ce, 0xb43b5d61), + TOBN(0x895bc78c, 0x619186c7), TOBN(0xf19c3809, 0xcbb9725a)}, + {TOBN(0xc0be21aa, 0xde744b1f), TOBN(0xa7d222b0, 0x60f8056b), + TOBN(0x74be6157, 0xb23efe11), TOBN(0x6fab2b4f, 0x0cd68253)}}, + {{TOBN(0xad33ea5f, 0x4bf1d725), TOBN(0x9c1d8ee2, 0x4f6c950f), + TOBN(0x544ee78a, 0xa377af06), TOBN(0x54f489bb, 0x94a113e1)}, + {TOBN(0x8f11d634, 0x992fb7e8), TOBN(0x0169a7aa, 0xa2a44347), + TOBN(0x1d49d4af, 0x95020e00), TOBN(0x95945722, 0xe08e120b)}}, + {{TOBN(0xb6e33878, 0xa4d32282), TOBN(0xe36e029d, 0x48020ae7), + TOBN(0xe05847fb, 0x37a9b750), TOBN(0xf876812c, 0xb29e3819)}, + {TOBN(0x84ad138e, 0xd23a17f0), TOBN(0x6d7b4480, 0xf0b3950e), + TOBN(0xdfa8aef4, 0x2fd67ae0), TOBN(0x8d3eea24, 0x52333af6)}}, + {{TOBN(0x0d052075, 0xb15d5acc), TOBN(0xc6d9c79f, 0xbd815bc4), + TOBN(0x8dcafd88, 0xdfa36cf2), TOBN(0x908ccbe2, 0x38aa9070)}, + {TOBN(0x638722c4, 0xba35afce), TOBN(0x5a3da8b0, 0xfd6abf0b), + TOBN(0x2dce252c, 0xc9c335c1), TOBN(0x84e7f0de, 0x65aa799b)}}, + {{TOBN(0x2101a522, 0xb99a72cb), TOBN(0x06de6e67, 0x87618016), + TOBN(0x5ff8c7cd, 0xe6f3653e), TOBN(0x0a821ab5, 0xc7a6754a)}, + {TOBN(0x7e3fa52b, 0x7cb0b5a2), TOBN(0xa7fb121c, 0xc9048790), + TOBN(0x1a725020, 0x06ce053a), TOBN(0xb490a31f, 0x04e929b0)}}, + {{TOBN(0xe17be47d, 0x62dd61ad), TOBN(0x781a961c, 0x6be01371), + TOBN(0x1063bfd3, 0xdae3cbba), TOBN(0x35647406, 0x7f73c9ba)}, + {TOBN(0xf50e957b, 0x2736a129), TOBN(0xa6313702, 0xed13f256), + TOBN(0x9436ee65, 0x3a19fcc5), TOBN(0xcf2bdb29, 0xe7a4c8b6)}}, + {{TOBN(0xb06b1244, 0xc5f95cd8), TOBN(0xda8c8af0, 0xf4ab95f4), + TOBN(0x1bae59c2, 0xb9e5836d), TOBN(0x07d51e7e, 0x3acffffc)}, + {TOBN(0x01e15e6a, 0xc2ccbcda), TOBN(0x3bc1923f, 0x8528c3e0), + TOBN(0x43324577, 0xa49fead4), TOBN(0x61a1b884, 0x2aa7a711)}}, + {{TOBN(0xf9a86e08, 0x700230ef), TOBN(0x0af585a1, 0xbd19adf8), + TOBN(0x7645f361, 0xf55ad8f2), TOBN(0x6e676223, 0x46c3614c)}, + {TOBN(0x23cb257c, 0x4e774d3f), TOBN(0x82a38513, 0xac102d1b), + TOBN(0x9bcddd88, 0x7b126aa5), TOBN(0xe716998b, 0xeefd3ee4)}}, + {{TOBN(0x4239d571, 0xfb167583), TOBN(0xdd011c78, 0xd16c8f8a), + TOBN(0x271c2895, 0x69a27519), TOBN(0x9ce0a3b7, 0xd2d64b6a)}, + {TOBN(0x8c977289, 0xd5ec6738), TOBN(0xa3b49f9a, 0x8840ef6b), + TOBN(0x808c14c9, 0x9a453419), TOBN(0x5c00295b, 0x0cf0a2d5)}}, + {{TOBN(0x524414fb, 0x1d4bcc76), TOBN(0xb07691d2, 0x459a88f1), + TOBN(0x77f43263, 0xf70d110f), 
TOBN(0x64ada5e0, 0xb7abf9f3)}, + {TOBN(0xafd0f94e, 0x5b544cf5), TOBN(0xb4a13a15, 0xfd2713fe), + TOBN(0xb99b7d6e, 0x250c74f4), TOBN(0x097f2f73, 0x20324e45)}}, + {{TOBN(0x994b37d8, 0xaffa8208), TOBN(0xc3c31b0b, 0xdc29aafc), + TOBN(0x3da74651, 0x7a3a607f), TOBN(0xd8e1b8c1, 0xfe6955d6)}, + {TOBN(0x716e1815, 0xc8418682), TOBN(0x541d487f, 0x7dc91d97), + TOBN(0x48a04669, 0xc6996982), TOBN(0xf39cab15, 0x83a6502e)}}, + {{TOBN(0x025801a0, 0xe68db055), TOBN(0xf3569758, 0xba3338d5), + TOBN(0xb0c8c0aa, 0xee2afa84), TOBN(0x4f6985d3, 0xfb6562d1)}, + {TOBN(0x351f1f15, 0x132ed17a), TOBN(0x510ed0b4, 0xc04365fe), + TOBN(0xa3f98138, 0xe5b1f066), TOBN(0xbc9d95d6, 0x32df03dc)}}, + {{TOBN(0xa83ccf6e, 0x19abd09e), TOBN(0x0b4097c1, 0x4ff17edb), + TOBN(0x58a5c478, 0xd64a06ce), TOBN(0x2ddcc3fd, 0x544a58fd)}, + {TOBN(0xd449503d, 0x9e8153b8), TOBN(0x3324fd02, 0x7774179b), + TOBN(0xaf5d47c8, 0xdbd9120c), TOBN(0xeb860162, 0x34fa94db)}}, + {{TOBN(0x5817bdd1, 0x972f07f4), TOBN(0xe5579e2e, 0xd27bbceb), + TOBN(0x86847a1f, 0x5f11e5a6), TOBN(0xb39ed255, 0x7c3cf048)}, + {TOBN(0xe1076417, 0xa2f62e55), TOBN(0x6b9ab38f, 0x1bcf82a2), + TOBN(0x4bb7c319, 0x7aeb29f9), TOBN(0xf6d17da3, 0x17227a46)}}, + {{TOBN(0xab53ddbd, 0x0f968c00), TOBN(0xa03da7ec, 0x000c880b), + TOBN(0x7b239624, 0x6a9ad24d), TOBN(0x612c0401, 0x01ec60d0)}, + {TOBN(0x70d10493, 0x109f5df1), TOBN(0xfbda4030, 0x80af7550), + TOBN(0x30b93f95, 0xc6b9a9b3), TOBN(0x0c74ec71, 0x007d9418)}}, + {{TOBN(0x94175564, 0x6edb951f), TOBN(0x5f4a9d78, 0x7f22c282), + TOBN(0xb7870895, 0xb38d1196), TOBN(0xbc593df3, 0xa228ce7c)}, + {TOBN(0xc78c5bd4, 0x6af3641a), TOBN(0x7802200b, 0x3d9b3dcc), + TOBN(0x0dc73f32, 0x8be33304), TOBN(0x847ed87d, 0x61ffb79a)}}, + {{TOBN(0xf85c974e, 0x6d671192), TOBN(0x1e14100a, 0xde16f60f), + TOBN(0x45cb0d5a, 0x95c38797), TOBN(0x18923bba, 0x9b022da4)}, + {TOBN(0xef2be899, 0xbbe7e86e), TOBN(0x4a1510ee, 0x216067bf), + TOBN(0xd98c8154, 0x84d5ce3e), TOBN(0x1af777f0, 0xf92a2b90)}}, + {{TOBN(0x9fbcb400, 0x4ef65724), TOBN(0x3e04a4c9, 0x3c0ca6fe), + TOBN(0xfb3e2cb5, 0x55002994), TOBN(0x1f3a93c5, 0x5363ecab)}, + {TOBN(0x1fe00efe, 0x3923555b), TOBN(0x744bedd9, 0x1e1751ea), + TOBN(0x3fb2db59, 0x6ab69357), TOBN(0x8dbd7365, 0xf5e6618b)}}, + {{TOBN(0x99d53099, 0xdf1ea40e), TOBN(0xb3f24a0b, 0x57d61e64), + TOBN(0xd088a198, 0x596eb812), TOBN(0x22c8361b, 0x5762940b)}, + {TOBN(0x66f01f97, 0xf9c0d95c), TOBN(0x88461172, 0x8e43cdae), + TOBN(0x11599a7f, 0xb72b15c3), TOBN(0x135a7536, 0x420d95cc)}}, + {{TOBN(0x2dcdf0f7, 0x5f7ae2f6), TOBN(0x15fc6e1d, 0xd7fa6da2), + TOBN(0x81ca829a, 0xd1d441b6), TOBN(0x84c10cf8, 0x04a106b6)}, + {TOBN(0xa9b26c95, 0xa73fbbd0), TOBN(0x7f24e0cb, 0x4d8f6ee8), + TOBN(0x48b45937, 0x1e25a043), TOBN(0xf8a74fca, 0x036f3dfe)}}, + {{TOBN(0x1ed46585, 0xc9f84296), TOBN(0x7fbaa8fb, 0x3bc278b0), + TOBN(0xa8e96cd4, 0x6c4fcbd0), TOBN(0x940a1202, 0x73b60a5f)}, + {TOBN(0x34aae120, 0x55a4aec8), TOBN(0x550e9a74, 0xdbd742f0), + TOBN(0x794456d7, 0x228c68ab), TOBN(0x492f8868, 0xa4e25ec6)}}, + {{TOBN(0x682915ad, 0xb2d8f398), TOBN(0xf13b51cc, 0x5b84c953), + TOBN(0xcda90ab8, 0x5bb917d6), TOBN(0x4b615560, 0x4ea3dee1)}, + {TOBN(0x578b4e85, 0x0a52c1c8), TOBN(0xeab1a695, 0x20b75fc4), + TOBN(0x60c14f3c, 0xaa0bb3c6), TOBN(0x220f448a, 0xb8216094)}}, + {{TOBN(0x4fe7ee31, 0xb0e63d34), TOBN(0xf4600572, 0xa9e54fab), + TOBN(0xc0493334, 0xd5e7b5a4), TOBN(0x8589fb92, 0x06d54831)}, + {TOBN(0xaa70f5cc, 0x6583553a), TOBN(0x0879094a, 0xe25649e5), + TOBN(0xcc904507, 0x10044652), TOBN(0xebb0696d, 0x02541c4f)}}, + {{TOBN(0x5a171fde, 0xb9718710), TOBN(0x38f1bed8, 0xf374a9f5), + TOBN(0xc8c582e1, 
0xba39bdc1), TOBN(0xfc457b0a, 0x908cc0ce)}, + {TOBN(0x9a187fd4, 0x883841e2), TOBN(0x8ec25b39, 0x38725381), + TOBN(0x2553ed05, 0x96f84395), TOBN(0x095c7661, 0x6f6c6897)}}, + {{TOBN(0x917ac85c, 0x4bdc5610), TOBN(0xb2885fe4, 0x179eb301), + TOBN(0x5fc65547, 0x8b78bdcc), TOBN(0x4a9fc893, 0xe59e4699)}, + {TOBN(0xbb7ff0cd, 0x3ce299af), TOBN(0x195be9b3, 0xadf38b20), + TOBN(0x6a929c87, 0xd38ddb8f), TOBN(0x55fcc99c, 0xb21a51b9)}}, + {{TOBN(0x2b695b4c, 0x721a4593), TOBN(0xed1e9a15, 0x768eaac2), + TOBN(0xfb63d71c, 0x7489f914), TOBN(0xf98ba31c, 0x78118910)}, + {TOBN(0x80291373, 0x9b128eb4), TOBN(0x7801214e, 0xd448af4a), + TOBN(0xdbd2e22b, 0x55418dd3), TOBN(0xeffb3c0d, 0xd3998242)}}, + {{TOBN(0xdfa6077c, 0xc7bf3827), TOBN(0xf2165bcb, 0x47f8238f), + TOBN(0xfe37cf68, 0x8564d554), TOBN(0xe5f825c4, 0x0a81fb98)}, + {TOBN(0x43cc4f67, 0xffed4d6f), TOBN(0xbc609578, 0xb50a34b0), + TOBN(0x8aa8fcf9, 0x5041faf1), TOBN(0x5659f053, 0x651773b6)}}, + {{TOBN(0xe87582c3, 0x6044d63b), TOBN(0xa6089409, 0x0cdb0ca0), + TOBN(0x8c993e0f, 0xbfb2bcf6), TOBN(0xfc64a719, 0x45985cfc)}, + {TOBN(0x15c4da80, 0x83dbedba), TOBN(0x804ae112, 0x2be67df7), + TOBN(0xda4c9658, 0xa23defde), TOBN(0x12002ddd, 0x5156e0d3)}}, + {{TOBN(0xe68eae89, 0x5dd21b96), TOBN(0x8b99f28b, 0xcf44624d), + TOBN(0x0ae00808, 0x1ec8897a), TOBN(0xdd0a9303, 0x6712f76e)}, + {TOBN(0x96237522, 0x4e233de4), TOBN(0x192445b1, 0x2b36a8a5), + TOBN(0xabf9ff74, 0x023993d9), TOBN(0x21f37bf4, 0x2aad4a8f)}}, + {{TOBN(0x340a4349, 0xf8bd2bbd), TOBN(0x1d902cd9, 0x4868195d), + TOBN(0x3d27bbf1, 0xe5fdb6f1), TOBN(0x7a5ab088, 0x124f9f1c)}, + {TOBN(0xc466ab06, 0xf7a09e03), TOBN(0x2f8a1977, 0x31f2c123), + TOBN(0xda355dc7, 0x041b6657), TOBN(0xcb840d12, 0x8ece2a7c)}}, + {{TOBN(0xb600ad9f, 0x7db32675), TOBN(0x78fea133, 0x07a06f1b), + TOBN(0x5d032269, 0xb31f6094), TOBN(0x07753ef5, 0x83ec37aa)}, + {TOBN(0x03485aed, 0x9c0bea78), TOBN(0x41bb3989, 0xbc3f4524), + TOBN(0x09403761, 0x697f726d), TOBN(0x6109beb3, 0xdf394820)}}, + {{TOBN(0x804111ea, 0x3b6d1145), TOBN(0xb6271ea9, 0xa8582654), + TOBN(0x619615e6, 0x24e66562), TOBN(0xa2554945, 0xd7b6ad9c)}, + {TOBN(0xd9c4985e, 0x99bfe35f), TOBN(0x9770ccc0, 0x7b51cdf6), + TOBN(0x7c327013, 0x92881832), TOBN(0x8777d45f, 0x286b26d1)}}, + {{TOBN(0x9bbeda22, 0xd847999d), TOBN(0x03aa33b6, 0xc3525d32), + TOBN(0x4b7b96d4, 0x28a959a1), TOBN(0xbb3786e5, 0x31e5d234)}, + {TOBN(0xaeb5d3ce, 0x6961f247), TOBN(0x20aa85af, 0x02f93d3f), + TOBN(0x9cd1ad3d, 0xd7a7ae4f), TOBN(0xbf6688f0, 0x781adaa8)}}, + {{TOBN(0xb1b40e86, 0x7469cead), TOBN(0x1904c524, 0x309fca48), + TOBN(0x9b7312af, 0x4b54bbc7), TOBN(0xbe24bf8f, 0x593affa2)}, + {TOBN(0xbe5e0790, 0xbd98764b), TOBN(0xa0f45f17, 0xa26e299e), + TOBN(0x4af0d2c2, 0x6b8fe4c7), TOBN(0xef170db1, 0x8ae8a3e6)}}, + {{TOBN(0x0e8d61a0, 0x29e0ccc1), TOBN(0xcd53e87e, 0x60ad36ca), + TOBN(0x328c6623, 0xc8173822), TOBN(0x7ee1767d, 0xa496be55)}, + {TOBN(0x89f13259, 0x648945af), TOBN(0x9e45a5fd, 0x25c8009c), + TOBN(0xaf2febd9, 0x1f61ab8c), TOBN(0x43f6bc86, 0x8a275385)}}, + {{TOBN(0x87792348, 0xf2142e79), TOBN(0x17d89259, 0xc6e6238a), + TOBN(0x7536d2f6, 0x4a839d9b), TOBN(0x1f428fce, 0x76a1fbdc)}, + {TOBN(0x1c109601, 0x0db06dfe), TOBN(0xbfc16bc1, 0x50a3a3cc), + TOBN(0xf9cbd9ec, 0x9b30f41b), TOBN(0x5b5da0d6, 0x00138cce)}}, + {{TOBN(0xec1d0a48, 0x56ef96a7), TOBN(0xb47eb848, 0x982bf842), + TOBN(0x66deae32, 0xec3f700d), TOBN(0x4e43c42c, 0xaa1181e0)}, + {TOBN(0xa1d72a31, 0xd1a4aa2a), TOBN(0x440d4668, 0xc004f3ce), + TOBN(0x0d6a2d3b, 0x45fe8a7a), TOBN(0x820e52e2, 0xfb128365)}}, + {{TOBN(0x29ac5fcf, 0x25e51b09), TOBN(0x180cd2bf, 0x2023d159), + 
TOBN(0xa9892171, 0xa1ebf90e), TOBN(0xf97c4c87, 0x7c132181)}, + {TOBN(0x9f1dc724, 0xc03dbb7e), TOBN(0xae043765, 0x018cbbe4), + TOBN(0xfb0b2a36, 0x0767d153), TOBN(0xa8e2f4d6, 0x249cbaeb)}}, + {{TOBN(0x172a5247, 0xd95ea168), TOBN(0x1758fada, 0x2970764a), + TOBN(0xac803a51, 0x1d978169), TOBN(0x299cfe2e, 0xde77e01b)}, + {TOBN(0x652a1e17, 0xb0a98927), TOBN(0x2e26e1d1, 0x20014495), + TOBN(0x7ae0af9f, 0x7175b56a), TOBN(0xc2e22a80, 0xd64b9f95)}}, + {{TOBN(0x4d0ff9fb, 0xd90a060a), TOBN(0x496a27db, 0xbaf38085), + TOBN(0x32305401, 0xda776bcf), TOBN(0xb8cdcef6, 0x725f209e)}, + {TOBN(0x61ba0f37, 0x436a0bba), TOBN(0x263fa108, 0x76860049), + TOBN(0x92beb98e, 0xda3542cf), TOBN(0xa2d4d14a, 0xd5849538)}}, + {{TOBN(0x989b9d68, 0x12e9a1bc), TOBN(0x61d9075c, 0x5f6e3268), + TOBN(0x352c6aa9, 0x99ace638), TOBN(0xde4e4a55, 0x920f43ff)}, + {TOBN(0xe5e4144a, 0xd673c017), TOBN(0x667417ae, 0x6f6e05ea), + TOBN(0x613416ae, 0xdcd1bd56), TOBN(0x5eb36201, 0x86693711)}}, + {{TOBN(0x2d7bc504, 0x3a1aa914), TOBN(0x175a1299, 0x76dc5975), + TOBN(0xe900e0f2, 0x3fc8125c), TOBN(0x569ef68c, 0x11198875)}, + {TOBN(0x9012db63, 0x63a113b4), TOBN(0xe3bd3f56, 0x98835766), + TOBN(0xa5c94a52, 0x76412dea), TOBN(0xad9e2a09, 0xaa735e5c)}}, + {{TOBN(0x405a984c, 0x508b65e9), TOBN(0xbde4a1d1, 0x6df1a0d1), + TOBN(0x1a9433a1, 0xdfba80da), TOBN(0xe9192ff9, 0x9440ad2e)}, + {TOBN(0x9f649696, 0x5099fe92), TOBN(0x25ddb65c, 0x0b27a54a), + TOBN(0x178279dd, 0xc590da61), TOBN(0x5479a999, 0xfbde681a)}}, + {{TOBN(0xd0e84e05, 0x013fe162), TOBN(0xbe11dc92, 0x632d471b), + TOBN(0xdf0b0c45, 0xfc0e089f), TOBN(0x04fb15b0, 0x4c144025)}, + {TOBN(0xa61d5fc2, 0x13c99927), TOBN(0xa033e9e0, 0x3de2eb35), + TOBN(0xf8185d5c, 0xb8dacbb4), TOBN(0x9a88e265, 0x8644549d)}}, + {{TOBN(0xf717af62, 0x54671ff6), TOBN(0x4bd4241b, 0x5fa58603), + TOBN(0x06fba40b, 0xe67773c0), TOBN(0xc1d933d2, 0x6a2847e9)}, + {TOBN(0xf4f5acf3, 0x689e2c70), TOBN(0x92aab0e7, 0x46bafd31), + TOBN(0x798d76aa, 0x3473f6e5), TOBN(0xcc6641db, 0x93141934)}}, + {{TOBN(0xcae27757, 0xd31e535e), TOBN(0x04cc43b6, 0x87c2ee11), + TOBN(0x8d1f9675, 0x2e029ffa), TOBN(0xc2150672, 0xe4cc7a2c)}, + {TOBN(0x3b03c1e0, 0x8d68b013), TOBN(0xa9d6816f, 0xedf298f3), + TOBN(0x1bfbb529, 0xa2804464), TOBN(0x95a52fae, 0x5db22125)}}, + {{TOBN(0x55b32160, 0x0e1cb64e), TOBN(0x004828f6, 0x7e7fc9fe), + TOBN(0x13394b82, 0x1bb0fb93), TOBN(0xb6293a2d, 0x35f1a920)}, + {TOBN(0xde35ef21, 0xd145d2d9), TOBN(0xbe6225b3, 0xbb8fa603), + TOBN(0x00fc8f6b, 0x32cf252d), TOBN(0xa28e52e6, 0x117cf8c2)}}, + {{TOBN(0x9d1dc89b, 0x4c371e6d), TOBN(0xcebe0675, 0x36ef0f28), + TOBN(0x5de05d09, 0xa4292f81), TOBN(0xa8303593, 0x353e3083)}, + {TOBN(0xa1715b0a, 0x7e37a9bb), TOBN(0x8c56f61e, 0x2b8faec3), + TOBN(0x52507431, 0x33c9b102), TOBN(0x0130cefc, 0xa44431f0)}}, + {{TOBN(0x56039fa0, 0xbd865cfb), TOBN(0x4b03e578, 0xbc5f1dd7), + TOBN(0x40edf2e4, 0xbabe7224), TOBN(0xc752496d, 0x3a1988f6)}, + {TOBN(0xd1572d3b, 0x564beb6b), TOBN(0x0db1d110, 0x39a1c608), + TOBN(0x568d1934, 0x16f60126), TOBN(0x05ae9668, 0xf354af33)}}, + {{TOBN(0x19de6d37, 0xc92544f2), TOBN(0xcc084353, 0xa35837d5), + TOBN(0xcbb6869c, 0x1a514ece), TOBN(0xb633e728, 0x2e1d1066)}, + {TOBN(0xf15dd69f, 0x936c581c), TOBN(0x96e7b8ce, 0x7439c4f9), + TOBN(0x5e676f48, 0x2e448a5b), TOBN(0xb2ca7d5b, 0xfd916bbb)}}, + {{TOBN(0xd55a2541, 0xf5024025), TOBN(0x47bc5769, 0xe4c2d937), + TOBN(0x7d31b92a, 0x0362189f), TOBN(0x83f3086e, 0xef7816f9)}, + {TOBN(0xf9f46d94, 0xb587579a), TOBN(0xec2d22d8, 0x30e76c5f), + TOBN(0x27d57461, 0xb000ffcf), TOBN(0xbb7e65f9, 0x364ffc2c)}}, + {{TOBN(0x7c7c9477, 0x6652a220), TOBN(0x61618f89, 0xd696c981), 
+ TOBN(0x5021701d, 0x89effff3), TOBN(0xf2c8ff8e, 0x7c314163)}, + {TOBN(0x2da413ad, 0x8efb4d3e), TOBN(0x937b5adf, 0xce176d95), + TOBN(0x22867d34, 0x2a67d51c), TOBN(0x262b9b10, 0x18eb3ac9)}}, + {{TOBN(0x4e314fe4, 0xc43ff28b), TOBN(0x76476627, 0x6a664e7a), + TOBN(0x3e90e40b, 0xb7a565c2), TOBN(0x8588993a, 0xc1acf831)}, + {TOBN(0xd7b501d6, 0x8f938829), TOBN(0x996627ee, 0x3edd7d4c), + TOBN(0x37d44a62, 0x90cd34c7), TOBN(0xa8327499, 0xf3833e8d)}}, + {{TOBN(0x2e18917d, 0x4bf50353), TOBN(0x85dd726b, 0x556765fb), + TOBN(0x54fe65d6, 0x93d5ab66), TOBN(0x3ddbaced, 0x915c25fe)}, + {TOBN(0xa799d9a4, 0x12f22e85), TOBN(0xe2a24867, 0x6d06f6bc), + TOBN(0xf4f1ee56, 0x43ca1637), TOBN(0xfda2828b, 0x61ece30a)}}, + {{TOBN(0x758c1a3e, 0xa2dee7a6), TOBN(0xdcde2f3c, 0x734b2284), + TOBN(0xaba445d2, 0x4eaba6ad), TOBN(0x35aaf668, 0x76cee0a7)}, + {TOBN(0x7e0b04a9, 0xe5aa049a), TOBN(0xe74083ad, 0x91103e84), + TOBN(0xbeb183ce, 0x40afecc3), TOBN(0x6b89de9f, 0xea043f7a)}}}, + {{{TOBN(0x0e299d23, 0xfe67ba66), TOBN(0x91450760, 0x93cf2f34), + TOBN(0xf45b5ea9, 0x97fcf913), TOBN(0x5be00843, 0x8bd7ddda)}, + {TOBN(0x358c3e05, 0xd53ff04d), TOBN(0xbf7ccdc3, 0x5de91ef7), + TOBN(0xad684dbf, 0xb69ec1a0), TOBN(0x367e7cf2, 0x801fd997)}}, + {{TOBN(0x0ca1f3b7, 0xb0dc8595), TOBN(0x27de4608, 0x9f1d9f2e), + TOBN(0x1af3bf39, 0xbadd82a7), TOBN(0x79356a79, 0x65862448)}, + {TOBN(0xc0602345, 0xf5f9a052), TOBN(0x1a8b0f89, 0x139a42f9), + TOBN(0xb53eee42, 0x844d40fc), TOBN(0x93b0bfe5, 0x4e5b6368)}}, + {{TOBN(0x5434dd02, 0xc024789c), TOBN(0x90dca9ea, 0x41b57bfc), + TOBN(0x8aa898e2, 0x243398df), TOBN(0xf607c834, 0x894a94bb)}, + {TOBN(0xbb07be97, 0xc2c99b76), TOBN(0x6576ba67, 0x18c29302), + TOBN(0x3d79efcc, 0xe703a88c), TOBN(0xf259ced7, 0xb6a0d106)}}, + {{TOBN(0x0f893a5d, 0xc8de610b), TOBN(0xe8c515fb, 0x67e223ce), + TOBN(0x7774bfa6, 0x4ead6dc5), TOBN(0x89d20f95, 0x925c728f)}, + {TOBN(0x7a1e0966, 0x098583ce), TOBN(0xa2eedb94, 0x93f2a7d7), + TOBN(0x1b282097, 0x4c304d4a), TOBN(0x0842e3da, 0xc077282d)}}, + {{TOBN(0xe4d972a3, 0x3b9e2d7b), TOBN(0x7cc60b27, 0xc48218ff), + TOBN(0x8fc70838, 0x84149d91), TOBN(0x5c04346f, 0x2f461ecc)}, + {TOBN(0xebe9fdf2, 0x614650a9), TOBN(0x5e35b537, 0xc1f666ac), + TOBN(0x645613d1, 0x88babc83), TOBN(0x88cace3a, 0xc5e1c93e)}}, + {{TOBN(0x209ca375, 0x3de92e23), TOBN(0xccb03cc8, 0x5fbbb6e3), + TOBN(0xccb90f03, 0xd7b1487e), TOBN(0xfa9c2a38, 0xc710941f)}, + {TOBN(0x756c3823, 0x6724ceed), TOBN(0x3a902258, 0x192d0323), + TOBN(0xb150e519, 0xea5e038e), TOBN(0xdcba2865, 0xc7427591)}}, + {{TOBN(0xe549237f, 0x78890732), TOBN(0xc443bef9, 0x53fcb4d9), + TOBN(0x9884d8a6, 0xeb3480d6), TOBN(0x8a35b6a1, 0x3048b186)}, + {TOBN(0xb4e44716, 0x65e9a90a), TOBN(0x45bf380d, 0x653006c0), + TOBN(0x8f3f820d, 0x4fe9ae3b), TOBN(0x244a35a0, 0x979a3b71)}}, + {{TOBN(0xa1010e9d, 0x74cd06ff), TOBN(0x9c17c7df, 0xaca3eeac), + TOBN(0x74c86cd3, 0x8063aa2b), TOBN(0x8595c4b3, 0x734614ff)}, + {TOBN(0xa3de00ca, 0x990f62cc), TOBN(0xd9bed213, 0xca0c3be5), + TOBN(0x7886078a, 0xdf8ce9f5), TOBN(0xddb27ce3, 0x5cd44444)}}, + {{TOBN(0xed374a66, 0x58926ddd), TOBN(0x138b2d49, 0x908015b8), + TOBN(0x886c6579, 0xde1f7ab8), TOBN(0x888b9aa0, 0xc3020b7a)}, + {TOBN(0xd3ec034e, 0x3a96e355), TOBN(0xba65b0b8, 0xf30fbe9a), + TOBN(0x064c8e50, 0xff21367a), TOBN(0x1f508ea4, 0x0b04b46e)}}, + {{TOBN(0x98561a49, 0x747c866c), TOBN(0xbbb1e5fe, 0x0518a062), + TOBN(0x20ff4e8b, 0xecdc3608), TOBN(0x7f55cded, 0x20184027)}, + {TOBN(0x8d73ec95, 0xf38c85f0), TOBN(0x5b589fdf, 0x8bc3b8c3), + TOBN(0xbe95dd98, 0x0f12b66f), TOBN(0xf5bd1a09, 0x0e338e01)}}, + {{TOBN(0x65163ae5, 0x5e915918), TOBN(0x6158d6d9, 
0x86f8a46b), + TOBN(0x8466b538, 0xeeebf99c), TOBN(0xca8761f6, 0xbca477ef)}, + {TOBN(0xaf3449c2, 0x9ebbc601), TOBN(0xef3b0f41, 0xe0c3ae2f), + TOBN(0xaa6c577d, 0x5de63752), TOBN(0xe9166601, 0x64682a51)}}, + {{TOBN(0x5a3097be, 0xfc15aa1e), TOBN(0x40d12548, 0xb54b0745), + TOBN(0x5bad4706, 0x519a5f12), TOBN(0xed03f717, 0xa439dee6)}, + {TOBN(0x0794bb6c, 0x4a02c499), TOBN(0xf725083d, 0xcffe71d2), + TOBN(0x2cad7519, 0x0f3adcaf), TOBN(0x7f68ea1c, 0x43729310)}}, + {{TOBN(0xe747c8c7, 0xb7ffd977), TOBN(0xec104c35, 0x80761a22), + TOBN(0x8395ebaf, 0x5a3ffb83), TOBN(0xfb3261f4, 0xe4b63db7)}, + {TOBN(0x53544960, 0xd883e544), TOBN(0x13520d70, 0x8cc2eeb8), + TOBN(0x08f6337b, 0xd3d65f99), TOBN(0x83997db2, 0x781cf95b)}}, + {{TOBN(0xce6ff106, 0x0dbd2c01), TOBN(0x4f8eea6b, 0x1f9ce934), + TOBN(0x546f7c4b, 0x0e993921), TOBN(0x6236a324, 0x5e753fc7)}, + {TOBN(0x65a41f84, 0xa16022e9), TOBN(0x0c18d878, 0x43d1dbb2), + TOBN(0x73c55640, 0x2d4cef9c), TOBN(0xa0428108, 0x70444c74)}}, + {{TOBN(0x68e4f15e, 0x9afdfb3c), TOBN(0x49a56143, 0x5bdfb6df), + TOBN(0xa9bc1bd4, 0x5f823d97), TOBN(0xbceb5970, 0xea111c2a)}, + {TOBN(0x366b455f, 0xb269bbc4), TOBN(0x7cd85e1e, 0xe9bc5d62), + TOBN(0xc743c41c, 0x4f18b086), TOBN(0xa4b40990, 0x95294fb9)}}, + {{TOBN(0x9c7c581d, 0x26ee8382), TOBN(0xcf17dcc5, 0x359d638e), + TOBN(0xee8273ab, 0xb728ae3d), TOBN(0x1d112926, 0xf821f047)}, + {TOBN(0x11498477, 0x50491a74), TOBN(0x687fa761, 0xfde0dfb9), + TOBN(0x2c258022, 0x7ea435ab), TOBN(0x6b8bdb94, 0x91ce7e3f)}}, + {{TOBN(0x4c5b5dc9, 0x3bf834aa), TOBN(0x04371819, 0x4f6c7e4b), + TOBN(0xc284e00a, 0x3736bcad), TOBN(0x0d881118, 0x21ae8f8d)}, + {TOBN(0xf9cf0f82, 0xf48c8e33), TOBN(0xa11fd075, 0xa1bf40db), + TOBN(0xdceab0de, 0xdc2733e5), TOBN(0xc560a8b5, 0x8e986bd7)}}, + {{TOBN(0x48dd1fe2, 0x3929d097), TOBN(0x3885b290, 0x92f188f1), + TOBN(0x0f2ae613, 0xda6fcdac), TOBN(0x9054303e, 0xb662a46c)}, + {TOBN(0xb6871e44, 0x0738042a), TOBN(0x98e6a977, 0xbdaf6449), + TOBN(0xd8bc0650, 0xd1c9df1b), TOBN(0xef3d6451, 0x36e098f9)}}, + {{TOBN(0x03fbae82, 0xb6d72d28), TOBN(0x77ca9db1, 0xf5d84080), + TOBN(0x8a112cff, 0xa58efc1c), TOBN(0x518d761c, 0xc564cb4a)}, + {TOBN(0x69b5740e, 0xf0d1b5ce), TOBN(0x717039cc, 0xe9eb1785), + TOBN(0x3fe29f90, 0x22f53382), TOBN(0x8e54ba56, 0x6bc7c95c)}}, + {{TOBN(0x9c806d8a, 0xf7f91d0f), TOBN(0x3b61b0f1, 0xa82a5728), + TOBN(0x4640032d, 0x94d76754), TOBN(0x273eb5de, 0x47d834c6)}, + {TOBN(0x2988abf7, 0x7b4e4d53), TOBN(0xb7ce66bf, 0xde401777), + TOBN(0x9fba6b32, 0x715071b3), TOBN(0x82413c24, 0xad3a1a98)}}, + {{TOBN(0x5b7fc8c4, 0xe0e8ad93), TOBN(0xb5679aee, 0x5fab868d), + TOBN(0xb1f9d2fa, 0x2b3946f3), TOBN(0x458897dc, 0x5685b50a)}, + {TOBN(0x1e98c930, 0x89d0caf3), TOBN(0x39564c5f, 0x78642e92), + TOBN(0x1b77729a, 0x0dbdaf18), TOBN(0xf9170722, 0x579e82e6)}}, + {{TOBN(0x680c0317, 0xe4515fa5), TOBN(0xf85cff84, 0xfb0c790f), + TOBN(0xc7a82aab, 0x6d2e0765), TOBN(0x7446bca9, 0x35c82b32)}, + {TOBN(0x5de607aa, 0x6d63184f), TOBN(0x7c1a46a8, 0x262803a6), + TOBN(0xd218313d, 0xaebe8035), TOBN(0x92113ffd, 0xc73c51f8)}}, + {{TOBN(0x4b38e083, 0x12e7e46c), TOBN(0x69d0a37a, 0x56126bd5), + TOBN(0xfb3f324b, 0x73c07e04), TOBN(0xa0c22f67, 0x8fda7267)}, + {TOBN(0x8f2c0051, 0x4d2c7d8f), TOBN(0xbc45ced3, 0xcbe2cae5), + TOBN(0xe1c6cf07, 0xa8f0f277), TOBN(0xbc392312, 0x1eb99a98)}}, + {{TOBN(0x75537b7e, 0x3cc8ac85), TOBN(0x8d725f57, 0xdd02753b), + TOBN(0xfd05ff64, 0xb737df2f), TOBN(0x55fe8712, 0xf6d2531d)}, + {TOBN(0x57ce04a9, 0x6ab6b01c), TOBN(0x69a02a89, 0x7cd93724), + TOBN(0x4f82ac35, 0xcf86699b), TOBN(0x8242d3ad, 0x9cb4b232)}}, + {{TOBN(0x713d0f65, 0xd62105e5), 
TOBN(0xbb222bfa, 0x2d29be61), + TOBN(0xf2f9a79e, 0x6cfbef09), TOBN(0xfc24d8d3, 0xd5d6782f)}, + {TOBN(0x5db77085, 0xd4129967), TOBN(0xdb81c3cc, 0xdc3c2a43), + TOBN(0x9d655fc0, 0x05d8d9a3), TOBN(0x3f5d057a, 0x54298026)}}, + {{TOBN(0x1157f56d, 0x88c54694), TOBN(0xb26baba5, 0x9b09573e), + TOBN(0x2cab03b0, 0x22adffd1), TOBN(0x60a412c8, 0xdd69f383)}, + {TOBN(0xed76e98b, 0x54b25039), TOBN(0xd4ee67d3, 0x687e714d), + TOBN(0x87739648, 0x7b00b594), TOBN(0xce419775, 0xc9ef709b)}}, + {{TOBN(0x40f76f85, 0x1c203a40), TOBN(0x30d352d6, 0xeafd8f91), + TOBN(0xaf196d3d, 0x95578dd2), TOBN(0xea4bb3d7, 0x77cc3f3d)}, + {TOBN(0x42a5bd03, 0xb98e782b), TOBN(0xac958c40, 0x0624920d), + TOBN(0xb838134c, 0xfc56fcc8), TOBN(0x86ec4ccf, 0x89572e5e)}}, + {{TOBN(0x69c43526, 0x9be47be0), TOBN(0x323b7dd8, 0xcb28fea1), + TOBN(0xfa5538ba, 0x3a6c67e5), TOBN(0xef921d70, 0x1d378e46)}, + {TOBN(0xf92961fc, 0x3c4b880e), TOBN(0x3f6f914e, 0x98940a67), + TOBN(0xa990eb0a, 0xfef0ff39), TOBN(0xa6c2920f, 0xf0eeff9c)}}, + {{TOBN(0xca804166, 0x51b8d9a3), TOBN(0x42531bc9, 0x0ffb0db1), + TOBN(0x72ce4718, 0xaa82e7ce), TOBN(0x6e199913, 0xdf574741)}, + {TOBN(0xd5f1b13d, 0xd5d36946), TOBN(0x8255dc65, 0xf68f0194), + TOBN(0xdc9df4cd, 0x8710d230), TOBN(0x3453c20f, 0x138c1988)}}, + {{TOBN(0x9af98dc0, 0x89a6ef01), TOBN(0x4dbcc3f0, 0x9857df85), + TOBN(0x34805601, 0x5c1ad924), TOBN(0x40448da5, 0xd0493046)}, + {TOBN(0xf629926d, 0x4ee343e2), TOBN(0x6343f1bd, 0x90e8a301), + TOBN(0xefc93491, 0x40815b3f), TOBN(0xf882a423, 0xde8f66fb)}}, + {{TOBN(0x3a12d5f4, 0xe7db9f57), TOBN(0x7dfba38a, 0x3c384c27), + TOBN(0x7a904bfd, 0x6fc660b1), TOBN(0xeb6c5db3, 0x2773b21c)}, + {TOBN(0xc350ee66, 0x1cdfe049), TOBN(0x9baac0ce, 0x44540f29), + TOBN(0xbc57b6ab, 0xa5ec6aad), TOBN(0x167ce8c3, 0x0a7c1baa)}}, + {{TOBN(0xb23a03a5, 0x53fb2b56), TOBN(0x6ce141e7, 0x4e057f78), + TOBN(0x796525c3, 0x89e490d9), TOBN(0x0bc95725, 0xa31a7e75)}, + {TOBN(0x1ec56791, 0x1220fd06), TOBN(0x716e3a3c, 0x408b0bd6), + TOBN(0x31cd6bf7, 0xe8ebeba9), TOBN(0xa7326ca6, 0xbee6b670)}}, + {{TOBN(0x3d9f851c, 0xcd090c43), TOBN(0x561e8f13, 0xf12c3988), + TOBN(0x50490b6a, 0x904b7be4), TOBN(0x61690ce1, 0x0410737b)}, + {TOBN(0x299e9a37, 0x0f009052), TOBN(0x258758f0, 0xf026092e), + TOBN(0x9fa255f3, 0xfdfcdc0f), TOBN(0xdbc9fb1f, 0xc0e1bcd2)}}, + {{TOBN(0x35f9dd6e, 0x24651840), TOBN(0xdca45a84, 0xa5c59abc), + TOBN(0x103d396f, 0xecca4938), TOBN(0x4532da0a, 0xb97b3f29)}, + {TOBN(0xc4135ea5, 0x1999a6bf), TOBN(0x3aa9505a, 0x5e6bf2ee), + TOBN(0xf77cef06, 0x3f5be093), TOBN(0x97d1a0f8, 0xa943152e)}}, + {{TOBN(0x2cb0ebba, 0x2e1c21dd), TOBN(0xf41b29fc, 0x2c6797c4), + TOBN(0xc6e17321, 0xb300101f), TOBN(0x4422b0e9, 0xd0d79a89)}, + {TOBN(0x49e4901c, 0x92f1bfc4), TOBN(0x06ab1f8f, 0xe1e10ed9), + TOBN(0x84d35577, 0xdb2926b8), TOBN(0xca349d39, 0x356e8ec2)}}, + {{TOBN(0x70b63d32, 0x343bf1a9), TOBN(0x8fd3bd28, 0x37d1a6b1), + TOBN(0x0454879c, 0x316865b4), TOBN(0xee959ff6, 0xc458efa2)}, + {TOBN(0x0461dcf8, 0x9706dc3f), TOBN(0x737db0e2, 0x164e4b2e), + TOBN(0x09262680, 0x2f8843c8), TOBN(0x54498bbc, 0x7745e6f6)}}, + {{TOBN(0x359473fa, 0xa29e24af), TOBN(0xfcc3c454, 0x70aa87a1), + TOBN(0xfd2c4bf5, 0x00573ace), TOBN(0xb65b514e, 0x28dd1965)}, + {TOBN(0xe46ae7cf, 0x2193e393), TOBN(0x60e9a4e1, 0xf5444d97), + TOBN(0xe7594e96, 0x00ff38ed), TOBN(0x43d84d2f, 0x0a0e0f02)}}, + {{TOBN(0x8b6db141, 0xee398a21), TOBN(0xb88a56ae, 0xe3bcc5be), + TOBN(0x0a1aa52f, 0x373460ea), TOBN(0x20da1a56, 0x160bb19b)}, + {TOBN(0xfb54999d, 0x65bf0384), TOBN(0x71a14d24, 0x5d5a180e), + TOBN(0xbc44db7b, 0x21737b04), TOBN(0xd84fcb18, 0x01dd8e92)}}, + {{TOBN(0x80de937b, 
0xfa44b479), TOBN(0x53505499, 0x5c98fd4f), + TOBN(0x1edb12ab, 0x28f08727), TOBN(0x4c58b582, 0xa5f3ef53)}, + {TOBN(0xbfb236d8, 0x8327f246), TOBN(0xc3a3bfaa, 0x4d7df320), + TOBN(0xecd96c59, 0xb96024f2), TOBN(0xfc293a53, 0x7f4e0433)}}, + {{TOBN(0x5341352b, 0x5acf6e10), TOBN(0xc50343fd, 0xafe652c3), + TOBN(0x4af3792d, 0x18577a7f), TOBN(0xe1a4c617, 0xaf16823d)}, + {TOBN(0x9b26d0cd, 0x33425d0a), TOBN(0x306399ed, 0x9b7bc47f), + TOBN(0x2a792f33, 0x706bb20b), TOBN(0x31219614, 0x98111055)}}, + {{TOBN(0x864ec064, 0x87f5d28b), TOBN(0x11392d91, 0x962277fd), + TOBN(0xb5aa7942, 0xbb6aed5f), TOBN(0x080094dc, 0x47e799d9)}, + {TOBN(0x4afa588c, 0x208ba19b), TOBN(0xd3e7570f, 0x8512f284), + TOBN(0xcbae64e6, 0x02f5799a), TOBN(0xdeebe7ef, 0x514b9492)}}, + {{TOBN(0x30300f98, 0xe5c298ff), TOBN(0x17f561be, 0x3678361f), + TOBN(0xf52ff312, 0x98cb9a16), TOBN(0x6233c3bc, 0x5562d490)}, + {TOBN(0x7bfa15a1, 0x92e3a2cb), TOBN(0x961bcfd1, 0xe6365119), + TOBN(0x3bdd29bf, 0x2c8c53b1), TOBN(0x739704df, 0x822844ba)}}, + {{TOBN(0x7dacfb58, 0x7e7b754b), TOBN(0x23360791, 0xa806c9b9), + TOBN(0xe7eb88c9, 0x23504452), TOBN(0x2983e996, 0x852c1783)}, + {TOBN(0xdd4ae529, 0x958d881d), TOBN(0x026bae03, 0x262c7b3c), + TOBN(0x3a6f9193, 0x960b52d1), TOBN(0xd0980f90, 0x92696cfb)}}, + {{TOBN(0x4c1f428c, 0xd5f30851), TOBN(0x94dfed27, 0x2a4f6630), + TOBN(0x4df53772, 0xfc5d48a4), TOBN(0xdd2d5a2f, 0x933260ce)}, + {TOBN(0x574115bd, 0xd44cc7a5), TOBN(0x4ba6b20d, 0xbd12533a), + TOBN(0x30e93cb8, 0x243057c9), TOBN(0x794c486a, 0x14de320e)}}, + {{TOBN(0xe925d4ce, 0xf21496e4), TOBN(0xf951d198, 0xec696331), + TOBN(0x9810e2de, 0x3e8d812f), TOBN(0xd0a47259, 0x389294ab)}, + {TOBN(0x513ba2b5, 0x0e3bab66), TOBN(0x462caff5, 0xabad306f), + TOBN(0xe2dc6d59, 0xaf04c49e), TOBN(0x1aeb8750, 0xe0b84b0b)}}, + {{TOBN(0xc034f12f, 0x2f7d0ca2), TOBN(0x6d2e8128, 0xe06acf2f), + TOBN(0x801f4f83, 0x21facc2f), TOBN(0xa1170c03, 0xf40ef607)}, + {TOBN(0xfe0a1d4f, 0x7805a99c), TOBN(0xbde56a36, 0xcc26aba5), + TOBN(0x5b1629d0, 0x35531f40), TOBN(0xac212c2b, 0x9afa6108)}}, + {{TOBN(0x30a06bf3, 0x15697be5), TOBN(0x6f0545dc, 0x2c63c7c1), + TOBN(0x5d8cb842, 0x7ccdadaf), TOBN(0xd52e379b, 0xac7015bb)}, + {TOBN(0xc4f56147, 0xf462c23e), TOBN(0xd44a4298, 0x46bc24b0), + TOBN(0xbc73d23a, 0xe2856d4f), TOBN(0x61cedd8c, 0x0832bcdf)}}, + {{TOBN(0x60953556, 0x99f241d7), TOBN(0xee4adbd7, 0x001a349d), + TOBN(0x0b35bf6a, 0xaa89e491), TOBN(0x7f0076f4, 0x136f7546)}, + {TOBN(0xd19a18ba, 0x9264da3d), TOBN(0x6eb2d2cd, 0x62a7a28b), + TOBN(0xcdba941f, 0x8761c971), TOBN(0x1550518b, 0xa3be4a5d)}}, + {{TOBN(0xd0e8e2f0, 0x57d0b70c), TOBN(0xeea8612e, 0xcd133ba3), + TOBN(0x814670f0, 0x44416aec), TOBN(0x424db6c3, 0x30775061)}, + {TOBN(0xd96039d1, 0x16213fd1), TOBN(0xc61e7fa5, 0x18a3478f), + TOBN(0xa805bdcc, 0xcb0c5021), TOBN(0xbdd6f3a8, 0x0cc616dd)}}, + {{TOBN(0x06009667, 0x5d97f7e2), TOBN(0x31db0fc1, 0xaf0bf4b6), + TOBN(0x23680ed4, 0x5491627a), TOBN(0xb99a3c66, 0x7d741fb1)}, + {TOBN(0xe9bb5f55, 0x36b1ff92), TOBN(0x29738577, 0x512b388d), + TOBN(0xdb8a2ce7, 0x50fcf263), TOBN(0x385346d4, 0x6c4f7b47)}}, + {{TOBN(0xbe86c5ef, 0x31631f9e), TOBN(0xbf91da21, 0x03a57a29), + TOBN(0xc3b1f796, 0x7b23f821), TOBN(0x0f7d00d2, 0x770db354)}, + {TOBN(0x8ffc6c3b, 0xd8fe79da), TOBN(0xcc5e8c40, 0xd525c996), + TOBN(0x4640991d, 0xcfff632a), TOBN(0x64d97e8c, 0x67112528)}}, + {{TOBN(0xc232d973, 0x02f1cd1e), TOBN(0xce87eacb, 0x1dd212a4), + TOBN(0x6e4c8c73, 0xe69802f7), TOBN(0x12ef0290, 0x1fffddbd)}, + {TOBN(0x941ec74e, 0x1bcea6e2), TOBN(0xd0b54024, 0x3cb92cbb), + TOBN(0x809fb9d4, 0x7e8f9d05), TOBN(0x3bf16159, 0xf2992aae)}}, + 
{{TOBN(0xad40f279, 0xf8a7a838), TOBN(0x11aea631, 0x05615660), + TOBN(0xbf52e6f1, 0xa01f6fa1), TOBN(0xef046995, 0x3dc2aec9)}, + {TOBN(0x785dbec9, 0xd8080711), TOBN(0xe1aec60a, 0x9fdedf76), + TOBN(0xece797b5, 0xfa21c126), TOBN(0xc66e898f, 0x05e52732)}}, + {{TOBN(0x39bb69c4, 0x08811fdb), TOBN(0x8bfe1ef8, 0x2fc7f082), + TOBN(0xc8e7a393, 0x174f4138), TOBN(0xfba8ad1d, 0xd58d1f98)}, + {TOBN(0xbc21d0ce, 0xbfd2fd5b), TOBN(0x0b839a82, 0x6ee60d61), + TOBN(0xaacf7658, 0xafd22253), TOBN(0xb526bed8, 0xaae396b3)}}, + {{TOBN(0xccc1bbc2, 0x38564464), TOBN(0x9e3ff947, 0x8c45bc73), + TOBN(0xcde9bca3, 0x58188a78), TOBN(0x138b8ee0, 0xd73bf8f7)}, + {TOBN(0x5c7e234c, 0x4123c489), TOBN(0x66e69368, 0xfa643297), + TOBN(0x0629eeee, 0x39a15fa3), TOBN(0x95fab881, 0xa9e2a927)}}, + {{TOBN(0xb2497007, 0xeafbb1e1), TOBN(0xd75c9ce6, 0xe75b7a93), + TOBN(0x3558352d, 0xefb68d78), TOBN(0xa2f26699, 0x223f6396)}, + {TOBN(0xeb911ecf, 0xe469b17a), TOBN(0x62545779, 0xe72d3ec2), + TOBN(0x8ea47de7, 0x82cb113f), TOBN(0xebe4b086, 0x4e1fa98d)}}, + {{TOBN(0xec2d5ed7, 0x8cdfedb1), TOBN(0xa535c077, 0xfe211a74), + TOBN(0x9678109b, 0x11d244c5), TOBN(0xf17c8bfb, 0xbe299a76)}, + {TOBN(0xb651412e, 0xfb11fbc4), TOBN(0xea0b5482, 0x94ab3f65), + TOBN(0xd8dffd95, 0x0cf78243), TOBN(0x2e719e57, 0xce0361d4)}}, + {{TOBN(0x9007f085, 0x304ddc5b), TOBN(0x095e8c6d, 0x4daba2ea), + TOBN(0x5a33cdb4, 0x3f9d28a9), TOBN(0x85b95cd8, 0xe2283003)}, + {TOBN(0xbcd6c819, 0xb9744733), TOBN(0x29c5f538, 0xfc7f5783), + TOBN(0x6c49b2fa, 0xd59038e4), TOBN(0x68349cc1, 0x3bbe1018)}}, + {{TOBN(0xcc490c1d, 0x21830ee5), TOBN(0x36f9c4ee, 0xe9bfa297), + TOBN(0x58fd7294, 0x48de1a94), TOBN(0xaadb13a8, 0x4e8f2cdc)}, + {TOBN(0x515eaaa0, 0x81313dba), TOBN(0xc76bb468, 0xc2152dd8), + TOBN(0x357f8d75, 0xa653dbf8), TOBN(0xe4d8c4d1, 0xb14ac143)}}, + {{TOBN(0xbdb8e675, 0xb055cb40), TOBN(0x898f8e7b, 0x977b5167), + TOBN(0xecc65651, 0xb82fb863), TOBN(0x56544814, 0x6d88f01f)}, + {TOBN(0xb0928e95, 0x263a75a9), TOBN(0xcfb6836f, 0x1a22fcda), + TOBN(0x651d14db, 0x3f3bd37c), TOBN(0x1d3837fb, 0xb6ad4664)}}, + {{TOBN(0x7c5fb538, 0xff4f94ab), TOBN(0x7243c712, 0x6d7fb8f2), + TOBN(0xef13d60c, 0xa85c5287), TOBN(0x18cfb7c7, 0x4bb8dd1b)}, + {TOBN(0x82f9bfe6, 0x72908219), TOBN(0x35c4592b, 0x9d5144ab), + TOBN(0x52734f37, 0x9cf4b42f), TOBN(0x6bac55e7, 0x8c60ddc4)}}, + {{TOBN(0xb5cd811e, 0x94dea0f6), TOBN(0x259ecae4, 0xe18cc1a3), + TOBN(0x6a0e836e, 0x15e660f8), TOBN(0x6c639ea6, 0x0e02bff2)}, + {TOBN(0x8721b8cb, 0x7e1026fd), TOBN(0x9e73b50b, 0x63261942), + TOBN(0xb8c70974, 0x77f01da3), TOBN(0x1839e6a6, 0x8268f57f)}}, + {{TOBN(0x571b9415, 0x5150b805), TOBN(0x1892389e, 0xf92c7097), + TOBN(0x8d69c18e, 0x4a084b95), TOBN(0x7014c512, 0xbe5b495c)}, + {TOBN(0x4780db36, 0x1b07523c), TOBN(0x2f6219ce, 0x2c1c64fa), + TOBN(0xc38b81b0, 0x602c105a), TOBN(0xab4f4f20, 0x5dc8e360)}}, + {{TOBN(0x20d3c982, 0xcf7d62d2), TOBN(0x1f36e29d, 0x23ba8150), + TOBN(0x48ae0bf0, 0x92763f9e), TOBN(0x7a527e6b, 0x1d3a7007)}, + {TOBN(0xb4a89097, 0x581a85e3), TOBN(0x1f1a520f, 0xdc158be5), + TOBN(0xf98db37d, 0x167d726e), TOBN(0x8802786e, 0x1113e862)}}}, + {{{TOBN(0xefb2149e, 0x36f09ab0), TOBN(0x03f163ca, 0x4a10bb5b), + TOBN(0xd0297045, 0x06e20998), TOBN(0x56f0af00, 0x1b5a3bab)}, + {TOBN(0x7af4cfec, 0x70880e0d), TOBN(0x7332a66f, 0xbe3d913f), + TOBN(0x32e6c84a, 0x7eceb4bd), TOBN(0xedc4a79a, 0x9c228f55)}}, + {{TOBN(0xc37c7dd0, 0xc55c4496), TOBN(0xa6a96357, 0x25bbabd2), + TOBN(0x5b7e63f2, 0xadd7f363), TOBN(0x9dce3782, 0x2e73f1df)}, + {TOBN(0xe1e5a16a, 0xb2b91f71), TOBN(0xe4489823, 0x5ba0163c), + TOBN(0xf2759c32, 0xf6e515ad), TOBN(0xa5e2f1f8, 
0x8615eecf)}}, + {{TOBN(0x74519be7, 0xabded551), TOBN(0x03d358b8, 0xc8b74410), + TOBN(0x4d00b10b, 0x0e10d9a9), TOBN(0x6392b0b1, 0x28da52b7)}, + {TOBN(0x6744a298, 0x0b75c904), TOBN(0xc305b0ae, 0xa8f7f96c), + TOBN(0x042e421d, 0x182cf932), TOBN(0xf6fc5d50, 0x9e4636ca)}}, + {{TOBN(0x795847c9, 0xd64cc78c), TOBN(0x6c50621b, 0x9b6cb27b), + TOBN(0x07099bf8, 0xdf8022ab), TOBN(0x48f862eb, 0xc04eda1d)}, + {TOBN(0xd12732ed, 0xe1603c16), TOBN(0x19a80e0f, 0x5c9a9450), + TOBN(0xe2257f54, 0xb429b4fc), TOBN(0x66d3b2c6, 0x45460515)}}, + {{TOBN(0x6ca4f87e, 0x822e37be), TOBN(0x73f237b4, 0x253bda4e), + TOBN(0xf747f3a2, 0x41190aeb), TOBN(0xf06fa36f, 0x804cf284)}, + {TOBN(0x0a6bbb6e, 0xfc621c12), TOBN(0x5d624b64, 0x40b80ec6), + TOBN(0x4b072425, 0x7ba556f3), TOBN(0x7fa0c354, 0x3e2d20a8)}}, + {{TOBN(0xe921fa31, 0xe3229d41), TOBN(0xa929c652, 0x94531bd4), + TOBN(0x84156027, 0xa6d38209), TOBN(0xf3d69f73, 0x6bdb97bd)}, + {TOBN(0x8906d19a, 0x16833631), TOBN(0x68a34c2e, 0x03d51be3), + TOBN(0xcb59583b, 0x0e511cd8), TOBN(0x99ce6bfd, 0xfdc132a8)}}, + {{TOBN(0x3facdaaa, 0xffcdb463), TOBN(0x658bbc1a, 0x34a38b08), + TOBN(0x12a801f8, 0xf1a9078d), TOBN(0x1567bcf9, 0x6ab855de)}, + {TOBN(0xe08498e0, 0x3572359b), TOBN(0xcf0353e5, 0x8659e68b), + TOBN(0xbb86e9c8, 0x7d23807c), TOBN(0xbc08728d, 0x2198e8a2)}}, + {{TOBN(0x8de2b7bc, 0x453cadd6), TOBN(0x203900a7, 0xbc0bc1f8), + TOBN(0xbcd86e47, 0xa6abd3af), TOBN(0x911cac12, 0x8502effb)}, + {TOBN(0x2d550242, 0xec965469), TOBN(0x0e9f7692, 0x29e0017e), + TOBN(0x633f078f, 0x65979885), TOBN(0xfb87d449, 0x4cf751ef)}}, + {{TOBN(0xe1790e4b, 0xfc25419a), TOBN(0x36467203, 0x4bff3cfd), + TOBN(0xc8db6386, 0x25b6e83f), TOBN(0x6cc69f23, 0x6cad6fd2)}, + {TOBN(0x0219e45a, 0x6bc68bb9), TOBN(0xe43d79b6, 0x297f7334), + TOBN(0x7d445368, 0x465dc97c), TOBN(0x4b9eea32, 0x2a0b949a)}}, + {{TOBN(0x1b96c6ba, 0x6102d021), TOBN(0xeaafac78, 0x2f4461ea), + TOBN(0xd4b85c41, 0xc49f19a8), TOBN(0x275c28e4, 0xcf538875)}, + {TOBN(0x35451a9d, 0xdd2e54e0), TOBN(0x6991adb5, 0x0605618b), + TOBN(0x5b8b4bcd, 0x7b36cd24), TOBN(0x372a4f8c, 0x56f37216)}}, + {{TOBN(0xc890bd73, 0xa6a5da60), TOBN(0x6f083da0, 0xdc4c9ff0), + TOBN(0xf4e14d94, 0xf0536e57), TOBN(0xf9ee1eda, 0xaaec8243)}, + {TOBN(0x571241ec, 0x8bdcf8e7), TOBN(0xa5db8271, 0x0b041e26), + TOBN(0x9a0b9a99, 0xe3fff040), TOBN(0xcaaf21dd, 0x7c271202)}}, + {{TOBN(0xb4e2b2e1, 0x4f0dd2e8), TOBN(0xe77e7c4f, 0x0a377ac7), + TOBN(0x69202c3f, 0x0d7a2198), TOBN(0xf759b7ff, 0x28200eb8)}, + {TOBN(0xc87526ed, 0xdcfe314e), TOBN(0xeb84c524, 0x53d5cf99), + TOBN(0xb1b52ace, 0x515138b6), TOBN(0x5aa7ff8c, 0x23fca3f4)}}, + {{TOBN(0xff0b13c3, 0xb9791a26), TOBN(0x960022da, 0xcdd58b16), + TOBN(0xdbd55c92, 0x57aad2de), TOBN(0x3baaaaa3, 0xf30fe619)}, + {TOBN(0x9a4b2346, 0x0d881efd), TOBN(0x506416c0, 0x46325e2a), + TOBN(0x91381e76, 0x035c18d4), TOBN(0xb3bb68be, 0xf27817b0)}}, + {{TOBN(0x15bfb8bf, 0x5116f937), TOBN(0x7c64a586, 0xc1268943), + TOBN(0x71e25cc3, 0x8419a2c8), TOBN(0x9fd6b0c4, 0x8335f463)}, + {TOBN(0x4bf0ba3c, 0xe8ee0e0e), TOBN(0x6f6fba60, 0x298c21fa), + TOBN(0x57d57b39, 0xae66bee0), TOBN(0x292d5130, 0x22672544)}}, + {{TOBN(0xf451105d, 0xbab093b3), TOBN(0x012f59b9, 0x02839986), + TOBN(0x8a915802, 0x3474a89c), TOBN(0x048c919c, 0x2de03e97)}, + {TOBN(0xc476a2b5, 0x91071cd5), TOBN(0x791ed89a, 0x034970a5), + TOBN(0x89bd9042, 0xe1b7994b), TOBN(0x8eaf5179, 0xa1057ffd)}}, + {{TOBN(0x6066e2a2, 0xd551ee10), TOBN(0x87a8f1d8, 0x727e09a6), + TOBN(0x00d08bab, 0x2c01148d), TOBN(0x6da8e4f1, 0x424f33fe)}, + {TOBN(0x466d17f0, 0xcf9a4e71), TOBN(0xff502010, 0x3bf5cb19), + TOBN(0xdccf97d8, 0xd062ecc0), 
TOBN(0x80c0d9af, 0x81d80ac4)}}, + {{TOBN(0xe87771d8, 0x033f2876), TOBN(0xb0186ec6, 0x7d5cc3db), + TOBN(0x58e8bb80, 0x3bc9bc1d), TOBN(0x4d1395cc, 0x6f6ef60e)}, + {TOBN(0xa73c62d6, 0x186244a0), TOBN(0x918e5f23, 0x110a5b53), + TOBN(0xed4878ca, 0x741b7eab), TOBN(0x3038d71a, 0xdbe03e51)}}, + {{TOBN(0x840204b7, 0xa93c3246), TOBN(0x21ab6069, 0xa0b9b4cd), + TOBN(0xf5fa6e2b, 0xb1d64218), TOBN(0x1de6ad0e, 0xf3d56191)}, + {TOBN(0x570aaa88, 0xff1929c7), TOBN(0xc6df4c6b, 0x640e87b5), + TOBN(0xde8a74f2, 0xc65f0ccc), TOBN(0x8b972fd5, 0xe6f6cc01)}}, + {{TOBN(0x3fff36b6, 0x0b846531), TOBN(0xba7e45e6, 0x10a5e475), + TOBN(0x84a1d10e, 0x4145b6c5), TOBN(0xf1f7f91a, 0x5e046d9d)}, + {TOBN(0x0317a692, 0x44de90d7), TOBN(0x951a1d4a, 0xf199c15e), + TOBN(0x91f78046, 0xc9d73deb), TOBN(0x74c82828, 0xfab8224f)}}, + {{TOBN(0xaa6778fc, 0xe7560b90), TOBN(0xb4073e61, 0xa7e824ce), + TOBN(0xff0d693c, 0xd642eba8), TOBN(0x7ce2e57a, 0x5dccef38)}, + {TOBN(0x89c2c789, 0x1df1ad46), TOBN(0x83a06922, 0x098346fd), + TOBN(0x2d715d72, 0xda2fc177), TOBN(0x7b6dd71d, 0x85b6cf1d)}}, + {{TOBN(0xc60a6d0a, 0x73fa9cb0), TOBN(0xedd3992e, 0x328bf5a9), + TOBN(0xc380ddd0, 0x832c8c82), TOBN(0xd182d410, 0xa2a0bf50)}, + {TOBN(0x7d9d7438, 0xd9a528db), TOBN(0xe8b1a0e9, 0xcaf53994), + TOBN(0xddd6e5fe, 0x0e19987c), TOBN(0xacb8df03, 0x190b059d)}}, + {{TOBN(0x53703a32, 0x8300129f), TOBN(0x1f637662, 0x68c43bfd), + TOBN(0xbcbd1913, 0x00e54051), TOBN(0x812fcc62, 0x7bf5a8c5)}, + {TOBN(0x3f969d5f, 0x29fb85da), TOBN(0x72f4e00a, 0x694759e8), + TOBN(0x426b6e52, 0x790726b7), TOBN(0x617bbc87, 0x3bdbb209)}}, + {{TOBN(0x511f8bb9, 0x97aee317), TOBN(0x812a4096, 0xe81536a8), + TOBN(0x137dfe59, 0x3ac09b9b), TOBN(0x0682238f, 0xba8c9a7a)}, + {TOBN(0x7072ead6, 0xaeccb4bd), TOBN(0x6a34e9aa, 0x692ba633), + TOBN(0xc82eaec2, 0x6fff9d33), TOBN(0xfb753512, 0x1d4d2b62)}}, + {{TOBN(0x1a0445ff, 0x1d7aadab), TOBN(0x65d38260, 0xd5f6a67c), + TOBN(0x6e62fb08, 0x91cfb26f), TOBN(0xef1e0fa5, 0x5c7d91d6)}, + {TOBN(0x47e7c7ba, 0x33db72cd), TOBN(0x017cbc09, 0xfa7c74b2), + TOBN(0x3c931590, 0xf50a503c), TOBN(0xcac54f60, 0x616baa42)}}, + {{TOBN(0x9b6cd380, 0xb2369f0f), TOBN(0x97d3a70d, 0x23c76151), + TOBN(0x5f9dd6fc, 0x9862a9c6), TOBN(0x044c4ab2, 0x12312f51)}, + {TOBN(0x035ea0fd, 0x834a2ddc), TOBN(0x49e6b862, 0xcc7b826d), + TOBN(0xb03d6883, 0x62fce490), TOBN(0x62f2497a, 0xb37e36e9)}}, + {{TOBN(0x04b005b6, 0xc6458293), TOBN(0x36bb5276, 0xe8d10af7), + TOBN(0xacf2dc13, 0x8ee617b8), TOBN(0x470d2d35, 0xb004b3d4)}, + {TOBN(0x06790832, 0xfeeb1b77), TOBN(0x2bb75c39, 0x85657f9c), + TOBN(0xd70bd4ed, 0xc0f60004), TOBN(0xfe797ecc, 0x219b018b)}}, + {{TOBN(0x9b5bec2a, 0x753aebcc), TOBN(0xdaf9f3dc, 0xc939eca5), + TOBN(0xd6bc6833, 0xd095ad09), TOBN(0x98abdd51, 0xdaa4d2fc)}, + {TOBN(0xd9840a31, 0x8d168be5), TOBN(0xcf7c10e0, 0x2325a23c), + TOBN(0xa5c02aa0, 0x7e6ecfaf), TOBN(0x2462e7e6, 0xb5bfdf18)}}, + {{TOBN(0xab2d8a8b, 0xa0cc3f12), TOBN(0x68dd485d, 0xbc672a29), + TOBN(0x72039752, 0x596f2cd3), TOBN(0x5d3eea67, 0xa0cf3d8d)}, + {TOBN(0x810a1a81, 0xe6602671), TOBN(0x8f144a40, 0x14026c0c), + TOBN(0xbc753a6d, 0x76b50f85), TOBN(0xc4dc21e8, 0x645cd4a4)}}, + {{TOBN(0xc5262dea, 0x521d0378), TOBN(0x802b8e0e, 0x05011c6f), + TOBN(0x1ba19cbb, 0x0b4c19ea), TOBN(0x21db64b5, 0xebf0aaec)}, + {TOBN(0x1f394ee9, 0x70342f9d), TOBN(0x93a10aee, 0x1bc44a14), + TOBN(0xa7eed31b, 0x3efd0baa), TOBN(0x6e7c824e, 0x1d154e65)}}, + {{TOBN(0xee23fa81, 0x9966e7ee), TOBN(0x64ec4aa8, 0x05b7920d), + TOBN(0x2d44462d, 0x2d90aad4), TOBN(0xf44dd195, 0xdf277ad5)}, + {TOBN(0x8d6471f1, 0xbb46b6a1), TOBN(0x1e65d313, 0xfd885090), + TOBN(0x33a800f5, 
0x13a977b4), TOBN(0xaca9d721, 0x0797e1ef)}}, + {{TOBN(0x9a5a85a0, 0xfcff6a17), TOBN(0x9970a3f3, 0x1eca7cee), + TOBN(0xbb9f0d6b, 0xc9504be3), TOBN(0xe0c504be, 0xadd24ee2)}, + {TOBN(0x7e09d956, 0x77fcc2f4), TOBN(0xef1a5227, 0x65bb5fc4), + TOBN(0x145d4fb1, 0x8b9286aa), TOBN(0x66fd0c5d, 0x6649028b)}}, + {{TOBN(0x98857ceb, 0x1bf4581c), TOBN(0xe635e186, 0xaca7b166), + TOBN(0x278ddd22, 0x659722ac), TOBN(0xa0903c4c, 0x1db68007)}, + {TOBN(0x366e4589, 0x48f21402), TOBN(0x31b49c14, 0xb96abda2), + TOBN(0x329c4b09, 0xe0403190), TOBN(0x97197ca3, 0xd29f43fe)}}, + {{TOBN(0x8073dd1e, 0x274983d8), TOBN(0xda1a3bde, 0x55717c8f), + TOBN(0xfd3d4da2, 0x0361f9d1), TOBN(0x1332d081, 0x4c7de1ce)}, + {TOBN(0x9b7ef7a3, 0xaa6d0e10), TOBN(0x17db2e73, 0xf54f1c4a), + TOBN(0xaf3dffae, 0x4cd35567), TOBN(0xaaa2f406, 0xe56f4e71)}}, + {{TOBN(0x8966759e, 0x7ace3fc7), TOBN(0x9594eacf, 0x45a8d8c6), + TOBN(0x8de3bd8b, 0x91834e0e), TOBN(0xafe4ca53, 0x548c0421)}, + {TOBN(0xfdd7e856, 0xe6ee81c6), TOBN(0x8f671beb, 0x6b891a3a), + TOBN(0xf7a58f2b, 0xfae63829), TOBN(0x9ab186fb, 0x9c11ac9f)}}, + {{TOBN(0x8d6eb369, 0x10b5be76), TOBN(0x046b7739, 0xfb040bcd), + TOBN(0xccb4529f, 0xcb73de88), TOBN(0x1df0fefc, 0xcf26be03)}, + {TOBN(0xad7757a6, 0xbcfcd027), TOBN(0xa8786c75, 0xbb3165ca), + TOBN(0xe9db1e34, 0x7e99a4d9), TOBN(0x99ee86df, 0xb06c504b)}}, + {{TOBN(0x5b7c2ddd, 0xc15c9f0a), TOBN(0xdf87a734, 0x4295989e), + TOBN(0x59ece47c, 0x03d08fda), TOBN(0xb074d3dd, 0xad5fc702)}, + {TOBN(0x20407903, 0x51a03776), TOBN(0x2bb1f77b, 0x2a608007), + TOBN(0x25c58f4f, 0xe1153185), TOBN(0xe6df62f6, 0x766e6447)}}, + {{TOBN(0xefb3d1be, 0xed51275a), TOBN(0x5de47dc7, 0x2f0f483f), + TOBN(0x7932d98e, 0x97c2bedf), TOBN(0xd5c11927, 0x0219f8a1)}, + {TOBN(0x9d751200, 0xa73a294e), TOBN(0x5f88434a, 0x9dc20172), + TOBN(0xd28d9fd3, 0xa26f506a), TOBN(0xa890cd31, 0x9d1dcd48)}}, + {{TOBN(0x0aebaec1, 0x70f4d3b4), TOBN(0xfd1a1369, 0x0ffc8d00), + TOBN(0xb9d9c240, 0x57d57838), TOBN(0x45929d26, 0x68bac361)}, + {TOBN(0x5a2cd060, 0x25b15ca6), TOBN(0x4b3c83e1, 0x6e474446), + TOBN(0x1aac7578, 0xee1e5134), TOBN(0xa418f5d6, 0xc91e2f41)}}, + {{TOBN(0x6936fc8a, 0x213ed68b), TOBN(0x860ae7ed, 0x510a5224), + TOBN(0x63660335, 0xdef09b53), TOBN(0x641b2897, 0xcd79c98d)}, + {TOBN(0x29bd38e1, 0x01110f35), TOBN(0x79c26f42, 0x648b1937), + TOBN(0x64dae519, 0x9d9164f4), TOBN(0xd85a2310, 0x0265c273)}}, + {{TOBN(0x7173dd5d, 0x4b07e2b1), TOBN(0xd144c4cb, 0x8d9ea221), + TOBN(0xe8b04ea4, 0x1105ab14), TOBN(0x92dda542, 0xfe80d8f1)}, + {TOBN(0xe9982fa8, 0xcf03dce6), TOBN(0x8b5ea965, 0x1a22cffc), + TOBN(0xf7f4ea7f, 0x3fad88c4), TOBN(0x62db773e, 0x6a5ba95c)}}, + {{TOBN(0xd20f02fb, 0x93f24567), TOBN(0xfd46c69a, 0x315257ca), + TOBN(0x0ac74cc7, 0x8bcab987), TOBN(0x46f31c01, 0x5ceca2f5)}, + {TOBN(0x40aedb59, 0x888b219e), TOBN(0xe50ecc37, 0xe1fccd02), + TOBN(0x1bcd9dad, 0x911f816c), TOBN(0x583cc1ec, 0x8db9b00c)}}, + {{TOBN(0xf3cd2e66, 0xa483bf11), TOBN(0xfa08a6f5, 0xb1b2c169), + TOBN(0xf375e245, 0x4be9fa28), TOBN(0x99a7ffec, 0x5b6d011f)}, + {TOBN(0x6a3ebddb, 0xc4ae62da), TOBN(0x6cea00ae, 0x374aef5d), + TOBN(0xab5fb98d, 0x9d4d05bc), TOBN(0x7cba1423, 0xd560f252)}}, + {{TOBN(0x49b2cc21, 0x208490de), TOBN(0x1ca66ec3, 0xbcfb2879), + TOBN(0x7f1166b7, 0x1b6fb16f), TOBN(0xfff63e08, 0x65fe5db3)}, + {TOBN(0xb8345abe, 0x8b2610be), TOBN(0xb732ed80, 0x39de3df4), + TOBN(0x0e24ed50, 0x211c32b4), TOBN(0xd10d8a69, 0x848ff27d)}}, + {{TOBN(0xc1074398, 0xed4de248), TOBN(0xd7cedace, 0x10488927), + TOBN(0xa4aa6bf8, 0x85673e13), TOBN(0xb46bae91, 0x6daf30af)}, + {TOBN(0x07088472, 0xfcef7ad8), TOBN(0x61151608, 0xd4b35e97), + 
TOBN(0xbcfe8f26, 0xdde29986), TOBN(0xeb84c4c7, 0xd5a34c79)}}, + {{TOBN(0xc1eec55c, 0x164e1214), TOBN(0x891be86d, 0xa147bb03), + TOBN(0x9fab4d10, 0x0ba96835), TOBN(0xbf01e9b8, 0xa5c1ae9f)}, + {TOBN(0x6b4de139, 0xb186ebc0), TOBN(0xd5c74c26, 0x85b91bca), + TOBN(0x5086a99c, 0xc2d93854), TOBN(0xeed62a7b, 0xa7a9dfbc)}}, + {{TOBN(0x8778ed6f, 0x76b7618a), TOBN(0xbff750a5, 0x03b66062), + TOBN(0x4cb7be22, 0xb65186db), TOBN(0x369dfbf0, 0xcc3a6d13)}, + {TOBN(0xc7dab26c, 0x7191a321), TOBN(0x9edac3f9, 0x40ed718e), + TOBN(0xbc142b36, 0xd0cfd183), TOBN(0xc8af82f6, 0x7c991693)}}, + {{TOBN(0xb3d1e4d8, 0x97ce0b2a), TOBN(0xe6d7c87f, 0xc3a55cdf), + TOBN(0x35846b95, 0x68b81afe), TOBN(0x018d12af, 0xd3c239d8)}, + {TOBN(0x2b2c6208, 0x01206e15), TOBN(0xe0e42453, 0xa3b882c6), + TOBN(0x854470a3, 0xa50162d5), TOBN(0x08157478, 0x7017a62a)}}, + {{TOBN(0x18bd3fb4, 0x820357c7), TOBN(0x992039ae, 0x6f1458ad), + TOBN(0x9a1df3c5, 0x25b44aa1), TOBN(0x2d780357, 0xed3d5281)}, + {TOBN(0x58cf7e4d, 0xc77ad4d4), TOBN(0xd49a7998, 0xf9df4fc4), + TOBN(0x4465a8b5, 0x1d71205e), TOBN(0xa0ee0ea6, 0x649254aa)}}, + {{TOBN(0x4b5eeecf, 0xab7bd771), TOBN(0x6c873073, 0x35c262b9), + TOBN(0xdc5bd648, 0x3c9d61e7), TOBN(0x233d6d54, 0x321460d2)}, + {TOBN(0xd20c5626, 0xfc195bcc), TOBN(0x25445958, 0x04d78b63), + TOBN(0xe03fcb3d, 0x17ec8ef3), TOBN(0x54b690d1, 0x46b8f781)}}, + {{TOBN(0x82fa2c8a, 0x21230646), TOBN(0xf51aabb9, 0x084f418c), + TOBN(0xff4fbec1, 0x1a30ba43), TOBN(0x6a5acf73, 0x743c9df7)}, + {TOBN(0x1da2b357, 0xd635b4d5), TOBN(0xc3de68dd, 0xecd5c1da), + TOBN(0xa689080b, 0xd61af0dd), TOBN(0xdea5938a, 0xd665bf99)}}, + {{TOBN(0x0231d71a, 0xfe637294), TOBN(0x01968aa6, 0xa5a81cd8), + TOBN(0x11252d50, 0x048e63b5), TOBN(0xc446bc52, 0x6ca007e9)}, + {TOBN(0xef8c50a6, 0x96d6134b), TOBN(0x9361fbf5, 0x9e09a05c), + TOBN(0xf17f85a6, 0xdca3291a), TOBN(0xb178d548, 0xff251a21)}}, + {{TOBN(0x87f6374b, 0xa4df3915), TOBN(0x566ce1bf, 0x2fd5d608), + TOBN(0x425cba4d, 0x7de35102), TOBN(0x6b745f8f, 0x58c5d5e2)}, + {TOBN(0x88402af6, 0x63122edf), TOBN(0x3190f9ed, 0x3b989a89), + TOBN(0x4ad3d387, 0xebba3156), TOBN(0xef385ad9, 0xc7c469a5)}}, + {{TOBN(0xb08281de, 0x3f642c29), TOBN(0x20be0888, 0x910ffb88), + TOBN(0xf353dd4a, 0xd5292546), TOBN(0x3f1627de, 0x8377a262)}, + {TOBN(0xa5faa013, 0xeefcd638), TOBN(0x8f3bf626, 0x74cc77c3), + TOBN(0x32618f65, 0xa348f55e), TOBN(0x5787c0dc, 0x9fefeb9e)}}, + {{TOBN(0xf1673aa2, 0xd9a23e44), TOBN(0x88dfa993, 0x4e10690d), + TOBN(0x1ced1b36, 0x2bf91108), TOBN(0x9193ceca, 0x3af48649)}, + {TOBN(0xfb34327d, 0x2d738fc5), TOBN(0x6697b037, 0x975fee6c), + TOBN(0x2f485da0, 0xc04079a5), TOBN(0x2cdf5735, 0x2feaa1ac)}}, + {{TOBN(0x76944420, 0xbd55659e), TOBN(0x7973e32b, 0x4376090c), + TOBN(0x86bb4fe1, 0x163b591a), TOBN(0x10441aed, 0xc196f0ca)}, + {TOBN(0x3b431f4a, 0x045ad915), TOBN(0x6c11b437, 0xa4afacb1), + TOBN(0x30b0c7db, 0x71fdbbd8), TOBN(0xb642931f, 0xeda65acd)}}, + {{TOBN(0x4baae6e8, 0x9c92b235), TOBN(0xa73bbd0e, 0x6b3993a1), + TOBN(0xd06d60ec, 0x693dd031), TOBN(0x03cab91b, 0x7156881c)}, + {TOBN(0xd615862f, 0x1db3574b), TOBN(0x485b0185, 0x64bb061a), + TOBN(0x27434988, 0xa0181e06), TOBN(0x2cd61ad4, 0xc1c0c757)}}, + {{TOBN(0x3effed5a, 0x2ff9f403), TOBN(0x8dc98d8b, 0x62239029), + TOBN(0x2206021e, 0x1f17b70d), TOBN(0xafbec0ca, 0xbf510015)}, + {TOBN(0x9fed7164, 0x80130dfa), TOBN(0x306dc2b5, 0x8a02dcf5), + TOBN(0x48f06620, 0xfeb10fc0), TOBN(0x78d1e1d5, 0x5a57cf51)}}, + {{TOBN(0xadef8c5a, 0x192ef710), TOBN(0x88afbd4b, 0x3b7431f9), + TOBN(0x7e1f7407, 0x64250c9e), TOBN(0x6e31318d, 0xb58bec07)}, + {TOBN(0xfd4fc4b8, 0x24f89b4e), TOBN(0x65a5dd88, 0x48c36a2a), 
+ TOBN(0x4f1eccff, 0xf024baa7), TOBN(0x22a21cf2, 0xcba94650)}}, + {{TOBN(0x95d29dee, 0x42a554f7), TOBN(0x828983a5, 0x002ec4ba), + TOBN(0x8112a1f7, 0x8badb73d), TOBN(0x79ea8897, 0xa27c1839)}, + {TOBN(0x8969a5a7, 0xd065fd83), TOBN(0xf49af791, 0xb262a0bc), + TOBN(0xfcdea8b6, 0xaf2b5127), TOBN(0x10e913e1, 0x564c2dbc)}}, + {{TOBN(0x51239d14, 0xbc21ef51), TOBN(0xe51c3ceb, 0x4ce57292), + TOBN(0x795ff068, 0x47bbcc3b), TOBN(0x86b46e1e, 0xbd7e11e6)}, + {TOBN(0x0ea6ba23, 0x80041ef4), TOBN(0xd72fe505, 0x6262342e), + TOBN(0x8abc6dfd, 0x31d294d4), TOBN(0xbbe017a2, 0x1278c2c9)}}, + {{TOBN(0xb1fcfa09, 0xb389328a), TOBN(0x322fbc62, 0xd01771b5), + TOBN(0x04c0d063, 0x60b045bf), TOBN(0xdb652edc, 0x10e52d01)}, + {TOBN(0x50ef932c, 0x03ec6627), TOBN(0xde1b3b2d, 0xc1ee50e3), + TOBN(0x5ab7bdc5, 0xdc37a90d), TOBN(0xfea67213, 0x31e33a96)}}, + {{TOBN(0x6482b5cb, 0x4f2999aa), TOBN(0x38476cc6, 0xb8cbf0dd), + TOBN(0x93ebfacb, 0x173405bb), TOBN(0x15cdafe7, 0xe52369ec)}, + {TOBN(0xd42d5ba4, 0xd935b7db), TOBN(0x648b6004, 0x1c99a4cd), + TOBN(0x785101bd, 0xa3b5545b), TOBN(0x4bf2c38a, 0x9dd67faf)}}, + {{TOBN(0xb1aadc63, 0x4442449c), TOBN(0xe0e9921a, 0x33ad4fb8), + TOBN(0x5c552313, 0xaa686d82), TOBN(0xdee635fa, 0x465d866c)}, + {TOBN(0xbc3c224a, 0x18ee6e8a), TOBN(0xeed748a6, 0xed42e02f), + TOBN(0xe70f930a, 0xd474cd08), TOBN(0x774ea6ec, 0xfff24adf)}}, + {{TOBN(0x03e2de1c, 0xf3480d4a), TOBN(0xf0d8edc7, 0xbc8acf1a), + TOBN(0xf23e3303, 0x68295a9c), TOBN(0xfadd5f68, 0xc546a97d)}, + {TOBN(0x895597ad, 0x96f8acb1), TOBN(0xbddd49d5, 0x671bdae2), + TOBN(0x16fcd528, 0x21dd43f4), TOBN(0xa5a45412, 0x6619141a)}}}, + {{{TOBN(0x8ce9b6bf, 0xc360e25a), TOBN(0xe6425195, 0x075a1a78), + TOBN(0x9dc756a8, 0x481732f4), TOBN(0x83c0440f, 0x5432b57a)}, + {TOBN(0xc670b3f1, 0xd720281f), TOBN(0x2205910e, 0xd135e051), + TOBN(0xded14b0e, 0xdb052be7), TOBN(0x697b3d27, 0xc568ea39)}}, + {{TOBN(0x2e599b9a, 0xfb3ff9ed), TOBN(0x28c2e0ab, 0x17f6515c), + TOBN(0x1cbee4fd, 0x474da449), TOBN(0x071279a4, 0x4f364452)}, + {TOBN(0x97abff66, 0x01fbe855), TOBN(0x3ee394e8, 0x5fda51c4), + TOBN(0x190385f6, 0x67597c0b), TOBN(0x6e9fccc6, 0xa27ee34b)}}, + {{TOBN(0x0b89de93, 0x14092ebb), TOBN(0xf17256bd, 0x428e240c), + TOBN(0xcf89a7f3, 0x93d2f064), TOBN(0x4f57841e, 0xe1ed3b14)}, + {TOBN(0x4ee14405, 0xe708d855), TOBN(0x856aae72, 0x03f1c3d0), + TOBN(0xc8e5424f, 0xbdd7eed5), TOBN(0x3333e4ef, 0x73ab4270)}}, + {{TOBN(0x3bc77ade, 0xdda492f8), TOBN(0xc11a3aea, 0x78297205), + TOBN(0x5e89a3e7, 0x34931b4c), TOBN(0x17512e2e, 0x9f5694bb)}, + {TOBN(0x5dc349f3, 0x177bf8b6), TOBN(0x232ea4ba, 0x08c7ff3e), + TOBN(0x9c4f9d16, 0xf511145d), TOBN(0xccf109a3, 0x33b379c3)}}, + {{TOBN(0xe75e7a88, 0xa1f25897), TOBN(0x7ac6961f, 0xa1b5d4d8), + TOBN(0xe3e10773, 0x08f3ed5c), TOBN(0x208a54ec, 0x0a892dfb)}, + {TOBN(0xbe826e19, 0x78660710), TOBN(0x0cf70a97, 0x237df2c8), + TOBN(0x418a7340, 0xed704da5), TOBN(0xa3eeb9a9, 0x08ca33fd)}}, + {{TOBN(0x49d96233, 0x169bca96), TOBN(0x04d286d4, 0x2da6aafb), + TOBN(0xc09606ec, 0xa0c2fa94), TOBN(0x8869d0d5, 0x23ff0fb3)}, + {TOBN(0xa99937e5, 0xd0150d65), TOBN(0xa92e2503, 0x240c14c9), + TOBN(0x656bf945, 0x108e2d49), TOBN(0x152a733a, 0xa2f59e2b)}}, + {{TOBN(0xb4323d58, 0x8434a920), TOBN(0xc0af8e93, 0x622103c5), + TOBN(0x667518ef, 0x938dbf9a), TOBN(0xa1843073, 0x83a9cdf2)}, + {TOBN(0x350a94aa, 0x5447ab80), TOBN(0xe5e5a325, 0xc75a3d61), + TOBN(0x74ba507f, 0x68411a9e), TOBN(0x10581fc1, 0x594f70c5)}}, + {{TOBN(0x60e28570, 0x80eb24a9), TOBN(0x7bedfb4d, 0x488e0cfd), + TOBN(0x721ebbd7, 0xc259cdb8), TOBN(0x0b0da855, 0xbc6390a9)}, + {TOBN(0x2b4d04db, 0xde314c70), TOBN(0xcdbf1fbc, 
0x6c32e846), + TOBN(0x33833eab, 0xb162fc9e), TOBN(0x9939b48b, 0xb0dd3ab7)}}, + {{TOBN(0x5aaa98a7, 0xcb0c9c8c), TOBN(0x75105f30, 0x81c4375c), + TOBN(0xceee5057, 0x5ef1c90f), TOBN(0xb31e065f, 0xc23a17bf)}, + {TOBN(0x5364d275, 0xd4b6d45a), TOBN(0xd363f3ad, 0x62ec8996), + TOBN(0xb5d21239, 0x4391c65b), TOBN(0x84564765, 0xebb41b47)}}, + {{TOBN(0x20d18ecc, 0x37107c78), TOBN(0xacff3b6b, 0x570c2a66), + TOBN(0x22f975d9, 0x9bd0d845), TOBN(0xef0a0c46, 0xba178fa0)}, + {TOBN(0x1a419651, 0x76b6028e), TOBN(0xc49ec674, 0x248612d4), + TOBN(0x5b6ac4f2, 0x7338af55), TOBN(0x06145e62, 0x7bee5a36)}}, + {{TOBN(0x33e95d07, 0xe75746b5), TOBN(0x1c1e1f6d, 0xc40c78be), + TOBN(0x967833ef, 0x222ff8e2), TOBN(0x4bedcf6a, 0xb49180ad)}, + {TOBN(0x6b37e9c1, 0x3d7a4c8a), TOBN(0x2748887c, 0x6ddfe760), + TOBN(0xf7055123, 0xaa3a5bbc), TOBN(0x954ff225, 0x7bbb8e74)}}, + {{TOBN(0xc42b8ab1, 0x97c3dfb9), TOBN(0x55a549b0, 0xcf168154), + TOBN(0xad6748e7, 0xc1b50692), TOBN(0x2775780f, 0x6fc5cbcb)}, + {TOBN(0x4eab80b8, 0xe1c9d7c8), TOBN(0x8c69dae1, 0x3fdbcd56), + TOBN(0x47e6b4fb, 0x9969eace), TOBN(0x002f1085, 0xa705cb5a)}}, + {{TOBN(0x4e23ca44, 0x6d3fea55), TOBN(0xb4ae9c86, 0xf4810568), + TOBN(0x47bfb91b, 0x2a62f27d), TOBN(0x60deb4c9, 0xd9bac28c)}, + {TOBN(0xa892d894, 0x7de6c34c), TOBN(0x4ee68259, 0x4494587d), + TOBN(0x914ee14e, 0x1a3f8a5b), TOBN(0xbb113eaa, 0x28700385)}}, + {{TOBN(0x81ca03b9, 0x2115b4c9), TOBN(0x7c163d38, 0x8908cad1), + TOBN(0xc912a118, 0xaa18179a), TOBN(0xe09ed750, 0x886e3081)}, + {TOBN(0xa676e3fa, 0x26f516ca), TOBN(0x753cacf7, 0x8e732f91), + TOBN(0x51592aea, 0x833da8b4), TOBN(0xc626f42f, 0x4cbea8aa)}}, + {{TOBN(0xef9dc899, 0xa7b56eaf), TOBN(0x00c0e52c, 0x34ef7316), + TOBN(0x5b1e4e24, 0xfe818a86), TOBN(0x9d31e20d, 0xc538be47)}, + {TOBN(0x22eb932d, 0x3ed68974), TOBN(0xe44bbc08, 0x7c4e87c4), + TOBN(0x4121086e, 0x0dde9aef), TOBN(0x8e6b9cff, 0x134f4345)}}, + {{TOBN(0x96892c1f, 0x711b0eb9), TOBN(0xb905f2c8, 0x780ab954), + TOBN(0xace26309, 0xa20792db), TOBN(0xec8ac9b3, 0x0684e126)}, + {TOBN(0x486ad8b6, 0xb40a2447), TOBN(0x60121fc1, 0x9fe3fb24), + TOBN(0x5626fccf, 0x1a8e3b3f), TOBN(0x4e568622, 0x6ad1f394)}}, + {{TOBN(0xda7aae0d, 0x196aa5a1), TOBN(0xe0df8c77, 0x1041b5fb), + TOBN(0x451465d9, 0x26b318b7), TOBN(0xc29b6e55, 0x7ab136e9)}, + {TOBN(0x2c2ab48b, 0x71148463), TOBN(0xb5738de3, 0x64454a76), + TOBN(0x54ccf9a0, 0x5a03abe4), TOBN(0x377c0296, 0x0427d58e)}}, + {{TOBN(0x73f5f0b9, 0x2bb39c1f), TOBN(0x14373f2c, 0xe608d8c5), + TOBN(0xdcbfd314, 0x00fbb805), TOBN(0xdf18fb20, 0x83afdcfb)}, + {TOBN(0x81a57f42, 0x42b3523f), TOBN(0xe958532d, 0x87f650fb), + TOBN(0xaa8dc8b6, 0x8b0a7d7c), TOBN(0x1b75dfb7, 0x150166be)}}, + {{TOBN(0x90e4f7c9, 0x2d7d1413), TOBN(0x67e2d6b5, 0x9834f597), + TOBN(0x4fd4f4f9, 0xa808c3e8), TOBN(0xaf8237e0, 0xd5281ec1)}, + {TOBN(0x25ab5fdc, 0x84687cee), TOBN(0xc5ded6b1, 0xa5b26c09), + TOBN(0x8e4a5aec, 0xc8ea7650), TOBN(0x23b73e5c, 0x14cc417f)}}, + {{TOBN(0x2bfb4318, 0x3037bf52), TOBN(0xb61e6db5, 0x78c725d7), + TOBN(0x8efd4060, 0xbbb3e5d7), TOBN(0x2e014701, 0xdbac488e)}, + {TOBN(0xac75cf9a, 0x360aa449), TOBN(0xb70cfd05, 0x79634d08), + TOBN(0xa591536d, 0xfffb15ef), TOBN(0xb2c37582, 0xd07c106c)}}, + {{TOBN(0xb4293fdc, 0xf50225f9), TOBN(0xc52e175c, 0xb0e12b03), + TOBN(0xf649c3ba, 0xd0a8bf64), TOBN(0x745a8fef, 0xeb8ae3c6)}, + {TOBN(0x30d7e5a3, 0x58321bc3), TOBN(0xb1732be7, 0x0bc4df48), + TOBN(0x1f217993, 0xe9ea5058), TOBN(0xf7a71cde, 0x3e4fd745)}}, + {{TOBN(0x86cc533e, 0x894c5bbb), TOBN(0x6915c7d9, 0x69d83082), + TOBN(0xa6aa2d05, 0x5815c244), TOBN(0xaeeee592, 0x49b22ce5)}, + {TOBN(0x89e39d13, 0x78135486), 
TOBN(0x3a275c1f, 0x16b76f2f), + TOBN(0xdb6bcc1b, 0xe036e8f5), TOBN(0x4df69b21, 0x5e4709f5)}}, + {{TOBN(0xa188b250, 0x2d0f39aa), TOBN(0x622118bb, 0x15a85947), + TOBN(0x2ebf520f, 0xfde0f4fa), TOBN(0xa40e9f29, 0x4860e539)}, + {TOBN(0x7b6a51eb, 0x22b57f0f), TOBN(0x849a33b9, 0x7e80644a), + TOBN(0x50e5d16f, 0x1cf095fe), TOBN(0xd754b54e, 0xec55f002)}}, + {{TOBN(0x5cfbbb22, 0x236f4a98), TOBN(0x0b0c59e9, 0x066800bb), + TOBN(0x4ac69a8f, 0x5a9a7774), TOBN(0x2b33f804, 0xd6bec948)}, + {TOBN(0xb3729295, 0x32e6c466), TOBN(0x68956d0f, 0x4e599c73), + TOBN(0xa47a249f, 0x155c31cc), TOBN(0x24d80f0d, 0xe1ce284e)}}, + {{TOBN(0xcd821dfb, 0x988baf01), TOBN(0xe6331a7d, 0xdbb16647), + TOBN(0x1eb8ad33, 0x094cb960), TOBN(0x593cca38, 0xc91bbca5)}, + {TOBN(0x384aac8d, 0x26567456), TOBN(0x40fa0309, 0xc04b6490), + TOBN(0x97834cd6, 0xdab6c8f6), TOBN(0x68a7318d, 0x3f91e55f)}}, + {{TOBN(0xa00fd04e, 0xfc4d3157), TOBN(0xb56f8ab2, 0x2bf3bdea), + TOBN(0x014f5648, 0x4fa57172), TOBN(0x948c5860, 0x450abdb3)}, + {TOBN(0x342b5df0, 0x0ebd4f08), TOBN(0x3e5168cd, 0x0e82938e), + TOBN(0x7aedc1ce, 0xb0df5dd0), TOBN(0x6bbbc6d9, 0xe5732516)}}, + {{TOBN(0xc7bfd486, 0x605daaa6), TOBN(0x46fd72b7, 0xbb9a6c9e), + TOBN(0xe4847fb1, 0xa124fb89), TOBN(0x75959cbd, 0xa2d8ffbc)}, + {TOBN(0x42579f65, 0xc8a588ee), TOBN(0x368c92e6, 0xb80b499d), + TOBN(0xea4ef6cd, 0x999a5df1), TOBN(0xaa73bb7f, 0x936fe604)}}, + {{TOBN(0xf347a70d, 0x6457d188), TOBN(0x86eda86b, 0x8b7a388b), + TOBN(0xb7cdff06, 0x0ccd6013), TOBN(0xbeb1b6c7, 0xd0053fb2)}, + {TOBN(0x0b022387, 0x99240a9f), TOBN(0x1bbb384f, 0x776189b2), + TOBN(0x8695e71e, 0x9066193a), TOBN(0x2eb50097, 0x06ffac7e)}}, + {{TOBN(0x0654a9c0, 0x4a7d2caa), TOBN(0x6f3fb3d1, 0xa5aaa290), + TOBN(0x835db041, 0xff476e8f), TOBN(0x540b8b0b, 0xc42295e4)}, + {TOBN(0xa5c73ac9, 0x05e214f5), TOBN(0x9a74075a, 0x56a0b638), + TOBN(0x2e4b1090, 0xce9e680b), TOBN(0x57a5b479, 0x6b8d9afa)}}, + {{TOBN(0x0dca48e7, 0x26bfe65c), TOBN(0x097e391c, 0x7290c307), + TOBN(0x683c462e, 0x6669e72e), TOBN(0xf505be1e, 0x062559ac)}, + {TOBN(0x5fbe3ea1, 0xe3a3035a), TOBN(0x6431ebf6, 0x9cd50da8), + TOBN(0xfd169d5c, 0x1f6407f2), TOBN(0x8d838a95, 0x60fce6b8)}}, + {{TOBN(0x2a2bfa7f, 0x650006f0), TOBN(0xdfd7dad3, 0x50c0fbb2), + TOBN(0x92452495, 0xccf9ad96), TOBN(0x183bf494, 0xd95635f9)}, + {TOBN(0x02d5df43, 0x4a7bd989), TOBN(0x505385cc, 0xa5431095), + TOBN(0xdd98e67d, 0xfd43f53e), TOBN(0xd61e1a6c, 0x500c34a9)}}, + {{TOBN(0x5a4b46c6, 0x4a8a3d62), TOBN(0x8469c4d0, 0x247743d2), + TOBN(0x2bb3a13d, 0x88f7e433), TOBN(0x62b23a10, 0x01be5849)}, + {TOBN(0xe83596b4, 0xa63d1a4c), TOBN(0x454e7fea, 0x7d183f3e), + TOBN(0x643fce61, 0x17afb01c), TOBN(0x4e65e5e6, 0x1c4c3638)}}, + {{TOBN(0x41d85ea1, 0xef74c45b), TOBN(0x2cfbfa66, 0xae328506), + TOBN(0x98b078f5, 0x3ada7da9), TOBN(0xd985fe37, 0xec752fbb)}, + {TOBN(0xeece68fe, 0x5a0148b4), TOBN(0x6f9a55c7, 0x2d78136d), + TOBN(0x232dccc4, 0xd2b729ce), TOBN(0xa27e0dfd, 0x90aafbc4)}}, + {{TOBN(0x96474452, 0x12b4603e), TOBN(0xa876c551, 0x6b706d14), + TOBN(0xdf145fcf, 0x69a9d412), TOBN(0xe2ab75b7, 0x2d479c34)}, + {TOBN(0x12df9a76, 0x1a23ff97), TOBN(0xc6138992, 0x5d359d10), + TOBN(0x6e51c7ae, 0xfa835f22), TOBN(0x69a79cb1, 0xc0fcc4d9)}}, + {{TOBN(0xf57f350d, 0x594cc7e1), TOBN(0x3079ca63, 0x3350ab79), + TOBN(0x226fb614, 0x9aff594a), TOBN(0x35afec02, 0x6d59a62b)}, + {TOBN(0x9bee46f4, 0x06ed2c6e), TOBN(0x58da1735, 0x7d939a57), + TOBN(0x44c50402, 0x8fd1797e), TOBN(0xd8853e7c, 0x5ccea6ca)}}, + {{TOBN(0x4065508d, 0xa35fcd5f), TOBN(0x8965df8c, 0x495ccaeb), + TOBN(0x0f2da850, 0x12e1a962), TOBN(0xee471b94, 0xc1cf1cc4)}, + {TOBN(0xcef19bc8, 
0x0a08fb75), TOBN(0x704958f5, 0x81de3591), + TOBN(0x2867f8b2, 0x3aef4f88), TOBN(0x8d749384, 0xea9f9a5f)}}, + {{TOBN(0x1b385537, 0x8c9049f4), TOBN(0x5be948f3, 0x7b92d8b6), + TOBN(0xd96f725d, 0xb6e2bd6b), TOBN(0x37a222bc, 0x958c454d)}, + {TOBN(0xe7c61abb, 0x8809bf61), TOBN(0x46f07fbc, 0x1346f18d), + TOBN(0xfb567a7a, 0xe87c0d1c), TOBN(0x84a461c8, 0x7ef3d07a)}}, + {{TOBN(0x0a5adce6, 0xd9278d98), TOBN(0x24d94813, 0x9dfc73e1), + TOBN(0x4f3528b6, 0x054321c3), TOBN(0x2e03fdde, 0x692ea706)}, + {TOBN(0x10e60619, 0x47b533c0), TOBN(0x1a8bc73f, 0x2ca3c055), + TOBN(0xae58d4b2, 0x1bb62b8f), TOBN(0xb2045a73, 0x584a24e3)}}, + {{TOBN(0x3ab3d5af, 0xbd76e195), TOBN(0x478dd1ad, 0x6938a810), + TOBN(0x6ffab393, 0x6ee3d5cb), TOBN(0xdfb693db, 0x22b361e4)}, + {TOBN(0xf9694496, 0x51dbf1a7), TOBN(0xcab4b4ef, 0x08a2e762), + TOBN(0xe8c92f25, 0xd39bba9a), TOBN(0x850e61bc, 0xf1464d96)}}, + {{TOBN(0xb7e830e3, 0xdc09508b), TOBN(0xfaf6d2cf, 0x74317655), + TOBN(0x72606ceb, 0xdf690355), TOBN(0x48bb92b3, 0xd0c3ded6)}, + {TOBN(0x65b75484, 0x5c7cf892), TOBN(0xf6cd7ac9, 0xd5d5f01f), + TOBN(0xc2c30a59, 0x96401d69), TOBN(0x91268650, 0xed921878)}}, + {{TOBN(0x380bf913, 0xb78c558f), TOBN(0x43c0baeb, 0xc8afdaa9), + TOBN(0x377f61d5, 0x54f169d3), TOBN(0xf8da07e3, 0xae5ff20b)}, + {TOBN(0xb676c49d, 0xa8a90ea8), TOBN(0x81c1ff2b, 0x83a29b21), + TOBN(0x383297ac, 0x2ad8d276), TOBN(0x3001122f, 0xba89f982)}}, + {{TOBN(0xe1d794be, 0x6718e448), TOBN(0x246c1482, 0x7c3e6e13), + TOBN(0x56646ef8, 0x5d26b5ef), TOBN(0x80f5091e, 0x88069cdd)}, + {TOBN(0xc5992e2f, 0x724bdd38), TOBN(0x02e915b4, 0x8471e8c7), + TOBN(0x96ff320a, 0x0d0ff2a9), TOBN(0xbf886487, 0x4384d1a0)}}, + {{TOBN(0xbbe1e6a6, 0xc93f72d6), TOBN(0xd5f75d12, 0xcad800ea), + TOBN(0xfa40a09f, 0xe7acf117), TOBN(0x32c8cdd5, 0x7581a355)}, + {TOBN(0x74221992, 0x7023c499), TOBN(0xa8afe5d7, 0x38ec3901), + TOBN(0x5691afcb, 0xa90e83f0), TOBN(0x41bcaa03, 0x0b8f8eac)}}, + {{TOBN(0xe38b5ff9, 0x8d2668d5), TOBN(0x0715281a, 0x7ad81965), + TOBN(0x1bc8fc7c, 0x03c6ce11), TOBN(0xcbbee6e2, 0x8b650436)}, + {TOBN(0x06b00fe8, 0x0cdb9808), TOBN(0x17d6e066, 0xfe3ed315), + TOBN(0x2e9d38c6, 0x4d0b5018), TOBN(0xab8bfd56, 0x844dcaef)}}, + {{TOBN(0x42894a59, 0x513aed8b), TOBN(0xf77f3b6d, 0x314bd07a), + TOBN(0xbbdecb8f, 0x8e42b582), TOBN(0xf10e2fa8, 0xd2390fe6)}, + {TOBN(0xefb95022, 0x62a2f201), TOBN(0x4d59ea50, 0x50ee32b0), + TOBN(0xd87f7728, 0x6da789a8), TOBN(0xcf98a2cf, 0xf79492c4)}}, + {{TOBN(0xf9577239, 0x720943c2), TOBN(0xba044cf5, 0x3990b9d0), + TOBN(0x5aa8e823, 0x95f2884a), TOBN(0x834de6ed, 0x0278a0af)}, + {TOBN(0xc8e1ee9a, 0x5f25bd12), TOBN(0x9259ceaa, 0x6f7ab271), + TOBN(0x7e6d97a2, 0x77d00b76), TOBN(0x5c0c6eea, 0xa437832a)}}, + {{TOBN(0x5232c20f, 0x5606b81d), TOBN(0xabd7b375, 0x0d991ee5), + TOBN(0x4d2bfe35, 0x8632d951), TOBN(0x78f85146, 0x98ed9364)}, + {TOBN(0x951873f0, 0xf30c3282), TOBN(0x0da8ac80, 0xa789230b), + TOBN(0x3ac7789c, 0x5398967f), TOBN(0xa69b8f7f, 0xbdda0fb5)}}, + {{TOBN(0xe5db7717, 0x6add8545), TOBN(0x1b71cb66, 0x72c49b66), + TOBN(0xd8560739, 0x68421d77), TOBN(0x03840fe8, 0x83e3afea)}, + {TOBN(0xb391dad5, 0x1ec69977), TOBN(0xae243fb9, 0x307f6726), + TOBN(0xc88ac87b, 0xe8ca160c), TOBN(0x5174cced, 0x4ce355f4)}}, + {{TOBN(0x98a35966, 0xe58ba37d), TOBN(0xfdcc8da2, 0x7817335d), + TOBN(0x5b752830, 0x83fbc7bf), TOBN(0x68e419d4, 0xd9c96984)}, + {TOBN(0x409a39f4, 0x02a40380), TOBN(0x88940faf, 0x1fe977bc), + TOBN(0xc640a94b, 0x8f8edea6), TOBN(0x1e22cd17, 0xed11547d)}}, + {{TOBN(0xe28568ce, 0x59ffc3e2), TOBN(0x60aa1b55, 0xc1dee4e7), + TOBN(0xc67497c8, 0x837cb363), TOBN(0x06fb438a, 0x105a2bf2)}, + 
{TOBN(0x30357ec4, 0x500d8e20), TOBN(0x1ad9095d, 0x0670db10), + TOBN(0x7f589a05, 0xc73b7cfd), TOBN(0xf544607d, 0x880d6d28)}}, + {{TOBN(0x17ba93b1, 0xa20ef103), TOBN(0xad859130, 0x6ba6577b), + TOBN(0x65c91cf6, 0x6fa214a0), TOBN(0xd7d49c6c, 0x27990da5)}, + {TOBN(0xecd9ec8d, 0x20bb569d), TOBN(0xbd4b2502, 0xeeffbc33), + TOBN(0x2056ca5a, 0x6bed0467), TOBN(0x7916a1f7, 0x5b63728c)}}, + {{TOBN(0xd4f9497d, 0x53a4f566), TOBN(0x89734664, 0x97b56810), + TOBN(0xf8e1da74, 0x0494a621), TOBN(0x82546a93, 0x8d011c68)}, + {TOBN(0x1f3acb19, 0xc61ac162), TOBN(0x52f8fa9c, 0xabad0d3e), + TOBN(0x15356523, 0xb4b7ea43), TOBN(0x5a16ad61, 0xae608125)}}, + {{TOBN(0xb0bcb87f, 0x4faed184), TOBN(0x5f236b1d, 0x5029f45f), + TOBN(0xd42c7607, 0x0bc6b1fc), TOBN(0xc644324e, 0x68aefce3)}, + {TOBN(0x8e191d59, 0x5c5d8446), TOBN(0xc0208077, 0x13ae1979), + TOBN(0xadcaee55, 0x3ba59cc7), TOBN(0x20ed6d6b, 0xa2cb81ba)}}, + {{TOBN(0x0952ba19, 0xb6efcffc), TOBN(0x60f12d68, 0x97c0b87c), + TOBN(0x4ee2c7c4, 0x9caa30bc), TOBN(0x767238b7, 0x97fbff4e)}, + {TOBN(0xebc73921, 0x501b5d92), TOBN(0x3279e3df, 0xc2a37737), + TOBN(0x9fc12bc8, 0x6d197543), TOBN(0xfa94dc6f, 0x0a40db4e)}}, + {{TOBN(0x7392b41a, 0x530ccbbd), TOBN(0x87c82146, 0xea823525), + TOBN(0xa52f984c, 0x05d98d0c), TOBN(0x2ae57d73, 0x5ef6974c)}, + {TOBN(0x9377f7bf, 0x3042a6dd), TOBN(0xb1a007c0, 0x19647a64), + TOBN(0xfaa9079a, 0x0cca9767), TOBN(0x3d81a25b, 0xf68f72d5)}}, + {{TOBN(0x752067f8, 0xff81578e), TOBN(0x78622150, 0x9045447d), + TOBN(0xc0c22fcf, 0x0505aa6f), TOBN(0x1030f0a6, 0x6bed1c77)}, + {TOBN(0x31f29f15, 0x1f0bd739), TOBN(0x2d7989c7, 0xe6debe85), + TOBN(0x5c070e72, 0x8e677e98), TOBN(0x0a817bd3, 0x06e81fd5)}}, + {{TOBN(0xc110d830, 0xb0f2ac95), TOBN(0x48d0995a, 0xab20e64e), + TOBN(0x0f3e00e1, 0x7729cd9a), TOBN(0x2a570c20, 0xdd556946)}, + {TOBN(0x912dbcfd, 0x4e86214d), TOBN(0x2d014ee2, 0xcf615498), + TOBN(0x55e2b1e6, 0x3530d76e), TOBN(0xc5135ae4, 0xfd0fd6d1)}}, + {{TOBN(0x0066273a, 0xd4f3049f), TOBN(0xbb8e9893, 0xe7087477), + TOBN(0x2dba1ddb, 0x14c6e5fd), TOBN(0xdba37886, 0x51f57e6c)}, + {TOBN(0x5aaee0a6, 0x5a72f2cf), TOBN(0x1208bfbf, 0x7bea5642), + TOBN(0xf5c6aa3b, 0x67872c37), TOBN(0xd726e083, 0x43f93224)}}, + {{TOBN(0x1854daa5, 0x061f1658), TOBN(0xc0016df1, 0xdf0cd2b3), + TOBN(0xc2a3f23e, 0x833d50de), TOBN(0x73b681d2, 0xbbbd3017)}, + {TOBN(0x2f046dc4, 0x3ac343c0), TOBN(0x9c847e7d, 0x85716421), + TOBN(0xe1e13c91, 0x0917eed4), TOBN(0x3fc9eebd, 0x63a1b9c6)}}, + {{TOBN(0x0f816a72, 0x7fe02299), TOBN(0x6335ccc2, 0x294f3319), + TOBN(0x3820179f, 0x4745c5be), TOBN(0xe647b782, 0x922f066e)}, + {TOBN(0xc22e49de, 0x02cafb8a), TOBN(0x299bc2ff, 0xfcc2eccc), + TOBN(0x9a8feea2, 0x6e0e8282), TOBN(0xa627278b, 0xfe893205)}}, + {{TOBN(0xa7e19733, 0x7933e47b), TOBN(0xf4ff6b13, 0x2e766402), + TOBN(0xa4d8be0a, 0x98440d9f), TOBN(0x658f5c2f, 0x38938808)}, + {TOBN(0x90b75677, 0xc95b3b3e), TOBN(0xfa044269, 0x3137b6ff), + TOBN(0x077b039b, 0x43c47c29), TOBN(0xcca95dd3, 0x8a6445b2)}}, + {{TOBN(0x0b498ba4, 0x2333fc4c), TOBN(0x274f8e68, 0xf736a1b1), + TOBN(0x6ca348fd, 0x5f1d4b2e), TOBN(0x24d3be78, 0xa8f10199)}, + {TOBN(0x8535f858, 0xca14f530), TOBN(0xa6e7f163, 0x5b982e51), + TOBN(0x847c8512, 0x36e1bf62), TOBN(0xf6a7c58e, 0x03448418)}}, + {{TOBN(0x583f3703, 0xf9374ab6), TOBN(0x864f9195, 0x6e564145), + TOBN(0x33bc3f48, 0x22526d50), TOBN(0x9f323c80, 0x1262a496)}, + {TOBN(0xaa97a7ae, 0x3f046a9a), TOBN(0x70da183e, 0xdf8a039a), + TOBN(0x5b68f71c, 0x52aa0ba6), TOBN(0x9be0fe51, 0x21459c2d)}}, + {{TOBN(0xc1e17eb6, 0xcbc613e5), TOBN(0x33131d55, 0x497ea61c), + TOBN(0x2f69d39e, 0xaf7eded5), TOBN(0x73c2f434, 0xde6af11b)}, 
+ {TOBN(0x4ca52493, 0xa4a375fa), TOBN(0x5f06787c, 0xb833c5c2), + TOBN(0x814e091f, 0x3e6e71cf), TOBN(0x76451f57, 0x8b746666)}}}, + {{{TOBN(0x80f9bdef, 0x694db7e0), TOBN(0xedca8787, 0xb9fcddc6), + TOBN(0x51981c34, 0x03b8dce1), TOBN(0x4274dcf1, 0x70e10ba1)}, + {TOBN(0xf72743b8, 0x6def6d1a), TOBN(0xd25b1670, 0xebdb1866), + TOBN(0xc4491e8c, 0x050c6f58), TOBN(0x2be2b2ab, 0x87fbd7f5)}}, + {{TOBN(0x3e0e5c9d, 0xd111f8ec), TOBN(0xbcc33f8d, 0xb7c4e760), + TOBN(0x702f9a91, 0xbd392a51), TOBN(0x7da4a795, 0xc132e92d)}, + {TOBN(0x1a0b0ae3, 0x0bb1151b), TOBN(0x54febac8, 0x02e32251), + TOBN(0xea3a5082, 0x694e9e78), TOBN(0xe58ffec1, 0xe4fe40b8)}}, + {{TOBN(0xf85592fc, 0xd1e0cf9e), TOBN(0xdea75f0d, 0xc0e7b2e8), + TOBN(0xc04215cf, 0xc135584e), TOBN(0x174fc727, 0x2f57092a)}, + {TOBN(0xe7277877, 0xeb930bea), TOBN(0x504caccb, 0x5eb02a5a), + TOBN(0xf9fe08f7, 0xf5241b9b), TOBN(0xe7fb62f4, 0x8d5ca954)}}, + {{TOBN(0xfbb8349d, 0x29c4120b), TOBN(0x9f94391f, 0xc0d0d915), + TOBN(0xc4074fa7, 0x5410ba51), TOBN(0xa66adbf6, 0x150a5911)}, + {TOBN(0xc164543c, 0x34bfca38), TOBN(0xe0f27560, 0xb9e1ccfc), + TOBN(0x99da0f53, 0xe820219c), TOBN(0xe8234498, 0xc6b4997a)}}, + {{TOBN(0xcfb88b76, 0x9d4c5423), TOBN(0x9e56eb10, 0xb0521c49), + TOBN(0x418e0b5e, 0xbe8700a1), TOBN(0x00cbaad6, 0xf93cb58a)}, + {TOBN(0xe923fbde, 0xd92a5e67), TOBN(0xca4979ac, 0x1f347f11), + TOBN(0x89162d85, 0x6bc0585b), TOBN(0xdd6254af, 0xac3c70e3)}}, + {{TOBN(0x7b23c513, 0x516e19e4), TOBN(0x56e2e847, 0xc5c4d593), + TOBN(0x9f727d73, 0x5ce71ef6), TOBN(0x5b6304a6, 0xf79a44c5)}, + {TOBN(0x6638a736, 0x3ab7e433), TOBN(0x1adea470, 0xfe742f83), + TOBN(0xe054b854, 0x5b7fc19f), TOBN(0xf935381a, 0xba1d0698)}}, + {{TOBN(0x546eab2d, 0x799e9a74), TOBN(0x96239e0e, 0xa949f729), + TOBN(0xca274c6b, 0x7090055a), TOBN(0x835142c3, 0x9020c9b0)}, + {TOBN(0xa405667a, 0xa2e8807f), TOBN(0x29f2c085, 0x1aa3d39e), + TOBN(0xcc555d64, 0x42fc72f5), TOBN(0xe856e0e7, 0xfbeacb3c)}}, + {{TOBN(0xb5504f9d, 0x918e4936), TOBN(0x65035ef6, 0xb2513982), + TOBN(0x0553a0c2, 0x6f4d9cb9), TOBN(0x6cb10d56, 0xbea85509)}, + {TOBN(0x48d957b7, 0xa242da11), TOBN(0x16a4d3dd, 0x672b7268), + TOBN(0x3d7e637c, 0x8502a96b), TOBN(0x27c7032b, 0x730d463b)}}, + {{TOBN(0xbdc02b18, 0xe4136a14), TOBN(0xbacf969d, 0x678e32bf), + TOBN(0xc98d89a3, 0xdd9c3c03), TOBN(0x7b92420a, 0x23becc4f)}, + {TOBN(0xd4b41f78, 0xc64d565c), TOBN(0x9f969d00, 0x10f28295), + TOBN(0xec7f7f76, 0xb13d051a), TOBN(0x08945e1e, 0xa92da585)}}, + {{TOBN(0x55366b7d, 0x5846426f), TOBN(0xe7d09e89, 0x247d441d), + TOBN(0x510b404d, 0x736fbf48), TOBN(0x7fa003d0, 0xe784bd7d)}, + {TOBN(0x25f7614f, 0x17fd9596), TOBN(0x49e0e0a1, 0x35cb98db), + TOBN(0x2c65957b, 0x2e83a76a), TOBN(0x5d40da8d, 0xcddbe0f8)}}, + {{TOBN(0xf2b8c405, 0x050bad24), TOBN(0x8918426d, 0xc2aa4823), + TOBN(0x2aeab3dd, 0xa38365a7), TOBN(0x72031717, 0x7c91b690)}, + {TOBN(0x8b00d699, 0x60a94120), TOBN(0x478a255d, 0xe99eaeec), + TOBN(0xbf656a5f, 0x6f60aafd), TOBN(0xdfd7cb75, 0x5dee77b3)}}, + {{TOBN(0x37f68bb4, 0xa595939d), TOBN(0x03556479, 0x28740217), + TOBN(0x8e740e7c, 0x84ad7612), TOBN(0xd89bc843, 0x9044695f)}, + {TOBN(0xf7f3da5d, 0x85a9184d), TOBN(0x562563bb, 0x9fc0b074), + TOBN(0x06d2e6aa, 0xf88a888e), TOBN(0x612d8643, 0x161fbe7c)}}, + {{TOBN(0x465edba7, 0xf64085e7), TOBN(0xb230f304, 0x29aa8511), + TOBN(0x53388426, 0xcda2d188), TOBN(0x90885735, 0x4b666649)}, + {TOBN(0x6f02ff9a, 0x652f54f6), TOBN(0x65c82294, 0x5fae2bf0), + TOBN(0x7816ade0, 0x62f5eee3), TOBN(0xdcdbdf43, 0xfcc56d70)}}, + {{TOBN(0x9fb3bba3, 0x54530bb2), TOBN(0xbde3ef77, 0xcb0869ea), + TOBN(0x89bc9046, 0x0b431163), TOBN(0x4d03d7d2, 
0xe4819a35)}, + {TOBN(0x33ae4f9e, 0x43b6a782), TOBN(0x216db307, 0x9c88a686), + TOBN(0x91dd88e0, 0x00ffedd9), TOBN(0xb280da9f, 0x12bd4840)}}, + {{TOBN(0x32a7cb8a, 0x1635e741), TOBN(0xfe14008a, 0x78be02a7), + TOBN(0x3fafb334, 0x1b7ae030), TOBN(0x7fd508e7, 0x5add0ce9)}, + {TOBN(0x72c83219, 0xd607ad51), TOBN(0x0f229c0a, 0x8d40964a), + TOBN(0x1be2c336, 0x1c878da2), TOBN(0xe0c96742, 0xeab2ab86)}}, + {{TOBN(0x458f8691, 0x3e538cd7), TOBN(0xa7001f6c, 0x8e08ad53), + TOBN(0x52b8c6e6, 0xbf5d15ff), TOBN(0x548234a4, 0x011215dd)}, + {TOBN(0xff5a9d2d, 0x3d5b4045), TOBN(0xb0ffeeb6, 0x4a904190), + TOBN(0x55a3aca4, 0x48607f8b), TOBN(0x8cbd665c, 0x30a0672a)}}, + {{TOBN(0x87f834e0, 0x42583068), TOBN(0x02da2aeb, 0xf3f6e683), + TOBN(0x6b763e5d, 0x05c12248), TOBN(0x7230378f, 0x65a8aefc)}, + {TOBN(0x93bd80b5, 0x71e8e5ca), TOBN(0x53ab041c, 0xb3b62524), + TOBN(0x1b860513, 0x6c9c552e), TOBN(0xe84d402c, 0xd5524e66)}}, + {{TOBN(0xa37f3573, 0xf37f5937), TOBN(0xeb0f6c7d, 0xd1e4fca5), + TOBN(0x2965a554, 0xac8ab0fc), TOBN(0x17fbf56c, 0x274676ac)}, + {TOBN(0x2e2f6bd9, 0xacf7d720), TOBN(0x41fc8f88, 0x10224766), + TOBN(0x517a14b3, 0x85d53bef), TOBN(0xdae327a5, 0x7d76a7d1)}}, + {{TOBN(0x6ad0a065, 0xc4818267), TOBN(0x33aa189b, 0x37c1bbc1), + TOBN(0x64970b52, 0x27392a92), TOBN(0x21699a1c, 0x2d1535ea)}, + {TOBN(0xcd20779c, 0xc2d7a7fd), TOBN(0xe3186059, 0x99c83cf2), + TOBN(0x9b69440b, 0x72c0b8c7), TOBN(0xa81497d7, 0x7b9e0e4d)}}, + {{TOBN(0x515d5c89, 0x1f5f82dc), TOBN(0x9a7f67d7, 0x6361079e), + TOBN(0xa8da81e3, 0x11a35330), TOBN(0xe44990c4, 0x4b18be1b)}, + {TOBN(0xc7d5ed95, 0xaf103e59), TOBN(0xece8aba7, 0x8dac9261), + TOBN(0xbe82b099, 0x9394b8d3), TOBN(0x6830f09a, 0x16adfe83)}}, + {{TOBN(0x250a29b4, 0x88172d01), TOBN(0x8b20bd65, 0xcaff9e02), + TOBN(0xb8a7661e, 0xe8a6329a), TOBN(0x4520304d, 0xd3fce920)}, + {TOBN(0xae45da1f, 0x2b47f7ef), TOBN(0xe07f5288, 0x5bffc540), + TOBN(0xf7997009, 0x3464f874), TOBN(0x2244c2cd, 0xa6fa1f38)}}, + {{TOBN(0x43c41ac1, 0x94d7d9b1), TOBN(0x5bafdd82, 0xc82e7f17), + TOBN(0xdf0614c1, 0x5fda0fca), TOBN(0x74b043a7, 0xa8ae37ad)}, + {TOBN(0x3ba6afa1, 0x9e71734c), TOBN(0x15d5437e, 0x9c450f2e), + TOBN(0x4a5883fe, 0x67e242b1), TOBN(0x5143bdc2, 0x2c1953c2)}}, + {{TOBN(0x542b8b53, 0xfc5e8920), TOBN(0x363bf9a8, 0x9a9cee08), + TOBN(0x02375f10, 0xc3486e08), TOBN(0x2037543b, 0x8c5e70d2)}, + {TOBN(0x7109bccc, 0x625640b4), TOBN(0xcbc1051e, 0x8bc62c3b), + TOBN(0xf8455fed, 0x803f26ea), TOBN(0x6badceab, 0xeb372424)}}, + {{TOBN(0xa2a9ce7c, 0x6b53f5f9), TOBN(0x64246595, 0x1b176d99), + TOBN(0xb1298d36, 0xb95c081b), TOBN(0x53505bb8, 0x1d9a9ee6)}, + {TOBN(0x3f6f9e61, 0xf2ba70b0), TOBN(0xd07e16c9, 0x8afad453), + TOBN(0x9f1694bb, 0xe7eb4a6a), TOBN(0xdfebced9, 0x3cb0bc8e)}}, + {{TOBN(0x92d3dcdc, 0x53868c8b), TOBN(0x174311a2, 0x386107a6), + TOBN(0x4109e07c, 0x689b4e64), TOBN(0x30e4587f, 0x2df3dcb6)}, + {TOBN(0x841aea31, 0x0811b3b2), TOBN(0x6144d41d, 0x0cce43ea), + TOBN(0x464c4581, 0x2a9a7803), TOBN(0xd03d371f, 0x3e158930)}}, + {{TOBN(0xc676d7f2, 0xb1f3390b), TOBN(0x9f7a1b8c, 0xa5b61272), + TOBN(0x4ebebfc9, 0xc2e127a9), TOBN(0x4602500c, 0x5dd997bf)}, + {TOBN(0x7f09771c, 0x4711230f), TOBN(0x058eb37c, 0x020f09c1), + TOBN(0xab693d4b, 0xfee5e38b), TOBN(0x9289eb1f, 0x4653cbc0)}}, + {{TOBN(0xbecf46ab, 0xd51b9cf5), TOBN(0xd2aa9c02, 0x9f0121af), + TOBN(0x36aaf7d2, 0xe90dc274), TOBN(0x909e4ea0, 0x48b95a3c)}, + {TOBN(0xe6b70496, 0x6f32dbdb), TOBN(0x672188a0, 0x8b030b3e), + TOBN(0xeeffe5b3, 0xcfb617e2), TOBN(0x87e947de, 0x7c82709e)}}, + {{TOBN(0xa44d2b39, 0x1770f5a7), TOBN(0xe4d4d791, 0x0e44eb82), + TOBN(0x42e69d1e, 0x3f69712a), 
TOBN(0xbf11c4d6, 0xac6a820e)}, + {TOBN(0xb5e7f3e5, 0x42c4224c), TOBN(0xd6b4e81c, 0x449d941c), + TOBN(0x5d72bd16, 0x5450e878), TOBN(0x6a61e28a, 0xee25ac54)}}, + {{TOBN(0x33272094, 0xe6f1cd95), TOBN(0x7512f30d, 0x0d18673f), + TOBN(0x32f7a4ca, 0x5afc1464), TOBN(0x2f095656, 0x6bbb977b)}, + {TOBN(0x586f47ca, 0xa8226200), TOBN(0x02c868ad, 0x1ac07369), + TOBN(0x4ef2b845, 0xc613acbe), TOBN(0x43d7563e, 0x0386054c)}}, + {{TOBN(0x54da9dc7, 0xab952578), TOBN(0xb5423df2, 0x26e84d0b), + TOBN(0xa8b64eeb, 0x9b872042), TOBN(0xac205782, 0x5990f6df)}, + {TOBN(0x4ff696eb, 0x21f4c77a), TOBN(0x1a79c3e4, 0xaab273af), + TOBN(0x29bc922e, 0x9436b3f1), TOBN(0xff807ef8, 0xd6d9a27a)}}, + {{TOBN(0x82acea3d, 0x778f22a0), TOBN(0xfb10b2e8, 0x5b5e7469), + TOBN(0xc0b16980, 0x2818ee7d), TOBN(0x011afff4, 0xc91c1a2f)}, + {TOBN(0x95a6d126, 0xad124418), TOBN(0x31c081a5, 0xe72e295f), + TOBN(0x36bb283a, 0xf2f4db75), TOBN(0xd115540f, 0x7acef462)}}, + {{TOBN(0xc7f3a8f8, 0x33f6746c), TOBN(0x21e46f65, 0xfea990ca), + TOBN(0x915fd5c5, 0xcaddb0a9), TOBN(0xbd41f016, 0x78614555)}, + {TOBN(0x346f4434, 0x426ffb58), TOBN(0x80559436, 0x14dbc204), + TOBN(0xf3dd20fe, 0x5a969b7f), TOBN(0x9d59e956, 0xe899a39a)}}, + {{TOBN(0xf1b0971c, 0x8ad4cf4b), TOBN(0x03448860, 0x2ffb8fb8), + TOBN(0xf071ac3c, 0x65340ba4), TOBN(0x408d0596, 0xb27fd758)}, + {TOBN(0xe7c78ea4, 0x98c364b0), TOBN(0xa4aac4a5, 0x051e8ab5), + TOBN(0xb9e1d560, 0x485d9002), TOBN(0x9acd518a, 0x88844455)}}, + {{TOBN(0xe4ca688f, 0xd06f56c0), TOBN(0xa48af70d, 0xdf027972), + TOBN(0x691f0f04, 0x5e9a609d), TOBN(0xa9dd82cd, 0xee61270e)}, + {TOBN(0x8903ca63, 0xa0ef18d3), TOBN(0x9fb7ee35, 0x3d6ca3bd), + TOBN(0xa7b4a09c, 0xabf47d03), TOBN(0x4cdada01, 0x1c67de8e)}}, + {{TOBN(0x52003749, 0x9355a244), TOBN(0xe77fd2b6, 0x4f2151a9), + TOBN(0x695d6cf6, 0x66b4efcb), TOBN(0xc5a0cacf, 0xda2cfe25)}, + {TOBN(0x104efe5c, 0xef811865), TOBN(0xf52813e8, 0x9ea5cc3d), + TOBN(0x855683dc, 0x40b58dbc), TOBN(0x0338ecde, 0x175fcb11)}}, + {{TOBN(0xf9a05637, 0x74921592), TOBN(0xb4f1261d, 0xb9bb9d31), + TOBN(0x551429b7, 0x4e9c5459), TOBN(0xbe182e6f, 0x6ea71f53)}, + {TOBN(0xd3a3b07c, 0xdfc50573), TOBN(0x9ba1afda, 0x62be8d44), + TOBN(0x9bcfd2cb, 0x52ab65d3), TOBN(0xdf11d547, 0xa9571802)}}, + {{TOBN(0x099403ee, 0x02a2404a), TOBN(0x497406f4, 0x21088a71), + TOBN(0x99479409, 0x5004ae71), TOBN(0xbdb42078, 0xa812c362)}, + {TOBN(0x2b72a30f, 0xd8828442), TOBN(0x283add27, 0xfcb5ed1c), + TOBN(0xf7c0e200, 0x66a40015), TOBN(0x3e3be641, 0x08b295ef)}}, + {{TOBN(0xac127dc1, 0xe038a675), TOBN(0x729deff3, 0x8c5c6320), + TOBN(0xb7df8fd4, 0xa90d2c53), TOBN(0x9b74b0ec, 0x681e7cd3)}, + {TOBN(0x5cb5a623, 0xdab407e5), TOBN(0xcdbd3615, 0x76b340c6), + TOBN(0xa184415a, 0x7d28392c), TOBN(0xc184c1d8, 0xe96f7830)}}, + {{TOBN(0xc3204f19, 0x81d3a80f), TOBN(0xfde0c841, 0xc8e02432), + TOBN(0x78203b3e, 0x8149e0c1), TOBN(0x5904bdbb, 0x08053a73)}, + {TOBN(0x30fc1dd1, 0x101b6805), TOBN(0x43c223bc, 0x49aa6d49), + TOBN(0x9ed67141, 0x7a174087), TOBN(0x311469a0, 0xd5997008)}}, + {{TOBN(0xb189b684, 0x5e43fc61), TOBN(0xf3282375, 0xe0d3ab57), + TOBN(0x4fa34b67, 0xb1181da8), TOBN(0x621ed0b2, 0x99ee52b8)}, + {TOBN(0x9b178de1, 0xad990676), TOBN(0xd51de67b, 0x56d54065), + TOBN(0x2a2c27c4, 0x7538c201), TOBN(0x33856ec8, 0x38a40f5c)}}, + {{TOBN(0x2522fc15, 0xbe6cdcde), TOBN(0x1e603f33, 0x9f0c6f89), + TOBN(0x7994edc3, 0x103e30a6), TOBN(0x033a00db, 0x220c853e)}, + {TOBN(0xd3cfa409, 0xf7bb7fd7), TOBN(0x70f8781e, 0x462d18f6), + TOBN(0xbbd82980, 0x687fe295), TOBN(0x6eef4c32, 0x595669f3)}}, + {{TOBN(0x86a9303b, 0x2f7e85c3), TOBN(0x5fce4621, 0x71988f9b), + TOBN(0x5b935bf6, 
0xc138acb5), TOBN(0x30ea7d67, 0x25661212)}, + {TOBN(0xef1eb5f4, 0xe51ab9a2), TOBN(0x0587c98a, 0xae067c78), + TOBN(0xb3ce1b3c, 0x77ca9ca6), TOBN(0x2a553d4d, 0x54b5f057)}}, + {{TOBN(0xc7898236, 0x4da29ec2), TOBN(0xdbdd5d13, 0xb9c57316), + TOBN(0xc57d6e6b, 0x2cd80d47), TOBN(0x80b460cf, 0xfe9e7391)}, + {TOBN(0x98648cab, 0xf963c31e), TOBN(0x67f9f633, 0xcc4d32fd), + TOBN(0x0af42a9d, 0xfdf7c687), TOBN(0x55f292a3, 0x0b015ea7)}}, + {{TOBN(0x89e468b2, 0xcd21ab3d), TOBN(0xe504f022, 0xc393d392), + TOBN(0xab21e1d4, 0xa5013af9), TOBN(0xe3283f78, 0xc2c28acb)}, + {TOBN(0xf38b35f6, 0x226bf99f), TOBN(0xe8354274, 0x0e291e69), + TOBN(0x61673a15, 0xb20c162d), TOBN(0xc101dc75, 0xb04fbdbe)}}, + {{TOBN(0x8323b4c2, 0x255bd617), TOBN(0x6c969693, 0x6c2a9154), + TOBN(0xc6e65860, 0x62679387), TOBN(0x8e01db0c, 0xb8c88e23)}, + {TOBN(0x33c42873, 0x893a5559), TOBN(0x7630f04b, 0x47a3e149), + TOBN(0xb5d80805, 0xddcf35f8), TOBN(0x582ca080, 0x77dfe732)}}, + {{TOBN(0x2c7156e1, 0x0b1894a0), TOBN(0x92034001, 0xd81c68c0), + TOBN(0xed225d00, 0xc8b115b5), TOBN(0x237f9c22, 0x83b907f2)}, + {TOBN(0x0ea2f32f, 0x4470e2c0), TOBN(0xb725f7c1, 0x58be4e95), + TOBN(0x0f1dcafa, 0xb1ae5463), TOBN(0x59ed5187, 0x1ba2fc04)}}, + {{TOBN(0xf6e0f316, 0xd0115d4d), TOBN(0x5180b12f, 0xd3691599), + TOBN(0x157e32c9, 0x527f0a41), TOBN(0x7b0b081d, 0xa8e0ecc0)}, + {TOBN(0x6dbaaa8a, 0xbf4f0dd0), TOBN(0x99b289c7, 0x4d252696), + TOBN(0x79b7755e, 0xdbf864fe), TOBN(0x6974e2b1, 0x76cad3ab)}}, + {{TOBN(0x35dbbee2, 0x06ddd657), TOBN(0xe7cbdd11, 0x2ff3a96d), + TOBN(0x88381968, 0x076be758), TOBN(0x2d737e72, 0x08c91f5d)}, + {TOBN(0x5f83ab62, 0x86ec3776), TOBN(0x98aa649d, 0x945fa7a1), + TOBN(0xf477ec37, 0x72ef0933), TOBN(0x66f52b1e, 0x098c17b1)}}, + {{TOBN(0x9eec58fb, 0xd803738b), TOBN(0x91aaade7, 0xe4e86aa4), + TOBN(0x6b1ae617, 0xa5b51492), TOBN(0x63272121, 0xbbc45974)}, + {TOBN(0x7e0e28f0, 0x862c5129), TOBN(0x0a8f79a9, 0x3321a4a0), + TOBN(0xe26d1664, 0x5041c88f), TOBN(0x0571b805, 0x53233e3a)}}, + {{TOBN(0xd1b0ccde, 0xc9520711), TOBN(0x55a9e4ed, 0x3c8b84bf), + TOBN(0x9426bd39, 0xa1fef314), TOBN(0x4f5f638e, 0x6eb93f2b)}, + {TOBN(0xba2a1ed3, 0x2bf9341b), TOBN(0xd63c1321, 0x4d42d5a9), + TOBN(0xd2964a89, 0x316dc7c5), TOBN(0xd1759606, 0xca511851)}}, + {{TOBN(0xd8a9201f, 0xf9e6ed35), TOBN(0xb7b5ee45, 0x6736925a), + TOBN(0x0a83fbbc, 0x99581af7), TOBN(0x3076bc40, 0x64eeb051)}, + {TOBN(0x5511c98c, 0x02dec312), TOBN(0x270de898, 0x238dcb78), + TOBN(0x2cf4cf9c, 0x539c08c9), TOBN(0xa70cb65e, 0x38d3b06e)}}, + {{TOBN(0xb12ec10e, 0xcfe57bbd), TOBN(0x82c7b656, 0x35a0c2b5), + TOBN(0xddc7d5cd, 0x161c67bd), TOBN(0xe32e8985, 0xae3a32cc)}, + {TOBN(0x7aba9444, 0xd11a5529), TOBN(0xe964ed02, 0x2427fa1a), + TOBN(0x1528392d, 0x24a1770a), TOBN(0xa152ce2c, 0x12c72fcd)}}, + {{TOBN(0x714553a4, 0x8ec07649), TOBN(0x18b4c290, 0x459dd453), + TOBN(0xea32b714, 0x7b64b110), TOBN(0xb871bfa5, 0x2e6f07a2)}, + {TOBN(0xb67112e5, 0x9e2e3c9b), TOBN(0xfbf250e5, 0x44aa90f6), + TOBN(0xf77aedb8, 0xbd539006), TOBN(0x3b0cdf9a, 0xd172a66f)}}, + {{TOBN(0xedf69fea, 0xf8c51187), TOBN(0x05bb67ec, 0x741e4da7), + TOBN(0x47df0f32, 0x08114345), TOBN(0x56facb07, 0xbb9792b1)}, + {TOBN(0xf3e007e9, 0x8f6229e4), TOBN(0x62d103f4, 0x526fba0f), + TOBN(0x4f33bef7, 0xb0339d79), TOBN(0x9841357b, 0xb59bfec1)}}, + {{TOBN(0xfa8dbb59, 0xc34e6705), TOBN(0xc3c7180b, 0x7fdaa84c), + TOBN(0xf95872fc, 0xa4108537), TOBN(0x8750cc3b, 0x932a3e5a)}, + {TOBN(0xb61cc69d, 0xb7275d7d), TOBN(0xffa0168b, 0x2e59b2e9), + TOBN(0xca032abc, 0x6ecbb493), TOBN(0x1d86dbd3, 0x2c9082d8)}}, + {{TOBN(0xae1e0b67, 0xe28ef5ba), TOBN(0x2c9a4699, 0xcb18e169), + 
TOBN(0x0ecd0e33, 0x1e6bbd20), TOBN(0x571b360e, 0xaf5e81d2)}, + {TOBN(0xcd9fea58, 0x101c1d45), TOBN(0x6651788e, 0x18880452), + TOBN(0xa9972635, 0x1f8dd446), TOBN(0x44bed022, 0xe37281d0)}}, + {{TOBN(0x094b2b2d, 0x33da525d), TOBN(0xf193678e, 0x13144fd8), + TOBN(0xb8ab5ba4, 0xf4c1061d), TOBN(0x4343b5fa, 0xdccbe0f4)}, + {TOBN(0xa8702371, 0x63812713), TOBN(0x47bf6d2d, 0xf7611d93), + TOBN(0x46729b8c, 0xbd21e1d7), TOBN(0x7484d4e0, 0xd629e77d)}}, + {{TOBN(0x830e6eea, 0x60dbac1f), TOBN(0x23d8c484, 0xda06a2f7), + TOBN(0x896714b0, 0x50ca535b), TOBN(0xdc8d3644, 0xebd97a9b)}, + {TOBN(0x106ef9fa, 0xb12177b4), TOBN(0xf79bf464, 0x534d5d9c), + TOBN(0x2537a349, 0xa6ab360b), TOBN(0xc7c54253, 0xa00c744f)}}, + {{TOBN(0xb3c7a047, 0xe5911a76), TOBN(0x61ffa5c8, 0x647f1ee7), + TOBN(0x15aed36f, 0x8f56ab42), TOBN(0x6a0d41b0, 0xa3ff9ac9)}, + {TOBN(0x68f469f5, 0xcc30d357), TOBN(0xbe9adf81, 0x6b72be96), + TOBN(0x1cd926fe, 0x903ad461), TOBN(0x7e89e38f, 0xcaca441b)}}, + {{TOBN(0xf0f82de5, 0xfacf69d4), TOBN(0x363b7e76, 0x4775344c), + TOBN(0x6894f312, 0xb2e36d04), TOBN(0x3c6cb4fe, 0x11d1c9a5)}, + {TOBN(0x85d9c339, 0x4008e1f2), TOBN(0x5e9a85ea, 0x249f326c), + TOBN(0xdc35c60a, 0x678c5e06), TOBN(0xc08b944f, 0x9f86fba9)}}, + {{TOBN(0xde40c02c, 0x89f71f0f), TOBN(0xad8f3e31, 0xff3da3c0), + TOBN(0x3ea5096b, 0x42125ded), TOBN(0x13879cbf, 0xa7379183)}, + {TOBN(0x6f4714a5, 0x6b306a0b), TOBN(0x359c2ea6, 0x67646c5e), + TOBN(0xfacf8943, 0x07726368), TOBN(0x07a58935, 0x65ff431e)}}, + {{TOBN(0x24d661d1, 0x68754ab0), TOBN(0x801fce1d, 0x6f429a76), + TOBN(0xc068a85f, 0xa58ce769), TOBN(0xedc35c54, 0x5d5eca2b)}, + {TOBN(0xea31276f, 0xa3f660d1), TOBN(0xa0184ebe, 0xb8fc7167), + TOBN(0x0f20f21a, 0x1d8db0ae), TOBN(0xd96d095f, 0x56c35e12)}}, + {{TOBN(0xedf402b5, 0xf8c2a25b), TOBN(0x1bb772b9, 0x059204b6), + TOBN(0x50cbeae2, 0x19b4e34c), TOBN(0x93109d80, 0x3fa0845a)}, + {TOBN(0x54f7ccf7, 0x8ef59fb5), TOBN(0x3b438fe2, 0x88070963), + TOBN(0x9e28c659, 0x31f3ba9b), TOBN(0x9cc31b46, 0xead9da92)}}, + {{TOBN(0x3c2f0ba9, 0xb733aa5f), TOBN(0xdece47cb, 0xf05af235), + TOBN(0xf8e3f715, 0xa2ac82a5), TOBN(0xc97ba641, 0x2203f18a)}, + {TOBN(0xc3af5504, 0x09c11060), TOBN(0x56ea2c05, 0x46af512d), + TOBN(0xfac28daf, 0xf3f28146), TOBN(0x87fab43a, 0x959ef494)}}}, + {{{TOBN(0x09891641, 0xd4c5105f), TOBN(0x1ae80f8e, 0x6d7fbd65), + TOBN(0x9d67225f, 0xbee6bdb0), TOBN(0x3b433b59, 0x7fc4d860)}, + {TOBN(0x44e66db6, 0x93e85638), TOBN(0xf7b59252, 0xe3e9862f), + TOBN(0xdb785157, 0x665c32ec), TOBN(0x702fefd7, 0xae362f50)}}, + {{TOBN(0x3754475d, 0x0fefb0c3), TOBN(0xd48fb56b, 0x46d7c35d), + TOBN(0xa070b633, 0x363798a4), TOBN(0xae89f3d2, 0x8fdb98e6)}, + {TOBN(0x970b89c8, 0x6363d14c), TOBN(0x89817521, 0x67abd27d), + TOBN(0x9bf7d474, 0x44d5a021), TOBN(0xb3083baf, 0xcac72aee)}}, + {{TOBN(0x389741de, 0xbe949a44), TOBN(0x638e9388, 0x546a4fa5), + TOBN(0x3fe6419c, 0xa0047bdc), TOBN(0x7047f648, 0xaaea57ca)}, + {TOBN(0x54e48a90, 0x41fbab17), TOBN(0xda8e0b28, 0x576bdba2), + TOBN(0xe807eebc, 0xc72afddc), TOBN(0x07d3336d, 0xf42577bf)}}, + {{TOBN(0x62a8c244, 0xbfe20925), TOBN(0x91c19ac3, 0x8fdce867), + TOBN(0x5a96a5d5, 0xdd387063), TOBN(0x61d587d4, 0x21d324f6)}, + {TOBN(0xe87673a2, 0xa37173ea), TOBN(0x23848008, 0x53778b65), + TOBN(0x10f8441e, 0x05bab43e), TOBN(0xfa11fe12, 0x4621efbe)}}, + {{TOBN(0x047b772e, 0x81685d7b), TOBN(0x23f27d81, 0xbf34a976), + TOBN(0xc27608e2, 0x915f48ef), TOBN(0x3b0b43fa, 0xa521d5c3)}, + {TOBN(0x7613fb26, 0x63ca7284), TOBN(0x7f5729b4, 0x1d4db837), + TOBN(0x87b14898, 0x583b526b), TOBN(0x00b732a6, 0xbbadd3d1)}}, + {{TOBN(0x8e02f426, 0x2048e396), TOBN(0x436b50b6, 
0x383d9de4), + TOBN(0xf78d3481, 0x471e85ad), TOBN(0x8b01ea6a, 0xd005c8d6)}, + {TOBN(0xd3c7afee, 0x97015c07), TOBN(0x46cdf1a9, 0x4e3ba2ae), + TOBN(0x7a42e501, 0x83d3a1d2), TOBN(0xd54b5268, 0xb541dff4)}}, + {{TOBN(0x3f24cf30, 0x4e23e9bc), TOBN(0x4387f816, 0x126e3624), + TOBN(0x26a46a03, 0x3b0b6d61), TOBN(0xaf1bc845, 0x8b2d777c)}, + {TOBN(0x25c401ba, 0x527de79c), TOBN(0x0e1346d4, 0x4261bbb6), + TOBN(0x4b96c44b, 0x287b4bc7), TOBN(0x658493c7, 0x5254562f)}}, + {{TOBN(0x23f949fe, 0xb8a24a20), TOBN(0x17ebfed1, 0xf52ca53f), + TOBN(0x9b691bbe, 0xbcfb4853), TOBN(0x5617ff6b, 0x6278a05d)}, + {TOBN(0x241b34c5, 0xe3c99ebd), TOBN(0xfc64242e, 0x1784156a), + TOBN(0x4206482f, 0x695d67df), TOBN(0xb967ce0e, 0xee27c011)}}, + {{TOBN(0x65db3751, 0x21c80b5d), TOBN(0x2e7a563c, 0xa31ecca0), + TOBN(0xe56ffc4e, 0x5238a07e), TOBN(0x3d6c2966, 0x32ced854)}, + {TOBN(0xe99d7d1a, 0xaf70b885), TOBN(0xafc3bad9, 0x2d686459), + TOBN(0x9c78bf46, 0x0cc8ba5b), TOBN(0x5a439519, 0x18955aa3)}}, + {{TOBN(0xf8b517a8, 0x5fe4e314), TOBN(0xe60234d0, 0xfcb8906f), + TOBN(0xffe542ac, 0xf2061b23), TOBN(0x287e191f, 0x6b4cb59c)}, + {TOBN(0x21857ddc, 0x09d877d8), TOBN(0x1c23478c, 0x14678941), + TOBN(0xbbf0c056, 0xb6e05ea4), TOBN(0x82da4b53, 0xb01594fe)}}, + {{TOBN(0xf7526791, 0xfadb8608), TOBN(0x049e832d, 0x7b74cdf6), + TOBN(0xa43581cc, 0xc2b90a34), TOBN(0x73639eb8, 0x9360b10c)}, + {TOBN(0x4fba331f, 0xe1e4a71b), TOBN(0x6ffd6b93, 0x8072f919), + TOBN(0x6e53271c, 0x65679032), TOBN(0x67206444, 0xf14272ce)}}, + {{TOBN(0xc0f734a3, 0xb2335834), TOBN(0x9526205a, 0x90ef6860), + TOBN(0xcb8be717, 0x04e2bb0d), TOBN(0x2418871e, 0x02f383fa)}, + {TOBN(0xd7177681, 0x4082c157), TOBN(0xcc914ad0, 0x29c20073), + TOBN(0xf186c1eb, 0xe587e728), TOBN(0x6fdb3c22, 0x61bcd5fd)}}, + {{TOBN(0x30d014a6, 0xf2f9f8e9), TOBN(0x963ece23, 0x4fec49d2), + TOBN(0x862025c5, 0x9605a8d9), TOBN(0x39874445, 0x19f8929a)}, + {TOBN(0x01b6ff65, 0x12bf476a), TOBN(0x598a64d8, 0x09cf7d91), + TOBN(0xd7ec7749, 0x93be56ca), TOBN(0x10899785, 0xcbb33615)}}, + {{TOBN(0xb8a092fd, 0x02eee3ad), TOBN(0xa86b3d35, 0x30145270), + TOBN(0x323d98c6, 0x8512b675), TOBN(0x4b8bc785, 0x62ebb40f)}, + {TOBN(0x7d301f54, 0x413f9cde), TOBN(0xa5e4fb4f, 0x2bab5664), + TOBN(0x1d2b252d, 0x1cbfec23), TOBN(0xfcd576bb, 0xe177120d)}}, + {{TOBN(0x04427d3e, 0x83731a34), TOBN(0x2bb9028e, 0xed836e8e), + TOBN(0xb36acff8, 0xb612ca7c), TOBN(0xb88fe5ef, 0xd3d9c73a)}, + {TOBN(0xbe2a6bc6, 0xedea4eb3), TOBN(0x43b93133, 0x488eec77), + TOBN(0xf41ff566, 0xb17106e1), TOBN(0x469e9172, 0x654efa32)}}, + {{TOBN(0xb4480f04, 0x41c23fa3), TOBN(0xb4712eb0, 0xc1989a2e), + TOBN(0x3ccbba0f, 0x93a29ca7), TOBN(0x6e205c14, 0xd619428c)}, + {TOBN(0x90db7957, 0xb3641686), TOBN(0x0432691d, 0x45ac8b4e), + TOBN(0x07a759ac, 0xf64e0350), TOBN(0x0514d89c, 0x9c972517)}}, + {{TOBN(0x1701147f, 0xa8e67fc3), TOBN(0x9e2e0b8b, 0xab2085be), + TOBN(0xd5651824, 0xac284e57), TOBN(0x890d4325, 0x74893664)}, + {TOBN(0x8a7c5e6e, 0xc55e68a3), TOBN(0xbf12e90b, 0x4339c85a), + TOBN(0x31846b85, 0xf922b655), TOBN(0x9a54ce4d, 0x0bf4d700)}}, + {{TOBN(0xd7f4e83a, 0xf1a14295), TOBN(0x916f955c, 0xb285d4f9), + TOBN(0xe57bb0e0, 0x99ffdaba), TOBN(0x28a43034, 0xeab0d152)}, + {TOBN(0x0a36ffa2, 0xb8a9cef8), TOBN(0x5517407e, 0xb9ec051a), + TOBN(0x9c796096, 0xea68e672), TOBN(0x853db5fb, 0xfb3c77fb)}}, + {{TOBN(0x21474ba9, 0xe864a51a), TOBN(0x6c267699, 0x6e8a1b8b), + TOBN(0x7c823626, 0x94120a28), TOBN(0xe61e9a48, 0x8383a5db)}, + {TOBN(0x7dd75003, 0x9f84216d), TOBN(0xab020d07, 0xad43cd85), + TOBN(0x9437ae48, 0xda12c659), TOBN(0x6449c2eb, 0xe65452ad)}}, + {{TOBN(0xcc7c4c1c, 0x2cf9d7c1), 
TOBN(0x1320886a, 0xee95e5ab), + TOBN(0xbb7b9056, 0xbeae170c), TOBN(0xc8a5b250, 0xdbc0d662)}, + {TOBN(0x4ed81432, 0xc11d2303), TOBN(0x7da66912, 0x1f03769f), + TOBN(0x3ac7a5fd, 0x84539828), TOBN(0x14dada94, 0x3bccdd02)}}, + {{TOBN(0x8b84c321, 0x7ef6b0d1), TOBN(0x52a9477a, 0x7c933f22), + TOBN(0x5ef6728a, 0xfd440b82), TOBN(0x5c3bd859, 0x6ce4bd5e)}, + {TOBN(0x918b80f5, 0xf22c2d3e), TOBN(0x368d5040, 0xb7bb6cc5), + TOBN(0xb66142a1, 0x2695a11c), TOBN(0x60ac583a, 0xeb19ea70)}}, + {{TOBN(0x317cbb98, 0x0eab2437), TOBN(0x8cc08c55, 0x5e2654c8), + TOBN(0xfe2d6520, 0xe6d8307f), TOBN(0xe9f147f3, 0x57428993)}, + {TOBN(0x5f9c7d14, 0xd2fd6cf1), TOBN(0xa3ecd064, 0x2d4fcbb0), + TOBN(0xad83fef0, 0x8e7341f7), TOBN(0x643f23a0, 0x3a63115c)}}, + {{TOBN(0xd38a78ab, 0xe65ab743), TOBN(0xbf7c75b1, 0x35edc89c), + TOBN(0x3dd8752e, 0x530df568), TOBN(0xf85c4a76, 0xe308c682)}, + {TOBN(0x4c9955b2, 0xe68acf37), TOBN(0xa544df3d, 0xab32af85), + TOBN(0x4b8ec3f5, 0xa25cf493), TOBN(0x4d8f2764, 0x1a622feb)}}, + {{TOBN(0x7bb4f7aa, 0xf0dcbc49), TOBN(0x7de551f9, 0x70bbb45b), + TOBN(0xcfd0f3e4, 0x9f2ca2e5), TOBN(0xece58709, 0x1f5c76ef)}, + {TOBN(0x32920edd, 0x167d79ae), TOBN(0x039df8a2, 0xfa7d7ec1), + TOBN(0xf46206c0, 0xbb30af91), TOBN(0x1ff5e2f5, 0x22676b59)}}, + {{TOBN(0x11f4a039, 0x6ea51d66), TOBN(0x506c1445, 0x807d7a26), + TOBN(0x60da5705, 0x755a9b24), TOBN(0x8fc8cc32, 0x1f1a319e)}, + {TOBN(0x83642d4d, 0x9433d67d), TOBN(0x7fa5cb8f, 0x6a7dd296), + TOBN(0x576591db, 0x9b7bde07), TOBN(0x13173d25, 0x419716fb)}}, + {{TOBN(0xea30599d, 0xd5b340ff), TOBN(0xfc6b5297, 0xb0fe76c5), + TOBN(0x1c6968c8, 0xab8f5adc), TOBN(0xf723c7f5, 0x901c928d)}, + {TOBN(0x4203c321, 0x9773d402), TOBN(0xdf7c6aa3, 0x1b51dd47), + TOBN(0x3d49e37a, 0x552be23c), TOBN(0x57febee8, 0x0b5a6e87)}}, + {{TOBN(0xc5ecbee4, 0x7bd8e739), TOBN(0x79d44994, 0xae63bf75), + TOBN(0x168bd00f, 0x38fb8923), TOBN(0x75d48ee4, 0xd0533130)}, + {TOBN(0x554f77aa, 0xdb5cdf33), TOBN(0x3396e896, 0x3c696769), + TOBN(0x2fdddbf2, 0xd3fd674e), TOBN(0xbbb8f6ee, 0x99d0e3e5)}}, + {{TOBN(0x51b90651, 0xcbae2f70), TOBN(0xefc4bc05, 0x93aaa8eb), + TOBN(0x8ecd8689, 0xdd1df499), TOBN(0x1aee99a8, 0x22f367a5)}, + {TOBN(0x95d485b9, 0xae8274c5), TOBN(0x6c14d445, 0x7d30b39c), + TOBN(0xbafea90b, 0xbcc1ef81), TOBN(0x7c5f317a, 0xa459a2ed)}}, + {{TOBN(0x01211075, 0x4ef44227), TOBN(0xa17bed6e, 0xdc20f496), + TOBN(0x0cdfe424, 0x819853cd), TOBN(0x13793298, 0xf71e2ce7)}, + {TOBN(0x3c1f3078, 0xdbbe307b), TOBN(0x6dd1c20e, 0x76ee9936), + TOBN(0x23ee4b57, 0x423caa20), TOBN(0x4ac3793b, 0x8efb840e)}}, + {{TOBN(0x934438eb, 0xed1f8ca0), TOBN(0x3e546658, 0x4ebb25a2), + TOBN(0xc415af0e, 0xc069896f), TOBN(0xc13eddb0, 0x9a5aa43d)}, + {TOBN(0x7a04204f, 0xd49eb8f6), TOBN(0xd0d5bdfc, 0xd74f1670), + TOBN(0x3697e286, 0x56fc0558), TOBN(0x10207371, 0x01cebade)}}, + {{TOBN(0x5f87e690, 0x0647a82b), TOBN(0x908e0ed4, 0x8f40054f), + TOBN(0xa9f633d4, 0x79853803), TOBN(0x8ed13c9a, 0x4a28b252)}, + {TOBN(0x3e2ef676, 0x1f460f64), TOBN(0x53930b9b, 0x36d06336), + TOBN(0x347073ac, 0x8fc4979b), TOBN(0x84380e0e, 0x5ecd5597)}}, + {{TOBN(0xe3b22c6b, 0xc4fe3c39), TOBN(0xba4a8153, 0x6c7bebdf), + TOBN(0xf23ab6b7, 0x25693459), TOBN(0x53bc3770, 0x14922b11)}, + {TOBN(0x4645c8ab, 0x5afc60db), TOBN(0xaa022355, 0x20b9f2a3), + TOBN(0x52a2954c, 0xce0fc507), TOBN(0x8c2731bb, 0x7ce1c2e7)}}, + {{TOBN(0xf39608ab, 0x18a0339d), TOBN(0xac7a658d, 0x3735436c), + TOBN(0xb22c2b07, 0xcd992b4f), TOBN(0x4e83daec, 0xf40dcfd4)}, + {TOBN(0x8a34c7be, 0x2f39ea3e), TOBN(0xef0c005f, 0xb0a56d2e), + TOBN(0x62731f6a, 0x6edd8038), TOBN(0x5721d740, 0x4e3cb075)}}, + {{TOBN(0x1ea41511, 
0xfbeeee1b), TOBN(0xd1ef5e73, 0xef1d0c05), + TOBN(0x42feefd1, 0x73c07d35), TOBN(0xe530a00a, 0x8a329493)}, + {TOBN(0x5d55b7fe, 0xf15ebfb0), TOBN(0x549de03c, 0xd322491a), + TOBN(0xf7b5f602, 0x745b3237), TOBN(0x3632a3a2, 0x1ab6e2b6)}}, + {{TOBN(0x0d3bba89, 0x0ef59f78), TOBN(0x0dfc6443, 0xc9e52b9a), + TOBN(0x1dc79699, 0x72631447), TOBN(0xef033917, 0xb3be20b1)}, + {TOBN(0x0c92735d, 0xb1383948), TOBN(0xc1fc29a2, 0xc0dd7d7d), + TOBN(0x6485b697, 0x403ed068), TOBN(0x13bfaab3, 0xaac93bdc)}}, + {{TOBN(0x410dc6a9, 0x0deeaf52), TOBN(0xb003fb02, 0x4c641c15), + TOBN(0x1384978c, 0x5bc504c4), TOBN(0x37640487, 0x864a6a77)}, + {TOBN(0x05991bc6, 0x222a77da), TOBN(0x62260a57, 0x5e47eb11), + TOBN(0xc7af6613, 0xf21b432c), TOBN(0x22f3acc9, 0xab4953e9)}}, + {{TOBN(0x52934922, 0x8e41d155), TOBN(0x4d024568, 0x3ac059ef), + TOBN(0xb0201755, 0x4d884411), TOBN(0xce8055cf, 0xa59a178f)}, + {TOBN(0xcd77d1af, 0xf6204549), TOBN(0xa0a00a3e, 0xc7066759), + TOBN(0x471071ef, 0x0272c229), TOBN(0x009bcf6b, 0xd3c4b6b0)}}, + {{TOBN(0x2a2638a8, 0x22305177), TOBN(0xd51d59df, 0x41645bbf), + TOBN(0xa81142fd, 0xc0a7a3c0), TOBN(0xa17eca6d, 0x4c7063ee)}, + {TOBN(0x0bb887ed, 0x60d9dcec), TOBN(0xd6d28e51, 0x20ad2455), + TOBN(0xebed6308, 0xa67102ba), TOBN(0x042c3114, 0x8bffa408)}}, + {{TOBN(0xfd099ac5, 0x8aa68e30), TOBN(0x7a6a3d7c, 0x1483513e), + TOBN(0xffcc6b75, 0xba2d8f0c), TOBN(0x54dacf96, 0x1e78b954)}, + {TOBN(0xf645696f, 0xa4a9af89), TOBN(0x3a411940, 0x06ac98ec), + TOBN(0x41b8b3f6, 0x22a67a20), TOBN(0x2d0b1e0f, 0x99dec626)}}, + {{TOBN(0x27c89192, 0x40be34e8), TOBN(0xc7162b37, 0x91907f35), + TOBN(0x90188ec1, 0xa956702b), TOBN(0xca132f7d, 0xdf93769c)}, + {TOBN(0x3ece44f9, 0x0e2025b4), TOBN(0x67aaec69, 0x0c62f14c), + TOBN(0xad741418, 0x22e3cc11), TOBN(0xcf9b75c3, 0x7ff9a50e)}}, + {{TOBN(0x02fa2b16, 0x4d348272), TOBN(0xbd99d61a, 0x9959d56d), + TOBN(0xbc4f19db, 0x18762916), TOBN(0xcc7cce50, 0x49c1ac80)}, + {TOBN(0x4d59ebaa, 0xd846bd83), TOBN(0x8775a9dc, 0xa9202849), + TOBN(0x07ec4ae1, 0x6e1f4ca9), TOBN(0x27eb5875, 0xba893f11)}}, + {{TOBN(0x00284d51, 0x662cc565), TOBN(0x82353a6b, 0x0db4138d), + TOBN(0xd9c7aaaa, 0xaa32a594), TOBN(0xf5528b5e, 0xa5669c47)}, + {TOBN(0xf3220231, 0x2f23c5ff), TOBN(0xe3e8147a, 0x6affa3a1), + TOBN(0xfb423d5c, 0x202ddda0), TOBN(0x3d6414ac, 0x6b871bd4)}}, + {{TOBN(0x586f82e1, 0xa51a168a), TOBN(0xb712c671, 0x48ae5448), + TOBN(0x9a2e4bd1, 0x76233eb8), TOBN(0x0188223a, 0x78811ca9)}, + {TOBN(0x553c5e21, 0xf7c18de1), TOBN(0x7682e451, 0xb27bb286), + TOBN(0x3ed036b3, 0x0e51e929), TOBN(0xf487211b, 0xec9cb34f)}}, + {{TOBN(0x0d094277, 0x0c24efc8), TOBN(0x0349fd04, 0xbef737a4), + TOBN(0x6d1c9dd2, 0x514cdd28), TOBN(0x29c135ff, 0x30da9521)}, + {TOBN(0xea6e4508, 0xf78b0b6f), TOBN(0x176f5dd2, 0x678c143c), + TOBN(0x08148418, 0x4be21e65), TOBN(0x27f7525c, 0xe7df38c4)}}, + {{TOBN(0x1fb70e09, 0x748ab1a4), TOBN(0x9cba50a0, 0x5efe4433), + TOBN(0x7846c7a6, 0x15f75af2), TOBN(0x2a7c2c57, 0x5ee73ea8)}, + {TOBN(0x42e566a4, 0x3f0a449a), TOBN(0x45474c3b, 0xad90fc3d), + TOBN(0x7447be3d, 0x8b61d057), TOBN(0x3e9d1cf1, 0x3a4ec092)}}, + {{TOBN(0x1603e453, 0xf380a6e6), TOBN(0x0b86e431, 0x9b1437c2), + TOBN(0x7a4173f2, 0xef29610a), TOBN(0x8fa729a7, 0xf03d57f7)}, + {TOBN(0x3e186f6e, 0x6c9c217e), TOBN(0xbe1d3079, 0x91919524), + TOBN(0x92a62a70, 0x153d4fb1), TOBN(0x32ed3e34, 0xd68c2f71)}}, + {{TOBN(0xd785027f, 0x9eb1a8b7), TOBN(0xbc37eb77, 0xc5b22fe8), + TOBN(0x466b34f0, 0xb9d6a191), TOBN(0x008a89af, 0x9a05f816)}, + {TOBN(0x19b028fb, 0x7d42c10a), TOBN(0x7fe8c92f, 0x49b3f6b8), + TOBN(0x58907cc0, 0xa5a0ade3), TOBN(0xb3154f51, 0x559d1a7c)}}, + 
{{TOBN(0x5066efb6, 0xd9790ed6), TOBN(0xa77a0cbc, 0xa6aa793b), + TOBN(0x1a915f3c, 0x223e042e), TOBN(0x1c5def04, 0x69c5874b)}, + {TOBN(0x0e830078, 0x73b6c1da), TOBN(0x55cf85d2, 0xfcd8557a), + TOBN(0x0f7c7c76, 0x0460f3b1), TOBN(0x87052acb, 0x46e58063)}}, + {{TOBN(0x09212b80, 0x907eae66), TOBN(0x3cb068e0, 0x4d721c89), + TOBN(0xa87941ae, 0xdd45ac1c), TOBN(0xde8d5c0d, 0x0daa0dbb)}, + {TOBN(0xda421fdc, 0xe3502e6e), TOBN(0xc8944201, 0x4d89a084), + TOBN(0x7307ba5e, 0xf0c24bfb), TOBN(0xda212beb, 0x20bde0ef)}}, + {{TOBN(0xea2da24b, 0xf82ce682), TOBN(0x058d3816, 0x07f71fe4), + TOBN(0x35a02462, 0x5ffad8de), TOBN(0xcd7b05dc, 0xaadcefab)}, + {TOBN(0xd442f8ed, 0x1d9f54ec), TOBN(0x8be3d618, 0xb2d3b5ca), + TOBN(0xe2220ed0, 0xe06b2ce2), TOBN(0x82699a5f, 0x1b0da4c0)}}, + {{TOBN(0x3ff106f5, 0x71c0c3a7), TOBN(0x8f580f5a, 0x0d34180c), + TOBN(0x4ebb120e, 0x22d7d375), TOBN(0x5e5782cc, 0xe9513675)}, + {TOBN(0x2275580c, 0x99c82a70), TOBN(0xe8359fbf, 0x15ea8c4c), + TOBN(0x53b48db8, 0x7b415e70), TOBN(0xaacf2240, 0x100c6014)}}, + {{TOBN(0x9faaccf5, 0xe4652f1d), TOBN(0xbd6fdd2a, 0xd56157b2), + TOBN(0xa4f4fb1f, 0x6261ec50), TOBN(0x244e55ad, 0x476bcd52)}, + {TOBN(0x881c9305, 0x047d320b), TOBN(0x1ca983d5, 0x6181263f), + TOBN(0x354e9a44, 0x278fb8ee), TOBN(0xad2dbc0f, 0x396e4964)}}, + {{TOBN(0x723f3aa2, 0x9268b3de), TOBN(0x0d1ca29a, 0xe6e0609a), + TOBN(0x794866aa, 0x6cf44252), TOBN(0x0b59f3e3, 0x01af87ed)}, + {TOBN(0xe234e5ff, 0x7f4a6c51), TOBN(0xa8768fd2, 0x61dc2f7e), + TOBN(0xdafc7332, 0x0a94d81f), TOBN(0xd7f84282, 0x06938ce1)}}, + {{TOBN(0xae0b3c0e, 0x0546063e), TOBN(0x7fbadcb2, 0x5d61abc6), + TOBN(0xd5d7a2c9, 0x369ac400), TOBN(0xa5978d09, 0xae67d10c)}, + {TOBN(0x290f211e, 0x4f85eaac), TOBN(0xe61e2ad1, 0xfacac681), + TOBN(0xae125225, 0x388384cd), TOBN(0xa7fb68e9, 0xccfde30f)}}, + {{TOBN(0x7a59b936, 0x3daed4c2), TOBN(0x80a9aa40, 0x2606f789), + TOBN(0xb40c1ea5, 0xf6a6d90a), TOBN(0x948364d3, 0x514d5885)}, + {TOBN(0x062ebc60, 0x70985182), TOBN(0xa6db5b0e, 0x33310895), + TOBN(0x64a12175, 0xe329c2f5), TOBN(0xc5f25bd2, 0x90ea237e)}}, + {{TOBN(0x7915c524, 0x2d0a4c23), TOBN(0xeb5d26e4, 0x6bb3cc52), + TOBN(0x369a9116, 0xc09e2c92), TOBN(0x0c527f92, 0xcf182cf8)}, + {TOBN(0x9e591938, 0x2aede0ac), TOBN(0xb2922208, 0x6cc34939), + TOBN(0x3c9d8962, 0x99a34361), TOBN(0x3c81836d, 0xc1905fe6)}}, + {{TOBN(0x4bfeb57f, 0xa001ec5a), TOBN(0xe993f5bb, 0xa0dc5dba), + TOBN(0x47884109, 0x724a1380), TOBN(0x8a0369ab, 0x32fe9a04)}, + {TOBN(0xea068d60, 0x8c927db8), TOBN(0xbf5f37cf, 0x94655741), + TOBN(0x47d402a2, 0x04b6c7ea), TOBN(0x4551c295, 0x6af259cb)}}, + {{TOBN(0x698b71e7, 0xed77ee8b), TOBN(0xbddf7bd0, 0xf309d5c7), + TOBN(0x6201c22c, 0x34e780ca), TOBN(0xab04f7d8, 0x4c295ef4)}, + {TOBN(0x1c947294, 0x4313a8ce), TOBN(0xe532e4ac, 0x92ca4cfe), + TOBN(0x89738f80, 0xd0a7a97a), TOBN(0xec088c88, 0xa580fd5b)}}, + {{TOBN(0x612b1ecc, 0x42ce9e51), TOBN(0x8f9840fd, 0xb25fdd2a), + TOBN(0x3cda78c0, 0x01e7f839), TOBN(0x546b3d3a, 0xece05480)}, + {TOBN(0x271719a9, 0x80d30916), TOBN(0x45497107, 0x584c20c4), + TOBN(0xaf8f9478, 0x5bc78608), TOBN(0x28c7d484, 0x277e2a4c)}}, + {{TOBN(0xfce01767, 0x88a2ffe4), TOBN(0xdc506a35, 0x28e169a5), + TOBN(0x0ea10861, 0x7af9c93a), TOBN(0x1ed24361, 0x03fa0e08)}, + {TOBN(0x96eaaa92, 0xa3d694e7), TOBN(0xc0f43b4d, 0xef50bc74), + TOBN(0xce6aa58c, 0x64114db4), TOBN(0x8218e8ea, 0x7c000fd4)}}, + {{TOBN(0xac815dfb, 0x185f8844), TOBN(0xcd7e90cb, 0x1557abfb), + TOBN(0x23d16655, 0xafbfecdf), TOBN(0x80f3271f, 0x085cac4a)}, + {TOBN(0x7fc39aa7, 0xd0e62f47), TOBN(0x88d519d1, 0x460a48e5), + TOBN(0x59559ac4, 0xd28f101e), TOBN(0x7981d9e9, 0xca9ae816)}}, 
+ {{TOBN(0x5c38652c, 0x9ac38203), TOBN(0x86eaf87f, 0x57657fe5), + TOBN(0x568fc472, 0xe21f5416), TOBN(0x2afff39c, 0xe7e597b5)}, + {TOBN(0x3adbbb07, 0x256d4eab), TOBN(0x22598692, 0x8285ab89), + TOBN(0x35f8112a, 0x041caefe), TOBN(0x95df02e3, 0xa5064c8b)}}, + {{TOBN(0x4d63356e, 0xc7004bf3), TOBN(0x230a08f4, 0xdb83c7de), + TOBN(0xca27b270, 0x8709a7b7), TOBN(0x0d1c4cc4, 0xcb9abd2d)}, + {TOBN(0x8a0bc66e, 0x7550fee8), TOBN(0x369cd4c7, 0x9cf7247e), + TOBN(0x75562e84, 0x92b5b7e7), TOBN(0x8fed0da0, 0x5802af7b)}}, + {{TOBN(0x6a7091c2, 0xe48fb889), TOBN(0x26882c13, 0x7b8a9d06), + TOBN(0xa2498663, 0x1b82a0e2), TOBN(0x844ed736, 0x3518152d)}, + {TOBN(0x282f476f, 0xd86e27c7), TOBN(0xa04edaca, 0x04afefdc), + TOBN(0x8b256ebc, 0x6119e34d), TOBN(0x56a413e9, 0x0787d78b)}}}, + {{{TOBN(0x82ee061d, 0x5a74be50), TOBN(0xe41781c4, 0xdea16ff5), + TOBN(0xe0b0c81e, 0x99bfc8a2), TOBN(0x624f4d69, 0x0b547e2d)}, + {TOBN(0x3a83545d, 0xbdcc9ae4), TOBN(0x2573dbb6, 0x409b1e8e), + TOBN(0x482960c4, 0xa6c93539), TOBN(0xf01059ad, 0x5ae18798)}}, + {{TOBN(0x715c9f97, 0x3112795f), TOBN(0xe8244437, 0x984e6ee1), + TOBN(0x55cb4858, 0xecb66bcd), TOBN(0x7c136735, 0xabaffbee)}, + {TOBN(0x54661595, 0x5dbec38e), TOBN(0x51c0782c, 0x388ad153), + TOBN(0x9ba4c53a, 0xc6e0952f), TOBN(0x27e6782a, 0x1b21dfa8)}}, + {{TOBN(0x682f903d, 0x4ed2dbc2), TOBN(0x0eba59c8, 0x7c3b2d83), + TOBN(0x8e9dc84d, 0x9c7e9335), TOBN(0x5f9b21b0, 0x0eb226d7)}, + {TOBN(0xe33bd394, 0xaf267bae), TOBN(0xaa86cc25, 0xbe2e15ae), + TOBN(0x4f0bf67d, 0x6a8ec500), TOBN(0x5846aa44, 0xf9630658)}}, + {{TOBN(0xfeb09740, 0xe2c2bf15), TOBN(0x627a2205, 0xa9e99704), + TOBN(0xec8d73d0, 0xc2fbc565), TOBN(0x223eed8f, 0xc20c8de8)}, + {TOBN(0x1ee32583, 0xa8363b49), TOBN(0x1a0b6cb9, 0xc9c2b0a6), + TOBN(0x49f7c3d2, 0x90dbc85c), TOBN(0xa8dfbb97, 0x1ef4c1ac)}}, + {{TOBN(0xafb34d4c, 0x65c7c2ab), TOBN(0x1d4610e7, 0xe2c5ea84), + TOBN(0x893f6d1b, 0x973c4ab5), TOBN(0xa3cdd7e9, 0x945ba5c4)}, + {TOBN(0x60514983, 0x064417ee), TOBN(0x1459b23c, 0xad6bdf2b), + TOBN(0x23b2c341, 0x5cf726c3), TOBN(0x3a829635, 0x32d6354a)}}, + {{TOBN(0x294f901f, 0xab192c18), TOBN(0xec5fcbfe, 0x7030164f), + TOBN(0xe2e2fcb7, 0xe2246ba6), TOBN(0x1e7c88b3, 0x221a1a0c)}, + {TOBN(0x72c7dd93, 0xc92d88c5), TOBN(0x41c2148e, 0x1106fb59), + TOBN(0x547dd4f5, 0xa0f60f14), TOBN(0xed9b52b2, 0x63960f31)}}, + {{TOBN(0x6c8349eb, 0xb0a5b358), TOBN(0xb154c5c2, 0x9e7e2ed6), + TOBN(0xcad5eccf, 0xeda462db), TOBN(0xf2d6dbe4, 0x2de66b69)}, + {TOBN(0x426aedf3, 0x8665e5b2), TOBN(0x488a8513, 0x7b7f5723), + TOBN(0x15cc43b3, 0x8bcbb386), TOBN(0x27ad0af3, 0xd791d879)}}, + {{TOBN(0xc16c236e, 0x846e364f), TOBN(0x7f33527c, 0xdea50ca0), + TOBN(0xc4810775, 0x0926b86d), TOBN(0x6c2a3609, 0x0598e70c)}, + {TOBN(0xa6755e52, 0xf024e924), TOBN(0xe0fa07a4, 0x9db4afca), + TOBN(0x15c3ce7d, 0x66831790), TOBN(0x5b4ef350, 0xa6cbb0d6)}}, + {{TOBN(0x2c4aafc4, 0xb6205969), TOBN(0x42563f02, 0xf6c7854f), + TOBN(0x016aced5, 0x1d983b48), TOBN(0xfeb356d8, 0x99949755)}, + {TOBN(0x8c2a2c81, 0xd1a39bd7), TOBN(0x8f44340f, 0xe6934ae9), + TOBN(0x148cf91c, 0x447904da), TOBN(0x7340185f, 0x0f51a926)}}, + {{TOBN(0x2f8f00fb, 0x7409ab46), TOBN(0x057e78e6, 0x80e289b2), + TOBN(0x03e5022c, 0xa888e5d1), TOBN(0x3c87111a, 0x9dede4e2)}, + {TOBN(0x5b9b0e1c, 0x7809460b), TOBN(0xe751c852, 0x71c9abc7), + TOBN(0x8b944e28, 0xc7cc1dc9), TOBN(0x4f201ffa, 0x1d3cfa08)}}, + {{TOBN(0x02fc905c, 0x3e6721ce), TOBN(0xd52d70da, 0xd0b3674c), + TOBN(0x5dc2e5ca, 0x18810da4), TOBN(0xa984b273, 0x5c69dd99)}, + {TOBN(0x63b92527, 0x84de5ca4), TOBN(0x2f1c9872, 0xc852dec4), + TOBN(0x18b03593, 0xc2e3de09), TOBN(0x19d70b01, 
0x9813dc2f)}}, + {{TOBN(0x42806b2d, 0xa6dc1d29), TOBN(0xd3030009, 0xf871e144), + TOBN(0xa1feb333, 0xaaf49276), TOBN(0xb5583b9e, 0xc70bc04b)}, + {TOBN(0x1db0be78, 0x95695f20), TOBN(0xfc841811, 0x89d012b5), + TOBN(0x6409f272, 0x05f61643), TOBN(0x40d34174, 0xd5883128)}}, + {{TOBN(0xd79196f5, 0x67419833), TOBN(0x6059e252, 0x863b7b08), + TOBN(0x84da1817, 0x1c56700c), TOBN(0x5758ee56, 0xb28d3ec4)}, + {TOBN(0x7da2771d, 0x013b0ea6), TOBN(0xfddf524b, 0x54c5e9b9), + TOBN(0x7df4faf8, 0x24305d80), TOBN(0x58f5c1bf, 0x3a97763f)}}, + {{TOBN(0xa5af37f1, 0x7c696042), TOBN(0xd4cba22c, 0x4a2538de), + TOBN(0x211cb995, 0x9ea42600), TOBN(0xcd105f41, 0x7b069889)}, + {TOBN(0xb1e1cf19, 0xddb81e74), TOBN(0x472f2d89, 0x5157b8ca), + TOBN(0x086fb008, 0xee9db885), TOBN(0x365cd570, 0x0f26d131)}}, + {{TOBN(0x284b02bb, 0xa2be7053), TOBN(0xdcbbf7c6, 0x7ab9a6d6), + TOBN(0x4425559c, 0x20f7a530), TOBN(0x961f2dfa, 0x188767c8)}, + {TOBN(0xe2fd9435, 0x70dc80c4), TOBN(0x104d6b63, 0xf0784120), + TOBN(0x7f592bc1, 0x53567122), TOBN(0xf6bc1246, 0xf688ad77)}}, + {{TOBN(0x05214c05, 0x0f15dde9), TOBN(0xa47a76a8, 0x0d5f2b82), + TOBN(0xbb254d30, 0x62e82b62), TOBN(0x11a05fe0, 0x3ec955ee)}, + {TOBN(0x7eaff46e, 0x9d529b36), TOBN(0x55ab1301, 0x8f9e3df6), + TOBN(0xc463e371, 0x99317698), TOBN(0xfd251438, 0xccda47ad)}}, + {{TOBN(0xca9c3547, 0x23d695ea), TOBN(0x48ce626e, 0x16e589b5), + TOBN(0x6b5b64c7, 0xb187d086), TOBN(0xd02e1794, 0xb2207948)}, + {TOBN(0x8b58e98f, 0x7198111d), TOBN(0x90ca6305, 0xdcf9c3cc), + TOBN(0x5691fe72, 0xf34089b0), TOBN(0x60941af1, 0xfc7c80ff)}}, + {{TOBN(0xa09bc0a2, 0x22eb51e5), TOBN(0xc0bb7244, 0xaa9cf09a), + TOBN(0x36a8077f, 0x80159f06), TOBN(0x8b5c989e, 0xdddc560e)}, + {TOBN(0x19d2f316, 0x512e1f43), TOBN(0x02eac554, 0xad08ff62), + TOBN(0x012ab84c, 0x07d20b4e), TOBN(0x37d1e115, 0xd6d4e4e1)}}, + {{TOBN(0xb6443e1a, 0xab7b19a8), TOBN(0xf08d067e, 0xdef8cd45), + TOBN(0x63adf3e9, 0x685e03da), TOBN(0xcf15a10e, 0x4792b916)}, + {TOBN(0xf44bcce5, 0xb738a425), TOBN(0xebe131d5, 0x9636b2fd), + TOBN(0x94068841, 0x7850d605), TOBN(0x09684eaa, 0xb40d749d)}}, + {{TOBN(0x8c3c669c, 0x72ba075b), TOBN(0x89f78b55, 0xba469015), + TOBN(0x5706aade, 0x3e9f8ba8), TOBN(0x6d8bd565, 0xb32d7ed7)}, + {TOBN(0x25f4e63b, 0x805f08d6), TOBN(0x7f48200d, 0xc3bcc1b5), + TOBN(0x4e801968, 0xb025d847), TOBN(0x74afac04, 0x87cbe0a8)}}, + {{TOBN(0x43ed2c2b, 0x7e63d690), TOBN(0xefb6bbf0, 0x0223cdb8), + TOBN(0x4fec3cae, 0x2884d3fe), TOBN(0x065ecce6, 0xd75e25a4)}, + {TOBN(0x6c2294ce, 0x69f79071), TOBN(0x0d9a8e5f, 0x044b8666), + TOBN(0x5009f238, 0x17b69d8f), TOBN(0x3c29f8fe, 0xc5dfdaf7)}}, + {{TOBN(0x9067528f, 0xebae68c4), TOBN(0x5b385632, 0x30c5ba21), + TOBN(0x540df119, 0x1fdd1aec), TOBN(0xcf37825b, 0xcfba4c78)}, + {TOBN(0x77eff980, 0xbeb11454), TOBN(0x40a1a991, 0x60c1b066), + TOBN(0xe8018980, 0xf889a1c7), TOBN(0xb9c52ae9, 0x76c24be0)}}, + {{TOBN(0x05fbbcce, 0x45650ef4), TOBN(0xae000f10, 0x8aa29ac7), + TOBN(0x884b7172, 0x4f04c470), TOBN(0x7cd4fde2, 0x19bb5c25)}, + {TOBN(0x6477b22a, 0xe8840869), TOBN(0xa8868859, 0x5fbd0686), + TOBN(0xf23cc02e, 0x1116dfba), TOBN(0x76cd563f, 0xd87d7776)}}, + {{TOBN(0xe2a37598, 0xa9d82abf), TOBN(0x5f188ccb, 0xe6c170f5), + TOBN(0x81682200, 0x5066b087), TOBN(0xda22c212, 0xc7155ada)}, + {TOBN(0x151e5d3a, 0xfbddb479), TOBN(0x4b606b84, 0x6d715b99), + TOBN(0x4a73b54b, 0xf997cb2e), TOBN(0x9a1bfe43, 0x3ecd8b66)}}, + {{TOBN(0x1c312809, 0x2a67d48a), TOBN(0xcd6a671e, 0x031fa9e2), + TOBN(0xbec3312a, 0x0e43a34a), TOBN(0x1d935639, 0x55ef47d3)}, + {TOBN(0x5ea02489, 0x8fea73ea), TOBN(0x8247b364, 0xa035afb2), + TOBN(0xb58300a6, 0x5265b54c), 
TOBN(0x3286662f, 0x722c7148)}}, + {{TOBN(0xb77fd76b, 0xb4ec4c20), TOBN(0xf0a12fa7, 0x0f3fe3fd), + TOBN(0xf845bbf5, 0x41d8c7e8), TOBN(0xe4d969ca, 0x5ec10aa8)}, + {TOBN(0x4c0053b7, 0x43e232a3), TOBN(0xdc7a3fac, 0x37f8a45a), + TOBN(0x3c4261c5, 0x20d81c8f), TOBN(0xfd4b3453, 0xb00eab00)}}, + {{TOBN(0x76d48f86, 0xd36e3062), TOBN(0x626c5277, 0xa143ff02), + TOBN(0x538174de, 0xaf76f42e), TOBN(0x2267aa86, 0x6407ceac)}, + {TOBN(0xfad76351, 0x72e572d5), TOBN(0xab861af7, 0xba7330eb), + TOBN(0xa0a1c8c7, 0x418d8657), TOBN(0x988821cb, 0x20289a52)}}, + {{TOBN(0x79732522, 0xcccc18ad), TOBN(0xaadf3f8d, 0xf1a6e027), + TOBN(0xf7382c93, 0x17c2354d), TOBN(0x5ce1680c, 0xd818b689)}, + {TOBN(0x359ebbfc, 0xd9ecbee9), TOBN(0x4330689c, 0x1cae62ac), + TOBN(0xb55ce5b4, 0xc51ac38a), TOBN(0x7921dfea, 0xfe238ee8)}}, + {{TOBN(0x3972bef8, 0x271d1ca5), TOBN(0x3e423bc7, 0xe8aabd18), + TOBN(0x57b09f3f, 0x44a3e5e3), TOBN(0x5da886ae, 0x7b444d66)}, + {TOBN(0x68206634, 0xa9964375), TOBN(0x356a2fa3, 0x699cd0ff), + TOBN(0xaf0faa24, 0xdba515e9), TOBN(0x536e1f5c, 0xb321d79a)}}, + {{TOBN(0xd3b9913a, 0x5c04e4ea), TOBN(0xd549dcfe, 0xd6f11513), + TOBN(0xee227bf5, 0x79fd1d94), TOBN(0x9f35afee, 0xb43f2c67)}, + {TOBN(0xd2638d24, 0xf1314f53), TOBN(0x62baf948, 0xcabcd822), + TOBN(0x5542de29, 0x4ef48db0), TOBN(0xb3eb6a04, 0xfc5f6bb2)}}, + {{TOBN(0x23c110ae, 0x1208e16a), TOBN(0x1a4d15b5, 0xf8363e24), + TOBN(0x30716844, 0x164be00b), TOBN(0xa8e24824, 0xf6f4690d)}, + {TOBN(0x548773a2, 0x90b170cf), TOBN(0xa1bef331, 0x42f191f4), + TOBN(0x70f418d0, 0x9247aa97), TOBN(0xea06028e, 0x48be9147)}}, + {{TOBN(0xe13122f3, 0xdbfb894e), TOBN(0xbe9b79f6, 0xce274b18), + TOBN(0x85a49de5, 0xca58aadf), TOBN(0x24957758, 0x11487351)}, + {TOBN(0x111def61, 0xbb939099), TOBN(0x1d6a974a, 0x26d13694), + TOBN(0x4474b4ce, 0xd3fc253b), TOBN(0x3a1485e6, 0x4c5db15e)}}, + {{TOBN(0xe79667b4, 0x147c15b4), TOBN(0xe34f553b, 0x7bc61301), + TOBN(0x032b80f8, 0x17094381), TOBN(0x55d8bafd, 0x723eaa21)}, + {TOBN(0x5a987995, 0xf1c0e74e), TOBN(0x5a9b292e, 0xebba289c), + TOBN(0x413cd4b2, 0xeb4c8251), TOBN(0x98b5d243, 0xd162db0a)}}, + {{TOBN(0xbb47bf66, 0x68342520), TOBN(0x08d68949, 0xbaa862d1), + TOBN(0x11f349c7, 0xe906abcd), TOBN(0x454ce985, 0xed7bf00e)}, + {TOBN(0xacab5c9e, 0xb55b803b), TOBN(0xb03468ea, 0x31e3c16d), + TOBN(0x5c24213d, 0xd273bf12), TOBN(0x211538eb, 0x71587887)}}, + {{TOBN(0x198e4a2f, 0x731dea2d), TOBN(0xd5856cf2, 0x74ed7b2a), + TOBN(0x86a632eb, 0x13a664fe), TOBN(0x932cd909, 0xbda41291)}, + {TOBN(0x850e95d4, 0xc0c4ddc0), TOBN(0xc0f422f8, 0x347fc2c9), + TOBN(0xe68cbec4, 0x86076bcb), TOBN(0xf9e7c0c0, 0xcd6cd286)}}, + {{TOBN(0x65994ddb, 0x0f5f27ca), TOBN(0xe85461fb, 0xa80d59ff), + TOBN(0xff05481a, 0x66601023), TOBN(0xc665427a, 0xfc9ebbfb)}, + {TOBN(0xb0571a69, 0x7587fd52), TOBN(0x935289f8, 0x8d49efce), + TOBN(0x61becc60, 0xea420688), TOBN(0xb22639d9, 0x13a786af)}}, + {{TOBN(0x1a8e6220, 0x361ecf90), TOBN(0x001f23e0, 0x25506463), + TOBN(0xe4ae9b5d, 0x0a5c2b79), TOBN(0xebc9cdad, 0xd8149db5)}, + {TOBN(0xb33164a1, 0x934aa728), TOBN(0x750eb00e, 0xae9b60f3), + TOBN(0x5a91615b, 0x9b9cfbfd), TOBN(0x97015cbf, 0xef45f7f6)}}, + {{TOBN(0xb462c4a5, 0xbf5151df), TOBN(0x21adcc41, 0xb07118f2), + TOBN(0xd60c545b, 0x043fa42c), TOBN(0xfc21aa54, 0xe96be1ab)}, + {TOBN(0xe84bc32f, 0x4e51ea80), TOBN(0x3dae45f0, 0x259b5d8d), + TOBN(0xbb73c7eb, 0xc38f1b5e), TOBN(0xe405a74a, 0xe8ae617d)}}, + {{TOBN(0xbb1ae9c6, 0x9f1c56bd), TOBN(0x8c176b98, 0x49f196a4), + TOBN(0xc448f311, 0x6875092b), TOBN(0xb5afe3de, 0x9f976033)}, + {TOBN(0xa8dafd49, 0x145813e5), TOBN(0x687fc4d9, 0xe2b34226), + TOBN(0xf2dfc92d, 
0x4c7ff57f), TOBN(0x004e3fc1, 0x401f1b46)}}, + {{TOBN(0x5afddab6, 0x1430c9ab), TOBN(0x0bdd41d3, 0x2238e997), + TOBN(0xf0947430, 0x418042ae), TOBN(0x71f9adda, 0xcdddc4cb)}, + {TOBN(0x7090c016, 0xc52dd907), TOBN(0xd9bdf44d, 0x29e2047f), + TOBN(0xe6f1fe80, 0x1b1011a6), TOBN(0xb63accbc, 0xd9acdc78)}}, + {{TOBN(0xcfc7e235, 0x1272a95b), TOBN(0x0c667717, 0xa6276ac8), + TOBN(0x3c0d3709, 0xe2d7eef7), TOBN(0x5add2b06, 0x9a685b3e)}, + {TOBN(0x363ad32d, 0x14ea5d65), TOBN(0xf8e01f06, 0x8d7dd506), + TOBN(0xc9ea2213, 0x75b4aac6), TOBN(0xed2a2bf9, 0x0d353466)}}, + {{TOBN(0x439d79b5, 0xe9d3a7c3), TOBN(0x8e0ee5a6, 0x81b7f34b), + TOBN(0xcf3dacf5, 0x1dc4ba75), TOBN(0x1d3d1773, 0xeb3310c7)}, + {TOBN(0xa8e67112, 0x7747ae83), TOBN(0x31f43160, 0x197d6b40), + TOBN(0x0521ccee, 0xcd961400), TOBN(0x67246f11, 0xf6535768)}}, + {{TOBN(0x702fcc5a, 0xef0c3133), TOBN(0x247cc45d, 0x7e16693b), + TOBN(0xfd484e49, 0xc729b749), TOBN(0x522cef7d, 0xb218320f)}, + {TOBN(0xe56ef405, 0x59ab93b3), TOBN(0x225fba11, 0x9f181071), + TOBN(0x33bd6595, 0x15330ed0), TOBN(0xc4be69d5, 0x1ddb32f7)}}, + {{TOBN(0x264c7668, 0x0448087c), TOBN(0xac30903f, 0x71432dae), + TOBN(0x3851b266, 0x00f9bf47), TOBN(0x400ed311, 0x6cdd6d03)}, + {TOBN(0x045e79fe, 0xf8fd2424), TOBN(0xfdfd974a, 0xfa6da98b), + TOBN(0x45c9f641, 0x0c1e673a), TOBN(0x76f2e733, 0x5b2c5168)}}, + {{TOBN(0x1adaebb5, 0x2a601753), TOBN(0xb286514c, 0xc57c2d49), + TOBN(0xd8769670, 0x1e0bfd24), TOBN(0x950c547e, 0x04478922)}, + {TOBN(0xd1d41969, 0xe5d32bfe), TOBN(0x30bc1472, 0x750d6c3e), + TOBN(0x8f3679fe, 0xe0e27f3a), TOBN(0x8f64a7dc, 0xa4a6ee0c)}}, + {{TOBN(0x2fe59937, 0x633dfb1f), TOBN(0xea82c395, 0x977f2547), + TOBN(0xcbdfdf1a, 0x661ea646), TOBN(0xc7ccc591, 0xb9085451)}, + {TOBN(0x82177962, 0x81761e13), TOBN(0xda57596f, 0x9196885c), + TOBN(0xbc17e849, 0x28ffbd70), TOBN(0x1e6e0a41, 0x2671d36f)}}, + {{TOBN(0x61ae872c, 0x4152fcf5), TOBN(0x441c87b0, 0x9e77e754), + TOBN(0xd0799dd5, 0xa34dff09), TOBN(0x766b4e44, 0x88a6b171)}, + {TOBN(0xdc06a512, 0x11f1c792), TOBN(0xea02ae93, 0x4be35c3e), + TOBN(0xe5ca4d6d, 0xe90c469e), TOBN(0x4df4368e, 0x56e4ff5c)}}, + {{TOBN(0x7817acab, 0x4baef62e), TOBN(0x9f5a2202, 0xa85b91e8), + TOBN(0x9666ebe6, 0x6ce57610), TOBN(0x32ad31f3, 0xf73bfe03)}, + {TOBN(0x628330a4, 0x25bcf4d6), TOBN(0xea950593, 0x515056e6), + TOBN(0x59811c89, 0xe1332156), TOBN(0xc89cf1fe, 0x8c11b2d7)}}, + {{TOBN(0x75b63913, 0x04e60cc0), TOBN(0xce811e8d, 0x4625d375), + TOBN(0x030e43fc, 0x2d26e562), TOBN(0xfbb30b4b, 0x608d36a0)}, + {TOBN(0x634ff82c, 0x48528118), TOBN(0x7c6fe085, 0xcd285911), + TOBN(0x7f2830c0, 0x99358f28), TOBN(0x2e60a95e, 0x665e6c09)}}, + {{TOBN(0x08407d3d, 0x9b785dbf), TOBN(0x530889ab, 0xa759bce7), + TOBN(0xf228e0e6, 0x52f61239), TOBN(0x2b6d1461, 0x6879be3c)}, + {TOBN(0xe6902c04, 0x51a7bbf7), TOBN(0x30ad99f0, 0x76f24a64), + TOBN(0x66d9317a, 0x98bc6da0), TOBN(0xf4f877f3, 0xcb596ac0)}}, + {{TOBN(0xb05ff62d, 0x4c44f119), TOBN(0x4555f536, 0xe9b77416), + TOBN(0xc7c0d059, 0x8caed63b), TOBN(0x0cd2b7ce, 0xc358b2a9)}, + {TOBN(0x3f33287b, 0x46945fa3), TOBN(0xf8785b20, 0xd67c8791), + TOBN(0xc54a7a61, 0x9637bd08), TOBN(0x54d4598c, 0x18be79d7)}}, + {{TOBN(0x889e5acb, 0xc46d7ce1), TOBN(0x9a515bb7, 0x8b085877), + TOBN(0xfac1a03d, 0x0b7a5050), TOBN(0x7d3e738a, 0xf2926035)}, + {TOBN(0x861cc2ce, 0x2a6cb0eb), TOBN(0x6f2e2955, 0x8f7adc79), + TOBN(0x61c4d451, 0x33016376), TOBN(0xd9fd2c80, 0x5ad59090)}}, + {{TOBN(0xe5a83738, 0xb2b836a1), TOBN(0x855b41a0, 0x7c0d6622), + TOBN(0x186fe317, 0x7cc19af1), TOBN(0x6465c1ff, 0xfdd99acb)}, + {TOBN(0x46e5c23f, 0x6974b99e), TOBN(0x75a7cf8b, 0xa2717cbe), + 
TOBN(0x4d2ebc3f, 0x062be658), TOBN(0x094b4447, 0x5f209c98)}}, + {{TOBN(0x4af285ed, 0xb940cb5a), TOBN(0x6706d792, 0x7cc82f10), + TOBN(0xc8c8776c, 0x030526fa), TOBN(0xfa8e6f76, 0xa0da9140)}, + {TOBN(0x77ea9d34, 0x591ee4f0), TOBN(0x5f46e337, 0x40274166), + TOBN(0x1bdf98bb, 0xea671457), TOBN(0xd7c08b46, 0x862a1fe2)}}, + {{TOBN(0x46cc303c, 0x1c08ad63), TOBN(0x99543440, 0x4c845e7b), + TOBN(0x1b8fbdb5, 0x48f36bf7), TOBN(0x5b82c392, 0x8c8273a7)}, + {TOBN(0x08f712c4, 0x928435d5), TOBN(0x071cf0f1, 0x79330380), + TOBN(0xc74c2d24, 0xa8da054a), TOBN(0xcb0e7201, 0x43c46b5c)}}, + {{TOBN(0x0ad7337a, 0xc0b7eff3), TOBN(0x8552225e, 0xc5e48b3c), + TOBN(0xe6f78b0c, 0x73f13a5f), TOBN(0x5e70062e, 0x82349cbe)}, + {TOBN(0x6b8d5048, 0xe7073969), TOBN(0x392d2a29, 0xc33cb3d2), + TOBN(0xee4f727c, 0x4ecaa20f), TOBN(0xa068c99e, 0x2ccde707)}}, + {{TOBN(0xfcd5651f, 0xb87a2913), TOBN(0xea3e3c15, 0x3cc252f0), + TOBN(0x777d92df, 0x3b6cd3e4), TOBN(0x7a414143, 0xc5a732e7)}, + {TOBN(0xa895951a, 0xa71ff493), TOBN(0xfe980c92, 0xbbd37cf6), + TOBN(0x45bd5e64, 0xdecfeeff), TOBN(0x910dc2a9, 0xa44c43e9)}}, + {{TOBN(0xcb403f26, 0xcca9f54d), TOBN(0x928bbdfb, 0x9303f6db), + TOBN(0x3c37951e, 0xa9eee67c), TOBN(0x3bd61a52, 0xf79961c3)}, + {TOBN(0x09a238e6, 0x395c9a79), TOBN(0x6940ca2d, 0x61eb352d), + TOBN(0x7d1e5c5e, 0xc1875631), TOBN(0x1e19742c, 0x1e1b20d1)}}, + {{TOBN(0x4633d908, 0x23fc2e6e), TOBN(0xa76e29a9, 0x08959149), + TOBN(0x61069d9c, 0x84ed7da5), TOBN(0x0baa11cf, 0x5dbcad51)}, + {TOBN(0xd01eec64, 0x961849da), TOBN(0x93b75f1f, 0xaf3d8c28), + TOBN(0x57bc4f9f, 0x1ca2ee44), TOBN(0x5a26322d, 0x00e00558)}}, + {{TOBN(0x1888d658, 0x61a023ef), TOBN(0x1d72aab4, 0xb9e5246e), + TOBN(0xa9a26348, 0xe5563ec0), TOBN(0xa0971963, 0xc3439a43)}, + {TOBN(0x567dd54b, 0xadb9b5b7), TOBN(0x73fac1a1, 0xc45a524b), + TOBN(0x8fe97ef7, 0xfe38e608), TOBN(0x608748d2, 0x3f384f48)}}, + {{TOBN(0xb0571794, 0xc486094f), TOBN(0x869254a3, 0x8bf3a8d6), + TOBN(0x148a8dd1, 0x310b0e25), TOBN(0x99ab9f3f, 0x9aa3f7d8)}, + {TOBN(0x0927c68a, 0x6706c02e), TOBN(0x22b5e76c, 0x69790e6c), + TOBN(0x6c325260, 0x6c71376c), TOBN(0x53a57690, 0x09ef6657)}}, + {{TOBN(0x8d63f852, 0xedffcf3a), TOBN(0xb4d2ed04, 0x3c0a6f55), + TOBN(0xdb3aa8de, 0x12519b9e), TOBN(0x5d38e9c4, 0x1e0a569a)}, + {TOBN(0x871528bf, 0x303747e2), TOBN(0xa208e77c, 0xf5b5c18d), + TOBN(0x9d129c88, 0xca6bf923), TOBN(0xbcbf197f, 0xbf02839f)}}, + {{TOBN(0x9b9bf030, 0x27323194), TOBN(0x3b055a8b, 0x339ca59d), + TOBN(0xb46b2312, 0x0f669520), TOBN(0x19789f1f, 0x497e5f24)}, + {TOBN(0x9c499468, 0xaaf01801), TOBN(0x72ee1190, 0x8b69d59c), + TOBN(0x8bd39595, 0xacf4c079), TOBN(0x3ee11ece, 0x8e0cd048)}}, + {{TOBN(0xebde86ec, 0x1ed66f18), TOBN(0x225d906b, 0xd61fce43), + TOBN(0x5cab07d6, 0xe8bed74d), TOBN(0x16e4617f, 0x27855ab7)}, + {TOBN(0x6568aadd, 0xb2fbc3dd), TOBN(0xedb5484f, 0x8aeddf5b), + TOBN(0x878f20e8, 0x6dcf2fad), TOBN(0x3516497c, 0x615f5699)}}}, + {{{TOBN(0xef0a3fec, 0xfa181e69), TOBN(0x9ea02f81, 0x30d69a98), + TOBN(0xb2e9cf8e, 0x66eab95d), TOBN(0x520f2beb, 0x24720021)}, + {TOBN(0x621c540a, 0x1df84361), TOBN(0x12037721, 0x71fa6d5d), + TOBN(0x6e3c7b51, 0x0ff5f6ff), TOBN(0x817a069b, 0xabb2bef3)}}, + {{TOBN(0x83572fb6, 0xb294cda6), TOBN(0x6ce9bf75, 0xb9039f34), + TOBN(0x20e012f0, 0x095cbb21), TOBN(0xa0aecc1b, 0xd063f0da)}, + {TOBN(0x57c21c3a, 0xf02909e5), TOBN(0xc7d59ecf, 0x48ce9cdc), + TOBN(0x2732b844, 0x8ae336f8), TOBN(0x056e3723, 0x3f4f85f4)}}, + {{TOBN(0x8a10b531, 0x89e800ca), TOBN(0x50fe0c17, 0x145208fd), + TOBN(0x9e43c0d3, 0xb714ba37), TOBN(0x427d200e, 0x34189acc)}, + {TOBN(0x05dee24f, 0xe616e2c0), TOBN(0x9c25f4c8, 
0xee1854c1), + TOBN(0x4d3222a5, 0x8f342a73), TOBN(0x0807804f, 0xa027c952)}}, + {{TOBN(0xc222653a, 0x4f0d56f3), TOBN(0x961e4047, 0xca28b805), + TOBN(0x2c03f8b0, 0x4a73434b), TOBN(0x4c966787, 0xab712a19)}, + {TOBN(0xcc196c42, 0x864fee42), TOBN(0xc1be93da, 0x5b0ece5c), + TOBN(0xa87d9f22, 0xc131c159), TOBN(0x2bb6d593, 0xdce45655)}}, + {{TOBN(0x22c49ec9, 0xb809b7ce), TOBN(0x8a41486b, 0xe2c72c2c), + TOBN(0x813b9420, 0xfea0bf36), TOBN(0xb3d36ee9, 0xa66dac69)}, + {TOBN(0x6fddc08a, 0x328cc987), TOBN(0x0a3bcd2c, 0x3a326461), + TOBN(0x7103c49d, 0xd810dbba), TOBN(0xf9d81a28, 0x4b78a4c4)}}, + {{TOBN(0x3de865ad, 0xe4d55941), TOBN(0xdedafa5e, 0x30384087), + TOBN(0x6f414abb, 0x4ef18b9b), TOBN(0x9ee9ea42, 0xfaee5268)}, + {TOBN(0x260faa16, 0x37a55a4a), TOBN(0xeb19a514, 0x015f93b9), + TOBN(0x51d7ebd2, 0x9e9c3598), TOBN(0x523fc56d, 0x1932178e)}}, + {{TOBN(0x501d070c, 0xb98fe684), TOBN(0xd60fbe9a, 0x124a1458), + TOBN(0xa45761c8, 0x92bc6b3f), TOBN(0xf5384858, 0xfe6f27cb)}, + {TOBN(0x4b0271f7, 0xb59e763b), TOBN(0x3d4606a9, 0x5b5a8e5e), + TOBN(0x1eda5d9b, 0x05a48292), TOBN(0xda7731d0, 0xe6fec446)}}, + {{TOBN(0xa3e33693, 0x90d45871), TOBN(0xe9764040, 0x06166d8d), + TOBN(0xb5c33682, 0x89a90403), TOBN(0x4bd17983, 0x72f1d637)}, + {TOBN(0xa616679e, 0xd5d2c53a), TOBN(0x5ec4bcd8, 0xfdcf3b87), + TOBN(0xae6d7613, 0xb66a694e), TOBN(0x7460fc76, 0xe3fc27e5)}}, + {{TOBN(0x70469b82, 0x95caabee), TOBN(0xde024ca5, 0x889501e3), + TOBN(0x6bdadc06, 0x076ed265), TOBN(0x0cb1236b, 0x5a0ef8b2)}, + {TOBN(0x4065ddbf, 0x0972ebf9), TOBN(0xf1dd3875, 0x22aca432), + TOBN(0xa88b97cf, 0x744aff76), TOBN(0xd1359afd, 0xfe8e3d24)}}, + {{TOBN(0x52a3ba2b, 0x91502cf3), TOBN(0x2c3832a8, 0x084db75d), + TOBN(0x04a12ddd, 0xde30b1c9), TOBN(0x7802eabc, 0xe31fd60c)}, + {TOBN(0x33707327, 0xa37fddab), TOBN(0x65d6f2ab, 0xfaafa973), + TOBN(0x3525c5b8, 0x11e6f91a), TOBN(0x76aeb0c9, 0x5f46530b)}}, + {{TOBN(0xe8815ff6, 0x2f93a675), TOBN(0xa6ec9684, 0x05f48679), + TOBN(0x6dcbb556, 0x358ae884), TOBN(0x0af61472, 0xe19e3873)}, + {TOBN(0x72334372, 0xa5f696be), TOBN(0xc65e57ea, 0x6f22fb70), + TOBN(0x268da30c, 0x946cea90), TOBN(0x136a8a87, 0x65681b2a)}}, + {{TOBN(0xad5e81dc, 0x0f9f44d4), TOBN(0xf09a6960, 0x2c46585a), + TOBN(0xd1649164, 0xc447d1b1), TOBN(0x3b4b36c8, 0x879dc8b1)}, + {TOBN(0x20d4177b, 0x3b6b234c), TOBN(0x096a2505, 0x1730d9d0), + TOBN(0x0611b9b8, 0xef80531d), TOBN(0xba904b3b, 0x64bb495d)}}, + {{TOBN(0x1192d9d4, 0x93a3147a), TOBN(0x9f30a5dc, 0x9a565545), + TOBN(0x90b1f9cb, 0x6ef07212), TOBN(0x29958546, 0x0d87fc13)}, + {TOBN(0xd3323eff, 0xc17db9ba), TOBN(0xcb18548c, 0xcb1644a8), + TOBN(0x18a306d4, 0x4f49ffbc), TOBN(0x28d658f1, 0x4c2e8684)}}, + {{TOBN(0x44ba60cd, 0xa99f8c71), TOBN(0x67b7abdb, 0x4bf742ff), + TOBN(0x66310f9c, 0x914b3f99), TOBN(0xae430a32, 0xf412c161)}, + {TOBN(0x1e6776d3, 0x88ace52f), TOBN(0x4bc0fa24, 0x52d7067d), + TOBN(0x03c286aa, 0x8f07cd1b), TOBN(0x4cb8f38c, 0xa985b2c1)}}, + {{TOBN(0x83ccbe80, 0x8c3bff36), TOBN(0x005a0bd2, 0x5263e575), + TOBN(0x460d7dda, 0x259bdcd1), TOBN(0x4a1c5642, 0xfa5cab6b)}, + {TOBN(0x2b7bdbb9, 0x9fe4fc88), TOBN(0x09418e28, 0xcc97bbb5), + TOBN(0xd8274fb4, 0xa12321ae), TOBN(0xb137007d, 0x5c87b64e)}}, + {{TOBN(0x80531fe1, 0xc63c4962), TOBN(0x50541e89, 0x981fdb25), + TOBN(0xdc1291a1, 0xfd4c2b6b), TOBN(0xc0693a17, 0xa6df4fca)}, + {TOBN(0xb2c4604e, 0x0117f203), TOBN(0x245f1963, 0x0a99b8d0), + TOBN(0xaedc20aa, 0xc6212c44), TOBN(0xb1ed4e56, 0x520f52a8)}}, + {{TOBN(0xfe48f575, 0xf8547be3), TOBN(0x0a7033cd, 0xa9e45f98), + TOBN(0x4b45d3a9, 0x18c50100), TOBN(0xb2a6cd6a, 0xa61d41da)}, + {TOBN(0x60bbb4f5, 0x57933c6b), 
TOBN(0xa7538ebd, 0x2b0d7ffc), + TOBN(0x9ea3ab8d, 0x8cd626b6), TOBN(0x8273a484, 0x3601625a)}}, + {{TOBN(0x88859845, 0x0168e508), TOBN(0x8cbc9bb2, 0x99a94abd), + TOBN(0x713ac792, 0xfab0a671), TOBN(0xa3995b19, 0x6c9ebffc)}, + {TOBN(0xe711668e, 0x1239e152), TOBN(0x56892558, 0xbbb8dff4), + TOBN(0x8bfc7dab, 0xdbf17963), TOBN(0x5b59fe5a, 0xb3de1253)}}, + {{TOBN(0x7e3320eb, 0x34a9f7ae), TOBN(0xe5e8cf72, 0xd751efe4), + TOBN(0x7ea003bc, 0xd9be2f37), TOBN(0xc0f551a0, 0xb6c08ef7)}, + {TOBN(0x56606268, 0x038f6725), TOBN(0x1dd38e35, 0x6d92d3b6), + TOBN(0x07dfce7c, 0xc3cbd686), TOBN(0x4e549e04, 0x651c5da8)}}, + {{TOBN(0x4058f93b, 0x08b19340), TOBN(0xc2fae6f4, 0xcac6d89d), + TOBN(0x4bad8a8c, 0x8f159cc7), TOBN(0x0ddba4b3, 0xcb0b601c)}, + {TOBN(0xda4fc7b5, 0x1dd95f8c), TOBN(0x1d163cd7, 0xcea5c255), + TOBN(0x30707d06, 0x274a8c4c), TOBN(0x79d9e008, 0x2802e9ce)}}, + {{TOBN(0x02a29ebf, 0xe6ddd505), TOBN(0x37064e74, 0xb50bed1a), + TOBN(0x3f6bae65, 0xa7327d57), TOBN(0x3846f5f1, 0xf83920bc)}, + {TOBN(0x87c37491, 0x60df1b9b), TOBN(0x4cfb2895, 0x2d1da29f), + TOBN(0x10a478ca, 0x4ed1743c), TOBN(0x390c6030, 0x3edd47c6)}}, + {{TOBN(0x8f3e5312, 0x8c0a78de), TOBN(0xccd02bda, 0x1e85df70), + TOBN(0xd6c75c03, 0xa61b6582), TOBN(0x0762921c, 0xfc0eebd1)}, + {TOBN(0xd34d0823, 0xd85010c0), TOBN(0xd73aaacb, 0x0044cf1f), + TOBN(0xfb4159bb, 0xa3b5e78a), TOBN(0x2287c7f7, 0xe5826f3f)}}, + {{TOBN(0x4aeaf742, 0x580b1a01), TOBN(0xf080415d, 0x60423b79), + TOBN(0xe12622cd, 0xa7dea144), TOBN(0x49ea4996, 0x59d62472)}, + {TOBN(0xb42991ef, 0x571f3913), TOBN(0x0610f214, 0xf5b25a8a), + TOBN(0x47adc585, 0x30b79e8f), TOBN(0xf90e3df6, 0x07a065a2)}}, + {{TOBN(0x5d0a5deb, 0x43e2e034), TOBN(0x53fb5a34, 0x444024aa), + TOBN(0xa8628c68, 0x6b0c9f7f), TOBN(0x9c69c29c, 0xac563656)}, + {TOBN(0x5a231feb, 0xbace47b6), TOBN(0xbdce0289, 0x9ea5a2ec), + TOBN(0x05da1fac, 0x9463853e), TOBN(0x96812c52, 0x509e78aa)}}, + {{TOBN(0xd3fb5771, 0x57151692), TOBN(0xeb2721f8, 0xd98e1c44), + TOBN(0xc0506087, 0x32399be1), TOBN(0xda5a5511, 0xd979d8b8)}, + {TOBN(0x737ed55d, 0xc6f56780), TOBN(0xe20d3004, 0x0dc7a7f4), + TOBN(0x02ce7301, 0xf5941a03), TOBN(0x91ef5215, 0xed30f83a)}}, + {{TOBN(0x28727fc1, 0x4092d85f), TOBN(0x72d223c6, 0x5c49e41a), + TOBN(0xa7cf30a2, 0xba6a4d81), TOBN(0x7c086209, 0xb030d87d)}, + {TOBN(0x04844c7d, 0xfc588b09), TOBN(0x728cd499, 0x5874bbb0), + TOBN(0xcc1281ee, 0xe84c0495), TOBN(0x0769b5ba, 0xec31958f)}}, + {{TOBN(0x665c228b, 0xf99c2471), TOBN(0xf2d8a11b, 0x191eb110), + TOBN(0x4594f494, 0xd36d7024), TOBN(0x482ded8b, 0xcdcb25a1)}, + {TOBN(0xc958a9d8, 0xdadd4885), TOBN(0x7004477e, 0xf1d2b547), + TOBN(0x0a45f6ef, 0x2a0af550), TOBN(0x4fc739d6, 0x2f8d6351)}}, + {{TOBN(0x75cdaf27, 0x786f08a9), TOBN(0x8700bb26, 0x42c2737f), + TOBN(0x855a7141, 0x1c4e2670), TOBN(0x810188c1, 0x15076fef)}, + {TOBN(0xc251d0c9, 0xabcd3297), TOBN(0xae4c8967, 0xf48108eb), + TOBN(0xbd146de7, 0x18ceed30), TOBN(0xf9d4f07a, 0xc986bced)}}, + {{TOBN(0x5ad98ed5, 0x83fa1e08), TOBN(0x7780d33e, 0xbeabd1fb), + TOBN(0xe330513c, 0x903b1196), TOBN(0xba11de9e, 0xa47bc8c4)}, + {TOBN(0x684334da, 0x02c2d064), TOBN(0x7ecf360d, 0xa48de23b), + TOBN(0x57a1b474, 0x0a9089d8), TOBN(0xf28fa439, 0xff36734c)}}, + {{TOBN(0xf2a482cb, 0xea4570b3), TOBN(0xee65d68b, 0xa5ebcee9), + TOBN(0x988d0036, 0xb9694cd5), TOBN(0x53edd0e9, 0x37885d32)}, + {TOBN(0xe37e3307, 0xbeb9bc6d), TOBN(0xe9abb907, 0x9f5c6768), + TOBN(0x4396ccd5, 0x51f2160f), TOBN(0x2500888c, 0x47336da6)}}, + {{TOBN(0x383f9ed9, 0x926fce43), TOBN(0x809dd1c7, 0x04da2930), + TOBN(0x30f6f596, 0x8a4cb227), TOBN(0x0d700c7f, 0x73a56b38)}, + {TOBN(0x1825ea33, 
0xab64a065), TOBN(0xaab9b735, 0x1338df80), + TOBN(0x1516100d, 0x9b63f57f), TOBN(0x2574395a, 0x27a6a634)}}, + {{TOBN(0xb5560fb6, 0x700a1acd), TOBN(0xe823fd73, 0xfd999681), + TOBN(0xda915d1f, 0x6cb4e1ba), TOBN(0x0d030118, 0x6ebe00a3)}, + {TOBN(0x744fb0c9, 0x89fca8cd), TOBN(0x970d01db, 0xf9da0e0b), + TOBN(0x0ad8c564, 0x7931d76f), TOBN(0xb15737bf, 0xf659b96a)}}, + {{TOBN(0xdc9933e8, 0xa8b484e7), TOBN(0xb2fdbdf9, 0x7a26dec7), + TOBN(0x2349e9a4, 0x9f1f0136), TOBN(0x7860368e, 0x70fddddb)}, + {TOBN(0xd93d2c1c, 0xf9ad3e18), TOBN(0x6d6c5f17, 0x689f4e79), + TOBN(0x7a544d91, 0xb24ff1b6), TOBN(0x3e12a5eb, 0xfe16cd8c)}}, + {{TOBN(0x543574e9, 0xa56b872f), TOBN(0xa1ad550c, 0xfcf68ea2), + TOBN(0x689e37d2, 0x3f560ef7), TOBN(0x8c54b9ca, 0xc9d47a8b)}, + {TOBN(0x46d40a4a, 0x088ac342), TOBN(0xec450c7c, 0x1576c6d0), + TOBN(0xb589e31c, 0x1f9689e9), TOBN(0xdacf2602, 0xb8781718)}}, + {{TOBN(0xa89237c6, 0xc8cb6b42), TOBN(0x1326fc93, 0xb96ef381), + TOBN(0x55d56c6d, 0xb5f07825), TOBN(0xacba2eea, 0x7449e22d)}, + {TOBN(0x74e0887a, 0x633c3000), TOBN(0xcb6cd172, 0xd7cbcf71), + TOBN(0x309e81de, 0xc36cf1be), TOBN(0x07a18a6d, 0x60ae399b)}}, + {{TOBN(0xb36c2679, 0x9edce57e), TOBN(0x52b892f4, 0xdf001d41), + TOBN(0xd884ae5d, 0x16a1f2c6), TOBN(0x9b329424, 0xefcc370a)}, + {TOBN(0x3120daf2, 0xbd2e21df), TOBN(0x55298d2d, 0x02470a99), + TOBN(0x0b78af6c, 0xa05db32e), TOBN(0x5c76a331, 0x601f5636)}}, + {{TOBN(0xaae861ff, 0xf8a4f29c), TOBN(0x70dc9240, 0xd68f8d49), + TOBN(0x960e649f, 0x81b1321c), TOBN(0x3d2c801b, 0x8792e4ce)}, + {TOBN(0xf479f772, 0x42521876), TOBN(0x0bed93bc, 0x416c79b1), + TOBN(0xa67fbc05, 0x263e5bc9), TOBN(0x01e8e630, 0x521db049)}}, + {{TOBN(0x76f26738, 0xc6f3431e), TOBN(0xe609cb02, 0xe3267541), + TOBN(0xb10cff2d, 0x818c877c), TOBN(0x1f0e75ce, 0x786a13cb)}, + {TOBN(0xf4fdca64, 0x1158544d), TOBN(0x5d777e89, 0x6cb71ed0), + TOBN(0x3c233737, 0xa9aa4755), TOBN(0x7b453192, 0xe527ab40)}}, + {{TOBN(0xdb59f688, 0x39f05ffe), TOBN(0x8f4f4be0, 0x6d82574e), + TOBN(0xcce3450c, 0xee292d1b), TOBN(0xaa448a12, 0x61ccd086)}, + {TOBN(0xabce91b3, 0xf7914967), TOBN(0x4537f09b, 0x1908a5ed), + TOBN(0xa812421e, 0xf51042e7), TOBN(0xfaf5cebc, 0xec0b3a34)}}, + {{TOBN(0x730ffd87, 0x4ca6b39a), TOBN(0x70fb72ed, 0x02efd342), + TOBN(0xeb4735f9, 0xd75c8edb), TOBN(0xc11f2157, 0xc278aa51)}, + {TOBN(0xc459f635, 0xbf3bfebf), TOBN(0x3a1ff0b4, 0x6bd9601f), + TOBN(0xc9d12823, 0xc420cb73), TOBN(0x3e9af3e2, 0x3c2915a3)}}, + {{TOBN(0xe0c82c72, 0xb41c3440), TOBN(0x175239e5, 0xe3039a5f), + TOBN(0xe1084b8a, 0x558795a3), TOBN(0x328d0a1d, 0xd01e5c60)}, + {TOBN(0x0a495f2e, 0xd3788a04), TOBN(0x25d8ff16, 0x66c11a9f), + TOBN(0xf5155f05, 0x9ed692d6), TOBN(0x954fa107, 0x4f425fe4)}}, + {{TOBN(0xd16aabf2, 0xe98aaa99), TOBN(0x90cd8ba0, 0x96b0f88a), + TOBN(0x957f4782, 0xc154026a), TOBN(0x54ee0734, 0x52af56d2)}, + {TOBN(0xbcf89e54, 0x45b4147a), TOBN(0x3d102f21, 0x9a52816c), + TOBN(0x6808517e, 0x39b62e77), TOBN(0x92e25421, 0x69169ad8)}}, + {{TOBN(0xd721d871, 0xbb608558), TOBN(0x60e4ebae, 0xf6d4ff9b), + TOBN(0x0ba10819, 0x41f2763e), TOBN(0xca2e45be, 0x51ee3247)}, + {TOBN(0x66d172ec, 0x2bfd7a5f), TOBN(0x528a8f2f, 0x74d0b12d), + TOBN(0xe17f1e38, 0xdabe70dc), TOBN(0x1d5d7316, 0x9f93983c)}}, + {{TOBN(0x51b2184a, 0xdf423e31), TOBN(0xcb417291, 0xaedb1a10), + TOBN(0x2054ca93, 0x625bcab9), TOBN(0x54396860, 0xa98998f0)}, + {TOBN(0x4e53f6c4, 0xa54ae57e), TOBN(0x0ffeb590, 0xee648e9d), + TOBN(0xfbbdaadc, 0x6afaf6bc), TOBN(0xf88ae796, 0xaa3bfb8a)}}, + {{TOBN(0x209f1d44, 0xd2359ed9), TOBN(0xac68dd03, 0xf3544ce2), + TOBN(0xf378da47, 0xfd51e569), TOBN(0xe1abd860, 0x2cc80097)}, + 
{TOBN(0x23ca18d9, 0x343b6e3a), TOBN(0x480797e8, 0xb40a1bae), + TOBN(0xd1f0c717, 0x533f3e67), TOBN(0x44896970, 0x06e6cdfc)}}, + {{TOBN(0x8ca21055, 0x52a82e8d), TOBN(0xb2caf785, 0x78460cdc), + TOBN(0x4c1b7b62, 0xe9037178), TOBN(0xefc09d2c, 0xdb514b58)}, + {TOBN(0x5f2df9ee, 0x9113be5c), TOBN(0x2fbda78f, 0xb3f9271c), + TOBN(0xe09a81af, 0x8f83fc54), TOBN(0x06b13866, 0x8afb5141)}}, + {{TOBN(0x38f6480f, 0x43e3865d), TOBN(0x72dd77a8, 0x1ddf47d9), + TOBN(0xf2a8e971, 0x4c205ff7), TOBN(0x46d449d8, 0x9d088ad8)}, + {TOBN(0x926619ea, 0x185d706f), TOBN(0xe47e02eb, 0xc7dd7f62), + TOBN(0xe7f120a7, 0x8cbc2031), TOBN(0xc18bef00, 0x998d4ac9)}}, + {{TOBN(0x18f37a9c, 0x6bdf22da), TOBN(0xefbc432f, 0x90dc82df), + TOBN(0xc52cef8e, 0x5d703651), TOBN(0x82887ba0, 0xd99881a5)}, + {TOBN(0x7cec9dda, 0xb920ec1d), TOBN(0xd0d7e8c3, 0xec3e8d3b), + TOBN(0x445bc395, 0x4ca88747), TOBN(0xedeaa2e0, 0x9fd53535)}}, + {{TOBN(0x461b1d93, 0x6cc87475), TOBN(0xd92a52e2, 0x6d2383bd), + TOBN(0xfabccb59, 0xd7903546), TOBN(0x6111a761, 0x3d14b112)}, + {TOBN(0x0ae584fe, 0xb3d5f612), TOBN(0x5ea69b8d, 0x60e828ec), + TOBN(0x6c078985, 0x54087030), TOBN(0x649cab04, 0xac4821fe)}}, + {{TOBN(0x25ecedcf, 0x8bdce214), TOBN(0xb5622f72, 0x86af7361), + TOBN(0x0e1227aa, 0x7038b9e2), TOBN(0xd0efb273, 0xac20fa77)}, + {TOBN(0x817ff88b, 0x79df975b), TOBN(0x856bf286, 0x1999503e), + TOBN(0xb4d5351f, 0x5038ec46), TOBN(0x740a52c5, 0xfc42af6e)}}, + {{TOBN(0x2e38bb15, 0x2cbb1a3f), TOBN(0xc3eb99fe, 0x17a83429), + TOBN(0xca4fcbf1, 0xdd66bb74), TOBN(0x880784d6, 0xcde5e8fc)}, + {TOBN(0xddc84c1c, 0xb4e7a0be), TOBN(0x8780510d, 0xbd15a72f), + TOBN(0x44bcf1af, 0x81ec30e1), TOBN(0x141e50a8, 0x0a61073e)}}, + {{TOBN(0x0d955718, 0x47be87ae), TOBN(0x68a61417, 0xf76a4372), + TOBN(0xf57e7e87, 0xc607c3d3), TOBN(0x043afaf8, 0x5252f332)}, + {TOBN(0xcc14e121, 0x1552a4d2), TOBN(0xb6dee692, 0xbb4d4ab4), + TOBN(0xb6ab74c8, 0xa03816a4), TOBN(0x84001ae4, 0x6f394a29)}}, + {{TOBN(0x5bed8344, 0xd795fb45), TOBN(0x57326e7d, 0xb79f55a5), + TOBN(0xc9533ce0, 0x4accdffc), TOBN(0x53473caf, 0x3993fa04)}, + {TOBN(0x7906eb93, 0xa13df4c8), TOBN(0xa73e51f6, 0x97cbe46f), + TOBN(0xd1ab3ae1, 0x0ae4ccf8), TOBN(0x25614508, 0x8a5b3dbc)}}, + {{TOBN(0x61eff962, 0x11a71b27), TOBN(0xdf71412b, 0x6bb7fa39), + TOBN(0xb31ba6b8, 0x2bd7f3ef), TOBN(0xb0b9c415, 0x69180d29)}, + {TOBN(0xeec14552, 0x014cdde5), TOBN(0x702c624b, 0x227b4bbb), + TOBN(0x2b15e8c2, 0xd3e988f3), TOBN(0xee3bcc6d, 0xa4f7fd04)}}, + {{TOBN(0x9d00822a, 0x42ac6c85), TOBN(0x2db0cea6, 0x1df9f2b7), + TOBN(0xd7cad2ab, 0x42de1e58), TOBN(0x346ed526, 0x2d6fbb61)}, + {TOBN(0xb3962995, 0x1a2faf09), TOBN(0x2fa8a580, 0x7c25612e), + TOBN(0x30ae04da, 0x7cf56490), TOBN(0x75662908, 0x0eea3961)}}, + {{TOBN(0x3609f5c5, 0x3d080847), TOBN(0xcb081d39, 0x5241d4f6), + TOBN(0xb4fb3810, 0x77961a63), TOBN(0xc20c5984, 0x2abb66fc)}, + {TOBN(0x3d40aa7c, 0xf902f245), TOBN(0x9cb12736, 0x4e536b1e), + TOBN(0x5eda24da, 0x99b3134f), TOBN(0xafbd9c69, 0x5cd011af)}}, + {{TOBN(0x9a16e30a, 0xc7088c7d), TOBN(0x5ab65710, 0x3207389f), + TOBN(0x1b09547f, 0xe7407a53), TOBN(0x2322f9d7, 0x4fdc6eab)}, + {TOBN(0xc0f2f22d, 0x7430de4d), TOBN(0x19382696, 0xe68ca9a9), + TOBN(0x17f1eff1, 0x918e5868), TOBN(0xe3b5b635, 0x586f4204)}}, + {{TOBN(0x146ef980, 0x3fbc4341), TOBN(0x359f2c80, 0x5b5eed4e), + TOBN(0x9f35744e, 0x7482e41d), TOBN(0x9a9ac3ec, 0xf3b224c2)}, + {TOBN(0x9161a6fe, 0x91fc50ae), TOBN(0x89ccc66b, 0xc613fa7c), + TOBN(0x89268b14, 0xc732f15a), TOBN(0x7cd6f4e2, 0xb467ed03)}}, + {{TOBN(0xfbf79869, 0xce56b40e), TOBN(0xf93e094c, 0xc02dde98), + TOBN(0xefe0c3a8, 0xedee2cd7), TOBN(0x90f3ffc0, 0xb268fd42)}, 
+ {TOBN(0x81a7fd56, 0x08241aed), TOBN(0x95ab7ad8, 0x00b1afe8), + TOBN(0x40127056, 0x3e310d52), TOBN(0xd3ffdeb1, 0x09d9fc43)}}, + {{TOBN(0xc8f85c91, 0xd11a8594), TOBN(0x2e74d258, 0x31cf6db8), + TOBN(0x829c7ca3, 0x02b5dfd0), TOBN(0xe389cfbe, 0x69143c86)}, + {TOBN(0xd01b6405, 0x941768d8), TOBN(0x45103995, 0x03bf825d), + TOBN(0xcc4ee166, 0x56cd17e2), TOBN(0xbea3c283, 0xba037e79)}}, + {{TOBN(0x4e1ac06e, 0xd9a47520), TOBN(0xfbfe18aa, 0xaf852404), + TOBN(0x5615f8e2, 0x8087648a), TOBN(0x7301e47e, 0xb9d150d9)}, + {TOBN(0x79f9f9dd, 0xb299b977), TOBN(0x76697a7b, 0xa5b78314), + TOBN(0x10d67468, 0x7d7c90e7), TOBN(0x7afffe03, 0x937210b5)}}, + {{TOBN(0x5aef3e4b, 0x28c22cee), TOBN(0xefb0ecd8, 0x09fd55ae), + TOBN(0x4cea7132, 0x0d2a5d6a), TOBN(0x9cfb5fa1, 0x01db6357)}, + {TOBN(0x395e0b57, 0xf36e1ac5), TOBN(0x008fa9ad, 0x36cafb7d), + TOBN(0x8f6cdf70, 0x5308c4db), TOBN(0x51527a37, 0x95ed2477)}}, + {{TOBN(0xba0dee30, 0x5bd21311), TOBN(0x6ed41b22, 0x909c90d7), + TOBN(0xc5f6b758, 0x7c8696d3), TOBN(0x0db8eaa8, 0x3ce83a80)}, + {TOBN(0xd297fe37, 0xb24b4b6f), TOBN(0xfe58afe8, 0x522d1f0d), + TOBN(0x97358736, 0x8c98dbd9), TOBN(0x6bc226ca, 0x9454a527)}}, + {{TOBN(0xa12b384e, 0xce53c2d0), TOBN(0x779d897d, 0x5e4606da), + TOBN(0xa53e47b0, 0x73ec12b0), TOBN(0x462dbbba, 0x5756f1ad)}, + {TOBN(0x69fe09f2, 0xcafe37b6), TOBN(0x273d1ebf, 0xecce2e17), + TOBN(0x8ac1d538, 0x3cf607fd), TOBN(0x8035f7ff, 0x12e10c25)}}}, + {{{TOBN(0x854d34c7, 0x7e6c5520), TOBN(0xc27df9ef, 0xdcb9ea58), + TOBN(0x405f2369, 0xd686666d), TOBN(0x29d1febf, 0x0417aa85)}, + {TOBN(0x9846819e, 0x93470afe), TOBN(0x3e6a9669, 0xe2a27f9e), + TOBN(0x24d008a2, 0xe31e6504), TOBN(0xdba7cecf, 0x9cb7680a)}}, + {{TOBN(0xecaff541, 0x338d6e43), TOBN(0x56f7dd73, 0x4541d5cc), + TOBN(0xb5d426de, 0x96bc88ca), TOBN(0x48d94f6b, 0x9ed3a2c3)}, + {TOBN(0x6354a3bb, 0x2ef8279c), TOBN(0xd575465b, 0x0b1867f2), + TOBN(0xef99b0ff, 0x95225151), TOBN(0xf3e19d88, 0xf94500d8)}}, + {{TOBN(0x92a83268, 0xe32dd620), TOBN(0x913ec99f, 0x627849a2), + TOBN(0xedd8fdfa, 0x2c378882), TOBN(0xaf96f33e, 0xee6f8cfe)}, + {TOBN(0xc06737e5, 0xdc3fa8a5), TOBN(0x236bb531, 0xb0b03a1d), + TOBN(0x33e59f29, 0x89f037b0), TOBN(0x13f9b5a7, 0xd9a12a53)}}, + {{TOBN(0x0d0df6ce, 0x51efb310), TOBN(0xcb5b2eb4, 0x958df5be), + TOBN(0xd6459e29, 0x36158e59), TOBN(0x82aae2b9, 0x1466e336)}, + {TOBN(0xfb658a39, 0x411aa636), TOBN(0x7152ecc5, 0xd4c0a933), + TOBN(0xf10c758a, 0x49f026b7), TOBN(0xf4837f97, 0xcb09311f)}}, + {{TOBN(0xddfb02c4, 0xc753c45f), TOBN(0x18ca81b6, 0xf9c840fe), + TOBN(0x846fd09a, 0xb0f8a3e6), TOBN(0xb1162add, 0xe7733dbc)}, + {TOBN(0x7070ad20, 0x236e3ab6), TOBN(0xf88cdaf5, 0xb2a56326), + TOBN(0x05fc8719, 0x997cbc7a), TOBN(0x442cd452, 0x4b665272)}}, + {{TOBN(0x7807f364, 0xb71698f5), TOBN(0x6ba418d2, 0x9f7b605e), + TOBN(0xfd20b00f, 0xa03b2cbb), TOBN(0x883eca37, 0xda54386f)}, + {TOBN(0xff0be43f, 0xf3437f24), TOBN(0xe910b432, 0xa48bb33c), + TOBN(0x4963a128, 0x329df765), TOBN(0xac1dd556, 0xbe2fe6f7)}}, + {{TOBN(0x557610f9, 0x24a0a3fc), TOBN(0x38e17bf4, 0xe881c3f9), + TOBN(0x6ba84faf, 0xed0dac99), TOBN(0xd4a222c3, 0x59eeb918)}, + {TOBN(0xc79c1dbe, 0x13f542b6), TOBN(0x1fc65e0d, 0xe425d457), + TOBN(0xeffb754f, 0x1debb779), TOBN(0x638d8fd0, 0x9e08af60)}}, + {{TOBN(0x994f523a, 0x626332d5), TOBN(0x7bc38833, 0x5561bb44), + TOBN(0x005ed4b0, 0x3d845ea2), TOBN(0xd39d3ee1, 0xc2a1f08a)}, + {TOBN(0x6561fdd3, 0xe7676b0d), TOBN(0x620e35ff, 0xfb706017), + TOBN(0x36ce424f, 0xf264f9a8), TOBN(0xc4c3419f, 0xda2681f7)}}, + {{TOBN(0xfb6afd2f, 0x69beb6e8), TOBN(0x3a50b993, 0x6d700d03), + TOBN(0xc840b2ad, 0x0c83a14f), TOBN(0x573207be, 
0x54085bef)}, + {TOBN(0x5af882e3, 0x09fe7e5b), TOBN(0x957678a4, 0x3b40a7e1), + TOBN(0x172d4bdd, 0x543056e2), TOBN(0x9c1b26b4, 0x0df13c0a)}}, + {{TOBN(0x1c30861c, 0xf405ff06), TOBN(0xebac86bd, 0x486e828b), + TOBN(0xe791a971, 0x636933fc), TOBN(0x50e7c2be, 0x7aeee947)}, + {TOBN(0xc3d4a095, 0xfa90d767), TOBN(0xae60eb7b, 0xe670ab7b), + TOBN(0x17633a64, 0x397b056d), TOBN(0x93a21f33, 0x105012aa)}}, + {{TOBN(0x663c370b, 0xabb88643), TOBN(0x91df36d7, 0x22e21599), + TOBN(0x183ba835, 0x8b761671), TOBN(0x381eea1d, 0x728f3bf1)}, + {TOBN(0xb9b2f1ba, 0x39966e6c), TOBN(0x7c464a28, 0xe7295492), + TOBN(0x0fd5f70a, 0x09b26b7f), TOBN(0xa9aba1f9, 0xfbe009df)}}, + {{TOBN(0x857c1f22, 0x369b87ad), TOBN(0x3c00e5d9, 0x32fca556), + TOBN(0x1ad74cab, 0x90b06466), TOBN(0xa7112386, 0x550faaf2)}, + {TOBN(0x7435e198, 0x6d9bd5f5), TOBN(0x2dcc7e38, 0x59c3463f), + TOBN(0xdc7df748, 0xca7bd4b2), TOBN(0x13cd4c08, 0x9dec2f31)}}, + {{TOBN(0x0d3b5df8, 0xe3237710), TOBN(0x0dadb26e, 0xcbd2f7b0), + TOBN(0x9f5966ab, 0xe4aa082b), TOBN(0x666ec8de, 0x350e966e)}, + {TOBN(0x1bfd1ed5, 0xee524216), TOBN(0xcd93c59b, 0x41dab0b6), + TOBN(0x658a8435, 0xd186d6ba), TOBN(0x1b7d34d2, 0x159d1195)}}, + {{TOBN(0x5936e460, 0x22caf46b), TOBN(0x6a45dd8f, 0x9a96fe4f), + TOBN(0xf7925434, 0xb98f474e), TOBN(0x41410412, 0x0053ef15)}, + {TOBN(0x71cf8d12, 0x41de97bf), TOBN(0xb8547b61, 0xbd80bef4), + TOBN(0xb47d3970, 0xc4db0037), TOBN(0xf1bcd328, 0xfef20dff)}}, + {{TOBN(0x31a92e09, 0x10caad67), TOBN(0x1f591960, 0x5531a1e1), + TOBN(0x3bb852e0, 0x5f4fc840), TOBN(0x63e297ca, 0x93a72c6c)}, + {TOBN(0x3c2b0b2e, 0x49abad67), TOBN(0x6ec405fc, 0xed3db0d9), + TOBN(0xdc14a530, 0x7fef1d40), TOBN(0xccd19846, 0x280896fc)}}, + {{TOBN(0x00f83176, 0x9bb81648), TOBN(0xd69eb485, 0x653120d0), + TOBN(0xd17d75f4, 0x4ccabc62), TOBN(0x34a07f82, 0xb749fcb1)}, + {TOBN(0x2c3af787, 0xbbfb5554), TOBN(0xb06ed4d0, 0x62e283f8), + TOBN(0x5722889f, 0xa19213a0), TOBN(0x162b085e, 0xdcf3c7b4)}}, + {{TOBN(0xbcaecb31, 0xe0dd3eca), TOBN(0xc6237fbc, 0xe52f13a5), + TOBN(0xcc2b6b03, 0x27bac297), TOBN(0x2ae1cac5, 0xb917f54a)}, + {TOBN(0x474807d4, 0x7845ae4f), TOBN(0xfec7dd92, 0xce5972e0), + TOBN(0xc3bd2541, 0x1d7915bb), TOBN(0x66f85dc4, 0xd94907ca)}}, + {{TOBN(0xd981b888, 0xbdbcf0ca), TOBN(0xd75f5da6, 0xdf279e9f), + TOBN(0x128bbf24, 0x7054e934), TOBN(0x3c6ff6e5, 0x81db134b)}, + {TOBN(0x795b7cf4, 0x047d26e4), TOBN(0xf370f7b8, 0x5049ec37), + TOBN(0xc6712d4d, 0xced945af), TOBN(0xdf30b5ec, 0x095642bc)}}, + {{TOBN(0x9b034c62, 0x4896246e), TOBN(0x5652c016, 0xee90bbd1), + TOBN(0xeb38636f, 0x87fedb73), TOBN(0x5e32f847, 0x0135a613)}, + {TOBN(0x0703b312, 0xcf933c83), TOBN(0xd05bb76e, 0x1a7f47e6), + TOBN(0x825e4f0c, 0x949c2415), TOBN(0x569e5622, 0x7250d6f8)}}, + {{TOBN(0xbbe9eb3a, 0x6568013e), TOBN(0x8dbd203f, 0x22f243fc), + TOBN(0x9dbd7694, 0xb342734a), TOBN(0x8f6d12f8, 0x46afa984)}, + {TOBN(0xb98610a2, 0xc9eade29), TOBN(0xbab4f323, 0x47dd0f18), + TOBN(0x5779737b, 0x671c0d46), TOBN(0x10b6a7c6, 0xd3e0a42a)}}, + {{TOBN(0xfb19ddf3, 0x3035b41c), TOBN(0xd336343f, 0x99c45895), + TOBN(0x61fe4938, 0x54c857e5), TOBN(0xc4d506be, 0xae4e57d5)}, + {TOBN(0x3cd8c8cb, 0xbbc33f75), TOBN(0x7281f08a, 0x9262c77d), + TOBN(0x083f4ea6, 0xf11a2823), TOBN(0x8895041e, 0x9fba2e33)}}, + {{TOBN(0xfcdfea49, 0x9c438edf), TOBN(0x7678dcc3, 0x91edba44), + TOBN(0xf07b3b87, 0xe2ba50f0), TOBN(0xc13888ef, 0x43948c1b)}, + {TOBN(0xc2135ad4, 0x1140af42), TOBN(0x8e5104f3, 0x926ed1a7), + TOBN(0xf24430cb, 0x88f6695f), TOBN(0x0ce0637b, 0x6d73c120)}}, + {{TOBN(0xb2db01e6, 0xfe631e8f), TOBN(0x1c5563d7, 0xd7bdd24b), + TOBN(0x8daea3ba, 0x369ad44f), 
TOBN(0x000c81b6, 0x8187a9f9)}, + {TOBN(0x5f48a951, 0xaae1fd9a), TOBN(0xe35626c7, 0x8d5aed8a), + TOBN(0x20952763, 0x0498c622), TOBN(0x76d17634, 0x773aa504)}}, + {{TOBN(0x36d90dda, 0xeb300f7a), TOBN(0x9dcf7dfc, 0xedb5e801), + TOBN(0x645cb268, 0x74d5244c), TOBN(0xa127ee79, 0x348e3aa2)}, + {TOBN(0x488acc53, 0x575f1dbb), TOBN(0x95037e85, 0x80e6161e), + TOBN(0x57e59283, 0x292650d0), TOBN(0xabe67d99, 0x14938216)}}, + {{TOBN(0x3c7f944b, 0x3f8e1065), TOBN(0xed908cb6, 0x330e8924), + TOBN(0x08ee8fd5, 0x6f530136), TOBN(0x2227b7d5, 0xd7ffc169)}, + {TOBN(0x4f55c893, 0xb5cd6dd5), TOBN(0x82225e11, 0xa62796e8), + TOBN(0x5c6cead1, 0xcb18e12c), TOBN(0x4381ae0c, 0x84f5a51a)}}, + {{TOBN(0x345913d3, 0x7fafa4c8), TOBN(0x3d918082, 0x0491aac0), + TOBN(0x9347871f, 0x3e69264c), TOBN(0xbea9dd3c, 0xb4f4f0cd)}, + {TOBN(0xbda5d067, 0x3eadd3e7), TOBN(0x0033c1b8, 0x0573bcd8), + TOBN(0x25589379, 0x5da2486c), TOBN(0xcb89ee5b, 0x86abbee7)}}, + {{TOBN(0x8fe0a8f3, 0x22532e5d), TOBN(0xb6410ff0, 0x727dfc4c), + TOBN(0x619b9d58, 0x226726db), TOBN(0x5ec25669, 0x7a2b2dc7)}, + {TOBN(0xaf4d2e06, 0x4c3beb01), TOBN(0x852123d0, 0x7acea556), + TOBN(0x0e9470fa, 0xf783487a), TOBN(0x75a7ea04, 0x5664b3eb)}}, + {{TOBN(0x4ad78f35, 0x6798e4ba), TOBN(0x9214e6e5, 0xc7d0e091), + TOBN(0xc420b488, 0xb1290403), TOBN(0x64049e0a, 0xfc295749)}, + {TOBN(0x03ef5af1, 0x3ae9841f), TOBN(0xdbe4ca19, 0xb0b662a6), + TOBN(0x46845c5f, 0xfa453458), TOBN(0xf8dabf19, 0x10b66722)}}, + {{TOBN(0xb650f0aa, 0xcce2793b), TOBN(0x71db851e, 0xc5ec47c1), + TOBN(0x3eb78f3e, 0x3b234fa9), TOBN(0xb0c60f35, 0xfc0106ce)}, + {TOBN(0x05427121, 0x774eadbd), TOBN(0x25367faf, 0xce323863), + TOBN(0x7541b5c9, 0xcd086976), TOBN(0x4ff069e2, 0xdc507ad1)}}, + {{TOBN(0x74145256, 0x8776e667), TOBN(0x6e76142c, 0xb23c6bb5), + TOBN(0xdbf30712, 0x1b3a8a87), TOBN(0x60e7363e, 0x98450836)}, + {TOBN(0x5741450e, 0xb7366d80), TOBN(0xe4ee14ca, 0x4837dbdf), + TOBN(0xa765eb9b, 0x69d4316f), TOBN(0x04548dca, 0x8ef43825)}}, + {{TOBN(0x9c9f4e4c, 0x5ae888eb), TOBN(0x733abb51, 0x56e9ac99), + TOBN(0xdaad3c20, 0xba6ac029), TOBN(0x9b8dd3d3, 0x2ba3e38e)}, + {TOBN(0xa9bb4c92, 0x0bc5d11a), TOBN(0xf20127a7, 0x9c5f88a3), + TOBN(0x4f52b06e, 0x161d3cb8), TOBN(0x26c1ff09, 0x6afaf0a6)}}, + {{TOBN(0x32670d2f, 0x7189e71f), TOBN(0xc6438748, 0x5ecf91e7), + TOBN(0x15758e57, 0xdb757a21), TOBN(0x427d09f8, 0x290a9ce5)}, + {TOBN(0x846a308f, 0x38384a7a), TOBN(0xaac3acb4, 0xb0732b99), + TOBN(0x9e941009, 0x17845819), TOBN(0x95cba111, 0xa7ce5e03)}}, + {{TOBN(0x6f3d4f7f, 0xb00009c4), TOBN(0xb8396c27, 0x8ff28b5f), + TOBN(0xb1a9ae43, 0x1c97975d), TOBN(0x9d7ba8af, 0xe5d9fed5)}, + {TOBN(0x338cf09f, 0x34f485b6), TOBN(0xbc0ddacc, 0x64122516), + TOBN(0xa450da12, 0x05d471fe), TOBN(0x4c3a6250, 0x628dd8c9)}}, + {{TOBN(0x69c7d103, 0xd1295837), TOBN(0xa2893e50, 0x3807eb2f), + TOBN(0xd6e1e1de, 0xbdb41491), TOBN(0xc630745b, 0x5e138235)}, + {TOBN(0xc892109e, 0x48661ae1), TOBN(0x8d17e7eb, 0xea2b2674), + TOBN(0x00ec0f87, 0xc328d6b5), TOBN(0x6d858645, 0xf079ff9e)}}, + {{TOBN(0x6cdf243e, 0x19115ead), TOBN(0x1ce1393e, 0x4bac4fcf), + TOBN(0x2c960ed0, 0x9c29f25b), TOBN(0x59be4d8e, 0x9d388a05)}, + {TOBN(0x0d46e06c, 0xd0def72b), TOBN(0xb923db5d, 0xe0342748), + TOBN(0xf7d3aacd, 0x936d4a3d), TOBN(0x558519cc, 0x0b0b099e)}}, + {{TOBN(0x3ea8ebf8, 0x827097ef), TOBN(0x259353db, 0xd054f55d), + TOBN(0x84c89abc, 0x6d2ed089), TOBN(0x5c548b69, 0x8e096a7c)}, + {TOBN(0xd587f616, 0x994b995d), TOBN(0x4d1531f6, 0xa5845601), + TOBN(0x792ab31e, 0x451fd9f0), TOBN(0xc8b57bb2, 0x65adf6ca)}}, + {{TOBN(0x68440fcb, 0x1cd5ad73), TOBN(0xb9c860e6, 0x6144da4f), + TOBN(0x2ab286aa, 
0x8462beb8), TOBN(0xcc6b8fff, 0xef46797f)}, + {TOBN(0xac820da4, 0x20c8a471), TOBN(0x69ae05a1, 0x77ff7faf), + TOBN(0xb9163f39, 0xbfb5da77), TOBN(0xbd03e590, 0x2c73ab7a)}}, + {{TOBN(0x7e862b5e, 0xb2940d9e), TOBN(0x3c663d86, 0x4b9af564), + TOBN(0xd8309031, 0xbde3033d), TOBN(0x298231b2, 0xd42c5bc6)}, + {TOBN(0x42090d2c, 0x552ad093), TOBN(0xa4799d1c, 0xff854695), + TOBN(0x0a88b5d6, 0xd31f0d00), TOBN(0xf8b40825, 0xa2f26b46)}}, + {{TOBN(0xec29b1ed, 0xf1bd7218), TOBN(0xd491c53b, 0x4b24c86e), + TOBN(0xd2fe588f, 0x3395ea65), TOBN(0x6f3764f7, 0x4456ef15)}, + {TOBN(0xdb43116d, 0xcdc34800), TOBN(0xcdbcd456, 0xc1e33955), + TOBN(0xefdb5540, 0x74ab286b), TOBN(0x948c7a51, 0xd18c5d7c)}}, + {{TOBN(0xeb81aa37, 0x7378058e), TOBN(0x41c746a1, 0x04411154), + TOBN(0xa10c73bc, 0xfb828ac7), TOBN(0x6439be91, 0x9d972b29)}, + {TOBN(0x4bf3b4b0, 0x43a2fbad), TOBN(0x39e6dadf, 0x82b5e840), + TOBN(0x4f716408, 0x6397bd4c), TOBN(0x0f7de568, 0x7f1eeccb)}}, + {{TOBN(0x5865c5a1, 0xd2ffbfc1), TOBN(0xf74211fa, 0x4ccb6451), + TOBN(0x66368a88, 0xc0b32558), TOBN(0x5b539dc2, 0x9ad7812e)}, + {TOBN(0x579483d0, 0x2f3af6f6), TOBN(0x52132078, 0x99934ece), + TOBN(0x50b9650f, 0xdcc9e983), TOBN(0xca989ec9, 0xaee42b8a)}}, + {{TOBN(0x6a44c829, 0xd6f62f99), TOBN(0x8f06a309, 0x4c2a7c0c), + TOBN(0x4ea2b3a0, 0x98a0cb0a), TOBN(0x5c547b70, 0xbeee8364)}, + {TOBN(0x461d40e1, 0x682afe11), TOBN(0x9e0fc77a, 0x7b41c0a8), + TOBN(0x79e4aefd, 0xe20d5d36), TOBN(0x2916e520, 0x32dd9f63)}}, + {{TOBN(0xf59e52e8, 0x3f883faf), TOBN(0x396f9639, 0x2b868d35), + TOBN(0xc902a9df, 0x4ca19881), TOBN(0x0fc96822, 0xdb2401a6)}, + {TOBN(0x41237587, 0x66f1c68d), TOBN(0x10fc6de3, 0xfb476c0d), + TOBN(0xf8b6b579, 0x841f5d90), TOBN(0x2ba8446c, 0xfa24f44a)}}, + {{TOBN(0xa237b920, 0xef4a9975), TOBN(0x60bb6004, 0x2330435f), + TOBN(0xd6f4ab5a, 0xcfb7e7b5), TOBN(0xb2ac5097, 0x83435391)}, + {TOBN(0xf036ee2f, 0xb0d1ea67), TOBN(0xae779a6a, 0x74c56230), + TOBN(0x59bff8c8, 0xab838ae6), TOBN(0xcd83ca99, 0x9b38e6f0)}}, + {{TOBN(0xbb27bef5, 0xe33deed3), TOBN(0xe6356f6f, 0x001892a8), + TOBN(0xbf3be6cc, 0x7adfbd3e), TOBN(0xaecbc81c, 0x33d1ac9d)}, + {TOBN(0xe4feb909, 0xe6e861dc), TOBN(0x90a247a4, 0x53f5f801), + TOBN(0x01c50acb, 0x27346e57), TOBN(0xce29242e, 0x461acc1b)}}, + {{TOBN(0x04dd214a, 0x2f998a91), TOBN(0x271ee9b1, 0xd4baf27b), + TOBN(0x7e3027d1, 0xe8c26722), TOBN(0x21d1645c, 0x1820dce5)}, + {TOBN(0x086f242c, 0x7501779c), TOBN(0xf0061407, 0xfa0e8009), + TOBN(0xf23ce477, 0x60187129), TOBN(0x05bbdedb, 0x0fde9bd0)}}, + {{TOBN(0x682f4832, 0x25d98473), TOBN(0xf207fe85, 0x5c658427), + TOBN(0xb6fdd7ba, 0x4166ffa1), TOBN(0x0c314056, 0x9eed799d)}, + {TOBN(0x0db8048f, 0x4107e28f), TOBN(0x74ed3871, 0x41216840), + TOBN(0x74489f8f, 0x56a3c06e), TOBN(0x1e1c005b, 0x12777134)}}, + {{TOBN(0xdb332a73, 0xf37ec3c3), TOBN(0xc65259bd, 0xdd59eba0), + TOBN(0x2291709c, 0xdb4d3257), TOBN(0x9a793b25, 0xbd389390)}, + {TOBN(0xf39fe34b, 0xe43756f0), TOBN(0x2f76bdce, 0x9afb56c9), + TOBN(0x9f37867a, 0x61208b27), TOBN(0xea1d4307, 0x089972c3)}}, + {{TOBN(0x8c595330, 0x8bdf623a), TOBN(0x5f5accda, 0x8441fb7d), + TOBN(0xfafa9418, 0x32ddfd95), TOBN(0x6ad40c5a, 0x0fde9be7)}, + {TOBN(0x43faba89, 0xaeca8709), TOBN(0xc64a7cf1, 0x2c248a9d), + TOBN(0x16620252, 0x72637a76), TOBN(0xaee1c791, 0x22b8d1bb)}}, + {{TOBN(0xf0f798fd, 0x21a843b2), TOBN(0x56e4ed4d, 0x8d005cb1), + TOBN(0x355f7780, 0x1f0d8abe), TOBN(0x197b04cf, 0x34522326)}, + {TOBN(0x41f9b31f, 0xfd42c13f), TOBN(0x5ef7feb2, 0xb40f933d), + TOBN(0x27326f42, 0x5d60bad4), TOBN(0x027ecdb2, 0x8c92cf89)}}, + {{TOBN(0x04aae4d1, 0x4e3352fe), TOBN(0x08414d2f, 0x73591b90), + 
TOBN(0x5ed6124e, 0xb7da7d60), TOBN(0xb985b931, 0x4d13d4ec)}, + {TOBN(0xa592d3ab, 0x96bf36f9), TOBN(0x012dbed5, 0xbbdf51df), + TOBN(0xa57963c0, 0xdf6c177d), TOBN(0x010ec869, 0x87ca29cf)}}, + {{TOBN(0xba1700f6, 0xbf926dff), TOBN(0x7c9fdbd1, 0xf4bf6bc2), + TOBN(0xdc18dc8f, 0x64da11f5), TOBN(0xa6074b7a, 0xd938ae75)}, + {TOBN(0x14270066, 0xe84f44a4), TOBN(0x99998d38, 0xd27b954e), + TOBN(0xc1be8ab2, 0xb4f38e9a), TOBN(0x8bb55bbf, 0x15c01016)}}, + {{TOBN(0xf73472b4, 0x0ea2ab30), TOBN(0xd365a340, 0xf73d68dd), + TOBN(0xc01a7168, 0x19c2e1eb), TOBN(0x32f49e37, 0x34061719)}, + {TOBN(0xb73c57f1, 0x01d8b4d6), TOBN(0x03c8423c, 0x26b47700), + TOBN(0x321d0bc8, 0xa4d8826a), TOBN(0x6004213c, 0x4bc0e638)}}, + {{TOBN(0xf78c64a1, 0xc1c06681), TOBN(0x16e0a16f, 0xef018e50), + TOBN(0x31cbdf91, 0xdb42b2b3), TOBN(0xf8f4ffce, 0xe0d36f58)}, + {TOBN(0xcdcc71cd, 0x4cc5e3e0), TOBN(0xd55c7cfa, 0xa129e3e0), + TOBN(0xccdb6ba0, 0x0fb2cbf1), TOBN(0x6aba0005, 0xc4bce3cb)}}, + {{TOBN(0x501cdb30, 0xd232cfc4), TOBN(0x9ddcf12e, 0xd58a3cef), + TOBN(0x02d2cf9c, 0x87e09149), TOBN(0xdc5d7ec7, 0x2c976257)}, + {TOBN(0x6447986e, 0x0b50d7dd), TOBN(0x88fdbaf7, 0x807f112a), + TOBN(0x58c9822a, 0xb00ae9f6), TOBN(0x6abfb950, 0x6d3d27e0)}}, + {{TOBN(0xd0a74487, 0x8a429f4f), TOBN(0x0649712b, 0xdb516609), + TOBN(0xb826ba57, 0xe769b5df), TOBN(0x82335df2, 0x1fc7aaf2)}, + {TOBN(0x2389f067, 0x5c93d995), TOBN(0x59ac367a, 0x68677be6), + TOBN(0xa77985ff, 0x21d9951b), TOBN(0x038956fb, 0x85011cce)}}, + {{TOBN(0x608e48cb, 0xbb734e37), TOBN(0xc08c0bf2, 0x2be5b26f), + TOBN(0x17bbdd3b, 0xf9b1a0d9), TOBN(0xeac7d898, 0x10483319)}, + {TOBN(0xc95c4baf, 0xbc1a6dea), TOBN(0xfdd0e2bf, 0x172aafdb), + TOBN(0x40373cbc, 0x8235c41a), TOBN(0x14303f21, 0xfb6f41d5)}}, + {{TOBN(0xba063621, 0x0408f237), TOBN(0xcad3b09a, 0xecd2d1ed), + TOBN(0x4667855a, 0x52abb6a2), TOBN(0xba9157dc, 0xaa8b417b)}, + {TOBN(0xfe7f3507, 0x4f013efb), TOBN(0x1b112c4b, 0xaa38c4a2), + TOBN(0xa1406a60, 0x9ba64345), TOBN(0xe53cba33, 0x6993c80b)}}, + {{TOBN(0x45466063, 0xded40d23), TOBN(0x3d5f1f4d, 0x54908e25), + TOBN(0x9ebefe62, 0x403c3c31), TOBN(0x274ea0b5, 0x0672a624)}, + {TOBN(0xff818d99, 0x451d1b71), TOBN(0x80e82643, 0x8f79cf79), + TOBN(0xa165df13, 0x73ce37f5), TOBN(0xa744ef4f, 0xfe3a21fd)}}, + {{TOBN(0x73f1e7f5, 0xcf551396), TOBN(0xc616898e, 0x868c676b), + TOBN(0x671c28c7, 0x8c442c36), TOBN(0xcfe5e558, 0x5e0a317d)}, + {TOBN(0x1242d818, 0x7051f476), TOBN(0x56fad2a6, 0x14f03442), + TOBN(0x262068bc, 0x0a44d0f6), TOBN(0xdfa2cd6e, 0xce6edf4e)}}, + {{TOBN(0x0f43813a, 0xd15d1517), TOBN(0x61214cb2, 0x377d44f5), + TOBN(0xd399aa29, 0xc639b35f), TOBN(0x42136d71, 0x54c51c19)}, + {TOBN(0x9774711b, 0x08417221), TOBN(0x0a5546b3, 0x52545a57), + TOBN(0x80624c41, 0x1150582d), TOBN(0x9ec5c418, 0xfbc555bc)}}, + {{TOBN(0x2c87dcad, 0x771849f1), TOBN(0xb0c932c5, 0x01d7bf6f), + TOBN(0x6aa5cd3e, 0x89116eb2), TOBN(0xd378c25a, 0x51ca7bd3)}, + {TOBN(0xc612a0da, 0x9e6e3e31), TOBN(0x0417a54d, 0xb68ad5d0), + TOBN(0x00451e4a, 0x22c6edb8), TOBN(0x9fbfe019, 0xb42827ce)}}, + {{TOBN(0x2fa92505, 0xba9384a2), TOBN(0x21b8596e, 0x64ad69c1), + TOBN(0x8f4fcc49, 0x983b35a6), TOBN(0xde093760, 0x72754672)}, + {TOBN(0x2f14ccc8, 0xf7bffe6d), TOBN(0x27566bff, 0x5d94263d), + TOBN(0xb5b4e9c6, 0x2df3ec30), TOBN(0x94f1d7d5, 0x3e6ea6ba)}}, + {{TOBN(0x97b7851a, 0xaaca5e9b), TOBN(0x518aa521, 0x56713b97), + TOBN(0x3357e8c7, 0x150a61f6), TOBN(0x7842e7e2, 0xec2c2b69)}, + {TOBN(0x8dffaf65, 0x6868a548), TOBN(0xd963bd82, 0xe068fc81), + TOBN(0x64da5c8b, 0x65917733), TOBN(0x927090ff, 0x7b247328)}}}, + {{{TOBN(0x214bc9a7, 0xd298c241), TOBN(0xe3b697ba, 
0x56807cfd), + TOBN(0xef1c7802, 0x4564eadb), TOBN(0xdde8cdcf, 0xb48149c5)}, + {TOBN(0x946bf0a7, 0x5a4d2604), TOBN(0x27154d7f, 0x6c1538af), + TOBN(0x95cc9230, 0xde5b1fcc), TOBN(0xd88519e9, 0x66864f82)}}, + {{TOBN(0xb828dd1a, 0x7cb1282c), TOBN(0xa08d7626, 0xbe46973a), + TOBN(0x6baf8d40, 0xe708d6b2), TOBN(0x72571fa1, 0x4daeb3f3)}, + {TOBN(0x85b1732f, 0xf22dfd98), TOBN(0x87ab01a7, 0x0087108d), + TOBN(0xaaaafea8, 0x5988207a), TOBN(0xccc832f8, 0x69f00755)}}, + {{TOBN(0x964d950e, 0x36ff3bf0), TOBN(0x8ad20f6f, 0xf0b34638), + TOBN(0x4d9177b3, 0xb5d7585f), TOBN(0xcf839760, 0xef3f019f)}, + {TOBN(0x582fc5b3, 0x8288c545), TOBN(0x2f8e4e9b, 0x13116bd1), + TOBN(0xf91e1b2f, 0x332120ef), TOBN(0xcf568724, 0x2a17dd23)}}, + {{TOBN(0x488f1185, 0xca8d9d1a), TOBN(0xadf2c77d, 0xd987ded2), + TOBN(0x5f3039f0, 0x60c46124), TOBN(0xe5d70b75, 0x71e095f4)}, + {TOBN(0x82d58650, 0x6260e70f), TOBN(0x39d75ea7, 0xf750d105), + TOBN(0x8cf3d0b1, 0x75bac364), TOBN(0xf3a7564d, 0x21d01329)}}, + {{TOBN(0x182f04cd, 0x2f52d2a7), TOBN(0x4fde149a, 0xe2df565a), + TOBN(0xb80c5eec, 0xa79fb2f7), TOBN(0xab491d7b, 0x22ddc897)}, + {TOBN(0x99d76c18, 0xc6312c7f), TOBN(0xca0d5f3d, 0x6aa41a57), + TOBN(0x71207325, 0xd15363a0), TOBN(0xe82aa265, 0xbeb252c2)}}, + {{TOBN(0x94ab4700, 0xec3128c2), TOBN(0x6c76d862, 0x8e383f49), + TOBN(0xdc36b150, 0xc03024eb), TOBN(0xfb439477, 0x53daac69)}, + {TOBN(0xfc68764a, 0x8dc79623), TOBN(0x5b86995d, 0xb440fbb2), + TOBN(0xd66879bf, 0xccc5ee0d), TOBN(0x05228942, 0x95aa8bd3)}}, + {{TOBN(0xb51a40a5, 0x1e6a75c1), TOBN(0x24327c76, 0x0ea7d817), + TOBN(0x06630182, 0x07774597), TOBN(0xd6fdbec3, 0x97fa7164)}, + {TOBN(0x20c99dfb, 0x13c90f48), TOBN(0xd6ac5273, 0x686ef263), + TOBN(0xc6a50bdc, 0xfef64eeb), TOBN(0xcd87b281, 0x86fdfc32)}}, + {{TOBN(0xb24aa43e, 0x3fcd3efc), TOBN(0xdd26c034, 0xb8088e9a), + TOBN(0xa5ef4dc9, 0xbd3d46ea), TOBN(0xa2f99d58, 0x8a4c6a6f)}, + {TOBN(0xddabd355, 0x2f1da46c), TOBN(0x72c3f8ce, 0x1afacdd1), + TOBN(0xd90c4eee, 0x92d40578), TOBN(0xd28bb41f, 0xca623b94)}}, + {{TOBN(0x50fc0711, 0x745edc11), TOBN(0x9dd9ad7d, 0x3dc87558), + TOBN(0xce6931fb, 0xb49d1e64), TOBN(0x6c77a0a2, 0xc98bd0f9)}, + {TOBN(0x62b9a629, 0x6baf7cb1), TOBN(0xcf065f91, 0xccf72d22), + TOBN(0x7203cce9, 0x79639071), TOBN(0x09ae4885, 0xf9cb732f)}}, + {{TOBN(0x5e7c3bec, 0xee8314f3), TOBN(0x1c068aed, 0xdbea298f), + TOBN(0x08d381f1, 0x7c80acec), TOBN(0x03b56be8, 0xe330495b)}, + {TOBN(0xaeffb8f2, 0x9222882d), TOBN(0x95ff38f6, 0xc4af8bf7), + TOBN(0x50e32d35, 0x1fc57d8c), TOBN(0x6635be52, 0x17b444f0)}}, + {{TOBN(0x04d15276, 0xa5177900), TOBN(0x4e1dbb47, 0xf6858752), + TOBN(0x5b475622, 0xc615796c), TOBN(0xa6fa0387, 0x691867bf)}, + {TOBN(0xed7f5d56, 0x2844c6d0), TOBN(0xc633cf9b, 0x03a2477d), + TOBN(0xf6be5c40, 0x2d3721d6), TOBN(0xaf312eb7, 0xe9fd68e6)}}, + {{TOBN(0x242792d2, 0xe7417ce1), TOBN(0xff42bc71, 0x970ee7f5), + TOBN(0x1ff4dc6d, 0x5c67a41e), TOBN(0x77709b7b, 0x20882a58)}, + {TOBN(0x3554731d, 0xbe217f2c), TOBN(0x2af2a8cd, 0x5bb72177), + TOBN(0x58eee769, 0x591dd059), TOBN(0xbb2930c9, 0x4bba6477)}}, + {{TOBN(0x863ee047, 0x7d930cfc), TOBN(0x4c262ad1, 0x396fd1f4), + TOBN(0xf4765bc8, 0x039af7e1), TOBN(0x2519834b, 0x5ba104f6)}, + {TOBN(0x7cd61b4c, 0xd105f961), TOBN(0xa5415da5, 0xd63bca54), + TOBN(0x778280a0, 0x88a1f17c), TOBN(0xc4968949, 0x2329512c)}}, + {{TOBN(0x174a9126, 0xcecdaa7a), TOBN(0xfc8c7e0e, 0x0b13247b), + TOBN(0x29c110d2, 0x3484c1c4), TOBN(0xf8eb8757, 0x831dfc3b)}, + {TOBN(0x022f0212, 0xc0067452), TOBN(0x3f6f69ee, 0x7b9b926c), + TOBN(0x09032da0, 0xef42daf4), TOBN(0x79f00ade, 0x83f80de4)}}, + {{TOBN(0x6210db71, 0x81236c97), 
TOBN(0x74f7685b, 0x3ee0781f), + TOBN(0x4df7da7b, 0xa3e41372), TOBN(0x2aae38b1, 0xb1a1553e)}, + {TOBN(0x1688e222, 0xf6dd9d1b), TOBN(0x57695448, 0x5b8b6487), + TOBN(0x478d2127, 0x4b2edeaa), TOBN(0xb2818fa5, 0x1e85956a)}}, + {{TOBN(0x1e6addda, 0xf176f2c0), TOBN(0x01ca4604, 0xe2572658), + TOBN(0x0a404ded, 0x85342ffb), TOBN(0x8cf60f96, 0x441838d6)}, + {TOBN(0x9bbc691c, 0xc9071c4a), TOBN(0xfd588744, 0x34442803), + TOBN(0x97101c85, 0x809c0d81), TOBN(0xa7fb754c, 0x8c456f7f)}}, + {{TOBN(0xc95f3c5c, 0xd51805e1), TOBN(0xab4ccd39, 0xb299dca8), + TOBN(0x3e03d20b, 0x47eaf500), TOBN(0xfa3165c1, 0xd7b80893)}, + {TOBN(0x005e8b54, 0xe160e552), TOBN(0xdc4972ba, 0x9019d11f), + TOBN(0x21a6972e, 0x0c9a4a7a), TOBN(0xa52c258f, 0x37840fd7)}}, + {{TOBN(0xf8559ff4, 0xc1e99d81), TOBN(0x08e1a7d6, 0xa3c617c0), + TOBN(0xb398fd43, 0x248c6ba7), TOBN(0x6ffedd91, 0xd1283794)}, + {TOBN(0x8a6a59d2, 0xd629d208), TOBN(0xa9d141d5, 0x3490530e), + TOBN(0x42f6fc18, 0x38505989), TOBN(0x09bf250d, 0x479d94ee)}}, + {{TOBN(0x223ad3b1, 0xb3822790), TOBN(0x6c5926c0, 0x93b8971c), + TOBN(0x609efc7e, 0x75f7fa62), TOBN(0x45d66a6d, 0x1ec2d989)}, + {TOBN(0x4422d663, 0x987d2792), TOBN(0x4a73caad, 0x3eb31d2b), + TOBN(0xf06c2ac1, 0xa32cb9e6), TOBN(0xd9445c5f, 0x91aeba84)}}, + {{TOBN(0x6af7a1d5, 0xaf71013f), TOBN(0xe68216e5, 0x0bedc946), + TOBN(0xf4cba30b, 0xd27370a0), TOBN(0x7981afbf, 0x870421cc)}, + {TOBN(0x02496a67, 0x9449f0e1), TOBN(0x86cfc4be, 0x0a47edae), + TOBN(0x3073c936, 0xb1feca22), TOBN(0xf5694612, 0x03f8f8fb)}}, + {{TOBN(0xd063b723, 0x901515ea), TOBN(0x4c6c77a5, 0x749cf038), + TOBN(0x6361e360, 0xab9e5059), TOBN(0x596cf171, 0xa76a37c0)}, + {TOBN(0x800f53fa, 0x6530ae7a), TOBN(0x0f5e631e, 0x0792a7a6), + TOBN(0x5cc29c24, 0xefdb81c9), TOBN(0xa269e868, 0x3f9c40ba)}}, + {{TOBN(0xec14f9e1, 0x2cb7191e), TOBN(0x78ea1bd8, 0xe5b08ea6), + TOBN(0x3c65aa9b, 0x46332bb9), TOBN(0x84cc22b3, 0xbf80ce25)}, + {TOBN(0x0098e9e9, 0xd49d5bf1), TOBN(0xcd4ec1c6, 0x19087da4), + TOBN(0x3c9d07c5, 0xaef6e357), TOBN(0x839a0268, 0x9f8f64b8)}}, + {{TOBN(0xc5e9eb62, 0xc6d8607f), TOBN(0x759689f5, 0x6aa995e4), + TOBN(0x70464669, 0xbbb48317), TOBN(0x921474bf, 0xe402417d)}, + {TOBN(0xcabe135b, 0x2a354c8c), TOBN(0xd51e52d2, 0x812fa4b5), + TOBN(0xec741096, 0x53311fe8), TOBN(0x4f774535, 0xb864514b)}}, + {{TOBN(0xbcadd671, 0x5bde48f8), TOBN(0xc9703873, 0x2189bc7d), + TOBN(0x5d45299e, 0xc709ee8a), TOBN(0xd1287ee2, 0x845aaff8)}, + {TOBN(0x7d1f8874, 0xdb1dbf1f), TOBN(0xea46588b, 0x990c88d6), + TOBN(0x60ba649a, 0x84368313), TOBN(0xd5fdcbce, 0x60d543ae)}}, + {{TOBN(0x90b46d43, 0x810d5ab0), TOBN(0x6739d8f9, 0x04d7e5cc), + TOBN(0x021c1a58, 0x0d337c33), TOBN(0x00a61162, 0x68e67c40)}, + {TOBN(0x95ef413b, 0x379f0a1f), TOBN(0xfe126605, 0xe9e2ab95), + TOBN(0x67578b85, 0x2f5f199c), TOBN(0xf5c00329, 0x2cb84913)}}, + {{TOBN(0xf7956430, 0x37577dd8), TOBN(0x83b82af4, 0x29c5fe88), + TOBN(0x9c1bea26, 0xcdbdc132), TOBN(0x589fa086, 0x9c04339e)}, + {TOBN(0x033e9538, 0xb13799df), TOBN(0x85fa8b21, 0xd295d034), + TOBN(0xdf17f73f, 0xbd9ddcca), TOBN(0xf32bd122, 0xddb66334)}}, + {{TOBN(0x55ef88a7, 0x858b044c), TOBN(0x1f0d69c2, 0x5aa9e397), + TOBN(0x55fd9cc3, 0x40d85559), TOBN(0xc774df72, 0x7785ddb2)}, + {TOBN(0x5dcce9f6, 0xd3bd2e1c), TOBN(0xeb30da20, 0xa85dfed0), + TOBN(0x5ed7f5bb, 0xd3ed09c4), TOBN(0x7d42a35c, 0x82a9c1bd)}}, + {{TOBN(0xcf3de995, 0x9890272d), TOBN(0x75f3432a, 0x3e713a10), + TOBN(0x5e13479f, 0xe28227b8), TOBN(0xb8561ea9, 0xfefacdc8)}, + {TOBN(0xa6a297a0, 0x8332aafd), TOBN(0x9b0d8bb5, 0x73809b62), + TOBN(0xd2fa1cfd, 0x0c63036f), TOBN(0x7a16eb55, 0xbd64bda8)}}, + {{TOBN(0x3f5cf5f6, 
0x78e62ddc), TOBN(0x2267c454, 0x07fd752b), + TOBN(0x5e361b6b, 0x5e437bbe), TOBN(0x95c59501, 0x8354e075)}, + {TOBN(0xec725f85, 0xf2b254d9), TOBN(0x844b617d, 0x2cb52b4e), + TOBN(0xed8554f5, 0xcf425fb5), TOBN(0xab67703e, 0x2af9f312)}}, + {{TOBN(0x4cc34ec1, 0x3cf48283), TOBN(0xb09daa25, 0x9c8a705e), + TOBN(0xd1e9d0d0, 0x5b7d4f84), TOBN(0x4df6ef64, 0xdb38929d)}, + {TOBN(0xe16b0763, 0xaa21ba46), TOBN(0xc6b1d178, 0xa293f8fb), + TOBN(0x0ff5b602, 0xd520aabf), TOBN(0x94d671bd, 0xc339397a)}}, + {{TOBN(0x7c7d98cf, 0x4f5792fa), TOBN(0x7c5e0d67, 0x11215261), + TOBN(0x9b19a631, 0xa7c5a6d4), TOBN(0xc8511a62, 0x7a45274d)}, + {TOBN(0x0c16621c, 0xa5a60d99), TOBN(0xf7fbab88, 0xcf5e48cb), + TOBN(0xab1e6ca2, 0xf7ddee08), TOBN(0x83bd08ce, 0xe7867f3c)}}, + {{TOBN(0xf7e48e8a, 0x2ac13e27), TOBN(0x4494f6df, 0x4eb1a9f5), + TOBN(0xedbf84eb, 0x981f0a62), TOBN(0x49badc32, 0x536438f0)}, + {TOBN(0x50bea541, 0x004f7571), TOBN(0xbac67d10, 0xdf1c94ee), + TOBN(0x253d73a1, 0xb727bc31), TOBN(0xb3d01cf2, 0x30686e28)}}, + {{TOBN(0x51b77b1b, 0x55fd0b8b), TOBN(0xa099d183, 0xfeec3173), + TOBN(0x202b1fb7, 0x670e72b7), TOBN(0xadc88b33, 0xa8e1635f)}, + {TOBN(0x34e8216a, 0xf989d905), TOBN(0xc2e68d20, 0x29b58d01), + TOBN(0x11f81c92, 0x6fe55a93), TOBN(0x15f1462a, 0x8f296f40)}}, + {{TOBN(0x1915d375, 0xea3d62f2), TOBN(0xa17765a3, 0x01c8977d), + TOBN(0x7559710a, 0xe47b26f6), TOBN(0xe0bd29c8, 0x535077a5)}, + {TOBN(0x615f976d, 0x08d84858), TOBN(0x370dfe85, 0x69ced5c1), + TOBN(0xbbc7503c, 0xa734fa56), TOBN(0xfbb9f1ec, 0x91ac4574)}}, + {{TOBN(0x95d7ec53, 0x060dd7ef), TOBN(0xeef2dacd, 0x6e657979), + TOBN(0x54511af3, 0xe2a08235), TOBN(0x1e324aa4, 0x1f4aea3d)}, + {TOBN(0x550e7e71, 0xe6e67671), TOBN(0xbccd5190, 0xbf52faf7), + TOBN(0xf880d316, 0x223cc62a), TOBN(0x0d402c7e, 0x2b32eb5d)}}, + {{TOBN(0xa40bc039, 0x306a5a3b), TOBN(0x4e0a41fd, 0x96783a1b), + TOBN(0xa1e8d39a, 0x0253cdd4), TOBN(0x6480be26, 0xc7388638)}, + {TOBN(0xee365e1d, 0x2285f382), TOBN(0x188d8d8f, 0xec0b5c36), + TOBN(0x34ef1a48, 0x1f0f4d82), TOBN(0x1a8f43e1, 0xa487d29a)}}, + {{TOBN(0x8168226d, 0x77aefb3a), TOBN(0xf69a751e, 0x1e72c253), + TOBN(0x8e04359a, 0xe9594df1), TOBN(0x475ffd7d, 0xd14c0467)}, + {TOBN(0xb5a2c2b1, 0x3844e95c), TOBN(0x85caf647, 0xdd12ef94), + TOBN(0x1ecd2a9f, 0xf1063d00), TOBN(0x1dd2e229, 0x23843311)}}, + {{TOBN(0x38f0e09d, 0x73d17244), TOBN(0x3ede7746, 0x8fc653f1), + TOBN(0xae4459f5, 0xdc20e21c), TOBN(0x00db2ffa, 0x6a8599ea)}, + {TOBN(0x11682c39, 0x30cfd905), TOBN(0x4934d074, 0xa5c112a6), + TOBN(0xbdf063c5, 0x568bfe95), TOBN(0x779a440a, 0x016c441a)}}, + {{TOBN(0x0c23f218, 0x97d6fbdc), TOBN(0xd3a5cd87, 0xe0776aac), + TOBN(0xcee37f72, 0xd712e8db), TOBN(0xfb28c70d, 0x26f74e8d)}, + {TOBN(0xffe0c728, 0xb61301a0), TOBN(0xa6282168, 0xd3724354), + TOBN(0x7ff4cb00, 0x768ffedc), TOBN(0xc51b3088, 0x03b02de9)}}, + {{TOBN(0xa5a8147c, 0x3902dda5), TOBN(0x35d2f706, 0xfe6973b4), + TOBN(0x5ac2efcf, 0xc257457e), TOBN(0x933f48d4, 0x8700611b)}, + {TOBN(0xc365af88, 0x4912beb2), TOBN(0x7f5a4de6, 0x162edf94), + TOBN(0xc646ba7c, 0x0c32f34b), TOBN(0x632c6af3, 0xb2091074)}}, + {{TOBN(0x58d4f2e3, 0x753e43a9), TOBN(0x70e1d217, 0x24d4e23f), + TOBN(0xb24bf729, 0xafede6a6), TOBN(0x7f4a94d8, 0x710c8b60)}, + {TOBN(0xaad90a96, 0x8d4faa6a), TOBN(0xd9ed0b32, 0xb066b690), + TOBN(0x52fcd37b, 0x78b6dbfd), TOBN(0x0b64615e, 0x8bd2b431)}}, + {{TOBN(0x228e2048, 0xcfb9fad5), TOBN(0xbeaa386d, 0x240b76bd), + TOBN(0x2d6681c8, 0x90dad7bc), TOBN(0x3e553fc3, 0x06d38f5e)}, + {TOBN(0xf27cdb9b, 0x9d5f9750), TOBN(0x3e85c52a, 0xd28c5b0e), + TOBN(0x190795af, 0x5247c39b), TOBN(0x547831eb, 0xbddd6828)}}, + 
{{TOBN(0xf327a227, 0x4a82f424), TOBN(0x36919c78, 0x7e47f89d), + TOBN(0xe4783919, 0x43c7392c), TOBN(0xf101b9aa, 0x2316fefe)}, + {TOBN(0xbcdc9e9c, 0x1c5009d2), TOBN(0xfb55ea13, 0x9cd18345), + TOBN(0xf5b5e231, 0xa3ce77c7), TOBN(0xde6b4527, 0xd2f2cb3d)}}, + {{TOBN(0x10f6a333, 0x9bb26f5f), TOBN(0x1e85db8e, 0x044d85b6), + TOBN(0xc3697a08, 0x94197e54), TOBN(0x65e18cc0, 0xa7cb4ea8)}, + {TOBN(0xa38c4f50, 0xa471fe6e), TOBN(0xf031747a, 0x2f13439c), + TOBN(0x53c4a6ba, 0xc007318b), TOBN(0xa8da3ee5, 0x1deccb3d)}}, + {{TOBN(0x0555b31c, 0x558216b1), TOBN(0x90c7810c, 0x2f79e6c2), + TOBN(0x9b669f4d, 0xfe8eed3c), TOBN(0x70398ec8, 0xe0fac126)}, + {TOBN(0xa96a449e, 0xf701b235), TOBN(0x0ceecdb3, 0xeb94f395), + TOBN(0x285fc368, 0xd0cb7431), TOBN(0x0d37bb52, 0x16a18c64)}}, + {{TOBN(0x05110d38, 0xb880d2dd), TOBN(0xa60f177b, 0x65930d57), + TOBN(0x7da34a67, 0xf36235f5), TOBN(0x47f5e17c, 0x183816b9)}, + {TOBN(0xc7664b57, 0xdb394af4), TOBN(0x39ba215d, 0x7036f789), + TOBN(0x46d2ca0e, 0x2f27b472), TOBN(0xc42647ee, 0xf73a84b7)}}, + {{TOBN(0x44bc7545, 0x64488f1d), TOBN(0xaa922708, 0xf4cf85d5), + TOBN(0x721a01d5, 0x53e4df63), TOBN(0x649c0c51, 0x5db46ced)}, + {TOBN(0x6bf0d64e, 0x3cffcb6c), TOBN(0xe3bf93fe, 0x50f71d96), + TOBN(0x75044558, 0xbcc194a0), TOBN(0x16ae3372, 0x6afdc554)}}, + {{TOBN(0xbfc01adf, 0x5ca48f3f), TOBN(0x64352f06, 0xe22a9b84), + TOBN(0xcee54da1, 0xc1099e4a), TOBN(0xbbda54e8, 0xfa1b89c0)}, + {TOBN(0x166a3df5, 0x6f6e55fb), TOBN(0x1ca44a24, 0x20176f88), + TOBN(0x936afd88, 0xdfb7b5ff), TOBN(0xe34c2437, 0x8611d4a0)}}, + {{TOBN(0x7effbb75, 0x86142103), TOBN(0x6704ba1b, 0x1f34fc4d), + TOBN(0x7c2a468f, 0x10c1b122), TOBN(0x36b3a610, 0x8c6aace9)}, + {TOBN(0xabfcc0a7, 0x75a0d050), TOBN(0x066f9197, 0x3ce33e32), + TOBN(0xce905ef4, 0x29fe09be), TOBN(0x89ee25ba, 0xa8376351)}}, + {{TOBN(0x2a3ede22, 0xfd29dc76), TOBN(0x7fd32ed9, 0x36f17260), + TOBN(0x0cadcf68, 0x284b4126), TOBN(0x63422f08, 0xa7951fc8)}, + {TOBN(0x562b24f4, 0x0807e199), TOBN(0xfe9ce5d1, 0x22ad4490), + TOBN(0xc2f51b10, 0x0db2b1b4), TOBN(0xeb3613ff, 0xe4541d0d)}}, + {{TOBN(0xbd2c4a05, 0x2680813b), TOBN(0x527aa55d, 0x561b08d6), + TOBN(0xa9f8a40e, 0xa7205558), TOBN(0xe3eea56f, 0x243d0bec)}, + {TOBN(0x7b853817, 0xa0ff58b3), TOBN(0xb67d3f65, 0x1a69e627), + TOBN(0x0b76bbb9, 0xa869b5d6), TOBN(0xa3afeb82, 0x546723ed)}}, + {{TOBN(0x5f24416d, 0x3e554892), TOBN(0x8413b53d, 0x430e2a45), + TOBN(0x99c56aee, 0x9032a2a0), TOBN(0x09432bf6, 0xeec367b1)}, + {TOBN(0x552850c6, 0xdaf0ecc1), TOBN(0x49ebce55, 0x5bc92048), + TOBN(0xdfb66ba6, 0x54811307), TOBN(0x1b84f797, 0x6f298597)}}, + {{TOBN(0x79590481, 0x8d1d7a0d), TOBN(0xd9fabe03, 0x3a6fa556), + TOBN(0xa40f9c59, 0xba9e5d35), TOBN(0xcb1771c1, 0xf6247577)}, + {TOBN(0x542a47ca, 0xe9a6312b), TOBN(0xa34b3560, 0x552dd8c5), + TOBN(0xfdf94de0, 0x0d794716), TOBN(0xd46124a9, 0x9c623094)}}, + {{TOBN(0x56b7435d, 0x68afe8b4), TOBN(0x27f20540, 0x6c0d8ea1), + TOBN(0x12b77e14, 0x73186898), TOBN(0xdbc3dd46, 0x7479490f)}, + {TOBN(0x951a9842, 0xc03b0c05), TOBN(0x8b1b3bb3, 0x7921bc96), + TOBN(0xa573b346, 0x2b202e0a), TOBN(0x77e4665d, 0x47254d56)}}, + {{TOBN(0x08b70dfc, 0xd23e3984), TOBN(0xab86e8bc, 0xebd14236), + TOBN(0xaa3e07f8, 0x57114ba7), TOBN(0x5ac71689, 0xab0ef4f2)}, + {TOBN(0x88fca384, 0x0139d9af), TOBN(0x72733f88, 0x76644af0), + TOBN(0xf122f72a, 0x65d74f4a), TOBN(0x13931577, 0xa5626c7a)}}, + {{TOBN(0xd5b5d9eb, 0x70f8d5a4), TOBN(0x375adde7, 0xd7bbb228), + TOBN(0x31e88b86, 0x0c1c0b32), TOBN(0xd1f568c4, 0x173edbaa)}, + {TOBN(0x1592fc83, 0x5459df02), TOBN(0x2beac0fb, 0x0fcd9a7e), + TOBN(0xb0a6fdb8, 0x1b473b0a), TOBN(0xe3224c6f, 0x0fe8fc48)}}, 
+ {{TOBN(0x680bd00e, 0xe87edf5b), TOBN(0x30385f02, 0x20e77cf5), + TOBN(0xe9ab98c0, 0x4d42d1b2), TOBN(0x72d191d2, 0xd3816d77)}, + {TOBN(0x1564daca, 0x0917d9e5), TOBN(0x394eab59, 0x1f8fed7f), + TOBN(0xa209aa8d, 0x7fbb3896), TOBN(0x5564f3b9, 0xbe6ac98e)}}, + {{TOBN(0xead21d05, 0xd73654ef), TOBN(0x68d1a9c4, 0x13d78d74), + TOBN(0x61e01708, 0x6d4973a0), TOBN(0x83da3500, 0x46e6d32a)}, + {TOBN(0x6a3dfca4, 0x68ae0118), TOBN(0xa1b9a4c9, 0xd02da069), + TOBN(0x0b2ff9c7, 0xebab8302), TOBN(0x98af07c3, 0x944ba436)}}, + {{TOBN(0x85997326, 0x995f0f9f), TOBN(0x467fade0, 0x71b58bc6), + TOBN(0x47e4495a, 0xbd625a2b), TOBN(0xfdd2d01d, 0x33c3b8cd)}, + {TOBN(0x2c38ae28, 0xc693f9fa), TOBN(0x48622329, 0x348f7999), + TOBN(0x97bf738e, 0x2161f583), TOBN(0x15ee2fa7, 0x565e8cc9)}}, + {{TOBN(0xa1a5c845, 0x5777e189), TOBN(0xcc10bee0, 0x456f2829), + TOBN(0x8ad95c56, 0xda762bd5), TOBN(0x152e2214, 0xe9d91da8)}, + {TOBN(0x975b0e72, 0x7cb23c74), TOBN(0xfd5d7670, 0xa90c66df), + TOBN(0xb5b5b8ad, 0x225ffc53), TOBN(0xab6dff73, 0xfaded2ae)}}, + {{TOBN(0xebd56781, 0x6f4cbe9d), TOBN(0x0ed8b249, 0x6a574bd7), + TOBN(0x41c246fe, 0x81a881fa), TOBN(0x91564805, 0xc3db9c70)}, + {TOBN(0xd7c12b08, 0x5b862809), TOBN(0x1facd1f1, 0x55858d7b), + TOBN(0x7693747c, 0xaf09e92a), TOBN(0x3b69dcba, 0x189a425f)}}, + {{TOBN(0x0be28e9f, 0x967365ef), TOBN(0x57300eb2, 0xe801f5c9), + TOBN(0x93b8ac6a, 0xd583352f), TOBN(0xa2cf1f89, 0xcd05b2b7)}, + {TOBN(0x7c0c9b74, 0x4dcc40cc), TOBN(0xfee38c45, 0xada523fb), + TOBN(0xb49a4dec, 0x1099cc4d), TOBN(0x325c377f, 0x69f069c6)}}, + {{TOBN(0xe12458ce, 0x476cc9ff), TOBN(0x580e0b6c, 0xc6d4cb63), + TOBN(0xd561c8b7, 0x9072289b), TOBN(0x0377f264, 0xa619e6da)}, + {TOBN(0x26685362, 0x88e591a5), TOBN(0xa453a7bd, 0x7523ca2b), + TOBN(0x8a9536d2, 0xc1df4533), TOBN(0xc8e50f2f, 0xbe972f79)}}, + {{TOBN(0xd433e50f, 0x6d3549cf), TOBN(0x6f33696f, 0xfacd665e), + TOBN(0x695bfdac, 0xce11fcb4), TOBN(0x810ee252, 0xaf7c9860)}, + {TOBN(0x65450fe1, 0x7159bb2c), TOBN(0xf7dfbebe, 0x758b357b), + TOBN(0x2b057e74, 0xd69fea72), TOBN(0xd485717a, 0x92731745)}}}, + {{{TOBN(0x896c42e8, 0xee36860c), TOBN(0xdaf04dfd, 0x4113c22d), + TOBN(0x1adbb7b7, 0x44104213), TOBN(0xe5fd5fa1, 0x1fd394ea)}, + {TOBN(0x68235d94, 0x1a4e0551), TOBN(0x6772cfbe, 0x18d10151), + TOBN(0x276071e3, 0x09984523), TOBN(0xe4e879de, 0x5a56ba98)}}, + {{TOBN(0xaaafafb0, 0x285b9491), TOBN(0x01a0be88, 0x1e4c705e), + TOBN(0xff1d4f5d, 0x2ad9caab), TOBN(0x6e349a4a, 0xc37a233f)}, + {TOBN(0xcf1c1246, 0x4a1c6a16), TOBN(0xd99e6b66, 0x29383260), + TOBN(0xea3d4366, 0x5f6d5471), TOBN(0x36974d04, 0xff8cc89b)}}, + {{TOBN(0xc26c49a1, 0xcfe89d80), TOBN(0xb42c026d, 0xda9c8371), + TOBN(0xca6c013a, 0xdad066d2), TOBN(0xfb8f7228, 0x56a4f3ee)}, + {TOBN(0x08b579ec, 0xd850935b), TOBN(0x34c1a74c, 0xd631e1b3), + TOBN(0xcb5fe596, 0xac198534), TOBN(0x39ff21f6, 0xe1f24f25)}}, + {{TOBN(0x27f29e14, 0x8f929057), TOBN(0x7a64ae06, 0xc0c853df), + TOBN(0x256cd183, 0x58e9c5ce), TOBN(0x9d9cce82, 0xded092a5)}, + {TOBN(0xcc6e5979, 0x6e93b7c7), TOBN(0xe1e47092, 0x31bb9e27), + TOBN(0xb70b3083, 0xaa9e29a0), TOBN(0xbf181a75, 0x3785e644)}}, + {{TOBN(0xf53f2c65, 0x8ead09f7), TOBN(0x1335e1d5, 0x9780d14d), + TOBN(0x69cc20e0, 0xcd1b66bc), TOBN(0x9b670a37, 0xbbe0bfc8)}, + {TOBN(0xce53dc81, 0x28efbeed), TOBN(0x0c74e77c, 0x8326a6e5), + TOBN(0x3604e0d2, 0xb88e9a63), TOBN(0xbab38fca, 0x13dc2248)}}, + {{TOBN(0x8ed6e8c8, 0x5c0a3f1e), TOBN(0xbcad2492, 0x7c87c37f), + TOBN(0xfdfb62bb, 0x9ee3b78d), TOBN(0xeba8e477, 0xcbceba46)}, + {TOBN(0x37d38cb0, 0xeeaede4b), TOBN(0x0bc498e8, 0x7976deb6), + TOBN(0xb2944c04, 0x6b6147fb), TOBN(0x8b123f35, 
0xf71f9609)}}, + {{TOBN(0xa155dcc7, 0xde79dc24), TOBN(0xf1168a32, 0x558f69cd), + TOBN(0xbac21595, 0x0d1850df), TOBN(0x15c8295b, 0xb204c848)}, + {TOBN(0xf661aa36, 0x7d8184ff), TOBN(0xc396228e, 0x30447bdb), + TOBN(0x11cd5143, 0xbde4a59e), TOBN(0xe3a26e3b, 0x6beab5e6)}}, + {{TOBN(0xd3b3a13f, 0x1402b9d0), TOBN(0x573441c3, 0x2c7bc863), + TOBN(0x4b301ec4, 0x578c3e6e), TOBN(0xc26fc9c4, 0x0adaf57e)}, + {TOBN(0x96e71bfd, 0x7493cea3), TOBN(0xd05d4b3f, 0x1af81456), + TOBN(0xdaca2a8a, 0x6a8c608f), TOBN(0x53ef07f6, 0x0725b276)}}, + {{TOBN(0x07a5fbd2, 0x7824fc56), TOBN(0x34675218, 0x13289077), + TOBN(0x5bf69fd5, 0xe0c48349), TOBN(0xa613ddd3, 0xb6aa7875)}, + {TOBN(0x7f78c19c, 0x5450d866), TOBN(0x46f4409c, 0x8f84a481), + TOBN(0x9f1d1928, 0x90fce239), TOBN(0x016c4168, 0xb2ce44b9)}}, + {{TOBN(0xbae023f0, 0xc7435978), TOBN(0xb152c888, 0x20e30e19), + TOBN(0x9c241645, 0xe3fa6faf), TOBN(0x735d95c1, 0x84823e60)}, + {TOBN(0x03197573, 0x03955317), TOBN(0x0b4b02a9, 0xf03b4995), + TOBN(0x076bf559, 0x70274600), TOBN(0x32c5cc53, 0xaaf57508)}}, + {{TOBN(0xe8af6d1f, 0x60624129), TOBN(0xb7bc5d64, 0x9a5e2b5e), + TOBN(0x3814b048, 0x5f082d72), TOBN(0x76f267f2, 0xce19677a)}, + {TOBN(0x626c630f, 0xb36eed93), TOBN(0x55230cd7, 0x3bf56803), + TOBN(0x78837949, 0xce2736a0), TOBN(0x0d792d60, 0xaa6c55f1)}}, + {{TOBN(0x0318dbfd, 0xd5c7c5d2), TOBN(0xb38f8da7, 0x072b342d), + TOBN(0x3569bddc, 0x7b8de38a), TOBN(0xf25b5887, 0xa1c94842)}, + {TOBN(0xb2d5b284, 0x2946ad60), TOBN(0x854f29ad, 0xe9d1707e), + TOBN(0xaa5159dc, 0x2c6a4509), TOBN(0x899f94c0, 0x57189837)}}, + {{TOBN(0xcf6adc51, 0xf4a55b03), TOBN(0x261762de, 0x35e3b2d5), + TOBN(0x4cc43012, 0x04827b51), TOBN(0xcd22a113, 0xc6021442)}, + {TOBN(0xce2fd61a, 0x247c9569), TOBN(0x59a50973, 0xd152beca), + TOBN(0x6c835a11, 0x63a716d4), TOBN(0xc26455ed, 0x187dedcf)}}, + {{TOBN(0x27f536e0, 0x49ce89e7), TOBN(0x18908539, 0xcc890cb5), + TOBN(0x308909ab, 0xd83c2aa1), TOBN(0xecd3142b, 0x1ab73bd3)}, + {TOBN(0x6a85bf59, 0xb3f5ab84), TOBN(0x3c320a68, 0xf2bea4c6), + TOBN(0xad8dc538, 0x6da4541f), TOBN(0xeaf34eb0, 0xb7c41186)}}, + {{TOBN(0x1c780129, 0x977c97c4), TOBN(0x5ff9beeb, 0xc57eb9fa), + TOBN(0xa24d0524, 0xc822c478), TOBN(0xfd8eec2a, 0x461cd415)}, + {TOBN(0xfbde194e, 0xf027458c), TOBN(0xb4ff5319, 0x1d1be115), + TOBN(0x63f874d9, 0x4866d6f4), TOBN(0x35c75015, 0xb21ad0c9)}}, + {{TOBN(0xa6b5c9d6, 0x46ac49d2), TOBN(0x42c77c0b, 0x83137aa9), + TOBN(0x24d000fc, 0x68225a38), TOBN(0x0f63cfc8, 0x2fe1e907)}, + {TOBN(0x22d1b01b, 0xc6441f95), TOBN(0x7d38f719, 0xec8e448f), + TOBN(0x9b33fa5f, 0x787fb1ba), TOBN(0x94dcfda1, 0x190158df)}}, + {{TOBN(0xc47cb339, 0x5f6d4a09), TOBN(0x6b4f355c, 0xee52b826), + TOBN(0x3d100f5d, 0xf51b930a), TOBN(0xf4512fac, 0x9f668f69)}, + {TOBN(0x546781d5, 0x206c4c74), TOBN(0xd021d4d4, 0xcb4d2e48), + TOBN(0x494a54c2, 0xca085c2d), TOBN(0xf1dbaca4, 0x520850a8)}}, + {{TOBN(0x63c79326, 0x490a1aca), TOBN(0xcb64dd9c, 0x41526b02), + TOBN(0xbb772591, 0xa2979258), TOBN(0x3f582970, 0x48d97846)}, + {TOBN(0xd66b70d1, 0x7c213ba7), TOBN(0xc28febb5, 0xe8a0ced4), + TOBN(0x6b911831, 0xc10338c1), TOBN(0x0d54e389, 0xbf0126f3)}}, + {{TOBN(0x7048d460, 0x4af206ee), TOBN(0x786c88f6, 0x77e97cb9), + TOBN(0xd4375ae1, 0xac64802e), TOBN(0x469bcfe1, 0xd53ec11c)}, + {TOBN(0xfc9b340d, 0x47062230), TOBN(0xe743bb57, 0xc5b4a3ac), + TOBN(0xfe00b4aa, 0x59ef45ac), TOBN(0x29a4ef23, 0x59edf188)}}, + {{TOBN(0x40242efe, 0xb483689b), TOBN(0x2575d3f6, 0x513ac262), + TOBN(0xf30037c8, 0x0ca6db72), TOBN(0xc9fcce82, 0x98864be2)}, + {TOBN(0x84a112ff, 0x0149362d), TOBN(0x95e57582, 0x1c4ae971), + TOBN(0x1fa4b1a8, 0x945cf86c), 
TOBN(0x4525a734, 0x0b024a2f)}}, + {{TOBN(0xe76c8b62, 0x8f338360), TOBN(0x483ff593, 0x28edf32b), + TOBN(0x67e8e90a, 0x298b1aec), TOBN(0x9caab338, 0x736d9a21)}, + {TOBN(0x5c09d2fd, 0x66892709), TOBN(0x2496b4dc, 0xb55a1d41), + TOBN(0x93f5fb1a, 0xe24a4394), TOBN(0x08c75049, 0x6fa8f6c1)}}, + {{TOBN(0xcaead1c2, 0xc905d85f), TOBN(0xe9d7f790, 0x0733ae57), + TOBN(0x24c9a65c, 0xf07cdd94), TOBN(0x7389359c, 0xa4b55931)}, + {TOBN(0xf58709b7, 0x367e45f7), TOBN(0x1f203067, 0xcb7e7adc), + TOBN(0x82444bff, 0xc7b72818), TOBN(0x07303b35, 0xbaac8033)}}, + {{TOBN(0x1e1ee4e4, 0xd13b7ea1), TOBN(0xe6489b24, 0xe0e74180), + TOBN(0xa5f2c610, 0x7e70ef70), TOBN(0xa1655412, 0xbdd10894)}, + {TOBN(0x555ebefb, 0x7af4194e), TOBN(0x533c1c3c, 0x8e89bd9c), + TOBN(0x735b9b57, 0x89895856), TOBN(0x15fb3cd2, 0x567f5c15)}}, + {{TOBN(0x057fed45, 0x526f09fd), TOBN(0xe8a4f10c, 0x8128240a), + TOBN(0x9332efc4, 0xff2bfd8d), TOBN(0x214e77a0, 0xbd35aa31)}, + {TOBN(0x32896d73, 0x14faa40e), TOBN(0x767867ec, 0x01e5f186), + TOBN(0xc9adf8f1, 0x17a1813e), TOBN(0xcb6cda78, 0x54741795)}}, + {{TOBN(0xb7521b6d, 0x349d51aa), TOBN(0xf56b5a9e, 0xe3c7b8e9), + TOBN(0xc6f1e5c9, 0x32a096df), TOBN(0x083667c4, 0xa3635024)}, + {TOBN(0x365ea135, 0x18087f2f), TOBN(0xf1b8eaac, 0xd136e45d), + TOBN(0xc8a0e484, 0x73aec989), TOBN(0xd75a324b, 0x142c9259)}}, + {{TOBN(0xb7b4d001, 0x01dae185), TOBN(0x45434e0b, 0x9b7a94bc), + TOBN(0xf54339af, 0xfbd8cb0b), TOBN(0xdcc4569e, 0xe98ef49e)}, + {TOBN(0x7789318a, 0x09a51299), TOBN(0x81b4d206, 0xb2b025d8), + TOBN(0xf64aa418, 0xfae85792), TOBN(0x3e50258f, 0xacd7baf7)}}, + {{TOBN(0xdce84cdb, 0x2996864b), TOBN(0xa2e67089, 0x1f485fa4), + TOBN(0xb28b2bb6, 0x534c6a5a), TOBN(0x31a7ec6b, 0xc94b9d39)}, + {TOBN(0x1d217766, 0xd6bc20da), TOBN(0x4acdb5ec, 0x86761190), + TOBN(0x68726328, 0x73701063), TOBN(0x4d24ee7c, 0x2128c29b)}}, + {{TOBN(0xc072ebd3, 0xa19fd868), TOBN(0x612e481c, 0xdb8ddd3b), + TOBN(0xb4e1d754, 0x1a64d852), TOBN(0x00ef95ac, 0xc4c6c4ab)}, + {TOBN(0x1536d2ed, 0xaa0a6c46), TOBN(0x61294086, 0x43774790), + TOBN(0x54af25e8, 0x343fda10), TOBN(0x9ff9d98d, 0xfd25d6f2)}}, + {{TOBN(0x0746af7c, 0x468b8835), TOBN(0x977a31cb, 0x730ecea7), + TOBN(0xa5096b80, 0xc2cf4a81), TOBN(0xaa986833, 0x6458c37a)}, + {TOBN(0x6af29bf3, 0xa6bd9d34), TOBN(0x6a62fe9b, 0x33c5d854), + TOBN(0x50e6c304, 0xb7133b5e), TOBN(0x04b60159, 0x7d6e6848)}}, + {{TOBN(0x4cd296df, 0x5579bea4), TOBN(0x10e35ac8, 0x5ceedaf1), + TOBN(0x04c4c5fd, 0xe3bcc5b1), TOBN(0x95f9ee8a, 0x89412cf9)}, + {TOBN(0x2c9459ee, 0x82b6eb0f), TOBN(0x2e845765, 0x95c2aadd), + TOBN(0x774a84ae, 0xd327fcfe), TOBN(0xd8c93722, 0x0368d476)}}, + {{TOBN(0x0dbd5748, 0xf83e8a3b), TOBN(0xa579aa96, 0x8d2495f3), + TOBN(0x535996a0, 0xae496e9b), TOBN(0x07afbfe9, 0xb7f9bcc2)}, + {TOBN(0x3ac1dc6d, 0x5b7bd293), TOBN(0x3b592cff, 0x7022323d), + TOBN(0xba0deb98, 0x9c0a3e76), TOBN(0x18e78e9f, 0x4b197acb)}}, + {{TOBN(0x211cde10, 0x296c36ef), TOBN(0x7ee89672, 0x82c4da77), + TOBN(0xb617d270, 0xa57836da), TOBN(0xf0cd9c31, 0x9cb7560b)}, + {TOBN(0x01fdcbf7, 0xe455fe90), TOBN(0x3fb53cbb, 0x7e7334f3), + TOBN(0x781e2ea4, 0x4e7de4ec), TOBN(0x8adab3ad, 0x0b384fd0)}}, + {{TOBN(0x129eee2f, 0x53d64829), TOBN(0x7a471e17, 0xa261492b), + TOBN(0xe4f9adb9, 0xe4cb4a2c), TOBN(0x3d359f6f, 0x97ba2c2d)}, + {TOBN(0x346c6786, 0x0aacd697), TOBN(0x92b444c3, 0x75c2f8a8), + TOBN(0xc79fa117, 0xd85df44e), TOBN(0x56782372, 0x398ddf31)}}, + {{TOBN(0x60e690f2, 0xbbbab3b8), TOBN(0x4851f8ae, 0x8b04816b), + TOBN(0xc72046ab, 0x9c92e4d2), TOBN(0x518c74a1, 0x7cf3136b)}, + {TOBN(0xff4eb50a, 0xf9877d4c), TOBN(0x14578d90, 0xa919cabb), + TOBN(0x8218f8c4, 
0xac5eb2b6), TOBN(0xa3ccc547, 0x542016e4)}}, + {{TOBN(0x025bf48e, 0x327f8349), TOBN(0xf3e97346, 0xf43cb641), + TOBN(0xdc2bafdf, 0x500f1085), TOBN(0x57167876, 0x2f063055)}, + {TOBN(0x5bd914b9, 0x411925a6), TOBN(0x7c078d48, 0xa1123de5), + TOBN(0xee6bf835, 0x182b165d), TOBN(0xb11b5e5b, 0xba519727)}}, + {{TOBN(0xe33ea76c, 0x1eea7b85), TOBN(0x2352b461, 0x92d4f85e), + TOBN(0xf101d334, 0xafe115bb), TOBN(0xfabc1294, 0x889175a3)}, + {TOBN(0x7f6bcdc0, 0x5233f925), TOBN(0xe0a802db, 0xe77fec55), + TOBN(0xbdb47b75, 0x8069b659), TOBN(0x1c5e12de, 0xf98fbd74)}}, + {{TOBN(0x869c58c6, 0x4b8457ee), TOBN(0xa5360f69, 0x4f7ea9f7), + TOBN(0xe576c09f, 0xf460b38f), TOBN(0x6b70d548, 0x22b7fb36)}, + {TOBN(0x3fd237f1, 0x3bfae315), TOBN(0x33797852, 0xcbdff369), + TOBN(0x97df25f5, 0x25b516f9), TOBN(0x46f388f2, 0xba38ad2d)}}, + {{TOBN(0x656c4658, 0x89d8ddbb), TOBN(0x8830b26e, 0x70f38ee8), + TOBN(0x4320fd5c, 0xde1212b0), TOBN(0xc34f30cf, 0xe4a2edb2)}, + {TOBN(0xabb131a3, 0x56ab64b8), TOBN(0x7f77f0cc, 0xd99c5d26), + TOBN(0x66856a37, 0xbf981d94), TOBN(0x19e76d09, 0x738bd76e)}}, + {{TOBN(0xe76c8ac3, 0x96238f39), TOBN(0xc0a482be, 0xa830b366), + TOBN(0xb7b8eaff, 0x0b4eb499), TOBN(0x8ecd83bc, 0x4bfb4865)}, + {TOBN(0x971b2cb7, 0xa2f3776f), TOBN(0xb42176a4, 0xf4b88adf), + TOBN(0xb9617df5, 0xbe1fa446), TOBN(0x8b32d508, 0xcd031bd2)}}, + {{TOBN(0x1c6bd47d, 0x53b618c0), TOBN(0xc424f46c, 0x6a227923), + TOBN(0x7303ffde, 0xdd92d964), TOBN(0xe9712878, 0x71b5abf2)}, + {TOBN(0x8f48a632, 0xf815561d), TOBN(0x85f48ff5, 0xd3c055d1), + TOBN(0x222a1427, 0x7525684f), TOBN(0xd0d841a0, 0x67360cc3)}}, + {{TOBN(0x4245a926, 0x0b9267c6), TOBN(0xc78913f1, 0xcf07f863), + TOBN(0xaa844c8e, 0x4d0d9e24), TOBN(0xa42ad522, 0x3d5f9017)}, + {TOBN(0xbd371749, 0xa2c989d5), TOBN(0x928292df, 0xe1f5e78e), + TOBN(0x493b383e, 0x0a1ea6da), TOBN(0x5136fd8d, 0x13aee529)}}, + {{TOBN(0x860c44b1, 0xf2c34a99), TOBN(0x3b00aca4, 0xbf5855ac), + TOBN(0xabf6aaa0, 0xfaaf37be), TOBN(0x65f43682, 0x2a53ec08)}, + {TOBN(0x1d9a5801, 0xa11b12e1), TOBN(0x78a7ab2c, 0xe20ed475), + TOBN(0x0de1067e, 0x9a41e0d5), TOBN(0x30473f5f, 0x305023ea)}}, + {{TOBN(0xdd3ae09d, 0x169c7d97), TOBN(0x5cd5baa4, 0xcfaef9cd), + TOBN(0x5cd7440b, 0x65a44803), TOBN(0xdc13966a, 0x47f364de)}, + {TOBN(0x077b2be8, 0x2b8357c1), TOBN(0x0cb1b4c5, 0xe9d57c2a), + TOBN(0x7a4ceb32, 0x05ff363e), TOBN(0xf310fa4d, 0xca35a9ef)}}, + {{TOBN(0xdbb7b352, 0xf97f68c6), TOBN(0x0c773b50, 0x0b02cf58), + TOBN(0xea2e4821, 0x3c1f96d9), TOBN(0xffb357b0, 0xeee01815)}, + {TOBN(0xb9c924cd, 0xe0f28039), TOBN(0x0b36c95a, 0x46a3fbe4), + TOBN(0x1faaaea4, 0x5e46db6c), TOBN(0xcae575c3, 0x1928aaff)}}, + {{TOBN(0x7f671302, 0xa70dab86), TOBN(0xfcbd12a9, 0x71c58cfc), + TOBN(0xcbef9acf, 0xbee0cb92), TOBN(0x573da0b9, 0xf8c1b583)}, + {TOBN(0x4752fcfe, 0x0d41d550), TOBN(0xe7eec0e3, 0x2155cffe), + TOBN(0x0fc39fcb, 0x545ae248), TOBN(0x522cb8d1, 0x8065f44e)}}, + {{TOBN(0x263c962a, 0x70cbb96c), TOBN(0xe034362a, 0xbcd124a9), + TOBN(0xf120db28, 0x3c2ae58d), TOBN(0xb9a38d49, 0xfef6d507)}, + {TOBN(0xb1fd2a82, 0x1ff140fd), TOBN(0xbd162f30, 0x20aee7e0), + TOBN(0x4e17a5d4, 0xcb251949), TOBN(0x2aebcb83, 0x4f7e1c3d)}}, + {{TOBN(0x608eb25f, 0x937b0527), TOBN(0xf42e1e47, 0xeb7d9997), + TOBN(0xeba699c4, 0xb8a53a29), TOBN(0x1f921c71, 0xe091b536)}, + {TOBN(0xcce29e7b, 0x5b26bbd5), TOBN(0x7a8ef5ed, 0x3b61a680), + TOBN(0xe5ef8043, 0xba1f1c7e), TOBN(0x16ea8217, 0x18158dda)}}, + {{TOBN(0x01778a2b, 0x599ff0f9), TOBN(0x68a923d7, 0x8104fc6b), + TOBN(0x5bfa44df, 0xda694ff3), TOBN(0x4f7199db, 0xf7667f12)}, + {TOBN(0xc06d8ff6, 0xe46f2a79), TOBN(0x08b5dead, 0xe9f8131d), + 
TOBN(0x02519a59, 0xabb4ce7c), TOBN(0xc4f710bc, 0xb42aec3e)}}, + {{TOBN(0x3d77b057, 0x78bde41a), TOBN(0x6474bf80, 0xb4186b5a), + TOBN(0x048b3f67, 0x88c65741), TOBN(0xc64519de, 0x03c7c154)}, + {TOBN(0xdf073846, 0x0edfcc4f), TOBN(0x319aa737, 0x48f1aa6b), + TOBN(0x8b9f8a02, 0xca909f77), TOBN(0x90258139, 0x7580bfef)}}, + {{TOBN(0xd8bfd3ca, 0xc0c22719), TOBN(0xc60209e4, 0xc9ca151e), + TOBN(0x7a744ab5, 0xd9a1a69c), TOBN(0x6de5048b, 0x14937f8f)}, + {TOBN(0x171938d8, 0xe115ac04), TOBN(0x7df70940, 0x1c6b16d2), + TOBN(0xa6aeb663, 0x7f8e94e7), TOBN(0xc130388e, 0x2a2cf094)}}, + {{TOBN(0x1850be84, 0x77f54e6e), TOBN(0x9f258a72, 0x65d60fe5), + TOBN(0xff7ff0c0, 0x6c9146d6), TOBN(0x039aaf90, 0xe63a830b)}, + {TOBN(0x38f27a73, 0x9460342f), TOBN(0x4703148c, 0x3f795f8a), + TOBN(0x1bb5467b, 0x9681a97e), TOBN(0x00931ba5, 0xecaeb594)}}, + {{TOBN(0xcdb6719d, 0x786f337c), TOBN(0xd9c01cd2, 0xe704397d), + TOBN(0x0f4a3f20, 0x555c2fef), TOBN(0x00452509, 0x7c0af223)}, + {TOBN(0x54a58047, 0x84db8e76), TOBN(0x3bacf1aa, 0x93c8aa06), + TOBN(0x11ca957c, 0xf7919422), TOBN(0x50641053, 0x78cdaa40)}}, + {{TOBN(0x7a303874, 0x9f7144ae), TOBN(0x170c963f, 0x43d4acfd), + TOBN(0x5e148149, 0x58ddd3ef), TOBN(0xa7bde582, 0x9e72dba8)}, + {TOBN(0x0769da8b, 0x6fa68750), TOBN(0xfa64e532, 0x572e0249), + TOBN(0xfcaadf9d, 0x2619ad31), TOBN(0x87882daa, 0xa7b349cd)}}, + {{TOBN(0x9f6eb731, 0x6c67a775), TOBN(0xcb10471a, 0xefc5d0b1), + TOBN(0xb433750c, 0xe1b806b2), TOBN(0x19c5714d, 0x57b1ae7e)}, + {TOBN(0xc0dc8b7b, 0xed03fd3f), TOBN(0xdd03344f, 0x31bc194e), + TOBN(0xa66c52a7, 0x8c6320b5), TOBN(0x8bc82ce3, 0xd0b6fd93)}}, + {{TOBN(0xf8e13501, 0xb35f1341), TOBN(0xe53156dd, 0x25a43e42), + TOBN(0xd3adf27e, 0x4daeb85c), TOBN(0xb81d8379, 0xbbeddeb5)}, + {TOBN(0x1b0b546e, 0x2e435867), TOBN(0x9020eb94, 0xeba5dd60), + TOBN(0x37d91161, 0x8210cb9d), TOBN(0x4c596b31, 0x5c91f1cf)}}, + {{TOBN(0xb228a90f, 0x0e0b040d), TOBN(0xbaf02d82, 0x45ff897f), + TOBN(0x2aac79e6, 0x00fa6122), TOBN(0x24828817, 0x8e36f557)}, + {TOBN(0xb9521d31, 0x113ec356), TOBN(0x9e48861e, 0x15eff1f8), + TOBN(0x2aa1d412, 0xe0d41715), TOBN(0x71f86203, 0x53f131b8)}}, + {{TOBN(0xf60da8da, 0x3fd19408), TOBN(0x4aa716dc, 0x278d9d99), + TOBN(0x394531f7, 0xa8c51c90), TOBN(0xb560b0e8, 0xf59db51c)}, + {TOBN(0xa28fc992, 0xfa34bdad), TOBN(0xf024fa14, 0x9cd4f8bd), + TOBN(0x5cf530f7, 0x23a9d0d3), TOBN(0x615ca193, 0xe28c9b56)}}, + {{TOBN(0x6d2a483d, 0x6f73c51e), TOBN(0xa4cb2412, 0xea0dc2dd), + TOBN(0x50663c41, 0x1eb917ff), TOBN(0x3d3a74cf, 0xeade299e)}, + {TOBN(0x29b3990f, 0x4a7a9202), TOBN(0xa9bccf59, 0xa7b15c3d), + TOBN(0x66a3ccdc, 0xa5df9208), TOBN(0x48027c14, 0x43f2f929)}}, + {{TOBN(0xd385377c, 0x40b557f0), TOBN(0xe001c366, 0xcd684660), + TOBN(0x1b18ed6b, 0xe2183a27), TOBN(0x879738d8, 0x63210329)}, + {TOBN(0xa687c74b, 0xbda94882), TOBN(0xd1bbcc48, 0xa684b299), + TOBN(0xaf6f1112, 0x863b3724), TOBN(0x6943d1b4, 0x2c8ce9f8)}}, + {{TOBN(0xe044a3bb, 0x098cafb4), TOBN(0x27ed2310, 0x60d48caf), + TOBN(0x542b5675, 0x3a31b84d), TOBN(0xcbf3dd50, 0xfcddbed7)}, + {TOBN(0x25031f16, 0x41b1d830), TOBN(0xa7ec851d, 0xcb0c1e27), + TOBN(0xac1c8fe0, 0xb5ae75db), TOBN(0xb24c7557, 0x08c52120)}}, + {{TOBN(0x57f811dc, 0x1d4636c3), TOBN(0xf8436526, 0x681a9939), + TOBN(0x1f6bc6d9, 0x9c81adb3), TOBN(0x840f8ac3, 0x5b7d80d4)}, + {TOBN(0x731a9811, 0xf4387f1a), TOBN(0x7c501cd3, 0xb5156880), + TOBN(0xa5ca4a07, 0xdfe68867), TOBN(0xf123d8f0, 0x5fcea120)}}, + {{TOBN(0x1fbb0e71, 0xd607039e), TOBN(0x2b70e215, 0xcd3a4546), + TOBN(0x32d2f01d, 0x53324091), TOBN(0xb796ff08, 0x180ab19b)}, + {TOBN(0x32d87a86, 0x3c57c4aa), TOBN(0x2aed9caf, 0xb7c49a27), 
+ TOBN(0x9fb35eac, 0x31630d98), TOBN(0x338e8cdf, 0x5c3e20a3)}}, + {{TOBN(0x80f16182, 0x66cde8db), TOBN(0x4e159980, 0x2d72fd36), + TOBN(0xd7b8f13b, 0x9b6e5072), TOBN(0xf5213907, 0x3b7b5dc1)}, + {TOBN(0x4d431f1d, 0x8ce4396e), TOBN(0x37a1a680, 0xa7ed2142), + TOBN(0xbf375696, 0xd01aaf6b), TOBN(0xaa1c0c54, 0xe63aab66)}}, + {{TOBN(0x3014368b, 0x4ed80940), TOBN(0x67e6d056, 0x7a6fcedd), + TOBN(0x7c208c49, 0xca97579f), TOBN(0xfe3d7a81, 0xa23597f6)}, + {TOBN(0x5e203202, 0x7e096ae2), TOBN(0xb1f3e1e7, 0x24b39366), + TOBN(0x26da26f3, 0x2fdcdffc), TOBN(0x79422f1d, 0x6097be83)}}}, + {{{TOBN(0x263a2cfb, 0x9db3b381), TOBN(0x9c3a2dee, 0xd4df0a4b), + TOBN(0x728d06e9, 0x7d04e61f), TOBN(0x8b1adfbc, 0x42449325)}, + {TOBN(0x6ec1d939, 0x7e053a1b), TOBN(0xee2be5c7, 0x66daf707), + TOBN(0x80ba1e14, 0x810ac7ab), TOBN(0xdd2ae778, 0xf530f174)}}, + {{TOBN(0x0435d97a, 0x205b9d8b), TOBN(0x6eb8f064, 0x056756d4), + TOBN(0xd5e88a8b, 0xb6f8210e), TOBN(0x070ef12d, 0xec9fd9ea)}, + {TOBN(0x4d849505, 0x3bcc876a), TOBN(0x12a75338, 0xa7404ce3), + TOBN(0xd22b49e1, 0xb8a1db5e), TOBN(0xec1f2051, 0x14bfa5ad)}}, + {{TOBN(0xadbaeb79, 0xb6828f36), TOBN(0x9d7a0258, 0x01bd5b9e), + TOBN(0xeda01e0d, 0x1e844b0c), TOBN(0x4b625175, 0x887edfc9)}, + {TOBN(0x14109fdd, 0x9669b621), TOBN(0x88a2ca56, 0xf6f87b98), + TOBN(0xfe2eb788, 0x170df6bc), TOBN(0x0cea06f4, 0xffa473f9)}}, + {{TOBN(0x43ed81b5, 0xc4e83d33), TOBN(0xd9f35879, 0x5efd488b), + TOBN(0x164a620f, 0x9deb4d0f), TOBN(0xc6927bdb, 0xac6a7394)}, + {TOBN(0x45c28df7, 0x9f9e0f03), TOBN(0x2868661e, 0xfcd7e1a9), + TOBN(0x7cf4e8d0, 0xffa348f1), TOBN(0x6bd4c284, 0x398538e0)}}, + {{TOBN(0x2618a091, 0x289a8619), TOBN(0xef796e60, 0x6671b173), + TOBN(0x664e46e5, 0x9090c632), TOBN(0xa38062d4, 0x1e66f8fb)}, + {TOBN(0x6c744a20, 0x0573274e), TOBN(0xd07b67e4, 0xa9271394), + TOBN(0x391223b2, 0x6bdc0e20), TOBN(0xbe2d93f1, 0xeb0a05a7)}}, + {{TOBN(0xf23e2e53, 0x3f36d141), TOBN(0xe84bb3d4, 0x4dfca442), + TOBN(0xb804a48d, 0x6b7c023a), TOBN(0x1e16a8fa, 0x76431c3b)}, + {TOBN(0x1b5452ad, 0xddd472e0), TOBN(0x7d405ee7, 0x0d1ee127), + TOBN(0x50fc6f1d, 0xffa27599), TOBN(0x351ac53c, 0xbf391b35)}}, + {{TOBN(0x7efa14b8, 0x4444896b), TOBN(0x64974d2f, 0xf94027fb), + TOBN(0xefdcd0e8, 0xde84487d), TOBN(0x8c45b260, 0x2b48989b)}, + {TOBN(0xa8fcbbc2, 0xd8463487), TOBN(0xd1b2b3f7, 0x3fbc476c), + TOBN(0x21d005b7, 0xc8f443c0), TOBN(0x518f2e67, 0x40c0139c)}}, + {{TOBN(0x56036e8c, 0x06d75fc1), TOBN(0x2dcf7bb7, 0x3249a89f), + TOBN(0x81dd1d3d, 0xe245e7dd), TOBN(0xf578dc4b, 0xebd6e2a7)}, + {TOBN(0x4c028903, 0xdf2ce7a0), TOBN(0xaee36288, 0x9c39afac), + TOBN(0xdc847c31, 0x146404ab), TOBN(0x6304c0d8, 0xa4e97818)}}, + {{TOBN(0xae51dca2, 0xa91f6791), TOBN(0x2abe4190, 0x9baa9efc), + TOBN(0xd9d2e2f4, 0x559c7ac1), TOBN(0xe82f4b51, 0xfc9f773a)}, + {TOBN(0xa7713027, 0x4073e81c), TOBN(0xc0276fac, 0xfbb596fc), + TOBN(0x1d819fc9, 0xa684f70c), TOBN(0x29b47fdd, 0xc9f7b1e0)}}, + {{TOBN(0x358de103, 0x459b1940), TOBN(0xec881c59, 0x5b013e93), + TOBN(0x51574c93, 0x49532ad3), TOBN(0x2db1d445, 0xb37b46de)}, + {TOBN(0xc6445b87, 0xdf239fd8), TOBN(0xc718af75, 0x151d24ee), + TOBN(0xaea1c4a4, 0xf43c6259), TOBN(0x40c0e5d7, 0x70be02f7)}}, + {{TOBN(0x6a4590f4, 0x721b33f2), TOBN(0x2124f1fb, 0xfedf04ea), + TOBN(0xf8e53cde, 0x9745efe7), TOBN(0xe7e10432, 0x65f046d9)}, + {TOBN(0xc3fca28e, 0xe4d0c7e6), TOBN(0x847e339a, 0x87253b1b), + TOBN(0x9b595348, 0x3743e643), TOBN(0xcb6a0a0b, 0x4fd12fc5)}}, + {{TOBN(0xfb6836c3, 0x27d02dcc), TOBN(0x5ad00982, 0x7a68bcc2), + TOBN(0x1b24b44c, 0x005e912d), TOBN(0xcc83d20f, 0x811fdcfe)}, + {TOBN(0x36527ec1, 0x666fba0c), TOBN(0x69948197, 
0x14754635), + TOBN(0xfcdcb1a8, 0x556da9c2), TOBN(0xa5934267, 0x81a732b2)}}, + {{TOBN(0xec1214ed, 0xa714181d), TOBN(0x609ac13b, 0x6067b341), + TOBN(0xff4b4c97, 0xa545df1f), TOBN(0xa1240501, 0x34d2076b)}, + {TOBN(0x6efa0c23, 0x1409ca97), TOBN(0x254cc1a8, 0x20638c43), + TOBN(0xd4e363af, 0xdcfb46cd), TOBN(0x62c2adc3, 0x03942a27)}}, + {{TOBN(0xc67b9df0, 0x56e46483), TOBN(0xa55abb20, 0x63736356), + TOBN(0xab93c098, 0xc551bc52), TOBN(0x382b49f9, 0xb15fe64b)}, + {TOBN(0x9ec221ad, 0x4dff8d47), TOBN(0x79caf615, 0x437df4d6), + TOBN(0x5f13dc64, 0xbb456509), TOBN(0xe4c589d9, 0x191f0714)}}, + {{TOBN(0x27b6a8ab, 0x3fd40e09), TOBN(0xe455842e, 0x77313ea9), + TOBN(0x8b51d1e2, 0x1f55988b), TOBN(0x5716dd73, 0x062bbbfc)}, + {TOBN(0x633c11e5, 0x4e8bf3de), TOBN(0x9a0e77b6, 0x1b85be3b), + TOBN(0x56510729, 0x0911cca6), TOBN(0x27e76495, 0xefa6590f)}}, + {{TOBN(0xe4ac8b33, 0x070d3aab), TOBN(0x2643672b, 0x9a2cd5e5), + TOBN(0x52eff79b, 0x1cfc9173), TOBN(0x665ca49b, 0x90a7c13f)}, + {TOBN(0x5a8dda59, 0xb3efb998), TOBN(0x8a5b922d, 0x052f1341), + TOBN(0xae9ebbab, 0x3cf9a530), TOBN(0x35986e7b, 0xf56da4d7)}}, + {{TOBN(0x3a636b5c, 0xff3513cc), TOBN(0xbb0cf8ba, 0x3198f7dd), + TOBN(0xb8d40522, 0x41f16f86), TOBN(0x760575d8, 0xde13a7bf)}, + {TOBN(0x36f74e16, 0x9f7aa181), TOBN(0x163a3ecf, 0xf509ed1c), + TOBN(0x6aead61f, 0x3c40a491), TOBN(0x158c95fc, 0xdfe8fcaa)}}, + {{TOBN(0xa3991b6e, 0x13cda46f), TOBN(0x79482415, 0x342faed0), + TOBN(0xf3ba5bde, 0x666b5970), TOBN(0x1d52e6bc, 0xb26ab6dd)}, + {TOBN(0x768ba1e7, 0x8608dd3d), TOBN(0x4930db2a, 0xea076586), + TOBN(0xd9575714, 0xe7dc1afa), TOBN(0x1fc7bf7d, 0xf7c58817)}}, + {{TOBN(0x6b47accd, 0xd9eee96c), TOBN(0x0ca277fb, 0xe58cec37), + TOBN(0x113fe413, 0xe702c42a), TOBN(0xdd1764ee, 0xc47cbe51)}, + {TOBN(0x041e7cde, 0x7b3ed739), TOBN(0x50cb7459, 0x5ce9e1c0), + TOBN(0x35568513, 0x2925b212), TOBN(0x7cff95c4, 0x001b081c)}}, + {{TOBN(0x63ee4cbd, 0x8088b454), TOBN(0xdb7f32f7, 0x9a9e0c8a), + TOBN(0xb377d418, 0x6b2447cb), TOBN(0xe3e982aa, 0xd370219b)}, + {TOBN(0x06ccc1e4, 0xc2a2a593), TOBN(0x72c36865, 0x0773f24f), + TOBN(0xa13b4da7, 0x95859423), TOBN(0x8bbf1d33, 0x75040c8f)}}, + {{TOBN(0x726f0973, 0xda50c991), TOBN(0x48afcd5b, 0x822d6ee2), + TOBN(0xe5fc718b, 0x20fd7771), TOBN(0xb9e8e77d, 0xfd0807a1)}, + {TOBN(0x7f5e0f44, 0x99a7703d), TOBN(0x6972930e, 0x618e36f3), + TOBN(0x2b7c77b8, 0x23807bbe), TOBN(0xe5b82405, 0xcb27ff50)}}, + {{TOBN(0xba8b8be3, 0xbd379062), TOBN(0xd64b7a1d, 0x2dce4a92), + TOBN(0x040a73c5, 0xb2952e37), TOBN(0x0a9e252e, 0xd438aeca)}, + {TOBN(0xdd43956b, 0xc39d3bcb), TOBN(0x1a31ca00, 0xb32b2d63), + TOBN(0xd67133b8, 0x5c417a18), TOBN(0xd08e4790, 0x2ef442c8)}}, + {{TOBN(0x98cb1ae9, 0x255c0980), TOBN(0x4bd86381, 0x2b4a739f), + TOBN(0x5a5c31e1, 0x1e4a45a1), TOBN(0x1e5d55fe, 0x9cb0db2f)}, + {TOBN(0x74661b06, 0x8ff5cc29), TOBN(0x026b389f, 0x0eb8a4f4), + TOBN(0x536b21a4, 0x58848c24), TOBN(0x2e5bf8ec, 0x81dc72b0)}}, + {{TOBN(0x03c187d0, 0xad886aac), TOBN(0x5c16878a, 0xb771b645), + TOBN(0xb07dfc6f, 0xc74045ab), TOBN(0x2c6360bf, 0x7800caed)}, + {TOBN(0x24295bb5, 0xb9c972a3), TOBN(0xc9e6f88e, 0x7c9a6dba), + TOBN(0x90ffbf24, 0x92a79aa6), TOBN(0xde29d50a, 0x41c26ac2)}}, + {{TOBN(0x9f0af483, 0xd309cbe6), TOBN(0x5b020d8a, 0xe0bced4f), + TOBN(0x606e986d, 0xb38023e3), TOBN(0xad8f2c9d, 0x1abc6933)}, + {TOBN(0x19292e1d, 0xe7400e93), TOBN(0xfe3e18a9, 0x52be5e4d), + TOBN(0xe8e9771d, 0x2e0680bf), TOBN(0x8c5bec98, 0xc54db063)}}, + {{TOBN(0x2af9662a, 0x74a55d1f), TOBN(0xe3fbf28f, 0x046f66d8), + TOBN(0xa3a72ab4, 0xd4dc4794), TOBN(0x09779f45, 0x5c7c2dd8)}, + {TOBN(0xd893bdaf, 0xc3d19d8d), 
TOBN(0xd5a75094, 0x57d6a6df), + TOBN(0x8cf8fef9, 0x952e6255), TOBN(0x3da67cfb, 0xda9a8aff)}}, + {{TOBN(0x4c23f62a, 0x2c160dcd), TOBN(0x34e6c5e3, 0x8f90eaef), + TOBN(0x35865519, 0xa9a65d5a), TOBN(0x07c48aae, 0x8fd38a3d)}, + {TOBN(0xb7e7aeda, 0x50068527), TOBN(0x2c09ef23, 0x1c90936a), + TOBN(0x31ecfeb6, 0xe879324c), TOBN(0xa0871f6b, 0xfb0ec938)}}, + {{TOBN(0xb1f0fb68, 0xd84d835d), TOBN(0xc90caf39, 0x861dc1e6), + TOBN(0x12e5b046, 0x7594f8d7), TOBN(0x26897ae2, 0x65012b92)}, + {TOBN(0xbcf68a08, 0xa4d6755d), TOBN(0x403ee41c, 0x0991fbda), + TOBN(0x733e343e, 0x3bbf17e8), TOBN(0xd2c7980d, 0x679b3d65)}}, + {{TOBN(0x33056232, 0xd2e11305), TOBN(0x966be492, 0xf3c07a6f), + TOBN(0x6a8878ff, 0xbb15509d), TOBN(0xff221101, 0x0a9b59a4)}, + {TOBN(0x6c9f564a, 0xabe30129), TOBN(0xc6f2c940, 0x336e64cf), + TOBN(0x0fe75262, 0x8b0c8022), TOBN(0xbe0267e9, 0x6ae8db87)}}, + {{TOBN(0x22e192f1, 0x93bc042b), TOBN(0xf085b534, 0xb237c458), + TOBN(0xa0d192bd, 0x832c4168), TOBN(0x7a76e9e3, 0xbdf6271d)}, + {TOBN(0x52a882fa, 0xb88911b5), TOBN(0xc85345e4, 0xb4db0eb5), + TOBN(0xa3be02a6, 0x81a7c3ff), TOBN(0x51889c8c, 0xf0ec0469)}}, + {{TOBN(0x9d031369, 0xa5e829e5), TOBN(0xcbb4c6fc, 0x1607aa41), + TOBN(0x75ac59a6, 0x241d84c1), TOBN(0xc043f2bf, 0x8829e0ee)}, + {TOBN(0x82a38f75, 0x8ea5e185), TOBN(0x8bda40b9, 0xd87cbd9f), + TOBN(0x9e65e75e, 0x2d8fc601), TOBN(0x3d515f74, 0xa35690b3)}}, + {{TOBN(0x534acf4f, 0xda79e5ac), TOBN(0x68b83b3a, 0x8630215f), + TOBN(0x5c748b2e, 0xd085756e), TOBN(0xb0317258, 0xe5d37cb2)}, + {TOBN(0x6735841a, 0xc5ccc2c4), TOBN(0x7d7dc96b, 0x3d9d5069), + TOBN(0xa147e410, 0xfd1754bd), TOBN(0x65296e94, 0xd399ddd5)}}, + {{TOBN(0xf6b5b2d0, 0xbc8fa5bc), TOBN(0x8a5ead67, 0x500c277b), + TOBN(0x214625e6, 0xdfa08a5d), TOBN(0x51fdfedc, 0x959cf047)}, + {TOBN(0x6bc9430b, 0x289fca32), TOBN(0xe36ff0cf, 0x9d9bdc3f), + TOBN(0x2fe187cb, 0x58ea0ede), TOBN(0xed66af20, 0x5a900b3f)}}, + {{TOBN(0x00e0968b, 0x5fa9f4d6), TOBN(0x2d4066ce, 0x37a362e7), + TOBN(0xa99a9748, 0xbd07e772), TOBN(0x710989c0, 0x06a4f1d0)}, + {TOBN(0xd5dedf35, 0xce40cbd8), TOBN(0xab55c5f0, 0x1743293d), + TOBN(0x766f1144, 0x8aa24e2c), TOBN(0x94d874f8, 0x605fbcb4)}}, + {{TOBN(0xa365f0e8, 0xa518001b), TOBN(0xee605eb6, 0x9d04ef0f), + TOBN(0x5a3915cd, 0xba8d4d25), TOBN(0x44c0e1b8, 0xb5113472)}, + {TOBN(0xcbb024e8, 0x8b6740dc), TOBN(0x89087a53, 0xee1d4f0c), + TOBN(0xa88fa05c, 0x1fc4e372), TOBN(0x8bf395cb, 0xaf8b3af2)}}, + {{TOBN(0x1e71c9a1, 0xdeb8568b), TOBN(0xa35daea0, 0x80fb3d32), + TOBN(0xe8b6f266, 0x2cf8fb81), TOBN(0x6d51afe8, 0x9490696a)}, + {TOBN(0x81beac6e, 0x51803a19), TOBN(0xe3d24b7f, 0x86219080), + TOBN(0x727cfd9d, 0xdf6f463c), TOBN(0x8c6865ca, 0x72284ee8)}}, + {{TOBN(0x32c88b7d, 0xb743f4ef), TOBN(0x3793909b, 0xe7d11dce), + TOBN(0xd398f922, 0x2ff2ebe8), TOBN(0x2c70ca44, 0xe5e49796)}, + {TOBN(0xdf4d9929, 0xcb1131b1), TOBN(0x7826f298, 0x25888e79), + TOBN(0x4d3a112c, 0xf1d8740a), TOBN(0x00384cb6, 0x270afa8b)}}, + {{TOBN(0xcb64125b, 0x3ab48095), TOBN(0x3451c256, 0x62d05106), + TOBN(0xd73d577d, 0xa4955845), TOBN(0x39570c16, 0xbf9f4433)}, + {TOBN(0xd7dfaad3, 0xadecf263), TOBN(0xf1c3d8d1, 0xdc76e102), + TOBN(0x5e774a58, 0x54c6a836), TOBN(0xdad4b672, 0x3e92d47b)}}, + {{TOBN(0xbe7e990f, 0xf0d796a0), TOBN(0x5fc62478, 0xdf0e8b02), + TOBN(0x8aae8bf4, 0x030c00ad), TOBN(0x3d2db93b, 0x9004ba0f)}, + {TOBN(0xe48c8a79, 0xd85d5ddc), TOBN(0xe907caa7, 0x6bb07f34), + TOBN(0x58db343a, 0xa39eaed5), TOBN(0x0ea6e007, 0xadaf5724)}}, + {{TOBN(0xe00df169, 0xd23233f3), TOBN(0x3e322796, 0x77cb637f), + TOBN(0x1f897c0e, 0x1da0cf6c), TOBN(0xa651f5d8, 0x31d6bbdd)}, + {TOBN(0xdd61af19, 
0x1a230c76), TOBN(0xbd527272, 0xcdaa5e4a), + TOBN(0xca753636, 0xd0abcd7e), TOBN(0x78bdd37c, 0x370bd8dc)}}, + {{TOBN(0xc23916c2, 0x17cd93fe), TOBN(0x65b97a4d, 0xdadce6e2), + TOBN(0xe04ed4eb, 0x174e42f8), TOBN(0x1491ccaa, 0xbb21480a)}, + {TOBN(0x145a8280, 0x23196332), TOBN(0x3c3862d7, 0x587b479a), + TOBN(0x9f4a88a3, 0x01dcd0ed), TOBN(0x4da2b7ef, 0x3ea12f1f)}}, + {{TOBN(0xf8e7ae33, 0xb126e48e), TOBN(0x404a0b32, 0xf494e237), + TOBN(0x9beac474, 0xc55acadb), TOBN(0x4ee5cf3b, 0xcbec9fd9)}, + {TOBN(0x336b33b9, 0x7df3c8c3), TOBN(0xbd905fe3, 0xb76808fd), + TOBN(0x8f436981, 0xaa45c16a), TOBN(0x255c5bfa, 0x3dd27b62)}}, + {{TOBN(0x71965cbf, 0xc3dd9b4d), TOBN(0xce23edbf, 0xfc068a87), + TOBN(0xb78d4725, 0x745b029b), TOBN(0x74610713, 0xcefdd9bd)}, + {TOBN(0x7116f75f, 0x1266bf52), TOBN(0x02046722, 0x18e49bb6), + TOBN(0xdf43df9f, 0x3d6f19e3), TOBN(0xef1bc7d0, 0xe685cb2f)}}, + {{TOBN(0xcddb27c1, 0x7078c432), TOBN(0xe1961b9c, 0xb77fedb7), + TOBN(0x1edc2f5c, 0xc2290570), TOBN(0x2c3fefca, 0x19cbd886)}, + {TOBN(0xcf880a36, 0xc2af389a), TOBN(0x96c610fd, 0xbda71cea), + TOBN(0xf03977a9, 0x32aa8463), TOBN(0x8eb7763f, 0x8586d90a)}}, + {{TOBN(0x3f342454, 0x2a296e77), TOBN(0xc8718683, 0x42837a35), + TOBN(0x7dc71090, 0x6a09c731), TOBN(0x54778ffb, 0x51b816db)}, + {TOBN(0x6b33bfec, 0xaf06defd), TOBN(0xfe3c105f, 0x8592b70b), + TOBN(0xf937fda4, 0x61da6114), TOBN(0x3c13e651, 0x4c266ad7)}}, + {{TOBN(0xe363a829, 0x855938e8), TOBN(0x2eeb5d9e, 0x9de54b72), + TOBN(0xbeb93b0e, 0x20ccfab9), TOBN(0x3dffbb5f, 0x25e61a25)}, + {TOBN(0x7f655e43, 0x1acc093d), TOBN(0x0cb6cc3d, 0x3964ce61), + TOBN(0x6ab283a1, 0xe5e9b460), TOBN(0x55d787c5, 0xa1c7e72d)}}, + {{TOBN(0x4d2efd47, 0xdeadbf02), TOBN(0x11e80219, 0xac459068), + TOBN(0x810c7626, 0x71f311f0), TOBN(0xfa17ef8d, 0x4ab6ef53)}, + {TOBN(0xaf47fd25, 0x93e43bff), TOBN(0x5cb5ff3f, 0x0be40632), + TOBN(0x54687106, 0x8ee61da3), TOBN(0x7764196e, 0xb08afd0f)}}, + {{TOBN(0x831ab3ed, 0xf0290a8f), TOBN(0xcae81966, 0xcb47c387), + TOBN(0xaad7dece, 0x184efb4f), TOBN(0xdcfc53b3, 0x4749110e)}, + {TOBN(0x6698f23c, 0x4cb632f9), TOBN(0xc42a1ad6, 0xb91f8067), + TOBN(0xb116a81d, 0x6284180a), TOBN(0xebedf5f8, 0xe901326f)}}, + {{TOBN(0xf2274c9f, 0x97e3e044), TOBN(0x42018520, 0x11d09fc9), + TOBN(0x56a65f17, 0xd18e6e23), TOBN(0x2ea61e2a, 0x352b683c)}, + {TOBN(0x27d291bc, 0x575eaa94), TOBN(0x9e7bc721, 0xb8ff522d), + TOBN(0x5f7268bf, 0xa7f04d6f), TOBN(0x5868c73f, 0xaba41748)}}, + {{TOBN(0x9f85c2db, 0x7be0eead), TOBN(0x511e7842, 0xff719135), + TOBN(0x5a06b1e9, 0xc5ea90d7), TOBN(0x0c19e283, 0x26fab631)}, + {TOBN(0x8af8f0cf, 0xe9206c55), TOBN(0x89389cb4, 0x3553c06a), + TOBN(0x39dbed97, 0xf65f8004), TOBN(0x0621b037, 0xc508991d)}}, + {{TOBN(0x1c52e635, 0x96e78cc4), TOBN(0x5385c8b2, 0x0c06b4a8), + TOBN(0xd84ddfdb, 0xb0e87d03), TOBN(0xc49dfb66, 0x934bafad)}, + {TOBN(0x7071e170, 0x59f70772), TOBN(0x3a073a84, 0x3a1db56b), + TOBN(0x03494903, 0x3b8af190), TOBN(0x7d882de3, 0xd32920f0)}}, + {{TOBN(0x91633f0a, 0xb2cf8940), TOBN(0x72b0b178, 0x6f948f51), + TOBN(0x2d28dc30, 0x782653c8), TOBN(0x88829849, 0xdb903a05)}, + {TOBN(0xb8095d0c, 0x6a19d2bb), TOBN(0x4b9e7f0c, 0x86f782cb), + TOBN(0x7af73988, 0x2d907064), TOBN(0xd12be0fe, 0x8b32643c)}}, + {{TOBN(0x358ed23d, 0x0e165dc3), TOBN(0x3d47ce62, 0x4e2378ce), + TOBN(0x7e2bb0b9, 0xfeb8a087), TOBN(0x3246e8ae, 0xe29e10b9)}, + {TOBN(0x459f4ec7, 0x03ce2b4d), TOBN(0xe9b4ca1b, 0xbbc077cf), + TOBN(0x2613b4f2, 0x0e9940c1), TOBN(0xfc598bb9, 0x047d1eb1)}}, + {{TOBN(0x9744c62b, 0x45036099), TOBN(0xa9dee742, 0x167c65d8), + TOBN(0x0c511525, 0xdabe1943), TOBN(0xda110554, 0x93c6c624)}, + 
{TOBN(0xae00a52c, 0x651a3be2), TOBN(0xcda5111d, 0x884449a6), + TOBN(0x063c06f4, 0xff33bed1), TOBN(0x73baaf9a, 0x0d3d76b4)}}, + {{TOBN(0x52fb0c9d, 0x7fc63668), TOBN(0x6886c9dd, 0x0c039cde), + TOBN(0x602bd599, 0x55b22351), TOBN(0xb00cab02, 0x360c7c13)}, + {TOBN(0x8cb616bc, 0x81b69442), TOBN(0x41486700, 0xb55c3cee), + TOBN(0x71093281, 0xf49ba278), TOBN(0xad956d9c, 0x64a50710)}}, + {{TOBN(0x9561f28b, 0x638a7e81), TOBN(0x54155cdf, 0x5980ddc3), + TOBN(0xb2db4a96, 0xd26f247a), TOBN(0x9d774e4e, 0x4787d100)}, + {TOBN(0x1a9e6e2e, 0x078637d2), TOBN(0x1c363e2d, 0x5e0ae06a), + TOBN(0x7493483e, 0xe9cfa354), TOBN(0x76843cb3, 0x7f74b98d)}}, + {{TOBN(0xbaca6591, 0xd4b66947), TOBN(0xb452ce98, 0x04460a8c), + TOBN(0x6830d246, 0x43768f55), TOBN(0xf4197ed8, 0x7dff12df)}, + {TOBN(0x6521b472, 0x400dd0f7), TOBN(0x59f5ca8f, 0x4b1e7093), + TOBN(0x6feff11b, 0x080338ae), TOBN(0x0ada31f6, 0xa29ca3c6)}}, + {{TOBN(0x24794eb6, 0x94a2c215), TOBN(0xd83a43ab, 0x05a57ab4), + TOBN(0x264a543a, 0x2a6f89fe), TOBN(0x2c2a3868, 0xdd5ec7c2)}, + {TOBN(0xd3373940, 0x8439d9b2), TOBN(0x715ea672, 0x0acd1f11), + TOBN(0x42c1d235, 0xe7e6cc19), TOBN(0x81ce6e96, 0xb990585c)}}, + {{TOBN(0x04e5dfe0, 0xd809c7bd), TOBN(0xd7b2580c, 0x8f1050ab), + TOBN(0x6d91ad78, 0xd8a4176f), TOBN(0x0af556ee, 0x4e2e897c)}, + {TOBN(0x162a8b73, 0x921de0ac), TOBN(0x52ac9c22, 0x7ea78400), + TOBN(0xee2a4eea, 0xefce2174), TOBN(0xbe61844e, 0x6d637f79)}}, + {{TOBN(0x0491f1bc, 0x789a283b), TOBN(0x72d3ac3d, 0x880836f4), + TOBN(0xaa1c5ea3, 0x88e5402d), TOBN(0x1b192421, 0xd5cc473d)}, + {TOBN(0x5c0b9998, 0x9dc84cac), TOBN(0xb0a8482d, 0x9c6e75b8), + TOBN(0x639961d0, 0x3a191ce2), TOBN(0xda3bc865, 0x6d837930)}}, + {{TOBN(0xca990653, 0x056e6f8f), TOBN(0x84861c41, 0x64d133a7), + TOBN(0x8b403276, 0x746abe40), TOBN(0xb7b4d51a, 0xebf8e303)}, + {TOBN(0x05b43211, 0x220a255d), TOBN(0xc997152c, 0x02419e6e), + TOBN(0x76ff47b6, 0x630c2fea), TOBN(0x50518677, 0x281fdade)}}, + {{TOBN(0x3283b8ba, 0xcf902b0b), TOBN(0x8d4b4eb5, 0x37db303b), + TOBN(0xcc89f42d, 0x755011bc), TOBN(0xb43d74bb, 0xdd09d19b)}, + {TOBN(0x65746bc9, 0x8adba350), TOBN(0x364eaf8c, 0xb51c1927), + TOBN(0x13c76596, 0x10ad72ec), TOBN(0x30045121, 0xf8d40c20)}}, + {{TOBN(0x6d2d99b7, 0xea7b979b), TOBN(0xcd78cd74, 0xe6fb3bcd), + TOBN(0x11e45a9e, 0x86cffbfe), TOBN(0x78a61cf4, 0x637024f6)}, + {TOBN(0xd06bc872, 0x3d502295), TOBN(0xf1376854, 0x458cb288), + TOBN(0xb9db26a1, 0x342f8586), TOBN(0xf33effcf, 0x4beee09e)}}, + {{TOBN(0xd7e0c4cd, 0xb30cfb3a), TOBN(0x6d09b8c1, 0x6c9db4c8), + TOBN(0x40ba1a42, 0x07c8d9df), TOBN(0x6fd495f7, 0x1c52c66d)}, + {TOBN(0xfb0e169f, 0x275264da), TOBN(0x80c2b746, 0xe57d8362), + TOBN(0xedd987f7, 0x49ad7222), TOBN(0xfdc229af, 0x4398ec7b)}}}, + {{{TOBN(0xb0d1ed84, 0x52666a58), TOBN(0x4bcb6e00, 0xe6a9c3c2), + TOBN(0x3c57411c, 0x26906408), TOBN(0xcfc20755, 0x13556400)}, + {TOBN(0xa08b1c50, 0x5294dba3), TOBN(0xa30ba286, 0x8b7dd31e), + TOBN(0xd70ba90e, 0x991eca74), TOBN(0x094e142c, 0xe762c2b9)}}, + {{TOBN(0xb81d783e, 0x979f3925), TOBN(0x1efd130a, 0xaf4c89a7), + TOBN(0x525c2144, 0xfd1bf7fa), TOBN(0x4b296904, 0x1b265a9e)}, + {TOBN(0xed8e9634, 0xb9db65b6), TOBN(0x35c82e32, 0x03599d8a), + TOBN(0xdaa7a54f, 0x403563f3), TOBN(0x9df088ad, 0x022c38ab)}}, + {{TOBN(0xe5cfb066, 0xbb3fd30a), TOBN(0x429169da, 0xeff0354e), + TOBN(0x809cf852, 0x3524e36c), TOBN(0x136f4fb3, 0x0155be1d)}, + {TOBN(0x4826af01, 0x1fbba712), TOBN(0x6ef0f0b4, 0x506ba1a1), + TOBN(0xd9928b31, 0x77aea73e), TOBN(0xe2bf6af2, 0x5eaa244e)}}, + {{TOBN(0x8d084f12, 0x4237b64b), TOBN(0x688ebe99, 0xe3ecfd07), + TOBN(0x57b8a70c, 0xf6845dd8), TOBN(0x808fc59c, 
0x5da4a325)}, + {TOBN(0xa9032b2b, 0xa3585862), TOBN(0xb66825d5, 0xedf29386), + TOBN(0xb5a5a8db, 0x431ec29b), TOBN(0xbb143a98, 0x3a1e8dc8)}}, + {{TOBN(0x35ee94ce, 0x12ae381b), TOBN(0x3a7f176c, 0x86ccda90), + TOBN(0xc63a657e, 0x4606eaca), TOBN(0x9ae5a380, 0x43cd04df)}, + {TOBN(0x9bec8d15, 0xed251b46), TOBN(0x1f5d6d30, 0xcaca5e64), + TOBN(0x347b3b35, 0x9ff20f07), TOBN(0x4d65f034, 0xf7e4b286)}}, + {{TOBN(0x9e93ba24, 0xf111661e), TOBN(0xedced484, 0xb105eb04), + TOBN(0x96dc9ba1, 0xf424b578), TOBN(0xbf8f66b7, 0xe83e9069)}, + {TOBN(0x872d4df4, 0xd7ed8216), TOBN(0xbf07f377, 0x8e2cbecf), + TOBN(0x4281d899, 0x98e73754), TOBN(0xfec85fbb, 0x8aab8708)}}, + {{TOBN(0x9a3c0dee, 0xa5ba5b0b), TOBN(0xe6a116ce, 0x42d05299), + TOBN(0xae9775fe, 0xe9b02d42), TOBN(0x72b05200, 0xa1545cb6)}, + {TOBN(0xbc506f7d, 0x31a3b4ea), TOBN(0xe5893078, 0x8bbd9b32), + TOBN(0xc8bc5f37, 0xe4b12a97), TOBN(0x6b000c06, 0x4a73b671)}}, + {{TOBN(0x13b5bf22, 0x765fa7d0), TOBN(0x59805bf0, 0x1d6a5370), + TOBN(0x67a5e29d, 0x4280db98), TOBN(0x4f53916f, 0x776b1ce3)}, + {TOBN(0x714ff61f, 0x33ddf626), TOBN(0x4206238e, 0xa085d103), + TOBN(0x1c50d4b7, 0xe5809ee3), TOBN(0x999f450d, 0x85f8eb1d)}}, + {{TOBN(0x658a6051, 0xe4c79e9b), TOBN(0x1394cb73, 0xc66a9fea), + TOBN(0x27f31ed5, 0xc6be7b23), TOBN(0xf4c88f36, 0x5aa6f8fe)}, + {TOBN(0x0fb0721f, 0x4aaa499e), TOBN(0x68b3a7d5, 0xe3fb2a6b), + TOBN(0xa788097d, 0x3a92851d), TOBN(0x060e7f8a, 0xe96f4913)}}, + {{TOBN(0x82eebe73, 0x1a3a93bc), TOBN(0x42bbf465, 0xa21adc1a), + TOBN(0xc10b6fa4, 0xef030efd), TOBN(0x247aa4c7, 0x87b097bb)}, + {TOBN(0x8b8dc632, 0xf60c77da), TOBN(0x6ffbc26a, 0xc223523e), + TOBN(0xa4f6ff11, 0x344579cf), TOBN(0x5825653c, 0x980250f6)}}, + {{TOBN(0xb2dd097e, 0xbc1aa2b9), TOBN(0x07889393, 0x37a0333a), + TOBN(0x1cf55e71, 0x37a0db38), TOBN(0x2648487f, 0x792c1613)}, + {TOBN(0xdad01336, 0x3fcef261), TOBN(0x6239c81d, 0x0eabf129), + TOBN(0x8ee761de, 0x9d276be2), TOBN(0x406a7a34, 0x1eda6ad3)}}, + {{TOBN(0x4bf367ba, 0x4a493b31), TOBN(0x54f20a52, 0x9bf7f026), + TOBN(0xb696e062, 0x9795914b), TOBN(0xcddab96d, 0x8bf236ac)}, + {TOBN(0x4ff2c70a, 0xed25ea13), TOBN(0xfa1d09eb, 0x81cbbbe7), + TOBN(0x88fc8c87, 0x468544c5), TOBN(0x847a670d, 0x696b3317)}}, + {{TOBN(0xf133421e, 0x64bcb626), TOBN(0xaea638c8, 0x26dee0b5), + TOBN(0xd6e7680b, 0xb310346c), TOBN(0xe06f4097, 0xd5d4ced3)}, + {TOBN(0x09961452, 0x7512a30b), TOBN(0xf3d867fd, 0xe589a59a), + TOBN(0x2e73254f, 0x52d0c180), TOBN(0x9063d8a3, 0x333c74ac)}}, + {{TOBN(0xeda6c595, 0xd314e7bc), TOBN(0x2ee7464b, 0x467899ed), + TOBN(0x1cef423c, 0x0a1ed5d3), TOBN(0x217e76ea, 0x69cc7613)}, + {TOBN(0x27ccce1f, 0xe7cda917), TOBN(0x12d8016b, 0x8a893f16), + TOBN(0xbcd6de84, 0x9fc74f6b), TOBN(0xfa5817e2, 0xf3144e61)}}, + {{TOBN(0x1f354164, 0x0821ee4c), TOBN(0x1583eab4, 0x0bc61992), + TOBN(0x7490caf6, 0x1d72879f), TOBN(0x998ad9f3, 0xf76ae7b2)}, + {TOBN(0x1e181950, 0xa41157f7), TOBN(0xa9d7e1e6, 0xe8da3a7e), + TOBN(0x963784eb, 0x8426b95f), TOBN(0x0ee4ed6e, 0x542e2a10)}}, + {{TOBN(0xb79d4cc5, 0xac751e7b), TOBN(0x93f96472, 0xfd4211bd), + TOBN(0x8c72d3d2, 0xc8de4fc6), TOBN(0x7b69cbf5, 0xdf44f064)}, + {TOBN(0x3da90ca2, 0xf4bf94e1), TOBN(0x1a5325f8, 0xf12894e2), + TOBN(0x0a437f6c, 0x7917d60b), TOBN(0x9be70486, 0x96c9cb5d)}}, + {{TOBN(0xb4d880bf, 0xe1dc5c05), TOBN(0xd738adda, 0xeebeeb57), + TOBN(0x6f0119d3, 0xdf0fe6a3), TOBN(0x5c686e55, 0x66eaaf5a)}, + {TOBN(0x9cb10b50, 0xdfd0b7ec), TOBN(0xbdd0264b, 0x6a497c21), + TOBN(0xfc093514, 0x8c546c96), TOBN(0x58a947fa, 0x79dbf42a)}}, + {{TOBN(0xc0b48d4e, 0x49ccd6d7), TOBN(0xff8fb02c, 0x88bd5580), + TOBN(0xc75235e9, 0x07d473b2), 
TOBN(0x4fab1ac5, 0xa2188af3)}, + {TOBN(0x030fa3bc, 0x97576ec0), TOBN(0xe8c946e8, 0x0b7e7d2f), + TOBN(0x40a5c9cc, 0x70305600), TOBN(0x6d8260a9, 0xc8b013b4)}}, + {{TOBN(0x0368304f, 0x70bba85c), TOBN(0xad090da1, 0xa4a0d311), + TOBN(0x7170e870, 0x2415eec1), TOBN(0xbfba35fe, 0x8461ea47)}, + {TOBN(0x6279019a, 0xc1e91938), TOBN(0xa47638f3, 0x1afc415f), + TOBN(0x36c65cbb, 0xbcba0e0f), TOBN(0x02160efb, 0x034e2c48)}}, + {{TOBN(0xe6c51073, 0x615cd9e4), TOBN(0x498ec047, 0xf1243c06), + TOBN(0x3e5a8809, 0xb17b3d8c), TOBN(0x5cd99e61, 0x0cc565f1)}, + {TOBN(0x81e312df, 0x7851dafe), TOBN(0xf156f5ba, 0xa79061e2), + TOBN(0x80d62b71, 0x880c590e), TOBN(0xbec9746f, 0x0a39faa1)}}, + {{TOBN(0x1d98a9c1, 0xc8ed1f7a), TOBN(0x09e43bb5, 0xa81d5ff2), + TOBN(0xd5f00f68, 0x0da0794a), TOBN(0x412050d9, 0x661aa836)}, + {TOBN(0xa89f7c4e, 0x90747e40), TOBN(0x6dc05ebb, 0xb62a3686), + TOBN(0xdf4de847, 0x308e3353), TOBN(0x53868fbb, 0x9fb53bb9)}}, + {{TOBN(0x2b09d2c3, 0xcfdcf7dd), TOBN(0x41a9fce3, 0x723fcab4), + TOBN(0x73d905f7, 0x07f57ca3), TOBN(0x080f9fb1, 0xac8e1555)}, + {TOBN(0x7c088e84, 0x9ba7a531), TOBN(0x07d35586, 0xed9a147f), + TOBN(0x602846ab, 0xaf48c336), TOBN(0x7320fd32, 0x0ccf0e79)}}, + {{TOBN(0xaa780798, 0xb18bd1ff), TOBN(0x52c2e300, 0xafdd2905), + TOBN(0xf27ea3d6, 0x434267cd), TOBN(0x8b96d16d, 0x15605b5f)}, + {TOBN(0x7bb31049, 0x4b45706b), TOBN(0xe7f58b8e, 0x743d25f8), + TOBN(0xe9b5e45b, 0x87f30076), TOBN(0xd19448d6, 0x5d053d5a)}}, + {{TOBN(0x1ecc8cb9, 0xd3210a04), TOBN(0x6bc7d463, 0xdafb5269), + TOBN(0x3e59b10a, 0x67c3489f), TOBN(0x1769788c, 0x65641e1b)}, + {TOBN(0x8a53b82d, 0xbd6cb838), TOBN(0x7066d6e6, 0x236d5f22), + TOBN(0x03aa1c61, 0x6908536e), TOBN(0xc971da0d, 0x66ae9809)}}, + {{TOBN(0x01b3a86b, 0xc49a2fac), TOBN(0x3b8420c0, 0x3092e77a), + TOBN(0x02057300, 0x7d6fb556), TOBN(0x6941b2a1, 0xbff40a87)}, + {TOBN(0x140b6308, 0x0658ff2a), TOBN(0x87804363, 0x3424ab36), + TOBN(0x0253bd51, 0x5751e299), TOBN(0xc75bcd76, 0x449c3e3a)}}, + {{TOBN(0x92eb4090, 0x7f8f875d), TOBN(0x9c9d754e, 0x56c26bbf), + TOBN(0x158cea61, 0x8110bbe7), TOBN(0x62a6b802, 0x745f91ea)}, + {TOBN(0xa79c41aa, 0xc6e7394b), TOBN(0x445b6a83, 0xad57ef10), + TOBN(0x0c5277eb, 0x6ea6f40c), TOBN(0x319fe96b, 0x88633365)}}, + {{TOBN(0x0b0fc61f, 0x385f63cb), TOBN(0x41250c84, 0x22bdd127), + TOBN(0x67d153f1, 0x09e942c2), TOBN(0x60920d08, 0xc021ad5d)}, + {TOBN(0x229f5746, 0x724d81a5), TOBN(0xb7ffb892, 0x5bba3299), + TOBN(0x518c51a1, 0xde413032), TOBN(0x2a9bfe77, 0x3c2fd94c)}}, + {{TOBN(0xcbcde239, 0x3191f4fd), TOBN(0x43093e16, 0xd3d6ada1), + TOBN(0x184579f3, 0x58769606), TOBN(0x2c94a8b3, 0xd236625c)}, + {TOBN(0x6922b9c0, 0x5c437d8e), TOBN(0x3d4ae423, 0xd8d9f3c8), + TOBN(0xf72c31c1, 0x2e7090a2), TOBN(0x4ac3f5f3, 0xd76a55bd)}}, + {{TOBN(0x342508fc, 0x6b6af991), TOBN(0x0d527100, 0x1b5cebbd), + TOBN(0xb84740d0, 0xdd440dd7), TOBN(0x748ef841, 0x780162fd)}, + {TOBN(0xa8dbfe0e, 0xdfc6fafb), TOBN(0xeadfdf05, 0xf7300f27), + TOBN(0x7d06555f, 0xfeba4ec9), TOBN(0x12c56f83, 0x9e25fa97)}}, + {{TOBN(0x77f84203, 0xd39b8c34), TOBN(0xed8b1be6, 0x3125eddb), + TOBN(0x5bbf2441, 0xf6e39dc5), TOBN(0xb00f6ee6, 0x6a5d678a)}, + {TOBN(0xba456ecf, 0x57d0ea99), TOBN(0xdcae0f58, 0x17e06c43), + TOBN(0x01643de4, 0x0f5b4baa), TOBN(0x2c324341, 0xd161b9be)}}, + {{TOBN(0x80177f55, 0xe126d468), TOBN(0xed325f1f, 0x76748e09), + TOBN(0x6116004a, 0xcfa9bdc2), TOBN(0x2d8607e6, 0x3a9fb468)}, + {TOBN(0x0e573e27, 0x6009d660), TOBN(0x3a525d2e, 0x8d10c5a1), + TOBN(0xd26cb45c, 0x3b9009a0), TOBN(0xb6b0cdc0, 0xde9d7448)}}, + {{TOBN(0x949c9976, 0xe1337c26), TOBN(0x6faadebd, 0xd73d68e5), + TOBN(0x9e158614, 
0xf1b768d9), TOBN(0x22dfa557, 0x9cc4f069)}, + {TOBN(0xccd6da17, 0xbe93c6d6), TOBN(0x24866c61, 0xa504f5b9), + TOBN(0x2121353c, 0x8d694da1), TOBN(0x1c6ca580, 0x0140b8c6)}}, + {{TOBN(0xc245ad8c, 0xe964021e), TOBN(0xb83bffba, 0x032b82b3), + TOBN(0xfaa220c6, 0x47ef9898), TOBN(0x7e8d3ac6, 0x982c948a)}, + {TOBN(0x1faa2091, 0xbc2d124a), TOBN(0xbd54c3dd, 0x05b15ff4), + TOBN(0x386bf3ab, 0xc87c6fb7), TOBN(0xfb2b0563, 0xfdeb6f66)}}, + {{TOBN(0x4e77c557, 0x5b45afb4), TOBN(0xe9ded649, 0xefb8912d), + TOBN(0x7ec9bbf5, 0x42f6e557), TOBN(0x2570dfff, 0x62671f00)}, + {TOBN(0x2b3bfb78, 0x88e084bd), TOBN(0xa024b238, 0xf37fe5b4), + TOBN(0x44e7dc04, 0x95649aee), TOBN(0x498ca255, 0x5e7ec1d8)}}, + {{TOBN(0x3bc766ea, 0xaaa07e86), TOBN(0x0db6facb, 0xf3608586), + TOBN(0xbadd2549, 0xbdc259c8), TOBN(0x95af3c6e, 0x041c649f)}, + {TOBN(0xb36a928c, 0x02e30afb), TOBN(0x9b5356ad, 0x008a88b8), + TOBN(0x4b67a5f1, 0xcf1d9e9d), TOBN(0xc6542e47, 0xa5d8d8ce)}}, + {{TOBN(0x73061fe8, 0x7adfb6cc), TOBN(0xcc826fd3, 0x98678141), + TOBN(0x00e758b1, 0x3c80515a), TOBN(0x6afe3247, 0x41485083)}, + {TOBN(0x0fcb08b9, 0xb6ae8a75), TOBN(0xb8cf388d, 0x4acf51e1), + TOBN(0x344a5560, 0x6961b9d6), TOBN(0x1a6778b8, 0x6a97fd0c)}}, + {{TOBN(0xd840fdc1, 0xecc4c7e3), TOBN(0xde9fe47d, 0x16db68cc), + TOBN(0xe95f89de, 0xa3e216aa), TOBN(0x84f1a6a4, 0x9594a8be)}, + {TOBN(0x7ddc7d72, 0x5a7b162b), TOBN(0xc5cfda19, 0xadc817a3), + TOBN(0x80a5d350, 0x78b58d46), TOBN(0x93365b13, 0x82978f19)}}, + {{TOBN(0x2e44d225, 0x26a1fc90), TOBN(0x0d6d10d2, 0x4d70705d), + TOBN(0xd94b6b10, 0xd70c45f4), TOBN(0x0f201022, 0xb216c079)}, + {TOBN(0xcec966c5, 0x658fde41), TOBN(0xa8d2bc7d, 0x7e27601d), + TOBN(0xbfcce3e1, 0xff230be7), TOBN(0x3394ff6b, 0x0033ffb5)}}, + {{TOBN(0xd890c509, 0x8132c9af), TOBN(0xaac4b0eb, 0x361e7868), + TOBN(0x5194ded3, 0xe82d15aa), TOBN(0x4550bd2e, 0x23ae6b7d)}, + {TOBN(0x3fda318e, 0xea5399d4), TOBN(0xd989bffa, 0x91638b80), + TOBN(0x5ea124d0, 0xa14aa12d), TOBN(0x1fb1b899, 0x3667b944)}}, + {{TOBN(0x95ec7969, 0x44c44d6a), TOBN(0x91df144a, 0x57e86137), + TOBN(0x915fd620, 0x73adac44), TOBN(0x8f01732d, 0x59a83801)}, + {TOBN(0xec579d25, 0x3aa0a633), TOBN(0x06de5e7c, 0xc9d6d59c), + TOBN(0xc132f958, 0xb1ef8010), TOBN(0x29476f96, 0xe65c1a02)}}, + {{TOBN(0x336a77c0, 0xd34c3565), TOBN(0xef1105b2, 0x1b9f1e9e), + TOBN(0x63e6d08b, 0xf9e08002), TOBN(0x9aff2f21, 0xc613809e)}, + {TOBN(0xb5754f85, 0x3a80e75d), TOBN(0xde71853e, 0x6bbda681), + TOBN(0x86f041df, 0x8197fd7a), TOBN(0x8b332e08, 0x127817fa)}}, + {{TOBN(0x05d99be8, 0xb9c20cda), TOBN(0x89f7aad5, 0xd5cd0c98), + TOBN(0x7ef936fe, 0x5bb94183), TOBN(0x92ca0753, 0xb05cd7f2)}, + {TOBN(0x9d65db11, 0x74a1e035), TOBN(0x02628cc8, 0x13eaea92), + TOBN(0xf2d9e242, 0x49e4fbf2), TOBN(0x94fdfd9b, 0xe384f8b7)}}, + {{TOBN(0x65f56054, 0x63428c6b), TOBN(0x2f7205b2, 0x90b409a5), + TOBN(0xf778bb78, 0xff45ae11), TOBN(0xa13045be, 0xc5ee53b2)}, + {TOBN(0xe00a14ff, 0x03ef77fe), TOBN(0x689cd59f, 0xffef8bef), + TOBN(0x3578f0ed, 0x1e9ade22), TOBN(0xe99f3ec0, 0x6268b6a8)}}, + {{TOBN(0xa2057d91, 0xea1b3c3e), TOBN(0x2d1a7053, 0xb8823a4a), + TOBN(0xabbb336a, 0x2cca451e), TOBN(0xcd2466e3, 0x2218bb5d)}, + {TOBN(0x3ac1f42f, 0xc8cb762d), TOBN(0x7e312aae, 0x7690211f), + TOBN(0xebb9bd73, 0x45d07450), TOBN(0x207c4b82, 0x46c2213f)}}, + {{TOBN(0x99d425c1, 0x375913ec), TOBN(0x94e45e96, 0x67908220), + TOBN(0xc08f3087, 0xcd67dbf6), TOBN(0xa5670fbe, 0xc0887056)}, + {TOBN(0x6717b64a, 0x66f5b8fc), TOBN(0xd5a56aea, 0x786fec28), + TOBN(0xa8c3f55f, 0xc0ff4952), TOBN(0xa77fefae, 0x457ac49b)}}, + {{TOBN(0x29882d7c, 0x98379d44), TOBN(0xd000bdfb, 0x509edc8a), + 
TOBN(0xc6f95979, 0xe66fe464), TOBN(0x504a6115, 0xfa61bde0)}, + {TOBN(0x56b3b871, 0xeffea31a), TOBN(0x2d3de26d, 0xf0c21a54), + TOBN(0x21dbff31, 0x834753bf), TOBN(0xe67ecf49, 0x69269d86)}}, + {{TOBN(0x7a176952, 0x151fe690), TOBN(0x03515804, 0x7f2adb5f), + TOBN(0xee794b15, 0xd1b62a8d), TOBN(0xf004ceec, 0xaae454e6)}, + {TOBN(0x0897ea7c, 0xf0386fac), TOBN(0x3b62ff12, 0xd1fca751), + TOBN(0x154181df, 0x1b7a04ec), TOBN(0x2008e04a, 0xfb5847ec)}}, + {{TOBN(0xd147148e, 0x41dbd772), TOBN(0x2b419f73, 0x22942654), + TOBN(0x669f30d3, 0xe9c544f7), TOBN(0x52a2c223, 0xc8540149)}, + {TOBN(0x5da9ee14, 0x634dfb02), TOBN(0x5f074ff0, 0xf47869f3), + TOBN(0x74ee878d, 0xa3933acc), TOBN(0xe6510651, 0x4fe35ed1)}}, + {{TOBN(0xb3eb9482, 0xf1012e7a), TOBN(0x51013cc0, 0xa8a566ae), + TOBN(0xdd5e9243, 0x47c00d3b), TOBN(0x7fde089d, 0x946bb0e5)}, + {TOBN(0x030754fe, 0xc731b4b3), TOBN(0x12a136a4, 0x99fda062), + TOBN(0x7c1064b8, 0x5a1a35bc), TOBN(0xbf1f5763, 0x446c84ef)}}, + {{TOBN(0xed29a56d, 0xa16d4b34), TOBN(0x7fba9d09, 0xdca21c4f), + TOBN(0x66d7ac00, 0x6d8de486), TOBN(0x60061987, 0x73a2a5e1)}, + {TOBN(0x8b400f86, 0x9da28ff0), TOBN(0x3133f708, 0x43c4599c), + TOBN(0x9911c9b8, 0xee28cb0d), TOBN(0xcd7e2874, 0x8e0af61d)}}, + {{TOBN(0x5a85f0f2, 0x72ed91fc), TOBN(0x85214f31, 0x9cd4a373), + TOBN(0x881fe5be, 0x1925253c), TOBN(0xd8dc98e0, 0x91e8bc76)}, + {TOBN(0x7120affe, 0x585cc3a2), TOBN(0x724952ed, 0x735bf97a), + TOBN(0x5581e7dc, 0x3eb34581), TOBN(0x5cbff4f2, 0xe52ee57d)}}, + {{TOBN(0x8d320a0e, 0x87d8cc7b), TOBN(0x9beaa7f3, 0xf1d280d0), + TOBN(0x7a0b9571, 0x9beec704), TOBN(0x9126332e, 0x5b7f0057)}, + {TOBN(0x01fbc1b4, 0x8ed3bd6d), TOBN(0x35bb2c12, 0xd945eb24), + TOBN(0x6404694e, 0x9a8ae255), TOBN(0xb6092eec, 0x8d6abfb3)}}, + {{TOBN(0x4d76143f, 0xcc058865), TOBN(0x7b0a5af2, 0x6e249922), + TOBN(0x8aef9440, 0x6a50d353), TOBN(0xe11e4bcc, 0x64f0e07a)}, + {TOBN(0x4472993a, 0xa14a90fa), TOBN(0x7706e20c, 0xba0c51d4), + TOBN(0xf403292f, 0x1532672d), TOBN(0x52573bfa, 0x21829382)}}, + {{TOBN(0x6a7bb6a9, 0x3b5bdb83), TOBN(0x08da65c0, 0xa4a72318), + TOBN(0xc58d22aa, 0x63eb065f), TOBN(0x1717596c, 0x1b15d685)}, + {TOBN(0x112df0d0, 0xb266d88b), TOBN(0xf688ae97, 0x5941945a), + TOBN(0x487386e3, 0x7c292cac), TOBN(0x42f3b50d, 0x57d6985c)}}, + {{TOBN(0x6da4f998, 0x6a90fc34), TOBN(0xc8f257d3, 0x65ca8a8d), + TOBN(0xc2feabca, 0x6951f762), TOBN(0xe1bc81d0, 0x74c323ac)}, + {TOBN(0x1bc68f67, 0x251a2a12), TOBN(0x10d86587, 0xbe8a70dc), + TOBN(0xd648af7f, 0xf0f84d2e), TOBN(0xf0aa9ebc, 0x6a43ac92)}}, + {{TOBN(0x69e3be04, 0x27596893), TOBN(0xb6bb02a6, 0x45bf452b), + TOBN(0x0875c11a, 0xf4c698c8), TOBN(0x6652b5c7, 0xbece3794)}, + {TOBN(0x7b3755fd, 0x4f5c0499), TOBN(0x6ea16558, 0xb5532b38), + TOBN(0xd1c69889, 0xa2e96ef7), TOBN(0x9c773c3a, 0x61ed8f48)}}, + {{TOBN(0x2b653a40, 0x9b323abc), TOBN(0xe26605e1, 0xf0e1d791), + TOBN(0x45d41064, 0x4a87157a), TOBN(0x8f9a78b7, 0xcbbce616)}, + {TOBN(0xcf1e44aa, 0xc407eddd), TOBN(0x81ddd1d8, 0xa35b964f), + TOBN(0x473e339e, 0xfd083999), TOBN(0x6c94bdde, 0x8e796802)}}, + {{TOBN(0x5a304ada, 0x8545d185), TOBN(0x82ae44ea, 0x738bb8cb), + TOBN(0x628a35e3, 0xdf87e10e), TOBN(0xd3624f3d, 0xa15b9fe3)}, + {TOBN(0xcc44209b, 0x14be4254), TOBN(0x7d0efcbc, 0xbdbc2ea5), + TOBN(0x1f603362, 0x04c37bbe), TOBN(0x21f363f5, 0x56a5852c)}}, + {{TOBN(0xa1503d1c, 0xa8501550), TOBN(0x2251e0e1, 0xd8ab10bb), + TOBN(0xde129c96, 0x6961c51c), TOBN(0x1f7246a4, 0x81910f68)}, + {TOBN(0x2eb744ee, 0x5f2591f2), TOBN(0x3c47d33f, 0x5e627157), + TOBN(0x4d6d62c9, 0x22f3bd68), TOBN(0x6120a64b, 0xcb8df856)}}, + {{TOBN(0x3a9ac6c0, 0x7b5d07df), TOBN(0xa92b9558, 0x7ef39783), 
+ TOBN(0xe128a134, 0xab3a9b4f), TOBN(0x41c18807, 0xb1252f05)}, + {TOBN(0xfc7ed089, 0x80ba9b1c), TOBN(0xac8dc6de, 0xc532a9dd), + TOBN(0xbf829cef, 0x55246809), TOBN(0x101b784f, 0x5b4ee80f)}}, + {{TOBN(0xc09945bb, 0xb6f11603), TOBN(0x57b09dbe, 0x41d2801e), + TOBN(0xfba5202f, 0xa97534a8), TOBN(0x7fd8ae5f, 0xc17b9614)}, + {TOBN(0xa50ba666, 0x78308435), TOBN(0x9572f77c, 0xd3868c4d), + TOBN(0x0cef7bfd, 0x2dd7aab0), TOBN(0xe7958e08, 0x2c7c79ff)}}, + {{TOBN(0x81262e42, 0x25346689), TOBN(0x716da290, 0xb07c7004), + TOBN(0x35f911ea, 0xb7950ee3), TOBN(0x6fd72969, 0x261d21b5)}, + {TOBN(0x52389803, 0x08b640d3), TOBN(0x5b0026ee, 0x887f12a1), + TOBN(0x20e21660, 0x742e9311), TOBN(0x0ef6d541, 0x5ff77ff7)}}, + {{TOBN(0x969127f0, 0xf9c41135), TOBN(0xf21d60c9, 0x68a64993), + TOBN(0x656e5d0c, 0xe541875c), TOBN(0xf1e0f84e, 0xa1d3c233)}, + {TOBN(0x9bcca359, 0x06002d60), TOBN(0xbe2da60c, 0x06191552), + TOBN(0x5da8bbae, 0x61181ec3), TOBN(0x9f04b823, 0x65806f19)}}, + {{TOBN(0xf1604a7d, 0xd4b79bb8), TOBN(0xaee806fb, 0x52c878c8), + TOBN(0x34144f11, 0x8d47b8e8), TOBN(0x72edf52b, 0x949f9054)}, + {TOBN(0xebfca84e, 0x2127015a), TOBN(0x9051d0c0, 0x9cb7cef3), + TOBN(0x86e8fe58, 0x296deec8), TOBN(0x33b28188, 0x41010d74)}}}, + {{{TOBN(0x01079383, 0x171b445f), TOBN(0x9bcf21e3, 0x8131ad4c), + TOBN(0x8cdfe205, 0xc93987e8), TOBN(0xe63f4152, 0xc92e8c8f)}, + {TOBN(0x729462a9, 0x30add43d), TOBN(0x62ebb143, 0xc980f05a), + TOBN(0x4f3954e5, 0x3b06e968), TOBN(0xfe1d75ad, 0x242cf6b1)}}, + {{TOBN(0x5f95c6c7, 0xaf8685c8), TOBN(0xd4c1c8ce, 0x2f8f01aa), + TOBN(0xc44bbe32, 0x2574692a), TOBN(0xb8003478, 0xd4a4a068)}, + {TOBN(0x7c8fc6e5, 0x2eca3cdb), TOBN(0xea1db16b, 0xec04d399), + TOBN(0xb05bc82e, 0x8f2bc5cf), TOBN(0x763d517f, 0xf44793d2)}}, + {{TOBN(0x4451c1b8, 0x08bd98d0), TOBN(0x644b1cd4, 0x6575f240), + TOBN(0x6907eb33, 0x7375d270), TOBN(0x56c8bebd, 0xfa2286bd)}, + {TOBN(0xc713d2ac, 0xc4632b46), TOBN(0x17da427a, 0xafd60242), + TOBN(0x313065b7, 0xc95c7546), TOBN(0xf8239898, 0xbf17a3de)}}, + {{TOBN(0xf3b7963f, 0x4c830320), TOBN(0x842c7aa0, 0x903203e3), + TOBN(0xaf22ca0a, 0xe7327afb), TOBN(0x38e13092, 0x967609b6)}, + {TOBN(0x73b8fb62, 0x757558f1), TOBN(0x3cc3e831, 0xf7eca8c1), + TOBN(0xe4174474, 0xf6331627), TOBN(0xa77989ca, 0xc3c40234)}}, + {{TOBN(0xe5fd17a1, 0x44a081e0), TOBN(0xd797fb7d, 0xb70e296a), + TOBN(0x2b472b30, 0x481f719c), TOBN(0x0e632a98, 0xfe6f8c52)}, + {TOBN(0x89ccd116, 0xc5f0c284), TOBN(0xf51088af, 0x2d987c62), + TOBN(0x2a2bccda, 0x4c2de6cf), TOBN(0x810f9efe, 0xf679f0f9)}}, + {{TOBN(0xb0f394b9, 0x7ffe4b3e), TOBN(0x0b691d21, 0xe5fa5d21), + TOBN(0xb0bd7747, 0x9dfbbc75), TOBN(0xd2830fda, 0xfaf78b00)}, + {TOBN(0xf78c249c, 0x52434f57), TOBN(0x4b1f7545, 0x98096dab), + TOBN(0x73bf6f94, 0x8ff8c0b3), TOBN(0x34aef03d, 0x454e134c)}}, + {{TOBN(0xf8d151f4, 0xb7ac7ec5), TOBN(0xd6ceb95a, 0xe50da7d5), + TOBN(0xa1b492b0, 0xdc3a0eb8), TOBN(0x75157b69, 0xb3dd2863)}, + {TOBN(0xe2c4c74e, 0xc5413d62), TOBN(0xbe329ff7, 0xbc5fc4c7), + TOBN(0x835a2aea, 0x60fa9dda), TOBN(0xf117f5ad, 0x7445cb87)}}, + {{TOBN(0xae8317f4, 0xb0166f7a), TOBN(0xfbd3e3f7, 0xceec74e6), + TOBN(0xfdb516ac, 0xe0874bfd), TOBN(0x3d846019, 0xc681f3a3)}, + {TOBN(0x0b12ee5c, 0x7c1620b0), TOBN(0xba68b4dd, 0x2b63c501), + TOBN(0xac03cd32, 0x6668c51e), TOBN(0x2a6279f7, 0x4e0bcb5b)}}, + {{TOBN(0x17bd69b0, 0x6ae85c10), TOBN(0x72946979, 0x1dfdd3a6), + TOBN(0xd9a03268, 0x2c078bec), TOBN(0x41c6a658, 0xbfd68a52)}, + {TOBN(0xcdea1024, 0x0e023900), TOBN(0xbaeec121, 0xb10d144d), + TOBN(0x5a600e74, 0x058ab8dc), TOBN(0x1333af21, 0xbb89ccdd)}}, + {{TOBN(0xdf25eae0, 0x3aaba1f1), TOBN(0x2cada16e, 
0x3b7144cf), + TOBN(0x657ee27d, 0x71ab98bc), TOBN(0x99088b4c, 0x7a6fc96e)}, + {TOBN(0x05d5c0a0, 0x3549dbd4), TOBN(0x42cbdf8f, 0xf158c3ac), + TOBN(0x3fb6b3b0, 0x87edd685), TOBN(0x22071cf6, 0x86f064d0)}}, + {{TOBN(0xd2d6721f, 0xff2811e5), TOBN(0xdb81b703, 0xfe7fae8c), + TOBN(0x3cfb74ef, 0xd3f1f7bb), TOBN(0x0cdbcd76, 0x16cdeb5d)}, + {TOBN(0x4f39642a, 0x566a808c), TOBN(0x02b74454, 0x340064d6), + TOBN(0xfabbadca, 0x0528fa6f), TOBN(0xe4c3074c, 0xd3fc0bb6)}}, + {{TOBN(0xb32cb8b0, 0xb796d219), TOBN(0xc3e95f4f, 0x34741dd9), + TOBN(0x87212125, 0x68edf6f5), TOBN(0x7a03aee4, 0xa2b9cb8e)}, + {TOBN(0x0cd3c376, 0xf53a89aa), TOBN(0x0d8af9b1, 0x948a28dc), + TOBN(0xcf86a3f4, 0x902ab04f), TOBN(0x8aacb62a, 0x7f42002d)}}, + {{TOBN(0x106985eb, 0xf62ffd52), TOBN(0xe670b54e, 0x5797bf10), + TOBN(0x4b405209, 0xc5e30aef), TOBN(0x12c97a20, 0x4365b5e9)}, + {TOBN(0x104646ce, 0x1fe32093), TOBN(0x13cb4ff6, 0x3907a8c9), + TOBN(0x8b9f30d1, 0xd46e726b), TOBN(0xe1985e21, 0xaba0f499)}}, + {{TOBN(0xc573dea9, 0x10a230cd), TOBN(0x24f46a93, 0xcd30f947), + TOBN(0xf2623fcf, 0xabe2010a), TOBN(0x3f278cb2, 0x73f00e4f)}, + {TOBN(0xed55c67d, 0x50b920eb), TOBN(0xf1cb9a2d, 0x8e760571), + TOBN(0x7c50d109, 0x0895b709), TOBN(0x4207cf07, 0x190d4369)}}, + {{TOBN(0x3b027e81, 0xc4127fe1), TOBN(0xa9f8b9ad, 0x3ae9c566), + TOBN(0x5ab10851, 0xacbfbba5), TOBN(0xa747d648, 0x569556f5)}, + {TOBN(0xcc172b5c, 0x2ba97bf7), TOBN(0x15e0f77d, 0xbcfa3324), + TOBN(0xa345b797, 0x7686279d), TOBN(0x5a723480, 0xe38003d3)}}, + {{TOBN(0xfd8e139f, 0x8f5fcda8), TOBN(0xf3e558c4, 0xbdee5bfd), + TOBN(0xd76cbaf4, 0xe33f9f77), TOBN(0x3a4c97a4, 0x71771969)}, + {TOBN(0xda27e84b, 0xf6dce6a7), TOBN(0xff373d96, 0x13e6c2d1), + TOBN(0xf115193c, 0xd759a6e9), TOBN(0x3f9b7025, 0x63d2262c)}}, + {{TOBN(0xd9764a31, 0x317cd062), TOBN(0x30779d8e, 0x199f8332), + TOBN(0xd8074106, 0x16b11b0b), TOBN(0x7917ab9f, 0x78aeaed8)}, + {TOBN(0xb67a9cbe, 0x28fb1d8e), TOBN(0x2e313563, 0x136eda33), + TOBN(0x010b7069, 0xa371a86c), TOBN(0x44d90fa2, 0x6744e6b7)}}, + {{TOBN(0x68190867, 0xd6b3e243), TOBN(0x9fe6cd9d, 0x59048c48), + TOBN(0xb900b028, 0x95731538), TOBN(0xa012062f, 0x32cae04f)}, + {TOBN(0x8107c8bc, 0x9399d082), TOBN(0x47e8c54a, 0x41df12e2), + TOBN(0x14ba5117, 0xb6ef3f73), TOBN(0x22260bea, 0x81362f0b)}}, + {{TOBN(0x90ea261e, 0x1a18cc20), TOBN(0x2192999f, 0x2321d636), + TOBN(0xef64d314, 0xe311b6a0), TOBN(0xd7401e4c, 0x3b54a1f5)}, + {TOBN(0x19019983, 0x6fbca2ba), TOBN(0x46ad3293, 0x8fbffc4b), + TOBN(0xa142d3f6, 0x3786bf40), TOBN(0xeb5cbc26, 0xb67039fc)}}, + {{TOBN(0x9cb0ae6c, 0x252bd479), TOBN(0x05e0f88a, 0x12b5848f), + TOBN(0x78f6d2b2, 0xa5c97663), TOBN(0x6f6e149b, 0xc162225c)}, + {TOBN(0xe602235c, 0xde601a89), TOBN(0xd17bbe98, 0xf373be1f), + TOBN(0xcaf49a5b, 0xa8471827), TOBN(0x7e1a0a85, 0x18aaa116)}}, + {{TOBN(0x6c833196, 0x270580c3), TOBN(0x1e233839, 0xf1c98a14), + TOBN(0x67b2f7b4, 0xae34e0a5), TOBN(0x47ac8745, 0xd8ce7289)}, + {TOBN(0x2b74779a, 0x100dd467), TOBN(0x274a4337, 0x4ee50d09), + TOBN(0x603dcf13, 0x83608bc9), TOBN(0xcd9da6c3, 0xc89e8388)}}, + {{TOBN(0x2660199f, 0x355116ac), TOBN(0xcc38bb59, 0xb6d18eed), + TOBN(0x3075f31f, 0x2f4bc071), TOBN(0x9774457f, 0x265dc57e)}, + {TOBN(0x06a6a9c8, 0xc6db88bb), TOBN(0x6429d07f, 0x4ec98e04), + TOBN(0x8d05e57b, 0x05ecaa8b), TOBN(0x20f140b1, 0x7872ea7b)}}, + {{TOBN(0xdf8c0f09, 0xca494693), TOBN(0x48d3a020, 0xf252e909), + TOBN(0x4c5c29af, 0x57b14b12), TOBN(0x7e6fa37d, 0xbf47ad1c)}, + {TOBN(0x66e7b506, 0x49a0c938), TOBN(0xb72c0d48, 0x6be5f41f), + TOBN(0x6a6242b8, 0xb2359412), TOBN(0xcd35c774, 0x8e859480)}}, + {{TOBN(0x12536fea, 0x87baa627), 
TOBN(0x58c1fec1, 0xf72aa680), + TOBN(0x6c29b637, 0x601e5dc9), TOBN(0x9e3c3c1c, 0xde9e01b9)}, + {TOBN(0xefc8127b, 0x2bcfe0b0), TOBN(0x35107102, 0x2a12f50d), + TOBN(0x6ccd6cb1, 0x4879b397), TOBN(0xf792f804, 0xf8a82f21)}}, + {{TOBN(0x509d4804, 0xa9b46402), TOBN(0xedddf85d, 0xc10f0850), + TOBN(0x928410dc, 0x4b6208aa), TOBN(0xf6229c46, 0x391012dc)}, + {TOBN(0xc5a7c41e, 0x7727b9b6), TOBN(0x289e4e4b, 0xaa444842), + TOBN(0x049ba1d9, 0xe9a947ea), TOBN(0x44f9e47f, 0x83c8debc)}}, + {{TOBN(0xfa77a1fe, 0x611f8b8e), TOBN(0xfd2e416a, 0xf518f427), + TOBN(0xc5fffa70, 0x114ebac3), TOBN(0xfe57c4e9, 0x5d89697b)}, + {TOBN(0xfdd053ac, 0xb1aaf613), TOBN(0x31df210f, 0xea585a45), + TOBN(0x318cc10e, 0x24985034), TOBN(0x1a38efd1, 0x5f1d6130)}}, + {{TOBN(0xbf86f237, 0x0b1e9e21), TOBN(0xb258514d, 0x1dbe88aa), + TOBN(0x1e38a588, 0x90c1baf9), TOBN(0x2936a01e, 0xbdb9b692)}, + {TOBN(0xd576de98, 0x6dd5b20c), TOBN(0xb586bf71, 0x70f98ecf), + TOBN(0xcccf0f12, 0xc42d2fd7), TOBN(0x8717e61c, 0xfb35bd7b)}}, + {{TOBN(0x8b1e5722, 0x35e6fc06), TOBN(0x3477728f, 0x0b3e13d5), + TOBN(0x150c294d, 0xaa8a7372), TOBN(0xc0291d43, 0x3bfa528a)}, + {TOBN(0xc6c8bc67, 0xcec5a196), TOBN(0xdeeb31e4, 0x5c2e8a7c), + TOBN(0xba93e244, 0xfb6e1c51), TOBN(0xb9f8b71b, 0x2e28e156)}}, + {{TOBN(0xce65a287, 0x968a2ab9), TOBN(0xe3c5ce69, 0x46bbcb1f), + TOBN(0xf8c835b9, 0xe7ae3f30), TOBN(0x16bbee26, 0xff72b82b)}, + {TOBN(0x665e2017, 0xfd42cd22), TOBN(0x1e139970, 0xf8b1d2a0), + TOBN(0x125cda29, 0x79204932), TOBN(0x7aee94a5, 0x49c3bee5)}}, + {{TOBN(0x68c70160, 0x89821a66), TOBN(0xf7c37678, 0x8f981669), + TOBN(0xd90829fc, 0x48cc3645), TOBN(0x346af049, 0xd70addfc)}, + {TOBN(0x2057b232, 0x370bf29c), TOBN(0xf90c73ce, 0x42e650ee), + TOBN(0xe03386ea, 0xa126ab90), TOBN(0x0e266e7e, 0x975a087b)}}, + {{TOBN(0x80578eb9, 0x0fca65d9), TOBN(0x7e2989ea, 0x16af45b8), + TOBN(0x7438212d, 0xcac75a4e), TOBN(0x38c7ca39, 0x4fef36b8)}, + {TOBN(0x8650c494, 0xd402676a), TOBN(0x26ab5a66, 0xf72c7c48), + TOBN(0x4e6cb426, 0xce3a464e), TOBN(0xf8f99896, 0x2b72f841)}}, + {{TOBN(0x8c318491, 0x1a335cc8), TOBN(0x563459ba, 0x6a5913e4), + TOBN(0x1b920d61, 0xc7b32919), TOBN(0x805ab8b6, 0xa02425ad)}, + {TOBN(0x2ac512da, 0x8d006086), TOBN(0x6ca4846a, 0xbcf5c0fd), + TOBN(0xafea51d8, 0xac2138d7), TOBN(0xcb647545, 0x344cd443)}}, + {{TOBN(0x0429ee8f, 0xbd7d9040), TOBN(0xee66a2de, 0x819b9c96), + TOBN(0x54f9ec25, 0xdea7d744), TOBN(0x2ffea642, 0x671721bb)}, + {TOBN(0x4f19dbd1, 0x114344ea), TOBN(0x04304536, 0xfd0dbc8b), + TOBN(0x014b50aa, 0x29ec7f91), TOBN(0xb5fc22fe, 0xbb06014d)}}, + {{TOBN(0x60d963a9, 0x1ee682e0), TOBN(0xdf48abc0, 0xfe85c727), + TOBN(0x0cadba13, 0x2e707c2d), TOBN(0xde608d3a, 0xa645aeff)}, + {TOBN(0x05f1c28b, 0xedafd883), TOBN(0x3c362ede, 0xbd94de1f), + TOBN(0x8dd0629d, 0x13593e41), TOBN(0x0a5e736f, 0x766d6eaf)}}, + {{TOBN(0xbfa92311, 0xf68cf9d1), TOBN(0xa4f9ef87, 0xc1797556), + TOBN(0x10d75a1f, 0x5601c209), TOBN(0x651c374c, 0x09b07361)}, + {TOBN(0x49950b58, 0x88b5cead), TOBN(0x0ef00058, 0x6fa9dbaa), + TOBN(0xf51ddc26, 0x4e15f33a), TOBN(0x1f8b5ca6, 0x2ef46140)}}, + {{TOBN(0x343ac0a3, 0xee9523f0), TOBN(0xbb75eab2, 0x975ea978), + TOBN(0x1bccf332, 0x107387f4), TOBN(0x790f9259, 0x9ab0062e)}, + {TOBN(0xf1a363ad, 0x1e4f6a5f), TOBN(0x06e08b84, 0x62519a50), + TOBN(0x60915187, 0x7265f1ee), TOBN(0x6a80ca34, 0x93ae985e)}}, + {{TOBN(0x81b29768, 0xaaba4864), TOBN(0xb13cabf2, 0x8d52a7d6), + TOBN(0xb5c36348, 0x8ead03f1), TOBN(0xc932ad95, 0x81c7c1c0)}, + {TOBN(0x5452708e, 0xcae1e27b), TOBN(0x9dac4269, 0x1b0df648), + TOBN(0x233e3f0c, 0xdfcdb8bc), TOBN(0xe6ceccdf, 0xec540174)}}, + {{TOBN(0xbd0d845e, 
0x95081181), TOBN(0xcc8a7920, 0x699355d5), + TOBN(0x111c0f6d, 0xc3b375a8), TOBN(0xfd95bc6b, 0xfd51e0dc)}, + {TOBN(0x4a106a26, 0x6888523a), TOBN(0x4d142bd6, 0xcb01a06d), + TOBN(0x79bfd289, 0xadb9b397), TOBN(0x0bdbfb94, 0xe9863914)}}, + {{TOBN(0x29d8a229, 0x1660f6a6), TOBN(0x7f6abcd6, 0x551c042d), + TOBN(0x13039deb, 0x0ac3ffe8), TOBN(0xa01be628, 0xec8523fb)}, + {TOBN(0x6ea34103, 0x0ca1c328), TOBN(0xc74114bd, 0xb903928e), + TOBN(0x8aa4ff4e, 0x9e9144b0), TOBN(0x7064091f, 0x7f9a4b17)}}, + {{TOBN(0xa3f4f521, 0xe447f2c4), TOBN(0x81b8da7a, 0x604291f0), + TOBN(0xd680bc46, 0x7d5926de), TOBN(0x84f21fd5, 0x34a1202f)}, + {TOBN(0x1d1e3181, 0x4e9df3d8), TOBN(0x1ca4861a, 0x39ab8d34), + TOBN(0x809ddeec, 0x5b19aa4a), TOBN(0x59f72f7e, 0x4d329366)}}, + {{TOBN(0xa2f93f41, 0x386d5087), TOBN(0x40bf739c, 0xdd67d64f), + TOBN(0xb4494205, 0x66702158), TOBN(0xc33c65be, 0x73b1e178)}, + {TOBN(0xcdcd657c, 0x38ca6153), TOBN(0x97f4519a, 0xdc791976), + TOBN(0xcc7c7f29, 0xcd6e1f39), TOBN(0x38de9cfb, 0x7e3c3932)}}, + {{TOBN(0xe448eba3, 0x7b793f85), TOBN(0xe9f8dbf9, 0xf067e914), + TOBN(0xc0390266, 0xf114ae87), TOBN(0x39ed75a7, 0xcd6a8e2a)}, + {TOBN(0xadb14848, 0x7ffba390), TOBN(0x67f8cb8b, 0x6af9bc09), + TOBN(0x322c3848, 0x9c7476db), TOBN(0xa320fecf, 0x52a538d6)}}, + {{TOBN(0xe0493002, 0xb2aced2b), TOBN(0xdfba1809, 0x616bd430), + TOBN(0x531c4644, 0xc331be70), TOBN(0xbc04d32e, 0x90d2e450)}, + {TOBN(0x1805a0d1, 0x0f9f142d), TOBN(0x2c44a0c5, 0x47ee5a23), + TOBN(0x31875a43, 0x3989b4e3), TOBN(0x6b1949fd, 0x0c063481)}}, + {{TOBN(0x2dfb9e08, 0xbe0f4492), TOBN(0x3ff0da03, 0xe9d5e517), + TOBN(0x03dbe9a1, 0xf79466a8), TOBN(0x0b87bcd0, 0x15ea9932)}, + {TOBN(0xeb64fc83, 0xab1f58ab), TOBN(0x6d9598da, 0x817edc8a), + TOBN(0x699cff66, 0x1d3b67e5), TOBN(0x645c0f29, 0x92635853)}}, + {{TOBN(0x253cdd82, 0xeabaf21c), TOBN(0x82b9602a, 0x2241659e), + TOBN(0x2cae07ec, 0x2d9f7091), TOBN(0xbe4c720c, 0x8b48cd9b)}, + {TOBN(0x6ce5bc03, 0x6f08d6c9), TOBN(0x36e8a997, 0xaf10bf40), + TOBN(0x83422d21, 0x3e10ff12), TOBN(0x7b26d3eb, 0xbcc12494)}}, + {{TOBN(0xb240d2d0, 0xc9469ad6), TOBN(0xc4a11b4d, 0x30afa05b), + TOBN(0x4b604ace, 0xdd6ba286), TOBN(0x18486600, 0x3ee2864c)}, + {TOBN(0x5869d6ba, 0x8d9ce5be), TOBN(0x0d8f68c5, 0xff4bfb0d), + TOBN(0xb69f210b, 0x5700cf73), TOBN(0x61f6653a, 0x6d37c135)}}, + {{TOBN(0xff3d432b, 0x5aff5a48), TOBN(0x0d81c4b9, 0x72ba3a69), + TOBN(0xee879ae9, 0xfa1899ef), TOBN(0xbac7e2a0, 0x2d6acafd)}, + {TOBN(0xd6d93f6c, 0x1c664399), TOBN(0x4c288de1, 0x5bcb135d), + TOBN(0x83031dab, 0x9dab7cbf), TOBN(0xfe23feb0, 0x3abbf5f0)}}, + {{TOBN(0x9f1b2466, 0xcdedca85), TOBN(0x140bb710, 0x1a09538c), + TOBN(0xac8ae851, 0x5e11115d), TOBN(0x0d63ff67, 0x6f03f59e)}, + {TOBN(0x755e5551, 0x7d234afb), TOBN(0x61c2db4e, 0x7e208fc1), + TOBN(0xaa9859ce, 0xf28a4b5d), TOBN(0xbdd6d4fc, 0x34af030f)}}, + {{TOBN(0xd1c4a26d, 0x3be01cb1), TOBN(0x9ba14ffc, 0x243aa07c), + TOBN(0xf95cd3a9, 0xb2503502), TOBN(0xe379bc06, 0x7d2a93ab)}, + {TOBN(0x3efc18e9, 0xd4ca8d68), TOBN(0x083558ec, 0x80bb412a), + TOBN(0xd903b940, 0x9645a968), TOBN(0xa499f0b6, 0x9ba6054f)}}, + {{TOBN(0x208b573c, 0xb8349abe), TOBN(0x3baab3e5, 0x30b4fc1c), + TOBN(0x87e978ba, 0xcb524990), TOBN(0x3524194e, 0xccdf0e80)}, + {TOBN(0x62711725, 0x7d4bcc42), TOBN(0xe90a3d9b, 0xb90109ba), + TOBN(0x3b1bdd57, 0x1323e1e0), TOBN(0xb78e9bd5, 0x5eae1599)}}, + {{TOBN(0x0794b746, 0x9e03d278), TOBN(0x80178605, 0xd70e6297), + TOBN(0x171792f8, 0x99c97855), TOBN(0x11b393ee, 0xf5a86b5c)}, + {TOBN(0x48ef6582, 0xd8884f27), TOBN(0xbd44737a, 0xbf19ba5f), + TOBN(0x8698de4c, 0xa42062c6), TOBN(0x8975eb80, 0x61ce9c54)}}, + 
{{TOBN(0xd50e57c7, 0xd7fe71f3), TOBN(0x15342190, 0xbc97ce38), + TOBN(0x51bda2de, 0x4df07b63), TOBN(0xba12aeae, 0x200eb87d)}, + {TOBN(0xabe135d2, 0xa9b4f8f6), TOBN(0x04619d65, 0xfad6d99c), + TOBN(0x4a6683a7, 0x7994937c), TOBN(0x7a778c8b, 0x6f94f09a)}}, + {{TOBN(0x8c508623, 0x20a71b89), TOBN(0x241a2aed, 0x1c229165), + TOBN(0x352be595, 0xaaf83a99), TOBN(0x9fbfee7f, 0x1562bac8)}, + {TOBN(0xeaf658b9, 0x5c4017e3), TOBN(0x1dc7f9e0, 0x15120b86), + TOBN(0xd84f13dd, 0x4c034d6f), TOBN(0x283dd737, 0xeaea3038)}}, + {{TOBN(0x197f2609, 0xcd85d6a2), TOBN(0x6ebbc345, 0xfae60177), + TOBN(0xb80f031b, 0x4e12fede), TOBN(0xde55d0c2, 0x07a2186b)}, + {TOBN(0x1fb3e37f, 0x24dcdd5a), TOBN(0x8d602da5, 0x7ed191fb), + TOBN(0x108fb056, 0x76023e0d), TOBN(0x70178c71, 0x459c20c0)}}, + {{TOBN(0xfad5a386, 0x3fe54cf0), TOBN(0xa4a3ec4f, 0x02bbb475), + TOBN(0x1aa5ec20, 0x919d94d7), TOBN(0x5d3b63b5, 0xa81e4ab3)}, + {TOBN(0x7fa733d8, 0x5ad3d2af), TOBN(0xfbc586dd, 0xd1ac7a37), + TOBN(0x282925de, 0x40779614), TOBN(0xfe0ffffb, 0xe74a242a)}}, + {{TOBN(0x3f39e67f, 0x906151e5), TOBN(0xcea27f5f, 0x55e10649), + TOBN(0xdca1d4e1, 0xc17cf7b7), TOBN(0x0c326d12, 0x2fe2362d)}, + {TOBN(0x05f7ac33, 0x7dd35df3), TOBN(0x0c3b7639, 0xc396dbdf), + TOBN(0x0912f5ac, 0x03b7db1c), TOBN(0x9dea4b70, 0x5c9ed4a9)}}, + {{TOBN(0x475e6e53, 0xaae3f639), TOBN(0xfaba0e7c, 0xfc278bac), + TOBN(0x16f9e221, 0x9490375f), TOBN(0xaebf9746, 0xa5a7ed0a)}, + {TOBN(0x45f9af3f, 0xf41ad5d6), TOBN(0x03c4623c, 0xb2e99224), + TOBN(0x82c5bb5c, 0xb3cf56aa), TOBN(0x64311819, 0x34567ed3)}}, + {{TOBN(0xec57f211, 0x8be489ac), TOBN(0x2821895d, 0xb9a1104b), + TOBN(0x610dc875, 0x6064e007), TOBN(0x8e526f3f, 0x5b20d0fe)}, + {TOBN(0x6e71ca77, 0x5b645aee), TOBN(0x3d1dcb9f, 0x800e10ff), + TOBN(0x36b51162, 0x189cf6de), TOBN(0x2c5a3e30, 0x6bb17353)}}, + {{TOBN(0xc186cd3e, 0x2a6c6fbf), TOBN(0xa74516fa, 0x4bf97906), + TOBN(0x5b4b8f4b, 0x279d6901), TOBN(0x0c4e57b4, 0x2b573743)}, + {TOBN(0x75fdb229, 0xb6e386b6), TOBN(0xb46793fd, 0x99deac27), + TOBN(0xeeec47ea, 0xcf712629), TOBN(0xe965f3c4, 0xcbc3b2dd)}}, + {{TOBN(0x8dd1fb83, 0x425c6559), TOBN(0x7fc00ee6, 0x0af06fda), + TOBN(0xe98c9225, 0x33d956df), TOBN(0x0f1ef335, 0x4fbdc8a2)}, + {TOBN(0x2abb5145, 0xb79b8ea2), TOBN(0x40fd2945, 0xbdbff288), + TOBN(0x6a814ac4, 0xd7185db7), TOBN(0xc4329d6f, 0xc084609a)}}, + {{TOBN(0xc9ba7b52, 0xed1be45d), TOBN(0x891dd20d, 0xe4cd2c74), + TOBN(0x5a4d4a7f, 0x824139b1), TOBN(0x66c17716, 0xb873c710)}, + {TOBN(0x5e5bc141, 0x2843c4e0), TOBN(0xd5ac4817, 0xb97eb5bf), + TOBN(0xc0f8af54, 0x450c95c7), TOBN(0xc91b3fa0, 0x318406c5)}}, + {{TOBN(0x360c340a, 0xab9d97f8), TOBN(0xfb57bd07, 0x90a2d611), + TOBN(0x4339ae3c, 0xa6a6f7e5), TOBN(0x9c1fcd2a, 0x2feb8a10)}, + {TOBN(0x972bcca9, 0xc7ea7432), TOBN(0x1b0b924c, 0x308076f6), + TOBN(0x80b2814a, 0x2a5b4ca5), TOBN(0x2f78f55b, 0x61ef3b29)}}, + {{TOBN(0xf838744a, 0xc18a414f), TOBN(0xc611eaae, 0x903d0a86), + TOBN(0x94dabc16, 0x2a453f55), TOBN(0xe6f2e3da, 0x14efb279)}, + {TOBN(0x5b7a6017, 0x9320dc3c), TOBN(0x692e382f, 0x8df6b5a4), + TOBN(0x3f5e15e0, 0x2d40fa90), TOBN(0xc87883ae, 0x643dd318)}}, + {{TOBN(0x511053e4, 0x53544774), TOBN(0x834d0ecc, 0x3adba2bc), + TOBN(0x4215d7f7, 0xbae371f5), TOBN(0xfcfd57bf, 0x6c8663bc)}, + {TOBN(0xded2383d, 0xd6901b1d), TOBN(0x3b49fbb4, 0xb5587dc3), + TOBN(0xfd44a08d, 0x07625f62), TOBN(0x3ee4d65b, 0x9de9b762)}}}, + {{{TOBN(0x64e5137d, 0x0d63d1fa), TOBN(0x658fc052, 0x02a9d89f), + TOBN(0x48894874, 0x50436309), TOBN(0xe9ae30f8, 0xd598da61)}, + {TOBN(0x2ed710d1, 0x818baf91), TOBN(0xe27e9e06, 0x8b6a0c20), + TOBN(0x1e28dcfb, 0x1c1a6b44), TOBN(0x883acb64, 
0xd6ac57dc)}}, + {{TOBN(0x8735728d, 0xc2c6ff70), TOBN(0x79d6122f, 0xc5dc2235), + TOBN(0x23f5d003, 0x19e277f9), TOBN(0x7ee84e25, 0xdded8cc7)}, + {TOBN(0x91a8afb0, 0x63cd880a), TOBN(0x3f3ea7c6, 0x3574af60), + TOBN(0x0cfcdc84, 0x02de7f42), TOBN(0x62d0792f, 0xb31aa152)}}, + {{TOBN(0x8e1b4e43, 0x8a5807ce), TOBN(0xad283893, 0xe4109a7e), + TOBN(0xc30cc9cb, 0xafd59dda), TOBN(0xf65f36c6, 0x3d8d8093)}, + {TOBN(0xdf31469e, 0xa60d32b2), TOBN(0xee93df4b, 0x3e8191c8), + TOBN(0x9c1017c5, 0x355bdeb5), TOBN(0xd2623185, 0x8616aa28)}}, + {{TOBN(0xb02c83f9, 0xdec31a21), TOBN(0x988c8b23, 0x6ad9d573), + TOBN(0x53e983ae, 0xa57be365), TOBN(0xe968734d, 0x646f834e)}, + {TOBN(0x9137ea8f, 0x5da6309b), TOBN(0x10f3a624, 0xc1f1ce16), + TOBN(0x782a9ea2, 0xca440921), TOBN(0xdf94739e, 0x5b46f1b5)}}, + {{TOBN(0x9f9be006, 0xcce85c9b), TOBN(0x360e70d6, 0xa4c7c2d3), + TOBN(0x2cd5beea, 0xaefa1e60), TOBN(0x64cf63c0, 0x8c3d2b6d)}, + {TOBN(0xfb107fa3, 0xe1cf6f90), TOBN(0xb7e937c6, 0xd5e044e6), + TOBN(0x74e8ca78, 0xce34db9f), TOBN(0x4f8b36c1, 0x3e210bd0)}}, + {{TOBN(0x1df165a4, 0x34a35ea8), TOBN(0x3418e0f7, 0x4d4412f6), + TOBN(0x5af1f8af, 0x518836c3), TOBN(0x42ceef4d, 0x130e1965)}, + {TOBN(0x5560ca0b, 0x543a1957), TOBN(0xc33761e5, 0x886cb123), + TOBN(0x66624b1f, 0xfe98ed30), TOBN(0xf772f4bf, 0x1090997d)}}, + {{TOBN(0xf4e540bb, 0x4885d410), TOBN(0x7287f810, 0x9ba5f8d7), + TOBN(0x22d0d865, 0xde98dfb1), TOBN(0x49ff51a1, 0xbcfbb8a3)}, + {TOBN(0xb6b6fa53, 0x6bc3012e), TOBN(0x3d31fd72, 0x170d541d), + TOBN(0x8018724f, 0x4b0f4966), TOBN(0x79e7399f, 0x87dbde07)}}, + {{TOBN(0x56f8410e, 0xf4f8b16a), TOBN(0x97241afe, 0xc47b266a), + TOBN(0x0a406b8e, 0x6d9c87c1), TOBN(0x803f3e02, 0xcd42ab1b)}, + {TOBN(0x7f0309a8, 0x04dbec69), TOBN(0xa83b85f7, 0x3bbad05f), + TOBN(0xc6097273, 0xad8e197f), TOBN(0xc097440e, 0x5067adc1)}}, + {{TOBN(0x730eafb6, 0x3524ff16), TOBN(0xd7f9b51e, 0x823fc6ce), + TOBN(0x27bd0d32, 0x443e4ac0), TOBN(0x40c59ad9, 0x4d66f217)}, + {TOBN(0x6c33136f, 0x17c387a4), TOBN(0x5043b8d5, 0xeb86804d), + TOBN(0x74970312, 0x675a73c9), TOBN(0x838fdb31, 0xf16669b6)}}, + {{TOBN(0xc507b6dd, 0x418e7ddd), TOBN(0x39888d93, 0x472f19d6), + TOBN(0x7eae26be, 0x0c27eb4d), TOBN(0x17b53ed3, 0xfbabb884)}, + {TOBN(0xfc27021b, 0x2b01ae4f), TOBN(0x88462e87, 0xcf488682), + TOBN(0xbee096ec, 0x215e2d87), TOBN(0xeb2fea9a, 0xd242e29b)}}, + {{TOBN(0x5d985b5f, 0xb821fc28), TOBN(0x89d2e197, 0xdc1e2ad2), + TOBN(0x55b566b8, 0x9030ba62), TOBN(0xe3fd41b5, 0x4f41b1c6)}, + {TOBN(0xb738ac2e, 0xb9a96d61), TOBN(0x7f8567ca, 0x369443f4), + TOBN(0x8698622d, 0xf803a440), TOBN(0x2b586236, 0x8fe2f4dc)}}, + {{TOBN(0xbbcc00c7, 0x56b95bce), TOBN(0x5ec03906, 0x616da680), + TOBN(0x79162ee6, 0x72214252), TOBN(0x43132b63, 0x86a892d2)}, + {TOBN(0x4bdd3ff2, 0x2f3263bf), TOBN(0xd5b3733c, 0x9cd0a142), + TOBN(0x592eaa82, 0x44415ccb), TOBN(0x663e8924, 0x8d5474ea)}}, + {{TOBN(0x8058a25e, 0x5236344e), TOBN(0x82e8df9d, 0xbda76ee6), + TOBN(0xdcf6efd8, 0x11cc3d22), TOBN(0x00089cda, 0x3b4ab529)}, + {TOBN(0x91d3a071, 0xbd38a3db), TOBN(0x4ea97fc0, 0xef72b925), + TOBN(0x0c9fc15b, 0xea3edf75), TOBN(0x5a6297cd, 0xa4348ed3)}}, + {{TOBN(0x0d38ab35, 0xce7c42d4), TOBN(0x9fd493ef, 0x82feab10), + TOBN(0x46056b6d, 0x82111b45), TOBN(0xda11dae1, 0x73efc5c3)}, + {TOBN(0xdc740278, 0x5545a7fb), TOBN(0xbdb2601c, 0x40d507e6), + TOBN(0x121dfeeb, 0x7066fa58), TOBN(0x214369a8, 0x39ae8c2a)}}, + {{TOBN(0x195709cb, 0x06e0956c), TOBN(0x4c9d254f, 0x010cd34b), + TOBN(0xf51e13f7, 0x0471a532), TOBN(0xe19d6791, 0x1e73054d)}, + {TOBN(0xf702a628, 0xdb5c7be3), TOBN(0xc7141218, 0xb24dde05), + TOBN(0xdc18233c, 0xf29b2e2e), 
TOBN(0x3a6bd1e8, 0x85342dba)}}, + {{TOBN(0x3f747fa0, 0xb311898c), TOBN(0xe2a272e4, 0xcd0eac65), + TOBN(0x4bba5851, 0xf914d0bc), TOBN(0x7a1a9660, 0xc4a43ee3)}, + {TOBN(0xe5a367ce, 0xa1c8cde9), TOBN(0x9d958ba9, 0x7271abe3), + TOBN(0xf3ff7eb6, 0x3d1615cd), TOBN(0xa2280dce, 0xf5ae20b0)}}, + {{TOBN(0x56dba5c1, 0xcf640147), TOBN(0xea5a2e3d, 0x5e83d118), + TOBN(0x04cd6b6d, 0xda24c511), TOBN(0x1c0f4671, 0xe854d214)}, + {TOBN(0x91a6b7a9, 0x69565381), TOBN(0xdc966240, 0xdecf1f5b), + TOBN(0x1b22d21c, 0xfcf5d009), TOBN(0x2a05f641, 0x9021dbd5)}}, + {{TOBN(0x8c0ed566, 0xd4312483), TOBN(0x5179a95d, 0x643e216f), + TOBN(0xcc185fec, 0x17044493), TOBN(0xb3063339, 0x54991a21)}, + {TOBN(0xd801ecdb, 0x0081a726), TOBN(0x0149b0c6, 0x4fa89bbb), + TOBN(0xafe9065a, 0x4391b6b9), TOBN(0xedc92786, 0xd633f3a3)}}, + {{TOBN(0xe408c24a, 0xae6a8e13), TOBN(0x85833fde, 0x9f3897ab), + TOBN(0x43800e7e, 0xd81a0715), TOBN(0xde08e346, 0xb44ffc5f)}, + {TOBN(0x7094184c, 0xcdeff2e0), TOBN(0x49f9387b, 0x165eaed1), + TOBN(0x635d6129, 0x777c468a), TOBN(0x8c0dcfd1, 0x538c2dd8)}}, + {{TOBN(0xd6d9d9e3, 0x7a6a308b), TOBN(0x62375830, 0x4c2767d3), + TOBN(0x874a8bc6, 0xf38cbeb6), TOBN(0xd94d3f1a, 0xccb6fd9e)}, + {TOBN(0x92a9735b, 0xba21f248), TOBN(0x272ad0e5, 0x6cd1efb0), + TOBN(0x7437b69c, 0x05b03284), TOBN(0xe7f04702, 0x6948c225)}}, + {{TOBN(0x8a56c04a, 0xcba2ecec), TOBN(0x0c181270, 0xe3a73e41), + TOBN(0x6cb34e9d, 0x03e93725), TOBN(0xf77c8713, 0x496521a9)}, + {TOBN(0x94569183, 0xfa7f9f90), TOBN(0xf2e7aa4c, 0x8c9707ad), + TOBN(0xced2c9ba, 0x26c1c9a3), TOBN(0x9109fe96, 0x40197507)}}, + {{TOBN(0x9ae868a9, 0xe9adfe1c), TOBN(0x3984403d, 0x314e39bb), + TOBN(0xb5875720, 0xf2fe378f), TOBN(0x33f901e0, 0xba44a628)}, + {TOBN(0xea1125fe, 0x3652438c), TOBN(0xae9ec4e6, 0x9dd1f20b), + TOBN(0x1e740d9e, 0xbebf7fbd), TOBN(0x6dbd3ddc, 0x42dbe79c)}}, + {{TOBN(0x62082aec, 0xedd36776), TOBN(0xf612c478, 0xe9859039), + TOBN(0xa493b201, 0x032f7065), TOBN(0xebd4d8f2, 0x4ff9b211)}, + {TOBN(0x3f23a0aa, 0xaac4cb32), TOBN(0xea3aadb7, 0x15ed4005), + TOBN(0xacf17ea4, 0xafa27e63), TOBN(0x56125c1a, 0xc11fd66c)}}, + {{TOBN(0x266344a4, 0x3794f8dc), TOBN(0xdcca923a, 0x483c5c36), + TOBN(0x2d6b6bbf, 0x3f9d10a0), TOBN(0xb320c5ca, 0x81d9bdf3)}, + {TOBN(0x620e28ff, 0x47b50a95), TOBN(0x933e3b01, 0xcef03371), + TOBN(0xf081bf85, 0x99100153), TOBN(0x183be9a0, 0xc3a8c8d6)}}, + {{TOBN(0x4e3ddc5a, 0xd6bbe24d), TOBN(0xc6c74630, 0x53843795), + TOBN(0x78193dd7, 0x65ec2d4c), TOBN(0xb8df26cc, 0xcd3c89b2)}, + {TOBN(0x98dbe399, 0x5a483f8d), TOBN(0x72d8a957, 0x7dd3313a), + TOBN(0x65087294, 0xab0bd375), TOBN(0xfcd89248, 0x7c259d16)}}, + {{TOBN(0x8a9443d7, 0x7613aa81), TOBN(0x80100800, 0x85fe6584), + TOBN(0x70fc4dbc, 0x7fb10288), TOBN(0xf58280d3, 0xe86beee8)}, + {TOBN(0x14fdd82f, 0x7c978c38), TOBN(0xdf1204c1, 0x0de44d7b), + TOBN(0xa08a1c84, 0x4160252f), TOBN(0x591554ca, 0xc17646a5)}}, + {{TOBN(0x214a37d6, 0xa05bd525), TOBN(0x48d5f09b, 0x07957b3c), + TOBN(0x0247cdcb, 0xd7109bc9), TOBN(0x40f9e4bb, 0x30599ce7)}, + {TOBN(0xc325fa03, 0xf46ad2ec), TOBN(0x00f766cf, 0xc3e3f9ee), + TOBN(0xab556668, 0xd43a4577), TOBN(0x68d30a61, 0x3ee03b93)}}, + {{TOBN(0x7ddc81ea, 0x77b46a08), TOBN(0xcf5a6477, 0xc7480699), + TOBN(0x43a8cb34, 0x6633f683), TOBN(0x1b867e6b, 0x92363c60)}, + {TOBN(0x43921114, 0x1f60558e), TOBN(0xcdbcdd63, 0x2f41450e), + TOBN(0x7fc04601, 0xcc630e8b), TOBN(0xea7c66d5, 0x97038b43)}}, + {{TOBN(0x7259b8a5, 0x04e99fd8), TOBN(0x98a8dd12, 0x4785549a), + TOBN(0x0e459a7c, 0x840552e1), TOBN(0xcdfcf4d0, 0x4bb0909e)}, + {TOBN(0x34a86db2, 0x53758da7), TOBN(0xe643bb83, 0xeac997e1), + TOBN(0x96400bd7, 
0x530c5b7e), TOBN(0x9f97af87, 0xb41c8b52)}}, + {{TOBN(0x34fc8820, 0xfbeee3f9), TOBN(0x93e53490, 0x49091afd), + TOBN(0x764b9be5, 0x9a31f35c), TOBN(0x71f37864, 0x57e3d924)}, + {TOBN(0x02fb34e0, 0x943aa75e), TOBN(0xa18c9c58, 0xab8ff6e4), + TOBN(0x080f31b1, 0x33cf0d19), TOBN(0x5c9682db, 0x083518a7)}}, + {{TOBN(0x873d4ca6, 0xb709c3de), TOBN(0x64a84262, 0x3575b8f0), + TOBN(0x6275da1f, 0x020154bb), TOBN(0x97678caa, 0xd17cf1ab)}, + {TOBN(0x8779795f, 0x951a95c3), TOBN(0xdd35b163, 0x50fccc08), + TOBN(0x32709627, 0x33d8f031), TOBN(0x3c5ab10a, 0x498dd85c)}}, + {{TOBN(0xb6c185c3, 0x41dca566), TOBN(0x7de7feda, 0xd8622aa3), + TOBN(0x99e84d92, 0x901b6dfb), TOBN(0x30a02b0e, 0x7c4ad288)}, + {TOBN(0xc7c81daa, 0x2fd3cf36), TOBN(0xd1319547, 0xdf89e59f), + TOBN(0xb2be8184, 0xcd496733), TOBN(0xd5f449eb, 0x93d3412b)}}, + {{TOBN(0x7ea41b1b, 0x25fe531d), TOBN(0xf9797432, 0x6a1d5646), + TOBN(0x86067f72, 0x2bde501a), TOBN(0xf91481c0, 0x0c85e89c)}, + {TOBN(0xca8ee465, 0xf8b05bc6), TOBN(0x1844e1cf, 0x02e83cda), + TOBN(0xca82114a, 0xb4dbe33b), TOBN(0x0f9f8769, 0x4eabfde2)}}, + {{TOBN(0x4936b1c0, 0x38b27fe2), TOBN(0x63b6359b, 0xaba402df), + TOBN(0x40c0ea2f, 0x656bdbab), TOBN(0x9c992a89, 0x6580c39c)}, + {TOBN(0x600e8f15, 0x2a60aed1), TOBN(0xeb089ca4, 0xe0bf49df), + TOBN(0x9c233d7d, 0x2d42d99a), TOBN(0x648d3f95, 0x4c6bc2fa)}}, + {{TOBN(0xdcc383a8, 0xe1add3f3), TOBN(0xf42c0c6a, 0x4f64a348), + TOBN(0x2abd176f, 0x0030dbdb), TOBN(0x4de501a3, 0x7d6c215e)}, + {TOBN(0x4a107c1f, 0x4b9a64bc), TOBN(0xa77f0ad3, 0x2496cd59), + TOBN(0xfb78ac62, 0x7688dffb), TOBN(0x7025a2ca, 0x67937d8e)}}, + {{TOBN(0xfde8b2d1, 0xd1a8f4e7), TOBN(0xf5b3da47, 0x7354927c), + TOBN(0xe48606a3, 0xd9205735), TOBN(0xac477cc6, 0xe177b917)}, + {TOBN(0xfb1f73d2, 0xa883239a), TOBN(0xe12572f6, 0xcc8b8357), + TOBN(0x9d355e9c, 0xfb1f4f86), TOBN(0x89b795f8, 0xd9f3ec6e)}}, + {{TOBN(0x27be56f1, 0xb54398dc), TOBN(0x1890efd7, 0x3fedeed5), + TOBN(0x62f77f1f, 0x9c6d0140), TOBN(0x7ef0e314, 0x596f0ee4)}, + {TOBN(0x50ca6631, 0xcc61dab3), TOBN(0x4a39801d, 0xf4866e4f), + TOBN(0x66c8d032, 0xae363b39), TOBN(0x22c591e5, 0x2ead66aa)}}, + {{TOBN(0x954ba308, 0xde02a53e), TOBN(0x2a6c060f, 0xd389f357), + TOBN(0xe6cfcde8, 0xfbf40b66), TOBN(0x8e02fc56, 0xc6340ce1)}, + {TOBN(0xe4957795, 0x73adb4ba), TOBN(0x7b86122c, 0xa7b03805), + TOBN(0x63f83512, 0x0c8e6fa6), TOBN(0x83660ea0, 0x057d7804)}}, + {{TOBN(0xbad79105, 0x21ba473c), TOBN(0xb6c50bee, 0xded5389d), + TOBN(0xee2caf4d, 0xaa7c9bc0), TOBN(0xd97b8de4, 0x8c4e98a7)}, + {TOBN(0xa9f63e70, 0xab3bbddb), TOBN(0x3898aabf, 0x2597815a), + TOBN(0x7659af89, 0xac15b3d9), TOBN(0xedf7725b, 0x703ce784)}}, + {{TOBN(0x25470fab, 0xe085116b), TOBN(0x04a43375, 0x87285310), + TOBN(0x4e39187e, 0xe2bfd52f), TOBN(0x36166b44, 0x7d9ebc74)}, + {TOBN(0x92ad433c, 0xfd4b322c), TOBN(0x726aa817, 0xba79ab51), + TOBN(0xf96eacd8, 0xc1db15eb), TOBN(0xfaf71e91, 0x0476be63)}}, + {{TOBN(0xdd69a640, 0x641fad98), TOBN(0xb7995918, 0x29622559), + TOBN(0x03c6daa5, 0xde4199dc), TOBN(0x92cadc97, 0xad545eb4)}, + {TOBN(0x1028238b, 0x256534e4), TOBN(0x73e80ce6, 0x8595409a), + TOBN(0x690d4c66, 0xd05dc59b), TOBN(0xc95f7b8f, 0x981dee80)}}, + {{TOBN(0xf4337014, 0xd856ac25), TOBN(0x441bd9dd, 0xac524dca), + TOBN(0x640b3d85, 0x5f0499f5), TOBN(0x39cf84a9, 0xd5fda182)}, + {TOBN(0x04e7b055, 0xb2aa95a0), TOBN(0x29e33f0a, 0x0ddf1860), + TOBN(0x082e74b5, 0x423f6b43), TOBN(0x217edeb9, 0x0aaa2b0f)}}, + {{TOBN(0x58b83f35, 0x83cbea55), TOBN(0xc485ee4d, 0xbc185d70), + TOBN(0x833ff03b, 0x1e5f6992), TOBN(0xb5b9b9cc, 0xcf0c0dd5)}, + {TOBN(0x7caaee8e, 0x4e9e8a50), TOBN(0x462e907b, 0x6269dafd), + 
TOBN(0x6ed5cee9, 0xfbe791c6), TOBN(0x68ca3259, 0xed430790)}}, + {{TOBN(0x2b72bdf2, 0x13b5ba88), TOBN(0x60294c8a, 0x35ef0ac4), + TOBN(0x9c3230ed, 0x19b99b08), TOBN(0x560fff17, 0x6c2589aa)}, + {TOBN(0x552b8487, 0xd6770374), TOBN(0xa373202d, 0x9a56f685), + TOBN(0xd3e7f907, 0x45f175d9), TOBN(0x3c2f315f, 0xd080d810)}}, + {{TOBN(0x1130e9dd, 0x7b9520e8), TOBN(0xc078f9e2, 0x0af037b5), + TOBN(0x38cd2ec7, 0x1e9c104c), TOBN(0x0f684368, 0xc472fe92)}, + {TOBN(0xd3f1b5ed, 0x6247e7ef), TOBN(0xb32d33a9, 0x396dfe21), + TOBN(0x46f59cf4, 0x4a9aa2c2), TOBN(0x69cd5168, 0xff0f7e41)}}, + {{TOBN(0x3f59da0f, 0x4b3234da), TOBN(0xcf0b0235, 0xb4579ebe), + TOBN(0x6d1cbb25, 0x6d2476c7), TOBN(0x4f0837e6, 0x9dc30f08)}, + {TOBN(0x9a4075bb, 0x906f6e98), TOBN(0x253bb434, 0xc761e7d1), + TOBN(0xde2e645f, 0x6e73af10), TOBN(0xb89a4060, 0x0c5f131c)}}, + {{TOBN(0xd12840c5, 0xb8cc037f), TOBN(0x3d093a5b, 0x7405bb47), + TOBN(0x6202c253, 0x206348b8), TOBN(0xbf5d57fc, 0xc55a3ca7)}, + {TOBN(0x89f6c90c, 0x8c3bef48), TOBN(0x23ac7623, 0x5a0a960a), + TOBN(0xdfbd3d6b, 0x552b42ab), TOBN(0x3ef22458, 0x132061f6)}}, + {{TOBN(0xd74e9bda, 0xc97e6516), TOBN(0x88779360, 0xc230f49e), + TOBN(0xa6ec1de3, 0x1e74ea49), TOBN(0x581dcee5, 0x3fb645a2)}, + {TOBN(0xbaef2391, 0x8f483f14), TOBN(0x6d2dddfc, 0xd137d13b), + TOBN(0x54cde50e, 0xd2743a42), TOBN(0x89a34fc5, 0xe4d97e67)}}, + {{TOBN(0x13f1f5b3, 0x12e08ce5), TOBN(0xa80540b8, 0xa7f0b2ca), + TOBN(0x854bcf77, 0x01982805), TOBN(0xb8653ffd, 0x233bea04)}, + {TOBN(0x8e7b8787, 0x02b0b4c9), TOBN(0x2675261f, 0x9acb170a), + TOBN(0x061a9d90, 0x930c14e5), TOBN(0xb59b30e0, 0xdef0abea)}}, + {{TOBN(0x1dc19ea6, 0x0200ec7d), TOBN(0xb6f4a3f9, 0x0bce132b), + TOBN(0xb8d5de90, 0xf13e27e0), TOBN(0xbaee5ef0, 0x1fade16f)}, + {TOBN(0x6f406aaa, 0xe4c6cf38), TOBN(0xab4cfe06, 0xd1369815), + TOBN(0x0dcffe87, 0xefd550c6), TOBN(0x9d4f59c7, 0x75ff7d39)}}, + {{TOBN(0xb02553b1, 0x51deb6ad), TOBN(0x812399a4, 0xb1877749), + TOBN(0xce90f71f, 0xca6006e1), TOBN(0xc32363a6, 0xb02b6e77)}, + {TOBN(0x02284fbe, 0xdc36c64d), TOBN(0x86c81e31, 0xa7e1ae61), + TOBN(0x2576c7e5, 0xb909d94a), TOBN(0x8b6f7d02, 0x818b2bb0)}}, + {{TOBN(0xeca3ed07, 0x56faa38a), TOBN(0xa3790e6c, 0x9305bb54), + TOBN(0xd784eeda, 0x7bc73061), TOBN(0xbd56d369, 0x6dd50614)}, + {TOBN(0xd6575949, 0x229a8aa9), TOBN(0xdcca8f47, 0x4595ec28), + TOBN(0x814305c1, 0x06ab4fe6), TOBN(0xc8c39768, 0x24f43f16)}}, + {{TOBN(0xe2a45f36, 0x523f2b36), TOBN(0x995c6493, 0x920d93bb), + TOBN(0xf8afdab7, 0x90f1632b), TOBN(0x79ebbecd, 0x1c295954)}, + {TOBN(0xc7bb3ddb, 0x79592f48), TOBN(0x67216a7b, 0x5f88e998), + TOBN(0xd91f098b, 0xbc01193e), TOBN(0xf7d928a5, 0xb1db83fc)}}, + {{TOBN(0x55e38417, 0xe991f600), TOBN(0x2a91113e, 0x2981a934), + TOBN(0xcbc9d648, 0x06b13bde), TOBN(0xb011b6ac, 0x0755ff44)}, + {TOBN(0x6f4cb518, 0x045ec613), TOBN(0x522d2d31, 0xc2f5930a), + TOBN(0x5acae1af, 0x382e65de), TOBN(0x57643067, 0x27bc966f)}}, + {{TOBN(0x5e12705d, 0x1c7193f0), TOBN(0xf0f32f47, 0x3be8858e), + TOBN(0x785c3d7d, 0x96c6dfc7), TOBN(0xd75b4a20, 0xbf31795d)}, + {TOBN(0x91acf17b, 0x342659d4), TOBN(0xe596ea34, 0x44f0378f), + TOBN(0x4515708f, 0xce52129d), TOBN(0x17387e1e, 0x79f2f585)}}, + {{TOBN(0x72cfd2e9, 0x49dee168), TOBN(0x1ae05223, 0x3e2af239), + TOBN(0x009e75be, 0x1d94066a), TOBN(0x6cca31c7, 0x38abf413)}, + {TOBN(0xb50bd61d, 0x9bc49908), TOBN(0x4a9b4a8c, 0xf5e2bc1e), + TOBN(0xeb6cc5f7, 0x946f83ac), TOBN(0x27da93fc, 0xebffab28)}}, + {{TOBN(0xea314c96, 0x4821c8c5), TOBN(0x8de49ded, 0xa83c15f4), + TOBN(0x7a64cf20, 0x7af33004), TOBN(0x45f1bfeb, 0xc9627e10)}, + {TOBN(0x878b0626, 0x54b9df60), TOBN(0x5e4fdc3c, 0xa95c0b33), 
+ TOBN(0xe54a37ca, 0xc2035d8e), TOBN(0x9087cda9, 0x80f20b8c)}}, + {{TOBN(0x36f61c23, 0x8319ade4), TOBN(0x766f287a, 0xde8cfdf8), + TOBN(0x48821948, 0x346f3705), TOBN(0x49a7b853, 0x16e4f4a2)}, + {TOBN(0xb9b3f8a7, 0x5cedadfd), TOBN(0x8f562815, 0x8db2a815), + TOBN(0xc0b7d554, 0x01f68f95), TOBN(0x12971e27, 0x688a208e)}}, + {{TOBN(0xc9f8b696, 0xd0ff34fc), TOBN(0x20824de2, 0x1222718c), + TOBN(0x7213cf9f, 0x0c95284d), TOBN(0xe2ad741b, 0xdc158240)}, + {TOBN(0x0ee3a6df, 0x54043ccf), TOBN(0x16ff479b, 0xd84412b3), + TOBN(0xf6c74ee0, 0xdfc98af0), TOBN(0xa78a169f, 0x52fcd2fb)}}, + {{TOBN(0xd8ae8746, 0x99c930e9), TOBN(0x1d33e858, 0x49e117a5), + TOBN(0x7581fcb4, 0x6624759f), TOBN(0xde50644f, 0x5bedc01d)}, + {TOBN(0xbeec5d00, 0xcaf3155e), TOBN(0x672d66ac, 0xbc73e75f), + TOBN(0x86b9d8c6, 0x270b01db), TOBN(0xd249ef83, 0x50f55b79)}}, + {{TOBN(0x6131d6d4, 0x73978fe3), TOBN(0xcc4e4542, 0x754b00a1), + TOBN(0x4e05df05, 0x57dfcfe9), TOBN(0x94b29cdd, 0x51ef6bf0)}, + {TOBN(0xe4530cff, 0x9bc7edf2), TOBN(0x8ac236fd, 0xd3da65f3), + TOBN(0x0faf7d5f, 0xc8eb0b48), TOBN(0x4d2de14c, 0x660eb039)}}, + {{TOBN(0xc006bba7, 0x60430e54), TOBN(0x10a2d0d6, 0xda3289ab), + TOBN(0x9c037a5d, 0xd7979c59), TOBN(0x04d1f3d3, 0xa116d944)}, + {TOBN(0x9ff22473, 0x8a0983cd), TOBN(0x28e25b38, 0xc883cabb), + TOBN(0xe968dba5, 0x47a58995), TOBN(0x2c80b505, 0x774eebdf)}}, + {{TOBN(0xee763b71, 0x4a953beb), TOBN(0x502e223f, 0x1642e7f6), + TOBN(0x6fe4b641, 0x61d5e722), TOBN(0x9d37c5b0, 0xdbef5316)}, + {TOBN(0x0115ed70, 0xf8330bc7), TOBN(0x139850e6, 0x75a72789), + TOBN(0x27d7faec, 0xffceccc2), TOBN(0x3016a860, 0x4fd9f7f6)}}, + {{TOBN(0xc492ec64, 0x4cd8f64c), TOBN(0x58a2d790, 0x279d7b51), + TOBN(0x0ced1fc5, 0x1fc75256), TOBN(0x3e658aed, 0x8f433017)}, + {TOBN(0x0b61942e, 0x05da59eb), TOBN(0xba3d60a3, 0x0ddc3722), + TOBN(0x7c311cd1, 0x742e7f87), TOBN(0x6473ffee, 0xf6b01b6e)}}}, + {{{TOBN(0x8303604f, 0x692ac542), TOBN(0xf079ffe1, 0x227b91d3), + TOBN(0x19f63e63, 0x15aaf9bd), TOBN(0xf99ee565, 0xf1f344fb)}, + {TOBN(0x8a1d661f, 0xd6219199), TOBN(0x8c883bc6, 0xd48ce41c), + TOBN(0x1065118f, 0x3c74d904), TOBN(0x713889ee, 0x0faf8b1b)}}, + {{TOBN(0x972b3f8f, 0x81a1b3be), TOBN(0x4f3ce145, 0xce2764a0), + TOBN(0xe2d0f1cc, 0x28c4f5f7), TOBN(0xdeee0c0d, 0xc7f3985b)}, + {TOBN(0x7df4adc0, 0xd39e25c3), TOBN(0x40619820, 0xc467a080), + TOBN(0x440ebc93, 0x61cf5a58), TOBN(0x527729a6, 0x422ad600)}}, + {{TOBN(0xca6c0937, 0xb1b76ba6), TOBN(0x1a2eab85, 0x4d2026dc), + TOBN(0xb1715e15, 0x19d9ae0a), TOBN(0xf1ad9199, 0xbac4a026)}, + {TOBN(0x35b3dfb8, 0x07ea7b0e), TOBN(0xedf5496f, 0x3ed9eb89), + TOBN(0x8932e5ff, 0x2d6d08ab), TOBN(0xf314874e, 0x25bd2731)}}, + {{TOBN(0xefb26a75, 0x3f73f449), TOBN(0x1d1c94f8, 0x8d44fc79), + TOBN(0x49f0fbc5, 0x3bc0dc4d), TOBN(0xb747ea0b, 0x3698a0d0)}, + {TOBN(0x5218c3fe, 0x228d291e), TOBN(0x35b804b5, 0x43c129d6), + TOBN(0xfac859b8, 0xd1acc516), TOBN(0x6c10697d, 0x95d6e668)}}, + {{TOBN(0xc38e438f, 0x0876fd4e), TOBN(0x45f0c307, 0x83d2f383), + TOBN(0x203cc2ec, 0xb10934cb), TOBN(0x6a8f2439, 0x2c9d46ee)}, + {TOBN(0xf16b431b, 0x65ccde7b), TOBN(0x41e2cd18, 0x27e76a6f), + TOBN(0xb9c8cf8f, 0x4e3484d7), TOBN(0x64426efd, 0x8315244a)}}, + {{TOBN(0x1c0a8e44, 0xfc94dea3), TOBN(0x34c8cdbf, 0xdad6a0b0), + TOBN(0x919c3840, 0x04113cef), TOBN(0xfd32fba4, 0x15490ffa)}, + {TOBN(0x58d190f6, 0x795dcfb7), TOBN(0xfef01b03, 0x83588baf), + TOBN(0x9e6d1d63, 0xca1fc1c0), TOBN(0x53173f96, 0xf0a41ac9)}}, + {{TOBN(0x2b1d402a, 0xba16f73b), TOBN(0x2fb31014, 0x8cf9b9fc), + TOBN(0x2d51e60e, 0x446ef7bf), TOBN(0xc731021b, 0xb91e1745)}, + {TOBN(0x9d3b4724, 0x4fee99d4), TOBN(0x4bca48b6, 
0xfac5c1ea), + TOBN(0x70f5f514, 0xbbea9af7), TOBN(0x751f55a5, 0x974c283a)}}, + {{TOBN(0x6e30251a, 0xcb452fdb), TOBN(0x31ee6965, 0x50f30650), + TOBN(0xb0b3e508, 0x933548d9), TOBN(0xb8949a4f, 0xf4b0ef5b)}, + {TOBN(0x208b8326, 0x3c88f3bd), TOBN(0xab147c30, 0xdb1d9989), + TOBN(0xed6515fd, 0x44d4df03), TOBN(0x17a12f75, 0xe72eb0c5)}}, + {{TOBN(0x3b59796d, 0x36cf69db), TOBN(0x1219eee9, 0x56670c18), + TOBN(0xfe3341f7, 0x7a070d8e), TOBN(0x9b70130b, 0xa327f90c)}, + {TOBN(0x36a32462, 0x0ae18e0e), TOBN(0x2021a623, 0x46c0a638), + TOBN(0x251b5817, 0xc62eb0d4), TOBN(0x87bfbcdf, 0x4c762293)}}, + {{TOBN(0xf78ab505, 0xcdd61d64), TOBN(0x8c7a53fc, 0xc8c18857), + TOBN(0xa653ce6f, 0x16147515), TOBN(0x9c923aa5, 0xea7d52d5)}, + {TOBN(0xc24709cb, 0x5c18871f), TOBN(0x7d53bec8, 0x73b3cc74), + TOBN(0x59264aff, 0xfdd1d4c4), TOBN(0x5555917e, 0x240da582)}}, + {{TOBN(0xcae8bbda, 0x548f5a0e), TOBN(0x1910eaba, 0x3bbfbbe1), + TOBN(0xae579685, 0x7677afc3), TOBN(0x49ea61f1, 0x73ff0b5c)}, + {TOBN(0x78655478, 0x4f7c3922), TOBN(0x95d337cd, 0x20c68eef), + TOBN(0x68f1e1e5, 0xdf779ab9), TOBN(0x14b491b0, 0xb5cf69a8)}}, + {{TOBN(0x7a6cbbe0, 0x28e3fe89), TOBN(0xe7e1fee4, 0xc5aac0eb), + TOBN(0x7f47eda5, 0x697e5140), TOBN(0x4f450137, 0xb454921f)}, + {TOBN(0xdb625f84, 0x95cd8185), TOBN(0x74be0ba1, 0xcdb2e583), + TOBN(0xaee4fd7c, 0xdd5e6de4), TOBN(0x4251437d, 0xe8101739)}}, + {{TOBN(0x686d72a0, 0xac620366), TOBN(0x4be3fb9c, 0xb6d59344), + TOBN(0x6e8b44e7, 0xa1eb75b9), TOBN(0x84e39da3, 0x91a5c10c)}, + {TOBN(0x37cc1490, 0xb38f0409), TOBN(0x02951943, 0x2c2ade82), + TOBN(0x9b688783, 0x1190a2d8), TOBN(0x25627d14, 0x231182ba)}}, + {{TOBN(0x6eb550aa, 0x658a6d87), TOBN(0x1405aaa7, 0xcf9c7325), + TOBN(0xd147142e, 0x5c8748c9), TOBN(0x7f637e4f, 0x53ede0e0)}, + {TOBN(0xf8ca2776, 0x14ffad2c), TOBN(0xe58fb1bd, 0xbafb6791), + TOBN(0x17158c23, 0xbf8f93fc), TOBN(0x7f15b373, 0x0a4a4655)}}, + {{TOBN(0x39d4add2, 0xd842ca72), TOBN(0xa71e4391, 0x3ed96305), + TOBN(0x5bb09cbe, 0x6700be14), TOBN(0x68d69d54, 0xd8befcf6)}, + {TOBN(0xa45f5367, 0x37183bcf), TOBN(0x7152b7bb, 0x3370dff7), + TOBN(0xcf887baa, 0xbf12525b), TOBN(0xe7ac7bdd, 0xd6d1e3cd)}}, + {{TOBN(0x25914f78, 0x81fdad90), TOBN(0xcf638f56, 0x0d2cf6ab), + TOBN(0xb90bc03f, 0xcc054de5), TOBN(0x932811a7, 0x18b06350)}, + {TOBN(0x2f00b330, 0x9bbd11ff), TOBN(0x76108a6f, 0xb4044974), + TOBN(0x801bb9e0, 0xa851d266), TOBN(0x0dd099be, 0xbf8990c1)}}, + {{TOBN(0x58c5aaaa, 0xabe32986), TOBN(0x0fe9dd2a, 0x50d59c27), + TOBN(0x84951ff4, 0x8d307305), TOBN(0x6c23f829, 0x86529b78)}, + {TOBN(0x50bb2218, 0x0b136a79), TOBN(0x7e2174de, 0x77a20996), + TOBN(0x6f00a4b9, 0xc0bb4da6), TOBN(0x89a25a17, 0xefdde8da)}}, + {{TOBN(0xf728a27e, 0xc11ee01d), TOBN(0xf900553a, 0xe5f10dfb), + TOBN(0x189a83c8, 0x02ec893c), TOBN(0x3ca5bdc1, 0x23f66d77)}, + {TOBN(0x98781537, 0x97eada9f), TOBN(0x59c50ab3, 0x10256230), + TOBN(0x346042d9, 0x323c69b3), TOBN(0x1b715a6d, 0x2c460449)}}, + {{TOBN(0xa41dd476, 0x6ae06e0b), TOBN(0xcdd7888e, 0x9d42e25f), + TOBN(0x0f395f74, 0x56b25a20), TOBN(0xeadfe0ae, 0x8700e27e)}, + {TOBN(0xb09d52a9, 0x69950093), TOBN(0x3525d9cb, 0x327f8d40), + TOBN(0xb8235a94, 0x67df886a), TOBN(0x77e4b0dd, 0x035faec2)}}, + {{TOBN(0x115eb20a, 0x517d7061), TOBN(0x77fe3433, 0x6c2df683), + TOBN(0x6870ddc7, 0xcdc6fc67), TOBN(0xb1610588, 0x0b87de83)}, + {TOBN(0x343584ca, 0xd9c4ddbe), TOBN(0xb3164f1c, 0x3d754be2), + TOBN(0x0731ed3a, 0xc1e6c894), TOBN(0x26327dec, 0x4f6b904c)}}, + {{TOBN(0x9d49c6de, 0x97b5cd32), TOBN(0x40835dae, 0xb5eceecd), + TOBN(0xc66350ed, 0xd9ded7fe), TOBN(0x8aeebb5c, 0x7a678804)}, + {TOBN(0x51d42fb7, 0x5b8ee9ec), 
TOBN(0xd7a17bdd, 0x8e3ca118), + TOBN(0x40d7511a, 0x2ef4400e), TOBN(0xc48990ac, 0x875a66f4)}}, + {{TOBN(0x8de07d2a, 0x2199e347), TOBN(0xbee75556, 0x2a39e051), + TOBN(0x56918786, 0x916e51dc), TOBN(0xeb191313, 0x4a2d89ec)}, + {TOBN(0x6679610d, 0x37d341ed), TOBN(0x434fbb41, 0x56d51c2b), + TOBN(0xe54b7ee7, 0xd7492dba), TOBN(0xaa33a79a, 0x59021493)}}, + {{TOBN(0x49fc5054, 0xe4bd6d3d), TOBN(0x09540f04, 0x5ab551d0), + TOBN(0x8acc9085, 0x4942d3a6), TOBN(0x231af02f, 0x2d28323b)}, + {TOBN(0x93458cac, 0x0992c163), TOBN(0x1fef8e71, 0x888e3bb4), + TOBN(0x27578da5, 0xbe8c268c), TOBN(0xcc8be792, 0xe805ec00)}}, + {{TOBN(0x29267bae, 0xc61c3855), TOBN(0xebff429d, 0x58c1fd3b), + TOBN(0x22d886c0, 0x8c0b93b8), TOBN(0xca5e00b2, 0x2ddb8953)}, + {TOBN(0xcf330117, 0xc3fed8b7), TOBN(0xd49ac6fa, 0x819c01f6), + TOBN(0x6ddaa6bd, 0x3c0fbd54), TOBN(0x91743068, 0x8049a2cf)}}, + {{TOBN(0xd67f981e, 0xaff2ef81), TOBN(0xc3654d35, 0x2818ae80), + TOBN(0x81d05044, 0x1b2aa892), TOBN(0x2db067bf, 0x3d099328)}, + {TOBN(0xe7c79e86, 0x703dcc97), TOBN(0xe66f9b37, 0xe133e215), + TOBN(0xcdf119a6, 0xe39a7a5c), TOBN(0x47c60de3, 0x876f1b61)}}, + {{TOBN(0x6e405939, 0xd860f1b2), TOBN(0x3e9a1dbc, 0xf5ed4d4a), + TOBN(0x3f23619e, 0xc9b6bcbd), TOBN(0x5ee790cf, 0x734e4497)}, + {TOBN(0xf0a834b1, 0x5bdaf9bb), TOBN(0x02cedda7, 0x4ca295f0), + TOBN(0x4619aa2b, 0xcb8e378c), TOBN(0xe5613244, 0xcc987ea4)}}, + {{TOBN(0x0bc022cc, 0x76b23a50), TOBN(0x4a2793ad, 0x0a6c21ce), + TOBN(0x38328780, 0x89cac3f5), TOBN(0x29176f1b, 0xcba26d56)}, + {TOBN(0x06296187, 0x4f6f59eb), TOBN(0x86e9bca9, 0x8bdc658e), + TOBN(0x2ca9c4d3, 0x57e30402), TOBN(0x5438b216, 0x516a09bb)}}, + {{TOBN(0x0a6a063c, 0x7672765a), TOBN(0x37a3ce64, 0x0547b9bf), + TOBN(0x42c099c8, 0x98b1a633), TOBN(0xb5ab800d, 0x05ee6961)}, + {TOBN(0xf1963f59, 0x11a5acd6), TOBN(0xbaee6157, 0x46201063), + TOBN(0x36d9a649, 0xa596210a), TOBN(0xaed04363, 0x1ba7138c)}}, + {{TOBN(0xcf817d1c, 0xa4a82b76), TOBN(0x5586960e, 0xf3806be9), + TOBN(0x7ab67c89, 0x09dc6bb5), TOBN(0x52ace7a0, 0x114fe7eb)}, + {TOBN(0xcd987618, 0xcbbc9b70), TOBN(0x4f06fd5a, 0x604ca5e1), + TOBN(0x90af14ca, 0x6dbde133), TOBN(0x1afe4322, 0x948a3264)}}, + {{TOBN(0xa70d2ca6, 0xc44b2c6c), TOBN(0xab726799, 0x0ef87dfe), + TOBN(0x310f64dc, 0x2e696377), TOBN(0x49b42e68, 0x4c8126a0)}, + {TOBN(0x0ea444c3, 0xcea0b176), TOBN(0x53a8ddf7, 0xcb269182), + TOBN(0xf3e674eb, 0xbbba9dcb), TOBN(0x0d2878a8, 0xd8669d33)}}, + {{TOBN(0x04b935d5, 0xd019b6a3), TOBN(0xbb5cf88e, 0x406f1e46), + TOBN(0xa1912d16, 0x5b57c111), TOBN(0x9803fc21, 0x19ebfd78)}, + {TOBN(0x4f231c9e, 0xc07764a9), TOBN(0xd93286ee, 0xb75bd055), + TOBN(0x83a9457d, 0x8ee6c9de), TOBN(0x04695915, 0x6087ec90)}}, + {{TOBN(0x14c6dd8a, 0x58d6cd46), TOBN(0x9cb633b5, 0x8e6634d2), + TOBN(0xc1305047, 0xf81bc328), TOBN(0x12ede0e2, 0x26a177e5)}, + {TOBN(0x332cca62, 0x065a6f4f), TOBN(0xc3a47ecd, 0x67be487b), + TOBN(0x741eb187, 0x0f47ed1c), TOBN(0x99e66e58, 0xe7598b14)}}, + {{TOBN(0x6f0544ca, 0x63d0ff12), TOBN(0xe5efc784, 0xb610a05f), + TOBN(0xf72917b1, 0x7cad7b47), TOBN(0x3ff6ea20, 0xf2cac0c0)}, + {TOBN(0xcc23791b, 0xf21db8b7), TOBN(0x7dac70b1, 0xd7d93565), + TOBN(0x682cda1d, 0x694bdaad), TOBN(0xeb88bb8c, 0x1023516d)}}, + {{TOBN(0xc4c634b4, 0xdfdbeb1b), TOBN(0x22f5ca72, 0xb4ee4dea), + TOBN(0x1045a368, 0xe6524821), TOBN(0xed9e8a3f, 0x052b18b2)}, + {TOBN(0x9b7f2cb1, 0xb961f49a), TOBN(0x7fee2ec1, 0x7b009670), + TOBN(0x350d8754, 0x22507a6d), TOBN(0x561bd711, 0x4db55f1d)}}, + {{TOBN(0x4c189ccc, 0x320bbcaf), TOBN(0x568434cf, 0xdf1de48c), + TOBN(0x6af1b00e, 0x0fa8f128), TOBN(0xf0ba9d02, 0x8907583c)}, + {TOBN(0x735a4004, 
0x32ff9f60), TOBN(0x3dd8e4b6, 0xc25dcf33), + TOBN(0xf2230f16, 0x42c74cef), TOBN(0xd8117623, 0x013fa8ad)}}, + {{TOBN(0x36822876, 0xf51fe76e), TOBN(0x8a6811cc, 0x11d62589), + TOBN(0xc3fc7e65, 0x46225718), TOBN(0xb7df2c9f, 0xc82fdbcd)}, + {TOBN(0x3b1d4e52, 0xdd7b205b), TOBN(0xb6959478, 0x47a2e414), + TOBN(0x05e4d793, 0xefa91148), TOBN(0xb47ed446, 0xfd2e9675)}}, + {{TOBN(0x1a7098b9, 0x04c9d9bf), TOBN(0x661e2881, 0x1b793048), + TOBN(0xb1a16966, 0xb01ee461), TOBN(0xbc521308, 0x2954746f)}, + {TOBN(0xc909a0fc, 0x2477de50), TOBN(0xd80bb41c, 0x7dbd51ef), + TOBN(0xa85be7ec, 0x53294905), TOBN(0x6d465b18, 0x83958f97)}}, + {{TOBN(0x16f6f330, 0xfb6840fd), TOBN(0xfaaeb214, 0x3401e6c8), + TOBN(0xaf83d30f, 0xccb5b4f8), TOBN(0x22885739, 0x266dec4b)}, + {TOBN(0x51b4367c, 0x7bc467df), TOBN(0x926562e3, 0xd842d27a), + TOBN(0xdfcb6614, 0x0fea14a6), TOBN(0xeb394dae, 0xf2734cd9)}}, + {{TOBN(0x3eeae5d2, 0x11c0be98), TOBN(0xb1e6ed11, 0x814e8165), + TOBN(0x191086bc, 0xe52bce1c), TOBN(0x14b74cc6, 0xa75a04da)}, + {TOBN(0x63cf1186, 0x8c060985), TOBN(0x071047de, 0x2dbd7f7c), + TOBN(0x4e433b8b, 0xce0942ca), TOBN(0xecbac447, 0xd8fec61d)}}, + {{TOBN(0x8f0ed0e2, 0xebf3232f), TOBN(0xfff80f9e, 0xc52a2edd), + TOBN(0xad9ab433, 0x75b55fdb), TOBN(0x73ca7820, 0xe42e0c11)}, + {TOBN(0x6dace0a0, 0xe6251b46), TOBN(0x89bc6b5c, 0x4c0d932d), + TOBN(0x3438cd77, 0x095da19a), TOBN(0x2f24a939, 0x8d48bdfb)}}, + {{TOBN(0x99b47e46, 0x766561b7), TOBN(0x736600e6, 0x0ed0322a), + TOBN(0x06a47cb1, 0x638e1865), TOBN(0x927c1c2d, 0xcb136000)}, + {TOBN(0x29542337, 0x0cc5df69), TOBN(0x99b37c02, 0x09d649a9), + TOBN(0xc5f0043c, 0x6aefdb27), TOBN(0x6cdd9987, 0x1be95c27)}}, + {{TOBN(0x69850931, 0x390420d2), TOBN(0x299c40ac, 0x0983efa4), + TOBN(0x3a05e778, 0xaf39aead), TOBN(0x84274408, 0x43a45193)}, + {TOBN(0x6bcd0fb9, 0x91a711a0), TOBN(0x461592c8, 0x9f52ab17), + TOBN(0xb49302b4, 0xda3c6ed6), TOBN(0xc51fddc7, 0x330d7067)}}, + {{TOBN(0x94babeb6, 0xda50d531), TOBN(0x521b840d, 0xa6a7b9da), + TOBN(0x5305151e, 0x404bdc89), TOBN(0x1bcde201, 0xd0d07449)}, + {TOBN(0xf427a78b, 0x3b76a59a), TOBN(0xf84841ce, 0x07791a1b), + TOBN(0xebd314be, 0xbf91ed1c), TOBN(0x8e61d34c, 0xbf172943)}}, + {{TOBN(0x1d5dc451, 0x5541b892), TOBN(0xb186ee41, 0xfc9d9e54), + TOBN(0x9d9f345e, 0xd5bf610d), TOBN(0x3e7ba65d, 0xf6acca9f)}, + {TOBN(0x9dda787a, 0xa8369486), TOBN(0x09f9dab7, 0x8eb5ba53), + TOBN(0x5afb2033, 0xd6481bc3), TOBN(0x76f4ce30, 0xafa62104)}}, + {{TOBN(0xa8fa00cf, 0xf4f066b5), TOBN(0x89ab5143, 0x461dafc2), + TOBN(0x44339ed7, 0xa3389998), TOBN(0x2ff862f1, 0xbc214903)}, + {TOBN(0x2c88f985, 0xb05556e3), TOBN(0xcd96058e, 0x3467081e), + TOBN(0x7d6a4176, 0xedc637ea), TOBN(0xe1743d09, 0x36a5acdc)}}, + {{TOBN(0x66fd72e2, 0x7eb37726), TOBN(0xf7fa264e, 0x1481a037), + TOBN(0x9fbd3bde, 0x45f4aa79), TOBN(0xed1e0147, 0x767c3e22)}, + {TOBN(0x7621f979, 0x82e7abe2), TOBN(0x19eedc72, 0x45f633f8), + TOBN(0xe69b155e, 0x6137bf3a), TOBN(0xa0ad13ce, 0x414ee94e)}}, + {{TOBN(0x93e3d524, 0x1c0e651a), TOBN(0xab1a6e2a, 0x02ce227e), + TOBN(0xe7af1797, 0x4ab27eca), TOBN(0x245446de, 0xbd444f39)}, + {TOBN(0x59e22a21, 0x56c07613), TOBN(0x43deafce, 0xf4275498), + TOBN(0x10834ccb, 0x67fd0946), TOBN(0xa75841e5, 0x47406edf)}}, + {{TOBN(0xebd6a677, 0x7b0ac93d), TOBN(0xa6e37b0d, 0x78f5e0d7), + TOBN(0x2516c096, 0x76f5492b), TOBN(0x1e4bf888, 0x9ac05f3a)}, + {TOBN(0xcdb42ce0, 0x4df0ba2b), TOBN(0x935d5cfd, 0x5062341b), + TOBN(0x8a303333, 0x82acac20), TOBN(0x429438c4, 0x5198b00e)}}, + {{TOBN(0x1d083bc9, 0x049d33fa), TOBN(0x58b82dda, 0x946f67ff), + TOBN(0xac3e2db8, 0x67a1d6a3), TOBN(0x62e6bead, 0x1798aac8)}, + 
{TOBN(0xfc85980f, 0xde46c58c), TOBN(0xa7f69379, 0x69c8d7be), + TOBN(0x23557927, 0x837b35ec), TOBN(0x06a933d8, 0xe0790c0c)}}, + {{TOBN(0x827c0e9b, 0x077ff55d), TOBN(0x53977798, 0xbb26e680), + TOBN(0x59530874, 0x1d9cb54f), TOBN(0xcca3f449, 0x4aac53ef)}, + {TOBN(0x11dc5c87, 0xa07eda0f), TOBN(0xc138bccf, 0xfd6400c8), + TOBN(0x549680d3, 0x13e5da72), TOBN(0xc93eed82, 0x4540617e)}}, + {{TOBN(0xfd3db157, 0x4d0b75c0), TOBN(0x9716eb42, 0x6386075b), + TOBN(0x0639605c, 0x817b2c16), TOBN(0x09915109, 0xf1e4f201)}, + {TOBN(0x35c9a928, 0x5cca6c3b), TOBN(0xb25f7d1a, 0x3505c900), + TOBN(0xeb9f7d20, 0x630480c4), TOBN(0xc3c7b8c6, 0x2a1a501c)}}, + {{TOBN(0x3f99183c, 0x5a1f8e24), TOBN(0xfdb118fa, 0x9dd255f0), + TOBN(0xb9b18b90, 0xc27f62a6), TOBN(0xe8f732f7, 0x396ec191)}, + {TOBN(0x524a2d91, 0x0be786ab), TOBN(0x5d32adef, 0x0ac5a0f5), + TOBN(0x9b53d4d6, 0x9725f694), TOBN(0x032a76c6, 0x0510ba89)}}, + {{TOBN(0x840391a3, 0xebeb1544), TOBN(0x44b7b88c, 0x3ed73ac3), + TOBN(0xd24bae7a, 0x256cb8b3), TOBN(0x7ceb151a, 0xe394cb12)}, + {TOBN(0xbd6b66d0, 0x5bc1e6a8), TOBN(0xec70cecb, 0x090f07bf), + TOBN(0x270644ed, 0x7d937589), TOBN(0xee9e1a3d, 0x5f1dccfe)}}, + {{TOBN(0xb0d40a84, 0x745b98d2), TOBN(0xda429a21, 0x2556ed40), + TOBN(0xf676eced, 0x85148cb9), TOBN(0x5a22d40c, 0xded18936)}, + {TOBN(0x3bc4b9e5, 0x70e8a4ce), TOBN(0xbfd1445b, 0x9eae0379), + TOBN(0xf23f2c0c, 0x1a0bd47e), TOBN(0xa9c0bb31, 0xe1845531)}}, + {{TOBN(0x9ddc4d60, 0x0a4c3f6b), TOBN(0xbdfaad79, 0x2c15ef44), + TOBN(0xce55a236, 0x7f484acc), TOBN(0x08653ca7, 0x055b1f15)}, + {TOBN(0x2efa8724, 0x538873a3), TOBN(0x09299e5d, 0xace1c7e7), + TOBN(0x07afab66, 0xade332ba), TOBN(0x9be1fdf6, 0x92dd71b7)}}, + {{TOBN(0xa49b5d59, 0x5758b11c), TOBN(0x0b852893, 0xc8654f40), + TOBN(0xb63ef6f4, 0x52379447), TOBN(0xd4957d29, 0x105e690c)}, + {TOBN(0x7d484363, 0x646559b0), TOBN(0xf4a8273c, 0x49788a8e), + TOBN(0xee406cb8, 0x34ce54a9), TOBN(0x1e1c260f, 0xf86fda9b)}}, + {{TOBN(0xe150e228, 0xcf6a4a81), TOBN(0x1fa3b6a3, 0x1b488772), + TOBN(0x1e6ff110, 0xc5a9c15b), TOBN(0xc6133b91, 0x8ad6aa47)}, + {TOBN(0x8ac5d55c, 0x9dffa978), TOBN(0xba1d1c1d, 0x5f3965f2), + TOBN(0xf969f4e0, 0x7732b52f), TOBN(0xfceecdb5, 0xa5172a07)}}, + {{TOBN(0xb0120a5f, 0x10f2b8f5), TOBN(0xc83a6cdf, 0x5c4c2f63), + TOBN(0x4d47a491, 0xf8f9c213), TOBN(0xd9e1cce5, 0xd3f1bbd5)}, + {TOBN(0x0d91bc7c, 0xaba7e372), TOBN(0xfcdc74c8, 0xdfd1a2db), + TOBN(0x05efa800, 0x374618e5), TOBN(0x11216969, 0x15a7925e)}}, + {{TOBN(0xd4c89823, 0xf6021c5d), TOBN(0x880d5e84, 0xeff14423), + TOBN(0x6523bc5a, 0x6dcd1396), TOBN(0xd1acfdfc, 0x113c978b)}, + {TOBN(0xb0c164e8, 0xbbb66840), TOBN(0xf7f4301e, 0x72b58459), + TOBN(0xc29ad4a6, 0xa638e8ec), TOBN(0xf5ab8961, 0x46b78699)}}, + {{TOBN(0x9dbd7974, 0x0e954750), TOBN(0x0121de88, 0x64f9d2c6), + TOBN(0x2e597b42, 0xd985232e), TOBN(0x55b6c3c5, 0x53451777)}, + {TOBN(0xbb53e547, 0x519cb9fb), TOBN(0xf134019f, 0x8428600d), + TOBN(0x5a473176, 0xe081791a), TOBN(0x2f3e2263, 0x35fb0c08)}}, + {{TOBN(0xb28c3017, 0x73d273b0), TOBN(0xccd21076, 0x7721ef9a), + TOBN(0x054cc292, 0xb650dc39), TOBN(0x662246de, 0x6188045e)}, + {TOBN(0x904b52fa, 0x6b83c0d1), TOBN(0xa72df267, 0x97e9cd46), + TOBN(0x886b43cd, 0x899725e4), TOBN(0x2b651688, 0xd849ff22)}}, + {{TOBN(0x60479b79, 0x02f34533), TOBN(0x5e354c14, 0x0c77c148), + TOBN(0xb4bb7581, 0xa8537c78), TOBN(0x188043d7, 0xefe1495f)}, + {TOBN(0x9ba12f42, 0x8c1d5026), TOBN(0x2e0c8a26, 0x93d4aaab), + TOBN(0xbdba7b8b, 0xaa57c450), TOBN(0x140c9ad6, 0x9bbdafef)}}, + {{TOBN(0x2067aa42, 0x25ac0f18), TOBN(0xf7b1295b, 0x04d1fbf3), + TOBN(0x14829111, 0xa4b04824), TOBN(0x2ce3f192, 0x33bd5e91)}, 
+ {TOBN(0x9c7a1d55, 0x8f2e1b72), TOBN(0xfe932286, 0x302aa243), + TOBN(0x497ca7b4, 0xd4be9554), TOBN(0xb8e821b8, 0xe0547a6e)}}, + {{TOBN(0xfb2838be, 0x67e573e0), TOBN(0x05891db9, 0x4084c44b), + TOBN(0x91311373, 0x96c1c2c5), TOBN(0x6aebfa3f, 0xd958444b)}, + {TOBN(0xac9cdce9, 0xe56e55c1), TOBN(0x7148ced3, 0x2caa46d0), + TOBN(0x2e10c7ef, 0xb61fe8eb), TOBN(0x9fd835da, 0xff97cf4d)}}}, + {{{TOBN(0xa36da109, 0x081e9387), TOBN(0xfb9780d7, 0x8c935828), + TOBN(0xd5940332, 0xe540b015), TOBN(0xc9d7b51b, 0xe0f466fa)}, + {TOBN(0xfaadcd41, 0xd6d9f671), TOBN(0xba6c1e28, 0xb1a2ac17), + TOBN(0x066a7833, 0xed201e5f), TOBN(0x19d99719, 0xf90f462b)}}, + {{TOBN(0xf431f462, 0x060b5f61), TOBN(0xa56f46b4, 0x7bd057c2), + TOBN(0x348dca6c, 0x47e1bf65), TOBN(0x9a38783e, 0x41bcf1ff)}, + {TOBN(0x7a5d33a9, 0xda710718), TOBN(0x5a779987, 0x2e0aeaf6), + TOBN(0xca87314d, 0x2d29d187), TOBN(0xfa0edc3e, 0xc687d733)}}, + {{TOBN(0x9df33621, 0x6a31e09b), TOBN(0xde89e44d, 0xc1350e35), + TOBN(0x29214871, 0x4ca0cf52), TOBN(0xdf379672, 0x0b88a538)}, + {TOBN(0xc92a510a, 0x2591d61b), TOBN(0x79aa87d7, 0x585b447b), + TOBN(0xf67db604, 0xe5287f77), TOBN(0x1697c8bf, 0x5efe7a80)}}, + {{TOBN(0x1c894849, 0xcb198ac7), TOBN(0xa884a93d, 0x0f264665), + TOBN(0x2da964ef, 0x9b200678), TOBN(0x3c351b87, 0x009834e6)}, + {TOBN(0xafb2ef9f, 0xe2c4b44b), TOBN(0x580f6c47, 0x3326790c), + TOBN(0xb8480521, 0x0b02264a), TOBN(0x8ba6f9e2, 0x42a194e2)}}, + {{TOBN(0xfc87975f, 0x8fb54738), TOBN(0x35160788, 0x27c3ead3), + TOBN(0x834116d2, 0xb74a085a), TOBN(0x53c99a73, 0xa62fe996)}, + {TOBN(0x87585be0, 0x5b81c51b), TOBN(0x925bafa8, 0xbe0852b7), + TOBN(0x76a4fafd, 0xa84d19a7), TOBN(0x39a45982, 0x585206d4)}}, + {{TOBN(0x499b6ab6, 0x5eb03c0e), TOBN(0xf19b7954, 0x72bc3fde), + TOBN(0xa86b5b9c, 0x6e3a80d2), TOBN(0xe4377508, 0x6d42819f)}, + {TOBN(0xc1663650, 0xbb3ee8a3), TOBN(0x75eb14fc, 0xb132075f), + TOBN(0xa8ccc906, 0x7ad834f6), TOBN(0xea6a2474, 0xe6e92ffd)}}, + {{TOBN(0x9d72fd95, 0x0f8d6758), TOBN(0xcb84e101, 0x408c07dd), + TOBN(0xb9114bfd, 0xa5e23221), TOBN(0x358b5fe2, 0xe94e742c)}, + {TOBN(0x1c0577ec, 0x95f40e75), TOBN(0xf0155451, 0x3d73f3d6), + TOBN(0x9d55cd67, 0xbd1b9b66), TOBN(0x63e86e78, 0xaf8d63c7)}}, + {{TOBN(0x39d934ab, 0xd3c095f1), TOBN(0x04b261be, 0xe4b76d71), + TOBN(0x1d2e6970, 0xe73e6984), TOBN(0x879fb23b, 0x5e5fcb11)}, + {TOBN(0x11506c72, 0xdfd75490), TOBN(0x3a97d085, 0x61bcf1c1), + TOBN(0x43201d82, 0xbf5e7007), TOBN(0x7f0ac52f, 0x798232a7)}}, + {{TOBN(0x2715cbc4, 0x6eb564d4), TOBN(0x8d6c752c, 0x9e570e29), + TOBN(0xf80247c8, 0x9ef5fd5d), TOBN(0xc3c66b46, 0xd53eb514)}, + {TOBN(0x9666b401, 0x0f87de56), TOBN(0xce62c06f, 0xc6c603b5), + TOBN(0xae7b4c60, 0x7e4fc942), TOBN(0x38ac0b77, 0x663a9c19)}}, + {{TOBN(0xcb4d20ee, 0x4b049136), TOBN(0x8b63bf12, 0x356a4613), + TOBN(0x1221aef6, 0x70e08128), TOBN(0xe62d8c51, 0x4acb6b16)}, + {TOBN(0x71f64a67, 0x379e7896), TOBN(0xb25237a2, 0xcafd7fa5), + TOBN(0xf077bd98, 0x3841ba6a), TOBN(0xc4ac0244, 0x3cd16e7e)}}, + {{TOBN(0x548ba869, 0x21fea4ca), TOBN(0xd36d0817, 0xf3dfdac1), + TOBN(0x09d8d71f, 0xf4685faf), TOBN(0x8eff66be, 0xc52c459a)}, + {TOBN(0x182faee7, 0x0b57235e), TOBN(0xee3c39b1, 0x0106712b), + TOBN(0x5107331f, 0xc0fcdcb0), TOBN(0x669fb9dc, 0xa51054ba)}}, + {{TOBN(0xb25101fb, 0x319d7682), TOBN(0xb0293129, 0x0a982fee), + TOBN(0x51c1c9b9, 0x0261b344), TOBN(0x0e008c5b, 0xbfd371fa)}, + {TOBN(0xd866dd1c, 0x0278ca33), TOBN(0x666f76a6, 0xe5aa53b1), + TOBN(0xe5cfb779, 0x6013a2cf), TOBN(0x1d3a1aad, 0xa3521836)}}, + {{TOBN(0xcedd2531, 0x73faa485), TOBN(0xc8ee6c4f, 0xc0a76878), + TOBN(0xddbccfc9, 0x2a11667d), TOBN(0x1a418ea9, 
0x1c2f695a)}, + {TOBN(0xdb11bd92, 0x51f73971), TOBN(0x3e4b3c82, 0xda2ed89f), + TOBN(0x9a44f3f4, 0xe73e0319), TOBN(0xd1e3de0f, 0x303431af)}}, + {{TOBN(0x3c5604ff, 0x50f75f9c), TOBN(0x1d8eddf3, 0x7e752b22), + TOBN(0x0ef074dd, 0x3c9a1118), TOBN(0xd0ffc172, 0xccb86d7b)}, + {TOBN(0xabd1ece3, 0x037d90f2), TOBN(0xe3f307d6, 0x6055856c), + TOBN(0x422f9328, 0x7e4c6daf), TOBN(0x902aac66, 0x334879a0)}}, + {{TOBN(0xb6a1e7bf, 0x94cdfade), TOBN(0x6c97e1ed, 0x7fc6d634), + TOBN(0x662ad24d, 0xa2fb63f8), TOBN(0xf81be1b9, 0xa5928405)}, + {TOBN(0x86d765e4, 0xd14b4206), TOBN(0xbecc2e0e, 0x8fa0db65), + TOBN(0xa28838e0, 0xb17fc76c), TOBN(0xe49a602a, 0xe37cf24e)}}, + {{TOBN(0x76b4131a, 0x567193ec), TOBN(0xaf3c305a, 0xe5f6e70b), + TOBN(0x9587bd39, 0x031eebdd), TOBN(0x5709def8, 0x71bbe831)}, + {TOBN(0x57059983, 0x0eb2b669), TOBN(0x4d80ce1b, 0x875b7029), + TOBN(0x838a7da8, 0x0364ac16), TOBN(0x2f431d23, 0xbe1c83ab)}}, + {{TOBN(0xe56812a6, 0xf9294dd3), TOBN(0xb448d01f, 0x9b4b0d77), + TOBN(0xf3ae6061, 0x04e8305c), TOBN(0x2bead645, 0x94d8c63e)}, + {TOBN(0x0a85434d, 0x84fd8b07), TOBN(0x537b983f, 0xf7a9dee5), + TOBN(0xedcc5f18, 0xef55bd85), TOBN(0x2041af62, 0x21c6cf8b)}}, + {{TOBN(0x8e52874c, 0xb940c71e), TOBN(0x211935a9, 0xdb5f4b3a), + TOBN(0x94350492, 0x301b1dc3), TOBN(0x33d2646d, 0x29958620)}, + {TOBN(0x16b0d64b, 0xef911404), TOBN(0x9d1f25ea, 0x9a3c5ef4), + TOBN(0x20f200eb, 0x4a352c78), TOBN(0x43929f2c, 0x4bd0b428)}}, + {{TOBN(0xa5656667, 0xc7196e29), TOBN(0x7992c2f0, 0x9391be48), + TOBN(0xaaa97cbd, 0x9ee0cd6e), TOBN(0x51b0310c, 0x3dc8c9bf)}, + {TOBN(0x237f8acf, 0xdd9f22cb), TOBN(0xbb1d81a1, 0xb585d584), + TOBN(0x8d5d85f5, 0x8c416388), TOBN(0x0d6e5a5a, 0x42fe474f)}}, + {{TOBN(0xe7812766, 0x38235d4e), TOBN(0x1c62bd67, 0x496e3298), + TOBN(0x8378660c, 0x3f175bc8), TOBN(0x4d04e189, 0x17afdd4d)}, + {TOBN(0x32a81601, 0x85a8068c), TOBN(0xdb58e4e1, 0x92b29a85), + TOBN(0xe8a65b86, 0xc70d8a3b), TOBN(0x5f0e6f4e, 0x98a0403b)}}, + {{TOBN(0x08129684, 0x69ed2370), TOBN(0x34dc30bd, 0x0871ee26), + TOBN(0x3a5ce948, 0x7c9c5b05), TOBN(0x7d487b80, 0x43a90c87)}, + {TOBN(0x4089ba37, 0xdd0e7179), TOBN(0x45f80191, 0xb4041811), + TOBN(0x1c3e1058, 0x98747ba5), TOBN(0x98c4e13a, 0x6e1ae592)}}, + {{TOBN(0xd44636e6, 0xe82c9f9e), TOBN(0x711db87c, 0xc33a1043), + TOBN(0x6f431263, 0xaa8aec05), TOBN(0x43ff120d, 0x2744a4aa)}, + {TOBN(0xd3bd892f, 0xae77779b), TOBN(0xf0fe0cc9, 0x8cdc9f82), + TOBN(0xca5f7fe6, 0xf1c5b1bc), TOBN(0xcc63a682, 0x44929a72)}}, + {{TOBN(0xc7eaba0c, 0x09dbe19a), TOBN(0x2f3585ad, 0x6b5c73c2), + TOBN(0x8ab8924b, 0x0ae50c30), TOBN(0x17fcd27a, 0x638b30ba)}, + {TOBN(0xaf414d34, 0x10b3d5a5), TOBN(0x09c107d2, 0x2a9accf1), + TOBN(0x15dac49f, 0x946a6242), TOBN(0xaec3df2a, 0xd707d642)}}, + {{TOBN(0x2c2492b7, 0x3f894ae0), TOBN(0xf59df3e5, 0xb75f18ce), + TOBN(0x7cb740d2, 0x8f53cad0), TOBN(0x3eb585fb, 0xc4f01294)}, + {TOBN(0x17da0c86, 0x32c7f717), TOBN(0xeb8c795b, 0xaf943f4c), + TOBN(0x4ee23fb5, 0xf67c51d2), TOBN(0xef187575, 0x68889949)}}, + {{TOBN(0xa6b4bdb2, 0x0389168b), TOBN(0xc4ecd258, 0xea577d03), + TOBN(0x3a63782b, 0x55743082), TOBN(0x6f678f4c, 0xc72f08cd)}, + {TOBN(0x553511cf, 0x65e58dd8), TOBN(0xd53b4e3e, 0xd402c0cd), + TOBN(0x37de3e29, 0xa037c14c), TOBN(0x86b6c516, 0xc05712aa)}}, + {{TOBN(0x2834da3e, 0xb38dff6f), TOBN(0xbe012c52, 0xea636be8), + TOBN(0x292d238c, 0x61dd37f8), TOBN(0x0e54523f, 0x8f8142db)}, + {TOBN(0xe31eb436, 0x036a05d8), TOBN(0x83e3cdff, 0x1e93c0ff), + TOBN(0x3fd2fe0f, 0x50821ddf), TOBN(0xc8e19b0d, 0xff9eb33b)}}, + {{TOBN(0xc8cc943f, 0xb569a5fe), TOBN(0xad0090d4, 0xd4342d75), + TOBN(0x82090b4b, 0xcaeca000), 
TOBN(0xca39687f, 0x1bd410eb)}, + {TOBN(0xe7bb0df7, 0x65959d77), TOBN(0x39d78218, 0x9c964999), + TOBN(0xd87f62e8, 0xb2415451), TOBN(0xe5efb774, 0xbed76108)}}, + {{TOBN(0x3ea011a4, 0xe822f0d0), TOBN(0xbc647ad1, 0x5a8704f8), + TOBN(0xbb315b35, 0x50c6820f), TOBN(0x863dec3d, 0xb7e76bec)}, + {TOBN(0x01ff5d3a, 0xf017bfc7), TOBN(0x20054439, 0x976b8229), + TOBN(0x067fca37, 0x0bbd0d3b), TOBN(0xf63dde64, 0x7f5e3d0f)}}, + {{TOBN(0x22dbefb3, 0x2a4c94e9), TOBN(0xafbff0fe, 0x96f8278a), + TOBN(0x80aea0b1, 0x3503793d), TOBN(0xb2238029, 0x5f06cd29)}, + {TOBN(0x65703e57, 0x8ec3feca), TOBN(0x06c38314, 0x393e7053), + TOBN(0xa0b751eb, 0x7c6734c4), TOBN(0xd2e8a435, 0xc59f0f1e)}}, + {{TOBN(0x147d9052, 0x5e9ca895), TOBN(0x2f4dd31e, 0x972072df), + TOBN(0xa16fda8e, 0xe6c6755c), TOBN(0xc66826ff, 0xcf196558)}, + {TOBN(0x1f1a76a3, 0x0cf43895), TOBN(0xa9d604e0, 0x83c3097b), + TOBN(0xe1908309, 0x66390e0e), TOBN(0xa50bf753, 0xb3c85eff)}}, + {{TOBN(0x0696bdde, 0xf6a70251), TOBN(0x548b801b, 0x3c6ab16a), + TOBN(0x37fcf704, 0xa4d08762), TOBN(0x090b3def, 0xdff76c4e)}, + {TOBN(0x87e8cb89, 0x69cb9158), TOBN(0x44a90744, 0x995ece43), + TOBN(0xf85395f4, 0x0ad9fbf5), TOBN(0x49b0f6c5, 0x4fb0c82d)}}, + {{TOBN(0x75d9bc15, 0xadf7cccf), TOBN(0x81a3e5d6, 0xdfa1e1b0), + TOBN(0x8c39e444, 0x249bc17e), TOBN(0xf37dccb2, 0x8ea7fd43)}, + {TOBN(0xda654873, 0x907fba12), TOBN(0x35daa6da, 0x4a372904), + TOBN(0x0564cfc6, 0x6283a6c5), TOBN(0xd09fa4f6, 0x4a9395bf)}}, + {{TOBN(0x688e9ec9, 0xaeb19a36), TOBN(0xd913f1ce, 0xc7bfbfb4), + TOBN(0x797b9a3c, 0x61c2faa6), TOBN(0x2f979bec, 0x6a0a9c12)}, + {TOBN(0xb5969d0f, 0x359679ec), TOBN(0xebcf523d, 0x079b0460), + TOBN(0xfd6b0008, 0x10fab870), TOBN(0x3f2edcda, 0x9373a39c)}}, + {{TOBN(0x0d64f9a7, 0x6f568431), TOBN(0xf848c27c, 0x02f8898c), + TOBN(0xf418ade1, 0x260b5bd5), TOBN(0xc1f3e323, 0x6973dee8)}, + {TOBN(0x46e9319c, 0x26c185dd), TOBN(0x6d85b7d8, 0x546f0ac4), + TOBN(0x427965f2, 0x247f9d57), TOBN(0xb519b636, 0xb0035f48)}}, + {{TOBN(0x6b6163a9, 0xab87d59c), TOBN(0xff9f58c3, 0x39caaa11), + TOBN(0x4ac39cde, 0x3177387b), TOBN(0x5f6557c2, 0x873e77f9)}, + {TOBN(0x67504006, 0x36a83041), TOBN(0x9b1c96ca, 0x75ef196c), + TOBN(0xf34283de, 0xb08c7940), TOBN(0x7ea09644, 0x1128c316)}}, + {{TOBN(0xb510b3b5, 0x6aa39dff), TOBN(0x59b43da2, 0x9f8e4d8c), + TOBN(0xa8ce31fd, 0x9e4c4b9f), TOBN(0x0e20be26, 0xc1303c01)}, + {TOBN(0x18187182, 0xe8ee47c9), TOBN(0xd9687cdb, 0x7db98101), + TOBN(0x7a520e4d, 0xa1e14ff6), TOBN(0x429808ba, 0x8836d572)}}, + {{TOBN(0xa37ca60d, 0x4944b663), TOBN(0xf901f7a9, 0xa3f91ae5), + TOBN(0xe4e3e76e, 0x9e36e3b1), TOBN(0x9aa219cf, 0x29d93250)}, + {TOBN(0x347fe275, 0x056a2512), TOBN(0xa4d643d9, 0xde65d95c), + TOBN(0x9669d396, 0x699fc3ed), TOBN(0xb598dee2, 0xcf8c6bbe)}}, + {{TOBN(0x682ac1e5, 0xdda9e5c6), TOBN(0x4e0d3c72, 0xcaa9fc95), + TOBN(0x17faaade, 0x772bea44), TOBN(0x5ef8428c, 0xab0009c8)}, + {TOBN(0xcc4ce47a, 0x460ff016), TOBN(0xda6d12bf, 0x725281cb), + TOBN(0x44c67848, 0x0223aad2), TOBN(0x6e342afa, 0x36256e28)}}, + {{TOBN(0x1400bb0b, 0x93a37c04), TOBN(0x62b1bc9b, 0xdd10bd96), + TOBN(0x7251adeb, 0x0dac46b7), TOBN(0x7d33b92e, 0x7be4ef51)}, + {TOBN(0x28b2a94b, 0xe61fa29a), TOBN(0x4b2be13f, 0x06422233), + TOBN(0x36d6d062, 0x330d8d37), TOBN(0x5ef80e1e, 0xb28ca005)}}, + {{TOBN(0x174d4699, 0x6d16768e), TOBN(0x9fc4ff6a, 0x628bf217), + TOBN(0x77705a94, 0x154e490d), TOBN(0x9d96dd28, 0x8d2d997a)}, + {TOBN(0x77e2d9d8, 0xce5d72c4), TOBN(0x9d06c5a4, 0xc11c714f), + TOBN(0x02aa5136, 0x79e4a03e), TOBN(0x1386b3c2, 0x030ff28b)}}, + {{TOBN(0xfe82e8a6, 0xfb283f61), TOBN(0x7df203e5, 0xf3abc3fb), + TOBN(0xeec7c351, 
0x3a4d3622), TOBN(0xf7d17dbf, 0xdf762761)}, + {TOBN(0xc3956e44, 0x522055f0), TOBN(0xde3012db, 0x8fa748db), + TOBN(0xca9fcb63, 0xbf1dcc14), TOBN(0xa56d9dcf, 0xbe4e2f3a)}}, + {{TOBN(0xb86186b6, 0x8bcec9c2), TOBN(0x7cf24df9, 0x680b9f06), + TOBN(0xc46b45ea, 0xc0d29281), TOBN(0xfff42bc5, 0x07b10e12)}, + {TOBN(0x12263c40, 0x4d289427), TOBN(0x3d5f1899, 0xb4848ec4), + TOBN(0x11f97010, 0xd040800c), TOBN(0xb4c5f529, 0x300feb20)}}, + {{TOBN(0xcc543f8f, 0xde94fdcb), TOBN(0xe96af739, 0xc7c2f05e), + TOBN(0xaa5e0036, 0x882692e1), TOBN(0x09c75b68, 0x950d4ae9)}, + {TOBN(0x62f63df2, 0xb5932a7a), TOBN(0x2658252e, 0xde0979ad), + TOBN(0x2a19343f, 0xb5e69631), TOBN(0x718c7501, 0x525b666b)}}, + {{TOBN(0x26a42d69, 0xea40dc3a), TOBN(0xdc84ad22, 0xaecc018f), + TOBN(0x25c36c7b, 0x3270f04a), TOBN(0x46ba6d47, 0x50fa72ed)}, + {TOBN(0x6c37d1c5, 0x93e58a8e), TOBN(0xa2394731, 0x120c088c), + TOBN(0xc3be4263, 0xcb6e86da), TOBN(0x2c417d36, 0x7126d038)}}, + {{TOBN(0x5b70f9c5, 0x8b6f8efa), TOBN(0x671a2faa, 0x37718536), + TOBN(0xd3ced3c6, 0xb539c92b), TOBN(0xe56f1bd9, 0xa31203c2)}, + {TOBN(0x8b096ec4, 0x9ff3c8eb), TOBN(0x2deae432, 0x43491cea), + TOBN(0x2465c6eb, 0x17943794), TOBN(0x5d267e66, 0x20586843)}}, + {{TOBN(0x9d3d116d, 0xb07159d0), TOBN(0xae07a67f, 0xc1896210), + TOBN(0x8fc84d87, 0xbb961579), TOBN(0x30009e49, 0x1c1f8dd6)}, + {TOBN(0x8a8caf22, 0xe3132819), TOBN(0xcffa197c, 0xf23ab4ff), + TOBN(0x58103a44, 0x205dd687), TOBN(0x57b796c3, 0x0ded67a2)}}, + {{TOBN(0x0b9c3a6c, 0xa1779ad7), TOBN(0xa33cfe2e, 0x357c09c5), + TOBN(0x2ea29315, 0x3db4a57e), TOBN(0x91959695, 0x8ebeb52e)}, + {TOBN(0x118db9a6, 0xe546c879), TOBN(0x8e996df4, 0x6295c8d6), + TOBN(0xdd990484, 0x55ec806b), TOBN(0x24f291ca, 0x165c1035)}}, + {{TOBN(0xcca523bb, 0x440e2229), TOBN(0x324673a2, 0x73ef4d04), + TOBN(0xaf3adf34, 0x3e11ec39), TOBN(0x6136d7f1, 0xdc5968d3)}, + {TOBN(0x7a7b2899, 0xb053a927), TOBN(0x3eaa2661, 0xae067ecd), + TOBN(0x8549b9c8, 0x02779cd9), TOBN(0x061d7940, 0xc53385ea)}}, + {{TOBN(0x3e0ba883, 0xf06d18bd), TOBN(0x4ba6de53, 0xb2700843), + TOBN(0xb966b668, 0x591a9e4d), TOBN(0x93f67567, 0x7f4fa0ed)}, + {TOBN(0x5a02711b, 0x4347237b), TOBN(0xbc041e2f, 0xe794608e), + TOBN(0x55af10f5, 0x70f73d8c), TOBN(0xd2d4d4f7, 0xbb7564f7)}}, + {{TOBN(0xd7d27a89, 0xb3e93ce7), TOBN(0xf7b5a875, 0x5d3a2c1b), + TOBN(0xb29e68a0, 0x255b218a), TOBN(0xb533837e, 0x8af76754)}, + {TOBN(0xd1b05a73, 0x579fab2e), TOBN(0xb41055a1, 0xecd74385), + TOBN(0xb2369274, 0x445e9115), TOBN(0x2972a7c4, 0xf520274e)}}, + {{TOBN(0x6c08334e, 0xf678e68a), TOBN(0x4e4160f0, 0x99b057ed), + TOBN(0x3cfe11b8, 0x52ccb69a), TOBN(0x2fd1823a, 0x21c8f772)}, + {TOBN(0xdf7f072f, 0x3298f055), TOBN(0x8c0566f9, 0xfec74a6e), + TOBN(0xe549e019, 0x5bb4d041), TOBN(0x7c3930ba, 0x9208d850)}}, + {{TOBN(0xe07141fc, 0xaaa2902b), TOBN(0x539ad799, 0xe4f69ad3), + TOBN(0xa6453f94, 0x813f9ffd), TOBN(0xc58d3c48, 0x375bc2f7)}, + {TOBN(0xb3326fad, 0x5dc64e96), TOBN(0x3aafcaa9, 0xb240e354), + TOBN(0x1d1b0903, 0xaca1e7a9), TOBN(0x4ceb9767, 0x1211b8a0)}}, + {{TOBN(0xeca83e49, 0xe32a858e), TOBN(0x4c32892e, 0xae907bad), + TOBN(0xd5b42ab6, 0x2eb9b494), TOBN(0x7fde3ee2, 0x1eabae1b)}, + {TOBN(0x13b5ab09, 0xcaf54957), TOBN(0xbfb028be, 0xe5f5d5d5), + TOBN(0x928a0650, 0x2003e2c0), TOBN(0x90793aac, 0x67476843)}}, + {{TOBN(0x5e942e79, 0xc81710a0), TOBN(0x557e4a36, 0x27ccadd4), + TOBN(0x72a2bc56, 0x4bcf6d0c), TOBN(0x09ee5f43, 0x26d7b80c)}, + {TOBN(0x6b70dbe9, 0xd4292f19), TOBN(0x56f74c26, 0x63f16b18), + TOBN(0xc23db0f7, 0x35fbb42a), TOBN(0xb606bdf6, 0x6ae10040)}}, + {{TOBN(0x1eb15d4d, 0x044573ac), TOBN(0x7dc3cf86, 0x556b0ba4), + 
TOBN(0x97af9a33, 0xc60df6f7), TOBN(0x0b1ef85c, 0xa716ce8c)}, + {TOBN(0x2922f884, 0xc96958be), TOBN(0x7c32fa94, 0x35690963), + TOBN(0x2d7f667c, 0xeaa00061), TOBN(0xeaaf7c17, 0x3547365c)}}, + {{TOBN(0x1eb4de46, 0x87032d58), TOBN(0xc54f3d83, 0x5e2c79e0), + TOBN(0x07818df4, 0x5d04ef23), TOBN(0x55faa9c8, 0x673d41b4)}, + {TOBN(0xced64f6f, 0x89b95355), TOBN(0x4860d2ea, 0xb7415c84), + TOBN(0x5fdb9bd2, 0x050ebad3), TOBN(0xdb53e0cc, 0x6685a5bf)}}, + {{TOBN(0xb830c031, 0x9feb6593), TOBN(0xdd87f310, 0x6accff17), + TOBN(0x2303ebab, 0x9f555c10), TOBN(0x94603695, 0x287e7065)}, + {TOBN(0xf88311c3, 0x2e83358c), TOBN(0x508dd9b4, 0xeefb0178), + TOBN(0x7ca23706, 0x2dba8652), TOBN(0x62aac5a3, 0x0047abe5)}}, + {{TOBN(0x9a61d2a0, 0x8b1ea7b3), TOBN(0xd495ab63, 0xae8b1485), + TOBN(0x38740f84, 0x87052f99), TOBN(0x178ebe5b, 0xb2974eea)}, + {TOBN(0x030bbcca, 0x5b36d17f), TOBN(0xb5e4cce3, 0xaaf86eea), + TOBN(0xb51a0220, 0x68f8e9e0), TOBN(0xa4348796, 0x09eb3e75)}}, + {{TOBN(0xbe592309, 0xeef1a752), TOBN(0x5d7162d7, 0x6f2aa1ed), + TOBN(0xaebfb5ed, 0x0f007dd2), TOBN(0x255e14b2, 0xc89edd22)}, + {TOBN(0xba85e072, 0x0303b697), TOBN(0xc5d17e25, 0xf05720ff), + TOBN(0x02b58d6e, 0x5128ebb6), TOBN(0x2c80242d, 0xd754e113)}}, + {{TOBN(0x919fca5f, 0xabfae1ca), TOBN(0x937afaac, 0x1a21459b), + TOBN(0x9e0ca91c, 0x1f66a4d2), TOBN(0x194cc7f3, 0x23ec1331)}, + {TOBN(0xad25143a, 0x8aa11690), TOBN(0xbe40ad8d, 0x09b59e08), + TOBN(0x37d60d9b, 0xe750860a), TOBN(0x6c53b008, 0xc6bf434c)}}, + {{TOBN(0xb572415d, 0x1356eb80), TOBN(0xb8bf9da3, 0x9578ded8), + TOBN(0x22658e36, 0x5e8fb38b), TOBN(0x9b70ce22, 0x5af8cb22)}, + {TOBN(0x7c00018a, 0x829a8180), TOBN(0x84329f93, 0xb81ed295), + TOBN(0x7c343ea2, 0x5f3cea83), TOBN(0x38f8655f, 0x67586536)}}, + {{TOBN(0xa661a0d0, 0x1d3ec517), TOBN(0x98744652, 0x512321ae), + TOBN(0x084ca591, 0xeca92598), TOBN(0xa9bb9dc9, 0x1dcb3feb)}, + {TOBN(0x14c54355, 0x78b4c240), TOBN(0x5ed62a3b, 0x610cafdc), + TOBN(0x07512f37, 0x1b38846b), TOBN(0x571bb70a, 0xb0e38161)}}, + {{TOBN(0xb556b95b, 0x2da705d2), TOBN(0x3ef8ada6, 0xb1a08f98), + TOBN(0x85302ca7, 0xddecfbe5), TOBN(0x0e530573, 0x943105cd)}, + {TOBN(0x60554d55, 0x21a9255d), TOBN(0x63a32fa1, 0xf2f3802a), + TOBN(0x35c8c5b0, 0xcd477875), TOBN(0x97f458ea, 0x6ad42da1)}}, + {{TOBN(0x832d7080, 0xeb6b242d), TOBN(0xd30bd023, 0x3b71e246), + TOBN(0x7027991b, 0xbe31139d), TOBN(0x68797e91, 0x462e4e53)}, + {TOBN(0x423fe20a, 0x6b4e185a), TOBN(0x82f2c67e, 0x42d9b707), + TOBN(0x25c81768, 0x4cf7811b), TOBN(0xbd53005e, 0x045bb95d)}}}, + {{{TOBN(0xe5f649be, 0x9d8e68fd), TOBN(0xdb0f0533, 0x1b044320), + TOBN(0xf6fde9b3, 0xe0c33398), TOBN(0x92f4209b, 0x66c8cfae)}, + {TOBN(0xe9d1afcc, 0x1a739d4b), TOBN(0x09aea75f, 0xa28ab8de), + TOBN(0x14375fb5, 0xeac6f1d0), TOBN(0x6420b560, 0x708f7aa5)}}, + {{TOBN(0x9eae499c, 0x6254dc41), TOBN(0x7e293924, 0x7a837e7e), + TOBN(0x74aec08c, 0x090524a7), TOBN(0xf82b9219, 0x8d6f55f2)}, + {TOBN(0x493c962e, 0x1402cec5), TOBN(0x9f17ca17, 0xfa2f30e7), + TOBN(0xbcd783e8, 0xe9b879cb), TOBN(0xea3d8c14, 0x5a6f145f)}}, + {{TOBN(0xdede15e7, 0x5e0dee6e), TOBN(0x74f24872, 0xdc628aa2), + TOBN(0xd3e9c4fe, 0x7861bb93), TOBN(0x56d4822a, 0x6187b2e0)}, + {TOBN(0xb66417cf, 0xc59826f9), TOBN(0xca260969, 0x2408169e), + TOBN(0xedf69d06, 0xc79ef885), TOBN(0x00031f8a, 0xdc7d138f)}}, + {{TOBN(0x103c46e6, 0x0ebcf726), TOBN(0x4482b831, 0x6231470e), + TOBN(0x6f6dfaca, 0x487c2109), TOBN(0x2e0ace97, 0x62e666ef)}, + {TOBN(0x3246a9d3, 0x1f8d1f42), TOBN(0x1b1e83f1, 0x574944d2), + TOBN(0x13dfa63a, 0xa57f334b), TOBN(0x0cf8daed, 0x9f025d81)}}, + {{TOBN(0x30d78ea8, 0x00ee11c1), TOBN(0xeb053cd4, 
0xb5e3dd75), + TOBN(0x9b65b13e, 0xd58c43c5), TOBN(0xc3ad49bd, 0xbd151663)}, + {TOBN(0x99fd8e41, 0xb6427990), TOBN(0x12cf15bd, 0x707eae1e), + TOBN(0x29ad4f1b, 0x1aabb71e), TOBN(0x5143e74d, 0x07545d0e)}}, + {{TOBN(0x30266336, 0xc88bdee1), TOBN(0x25f29306, 0x5876767c), + TOBN(0x9c078571, 0xc6731996), TOBN(0xc88690b2, 0xed552951)}, + {TOBN(0x274f2c2d, 0x852705b4), TOBN(0xb0bf8d44, 0x4e09552d), + TOBN(0x7628beeb, 0x986575d1), TOBN(0x407be238, 0x7f864651)}}, + {{TOBN(0x0e5e3049, 0xa639fc6b), TOBN(0xe75c35d9, 0x86003625), + TOBN(0x0cf35bd8, 0x5dcc1646), TOBN(0x8bcaced2, 0x6c26273a)}, + {TOBN(0xe22ecf1d, 0xb5536742), TOBN(0x013dd897, 0x1a9e068b), + TOBN(0x17f411cb, 0x8a7909c5), TOBN(0x5757ac98, 0x861dd506)}}, + {{TOBN(0x85de1f0d, 0x1e935abb), TOBN(0xdefd10b4, 0x154de37a), + TOBN(0xb8d9e392, 0x369cebb5), TOBN(0x54d5ef9b, 0x761324be)}, + {TOBN(0x4d6341ba, 0x74f17e26), TOBN(0xc0a0e3c8, 0x78c1dde4), + TOBN(0xa6d77581, 0x87d918fd), TOBN(0x66876015, 0x02ca3a13)}}, + {{TOBN(0xc7313e9c, 0xf36658f0), TOBN(0xc433ef1c, 0x71f8057e), + TOBN(0x85326246, 0x1b6a835a), TOBN(0xc8f05398, 0x7c86394c)}, + {TOBN(0xff398cdf, 0xe983c4a1), TOBN(0xbf5e8162, 0x03b7b931), + TOBN(0x93193c46, 0xb7b9045b), TOBN(0x1e4ebf5d, 0xa4a6e46b)}}, + {{TOBN(0xf9942a60, 0x43a24fe7), TOBN(0x29c1191e, 0xffb3492b), + TOBN(0x9f662449, 0x902fde05), TOBN(0xc792a7ac, 0x6713c32d)}, + {TOBN(0x2fd88ad8, 0xb737982c), TOBN(0x7e3a0319, 0xa21e60e3), + TOBN(0x09b0de44, 0x7383591a), TOBN(0x6df141ee, 0x8310a456)}}, + {{TOBN(0xaec1a039, 0xe6d6f471), TOBN(0x14b2ba0f, 0x1198d12e), + TOBN(0xebc1a160, 0x3aeee5ac), TOBN(0x401f4836, 0xe0b964ce)}, + {TOBN(0x2ee43796, 0x4fd03f66), TOBN(0x3fdb4e49, 0xdd8f3f12), + TOBN(0x6ef267f6, 0x29380f18), TOBN(0x3e8e9670, 0x8da64d16)}}, + {{TOBN(0xbc19180c, 0x207674f1), TOBN(0x112e09a7, 0x33ae8fdb), + TOBN(0x99667554, 0x6aaeb71e), TOBN(0x79432af1, 0xe101b1c7)}, + {TOBN(0xd5eb558f, 0xde2ddec6), TOBN(0x81392d1f, 0x5357753f), + TOBN(0xa7a76b97, 0x3ae1158a), TOBN(0x416fbbff, 0x4a899991)}}, + {{TOBN(0x9e65fdfd, 0x0d4a9dcf), TOBN(0x7bc29e48, 0x944ddf12), + TOBN(0xbc1a92d9, 0x3c856866), TOBN(0x273c6905, 0x6e98dfe2)}, + {TOBN(0x69fce418, 0xcdfaa6b8), TOBN(0x606bd823, 0x5061c69f), + TOBN(0x42d495a0, 0x6af75e27), TOBN(0x8ed3d505, 0x6d873a1f)}}, + {{TOBN(0xaf552841, 0x6ab25b6a), TOBN(0xc6c0ffc7, 0x2b1a4523), + TOBN(0xab18827b, 0x21c99e03), TOBN(0x060e8648, 0x9034691b)}, + {TOBN(0x5207f90f, 0x93c7f398), TOBN(0x9f4a96cb, 0x82f8d10b), + TOBN(0xdd71cd79, 0x3ad0f9e3), TOBN(0x84f435d2, 0xfc3a54f5)}}, + {{TOBN(0x4b03c55b, 0x8e33787f), TOBN(0xef42f975, 0xa6384673), + TOBN(0xff7304f7, 0x5051b9f0), TOBN(0x18aca1dc, 0x741c87c2)}, + {TOBN(0x56f120a7, 0x2d4bfe80), TOBN(0xfd823b3d, 0x053e732c), + TOBN(0x11bccfe4, 0x7537ca16), TOBN(0xdf6c9c74, 0x1b5a996b)}}, + {{TOBN(0xee7332c7, 0x904fc3fa), TOBN(0x14a23f45, 0xc7e3636a), + TOBN(0xc38659c3, 0xf091d9aa), TOBN(0x4a995e5d, 0xb12d8540)}, + {TOBN(0x20a53bec, 0xf3a5598a), TOBN(0x56534b17, 0xb1eaa995), + TOBN(0x9ed3dca4, 0xbf04e03c), TOBN(0x716c563a, 0xd8d56268)}}, + {{TOBN(0x27ba77a4, 0x1d6178e7), TOBN(0xe4c80c40, 0x68a1ff8e), + TOBN(0x75011099, 0x0a13f63d), TOBN(0x7bf33521, 0xa61d46f3)}, + {TOBN(0x0aff218e, 0x10b365bb), TOBN(0x81021804, 0x0fd7ea75), + TOBN(0x05a3fd8a, 0xa4b3a925), TOBN(0xb829e75f, 0x9b3db4e6)}}, + {{TOBN(0x6bdc75a5, 0x4d53e5fb), TOBN(0x04a5dc02, 0xd52717e3), + TOBN(0x86af502f, 0xe9a42ec2), TOBN(0x8867e8fb, 0x2630e382)}, + {TOBN(0xbf845c6e, 0xbec9889b), TOBN(0x54f491f2, 0xcb47c98d), + TOBN(0xa3091fba, 0x790c2a12), TOBN(0xd7f6fd78, 0xc20f708b)}}, + {{TOBN(0xa569ac30, 0xacde5e17), 
TOBN(0xd0f996d0, 0x6852b4d7), + TOBN(0xe51d4bb5, 0x4609ae54), TOBN(0x3fa37d17, 0x0daed061)}, + {TOBN(0x62a88684, 0x34b8fb41), TOBN(0x99a2acbd, 0x9efb64f1), + TOBN(0xb75c1a5e, 0x6448e1f2), TOBN(0xfa99951a, 0x42b5a069)}}, + {{TOBN(0x6d956e89, 0x2f3b26e7), TOBN(0xf4709860, 0xda875247), + TOBN(0x3ad15179, 0x2482dda3), TOBN(0xd64110e3, 0x017d82f0)}, + {TOBN(0x14928d2c, 0xfad414e4), TOBN(0x2b155f58, 0x2ed02b24), + TOBN(0x481a141b, 0xcb821bf1), TOBN(0x12e3c770, 0x4f81f5da)}}, + {{TOBN(0xe49c5de5, 0x9fff8381), TOBN(0x11053232, 0x5bbec894), + TOBN(0xa0d051cc, 0x454d88c4), TOBN(0x4f6db89c, 0x1f8e531b)}, + {TOBN(0x34fe3fd6, 0xca563a44), TOBN(0x7f5c2215, 0x58da8ab9), + TOBN(0x8445016d, 0x9474f0a1), TOBN(0x17d34d61, 0xcb7d8a0a)}}, + {{TOBN(0x8e9d3910, 0x1c474019), TOBN(0xcaff2629, 0xd52ceefb), + TOBN(0xf9cf3e32, 0xc1622c2b), TOBN(0xd4b95e3c, 0xe9071a05)}, + {TOBN(0xfbbca61f, 0x1594438c), TOBN(0x1eb6e6a6, 0x04aadedf), + TOBN(0x853027f4, 0x68e14940), TOBN(0x221d322a, 0xdfabda9c)}}, + {{TOBN(0xed8ea9f6, 0xb7cb179a), TOBN(0xdc7b764d, 0xb7934dcc), + TOBN(0xfcb13940, 0x5e09180d), TOBN(0x6629a6bf, 0xb47dc2dd)}, + {TOBN(0xbfc55e4e, 0x9f5a915e), TOBN(0xb1db9d37, 0x6204441e), + TOBN(0xf82d68cf, 0x930c5f53), TOBN(0x17d3a142, 0xcbb605b1)}}, + {{TOBN(0xdd5944ea, 0x308780f2), TOBN(0xdc8de761, 0x3845f5e4), + TOBN(0x6beaba7d, 0x7624d7a3), TOBN(0x1e709afd, 0x304df11e)}, + {TOBN(0x95364376, 0x02170456), TOBN(0xbf204b3a, 0xc8f94b64), + TOBN(0x4e53af7c, 0x5680ca68), TOBN(0x0526074a, 0xe0c67574)}}, + {{TOBN(0x95d8cef8, 0xecd92af6), TOBN(0xe6b9fa7a, 0x6cd1745a), + TOBN(0x3d546d3d, 0xa325c3e4), TOBN(0x1f57691d, 0x9ae93aae)}, + {TOBN(0xe891f3fe, 0x9d2e1a33), TOBN(0xd430093f, 0xac063d35), + TOBN(0xeda59b12, 0x5513a327), TOBN(0xdc2134f3, 0x5536f18f)}}, + {{TOBN(0xaa51fe2c, 0x5c210286), TOBN(0x3f68aaee, 0x1cab658c), + TOBN(0x5a23a00b, 0xf9357292), TOBN(0x9a626f39, 0x7efdabed)}, + {TOBN(0xfe2b3bf3, 0x199d78e3), TOBN(0xb7a2af77, 0x71bbc345), + TOBN(0x3d19827a, 0x1e59802c), TOBN(0x823bbc15, 0xb487a51c)}}, + {{TOBN(0x856139f2, 0x99d0a422), TOBN(0x9ac3df65, 0xf456c6fb), + TOBN(0xaddf65c6, 0x701f8bd6), TOBN(0x149f321e, 0x3758df87)}, + {TOBN(0xb1ecf714, 0x721b7eba), TOBN(0xe17df098, 0x31a3312a), + TOBN(0xdb2fd6ec, 0xd5c4d581), TOBN(0xfd02996f, 0x8fcea1b3)}}, + {{TOBN(0xe29fa63e, 0x7882f14f), TOBN(0xc9f6dc35, 0x07c6cadc), + TOBN(0x46f22d6f, 0xb882bed0), TOBN(0x1a45755b, 0xd118e52c)}, + {TOBN(0x9f2c7c27, 0x7c4608cf), TOBN(0x7ccbdf32, 0x568012c2), + TOBN(0xfcb0aedd, 0x61729b0e), TOBN(0x7ca2ca9e, 0xf7d75dbf)}}, + {{TOBN(0xf58fecb1, 0x6f640f62), TOBN(0xe274b92b, 0x39f51946), + TOBN(0x7f4dfc04, 0x6288af44), TOBN(0x0a91f32a, 0xeac329e5)}, + {TOBN(0x43ad274b, 0xd6aaba31), TOBN(0x719a1640, 0x0f6884f9), + TOBN(0x685d29f6, 0xdaf91e20), TOBN(0x5ec1cc33, 0x27e49d52)}}, + {{TOBN(0x38f4de96, 0x3b54a059), TOBN(0x0e0015e5, 0xefbcfdb3), + TOBN(0x177d23d9, 0x4dbb8da6), TOBN(0x98724aa2, 0x97a617ad)}, + {TOBN(0x30f0885b, 0xfdb6558e), TOBN(0xf9f7a28a, 0xc7899a96), + TOBN(0xd2ae8ac8, 0x872dc112), TOBN(0xfa0642ca, 0x73c3c459)}}, + {{TOBN(0x15296981, 0xe7dfc8d6), TOBN(0x67cd4450, 0x1fb5b94a), + TOBN(0x0ec71cf1, 0x0eddfd37), TOBN(0xc7e5eeb3, 0x9a8eddc7)}, + {TOBN(0x02ac8e3d, 0x81d95028), TOBN(0x0088f172, 0x70b0e35d), + TOBN(0xec041fab, 0xe1881fe3), TOBN(0x62cf71b8, 0xd99e7faa)}}, + {{TOBN(0x5043dea7, 0xe0f222c2), TOBN(0x309d42ac, 0x72e65142), + TOBN(0x94fe9ddd, 0x9216cd30), TOBN(0xd6539c7d, 0x0f87feec)}, + {TOBN(0x03c5a57c, 0x432ac7d7), TOBN(0x72692cf0, 0x327fda10), + TOBN(0xec28c85f, 0x280698de), TOBN(0x2331fb46, 0x7ec283b1)}}, + {{TOBN(0xd34bfa32, 
0x2867e633), TOBN(0x78709a82, 0x0a9cc815), + TOBN(0xb7fe6964, 0x875e2fa5), TOBN(0x25cc064f, 0x9e98bfb5)}, + {TOBN(0x9eb0151c, 0x493a65c5), TOBN(0x5fb5d941, 0x53182464), + TOBN(0x69e6f130, 0xf04618e2), TOBN(0xa8ecec22, 0xf89c8ab6)}}, + {{TOBN(0xcd6ac88b, 0xb96209bd), TOBN(0x65fa8cdb, 0xb3e1c9e0), + TOBN(0xa47d22f5, 0x4a8d8eac), TOBN(0x83895cdf, 0x8d33f963)}, + {TOBN(0xa8adca59, 0xb56cd3d1), TOBN(0x10c8350b, 0xdaf38232), + TOBN(0x2b161fb3, 0xa5080a9f), TOBN(0xbe7f5c64, 0x3af65b3a)}}, + {{TOBN(0x2c754039, 0x97403a11), TOBN(0x94626cf7, 0x121b96af), + TOBN(0x431de7c4, 0x6a983ec2), TOBN(0x3780dd3a, 0x52cc3df7)}, + {TOBN(0xe28a0e46, 0x2baf8e3b), TOBN(0xabe68aad, 0x51d299ae), + TOBN(0x603eb8f9, 0x647a2408), TOBN(0x14c61ed6, 0x5c750981)}}, + {{TOBN(0x88b34414, 0xc53352e7), TOBN(0x5a34889c, 0x1337d46e), + TOBN(0x612c1560, 0xf95f2bc8), TOBN(0x8a3f8441, 0xd4807a3a)}, + {TOBN(0x680d9e97, 0x5224da68), TOBN(0x60cd6e88, 0xc3eb00e9), + TOBN(0x3875a98e, 0x9a6bc375), TOBN(0xdc80f924, 0x4fd554c2)}}, + {{TOBN(0x6c4b3415, 0x6ac77407), TOBN(0xa1e5ea8f, 0x25420681), + TOBN(0x541bfa14, 0x4607a458), TOBN(0x5dbc7e7a, 0x96d7fbf9)}, + {TOBN(0x646a851b, 0x31590a47), TOBN(0x039e85ba, 0x15ee6df8), + TOBN(0xd19fa231, 0xd7b43fc0), TOBN(0x84bc8be8, 0x299a0e04)}}, + {{TOBN(0x2b9d2936, 0xf20df03a), TOBN(0x24054382, 0x8608d472), + TOBN(0x76b6ba04, 0x9149202a), TOBN(0xb21c3831, 0x3670e7b7)}, + {TOBN(0xddd93059, 0xd6fdee10), TOBN(0x9da47ad3, 0x78488e71), + TOBN(0x99cc1dfd, 0xa0fcfb25), TOBN(0x42abde10, 0x64696954)}}, + {{TOBN(0x14cc15fc, 0x17eab9fe), TOBN(0xd6e863e4, 0xd3e70972), + TOBN(0x29a7765c, 0x6432112c), TOBN(0x88660001, 0x5b0774d8)}, + {TOBN(0x3729175a, 0x2c088eae), TOBN(0x13afbcae, 0x8230b8d4), + TOBN(0x44768151, 0x915f4379), TOBN(0xf086431a, 0xd8d22812)}}, + {{TOBN(0x37461955, 0xc298b974), TOBN(0x905fb5f0, 0xf8711e04), + TOBN(0x787abf3a, 0xfe969d18), TOBN(0x392167c2, 0x6f6a494e)}, + {TOBN(0xfc7a0d2d, 0x28c511da), TOBN(0xf127c7dc, 0xb66a262d), + TOBN(0xf9c4bb95, 0xfd63fdf0), TOBN(0x90016589, 0x3913ef46)}}, + {{TOBN(0x74d2a73c, 0x11aa600d), TOBN(0x2f5379bd, 0x9fb5ab52), + TOBN(0xe49e53a4, 0x7fb70068), TOBN(0x68dd39e5, 0x404aa9a7)}, + {TOBN(0xb9b0cf57, 0x2ecaa9c3), TOBN(0xba0e103b, 0xe824826b), + TOBN(0x60c2198b, 0x4631a3c4), TOBN(0xc5ff84ab, 0xfa8966a2)}}, + {{TOBN(0x2d6ebe22, 0xac95aff8), TOBN(0x1c9bb6db, 0xb5a46d09), + TOBN(0x419062da, 0x53ee4f8d), TOBN(0x7b9042d0, 0xbb97efef)}, + {TOBN(0x0f87f080, 0x830cf6bd), TOBN(0x4861d19a, 0x6ec8a6c6), + TOBN(0xd3a0daa1, 0x202f01aa), TOBN(0xb0111674, 0xf25afbd5)}}, + {{TOBN(0x6d00d6cf, 0x1afb20d9), TOBN(0x13695000, 0x40671bc5), + TOBN(0x913ab0dc, 0x2485ea9b), TOBN(0x1f2bed06, 0x9eef61ac)}, + {TOBN(0x850c8217, 0x6d799e20), TOBN(0x93415f37, 0x3271c2de), + TOBN(0x5afb06e9, 0x6c4f5910), TOBN(0x688a52df, 0xc4e9e421)}}, + {{TOBN(0x30495ba3, 0xe2a9a6db), TOBN(0x4601303d, 0x58f9268b), + TOBN(0xbe3b0dad, 0x7eb0f04f), TOBN(0x4ea47250, 0x4456936d)}, + {TOBN(0x8caf8798, 0xd33fd3e7), TOBN(0x1ccd8a89, 0xeb433708), + TOBN(0x9effe3e8, 0x87fd50ad), TOBN(0xbe240a56, 0x6b29c4df)}}, + {{TOBN(0xec4ffd98, 0xca0e7ebd), TOBN(0xf586783a, 0xe748616e), + TOBN(0xa5b00d8f, 0xc77baa99), TOBN(0x0acada29, 0xb4f34c9c)}, + {TOBN(0x36dad67d, 0x0fe723ac), TOBN(0x1d8e53a5, 0x39c36c1e), + TOBN(0xe4dd342d, 0x1f4bea41), TOBN(0x64fd5e35, 0xebc9e4e0)}}, + {{TOBN(0x96f01f90, 0x57908805), TOBN(0xb5b9ea3d, 0x5ed480dd), + TOBN(0x366c5dc2, 0x3efd2dd0), TOBN(0xed2fe305, 0x6e9dfa27)}, + {TOBN(0x4575e892, 0x6e9197e2), TOBN(0x11719c09, 0xab502a5d), + TOBN(0x264c7bec, 0xe81f213f), TOBN(0x741b9241, 0x55f5c457)}}, + 
{{TOBN(0x78ac7b68, 0x49a5f4f4), TOBN(0xf91d70a2, 0x9fc45b7d), + TOBN(0x39b05544, 0xb0f5f355), TOBN(0x11f06bce, 0xeef930d9)}, + {TOBN(0xdb84d25d, 0x038d05e1), TOBN(0x04838ee5, 0xbacc1d51), + TOBN(0x9da3ce86, 0x9e8ee00b), TOBN(0xc3412057, 0xc36eda1f)}}, + {{TOBN(0xae80b913, 0x64d9c2f4), TOBN(0x7468bac3, 0xa010a8ff), + TOBN(0xdfd20037, 0x37359d41), TOBN(0x1a0f5ab8, 0x15efeacc)}, + {TOBN(0x7c25ad2f, 0x659d0ce0), TOBN(0x4011bcbb, 0x6785cff1), + TOBN(0x128b9912, 0x7e2192c7), TOBN(0xa549d8e1, 0x13ccb0e8)}}, + {{TOBN(0x805588d8, 0xc85438b1), TOBN(0x5680332d, 0xbc25cb27), + TOBN(0xdcd1bc96, 0x1a4bfdf4), TOBN(0x779ff428, 0x706f6566)}, + {TOBN(0x8bbee998, 0xf059987a), TOBN(0xf6ce8cf2, 0xcc686de7), + TOBN(0xf8ad3c4a, 0x953cfdb2), TOBN(0xd1d426d9, 0x2205da36)}}, + {{TOBN(0xb3c0f13f, 0xc781a241), TOBN(0x3e89360e, 0xd75362a8), + TOBN(0xccd05863, 0xc8a91184), TOBN(0x9bd0c9b7, 0xefa8a7f4)}, + {TOBN(0x97ee4d53, 0x8a912a4b), TOBN(0xde5e15f8, 0xbcf518fd), + TOBN(0x6a055bf8, 0xc467e1e0), TOBN(0x10be4b4b, 0x1587e256)}}, + {{TOBN(0xd90c14f2, 0x668621c9), TOBN(0xd5518f51, 0xab9c92c1), + TOBN(0x8e6a0100, 0xd6d47b3c), TOBN(0xcbe980dd, 0x66716175)}, + {TOBN(0x500d3f10, 0xddd83683), TOBN(0x3b6cb35d, 0x99cac73c), + TOBN(0x53730c8b, 0x6083d550), TOBN(0xcf159767, 0xdf0a1987)}}, + {{TOBN(0x84bfcf53, 0x43ad73b3), TOBN(0x1b528c20, 0x4f035a94), + TOBN(0x4294edf7, 0x33eeac69), TOBN(0xb6283e83, 0x817f3240)}, + {TOBN(0xc3fdc959, 0x0a5f25b1), TOBN(0xefaf8aa5, 0x5844ee22), + TOBN(0xde269ba5, 0xdbdde4de), TOBN(0xe3347160, 0xc56133bf)}}, + {{TOBN(0xc1184219, 0x8d9ea9f8), TOBN(0x090de5db, 0xf3fc1ab5), + TOBN(0x404c37b1, 0x0bf22cda), TOBN(0x7de20ec8, 0xf5618894)}, + {TOBN(0x754c588e, 0xecdaecab), TOBN(0x6ca4b0ed, 0x88342743), + TOBN(0x76f08bdd, 0xf4a938ec), TOBN(0xd182de89, 0x91493ccb)}}, + {{TOBN(0xd652c53e, 0xc8a4186a), TOBN(0xb3e878db, 0x946d8e33), + TOBN(0x088453c0, 0x5f37663c), TOBN(0x5cd9daaa, 0xb407748b)}, + {TOBN(0xa1f5197f, 0x586d5e72), TOBN(0x47500be8, 0xc443ca59), + TOBN(0x78ef35b2, 0xe2652424), TOBN(0x09c5d26f, 0x6dd7767d)}}, + {{TOBN(0x7175a79a, 0xa74d3f7b), TOBN(0x0428fd8d, 0xcf5ea459), + TOBN(0x511cb97c, 0xa5d1746d), TOBN(0x36363939, 0xe71d1278)}, + {TOBN(0xcf2df955, 0x10350bf4), TOBN(0xb3817439, 0x60aae782), + TOBN(0xa748c0e4, 0x3e688809), TOBN(0x98021fbf, 0xd7a5a006)}}, + {{TOBN(0x9076a70c, 0x0e367a98), TOBN(0xbea1bc15, 0x0f62b7c2), + TOBN(0x2645a68c, 0x30fe0343), TOBN(0xacaffa78, 0x699dc14f)}, + {TOBN(0xf4469964, 0x457bf9c4), TOBN(0x0db6407b, 0x0d2ead83), + TOBN(0x68d56cad, 0xb2c6f3eb), TOBN(0x3b512e73, 0xf376356c)}}, + {{TOBN(0xe43b0e1f, 0xfce10408), TOBN(0x89ddc003, 0x5a5e257d), + TOBN(0xb0ae0d12, 0x0362e5b3), TOBN(0x07f983c7, 0xb0519161)}, + {TOBN(0xc2e94d15, 0x5d5231e7), TOBN(0xcff22aed, 0x0b4f9513), + TOBN(0xb02588dd, 0x6ad0b0b5), TOBN(0xb967d1ac, 0x11d0dcd5)}}, + {{TOBN(0x8dac6bc6, 0xcf777b6c), TOBN(0x0062bdbd, 0x4c6d1959), + TOBN(0x53da71b5, 0x0ef5cc85), TOBN(0x07012c7d, 0x4006f14f)}, + {TOBN(0x4617f962, 0xac47800d), TOBN(0x53365f2b, 0xc102ed75), + TOBN(0xb422efcb, 0x4ab8c9d3), TOBN(0x195cb26b, 0x34af31c9)}}, + {{TOBN(0x3a926e29, 0x05f2c4ce), TOBN(0xbd2bdecb, 0x9856966c), + TOBN(0x5d16ab3a, 0x85527015), TOBN(0x9f81609e, 0x4486c231)}, + {TOBN(0xd8b96b2c, 0xda350002), TOBN(0xbd054690, 0xfa1b7d36), + TOBN(0xdc90ebf5, 0xe71d79bc), TOBN(0xf241b6f9, 0x08964e4e)}}, + {{TOBN(0x7c838643, 0x2fe3cd4c), TOBN(0xe0f33acb, 0xb4bc633c), + TOBN(0xb4a9ecec, 0x3d139f1f), TOBN(0x05ce69cd, 0xdc4a1f49)}, + {TOBN(0xa19d1b16, 0xf5f98aaf), TOBN(0x45bb71d6, 0x6f23e0ef), + TOBN(0x33789fcd, 0x46cdfdd3), TOBN(0x9b8e2978, 0xcee040ca)}}, 
+ {{TOBN(0x9c69b246, 0xae0a6828), TOBN(0xba533d24, 0x7078d5aa), + TOBN(0x7a2e42c0, 0x7bb4fbdb), TOBN(0xcfb4879a, 0x7035385c)}, + {TOBN(0x8c3dd30b, 0x3281705b), TOBN(0x7e361c6c, 0x404fe081), + TOBN(0x7b21649c, 0x3f604edf), TOBN(0x5dbf6a3f, 0xe52ffe47)}}, + {{TOBN(0xc41b7c23, 0x4b54d9bf), TOBN(0x1374e681, 0x3511c3d9), + TOBN(0x1863bf16, 0xc1b2b758), TOBN(0x90e78507, 0x1e9e6a96)}, + {TOBN(0xab4bf98d, 0x5d86f174), TOBN(0xd74e0bd3, 0x85e96fe4), + TOBN(0x8afde39f, 0xcac5d344), TOBN(0x90946dbc, 0xbd91b847)}}, + {{TOBN(0xf5b42358, 0xfe1a838c), TOBN(0x05aae6c5, 0x620ac9d8), + TOBN(0x8e193bd8, 0xa1ce5a0b), TOBN(0x8f710571, 0x4dabfd72)}, + {TOBN(0x8d8fdd48, 0x182caaac), TOBN(0x8c4aeefa, 0x040745cf), + TOBN(0x73c6c30a, 0xf3b93e6d), TOBN(0x991241f3, 0x16f42011)}}, + {{TOBN(0xa0158eea, 0xe457a477), TOBN(0xd19857db, 0xee6ddc05), + TOBN(0xb3265224, 0x18c41671), TOBN(0x3ffdfc7e, 0x3c2c0d58)}, + {TOBN(0x3a3a5254, 0x26ee7cda), TOBN(0x341b0869, 0xdf02c3a8), + TOBN(0xa023bf42, 0x723bbfc8), TOBN(0x3d15002a, 0x14452691)}}}, + {{{TOBN(0x5ef7324c, 0x85edfa30), TOBN(0x25976554, 0x87d4f3da), + TOBN(0x352f5bc0, 0xdcb50c86), TOBN(0x8f6927b0, 0x4832a96c)}, + {TOBN(0xd08ee1ba, 0x55f2f94c), TOBN(0x6a996f99, 0x344b45fa), + TOBN(0xe133cb8d, 0xa8aa455d), TOBN(0x5d0721ec, 0x758dc1f7)}}, + {{TOBN(0x6ba7a920, 0x79e5fb67), TOBN(0xe1331feb, 0x70aa725e), + TOBN(0x5080ccf5, 0x7df5d837), TOBN(0xe4cae01d, 0x7ff72e21)}, + {TOBN(0xd9243ee6, 0x0412a77d), TOBN(0x06ff7cac, 0xdf449025), + TOBN(0xbe75f7cd, 0x23ef5a31), TOBN(0xbc957822, 0x0ddef7a8)}}, + {{TOBN(0x8cf7230c, 0xb0ce1c55), TOBN(0x5b534d05, 0x0bbfb607), + TOBN(0xee1ef113, 0x0e16363b), TOBN(0x27e0aa7a, 0xb4999e82)}, + {TOBN(0xce1dac2d, 0x79362c41), TOBN(0x67920c90, 0x91bb6cb0), + TOBN(0x1e648d63, 0x2223df24), TOBN(0x0f7d9eef, 0xe32e8f28)}}, + {{TOBN(0x6943f39a, 0xfa833834), TOBN(0x22951722, 0xa6328562), + TOBN(0x81d63dd5, 0x4170fc10), TOBN(0x9f5fa58f, 0xaecc2e6d)}, + {TOBN(0xb66c8725, 0xe77d9a3b), TOBN(0x11235cea, 0x6384ebe0), + TOBN(0x06a8c118, 0x5845e24a), TOBN(0x0137b286, 0xebd093b1)}}, + {{TOBN(0xc589e1ce, 0x44ace150), TOBN(0xe0f8d3d9, 0x4381e97c), + TOBN(0x59e99b11, 0x62c5a4b8), TOBN(0x90d262f7, 0xfd0ec9f9)}, + {TOBN(0xfbc854c9, 0x283e13c9), TOBN(0x2d04fde7, 0xaedc7085), + TOBN(0x057d7765, 0x47dcbecb), TOBN(0x8dbdf591, 0x9a76fa5f)}}, + {{TOBN(0xd0150695, 0x0de1e578), TOBN(0x2e1463e7, 0xe9f72bc6), + TOBN(0xffa68441, 0x1b39eca5), TOBN(0x673c8530, 0x7c037f2f)}, + {TOBN(0xd0d6a600, 0x747f91da), TOBN(0xb08d43e1, 0xc9cb78e9), + TOBN(0x0fc0c644, 0x27b5cef5), TOBN(0x5c1d160a, 0xa60a2fd6)}}, + {{TOBN(0xf98cae53, 0x28c8e13b), TOBN(0x375f10c4, 0xb2eddcd1), + TOBN(0xd4eb8b7f, 0x5cce06ad), TOBN(0xb4669f45, 0x80a2e1ef)}, + {TOBN(0xd593f9d0, 0x5bbd8699), TOBN(0x5528a4c9, 0xe7976d13), + TOBN(0x3923e095, 0x1c7e28d3), TOBN(0xb9293790, 0x3f6bb577)}}, + {{TOBN(0xdb567d6a, 0xc42bd6d2), TOBN(0x6df86468, 0xbb1f96ae), + TOBN(0x0efe5b1a, 0x4843b28e), TOBN(0x961bbb05, 0x6379b240)}, + {TOBN(0xb6caf5f0, 0x70a6a26b), TOBN(0x70686c0d, 0x328e6e39), + TOBN(0x80da06cf, 0x895fc8d3), TOBN(0x804d8810, 0xb363fdc9)}}, + {{TOBN(0xbe22877b, 0x207f1670), TOBN(0x9b0dd188, 0x4e615291), + TOBN(0x625ae8dc, 0x97a3c2bf), TOBN(0x08584ef7, 0x439b86e8)}, + {TOBN(0xde7190a5, 0xdcd898ff), TOBN(0x26286c40, 0x2058ee3d), + TOBN(0x3db0b217, 0x5f87b1c1), TOBN(0xcc334771, 0x102a6db5)}}, + {{TOBN(0xd99de954, 0x2f770fb1), TOBN(0x97c1c620, 0x4cd7535e), + TOBN(0xd3b6c448, 0x3f09cefc), TOBN(0xd725af15, 0x5a63b4f8)}, + {TOBN(0x0c95d24f, 0xc01e20ec), TOBN(0xdfd37494, 0x9ae7121f), + TOBN(0x7d6ddb72, 0xec77b7ec), TOBN(0xfe079d3b, 
0x0353a4ae)}}, + {{TOBN(0x3066e70a, 0x2e6ac8d2), TOBN(0x9c6b5a43, 0x106e5c05), + TOBN(0x52d3c6f5, 0xede59b8c), TOBN(0x30d6a5c3, 0xfccec9ae)}, + {TOBN(0xedec7c22, 0x4fc0a9ef), TOBN(0x190ff083, 0x95c16ced), + TOBN(0xbe12ec8f, 0x94de0fde), TOBN(0x0d131ab8, 0x852d3433)}}, + {{TOBN(0x42ace07e, 0x85701291), TOBN(0x94793ed9, 0x194061a8), + TOBN(0x30e83ed6, 0xd7f4a485), TOBN(0x9eec7269, 0xf9eeff4d)}, + {TOBN(0x90acba59, 0x0c9d8005), TOBN(0x5feca458, 0x1e79b9d1), + TOBN(0x8fbe5427, 0x1d506a1e), TOBN(0xa32b2c8e, 0x2439cfa7)}}, + {{TOBN(0x1671c173, 0x73dd0b4e), TOBN(0x37a28214, 0x44a054c6), + TOBN(0x81760a1b, 0x4e8b53f1), TOBN(0xa6c04224, 0xf9f93b9e)}, + {TOBN(0x18784b34, 0xcf671e3c), TOBN(0x81bbecd2, 0xcda9b994), + TOBN(0x38831979, 0xb2ab3848), TOBN(0xef54feb7, 0xf2e03c2d)}}, + {{TOBN(0xcf197ca7, 0xfb8088fa), TOBN(0x01427247, 0x4ddc96c5), + TOBN(0xa2d2550a, 0x30777176), TOBN(0x53469898, 0x4d0cf71d)}, + {TOBN(0x6ce937b8, 0x3a2aaac6), TOBN(0xe9f91dc3, 0x5af38d9b), + TOBN(0x2598ad83, 0xc8bf2899), TOBN(0x8e706ac9, 0xb5536c16)}}, + {{TOBN(0x40dc7495, 0xf688dc98), TOBN(0x26490cd7, 0x124c4afc), + TOBN(0xe651ec84, 0x1f18775c), TOBN(0x393ea6c3, 0xb4fdaf4a)}, + {TOBN(0x1e1f3343, 0x7f338e0d), TOBN(0x39fb832b, 0x6053e7b5), + TOBN(0x46e702da, 0x619e14d5), TOBN(0x859cacd1, 0xcdeef6e0)}}, + {{TOBN(0x63b99ce7, 0x4462007d), TOBN(0xb8ab48a5, 0x4cb5f5b7), + TOBN(0x9ec673d2, 0xf55edde7), TOBN(0xd1567f74, 0x8cfaefda)}, + {TOBN(0x46381b6b, 0x0887bcec), TOBN(0x694497ce, 0xe178f3c2), + TOBN(0x5e6525e3, 0x1e6266cb), TOBN(0x5931de26, 0x697d6413)}}, + {{TOBN(0x87f8df7c, 0x0e58d493), TOBN(0xb1ae5ed0, 0x58b73f12), + TOBN(0xc368f784, 0xdea0c34d), TOBN(0x9bd0a120, 0x859a91a0)}, + {TOBN(0xb00d88b7, 0xcc863c68), TOBN(0x3a1cc11e, 0x3d1f4d65), + TOBN(0xea38e0e7, 0x0aa85593), TOBN(0x37f13e98, 0x7dc4aee8)}}, + {{TOBN(0x10d38667, 0xbc947bad), TOBN(0x738e07ce, 0x2a36ee2e), + TOBN(0xc93470cd, 0xc577fcac), TOBN(0xdee1b616, 0x2782470d)}, + {TOBN(0x36a25e67, 0x2e793d12), TOBN(0xd6aa6cae, 0xe0f186da), + TOBN(0x474d0fd9, 0x80e07af7), TOBN(0xf7cdc47d, 0xba8a5cd4)}}, + {{TOBN(0x28af6d9d, 0xab15247f), TOBN(0x7c789c10, 0x493a537f), + TOBN(0x7ac9b110, 0x23a334e7), TOBN(0x0236ac09, 0x12c9c277)}, + {TOBN(0xa7e5bd25, 0x1d7a5144), TOBN(0x098b9c2a, 0xf13ec4ec), + TOBN(0x3639daca, 0xd3f0abca), TOBN(0x642da81a, 0xa23960f9)}}, + {{TOBN(0x7d2e5c05, 0x4f7269b1), TOBN(0xfcf30777, 0xe287c385), + TOBN(0x10edc84f, 0xf2a46f21), TOBN(0x35441757, 0x4f43fa36)}, + {TOBN(0xf1327899, 0xfd703431), TOBN(0xa438d7a6, 0x16dd587a), + TOBN(0x65c34c57, 0xe9c8352d), TOBN(0xa728edab, 0x5cc5a24e)}}, + {{TOBN(0xaed78abc, 0x42531689), TOBN(0x0a51a0e8, 0x010963ef), + TOBN(0x5776fa0a, 0xd717d9b3), TOBN(0xf356c239, 0x7dd3428b)}, + {TOBN(0x29903fff, 0x8d3a3dac), TOBN(0x409597fa, 0x3d94491f), + TOBN(0x4cd7a5ff, 0xbf4a56a4), TOBN(0xe5096474, 0x8adab462)}}, + {{TOBN(0xa97b5126, 0x5c3427b0), TOBN(0x6401405c, 0xd282c9bd), + TOBN(0x3629f8d7, 0x222c5c45), TOBN(0xb1c02c16, 0xe8d50aed)}, + {TOBN(0xbea2ed75, 0xd9635bc9), TOBN(0x226790c7, 0x6e24552f), + TOBN(0x3c33f2a3, 0x65f1d066), TOBN(0x2a43463e, 0x6dfccc2e)}}, + {{TOBN(0x8cc3453a, 0xdb483761), TOBN(0xe7cc6085, 0x65d5672b), + TOBN(0x277ed6cb, 0xde3efc87), TOBN(0x19f2f368, 0x69234eaf)}, + {TOBN(0x9aaf4317, 0x5c0b800b), TOBN(0x1f1e7c89, 0x8b6da6e2), + TOBN(0x6cfb4715, 0xb94ec75e), TOBN(0xd590dd5f, 0x453118c2)}}, + {{TOBN(0x14e49da1, 0x1f17a34c), TOBN(0x5420ab39, 0x235a1456), + TOBN(0xb7637241, 0x2f50363b), TOBN(0x7b15d623, 0xc3fabb6e)}, + {TOBN(0xa0ef40b1, 0xe274e49c), TOBN(0x5cf50744, 0x96b1860a), + TOBN(0xd6583fbf, 0x66afe5a4), 
TOBN(0x44240510, 0xf47e3e9a)}}, + {{TOBN(0x99254343, 0x11b2d595), TOBN(0xf1367499, 0xeec8df57), + TOBN(0x3cb12c61, 0x3e73dd05), TOBN(0xd248c033, 0x7dac102a)}, + {TOBN(0xcf154f13, 0xa77739f5), TOBN(0xbf4288cb, 0x23d2af42), + TOBN(0xaa64c9b6, 0x32e4a1cf), TOBN(0xee8c07a8, 0xc8a208f3)}}, + {{TOBN(0xe10d4999, 0x6fe8393f), TOBN(0x0f809a3f, 0xe91f3a32), + TOBN(0x61096d1c, 0x802f63c8), TOBN(0x289e1462, 0x57750d3d)}, + {TOBN(0xed06167e, 0x9889feea), TOBN(0xd5c9c0e2, 0xe0993909), + TOBN(0x46fca0d8, 0x56508ac6), TOBN(0x91826047, 0x4f1b8e83)}}, + {{TOBN(0x4f2c877a, 0x9a4a2751), TOBN(0x71bd0072, 0xcae6fead), + TOBN(0x38df8dcc, 0x06aa1941), TOBN(0x5a074b4c, 0x63beeaa8)}, + {TOBN(0xd6d65934, 0xc1cec8ed), TOBN(0xa6ecb49e, 0xaabc03bd), + TOBN(0xaade91c2, 0xde8a8415), TOBN(0xcfb0efdf, 0x691136e0)}}, + {{TOBN(0x11af45ee, 0x23ab3495), TOBN(0xa132df88, 0x0b77463d), + TOBN(0x8923c15c, 0x815d06f4), TOBN(0xc3ceb3f5, 0x0d61a436)}, + {TOBN(0xaf52291d, 0xe88fb1da), TOBN(0xea057974, 0x1da12179), + TOBN(0xb0d7218c, 0xd2fef720), TOBN(0x6c0899c9, 0x8e1d8845)}}, + {{TOBN(0x98157504, 0x752ddad7), TOBN(0xd60bd74f, 0xa1a68a97), + TOBN(0x7047a3a9, 0xf658fb99), TOBN(0x1f5d86d6, 0x5f8511e4)}, + {TOBN(0xb8a4bc42, 0x4b5a6d88), TOBN(0x69eb2c33, 0x1abefa7d), + TOBN(0x95bf39e8, 0x13c9c510), TOBN(0xf571960a, 0xd48aab43)}}, + {{TOBN(0x7e8cfbcf, 0x704e23c6), TOBN(0xc71b7d22, 0x28aaa65b), + TOBN(0xa041b2bd, 0x245e3c83), TOBN(0x69b98834, 0xd21854ff)}, + {TOBN(0x89d227a3, 0x963bfeec), TOBN(0x99947aaa, 0xde7da7cb), + TOBN(0x1d9ee9db, 0xee68a9b1), TOBN(0x0a08f003, 0x698ec368)}}, + {{TOBN(0xe9ea4094, 0x78ef2487), TOBN(0xc8d2d415, 0x02cfec26), + TOBN(0xc52f9a6e, 0xb7dcf328), TOBN(0x0ed489e3, 0x85b6a937)}, + {TOBN(0x9b94986b, 0xbef3366e), TOBN(0x0de59c70, 0xedddddb8), + TOBN(0xffdb748c, 0xeadddbe2), TOBN(0x9b9784bb, 0x8266ea40)}}, + {{TOBN(0x142b5502, 0x1a93507a), TOBN(0xb4cd1187, 0x8d3c06cf), + TOBN(0xdf70e76a, 0x91ec3f40), TOBN(0x484e81ad, 0x4e7553c2)}, + {TOBN(0x830f87b5, 0x272e9d6e), TOBN(0xea1c93e5, 0xc6ff514a), + TOBN(0x67cc2adc, 0xc4192a8e), TOBN(0xc77e27e2, 0x42f4535a)}}, + {{TOBN(0x9cdbab36, 0xd2b713c5), TOBN(0x86274ea0, 0xcf7b0cd3), + TOBN(0x784680f3, 0x09af826b), TOBN(0xbfcc837a, 0x0c72dea3)}, + {TOBN(0xa8bdfe9d, 0xd6529b73), TOBN(0x708aa228, 0x63a88002), + TOBN(0x6c7a9a54, 0xc91d45b9), TOBN(0xdf1a38bb, 0xfd004f56)}}, + {{TOBN(0x2e8c9a26, 0xb8bad853), TOBN(0x2d52cea3, 0x3723eae7), + TOBN(0x054d6d81, 0x56ca2830), TOBN(0xa3317d14, 0x9a8dc411)}, + {TOBN(0xa08662fe, 0xfd4ddeda), TOBN(0xed2a153a, 0xb55d792b), + TOBN(0x7035c16a, 0xbfc6e944), TOBN(0xb6bc5834, 0x00171cf3)}}, + {{TOBN(0xe27152b3, 0x83d102b6), TOBN(0xfe695a47, 0x0646b848), + TOBN(0xa5bb09d8, 0x916e6d37), TOBN(0xb4269d64, 0x0d17015e)}, + {TOBN(0x8d8156a1, 0x0a1d2285), TOBN(0xfeef6c51, 0x46d26d72), + TOBN(0x9dac57c8, 0x4c5434a7), TOBN(0x0282e5be, 0x59d39e31)}}, + {{TOBN(0xedfff181, 0x721c486d), TOBN(0x301baf10, 0xbc58824e), + TOBN(0x8136a6aa, 0x00570031), TOBN(0x55aaf78c, 0x1cddde68)}, + {TOBN(0x26829371, 0x59c63952), TOBN(0x3a3bd274, 0x8bc25baf), + TOBN(0xecdf8657, 0xb7e52dc3), TOBN(0x2dd8c087, 0xfd78e6c8)}}, + {{TOBN(0x20553274, 0xf5531461), TOBN(0x8b4a1281, 0x5d95499b), + TOBN(0xe2c8763a, 0x1a80f9d2), TOBN(0xd1dbe32b, 0x4ddec758)}, + {TOBN(0xaf12210d, 0x30c34169), TOBN(0xba74a953, 0x78baa533), + TOBN(0x3d133c6e, 0xa438f254), TOBN(0xa431531a, 0x201bef5b)}}, + {{TOBN(0x15295e22, 0xf669d7ec), TOBN(0xca374f64, 0x357fb515), + TOBN(0x8a8406ff, 0xeaa3fdb3), TOBN(0x106ae448, 0xdf3f2da8)}, + {TOBN(0x8f9b0a90, 0x33c8e9a1), TOBN(0x234645e2, 0x71ad5885), + TOBN(0x3d083224, 
0x1c0aed14), TOBN(0xf10a7d3e, 0x7a942d46)}}, + {{TOBN(0x7c11deee, 0x40d5c9be), TOBN(0xb2bae7ff, 0xba84ed98), + TOBN(0x93e97139, 0xaad58ddd), TOBN(0x3d872796, 0x3f6d1fa3)}, + {TOBN(0x483aca81, 0x8569ff13), TOBN(0x8b89a5fb, 0x9a600f72), + TOBN(0x4cbc27c3, 0xc06f2b86), TOBN(0x22130713, 0x63ad9c0b)}}, + {{TOBN(0xb5358b1e, 0x48ac2840), TOBN(0x18311294, 0xecba9477), + TOBN(0xda58f990, 0xa6946b43), TOBN(0x3098baf9, 0x9ab41819)}, + {TOBN(0x66c4c158, 0x4198da52), TOBN(0xab4fc17c, 0x146bfd1b), + TOBN(0x2f0a4c3c, 0xbf36a908), TOBN(0x2ae9e34b, 0x58cf7838)}}, + {{TOBN(0xf411529e, 0x3fa11b1f), TOBN(0x21e43677, 0x974af2b4), + TOBN(0x7c20958e, 0xc230793b), TOBN(0x710ea885, 0x16e840f3)}, + {TOBN(0xfc0b21fc, 0xc5dc67cf), TOBN(0x08d51647, 0x88405718), + TOBN(0xd955c21f, 0xcfe49eb7), TOBN(0x9722a5d5, 0x56dd4a1f)}}, + {{TOBN(0xc9ef50e2, 0xc861baa5), TOBN(0xc0c21a5d, 0x9505ac3e), + TOBN(0xaf6b9a33, 0x8b7c063f), TOBN(0xc6370339, 0x2f4779c1)}, + {TOBN(0x22df99c7, 0x638167c3), TOBN(0xfe6ffe76, 0x795db30c), + TOBN(0x2b822d33, 0xa4854989), TOBN(0xfef031dd, 0x30563aa5)}}, + {{TOBN(0x16b09f82, 0xd57c667f), TOBN(0xc70312ce, 0xcc0b76f1), + TOBN(0xbf04a9e6, 0xc9118aec), TOBN(0x82fcb419, 0x3409d133)}, + {TOBN(0x1a8ab385, 0xab45d44d), TOBN(0xfba07222, 0x617b83a3), + TOBN(0xb05f50dd, 0x58e81b52), TOBN(0x1d8db553, 0x21ce5aff)}}, + {{TOBN(0x3097b8d4, 0xe344a873), TOBN(0x7d8d116d, 0xfe36d53e), + TOBN(0x6db22f58, 0x7875e750), TOBN(0x2dc5e373, 0x43e144ea)}, + {TOBN(0xc05f32e6, 0xe799eb95), TOBN(0xe9e5f4df, 0x6899e6ec), + TOBN(0xbdc3bd68, 0x1fab23d5), TOBN(0xb72b8ab7, 0x73af60e6)}}, + {{TOBN(0x8db27ae0, 0x2cecc84a), TOBN(0x600016d8, 0x7bdb871c), + TOBN(0x42a44b13, 0xd7c46f58), TOBN(0xb8919727, 0xc3a77d39)}, + {TOBN(0xcfc6bbbd, 0xdafd6088), TOBN(0x1a740146, 0x6bd20d39), + TOBN(0x8c747abd, 0x98c41072), TOBN(0x4c91e765, 0xbdf68ea1)}}, + {{TOBN(0x7c95e5ca, 0x08819a78), TOBN(0xcf48b729, 0xc9587921), + TOBN(0x091c7c5f, 0xdebbcc7d), TOBN(0x6f287404, 0xf0e05149)}, + {TOBN(0xf83b5ac2, 0x26cd44ec), TOBN(0x88ae32a6, 0xcfea250e), + TOBN(0x6ac5047a, 0x1d06ebc5), TOBN(0xc7e550b4, 0xd434f781)}}, + {{TOBN(0x61ab1cf2, 0x5c727bd2), TOBN(0x2e4badb1, 0x1cf915b0), + TOBN(0x1b4dadec, 0xf69d3920), TOBN(0xe61b1ca6, 0xf14c1dfe)}, + {TOBN(0x90b479cc, 0xbd6bd51f), TOBN(0x8024e401, 0x8045ec30), + TOBN(0xcab29ca3, 0x25ef0e62), TOBN(0x4f2e9416, 0x49e4ebc0)}}, + {{TOBN(0x45eb40ec, 0x0ccced58), TOBN(0x25cd4b9c, 0x0da44f98), + TOBN(0x43e06458, 0x871812c6), TOBN(0x99f80d55, 0x16cef651)}, + {TOBN(0x571340c9, 0xce6dc153), TOBN(0x138d5117, 0xd8665521), + TOBN(0xacdb45bc, 0x4e07014d), TOBN(0x2f34bb38, 0x84b60b91)}}, + {{TOBN(0xf44a4fd2, 0x2ae8921e), TOBN(0xb039288e, 0x892ba1e2), + TOBN(0x9da50174, 0xb1c180b2), TOBN(0x6b70ab66, 0x1693dc87)}, + {TOBN(0x7e9babc9, 0xe7057481), TOBN(0x4581ddef, 0x9c80dc41), + TOBN(0x0c890da9, 0x51294682), TOBN(0x0b5629d3, 0x3f4736e5)}}, + {{TOBN(0x2340c79e, 0xb06f5b41), TOBN(0xa42e84ce, 0x4e243469), + TOBN(0xf9a20135, 0x045a71a9), TOBN(0xefbfb415, 0xd27b6fb6)}, + {TOBN(0x25ebea23, 0x9d33cd6f), TOBN(0x9caedb88, 0xaa6c0af8), + TOBN(0x53dc7e9a, 0xd9ce6f96), TOBN(0x3897f9fd, 0x51e0b15a)}}, + {{TOBN(0xf51cb1f8, 0x8e5d788e), TOBN(0x1aec7ba8, 0xe1d490ee), + TOBN(0x265991e0, 0xcc58cb3c), TOBN(0x9f306e8c, 0x9fc3ad31)}, + {TOBN(0x5fed006e, 0x5040a0ac), TOBN(0xca9d5043, 0xfb476f2e), + TOBN(0xa19c06e8, 0xbeea7a23), TOBN(0xd2865801, 0x0edabb63)}}, + {{TOBN(0xdb92293f, 0x6967469a), TOBN(0x2894d839, 0x8d8a8ed8), + TOBN(0x87c9e406, 0xbbc77122), TOBN(0x8671c6f1, 0x2ea3a26a)}, + {TOBN(0xe42df8d6, 0xd7de9853), TOBN(0x2e3ce346, 0xb1f2bcc7), + 
TOBN(0xda601dfc, 0x899d50cf), TOBN(0xbfc913de, 0xfb1b598f)}}, + {{TOBN(0x81c4909f, 0xe61f7908), TOBN(0x192e304f, 0x9bbc7b29), + TOBN(0xc3ed8738, 0xc104b338), TOBN(0xedbe9e47, 0x783f5d61)}, + {TOBN(0x0c06e9be, 0x2db30660), TOBN(0xda3e613f, 0xc0eb7d8e), + TOBN(0xd8fa3e97, 0x322e096e), TOBN(0xfebd91e8, 0xd336e247)}}, + {{TOBN(0x8f13ccc4, 0xdf655a49), TOBN(0xa9e00dfc, 0x5eb20210), + TOBN(0x84631d0f, 0xc656b6ea), TOBN(0x93a058cd, 0xd8c0d947)}, + {TOBN(0x6846904a, 0x67bd3448), TOBN(0x4a3d4e1a, 0xf394fd5c), + TOBN(0xc102c1a5, 0xdb225f52), TOBN(0xe3455bba, 0xfc4f5e9a)}}, + {{TOBN(0x6b36985b, 0x4b9ad1ce), TOBN(0xa9818536, 0x5bb7f793), + TOBN(0x6c25e1d0, 0x48b1a416), TOBN(0x1381dd53, 0x3c81bee7)}, + {TOBN(0xd2a30d61, 0x7a4a7620), TOBN(0xc8412926, 0x39b8944c), + TOBN(0x3c1c6fbe, 0x7a97c33a), TOBN(0x941e541d, 0x938664e7)}}, + {{TOBN(0x417499e8, 0x4a34f239), TOBN(0x15fdb83c, 0xb90402d5), + TOBN(0xb75f46bf, 0x433aa832), TOBN(0xb61e15af, 0x63215db1)}, + {TOBN(0xaabe59d4, 0xa127f89a), TOBN(0x5d541e0c, 0x07e816da), + TOBN(0xaaba0659, 0xa618b692), TOBN(0x55327733, 0x17266026)}}, + {{TOBN(0xaf53a0fc, 0x95f57552), TOBN(0x32947650, 0x6cacb0c9), + TOBN(0x253ff58d, 0xc821be01), TOBN(0xb0309531, 0xa06f1146)}, + {TOBN(0x59bbbdf5, 0x05c2e54d), TOBN(0x158f27ad, 0x26e8dd22), + TOBN(0xcc5b7ffb, 0x397e1e53), TOBN(0xae03f65b, 0x7fc1e50d)}}, + {{TOBN(0xa9784ebd, 0x9c95f0f9), TOBN(0x5ed9deb2, 0x24640771), + TOBN(0x31244af7, 0x035561c4), TOBN(0x87332f3a, 0x7ee857de)}, + {TOBN(0x09e16e9e, 0x2b9e0d88), TOBN(0x52d910f4, 0x56a06049), + TOBN(0x507ed477, 0xa9592f48), TOBN(0x85cb917b, 0x2365d678)}}, + {{TOBN(0xf8511c93, 0x4c8998d1), TOBN(0x2186a3f1, 0x730ea58f), + TOBN(0x50189626, 0xb2029db0), TOBN(0x9137a6d9, 0x02ceb75a)}, + {TOBN(0x2fe17f37, 0x748bc82c), TOBN(0x87c2e931, 0x80469f8c), + TOBN(0x850f71cd, 0xbf891aa2), TOBN(0x0ca1b89b, 0x75ec3d8d)}}, + {{TOBN(0x516c43aa, 0x5e1cd3cd), TOBN(0x89397808, 0x9a887c28), + TOBN(0x0059c699, 0xddea1f9f), TOBN(0x7737d6fa, 0x8e6868f7)}, + {TOBN(0x6d93746a, 0x60f1524b), TOBN(0x36985e55, 0xba052aa7), + TOBN(0x41b1d322, 0xed923ea5), TOBN(0x3429759f, 0x25852a11)}}, + {{TOBN(0xbeca6ec3, 0x092e9f41), TOBN(0x3a238c66, 0x62256bbd), + TOBN(0xd82958ea, 0x70ad487d), TOBN(0x4ac8aaf9, 0x65610d93)}, + {TOBN(0x3fa101b1, 0x5e4ccab0), TOBN(0x9bf430f2, 0x9de14bfb), + TOBN(0xa10f5cc6, 0x6531899d), TOBN(0x590005fb, 0xea8ce17d)}}, + {{TOBN(0xc437912f, 0x24544cb6), TOBN(0x9987b71a, 0xd79ac2e3), + TOBN(0x13e3d9dd, 0xc058a212), TOBN(0x00075aac, 0xd2de9606)}, + {TOBN(0x80ab508b, 0x6cac8369), TOBN(0x87842be7, 0xf54f6c89), + TOBN(0xa7ad663d, 0x6bc532a4), TOBN(0x67813de7, 0x78a91bc8)}}, + {{TOBN(0x5dcb61ce, 0xc3427239), TOBN(0x5f3c7cf0, 0xc56934d9), + TOBN(0xc079e0fb, 0xe3191591), TOBN(0xe40896bd, 0xb01aada7)}, + {TOBN(0x8d466791, 0x0492d25f), TOBN(0x8aeb30c9, 0xe7408276), + TOBN(0xe9437495, 0x9287aacc), TOBN(0x23d4708d, 0x79fe03d4)}}, + {{TOBN(0x8cda9cf2, 0xd0c05199), TOBN(0x502fbc22, 0xfae78454), + TOBN(0xc0bda9df, 0xf572a182), TOBN(0x5f9b71b8, 0x6158b372)}, + {TOBN(0xe0f33a59, 0x2b82dd07), TOBN(0x76302735, 0x9523032e), + TOBN(0x7fe1a721, 0xc4505a32), TOBN(0x7b6e3e82, 0xf796409f)}}}, + {{{TOBN(0xe3417bc0, 0x35d0b34a), TOBN(0x440b386b, 0x8327c0a7), + TOBN(0x8fb7262d, 0xac0362d1), TOBN(0x2c41114c, 0xe0cdf943)}, + {TOBN(0x2ba5cef1, 0xad95a0b1), TOBN(0xc09b37a8, 0x67d54362), + TOBN(0x26d6cdd2, 0x01e486c9), TOBN(0x20477abf, 0x42ff9297)}}, + {{TOBN(0xa004dcb3, 0x292a9287), TOBN(0xddc15cf6, 0x77b092c7), + TOBN(0x083a8464, 0x806c0605), TOBN(0x4a68df70, 0x3db997b0)}, + {TOBN(0x9c134e45, 0x05bf7dd0), TOBN(0xa4e63d39, 
0x8ccf7f8c), + TOBN(0xa6e6517f, 0x41b5f8af), TOBN(0xaa8b9342, 0xad7bc1cc)}}, + {{TOBN(0x126f35b5, 0x1e706ad9), TOBN(0xb99cebb4, 0xc3a9ebdf), + TOBN(0xa75389af, 0xbf608d90), TOBN(0x76113c4f, 0xc6c89858)}, + {TOBN(0x80de8eb0, 0x97e2b5aa), TOBN(0x7e1022cc, 0x63b91304), + TOBN(0x3bdab605, 0x6ccc066c), TOBN(0x33cbb144, 0xb2edf900)}}, + {{TOBN(0xc4176471, 0x7af715d2), TOBN(0xe2f7f594, 0xd0134a96), + TOBN(0x2c1873ef, 0xa41ec956), TOBN(0xe4e7b4f6, 0x77821304)}, + {TOBN(0xe5c8ff97, 0x88d5374a), TOBN(0x2b915e63, 0x80823d5b), + TOBN(0xea6bc755, 0xb2ee8fe2), TOBN(0x6657624c, 0xe7112651)}}, + {{TOBN(0x157af101, 0xdace5aca), TOBN(0xc4fdbcf2, 0x11a6a267), + TOBN(0xdaddf340, 0xc49c8609), TOBN(0x97e49f52, 0xe9604a65)}, + {TOBN(0x9be8e790, 0x937e2ad5), TOBN(0x846e2508, 0x326e17f1), + TOBN(0x3f38007a, 0x0bbbc0dc), TOBN(0xcf03603f, 0xb11e16d6)}}, + {{TOBN(0xd6f800e0, 0x7442f1d5), TOBN(0x475607d1, 0x66e0e3ab), + TOBN(0x82807f16, 0xb7c64047), TOBN(0x8858e1e3, 0xa749883d)}, + {TOBN(0x5859120b, 0x8231ee10), TOBN(0x1b80e7eb, 0x638a1ece), + TOBN(0xcb72525a, 0xc6aa73a4), TOBN(0xa7cdea3d, 0x844423ac)}}, + {{TOBN(0x5ed0c007, 0xf8ae7c38), TOBN(0x6db07a5c, 0x3d740192), + TOBN(0xbe5e9c2a, 0x5fe36db3), TOBN(0xd5b9d57a, 0x76e95046)}, + {TOBN(0x54ac32e7, 0x8eba20f2), TOBN(0xef11ca8f, 0x71b9a352), + TOBN(0x305e373e, 0xff98a658), TOBN(0xffe5a100, 0x823eb667)}}, + {{TOBN(0x57477b11, 0xe51732d2), TOBN(0xdfd6eb28, 0x2538fc0e), + TOBN(0x5c43b0cc, 0x3b39eec5), TOBN(0x6af12778, 0xcb36cc57)}, + {TOBN(0x70b0852d, 0x06c425ae), TOBN(0x6df92f8c, 0x5c221b9b), + TOBN(0x6c8d4f9e, 0xce826d9c), TOBN(0xf59aba7b, 0xb49359c3)}}, + {{TOBN(0x5c8ed8d5, 0xda64309d), TOBN(0x61a6de56, 0x91b30704), + TOBN(0xd6b52f6a, 0x2f9b5808), TOBN(0x0eee4194, 0x98c958a7)}, + {TOBN(0xcddd9aab, 0x771e4caa), TOBN(0x83965dfd, 0x78bc21be), + TOBN(0x02affce3, 0xb3b504f5), TOBN(0x30847a21, 0x561c8291)}}, + {{TOBN(0xd2eb2cf1, 0x52bfda05), TOBN(0xe0e4c4e9, 0x6197b98c), + TOBN(0x1d35076c, 0xf8a1726f), TOBN(0x6c06085b, 0x2db11e3d)}, + {TOBN(0x15c0c4d7, 0x4463ba14), TOBN(0x9d292f83, 0x0030238c), + TOBN(0x1311ee8b, 0x3727536d), TOBN(0xfeea86ef, 0xbeaedc1e)}}, + {{TOBN(0xb9d18cd3, 0x66131e2e), TOBN(0xf31d974f, 0x80fe2682), + TOBN(0xb6e49e0f, 0xe4160289), TOBN(0x7c48ec0b, 0x08e92799)}, + {TOBN(0x818111d8, 0xd1989aa7), TOBN(0xb34fa0aa, 0xebf926f9), + TOBN(0xdb5fe2f5, 0xa245474a), TOBN(0xf80a6ebb, 0x3c7ca756)}}, + {{TOBN(0xa7f96054, 0xafa05dd8), TOBN(0x26dfcf21, 0xfcaf119e), + TOBN(0xe20ef2e3, 0x0564bb59), TOBN(0xef4dca50, 0x61cb02b8)}, + {TOBN(0xcda7838a, 0x65d30672), TOBN(0x8b08d534, 0xfd657e86), + TOBN(0x4c5b4395, 0x46d595c8), TOBN(0x39b58725, 0x425cb836)}}, + {{TOBN(0x8ea61059, 0x3de9abe3), TOBN(0x40434881, 0x9cdc03be), + TOBN(0x9b261245, 0xcfedce8c), TOBN(0x78c318b4, 0xcf5234a1)}, + {TOBN(0x510bcf16, 0xfde24c99), TOBN(0x2a77cb75, 0xa2c2ff5d), + TOBN(0x9c895c2b, 0x27960fb4), TOBN(0xd30ce975, 0xb0eda42b)}}, + {{TOBN(0xfda85393, 0x1a62cc26), TOBN(0x23c69b96, 0x50c0e052), + TOBN(0xa227df15, 0xbfc633f3), TOBN(0x2ac78848, 0x1bae7d48)}, + {TOBN(0x487878f9, 0x187d073d), TOBN(0x6c2be919, 0x967f807d), + TOBN(0x765861d8, 0x336e6d8f), TOBN(0x88b8974c, 0xce528a43)}}, + {{TOBN(0x09521177, 0xff57d051), TOBN(0x2ff38037, 0xfb6a1961), + TOBN(0xfc0aba74, 0xa3d76ad4), TOBN(0x7c764803, 0x25a7ec17)}, + {TOBN(0x7532d75f, 0x48879bc8), TOBN(0xea7eacc0, 0x58ce6bc1), + TOBN(0xc82176b4, 0x8e896c16), TOBN(0x9a30e0b2, 0x2c750fed)}}, + {{TOBN(0xc37e2c2e, 0x421d3aa4), TOBN(0xf926407c, 0xe84fa840), + TOBN(0x18abc03d, 0x1454e41c), TOBN(0x26605ecd, 0x3f7af644)}, + {TOBN(0x242341a6, 0xd6a5eabf), 
TOBN(0x1edb84f4, 0x216b668e), + TOBN(0xd836edb8, 0x04010102), TOBN(0x5b337ce7, 0x945e1d8c)}}, + {{TOBN(0xd2075c77, 0xc055dc14), TOBN(0x2a0ffa25, 0x81d89cdf), + TOBN(0x8ce815ea, 0x6ffdcbaf), TOBN(0xa3428878, 0xfb648867)}, + {TOBN(0x277699cf, 0x884655fb), TOBN(0xfa5b5bd6, 0x364d3e41), + TOBN(0x01f680c6, 0x441e1cb7), TOBN(0x3fd61e66, 0xb70a7d67)}}, + {{TOBN(0x666ba2dc, 0xcc78cf66), TOBN(0xb3018174, 0x6fdbff77), + TOBN(0x8d4dd0db, 0x168d4668), TOBN(0x259455d0, 0x1dab3a2a)}, + {TOBN(0xf58564c5, 0xcde3acec), TOBN(0x77141925, 0x13adb276), + TOBN(0x527d725d, 0x8a303f65), TOBN(0x55deb6c9, 0xe6f38f7b)}}, + {{TOBN(0xfd5bb657, 0xb1fa70fb), TOBN(0xfa07f50f, 0xd8073a00), + TOBN(0xf72e3aa7, 0xbca02500), TOBN(0xf68f895d, 0x9975740d)}, + {TOBN(0x30112060, 0x5cae2a6a), TOBN(0x01bd7218, 0x02874842), + TOBN(0x3d423891, 0x7ce47bd3), TOBN(0xa66663c1, 0x789544f6)}}, + {{TOBN(0x864d05d7, 0x3272d838), TOBN(0xe22924f9, 0xfa6295c5), + TOBN(0x8189593f, 0x6c2fda32), TOBN(0x330d7189, 0xb184b544)}, + {TOBN(0x79efa62c, 0xbde1f714), TOBN(0x35771c94, 0xe5cb1a63), + TOBN(0x2f4826b8, 0x641c8332), TOBN(0x00a894fb, 0xc8cee854)}}, + {{TOBN(0xb4b9a39b, 0x36194d40), TOBN(0xe857a7c5, 0x77612601), + TOBN(0xf4209dd2, 0x4ecf2f58), TOBN(0x82b9e66d, 0x5a033487)}, + {TOBN(0xc1e36934, 0xe4e8b9dd), TOBN(0xd2372c9d, 0xa42377d7), + TOBN(0x51dc94c7, 0x0e3ae43b), TOBN(0x4c57761e, 0x04474f6f)}}, + {{TOBN(0xdcdacd0a, 0x1058a318), TOBN(0x369cf3f5, 0x78053a9a), + TOBN(0xc6c3de50, 0x31c68de2), TOBN(0x4653a576, 0x3c4b6d9f)}, + {TOBN(0x1688dd5a, 0xaa4e5c97), TOBN(0x5be80aa1, 0xb7ab3c74), + TOBN(0x70cefe7c, 0xbc65c283), TOBN(0x57f95f13, 0x06867091)}}, + {{TOBN(0xa39114e2, 0x4415503b), TOBN(0xc08ff7c6, 0x4cbb17e9), + TOBN(0x1eff674d, 0xd7dec966), TOBN(0x6d4690af, 0x53376f63)}, + {TOBN(0xff6fe32e, 0xea74237b), TOBN(0xc436d17e, 0xcd57508e), + TOBN(0x15aa28e1, 0xedcc40fe), TOBN(0x0d769c04, 0x581bbb44)}}, + {{TOBN(0xc240b6de, 0x34eaacda), TOBN(0xd9e116e8, 0x2ba0f1de), + TOBN(0xcbe45ec7, 0x79438e55), TOBN(0x91787c9d, 0x96f752d7)}, + {TOBN(0x897f532b, 0xf129ac2f), TOBN(0xd307b7c8, 0x5a36e22c), + TOBN(0x91940675, 0x749fb8f3), TOBN(0xd14f95d0, 0x157fdb28)}}, + {{TOBN(0xfe51d029, 0x6ae55043), TOBN(0x8931e98f, 0x44a87de1), + TOBN(0xe57f1cc6, 0x09e4fee2), TOBN(0x0d063b67, 0x4e072d92)}, + {TOBN(0x70a998b9, 0xed0e4316), TOBN(0xe74a736b, 0x306aca46), + TOBN(0xecf0fbf2, 0x4fda97c7), TOBN(0xa40f65cb, 0x3e178d93)}}, + {{TOBN(0x16253604, 0x16df4285), TOBN(0xb0c9babb, 0xd0c56ae2), + TOBN(0x73032b19, 0xcfc5cfc3), TOBN(0xe497e5c3, 0x09752056)}, + {TOBN(0x12096bb4, 0x164bda96), TOBN(0x1ee42419, 0xa0b74da1), + TOBN(0x8fc36243, 0x403826ba), TOBN(0x0c8f0069, 0xdc09e660)}}, + {{TOBN(0x8667e981, 0xc27253c9), TOBN(0x05a6aefb, 0x92b36a45), + TOBN(0xa62c4b36, 0x9cb7bb46), TOBN(0x8394f375, 0x11f7027b)}, + {TOBN(0x747bc79c, 0x5f109d0f), TOBN(0xcad88a76, 0x5b8cc60a), + TOBN(0x80c5a66b, 0x58f09e68), TOBN(0xe753d451, 0xf6127eac)}}, + {{TOBN(0xc44b74a1, 0x5b0ec6f5), TOBN(0x47989fe4, 0x5289b2b8), + TOBN(0x745f8484, 0x58d6fc73), TOBN(0xec362a6f, 0xf61c70ab)}, + {TOBN(0x070c98a7, 0xb3a8ad41), TOBN(0x73a20fc0, 0x7b63db51), + TOBN(0xed2c2173, 0xf44c35f4), TOBN(0x8a56149d, 0x9acc9dca)}}, + {{TOBN(0x98f17881, 0x9ac6e0f4), TOBN(0x360fdeaf, 0xa413b5ed), + TOBN(0x0625b8f4, 0xa300b0fd), TOBN(0xf1f4d76a, 0x5b3222d3)}, + {TOBN(0x9d6f5109, 0x587f76b8), TOBN(0x8b4ee08d, 0x2317fdb5), + TOBN(0x88089bb7, 0x8c68b095), TOBN(0x95570e9a, 0x5808d9b9)}}, + {{TOBN(0xa395c36f, 0x35d33ae7), TOBN(0x200ea123, 0x50bb5a94), + TOBN(0x20c789bd, 0x0bafe84b), TOBN(0x243ef52d, 0x0919276a)}, + {TOBN(0x3934c577, 
0xe23ae233), TOBN(0xb93807af, 0xa460d1ec), + TOBN(0xb72a53b1, 0xf8fa76a4), TOBN(0xd8914cb0, 0xc3ca4491)}}, + {{TOBN(0x2e128494, 0x3fb42622), TOBN(0x3b2700ac, 0x500907d5), + TOBN(0xf370fb09, 0x1a95ec63), TOBN(0xf8f30be2, 0x31b6dfbd)}, + {TOBN(0xf2b2f8d2, 0x69e55f15), TOBN(0x1fead851, 0xcc1323e9), + TOBN(0xfa366010, 0xd9e5eef6), TOBN(0x64d487b0, 0xe316107e)}}, + {{TOBN(0x4c076b86, 0xd23ddc82), TOBN(0x03fd344c, 0x7e0143f0), + TOBN(0xa95362ff, 0x317af2c5), TOBN(0x0add3db7, 0xe18b7a4f)}, + {TOBN(0x9c673e3f, 0x8260e01b), TOBN(0xfbeb49e5, 0x54a1cc91), + TOBN(0x91351bf2, 0x92f2e433), TOBN(0xc755e7ec, 0x851141eb)}}, + {{TOBN(0xc9a95139, 0x29607745), TOBN(0x0ca07420, 0xa26f2b28), + TOBN(0xcb2790e7, 0x4bc6f9dd), TOBN(0x345bbb58, 0xadcaffc0)}, + {TOBN(0xc65ea38c, 0xbe0f27a2), TOBN(0x67c24d7c, 0x641fcb56), + TOBN(0x2c25f0a7, 0xa9e2c757), TOBN(0x93f5cdb0, 0x16f16c49)}}, + {{TOBN(0x2ca5a9d7, 0xc5ee30a1), TOBN(0xd1593635, 0xb909b729), + TOBN(0x804ce9f3, 0xdadeff48), TOBN(0xec464751, 0xb07c30c3)}, + {TOBN(0x89d65ff3, 0x9e49af6a), TOBN(0xf2d6238a, 0x6f3d01bc), + TOBN(0x1095561e, 0x0bced843), TOBN(0x51789e12, 0xc8a13fd8)}}, + {{TOBN(0xd633f929, 0x763231df), TOBN(0x46df9f7d, 0xe7cbddef), + TOBN(0x01c889c0, 0xcb265da8), TOBN(0xfce1ad10, 0xaf4336d2)}, + {TOBN(0x8d110df6, 0xfc6a0a7e), TOBN(0xdd431b98, 0x6da425dc), + TOBN(0xcdc4aeab, 0x1834aabe), TOBN(0x84deb124, 0x8439b7fc)}}, + {{TOBN(0x8796f169, 0x3c2a5998), TOBN(0x9b9247b4, 0x7947190d), + TOBN(0x55b9d9a5, 0x11597014), TOBN(0x7e9dd70d, 0x7b1566ee)}, + {TOBN(0x94ad78f7, 0xcbcd5e64), TOBN(0x0359ac17, 0x9bd4c032), + TOBN(0x3b11baaf, 0x7cc222ae), TOBN(0xa6a6e284, 0xba78e812)}}, + {{TOBN(0x8392053f, 0x24cea1a0), TOBN(0xc97bce4a, 0x33621491), + TOBN(0x7eb1db34, 0x35399ee9), TOBN(0x473f78ef, 0xece81ad1)}, + {TOBN(0x41d72fe0, 0xf63d3d0d), TOBN(0xe620b880, 0xafab62fc), + TOBN(0x92096bc9, 0x93158383), TOBN(0x41a21357, 0x8f896f6c)}}, + {{TOBN(0x1b5ee2fa, 0xc7dcfcab), TOBN(0x650acfde, 0x9546e007), + TOBN(0xc081b749, 0xb1b02e07), TOBN(0xda9e41a0, 0xf9eca03d)}, + {TOBN(0x013ba727, 0x175a54ab), TOBN(0xca0cd190, 0xea5d8d10), + TOBN(0x85ea52c0, 0x95fd96a9), TOBN(0x2c591b9f, 0xbc5c3940)}}, + {{TOBN(0x6fb4d4e4, 0x2bad4d5f), TOBN(0xfa4c3590, 0xfef0059b), + TOBN(0x6a10218a, 0xf5122294), TOBN(0x9a78a81a, 0xa85751d1)}, + {TOBN(0x04f20579, 0xa98e84e7), TOBN(0xfe1242c0, 0x4997e5b5), + TOBN(0xe77a273b, 0xca21e1e4), TOBN(0xfcc8b1ef, 0x9411939d)}}, + {{TOBN(0xe20ea302, 0x92d0487a), TOBN(0x1442dbec, 0x294b91fe), + TOBN(0x1f7a4afe, 0xbb6b0e8f), TOBN(0x1700ef74, 0x6889c318)}, + {TOBN(0xf5bbffc3, 0x70f1fc62), TOBN(0x3b31d4b6, 0x69c79cca), + TOBN(0xe8bc2aab, 0xa7f6340d), TOBN(0xb0b08ab4, 0xa725e10a)}}, + {{TOBN(0x44f05701, 0xae340050), TOBN(0xba4b3016, 0x1cf0c569), + TOBN(0x5aa29f83, 0xfbe19a51), TOBN(0x1b9ed428, 0xb71d752e)}, + {TOBN(0x1666e54e, 0xeb4819f5), TOBN(0x616cdfed, 0x9e18b75b), + TOBN(0x112ed5be, 0x3ee27b0b), TOBN(0xfbf28319, 0x44c7de4d)}}, + {{TOBN(0xd685ec85, 0xe0e60d84), TOBN(0x68037e30, 0x1db7ee78), + TOBN(0x5b65bdcd, 0x003c4d6e), TOBN(0x33e7363a, 0x93e29a6a)}, + {TOBN(0x995b3a61, 0x08d0756c), TOBN(0xd727f85c, 0x2faf134b), + TOBN(0xfac6edf7, 0x1d337823), TOBN(0x99b9aa50, 0x0439b8b4)}}, + {{TOBN(0x722eb104, 0xe2b4e075), TOBN(0x49987295, 0x437c4926), + TOBN(0xb1e4c0e4, 0x46a9b82d), TOBN(0xd0cb3197, 0x57a006f5)}, + {TOBN(0xf3de0f7d, 0xd7808c56), TOBN(0xb5c54d8f, 0x51f89772), + TOBN(0x500a114a, 0xadbd31aa), TOBN(0x9afaaaa6, 0x295f6cab)}}, + {{TOBN(0x94705e21, 0x04cf667a), TOBN(0xfc2a811b, 0x9d3935d7), + TOBN(0x560b0280, 0x6d09267c), TOBN(0xf19ed119, 0xf780e53b)}, + 
{TOBN(0xf0227c09, 0x067b6269), TOBN(0x967b8533, 0x5caef599), + TOBN(0x155b9243, 0x68efeebc), TOBN(0xcd6d34f5, 0xc497bae6)}}, + {{TOBN(0x1dd8d5d3, 0x6cceb370), TOBN(0x2aeac579, 0xa78d7bf9), + TOBN(0x5d65017d, 0x70b67a62), TOBN(0x70c8e44f, 0x17c53f67)}, + {TOBN(0xd1fc0950, 0x86a34d09), TOBN(0xe0fca256, 0xe7134907), + TOBN(0xe24fa29c, 0x80fdd315), TOBN(0x2c4acd03, 0xd87499ad)}}, + {{TOBN(0xbaaf7517, 0x3b5a9ba6), TOBN(0xb9cbe1f6, 0x12e51a51), + TOBN(0xd88edae3, 0x5e154897), TOBN(0xe4309c3c, 0x77b66ca0)}, + {TOBN(0xf5555805, 0xf67f3746), TOBN(0x85fc37ba, 0xa36401ff), + TOBN(0xdf86e2ca, 0xd9499a53), TOBN(0x6270b2a3, 0xecbc955b)}}, + {{TOBN(0xafae64f5, 0x974ad33b), TOBN(0x04d85977, 0xfe7b2df1), + TOBN(0x2a3db3ff, 0x4ab03f73), TOBN(0x0b87878a, 0x8702740a)}, + {TOBN(0x6d263f01, 0x5a061732), TOBN(0xc25430ce, 0xa32a1901), + TOBN(0xf7ebab3d, 0xdb155018), TOBN(0x3a86f693, 0x63a9b78e)}}, + {{TOBN(0x349ae368, 0xda9f3804), TOBN(0x470f07fe, 0xa164349c), + TOBN(0xd52f4cc9, 0x8562baa5), TOBN(0xc74a9e86, 0x2b290df3)}, + {TOBN(0xd3a1aa35, 0x43471a24), TOBN(0x239446be, 0xb8194511), + TOBN(0xbec2dd00, 0x81dcd44d), TOBN(0xca3d7f0f, 0xc42ac82d)}}, + {{TOBN(0x1f3db085, 0xfdaf4520), TOBN(0xbb6d3e80, 0x4549daf2), + TOBN(0xf5969d8a, 0x19ad5c42), TOBN(0x7052b13d, 0xdbfd1511)}, + {TOBN(0x11890d1b, 0x682b9060), TOBN(0xa71d3883, 0xac34452c), + TOBN(0xa438055b, 0x783805b4), TOBN(0x43241277, 0x4725b23e)}}, + {{TOBN(0xf20cf96e, 0x4901bbed), TOBN(0x6419c710, 0xf432a2bb), + TOBN(0x57a0fbb9, 0xdfa9cd7d), TOBN(0x589111e4, 0x00daa249)}, + {TOBN(0x19809a33, 0x7b60554e), TOBN(0xea5f8887, 0xede283a4), + TOBN(0x2d713802, 0x503bfd35), TOBN(0x151bb0af, 0x585d2a53)}}, + {{TOBN(0x40b08f74, 0x43b30ca8), TOBN(0xe10b5bba, 0xd9934583), + TOBN(0xe8a546d6, 0xb51110ad), TOBN(0x1dd50e66, 0x28e0b6c5)}, + {TOBN(0x292e9d54, 0xcff2b821), TOBN(0x3882555d, 0x47281760), + TOBN(0x134838f8, 0x3724d6e3), TOBN(0xf2c679e0, 0x22ddcda1)}}, + {{TOBN(0x40ee8815, 0x6d2a5768), TOBN(0x7f227bd2, 0x1c1e7e2d), + TOBN(0x487ba134, 0xd04ff443), TOBN(0x76e2ff3d, 0xc614e54b)}, + {TOBN(0x36b88d6f, 0xa3177ec7), TOBN(0xbf731d51, 0x2328fff5), + TOBN(0x758caea2, 0x49ba158e), TOBN(0x5ab8ff4c, 0x02938188)}}, + {{TOBN(0x33e16056, 0x35edc56d), TOBN(0x5a69d349, 0x7e940d79), + TOBN(0x6c4fd001, 0x03866dcb), TOBN(0x20a38f57, 0x4893cdef)}, + {TOBN(0xfbf3e790, 0xfac3a15b), TOBN(0x6ed7ea2e, 0x7a4f8e6b), + TOBN(0xa663eb4f, 0xbc3aca86), TOBN(0x22061ea5, 0x080d53f7)}}, + {{TOBN(0x2480dfe6, 0xf546783f), TOBN(0xd38bc6da, 0x5a0a641e), + TOBN(0xfb093cd1, 0x2ede8965), TOBN(0x89654db4, 0xacb455cf)}, + {TOBN(0x413cbf9a, 0x26e1adee), TOBN(0x291f3764, 0x373294d4), + TOBN(0x00797257, 0x648083fe), TOBN(0x25f504d3, 0x208cc341)}}, + {{TOBN(0x635a8e5e, 0xc3a0ee43), TOBN(0x70aaebca, 0x679898ff), + TOBN(0x9ee9f547, 0x5dc63d56), TOBN(0xce987966, 0xffb34d00)}, + {TOBN(0xf9f86b19, 0x5e26310a), TOBN(0x9e435484, 0x382a8ca8), + TOBN(0x253bcb81, 0xc2352fe4), TOBN(0xa4eac8b0, 0x4474b571)}}, + {{TOBN(0xc1b97512, 0xc1ad8cf8), TOBN(0x193b4e9e, 0x99e0b697), + TOBN(0x939d2716, 0x01e85df0), TOBN(0x4fb265b3, 0xcd44eafd)}, + {TOBN(0x321e7dcd, 0xe51e1ae2), TOBN(0x8e3a8ca6, 0xe3d8b096), + TOBN(0x8de46cb0, 0x52604998), TOBN(0x91099ad8, 0x39072aa7)}}, + {{TOBN(0x2617f91c, 0x93aa96b8), TOBN(0x0fc8716b, 0x7fca2e13), + TOBN(0xa7106f5e, 0x95328723), TOBN(0xd1c9c40b, 0x262e6522)}, + {TOBN(0xb9bafe86, 0x42b7c094), TOBN(0x1873439d, 0x1543c021), + TOBN(0xe1baa5de, 0x5cbefd5d), TOBN(0xa363fc5e, 0x521e8aff)}}, + {{TOBN(0xefe6320d, 0xf862eaac), TOBN(0x14419c63, 0x22c647dc), + TOBN(0x0e06707c, 0x4e46d428), TOBN(0xcb6c834f, 0x4a178f8f)}, 
+ {TOBN(0x0f993a45, 0xd30f917c), TOBN(0xd4c4b049, 0x9879afee), + TOBN(0xb6142a1e, 0x70500063), TOBN(0x7c9b41c3, 0xa5d9d605)}}, + {{TOBN(0xbc00fc2f, 0x2f8ba2c7), TOBN(0x0966eb2f, 0x7c67aa28), + TOBN(0x13f7b516, 0x5a786972), TOBN(0x3bfb7557, 0x8a2fbba0)}, + {TOBN(0x131c4f23, 0x5a2b9620), TOBN(0xbff3ed27, 0x6faf46be), + TOBN(0x9b4473d1, 0x7e172323), TOBN(0x421e8878, 0x339f6246)}}, + {{TOBN(0x0fa8587a, 0x25a41632), TOBN(0xc0814124, 0xa35b6c93), + TOBN(0x2b18a9f5, 0x59ebb8db), TOBN(0x264e3357, 0x76edb29c)}, + {TOBN(0xaf245ccd, 0xc87c51e2), TOBN(0x16b3015b, 0x501e6214), + TOBN(0xbb31c560, 0x0a3882ce), TOBN(0x6961bb94, 0xfec11e04)}}, + {{TOBN(0x3b825b8d, 0xeff7a3a0), TOBN(0xbec33738, 0xb1df7326), + TOBN(0x68ad747c, 0x99604a1f), TOBN(0xd154c934, 0x9a3bd499)}, + {TOBN(0xac33506f, 0x1cc7a906), TOBN(0x73bb5392, 0x6c560e8f), + TOBN(0x6428fcbe, 0x263e3944), TOBN(0xc11828d5, 0x1c387434)}}, + {{TOBN(0x3cd04be1, 0x3e4b12ff), TOBN(0xc3aad9f9, 0x2d88667c), + TOBN(0xc52ddcf8, 0x248120cf), TOBN(0x985a892e, 0x2a389532)}, + {TOBN(0xfbb4b21b, 0x3bb85fa0), TOBN(0xf95375e0, 0x8dfc6269), + TOBN(0xfb4fb06c, 0x7ee2acea), TOBN(0x6785426e, 0x309c4d1f)}}, + {{TOBN(0x659b17c8, 0xd8ceb147), TOBN(0x9b649eee, 0xb70a5554), + TOBN(0x6b7fa0b5, 0xac6bc634), TOBN(0xd99fe2c7, 0x1d6e732f)}, + {TOBN(0x30e6e762, 0x8d3abba2), TOBN(0x18fee6e7, 0xa797b799), + TOBN(0x5c9d360d, 0xc696464d), TOBN(0xe3baeb48, 0x27bfde12)}}, + {{TOBN(0x2bf5db47, 0xf23206d5), TOBN(0x2f6d3420, 0x1d260152), + TOBN(0x17b87653, 0x3f8ff89a), TOBN(0x5157c30c, 0x378fa458)}, + {TOBN(0x7517c5c5, 0x2d4fb936), TOBN(0xef22f7ac, 0xe6518cdc), + TOBN(0xdeb483e6, 0xbf847a64), TOBN(0xf5084558, 0x92e0fa89)}}}, + {{{TOBN(0xab9659d8, 0xdf7304d4), TOBN(0xb71bcf1b, 0xff210e8e), + TOBN(0xa9a2438b, 0xd73fbd60), TOBN(0x4595cd1f, 0x5d11b4de)}, + {TOBN(0x9c0d329a, 0x4835859d), TOBN(0x4a0f0d2d, 0x7dbb6e56), + TOBN(0xc6038e5e, 0xdf928a4e), TOBN(0xc9429621, 0x8f5ad154)}}, + {{TOBN(0x91213462, 0xf23f2d92), TOBN(0x6cab71bd, 0x60b94078), + TOBN(0x6bdd0a63, 0x176cde20), TOBN(0x54c9b20c, 0xee4d54bc)}, + {TOBN(0x3cd2d8aa, 0x9f2ac02f), TOBN(0x03f8e617, 0x206eedb0), + TOBN(0xc7f68e16, 0x93086434), TOBN(0x831469c5, 0x92dd3db9)}}, + {{TOBN(0x8521df24, 0x8f981354), TOBN(0x587e23ec, 0x3588a259), + TOBN(0xcbedf281, 0xd7a0992c), TOBN(0x06930a55, 0x38961407)}, + {TOBN(0x09320deb, 0xbe5bbe21), TOBN(0xa7ffa5b5, 0x2491817f), + TOBN(0xe6c8b4d9, 0x09065160), TOBN(0xac4f3992, 0xfff6d2a9)}}, + {{TOBN(0x7aa7a158, 0x3ae9c1bd), TOBN(0xe0af6d98, 0xe37ce240), + TOBN(0xe54342d9, 0x28ab38b4), TOBN(0xe8b75007, 0x0a1c98ca)}, + {TOBN(0xefce86af, 0xe02358f2), TOBN(0x31b8b856, 0xea921228), + TOBN(0x052a1912, 0x0a1c67fc), TOBN(0xb4069ea4, 0xe3aead59)}}, + {{TOBN(0x3232d6e2, 0x7fa03cb3), TOBN(0xdb938e5b, 0x0fdd7d88), + TOBN(0x04c1d2cd, 0x2ccbfc5d), TOBN(0xd2f45c12, 0xaf3a580f)}, + {TOBN(0x592620b5, 0x7883e614), TOBN(0x5fd27e68, 0xbe7c5f26), + TOBN(0x139e45a9, 0x1567e1e3), TOBN(0x2cc71d2d, 0x44d8aaaf)}}, + {{TOBN(0x4a9090cd, 0xe36d0757), TOBN(0xf722d7b1, 0xd9a29382), + TOBN(0xfb7fb04c, 0x04b48ddf), TOBN(0x628ad2a7, 0xebe16f43)}, + {TOBN(0xcd3fbfb5, 0x20226040), TOBN(0x6c34ecb1, 0x5104b6c4), + TOBN(0x30c0754e, 0xc903c188), TOBN(0xec336b08, 0x2d23cab0)}}, + {{TOBN(0x473d62a2, 0x1e206ee5), TOBN(0xf1e27480, 0x8c49a633), + TOBN(0x87ab956c, 0xe9f6b2c3), TOBN(0x61830b48, 0x62b606ea)}, + {TOBN(0x67cd6846, 0xe78e815f), TOBN(0xfe40139f, 0x4c02082a), + TOBN(0x52bbbfcb, 0x952ec365), TOBN(0x74c11642, 0x6b9836ab)}}, + {{TOBN(0x9f51439e, 0x558df019), TOBN(0x230da4ba, 0xac712b27), + TOBN(0x518919e3, 0x55185a24), TOBN(0x4dcefcdd, 
0x84b78f50)}, + {TOBN(0xa7d90fb2, 0xa47d4c5a), TOBN(0x55ac9abf, 0xb30e009e), + TOBN(0xfd2fc359, 0x74eed273), TOBN(0xb72d824c, 0xdbea8faf)}}, + {{TOBN(0xce721a74, 0x4513e2ca), TOBN(0x0b418612, 0x38240b2c), + TOBN(0x05199968, 0xd5baa450), TOBN(0xeb1757ed, 0x2b0e8c25)}, + {TOBN(0x6ebc3e28, 0x3dfac6d5), TOBN(0xb2431e2e, 0x48a237f5), + TOBN(0x2acb5e23, 0x52f61499), TOBN(0x5558a2a7, 0xe06c936b)}}, + {{TOBN(0xd213f923, 0xcbb13d1b), TOBN(0x98799f42, 0x5bfb9bfe), + TOBN(0x1ae8ddc9, 0x701144a9), TOBN(0x0b8b3bb6, 0x4c5595ee)}, + {TOBN(0x0ea9ef2e, 0x3ecebb21), TOBN(0x17cb6c4b, 0x3671f9a7), + TOBN(0x47ef464f, 0x726f1d1f), TOBN(0x171b9484, 0x6943a276)}}, + {{TOBN(0x51a4ae2d, 0x7ef0329c), TOBN(0x08509222, 0x91c4402a), + TOBN(0x64a61d35, 0xafd45bbc), TOBN(0x38f096fe, 0x3035a851)}, + {TOBN(0xc7468b74, 0xa1dec027), TOBN(0xe8cf10e7, 0x4fc7dcba), + TOBN(0xea35ff40, 0xf4a06353), TOBN(0x0b4c0dfa, 0x8b77dd66)}}, + {{TOBN(0x779b8552, 0xde7e5c19), TOBN(0xfab28609, 0xc1c0256c), + TOBN(0x64f58eee, 0xabd4743d), TOBN(0x4e8ef838, 0x7b6cc93b)}, + {TOBN(0xee650d26, 0x4cb1bf3d), TOBN(0x4c1f9d09, 0x73dedf61), + TOBN(0xaef7c9d7, 0xbfb70ced), TOBN(0x1ec0507e, 0x1641de1e)}}, + {{TOBN(0xcd7e5cc7, 0xcde45079), TOBN(0xde173c9a, 0x516ac9e4), + TOBN(0x517a8494, 0xc170315c), TOBN(0x438fd905, 0x91d8e8fb)}, + {TOBN(0x5145c506, 0xc7d9630b), TOBN(0x6457a87b, 0xf47d4d75), + TOBN(0xd31646bf, 0x0d9a80e8), TOBN(0x453add2b, 0xcef3aabe)}}, + {{TOBN(0xc9941109, 0xa607419d), TOBN(0xfaa71e62, 0xbb6bca80), + TOBN(0x34158c13, 0x07c431f3), TOBN(0x594abebc, 0x992bc47a)}, + {TOBN(0x6dfea691, 0xeb78399f), TOBN(0x48aafb35, 0x3f42cba4), + TOBN(0xedcd65af, 0x077c04f0), TOBN(0x1a29a366, 0xe884491a)}}, + {{TOBN(0x023a40e5, 0x1c21f2bf), TOBN(0xf99a513c, 0xa5057aee), + TOBN(0xa3fe7e25, 0xbcab072e), TOBN(0x8568d2e1, 0x40e32bcf)}, + {TOBN(0x904594eb, 0xd3f69d9f), TOBN(0x181a9733, 0x07affab1), + TOBN(0xe4d68d76, 0xb6e330f4), TOBN(0x87a6dafb, 0xc75a7fc1)}}, + {{TOBN(0x549db2b5, 0xef7d9289), TOBN(0x2480d4a8, 0x197f015a), + TOBN(0x61d5590b, 0xc40493b6), TOBN(0x3a55b52e, 0x6f780331)}, + {TOBN(0x40eb8115, 0x309eadb0), TOBN(0xdea7de5a, 0x92e5c625), + TOBN(0x64d631f0, 0xcc6a3d5a), TOBN(0x9d5e9d7c, 0x93e8dd61)}}, + {{TOBN(0xf297bef5, 0x206d3ffc), TOBN(0x23d5e033, 0x7d808bd4), + TOBN(0x4a4f6912, 0xd24cf5ba), TOBN(0xe4d8163b, 0x09cdaa8a)}, + {TOBN(0x0e0de9ef, 0xd3082e8e), TOBN(0x4fe1246c, 0x0192f360), + TOBN(0x1f900150, 0x4b8eee0a), TOBN(0x5219da81, 0xf1da391b)}}, + {{TOBN(0x7bf6a5c1, 0xf7ea25aa), TOBN(0xd165e6bf, 0xfbb07d5f), + TOBN(0xe3539361, 0x89e78671), TOBN(0xa3fcac89, 0x2bac4219)}, + {TOBN(0xdfab6fd4, 0xf0baa8ab), TOBN(0x5a4adac1, 0xe2c1c2e5), + TOBN(0x6cd75e31, 0x40d85849), TOBN(0xce263fea, 0x19b39181)}}, + {{TOBN(0xcb6803d3, 0x07032c72), TOBN(0x7f40d5ce, 0x790968c8), + TOBN(0xa6de86bd, 0xdce978f0), TOBN(0x25547c4f, 0x368f751c)}, + {TOBN(0xb1e685fd, 0x65fb2a9e), TOBN(0xce69336f, 0x1eb9179c), + TOBN(0xb15d1c27, 0x12504442), TOBN(0xb7df465c, 0xb911a06b)}}, + {{TOBN(0xb8d804a3, 0x315980cd), TOBN(0x693bc492, 0xfa3bebf7), + TOBN(0x3578aeee, 0x2253c504), TOBN(0x158de498, 0xcd2474a2)}, + {TOBN(0x1331f5c7, 0xcfda8368), TOBN(0xd2d7bbb3, 0x78d7177e), + TOBN(0xdf61133a, 0xf3c1e46e), TOBN(0x5836ce7d, 0xd30e7be8)}}, + {{TOBN(0x83084f19, 0x94f834cb), TOBN(0xd35653d4, 0x429ed782), + TOBN(0xa542f16f, 0x59e58243), TOBN(0xc2b52f65, 0x0470a22d)}, + {TOBN(0xe3b6221b, 0x18f23d96), TOBN(0xcb05abac, 0x3f5252b4), + TOBN(0xca00938b, 0x87d61402), TOBN(0x2f186cdd, 0x411933e4)}}, + {{TOBN(0xe042ece5, 0x9a29a5c5), TOBN(0xb19b3c07, 0x3b6c8402), + TOBN(0xc97667c7, 0x19d92684), 
TOBN(0xb5624622, 0xebc66372)}, + {TOBN(0x0cb96e65, 0x3c04fa02), TOBN(0x83a7176c, 0x8eaa39aa), + TOBN(0x2033561d, 0xeaa1633f), TOBN(0x45a9d086, 0x4533df73)}}, + {{TOBN(0xe0542c1d, 0x3dc090bc), TOBN(0x82c996ef, 0xaa59c167), + TOBN(0xe3f735e8, 0x0ee7fc4d), TOBN(0x7b179393, 0x7c35db79)}, + {TOBN(0xb6419e25, 0xf8c5dbfd), TOBN(0x4d9d7a1e, 0x1f327b04), + TOBN(0x979f6f9b, 0x298dfca8), TOBN(0xc7c5dff1, 0x8de9366a)}}, + {{TOBN(0x1b7a588d, 0x04c82bdd), TOBN(0x68005534, 0xf8319dfd), + TOBN(0xde8a55b5, 0xd8eb9580), TOBN(0x5ea886da, 0x8d5bca81)}, + {TOBN(0xe8530a01, 0x252a0b4d), TOBN(0x1bffb4fe, 0x35eaa0a1), + TOBN(0x2ad828b1, 0xd8e99563), TOBN(0x7de96ef5, 0x95f9cd87)}}, + {{TOBN(0x4abb2d0c, 0xd77d970c), TOBN(0x03cfb933, 0xd33ef9cb), + TOBN(0xb0547c01, 0x8b211fe9), TOBN(0x2fe64809, 0xa56ed1c6)}, + {TOBN(0xcb7d5624, 0xc2ac98cc), TOBN(0x2a1372c0, 0x1a393e33), + TOBN(0xc8d1ec1c, 0x29660521), TOBN(0xf3d31b04, 0xb37ac3e9)}}, + {{TOBN(0xa29ae9df, 0x5ece6e7c), TOBN(0x0603ac8f, 0x0facfb55), + TOBN(0xcfe85b7a, 0xdda233a5), TOBN(0xe618919f, 0xbd75f0b8)}, + {TOBN(0xf555a3d2, 0x99bf1603), TOBN(0x1f43afc9, 0xf184255a), + TOBN(0xdcdaf341, 0x319a3e02), TOBN(0xd3b117ef, 0x03903a39)}}, + {{TOBN(0xe095da13, 0x65d1d131), TOBN(0x86f16367, 0xc37ad03e), + TOBN(0x5f37389e, 0x462cd8dd), TOBN(0xc103fa04, 0xd67a60e6)}, + {TOBN(0x57c34344, 0xf4b478f0), TOBN(0xce91edd8, 0xe117c98d), + TOBN(0x001777b0, 0x231fc12e), TOBN(0x11ae47f2, 0xb207bccb)}}, + {{TOBN(0xd983cf8d, 0x20f8a242), TOBN(0x7aff5b1d, 0xf22e1ad8), + TOBN(0x68fd11d0, 0x7fc4feb3), TOBN(0x5d53ae90, 0xb0f1c3e1)}, + {TOBN(0x50fb7905, 0xec041803), TOBN(0x85e3c977, 0x14404888), + TOBN(0x0e67faed, 0xac628d8f), TOBN(0x2e865150, 0x6668532c)}}, + {{TOBN(0x15acaaa4, 0x6a67a6b0), TOBN(0xf4cdee25, 0xb25cec41), + TOBN(0x49ee565a, 0xe4c6701e), TOBN(0x2a04ca66, 0xfc7d63d8)}, + {TOBN(0xeb105018, 0xef0543fb), TOBN(0xf709a4f5, 0xd1b0d81d), + TOBN(0x5b906ee6, 0x2915d333), TOBN(0xf4a87412, 0x96f1f0ab)}}, + {{TOBN(0xb6b82fa7, 0x4d82f4c2), TOBN(0x90725a60, 0x6804efb3), + TOBN(0xbc82ec46, 0xadc3425e), TOBN(0xb7b80581, 0x2787843e)}, + {TOBN(0xdf46d91c, 0xdd1fc74c), TOBN(0xdc1c62cb, 0xe783a6c4), + TOBN(0x59d1b9f3, 0x1a04cbba), TOBN(0xd87f6f72, 0x95e40764)}}, + {{TOBN(0x02b4cfc1, 0x317f4a76), TOBN(0x8d2703eb, 0x91036bce), + TOBN(0x98206cc6, 0xa5e72a56), TOBN(0x57be9ed1, 0xcf53fb0f)}, + {TOBN(0x09374571, 0xef0b17ac), TOBN(0x74b2655e, 0xd9181b38), + TOBN(0xc8f80ea8, 0x89935d0e), TOBN(0xc0d9e942, 0x91529936)}}, + {{TOBN(0x19686041, 0x1e84e0e5), TOBN(0xa5db84d3, 0xaea34c93), + TOBN(0xf9d5bb19, 0x7073a732), TOBN(0xb8d2fe56, 0x6bcfd7c0)}, + {TOBN(0x45775f36, 0xf3eb82fa), TOBN(0x8cb20ccc, 0xfdff8b58), + TOBN(0x1659b65f, 0x8374c110), TOBN(0xb8b4a422, 0x330c789a)}}, + {{TOBN(0x75e3c3ea, 0x6fe8208b), TOBN(0xbd74b9e4, 0x286e78fe), + TOBN(0x0be2e81b, 0xd7d93a1a), TOBN(0x7ed06e27, 0xdd0a5aae)}, + {TOBN(0x721f5a58, 0x6be8b800), TOBN(0x428299d1, 0xd846db28), + TOBN(0x95cb8e6b, 0x5be88ed3), TOBN(0xc3186b23, 0x1c034e11)}}, + {{TOBN(0xa6312c9e, 0x8977d99b), TOBN(0xbe944331, 0x83f531e7), + TOBN(0x8232c0c2, 0x18d3b1d4), TOBN(0x617aae8b, 0xe1247b73)}, + {TOBN(0x40153fc4, 0x282aec3b), TOBN(0xc6063d2f, 0xf7b8f823), + TOBN(0x68f10e58, 0x3304f94c), TOBN(0x31efae74, 0xee676346)}}, + {{TOBN(0xbadb6c6d, 0x40a9b97c), TOBN(0x14702c63, 0x4f666256), + TOBN(0xdeb954f1, 0x5184b2e3), TOBN(0x5184a526, 0x94b6ca40)}, + {TOBN(0xfff05337, 0x003c32ea), TOBN(0x5aa374dd, 0x205974c7), + TOBN(0x9a763854, 0x4b0dd71a), TOBN(0x459cd27f, 0xdeb947ec)}}, + {{TOBN(0xa6e28161, 0x459c2b92), TOBN(0x2f020fa8, 0x75ee8ef5), + TOBN(0xb132ec2d, 
0x30b06310), TOBN(0xc3e15899, 0xbc6a4530)}, + {TOBN(0xdc5f53fe, 0xaa3f451a), TOBN(0x3a3c7f23, 0xc2d9acac), + TOBN(0x2ec2f892, 0x6b27e58b), TOBN(0x68466ee7, 0xd742799f)}}, + {{TOBN(0x98324dd4, 0x1fa26613), TOBN(0xa2dc6dab, 0xbdc29d63), + TOBN(0xf9675faa, 0xd712d657), TOBN(0x813994be, 0x21fd8d15)}, + {TOBN(0x5ccbb722, 0xfd4f7553), TOBN(0x5135ff8b, 0xf3a36b20), + TOBN(0x44be28af, 0x69559df5), TOBN(0x40b65bed, 0x9d41bf30)}}, + {{TOBN(0xd98bf2a4, 0x3734e520), TOBN(0x5e3abbe3, 0x209bdcba), + TOBN(0x77c76553, 0xbc945b35), TOBN(0x5331c093, 0xc6ef14aa)}, + {TOBN(0x518ffe29, 0x76b60c80), TOBN(0x2285593b, 0x7ace16f8), + TOBN(0xab1f64cc, 0xbe2b9784), TOBN(0xe8f2c0d9, 0xab2421b6)}}, + {{TOBN(0x617d7174, 0xc1df065c), TOBN(0xafeeb5ab, 0x5f6578fa), + TOBN(0x16ff1329, 0x263b54a8), TOBN(0x45c55808, 0xc990dce3)}, + {TOBN(0x42eab6c0, 0xecc8c177), TOBN(0x799ea9b5, 0x5982ecaa), + TOBN(0xf65da244, 0xb607ef8e), TOBN(0x8ab226ce, 0x32a3fc2c)}}, + {{TOBN(0x745741e5, 0x7ea973dc), TOBN(0x5c00ca70, 0x20888f2e), + TOBN(0x7cdce3cf, 0x45fd9cf1), TOBN(0x8a741ef1, 0x5507f872)}, + {TOBN(0x47c51c2f, 0x196b4cec), TOBN(0x70d08e43, 0xc97ea618), + TOBN(0x930da15c, 0x15b18a2b), TOBN(0x33b6c678, 0x2f610514)}}, + {{TOBN(0xc662e4f8, 0x07ac9794), TOBN(0x1eccf050, 0xba06cb79), + TOBN(0x1ff08623, 0xe7d954e5), TOBN(0x6ef2c5fb, 0x24cf71c3)}, + {TOBN(0xb2c063d2, 0x67978453), TOBN(0xa0cf3796, 0x1d654af8), + TOBN(0x7cb242ea, 0x7ebdaa37), TOBN(0x206e0b10, 0xb86747e0)}}, + {{TOBN(0x481dae5f, 0xd5ecfefc), TOBN(0x07084fd8, 0xc2bff8fc), + TOBN(0x8040a01a, 0xea324596), TOBN(0x4c646980, 0xd4de4036)}, + {TOBN(0x9eb8ab4e, 0xd65abfc3), TOBN(0xe01cb91f, 0x13541ec7), + TOBN(0x8f029adb, 0xfd695012), TOBN(0x9ae28483, 0x3c7569ec)}}, + {{TOBN(0xa5614c9e, 0xa66d80a1), TOBN(0x680a3e44, 0x75f5f911), + TOBN(0x0c07b14d, 0xceba4fc1), TOBN(0x891c285b, 0xa13071c1)}, + {TOBN(0xcac67ceb, 0x799ece3c), TOBN(0x29b910a9, 0x41e07e27), + TOBN(0x66bdb409, 0xf2e43123), TOBN(0x06f8b137, 0x7ac9ecbe)}}, + {{TOBN(0x5981fafd, 0x38547090), TOBN(0x19ab8b9f, 0x85e3415d), + TOBN(0xfc28c194, 0xc7e31b27), TOBN(0x843be0aa, 0x6fbcbb42)}, + {TOBN(0xf3b1ed43, 0xa6db836c), TOBN(0x2a1330e4, 0x01a45c05), + TOBN(0x4f19f3c5, 0x95c1a377), TOBN(0xa85f39d0, 0x44b5ee33)}}, + {{TOBN(0x3da18e6d, 0x4ae52834), TOBN(0x5a403b39, 0x7423dcb0), + TOBN(0xbb555e0a, 0xf2374aef), TOBN(0x2ad599c4, 0x1e8ca111)}, + {TOBN(0x1b3a2fb9, 0x014b3bf8), TOBN(0x73092684, 0xf66d5007), + TOBN(0x079f1426, 0xc4340102), TOBN(0x1827cf81, 0x8fddf4de)}}, + {{TOBN(0xc83605f6, 0xf10ff927), TOBN(0xd3871451, 0x23739fc6), + TOBN(0x6d163450, 0xcac1c2cc), TOBN(0x6b521296, 0xa2ec1ac5)}, + {TOBN(0x0606c4f9, 0x6e3cb4a5), TOBN(0xe47d3f41, 0x778abff7), + TOBN(0x425a8d5e, 0xbe8e3a45), TOBN(0x53ea9e97, 0xa6102160)}}, + {{TOBN(0x477a106e, 0x39cbb688), TOBN(0x532401d2, 0xf3386d32), + TOBN(0x8e564f64, 0xb1b9b421), TOBN(0xca9b8388, 0x81dad33f)}, + {TOBN(0xb1422b4e, 0x2093913e), TOBN(0x533d2f92, 0x69bc8112), + TOBN(0x3fa017be, 0xebe7b2c7), TOBN(0xb2767c4a, 0xcaf197c6)}}, + {{TOBN(0xc925ff87, 0xaedbae9f), TOBN(0x7daf0eb9, 0x36880a54), + TOBN(0x9284ddf5, 0x9c4d0e71), TOBN(0x1581cf93, 0x316f8cf5)}, + {TOBN(0x3eeca887, 0x3ac1f452), TOBN(0xb417fce9, 0xfb6aeffe), + TOBN(0xa5918046, 0xeefb8dc3), TOBN(0x73d318ac, 0x02209400)}}, + {{TOBN(0xe800400f, 0x728693e5), TOBN(0xe87d814b, 0x339927ed), + TOBN(0x93e94d3b, 0x57ea9910), TOBN(0xff8a35b6, 0x2245fb69)}, + {TOBN(0x043853d7, 0x7f200d34), TOBN(0x470f1e68, 0x0f653ce1), + TOBN(0x81ac05bd, 0x59a06379), TOBN(0xa14052c2, 0x03930c29)}}, + {{TOBN(0x6b72fab5, 0x26bc2797), TOBN(0x13670d16, 0x99f16771), + 
TOBN(0x00170052, 0x1e3e48d1), TOBN(0x978fe401, 0xb7adf678)}, + {TOBN(0x55ecfb92, 0xd41c5dd4), TOBN(0x5ff8e247, 0xc7b27da5), + TOBN(0xe7518272, 0x013fb606), TOBN(0x5768d7e5, 0x2f547a3c)}}, + {{TOBN(0xbb24eaa3, 0x60017a5f), TOBN(0x6b18e6e4, 0x9c64ce9b), + TOBN(0xc225c655, 0x103dde07), TOBN(0xfc3672ae, 0x7592f7ea)}, + {TOBN(0x9606ad77, 0xd06283a1), TOBN(0x542fc650, 0xe4d59d99), + TOBN(0xabb57c49, 0x2a40e7c2), TOBN(0xac948f13, 0xa8db9f55)}}, + {{TOBN(0x6d4c9682, 0xb04465c3), TOBN(0xe3d062fa, 0x6468bd15), + TOBN(0xa51729ac, 0x5f318d7e), TOBN(0x1fc87df6, 0x9eb6fc95)}, + {TOBN(0x63d146a8, 0x0591f652), TOBN(0xa861b8f7, 0x589621aa), + TOBN(0x59f5f15a, 0xce31348c), TOBN(0x8f663391, 0x440da6da)}}, + {{TOBN(0xcfa778ac, 0xb591ffa3), TOBN(0x027ca9c5, 0x4cdfebce), + TOBN(0xbe8e05a5, 0x444ea6b3), TOBN(0x8aab4e69, 0xa78d8254)}, + {TOBN(0x2437f04f, 0xb474d6b8), TOBN(0x6597ffd4, 0x045b3855), + TOBN(0xbb0aea4e, 0xca47ecaa), TOBN(0x568aae83, 0x85c7ebfc)}}, + {{TOBN(0x0e966e64, 0xc73b2383), TOBN(0x49eb3447, 0xd17d8762), + TOBN(0xde107821, 0x8da05dab), TOBN(0x443d8baa, 0x016b7236)}, + {TOBN(0x163b63a5, 0xea7610d6), TOBN(0xe47e4185, 0xce1ca979), + TOBN(0xae648b65, 0x80baa132), TOBN(0xebf53de2, 0x0e0d5b64)}}, + {{TOBN(0x8d3bfcb4, 0xd3c8c1ca), TOBN(0x0d914ef3, 0x5d04b309), + TOBN(0x55ef6415, 0x3de7d395), TOBN(0xbde1666f, 0x26b850e8)}, + {TOBN(0xdbe1ca6e, 0xd449ab19), TOBN(0x8902b322, 0xe89a2672), + TOBN(0xb1674b7e, 0xdacb7a53), TOBN(0x8e9faf6e, 0xf52523ff)}}, + {{TOBN(0x6ba535da, 0x9a85788b), TOBN(0xd21f03ae, 0xbd0626d4), + TOBN(0x099f8c47, 0xe873dc64), TOBN(0xcda8564d, 0x018ec97e)}, + {TOBN(0x3e8d7a5c, 0xde92c68c), TOBN(0x78e035a1, 0x73323cc4), + TOBN(0x3ef26275, 0xf880ff7c), TOBN(0xa4ee3dff, 0x273eedaa)}}, + {{TOBN(0x58823507, 0xaf4e18f8), TOBN(0x967ec9b5, 0x0672f328), + TOBN(0x9ded19d9, 0x559d3186), TOBN(0x5e2ab3de, 0x6cdce39c)}, + {TOBN(0xabad6e4d, 0x11c226df), TOBN(0xf9783f43, 0x87723014), + TOBN(0x9a49a0cf, 0x1a885719), TOBN(0xfc0c1a5a, 0x90da9dbf)}}, + {{TOBN(0x8bbaec49, 0x571d92ac), TOBN(0x569e85fe, 0x4692517f), + TOBN(0x8333b014, 0xa14ea4af), TOBN(0x32f2a62f, 0x12e5c5ad)}, + {TOBN(0x98c2ce3a, 0x06d89b85), TOBN(0xb90741aa, 0x2ff77a08), + TOBN(0x2530defc, 0x01f795a2), TOBN(0xd6e5ba0b, 0x84b3c199)}}, + {{TOBN(0x7d8e8451, 0x12e4c936), TOBN(0xae419f7d, 0xbd0be17b), + TOBN(0xa583fc8c, 0x22262bc9), TOBN(0x6b842ac7, 0x91bfe2bd)}, + {TOBN(0x33cef4e9, 0x440d6827), TOBN(0x5f69f4de, 0xef81fb14), + TOBN(0xf16cf6f6, 0x234fbb92), TOBN(0x76ae3fc3, 0xd9e7e158)}}, + {{TOBN(0x4e89f6c2, 0xe9740b33), TOBN(0x677bc85d, 0x4962d6a1), + TOBN(0x6c6d8a7f, 0x68d10d15), TOBN(0x5f9a7224, 0x0257b1cd)}, + {TOBN(0x7096b916, 0x4ad85961), TOBN(0x5f8c47f7, 0xe657ab4a), + TOBN(0xde57d7d0, 0xf7461d7e), TOBN(0x7eb6094d, 0x80ce5ee2)}}, + {{TOBN(0x0b1e1dfd, 0x34190547), TOBN(0x8a394f43, 0xf05dd150), + TOBN(0x0a9eb24d, 0x97df44e6), TOBN(0x78ca06bf, 0x87675719)}, + {TOBN(0x6f0b3462, 0x6ffeec22), TOBN(0x9d91bcea, 0x36cdd8fb), + TOBN(0xac83363c, 0xa105be47), TOBN(0x81ba76c1, 0x069710e3)}}, + {{TOBN(0x3d1b24cb, 0x28c682c6), TOBN(0x27f25228, 0x8612575b), + TOBN(0xb587c779, 0xe8e66e98), TOBN(0x7b0c03e9, 0x405eb1fe)}, + {TOBN(0xfdf0d030, 0x15b548e7), TOBN(0xa8be76e0, 0x38b36af7), + TOBN(0x4cdab04a, 0x4f310c40), TOBN(0x6287223e, 0xf47ecaec)}}, + {{TOBN(0x678e6055, 0x8b399320), TOBN(0x61fe3fa6, 0xc01e4646), + TOBN(0xc482866b, 0x03261a5e), TOBN(0xdfcf45b8, 0x5c2f244a)}, + {TOBN(0x8fab9a51, 0x2f684b43), TOBN(0xf796c654, 0xc7220a66), + TOBN(0x1d90707e, 0xf5afa58f), TOBN(0x2c421d97, 0x4fdbe0de)}}, + {{TOBN(0xc4f4cda3, 0xaf2ebc2f), TOBN(0xa0af843d, 0xcb4efe24), 
+ TOBN(0x53b857c1, 0x9ccd10b1), TOBN(0xddc9d1eb, 0x914d3e04)}, + {TOBN(0x7bdec8bb, 0x62771deb), TOBN(0x829277aa, 0x91c5aa81), + TOBN(0x7af18dd6, 0x832391ae), TOBN(0x1740f316, 0xc71a84ca)}}}, + {{{TOBN(0x8928e99a, 0xeeaf8c49), TOBN(0xee7aa73d, 0x6e24d728), + TOBN(0x4c5007c2, 0xe72b156c), TOBN(0x5fcf57c5, 0xed408a1d)}, + {TOBN(0x9f719e39, 0xb6057604), TOBN(0x7d343c01, 0xc2868bbf), + TOBN(0x2cca254b, 0x7e103e2d), TOBN(0xe6eb38a9, 0xf131bea2)}}, + {{TOBN(0xb33e624f, 0x8be762b4), TOBN(0x2a9ee4d1, 0x058e3413), + TOBN(0x968e6369, 0x67d805fa), TOBN(0x9848949b, 0x7db8bfd7)}, + {TOBN(0x5308d7e5, 0xd23a8417), TOBN(0x892f3b1d, 0xf3e29da5), + TOBN(0xc95c139e, 0x3dee471f), TOBN(0x8631594d, 0xd757e089)}}, + {{TOBN(0xe0c82a3c, 0xde918dcc), TOBN(0x2e7b5994, 0x26fdcf4b), + TOBN(0x82c50249, 0x32cb1b2d), TOBN(0xea613a9d, 0x7657ae07)}, + {TOBN(0xc2eb5f6c, 0xf1fdc9f7), TOBN(0xb6eae8b8, 0x879fe682), + TOBN(0x253dfee0, 0x591cbc7f), TOBN(0x000da713, 0x3e1290e6)}}, + {{TOBN(0x1083e2ea, 0x1f095615), TOBN(0x0a28ad77, 0x14e68c33), + TOBN(0x6bfc0252, 0x3d8818be), TOBN(0xb585113a, 0xf35850cd)}, + {TOBN(0x7d935f0b, 0x30df8aa1), TOBN(0xaddda07c, 0x4ab7e3ac), + TOBN(0x92c34299, 0x552f00cb), TOBN(0xc33ed1de, 0x2909df6c)}}, + {{TOBN(0x22c2195d, 0x80e87766), TOBN(0x9e99e6d8, 0x9ddf4ac0), + TOBN(0x09642e4e, 0x65e74934), TOBN(0x2610ffa2, 0xff1ff241)}, + {TOBN(0x4d1d47d4, 0x751c8159), TOBN(0x697b4985, 0xaf3a9363), + TOBN(0x0318ca46, 0x87477c33), TOBN(0xa90cb565, 0x9441eff3)}}, + {{TOBN(0x58bb3848, 0x36f024cb), TOBN(0x85be1f77, 0x36016168), + TOBN(0x6c59587c, 0xdc7e07f1), TOBN(0x191be071, 0xaf1d8f02)}, + {TOBN(0xbf169fa5, 0xcca5e55c), TOBN(0x3864ba3c, 0xf7d04eac), + TOBN(0x915e367f, 0x8d7d05db), TOBN(0xb48a876d, 0xa6549e5d)}}, + {{TOBN(0xef89c656, 0x580e40a2), TOBN(0xf194ed8c, 0x728068bc), + TOBN(0x74528045, 0xa47990c9), TOBN(0xf53fc7d7, 0x5e1a4649)}, + {TOBN(0xbec5ae9b, 0x78593e7d), TOBN(0x2cac4ee3, 0x41db65d7), + TOBN(0xa8c1eb24, 0x04a3d39b), TOBN(0x53b7d634, 0x03f8f3ef)}}, + {{TOBN(0x2dc40d48, 0x3e07113c), TOBN(0x6e4a5d39, 0x7d8b63ae), + TOBN(0x5582a94b, 0x79684c2b), TOBN(0x932b33d4, 0x622da26c)}, + {TOBN(0xf534f651, 0x0dbbf08d), TOBN(0x211d07c9, 0x64c23a52), + TOBN(0x0eeece0f, 0xee5bdc9b), TOBN(0xdf178168, 0xf7015558)}}, + {{TOBN(0xd4294635, 0x0a712229), TOBN(0x93cbe448, 0x09273f8c), + TOBN(0x00b095ef, 0x8f13bc83), TOBN(0xbb741972, 0x8798978c)}, + {TOBN(0x9d7309a2, 0x56dbe6e7), TOBN(0xe578ec56, 0x5a5d39ec), + TOBN(0x3961151b, 0x851f9a31), TOBN(0x2da7715d, 0xe5709eb4)}}, + {{TOBN(0x867f3017, 0x53dfabf0), TOBN(0x728d2078, 0xb8e39259), + TOBN(0x5c75a0cd, 0x815d9958), TOBN(0xf84867a6, 0x16603be1)}, + {TOBN(0xc865b13d, 0x70e35b1c), TOBN(0x02414468, 0x19b03e2c), + TOBN(0xe46041da, 0xac1f3121), TOBN(0x7c9017ad, 0x6f028a7c)}}, + {{TOBN(0xabc96de9, 0x0a482873), TOBN(0x4265d6b1, 0xb77e54d4), + TOBN(0x68c38e79, 0xa57d88e7), TOBN(0xd461d766, 0x9ce82de3)}, + {TOBN(0x817a9ec5, 0x64a7e489), TOBN(0xcc5675cd, 0xa0def5f2), + TOBN(0x9a00e785, 0x985d494e), TOBN(0xc626833f, 0x1b03514a)}}, + {{TOBN(0xabe7905a, 0x83cdd60e), TOBN(0x50602fb5, 0xa1170184), + TOBN(0x689886cd, 0xb023642a), TOBN(0xd568d090, 0xa6e1fb00)}, + {TOBN(0x5b1922c7, 0x0259217f), TOBN(0x93831cd9, 0xc43141e4), + TOBN(0xdfca3587, 0x0c95f86e), TOBN(0xdec2057a, 0x568ae828)}}, + {{TOBN(0xc44ea599, 0xf98a759a), TOBN(0x55a0a7a2, 0xf7c23c1d), + TOBN(0xd5ffb6e6, 0x94c4f687), TOBN(0x3563cce2, 0x12848478)}, + {TOBN(0x812b3517, 0xe7b1fbe1), TOBN(0x8a7dc979, 0x4f7338e0), + TOBN(0x211ecee9, 0x52d048db), TOBN(0x2eea4056, 0xc86ea3b8)}}, + {{TOBN(0xd8cb68a7, 0xba772b34), TOBN(0xe16ed341, 
0x5f4e2541), + TOBN(0x9b32f6a6, 0x0fec14db), TOBN(0xeee376f7, 0x391698be)}, + {TOBN(0xe9a7aa17, 0x83674c02), TOBN(0x65832f97, 0x5843022a), + TOBN(0x29f3a8da, 0x5ba4990f), TOBN(0x79a59c3a, 0xfb8e3216)}}, + {{TOBN(0x9cdc4d2e, 0xbd19bb16), TOBN(0xc6c7cfd0, 0xb3262d86), + TOBN(0xd4ce14d0, 0x969c0b47), TOBN(0x1fa352b7, 0x13e56128)}, + {TOBN(0x383d55b8, 0x973db6d3), TOBN(0x71836850, 0xe8e5b7bf), + TOBN(0xc7714596, 0xe6bb571f), TOBN(0x259df31f, 0x2d5b2dd2)}}, + {{TOBN(0x568f8925, 0x913cc16d), TOBN(0x18bc5b6d, 0xe1a26f5a), + TOBN(0xdfa413be, 0xf5f499ae), TOBN(0xf8835dec, 0xc3f0ae84)}, + {TOBN(0xb6e60bd8, 0x65a40ab0), TOBN(0x65596439, 0x194b377e), + TOBN(0xbcd85625, 0x92084a69), TOBN(0x5ce433b9, 0x4f23ede0)}}, + {{TOBN(0xe8e8f04f, 0x6ad65143), TOBN(0x11511827, 0xd6e14af6), + TOBN(0x3d390a10, 0x8295c0c7), TOBN(0x71e29ee4, 0x621eba16)}, + {TOBN(0xa588fc09, 0x63717b46), TOBN(0x02be02fe, 0xe06ad4a2), + TOBN(0x931558c6, 0x04c22b22), TOBN(0xbb4d4bd6, 0x12f3c849)}}, + {{TOBN(0x54a4f496, 0x20efd662), TOBN(0x92ba6d20, 0xc5952d14), + TOBN(0x2db8ea1e, 0xcc9784c2), TOBN(0x81cc10ca, 0x4b353644)}, + {TOBN(0x40b570ad, 0x4b4d7f6c), TOBN(0x5c9f1d96, 0x84a1dcd2), + TOBN(0x01379f81, 0x3147e797), TOBN(0xe5c6097b, 0x2bd499f5)}}, + {{TOBN(0x40dcafa6, 0x328e5e20), TOBN(0xf7b5244a, 0x54815550), + TOBN(0xb9a4f118, 0x47bfc978), TOBN(0x0ea0e79f, 0xd25825b1)}, + {TOBN(0xa50f96eb, 0x646c7ecf), TOBN(0xeb811493, 0x446dea9d), + TOBN(0x2af04677, 0xdfabcf69), TOBN(0xbe3a068f, 0xc713f6e8)}}, + {{TOBN(0x860d523d, 0x42e06189), TOBN(0xbf077941, 0x4e3aff13), + TOBN(0x0b616dca, 0xc1b20650), TOBN(0xe66dd6d1, 0x2131300d)}, + {TOBN(0xd4a0fd67, 0xff99abde), TOBN(0xc9903550, 0xc7aac50d), + TOBN(0x022ecf8b, 0x7c46b2d7), TOBN(0x3333b1e8, 0x3abf92af)}}, + {{TOBN(0x11cc113c, 0x6c491c14), TOBN(0x05976688, 0x80dd3f88), + TOBN(0xf5b4d9e7, 0x29d932ed), TOBN(0xe982aad8, 0xa2c38b6d)}, + {TOBN(0x6f925347, 0x8be0dcf0), TOBN(0x700080ae, 0x65ca53f2), + TOBN(0xd8131156, 0x443ca77f), TOBN(0xe92d6942, 0xec51f984)}}, + {{TOBN(0xd2a08af8, 0x85dfe9ae), TOBN(0xd825d9a5, 0x4d2a86ca), + TOBN(0x2c53988d, 0x39dff020), TOBN(0xf38b135a, 0x430cdc40)}, + {TOBN(0x0c918ae0, 0x62a7150b), TOBN(0xf31fd8de, 0x0c340e9b), + TOBN(0xafa0e7ae, 0x4dbbf02e), TOBN(0x5847fb2a, 0x5eba6239)}}, + {{TOBN(0x6b1647dc, 0xdccbac8b), TOBN(0xb642aa78, 0x06f485c8), + TOBN(0x873f3765, 0x7038ecdf), TOBN(0x2ce5e865, 0xfa49d3fe)}, + {TOBN(0xea223788, 0xc98c4400), TOBN(0x8104a8cd, 0xf1fa5279), + TOBN(0xbcf7cc7a, 0x06becfd7), TOBN(0x49424316, 0xc8f974ae)}}, + {{TOBN(0xc0da65e7, 0x84d6365d), TOBN(0xbcb7443f, 0x8f759fb8), + TOBN(0x35c712b1, 0x7ae81930), TOBN(0x80428dff, 0x4c6e08ab)}, + {TOBN(0xf19dafef, 0xa4faf843), TOBN(0xced8538d, 0xffa9855f), + TOBN(0x20ac409c, 0xbe3ac7ce), TOBN(0x358c1fb6, 0x882da71e)}}, + {{TOBN(0xafa9c0e5, 0xfd349961), TOBN(0x2b2cfa51, 0x8421c2fc), + TOBN(0x2a80db17, 0xf3a28d38), TOBN(0xa8aba539, 0x5d138e7e)}, + {TOBN(0x52012d1d, 0x6e96eb8d), TOBN(0x65d8dea0, 0xcbaf9622), + TOBN(0x57735447, 0xb264f56c), TOBN(0xbeebef3f, 0x1b6c8da2)}}, + {{TOBN(0xfc346d98, 0xce785254), TOBN(0xd50e8d72, 0xbb64a161), + TOBN(0xc03567c7, 0x49794add), TOBN(0x15a76065, 0x752c7ef6)}, + {TOBN(0x59f3a222, 0x961f23d6), TOBN(0x378e4438, 0x73ecc0b0), + TOBN(0xc74be434, 0x5a82fde4), TOBN(0xae509af2, 0xd8b9cf34)}}, + {{TOBN(0x4a61ee46, 0x577f44a1), TOBN(0xe09b748c, 0xb611deeb), + TOBN(0xc0481b2c, 0xf5f7b884), TOBN(0x35626678, 0x61acfa6b)}, + {TOBN(0x37f4c518, 0xbf8d21e6), TOBN(0x22d96531, 0xb205a76d), + TOBN(0x37fb85e1, 0x954073c0), TOBN(0xbceafe4f, 0x65b3a567)}}, + {{TOBN(0xefecdef7, 0xbe42a582), 
TOBN(0xd3fc6080, 0x65046be6), + TOBN(0xc9af13c8, 0x09e8dba9), TOBN(0x1e6c9847, 0x641491ff)}, + {TOBN(0x3b574925, 0xd30c31f7), TOBN(0xb7eb72ba, 0xac2a2122), + TOBN(0x776a0dac, 0xef0859e7), TOBN(0x06fec314, 0x21900942)}}, + {{TOBN(0x2464bc10, 0xf8c22049), TOBN(0x9bfbcce7, 0x875ebf69), + TOBN(0xd7a88e2a, 0x4336326b), TOBN(0xda05261c, 0x5bc2acfa)}, + {TOBN(0xc29f5bdc, 0xeba7efc8), TOBN(0x471237ca, 0x25dbbf2e), + TOBN(0xa72773f2, 0x2975f127), TOBN(0xdc744e8e, 0x04d0b326)}}, + {{TOBN(0x38a7ed16, 0xa56edb73), TOBN(0x64357e37, 0x2c007e70), + TOBN(0xa167d15b, 0x5080b400), TOBN(0x07b41164, 0x23de4be1)}, + {TOBN(0xb2d91e32, 0x74c89883), TOBN(0x3c162821, 0x2882e7ed), + TOBN(0xad6b36ba, 0x7503e482), TOBN(0x48434e8e, 0x0ea34331)}}, + {{TOBN(0x79f4f24f, 0x2c7ae0b9), TOBN(0xc46fbf81, 0x1939b44a), + TOBN(0x76fefae8, 0x56595eb1), TOBN(0x417b66ab, 0xcd5f29c7)}, + {TOBN(0x5f2332b2, 0xc5ceec20), TOBN(0xd69661ff, 0xe1a1cae2), + TOBN(0x5ede7e52, 0x9b0286e6), TOBN(0x9d062529, 0xe276b993)}}, + {{TOBN(0x324794b0, 0x7e50122b), TOBN(0xdd744f8b, 0x4af07ca5), + TOBN(0x30a12f08, 0xd63fc97b), TOBN(0x39650f1a, 0x76626d9d)}, + {TOBN(0x101b47f7, 0x1fa38477), TOBN(0x3d815f19, 0xd4dc124f), + TOBN(0x1569ae95, 0xb26eb58a), TOBN(0xc3cde188, 0x95fb1887)}}, + {{TOBN(0x54e9f37b, 0xf9539a48), TOBN(0xb0100e06, 0x7408c1a5), + TOBN(0x821d9811, 0xea580cbb), TOBN(0x8af52d35, 0x86e50c56)}, + {TOBN(0xdfbd9d47, 0xdbbf698b), TOBN(0x2961a1ea, 0x03dc1c73), + TOBN(0x203d38f8, 0xe76a5df8), TOBN(0x08a53a68, 0x6def707a)}}, + {{TOBN(0x26eefb48, 0x1bee45d4), TOBN(0xb3cee346, 0x3c688036), + TOBN(0x463c5315, 0xc42f2469), TOBN(0x19d84d2e, 0x81378162)}, + {TOBN(0x22d7c3c5, 0x1c4d349f), TOBN(0x65965844, 0x163d59c5), + TOBN(0xcf198c56, 0xb8abceae), TOBN(0x6fb1fb1b, 0x628559d5)}}, + {{TOBN(0x8bbffd06, 0x07bf8fe3), TOBN(0x46259c58, 0x3467734b), + TOBN(0xd8953cea, 0x35f7f0d3), TOBN(0x1f0bece2, 0xd65b0ff1)}, + {TOBN(0xf7d5b4b3, 0xf3c72914), TOBN(0x29e8ea95, 0x3cb53389), + TOBN(0x4a365626, 0x836b6d46), TOBN(0xe849f910, 0xea174fde)}}, + {{TOBN(0x7ec62fbb, 0xf4737f21), TOBN(0xd8dba5ab, 0x6209f5ac), + TOBN(0x24b5d7a9, 0xa5f9adbe), TOBN(0x707d28f7, 0xa61dc768)}, + {TOBN(0x7711460b, 0xcaa999ea), TOBN(0xba7b174d, 0x1c92e4cc), + TOBN(0x3c4bab66, 0x18d4bf2d), TOBN(0xb8f0c980, 0xeb8bd279)}}, + {{TOBN(0x024bea9a, 0x324b4737), TOBN(0xfba9e423, 0x32a83bca), + TOBN(0x6e635643, 0xa232dced), TOBN(0x99619367, 0x2571c8ba)}, + {TOBN(0xe8c9f357, 0x54b7032b), TOBN(0xf936b3ba, 0x2442d54a), + TOBN(0x2263f0f0, 0x8290c65a), TOBN(0x48989780, 0xee2c7fdb)}}, + {{TOBN(0xadc5d55a, 0x13d4f95e), TOBN(0x737cff85, 0xad9b8500), + TOBN(0x271c557b, 0x8a73f43d), TOBN(0xbed617a4, 0xe18bc476)}, + {TOBN(0x66245401, 0x7dfd8ab2), TOBN(0xae7b89ae, 0x3a2870aa), + TOBN(0x1b555f53, 0x23a7e545), TOBN(0x6791e247, 0xbe057e4c)}}, + {{TOBN(0x860136ad, 0x324fa34d), TOBN(0xea111447, 0x4cbeae28), + TOBN(0x023a4270, 0xbedd3299), TOBN(0x3d5c3a7f, 0xc1c35c34)}, + {TOBN(0xb0f6db67, 0x8d0412d2), TOBN(0xd92625e2, 0xfcdc6b9a), + TOBN(0x92ae5ccc, 0x4e28a982), TOBN(0xea251c36, 0x47a3ce7e)}}, + {{TOBN(0x9d658932, 0x790691bf), TOBN(0xed610589, 0x06b736ae), + TOBN(0x712c2f04, 0xc0d63b6e), TOBN(0x5cf06fd5, 0xc63d488f)}, + {TOBN(0x97363fac, 0xd9588e41), TOBN(0x1f9bf762, 0x2b93257e), + TOBN(0xa9d1ffc4, 0x667acace), TOBN(0x1cf4a1aa, 0x0a061ecf)}}, + {{TOBN(0x40e48a49, 0xdc1818d0), TOBN(0x0643ff39, 0xa3621ab0), + TOBN(0x5768640c, 0xe39ef639), TOBN(0x1fc099ea, 0x04d86854)}, + {TOBN(0x9130b9c3, 0xeccd28fd), TOBN(0xd743cbd2, 0x7eec54ab), + TOBN(0x052b146f, 0xe5b475b6), TOBN(0x058d9a82, 0x900a7d1f)}}, + {{TOBN(0x65e02292, 
0x91262b72), TOBN(0x96f924f9, 0xbb0edf03), + TOBN(0x5cfa59c8, 0xfe206842), TOBN(0xf6037004, 0x5eafa720)}, + {TOBN(0x5f30699e, 0x18d7dd96), TOBN(0x381e8782, 0xcbab2495), + TOBN(0x91669b46, 0xdd8be949), TOBN(0xb40606f5, 0x26aae8ef)}}, + {{TOBN(0x2812b839, 0xfc6751a4), TOBN(0x16196214, 0xfba800ef), + TOBN(0x4398d5ca, 0x4c1a2875), TOBN(0x720c00ee, 0x653d8349)}, + {TOBN(0xc2699eb0, 0xd820007c), TOBN(0x880ee660, 0xa39b5825), + TOBN(0x70694694, 0x471f6984), TOBN(0xf7d16ea8, 0xe3dda99a)}}, + {{TOBN(0x28d675b2, 0xc0519a23), TOBN(0x9ebf94fe, 0x4f6952e3), + TOBN(0xf28bb767, 0xa2294a8a), TOBN(0x85512b4d, 0xfe0af3f5)}, + {TOBN(0x18958ba8, 0x99b16a0d), TOBN(0x95c2430c, 0xba7548a7), + TOBN(0xb30d1b10, 0xa16be615), TOBN(0xe3ebbb97, 0x85bfb74c)}}, + {{TOBN(0xa3273cfe, 0x18549fdb), TOBN(0xf6e200bf, 0x4fcdb792), + TOBN(0x54a76e18, 0x83aba56c), TOBN(0x73ec66f6, 0x89ef6aa2)}, + {TOBN(0x8d17add7, 0xd1b9a305), TOBN(0xa959c5b9, 0xb7ae1b9d), + TOBN(0x88643522, 0x6bcc094a), TOBN(0xcc5616c4, 0xd7d429b9)}}, + {{TOBN(0xa6dada01, 0xe6a33f7c), TOBN(0xc6217a07, 0x9d4e70ad), + TOBN(0xd619a818, 0x09c15b7c), TOBN(0xea06b329, 0x0e80c854)}, + {TOBN(0x174811ce, 0xa5f5e7b9), TOBN(0x66dfc310, 0x787c65f4), + TOBN(0x4ea7bd69, 0x3316ab54), TOBN(0xc12c4acb, 0x1dcc0f70)}}, + {{TOBN(0xe4308d1a, 0x1e407dd9), TOBN(0xe8a3587c, 0x91afa997), + TOBN(0xea296c12, 0xab77b7a5), TOBN(0xb5ad49e4, 0x673c0d52)}, + {TOBN(0x40f9b2b2, 0x7006085a), TOBN(0xa88ff340, 0x87bf6ec2), + TOBN(0x978603b1, 0x4e3066a6), TOBN(0xb3f99fc2, 0xb5e486e2)}}, + {{TOBN(0x07b53f5e, 0xb2e63645), TOBN(0xbe57e547, 0x84c84232), + TOBN(0xd779c216, 0x7214d5cf), TOBN(0x617969cd, 0x029a3aca)}, + {TOBN(0xd17668cd, 0x8a7017a0), TOBN(0x77b4d19a, 0xbe9b7ee8), + TOBN(0x58fd0e93, 0x9c161776), TOBN(0xa8c4f4ef, 0xd5968a72)}}, + {{TOBN(0x296071cc, 0x67b3de77), TOBN(0xae3c0b8e, 0x634f7905), + TOBN(0x67e440c2, 0x8a7100c9), TOBN(0xbb8c3c1b, 0xeb4b9b42)}, + {TOBN(0x6d71e8ea, 0xc51b3583), TOBN(0x7591f5af, 0x9525e642), + TOBN(0xf73a2f7b, 0x13f509f3), TOBN(0x618487aa, 0x5619ac9b)}}, + {{TOBN(0x3a72e5f7, 0x9d61718a), TOBN(0x00413bcc, 0x7592d28c), + TOBN(0x7d9b11d3, 0x963c35cf), TOBN(0x77623bcf, 0xb90a46ed)}, + {TOBN(0xdeef273b, 0xdcdd2a50), TOBN(0x4a741f9b, 0x0601846e), + TOBN(0x33b89e51, 0x0ec6e929), TOBN(0xcb02319f, 0x8b7f22cd)}}, + {{TOBN(0xbbe1500d, 0x084bae24), TOBN(0x2f0ae8d7, 0x343d2693), + TOBN(0xacffb5f2, 0x7cdef811), TOBN(0xaa0c030a, 0x263fb94f)}, + {TOBN(0x6eef0d61, 0xa0f442de), TOBN(0xf92e1817, 0x27b139d3), + TOBN(0x1ae6deb7, 0x0ad8bc28), TOBN(0xa89e38dc, 0xc0514130)}}, + {{TOBN(0x81eeb865, 0xd2fdca23), TOBN(0x5a15ee08, 0xcc8ef895), + TOBN(0x768fa10a, 0x01905614), TOBN(0xeff5b8ef, 0x880ee19b)}, + {TOBN(0xf0c0cabb, 0xcb1c8a0e), TOBN(0x2e1ee9cd, 0xb8c838f9), + TOBN(0x0587d8b8, 0x8a4a14c0), TOBN(0xf6f27896, 0x2ff698e5)}}, + {{TOBN(0xed38ef1c, 0x89ee6256), TOBN(0xf44ee1fe, 0x6b353b45), + TOBN(0x9115c0c7, 0x70e903b3), TOBN(0xc78ec0a1, 0x818f31df)}, + {TOBN(0x6c003324, 0xb7dccbc6), TOBN(0xd96dd1f3, 0x163bbc25), + TOBN(0x33aa82dd, 0x5cedd805), TOBN(0x123aae4f, 0x7f7eb2f1)}}, + {{TOBN(0x1723fcf5, 0xa26262cd), TOBN(0x1f7f4d5d, 0x0060ebd5), + TOBN(0xf19c5c01, 0xb2eaa3af), TOBN(0x2ccb9b14, 0x9790accf)}, + {TOBN(0x1f9c1cad, 0x52324aa6), TOBN(0x63200526, 0x7247df54), + TOBN(0x5732fe42, 0xbac96f82), TOBN(0x52fe771f, 0x01a1c384)}}, + {{TOBN(0x546ca13d, 0xb1001684), TOBN(0xb56b4eee, 0xa1709f75), + TOBN(0x266545a9, 0xd5db8672), TOBN(0xed971c90, 0x1e8f3cfb)}, + {TOBN(0x4e7d8691, 0xe3a07b29), TOBN(0x7570d9ec, 0xe4b696b9), + TOBN(0xdc5fa067, 0x7bc7e9ae), TOBN(0x68b44caf, 0xc82c4844)}}, + 
{{TOBN(0x519d34b3, 0xbf44da80), TOBN(0x283834f9, 0x5ab32e66), + TOBN(0x6e608797, 0x6278a000), TOBN(0x1e62960e, 0x627312f6)}, + {TOBN(0x9b87b27b, 0xe6901c55), TOBN(0x80e78538, 0x24fdbc1f), + TOBN(0xbbbc0951, 0x2facc27d), TOBN(0x06394239, 0xac143b5a)}}, + {{TOBN(0x35bb4a40, 0x376c1944), TOBN(0x7cb62694, 0x63da1511), + TOBN(0xafd29161, 0xb7148a3b), TOBN(0xa6f9d9ed, 0x4e2ea2ee)}, + {TOBN(0x15dc2ca2, 0x880dd212), TOBN(0x903c3813, 0xa61139a9), + TOBN(0x2aa7b46d, 0x6c0f8785), TOBN(0x36ce2871, 0x901c60ff)}}, + {{TOBN(0xc683b028, 0xe10d9c12), TOBN(0x7573baa2, 0x032f33d3), + TOBN(0x87a9b1f6, 0x67a31b58), TOBN(0xfd3ed11a, 0xf4ffae12)}, + {TOBN(0x83dcaa9a, 0x0cb2748e), TOBN(0x8239f018, 0x5d6fdf16), + TOBN(0xba67b49c, 0x72753941), TOBN(0x2beec455, 0xc321cb36)}}, + {{TOBN(0x88015606, 0x3f8b84ce), TOBN(0x76417083, 0x8d38c86f), + TOBN(0x054f1ca7, 0x598953dd), TOBN(0xc939e110, 0x4e8e7429)}, + {TOBN(0x9b1ac2b3, 0x5a914f2f), TOBN(0x39e35ed3, 0xe74b8f9c), + TOBN(0xd0debdb2, 0x781b2fb0), TOBN(0x1585638f, 0x2d997ba2)}}, + {{TOBN(0x9c4b646e, 0x9e2fce99), TOBN(0x68a21081, 0x1e80857f), + TOBN(0x06d54e44, 0x3643b52a), TOBN(0xde8d6d63, 0x0d8eb843)}, + {TOBN(0x70321563, 0x42146a0a), TOBN(0x8ba826f2, 0x5eaa3622), + TOBN(0x227a58bd, 0x86138787), TOBN(0x43b6c03c, 0x10281d37)}}, + {{TOBN(0x6326afbb, 0xb54dde39), TOBN(0x744e5e8a, 0xdb6f2d5f), + TOBN(0x48b2a99a, 0xcff158e1), TOBN(0xa93c8fa0, 0xef87918f)}, + {TOBN(0x2182f956, 0xde058c5c), TOBN(0x216235d2, 0x936f9e7a), + TOBN(0xace0c0db, 0xd2e31e67), TOBN(0xc96449bf, 0xf23ac3e7)}}, + {{TOBN(0x7e9a2874, 0x170693bd), TOBN(0xa28e14fd, 0xa45e6335), + TOBN(0x5757f6b3, 0x56427344), TOBN(0x822e4556, 0xacf8edf9)}, + {TOBN(0x2b7a6ee2, 0xe6a285cd), TOBN(0x5866f211, 0xa9df3af0), + TOBN(0x40dde2dd, 0xf845b844), TOBN(0x986c3726, 0x110e5e49)}}, + {{TOBN(0x73680c2a, 0xf7172277), TOBN(0x57b94f0f, 0x0cccb244), + TOBN(0xbdff7267, 0x2d438ca7), TOBN(0xbad1ce11, 0xcf4663fd)}, + {TOBN(0x9813ed9d, 0xd8f71cae), TOBN(0xf43272a6, 0x961fdaa6), + TOBN(0xbeff0119, 0xbd6d1637), TOBN(0xfebc4f91, 0x30361978)}}, + {{TOBN(0x02b37a95, 0x2f41deff), TOBN(0x0e44a59a, 0xe63b89b7), + TOBN(0x673257dc, 0x143ff951), TOBN(0x19c02205, 0xd752baf4)}, + {TOBN(0x46c23069, 0xc4b7d692), TOBN(0x2e6392c3, 0xfd1502ac), + TOBN(0x6057b1a2, 0x1b220846), TOBN(0xe51ff946, 0x0c1b5b63)}}}, + {{{TOBN(0x6e85cb51, 0x566c5c43), TOBN(0xcff9c919, 0x3597f046), + TOBN(0x9354e90c, 0x4994d94a), TOBN(0xe0a39332, 0x2147927d)}, + {TOBN(0x8427fac1, 0x0dc1eb2b), TOBN(0x88cfd8c2, 0x2ff319fa), + TOBN(0xe2d4e684, 0x01965274), TOBN(0xfa2e067d, 0x67aaa746)}}, + {{TOBN(0xb6d92a7f, 0x3e5f9f11), TOBN(0x9afe153a, 0xd6cb3b8e), + TOBN(0x4d1a6dd7, 0xddf800bd), TOBN(0xf6c13cc0, 0xcaf17e19)}, + {TOBN(0x15f6c58e, 0x325fc3ee), TOBN(0x71095400, 0xa31dc3b2), + TOBN(0x168e7c07, 0xafa3d3e7), TOBN(0x3f8417a1, 0x94c7ae2d)}}, + {{TOBN(0xec234772, 0x813b230d), TOBN(0x634d0f5f, 0x17344427), + TOBN(0x11548ab1, 0xd77fc56a), TOBN(0x7fab1750, 0xce06af77)}, + {TOBN(0xb62c10a7, 0x4f7c4f83), TOBN(0xa7d2edc4, 0x220a67d9), + TOBN(0x1c404170, 0x921209a0), TOBN(0x0b9815a0, 0xface59f0)}}, + {{TOBN(0x2842589b, 0x319540c3), TOBN(0x18490f59, 0xa283d6f8), + TOBN(0xa2731f84, 0xdaae9fcb), TOBN(0x3db6d960, 0xc3683ba0)}, + {TOBN(0xc85c63bb, 0x14611069), TOBN(0xb19436af, 0x0788bf05), + TOBN(0x905459df, 0x347460d2), TOBN(0x73f6e094, 0xe11a7db1)}}, + {{TOBN(0xdc7f938e, 0xb6357f37), TOBN(0xc5d00f79, 0x2bd8aa62), + TOBN(0xc878dcb9, 0x2ca979fc), TOBN(0x37e83ed9, 0xeb023a99)}, + {TOBN(0x6b23e273, 0x1560bf3d), TOBN(0x1086e459, 0x1d0fae61), + TOBN(0x78248316, 0x9a9414bd), TOBN(0x1b956bc0, 
0xf0ea9ea1)}}, + {{TOBN(0x7b85bb91, 0xc31b9c38), TOBN(0x0c5aa90b, 0x48ef57b5), + TOBN(0xdedeb169, 0xaf3bab6f), TOBN(0xe610ad73, 0x2d373685)}, + {TOBN(0xf13870df, 0x02ba8e15), TOBN(0x0337edb6, 0x8ca7f771), + TOBN(0xe4acf747, 0xb62c036c), TOBN(0xd921d576, 0xb6b94e81)}}, + {{TOBN(0xdbc86439, 0x2c422f7a), TOBN(0xfb635362, 0xed348898), + TOBN(0x83084668, 0xc45bfcd1), TOBN(0xc357c9e3, 0x2b315e11)}, + {TOBN(0xb173b540, 0x5b2e5b8c), TOBN(0x7e946931, 0xe102b9a4), + TOBN(0x17c890eb, 0x7b0fb199), TOBN(0xec225a83, 0xd61b662b)}}, + {{TOBN(0xf306a3c8, 0xee3c76cb), TOBN(0x3cf11623, 0xd32a1f6e), + TOBN(0xe6d5ab64, 0x6863e956), TOBN(0x3b8a4cbe, 0x5c005c26)}, + {TOBN(0xdcd529a5, 0x9ce6bb27), TOBN(0xc4afaa52, 0x04d4b16f), + TOBN(0xb0624a26, 0x7923798d), TOBN(0x85e56df6, 0x6b307fab)}}, + {{TOBN(0x0281893c, 0x2bf29698), TOBN(0x91fc19a4, 0xd7ce7603), + TOBN(0x75a5dca3, 0xad9a558f), TOBN(0x40ceb3fa, 0x4d50bf77)}, + {TOBN(0x1baf6060, 0xbc9ba369), TOBN(0x927e1037, 0x597888c2), + TOBN(0xd936bf19, 0x86a34c07), TOBN(0xd4cf10c1, 0xc34ae980)}}, + {{TOBN(0x3a3e5334, 0x859dd614), TOBN(0x9c475b5b, 0x18d0c8ee), + TOBN(0x63080d1f, 0x07cd51d5), TOBN(0xc9c0d0a6, 0xb88b4326)}, + {TOBN(0x1ac98691, 0xc234296f), TOBN(0x2a0a83a4, 0x94887fb6), + TOBN(0x56511427, 0x0cea9cf2), TOBN(0x5230a6e8, 0xa24802f5)}}, + {{TOBN(0xf7a2bf0f, 0x72e3d5c1), TOBN(0x37717446, 0x4f21439e), + TOBN(0xfedcbf25, 0x9ce30334), TOBN(0xe0030a78, 0x7ce202f9)}, + {TOBN(0x6f2d9ebf, 0x1202e9ca), TOBN(0xe79dde6c, 0x75e6e591), + TOBN(0xf52072af, 0xf1dac4f8), TOBN(0x6c8d087e, 0xbb9b404d)}}, + {{TOBN(0xad0fc73d, 0xbce913af), TOBN(0x909e587b, 0x458a07cb), + TOBN(0x1300da84, 0xd4f00c8a), TOBN(0x425cd048, 0xb54466ac)}, + {TOBN(0xb59cb9be, 0x90e9d8bf), TOBN(0x991616db, 0x3e431b0e), + TOBN(0xd3aa117a, 0x531aecff), TOBN(0x91af92d3, 0x59f4dc3b)}}, + {{TOBN(0x9b1ec292, 0xe93fda29), TOBN(0x76bb6c17, 0xe97d91bc), + TOBN(0x7509d95f, 0xaface1e6), TOBN(0x3653fe47, 0xbe855ae3)}, + {TOBN(0x73180b28, 0x0f680e75), TOBN(0x75eefd1b, 0xeeb6c26c), + TOBN(0xa4cdf29f, 0xb66d4236), TOBN(0x2d70a997, 0x6b5821d8)}}, + {{TOBN(0x7a3ee207, 0x20445c36), TOBN(0x71d1ac82, 0x59877174), + TOBN(0x0fc539f7, 0x949f73e9), TOBN(0xd05cf3d7, 0x982e3081)}, + {TOBN(0x8758e20b, 0x7b1c7129), TOBN(0xffadcc20, 0x569e61f2), + TOBN(0xb05d3a2f, 0x59544c2d), TOBN(0xbe16f5c1, 0x9fff5e53)}}, + {{TOBN(0x73cf65b8, 0xaad58135), TOBN(0x622c2119, 0x037aa5be), + TOBN(0x79373b3f, 0x646fd6a0), TOBN(0x0e029db5, 0x0d3978cf)}, + {TOBN(0x8bdfc437, 0x94fba037), TOBN(0xaefbd687, 0x620797a6), + TOBN(0x3fa5382b, 0xbd30d38e), TOBN(0x7627cfbf, 0x585d7464)}}, + {{TOBN(0xb2330fef, 0x4e4ca463), TOBN(0xbcef7287, 0x3566cc63), + TOBN(0xd161d2ca, 0xcf780900), TOBN(0x135dc539, 0x5b54827d)}, + {TOBN(0x638f052e, 0x27bf1bc6), TOBN(0x10a224f0, 0x07dfa06c), + TOBN(0xe973586d, 0x6d3321da), TOBN(0x8b0c5738, 0x26152c8f)}}, + {{TOBN(0x07ef4f2a, 0x34606074), TOBN(0x80fe7fe8, 0xa0f7047a), + TOBN(0x3d1a8152, 0xe1a0e306), TOBN(0x32cf43d8, 0x88da5222)}, + {TOBN(0xbf89a95f, 0x5f02ffe6), TOBN(0x3d9eb9a4, 0x806ad3ea), + TOBN(0x012c17bb, 0x79c8e55e), TOBN(0xfdcd1a74, 0x99c81dac)}}, + {{TOBN(0x7043178b, 0xb9556098), TOBN(0x4090a1df, 0x801c3886), + TOBN(0x759800ff, 0x9b67b912), TOBN(0x3e5c0304, 0x232620c8)}, + {TOBN(0x4b9d3c4b, 0x70dceeca), TOBN(0xbb2d3c15, 0x181f648e), + TOBN(0xf981d837, 0x6e33345c), TOBN(0xb626289b, 0x0cf2297a)}}, + {{TOBN(0x766ac659, 0x8baebdcf), TOBN(0x1a28ae09, 0x75df01e5), + TOBN(0xb71283da, 0x375876d8), TOBN(0x4865a96d, 0x607b9800)}, + {TOBN(0x25dd1bcd, 0x237936b2), TOBN(0x332f4f4b, 0x60417494), + TOBN(0xd0923d68, 0x370a2147), 
TOBN(0x497f5dfb, 0xdc842203)}}, + {{TOBN(0x9dc74cbd, 0x32be5e0f), TOBN(0x7475bcb7, 0x17a01375), + TOBN(0x438477c9, 0x50d872b1), TOBN(0xcec67879, 0xffe1d63d)}, + {TOBN(0x9b006014, 0xd8578c70), TOBN(0xc9ad99a8, 0x78bb6b8b), + TOBN(0x6799008e, 0x11fb3806), TOBN(0xcfe81435, 0xcd44cab3)}}, + {{TOBN(0xa2ee1582, 0x2f4fb344), TOBN(0xb8823450, 0x483fa6eb), + TOBN(0x622d323d, 0x652c7749), TOBN(0xd8474a98, 0xbeb0a15b)}, + {TOBN(0xe43c154d, 0x5d1c00d0), TOBN(0x7fd581d9, 0x0e3e7aac), + TOBN(0x2b44c619, 0x2525ddf8), TOBN(0x67a033eb, 0xb8ae9739)}}, + {{TOBN(0x113ffec1, 0x9ef2d2e4), TOBN(0x1bf6767e, 0xd5a0ea7f), + TOBN(0x57fff75e, 0x03714c0a), TOBN(0xa23c422e, 0x0a23e9ee)}, + {TOBN(0xdd5f6b2d, 0x540f83af), TOBN(0xc2c2c27e, 0x55ea46a7), + TOBN(0xeb6b4246, 0x672a1208), TOBN(0xd13599f7, 0xae634f7a)}}, + {{TOBN(0xcf914b5c, 0xd7b32c6e), TOBN(0x61a5a640, 0xeaf61814), + TOBN(0x8dc3df8b, 0x208a1bbb), TOBN(0xef627fd6, 0xb6d79aa5)}, + {TOBN(0x44232ffc, 0xc4c86bc8), TOBN(0xe6f9231b, 0x061539fe), + TOBN(0x1d04f25a, 0x958b9533), TOBN(0x180cf934, 0x49e8c885)}}, + {{TOBN(0x89689595, 0x9884aaf7), TOBN(0xb1959be3, 0x07b348a6), + TOBN(0x96250e57, 0x3c147c87), TOBN(0xae0efb3a, 0xdd0c61f8)}, + {TOBN(0xed00745e, 0xca8c325e), TOBN(0x3c911696, 0xecff3f70), + TOBN(0x73acbc65, 0x319ad41d), TOBN(0x7b01a020, 0xf0b1c7ef)}}, + {{TOBN(0xea32b293, 0x63a1483f), TOBN(0x89eabe71, 0x7a248f96), + TOBN(0x9c6231d3, 0x343157e5), TOBN(0x93a375e5, 0xdf3c546d)}, + {TOBN(0xe76e9343, 0x6a2afe69), TOBN(0xc4f89100, 0xe166c88e), + TOBN(0x248efd0d, 0x4f872093), TOBN(0xae0eb3ea, 0x8fe0ea61)}}, + {{TOBN(0xaf89790d, 0x9d79046e), TOBN(0x4d650f2d, 0x6cee0976), + TOBN(0xa3935d9a, 0x43071eca), TOBN(0x66fcd2c9, 0x283b0bfe)}, + {TOBN(0x0e665eb5, 0x696605f1), TOBN(0xe77e5d07, 0xa54cd38d), + TOBN(0x90ee050a, 0x43d950cf), TOBN(0x86ddebda, 0xd32e69b5)}}, + {{TOBN(0x6ad94a3d, 0xfddf7415), TOBN(0xf7fa1309, 0x3f6e8d5a), + TOBN(0xc4831d1d, 0xe9957f75), TOBN(0x7de28501, 0xd5817447)}, + {TOBN(0x6f1d7078, 0x9e2aeb6b), TOBN(0xba2b9ff4, 0xf67a53c2), + TOBN(0x36963767, 0xdf9defc3), TOBN(0x479deed3, 0x0d38022c)}}, + {{TOBN(0xd2edb89b, 0x3a8631e8), TOBN(0x8de855de, 0x7a213746), + TOBN(0xb2056cb7, 0xb00c5f11), TOBN(0xdeaefbd0, 0x2c9b85e4)}, + {TOBN(0x03f39a8d, 0xd150892d), TOBN(0x37b84686, 0x218b7985), + TOBN(0x36296dd8, 0xb7375f1a), TOBN(0x472cd4b1, 0xb78e898e)}}, + {{TOBN(0x15dff651, 0xe9f05de9), TOBN(0xd4045069, 0x2ce98ba9), + TOBN(0x8466a7ae, 0x9b38024c), TOBN(0xb910e700, 0xe5a6b5ef)}, + {TOBN(0xae1c56ea, 0xb3aa8f0d), TOBN(0xbab2a507, 0x7eee74a6), + TOBN(0x0dca11e2, 0x4b4c4620), TOBN(0xfd896e2e, 0x4c47d1f4)}}, + {{TOBN(0xeb45ae53, 0x308fbd93), TOBN(0x46cd5a2e, 0x02c36fda), + TOBN(0x6a3d4e90, 0xbaa48385), TOBN(0xdd55e62e, 0x9dbe9960)}, + {TOBN(0xa1406aa0, 0x2a81ede7), TOBN(0x6860dd14, 0xf9274ea7), + TOBN(0xcfdcb0c2, 0x80414f86), TOBN(0xff410b10, 0x22f94327)}}, + {{TOBN(0x5a33cc38, 0x49ad467b), TOBN(0xefb48b6c, 0x0a7335f1), + TOBN(0x14fb54a4, 0xb153a360), TOBN(0x604aa9d2, 0xb52469cc)}, + {TOBN(0x5e9dc486, 0x754e48e9), TOBN(0x693cb455, 0x37471e8e), + TOBN(0xfb2fd7cd, 0x8d3b37b6), TOBN(0x63345e16, 0xcf09ff07)}}, + {{TOBN(0x9910ba6b, 0x23a5d896), TOBN(0x1fe19e35, 0x7fe4364e), + TOBN(0x6e1da8c3, 0x9a33c677), TOBN(0x15b4488b, 0x29fd9fd0)}, + {TOBN(0x1f439254, 0x1a1f22bf), TOBN(0x920a8a70, 0xab8163e8), + TOBN(0x3fd1b249, 0x07e5658e), TOBN(0xf2c4f79c, 0xb6ec839b)}}, + {{TOBN(0x1abbc3d0, 0x4aa38d1b), TOBN(0x3b0db35c, 0xb5d9510e), + TOBN(0x1754ac78, 0x3e60dec0), TOBN(0x53272fd7, 0xea099b33)}, + {TOBN(0x5fb0494f, 0x07a8e107), TOBN(0x4a89e137, 0x6a8191fa), + TOBN(0xa113b7f6, 
0x3c4ad544), TOBN(0x88a2e909, 0x6cb9897b)}}, + {{TOBN(0x17d55de3, 0xb44a3f84), TOBN(0xacb2f344, 0x17c6c690), + TOBN(0x32088168, 0x10232390), TOBN(0xf2e8a61f, 0x6c733bf7)}, + {TOBN(0xa774aab6, 0x9c2d7652), TOBN(0xfb5307e3, 0xed95c5bc), + TOBN(0xa05c73c2, 0x4981f110), TOBN(0x1baae31c, 0xa39458c9)}}, + {{TOBN(0x1def185b, 0xcbea62e7), TOBN(0xe8ac9eae, 0xeaf63059), + TOBN(0x098a8cfd, 0x9921851c), TOBN(0xd959c3f1, 0x3abe2f5b)}, + {TOBN(0xa4f19525, 0x20e40ae5), TOBN(0x320789e3, 0x07a24aa1), + TOBN(0x259e6927, 0x7392b2bc), TOBN(0x58f6c667, 0x1918668b)}}, + {{TOBN(0xce1db2bb, 0xc55d2d8b), TOBN(0x41d58bb7, 0xf4f6ca56), + TOBN(0x7650b680, 0x8f877614), TOBN(0x905e16ba, 0xf4c349ed)}, + {TOBN(0xed415140, 0xf661acac), TOBN(0x3b8784f0, 0xcb2270af), + TOBN(0x3bc280ac, 0x8a402cba), TOBN(0xd53f7146, 0x0937921a)}}, + {{TOBN(0xc03c8ee5, 0xe5681e83), TOBN(0x62126105, 0xf6ac9e4a), + TOBN(0x9503a53f, 0x936b1a38), TOBN(0x3d45e2d4, 0x782fecbd)}, + {TOBN(0x69a5c439, 0x76e8ae98), TOBN(0xb53b2eeb, 0xbfb4b00e), + TOBN(0xf1674712, 0x72386c89), TOBN(0x30ca34a2, 0x4268bce4)}}, + {{TOBN(0x7f1ed86c, 0x78341730), TOBN(0x8ef5beb8, 0xb525e248), + TOBN(0xbbc489fd, 0xb74fbf38), TOBN(0x38a92a0e, 0x91a0b382)}, + {TOBN(0x7a77ba3f, 0x22433ccf), TOBN(0xde8362d6, 0xa29f05a9), + TOBN(0x7f6a30ea, 0x61189afc), TOBN(0x693b5505, 0x59ef114f)}}, + {{TOBN(0x50266bc0, 0xcd1797a1), TOBN(0xea17b47e, 0xf4b7af2d), + TOBN(0xd6c4025c, 0x3df9483e), TOBN(0x8cbb9d9f, 0xa37b18c9)}, + {TOBN(0x91cbfd9c, 0x4d8424cf), TOBN(0xdb7048f1, 0xab1c3506), + TOBN(0x9eaf641f, 0x028206a3), TOBN(0xf986f3f9, 0x25bdf6ce)}}, + {{TOBN(0x262143b5, 0x224c08dc), TOBN(0x2bbb09b4, 0x81b50c91), + TOBN(0xc16ed709, 0xaca8c84f), TOBN(0xa6210d9d, 0xb2850ca8)}, + {TOBN(0x6d8df67a, 0x09cb54d6), TOBN(0x91eef6e0, 0x500919a4), + TOBN(0x90f61381, 0x0f132857), TOBN(0x9acede47, 0xf8d5028b)}}, + {{TOBN(0x844d1b71, 0x90b771c3), TOBN(0x563b71e4, 0xba6426be), + TOBN(0x2efa2e83, 0xbdb802ff), TOBN(0x3410cbab, 0xab5b4a41)}, + {TOBN(0x555b2d26, 0x30da84dd), TOBN(0xd0711ae9, 0xee1cc29a), + TOBN(0xcf3e8c60, 0x2f547792), TOBN(0x03d7d5de, 0xdc678b35)}}, + {{TOBN(0x071a2fa8, 0xced806b8), TOBN(0x222e6134, 0x697f1478), + TOBN(0xdc16fd5d, 0xabfcdbbf), TOBN(0x44912ebf, 0x121b53b8)}, + {TOBN(0xac943674, 0x2496c27c), TOBN(0x8ea3176c, 0x1ffc26b0), + TOBN(0xb6e224ac, 0x13debf2c), TOBN(0x524cc235, 0xf372a832)}}, + {{TOBN(0xd706e1d8, 0x9f6f1b18), TOBN(0x2552f005, 0x44cce35b), + TOBN(0x8c8326c2, 0xa88e31fc), TOBN(0xb5468b2c, 0xf9552047)}, + {TOBN(0xce683e88, 0x3ff90f2b), TOBN(0x77947bdf, 0x2f0a5423), + TOBN(0xd0a1b28b, 0xed56e328), TOBN(0xaee35253, 0xc20134ac)}}, + {{TOBN(0x7e98367d, 0x3567962f), TOBN(0x379ed61f, 0x8188bffb), + TOBN(0x73bba348, 0xfaf130a1), TOBN(0x6c1f75e1, 0x904ed734)}, + {TOBN(0x18956642, 0x3b4a79fc), TOBN(0xf20bc83d, 0x54ef4493), + TOBN(0x836d425d, 0x9111eca1), TOBN(0xe5b5c318, 0x009a8dcf)}}, + {{TOBN(0x3360b25d, 0x13221bc5), TOBN(0x707baad2, 0x6b3eeaf7), + TOBN(0xd7279ed8, 0x743a95a1), TOBN(0x7450a875, 0x969e809f)}, + {TOBN(0x32b6bd53, 0xe5d0338f), TOBN(0x1e77f7af, 0x2b883bbc), + TOBN(0x90da12cc, 0x1063ecd0), TOBN(0xe2697b58, 0xc315be47)}}, + {{TOBN(0x2771a5bd, 0xda85d534), TOBN(0x53e78c1f, 0xff980eea), + TOBN(0xadf1cf84, 0x900385e7), TOBN(0x7d3b14f6, 0xc9387b62)}, + {TOBN(0x170e74b0, 0xcb8f2bd2), TOBN(0x2d50b486, 0x827fa993), + TOBN(0xcdbe8c9a, 0xf6f32bab), TOBN(0x55e906b0, 0xc3b93ab8)}}, + {{TOBN(0x747f22fc, 0x8fe280d1), TOBN(0xcd8e0de5, 0xb2e114ab), + TOBN(0x5ab7dbeb, 0xe10b68b0), TOBN(0x9dc63a9c, 0xa480d4b2)}, + {TOBN(0x78d4bc3b, 0x4be1495f), TOBN(0x25eb3db8, 0x9359122d), + 
TOBN(0x3f8ac05b, 0x0809cbdc), TOBN(0xbf4187bb, 0xd37c702f)}}, + {{TOBN(0x84cea069, 0x1416a6a5), TOBN(0x8f860c79, 0x43ef881c), + TOBN(0x41311f8a, 0x38038a5d), TOBN(0xe78c2ec0, 0xfc612067)}, + {TOBN(0x494d2e81, 0x5ad73581), TOBN(0xb4cc9e00, 0x59604097), + TOBN(0xff558aec, 0xf3612cba), TOBN(0x35beef7a, 0x9e36c39e)}}, + {{TOBN(0x1845c7cf, 0xdbcf41b9), TOBN(0x5703662a, 0xaea997c0), + TOBN(0x8b925afe, 0xe402f6d8), TOBN(0xd0a1b1ae, 0x4dd72162)}, + {TOBN(0x9f47b375, 0x03c41c4b), TOBN(0xa023829b, 0x0391d042), + TOBN(0x5f5045c3, 0x503b8b0a), TOBN(0x123c2688, 0x98c010e5)}}, + {{TOBN(0x324ec0cc, 0x36ba06ee), TOBN(0xface3115, 0x3dd2cc0c), + TOBN(0xb364f3be, 0xf333e91f), TOBN(0xef8aff73, 0x28e832b0)}, + {TOBN(0x1e9bad04, 0x2d05841b), TOBN(0x42f0e3df, 0x356a21e2), + TOBN(0xa3270bcb, 0x4add627e), TOBN(0xb09a8158, 0xd322e711)}}, + {{TOBN(0x86e326a1, 0x0fee104a), TOBN(0xad7788f8, 0x3703f65d), + TOBN(0x7e765430, 0x47bc4833), TOBN(0x6cee582b, 0x2b9b893a)}, + {TOBN(0x9cd2a167, 0xe8f55a7b), TOBN(0xefbee3c6, 0xd9e4190d), + TOBN(0x33ee7185, 0xd40c2e9d), TOBN(0x844cc9c5, 0xa380b548)}}, + {{TOBN(0x323f8ecd, 0x66926e04), TOBN(0x0001e38f, 0x8110c1ba), + TOBN(0x8dbcac12, 0xfc6a7f07), TOBN(0xd65e1d58, 0x0cec0827)}, + {TOBN(0xd2cd4141, 0xbe76ca2d), TOBN(0x7895cf5c, 0xe892f33a), + TOBN(0x956d230d, 0x367139d2), TOBN(0xa91abd3e, 0xd012c4c1)}}, + {{TOBN(0x34fa4883, 0x87eb36bf), TOBN(0xc5f07102, 0x914b8fb4), + TOBN(0x90f0e579, 0xadb9c95f), TOBN(0xfe6ea8cb, 0x28888195)}, + {TOBN(0x7b9b5065, 0xedfa9284), TOBN(0x6c510bd2, 0x2b8c8d65), + TOBN(0xd7b8ebef, 0xcbe8aafd), TOBN(0xedb3af98, 0x96b1da07)}}, + {{TOBN(0x28ff779d, 0x6295d426), TOBN(0x0c4f6ac7, 0x3fa3ad7b), + TOBN(0xec44d054, 0x8b8e2604), TOBN(0x9b32a66d, 0x8b0050e1)}, + {TOBN(0x1f943366, 0xf0476ce2), TOBN(0x7554d953, 0xa602c7b4), + TOBN(0xbe35aca6, 0x524f2809), TOBN(0xb6881229, 0xfd4edbea)}}, + {{TOBN(0xe8cd0c8f, 0x508efb63), TOBN(0x9eb5b5c8, 0x6abcefc7), + TOBN(0xf5621f5f, 0xb441ab4f), TOBN(0x79e6c046, 0xb76a2b22)}, + {TOBN(0x74a4792c, 0xe37a1f69), TOBN(0xcbd252cb, 0x03542b60), + TOBN(0x785f65d5, 0xb3c20bd3), TOBN(0x8dea6143, 0x4fabc60c)}}, + {{TOBN(0x45e21446, 0xde673629), TOBN(0x57f7aa1e, 0x703c2d21), + TOBN(0xa0e99b7f, 0x98c868c7), TOBN(0x4e42f66d, 0x8b641676)}, + {TOBN(0x602884dc, 0x91077896), TOBN(0xa0d690cf, 0xc2c9885b), + TOBN(0xfeb4da33, 0x3b9a5187), TOBN(0x5f789598, 0x153c87ee)}}, + {{TOBN(0x2192dd47, 0x52b16dba), TOBN(0xdeefc0e6, 0x3524c1b1), + TOBN(0x465ea76e, 0xe4383693), TOBN(0x79401711, 0x361b8d98)}, + {TOBN(0xa5f9ace9, 0xf21a15cb), TOBN(0x73d26163, 0xefee9aeb), + TOBN(0xcca844b3, 0xe677016c), TOBN(0x6c122b07, 0x57eaee06)}}, + {{TOBN(0xb782dce7, 0x15f09690), TOBN(0x508b9b12, 0x2dfc0fc9), + TOBN(0x9015ab4b, 0x65d89fc6), TOBN(0x5e79dab7, 0xd6d5bb0f)}, + {TOBN(0x64f021f0, 0x6c775aa2), TOBN(0xdf09d8cc, 0x37c7eca1), + TOBN(0x9a761367, 0xef2fa506), TOBN(0xed4ca476, 0x5b81eec6)}}, + {{TOBN(0x262ede36, 0x10bbb8b5), TOBN(0x0737ce83, 0x0641ada3), + TOBN(0x4c94288a, 0xe9831ccc), TOBN(0x487fc1ce, 0x8065e635)}, + {TOBN(0xb13d7ab3, 0xb8bb3659), TOBN(0xdea5df3e, 0x855e4120), + TOBN(0xb9a18573, 0x85eb0244), TOBN(0x1a1b8ea3, 0xa7cfe0a3)}}, + {{TOBN(0x3b837119, 0x67b0867c), TOBN(0x8d5e0d08, 0x9d364520), + TOBN(0x52dccc1e, 0xd930f0e3), TOBN(0xefbbcec7, 0xbf20bbaf)}, + {TOBN(0x99cffcab, 0x0263ad10), TOBN(0xd8199e6d, 0xfcd18f8a), + TOBN(0x64e2773f, 0xe9f10617), TOBN(0x0079e8e1, 0x08704848)}}, + {{TOBN(0x1169989f, 0x8a342283), TOBN(0x8097799c, 0xa83012e6), + TOBN(0xece966cb, 0x8a6a9001), TOBN(0x93b3afef, 0x072ac7fc)}, + {TOBN(0xe6893a2a, 0x2db3d5ba), TOBN(0x263dc462, 0x89bf4fdc), 
+ TOBN(0x8852dfc9, 0xe0396673), TOBN(0x7ac70895, 0x3af362b6)}}, + {{TOBN(0xbb9cce4d, 0x5c2f342b), TOBN(0xbf80907a, 0xb52d7aae), + TOBN(0x97f3d3cd, 0x2161bcd0), TOBN(0xb25b0834, 0x0962744d)}, + {TOBN(0xc5b18ea5, 0x6c3a1dda), TOBN(0xfe4ec7eb, 0x06c92317), + TOBN(0xb787b890, 0xad1c4afe), TOBN(0xdccd9a92, 0x0ede801a)}}, + {{TOBN(0x9ac6ddda, 0xdb58da1f), TOBN(0x22bbc12f, 0xb8cae6ee), + TOBN(0xc6f8bced, 0x815c4a43), TOBN(0x8105a92c, 0xf96480c7)}, + {TOBN(0x0dc3dbf3, 0x7a859d51), TOBN(0xe3ec7ce6, 0x3041196b), + TOBN(0xd9f64b25, 0x0d1067c9), TOBN(0xf2321321, 0x3d1f8dd8)}}, + {{TOBN(0x8b5c619c, 0x76497ee8), TOBN(0x5d2b0ac6, 0xc717370e), + TOBN(0x98204cb6, 0x4fcf68e1), TOBN(0x0bdec211, 0x62bc6792)}, + {TOBN(0x6973ccef, 0xa63b1011), TOBN(0xf9e3fa97, 0xe0de1ac5), + TOBN(0x5efb693e, 0x3d0e0c8b), TOBN(0x037248e9, 0xd2d4fcb4)}}}, + {{{TOBN(0x80802dc9, 0x1ec34f9e), TOBN(0xd8772d35, 0x33810603), + TOBN(0x3f06d66c, 0x530cb4f3), TOBN(0x7be5ed0d, 0xc475c129)}, + {TOBN(0xcb9e3c19, 0x31e82b10), TOBN(0xc63d2857, 0xc9ff6b4c), + TOBN(0xb92118c6, 0x92a1b45e), TOBN(0x0aec4414, 0x7285bbca)}}, + {{TOBN(0xfc189ae7, 0x1e29a3ef), TOBN(0xcbe906f0, 0x4c93302e), + TOBN(0xd0107914, 0xceaae10e), TOBN(0xb7a23f34, 0xb68e19f8)}, + {TOBN(0xe9d875c2, 0xefd2119d), TOBN(0x03198c6e, 0xfcadc9c8), + TOBN(0x65591bf6, 0x4da17113), TOBN(0x3cf0bbf8, 0x3d443038)}}, + {{TOBN(0xae485bb7, 0x2b724759), TOBN(0x945353e1, 0xb2d4c63a), + TOBN(0x82159d07, 0xde7d6f2c), TOBN(0x389caef3, 0x4ec5b109)}, + {TOBN(0x4a8ebb53, 0xdb65ef14), TOBN(0x2dc2cb7e, 0xdd99de43), + TOBN(0x816fa3ed, 0x83f2405f), TOBN(0x73429bb9, 0xc14208a3)}}, + {{TOBN(0xb618d590, 0xb01e6e27), TOBN(0x047e2ccd, 0xe180b2dc), + TOBN(0xd1b299b5, 0x04aea4a9), TOBN(0x412c9e1e, 0x9fa403a4)}, + {TOBN(0x88d28a36, 0x79407552), TOBN(0x49c50136, 0xf332b8e3), + TOBN(0x3a1b6fcc, 0xe668de19), TOBN(0x178851bc, 0x75122b97)}}, + {{TOBN(0xb1e13752, 0xfb85fa4c), TOBN(0xd61257ce, 0x383c8ce9), + TOBN(0xd43da670, 0xd2f74dae), TOBN(0xa35aa23f, 0xbf846bbb)}, + {TOBN(0x5e74235d, 0x4421fc83), TOBN(0xf6df8ee0, 0xc363473b), + TOBN(0x34d7f52a, 0x3c4aa158), TOBN(0x50d05aab, 0x9bc6d22e)}}, + {{TOBN(0x8c56e735, 0xa64785f4), TOBN(0xbc56637b, 0x5f29cd07), + TOBN(0x53b2bb80, 0x3ee35067), TOBN(0x50235a0f, 0xdc919270)}, + {TOBN(0x191ab6d8, 0xf2c4aa65), TOBN(0xc3475831, 0x8396023b), + TOBN(0x80400ba5, 0xf0f805ba), TOBN(0x8881065b, 0x5ec0f80f)}}, + {{TOBN(0xc370e522, 0xcc1b5e83), TOBN(0xde2d4ad1, 0x860b8bfb), + TOBN(0xad364df0, 0x67b256df), TOBN(0x8f12502e, 0xe0138997)}, + {TOBN(0x503fa0dc, 0x7783920a), TOBN(0xe80014ad, 0xc0bc866a), + TOBN(0x3f89b744, 0xd3064ba6), TOBN(0x03511dcd, 0xcba5dba5)}}, + {{TOBN(0x197dd46d, 0x95a7b1a2), TOBN(0x9c4e7ad6, 0x3c6341fb), + TOBN(0x426eca29, 0x484c2ece), TOBN(0x9211e489, 0xde7f4f8a)}, + {TOBN(0x14997f6e, 0xc78ef1f4), TOBN(0x2b2c0910, 0x06574586), + TOBN(0x17286a6e, 0x1c3eede8), TOBN(0x25f92e47, 0x0f60e018)}}, + {{TOBN(0x805c5646, 0x31890a36), TOBN(0x703ef600, 0x57feea5b), + TOBN(0x389f747c, 0xaf3c3030), TOBN(0xe0e5daeb, 0x54dd3739)}, + {TOBN(0xfe24a4c3, 0xc9c9f155), TOBN(0x7e4bf176, 0xb5393962), + TOBN(0x37183de2, 0xaf20bf29), TOBN(0x4a1bd7b5, 0xf95a8c3b)}}, + {{TOBN(0xa83b9699, 0x46191d3d), TOBN(0x281fc8dd, 0x7b87f257), + TOBN(0xb18e2c13, 0x54107588), TOBN(0x6372def7, 0x9b2bafe8)}, + {TOBN(0xdaf4bb48, 0x0d8972ca), TOBN(0x3f2dd4b7, 0x56167a3f), + TOBN(0x1eace32d, 0x84310cf4), TOBN(0xe3bcefaf, 0xe42700aa)}}, + {{TOBN(0x5fe5691e, 0xd785e73d), TOBN(0xa5db5ab6, 0x2ea60467), + TOBN(0x02e23d41, 0xdfc6514a), TOBN(0x35e8048e, 0xe03c3665)}, + {TOBN(0x3f8b118f, 0x1adaa0f8), TOBN(0x28ec3b45, 
0x84ce1a5a), + TOBN(0xe8cacc6e, 0x2c6646b8), TOBN(0x1343d185, 0xdbd0e40f)}}, + {{TOBN(0xe5d7f844, 0xcaaa358c), TOBN(0x1a1db7e4, 0x9924182a), + TOBN(0xd64cd42d, 0x9c875d9a), TOBN(0xb37b515f, 0x042eeec8)}, + {TOBN(0x4d4dd409, 0x7b165fbe), TOBN(0xfc322ed9, 0xe206eff3), + TOBN(0x7dee4102, 0x59b7e17e), TOBN(0x55a481c0, 0x8236ca00)}}, + {{TOBN(0x8c885312, 0xc23fc975), TOBN(0x15715806, 0x05d6297b), + TOBN(0xa078868e, 0xf78edd39), TOBN(0x956b31e0, 0x03c45e52)}, + {TOBN(0x470275d5, 0xff7b33a6), TOBN(0xc8d5dc3a, 0x0c7e673f), + TOBN(0x419227b4, 0x7e2f2598), TOBN(0x8b37b634, 0x4c14a975)}}, + {{TOBN(0xd0667ed6, 0x8b11888c), TOBN(0x5e0e8c3e, 0x803e25dc), + TOBN(0x34e5d0dc, 0xb987a24a), TOBN(0x9f40ac3b, 0xae920323)}, + {TOBN(0x5463de95, 0x34e0f63a), TOBN(0xa128bf92, 0x6b6328f9), + TOBN(0x491ccd7c, 0xda64f1b7), TOBN(0x7ef1ec27, 0xc47bde35)}}, + {{TOBN(0xa857240f, 0xa36a2737), TOBN(0x35dc1366, 0x63621bc1), + TOBN(0x7a3a6453, 0xd4fb6897), TOBN(0x80f1a439, 0xc929319d)}, + {TOBN(0xfc18274b, 0xf8cb0ba0), TOBN(0xb0b53766, 0x8078c5eb), + TOBN(0xfb0d4924, 0x1e01d0ef), TOBN(0x50d7c67d, 0x372ab09c)}}, + {{TOBN(0xb4e370af, 0x3aeac968), TOBN(0xe4f7fee9, 0xc4b63266), + TOBN(0xb4acd4c2, 0xe3ac5664), TOBN(0xf8910bd2, 0xceb38cbf)}, + {TOBN(0x1c3ae50c, 0xc9c0726e), TOBN(0x15309569, 0xd97b40bf), + TOBN(0x70884b7f, 0xfd5a5a1b), TOBN(0x3890896a, 0xef8314cd)}}, + {{TOBN(0x58e1515c, 0xa5618c93), TOBN(0xe665432b, 0x77d942d1), + TOBN(0xb32181bf, 0xb6f767a8), TOBN(0x753794e8, 0x3a604110)}, + {TOBN(0x09afeb7c, 0xe8c0dbcc), TOBN(0x31e02613, 0x598673a3), + TOBN(0x5d98e557, 0x7d46db00), TOBN(0xfc21fb8c, 0x9d985b28)}}, + {{TOBN(0xc9040116, 0xb0843e0b), TOBN(0x53b1b3a8, 0x69b04531), + TOBN(0xdd1649f0, 0x85d7d830), TOBN(0xbb3bcc87, 0xcb7427e8)}, + {TOBN(0x77261100, 0xc93dce83), TOBN(0x7e79da61, 0xa1922a2a), + TOBN(0x587a2b02, 0xf3149ce8), TOBN(0x147e1384, 0xde92ec83)}}, + {{TOBN(0x484c83d3, 0xaf077f30), TOBN(0xea78f844, 0x0658b53a), + TOBN(0x912076c2, 0x027aec53), TOBN(0xf34714e3, 0x93c8177d)}, + {TOBN(0x37ef5d15, 0xc2376c84), TOBN(0x8315b659, 0x3d1aa783), + TOBN(0x3a75c484, 0xef852a90), TOBN(0x0ba0c58a, 0x16086bd4)}}, + {{TOBN(0x29688d7a, 0x529a6d48), TOBN(0x9c7f250d, 0xc2f19203), + TOBN(0x123042fb, 0x682e2df9), TOBN(0x2b7587e7, 0xad8121bc)}, + {TOBN(0x30fc0233, 0xe0182a65), TOBN(0xb82ecf87, 0xe3e1128a), + TOBN(0x71682861, 0x93fb098f), TOBN(0x043e21ae, 0x85e9e6a7)}}, + {{TOBN(0xab5b49d6, 0x66c834ea), TOBN(0x3be43e18, 0x47414287), + TOBN(0xf40fb859, 0x219a2a47), TOBN(0x0e6559e9, 0xcc58df3c)}, + {TOBN(0xfe1dfe8e, 0x0c6615b4), TOBN(0x14abc8fd, 0x56459d70), + TOBN(0x7be0fa8e, 0x05de0386), TOBN(0x8e63ef68, 0xe9035c7c)}}, + {{TOBN(0x116401b4, 0x53b31e91), TOBN(0x0cba7ad4, 0x4436b4d8), + TOBN(0x9151f9a0, 0x107afd66), TOBN(0xafaca8d0, 0x1f0ee4c4)}, + {TOBN(0x75fe5c1d, 0x9ee9761c), TOBN(0x3497a16b, 0xf0c0588f), + TOBN(0x3ee2bebd, 0x0304804c), TOBN(0xa8fb9a60, 0xc2c990b9)}}, + {{TOBN(0xd14d32fe, 0x39251114), TOBN(0x36bf25bc, 0xcac73366), + TOBN(0xc9562c66, 0xdba7495c), TOBN(0x324d301b, 0x46ad348b)}, + {TOBN(0x9f46620c, 0xd670407e), TOBN(0x0ea8d4f1, 0xe3733a01), + TOBN(0xd396d532, 0xb0c324e0), TOBN(0x5b211a0e, 0x03c317cd)}}, + {{TOBN(0x090d7d20, 0x5ffe7b37), TOBN(0x3b7f3efb, 0x1747d2da), + TOBN(0xa2cb525f, 0xb54fc519), TOBN(0x6e220932, 0xf66a971e)}, + {TOBN(0xddc160df, 0xb486d440), TOBN(0x7fcfec46, 0x3fe13465), + TOBN(0x83da7e4e, 0x76e4c151), TOBN(0xd6fa48a1, 0xd8d302b5)}}, + {{TOBN(0xc6304f26, 0x5872cd88), TOBN(0x806c1d3c, 0x278b90a1), + TOBN(0x3553e725, 0xcaf0bc1c), TOBN(0xff59e603, 0xbb9d8d5c)}, + {TOBN(0xa4550f32, 0x7a0b85dd), 
TOBN(0xdec5720a, 0x93ecc217), + TOBN(0x0b88b741, 0x69d62213), TOBN(0x7212f245, 0x5b365955)}}, + {{TOBN(0x20764111, 0xb5cae787), TOBN(0x13cb7f58, 0x1dfd3124), + TOBN(0x2dca77da, 0x1175aefb), TOBN(0xeb75466b, 0xffaae775)}, + {TOBN(0x74d76f3b, 0xdb6cff32), TOBN(0x7440f37a, 0x61fcda9a), + TOBN(0x1bb3ac92, 0xb525028b), TOBN(0x20fbf8f7, 0xa1975f29)}}, + {{TOBN(0x982692e1, 0xdf83097f), TOBN(0x28738f6c, 0x554b0800), + TOBN(0xdc703717, 0xa2ce2f2f), TOBN(0x7913b93c, 0x40814194)}, + {TOBN(0x04924593, 0x1fe89636), TOBN(0x7b98443f, 0xf78834a6), + TOBN(0x11c6ab01, 0x5114a5a1), TOBN(0x60deb383, 0xffba5f4c)}}, + {{TOBN(0x4caa54c6, 0x01a982e6), TOBN(0x1dd35e11, 0x3491cd26), + TOBN(0x973c315f, 0x7cbd6b05), TOBN(0xcab00775, 0x52494724)}, + {TOBN(0x04659b1f, 0x6565e15a), TOBN(0xbf30f529, 0x8c8fb026), + TOBN(0xfc21641b, 0xa8a0de37), TOBN(0xe9c7a366, 0xfa5e5114)}}, + {{TOBN(0xdb849ca5, 0x52f03ad8), TOBN(0xc7e8dbe9, 0x024e35c0), + TOBN(0xa1a2bbac, 0xcfc3c789), TOBN(0xbf733e7d, 0x9c26f262)}, + {TOBN(0x882ffbf5, 0xb8444823), TOBN(0xb7224e88, 0x6bf8483b), + TOBN(0x53023b8b, 0x65bef640), TOBN(0xaabfec91, 0xd4d5f8cd)}}, + {{TOBN(0xa40e1510, 0x079ea1bd), TOBN(0x1ad9addc, 0xd05d5d26), + TOBN(0xdb3f2eab, 0x13e68d4f), TOBN(0x1cff1ae2, 0x640f803f)}, + {TOBN(0xe0e7b749, 0xd4cee117), TOBN(0x8e9f275b, 0x4036d909), + TOBN(0xce34e31d, 0x8f4d4c38), TOBN(0x22b37f69, 0xd75130fc)}}, + {{TOBN(0x83e0f1fd, 0xb4014604), TOBN(0xa8ce9919, 0x89415078), + TOBN(0x82375b75, 0x41792efe), TOBN(0x4f59bf5c, 0x97d4515b)}, + {TOBN(0xac4f324f, 0x923a277d), TOBN(0xd9bc9b7d, 0x650f3406), + TOBN(0xc6fa87d1, 0x8a39bc51), TOBN(0x82588530, 0x5ccc108f)}}, + {{TOBN(0x5ced3c9f, 0x82e4c634), TOBN(0x8efb8314, 0x3a4464f8), + TOBN(0xe706381b, 0x7a1dca25), TOBN(0x6cd15a3c, 0x5a2a412b)}, + {TOBN(0x9347a8fd, 0xbfcd8fb5), TOBN(0x31db2eef, 0x6e54cd22), + TOBN(0xc4aeb11e, 0xf8d8932f), TOBN(0x11e7c1ed, 0x344411af)}}, + {{TOBN(0x2653050c, 0xdc9a151e), TOBN(0x9edbfc08, 0x3bb0a859), + TOBN(0x926c81c7, 0xfd5691e7), TOBN(0x9c1b2342, 0x6f39019a)}, + {TOBN(0x64a81c8b, 0x7f8474b9), TOBN(0x90657c07, 0x01761819), + TOBN(0x390b3331, 0x55e0375a), TOBN(0xc676c626, 0xb6ebc47d)}}, + {{TOBN(0x51623247, 0xb7d6dee8), TOBN(0x0948d927, 0x79659313), + TOBN(0x99700161, 0xe9ab35ed), TOBN(0x06cc32b4, 0x8ddde408)}, + {TOBN(0x6f2fd664, 0x061ef338), TOBN(0x1606fa02, 0xc202e9ed), + TOBN(0x55388bc1, 0x929ba99b), TOBN(0xc4428c5e, 0x1e81df69)}}, + {{TOBN(0xce2028ae, 0xf91b0b2a), TOBN(0xce870a23, 0xf03dfd3f), + TOBN(0x66ec2c87, 0x0affe8ed), TOBN(0xb205fb46, 0x284d0c00)}, + {TOBN(0xbf5dffe7, 0x44cefa48), TOBN(0xb6fc37a8, 0xa19876d7), + TOBN(0xbecfa84c, 0x08b72863), TOBN(0xd7205ff5, 0x2576374f)}}, + {{TOBN(0x80330d32, 0x8887de41), TOBN(0x5de0df0c, 0x869ea534), + TOBN(0x13f42753, 0x3c56ea17), TOBN(0xeb1f6069, 0x452b1a78)}, + {TOBN(0x50474396, 0xe30ea15c), TOBN(0x575816a1, 0xc1494125), + TOBN(0xbe1ce55b, 0xfe6bb38f), TOBN(0xb901a948, 0x96ae30f7)}}, + {{TOBN(0xe5af0f08, 0xd8fc3548), TOBN(0x5010b5d0, 0xd73bfd08), + TOBN(0x993d2880, 0x53fe655a), TOBN(0x99f2630b, 0x1c1309fd)}, + {TOBN(0xd8677baf, 0xb4e3b76f), TOBN(0x14e51ddc, 0xb840784b), + TOBN(0x326c750c, 0xbf0092ce), TOBN(0xc83d306b, 0xf528320f)}}, + {{TOBN(0xc4456715, 0x77d4715c), TOBN(0xd30019f9, 0x6b703235), + TOBN(0x207ccb2e, 0xd669e986), TOBN(0x57c824af, 0xf6dbfc28)}, + {TOBN(0xf0eb532f, 0xd8f92a23), TOBN(0x4a557fd4, 0x9bb98fd2), + TOBN(0xa57acea7, 0xc1e6199a), TOBN(0x0c663820, 0x8b94b1ed)}}, + {{TOBN(0x9b42be8f, 0xf83a9266), TOBN(0xc7741c97, 0x0101bd45), + TOBN(0x95770c11, 0x07bd9ceb), TOBN(0x1f50250a, 0x8b2e0744)}, + {TOBN(0xf762eec8, 
0x1477b654), TOBN(0xc65b900e, 0x15efe59a), + TOBN(0x88c96148, 0x9546a897), TOBN(0x7e8025b3, 0xc30b4d7c)}}, + {{TOBN(0xae4065ef, 0x12045cf9), TOBN(0x6fcb2caf, 0x9ccce8bd), + TOBN(0x1fa0ba4e, 0xf2cf6525), TOBN(0xf683125d, 0xcb72c312)}, + {TOBN(0xa01da4ea, 0xe312410e), TOBN(0x67e28677, 0x6cd8e830), + TOBN(0xabd95752, 0x98fb3f07), TOBN(0x05f11e11, 0xeef649a5)}}, + {{TOBN(0xba47faef, 0x9d3472c2), TOBN(0x3adff697, 0xc77d1345), + TOBN(0x4761fa04, 0xdd15afee), TOBN(0x64f1f61a, 0xb9e69462)}, + {TOBN(0xfa691fab, 0x9bfb9093), TOBN(0x3df8ae8f, 0xa1133dfe), + TOBN(0xcd5f8967, 0x58cc710d), TOBN(0xfbb88d50, 0x16c7fe79)}}, + {{TOBN(0x8e011b4c, 0xe88c50d1), TOBN(0x7532e807, 0xa8771c4f), + TOBN(0x64c78a48, 0xe2278ee4), TOBN(0x0b283e83, 0x3845072a)}, + {TOBN(0x98a6f291, 0x49e69274), TOBN(0xb96e9668, 0x1868b21c), + TOBN(0x38f0adc2, 0xb1a8908e), TOBN(0x90afcff7, 0x1feb829d)}}, + {{TOBN(0x9915a383, 0x210b0856), TOBN(0xa5a80602, 0xdef04889), + TOBN(0x800e9af9, 0x7c64d509), TOBN(0x81382d0b, 0xb8996f6f)}, + {TOBN(0x490eba53, 0x81927e27), TOBN(0x46c63b32, 0x4af50182), + TOBN(0x784c5fd9, 0xd3ad62ce), TOBN(0xe4fa1870, 0xf8ae8736)}}, + {{TOBN(0x4ec9d0bc, 0xd7466b25), TOBN(0x84ddbe1a, 0xdb235c65), + TOBN(0x5e2645ee, 0x163c1688), TOBN(0x570bd00e, 0x00eba747)}, + {TOBN(0xfa51b629, 0x128bfa0f), TOBN(0x92fce1bd, 0x6c1d3b68), + TOBN(0x3e7361dc, 0xb66778b1), TOBN(0x9c7d249d, 0x5561d2bb)}}, + {{TOBN(0xa40b28bf, 0x0bbc6229), TOBN(0x1c83c05e, 0xdfd91497), + TOBN(0x5f9f5154, 0xf083df05), TOBN(0xbac38b3c, 0xeee66c9d)}, + {TOBN(0xf71db7e3, 0xec0dfcfd), TOBN(0xf2ecda8e, 0x8b0a8416), + TOBN(0x52fddd86, 0x7812aa66), TOBN(0x2896ef10, 0x4e6f4272)}}, + {{TOBN(0xff27186a, 0x0fe9a745), TOBN(0x08249fcd, 0x49ca70db), + TOBN(0x7425a2e6, 0x441cac49), TOBN(0xf4a0885a, 0xece5ff57)}, + {TOBN(0x6e2cb731, 0x7d7ead58), TOBN(0xf96cf7d6, 0x1898d104), + TOBN(0xafe67c9d, 0x4f2c9a89), TOBN(0x89895a50, 0x1c7bf5bc)}}, + {{TOBN(0xdc7cb8e5, 0x573cecfa), TOBN(0x66497eae, 0xd15f03e6), + TOBN(0x6bc0de69, 0x3f084420), TOBN(0x323b9b36, 0xacd532b0)}, + {TOBN(0xcfed390a, 0x0115a3c1), TOBN(0x9414c40b, 0x2d65ca0e), + TOBN(0x641406bd, 0x2f530c78), TOBN(0x29369a44, 0x833438f2)}}, + {{TOBN(0x996884f5, 0x903fa271), TOBN(0xe6da0fd2, 0xb9da921e), + TOBN(0xa6f2f269, 0x5db01e54), TOBN(0x1ee3e9bd, 0x6876214e)}, + {TOBN(0xa26e181c, 0xe27a9497), TOBN(0x36d254e4, 0x8e215e04), + TOBN(0x42f32a6c, 0x252cabca), TOBN(0x99481487, 0x80b57614)}}, + {{TOBN(0x4c4dfe69, 0x40d9cae1), TOBN(0x05869580, 0x11a10f09), + TOBN(0xca287b57, 0x3491b64b), TOBN(0x77862d5d, 0x3fd4a53b)}, + {TOBN(0xbf94856e, 0x50349126), TOBN(0x2be30bd1, 0x71c5268f), + TOBN(0x10393f19, 0xcbb650a6), TOBN(0x639531fe, 0x778cf9fd)}}, + {{TOBN(0x02556a11, 0xb2935359), TOBN(0xda38aa96, 0xaf8c126e), + TOBN(0x47dbe6c2, 0x0960167f), TOBN(0x37bbabb6, 0x501901cd)}, + {TOBN(0xb6e979e0, 0x2c947778), TOBN(0xd69a5175, 0x7a1a1dc6), + TOBN(0xc3ed5095, 0x9d9faf0c), TOBN(0x4dd9c096, 0x1d5fa5f0)}}, + {{TOBN(0xa0c4304d, 0x64f16ea8), TOBN(0x8b1cac16, 0x7e718623), + TOBN(0x0b576546, 0x7c67f03e), TOBN(0x559cf5ad, 0xcbd88c01)}, + {TOBN(0x074877bb, 0x0e2af19a), TOBN(0x1f717ec1, 0xa1228c92), + TOBN(0x70bcb800, 0x326e8920), TOBN(0xec6e2c5c, 0x4f312804)}}, + {{TOBN(0x426aea7d, 0x3fca4752), TOBN(0xf12c0949, 0x2211f62a), + TOBN(0x24beecd8, 0x7be7b6b5), TOBN(0xb77eaf4c, 0x36d7a27d)}, + {TOBN(0x154c2781, 0xfda78fd3), TOBN(0x848a83b0, 0x264eeabe), + TOBN(0x81287ef0, 0x4ffe2bc4), TOBN(0x7b6d88c6, 0xb6b6fc2a)}}, + {{TOBN(0x805fb947, 0xce417d99), TOBN(0x4b93dcc3, 0x8b916cc4), + TOBN(0x72e65bb3, 0x21273323), TOBN(0xbcc1badd, 0x6ea9886e)}, + 
{TOBN(0x0e223011, 0x4bc5ee85), TOBN(0xa561be74, 0xc18ee1e4), + TOBN(0x762fd2d4, 0xa6bcf1f1), TOBN(0x50e6a5a4, 0x95231489)}}, + {{TOBN(0xca96001f, 0xa00b500b), TOBN(0x5c098cfc, 0x5d7dcdf5), + TOBN(0xa64e2d2e, 0x8c446a85), TOBN(0xbae9bcf1, 0x971f3c62)}, + {TOBN(0x4ec22683, 0x8435a2c5), TOBN(0x8ceaed6c, 0x4bad4643), + TOBN(0xe9f8fb47, 0xccccf4e3), TOBN(0xbd4f3fa4, 0x1ce3b21e)}}, + {{TOBN(0xd79fb110, 0xa3db3292), TOBN(0xe28a37da, 0xb536c66a), + TOBN(0x279ce87b, 0x8e49e6a9), TOBN(0x70ccfe8d, 0xfdcec8e3)}, + {TOBN(0x2193e4e0, 0x3ba464b2), TOBN(0x0f39d60e, 0xaca9a398), + TOBN(0x7d7932af, 0xf82c12ab), TOBN(0xd8ff50ed, 0x91e7e0f7)}}, + {{TOBN(0xea961058, 0xfa28a7e0), TOBN(0xc726cf25, 0x0bf5ec74), + TOBN(0xe74d55c8, 0xdb229666), TOBN(0x0bd9abbf, 0xa57f5799)}, + {TOBN(0x7479ef07, 0x4dfc47b3), TOBN(0xd9c65fc3, 0x0c52f91d), + TOBN(0x8e0283fe, 0x36a8bde2), TOBN(0xa32a8b5e, 0x7d4b7280)}}, + {{TOBN(0x6a677c61, 0x12e83233), TOBN(0x0fbb3512, 0xdcc9bf28), + TOBN(0x562e8ea5, 0x0d780f61), TOBN(0x0db8b22b, 0x1dc4e89c)}, + {TOBN(0x0a6fd1fb, 0x89be0144), TOBN(0x8c77d246, 0xca57113b), + TOBN(0x4639075d, 0xff09c91c), TOBN(0x5b47b17f, 0x5060824c)}}, + {{TOBN(0x58aea2b0, 0x16287b52), TOBN(0xa1343520, 0xd0cd8eb0), + TOBN(0x6148b4d0, 0xc5d58573), TOBN(0xdd2b6170, 0x291c68ae)}, + {TOBN(0xa61b3929, 0x1da3b3b7), TOBN(0x5f946d79, 0x08c4ac10), + TOBN(0x4105d4a5, 0x7217d583), TOBN(0x5061da3d, 0x25e6de5e)}}, + {{TOBN(0x3113940d, 0xec1b4991), TOBN(0xf12195e1, 0x36f485ae), + TOBN(0xa7507fb2, 0x731a2ee0), TOBN(0x95057a8e, 0x6e9e196e)}, + {TOBN(0xa3c2c911, 0x2e130136), TOBN(0x97dfbb36, 0x33c60d15), + TOBN(0xcaf3c581, 0xb300ee2b), TOBN(0x77f25d90, 0xf4bac8b8)}}, + {{TOBN(0xdb1c4f98, 0x6d840cd6), TOBN(0x471d62c0, 0xe634288c), + TOBN(0x8ec2f85e, 0xcec8a161), TOBN(0x41f37cbc, 0xfa6f4ae2)}, + {TOBN(0x6793a20f, 0x4b709985), TOBN(0x7a7bd33b, 0xefa8985b), + TOBN(0x2c6a3fbd, 0x938e6446), TOBN(0x19042619, 0x2a8d47c1)}}, + {{TOBN(0x16848667, 0xcc36975f), TOBN(0x02acf168, 0x9d5f1dfb), + TOBN(0x62d41ad4, 0x613baa94), TOBN(0xb56fbb92, 0x9f684670)}, + {TOBN(0xce610d0d, 0xe9e40569), TOBN(0x7b99c65f, 0x35489fef), + TOBN(0x0c88ad1b, 0x3df18b97), TOBN(0x81b7d9be, 0x5d0e9edb)}}, + {{TOBN(0xd85218c0, 0xc716cc0a), TOBN(0xf4b5ff90, 0x85691c49), + TOBN(0xa4fd666b, 0xce356ac6), TOBN(0x17c72895, 0x4b327a7a)}, + {TOBN(0xf93d5085, 0xda6be7de), TOBN(0xff71530e, 0x3301d34e), + TOBN(0x4cd96442, 0xd8f448e8), TOBN(0x9283d331, 0x2ed18ffa)}}, + {{TOBN(0x4d33dd99, 0x2a849870), TOBN(0xa716964b, 0x41576335), + TOBN(0xff5e3a9b, 0x179be0e5), TOBN(0x5b9d6b1b, 0x83b13632)}, + {TOBN(0x3b8bd7d4, 0xa52f313b), TOBN(0xc9dd95a0, 0x637a4660), + TOBN(0x30035962, 0x0b3e218f), TOBN(0xce1481a3, 0xc7b28a3c)}}, + {{TOBN(0xab41b43a, 0x43228d83), TOBN(0x24ae1c30, 0x4ad63f99), + TOBN(0x8e525f1a, 0x46a51229), TOBN(0x14af860f, 0xcd26d2b4)}, + {TOBN(0xd6baef61, 0x3f714aa1), TOBN(0xf51865ad, 0xeb78795e), + TOBN(0xd3e21fce, 0xe6a9d694), TOBN(0x82ceb1dd, 0x8a37b527)}}}}; diff --git a/ring-0.17.14/crypto/fipsmodule/ec/p256-nistz.c b/ring-0.17.14/crypto/fipsmodule/ec/p256-nistz.c new file mode 100644 index 0000000000..7ae2ffda77 --- /dev/null +++ b/ring-0.17.14/crypto/fipsmodule/ec/p256-nistz.c @@ -0,0 +1,437 @@ +// Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved. +// Copyright (c) 2014, Intel Corporation. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1) +// (1) Intel Corporation, Israel Development Center, Haifa, Israel +// (2) University of Haifa, Israel +// +// Reference: +// S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with +// 256 Bit Primes" + +#include + +#include "../../limbs/limbs.inl" + +#include + +#include "p256-nistz.h" + +#if defined(OPENSSL_USE_NISTZ256) + +typedef P256_POINT_AFFINE PRECOMP256_ROW[64]; + +// One converted into the Montgomery domain +static const BN_ULONG ONE_MONT[P256_LIMBS] = { + TOBN(0x00000000, 0x00000001), + TOBN(0xffffffff, 0x00000000), + TOBN(0xffffffff, 0xffffffff), + TOBN(0x00000000, 0xfffffffe), +}; + +// Precomputed tables for the default generator +#include "p256-nistz-table.h" + +// Recode window to a signed digit, see |ec_GFp_nistp_recode_scalar_bits| in +// util.c for details +static crypto_word_t booth_recode_w5(crypto_word_t in) { + crypto_word_t s, d; + + s = ~((in >> 5) - 1); + d = (1 << 6) - in - 1; + d = (d & s) | (in & ~s); + d = (d >> 1) + (d & 1); + + return (d << 1) + (s & 1); +} + +static crypto_word_t booth_recode_w7(crypto_word_t in) { + crypto_word_t s, d; + + s = ~((in >> 7) - 1); + d = (1 << 8) - in - 1; + d = (d & s) | (in & ~s); + d = (d >> 1) + (d & 1); + + return (d << 1) + (s & 1); +} + +// The `(P256_LIMBS == 8)` case is unreachable for 64-bit targets. +#if defined(OPENSSL_64_BIT) && defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunreachable-code" +#endif + +// copy_conditional copies |src| to |dst| if |move| is one and leaves it as-is +// if |move| is zero. +// +// WARNING: this breaks the usual convention of constant-time functions +// returning masks. +static void copy_conditional(BN_ULONG dst[P256_LIMBS], + const BN_ULONG src[P256_LIMBS], BN_ULONG move) { + BN_ULONG mask1 = ((BN_ULONG)0) - move; + BN_ULONG mask2 = ~mask1; + + dst[0] = (src[0] & mask1) ^ (dst[0] & mask2); + dst[1] = (src[1] & mask1) ^ (dst[1] & mask2); + dst[2] = (src[2] & mask1) ^ (dst[2] & mask2); + dst[3] = (src[3] & mask1) ^ (dst[3] & mask2); + if (P256_LIMBS == 8) { + dst[4] = (src[4] & mask1) ^ (dst[4] & mask2); + dst[5] = (src[5] & mask1) ^ (dst[5] & mask2); + dst[6] = (src[6] & mask1) ^ (dst[6] & mask2); + dst[7] = (src[7] & mask1) ^ (dst[7] & mask2); + } +} + +#if defined(__clang__) +#pragma GCC diagnostic pop +#endif + +// is_not_zero returns one iff in != 0 and zero otherwise. +// +// WARNING: this breaks the usual convention of constant-time functions +// returning masks. +// +// (define-fun is_not_zero ((in (_ BitVec 64))) (_ BitVec 64) +// (bvlshr (bvor in (bvsub #x0000000000000000 in)) #x000000000000003f) +// ) +// +// (declare-fun x () (_ BitVec 64)) +// +// (assert (and (= x #x0000000000000000) (= (is_not_zero x) +// #x0000000000000001))) (check-sat) +// +// (assert (and (not (= x #x0000000000000000)) (= (is_not_zero x) +// #x0000000000000000))) (check-sat) +// +static BN_ULONG is_not_zero(BN_ULONG in) { + in |= (0 - in); + in >>= BN_BITS2 - 1; + return in; +} + +#if defined(OPENSSL_X86_64) +// Dispatch between CPU variations. 
The "_adx" suffixed functions use MULX in +// addition to ADCX/ADOX. MULX is part of BMI2, not ADX, so we must check both +// capabilities. + void ecp_nistz256_mul_mont(BN_ULONG res[P256_LIMBS], + const BN_ULONG a[P256_LIMBS], + const BN_ULONG b[P256_LIMBS]) { + if (adx_bmi2_available) { + ecp_nistz256_mul_mont_adx(res, a, b); + } else { + ecp_nistz256_mul_mont_nohw(res, a, b); + } +} + + void ecp_nistz256_sqr_mont(BN_ULONG res[P256_LIMBS], + const BN_ULONG a[P256_LIMBS]) { + if (adx_bmi2_available) { + ecp_nistz256_sqr_mont_adx(res, a); + } else { + ecp_nistz256_sqr_mont_nohw(res, a); + } +} + + void ecp_nistz256_ord_mul_mont(BN_ULONG res[P256_LIMBS], + const BN_ULONG a[P256_LIMBS], + const BN_ULONG b[P256_LIMBS]) { + if (adx_bmi2_available) { + ecp_nistz256_ord_mul_mont_adx(res, a, b); + } else { + ecp_nistz256_ord_mul_mont_nohw(res, a, b); + } +} + + void ecp_nistz256_ord_sqr_mont(BN_ULONG res[P256_LIMBS], + const BN_ULONG a[P256_LIMBS], + BN_ULONG rep) { + if (adx_bmi2_available) { + ecp_nistz256_ord_sqr_mont_adx(res, a, rep); + } else { + ecp_nistz256_ord_sqr_mont_nohw(res, a, rep); + } +} + +static void ecp_nistz256_select_w5(P256_POINT *val, const P256_POINT in_t[16], + int index) { + if (avx2_available) { + ecp_nistz256_select_w5_avx2(val, in_t, index); + } else { + ecp_nistz256_select_w5_nohw(val, in_t, index); + } +} + +static void ecp_nistz256_select_w7(P256_POINT_AFFINE *val, + const P256_POINT_AFFINE in_t[64], + int index) { + if (avx2_available) { + ecp_nistz256_select_w7_avx2(val, in_t, index); + } else { + ecp_nistz256_select_w7_nohw(val, in_t, index); + } +} + + void ecp_nistz256_point_double(P256_POINT *r, const P256_POINT *a) { + if (adx_bmi2_available) { + ecp_nistz256_point_double_adx(r, a); + } else { + ecp_nistz256_point_double_nohw(r, a); + } +} + + void ecp_nistz256_point_add(P256_POINT *r, const P256_POINT *a, + const P256_POINT *b) { + if (adx_bmi2_available) { + ecp_nistz256_point_add_adx(r, a, b); + } else { + ecp_nistz256_point_add_nohw(r, a, b); + } +} + + void ecp_nistz256_point_add_affine(P256_POINT *r, const P256_POINT *a, + const P256_POINT_AFFINE *b) { + if (adx_bmi2_available) { + ecp_nistz256_point_add_affine_adx(r, a, b); + } else { + ecp_nistz256_point_add_affine_nohw(r, a, b); + } +} +#endif // OPENSSL_X86_64 + +// r = p * p_scalar +static void ecp_nistz256_windowed_mul(P256_POINT *r, + const BN_ULONG p_scalar[P256_LIMBS], + const BN_ULONG p_x[P256_LIMBS], + const BN_ULONG p_y[P256_LIMBS]) { + debug_assert_nonsecret(r != NULL); + debug_assert_nonsecret(p_scalar != NULL); + debug_assert_nonsecret(p_x != NULL); + debug_assert_nonsecret(p_y != NULL); + + static const size_t kWindowSize = 5; + static const crypto_word_t kMask = (1 << (5 /* kWindowSize */ + 1)) - 1; + + // A |P256_POINT| is (3 * 32) = 96 bytes, and the 64-byte alignment should + // add no more than 63 bytes of overhead. Thus, |table| should require + // ~1599 ((96 * 16) + 63) bytes of stack space. + alignas(64) P256_POINT table[16]; + P256_SCALAR_BYTES p_str; + p256_scalar_bytes_from_limbs(p_str, p_scalar); + + // table[0] is implicitly (0,0,0) (the point at infinity), therefore it is + // not stored. All other values are actually stored with an offset of -1 in + // table. 
+ P256_POINT *row = table; + + limbs_copy(row[1 - 1].X, p_x, P256_LIMBS); + limbs_copy(row[1 - 1].Y, p_y, P256_LIMBS); + limbs_copy(row[1 - 1].Z, ONE_MONT, P256_LIMBS); + + ecp_nistz256_point_double(&row[2 - 1], &row[1 - 1]); + ecp_nistz256_point_add(&row[3 - 1], &row[2 - 1], &row[1 - 1]); + ecp_nistz256_point_double(&row[4 - 1], &row[2 - 1]); + ecp_nistz256_point_double(&row[6 - 1], &row[3 - 1]); + ecp_nistz256_point_double(&row[8 - 1], &row[4 - 1]); + ecp_nistz256_point_double(&row[12 - 1], &row[6 - 1]); + ecp_nistz256_point_add(&row[5 - 1], &row[4 - 1], &row[1 - 1]); + ecp_nistz256_point_add(&row[7 - 1], &row[6 - 1], &row[1 - 1]); + ecp_nistz256_point_add(&row[9 - 1], &row[8 - 1], &row[1 - 1]); + ecp_nistz256_point_add(&row[13 - 1], &row[12 - 1], &row[1 - 1]); + ecp_nistz256_point_double(&row[14 - 1], &row[7 - 1]); + ecp_nistz256_point_double(&row[10 - 1], &row[5 - 1]); + ecp_nistz256_point_add(&row[15 - 1], &row[14 - 1], &row[1 - 1]); + ecp_nistz256_point_add(&row[11 - 1], &row[10 - 1], &row[1 - 1]); + ecp_nistz256_point_double(&row[16 - 1], &row[8 - 1]); + + BN_ULONG tmp[P256_LIMBS]; + alignas(32) P256_POINT h; + size_t index = 255; + crypto_word_t wvalue = p_str[(index - 1) / 8]; + wvalue = (wvalue >> ((index - 1) % 8)) & kMask; + + ecp_nistz256_select_w5(r, table, (int)(booth_recode_w5(wvalue) >> 1)); + + while (index >= 5) { + if (index != 255) { + size_t off = (index - 1) / 8; + + wvalue = (crypto_word_t)p_str[off] | (crypto_word_t)p_str[off + 1] << 8; + wvalue = (wvalue >> ((index - 1) % 8)) & kMask; + + wvalue = booth_recode_w5(wvalue); + + ecp_nistz256_select_w5(&h, table, (int)(wvalue >> 1)); + + ecp_nistz256_neg(tmp, h.Y); + copy_conditional(h.Y, tmp, (wvalue & 1)); + + ecp_nistz256_point_add(r, r, &h); + } + + index -= kWindowSize; + + ecp_nistz256_point_double(r, r); + ecp_nistz256_point_double(r, r); + ecp_nistz256_point_double(r, r); + ecp_nistz256_point_double(r, r); + ecp_nistz256_point_double(r, r); + } + + // Final window + wvalue = p_str[0]; + wvalue = (wvalue << 1) & kMask; + + wvalue = booth_recode_w5(wvalue); + + ecp_nistz256_select_w5(&h, table, (int)(wvalue >> 1)); + + ecp_nistz256_neg(tmp, h.Y); + copy_conditional(h.Y, tmp, wvalue & 1); + + ecp_nistz256_point_add(r, r, &h); +} + +static crypto_word_t calc_first_wvalue(size_t *index, const uint8_t p_str[33]) { + static const size_t kWindowSize = 7; + static const crypto_word_t kMask = (1 << (7 /* kWindowSize */ + 1)) - 1; + *index = kWindowSize; + + crypto_word_t wvalue = ((crypto_word_t)p_str[0] << 1) & kMask; + return booth_recode_w7(wvalue); +} + +static crypto_word_t calc_wvalue(size_t *index, const uint8_t p_str[33]) { + static const size_t kWindowSize = 7; + static const crypto_word_t kMask = (1 << (7 /* kWindowSize */ + 1)) - 1; + + const size_t off = (*index - 1) / 8; + crypto_word_t wvalue = + (crypto_word_t)p_str[off] | (crypto_word_t)p_str[off + 1] << 8; + wvalue = (wvalue >> ((*index - 1) % 8)) & kMask; + *index += kWindowSize; + + return booth_recode_w7(wvalue); +} + +void p256_point_mul(Limb r[3][P256_LIMBS], const Limb p_scalar[P256_LIMBS], + const Limb p_x[P256_LIMBS], + const Limb p_y[P256_LIMBS]) { + alignas(32) P256_POINT out; + ecp_nistz256_windowed_mul(&out, p_scalar, p_x, p_y); + + limbs_copy(r[0], out.X, P256_LIMBS); + limbs_copy(r[1], out.Y, P256_LIMBS); + limbs_copy(r[2], out.Z, P256_LIMBS); +} + +void p256_point_mul_base(Limb r[3][P256_LIMBS], const Limb scalar[P256_LIMBS]) { + P256_SCALAR_BYTES p_str; + p256_scalar_bytes_from_limbs(p_str, scalar); + + // First window + size_t index = 
0; + crypto_word_t wvalue = calc_first_wvalue(&index, p_str); + + alignas(32) P256_POINT_AFFINE t; + alignas(32) P256_POINT p; + ecp_nistz256_select_w7(&t, ecp_nistz256_precomputed[0], (int)(wvalue >> 1)); + ecp_nistz256_neg(p.Z, t.Y); + copy_conditional(t.Y, p.Z, wvalue & 1); + + // Convert |t| from affine to Jacobian coordinates. We set Z to zero if |t| + // is infinity and |ONE| otherwise. |t| was computed from the table, so it + // is infinity iff |wvalue >> 1| is zero. + limbs_copy(p.X, t.X, P256_LIMBS); + limbs_copy(p.Y, t.Y, P256_LIMBS); + limbs_zero(p.Z, P256_LIMBS); + copy_conditional(p.Z, ONE_MONT, is_not_zero(wvalue >> 1)); + + for (int i = 1; i < 37; i++) { + wvalue = calc_wvalue(&index, p_str); + + ecp_nistz256_select_w7(&t, ecp_nistz256_precomputed[i], (int)(wvalue >> 1)); + + alignas(32) BN_ULONG neg_Y[P256_LIMBS]; + ecp_nistz256_neg(neg_Y, t.Y); + copy_conditional(t.Y, neg_Y, wvalue & 1); + + // Note |ecp_nistz256_point_add_affine| does not work if |p| and |t| are the + // same non-infinity point. + ecp_nistz256_point_add_affine(&p, &p, &t); + } + + limbs_copy(r[0], p.X, P256_LIMBS); + limbs_copy(r[1], p.Y, P256_LIMBS); + limbs_copy(r[2], p.Z, P256_LIMBS); +} + +void p256_point_mul_base_vartime(Limb r[3][P256_LIMBS], + const Limb g_scalar[P256_LIMBS]) { + alignas(32) P256_POINT p; + uint8_t p_str[33]; + OPENSSL_memcpy(p_str, g_scalar, 32); + p_str[32] = 0; + + // First window + size_t index = 0; + size_t wvalue = calc_first_wvalue(&index, p_str); + + // Convert |p| from affine to Jacobian coordinates. We set Z to zero if |p| + // is infinity and |ONE_MONT| otherwise. |p| was computed from the table, so + // it is infinity iff |wvalue >> 1| is zero. + if ((wvalue >> 1) != 0) { + OPENSSL_memcpy(p.X, &ecp_nistz256_precomputed[0][(wvalue >> 1) - 1].X, + sizeof(p.X)); + OPENSSL_memcpy(p.Y, &ecp_nistz256_precomputed[0][(wvalue >> 1) - 1].Y, + sizeof(p.Y)); + OPENSSL_memcpy(p.Z, ONE_MONT, sizeof(p.Z)); + } else { + OPENSSL_memset(p.X, 0, sizeof(p.X)); + OPENSSL_memset(p.Y, 0, sizeof(p.Y)); + OPENSSL_memset(p.Z, 0, sizeof(p.Z)); + } + + if ((wvalue & 1) == 1) { + ecp_nistz256_neg(p.Y, p.Y); + } + + for (int i = 1; i < 37; i++) { + wvalue = calc_wvalue(&index, p_str); + if ((wvalue >> 1) == 0) { + continue; + } + + alignas(32) P256_POINT_AFFINE t; + OPENSSL_memcpy(&t, &ecp_nistz256_precomputed[i][(wvalue >> 1) - 1], + sizeof(t)); + if ((wvalue & 1) == 1) { + ecp_nistz256_neg(t.Y, t.Y); + } + + // Note |ecp_nistz256_point_add_affine| does not work if |p| and |t| are + // the same non-infinity point, so it is important that we compute the + // |g_scalar| term before the |p_scalar| term. + ecp_nistz256_point_add_affine(&p, &p, &t); + } + + + limbs_copy(r[0], p.X, P256_LIMBS); + limbs_copy(r[1], p.Y, P256_LIMBS); + limbs_copy(r[2], p.Z, P256_LIMBS); +} + +#endif /* defined(OPENSSL_USE_NISTZ256) */ diff --git a/ring-0.17.14/crypto/fipsmodule/ec/p256-nistz.h b/ring-0.17.14/crypto/fipsmodule/ec/p256-nistz.h new file mode 100644 index 0000000000..ff01f501f7 --- /dev/null +++ b/ring-0.17.14/crypto/fipsmodule/ec/p256-nistz.h @@ -0,0 +1,171 @@ +// Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved. +// Copyright (c) 2014, Intel Corporation. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1) +// (1) Intel Corporation, Israel Development Center, Haifa, Israel +// (2) University of Haifa, Israel +// +// Reference: +// S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with +// 256 Bit Primes" + +#ifndef OPENSSL_HEADER_EC_P256_X86_64_H +#define OPENSSL_HEADER_EC_P256_X86_64_H + +#include + +#include "p256_shared.h" + +#include "../bn/internal.h" + +#if defined(OPENSSL_USE_NISTZ256) + +// ecp_nistz256_neg sets |res| to -|a| mod P. +void ecp_nistz256_neg(BN_ULONG res[P256_LIMBS], const BN_ULONG a[P256_LIMBS]); + +// ecp_nistz256_mul_mont sets |res| to |a| * |b| * 2^-256 mod P. +#if defined(OPENSSL_X86_64) +void ecp_nistz256_mul_mont_nohw(BN_ULONG res[P256_LIMBS], + const BN_ULONG a[P256_LIMBS], + const BN_ULONG b[P256_LIMBS]); +void ecp_nistz256_mul_mont_adx(BN_ULONG res[P256_LIMBS], + const BN_ULONG a[P256_LIMBS], + const BN_ULONG b[P256_LIMBS]); +#else +void ecp_nistz256_mul_mont(BN_ULONG res[P256_LIMBS], + const BN_ULONG a[P256_LIMBS], + const BN_ULONG b[P256_LIMBS]); +#endif + +// ecp_nistz256_sqr_mont sets |res| to |a| * |a| * 2^-256 mod P. +#if defined(OPENSSL_X86_64) +void ecp_nistz256_sqr_mont_nohw(BN_ULONG res[P256_LIMBS], + const BN_ULONG a[P256_LIMBS]); +void ecp_nistz256_sqr_mont_adx(BN_ULONG res[P256_LIMBS], + const BN_ULONG a[P256_LIMBS]); +#else +void ecp_nistz256_sqr_mont(BN_ULONG res[P256_LIMBS], + const BN_ULONG a[P256_LIMBS]); +#endif + + +// P-256 scalar operations. +// +// The following functions compute modulo N, where N is the order of P-256. They +// take fully-reduced inputs and give fully-reduced outputs. + +// ecp_nistz256_ord_mul_mont sets |res| to |a| * |b| where inputs and outputs +// are in Montgomery form. That is, |res| is |a| * |b| * 2^-256 mod N. +#if defined(OPENSSL_X86_64) +void ecp_nistz256_ord_mul_mont_nohw(BN_ULONG res[P256_LIMBS], + const BN_ULONG a[P256_LIMBS], + const BN_ULONG b[P256_LIMBS]); +void ecp_nistz256_ord_mul_mont_adx(BN_ULONG res[P256_LIMBS], + const BN_ULONG a[P256_LIMBS], + const BN_ULONG b[P256_LIMBS]); +#else +void ecp_nistz256_ord_mul_mont(BN_ULONG res[P256_LIMBS], + const BN_ULONG a[P256_LIMBS], + const BN_ULONG b[P256_LIMBS]); +#endif + +// ecp_nistz256_ord_sqr_mont sets |res| to |a|^(2*|rep|) where inputs and +// outputs are in Montgomery form. That is, |res| is +// (|a| * 2^-256)^(2*|rep|) * 2^256 mod N. +#if defined(OPENSSL_X86_64) +void ecp_nistz256_ord_sqr_mont_nohw(BN_ULONG res[P256_LIMBS], + const BN_ULONG a[P256_LIMBS], BN_ULONG rep); +void ecp_nistz256_ord_sqr_mont_adx(BN_ULONG res[P256_LIMBS], + const BN_ULONG a[P256_LIMBS], BN_ULONG rep); +#else +void ecp_nistz256_ord_sqr_mont(BN_ULONG res[P256_LIMBS], + const BN_ULONG a[P256_LIMBS], BN_ULONG rep); +#endif + + + +// P-256 point operations. +// +// The following functions may be used in-place. All coordinates are in the +// Montgomery domain. + +// A P256_POINT_AFFINE represents a P-256 point in affine coordinates. Infinity +// is encoded as (0, 0). 
+typedef struct { + BN_ULONG X[P256_LIMBS]; + BN_ULONG Y[P256_LIMBS]; +} P256_POINT_AFFINE; + +// ecp_nistz256_select_w5 sets |*val| to |in_t[index-1]| if 1 <= |index| <= 16 +// and all zeros (the point at infinity) if |index| is 0. This is done in +// constant time. +#if defined(OPENSSL_X86_64) +void ecp_nistz256_select_w5_nohw(P256_POINT *val, const P256_POINT in_t[16], + int index); +void ecp_nistz256_select_w5_avx2(P256_POINT *val, const P256_POINT in_t[16], + int index); +#else +void ecp_nistz256_select_w5(P256_POINT *val, const P256_POINT in_t[16], + int index); +#endif + +// ecp_nistz256_select_w7 sets |*val| to |in_t[index-1]| if 1 <= |index| <= 64 +// and all zeros (the point at infinity) if |index| is 0. This is done in +// constant time. +#if defined(OPENSSL_X86_64) +void ecp_nistz256_select_w7_nohw(P256_POINT_AFFINE *val, + const P256_POINT_AFFINE in_t[64], int index); +void ecp_nistz256_select_w7_avx2(P256_POINT_AFFINE *val, + const P256_POINT_AFFINE in_t[64], int index); +#else +void ecp_nistz256_select_w7(P256_POINT_AFFINE *val, + const P256_POINT_AFFINE in_t[64], int index); +#endif + +// ecp_nistz256_point_double sets |r| to |a| doubled. +#if defined(OPENSSL_X86_64) +void ecp_nistz256_point_double_nohw(P256_POINT *r, const P256_POINT *a); +void ecp_nistz256_point_double_adx(P256_POINT *r, const P256_POINT *a); +#else +void ecp_nistz256_point_double(P256_POINT *r, const P256_POINT *a); +#endif + +// ecp_nistz256_point_add adds |a| to |b| and places the result in |r|. +#if defined(OPENSSL_X86_64) +void ecp_nistz256_point_add_nohw(P256_POINT *r, const P256_POINT *a, + const P256_POINT *b); +void ecp_nistz256_point_add_adx(P256_POINT *r, const P256_POINT *a, + const P256_POINT *b); +#else +void ecp_nistz256_point_add(P256_POINT *r, const P256_POINT *a, + const P256_POINT *b); +#endif + +// ecp_nistz256_point_add_affine adds |a| to |b| and places the result in +// |r|. |a| and |b| must not represent the same point unless they are both +// infinity. +#if defined(OPENSSL_X86_64) +void ecp_nistz256_point_add_affine_adx(P256_POINT *r, const P256_POINT *a, + const P256_POINT_AFFINE *b); +void ecp_nistz256_point_add_affine_nohw(P256_POINT *r, const P256_POINT *a, + const P256_POINT_AFFINE *b); +#else +void ecp_nistz256_point_add_affine(P256_POINT *r, const P256_POINT *a, + const P256_POINT_AFFINE *b); +#endif + +#endif /* defined(OPENSSL_USE_NISTZ256) */ + +#endif // OPENSSL_HEADER_EC_P256_X86_64_H diff --git a/ring-0.17.14/crypto/fipsmodule/ec/p256.c b/ring-0.17.14/crypto/fipsmodule/ec/p256.c new file mode 100644 index 0000000000..0117916dab --- /dev/null +++ b/ring-0.17.14/crypto/fipsmodule/ec/p256.c @@ -0,0 +1,539 @@ +// Copyright 2020 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// An implementation of the NIST P-256 elliptic curve point multiplication. +// 256-bit Montgomery form for 64 and 32-bit. Field operations are generated by +// Fiat, which lives in //third_party/fiat. 
+ +#include + +#include "../../limbs/limbs.h" +#include "../../limbs/limbs.inl" + +#include "p256_shared.h" + +#include "../../internal.h" +#include "./util.h" + +#if !defined(OPENSSL_USE_NISTZ256) + +#if defined(_MSC_VER) && !defined(__clang__) +// '=': conversion from 'int64_t' to 'int32_t', possible loss of data +#pragma warning(disable: 4242) +// '=': conversion from 'int32_t' to 'uint8_t', possible loss of data +#pragma warning(disable: 4244) +// 'initializing': conversion from 'size_t' to 'fiat_p256_limb_t' +#pragma warning(disable: 4267) +#endif + +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic ignored "-Wconversion" +#pragma GCC diagnostic ignored "-Wsign-conversion" +#endif + +#if defined(__GNUC__) && !defined(__clang__) +#pragma GCC diagnostic ignored "-Winline" +#endif + +#if defined(BORINGSSL_HAS_UINT128) +#if defined(__GNUC__) +#pragma GCC diagnostic ignored "-Wpedantic" +#endif +#include "../../../third_party/fiat/p256_64.h" +#elif defined(OPENSSL_64_BIT) +#include "../../../third_party/fiat/p256_64_msvc.h" +#else +#include "../../../third_party/fiat/p256_32.h" +#endif + + +// utility functions, handwritten + +#if defined(OPENSSL_64_BIT) +#define FIAT_P256_NLIMBS 4 +typedef uint64_t fiat_p256_limb_t; +typedef uint64_t fiat_p256_felem[FIAT_P256_NLIMBS]; +static const fiat_p256_felem fiat_p256_one = {0x1, 0xffffffff00000000, + 0xffffffffffffffff, 0xfffffffe}; +#else // 64BIT; else 32BIT +#define FIAT_P256_NLIMBS 8 +typedef uint32_t fiat_p256_limb_t; +typedef uint32_t fiat_p256_felem[FIAT_P256_NLIMBS]; +static const fiat_p256_felem fiat_p256_one = { + 0x1, 0x0, 0x0, 0xffffffff, 0xffffffff, 0xffffffff, 0xfffffffe, 0x0}; +#endif // 64BIT + + +static fiat_p256_limb_t fiat_p256_nz( + const fiat_p256_limb_t in1[FIAT_P256_NLIMBS]) { + fiat_p256_limb_t ret; + fiat_p256_nonzero(&ret, in1); + return ret; +} + +static void fiat_p256_copy(fiat_p256_limb_t out[FIAT_P256_NLIMBS], + const fiat_p256_limb_t in1[FIAT_P256_NLIMBS]) { + for (size_t i = 0; i < FIAT_P256_NLIMBS; i++) { + out[i] = in1[i]; + } +} + +static void fiat_p256_cmovznz(fiat_p256_limb_t out[FIAT_P256_NLIMBS], + fiat_p256_limb_t t, + const fiat_p256_limb_t z[FIAT_P256_NLIMBS], + const fiat_p256_limb_t nz[FIAT_P256_NLIMBS]) { + fiat_p256_selectznz(out, !!t, z, nz); +} + +static void fiat_p256_from_words(fiat_p256_felem out, + const Limb in[32 / sizeof(BN_ULONG)]) { + // Typically, |BN_ULONG| and |fiat_p256_limb_t| will be the same type, but on + // 64-bit platforms without |uint128_t|, they are different. However, on + // little-endian systems, |uint64_t[4]| and |uint32_t[8]| have the same + // layout. + OPENSSL_memcpy(out, in, 32); +} + +static void fiat_p256_to_words(Limb out[32 / sizeof(BN_ULONG)], const fiat_p256_felem in) { + // See |fiat_p256_from_words|. + OPENSSL_memcpy(out, in, 32); +} + + +// Group operations +// ---------------- +// +// Building on top of the field operations we have the operations on the +// elliptic curve group itself. Points on the curve are represented in Jacobian +// coordinates. +// +// Both operations were transcribed to Coq and proven to correspond to naive +// implementations using Affine coordinates, for all suitable fields. In the +// Coq proofs, issues of constant-time execution and memory layout (aliasing) +// conventions were not considered. 
Specification of affine coordinates: +// +// As a sanity check, a proof that these points form a commutative group: +// + +// fiat_p256_point_double calculates 2*(x_in, y_in, z_in) +// +// The method is taken from: +// http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b +// +// Coq transcription and correctness proof: +// +// +// +// Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed. +// while x_out == y_in is not (maybe this works, but it's not tested). +static void fiat_p256_point_double(fiat_p256_felem x_out, fiat_p256_felem y_out, + fiat_p256_felem z_out, + const fiat_p256_felem x_in, + const fiat_p256_felem y_in, + const fiat_p256_felem z_in) { + fiat_p256_felem delta, gamma, beta, ftmp, ftmp2, tmptmp, alpha, fourbeta; + // delta = z^2 + fiat_p256_square(delta, z_in); + // gamma = y^2 + fiat_p256_square(gamma, y_in); + // beta = x*gamma + fiat_p256_mul(beta, x_in, gamma); + + // alpha = 3*(x-delta)*(x+delta) + fiat_p256_sub(ftmp, x_in, delta); + fiat_p256_add(ftmp2, x_in, delta); + + fiat_p256_add(tmptmp, ftmp2, ftmp2); + fiat_p256_add(ftmp2, ftmp2, tmptmp); + fiat_p256_mul(alpha, ftmp, ftmp2); + + // x' = alpha^2 - 8*beta + fiat_p256_square(x_out, alpha); + fiat_p256_add(fourbeta, beta, beta); + fiat_p256_add(fourbeta, fourbeta, fourbeta); + fiat_p256_add(tmptmp, fourbeta, fourbeta); + fiat_p256_sub(x_out, x_out, tmptmp); + + // z' = (y + z)^2 - gamma - delta + fiat_p256_add(delta, gamma, delta); + fiat_p256_add(ftmp, y_in, z_in); + fiat_p256_square(z_out, ftmp); + fiat_p256_sub(z_out, z_out, delta); + + // y' = alpha*(4*beta - x') - 8*gamma^2 + fiat_p256_sub(y_out, fourbeta, x_out); + fiat_p256_add(gamma, gamma, gamma); + fiat_p256_square(gamma, gamma); + fiat_p256_mul(y_out, alpha, y_out); + fiat_p256_add(gamma, gamma, gamma); + fiat_p256_sub(y_out, y_out, gamma); +} + +// fiat_p256_point_add calculates (x1, y1, z1) + (x2, y2, z2) +// +// The method is taken from: +// http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl, +// adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity). +// +// Coq transcription and correctness proof: +// +// +// +// This function includes a branch for checking whether the two input points +// are equal, (while not equal to the point at infinity). This case never +// happens during single point multiplication, so there is no timing leak for +// ECDH or ECDSA signing. +static void fiat_p256_point_add(fiat_p256_felem x3, fiat_p256_felem y3, + fiat_p256_felem z3, const fiat_p256_felem x1, + const fiat_p256_felem y1, + const fiat_p256_felem z1, const int mixed, + const fiat_p256_felem x2, + const fiat_p256_felem y2, + const fiat_p256_felem z2) { + fiat_p256_felem x_out, y_out, z_out; + fiat_p256_limb_t z1nz = fiat_p256_nz(z1); + fiat_p256_limb_t z2nz = fiat_p256_nz(z2); + + // z1z1 = z1z1 = z1**2 + fiat_p256_felem z1z1; + fiat_p256_square(z1z1, z1); + + fiat_p256_felem u1, s1, two_z1z2; + if (!mixed) { + // z2z2 = z2**2 + fiat_p256_felem z2z2; + fiat_p256_square(z2z2, z2); + + // u1 = x1*z2z2 + fiat_p256_mul(u1, x1, z2z2); + + // two_z1z2 = (z1 + z2)**2 - (z1z1 + z2z2) = 2z1z2 + fiat_p256_add(two_z1z2, z1, z2); + fiat_p256_square(two_z1z2, two_z1z2); + fiat_p256_sub(two_z1z2, two_z1z2, z1z1); + fiat_p256_sub(two_z1z2, two_z1z2, z2z2); + + // s1 = y1 * z2**3 + fiat_p256_mul(s1, z2, z2z2); + fiat_p256_mul(s1, s1, y1); + } else { + // We'll assume z2 = 1 (special case z2 = 0 is handled later). 
+ + // u1 = x1*z2z2 + fiat_p256_copy(u1, x1); + // two_z1z2 = 2z1z2 + fiat_p256_add(two_z1z2, z1, z1); + // s1 = y1 * z2**3 + fiat_p256_copy(s1, y1); + } + + // u2 = x2*z1z1 + fiat_p256_felem u2; + fiat_p256_mul(u2, x2, z1z1); + + // h = u2 - u1 + fiat_p256_felem h; + fiat_p256_sub(h, u2, u1); + + fiat_p256_limb_t xneq = fiat_p256_nz(h); + + // z_out = two_z1z2 * h + fiat_p256_mul(z_out, h, two_z1z2); + + // z1z1z1 = z1 * z1z1 + fiat_p256_felem z1z1z1; + fiat_p256_mul(z1z1z1, z1, z1z1); + + // s2 = y2 * z1**3 + fiat_p256_felem s2; + fiat_p256_mul(s2, y2, z1z1z1); + + // r = (s2 - s1)*2 + fiat_p256_felem r; + fiat_p256_sub(r, s2, s1); + fiat_p256_add(r, r, r); + + fiat_p256_limb_t yneq = fiat_p256_nz(r); + + fiat_p256_limb_t is_nontrivial_double = constant_time_is_zero_w(xneq | yneq) & + ~constant_time_is_zero_w(z1nz) & + ~constant_time_is_zero_w(z2nz); + if (constant_time_declassify_w(is_nontrivial_double)) { + fiat_p256_point_double(x3, y3, z3, x1, y1, z1); + return; + } + + // I = (2h)**2 + fiat_p256_felem i; + fiat_p256_add(i, h, h); + fiat_p256_square(i, i); + + // J = h * I + fiat_p256_felem j; + fiat_p256_mul(j, h, i); + + // V = U1 * I + fiat_p256_felem v; + fiat_p256_mul(v, u1, i); + + // x_out = r**2 - J - 2V + fiat_p256_square(x_out, r); + fiat_p256_sub(x_out, x_out, j); + fiat_p256_sub(x_out, x_out, v); + fiat_p256_sub(x_out, x_out, v); + + // y_out = r(V-x_out) - 2 * s1 * J + fiat_p256_sub(y_out, v, x_out); + fiat_p256_mul(y_out, y_out, r); + fiat_p256_felem s1j; + fiat_p256_mul(s1j, s1, j); + fiat_p256_sub(y_out, y_out, s1j); + fiat_p256_sub(y_out, y_out, s1j); + + fiat_p256_cmovznz(x_out, z1nz, x2, x_out); + fiat_p256_cmovznz(x3, z2nz, x1, x_out); + fiat_p256_cmovznz(y_out, z1nz, y2, y_out); + fiat_p256_cmovznz(y3, z2nz, y1, y_out); + fiat_p256_cmovznz(z_out, z1nz, z2, z_out); + fiat_p256_cmovznz(z3, z2nz, z1, z_out); +} + +#include "./p256_table.h" + +// fiat_p256_select_point_affine selects the |idx-1|th point from a +// precomputation table and copies it to out. If |idx| is zero, the output is +// the point at infinity. +static void fiat_p256_select_point_affine( + const fiat_p256_limb_t idx, size_t size, + const fiat_p256_felem pre_comp[/*size*/][2], fiat_p256_felem out[3]) { + OPENSSL_memset(out, 0, sizeof(fiat_p256_felem) * 3); + for (size_t i = 0; i < size; i++) { + fiat_p256_limb_t mismatch = i ^ (idx - 1); + fiat_p256_cmovznz(out[0], mismatch, pre_comp[i][0], out[0]); + fiat_p256_cmovznz(out[1], mismatch, pre_comp[i][1], out[1]); + } + fiat_p256_cmovznz(out[2], idx, out[2], fiat_p256_one); +} + +// fiat_p256_select_point selects the |idx|th point from a precomputation table +// and copies it to out. 
+static void fiat_p256_select_point(const fiat_p256_limb_t idx, size_t size, + const fiat_p256_felem pre_comp[/*size*/][3], + fiat_p256_felem out[3]) { + OPENSSL_memset(out, 0, sizeof(fiat_p256_felem) * 3); + for (size_t i = 0; i < size; i++) { + fiat_p256_limb_t mismatch = i ^ idx; + fiat_p256_cmovznz(out[0], mismatch, pre_comp[i][0], out[0]); + fiat_p256_cmovznz(out[1], mismatch, pre_comp[i][1], out[1]); + fiat_p256_cmovznz(out[2], mismatch, pre_comp[i][2], out[2]); + } +} + +// fiat_p256_get_bit returns the |i|th bit in |in| +static crypto_word_t fiat_p256_get_bit(const Limb in[P256_LIMBS], int i) { + if (i < 0 || i >= 256) { + return 0; + } +#if defined(OPENSSL_64_BIT) + OPENSSL_STATIC_ASSERT(sizeof(Limb) == 8, "BN_ULONG was not 64-bit"); + return (in[i >> 6] >> (i & 63)) & 1; +#else + OPENSSL_STATIC_ASSERT(sizeof(Limb) == 4, "BN_ULONG was not 32-bit"); + return (in[i >> 5] >> (i & 31)) & 1; +#endif +} + +void p256_point_mul(Limb r[3][P256_LIMBS], const Limb scalar[P256_LIMBS], + const Limb p_x[P256_LIMBS], const Limb p_y[P256_LIMBS]) { + debug_assert_nonsecret(r != NULL); + debug_assert_nonsecret(scalar != NULL); + debug_assert_nonsecret(p_x != NULL); + debug_assert_nonsecret(p_y != NULL); + + fiat_p256_felem p_pre_comp[17][3]; + OPENSSL_memset(&p_pre_comp, 0, sizeof(p_pre_comp)); + // Precompute multiples. + fiat_p256_from_words(p_pre_comp[1][0], p_x); + fiat_p256_from_words(p_pre_comp[1][1], p_y); + fiat_p256_copy(p_pre_comp[1][2], fiat_p256_one); + + for (size_t j = 2; j <= 16; ++j) { + if (j & 1) { + fiat_p256_point_add(p_pre_comp[j][0], p_pre_comp[j][1], p_pre_comp[j][2], + p_pre_comp[1][0], p_pre_comp[1][1], p_pre_comp[1][2], + 0, p_pre_comp[j - 1][0], p_pre_comp[j - 1][1], + p_pre_comp[j - 1][2]); + } else { + fiat_p256_point_double(p_pre_comp[j][0], p_pre_comp[j][1], + p_pre_comp[j][2], p_pre_comp[j / 2][0], + p_pre_comp[j / 2][1], p_pre_comp[j / 2][2]); + } + } + + // Set nq to the point at infinity. + fiat_p256_felem nq[3] = {{0}, {0}, {0}}, ftmp, tmp[3]; + + // Loop over |scalar| msb-to-lsb, incorporating |p_pre_comp| every 5th round. + int skip = 1; // Save two point operations in the first round. + for (size_t i = 255; i < 256; i--) { + // double + if (!skip) { + fiat_p256_point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]); + } + + // do other additions every 5 doublings + if (i % 5 == 0) { + crypto_word_t bits = fiat_p256_get_bit(scalar, i + 4) << 5; + bits |= fiat_p256_get_bit(scalar, i + 3) << 4; + bits |= fiat_p256_get_bit(scalar, i + 2) << 3; + bits |= fiat_p256_get_bit(scalar, i + 1) << 2; + bits |= fiat_p256_get_bit(scalar, i) << 1; + bits |= fiat_p256_get_bit(scalar, i - 1); + crypto_word_t sign, digit; + recode_scalar_bits(&sign, &digit, bits); + + // select the point to add or subtract, in constant time. + fiat_p256_select_point((fiat_p256_limb_t)digit, 17, + RING_CORE_POINTLESS_ARRAY_CONST_CAST((const fiat_p256_felem(*)[3]))p_pre_comp, + tmp); + fiat_p256_opp(ftmp, tmp[1]); // (X, -Y, Z) is the negative point. + fiat_p256_cmovznz(tmp[1], (fiat_p256_limb_t)sign, tmp[1], ftmp); + + if (!skip) { + fiat_p256_point_add(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2], + 0 /* mixed */, tmp[0], tmp[1], tmp[2]); + } else { + fiat_p256_copy(nq[0], tmp[0]); + fiat_p256_copy(nq[1], tmp[1]); + fiat_p256_copy(nq[2], tmp[2]); + skip = 0; + } + } + } + + fiat_p256_to_words(r[0], nq[0]); + fiat_p256_to_words(r[1], nq[1]); + fiat_p256_to_words(r[2], nq[2]); +} + +void p256_point_mul_base(Limb r[3][P256_LIMBS], const Limb scalar[P256_LIMBS]) { + // Set nq to the point at infinity. 
+ fiat_p256_felem nq[3] = {{0}, {0}, {0}}, tmp[3]; + + int skip = 1; // Save two point operations in the first round. + for (size_t i = 31; i < 32; i--) { + if (!skip) { + fiat_p256_point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]); + } + + // First, look 32 bits upwards. + crypto_word_t bits = fiat_p256_get_bit(scalar, i + 224) << 3; + bits |= fiat_p256_get_bit(scalar, i + 160) << 2; + bits |= fiat_p256_get_bit(scalar, i + 96) << 1; + bits |= fiat_p256_get_bit(scalar, i + 32); + // Select the point to add, in constant time. + fiat_p256_select_point_affine((fiat_p256_limb_t)bits, 15, + fiat_p256_g_pre_comp[1], tmp); + + if (!skip) { + fiat_p256_point_add(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2], + 1 /* mixed */, tmp[0], tmp[1], tmp[2]); + } else { + fiat_p256_copy(nq[0], tmp[0]); + fiat_p256_copy(nq[1], tmp[1]); + fiat_p256_copy(nq[2], tmp[2]); + skip = 0; + } + + // Second, look at the current position. + bits = fiat_p256_get_bit(scalar, i + 192) << 3; + bits |= fiat_p256_get_bit(scalar, i + 128) << 2; + bits |= fiat_p256_get_bit(scalar, i + 64) << 1; + bits |= fiat_p256_get_bit(scalar, i); + // Select the point to add, in constant time. + fiat_p256_select_point_affine((fiat_p256_limb_t)bits, 15, + fiat_p256_g_pre_comp[0], tmp); + fiat_p256_point_add(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2], 1 /* mixed */, + tmp[0], tmp[1], tmp[2]); + } + + fiat_p256_to_words(r[0], nq[0]); + fiat_p256_to_words(r[1], nq[1]); + fiat_p256_to_words(r[2], nq[2]); +} + +void p256_mul_mont(Limb r[P256_LIMBS], const Limb a[P256_LIMBS], + const Limb b[P256_LIMBS]) { + fiat_p256_felem a_, b_; + fiat_p256_from_words(a_, a); + fiat_p256_from_words(b_, b); + fiat_p256_mul(a_, a_, b_); + fiat_p256_to_words(r, a_); +} + +void p256_sqr_mont(Limb r[P256_LIMBS], const Limb a[P256_LIMBS]) { + fiat_p256_felem x; + fiat_p256_from_words(x, a); + fiat_p256_square(x, x); + fiat_p256_to_words(r, x); +} + +void p256_point_add(Limb r[3][P256_LIMBS], const Limb a[3][P256_LIMBS], + const Limb b[3][P256_LIMBS]) { + fiat_p256_felem x1, y1, z1, x2, y2, z2; + fiat_p256_from_words(x1, a[0]); + fiat_p256_from_words(y1, a[1]); + fiat_p256_from_words(z1, a[2]); + fiat_p256_from_words(x2, b[0]); + fiat_p256_from_words(y2, b[1]); + fiat_p256_from_words(z2, b[2]); + fiat_p256_point_add(x1, y1, z1, x1, y1, z1, 0 /* both Jacobian */, x2, y2, + z2); + fiat_p256_to_words(r[0], x1); + fiat_p256_to_words(r[1], y1); + fiat_p256_to_words(r[2], z1); +} + +void p256_point_double(Limb r[3][P256_LIMBS], const Limb a[3][P256_LIMBS]) { + fiat_p256_felem x, y, z; + fiat_p256_from_words(x, a[0]); + fiat_p256_from_words(y, a[1]); + fiat_p256_from_words(z, a[2]); + fiat_p256_point_double(x, y, z, x, y, z); + fiat_p256_to_words(r[0], x); + fiat_p256_to_words(r[1], y); + fiat_p256_to_words(r[2], z); +} + +// For testing only. 
+void p256_point_add_affine(Limb r[3][P256_LIMBS], const Limb a[3][P256_LIMBS], + const Limb b[2][P256_LIMBS]) { + fiat_p256_felem x1, y1, z1, x2, y2; + fiat_p256_from_words(x1, a[0]); + fiat_p256_from_words(y1, a[1]); + fiat_p256_from_words(z1, a[2]); + fiat_p256_from_words(x2, b[0]); + fiat_p256_from_words(y2, b[1]); + + fiat_p256_felem z2 = {0}; + fiat_p256_cmovznz(z2, fiat_p256_nz(x2) & fiat_p256_nz(y2), z2, fiat_p256_one); + + fiat_p256_point_add(x1, y1, z1, x1, y1, z1, 1 /* mixed */, x2, y2, z2); + + fiat_p256_to_words(r[0], x1); + fiat_p256_to_words(r[1], y1); + fiat_p256_to_words(r[2], z1); +} + +#endif diff --git a/ring-0.17.14/crypto/fipsmodule/ec/p256_shared.h b/ring-0.17.14/crypto/fipsmodule/ec/p256_shared.h new file mode 100644 index 0000000000..ef717b170b --- /dev/null +++ b/ring-0.17.14/crypto/fipsmodule/ec/p256_shared.h @@ -0,0 +1,62 @@ +// Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved. +// Copyright (c) 2014, Intel Corporation. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1) +// (1) Intel Corporation, Israel Development Center, Haifa, Israel +// (2) University of Haifa, Israel +// +// Reference: +// S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with +// 256 Bit Primes" + +#ifndef OPENSSL_HEADER_EC_P256_SHARED_H +#define OPENSSL_HEADER_EC_P256_SHARED_H + +#include "ring-core/base.h" + +#include "../bn/internal.h" + +#if !defined(OPENSSL_NO_ASM) && \ + (defined(OPENSSL_X86_64) || defined(OPENSSL_AARCH64)) && \ + !defined(OPENSSL_SMALL) +# define OPENSSL_USE_NISTZ256 +#endif + +// P-256 field operations. +// +// An element mod P in P-256 is represented as a little-endian array of +// |P256_LIMBS| |BN_ULONG|s, spanning the full range of values. +// +// The following functions take fully-reduced inputs mod P and give +// fully-reduced outputs. They may be used in-place. + +#define P256_LIMBS (256 / BN_BITS2) + +// A P256_POINT represents a P-256 point in Jacobian coordinates. +typedef struct { + BN_ULONG X[P256_LIMBS]; + BN_ULONG Y[P256_LIMBS]; + BN_ULONG Z[P256_LIMBS]; +} P256_POINT; + +typedef unsigned char P256_SCALAR_BYTES[33]; + +static inline void p256_scalar_bytes_from_limbs( + P256_SCALAR_BYTES bytes_out, const BN_ULONG limbs[P256_LIMBS]) { + OPENSSL_memcpy(bytes_out, limbs, 32); + bytes_out[32] = 0; +} + +#endif /* !defined(OPENSSL_USE_NISTZ256) */ diff --git a/ring-0.17.14/crypto/fipsmodule/ec/p256_table.h b/ring-0.17.14/crypto/fipsmodule/ec/p256_table.h new file mode 100644 index 0000000000..e16eabaa73 --- /dev/null +++ b/ring-0.17.14/crypto/fipsmodule/ec/p256_table.h @@ -0,0 +1,297 @@ +// Copyright 2020 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// This file is generated by make_tables.go. + +// Base point pre computation +// -------------------------- +// +// Two different sorts of precomputed tables are used in the following code. +// Each contain various points on the curve, where each point is three field +// elements (x, y, z). +// +// For the base point table, z is usually 1 (0 for the point at infinity). +// This table has 2 * 16 elements, starting with the following: +// index | bits | point +// ------+---------+------------------------------ +// 0 | 0 0 0 0 | 0G +// 1 | 0 0 0 1 | 1G +// 2 | 0 0 1 0 | 2^64G +// 3 | 0 0 1 1 | (2^64 + 1)G +// 4 | 0 1 0 0 | 2^128G +// 5 | 0 1 0 1 | (2^128 + 1)G +// 6 | 0 1 1 0 | (2^128 + 2^64)G +// 7 | 0 1 1 1 | (2^128 + 2^64 + 1)G +// 8 | 1 0 0 0 | 2^192G +// 9 | 1 0 0 1 | (2^192 + 1)G +// 10 | 1 0 1 0 | (2^192 + 2^64)G +// 11 | 1 0 1 1 | (2^192 + 2^64 + 1)G +// 12 | 1 1 0 0 | (2^192 + 2^128)G +// 13 | 1 1 0 1 | (2^192 + 2^128 + 1)G +// 14 | 1 1 1 0 | (2^192 + 2^128 + 2^64)G +// 15 | 1 1 1 1 | (2^192 + 2^128 + 2^64 + 1)G +// followed by a copy of this with each element multiplied by 2^32. +// +// The reason for this is so that we can clock bits into four different +// locations when doing simple scalar multiplies against the base point, +// and then another four locations using the second 16 elements. +// +// Tables for other points have table[i] = iG for i in 0 .. 16. 
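For illustration only (not part of the vendored file): the sketch below shows how the 4-bit index described above could be assembled from scalar bits, mirroring the two per-iteration lookups in p256_point_mul_base. It assumes the fiat_p256_get_bit helper and the Limb, P256_LIMBS, and crypto_word_t names defined earlier in p256.c; base_table_index is a hypothetical name introduced here only for this example.

// Sketch: assemble the 4-bit index used at window position i (0 <= i < 32).
// Bit j of the index is scalar bit 64*j + i when selecting from the low
// table (fiat_p256_g_pre_comp[0]); the high table (fiat_p256_g_pre_comp[1])
// holds the same multiples scaled by 2^32, so its index uses scalar bits
// 64*j + i + 32 instead.
static crypto_word_t base_table_index(const Limb scalar[P256_LIMBS], int i,
                                      int high_half) {
  int shift = high_half ? 32 : 0;
  crypto_word_t bits = fiat_p256_get_bit(scalar, i + shift + 192) << 3;
  bits |= fiat_p256_get_bit(scalar, i + shift + 128) << 2;
  bits |= fiat_p256_get_bit(scalar, i + shift + 64) << 1;
  bits |= fiat_p256_get_bit(scalar, i + shift);
  return bits;  // 0 selects the point at infinity; k > 0 selects entry k - 1
}

A caller such as p256_point_mul_base would pass the result to fiat_p256_select_point_affine with size 15 and the matching half of fiat_p256_g_pre_comp.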
+ +// fiat_p256_g_pre_comp is the table of precomputed base points +#if defined(OPENSSL_64_BIT) +static const fiat_p256_felem fiat_p256_g_pre_comp[2][15][2] = { + {{{0x79e730d418a9143c, 0x75ba95fc5fedb601, 0x79fb732b77622510, + 0x18905f76a53755c6}, + {0xddf25357ce95560a, 0x8b4ab8e4ba19e45c, 0xd2e88688dd21f325, + 0x8571ff1825885d85}}, + {{0x4f922fc516a0d2bb, 0x0d5cc16c1a623499, 0x9241cf3a57c62c8b, + 0x2f5e6961fd1b667f}, + {0x5c15c70bf5a01797, 0x3d20b44d60956192, 0x04911b37071fdb52, + 0xf648f9168d6f0f7b}}, + {{0x9e566847e137bbbc, 0xe434469e8a6a0bec, 0xb1c4276179d73463, + 0x5abe0285133d0015}, + {0x92aa837cc04c7dab, 0x573d9f4c43260c07, 0x0c93156278e6cc37, + 0x94bb725b6b6f7383}}, + {{0x62a8c244bfe20925, 0x91c19ac38fdce867, 0x5a96a5d5dd387063, + 0x61d587d421d324f6}, + {0xe87673a2a37173ea, 0x2384800853778b65, 0x10f8441e05bab43e, + 0xfa11fe124621efbe}}, + {{0x1c891f2b2cb19ffd, 0x01ba8d5bb1923c23, 0xb6d03d678ac5ca8e, + 0x586eb04c1f13bedc}, + {0x0c35c6e527e8ed09, 0x1e81a33c1819ede2, 0x278fd6c056c652fa, + 0x19d5ac0870864f11}}, + {{0x62577734d2b533d5, 0x673b8af6a1bdddc0, 0x577e7c9aa79ec293, + 0xbb6de651c3b266b1}, + {0xe7e9303ab65259b3, 0xd6a0afd3d03a7480, 0xc5ac83d19b3cfc27, + 0x60b4619a5d18b99b}}, + {{0xbd6a38e11ae5aa1c, 0xb8b7652b49e73658, 0x0b130014ee5f87ed, + 0x9d0f27b2aeebffcd}, + {0xca9246317a730a55, 0x9c955b2fddbbc83a, 0x07c1dfe0ac019a71, + 0x244a566d356ec48d}}, + {{0x56f8410ef4f8b16a, 0x97241afec47b266a, 0x0a406b8e6d9c87c1, + 0x803f3e02cd42ab1b}, + {0x7f0309a804dbec69, 0xa83b85f73bbad05f, 0xc6097273ad8e197f, + 0xc097440e5067adc1}}, + {{0x846a56f2c379ab34, 0xa8ee068b841df8d1, 0x20314459176c68ef, + 0xf1af32d5915f1f30}, + {0x99c375315d75bd50, 0x837cffbaf72f67bc, 0x0613a41848d7723f, + 0x23d0f130e2d41c8b}}, + {{0xed93e225d5be5a2b, 0x6fe799835934f3c6, 0x4314092622626ffc, + 0x50bbb4d97990216a}, + {0x378191c6e57ec63e, 0x65422c40181dcdb2, 0x41a8099b0236e0f6, + 0x2b10011801fe49c3}}, + {{0xfc68b5c59b391593, 0xc385f5a2598270fc, 0x7144f3aad19adcbb, + 0xdd55899983fbae0c}, + {0x93b88b8e74b82ff4, 0xd2e03c4071e734c9, 0x9a7a9eaf43c0322a, + 0xe6e4c551149d6041}}, + {{0x5fe14bfe80ec21fe, 0xf6ce116ac255be82, 0x98bc5a072f4a5d67, + 0xfad27148db7e63af}, + {0x90c0b6ac29ab05b3, 0x37a9a83c4e251ae6, 0x0a7dc875c2aade7d, + 0x77387de39f0e1a84}}, + {{0x1e9ecc49a56c0dd7, 0xa5cffcd846086c74, 0x8f7a1408f505aece, + 0xb37b85c0bef0c47e}, + {0x3596b6e4cc0e6a8f, 0xfd6d4bbf6b388f23, 0xaba453fac39cef4e, + 0x9c135ac8f9f628d5}}, + {{0x0a1c729495c8f8be, 0x2961c4803bf362bf, 0x9e418403df63d4ac, + 0xc109f9cb91ece900}, + {0xc2d095d058945705, 0xb9083d96ddeb85c0, 0x84692b8d7a40449b, + 0x9bc3344f2eee1ee1}}, + {{0x0d5ae35642913074, 0x55491b2748a542b1, 0x469ca665b310732a, + 0x29591d525f1a4cc1}, + {0xe76f5b6bb84f983f, 0xbe7eef419f5f84e1, 0x1200d49680baa189, + 0x6376551f18ef332c}}}, + {{{0x202886024147519a, 0xd0981eac26b372f0, 0xa9d4a7caa785ebc8, + 0xd953c50ddbdf58e9}, + {0x9d6361ccfd590f8f, 0x72e9626b44e6c917, 0x7fd9611022eb64cf, + 0x863ebb7e9eb288f3}}, + {{0x4fe7ee31b0e63d34, 0xf4600572a9e54fab, 0xc0493334d5e7b5a4, + 0x8589fb9206d54831}, + {0xaa70f5cc6583553a, 0x0879094ae25649e5, 0xcc90450710044652, + 0xebb0696d02541c4f}}, + {{0xabbaa0c03b89da99, 0xa6f2d79eb8284022, 0x27847862b81c05e8, + 0x337a4b5905e54d63}, + {0x3c67500d21f7794a, 0x207005b77d6d7f61, 0x0a5a378104cfd6e8, + 0x0d65e0d5f4c2fbd6}}, + {{0xd433e50f6d3549cf, 0x6f33696ffacd665e, 0x695bfdacce11fcb4, + 0x810ee252af7c9860}, + {0x65450fe17159bb2c, 0xf7dfbebe758b357b, 0x2b057e74d69fea72, + 0xd485717a92731745}}, + {{0xce1f69bbe83f7669, 0x09f8ae8272877d6b, 0x9548ae543244278d, + 
0x207755dee3c2c19c}, + {0x87bd61d96fef1945, 0x18813cefb12d28c3, 0x9fbcd1d672df64aa, + 0x48dc5ee57154b00d}}, + {{0xef0f469ef49a3154, 0x3e85a5956e2b2e9a, 0x45aaec1eaa924a9c, + 0xaa12dfc8a09e4719}, + {0x26f272274df69f1d, 0xe0e4c82ca2ff5e73, 0xb9d8ce73b7a9dd44, + 0x6c036e73e48ca901}}, + {{0xe1e421e1a47153f0, 0xb86c3b79920418c9, 0x93bdce87705d7672, + 0xf25ae793cab79a77}, + {0x1f3194a36d869d0c, 0x9d55c8824986c264, 0x49fb5ea3096e945e, + 0x39b8e65313db0a3e}}, + {{0xe3417bc035d0b34a, 0x440b386b8327c0a7, 0x8fb7262dac0362d1, + 0x2c41114ce0cdf943}, + {0x2ba5cef1ad95a0b1, 0xc09b37a867d54362, 0x26d6cdd201e486c9, + 0x20477abf42ff9297}}, + {{0x0f121b41bc0a67d2, 0x62d4760a444d248a, 0x0e044f1d659b4737, + 0x08fde365250bb4a8}, + {0xaceec3da848bf287, 0xc2a62182d3369d6e, 0x3582dfdc92449482, + 0x2f7e2fd2565d6cd7}}, + {{0x0a0122b5178a876b, 0x51ff96ff085104b4, 0x050b31ab14f29f76, + 0x84abb28b5f87d4e6}, + {0xd5ed439f8270790a, 0x2d6cb59d85e3f46b, 0x75f55c1b6c1e2212, + 0xe5436f6717655640}}, + {{0xc2965ecc9aeb596d, 0x01ea03e7023c92b4, 0x4704b4b62e013961, + 0x0ca8fd3f905ea367}, + {0x92523a42551b2b61, 0x1eb7a89c390fcd06, 0xe7f1d2be0392a63e, + 0x96dca2644ddb0c33}}, + {{0x231c210e15339848, 0xe87a28e870778c8d, 0x9d1de6616956e170, + 0x4ac3c9382bb09c0b}, + {0x19be05516998987d, 0x8b2376c4ae09f4d6, 0x1de0b7651a3f933d, + 0x380d94c7e39705f4}}, + {{0x3685954b8c31c31d, 0x68533d005bf21a0c, 0x0bd7626e75c79ec9, + 0xca17754742c69d54}, + {0xcc6edafff6d2dbb2, 0xfd0d8cbd174a9d18, 0x875e8793aa4578e8, + 0xa976a7139cab2ce6}}, + {{0xce37ab11b43ea1db, 0x0a7ff1a95259d292, 0x851b02218f84f186, + 0xa7222beadefaad13}, + {0xa2ac78ec2b0a9144, 0x5a024051f2fa59c5, 0x91d1eca56147ce38, + 0xbe94d523bc2ac690}}, + {{0x2d8daefd79ec1a0f, 0x3bbcd6fdceb39c97, 0xf5575ffc58f61a95, + 0xdbd986c4adf7b420}, + {0x81aa881415f39eb7, 0x6ee2fcf5b98d976c, 0x5465475dcf2f717d, + 0x8e24d3c46860bbd0}}}}; +#else +static const fiat_p256_felem fiat_p256_g_pre_comp[2][15][2] = { + {{{0x18a9143c, 0x79e730d4, 0x5fedb601, 0x75ba95fc, 0x77622510, 0x79fb732b, + 0xa53755c6, 0x18905f76}, + {0xce95560a, 0xddf25357, 0xba19e45c, 0x8b4ab8e4, 0xdd21f325, 0xd2e88688, + 0x25885d85, 0x8571ff18}}, + {{0x16a0d2bb, 0x4f922fc5, 0x1a623499, 0x0d5cc16c, 0x57c62c8b, 0x9241cf3a, + 0xfd1b667f, 0x2f5e6961}, + {0xf5a01797, 0x5c15c70b, 0x60956192, 0x3d20b44d, 0x071fdb52, 0x04911b37, + 0x8d6f0f7b, 0xf648f916}}, + {{0xe137bbbc, 0x9e566847, 0x8a6a0bec, 0xe434469e, 0x79d73463, 0xb1c42761, + 0x133d0015, 0x5abe0285}, + {0xc04c7dab, 0x92aa837c, 0x43260c07, 0x573d9f4c, 0x78e6cc37, 0x0c931562, + 0x6b6f7383, 0x94bb725b}}, + {{0xbfe20925, 0x62a8c244, 0x8fdce867, 0x91c19ac3, 0xdd387063, 0x5a96a5d5, + 0x21d324f6, 0x61d587d4}, + {0xa37173ea, 0xe87673a2, 0x53778b65, 0x23848008, 0x05bab43e, 0x10f8441e, + 0x4621efbe, 0xfa11fe12}}, + {{0x2cb19ffd, 0x1c891f2b, 0xb1923c23, 0x01ba8d5b, 0x8ac5ca8e, 0xb6d03d67, + 0x1f13bedc, 0x586eb04c}, + {0x27e8ed09, 0x0c35c6e5, 0x1819ede2, 0x1e81a33c, 0x56c652fa, 0x278fd6c0, + 0x70864f11, 0x19d5ac08}}, + {{0xd2b533d5, 0x62577734, 0xa1bdddc0, 0x673b8af6, 0xa79ec293, 0x577e7c9a, + 0xc3b266b1, 0xbb6de651}, + {0xb65259b3, 0xe7e9303a, 0xd03a7480, 0xd6a0afd3, 0x9b3cfc27, 0xc5ac83d1, + 0x5d18b99b, 0x60b4619a}}, + {{0x1ae5aa1c, 0xbd6a38e1, 0x49e73658, 0xb8b7652b, 0xee5f87ed, 0x0b130014, + 0xaeebffcd, 0x9d0f27b2}, + {0x7a730a55, 0xca924631, 0xddbbc83a, 0x9c955b2f, 0xac019a71, 0x07c1dfe0, + 0x356ec48d, 0x244a566d}}, + {{0xf4f8b16a, 0x56f8410e, 0xc47b266a, 0x97241afe, 0x6d9c87c1, 0x0a406b8e, + 0xcd42ab1b, 0x803f3e02}, + {0x04dbec69, 0x7f0309a8, 0x3bbad05f, 0xa83b85f7, 0xad8e197f, 0xc6097273, + 
0x5067adc1, 0xc097440e}}, + {{0xc379ab34, 0x846a56f2, 0x841df8d1, 0xa8ee068b, 0x176c68ef, 0x20314459, + 0x915f1f30, 0xf1af32d5}, + {0x5d75bd50, 0x99c37531, 0xf72f67bc, 0x837cffba, 0x48d7723f, 0x0613a418, + 0xe2d41c8b, 0x23d0f130}}, + {{0xd5be5a2b, 0xed93e225, 0x5934f3c6, 0x6fe79983, 0x22626ffc, 0x43140926, + 0x7990216a, 0x50bbb4d9}, + {0xe57ec63e, 0x378191c6, 0x181dcdb2, 0x65422c40, 0x0236e0f6, 0x41a8099b, + 0x01fe49c3, 0x2b100118}}, + {{0x9b391593, 0xfc68b5c5, 0x598270fc, 0xc385f5a2, 0xd19adcbb, 0x7144f3aa, + 0x83fbae0c, 0xdd558999}, + {0x74b82ff4, 0x93b88b8e, 0x71e734c9, 0xd2e03c40, 0x43c0322a, 0x9a7a9eaf, + 0x149d6041, 0xe6e4c551}}, + {{0x80ec21fe, 0x5fe14bfe, 0xc255be82, 0xf6ce116a, 0x2f4a5d67, 0x98bc5a07, + 0xdb7e63af, 0xfad27148}, + {0x29ab05b3, 0x90c0b6ac, 0x4e251ae6, 0x37a9a83c, 0xc2aade7d, 0x0a7dc875, + 0x9f0e1a84, 0x77387de3}}, + {{0xa56c0dd7, 0x1e9ecc49, 0x46086c74, 0xa5cffcd8, 0xf505aece, 0x8f7a1408, + 0xbef0c47e, 0xb37b85c0}, + {0xcc0e6a8f, 0x3596b6e4, 0x6b388f23, 0xfd6d4bbf, 0xc39cef4e, 0xaba453fa, + 0xf9f628d5, 0x9c135ac8}}, + {{0x95c8f8be, 0x0a1c7294, 0x3bf362bf, 0x2961c480, 0xdf63d4ac, 0x9e418403, + 0x91ece900, 0xc109f9cb}, + {0x58945705, 0xc2d095d0, 0xddeb85c0, 0xb9083d96, 0x7a40449b, 0x84692b8d, + 0x2eee1ee1, 0x9bc3344f}}, + {{0x42913074, 0x0d5ae356, 0x48a542b1, 0x55491b27, 0xb310732a, 0x469ca665, + 0x5f1a4cc1, 0x29591d52}, + {0xb84f983f, 0xe76f5b6b, 0x9f5f84e1, 0xbe7eef41, 0x80baa189, 0x1200d496, + 0x18ef332c, 0x6376551f}}}, + {{{0x4147519a, 0x20288602, 0x26b372f0, 0xd0981eac, 0xa785ebc8, 0xa9d4a7ca, + 0xdbdf58e9, 0xd953c50d}, + {0xfd590f8f, 0x9d6361cc, 0x44e6c917, 0x72e9626b, 0x22eb64cf, 0x7fd96110, + 0x9eb288f3, 0x863ebb7e}}, + {{0xb0e63d34, 0x4fe7ee31, 0xa9e54fab, 0xf4600572, 0xd5e7b5a4, 0xc0493334, + 0x06d54831, 0x8589fb92}, + {0x6583553a, 0xaa70f5cc, 0xe25649e5, 0x0879094a, 0x10044652, 0xcc904507, + 0x02541c4f, 0xebb0696d}}, + {{0x3b89da99, 0xabbaa0c0, 0xb8284022, 0xa6f2d79e, 0xb81c05e8, 0x27847862, + 0x05e54d63, 0x337a4b59}, + {0x21f7794a, 0x3c67500d, 0x7d6d7f61, 0x207005b7, 0x04cfd6e8, 0x0a5a3781, + 0xf4c2fbd6, 0x0d65e0d5}}, + {{0x6d3549cf, 0xd433e50f, 0xfacd665e, 0x6f33696f, 0xce11fcb4, 0x695bfdac, + 0xaf7c9860, 0x810ee252}, + {0x7159bb2c, 0x65450fe1, 0x758b357b, 0xf7dfbebe, 0xd69fea72, 0x2b057e74, + 0x92731745, 0xd485717a}}, + {{0xe83f7669, 0xce1f69bb, 0x72877d6b, 0x09f8ae82, 0x3244278d, 0x9548ae54, + 0xe3c2c19c, 0x207755de}, + {0x6fef1945, 0x87bd61d9, 0xb12d28c3, 0x18813cef, 0x72df64aa, 0x9fbcd1d6, + 0x7154b00d, 0x48dc5ee5}}, + {{0xf49a3154, 0xef0f469e, 0x6e2b2e9a, 0x3e85a595, 0xaa924a9c, 0x45aaec1e, + 0xa09e4719, 0xaa12dfc8}, + {0x4df69f1d, 0x26f27227, 0xa2ff5e73, 0xe0e4c82c, 0xb7a9dd44, 0xb9d8ce73, + 0xe48ca901, 0x6c036e73}}, + {{0xa47153f0, 0xe1e421e1, 0x920418c9, 0xb86c3b79, 0x705d7672, 0x93bdce87, + 0xcab79a77, 0xf25ae793}, + {0x6d869d0c, 0x1f3194a3, 0x4986c264, 0x9d55c882, 0x096e945e, 0x49fb5ea3, + 0x13db0a3e, 0x39b8e653}}, + {{0x35d0b34a, 0xe3417bc0, 0x8327c0a7, 0x440b386b, 0xac0362d1, 0x8fb7262d, + 0xe0cdf943, 0x2c41114c}, + {0xad95a0b1, 0x2ba5cef1, 0x67d54362, 0xc09b37a8, 0x01e486c9, 0x26d6cdd2, + 0x42ff9297, 0x20477abf}}, + {{0xbc0a67d2, 0x0f121b41, 0x444d248a, 0x62d4760a, 0x659b4737, 0x0e044f1d, + 0x250bb4a8, 0x08fde365}, + {0x848bf287, 0xaceec3da, 0xd3369d6e, 0xc2a62182, 0x92449482, 0x3582dfdc, + 0x565d6cd7, 0x2f7e2fd2}}, + {{0x178a876b, 0x0a0122b5, 0x085104b4, 0x51ff96ff, 0x14f29f76, 0x050b31ab, + 0x5f87d4e6, 0x84abb28b}, + {0x8270790a, 0xd5ed439f, 0x85e3f46b, 0x2d6cb59d, 0x6c1e2212, 0x75f55c1b, + 0x17655640, 0xe5436f67}}, + {{0x9aeb596d, 
0xc2965ecc, 0x023c92b4, 0x01ea03e7, 0x2e013961, 0x4704b4b6, + 0x905ea367, 0x0ca8fd3f}, + {0x551b2b61, 0x92523a42, 0x390fcd06, 0x1eb7a89c, 0x0392a63e, 0xe7f1d2be, + 0x4ddb0c33, 0x96dca264}}, + {{0x15339848, 0x231c210e, 0x70778c8d, 0xe87a28e8, 0x6956e170, 0x9d1de661, + 0x2bb09c0b, 0x4ac3c938}, + {0x6998987d, 0x19be0551, 0xae09f4d6, 0x8b2376c4, 0x1a3f933d, 0x1de0b765, + 0xe39705f4, 0x380d94c7}}, + {{0x8c31c31d, 0x3685954b, 0x5bf21a0c, 0x68533d00, 0x75c79ec9, 0x0bd7626e, + 0x42c69d54, 0xca177547}, + {0xf6d2dbb2, 0xcc6edaff, 0x174a9d18, 0xfd0d8cbd, 0xaa4578e8, 0x875e8793, + 0x9cab2ce6, 0xa976a713}}, + {{0xb43ea1db, 0xce37ab11, 0x5259d292, 0x0a7ff1a9, 0x8f84f186, 0x851b0221, + 0xdefaad13, 0xa7222bea}, + {0x2b0a9144, 0xa2ac78ec, 0xf2fa59c5, 0x5a024051, 0x6147ce38, 0x91d1eca5, + 0xbc2ac690, 0xbe94d523}}, + {{0x79ec1a0f, 0x2d8daefd, 0xceb39c97, 0x3bbcd6fd, 0x58f61a95, 0xf5575ffc, + 0xadf7b420, 0xdbd986c4}, + {0x15f39eb7, 0x81aa8814, 0xb98d976c, 0x6ee2fcf5, 0xcf2f717d, 0x5465475d, + 0x6860bbd0, 0x8e24d3c4}}}}; +#endif diff --git a/ring-0.17.14/crypto/fipsmodule/ec/util.h b/ring-0.17.14/crypto/fipsmodule/ec/util.h new file mode 100644 index 0000000000..1fef3da614 --- /dev/null +++ b/ring-0.17.14/crypto/fipsmodule/ec/util.h @@ -0,0 +1,258 @@ +// Copyright 2015 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "../../internal.h" + +#if defined(__GNUC__) +#pragma GCC diagnostic ignored "-Wconversion" +#pragma GCC diagnostic ignored "-Wsign-conversion" +#endif + + +// This function looks at 5+1 scalar bits (5 current, 1 adjacent less +// significant bit), and recodes them into a signed digit for use in fast point +// multiplication: the use of signed rather than unsigned digits means that +// fewer points need to be precomputed, given that point inversion is easy (a +// precomputed point dP makes -dP available as well). +// +// BACKGROUND: +// +// Signed digits for multiplication were introduced by Booth ("A signed binary +// multiplication technique", Quart. Journ. Mech. and Applied Math., vol. IV, +// pt. 2 (1951), pp. 236-240), in that case for multiplication of integers. +// Booth's original encoding did not generally improve the density of nonzero +// digits over the binary representation, and was merely meant to simplify the +// handling of signed factors given in two's complement; but it has since been +// shown to be the basis of various signed-digit representations that do have +// further advantages, including the wNAF, using the following general +// approach: +// +// (1) Given a binary representation +// +// b_k ... b_2 b_1 b_0, +// +// of a nonnegative integer (b_k in {0, 1}), rewrite it in digits 0, 1, -1 +// by using bit-wise subtraction as follows: +// +// b_k b_(k-1) ... b_2 b_1 b_0 +// - b_k ... b_3 b_2 b_1 b_0 +// ----------------------------------------- +// s_(k+1) s_k ... 
s_3 s_2 s_1 s_0 +// +// A left-shift followed by subtraction of the original value yields a new +// representation of the same value, using signed bits s_i = b_(i-1) - b_i. +// This representation from Booth's paper has since appeared in the +// literature under a variety of different names including "reversed binary +// form", "alternating greedy expansion", "mutual opposite form", and +// "sign-alternating {+-1}-representation". +// +// An interesting property is that among the nonzero bits, values 1 and -1 +// strictly alternate. +// +// (2) Various window schemes can be applied to the Booth representation of +// integers: for example, right-to-left sliding windows yield the wNAF +// (a signed-digit encoding independently discovered by various researchers +// in the 1990s), and left-to-right sliding windows yield a left-to-right +// equivalent of the wNAF (independently discovered by various researchers +// around 2004). +// +// To prevent leaking information through side channels in point multiplication, +// we need to recode the given integer into a regular pattern: sliding windows +// as in wNAFs won't do, we need their fixed-window equivalent -- which is a few +// decades older: we'll be using the so-called "modified Booth encoding" due to +// MacSorley ("High-speed arithmetic in binary computers", Proc. IRE, vol. 49 +// (1961), pp. 67-91), in a radix-2^5 setting. That is, we always combine five +// signed bits into a signed digit: +// +// s_(5j + 4) s_(5j + 3) s_(5j + 2) s_(5j + 1) s_(5j) +// +// The sign-alternating property implies that the resulting digit values are +// integers from -16 to 16. +// +// Of course, we don't actually need to compute the signed digits s_i as an +// intermediate step (that's just a nice way to see how this scheme relates +// to the wNAF): a direct computation obtains the recoded digit from the +// six bits b_(5j + 4) ... b_(5j - 1). +// +// This function takes those six bits as an integer (0 .. 63), writing the +// recoded digit to *sign (0 for positive, 1 for negative) and *digit (absolute +// value, in the range 0 .. 16). Note that this integer essentially provides +// the input bits "shifted to the left" by one position: for example, the input +// to compute the least significant recoded digit, given that there's no bit +// b_-1, has to be b_4 b_3 b_2 b_1 b_0 0. +// +// DOUBLING CASE: +// +// Point addition formulas for short Weierstrass curves are often incomplete. +// Edge cases such as P + P or P + ∞ must be handled separately. This +// complicates constant-time requirements. P + ∞ cannot be avoided (any window +// may be zero) and is handled with constant-time selects. P + P (where P is not +// ∞) usually is not. Instead, windowing strategies are chosen to avoid this +// case. Whether this happens depends on the group order. +// +// Let w be the window width (in this function, w = 5). The non-trivial doubling +// case in single-point scalar multiplication may occur if and only if the +// 2^(w-1) bit of the group order is zero. +// +// Note the above only holds if the scalar is fully reduced and the group order +// is a prime that is much larger than 2^w. It also only holds when windows +// are applied from most significant to least significant, doubling between each +// window. It does not apply to more complex table strategies such as +// |EC_nistz256_method|. +// +// PROOF: +// +// Let n be the group order. Let l be the number of bits needed to represent n. 
+// Assume there exists some 0 <= k < n such that signed w-bit windowed +// multiplication hits the doubling case. +// +// Windowed multiplication consists of iterating over groups of s_i (defined +// above based on k's binary representation) from most to least significant. At +// iteration i (for i = ..., 3w, 2w, w, 0, starting from the most significant +// window), we: +// +// 1. Double the accumulator A, w times. Let A_i be the value of A at this +// point. +// +// 2. Set A to T_i + A_i, where T_i is a precomputed multiple of P +// corresponding to the window s_(i+w-1) ... s_i. +// +// Let j be the index such that A_j = T_j ≠ ∞. Looking at A_i and T_i as +// multiples of P, define a_i and t_i to be scalar coefficients of A_i and T_i. +// Thus a_j = t_j ≠ 0 (mod n). Note a_i and t_i may not be reduced mod n. t_i is +// the value of the w signed bits s_(i+w-1) ... s_i. a_i is computed as a_i = +// 2^w * (a_(i+w) + t_(i+w)). +// +// t_i is bounded by -2^(w-1) <= t_i <= 2^(w-1). Additionally, we may write it +// in terms of unsigned bits b_i. t_i consists of signed bits s_(i+w-1) ... s_i. +// This is computed as: +// +// b_(i+w-2) b_(i+w-3) ... b_i b_(i-1) +// - b_(i+w-1) b_(i+w-2) ... b_(i+1) b_i +// -------------------------------------------- +// t_i = s_(i+w-1) s_(i+w-2) ... s_(i+1) s_i +// +// Observe that b_(i+w-2) through b_i occur in both terms. Let x be the integer +// represented by that bit string, i.e. 2^(w-2)*b_(i+w-2) + ... + b_i. +// +// t_i = (2*x + b_(i-1)) - (2^(w-1)*b_(i+w-1) + x) +// = x - 2^(w-1)*b_(i+w-1) + b_(i-1) +// +// Or, using C notation for bit operations: +// +// t_i = (k>>i) & ((1<<(w-1)) - 1) - (k>>i) & (1<<(w-1)) + (k>>(i-1)) & 1 +// +// Note b_(i-1) is added in left-shifted by one (or doubled) from its place. +// This is compensated by t_(i-w)'s subtraction term. Thus, a_i may be computed +// by adding b_l b_(l-1) ... b_(i+1) b_i and an extra copy of b_(i-1). In C +// notation, this is: +// +// a_i = (k>>(i+w)) << w + ((k>>(i+w-1)) & 1) << w +// +// Observe that, while t_i may be positive or negative, a_i is bounded by +// 0 <= a_i < n + 2^w. Additionally, a_i can only be zero if b_(i+w-1) and up +// are all zero. (Note this implies a non-trivial P + (-P) is unreachable for +// all groups. That would imply the subsequent a_i is zero, which means all +// terms thus far were zero.) +// +// Returning to our doubling position, we have a_j = t_j (mod n). We now +// determine the value of a_j - t_j, which must be divisible by n. Our bounds on +// a_j and t_j imply a_j - t_j is 0 or n. If it is 0, a_j = t_j. However, 2^w +// divides a_j and -2^(w-1) <= t_j <= 2^(w-1), so this can only happen if +// a_j = t_j = 0, which is a trivial doubling. Therefore, a_j - t_j = n. +// +// Now we determine j. Suppose j > 0. w divides j, so j >= w. Then, +// +// n = a_j - t_j = (k>>(j+w)) << w + ((k>>(j+w-1)) & 1) << w - t_j +// <= k/2^j + 2^w - t_j +// < n/2^w + 2^w + 2^(w-1) +// +// n is much larger than 2^w, so this is impossible. Thus, j = 0: only the final +// addition may hit the doubling case. +// +// Finally, we consider bit patterns for n and k. Divide k into k_H + k_M + k_L +// such that k_H is the contribution from b_(l-1) .. b_w, k_M is the +// contribution from b_(w-1), and k_L is the contribution from b_(w-2) ... b_0. +// That is: +// +// - 2^w divides k_H +// - k_M is 0 or 2^(w-1) +// - 0 <= k_L < 2^(w-1) +// +// Divide n into n_H + n_M + n_L similarly. 
We thus have: +// +// t_0 = (k>>0) & ((1<<(w-1)) - 1) - (k>>0) & (1<<(w-1)) + (k>>(0-1)) & 1 +// = k & ((1<<(w-1)) - 1) - k & (1<<(w-1)) +// = k_L - k_M +// +// a_0 = (k>>(0+w)) << w + ((k>>(0+w-1)) & 1) << w +// = (k>>w) << w + ((k>>(w-1)) & 1) << w +// = k_H + 2*k_M +// +// n = a_0 - t_0 +// n_H + n_M + n_L = (k_H + 2*k_M) - (k_L - k_M) +// = k_H + 3*k_M - k_L +// +// k_H - k_L < k and k < n, so k_H - k_L ≠ n. Therefore k_M is not 0 and must be +// 2^(w-1). Now we consider k_H and n_H. We know k_H <= n_H. Suppose k_H = n_H. +// Then, +// +// n_M + n_L = 3*(2^(w-1)) - k_L +// > 3*(2^(w-1)) - 2^(w-1) +// = 2^w +// +// Contradiction (n_M + n_L is the bottom w bits of n). Thus k_H < n_H. Suppose +// k_H < n_H - 2*2^w. Then, +// +// n_H + n_M + n_L = k_H + 3*(2^(w-1)) - k_L +// < n_H - 2*2^w + 3*(2^(w-1)) - k_L +// n_M + n_L < -2^(w-1) - k_L +// +// Contradiction. Thus, k_H = n_H - 2^w. (Note 2^w divides n_H and k_H.) Thus, +// +// n_H + n_M + n_L = k_H + 3*(2^(w-1)) - k_L +// = n_H - 2^w + 3*(2^(w-1)) - k_L +// n_M + n_L = 2^(w-1) - k_L +// <= 2^(w-1) +// +// Equality would mean 2^(w-1) divides n, which is impossible if n is prime. +// Thus n_M + n_L < 2^(w-1), so n_M is zero, proving our condition. +// +// This proof constructs k, so, to show the converse, let k_H = n_H - 2^w, +// k_M = 2^(w-1), k_L = 2^(w-1) - n_L. This will result in a non-trivial point +// doubling in the final addition and is the only such scalar. +// +// COMMON CURVES: +// +// The group orders for common curves end in the following bit patterns: +// +// P-521: ...00001001; w = 4 is okay +// P-384: ...01110011; w = 2, 5, 6, 7 are okay +// P-256: ...01010001; w = 5, 7 are okay +// P-224: ...00111101; w = 3, 4, 5, 6 are okay +static inline void recode_scalar_bits(crypto_word_t *sign, crypto_word_t *digit, + crypto_word_t in) { + crypto_word_t s, d; + + s = ~((in >> 5) - 1); /* sets all bits to MSB(in), 'in' seen as + * 6-bit value */ + d = (1 << 6) - in - 1; + d = (d & s) | (in & ~s); + d = (d >> 1) + (d & 1); + + *sign = s & 1; + *digit = d; +} diff --git a/ring-0.17.14/crypto/fipsmodule/ecdsa/ecdsa_verify_tests.txt b/ring-0.17.14/crypto/fipsmodule/ecdsa/ecdsa_verify_tests.txt new file mode 100644 index 0000000000..c00af6c9e9 --- /dev/null +++ b/ring-0.17.14/crypto/fipsmodule/ecdsa/ecdsa_verify_tests.txt @@ -0,0 +1,1362 @@ +# Tests from NIST CAVP 186-4 ECDSA2VS Test Vectors, Signature Verification Test +# http://csrc.nist.gov/groups/STM/cavp/documents/dss/186-3ecdsatestvectors.zip +# +# NIST's files provide message and digest pairs. Since this is a low-level test, +# the digests have been extracted. P-521 test vectors were fixed to have the +# right number of leading zeros. 
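+# Field key for the vectors below: X and Y are the affine coordinates of the
+# public key, Digest is the already-hashed message, R and S are the signature
+# components, and "Invalid = Y" marks vectors that are expected to fail
+# verification (vectors without it are expected to verify).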
+ +Curve = P-256 +X = 1198b3c409a8b47edb1347e0982d533cb1813e5cb2a92c824b2881b3cd2f3f4a +Y = 0bdbac5fa02e41e775f8d602446d58ecb2209b5a3d79ae69eef399016e992e87 +Digest = 01ed0c41d650479c47057f61433d7e8b24492649 +R = 9206d435f148f88c15b2effbf3c506e41b2c620102022b801e371d0767b54bea +S = cbc4e1674ae1af69873946ccf6275946e59e0107278749b2d0010795833d80fa +Invalid = Y + +Curve = P-256 +X = f7c6280aecd6b936513b0ca84e63346333dc41437a15442e605d46bba93ae101 +Y = 3c834cecc16167b07866a9478f9f2d882de7ef937da447cd837e60cb5ed65d81 +Digest = f91b4dfddd5eb33a875d2e50d1e949211ac819da +R = f615af212ab030c4bbf9362d9815a1462312df4beb4358a7ce80d820355420bf +S = d12ed715ef65cfe6fe6bf348364088a0e7f70927bbafe4c12fc4cb65c0cc51bc +Invalid = Y + +Curve = P-256 +X = 0e7632dbc4db879e10d1d80f2789d9fa414c1fe77a6c1e56d6667af43e36e610 +Y = 6f0dd2a5840e5a6f6ff7e23f656f5c945b7a493fbb0cfd5b9b531bf04435b1ef +Digest = 3905696f8bad8205fa1445df0e91ade3dbc413e6 +R = 2b0b9ab4a575732a168f28494b66a855fc1a757fb1177864bf3e4f0a000c4a86 +S = 54901ce2f92f55ac112afa0f8b62bc00b44c8c10fe0c863675bfd305d6dc0cd8 +Invalid = Y + +Curve = P-256 +X = 1613f12bae8e98d09b4bba53f5229596a0d417d2c625f41bb15f923b3c1e4b57 +Y = 411319fa85227997a4cf3b1756161485124d2cedc38c9c30d82f42dc2647d545 +Digest = 580d31ce22700a20c2db81bcdac37330b491c86f +R = ed058d476a77be99c1b0fc8502abe545541b4c0ff3eed3f558133ae2f02042b0 +S = c571b4895712a4f64f7220b0694cab767379b09f1824fe7874acd127deb2371e +Invalid = Y + +Curve = P-256 +X = 88bb041dcb1733a676a7f4ae8d3e407d72d5396547f07db77078485c1d5db077 +Y = 72cf2b55e596cd140c58228f1b0a19c34fca26ffac043528a417c5abb6fca9c9 +Digest = 7900a02f768b0718a13525c33adace583de15c50 +R = 87208734deb125dca68f0d33f9d369cf1b79cf5a021391b9c6c1727d2efe663a +S = b984f722de18f1ce407104342948f03f2b55413a096c4b5fca1e032a2c814a4a +Invalid = Y + +Curve = P-256 +X = 811eb5180def7fb60d632f8cb2cba831b88cee778aa2a82ec3a5fc3d80ff7fb6 +Y = db88d65b0fc35d9ba1f1ced0400434979ae895d371d1441d7c7a441a9fb1709b +Digest = 17b7451ea903125ccb293ffaa9d1a4ca1141a2c5 +R = c329fa28dac0018276c5af0cd770e60be50bc14e2562d5556991971edc7d4916 +S = 2d111d13837a02fa279fe835a7dc59a521864d92b26649ca4e24b36ae93878e8 +Invalid = Y + +Curve = P-256 +X = 4a6f1e7f7268174d23993b8b58aa60c2a87b18de79b36a750ec86dd6f9e12227 +Y = 572df22bd6487a863a51ca544b8c5de2b47f801372a881cb996a97d9a98aa825 +Digest = 54e9a048559f370425e9c8e54a460ec91bcc930a +R = 4a800e24de65e5c57d4cab4dd1ef7b6c38a2f0aa5cfd3a571a4b552fb1993e69 +S = d9c89fb983640a7e65edf632cacd1de0823b7efbc798fc1f7bbfacdda7398955 +Invalid = Y + +Curve = P-256 +X = f3033d1e548d245b5e45ff1147db8cd44db8a1f2823c3c164125be88f9a982c2 +Y = 3c078f6cee2f50e95e8916aa9c4e93de3fdf9b045abac6f707cfcb22d065638e +Digest = e8d38e4c6a905a814b04c2841d898ed6da023c34 +R = d4255db86a416a5a688de4e238071ef16e5f2a20e31b9490c03dee9ae6164c34 +S = 4e0ac1e1a6725bf7c6bd207439b2d370c5f2dea1ff4decf1650ab84c7769efc0 + +Curve = P-256 +X = 0ea0a6bb6c70966fad1a2307479c12de2322795bdecb70e4b286bd6200ba9c1a +Y = c40eda3947021348db691ac4086fb6c06b587ce37c155bb0a7d912b93226de81 +Digest = 3b08bf1b67abc03c1cd69b0e24743b5c2d49e506 +R = f5509deff7bfda3f3759800fa4033af6a84466b114ecb48eac37eff48d2ae1b3 +S = 8c4b62dce2082f80caf220cdbb1d02567bbdfab40564b90ef31d86e3e10ce80a +Invalid = Y + +Curve = P-256 +X = e7a57e0f6ec0fa9c7c34978034cf82f039f8fd62804070ad943573fc8efa5775 +Y = 87b2cc85dfff2dae5620fbe3e6256bd728de28fc9dc1b5eb6b5d7bd5d29186ad +Digest = a8c5dc0344b1442dfdb5f8836251893d6c4ecbe9 +R = 97642038932fdddbe2021ec1af53ae6b9af00ef9c8b9f26aea582892e80e6285 +S = 
9cb14918359338041cf795cf6781e4905837fa5ce3b3e50ffafb5f13c73b5bc8 +Invalid = Y + +Curve = P-256 +X = be7a651be0c87278569987cf62d7fa1dd1b3d6e1b868d8f4dfb56135a9960eec +Y = b7a62c588a987760b915edbd7f95506870c60f042471de1d8b2d4cd9d6563391 +Digest = 2f93ee45db133a14c26d418c2ffd3470ae63bf50 +R = aa889fb608b6939f6eeacf2f64c3b2e3a6061f2834058c7e724321720b737a63 +S = 6cd6d0ef2b93a760daa914e11b9b414bd4d72457405f00a62ab63f36d76efb73 +Invalid = Y + +Curve = P-256 +X = 76ddc46d8db8d7ce2ce837f60cdabcee92b7c7817ee41c8f066f1ae65f85c318 +Y = bea47191f1c584c87250370ce337a1de1583bcfc20ccc23b7a82e83f19adaa88 +Digest = 2136a5470ff9d45214a0b2c300042efea8ff7266 +R = 84a42efbf7ec04166ad144d19cd98c120aa2e79d483b5eea6fbdfa7f1222e07b +S = e41531205e691e65668f69f518abc7b60f32c373434872a043b7358462babf83 +Invalid = Y + +Curve = P-256 +X = 2f71b932f770ba9daf7c1dd47444ab6cb8881f71a1c597e719845b15cb84ca35 +Y = ab928625b40ec0738d0fc8dbc4df4a1f65d20bc0447b69cfa13bb20b95bb41d4 +Digest = ae6093bb37c1264ca3ead439e4f678721912c8c4 +R = 63fca172bbca6197cd2802a9cb61d74c2b47cf35f6d35203e67ffbaa838be775 +S = e70ec283cd212df6ba3723e26b697501f112d7cf64e4f45185dae76055e09f1e + +Curve = P-256 +X = ce775648b928db82ac5edb3b009d32959a73b86c45e96d4b8d5b6e640b7c2790 +Y = 52455caf08ee94d86f0984e9ec9268d74823f2102dd97fced59638055f6af18e +Digest = 60054807acb29e3091a023c42b9885c4945249e1 +R = 2a64b29146588f3153fee1029a0131ac0a8a25ba2ecc494f697c166c7c91fc08 +S = 7b429bc12a72ca3d76c119eea9f4098633cc31c87831e54d5d93afd6e8d20f4f +Invalid = Y + +Curve = P-256 +X = cd2f29a53f0ce57e0e4a542c3256e65ebbdc30415f4de771d5d706d3aeacc852 +Y = dbbf2c129f30d11fe77d7816a24187764eae3fb2ff70c1ec745e876e26f5232f +Digest = 5f50e35b134942295c16d003742fd6bce5bdab45 +R = 2454c5ee84e4f77b554acd368dd412389db8c78429590a092f24db2da43cb761 +S = 63e870ce2fa4085d4ff1e360f7a5c101a1f8b288abe71cca56887e613ad034b7 + +Curve = P-256 +X = 843f6d83d777aac75b758d58c670f417c8deea8d339a440bb626114318c34f29 +Y = 83e0c70008521c8509044b724420463e3478e3c91874d424be44413d1ce555f3 +Digest = cda2c7ad9abb2a858c4981550f78974c69e41cc31fa33509e3e83dc2 +R = d08e9a5db411019d826b20ac889227ed245503a6d839494db1e8d7995a6b245b +S = 8d46a204054125d0dc776ab1055302ec4eb0f20b90bca6d205f21d3cefd29097 + +Curve = P-256 +X = f08b56f73f7a0e098444f6f0a02ad81ce0b914a11cafa15893d1c84704e1c564 +Y = bbee9aeb91cdc2d1d1437b4168df73acfd64e8b02962b14c85e67187e1ef80a4 +Digest = 5453c2656550e9b3dc6c40a3f1362a73522396bc35d383dd6451128f +R = 71b3ec982725a007ac18a5cf60587e1fd1beb57685a1f9df3cddd9df25dcbc18 +S = 407e41217325f92f8a031cfcc4eb64c1a4b17b0a7459c254af754a7ea9eac997 +Invalid = Y + +Curve = P-256 +X = 0b688e761e1ddda2305e002809da65bf5916dfe1356a5b99b61f5576a9b90efa +Y = 90ec958e2e3a676e7bbf8e9394f72742875836125a317b0ae38374953f746a91 +Digest = 7289573d6bb7486e428e086bec9da9d7ff3c5f8bd0db2ec209fed6ae +R = ef89df3bbf079fb250f7e882c4f85c0023fc3804e862d9ef4d9530a15f1013f0 +S = 4ba985e900e6737b8e07eac638f7b38277ead4faee6d2076a2eee90fd2a6bf0f +Invalid = Y + +Curve = P-256 +X = 0b64480783e260e1e9caef37b4cc9c650d2d57e2c594b1106314843d8d7ab74e +Y = 29d373d8522deffe40055aef539f53f38937eb799b44f05a8d8c0b381f12907f +Digest = 497656e780360ec3b4bd1be97570615e4a32467982cd9330bc6aa224 +R = c5c26b0b21eef0f7a0f1cff38d0079d890376759369b01d8d8e959c1c785e203 +S = fecc400bf0deab99d87da168b9d0dd31d2dfa3435b0fe9d38b5fb8efd45195a4 +Invalid = Y + +Curve = P-256 +X = 7f78a8fd880c509940e2b83de67c9ab553ab91489bae75cdc1d5b523b06ab7f5 +Y = 7786aee7032c373cdfad7d9ddb6fa09a026f6da30fd477ab014d30a289d542a1 +Digest = 
6d88da9e83ae9457e233d7977172c062dfbdd17d365694515251e031 +R = c93ada69db326f76b1362d610cb8bcc6e7ef1dc03d3d11367e153c0e39d5dc86 +S = d0c02c71b14ef7a4af4e23bd207ce98449f5d6e7e5b3ec8cbbca9549e97d379d + +Curve = P-256 +X = e58cdc207c56f62e0bb7c0b55b7f7236a6b308f8fc4de3e61cdb3bf20ad2f62c +Y = 6056c0ee827e85ba284838954d0c6cc096df03b4611b1e0f7f9002bac86856d4 +Digest = 3f9a97b8ea807edc88788df8956c296b1daaed8dd12d50c712344091 +R = 2df3906527ad322000285bccdd11dd09130d633cf43534f5802604639eb847e0 +S = adaaad19b7c66836ef0f4afeff8ac5e898cd2523246a74a1a291a3a1ff583322 + +Curve = P-256 +X = 70b4bba10b7bbc6d4175ada8d485f3685b13916d0c992301f47e45b629c63d0e +Y = 257a93be31b09ff4cd22e3375e30b5a79f3bf3c74c80dde93e5d65e88c07c1c4 +Digest = cc3a0d3a5d4f28dc9144a3cdb276eb92265f1157a8d8192cf628673c +R = 6e714a737b07a4784d26bde0399d8eee81998a13363785e2e4fb527e6a5c9e4e +S = 94c0220f0f3fa66ff24f96717f464b66ae3a7b0f228ab6a0b5775038da13768a +Invalid = Y + +Curve = P-256 +X = 8b11b48d2397355000a5289d816b9892ae64dffc842abec02a2fb2db2bb34310 +Y = fc1a42528a0473cfc2c2e184b8bc5055096350fe1549d24b526d6536681026e8 +Digest = f340e491fa935be8945b8caa485d0699c66331e0e17c7407da1b018e +R = 61a91dd1c80049e70dc4aea84bda0efc6ec9c7b9dd16ecbccf687244c51184ce +S = e381e7b32bab49578c7e7ce7784ce19263e4a7dab4b614df411d20eaebfc391c +Invalid = Y + +Curve = P-256 +X = 7bad1b3d8bad4355a44511d2eb50daeae793af99418ada118327359936aa0e1d +Y = e7eff40334b7a5455f6b0d0ecdcdc513702857bb5bbb73c910c86746092bcd7d +Digest = 9cf84546c046b370c372c167ebba39af6aadd60463626453787bb058 +R = fd961b60b21be32b47abafa77e22197dc99af6825dcca46e0e3b1991a90aa202 +S = a0477f97b94a1c26a3b2d186791d7fc9dfa8130bbae79c28fa11ec93a3aeac0b +Invalid = Y + +Curve = P-256 +X = 407d92c9b28723602bf09f20f0de002afdf90e22cb709a8d38e3c51e82cba96c +Y = 4530659432e1dd74237768133e1f9808e62d0fbe5d1d979d1571baf645dcb84c +Digest = 0cf5cd48c93f45472d254196bebea4bddb272a2adff23bab8c3adf99 +R = a7dc65293ee3deb0008ae3e2d7ef9e9a4ebb8bf7b10d165f80ab8bed58d6fdef +S = 3e8300a3ee603a8d8234fe265c628e705015bf1903eb74c943323050626f701f +Invalid = Y + +Curve = P-256 +X = 26aea3dd5c53f984dbdaf415c7f26e1e73048658a548eb3b59dd5f721899919a +Y = dff15f57bd9b08644d49cbb214403647195725cd4d4511bc8a48b0770466ae9f +Digest = 75d6b6b575d0a2c89528b83c94ef864c825b66253ab662b36bb0e716 +R = 726af92afe53e8125b0b9f3659745be401a37ae658b7b1aa88c3cb97e9de22c3 +S = 794484c5837a419efe11a4e4293341a6fa36d21230925a0e5e135887302acca9 +Invalid = Y + +Curve = P-256 +X = e73418677ce044b331a6d60773cbae199221699d31e1bec4b68b9bc0b87e4cd0 +Y = 37215db4e3d9161f3351b385a61ddb2fcf1cec469d1659e7574610ed27fe879f +Digest = dcbb92e3be3951d37e37852d508f78da29c8183c5dbe59d6549f78ed +R = ac469290a8f61a2a8c6adc7533dd5cfe804e2e7bf101cc74e5f624f301bccd23 +S = 4c328c3bc259316641fff44753743afebe89b8627f904df7245e42adcff2dc76 +Invalid = Y + +Curve = P-256 +X = b0892b19c508b3543a5ae864ba9194084c8f7ae544760759550cc160972e87ff +Y = 9208e9b0c86ad6bc833e53026f233db9a42298cdb35d906326008377520b7d98 +Digest = 90333facb4f5068c1d05d1a478fb46d02f367e271a000474c06a5fec +R = a62dd0d1518c6b9c60de766b952312a8d8c6eaa36a68196d2a30a46fb17dc067 +S = b9ded660e978129277f74c1d436003d1e6d556dc8eed9d505bbaf4c67cb13d21 +Invalid = Y + +Curve = P-256 +X = 8c5c41cb07d828a6a86be4533aef791d3a70a95cb285aa2956b21feeac2f8c49 +Y = 84101581cad7a48b7d0596df7ffed47085d22e8a4af685cddbeeb32ea69ae190 +Digest = 8bb52bd045c985167f673c07b613a3402f435a54c122877bc0c5fe34 +R = 9812449df0a51f7a2a8f78aa9a589ca9644dce285f1e69658daaea759fa5bd7e +S = 
beb4c27c748a7944e37afe861576f76b5a749a8ccbbd7dec00838ba250ddfe1a +Invalid = Y + +Curve = P-256 +X = 788d7e54ab03020e4954f41259052ee5af68361492b180da31fbbe68d868aa95 +Y = 982a3ababa6d351649e56da3faeb7160b9de74e22fe93a06ead1bd9a8dffdf7e +Digest = 9870ae25b0f0403eff1079b94669cf95fb250fb098eeb885ff08f117 +R = 3ddea06bf8aa4a1b0c68674a2c4796def0bfb52236f4efb3332204a41fd8ea89 +S = 871237039431a41aeefcdd08f67848b2b09067e3a1344c8ed9b372d1b1c754a6 +Invalid = Y + +Curve = P-256 +X = 87f8f2b218f49845f6f10eec3877136269f5c1a54736dbdf69f89940cad41555 +Y = e15f369036f49842fac7a86c8a2b0557609776814448b8f5e84aa9f4395205e9 +Digest = a82c31412f537135d1c418bd7136fb5fde9426e70c70e7c2fb11f02f30fdeae2 +R = d19ff48b324915576416097d2544f7cbdf8768b1454ad20e0baac50e211f23b0 +S = a3e81e59311cdfff2d4784949f7a2cb50ba6c3a91fa54710568e61aca3e847c6 +Invalid = Y + +Curve = P-256 +X = 5cf02a00d205bdfee2016f7421807fc38ae69e6b7ccd064ee689fc1a94a9f7d2 +Y = ec530ce3cc5c9d1af463f264d685afe2b4db4b5828d7e61b748930f3ce622a85 +Digest = 5984eab8854d0a9aa5f0c70f96deeb510e5f9ff8c51befcdc3c41bac53577f22 +R = dc23d130c6117fb5751201455e99f36f59aba1a6a21cf2d0e7481a97451d6693 +S = d6ce7708c18dbf35d4f8aa7240922dc6823f2e7058cbc1484fcad1599db5018c +Invalid = Y + +Curve = P-256 +X = 2ddfd145767883ffbb0ac003ab4a44346d08fa2570b3120dcce94562422244cb +Y = 5f70c7d11ac2b7a435ccfbbae02c3df1ea6b532cc0e9db74f93fffca7c6f9a64 +Digest = 44b02ad3088076f997220a68ff0b27a58ecfa528b604427097cce5ca956274c5 +R = 9913111cff6f20c5bf453a99cd2c2019a4e749a49724a08774d14e4c113edda8 +S = 9467cd4cd21ecb56b0cab0a9a453b43386845459127a952421f5c6382866c5cc +Invalid = Y + +Curve = P-256 +X = e424dc61d4bb3cb7ef4344a7f8957a0c5134e16f7a67c074f82e6e12f49abf3c +Y = 970eed7aa2bc48651545949de1dddaf0127e5965ac85d1243d6f60e7dfaee927 +Digest = d1b8ef21eb4182ee270638061063a3f3c16c114e33937f69fb232cc833965a94 +R = bf96b99aa49c705c910be33142017c642ff540c76349b9dab72f981fd9347f4f +S = 17c55095819089c2e03b9cd415abdf12444e323075d98f31920b9e0f57ec871c + +Curve = P-256 +X = e0fc6a6f50e1c57475673ee54e3a57f9a49f3328e743bf52f335e3eeaa3d2864 +Y = 7f59d689c91e463607d9194d99faf316e25432870816dde63f5d4b373f12f22a +Digest = b9336a8d1f3e8ede001d19f41320bc7672d772a3d2cb0e435fff3c27d6804a2c +R = 1d75830cd36f4c9aa181b2c4221e87f176b7f05b7c87824e82e396c88315c407 +S = cb2acb01dac96efc53a32d4a0d85d0c2e48955214783ecf50a4f0414a319c05a + +Curve = P-256 +X = a849bef575cac3c6920fbce675c3b787136209f855de19ffe2e8d29b31a5ad86 +Y = bf5fe4f7858f9b805bd8dcc05ad5e7fb889de2f822f3d8b41694e6c55c16b471 +Digest = 640c13e290147a48c83e0ea75a0f92723cda125ee21a747e34c8d1b36f16cf2d +R = 25acc3aa9d9e84c7abf08f73fa4195acc506491d6fc37cb9074528a7db87b9d6 +S = 9b21d5b5259ed3f2ef07dfec6cc90d3a37855d1ce122a85ba6a333f307d31537 +Invalid = Y + +Curve = P-256 +X = 3dfb6f40f2471b29b77fdccba72d37c21bba019efa40c1c8f91ec405d7dcc5df +Y = f22f953f1e395a52ead7f3ae3fc47451b438117b1e04d613bc8555b7d6e6d1bb +Digest = 8a3e7ad7b9b1b0cdc48e58d1e651fe6d710fef1420addeb61582bdd982d2b44c +R = 548886278e5ec26bed811dbb72db1e154b6f17be70deb1b210107decb1ec2a5a +S = e93bfebd2f14f3d827ca32b464be6e69187f5edbd52def4f96599c37d58eee75 +Invalid = Y + +Curve = P-256 +X = 69b7667056e1e11d6caf6e45643f8b21e7a4bebda463c7fdbc13bc98efbd0214 +Y = d3f9b12eb46c7c6fda0da3fc85bc1fd831557f9abc902a3be3cb3e8be7d1aa2f +Digest = d80e9933e86769731ec16ff31e6821531bcf07fcbad9e2ac16ec9e6cb343a870 +R = 288f7a1cd391842cce21f00e6f15471c04dc182fe4b14d92dc18910879799790 +S = 247b3c4e89a3bcadfea73c7bfd361def43715fa382b8c3edf4ae15d6e55e9979 +Invalid = Y + +Curve = P-256 +X = 
bf02cbcf6d8cc26e91766d8af0b164fc5968535e84c158eb3bc4e2d79c3cc682 +Y = 069ba6cb06b49d60812066afa16ecf7b51352f2c03bd93ec220822b1f3dfba03 +Digest = 7c1048884558961c7e178b3a9b22583fca0d17f355a9887e2f96d363d2a776a3 +R = f5acb06c59c2b4927fb852faa07faf4b1852bbb5d06840935e849c4d293d1bad +S = 049dab79c89cc02f1484c437f523e080a75f134917fda752f2d5ca397addfe5d +Invalid = Y + +Curve = P-256 +X = 224a4d65b958f6d6afb2904863efd2a734b31798884801fcab5a590f4d6da9de +Y = 178d51fddada62806f097aa615d33b8f2404e6b1479f5fd4859d595734d6d2b9 +Digest = 4c8d1afb724ad0c2ec458d866ac1dbb4497e273bbf05f88153102987e376fa75 +R = 87b93ee2fecfda54deb8dff8e426f3c72c8864991f8ec2b3205bb3b416de93d2 +S = 4044a24df85be0cc76f21a4430b75b8e77b932a87f51e4eccbc45c263ebf8f66 +Invalid = Y + +Curve = P-256 +X = 43691c7795a57ead8c5c68536fe934538d46f12889680a9cb6d055a066228369 +Y = f8790110b3c3b281aa1eae037d4f1234aff587d903d93ba3af225c27ddc9ccac +Digest = 8581034ec7d7a6b163d71820923f616b362748f2846042c9896d8e4bf7577960 +R = 8acd62e8c262fa50dd9840480969f4ef70f218ebf8ef9584f199031132c6b1ce +S = cfca7ed3d4347fb2a29e526b43c348ae1ce6c60d44f3191b6d8ea3a2d9c92154 +Invalid = Y + +Curve = P-256 +X = 9157dbfcf8cf385f5bb1568ad5c6e2a8652ba6dfc63bc1753edf5268cb7eb596 +Y = 972570f4313d47fc96f7c02d5594d77d46f91e949808825b3d31f029e8296405 +Digest = e5b30e0041a33281210644938d9aaa15ef2c1247b4178f7ca1ee935ce23daabc +R = dfaea6f297fa320b707866125c2a7d5d515b51a503bee817de9faa343cc48eeb +S = 8f780ad713f9c3e5a4f7fa4c519833dfefc6a7432389b1e4af463961f09764f2 +Invalid = Y + +Curve = P-256 +X = 072b10c081a4c1713a294f248aef850e297991aca47fa96a7470abe3b8acfdda +Y = 9581145cca04a0fb94cedce752c8f0370861916d2a94e7c647c5373ce6a4c8f5 +Digest = edd72dc0aa91649e09e2489c37ec27efab3b61953762c6b4532a9b1cd08a500d +R = 09f5483eccec80f9d104815a1be9cc1a8e5b12b6eb482a65c6907b7480cf4f19 +S = a4f90e560c5e4eb8696cb276e5165b6a9d486345dedfb094a76e8442d026378d +Invalid = Y + +Curve = P-256 +X = 09308ea5bfad6e5adf408634b3d5ce9240d35442f7fe116452aaec0d25be8c24 +Y = f40c93e023ef494b1c3079b2d10ef67f3170740495ce2cc57f8ee4b0618b8ee5 +Digest = 0d06ba42d256062e16b319a0f3099109518a765f26bac3b9f56930d965617726 +R = 5cc8aa7c35743ec0c23dde88dabd5e4fcd0192d2116f6926fef788cddb754e73 +S = 9c9c045ebaa1b828c32f82ace0d18daebf5e156eb7cbfdc1eff4399a8a900ae7 +Invalid = Y + +Curve = P-256 +X = 2d98ea01f754d34bbc3003df5050200abf445ec728556d7ed7d5c54c55552b6d +Y = 9b52672742d637a32add056dfd6d8792f2a33c2e69dafabea09b960bc61e230a +Digest = 41007876926a20f821d72d9c6f2c9dae6c03954123ea6e6939d7e6e669438891 +R = 06108e525f845d0155bf60193222b3219c98e3d49424c2fb2a0987f825c17959 +S = 62b5cdd591e5b507e560167ba8f6f7cda74673eb315680cb89ccbc4eec477dce + +Curve = P-256 +X = 40ded13dbbe72c629c38f07f7f95cf75a50e2a524897604c84fafde5e4cafb9f +Y = a17202e92d7d6a37c438779349fd79567d75a40ef22b7d09ca21ccf4aec9a66c +Digest = 5aa8e8a6f0622b841416e1a70d79a54641d2c699a075b6960fe5dcf96301da8ca6f15b0948d4ededac30a42e00d3b310 +R = be34730c31730b4e412e6c52c23edbd36583ace2102b39afa11d24b6848cb77f +S = 03655202d5fd8c9e3ae971b6f080640c406112fd95e7015874e9b6ee77752b10 +Invalid = Y + +Curve = P-256 +X = 1f80e19ffeb51dd74f1c397ac3dfd3415ab16ebd0847ed119e6c3b15a1a884b8 +Y = 9b395787371dbfb55d1347d7bed1c261d2908121fb78de1d1bf2d00666a62aed +Digest = 244656186c11c2e67be88099d55e60f4b68e61fba0b214aac3399dc559cfccc02f9884e85623426dbdc3243f2b5374f7 +R = 249ca2c3eb6e04ac57334c2f75dc5e658bbb485bf187100774f5099dd13ef707 +S = 97363a05202b602d13166346694e38135bbce025be94950e9233f4c8013bf5bf +Invalid = Y + +Curve = P-256 +X = 
ce4dcfa7384c83443ace0fb82c4ac1adfa100a9b2c7bf09f093f8b6d084e50c2 +Y = d98ae7b91abee648d0bfde192703741ac21daad7262af418b50e406d825eb0d6 +Digest = adaeadda3f0e941fba1d3e206a84e6d7530d800e0f215b3ddd82022f27c5be44fed27bc73084c6f7ca55555532be2e3b +R = 597e1e04d93a6b444ccc447a48651f17657ff43fb65fe94461d2bf816b01af40 +S = 359fe3817963548e676d6da34c2d0866aa42499237b682002889eaf8893814d2 + +Curve = P-256 +X = 1b677f535ac69d1acd4592c0d12fac13c9131e5a6f8ab4f9d0afdcb3a3f327e0 +Y = 5dca2c73ec89e58ef8267cba2bb5eb0f551f412f9dc087c1a6944f0ce475277a +Digest = e34a541f87ff0eaa0c640f555caec6bf11a1320c74c47a8ff172c4e2ec902e48d499732b12a86189e750bbf4c0424c72 +R = df0b0cd76d2555d4c38b3d70bfdf964884d0beeb9f74385f0893e87d20c9642d +S = 128299aabf1f5496112be1fe04365f5f8215b08a040abdfeca4626f4d15c005b +Invalid = Y + +Curve = P-256 +X = 7ffc2853f3e17887dda13b0eb43f183ce50a5ac0f8bba75fb1921172484f9b94 +Y = 4cc523d14192f80bd5b27d30b3b41e064da87bfbae15572dd382b9a176c123a2 +Digest = 0689927a38486cccf28fe9454e08e0d74843424b89be4cdee8e48f39a69addec730184da72f914cea67231c765ee2574 +R = 3156176d52eb26f9391229de4251993a41b8172f78970bb70e32a245be4bb653 +S = 62827a29e12d2f29b00fb2d02dd5f2d5412e17a4455f4431a5c996881fdfc0ee +Invalid = Y + +Curve = P-256 +X = 5569f76dc94243cde819fb6fc85144ec67e2b5d49539f62e24d406d1b68f0058 +Y = 1208c38dbe25870deab53c486f793a1e250c9d1b8e7c147ea68b71196c440730 +Digest = 97f8f8cea435282ac746730ac744bf97d85d4e249c0b1d9c7b83c7e59aed172ffc3724d7e6fab7d6ab55ffb3a39c0775 +R = 706f2ba4025e7c06b66d6369a3f93b2fec46c51eceff42a158f7431919506cfb +S = b4e75ac34a96393237fc4337789e37168d79382705b248051c9c72bcbac5f516 +Invalid = Y + +Curve = P-256 +X = e4b470c65b2c04db060d7105ec6911589863d3c7f7ce48726ba3f369ea3467e8 +Y = 44c38d3ae098de05f5915a5868c17fee296a6e150beb1f000df5f3bec8fc4532 +Digest = 5b937a2af46dbf18b4a6fb042ea353a6878e0d4beac016002b3d91a42bcba52856c07a3f35c08dfecb4f03e1c0b9948e +R = c9c347ee5717e4c759ddaf09e86f4e1db2c8658593177cfda4e6514b5e3ecb87 +S = baae01e9e44a7b04d69c8eaaed77c9e3a36ce8962f95cc50a0db146b4e49eb40 +Invalid = Y + +Curve = P-256 +X = 96050c5fa2ddd1b2e5451d89ee74a0b7b54347364ddc0231715a6ef1146fe8dc +Y = e0888a9e78aeea87f6e1e9002b2651169f36c4ee53013cfc8c9912b7fd504858 +Digest = b123e07744f05ad523790ea5bfa3f848869a3bfdbf936a496c8606b577ed8427eb7ee888e0fe18d4e3cfac73baad883f +R = 2353d6cd3c21b8ea7dbc1cd940519812dbe365a3b15cd6aebba9d11cf269867a +S = 85f560273cd9e82e6801e4cb1c8cd29cdac34a020da211d77453756b604b8fa7 + +Curve = P-256 +X = 0c07bb79f44012299fbfd5a0f31397aaf7d757f8a38437407c1b09271c6551a0 +Y = 84fe7846d5d403dc92c0091fbd39f3c5cbca3f94c10b5cae44e2e96562131b13 +Digest = fb8d12652de59e63ef5297641dfbce084808de146720e9069c2ef814bcd80b6187f7422a6cd9c706f8d64ccf80e8bc54 +R = 49e9425f82d0a8c503009cead24e12adc9d48a08594094ca4f6d13ad1e3c571d +S = 1f1b70aaa30a8ff639aa0935944e9b88326a213ab8fce5194c1a9dec070eb433 +Invalid = Y + +Curve = P-256 +X = 71db1de1a1f38f356c91feaff5cfe395d1a5b9d23cf6aa19f38ae0bcc90a486d +Y = ecdd6ffb174a50f1cc792985c2f9608c399c98b8a64a69d2b5b7cdd9241f67e2 +Digest = 2d8c6585a3b6319a556e27b53d434f455f73e771c8fc6a115f5c92a8e9a81ce2b4336a5c3edf98910689d11f4c93632a +R = b0443b33a6f249470d2f943675009d21b9ccbead1525ae57815df86bb20470bf +S = 316dbee27d998e09128539c269e297ac8f34b9ef8249a0619168c3495c5c1198 +Invalid = Y + +Curve = P-256 +X = 8219b225aa15472262c648cac8de9aad4173d17a231ba24352a5a1c4eea70fad +Y = 0fee2b08ad39fbf0db0016ef2896ca99adc07efc8c415f640f3720498be26037 +Digest = a4cc3b23f54d9d48ba6b0ad3da3b2e3a0806f41348bd7844e9c9b8648753bdeef8a039e1fa4f5172c89148d65b14056f +R 
= 134fb689101aaad3954de2819d9fbd12072fe2bc36f496bbf0d13fa72114ab96 +S = e65c232bd915b59e087e7fd5ec90bf636cfa80526345c79a0adfd75003045d6f +Invalid = Y + +Curve = P-256 +X = c934195de33b60cf00461fc3c45dad068e9f5f7af5c7fa78591e95aeb04e2617 +Y = b588dd5f9965fdaa523b475c2812c251bc6973e2df21d9beaace976abf5728cb +Digest = b962b63a7743ad77f9072f2f08d277f6dda8cc3420ddd37d873746008895902bcce218fbfed1a8cb28406978dd8e5134 +R = 71f302440eb4ed2a939b69e33e905e6fdc545c743458d38f7e1a1d456e35f389 +S = 54eaa0eb9cd7503b19a9658f0a04955d9f0ab20ebc8a0877e33c89ee88ad068f +Invalid = Y + +Curve = P-256 +X = 9e1adcd48e2e3f0e4c213501808228e587c40558f52bb54ddbb6102d4048ea92 +Y = 34eff98704790938e7e0bdf87ae39807a6b77dfdc9ecdfe6dd0f241abae1aeb2 +Digest = 21b883fae159867731b123a2606e9b3320fb53a00e4a5dfe3bc3429dd53b8068197be3c7288c1e0bf28a4fc7b13bd70f +R = ce4f0d7480522c8dd1b02dd0eb382f22406642f038c1ede9411883d72b3e7ed0 +S = 8546e1ee3b77f9927cdaccbc2f1cf19d6b5576b0f738bb1b86a0c66b39ca56fb +Invalid = Y + +Curve = P-256 +X = 93edbecb0b019c2cc03060f54cb4904b920fdb34eb83badd752be9443036ae13 +Y = b494e9295e080a9080fe7e73249b3a5904aa84e1c028121eecd3e2cf1a55f598 +Digest = fcc17b88077570c053650e1de42ae6bb1522900b38996decc87704aab6a87ab01d52f83f6442875f378a262c22d23ab2 +R = eec2986d47b71995892b0915d3d5becc4dcb2ab55206d772e0189541b2184ddf +S = 8a6c1edeb6452627ad27c8319599c54ac44cdd831ea66f13f49d90affe6ad45b + +Curve = P-256 +X = 3205bae876f9bd50b0713959e72457165e826cbbe3895d67320909daa48b0ebc +Y = d1592562273e5e0f57bbfb92cedd9af7f133255684ee050af9b6f02019bbcafa +Digest = 299a6070d32a5557010753d7559dbd8d2bde8a8feae5417616ceb5b167997fd2fac0c2bd44264106d3a9720d5e805a04 +R = 0124f3f1c61ec458561a4eaa6c155bd29e59703d14556324924683db3a4cf43b +S = 688a5c5fc0c7ba92210c50cce5b512a468a880e05acc21ca56571d89f45f603a +Invalid = Y + +Curve = P-256 +X = 484e31e69ef70bb8527853c22c6b6b4cd2a51311dde66c7b63f097dbb6ab27bf +Y = e1ff8177f4061d4fbbacbbc70519f0fc8c8b6053d72af0fe4f048d615004f74e +Digest = f1e9cda2e096ece9a1fc57e55eeeb56b1c635380c0f9a1800a4a1a5f105d1fc0c60e776234daaa8a6f7c0f5286bb420b3f607e7cc0a7d840ad5dcbab26c797b0 +R = 91a303d8fe3ab4176070f6406267f6b79bfe5eb5f62ae6aeb374d90667858518 +S = e152119cefa26826ea07ec40a428869132d70812c5578c5a260e48d6800e046a +Invalid = Y + +Curve = P-256 +X = 8b75fc0129c9a78f8395c63ae9694b05cd6950665cf5da7d66118de451422624 +Y = b394171981d4896d6e1b4ef2336d9befe7d27e1eb87f1c14b8ddda622af379dc +Digest = 0527199fadea30f9e5e66166a3ebcdf6aedf906984535f48165e591eff36f1c0de6b0fa69aefb6399e8a213cc2ce53268fbe18c3471b7708bc27c426aaa769a4 +R = 17e298e67ad2af76f6892fdcead00a88256573868f79dc74431b55103058f0b0 +S = 881328cd91e43d30133f6e471e0b9b04353b17893fb7614fd7333d812a3df6b4 +Invalid = Y + +Curve = P-256 +X = 76e51086e078b2b116fd1e9c6fa3d53f675ae40252fb9f0cc62817bd9ce8831d +Y = ca7e609a0b1d14b7c9249b53da0b2050450e2a25cb6c8f81c5311974a7efb576 +Digest = c926a5026d8f83ffa2092caf863f2d8a886af391462969b13a11d3c6c5fa66bb4281bc6e60a1e99a2e1ae95d689a66282096a0f27aacc048f32d39297649a014 +R = 23b653faaa7d4552388771931803ce939dd5ee62d3fa72b019be1b2272c85592 +S = a03c6f5c54a10861d6b8922821708e9306fd6d5d10d566845a106539cbf4fadd +Invalid = Y + +Curve = P-256 +X = bc7c8e09bd093468f706740a4130c544374fdc924a535ef02e9d3be6c6d3bbfa +Y = af3f813ae6646f5b6dbfb0f261fd42537705c800bb1647386343428a9f2e10fc +Digest = 4d74631eb67fd1a6fa93ecb6e6112b6699e78c1d4c24ae81d0d5842efe5d93c2fd7a7863f8d45d1b2fafecbe41b7dc19c4b2bc208e014ffdc216e7eda0392a70 +R = 6bd7ce95af25abfbf14aef4b17392f1da877ab562eca38d785fe39682e9c9324 +S = 
6688bea20c87bab34d420642da9bdd4c69456bdec50835887367bb4fb7cd8650 +Invalid = Y + +Curve = P-256 +X = 9cb0cf69303dafc761d4e4687b4ecf039e6d34ab964af80810d8d558a4a8d6f7 +Y = 2d51233a1788920a86ee08a1962c79efa317fb7879e297dad2146db995fa1c78 +Digest = 0250f93e6932887df519921f9a8dcff110be0768dc351ef73a940a579fae2d20061759e892e289c3e4ba5f7fe17d6ebb15c5931d48db55ebc81549f6637292fe +R = 4b9f91e4285287261a1d1c923cf619cd52c175cfe7f1be60a5258c610348ba3d +S = 28c45f901d71c41b298638ec0d6a85d7fcb0c33bbfec5a9c810846b639289a84 + +Curve = P-256 +X = e31096c2d512fbf84f81e9bdb16f33121702897605b43a3db546f8fb695b5f6f +Y = 6fbec6a04a8c59d61c900a851d8bf8522187d3ec2637b10fa8f377689e086bba +Digest = f91b09107d10904d3968ec29f85e456ac4e828f32e8da3db6a13f5566bfa625e2ad03f8dad5425a073c0d61d25de63dcafa9f4fcd206f29e9cb6b0fecd74aa57 +R = 1b244c21c08c0c0a10477fb7a21382d405b95c755088292859ca0e71bab68361 +S = 852f4cbfd346e90f404e1dd5c4b2c1debca3ea1abefe8400685d703aea6c5c7f +Invalid = Y + +Curve = P-256 +X = 633c2ee5630b62c9ce839efd4d485a6d35e8b9430d264ffe501d28dbace79123 +Y = 4b668a1a6d1a25b089f75c2bd8d8c6a9a14fe7b729f45a82565da2e866e2c490 +Digest = 575c64df58c8dc517ce65b388fa3ed69470163afecbabc3fa94b497ff7f3fe36ff12fabe2b84cebbf667744195091e4e2335a71d36414e0af0d0260fc8e8ea44 +R = bf2111c93ec055a7eda90c106fce494fd866045634fd2aa28d6e018f9106994e +S = 86b0341208a0aa55edecfd272f49cb34408ce54b7febc1d0a1c2ce77ab6988f8 +Invalid = Y + +Curve = P-256 +X = f78dce40d1cb8c4af2749bf22c6f8a9a470b1e41112796215dd017e57df1b38a +Y = 61b29b0bc03dff7fa00613b4de1e2317cfbf2badd50dee3376c032a887c5b865 +Digest = 4c097f2f5b2489c94258b34d529675bb5d77d4be083b51b01188dd42b4b5473982728763ee6fbad479375c5eacb5edaaec0b6583a10b19aad81ec88dde2d0e7f +R = 4a96169a5dea36a2594011537ee0dc19e8f9f74e82c07434079447155a830152 +S = a204eaa4e97d7553a1521d9f6baadc0b6d6183ba0f385d8593d6ca83607c4d82 +Invalid = Y + +Curve = P-256 +X = 3fcc3b3e1b103fe435ac214c756bdaad309389e1c803e6d84bbbc27039fcf900 +Y = 7f09edd1ec87a6d36dc81c1528d52a62776e666c274415a9f441d6a8df6b9237 +Digest = 1a3dd21cb6ac1fa7fc196319cf534b7608afb93805420fcb5250dff453564a5b22e22971a3ce6dd222405fea018cd0508d86c561eca15e1ac7d79c14e916b86a +R = 1cac13f277354456ae67ab09b09e07eb1af2a2bf45108da70f5c8c6a4cbcd538 +S = 5d83752e540525602ba7e6fee4d4263f3eda59e67df20aac79ca67e8899fed0d +Invalid = Y + +Curve = P-256 +X = 5ec702d43a67ada86efbfc136cf16d96078906954a3f1f9e440674cd907e4676 +Y = 05a62044fed8470dd4fca38d89d583ce36d50d28b66ab0b51922b21da92c56d9 +Digest = c5c016f6c9b525987dd835131def77cc72d8360d364eeccdd7af8b95712b6cd487c0b846201f3b64466fd140833514ae8d765da395fbd9d3c03ca410effa9a69 +R = 75f3037298f1457dba55743999976a1c2636b2b8ab2ed3df4736a6d2934acc83 +S = 19d43ad168dda1bb8ac423f8f08876515234b3d841e57faef1b5ab27359b27ef +Invalid = Y + +Curve = P-256 +X = f63afe99e1b5fc652782f86b59926af22e6072be93390fe41f541204f9c935d1 +Y = f6e19ce5935e336183c21becf66596b8f559d2d02ee282aa87a7d6f936f7260c +Digest = 9eb2f9fa96a1f3ffcef9600522730e86d26d328ec0c1bf2fbfe55a38754610341fda1b894fdcf10c9bc4f48819010fdcf0d24f27ff539e40c6855cafbd306386 +R = cef4831e4515c77ca062282614b54a11b7dc4057e6997685c2fbfa95b392bf72 +S = f20dc01bf38e1344ba675a22239d9893b3a3e33d9a403329a3d21650e9125b75 + +Curve = P-256 +X = 6d11b09d2767cf8d275faee746c203486259f66dd2bfa3a65c39371a66b23385 +Y = 4eb05c73e05261e979182833f20311e5366f72f4b949665ff294f959375534c6 +Digest = 0e71b28b0a1eac7aa881c09daec616c93d9a9286b5f5fdf2642d211021b125fa884b2595b73c7c3e649e61cd7157ef6660076a3b87ddf830db46533f3aa30afa +R = 
15a697cdb614e11c0810e1e764cd501fcabc70874c957587bc4883d9438e177f +S = 7bf6244f92bc768063cecb5336c8eaacd23db930b28703560f241c7d93950dfd +Invalid = Y + +Curve = P-256 +X = f3899caba038efb534c4cea0bd276814ffd80194473c903b81af11c8c05cb6e6 +Y = 6ea6b17402fcf2e8e737d11ffc7c2ed3b2d0bc3b8f271a381f4294cff62682c3 +Digest = 104ace16689d785df09a81c5cf47a496db30fbd696aa4df080219487575a23641436e70329dd1c13290582c0d03aae200e51189d43666c86f38a5203c16cd7e4 +R = 57b99380452e1d37b133c49b9ba493dee8630940477ca3351a43d90b99871e6a +S = df599c3a37105af3ecc159b3b685ccb3e151b7d5cf2d97147974ae71f466b615 +Invalid = Y + +Curve = P-256 +X = 1fd6f4b98d0755291e7a230e9f81ecf909e6350aadb08e42a3262ff19200fbd2 +Y = 5578fef79bc477acfb8ed0dc10c4f5809c14dc5492405b3792a7940650b305d7 +Digest = 761a54f3718985b6d7bcfdd57d6c4823f854831bd29305fcb07e34e3f825d451fca28a62ce9582e3957d89ea7c1bc1afe3aa58fd2fa18566974600fc394cf2a8 +R = 97a99e96e407b3ada2c2dcf9ceeeb984d9a4d0aa66ddf0a74ca23cabfb1566cc +S = 0ecac315dc199cfea3c15348c130924a1f787019fe4cd3ae47ca8b111268754a +Invalid = Y + +Curve = P-256 +X = 2dcbd8790cee552e9f18f2b3149a2252dcd58b99ca7dc9680b92c8c43aa33874 +Y = 5dbc8bb8813c8e019d80e19acdb0792f537980fecde93db621aaf1f6d0e6ee34 +Digest = 45b082e804443b53a82229cdf13e4c5f8f31fe93170cc8a23f63eef506cb7748388e1a971a2f81e3daa324cf2bb69118f7418f40df66a24f50c34a55e1416c3a +R = 2bdbd8b0d759595662cc10b10236136ef6ce429641f68cf6480f472fcc77bc9f +S = 7e7df0c8b86f7db06caf1610166f7b9c4c75447f991d5aaf4dea720c25985c8c + +Curve = P-384 +X = 6881154cfe3f09affbee04cd387b27b7854326faf8906c4b9c9e6ac2c632e0d59717b3f33f6d747d7b7cbb4e4dc01fb8 +Y = ba295ae0966f06ad9d84b3bb4da7f99b56044c99f88d71082cfea6964ea3c63bb79806a6a41fcc314b55b3f64f82b68a +Digest = 8a6429d55885146f7aab582a1aa9360fa9591b0a +R = 2112385a75d4edda89ae2bc3c74524dc792544a3a52fdb588da3f0feaee6a11623db275e2ab8abdd998cc42a29c60856 +S = 8d308a3987b81c595f8cec19898b1a42da8eda97496af280033b0f915283f171fed7e2a221fa9c78927962189333f437 +Invalid = Y + +Curve = P-384 +X = 2f2f43f244ae027c3d2ec5c900393f80a8ad0e9b9a12a047195d29a39f2b7026b071688dd9a6764379d02a5ed8035ec1 +Y = e43d45851bc76c37d34dbed996a65ffcfbbaf0e2cbfbc9f62d2116bdf3b330bbef5acdbcd0aa6d949f771daa17cda1e3 +Digest = 5f41322db1a276042ae807f0f0d6f1e04cb5cd26 +R = c011c52e9cb02048957a233704ff9a2c1d4c56e08ebb083aa8ba351f041a23a7d0da19088ac6c60ea2ca117531c7cf35 +S = a66ca9bf06c35d129a8253a0f793acf681e482d9994868b275a230b215286e03a66a0de77c7a53174375137fd4688556 +Invalid = Y + +Curve = P-384 +X = 9a5e1932d318bfa7986f0dac4489c6f55775427bb60fb24bac7646b9994bbc3a9b5cd15e818cc4e832afc1c3fca9abae +Y = 64c89e7c3399c136b2718ab675944207157f0bf23d9e2a807ae7ac3bef81da7ec3c56c2d2c08afc53301af2a3cc71861 +Digest = d36ef9ee70a3b61ba31cdfcd0cac6e49331a407f +R = 4cf6c63fea6c80efc105cd99afe2b53da05ae16566ddb20b9d40a076575ffac419b6807fa336fc6e7c7416c59775ef09 +S = aec2d96054b4b23c49faaf9903ccf63bc96281fb7c1b9d14daa54bba51bb2b2f4d3a901f3b0b9cb2b62976459219350c +Invalid = Y + +Curve = P-384 +X = b3aeff27b65540c6da10a88008404b1d49239c87fbf47932518fb87a9bb132403d1f310f531d086340bb4a68c3e64b9b +Y = 567e75f442fcd81017b8adc4cce634f5ffa3cd497d38221d34dc1f43aef99133131ff1b197f7b9f37beecae5c438849a +Digest = dd0f9c326fb50593fd0a0df31abeeb00a22eb956 +R = 3b94a2514eb915b71e18c867ad7f508a35375c5bcd4b797b86054798569870b2477e2ac14406628017d829400efc63b2 +S = 179a10441a0beea3b375248e697e0d19e24bb68184c373fe4302839b97dd7353a5a25929c2733796b0c0d8211bd67c51 +Invalid = Y + +Curve = P-384 +X = 
0874a2e0b8ff448f0e54321e27f4f1e64d064cdeb7d26f458c32e930120f4e57dc85c2693f977eed4a8ecc8db981b4d9 +Y = 1f69446df4f4c6f5de19003f45f891d0ebcd2fffdb5c81c040e8d6994c43c7feedb98a4a31edfb35e89a30013c3b9267 +Digest = a871caf9fff9856031a79a55b96753c1a34ccb73 +R = 8d9d3e3d0b2b2871ea2f03f27ba8699f214be8d875c0d770b0fff1c4ce341f0c834ac11f9ec12bfdb8320b1724c8c220 +S = 62150dfba8e65c0c7be7ef81c87241d2c37a83c27eb31ccc2b3c3957670a744c81be6d741340b5189cc0c547df81b0d2 + +Curve = P-384 +X = b4b92211edbd41c5468d2ba70810bc37b5e7c954c7bd0db80c4fa89ccba10bf07cdab953828a068bc0104d28e4040c14 +Y = 93ed318efce3dff98fc782b788d78658ea5ecde4f716e2d5d0ec2d87a2e761daa1f1658cfb857762caa567baaccf9924 +Digest = 765343d50541bc2c0e20193648048016a95e7588 +R = aa3978eabd196ddf9cab2815cc9cbab0b61cd639deaf70e093a10a58ddf9f410ee1ab965ff8fbb98efbe812421a613d3 +S = 02761a2947e1855806b8a25b9ebb0762be9f5517461a371e5783f34b184f32c4ea684b362119b1a2d8a3ff439f10291f + +Curve = P-384 +X = 63b4cc14f9efd3b8f29e65806591d1e9c54f34a3f5231339bcdbfa4109c42d946a59cdd7bbd2591fd1b2383a0819772f +Y = 55ab3d208109da6ef039c23cddd52a5af619266d8fe066dcabb1af885ad5501401a78c44ed3b5fff2892fdcb2a3ac8b2 +Digest = 4535ef8d7396b4f2af65660ebbb56f356cacefd9 +R = a3f9b840fd7201356f35b5dde39027410aad26ac61919c14fe7b0535bb74e7218cb3312bfa60aac63f14166f32ceff26 +S = 1b1bcbcb0237fad4e406c8d4e3e39b55642d8535afa9ccbc9c601cb4e01891df79f1bc792687cb3a5ee7703565c4a13b +Invalid = Y + +Curve = P-384 +X = f82f82f8f7454ce7a94a040ec0bbb52d49e3b9f8ddd095704973c760ee6067a5c28369656f22d70d8bb1cd70ef9bfea0 +Y = 0e36e256d02870ee5646a17aac4b280c9d1d2e1d4803eb3cb32e7f754cc889522120efd7c4d8a82e509a4d8f266d3ce4 +Digest = 26302c41e6da59e2df2e26c12382738880be94cc +R = 27a2332f3c59464f5dfe7bb1201a3936248d375bde603724c048eb8f7c0c2be3ed4b56c14b51d7d68bd2554526b36d9e +S = e1f90367b0cc530c545f95163d9ffb1208c943685d5ae221052b83ee40953397be581e5979c9855b20246e9d26d57acc +Invalid = Y + +Curve = P-384 +X = 7d40b51127cb1642dd8538d4124138a2f49c41b4d12f702c1b0cec8deba50c3712e01c2e1e693e00438af0e86025da33 +Y = e734b5939b673c45dd32baf20d234f01b7124b391d14beea231e9c604e813fc83b3a77b0cb1f2ce4873a69b0165e369d +Digest = 0b30b209147432207a72177997d28d6f1d03330f +R = abf16821b6657e0005071f78c679cbbb130bee6e7ca63526eef0f747fb721feefe6258dae1aa02064a700e963bd9dedf +S = 3f7e61c34a30cc5ff7a8be375fcc9c38a76dbc0c30a4356843421ca37a7bcf24edcd41d8235903bb522fb6e5a8033885 +Invalid = Y + +Curve = P-384 +X = a5b59d59599c105e39f61354da99c7c9135c749cf996cc2252eb83b008299cdafbcb44227d2d2c4a5ffa44823922893b +Y = 0399fb0edcbfd0b76b524f22b7b87ddbb4fa02f510661615312a4492eb3f2001e0fc0e479f77c33a88f9a7e20757373c +Digest = 44aa3083d111bbce7feb412af74a782cd320becd +R = a4c9cac2409a9bfea1ebe28fec4e19545f08cd18fdd31048f52a3f2d32b2ed859dcae4dc12fb2fecabe542c4f03191ba +S = b4d83f927ad1980d96cbb0ccc36aa640f786293b8b19e4dd97a797d192b420f630a5e42ac42d8736e7d42008f445dbc1 +Invalid = Y + +Curve = P-384 +X = 29178ce9127e1048ea70c7d435439e9ff9915387e51b7e5ca10bfdafe53565978eb3784d9a4226f443d4834f4d451685 +Y = 5cc2970589a453488649711bdf3cdac9a200519aae65b1c6bd54fed0d965755b36b74d978d674275bd71a03e8f054b0e +Digest = c679b4a0e61406c4869d721192bd314d77e1cb39 +R = 5d6f5e9a94d9c92a0890c558bc0408b3405cd04e33f663df16701e80520e4394f1c54d3c8225d36f4753a799aaf6ff90 +S = d895b1cc522ceec6a7867867b8f603245c6e4d48945dfc43af721ebae4683d40a3c21b905ca3bd4b974d36806825b2cd +Invalid = Y + +Curve = P-384 +X = 9f03569f8c6ca2c16d707f0ca36a8a8cf214a9d5c14034829d709e283cd675eb4e3090c6b973429efdf476c0782e0a7c +Y = 
e1b842536731e91596782787d57af17db85dc92fd2fb95ac65339174aee66775ce0a4721d1faeb29da968ea5eb705e59 +Digest = ae1a63f88a59c7da5d9f512d11bbd5d75dd1f583 +R = 31ccbe22a360b1786dac89394c6ef4ed6604943e50837395f96052821f6182914840096e90f2ad650917bd91d7bd4cfd +S = d97199a6b952dcaefb1defe23def92bf2ee236ad18046a2ccf8924d42ee10a62e70ffe7f3c909b11112278f160d98b7a + +Curve = P-384 +X = b85e78a935d169dd5ba8f558f964b21c07804464816f9231233184675f557463a8b00470ac0ca8278cd008f4642e7962 +Y = 8edf7be8584c5f207939d479e65173e2e69673090a8538fa93efb4432127895d92b4e4cf13b7632a830e9a33b37f75e1 +Digest = 811685f7ff2701e692f6830a33d8712d0432cd5a +R = fd2876b250a94ced71734aa7a0d32423b2c6f039c926c557e748f38e23bbdb46e17d1204832c6f76c3ea854e1da23979 +S = 76409e381799502c81194ba87540aec0b89fc4680dd683780d49f82a46a7191b40f5f06ccb02e45e704c31fcd59382b9 +Invalid = Y + +Curve = P-384 +X = 0c74aaa0527524cb6171ab741896b405a6ac4615e474cdc09c9457b18bed33c6383e1b92f2fa1306e8e5dcd1667e45fe +Y = 7b00d934dfd876f6e07dc0582b20ed650be104fa603a5a1255c62b6059d2685aa9773f1ba31254d213c815d0efc8ed93 +Digest = 328029316d73d1b8d2b8927d12332036e5671384 +R = 832c62b0f34986eda9d1ace5068a0c5318051b0d0166d3dacf137ac072cc359f109ad6e17059e700bb1958bcf4101246 +S = 6bb56f4eb550688ea66e5dd09aebe7e0b39e2716b4697ebb68f113e080f0ff26fd0fc947a34f3c5a8a2f10e07dc1405e +Invalid = Y + +Curve = P-384 +X = 4104de08b4108ee26ee239e0a5d340c1b1aa48b1b3b40717debd6ed3ff0d777923c106f857a3830ce7f3d08d0d6d7908 +Y = 00498c38393e6393edcf254804558f86e461df1f5a6557bc5144f8d2f3806413d372b6ce417d531c08a52d1e38e8b949 +Digest = a13ebaf4431c43b684d1e18e610a75fd7527200e +R = 9924a3273248db20db007309560a0e616572ac799d773529a5215786cf4a6e03cc73bea81d4810c1eee4b5e975652eee +S = 6cc8ea4c4c56da87c25946a198e86917227bcb90da7be1dcde7b6547bc45a98e8175dd54af15bb6ef955b4cb48b7bb0a +Invalid = Y + +Curve = P-384 +X = b6bc9418f3da0cce38a65f1b52bb3a9d22a0368e02f5f12fa1f1303ac67df1cffa55d049a782bf5bddb5e841b125aed6 +Y = 3b578a0560280a2958a14286e10faa7f5dec77fd8d90123aff5780efa8a636cee833fc9f10d7a164f1254a483b613746 +Digest = 7b44de2e448107197558cb071bb5bec9a5849467827d29b2c6625708 +R = 6602090aec001c16e5f6e7e3e488bed5d1702d36b258b6a8a2d8392a5ff30a6af12fbf4308d67eed6aaa8b7be8b831c5 +S = 65d0c3bb1910ba0b7cc108ae1ccaae63405ff01a8df91021e17cd46aa6f8ca8f4eaeac6d6fc26fc816a3ea537fd9576b +Invalid = Y + +Curve = P-384 +X = b4ab83a4ded7d76aa15eaecb1bafe59427d3cfc38564af9123cb707da2405184acd40a6c093ba29e321ba0f67c1e0c6a +Y = 26e2902499495f8550e798617a44ac9990c4c1cc3527dc0dd003a15aee3cbd3955151f7863de1692a94aafd3730e7665 +Digest = 8f902a34f36d7cd36748d5ddcc8fba6040be223a462842d506f185d1 +R = 61e48d5a100049578e820768ea57f30f27ffd1a1f839fabc55e8f4816c9b95d042619cd3bcc7180fd99834e344f53e7f +S = 977b81d43216f31d8bedc3ffe873047817de3441df8b80a321aa0a80931f25a15c6628f43cf8e48d5c6aeca7626b0a18 + +Curve = P-384 +X = f886f36fcf34e8df2a7e09220051b9981a3a6f693ec5999f28864e012c13896d633c9564f0118a95631cea8355b25b20 +Y = 746f9a77835325f18338dee5dc88a9b086b858ce15b4e4462a98844bb01811195f4fae0bee8f457c32823e142210dbb8 +Digest = 6a80377d3c7f0e6a50f6dc1656cef5a0d33cf7934441244f69f0062a +R = 665390653ed280b8f6bd3718d8423f26cb38d2d7faa10fc0f094295677d9dafad45fc64cfc22ded56afdd86a77cf3c33 +S = 864f0eb3a8d93c388d987cfcb60bba76098039d46bf4ff4be083961f70a29e724c25cf56685802b7b5be048107ad52e3 +Invalid = Y + +Curve = P-384 +X = 5fc835a2f5429adb719ed22f11dfcb02731da6759a8ea75c21d1af9631187626c31e191f4dcdc183df01c48e13dbbce6 +Y = 9ed2d03df1cbeaefd4478b8106e90f92e0b6e958145cb81b9648aef0b96b71d1d55918564694b1987d68cc8e7cbd7dd1 +Digest = 
807f609592e2ededa12792a7006a6db641904e86a1df3cec477dfd3c +R = 94d9dedd27f2d014ba84ea58d2e88d68f3e86ba88b93750e50255211effe88b0a0e2f62017f22965726cdc77c55bca4f +S = 14814bd09d9b7ba81b2485777cc588b5c0a4064df95c63f18a8bfd57494cd0f40c5bda9dc6c01ea72540f57a354360ef +Invalid = Y + +Curve = P-384 +X = 0b86851d7c19f0f04a16e5e2903a36d09bf1863e152d87936fb2d74cf916bcf6dedf3c066d242f7dd327df0fcb42270a +Y = b0c93480740bb635e6c25fb61630fdfcc462a1418366a51b1265656f721e18ba89ebf754c7dfdad865a252c884a6c4fc +Digest = c34e896a31fc4de7596679e12bb2416a51e58e8942eabd5cb01f0737 +R = 33fa5fe3e495076e90f4b62753d3cdc7603aa7f5b407dbf89a854b9521d15e6c381d3cf28f103035dc4291ae318c5f82 +S = 30919a2a3fae71e1afe8378aedcaa08fadfab6c6bf954031452d4fe514969ede2acf0347a2f1e81abf1bfb9d8bd55a36 +Invalid = Y + +Curve = P-384 +X = 6f8f2fc40d1db28309c8850bf94d77c01c5449b4fc556e6bf50e5ee805209c4489d8ff9bd781699eb0e42f6a962d56fe +Y = a4c7c77271dbbe7e00d1c6e4287dddc5463c6803a577a18f89a5eea01c6addc12404353abbc128cb9cf2496732312d65 +Digest = c19cabc6141b2adf67fe4bd0a3fead50473dea8cb0276de1fdc467c5 +R = 327c4642019a635d80dab82f7dc22e3102a3c1ba684c2b6de67d3d3009a17d39ae3d58ca2caec9f6f03f5ba3b406178c +S = 6b1af807cc7265cc6d3049959cd7779ae0de819036647f9510b0e9f7e4c0e3fece5fc3741b68881145a2c944dc5c54d1 + +Curve = P-384 +X = e98ba8016a976dcc3c50127d2af792969835b1096b1644b37c004d1786f4fb1026233f33ad56cd9444ba0a332c92efb8 +Y = 54bbcb78ffa3c855dd24bf182376ff5d28dd7b7551e4b05a19549c9f59c83dcc12a43092d63c5967fc0256612475b7d4 +Digest = d8d9319d3f705d03dfc992e8e7596586200fb1574f2a918350deb268 +R = 3b76a0c0ece2348085f3554fc92b9e5b0fe84801ab2adf1d239d7c81c9697b62285e8e5667774559d1bbc6e86f2ade64 +S = 91d929e42f8223ccc74d4cb09ee7eb619d3a348886c21091ec55d36164ad3cc04e1da6edd88ad89710a908ca4bc00333 +Invalid = Y + +Curve = P-384 +X = b8d7a836715635a8b095d3712817aa9e6ffdd98d24be2db751bb0c1fad42b082542500ea255cde17525ec159afca7002 +Y = 1a526c876d4771157b4f66e3056485c95066d4bd1e73e991ce6d5d3642807efe80015c52ef3cf8c86e57ab9a510ec86a +Digest = fe23e8ab9dc934144247930a48babb0d8ba57703c2bef60e0e9a1e2b +R = 9e36f47ec1b7ffdc6e3472f3cbec913494c0bbaa0c073f597e01845b5a3107c0e23a4575de4f2b582e1c2fe3067ec048 +S = b013cf51008a89b379a2a6b519b8d229ff0374401eae21a8da350fe35756b94168e7fafbd81f0f681f21c056941a82eb +Invalid = Y + +Curve = P-384 +X = 4ffdecf5d5f7c1164297a93742c8a685bb425b97fdfe85f630dab2064ab29e52a0df34629c2531048c288216723fc9bf +Y = 84fcff3e7e478a6932ace6f6b0ab70e61d8a5137b76886c59e721d938e0e252e2f7e57c2ab7dab90493446ad85c3fe4c +Digest = 28d44c363bfb2e36bc59bb68c56e8b5d2587f149839fd3b8c05d9eb3 +R = 7d909d9aacf064c32d070c3149ace8b8f5d83b2006e8460b84c4bce664fc20e91c61ac8b415965b6155eddbe9238fe3d +S = 19d909e358e71985179dab9113941ecad21e4f3608cb3a32dd065868af1657df8e06aa86855ac7ad757a7f8fb568a953 +Invalid = Y + +Curve = P-384 +X = e805e0733fc156bd582faaf794e58d4630ce73fc383cdc964dd337728f774e4989a697d79665a3282ee6e0ee343d6c7b +Y = 43821b7b9a6ce1ddf0c59ada552668a0cfc85a87a610b5c36b7a691947116b49a4099340306e53494fc6b496cb8d12b0 +Digest = fd1bb27d666e3d40f5bd19d8c026a3614404b9edc11e582eb80b044c +R = 3d4fa4ec95b55feac607fddc618d6f4eed71da65dc49d732e64460e5c80c57dc4421c64bacf3ef1e22995fd19c2a3cf5 +S = b11898ba475f2b28402d038afc15f171b99aab93437b35a2f8a3b89f42fdb7f93a0469d9da7652882000dd5bb1e8b9a8 +Invalid = Y + +Curve = P-384 +X = e15c7ef9791b9392c3e97389f2597ee161545c267e584b94262870ef25fda348f72349f396c27ac884fa8d776387fdd8 +Y = 107b4a7da8be564a14f9c45e4df5cc9b62f0671b3f2c0573c33fa37f985fefd1ae3ff2640947ebb12dffda72757db6af +Digest = 
3d9611421379fc93226fff23f5fe472a33f6bdc759d5705f7e9a2be3 +R = 9d715fd1a3668283fa83c407242e8d2a4f3fa1bf41919ca4101114bd0e0ac1b16c4379edb11de5210eee8618d42e9ed1 +S = 2dc37f453c8cfe01ea80c56d1865daf0f28847b12970132a1853c3ed80da6693e0da47a2476207947f29da34d68d604a +Invalid = Y + +Curve = P-384 +X = efcb97dd73106b0a2be4f665c496352f6938da9d0fa97690dc0e8d018b06dce2ba8d19b93ddfe889d549a33e64497c31 +Y = 66a0cb7e64f40470b6d09b9e12f217b59e9e6615af52fbdc4ddcb379e77809361eca2093a3e24c7103e971567018400f +Digest = 5598b06acf834ffbb2e50784fe2bc493fa51967f7ffadf1ece63f9b2 +R = 4ea5d4faf8ee52540db2f4c6283cea5302a3540a56e14c8a7533441c248465be99e10f23bba85be9634efaba7a8b172e +S = 4c98a2142ecaba7db44c78658efffc1175f810a147306ba2e6498553526adb1507d7a99a372e0f84c8dbd160ef7fd5bf + +Curve = P-384 +X = 4e916a3cf2561580b49ecc52321db7103292fd2fcce8dd4d6f86be6035808e0df51c3c4ac1894f0b08ef6ebf953e0d18 +Y = 4e6f28895d024b4c71220b27052ddd4bf6115a260825acade48c043b3e06d2b6b8e4ebdf465980f3b013cb575d475bbb +Digest = 1668ee6ae19c2d6f23b9184b6895ede8f55549b23095d53ef89487f6 +R = efce00544ebe0d98ba6015c07e3e9d09af808d49a0820c22ef572a3ef9c8a684b377bef1f8b3bbddb734b9b0bd0b1cd4 +S = e80d0e183b3f00098308e20e5b4ae393a07f1d1a8defda9a9d10f19b3e5236e42f593b1dc57f6718dd8d4583f0175ff7 +Invalid = Y + +Curve = P-384 +X = 3c6528c82d9d5e8dddf41a211c70f78604d81f49853bdc746270f1340a2a645dca3bc7844c3680268fa5973cd1758313 +Y = 4b9e697f1caf83d3224486bb0a8cd6a7c56e47c91043d8cba3aba51b6e504441d37abcc9b7b2d49b9126463703e514a0 +Digest = 1b39217bcc5dc841b32ddf00245623c581f19cac8a4ecd03eb2c07f0 +R = 848814c01c3d18534f39bcd53a8736db16f0f77a015a0e578cbb2f831739723e83b29cb6d4eee7822c76ff056d0f467d +S = 05beb19f766bd1d4ec5e65786042258298a2dc617e3f13d8e2f0f4b50d934565f3162c737fa791a81897397f29305943 +Invalid = Y + +Curve = P-384 +X = 80c3f6488dcd76f33cdb75e30f8452ab9a3bd6110f14e25179b0aefe4c19c60a07b4af10844b130b0b75a7024e341298 +Y = 6c85a17ad4bbefb33910250e05ac02a17c892c3380712d06dd070843dff0d040e219dae78679b774cd5eff0adb67189a +Digest = 23cd0066d1d88702c5d4461deff89aa5662b517806a04c4da30e0d82 +R = bc444deb0c7dd9f96f20a7ffd3ddb35a1189316655531860c39b5f87f09992106985e5562e083ee9f538c8e2d5363c52 +S = 91adde5d47eae80a98661f4347fd6e4778478c3d4aff3cff8aa92e2345a8e03cd4ab64adfd38e461bb98b496516439e7 +Invalid = Y + +Curve = P-384 +X = 97c3f446803a61a7014f61cb7f8b3f36486c7ea96d90ee1767f5c7e1d896dd5114255abb36c74be218c1f0a4e7ebba3d +Y = 553ed1fed72c62851e042f0171454f120029adba4ee26855ab881d9470355f1947aa1d2e806a7ff2583660fedbd037a0 +Digest = 647eb206a8477440b4bd048d00f37dca8635b15c2a8e79e2a9d74fb9a5553211 +R = 7b06d6c2b63f1cc3bfdaa897d07dc15a83bdf35d979f70c34578332b3f4920422bb24867c51bde10831324df424e04ec +S = 4bef715161f400dc98d4b63bd13ff4ad4a6c981ead44bfc662fe9bca4b56cd790698e4deddf9a4bd69327f26bfe801e6 +Invalid = Y + +Curve = P-384 +X = 08bd5c6cdc1f8c611df96485090e20e9188df6abb766bff3c1ba341ed209ad5dfd78b628ec60998ddfdd0dd029352fbd +Y = d9831d75dec760e9f405d1aa5e23aac506dc019fb64d44bd57f6c570d017e6609f8fdbb2dc7b28ca9e00e37cd32a3b73 +Digest = 9a4985f744dd6f2774cb6f20ad6b6969e212abf4ac035b72ad3f8b1955ae1862 +R = 8b372c86ed1eec2163d6f7152e53696b4a10958948d863eb622873b471702ac5b2e75ff852149a499e61510905f98e4c +S = b2ed728e8b30787a28f2a6d3740872e47348686c7cb426411379411310241d25f08a026b853789b1157f1fc1a7f6ff49 +Invalid = Y + +Curve = P-384 +X = 10a784abb3c549444a62c28df1c926b8aabb20c8d9aa4b1f7ca830258857cbe9718dbc9845fa9cbb78587a373baee80d +Y = a1ad0c10b5ab6780cad49c8cd3eebd27de8f1b382ddd7a604458cef8e76ca632a7e44e1c63141a742426cec598029e2e +Digest = 
f5b47101b4ff9baf64aca830b6afbc4f9620035d88a1d84a12cefa6f7f99faf2 +R = d9e52be2a3f7f566899cf6daaa38116d092473066f3a1bf91f3df44d81bca1deb438d9d25ce1632599c1d3576a30f128 +S = 0cad30bce4b3d7f40b3eef762a21bb1a3bad77439838b13024b7b2c70316875a99e80723a74a9e7a404715ca06a5d673 +Invalid = Y + +Curve = P-384 +X = 8760182393132d69011edfa127e36f92eeac8272641c27f52f3337ef8af7451e6d14f4e4590c7eb9fafb76e8c92865cf +Y = ebc2b123ed871ca570ead40ae8f6f32335393c569b21b38f626d09c064a3c8668e9fb10a4667e0f0c68bf25ca98fd6dc +Digest = 979131ca1d07e0b4ac6f27b20a978e0a230159eec4906db5dbd22b10ec71af87 +R = 1db957e5c2d294035d7f476a0cbc28a4aac2614d8212de5017076cd836bf04ffe237dce8fec91f2fb5ef82449ff1c65d +S = 3e3b9058d0a9c5b417f9c6f86557b9d50e7a902694a7012a1be6bb70708497e4d39fc1f6d6bc60dfa52d23cab173385f +Invalid = Y + +Curve = P-384 +X = 2b1f98d2acdda8347b9a68c75174408eae7de3d6b9c08c26e73ce9ed2ac147b8d90cd82e30ab43909d63f6b457de2071 +Y = 33f5e6f5f5793201991e014cce0045d04adc352298e32f45f4e374450111c8456b5c2efaec43d157949b5c191b2bc934 +Digest = a1daaf888d93a2a7e52bcd2a66cca3ff2e02916616d1919adefdd7257490e5b8 +R = 23d046402cbce807d232bcf0dc96d53c72992e0ba1ffce0d79050c0f4c5ad9bfbbdc1c96c730d67ff3aa3edaa3845da9 +S = 2cd46a4fe5d120b3af3a6d9ea63cc78f4079e8b5520a8fa96828334a4f182ff4d5e3d79470019e4eb8afc4f598b6becb +Invalid = Y + +Curve = P-384 +X = 86ac12dd0a7fe5b81fdae86b12435d316ef9392a3f50b307ab65d9c6079dd0d2d819dc09e22861459c2ed99fbab66fae +Y = ac8444077aaed6d6ccacbe67a4caacee0b5a094a3575ca12ea4b4774c030fe1c870c9249023f5dc4d9ad6e333668cc38 +Digest = e3bcded61cbb0bf6ec20d59f91e8e73e532f15b082b89c984c1b51fb0d1db8a9 +R = 798065f1d1cbd3a1897794f4a025ed47565df773843f4fa74c85fe4d30e3a394783ec5723b530fc5f57906f946ce15e8 +S = b57166044c57c7d9582066805b5885abc06e0bfc02433850c2b74973205ca357a2da94a65172086f5a1580baa697400b + +Curve = P-384 +X = 9e7553eab8cc7e2e7396128f42ab260c6dbb5457cbff2070ea7c0db21def1537939e3f02699e5dd460eca3798d08bd6d +Y = 892c0c8e47dddf858e89099a8fc1026e8b8333532b22f561f7647f63f9c79dbf5e8dd18fbfe6ff34902233119c5d5aa3 +Digest = 0f2a9b447ea5cfcfb9e67d661d7f0752befd3b4e3454fe40b9ae1eca47806025 +R = 2452da6a48c3749b66e576e0f1f768d51728be17aea149164c4e1654c5ce27f625a4610c4a2eeddb3a0626d3abc6c37c +S = 499504fb58c9db24a7ff5f7921e1312f8aa583c08a308e080f5ef1acf5cdae7927c4101573db069ab0b6de7f4f1cab38 +Invalid = Y + +Curve = P-384 +X = 0cf4dc51e71185a29c0c6fa3c075d9da5bd7ede085053344dce5dbbe8329e8ac9045f7246c9d0efed393b8e113c71429 +Y = fdb7917b73974b355cf9f3bef6a0a460c2d39fdf1fe32a7744be0a54ddd1cfa8d03914cff4b5ca536b40707ff2629aa4 +Digest = 331aefe2369b9c5ee6dd9f850259b3b8512f5934434e61573f97fe2c1cd2b147 +R = 3812c2dc2881d7ef7f621993b161672329b261ff100bbd19fb5826c9face09aec2017b6843d69336b813b673c5402527 +S = 5dc102fab9d6325131c556ec00309c2959d1031a63fbc1e2d5d04996d3234ed33875c0ab98e5878e9bc72742519ed398 +Invalid = Y + +Curve = P-384 +X = 6c590434988155236b43147389c6dbfdd27dcd3387e9b4c2587ece670753a542a13a736579887791cf53d31e5ce99994 +Y = 35a20194ff3f1b55f7ffb2758ddd4b98dd0d9e0cc213e10ed25e8e0430fe861066c1d4423c67f0c93f7ebd87fd3c561e +Digest = 153475076a003545d3ca3d4a772866f12cc85f6e69f8c486a91a80fd709206b1 +R = 89ff866889245e797926509e563b1746920b78c9370a6cdae52663730d131e558e327d1f5fef8faf9e6c802fa29504ed +S = 8dd68e2de2f788e598b3e5a60c18d81849a0cc14b3b0e3c931910639f3125e5d6045f00330b1fa989252a80f95419b04 +Invalid = Y + +Curve = P-384 +X = 499cbdf18ec4e69b88051543c7da80845fa2de8be2b9d9045fee7f104a8b5b7d04e69142de9955c5ab18c5a34ebff075 +Y = 
a29cb8d28836b201a389922b6f8f93870f09c80a00242d00d32656a43ac1440fc55bcb123551a73290f603c3469be9ed +Digest = 5f00b3b48c1ee8287abe6f3fbc3438b91f4268f318ae2aa1e7810369d6716020 +R = 25d4d243da6fd9b439a9242c3656fade7acb7a306e8cf23ea89e3ff4f9330be19c61aaa42d7b426d12c8e0f96b80dae5 +S = e7a99cf4b269bb4a6210d185e9654602523b5cfa1cddc94b1db92018aa557ecb6adda44c816975f5ec1756b6df3c44fd +Invalid = Y + +Curve = P-384 +X = 9a74ea00203c571bd91ae873ce0ed517f8f0a929c1854d68abd3b83a5051c0b686bb37d12958a54940cfa2de23902da7 +Y = 6f20ccf8fa360a9ec03d7bb79ff17ad885f714757ef62995f824908561dc0c3dffc49d873627936a2fff018b82879ced +Digest = 45c3a1b29a18780234f12f5e4b64e7af9de2acf0029ce55b706cc79a7e4df994 +R = acc1fcac98c593fb0a0765fce35a601c2e9570d63ea1e612fff8bc99ac2d4d877750bb44cfb1014e52e00b9235e350af +S = 7f53de3afa4146b1447e829ebac8f5645e948cc99e871c07280cc631613cfdaf52ccaeccbe93588a3fd12170a7ec79fa + +Curve = P-384 +X = e22f221809fb7a054ac799a70b3d24744eb7c5096c8671770399527c88ccf9ddaea0257a0ae9430d927ff5d9f109c533 +Y = af4101d60df9b306ae92da7592f4faf3df422a3e33f1c2ed2973b2b900eefc346b4cf024de650abf537cecd12ac77618 +Digest = ef1057d83a6e6481be7caf2c12c15f085ff971f02f0db8544352558e2b9fd61c +R = c39a8e79f0560b9f26504469a470c7b2230c0d25de07c206e87dfbde9aff0a5d85322f56dfb50d4c1fc67c67d615dad7 +S = 2ad94dd13a39cf4f4cb24c2c81d4c1181652363addd856dc9ba7455458e40ed047cd113129bc87f43949d5a98a0d5205 +Invalid = Y + +Curve = P-384 +X = fa8ebc3682d90ac7356f0b75b9e3376e76518676e0bedd176cfa7fa57fea4b3a399dbb2bf735ec90b9c1705cf9fa6f57 +Y = 18c3fbca0150ec10696b3851f31fb3ba62c0b6be509d249e0d4b374c7a08e49338e0922e2a8a9319999e6569ab8d292e +Digest = 0c7152ec620fe9b783625196b41192dd5d49df184ad26965c970ac5e28bb1c4b +R = fb58ab09b8a7ef7a6ec05b854eae11af9b713f7c7540e25115f609846e636ad4f88dcf4dd61e311273df23ccda474f03 +S = 485be4c21b7c3a9c6b39ffc9f0c39f4050f76d2a6b3fae203d016318c541c1b4ad6cfc0d0950636ff6883895dd49e4e9 + +Curve = P-384 +X = e5f331536a2940cd67234bedf813c12e15aefa9a1a68429f8754bf2769a47c9c2efb5c42135e7b01a110d7302e097eac +Y = 63b2398612c863febd482184e834d3acb51408c49aacbbd35d8719746f37cb13e013c9505ce034cd815aacd10d2f7a0d +Digest = d925955406f6b6dd4df05270a2539a5924830dfbcbf6a5a34f21354db246244b +R = 96c35f22d036785a392dc6abf9b3cfb0ad37b5c59caefcc0b5212e94e86739a2674020ff79258094d90d7d59f09d47a1 +S = 373cbc865384734c56952f7a35a1fdecd88e8b343ee3aa073d30f5f25b73506f1e5f5857f668b0080dec6edeb5e1be96 +Invalid = Y + +Curve = P-384 +X = c53ad865beb1e2b92764065f1a6bb465ee94aacabe43426a93c277d02e00fe36be1c859ba08a031fc518a0d007668979 +Y = 6728d42bae9bc097151748ffa0982964bdd16076fa0e7cc15837c1f773b08d02c3dbc57339091ccc34105b84781150b4 +Digest = 6d5fa5b492406a1e93df6bb6364d7b17a24ef43807a1159acc77486dd7b49b60 +R = d4f0dd94fc3b657dbd234767949207624082ff946de9ce0aeb0d9993b8c7d7935760e1bf9d8b233bc7d6cd34928f5218 +S = 0941df05062aa8849610f4b37d184db77ed1bc19ad2bb42f9a12c123017592bf4086bf424b3caad9a404b260a0f69efb +Invalid = Y + +Curve = P-384 +X = 1f94eb6f439a3806f8054dd79124847d138d14d4f52bac93b042f2ee3cdb7dc9e09925c2a5fee70d4ce08c61e3b19160 +Y = 1c4fd111f6e33303069421deb31e873126be35eeb436fe2034856a3ed1e897f26c846ee3233cd16240989a7990c19d8c +Digest = 8cf5e81c6858b8395421d8c913f1ac887e282b5818eab525fb79feb9bc64bca7eb98f94b9e48b705e6c28311bb0ca672 +R = 3c15c3cedf2a6fbff2f906e661f5932f2542f0ce68e2a8182e5ed3858f33bd3c5666f17ac39e52cb004b80a0d4ba73cd +S = 9de879083cbb0a97973c94f1963d84f581e4c6541b7d000f9850deb25154b23a37dd72267bdd72665cc7027f88164fab +Invalid = Y + +Curve = P-384 +X = 
cb908b1fd516a57b8ee1e14383579b33cb154fece20c5035e2b3765195d1951d75bd78fb23e00fef37d7d064fd9af144 +Y = cd99c46b5857401ddcff2cf7cf822121faf1cbad9a011bed8c551f6f59b2c360f79bfbe32adbcaa09583bdfdf7c374bb +Digest = 965b83f5d34f7443eb88e78fcc23479156c9cb0080dd68334dac0ad33ba8c774100e440063db28b40b51ac37705d4d70 +R = 33f64fb65cd6a8918523f23aea0bbcf56bba1daca7aff817c8791dc92428d605ac629de2e847d43cee55ba9e4a0e83ba +S = 4428bb478a43ac73ecd6de51ddf7c28ff3c2441625a081714337dd44fea8011bae71959a10947b6ea33f77e128d3c6ae + +Curve = P-384 +X = 9b3c48d924194146eca4172b6d7d618423682686f43e1dbc54ed909053d075ca53b68ae12f0f16a1633d5d9cb17011ec +Y = 695039f837b68e59330ee95d11d5315a8fb5602a7b60c15142dbba6e93b5e4aba8ae4469eac39fa6436323eccc60dcb6 +Digest = c68382d0641ffad850c41365a8ec68e3d55acba376d1bb941e7dcdf7b71f37b8288b023b942373a40be1dfaaf4aea633 +R = 202da4e4e9632bcb6bf0f6dafb7e348528d0b469d77e46b9f939e2fa946a608dd1f166bcbcde96cfad551701da69f6c2 +S = db595b49983882c48df8a396884cd98893a469c4d590e56c6a59b6150d9a0acdf142cf92151052644702ed857a5b7981 +Invalid = Y + +Curve = P-384 +X = 5140108b93b52d9ad572d6129ed6564766f8df3755e49fa53eba41a5a0d6c1d24a483c90070583a66e3cfa52b6fb1f31 +Y = ff52498446a40c61e60c97554256472625633eda0c1a8b4061481fecfbe9c4503e99dfc69e86c9e85c8cc53dca6b8dc4 +Digest = 4b945020c329a61221060e924ec682eceb842c09537fe26265ad084753b89f7650cee4e8df30b38126984d80fd25d246 +R = b2726b2ba9da02de35e9953fc283d1e78700860d4c33dce8db04dd41499d904866c1b8debb377f6c0dfcb0704252174f +S = 0775b027068d7ad55121a278a819f52099ace750d5e996eaec9dee7be72758736cf769650148fbd5c411beb9b88f979e +Invalid = Y + +Curve = P-384 +X = 31f4fc2fac3a163a5796f5e414af6f8107ab5e4a98c755d81efa9d5a83c10128c16c863190112fc29d3d5f3057a2edf1 +Y = fe208743f3e96c3a34b5fff78c9716c074a1ce3dc01c3f0e471ddfae91cd88e7dda38dd0e5e1f91b00b8539da3cc10bc +Digest = 2d6affdf541609f649dbe9fd5829059bf42021fcfefee42d8c9cd5c127015c06b4c3c13ef56d08767788955887752e44 +R = 706911812ec9e7370234efd57b2855975eab81e9c2fe783aa8e442dc6e7d681dab2dc0dfc6765f87ab67001108e3facf +S = 42c89efa22d853d32f619c9fe13e9852889ac98a9fed5d4fa47fed238e1cbe70d7970af9f7bdf84e51176af4885f2490 +Invalid = Y + +Curve = P-384 +X = 1f7911dcfe63a6f270cf75b8584d9b1b4a00afc1fa43543c945945b8a821ebeb37fbc705a000f9cc7c35f7d27027b7bb +Y = f11835ec80c4ac06d99247e73bf72522109ac255e6109262de4dfbf9619244f74fb6c9ee57694537d7e79c248db34dc4 +Digest = f4b0a912331e7fc59a7071e5f47c9dafa6dc09b32c5c3d05301b3833bbe0b9168e2b63f12248849572a322b2f5423b8d +R = 3587c9c6885adf3be1086825f9a41ccd2edfa0bd95e7fc4dba5a9710f41d539132de7772f14c18e318f8992b66d2a86c +S = 73a844d729599d4e3e3c1b63e9c4bf5a73d1f69e0160857fe63a56c381c051f5c37ea6b4cc4caacb6ff26ef9699efe30 +Invalid = Y + +Curve = P-384 +X = 2039661db813d494a9ecb2c4e0cdd7b54068aae8a5d0597009f67f4f36f32c8ee939abe03716e94970bba69f595fead6 +Y = e2d5236e7e357744514e66a3fb111073336de929598eb79fb4368c5bf80814e7584a3b94118faac9321df37452a846fc +Digest = cae50a424395e38bde9ba31fa5ea0c107ccceaff06663719162aac2c3e15f2b2cfd376f90d371326e1d29e0392a756ee +R = 164b8ac2b34c4c499b9d6727e130b5ef37c296bd22c306d1396c6aa54ca661f729aa6353b55d7cf1793b80b5a485115f +S = 4e7187f8f735b7272f2c0985315b5602bb9b1a09f32233aa10570c82d1ccedef6e725800336511e47f88ddbbbdc08f54 +Invalid = Y + +Curve = P-384 +X = 46dcf8ee848c6459fa66d1cae91ccd471401a5782cb2d3b9b9264189f0e9ddf7197b05c694931bde3306240cf9d24b7e +Y = 79d9508f82c5ead05c3f9392f3b1458f6d6c02f44420b9021d656e59402e2645bf3ba1a6b244ddb12edbb69516d5873b +Digest = 
039fe89dfc54e7f2162545af700a8c49a1216b08854643656b07d74e7032516fd0c9368c5e5ce54655e4d08baa29b6f0 +R = 5ffba3b5bd7c3a89ec40b47884b0b3464e8abb78608c6d61e1e62c2ca98d44fcdf61825d69dffee8408d0849d0623bac +S = 0d2597b5fc3842ffce1957172253a8c9c0e4dbe770ce54f70f139e0545dc34ec639d609e14175bdb2b812ccfda00c9d4 +Invalid = Y + +Curve = P-384 +X = 097cea75f685cf4d54324ad2124ce3f77b1e490bbaa1ffacde40dd988f7591e1c5d158e6f232500d958762831914af7f +Y = 716d8bc056daf69ca2edd21b89a6ae9923cfcae87bfda5f9a6e514dd4b9d28d164fcc613ca2afb9660adfece59f09b66 +Digest = 02afb35f1df33b3d83df3391ca4184121ca52f520dd12ffc891aee77eab6503f232a5b1231bd997239751f46c4133edb +R = 1c5d4561d2a3af8835839b543098c101c715c545eb7d00300c5cb05bb08dac29e732ffdc31c50915e691999ad505104c +S = c3442f2fb1498fd47c2f959edff37a19783e3ccee80dc6955ca64db087fd188e67358e7b9223535bbb858d21ba6a978c +Invalid = Y + +Curve = P-384 +X = d2e2b3d262bb1105d914c32c007ea23d15a98197f0ed90b46a17f3d403e406a76c8f752be1a8cd01a94fd45157f6511a +Y = e585fba180017b9983b4c853ad3a5dd52e079c5f0ef792d1a0213b6085e390b073de1a4b01749ceab27806e5604980fe +Digest = e66b11b84f87c38526438e5e3c5b4521248c358eaab80e40526906a05fb29d14d4e5686681f03bc3f0025d45dfb83b5f +R = 49c001c47bbcee10c81c0cdfdb84c86e5b388510801e9c9dc7f81bf667e43f74b6a6769c4ac0a38863dc4f21c558f286 +S = 1fb4ff67340cc44f212404ba60f39a2cb8dcd3f354c81b7219289d32e849d4915e9d2f91969ba71e3dd4414f1e8f18f7 +Invalid = Y + +Curve = P-384 +X = cd887c65c01a1f0880bf58611bf360a8435573bc6704bfb249f1192793f6d3283637cd50f3911e5134b0d6130a1db60e +Y = f2b3cbf4fe475fd15a7897561e5c898f10caa6d9d73fef10d4345917b527ce30caeaef138e21ac6d0a49ef2fef14bee6 +Digest = f6325d6bcaaaf1aba1197a290b33974f2fe8af200d5d726e78705904e9894ec31988e35dc76b9976834b7cd1c4c67146 +R = addfa475b998f391144156c418561d323bdfd0c4f416a2f71a946712c349bb79ba1334c3de5b86c2567b8657fe4ca1f1 +S = 1c314b1339f73545ff457323470695e0474c4b6860b35d703784fbf66e9c665de6ca3acb60283df61413e0740906f19e +Invalid = Y + +Curve = P-384 +X = a370cdbef95d1df5bf68ec487122514a107db87df3f8852068fd4694abcadb9b14302c72491a76a64442fc07bd99f02c +Y = d397c25dc1a5781573d039f2520cf329bf65120fdbe964b6b80101160e533d5570e62125b9f3276c49244b8d0f3e44ec +Digest = 709d1bf45b5817f5a67b859651eb47133ebed2622fda09ab66d3467b5e95da50ecc2c74d8f4d289feebec29729a4bfa3 +R = c6c7bb516cc3f37a304328d136b2f44bb89d3dac78f1f5bcd36b412a8b4d879f6cdb75175292c696b58bfa9c91fe6391 +S = 6b711425e1b14f7224cd4b96717a84d65a60ec9951a30152ea1dd3b6ea66a0088d1fd3e9a1ef069804b7d969148c37a0 + +Curve = P-384 +X = d1cf635ca04f09b58879d29012f2025479a002bda590020e6a238bccc764478131cac7e6980c67027d92ece947fea5a6 +Y = 21f7675c2be60c0a5b7d6df2bcc89b56212a2849ec0210c59316200c59864fd86b9a19e1641d206fd8b29af7768b61d3 +Digest = 5d54d236db6ab4691b3d50dc81471c5d388e5735ebdd435e9742a5a8a0ad0e841bab57326c8535a680ada57d2b3a70fa +R = 6101d26e76690634b7294b6b162dcc1a5e6233813ba09edf8567fb57a8f707e024abe0eb3ce948675cd518bb3bfd4383 +S = 4e2a30f71c8f18b74184837f981a90485cd5943c7a184aba9ac787d179f170114a96ddbb8720860a213cc289ae340f1f +Invalid = Y + +Curve = P-384 +X = d15ca4b2d944d5539658a19be8ef85874f0c363b870f1cd1f2dc9cb68b2a43a10d37064697c84543e60982ab62bb32c8 +Y = 062fb7dfc379fc6465302ac5d8d11d3b957b594c9ef445cfe856765dd59e6f10f11809e115ac64969baa23543f2e5661 +Digest = 67cf9e6f9e9558a379ef7361771323a4f3925f2c7a5d94d9156bf2d9d45f9f8fc4d47322da622fbce92fc764a2ccc327 +R = e2cf123ce15ca4edad5f087778d483d9536e4a37d2d55599541c06f878e60354aa31df250b2fc4ed252b80219552c958 +S = 
696707a7e3f9a4b918e7c994e7332103d8e816bbe6d0d1cf72877318e087ed0e230b0d1269902f369acb432b9e97a389 + +Curve = P-384 +X = c83d30de9c4e18167cb41c990781b34b9fceb52793b4627e696796c5803515dbc4d142977d914bc04c153261cc5b537f +Y = 42318e5c15d65c3f545189781619267d899250d80acc611fe7ed0943a0f5bfc9d4328ff7ccf675ae0aac069ccb4b4d6e +Digest = e8d6b550271b486e79f6975cff753d49519ed9393b207af7039b4c070cbc2fe7d49dd1bb87f7021e442fadd80ce8a5b0 +R = b567c37f7c84107ef72639e52065486c2e5bf4125b861d37ea3b44fc0b75bcd96dcea3e4dbb9e8f4f45923240b2b9e44 +S = d06266e0f27cfe4be1c6210734a8fa689a6cd1d63240cb19127961365e35890a5f1b464dcb4305f3e8295c6f842ef344 +Invalid = Y + +Curve = P-384 +X = d4e93c4bafb54c06814011309e9f3d8e68b76a5452e364ef05ccc3b44b271e576c9028106b1584f09271c886d467f41d +Y = db730ccfdeb6644362f4fb510d5254bfe6f23e891e936132f90f1913e93baa8b1f8c0613a0f0c61a760ce659f22babc6 +Digest = d5c82ff11f555ce21c3f20a9ecfa6047cb6895e32fa0fb379f49085a59f61b7c8fa05058ef144cf47db5738fa40f4890cb59695998a2358162bbbf6d7f53517b +R = 8d0fd14a59c24b0c2a34b438e162f1f536fe09a698cacfe0760d026d1593265d02f2668d2a5e49ac0b21e93807aa9c18 +S = 3162ffd2adc9dd5ec1bb1d97d2b0c27b8ae234235ffb374878d0b76382002ea505e885c178d56a2d7809bd1d83117ef1 +Invalid = Y + +Curve = P-384 +X = c665feccf51e6bca31593087df60f65b9fe14a12022814615deb892eedb99d86069a82aa91319310b66588185282dad6 +Y = 1e6e25bb8ae7714415b94f89def0f75dcb81d4af6b78d61f277b74b990c11aff51bd12fc88d691c99f2afde7fbd13e51 +Digest = ea056beb112fa9aad69c8dfe51ea947b772bf1c11287edcede43a98089d21492ed581edcb6d1823e2873aabba213b84291db3bffa6eac3ae43a92fc2da276a24 +R = 0e18c4063137468fe864fdc405ad4e120176eb91b4538b28ce43a22ae1a310cc22a2f7a2b3a0f3d15e0f82038b4a4301 +S = 5a1620e42041ce4357daf824befbb2ed65596bcd8214e88726149b26b1f416b9472a8877413f1c3705fc2edf4731943b + +Curve = P-384 +X = a6bbf85e8068151482ce855ccf0ed22988fcf4b162c4b811cb7243b849299e3390a083147fbd68683203ba33588b13ae +Y = 5c837ec9f2eda225c83ab2d5f10b1aa5bfb56387deebf27ecda779f6254a17968260247c75dd813ea0e1926887d46f86 +Digest = 81b1303e10f25d37877b09f9d82dbd894e40264992d86cc74656ebeef505b46fdf9dec312a7f0a26e3f56a7195d5b01d198c378fff9d049e00cbad9586da20c9 +R = 9c11879e59659848274fc1ef5a6a181af813d23708b09a24dc06c089b93b918828dd938a75a34d5a681b0af362dc19a0 +S = 9c362231962ba7579c4a874e87bdc60dc15cb2e0677149c8ea31162963e05a6614616f67a5269616071cf095be7ff44b +Invalid = Y + +Curve = P-384 +X = 9c1eb5cdb1a873e4c275b7ded8712b9058ee0d9ded06c96a2a8d7c652b82e894e2f918dd8e18138e5c34821744b97952 +Y = dd474c93619f02b5d4fe30ea7805c1a13fb80008a81bb5f3eeb95cd11f38841b8e34d64f2c6cc2d6cc2587365eed6b6e +Digest = c0f9ae90fe8aaf54962e7d47a832e4ca6e60355e4066cd2b08bff78650d4e4a5d1eb1de296f9f0ef92887e09f82e0db4411aa9c3c6b109159bd39feed40419a3 +R = f17b2f2fa3b5c8e9c62a633e5d417139ddf3dafba75b464fa156c99b3948a0aca532c7fd3e14a266eb17e7fa80881da2 +S = 01c246866983fa74d6dff38b1ea091f8afd218b5a42467761b147c19a3bb20cd24be8ed1f95f1e61863a709d2d0148e2 +Invalid = Y + +Curve = P-384 +X = 20622a293edc96d83fee77cf1ee8077c61d6f8ed0073d53cfb5ee9c68e764c553fa4fc35fe42dade3a7307179d6fc9c2 +Y = 710fa24383f78cc4568fe0f4ecbbe6b11f0dce5434f4483712a6d2befae975a2efb554907aa46356f29bf7c6c2707c65 +Digest = 5cb8ed471a4001e280a0927faf25183c857b9b2de21c8566e8a1bf04ee085c36db7fab9d8f627898b3bb23c10225305938b56a732659f2cab3fa857d80dfde19 +R = 45a6cf5cef06256139caa709292d1e0f963d176add188572e9c7be29af21a95853a98e23aef0a0850e58d44d60b6d780 +S = df8d71cd5ab22fc718070078103483e5258734872ab935435f21ea199018e49a69c064a63801beb0759fde6e2c4a85b8 +Invalid = Y + +Curve = P-384 +X = 
83a4fecc0bf0a353b0acf6f54094b822f2b12564e172b296f3461cafa7315d7d31d0089b1b4c18ad3c86bd18f539774a +Y = e4fd57c5b2937e6fba1e7d72fc3f02352bd79c13611931935f4dfd073b9379f862f2277585137e996e212b5b6533dcba +Digest = cd7c623c3c3b52f46be0ebb2b353ff97db3cd7dfc1a059a57668fc50101aeeb37b8aee9ddda8ab611546999a120cc9acb0e2c3df48dee66d5c31a46a7be94bc7 +R = fb02804010a570d702ebfbcf3d6cc9d55ddac2bd4b4de56d325e9790571b1737f91d3fa1d4caeec6eea806195aed3187 +S = 1fd20fe383e907e77639c05594642798619b2742090919bedeefb672c5700881baf0df19b9529d64bc7bb02683226103 + +Curve = P-384 +X = 208a8c5a6b59458160c5b680116c8b23799c54a7ee8954a4869425a717739facfe4fe24540505cdc133fde8c74bfca78 +Y = 22aa7aba797bde1e8389c3c3f8d8d9aa2a914f4d2d7aaf7187ebed9b2761975718ef97660ba0b8a71dee17f2b982e2cf +Digest = 007b907b90fa60835d45d2f0201a4486d9782fea4f0a235d97d4968336c5369c6c2e82bded56288a10fd6741f4c15d1633bc92e0196308d9f0490fc2077d3b6c +R = 0b4e835ed83151d2bde96e201c54544ba5f301aca853957d3c538c9858fcce796b60fc50f5600a48dcdf13e5bc029827 +S = 0270adf02d31d5428d523e13d7d315c1929a1d89bbd0f61eec0b1186abe1c307cbba6b1067a68bc3947e6196d49719a0 +Invalid = Y + +Curve = P-384 +X = 80ae47e99107d6148b1088c6694df5c1273ff336b66e45b68a7c65fed735129dadcaf2b900e9f8ec50eff70a5ba89ea3 +Y = 47450efb5669bfacd7cbff1f801aafa0812ff88a6ae7b5a1f85e88e19129ed995f509fbf8dec15ce42bbbbd33814c09e +Digest = 1cacc8f609080e7b8339529f944850a700977ef9107f40956fb35645e15fdd54ef01755f07a2582d0bf2ca0cb84ee8ab154fe0914dfc9ad7ad5fe54b857d0f4e +R = bae6fba7b1485ecdca48219ead3c39295fa9c196b1f0941445b1ac768e33962f68d37f1f1749eaad7200064aa202fb41 +S = b411a38d02deb42d1015a7837b033c89d2f37d92c70fa8bb1f592223f7750520b950f30277abfb4155a3ab194b3beca0 +Invalid = Y + +Curve = P-384 +X = 45cb6dcca8d2e80ac04536a22f9d68ea2313245550108ddcd32799d154c0a55492e49463e826275bd9bf0d5e380205c1 +Y = 6fd124f5a6c745751ccfb3ba4dd9144ea8fd41a4d9a4b34820434da66aa7385e73ffe71e6c11ed1beb6c7af22ce00edf +Digest = dd7947a5b9a1c988dd7dff537e15335aacafd3e602adc8373765013f338334dd58aed4fb7144de0007c3410d79f5e78bcd4cf0dd63cc33ed3dd564882e299c7b +R = 2c782c4263eeee63657fbf20fa287a1a81fcd14b1d3bae333928ba4fc31abb20edebc130714380608e38ea74309eca9d +S = 716113d95bc9dba532bfb470112b0d43d9cd6560ad15e0de2e514994801ff339bcf19ad4ee2b8af573f57c038fbd70f0 + +Curve = P-384 +X = 36c1459d9e9f7b6c1598778c784cbf94661a2b11370c02ee092f6ea0ca20acf81f1ed5048a28a1466a91689df26bc291 +Y = d1367418c7b216bd32c6dafc8b2be99d02cab68df990758b2ddd543b7eb6ff6e285b649ffe588b1811b549cfb5f0289b +Digest = 242ff2713c03e3d5277652f8e7fb1e5a1f0422b6652e1bdd696e46c03cdd3aaac329b1d88e7aa345ff7224ce6dc6df05c7e9d7dc2665282c817d15a15b8288fd +R = 40c338adeb504193444bdb95336177362031aaadc5b7e151e42030df9dd8687f3cb8fe2292fd4f9206989c089d966dae +S = be4b2ba251094c24de006c89af2b5c77e6937f36d7bb703b4f8edcfe65d45f4b2fd2486222163ae0ed9e215c0a96f488 +Invalid = Y + +Curve = P-384 +X = b5eb6670bb0b0d3aef10e533d3660756b7372a2a081d9d920130034f48202cd43b9e2d1e5893d0cfb322db65ab839716 +Y = e28444770396041b489b302786a57fca9a98f19685cb4b455d219151e64645ad30dd3149ec96f3bc90879834b65e58aa +Digest = 8d2e653807e87962883956ee3705b2167c50370c3af12eb8f6c26f0f15ede56dddc7d0c9642a1c1c2444b06571fa1a4d47e7884acc7ea3884daaa50940f782e2 +R = 0887a13df940907864b425ec0d8f91ac719abcc62b276fa08c5122b38831c8930abd3c8454e98182bb588fc72843717a +S = a380284eacaa36a34e35f04fbf6e28ffb59176f41ea52d9c9bc1362eccd8e0d699c2e08111d93e9dc2785637b1f4f09e +Invalid = Y + +Curve = P-384 +X = 700e8f65e052e918a63a96fa57f4eda849f9f9faca3302d6ead66ebf85838f8145a6d6718a681b7bef73170d7254958f +Y = 
9e9e10357658913007803859165926cd1e5e92c3a644d834098cb1cbfab466349bf4238a5154cf50ed77c77a78263e81 +Digest = cf885fa7a96db595f825a0ccc56b70b60e0e1c30d0a15af636d1f4957328aecb7eeb734d5874bd72ddaf15c357ca36bd42abf387f7b771ea6160e2e23a08652e +R = 59be870e0fd684b000cce95c616d9f34674354e9d20db15d204b8a6285ff55258e4eeb49da1573ef1030cd6b2626dcfb +S = c0bbbf71d87479d82575458be9f4d686921db7ea458d620271f51ec3f4d1afe3bf25ef9c0c400eb7b92cd7058fb17346 +Invalid = Y + +Curve = P-384 +X = a9de6f029445fffcf16349b44095cc83b11e3d0d9f08654b158014803b1cc31b8dfe00b1a8167c6f704d69cdd62c6512 +Y = 27336a503a669ba1d1f3619f51dc8aa2a44b2075c682a36f071be486e7dafba9adfac2ce74be0442b7251e99304ffc05 +Digest = b7e73f38767f253790e7fff019b4e0e61562aeb97b2b749afec2a61c87ab0e15916d4286c0a13989912f6bafdf3efc6f64ddc3b944f9041266e5abd4480c1606 +R = f93a4d2eb94d087f28572847e0099ae2ee944efacdad392ec268c9c1e632e6ccd670c36584e58aba52a4c2b07127d55a +S = 941ee89cea6e7ed20213a95482fae134707ddf4d292ab1952ed5464f1f1138669dedbfc9998b696eaf469be5fb240c80 +Invalid = Y + +Curve = P-384 +X = e63500d6d13069c01fafc4518f1d429661c5bb6ad1ff0383037ca6a469a5c20c453dce03bf6e4164f7e26f849016b3d0 +Y = 83b7b731c2531c3ac61b194cf3db6dc02ccdfa16d9eb49f97bc4ec3fe6c8bd865ea27f1538531ad07dc44fc5107af8e6 +Digest = afc0ed355377d0ab0c4f79d420dcf67ad4920c013d5c8afde2287525da4596672927540418a61568b21ae7799d7659f16b85f611bd6e8d2066a55903da0c48b9 +R = eb78733e73fd64a6a1f23eba5311af23d26816fb8847671e01fdbd8dc7d5fce1a0823b080ee99e8d75edb3f100e16077 +S = bcaedfe599f98b51542c0f94ae1010611c6767ac3abb2bd887399d62fd0f1b3a0e97deb24c95a76de44521bf24c8645e +Invalid = Y + +Curve = P-384 +X = 3ebd869be687f82d844416e6816d698d82e1e22a1f451d50b6c146134deb07f05204c0b04e7dc07ebdcfd916531dc7c3 +Y = 6e4d7bde063edb7254a82b9d9249d2a2b9ad8988c37a84ac9f7c09daed42b1fd28f7cca1ea8b4f91a66e878224800bdc +Digest = 56a61339a35750e95770f28846930e3f594e8d759e07423718734a82b2a80430b0fb3378e40bdcf5c12be135be9a9bec32916b4988a763091a6da7b44631414e +R = 575f87a8a7980555a198cfdec279cbb2f89551b5271d242397c29f6bc4bf413dc30312a7e626ef7fc77a9124a79bf9be +S = f0b7d759246ad36ba8240c537b1eeb5d148c38d324f48028c598eaef6e49d79ff3f6cfe3a32fbbf6f3ed3aaaec31d572 +Invalid = Y + +# The following tests use digests equal to the order and 2^n - 1, where n is +# the number of bits in the order. This is to test the truncated digest not +# being fully reduced. 
+ +Curve = P-256 +X = e57231383637c82c1ac801724cf7e03e67198f467a9beb60ac13cb582d13afa8 +Y = 8f190e090155fcf63810b858bc88e259dc49afef8bdef6fd06d93dddb1991aed +Digest = ffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2fc632551 +R = 05cc6037bb021f4910ea2e489fab2bae6bb6a2769a97f42ba5736994102b7f10 +S = 5db54832ceabf8bccdb8be99b1a49cecff8feee045cb697dec43118e2695b1da + +Curve = P-256 +X = 6e0e2897b9a554ee287cdaf43bfbe25ca8404373971575a0e4b61c61aff5a2fe +Y = 23ea7823a411eb1b39f81bbde24c2cd6ac68be2c7eec3a0671c8676131b8905c +Digest = ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff +R = 16831feeceab2fab1c575e073e944d73ce7e6f3e9b06312088f06159c530ff50 +S = 870cb824692638538b1569c6093fcb693c054e8e3b9a919e3bb26798910f66e9 + +Curve = P-384 +X = f4a961c19f9cc4ebe4f43081110955f3cede085a08c1415d726e80b2eb774028c5fc96f092ba3ea7d1288dd57fe1db08 +Y = 981398eed0895e09b3b582a0616f3024e51cca7b1ecc347dbf0d24a5f6a222b0c31912f8f5e427d4dde5c6c45212bb10 +Digest = ffffffffffffffffffffffffffffffffffffffffffffffffc7634d81f4372ddf581a0db248b0a77aecec196accc52973 +R = 0b77eaff05bbd922dd80525d2ab301cc119318f5a920a12c71c4b5ff5bb77d25a538983df9bdd5984b0d159daf21f1a2 +S = 73af85ad03a34b6b3993082bf719018d25d1555717b2d2f2535d0601af06a71ad020eff8232d065ab9d7fc4cd0c0ee42 + +Curve = P-384 +X = 54dd8d7cbf2ccdf1a42f5bbc615a372803b094f6040e3c7b651a61bc6912432c836cf2410ab7d67f543236751d81066f +Y = 2219d6257b1c80bf327c96786f2b5d0b5a9b9bf7eee9c853bf66a3bf09520494cb1f7823e4c566d79a617b7e201ead96 +Digest = ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff +R = 9d923e199d98272e44b8fba382bf3c19660ecb4a9aae3513ff6802a73fef510c15c202807c3f9334b0bce7d6c6a80839 +S = 520784e6290d04d9b61993ee5ebc6fa8ff527fb0777c43cdefc7586701e60edb399005a5648ff852de80208232849fbd + +# The following tests are intended to stress the final comparison in ECDSA. +# ECDSA verification computes some curve point (x, y), picking the fully-reduced +# representive of x mod p, and checking that x mod n is r. (n is the order of +# the group and p defines the underlying prime field.) +# +# This makes the computation sensitive to values near n and p, and which of n or +# p is larger. Additionally, there is an optimization that performs the +# comparison mod p rather than n and compensates for the difference. +# +# These tests were generated by picking a target value of r and x, adjusting +# both until x corresponded to a point on the curve, and then computing the +# public key by solving for P in ECDSA's (x, y) = u1*G + u2*P. The digest is the +# hash of "hello, world" with the suitably-sized SHA-2 hash, so the test vectors +# are suitable for both message- and digest-based APIs. +# +# "x" in the comments refer to the x-coordinate of the computed point, not that +# of the public key. + +# r = 5, x = 5 is valid. +Curve = P-256 +X = 264d796a0dab9b376d34eea6fe297dde1c7b73e53944bc96c8f1e8a6850bb6c9 +Y = cf5308020eed460c649ddae61d4ef8bb79958113f106befaf4f18876d12a5e64 +Digest = 09ca7e4eaa6e8ae9c7d261167129184883644d07dfba7cbfbc4c8a2e08360d5b +R = 0000000000000000000000000000000000000000000000000000000000000005 +S = ffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2fc63254e + +# r = 5 + n, x = 5 is invalid. r must already be reduced. 
+Curve = P-256 +X = 264d796a0dab9b376d34eea6fe297dde1c7b73e53944bc96c8f1e8a6850bb6c9 +Y = cf5308020eed460c649ddae61d4ef8bb79958113f106befaf4f18876d12a5e64 +Digest = 09ca7e4eaa6e8ae9c7d261167129184883644d07dfba7cbfbc4c8a2e08360d5b +R = ffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2fc632556 +S = ffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2fc63254e +Invalid = Y + +# r = n-2, x = n-2 is the largest x without a reduction. +Curve = P-256 +X = 50a50c01132bf79e42b31fb278f7317b29515e9e1c973a41266b69048826fb8e +Y = aac53e7df37b5eb25ce4ddb705fc7135c6b1e00a7f56e30744f62f258afa5537 +Digest = 09ca7e4eaa6e8ae9c7d261167129184883644d07dfba7cbfbc4c8a2e08360d5b +R = ffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2fc63254f +S = ffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2fc63254e + +# r = n-3, x = n-2 is incorrect. +Curve = P-256 +X = 50a50c01132bf79e42b31fb278f7317b29515e9e1c973a41266b69048826fb8e +Y = aac53e7df37b5eb25ce4ddb705fc7135c6b1e00a7f56e30744f62f258afa5537 +Digest = 09ca7e4eaa6e8ae9c7d261167129184883644d07dfba7cbfbc4c8a2e08360d5b +R = ffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2fc63254e +S = ffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2fc63254e +Invalid = Y + +# r = 3, x = n+3 is the smallest x with a reduction. +Curve = P-256 +X = ce24c99032d52ac6ead23c0ae3ec68ef41e51a281fd457808c83136d7dcce90e +Y = 8f7a154b551e9f39c59279357aa491b2a62bdebc2bb78613883fc72936c057e0 +Digest = 09ca7e4eaa6e8ae9c7d261167129184883644d07dfba7cbfbc4c8a2e08360d5b +R = 0000000000000000000000000000000000000000000000000000000000000003 +S = ffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2fc63254e + +# r = 4, x = n+3 is incorrect. +Curve = P-256 +X = ce24c99032d52ac6ead23c0ae3ec68ef41e51a281fd457808c83136d7dcce90e +Y = 8f7a154b551e9f39c59279357aa491b2a62bdebc2bb78613883fc72936c057e0 +Digest = 09ca7e4eaa6e8ae9c7d261167129184883644d07dfba7cbfbc4c8a2e08360d5b +R = 0000000000000000000000000000000000000000000000000000000000000004 +S = ffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2fc63254e +Invalid = Y + +# r = p-3-n, x = p-3 is the largest valid x. +Curve = P-256 +X = 768a0d300a595005a520130e50927d403395c8e1e40be997b48fc048410f7cdb +Y = 16f217d8e1c02bd887e5de388a17783b182e61b5d534152dc2c4be8d75fdd706 +Digest = 09ca7e4eaa6e8ae9c7d261167129184883644d07dfba7cbfbc4c8a2e08360d5b +R = 000000000000000000000000000000004319055358e8617b0c46353d039cdaab +S = ffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2fc63254e + +# r = p-n+5, x = 5 is incorrect. r is too large to compare r+n with x. +Curve = P-256 +X = 0ec505bc19b14a43e05678cccf07a443d3e871a2e19b68a4da91859a0650f324 +Y = 77300e4f64e9982d94dff5d294428bb37cc9be66117cae9c389d2d495f68b987 +Digest = 09ca7e4eaa6e8ae9c7d261167129184883644d07dfba7cbfbc4c8a2e08360d5b +R = 000000000000000000000000000000004319055358e8617b0c46353d039cdab3 +S = ffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2fc63254e +Invalid = Y + +# r = 2, x = 2 is valid. +Curve = P-384 +X = 016d2db67561bc126ad6c344d6eeb2713a9e2892c649af0f015c6b7617f160c8a3b3a88add669d7155025073c5ac5b4f +Y = 43bf2ed0088af08645c80aa0a24a567a94ba2d794e9689d3ad4b185bc5d2dd008333e2dd2ebb5069a9b32251a3cac71e +Digest = 1fcdb6059ce05172a26bbe2a3ccc88ed5a8cd5fc53edfd9053304d429296a6da23b1cd9e5c9ed3bb34f00418a70cdb7e +R = 000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000002 +S = ffffffffffffffffffffffffffffffffffffffffffffffffc7634d81f4372ddf581a0db248b0a77aecec196accc52970 + +# r = 2 + n, x = 2 is invalid. 
r must already be reduced. +Curve = P-384 +X = 016d2db67561bc126ad6c344d6eeb2713a9e2892c649af0f015c6b7617f160c8a3b3a88add669d7155025073c5ac5b4f +Y = 43bf2ed0088af08645c80aa0a24a567a94ba2d794e9689d3ad4b185bc5d2dd008333e2dd2ebb5069a9b32251a3cac71e +Digest = 1fcdb6059ce05172a26bbe2a3ccc88ed5a8cd5fc53edfd9053304d429296a6da23b1cd9e5c9ed3bb34f00418a70cdb7e +R = ffffffffffffffffffffffffffffffffffffffffffffffffc7634d81f4372ddf581a0db248b0a77aecec196accc52975 +S = ffffffffffffffffffffffffffffffffffffffffffffffffc7634d81f4372ddf581a0db248b0a77aecec196accc52970 +Invalid = Y + +# r = n-1, x = n-1 is the largest x without a reduction. +Curve = P-384 +X = b5b375264c09acf145ca91d12ab10a096092a41ec43f4d718e129ea1c12b2dea62c7785efc52f46f009fb1dba133e811 +Y = bc0b2af172b4b3068d032a798080e76f4d56f72069519e3c19a43682a41794e52cb3ca139348d6bbc923e6a4f7945cb1 +Digest = 1fcdb6059ce05172a26bbe2a3ccc88ed5a8cd5fc53edfd9053304d429296a6da23b1cd9e5c9ed3bb34f00418a70cdb7e +R = ffffffffffffffffffffffffffffffffffffffffffffffffc7634d81f4372ddf581a0db248b0a77aecec196accc52972 +S = ffffffffffffffffffffffffffffffffffffffffffffffffc7634d81f4372ddf581a0db248b0a77aecec196accc52970 + +# r = n-2, x = n-1 is incorrect. +Curve = P-384 +X = b5b375264c09acf145ca91d12ab10a096092a41ec43f4d718e129ea1c12b2dea62c7785efc52f46f009fb1dba133e811 +Y = bc0b2af172b4b3068d032a798080e76f4d56f72069519e3c19a43682a41794e52cb3ca139348d6bbc923e6a4f7945cb1 +Digest = 1fcdb6059ce05172a26bbe2a3ccc88ed5a8cd5fc53edfd9053304d429296a6da23b1cd9e5c9ed3bb34f00418a70cdb7e +R = ffffffffffffffffffffffffffffffffffffffffffffffffc7634d81f4372ddf581a0db248b0a77aecec196accc52971 +S = ffffffffffffffffffffffffffffffffffffffffffffffffc7634d81f4372ddf581a0db248b0a77aecec196accc52970 +Invalid = Y + +# r = 2, x = n+2 is the smallest x with a reduction. +Curve = P-384 +X = 01b54a697305092bac2939fb906d7471b411c4eba8654169166a5da3810e1fc96795df921f7abbf519be4a027435176c +Y = a19012a3518773d508106d4153adee43c3c384fa62ce36a4addea08f593ec9c76b09a6b9c69d29bd7d47eb48e167dd2f +Digest = 1fcdb6059ce05172a26bbe2a3ccc88ed5a8cd5fc53edfd9053304d429296a6da23b1cd9e5c9ed3bb34f00418a70cdb7e +R = 000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000002 +S = ffffffffffffffffffffffffffffffffffffffffffffffffc7634d81f4372ddf581a0db248b0a77aecec196accc52970 + +# r = 3, x = n+2 is incorrect. +Curve = P-384 +X = 01b54a697305092bac2939fb906d7471b411c4eba8654169166a5da3810e1fc96795df921f7abbf519be4a027435176c +Y = a19012a3518773d508106d4153adee43c3c384fa62ce36a4addea08f593ec9c76b09a6b9c69d29bd7d47eb48e167dd2f +Digest = 1fcdb6059ce05172a26bbe2a3ccc88ed5a8cd5fc53edfd9053304d429296a6da23b1cd9e5c9ed3bb34f00418a70cdb7e +R = 000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000003 +S = ffffffffffffffffffffffffffffffffffffffffffffffffc7634d81f4372ddf581a0db248b0a77aecec196accc52970 +Invalid = Y + +# r = p-1-n, x = p-1 is the largest valid x. +Curve = P-384 +X = c4fd8e68006b83f7b7b20b731ae405813aa05f6e57374589b36ae1cecd1d49cae1418c22f398188bcf4ef02e89fe7394 +Y = dd1164b3707f59e05129fa228b8448031db159985f035d93470dc42b3ab4129f0760c46cf201d42e73a7e33ba7402ea6 +Digest = 1fcdb6059ce05172a26bbe2a3ccc88ed5a8cd5fc53edfd9053304d429296a6da23b1cd9e5c9ed3bb34f00418a70cdb7e +R = 000000000000000000000000000000000000000000000000389cb27e0bc8d21fa7e5f24cb74f58851313e696333ad68b +S = ffffffffffffffffffffffffffffffffffffffffffffffffc7634d81f4372ddf581a0db248b0a77aecec196accc52970 + +# r = p-n+2, x = 2 is incorrect. r is too large to compare r+n with x. 
+Curve = P-384 +X = 4e5e4f1a6e97059a6cf2f4e8129e5c7c64cb84f9994a41ff5bf30b29c1bf5ba6898627c91a23c73e05cd1a43c8f908c0 +Y = 06a0aed7f1e63a728f87dbd5360a67571a076ab0b4cde81b10d499959814ddb3a8c7854b0bbfa87cc272f90bca2a2254 +Digest = 1fcdb6059ce05172a26bbe2a3ccc88ed5a8cd5fc53edfd9053304d429296a6da23b1cd9e5c9ed3bb34f00418a70cdb7e +R = 000000000000000000000000000000000000000000000000389cb27e0bc8d21fa7e5f24cb74f58851313e696333ad68e +S = ffffffffffffffffffffffffffffffffffffffffffffffffc7634d81f4372ddf581a0db248b0a77aecec196accc52970 +Invalid = Y diff --git a/ring-0.17.14/crypto/fipsmodule/sha/asm/sha256-armv4.pl b/ring-0.17.14/crypto/fipsmodule/sha/asm/sha256-armv4.pl new file mode 100644 index 0000000000..5bcdb3f26e --- /dev/null +++ b/ring-0.17.14/crypto/fipsmodule/sha/asm/sha256-armv4.pl @@ -0,0 +1,625 @@ +#! /usr/bin/env perl +# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. +# ==================================================================== + +# SHA256 block procedure for ARMv4. May 2007. + +# Performance is ~2x better than gcc 3.4 generated code and in "abso- +# lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per +# byte [on single-issue Xscale PXA250 core]. + +# July 2010. +# +# Rescheduling for dual-issue pipeline resulted in 22% improvement on +# Cortex A8 core and ~20 cycles per processed byte. + +# February 2011. +# +# Profiler-assisted and platform-specific optimization resulted in 16% +# improvement on Cortex A8 core and ~15.4 cycles per processed byte. + +# September 2013. +# +# Add NEON implementation. On Cortex A8 it was measured to process one +# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon +# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only +# code (meaning that latter performs sub-optimally, nothing was done +# about it). + +# May 2014. +# +# Add ARMv8 code path performing at 2.0 cpb on Apple A7. 
+ +$flavour = shift; +if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; } +else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} } + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; + *STDOUT=*OUT; +} else { + open OUT,">$output"; + *STDOUT=*OUT; +} + +$ctx="r0"; $t0="r0"; +$inp="r1"; $t4="r1"; +$len="r2"; $t1="r2"; +$T1="r3"; $t3="r3"; +$A="r4"; +$B="r5"; +$C="r6"; +$D="r7"; +$E="r8"; +$F="r9"; +$G="r10"; +$H="r11"; +@V=($A,$B,$C,$D,$E,$F,$G,$H); +$t2="r12"; +$Ktbl="r14"; + +@Sigma0=( 2,13,22); +@Sigma1=( 6,11,25); +@sigma0=( 7,18, 3); +@sigma1=(17,19,10); + +sub BODY_00_15 { +my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; + +$code.=<<___ if ($i<16); +#if __ARM_ARCH>=7 + @ ldr $t1,[$inp],#4 @ $i +# if $i==15 + str $inp,[sp,#17*4] @ make room for $t4 +# endif + eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]` + add $a,$a,$t2 @ h+=Maj(a,b,c) from the past + eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e) +# ifndef __ARMEB__ + rev $t1,$t1 +# endif +#else + @ ldrb $t1,[$inp,#3] @ $i + add $a,$a,$t2 @ h+=Maj(a,b,c) from the past + ldrb $t2,[$inp,#2] + ldrb $t0,[$inp,#1] + orr $t1,$t1,$t2,lsl#8 + ldrb $t2,[$inp],#4 + orr $t1,$t1,$t0,lsl#16 +# if $i==15 + str $inp,[sp,#17*4] @ make room for $t4 +# endif + eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]` + orr $t1,$t1,$t2,lsl#24 + eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e) +#endif +___ +$code.=<<___; + ldr $t2,[$Ktbl],#4 @ *K256++ + add $h,$h,$t1 @ h+=X[i] + str $t1,[sp,#`$i%16`*4] + eor $t1,$f,$g + add $h,$h,$t0,ror#$Sigma1[0] @ h+=Sigma1(e) + and $t1,$t1,$e + add $h,$h,$t2 @ h+=K256[i] + eor $t1,$t1,$g @ Ch(e,f,g) + eor $t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]` + add $h,$h,$t1 @ h+=Ch(e,f,g) +#if $i==31 + and $t2,$t2,#0xff + cmp $t2,#0xf2 @ done? +#endif +#if $i<15 +# if __ARM_ARCH>=7 + ldr $t1,[$inp],#4 @ prefetch +# else + ldrb $t1,[$inp,#3] +# endif + eor $t2,$a,$b @ a^b, b^c in next round +#else + ldr $t1,[sp,#`($i+2)%16`*4] @ from future BODY_16_xx + eor $t2,$a,$b @ a^b, b^c in next round + ldr $t4,[sp,#`($i+15)%16`*4] @ from future BODY_16_xx +#endif + eor $t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]` @ Sigma0(a) + and $t3,$t3,$t2 @ (b^c)&=(a^b) + add $d,$d,$h @ d+=h + eor $t3,$t3,$b @ Maj(a,b,c) + add $h,$h,$t0,ror#$Sigma0[0] @ h+=Sigma0(a) + @ add $h,$h,$t3 @ h+=Maj(a,b,c) +___ + ($t2,$t3)=($t3,$t2); +} + +sub BODY_16_XX { +my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; + +$code.=<<___; + @ ldr $t1,[sp,#`($i+1)%16`*4] @ $i + @ ldr $t4,[sp,#`($i+14)%16`*4] + mov $t0,$t1,ror#$sigma0[0] + add $a,$a,$t2 @ h+=Maj(a,b,c) from the past + mov $t2,$t4,ror#$sigma1[0] + eor $t0,$t0,$t1,ror#$sigma0[1] + eor $t2,$t2,$t4,ror#$sigma1[1] + eor $t0,$t0,$t1,lsr#$sigma0[2] @ sigma0(X[i+1]) + ldr $t1,[sp,#`($i+0)%16`*4] + eor $t2,$t2,$t4,lsr#$sigma1[2] @ sigma1(X[i+14]) + ldr $t4,[sp,#`($i+9)%16`*4] + + add $t2,$t2,$t0 + eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]` @ from BODY_00_15 + add $t1,$t1,$t2 + eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e) + add $t1,$t1,$t4 @ X[i] +___ + &BODY_00_15(@_); +} + +$code=<<___; +#ifdef __KERNEL__ +# define __ARM_ARCH __LINUX_ARM_ARCH__ +# define __ARM_MAX_ARCH__ 7 +#endif + +@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both +@ ARMv7 and ARMv8 processors. It does have ARMv8-only code, but those +@ instructions are manually-encoded. 
(See unsha256.) +.arch armv7-a + +.text +#if defined(__thumb2__) +.syntax unified +.thumb +#else +.code 32 +#endif + +.type K256,%object +.align 5 +K256: +.word 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.word 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.word 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.word 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.word 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.word 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.word 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.word 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.word 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.word 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.word 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.word 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.word 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.word 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.word 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 +.size K256,.-K256 +.word 0 @ terminator +.align 5 + +.global sha256_block_data_order_nohw +.type sha256_block_data_order_nohw,%function +sha256_block_data_order_nohw: + add $len,$inp,$len,lsl#6 @ len to point at the end of inp + stmdb sp!,{$ctx,$inp,$len,r4-r11,lr} + ldmia $ctx,{$A,$B,$C,$D,$E,$F,$G,$H} + adr $Ktbl,K256 + sub sp,sp,#16*4 @ alloca(X[16]) +.Loop: +# if __ARM_ARCH>=7 + ldr $t1,[$inp],#4 +# else + ldrb $t1,[$inp,#3] +# endif + eor $t3,$B,$C @ magic + eor $t2,$t2,$t2 +___ +for($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); } +$code.=".Lrounds_16_xx:\n"; +for (;$i<32;$i++) { &BODY_16_XX($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; +#if __ARM_ARCH>=7 + ite eq @ Thumb2 thing, sanity check in ARM +#endif + ldreq $t3,[sp,#16*4] @ pull ctx + bne .Lrounds_16_xx + + add $A,$A,$t2 @ h+=Maj(a,b,c) from the past + ldr $t0,[$t3,#0] + ldr $t1,[$t3,#4] + ldr $t2,[$t3,#8] + add $A,$A,$t0 + ldr $t0,[$t3,#12] + add $B,$B,$t1 + ldr $t1,[$t3,#16] + add $C,$C,$t2 + ldr $t2,[$t3,#20] + add $D,$D,$t0 + ldr $t0,[$t3,#24] + add $E,$E,$t1 + ldr $t1,[$t3,#28] + add $F,$F,$t2 + ldr $inp,[sp,#17*4] @ pull inp + ldr $t2,[sp,#18*4] @ pull inp+len + add $G,$G,$t0 + add $H,$H,$t1 + stmia $t3,{$A,$B,$C,$D,$E,$F,$G,$H} + cmp $inp,$t2 + sub $Ktbl,$Ktbl,#256 @ rewind Ktbl + bne .Loop + + add sp,sp,#`16+3`*4 @ destroy frame +#if __ARM_ARCH>=5 + ldmia sp!,{r4-r11,pc} +#else + ldmia sp!,{r4-r11,lr} + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet + bx lr @ interoperable with Thumb ISA:-) +#endif +.size sha256_block_data_order_nohw,.-sha256_block_data_order_nohw +___ +###################################################################### +# NEON stuff +# +{{{ +my @X=map("q$_",(0..3)); +my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25"); +my $Xfer=$t4; +my $j=0; + +sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; } +sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; } + +sub AUTOLOAD() # thunk [simplified] x86-style perlasm +{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./; + my $arg = pop; + $arg = "#$arg" if ($arg*1 eq $arg); + $code .= "\t$opcode\t".join(',',@_,$arg)."\n"; +} + +sub Xupdate() +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); + my ($a,$b,$c,$d,$e,$f,$g,$h); + + &vext_8 ($T0,@X[0],@X[1],4); # X[1..4] + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &vext_8 ($T1,@X[2],@X[3],4); # X[9..12] + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &vshr_u32 ($T2,$T0,$sigma0[0]); + eval(shift(@insns)); + eval(shift(@insns)); + 
&vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += X[9..12] + eval(shift(@insns)); + eval(shift(@insns)); + &vshr_u32 ($T1,$T0,$sigma0[2]); + eval(shift(@insns)); + eval(shift(@insns)); + &vsli_32 ($T2,$T0,32-$sigma0[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &vshr_u32 ($T3,$T0,$sigma0[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &veor ($T1,$T1,$T2); + eval(shift(@insns)); + eval(shift(@insns)); + &vsli_32 ($T3,$T0,32-$sigma0[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &veor ($T1,$T1,$T3); # sigma0(X[1..4]) + eval(shift(@insns)); + eval(shift(@insns)); + &vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &vshr_u32 ($T5,&Dhi(@X[3]),$sigma1[2]); + eval(shift(@insns)); + eval(shift(@insns)); + &vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += sigma0(X[1..4]) + eval(shift(@insns)); + eval(shift(@insns)); + &veor ($T5,$T5,$T4); + eval(shift(@insns)); + eval(shift(@insns)); + &vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &veor ($T5,$T5,$T4); # sigma1(X[14..15]) + eval(shift(@insns)); + eval(shift(@insns)); + &vadd_i32 (&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15]) + eval(shift(@insns)); + eval(shift(@insns)); + &vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &vshr_u32 ($T5,&Dlo(@X[0]),$sigma1[2]); + eval(shift(@insns)); + eval(shift(@insns)); + &veor ($T5,$T5,$T4); + eval(shift(@insns)); + eval(shift(@insns)); + &vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &vld1_32 ("{$T0}","[$Ktbl,:128]!"); + eval(shift(@insns)); + eval(shift(@insns)); + &vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &veor ($T5,$T5,$T4); # sigma1(X[16..17]) + eval(shift(@insns)); + eval(shift(@insns)); + &vadd_i32 (&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17]) + eval(shift(@insns)); + eval(shift(@insns)); + &vadd_i32 ($T0,$T0,@X[0]); + while($#insns>=2) { eval(shift(@insns)); } + &vst1_32 ("{$T0}","[$Xfer,:128]!"); + eval(shift(@insns)); + eval(shift(@insns)); + + push(@X,shift(@X)); # "rotate" X[] +} + +sub Xpreload() +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); + my ($a,$b,$c,$d,$e,$f,$g,$h); + + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &vld1_32 ("{$T0}","[$Ktbl,:128]!"); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &vrev32_8 (@X[0],@X[0]); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &vadd_i32 ($T0,$T0,@X[0]); + foreach (@insns) { eval; } # remaining instructions + &vst1_32 ("{$T0}","[$Xfer,:128]!"); + + push(@X,shift(@X)); # "rotate" X[] +} + +sub body_00_15 () { + ( + '($a,$b,$c,$d,$e,$f,$g,$h)=@V;'. 
+ '&add ($h,$h,$t1)', # h+=X[i]+K[i] + '&eor ($t1,$f,$g)', + '&eor ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))', + '&add ($a,$a,$t2)', # h+=Maj(a,b,c) from the past + '&and ($t1,$t1,$e)', + '&eor ($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))', # Sigma1(e) + '&eor ($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))', + '&eor ($t1,$t1,$g)', # Ch(e,f,g) + '&add ($h,$h,$t2,"ror#$Sigma1[0]")', # h+=Sigma1(e) + '&eor ($t2,$a,$b)', # a^b, b^c in next round + '&eor ($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a) + '&add ($h,$h,$t1)', # h+=Ch(e,f,g) + '&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'. + '&ldr ($t1,"[$Ktbl]") if ($j==15);'. + '&ldr ($t1,"[sp,#64]") if ($j==31)', + '&and ($t3,$t3,$t2)', # (b^c)&=(a^b) + '&add ($d,$d,$h)', # d+=h + '&add ($h,$h,$t0,"ror#$Sigma0[0]");'. # h+=Sigma0(a) + '&eor ($t3,$t3,$b)', # Maj(a,b,c) + '$j++; unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);' + ) +} + +$code.=<<___; +#if __ARM_MAX_ARCH__>=7 +.arch armv7-a +.fpu neon + +.LK256_shortcut_neon: +@ PC is 8 bytes ahead in Arm mode and 4 bytes ahead in Thumb mode. +#if defined(__thumb2__) +.word K256-(.LK256_add_neon+4) +#else +.word K256-(.LK256_add_neon+8) +#endif + +.global sha256_block_data_order_neon +.type sha256_block_data_order_neon,%function +.align 5 +.skip 16 +sha256_block_data_order_neon: + stmdb sp!,{r4-r12,lr} + + sub $H,sp,#16*4+16 + + @ K256 is just at the boundary of being easily referenced by an ADR from + @ this function. In Arm mode, when building with __ARM_ARCH=6, it does + @ not fit. By moving code around, we could make it fit, but this is too + @ fragile. For simplicity, just load the offset from + @ .LK256_shortcut_neon. + @ + @ TODO(davidben): adrl would avoid a load, but clang-assembler does not + @ support it. We might be able to emulate it with a macro, but Android's + @ did not work when I tried it. + @ https://android.googlesource.com/platform/ndk/+/refs/heads/main/docs/ClangMigration.md#arm + ldr $Ktbl,.LK256_shortcut_neon +.LK256_add_neon: + add $Ktbl,pc,$Ktbl + + bic $H,$H,#15 @ align for 128-bit stores + mov $t2,sp + mov sp,$H @ alloca + add $len,$inp,$len,lsl#6 @ len to point at the end of inp + + vld1.8 {@X[0]},[$inp]! + vld1.8 {@X[1]},[$inp]! + vld1.8 {@X[2]},[$inp]! + vld1.8 {@X[3]},[$inp]! + vld1.32 {$T0},[$Ktbl,:128]! + vld1.32 {$T1},[$Ktbl,:128]! + vld1.32 {$T2},[$Ktbl,:128]! + vld1.32 {$T3},[$Ktbl,:128]! + vrev32.8 @X[0],@X[0] @ yes, even on + str $ctx,[sp,#64] + vrev32.8 @X[1],@X[1] @ big-endian + str $inp,[sp,#68] + mov $Xfer,sp + vrev32.8 @X[2],@X[2] + str $len,[sp,#72] + vrev32.8 @X[3],@X[3] + str $t2,[sp,#76] @ save original sp + vadd.i32 $T0,$T0,@X[0] + vadd.i32 $T1,$T1,@X[1] + vst1.32 {$T0},[$Xfer,:128]! + vadd.i32 $T2,$T2,@X[2] + vst1.32 {$T1},[$Xfer,:128]! + vadd.i32 $T3,$T3,@X[3] + vst1.32 {$T2},[$Xfer,:128]! + vst1.32 {$T3},[$Xfer,:128]! + + ldmia $ctx,{$A-$H} + sub $Xfer,$Xfer,#64 + ldr $t1,[sp,#0] + eor $t2,$t2,$t2 + eor $t3,$B,$C + b .L_00_48 + +.align 4 +.L_00_48: +___ + &Xupdate(\&body_00_15); + &Xupdate(\&body_00_15); + &Xupdate(\&body_00_15); + &Xupdate(\&body_00_15); +$code.=<<___; + teq $t1,#0 @ check for K256 terminator + ldr $t1,[sp,#0] + sub $Xfer,$Xfer,#64 + bne .L_00_48 + + ldr $inp,[sp,#68] + ldr $t0,[sp,#72] + sub $Ktbl,$Ktbl,#256 @ rewind $Ktbl + teq $inp,$t0 + it eq + subeq $inp,$inp,#64 @ avoid SEGV + vld1.8 {@X[0]},[$inp]! @ load next input block + vld1.8 {@X[1]},[$inp]! + vld1.8 {@X[2]},[$inp]! + vld1.8 {@X[3]},[$inp]! 
+ it ne + strne $inp,[sp,#68] + mov $Xfer,sp +___ + &Xpreload(\&body_00_15); + &Xpreload(\&body_00_15); + &Xpreload(\&body_00_15); + &Xpreload(\&body_00_15); +$code.=<<___; + ldr $t0,[$t1,#0] + add $A,$A,$t2 @ h+=Maj(a,b,c) from the past + ldr $t2,[$t1,#4] + ldr $t3,[$t1,#8] + ldr $t4,[$t1,#12] + add $A,$A,$t0 @ accumulate + ldr $t0,[$t1,#16] + add $B,$B,$t2 + ldr $t2,[$t1,#20] + add $C,$C,$t3 + ldr $t3,[$t1,#24] + add $D,$D,$t4 + ldr $t4,[$t1,#28] + add $E,$E,$t0 + str $A,[$t1],#4 + add $F,$F,$t2 + str $B,[$t1],#4 + add $G,$G,$t3 + str $C,[$t1],#4 + add $H,$H,$t4 + str $D,[$t1],#4 + stmia $t1,{$E-$H} + + ittte ne + movne $Xfer,sp + ldrne $t1,[sp,#0] + eorne $t2,$t2,$t2 + ldreq sp,[sp,#76] @ restore original sp + itt ne + eorne $t3,$B,$C + bne .L_00_48 + + ldmia sp!,{r4-r12,pc} +.size sha256_block_data_order_neon,.-sha256_block_data_order_neon +#endif +___ +}}} + +$code.=<<___; +.asciz "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by " +.align 2 +___ + +open SELF,$0; +while() { + next if (/^#!/); + last if (!s/^#/@/ and !/^$/); + print; +} +close SELF; + +foreach (split($/,$code)) { + + s/\`([^\`]*)\`/eval $1/geo; + + s/\bret\b/bx lr/go or + s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4 + + print $_,"\n"; +} + +close STDOUT or die "error closing STDOUT: $!"; # enforce flush diff --git a/ring-0.17.14/crypto/fipsmodule/sha/asm/sha512-armv4.pl b/ring-0.17.14/crypto/fipsmodule/sha/asm/sha512-armv4.pl new file mode 100644 index 0000000000..b1fa016181 --- /dev/null +++ b/ring-0.17.14/crypto/fipsmodule/sha/asm/sha512-armv4.pl @@ -0,0 +1,649 @@ +#! /usr/bin/env perl +# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. +# ==================================================================== + +# SHA512 block procedure for ARMv4. September 2007. + +# This code is ~4.5 (four and a half) times faster than code generated +# by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue +# Xscale PXA250 core]. +# +# July 2010. +# +# Rescheduling for dual-issue pipeline resulted in 6% improvement on +# Cortex A8 core and ~40 cycles per processed byte. + +# February 2011. +# +# Profiler-assisted and platform-specific optimization resulted in 7% +# improvement on Coxtex A8 core and ~38 cycles per byte. + +# March 2011. +# +# Add NEON implementation. On Cortex A8 it was measured to process +# one byte in 23.3 cycles or ~60% faster than integer-only code. + +# August 2012. +# +# Improve NEON performance by 12% on Snapdragon S4. In absolute +# terms it's 22.6 cycles per byte, which is disappointing result. +# Technical writers asserted that 3-way S4 pipeline can sustain +# multiple NEON instructions per cycle, but dual NEON issue could +# not be observed, see http://www.openssl.org/~appro/Snapdragon-S4.html +# for further details. 
On side note Cortex-A15 processes one byte in +# 16 cycles. + +# Byte order [in]dependence. ========================================= +# +# Originally caller was expected to maintain specific *dword* order in +# h[0-7], namely with most significant dword at *lower* address, which +# was reflected in below two parameters as 0 and 4. Now caller is +# expected to maintain native byte order for whole 64-bit values. +$hi="HI"; +$lo="LO"; +# ==================================================================== + +$flavour = shift; +if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; } +else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} } + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; + *STDOUT=*OUT; +} else { + open OUT,">$output"; + *STDOUT=*OUT; +} + +$ctx="r0"; # parameter block +$inp="r1"; +$len="r2"; + +$Tlo="r3"; +$Thi="r4"; +$Alo="r5"; +$Ahi="r6"; +$Elo="r7"; +$Ehi="r8"; +$t0="r9"; +$t1="r10"; +$t2="r11"; +$t3="r12"; +############ r13 is stack pointer +$Ktbl="r14"; +############ r15 is program counter + +$Aoff=8*0; +$Boff=8*1; +$Coff=8*2; +$Doff=8*3; +$Eoff=8*4; +$Foff=8*5; +$Goff=8*6; +$Hoff=8*7; +$Xoff=8*8; + +sub BODY_00_15() { +my $magic = shift; +$code.=<<___; + @ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41)) + @ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23 + @ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23 + mov $t0,$Elo,lsr#14 + str $Tlo,[sp,#$Xoff+0] + mov $t1,$Ehi,lsr#14 + str $Thi,[sp,#$Xoff+4] + eor $t0,$t0,$Ehi,lsl#18 + ldr $t2,[sp,#$Hoff+0] @ h.lo + eor $t1,$t1,$Elo,lsl#18 + ldr $t3,[sp,#$Hoff+4] @ h.hi + eor $t0,$t0,$Elo,lsr#18 + eor $t1,$t1,$Ehi,lsr#18 + eor $t0,$t0,$Ehi,lsl#14 + eor $t1,$t1,$Elo,lsl#14 + eor $t0,$t0,$Ehi,lsr#9 + eor $t1,$t1,$Elo,lsr#9 + eor $t0,$t0,$Elo,lsl#23 + eor $t1,$t1,$Ehi,lsl#23 @ Sigma1(e) + adds $Tlo,$Tlo,$t0 + ldr $t0,[sp,#$Foff+0] @ f.lo + adc $Thi,$Thi,$t1 @ T += Sigma1(e) + ldr $t1,[sp,#$Foff+4] @ f.hi + adds $Tlo,$Tlo,$t2 + ldr $t2,[sp,#$Goff+0] @ g.lo + adc $Thi,$Thi,$t3 @ T += h + ldr $t3,[sp,#$Goff+4] @ g.hi + + eor $t0,$t0,$t2 + str $Elo,[sp,#$Eoff+0] + eor $t1,$t1,$t3 + str $Ehi,[sp,#$Eoff+4] + and $t0,$t0,$Elo + str $Alo,[sp,#$Aoff+0] + and $t1,$t1,$Ehi + str $Ahi,[sp,#$Aoff+4] + eor $t0,$t0,$t2 + ldr $t2,[$Ktbl,#$lo] @ K[i].lo + eor $t1,$t1,$t3 @ Ch(e,f,g) + ldr $t3,[$Ktbl,#$hi] @ K[i].hi + + adds $Tlo,$Tlo,$t0 + ldr $Elo,[sp,#$Doff+0] @ d.lo + adc $Thi,$Thi,$t1 @ T += Ch(e,f,g) + ldr $Ehi,[sp,#$Doff+4] @ d.hi + adds $Tlo,$Tlo,$t2 + and $t0,$t2,#0xff + adc $Thi,$Thi,$t3 @ T += K[i] + adds $Elo,$Elo,$Tlo + ldr $t2,[sp,#$Boff+0] @ b.lo + adc $Ehi,$Ehi,$Thi @ d += T + teq $t0,#$magic + + ldr $t3,[sp,#$Coff+0] @ c.lo +#if __ARM_ARCH>=7 + it eq @ Thumb2 thing, sanity check in ARM +#endif + orreq $Ktbl,$Ktbl,#1 + @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39)) + @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25 + @ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25 + mov $t0,$Alo,lsr#28 + mov $t1,$Ahi,lsr#28 + eor $t0,$t0,$Ahi,lsl#4 + eor $t1,$t1,$Alo,lsl#4 + eor $t0,$t0,$Ahi,lsr#2 + eor $t1,$t1,$Alo,lsr#2 + eor $t0,$t0,$Alo,lsl#30 + eor $t1,$t1,$Ahi,lsl#30 + eor $t0,$t0,$Ahi,lsr#7 + eor $t1,$t1,$Alo,lsr#7 + eor $t0,$t0,$Alo,lsl#25 + eor $t1,$t1,$Ahi,lsl#25 @ Sigma0(a) + adds $Tlo,$Tlo,$t0 + and $t0,$Alo,$t2 + adc $Thi,$Thi,$t1 @ T += Sigma0(a) + + ldr 
$t1,[sp,#$Boff+4] @ b.hi + orr $Alo,$Alo,$t2 + ldr $t2,[sp,#$Coff+4] @ c.hi + and $Alo,$Alo,$t3 + and $t3,$Ahi,$t1 + orr $Ahi,$Ahi,$t1 + orr $Alo,$Alo,$t0 @ Maj(a,b,c).lo + and $Ahi,$Ahi,$t2 + adds $Alo,$Alo,$Tlo + orr $Ahi,$Ahi,$t3 @ Maj(a,b,c).hi + sub sp,sp,#8 + adc $Ahi,$Ahi,$Thi @ h += T + tst $Ktbl,#1 + add $Ktbl,$Ktbl,#8 +___ +} +$code=<<___; +#ifndef __KERNEL__ +# define VFP_ABI_PUSH vstmdb sp!,{d8-d15} +# define VFP_ABI_POP vldmia sp!,{d8-d15} +#else +# define __ARM_MAX_ARCH__ 7 +# define VFP_ABI_PUSH +# define VFP_ABI_POP +#endif + +@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both +@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions. +.arch armv7-a + +#ifdef __ARMEL__ +# define LO 0 +# define HI 4 +# define WORD64(hi0,lo0,hi1,lo1) .word lo0,hi0, lo1,hi1 +#else +# define HI 0 +# define LO 4 +# define WORD64(hi0,lo0,hi1,lo1) .word hi0,lo0, hi1,lo1 +#endif + +.text +#if defined(__thumb2__) +.syntax unified +.thumb +# define adrl adr +#else +.code 32 +#endif + +.type K512,%object +.align 5 +K512: +WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd) +WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc) +WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019) +WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118) +WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe) +WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2) +WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1) +WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694) +WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3) +WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65) +WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483) +WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5) +WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210) +WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4) +WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725) +WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70) +WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926) +WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df) +WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8) +WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b) +WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001) +WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30) +WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910) +WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8) +WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53) +WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8) +WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb) +WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3) +WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60) +WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec) +WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9) +WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b) +WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207) +WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178) +WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6) +WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b) +WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493) +WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c) +WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a) +WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817) +.size K512,.-K512 + +.global sha512_block_data_order_nohw +.type sha512_block_data_order_nohw,%function +sha512_block_data_order_nohw: + add $len,$inp,$len,lsl#7 @ len to point at the end of inp + stmdb sp!,{r4-r12,lr} + adr $Ktbl,K512 + sub sp,sp,#9*8 + + ldr $Elo,[$ctx,#$Eoff+$lo] + ldr $Ehi,[$ctx,#$Eoff+$hi] + ldr $t0, 
[$ctx,#$Goff+$lo] + ldr $t1, [$ctx,#$Goff+$hi] + ldr $t2, [$ctx,#$Hoff+$lo] + ldr $t3, [$ctx,#$Hoff+$hi] +.Loop: + str $t0, [sp,#$Goff+0] + str $t1, [sp,#$Goff+4] + str $t2, [sp,#$Hoff+0] + str $t3, [sp,#$Hoff+4] + ldr $Alo,[$ctx,#$Aoff+$lo] + ldr $Ahi,[$ctx,#$Aoff+$hi] + ldr $Tlo,[$ctx,#$Boff+$lo] + ldr $Thi,[$ctx,#$Boff+$hi] + ldr $t0, [$ctx,#$Coff+$lo] + ldr $t1, [$ctx,#$Coff+$hi] + ldr $t2, [$ctx,#$Doff+$lo] + ldr $t3, [$ctx,#$Doff+$hi] + str $Tlo,[sp,#$Boff+0] + str $Thi,[sp,#$Boff+4] + str $t0, [sp,#$Coff+0] + str $t1, [sp,#$Coff+4] + str $t2, [sp,#$Doff+0] + str $t3, [sp,#$Doff+4] + ldr $Tlo,[$ctx,#$Foff+$lo] + ldr $Thi,[$ctx,#$Foff+$hi] + str $Tlo,[sp,#$Foff+0] + str $Thi,[sp,#$Foff+4] + +.L00_15: +#if __ARM_ARCH<7 + ldrb $Tlo,[$inp,#7] + ldrb $t0, [$inp,#6] + ldrb $t1, [$inp,#5] + ldrb $t2, [$inp,#4] + ldrb $Thi,[$inp,#3] + ldrb $t3, [$inp,#2] + orr $Tlo,$Tlo,$t0,lsl#8 + ldrb $t0, [$inp,#1] + orr $Tlo,$Tlo,$t1,lsl#16 + ldrb $t1, [$inp],#8 + orr $Tlo,$Tlo,$t2,lsl#24 + orr $Thi,$Thi,$t3,lsl#8 + orr $Thi,$Thi,$t0,lsl#16 + orr $Thi,$Thi,$t1,lsl#24 +#else + ldr $Tlo,[$inp,#4] + ldr $Thi,[$inp],#8 +#ifdef __ARMEL__ + rev $Tlo,$Tlo + rev $Thi,$Thi +#endif +#endif +___ + &BODY_00_15(0x94); +$code.=<<___; + tst $Ktbl,#1 + beq .L00_15 + ldr $t0,[sp,#`$Xoff+8*(16-1)`+0] + ldr $t1,[sp,#`$Xoff+8*(16-1)`+4] + bic $Ktbl,$Ktbl,#1 +.L16_79: + @ sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7)) + @ LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25 + @ HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7 + mov $Tlo,$t0,lsr#1 + ldr $t2,[sp,#`$Xoff+8*(16-14)`+0] + mov $Thi,$t1,lsr#1 + ldr $t3,[sp,#`$Xoff+8*(16-14)`+4] + eor $Tlo,$Tlo,$t1,lsl#31 + eor $Thi,$Thi,$t0,lsl#31 + eor $Tlo,$Tlo,$t0,lsr#8 + eor $Thi,$Thi,$t1,lsr#8 + eor $Tlo,$Tlo,$t1,lsl#24 + eor $Thi,$Thi,$t0,lsl#24 + eor $Tlo,$Tlo,$t0,lsr#7 + eor $Thi,$Thi,$t1,lsr#7 + eor $Tlo,$Tlo,$t1,lsl#25 + + @ sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6)) + @ LO lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26 + @ HI hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6 + mov $t0,$t2,lsr#19 + mov $t1,$t3,lsr#19 + eor $t0,$t0,$t3,lsl#13 + eor $t1,$t1,$t2,lsl#13 + eor $t0,$t0,$t3,lsr#29 + eor $t1,$t1,$t2,lsr#29 + eor $t0,$t0,$t2,lsl#3 + eor $t1,$t1,$t3,lsl#3 + eor $t0,$t0,$t2,lsr#6 + eor $t1,$t1,$t3,lsr#6 + ldr $t2,[sp,#`$Xoff+8*(16-9)`+0] + eor $t0,$t0,$t3,lsl#26 + + ldr $t3,[sp,#`$Xoff+8*(16-9)`+4] + adds $Tlo,$Tlo,$t0 + ldr $t0,[sp,#`$Xoff+8*16`+0] + adc $Thi,$Thi,$t1 + + ldr $t1,[sp,#`$Xoff+8*16`+4] + adds $Tlo,$Tlo,$t2 + adc $Thi,$Thi,$t3 + adds $Tlo,$Tlo,$t0 + adc $Thi,$Thi,$t1 +___ + &BODY_00_15(0x17); +$code.=<<___; +#if __ARM_ARCH>=7 + ittt eq @ Thumb2 thing, sanity check in ARM +#endif + ldreq $t0,[sp,#`$Xoff+8*(16-1)`+0] + ldreq $t1,[sp,#`$Xoff+8*(16-1)`+4] + beq .L16_79 + bic $Ktbl,$Ktbl,#1 + + ldr $Tlo,[sp,#$Boff+0] + ldr $Thi,[sp,#$Boff+4] + ldr $t0, [$ctx,#$Aoff+$lo] + ldr $t1, [$ctx,#$Aoff+$hi] + ldr $t2, [$ctx,#$Boff+$lo] + ldr $t3, [$ctx,#$Boff+$hi] + adds $t0,$Alo,$t0 + str $t0, [$ctx,#$Aoff+$lo] + adc $t1,$Ahi,$t1 + str $t1, [$ctx,#$Aoff+$hi] + adds $t2,$Tlo,$t2 + str $t2, [$ctx,#$Boff+$lo] + adc $t3,$Thi,$t3 + str $t3, [$ctx,#$Boff+$hi] + + ldr $Alo,[sp,#$Coff+0] + ldr $Ahi,[sp,#$Coff+4] + ldr $Tlo,[sp,#$Doff+0] + ldr $Thi,[sp,#$Doff+4] + ldr $t0, [$ctx,#$Coff+$lo] + ldr $t1, [$ctx,#$Coff+$hi] + ldr $t2, [$ctx,#$Doff+$lo] + ldr $t3, [$ctx,#$Doff+$hi] + adds $t0,$Alo,$t0 + str $t0, [$ctx,#$Coff+$lo] + adc $t1,$Ahi,$t1 + str $t1, [$ctx,#$Coff+$hi] + adds $t2,$Tlo,$t2 + str $t2, [$ctx,#$Doff+$lo] + adc $t3,$Thi,$t3 + str $t3, [$ctx,#$Doff+$hi] + + ldr $Tlo,[sp,#$Foff+0] + 
ldr $Thi,[sp,#$Foff+4] + ldr $t0, [$ctx,#$Eoff+$lo] + ldr $t1, [$ctx,#$Eoff+$hi] + ldr $t2, [$ctx,#$Foff+$lo] + ldr $t3, [$ctx,#$Foff+$hi] + adds $Elo,$Elo,$t0 + str $Elo,[$ctx,#$Eoff+$lo] + adc $Ehi,$Ehi,$t1 + str $Ehi,[$ctx,#$Eoff+$hi] + adds $t2,$Tlo,$t2 + str $t2, [$ctx,#$Foff+$lo] + adc $t3,$Thi,$t3 + str $t3, [$ctx,#$Foff+$hi] + + ldr $Alo,[sp,#$Goff+0] + ldr $Ahi,[sp,#$Goff+4] + ldr $Tlo,[sp,#$Hoff+0] + ldr $Thi,[sp,#$Hoff+4] + ldr $t0, [$ctx,#$Goff+$lo] + ldr $t1, [$ctx,#$Goff+$hi] + ldr $t2, [$ctx,#$Hoff+$lo] + ldr $t3, [$ctx,#$Hoff+$hi] + adds $t0,$Alo,$t0 + str $t0, [$ctx,#$Goff+$lo] + adc $t1,$Ahi,$t1 + str $t1, [$ctx,#$Goff+$hi] + adds $t2,$Tlo,$t2 + str $t2, [$ctx,#$Hoff+$lo] + adc $t3,$Thi,$t3 + str $t3, [$ctx,#$Hoff+$hi] + + add sp,sp,#640 + sub $Ktbl,$Ktbl,#640 + + teq $inp,$len + bne .Loop + + add sp,sp,#8*9 @ destroy frame +#if __ARM_ARCH>=5 + ldmia sp!,{r4-r12,pc} +#else + ldmia sp!,{r4-r12,lr} + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet + bx lr @ interoperable with Thumb ISA:-) +#endif +.size sha512_block_data_order_nohw,.-sha512_block_data_order_nohw +___ + +{ +my @Sigma0=(28,34,39); +my @Sigma1=(14,18,41); +my @sigma0=(1, 8, 7); +my @sigma1=(19,61,6); + +my $Ktbl="r3"; +my $cnt="r12"; # volatile register known as ip, intra-procedure-call scratch + +my @X=map("d$_",(0..15)); +my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23)); + +sub NEON_00_15() { +my $i=shift; +my ($a,$b,$c,$d,$e,$f,$g,$h)=@_; +my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31)); # temps + +$code.=<<___ if ($i<16 || $i&1); + vshr.u64 $t0,$e,#@Sigma1[0] @ $i +#if $i<16 + vld1.64 {@X[$i%16]},[$inp]! @ handles unaligned +#endif + vshr.u64 $t1,$e,#@Sigma1[1] +#if $i>0 + vadd.i64 $a,$Maj @ h+=Maj from the past +#endif + vshr.u64 $t2,$e,#@Sigma1[2] +___ +$code.=<<___; + vld1.64 {$K},[$Ktbl,:64]! 
@ K[i++] + vsli.64 $t0,$e,#`64-@Sigma1[0]` + vsli.64 $t1,$e,#`64-@Sigma1[1]` + vmov $Ch,$e + vsli.64 $t2,$e,#`64-@Sigma1[2]` +#if $i<16 && defined(__ARMEL__) + vrev64.8 @X[$i],@X[$i] +#endif + veor $t1,$t0 + vbsl $Ch,$f,$g @ Ch(e,f,g) + vshr.u64 $t0,$a,#@Sigma0[0] + veor $t2,$t1 @ Sigma1(e) + vadd.i64 $T1,$Ch,$h + vshr.u64 $t1,$a,#@Sigma0[1] + vsli.64 $t0,$a,#`64-@Sigma0[0]` + vadd.i64 $T1,$t2 + vshr.u64 $t2,$a,#@Sigma0[2] + vadd.i64 $K,@X[$i%16] + vsli.64 $t1,$a,#`64-@Sigma0[1]` + veor $Maj,$a,$b + vsli.64 $t2,$a,#`64-@Sigma0[2]` + veor $h,$t0,$t1 + vadd.i64 $T1,$K + vbsl $Maj,$c,$b @ Maj(a,b,c) + veor $h,$t2 @ Sigma0(a) + vadd.i64 $d,$T1 + vadd.i64 $Maj,$T1 + @ vadd.i64 $h,$Maj +___ +} + +sub NEON_16_79() { +my $i=shift; + +if ($i&1) { &NEON_00_15($i,@_); return; } + +# 2x-vectorized, therefore runs every 2nd round +my @X=map("q$_",(0..7)); # view @X as 128-bit vector +my ($t0,$t1,$s0,$s1) = map("q$_",(12..15)); # temps +my ($d0,$d1,$d2) = map("d$_",(24..26)); # temps from NEON_00_15 +my $e=@_[4]; # $e from NEON_00_15 +$i /= 2; +$code.=<<___; + vshr.u64 $t0,@X[($i+7)%8],#@sigma1[0] + vshr.u64 $t1,@X[($i+7)%8],#@sigma1[1] + vadd.i64 @_[0],d30 @ h+=Maj from the past + vshr.u64 $s1,@X[($i+7)%8],#@sigma1[2] + vsli.64 $t0,@X[($i+7)%8],#`64-@sigma1[0]` + vext.8 $s0,@X[$i%8],@X[($i+1)%8],#8 @ X[i+1] + vsli.64 $t1,@X[($i+7)%8],#`64-@sigma1[1]` + veor $s1,$t0 + vshr.u64 $t0,$s0,#@sigma0[0] + veor $s1,$t1 @ sigma1(X[i+14]) + vshr.u64 $t1,$s0,#@sigma0[1] + vadd.i64 @X[$i%8],$s1 + vshr.u64 $s1,$s0,#@sigma0[2] + vsli.64 $t0,$s0,#`64-@sigma0[0]` + vsli.64 $t1,$s0,#`64-@sigma0[1]` + vext.8 $s0,@X[($i+4)%8],@X[($i+5)%8],#8 @ X[i+9] + veor $s1,$t0 + vshr.u64 $d0,$e,#@Sigma1[0] @ from NEON_00_15 + vadd.i64 @X[$i%8],$s0 + vshr.u64 $d1,$e,#@Sigma1[1] @ from NEON_00_15 + veor $s1,$t1 @ sigma0(X[i+1]) + vshr.u64 $d2,$e,#@Sigma1[2] @ from NEON_00_15 + vadd.i64 @X[$i%8],$s1 +___ + &NEON_00_15(2*$i,@_); +} + +$code.=<<___; +#if __ARM_MAX_ARCH__>=7 +.arch armv7-a +.fpu neon + +.global sha512_block_data_order_neon +.type sha512_block_data_order_neon,%function +.align 4 +sha512_block_data_order_neon: + dmb @ errata #451034 on early Cortex A8 + add $len,$inp,$len,lsl#7 @ len to point at the end of inp + adr $Ktbl,K512 + VFP_ABI_PUSH + vldmia $ctx,{$A-$H} @ load context +.Loop_neon: +___ +for($i=0;$i<16;$i++) { &NEON_00_15($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; + mov $cnt,#4 +.L16_79_neon: + subs $cnt,#1 +___ +for(;$i<32;$i++) { &NEON_16_79($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; + bne .L16_79_neon + + vadd.i64 $A,d30 @ h+=Maj from the past + vldmia $ctx,{d24-d31} @ load context to temp + vadd.i64 q8,q12 @ vectorized accumulate + vadd.i64 q9,q13 + vadd.i64 q10,q14 + vadd.i64 q11,q15 + vstmia $ctx,{$A-$H} @ save context + teq $inp,$len + sub $Ktbl,#640 @ rewind K512 + bne .Loop_neon + + VFP_ABI_POP + ret @ bx lr +.size sha512_block_data_order_neon,.-sha512_block_data_order_neon +#endif +___ +} +$code.=<<___; +.asciz "SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by " +.align 2 +___ + +$code =~ s/\`([^\`]*)\`/eval $1/gem; +$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4 +$code =~ s/\bret\b/bx lr/gm; + +open SELF,$0; +while() { + next if (/^#!/); + last if (!s/^#/@/ and !/^$/); + print; +} +close SELF; + +print $code; +close STDOUT or die "error closing STDOUT: $!"; # enforce flush diff --git a/ring-0.17.14/crypto/fipsmodule/sha/asm/sha512-armv8.pl b/ring-0.17.14/crypto/fipsmodule/sha/asm/sha512-armv8.pl new file mode 100644 index 0000000000..a3ed706774 --- 
/dev/null +++ b/ring-0.17.14/crypto/fipsmodule/sha/asm/sha512-armv8.pl @@ -0,0 +1,575 @@ +#! /usr/bin/env perl +# Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. +# ==================================================================== +# +# SHA256/512 for ARMv8. +# +# Performance in cycles per processed byte and improvement coefficient +# over code generated with "default" compiler: +# +# SHA256-hw SHA256(*) SHA512 +# Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**)) +# Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***)) +# Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***)) +# Denver 2.01 10.5 (+26%) 6.70 (+8%) +# X-Gene 20.0 (+100%) 12.8 (+300%(***)) +# Mongoose 2.36 13.0 (+50%) 8.36 (+33%) +# Kryo 1.92 17.4 (+30%) 11.2 (+8%) +# +# (*) Software SHA256 results are of lesser relevance, presented +# mostly for informational purposes. +# (**) The result is a trade-off: it's possible to improve it by +# 10% (or by 1 cycle per round), but at the cost of 20% loss +# on Cortex-A53 (or by 4 cycles per round). +# (***) Super-impressive coefficients over gcc-generated code are +# indication of some compiler "pathology", most notably code +# generated with -mgeneral-regs-only is significantly faster +# and the gap is only 40-90%. 
+ +my ($flavour, $output) = @ARGV; + +if ($output =~ /sha512-armv8/) { + $BITS=512; + $SZ=8; + @Sigma0=(28,34,39); + @Sigma1=(14,18,41); + @sigma0=(1, 8, 7); + @sigma1=(19,61, 6); + $rounds=80; + $reg_t="x"; +} else { + $BITS=256; + $SZ=4; + @Sigma0=( 2,13,22); + @Sigma1=( 6,11,25); + @sigma0=( 7,18, 3); + @sigma1=(17,19,10); + $rounds=64; + $reg_t="w"; +} + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; + *STDOUT=*OUT; +} else { + open OUT,">$output"; + *STDOUT=*OUT; +} + +$func="sha${BITS}_block_data_order_nohw"; + +($ctx,$inp,$num,$Ktbl)=map("x$_",(0..2,30)); + +@X=map("$reg_t$_",(3..15,0..2)); +@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("$reg_t$_",(20..27)); +($t0,$t1,$t2,$t3)=map("$reg_t$_",(16,17,19,28)); + +sub BODY_00_xx { +my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_; +my $j=($i+1)&15; +my ($T0,$T1,$T2)=(@X[($i-8)&15],@X[($i-9)&15],@X[($i-10)&15]); + $T0=@X[$i+3] if ($i<11); + +$code.=<<___ if ($i<16); +#ifndef __AARCH64EB__ + rev @X[$i],@X[$i] // $i +#endif +___ +$code.=<<___ if ($i<13 && ($i&1)); + ldp @X[$i+1],@X[$i+2],[$inp],#2*$SZ +___ +$code.=<<___ if ($i==13); + ldp @X[14],@X[15],[$inp] +___ +$code.=<<___ if ($i>=14); + ldr @X[($i-11)&15],[sp,#`$SZ*(($i-11)%4)`] +___ +$code.=<<___ if ($i>0 && $i<16); + add $a,$a,$t1 // h+=Sigma0(a) +___ +$code.=<<___ if ($i>=11); + str @X[($i-8)&15],[sp,#`$SZ*(($i-8)%4)`] +___ +# While ARMv8 specifies merged rotate-n-logical operation such as +# 'eor x,y,z,ror#n', it was found to negatively affect performance +# on Apple A7. The reason seems to be that it requires even 'y' to +# be available earlier. This means that such merged instruction is +# not necessarily best choice on critical path... On the other hand +# Cortex-A5x handles merged instructions much better than disjoint +# rotate and logical... See (**) footnote above. 
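+
+# The disjoint sequence below leans on the fact that a rotation is a bit
+# permutation and therefore distributes over XOR: for the SHA-512 constants,
+# ror(e,18)^ror(e,41) == ror(e ^ ror(e,41-18), 18), which is why Sigma1(e)
+# can be assembled from "eor T0,e,e,ror#23" followed by "eor t0,t0,T0,ror#18"
+# on top of the initial "ror t0,e,#14". The helper below is a plain-Perl
+# restatement of that identity, shown for the 64-bit case only (the 32-bit
+# SHA-256 flavour works the same way); it is never called by this generator.
+sub ref_sigma1_identity {
+    my ($e) = @_;
+    my $ror = sub { my ($x,$n) = @_; return (($x >> $n) | ($x << (64 - $n))) & 0xffffffffffffffff; };
+    my $direct    = $ror->($e,14) ^ $ror->($e,18) ^ $ror->($e,41);
+    my $scheduled = $ror->($e,14) ^ $ror->($e ^ $ror->($e,41-18), 18);
+    return $direct == $scheduled;   # holds for every 64-bit $e
+}
+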
+$code.=<<___ if ($i<15); + ror $t0,$e,#$Sigma1[0] + add $h,$h,$t2 // h+=K[i] + eor $T0,$e,$e,ror#`$Sigma1[2]-$Sigma1[1]` + and $t1,$f,$e + bic $t2,$g,$e + add $h,$h,@X[$i&15] // h+=X[i] + orr $t1,$t1,$t2 // Ch(e,f,g) + eor $t2,$a,$b // a^b, b^c in next round + eor $t0,$t0,$T0,ror#$Sigma1[1] // Sigma1(e) + ror $T0,$a,#$Sigma0[0] + add $h,$h,$t1 // h+=Ch(e,f,g) + eor $t1,$a,$a,ror#`$Sigma0[2]-$Sigma0[1]` + add $h,$h,$t0 // h+=Sigma1(e) + and $t3,$t3,$t2 // (b^c)&=(a^b) + add $d,$d,$h // d+=h + eor $t3,$t3,$b // Maj(a,b,c) + eor $t1,$T0,$t1,ror#$Sigma0[1] // Sigma0(a) + add $h,$h,$t3 // h+=Maj(a,b,c) + ldr $t3,[$Ktbl],#$SZ // *K++, $t2 in next round + //add $h,$h,$t1 // h+=Sigma0(a) +___ +$code.=<<___ if ($i>=15); + ror $t0,$e,#$Sigma1[0] + add $h,$h,$t2 // h+=K[i] + ror $T1,@X[($j+1)&15],#$sigma0[0] + and $t1,$f,$e + ror $T2,@X[($j+14)&15],#$sigma1[0] + bic $t2,$g,$e + ror $T0,$a,#$Sigma0[0] + add $h,$h,@X[$i&15] // h+=X[i] + eor $t0,$t0,$e,ror#$Sigma1[1] + eor $T1,$T1,@X[($j+1)&15],ror#$sigma0[1] + orr $t1,$t1,$t2 // Ch(e,f,g) + eor $t2,$a,$b // a^b, b^c in next round + eor $t0,$t0,$e,ror#$Sigma1[2] // Sigma1(e) + eor $T0,$T0,$a,ror#$Sigma0[1] + add $h,$h,$t1 // h+=Ch(e,f,g) + and $t3,$t3,$t2 // (b^c)&=(a^b) + eor $T2,$T2,@X[($j+14)&15],ror#$sigma1[1] + eor $T1,$T1,@X[($j+1)&15],lsr#$sigma0[2] // sigma0(X[i+1]) + add $h,$h,$t0 // h+=Sigma1(e) + eor $t3,$t3,$b // Maj(a,b,c) + eor $t1,$T0,$a,ror#$Sigma0[2] // Sigma0(a) + eor $T2,$T2,@X[($j+14)&15],lsr#$sigma1[2] // sigma1(X[i+14]) + add @X[$j],@X[$j],@X[($j+9)&15] + add $d,$d,$h // d+=h + add $h,$h,$t3 // h+=Maj(a,b,c) + ldr $t3,[$Ktbl],#$SZ // *K++, $t2 in next round + add @X[$j],@X[$j],$T1 + add $h,$h,$t1 // h+=Sigma0(a) + add @X[$j],@X[$j],$T2 +___ + ($t2,$t3)=($t3,$t2); +} + +$code.=<<___; +#ifndef __KERNEL__ +#endif + +.text + +.globl $func +.type $func,%function +.align 6 +$func: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#4*$SZ + + ldp $A,$B,[$ctx] // load context + ldp $C,$D,[$ctx,#2*$SZ] + ldp $E,$F,[$ctx,#4*$SZ] + add $num,$inp,$num,lsl#`log(16*$SZ)/log(2)` // end of input + ldp $G,$H,[$ctx,#6*$SZ] + adrp $Ktbl,:pg_hi21:.LK$BITS + add $Ktbl,$Ktbl,:lo12:.LK$BITS + stp $ctx,$num,[x29,#96] + +.Loop: + ldp @X[0],@X[1],[$inp],#2*$SZ + ldr $t2,[$Ktbl],#$SZ // *K++ + eor $t3,$B,$C // magic seed + str $inp,[x29,#112] +___ +for ($i=0;$i<16;$i++) { &BODY_00_xx($i,@V); unshift(@V,pop(@V)); } +$code.=".Loop_16_xx:\n"; +for (;$i<32;$i++) { &BODY_00_xx($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; + cbnz $t2,.Loop_16_xx + + ldp $ctx,$num,[x29,#96] + ldr $inp,[x29,#112] + sub $Ktbl,$Ktbl,#`$SZ*($rounds+1)` // rewind + + ldp @X[0],@X[1],[$ctx] + ldp @X[2],@X[3],[$ctx,#2*$SZ] + add $inp,$inp,#14*$SZ // advance input pointer + ldp @X[4],@X[5],[$ctx,#4*$SZ] + add $A,$A,@X[0] + ldp @X[6],@X[7],[$ctx,#6*$SZ] + add $B,$B,@X[1] + add $C,$C,@X[2] + add $D,$D,@X[3] + stp $A,$B,[$ctx] + add $E,$E,@X[4] + add $F,$F,@X[5] + stp $C,$D,[$ctx,#2*$SZ] + add $G,$G,@X[6] + add $H,$H,@X[7] + cmp $inp,$num + stp $E,$F,[$ctx,#4*$SZ] + stp $G,$H,[$ctx,#6*$SZ] + b.ne .Loop + + ldp x19,x20,[x29,#16] + add sp,sp,#4*$SZ + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#128 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size $func,.-$func + +.section .rodata +.align 6 +.type .LK$BITS,%object +.LK$BITS: +___ +$code.=<<___ if ($SZ==8); + .quad 0x428a2f98d728ae22,0x7137449123ef65cd + .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc + .quad 0x3956c25bf348b538,0x59f111f1b605d019 + .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 + .quad 0xd807aa98a3030242,0x12835b0145706fbe + .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 + .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 + .quad 0x9bdc06a725c71235,0xc19bf174cf692694 + .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 + .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 + .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 + .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 + .quad 0x983e5152ee66dfab,0xa831c66d2db43210 + .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 + .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 + .quad 0x06ca6351e003826f,0x142929670a0e6e70 + .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 + .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df + .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 + .quad 0x81c2c92e47edaee6,0x92722c851482353b + .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 + .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 + .quad 0xd192e819d6ef5218,0xd69906245565a910 + .quad 0xf40e35855771202a,0x106aa07032bbd1b8 + .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 + .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 + .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb + .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 + .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 + .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec + .quad 0x90befffa23631e28,0xa4506cebde82bde9 + .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b + .quad 0xca273eceea26619c,0xd186b8c721c0c207 + .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 + .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 + .quad 0x113f9804bef90dae,0x1b710b35131c471b + .quad 0x28db77f523047d84,0x32caab7b40c72493 + .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c + .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a + .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 + .quad 0 // terminator +___ +$code.=<<___ if ($SZ==4); + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 
+ .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + .long 0 //terminator +___ +$code.=<<___; +.size .LK$BITS,.-.LK$BITS +.asciz "SHA$BITS block transform for ARMv8, CRYPTOGAMS by " +.align 2 +___ + +if ($SZ==4) { +my $Ktbl="x3"; + +my ($ABCD,$EFGH,$abcd)=map("v$_.16b",(0..2)); +my @MSG=map("v$_.16b",(4..7)); +my ($W0,$W1)=("v16.4s","v17.4s"); +my ($ABCD_SAVE,$EFGH_SAVE)=("v18.16b","v19.16b"); + +$code.=<<___; +.text +#ifndef __KERNEL__ +.globl sha256_block_data_order_hw +.type sha256_block_data_order_hw,%function +.align 6 +sha256_block_data_order_hw: + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + AARCH64_VALID_CALL_TARGET + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + ld1.32 {$ABCD,$EFGH},[$ctx] + adrp $Ktbl,:pg_hi21:.LK256 + add $Ktbl,$Ktbl,:lo12:.LK256 + +.Loop_hw: + ld1 {@MSG[0]-@MSG[3]},[$inp],#64 + sub $num,$num,#1 + ld1.32 {$W0},[$Ktbl],#16 + rev32 @MSG[0],@MSG[0] + rev32 @MSG[1],@MSG[1] + rev32 @MSG[2],@MSG[2] + rev32 @MSG[3],@MSG[3] + orr $ABCD_SAVE,$ABCD,$ABCD // offload + orr $EFGH_SAVE,$EFGH,$EFGH +___ +for($i=0;$i<12;$i++) { +$code.=<<___; + ld1.32 {$W1},[$Ktbl],#16 + add.i32 $W0,$W0,@MSG[0] + sha256su0 @MSG[0],@MSG[1] + orr $abcd,$ABCD,$ABCD + sha256h $ABCD,$EFGH,$W0 + sha256h2 $EFGH,$abcd,$W0 + sha256su1 @MSG[0],@MSG[2],@MSG[3] +___ + ($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG)); +} +$code.=<<___; + ld1.32 {$W1},[$Ktbl],#16 + add.i32 $W0,$W0,@MSG[0] + orr $abcd,$ABCD,$ABCD + sha256h $ABCD,$EFGH,$W0 + sha256h2 $EFGH,$abcd,$W0 + + ld1.32 {$W0},[$Ktbl],#16 + add.i32 $W1,$W1,@MSG[1] + orr $abcd,$ABCD,$ABCD + sha256h $ABCD,$EFGH,$W1 + sha256h2 $EFGH,$abcd,$W1 + + ld1.32 {$W1},[$Ktbl] + add.i32 $W0,$W0,@MSG[2] + sub $Ktbl,$Ktbl,#$rounds*$SZ-16 // rewind + orr $abcd,$ABCD,$ABCD + sha256h $ABCD,$EFGH,$W0 + sha256h2 $EFGH,$abcd,$W0 + + add.i32 $W1,$W1,@MSG[3] + orr $abcd,$ABCD,$ABCD + sha256h $ABCD,$EFGH,$W1 + sha256h2 $EFGH,$abcd,$W1 + + add.i32 $ABCD,$ABCD,$ABCD_SAVE + add.i32 $EFGH,$EFGH,$EFGH_SAVE + + cbnz $num,.Loop_hw + + st1.32 {$ABCD,$EFGH},[$ctx] + + ldr x29,[sp],#16 + ret +.size sha256_block_data_order_hw,.-sha256_block_data_order_hw +#endif +___ +} + +if ($SZ==8) { +my $Ktbl="x3"; + +my @H = map("v$_.16b",(0..4)); +my ($fg,$de,$m9_10)=map("v$_.16b",(5..7)); +my @MSG=map("v$_.16b",(16..23)); +my ($W0,$W1)=("v24.2d","v25.2d"); +my ($AB,$CD,$EF,$GH)=map("v$_.16b",(26..29)); + +$code.=<<___; +.text +#ifndef __KERNEL__ +.globl sha512_block_data_order_hw +.type sha512_block_data_order_hw,%function +.align 6 +sha512_block_data_order_hw: + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + AARCH64_VALID_CALL_TARGET + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 + + ld1 {@MSG[0]-@MSG[3]},[$inp],#64 // load input + ld1 {@MSG[4]-@MSG[7]},[$inp],#64 + + ld1.64 {@H[0]-@H[3]},[$ctx] // load context + adrp $Ktbl,:pg_hi21:.LK512 + add $Ktbl,$Ktbl,:lo12:.LK512 + + rev64 @MSG[0],@MSG[0] + rev64 @MSG[1],@MSG[1] + rev64 @MSG[2],@MSG[2] + rev64 @MSG[3],@MSG[3] + rev64 @MSG[4],@MSG[4] + rev64 @MSG[5],@MSG[5] + rev64 @MSG[6],@MSG[6] + rev64 @MSG[7],@MSG[7] + b .Loop_hw + +.align 4 +.Loop_hw: + ld1.64 {$W0},[$Ktbl],#16 + subs $num,$num,#1 + sub x4,$inp,#128 + orr $AB,@H[0],@H[0] // offload + orr $CD,@H[1],@H[1] + orr $EF,@H[2],@H[2] + orr $GH,@H[3],@H[3] + csel $inp,$inp,x4,ne // conditional rewind +___ +for($i=0;$i<32;$i++) { +$code.=<<___; + add.i64 $W0,$W0,@MSG[0] + ld1.64 {$W1},[$Ktbl],#16 + ext $W0,$W0,$W0,#8 + ext $fg,@H[2],@H[3],#8 + ext $de,@H[1],@H[2],#8 + add.i64 @H[3],@H[3],$W0 // "T1 + H + K512[i]" + sha512su0 @MSG[0],@MSG[1] + ext $m9_10,@MSG[4],@MSG[5],#8 + sha512h @H[3],$fg,$de + sha512su1 @MSG[0],@MSG[7],$m9_10 + add.i64 @H[4],@H[1],@H[3] // "D + T1" + sha512h2 @H[3],$H[1],@H[0] +___ + ($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG)); + @H = (@H[3],@H[0],@H[4],@H[2],@H[1]); +} +for(;$i<40;$i++) { +$code.=<<___ if ($i<39); + ld1.64 {$W1},[$Ktbl],#16 +___ +$code.=<<___ if ($i==39); + sub $Ktbl,$Ktbl,#$rounds*$SZ // rewind +___ +$code.=<<___; + add.i64 $W0,$W0,@MSG[0] + ld1 {@MSG[0]},[$inp],#16 // load next input + ext $W0,$W0,$W0,#8 + ext $fg,@H[2],@H[3],#8 + ext $de,@H[1],@H[2],#8 + add.i64 @H[3],@H[3],$W0 // "T1 + H + K512[i]" + sha512h @H[3],$fg,$de + rev64 @MSG[0],@MSG[0] + add.i64 @H[4],@H[1],@H[3] // "D + T1" + sha512h2 @H[3],$H[1],@H[0] +___ + ($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG)); + @H = (@H[3],@H[0],@H[4],@H[2],@H[1]); +} +$code.=<<___; + add.i64 @H[0],@H[0],$AB // accumulate + add.i64 @H[1],@H[1],$CD + add.i64 @H[2],@H[2],$EF + add.i64 @H[3],@H[3],$GH + + cbnz $num,.Loop_hw + + st1.64 {@H[0]-@H[3]},[$ctx] // store context + + ldr x29,[sp],#16 + ret +.size sha512_block_data_order_hw,.-sha512_block_data_order_hw +#endif +___ +} + +{ my %opcode = ( + "sha256h" => 0x5e004000, "sha256h2" => 0x5e005000, + "sha256su0" => 0x5e282800, "sha256su1" => 0x5e006000 ); + + sub unsha256 { + my ($mnemonic,$arg)=@_; + + $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o + && + sprintf ".inst\t0x%08x\t//%s %s", + $opcode{$mnemonic}|$1|($2<<5)|($3<<16), + $mnemonic,$arg; + } +} + +{ my %opcode = ( + "sha512h" => 0xce608000, "sha512h2" => 0xce608400, + "sha512su0" => 0xcec08000, "sha512su1" => 0xce608800 ); + + sub unsha512 { + my ($mnemonic,$arg)=@_; + + $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o + && + sprintf ".inst\t0x%08x\t//%s %s", + $opcode{$mnemonic}|$1|($2<<5)|($3<<16), + $mnemonic,$arg; + } +} + +open SELF,$0; +while() { + next if (/^#!/); + last if (!s/^#/\/\// and !/^$/); + print; +} +close SELF; + +foreach(split("\n",$code)) { + + s/\`([^\`]*)\`/eval($1)/ge; + + s/\b(sha512\w+)\s+([qv].*)/unsha512($1,$2)/ge or + s/\b(sha256\w+)\s+([qv].*)/unsha256($1,$2)/ge; + + s/\bq([0-9]+)\b/v$1.16b/g; # old->new registers + + s/\.[ui]?8(\s)/$1/; + s/\.\w?64\b// and s/\.16b/\.2d/g or + s/\.\w?32\b// and s/\.16b/\.4s/g; + m/\bext\b/ and s/\.2d/\.16b/g or + m/(ld|st)1[^\[]+\[0\]/ and s/\.4s/\.s/g; + + print $_,"\n"; +} + +close STDOUT or die "error closing STDOUT: $!"; diff --git a/ring-0.17.14/crypto/fipsmodule/sha/asm/sha512-x86_64.pl b/ring-0.17.14/crypto/fipsmodule/sha/asm/sha512-x86_64.pl new file mode 100644 index 0000000000..76a411043e --- /dev/null +++ 
b/ring-0.17.14/crypto/fipsmodule/sha/asm/sha512-x86_64.pl @@ -0,0 +1,1655 @@ +#! /usr/bin/env perl +# Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. Rights for redistribution and usage in source and binary +# forms are granted according to the License. +# ==================================================================== +# +# sha256/512_block procedure for x86_64. +# +# 40% improvement over compiler-generated code on Opteron. On EM64T +# sha256 was observed to run >80% faster and sha512 - >40%. No magical +# tricks, just straight implementation... I really wonder why gcc +# [being armed with inline assembler] fails to generate as fast code. +# The only thing which is cool about this module is that it's very +# same instruction sequence used for both SHA-256 and SHA-512. In +# former case the instructions operate on 32-bit operands, while in +# latter - on 64-bit ones. All I had to do is to get one flavor right, +# the other one passed the test right away:-) +# +# sha256_block runs in ~1005 cycles on Opteron, which gives you +# asymptotic performance of 64*1000/1005=63.7MBps times CPU clock +# frequency in GHz. sha512_block runs in ~1275 cycles, which results +# in 128*1000/1275=100MBps per GHz. Is there room for improvement? +# Well, if you compare it to IA-64 implementation, which maintains +# X[16] in register bank[!], tends to 4 instructions per CPU clock +# cycle and runs in 1003 cycles, 1275 is very good result for 3-way +# issue Opteron pipeline and X[16] maintained in memory. So that *if* +# there is a way to improve it, *then* the only way would be to try to +# offload X[16] updates to SSE unit, but that would require "deeper" +# loop unroll, which in turn would naturally cause size blow-up, not +# to mention increased complexity! And once again, only *if* it's +# actually possible to noticeably improve overall ILP, instruction +# level parallelism, on a given CPU implementation in this case. +# +# Special note on Intel EM64T. While Opteron CPU exhibits perfect +# performance ratio of 1.5 between 64- and 32-bit flavors [see above], +# [currently available] EM64T CPUs apparently are far from it. On the +# contrary, 64-bit version, sha512_block, is ~30% *slower* than 32-bit +# sha256_block:-( This is presumably because 64-bit shifts/rotates +# apparently are not atomic instructions, but implemented in microcode. +# +# May 2012. +# +# Optimization including one of Pavel Semjanov's ideas, alternative +# Maj, resulted in >=5% improvement on most CPUs, +20% SHA256 and +# unfortunately -2% SHA512 on P4 [which nobody should care about +# that much]. +# +# June 2012. +# +# Add SIMD code paths, see below for improvement coefficients. 
SSSE3 +# code path was not attempted for SHA512, because improvement is not +# estimated to be high enough, noticeably less than 9%, to justify +# the effort, not on pre-AVX processors. [Obviously with exclusion +# for VIA Nano, but it has SHA512 instruction that is faster and +# should be used instead.] For reference, corresponding estimated +# upper limit for improvement for SSSE3 SHA256 is 28%. The fact that +# higher coefficients are observed on VIA Nano and Bulldozer has more +# to do with specifics of their architecture [which is topic for +# separate discussion]. +# +# November 2012. +# +# Add AVX2 code path. Two consecutive input blocks are loaded to +# 256-bit %ymm registers, with data from first block to least +# significant 128-bit halves and data from second to most significant. +# The data is then processed with same SIMD instruction sequence as +# for AVX, but with %ymm as operands. Side effect is increased stack +# frame, 448 additional bytes in SHA256 and 1152 in SHA512, and 1.2KB +# code size increase. +# +# March 2014. +# +# Add support for Intel SHA Extensions. + +###################################################################### +# Current performance in cycles per processed byte (less is better): +# +# SHA256 SSSE3 AVX/XOP(*) SHA512 AVX/XOP(*) +# +# AMD K8 14.9 - - 9.57 - +# P4 17.3 - - 30.8 - +# Core 2 15.6 13.8(+13%) - 9.97 - +# Westmere 14.8 12.3(+19%) - 9.58 - +# Sandy Bridge 17.4 14.2(+23%) 11.6(+50%(**)) 11.2 8.10(+38%(**)) +# Ivy Bridge 12.6 10.5(+20%) 10.3(+22%) 8.17 7.22(+13%) +# Haswell 12.2 9.28(+31%) 7.80(+56%) 7.66 5.40(+42%) +# Skylake 11.4 9.03(+26%) 7.70(+48%) 7.25 5.20(+40%) +# Bulldozer 21.1 13.6(+54%) 13.6(+54%(***)) 13.5 8.58(+57%) +# Ryzen 11.0 9.02(+22%) 2.05(+440%) 7.05 5.67(+20%) +# VIA Nano 23.0 16.5(+39%) - 14.7 - +# Atom 23.0 18.9(+22%) - 14.7 - +# Silvermont 27.4 20.6(+33%) - 17.5 - +# Knights L 27.4 21.0(+30%) 19.6(+40%) 17.5 12.8(+37%) +# Goldmont 18.9 14.3(+32%) 4.16(+350%) 12.0 - +# +# (*) whichever best applicable, including SHAEXT; +# (**) switch from ror to shrd stands for fair share of improvement; +# (***) execution time is fully determined by remaining integer-only +# part, body_00_15; reducing the amount of SIMD instructions +# below certain limit makes no difference/sense; to conserve +# space SHA256 XOP code path is therefore omitted; +# +# Modified from upstream OpenSSL to remove the XOP code. 
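+
+# A note on the "alternative Maj" mentioned above: the integer rounds below
+# compute h += Maj(a,b,c) as b ^ ((a^b) & (b^c)), i.e. Ch(a^b,c,b), so the
+# a^b value produced in one round can be reused as the b^c value of the next.
+# The plain-Perl helper below merely restates that identity; it is
+# illustrative only and is never called by this generator.
+sub ref_alternative_maj {
+    my ($a, $b, $c) = @_;
+    my $classic     = ($a & $b) ^ ($a & $c) ^ ($b & $c);
+    my $alternative = $b ^ (($a ^ $b) & ($b ^ $c));
+    return $classic == $alternative;    # holds for all inputs
+}
+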
+ +my ($flavour, $output) = @ARGV; + +if ($output =~ /sha512-x86_64/) { + $func="sha512_block_data_order"; + $TABLE="K512"; + $SZ=8; + @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%rax","%rbx","%rcx","%rdx", + "%r8", "%r9", "%r10","%r11"); + ($T1,$a0,$a1,$a2,$a3)=("%r12","%r13","%r14","%r15","%rdi"); + @Sigma0=(28,34,39); + @Sigma1=(14,18,41); + @sigma0=(1, 8, 7); + @sigma1=(19,61, 6); + $rounds=80; +} else { + $func="sha256_block_data_order"; + $TABLE="K256"; + $SZ=4; + @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx", + "%r8d","%r9d","%r10d","%r11d"); + ($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%edi"); + @Sigma0=( 2,13,22); + @Sigma1=( 6,11,25); + @sigma0=( 7,18, 3); + @sigma1=(17,19,10); + $rounds=64; +} + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +# In upstream, this is controlled by shelling out to the compiler to check +# versions, but BoringSSL is intended to be used with pre-generated perlasm +# output, so this isn't useful anyway. +# +# This file also has an AVX2 implementation, controlled by setting $avx to 2. +# For now, we intentionally disable it. While it gives a 13-16% perf boost, the +# CFI annotations are wrong. It allocates stack in a loop and should be +# rewritten to avoid this. +$avx = 1; +$shaext = 1; + +open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; +*STDOUT=*OUT; + +$ctx="%rdi"; # 1st arg, zapped by $a3 +$inp="%rsi"; # 2nd arg +$Tbl="%rbp"; + +$_ctx="16*$SZ+0*8(%rsp)"; +$_inp="16*$SZ+1*8(%rsp)"; +$_end="16*$SZ+2*8(%rsp)"; +$_rsp="`16*$SZ+3*8`(%rsp)"; +$framesz="16*$SZ+4*8"; + + +sub ROUND_00_15() +{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; + my $STRIDE=$SZ; + $STRIDE += 16 if ($i%(16/$SZ)==(16/$SZ-1)); + +$code.=<<___; + ror \$`$Sigma1[2]-$Sigma1[1]`,$a0 + mov $f,$a2 + + xor $e,$a0 + ror \$`$Sigma0[2]-$Sigma0[1]`,$a1 + xor $g,$a2 # f^g + + mov $T1,`$SZ*($i&0xf)`(%rsp) + xor $a,$a1 + and $e,$a2 # (f^g)&e + + ror \$`$Sigma1[1]-$Sigma1[0]`,$a0 + add $h,$T1 # T1+=h + xor $g,$a2 # Ch(e,f,g)=((f^g)&e)^g + + ror \$`$Sigma0[1]-$Sigma0[0]`,$a1 + xor $e,$a0 + add $a2,$T1 # T1+=Ch(e,f,g) + + mov $a,$a2 + add ($Tbl),$T1 # T1+=K[round] + xor $a,$a1 + + xor $b,$a2 # a^b, b^c in next round + ror \$$Sigma1[0],$a0 # Sigma1(e) + mov $b,$h + + and $a2,$a3 + ror \$$Sigma0[0],$a1 # Sigma0(a) + add $a0,$T1 # T1+=Sigma1(e) + + xor $a3,$h # h=Maj(a,b,c)=Ch(a^b,c,b) + add $T1,$d # d+=T1 + add $T1,$h # h+=T1 + + lea $STRIDE($Tbl),$Tbl # round++ +___ +$code.=<<___ if ($i<15); + add $a1,$h # h+=Sigma0(a) +___ + ($a2,$a3) = ($a3,$a2); +} + +sub ROUND_16_XX() +{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; + +$code.=<<___; + mov `$SZ*(($i+1)&0xf)`(%rsp),$a0 + mov `$SZ*(($i+14)&0xf)`(%rsp),$a2 + + mov $a0,$T1 + ror \$`$sigma0[1]-$sigma0[0]`,$a0 + add $a1,$a # modulo-scheduled h+=Sigma0(a) + mov $a2,$a1 + ror \$`$sigma1[1]-$sigma1[0]`,$a2 + + xor $T1,$a0 + shr \$$sigma0[2],$T1 + ror \$$sigma0[0],$a0 + xor $a1,$a2 + shr \$$sigma1[2],$a1 + + ror \$$sigma1[0],$a2 + xor $a0,$T1 # sigma0(X[(i+1)&0xf]) + xor $a1,$a2 # sigma1(X[(i+14)&0xf]) + add `$SZ*(($i+9)&0xf)`(%rsp),$T1 + + add `$SZ*($i&0xf)`(%rsp),$T1 + mov $e,$a0 + add $a2,$T1 + mov $a,$a1 +___ + &ROUND_00_15(@_); +} + +$code=<<___; +.text + +.globl ${func}_nohw +.type ${func}_nohw,\@function,3 +.align 16 +${func}_nohw: +.cfi_startproc + _CET_ENDBR + mov %rsp,%rax # copy %rsp +.cfi_def_cfa_register 
%rax + push %rbx +.cfi_push %rbx + push %rbp +.cfi_push %rbp + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + shl \$4,%rdx # num*16 + sub \$$framesz,%rsp + lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ + and \$-64,%rsp # align stack frame + mov $ctx,$_ctx # save ctx, 1st arg + mov $inp,$_inp # save inp, 2nd arh + mov %rdx,$_end # save end pointer, "3rd" arg + mov %rax,$_rsp # save copy of %rsp +.cfi_cfa_expression $_rsp,deref,+8 +.Lprologue: + + mov $SZ*0($ctx),$A + mov $SZ*1($ctx),$B + mov $SZ*2($ctx),$C + mov $SZ*3($ctx),$D + mov $SZ*4($ctx),$E + mov $SZ*5($ctx),$F + mov $SZ*6($ctx),$G + mov $SZ*7($ctx),$H + jmp .Lloop + +.align 16 +.Lloop: + mov $B,$a3 + lea $TABLE(%rip),$Tbl + xor $C,$a3 # magic +___ + for($i=0;$i<16;$i++) { + $code.=" mov $SZ*$i($inp),$T1\n"; + $code.=" mov @ROT[4],$a0\n"; + $code.=" mov @ROT[0],$a1\n"; + $code.=" bswap $T1\n"; + &ROUND_00_15($i,@ROT); + unshift(@ROT,pop(@ROT)); + } +$code.=<<___; + jmp .Lrounds_16_xx +.align 16 +.Lrounds_16_xx: +___ + for(;$i<32;$i++) { + &ROUND_16_XX($i,@ROT); + unshift(@ROT,pop(@ROT)); + } + +$code.=<<___; + cmpb \$0,`$SZ-1`($Tbl) + jnz .Lrounds_16_xx + + mov $_ctx,$ctx + add $a1,$A # modulo-scheduled h+=Sigma0(a) + lea 16*$SZ($inp),$inp + + add $SZ*0($ctx),$A + add $SZ*1($ctx),$B + add $SZ*2($ctx),$C + add $SZ*3($ctx),$D + add $SZ*4($ctx),$E + add $SZ*5($ctx),$F + add $SZ*6($ctx),$G + add $SZ*7($ctx),$H + + cmp $_end,$inp + + mov $A,$SZ*0($ctx) + mov $B,$SZ*1($ctx) + mov $C,$SZ*2($ctx) + mov $D,$SZ*3($ctx) + mov $E,$SZ*4($ctx) + mov $F,$SZ*5($ctx) + mov $G,$SZ*6($ctx) + mov $H,$SZ*7($ctx) + jb .Lloop + + mov $_rsp,%rsi +.cfi_def_cfa %rsi,8 + mov -48(%rsi),%r15 +.cfi_restore %r15 + mov -40(%rsi),%r14 +.cfi_restore %r14 + mov -32(%rsi),%r13 +.cfi_restore %r13 + mov -24(%rsi),%r12 +.cfi_restore %r12 + mov -16(%rsi),%rbp +.cfi_restore %rbp + mov -8(%rsi),%rbx +.cfi_restore %rbx + lea (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lepilogue: + ret +.cfi_endproc +.size ${func}_nohw,.-${func}_nohw +___ + +if ($SZ==4) { +$code.=<<___; +.section .rodata +.align 64 +.type $TABLE,\@object +$TABLE: + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 
0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + + .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f + .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f + .long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff + .long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff + .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 + .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 + .asciz "SHA256 block transform for x86_64, CRYPTOGAMS by " +.text +___ +} else { +$code.=<<___; +.section .rodata +.align 64 +.type $TABLE,\@object +$TABLE: + .quad 0x428a2f98d728ae22,0x7137449123ef65cd + .quad 0x428a2f98d728ae22,0x7137449123ef65cd + .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc + .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc + .quad 0x3956c25bf348b538,0x59f111f1b605d019 + .quad 0x3956c25bf348b538,0x59f111f1b605d019 + .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 + .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 + .quad 0xd807aa98a3030242,0x12835b0145706fbe + .quad 0xd807aa98a3030242,0x12835b0145706fbe + .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 + .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 + .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 + .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 + .quad 0x9bdc06a725c71235,0xc19bf174cf692694 + .quad 0x9bdc06a725c71235,0xc19bf174cf692694 + .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 + .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 + .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 + .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 + .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 + .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 + .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 + .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 + .quad 0x983e5152ee66dfab,0xa831c66d2db43210 + .quad 0x983e5152ee66dfab,0xa831c66d2db43210 + .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 + .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 + .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 + .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 + .quad 0x06ca6351e003826f,0x142929670a0e6e70 + .quad 0x06ca6351e003826f,0x142929670a0e6e70 + .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 + .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 + .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df + .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df + .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 + .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 + .quad 0x81c2c92e47edaee6,0x92722c851482353b + .quad 0x81c2c92e47edaee6,0x92722c851482353b + .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 + .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 + .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 + .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 + .quad 0xd192e819d6ef5218,0xd69906245565a910 + .quad 0xd192e819d6ef5218,0xd69906245565a910 + .quad 0xf40e35855771202a,0x106aa07032bbd1b8 + .quad 0xf40e35855771202a,0x106aa07032bbd1b8 + .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 + .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 + .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 + .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 + .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb + .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb + .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 + .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 + .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 + .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 + .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec + .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec + .quad 0x90befffa23631e28,0xa4506cebde82bde9 + .quad 
0x90befffa23631e28,0xa4506cebde82bde9 + .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b + .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b + .quad 0xca273eceea26619c,0xd186b8c721c0c207 + .quad 0xca273eceea26619c,0xd186b8c721c0c207 + .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 + .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 + .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 + .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 + .quad 0x113f9804bef90dae,0x1b710b35131c471b + .quad 0x113f9804bef90dae,0x1b710b35131c471b + .quad 0x28db77f523047d84,0x32caab7b40c72493 + .quad 0x28db77f523047d84,0x32caab7b40c72493 + .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c + .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c + .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a + .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a + .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 + .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 + + .quad 0x0001020304050607,0x08090a0b0c0d0e0f + .quad 0x0001020304050607,0x08090a0b0c0d0e0f + .asciz "SHA512 block transform for x86_64, CRYPTOGAMS by " +.text +___ +} + +###################################################################### +# SIMD code paths +# +if ($SZ==4 && $shaext) {{{ +###################################################################### +# Intel SHA Extensions implementation of SHA256 update function. +# +my ($ctx,$inp,$num,$Tbl)=("%rdi","%rsi","%rdx","%rcx"); + +my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..2,7..10)); +my @MSG=map("%xmm$_",(3..6)); + +$code.=<<___; +.globl sha256_block_data_order_hw +.type sha256_block_data_order_hw,\@function,3 +.align 64 +sha256_block_data_order_hw: +.cfi_startproc + _CET_ENDBR +___ +$code.=<<___ if ($win64); + lea `-8-5*16`(%rsp),%rsp + movaps %xmm6,-8-5*16(%rax) + movaps %xmm7,-8-4*16(%rax) + movaps %xmm8,-8-3*16(%rax) + movaps %xmm9,-8-2*16(%rax) + movaps %xmm10,-8-1*16(%rax) +.Lprologue_shaext: +___ +$code.=<<___; + lea K256+0x80(%rip),$Tbl + movdqu ($ctx),$ABEF # DCBA + movdqu 16($ctx),$CDGH # HGFE + movdqa 0x200-0x80($Tbl),$TMP # byte swap mask + + pshufd \$0x1b,$ABEF,$Wi # ABCD + pshufd \$0xb1,$ABEF,$ABEF # CDAB + pshufd \$0x1b,$CDGH,$CDGH # EFGH + movdqa $TMP,$BSWAP # offload + palignr \$8,$CDGH,$ABEF # ABEF + punpcklqdq $Wi,$CDGH # CDGH + jmp .Loop_shaext + +.align 16 +.Loop_shaext: + movdqu ($inp),@MSG[0] + movdqu 0x10($inp),@MSG[1] + movdqu 0x20($inp),@MSG[2] + pshufb $TMP,@MSG[0] + movdqu 0x30($inp),@MSG[3] + + movdqa 0*32-0x80($Tbl),$Wi + paddd @MSG[0],$Wi + pshufb $TMP,@MSG[1] + movdqa $CDGH,$CDGH_SAVE # offload + sha256rnds2 $ABEF,$CDGH # 0-3 + pshufd \$0x0e,$Wi,$Wi + nop + movdqa $ABEF,$ABEF_SAVE # offload + sha256rnds2 $CDGH,$ABEF + + movdqa 1*32-0x80($Tbl),$Wi + paddd @MSG[1],$Wi + pshufb $TMP,@MSG[2] + sha256rnds2 $ABEF,$CDGH # 4-7 + pshufd \$0x0e,$Wi,$Wi + lea 0x40($inp),$inp + sha256msg1 @MSG[1],@MSG[0] + sha256rnds2 $CDGH,$ABEF + + movdqa 2*32-0x80($Tbl),$Wi + paddd @MSG[2],$Wi + pshufb $TMP,@MSG[3] + sha256rnds2 $ABEF,$CDGH # 8-11 + pshufd \$0x0e,$Wi,$Wi + movdqa @MSG[3],$TMP + palignr \$4,@MSG[2],$TMP + nop + paddd $TMP,@MSG[0] + sha256msg1 @MSG[2],@MSG[1] + sha256rnds2 $CDGH,$ABEF + + movdqa 3*32-0x80($Tbl),$Wi + paddd @MSG[3],$Wi + sha256msg2 @MSG[3],@MSG[0] + sha256rnds2 $ABEF,$CDGH # 12-15 + pshufd \$0x0e,$Wi,$Wi + movdqa @MSG[0],$TMP + palignr \$4,@MSG[3],$TMP + nop + paddd $TMP,@MSG[1] + sha256msg1 @MSG[3],@MSG[2] + sha256rnds2 $CDGH,$ABEF +___ +for($i=4;$i<16-3;$i++) { +$code.=<<___; + movdqa $i*32-0x80($Tbl),$Wi + paddd @MSG[0],$Wi + sha256msg2 @MSG[0],@MSG[1] + sha256rnds2 $ABEF,$CDGH # 16-19... 
+ pshufd \$0x0e,$Wi,$Wi + movdqa @MSG[1],$TMP + palignr \$4,@MSG[0],$TMP + nop + paddd $TMP,@MSG[2] + sha256msg1 @MSG[0],@MSG[3] + sha256rnds2 $CDGH,$ABEF +___ + push(@MSG,shift(@MSG)); +} +$code.=<<___; + movdqa 13*32-0x80($Tbl),$Wi + paddd @MSG[0],$Wi + sha256msg2 @MSG[0],@MSG[1] + sha256rnds2 $ABEF,$CDGH # 52-55 + pshufd \$0x0e,$Wi,$Wi + movdqa @MSG[1],$TMP + palignr \$4,@MSG[0],$TMP + sha256rnds2 $CDGH,$ABEF + paddd $TMP,@MSG[2] + + movdqa 14*32-0x80($Tbl),$Wi + paddd @MSG[1],$Wi + sha256rnds2 $ABEF,$CDGH # 56-59 + pshufd \$0x0e,$Wi,$Wi + sha256msg2 @MSG[1],@MSG[2] + movdqa $BSWAP,$TMP + sha256rnds2 $CDGH,$ABEF + + movdqa 15*32-0x80($Tbl),$Wi + paddd @MSG[2],$Wi + nop + sha256rnds2 $ABEF,$CDGH # 60-63 + pshufd \$0x0e,$Wi,$Wi + dec $num + nop + sha256rnds2 $CDGH,$ABEF + + paddd $CDGH_SAVE,$CDGH + paddd $ABEF_SAVE,$ABEF + jnz .Loop_shaext + + pshufd \$0xb1,$CDGH,$CDGH # DCHG + pshufd \$0x1b,$ABEF,$TMP # FEBA + pshufd \$0xb1,$ABEF,$ABEF # BAFE + punpckhqdq $CDGH,$ABEF # DCBA + palignr \$8,$TMP,$CDGH # HGFE + + movdqu $ABEF,($ctx) + movdqu $CDGH,16($ctx) +___ +$code.=<<___ if ($win64); + movaps -8-5*16(%rax),%xmm6 + movaps -8-4*16(%rax),%xmm7 + movaps -8-3*16(%rax),%xmm8 + movaps -8-2*16(%rax),%xmm9 + movaps -8-1*16(%rax),%xmm10 + mov %rax,%rsp +.Lepilogue_shaext: +___ +$code.=<<___; + ret +.cfi_endproc +.size sha256_block_data_order_hw,.-sha256_block_data_order_hw +___ +}}} +{{{ + +my $a4=$T1; +my ($a,$b,$c,$d,$e,$f,$g,$h); + +sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm +{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; + my $arg = pop; + $arg = "\$$arg" if ($arg*1 eq $arg); + $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n"; +} + +sub body_00_15 () { + ( + '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'. + + '&ror ($a0,$Sigma1[2]-$Sigma1[1])', + '&mov ($a,$a1)', + '&mov ($a4,$f)', + + '&ror ($a1,$Sigma0[2]-$Sigma0[1])', + '&xor ($a0,$e)', + '&xor ($a4,$g)', # f^g + + '&ror ($a0,$Sigma1[1]-$Sigma1[0])', + '&xor ($a1,$a)', + '&and ($a4,$e)', # (f^g)&e + + '&xor ($a0,$e)', + '&add ($h,$SZ*($i&15)."(%rsp)")', # h+=X[i]+K[i] + '&mov ($a2,$a)', + + '&xor ($a4,$g)', # Ch(e,f,g)=((f^g)&e)^g + '&ror ($a1,$Sigma0[1]-$Sigma0[0])', + '&xor ($a2,$b)', # a^b, b^c in next round + + '&add ($h,$a4)', # h+=Ch(e,f,g) + '&ror ($a0,$Sigma1[0])', # Sigma1(e) + '&and ($a3,$a2)', # (b^c)&(a^b) + + '&xor ($a1,$a)', + '&add ($h,$a0)', # h+=Sigma1(e) + '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b) + + '&ror ($a1,$Sigma0[0])', # Sigma0(a) + '&add ($d,$h)', # d+=h + '&add ($h,$a3)', # h+=Maj(a,b,c) + + '&mov ($a0,$d)', + '&add ($a1,$h);'. 
# h+=Sigma0(a) + '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;' + ); +} + +###################################################################### +# SSSE3 code path +# +if ($SZ==4) { # SHA256 only +my @X = map("%xmm$_",(0..3)); +my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9)); + +$code.=<<___; +.globl ${func}_ssse3 +.type ${func}_ssse3,\@function,3 +.align 64 +${func}_ssse3: +.cfi_startproc + _CET_ENDBR + mov %rsp,%rax # copy %rsp +.cfi_def_cfa_register %rax + push %rbx +.cfi_push %rbx + push %rbp +.cfi_push %rbp + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + shl \$4,%rdx # num*16 + sub \$`$framesz+$win64*16*4`,%rsp + lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ + and \$-64,%rsp # align stack frame + mov $ctx,$_ctx # save ctx, 1st arg + mov $inp,$_inp # save inp, 2nd arh + mov %rdx,$_end # save end pointer, "3rd" arg + mov %rax,$_rsp # save copy of %rsp +.cfi_cfa_expression $_rsp,deref,+8 +___ +$code.=<<___ if ($win64); + movaps %xmm6,16*$SZ+32(%rsp) + movaps %xmm7,16*$SZ+48(%rsp) + movaps %xmm8,16*$SZ+64(%rsp) + movaps %xmm9,16*$SZ+80(%rsp) +___ +$code.=<<___; +.Lprologue_ssse3: + + mov $SZ*0($ctx),$A + mov $SZ*1($ctx),$B + mov $SZ*2($ctx),$C + mov $SZ*3($ctx),$D + mov $SZ*4($ctx),$E + mov $SZ*5($ctx),$F + mov $SZ*6($ctx),$G + mov $SZ*7($ctx),$H +___ + +$code.=<<___; + #movdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4 + #movdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5 + jmp .Lloop_ssse3 +.align 16 +.Lloop_ssse3: + movdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 + movdqu 0x00($inp),@X[0] + movdqu 0x10($inp),@X[1] + movdqu 0x20($inp),@X[2] + pshufb $t3,@X[0] + movdqu 0x30($inp),@X[3] + lea $TABLE(%rip),$Tbl + pshufb $t3,@X[1] + movdqa 0x00($Tbl),$t0 + movdqa 0x20($Tbl),$t1 + pshufb $t3,@X[2] + paddd @X[0],$t0 + movdqa 0x40($Tbl),$t2 + pshufb $t3,@X[3] + movdqa 0x60($Tbl),$t3 + paddd @X[1],$t1 + paddd @X[2],$t2 + paddd @X[3],$t3 + movdqa $t0,0x00(%rsp) + mov $A,$a1 + movdqa $t1,0x10(%rsp) + mov $B,$a3 + movdqa $t2,0x20(%rsp) + xor $C,$a3 # magic + movdqa $t3,0x30(%rsp) + mov $E,$a0 + jmp .Lssse3_00_47 + +.align 16 +.Lssse3_00_47: + sub \$`-16*2*$SZ`,$Tbl # size optimization +___ +sub Xupdate_256_SSSE3 () { + ( + '&movdqa ($t0,@X[1]);', + '&movdqa ($t3,@X[3])', + '&palignr ($t0,@X[0],$SZ)', # X[1..4] + '&palignr ($t3,@X[2],$SZ);', # X[9..12] + '&movdqa ($t1,$t0)', + '&movdqa ($t2,$t0);', + '&psrld ($t0,$sigma0[2])', + '&paddd (@X[0],$t3);', # X[0..3] += X[9..12] + '&psrld ($t2,$sigma0[0])', + '&pshufd ($t3,@X[3],0b11111010)',# X[14..15] + '&pslld ($t1,8*$SZ-$sigma0[1]);'. + '&pxor ($t0,$t2)', + '&psrld ($t2,$sigma0[1]-$sigma0[0]);'. + '&pxor ($t0,$t1)', + '&pslld ($t1,$sigma0[1]-$sigma0[0]);'. 
+ '&pxor ($t0,$t2);', + '&movdqa ($t2,$t3)', + '&pxor ($t0,$t1);', # sigma0(X[1..4]) + '&psrld ($t3,$sigma1[2])', + '&paddd (@X[0],$t0);', # X[0..3] += sigma0(X[1..4]) + '&psrlq ($t2,$sigma1[0])', + '&pxor ($t3,$t2);', + '&psrlq ($t2,$sigma1[1]-$sigma1[0])', + '&pxor ($t3,$t2)', + '&pshufb ($t3,$t4)', # sigma1(X[14..15]) + '&paddd (@X[0],$t3)', # X[0..1] += sigma1(X[14..15]) + '&pshufd ($t3,@X[0],0b01010000)',# X[16..17] + '&movdqa ($t2,$t3);', + '&psrld ($t3,$sigma1[2])', + '&psrlq ($t2,$sigma1[0])', + '&pxor ($t3,$t2);', + '&psrlq ($t2,$sigma1[1]-$sigma1[0])', + '&pxor ($t3,$t2);', + '&movdqa ($t2,16*2*$j."($Tbl)")', + '&pshufb ($t3,$t5)', + '&paddd (@X[0],$t3)' # X[2..3] += sigma1(X[16..17]) + ); +} + +sub SSSE3_256_00_47 () { +my $j = shift; +my $body = shift; +my @X = @_; +my @insns = (&$body,&$body,&$body,&$body); # 104 instructions + + if (0) { + foreach (Xupdate_256_SSSE3()) { # 36 instructions + eval; + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + } + } else { # squeeze extra 4% on Westmere and 19% on Atom + eval(shift(@insns)); #@ + &movdqa ($t0,@X[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &movdqa ($t3,@X[3]); + eval(shift(@insns)); #@ + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); #@ + eval(shift(@insns)); + &palignr ($t0,@X[0],$SZ); # X[1..4] + eval(shift(@insns)); + eval(shift(@insns)); + &palignr ($t3,@X[2],$SZ); # X[9..12] + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); #@ + &movdqa ($t1,$t0); + eval(shift(@insns)); + eval(shift(@insns)); + &movdqa ($t2,$t0); + eval(shift(@insns)); #@ + eval(shift(@insns)); + &psrld ($t0,$sigma0[2]); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &paddd (@X[0],$t3); # X[0..3] += X[9..12] + eval(shift(@insns)); #@ + eval(shift(@insns)); + &psrld ($t2,$sigma0[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &pshufd ($t3,@X[3],0b11111010); # X[4..15] + eval(shift(@insns)); + eval(shift(@insns)); #@ + &pslld ($t1,8*$SZ-$sigma0[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &pxor ($t0,$t2); + eval(shift(@insns)); #@ + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); #@ + &psrld ($t2,$sigma0[1]-$sigma0[0]); + eval(shift(@insns)); + &pxor ($t0,$t1); + eval(shift(@insns)); + eval(shift(@insns)); + &pslld ($t1,$sigma0[1]-$sigma0[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &pxor ($t0,$t2); + eval(shift(@insns)); + eval(shift(@insns)); #@ + &movdqa ($t2,$t3); + eval(shift(@insns)); + eval(shift(@insns)); + &pxor ($t0,$t1); # sigma0(X[1..4]) + eval(shift(@insns)); #@ + eval(shift(@insns)); + eval(shift(@insns)); + &psrld ($t3,$sigma1[2]); + eval(shift(@insns)); + eval(shift(@insns)); + &paddd (@X[0],$t0); # X[0..3] += sigma0(X[1..4]) + eval(shift(@insns)); #@ + eval(shift(@insns)); + &psrlq ($t2,$sigma1[0]); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &pxor ($t3,$t2); + eval(shift(@insns)); #@ + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); #@ + &psrlq ($t2,$sigma1[1]-$sigma1[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &pxor ($t3,$t2); + eval(shift(@insns)); #@ + eval(shift(@insns)); + eval(shift(@insns)); + #&pshufb ($t3,$t4); # sigma1(X[14..15]) + &pshufd ($t3,$t3,0b10000000); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &psrldq ($t3,8); + eval(shift(@insns)); + eval(shift(@insns)); #@ + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); #@ + &paddd (@X[0],$t3); # X[0..1] += sigma1(X[14..15]) + 
eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &pshufd ($t3,@X[0],0b01010000); # X[16..17] + eval(shift(@insns)); + eval(shift(@insns)); #@ + eval(shift(@insns)); + &movdqa ($t2,$t3); + eval(shift(@insns)); + eval(shift(@insns)); + &psrld ($t3,$sigma1[2]); + eval(shift(@insns)); + eval(shift(@insns)); #@ + &psrlq ($t2,$sigma1[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &pxor ($t3,$t2); + eval(shift(@insns)); #@ + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); #@ + eval(shift(@insns)); + &psrlq ($t2,$sigma1[1]-$sigma1[0]); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &pxor ($t3,$t2); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); #@ + #&pshufb ($t3,$t5); + &pshufd ($t3,$t3,0b00001000); + eval(shift(@insns)); + eval(shift(@insns)); + &movdqa ($t2,16*2*$j."($Tbl)"); + eval(shift(@insns)); #@ + eval(shift(@insns)); + &pslldq ($t3,8); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &paddd (@X[0],$t3); # X[2..3] += sigma1(X[16..17]) + eval(shift(@insns)); #@ + eval(shift(@insns)); + eval(shift(@insns)); + } + &paddd ($t2,@X[0]); + foreach (@insns) { eval; } # remaining instructions + &movdqa (16*$j."(%rsp)",$t2); +} + + for ($i=0,$j=0; $j<4; $j++) { + &SSSE3_256_00_47($j,\&body_00_15,@X); + push(@X,shift(@X)); # rotate(@X) + } + &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0); + &jne (".Lssse3_00_47"); + + for ($i=0; $i<16; ) { + foreach(body_00_15()) { eval; } + } +$code.=<<___; + mov $_ctx,$ctx + mov $a1,$A + + add $SZ*0($ctx),$A + lea 16*$SZ($inp),$inp + add $SZ*1($ctx),$B + add $SZ*2($ctx),$C + add $SZ*3($ctx),$D + add $SZ*4($ctx),$E + add $SZ*5($ctx),$F + add $SZ*6($ctx),$G + add $SZ*7($ctx),$H + + cmp $_end,$inp + + mov $A,$SZ*0($ctx) + mov $B,$SZ*1($ctx) + mov $C,$SZ*2($ctx) + mov $D,$SZ*3($ctx) + mov $E,$SZ*4($ctx) + mov $F,$SZ*5($ctx) + mov $G,$SZ*6($ctx) + mov $H,$SZ*7($ctx) + jb .Lloop_ssse3 + + mov $_rsp,%rsi +.cfi_def_cfa %rsi,8 +___ +$code.=<<___ if ($win64); + movaps 16*$SZ+32(%rsp),%xmm6 + movaps 16*$SZ+48(%rsp),%xmm7 + movaps 16*$SZ+64(%rsp),%xmm8 + movaps 16*$SZ+80(%rsp),%xmm9 +___ +$code.=<<___; + mov -48(%rsi),%r15 +.cfi_restore %r15 + mov -40(%rsi),%r14 +.cfi_restore %r14 + mov -32(%rsi),%r13 +.cfi_restore %r13 + mov -24(%rsi),%r12 +.cfi_restore %r12 + mov -16(%rsi),%rbp +.cfi_restore %rbp + mov -8(%rsi),%rbx +.cfi_restore %rbx + lea (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lepilogue_ssse3: + ret +.cfi_endproc +.size ${func}_ssse3,.-${func}_ssse3 +___ +} + +if ($avx) {{ +###################################################################### +# AVX+shrd code path +# +local *ror = sub { &shrd(@_[0],@_) }; + +$code.=<<___; +.globl ${func}_avx +.type ${func}_avx,\@function,3 +.align 64 +${func}_avx: +.cfi_startproc + _CET_ENDBR + mov %rsp,%rax # copy %rsp +.cfi_def_cfa_register %rax + push %rbx +.cfi_push %rbx + push %rbp +.cfi_push %rbp + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + shl \$4,%rdx # num*16 + sub \$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp + lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ + and \$-64,%rsp # align stack frame + mov $ctx,$_ctx # save ctx, 1st arg + mov $inp,$_inp # save inp, 2nd arh + mov %rdx,$_end # save end pointer, "3rd" arg + mov %rax,$_rsp # save copy of %rsp +.cfi_cfa_expression $_rsp,deref,+8 +___ +$code.=<<___ if ($win64); + movaps %xmm6,16*$SZ+32(%rsp) + movaps %xmm7,16*$SZ+48(%rsp) + movaps %xmm8,16*$SZ+64(%rsp) + movaps %xmm9,16*$SZ+80(%rsp) +___ +$code.=<<___ if ($win64 && 
$SZ>4); + movaps %xmm10,16*$SZ+96(%rsp) + movaps %xmm11,16*$SZ+112(%rsp) +___ +$code.=<<___; +.Lprologue_avx: + + vzeroupper + mov $SZ*0($ctx),$A + mov $SZ*1($ctx),$B + mov $SZ*2($ctx),$C + mov $SZ*3($ctx),$D + mov $SZ*4($ctx),$E + mov $SZ*5($ctx),$F + mov $SZ*6($ctx),$G + mov $SZ*7($ctx),$H +___ + if ($SZ==4) { # SHA256 + my @X = map("%xmm$_",(0..3)); + my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9)); + +$code.=<<___; + vmovdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4 + vmovdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5 + jmp .Lloop_avx +.align 16 +.Lloop_avx: + vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 + vmovdqu 0x00($inp),@X[0] + vmovdqu 0x10($inp),@X[1] + vmovdqu 0x20($inp),@X[2] + vmovdqu 0x30($inp),@X[3] + vpshufb $t3,@X[0],@X[0] + lea $TABLE(%rip),$Tbl + vpshufb $t3,@X[1],@X[1] + vpshufb $t3,@X[2],@X[2] + vpaddd 0x00($Tbl),@X[0],$t0 + vpshufb $t3,@X[3],@X[3] + vpaddd 0x20($Tbl),@X[1],$t1 + vpaddd 0x40($Tbl),@X[2],$t2 + vpaddd 0x60($Tbl),@X[3],$t3 + vmovdqa $t0,0x00(%rsp) + mov $A,$a1 + vmovdqa $t1,0x10(%rsp) + mov $B,$a3 + vmovdqa $t2,0x20(%rsp) + xor $C,$a3 # magic + vmovdqa $t3,0x30(%rsp) + mov $E,$a0 + jmp .Lavx_00_47 + +.align 16 +.Lavx_00_47: + sub \$`-16*2*$SZ`,$Tbl # size optimization +___ +sub Xupdate_256_AVX () { + ( + '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..4] + '&vpalignr ($t3,@X[3],@X[2],$SZ)', # X[9..12] + '&vpsrld ($t2,$t0,$sigma0[0]);', + '&vpaddd (@X[0],@X[0],$t3)', # X[0..3] += X[9..12] + '&vpsrld ($t3,$t0,$sigma0[2])', + '&vpslld ($t1,$t0,8*$SZ-$sigma0[1]);', + '&vpxor ($t0,$t3,$t2)', + '&vpshufd ($t3,@X[3],0b11111010)',# X[14..15] + '&vpsrld ($t2,$t2,$sigma0[1]-$sigma0[0]);', + '&vpxor ($t0,$t0,$t1)', + '&vpslld ($t1,$t1,$sigma0[1]-$sigma0[0]);', + '&vpxor ($t0,$t0,$t2)', + '&vpsrld ($t2,$t3,$sigma1[2]);', + '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..4]) + '&vpsrlq ($t3,$t3,$sigma1[0]);', + '&vpaddd (@X[0],@X[0],$t0)', # X[0..3] += sigma0(X[1..4]) + '&vpxor ($t2,$t2,$t3);', + '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])', + '&vpxor ($t2,$t2,$t3)', + '&vpshufb ($t2,$t2,$t4)', # sigma1(X[14..15]) + '&vpaddd (@X[0],@X[0],$t2)', # X[0..1] += sigma1(X[14..15]) + '&vpshufd ($t3,@X[0],0b01010000)',# X[16..17] + '&vpsrld ($t2,$t3,$sigma1[2])', + '&vpsrlq ($t3,$t3,$sigma1[0])', + '&vpxor ($t2,$t2,$t3);', + '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])', + '&vpxor ($t2,$t2,$t3)', + '&vpshufb ($t2,$t2,$t5)', + '&vpaddd (@X[0],@X[0],$t2)' # X[2..3] += sigma1(X[16..17]) + ); +} + +sub AVX_256_00_47 () { +my $j = shift; +my $body = shift; +my @X = @_; +my @insns = (&$body,&$body,&$body,&$body); # 104 instructions + + foreach (Xupdate_256_AVX()) { # 29 instructions + eval; + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + } + &vpaddd ($t2,@X[0],16*2*$j."($Tbl)"); + foreach (@insns) { eval; } # remaining instructions + &vmovdqa (16*$j."(%rsp)",$t2); +} + + for ($i=0,$j=0; $j<4; $j++) { + &AVX_256_00_47($j,\&body_00_15,@X); + push(@X,shift(@X)); # rotate(@X) + } + &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0); + &jne (".Lavx_00_47"); + + for ($i=0; $i<16; ) { + foreach(body_00_15()) { eval; } + } + + } else { # SHA512 + my @X = map("%xmm$_",(0..7)); + my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11)); + +$code.=<<___; + jmp .Lloop_avx +.align 16 +.Lloop_avx: + vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 + vmovdqu 0x00($inp),@X[0] + lea $TABLE+0x80(%rip),$Tbl # size optimization + vmovdqu 0x10($inp),@X[1] + vmovdqu 0x20($inp),@X[2] + vpshufb $t3,@X[0],@X[0] + vmovdqu 0x30($inp),@X[3] + vpshufb $t3,@X[1],@X[1] + vmovdqu 0x40($inp),@X[4] + vpshufb $t3,@X[2],@X[2] + vmovdqu 0x50($inp),@X[5] + vpshufb 
$t3,@X[3],@X[3] + vmovdqu 0x60($inp),@X[6] + vpshufb $t3,@X[4],@X[4] + vmovdqu 0x70($inp),@X[7] + vpshufb $t3,@X[5],@X[5] + vpaddq -0x80($Tbl),@X[0],$t0 + vpshufb $t3,@X[6],@X[6] + vpaddq -0x60($Tbl),@X[1],$t1 + vpshufb $t3,@X[7],@X[7] + vpaddq -0x40($Tbl),@X[2],$t2 + vpaddq -0x20($Tbl),@X[3],$t3 + vmovdqa $t0,0x00(%rsp) + vpaddq 0x00($Tbl),@X[4],$t0 + vmovdqa $t1,0x10(%rsp) + vpaddq 0x20($Tbl),@X[5],$t1 + vmovdqa $t2,0x20(%rsp) + vpaddq 0x40($Tbl),@X[6],$t2 + vmovdqa $t3,0x30(%rsp) + vpaddq 0x60($Tbl),@X[7],$t3 + vmovdqa $t0,0x40(%rsp) + mov $A,$a1 + vmovdqa $t1,0x50(%rsp) + mov $B,$a3 + vmovdqa $t2,0x60(%rsp) + xor $C,$a3 # magic + vmovdqa $t3,0x70(%rsp) + mov $E,$a0 + jmp .Lavx_00_47 + +.align 16 +.Lavx_00_47: + add \$`16*2*$SZ`,$Tbl +___ +sub Xupdate_512_AVX () { + ( + '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..2] + '&vpalignr ($t3,@X[5],@X[4],$SZ)', # X[9..10] + '&vpsrlq ($t2,$t0,$sigma0[0])', + '&vpaddq (@X[0],@X[0],$t3);', # X[0..1] += X[9..10] + '&vpsrlq ($t3,$t0,$sigma0[2])', + '&vpsllq ($t1,$t0,8*$SZ-$sigma0[1]);', + '&vpxor ($t0,$t3,$t2)', + '&vpsrlq ($t2,$t2,$sigma0[1]-$sigma0[0]);', + '&vpxor ($t0,$t0,$t1)', + '&vpsllq ($t1,$t1,$sigma0[1]-$sigma0[0]);', + '&vpxor ($t0,$t0,$t2)', + '&vpsrlq ($t3,@X[7],$sigma1[2]);', + '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..2]) + '&vpsllq ($t2,@X[7],8*$SZ-$sigma1[1]);', + '&vpaddq (@X[0],@X[0],$t0)', # X[0..1] += sigma0(X[1..2]) + '&vpsrlq ($t1,@X[7],$sigma1[0]);', + '&vpxor ($t3,$t3,$t2)', + '&vpsllq ($t2,$t2,$sigma1[1]-$sigma1[0]);', + '&vpxor ($t3,$t3,$t1)', + '&vpsrlq ($t1,$t1,$sigma1[1]-$sigma1[0]);', + '&vpxor ($t3,$t3,$t2)', + '&vpxor ($t3,$t3,$t1)', # sigma1(X[14..15]) + '&vpaddq (@X[0],@X[0],$t3)', # X[0..1] += sigma1(X[14..15]) + ); +} + +sub AVX_512_00_47 () { +my $j = shift; +my $body = shift; +my @X = @_; +my @insns = (&$body,&$body); # 52 instructions + + foreach (Xupdate_512_AVX()) { # 23 instructions + eval; + eval(shift(@insns)); + eval(shift(@insns)); + } + &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)"); + foreach (@insns) { eval; } # remaining instructions + &vmovdqa (16*$j."(%rsp)",$t2); +} + + for ($i=0,$j=0; $j<8; $j++) { + &AVX_512_00_47($j,\&body_00_15,@X); + push(@X,shift(@X)); # rotate(@X) + } + &cmpb ($SZ-1+16*2*$SZ-0x80."($Tbl)",0); + &jne (".Lavx_00_47"); + + for ($i=0; $i<16; ) { + foreach(body_00_15()) { eval; } + } +} +$code.=<<___; + mov $_ctx,$ctx + mov $a1,$A + + add $SZ*0($ctx),$A + lea 16*$SZ($inp),$inp + add $SZ*1($ctx),$B + add $SZ*2($ctx),$C + add $SZ*3($ctx),$D + add $SZ*4($ctx),$E + add $SZ*5($ctx),$F + add $SZ*6($ctx),$G + add $SZ*7($ctx),$H + + cmp $_end,$inp + + mov $A,$SZ*0($ctx) + mov $B,$SZ*1($ctx) + mov $C,$SZ*2($ctx) + mov $D,$SZ*3($ctx) + mov $E,$SZ*4($ctx) + mov $F,$SZ*5($ctx) + mov $G,$SZ*6($ctx) + mov $H,$SZ*7($ctx) + jb .Lloop_avx + + mov $_rsp,%rsi +.cfi_def_cfa %rsi,8 + vzeroupper +___ +$code.=<<___ if ($win64); + movaps 16*$SZ+32(%rsp),%xmm6 + movaps 16*$SZ+48(%rsp),%xmm7 + movaps 16*$SZ+64(%rsp),%xmm8 + movaps 16*$SZ+80(%rsp),%xmm9 +___ +$code.=<<___ if ($win64 && $SZ>4); + movaps 16*$SZ+96(%rsp),%xmm10 + movaps 16*$SZ+112(%rsp),%xmm11 +___ +$code.=<<___; + mov -48(%rsi),%r15 +.cfi_restore %r15 + mov -40(%rsi),%r14 +.cfi_restore %r14 + mov -32(%rsi),%r13 +.cfi_restore %r13 + mov -24(%rsi),%r12 +.cfi_restore %r12 + mov -16(%rsi),%rbp +.cfi_restore %rbp + mov -8(%rsi),%rbx +.cfi_restore %rbx + lea (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lepilogue_avx: + ret +.cfi_endproc +.size ${func}_avx,.-${func}_avx +___ + +}}}}} + +# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, +# 
CONTEXT *context,DISPATCHER_CONTEXT *disp) +if ($win64) { +$rec="%rcx"; +$frame="%rdx"; +$context="%r8"; +$disp="%r9"; + +$code.=<<___; +.extern __imp_RtlVirtualUnwind +.type se_handler,\@abi-omnipotent +.align 16 +se_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 120($context),%rax # pull context->Rax + mov 248($context),%rbx # pull context->Rip + + mov 8($disp),%rsi # disp->ImageBase + mov 56($disp),%r11 # disp->HanderlData + + mov 0(%r11),%r10d # HandlerData[0] + lea (%rsi,%r10),%r10 # prologue label + cmp %r10,%rbx # context->RipRsp + + mov 4(%r11),%r10d # HandlerData[1] + lea (%rsi,%r10),%r10 # epilogue label + cmp %r10,%rbx # context->Rip>=epilogue label + jae .Lin_prologue +___ +$code.=<<___; + mov %rax,%rsi # put aside Rsp + mov 16*$SZ+3*8(%rax),%rax # pull $_rsp + + mov -8(%rax),%rbx + mov -16(%rax),%rbp + mov -24(%rax),%r12 + mov -32(%rax),%r13 + mov -40(%rax),%r14 + mov -48(%rax),%r15 + mov %rbx,144($context) # restore context->Rbx + mov %rbp,160($context) # restore context->Rbp + mov %r12,216($context) # restore context->R12 + mov %r13,224($context) # restore context->R13 + mov %r14,232($context) # restore context->R14 + mov %r15,240($context) # restore context->R15 + + lea .Lepilogue(%rip),%r10 + cmp %r10,%rbx + jb .Lin_prologue # non-AVX code + + lea 16*$SZ+4*8(%rsi),%rsi # Xmm6- save area + lea 512($context),%rdi # &context.Xmm6 + mov \$`$SZ==4?8:12`,%ecx + .long 0xa548f3fc # cld; rep movsq + +.Lin_prologue: + mov 8(%rax),%rdi + mov 16(%rax),%rsi + mov %rax,152($context) # restore context->Rsp + mov %rsi,168($context) # restore context->Rsi + mov %rdi,176($context) # restore context->Rdi + + mov 40($disp),%rdi # disp->ContextRecord + mov $context,%rsi # context + mov \$154,%ecx # sizeof(CONTEXT) + .long 0xa548f3fc # cld; rep movsq + + mov $disp,%rsi + xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER + mov 8(%rsi),%rdx # arg2, disp->ImageBase + mov 0(%rsi),%r8 # arg3, disp->ControlPc + mov 16(%rsi),%r9 # arg4, disp->FunctionEntry + mov 40(%rsi),%r10 # disp->ContextRecord + lea 56(%rsi),%r11 # &disp->HandlerData + lea 24(%rsi),%r12 # &disp->EstablisherFrame + mov %r10,32(%rsp) # arg5 + mov %r11,40(%rsp) # arg6 + mov %r12,48(%rsp) # arg7 + mov %rcx,56(%rsp) # arg8, (NULL) + call *__imp_RtlVirtualUnwind(%rip) + + mov \$1,%eax # ExceptionContinueSearch + add \$64,%rsp + popfq + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + pop %rdi + pop %rsi + ret +.size se_handler,.-se_handler +___ + +$code.=<<___ if ($SZ==4 && $shaext); +.type shaext_handler,\@abi-omnipotent +.align 16 +shaext_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 120($context),%rax # pull context->Rax + mov 248($context),%rbx # pull context->Rip + + lea .Lprologue_shaext(%rip),%r10 + cmp %r10,%rbx # context->Rip<.Lprologue + jb .Lin_prologue + + lea .Lepilogue_shaext(%rip),%r10 + cmp %r10,%rbx # context->Rip>=.Lepilogue + jae .Lin_prologue + + lea -8-5*16(%rax),%rsi + lea 512($context),%rdi # &context.Xmm6 + mov \$10,%ecx + .long 0xa548f3fc # cld; rep movsq + + jmp .Lin_prologue +.size shaext_handler,.-shaext_handler +___ + +$code.=<<___; +.section .pdata +.align 4 + .rva .LSEH_begin_${func}_nohw + .rva .LSEH_end_${func}_nohw + .rva .LSEH_info_${func}_nohw +___ +$code.=<<___ if ($SZ==4 && $shaext); + .rva .LSEH_begin_${func}_hw + .rva .LSEH_end_${func}_hw + .rva .LSEH_info_${func}_hw +___ +$code.=<<___ if ($SZ==4); + .rva 
.LSEH_begin_${func}_ssse3 + .rva .LSEH_end_${func}_ssse3 + .rva .LSEH_info_${func}_ssse3 +___ +$code.=<<___ if ($avx); + .rva .LSEH_begin_${func}_avx + .rva .LSEH_end_${func}_avx + .rva .LSEH_info_${func}_avx +___ +$code.=<<___; +.section .xdata +.align 8 +.LSEH_info_${func}_nohw: + .byte 9,0,0,0 + .rva se_handler + .rva .Lprologue,.Lepilogue # HandlerData[] +___ +$code.=<<___ if ($SZ==4 && $shaext); +.LSEH_info_${func}_hw: + .byte 9,0,0,0 + .rva shaext_handler +___ +$code.=<<___ if ($SZ==4); +.LSEH_info_${func}_ssse3: + .byte 9,0,0,0 + .rva se_handler + .rva .Lprologue_ssse3,.Lepilogue_ssse3 # HandlerData[] +___ +$code.=<<___ if ($avx); +.LSEH_info_${func}_avx: + .byte 9,0,0,0 + .rva se_handler + .rva .Lprologue_avx,.Lepilogue_avx # HandlerData[] +___ +} + +sub sha256op38 { + my $instr = shift; + my %opcodelet = ( + "sha256rnds2" => 0xcb, + "sha256msg1" => 0xcc, + "sha256msg2" => 0xcd ); + + if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-7]),\s*%xmm([0-7])/) { + my @opcode=(0x0f,0x38); + push @opcode,$opcodelet{$instr}; + push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M + return ".byte\t".join(',',@opcode); + } else { + return $instr."\t".@_[0]; + } +} + +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/geo; + + s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo; + + print $_,"\n"; +} +close STDOUT or die "error closing STDOUT: $!"; diff --git a/ring-0.17.14/crypto/internal.h b/ring-0.17.14/crypto/internal.h new file mode 100644 index 0000000000..99223d1aca --- /dev/null +++ b/ring-0.17.14/crypto/internal.h @@ -0,0 +1,474 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef OPENSSL_HEADER_CRYPTO_INTERNAL_H +#define OPENSSL_HEADER_CRYPTO_INTERNAL_H + +#include // Must be first. + +#include "ring-core/check.h" + +#if defined(__clang__) +// Don't require prototypes for functions defined in C that are only +// used from Rust. +#pragma GCC diagnostic ignored "-Wmissing-prototypes" +#endif + +#if defined(__GNUC__) && \ + (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) < 40800 +// |alignas| and |alignof| were added in C11. GCC added support in version 4.8. +// Testing for __STDC_VERSION__/__cplusplus doesn't work because 4.7 already +// reports support for C11. 
+#define alignas(x) __attribute__ ((aligned (x))) +#elif defined(_MSC_VER) && !defined(__clang__) +#define alignas(x) __declspec(align(x)) +#else +#include +#endif + +#if defined(__clang__) || defined(__GNUC__) +#define RING_NOINLINE __attribute__((noinline)) +#elif defined(_MSC_VER) +#define RING_NOINLINE __declspec(noinline) +#else +#define RING_NOINLINE +#endif + +// Some C compilers require a useless cast when dealing with arrays for the +// reason explained in +// https://gustedt.wordpress.com/2011/02/12/const-and-arrays/ +#if defined(__clang__) || defined(_MSC_VER) +#define RING_CORE_POINTLESS_ARRAY_CONST_CAST(cast) +#else +#define RING_CORE_POINTLESS_ARRAY_CONST_CAST(cast) cast +#endif + +// `uint8_t` isn't guaranteed to be 'unsigned char' and only 'char' and +// 'unsigned char' are allowed to alias according to ISO C. +typedef unsigned char aliasing_uint8_t; + +#if (!defined(_MSC_VER) || defined(__clang__)) && defined(OPENSSL_64_BIT) +#define BORINGSSL_HAS_UINT128 +typedef __int128_t int128_t; +typedef __uint128_t uint128_t; +#endif + +// GCC-like compilers indicate SSE2 with |__SSE2__|. MSVC leaves the caller to +// know that x86_64 has SSE2, and uses _M_IX86_FP to indicate SSE2 on x86. +// https://learn.microsoft.com/en-us/cpp/preprocessor/predefined-macros?view=msvc-170 +#if defined(OPENSSL_X86) || defined(OPENSSL_X86_64) +# if defined(_MSC_VER) && !defined(__clang__) +# if defined(_M_AMD64) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2) +# define OPENSSL_SSE2 +# else +# error "SSE2 is required." +# endif +# elif !defined(__SSE2__) +# error "SSE2 is required." +# endif +#endif + +// For convenience in testing the fallback code, we allow disabling SSE2 +// intrinsics via |OPENSSL_NO_SSE2_FOR_TESTING|. We require SSE2 on x86 and +// x86_64, so we would otherwise need to test such code on a non-x86 platform. +// +// This does not remove the above requirement for SSE2 support with assembly +// optimizations. It only disables some intrinsics-based optimizations so that +// we can test the fallback code on CI. +#if defined(OPENSSL_SSE2) && defined(OPENSSL_NO_SSE2_FOR_TESTING) +#undef OPENSSL_SSE2 +#endif + +// Pointer utility functions. + +// buffers_alias returns one if |a| and |b| alias and zero otherwise. +static inline int buffers_alias(const void *a, size_t a_bytes, + const void *b, size_t b_bytes) { + // Cast |a| and |b| to integers. In C, pointer comparisons between unrelated + // objects are undefined whereas pointer to integer conversions are merely + // implementation-defined. We assume the implementation defined it in a sane + // way. + uintptr_t a_u = (uintptr_t)a; + uintptr_t b_u = (uintptr_t)b; + return a_u + a_bytes > b_u && b_u + b_bytes > a_u; +} + + +// Constant-time utility functions. +// +// The following methods return a bitmask of all ones (0xff...f) for true and 0 +// for false. This is useful for choosing a value based on the result of a +// conditional in constant time. 
For example, +// +// if (a < b) { +// c = a; +// } else { +// c = b; +// } +// +// can be written as +// +// crypto_word_t lt = constant_time_lt_w(a, b); +// c = constant_time_select_w(lt, a, b); + +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wconversion" +#pragma GCC diagnostic ignored "-Wsign-conversion" +#endif +#if defined(_MSC_VER) && !defined(__clang__) +#pragma warning(push) +// '=': conversion from 'crypto_word_t' to 'uint8_t', possible loss of data +#pragma warning(disable: 4242) +// 'initializing': conversion from 'crypto_word_t' to 'uint8_t', ... +#pragma warning(disable: 4244) +#endif + +// crypto_word_t is the type that most constant-time functions use. Ideally we +// would like it to be |size_t|, but NaCl builds in 64-bit mode with 32-bit +// pointers, which means that |size_t| can be 32 bits when |BN_ULONG| is 64 +// bits. Since we want to be able to do constant-time operations on a +// |BN_ULONG|, |crypto_word_t| is defined as an unsigned value with the native +// word length. +#if defined(OPENSSL_64_BIT) +typedef uint64_t crypto_word_t; +#define CRYPTO_WORD_BITS (64u) +#elif defined(OPENSSL_32_BIT) +typedef uint32_t crypto_word_t; +#define CRYPTO_WORD_BITS (32u) +#else +#error "Must define either OPENSSL_32_BIT or OPENSSL_64_BIT" +#endif + +#define CONSTTIME_TRUE_W ~((crypto_word_t)0) +#define CONSTTIME_FALSE_W ((crypto_word_t)0) + +// value_barrier_w returns |a|, but prevents GCC and Clang from reasoning about +// the returned value. This is used to mitigate compilers undoing constant-time +// code, until we can express our requirements directly in the language. +// +// Note the compiler is aware that |value_barrier_w| has no side effects and +// always has the same output for a given input. This allows it to eliminate +// dead code, move computations across loops, and vectorize. +static inline crypto_word_t value_barrier_w(crypto_word_t a) { +#if defined(__GNUC__) || defined(__clang__) + __asm__("" : "+r"(a) : /* no inputs */); +#endif + return a; +} + +// value_barrier_u32 behaves like |value_barrier_w| but takes a |uint32_t|. +static inline uint32_t value_barrier_u32(uint32_t a) { +#if defined(__GNUC__) || defined(__clang__) + __asm__("" : "+r"(a) : /* no inputs */); +#endif + return a; +} + +// |value_barrier_u8| could be defined as above, but compilers other than +// clang seem to still materialize 0x00..00MM instead of reusing 0x??..??MM. + +// constant_time_msb_w returns the given value with the MSB copied to all the +// other bits. +static inline crypto_word_t constant_time_msb_w(crypto_word_t a) { + return 0u - (a >> (sizeof(a) * 8 - 1)); +} + +// constant_time_is_zero returns 0xff..f if a == 0 and 0 otherwise. +static inline crypto_word_t constant_time_is_zero_w(crypto_word_t a) { + // Here is an SMT-LIB verification of this formula: + // + // (define-fun is_zero ((a (_ BitVec 32))) (_ BitVec 32) + // (bvand (bvnot a) (bvsub a #x00000001)) + // ) + // + // (declare-fun a () (_ BitVec 32)) + // + // (assert (not (= (= #x00000001 (bvlshr (is_zero a) #x0000001f)) (= a #x00000000)))) + // (check-sat) + // (get-model) + return constant_time_msb_w(~a & (a - 1)); +} + +static inline crypto_word_t constant_time_is_nonzero_w(crypto_word_t a) { + return ~constant_time_is_zero_w(a); +} + +// constant_time_eq_w returns 0xff..f if a == b and 0 otherwise. 
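// (Illustrative note, not part of the upstream header.) A typical pattern
// built from these helpers is a scan-and-select lookup that hides a secret
// index (|table|, |n| and |secret_index| are hypothetical):
//
//   crypto_word_t result = 0;
//   for (crypto_word_t i = 0; i < n; i++) {
//     crypto_word_t match = constant_time_eq_w(i, secret_index);
//     result = constant_time_select_w(match, table[i], result);
//   }
//
// Every table entry is read and the mask does the selection, so neither the
// branch predictor nor the data-cache access pattern depends on
// |secret_index|.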
+static inline crypto_word_t constant_time_eq_w(crypto_word_t a, + crypto_word_t b) { + return constant_time_is_zero_w(a ^ b); +} + +// constant_time_select_w returns (mask & a) | (~mask & b). When |mask| is all +// 1s or all 0s (as returned by the methods above), the select methods return +// either |a| (if |mask| is nonzero) or |b| (if |mask| is zero). +static inline crypto_word_t constant_time_select_w(crypto_word_t mask, + crypto_word_t a, + crypto_word_t b) { + // Clang recognizes this pattern as a select. While it usually transforms it + // to a cmov, it sometimes further transforms it into a branch, which we do + // not want. + // + // Hiding the value of the mask from the compiler evades this transformation. + mask = value_barrier_w(mask); + return (mask & a) | (~mask & b); +} + +// constant_time_select_8 acts like |constant_time_select| but operates on +// 8-bit values. +static inline uint8_t constant_time_select_8(crypto_word_t mask, uint8_t a, + uint8_t b) { + // |mask| is a word instead of |uint8_t| to avoid materializing 0x000..0MM + // Making both |mask| and its value barrier |uint8_t| would allow the compiler + // to materialize 0x????..?MM instead, but only clang is that clever. + // However, vectorization of bitwise operations seems to work better on + // |uint8_t| than a mix of |uint64_t| and |uint8_t|, so |m| is cast to + // |uint8_t| after the value barrier but before the bitwise operations. + uint8_t m = value_barrier_w(mask); + return (m & a) | (~m & b); +} + +// constant_time_conditional_memcpy copies |n| bytes from |src| to |dst| if +// |mask| is 0xff..ff and does nothing if |mask| is 0. The |n|-byte memory +// ranges at |dst| and |src| must not overlap, as when calling |memcpy|. +static inline void constant_time_conditional_memcpy(void *dst, const void *src, + const size_t n, + const crypto_word_t mask) { + debug_assert_nonsecret(!buffers_alias(dst, n, src, n)); + uint8_t *out = (uint8_t *)dst; + const uint8_t *in = (const uint8_t *)src; + for (size_t i = 0; i < n; i++) { + out[i] = constant_time_select_8(mask, in[i], out[i]); + } +} + +// constant_time_conditional_memxor xors |n| bytes from |src| to |dst| if +// |mask| is 0xff..ff and does nothing if |mask| is 0. The |n|-byte memory +// ranges at |dst| and |src| must not overlap, as when calling |memcpy|. +static inline void constant_time_conditional_memxor(void *dst, const void *src, + size_t n, + const crypto_word_t mask) { + debug_assert_nonsecret(!buffers_alias(dst, n, src, n)); + aliasing_uint8_t *out = dst; + const aliasing_uint8_t *in = src; +#if defined(__GNUC__) && !defined(__clang__) + // gcc 13.2.0 doesn't automatically vectorize this loop regardless of barrier + typedef aliasing_uint8_t v32u8 __attribute__((vector_size(32), aligned(1), may_alias)); + size_t n_vec = n&~(size_t)31; + v32u8 masks = ((aliasing_uint8_t)mask-(v32u8){}); // broadcast + for (size_t i = 0; i < n_vec; i += 32) { + *(v32u8*)&out[i] ^= masks & *(v32u8 const*)&in[i]; + } + out += n_vec; + n -= n_vec; +#endif + for (size_t i = 0; i < n; i++) { + out[i] ^= value_barrier_w(mask) & in[i]; + } +} + +#if defined(BORINGSSL_CONSTANT_TIME_VALIDATION) + +// CONSTTIME_SECRET takes a pointer and a number of bytes and marks that region +// of memory as secret. Secret data is tracked as it flows to registers and +// other parts of a memory. If secret data is used as a condition for a branch, +// or as a memory index, it will trigger warnings in valgrind. 
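// A sketch of the intended usage (illustrative only; |secret| and |out| are
// hypothetical buffers):
//
//   CONSTTIME_SECRET(secret, secret_len);   // start tracking these bytes
//   /* ... constant-time computation deriving |out| from |secret| ... */
//   CONSTTIME_DECLASSIFY(out, out_len);     // result may now be branched on
//
// Outside of BORINGSSL_CONSTANT_TIME_VALIDATION builds both macros expand to
// nothing, so they add no runtime cost in normal builds.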
+#define CONSTTIME_SECRET(ptr, len) VALGRIND_MAKE_MEM_UNDEFINED(ptr, len) + +// CONSTTIME_DECLASSIFY takes a pointer and a number of bytes and marks that +// region of memory as public. Public data is not subject to constant-time +// rules. +#define CONSTTIME_DECLASSIFY(ptr, len) VALGRIND_MAKE_MEM_DEFINED(ptr, len) + +#else + +#define CONSTTIME_SECRET(ptr, len) +#define CONSTTIME_DECLASSIFY(ptr, len) + +#endif // BORINGSSL_CONSTANT_TIME_VALIDATION + +static inline crypto_word_t constant_time_declassify_w(crypto_word_t v) { + // Return |v| through a value barrier to be safe. Valgrind-based constant-time + // validation is partly to check the compiler has not undone any constant-time + // work. Any place |BORINGSSL_CONSTANT_TIME_VALIDATION| influences + // optimizations, this validation is inaccurate. + // + // However, by sending pointers through valgrind, we likely inhibit escape + // analysis. On local variables, particularly booleans, we likely + // significantly impact optimizations. + // + // Thus, to be safe, stick a value barrier, in hopes of comparably inhibiting + // compiler analysis. + CONSTTIME_DECLASSIFY(&v, sizeof(v)); + return value_barrier_w(v); +} + +static inline int constant_time_declassify_int(int v) { + OPENSSL_STATIC_ASSERT(sizeof(uint32_t) == sizeof(int), + "int is not the same size as uint32_t"); + // See comment above. + CONSTTIME_DECLASSIFY(&v, sizeof(v)); + return value_barrier_u32((uint32_t)v); +} + +#if defined(_MSC_VER) && !defined(__clang__) +// '=': conversion from 'int64_t' to 'int32_t', possible loss of data +#pragma warning(pop) +#endif +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic pop +#endif + +// declassify_assert behaves like |assert| but declassifies the result of +// evaluating |expr|. This allows the assertion to branch on the (presumably +// public) result, but still ensures that values leading up to the computation +// were secret. +#define declassify_assert(expr) dev_assert_secret(constant_time_declassify_int(expr)) + +// Endianness conversions. + +#if defined(__GNUC__) && __GNUC__ >= 2 +static inline uint32_t CRYPTO_bswap4(uint32_t x) { + return __builtin_bswap32(x); +} + +static inline uint64_t CRYPTO_bswap8(uint64_t x) { + return __builtin_bswap64(x); +} +#elif defined(_MSC_VER) +#pragma warning(push, 3) +#include +#pragma warning(pop) +#pragma intrinsic(_byteswap_ulong) +static inline uint32_t CRYPTO_bswap4(uint32_t x) { + return _byteswap_ulong(x); +} +#endif + +#if !defined(RING_CORE_NOSTDLIBINC) +#include +#endif + +static inline void *OPENSSL_memcpy(void *dst, const void *src, size_t n) { +#if !defined(RING_CORE_NOSTDLIBINC) + if (n == 0) { + return dst; + } + return memcpy(dst, src, n); +#else + aliasing_uint8_t *d = dst; + const aliasing_uint8_t *s = src; + for (size_t i = 0; i < n; ++i) { + d[i] = s[i]; + } + return dst; +#endif +} + +static inline void *OPENSSL_memset(void *dst, int c, size_t n) { +#if !defined(RING_CORE_NOSTDLIBINC) + if (n == 0) { + return dst; + } + return memset(dst, c, n); +#else + aliasing_uint8_t *d = dst; + for (size_t i = 0; i < n; ++i) { + d[i] = (aliasing_uint8_t)c; + } + return dst; +#endif +} + + +// Loads and stores. +// +// The following functions load and store sized integers with the specified +// endianness. They use |memcpy|, and so avoid alignment or strict aliasing +// requirements on the input and output pointers. 
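// For example (illustrative only), given the bytes {0x01, 0x02, 0x03, 0x04}:
//
//   uint8_t buf[4] = {0x01, 0x02, 0x03, 0x04};
//   CRYPTO_load_u32_be(buf);   // 0x01020304 on every platform
//   CRYPTO_load_u32_le(buf);   // 0x04030201 on every platform
//
// The suffix names the byte order of the memory representation; the value
// loaded into the register is the same regardless of host endianness.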
+ +#if defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ +#define RING_BIG_ENDIAN +#endif +#endif + +static inline uint32_t CRYPTO_load_u32_le(const void *in) { + uint32_t v; + OPENSSL_memcpy(&v, in, sizeof(v)); +#if defined(RING_BIG_ENDIAN) + return CRYPTO_bswap4(v); +#else + return v; +#endif +} + +static inline void CRYPTO_store_u32_le(void *out, uint32_t v) { +#if defined(RING_BIG_ENDIAN) + v = CRYPTO_bswap4(v); +#endif + OPENSSL_memcpy(out, &v, sizeof(v)); +} + +static inline uint32_t CRYPTO_load_u32_be(const void *in) { + uint32_t v; + OPENSSL_memcpy(&v, in, sizeof(v)); +#if !defined(RING_BIG_ENDIAN) + return CRYPTO_bswap4(v); +#else + return v; +#endif +} + +static inline void CRYPTO_store_u32_be(void *out, uint32_t v) { +#if !defined(RING_BIG_ENDIAN) + v = CRYPTO_bswap4(v); +#endif + OPENSSL_memcpy(out, &v, sizeof(v)); +} + +// Runtime CPU feature support + +#if defined(OPENSSL_X86) || defined(OPENSSL_X86_64) +// OPENSSL_ia32cap_P contains the Intel CPUID bits when running on an x86 or +// x86-64 system. +// +// Index 0: +// EDX for CPUID where EAX = 1 +// Bit 30 is used to indicate an Intel CPU +// Index 1: +// ECX for CPUID where EAX = 1 +// Index 2: +// EBX for CPUID where EAX = 7, ECX = 0 +// Bit 14 (for removed feature MPX) is used to indicate a preference for ymm +// registers over zmm even when zmm registers are supported +// Index 3: +// ECX for CPUID where EAX = 7, ECX = 0 +// +// Note: the CPUID bits are pre-adjusted for the OSXSAVE bit and the XMM, YMM, +// and AVX512 bits in XCR0, so it is not necessary to check those. (WARNING: See +// caveats in cpu_intel.c.) +#if defined(OPENSSL_X86_64) +extern uint32_t avx2_available; +extern uint32_t adx_bmi2_available; +#endif +#endif + + +#if defined(OPENSSL_ARM) +extern alignas(4) uint32_t neon_available; +#endif // OPENSSL_ARM + +#endif // OPENSSL_HEADER_CRYPTO_INTERNAL_H diff --git a/ring-0.17.14/crypto/limbs/limbs.c b/ring-0.17.14/crypto/limbs/limbs.c new file mode 100644 index 0000000000..d0027d400d --- /dev/null +++ b/ring-0.17.14/crypto/limbs/limbs.c @@ -0,0 +1,170 @@ +/* Copyright 2016-2017 Brian Smith. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ + +#include "limbs.h" + +#include "../internal.h" +#include "../fipsmodule/bn/internal.h" +#include "limbs.inl" + + +/* XXX: We assume that the conversion from |Carry| to |Limb| is constant-time, + * but we haven't verified that assumption. TODO: Fix it so we don't need to + * make that assumption. */ + +/* Returns 0xfff..f if |a| is zero, and zero otherwise. */ +Limb LIMB_is_zero(const Limb a) { + return constant_time_is_zero_w(a); +} + +/* Returns 0xfff..f if |a| is all zero limbs, and zero otherwise. |num_limbs| + * may be zero. 
*/ +Limb LIMBS_are_zero(const Limb a[], size_t num_limbs) { + Limb all = 0; + for (size_t i = 0; i < num_limbs; ++i) { + all |= a[i]; + } + return LIMB_is_zero(all); +} + +/* Returns 0xffff..f if |a == b|, and zero otherwise. |num_limbs| may be zero. */ +Limb LIMBS_equal(const Limb a[], const Limb b[], size_t num_limbs) { + Limb eq = CONSTTIME_TRUE_W; + for (size_t i = 0; i < num_limbs; ++i) { + eq = constant_time_select_w(eq, constant_time_eq_w(a[i], b[i]), eq); + } + return eq; +} + +/* Returns 0xffff...f if |a| is less than |b|, and zero otherwise. */ +Limb LIMBS_less_than(const Limb a[], const Limb b[], size_t num_limbs) { + debug_assert_nonsecret(num_limbs >= 1); + /* There are lots of ways to implement this. It is implemented this way to + * be consistent with |LIMBS_limbs_reduce_once| and other code that makes such + * comparisons as part of doing conditional reductions. */ + Limb dummy; + Carry borrow = limb_sub(&dummy, a[0], b[0]); + for (size_t i = 1; i < num_limbs; ++i) { + borrow = limb_sbb(&dummy, a[i], b[i], borrow); + } + return constant_time_is_nonzero_w(borrow); +} + +/* if (r >= m) { r -= m; } */ +void LIMBS_reduce_once(Limb r[], const Limb m[], size_t num_limbs) { + debug_assert_nonsecret(num_limbs >= 1); + /* This could be done more efficiently if we had |num_limbs| of extra space + * available, by storing |r - m| and then doing a conditional copy of either + * |r| or |r - m|. But, in order to operate in constant space, with an eye + * towards this function being used in RSA in the future, we do things a + * slightly less efficient way. */ + Limb lt = LIMBS_less_than(r, m, num_limbs); + Carry borrow = + limb_sub(&r[0], r[0], constant_time_select_w(lt, 0, m[0])); + for (size_t i = 1; i < num_limbs; ++i) { + /* XXX: This is probably particularly inefficient because the operations in + * constant_time_select affect the carry flag, so there will likely be + * loads and stores of |borrow|. 
*/ + borrow = + limb_sbb(&r[i], r[i], constant_time_select_w(lt, 0, m[i]), borrow); + } + dev_assert_secret(borrow == 0); +} + +void LIMBS_add_mod(Limb r[], const Limb a[], const Limb b[], const Limb m[], + size_t num_limbs) { + Limb overflow1 = + constant_time_is_nonzero_w(limbs_add(r, a, b, num_limbs)); + Limb overflow2 = ~LIMBS_less_than(r, m, num_limbs); + Limb overflow = overflow1 | overflow2; + Carry borrow = limb_sub(&r[0], r[0], m[0] & overflow); + for (size_t i = 1; i < num_limbs; ++i) { + borrow = limb_sbb(&r[i], r[i], m[i] & overflow, borrow); + } +} + +void LIMBS_sub_mod(Limb r[], const Limb a[], const Limb b[], const Limb m[], + size_t num_limbs) { + Limb underflow = + constant_time_is_nonzero_w(limbs_sub(r, a, b, num_limbs)); + Carry carry = limb_add(&r[0], r[0], m[0] & underflow); + for (size_t i = 1; i < num_limbs; ++i) { + carry = limb_adc(&r[i], r[i], m[i] & underflow, carry); + } +} + +void LIMBS_shl_mod(Limb r[], const Limb a[], const Limb m[], size_t num_limbs) { + Limb overflow1 = + constant_time_is_nonzero_w(a[num_limbs - 1] & LIMB_HIGH_BIT); + Limb carry = 0; + for (size_t i = 0; i < num_limbs; ++i) { + Limb limb = a[i]; + Limb new_carry = limb >> (LIMB_BITS - 1); + r[i] = (limb << 1) | carry; + carry = new_carry; + } + Limb overflow2 = ~LIMBS_less_than(r, m, num_limbs); + Limb overflow = overflow1 | overflow2; + Carry borrow = limb_sub(&r[0], r[0], m[0] & overflow); + for (size_t i = 1; i < num_limbs; ++i) { + borrow = limb_sbb(&r[i], r[i], m[i] & overflow, borrow); + } +} + +int LIMBS_select_512_32(Limb r[], const Limb table[], size_t num_limbs, + crypto_word_t index) { + if (num_limbs % (512 / LIMB_BITS) != 0) { + return 0; + } + limbs_select(r, table, num_limbs, 32, index); + return 1; +} + +static const Limb FIVE_BITS_MASK = 0x1f; + +crypto_word_t LIMBS_window5_split_window(Limb lower_limb, Limb higher_limb, size_t index_within_word) { + Limb high_bits = (higher_limb << (LIMB_BITS - index_within_word)) + & FIVE_BITS_MASK; + // There are no bits outside the window above |index_within_word| (if there + // were then this wouldn't be a split window), so we don't need to mask + // |low_bits|. + Limb low_bits = lower_limb >> index_within_word; + return low_bits | high_bits; +} + +crypto_word_t LIMBS_window5_unsplit_window(Limb limb, size_t index_within_word) { + return (limb >> index_within_word) & FIVE_BITS_MASK; +} + +Limb LIMB_shr(Limb a, size_t shift) { + return a >> shift; +} + +Limb limbs_mul_add_limb(Limb r[], const Limb a[], Limb b, size_t num_limbs) { + Limb carried = 0; + for (size_t i = 0; i < num_limbs; ++i) { + Limb lo; + Limb hi; + bn_umult_lohi(&lo, &hi, a[i], b); + Limb tmp; + Carry c = limb_add(&tmp, lo, carried); + c = limb_adc(&carried, hi, 0, c); + dev_assert_secret(c == 0); + c = limb_add(&r[i], r[i], tmp); + c = limb_adc(&carried, carried, 0, c); + // (A * B) + C + D never carries. + dev_assert_secret(c == 0); + } + return carried; +} diff --git a/ring-0.17.14/crypto/limbs/limbs.h b/ring-0.17.14/crypto/limbs/limbs.h new file mode 100644 index 0000000000..0cf83dd651 --- /dev/null +++ b/ring-0.17.14/crypto/limbs/limbs.h @@ -0,0 +1,38 @@ +/* Copyright 2016 Brian Smith. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ + +#ifndef RING_LIMBS_H +#define RING_LIMBS_H + +#include + +#include "../internal.h" + +typedef crypto_word_t Limb; + +#define LIMB_BITS CRYPTO_WORD_BITS +#define LIMB_HIGH_BIT ((Limb)(1) << (LIMB_BITS - 1)) + + +Limb LIMBS_are_zero(const Limb a[], size_t num_limbs); +Limb LIMBS_equal(const Limb a[], const Limb b[], size_t num_limbs); +void LIMBS_reduce_once(Limb r[], const Limb m[], size_t num_limbs); +void LIMBS_add_mod(Limb r[], const Limb a[], const Limb b[], const Limb m[], + size_t num_limbs); +void LIMBS_sub_mod(Limb r[], const Limb a[], const Limb b[], const Limb m[], + size_t num_limbs); +void LIMBS_shl_mod(Limb r[], const Limb a[], const Limb m[], size_t num_limbs); +Limb limbs_mul_add_limb(Limb r[], const Limb a[], Limb b, size_t num_limbs); + +#endif /* RING_LIMBS_H */ diff --git a/ring-0.17.14/crypto/limbs/limbs.inl b/ring-0.17.14/crypto/limbs/limbs.inl new file mode 100644 index 0000000000..1ca72cbb23 --- /dev/null +++ b/ring-0.17.14/crypto/limbs/limbs.inl @@ -0,0 +1,162 @@ +/* Copyright 2016 Brian Smith. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ + +#include "limbs.h" +#include "ring-core/check.h" + +#if defined(_MSC_VER) && !defined(__clang__) +#pragma warning(push, 3) +#include +#pragma warning(pop) + +/* MSVC 2015 RC, when compiling for x86 with /Ox (at least), miscompiles + * _addcarry_u32(c, 0, prod_hi, &x) like so: + * + * add eax,esi ; The previous add that might have set the carry flag. + * xor esi,esi ; OOPS! Carry flag is now reset! + * mov dword ptr [edi-4],eax + * adc esi,dword ptr [prod_hi] + * + * We test with MSVC 2015 update 2, so make sure we're using a version at least + * as new as that. */ +#if _MSC_FULL_VER < 190023918 +#error "MSVC 2015 Update 2 or later is required." 
+#endif +typedef uint8_t Carry; +#if LIMB_BITS == 64 +#pragma intrinsic(_addcarry_u64, _subborrow_u64) +#define RING_CORE_ADDCARRY_INTRINSIC _addcarry_u64 +#define RING_CORE_SUBBORROW_INTRINSIC _subborrow_u64 +#elif LIMB_BITS == 32 +#pragma intrinsic(_addcarry_u32, _subborrow_u32) +#define RING_CORE_ADDCARRY_INTRINSIC _addcarry_u32 +#define RING_CORE_SUBBORROW_INTRINSIC _subborrow_u32 +typedef uint64_t DoubleLimb; +#endif +#else +typedef Limb Carry; +#if LIMB_BITS == 64 +typedef __uint128_t DoubleLimb; +#elif LIMB_BITS == 32 +typedef uint64_t DoubleLimb; +#endif +#endif + +/* |*r = a + b + carry_in|, returning carry out bit. |carry_in| must be 0 or 1. + */ +static inline Carry limb_adc(Limb *r, Limb a, Limb b, Carry carry_in) { + dev_assert_secret(carry_in == 0 || carry_in == 1); + Carry ret; +#if defined(RING_CORE_ADDCARRY_INTRINSIC) + ret = RING_CORE_ADDCARRY_INTRINSIC(carry_in, a, b, r); +#else + DoubleLimb x = (DoubleLimb)a + b + carry_in; + *r = (Limb)x; + ret = (Carry)(x >> LIMB_BITS); +#endif + dev_assert_secret(ret == 0 || ret == 1); + return ret; +} + +/* |*r = a + b|, returning carry bit. */ +static inline Carry limb_add(Limb *r, Limb a, Limb b) { + Carry ret; +#if defined(RING_CORE_ADDCARRY_INTRINSIC) + ret = RING_CORE_ADDCARRY_INTRINSIC(0, a, b, r); +#else + DoubleLimb x = (DoubleLimb)a + b; + *r = (Limb)x; + ret = (Carry)(x >> LIMB_BITS); +#endif + dev_assert_secret(ret == 0 || ret == 1); + return ret; +} + +/* |*r = a - b - borrow_in|, returning the borrow out bit. |borrow_in| must be + * 0 or 1. */ +static inline Carry limb_sbb(Limb *r, Limb a, Limb b, Carry borrow_in) { + dev_assert_secret(borrow_in == 0 || borrow_in == 1); + Carry ret; +#if defined(RING_CORE_SUBBORROW_INTRINSIC) + ret = RING_CORE_SUBBORROW_INTRINSIC(borrow_in, a, b, r); +#else + DoubleLimb x = (DoubleLimb)a - b - borrow_in; + *r = (Limb)x; + ret = (Carry)((x >> LIMB_BITS) & 1); +#endif + dev_assert_secret(ret == 0 || ret == 1); + return ret; +} + +/* |*r = a - b|, returning borrow bit. */ +static inline Carry limb_sub(Limb *r, Limb a, Limb b) { + Carry ret; +#if defined(RING_CORE_SUBBORROW_INTRINSIC) + ret = RING_CORE_SUBBORROW_INTRINSIC(0, a, b, r); +#else + DoubleLimb x = (DoubleLimb)a - b; + *r = (Limb)x; + ret = (Carry)((x >> LIMB_BITS) & 1); +#endif + dev_assert_secret(ret == 0 || ret == 1); + return ret; +} + +static inline Carry limbs_add(Limb r[], const Limb a[], const Limb b[], + size_t num_limbs) { + debug_assert_nonsecret(num_limbs >= 1); + Carry carry = limb_add(&r[0], a[0], b[0]); + for (size_t i = 1; i < num_limbs; ++i) { + carry = limb_adc(&r[i], a[i], b[i], carry); + } + return carry; +} + +/* |r -= s|, returning the borrow. 
*/ +static inline Carry limbs_sub(Limb r[], const Limb a[], const Limb b[], + size_t num_limbs) { + debug_assert_nonsecret(num_limbs >= 1); + Carry borrow = limb_sub(&r[0], a[0], b[0]); + for (size_t i = 1; i < num_limbs; ++i) { + borrow = limb_sbb(&r[i], a[i], b[i], borrow); + } + return borrow; +} + +static inline void limbs_copy(Limb r[], const Limb a[], size_t num_limbs) { + for (size_t i = 0; i < num_limbs; ++i) { + r[i] = a[i]; + } +} + +static inline void limbs_select(Limb r[], const Limb table[], + size_t num_limbs, size_t num_entries, + crypto_word_t index) { + for (size_t i = 0; i < num_limbs; ++i) { + r[i] = 0; + } + + for (size_t e = 0; e < num_entries; ++e) { + Limb equal = constant_time_eq_w(index, e); + for (size_t i = 0; i < num_limbs; ++i) { + r[i] = constant_time_select_w(equal, table[(e * num_limbs) + i], r[i]); + } + } +} + +static inline void limbs_zero(Limb r[], size_t num_limbs) { + for (size_t i = 0; i < num_limbs; ++i) { + r[i] = 0; + } +} diff --git a/ring-0.17.14/crypto/mem.c b/ring-0.17.14/crypto/mem.c new file mode 100644 index 0000000000..5a85ceaca8 --- /dev/null +++ b/ring-0.17.14/crypto/mem.c @@ -0,0 +1,28 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include "internal.h" + +int CRYPTO_memcmp(const void *in_a, const void *in_b, size_t len) { + const aliasing_uint8_t *a = in_a; + const aliasing_uint8_t *b = in_b; + uint8_t x = 0; + + for (size_t i = 0; i < len; i++) { + x |= a[i] ^ b[i]; + } + + return x; +} diff --git a/ring-0.17.14/crypto/perlasm/arm-xlate.pl b/ring-0.17.14/crypto/perlasm/arm-xlate.pl new file mode 100644 index 0000000000..36971387ed --- /dev/null +++ b/ring-0.17.14/crypto/perlasm/arm-xlate.pl @@ -0,0 +1,263 @@ +#! /usr/bin/env perl +# Copyright 2015-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
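The limb helpers above (limb_adc/limb_sbb, LIMBS_less_than, LIMBS_reduce_once and friends) all lean on two tricks: the borrow out of a full-width subtraction is 1 exactly when a < b, and widening that bit into an all-ones/all-zeros mask lets the conditional reduction run without branching on secret data. Below is a minimal standalone sketch of both tricks, assuming 64-bit limbs and a compiler with unsigned __int128 (GCC/Clang); the names lt_mask2 and reduce_once2 are made up for illustration and are not part of ring.

    /* Illustrative re-implementation, not the ring code: 2-limb borrow-as-mask
     * comparison and branch-free conditional reduction. */
    #include <stdint.h>
    #include <stdio.h>

    typedef uint64_t limb_t;

    /* All-ones if a < b (little-endian 2-limb values), zero otherwise. */
    static limb_t lt_mask2(const limb_t a[2], const limb_t b[2]) {
      unsigned __int128 d = (unsigned __int128)a[0] - b[0];
      limb_t borrow = (limb_t)(d >> 64) & 1;
      d = (unsigned __int128)a[1] - b[1] - borrow;
      borrow = (limb_t)(d >> 64) & 1;
      return (limb_t)0 - borrow;
    }

    /* if (r >= m) { r -= m; } without a data-dependent branch. */
    static void reduce_once2(limb_t r[2], const limb_t m[2]) {
      limb_t mask = ~lt_mask2(r, m);          /* subtract m only when r >= m */
      unsigned __int128 d = (unsigned __int128)r[0] - (m[0] & mask);
      limb_t borrow = (limb_t)(d >> 64) & 1;
      r[0] = (limb_t)d;
      d = (unsigned __int128)r[1] - (m[1] & mask) - borrow;
      r[1] = (limb_t)d;
    }

    int main(void) {
      limb_t r[2] = {7, 0}, m[2] = {5, 0};
      reduce_once2(r, m);
      printf("%llu\n", (unsigned long long)r[0]);  /* prints 2 */
      return 0;
    }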
+ +use strict; + +my $flavour = shift; +my $output = shift; +open STDOUT,">$output" || die "can't open $output: $!"; + +$flavour = "linux32" if (!$flavour or $flavour eq "void"); + +my %GLOBALS; +my $dotinlocallabels=($flavour=~/linux/)?1:0; + +################################################################ +# directives which need special treatment on different platforms +################################################################ +my $arch = sub { + if ($flavour =~ /linux/) { ".arch\t".join(',',@_); } + elsif ($flavour =~ /win64/) { ".arch\t".join(',',@_); } + else { ""; } +}; +my $fpu = sub { + if ($flavour =~ /linux/) { ".fpu\t".join(',',@_); } + else { ""; } +}; +my $hidden = sub { + if ($flavour =~ /ios/) { ".private_extern\t".join(',',@_); } + elsif ($flavour =~ /win64/) { ""; } + else { ".hidden\t".join(',',@_); } +}; +my $comm = sub { + my @args = split(/,\s*/,shift); + my $name = @args[0]; + my $global = \$GLOBALS{$name}; + my $ret; + + if ($flavour =~ /ios32/) { + $ret = ".comm\t_$name,@args[1]\n"; + $ret .= ".non_lazy_symbol_pointer\n"; + $ret .= "$name:\n"; + $ret .= ".indirect_symbol\t_$name\n"; + $ret .= ".long\t0"; + $name = "_$name"; + } else { $ret = ".comm\t".join(',',@args); } + + $$global = $name; + $ret; +}; +my $globl = sub { + my $name = shift; + my $global = \$GLOBALS{$name}; + my $ret; + + SWITCH: for ($flavour) { + /ios/ && do { $name = "_$name"; + last; + }; + } + + $ret = ".globl $name\n"; + # All symbols in assembly files are hidden. + $ret .= &$hidden($name); + $$global = $name; + $ret; +}; +my $global = $globl; +my $extern = sub { + &$globl(@_); + return; # return nothing +}; +my $type = sub { + if ($flavour =~ /linux/) { ".type\t".join(',',@_); } + elsif ($flavour =~ /ios32/) { if (join(',',@_) =~ /(\w+),%function/) { + "#ifdef __thumb2__\n". + ".thumb_func $1\n". + "#endif"; + } + } + elsif ($flavour =~ /win64/) { if (join(',',@_) =~ /(\w+),%function/) { + # See https://sourceware.org/binutils/docs/as/Pseudo-Ops.html + # Per https://docs.microsoft.com/en-us/windows/win32/debug/pe-format#coff-symbol-table, + # the type for functions is 0x20, or 32. + ".def $1\n". + " .type 32\n". + ".endef"; + } + } + else { ""; } +}; +my $size = sub { + if ($flavour =~ /linux/) { ".size\t".join(',',@_); } + else { ""; } +}; +my $inst = sub { + if ($flavour =~ /linux/) { ".inst\t".join(',',@_); } + else { ".long\t".join(',',@_); } +}; +my $asciz = sub { + my $line = join(",",@_); + if ($line =~ /^"(.*)"$/) + { ".byte " . join(",",unpack("C*",$1),0) . "\n.align 2"; } + else + { ""; } +}; +my $section = sub { + if ($flavour =~ /ios/) { + if ($_[0] eq ".rodata") { + return ".section\t__TEXT,__const"; + } + die "Unknown section name $_[0]"; + } else { + return ".section\t" . 
join(",", @_); + } +}; + +sub range { + my ($r,$sfx,$start,$end) = @_; + + join(",",map("$r$_$sfx",($start..$end))); +} + +sub expand_line { + my $line = shift; + my @ret = (); + + pos($line)=0; + + while ($line =~ m/\G[^@\/\{\"]*/g) { + if ($line =~ m/\G(@|\/\/|$)/gc) { + last; + } + elsif ($line =~ m/\G\{/gc) { + my $saved_pos = pos($line); + $line =~ s/\G([rdqv])([0-9]+)([^\-]*)\-\1([0-9]+)\3/range($1,$3,$2,$4)/e; + pos($line) = $saved_pos; + $line =~ m/\G[^\}]*\}/g; + } + elsif ($line =~ m/\G\"/gc) { + $line =~ m/\G[^\"]*\"/g; + } + } + + $line =~ s/\b(\w+)/$GLOBALS{$1} or $1/ge; + + return $line; +} + +my ($arch_defines, $target_defines); +if ($flavour =~ /32/) { + $arch_defines = "defined(OPENSSL_ARM)"; +} elsif ($flavour =~ /64/) { + $arch_defines = "defined(OPENSSL_AARCH64)"; +} else { + die "unknown architecture: $flavour"; +} +if ($flavour =~ /linux/) { + # Although the flavour is specified as "linux", it is really used by all + # ELF platforms. + $target_defines = "defined(__ELF__)"; +} elsif ($flavour =~ /ios/) { + # Although the flavour is specified as "ios", it is really used by all Apple + # platforms. + $target_defines = "defined(__APPLE__)"; +} elsif ($flavour =~ /win/) { + $target_defines = "defined(_WIN32)"; +} else { + die "unknown target: $flavour"; +} + +print <<___; +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && $arch_defines && $target_defines +___ + +while(my $line=<>) { + + if ($line =~ m/^\s*(#|@|\/\/)/) { print $line; next; } + + $line =~ s|/\*.*\*/||; # get rid of C-style comments... + $line =~ s|^\s+||; # ... and skip white spaces in beginning... + $line =~ s|\s+$||; # ... and at the end + + if ($flavour =~ /64/) { + my $copy = $line; + # Also remove line comments. + $copy =~ s|//.*||; + if ($copy =~ /\b[wx]18\b/) { + die "r18 is reserved by the platform and may not be used."; + } + } + + { + $line =~ s|[\b\.]L(\w{2,})|L$1|g; # common denominator for Locallabel + $line =~ s|\bL(\w{2,})|\.L$1|g if ($dotinlocallabels); + } + + { + $line =~ s|(^[\.\w]+)\:\s*||; + my $label = $1; + if ($label) { + printf "%s:",($GLOBALS{$label} or $label); + } + } + + if ($line !~ m/^[#@]/) { + $line =~ s|^\s*(\.?)(\S+)\s*||; + my $c = $1; $c = "\t" if ($c eq ""); + my $mnemonic = $2; + my $opcode; + if ($mnemonic =~ m/([^\.]+)\.([^\.]+)/) { + $opcode = eval("\$$1_$2"); + } else { + $opcode = eval("\$$mnemonic"); + } + + if ($flavour =~ /ios/) { + # Mach-O and ELF use different syntax for these relocations. Note + # that we require :pg_hi21: to be explicitly listed. It is normally + # optional with adrp instructions. + $line =~ s|:pg_hi21:(\w+)|\1\@PAGE|; + $line =~ s|:lo12:(\w+)|\1\@PAGEOFF|; + } else { + # Clang's integrated assembly does not support the optional + # :pg_hi21: markers, so erase them. + $line =~ s|:pg_hi21:||; + } + + my $arg=expand_line($line); + + if (ref($opcode) eq 'CODE') { + $line = &$opcode($arg); + } elsif ($mnemonic) { + $line = $c.$mnemonic; + $line.= "\t$arg" if ($arg ne ""); + } + } + + print $line if ($line); + print "\n"; +} + +print <<___; +#endif // !OPENSSL_NO_ASM && $arch_defines && $target_defines +___ + +close STDOUT or die "error closing STDOUT: $!"; diff --git a/ring-0.17.14/crypto/perlasm/x86_64-xlate.pl b/ring-0.17.14/crypto/perlasm/x86_64-xlate.pl new file mode 100644 index 0000000000..f9d71e1d24 --- /dev/null +++ b/ring-0.17.14/crypto/perlasm/x86_64-xlate.pl @@ -0,0 +1,1894 @@ +#! 
/usr/bin/env perl +# Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# Ascetic x86_64 AT&T to MASM/NASM assembler translator by . +# +# Why AT&T to MASM and not vice versa? Several reasons. Because AT&T +# format is way easier to parse. Because it's simpler to "gear" from +# Unix ABI to Windows one [see cross-reference "card" at the end of +# file]. Because Linux targets were available first... +# +# In addition the script also "distills" code suitable for GNU +# assembler, so that it can be compiled with more rigid assemblers, +# such as Solaris /usr/ccs/bin/as. +# +# This translator is not designed to convert *arbitrary* assembler +# code from AT&T format to MASM one. It's designed to convert just +# enough to provide for dual-ABI OpenSSL modules development... +# There *are* limitations and you might have to modify your assembler +# code or this script to achieve the desired result... +# +# Currently recognized limitations: +# +# - can't use multiple ops per line; +# +# Dual-ABI styling rules. +# +# 1. Adhere to Unix register and stack layout [see cross-reference +# ABI "card" at the end for explanation]. +# 2. Forget about "red zone," stick to more traditional blended +# stack frame allocation. If volatile storage is actually required +# that is. If not, just leave the stack as is. +# 3. Functions tagged with ".type name,@function" get crafted with +# unified Win64 prologue and epilogue automatically. If you want +# to take care of ABI differences yourself, tag functions as +# ".type name,@abi-omnipotent" instead. +# 4. To optimize the Win64 prologue you can specify number of input +# arguments as ".type name,@function,N." Keep in mind that if N is +# larger than 6, then you *have to* write "abi-omnipotent" code, +# because >6 cases can't be addressed with unified prologue. +# 5. Name local labels as .L*, do *not* use dynamic labels such as 1: +# (sorry about latter). +# 6. Don't use [or hand-code with .byte] "rep ret." "ret" mnemonic is +# required to identify the spots, where to inject Win64 epilogue! +# 7. Stick to explicit ip-relative addressing. If you have to use +# GOTPCREL addressing, stick to mov symbol@GOTPCREL(%rip),%r??. +# Both are recognized and translated to proper Win64 addressing +# modes. +# +# 8. In order to provide for structured exception handling unified +# Win64 prologue copies %rsp value to %rax. For further details +# see SEH paragraph at the end. +# 9. .init segment is allowed to contain calls to functions only. +# a. If function accepts more than 4 arguments *and* >4th argument +# is declared as non 64-bit value, do clear its upper part. +# +# TODO(https://crbug.com/boringssl/259): The dual-ABI mechanism described here +# does not quite unwind correctly on Windows. The seh_directive logic below has +# the start of a new mechanism. 
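Rules 3 and 6 above are easier to follow with the concrete shape of what the script injects; the emission itself lives in the opcode and label packages later in this file and is gated on the declared argument count. Written here as C string data purely for illustration, the synthesized Win64 prologue for a ".type name,@function" symbol spills %rdi/%rsi into the Win64 home space and moves the Microsoft argument registers into their SysV positions:

    /* Illustration only: the register shuffle performed by the unified Win64
     * prologue (see the label package below); each "ret" in such a function is
     * preceded by reloading %rdi/%rsi from 8(%rsp)/16(%rsp). */
    static const char *const kWin64PrologueShape[] = {
        "movq %rdi,8(%rsp)",   /* spill rdi/rsi into the home space */
        "movq %rsi,16(%rsp)",
        "movq %rsp,%rax",      /* snapshot rsp for SEH */
        "movq %rcx,%rdi",      /* Win64 arg 1 -> SysV arg 1 */
        "movq %rdx,%rsi",      /* Win64 arg 2 -> SysV arg 2 */
        "movq %r8,%rdx",       /* Win64 arg 3 -> SysV arg 3 */
        "movq %r9,%rcx",       /* Win64 arg 4 -> SysV arg 4 */
        "movq 40(%rsp),%r8",   /* Win64 arg 5 (stack) -> SysV arg 5 */
        "movq 48(%rsp),%r9",   /* Win64 arg 6 (stack) -> SysV arg 6 */
    };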
+ + +use strict; + +my $flavour = shift; +my $output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +open STDOUT,">$output" || die "can't open $output: $!" + if (defined($output)); + +my $gas=1; $gas=0 if ($output =~ /\.asm$/); +my $elf=1; $elf=0 if (!$gas); +my $apple=0; +my $win64=0; +my $prefix=""; +my $decor=".L"; + +my $masmref=8 + 50727*2**-32; # 8.00.50727 shipped with VS2005 +my $masm=0; +my $PTR=" PTR"; + +my $nasmref=2.03; +my $nasm=0; + +if ($flavour eq "mingw64") { $gas=1; $elf=0; $win64=1; + # TODO(davidben): Before supporting the + # mingw64 perlasm flavour, do away with this + # environment variable check. + die "mingw64 not supported"; + $prefix=`echo __USER_LABEL_PREFIX__ | $ENV{CC} -E -P -`; + $prefix =~ s|\R$||; # Better chomp + } +elsif ($flavour eq "macosx") { $gas=1; $elf=0; $apple=1; $prefix="_"; $decor="L\$"; } +elsif ($flavour eq "masm") { $gas=0; $elf=0; $masm=$masmref; $win64=1; $decor="\$L\$"; } +elsif ($flavour eq "nasm") { $gas=0; $elf=0; $nasm=$nasmref; $win64=1; $decor="\$L\$"; $PTR=""; } +elsif (!$gas) { die "unknown flavour $flavour"; } + +my $current_segment; +my $current_function; +my %globals; + +{ package opcode; # pick up opcodes + sub re { + my ($class, $line) = @_; + my $self = {}; + my $ret; + + if ($$line =~ /^([a-z][a-z0-9]*)/i) { + bless $self,$class; + $self->{op} = $1; + $ret = $self; + $$line = substr($$line,@+[0]); $$line =~ s/^\s+//; + + undef $self->{sz}; + if ($self->{op} =~ /^(movz)x?([bw]).*/) { # movz is pain... + $self->{op} = $1; + $self->{sz} = $2; + } elsif ($self->{op} =~ /call|jmp/) { + $self->{sz} = ""; + } elsif ($self->{op} =~ /^p/ && $' !~ /^(ush|op|insrw)/) { # SSEn + $self->{sz} = ""; + } elsif ($self->{op} =~ /^[vk]/) { # VEX or k* such as kmov + $self->{sz} = ""; + } elsif ($self->{op} =~ /mov[dq]/ && $$line =~ /%xmm/) { + $self->{sz} = ""; + } elsif ($self->{op} =~ /^or([qlwb])$/) { + $self->{op} = "or"; + $self->{sz} = $1; + } elsif ($self->{op} =~ /([a-z]{3,})([qlwb])$/) { + $self->{op} = $1; + $self->{sz} = $2; + } + } + $ret; + } + sub size { + my ($self, $sz) = @_; + $self->{sz} = $sz if (defined($sz) && !defined($self->{sz})); + $self->{sz}; + } + sub out { + my $self = shift; + if ($gas) { + if ($self->{op} eq "movz") { # movz is pain... + sprintf "%s%s%s",$self->{op},$self->{sz},shift; + } elsif ($self->{op} =~ /^set/) { + "$self->{op}"; + } elsif ($self->{op} eq "ret") { + my $epilogue = ""; + if ($win64 && $current_function->{abi} eq "svr4") { + $epilogue = "movq 8(%rsp),%rdi\n\t" . + "movq 16(%rsp),%rsi\n\t"; + } + $epilogue . "ret"; + } elsif ($self->{op} eq "call" && !$elf && $current_segment eq ".init") { + ".p2align\t3\n\t.quad"; + } else { + "$self->{op}$self->{sz}"; + } + } else { + $self->{op} =~ s/^movz/movzx/; + if ($self->{op} eq "ret") { + $self->{op} = ""; + if ($win64 && $current_function->{abi} eq "svr4") { + $self->{op} = "mov rdi,QWORD$PTR\[8+rsp\]\t;WIN64 epilogue\n\t". 
+ "mov rsi,QWORD$PTR\[16+rsp\]\n\t"; + } + $self->{op} .= "ret"; + } elsif ($self->{op} =~ /^(pop|push)f/) { + $self->{op} .= $self->{sz}; + } elsif ($self->{op} eq "call" && $current_segment eq ".CRT\$XCU") { + $self->{op} = "\tDQ"; + } + $self->{op}; + } + } + sub mnemonic { + my ($self, $op) = @_; + $self->{op}=$op if (defined($op)); + $self->{op}; + } +} +{ package const; # pick up constants, which start with $ + sub re { + my ($class, $line) = @_; + my $self = {}; + my $ret; + + if ($$line =~ /^\$([^,]+)/) { + bless $self, $class; + $self->{value} = $1; + $ret = $self; + $$line = substr($$line,@+[0]); $$line =~ s/^\s+//; + } + $ret; + } + sub out { + my $self = shift; + + $self->{value} =~ s/\b(0b[0-1]+)/oct($1)/eig; + if ($gas) { + # Solaris /usr/ccs/bin/as can't handle multiplications + # in $self->{value} + my $value = $self->{value}; + no warnings; # oct might complain about overflow, ignore here... + $value =~ s/(?{value} = $value; + } + sprintf "\$%s",$self->{value}; + } else { + my $value = $self->{value}; + $value =~ s/0x([0-9a-f]+)/0$1h/ig if ($masm); + sprintf "%s",$value; + } + } +} +{ package ea; # pick up effective addresses: expr(%reg,%reg,scale) + + my %szmap = ( b=>"BYTE$PTR", w=>"WORD$PTR", + l=>"DWORD$PTR", d=>"DWORD$PTR", + q=>"QWORD$PTR", o=>"OWORD$PTR", + x=>"XMMWORD$PTR", y=>"YMMWORD$PTR", + z=>"ZMMWORD$PTR" ) if (!$gas); + + sub re { + my ($class, $line, $opcode) = @_; + my $self = {}; + my $ret; + + # optional * ----vvv--- appears in indirect jmp/call + if ($$line =~ /^(\*?)([^\(,]*)\(([%\w,]+)\)((?:{[^}]+})*)/) { + bless $self, $class; + $self->{asterisk} = $1; + $self->{label} = $2; + ($self->{base},$self->{index},$self->{scale})=split(/,/,$3); + $self->{scale} = 1 if (!defined($self->{scale})); + $self->{opmask} = $4; + $ret = $self; + $$line = substr($$line,@+[0]); $$line =~ s/^\s+//; + + if ($win64 && $self->{label} =~ s/\@GOTPCREL//) { + die if ($opcode->mnemonic() ne "mov"); + $opcode->mnemonic("lea"); + } + $self->{base} =~ s/^%//; + $self->{index} =~ s/^%// if (defined($self->{index})); + $self->{opcode} = $opcode; + } + $ret; + } + sub size {} + sub out { + my ($self, $sz) = @_; + + $self->{label} =~ s/([_a-z][_a-z0-9]*)/$globals{$1} or $1/gei; + $self->{label} =~ s/\.L/$decor/g; + + # Silently convert all EAs to 64-bit. This is required for + # elder GNU assembler and results in more compact code, + # *but* most importantly AES module depends on this feature! + $self->{index} =~ s/^[er](.?[0-9xpi])[d]?$/r\1/; + $self->{base} =~ s/^[er](.?[0-9xpi])[d]?$/r\1/; + + # Solaris /usr/ccs/bin/as can't handle multiplications + # in $self->{label}... + use integer; + $self->{label} =~ s/(?{label} =~ s/\b([0-9]+\s*[\*\/\%]\s*[0-9]+)\b/eval($1)/eg; + + # Some assemblers insist on signed presentation of 32-bit + # offsets, but sign extension is a tricky business in perl... 
+ if ((1<<31)<<1) { + $self->{label} =~ s/\b([0-9]+)\b/$1<<32>>32/eg; + } else { + $self->{label} =~ s/\b([0-9]+)\b/$1>>0/eg; + } + + # if base register is %rbp or %r13, see if it's possible to + # flip base and index registers [for better performance] + if (!$self->{label} && $self->{index} && $self->{scale}==1 && + $self->{base} =~ /(rbp|r13)/) { + $self->{base} = $self->{index}; $self->{index} = $1; + } + + if ($gas) { + $self->{label} =~ s/^___imp_/__imp__/ if ($flavour eq "mingw64"); + + if (defined($self->{index})) { + sprintf "%s%s(%s,%%%s,%d)%s", + $self->{asterisk},$self->{label}, + $self->{base}?"%$self->{base}":"", + $self->{index},$self->{scale}, + $self->{opmask}; + } else { + sprintf "%s%s(%%%s)%s", $self->{asterisk},$self->{label}, + $self->{base},$self->{opmask}; + } + } else { + $self->{label} =~ s/\./\$/g; + $self->{label} =~ s/(?{label} = "($self->{label})" if ($self->{label} =~ /[\*\+\-\/]/); + + my $mnemonic = $self->{opcode}->mnemonic(); + ($self->{asterisk}) && ($sz="q") || + ($mnemonic =~ /^v?mov([qd])$/) && ($sz=$1) || + ($mnemonic =~ /^v?pinsr([qdwb])$/) && ($sz=$1) || + ($mnemonic =~ /^vpbroadcast([qdwb])$/) && ($sz=$1) || + ($mnemonic =~ /^v(?!perm)[a-z]+[fi]128$/) && ($sz="x"); + + $self->{opmask} =~ s/%(k[0-7])/$1/; + + if (defined($self->{index})) { + sprintf "%s[%s%s*%d%s]%s",$szmap{$sz}, + $self->{label}?"$self->{label}+":"", + $self->{index},$self->{scale}, + $self->{base}?"+$self->{base}":"", + $self->{opmask}; + } elsif ($self->{base} eq "rip") { + sprintf "%s[%s]",$szmap{$sz},$self->{label}; + } else { + sprintf "%s[%s%s]%s", $szmap{$sz}, + $self->{label}?"$self->{label}+":"", + $self->{base},$self->{opmask}; + } + } + } +} +{ package register; # pick up registers, which start with %. + sub re { + my ($class, $line, $opcode) = @_; + my $self = {}; + my $ret; + + # optional * ----vvv--- appears in indirect jmp/call + if ($$line =~ /^(\*?)%(\w+)((?:{[^}]+})*)/) { + bless $self,$class; + $self->{asterisk} = $1; + $self->{value} = $2; + $self->{opmask} = $3; + $opcode->size($self->size()); + $ret = $self; + $$line = substr($$line,@+[0]); $$line =~ s/^\s+//; + } + $ret; + } + sub size { + my $self = shift; + my $ret; + + if ($self->{value} =~ /^r[\d]+b$/i) { $ret="b"; } + elsif ($self->{value} =~ /^r[\d]+w$/i) { $ret="w"; } + elsif ($self->{value} =~ /^r[\d]+d$/i) { $ret="l"; } + elsif ($self->{value} =~ /^r[\w]+$/i) { $ret="q"; } + elsif ($self->{value} =~ /^[a-d][hl]$/i){ $ret="b"; } + elsif ($self->{value} =~ /^[\w]{2}l$/i) { $ret="b"; } + elsif ($self->{value} =~ /^[\w]{2}$/i) { $ret="w"; } + elsif ($self->{value} =~ /^e[a-z]{2}$/i){ $ret="l"; } + + $ret; + } + sub out { + my $self = shift; + if ($gas) { sprintf "%s%%%s%s", $self->{asterisk}, + $self->{value}, + $self->{opmask}; } + else { $self->{opmask} =~ s/%(k[0-7])/$1/; + $self->{value}.$self->{opmask}; } + } +} +{ package label; # pick up labels, which end with : + sub re { + my ($class, $line) = @_; + my $self = {}; + my $ret; + + if ($$line =~ /(^[\.\w]+)\:/) { + bless $self,$class; + $self->{value} = $1; + $ret = $self; + $$line = substr($$line,@+[0]); $$line =~ s/^\s+//; + + $self->{value} =~ s/^\.L/$decor/; + } + $ret; + } + sub out { + my $self = shift; + + if ($gas) { + my $func = ($globals{$self->{value}} or $self->{value}) . 
":"; + if ($win64 && $current_function->{name} eq $self->{value} + && $current_function->{abi} eq "svr4") { + $func .= "\n"; + $func .= " movq %rdi,8(%rsp)\n"; + $func .= " movq %rsi,16(%rsp)\n"; + $func .= " movq %rsp,%rax\n"; + $func .= "${decor}SEH_begin_$current_function->{name}:\n"; + my $narg = $current_function->{narg}; + $narg=6 if (!defined($narg)); + $func .= " movq %rcx,%rdi\n" if ($narg>0); + $func .= " movq %rdx,%rsi\n" if ($narg>1); + $func .= " movq %r8,%rdx\n" if ($narg>2); + $func .= " movq %r9,%rcx\n" if ($narg>3); + $func .= " movq 40(%rsp),%r8\n" if ($narg>4); + $func .= " movq 48(%rsp),%r9\n" if ($narg>5); + } + $func; + } elsif ($self->{value} ne "$current_function->{name}") { + # Make all labels in masm global. + $self->{value} .= ":" if ($masm); + $self->{value} . ":"; + } elsif ($win64 && $current_function->{abi} eq "svr4") { + my $func = "$current_function->{name}" . + ($nasm ? ":" : "\tPROC $current_function->{scope}") . + "\n"; + $func .= " mov QWORD$PTR\[8+rsp\],rdi\t;WIN64 prologue\n"; + $func .= " mov QWORD$PTR\[16+rsp\],rsi\n"; + $func .= " mov rax,rsp\n"; + $func .= "${decor}SEH_begin_$current_function->{name}:"; + $func .= ":" if ($masm); + $func .= "\n"; + my $narg = $current_function->{narg}; + $narg=6 if (!defined($narg)); + $func .= " mov rdi,rcx\n" if ($narg>0); + $func .= " mov rsi,rdx\n" if ($narg>1); + $func .= " mov rdx,r8\n" if ($narg>2); + $func .= " mov rcx,r9\n" if ($narg>3); + $func .= " mov r8,QWORD$PTR\[40+rsp\]\n" if ($narg>4); + $func .= " mov r9,QWORD$PTR\[48+rsp\]\n" if ($narg>5); + $func .= "\n"; + } else { + "$current_function->{name}". + ($nasm ? ":" : "\tPROC $current_function->{scope}"); + } + } +} +{ package expr; # pick up expressions + sub re { + my ($class, $line, $opcode) = @_; + my $self = {}; + my $ret; + + if ($$line =~ /(^[^,]+)/) { + bless $self,$class; + $self->{value} = $1; + $ret = $self; + $$line = substr($$line,@+[0]); $$line =~ s/^\s+//; + + $self->{value} =~ s/\@PLT// if (!$elf); + $self->{value} =~ s/([_a-z][_a-z0-9]*)/$globals{$1} or $1/gei; + $self->{value} =~ s/\.L/$decor/g; + $self->{opcode} = $opcode; + } + $ret; + } + sub out { + my $self = shift; + if ($nasm && $self->{opcode}->mnemonic()=~m/^j(?![re]cxz)/) { + "NEAR ".$self->{value}; + } else { + $self->{value}; + } + } +} +{ package cfi_directive; + # CFI directives annotate instructions that are significant for + # stack unwinding procedure compliant with DWARF specification, + # see http://dwarfstd.org/. Besides naturally expected for this + # script platform-specific filtering function, this module adds + # three auxiliary synthetic directives not recognized by [GNU] + # assembler: + # + # - .cfi_push to annotate push instructions in prologue, which + # translates to .cfi_adjust_cfa_offset (if needed) and + # .cfi_offset; + # - .cfi_pop to annotate pop instructions in epilogue, which + # translates to .cfi_adjust_cfa_offset (if needed) and + # .cfi_restore; + # - [and most notably] .cfi_cfa_expression which encodes + # DW_CFA_def_cfa_expression and passes it to .cfi_escape as + # byte vector; + # + # CFA expressions were introduced in DWARF specification version + # 3 and describe how to deduce CFA, Canonical Frame Address. This + # becomes handy if your stack frame is variable and you can't + # spare register for [previous] frame pointer. Suggested directive + # syntax is made-up mix of DWARF operator suffixes [subset of] + # and references to registers with optional bias. 
Following example + # describes offloaded *original* stack pointer at specific offset + # from *current* stack pointer: + # + # .cfi_cfa_expression %rsp+40,deref,+8 + # + # Final +8 has everything to do with the fact that CFA is defined + # as reference to top of caller's stack, and on x86_64 call to + # subroutine pushes 8-byte return address. In other words original + # stack pointer upon entry to a subroutine is 8 bytes off from CFA. + + # Below constants are taken from "DWARF Expressions" section of the + # DWARF specification, section is numbered 7.7 in versions 3 and 4. + my %DW_OP_simple = ( # no-arg operators, mapped directly + deref => 0x06, dup => 0x12, + drop => 0x13, over => 0x14, + pick => 0x15, swap => 0x16, + rot => 0x17, xderef => 0x18, + + abs => 0x19, and => 0x1a, + div => 0x1b, minus => 0x1c, + mod => 0x1d, mul => 0x1e, + neg => 0x1f, not => 0x20, + or => 0x21, plus => 0x22, + shl => 0x24, shr => 0x25, + shra => 0x26, xor => 0x27, + ); + + my %DW_OP_complex = ( # used in specific subroutines + constu => 0x10, # uleb128 + consts => 0x11, # sleb128 + plus_uconst => 0x23, # uleb128 + lit0 => 0x30, # add 0-31 to opcode + reg0 => 0x50, # add 0-31 to opcode + breg0 => 0x70, # add 0-31 to opcole, sleb128 + regx => 0x90, # uleb28 + fbreg => 0x91, # sleb128 + bregx => 0x92, # uleb128, sleb128 + piece => 0x93, # uleb128 + ); + + # Following constants are defined in x86_64 ABI supplement, for + # example available at https://www.uclibc.org/docs/psABI-x86_64.pdf, + # see section 3.7 "Stack Unwind Algorithm". + my %DW_reg_idx = ( + "%rax"=>0, "%rdx"=>1, "%rcx"=>2, "%rbx"=>3, + "%rsi"=>4, "%rdi"=>5, "%rbp"=>6, "%rsp"=>7, + "%r8" =>8, "%r9" =>9, "%r10"=>10, "%r11"=>11, + "%r12"=>12, "%r13"=>13, "%r14"=>14, "%r15"=>15 + ); + + my ($cfa_reg, $cfa_rsp); + my @cfa_stack; + + # [us]leb128 format is variable-length integer representation base + # 2^128, with most significant bit of each byte being 0 denoting + # *last* most significant digit. See "Variable Length Data" in the + # DWARF specification, numbered 7.6 at least in versions 3 and 4. + sub sleb128 { + use integer; # get right shift extend sign + + my $val = shift; + my $sign = ($val < 0) ? -1 : 0; + my @ret = (); + + while(1) { + push @ret, $val&0x7f; + + # see if remaining bits are same and equal to most + # significant bit of the current digit, if so, it's + # last digit... + last if (($val>>6) == $sign); + + @ret[-1] |= 0x80; + $val >>= 7; + } + + return @ret; + } + sub uleb128 { + my $val = shift; + my @ret = (); + + while(1) { + push @ret, $val&0x7f; + + # see if it's last significant digit... + last if (($val >>= 7) == 0); + + @ret[-1] |= 0x80; + } + + return @ret; + } + sub const { + my $val = shift; + + if ($val >= 0 && $val < 32) { + return ($DW_OP_complex{lit0}+$val); + } + return ($DW_OP_complex{consts}, sleb128($val)); + } + sub reg { + my $val = shift; + + return if ($val !~ m/^(%r\w+)(?:([\+\-])((?:0x)?[0-9a-f]+))?/); + + my $reg = $DW_reg_idx{$1}; + my $off = eval ("0 $2 $3"); + + return (($DW_OP_complex{breg0} + $reg), sleb128($off)); + # Yes, we use DW_OP_bregX+0 to push register value and not + # DW_OP_regX, because latter would require even DW_OP_piece, + # which would be a waste under the circumstances. If you have + # to use DWP_OP_reg, use "regx:N"... 
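The uleb128/sleb128 subs above are the standard DWARF variable-length integer encoders. A C rendering with a couple of hand-checkable values may help when reading the generated .cfi_escape byte strings; it assumes the usual arithmetic right shift of negative values, just as the Perl version relies on "use integer" for its sign handling.

    /* Illustrative C equivalents of the uleb128/sleb128 helpers above. */
    #include <stddef.h>
    #include <stdint.h>

    static size_t c_uleb128(uint64_t v, uint8_t out[10]) {
      size_t n = 0;
      do {
        uint8_t byte = v & 0x7f;
        v >>= 7;
        if (v != 0) byte |= 0x80;            /* more digits follow */
        out[n++] = byte;
      } while (v != 0);
      return n;
    }

    static size_t c_sleb128(int64_t v, uint8_t out[10]) {
      size_t n = 0;
      for (;;) {
        uint8_t byte = v & 0x7f;
        v >>= 7;                             /* arithmetic shift keeps the sign */
        int done = (v == 0 && !(byte & 0x40)) || (v == -1 && (byte & 0x40));
        if (!done) byte |= 0x80;
        out[n++] = byte;
        if (done) return n;
      }
    }

    /* Hand-checkable values: c_uleb128(624485) -> e5 8e 26,
     * c_sleb128(-8) -> 78, c_sleb128(-129) -> ff 7e. */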
+ } + sub cfa_expression { + my $line = shift; + my @ret; + + foreach my $token (split(/,\s*/,$line)) { + if ($token =~ /^%r/) { + push @ret,reg($token); + } elsif ($token =~ /((?:0x)?[0-9a-f]+)\((%r\w+)\)/) { + push @ret,reg("$2+$1"); + } elsif ($token =~ /(\w+):(\-?(?:0x)?[0-9a-f]+)(U?)/i) { + my $i = 1*eval($2); + push @ret,$DW_OP_complex{$1}, ($3 ? uleb128($i) : sleb128($i)); + } elsif (my $i = 1*eval($token) or $token eq "0") { + if ($token =~ /^\+/) { + push @ret,$DW_OP_complex{plus_uconst},uleb128($i); + } else { + push @ret,const($i); + } + } else { + push @ret,$DW_OP_simple{$token}; + } + } + + # Finally we return DW_CFA_def_cfa_expression, 15, followed by + # length of the expression and of course the expression itself. + return (15,scalar(@ret),@ret); + } + sub re { + my ($class, $line) = @_; + my $self = {}; + my $ret; + + if ($$line =~ s/^\s*\.cfi_(\w+)\s*//) { + bless $self,$class; + $ret = $self; + undef $self->{value}; + my $dir = $1; + + SWITCH: for ($dir) { + # What is $cfa_rsp? Effectively it's difference between %rsp + # value and current CFA, Canonical Frame Address, which is + # why it starts with -8. Recall that CFA is top of caller's + # stack... + /startproc/ && do { ($cfa_reg, $cfa_rsp) = ("%rsp", -8); last; }; + /endproc/ && do { ($cfa_reg, $cfa_rsp) = ("%rsp", 0); last; }; + /def_cfa_register/ + && do { $cfa_reg = $$line; last; }; + /def_cfa_offset/ + && do { $cfa_rsp = -1*eval($$line) if ($cfa_reg eq "%rsp"); + last; + }; + /adjust_cfa_offset/ + && do { $cfa_rsp -= 1*eval($$line) if ($cfa_reg eq "%rsp"); + last; + }; + /def_cfa/ && do { if ($$line =~ /(%r\w+)\s*,\s*(.+)/) { + $cfa_reg = $1; + $cfa_rsp = -1*eval($2) if ($cfa_reg eq "%rsp"); + } + last; + }; + /push/ && do { $dir = undef; + $cfa_rsp -= 8; + if ($cfa_reg eq "%rsp") { + $self->{value} = ".cfi_adjust_cfa_offset\t8\n"; + } + $self->{value} .= ".cfi_offset\t$$line,$cfa_rsp"; + last; + }; + /pop/ && do { $dir = undef; + $cfa_rsp += 8; + if ($cfa_reg eq "%rsp") { + $self->{value} = ".cfi_adjust_cfa_offset\t-8\n"; + } + $self->{value} .= ".cfi_restore\t$$line"; + last; + }; + /cfa_expression/ + && do { $dir = undef; + $self->{value} = ".cfi_escape\t" . + join(",", map(sprintf("0x%02x", $_), + cfa_expression($$line))); + last; + }; + /remember_state/ + && do { push @cfa_stack, [$cfa_reg, $cfa_rsp]; + last; + }; + /restore_state/ + && do { ($cfa_reg, $cfa_rsp) = @{pop @cfa_stack}; + last; + }; + } + + $self->{value} = ".cfi_$dir\t$$line" if ($dir); + + $$line = ""; + } + + return $ret; + } + sub out { + my $self = shift; + return ($elf ? $self->{value} : undef); + } +} +{ package seh_directive; + # This implements directives, like MASM, gas, and clang-assembler for + # specifying Windows unwind codes. See + # https://learn.microsoft.com/en-us/cpp/build/exception-handling-x64?view=msvc-170 + # for details on the Windows unwind mechanism. As perlasm generally uses gas + # syntax, the syntax is patterned after the gas spelling, described in + # https://sourceware.org/legacy-ml/binutils/2009-08/msg00193.html + # + # TODO(https://crbug.com/boringssl/571): Translate to the MASM directives + # when using the MASM output. Emit as-is when using "mingw64" output, which + # is Windows with gas syntax. + # + # TODO(https://crbug.com/boringssl/259): For now, SEH directives are ignored + # on non-Windows platforms. This means functions need to specify both CFI + # and SEH directives, often redundantly. Ideally we'd abstract between the + # two. 
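Tracing the comment's example through cfa_expression() and the DW_OP tables above, .cfi_cfa_expression %rsp+40,deref,+8 should come out as the bytes below (an illustrative hand-derivation, not output captured from the script):

    /* Expected expansion of ".cfi_cfa_expression %rsp+40,deref,+8". */
    static const unsigned char kCfaEscape[] = {
        0x0f,        /* DW_CFA_def_cfa_expression */
        0x05,        /* length of the expression in bytes */
        0x77, 0x28,  /* DW_OP_breg7 (%rsp), sleb128(40): saved sp is at rsp+40 */
        0x06,        /* DW_OP_deref: load it */
        0x23, 0x08,  /* DW_OP_plus_uconst, uleb128(8): step over the return address */
    };
    /* i.e. ".cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08" */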
E.g., we can synthesize CFI from SEH prologues, but SEH does not + # annotate epilogs, so we'd need to combine parts from both. Or we can + # restrict ourselves to a subset of CFI and synthesize SEH from CFI. + # + # Additionally, this only supports @abi-omnipotent functions. It is + # incompatible with the automatic calling convention conversion. The main + # complication is the current scheme modifies RDI and RSI (non-volatile on + # Windows) at the start of the function, and saves them in the parameter + # stack area. This can be expressed with .seh_savereg, but .seh_savereg is + # only usable late in the prologue. However, unwind information gives enough + # information to locate the parameter stack area at any point in the + # function, so we can defer conversion or implement other schemes. + + my $UWOP_PUSH_NONVOL = 0; + my $UWOP_ALLOC_LARGE = 1; + my $UWOP_ALLOC_SMALL = 2; + my $UWOP_SET_FPREG = 3; + my $UWOP_SAVE_NONVOL = 4; + my $UWOP_SAVE_NONVOL_FAR = 5; + my $UWOP_SAVE_XMM128 = 8; + my $UWOP_SAVE_XMM128_FAR = 9; + + my %UWOP_REG_TO_NUMBER = ("%rax" => 0, "%rcx" => 1, "%rdx" => 2, "%rbx" => 3, + "%rsp" => 4, "%rbp" => 5, "%rsi" => 6, "%rdi" => 7, + map(("%r$_" => $_), (8..15))); + my %UWOP_NUMBER_TO_REG = reverse %UWOP_REG_TO_NUMBER; + + # The contents of the pdata and xdata sections so far. + my ($xdata, $pdata) = ("", ""); + + my %info; + + my $next_label = 0; + my $current_label_func = ""; + + # _new_unwind_label allocates a new label, unique to the file. + sub _new_unwind_label { + my ($name) = (@_); + # Labels only need to be unique, but to make diffs easier to read, scope + # them all under the current function. + my $func = $current_function->{name}; + if ($func ne $current_label_func) { + $current_label_func = $func; + $next_label = 0; + } + + my $num = $next_label++; + return ".LSEH_${name}_${func}_${num}"; + } + + sub _check_in_proc { + die "Missing .seh_startproc directive" unless %info; + } + + sub _check_in_prologue { + _check_in_proc(); + die "Invalid SEH directive after .seh_endprologue" if defined($info{endprologue}); + } + + sub _check_not_in_proc { + die "Missing .seh_endproc directive" if %info; + } + + sub _startproc { + _check_not_in_proc(); + if ($current_function->{abi} eq "svr4") { + die "SEH directives can only be used with \@abi-omnipotent"; + } + + my $info_label = _new_unwind_label("info"); + my $start_label = _new_unwind_label("begin"); + %info = ( + # info_label is the label of the function's entry in .xdata. + info_label => $info_label, + # start_label is the start of the function. + start_label => $start_label, + # endprologue is the label of the end of the prologue. + endprologue => undef, + # unwind_codes contains the textual representation of the + # unwind codes in the function so far. + unwind_codes => "", + # num_codes is the number of 16-bit words in unwind_codes. + num_codes => 0, + # frame_reg is the number of the frame register, or zero if + # there is none. + frame_reg => 0, + # frame_offset is the offset into the fixed part of the stack that + # the frame register points into. + frame_offset => 0, + # has_offset is whether directives taking an offset have + # been used. This is used to check that such directives + # come after the fixed portion of the stack frame is established. + has_offset => 0, + # has_nonpushreg is whether directives other than + # .seh_pushreg have been used. This is used to check that + # .seh_pushreg directives are first. 
+ has_nonpushreg => 0, + ); + return $start_label; + } + + sub _add_unwind_code { + my ($op, $value, @extra) = @_; + _check_in_prologue(); + if ($op != $UWOP_PUSH_NONVOL) { + $info{has_nonpushreg} = 1; + } elsif ($info{has_nonpushreg}) { + die ".seh_pushreg directives must appear first in the prologue"; + } + + my $label = _new_unwind_label("prologue"); + # Encode an UNWIND_CODE structure. See + # https://learn.microsoft.com/en-us/cpp/build/exception-handling-x64?view=msvc-170#struct-unwind_code + my $encoded = $op | ($value << 4); + my $codes = <<____; + .byte $label-$info{start_label} + .byte $encoded +____ + # Some opcodes need additional values to encode themselves. + foreach (@extra) { + $codes .= "\t.value\t$_\n"; + } + + $info{num_codes} += 1 + scalar(@extra); + # Unwind codes are listed in reverse order. + $info{unwind_codes} = $codes . $info{unwind_codes}; + return $label; + } + + sub _updating_fixed_allocation { + _check_in_prologue(); + if ($info{frame_reg} != 0) { + # Windows documentation does not explicitly forbid .seh_stackalloc + # after .seh_setframe, but it appears to have no effect. Offsets are + # still relative to the fixed allocation when the frame register was + # established. + die "fixed allocation may not be increased after .seh_setframe"; + } + if ($info{has_offset}) { + # Windows documentation does not explicitly forbid .seh_savereg + # before .seh_stackalloc, but it does not work very well. Offsets + # are relative to the top of the final fixed allocation, not where + # RSP currently is. + die "directives with an offset must come after the fixed allocation is established."; + } + } + + sub _endproc { + _check_in_proc(); + if (!defined($info{endprologue})) { + die "Missing .seh_endprologue"; + } + + my $end_label = _new_unwind_label("end"); + # Encode a RUNTIME_FUNCTION. See + # https://learn.microsoft.com/en-us/cpp/build/exception-handling-x64?view=msvc-170#struct-runtime_function + $pdata .= <<____; + .rva $info{start_label} + .rva $end_label + .rva $info{info_label} + +____ + + # Encode an UNWIND_INFO. See + # https://learn.microsoft.com/en-us/cpp/build/exception-handling-x64?view=msvc-170#struct-unwind_info + my $frame_encoded = $info{frame_reg} | (($info{frame_offset} / 16) << 4); + $xdata .= <<____; +$info{info_label}: + .byte 1 # version 1, no flags + .byte $info{endprologue}-$info{start_label} + .byte $info{num_codes} + .byte $frame_encoded +$info{unwind_codes} +____ + + # UNWIND_INFOs must be 4-byte aligned. If needed, we must add an extra + # unwind code. This does not change the unwind code count. Windows + # documentation says "For alignment purposes, this array always has an + # even number of entries, and the final entry is potentially unused. In + # that case, the array is one longer than indicated by the count of + # unwind codes field." 
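For reference, the .pdata/.xdata records that _endproc() assembles follow the fixed layouts from the Microsoft documentation linked above; a sketch of the common case handled here (version 1, no flags, no chained unwind info):

    /* Illustration of the emitted records, not code used by the script. */
    struct runtime_function {          /* one .pdata entry: three RVAs */
      unsigned int begin_address;      /* .rva start_label */
      unsigned int end_address;        /* .rva end_label */
      unsigned int unwind_info;        /* .rva info_label */
    };

    struct unwind_info_header {        /* start of the matching .xdata entry */
      unsigned char version_and_flags; /* 1: version 1, no flags */
      unsigned char prologue_size;     /* endprologue - start_label */
      unsigned char count_of_codes;    /* num_codes */
      unsigned char frame_reg_and_off; /* frame_reg | ((frame_offset / 16) << 4) */
      /* followed by the 16-bit unwind codes, padded to an even count */
    };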
+ if ($info{num_codes} & 1) { + $xdata .= "\t.value\t0\n"; + } + + %info = (); + return $end_label; + } + + sub re { + my ($class, $line) = @_; + if ($$line =~ s/^\s*\.seh_(\w+)\s*//) { + my $dir = $1; + if (!$win64) { + $$line = ""; + return; + } + + my $label; + SWITCH: for ($dir) { + /^startproc$/ && do { + $label = _startproc($1); + last; + }; + /^pushreg$/ && do { + $$line =~ /^(%\w+)\s*$/ or die "could not parse .seh_$dir"; + my $reg_num = $UWOP_REG_TO_NUMBER{$1} or die "unknown register $1"; + _updating_fixed_allocation(); + $label = _add_unwind_code($UWOP_PUSH_NONVOL, $reg_num); + last; + }; + /^stackalloc$/ && do { + my $num = eval($$line); + if ($num <= 0 || $num % 8 != 0) { + die "invalid stack allocation: $num"; + } + _updating_fixed_allocation(); + if ($num <= 128) { + $label = _add_unwind_code($UWOP_ALLOC_SMALL, ($num - 8) / 8); + } elsif ($num < 512 * 1024) { + $label = _add_unwind_code($UWOP_ALLOC_LARGE, 0, $num / 8); + } elsif ($num < 4 * 1024 * 1024 * 1024) { + $label = _add_unwind_code($UWOP_ALLOC_LARGE, 1, $num >> 16, $num & 0xffff); + } else { + die "stack allocation too large: $num" + } + last; + }; + /^setframe$/ && do { + if ($info{frame_reg} != 0) { + die "duplicate .seh_setframe directive"; + } + if ($info{has_offset}) { + die "directives with with an offset must come after .seh_setframe."; + } + $$line =~ /(%\w+)\s*,\s*(.+)/ or die "could not parse .seh_$dir"; + my $reg_num = $UWOP_REG_TO_NUMBER{$1} or die "unknown register $1"; + my $offset = eval($2); + if ($offset < 0 || $offset % 16 != 0 || $offset > 240) { + die "invalid offset: $offset"; + } + $info{frame_reg} = $reg_num; + $info{frame_offset} = $offset; + $label = _add_unwind_code($UWOP_SET_FPREG, 0); + last; + }; + /^savereg$/ && do { + $$line =~ /(%\w+)\s*,\s*(.+)/ or die "could not parse .seh_$dir"; + my $reg_num = $UWOP_REG_TO_NUMBER{$1} or die "unknown register $1"; + my $offset = eval($2); + if ($offset < 0 || $offset % 8 != 0) { + die "invalid offset: $offset"; + } + if ($offset < 8 * 65536) { + $label = _add_unwind_code($UWOP_SAVE_NONVOL, $reg_num, $offset / 8); + } else { + $label = _add_unwind_code($UWOP_SAVE_NONVOL_FAR, $reg_num, $offset >> 16, $offset & 0xffff); + } + $info{has_offset} = 1; + last; + }; + /^savexmm$/ && do { + $$line =~ /%xmm(\d+)\s*,\s*(.+)/ or die "could not parse .seh_$dir"; + my $reg_num = $1; + my $offset = eval($2); + if ($offset < 0 || $offset % 16 != 0) { + die "invalid offset: $offset"; + } + if ($offset < 16 * 65536) { + $label = _add_unwind_code($UWOP_SAVE_XMM128, $reg_num, $offset / 16); + } else { + $label = _add_unwind_code($UWOP_SAVE_XMM128_FAR, $reg_num, $offset >> 16, $offset & 0xffff); + } + $info{has_offset} = 1; + last; + }; + /^endprologue$/ && do { + _check_in_prologue(); + if ($info{num_codes} == 0) { + # If a Windows function has no directives (i.e. it + # doesn't touch the stack), it is a leaf function and is + # not expected to appear in .pdata or .xdata. + die ".seh_endprologue found with no unwind codes"; + } + + $label = _new_unwind_label("endprologue"); + $info{endprologue} = $label; + last; + }; + /^endproc$/ && do { + $label = _endproc(); + last; + }; + die "unknown SEH directive .seh_$dir"; + } + + # All SEH directives compile to labels inline. The other data is + # emitted later. 
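The second byte of each UNWIND_CODE packs the operation into the low nibble and the operation info into the high nibble, which is all _add_unwind_code() computes; the first byte is the prologue offset of the label emitted alongside it. A few hand-worked directives, matching the switch above (illustrative):

    /* encoded = op | (info << 4), as in _add_unwind_code() */
    static unsigned char seh_unwind_code(unsigned op, unsigned info) {
      return (unsigned char)(op | (info << 4));
    }
    /* .seh_pushreg %rbp    -> UWOP_PUSH_NONVOL(0) | (5 << 4)              = 0x50 */
    /* .seh_stackalloc 40   -> UWOP_ALLOC_SMALL(2) | (((40 - 8) / 8) << 4) = 0x42 */
    /* .seh_savereg %rsi,64 -> UWOP_SAVE_NONVOL(4) | (6 << 4)              = 0x64,
     *                         followed by .value 64/8 = 8 (offset in 8-byte slots) */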
+ $$line = ""; + $label .= ":"; + return label->re(\$label); + } + } + + sub pdata_and_xdata { + return "" unless $win64; + + my $ret = ""; + if ($pdata ne "") { + $ret .= <<____; +.section .pdata +.align 4 +$pdata +____ + } + if ($xdata ne "") { + $ret .= <<____; +.section .xdata +.align 4 +$xdata +____ + } + return $ret; + } +} +{ package directive; # pick up directives, which start with . + my %sections; + sub nasm_section { + my ($name, $qualifiers) = @_; + my $ret = "section\t$name"; + if (exists $sections{$name}) { + # Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392701. Only + # emit section qualifiers the first time a section is referenced. + # For all subsequent references, require the qualifiers match and + # omit them. + # + # See also https://crbug.com/1422018 and b/270643835. + my $old = $sections{$name}; + die "Inconsistent qualifiers: $qualifiers vs $old" if ($qualifiers ne "" && $qualifiers ne $old); + } else { + $sections{$name} = $qualifiers; + if ($qualifiers ne "") { + $ret .= " $qualifiers"; + } + } + return $ret; + } + sub re { + my ($class, $line) = @_; + my $self = {}; + my $ret; + my $dir; + + # chain-call to cfi_directive and seh_directive. + $ret = cfi_directive->re($line) and return $ret; + $ret = seh_directive->re($line) and return $ret; + + if ($$line =~ /^\s*(\.\w+)/) { + bless $self,$class; + $dir = $1; + $ret = $self; + undef $self->{value}; + $$line = substr($$line,@+[0]); $$line =~ s/^\s+//; + + SWITCH: for ($dir) { + /\.global|\.globl|\.extern/ + && do { $globals{$$line} = $prefix . $$line; + $$line = $globals{$$line} if ($prefix); + last; + }; + /\.type/ && do { my ($sym,$type,$narg) = split(/\s*,\s*/,$$line); + if ($type eq "\@function") { + undef $current_function; + $current_function->{name} = $sym; + $current_function->{abi} = "svr4"; + $current_function->{narg} = $narg; + $current_function->{scope} = defined($globals{$sym})?"PUBLIC":"PRIVATE"; + } elsif ($type eq "\@abi-omnipotent") { + undef $current_function; + $current_function->{name} = $sym; + $current_function->{scope} = defined($globals{$sym})?"PUBLIC":"PRIVATE"; + } + $$line =~ s/\@abi\-omnipotent/\@function/; + $$line =~ s/\@function.*/\@function/; + last; + }; + /\.asciz/ && do { if ($$line =~ /^"(.*)"$/) { + $dir = ".byte"; + $$line = join(",",unpack("C*",$1),0); + } + last; + }; + /\.rva|\.long|\.quad|\.byte/ + && do { $$line =~ s/([_a-z][_a-z0-9]*)/$globals{$1} or $1/gei; + $$line =~ s/\.L/$decor/g; + last; + }; + } + + if ($gas) { + $self->{value} = $dir . "\t" . $$line; + + if ($dir =~ /\.extern/) { + if ($flavour eq "elf") { + $self->{value} .= "\n.hidden $$line"; + } else { + $self->{value} = ""; + } + } elsif (!$elf && $dir =~ /\.type/) { + $self->{value} = ""; + $self->{value} = ".def\t" . ($globals{$1} or $1) . ";\t" . + (defined($globals{$1})?".scl 2;":".scl 3;") . + "\t.type 32;\t.endef" + if ($win64 && $$line =~ /([^,]+),\@function/); + } elsif (!$elf && $dir =~ /\.size/) { + $self->{value} = ""; + if (defined($current_function)) { + $self->{value} .= "${decor}SEH_end_$current_function->{name}:" + if ($win64 && $current_function->{abi} eq "svr4"); + undef $current_function; + } + } elsif (!$elf && $dir =~ /\.align/) { + $self->{value} = ".p2align\t" . 
(log($$line)/log(2)); + } elsif ($dir eq ".section") { + $current_segment=$$line; + if (!$elf && $current_segment eq ".rodata") { + if ($flavour eq "macosx") { $self->{value} = ".section\t__DATA,__const"; } + } + if (!$elf && $current_segment eq ".init") { + if ($flavour eq "macosx") { $self->{value} = ".mod_init_func"; } + elsif ($flavour eq "mingw64") { $self->{value} = ".section\t.ctors"; } + } + } elsif ($dir =~ /\.(text|data)/) { + $current_segment=".$1"; + } elsif ($dir =~ /\.global|\.globl|\.extern/) { + if ($flavour eq "macosx") { + $self->{value} .= "\n.private_extern $$line"; + } else { + $self->{value} .= "\n.hidden $$line"; + } + } elsif ($dir =~ /\.hidden/) { + if ($flavour eq "macosx") { $self->{value} = ".private_extern\t$prefix$$line"; } + elsif ($flavour eq "mingw64") { $self->{value} = ""; } + } elsif ($dir =~ /\.comm/) { + $self->{value} = "$dir\t$prefix$$line"; + $self->{value} =~ s|,([0-9]+),([0-9]+)$|",$1,".log($2)/log(2)|e if ($flavour eq "macosx"); + } + $$line = ""; + return $self; + } + + # non-gas case or nasm/masm + SWITCH: for ($dir) { + /\.text/ && do { my $v=undef; + if ($nasm) { + $v=nasm_section(".text", "code align=64")."\n"; + } else { + $v="$current_segment\tENDS\n" if ($current_segment); + $current_segment = ".text\$"; + $v.="$current_segment\tSEGMENT "; + $v.=$masm>=$masmref ? "ALIGN(256)" : "PAGE"; + $v.=" 'CODE'"; + } + $self->{value} = $v; + last; + }; + /\.data/ && do { my $v=undef; + if ($nasm) { + $v=nasm_section(".data", "data align=8")."\n"; + } else { + $v="$current_segment\tENDS\n" if ($current_segment); + $current_segment = "_DATA"; + $v.="$current_segment\tSEGMENT"; + } + $self->{value} = $v; + last; + }; + /\.section/ && do { my $v=undef; + $$line =~ s/([^,]*).*/$1/; + $$line = ".CRT\$XCU" if ($$line eq ".init"); + $$line = ".rdata" if ($$line eq ".rodata"); + if ($nasm) { + my $qualifiers = ""; + if ($$line=~/\.([prx])data/) { + $qualifiers = "rdata align="; + $qualifiers .= $1 eq "p"? 4 : 8; + } elsif ($$line=~/\.CRT\$/i) { + $qualifiers = "rdata align=8"; + } + $v = nasm_section($$line, $qualifiers); + } else { + $v="$current_segment\tENDS\n" if ($current_segment); + $v.="$$line\tSEGMENT"; + if ($$line=~/\.([prx])data/) { + $v.=" READONLY"; + $v.=" ALIGN(".($1 eq "p" ? 4 : 8).")" if ($masm>=$masmref); + } elsif ($$line=~/\.CRT\$/i) { + $v.=" READONLY "; + $v.=$masm>=$masmref ? "ALIGN(8)" : "DWORD"; + } + } + $current_segment = $$line; + $self->{value} = $v; + last; + }; + /\.extern/ && do { $self->{value} = "EXTERN\t".$$line; + $self->{value} .= ":NEAR" if ($masm); + last; + }; + /\.globl|.global/ + && do { $self->{value} = $masm?"PUBLIC":"global"; + $self->{value} .= "\t".$$line; + last; + }; + /\.size/ && do { if (defined($current_function)) { + undef $self->{value}; + if ($current_function->{abi} eq "svr4") { + $self->{value}="${decor}SEH_end_$current_function->{name}:"; + $self->{value}.=":\n" if($masm); + } + $self->{value}.="$current_function->{name}\tENDP" if($masm && $current_function->{name}); + undef $current_function; + } + last; + }; + /\.align/ && do { my $max = ($masm && $masm>=$masmref) ? 
256 : 4096; + $self->{value} = "ALIGN\t".($$line>$max?$max:$$line); + last; + }; + /\.(value|long|rva|quad)/ + && do { my $sz = substr($1,0,1); + my @arr = split(/,\s*/,$$line); + my $last = pop(@arr); + my $conv = sub { my $var=shift; + $var=~s/^(0b[0-1]+)/oct($1)/eig; + $var=~s/^0x([0-9a-f]+)/0$1h/ig if ($masm); + if ($sz eq "D" && ($current_segment=~/.[px]data/ || $dir eq ".rva")) + { $var=~s/^([_a-z\$\@][_a-z0-9\$\@]*)/$nasm?"$1 wrt ..imagebase":"imagerel $1"/egi; } + $var; + }; + + $sz =~ tr/bvlrq/BWDDQ/; + $self->{value} = "\tD$sz\t"; + for (@arr) { $self->{value} .= &$conv($_).","; } + $self->{value} .= &$conv($last); + last; + }; + /\.byte/ && do { my @str=split(/,\s*/,$$line); + map(s/(0b[0-1]+)/oct($1)/eig,@str); + map(s/0x([0-9a-f]+)/0$1h/ig,@str) if ($masm); + while ($#str>15) { + $self->{value}.="\tDB\t" + .join(",",@str[0..15])."\n"; + foreach (0..15) { shift @str; } + } + $self->{value}.="\tDB\t" + .join(",",@str) if (@str); + last; + }; + /\.comm/ && do { my @str=split(/,\s*/,$$line); + my $v=undef; + if ($nasm) { + $v.="common $prefix@str[0] @str[1]"; + } else { + $v="$current_segment\tENDS\n" if ($current_segment); + $current_segment = "_DATA"; + $v.="$current_segment\tSEGMENT\n"; + $v.="COMM @str[0]:DWORD:".@str[1]/4; + } + $self->{value} = $v; + last; + }; + } + $$line = ""; + } + + $ret; + } + sub out { + my $self = shift; + $self->{value}; + } +} + +# Upon initial x86_64 introduction SSE>2 extensions were not introduced +# yet. In order not to be bothered by tracing exact assembler versions, +# but at the same time to provide a bare security minimum of AES-NI, we +# hard-code some instructions. Extensions past AES-NI on the other hand +# are traced by examining assembler version in individual perlasm +# modules... + +my %regrm = ( "%eax"=>0, "%ecx"=>1, "%edx"=>2, "%ebx"=>3, + "%esp"=>4, "%ebp"=>5, "%esi"=>6, "%edi"=>7 ); + +sub rex { + my $opcode=shift; + my ($dst,$src,$rex)=@_; + + $rex|=0x04 if($dst>=8); + $rex|=0x01 if($src>=8); + push @$opcode,($rex|0x40) if ($rex); +} + +my $movq = sub { # elderly gas can't handle inter-register movq + my $arg = shift; + my @opcode=(0x66); + if ($arg =~ /%xmm([0-9]+),\s*%r(\w+)/) { + my ($src,$dst)=($1,$2); + if ($dst !~ /[0-9]+/) { $dst = $regrm{"%e$dst"}; } + rex(\@opcode,$src,$dst,0x8); + push @opcode,0x0f,0x7e; + push @opcode,0xc0|(($src&7)<<3)|($dst&7); # ModR/M + @opcode; + } elsif ($arg =~ /%r(\w+),\s*%xmm([0-9]+)/) { + my ($src,$dst)=($2,$1); + if ($dst !~ /[0-9]+/) { $dst = $regrm{"%e$dst"}; } + rex(\@opcode,$src,$dst,0x8); + push @opcode,0x0f,0x6e; + push @opcode,0xc0|(($src&7)<<3)|($dst&7); # ModR/M + @opcode; + } else { + (); + } +}; + +my $pextrd = sub { + if (shift =~ /\$([0-9]+),\s*%xmm([0-9]+),\s*(%\w+)/) { + my @opcode=(0x66); + my $imm=$1; + my $src=$2; + my $dst=$3; + if ($dst =~ /%r([0-9]+)d/) { $dst = $1; } + elsif ($dst =~ /%e/) { $dst = $regrm{$dst}; } + rex(\@opcode,$src,$dst); + push @opcode,0x0f,0x3a,0x16; + push @opcode,0xc0|(($src&7)<<3)|($dst&7); # ModR/M + push @opcode,$imm; + @opcode; + } else { + (); + } +}; + +my $pinsrd = sub { + if (shift =~ /\$([0-9]+),\s*(%\w+),\s*%xmm([0-9]+)/) { + my @opcode=(0x66); + my $imm=$1; + my $src=$2; + my $dst=$3; + if ($src =~ /%r([0-9]+)/) { $src = $1; } + elsif ($src =~ /%e/) { $src = $regrm{$src}; } + rex(\@opcode,$dst,$src); + push @opcode,0x0f,0x3a,0x22; + push @opcode,0xc0|(($dst&7)<<3)|($src&7); # ModR/M + push @opcode,$imm; + @opcode; + } else { + (); + } +}; + +my $pshufb = sub { + if (shift =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) { + my @opcode=(0x66); + 
rex(\@opcode,$2,$1); + push @opcode,0x0f,0x38,0x00; + push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M + @opcode; + } else { + (); + } +}; + +my $palignr = sub { + if (shift =~ /\$([0-9]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) { + my @opcode=(0x66); + rex(\@opcode,$3,$2); + push @opcode,0x0f,0x3a,0x0f; + push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M + push @opcode,$1; + @opcode; + } else { + (); + } +}; + +my $pclmulqdq = sub { + if (shift =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) { + my @opcode=(0x66); + rex(\@opcode,$3,$2); + push @opcode,0x0f,0x3a,0x44; + push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M + my $c=$1; + push @opcode,$c=~/^0/?oct($c):$c; + @opcode; + } else { + (); + } +}; + +my $rdrand = sub { + if (shift =~ /%[er](\w+)/) { + my @opcode=(); + my $dst=$1; + if ($dst !~ /[0-9]+/) { $dst = $regrm{"%e$dst"}; } + rex(\@opcode,0,$dst,8); + push @opcode,0x0f,0xc7,0xf0|($dst&7); + @opcode; + } else { + (); + } +}; + +my $rdseed = sub { + if (shift =~ /%[er](\w+)/) { + my @opcode=(); + my $dst=$1; + if ($dst !~ /[0-9]+/) { $dst = $regrm{"%e$dst"}; } + rex(\@opcode,0,$dst,8); + push @opcode,0x0f,0xc7,0xf8|($dst&7); + @opcode; + } else { + (); + } +}; + +# Not all AVX-capable assemblers recognize AMD XOP extension. Since we +# are using only two instructions hand-code them in order to be excused +# from chasing assembler versions... + +sub rxb { + my $opcode=shift; + my ($dst,$src1,$src2,$rxb)=@_; + + $rxb|=0x7<<5; + $rxb&=~(0x04<<5) if($dst>=8); + $rxb&=~(0x01<<5) if($src1>=8); + $rxb&=~(0x02<<5) if($src2>=8); + push @$opcode,$rxb; +} + +my $vprotd = sub { + if (shift =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) { + my @opcode=(0x8f); + rxb(\@opcode,$3,$2,-1,0x08); + push @opcode,0x78,0xc2; + push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M + my $c=$1; + push @opcode,$c=~/^0/?oct($c):$c; + @opcode; + } else { + (); + } +}; + +my $vprotq = sub { + if (shift =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) { + my @opcode=(0x8f); + rxb(\@opcode,$3,$2,-1,0x08); + push @opcode,0x78,0xc3; + push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M + my $c=$1; + push @opcode,$c=~/^0/?oct($c):$c; + @opcode; + } else { + (); + } +}; + +# Intel Control-flow Enforcement Technology extension. All functions and +# indirect branch targets will have to start with this instruction... + +my $endbranch = sub { + (0xf3,0x0f,0x1e,0xfa); +}; + +######################################################################## + +{ + my $comment = "//"; + $comment = ";" if ($masm || $nasm); + print <<___; +$comment This file is generated from a similarly-named Perl script in the BoringSSL +$comment source tree. Do not edit by hand. + +___ +} + +if ($nasm) { + die "unknown target" unless ($win64); + print <<___; +\%ifidn __OUTPUT_FORMAT__, win64 +default rel +\%define XMMWORD +\%define YMMWORD +\%define ZMMWORD +\%define _CET_ENDBR + +\%include "ring_core_generated/prefix_symbols_nasm.inc" +___ +} elsif ($masm) { + print <<___; +OPTION DOTNAME +___ +} + +if ($gas) { + my $target; + if ($elf) { + # The "elf" target is really ELF with SysV ABI, but every ELF platform + # uses the SysV ABI. 
+ $target = "defined(__ELF__)"; + } elsif ($apple) { + $target = "defined(__APPLE__)"; + } else { + die "unknown target: $flavour"; + } + print <<___; +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && $target +___ +} + +sub process_line { + my $line = shift; + $line =~ s|\R$||; # Better chomp + + if ($nasm) { + $line =~ s|^#ifdef |%ifdef |; + $line =~ s|^#ifndef |%ifndef |; + $line =~ s|^#endif|%endif|; + $line =~ s|[#!].*$||; # get rid of asm-style comments... + } else { + # Get rid of asm-style comments but not preprocessor directives. The + # former are identified by having a letter after the '#' and starting in + # the first column. + $line =~ s|!.*$||; + $line =~ s|(?<=.)#.*$||; + $line =~ s|^#([^a-z].*)?$||; + } + + $line =~ s|/\*.*\*/||; # ... and C-style comments... + $line =~ s|^\s+||; # ... and skip white spaces in beginning + $line =~ s|\s+$||; # ... and at the end + + if (my $label=label->re(\$line)) { print $label->out(); } + + if (my $directive=directive->re(\$line)) { + printf "%s",$directive->out(); + } elsif (my $opcode=opcode->re(\$line)) { + my $asm = eval("\$".$opcode->mnemonic()); + + if ((ref($asm) eq 'CODE') && scalar(my @bytes=&$asm($line))) { + print $gas?".byte\t":"DB\t",join(',',@bytes),"\n"; + next; + } + + my @args; + ARGUMENT: while (1) { + my $arg; + + ($arg=register->re(\$line, $opcode))|| + ($arg=const->re(\$line)) || + ($arg=ea->re(\$line, $opcode)) || + ($arg=expr->re(\$line, $opcode)) || + last ARGUMENT; + + push @args,$arg; + + last ARGUMENT if ($line !~ /^,/); + + $line =~ s/^,\s*//; + } # ARGUMENT: + + if ($#args>=0) { + my $insn; + my $sz=$opcode->size(); + + if ($gas) { + $insn = $opcode->out($#args>=1?$args[$#args]->size():$sz); + @args = map($_->out($sz),@args); + printf "\t%s\t%s",$insn,join(",",@args); + } else { + $insn = $opcode->out(); + foreach (@args) { + my $arg = $_->out(); + # $insn.=$sz compensates for movq, pinsrw, ... + if ($arg =~ /^xmm[0-9]+$/) { $insn.=$sz; $sz="x" if(!$sz); last; } + if ($arg =~ /^ymm[0-9]+$/) { $insn.=$sz; $sz="y" if(!$sz); last; } + if ($arg =~ /^zmm[0-9]+$/) { $insn.=$sz; $sz="z" if(!$sz); last; } + if ($arg =~ /^mm[0-9]+$/) { $insn.=$sz; $sz="q" if(!$sz); last; } + } + @args = reverse(@args); + undef $sz if ($nasm && $opcode->mnemonic() eq "lea"); + printf "\t%s\t%s",$insn,join(",",map($_->out($sz),@args)); + } + } else { + printf "\t%s",$opcode->out(); + } + } + + print $line,"\n"; +} + +while(defined(my $line=<>)) { + process_line($line); +} +foreach my $line (split(/\n/, seh_directive->pdata_and_xdata())) { + process_line($line); +} + +print "\n$current_segment\tENDS\n" if ($current_segment && $masm); +if ($masm) { + print "END\n"; +} elsif ($gas) { + print "#endif\n"; +} elsif ($nasm) { + print <<___; +\%else +; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 +ret +\%endif +___ +} else { + die "unknown assembler"; +} + +close STDOUT or die "error closing STDOUT: $!"; + + ################################################# +# Cross-reference x86_64 ABI "card" +# +# Unix Win64 +# %rax * * +# %rbx - - +# %rcx #4 #1 +# %rdx #3 #2 +# %rsi #2 - +# %rdi #1 - +# %rbp - - +# %rsp - - +# %r8 #5 #3 +# %r9 #6 #4 +# %r10 * * +# %r11 * * +# %r12 - - +# %r13 - - +# %r14 - - +# %r15 - - +# +# (*) volatile register +# (-) preserved by callee +# (#) Nth argument, volatile +# +# In Unix terms top of stack is argument transfer area for arguments +# which could not be accommodated in registers. Or in other words 7th +# [integer] argument resides at 8(%rsp) upon function entry point. 
+# 128 bytes above %rsp constitute a "red zone" which is not touched +# by signal handlers and can be used as temporal storage without +# allocating a frame. +# +# In Win64 terms N*8 bytes on top of stack is argument transfer area, +# which belongs to/can be overwritten by callee. N is the number of +# arguments passed to callee, *but* not less than 4! This means that +# upon function entry point 5th argument resides at 40(%rsp), as well +# as that 32 bytes from 8(%rsp) can always be used as temporal +# storage [without allocating a frame]. One can actually argue that +# one can assume a "red zone" above stack pointer under Win64 as well. +# Point is that at apparently no occasion Windows kernel would alter +# the area above user stack pointer in true asynchronous manner... +# +# All the above means that if assembler programmer adheres to Unix +# register and stack layout, but disregards the "red zone" existence, +# it's possible to use following prologue and epilogue to "gear" from +# Unix to Win64 ABI in leaf functions with not more than 6 arguments. +# +# omnipotent_function: +# ifdef WIN64 +# movq %rdi,8(%rsp) +# movq %rsi,16(%rsp) +# movq %rcx,%rdi ; if 1st argument is actually present +# movq %rdx,%rsi ; if 2nd argument is actually ... +# movq %r8,%rdx ; if 3rd argument is ... +# movq %r9,%rcx ; if 4th argument ... +# movq 40(%rsp),%r8 ; if 5th ... +# movq 48(%rsp),%r9 ; if 6th ... +# endif +# ... +# ifdef WIN64 +# movq 8(%rsp),%rdi +# movq 16(%rsp),%rsi +# endif +# ret +# + ################################################# +# Win64 SEH, Structured Exception Handling. +# +# Unlike on Unix systems(*) lack of Win64 stack unwinding information +# has undesired side-effect at run-time: if an exception is raised in +# assembler subroutine such as those in question (basically we're +# referring to segmentation violations caused by malformed input +# parameters), the application is briskly terminated without invoking +# any exception handlers, most notably without generating memory dump +# or any user notification whatsoever. This poses a problem. It's +# possible to address it by registering custom language-specific +# handler that would restore processor context to the state at +# subroutine entry point and return "exception is not handled, keep +# unwinding" code. Writing such handler can be a challenge... But it's +# doable, though requires certain coding convention. Consider following +# snippet: +# +# .type function,@function +# function: +# movq %rsp,%rax # copy rsp to volatile register +# pushq %r15 # save non-volatile registers +# pushq %rbx +# pushq %rbp +# movq %rsp,%r11 +# subq %rdi,%r11 # prepare [variable] stack frame +# andq $-64,%r11 +# movq %rax,0(%r11) # check for exceptions +# movq %r11,%rsp # allocate [variable] stack frame +# movq %rax,0(%rsp) # save original rsp value +# magic_point: +# ... +# movq 0(%rsp),%rcx # pull original rsp value +# movq -24(%rcx),%rbp # restore non-volatile registers +# movq -16(%rcx),%rbx +# movq -8(%rcx),%r15 +# movq %rcx,%rsp # restore original rsp +# magic_epilogue: +# ret +# .size function,.-function +# +# The key is that up to magic_point copy of original rsp value remains +# in chosen volatile register and no non-volatile register, except for +# rsp, is modified. While past magic_point rsp remains constant till +# the very end of the function. 
In this case custom language-specific +# exception handler would look like this: +# +# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, +# CONTEXT *context,DISPATCHER_CONTEXT *disp) +# { ULONG64 *rsp = (ULONG64 *)context->Rax; +# ULONG64 rip = context->Rip; +# +# if (rip >= magic_point) +# { rsp = (ULONG64 *)context->Rsp; +# if (rip < magic_epilogue) +# { rsp = (ULONG64 *)rsp[0]; +# context->Rbp = rsp[-3]; +# context->Rbx = rsp[-2]; +# context->R15 = rsp[-1]; +# } +# } +# context->Rsp = (ULONG64)rsp; +# context->Rdi = rsp[1]; +# context->Rsi = rsp[2]; +# +# memcpy (disp->ContextRecord,context,sizeof(CONTEXT)); +# RtlVirtualUnwind(UNW_FLAG_NHANDLER,disp->ImageBase, +# dips->ControlPc,disp->FunctionEntry,disp->ContextRecord, +# &disp->HandlerData,&disp->EstablisherFrame,NULL); +# return ExceptionContinueSearch; +# } +# +# It's appropriate to implement this handler in assembler, directly in +# function's module. In order to do that one has to know members' +# offsets in CONTEXT and DISPATCHER_CONTEXT structures and some constant +# values. Here they are: +# +# CONTEXT.Rax 120 +# CONTEXT.Rcx 128 +# CONTEXT.Rdx 136 +# CONTEXT.Rbx 144 +# CONTEXT.Rsp 152 +# CONTEXT.Rbp 160 +# CONTEXT.Rsi 168 +# CONTEXT.Rdi 176 +# CONTEXT.R8 184 +# CONTEXT.R9 192 +# CONTEXT.R10 200 +# CONTEXT.R11 208 +# CONTEXT.R12 216 +# CONTEXT.R13 224 +# CONTEXT.R14 232 +# CONTEXT.R15 240 +# CONTEXT.Rip 248 +# CONTEXT.Xmm6 512 +# sizeof(CONTEXT) 1232 +# DISPATCHER_CONTEXT.ControlPc 0 +# DISPATCHER_CONTEXT.ImageBase 8 +# DISPATCHER_CONTEXT.FunctionEntry 16 +# DISPATCHER_CONTEXT.EstablisherFrame 24 +# DISPATCHER_CONTEXT.TargetIp 32 +# DISPATCHER_CONTEXT.ContextRecord 40 +# DISPATCHER_CONTEXT.LanguageHandler 48 +# DISPATCHER_CONTEXT.HandlerData 56 +# UNW_FLAG_NHANDLER 0 +# ExceptionContinueSearch 1 +# +# In order to tie the handler to the function one has to compose +# couple of structures: one for .xdata segment and one for .pdata. +# +# UNWIND_INFO structure for .xdata segment would be +# +# function_unwind_info: +# .byte 9,0,0,0 +# .rva handler +# +# This structure designates exception handler for a function with +# zero-length prologue, no stack frame or frame register. +# +# To facilitate composing of .pdata structures, auto-generated "gear" +# prologue copies rsp value to rax and denotes next instruction with +# .LSEH_begin_{function_name} label. This essentially defines the SEH +# styling rule mentioned in the beginning. Position of this label is +# chosen in such manner that possible exceptions raised in the "gear" +# prologue would be accounted to caller and unwound from latter's frame. +# End of function is marked with respective .LSEH_end_{function_name} +# label. To summarize, .pdata segment would contain +# +# .rva .LSEH_begin_function +# .rva .LSEH_end_function +# .rva function_unwind_info +# +# Reference to function_unwind_info from .xdata segment is the anchor. +# In case you wonder why references are 32-bit .rvas and not 64-bit +# .quads. References put into these two segments are required to be +# *relative* to the base address of the current binary module, a.k.a. +# image base. No Win64 module, be it .exe or .dll, can be larger than +# 2GB and thus such relative references can be and are accommodated in +# 32 bits. +# +# Having reviewed the example function code, one can argue that "movq +# %rsp,%rax" above is redundant. It is not! Keep in mind that on Unix +# rax would contain an undefined value. 
If this "offends" you, use +# another register and refrain from modifying rax till magic_point is +# reached, i.e. as if it was a non-volatile register. If more registers +# are required prior [variable] frame setup is completed, note that +# nobody says that you can have only one "magic point." You can +# "liberate" non-volatile registers by denoting last stack off-load +# instruction and reflecting it in finer grade unwind logic in handler. +# After all, isn't it why it's called *language-specific* handler... +# +# SE handlers are also involved in unwinding stack when executable is +# profiled or debugged. Profiling implies additional limitations that +# are too subtle to discuss here. For now it's sufficient to say that +# in order to simplify handlers one should either a) offload original +# %rsp to stack (like discussed above); or b) if you have a register to +# spare for frame pointer, choose volatile one. +# +# (*) Note that we're talking about run-time, not debug-time. Lack of +# unwind information makes debugging hard on both Windows and +# Unix. "Unlike" refers to the fact that on Unix signal handler +# will always be invoked, core dumped and appropriate exit code +# returned to parent (for user notification). diff --git a/ring-0.17.14/crypto/perlasm/x86asm.pl b/ring-0.17.14/crypto/perlasm/x86asm.pl new file mode 100644 index 0000000000..7b685e0401 --- /dev/null +++ b/ring-0.17.14/crypto/perlasm/x86asm.pl @@ -0,0 +1,368 @@ +#! /usr/bin/env perl +# Copyright 1995-2018 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# require 'x86asm.pl'; +# &asm_init([,$i386only]); +# &function_begin("foo"); +# ... +# &function_end("foo"); +# &asm_finish + +$out=(); +$i386=0; + +# AUTOLOAD is this context has quite unpleasant side effect, namely +# that typos in function calls effectively go to assembler output, +# but on the pros side we don't have to implement one subroutine per +# each opcode... +sub ::AUTOLOAD +{ my $opcode = $AUTOLOAD; + + die "more than 4 arguments passed to $opcode" if ($#_>3); + + $opcode =~ s/.*:://; + if ($opcode =~ /^push/) { $stack+=4; } + elsif ($opcode =~ /^pop/) { $stack-=4; } + + &generic($opcode,@_) or die "undefined subroutine \&$AUTOLOAD"; +} + +# record_function_hit(int) writes a byte with value one to the given offset of +# |BORINGSSL_function_hit|, but only if BORINGSSL_DISPATCH_TEST is defined. +# This is used in impl_dispatch_test.cc to test whether the expected assembly +# functions are triggered by high-level API calls. 
+sub ::record_function_hit +{ my($index)=@_; + &preprocessor_ifdef("BORINGSSL_DISPATCH_TEST"); + &push("ebx"); + &push("edx"); + &call(&label("pic_for_function_hit")); + &set_label("pic_for_function_hit"); + &blindpop("ebx"); + &lea("ebx",&DWP("BORINGSSL_function_hit+$index"."-".&label("pic_for_function_hit"),"ebx")); + &mov("edx", 1); + &movb(&BP(0, "ebx"), "dl"); + &pop("edx"); + &pop("ebx"); + &preprocessor_endif(); +} + +sub ::emit +{ my $opcode=shift; + + if ($#_==-1) { push(@out,"\t$opcode\n"); } + else { push(@out,"\t$opcode\t".join(',',@_)."\n"); } +} + +sub ::LB +{ $_[0] =~ m/^e?([a-d])x$/o or die "$_[0] does not have a 'low byte'"; + $1."l"; +} +sub ::HB +{ $_[0] =~ m/^e?([a-d])x$/o or die "$_[0] does not have a 'high byte'"; + $1."h"; +} +sub ::stack_push{ my $num=$_[0]*4; $stack+=$num; &sub("esp",$num); } +sub ::stack_pop { my $num=$_[0]*4; $stack-=$num; &add("esp",$num); } +sub ::blindpop { &pop($_[0]); $stack+=4; } +sub ::wparam { &DWP($stack+4*$_[0],"esp"); } +sub ::swtmp { &DWP(4*$_[0],"esp"); } + +sub ::bswap +{ if ($i386) # emulate bswap for i386 + { &comment("bswap @_"); + &xchg(&HB(@_),&LB(@_)); + &ror (@_,16); + &xchg(&HB(@_),&LB(@_)); + } + else + { &generic("bswap",@_); } +} +# These are made-up opcodes introduced over the years essentially +# by ignorance, just alias them to real ones... +sub ::movb { &mov(@_); } +sub ::xorb { &xor(@_); } +sub ::rotl { &rol(@_); } +sub ::rotr { &ror(@_); } +sub ::exch { &xchg(@_); } +sub ::halt { &hlt; } +sub ::movz { &movzx(@_); } +sub ::pushf { &pushfd; } +sub ::popf { &popfd; } + +# 3 argument instructions +sub ::movq +{ my($p1,$p2,$optimize)=@_; + + if ($optimize && $p1=~/^mm[0-7]$/ && $p2=~/^mm[0-7]$/) + # movq between mmx registers can sink Intel CPUs + { &::pshufw($p1,$p2,0xe4); } + else + { &::generic("movq",@_); } +} + +# SSE>2 instructions +my %regrm = ( "eax"=>0, "ecx"=>1, "edx"=>2, "ebx"=>3, + "esp"=>4, "ebp"=>5, "esi"=>6, "edi"=>7 ); +sub ::pextrd +{ my($dst,$src,$imm)=@_; + if ("$dst:$src" =~ /(e[a-dsd][ixp]):xmm([0-7])/) + { &::data_byte(0x66,0x0f,0x3a,0x16,0xc0|($2<<3)|$regrm{$1},$imm); } + else + { &::generic("pextrd",@_); } +} + +sub ::pinsrd +{ my($dst,$src,$imm)=@_; + if ("$dst:$src" =~ /xmm([0-7]):(e[a-dsd][ixp])/) + { &::data_byte(0x66,0x0f,0x3a,0x22,0xc0|($1<<3)|$regrm{$2},$imm); } + else + { &::generic("pinsrd",@_); } +} + +sub ::pshufb +{ my($dst,$src)=@_; + if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/) + { &data_byte(0x66,0x0f,0x38,0x00,0xc0|($1<<3)|$2); } + else + { &::generic("pshufb",@_); } +} + +sub ::palignr +{ my($dst,$src,$imm)=@_; + if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/) + { &::data_byte(0x66,0x0f,0x3a,0x0f,0xc0|($1<<3)|$2,$imm); } + else + { &::generic("palignr",@_); } +} + +sub ::pclmulqdq +{ my($dst,$src,$imm)=@_; + if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/) + { &::data_byte(0x66,0x0f,0x3a,0x44,0xc0|($1<<3)|$2,$imm); } + else + { &::generic("pclmulqdq",@_); } +} + +sub ::rdrand +{ my ($dst)=@_; + if ($dst =~ /(e[a-dsd][ixp])/) + { &::data_byte(0x0f,0xc7,0xf0|$regrm{$dst}); } + else + { &::generic("rdrand",@_); } +} + +sub ::rdseed +{ my ($dst)=@_; + if ($dst =~ /(e[a-dsd][ixp])/) + { &::data_byte(0x0f,0xc7,0xf8|$regrm{$dst}); } + else + { &::generic("rdrand",@_); } +} + +sub rxb { + local *opcode=shift; + my ($dst,$src1,$src2,$rxb)=@_; + + $rxb|=0x7<<5; + $rxb&=~(0x04<<5) if($dst>=8); + $rxb&=~(0x01<<5) if($src1>=8); + $rxb&=~(0x02<<5) if($src2>=8); + push @opcode,$rxb; +} + +sub ::vprotd +{ my $args=join(',',@_); + if ($args =~ /xmm([0-7]),xmm([0-7]),([x0-9a-f]+)/) + { my @opcode=(0x8f); + 
rxb(\@opcode,$1,$2,-1,0x08); + push @opcode,0x78,0xc2; + push @opcode,0xc0|($2&7)|(($1&7)<<3); # ModR/M + my $c=$3; + push @opcode,$c=~/^0/?oct($c):$c; + &::data_byte(@opcode); + } + else + { &::generic("vprotd",@_); } +} + +sub ::endbranch +{ + &::data_byte(0xf3,0x0f,0x1e,0xfb); +} + +# label management +$lbdecor="L"; # local label decoration, set by package +$label="000"; + +sub ::islabel # see is argument is a known label +{ my $i; + foreach $i (values %label) { return $i if ($i eq $_[0]); } + $label{$_[0]}; # can be undef +} + +sub ::label # instantiate a function-scope label +{ if (!defined($label{$_[0]})) + { $label{$_[0]}="${lbdecor}${label}${_[0]}"; $label++; } + $label{$_[0]}; +} + +sub ::LABEL # instantiate a file-scope label +{ $label{$_[0]}=$_[1] if (!defined($label{$_[0]})); + $label{$_[0]}; +} + +sub ::static_label { &::LABEL($_[0],$lbdecor.$_[0]); } + +sub ::set_label_B { push(@out,"@_:\n"); } +sub ::set_label +{ my $label=&::label($_[0]); + &::align($_[1]) if ($_[1]>1); + &::set_label_B($label); + $label; +} + +sub ::wipe_labels # wipes function-scope labels +{ foreach $i (keys %label) + { delete $label{$i} if ($label{$i} =~ /^\Q${lbdecor}\E[0-9]{3}/); } +} + +# subroutine management +sub ::function_begin +{ &function_begin_B(@_); + $stack=4; + &push("ebp"); + &push("ebx"); + &push("esi"); + &push("edi"); +} + +sub ::function_end +{ &pop("edi"); + &pop("esi"); + &pop("ebx"); + &pop("ebp"); + &ret(); + &function_end_B(@_); + $stack=0; + &wipe_labels(); +} + +sub ::function_end_A +{ &pop("edi"); + &pop("esi"); + &pop("ebx"); + &pop("ebp"); + &ret(); + $stack+=16; # readjust esp as if we didn't pop anything +} + +sub ::asciz +{ my @str=unpack("C*",shift); + push @str,0; + while ($#str>15) { + &data_byte(@str[0..15]); + foreach (0..15) { shift @str; } + } + &data_byte(@str) if (@str); +} + +sub ::asm_finish +{ &file_end(); + my $comment = "//"; + $comment = ";" if ($win32); + print <<___; +$comment This file is generated from a similarly-named Perl script in the BoringSSL +$comment source tree. Do not edit by hand. + +___ + if ($win32) { + print <<___ unless $masm; +\%include "ring_core_generated/prefix_symbols_nasm.inc" +\%ifidn __OUTPUT_FORMAT__, win32 +___ + print @out; + print <<___ unless $masm; +\%else +; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 +ret +\%endif +___ + } else { + my $target; + if ($elf) { + $target = "defined(__ELF__)"; + } elsif ($macosx) { + $target = "defined(__APPLE__)"; + } else { + die "unknown target"; + } + + print <<___; +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && $target +___ + print @out; + print <<___; +#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && $target +___ + } +} + +sub ::asm_init +{ my ($type,$cpu)=@_; + + $i386=$cpu; + + $elf=$cpp=$coff=$aout=$macosx=$win32=$mwerks=$android=0; + if (($type eq "elf")) + { $elf=1; require "x86gas.pl"; } + elsif (($type eq "elf-1")) + { $elf=-1; require "x86gas.pl"; } + elsif (($type eq "a\.out")) + { $aout=1; require "x86gas.pl"; } + elsif (($type eq "coff" or $type eq "gaswin")) + { $coff=1; require "x86gas.pl"; } + elsif (($type eq "win32n")) + { $win32=1; require "x86nasm.pl"; } + elsif (($type eq "win32")) + { $win32=1; $masm=1; require "x86masm.pl"; } + elsif (($type eq "macosx")) + { $aout=1; $macosx=1; require "x86gas.pl"; } + elsif (($type eq "android")) + { $elf=1; $android=1; require "x86gas.pl"; } + else + { print STDERR <<"EOF"; +Pick one target type from + elf - Linux, FreeBSD, Solaris x86, etc. + a.out - DJGPP, elder OpenBSD, etc. 
+ coff - GAS/COFF such as Win32 targets + win32n - Windows 95/Windows NT NASM format + macosx - Mac OS X +EOF + exit(1); + } + + $pic=0; + for (@ARGV) { $pic=1 if (/\-[fK]PIC/i); } + + &file(); +} + +sub ::hidden {} + +1; diff --git a/ring-0.17.14/crypto/perlasm/x86gas.pl b/ring-0.17.14/crypto/perlasm/x86gas.pl new file mode 100644 index 0000000000..62a5710a8f --- /dev/null +++ b/ring-0.17.14/crypto/perlasm/x86gas.pl @@ -0,0 +1,285 @@ +#! /usr/bin/env perl +# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +package x86gas; + +*out=\@::out; + +$::lbdecor=$::aout?"L":".L"; # local label decoration +$nmdecor=($::aout or $::coff)?"_":""; # external name decoration + +$initseg=""; + +$align=16; +$align=log($align)/log(2) if ($::aout); +$com_start="#" if ($::aout or $::coff); + +sub opsize() +{ my $reg=shift; + if ($reg =~ m/^%e/o) { "l"; } + elsif ($reg =~ m/^%[a-d][hl]$/o) { "b"; } + elsif ($reg =~ m/^%[yxm]/o) { undef; } + else { "w"; } +} + +# swap arguments; +# expand opcode with size suffix; +# prefix numeric constants with $; +sub ::generic +{ my($opcode,@arg)=@_; + my($suffix,$dst,$src); + + @arg=reverse(@arg); + + for (@arg) + { s/^(\*?)(e?[a-dsixphl]{2})$/$1%$2/o; # gp registers + s/^([xy]?mm[0-7])$/%$1/o; # xmm/mmx registers + s/^(\-?[0-9]+)$/\$$1/o; # constants + s/^(\-?0x[0-9a-f]+)$/\$$1/o; # constants + } + + $dst = $arg[$#arg] if ($#arg>=0); + $src = $arg[$#arg-1] if ($#arg>=1); + if ($dst =~ m/^%/o) { $suffix=&opsize($dst); } + elsif ($src =~ m/^%/o) { $suffix=&opsize($src); } + else { $suffix="l"; } + undef $suffix if ($dst =~ m/^%[xm]/o || $src =~ m/^%[xm]/o); + + if ($#_==0) { &::emit($opcode); } + elsif ($#_==1 && $opcode =~ m/^(call|clflush|j|loop|set)/o) + { &::emit($opcode,@arg); } + else { &::emit($opcode.$suffix,@arg);} + + 1; +} +# +# opcodes not covered by ::generic above, mostly inconsistent namings... 
+# +sub ::movzx { &::movzb(@_); } +sub ::pushfd { &::pushfl; } +sub ::popfd { &::popfl; } +sub ::cpuid { &::emit(".byte\t0x0f,0xa2"); } +sub ::rdtsc { &::emit(".byte\t0x0f,0x31"); } + +sub ::call { &::emit("call",(&::islabel($_[0]) or "$nmdecor$_[0]")); } +sub ::call_ptr { &::generic("call","*$_[0]"); } +sub ::jmp_ptr { &::generic("jmp","*$_[0]"); } + +*::bswap = sub { &::emit("bswap","%$_[0]"); } if (!$::i386); + +sub ::DWP +{ my($addr,$reg1,$reg2,$idx)=@_; + my $ret=""; + + if (!defined($idx) && 1*$reg2) { $idx=$reg2; $reg2=$reg1; undef $reg1; } + + $addr =~ s/^\s+//; + # prepend global references with optional underscore + $addr =~ s/^([^\+\-0-9][^\+\-]*)/&::islabel($1) or "$nmdecor$1"/ige; + + $reg1 = "%$reg1" if ($reg1); + $reg2 = "%$reg2" if ($reg2); + + $ret .= $addr if (($addr ne "") && ($addr ne 0)); + + if ($reg2) + { $idx!= 0 or $idx=1; + $ret .= "($reg1,$reg2,$idx)"; + } + elsif ($reg1) + { $ret .= "($reg1)"; } + + $ret; +} +sub ::QWP { &::DWP(@_); } +sub ::BP { &::DWP(@_); } +sub ::WP { &::DWP(@_); } +sub ::BC { @_; } +sub ::DWC { @_; } + +sub ::file +{ push(@out,".text\n"); } + +sub ::function_begin_B +{ my $func=shift; + my $global=($func !~ /^_/); + my $begin="${::lbdecor}_${func}_begin"; + + &::LABEL($func,$global?"$begin":"$nmdecor$func"); + $func=$nmdecor.$func; + + push(@out,".globl\t$func\n") if ($global); + if ($::macosx) { + push(@out,".private_extern\t$func\n"); + } else { + push(@out,".hidden\t$func\n"); + } + if ($::coff) + { push(@out,".def\t$func;\t.scl\t".(3-$global).";\t.type\t32;\t.endef\n"); } + elsif (($::aout and !$::pic) or $::macosx) + { } + else + { push(@out,".type $func,\@function\n"); } + push(@out,".align\t$align\n"); + push(@out,"$func:\n"); + push(@out,"$begin:\n") if ($global); + $::stack=4; +} + +sub ::function_end_B +{ my $func=shift; + push(@out,".size\t$nmdecor$func,.-".&::LABEL($func)."\n") if ($::elf); + $::stack=0; + &::wipe_labels(); +} + +sub ::comment + { + if (!defined($com_start) or $::elf) + { # Regarding $::elf above... + # GNU and SVR4 as'es use different comment delimiters, + push(@out,"\n"); # so we just skip ELF comments... 
+ return; + } + foreach (@_) + { + if (/^\s*$/) + { push(@out,"\n"); } + else + { push(@out,"\t$com_start $_ $com_end\n"); } + } + } + +sub ::external_label +{ foreach(@_) { &::LABEL($_,$nmdecor.$_); } } + +sub ::public_label +{ push(@out,".globl\t".&::LABEL($_[0],$nmdecor.$_[0])."\n"); } + +sub ::file_end +{ if ($::macosx) + { if (%non_lazy_ptr) + { push(@out,".section __IMPORT,__pointers,non_lazy_symbol_pointers\n"); + foreach $i (keys %non_lazy_ptr) + { push(@out,"$non_lazy_ptr{$i}:\n.indirect_symbol\t$i\n.long\t0\n"); } + } + } + if (0 && grep {/\b${nmdecor}OPENSSL_ia32cap_P\b/i} @out) { + my $tmp=".comm\t${nmdecor}OPENSSL_ia32cap_P,16"; + if ($::macosx) { push (@out,"$tmp,2\n"); } + elsif ($::elf) { push (@out,"$tmp,4\n"); } + else { push (@out,"$tmp\n"); } + } + push(@out,$initseg) if ($initseg); +} + +sub ::data_byte { push(@out,".byte\t".join(',',@_)."\n"); } +sub ::data_short{ push(@out,".value\t".join(',',@_)."\n"); } +sub ::data_word { push(@out,".long\t".join(',',@_)."\n"); } + +sub ::align +{ my $val=$_[0]; + if ($::aout) + { $val=int(log($val)/log(2)); + $val.=",0x90"; + } + push(@out,".align\t$val\n"); +} + +sub ::picmeup +{ my($dst,$sym,$base,$reflabel)=@_; + + if (($::pic && ($::elf || $::aout)) || $::macosx) + { if (!defined($base)) + { &::call(&::label("PIC_me_up")); + &::set_label("PIC_me_up"); + &::blindpop($dst); + $base=$dst; + $reflabel=&::label("PIC_me_up"); + } + if ($::macosx) + { my $indirect=&::static_label("$nmdecor$sym\$non_lazy_ptr"); + &::mov($dst,&::DWP("$indirect-$reflabel",$base)); + $non_lazy_ptr{"$nmdecor$sym"}=$indirect; + } + elsif ($sym eq "OPENSSL_ia32cap_P" && $::elf>0) + { &::lea($dst,&::DWP("$sym-$reflabel",$base)); } + else + { &::lea($dst,&::DWP("_GLOBAL_OFFSET_TABLE_+[.-$reflabel]", + $base)); + &::mov($dst,&::DWP("$sym\@GOT",$dst)); + } + } + else + { &::lea($dst,&::DWP($sym)); } +} + +sub ::initseg +{ my $f=$nmdecor.shift; + + if ($::android) + { $initseg.=<<___; +.section .init_array +.align 4 +.long $f +___ + } + elsif ($::elf) + { $initseg.=<<___; +.section .init + call $f +___ + } + elsif ($::coff) + { $initseg.=<<___; # applies to both Cygwin and Mingw +.section .ctors +.long $f +___ + } + elsif ($::macosx) + { $initseg.=<<___; +.mod_init_func +.align 2 +.long $f +___ + } + elsif ($::aout) + { my $ctor="${nmdecor}_GLOBAL_\$I\$$f"; + $initseg.=".text\n"; + $initseg.=".type $ctor,\@function\n" if ($::pic); + $initseg.=<<___; # OpenBSD way... +.globl $ctor +.align 2 +$ctor: + jmp $f +___ + } +} + +sub ::dataseg +{ push(@out,".data\n"); } + +sub ::preprocessor_ifdef +{ my($define)=@_; + push(@out,"#ifdef ${define}\n"); +} + +sub ::preprocessor_endif +{ push(@out,"#endif\n"); } + +*::hidden = sub { push(@out,".hidden\t$nmdecor$_[0]\n"); } if ($::elf); + +1; diff --git a/ring-0.17.14/crypto/perlasm/x86nasm.pl b/ring-0.17.14/crypto/perlasm/x86nasm.pl new file mode 100644 index 0000000000..61c70f48a1 --- /dev/null +++ b/ring-0.17.14/crypto/perlasm/x86nasm.pl @@ -0,0 +1,201 @@ +#! /usr/bin/env perl +# Copyright 1999-2018 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + + +package x86nasm; + +*out=\@::out; + +$::lbdecor="L\$"; # local label decoration +$nmdecor="_"; # external name decoration +$drdecor=$::mwerks?".":""; # directive decoration + +$initseg=""; + +sub ::generic +{ my $opcode=shift; + my $tmp; + + if (!$::mwerks) + { if ($opcode =~ m/^j/o && $#_==0) # optimize jumps + { $_[0] = "NEAR $_[0]"; } + elsif ($opcode eq "lea" && $#_==1) # wipe storage qualifier from lea + { $_[1] =~ s/^[^\[]*\[/\[/o; } + elsif ($opcode eq "clflush" && $#_==0) + { $_[0] =~ s/^[^\[]*\[/\[/o; } + } + &::emit($opcode,@_); + 1; +} +# +# opcodes not covered by ::generic above, mostly inconsistent namings... +# +sub ::call { &::emit("call",(&::islabel($_[0]) or "$nmdecor$_[0]")); } +sub ::call_ptr { &::emit("call",@_); } +sub ::jmp_ptr { &::emit("jmp",@_); } + +sub get_mem +{ my($size,$addr,$reg1,$reg2,$idx)=@_; + my($post,$ret); + + if (!defined($idx) && 1*$reg2) { $idx=$reg2; $reg2=$reg1; undef $reg1; } + + if ($size ne "") + { $ret .= "$size"; + $ret .= " PTR" if ($::mwerks); + $ret .= " "; + } + $ret .= "["; + + $addr =~ s/^\s+//; + # prepend global references with optional underscore + $addr =~ s/^([^\+\-0-9][^\+\-]*)/::islabel($1) or "$nmdecor$1"/ige; + # put address arithmetic expression in parenthesis + $addr="($addr)" if ($addr =~ /^.+[\-\+].+$/); + + if (($addr ne "") && ($addr ne 0)) + { if ($addr !~ /^-/) { $ret .= "$addr+"; } + else { $post=$addr; } + } + + if ($reg2 ne "") + { $idx!=0 or $idx=1; + $ret .= "$reg2*$idx"; + $ret .= "+$reg1" if ($reg1 ne ""); + } + else + { $ret .= "$reg1"; } + + $ret .= "$post]"; + $ret =~ s/\+\]/]/; # in case $addr was the only argument + + $ret; +} +sub ::BP { &get_mem("BYTE",@_); } +sub ::DWP { &get_mem("DWORD",@_); } +sub ::WP { &get_mem("WORD",@_); } +sub ::QWP { &get_mem("",@_); } +sub ::BC { (($::mwerks)?"":"BYTE ")."@_"; } +sub ::DWC { (($::mwerks)?"":"DWORD ")."@_"; } + +sub ::file +{ if ($::mwerks) { push(@out,".section\t.text,64\n"); } + else + { my $tmp=<<___; +%ifidn __OUTPUT_FORMAT__,obj +section code use32 class=code align=64 +%elifidn __OUTPUT_FORMAT__,win32 +\$\@feat.00 equ 1 +section .text code align=64 +%else +section .text code +%endif +___ + push(@out,$tmp); + } +} + +sub ::function_begin_B +{ my $func=shift; + my $global=($func !~ /^_/); + my $begin="${::lbdecor}_${func}_begin"; + + $begin =~ s/^\@/./ if ($::mwerks); # the torture never stops + + &::LABEL($func,$global?"$begin":"$nmdecor$func"); + $func=$nmdecor.$func; + + push(@out,"${drdecor}global $func\n") if ($global); + push(@out,"${drdecor}align 16\n"); + push(@out,"$func:\n"); + push(@out,"$begin:\n") if ($global); + $::stack=4; +} + +sub ::function_end_B +{ $::stack=0; + &::wipe_labels(); +} + +sub ::file_end +{ if (grep {/\b${nmdecor}OPENSSL_ia32cap_P\b/i} @out) + { my $comm=<<___; +${drdecor}segment .bss +${drdecor}common ${nmdecor}OPENSSL_ia32cap_P 16 +___ + # comment out OPENSSL_ia32cap_P declarations + grep {s/(^extern\s+${nmdecor}OPENSSL_ia32cap_P)/\;$1/} @out; + push (@out,$comm) + } + push (@out,$initseg) if ($initseg); +} + +sub ::comment { foreach (@_) { push(@out,"\t; $_\n"); } } + +sub ::external_label +{ foreach(@_) + { push(@out,"${drdecor}extern\t".&::LABEL($_,$nmdecor.$_)."\n"); } +} + +sub ::public_label +{ push(@out,"${drdecor}global\t".&::LABEL($_[0],$nmdecor.$_[0])."\n"); } + +sub ::data_byte +{ push(@out,(($::mwerks)?".byte\t":"db\t").join(',',@_)."\n"); } +sub ::data_short +{ 
push(@out,(($::mwerks)?".word\t":"dw\t").join(',',@_)."\n"); } +sub ::data_word +{ push(@out,(($::mwerks)?".long\t":"dd\t").join(',',@_)."\n"); } + +sub ::align +{ push(@out,"${drdecor}align\t$_[0]\n"); } + +sub ::picmeup +{ my($dst,$sym)=@_; + &::lea($dst,&::DWP($sym)); +} + +sub ::initseg +{ my $f=$nmdecor.shift; + if ($::win32) + { $initseg=<<___; +segment .CRT\$XCU data align=4 +extern $f +dd $f +___ + } +} + +sub ::dataseg +{ if ($mwerks) { push(@out,".section\t.data,4\n"); } + else { push(@out,"section\t.data align=4\n"); } +} + +sub ::safeseh +{ my $nm=shift; + push(@out,"%if __NASM_VERSION_ID__ >= 0x02030000\n"); + push(@out,"safeseh ".&::LABEL($nm,$nmdecor.$nm)."\n"); + push(@out,"%endif\n"); +} + +sub ::preprocessor_ifdef +{ my($define)=@_; + push(@out,"%ifdef ${define}\n"); +} + +sub ::preprocessor_endif +{ push(@out,"%endif\n"); } + +1; diff --git a/ring-0.17.14/crypto/poly1305/poly1305.c b/ring-0.17.14/crypto/poly1305/poly1305.c new file mode 100644 index 0000000000..67595362f8 --- /dev/null +++ b/ring-0.17.14/crypto/poly1305/poly1305.c @@ -0,0 +1,246 @@ +/* Copyright (c) 2014, Google Inc. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ + +// This implementation of poly1305 is by Andrew Moon +// (https://github.com/floodyberry/poly1305-donna) and released as public +// domain. + +#include + +#include "../internal.h" +#include "ring-core/check.h" + +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic ignored "-Wsign-conversion" +#pragma GCC diagnostic ignored "-Wconversion" +#endif + +static uint64_t mul32x32_64(uint32_t a, uint32_t b) { return (uint64_t)a * b; } + +// Keep in sync with `poly1305_state_st` in ffi_fallback.rs. +struct poly1305_state_st { + alignas(64) uint32_t r0; + uint32_t r1, r2, r3, r4; + uint32_t s1, s2, s3, s4; + uint32_t h0, h1, h2, h3, h4; + uint8_t key[16]; +}; + +// poly1305_blocks updates |state| given some amount of input data. This +// function may only be called with a |len| that is not a multiple of 16 at the +// end of the data. Otherwise the input must be buffered into 16 byte blocks. 
+static void poly1305_update(struct poly1305_state_st *state, const uint8_t *in, + size_t len) { + debug_assert_nonsecret((uintptr_t)state % 64 == 0); + + uint32_t t0, t1, t2, t3; + uint64_t t[5]; + uint32_t b; + uint64_t c; + size_t j; + uint8_t mp[16]; + + if (len < 16) { + goto poly1305_donna_atmost15bytes; + } + +poly1305_donna_16bytes: + t0 = CRYPTO_load_u32_le(in); + t1 = CRYPTO_load_u32_le(in + 4); + t2 = CRYPTO_load_u32_le(in + 8); + t3 = CRYPTO_load_u32_le(in + 12); + + in += 16; + len -= 16; + + state->h0 += t0 & 0x3ffffff; + state->h1 += ((((uint64_t)t1 << 32) | t0) >> 26) & 0x3ffffff; + state->h2 += ((((uint64_t)t2 << 32) | t1) >> 20) & 0x3ffffff; + state->h3 += ((((uint64_t)t3 << 32) | t2) >> 14) & 0x3ffffff; + state->h4 += (t3 >> 8) | (1 << 24); + +poly1305_donna_mul: + t[0] = mul32x32_64(state->h0, state->r0) + mul32x32_64(state->h1, state->s4) + + mul32x32_64(state->h2, state->s3) + mul32x32_64(state->h3, state->s2) + + mul32x32_64(state->h4, state->s1); + t[1] = mul32x32_64(state->h0, state->r1) + mul32x32_64(state->h1, state->r0) + + mul32x32_64(state->h2, state->s4) + mul32x32_64(state->h3, state->s3) + + mul32x32_64(state->h4, state->s2); + t[2] = mul32x32_64(state->h0, state->r2) + mul32x32_64(state->h1, state->r1) + + mul32x32_64(state->h2, state->r0) + mul32x32_64(state->h3, state->s4) + + mul32x32_64(state->h4, state->s3); + t[3] = mul32x32_64(state->h0, state->r3) + mul32x32_64(state->h1, state->r2) + + mul32x32_64(state->h2, state->r1) + mul32x32_64(state->h3, state->r0) + + mul32x32_64(state->h4, state->s4); + t[4] = mul32x32_64(state->h0, state->r4) + mul32x32_64(state->h1, state->r3) + + mul32x32_64(state->h2, state->r2) + mul32x32_64(state->h3, state->r1) + + mul32x32_64(state->h4, state->r0); + + state->h0 = (uint32_t)t[0] & 0x3ffffff; + c = (t[0] >> 26); + t[1] += c; + state->h1 = (uint32_t)t[1] & 0x3ffffff; + b = (uint32_t)(t[1] >> 26); + t[2] += b; + state->h2 = (uint32_t)t[2] & 0x3ffffff; + b = (uint32_t)(t[2] >> 26); + t[3] += b; + state->h3 = (uint32_t)t[3] & 0x3ffffff; + b = (uint32_t)(t[3] >> 26); + t[4] += b; + state->h4 = (uint32_t)t[4] & 0x3ffffff; + b = (uint32_t)(t[4] >> 26); + state->h0 += b * 5; + + if (len >= 16) { + goto poly1305_donna_16bytes; + } + +// final bytes +poly1305_donna_atmost15bytes: + if (!len) { + return; + } + + for (j = 0; j < len; j++) { + mp[j] = in[j]; + } + mp[j++] = 1; + for (; j < 16; j++) { + mp[j] = 0; + } + len = 0; + + t0 = CRYPTO_load_u32_le(mp + 0); + t1 = CRYPTO_load_u32_le(mp + 4); + t2 = CRYPTO_load_u32_le(mp + 8); + t3 = CRYPTO_load_u32_le(mp + 12); + + state->h0 += t0 & 0x3ffffff; + state->h1 += ((((uint64_t)t1 << 32) | t0) >> 26) & 0x3ffffff; + state->h2 += ((((uint64_t)t2 << 32) | t1) >> 20) & 0x3ffffff; + state->h3 += ((((uint64_t)t3 << 32) | t2) >> 14) & 0x3ffffff; + state->h4 += (t3 >> 8); + + goto poly1305_donna_mul; +} + +void CRYPTO_poly1305_init(struct poly1305_state_st *state, const uint8_t key[32]) { + debug_assert_nonsecret((uintptr_t)state % 64 == 0); + + uint32_t t0, t1, t2, t3; + + t0 = CRYPTO_load_u32_le(key + 0); + t1 = CRYPTO_load_u32_le(key + 4); + t2 = CRYPTO_load_u32_le(key + 8); + t3 = CRYPTO_load_u32_le(key + 12); + + // precompute multipliers + state->r0 = t0 & 0x3ffffff; + t0 >>= 26; + t0 |= t1 << 6; + state->r1 = t0 & 0x3ffff03; + t1 >>= 20; + t1 |= t2 << 12; + state->r2 = t1 & 0x3ffc0ff; + t2 >>= 14; + t2 |= t3 << 18; + state->r3 = t2 & 0x3f03fff; + t3 >>= 8; + state->r4 = t3 & 0x00fffff; + + state->s1 = state->r1 * 5; + state->s2 = state->r2 * 5; + state->s3 = state->r3 * 5; + 
state->s4 = state->r4 * 5; + + // init state + state->h0 = 0; + state->h1 = 0; + state->h2 = 0; + state->h3 = 0; + state->h4 = 0; + + OPENSSL_memcpy(state->key, key + 16, sizeof(state->key)); +} + +void CRYPTO_poly1305_update(struct poly1305_state_st *state, const uint8_t *in, + size_t in_len) { + // Work around a C language bug. See https://crbug.com/1019588. + if (in_len == 0) { + return; + } + + poly1305_update(state, in, in_len); +} + +void CRYPTO_poly1305_finish(struct poly1305_state_st *state, uint8_t mac[16]) { + uint32_t g0, g1, g2, g3, g4; + uint32_t b, nb; + + b = state->h0 >> 26; + state->h0 = state->h0 & 0x3ffffff; + state->h1 += b; + b = state->h1 >> 26; + state->h1 = state->h1 & 0x3ffffff; + state->h2 += b; + b = state->h2 >> 26; + state->h2 = state->h2 & 0x3ffffff; + state->h3 += b; + b = state->h3 >> 26; + state->h3 = state->h3 & 0x3ffffff; + state->h4 += b; + b = state->h4 >> 26; + state->h4 = state->h4 & 0x3ffffff; + state->h0 += b * 5; + + g0 = state->h0 + 5; + b = g0 >> 26; + g0 &= 0x3ffffff; + g1 = state->h1 + b; + b = g1 >> 26; + g1 &= 0x3ffffff; + g2 = state->h2 + b; + b = g2 >> 26; + g2 &= 0x3ffffff; + g3 = state->h3 + b; + b = g3 >> 26; + g3 &= 0x3ffffff; + g4 = state->h4 + b - (1 << 26); + + b = (g4 >> 31) - 1; + nb = ~b; + state->h0 = (state->h0 & nb) | (g0 & b); + state->h1 = (state->h1 & nb) | (g1 & b); + state->h2 = (state->h2 & nb) | (g2 & b); + state->h3 = (state->h3 & nb) | (g3 & b); + state->h4 = (state->h4 & nb) | (g4 & b); + + uint64_t f0 = ((state->h0) | (state->h1 << 26)) + + (uint64_t)CRYPTO_load_u32_le(&state->key[0]); + uint64_t f1 = ((state->h1 >> 6) | (state->h2 << 20)) + + (uint64_t)CRYPTO_load_u32_le(&state->key[4]); + uint64_t f2 = ((state->h2 >> 12) | (state->h3 << 14)) + + (uint64_t)CRYPTO_load_u32_le(&state->key[8]); + uint64_t f3 = ((state->h3 >> 18) | (state->h4 << 8)) + + (uint64_t)CRYPTO_load_u32_le(&state->key[12]); + + CRYPTO_store_u32_le(&mac[0], (uint32_t)f0); + f1 += (f0 >> 32); + CRYPTO_store_u32_le(&mac[4], (uint32_t)f1); + f2 += (f1 >> 32); + CRYPTO_store_u32_le(&mac[8], (uint32_t)f2); + f3 += (f2 >> 32); + CRYPTO_store_u32_le(&mac[12], (uint32_t)f3); +} diff --git a/ring-0.17.14/crypto/poly1305/poly1305_arm.c b/ring-0.17.14/crypto/poly1305/poly1305_arm.c new file mode 100644 index 0000000000..632eccafb1 --- /dev/null +++ b/ring-0.17.14/crypto/poly1305/poly1305_arm.c @@ -0,0 +1,302 @@ +/* Copyright (c) 2014, Google Inc. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ + +// This implementation was taken from the public domain, neon2 version in +// SUPERCOP by D. J. Bernstein and Peter Schwabe. 
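+//
+// Layout note: an fe1305x2 (declared below) packs two Poly1305 field elements
+// for the NEON code: the even words v[0],v[2],...,v[8] are the five 26-bit
+// limbs of one element and the odd words v[1],v[3],...,v[9] those of a second,
+// which is why only 10 of its 12 words are used. CRYPTO_poly1305_init_neon
+// precomputes r^2 and r^4 so the assembly blocks() routine can absorb two
+// 16-byte blocks per pass; this is a summary of the neon2 design referenced
+// above rather than a normative description.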
+ +#include + +#include "../internal.h" + + +#pragma GCC diagnostic ignored "-Wsign-conversion" + +// Keep in sync with ffi_arm_neon.rs +typedef struct { + uint32_t v[12]; // for alignment; only using 10 +} fe1305x2; + +#define addmulmod openssl_poly1305_neon2_addmulmod +#define blocks openssl_poly1305_neon2_blocks + +extern void addmulmod(fe1305x2 *r, const fe1305x2 *x, const fe1305x2 *y, + const fe1305x2 *c); + +extern int blocks(fe1305x2 *h, const fe1305x2 *precomp, const uint8_t *in, + size_t inlen); + +static void freeze(fe1305x2 *r) { + int i; + + uint32_t x0 = r->v[0]; + uint32_t x1 = r->v[2]; + uint32_t x2 = r->v[4]; + uint32_t x3 = r->v[6]; + uint32_t x4 = r->v[8]; + uint32_t y0; + uint32_t y1; + uint32_t y2; + uint32_t y3; + uint32_t y4; + uint32_t swap; + + for (i = 0; i < 3; ++i) { + x1 += x0 >> 26; + x0 &= 0x3ffffff; + x2 += x1 >> 26; + x1 &= 0x3ffffff; + x3 += x2 >> 26; + x2 &= 0x3ffffff; + x4 += x3 >> 26; + x3 &= 0x3ffffff; + x0 += 5 * (x4 >> 26); + x4 &= 0x3ffffff; + } + + y0 = x0 + 5; + y1 = x1 + (y0 >> 26); + y0 &= 0x3ffffff; + y2 = x2 + (y1 >> 26); + y1 &= 0x3ffffff; + y3 = x3 + (y2 >> 26); + y2 &= 0x3ffffff; + y4 = x4 + (y3 >> 26); + y3 &= 0x3ffffff; + swap = -(y4 >> 26); + y4 &= 0x3ffffff; + + y0 ^= x0; + y1 ^= x1; + y2 ^= x2; + y3 ^= x3; + y4 ^= x4; + + y0 &= swap; + y1 &= swap; + y2 &= swap; + y3 &= swap; + y4 &= swap; + + y0 ^= x0; + y1 ^= x1; + y2 ^= x2; + y3 ^= x3; + y4 ^= x4; + + r->v[0] = y0; + r->v[2] = y1; + r->v[4] = y2; + r->v[6] = y3; + r->v[8] = y4; +} + +static void store32(uint8_t out[4], uint32_t v) { OPENSSL_memcpy(out, &v, 4); } + +// load32 exists to avoid breaking strict aliasing rules in +// fe1305x2_frombytearray. +static uint32_t load32(const uint8_t t[4]) { + uint32_t tmp; + OPENSSL_memcpy(&tmp, t, sizeof(tmp)); + return tmp; +} + +static void fe1305x2_tobytearray(uint8_t r[16], fe1305x2 *x) { + uint32_t x0 = x->v[0]; + uint32_t x1 = x->v[2]; + uint32_t x2 = x->v[4]; + uint32_t x3 = x->v[6]; + uint32_t x4 = x->v[8]; + + x1 += x0 >> 26; + x0 &= 0x3ffffff; + x2 += x1 >> 26; + x1 &= 0x3ffffff; + x3 += x2 >> 26; + x2 &= 0x3ffffff; + x4 += x3 >> 26; + x3 &= 0x3ffffff; + + store32(r, x0 + (x1 << 26)); + store32(r + 4, (x1 >> 6) + (x2 << 20)); + store32(r + 8, (x2 >> 12) + (x3 << 14)); + store32(r + 12, (x3 >> 18) + (x4 << 8)); +} + +static void fe1305x2_frombytearray(fe1305x2 *r, const uint8_t *x, size_t xlen) { + size_t i; + uint8_t t[17]; + + for (i = 0; (i < 16) && (i < xlen); i++) { + t[i] = x[i]; + } + xlen -= i; + x += i; + t[i++] = 1; + for (; i < 17; i++) { + t[i] = 0; + } + + r->v[0] = 0x3ffffff & load32(t); + r->v[2] = 0x3ffffff & (load32(t + 3) >> 2); + r->v[4] = 0x3ffffff & (load32(t + 6) >> 4); + r->v[6] = 0x3ffffff & (load32(t + 9) >> 6); + r->v[8] = load32(t + 13); + + if (xlen) { + for (i = 0; (i < 16) && (i < xlen); i++) { + t[i] = x[i]; + } + t[i++] = 1; + for (; i < 17; i++) { + t[i] = 0; + } + + r->v[1] = 0x3ffffff & load32(t); + r->v[3] = 0x3ffffff & (load32(t + 3) >> 2); + r->v[5] = 0x3ffffff & (load32(t + 6) >> 4); + r->v[7] = 0x3ffffff & (load32(t + 9) >> 6); + r->v[9] = load32(t + 13); + } else { + r->v[1] = r->v[3] = r->v[5] = r->v[7] = r->v[9] = 0; + } +} + +static const alignas(16) fe1305x2 zero; + +// Keep in sync with ffi_arm_neon.rs +struct poly1305_state_st { + alignas(16) fe1305x2 r; + fe1305x2 h; + fe1305x2 c; + fe1305x2 precomp[2]; + uint8_t data[128]; + + uint8_t buf[32]; + size_t buf_used; + uint8_t key[16]; +}; + +OPENSSL_STATIC_ASSERT(sizeof(fe1305x2) == 48, "fe1305x2 size is different than expected"); + +void 
CRYPTO_poly1305_init_neon(struct poly1305_state_st *st, const uint8_t key[32]) { + fe1305x2 *const r = &st->r; + fe1305x2 *const h = &st->h; + fe1305x2 *const precomp = &st->precomp[0]; + + r->v[1] = r->v[0] = 0x3ffffff & load32(key); + r->v[3] = r->v[2] = 0x3ffff03 & (load32(key + 3) >> 2); + r->v[5] = r->v[4] = 0x3ffc0ff & (load32(key + 6) >> 4); + r->v[7] = r->v[6] = 0x3f03fff & (load32(key + 9) >> 6); + r->v[9] = r->v[8] = 0x00fffff & (load32(key + 12) >> 8); + + for (size_t j = 0; j < 10; j++) { + h->v[j] = 0; // XXX: should fast-forward a bit + } + + addmulmod(precomp, r, r, &zero); // precompute r^2 + addmulmod(precomp + 1, precomp, precomp, &zero); // precompute r^4 + + OPENSSL_memcpy(st->key, key + 16, 16); + st->buf_used = 0; +} + +void CRYPTO_poly1305_update_neon(struct poly1305_state_st *st, const uint8_t *in, + size_t in_len) { + fe1305x2 *const h = &st->h; + fe1305x2 *const c = &st->c; + fe1305x2 *const precomp = &st->precomp[0]; + + if (st->buf_used) { + size_t todo = 32 - st->buf_used; + if (todo > in_len) { + todo = in_len; + } + for (size_t i = 0; i < todo; i++) { + st->buf[st->buf_used + i] = in[i]; + } + st->buf_used += todo; + in_len -= todo; + in += todo; + + if (st->buf_used == sizeof(st->buf) && in_len) { + addmulmod(h, h, precomp, &zero); + fe1305x2_frombytearray(c, st->buf, sizeof(st->buf)); + for (size_t i = 0; i < 10; i++) { + h->v[i] += c->v[i]; + } + st->buf_used = 0; + } + } + + while (in_len > 32) { + size_t tlen = 1048576; + if (in_len < tlen) { + tlen = in_len; + } + tlen -= blocks(h, precomp, in, tlen); + in_len -= tlen; + in += tlen; + } + + if (in_len) { + for (size_t i = 0; i < in_len; i++) { + st->buf[i] = in[i]; + } + st->buf_used = in_len; + } +} + +void CRYPTO_poly1305_finish_neon(struct poly1305_state_st *st, uint8_t mac[16]) { + fe1305x2 *const r = &st->r; + fe1305x2 *const h = &st->h; + fe1305x2 *const c = &st->c; + fe1305x2 *const precomp = &st->precomp[0]; + + addmulmod(h, h, precomp, &zero); + + if (st->buf_used > 16) { + fe1305x2_frombytearray(c, st->buf, st->buf_used); + precomp->v[1] = r->v[1]; + precomp->v[3] = r->v[3]; + precomp->v[5] = r->v[5]; + precomp->v[7] = r->v[7]; + precomp->v[9] = r->v[9]; + addmulmod(h, h, precomp, c); + } else if (st->buf_used > 0) { + fe1305x2_frombytearray(c, st->buf, st->buf_used); + r->v[1] = 1; + r->v[3] = 0; + r->v[5] = 0; + r->v[7] = 0; + r->v[9] = 0; + addmulmod(h, h, r, c); + } + + h->v[0] += h->v[1]; + h->v[2] += h->v[3]; + h->v[4] += h->v[5]; + h->v[6] += h->v[7]; + h->v[8] += h->v[9]; + freeze(h); + + fe1305x2_frombytearray(c, st->key, 16); + c->v[8] ^= (1 << 24); + + h->v[0] += c->v[0]; + h->v[2] += c->v[2]; + h->v[4] += c->v[4]; + h->v[6] += c->v[6]; + h->v[8] += c->v[8]; + fe1305x2_tobytearray(mac, h); +} diff --git a/ring-0.17.14/crypto/poly1305/poly1305_arm_asm.S b/ring-0.17.14/crypto/poly1305/poly1305_arm_asm.S new file mode 100644 index 0000000000..df464d068d --- /dev/null +++ b/ring-0.17.14/crypto/poly1305/poly1305_arm_asm.S @@ -0,0 +1,2019 @@ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__ELF__) + +#pragma GCC diagnostic ignored "-Wlanguage-extension-token" + +# This implementation was taken from the public domain, neon2 version in +# SUPERCOP by D. J. Bernstein and Peter Schwabe. 
+ +# qhasm: int32 input_0 + +# qhasm: int32 input_1 + +# qhasm: int32 input_2 + +# qhasm: int32 input_3 + +# qhasm: stack32 input_4 + +# qhasm: stack32 input_5 + +# qhasm: stack32 input_6 + +# qhasm: stack32 input_7 + +# qhasm: int32 caller_r4 + +# qhasm: int32 caller_r5 + +# qhasm: int32 caller_r6 + +# qhasm: int32 caller_r7 + +# qhasm: int32 caller_r8 + +# qhasm: int32 caller_r9 + +# qhasm: int32 caller_r10 + +# qhasm: int32 caller_r11 + +# qhasm: int32 caller_r12 + +# qhasm: int32 caller_r14 + +# qhasm: reg128 caller_q4 + +# qhasm: reg128 caller_q5 + +# qhasm: reg128 caller_q6 + +# qhasm: reg128 caller_q7 + +# qhasm: startcode +.fpu neon +.text + +# qhasm: reg128 r0 + +# qhasm: reg128 r1 + +# qhasm: reg128 r2 + +# qhasm: reg128 r3 + +# qhasm: reg128 r4 + +# qhasm: reg128 x01 + +# qhasm: reg128 x23 + +# qhasm: reg128 x4 + +# qhasm: reg128 y0 + +# qhasm: reg128 y12 + +# qhasm: reg128 y34 + +# qhasm: reg128 5y12 + +# qhasm: reg128 5y34 + +# qhasm: stack128 y0_stack + +# qhasm: stack128 y12_stack + +# qhasm: stack128 y34_stack + +# qhasm: stack128 5y12_stack + +# qhasm: stack128 5y34_stack + +# qhasm: reg128 z0 + +# qhasm: reg128 z12 + +# qhasm: reg128 z34 + +# qhasm: reg128 5z12 + +# qhasm: reg128 5z34 + +# qhasm: stack128 z0_stack + +# qhasm: stack128 z12_stack + +# qhasm: stack128 z34_stack + +# qhasm: stack128 5z12_stack + +# qhasm: stack128 5z34_stack + +# qhasm: stack128 two24 + +# qhasm: int32 ptr + +# qhasm: reg128 c01 + +# qhasm: reg128 c23 + +# qhasm: reg128 d01 + +# qhasm: reg128 d23 + +# qhasm: reg128 t0 + +# qhasm: reg128 t1 + +# qhasm: reg128 t2 + +# qhasm: reg128 t3 + +# qhasm: reg128 t4 + +# qhasm: reg128 mask + +# qhasm: reg128 u0 + +# qhasm: reg128 u1 + +# qhasm: reg128 u2 + +# qhasm: reg128 u3 + +# qhasm: reg128 u4 + +# qhasm: reg128 v01 + +# qhasm: reg128 mid + +# qhasm: reg128 v23 + +# qhasm: reg128 v4 + +# qhasm: int32 len + +# qhasm: qpushenter crypto_onetimeauth_poly1305_neon2_blocks +.align 4 +.global openssl_poly1305_neon2_blocks +.hidden openssl_poly1305_neon2_blocks +.type openssl_poly1305_neon2_blocks STT_FUNC +openssl_poly1305_neon2_blocks: +vpush {q4,q5,q6,q7} +mov r12,sp +sub sp,sp,#192 +bic sp,sp,#31 + +# qhasm: len = input_3 +# asm 1: mov >len=int32#4,len=r3,y12=reg128#2%bot->y12=reg128#2%top},[y12=d2->y12=d3},[y34=reg128#3%bot->y34=reg128#3%top},[y34=d4->y34=d5},[input_1=int32#2,input_1=r1,z12=reg128#5%bot->z12=reg128#5%top},[z12=d8->z12=d9},[z34=reg128#6%bot->z34=reg128#6%top},[z34=d10->z34=d11},[mask=reg128#7,#0xffffffff +# asm 2: vmov.i64 >mask=q6,#0xffffffff +vmov.i64 q6,#0xffffffff + +# qhasm: 2x u4 = 0xff +# asm 1: vmov.i64 >u4=reg128#8,#0xff +# asm 2: vmov.i64 >u4=q7,#0xff +vmov.i64 q7,#0xff + +# qhasm: x01 aligned= mem128[input_0];input_0+=16 +# asm 1: vld1.8 {>x01=reg128#9%bot->x01=reg128#9%top},[x01=d16->x01=d17},[x23=reg128#10%bot->x23=reg128#10%top},[x23=d18->x23=d19},[input_0=int32#1,input_0=r0,>=6 +# asm 1: vshr.u64 >mask=reg128#7,mask=q6,>= 7 +# asm 1: vshr.u64 >u4=reg128#8,u4=q7,5y12=reg128#12,5y12=q11,5y34=reg128#13,5y34=q12,5y12=reg128#12,<5y12=reg128#12,5y12=q11,<5y12=q11,5y34=reg128#13,<5y34=reg128#13,5y34=q12,<5y34=q12,u4=reg128#8,u4=q7,5z12=reg128#14,5z12=q13,5z34=reg128#15,5z34=q14,5z12=reg128#14,<5z12=reg128#14,5z12=q13,<5z12=q13,5z34=reg128#15,<5z34=reg128#15,5z34=q14,<5z34=q14,ptr=int32#2,ptr=r1,r4=reg128#16,r4=q15,r0=reg128#8,r0=q7,ptr=int32#2,ptr=r1,ptr=int32#2,ptr=r1,ptr=int32#2,ptr=r1,ptr=int32#2,ptr=r1,ptr=int32#2,ptr=r1,ptr=int32#2,ptr=r1,ptr=int32#2,<5y12_stack=stack128#5 +# asm 2: lea >ptr=r1,<5y12_stack=[sp,#64] +add 
r1,sp,#64 + +# qhasm: mem128[ptr] aligned= 5y12 +# asm 1: vst1.8 {<5y12=reg128#12%bot-<5y12=reg128#12%top},[ptr=int32#2,<5y34_stack=stack128#6 +# asm 2: lea >ptr=r1,<5y34_stack=[sp,#80] +add r1,sp,#80 + +# qhasm: mem128[ptr] aligned= 5y34 +# asm 1: vst1.8 {<5y34=reg128#13%bot-<5y34=reg128#13%top},[ptr=int32#2,<5z12_stack=stack128#10 +# asm 2: lea >ptr=r1,<5z12_stack=[sp,#144] +add r1,sp,#144 + +# qhasm: mem128[ptr] aligned= 5z12 +# asm 1: vst1.8 {<5z12=reg128#14%bot-<5z12=reg128#14%top},[ptr=int32#2,<5z34_stack=stack128#11 +# asm 2: lea >ptr=r1,<5z34_stack=[sp,#160] +add r1,sp,#160 + +# qhasm: mem128[ptr] aligned= 5z34 +# asm 1: vst1.8 {<5z34=reg128#15%bot-<5z34=reg128#15%top},[? len - 64 +# asm 1: cmp +bls ._below64bytes + +# qhasm: input_2 += 32 +# asm 1: add >input_2=int32#2,input_2=r1,c01=reg128#1%bot->c01=reg128#1%top},[c01=d0->c01=d1},[c23=reg128#2%bot->c23=reg128#2%top},[c23=d2->c23=d3},[ptr=int32#3,ptr=r2,z12=reg128#3%bot->z12=reg128#3%top},[z12=d4->z12=d5},[ptr=int32#3,ptr=r2,z0=reg128#4%bot->z0=reg128#4%top},[z0=d6->z0=d7},[r3=reg128#5,r3=q4,input_2=int32#2,input_2=r1,ptr=int32#3,<5z34_stack=stack128#11 +# asm 2: lea >ptr=r2,<5z34_stack=[sp,#160] +add r2,sp,#160 + +# qhasm: 5z34 aligned= mem128[ptr] +# asm 1: vld1.8 {>5z34=reg128#6%bot->5z34=reg128#6%top},[5z34=d10->5z34=d11},[r0=reg128#8,r0=q7,r2=reg128#14,r2=q13,d01=reg128#12%bot->d01=reg128#12%top},[d01=d22->d01=d23},[r1=reg128#15,r1=q14,ptr=int32#3,<5z12_stack=stack128#10 +# asm 2: lea >ptr=r2,<5z12_stack=[sp,#144] +add r2,sp,#144 + +# qhasm: 5z12 aligned= mem128[ptr] +# asm 1: vld1.8 {>5z12=reg128#1%bot->5z12=reg128#1%top},[5z12=d0->5z12=d1},[d23=reg128#2%bot->d23=reg128#2%top},[d23=d2->d23=d3},[input_2=int32#2,input_2=r1,> 40 +# asm 1: vshr.u64 >v4=reg128#4,v4=q3,> 14; v23[3] = d23[2,3] unsigned>> 14 +# asm 1: vshrn.u64 > 26; v01[3] = d01[2,3] unsigned>> 26 +# asm 1: vshrn.u64 > 20; v23[1] = mid[2,3] unsigned>> 20 +# asm 1: vshrn.u64 ptr=int32#3,ptr=r2,y34=reg128#3%bot->y34=reg128#3%top},[y34=d4->y34=d5},[ptr=int32#3,ptr=r2,y12=reg128#2%bot->y12=reg128#2%top},[y12=d2->y12=d3},[ptr=int32#3,ptr=r2,y0=reg128#1%bot->y0=reg128#1%top},[y0=d0->y0=d1},[ptr=int32#3,<5y34_stack=stack128#6 +# asm 2: lea >ptr=r2,<5y34_stack=[sp,#80] +add r2,sp,#80 + +# qhasm: 5y34 aligned= mem128[ptr] +# asm 1: vld1.8 {>5y34=reg128#13%bot->5y34=reg128#13%top},[5y34=d24->5y34=d25},[ptr=int32#3,<5y12_stack=stack128#5 +# asm 2: lea >ptr=r2,<5y12_stack=[sp,#64] +add r2,sp,#64 + +# qhasm: 5y12 aligned= mem128[ptr] +# asm 1: vld1.8 {>5y12=reg128#12%bot->5y12=reg128#12%top},[5y12=d22->5y12=d23},[ptr=int32#3,ptr=r2,> 26 +# asm 1: vshr.u64 >t1=reg128#4,t1=q3,len=int32#4,len=r3,r0=reg128#6,r0=q5,r1=reg128#4,r1=q3,> 26 +# asm 1: vshr.u64 >t4=reg128#8,t4=q7,r3=reg128#5,r3=q4,x4=reg128#8,x4=q7,r4=reg128#16%bot->r4=reg128#16%top},[r4=d30->r4=d31},[> 26 +# asm 1: vshr.u64 >t2=reg128#9,t2=q8,r1=reg128#4,r1=q3,> 26 +# asm 1: vshr.u64 >t0=reg128#10,t0=q9,r2=reg128#9,r2=q8,x4=reg128#11,x4=q10,x01=reg128#6,x01=q5,r0=reg128#8%bot->r0=reg128#8%top},[r0=d14->r0=d15},[ptr=int32#3,ptr=r2,t0=reg128#10,t0=q9,> 26 +# asm 1: vshr.u64 >t3=reg128#14,t3=q13,x01=reg128#15,x01=q14,z34=reg128#6%bot->z34=reg128#6%top},[z34=d10->z34=d11},[x23=reg128#10,x23=q9,r3=reg128#5,r3=q4,input_2=int32#2,input_2=r1,> 26 +# asm 1: vshr.u64 >t1=reg128#14,t1=q13,x01=reg128#9,x01=q8,r1=reg128#4,r1=q3,> 26 +# asm 1: vshr.u64 >t4=reg128#14,t4=q13,r3=reg128#5,r3=q4,x4=reg128#11,x4=q10,? len - 64 +# asm 1: cmp +bhi ._mainloop2 + +# qhasm: input_2 -= 32 +# asm 1: sub >input_2=int32#3,input_2=r2,? 
len - 32 +# asm 1: cmp +bls ._end + +# qhasm: mainloop: +._mainloop: + +# qhasm: new r0 + +# qhasm: ptr = &two24 +# asm 1: lea >ptr=int32#2,ptr=r1,r4=reg128#5%bot->r4=reg128#5%top},[r4=d8->r4=d9},[u4=reg128#6%bot->u4=reg128#6%top},[u4=d10->u4=d11},[c01=reg128#8%bot->c01=reg128#8%top},[c01=d14->c01=d15},[c23=reg128#14%bot->c23=reg128#14%top},[c23=d26->c23=d27},[r0=reg128#4,r0=q3,r3=reg128#6,r3=q5,r1=reg128#14,r1=q13,r2=reg128#8,r2=q7,> 26 +# asm 1: vshr.u64 >t1=reg128#9,t1=q8,r0=reg128#4,r0=q3,r1=reg128#9,r1=q8,> 26 +# asm 1: vshr.u64 >t4=reg128#10,t4=q9,r3=reg128#6,r3=q5,r4=reg128#5,r4=q4,> 26 +# asm 1: vshr.u64 >t2=reg128#10,t2=q9,r1=reg128#11,r1=q10,> 26 +# asm 1: vshr.u64 >t0=reg128#9,t0=q8,r2=reg128#8,r2=q7,r4=reg128#5,r4=q4,r0=reg128#4,r0=q3,t0=reg128#9,t0=q8,> 26 +# asm 1: vshr.u64 >t3=reg128#14,t3=q13,r0=reg128#4,r0=q3,x23=reg128#10,x23=q9,r3=reg128#6,r3=q5,> 26 +# asm 1: vshr.u64 >t1=reg128#8,t1=q7,x01=reg128#9,x01=q8,r1=reg128#4,r1=q3,> 26 +# asm 1: vshr.u64 >t4=reg128#8,t4=q7,r3=reg128#6,r3=q5,x4=reg128#11,x4=q10,len=int32#4,len=r3,? len - 32 +# asm 1: cmp +bhi ._mainloop + +# qhasm: end: +._end: + +# qhasm: mem128[input_0] = x01;input_0+=16 +# asm 1: vst1.8 {len=int32#1,len=r0,mask=reg128#1,#0xffffffff +# asm 2: vmov.i64 >mask=q0,#0xffffffff +vmov.i64 q0,#0xffffffff + +# qhasm: y01 aligned= mem128[input_2];input_2+=16 +# asm 1: vld1.8 {>y01=reg128#2%bot->y01=reg128#2%top},[y01=d2->y01=d3},[_5y01=reg128#3,_5y01=q2,y23=reg128#4%bot->y23=reg128#4%top},[y23=d6->y23=d7},[_5y23=reg128#9,_5y23=q8,_5y4=reg128#11,_5y4=q10,x01=reg128#12%bot->x01=reg128#12%top},[x01=d22->x01=d23},[_5y01=reg128#3,<_5y01=reg128#3,_5y01=q2,<_5y01=q2,x23=reg128#13%bot->x23=reg128#13%top},[x23=d24->x23=d25},[_5y23=reg128#9,<_5y23=reg128#9,_5y23=q8,<_5y23=q8,_5y4=reg128#11,<_5y4=reg128#11,_5y4=q10,<_5y4=q10,c01=reg128#14%bot->c01=reg128#14%top},[c01=d26->c01=d27},[x01=reg128#12,x01=q11,c23=reg128#14%bot->c23=reg128#14%top},[c23=d26->c23=d27},[x23=reg128#13,x23=q12,>=6 +# asm 1: vshr.u64 >mask=reg128#1,mask=q0,x4=reg128#14,x4=q13,r0=reg128#15,r0=q14,r1=reg128#3,r1=q2,r2=reg128#16,r2=q15,r3=reg128#9,r3=q8,r4=reg128#10,r4=q9,> 26 +# asm 1: vshr.u64 >t1=reg128#2,t1=q1,r0=reg128#4,r0=q3,r1=reg128#2,r1=q1,> 26 +# asm 1: vshr.u64 >t4=reg128#3,t4=q2,r3=reg128#9,r3=q8,r4=reg128#3,r4=q2,> 26 +# asm 1: vshr.u64 >t2=reg128#10,t2=q9,r1=reg128#2,r1=q1,> 26 +# asm 1: vshr.u64 >t0=reg128#11,t0=q10,r2=reg128#10,r2=q9,r4=reg128#3,r4=q2,r0=reg128#4,r0=q3,t0=reg128#11,t0=q10,> 26 +# asm 1: vshr.u64 >t3=reg128#12,t3=q11,r0=reg128#4,r0=q3,x23=reg128#10,x23=q9,r3=reg128#9,r3=q8,> 26 +# asm 1: vshr.u64 >t1=reg128#11,t1=q10,x01=reg128#4,x01=q3,r1=reg128#2,r1=q1,> 26 +# asm 1: vshr.u64 >t4=reg128#11,t4=q10,r3=reg128#1,r3=q0,x4=reg128#3,x4=q2, + +// Raw AES functions. + + +// AES_MAXNR is the maximum number of AES rounds. +#define AES_MAXNR 14 + +// aes_key_st should be an opaque type, but EVP requires that the size be +// known. +struct aes_key_st { + uint32_t rd_key[4 * (AES_MAXNR + 1)]; + unsigned rounds; +}; +typedef struct aes_key_st AES_KEY; + +#endif // OPENSSL_HEADER_AES_H diff --git a/ring-0.17.14/include/ring-core/asm_base.h b/ring-0.17.14/include/ring-core/asm_base.h new file mode 100644 index 0000000000..ef8332680f --- /dev/null +++ b/ring-0.17.14/include/ring-core/asm_base.h @@ -0,0 +1,204 @@ +// Copyright 2023 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef OPENSSL_HEADER_ASM_BASE_H +#define OPENSSL_HEADER_ASM_BASE_H + +#include + + +// This header contains symbols and common sections used by assembly files. It +// is included as a public header to simplify the build, but is not intended for +// external use. +// +// Every assembly file must include this header. Some linker features require +// all object files to be tagged with some section metadata. This header file, +// when included in assembly, adds that metadata. It also makes defines like +// |OPENSSL_X86_64| available and includes the prefixing macros. +// +// Including this header in an assembly file implies: +// +// - The file does not require an executable stack. +// +// - The file, on aarch64, uses the macros defined below to be compatible with +// BTI and PAC. +// +// - The file, on x86_64, requires the program to be compatible with Intel IBT +// and SHSTK. + +#if defined(__ASSEMBLER__) + +#include + +#if defined(__ELF__) +// Every ELF object file, even empty ones, should disable executable stacks. See +// https://www.airs.com/blog/archives/518. +.pushsection .note.GNU-stack, "", %progbits +.popsection +#endif + +#if defined(__CET__) && defined(OPENSSL_X86_64) +// Clang and GCC define __CET__ and provide <cet.h> when they support Intel's +// Indirect Branch Tracking. +// https://lpc.events/event/7/contributions/729/attachments/496/903/CET-LPC-2020.pdf +// +// cet.h defines _CET_ENDBR which is used to mark function entry points for IBT +// and adds the assembly marker. The value of _CET_ENDBR is made dependent on whether +// '-fcf-protection' is passed to the compiler. _CET_ENDBR is only required when +// the function is the target of an indirect jump, but BoringSSL chooses to mark +// all assembly entry points because it is easier, and allows BoringSSL's ABI +// tester to call the assembly entry points via an indirect jump. +#include <cet.h> +#else +#define _CET_ENDBR +#endif + +#if defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64) + +// We require the ARM assembler provide |__ARM_ARCH| from Arm C Language +// Extensions (ACLE). This is supported in GCC 4.8+ and Clang 3.2+. MSVC does +// not implement ACLE, but we require Clang's assembler on Windows. +#if !defined(__ARM_ARCH) +#error "ARM assembler must define __ARM_ARCH" +#endif + +// Even when building for 32-bit ARM, support for aarch64 crypto instructions +// will be included. +// +// TODO(davidben): Remove this and the corresponding ifdefs? This is only +// defined because some OpenSSL assembly files would allow disabling the NEON +// code entirely. I think we'd prefer to do that by lifting the dispatch to C +// anyway. +#define __ARM_MAX_ARCH__ 8 + +// Support macros for +// - Armv8.3-A Pointer Authentication and +// - Armv8.5-A Branch Target Identification +// features which require emitting a .note.gnu.property section with the +// appropriate architecture-dependent feature bits set. +// +// |AARCH64_SIGN_LINK_REGISTER| and |AARCH64_VALIDATE_LINK_REGISTER| expand to +// PACIxSP and AUTIxSP, respectively. |AARCH64_SIGN_LINK_REGISTER| should be +// used immediately before saving the LR register (x30) to the stack. 
+// |AARCH64_VALIDATE_LINK_REGISTER| should be used immediately after restoring +// it. Note |AARCH64_SIGN_LINK_REGISTER|'s modifications to LR must be undone +// with |AARCH64_VALIDATE_LINK_REGISTER| before RET. The SP register must also +// have the same value at the two points. For example: +// +// .global f +// f: +// AARCH64_SIGN_LINK_REGISTER +// stp x29, x30, [sp, #-96]! +// mov x29, sp +// ... +// ldp x29, x30, [sp], #96 +// AARCH64_VALIDATE_LINK_REGISTER +// ret +// +// |AARCH64_VALID_CALL_TARGET| expands to BTI 'c'. Either it, or +// |AARCH64_SIGN_LINK_REGISTER|, must be used at every point that may be an +// indirect call target. In particular, all symbols exported from a file must +// begin with one of these macros. For example, a leaf function that does not +// save LR can instead use |AARCH64_VALID_CALL_TARGET|: +// +// .globl return_zero +// return_zero: +// AARCH64_VALID_CALL_TARGET +// mov x0, #0 +// ret +// +// A non-leaf function which does not immediately save LR may need both macros +// because |AARCH64_SIGN_LINK_REGISTER| appears late. For example, the function +// may jump to an alternate implementation before setting up the stack: +// +// .globl with_early_jump +// with_early_jump: +// AARCH64_VALID_CALL_TARGET +// cmp x0, #128 +// b.lt .Lwith_early_jump_128 +// AARCH64_SIGN_LINK_REGISTER +// stp x29, x30, [sp, #-96]! +// mov x29, sp +// ... +// ldp x29, x30, [sp], #96 +// AARCH64_VALIDATE_LINK_REGISTER +// ret +// +// .Lwith_early_jump_128: +// ... +// ret +// +// These annotations are only required with indirect calls. Private symbols that +// are only the target of direct calls do not require annotations. Also note +// that |AARCH64_VALID_CALL_TARGET| is only valid for indirect calls (BLR), not +// indirect jumps (BR). Indirect jumps in assembly are currently not supported +// and would require a macro for BTI 'j'. +// +// Although not necessary, it is safe to use these macros in 32-bit ARM +// assembly. This may be used to simplify dual 32-bit and 64-bit files. 
+// +// References: +// - "ELF for the Arm® 64-bit Architecture" +// https://github.com/ARM-software/abi-aa/blob/master/aaelf64/aaelf64.rst +// - "Providing protection for complex software" +// https://developer.arm.com/architectures/learn-the-architecture/providing-protection-for-complex-software + +#if defined(__ARM_FEATURE_BTI_DEFAULT) && __ARM_FEATURE_BTI_DEFAULT == 1 +#define GNU_PROPERTY_AARCH64_BTI (1 << 0) // Has Branch Target Identification +#define AARCH64_VALID_CALL_TARGET hint #34 // BTI 'c' +#else +#define GNU_PROPERTY_AARCH64_BTI 0 // No Branch Target Identification +#define AARCH64_VALID_CALL_TARGET +#endif + +#if defined(__ARM_FEATURE_PAC_DEFAULT) && \ + (__ARM_FEATURE_PAC_DEFAULT & 1) == 1 // Signed with A-key +#define GNU_PROPERTY_AARCH64_POINTER_AUTH \ + (1 << 1) // Has Pointer Authentication +#define AARCH64_SIGN_LINK_REGISTER hint #25 // PACIASP +#define AARCH64_VALIDATE_LINK_REGISTER hint #29 // AUTIASP +#elif defined(__ARM_FEATURE_PAC_DEFAULT) && \ + (__ARM_FEATURE_PAC_DEFAULT & 2) == 2 // Signed with B-key +#define GNU_PROPERTY_AARCH64_POINTER_AUTH \ + (1 << 1) // Has Pointer Authentication +#define AARCH64_SIGN_LINK_REGISTER hint #27 // PACIBSP +#define AARCH64_VALIDATE_LINK_REGISTER hint #31 // AUTIBSP +#else +#define GNU_PROPERTY_AARCH64_POINTER_AUTH 0 // No Pointer Authentication +#if GNU_PROPERTY_AARCH64_BTI != 0 +#define AARCH64_SIGN_LINK_REGISTER AARCH64_VALID_CALL_TARGET +#else +#define AARCH64_SIGN_LINK_REGISTER +#endif +#define AARCH64_VALIDATE_LINK_REGISTER +#endif + +#if GNU_PROPERTY_AARCH64_POINTER_AUTH != 0 || GNU_PROPERTY_AARCH64_BTI != 0 +.pushsection .note.gnu.property, "a"; +.balign 8; +.long 4; +.long 0x10; +.long 0x5; +.asciz "GNU"; +.long 0xc0000000; /* GNU_PROPERTY_AARCH64_FEATURE_1_AND */ +.long 4; +.long (GNU_PROPERTY_AARCH64_POINTER_AUTH | GNU_PROPERTY_AARCH64_BTI); +.long 0; +.popsection; +#endif +#endif // ARM || AARCH64 + +#endif // __ASSEMBLER__ + +#endif // OPENSSL_HEADER_ASM_BASE_H diff --git a/ring-0.17.14/include/ring-core/base.h b/ring-0.17.14/include/ring-core/base.h new file mode 100644 index 0000000000..d18a55e278 --- /dev/null +++ b/ring-0.17.14/include/ring-core/base.h @@ -0,0 +1,66 @@ +// Copyright 2001-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef OPENSSL_HEADER_BASE_H +#define OPENSSL_HEADER_BASE_H + + +// This file should be the first included by all BoringSSL headers. + +#if defined(_MSC_VER) && !defined(__clang__) +#pragma warning(push, 3) +#endif + +#include +#include + +#if defined(_MSC_VER) && !defined(__clang__) +#pragma warning(pop) +#endif + +#if defined(__APPLE__) +#include +#endif + +#include // IWYU pragma: export + +#include + +#include + +#if defined(__APPLE__) +// Note |TARGET_OS_MAC| is set for all Apple OS variants. |TARGET_OS_OSX| +// targets macOS specifically. 
+#if defined(TARGET_OS_OSX) && TARGET_OS_OSX +#define OPENSSL_MACOS +#endif +#if defined(TARGET_OS_IPHONE) && TARGET_OS_IPHONE +#define OPENSSL_IOS +#endif +#endif + +// *ring* doesn't support the `BORINGSSL_SHARED_LIBRARY` configuration, so +// the default (usually "hidden") visibility is always used, even for exported +// items. +#define OPENSSL_EXPORT + +// `ring::c` would need to be customized on any platform where these assertions +// fail. Keep in sync with `ring::c`. +OPENSSL_STATIC_ASSERT(sizeof(int32_t) == sizeof(int), "int isn't 32 bits."); +OPENSSL_STATIC_ASSERT(sizeof(uint32_t) == sizeof(unsigned int), "unsigned int isn't 32 bits."); +OPENSSL_STATIC_ASSERT(sizeof(size_t) == sizeof(uintptr_t), "uintptr_t and size_t differ."); +OPENSSL_STATIC_ASSERT(sizeof(size_t) <= sizeof(uint64_t), "size_t is larger than uint64_t."); +OPENSSL_STATIC_ASSERT(sizeof(size_t) >= sizeof(uint32_t), "size_t is smaller than uint32_t."); + +#endif // OPENSSL_HEADER_BASE_H diff --git a/ring-0.17.14/include/ring-core/check.h b/ring-0.17.14/include/ring-core/check.h new file mode 100644 index 0000000000..998289a1d0 --- /dev/null +++ b/ring-0.17.14/include/ring-core/check.h @@ -0,0 +1,52 @@ +// Copyright 2020 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +#ifndef RING_CHECK_H +#define RING_CHECK_H + +// |debug_assert_nonsecret| is like |assert| and should be used (only) when the +// assertion does not have any potential to leak a secret. |NDEBUG| controls this +// exactly like |assert|. It is emulated when there is no assert.h to make +// cross-building easier. +// +// When reviewing uses of |debug_assert_nonsecret|, verify that the check +// really does not have potential to leak a secret. + +#if !defined(RING_CORE_NOSTDLIBINC) +# include <assert.h> +# define debug_assert_nonsecret(x) assert(x) +#else +# if !defined(NDEBUG) +# define debug_assert_nonsecret(x) ((x) ? ((void)0) : __builtin_trap()) +# else +# define debug_assert_nonsecret(x) ((void)0) +# endif +#endif + +// |dev_assert_secret| is like |assert| and should be used (only) when the +// assertion operates on secret data in a way that has the potential to leak +// the secret. |dev_assert_secret| can only be enabled by changing the |#if 0| +// here to |#if 1| (or equivalent) when |NDEBUG| is not defined. This is not +// controlled only through |NDEBUG| so that such checks do not leak into debug +// builds that may make it into production use. +// +// When reviewing uses of |dev_assert_secret|, verify that the check really +// does have the potential to leak a secret. +#if 0 // DO NOT COMMIT CHANGES TO THIS LINE. 
+# define dev_assert_secret debug_assert_nonsecret +#else +# define dev_assert_secret(x) ((void)0) +#endif + +#endif // RING_CHECK_H diff --git a/ring-0.17.14/include/ring-core/mem.h b/ring-0.17.14/include/ring-core/mem.h new file mode 100644 index 0000000000..8f8f32ec84 --- /dev/null +++ b/ring-0.17.14/include/ring-core/mem.h @@ -0,0 +1,27 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef OPENSSL_HEADER_MEM_H +#define OPENSSL_HEADER_MEM_H + +#include + +// CRYPTO_memcmp returns zero iff the |len| bytes at |a| and |b| are equal. It +// takes an amount of time dependent on |len|, but independent of the contents +// of |a| and |b|. Unlike memcmp, it cannot be used to put elements into a +// defined order as the return value when a != b is undefined, other than to be +// non-zero. +OPENSSL_EXPORT int CRYPTO_memcmp(const void *a, const void *b, size_t len); + +#endif // OPENSSL_HEADER_MEM_H diff --git a/ring-0.17.14/include/ring-core/target.h b/ring-0.17.14/include/ring-core/target.h new file mode 100644 index 0000000000..fe40db7722 --- /dev/null +++ b/ring-0.17.14/include/ring-core/target.h @@ -0,0 +1,97 @@ +// Copyright 2023 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef OPENSSL_HEADER_TARGET_H +#define OPENSSL_HEADER_TARGET_H + +// Preprocessor symbols that define the target platform. +// +// This file may be included in C, C++, and assembler and must be compatible +// with each environment. It is separated out only to share code between +// <ring-core/base.h> and <ring-core/asm_base.h>. Prefer to include those headers +// instead. + +#if defined(__x86_64) || defined(_M_AMD64) || defined(_M_X64) +#define OPENSSL_64_BIT +#define OPENSSL_X86_64 +#elif defined(__x86) || defined(__i386) || defined(__i386__) || defined(_M_IX86) +#define OPENSSL_32_BIT +#define OPENSSL_X86 +#elif defined(__AARCH64EL__) || defined(_M_ARM64) +#define OPENSSL_64_BIT +#define OPENSSL_AARCH64 +#elif defined(__ARMEL__) || defined(_M_ARM) +#define OPENSSL_32_BIT +#define OPENSSL_ARM +// All of the following architectures are only supported when `__BYTE_ORDER__` can be used to detect +// endianness (in crypto/internal.h). +#elif !defined(__BYTE_ORDER__) +#error "Cannot determine endianness because __BYTE_ORDER__ is not defined" +// Targets are assumed to be little-endian unless __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__. 
+#elif !(defined(__ORDER_LITTLE_ENDIAN__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)) && \ + !(defined(__ORDER_BIG_ENDIAN__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)) +#error "Unsupported endianness" +#elif defined(__LP64__) +#define OPENSSL_64_BIT +#elif defined(__ILP32__) +#define OPENSSL_32_BIT +// Versions of GCC before 10.0 didn't define `__ILP32__` for all 32-bit targets. +#elif defined(__MIPSEL__) || defined(__MIPSEB__) || defined(__PPC__) || defined(__powerpc__) || defined(__csky__) || defined(__XTENSA__) +#define OPENSSL_32_BIT +#else +#error "Unknown target CPU" +#endif + +#if defined(__APPLE__) +#define OPENSSL_APPLE +#endif + +#if defined(_WIN32) +#define OPENSSL_WINDOWS +#endif + +#if defined(__has_feature) +#if __has_feature(address_sanitizer) +#define OPENSSL_ASAN +#endif +#if __has_feature(thread_sanitizer) +#define OPENSSL_TSAN +#endif +#if __has_feature(memory_sanitizer) +#define OPENSSL_MSAN +#define OPENSSL_ASM_INCOMPATIBLE +#endif +#if __has_feature(hwaddress_sanitizer) +#define OPENSSL_HWASAN +#endif +#endif + +// Disable 32-bit Arm assembly on Apple platforms. The last iOS version that +// supported 32-bit Arm was iOS 10. +#if defined(OPENSSL_APPLE) && defined(OPENSSL_ARM) +#define OPENSSL_ASM_INCOMPATIBLE +#endif + +#if defined(OPENSSL_ASM_INCOMPATIBLE) +#undef OPENSSL_ASM_INCOMPATIBLE +#if !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif +#endif // OPENSSL_ASM_INCOMPATIBLE + +#if !defined(OPENSSL_X86_64) && !defined(OPENSSL_AARCH64) +#define OPENSSL_SMALL +#endif + +#endif // OPENSSL_HEADER_TARGET_H diff --git a/ring-0.17.14/include/ring-core/type_check.h b/ring-0.17.14/include/ring-core/type_check.h new file mode 100644 index 0000000000..532fc8e8ff --- /dev/null +++ b/ring-0.17.14/include/ring-core/type_check.h @@ -0,0 +1,32 @@ +// Copyright 1999-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef OPENSSL_HEADER_TYPE_CHECK_H +#define OPENSSL_HEADER_TYPE_CHECK_H + +#include + + +#if defined(__cplusplus) || (defined(_MSC_VER) && !defined(__clang__)) +// In C++ and non-clang MSVC, |static_assert| is a keyword. +#define OPENSSL_STATIC_ASSERT(cond, msg) static_assert(cond, msg) +#else +// C11 defines the |_Static_assert| keyword and the |static_assert| macro in +// assert.h. While the former is available at all versions in Clang and GCC, the +// latter depends on libc and, in glibc, depends on being built in C11 mode. We +// do not require this, for now, so use |_Static_assert| directly. +#define OPENSSL_STATIC_ASSERT(cond, msg) _Static_assert(cond, msg) +#endif + +#endif // OPENSSL_HEADER_TYPE_CHECK_H diff --git a/ring-0.17.14/pregenerated/aes-gcm-avx2-x86_64-elf.S b/ring-0.17.14/pregenerated/aes-gcm-avx2-x86_64-elf.S new file mode 100644 index 0000000000..597264bcca --- /dev/null +++ b/ring-0.17.14/pregenerated/aes-gcm-avx2-x86_64-elf.S @@ -0,0 +1,1172 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__) +.section .rodata +.align 16 + + +.Lbswap_mask: +.quad 0x08090a0b0c0d0e0f, 0x0001020304050607 + + + + + + + + +.Lgfpoly: +.quad 1, 0xc200000000000000 + + +.Lgfpoly_and_internal_carrybit: +.quad 1, 0xc200000000000001 + +.align 32 + +.Lctr_pattern: +.quad 0, 0 +.quad 1, 0 +.Linc_2blocks: +.quad 2, 0 +.quad 2, 0 + +.text +.globl gcm_init_vpclmulqdq_avx2 +.hidden gcm_init_vpclmulqdq_avx2 +.type gcm_init_vpclmulqdq_avx2,@function +.align 32 +gcm_init_vpclmulqdq_avx2: +.cfi_startproc + +_CET_ENDBR + + + + + + vpshufd $0x4e,(%rsi),%xmm3 + + + + + + vpshufd $0xd3,%xmm3,%xmm0 + vpsrad $31,%xmm0,%xmm0 + vpaddq %xmm3,%xmm3,%xmm3 + vpand .Lgfpoly_and_internal_carrybit(%rip),%xmm0,%xmm0 + vpxor %xmm0,%xmm3,%xmm3 + + vbroadcasti128 .Lgfpoly(%rip),%ymm6 + + + vpclmulqdq $0x00,%xmm3,%xmm3,%xmm0 + vpclmulqdq $0x01,%xmm3,%xmm3,%xmm1 + vpclmulqdq $0x10,%xmm3,%xmm3,%xmm2 + vpxor %xmm2,%xmm1,%xmm1 + vpclmulqdq $0x01,%xmm0,%xmm6,%xmm2 + vpshufd $0x4e,%xmm0,%xmm0 + vpxor %xmm0,%xmm1,%xmm1 + vpxor %xmm2,%xmm1,%xmm1 + vpclmulqdq $0x11,%xmm3,%xmm3,%xmm5 + vpclmulqdq $0x01,%xmm1,%xmm6,%xmm0 + vpshufd $0x4e,%xmm1,%xmm1 + vpxor %xmm1,%xmm5,%xmm5 + vpxor %xmm0,%xmm5,%xmm5 + + + + vinserti128 $1,%xmm3,%ymm5,%ymm3 + vinserti128 $1,%xmm5,%ymm5,%ymm5 + + +.byte 0xc4,0xe3,0x65,0x44,0xc5,0x00 +.byte 0xc4,0xe3,0x65,0x44,0xcd,0x01 +.byte 0xc4,0xe3,0x65,0x44,0xd5,0x10 + vpxor %ymm2,%ymm1,%ymm1 +.byte 0xc4,0xe3,0x4d,0x44,0xd0,0x01 + vpshufd $0x4e,%ymm0,%ymm0 + vpxor %ymm0,%ymm1,%ymm1 + vpxor %ymm2,%ymm1,%ymm1 +.byte 0xc4,0xe3,0x65,0x44,0xe5,0x11 +.byte 0xc4,0xe3,0x4d,0x44,0xc1,0x01 + vpshufd $0x4e,%ymm1,%ymm1 + vpxor %ymm1,%ymm4,%ymm4 + vpxor %ymm0,%ymm4,%ymm4 + + + + vmovdqu %ymm3,96(%rdi) + vmovdqu %ymm4,64(%rdi) + + + + vpunpcklqdq %ymm3,%ymm4,%ymm0 + vpunpckhqdq %ymm3,%ymm4,%ymm1 + vpxor %ymm1,%ymm0,%ymm0 + vmovdqu %ymm0,128+32(%rdi) + + +.byte 0xc4,0xe3,0x5d,0x44,0xc5,0x00 +.byte 0xc4,0xe3,0x5d,0x44,0xcd,0x01 +.byte 0xc4,0xe3,0x5d,0x44,0xd5,0x10 + vpxor %ymm2,%ymm1,%ymm1 +.byte 0xc4,0xe3,0x4d,0x44,0xd0,0x01 + vpshufd $0x4e,%ymm0,%ymm0 + vpxor %ymm0,%ymm1,%ymm1 + vpxor %ymm2,%ymm1,%ymm1 +.byte 0xc4,0xe3,0x5d,0x44,0xdd,0x11 +.byte 0xc4,0xe3,0x4d,0x44,0xc1,0x01 + vpshufd $0x4e,%ymm1,%ymm1 + vpxor %ymm1,%ymm3,%ymm3 + vpxor %ymm0,%ymm3,%ymm3 + +.byte 0xc4,0xe3,0x65,0x44,0xc5,0x00 +.byte 0xc4,0xe3,0x65,0x44,0xcd,0x01 +.byte 0xc4,0xe3,0x65,0x44,0xd5,0x10 + vpxor %ymm2,%ymm1,%ymm1 +.byte 0xc4,0xe3,0x4d,0x44,0xd0,0x01 + vpshufd $0x4e,%ymm0,%ymm0 + vpxor %ymm0,%ymm1,%ymm1 + vpxor %ymm2,%ymm1,%ymm1 +.byte 0xc4,0xe3,0x65,0x44,0xe5,0x11 +.byte 0xc4,0xe3,0x4d,0x44,0xc1,0x01 + vpshufd $0x4e,%ymm1,%ymm1 + vpxor %ymm1,%ymm4,%ymm4 + vpxor %ymm0,%ymm4,%ymm4 + + vmovdqu %ymm3,32(%rdi) + vmovdqu %ymm4,0(%rdi) + + + + vpunpcklqdq %ymm3,%ymm4,%ymm0 + vpunpckhqdq %ymm3,%ymm4,%ymm1 + vpxor %ymm1,%ymm0,%ymm0 + vmovdqu %ymm0,128(%rdi) + + vzeroupper + ret + +.cfi_endproc +.size gcm_init_vpclmulqdq_avx2, . 
- gcm_init_vpclmulqdq_avx2 +.globl gcm_ghash_vpclmulqdq_avx2_1 +.hidden gcm_ghash_vpclmulqdq_avx2_1 +.type gcm_ghash_vpclmulqdq_avx2_1,@function +.align 32 +gcm_ghash_vpclmulqdq_avx2_1: +.cfi_startproc + +_CET_ENDBR + + + + + + + vmovdqu .Lbswap_mask(%rip),%xmm6 + vmovdqu .Lgfpoly(%rip),%xmm7 + + + vmovdqu (%rdi),%xmm5 + vpshufb %xmm6,%xmm5,%xmm5 + + + +.Lghash_lastblock: + vmovdqu (%rdx),%xmm0 + vpshufb %xmm6,%xmm0,%xmm0 + vpxor %xmm0,%xmm5,%xmm5 + vmovdqu 128-16(%rsi),%xmm0 + vpclmulqdq $0x00,%xmm0,%xmm5,%xmm1 + vpclmulqdq $0x01,%xmm0,%xmm5,%xmm2 + vpclmulqdq $0x10,%xmm0,%xmm5,%xmm3 + vpxor %xmm3,%xmm2,%xmm2 + vpclmulqdq $0x01,%xmm1,%xmm7,%xmm3 + vpshufd $0x4e,%xmm1,%xmm1 + vpxor %xmm1,%xmm2,%xmm2 + vpxor %xmm3,%xmm2,%xmm2 + vpclmulqdq $0x11,%xmm0,%xmm5,%xmm5 + vpclmulqdq $0x01,%xmm2,%xmm7,%xmm1 + vpshufd $0x4e,%xmm2,%xmm2 + vpxor %xmm2,%xmm5,%xmm5 + vpxor %xmm1,%xmm5,%xmm5 + + +.Lghash_done: + + vpshufb %xmm6,%xmm5,%xmm5 + vmovdqu %xmm5,(%rdi) + + vzeroupper + ret + +.cfi_endproc +.size gcm_ghash_vpclmulqdq_avx2_1, . - gcm_ghash_vpclmulqdq_avx2_1 +.globl aes_gcm_enc_update_vaes_avx2 +.hidden aes_gcm_enc_update_vaes_avx2 +.type aes_gcm_enc_update_vaes_avx2,@function +.align 32 +aes_gcm_enc_update_vaes_avx2: +.cfi_startproc + +_CET_ENDBR + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-16 + + movq 16(%rsp),%r12 +#ifdef BORINGSSL_DISPATCH_TEST +.extern BORINGSSL_function_hit +.hidden BORINGSSL_function_hit + movb $1,BORINGSSL_function_hit+8(%rip) +#endif + vbroadcasti128 .Lbswap_mask(%rip),%ymm0 + + + + vmovdqu (%r12),%xmm1 + vpshufb %xmm0,%xmm1,%xmm1 + vbroadcasti128 (%r8),%ymm11 + vpshufb %ymm0,%ymm11,%ymm11 + + + + movl 240(%rcx),%r10d + leal -20(,%r10,4),%r10d + + + + + leaq 96(%rcx,%r10,4),%r11 + vbroadcasti128 (%rcx),%ymm9 + vbroadcasti128 (%r11),%ymm10 + + + vpaddd .Lctr_pattern(%rip),%ymm11,%ymm11 + + + + cmpq $127,%rdx + jbe .Lcrypt_loop_4x_done__func1 + + vmovdqu 128(%r9),%ymm7 + vmovdqu 128+32(%r9),%ymm8 + + + + vmovdqu .Linc_2blocks(%rip),%ymm2 + vpshufb %ymm0,%ymm11,%ymm12 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm13 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm14 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm15 + vpaddd %ymm2,%ymm11,%ymm11 + + + vpxor %ymm9,%ymm12,%ymm12 + vpxor %ymm9,%ymm13,%ymm13 + vpxor %ymm9,%ymm14,%ymm14 + vpxor %ymm9,%ymm15,%ymm15 + + leaq 16(%rcx),%rax +.Lvaesenc_loop_first_4_vecs__func1: + vbroadcasti128 (%rax),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + + addq $16,%rax + cmpq %rax,%r11 + jne .Lvaesenc_loop_first_4_vecs__func1 + vpxor 0(%rdi),%ymm10,%ymm2 + vpxor 32(%rdi),%ymm10,%ymm3 + vpxor 64(%rdi),%ymm10,%ymm5 + vpxor 96(%rdi),%ymm10,%ymm6 +.byte 0xc4,0x62,0x1d,0xdd,0xe2 +.byte 0xc4,0x62,0x15,0xdd,0xeb +.byte 0xc4,0x62,0x0d,0xdd,0xf5 +.byte 0xc4,0x62,0x05,0xdd,0xfe + vmovdqu %ymm12,0(%rsi) + vmovdqu %ymm13,32(%rsi) + vmovdqu %ymm14,64(%rsi) + vmovdqu %ymm15,96(%rsi) + + subq $-128,%rdi + addq $-128,%rdx + cmpq $127,%rdx + jbe .Lghash_last_ciphertext_4x__func1 +.align 16 +.Lcrypt_loop_4x__func1: + + + + + vmovdqu .Linc_2blocks(%rip),%ymm2 + vpshufb %ymm0,%ymm11,%ymm12 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm13 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm14 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm15 + vpaddd %ymm2,%ymm11,%ymm11 + + + vpxor %ymm9,%ymm12,%ymm12 + vpxor %ymm9,%ymm13,%ymm13 + vpxor %ymm9,%ymm14,%ymm14 + vpxor %ymm9,%ymm15,%ymm15 + + cmpl $24,%r10d + jl 
.Laes128__func1 + je .Laes192__func1 + + vbroadcasti128 -208(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + + vbroadcasti128 -192(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + +.Laes192__func1: + vbroadcasti128 -176(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + + vbroadcasti128 -160(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + +.Laes128__func1: + prefetcht0 512(%rdi) + prefetcht0 512+64(%rdi) + + vmovdqu 0(%rsi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 0(%r9),%ymm4 + vpxor %ymm1,%ymm3,%ymm3 +.byte 0xc4,0xe3,0x65,0x44,0xec,0x00 +.byte 0xc4,0xe3,0x65,0x44,0xcc,0x11 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 +.byte 0xc4,0xe3,0x6d,0x44,0xf7,0x00 + + vbroadcasti128 -144(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + + + vbroadcasti128 -128(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + + + vmovdqu 32(%rsi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 32(%r9),%ymm4 +.byte 0xc4,0xe3,0x65,0x44,0xd4,0x00 + vpxor %ymm2,%ymm5,%ymm5 +.byte 0xc4,0xe3,0x65,0x44,0xd4,0x11 + vpxor %ymm2,%ymm1,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 +.byte 0xc4,0xe3,0x6d,0x44,0xd7,0x10 + vpxor %ymm2,%ymm6,%ymm6 + + vbroadcasti128 -112(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + + + vmovdqu 64(%rsi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 64(%r9),%ymm4 + + vbroadcasti128 -96(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + +.byte 0xc4,0xe3,0x65,0x44,0xd4,0x00 + vpxor %ymm2,%ymm5,%ymm5 +.byte 0xc4,0xe3,0x65,0x44,0xd4,0x11 + vpxor %ymm2,%ymm1,%ymm1 + + vbroadcasti128 -80(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 +.byte 0xc4,0xc3,0x6d,0x44,0xd0,0x00 + vpxor %ymm2,%ymm6,%ymm6 + + + vmovdqu 96(%rsi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + + vbroadcasti128 -64(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + + vmovdqu 96(%r9),%ymm4 +.byte 0xc4,0xe3,0x65,0x44,0xd4,0x00 + vpxor %ymm2,%ymm5,%ymm5 +.byte 0xc4,0xe3,0x65,0x44,0xd4,0x11 + vpxor %ymm2,%ymm1,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 +.byte 0xc4,0xc3,0x6d,0x44,0xd0,0x10 + vpxor %ymm2,%ymm6,%ymm6 + + vbroadcasti128 -48(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + + + vpxor %ymm5,%ymm6,%ymm6 + vpxor %ymm1,%ymm6,%ymm6 + + + vbroadcasti128 .Lgfpoly(%rip),%ymm4 +.byte 0xc4,0xe3,0x5d,0x44,0xd5,0x01 + vpshufd $0x4e,%ymm5,%ymm5 + vpxor %ymm5,%ymm6,%ymm6 + vpxor %ymm2,%ymm6,%ymm6 + + vbroadcasti128 -32(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 
0xc4,0x62,0x05,0xdc,0xfa + + +.byte 0xc4,0xe3,0x5d,0x44,0xd6,0x01 + vpshufd $0x4e,%ymm6,%ymm6 + vpxor %ymm6,%ymm1,%ymm1 + vpxor %ymm2,%ymm1,%ymm1 + + vbroadcasti128 -16(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + + vextracti128 $1,%ymm1,%xmm2 + vpxor %xmm2,%xmm1,%xmm1 + + + subq $-128,%rsi + vpxor 0(%rdi),%ymm10,%ymm2 + vpxor 32(%rdi),%ymm10,%ymm3 + vpxor 64(%rdi),%ymm10,%ymm5 + vpxor 96(%rdi),%ymm10,%ymm6 +.byte 0xc4,0x62,0x1d,0xdd,0xe2 +.byte 0xc4,0x62,0x15,0xdd,0xeb +.byte 0xc4,0x62,0x0d,0xdd,0xf5 +.byte 0xc4,0x62,0x05,0xdd,0xfe + vmovdqu %ymm12,0(%rsi) + vmovdqu %ymm13,32(%rsi) + vmovdqu %ymm14,64(%rsi) + vmovdqu %ymm15,96(%rsi) + + subq $-128,%rdi + + addq $-128,%rdx + cmpq $127,%rdx + ja .Lcrypt_loop_4x__func1 +.Lghash_last_ciphertext_4x__func1: + + vmovdqu 0(%rsi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 0(%r9),%ymm4 + vpxor %ymm1,%ymm3,%ymm3 +.byte 0xc4,0xe3,0x65,0x44,0xec,0x00 +.byte 0xc4,0xe3,0x65,0x44,0xcc,0x11 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 +.byte 0xc4,0xe3,0x6d,0x44,0xf7,0x00 + + vmovdqu 32(%rsi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 32(%r9),%ymm4 +.byte 0xc4,0xe3,0x65,0x44,0xd4,0x00 + vpxor %ymm2,%ymm5,%ymm5 +.byte 0xc4,0xe3,0x65,0x44,0xd4,0x11 + vpxor %ymm2,%ymm1,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 +.byte 0xc4,0xe3,0x6d,0x44,0xd7,0x10 + vpxor %ymm2,%ymm6,%ymm6 + + vmovdqu 64(%rsi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 64(%r9),%ymm4 +.byte 0xc4,0xe3,0x65,0x44,0xd4,0x00 + vpxor %ymm2,%ymm5,%ymm5 +.byte 0xc4,0xe3,0x65,0x44,0xd4,0x11 + vpxor %ymm2,%ymm1,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 +.byte 0xc4,0xc3,0x6d,0x44,0xd0,0x00 + vpxor %ymm2,%ymm6,%ymm6 + + + vmovdqu 96(%rsi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 96(%r9),%ymm4 +.byte 0xc4,0xe3,0x65,0x44,0xd4,0x00 + vpxor %ymm2,%ymm5,%ymm5 +.byte 0xc4,0xe3,0x65,0x44,0xd4,0x11 + vpxor %ymm2,%ymm1,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 +.byte 0xc4,0xc3,0x6d,0x44,0xd0,0x10 + vpxor %ymm2,%ymm6,%ymm6 + + vpxor %ymm5,%ymm6,%ymm6 + vpxor %ymm1,%ymm6,%ymm6 + + + vbroadcasti128 .Lgfpoly(%rip),%ymm4 +.byte 0xc4,0xe3,0x5d,0x44,0xd5,0x01 + vpshufd $0x4e,%ymm5,%ymm5 + vpxor %ymm5,%ymm6,%ymm6 + vpxor %ymm2,%ymm6,%ymm6 + +.byte 0xc4,0xe3,0x5d,0x44,0xd6,0x01 + vpshufd $0x4e,%ymm6,%ymm6 + vpxor %ymm6,%ymm1,%ymm1 + vpxor %ymm2,%ymm1,%ymm1 + vextracti128 $1,%ymm1,%xmm2 + vpxor %xmm2,%xmm1,%xmm1 + + subq $-128,%rsi +.Lcrypt_loop_4x_done__func1: + + testq %rdx,%rdx + jz .Ldone__func1 + + + + + + leaq 128(%r9),%r8 + subq %rdx,%r8 + + + vpxor %xmm5,%xmm5,%xmm5 + vpxor %xmm6,%xmm6,%xmm6 + vpxor %xmm7,%xmm7,%xmm7 + + cmpq $64,%rdx + jb .Llessthan64bytes__func1 + + + vpshufb %ymm0,%ymm11,%ymm12 + vpaddd .Linc_2blocks(%rip),%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm13 + vpaddd .Linc_2blocks(%rip),%ymm11,%ymm11 + vpxor %ymm9,%ymm12,%ymm12 + vpxor %ymm9,%ymm13,%ymm13 + leaq 16(%rcx),%rax +.Lvaesenc_loop_tail_1__func1: + vbroadcasti128 (%rax),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea + addq $16,%rax + cmpq %rax,%r11 + jne .Lvaesenc_loop_tail_1__func1 +.byte 0xc4,0x42,0x1d,0xdd,0xe2 +.byte 0xc4,0x42,0x15,0xdd,0xea + + + vmovdqu 0(%rdi),%ymm2 + vmovdqu 32(%rdi),%ymm3 + vpxor %ymm2,%ymm12,%ymm12 + vpxor %ymm3,%ymm13,%ymm13 + vmovdqu %ymm12,0(%rsi) + vmovdqu %ymm13,32(%rsi) + + + vpshufb %ymm0,%ymm12,%ymm12 + vpshufb %ymm0,%ymm13,%ymm13 + vpxor %ymm1,%ymm12,%ymm12 + vmovdqu (%r8),%ymm2 + vmovdqu 32(%r8),%ymm3 +.byte 
0xc4,0xe3,0x1d,0x44,0xea,0x00 +.byte 0xc4,0xe3,0x1d,0x44,0xf2,0x01 +.byte 0xc4,0xe3,0x1d,0x44,0xe2,0x10 + vpxor %ymm4,%ymm6,%ymm6 +.byte 0xc4,0xe3,0x1d,0x44,0xfa,0x11 +.byte 0xc4,0xe3,0x15,0x44,0xe3,0x00 + vpxor %ymm4,%ymm5,%ymm5 +.byte 0xc4,0xe3,0x15,0x44,0xe3,0x01 + vpxor %ymm4,%ymm6,%ymm6 +.byte 0xc4,0xe3,0x15,0x44,0xe3,0x10 + vpxor %ymm4,%ymm6,%ymm6 +.byte 0xc4,0xe3,0x15,0x44,0xe3,0x11 + vpxor %ymm4,%ymm7,%ymm7 + + addq $64,%r8 + addq $64,%rdi + addq $64,%rsi + subq $64,%rdx + jz .Lreduce__func1 + + vpxor %xmm1,%xmm1,%xmm1 + + +.Llessthan64bytes__func1: + vpshufb %ymm0,%ymm11,%ymm12 + vpaddd .Linc_2blocks(%rip),%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm13 + vpxor %ymm9,%ymm12,%ymm12 + vpxor %ymm9,%ymm13,%ymm13 + leaq 16(%rcx),%rax +.Lvaesenc_loop_tail_2__func1: + vbroadcasti128 (%rax),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea + addq $16,%rax + cmpq %rax,%r11 + jne .Lvaesenc_loop_tail_2__func1 +.byte 0xc4,0x42,0x1d,0xdd,0xe2 +.byte 0xc4,0x42,0x15,0xdd,0xea + + + + + cmpq $32,%rdx + jb .Lxor_one_block__func1 + je .Lxor_two_blocks__func1 + +.Lxor_three_blocks__func1: + vmovdqu 0(%rdi),%ymm2 + vmovdqu 32(%rdi),%xmm3 + vpxor %ymm2,%ymm12,%ymm12 + vpxor %xmm3,%xmm13,%xmm13 + vmovdqu %ymm12,0(%rsi) + vmovdqu %xmm13,32(%rsi) + + vpshufb %ymm0,%ymm12,%ymm12 + vpshufb %xmm0,%xmm13,%xmm13 + vpxor %ymm1,%ymm12,%ymm12 + vmovdqu (%r8),%ymm2 + vmovdqu 32(%r8),%xmm3 + vpclmulqdq $0x00,%xmm3,%xmm13,%xmm4 + vpxor %ymm4,%ymm5,%ymm5 + vpclmulqdq $0x01,%xmm3,%xmm13,%xmm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x10,%xmm3,%xmm13,%xmm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x11,%xmm3,%xmm13,%xmm4 + vpxor %ymm4,%ymm7,%ymm7 + jmp .Lghash_mul_one_vec_unreduced__func1 + +.Lxor_two_blocks__func1: + vmovdqu (%rdi),%ymm2 + vpxor %ymm2,%ymm12,%ymm12 + vmovdqu %ymm12,(%rsi) + vpshufb %ymm0,%ymm12,%ymm12 + vpxor %ymm1,%ymm12,%ymm12 + vmovdqu (%r8),%ymm2 + jmp .Lghash_mul_one_vec_unreduced__func1 + +.Lxor_one_block__func1: + vmovdqu (%rdi),%xmm2 + vpxor %xmm2,%xmm12,%xmm12 + vmovdqu %xmm12,(%rsi) + vpshufb %xmm0,%xmm12,%xmm12 + vpxor %xmm1,%xmm12,%xmm12 + vmovdqu (%r8),%xmm2 + +.Lghash_mul_one_vec_unreduced__func1: +.byte 0xc4,0xe3,0x1d,0x44,0xe2,0x00 + vpxor %ymm4,%ymm5,%ymm5 +.byte 0xc4,0xe3,0x1d,0x44,0xe2,0x01 + vpxor %ymm4,%ymm6,%ymm6 +.byte 0xc4,0xe3,0x1d,0x44,0xe2,0x10 + vpxor %ymm4,%ymm6,%ymm6 +.byte 0xc4,0xe3,0x1d,0x44,0xe2,0x11 + vpxor %ymm4,%ymm7,%ymm7 + +.Lreduce__func1: + + vbroadcasti128 .Lgfpoly(%rip),%ymm2 +.byte 0xc4,0xe3,0x6d,0x44,0xdd,0x01 + vpshufd $0x4e,%ymm5,%ymm5 + vpxor %ymm5,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 +.byte 0xc4,0xe3,0x6d,0x44,0xde,0x01 + vpshufd $0x4e,%ymm6,%ymm6 + vpxor %ymm6,%ymm7,%ymm7 + vpxor %ymm3,%ymm7,%ymm7 + vextracti128 $1,%ymm7,%xmm1 + vpxor %xmm7,%xmm1,%xmm1 + +.Ldone__func1: + + vpshufb %xmm0,%xmm1,%xmm1 + vmovdqu %xmm1,(%r12) + + vzeroupper + popq %r12 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r12 + ret + +.cfi_endproc +.size aes_gcm_enc_update_vaes_avx2, . 
- aes_gcm_enc_update_vaes_avx2 +.globl aes_gcm_dec_update_vaes_avx2 +.hidden aes_gcm_dec_update_vaes_avx2 +.type aes_gcm_dec_update_vaes_avx2,@function +.align 32 +aes_gcm_dec_update_vaes_avx2: +.cfi_startproc + +_CET_ENDBR + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-16 + + movq 16(%rsp),%r12 + vbroadcasti128 .Lbswap_mask(%rip),%ymm0 + + + + vmovdqu (%r12),%xmm1 + vpshufb %xmm0,%xmm1,%xmm1 + vbroadcasti128 (%r8),%ymm11 + vpshufb %ymm0,%ymm11,%ymm11 + + + + movl 240(%rcx),%r10d + leal -20(,%r10,4),%r10d + + + + + leaq 96(%rcx,%r10,4),%r11 + vbroadcasti128 (%rcx),%ymm9 + vbroadcasti128 (%r11),%ymm10 + + + vpaddd .Lctr_pattern(%rip),%ymm11,%ymm11 + + + + cmpq $127,%rdx + jbe .Lcrypt_loop_4x_done__func2 + + vmovdqu 128(%r9),%ymm7 + vmovdqu 128+32(%r9),%ymm8 +.align 16 +.Lcrypt_loop_4x__func2: + + + + + vmovdqu .Linc_2blocks(%rip),%ymm2 + vpshufb %ymm0,%ymm11,%ymm12 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm13 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm14 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm15 + vpaddd %ymm2,%ymm11,%ymm11 + + + vpxor %ymm9,%ymm12,%ymm12 + vpxor %ymm9,%ymm13,%ymm13 + vpxor %ymm9,%ymm14,%ymm14 + vpxor %ymm9,%ymm15,%ymm15 + + cmpl $24,%r10d + jl .Laes128__func2 + je .Laes192__func2 + + vbroadcasti128 -208(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + + vbroadcasti128 -192(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + +.Laes192__func2: + vbroadcasti128 -176(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + + vbroadcasti128 -160(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + +.Laes128__func2: + prefetcht0 512(%rdi) + prefetcht0 512+64(%rdi) + + vmovdqu 0(%rdi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 0(%r9),%ymm4 + vpxor %ymm1,%ymm3,%ymm3 +.byte 0xc4,0xe3,0x65,0x44,0xec,0x00 +.byte 0xc4,0xe3,0x65,0x44,0xcc,0x11 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 +.byte 0xc4,0xe3,0x6d,0x44,0xf7,0x00 + + vbroadcasti128 -144(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + + + vbroadcasti128 -128(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + + + vmovdqu 32(%rdi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 32(%r9),%ymm4 +.byte 0xc4,0xe3,0x65,0x44,0xd4,0x00 + vpxor %ymm2,%ymm5,%ymm5 +.byte 0xc4,0xe3,0x65,0x44,0xd4,0x11 + vpxor %ymm2,%ymm1,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 +.byte 0xc4,0xe3,0x6d,0x44,0xd7,0x10 + vpxor %ymm2,%ymm6,%ymm6 + + vbroadcasti128 -112(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + + + vmovdqu 64(%rdi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 64(%r9),%ymm4 + + vbroadcasti128 -96(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + +.byte 0xc4,0xe3,0x65,0x44,0xd4,0x00 + vpxor %ymm2,%ymm5,%ymm5 +.byte 0xc4,0xe3,0x65,0x44,0xd4,0x11 + vpxor %ymm2,%ymm1,%ymm1 + + vbroadcasti128 -80(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 
0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 +.byte 0xc4,0xc3,0x6d,0x44,0xd0,0x00 + vpxor %ymm2,%ymm6,%ymm6 + + + vmovdqu 96(%rdi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + + vbroadcasti128 -64(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + + vmovdqu 96(%r9),%ymm4 +.byte 0xc4,0xe3,0x65,0x44,0xd4,0x00 + vpxor %ymm2,%ymm5,%ymm5 +.byte 0xc4,0xe3,0x65,0x44,0xd4,0x11 + vpxor %ymm2,%ymm1,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 +.byte 0xc4,0xc3,0x6d,0x44,0xd0,0x10 + vpxor %ymm2,%ymm6,%ymm6 + + vbroadcasti128 -48(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + + + vpxor %ymm5,%ymm6,%ymm6 + vpxor %ymm1,%ymm6,%ymm6 + + + vbroadcasti128 .Lgfpoly(%rip),%ymm4 +.byte 0xc4,0xe3,0x5d,0x44,0xd5,0x01 + vpshufd $0x4e,%ymm5,%ymm5 + vpxor %ymm5,%ymm6,%ymm6 + vpxor %ymm2,%ymm6,%ymm6 + + vbroadcasti128 -32(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + + +.byte 0xc4,0xe3,0x5d,0x44,0xd6,0x01 + vpshufd $0x4e,%ymm6,%ymm6 + vpxor %ymm6,%ymm1,%ymm1 + vpxor %ymm2,%ymm1,%ymm1 + + vbroadcasti128 -16(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + + vextracti128 $1,%ymm1,%xmm2 + vpxor %xmm2,%xmm1,%xmm1 + + + + vpxor 0(%rdi),%ymm10,%ymm2 + vpxor 32(%rdi),%ymm10,%ymm3 + vpxor 64(%rdi),%ymm10,%ymm5 + vpxor 96(%rdi),%ymm10,%ymm6 +.byte 0xc4,0x62,0x1d,0xdd,0xe2 +.byte 0xc4,0x62,0x15,0xdd,0xeb +.byte 0xc4,0x62,0x0d,0xdd,0xf5 +.byte 0xc4,0x62,0x05,0xdd,0xfe + vmovdqu %ymm12,0(%rsi) + vmovdqu %ymm13,32(%rsi) + vmovdqu %ymm14,64(%rsi) + vmovdqu %ymm15,96(%rsi) + + subq $-128,%rdi + subq $-128,%rsi + addq $-128,%rdx + cmpq $127,%rdx + ja .Lcrypt_loop_4x__func2 +.Lcrypt_loop_4x_done__func2: + + testq %rdx,%rdx + jz .Ldone__func2 + + + + + + leaq 128(%r9),%r8 + subq %rdx,%r8 + + + vpxor %xmm5,%xmm5,%xmm5 + vpxor %xmm6,%xmm6,%xmm6 + vpxor %xmm7,%xmm7,%xmm7 + + cmpq $64,%rdx + jb .Llessthan64bytes__func2 + + + vpshufb %ymm0,%ymm11,%ymm12 + vpaddd .Linc_2blocks(%rip),%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm13 + vpaddd .Linc_2blocks(%rip),%ymm11,%ymm11 + vpxor %ymm9,%ymm12,%ymm12 + vpxor %ymm9,%ymm13,%ymm13 + leaq 16(%rcx),%rax +.Lvaesenc_loop_tail_1__func2: + vbroadcasti128 (%rax),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea + addq $16,%rax + cmpq %rax,%r11 + jne .Lvaesenc_loop_tail_1__func2 +.byte 0xc4,0x42,0x1d,0xdd,0xe2 +.byte 0xc4,0x42,0x15,0xdd,0xea + + + vmovdqu 0(%rdi),%ymm2 + vmovdqu 32(%rdi),%ymm3 + vpxor %ymm2,%ymm12,%ymm12 + vpxor %ymm3,%ymm13,%ymm13 + vmovdqu %ymm12,0(%rsi) + vmovdqu %ymm13,32(%rsi) + + + vpshufb %ymm0,%ymm2,%ymm12 + vpshufb %ymm0,%ymm3,%ymm13 + vpxor %ymm1,%ymm12,%ymm12 + vmovdqu (%r8),%ymm2 + vmovdqu 32(%r8),%ymm3 +.byte 0xc4,0xe3,0x1d,0x44,0xea,0x00 +.byte 0xc4,0xe3,0x1d,0x44,0xf2,0x01 +.byte 0xc4,0xe3,0x1d,0x44,0xe2,0x10 + vpxor %ymm4,%ymm6,%ymm6 +.byte 0xc4,0xe3,0x1d,0x44,0xfa,0x11 +.byte 0xc4,0xe3,0x15,0x44,0xe3,0x00 + vpxor %ymm4,%ymm5,%ymm5 +.byte 0xc4,0xe3,0x15,0x44,0xe3,0x01 + vpxor %ymm4,%ymm6,%ymm6 +.byte 0xc4,0xe3,0x15,0x44,0xe3,0x10 + vpxor %ymm4,%ymm6,%ymm6 +.byte 0xc4,0xe3,0x15,0x44,0xe3,0x11 + vpxor %ymm4,%ymm7,%ymm7 + + addq $64,%r8 + addq $64,%rdi + addq $64,%rsi + subq 
$64,%rdx + jz .Lreduce__func2 + + vpxor %xmm1,%xmm1,%xmm1 + + +.Llessthan64bytes__func2: + vpshufb %ymm0,%ymm11,%ymm12 + vpaddd .Linc_2blocks(%rip),%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm13 + vpxor %ymm9,%ymm12,%ymm12 + vpxor %ymm9,%ymm13,%ymm13 + leaq 16(%rcx),%rax +.Lvaesenc_loop_tail_2__func2: + vbroadcasti128 (%rax),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea + addq $16,%rax + cmpq %rax,%r11 + jne .Lvaesenc_loop_tail_2__func2 +.byte 0xc4,0x42,0x1d,0xdd,0xe2 +.byte 0xc4,0x42,0x15,0xdd,0xea + + + + + cmpq $32,%rdx + jb .Lxor_one_block__func2 + je .Lxor_two_blocks__func2 + +.Lxor_three_blocks__func2: + vmovdqu 0(%rdi),%ymm2 + vmovdqu 32(%rdi),%xmm3 + vpxor %ymm2,%ymm12,%ymm12 + vpxor %xmm3,%xmm13,%xmm13 + vmovdqu %ymm12,0(%rsi) + vmovdqu %xmm13,32(%rsi) + + vpshufb %ymm0,%ymm2,%ymm12 + vpshufb %xmm0,%xmm3,%xmm13 + vpxor %ymm1,%ymm12,%ymm12 + vmovdqu (%r8),%ymm2 + vmovdqu 32(%r8),%xmm3 + vpclmulqdq $0x00,%xmm3,%xmm13,%xmm4 + vpxor %ymm4,%ymm5,%ymm5 + vpclmulqdq $0x01,%xmm3,%xmm13,%xmm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x10,%xmm3,%xmm13,%xmm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x11,%xmm3,%xmm13,%xmm4 + vpxor %ymm4,%ymm7,%ymm7 + jmp .Lghash_mul_one_vec_unreduced__func2 + +.Lxor_two_blocks__func2: + vmovdqu (%rdi),%ymm2 + vpxor %ymm2,%ymm12,%ymm12 + vmovdqu %ymm12,(%rsi) + vpshufb %ymm0,%ymm2,%ymm12 + vpxor %ymm1,%ymm12,%ymm12 + vmovdqu (%r8),%ymm2 + jmp .Lghash_mul_one_vec_unreduced__func2 + +.Lxor_one_block__func2: + vmovdqu (%rdi),%xmm2 + vpxor %xmm2,%xmm12,%xmm12 + vmovdqu %xmm12,(%rsi) + vpshufb %xmm0,%xmm2,%xmm12 + vpxor %xmm1,%xmm12,%xmm12 + vmovdqu (%r8),%xmm2 + +.Lghash_mul_one_vec_unreduced__func2: +.byte 0xc4,0xe3,0x1d,0x44,0xe2,0x00 + vpxor %ymm4,%ymm5,%ymm5 +.byte 0xc4,0xe3,0x1d,0x44,0xe2,0x01 + vpxor %ymm4,%ymm6,%ymm6 +.byte 0xc4,0xe3,0x1d,0x44,0xe2,0x10 + vpxor %ymm4,%ymm6,%ymm6 +.byte 0xc4,0xe3,0x1d,0x44,0xe2,0x11 + vpxor %ymm4,%ymm7,%ymm7 + +.Lreduce__func2: + + vbroadcasti128 .Lgfpoly(%rip),%ymm2 +.byte 0xc4,0xe3,0x6d,0x44,0xdd,0x01 + vpshufd $0x4e,%ymm5,%ymm5 + vpxor %ymm5,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 +.byte 0xc4,0xe3,0x6d,0x44,0xde,0x01 + vpshufd $0x4e,%ymm6,%ymm6 + vpxor %ymm6,%ymm7,%ymm7 + vpxor %ymm3,%ymm7,%ymm7 + vextracti128 $1,%ymm7,%xmm1 + vpxor %xmm7,%xmm1,%xmm1 + +.Ldone__func2: + + vpshufb %xmm0,%xmm1,%xmm1 + vmovdqu %xmm1,(%r12) + + vzeroupper + popq %r12 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r12 + ret + +.cfi_endproc +.size aes_gcm_dec_update_vaes_avx2, . - aes_gcm_dec_update_vaes_avx2 +#endif diff --git a/ring-0.17.14/pregenerated/aes-gcm-avx2-x86_64-macosx.S b/ring-0.17.14/pregenerated/aes-gcm-avx2-x86_64-macosx.S new file mode 100644 index 0000000000..9cb6127936 --- /dev/null +++ b/ring-0.17.14/pregenerated/aes-gcm-avx2-x86_64-macosx.S @@ -0,0 +1,1167 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__) +.section __DATA,__const +.p2align 4 + + +L$bswap_mask: +.quad 0x08090a0b0c0d0e0f, 0x0001020304050607 + + + + + + + + +L$gfpoly: +.quad 1, 0xc200000000000000 + + +L$gfpoly_and_internal_carrybit: +.quad 1, 0xc200000000000001 + +.p2align 5 + +L$ctr_pattern: +.quad 0, 0 +.quad 1, 0 +L$inc_2blocks: +.quad 2, 0 +.quad 2, 0 + +.text +.globl _gcm_init_vpclmulqdq_avx2 +.private_extern _gcm_init_vpclmulqdq_avx2 + +.p2align 5 +_gcm_init_vpclmulqdq_avx2: + + +_CET_ENDBR + + + + + + vpshufd $0x4e,(%rsi),%xmm3 + + + + + + vpshufd $0xd3,%xmm3,%xmm0 + vpsrad $31,%xmm0,%xmm0 + vpaddq %xmm3,%xmm3,%xmm3 + vpand L$gfpoly_and_internal_carrybit(%rip),%xmm0,%xmm0 + vpxor %xmm0,%xmm3,%xmm3 + + vbroadcasti128 L$gfpoly(%rip),%ymm6 + + + vpclmulqdq $0x00,%xmm3,%xmm3,%xmm0 + vpclmulqdq $0x01,%xmm3,%xmm3,%xmm1 + vpclmulqdq $0x10,%xmm3,%xmm3,%xmm2 + vpxor %xmm2,%xmm1,%xmm1 + vpclmulqdq $0x01,%xmm0,%xmm6,%xmm2 + vpshufd $0x4e,%xmm0,%xmm0 + vpxor %xmm0,%xmm1,%xmm1 + vpxor %xmm2,%xmm1,%xmm1 + vpclmulqdq $0x11,%xmm3,%xmm3,%xmm5 + vpclmulqdq $0x01,%xmm1,%xmm6,%xmm0 + vpshufd $0x4e,%xmm1,%xmm1 + vpxor %xmm1,%xmm5,%xmm5 + vpxor %xmm0,%xmm5,%xmm5 + + + + vinserti128 $1,%xmm3,%ymm5,%ymm3 + vinserti128 $1,%xmm5,%ymm5,%ymm5 + + +.byte 0xc4,0xe3,0x65,0x44,0xc5,0x00 +.byte 0xc4,0xe3,0x65,0x44,0xcd,0x01 +.byte 0xc4,0xe3,0x65,0x44,0xd5,0x10 + vpxor %ymm2,%ymm1,%ymm1 +.byte 0xc4,0xe3,0x4d,0x44,0xd0,0x01 + vpshufd $0x4e,%ymm0,%ymm0 + vpxor %ymm0,%ymm1,%ymm1 + vpxor %ymm2,%ymm1,%ymm1 +.byte 0xc4,0xe3,0x65,0x44,0xe5,0x11 +.byte 0xc4,0xe3,0x4d,0x44,0xc1,0x01 + vpshufd $0x4e,%ymm1,%ymm1 + vpxor %ymm1,%ymm4,%ymm4 + vpxor %ymm0,%ymm4,%ymm4 + + + + vmovdqu %ymm3,96(%rdi) + vmovdqu %ymm4,64(%rdi) + + + + vpunpcklqdq %ymm3,%ymm4,%ymm0 + vpunpckhqdq %ymm3,%ymm4,%ymm1 + vpxor %ymm1,%ymm0,%ymm0 + vmovdqu %ymm0,128+32(%rdi) + + +.byte 0xc4,0xe3,0x5d,0x44,0xc5,0x00 +.byte 0xc4,0xe3,0x5d,0x44,0xcd,0x01 +.byte 0xc4,0xe3,0x5d,0x44,0xd5,0x10 + vpxor %ymm2,%ymm1,%ymm1 +.byte 0xc4,0xe3,0x4d,0x44,0xd0,0x01 + vpshufd $0x4e,%ymm0,%ymm0 + vpxor %ymm0,%ymm1,%ymm1 + vpxor %ymm2,%ymm1,%ymm1 +.byte 0xc4,0xe3,0x5d,0x44,0xdd,0x11 +.byte 0xc4,0xe3,0x4d,0x44,0xc1,0x01 + vpshufd $0x4e,%ymm1,%ymm1 + vpxor %ymm1,%ymm3,%ymm3 + vpxor %ymm0,%ymm3,%ymm3 + +.byte 0xc4,0xe3,0x65,0x44,0xc5,0x00 +.byte 0xc4,0xe3,0x65,0x44,0xcd,0x01 +.byte 0xc4,0xe3,0x65,0x44,0xd5,0x10 + vpxor %ymm2,%ymm1,%ymm1 +.byte 0xc4,0xe3,0x4d,0x44,0xd0,0x01 + vpshufd $0x4e,%ymm0,%ymm0 + vpxor %ymm0,%ymm1,%ymm1 + vpxor %ymm2,%ymm1,%ymm1 +.byte 0xc4,0xe3,0x65,0x44,0xe5,0x11 +.byte 0xc4,0xe3,0x4d,0x44,0xc1,0x01 + vpshufd $0x4e,%ymm1,%ymm1 + vpxor %ymm1,%ymm4,%ymm4 + vpxor %ymm0,%ymm4,%ymm4 + + vmovdqu %ymm3,32(%rdi) + vmovdqu %ymm4,0(%rdi) + + + + vpunpcklqdq %ymm3,%ymm4,%ymm0 + vpunpckhqdq %ymm3,%ymm4,%ymm1 + vpxor %ymm1,%ymm0,%ymm0 + vmovdqu %ymm0,128(%rdi) + + vzeroupper + ret + + + +.globl _gcm_ghash_vpclmulqdq_avx2_1 +.private_extern _gcm_ghash_vpclmulqdq_avx2_1 + +.p2align 5 +_gcm_ghash_vpclmulqdq_avx2_1: + + +_CET_ENDBR + + + + + + + vmovdqu L$bswap_mask(%rip),%xmm6 + vmovdqu L$gfpoly(%rip),%xmm7 + + + vmovdqu (%rdi),%xmm5 + vpshufb %xmm6,%xmm5,%xmm5 + + + +L$ghash_lastblock: + vmovdqu (%rdx),%xmm0 + vpshufb %xmm6,%xmm0,%xmm0 + vpxor %xmm0,%xmm5,%xmm5 + vmovdqu 128-16(%rsi),%xmm0 + vpclmulqdq $0x00,%xmm0,%xmm5,%xmm1 + vpclmulqdq $0x01,%xmm0,%xmm5,%xmm2 + vpclmulqdq $0x10,%xmm0,%xmm5,%xmm3 + vpxor %xmm3,%xmm2,%xmm2 + vpclmulqdq $0x01,%xmm1,%xmm7,%xmm3 + vpshufd $0x4e,%xmm1,%xmm1 + vpxor %xmm1,%xmm2,%xmm2 + 
vpxor %xmm3,%xmm2,%xmm2 + vpclmulqdq $0x11,%xmm0,%xmm5,%xmm5 + vpclmulqdq $0x01,%xmm2,%xmm7,%xmm1 + vpshufd $0x4e,%xmm2,%xmm2 + vpxor %xmm2,%xmm5,%xmm5 + vpxor %xmm1,%xmm5,%xmm5 + + +L$ghash_done: + + vpshufb %xmm6,%xmm5,%xmm5 + vmovdqu %xmm5,(%rdi) + + vzeroupper + ret + + + +.globl _aes_gcm_enc_update_vaes_avx2 +.private_extern _aes_gcm_enc_update_vaes_avx2 + +.p2align 5 +_aes_gcm_enc_update_vaes_avx2: + + +_CET_ENDBR + pushq %r12 + + + movq 16(%rsp),%r12 +#ifdef BORINGSSL_DISPATCH_TEST + + movb $1,_BORINGSSL_function_hit+8(%rip) +#endif + vbroadcasti128 L$bswap_mask(%rip),%ymm0 + + + + vmovdqu (%r12),%xmm1 + vpshufb %xmm0,%xmm1,%xmm1 + vbroadcasti128 (%r8),%ymm11 + vpshufb %ymm0,%ymm11,%ymm11 + + + + movl 240(%rcx),%r10d + leal -20(,%r10,4),%r10d + + + + + leaq 96(%rcx,%r10,4),%r11 + vbroadcasti128 (%rcx),%ymm9 + vbroadcasti128 (%r11),%ymm10 + + + vpaddd L$ctr_pattern(%rip),%ymm11,%ymm11 + + + + cmpq $127,%rdx + jbe L$crypt_loop_4x_done__func1 + + vmovdqu 128(%r9),%ymm7 + vmovdqu 128+32(%r9),%ymm8 + + + + vmovdqu L$inc_2blocks(%rip),%ymm2 + vpshufb %ymm0,%ymm11,%ymm12 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm13 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm14 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm15 + vpaddd %ymm2,%ymm11,%ymm11 + + + vpxor %ymm9,%ymm12,%ymm12 + vpxor %ymm9,%ymm13,%ymm13 + vpxor %ymm9,%ymm14,%ymm14 + vpxor %ymm9,%ymm15,%ymm15 + + leaq 16(%rcx),%rax +L$vaesenc_loop_first_4_vecs__func1: + vbroadcasti128 (%rax),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + + addq $16,%rax + cmpq %rax,%r11 + jne L$vaesenc_loop_first_4_vecs__func1 + vpxor 0(%rdi),%ymm10,%ymm2 + vpxor 32(%rdi),%ymm10,%ymm3 + vpxor 64(%rdi),%ymm10,%ymm5 + vpxor 96(%rdi),%ymm10,%ymm6 +.byte 0xc4,0x62,0x1d,0xdd,0xe2 +.byte 0xc4,0x62,0x15,0xdd,0xeb +.byte 0xc4,0x62,0x0d,0xdd,0xf5 +.byte 0xc4,0x62,0x05,0xdd,0xfe + vmovdqu %ymm12,0(%rsi) + vmovdqu %ymm13,32(%rsi) + vmovdqu %ymm14,64(%rsi) + vmovdqu %ymm15,96(%rsi) + + subq $-128,%rdi + addq $-128,%rdx + cmpq $127,%rdx + jbe L$ghash_last_ciphertext_4x__func1 +.p2align 4 +L$crypt_loop_4x__func1: + + + + + vmovdqu L$inc_2blocks(%rip),%ymm2 + vpshufb %ymm0,%ymm11,%ymm12 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm13 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm14 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm15 + vpaddd %ymm2,%ymm11,%ymm11 + + + vpxor %ymm9,%ymm12,%ymm12 + vpxor %ymm9,%ymm13,%ymm13 + vpxor %ymm9,%ymm14,%ymm14 + vpxor %ymm9,%ymm15,%ymm15 + + cmpl $24,%r10d + jl L$aes128__func1 + je L$aes192__func1 + + vbroadcasti128 -208(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + + vbroadcasti128 -192(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + +L$aes192__func1: + vbroadcasti128 -176(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + + vbroadcasti128 -160(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + +L$aes128__func1: + prefetcht0 512(%rdi) + prefetcht0 512+64(%rdi) + + vmovdqu 0(%rsi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 0(%r9),%ymm4 + vpxor %ymm1,%ymm3,%ymm3 +.byte 0xc4,0xe3,0x65,0x44,0xec,0x00 +.byte 
0xc4,0xe3,0x65,0x44,0xcc,0x11 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 +.byte 0xc4,0xe3,0x6d,0x44,0xf7,0x00 + + vbroadcasti128 -144(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + + + vbroadcasti128 -128(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + + + vmovdqu 32(%rsi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 32(%r9),%ymm4 +.byte 0xc4,0xe3,0x65,0x44,0xd4,0x00 + vpxor %ymm2,%ymm5,%ymm5 +.byte 0xc4,0xe3,0x65,0x44,0xd4,0x11 + vpxor %ymm2,%ymm1,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 +.byte 0xc4,0xe3,0x6d,0x44,0xd7,0x10 + vpxor %ymm2,%ymm6,%ymm6 + + vbroadcasti128 -112(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + + + vmovdqu 64(%rsi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 64(%r9),%ymm4 + + vbroadcasti128 -96(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + +.byte 0xc4,0xe3,0x65,0x44,0xd4,0x00 + vpxor %ymm2,%ymm5,%ymm5 +.byte 0xc4,0xe3,0x65,0x44,0xd4,0x11 + vpxor %ymm2,%ymm1,%ymm1 + + vbroadcasti128 -80(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 +.byte 0xc4,0xc3,0x6d,0x44,0xd0,0x00 + vpxor %ymm2,%ymm6,%ymm6 + + + vmovdqu 96(%rsi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + + vbroadcasti128 -64(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + + vmovdqu 96(%r9),%ymm4 +.byte 0xc4,0xe3,0x65,0x44,0xd4,0x00 + vpxor %ymm2,%ymm5,%ymm5 +.byte 0xc4,0xe3,0x65,0x44,0xd4,0x11 + vpxor %ymm2,%ymm1,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 +.byte 0xc4,0xc3,0x6d,0x44,0xd0,0x10 + vpxor %ymm2,%ymm6,%ymm6 + + vbroadcasti128 -48(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + + + vpxor %ymm5,%ymm6,%ymm6 + vpxor %ymm1,%ymm6,%ymm6 + + + vbroadcasti128 L$gfpoly(%rip),%ymm4 +.byte 0xc4,0xe3,0x5d,0x44,0xd5,0x01 + vpshufd $0x4e,%ymm5,%ymm5 + vpxor %ymm5,%ymm6,%ymm6 + vpxor %ymm2,%ymm6,%ymm6 + + vbroadcasti128 -32(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + + +.byte 0xc4,0xe3,0x5d,0x44,0xd6,0x01 + vpshufd $0x4e,%ymm6,%ymm6 + vpxor %ymm6,%ymm1,%ymm1 + vpxor %ymm2,%ymm1,%ymm1 + + vbroadcasti128 -16(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + + vextracti128 $1,%ymm1,%xmm2 + vpxor %xmm2,%xmm1,%xmm1 + + + subq $-128,%rsi + vpxor 0(%rdi),%ymm10,%ymm2 + vpxor 32(%rdi),%ymm10,%ymm3 + vpxor 64(%rdi),%ymm10,%ymm5 + vpxor 96(%rdi),%ymm10,%ymm6 +.byte 0xc4,0x62,0x1d,0xdd,0xe2 +.byte 0xc4,0x62,0x15,0xdd,0xeb +.byte 0xc4,0x62,0x0d,0xdd,0xf5 +.byte 0xc4,0x62,0x05,0xdd,0xfe + vmovdqu %ymm12,0(%rsi) + vmovdqu %ymm13,32(%rsi) + vmovdqu %ymm14,64(%rsi) + vmovdqu %ymm15,96(%rsi) + + subq $-128,%rdi + + addq $-128,%rdx + cmpq $127,%rdx + ja L$crypt_loop_4x__func1 +L$ghash_last_ciphertext_4x__func1: + + vmovdqu 0(%rsi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 0(%r9),%ymm4 + 
vpxor %ymm1,%ymm3,%ymm3 +.byte 0xc4,0xe3,0x65,0x44,0xec,0x00 +.byte 0xc4,0xe3,0x65,0x44,0xcc,0x11 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 +.byte 0xc4,0xe3,0x6d,0x44,0xf7,0x00 + + vmovdqu 32(%rsi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 32(%r9),%ymm4 +.byte 0xc4,0xe3,0x65,0x44,0xd4,0x00 + vpxor %ymm2,%ymm5,%ymm5 +.byte 0xc4,0xe3,0x65,0x44,0xd4,0x11 + vpxor %ymm2,%ymm1,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 +.byte 0xc4,0xe3,0x6d,0x44,0xd7,0x10 + vpxor %ymm2,%ymm6,%ymm6 + + vmovdqu 64(%rsi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 64(%r9),%ymm4 +.byte 0xc4,0xe3,0x65,0x44,0xd4,0x00 + vpxor %ymm2,%ymm5,%ymm5 +.byte 0xc4,0xe3,0x65,0x44,0xd4,0x11 + vpxor %ymm2,%ymm1,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 +.byte 0xc4,0xc3,0x6d,0x44,0xd0,0x00 + vpxor %ymm2,%ymm6,%ymm6 + + + vmovdqu 96(%rsi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 96(%r9),%ymm4 +.byte 0xc4,0xe3,0x65,0x44,0xd4,0x00 + vpxor %ymm2,%ymm5,%ymm5 +.byte 0xc4,0xe3,0x65,0x44,0xd4,0x11 + vpxor %ymm2,%ymm1,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 +.byte 0xc4,0xc3,0x6d,0x44,0xd0,0x10 + vpxor %ymm2,%ymm6,%ymm6 + + vpxor %ymm5,%ymm6,%ymm6 + vpxor %ymm1,%ymm6,%ymm6 + + + vbroadcasti128 L$gfpoly(%rip),%ymm4 +.byte 0xc4,0xe3,0x5d,0x44,0xd5,0x01 + vpshufd $0x4e,%ymm5,%ymm5 + vpxor %ymm5,%ymm6,%ymm6 + vpxor %ymm2,%ymm6,%ymm6 + +.byte 0xc4,0xe3,0x5d,0x44,0xd6,0x01 + vpshufd $0x4e,%ymm6,%ymm6 + vpxor %ymm6,%ymm1,%ymm1 + vpxor %ymm2,%ymm1,%ymm1 + vextracti128 $1,%ymm1,%xmm2 + vpxor %xmm2,%xmm1,%xmm1 + + subq $-128,%rsi +L$crypt_loop_4x_done__func1: + + testq %rdx,%rdx + jz L$done__func1 + + + + + + leaq 128(%r9),%r8 + subq %rdx,%r8 + + + vpxor %xmm5,%xmm5,%xmm5 + vpxor %xmm6,%xmm6,%xmm6 + vpxor %xmm7,%xmm7,%xmm7 + + cmpq $64,%rdx + jb L$lessthan64bytes__func1 + + + vpshufb %ymm0,%ymm11,%ymm12 + vpaddd L$inc_2blocks(%rip),%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm13 + vpaddd L$inc_2blocks(%rip),%ymm11,%ymm11 + vpxor %ymm9,%ymm12,%ymm12 + vpxor %ymm9,%ymm13,%ymm13 + leaq 16(%rcx),%rax +L$vaesenc_loop_tail_1__func1: + vbroadcasti128 (%rax),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea + addq $16,%rax + cmpq %rax,%r11 + jne L$vaesenc_loop_tail_1__func1 +.byte 0xc4,0x42,0x1d,0xdd,0xe2 +.byte 0xc4,0x42,0x15,0xdd,0xea + + + vmovdqu 0(%rdi),%ymm2 + vmovdqu 32(%rdi),%ymm3 + vpxor %ymm2,%ymm12,%ymm12 + vpxor %ymm3,%ymm13,%ymm13 + vmovdqu %ymm12,0(%rsi) + vmovdqu %ymm13,32(%rsi) + + + vpshufb %ymm0,%ymm12,%ymm12 + vpshufb %ymm0,%ymm13,%ymm13 + vpxor %ymm1,%ymm12,%ymm12 + vmovdqu (%r8),%ymm2 + vmovdqu 32(%r8),%ymm3 +.byte 0xc4,0xe3,0x1d,0x44,0xea,0x00 +.byte 0xc4,0xe3,0x1d,0x44,0xf2,0x01 +.byte 0xc4,0xe3,0x1d,0x44,0xe2,0x10 + vpxor %ymm4,%ymm6,%ymm6 +.byte 0xc4,0xe3,0x1d,0x44,0xfa,0x11 +.byte 0xc4,0xe3,0x15,0x44,0xe3,0x00 + vpxor %ymm4,%ymm5,%ymm5 +.byte 0xc4,0xe3,0x15,0x44,0xe3,0x01 + vpxor %ymm4,%ymm6,%ymm6 +.byte 0xc4,0xe3,0x15,0x44,0xe3,0x10 + vpxor %ymm4,%ymm6,%ymm6 +.byte 0xc4,0xe3,0x15,0x44,0xe3,0x11 + vpxor %ymm4,%ymm7,%ymm7 + + addq $64,%r8 + addq $64,%rdi + addq $64,%rsi + subq $64,%rdx + jz L$reduce__func1 + + vpxor %xmm1,%xmm1,%xmm1 + + +L$lessthan64bytes__func1: + vpshufb %ymm0,%ymm11,%ymm12 + vpaddd L$inc_2blocks(%rip),%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm13 + vpxor %ymm9,%ymm12,%ymm12 + vpxor %ymm9,%ymm13,%ymm13 + leaq 16(%rcx),%rax +L$vaesenc_loop_tail_2__func1: + vbroadcasti128 (%rax),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea + addq $16,%rax + cmpq %rax,%r11 + jne L$vaesenc_loop_tail_2__func1 
+.byte 0xc4,0x42,0x1d,0xdd,0xe2 +.byte 0xc4,0x42,0x15,0xdd,0xea + + + + + cmpq $32,%rdx + jb L$xor_one_block__func1 + je L$xor_two_blocks__func1 + +L$xor_three_blocks__func1: + vmovdqu 0(%rdi),%ymm2 + vmovdqu 32(%rdi),%xmm3 + vpxor %ymm2,%ymm12,%ymm12 + vpxor %xmm3,%xmm13,%xmm13 + vmovdqu %ymm12,0(%rsi) + vmovdqu %xmm13,32(%rsi) + + vpshufb %ymm0,%ymm12,%ymm12 + vpshufb %xmm0,%xmm13,%xmm13 + vpxor %ymm1,%ymm12,%ymm12 + vmovdqu (%r8),%ymm2 + vmovdqu 32(%r8),%xmm3 + vpclmulqdq $0x00,%xmm3,%xmm13,%xmm4 + vpxor %ymm4,%ymm5,%ymm5 + vpclmulqdq $0x01,%xmm3,%xmm13,%xmm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x10,%xmm3,%xmm13,%xmm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x11,%xmm3,%xmm13,%xmm4 + vpxor %ymm4,%ymm7,%ymm7 + jmp L$ghash_mul_one_vec_unreduced__func1 + +L$xor_two_blocks__func1: + vmovdqu (%rdi),%ymm2 + vpxor %ymm2,%ymm12,%ymm12 + vmovdqu %ymm12,(%rsi) + vpshufb %ymm0,%ymm12,%ymm12 + vpxor %ymm1,%ymm12,%ymm12 + vmovdqu (%r8),%ymm2 + jmp L$ghash_mul_one_vec_unreduced__func1 + +L$xor_one_block__func1: + vmovdqu (%rdi),%xmm2 + vpxor %xmm2,%xmm12,%xmm12 + vmovdqu %xmm12,(%rsi) + vpshufb %xmm0,%xmm12,%xmm12 + vpxor %xmm1,%xmm12,%xmm12 + vmovdqu (%r8),%xmm2 + +L$ghash_mul_one_vec_unreduced__func1: +.byte 0xc4,0xe3,0x1d,0x44,0xe2,0x00 + vpxor %ymm4,%ymm5,%ymm5 +.byte 0xc4,0xe3,0x1d,0x44,0xe2,0x01 + vpxor %ymm4,%ymm6,%ymm6 +.byte 0xc4,0xe3,0x1d,0x44,0xe2,0x10 + vpxor %ymm4,%ymm6,%ymm6 +.byte 0xc4,0xe3,0x1d,0x44,0xe2,0x11 + vpxor %ymm4,%ymm7,%ymm7 + +L$reduce__func1: + + vbroadcasti128 L$gfpoly(%rip),%ymm2 +.byte 0xc4,0xe3,0x6d,0x44,0xdd,0x01 + vpshufd $0x4e,%ymm5,%ymm5 + vpxor %ymm5,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 +.byte 0xc4,0xe3,0x6d,0x44,0xde,0x01 + vpshufd $0x4e,%ymm6,%ymm6 + vpxor %ymm6,%ymm7,%ymm7 + vpxor %ymm3,%ymm7,%ymm7 + vextracti128 $1,%ymm7,%xmm1 + vpxor %xmm7,%xmm1,%xmm1 + +L$done__func1: + + vpshufb %xmm0,%xmm1,%xmm1 + vmovdqu %xmm1,(%r12) + + vzeroupper + popq %r12 + + ret + + + +.globl _aes_gcm_dec_update_vaes_avx2 +.private_extern _aes_gcm_dec_update_vaes_avx2 + +.p2align 5 +_aes_gcm_dec_update_vaes_avx2: + + +_CET_ENDBR + pushq %r12 + + + movq 16(%rsp),%r12 + vbroadcasti128 L$bswap_mask(%rip),%ymm0 + + + + vmovdqu (%r12),%xmm1 + vpshufb %xmm0,%xmm1,%xmm1 + vbroadcasti128 (%r8),%ymm11 + vpshufb %ymm0,%ymm11,%ymm11 + + + + movl 240(%rcx),%r10d + leal -20(,%r10,4),%r10d + + + + + leaq 96(%rcx,%r10,4),%r11 + vbroadcasti128 (%rcx),%ymm9 + vbroadcasti128 (%r11),%ymm10 + + + vpaddd L$ctr_pattern(%rip),%ymm11,%ymm11 + + + + cmpq $127,%rdx + jbe L$crypt_loop_4x_done__func2 + + vmovdqu 128(%r9),%ymm7 + vmovdqu 128+32(%r9),%ymm8 +.p2align 4 +L$crypt_loop_4x__func2: + + + + + vmovdqu L$inc_2blocks(%rip),%ymm2 + vpshufb %ymm0,%ymm11,%ymm12 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm13 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm14 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm15 + vpaddd %ymm2,%ymm11,%ymm11 + + + vpxor %ymm9,%ymm12,%ymm12 + vpxor %ymm9,%ymm13,%ymm13 + vpxor %ymm9,%ymm14,%ymm14 + vpxor %ymm9,%ymm15,%ymm15 + + cmpl $24,%r10d + jl L$aes128__func2 + je L$aes192__func2 + + vbroadcasti128 -208(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + + vbroadcasti128 -192(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + +L$aes192__func2: + vbroadcasti128 -176(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 
0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + + vbroadcasti128 -160(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + +L$aes128__func2: + prefetcht0 512(%rdi) + prefetcht0 512+64(%rdi) + + vmovdqu 0(%rdi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 0(%r9),%ymm4 + vpxor %ymm1,%ymm3,%ymm3 +.byte 0xc4,0xe3,0x65,0x44,0xec,0x00 +.byte 0xc4,0xe3,0x65,0x44,0xcc,0x11 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 +.byte 0xc4,0xe3,0x6d,0x44,0xf7,0x00 + + vbroadcasti128 -144(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + + + vbroadcasti128 -128(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + + + vmovdqu 32(%rdi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 32(%r9),%ymm4 +.byte 0xc4,0xe3,0x65,0x44,0xd4,0x00 + vpxor %ymm2,%ymm5,%ymm5 +.byte 0xc4,0xe3,0x65,0x44,0xd4,0x11 + vpxor %ymm2,%ymm1,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 +.byte 0xc4,0xe3,0x6d,0x44,0xd7,0x10 + vpxor %ymm2,%ymm6,%ymm6 + + vbroadcasti128 -112(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + + + vmovdqu 64(%rdi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 64(%r9),%ymm4 + + vbroadcasti128 -96(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + +.byte 0xc4,0xe3,0x65,0x44,0xd4,0x00 + vpxor %ymm2,%ymm5,%ymm5 +.byte 0xc4,0xe3,0x65,0x44,0xd4,0x11 + vpxor %ymm2,%ymm1,%ymm1 + + vbroadcasti128 -80(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 +.byte 0xc4,0xc3,0x6d,0x44,0xd0,0x00 + vpxor %ymm2,%ymm6,%ymm6 + + + vmovdqu 96(%rdi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + + vbroadcasti128 -64(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + + vmovdqu 96(%r9),%ymm4 +.byte 0xc4,0xe3,0x65,0x44,0xd4,0x00 + vpxor %ymm2,%ymm5,%ymm5 +.byte 0xc4,0xe3,0x65,0x44,0xd4,0x11 + vpxor %ymm2,%ymm1,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 +.byte 0xc4,0xc3,0x6d,0x44,0xd0,0x10 + vpxor %ymm2,%ymm6,%ymm6 + + vbroadcasti128 -48(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + + + vpxor %ymm5,%ymm6,%ymm6 + vpxor %ymm1,%ymm6,%ymm6 + + + vbroadcasti128 L$gfpoly(%rip),%ymm4 +.byte 0xc4,0xe3,0x5d,0x44,0xd5,0x01 + vpshufd $0x4e,%ymm5,%ymm5 + vpxor %ymm5,%ymm6,%ymm6 + vpxor %ymm2,%ymm6,%ymm6 + + vbroadcasti128 -32(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + + +.byte 0xc4,0xe3,0x5d,0x44,0xd6,0x01 + vpshufd $0x4e,%ymm6,%ymm6 + vpxor %ymm6,%ymm1,%ymm1 + vpxor %ymm2,%ymm1,%ymm1 + + vbroadcasti128 -16(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + + vextracti128 $1,%ymm1,%xmm2 + vpxor %xmm2,%xmm1,%xmm1 + + + + vpxor 0(%rdi),%ymm10,%ymm2 + vpxor 32(%rdi),%ymm10,%ymm3 + vpxor 64(%rdi),%ymm10,%ymm5 + vpxor 96(%rdi),%ymm10,%ymm6 +.byte 
0xc4,0x62,0x1d,0xdd,0xe2 +.byte 0xc4,0x62,0x15,0xdd,0xeb +.byte 0xc4,0x62,0x0d,0xdd,0xf5 +.byte 0xc4,0x62,0x05,0xdd,0xfe + vmovdqu %ymm12,0(%rsi) + vmovdqu %ymm13,32(%rsi) + vmovdqu %ymm14,64(%rsi) + vmovdqu %ymm15,96(%rsi) + + subq $-128,%rdi + subq $-128,%rsi + addq $-128,%rdx + cmpq $127,%rdx + ja L$crypt_loop_4x__func2 +L$crypt_loop_4x_done__func2: + + testq %rdx,%rdx + jz L$done__func2 + + + + + + leaq 128(%r9),%r8 + subq %rdx,%r8 + + + vpxor %xmm5,%xmm5,%xmm5 + vpxor %xmm6,%xmm6,%xmm6 + vpxor %xmm7,%xmm7,%xmm7 + + cmpq $64,%rdx + jb L$lessthan64bytes__func2 + + + vpshufb %ymm0,%ymm11,%ymm12 + vpaddd L$inc_2blocks(%rip),%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm13 + vpaddd L$inc_2blocks(%rip),%ymm11,%ymm11 + vpxor %ymm9,%ymm12,%ymm12 + vpxor %ymm9,%ymm13,%ymm13 + leaq 16(%rcx),%rax +L$vaesenc_loop_tail_1__func2: + vbroadcasti128 (%rax),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea + addq $16,%rax + cmpq %rax,%r11 + jne L$vaesenc_loop_tail_1__func2 +.byte 0xc4,0x42,0x1d,0xdd,0xe2 +.byte 0xc4,0x42,0x15,0xdd,0xea + + + vmovdqu 0(%rdi),%ymm2 + vmovdqu 32(%rdi),%ymm3 + vpxor %ymm2,%ymm12,%ymm12 + vpxor %ymm3,%ymm13,%ymm13 + vmovdqu %ymm12,0(%rsi) + vmovdqu %ymm13,32(%rsi) + + + vpshufb %ymm0,%ymm2,%ymm12 + vpshufb %ymm0,%ymm3,%ymm13 + vpxor %ymm1,%ymm12,%ymm12 + vmovdqu (%r8),%ymm2 + vmovdqu 32(%r8),%ymm3 +.byte 0xc4,0xe3,0x1d,0x44,0xea,0x00 +.byte 0xc4,0xe3,0x1d,0x44,0xf2,0x01 +.byte 0xc4,0xe3,0x1d,0x44,0xe2,0x10 + vpxor %ymm4,%ymm6,%ymm6 +.byte 0xc4,0xe3,0x1d,0x44,0xfa,0x11 +.byte 0xc4,0xe3,0x15,0x44,0xe3,0x00 + vpxor %ymm4,%ymm5,%ymm5 +.byte 0xc4,0xe3,0x15,0x44,0xe3,0x01 + vpxor %ymm4,%ymm6,%ymm6 +.byte 0xc4,0xe3,0x15,0x44,0xe3,0x10 + vpxor %ymm4,%ymm6,%ymm6 +.byte 0xc4,0xe3,0x15,0x44,0xe3,0x11 + vpxor %ymm4,%ymm7,%ymm7 + + addq $64,%r8 + addq $64,%rdi + addq $64,%rsi + subq $64,%rdx + jz L$reduce__func2 + + vpxor %xmm1,%xmm1,%xmm1 + + +L$lessthan64bytes__func2: + vpshufb %ymm0,%ymm11,%ymm12 + vpaddd L$inc_2blocks(%rip),%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm13 + vpxor %ymm9,%ymm12,%ymm12 + vpxor %ymm9,%ymm13,%ymm13 + leaq 16(%rcx),%rax +L$vaesenc_loop_tail_2__func2: + vbroadcasti128 (%rax),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea + addq $16,%rax + cmpq %rax,%r11 + jne L$vaesenc_loop_tail_2__func2 +.byte 0xc4,0x42,0x1d,0xdd,0xe2 +.byte 0xc4,0x42,0x15,0xdd,0xea + + + + + cmpq $32,%rdx + jb L$xor_one_block__func2 + je L$xor_two_blocks__func2 + +L$xor_three_blocks__func2: + vmovdqu 0(%rdi),%ymm2 + vmovdqu 32(%rdi),%xmm3 + vpxor %ymm2,%ymm12,%ymm12 + vpxor %xmm3,%xmm13,%xmm13 + vmovdqu %ymm12,0(%rsi) + vmovdqu %xmm13,32(%rsi) + + vpshufb %ymm0,%ymm2,%ymm12 + vpshufb %xmm0,%xmm3,%xmm13 + vpxor %ymm1,%ymm12,%ymm12 + vmovdqu (%r8),%ymm2 + vmovdqu 32(%r8),%xmm3 + vpclmulqdq $0x00,%xmm3,%xmm13,%xmm4 + vpxor %ymm4,%ymm5,%ymm5 + vpclmulqdq $0x01,%xmm3,%xmm13,%xmm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x10,%xmm3,%xmm13,%xmm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x11,%xmm3,%xmm13,%xmm4 + vpxor %ymm4,%ymm7,%ymm7 + jmp L$ghash_mul_one_vec_unreduced__func2 + +L$xor_two_blocks__func2: + vmovdqu (%rdi),%ymm2 + vpxor %ymm2,%ymm12,%ymm12 + vmovdqu %ymm12,(%rsi) + vpshufb %ymm0,%ymm2,%ymm12 + vpxor %ymm1,%ymm12,%ymm12 + vmovdqu (%r8),%ymm2 + jmp L$ghash_mul_one_vec_unreduced__func2 + +L$xor_one_block__func2: + vmovdqu (%rdi),%xmm2 + vpxor %xmm2,%xmm12,%xmm12 + vmovdqu %xmm12,(%rsi) + vpshufb %xmm0,%xmm2,%xmm12 + vpxor %xmm1,%xmm12,%xmm12 + vmovdqu (%r8),%xmm2 + +L$ghash_mul_one_vec_unreduced__func2: +.byte 0xc4,0xe3,0x1d,0x44,0xe2,0x00 + vpxor 
%ymm4,%ymm5,%ymm5 +.byte 0xc4,0xe3,0x1d,0x44,0xe2,0x01 + vpxor %ymm4,%ymm6,%ymm6 +.byte 0xc4,0xe3,0x1d,0x44,0xe2,0x10 + vpxor %ymm4,%ymm6,%ymm6 +.byte 0xc4,0xe3,0x1d,0x44,0xe2,0x11 + vpxor %ymm4,%ymm7,%ymm7 + +L$reduce__func2: + + vbroadcasti128 L$gfpoly(%rip),%ymm2 +.byte 0xc4,0xe3,0x6d,0x44,0xdd,0x01 + vpshufd $0x4e,%ymm5,%ymm5 + vpxor %ymm5,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 +.byte 0xc4,0xe3,0x6d,0x44,0xde,0x01 + vpshufd $0x4e,%ymm6,%ymm6 + vpxor %ymm6,%ymm7,%ymm7 + vpxor %ymm3,%ymm7,%ymm7 + vextracti128 $1,%ymm7,%xmm1 + vpxor %xmm7,%xmm1,%xmm1 + +L$done__func2: + + vpshufb %xmm0,%xmm1,%xmm1 + vmovdqu %xmm1,(%r12) + + vzeroupper + popq %r12 + + ret + + + +#endif diff --git a/ring-0.17.14/pregenerated/aes-gcm-avx2-x86_64-nasm.asm b/ring-0.17.14/pregenerated/aes-gcm-avx2-x86_64-nasm.asm new file mode 100644 index 0000000000..f07e5a3d6a --- /dev/null +++ b/ring-0.17.14/pregenerated/aes-gcm-avx2-x86_64-nasm.asm @@ -0,0 +1,1423 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. + +%ifidn __OUTPUT_FORMAT__, win64 +default rel +%define XMMWORD +%define YMMWORD +%define ZMMWORD +%define _CET_ENDBR + +%include "ring_core_generated/prefix_symbols_nasm.inc" +section .rdata rdata align=8 +ALIGN 16 + + +$L$bswap_mask: + DQ 0x08090a0b0c0d0e0f,0x0001020304050607 + + + + + + + + +$L$gfpoly: + DQ 1,0xc200000000000000 + + +$L$gfpoly_and_internal_carrybit: + DQ 1,0xc200000000000001 + +ALIGN 32 + +$L$ctr_pattern: + DQ 0,0 + DQ 1,0 +$L$inc_2blocks: + DQ 2,0 + DQ 2,0 + +section .text code align=64 + +global gcm_init_vpclmulqdq_avx2 + +ALIGN 32 +gcm_init_vpclmulqdq_avx2: + +$L$SEH_begin_gcm_init_vpclmulqdq_avx2_1: +_CET_ENDBR + sub rsp,24 +$L$SEH_prologue_gcm_init_vpclmulqdq_avx2_2: + movdqa XMMWORD[rsp],xmm6 +$L$SEH_prologue_gcm_init_vpclmulqdq_avx2_3: + +$L$SEH_endprologue_gcm_init_vpclmulqdq_avx2_4: + + + + vpshufd xmm3,XMMWORD[rdx],0x4e + + + + + + vpshufd xmm0,xmm3,0xd3 + vpsrad xmm0,xmm0,31 + vpaddq xmm3,xmm3,xmm3 + vpand xmm0,xmm0,XMMWORD[$L$gfpoly_and_internal_carrybit] + vpxor xmm3,xmm3,xmm0 + + vbroadcasti128 ymm6,XMMWORD[$L$gfpoly] + + + vpclmulqdq xmm0,xmm3,xmm3,0x00 + vpclmulqdq xmm1,xmm3,xmm3,0x01 + vpclmulqdq xmm2,xmm3,xmm3,0x10 + vpxor xmm1,xmm1,xmm2 + vpclmulqdq xmm2,xmm6,xmm0,0x01 + vpshufd xmm0,xmm0,0x4e + vpxor xmm1,xmm1,xmm0 + vpxor xmm1,xmm1,xmm2 + vpclmulqdq xmm5,xmm3,xmm3,0x11 + vpclmulqdq xmm0,xmm6,xmm1,0x01 + vpshufd xmm1,xmm1,0x4e + vpxor xmm5,xmm5,xmm1 + vpxor xmm5,xmm5,xmm0 + + + + vinserti128 ymm3,ymm5,xmm3,1 + vinserti128 ymm5,ymm5,xmm5,1 + + + DB 0xc4,0xe3,0x65,0x44,0xc5,0x00 + DB 0xc4,0xe3,0x65,0x44,0xcd,0x01 + DB 0xc4,0xe3,0x65,0x44,0xd5,0x10 + vpxor ymm1,ymm1,ymm2 + DB 0xc4,0xe3,0x4d,0x44,0xd0,0x01 + vpshufd ymm0,ymm0,0x4e + vpxor ymm1,ymm1,ymm0 + vpxor ymm1,ymm1,ymm2 + DB 0xc4,0xe3,0x65,0x44,0xe5,0x11 + DB 0xc4,0xe3,0x4d,0x44,0xc1,0x01 + vpshufd ymm1,ymm1,0x4e + vpxor ymm4,ymm4,ymm1 + vpxor ymm4,ymm4,ymm0 + + + + vmovdqu YMMWORD[96+rcx],ymm3 + vmovdqu YMMWORD[64+rcx],ymm4 + + + + vpunpcklqdq ymm0,ymm4,ymm3 + vpunpckhqdq ymm1,ymm4,ymm3 + vpxor ymm0,ymm0,ymm1 + vmovdqu YMMWORD[(128+32)+rcx],ymm0 + + + DB 0xc4,0xe3,0x5d,0x44,0xc5,0x00 + DB 0xc4,0xe3,0x5d,0x44,0xcd,0x01 + DB 0xc4,0xe3,0x5d,0x44,0xd5,0x10 + vpxor ymm1,ymm1,ymm2 + DB 0xc4,0xe3,0x4d,0x44,0xd0,0x01 + vpshufd ymm0,ymm0,0x4e + vpxor ymm1,ymm1,ymm0 + vpxor ymm1,ymm1,ymm2 + DB 0xc4,0xe3,0x5d,0x44,0xdd,0x11 + DB 0xc4,0xe3,0x4d,0x44,0xc1,0x01 + vpshufd ymm1,ymm1,0x4e + vpxor ymm3,ymm3,ymm1 + vpxor ymm3,ymm3,ymm0 + + DB 0xc4,0xe3,0x65,0x44,0xc5,0x00 + 
DB 0xc4,0xe3,0x65,0x44,0xcd,0x01 + DB 0xc4,0xe3,0x65,0x44,0xd5,0x10 + vpxor ymm1,ymm1,ymm2 + DB 0xc4,0xe3,0x4d,0x44,0xd0,0x01 + vpshufd ymm0,ymm0,0x4e + vpxor ymm1,ymm1,ymm0 + vpxor ymm1,ymm1,ymm2 + DB 0xc4,0xe3,0x65,0x44,0xe5,0x11 + DB 0xc4,0xe3,0x4d,0x44,0xc1,0x01 + vpshufd ymm1,ymm1,0x4e + vpxor ymm4,ymm4,ymm1 + vpxor ymm4,ymm4,ymm0 + + vmovdqu YMMWORD[32+rcx],ymm3 + vmovdqu YMMWORD[rcx],ymm4 + + + + vpunpcklqdq ymm0,ymm4,ymm3 + vpunpckhqdq ymm1,ymm4,ymm3 + vpxor ymm0,ymm0,ymm1 + vmovdqu YMMWORD[128+rcx],ymm0 + + vzeroupper + movdqa xmm6,XMMWORD[rsp] + add rsp,24 + ret +$L$SEH_end_gcm_init_vpclmulqdq_avx2_5: + + +global gcm_ghash_vpclmulqdq_avx2_1 + +ALIGN 32 +gcm_ghash_vpclmulqdq_avx2_1: + +$L$SEH_begin_gcm_ghash_vpclmulqdq_avx2_1_1: +_CET_ENDBR + sub rsp,72 +$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx2_1_2: + movdqa XMMWORD[rsp],xmm6 +$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx2_1_3: + movdqa XMMWORD[16+rsp],xmm7 +$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx2_1_4: + movdqa XMMWORD[32+rsp],xmm8 +$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx2_1_5: + movdqa XMMWORD[48+rsp],xmm9 +$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx2_1_6: + +$L$SEH_endprologue_gcm_ghash_vpclmulqdq_avx2_1_7: + + + + + vmovdqu xmm6,XMMWORD[$L$bswap_mask] + vmovdqu xmm7,XMMWORD[$L$gfpoly] + + + vmovdqu xmm5,XMMWORD[rcx] + vpshufb xmm5,xmm5,xmm6 + + + +$L$ghash_lastblock: + vmovdqu xmm0,XMMWORD[r8] + vpshufb xmm0,xmm0,xmm6 + vpxor xmm5,xmm5,xmm0 + vmovdqu xmm0,XMMWORD[((128-16))+rdx] + vpclmulqdq xmm1,xmm5,xmm0,0x00 + vpclmulqdq xmm2,xmm5,xmm0,0x01 + vpclmulqdq xmm3,xmm5,xmm0,0x10 + vpxor xmm2,xmm2,xmm3 + vpclmulqdq xmm3,xmm7,xmm1,0x01 + vpshufd xmm1,xmm1,0x4e + vpxor xmm2,xmm2,xmm1 + vpxor xmm2,xmm2,xmm3 + vpclmulqdq xmm5,xmm5,xmm0,0x11 + vpclmulqdq xmm1,xmm7,xmm2,0x01 + vpshufd xmm2,xmm2,0x4e + vpxor xmm5,xmm5,xmm2 + vpxor xmm5,xmm5,xmm1 + + +$L$ghash_done: + + vpshufb xmm5,xmm5,xmm6 + vmovdqu XMMWORD[rcx],xmm5 + + vzeroupper + movdqa xmm6,XMMWORD[rsp] + movdqa xmm7,XMMWORD[16+rsp] + movdqa xmm8,XMMWORD[32+rsp] + movdqa xmm9,XMMWORD[48+rsp] + add rsp,72 + ret +$L$SEH_end_gcm_ghash_vpclmulqdq_avx2_1_8: + + +global aes_gcm_enc_update_vaes_avx2 + +ALIGN 32 +aes_gcm_enc_update_vaes_avx2: + +$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1: +_CET_ENDBR + push rsi +$L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_2: + push rdi +$L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_3: + push r12 +$L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_4: + + mov rsi,QWORD[64+rsp] + mov rdi,QWORD[72+rsp] + mov r12,QWORD[80+rsp] + sub rsp,160 +$L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_5: + movdqa XMMWORD[rsp],xmm6 +$L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_6: + movdqa XMMWORD[16+rsp],xmm7 +$L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_7: + movdqa XMMWORD[32+rsp],xmm8 +$L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_8: + movdqa XMMWORD[48+rsp],xmm9 +$L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_9: + movdqa XMMWORD[64+rsp],xmm10 +$L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_10: + movdqa XMMWORD[80+rsp],xmm11 +$L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_11: + movdqa XMMWORD[96+rsp],xmm12 +$L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_12: + movdqa XMMWORD[112+rsp],xmm13 +$L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_13: + movdqa XMMWORD[128+rsp],xmm14 +$L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_14: + movdqa XMMWORD[144+rsp],xmm15 +$L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_15: + +$L$SEH_endprologue_aes_gcm_enc_update_vaes_avx2_16: +%ifdef BORINGSSL_DISPATCH_TEST +EXTERN BORINGSSL_function_hit + mov BYTE[((BORINGSSL_function_hit+8))],1 +%endif + vbroadcasti128 
ymm0,XMMWORD[$L$bswap_mask] + + + + vmovdqu xmm1,XMMWORD[r12] + vpshufb xmm1,xmm1,xmm0 + vbroadcasti128 ymm11,XMMWORD[rsi] + vpshufb ymm11,ymm11,ymm0 + + + + mov r10d,DWORD[240+r9] + lea r10d,[((-20))+r10*4] + + + + + lea r11,[96+r10*4+r9] + vbroadcasti128 ymm9,XMMWORD[r9] + vbroadcasti128 ymm10,XMMWORD[r11] + + + vpaddd ymm11,ymm11,YMMWORD[$L$ctr_pattern] + + + + cmp r8,127 + jbe NEAR $L$crypt_loop_4x_done__func1 + + vmovdqu ymm7,YMMWORD[128+rdi] + vmovdqu ymm8,YMMWORD[((128+32))+rdi] + + + + vmovdqu ymm2,YMMWORD[$L$inc_2blocks] + vpshufb ymm12,ymm11,ymm0 + vpaddd ymm11,ymm11,ymm2 + vpshufb ymm13,ymm11,ymm0 + vpaddd ymm11,ymm11,ymm2 + vpshufb ymm14,ymm11,ymm0 + vpaddd ymm11,ymm11,ymm2 + vpshufb ymm15,ymm11,ymm0 + vpaddd ymm11,ymm11,ymm2 + + + vpxor ymm12,ymm12,ymm9 + vpxor ymm13,ymm13,ymm9 + vpxor ymm14,ymm14,ymm9 + vpxor ymm15,ymm15,ymm9 + + lea rax,[16+r9] +$L$vaesenc_loop_first_4_vecs__func1: + vbroadcasti128 ymm2,XMMWORD[rax] + DB 0xc4,0x62,0x1d,0xdc,0xe2 + DB 0xc4,0x62,0x15,0xdc,0xea + DB 0xc4,0x62,0x0d,0xdc,0xf2 + DB 0xc4,0x62,0x05,0xdc,0xfa + + add rax,16 + cmp r11,rax + jne NEAR $L$vaesenc_loop_first_4_vecs__func1 + vpxor ymm2,ymm10,YMMWORD[rcx] + vpxor ymm3,ymm10,YMMWORD[32+rcx] + vpxor ymm5,ymm10,YMMWORD[64+rcx] + vpxor ymm6,ymm10,YMMWORD[96+rcx] + DB 0xc4,0x62,0x1d,0xdd,0xe2 + DB 0xc4,0x62,0x15,0xdd,0xeb + DB 0xc4,0x62,0x0d,0xdd,0xf5 + DB 0xc4,0x62,0x05,0xdd,0xfe + vmovdqu YMMWORD[rdx],ymm12 + vmovdqu YMMWORD[32+rdx],ymm13 + vmovdqu YMMWORD[64+rdx],ymm14 + vmovdqu YMMWORD[96+rdx],ymm15 + + sub rcx,-128 + add r8,-128 + cmp r8,127 + jbe NEAR $L$ghash_last_ciphertext_4x__func1 +ALIGN 16 +$L$crypt_loop_4x__func1: + + + + + vmovdqu ymm2,YMMWORD[$L$inc_2blocks] + vpshufb ymm12,ymm11,ymm0 + vpaddd ymm11,ymm11,ymm2 + vpshufb ymm13,ymm11,ymm0 + vpaddd ymm11,ymm11,ymm2 + vpshufb ymm14,ymm11,ymm0 + vpaddd ymm11,ymm11,ymm2 + vpshufb ymm15,ymm11,ymm0 + vpaddd ymm11,ymm11,ymm2 + + + vpxor ymm12,ymm12,ymm9 + vpxor ymm13,ymm13,ymm9 + vpxor ymm14,ymm14,ymm9 + vpxor ymm15,ymm15,ymm9 + + cmp r10d,24 + jl NEAR $L$aes128__func1 + je NEAR $L$aes192__func1 + + vbroadcasti128 ymm2,XMMWORD[((-208))+r11] + DB 0xc4,0x62,0x1d,0xdc,0xe2 + DB 0xc4,0x62,0x15,0xdc,0xea + DB 0xc4,0x62,0x0d,0xdc,0xf2 + DB 0xc4,0x62,0x05,0xdc,0xfa + + vbroadcasti128 ymm2,XMMWORD[((-192))+r11] + DB 0xc4,0x62,0x1d,0xdc,0xe2 + DB 0xc4,0x62,0x15,0xdc,0xea + DB 0xc4,0x62,0x0d,0xdc,0xf2 + DB 0xc4,0x62,0x05,0xdc,0xfa + +$L$aes192__func1: + vbroadcasti128 ymm2,XMMWORD[((-176))+r11] + DB 0xc4,0x62,0x1d,0xdc,0xe2 + DB 0xc4,0x62,0x15,0xdc,0xea + DB 0xc4,0x62,0x0d,0xdc,0xf2 + DB 0xc4,0x62,0x05,0xdc,0xfa + + vbroadcasti128 ymm2,XMMWORD[((-160))+r11] + DB 0xc4,0x62,0x1d,0xdc,0xe2 + DB 0xc4,0x62,0x15,0xdc,0xea + DB 0xc4,0x62,0x0d,0xdc,0xf2 + DB 0xc4,0x62,0x05,0xdc,0xfa + +$L$aes128__func1: + prefetcht0 [512+rcx] + prefetcht0 [((512+64))+rcx] + + vmovdqu ymm3,YMMWORD[rdx] + vpshufb ymm3,ymm3,ymm0 + vmovdqu ymm4,YMMWORD[rdi] + vpxor ymm3,ymm3,ymm1 + DB 0xc4,0xe3,0x65,0x44,0xec,0x00 + DB 0xc4,0xe3,0x65,0x44,0xcc,0x11 + vpunpckhqdq ymm2,ymm3,ymm3 + vpxor ymm2,ymm2,ymm3 + DB 0xc4,0xe3,0x6d,0x44,0xf7,0x00 + + vbroadcasti128 ymm2,XMMWORD[((-144))+r11] + DB 0xc4,0x62,0x1d,0xdc,0xe2 + DB 0xc4,0x62,0x15,0xdc,0xea + DB 0xc4,0x62,0x0d,0xdc,0xf2 + DB 0xc4,0x62,0x05,0xdc,0xfa + + + vbroadcasti128 ymm2,XMMWORD[((-128))+r11] + DB 0xc4,0x62,0x1d,0xdc,0xe2 + DB 0xc4,0x62,0x15,0xdc,0xea + DB 0xc4,0x62,0x0d,0xdc,0xf2 + DB 0xc4,0x62,0x05,0xdc,0xfa + + + vmovdqu ymm3,YMMWORD[32+rdx] + vpshufb ymm3,ymm3,ymm0 + vmovdqu ymm4,YMMWORD[32+rdi] + DB 
0xc4,0xe3,0x65,0x44,0xd4,0x00 + vpxor ymm5,ymm5,ymm2 + DB 0xc4,0xe3,0x65,0x44,0xd4,0x11 + vpxor ymm1,ymm1,ymm2 + vpunpckhqdq ymm2,ymm3,ymm3 + vpxor ymm2,ymm2,ymm3 + DB 0xc4,0xe3,0x6d,0x44,0xd7,0x10 + vpxor ymm6,ymm6,ymm2 + + vbroadcasti128 ymm2,XMMWORD[((-112))+r11] + DB 0xc4,0x62,0x1d,0xdc,0xe2 + DB 0xc4,0x62,0x15,0xdc,0xea + DB 0xc4,0x62,0x0d,0xdc,0xf2 + DB 0xc4,0x62,0x05,0xdc,0xfa + + + vmovdqu ymm3,YMMWORD[64+rdx] + vpshufb ymm3,ymm3,ymm0 + vmovdqu ymm4,YMMWORD[64+rdi] + + vbroadcasti128 ymm2,XMMWORD[((-96))+r11] + DB 0xc4,0x62,0x1d,0xdc,0xe2 + DB 0xc4,0x62,0x15,0xdc,0xea + DB 0xc4,0x62,0x0d,0xdc,0xf2 + DB 0xc4,0x62,0x05,0xdc,0xfa + + DB 0xc4,0xe3,0x65,0x44,0xd4,0x00 + vpxor ymm5,ymm5,ymm2 + DB 0xc4,0xe3,0x65,0x44,0xd4,0x11 + vpxor ymm1,ymm1,ymm2 + + vbroadcasti128 ymm2,XMMWORD[((-80))+r11] + DB 0xc4,0x62,0x1d,0xdc,0xe2 + DB 0xc4,0x62,0x15,0xdc,0xea + DB 0xc4,0x62,0x0d,0xdc,0xf2 + DB 0xc4,0x62,0x05,0xdc,0xfa + + vpunpckhqdq ymm2,ymm3,ymm3 + vpxor ymm2,ymm2,ymm3 + DB 0xc4,0xc3,0x6d,0x44,0xd0,0x00 + vpxor ymm6,ymm6,ymm2 + + + vmovdqu ymm3,YMMWORD[96+rdx] + vpshufb ymm3,ymm3,ymm0 + + vbroadcasti128 ymm2,XMMWORD[((-64))+r11] + DB 0xc4,0x62,0x1d,0xdc,0xe2 + DB 0xc4,0x62,0x15,0xdc,0xea + DB 0xc4,0x62,0x0d,0xdc,0xf2 + DB 0xc4,0x62,0x05,0xdc,0xfa + + vmovdqu ymm4,YMMWORD[96+rdi] + DB 0xc4,0xe3,0x65,0x44,0xd4,0x00 + vpxor ymm5,ymm5,ymm2 + DB 0xc4,0xe3,0x65,0x44,0xd4,0x11 + vpxor ymm1,ymm1,ymm2 + vpunpckhqdq ymm2,ymm3,ymm3 + vpxor ymm2,ymm2,ymm3 + DB 0xc4,0xc3,0x6d,0x44,0xd0,0x10 + vpxor ymm6,ymm6,ymm2 + + vbroadcasti128 ymm2,XMMWORD[((-48))+r11] + DB 0xc4,0x62,0x1d,0xdc,0xe2 + DB 0xc4,0x62,0x15,0xdc,0xea + DB 0xc4,0x62,0x0d,0xdc,0xf2 + DB 0xc4,0x62,0x05,0xdc,0xfa + + + vpxor ymm6,ymm6,ymm5 + vpxor ymm6,ymm6,ymm1 + + + vbroadcasti128 ymm4,XMMWORD[$L$gfpoly] + DB 0xc4,0xe3,0x5d,0x44,0xd5,0x01 + vpshufd ymm5,ymm5,0x4e + vpxor ymm6,ymm6,ymm5 + vpxor ymm6,ymm6,ymm2 + + vbroadcasti128 ymm2,XMMWORD[((-32))+r11] + DB 0xc4,0x62,0x1d,0xdc,0xe2 + DB 0xc4,0x62,0x15,0xdc,0xea + DB 0xc4,0x62,0x0d,0xdc,0xf2 + DB 0xc4,0x62,0x05,0xdc,0xfa + + + DB 0xc4,0xe3,0x5d,0x44,0xd6,0x01 + vpshufd ymm6,ymm6,0x4e + vpxor ymm1,ymm1,ymm6 + vpxor ymm1,ymm1,ymm2 + + vbroadcasti128 ymm2,XMMWORD[((-16))+r11] + DB 0xc4,0x62,0x1d,0xdc,0xe2 + DB 0xc4,0x62,0x15,0xdc,0xea + DB 0xc4,0x62,0x0d,0xdc,0xf2 + DB 0xc4,0x62,0x05,0xdc,0xfa + + vextracti128 xmm2,ymm1,1 + vpxor xmm1,xmm1,xmm2 + + + sub rdx,-128 + vpxor ymm2,ymm10,YMMWORD[rcx] + vpxor ymm3,ymm10,YMMWORD[32+rcx] + vpxor ymm5,ymm10,YMMWORD[64+rcx] + vpxor ymm6,ymm10,YMMWORD[96+rcx] + DB 0xc4,0x62,0x1d,0xdd,0xe2 + DB 0xc4,0x62,0x15,0xdd,0xeb + DB 0xc4,0x62,0x0d,0xdd,0xf5 + DB 0xc4,0x62,0x05,0xdd,0xfe + vmovdqu YMMWORD[rdx],ymm12 + vmovdqu YMMWORD[32+rdx],ymm13 + vmovdqu YMMWORD[64+rdx],ymm14 + vmovdqu YMMWORD[96+rdx],ymm15 + + sub rcx,-128 + + add r8,-128 + cmp r8,127 + ja NEAR $L$crypt_loop_4x__func1 +$L$ghash_last_ciphertext_4x__func1: + + vmovdqu ymm3,YMMWORD[rdx] + vpshufb ymm3,ymm3,ymm0 + vmovdqu ymm4,YMMWORD[rdi] + vpxor ymm3,ymm3,ymm1 + DB 0xc4,0xe3,0x65,0x44,0xec,0x00 + DB 0xc4,0xe3,0x65,0x44,0xcc,0x11 + vpunpckhqdq ymm2,ymm3,ymm3 + vpxor ymm2,ymm2,ymm3 + DB 0xc4,0xe3,0x6d,0x44,0xf7,0x00 + + vmovdqu ymm3,YMMWORD[32+rdx] + vpshufb ymm3,ymm3,ymm0 + vmovdqu ymm4,YMMWORD[32+rdi] + DB 0xc4,0xe3,0x65,0x44,0xd4,0x00 + vpxor ymm5,ymm5,ymm2 + DB 0xc4,0xe3,0x65,0x44,0xd4,0x11 + vpxor ymm1,ymm1,ymm2 + vpunpckhqdq ymm2,ymm3,ymm3 + vpxor ymm2,ymm2,ymm3 + DB 0xc4,0xe3,0x6d,0x44,0xd7,0x10 + vpxor ymm6,ymm6,ymm2 + + vmovdqu ymm3,YMMWORD[64+rdx] + vpshufb ymm3,ymm3,ymm0 + vmovdqu 
ymm4,YMMWORD[64+rdi] + DB 0xc4,0xe3,0x65,0x44,0xd4,0x00 + vpxor ymm5,ymm5,ymm2 + DB 0xc4,0xe3,0x65,0x44,0xd4,0x11 + vpxor ymm1,ymm1,ymm2 + vpunpckhqdq ymm2,ymm3,ymm3 + vpxor ymm2,ymm2,ymm3 + DB 0xc4,0xc3,0x6d,0x44,0xd0,0x00 + vpxor ymm6,ymm6,ymm2 + + + vmovdqu ymm3,YMMWORD[96+rdx] + vpshufb ymm3,ymm3,ymm0 + vmovdqu ymm4,YMMWORD[96+rdi] + DB 0xc4,0xe3,0x65,0x44,0xd4,0x00 + vpxor ymm5,ymm5,ymm2 + DB 0xc4,0xe3,0x65,0x44,0xd4,0x11 + vpxor ymm1,ymm1,ymm2 + vpunpckhqdq ymm2,ymm3,ymm3 + vpxor ymm2,ymm2,ymm3 + DB 0xc4,0xc3,0x6d,0x44,0xd0,0x10 + vpxor ymm6,ymm6,ymm2 + + vpxor ymm6,ymm6,ymm5 + vpxor ymm6,ymm6,ymm1 + + + vbroadcasti128 ymm4,XMMWORD[$L$gfpoly] + DB 0xc4,0xe3,0x5d,0x44,0xd5,0x01 + vpshufd ymm5,ymm5,0x4e + vpxor ymm6,ymm6,ymm5 + vpxor ymm6,ymm6,ymm2 + + DB 0xc4,0xe3,0x5d,0x44,0xd6,0x01 + vpshufd ymm6,ymm6,0x4e + vpxor ymm1,ymm1,ymm6 + vpxor ymm1,ymm1,ymm2 + vextracti128 xmm2,ymm1,1 + vpxor xmm1,xmm1,xmm2 + + sub rdx,-128 +$L$crypt_loop_4x_done__func1: + + test r8,r8 + jz NEAR $L$done__func1 + + + + + + lea rsi,[128+rdi] + sub rsi,r8 + + + vpxor xmm5,xmm5,xmm5 + vpxor xmm6,xmm6,xmm6 + vpxor xmm7,xmm7,xmm7 + + cmp r8,64 + jb NEAR $L$lessthan64bytes__func1 + + + vpshufb ymm12,ymm11,ymm0 + vpaddd ymm11,ymm11,YMMWORD[$L$inc_2blocks] + vpshufb ymm13,ymm11,ymm0 + vpaddd ymm11,ymm11,YMMWORD[$L$inc_2blocks] + vpxor ymm12,ymm12,ymm9 + vpxor ymm13,ymm13,ymm9 + lea rax,[16+r9] +$L$vaesenc_loop_tail_1__func1: + vbroadcasti128 ymm2,XMMWORD[rax] + DB 0xc4,0x62,0x1d,0xdc,0xe2 + DB 0xc4,0x62,0x15,0xdc,0xea + add rax,16 + cmp r11,rax + jne NEAR $L$vaesenc_loop_tail_1__func1 + DB 0xc4,0x42,0x1d,0xdd,0xe2 + DB 0xc4,0x42,0x15,0xdd,0xea + + + vmovdqu ymm2,YMMWORD[rcx] + vmovdqu ymm3,YMMWORD[32+rcx] + vpxor ymm12,ymm12,ymm2 + vpxor ymm13,ymm13,ymm3 + vmovdqu YMMWORD[rdx],ymm12 + vmovdqu YMMWORD[32+rdx],ymm13 + + + vpshufb ymm12,ymm12,ymm0 + vpshufb ymm13,ymm13,ymm0 + vpxor ymm12,ymm12,ymm1 + vmovdqu ymm2,YMMWORD[rsi] + vmovdqu ymm3,YMMWORD[32+rsi] + DB 0xc4,0xe3,0x1d,0x44,0xea,0x00 + DB 0xc4,0xe3,0x1d,0x44,0xf2,0x01 + DB 0xc4,0xe3,0x1d,0x44,0xe2,0x10 + vpxor ymm6,ymm6,ymm4 + DB 0xc4,0xe3,0x1d,0x44,0xfa,0x11 + DB 0xc4,0xe3,0x15,0x44,0xe3,0x00 + vpxor ymm5,ymm5,ymm4 + DB 0xc4,0xe3,0x15,0x44,0xe3,0x01 + vpxor ymm6,ymm6,ymm4 + DB 0xc4,0xe3,0x15,0x44,0xe3,0x10 + vpxor ymm6,ymm6,ymm4 + DB 0xc4,0xe3,0x15,0x44,0xe3,0x11 + vpxor ymm7,ymm7,ymm4 + + add rsi,64 + add rcx,64 + add rdx,64 + sub r8,64 + jz NEAR $L$reduce__func1 + + vpxor xmm1,xmm1,xmm1 + + +$L$lessthan64bytes__func1: + vpshufb ymm12,ymm11,ymm0 + vpaddd ymm11,ymm11,YMMWORD[$L$inc_2blocks] + vpshufb ymm13,ymm11,ymm0 + vpxor ymm12,ymm12,ymm9 + vpxor ymm13,ymm13,ymm9 + lea rax,[16+r9] +$L$vaesenc_loop_tail_2__func1: + vbroadcasti128 ymm2,XMMWORD[rax] + DB 0xc4,0x62,0x1d,0xdc,0xe2 + DB 0xc4,0x62,0x15,0xdc,0xea + add rax,16 + cmp r11,rax + jne NEAR $L$vaesenc_loop_tail_2__func1 + DB 0xc4,0x42,0x1d,0xdd,0xe2 + DB 0xc4,0x42,0x15,0xdd,0xea + + + + + cmp r8,32 + jb NEAR $L$xor_one_block__func1 + je NEAR $L$xor_two_blocks__func1 + +$L$xor_three_blocks__func1: + vmovdqu ymm2,YMMWORD[rcx] + vmovdqu xmm3,XMMWORD[32+rcx] + vpxor ymm12,ymm12,ymm2 + vpxor xmm13,xmm13,xmm3 + vmovdqu YMMWORD[rdx],ymm12 + vmovdqu XMMWORD[32+rdx],xmm13 + + vpshufb ymm12,ymm12,ymm0 + vpshufb xmm13,xmm13,xmm0 + vpxor ymm12,ymm12,ymm1 + vmovdqu ymm2,YMMWORD[rsi] + vmovdqu xmm3,XMMWORD[32+rsi] + vpclmulqdq xmm4,xmm13,xmm3,0x00 + vpxor ymm5,ymm5,ymm4 + vpclmulqdq xmm4,xmm13,xmm3,0x01 + vpxor ymm6,ymm6,ymm4 + vpclmulqdq xmm4,xmm13,xmm3,0x10 + vpxor ymm6,ymm6,ymm4 + vpclmulqdq xmm4,xmm13,xmm3,0x11 + 
vpxor ymm7,ymm7,ymm4 + jmp NEAR $L$ghash_mul_one_vec_unreduced__func1 + +$L$xor_two_blocks__func1: + vmovdqu ymm2,YMMWORD[rcx] + vpxor ymm12,ymm12,ymm2 + vmovdqu YMMWORD[rdx],ymm12 + vpshufb ymm12,ymm12,ymm0 + vpxor ymm12,ymm12,ymm1 + vmovdqu ymm2,YMMWORD[rsi] + jmp NEAR $L$ghash_mul_one_vec_unreduced__func1 + +$L$xor_one_block__func1: + vmovdqu xmm2,XMMWORD[rcx] + vpxor xmm12,xmm12,xmm2 + vmovdqu XMMWORD[rdx],xmm12 + vpshufb xmm12,xmm12,xmm0 + vpxor xmm12,xmm12,xmm1 + vmovdqu xmm2,XMMWORD[rsi] + +$L$ghash_mul_one_vec_unreduced__func1: + DB 0xc4,0xe3,0x1d,0x44,0xe2,0x00 + vpxor ymm5,ymm5,ymm4 + DB 0xc4,0xe3,0x1d,0x44,0xe2,0x01 + vpxor ymm6,ymm6,ymm4 + DB 0xc4,0xe3,0x1d,0x44,0xe2,0x10 + vpxor ymm6,ymm6,ymm4 + DB 0xc4,0xe3,0x1d,0x44,0xe2,0x11 + vpxor ymm7,ymm7,ymm4 + +$L$reduce__func1: + + vbroadcasti128 ymm2,XMMWORD[$L$gfpoly] + DB 0xc4,0xe3,0x6d,0x44,0xdd,0x01 + vpshufd ymm5,ymm5,0x4e + vpxor ymm6,ymm6,ymm5 + vpxor ymm6,ymm6,ymm3 + DB 0xc4,0xe3,0x6d,0x44,0xde,0x01 + vpshufd ymm6,ymm6,0x4e + vpxor ymm7,ymm7,ymm6 + vpxor ymm7,ymm7,ymm3 + vextracti128 xmm1,ymm7,1 + vpxor xmm1,xmm1,xmm7 + +$L$done__func1: + + vpshufb xmm1,xmm1,xmm0 + vmovdqu XMMWORD[r12],xmm1 + + vzeroupper + movdqa xmm6,XMMWORD[rsp] + movdqa xmm7,XMMWORD[16+rsp] + movdqa xmm8,XMMWORD[32+rsp] + movdqa xmm9,XMMWORD[48+rsp] + movdqa xmm10,XMMWORD[64+rsp] + movdqa xmm11,XMMWORD[80+rsp] + movdqa xmm12,XMMWORD[96+rsp] + movdqa xmm13,XMMWORD[112+rsp] + movdqa xmm14,XMMWORD[128+rsp] + movdqa xmm15,XMMWORD[144+rsp] + add rsp,160 + pop r12 + pop rdi + pop rsi + ret +$L$SEH_end_aes_gcm_enc_update_vaes_avx2_17: + + +global aes_gcm_dec_update_vaes_avx2 + +ALIGN 32 +aes_gcm_dec_update_vaes_avx2: + +$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1: +_CET_ENDBR + push rsi +$L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_2: + push rdi +$L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_3: + push r12 +$L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_4: + + mov rsi,QWORD[64+rsp] + mov rdi,QWORD[72+rsp] + mov r12,QWORD[80+rsp] + sub rsp,160 +$L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_5: + movdqa XMMWORD[rsp],xmm6 +$L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_6: + movdqa XMMWORD[16+rsp],xmm7 +$L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_7: + movdqa XMMWORD[32+rsp],xmm8 +$L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_8: + movdqa XMMWORD[48+rsp],xmm9 +$L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_9: + movdqa XMMWORD[64+rsp],xmm10 +$L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_10: + movdqa XMMWORD[80+rsp],xmm11 +$L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_11: + movdqa XMMWORD[96+rsp],xmm12 +$L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_12: + movdqa XMMWORD[112+rsp],xmm13 +$L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_13: + movdqa XMMWORD[128+rsp],xmm14 +$L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_14: + movdqa XMMWORD[144+rsp],xmm15 +$L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_15: + +$L$SEH_endprologue_aes_gcm_dec_update_vaes_avx2_16: + vbroadcasti128 ymm0,XMMWORD[$L$bswap_mask] + + + + vmovdqu xmm1,XMMWORD[r12] + vpshufb xmm1,xmm1,xmm0 + vbroadcasti128 ymm11,XMMWORD[rsi] + vpshufb ymm11,ymm11,ymm0 + + + + mov r10d,DWORD[240+r9] + lea r10d,[((-20))+r10*4] + + + + + lea r11,[96+r10*4+r9] + vbroadcasti128 ymm9,XMMWORD[r9] + vbroadcasti128 ymm10,XMMWORD[r11] + + + vpaddd ymm11,ymm11,YMMWORD[$L$ctr_pattern] + + + + cmp r8,127 + jbe NEAR $L$crypt_loop_4x_done__func2 + + vmovdqu ymm7,YMMWORD[128+rdi] + vmovdqu ymm8,YMMWORD[((128+32))+rdi] +ALIGN 16 +$L$crypt_loop_4x__func2: + + + + + vmovdqu ymm2,YMMWORD[$L$inc_2blocks] + vpshufb ymm12,ymm11,ymm0 + vpaddd ymm11,ymm11,ymm2 + 
vpshufb ymm13,ymm11,ymm0 + vpaddd ymm11,ymm11,ymm2 + vpshufb ymm14,ymm11,ymm0 + vpaddd ymm11,ymm11,ymm2 + vpshufb ymm15,ymm11,ymm0 + vpaddd ymm11,ymm11,ymm2 + + + vpxor ymm12,ymm12,ymm9 + vpxor ymm13,ymm13,ymm9 + vpxor ymm14,ymm14,ymm9 + vpxor ymm15,ymm15,ymm9 + + cmp r10d,24 + jl NEAR $L$aes128__func2 + je NEAR $L$aes192__func2 + + vbroadcasti128 ymm2,XMMWORD[((-208))+r11] + DB 0xc4,0x62,0x1d,0xdc,0xe2 + DB 0xc4,0x62,0x15,0xdc,0xea + DB 0xc4,0x62,0x0d,0xdc,0xf2 + DB 0xc4,0x62,0x05,0xdc,0xfa + + vbroadcasti128 ymm2,XMMWORD[((-192))+r11] + DB 0xc4,0x62,0x1d,0xdc,0xe2 + DB 0xc4,0x62,0x15,0xdc,0xea + DB 0xc4,0x62,0x0d,0xdc,0xf2 + DB 0xc4,0x62,0x05,0xdc,0xfa + +$L$aes192__func2: + vbroadcasti128 ymm2,XMMWORD[((-176))+r11] + DB 0xc4,0x62,0x1d,0xdc,0xe2 + DB 0xc4,0x62,0x15,0xdc,0xea + DB 0xc4,0x62,0x0d,0xdc,0xf2 + DB 0xc4,0x62,0x05,0xdc,0xfa + + vbroadcasti128 ymm2,XMMWORD[((-160))+r11] + DB 0xc4,0x62,0x1d,0xdc,0xe2 + DB 0xc4,0x62,0x15,0xdc,0xea + DB 0xc4,0x62,0x0d,0xdc,0xf2 + DB 0xc4,0x62,0x05,0xdc,0xfa + +$L$aes128__func2: + prefetcht0 [512+rcx] + prefetcht0 [((512+64))+rcx] + + vmovdqu ymm3,YMMWORD[rcx] + vpshufb ymm3,ymm3,ymm0 + vmovdqu ymm4,YMMWORD[rdi] + vpxor ymm3,ymm3,ymm1 + DB 0xc4,0xe3,0x65,0x44,0xec,0x00 + DB 0xc4,0xe3,0x65,0x44,0xcc,0x11 + vpunpckhqdq ymm2,ymm3,ymm3 + vpxor ymm2,ymm2,ymm3 + DB 0xc4,0xe3,0x6d,0x44,0xf7,0x00 + + vbroadcasti128 ymm2,XMMWORD[((-144))+r11] + DB 0xc4,0x62,0x1d,0xdc,0xe2 + DB 0xc4,0x62,0x15,0xdc,0xea + DB 0xc4,0x62,0x0d,0xdc,0xf2 + DB 0xc4,0x62,0x05,0xdc,0xfa + + + vbroadcasti128 ymm2,XMMWORD[((-128))+r11] + DB 0xc4,0x62,0x1d,0xdc,0xe2 + DB 0xc4,0x62,0x15,0xdc,0xea + DB 0xc4,0x62,0x0d,0xdc,0xf2 + DB 0xc4,0x62,0x05,0xdc,0xfa + + + vmovdqu ymm3,YMMWORD[32+rcx] + vpshufb ymm3,ymm3,ymm0 + vmovdqu ymm4,YMMWORD[32+rdi] + DB 0xc4,0xe3,0x65,0x44,0xd4,0x00 + vpxor ymm5,ymm5,ymm2 + DB 0xc4,0xe3,0x65,0x44,0xd4,0x11 + vpxor ymm1,ymm1,ymm2 + vpunpckhqdq ymm2,ymm3,ymm3 + vpxor ymm2,ymm2,ymm3 + DB 0xc4,0xe3,0x6d,0x44,0xd7,0x10 + vpxor ymm6,ymm6,ymm2 + + vbroadcasti128 ymm2,XMMWORD[((-112))+r11] + DB 0xc4,0x62,0x1d,0xdc,0xe2 + DB 0xc4,0x62,0x15,0xdc,0xea + DB 0xc4,0x62,0x0d,0xdc,0xf2 + DB 0xc4,0x62,0x05,0xdc,0xfa + + + vmovdqu ymm3,YMMWORD[64+rcx] + vpshufb ymm3,ymm3,ymm0 + vmovdqu ymm4,YMMWORD[64+rdi] + + vbroadcasti128 ymm2,XMMWORD[((-96))+r11] + DB 0xc4,0x62,0x1d,0xdc,0xe2 + DB 0xc4,0x62,0x15,0xdc,0xea + DB 0xc4,0x62,0x0d,0xdc,0xf2 + DB 0xc4,0x62,0x05,0xdc,0xfa + + DB 0xc4,0xe3,0x65,0x44,0xd4,0x00 + vpxor ymm5,ymm5,ymm2 + DB 0xc4,0xe3,0x65,0x44,0xd4,0x11 + vpxor ymm1,ymm1,ymm2 + + vbroadcasti128 ymm2,XMMWORD[((-80))+r11] + DB 0xc4,0x62,0x1d,0xdc,0xe2 + DB 0xc4,0x62,0x15,0xdc,0xea + DB 0xc4,0x62,0x0d,0xdc,0xf2 + DB 0xc4,0x62,0x05,0xdc,0xfa + + vpunpckhqdq ymm2,ymm3,ymm3 + vpxor ymm2,ymm2,ymm3 + DB 0xc4,0xc3,0x6d,0x44,0xd0,0x00 + vpxor ymm6,ymm6,ymm2 + + + vmovdqu ymm3,YMMWORD[96+rcx] + vpshufb ymm3,ymm3,ymm0 + + vbroadcasti128 ymm2,XMMWORD[((-64))+r11] + DB 0xc4,0x62,0x1d,0xdc,0xe2 + DB 0xc4,0x62,0x15,0xdc,0xea + DB 0xc4,0x62,0x0d,0xdc,0xf2 + DB 0xc4,0x62,0x05,0xdc,0xfa + + vmovdqu ymm4,YMMWORD[96+rdi] + DB 0xc4,0xe3,0x65,0x44,0xd4,0x00 + vpxor ymm5,ymm5,ymm2 + DB 0xc4,0xe3,0x65,0x44,0xd4,0x11 + vpxor ymm1,ymm1,ymm2 + vpunpckhqdq ymm2,ymm3,ymm3 + vpxor ymm2,ymm2,ymm3 + DB 0xc4,0xc3,0x6d,0x44,0xd0,0x10 + vpxor ymm6,ymm6,ymm2 + + vbroadcasti128 ymm2,XMMWORD[((-48))+r11] + DB 0xc4,0x62,0x1d,0xdc,0xe2 + DB 0xc4,0x62,0x15,0xdc,0xea + DB 0xc4,0x62,0x0d,0xdc,0xf2 + DB 0xc4,0x62,0x05,0xdc,0xfa + + + vpxor ymm6,ymm6,ymm5 + vpxor ymm6,ymm6,ymm1 + + + vbroadcasti128 
ymm4,XMMWORD[$L$gfpoly] + DB 0xc4,0xe3,0x5d,0x44,0xd5,0x01 + vpshufd ymm5,ymm5,0x4e + vpxor ymm6,ymm6,ymm5 + vpxor ymm6,ymm6,ymm2 + + vbroadcasti128 ymm2,XMMWORD[((-32))+r11] + DB 0xc4,0x62,0x1d,0xdc,0xe2 + DB 0xc4,0x62,0x15,0xdc,0xea + DB 0xc4,0x62,0x0d,0xdc,0xf2 + DB 0xc4,0x62,0x05,0xdc,0xfa + + + DB 0xc4,0xe3,0x5d,0x44,0xd6,0x01 + vpshufd ymm6,ymm6,0x4e + vpxor ymm1,ymm1,ymm6 + vpxor ymm1,ymm1,ymm2 + + vbroadcasti128 ymm2,XMMWORD[((-16))+r11] + DB 0xc4,0x62,0x1d,0xdc,0xe2 + DB 0xc4,0x62,0x15,0xdc,0xea + DB 0xc4,0x62,0x0d,0xdc,0xf2 + DB 0xc4,0x62,0x05,0xdc,0xfa + + vextracti128 xmm2,ymm1,1 + vpxor xmm1,xmm1,xmm2 + + + + vpxor ymm2,ymm10,YMMWORD[rcx] + vpxor ymm3,ymm10,YMMWORD[32+rcx] + vpxor ymm5,ymm10,YMMWORD[64+rcx] + vpxor ymm6,ymm10,YMMWORD[96+rcx] + DB 0xc4,0x62,0x1d,0xdd,0xe2 + DB 0xc4,0x62,0x15,0xdd,0xeb + DB 0xc4,0x62,0x0d,0xdd,0xf5 + DB 0xc4,0x62,0x05,0xdd,0xfe + vmovdqu YMMWORD[rdx],ymm12 + vmovdqu YMMWORD[32+rdx],ymm13 + vmovdqu YMMWORD[64+rdx],ymm14 + vmovdqu YMMWORD[96+rdx],ymm15 + + sub rcx,-128 + sub rdx,-128 + add r8,-128 + cmp r8,127 + ja NEAR $L$crypt_loop_4x__func2 +$L$crypt_loop_4x_done__func2: + + test r8,r8 + jz NEAR $L$done__func2 + + + + + + lea rsi,[128+rdi] + sub rsi,r8 + + + vpxor xmm5,xmm5,xmm5 + vpxor xmm6,xmm6,xmm6 + vpxor xmm7,xmm7,xmm7 + + cmp r8,64 + jb NEAR $L$lessthan64bytes__func2 + + + vpshufb ymm12,ymm11,ymm0 + vpaddd ymm11,ymm11,YMMWORD[$L$inc_2blocks] + vpshufb ymm13,ymm11,ymm0 + vpaddd ymm11,ymm11,YMMWORD[$L$inc_2blocks] + vpxor ymm12,ymm12,ymm9 + vpxor ymm13,ymm13,ymm9 + lea rax,[16+r9] +$L$vaesenc_loop_tail_1__func2: + vbroadcasti128 ymm2,XMMWORD[rax] + DB 0xc4,0x62,0x1d,0xdc,0xe2 + DB 0xc4,0x62,0x15,0xdc,0xea + add rax,16 + cmp r11,rax + jne NEAR $L$vaesenc_loop_tail_1__func2 + DB 0xc4,0x42,0x1d,0xdd,0xe2 + DB 0xc4,0x42,0x15,0xdd,0xea + + + vmovdqu ymm2,YMMWORD[rcx] + vmovdqu ymm3,YMMWORD[32+rcx] + vpxor ymm12,ymm12,ymm2 + vpxor ymm13,ymm13,ymm3 + vmovdqu YMMWORD[rdx],ymm12 + vmovdqu YMMWORD[32+rdx],ymm13 + + + vpshufb ymm12,ymm2,ymm0 + vpshufb ymm13,ymm3,ymm0 + vpxor ymm12,ymm12,ymm1 + vmovdqu ymm2,YMMWORD[rsi] + vmovdqu ymm3,YMMWORD[32+rsi] + DB 0xc4,0xe3,0x1d,0x44,0xea,0x00 + DB 0xc4,0xe3,0x1d,0x44,0xf2,0x01 + DB 0xc4,0xe3,0x1d,0x44,0xe2,0x10 + vpxor ymm6,ymm6,ymm4 + DB 0xc4,0xe3,0x1d,0x44,0xfa,0x11 + DB 0xc4,0xe3,0x15,0x44,0xe3,0x00 + vpxor ymm5,ymm5,ymm4 + DB 0xc4,0xe3,0x15,0x44,0xe3,0x01 + vpxor ymm6,ymm6,ymm4 + DB 0xc4,0xe3,0x15,0x44,0xe3,0x10 + vpxor ymm6,ymm6,ymm4 + DB 0xc4,0xe3,0x15,0x44,0xe3,0x11 + vpxor ymm7,ymm7,ymm4 + + add rsi,64 + add rcx,64 + add rdx,64 + sub r8,64 + jz NEAR $L$reduce__func2 + + vpxor xmm1,xmm1,xmm1 + + +$L$lessthan64bytes__func2: + vpshufb ymm12,ymm11,ymm0 + vpaddd ymm11,ymm11,YMMWORD[$L$inc_2blocks] + vpshufb ymm13,ymm11,ymm0 + vpxor ymm12,ymm12,ymm9 + vpxor ymm13,ymm13,ymm9 + lea rax,[16+r9] +$L$vaesenc_loop_tail_2__func2: + vbroadcasti128 ymm2,XMMWORD[rax] + DB 0xc4,0x62,0x1d,0xdc,0xe2 + DB 0xc4,0x62,0x15,0xdc,0xea + add rax,16 + cmp r11,rax + jne NEAR $L$vaesenc_loop_tail_2__func2 + DB 0xc4,0x42,0x1d,0xdd,0xe2 + DB 0xc4,0x42,0x15,0xdd,0xea + + + + + cmp r8,32 + jb NEAR $L$xor_one_block__func2 + je NEAR $L$xor_two_blocks__func2 + +$L$xor_three_blocks__func2: + vmovdqu ymm2,YMMWORD[rcx] + vmovdqu xmm3,XMMWORD[32+rcx] + vpxor ymm12,ymm12,ymm2 + vpxor xmm13,xmm13,xmm3 + vmovdqu YMMWORD[rdx],ymm12 + vmovdqu XMMWORD[32+rdx],xmm13 + + vpshufb ymm12,ymm2,ymm0 + vpshufb xmm13,xmm3,xmm0 + vpxor ymm12,ymm12,ymm1 + vmovdqu ymm2,YMMWORD[rsi] + vmovdqu xmm3,XMMWORD[32+rsi] + vpclmulqdq xmm4,xmm13,xmm3,0x00 + vpxor 
ymm5,ymm5,ymm4 + vpclmulqdq xmm4,xmm13,xmm3,0x01 + vpxor ymm6,ymm6,ymm4 + vpclmulqdq xmm4,xmm13,xmm3,0x10 + vpxor ymm6,ymm6,ymm4 + vpclmulqdq xmm4,xmm13,xmm3,0x11 + vpxor ymm7,ymm7,ymm4 + jmp NEAR $L$ghash_mul_one_vec_unreduced__func2 + +$L$xor_two_blocks__func2: + vmovdqu ymm2,YMMWORD[rcx] + vpxor ymm12,ymm12,ymm2 + vmovdqu YMMWORD[rdx],ymm12 + vpshufb ymm12,ymm2,ymm0 + vpxor ymm12,ymm12,ymm1 + vmovdqu ymm2,YMMWORD[rsi] + jmp NEAR $L$ghash_mul_one_vec_unreduced__func2 + +$L$xor_one_block__func2: + vmovdqu xmm2,XMMWORD[rcx] + vpxor xmm12,xmm12,xmm2 + vmovdqu XMMWORD[rdx],xmm12 + vpshufb xmm12,xmm2,xmm0 + vpxor xmm12,xmm12,xmm1 + vmovdqu xmm2,XMMWORD[rsi] + +$L$ghash_mul_one_vec_unreduced__func2: + DB 0xc4,0xe3,0x1d,0x44,0xe2,0x00 + vpxor ymm5,ymm5,ymm4 + DB 0xc4,0xe3,0x1d,0x44,0xe2,0x01 + vpxor ymm6,ymm6,ymm4 + DB 0xc4,0xe3,0x1d,0x44,0xe2,0x10 + vpxor ymm6,ymm6,ymm4 + DB 0xc4,0xe3,0x1d,0x44,0xe2,0x11 + vpxor ymm7,ymm7,ymm4 + +$L$reduce__func2: + + vbroadcasti128 ymm2,XMMWORD[$L$gfpoly] + DB 0xc4,0xe3,0x6d,0x44,0xdd,0x01 + vpshufd ymm5,ymm5,0x4e + vpxor ymm6,ymm6,ymm5 + vpxor ymm6,ymm6,ymm3 + DB 0xc4,0xe3,0x6d,0x44,0xde,0x01 + vpshufd ymm6,ymm6,0x4e + vpxor ymm7,ymm7,ymm6 + vpxor ymm7,ymm7,ymm3 + vextracti128 xmm1,ymm7,1 + vpxor xmm1,xmm1,xmm7 + +$L$done__func2: + + vpshufb xmm1,xmm1,xmm0 + vmovdqu XMMWORD[r12],xmm1 + + vzeroupper + movdqa xmm6,XMMWORD[rsp] + movdqa xmm7,XMMWORD[16+rsp] + movdqa xmm8,XMMWORD[32+rsp] + movdqa xmm9,XMMWORD[48+rsp] + movdqa xmm10,XMMWORD[64+rsp] + movdqa xmm11,XMMWORD[80+rsp] + movdqa xmm12,XMMWORD[96+rsp] + movdqa xmm13,XMMWORD[112+rsp] + movdqa xmm14,XMMWORD[128+rsp] + movdqa xmm15,XMMWORD[144+rsp] + add rsp,160 + pop r12 + pop rdi + pop rsi + ret +$L$SEH_end_aes_gcm_dec_update_vaes_avx2_17: + + +section .pdata rdata align=4 +ALIGN 4 + DD $L$SEH_begin_gcm_init_vpclmulqdq_avx2_1 wrt ..imagebase + DD $L$SEH_end_gcm_init_vpclmulqdq_avx2_5 wrt ..imagebase + DD $L$SEH_info_gcm_init_vpclmulqdq_avx2_0 wrt ..imagebase + + DD $L$SEH_begin_gcm_ghash_vpclmulqdq_avx2_1_1 wrt ..imagebase + DD $L$SEH_end_gcm_ghash_vpclmulqdq_avx2_1_8 wrt ..imagebase + DD $L$SEH_info_gcm_ghash_vpclmulqdq_avx2_1_0 wrt ..imagebase + + DD $L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1 wrt ..imagebase + DD $L$SEH_end_aes_gcm_enc_update_vaes_avx2_17 wrt ..imagebase + DD $L$SEH_info_aes_gcm_enc_update_vaes_avx2_0 wrt ..imagebase + + DD $L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1 wrt ..imagebase + DD $L$SEH_end_aes_gcm_dec_update_vaes_avx2_17 wrt ..imagebase + DD $L$SEH_info_aes_gcm_dec_update_vaes_avx2_0 wrt ..imagebase + + +section .xdata rdata align=8 +ALIGN 4 +$L$SEH_info_gcm_init_vpclmulqdq_avx2_0: + DB 1 + DB $L$SEH_endprologue_gcm_init_vpclmulqdq_avx2_4-$L$SEH_begin_gcm_init_vpclmulqdq_avx2_1 + DB 3 + DB 0 + DB $L$SEH_prologue_gcm_init_vpclmulqdq_avx2_3-$L$SEH_begin_gcm_init_vpclmulqdq_avx2_1 + DB 104 + DW 0 + DB $L$SEH_prologue_gcm_init_vpclmulqdq_avx2_2-$L$SEH_begin_gcm_init_vpclmulqdq_avx2_1 + DB 34 + + DW 0 +$L$SEH_info_gcm_ghash_vpclmulqdq_avx2_1_0: + DB 1 + DB $L$SEH_endprologue_gcm_ghash_vpclmulqdq_avx2_1_7-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx2_1_1 + DB 9 + DB 0 + DB $L$SEH_prologue_gcm_ghash_vpclmulqdq_avx2_1_6-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx2_1_1 + DB 152 + DW 3 + DB $L$SEH_prologue_gcm_ghash_vpclmulqdq_avx2_1_5-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx2_1_1 + DB 136 + DW 2 + DB $L$SEH_prologue_gcm_ghash_vpclmulqdq_avx2_1_4-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx2_1_1 + DB 120 + DW 1 + DB $L$SEH_prologue_gcm_ghash_vpclmulqdq_avx2_1_3-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx2_1_1 + DB 
104 + DW 0 + DB $L$SEH_prologue_gcm_ghash_vpclmulqdq_avx2_1_2-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx2_1_1 + DB 130 + + DW 0 +$L$SEH_info_aes_gcm_enc_update_vaes_avx2_0: + DB 1 + DB $L$SEH_endprologue_aes_gcm_enc_update_vaes_avx2_16-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1 + DB 25 + DB 0 + DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_15-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1 + DB 248 + DW 9 + DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_14-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1 + DB 232 + DW 8 + DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_13-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1 + DB 216 + DW 7 + DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_12-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1 + DB 200 + DW 6 + DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_11-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1 + DB 184 + DW 5 + DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_10-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1 + DB 168 + DW 4 + DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_9-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1 + DB 152 + DW 3 + DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_8-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1 + DB 136 + DW 2 + DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_7-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1 + DB 120 + DW 1 + DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_6-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1 + DB 104 + DW 0 + DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_5-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1 + DB 1 + DW 20 + DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_4-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1 + DB 192 + DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_3-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1 + DB 112 + DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_2-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1 + DB 96 + + DW 0 +$L$SEH_info_aes_gcm_dec_update_vaes_avx2_0: + DB 1 + DB $L$SEH_endprologue_aes_gcm_dec_update_vaes_avx2_16-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1 + DB 25 + DB 0 + DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_15-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1 + DB 248 + DW 9 + DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_14-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1 + DB 232 + DW 8 + DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_13-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1 + DB 216 + DW 7 + DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_12-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1 + DB 200 + DW 6 + DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_11-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1 + DB 184 + DW 5 + DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_10-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1 + DB 168 + DW 4 + DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_9-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1 + DB 152 + DW 3 + DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_8-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1 + DB 136 + DW 2 + DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_7-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1 + DB 120 + DW 1 + DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_6-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1 + DB 104 + DW 0 + DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_5-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1 + DB 1 + DW 20 + DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_4-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1 + DB 192 + DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_3-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1 + DB 112 + DB 
$L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_2-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1 + DB 96 + + DW 0 +%else +; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 +ret +%endif diff --git a/ring-0.17.14/pregenerated/aes-gcm-avx2-x86_64-nasm.o b/ring-0.17.14/pregenerated/aes-gcm-avx2-x86_64-nasm.o new file mode 100644 index 0000000000..89ecad4569 Binary files /dev/null and b/ring-0.17.14/pregenerated/aes-gcm-avx2-x86_64-nasm.o differ diff --git a/ring-0.17.14/pregenerated/aesni-gcm-x86_64-elf.S b/ring-0.17.14/pregenerated/aesni-gcm-x86_64-elf.S new file mode 100644 index 0000000000..c547df6367 --- /dev/null +++ b/ring-0.17.14/pregenerated/aesni-gcm-x86_64-elf.S @@ -0,0 +1,883 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__) +.text + +.type _aesni_ctr32_ghash_6x,@function +.align 32 +_aesni_ctr32_ghash_6x: +.cfi_startproc + vmovdqu 32(%r11),%xmm2 + subq $6,%rdx + vpxor %xmm4,%xmm4,%xmm4 + vmovdqu 0-128(%rcx),%xmm15 + vpaddb %xmm2,%xmm1,%xmm10 + vpaddb %xmm2,%xmm10,%xmm11 + vpaddb %xmm2,%xmm11,%xmm12 + vpaddb %xmm2,%xmm12,%xmm13 + vpaddb %xmm2,%xmm13,%xmm14 + vpxor %xmm15,%xmm1,%xmm9 + vmovdqu %xmm4,16+8(%rsp) + jmp .Loop6x + +.align 32 +.Loop6x: + addl $100663296,%ebx + jc .Lhandle_ctr32 + vmovdqu 0-32(%r9),%xmm3 + vpaddb %xmm2,%xmm14,%xmm1 + vpxor %xmm15,%xmm10,%xmm10 + vpxor %xmm15,%xmm11,%xmm11 + +.Lresume_ctr32: + vmovdqu %xmm1,(%r8) + vpclmulqdq $0x10,%xmm3,%xmm7,%xmm5 + vpxor %xmm15,%xmm12,%xmm12 + vmovups 16-128(%rcx),%xmm2 + vpclmulqdq $0x01,%xmm3,%xmm7,%xmm6 + + + + + + + + + + + + + + + + + + xorq %r12,%r12 + cmpq %r14,%r15 + + vaesenc %xmm2,%xmm9,%xmm9 + vmovdqu 48+8(%rsp),%xmm0 + vpxor %xmm15,%xmm13,%xmm13 + vpclmulqdq $0x00,%xmm3,%xmm7,%xmm1 + vaesenc %xmm2,%xmm10,%xmm10 + vpxor %xmm15,%xmm14,%xmm14 + setnc %r12b + vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7 + vaesenc %xmm2,%xmm11,%xmm11 + vmovdqu 16-32(%r9),%xmm3 + negq %r12 + vaesenc %xmm2,%xmm12,%xmm12 + vpxor %xmm5,%xmm6,%xmm6 + vpclmulqdq $0x00,%xmm3,%xmm0,%xmm5 + vpxor %xmm4,%xmm8,%xmm8 + vaesenc %xmm2,%xmm13,%xmm13 + vpxor %xmm5,%xmm1,%xmm4 + andq $0x60,%r12 + vmovups 32-128(%rcx),%xmm15 + vpclmulqdq $0x10,%xmm3,%xmm0,%xmm1 + vaesenc %xmm2,%xmm14,%xmm14 + + vpclmulqdq $0x01,%xmm3,%xmm0,%xmm2 + leaq (%r14,%r12,1),%r14 + vaesenc %xmm15,%xmm9,%xmm9 + vpxor 16+8(%rsp),%xmm8,%xmm8 + vpclmulqdq $0x11,%xmm3,%xmm0,%xmm3 + vmovdqu 64+8(%rsp),%xmm0 + vaesenc %xmm15,%xmm10,%xmm10 + movbeq 88(%r14),%r13 + vaesenc %xmm15,%xmm11,%xmm11 + movbeq 80(%r14),%r12 + vaesenc %xmm15,%xmm12,%xmm12 + movq %r13,32+8(%rsp) + vaesenc %xmm15,%xmm13,%xmm13 + movq %r12,40+8(%rsp) + vmovdqu 48-32(%r9),%xmm5 + vaesenc %xmm15,%xmm14,%xmm14 + + vmovups 48-128(%rcx),%xmm15 + vpxor %xmm1,%xmm6,%xmm6 + vpclmulqdq $0x00,%xmm5,%xmm0,%xmm1 + vaesenc %xmm15,%xmm9,%xmm9 + vpxor %xmm2,%xmm6,%xmm6 + vpclmulqdq $0x10,%xmm5,%xmm0,%xmm2 + vaesenc %xmm15,%xmm10,%xmm10 + vpxor %xmm3,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm5,%xmm0,%xmm3 + vaesenc %xmm15,%xmm11,%xmm11 + vpclmulqdq $0x11,%xmm5,%xmm0,%xmm5 + vmovdqu 80+8(%rsp),%xmm0 + vaesenc %xmm15,%xmm12,%xmm12 + vaesenc %xmm15,%xmm13,%xmm13 + vpxor %xmm1,%xmm4,%xmm4 + vmovdqu 64-32(%r9),%xmm1 + vaesenc %xmm15,%xmm14,%xmm14 + + vmovups 64-128(%rcx),%xmm15 + vpxor %xmm2,%xmm6,%xmm6 + vpclmulqdq $0x00,%xmm1,%xmm0,%xmm2 + vaesenc %xmm15,%xmm9,%xmm9 + vpxor %xmm3,%xmm6,%xmm6 + vpclmulqdq $0x10,%xmm1,%xmm0,%xmm3 + vaesenc %xmm15,%xmm10,%xmm10 + movbeq 72(%r14),%r13 + 
vpxor %xmm5,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm1,%xmm0,%xmm5 + vaesenc %xmm15,%xmm11,%xmm11 + movbeq 64(%r14),%r12 + vpclmulqdq $0x11,%xmm1,%xmm0,%xmm1 + vmovdqu 96+8(%rsp),%xmm0 + vaesenc %xmm15,%xmm12,%xmm12 + movq %r13,48+8(%rsp) + vaesenc %xmm15,%xmm13,%xmm13 + movq %r12,56+8(%rsp) + vpxor %xmm2,%xmm4,%xmm4 + vmovdqu 96-32(%r9),%xmm2 + vaesenc %xmm15,%xmm14,%xmm14 + + vmovups 80-128(%rcx),%xmm15 + vpxor %xmm3,%xmm6,%xmm6 + vpclmulqdq $0x00,%xmm2,%xmm0,%xmm3 + vaesenc %xmm15,%xmm9,%xmm9 + vpxor %xmm5,%xmm6,%xmm6 + vpclmulqdq $0x10,%xmm2,%xmm0,%xmm5 + vaesenc %xmm15,%xmm10,%xmm10 + movbeq 56(%r14),%r13 + vpxor %xmm1,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm2,%xmm0,%xmm1 + vpxor 112+8(%rsp),%xmm8,%xmm8 + vaesenc %xmm15,%xmm11,%xmm11 + movbeq 48(%r14),%r12 + vpclmulqdq $0x11,%xmm2,%xmm0,%xmm2 + vaesenc %xmm15,%xmm12,%xmm12 + movq %r13,64+8(%rsp) + vaesenc %xmm15,%xmm13,%xmm13 + movq %r12,72+8(%rsp) + vpxor %xmm3,%xmm4,%xmm4 + vmovdqu 112-32(%r9),%xmm3 + vaesenc %xmm15,%xmm14,%xmm14 + + vmovups 96-128(%rcx),%xmm15 + vpxor %xmm5,%xmm6,%xmm6 + vpclmulqdq $0x10,%xmm3,%xmm8,%xmm5 + vaesenc %xmm15,%xmm9,%xmm9 + vpxor %xmm1,%xmm6,%xmm6 + vpclmulqdq $0x01,%xmm3,%xmm8,%xmm1 + vaesenc %xmm15,%xmm10,%xmm10 + movbeq 40(%r14),%r13 + vpxor %xmm2,%xmm7,%xmm7 + vpclmulqdq $0x00,%xmm3,%xmm8,%xmm2 + vaesenc %xmm15,%xmm11,%xmm11 + movbeq 32(%r14),%r12 + vpclmulqdq $0x11,%xmm3,%xmm8,%xmm8 + vaesenc %xmm15,%xmm12,%xmm12 + movq %r13,80+8(%rsp) + vaesenc %xmm15,%xmm13,%xmm13 + movq %r12,88+8(%rsp) + vpxor %xmm5,%xmm6,%xmm6 + vaesenc %xmm15,%xmm14,%xmm14 + vpxor %xmm1,%xmm6,%xmm6 + + vmovups 112-128(%rcx),%xmm15 + vpslldq $8,%xmm6,%xmm5 + vpxor %xmm2,%xmm4,%xmm4 + vmovdqu 16(%r11),%xmm3 + + vaesenc %xmm15,%xmm9,%xmm9 + vpxor %xmm8,%xmm7,%xmm7 + vaesenc %xmm15,%xmm10,%xmm10 + vpxor %xmm5,%xmm4,%xmm4 + movbeq 24(%r14),%r13 + vaesenc %xmm15,%xmm11,%xmm11 + movbeq 16(%r14),%r12 + vpalignr $8,%xmm4,%xmm4,%xmm0 + vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4 + movq %r13,96+8(%rsp) + vaesenc %xmm15,%xmm12,%xmm12 + movq %r12,104+8(%rsp) + vaesenc %xmm15,%xmm13,%xmm13 + vmovups 128-128(%rcx),%xmm1 + vaesenc %xmm15,%xmm14,%xmm14 + + vaesenc %xmm1,%xmm9,%xmm9 + vmovups 144-128(%rcx),%xmm15 + vaesenc %xmm1,%xmm10,%xmm10 + vpsrldq $8,%xmm6,%xmm6 + vaesenc %xmm1,%xmm11,%xmm11 + vpxor %xmm6,%xmm7,%xmm7 + vaesenc %xmm1,%xmm12,%xmm12 + vpxor %xmm0,%xmm4,%xmm4 + movbeq 8(%r14),%r13 + vaesenc %xmm1,%xmm13,%xmm13 + movbeq 0(%r14),%r12 + vaesenc %xmm1,%xmm14,%xmm14 + vmovups 160-128(%rcx),%xmm1 + cmpl $11,%r10d + jb .Lenc_tail + + vaesenc %xmm15,%xmm9,%xmm9 + vaesenc %xmm15,%xmm10,%xmm10 + vaesenc %xmm15,%xmm11,%xmm11 + vaesenc %xmm15,%xmm12,%xmm12 + vaesenc %xmm15,%xmm13,%xmm13 + vaesenc %xmm15,%xmm14,%xmm14 + + vaesenc %xmm1,%xmm9,%xmm9 + vaesenc %xmm1,%xmm10,%xmm10 + vaesenc %xmm1,%xmm11,%xmm11 + vaesenc %xmm1,%xmm12,%xmm12 + vaesenc %xmm1,%xmm13,%xmm13 + vmovups 176-128(%rcx),%xmm15 + vaesenc %xmm1,%xmm14,%xmm14 + vmovups 192-128(%rcx),%xmm1 + + + vaesenc %xmm15,%xmm9,%xmm9 + vaesenc %xmm15,%xmm10,%xmm10 + vaesenc %xmm15,%xmm11,%xmm11 + vaesenc %xmm15,%xmm12,%xmm12 + vaesenc %xmm15,%xmm13,%xmm13 + vaesenc %xmm15,%xmm14,%xmm14 + + vaesenc %xmm1,%xmm9,%xmm9 + vaesenc %xmm1,%xmm10,%xmm10 + vaesenc %xmm1,%xmm11,%xmm11 + vaesenc %xmm1,%xmm12,%xmm12 + vaesenc %xmm1,%xmm13,%xmm13 + vmovups 208-128(%rcx),%xmm15 + vaesenc %xmm1,%xmm14,%xmm14 + vmovups 224-128(%rcx),%xmm1 + jmp .Lenc_tail + +.align 32 +.Lhandle_ctr32: + vmovdqu (%r11),%xmm0 + vpshufb %xmm0,%xmm1,%xmm6 + vmovdqu 48(%r11),%xmm5 + vpaddd 64(%r11),%xmm6,%xmm10 + vpaddd %xmm5,%xmm6,%xmm11 + 
vmovdqu 0-32(%r9),%xmm3 + vpaddd %xmm5,%xmm10,%xmm12 + vpshufb %xmm0,%xmm10,%xmm10 + vpaddd %xmm5,%xmm11,%xmm13 + vpshufb %xmm0,%xmm11,%xmm11 + vpxor %xmm15,%xmm10,%xmm10 + vpaddd %xmm5,%xmm12,%xmm14 + vpshufb %xmm0,%xmm12,%xmm12 + vpxor %xmm15,%xmm11,%xmm11 + vpaddd %xmm5,%xmm13,%xmm1 + vpshufb %xmm0,%xmm13,%xmm13 + vpshufb %xmm0,%xmm14,%xmm14 + vpshufb %xmm0,%xmm1,%xmm1 + jmp .Lresume_ctr32 + +.align 32 +.Lenc_tail: + vaesenc %xmm15,%xmm9,%xmm9 + vmovdqu %xmm7,16+8(%rsp) + vpalignr $8,%xmm4,%xmm4,%xmm8 + vaesenc %xmm15,%xmm10,%xmm10 + vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4 + vpxor 0(%rdi),%xmm1,%xmm2 + vaesenc %xmm15,%xmm11,%xmm11 + vpxor 16(%rdi),%xmm1,%xmm0 + vaesenc %xmm15,%xmm12,%xmm12 + vpxor 32(%rdi),%xmm1,%xmm5 + vaesenc %xmm15,%xmm13,%xmm13 + vpxor 48(%rdi),%xmm1,%xmm6 + vaesenc %xmm15,%xmm14,%xmm14 + vpxor 64(%rdi),%xmm1,%xmm7 + vpxor 80(%rdi),%xmm1,%xmm3 + vmovdqu (%r8),%xmm1 + + vaesenclast %xmm2,%xmm9,%xmm9 + vmovdqu 32(%r11),%xmm2 + vaesenclast %xmm0,%xmm10,%xmm10 + vpaddb %xmm2,%xmm1,%xmm0 + movq %r13,112+8(%rsp) + leaq 96(%rdi),%rdi + + prefetcht0 512(%rdi) + prefetcht0 576(%rdi) + vaesenclast %xmm5,%xmm11,%xmm11 + vpaddb %xmm2,%xmm0,%xmm5 + movq %r12,120+8(%rsp) + leaq 96(%rsi),%rsi + vmovdqu 0-128(%rcx),%xmm15 + vaesenclast %xmm6,%xmm12,%xmm12 + vpaddb %xmm2,%xmm5,%xmm6 + vaesenclast %xmm7,%xmm13,%xmm13 + vpaddb %xmm2,%xmm6,%xmm7 + vaesenclast %xmm3,%xmm14,%xmm14 + vpaddb %xmm2,%xmm7,%xmm3 + + addq $0x60,%rax + subq $0x6,%rdx + jc .L6x_done + + vmovups %xmm9,-96(%rsi) + vpxor %xmm15,%xmm1,%xmm9 + vmovups %xmm10,-80(%rsi) + vmovdqa %xmm0,%xmm10 + vmovups %xmm11,-64(%rsi) + vmovdqa %xmm5,%xmm11 + vmovups %xmm12,-48(%rsi) + vmovdqa %xmm6,%xmm12 + vmovups %xmm13,-32(%rsi) + vmovdqa %xmm7,%xmm13 + vmovups %xmm14,-16(%rsi) + vmovdqa %xmm3,%xmm14 + vmovdqu 32+8(%rsp),%xmm7 + jmp .Loop6x + +.L6x_done: + vpxor 16+8(%rsp),%xmm8,%xmm8 + vpxor %xmm4,%xmm8,%xmm8 + + ret +.cfi_endproc +.size _aesni_ctr32_ghash_6x,.-_aesni_ctr32_ghash_6x +.globl aesni_gcm_decrypt +.hidden aesni_gcm_decrypt +.type aesni_gcm_decrypt,@function +.align 32 +aesni_gcm_decrypt: +.cfi_startproc + +_CET_ENDBR + xorq %rax,%rax + + + + cmpq $0x60,%rdx + jb .Lgcm_dec_abort + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + + movq %rsp,%rbp +.cfi_def_cfa_register %rbp + pushq %rbx +.cfi_offset %rbx,-24 + + pushq %r12 +.cfi_offset %r12,-32 + + pushq %r13 +.cfi_offset %r13,-40 + + pushq %r14 +.cfi_offset %r14,-48 + + pushq %r15 +.cfi_offset %r15,-56 + + vzeroupper + + movq 16(%rbp),%r12 + vmovdqu (%r8),%xmm1 + addq $-128,%rsp + movl 12(%r8),%ebx + leaq .Lbswap_mask(%rip),%r11 + leaq -128(%rcx),%r14 + movq $0xf80,%r15 + vmovdqu (%r12),%xmm8 + andq $-128,%rsp + vmovdqu (%r11),%xmm0 + leaq 128(%rcx),%rcx + leaq 32(%r9),%r9 + movl 240-128(%rcx),%r10d + vpshufb %xmm0,%xmm8,%xmm8 + + andq %r15,%r14 + andq %rsp,%r15 + subq %r14,%r15 + jc .Ldec_no_key_aliasing + cmpq $768,%r15 + jnc .Ldec_no_key_aliasing + subq %r15,%rsp +.Ldec_no_key_aliasing: + + vmovdqu 80(%rdi),%xmm7 + movq %rdi,%r14 + vmovdqu 64(%rdi),%xmm4 + + + + + + + + leaq -192(%rdi,%rdx,1),%r15 + + vmovdqu 48(%rdi),%xmm5 + shrq $4,%rdx + xorq %rax,%rax + vmovdqu 32(%rdi),%xmm6 + vpshufb %xmm0,%xmm7,%xmm7 + vmovdqu 16(%rdi),%xmm2 + vpshufb %xmm0,%xmm4,%xmm4 + vmovdqu (%rdi),%xmm3 + vpshufb %xmm0,%xmm5,%xmm5 + vmovdqu %xmm4,48(%rsp) + vpshufb %xmm0,%xmm6,%xmm6 + vmovdqu %xmm5,64(%rsp) + vpshufb %xmm0,%xmm2,%xmm2 + vmovdqu %xmm6,80(%rsp) + vpshufb %xmm0,%xmm3,%xmm3 + vmovdqu %xmm2,96(%rsp) + vmovdqu %xmm3,112(%rsp) + + call _aesni_ctr32_ghash_6x + + movq 
16(%rbp),%r12 + vmovups %xmm9,-96(%rsi) + vmovups %xmm10,-80(%rsi) + vmovups %xmm11,-64(%rsi) + vmovups %xmm12,-48(%rsi) + vmovups %xmm13,-32(%rsi) + vmovups %xmm14,-16(%rsi) + + vpshufb (%r11),%xmm8,%xmm8 + vmovdqu %xmm8,(%r12) + + vzeroupper + leaq -40(%rbp),%rsp +.cfi_def_cfa %rsp, 0x38 + popq %r15 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r15 + popq %r14 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r14 + popq %r13 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r13 + popq %r12 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r12 + popq %rbx +.cfi_adjust_cfa_offset -8 +.cfi_restore %rbx + popq %rbp +.cfi_adjust_cfa_offset -8 +.cfi_restore %rbp +.Lgcm_dec_abort: + ret + +.cfi_endproc +.size aesni_gcm_decrypt,.-aesni_gcm_decrypt +.type _aesni_ctr32_6x,@function +.align 32 +_aesni_ctr32_6x: +.cfi_startproc + vmovdqu 0-128(%rcx),%xmm4 + vmovdqu 32(%r11),%xmm2 + leaq -1(%r10),%r13 + vmovups 16-128(%rcx),%xmm15 + leaq 32-128(%rcx),%r12 + vpxor %xmm4,%xmm1,%xmm9 + addl $100663296,%ebx + jc .Lhandle_ctr32_2 + vpaddb %xmm2,%xmm1,%xmm10 + vpaddb %xmm2,%xmm10,%xmm11 + vpxor %xmm4,%xmm10,%xmm10 + vpaddb %xmm2,%xmm11,%xmm12 + vpxor %xmm4,%xmm11,%xmm11 + vpaddb %xmm2,%xmm12,%xmm13 + vpxor %xmm4,%xmm12,%xmm12 + vpaddb %xmm2,%xmm13,%xmm14 + vpxor %xmm4,%xmm13,%xmm13 + vpaddb %xmm2,%xmm14,%xmm1 + vpxor %xmm4,%xmm14,%xmm14 + jmp .Loop_ctr32 + +.align 16 +.Loop_ctr32: + vaesenc %xmm15,%xmm9,%xmm9 + vaesenc %xmm15,%xmm10,%xmm10 + vaesenc %xmm15,%xmm11,%xmm11 + vaesenc %xmm15,%xmm12,%xmm12 + vaesenc %xmm15,%xmm13,%xmm13 + vaesenc %xmm15,%xmm14,%xmm14 + vmovups (%r12),%xmm15 + leaq 16(%r12),%r12 + decl %r13d + jnz .Loop_ctr32 + + vmovdqu (%r12),%xmm3 + vaesenc %xmm15,%xmm9,%xmm9 + vpxor 0(%rdi),%xmm3,%xmm4 + vaesenc %xmm15,%xmm10,%xmm10 + vpxor 16(%rdi),%xmm3,%xmm5 + vaesenc %xmm15,%xmm11,%xmm11 + vpxor 32(%rdi),%xmm3,%xmm6 + vaesenc %xmm15,%xmm12,%xmm12 + vpxor 48(%rdi),%xmm3,%xmm8 + vaesenc %xmm15,%xmm13,%xmm13 + vpxor 64(%rdi),%xmm3,%xmm2 + vaesenc %xmm15,%xmm14,%xmm14 + vpxor 80(%rdi),%xmm3,%xmm3 + leaq 96(%rdi),%rdi + + vaesenclast %xmm4,%xmm9,%xmm9 + vaesenclast %xmm5,%xmm10,%xmm10 + vaesenclast %xmm6,%xmm11,%xmm11 + vaesenclast %xmm8,%xmm12,%xmm12 + vaesenclast %xmm2,%xmm13,%xmm13 + vaesenclast %xmm3,%xmm14,%xmm14 + vmovups %xmm9,0(%rsi) + vmovups %xmm10,16(%rsi) + vmovups %xmm11,32(%rsi) + vmovups %xmm12,48(%rsi) + vmovups %xmm13,64(%rsi) + vmovups %xmm14,80(%rsi) + leaq 96(%rsi),%rsi + + ret +.align 32 +.Lhandle_ctr32_2: + vpshufb %xmm0,%xmm1,%xmm6 + vmovdqu 48(%r11),%xmm5 + vpaddd 64(%r11),%xmm6,%xmm10 + vpaddd %xmm5,%xmm6,%xmm11 + vpaddd %xmm5,%xmm10,%xmm12 + vpshufb %xmm0,%xmm10,%xmm10 + vpaddd %xmm5,%xmm11,%xmm13 + vpshufb %xmm0,%xmm11,%xmm11 + vpxor %xmm4,%xmm10,%xmm10 + vpaddd %xmm5,%xmm12,%xmm14 + vpshufb %xmm0,%xmm12,%xmm12 + vpxor %xmm4,%xmm11,%xmm11 + vpaddd %xmm5,%xmm13,%xmm1 + vpshufb %xmm0,%xmm13,%xmm13 + vpxor %xmm4,%xmm12,%xmm12 + vpshufb %xmm0,%xmm14,%xmm14 + vpxor %xmm4,%xmm13,%xmm13 + vpshufb %xmm0,%xmm1,%xmm1 + vpxor %xmm4,%xmm14,%xmm14 + jmp .Loop_ctr32 +.cfi_endproc +.size _aesni_ctr32_6x,.-_aesni_ctr32_6x + +.globl aesni_gcm_encrypt +.hidden aesni_gcm_encrypt +.type aesni_gcm_encrypt,@function +.align 32 +aesni_gcm_encrypt: +.cfi_startproc + +_CET_ENDBR +#ifdef BORINGSSL_DISPATCH_TEST +.extern BORINGSSL_function_hit +.hidden BORINGSSL_function_hit + movb $1,BORINGSSL_function_hit+2(%rip) +#endif + xorq %rax,%rax + + + + + cmpq $288,%rdx + jb .Lgcm_enc_abort + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + + movq %rsp,%rbp +.cfi_def_cfa_register %rbp + pushq %rbx +.cfi_offset 
%rbx,-24 + + pushq %r12 +.cfi_offset %r12,-32 + + pushq %r13 +.cfi_offset %r13,-40 + + pushq %r14 +.cfi_offset %r14,-48 + + pushq %r15 +.cfi_offset %r15,-56 + + vzeroupper + + vmovdqu (%r8),%xmm1 + addq $-128,%rsp + movl 12(%r8),%ebx + leaq .Lbswap_mask(%rip),%r11 + leaq -128(%rcx),%r14 + movq $0xf80,%r15 + leaq 128(%rcx),%rcx + vmovdqu (%r11),%xmm0 + andq $-128,%rsp + movl 240-128(%rcx),%r10d + + andq %r15,%r14 + andq %rsp,%r15 + subq %r14,%r15 + jc .Lenc_no_key_aliasing + cmpq $768,%r15 + jnc .Lenc_no_key_aliasing + subq %r15,%rsp +.Lenc_no_key_aliasing: + + movq %rsi,%r14 + + + + + + + + + leaq -192(%rsi,%rdx,1),%r15 + + shrq $4,%rdx + + call _aesni_ctr32_6x + vpshufb %xmm0,%xmm9,%xmm8 + vpshufb %xmm0,%xmm10,%xmm2 + vmovdqu %xmm8,112(%rsp) + vpshufb %xmm0,%xmm11,%xmm4 + vmovdqu %xmm2,96(%rsp) + vpshufb %xmm0,%xmm12,%xmm5 + vmovdqu %xmm4,80(%rsp) + vpshufb %xmm0,%xmm13,%xmm6 + vmovdqu %xmm5,64(%rsp) + vpshufb %xmm0,%xmm14,%xmm7 + vmovdqu %xmm6,48(%rsp) + + call _aesni_ctr32_6x + + movq 16(%rbp),%r12 + leaq 32(%r9),%r9 + vmovdqu (%r12),%xmm8 + subq $12,%rdx + movq $192,%rax + vpshufb %xmm0,%xmm8,%xmm8 + + call _aesni_ctr32_ghash_6x + vmovdqu 32(%rsp),%xmm7 + vmovdqu (%r11),%xmm0 + vmovdqu 0-32(%r9),%xmm3 + vpunpckhqdq %xmm7,%xmm7,%xmm1 + vmovdqu 32-32(%r9),%xmm15 + vmovups %xmm9,-96(%rsi) + vpshufb %xmm0,%xmm9,%xmm9 + vpxor %xmm7,%xmm1,%xmm1 + vmovups %xmm10,-80(%rsi) + vpshufb %xmm0,%xmm10,%xmm10 + vmovups %xmm11,-64(%rsi) + vpshufb %xmm0,%xmm11,%xmm11 + vmovups %xmm12,-48(%rsi) + vpshufb %xmm0,%xmm12,%xmm12 + vmovups %xmm13,-32(%rsi) + vpshufb %xmm0,%xmm13,%xmm13 + vmovups %xmm14,-16(%rsi) + vpshufb %xmm0,%xmm14,%xmm14 + vmovdqu %xmm9,16(%rsp) + vmovdqu 48(%rsp),%xmm6 + vmovdqu 16-32(%r9),%xmm0 + vpunpckhqdq %xmm6,%xmm6,%xmm2 + vpclmulqdq $0x00,%xmm3,%xmm7,%xmm5 + vpxor %xmm6,%xmm2,%xmm2 + vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7 + vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1 + + vmovdqu 64(%rsp),%xmm9 + vpclmulqdq $0x00,%xmm0,%xmm6,%xmm4 + vmovdqu 48-32(%r9),%xmm3 + vpxor %xmm5,%xmm4,%xmm4 + vpunpckhqdq %xmm9,%xmm9,%xmm5 + vpclmulqdq $0x11,%xmm0,%xmm6,%xmm6 + vpxor %xmm9,%xmm5,%xmm5 + vpxor %xmm7,%xmm6,%xmm6 + vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2 + vmovdqu 80-32(%r9),%xmm15 + vpxor %xmm1,%xmm2,%xmm2 + + vmovdqu 80(%rsp),%xmm1 + vpclmulqdq $0x00,%xmm3,%xmm9,%xmm7 + vmovdqu 64-32(%r9),%xmm0 + vpxor %xmm4,%xmm7,%xmm7 + vpunpckhqdq %xmm1,%xmm1,%xmm4 + vpclmulqdq $0x11,%xmm3,%xmm9,%xmm9 + vpxor %xmm1,%xmm4,%xmm4 + vpxor %xmm6,%xmm9,%xmm9 + vpclmulqdq $0x00,%xmm15,%xmm5,%xmm5 + vpxor %xmm2,%xmm5,%xmm5 + + vmovdqu 96(%rsp),%xmm2 + vpclmulqdq $0x00,%xmm0,%xmm1,%xmm6 + vmovdqu 96-32(%r9),%xmm3 + vpxor %xmm7,%xmm6,%xmm6 + vpunpckhqdq %xmm2,%xmm2,%xmm7 + vpclmulqdq $0x11,%xmm0,%xmm1,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpxor %xmm9,%xmm1,%xmm1 + vpclmulqdq $0x10,%xmm15,%xmm4,%xmm4 + vmovdqu 128-32(%r9),%xmm15 + vpxor %xmm5,%xmm4,%xmm4 + + vpxor 112(%rsp),%xmm8,%xmm8 + vpclmulqdq $0x00,%xmm3,%xmm2,%xmm5 + vmovdqu 112-32(%r9),%xmm0 + vpunpckhqdq %xmm8,%xmm8,%xmm9 + vpxor %xmm6,%xmm5,%xmm5 + vpclmulqdq $0x11,%xmm3,%xmm2,%xmm2 + vpxor %xmm8,%xmm9,%xmm9 + vpxor %xmm1,%xmm2,%xmm2 + vpclmulqdq $0x00,%xmm15,%xmm7,%xmm7 + vpxor %xmm4,%xmm7,%xmm4 + + vpclmulqdq $0x00,%xmm0,%xmm8,%xmm6 + vmovdqu 0-32(%r9),%xmm3 + vpunpckhqdq %xmm14,%xmm14,%xmm1 + vpclmulqdq $0x11,%xmm0,%xmm8,%xmm8 + vpxor %xmm14,%xmm1,%xmm1 + vpxor %xmm5,%xmm6,%xmm5 + vpclmulqdq $0x10,%xmm15,%xmm9,%xmm9 + vmovdqu 32-32(%r9),%xmm15 + vpxor %xmm2,%xmm8,%xmm7 + vpxor %xmm4,%xmm9,%xmm6 + + vmovdqu 16-32(%r9),%xmm0 + vpxor %xmm5,%xmm7,%xmm9 + vpclmulqdq 
$0x00,%xmm3,%xmm14,%xmm4 + vpxor %xmm9,%xmm6,%xmm6 + vpunpckhqdq %xmm13,%xmm13,%xmm2 + vpclmulqdq $0x11,%xmm3,%xmm14,%xmm14 + vpxor %xmm13,%xmm2,%xmm2 + vpslldq $8,%xmm6,%xmm9 + vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1 + vpxor %xmm9,%xmm5,%xmm8 + vpsrldq $8,%xmm6,%xmm6 + vpxor %xmm6,%xmm7,%xmm7 + + vpclmulqdq $0x00,%xmm0,%xmm13,%xmm5 + vmovdqu 48-32(%r9),%xmm3 + vpxor %xmm4,%xmm5,%xmm5 + vpunpckhqdq %xmm12,%xmm12,%xmm9 + vpclmulqdq $0x11,%xmm0,%xmm13,%xmm13 + vpxor %xmm12,%xmm9,%xmm9 + vpxor %xmm14,%xmm13,%xmm13 + vpalignr $8,%xmm8,%xmm8,%xmm14 + vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2 + vmovdqu 80-32(%r9),%xmm15 + vpxor %xmm1,%xmm2,%xmm2 + + vpclmulqdq $0x00,%xmm3,%xmm12,%xmm4 + vmovdqu 64-32(%r9),%xmm0 + vpxor %xmm5,%xmm4,%xmm4 + vpunpckhqdq %xmm11,%xmm11,%xmm1 + vpclmulqdq $0x11,%xmm3,%xmm12,%xmm12 + vpxor %xmm11,%xmm1,%xmm1 + vpxor %xmm13,%xmm12,%xmm12 + vxorps 16(%rsp),%xmm7,%xmm7 + vpclmulqdq $0x00,%xmm15,%xmm9,%xmm9 + vpxor %xmm2,%xmm9,%xmm9 + + vpclmulqdq $0x10,16(%r11),%xmm8,%xmm8 + vxorps %xmm14,%xmm8,%xmm8 + + vpclmulqdq $0x00,%xmm0,%xmm11,%xmm5 + vmovdqu 96-32(%r9),%xmm3 + vpxor %xmm4,%xmm5,%xmm5 + vpunpckhqdq %xmm10,%xmm10,%xmm2 + vpclmulqdq $0x11,%xmm0,%xmm11,%xmm11 + vpxor %xmm10,%xmm2,%xmm2 + vpalignr $8,%xmm8,%xmm8,%xmm14 + vpxor %xmm12,%xmm11,%xmm11 + vpclmulqdq $0x10,%xmm15,%xmm1,%xmm1 + vmovdqu 128-32(%r9),%xmm15 + vpxor %xmm9,%xmm1,%xmm1 + + vxorps %xmm7,%xmm14,%xmm14 + vpclmulqdq $0x10,16(%r11),%xmm8,%xmm8 + vxorps %xmm14,%xmm8,%xmm8 + + vpclmulqdq $0x00,%xmm3,%xmm10,%xmm4 + vmovdqu 112-32(%r9),%xmm0 + vpxor %xmm5,%xmm4,%xmm4 + vpunpckhqdq %xmm8,%xmm8,%xmm9 + vpclmulqdq $0x11,%xmm3,%xmm10,%xmm10 + vpxor %xmm8,%xmm9,%xmm9 + vpxor %xmm11,%xmm10,%xmm10 + vpclmulqdq $0x00,%xmm15,%xmm2,%xmm2 + vpxor %xmm1,%xmm2,%xmm2 + + vpclmulqdq $0x00,%xmm0,%xmm8,%xmm5 + vpclmulqdq $0x11,%xmm0,%xmm8,%xmm7 + vpxor %xmm4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm15,%xmm9,%xmm6 + vpxor %xmm10,%xmm7,%xmm7 + vpxor %xmm2,%xmm6,%xmm6 + + vpxor %xmm5,%xmm7,%xmm4 + vpxor %xmm4,%xmm6,%xmm6 + vpslldq $8,%xmm6,%xmm1 + vmovdqu 16(%r11),%xmm3 + vpsrldq $8,%xmm6,%xmm6 + vpxor %xmm1,%xmm5,%xmm8 + vpxor %xmm6,%xmm7,%xmm7 + + vpalignr $8,%xmm8,%xmm8,%xmm2 + vpclmulqdq $0x10,%xmm3,%xmm8,%xmm8 + vpxor %xmm2,%xmm8,%xmm8 + + vpalignr $8,%xmm8,%xmm8,%xmm2 + vpclmulqdq $0x10,%xmm3,%xmm8,%xmm8 + vpxor %xmm7,%xmm2,%xmm2 + vpxor %xmm2,%xmm8,%xmm8 + movq 16(%rbp),%r12 + vpshufb (%r11),%xmm8,%xmm8 + vmovdqu %xmm8,(%r12) + + vzeroupper + leaq -40(%rbp),%rsp +.cfi_def_cfa %rsp, 0x38 + popq %r15 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r15 + popq %r14 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r14 + popq %r13 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r13 + popq %r12 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r12 + popq %rbx +.cfi_adjust_cfa_offset -8 +.cfi_restore %rbx + popq %rbp +.cfi_adjust_cfa_offset -8 +.cfi_restore %rbp +.Lgcm_enc_abort: + ret + +.cfi_endproc +.size aesni_gcm_encrypt,.-aesni_gcm_encrypt +.section .rodata +.align 64 +.Lbswap_mask: +.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 +.Lpoly: +.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 +.Lone_msb: +.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 +.Ltwo_lsb: +.byte 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +.Lone_lsb: +.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +.byte 65,69,83,45,78,73,32,71,67,77,32,109,111,100,117,108,101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 64 +.text +#endif diff --git a/ring-0.17.14/pregenerated/aesni-gcm-x86_64-macosx.S 
b/ring-0.17.14/pregenerated/aesni-gcm-x86_64-macosx.S new file mode 100644 index 0000000000..b9a70cfc8c --- /dev/null +++ b/ring-0.17.14/pregenerated/aesni-gcm-x86_64-macosx.S @@ -0,0 +1,868 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__) +.text + + +.p2align 5 +_aesni_ctr32_ghash_6x: + + vmovdqu 32(%r11),%xmm2 + subq $6,%rdx + vpxor %xmm4,%xmm4,%xmm4 + vmovdqu 0-128(%rcx),%xmm15 + vpaddb %xmm2,%xmm1,%xmm10 + vpaddb %xmm2,%xmm10,%xmm11 + vpaddb %xmm2,%xmm11,%xmm12 + vpaddb %xmm2,%xmm12,%xmm13 + vpaddb %xmm2,%xmm13,%xmm14 + vpxor %xmm15,%xmm1,%xmm9 + vmovdqu %xmm4,16+8(%rsp) + jmp L$oop6x + +.p2align 5 +L$oop6x: + addl $100663296,%ebx + jc L$handle_ctr32 + vmovdqu 0-32(%r9),%xmm3 + vpaddb %xmm2,%xmm14,%xmm1 + vpxor %xmm15,%xmm10,%xmm10 + vpxor %xmm15,%xmm11,%xmm11 + +L$resume_ctr32: + vmovdqu %xmm1,(%r8) + vpclmulqdq $0x10,%xmm3,%xmm7,%xmm5 + vpxor %xmm15,%xmm12,%xmm12 + vmovups 16-128(%rcx),%xmm2 + vpclmulqdq $0x01,%xmm3,%xmm7,%xmm6 + + + + + + + + + + + + + + + + + + xorq %r12,%r12 + cmpq %r14,%r15 + + vaesenc %xmm2,%xmm9,%xmm9 + vmovdqu 48+8(%rsp),%xmm0 + vpxor %xmm15,%xmm13,%xmm13 + vpclmulqdq $0x00,%xmm3,%xmm7,%xmm1 + vaesenc %xmm2,%xmm10,%xmm10 + vpxor %xmm15,%xmm14,%xmm14 + setnc %r12b + vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7 + vaesenc %xmm2,%xmm11,%xmm11 + vmovdqu 16-32(%r9),%xmm3 + negq %r12 + vaesenc %xmm2,%xmm12,%xmm12 + vpxor %xmm5,%xmm6,%xmm6 + vpclmulqdq $0x00,%xmm3,%xmm0,%xmm5 + vpxor %xmm4,%xmm8,%xmm8 + vaesenc %xmm2,%xmm13,%xmm13 + vpxor %xmm5,%xmm1,%xmm4 + andq $0x60,%r12 + vmovups 32-128(%rcx),%xmm15 + vpclmulqdq $0x10,%xmm3,%xmm0,%xmm1 + vaesenc %xmm2,%xmm14,%xmm14 + + vpclmulqdq $0x01,%xmm3,%xmm0,%xmm2 + leaq (%r14,%r12,1),%r14 + vaesenc %xmm15,%xmm9,%xmm9 + vpxor 16+8(%rsp),%xmm8,%xmm8 + vpclmulqdq $0x11,%xmm3,%xmm0,%xmm3 + vmovdqu 64+8(%rsp),%xmm0 + vaesenc %xmm15,%xmm10,%xmm10 + movbeq 88(%r14),%r13 + vaesenc %xmm15,%xmm11,%xmm11 + movbeq 80(%r14),%r12 + vaesenc %xmm15,%xmm12,%xmm12 + movq %r13,32+8(%rsp) + vaesenc %xmm15,%xmm13,%xmm13 + movq %r12,40+8(%rsp) + vmovdqu 48-32(%r9),%xmm5 + vaesenc %xmm15,%xmm14,%xmm14 + + vmovups 48-128(%rcx),%xmm15 + vpxor %xmm1,%xmm6,%xmm6 + vpclmulqdq $0x00,%xmm5,%xmm0,%xmm1 + vaesenc %xmm15,%xmm9,%xmm9 + vpxor %xmm2,%xmm6,%xmm6 + vpclmulqdq $0x10,%xmm5,%xmm0,%xmm2 + vaesenc %xmm15,%xmm10,%xmm10 + vpxor %xmm3,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm5,%xmm0,%xmm3 + vaesenc %xmm15,%xmm11,%xmm11 + vpclmulqdq $0x11,%xmm5,%xmm0,%xmm5 + vmovdqu 80+8(%rsp),%xmm0 + vaesenc %xmm15,%xmm12,%xmm12 + vaesenc %xmm15,%xmm13,%xmm13 + vpxor %xmm1,%xmm4,%xmm4 + vmovdqu 64-32(%r9),%xmm1 + vaesenc %xmm15,%xmm14,%xmm14 + + vmovups 64-128(%rcx),%xmm15 + vpxor %xmm2,%xmm6,%xmm6 + vpclmulqdq $0x00,%xmm1,%xmm0,%xmm2 + vaesenc %xmm15,%xmm9,%xmm9 + vpxor %xmm3,%xmm6,%xmm6 + vpclmulqdq $0x10,%xmm1,%xmm0,%xmm3 + vaesenc %xmm15,%xmm10,%xmm10 + movbeq 72(%r14),%r13 + vpxor %xmm5,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm1,%xmm0,%xmm5 + vaesenc %xmm15,%xmm11,%xmm11 + movbeq 64(%r14),%r12 + vpclmulqdq $0x11,%xmm1,%xmm0,%xmm1 + vmovdqu 96+8(%rsp),%xmm0 + vaesenc %xmm15,%xmm12,%xmm12 + movq %r13,48+8(%rsp) + vaesenc %xmm15,%xmm13,%xmm13 + movq %r12,56+8(%rsp) + vpxor %xmm2,%xmm4,%xmm4 + vmovdqu 96-32(%r9),%xmm2 + vaesenc %xmm15,%xmm14,%xmm14 + + vmovups 80-128(%rcx),%xmm15 + vpxor %xmm3,%xmm6,%xmm6 + vpclmulqdq $0x00,%xmm2,%xmm0,%xmm3 + vaesenc %xmm15,%xmm9,%xmm9 + vpxor %xmm5,%xmm6,%xmm6 + vpclmulqdq 
$0x10,%xmm2,%xmm0,%xmm5 + vaesenc %xmm15,%xmm10,%xmm10 + movbeq 56(%r14),%r13 + vpxor %xmm1,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm2,%xmm0,%xmm1 + vpxor 112+8(%rsp),%xmm8,%xmm8 + vaesenc %xmm15,%xmm11,%xmm11 + movbeq 48(%r14),%r12 + vpclmulqdq $0x11,%xmm2,%xmm0,%xmm2 + vaesenc %xmm15,%xmm12,%xmm12 + movq %r13,64+8(%rsp) + vaesenc %xmm15,%xmm13,%xmm13 + movq %r12,72+8(%rsp) + vpxor %xmm3,%xmm4,%xmm4 + vmovdqu 112-32(%r9),%xmm3 + vaesenc %xmm15,%xmm14,%xmm14 + + vmovups 96-128(%rcx),%xmm15 + vpxor %xmm5,%xmm6,%xmm6 + vpclmulqdq $0x10,%xmm3,%xmm8,%xmm5 + vaesenc %xmm15,%xmm9,%xmm9 + vpxor %xmm1,%xmm6,%xmm6 + vpclmulqdq $0x01,%xmm3,%xmm8,%xmm1 + vaesenc %xmm15,%xmm10,%xmm10 + movbeq 40(%r14),%r13 + vpxor %xmm2,%xmm7,%xmm7 + vpclmulqdq $0x00,%xmm3,%xmm8,%xmm2 + vaesenc %xmm15,%xmm11,%xmm11 + movbeq 32(%r14),%r12 + vpclmulqdq $0x11,%xmm3,%xmm8,%xmm8 + vaesenc %xmm15,%xmm12,%xmm12 + movq %r13,80+8(%rsp) + vaesenc %xmm15,%xmm13,%xmm13 + movq %r12,88+8(%rsp) + vpxor %xmm5,%xmm6,%xmm6 + vaesenc %xmm15,%xmm14,%xmm14 + vpxor %xmm1,%xmm6,%xmm6 + + vmovups 112-128(%rcx),%xmm15 + vpslldq $8,%xmm6,%xmm5 + vpxor %xmm2,%xmm4,%xmm4 + vmovdqu 16(%r11),%xmm3 + + vaesenc %xmm15,%xmm9,%xmm9 + vpxor %xmm8,%xmm7,%xmm7 + vaesenc %xmm15,%xmm10,%xmm10 + vpxor %xmm5,%xmm4,%xmm4 + movbeq 24(%r14),%r13 + vaesenc %xmm15,%xmm11,%xmm11 + movbeq 16(%r14),%r12 + vpalignr $8,%xmm4,%xmm4,%xmm0 + vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4 + movq %r13,96+8(%rsp) + vaesenc %xmm15,%xmm12,%xmm12 + movq %r12,104+8(%rsp) + vaesenc %xmm15,%xmm13,%xmm13 + vmovups 128-128(%rcx),%xmm1 + vaesenc %xmm15,%xmm14,%xmm14 + + vaesenc %xmm1,%xmm9,%xmm9 + vmovups 144-128(%rcx),%xmm15 + vaesenc %xmm1,%xmm10,%xmm10 + vpsrldq $8,%xmm6,%xmm6 + vaesenc %xmm1,%xmm11,%xmm11 + vpxor %xmm6,%xmm7,%xmm7 + vaesenc %xmm1,%xmm12,%xmm12 + vpxor %xmm0,%xmm4,%xmm4 + movbeq 8(%r14),%r13 + vaesenc %xmm1,%xmm13,%xmm13 + movbeq 0(%r14),%r12 + vaesenc %xmm1,%xmm14,%xmm14 + vmovups 160-128(%rcx),%xmm1 + cmpl $11,%r10d + jb L$enc_tail + + vaesenc %xmm15,%xmm9,%xmm9 + vaesenc %xmm15,%xmm10,%xmm10 + vaesenc %xmm15,%xmm11,%xmm11 + vaesenc %xmm15,%xmm12,%xmm12 + vaesenc %xmm15,%xmm13,%xmm13 + vaesenc %xmm15,%xmm14,%xmm14 + + vaesenc %xmm1,%xmm9,%xmm9 + vaesenc %xmm1,%xmm10,%xmm10 + vaesenc %xmm1,%xmm11,%xmm11 + vaesenc %xmm1,%xmm12,%xmm12 + vaesenc %xmm1,%xmm13,%xmm13 + vmovups 176-128(%rcx),%xmm15 + vaesenc %xmm1,%xmm14,%xmm14 + vmovups 192-128(%rcx),%xmm1 + + + vaesenc %xmm15,%xmm9,%xmm9 + vaesenc %xmm15,%xmm10,%xmm10 + vaesenc %xmm15,%xmm11,%xmm11 + vaesenc %xmm15,%xmm12,%xmm12 + vaesenc %xmm15,%xmm13,%xmm13 + vaesenc %xmm15,%xmm14,%xmm14 + + vaesenc %xmm1,%xmm9,%xmm9 + vaesenc %xmm1,%xmm10,%xmm10 + vaesenc %xmm1,%xmm11,%xmm11 + vaesenc %xmm1,%xmm12,%xmm12 + vaesenc %xmm1,%xmm13,%xmm13 + vmovups 208-128(%rcx),%xmm15 + vaesenc %xmm1,%xmm14,%xmm14 + vmovups 224-128(%rcx),%xmm1 + jmp L$enc_tail + +.p2align 5 +L$handle_ctr32: + vmovdqu (%r11),%xmm0 + vpshufb %xmm0,%xmm1,%xmm6 + vmovdqu 48(%r11),%xmm5 + vpaddd 64(%r11),%xmm6,%xmm10 + vpaddd %xmm5,%xmm6,%xmm11 + vmovdqu 0-32(%r9),%xmm3 + vpaddd %xmm5,%xmm10,%xmm12 + vpshufb %xmm0,%xmm10,%xmm10 + vpaddd %xmm5,%xmm11,%xmm13 + vpshufb %xmm0,%xmm11,%xmm11 + vpxor %xmm15,%xmm10,%xmm10 + vpaddd %xmm5,%xmm12,%xmm14 + vpshufb %xmm0,%xmm12,%xmm12 + vpxor %xmm15,%xmm11,%xmm11 + vpaddd %xmm5,%xmm13,%xmm1 + vpshufb %xmm0,%xmm13,%xmm13 + vpshufb %xmm0,%xmm14,%xmm14 + vpshufb %xmm0,%xmm1,%xmm1 + jmp L$resume_ctr32 + +.p2align 5 +L$enc_tail: + vaesenc %xmm15,%xmm9,%xmm9 + vmovdqu %xmm7,16+8(%rsp) + vpalignr $8,%xmm4,%xmm4,%xmm8 + vaesenc 
%xmm15,%xmm10,%xmm10 + vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4 + vpxor 0(%rdi),%xmm1,%xmm2 + vaesenc %xmm15,%xmm11,%xmm11 + vpxor 16(%rdi),%xmm1,%xmm0 + vaesenc %xmm15,%xmm12,%xmm12 + vpxor 32(%rdi),%xmm1,%xmm5 + vaesenc %xmm15,%xmm13,%xmm13 + vpxor 48(%rdi),%xmm1,%xmm6 + vaesenc %xmm15,%xmm14,%xmm14 + vpxor 64(%rdi),%xmm1,%xmm7 + vpxor 80(%rdi),%xmm1,%xmm3 + vmovdqu (%r8),%xmm1 + + vaesenclast %xmm2,%xmm9,%xmm9 + vmovdqu 32(%r11),%xmm2 + vaesenclast %xmm0,%xmm10,%xmm10 + vpaddb %xmm2,%xmm1,%xmm0 + movq %r13,112+8(%rsp) + leaq 96(%rdi),%rdi + + prefetcht0 512(%rdi) + prefetcht0 576(%rdi) + vaesenclast %xmm5,%xmm11,%xmm11 + vpaddb %xmm2,%xmm0,%xmm5 + movq %r12,120+8(%rsp) + leaq 96(%rsi),%rsi + vmovdqu 0-128(%rcx),%xmm15 + vaesenclast %xmm6,%xmm12,%xmm12 + vpaddb %xmm2,%xmm5,%xmm6 + vaesenclast %xmm7,%xmm13,%xmm13 + vpaddb %xmm2,%xmm6,%xmm7 + vaesenclast %xmm3,%xmm14,%xmm14 + vpaddb %xmm2,%xmm7,%xmm3 + + addq $0x60,%rax + subq $0x6,%rdx + jc L$6x_done + + vmovups %xmm9,-96(%rsi) + vpxor %xmm15,%xmm1,%xmm9 + vmovups %xmm10,-80(%rsi) + vmovdqa %xmm0,%xmm10 + vmovups %xmm11,-64(%rsi) + vmovdqa %xmm5,%xmm11 + vmovups %xmm12,-48(%rsi) + vmovdqa %xmm6,%xmm12 + vmovups %xmm13,-32(%rsi) + vmovdqa %xmm7,%xmm13 + vmovups %xmm14,-16(%rsi) + vmovdqa %xmm3,%xmm14 + vmovdqu 32+8(%rsp),%xmm7 + jmp L$oop6x + +L$6x_done: + vpxor 16+8(%rsp),%xmm8,%xmm8 + vpxor %xmm4,%xmm8,%xmm8 + + ret + + +.globl _aesni_gcm_decrypt +.private_extern _aesni_gcm_decrypt + +.p2align 5 +_aesni_gcm_decrypt: + + +_CET_ENDBR + xorq %rax,%rax + + + + cmpq $0x60,%rdx + jb L$gcm_dec_abort + + pushq %rbp + + + movq %rsp,%rbp + + pushq %rbx + + + pushq %r12 + + + pushq %r13 + + + pushq %r14 + + + pushq %r15 + + + vzeroupper + + movq 16(%rbp),%r12 + vmovdqu (%r8),%xmm1 + addq $-128,%rsp + movl 12(%r8),%ebx + leaq L$bswap_mask(%rip),%r11 + leaq -128(%rcx),%r14 + movq $0xf80,%r15 + vmovdqu (%r12),%xmm8 + andq $-128,%rsp + vmovdqu (%r11),%xmm0 + leaq 128(%rcx),%rcx + leaq 32(%r9),%r9 + movl 240-128(%rcx),%r10d + vpshufb %xmm0,%xmm8,%xmm8 + + andq %r15,%r14 + andq %rsp,%r15 + subq %r14,%r15 + jc L$dec_no_key_aliasing + cmpq $768,%r15 + jnc L$dec_no_key_aliasing + subq %r15,%rsp +L$dec_no_key_aliasing: + + vmovdqu 80(%rdi),%xmm7 + movq %rdi,%r14 + vmovdqu 64(%rdi),%xmm4 + + + + + + + + leaq -192(%rdi,%rdx,1),%r15 + + vmovdqu 48(%rdi),%xmm5 + shrq $4,%rdx + xorq %rax,%rax + vmovdqu 32(%rdi),%xmm6 + vpshufb %xmm0,%xmm7,%xmm7 + vmovdqu 16(%rdi),%xmm2 + vpshufb %xmm0,%xmm4,%xmm4 + vmovdqu (%rdi),%xmm3 + vpshufb %xmm0,%xmm5,%xmm5 + vmovdqu %xmm4,48(%rsp) + vpshufb %xmm0,%xmm6,%xmm6 + vmovdqu %xmm5,64(%rsp) + vpshufb %xmm0,%xmm2,%xmm2 + vmovdqu %xmm6,80(%rsp) + vpshufb %xmm0,%xmm3,%xmm3 + vmovdqu %xmm2,96(%rsp) + vmovdqu %xmm3,112(%rsp) + + call _aesni_ctr32_ghash_6x + + movq 16(%rbp),%r12 + vmovups %xmm9,-96(%rsi) + vmovups %xmm10,-80(%rsi) + vmovups %xmm11,-64(%rsi) + vmovups %xmm12,-48(%rsi) + vmovups %xmm13,-32(%rsi) + vmovups %xmm14,-16(%rsi) + + vpshufb (%r11),%xmm8,%xmm8 + vmovdqu %xmm8,(%r12) + + vzeroupper + leaq -40(%rbp),%rsp + + popq %r15 + + popq %r14 + + popq %r13 + + popq %r12 + + popq %rbx + + popq %rbp + +L$gcm_dec_abort: + ret + + + + +.p2align 5 +_aesni_ctr32_6x: + + vmovdqu 0-128(%rcx),%xmm4 + vmovdqu 32(%r11),%xmm2 + leaq -1(%r10),%r13 + vmovups 16-128(%rcx),%xmm15 + leaq 32-128(%rcx),%r12 + vpxor %xmm4,%xmm1,%xmm9 + addl $100663296,%ebx + jc L$handle_ctr32_2 + vpaddb %xmm2,%xmm1,%xmm10 + vpaddb %xmm2,%xmm10,%xmm11 + vpxor %xmm4,%xmm10,%xmm10 + vpaddb %xmm2,%xmm11,%xmm12 + vpxor %xmm4,%xmm11,%xmm11 + vpaddb %xmm2,%xmm12,%xmm13 + 
vpxor %xmm4,%xmm12,%xmm12 + vpaddb %xmm2,%xmm13,%xmm14 + vpxor %xmm4,%xmm13,%xmm13 + vpaddb %xmm2,%xmm14,%xmm1 + vpxor %xmm4,%xmm14,%xmm14 + jmp L$oop_ctr32 + +.p2align 4 +L$oop_ctr32: + vaesenc %xmm15,%xmm9,%xmm9 + vaesenc %xmm15,%xmm10,%xmm10 + vaesenc %xmm15,%xmm11,%xmm11 + vaesenc %xmm15,%xmm12,%xmm12 + vaesenc %xmm15,%xmm13,%xmm13 + vaesenc %xmm15,%xmm14,%xmm14 + vmovups (%r12),%xmm15 + leaq 16(%r12),%r12 + decl %r13d + jnz L$oop_ctr32 + + vmovdqu (%r12),%xmm3 + vaesenc %xmm15,%xmm9,%xmm9 + vpxor 0(%rdi),%xmm3,%xmm4 + vaesenc %xmm15,%xmm10,%xmm10 + vpxor 16(%rdi),%xmm3,%xmm5 + vaesenc %xmm15,%xmm11,%xmm11 + vpxor 32(%rdi),%xmm3,%xmm6 + vaesenc %xmm15,%xmm12,%xmm12 + vpxor 48(%rdi),%xmm3,%xmm8 + vaesenc %xmm15,%xmm13,%xmm13 + vpxor 64(%rdi),%xmm3,%xmm2 + vaesenc %xmm15,%xmm14,%xmm14 + vpxor 80(%rdi),%xmm3,%xmm3 + leaq 96(%rdi),%rdi + + vaesenclast %xmm4,%xmm9,%xmm9 + vaesenclast %xmm5,%xmm10,%xmm10 + vaesenclast %xmm6,%xmm11,%xmm11 + vaesenclast %xmm8,%xmm12,%xmm12 + vaesenclast %xmm2,%xmm13,%xmm13 + vaesenclast %xmm3,%xmm14,%xmm14 + vmovups %xmm9,0(%rsi) + vmovups %xmm10,16(%rsi) + vmovups %xmm11,32(%rsi) + vmovups %xmm12,48(%rsi) + vmovups %xmm13,64(%rsi) + vmovups %xmm14,80(%rsi) + leaq 96(%rsi),%rsi + + ret +.p2align 5 +L$handle_ctr32_2: + vpshufb %xmm0,%xmm1,%xmm6 + vmovdqu 48(%r11),%xmm5 + vpaddd 64(%r11),%xmm6,%xmm10 + vpaddd %xmm5,%xmm6,%xmm11 + vpaddd %xmm5,%xmm10,%xmm12 + vpshufb %xmm0,%xmm10,%xmm10 + vpaddd %xmm5,%xmm11,%xmm13 + vpshufb %xmm0,%xmm11,%xmm11 + vpxor %xmm4,%xmm10,%xmm10 + vpaddd %xmm5,%xmm12,%xmm14 + vpshufb %xmm0,%xmm12,%xmm12 + vpxor %xmm4,%xmm11,%xmm11 + vpaddd %xmm5,%xmm13,%xmm1 + vpshufb %xmm0,%xmm13,%xmm13 + vpxor %xmm4,%xmm12,%xmm12 + vpshufb %xmm0,%xmm14,%xmm14 + vpxor %xmm4,%xmm13,%xmm13 + vpshufb %xmm0,%xmm1,%xmm1 + vpxor %xmm4,%xmm14,%xmm14 + jmp L$oop_ctr32 + + + +.globl _aesni_gcm_encrypt +.private_extern _aesni_gcm_encrypt + +.p2align 5 +_aesni_gcm_encrypt: + + +_CET_ENDBR +#ifdef BORINGSSL_DISPATCH_TEST + + movb $1,_BORINGSSL_function_hit+2(%rip) +#endif + xorq %rax,%rax + + + + + cmpq $288,%rdx + jb L$gcm_enc_abort + + pushq %rbp + + + movq %rsp,%rbp + + pushq %rbx + + + pushq %r12 + + + pushq %r13 + + + pushq %r14 + + + pushq %r15 + + + vzeroupper + + vmovdqu (%r8),%xmm1 + addq $-128,%rsp + movl 12(%r8),%ebx + leaq L$bswap_mask(%rip),%r11 + leaq -128(%rcx),%r14 + movq $0xf80,%r15 + leaq 128(%rcx),%rcx + vmovdqu (%r11),%xmm0 + andq $-128,%rsp + movl 240-128(%rcx),%r10d + + andq %r15,%r14 + andq %rsp,%r15 + subq %r14,%r15 + jc L$enc_no_key_aliasing + cmpq $768,%r15 + jnc L$enc_no_key_aliasing + subq %r15,%rsp +L$enc_no_key_aliasing: + + movq %rsi,%r14 + + + + + + + + + leaq -192(%rsi,%rdx,1),%r15 + + shrq $4,%rdx + + call _aesni_ctr32_6x + vpshufb %xmm0,%xmm9,%xmm8 + vpshufb %xmm0,%xmm10,%xmm2 + vmovdqu %xmm8,112(%rsp) + vpshufb %xmm0,%xmm11,%xmm4 + vmovdqu %xmm2,96(%rsp) + vpshufb %xmm0,%xmm12,%xmm5 + vmovdqu %xmm4,80(%rsp) + vpshufb %xmm0,%xmm13,%xmm6 + vmovdqu %xmm5,64(%rsp) + vpshufb %xmm0,%xmm14,%xmm7 + vmovdqu %xmm6,48(%rsp) + + call _aesni_ctr32_6x + + movq 16(%rbp),%r12 + leaq 32(%r9),%r9 + vmovdqu (%r12),%xmm8 + subq $12,%rdx + movq $192,%rax + vpshufb %xmm0,%xmm8,%xmm8 + + call _aesni_ctr32_ghash_6x + vmovdqu 32(%rsp),%xmm7 + vmovdqu (%r11),%xmm0 + vmovdqu 0-32(%r9),%xmm3 + vpunpckhqdq %xmm7,%xmm7,%xmm1 + vmovdqu 32-32(%r9),%xmm15 + vmovups %xmm9,-96(%rsi) + vpshufb %xmm0,%xmm9,%xmm9 + vpxor %xmm7,%xmm1,%xmm1 + vmovups %xmm10,-80(%rsi) + vpshufb %xmm0,%xmm10,%xmm10 + vmovups %xmm11,-64(%rsi) + vpshufb %xmm0,%xmm11,%xmm11 + vmovups 
%xmm12,-48(%rsi) + vpshufb %xmm0,%xmm12,%xmm12 + vmovups %xmm13,-32(%rsi) + vpshufb %xmm0,%xmm13,%xmm13 + vmovups %xmm14,-16(%rsi) + vpshufb %xmm0,%xmm14,%xmm14 + vmovdqu %xmm9,16(%rsp) + vmovdqu 48(%rsp),%xmm6 + vmovdqu 16-32(%r9),%xmm0 + vpunpckhqdq %xmm6,%xmm6,%xmm2 + vpclmulqdq $0x00,%xmm3,%xmm7,%xmm5 + vpxor %xmm6,%xmm2,%xmm2 + vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7 + vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1 + + vmovdqu 64(%rsp),%xmm9 + vpclmulqdq $0x00,%xmm0,%xmm6,%xmm4 + vmovdqu 48-32(%r9),%xmm3 + vpxor %xmm5,%xmm4,%xmm4 + vpunpckhqdq %xmm9,%xmm9,%xmm5 + vpclmulqdq $0x11,%xmm0,%xmm6,%xmm6 + vpxor %xmm9,%xmm5,%xmm5 + vpxor %xmm7,%xmm6,%xmm6 + vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2 + vmovdqu 80-32(%r9),%xmm15 + vpxor %xmm1,%xmm2,%xmm2 + + vmovdqu 80(%rsp),%xmm1 + vpclmulqdq $0x00,%xmm3,%xmm9,%xmm7 + vmovdqu 64-32(%r9),%xmm0 + vpxor %xmm4,%xmm7,%xmm7 + vpunpckhqdq %xmm1,%xmm1,%xmm4 + vpclmulqdq $0x11,%xmm3,%xmm9,%xmm9 + vpxor %xmm1,%xmm4,%xmm4 + vpxor %xmm6,%xmm9,%xmm9 + vpclmulqdq $0x00,%xmm15,%xmm5,%xmm5 + vpxor %xmm2,%xmm5,%xmm5 + + vmovdqu 96(%rsp),%xmm2 + vpclmulqdq $0x00,%xmm0,%xmm1,%xmm6 + vmovdqu 96-32(%r9),%xmm3 + vpxor %xmm7,%xmm6,%xmm6 + vpunpckhqdq %xmm2,%xmm2,%xmm7 + vpclmulqdq $0x11,%xmm0,%xmm1,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpxor %xmm9,%xmm1,%xmm1 + vpclmulqdq $0x10,%xmm15,%xmm4,%xmm4 + vmovdqu 128-32(%r9),%xmm15 + vpxor %xmm5,%xmm4,%xmm4 + + vpxor 112(%rsp),%xmm8,%xmm8 + vpclmulqdq $0x00,%xmm3,%xmm2,%xmm5 + vmovdqu 112-32(%r9),%xmm0 + vpunpckhqdq %xmm8,%xmm8,%xmm9 + vpxor %xmm6,%xmm5,%xmm5 + vpclmulqdq $0x11,%xmm3,%xmm2,%xmm2 + vpxor %xmm8,%xmm9,%xmm9 + vpxor %xmm1,%xmm2,%xmm2 + vpclmulqdq $0x00,%xmm15,%xmm7,%xmm7 + vpxor %xmm4,%xmm7,%xmm4 + + vpclmulqdq $0x00,%xmm0,%xmm8,%xmm6 + vmovdqu 0-32(%r9),%xmm3 + vpunpckhqdq %xmm14,%xmm14,%xmm1 + vpclmulqdq $0x11,%xmm0,%xmm8,%xmm8 + vpxor %xmm14,%xmm1,%xmm1 + vpxor %xmm5,%xmm6,%xmm5 + vpclmulqdq $0x10,%xmm15,%xmm9,%xmm9 + vmovdqu 32-32(%r9),%xmm15 + vpxor %xmm2,%xmm8,%xmm7 + vpxor %xmm4,%xmm9,%xmm6 + + vmovdqu 16-32(%r9),%xmm0 + vpxor %xmm5,%xmm7,%xmm9 + vpclmulqdq $0x00,%xmm3,%xmm14,%xmm4 + vpxor %xmm9,%xmm6,%xmm6 + vpunpckhqdq %xmm13,%xmm13,%xmm2 + vpclmulqdq $0x11,%xmm3,%xmm14,%xmm14 + vpxor %xmm13,%xmm2,%xmm2 + vpslldq $8,%xmm6,%xmm9 + vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1 + vpxor %xmm9,%xmm5,%xmm8 + vpsrldq $8,%xmm6,%xmm6 + vpxor %xmm6,%xmm7,%xmm7 + + vpclmulqdq $0x00,%xmm0,%xmm13,%xmm5 + vmovdqu 48-32(%r9),%xmm3 + vpxor %xmm4,%xmm5,%xmm5 + vpunpckhqdq %xmm12,%xmm12,%xmm9 + vpclmulqdq $0x11,%xmm0,%xmm13,%xmm13 + vpxor %xmm12,%xmm9,%xmm9 + vpxor %xmm14,%xmm13,%xmm13 + vpalignr $8,%xmm8,%xmm8,%xmm14 + vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2 + vmovdqu 80-32(%r9),%xmm15 + vpxor %xmm1,%xmm2,%xmm2 + + vpclmulqdq $0x00,%xmm3,%xmm12,%xmm4 + vmovdqu 64-32(%r9),%xmm0 + vpxor %xmm5,%xmm4,%xmm4 + vpunpckhqdq %xmm11,%xmm11,%xmm1 + vpclmulqdq $0x11,%xmm3,%xmm12,%xmm12 + vpxor %xmm11,%xmm1,%xmm1 + vpxor %xmm13,%xmm12,%xmm12 + vxorps 16(%rsp),%xmm7,%xmm7 + vpclmulqdq $0x00,%xmm15,%xmm9,%xmm9 + vpxor %xmm2,%xmm9,%xmm9 + + vpclmulqdq $0x10,16(%r11),%xmm8,%xmm8 + vxorps %xmm14,%xmm8,%xmm8 + + vpclmulqdq $0x00,%xmm0,%xmm11,%xmm5 + vmovdqu 96-32(%r9),%xmm3 + vpxor %xmm4,%xmm5,%xmm5 + vpunpckhqdq %xmm10,%xmm10,%xmm2 + vpclmulqdq $0x11,%xmm0,%xmm11,%xmm11 + vpxor %xmm10,%xmm2,%xmm2 + vpalignr $8,%xmm8,%xmm8,%xmm14 + vpxor %xmm12,%xmm11,%xmm11 + vpclmulqdq $0x10,%xmm15,%xmm1,%xmm1 + vmovdqu 128-32(%r9),%xmm15 + vpxor %xmm9,%xmm1,%xmm1 + + vxorps %xmm7,%xmm14,%xmm14 + vpclmulqdq $0x10,16(%r11),%xmm8,%xmm8 + vxorps %xmm14,%xmm8,%xmm8 + + vpclmulqdq 
$0x00,%xmm3,%xmm10,%xmm4 + vmovdqu 112-32(%r9),%xmm0 + vpxor %xmm5,%xmm4,%xmm4 + vpunpckhqdq %xmm8,%xmm8,%xmm9 + vpclmulqdq $0x11,%xmm3,%xmm10,%xmm10 + vpxor %xmm8,%xmm9,%xmm9 + vpxor %xmm11,%xmm10,%xmm10 + vpclmulqdq $0x00,%xmm15,%xmm2,%xmm2 + vpxor %xmm1,%xmm2,%xmm2 + + vpclmulqdq $0x00,%xmm0,%xmm8,%xmm5 + vpclmulqdq $0x11,%xmm0,%xmm8,%xmm7 + vpxor %xmm4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm15,%xmm9,%xmm6 + vpxor %xmm10,%xmm7,%xmm7 + vpxor %xmm2,%xmm6,%xmm6 + + vpxor %xmm5,%xmm7,%xmm4 + vpxor %xmm4,%xmm6,%xmm6 + vpslldq $8,%xmm6,%xmm1 + vmovdqu 16(%r11),%xmm3 + vpsrldq $8,%xmm6,%xmm6 + vpxor %xmm1,%xmm5,%xmm8 + vpxor %xmm6,%xmm7,%xmm7 + + vpalignr $8,%xmm8,%xmm8,%xmm2 + vpclmulqdq $0x10,%xmm3,%xmm8,%xmm8 + vpxor %xmm2,%xmm8,%xmm8 + + vpalignr $8,%xmm8,%xmm8,%xmm2 + vpclmulqdq $0x10,%xmm3,%xmm8,%xmm8 + vpxor %xmm7,%xmm2,%xmm2 + vpxor %xmm2,%xmm8,%xmm8 + movq 16(%rbp),%r12 + vpshufb (%r11),%xmm8,%xmm8 + vmovdqu %xmm8,(%r12) + + vzeroupper + leaq -40(%rbp),%rsp + + popq %r15 + + popq %r14 + + popq %r13 + + popq %r12 + + popq %rbx + + popq %rbp + +L$gcm_enc_abort: + ret + + + +.section __DATA,__const +.p2align 6 +L$bswap_mask: +.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 +L$poly: +.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 +L$one_msb: +.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 +L$two_lsb: +.byte 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +L$one_lsb: +.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +.byte 65,69,83,45,78,73,32,71,67,77,32,109,111,100,117,108,101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.p2align 6 +.text +#endif diff --git a/ring-0.17.14/pregenerated/aesni-gcm-x86_64-nasm.asm b/ring-0.17.14/pregenerated/aesni-gcm-x86_64-nasm.asm new file mode 100644 index 0000000000..f1a7ca8bd1 --- /dev/null +++ b/ring-0.17.14/pregenerated/aesni-gcm-x86_64-nasm.asm @@ -0,0 +1,1104 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. 
+ +%ifidn __OUTPUT_FORMAT__, win64 +default rel +%define XMMWORD +%define YMMWORD +%define ZMMWORD +%define _CET_ENDBR + +%include "ring_core_generated/prefix_symbols_nasm.inc" +section .text code align=64 + + + +ALIGN 32 +_aesni_ctr32_ghash_6x: + + vmovdqu xmm2,XMMWORD[32+r11] + sub r8,6 + vpxor xmm4,xmm4,xmm4 + vmovdqu xmm15,XMMWORD[((0-128))+r9] + vpaddb xmm10,xmm1,xmm2 + vpaddb xmm11,xmm10,xmm2 + vpaddb xmm12,xmm11,xmm2 + vpaddb xmm13,xmm12,xmm2 + vpaddb xmm14,xmm13,xmm2 + vpxor xmm9,xmm1,xmm15 + vmovdqu XMMWORD[(16+8)+rsp],xmm4 + jmp NEAR $L$oop6x + +ALIGN 32 +$L$oop6x: + add ebx,100663296 + jc NEAR $L$handle_ctr32 + vmovdqu xmm3,XMMWORD[((0-32))+rsi] + vpaddb xmm1,xmm14,xmm2 + vpxor xmm10,xmm10,xmm15 + vpxor xmm11,xmm11,xmm15 + +$L$resume_ctr32: + vmovdqu XMMWORD[rdi],xmm1 + vpclmulqdq xmm5,xmm7,xmm3,0x10 + vpxor xmm12,xmm12,xmm15 + vmovups xmm2,XMMWORD[((16-128))+r9] + vpclmulqdq xmm6,xmm7,xmm3,0x01 + + + + + + + + + + + + + + + + + + xor r12,r12 + cmp r15,r14 + + vaesenc xmm9,xmm9,xmm2 + vmovdqu xmm0,XMMWORD[((48+8))+rsp] + vpxor xmm13,xmm13,xmm15 + vpclmulqdq xmm1,xmm7,xmm3,0x00 + vaesenc xmm10,xmm10,xmm2 + vpxor xmm14,xmm14,xmm15 + setnc r12b + vpclmulqdq xmm7,xmm7,xmm3,0x11 + vaesenc xmm11,xmm11,xmm2 + vmovdqu xmm3,XMMWORD[((16-32))+rsi] + neg r12 + vaesenc xmm12,xmm12,xmm2 + vpxor xmm6,xmm6,xmm5 + vpclmulqdq xmm5,xmm0,xmm3,0x00 + vpxor xmm8,xmm8,xmm4 + vaesenc xmm13,xmm13,xmm2 + vpxor xmm4,xmm1,xmm5 + and r12,0x60 + vmovups xmm15,XMMWORD[((32-128))+r9] + vpclmulqdq xmm1,xmm0,xmm3,0x10 + vaesenc xmm14,xmm14,xmm2 + + vpclmulqdq xmm2,xmm0,xmm3,0x01 + lea r14,[r12*1+r14] + vaesenc xmm9,xmm9,xmm15 + vpxor xmm8,xmm8,XMMWORD[((16+8))+rsp] + vpclmulqdq xmm3,xmm0,xmm3,0x11 + vmovdqu xmm0,XMMWORD[((64+8))+rsp] + vaesenc xmm10,xmm10,xmm15 + movbe r13,QWORD[88+r14] + vaesenc xmm11,xmm11,xmm15 + movbe r12,QWORD[80+r14] + vaesenc xmm12,xmm12,xmm15 + mov QWORD[((32+8))+rsp],r13 + vaesenc xmm13,xmm13,xmm15 + mov QWORD[((40+8))+rsp],r12 + vmovdqu xmm5,XMMWORD[((48-32))+rsi] + vaesenc xmm14,xmm14,xmm15 + + vmovups xmm15,XMMWORD[((48-128))+r9] + vpxor xmm6,xmm6,xmm1 + vpclmulqdq xmm1,xmm0,xmm5,0x00 + vaesenc xmm9,xmm9,xmm15 + vpxor xmm6,xmm6,xmm2 + vpclmulqdq xmm2,xmm0,xmm5,0x10 + vaesenc xmm10,xmm10,xmm15 + vpxor xmm7,xmm7,xmm3 + vpclmulqdq xmm3,xmm0,xmm5,0x01 + vaesenc xmm11,xmm11,xmm15 + vpclmulqdq xmm5,xmm0,xmm5,0x11 + vmovdqu xmm0,XMMWORD[((80+8))+rsp] + vaesenc xmm12,xmm12,xmm15 + vaesenc xmm13,xmm13,xmm15 + vpxor xmm4,xmm4,xmm1 + vmovdqu xmm1,XMMWORD[((64-32))+rsi] + vaesenc xmm14,xmm14,xmm15 + + vmovups xmm15,XMMWORD[((64-128))+r9] + vpxor xmm6,xmm6,xmm2 + vpclmulqdq xmm2,xmm0,xmm1,0x00 + vaesenc xmm9,xmm9,xmm15 + vpxor xmm6,xmm6,xmm3 + vpclmulqdq xmm3,xmm0,xmm1,0x10 + vaesenc xmm10,xmm10,xmm15 + movbe r13,QWORD[72+r14] + vpxor xmm7,xmm7,xmm5 + vpclmulqdq xmm5,xmm0,xmm1,0x01 + vaesenc xmm11,xmm11,xmm15 + movbe r12,QWORD[64+r14] + vpclmulqdq xmm1,xmm0,xmm1,0x11 + vmovdqu xmm0,XMMWORD[((96+8))+rsp] + vaesenc xmm12,xmm12,xmm15 + mov QWORD[((48+8))+rsp],r13 + vaesenc xmm13,xmm13,xmm15 + mov QWORD[((56+8))+rsp],r12 + vpxor xmm4,xmm4,xmm2 + vmovdqu xmm2,XMMWORD[((96-32))+rsi] + vaesenc xmm14,xmm14,xmm15 + + vmovups xmm15,XMMWORD[((80-128))+r9] + vpxor xmm6,xmm6,xmm3 + vpclmulqdq xmm3,xmm0,xmm2,0x00 + vaesenc xmm9,xmm9,xmm15 + vpxor xmm6,xmm6,xmm5 + vpclmulqdq xmm5,xmm0,xmm2,0x10 + vaesenc xmm10,xmm10,xmm15 + movbe r13,QWORD[56+r14] + vpxor xmm7,xmm7,xmm1 + vpclmulqdq xmm1,xmm0,xmm2,0x01 + vpxor xmm8,xmm8,XMMWORD[((112+8))+rsp] + vaesenc xmm11,xmm11,xmm15 + movbe r12,QWORD[48+r14] + vpclmulqdq 
xmm2,xmm0,xmm2,0x11 + vaesenc xmm12,xmm12,xmm15 + mov QWORD[((64+8))+rsp],r13 + vaesenc xmm13,xmm13,xmm15 + mov QWORD[((72+8))+rsp],r12 + vpxor xmm4,xmm4,xmm3 + vmovdqu xmm3,XMMWORD[((112-32))+rsi] + vaesenc xmm14,xmm14,xmm15 + + vmovups xmm15,XMMWORD[((96-128))+r9] + vpxor xmm6,xmm6,xmm5 + vpclmulqdq xmm5,xmm8,xmm3,0x10 + vaesenc xmm9,xmm9,xmm15 + vpxor xmm6,xmm6,xmm1 + vpclmulqdq xmm1,xmm8,xmm3,0x01 + vaesenc xmm10,xmm10,xmm15 + movbe r13,QWORD[40+r14] + vpxor xmm7,xmm7,xmm2 + vpclmulqdq xmm2,xmm8,xmm3,0x00 + vaesenc xmm11,xmm11,xmm15 + movbe r12,QWORD[32+r14] + vpclmulqdq xmm8,xmm8,xmm3,0x11 + vaesenc xmm12,xmm12,xmm15 + mov QWORD[((80+8))+rsp],r13 + vaesenc xmm13,xmm13,xmm15 + mov QWORD[((88+8))+rsp],r12 + vpxor xmm6,xmm6,xmm5 + vaesenc xmm14,xmm14,xmm15 + vpxor xmm6,xmm6,xmm1 + + vmovups xmm15,XMMWORD[((112-128))+r9] + vpslldq xmm5,xmm6,8 + vpxor xmm4,xmm4,xmm2 + vmovdqu xmm3,XMMWORD[16+r11] + + vaesenc xmm9,xmm9,xmm15 + vpxor xmm7,xmm7,xmm8 + vaesenc xmm10,xmm10,xmm15 + vpxor xmm4,xmm4,xmm5 + movbe r13,QWORD[24+r14] + vaesenc xmm11,xmm11,xmm15 + movbe r12,QWORD[16+r14] + vpalignr xmm0,xmm4,xmm4,8 + vpclmulqdq xmm4,xmm4,xmm3,0x10 + mov QWORD[((96+8))+rsp],r13 + vaesenc xmm12,xmm12,xmm15 + mov QWORD[((104+8))+rsp],r12 + vaesenc xmm13,xmm13,xmm15 + vmovups xmm1,XMMWORD[((128-128))+r9] + vaesenc xmm14,xmm14,xmm15 + + vaesenc xmm9,xmm9,xmm1 + vmovups xmm15,XMMWORD[((144-128))+r9] + vaesenc xmm10,xmm10,xmm1 + vpsrldq xmm6,xmm6,8 + vaesenc xmm11,xmm11,xmm1 + vpxor xmm7,xmm7,xmm6 + vaesenc xmm12,xmm12,xmm1 + vpxor xmm4,xmm4,xmm0 + movbe r13,QWORD[8+r14] + vaesenc xmm13,xmm13,xmm1 + movbe r12,QWORD[r14] + vaesenc xmm14,xmm14,xmm1 + vmovups xmm1,XMMWORD[((160-128))+r9] + cmp r10d,11 + jb NEAR $L$enc_tail + + vaesenc xmm9,xmm9,xmm15 + vaesenc xmm10,xmm10,xmm15 + vaesenc xmm11,xmm11,xmm15 + vaesenc xmm12,xmm12,xmm15 + vaesenc xmm13,xmm13,xmm15 + vaesenc xmm14,xmm14,xmm15 + + vaesenc xmm9,xmm9,xmm1 + vaesenc xmm10,xmm10,xmm1 + vaesenc xmm11,xmm11,xmm1 + vaesenc xmm12,xmm12,xmm1 + vaesenc xmm13,xmm13,xmm1 + vmovups xmm15,XMMWORD[((176-128))+r9] + vaesenc xmm14,xmm14,xmm1 + vmovups xmm1,XMMWORD[((192-128))+r9] + + + vaesenc xmm9,xmm9,xmm15 + vaesenc xmm10,xmm10,xmm15 + vaesenc xmm11,xmm11,xmm15 + vaesenc xmm12,xmm12,xmm15 + vaesenc xmm13,xmm13,xmm15 + vaesenc xmm14,xmm14,xmm15 + + vaesenc xmm9,xmm9,xmm1 + vaesenc xmm10,xmm10,xmm1 + vaesenc xmm11,xmm11,xmm1 + vaesenc xmm12,xmm12,xmm1 + vaesenc xmm13,xmm13,xmm1 + vmovups xmm15,XMMWORD[((208-128))+r9] + vaesenc xmm14,xmm14,xmm1 + vmovups xmm1,XMMWORD[((224-128))+r9] + jmp NEAR $L$enc_tail + +ALIGN 32 +$L$handle_ctr32: + vmovdqu xmm0,XMMWORD[r11] + vpshufb xmm6,xmm1,xmm0 + vmovdqu xmm5,XMMWORD[48+r11] + vpaddd xmm10,xmm6,XMMWORD[64+r11] + vpaddd xmm11,xmm6,xmm5 + vmovdqu xmm3,XMMWORD[((0-32))+rsi] + vpaddd xmm12,xmm10,xmm5 + vpshufb xmm10,xmm10,xmm0 + vpaddd xmm13,xmm11,xmm5 + vpshufb xmm11,xmm11,xmm0 + vpxor xmm10,xmm10,xmm15 + vpaddd xmm14,xmm12,xmm5 + vpshufb xmm12,xmm12,xmm0 + vpxor xmm11,xmm11,xmm15 + vpaddd xmm1,xmm13,xmm5 + vpshufb xmm13,xmm13,xmm0 + vpshufb xmm14,xmm14,xmm0 + vpshufb xmm1,xmm1,xmm0 + jmp NEAR $L$resume_ctr32 + +ALIGN 32 +$L$enc_tail: + vaesenc xmm9,xmm9,xmm15 + vmovdqu XMMWORD[(16+8)+rsp],xmm7 + vpalignr xmm8,xmm4,xmm4,8 + vaesenc xmm10,xmm10,xmm15 + vpclmulqdq xmm4,xmm4,xmm3,0x10 + vpxor xmm2,xmm1,XMMWORD[rcx] + vaesenc xmm11,xmm11,xmm15 + vpxor xmm0,xmm1,XMMWORD[16+rcx] + vaesenc xmm12,xmm12,xmm15 + vpxor xmm5,xmm1,XMMWORD[32+rcx] + vaesenc xmm13,xmm13,xmm15 + vpxor xmm6,xmm1,XMMWORD[48+rcx] + vaesenc xmm14,xmm14,xmm15 + 
vpxor xmm7,xmm1,XMMWORD[64+rcx] + vpxor xmm3,xmm1,XMMWORD[80+rcx] + vmovdqu xmm1,XMMWORD[rdi] + + vaesenclast xmm9,xmm9,xmm2 + vmovdqu xmm2,XMMWORD[32+r11] + vaesenclast xmm10,xmm10,xmm0 + vpaddb xmm0,xmm1,xmm2 + mov QWORD[((112+8))+rsp],r13 + lea rcx,[96+rcx] + + prefetcht0 [512+rcx] + prefetcht0 [576+rcx] + vaesenclast xmm11,xmm11,xmm5 + vpaddb xmm5,xmm0,xmm2 + mov QWORD[((120+8))+rsp],r12 + lea rdx,[96+rdx] + vmovdqu xmm15,XMMWORD[((0-128))+r9] + vaesenclast xmm12,xmm12,xmm6 + vpaddb xmm6,xmm5,xmm2 + vaesenclast xmm13,xmm13,xmm7 + vpaddb xmm7,xmm6,xmm2 + vaesenclast xmm14,xmm14,xmm3 + vpaddb xmm3,xmm7,xmm2 + + add rax,0x60 + sub r8,0x6 + jc NEAR $L$6x_done + + vmovups XMMWORD[(-96)+rdx],xmm9 + vpxor xmm9,xmm1,xmm15 + vmovups XMMWORD[(-80)+rdx],xmm10 + vmovdqa xmm10,xmm0 + vmovups XMMWORD[(-64)+rdx],xmm11 + vmovdqa xmm11,xmm5 + vmovups XMMWORD[(-48)+rdx],xmm12 + vmovdqa xmm12,xmm6 + vmovups XMMWORD[(-32)+rdx],xmm13 + vmovdqa xmm13,xmm7 + vmovups XMMWORD[(-16)+rdx],xmm14 + vmovdqa xmm14,xmm3 + vmovdqu xmm7,XMMWORD[((32+8))+rsp] + jmp NEAR $L$oop6x + +$L$6x_done: + vpxor xmm8,xmm8,XMMWORD[((16+8))+rsp] + vpxor xmm8,xmm8,xmm4 + + ret + + +global aesni_gcm_decrypt + +ALIGN 32 +aesni_gcm_decrypt: + +$L$SEH_begin_aesni_gcm_decrypt_1: +_CET_ENDBR + xor rax,rax + + + + cmp r8,0x60 + jb NEAR $L$gcm_dec_abort + + push rbp + +$L$SEH_prologue_aesni_gcm_decrypt_2: + mov rbp,rsp + + push rbx + +$L$SEH_prologue_aesni_gcm_decrypt_3: + push r12 + +$L$SEH_prologue_aesni_gcm_decrypt_4: + push r13 + +$L$SEH_prologue_aesni_gcm_decrypt_5: + push r14 + +$L$SEH_prologue_aesni_gcm_decrypt_6: + push r15 + +$L$SEH_prologue_aesni_gcm_decrypt_7: + lea rsp,[((-168))+rsp] +$L$SEH_prologue_aesni_gcm_decrypt_8: +$L$SEH_prologue_aesni_gcm_decrypt_9: + + + + mov QWORD[16+rbp],rdi +$L$SEH_prologue_aesni_gcm_decrypt_10: + mov QWORD[24+rbp],rsi +$L$SEH_prologue_aesni_gcm_decrypt_11: + mov rdi,QWORD[48+rbp] + mov rsi,QWORD[56+rbp] + + movaps XMMWORD[(-208)+rbp],xmm6 +$L$SEH_prologue_aesni_gcm_decrypt_12: + movaps XMMWORD[(-192)+rbp],xmm7 +$L$SEH_prologue_aesni_gcm_decrypt_13: + movaps XMMWORD[(-176)+rbp],xmm8 +$L$SEH_prologue_aesni_gcm_decrypt_14: + movaps XMMWORD[(-160)+rbp],xmm9 +$L$SEH_prologue_aesni_gcm_decrypt_15: + movaps XMMWORD[(-144)+rbp],xmm10 +$L$SEH_prologue_aesni_gcm_decrypt_16: + movaps XMMWORD[(-128)+rbp],xmm11 +$L$SEH_prologue_aesni_gcm_decrypt_17: + movaps XMMWORD[(-112)+rbp],xmm12 +$L$SEH_prologue_aesni_gcm_decrypt_18: + movaps XMMWORD[(-96)+rbp],xmm13 +$L$SEH_prologue_aesni_gcm_decrypt_19: + movaps XMMWORD[(-80)+rbp],xmm14 +$L$SEH_prologue_aesni_gcm_decrypt_20: + movaps XMMWORD[(-64)+rbp],xmm15 +$L$SEH_prologue_aesni_gcm_decrypt_21: +$L$SEH_endprologue_aesni_gcm_decrypt_22: + vzeroupper + + mov r12,QWORD[64+rbp] + vmovdqu xmm1,XMMWORD[rdi] + add rsp,-128 + mov ebx,DWORD[12+rdi] + lea r11,[$L$bswap_mask] + lea r14,[((-128))+r9] + mov r15,0xf80 + vmovdqu xmm8,XMMWORD[r12] + and rsp,-128 + vmovdqu xmm0,XMMWORD[r11] + lea r9,[128+r9] + lea rsi,[32+rsi] + mov r10d,DWORD[((240-128))+r9] + vpshufb xmm8,xmm8,xmm0 + + and r14,r15 + and r15,rsp + sub r15,r14 + jc NEAR $L$dec_no_key_aliasing + cmp r15,768 + jnc NEAR $L$dec_no_key_aliasing + sub rsp,r15 +$L$dec_no_key_aliasing: + + vmovdqu xmm7,XMMWORD[80+rcx] + mov r14,rcx + vmovdqu xmm4,XMMWORD[64+rcx] + + + + + + + + lea r15,[((-192))+r8*1+rcx] + + vmovdqu xmm5,XMMWORD[48+rcx] + shr r8,4 + xor rax,rax + vmovdqu xmm6,XMMWORD[32+rcx] + vpshufb xmm7,xmm7,xmm0 + vmovdqu xmm2,XMMWORD[16+rcx] + vpshufb xmm4,xmm4,xmm0 + vmovdqu xmm3,XMMWORD[rcx] + vpshufb xmm5,xmm5,xmm0 + 
vmovdqu XMMWORD[48+rsp],xmm4 + vpshufb xmm6,xmm6,xmm0 + vmovdqu XMMWORD[64+rsp],xmm5 + vpshufb xmm2,xmm2,xmm0 + vmovdqu XMMWORD[80+rsp],xmm6 + vpshufb xmm3,xmm3,xmm0 + vmovdqu XMMWORD[96+rsp],xmm2 + vmovdqu XMMWORD[112+rsp],xmm3 + + call _aesni_ctr32_ghash_6x + + mov r12,QWORD[64+rbp] + vmovups XMMWORD[(-96)+rdx],xmm9 + vmovups XMMWORD[(-80)+rdx],xmm10 + vmovups XMMWORD[(-64)+rdx],xmm11 + vmovups XMMWORD[(-48)+rdx],xmm12 + vmovups XMMWORD[(-32)+rdx],xmm13 + vmovups XMMWORD[(-16)+rdx],xmm14 + + vpshufb xmm8,xmm8,XMMWORD[r11] + vmovdqu XMMWORD[r12],xmm8 + + vzeroupper + movaps xmm6,XMMWORD[((-208))+rbp] + movaps xmm7,XMMWORD[((-192))+rbp] + movaps xmm8,XMMWORD[((-176))+rbp] + movaps xmm9,XMMWORD[((-160))+rbp] + movaps xmm10,XMMWORD[((-144))+rbp] + movaps xmm11,XMMWORD[((-128))+rbp] + movaps xmm12,XMMWORD[((-112))+rbp] + movaps xmm13,XMMWORD[((-96))+rbp] + movaps xmm14,XMMWORD[((-80))+rbp] + movaps xmm15,XMMWORD[((-64))+rbp] + mov rdi,QWORD[16+rbp] + mov rsi,QWORD[24+rbp] + lea rsp,[((-40))+rbp] + + pop r15 + + pop r14 + + pop r13 + + pop r12 + + pop rbx + + pop rbp + +$L$gcm_dec_abort: + ret +$L$SEH_end_aesni_gcm_decrypt_23: + + + +ALIGN 32 +_aesni_ctr32_6x: + + vmovdqu xmm4,XMMWORD[((0-128))+r9] + vmovdqu xmm2,XMMWORD[32+r11] + lea r13,[((-1))+r10] + vmovups xmm15,XMMWORD[((16-128))+r9] + lea r12,[((32-128))+r9] + vpxor xmm9,xmm1,xmm4 + add ebx,100663296 + jc NEAR $L$handle_ctr32_2 + vpaddb xmm10,xmm1,xmm2 + vpaddb xmm11,xmm10,xmm2 + vpxor xmm10,xmm10,xmm4 + vpaddb xmm12,xmm11,xmm2 + vpxor xmm11,xmm11,xmm4 + vpaddb xmm13,xmm12,xmm2 + vpxor xmm12,xmm12,xmm4 + vpaddb xmm14,xmm13,xmm2 + vpxor xmm13,xmm13,xmm4 + vpaddb xmm1,xmm14,xmm2 + vpxor xmm14,xmm14,xmm4 + jmp NEAR $L$oop_ctr32 + +ALIGN 16 +$L$oop_ctr32: + vaesenc xmm9,xmm9,xmm15 + vaesenc xmm10,xmm10,xmm15 + vaesenc xmm11,xmm11,xmm15 + vaesenc xmm12,xmm12,xmm15 + vaesenc xmm13,xmm13,xmm15 + vaesenc xmm14,xmm14,xmm15 + vmovups xmm15,XMMWORD[r12] + lea r12,[16+r12] + dec r13d + jnz NEAR $L$oop_ctr32 + + vmovdqu xmm3,XMMWORD[r12] + vaesenc xmm9,xmm9,xmm15 + vpxor xmm4,xmm3,XMMWORD[rcx] + vaesenc xmm10,xmm10,xmm15 + vpxor xmm5,xmm3,XMMWORD[16+rcx] + vaesenc xmm11,xmm11,xmm15 + vpxor xmm6,xmm3,XMMWORD[32+rcx] + vaesenc xmm12,xmm12,xmm15 + vpxor xmm8,xmm3,XMMWORD[48+rcx] + vaesenc xmm13,xmm13,xmm15 + vpxor xmm2,xmm3,XMMWORD[64+rcx] + vaesenc xmm14,xmm14,xmm15 + vpxor xmm3,xmm3,XMMWORD[80+rcx] + lea rcx,[96+rcx] + + vaesenclast xmm9,xmm9,xmm4 + vaesenclast xmm10,xmm10,xmm5 + vaesenclast xmm11,xmm11,xmm6 + vaesenclast xmm12,xmm12,xmm8 + vaesenclast xmm13,xmm13,xmm2 + vaesenclast xmm14,xmm14,xmm3 + vmovups XMMWORD[rdx],xmm9 + vmovups XMMWORD[16+rdx],xmm10 + vmovups XMMWORD[32+rdx],xmm11 + vmovups XMMWORD[48+rdx],xmm12 + vmovups XMMWORD[64+rdx],xmm13 + vmovups XMMWORD[80+rdx],xmm14 + lea rdx,[96+rdx] + + ret +ALIGN 32 +$L$handle_ctr32_2: + vpshufb xmm6,xmm1,xmm0 + vmovdqu xmm5,XMMWORD[48+r11] + vpaddd xmm10,xmm6,XMMWORD[64+r11] + vpaddd xmm11,xmm6,xmm5 + vpaddd xmm12,xmm10,xmm5 + vpshufb xmm10,xmm10,xmm0 + vpaddd xmm13,xmm11,xmm5 + vpshufb xmm11,xmm11,xmm0 + vpxor xmm10,xmm10,xmm4 + vpaddd xmm14,xmm12,xmm5 + vpshufb xmm12,xmm12,xmm0 + vpxor xmm11,xmm11,xmm4 + vpaddd xmm1,xmm13,xmm5 + vpshufb xmm13,xmm13,xmm0 + vpxor xmm12,xmm12,xmm4 + vpshufb xmm14,xmm14,xmm0 + vpxor xmm13,xmm13,xmm4 + vpshufb xmm1,xmm1,xmm0 + vpxor xmm14,xmm14,xmm4 + jmp NEAR $L$oop_ctr32 + + + +global aesni_gcm_encrypt + +ALIGN 32 +aesni_gcm_encrypt: + +$L$SEH_begin_aesni_gcm_encrypt_1: +_CET_ENDBR +%ifdef BORINGSSL_DISPATCH_TEST +EXTERN BORINGSSL_function_hit + mov 
BYTE[((BORINGSSL_function_hit+2))],1 +%endif + xor rax,rax + + + + + cmp r8,0x60*3 + jb NEAR $L$gcm_enc_abort + + push rbp + +$L$SEH_prologue_aesni_gcm_encrypt_2: + mov rbp,rsp + + push rbx + +$L$SEH_prologue_aesni_gcm_encrypt_3: + push r12 + +$L$SEH_prologue_aesni_gcm_encrypt_4: + push r13 + +$L$SEH_prologue_aesni_gcm_encrypt_5: + push r14 + +$L$SEH_prologue_aesni_gcm_encrypt_6: + push r15 + +$L$SEH_prologue_aesni_gcm_encrypt_7: + lea rsp,[((-168))+rsp] +$L$SEH_prologue_aesni_gcm_encrypt_8: +$L$SEH_prologue_aesni_gcm_encrypt_9: + + + + mov QWORD[16+rbp],rdi +$L$SEH_prologue_aesni_gcm_encrypt_10: + mov QWORD[24+rbp],rsi +$L$SEH_prologue_aesni_gcm_encrypt_11: + mov rdi,QWORD[48+rbp] + mov rsi,QWORD[56+rbp] + + movaps XMMWORD[(-208)+rbp],xmm6 +$L$SEH_prologue_aesni_gcm_encrypt_12: + movaps XMMWORD[(-192)+rbp],xmm7 +$L$SEH_prologue_aesni_gcm_encrypt_13: + movaps XMMWORD[(-176)+rbp],xmm8 +$L$SEH_prologue_aesni_gcm_encrypt_14: + movaps XMMWORD[(-160)+rbp],xmm9 +$L$SEH_prologue_aesni_gcm_encrypt_15: + movaps XMMWORD[(-144)+rbp],xmm10 +$L$SEH_prologue_aesni_gcm_encrypt_16: + movaps XMMWORD[(-128)+rbp],xmm11 +$L$SEH_prologue_aesni_gcm_encrypt_17: + movaps XMMWORD[(-112)+rbp],xmm12 +$L$SEH_prologue_aesni_gcm_encrypt_18: + movaps XMMWORD[(-96)+rbp],xmm13 +$L$SEH_prologue_aesni_gcm_encrypt_19: + movaps XMMWORD[(-80)+rbp],xmm14 +$L$SEH_prologue_aesni_gcm_encrypt_20: + movaps XMMWORD[(-64)+rbp],xmm15 +$L$SEH_prologue_aesni_gcm_encrypt_21: +$L$SEH_endprologue_aesni_gcm_encrypt_22: + vzeroupper + + vmovdqu xmm1,XMMWORD[rdi] + add rsp,-128 + mov ebx,DWORD[12+rdi] + lea r11,[$L$bswap_mask] + lea r14,[((-128))+r9] + mov r15,0xf80 + lea r9,[128+r9] + vmovdqu xmm0,XMMWORD[r11] + and rsp,-128 + mov r10d,DWORD[((240-128))+r9] + + and r14,r15 + and r15,rsp + sub r15,r14 + jc NEAR $L$enc_no_key_aliasing + cmp r15,768 + jnc NEAR $L$enc_no_key_aliasing + sub rsp,r15 +$L$enc_no_key_aliasing: + + mov r14,rdx + + + + + + + + + lea r15,[((-192))+r8*1+rdx] + + shr r8,4 + + call _aesni_ctr32_6x + vpshufb xmm8,xmm9,xmm0 + vpshufb xmm2,xmm10,xmm0 + vmovdqu XMMWORD[112+rsp],xmm8 + vpshufb xmm4,xmm11,xmm0 + vmovdqu XMMWORD[96+rsp],xmm2 + vpshufb xmm5,xmm12,xmm0 + vmovdqu XMMWORD[80+rsp],xmm4 + vpshufb xmm6,xmm13,xmm0 + vmovdqu XMMWORD[64+rsp],xmm5 + vpshufb xmm7,xmm14,xmm0 + vmovdqu XMMWORD[48+rsp],xmm6 + + call _aesni_ctr32_6x + + mov r12,QWORD[64+rbp] + lea rsi,[32+rsi] + vmovdqu xmm8,XMMWORD[r12] + sub r8,12 + mov rax,0x60*2 + vpshufb xmm8,xmm8,xmm0 + + call _aesni_ctr32_ghash_6x + vmovdqu xmm7,XMMWORD[32+rsp] + vmovdqu xmm0,XMMWORD[r11] + vmovdqu xmm3,XMMWORD[((0-32))+rsi] + vpunpckhqdq xmm1,xmm7,xmm7 + vmovdqu xmm15,XMMWORD[((32-32))+rsi] + vmovups XMMWORD[(-96)+rdx],xmm9 + vpshufb xmm9,xmm9,xmm0 + vpxor xmm1,xmm1,xmm7 + vmovups XMMWORD[(-80)+rdx],xmm10 + vpshufb xmm10,xmm10,xmm0 + vmovups XMMWORD[(-64)+rdx],xmm11 + vpshufb xmm11,xmm11,xmm0 + vmovups XMMWORD[(-48)+rdx],xmm12 + vpshufb xmm12,xmm12,xmm0 + vmovups XMMWORD[(-32)+rdx],xmm13 + vpshufb xmm13,xmm13,xmm0 + vmovups XMMWORD[(-16)+rdx],xmm14 + vpshufb xmm14,xmm14,xmm0 + vmovdqu XMMWORD[16+rsp],xmm9 + vmovdqu xmm6,XMMWORD[48+rsp] + vmovdqu xmm0,XMMWORD[((16-32))+rsi] + vpunpckhqdq xmm2,xmm6,xmm6 + vpclmulqdq xmm5,xmm7,xmm3,0x00 + vpxor xmm2,xmm2,xmm6 + vpclmulqdq xmm7,xmm7,xmm3,0x11 + vpclmulqdq xmm1,xmm1,xmm15,0x00 + + vmovdqu xmm9,XMMWORD[64+rsp] + vpclmulqdq xmm4,xmm6,xmm0,0x00 + vmovdqu xmm3,XMMWORD[((48-32))+rsi] + vpxor xmm4,xmm4,xmm5 + vpunpckhqdq xmm5,xmm9,xmm9 + vpclmulqdq xmm6,xmm6,xmm0,0x11 + vpxor xmm5,xmm5,xmm9 + vpxor xmm6,xmm6,xmm7 + vpclmulqdq 
xmm2,xmm2,xmm15,0x10 + vmovdqu xmm15,XMMWORD[((80-32))+rsi] + vpxor xmm2,xmm2,xmm1 + + vmovdqu xmm1,XMMWORD[80+rsp] + vpclmulqdq xmm7,xmm9,xmm3,0x00 + vmovdqu xmm0,XMMWORD[((64-32))+rsi] + vpxor xmm7,xmm7,xmm4 + vpunpckhqdq xmm4,xmm1,xmm1 + vpclmulqdq xmm9,xmm9,xmm3,0x11 + vpxor xmm4,xmm4,xmm1 + vpxor xmm9,xmm9,xmm6 + vpclmulqdq xmm5,xmm5,xmm15,0x00 + vpxor xmm5,xmm5,xmm2 + + vmovdqu xmm2,XMMWORD[96+rsp] + vpclmulqdq xmm6,xmm1,xmm0,0x00 + vmovdqu xmm3,XMMWORD[((96-32))+rsi] + vpxor xmm6,xmm6,xmm7 + vpunpckhqdq xmm7,xmm2,xmm2 + vpclmulqdq xmm1,xmm1,xmm0,0x11 + vpxor xmm7,xmm7,xmm2 + vpxor xmm1,xmm1,xmm9 + vpclmulqdq xmm4,xmm4,xmm15,0x10 + vmovdqu xmm15,XMMWORD[((128-32))+rsi] + vpxor xmm4,xmm4,xmm5 + + vpxor xmm8,xmm8,XMMWORD[112+rsp] + vpclmulqdq xmm5,xmm2,xmm3,0x00 + vmovdqu xmm0,XMMWORD[((112-32))+rsi] + vpunpckhqdq xmm9,xmm8,xmm8 + vpxor xmm5,xmm5,xmm6 + vpclmulqdq xmm2,xmm2,xmm3,0x11 + vpxor xmm9,xmm9,xmm8 + vpxor xmm2,xmm2,xmm1 + vpclmulqdq xmm7,xmm7,xmm15,0x00 + vpxor xmm4,xmm7,xmm4 + + vpclmulqdq xmm6,xmm8,xmm0,0x00 + vmovdqu xmm3,XMMWORD[((0-32))+rsi] + vpunpckhqdq xmm1,xmm14,xmm14 + vpclmulqdq xmm8,xmm8,xmm0,0x11 + vpxor xmm1,xmm1,xmm14 + vpxor xmm5,xmm6,xmm5 + vpclmulqdq xmm9,xmm9,xmm15,0x10 + vmovdqu xmm15,XMMWORD[((32-32))+rsi] + vpxor xmm7,xmm8,xmm2 + vpxor xmm6,xmm9,xmm4 + + vmovdqu xmm0,XMMWORD[((16-32))+rsi] + vpxor xmm9,xmm7,xmm5 + vpclmulqdq xmm4,xmm14,xmm3,0x00 + vpxor xmm6,xmm6,xmm9 + vpunpckhqdq xmm2,xmm13,xmm13 + vpclmulqdq xmm14,xmm14,xmm3,0x11 + vpxor xmm2,xmm2,xmm13 + vpslldq xmm9,xmm6,8 + vpclmulqdq xmm1,xmm1,xmm15,0x00 + vpxor xmm8,xmm5,xmm9 + vpsrldq xmm6,xmm6,8 + vpxor xmm7,xmm7,xmm6 + + vpclmulqdq xmm5,xmm13,xmm0,0x00 + vmovdqu xmm3,XMMWORD[((48-32))+rsi] + vpxor xmm5,xmm5,xmm4 + vpunpckhqdq xmm9,xmm12,xmm12 + vpclmulqdq xmm13,xmm13,xmm0,0x11 + vpxor xmm9,xmm9,xmm12 + vpxor xmm13,xmm13,xmm14 + vpalignr xmm14,xmm8,xmm8,8 + vpclmulqdq xmm2,xmm2,xmm15,0x10 + vmovdqu xmm15,XMMWORD[((80-32))+rsi] + vpxor xmm2,xmm2,xmm1 + + vpclmulqdq xmm4,xmm12,xmm3,0x00 + vmovdqu xmm0,XMMWORD[((64-32))+rsi] + vpxor xmm4,xmm4,xmm5 + vpunpckhqdq xmm1,xmm11,xmm11 + vpclmulqdq xmm12,xmm12,xmm3,0x11 + vpxor xmm1,xmm1,xmm11 + vpxor xmm12,xmm12,xmm13 + vxorps xmm7,xmm7,XMMWORD[16+rsp] + vpclmulqdq xmm9,xmm9,xmm15,0x00 + vpxor xmm9,xmm9,xmm2 + + vpclmulqdq xmm8,xmm8,XMMWORD[16+r11],0x10 + vxorps xmm8,xmm8,xmm14 + + vpclmulqdq xmm5,xmm11,xmm0,0x00 + vmovdqu xmm3,XMMWORD[((96-32))+rsi] + vpxor xmm5,xmm5,xmm4 + vpunpckhqdq xmm2,xmm10,xmm10 + vpclmulqdq xmm11,xmm11,xmm0,0x11 + vpxor xmm2,xmm2,xmm10 + vpalignr xmm14,xmm8,xmm8,8 + vpxor xmm11,xmm11,xmm12 + vpclmulqdq xmm1,xmm1,xmm15,0x10 + vmovdqu xmm15,XMMWORD[((128-32))+rsi] + vpxor xmm1,xmm1,xmm9 + + vxorps xmm14,xmm14,xmm7 + vpclmulqdq xmm8,xmm8,XMMWORD[16+r11],0x10 + vxorps xmm8,xmm8,xmm14 + + vpclmulqdq xmm4,xmm10,xmm3,0x00 + vmovdqu xmm0,XMMWORD[((112-32))+rsi] + vpxor xmm4,xmm4,xmm5 + vpunpckhqdq xmm9,xmm8,xmm8 + vpclmulqdq xmm10,xmm10,xmm3,0x11 + vpxor xmm9,xmm9,xmm8 + vpxor xmm10,xmm10,xmm11 + vpclmulqdq xmm2,xmm2,xmm15,0x00 + vpxor xmm2,xmm2,xmm1 + + vpclmulqdq xmm5,xmm8,xmm0,0x00 + vpclmulqdq xmm7,xmm8,xmm0,0x11 + vpxor xmm5,xmm5,xmm4 + vpclmulqdq xmm6,xmm9,xmm15,0x10 + vpxor xmm7,xmm7,xmm10 + vpxor xmm6,xmm6,xmm2 + + vpxor xmm4,xmm7,xmm5 + vpxor xmm6,xmm6,xmm4 + vpslldq xmm1,xmm6,8 + vmovdqu xmm3,XMMWORD[16+r11] + vpsrldq xmm6,xmm6,8 + vpxor xmm8,xmm5,xmm1 + vpxor xmm7,xmm7,xmm6 + + vpalignr xmm2,xmm8,xmm8,8 + vpclmulqdq xmm8,xmm8,xmm3,0x10 + vpxor xmm8,xmm8,xmm2 + + vpalignr xmm2,xmm8,xmm8,8 + vpclmulqdq xmm8,xmm8,xmm3,0x10 + 
vpxor xmm2,xmm2,xmm7 + vpxor xmm8,xmm8,xmm2 + mov r12,QWORD[64+rbp] + vpshufb xmm8,xmm8,XMMWORD[r11] + vmovdqu XMMWORD[r12],xmm8 + + vzeroupper + movaps xmm6,XMMWORD[((-208))+rbp] + movaps xmm7,XMMWORD[((-192))+rbp] + movaps xmm8,XMMWORD[((-176))+rbp] + movaps xmm9,XMMWORD[((-160))+rbp] + movaps xmm10,XMMWORD[((-144))+rbp] + movaps xmm11,XMMWORD[((-128))+rbp] + movaps xmm12,XMMWORD[((-112))+rbp] + movaps xmm13,XMMWORD[((-96))+rbp] + movaps xmm14,XMMWORD[((-80))+rbp] + movaps xmm15,XMMWORD[((-64))+rbp] + mov rdi,QWORD[16+rbp] + mov rsi,QWORD[24+rbp] + lea rsp,[((-40))+rbp] + + pop r15 + + pop r14 + + pop r13 + + pop r12 + + pop rbx + + pop rbp + +$L$gcm_enc_abort: + ret +$L$SEH_end_aesni_gcm_encrypt_23: + + +section .rdata rdata align=8 +ALIGN 64 +$L$bswap_mask: + DB 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 +$L$poly: + DB 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 +$L$one_msb: + DB 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 +$L$two_lsb: + DB 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +$L$one_lsb: + DB 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 + DB 65,69,83,45,78,73,32,71,67,77,32,109,111,100,117,108 + DB 101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82 + DB 89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112 + DB 114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +ALIGN 64 +section .text + +section .pdata rdata align=4 +ALIGN 4 + DD $L$SEH_begin_aesni_gcm_decrypt_1 wrt ..imagebase + DD $L$SEH_end_aesni_gcm_decrypt_23 wrt ..imagebase + DD $L$SEH_info_aesni_gcm_decrypt_0 wrt ..imagebase + + DD $L$SEH_begin_aesni_gcm_encrypt_1 wrt ..imagebase + DD $L$SEH_end_aesni_gcm_encrypt_23 wrt ..imagebase + DD $L$SEH_info_aesni_gcm_encrypt_0 wrt ..imagebase + + +section .xdata rdata align=8 +ALIGN 4 +$L$SEH_info_aesni_gcm_decrypt_0: + DB 1 + DB $L$SEH_endprologue_aesni_gcm_decrypt_22-$L$SEH_begin_aesni_gcm_decrypt_1 + DB 33 + DB 213 + DB $L$SEH_prologue_aesni_gcm_decrypt_21-$L$SEH_begin_aesni_gcm_decrypt_1 + DB 248 + DW 9 + DB $L$SEH_prologue_aesni_gcm_decrypt_20-$L$SEH_begin_aesni_gcm_decrypt_1 + DB 232 + DW 8 + DB $L$SEH_prologue_aesni_gcm_decrypt_19-$L$SEH_begin_aesni_gcm_decrypt_1 + DB 216 + DW 7 + DB $L$SEH_prologue_aesni_gcm_decrypt_18-$L$SEH_begin_aesni_gcm_decrypt_1 + DB 200 + DW 6 + DB $L$SEH_prologue_aesni_gcm_decrypt_17-$L$SEH_begin_aesni_gcm_decrypt_1 + DB 184 + DW 5 + DB $L$SEH_prologue_aesni_gcm_decrypt_16-$L$SEH_begin_aesni_gcm_decrypt_1 + DB 168 + DW 4 + DB $L$SEH_prologue_aesni_gcm_decrypt_15-$L$SEH_begin_aesni_gcm_decrypt_1 + DB 152 + DW 3 + DB $L$SEH_prologue_aesni_gcm_decrypt_14-$L$SEH_begin_aesni_gcm_decrypt_1 + DB 136 + DW 2 + DB $L$SEH_prologue_aesni_gcm_decrypt_13-$L$SEH_begin_aesni_gcm_decrypt_1 + DB 120 + DW 1 + DB $L$SEH_prologue_aesni_gcm_decrypt_12-$L$SEH_begin_aesni_gcm_decrypt_1 + DB 104 + DW 0 + DB $L$SEH_prologue_aesni_gcm_decrypt_11-$L$SEH_begin_aesni_gcm_decrypt_1 + DB 100 + DW 29 + DB $L$SEH_prologue_aesni_gcm_decrypt_10-$L$SEH_begin_aesni_gcm_decrypt_1 + DB 116 + DW 28 + DB $L$SEH_prologue_aesni_gcm_decrypt_9-$L$SEH_begin_aesni_gcm_decrypt_1 + DB 3 + DB $L$SEH_prologue_aesni_gcm_decrypt_8-$L$SEH_begin_aesni_gcm_decrypt_1 + DB 1 + DW 21 + DB $L$SEH_prologue_aesni_gcm_decrypt_7-$L$SEH_begin_aesni_gcm_decrypt_1 + DB 240 + DB $L$SEH_prologue_aesni_gcm_decrypt_6-$L$SEH_begin_aesni_gcm_decrypt_1 + DB 224 + DB $L$SEH_prologue_aesni_gcm_decrypt_5-$L$SEH_begin_aesni_gcm_decrypt_1 + DB 208 + DB $L$SEH_prologue_aesni_gcm_decrypt_4-$L$SEH_begin_aesni_gcm_decrypt_1 + DB 192 + DB $L$SEH_prologue_aesni_gcm_decrypt_3-$L$SEH_begin_aesni_gcm_decrypt_1 + DB 48 + DB 
$L$SEH_prologue_aesni_gcm_decrypt_2-$L$SEH_begin_aesni_gcm_decrypt_1 + DB 80 + + DW 0 +$L$SEH_info_aesni_gcm_encrypt_0: + DB 1 + DB $L$SEH_endprologue_aesni_gcm_encrypt_22-$L$SEH_begin_aesni_gcm_encrypt_1 + DB 33 + DB 213 + DB $L$SEH_prologue_aesni_gcm_encrypt_21-$L$SEH_begin_aesni_gcm_encrypt_1 + DB 248 + DW 9 + DB $L$SEH_prologue_aesni_gcm_encrypt_20-$L$SEH_begin_aesni_gcm_encrypt_1 + DB 232 + DW 8 + DB $L$SEH_prologue_aesni_gcm_encrypt_19-$L$SEH_begin_aesni_gcm_encrypt_1 + DB 216 + DW 7 + DB $L$SEH_prologue_aesni_gcm_encrypt_18-$L$SEH_begin_aesni_gcm_encrypt_1 + DB 200 + DW 6 + DB $L$SEH_prologue_aesni_gcm_encrypt_17-$L$SEH_begin_aesni_gcm_encrypt_1 + DB 184 + DW 5 + DB $L$SEH_prologue_aesni_gcm_encrypt_16-$L$SEH_begin_aesni_gcm_encrypt_1 + DB 168 + DW 4 + DB $L$SEH_prologue_aesni_gcm_encrypt_15-$L$SEH_begin_aesni_gcm_encrypt_1 + DB 152 + DW 3 + DB $L$SEH_prologue_aesni_gcm_encrypt_14-$L$SEH_begin_aesni_gcm_encrypt_1 + DB 136 + DW 2 + DB $L$SEH_prologue_aesni_gcm_encrypt_13-$L$SEH_begin_aesni_gcm_encrypt_1 + DB 120 + DW 1 + DB $L$SEH_prologue_aesni_gcm_encrypt_12-$L$SEH_begin_aesni_gcm_encrypt_1 + DB 104 + DW 0 + DB $L$SEH_prologue_aesni_gcm_encrypt_11-$L$SEH_begin_aesni_gcm_encrypt_1 + DB 100 + DW 29 + DB $L$SEH_prologue_aesni_gcm_encrypt_10-$L$SEH_begin_aesni_gcm_encrypt_1 + DB 116 + DW 28 + DB $L$SEH_prologue_aesni_gcm_encrypt_9-$L$SEH_begin_aesni_gcm_encrypt_1 + DB 3 + DB $L$SEH_prologue_aesni_gcm_encrypt_8-$L$SEH_begin_aesni_gcm_encrypt_1 + DB 1 + DW 21 + DB $L$SEH_prologue_aesni_gcm_encrypt_7-$L$SEH_begin_aesni_gcm_encrypt_1 + DB 240 + DB $L$SEH_prologue_aesni_gcm_encrypt_6-$L$SEH_begin_aesni_gcm_encrypt_1 + DB 224 + DB $L$SEH_prologue_aesni_gcm_encrypt_5-$L$SEH_begin_aesni_gcm_encrypt_1 + DB 208 + DB $L$SEH_prologue_aesni_gcm_encrypt_4-$L$SEH_begin_aesni_gcm_encrypt_1 + DB 192 + DB $L$SEH_prologue_aesni_gcm_encrypt_3-$L$SEH_begin_aesni_gcm_encrypt_1 + DB 48 + DB $L$SEH_prologue_aesni_gcm_encrypt_2-$L$SEH_begin_aesni_gcm_encrypt_1 + DB 80 + + DW 0 +%else +; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 +ret +%endif diff --git a/ring-0.17.14/pregenerated/aesni-gcm-x86_64-nasm.o b/ring-0.17.14/pregenerated/aesni-gcm-x86_64-nasm.o new file mode 100644 index 0000000000..4aca940369 Binary files /dev/null and b/ring-0.17.14/pregenerated/aesni-gcm-x86_64-nasm.o differ diff --git a/ring-0.17.14/pregenerated/aesni-x86-elf.S b/ring-0.17.14/pregenerated/aesni-x86-elf.S new file mode 100644 index 0000000000..20a5cdf89f --- /dev/null +++ b/ring-0.17.14/pregenerated/aesni-x86-elf.S @@ -0,0 +1,720 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__) +.text +#ifdef BORINGSSL_DISPATCH_TEST +#endif +.hidden _aesni_encrypt2 +.type _aesni_encrypt2,@function +.align 16 +_aesni_encrypt2: + movups (%edx),%xmm0 + shll $4,%ecx + movups 16(%edx),%xmm1 + xorps %xmm0,%xmm2 + pxor %xmm0,%xmm3 + movups 32(%edx),%xmm0 + leal 32(%edx,%ecx,1),%edx + negl %ecx + addl $16,%ecx +.L000enc2_loop: +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 + movups -16(%edx,%ecx,1),%xmm0 + jnz .L000enc2_loop +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,221,208 +.byte 102,15,56,221,216 + ret +.size _aesni_encrypt2,.-_aesni_encrypt2 +.hidden _aesni_encrypt3 +.type _aesni_encrypt3,@function +.align 16 +_aesni_encrypt3: + movups (%edx),%xmm0 + shll $4,%ecx + movups 16(%edx),%xmm1 + xorps %xmm0,%xmm2 + pxor %xmm0,%xmm3 + pxor %xmm0,%xmm4 + movups 32(%edx),%xmm0 + leal 32(%edx,%ecx,1),%edx + negl %ecx + addl $16,%ecx +.L001enc3_loop: +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 102,15,56,220,224 + movups -16(%edx,%ecx,1),%xmm0 + jnz .L001enc3_loop +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,221,208 +.byte 102,15,56,221,216 +.byte 102,15,56,221,224 + ret +.size _aesni_encrypt3,.-_aesni_encrypt3 +.hidden _aesni_encrypt4 +.type _aesni_encrypt4,@function +.align 16 +_aesni_encrypt4: + movups (%edx),%xmm0 + movups 16(%edx),%xmm1 + shll $4,%ecx + xorps %xmm0,%xmm2 + pxor %xmm0,%xmm3 + pxor %xmm0,%xmm4 + pxor %xmm0,%xmm5 + movups 32(%edx),%xmm0 + leal 32(%edx,%ecx,1),%edx + negl %ecx +.byte 15,31,64,0 + addl $16,%ecx +.L002enc4_loop: +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 + movups -16(%edx,%ecx,1),%xmm0 + jnz .L002enc4_loop +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 +.byte 102,15,56,221,208 +.byte 102,15,56,221,216 +.byte 102,15,56,221,224 +.byte 102,15,56,221,232 + ret +.size _aesni_encrypt4,.-_aesni_encrypt4 +.hidden _aesni_encrypt6 +.type _aesni_encrypt6,@function +.align 16 +_aesni_encrypt6: + movups (%edx),%xmm0 + shll $4,%ecx + movups 16(%edx),%xmm1 + xorps %xmm0,%xmm2 + pxor %xmm0,%xmm3 + pxor %xmm0,%xmm4 +.byte 102,15,56,220,209 + pxor %xmm0,%xmm5 + pxor %xmm0,%xmm6 +.byte 102,15,56,220,217 + leal 32(%edx,%ecx,1),%edx + negl %ecx +.byte 102,15,56,220,225 + pxor %xmm0,%xmm7 + movups (%edx,%ecx,1),%xmm0 + addl $16,%ecx + jmp .L003_aesni_encrypt6_inner +.align 16 +.L004enc6_loop: +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.L003_aesni_encrypt6_inner: +.byte 102,15,56,220,233 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 +.L_aesni_encrypt6_enter: + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 + movups -16(%edx,%ecx,1),%xmm0 + jnz .L004enc6_loop +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 +.byte 102,15,56,221,208 +.byte 102,15,56,221,216 +.byte 
102,15,56,221,224 +.byte 102,15,56,221,232 +.byte 102,15,56,221,240 +.byte 102,15,56,221,248 + ret +.size _aesni_encrypt6,.-_aesni_encrypt6 +.globl aes_hw_ctr32_encrypt_blocks +.hidden aes_hw_ctr32_encrypt_blocks +.type aes_hw_ctr32_encrypt_blocks,@function +.align 16 +aes_hw_ctr32_encrypt_blocks: +.L_aes_hw_ctr32_encrypt_blocks_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi +#ifdef BORINGSSL_DISPATCH_TEST + pushl %ebx + pushl %edx + call .L005pic_for_function_hit +.L005pic_for_function_hit: + popl %ebx + leal BORINGSSL_function_hit+0-.L005pic_for_function_hit(%ebx),%ebx + movl $1,%edx + movb %dl,(%ebx) + popl %edx + popl %ebx +#endif + movl 20(%esp),%esi + movl 24(%esp),%edi + movl 28(%esp),%eax + movl 32(%esp),%edx + movl 36(%esp),%ebx + movl %esp,%ebp + subl $88,%esp + andl $-16,%esp + movl %ebp,80(%esp) + cmpl $1,%eax + je .L006ctr32_one_shortcut + movdqu (%ebx),%xmm7 + movl $202182159,(%esp) + movl $134810123,4(%esp) + movl $67438087,8(%esp) + movl $66051,12(%esp) + movl $6,%ecx + xorl %ebp,%ebp + movl %ecx,16(%esp) + movl %ecx,20(%esp) + movl %ecx,24(%esp) + movl %ebp,28(%esp) +.byte 102,15,58,22,251,3 +.byte 102,15,58,34,253,3 + movl 240(%edx),%ecx + bswap %ebx + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + movdqa (%esp),%xmm2 +.byte 102,15,58,34,195,0 + leal 3(%ebx),%ebp +.byte 102,15,58,34,205,0 + incl %ebx +.byte 102,15,58,34,195,1 + incl %ebp +.byte 102,15,58,34,205,1 + incl %ebx +.byte 102,15,58,34,195,2 + incl %ebp +.byte 102,15,58,34,205,2 + movdqa %xmm0,48(%esp) +.byte 102,15,56,0,194 + movdqu (%edx),%xmm6 + movdqa %xmm1,64(%esp) +.byte 102,15,56,0,202 + pshufd $192,%xmm0,%xmm2 + pshufd $128,%xmm0,%xmm3 + cmpl $6,%eax + jb .L007ctr32_tail + pxor %xmm6,%xmm7 + shll $4,%ecx + movl $16,%ebx + movdqa %xmm7,32(%esp) + movl %edx,%ebp + subl %ecx,%ebx + leal 32(%edx,%ecx,1),%edx + subl $6,%eax + jmp .L008ctr32_loop6 +.align 16 +.L008ctr32_loop6: + pshufd $64,%xmm0,%xmm4 + movdqa 32(%esp),%xmm0 + pshufd $192,%xmm1,%xmm5 + pxor %xmm0,%xmm2 + pshufd $128,%xmm1,%xmm6 + pxor %xmm0,%xmm3 + pshufd $64,%xmm1,%xmm7 + movups 16(%ebp),%xmm1 + pxor %xmm0,%xmm4 + pxor %xmm0,%xmm5 +.byte 102,15,56,220,209 + pxor %xmm0,%xmm6 + pxor %xmm0,%xmm7 +.byte 102,15,56,220,217 + movups 32(%ebp),%xmm0 + movl %ebx,%ecx +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 + call .L_aesni_encrypt6_enter + movups (%esi),%xmm1 + movups 16(%esi),%xmm0 + xorps %xmm1,%xmm2 + movups 32(%esi),%xmm1 + xorps %xmm0,%xmm3 + movups %xmm2,(%edi) + movdqa 16(%esp),%xmm0 + xorps %xmm1,%xmm4 + movdqa 64(%esp),%xmm1 + movups %xmm3,16(%edi) + movups %xmm4,32(%edi) + paddd %xmm0,%xmm1 + paddd 48(%esp),%xmm0 + movdqa (%esp),%xmm2 + movups 48(%esi),%xmm3 + movups 64(%esi),%xmm4 + xorps %xmm3,%xmm5 + movups 80(%esi),%xmm3 + leal 96(%esi),%esi + movdqa %xmm0,48(%esp) +.byte 102,15,56,0,194 + xorps %xmm4,%xmm6 + movups %xmm5,48(%edi) + xorps %xmm3,%xmm7 + movdqa %xmm1,64(%esp) +.byte 102,15,56,0,202 + movups %xmm6,64(%edi) + pshufd $192,%xmm0,%xmm2 + movups %xmm7,80(%edi) + leal 96(%edi),%edi + pshufd $128,%xmm0,%xmm3 + subl $6,%eax + jnc .L008ctr32_loop6 + addl $6,%eax + jz .L009ctr32_ret + movdqu (%ebp),%xmm7 + movl %ebp,%edx + pxor 32(%esp),%xmm7 + movl 240(%ebp),%ecx +.L007ctr32_tail: + por %xmm7,%xmm2 + cmpl $2,%eax + jb .L010ctr32_one + pshufd $64,%xmm0,%xmm4 + por %xmm7,%xmm3 + je .L011ctr32_two + pshufd $192,%xmm1,%xmm5 + por %xmm7,%xmm4 + cmpl $4,%eax + jb .L012ctr32_three + pshufd $128,%xmm1,%xmm6 + por %xmm7,%xmm5 + je .L013ctr32_four + por %xmm7,%xmm6 + call 
_aesni_encrypt6 + movups (%esi),%xmm1 + movups 16(%esi),%xmm0 + xorps %xmm1,%xmm2 + movups 32(%esi),%xmm1 + xorps %xmm0,%xmm3 + movups 48(%esi),%xmm0 + xorps %xmm1,%xmm4 + movups 64(%esi),%xmm1 + xorps %xmm0,%xmm5 + movups %xmm2,(%edi) + xorps %xmm1,%xmm6 + movups %xmm3,16(%edi) + movups %xmm4,32(%edi) + movups %xmm5,48(%edi) + movups %xmm6,64(%edi) + jmp .L009ctr32_ret +.align 16 +.L006ctr32_one_shortcut: + movups (%ebx),%xmm2 + movl 240(%edx),%ecx +.L010ctr32_one: + movups (%edx),%xmm0 + movups 16(%edx),%xmm1 + leal 32(%edx),%edx + xorps %xmm0,%xmm2 +.L014enc1_loop_1: +.byte 102,15,56,220,209 + decl %ecx + movups (%edx),%xmm1 + leal 16(%edx),%edx + jnz .L014enc1_loop_1 +.byte 102,15,56,221,209 + movups (%esi),%xmm6 + xorps %xmm2,%xmm6 + movups %xmm6,(%edi) + jmp .L009ctr32_ret +.align 16 +.L011ctr32_two: + call _aesni_encrypt2 + movups (%esi),%xmm5 + movups 16(%esi),%xmm6 + xorps %xmm5,%xmm2 + xorps %xmm6,%xmm3 + movups %xmm2,(%edi) + movups %xmm3,16(%edi) + jmp .L009ctr32_ret +.align 16 +.L012ctr32_three: + call _aesni_encrypt3 + movups (%esi),%xmm5 + movups 16(%esi),%xmm6 + xorps %xmm5,%xmm2 + movups 32(%esi),%xmm7 + xorps %xmm6,%xmm3 + movups %xmm2,(%edi) + xorps %xmm7,%xmm4 + movups %xmm3,16(%edi) + movups %xmm4,32(%edi) + jmp .L009ctr32_ret +.align 16 +.L013ctr32_four: + call _aesni_encrypt4 + movups (%esi),%xmm6 + movups 16(%esi),%xmm7 + movups 32(%esi),%xmm1 + xorps %xmm6,%xmm2 + movups 48(%esi),%xmm0 + xorps %xmm7,%xmm3 + movups %xmm2,(%edi) + xorps %xmm1,%xmm4 + movups %xmm3,16(%edi) + xorps %xmm0,%xmm5 + movups %xmm4,32(%edi) + movups %xmm5,48(%edi) +.L009ctr32_ret: + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + movdqa %xmm0,32(%esp) + pxor %xmm5,%xmm5 + movdqa %xmm0,48(%esp) + pxor %xmm6,%xmm6 + movdqa %xmm0,64(%esp) + pxor %xmm7,%xmm7 + movl 80(%esp),%esp + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size aes_hw_ctr32_encrypt_blocks,.-.L_aes_hw_ctr32_encrypt_blocks_begin +.globl aes_hw_set_encrypt_key_base +.hidden aes_hw_set_encrypt_key_base +.type aes_hw_set_encrypt_key_base,@function +.align 16 +aes_hw_set_encrypt_key_base: +.L_aes_hw_set_encrypt_key_base_begin: +#ifdef BORINGSSL_DISPATCH_TEST + pushl %ebx + pushl %edx + call .L015pic_for_function_hit +.L015pic_for_function_hit: + popl %ebx + leal BORINGSSL_function_hit+3-.L015pic_for_function_hit(%ebx),%ebx + movl $1,%edx + movb %dl,(%ebx) + popl %edx + popl %ebx +#endif + movl 4(%esp),%eax + movl 8(%esp),%ecx + movl 12(%esp),%edx + pushl %ebx + call .L016pic +.L016pic: + popl %ebx + leal .Lkey_const-.L016pic(%ebx),%ebx + movups (%eax),%xmm0 + xorps %xmm4,%xmm4 + leal 16(%edx),%edx + cmpl $256,%ecx + je .L01714rounds + cmpl $128,%ecx + jne .L018bad_keybits +.align 16 +.L01910rounds: + movl $9,%ecx + movups %xmm0,-16(%edx) +.byte 102,15,58,223,200,1 + call .L020key_128_cold +.byte 102,15,58,223,200,2 + call .L021key_128 +.byte 102,15,58,223,200,4 + call .L021key_128 +.byte 102,15,58,223,200,8 + call .L021key_128 +.byte 102,15,58,223,200,16 + call .L021key_128 +.byte 102,15,58,223,200,32 + call .L021key_128 +.byte 102,15,58,223,200,64 + call .L021key_128 +.byte 102,15,58,223,200,128 + call .L021key_128 +.byte 102,15,58,223,200,27 + call .L021key_128 +.byte 102,15,58,223,200,54 + call .L021key_128 + movups %xmm0,(%edx) + movl %ecx,80(%edx) + jmp .L022good_key +.align 16 +.L021key_128: + movups %xmm0,(%edx) + leal 16(%edx),%edx +.L020key_128_cold: + shufps $16,%xmm0,%xmm4 + xorps %xmm4,%xmm0 + shufps $140,%xmm0,%xmm4 + xorps %xmm4,%xmm0 + shufps $255,%xmm1,%xmm1 + 
xorps %xmm1,%xmm0 + ret +.align 16 +.L01714rounds: + movups 16(%eax),%xmm2 + leal 16(%edx),%edx + movl $13,%ecx + movups %xmm0,-32(%edx) + movups %xmm2,-16(%edx) +.byte 102,15,58,223,202,1 + call .L023key_256a_cold +.byte 102,15,58,223,200,1 + call .L024key_256b +.byte 102,15,58,223,202,2 + call .L025key_256a +.byte 102,15,58,223,200,2 + call .L024key_256b +.byte 102,15,58,223,202,4 + call .L025key_256a +.byte 102,15,58,223,200,4 + call .L024key_256b +.byte 102,15,58,223,202,8 + call .L025key_256a +.byte 102,15,58,223,200,8 + call .L024key_256b +.byte 102,15,58,223,202,16 + call .L025key_256a +.byte 102,15,58,223,200,16 + call .L024key_256b +.byte 102,15,58,223,202,32 + call .L025key_256a +.byte 102,15,58,223,200,32 + call .L024key_256b +.byte 102,15,58,223,202,64 + call .L025key_256a + movups %xmm0,(%edx) + movl %ecx,16(%edx) + xorl %eax,%eax + jmp .L022good_key +.align 16 +.L025key_256a: + movups %xmm2,(%edx) + leal 16(%edx),%edx +.L023key_256a_cold: + shufps $16,%xmm0,%xmm4 + xorps %xmm4,%xmm0 + shufps $140,%xmm0,%xmm4 + xorps %xmm4,%xmm0 + shufps $255,%xmm1,%xmm1 + xorps %xmm1,%xmm0 + ret +.align 16 +.L024key_256b: + movups %xmm0,(%edx) + leal 16(%edx),%edx + shufps $16,%xmm2,%xmm4 + xorps %xmm4,%xmm2 + shufps $140,%xmm2,%xmm4 + xorps %xmm4,%xmm2 + shufps $170,%xmm1,%xmm1 + xorps %xmm1,%xmm2 + ret +.L022good_key: + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + xorl %eax,%eax + popl %ebx + ret +.align 4 +.L018bad_keybits: + pxor %xmm0,%xmm0 + movl $-2,%eax + popl %ebx + ret +.size aes_hw_set_encrypt_key_base,.-.L_aes_hw_set_encrypt_key_base_begin +.globl aes_hw_set_encrypt_key_alt +.hidden aes_hw_set_encrypt_key_alt +.type aes_hw_set_encrypt_key_alt,@function +.align 16 +aes_hw_set_encrypt_key_alt: +.L_aes_hw_set_encrypt_key_alt_begin: +#ifdef BORINGSSL_DISPATCH_TEST + pushl %ebx + pushl %edx + call .L026pic_for_function_hit +.L026pic_for_function_hit: + popl %ebx + leal BORINGSSL_function_hit+3-.L026pic_for_function_hit(%ebx),%ebx + movl $1,%edx + movb %dl,(%ebx) + popl %edx + popl %ebx +#endif + movl 4(%esp),%eax + movl 8(%esp),%ecx + movl 12(%esp),%edx + pushl %ebx + call .L027pic +.L027pic: + popl %ebx + leal .Lkey_const-.L027pic(%ebx),%ebx + movups (%eax),%xmm0 + xorps %xmm4,%xmm4 + leal 16(%edx),%edx + cmpl $256,%ecx + je .L02814rounds_alt + cmpl $128,%ecx + jne .L029bad_keybits +.align 16 +.L03010rounds_alt: + movdqa (%ebx),%xmm5 + movl $8,%ecx + movdqa 32(%ebx),%xmm4 + movdqa %xmm0,%xmm2 + movdqu %xmm0,-16(%edx) +.L031loop_key128: +.byte 102,15,56,0,197 +.byte 102,15,56,221,196 + pslld $1,%xmm4 + leal 16(%edx),%edx + movdqa %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm3,%xmm2 + pxor %xmm2,%xmm0 + movdqu %xmm0,-16(%edx) + movdqa %xmm0,%xmm2 + decl %ecx + jnz .L031loop_key128 + movdqa 48(%ebx),%xmm4 +.byte 102,15,56,0,197 +.byte 102,15,56,221,196 + pslld $1,%xmm4 + movdqa %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm3,%xmm2 + pxor %xmm2,%xmm0 + movdqu %xmm0,(%edx) + movdqa %xmm0,%xmm2 +.byte 102,15,56,0,197 +.byte 102,15,56,221,196 + movdqa %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm3,%xmm2 + pxor %xmm2,%xmm0 + movdqu %xmm0,16(%edx) + movl $9,%ecx + movl %ecx,96(%edx) + jmp .L032good_key +.align 16 +.L02814rounds_alt: + movups 16(%eax),%xmm2 + leal 16(%edx),%edx + movdqa (%ebx),%xmm5 + movdqa 
32(%ebx),%xmm4 + movl $7,%ecx + movdqu %xmm0,-32(%edx) + movdqa %xmm2,%xmm1 + movdqu %xmm2,-16(%edx) +.L033loop_key256: +.byte 102,15,56,0,213 +.byte 102,15,56,221,212 + movdqa %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm3,%xmm0 + pslld $1,%xmm4 + pxor %xmm2,%xmm0 + movdqu %xmm0,(%edx) + decl %ecx + jz .L034done_key256 + pshufd $255,%xmm0,%xmm2 + pxor %xmm3,%xmm3 +.byte 102,15,56,221,211 + movdqa %xmm1,%xmm3 + pslldq $4,%xmm1 + pxor %xmm1,%xmm3 + pslldq $4,%xmm1 + pxor %xmm1,%xmm3 + pslldq $4,%xmm1 + pxor %xmm3,%xmm1 + pxor %xmm1,%xmm2 + movdqu %xmm2,16(%edx) + leal 32(%edx),%edx + movdqa %xmm2,%xmm1 + jmp .L033loop_key256 +.L034done_key256: + movl $13,%ecx + movl %ecx,16(%edx) +.L032good_key: + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + xorl %eax,%eax + popl %ebx + ret +.align 4 +.L029bad_keybits: + pxor %xmm0,%xmm0 + movl $-2,%eax + popl %ebx + ret +.size aes_hw_set_encrypt_key_alt,.-.L_aes_hw_set_encrypt_key_alt_begin +.align 64 +.Lkey_const: +.long 202313229,202313229,202313229,202313229 +.long 67569157,67569157,67569157,67569157 +.long 1,1,1,1 +.long 27,27,27,27 +.byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69 +.byte 83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83 +.byte 32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115 +.byte 115,108,46,111,114,103,62,0 +#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__) diff --git a/ring-0.17.14/pregenerated/aesni-x86-win32n.asm b/ring-0.17.14/pregenerated/aesni-x86-win32n.asm new file mode 100644 index 0000000000..b9fe666eb7 --- /dev/null +++ b/ring-0.17.14/pregenerated/aesni-x86-win32n.asm @@ -0,0 +1,709 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. 
+ +%include "ring_core_generated/prefix_symbols_nasm.inc" +%ifidn __OUTPUT_FORMAT__, win32 +%ifidn __OUTPUT_FORMAT__,obj +section code use32 class=code align=64 +%elifidn __OUTPUT_FORMAT__,win32 +$@feat.00 equ 1 +section .text code align=64 +%else +section .text code +%endif +%ifdef BORINGSSL_DISPATCH_TEST +extern _BORINGSSL_function_hit +%endif +align 16 +__aesni_encrypt2: + movups xmm0,[edx] + shl ecx,4 + movups xmm1,[16+edx] + xorps xmm2,xmm0 + pxor xmm3,xmm0 + movups xmm0,[32+edx] + lea edx,[32+ecx*1+edx] + neg ecx + add ecx,16 +L$000enc2_loop: +db 102,15,56,220,209 +db 102,15,56,220,217 + movups xmm1,[ecx*1+edx] + add ecx,32 +db 102,15,56,220,208 +db 102,15,56,220,216 + movups xmm0,[ecx*1+edx-16] + jnz NEAR L$000enc2_loop +db 102,15,56,220,209 +db 102,15,56,220,217 +db 102,15,56,221,208 +db 102,15,56,221,216 + ret +align 16 +__aesni_encrypt3: + movups xmm0,[edx] + shl ecx,4 + movups xmm1,[16+edx] + xorps xmm2,xmm0 + pxor xmm3,xmm0 + pxor xmm4,xmm0 + movups xmm0,[32+edx] + lea edx,[32+ecx*1+edx] + neg ecx + add ecx,16 +L$001enc3_loop: +db 102,15,56,220,209 +db 102,15,56,220,217 +db 102,15,56,220,225 + movups xmm1,[ecx*1+edx] + add ecx,32 +db 102,15,56,220,208 +db 102,15,56,220,216 +db 102,15,56,220,224 + movups xmm0,[ecx*1+edx-16] + jnz NEAR L$001enc3_loop +db 102,15,56,220,209 +db 102,15,56,220,217 +db 102,15,56,220,225 +db 102,15,56,221,208 +db 102,15,56,221,216 +db 102,15,56,221,224 + ret +align 16 +__aesni_encrypt4: + movups xmm0,[edx] + movups xmm1,[16+edx] + shl ecx,4 + xorps xmm2,xmm0 + pxor xmm3,xmm0 + pxor xmm4,xmm0 + pxor xmm5,xmm0 + movups xmm0,[32+edx] + lea edx,[32+ecx*1+edx] + neg ecx +db 15,31,64,0 + add ecx,16 +L$002enc4_loop: +db 102,15,56,220,209 +db 102,15,56,220,217 +db 102,15,56,220,225 +db 102,15,56,220,233 + movups xmm1,[ecx*1+edx] + add ecx,32 +db 102,15,56,220,208 +db 102,15,56,220,216 +db 102,15,56,220,224 +db 102,15,56,220,232 + movups xmm0,[ecx*1+edx-16] + jnz NEAR L$002enc4_loop +db 102,15,56,220,209 +db 102,15,56,220,217 +db 102,15,56,220,225 +db 102,15,56,220,233 +db 102,15,56,221,208 +db 102,15,56,221,216 +db 102,15,56,221,224 +db 102,15,56,221,232 + ret +align 16 +__aesni_encrypt6: + movups xmm0,[edx] + shl ecx,4 + movups xmm1,[16+edx] + xorps xmm2,xmm0 + pxor xmm3,xmm0 + pxor xmm4,xmm0 +db 102,15,56,220,209 + pxor xmm5,xmm0 + pxor xmm6,xmm0 +db 102,15,56,220,217 + lea edx,[32+ecx*1+edx] + neg ecx +db 102,15,56,220,225 + pxor xmm7,xmm0 + movups xmm0,[ecx*1+edx] + add ecx,16 + jmp NEAR L$003_aesni_encrypt6_inner +align 16 +L$004enc6_loop: +db 102,15,56,220,209 +db 102,15,56,220,217 +db 102,15,56,220,225 +L$003_aesni_encrypt6_inner: +db 102,15,56,220,233 +db 102,15,56,220,241 +db 102,15,56,220,249 +L$_aesni_encrypt6_enter: + movups xmm1,[ecx*1+edx] + add ecx,32 +db 102,15,56,220,208 +db 102,15,56,220,216 +db 102,15,56,220,224 +db 102,15,56,220,232 +db 102,15,56,220,240 +db 102,15,56,220,248 + movups xmm0,[ecx*1+edx-16] + jnz NEAR L$004enc6_loop +db 102,15,56,220,209 +db 102,15,56,220,217 +db 102,15,56,220,225 +db 102,15,56,220,233 +db 102,15,56,220,241 +db 102,15,56,220,249 +db 102,15,56,221,208 +db 102,15,56,221,216 +db 102,15,56,221,224 +db 102,15,56,221,232 +db 102,15,56,221,240 +db 102,15,56,221,248 + ret +global _aes_hw_ctr32_encrypt_blocks +align 16 +_aes_hw_ctr32_encrypt_blocks: +L$_aes_hw_ctr32_encrypt_blocks_begin: + push ebp + push ebx + push esi + push edi +%ifdef BORINGSSL_DISPATCH_TEST + push ebx + push edx + call L$005pic_for_function_hit +L$005pic_for_function_hit: + pop ebx + lea 
ebx,[(_BORINGSSL_function_hit+0-L$005pic_for_function_hit)+ebx] + mov edx,1 + mov BYTE [ebx],dl + pop edx + pop ebx +%endif + mov esi,DWORD [20+esp] + mov edi,DWORD [24+esp] + mov eax,DWORD [28+esp] + mov edx,DWORD [32+esp] + mov ebx,DWORD [36+esp] + mov ebp,esp + sub esp,88 + and esp,-16 + mov DWORD [80+esp],ebp + cmp eax,1 + je NEAR L$006ctr32_one_shortcut + movdqu xmm7,[ebx] + mov DWORD [esp],202182159 + mov DWORD [4+esp],134810123 + mov DWORD [8+esp],67438087 + mov DWORD [12+esp],66051 + mov ecx,6 + xor ebp,ebp + mov DWORD [16+esp],ecx + mov DWORD [20+esp],ecx + mov DWORD [24+esp],ecx + mov DWORD [28+esp],ebp +db 102,15,58,22,251,3 +db 102,15,58,34,253,3 + mov ecx,DWORD [240+edx] + bswap ebx + pxor xmm0,xmm0 + pxor xmm1,xmm1 + movdqa xmm2,[esp] +db 102,15,58,34,195,0 + lea ebp,[3+ebx] +db 102,15,58,34,205,0 + inc ebx +db 102,15,58,34,195,1 + inc ebp +db 102,15,58,34,205,1 + inc ebx +db 102,15,58,34,195,2 + inc ebp +db 102,15,58,34,205,2 + movdqa [48+esp],xmm0 +db 102,15,56,0,194 + movdqu xmm6,[edx] + movdqa [64+esp],xmm1 +db 102,15,56,0,202 + pshufd xmm2,xmm0,192 + pshufd xmm3,xmm0,128 + cmp eax,6 + jb NEAR L$007ctr32_tail + pxor xmm7,xmm6 + shl ecx,4 + mov ebx,16 + movdqa [32+esp],xmm7 + mov ebp,edx + sub ebx,ecx + lea edx,[32+ecx*1+edx] + sub eax,6 + jmp NEAR L$008ctr32_loop6 +align 16 +L$008ctr32_loop6: + pshufd xmm4,xmm0,64 + movdqa xmm0,[32+esp] + pshufd xmm5,xmm1,192 + pxor xmm2,xmm0 + pshufd xmm6,xmm1,128 + pxor xmm3,xmm0 + pshufd xmm7,xmm1,64 + movups xmm1,[16+ebp] + pxor xmm4,xmm0 + pxor xmm5,xmm0 +db 102,15,56,220,209 + pxor xmm6,xmm0 + pxor xmm7,xmm0 +db 102,15,56,220,217 + movups xmm0,[32+ebp] + mov ecx,ebx +db 102,15,56,220,225 +db 102,15,56,220,233 +db 102,15,56,220,241 +db 102,15,56,220,249 + call L$_aesni_encrypt6_enter + movups xmm1,[esi] + movups xmm0,[16+esi] + xorps xmm2,xmm1 + movups xmm1,[32+esi] + xorps xmm3,xmm0 + movups [edi],xmm2 + movdqa xmm0,[16+esp] + xorps xmm4,xmm1 + movdqa xmm1,[64+esp] + movups [16+edi],xmm3 + movups [32+edi],xmm4 + paddd xmm1,xmm0 + paddd xmm0,[48+esp] + movdqa xmm2,[esp] + movups xmm3,[48+esi] + movups xmm4,[64+esi] + xorps xmm5,xmm3 + movups xmm3,[80+esi] + lea esi,[96+esi] + movdqa [48+esp],xmm0 +db 102,15,56,0,194 + xorps xmm6,xmm4 + movups [48+edi],xmm5 + xorps xmm7,xmm3 + movdqa [64+esp],xmm1 +db 102,15,56,0,202 + movups [64+edi],xmm6 + pshufd xmm2,xmm0,192 + movups [80+edi],xmm7 + lea edi,[96+edi] + pshufd xmm3,xmm0,128 + sub eax,6 + jnc NEAR L$008ctr32_loop6 + add eax,6 + jz NEAR L$009ctr32_ret + movdqu xmm7,[ebp] + mov edx,ebp + pxor xmm7,[32+esp] + mov ecx,DWORD [240+ebp] +L$007ctr32_tail: + por xmm2,xmm7 + cmp eax,2 + jb NEAR L$010ctr32_one + pshufd xmm4,xmm0,64 + por xmm3,xmm7 + je NEAR L$011ctr32_two + pshufd xmm5,xmm1,192 + por xmm4,xmm7 + cmp eax,4 + jb NEAR L$012ctr32_three + pshufd xmm6,xmm1,128 + por xmm5,xmm7 + je NEAR L$013ctr32_four + por xmm6,xmm7 + call __aesni_encrypt6 + movups xmm1,[esi] + movups xmm0,[16+esi] + xorps xmm2,xmm1 + movups xmm1,[32+esi] + xorps xmm3,xmm0 + movups xmm0,[48+esi] + xorps xmm4,xmm1 + movups xmm1,[64+esi] + xorps xmm5,xmm0 + movups [edi],xmm2 + xorps xmm6,xmm1 + movups [16+edi],xmm3 + movups [32+edi],xmm4 + movups [48+edi],xmm5 + movups [64+edi],xmm6 + jmp NEAR L$009ctr32_ret +align 16 +L$006ctr32_one_shortcut: + movups xmm2,[ebx] + mov ecx,DWORD [240+edx] +L$010ctr32_one: + movups xmm0,[edx] + movups xmm1,[16+edx] + lea edx,[32+edx] + xorps xmm2,xmm0 +L$014enc1_loop_1: +db 102,15,56,220,209 + dec ecx + movups xmm1,[edx] + lea edx,[16+edx] + jnz NEAR L$014enc1_loop_1 +db 
102,15,56,221,209 + movups xmm6,[esi] + xorps xmm6,xmm2 + movups [edi],xmm6 + jmp NEAR L$009ctr32_ret +align 16 +L$011ctr32_two: + call __aesni_encrypt2 + movups xmm5,[esi] + movups xmm6,[16+esi] + xorps xmm2,xmm5 + xorps xmm3,xmm6 + movups [edi],xmm2 + movups [16+edi],xmm3 + jmp NEAR L$009ctr32_ret +align 16 +L$012ctr32_three: + call __aesni_encrypt3 + movups xmm5,[esi] + movups xmm6,[16+esi] + xorps xmm2,xmm5 + movups xmm7,[32+esi] + xorps xmm3,xmm6 + movups [edi],xmm2 + xorps xmm4,xmm7 + movups [16+edi],xmm3 + movups [32+edi],xmm4 + jmp NEAR L$009ctr32_ret +align 16 +L$013ctr32_four: + call __aesni_encrypt4 + movups xmm6,[esi] + movups xmm7,[16+esi] + movups xmm1,[32+esi] + xorps xmm2,xmm6 + movups xmm0,[48+esi] + xorps xmm3,xmm7 + movups [edi],xmm2 + xorps xmm4,xmm1 + movups [16+edi],xmm3 + xorps xmm5,xmm0 + movups [32+edi],xmm4 + movups [48+edi],xmm5 +L$009ctr32_ret: + pxor xmm0,xmm0 + pxor xmm1,xmm1 + pxor xmm2,xmm2 + pxor xmm3,xmm3 + pxor xmm4,xmm4 + movdqa [32+esp],xmm0 + pxor xmm5,xmm5 + movdqa [48+esp],xmm0 + pxor xmm6,xmm6 + movdqa [64+esp],xmm0 + pxor xmm7,xmm7 + mov esp,DWORD [80+esp] + pop edi + pop esi + pop ebx + pop ebp + ret +global _aes_hw_set_encrypt_key_base +align 16 +_aes_hw_set_encrypt_key_base: +L$_aes_hw_set_encrypt_key_base_begin: +%ifdef BORINGSSL_DISPATCH_TEST + push ebx + push edx + call L$015pic_for_function_hit +L$015pic_for_function_hit: + pop ebx + lea ebx,[(_BORINGSSL_function_hit+3-L$015pic_for_function_hit)+ebx] + mov edx,1 + mov BYTE [ebx],dl + pop edx + pop ebx +%endif + mov eax,DWORD [4+esp] + mov ecx,DWORD [8+esp] + mov edx,DWORD [12+esp] + push ebx + call L$016pic +L$016pic: + pop ebx + lea ebx,[(L$key_const-L$016pic)+ebx] + movups xmm0,[eax] + xorps xmm4,xmm4 + lea edx,[16+edx] + cmp ecx,256 + je NEAR L$01714rounds + cmp ecx,128 + jne NEAR L$018bad_keybits +align 16 +L$01910rounds: + mov ecx,9 + movups [edx-16],xmm0 +db 102,15,58,223,200,1 + call L$020key_128_cold +db 102,15,58,223,200,2 + call L$021key_128 +db 102,15,58,223,200,4 + call L$021key_128 +db 102,15,58,223,200,8 + call L$021key_128 +db 102,15,58,223,200,16 + call L$021key_128 +db 102,15,58,223,200,32 + call L$021key_128 +db 102,15,58,223,200,64 + call L$021key_128 +db 102,15,58,223,200,128 + call L$021key_128 +db 102,15,58,223,200,27 + call L$021key_128 +db 102,15,58,223,200,54 + call L$021key_128 + movups [edx],xmm0 + mov DWORD [80+edx],ecx + jmp NEAR L$022good_key +align 16 +L$021key_128: + movups [edx],xmm0 + lea edx,[16+edx] +L$020key_128_cold: + shufps xmm4,xmm0,16 + xorps xmm0,xmm4 + shufps xmm4,xmm0,140 + xorps xmm0,xmm4 + shufps xmm1,xmm1,255 + xorps xmm0,xmm1 + ret +align 16 +L$01714rounds: + movups xmm2,[16+eax] + lea edx,[16+edx] + mov ecx,13 + movups [edx-32],xmm0 + movups [edx-16],xmm2 +db 102,15,58,223,202,1 + call L$023key_256a_cold +db 102,15,58,223,200,1 + call L$024key_256b +db 102,15,58,223,202,2 + call L$025key_256a +db 102,15,58,223,200,2 + call L$024key_256b +db 102,15,58,223,202,4 + call L$025key_256a +db 102,15,58,223,200,4 + call L$024key_256b +db 102,15,58,223,202,8 + call L$025key_256a +db 102,15,58,223,200,8 + call L$024key_256b +db 102,15,58,223,202,16 + call L$025key_256a +db 102,15,58,223,200,16 + call L$024key_256b +db 102,15,58,223,202,32 + call L$025key_256a +db 102,15,58,223,200,32 + call L$024key_256b +db 102,15,58,223,202,64 + call L$025key_256a + movups [edx],xmm0 + mov DWORD [16+edx],ecx + xor eax,eax + jmp NEAR L$022good_key +align 16 +L$025key_256a: + movups [edx],xmm2 + lea edx,[16+edx] +L$023key_256a_cold: + shufps xmm4,xmm0,16 + xorps 
xmm0,xmm4 + shufps xmm4,xmm0,140 + xorps xmm0,xmm4 + shufps xmm1,xmm1,255 + xorps xmm0,xmm1 + ret +align 16 +L$024key_256b: + movups [edx],xmm0 + lea edx,[16+edx] + shufps xmm4,xmm2,16 + xorps xmm2,xmm4 + shufps xmm4,xmm2,140 + xorps xmm2,xmm4 + shufps xmm1,xmm1,170 + xorps xmm2,xmm1 + ret +L$022good_key: + pxor xmm0,xmm0 + pxor xmm1,xmm1 + pxor xmm2,xmm2 + pxor xmm3,xmm3 + pxor xmm4,xmm4 + pxor xmm5,xmm5 + xor eax,eax + pop ebx + ret +align 4 +L$018bad_keybits: + pxor xmm0,xmm0 + mov eax,-2 + pop ebx + ret +global _aes_hw_set_encrypt_key_alt +align 16 +_aes_hw_set_encrypt_key_alt: +L$_aes_hw_set_encrypt_key_alt_begin: +%ifdef BORINGSSL_DISPATCH_TEST + push ebx + push edx + call L$026pic_for_function_hit +L$026pic_for_function_hit: + pop ebx + lea ebx,[(_BORINGSSL_function_hit+3-L$026pic_for_function_hit)+ebx] + mov edx,1 + mov BYTE [ebx],dl + pop edx + pop ebx +%endif + mov eax,DWORD [4+esp] + mov ecx,DWORD [8+esp] + mov edx,DWORD [12+esp] + push ebx + call L$027pic +L$027pic: + pop ebx + lea ebx,[(L$key_const-L$027pic)+ebx] + movups xmm0,[eax] + xorps xmm4,xmm4 + lea edx,[16+edx] + cmp ecx,256 + je NEAR L$02814rounds_alt + cmp ecx,128 + jne NEAR L$029bad_keybits +align 16 +L$03010rounds_alt: + movdqa xmm5,[ebx] + mov ecx,8 + movdqa xmm4,[32+ebx] + movdqa xmm2,xmm0 + movdqu [edx-16],xmm0 +L$031loop_key128: +db 102,15,56,0,197 +db 102,15,56,221,196 + pslld xmm4,1 + lea edx,[16+edx] + movdqa xmm3,xmm2 + pslldq xmm2,4 + pxor xmm3,xmm2 + pslldq xmm2,4 + pxor xmm3,xmm2 + pslldq xmm2,4 + pxor xmm2,xmm3 + pxor xmm0,xmm2 + movdqu [edx-16],xmm0 + movdqa xmm2,xmm0 + dec ecx + jnz NEAR L$031loop_key128 + movdqa xmm4,[48+ebx] +db 102,15,56,0,197 +db 102,15,56,221,196 + pslld xmm4,1 + movdqa xmm3,xmm2 + pslldq xmm2,4 + pxor xmm3,xmm2 + pslldq xmm2,4 + pxor xmm3,xmm2 + pslldq xmm2,4 + pxor xmm2,xmm3 + pxor xmm0,xmm2 + movdqu [edx],xmm0 + movdqa xmm2,xmm0 +db 102,15,56,0,197 +db 102,15,56,221,196 + movdqa xmm3,xmm2 + pslldq xmm2,4 + pxor xmm3,xmm2 + pslldq xmm2,4 + pxor xmm3,xmm2 + pslldq xmm2,4 + pxor xmm2,xmm3 + pxor xmm0,xmm2 + movdqu [16+edx],xmm0 + mov ecx,9 + mov DWORD [96+edx],ecx + jmp NEAR L$032good_key +align 16 +L$02814rounds_alt: + movups xmm2,[16+eax] + lea edx,[16+edx] + movdqa xmm5,[ebx] + movdqa xmm4,[32+ebx] + mov ecx,7 + movdqu [edx-32],xmm0 + movdqa xmm1,xmm2 + movdqu [edx-16],xmm2 +L$033loop_key256: +db 102,15,56,0,213 +db 102,15,56,221,212 + movdqa xmm3,xmm0 + pslldq xmm0,4 + pxor xmm3,xmm0 + pslldq xmm0,4 + pxor xmm3,xmm0 + pslldq xmm0,4 + pxor xmm0,xmm3 + pslld xmm4,1 + pxor xmm0,xmm2 + movdqu [edx],xmm0 + dec ecx + jz NEAR L$034done_key256 + pshufd xmm2,xmm0,255 + pxor xmm3,xmm3 +db 102,15,56,221,211 + movdqa xmm3,xmm1 + pslldq xmm1,4 + pxor xmm3,xmm1 + pslldq xmm1,4 + pxor xmm3,xmm1 + pslldq xmm1,4 + pxor xmm1,xmm3 + pxor xmm2,xmm1 + movdqu [16+edx],xmm2 + lea edx,[32+edx] + movdqa xmm1,xmm2 + jmp NEAR L$033loop_key256 +L$034done_key256: + mov ecx,13 + mov DWORD [16+edx],ecx +L$032good_key: + pxor xmm0,xmm0 + pxor xmm1,xmm1 + pxor xmm2,xmm2 + pxor xmm3,xmm3 + pxor xmm4,xmm4 + pxor xmm5,xmm5 + xor eax,eax + pop ebx + ret +align 4 +L$029bad_keybits: + pxor xmm0,xmm0 + mov eax,-2 + pop ebx + ret +align 64 +L$key_const: +dd 202313229,202313229,202313229,202313229 +dd 67569157,67569157,67569157,67569157 +dd 1,1,1,1 +dd 27,27,27,27 +db 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69 +db 83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83 +db 32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115 +db 115,108,46,111,114,103,62,0 +%else +; Work around 
https://bugzilla.nasm.us/show_bug.cgi?id=3392738 +ret +%endif diff --git a/ring-0.17.14/pregenerated/aesni-x86-win32n.o b/ring-0.17.14/pregenerated/aesni-x86-win32n.o new file mode 100644 index 0000000000..12f7b0f1a7 Binary files /dev/null and b/ring-0.17.14/pregenerated/aesni-x86-win32n.o differ diff --git a/ring-0.17.14/pregenerated/aesni-x86_64-elf.S b/ring-0.17.14/pregenerated/aesni-x86_64-elf.S new file mode 100644 index 0000000000..3504d56a8a --- /dev/null +++ b/ring-0.17.14/pregenerated/aesni-x86_64-elf.S @@ -0,0 +1,1072 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__) +.text +.type _aesni_encrypt2,@function +.align 16 +_aesni_encrypt2: +.cfi_startproc + movups (%rcx),%xmm0 + shll $4,%eax + movups 16(%rcx),%xmm1 + xorps %xmm0,%xmm2 + xorps %xmm0,%xmm3 + movups 32(%rcx),%xmm0 + leaq 32(%rcx,%rax,1),%rcx + negq %rax + addq $16,%rax + +.Lenc_loop2: +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 + movups -16(%rcx,%rax,1),%xmm0 + jnz .Lenc_loop2 + +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,221,208 +.byte 102,15,56,221,216 + ret +.cfi_endproc +.size _aesni_encrypt2,.-_aesni_encrypt2 +.type _aesni_encrypt3,@function +.align 16 +_aesni_encrypt3: +.cfi_startproc + movups (%rcx),%xmm0 + shll $4,%eax + movups 16(%rcx),%xmm1 + xorps %xmm0,%xmm2 + xorps %xmm0,%xmm3 + xorps %xmm0,%xmm4 + movups 32(%rcx),%xmm0 + leaq 32(%rcx,%rax,1),%rcx + negq %rax + addq $16,%rax + +.Lenc_loop3: +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 102,15,56,220,224 + movups -16(%rcx,%rax,1),%xmm0 + jnz .Lenc_loop3 + +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,221,208 +.byte 102,15,56,221,216 +.byte 102,15,56,221,224 + ret +.cfi_endproc +.size _aesni_encrypt3,.-_aesni_encrypt3 +.type _aesni_encrypt4,@function +.align 16 +_aesni_encrypt4: +.cfi_startproc + movups (%rcx),%xmm0 + shll $4,%eax + movups 16(%rcx),%xmm1 + xorps %xmm0,%xmm2 + xorps %xmm0,%xmm3 + xorps %xmm0,%xmm4 + xorps %xmm0,%xmm5 + movups 32(%rcx),%xmm0 + leaq 32(%rcx,%rax,1),%rcx + negq %rax +.byte 0x0f,0x1f,0x00 + addq $16,%rax + +.Lenc_loop4: +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 + movups -16(%rcx,%rax,1),%xmm0 + jnz .Lenc_loop4 + +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 +.byte 102,15,56,221,208 +.byte 102,15,56,221,216 +.byte 102,15,56,221,224 +.byte 102,15,56,221,232 + ret +.cfi_endproc +.size _aesni_encrypt4,.-_aesni_encrypt4 +.type _aesni_encrypt6,@function +.align 16 +_aesni_encrypt6: +.cfi_startproc + movups (%rcx),%xmm0 + shll $4,%eax + movups 16(%rcx),%xmm1 + xorps %xmm0,%xmm2 + pxor %xmm0,%xmm3 + pxor %xmm0,%xmm4 +.byte 102,15,56,220,209 + leaq 32(%rcx,%rax,1),%rcx + negq %rax +.byte 102,15,56,220,217 + pxor %xmm0,%xmm5 + pxor %xmm0,%xmm6 +.byte 102,15,56,220,225 + pxor %xmm0,%xmm7 + movups (%rcx,%rax,1),%xmm0 + addq $16,%rax + jmp .Lenc_loop6_enter +.align 16 +.Lenc_loop6: +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 
102,15,56,220,225 +.Lenc_loop6_enter: +.byte 102,15,56,220,233 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 + movups -16(%rcx,%rax,1),%xmm0 + jnz .Lenc_loop6 + +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 +.byte 102,15,56,221,208 +.byte 102,15,56,221,216 +.byte 102,15,56,221,224 +.byte 102,15,56,221,232 +.byte 102,15,56,221,240 +.byte 102,15,56,221,248 + ret +.cfi_endproc +.size _aesni_encrypt6,.-_aesni_encrypt6 +.type _aesni_encrypt8,@function +.align 16 +_aesni_encrypt8: +.cfi_startproc + movups (%rcx),%xmm0 + shll $4,%eax + movups 16(%rcx),%xmm1 + xorps %xmm0,%xmm2 + xorps %xmm0,%xmm3 + pxor %xmm0,%xmm4 + pxor %xmm0,%xmm5 + pxor %xmm0,%xmm6 + leaq 32(%rcx,%rax,1),%rcx + negq %rax +.byte 102,15,56,220,209 + pxor %xmm0,%xmm7 + pxor %xmm0,%xmm8 +.byte 102,15,56,220,217 + pxor %xmm0,%xmm9 + movups (%rcx,%rax,1),%xmm0 + addq $16,%rax + jmp .Lenc_loop8_inner +.align 16 +.Lenc_loop8: +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.Lenc_loop8_inner: +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 +.byte 102,68,15,56,220,193 +.byte 102,68,15,56,220,201 +.Lenc_loop8_enter: + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 +.byte 102,68,15,56,220,192 +.byte 102,68,15,56,220,200 + movups -16(%rcx,%rax,1),%xmm0 + jnz .Lenc_loop8 + +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 +.byte 102,68,15,56,220,193 +.byte 102,68,15,56,220,201 +.byte 102,15,56,221,208 +.byte 102,15,56,221,216 +.byte 102,15,56,221,224 +.byte 102,15,56,221,232 +.byte 102,15,56,221,240 +.byte 102,15,56,221,248 +.byte 102,68,15,56,221,192 +.byte 102,68,15,56,221,200 + ret +.cfi_endproc +.size _aesni_encrypt8,.-_aesni_encrypt8 +.globl aes_hw_ctr32_encrypt_blocks +.hidden aes_hw_ctr32_encrypt_blocks +.type aes_hw_ctr32_encrypt_blocks,@function +.align 16 +aes_hw_ctr32_encrypt_blocks: +.cfi_startproc +_CET_ENDBR +#ifdef BORINGSSL_DISPATCH_TEST + movb $1,BORINGSSL_function_hit(%rip) +#endif + cmpq $1,%rdx + jne .Lctr32_bulk + + + + movups (%r8),%xmm2 + movups (%rdi),%xmm3 + movl 240(%rcx),%edx + movups (%rcx),%xmm0 + movups 16(%rcx),%xmm1 + leaq 32(%rcx),%rcx + xorps %xmm0,%xmm2 +.Loop_enc1_1: +.byte 102,15,56,220,209 + decl %edx + movups (%rcx),%xmm1 + leaq 16(%rcx),%rcx + jnz .Loop_enc1_1 +.byte 102,15,56,221,209 + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + xorps %xmm3,%xmm2 + pxor %xmm3,%xmm3 + movups %xmm2,(%rsi) + xorps %xmm2,%xmm2 + jmp .Lctr32_epilogue + +.align 16 +.Lctr32_bulk: + leaq (%rsp),%r11 +.cfi_def_cfa_register %r11 + pushq %rbp +.cfi_offset %rbp,-16 + subq $128,%rsp + andq $-16,%rsp + + + + + movdqu (%r8),%xmm2 + movdqu (%rcx),%xmm0 + movl 12(%r8),%r8d + pxor %xmm0,%xmm2 + movl 12(%rcx),%ebp + movdqa %xmm2,0(%rsp) + bswapl %r8d + movdqa %xmm2,%xmm3 + movdqa %xmm2,%xmm4 + movdqa %xmm2,%xmm5 + movdqa %xmm2,64(%rsp) + movdqa %xmm2,80(%rsp) + movdqa %xmm2,96(%rsp) + movq %rdx,%r10 + movdqa %xmm2,112(%rsp) + + leaq 1(%r8),%rax + leaq 2(%r8),%rdx + bswapl %eax + bswapl %edx + xorl %ebp,%eax + xorl %ebp,%edx +.byte 
102,15,58,34,216,3 + leaq 3(%r8),%rax + movdqa %xmm3,16(%rsp) +.byte 102,15,58,34,226,3 + bswapl %eax + movq %r10,%rdx + leaq 4(%r8),%r10 + movdqa %xmm4,32(%rsp) + xorl %ebp,%eax + bswapl %r10d +.byte 102,15,58,34,232,3 + xorl %ebp,%r10d + movdqa %xmm5,48(%rsp) + leaq 5(%r8),%r9 + movl %r10d,64+12(%rsp) + bswapl %r9d + leaq 6(%r8),%r10 + movl 240(%rcx),%eax + xorl %ebp,%r9d + bswapl %r10d + movl %r9d,80+12(%rsp) + xorl %ebp,%r10d + leaq 7(%r8),%r9 + movl %r10d,96+12(%rsp) + bswapl %r9d + xorl %ebp,%r9d + movl %r9d,112+12(%rsp) + + movups 16(%rcx),%xmm1 + + movdqa 64(%rsp),%xmm6 + movdqa 80(%rsp),%xmm7 + + cmpq $8,%rdx + jb .Lctr32_tail + + leaq 128(%rcx),%rcx + subq $8,%rdx + jmp .Lctr32_loop8 + +.align 32 +.Lctr32_loop8: + addl $8,%r8d + movdqa 96(%rsp),%xmm8 +.byte 102,15,56,220,209 + movl %r8d,%r9d + movdqa 112(%rsp),%xmm9 +.byte 102,15,56,220,217 + bswapl %r9d + movups 32-128(%rcx),%xmm0 +.byte 102,15,56,220,225 + xorl %ebp,%r9d + nop +.byte 102,15,56,220,233 + movl %r9d,0+12(%rsp) + leaq 1(%r8),%r9 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 +.byte 102,68,15,56,220,193 +.byte 102,68,15,56,220,201 + movups 48-128(%rcx),%xmm1 + bswapl %r9d +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 + xorl %ebp,%r9d +.byte 0x66,0x90 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 + movl %r9d,16+12(%rsp) + leaq 2(%r8),%r9 +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 +.byte 102,68,15,56,220,192 +.byte 102,68,15,56,220,200 + movups 64-128(%rcx),%xmm0 + bswapl %r9d +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 + xorl %ebp,%r9d +.byte 0x66,0x90 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 + movl %r9d,32+12(%rsp) + leaq 3(%r8),%r9 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 +.byte 102,68,15,56,220,193 +.byte 102,68,15,56,220,201 + movups 80-128(%rcx),%xmm1 + bswapl %r9d +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 + xorl %ebp,%r9d +.byte 0x66,0x90 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 + movl %r9d,48+12(%rsp) + leaq 4(%r8),%r9 +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 +.byte 102,68,15,56,220,192 +.byte 102,68,15,56,220,200 + movups 96-128(%rcx),%xmm0 + bswapl %r9d +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 + xorl %ebp,%r9d +.byte 0x66,0x90 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 + movl %r9d,64+12(%rsp) + leaq 5(%r8),%r9 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 +.byte 102,68,15,56,220,193 +.byte 102,68,15,56,220,201 + movups 112-128(%rcx),%xmm1 + bswapl %r9d +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 + xorl %ebp,%r9d +.byte 0x66,0x90 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 + movl %r9d,80+12(%rsp) + leaq 6(%r8),%r9 +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 +.byte 102,68,15,56,220,192 +.byte 102,68,15,56,220,200 + movups 128-128(%rcx),%xmm0 + bswapl %r9d +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 + xorl %ebp,%r9d +.byte 0x66,0x90 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 + movl %r9d,96+12(%rsp) + leaq 7(%r8),%r9 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 +.byte 102,68,15,56,220,193 +.byte 102,68,15,56,220,201 + movups 144-128(%rcx),%xmm1 + bswapl %r9d +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 102,15,56,220,224 + xorl %ebp,%r9d + movdqu 0(%rdi),%xmm10 +.byte 102,15,56,220,232 + movl %r9d,112+12(%rsp) + cmpl $11,%eax +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 +.byte 102,68,15,56,220,192 +.byte 102,68,15,56,220,200 + movups 160-128(%rcx),%xmm0 + + jb .Lctr32_enc_done + +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 
102,15,56,220,233 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 +.byte 102,68,15,56,220,193 +.byte 102,68,15,56,220,201 + movups 176-128(%rcx),%xmm1 + +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 +.byte 102,68,15,56,220,192 +.byte 102,68,15,56,220,200 + movups 192-128(%rcx),%xmm0 + + +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 +.byte 102,68,15,56,220,193 +.byte 102,68,15,56,220,201 + movups 208-128(%rcx),%xmm1 + +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 +.byte 102,68,15,56,220,192 +.byte 102,68,15,56,220,200 + movups 224-128(%rcx),%xmm0 + jmp .Lctr32_enc_done + +.align 16 +.Lctr32_enc_done: + movdqu 16(%rdi),%xmm11 + pxor %xmm0,%xmm10 + movdqu 32(%rdi),%xmm12 + pxor %xmm0,%xmm11 + movdqu 48(%rdi),%xmm13 + pxor %xmm0,%xmm12 + movdqu 64(%rdi),%xmm14 + pxor %xmm0,%xmm13 + movdqu 80(%rdi),%xmm15 + pxor %xmm0,%xmm14 + prefetcht0 448(%rdi) + prefetcht0 512(%rdi) + pxor %xmm0,%xmm15 +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 +.byte 102,68,15,56,220,193 +.byte 102,68,15,56,220,201 + movdqu 96(%rdi),%xmm1 + leaq 128(%rdi),%rdi + +.byte 102,65,15,56,221,210 + pxor %xmm0,%xmm1 + movdqu 112-128(%rdi),%xmm10 +.byte 102,65,15,56,221,219 + pxor %xmm0,%xmm10 + movdqa 0(%rsp),%xmm11 +.byte 102,65,15,56,221,228 +.byte 102,65,15,56,221,237 + movdqa 16(%rsp),%xmm12 + movdqa 32(%rsp),%xmm13 +.byte 102,65,15,56,221,246 +.byte 102,65,15,56,221,255 + movdqa 48(%rsp),%xmm14 + movdqa 64(%rsp),%xmm15 +.byte 102,68,15,56,221,193 + movdqa 80(%rsp),%xmm0 + movups 16-128(%rcx),%xmm1 +.byte 102,69,15,56,221,202 + + movups %xmm2,(%rsi) + movdqa %xmm11,%xmm2 + movups %xmm3,16(%rsi) + movdqa %xmm12,%xmm3 + movups %xmm4,32(%rsi) + movdqa %xmm13,%xmm4 + movups %xmm5,48(%rsi) + movdqa %xmm14,%xmm5 + movups %xmm6,64(%rsi) + movdqa %xmm15,%xmm6 + movups %xmm7,80(%rsi) + movdqa %xmm0,%xmm7 + movups %xmm8,96(%rsi) + movups %xmm9,112(%rsi) + leaq 128(%rsi),%rsi + + subq $8,%rdx + jnc .Lctr32_loop8 + + addq $8,%rdx + jz .Lctr32_done + leaq -128(%rcx),%rcx + +.Lctr32_tail: + + + leaq 16(%rcx),%rcx + cmpq $4,%rdx + jb .Lctr32_loop3 + je .Lctr32_loop4 + + + shll $4,%eax + movdqa 96(%rsp),%xmm8 + pxor %xmm9,%xmm9 + + movups 16(%rcx),%xmm0 +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 + leaq 32-16(%rcx,%rax,1),%rcx + negq %rax +.byte 102,15,56,220,225 + addq $16,%rax + movups (%rdi),%xmm10 +.byte 102,15,56,220,233 +.byte 102,15,56,220,241 + movups 16(%rdi),%xmm11 + movups 32(%rdi),%xmm12 +.byte 102,15,56,220,249 +.byte 102,68,15,56,220,193 + + call .Lenc_loop8_enter + + movdqu 48(%rdi),%xmm13 + pxor %xmm10,%xmm2 + movdqu 64(%rdi),%xmm10 + pxor %xmm11,%xmm3 + movdqu %xmm2,(%rsi) + pxor %xmm12,%xmm4 + movdqu %xmm3,16(%rsi) + pxor %xmm13,%xmm5 + movdqu %xmm4,32(%rsi) + pxor %xmm10,%xmm6 + movdqu %xmm5,48(%rsi) + movdqu %xmm6,64(%rsi) + cmpq $6,%rdx + jb .Lctr32_done + + movups 80(%rdi),%xmm11 + xorps %xmm11,%xmm7 + movups %xmm7,80(%rsi) + je .Lctr32_done + + movups 96(%rdi),%xmm12 + xorps %xmm12,%xmm8 + movups %xmm8,96(%rsi) + jmp .Lctr32_done + +.align 32 +.Lctr32_loop4: +.byte 102,15,56,220,209 + leaq 16(%rcx),%rcx + decl %eax +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 + movups 
(%rcx),%xmm1 + jnz .Lctr32_loop4 +.byte 102,15,56,221,209 +.byte 102,15,56,221,217 + movups (%rdi),%xmm10 + movups 16(%rdi),%xmm11 +.byte 102,15,56,221,225 +.byte 102,15,56,221,233 + movups 32(%rdi),%xmm12 + movups 48(%rdi),%xmm13 + + xorps %xmm10,%xmm2 + movups %xmm2,(%rsi) + xorps %xmm11,%xmm3 + movups %xmm3,16(%rsi) + pxor %xmm12,%xmm4 + movdqu %xmm4,32(%rsi) + pxor %xmm13,%xmm5 + movdqu %xmm5,48(%rsi) + jmp .Lctr32_done + +.align 32 +.Lctr32_loop3: +.byte 102,15,56,220,209 + leaq 16(%rcx),%rcx + decl %eax +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 + movups (%rcx),%xmm1 + jnz .Lctr32_loop3 +.byte 102,15,56,221,209 +.byte 102,15,56,221,217 +.byte 102,15,56,221,225 + + movups (%rdi),%xmm10 + xorps %xmm10,%xmm2 + movups %xmm2,(%rsi) + cmpq $2,%rdx + jb .Lctr32_done + + movups 16(%rdi),%xmm11 + xorps %xmm11,%xmm3 + movups %xmm3,16(%rsi) + je .Lctr32_done + + movups 32(%rdi),%xmm12 + xorps %xmm12,%xmm4 + movups %xmm4,32(%rsi) + +.Lctr32_done: + xorps %xmm0,%xmm0 + xorl %ebp,%ebp + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + movaps %xmm0,0(%rsp) + pxor %xmm8,%xmm8 + movaps %xmm0,16(%rsp) + pxor %xmm9,%xmm9 + movaps %xmm0,32(%rsp) + pxor %xmm10,%xmm10 + movaps %xmm0,48(%rsp) + pxor %xmm11,%xmm11 + movaps %xmm0,64(%rsp) + pxor %xmm12,%xmm12 + movaps %xmm0,80(%rsp) + pxor %xmm13,%xmm13 + movaps %xmm0,96(%rsp) + pxor %xmm14,%xmm14 + movaps %xmm0,112(%rsp) + pxor %xmm15,%xmm15 + movq -8(%r11),%rbp +.cfi_restore %rbp + leaq (%r11),%rsp +.cfi_def_cfa_register %rsp +.Lctr32_epilogue: + ret +.cfi_endproc +.size aes_hw_ctr32_encrypt_blocks,.-aes_hw_ctr32_encrypt_blocks +.globl aes_hw_set_encrypt_key_base +.hidden aes_hw_set_encrypt_key_base +.type aes_hw_set_encrypt_key_base,@function +.align 16 +aes_hw_set_encrypt_key_base: +.cfi_startproc + +_CET_ENDBR +#ifdef BORINGSSL_DISPATCH_TEST + movb $1,BORINGSSL_function_hit+3(%rip) +#endif + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movups (%rdi),%xmm0 + xorps %xmm4,%xmm4 + leaq 16(%rdx),%rax + cmpl $256,%esi + je .L14rounds + + + cmpl $128,%esi + jne .Lbad_keybits + +.L10rounds: + movl $9,%esi + + movups %xmm0,(%rdx) +.byte 102,15,58,223,200,1 + call .Lkey_expansion_128_cold +.byte 102,15,58,223,200,2 + call .Lkey_expansion_128 +.byte 102,15,58,223,200,4 + call .Lkey_expansion_128 +.byte 102,15,58,223,200,8 + call .Lkey_expansion_128 +.byte 102,15,58,223,200,16 + call .Lkey_expansion_128 +.byte 102,15,58,223,200,32 + call .Lkey_expansion_128 +.byte 102,15,58,223,200,64 + call .Lkey_expansion_128 +.byte 102,15,58,223,200,128 + call .Lkey_expansion_128 +.byte 102,15,58,223,200,27 + call .Lkey_expansion_128 +.byte 102,15,58,223,200,54 + call .Lkey_expansion_128 + movups %xmm0,(%rax) + movl %esi,80(%rax) + xorl %eax,%eax + jmp .Lenc_key_ret + + + +.align 16 +.L14rounds: + movups 16(%rdi),%xmm2 + movl $13,%esi + leaq 16(%rax),%rax + + movups %xmm0,(%rdx) + movups %xmm2,16(%rdx) +.byte 102,15,58,223,202,1 + call .Lkey_expansion_256a_cold +.byte 102,15,58,223,200,1 + call .Lkey_expansion_256b +.byte 102,15,58,223,202,2 + call .Lkey_expansion_256a +.byte 102,15,58,223,200,2 + call .Lkey_expansion_256b +.byte 102,15,58,223,202,4 + call .Lkey_expansion_256a +.byte 102,15,58,223,200,4 + call .Lkey_expansion_256b +.byte 102,15,58,223,202,8 + call .Lkey_expansion_256a +.byte 102,15,58,223,200,8 + call .Lkey_expansion_256b +.byte 102,15,58,223,202,16 + call .Lkey_expansion_256a +.byte 102,15,58,223,200,16 + call .Lkey_expansion_256b +.byte 102,15,58,223,202,32 + call 
.Lkey_expansion_256a +.byte 102,15,58,223,200,32 + call .Lkey_expansion_256b +.byte 102,15,58,223,202,64 + call .Lkey_expansion_256a + movups %xmm0,(%rax) + movl %esi,16(%rax) + xorq %rax,%rax + jmp .Lenc_key_ret + +.align 16 +.Lbad_keybits: + movq $-2,%rax +.Lenc_key_ret: + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + addq $8,%rsp +.cfi_adjust_cfa_offset -8 + ret +.cfi_endproc + + +.align 16 +.Lkey_expansion_128: +.cfi_startproc + movups %xmm0,(%rax) + leaq 16(%rax),%rax +.Lkey_expansion_128_cold: + shufps $16,%xmm0,%xmm4 + xorps %xmm4,%xmm0 + shufps $140,%xmm0,%xmm4 + xorps %xmm4,%xmm0 + shufps $255,%xmm1,%xmm1 + xorps %xmm1,%xmm0 + ret +.cfi_endproc + +.align 16 +.Lkey_expansion_256a: +.cfi_startproc + movups %xmm2,(%rax) + leaq 16(%rax),%rax +.Lkey_expansion_256a_cold: + shufps $16,%xmm0,%xmm4 + xorps %xmm4,%xmm0 + shufps $140,%xmm0,%xmm4 + xorps %xmm4,%xmm0 + shufps $255,%xmm1,%xmm1 + xorps %xmm1,%xmm0 + ret +.cfi_endproc + +.align 16 +.Lkey_expansion_256b: +.cfi_startproc + movups %xmm0,(%rax) + leaq 16(%rax),%rax + + shufps $16,%xmm2,%xmm4 + xorps %xmm4,%xmm2 + shufps $140,%xmm2,%xmm4 + xorps %xmm4,%xmm2 + shufps $170,%xmm1,%xmm1 + xorps %xmm1,%xmm2 + ret +.cfi_endproc +.size aes_hw_set_encrypt_key_base,.-aes_hw_set_encrypt_key_base + +.globl aes_hw_set_encrypt_key_alt +.hidden aes_hw_set_encrypt_key_alt +.type aes_hw_set_encrypt_key_alt,@function +.align 16 +aes_hw_set_encrypt_key_alt: +.cfi_startproc + +_CET_ENDBR +#ifdef BORINGSSL_DISPATCH_TEST + movb $1,BORINGSSL_function_hit+3(%rip) +#endif + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movups (%rdi),%xmm0 + xorps %xmm4,%xmm4 + leaq 16(%rdx),%rax + cmpl $256,%esi + je .L14rounds_alt + + cmpl $128,%esi + jne .Lbad_keybits_alt + + movl $9,%esi + movdqa .Lkey_rotate(%rip),%xmm5 + movl $8,%r10d + movdqa .Lkey_rcon1(%rip),%xmm4 + movdqa %xmm0,%xmm2 + movdqu %xmm0,(%rdx) + jmp .Loop_key128 + +.align 16 +.Loop_key128: +.byte 102,15,56,0,197 +.byte 102,15,56,221,196 + pslld $1,%xmm4 + leaq 16(%rax),%rax + + movdqa %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm3,%xmm2 + + pxor %xmm2,%xmm0 + movdqu %xmm0,-16(%rax) + movdqa %xmm0,%xmm2 + + decl %r10d + jnz .Loop_key128 + + movdqa .Lkey_rcon1b(%rip),%xmm4 + +.byte 102,15,56,0,197 +.byte 102,15,56,221,196 + pslld $1,%xmm4 + + movdqa %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm3,%xmm2 + + pxor %xmm2,%xmm0 + movdqu %xmm0,(%rax) + + movdqa %xmm0,%xmm2 +.byte 102,15,56,0,197 +.byte 102,15,56,221,196 + + movdqa %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm3,%xmm2 + + pxor %xmm2,%xmm0 + movdqu %xmm0,16(%rax) + + movl %esi,96(%rax) + xorl %eax,%eax + jmp .Lenc_key_ret_alt + + + +.align 16 +.L14rounds_alt: + movups 16(%rdi),%xmm2 + movl $13,%esi + leaq 16(%rax),%rax + movdqa .Lkey_rotate(%rip),%xmm5 + movdqa .Lkey_rcon1(%rip),%xmm4 + movl $7,%r10d + movdqu %xmm0,0(%rdx) + movdqa %xmm2,%xmm1 + movdqu %xmm2,16(%rdx) + jmp .Loop_key256 + +.align 16 +.Loop_key256: +.byte 102,15,56,0,213 +.byte 102,15,56,221,212 + + movdqa %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm3,%xmm0 + pslld $1,%xmm4 + + pxor %xmm2,%xmm0 + movdqu %xmm0,(%rax) + + decl %r10d + jz .Ldone_key256 + + pshufd $0xff,%xmm0,%xmm2 + pxor %xmm3,%xmm3 +.byte 102,15,56,221,211 + + movdqa %xmm1,%xmm3 
+ pslldq $4,%xmm1 + pxor %xmm1,%xmm3 + pslldq $4,%xmm1 + pxor %xmm1,%xmm3 + pslldq $4,%xmm1 + pxor %xmm3,%xmm1 + + pxor %xmm1,%xmm2 + movdqu %xmm2,16(%rax) + leaq 32(%rax),%rax + movdqa %xmm2,%xmm1 + + jmp .Loop_key256 + +.Ldone_key256: + movl %esi,16(%rax) + xorl %eax,%eax + jmp .Lenc_key_ret_alt + +.align 16 +.Lbad_keybits_alt: + movq $-2,%rax +.Lenc_key_ret_alt: + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + addq $8,%rsp +.cfi_adjust_cfa_offset -8 + ret +.cfi_endproc + +.size aes_hw_set_encrypt_key_alt,.-aes_hw_set_encrypt_key_alt +.section .rodata +.align 64 +.Lbswap_mask: +.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 +.Lincrement32: +.long 6,6,6,0 +.Lincrement64: +.long 1,0,0,0 +.Lincrement1: +.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 +.Lkey_rotate: +.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d +.Lkey_rotate192: +.long 0x04070605,0x04070605,0x04070605,0x04070605 +.Lkey_rcon1: +.long 1,1,1,1 +.Lkey_rcon1b: +.long 0x1b,0x1b,0x1b,0x1b + +.byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69,83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 64 +.text +#endif diff --git a/ring-0.17.14/pregenerated/aesni-x86_64-macosx.S b/ring-0.17.14/pregenerated/aesni-x86_64-macosx.S new file mode 100644 index 0000000000..5737063d55 --- /dev/null +++ b/ring-0.17.14/pregenerated/aesni-x86_64-macosx.S @@ -0,0 +1,1072 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__) +.text + +.p2align 4 +_aesni_encrypt2: + + movups (%rcx),%xmm0 + shll $4,%eax + movups 16(%rcx),%xmm1 + xorps %xmm0,%xmm2 + xorps %xmm0,%xmm3 + movups 32(%rcx),%xmm0 + leaq 32(%rcx,%rax,1),%rcx + negq %rax + addq $16,%rax + +L$enc_loop2: +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 + movups -16(%rcx,%rax,1),%xmm0 + jnz L$enc_loop2 + +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,221,208 +.byte 102,15,56,221,216 + ret + + + +.p2align 4 +_aesni_encrypt3: + + movups (%rcx),%xmm0 + shll $4,%eax + movups 16(%rcx),%xmm1 + xorps %xmm0,%xmm2 + xorps %xmm0,%xmm3 + xorps %xmm0,%xmm4 + movups 32(%rcx),%xmm0 + leaq 32(%rcx,%rax,1),%rcx + negq %rax + addq $16,%rax + +L$enc_loop3: +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 102,15,56,220,224 + movups -16(%rcx,%rax,1),%xmm0 + jnz L$enc_loop3 + +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,221,208 +.byte 102,15,56,221,216 +.byte 102,15,56,221,224 + ret + + + +.p2align 4 +_aesni_encrypt4: + + movups (%rcx),%xmm0 + shll $4,%eax + movups 16(%rcx),%xmm1 + xorps %xmm0,%xmm2 + xorps %xmm0,%xmm3 + xorps %xmm0,%xmm4 + xorps %xmm0,%xmm5 + movups 32(%rcx),%xmm0 + leaq 32(%rcx,%rax,1),%rcx + negq %rax +.byte 0x0f,0x1f,0x00 + addq $16,%rax + +L$enc_loop4: +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 + movups -16(%rcx,%rax,1),%xmm0 + jnz L$enc_loop4 + +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 
102,15,56,220,225 +.byte 102,15,56,220,233 +.byte 102,15,56,221,208 +.byte 102,15,56,221,216 +.byte 102,15,56,221,224 +.byte 102,15,56,221,232 + ret + + + +.p2align 4 +_aesni_encrypt6: + + movups (%rcx),%xmm0 + shll $4,%eax + movups 16(%rcx),%xmm1 + xorps %xmm0,%xmm2 + pxor %xmm0,%xmm3 + pxor %xmm0,%xmm4 +.byte 102,15,56,220,209 + leaq 32(%rcx,%rax,1),%rcx + negq %rax +.byte 102,15,56,220,217 + pxor %xmm0,%xmm5 + pxor %xmm0,%xmm6 +.byte 102,15,56,220,225 + pxor %xmm0,%xmm7 + movups (%rcx,%rax,1),%xmm0 + addq $16,%rax + jmp L$enc_loop6_enter +.p2align 4 +L$enc_loop6: +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +L$enc_loop6_enter: +.byte 102,15,56,220,233 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 + movups -16(%rcx,%rax,1),%xmm0 + jnz L$enc_loop6 + +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 +.byte 102,15,56,221,208 +.byte 102,15,56,221,216 +.byte 102,15,56,221,224 +.byte 102,15,56,221,232 +.byte 102,15,56,221,240 +.byte 102,15,56,221,248 + ret + + + +.p2align 4 +_aesni_encrypt8: + + movups (%rcx),%xmm0 + shll $4,%eax + movups 16(%rcx),%xmm1 + xorps %xmm0,%xmm2 + xorps %xmm0,%xmm3 + pxor %xmm0,%xmm4 + pxor %xmm0,%xmm5 + pxor %xmm0,%xmm6 + leaq 32(%rcx,%rax,1),%rcx + negq %rax +.byte 102,15,56,220,209 + pxor %xmm0,%xmm7 + pxor %xmm0,%xmm8 +.byte 102,15,56,220,217 + pxor %xmm0,%xmm9 + movups (%rcx,%rax,1),%xmm0 + addq $16,%rax + jmp L$enc_loop8_inner +.p2align 4 +L$enc_loop8: +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +L$enc_loop8_inner: +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 +.byte 102,68,15,56,220,193 +.byte 102,68,15,56,220,201 +L$enc_loop8_enter: + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 +.byte 102,68,15,56,220,192 +.byte 102,68,15,56,220,200 + movups -16(%rcx,%rax,1),%xmm0 + jnz L$enc_loop8 + +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 +.byte 102,68,15,56,220,193 +.byte 102,68,15,56,220,201 +.byte 102,15,56,221,208 +.byte 102,15,56,221,216 +.byte 102,15,56,221,224 +.byte 102,15,56,221,232 +.byte 102,15,56,221,240 +.byte 102,15,56,221,248 +.byte 102,68,15,56,221,192 +.byte 102,68,15,56,221,200 + ret + + +.globl _aes_hw_ctr32_encrypt_blocks +.private_extern _aes_hw_ctr32_encrypt_blocks + +.p2align 4 +_aes_hw_ctr32_encrypt_blocks: + +_CET_ENDBR +#ifdef BORINGSSL_DISPATCH_TEST + movb $1,BORINGSSL_function_hit(%rip) +#endif + cmpq $1,%rdx + jne L$ctr32_bulk + + + + movups (%r8),%xmm2 + movups (%rdi),%xmm3 + movl 240(%rcx),%edx + movups (%rcx),%xmm0 + movups 16(%rcx),%xmm1 + leaq 32(%rcx),%rcx + xorps %xmm0,%xmm2 +L$oop_enc1_1: +.byte 102,15,56,220,209 + decl %edx + movups (%rcx),%xmm1 + leaq 16(%rcx),%rcx + jnz L$oop_enc1_1 +.byte 102,15,56,221,209 + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + xorps %xmm3,%xmm2 + pxor %xmm3,%xmm3 + movups %xmm2,(%rsi) + xorps %xmm2,%xmm2 + jmp L$ctr32_epilogue + +.p2align 4 +L$ctr32_bulk: + leaq (%rsp),%r11 + + pushq %rbp + + subq $128,%rsp + andq $-16,%rsp + + + + + movdqu (%r8),%xmm2 + movdqu (%rcx),%xmm0 + 
movl 12(%r8),%r8d + pxor %xmm0,%xmm2 + movl 12(%rcx),%ebp + movdqa %xmm2,0(%rsp) + bswapl %r8d + movdqa %xmm2,%xmm3 + movdqa %xmm2,%xmm4 + movdqa %xmm2,%xmm5 + movdqa %xmm2,64(%rsp) + movdqa %xmm2,80(%rsp) + movdqa %xmm2,96(%rsp) + movq %rdx,%r10 + movdqa %xmm2,112(%rsp) + + leaq 1(%r8),%rax + leaq 2(%r8),%rdx + bswapl %eax + bswapl %edx + xorl %ebp,%eax + xorl %ebp,%edx +.byte 102,15,58,34,216,3 + leaq 3(%r8),%rax + movdqa %xmm3,16(%rsp) +.byte 102,15,58,34,226,3 + bswapl %eax + movq %r10,%rdx + leaq 4(%r8),%r10 + movdqa %xmm4,32(%rsp) + xorl %ebp,%eax + bswapl %r10d +.byte 102,15,58,34,232,3 + xorl %ebp,%r10d + movdqa %xmm5,48(%rsp) + leaq 5(%r8),%r9 + movl %r10d,64+12(%rsp) + bswapl %r9d + leaq 6(%r8),%r10 + movl 240(%rcx),%eax + xorl %ebp,%r9d + bswapl %r10d + movl %r9d,80+12(%rsp) + xorl %ebp,%r10d + leaq 7(%r8),%r9 + movl %r10d,96+12(%rsp) + bswapl %r9d + xorl %ebp,%r9d + movl %r9d,112+12(%rsp) + + movups 16(%rcx),%xmm1 + + movdqa 64(%rsp),%xmm6 + movdqa 80(%rsp),%xmm7 + + cmpq $8,%rdx + jb L$ctr32_tail + + leaq 128(%rcx),%rcx + subq $8,%rdx + jmp L$ctr32_loop8 + +.p2align 5 +L$ctr32_loop8: + addl $8,%r8d + movdqa 96(%rsp),%xmm8 +.byte 102,15,56,220,209 + movl %r8d,%r9d + movdqa 112(%rsp),%xmm9 +.byte 102,15,56,220,217 + bswapl %r9d + movups 32-128(%rcx),%xmm0 +.byte 102,15,56,220,225 + xorl %ebp,%r9d + nop +.byte 102,15,56,220,233 + movl %r9d,0+12(%rsp) + leaq 1(%r8),%r9 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 +.byte 102,68,15,56,220,193 +.byte 102,68,15,56,220,201 + movups 48-128(%rcx),%xmm1 + bswapl %r9d +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 + xorl %ebp,%r9d +.byte 0x66,0x90 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 + movl %r9d,16+12(%rsp) + leaq 2(%r8),%r9 +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 +.byte 102,68,15,56,220,192 +.byte 102,68,15,56,220,200 + movups 64-128(%rcx),%xmm0 + bswapl %r9d +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 + xorl %ebp,%r9d +.byte 0x66,0x90 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 + movl %r9d,32+12(%rsp) + leaq 3(%r8),%r9 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 +.byte 102,68,15,56,220,193 +.byte 102,68,15,56,220,201 + movups 80-128(%rcx),%xmm1 + bswapl %r9d +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 + xorl %ebp,%r9d +.byte 0x66,0x90 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 + movl %r9d,48+12(%rsp) + leaq 4(%r8),%r9 +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 +.byte 102,68,15,56,220,192 +.byte 102,68,15,56,220,200 + movups 96-128(%rcx),%xmm0 + bswapl %r9d +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 + xorl %ebp,%r9d +.byte 0x66,0x90 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 + movl %r9d,64+12(%rsp) + leaq 5(%r8),%r9 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 +.byte 102,68,15,56,220,193 +.byte 102,68,15,56,220,201 + movups 112-128(%rcx),%xmm1 + bswapl %r9d +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 + xorl %ebp,%r9d +.byte 0x66,0x90 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 + movl %r9d,80+12(%rsp) + leaq 6(%r8),%r9 +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 +.byte 102,68,15,56,220,192 +.byte 102,68,15,56,220,200 + movups 128-128(%rcx),%xmm0 + bswapl %r9d +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 + xorl %ebp,%r9d +.byte 0x66,0x90 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 + movl %r9d,96+12(%rsp) + leaq 7(%r8),%r9 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 +.byte 102,68,15,56,220,193 +.byte 102,68,15,56,220,201 + movups 144-128(%rcx),%xmm1 + bswapl %r9d +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 
102,15,56,220,224 + xorl %ebp,%r9d + movdqu 0(%rdi),%xmm10 +.byte 102,15,56,220,232 + movl %r9d,112+12(%rsp) + cmpl $11,%eax +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 +.byte 102,68,15,56,220,192 +.byte 102,68,15,56,220,200 + movups 160-128(%rcx),%xmm0 + + jb L$ctr32_enc_done + +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 +.byte 102,68,15,56,220,193 +.byte 102,68,15,56,220,201 + movups 176-128(%rcx),%xmm1 + +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 +.byte 102,68,15,56,220,192 +.byte 102,68,15,56,220,200 + movups 192-128(%rcx),%xmm0 + + +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 +.byte 102,68,15,56,220,193 +.byte 102,68,15,56,220,201 + movups 208-128(%rcx),%xmm1 + +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 +.byte 102,68,15,56,220,192 +.byte 102,68,15,56,220,200 + movups 224-128(%rcx),%xmm0 + jmp L$ctr32_enc_done + +.p2align 4 +L$ctr32_enc_done: + movdqu 16(%rdi),%xmm11 + pxor %xmm0,%xmm10 + movdqu 32(%rdi),%xmm12 + pxor %xmm0,%xmm11 + movdqu 48(%rdi),%xmm13 + pxor %xmm0,%xmm12 + movdqu 64(%rdi),%xmm14 + pxor %xmm0,%xmm13 + movdqu 80(%rdi),%xmm15 + pxor %xmm0,%xmm14 + prefetcht0 448(%rdi) + prefetcht0 512(%rdi) + pxor %xmm0,%xmm15 +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 +.byte 102,68,15,56,220,193 +.byte 102,68,15,56,220,201 + movdqu 96(%rdi),%xmm1 + leaq 128(%rdi),%rdi + +.byte 102,65,15,56,221,210 + pxor %xmm0,%xmm1 + movdqu 112-128(%rdi),%xmm10 +.byte 102,65,15,56,221,219 + pxor %xmm0,%xmm10 + movdqa 0(%rsp),%xmm11 +.byte 102,65,15,56,221,228 +.byte 102,65,15,56,221,237 + movdqa 16(%rsp),%xmm12 + movdqa 32(%rsp),%xmm13 +.byte 102,65,15,56,221,246 +.byte 102,65,15,56,221,255 + movdqa 48(%rsp),%xmm14 + movdqa 64(%rsp),%xmm15 +.byte 102,68,15,56,221,193 + movdqa 80(%rsp),%xmm0 + movups 16-128(%rcx),%xmm1 +.byte 102,69,15,56,221,202 + + movups %xmm2,(%rsi) + movdqa %xmm11,%xmm2 + movups %xmm3,16(%rsi) + movdqa %xmm12,%xmm3 + movups %xmm4,32(%rsi) + movdqa %xmm13,%xmm4 + movups %xmm5,48(%rsi) + movdqa %xmm14,%xmm5 + movups %xmm6,64(%rsi) + movdqa %xmm15,%xmm6 + movups %xmm7,80(%rsi) + movdqa %xmm0,%xmm7 + movups %xmm8,96(%rsi) + movups %xmm9,112(%rsi) + leaq 128(%rsi),%rsi + + subq $8,%rdx + jnc L$ctr32_loop8 + + addq $8,%rdx + jz L$ctr32_done + leaq -128(%rcx),%rcx + +L$ctr32_tail: + + + leaq 16(%rcx),%rcx + cmpq $4,%rdx + jb L$ctr32_loop3 + je L$ctr32_loop4 + + + shll $4,%eax + movdqa 96(%rsp),%xmm8 + pxor %xmm9,%xmm9 + + movups 16(%rcx),%xmm0 +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 + leaq 32-16(%rcx,%rax,1),%rcx + negq %rax +.byte 102,15,56,220,225 + addq $16,%rax + movups (%rdi),%xmm10 +.byte 102,15,56,220,233 +.byte 102,15,56,220,241 + movups 16(%rdi),%xmm11 + movups 32(%rdi),%xmm12 +.byte 102,15,56,220,249 +.byte 102,68,15,56,220,193 + + call L$enc_loop8_enter + + movdqu 48(%rdi),%xmm13 + pxor %xmm10,%xmm2 + movdqu 64(%rdi),%xmm10 + pxor %xmm11,%xmm3 + movdqu %xmm2,(%rsi) + pxor %xmm12,%xmm4 + movdqu %xmm3,16(%rsi) + pxor %xmm13,%xmm5 + movdqu %xmm4,32(%rsi) + pxor %xmm10,%xmm6 + movdqu %xmm5,48(%rsi) + movdqu %xmm6,64(%rsi) + cmpq $6,%rdx + jb 
L$ctr32_done + + movups 80(%rdi),%xmm11 + xorps %xmm11,%xmm7 + movups %xmm7,80(%rsi) + je L$ctr32_done + + movups 96(%rdi),%xmm12 + xorps %xmm12,%xmm8 + movups %xmm8,96(%rsi) + jmp L$ctr32_done + +.p2align 5 +L$ctr32_loop4: +.byte 102,15,56,220,209 + leaq 16(%rcx),%rcx + decl %eax +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 + movups (%rcx),%xmm1 + jnz L$ctr32_loop4 +.byte 102,15,56,221,209 +.byte 102,15,56,221,217 + movups (%rdi),%xmm10 + movups 16(%rdi),%xmm11 +.byte 102,15,56,221,225 +.byte 102,15,56,221,233 + movups 32(%rdi),%xmm12 + movups 48(%rdi),%xmm13 + + xorps %xmm10,%xmm2 + movups %xmm2,(%rsi) + xorps %xmm11,%xmm3 + movups %xmm3,16(%rsi) + pxor %xmm12,%xmm4 + movdqu %xmm4,32(%rsi) + pxor %xmm13,%xmm5 + movdqu %xmm5,48(%rsi) + jmp L$ctr32_done + +.p2align 5 +L$ctr32_loop3: +.byte 102,15,56,220,209 + leaq 16(%rcx),%rcx + decl %eax +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 + movups (%rcx),%xmm1 + jnz L$ctr32_loop3 +.byte 102,15,56,221,209 +.byte 102,15,56,221,217 +.byte 102,15,56,221,225 + + movups (%rdi),%xmm10 + xorps %xmm10,%xmm2 + movups %xmm2,(%rsi) + cmpq $2,%rdx + jb L$ctr32_done + + movups 16(%rdi),%xmm11 + xorps %xmm11,%xmm3 + movups %xmm3,16(%rsi) + je L$ctr32_done + + movups 32(%rdi),%xmm12 + xorps %xmm12,%xmm4 + movups %xmm4,32(%rsi) + +L$ctr32_done: + xorps %xmm0,%xmm0 + xorl %ebp,%ebp + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + movaps %xmm0,0(%rsp) + pxor %xmm8,%xmm8 + movaps %xmm0,16(%rsp) + pxor %xmm9,%xmm9 + movaps %xmm0,32(%rsp) + pxor %xmm10,%xmm10 + movaps %xmm0,48(%rsp) + pxor %xmm11,%xmm11 + movaps %xmm0,64(%rsp) + pxor %xmm12,%xmm12 + movaps %xmm0,80(%rsp) + pxor %xmm13,%xmm13 + movaps %xmm0,96(%rsp) + pxor %xmm14,%xmm14 + movaps %xmm0,112(%rsp) + pxor %xmm15,%xmm15 + movq -8(%r11),%rbp + + leaq (%r11),%rsp + +L$ctr32_epilogue: + ret + + +.globl _aes_hw_set_encrypt_key_base +.private_extern _aes_hw_set_encrypt_key_base + +.p2align 4 +_aes_hw_set_encrypt_key_base: + + +_CET_ENDBR +#ifdef BORINGSSL_DISPATCH_TEST + movb $1,BORINGSSL_function_hit+3(%rip) +#endif + subq $8,%rsp + + + + movups (%rdi),%xmm0 + xorps %xmm4,%xmm4 + leaq 16(%rdx),%rax + cmpl $256,%esi + je L$14rounds + + + cmpl $128,%esi + jne L$bad_keybits + +L$10rounds: + movl $9,%esi + + movups %xmm0,(%rdx) +.byte 102,15,58,223,200,1 + call L$key_expansion_128_cold +.byte 102,15,58,223,200,2 + call L$key_expansion_128 +.byte 102,15,58,223,200,4 + call L$key_expansion_128 +.byte 102,15,58,223,200,8 + call L$key_expansion_128 +.byte 102,15,58,223,200,16 + call L$key_expansion_128 +.byte 102,15,58,223,200,32 + call L$key_expansion_128 +.byte 102,15,58,223,200,64 + call L$key_expansion_128 +.byte 102,15,58,223,200,128 + call L$key_expansion_128 +.byte 102,15,58,223,200,27 + call L$key_expansion_128 +.byte 102,15,58,223,200,54 + call L$key_expansion_128 + movups %xmm0,(%rax) + movl %esi,80(%rax) + xorl %eax,%eax + jmp L$enc_key_ret + + + +.p2align 4 +L$14rounds: + movups 16(%rdi),%xmm2 + movl $13,%esi + leaq 16(%rax),%rax + + movups %xmm0,(%rdx) + movups %xmm2,16(%rdx) +.byte 102,15,58,223,202,1 + call L$key_expansion_256a_cold +.byte 102,15,58,223,200,1 + call L$key_expansion_256b +.byte 102,15,58,223,202,2 + call L$key_expansion_256a +.byte 102,15,58,223,200,2 + call L$key_expansion_256b +.byte 102,15,58,223,202,4 + call L$key_expansion_256a +.byte 102,15,58,223,200,4 + call L$key_expansion_256b +.byte 102,15,58,223,202,8 + call L$key_expansion_256a +.byte 102,15,58,223,200,8 
+ call L$key_expansion_256b +.byte 102,15,58,223,202,16 + call L$key_expansion_256a +.byte 102,15,58,223,200,16 + call L$key_expansion_256b +.byte 102,15,58,223,202,32 + call L$key_expansion_256a +.byte 102,15,58,223,200,32 + call L$key_expansion_256b +.byte 102,15,58,223,202,64 + call L$key_expansion_256a + movups %xmm0,(%rax) + movl %esi,16(%rax) + xorq %rax,%rax + jmp L$enc_key_ret + +.p2align 4 +L$bad_keybits: + movq $-2,%rax +L$enc_key_ret: + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + addq $8,%rsp + + ret + + + +.p2align 4 +L$key_expansion_128: + + movups %xmm0,(%rax) + leaq 16(%rax),%rax +L$key_expansion_128_cold: + shufps $16,%xmm0,%xmm4 + xorps %xmm4,%xmm0 + shufps $140,%xmm0,%xmm4 + xorps %xmm4,%xmm0 + shufps $255,%xmm1,%xmm1 + xorps %xmm1,%xmm0 + ret + + +.p2align 4 +L$key_expansion_256a: + + movups %xmm2,(%rax) + leaq 16(%rax),%rax +L$key_expansion_256a_cold: + shufps $16,%xmm0,%xmm4 + xorps %xmm4,%xmm0 + shufps $140,%xmm0,%xmm4 + xorps %xmm4,%xmm0 + shufps $255,%xmm1,%xmm1 + xorps %xmm1,%xmm0 + ret + + +.p2align 4 +L$key_expansion_256b: + + movups %xmm0,(%rax) + leaq 16(%rax),%rax + + shufps $16,%xmm2,%xmm4 + xorps %xmm4,%xmm2 + shufps $140,%xmm2,%xmm4 + xorps %xmm4,%xmm2 + shufps $170,%xmm1,%xmm1 + xorps %xmm1,%xmm2 + ret + + + +.globl _aes_hw_set_encrypt_key_alt +.private_extern _aes_hw_set_encrypt_key_alt + +.p2align 4 +_aes_hw_set_encrypt_key_alt: + + +_CET_ENDBR +#ifdef BORINGSSL_DISPATCH_TEST + movb $1,BORINGSSL_function_hit+3(%rip) +#endif + subq $8,%rsp + + + + movups (%rdi),%xmm0 + xorps %xmm4,%xmm4 + leaq 16(%rdx),%rax + cmpl $256,%esi + je L$14rounds_alt + + cmpl $128,%esi + jne L$bad_keybits_alt + + movl $9,%esi + movdqa L$key_rotate(%rip),%xmm5 + movl $8,%r10d + movdqa L$key_rcon1(%rip),%xmm4 + movdqa %xmm0,%xmm2 + movdqu %xmm0,(%rdx) + jmp L$oop_key128 + +.p2align 4 +L$oop_key128: +.byte 102,15,56,0,197 +.byte 102,15,56,221,196 + pslld $1,%xmm4 + leaq 16(%rax),%rax + + movdqa %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm3,%xmm2 + + pxor %xmm2,%xmm0 + movdqu %xmm0,-16(%rax) + movdqa %xmm0,%xmm2 + + decl %r10d + jnz L$oop_key128 + + movdqa L$key_rcon1b(%rip),%xmm4 + +.byte 102,15,56,0,197 +.byte 102,15,56,221,196 + pslld $1,%xmm4 + + movdqa %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm3,%xmm2 + + pxor %xmm2,%xmm0 + movdqu %xmm0,(%rax) + + movdqa %xmm0,%xmm2 +.byte 102,15,56,0,197 +.byte 102,15,56,221,196 + + movdqa %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm3,%xmm2 + + pxor %xmm2,%xmm0 + movdqu %xmm0,16(%rax) + + movl %esi,96(%rax) + xorl %eax,%eax + jmp L$enc_key_ret_alt + + + +.p2align 4 +L$14rounds_alt: + movups 16(%rdi),%xmm2 + movl $13,%esi + leaq 16(%rax),%rax + movdqa L$key_rotate(%rip),%xmm5 + movdqa L$key_rcon1(%rip),%xmm4 + movl $7,%r10d + movdqu %xmm0,0(%rdx) + movdqa %xmm2,%xmm1 + movdqu %xmm2,16(%rdx) + jmp L$oop_key256 + +.p2align 4 +L$oop_key256: +.byte 102,15,56,0,213 +.byte 102,15,56,221,212 + + movdqa %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm3,%xmm0 + pslld $1,%xmm4 + + pxor %xmm2,%xmm0 + movdqu %xmm0,(%rax) + + decl %r10d + jz L$done_key256 + + pshufd $0xff,%xmm0,%xmm2 + pxor %xmm3,%xmm3 +.byte 102,15,56,221,211 + + movdqa %xmm1,%xmm3 + pslldq $4,%xmm1 + pxor %xmm1,%xmm3 + pslldq $4,%xmm1 + pxor 
%xmm1,%xmm3 + pslldq $4,%xmm1 + pxor %xmm3,%xmm1 + + pxor %xmm1,%xmm2 + movdqu %xmm2,16(%rax) + leaq 32(%rax),%rax + movdqa %xmm2,%xmm1 + + jmp L$oop_key256 + +L$done_key256: + movl %esi,16(%rax) + xorl %eax,%eax + jmp L$enc_key_ret_alt + +.p2align 4 +L$bad_keybits_alt: + movq $-2,%rax +L$enc_key_ret_alt: + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + addq $8,%rsp + + ret + + + +.section __DATA,__const +.p2align 6 +L$bswap_mask: +.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 +L$increment32: +.long 6,6,6,0 +L$increment64: +.long 1,0,0,0 +L$increment1: +.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 +L$key_rotate: +.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d +L$key_rotate192: +.long 0x04070605,0x04070605,0x04070605,0x04070605 +L$key_rcon1: +.long 1,1,1,1 +L$key_rcon1b: +.long 0x1b,0x1b,0x1b,0x1b + +.byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69,83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.p2align 6 +.text +#endif diff --git a/ring-0.17.14/pregenerated/aesni-x86_64-nasm.asm b/ring-0.17.14/pregenerated/aesni-x86_64-nasm.asm new file mode 100644 index 0000000000..ceeec55321 --- /dev/null +++ b/ring-0.17.14/pregenerated/aesni-x86_64-nasm.asm @@ -0,0 +1,1242 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. + +%ifidn __OUTPUT_FORMAT__, win64 +default rel +%define XMMWORD +%define YMMWORD +%define ZMMWORD +%define _CET_ENDBR + +%include "ring_core_generated/prefix_symbols_nasm.inc" +section .text code align=64 + + +ALIGN 16 +_aesni_encrypt2: + + movups xmm0,XMMWORD[rcx] + shl eax,4 + movups xmm1,XMMWORD[16+rcx] + xorps xmm2,xmm0 + xorps xmm3,xmm0 + movups xmm0,XMMWORD[32+rcx] + lea rcx,[32+rax*1+rcx] + neg rax + add rax,16 + +$L$enc_loop2: + DB 102,15,56,220,209 + DB 102,15,56,220,217 + movups xmm1,XMMWORD[rax*1+rcx] + add rax,32 + DB 102,15,56,220,208 + DB 102,15,56,220,216 + movups xmm0,XMMWORD[((-16))+rax*1+rcx] + jnz NEAR $L$enc_loop2 + + DB 102,15,56,220,209 + DB 102,15,56,220,217 + DB 102,15,56,221,208 + DB 102,15,56,221,216 + ret + + + +ALIGN 16 +_aesni_encrypt3: + + movups xmm0,XMMWORD[rcx] + shl eax,4 + movups xmm1,XMMWORD[16+rcx] + xorps xmm2,xmm0 + xorps xmm3,xmm0 + xorps xmm4,xmm0 + movups xmm0,XMMWORD[32+rcx] + lea rcx,[32+rax*1+rcx] + neg rax + add rax,16 + +$L$enc_loop3: + DB 102,15,56,220,209 + DB 102,15,56,220,217 + DB 102,15,56,220,225 + movups xmm1,XMMWORD[rax*1+rcx] + add rax,32 + DB 102,15,56,220,208 + DB 102,15,56,220,216 + DB 102,15,56,220,224 + movups xmm0,XMMWORD[((-16))+rax*1+rcx] + jnz NEAR $L$enc_loop3 + + DB 102,15,56,220,209 + DB 102,15,56,220,217 + DB 102,15,56,220,225 + DB 102,15,56,221,208 + DB 102,15,56,221,216 + DB 102,15,56,221,224 + ret + + + +ALIGN 16 +_aesni_encrypt4: + + movups xmm0,XMMWORD[rcx] + shl eax,4 + movups xmm1,XMMWORD[16+rcx] + xorps xmm2,xmm0 + xorps xmm3,xmm0 + xorps xmm4,xmm0 + xorps xmm5,xmm0 + movups xmm0,XMMWORD[32+rcx] + lea rcx,[32+rax*1+rcx] + neg rax + DB 0x0f,0x1f,0x00 + add rax,16 + +$L$enc_loop4: + DB 102,15,56,220,209 + DB 102,15,56,220,217 + DB 102,15,56,220,225 + DB 102,15,56,220,233 + movups xmm1,XMMWORD[rax*1+rcx] + add rax,32 + DB 102,15,56,220,208 + DB 102,15,56,220,216 + DB 102,15,56,220,224 + DB 102,15,56,220,232 + movups xmm0,XMMWORD[((-16))+rax*1+rcx] + jnz NEAR $L$enc_loop4 + + DB 102,15,56,220,209 + DB 102,15,56,220,217 + DB 102,15,56,220,225 + DB 102,15,56,220,233 + DB 102,15,56,221,208 + 
DB 102,15,56,221,216 + DB 102,15,56,221,224 + DB 102,15,56,221,232 + ret + + + +ALIGN 16 +_aesni_encrypt6: + + movups xmm0,XMMWORD[rcx] + shl eax,4 + movups xmm1,XMMWORD[16+rcx] + xorps xmm2,xmm0 + pxor xmm3,xmm0 + pxor xmm4,xmm0 + DB 102,15,56,220,209 + lea rcx,[32+rax*1+rcx] + neg rax + DB 102,15,56,220,217 + pxor xmm5,xmm0 + pxor xmm6,xmm0 + DB 102,15,56,220,225 + pxor xmm7,xmm0 + movups xmm0,XMMWORD[rax*1+rcx] + add rax,16 + jmp NEAR $L$enc_loop6_enter +ALIGN 16 +$L$enc_loop6: + DB 102,15,56,220,209 + DB 102,15,56,220,217 + DB 102,15,56,220,225 +$L$enc_loop6_enter: + DB 102,15,56,220,233 + DB 102,15,56,220,241 + DB 102,15,56,220,249 + movups xmm1,XMMWORD[rax*1+rcx] + add rax,32 + DB 102,15,56,220,208 + DB 102,15,56,220,216 + DB 102,15,56,220,224 + DB 102,15,56,220,232 + DB 102,15,56,220,240 + DB 102,15,56,220,248 + movups xmm0,XMMWORD[((-16))+rax*1+rcx] + jnz NEAR $L$enc_loop6 + + DB 102,15,56,220,209 + DB 102,15,56,220,217 + DB 102,15,56,220,225 + DB 102,15,56,220,233 + DB 102,15,56,220,241 + DB 102,15,56,220,249 + DB 102,15,56,221,208 + DB 102,15,56,221,216 + DB 102,15,56,221,224 + DB 102,15,56,221,232 + DB 102,15,56,221,240 + DB 102,15,56,221,248 + ret + + + +ALIGN 16 +_aesni_encrypt8: + + movups xmm0,XMMWORD[rcx] + shl eax,4 + movups xmm1,XMMWORD[16+rcx] + xorps xmm2,xmm0 + xorps xmm3,xmm0 + pxor xmm4,xmm0 + pxor xmm5,xmm0 + pxor xmm6,xmm0 + lea rcx,[32+rax*1+rcx] + neg rax + DB 102,15,56,220,209 + pxor xmm7,xmm0 + pxor xmm8,xmm0 + DB 102,15,56,220,217 + pxor xmm9,xmm0 + movups xmm0,XMMWORD[rax*1+rcx] + add rax,16 + jmp NEAR $L$enc_loop8_inner +ALIGN 16 +$L$enc_loop8: + DB 102,15,56,220,209 + DB 102,15,56,220,217 +$L$enc_loop8_inner: + DB 102,15,56,220,225 + DB 102,15,56,220,233 + DB 102,15,56,220,241 + DB 102,15,56,220,249 + DB 102,68,15,56,220,193 + DB 102,68,15,56,220,201 +$L$enc_loop8_enter: + movups xmm1,XMMWORD[rax*1+rcx] + add rax,32 + DB 102,15,56,220,208 + DB 102,15,56,220,216 + DB 102,15,56,220,224 + DB 102,15,56,220,232 + DB 102,15,56,220,240 + DB 102,15,56,220,248 + DB 102,68,15,56,220,192 + DB 102,68,15,56,220,200 + movups xmm0,XMMWORD[((-16))+rax*1+rcx] + jnz NEAR $L$enc_loop8 + + DB 102,15,56,220,209 + DB 102,15,56,220,217 + DB 102,15,56,220,225 + DB 102,15,56,220,233 + DB 102,15,56,220,241 + DB 102,15,56,220,249 + DB 102,68,15,56,220,193 + DB 102,68,15,56,220,201 + DB 102,15,56,221,208 + DB 102,15,56,221,216 + DB 102,15,56,221,224 + DB 102,15,56,221,232 + DB 102,15,56,221,240 + DB 102,15,56,221,248 + DB 102,68,15,56,221,192 + DB 102,68,15,56,221,200 + ret + + +global aes_hw_ctr32_encrypt_blocks + +ALIGN 16 +aes_hw_ctr32_encrypt_blocks: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_aes_hw_ctr32_encrypt_blocks: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + + + +_CET_ENDBR +%ifdef BORINGSSL_DISPATCH_TEST + mov BYTE[BORINGSSL_function_hit],1 +%endif + cmp rdx,1 + jne NEAR $L$ctr32_bulk + + + + movups xmm2,XMMWORD[r8] + movups xmm3,XMMWORD[rdi] + mov edx,DWORD[240+rcx] + movups xmm0,XMMWORD[rcx] + movups xmm1,XMMWORD[16+rcx] + lea rcx,[32+rcx] + xorps xmm2,xmm0 +$L$oop_enc1_1: + DB 102,15,56,220,209 + dec edx + movups xmm1,XMMWORD[rcx] + lea rcx,[16+rcx] + jnz NEAR $L$oop_enc1_1 + DB 102,15,56,221,209 + pxor xmm0,xmm0 + pxor xmm1,xmm1 + xorps xmm2,xmm3 + pxor xmm3,xmm3 + movups XMMWORD[rsi],xmm2 + xorps xmm2,xmm2 + jmp NEAR $L$ctr32_epilogue + +ALIGN 16 +$L$ctr32_bulk: + lea r11,[rsp] + + push rbp + + sub rsp,288 + and rsp,-16 + movaps XMMWORD[(-168)+r11],xmm6 + movaps XMMWORD[(-152)+r11],xmm7 
+ movaps XMMWORD[(-136)+r11],xmm8 + movaps XMMWORD[(-120)+r11],xmm9 + movaps XMMWORD[(-104)+r11],xmm10 + movaps XMMWORD[(-88)+r11],xmm11 + movaps XMMWORD[(-72)+r11],xmm12 + movaps XMMWORD[(-56)+r11],xmm13 + movaps XMMWORD[(-40)+r11],xmm14 + movaps XMMWORD[(-24)+r11],xmm15 +$L$ctr32_body: + + + + + movdqu xmm2,XMMWORD[r8] + movdqu xmm0,XMMWORD[rcx] + mov r8d,DWORD[12+r8] + pxor xmm2,xmm0 + mov ebp,DWORD[12+rcx] + movdqa XMMWORD[rsp],xmm2 + bswap r8d + movdqa xmm3,xmm2 + movdqa xmm4,xmm2 + movdqa xmm5,xmm2 + movdqa XMMWORD[64+rsp],xmm2 + movdqa XMMWORD[80+rsp],xmm2 + movdqa XMMWORD[96+rsp],xmm2 + mov r10,rdx + movdqa XMMWORD[112+rsp],xmm2 + + lea rax,[1+r8] + lea rdx,[2+r8] + bswap eax + bswap edx + xor eax,ebp + xor edx,ebp +DB 102,15,58,34,216,3 + lea rax,[3+r8] + movdqa XMMWORD[16+rsp],xmm3 +DB 102,15,58,34,226,3 + bswap eax + mov rdx,r10 + lea r10,[4+r8] + movdqa XMMWORD[32+rsp],xmm4 + xor eax,ebp + bswap r10d +DB 102,15,58,34,232,3 + xor r10d,ebp + movdqa XMMWORD[48+rsp],xmm5 + lea r9,[5+r8] + mov DWORD[((64+12))+rsp],r10d + bswap r9d + lea r10,[6+r8] + mov eax,DWORD[240+rcx] + xor r9d,ebp + bswap r10d + mov DWORD[((80+12))+rsp],r9d + xor r10d,ebp + lea r9,[7+r8] + mov DWORD[((96+12))+rsp],r10d + bswap r9d + xor r9d,ebp + mov DWORD[((112+12))+rsp],r9d + + movups xmm1,XMMWORD[16+rcx] + + movdqa xmm6,XMMWORD[64+rsp] + movdqa xmm7,XMMWORD[80+rsp] + + cmp rdx,8 + jb NEAR $L$ctr32_tail + + lea rcx,[128+rcx] + sub rdx,8 + jmp NEAR $L$ctr32_loop8 + +ALIGN 32 +$L$ctr32_loop8: + add r8d,8 + movdqa xmm8,XMMWORD[96+rsp] + DB 102,15,56,220,209 + mov r9d,r8d + movdqa xmm9,XMMWORD[112+rsp] + DB 102,15,56,220,217 + bswap r9d + movups xmm0,XMMWORD[((32-128))+rcx] + DB 102,15,56,220,225 + xor r9d,ebp + nop + DB 102,15,56,220,233 + mov DWORD[((0+12))+rsp],r9d + lea r9,[1+r8] + DB 102,15,56,220,241 + DB 102,15,56,220,249 + DB 102,68,15,56,220,193 + DB 102,68,15,56,220,201 + movups xmm1,XMMWORD[((48-128))+rcx] + bswap r9d + DB 102,15,56,220,208 + DB 102,15,56,220,216 + xor r9d,ebp + DB 0x66,0x90 + DB 102,15,56,220,224 + DB 102,15,56,220,232 + mov DWORD[((16+12))+rsp],r9d + lea r9,[2+r8] + DB 102,15,56,220,240 + DB 102,15,56,220,248 + DB 102,68,15,56,220,192 + DB 102,68,15,56,220,200 + movups xmm0,XMMWORD[((64-128))+rcx] + bswap r9d + DB 102,15,56,220,209 + DB 102,15,56,220,217 + xor r9d,ebp + DB 0x66,0x90 + DB 102,15,56,220,225 + DB 102,15,56,220,233 + mov DWORD[((32+12))+rsp],r9d + lea r9,[3+r8] + DB 102,15,56,220,241 + DB 102,15,56,220,249 + DB 102,68,15,56,220,193 + DB 102,68,15,56,220,201 + movups xmm1,XMMWORD[((80-128))+rcx] + bswap r9d + DB 102,15,56,220,208 + DB 102,15,56,220,216 + xor r9d,ebp + DB 0x66,0x90 + DB 102,15,56,220,224 + DB 102,15,56,220,232 + mov DWORD[((48+12))+rsp],r9d + lea r9,[4+r8] + DB 102,15,56,220,240 + DB 102,15,56,220,248 + DB 102,68,15,56,220,192 + DB 102,68,15,56,220,200 + movups xmm0,XMMWORD[((96-128))+rcx] + bswap r9d + DB 102,15,56,220,209 + DB 102,15,56,220,217 + xor r9d,ebp + DB 0x66,0x90 + DB 102,15,56,220,225 + DB 102,15,56,220,233 + mov DWORD[((64+12))+rsp],r9d + lea r9,[5+r8] + DB 102,15,56,220,241 + DB 102,15,56,220,249 + DB 102,68,15,56,220,193 + DB 102,68,15,56,220,201 + movups xmm1,XMMWORD[((112-128))+rcx] + bswap r9d + DB 102,15,56,220,208 + DB 102,15,56,220,216 + xor r9d,ebp + DB 0x66,0x90 + DB 102,15,56,220,224 + DB 102,15,56,220,232 + mov DWORD[((80+12))+rsp],r9d + lea r9,[6+r8] + DB 102,15,56,220,240 + DB 102,15,56,220,248 + DB 102,68,15,56,220,192 + DB 102,68,15,56,220,200 + movups xmm0,XMMWORD[((128-128))+rcx] + bswap r9d + DB 102,15,56,220,209 + DB 
102,15,56,220,217 + xor r9d,ebp + DB 0x66,0x90 + DB 102,15,56,220,225 + DB 102,15,56,220,233 + mov DWORD[((96+12))+rsp],r9d + lea r9,[7+r8] + DB 102,15,56,220,241 + DB 102,15,56,220,249 + DB 102,68,15,56,220,193 + DB 102,68,15,56,220,201 + movups xmm1,XMMWORD[((144-128))+rcx] + bswap r9d + DB 102,15,56,220,208 + DB 102,15,56,220,216 + DB 102,15,56,220,224 + xor r9d,ebp + movdqu xmm10,XMMWORD[rdi] + DB 102,15,56,220,232 + mov DWORD[((112+12))+rsp],r9d + cmp eax,11 + DB 102,15,56,220,240 + DB 102,15,56,220,248 + DB 102,68,15,56,220,192 + DB 102,68,15,56,220,200 + movups xmm0,XMMWORD[((160-128))+rcx] + + jb NEAR $L$ctr32_enc_done + + DB 102,15,56,220,209 + DB 102,15,56,220,217 + DB 102,15,56,220,225 + DB 102,15,56,220,233 + DB 102,15,56,220,241 + DB 102,15,56,220,249 + DB 102,68,15,56,220,193 + DB 102,68,15,56,220,201 + movups xmm1,XMMWORD[((176-128))+rcx] + + DB 102,15,56,220,208 + DB 102,15,56,220,216 + DB 102,15,56,220,224 + DB 102,15,56,220,232 + DB 102,15,56,220,240 + DB 102,15,56,220,248 + DB 102,68,15,56,220,192 + DB 102,68,15,56,220,200 + movups xmm0,XMMWORD[((192-128))+rcx] + + + DB 102,15,56,220,209 + DB 102,15,56,220,217 + DB 102,15,56,220,225 + DB 102,15,56,220,233 + DB 102,15,56,220,241 + DB 102,15,56,220,249 + DB 102,68,15,56,220,193 + DB 102,68,15,56,220,201 + movups xmm1,XMMWORD[((208-128))+rcx] + + DB 102,15,56,220,208 + DB 102,15,56,220,216 + DB 102,15,56,220,224 + DB 102,15,56,220,232 + DB 102,15,56,220,240 + DB 102,15,56,220,248 + DB 102,68,15,56,220,192 + DB 102,68,15,56,220,200 + movups xmm0,XMMWORD[((224-128))+rcx] + jmp NEAR $L$ctr32_enc_done + +ALIGN 16 +$L$ctr32_enc_done: + movdqu xmm11,XMMWORD[16+rdi] + pxor xmm10,xmm0 + movdqu xmm12,XMMWORD[32+rdi] + pxor xmm11,xmm0 + movdqu xmm13,XMMWORD[48+rdi] + pxor xmm12,xmm0 + movdqu xmm14,XMMWORD[64+rdi] + pxor xmm13,xmm0 + movdqu xmm15,XMMWORD[80+rdi] + pxor xmm14,xmm0 + prefetcht0 [448+rdi] + prefetcht0 [512+rdi] + pxor xmm15,xmm0 + DB 102,15,56,220,209 + DB 102,15,56,220,217 + DB 102,15,56,220,225 + DB 102,15,56,220,233 + DB 102,15,56,220,241 + DB 102,15,56,220,249 + DB 102,68,15,56,220,193 + DB 102,68,15,56,220,201 + movdqu xmm1,XMMWORD[96+rdi] + lea rdi,[128+rdi] + + DB 102,65,15,56,221,210 + pxor xmm1,xmm0 + movdqu xmm10,XMMWORD[((112-128))+rdi] + DB 102,65,15,56,221,219 + pxor xmm10,xmm0 + movdqa xmm11,XMMWORD[rsp] + DB 102,65,15,56,221,228 + DB 102,65,15,56,221,237 + movdqa xmm12,XMMWORD[16+rsp] + movdqa xmm13,XMMWORD[32+rsp] + DB 102,65,15,56,221,246 + DB 102,65,15,56,221,255 + movdqa xmm14,XMMWORD[48+rsp] + movdqa xmm15,XMMWORD[64+rsp] + DB 102,68,15,56,221,193 + movdqa xmm0,XMMWORD[80+rsp] + movups xmm1,XMMWORD[((16-128))+rcx] + DB 102,69,15,56,221,202 + + movups XMMWORD[rsi],xmm2 + movdqa xmm2,xmm11 + movups XMMWORD[16+rsi],xmm3 + movdqa xmm3,xmm12 + movups XMMWORD[32+rsi],xmm4 + movdqa xmm4,xmm13 + movups XMMWORD[48+rsi],xmm5 + movdqa xmm5,xmm14 + movups XMMWORD[64+rsi],xmm6 + movdqa xmm6,xmm15 + movups XMMWORD[80+rsi],xmm7 + movdqa xmm7,xmm0 + movups XMMWORD[96+rsi],xmm8 + movups XMMWORD[112+rsi],xmm9 + lea rsi,[128+rsi] + + sub rdx,8 + jnc NEAR $L$ctr32_loop8 + + add rdx,8 + jz NEAR $L$ctr32_done + lea rcx,[((-128))+rcx] + +$L$ctr32_tail: + + + lea rcx,[16+rcx] + cmp rdx,4 + jb NEAR $L$ctr32_loop3 + je NEAR $L$ctr32_loop4 + + + shl eax,4 + movdqa xmm8,XMMWORD[96+rsp] + pxor xmm9,xmm9 + + movups xmm0,XMMWORD[16+rcx] + DB 102,15,56,220,209 + DB 102,15,56,220,217 + lea rcx,[((32-16))+rax*1+rcx] + neg rax + DB 102,15,56,220,225 + add rax,16 + movups xmm10,XMMWORD[rdi] + DB 102,15,56,220,233 + DB 102,15,56,220,241 
+ movups xmm11,XMMWORD[16+rdi] + movups xmm12,XMMWORD[32+rdi] + DB 102,15,56,220,249 + DB 102,68,15,56,220,193 + + call $L$enc_loop8_enter + + movdqu xmm13,XMMWORD[48+rdi] + pxor xmm2,xmm10 + movdqu xmm10,XMMWORD[64+rdi] + pxor xmm3,xmm11 + movdqu XMMWORD[rsi],xmm2 + pxor xmm4,xmm12 + movdqu XMMWORD[16+rsi],xmm3 + pxor xmm5,xmm13 + movdqu XMMWORD[32+rsi],xmm4 + pxor xmm6,xmm10 + movdqu XMMWORD[48+rsi],xmm5 + movdqu XMMWORD[64+rsi],xmm6 + cmp rdx,6 + jb NEAR $L$ctr32_done + + movups xmm11,XMMWORD[80+rdi] + xorps xmm7,xmm11 + movups XMMWORD[80+rsi],xmm7 + je NEAR $L$ctr32_done + + movups xmm12,XMMWORD[96+rdi] + xorps xmm8,xmm12 + movups XMMWORD[96+rsi],xmm8 + jmp NEAR $L$ctr32_done + +ALIGN 32 +$L$ctr32_loop4: + DB 102,15,56,220,209 + lea rcx,[16+rcx] + dec eax + DB 102,15,56,220,217 + DB 102,15,56,220,225 + DB 102,15,56,220,233 + movups xmm1,XMMWORD[rcx] + jnz NEAR $L$ctr32_loop4 + DB 102,15,56,221,209 + DB 102,15,56,221,217 + movups xmm10,XMMWORD[rdi] + movups xmm11,XMMWORD[16+rdi] + DB 102,15,56,221,225 + DB 102,15,56,221,233 + movups xmm12,XMMWORD[32+rdi] + movups xmm13,XMMWORD[48+rdi] + + xorps xmm2,xmm10 + movups XMMWORD[rsi],xmm2 + xorps xmm3,xmm11 + movups XMMWORD[16+rsi],xmm3 + pxor xmm4,xmm12 + movdqu XMMWORD[32+rsi],xmm4 + pxor xmm5,xmm13 + movdqu XMMWORD[48+rsi],xmm5 + jmp NEAR $L$ctr32_done + +ALIGN 32 +$L$ctr32_loop3: + DB 102,15,56,220,209 + lea rcx,[16+rcx] + dec eax + DB 102,15,56,220,217 + DB 102,15,56,220,225 + movups xmm1,XMMWORD[rcx] + jnz NEAR $L$ctr32_loop3 + DB 102,15,56,221,209 + DB 102,15,56,221,217 + DB 102,15,56,221,225 + + movups xmm10,XMMWORD[rdi] + xorps xmm2,xmm10 + movups XMMWORD[rsi],xmm2 + cmp rdx,2 + jb NEAR $L$ctr32_done + + movups xmm11,XMMWORD[16+rdi] + xorps xmm3,xmm11 + movups XMMWORD[16+rsi],xmm3 + je NEAR $L$ctr32_done + + movups xmm12,XMMWORD[32+rdi] + xorps xmm4,xmm12 + movups XMMWORD[32+rsi],xmm4 + +$L$ctr32_done: + xorps xmm0,xmm0 + xor ebp,ebp + pxor xmm1,xmm1 + pxor xmm2,xmm2 + pxor xmm3,xmm3 + pxor xmm4,xmm4 + pxor xmm5,xmm5 + movaps xmm6,XMMWORD[((-168))+r11] + movaps XMMWORD[(-168)+r11],xmm0 + movaps xmm7,XMMWORD[((-152))+r11] + movaps XMMWORD[(-152)+r11],xmm0 + movaps xmm8,XMMWORD[((-136))+r11] + movaps XMMWORD[(-136)+r11],xmm0 + movaps xmm9,XMMWORD[((-120))+r11] + movaps XMMWORD[(-120)+r11],xmm0 + movaps xmm10,XMMWORD[((-104))+r11] + movaps XMMWORD[(-104)+r11],xmm0 + movaps xmm11,XMMWORD[((-88))+r11] + movaps XMMWORD[(-88)+r11],xmm0 + movaps xmm12,XMMWORD[((-72))+r11] + movaps XMMWORD[(-72)+r11],xmm0 + movaps xmm13,XMMWORD[((-56))+r11] + movaps XMMWORD[(-56)+r11],xmm0 + movaps xmm14,XMMWORD[((-40))+r11] + movaps XMMWORD[(-40)+r11],xmm0 + movaps xmm15,XMMWORD[((-24))+r11] + movaps XMMWORD[(-24)+r11],xmm0 + movaps XMMWORD[rsp],xmm0 + movaps XMMWORD[16+rsp],xmm0 + movaps XMMWORD[32+rsp],xmm0 + movaps XMMWORD[48+rsp],xmm0 + movaps XMMWORD[64+rsp],xmm0 + movaps XMMWORD[80+rsp],xmm0 + movaps XMMWORD[96+rsp],xmm0 + movaps XMMWORD[112+rsp],xmm0 + mov rbp,QWORD[((-8))+r11] + + lea rsp,[r11] + +$L$ctr32_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_aes_hw_ctr32_encrypt_blocks: +global aes_hw_set_encrypt_key_base + +ALIGN 16 +aes_hw_set_encrypt_key_base: + +$L$SEH_begin_aes_hw_set_encrypt_key_base_1: +_CET_ENDBR +%ifdef BORINGSSL_DISPATCH_TEST + mov BYTE[((BORINGSSL_function_hit+3))],1 +%endif + sub rsp,8 + +$L$SEH_prologue_aes_hw_set_encrypt_key_base_2: +$L$SEH_endprologue_aes_hw_set_encrypt_key_base_3: + movups xmm0,XMMWORD[rcx] + xorps xmm4,xmm4 + lea rax,[16+r8] + cmp edx,256 + je NEAR $L$14rounds + 
+ + cmp edx,128 + jne NEAR $L$bad_keybits + +$L$10rounds: + mov edx,9 + + movups XMMWORD[r8],xmm0 + DB 102,15,58,223,200,1 + call $L$key_expansion_128_cold + DB 102,15,58,223,200,2 + call $L$key_expansion_128 + DB 102,15,58,223,200,4 + call $L$key_expansion_128 + DB 102,15,58,223,200,8 + call $L$key_expansion_128 + DB 102,15,58,223,200,16 + call $L$key_expansion_128 + DB 102,15,58,223,200,32 + call $L$key_expansion_128 + DB 102,15,58,223,200,64 + call $L$key_expansion_128 + DB 102,15,58,223,200,128 + call $L$key_expansion_128 + DB 102,15,58,223,200,27 + call $L$key_expansion_128 + DB 102,15,58,223,200,54 + call $L$key_expansion_128 + movups XMMWORD[rax],xmm0 + mov DWORD[80+rax],edx + xor eax,eax + jmp NEAR $L$enc_key_ret + + + +ALIGN 16 +$L$14rounds: + movups xmm2,XMMWORD[16+rcx] + mov edx,13 + lea rax,[16+rax] + + movups XMMWORD[r8],xmm0 + movups XMMWORD[16+r8],xmm2 + DB 102,15,58,223,202,1 + call $L$key_expansion_256a_cold + DB 102,15,58,223,200,1 + call $L$key_expansion_256b + DB 102,15,58,223,202,2 + call $L$key_expansion_256a + DB 102,15,58,223,200,2 + call $L$key_expansion_256b + DB 102,15,58,223,202,4 + call $L$key_expansion_256a + DB 102,15,58,223,200,4 + call $L$key_expansion_256b + DB 102,15,58,223,202,8 + call $L$key_expansion_256a + DB 102,15,58,223,200,8 + call $L$key_expansion_256b + DB 102,15,58,223,202,16 + call $L$key_expansion_256a + DB 102,15,58,223,200,16 + call $L$key_expansion_256b + DB 102,15,58,223,202,32 + call $L$key_expansion_256a + DB 102,15,58,223,200,32 + call $L$key_expansion_256b + DB 102,15,58,223,202,64 + call $L$key_expansion_256a + movups XMMWORD[rax],xmm0 + mov DWORD[16+rax],edx + xor rax,rax + jmp NEAR $L$enc_key_ret + +ALIGN 16 +$L$bad_keybits: + mov rax,-2 +$L$enc_key_ret: + pxor xmm0,xmm0 + pxor xmm1,xmm1 + pxor xmm2,xmm2 + pxor xmm3,xmm3 + pxor xmm4,xmm4 + pxor xmm5,xmm5 + add rsp,8 + + ret + +$L$SEH_end_aes_hw_set_encrypt_key_base_4: + +ALIGN 16 +$L$key_expansion_128: + + movups XMMWORD[rax],xmm0 + lea rax,[16+rax] +$L$key_expansion_128_cold: + shufps xmm4,xmm0,16 + xorps xmm0,xmm4 + shufps xmm4,xmm0,140 + xorps xmm0,xmm4 + shufps xmm1,xmm1,255 + xorps xmm0,xmm1 + ret + + +ALIGN 16 +$L$key_expansion_256a: + + movups XMMWORD[rax],xmm2 + lea rax,[16+rax] +$L$key_expansion_256a_cold: + shufps xmm4,xmm0,16 + xorps xmm0,xmm4 + shufps xmm4,xmm0,140 + xorps xmm0,xmm4 + shufps xmm1,xmm1,255 + xorps xmm0,xmm1 + ret + + +ALIGN 16 +$L$key_expansion_256b: + + movups XMMWORD[rax],xmm0 + lea rax,[16+rax] + + shufps xmm4,xmm2,16 + xorps xmm2,xmm4 + shufps xmm4,xmm2,140 + xorps xmm2,xmm4 + shufps xmm1,xmm1,170 + xorps xmm2,xmm1 + ret + + + +global aes_hw_set_encrypt_key_alt + +ALIGN 16 +aes_hw_set_encrypt_key_alt: + +$L$SEH_begin_aes_hw_set_encrypt_key_alt_1: +_CET_ENDBR +%ifdef BORINGSSL_DISPATCH_TEST + mov BYTE[((BORINGSSL_function_hit+3))],1 +%endif + sub rsp,8 + +$L$SEH_prologue_aes_hw_set_encrypt_key_alt_2: +$L$SEH_endprologue_aes_hw_set_encrypt_key_alt_3: + movups xmm0,XMMWORD[rcx] + xorps xmm4,xmm4 + lea rax,[16+r8] + cmp edx,256 + je NEAR $L$14rounds_alt + + cmp edx,128 + jne NEAR $L$bad_keybits_alt + + mov edx,9 + movdqa xmm5,XMMWORD[$L$key_rotate] + mov r10d,8 + movdqa xmm4,XMMWORD[$L$key_rcon1] + movdqa xmm2,xmm0 + movdqu XMMWORD[r8],xmm0 + jmp NEAR $L$oop_key128 + +ALIGN 16 +$L$oop_key128: +DB 102,15,56,0,197 + DB 102,15,56,221,196 + pslld xmm4,1 + lea rax,[16+rax] + + movdqa xmm3,xmm2 + pslldq xmm2,4 + pxor xmm3,xmm2 + pslldq xmm2,4 + pxor xmm3,xmm2 + pslldq xmm2,4 + pxor xmm2,xmm3 + + pxor xmm0,xmm2 + movdqu XMMWORD[(-16)+rax],xmm0 + movdqa 
xmm2,xmm0 + + dec r10d + jnz NEAR $L$oop_key128 + + movdqa xmm4,XMMWORD[$L$key_rcon1b] + +DB 102,15,56,0,197 + DB 102,15,56,221,196 + pslld xmm4,1 + + movdqa xmm3,xmm2 + pslldq xmm2,4 + pxor xmm3,xmm2 + pslldq xmm2,4 + pxor xmm3,xmm2 + pslldq xmm2,4 + pxor xmm2,xmm3 + + pxor xmm0,xmm2 + movdqu XMMWORD[rax],xmm0 + + movdqa xmm2,xmm0 +DB 102,15,56,0,197 + DB 102,15,56,221,196 + + movdqa xmm3,xmm2 + pslldq xmm2,4 + pxor xmm3,xmm2 + pslldq xmm2,4 + pxor xmm3,xmm2 + pslldq xmm2,4 + pxor xmm2,xmm3 + + pxor xmm0,xmm2 + movdqu XMMWORD[16+rax],xmm0 + + mov DWORD[96+rax],edx + xor eax,eax + jmp NEAR $L$enc_key_ret_alt + + + +ALIGN 16 +$L$14rounds_alt: + movups xmm2,XMMWORD[16+rcx] + mov edx,13 + lea rax,[16+rax] + movdqa xmm5,XMMWORD[$L$key_rotate] + movdqa xmm4,XMMWORD[$L$key_rcon1] + mov r10d,7 + movdqu XMMWORD[r8],xmm0 + movdqa xmm1,xmm2 + movdqu XMMWORD[16+r8],xmm2 + jmp NEAR $L$oop_key256 + +ALIGN 16 +$L$oop_key256: +DB 102,15,56,0,213 + DB 102,15,56,221,212 + + movdqa xmm3,xmm0 + pslldq xmm0,4 + pxor xmm3,xmm0 + pslldq xmm0,4 + pxor xmm3,xmm0 + pslldq xmm0,4 + pxor xmm0,xmm3 + pslld xmm4,1 + + pxor xmm0,xmm2 + movdqu XMMWORD[rax],xmm0 + + dec r10d + jz NEAR $L$done_key256 + + pshufd xmm2,xmm0,0xff + pxor xmm3,xmm3 + DB 102,15,56,221,211 + + movdqa xmm3,xmm1 + pslldq xmm1,4 + pxor xmm3,xmm1 + pslldq xmm1,4 + pxor xmm3,xmm1 + pslldq xmm1,4 + pxor xmm1,xmm3 + + pxor xmm2,xmm1 + movdqu XMMWORD[16+rax],xmm2 + lea rax,[32+rax] + movdqa xmm1,xmm2 + + jmp NEAR $L$oop_key256 + +$L$done_key256: + mov DWORD[16+rax],edx + xor eax,eax + jmp NEAR $L$enc_key_ret_alt + +ALIGN 16 +$L$bad_keybits_alt: + mov rax,-2 +$L$enc_key_ret_alt: + pxor xmm0,xmm0 + pxor xmm1,xmm1 + pxor xmm2,xmm2 + pxor xmm3,xmm3 + pxor xmm4,xmm4 + pxor xmm5,xmm5 + add rsp,8 + + ret + +$L$SEH_end_aes_hw_set_encrypt_key_alt_4: + +section .rdata rdata align=8 +ALIGN 64 +$L$bswap_mask: + DB 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 +$L$increment32: + DD 6,6,6,0 +$L$increment64: + DD 1,0,0,0 +$L$increment1: + DB 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 +$L$key_rotate: + DD 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d +$L$key_rotate192: + DD 0x04070605,0x04070605,0x04070605,0x04070605 +$L$key_rcon1: + DD 1,1,1,1 +$L$key_rcon1b: + DD 0x1b,0x1b,0x1b,0x1b + + DB 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69 + DB 83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83 + DB 32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115 + DB 115,108,46,111,114,103,62,0 +ALIGN 64 +section .text + +EXTERN __imp_RtlVirtualUnwind + +ALIGN 16 +ctr_xts_se_handler: + push rsi + push rdi + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + pushfq + sub rsp,64 + + mov rax,QWORD[120+r8] + mov rbx,QWORD[248+r8] + + mov rsi,QWORD[8+r9] + mov r11,QWORD[56+r9] + + mov r10d,DWORD[r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jb NEAR $L$common_seh_tail + + mov rax,QWORD[152+r8] + + mov r10d,DWORD[4+r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jae NEAR $L$common_seh_tail + + mov rax,QWORD[208+r8] + + lea rsi,[((-168))+rax] + lea rdi,[512+r8] + mov ecx,20 + DD 0xa548f3fc + + mov rbp,QWORD[((-8))+rax] + mov QWORD[160+r8],rbp + + +$L$common_seh_tail: + mov rdi,QWORD[8+rax] + mov rsi,QWORD[16+rax] + mov QWORD[152+r8],rax + mov QWORD[168+r8],rsi + mov QWORD[176+r8],rdi + + mov rdi,QWORD[40+r9] + mov rsi,r8 + mov ecx,154 + DD 0xa548f3fc + + mov rsi,r9 + xor rcx,rcx + mov rdx,QWORD[8+rsi] + mov r8,QWORD[rsi] + mov r9,QWORD[16+rsi] + mov r10,QWORD[40+rsi] + lea r11,[56+rsi] + lea r12,[24+rsi] + mov QWORD[32+rsp],r10 + mov QWORD[40+rsp],r11 + mov QWORD[48+rsp],r12 + mov 
QWORD[56+rsp],rcx + call QWORD[__imp_RtlVirtualUnwind] + + mov eax,1 + add rsp,64 + popfq + pop r15 + pop r14 + pop r13 + pop r12 + pop rbp + pop rbx + pop rdi + pop rsi + ret + + +section .pdata rdata align=4 +ALIGN 4 + DD $L$SEH_begin_aes_hw_ctr32_encrypt_blocks wrt ..imagebase + DD $L$SEH_end_aes_hw_ctr32_encrypt_blocks wrt ..imagebase + DD $L$SEH_info_ctr32 wrt ..imagebase +section .xdata rdata align=8 +ALIGN 8 +$L$SEH_info_ctr32: + DB 9,0,0,0 + DD ctr_xts_se_handler wrt ..imagebase + DD $L$ctr32_body wrt ..imagebase,$L$ctr32_epilogue wrt ..imagebase +section .pdata +ALIGN 4 + DD $L$SEH_begin_aes_hw_set_encrypt_key_base_1 wrt ..imagebase + DD $L$SEH_end_aes_hw_set_encrypt_key_base_4 wrt ..imagebase + DD $L$SEH_info_aes_hw_set_encrypt_key_base_0 wrt ..imagebase + + DD $L$SEH_begin_aes_hw_set_encrypt_key_alt_1 wrt ..imagebase + DD $L$SEH_end_aes_hw_set_encrypt_key_alt_4 wrt ..imagebase + DD $L$SEH_info_aes_hw_set_encrypt_key_alt_0 wrt ..imagebase + + +section .xdata +ALIGN 4 +$L$SEH_info_aes_hw_set_encrypt_key_base_0: + DB 1 + DB $L$SEH_endprologue_aes_hw_set_encrypt_key_base_3-$L$SEH_begin_aes_hw_set_encrypt_key_base_1 + DB 1 + DB 0 + DB $L$SEH_prologue_aes_hw_set_encrypt_key_base_2-$L$SEH_begin_aes_hw_set_encrypt_key_base_1 + DB 2 + + DW 0 +$L$SEH_info_aes_hw_set_encrypt_key_alt_0: + DB 1 + DB $L$SEH_endprologue_aes_hw_set_encrypt_key_alt_3-$L$SEH_begin_aes_hw_set_encrypt_key_alt_1 + DB 1 + DB 0 + DB $L$SEH_prologue_aes_hw_set_encrypt_key_alt_2-$L$SEH_begin_aes_hw_set_encrypt_key_alt_1 + DB 2 + + DW 0 +%else +; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 +ret +%endif diff --git a/ring-0.17.14/pregenerated/aesni-x86_64-nasm.o b/ring-0.17.14/pregenerated/aesni-x86_64-nasm.o new file mode 100644 index 0000000000..f0a1501b54 Binary files /dev/null and b/ring-0.17.14/pregenerated/aesni-x86_64-nasm.o differ diff --git a/ring-0.17.14/pregenerated/aesv8-armx-ios64.S b/ring-0.17.14/pregenerated/aesv8-armx-ios64.S new file mode 100644 index 0000000000..75c626a017 --- /dev/null +++ b/ring-0.17.14/pregenerated/aesv8-armx-ios64.S @@ -0,0 +1,353 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__) +#if __ARM_MAX_ARCH__>=7 +.text + +.section __TEXT,__const +.align 5 +Lrcon: +.long 0x01,0x01,0x01,0x01 +.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat +.long 0x1b,0x1b,0x1b,0x1b + +.text + +.globl _aes_hw_set_encrypt_key +.private_extern _aes_hw_set_encrypt_key + +.align 5 +_aes_hw_set_encrypt_key: +Lenc_key: + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + AARCH64_VALID_CALL_TARGET + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + mov x3,#-2 + cmp w1,#128 + b.lt Lenc_key_abort + cmp w1,#256 + b.gt Lenc_key_abort + tst w1,#0x3f + b.ne Lenc_key_abort + + adrp x3,Lrcon@PAGE + add x3,x3,Lrcon@PAGEOFF + cmp w1,#192 + + eor v0.16b,v0.16b,v0.16b + ld1 {v3.16b},[x0],#16 + mov w1,#8 // reuse w1 + ld1 {v1.4s,v2.4s},[x3],#32 + + b.lt Loop128 + // 192-bit key support was removed. 
+ b L256 + +.align 4 +Loop128: + tbl v6.16b,{v3.16b},v2.16b + ext v5.16b,v0.16b,v3.16b,#12 + st1 {v3.4s},[x2],#16 + aese v6.16b,v0.16b + subs w1,w1,#1 + + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v6.16b,v6.16b,v1.16b + eor v3.16b,v3.16b,v5.16b + shl v1.16b,v1.16b,#1 + eor v3.16b,v3.16b,v6.16b + b.ne Loop128 + + ld1 {v1.4s},[x3] + + tbl v6.16b,{v3.16b},v2.16b + ext v5.16b,v0.16b,v3.16b,#12 + st1 {v3.4s},[x2],#16 + aese v6.16b,v0.16b + + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v6.16b,v6.16b,v1.16b + eor v3.16b,v3.16b,v5.16b + shl v1.16b,v1.16b,#1 + eor v3.16b,v3.16b,v6.16b + + tbl v6.16b,{v3.16b},v2.16b + ext v5.16b,v0.16b,v3.16b,#12 + st1 {v3.4s},[x2],#16 + aese v6.16b,v0.16b + + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v6.16b,v6.16b,v1.16b + eor v3.16b,v3.16b,v5.16b + eor v3.16b,v3.16b,v6.16b + st1 {v3.4s},[x2] + add x2,x2,#0x50 + + mov w12,#10 + b Ldone + +// 192-bit key support was removed. + +.align 4 +L256: + ld1 {v4.16b},[x0] + mov w1,#7 + mov w12,#14 + st1 {v3.4s},[x2],#16 + +Loop256: + tbl v6.16b,{v4.16b},v2.16b + ext v5.16b,v0.16b,v3.16b,#12 + st1 {v4.4s},[x2],#16 + aese v6.16b,v0.16b + subs w1,w1,#1 + + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v6.16b,v6.16b,v1.16b + eor v3.16b,v3.16b,v5.16b + shl v1.16b,v1.16b,#1 + eor v3.16b,v3.16b,v6.16b + st1 {v3.4s},[x2],#16 + b.eq Ldone + + dup v6.4s,v3.s[3] // just splat + ext v5.16b,v0.16b,v4.16b,#12 + aese v6.16b,v0.16b + + eor v4.16b,v4.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v4.16b,v4.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v4.16b,v4.16b,v5.16b + + eor v4.16b,v4.16b,v6.16b + b Loop256 + +Ldone: + str w12,[x2] + mov x3,#0 + +Lenc_key_abort: + mov x0,x3 // return value + ldr x29,[sp],#16 + ret + +.globl _aes_hw_ctr32_encrypt_blocks +.private_extern _aes_hw_ctr32_encrypt_blocks + +.align 5 +_aes_hw_ctr32_encrypt_blocks: + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + AARCH64_VALID_CALL_TARGET + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + ldr w5,[x3,#240] + + ldr w8, [x4, #12] + ld1 {v0.4s},[x4] + + ld1 {v16.4s,v17.4s},[x3] // load key schedule... + sub w5,w5,#4 + mov x12,#16 + cmp x2,#2 + add x7,x3,x5,lsl#4 // pointer to last 5 round keys + sub w5,w5,#2 + ld1 {v20.4s,v21.4s},[x7],#32 + ld1 {v22.4s,v23.4s},[x7],#32 + ld1 {v7.4s},[x7] + add x7,x3,#32 + mov w6,w5 + csel x12,xzr,x12,lo + + // ARM Cortex-A57 and Cortex-A72 cores running in 32-bit mode are + // affected by silicon errata #1742098 [0] and #1655431 [1], + // respectively, where the second instruction of an aese/aesmc + // instruction pair may execute twice if an interrupt is taken right + // after the first instruction consumes an input register of which a + // single 32-bit lane has been updated the last time it was modified. + // + // This function uses a counter in one 32-bit lane. The vmov lines + // could write to v1.16b and v18.16b directly, but that trips this bugs. + // We write to v6.16b and copy to the final register as a workaround. 
+ // + // [0] ARM-EPM-049219 v23 Cortex-A57 MPCore Software Developers Errata Notice + // [1] ARM-EPM-012079 v11.0 Cortex-A72 MPCore Software Developers Errata Notice +#ifndef __AARCH64EB__ + rev w8, w8 +#endif + add w10, w8, #1 + orr v6.16b,v0.16b,v0.16b + rev w10, w10 + mov v6.s[3],w10 + add w8, w8, #2 + orr v1.16b,v6.16b,v6.16b + b.ls Lctr32_tail + rev w12, w8 + mov v6.s[3],w12 + sub x2,x2,#3 // bias + orr v18.16b,v6.16b,v6.16b + b Loop3x_ctr32 + +.align 4 +Loop3x_ctr32: + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + aese v1.16b,v16.16b + aesmc v1.16b,v1.16b + aese v18.16b,v16.16b + aesmc v18.16b,v18.16b + ld1 {v16.4s},[x7],#16 + subs w6,w6,#2 + aese v0.16b,v17.16b + aesmc v0.16b,v0.16b + aese v1.16b,v17.16b + aesmc v1.16b,v1.16b + aese v18.16b,v17.16b + aesmc v18.16b,v18.16b + ld1 {v17.4s},[x7],#16 + b.gt Loop3x_ctr32 + + aese v0.16b,v16.16b + aesmc v4.16b,v0.16b + aese v1.16b,v16.16b + aesmc v5.16b,v1.16b + ld1 {v2.16b},[x0],#16 + add w9,w8,#1 + aese v18.16b,v16.16b + aesmc v18.16b,v18.16b + ld1 {v3.16b},[x0],#16 + rev w9,w9 + aese v4.16b,v17.16b + aesmc v4.16b,v4.16b + aese v5.16b,v17.16b + aesmc v5.16b,v5.16b + ld1 {v19.16b},[x0],#16 + mov x7,x3 + aese v18.16b,v17.16b + aesmc v17.16b,v18.16b + aese v4.16b,v20.16b + aesmc v4.16b,v4.16b + aese v5.16b,v20.16b + aesmc v5.16b,v5.16b + eor v2.16b,v2.16b,v7.16b + add w10,w8,#2 + aese v17.16b,v20.16b + aesmc v17.16b,v17.16b + eor v3.16b,v3.16b,v7.16b + add w8,w8,#3 + aese v4.16b,v21.16b + aesmc v4.16b,v4.16b + aese v5.16b,v21.16b + aesmc v5.16b,v5.16b + // Note the logic to update v0.16b, v1.16b, and v1.16b is written to work + // around a bug in ARM Cortex-A57 and Cortex-A72 cores running in + // 32-bit mode. See the comment above. + eor v19.16b,v19.16b,v7.16b + mov v6.s[3], w9 + aese v17.16b,v21.16b + aesmc v17.16b,v17.16b + orr v0.16b,v6.16b,v6.16b + rev w10,w10 + aese v4.16b,v22.16b + aesmc v4.16b,v4.16b + mov v6.s[3], w10 + rev w12,w8 + aese v5.16b,v22.16b + aesmc v5.16b,v5.16b + orr v1.16b,v6.16b,v6.16b + mov v6.s[3], w12 + aese v17.16b,v22.16b + aesmc v17.16b,v17.16b + orr v18.16b,v6.16b,v6.16b + subs x2,x2,#3 + aese v4.16b,v23.16b + aese v5.16b,v23.16b + aese v17.16b,v23.16b + + eor v2.16b,v2.16b,v4.16b + ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0] + st1 {v2.16b},[x1],#16 + eor v3.16b,v3.16b,v5.16b + mov w6,w5 + st1 {v3.16b},[x1],#16 + eor v19.16b,v19.16b,v17.16b + ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1] + st1 {v19.16b},[x1],#16 + b.hs Loop3x_ctr32 + + adds x2,x2,#3 + b.eq Lctr32_done + cmp x2,#1 + mov x12,#16 + csel x12,xzr,x12,eq + +Lctr32_tail: + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + aese v1.16b,v16.16b + aesmc v1.16b,v1.16b + ld1 {v16.4s},[x7],#16 + subs w6,w6,#2 + aese v0.16b,v17.16b + aesmc v0.16b,v0.16b + aese v1.16b,v17.16b + aesmc v1.16b,v1.16b + ld1 {v17.4s},[x7],#16 + b.gt Lctr32_tail + + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + aese v1.16b,v16.16b + aesmc v1.16b,v1.16b + aese v0.16b,v17.16b + aesmc v0.16b,v0.16b + aese v1.16b,v17.16b + aesmc v1.16b,v1.16b + ld1 {v2.16b},[x0],x12 + aese v0.16b,v20.16b + aesmc v0.16b,v0.16b + aese v1.16b,v20.16b + aesmc v1.16b,v1.16b + ld1 {v3.16b},[x0] + aese v0.16b,v21.16b + aesmc v0.16b,v0.16b + aese v1.16b,v21.16b + aesmc v1.16b,v1.16b + eor v2.16b,v2.16b,v7.16b + aese v0.16b,v22.16b + aesmc v0.16b,v0.16b + aese v1.16b,v22.16b + aesmc v1.16b,v1.16b + eor v3.16b,v3.16b,v7.16b + aese v0.16b,v23.16b + aese v1.16b,v23.16b + + cmp x2,#1 + eor v2.16b,v2.16b,v0.16b + eor v3.16b,v3.16b,v1.16b + st1 {v2.16b},[x1],#16 + b.eq Lctr32_done + st1 {v3.16b},[x1] + +Lctr32_done: + ldr 
x29,[sp],#16 + ret + +#endif +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__) diff --git a/ring-0.17.14/pregenerated/aesv8-armx-linux64.S b/ring-0.17.14/pregenerated/aesv8-armx-linux64.S new file mode 100644 index 0000000000..5ed5a72347 --- /dev/null +++ b/ring-0.17.14/pregenerated/aesv8-armx-linux64.S @@ -0,0 +1,353 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__) +#if __ARM_MAX_ARCH__>=7 +.text +.arch armv8-a+crypto +.section .rodata +.align 5 +.Lrcon: +.long 0x01,0x01,0x01,0x01 +.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat +.long 0x1b,0x1b,0x1b,0x1b + +.text + +.globl aes_hw_set_encrypt_key +.hidden aes_hw_set_encrypt_key +.type aes_hw_set_encrypt_key,%function +.align 5 +aes_hw_set_encrypt_key: +.Lenc_key: + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + AARCH64_VALID_CALL_TARGET + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + mov x3,#-2 + cmp w1,#128 + b.lt .Lenc_key_abort + cmp w1,#256 + b.gt .Lenc_key_abort + tst w1,#0x3f + b.ne .Lenc_key_abort + + adrp x3,.Lrcon + add x3,x3,:lo12:.Lrcon + cmp w1,#192 + + eor v0.16b,v0.16b,v0.16b + ld1 {v3.16b},[x0],#16 + mov w1,#8 // reuse w1 + ld1 {v1.4s,v2.4s},[x3],#32 + + b.lt .Loop128 + // 192-bit key support was removed. + b .L256 + +.align 4 +.Loop128: + tbl v6.16b,{v3.16b},v2.16b + ext v5.16b,v0.16b,v3.16b,#12 + st1 {v3.4s},[x2],#16 + aese v6.16b,v0.16b + subs w1,w1,#1 + + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v6.16b,v6.16b,v1.16b + eor v3.16b,v3.16b,v5.16b + shl v1.16b,v1.16b,#1 + eor v3.16b,v3.16b,v6.16b + b.ne .Loop128 + + ld1 {v1.4s},[x3] + + tbl v6.16b,{v3.16b},v2.16b + ext v5.16b,v0.16b,v3.16b,#12 + st1 {v3.4s},[x2],#16 + aese v6.16b,v0.16b + + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v6.16b,v6.16b,v1.16b + eor v3.16b,v3.16b,v5.16b + shl v1.16b,v1.16b,#1 + eor v3.16b,v3.16b,v6.16b + + tbl v6.16b,{v3.16b},v2.16b + ext v5.16b,v0.16b,v3.16b,#12 + st1 {v3.4s},[x2],#16 + aese v6.16b,v0.16b + + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v6.16b,v6.16b,v1.16b + eor v3.16b,v3.16b,v5.16b + eor v3.16b,v3.16b,v6.16b + st1 {v3.4s},[x2] + add x2,x2,#0x50 + + mov w12,#10 + b .Ldone + +// 192-bit key support was removed. 
+ +.align 4 +.L256: + ld1 {v4.16b},[x0] + mov w1,#7 + mov w12,#14 + st1 {v3.4s},[x2],#16 + +.Loop256: + tbl v6.16b,{v4.16b},v2.16b + ext v5.16b,v0.16b,v3.16b,#12 + st1 {v4.4s},[x2],#16 + aese v6.16b,v0.16b + subs w1,w1,#1 + + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v6.16b,v6.16b,v1.16b + eor v3.16b,v3.16b,v5.16b + shl v1.16b,v1.16b,#1 + eor v3.16b,v3.16b,v6.16b + st1 {v3.4s},[x2],#16 + b.eq .Ldone + + dup v6.4s,v3.s[3] // just splat + ext v5.16b,v0.16b,v4.16b,#12 + aese v6.16b,v0.16b + + eor v4.16b,v4.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v4.16b,v4.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v4.16b,v4.16b,v5.16b + + eor v4.16b,v4.16b,v6.16b + b .Loop256 + +.Ldone: + str w12,[x2] + mov x3,#0 + +.Lenc_key_abort: + mov x0,x3 // return value + ldr x29,[sp],#16 + ret +.size aes_hw_set_encrypt_key,.-aes_hw_set_encrypt_key +.globl aes_hw_ctr32_encrypt_blocks +.hidden aes_hw_ctr32_encrypt_blocks +.type aes_hw_ctr32_encrypt_blocks,%function +.align 5 +aes_hw_ctr32_encrypt_blocks: + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + AARCH64_VALID_CALL_TARGET + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + ldr w5,[x3,#240] + + ldr w8, [x4, #12] + ld1 {v0.4s},[x4] + + ld1 {v16.4s,v17.4s},[x3] // load key schedule... + sub w5,w5,#4 + mov x12,#16 + cmp x2,#2 + add x7,x3,x5,lsl#4 // pointer to last 5 round keys + sub w5,w5,#2 + ld1 {v20.4s,v21.4s},[x7],#32 + ld1 {v22.4s,v23.4s},[x7],#32 + ld1 {v7.4s},[x7] + add x7,x3,#32 + mov w6,w5 + csel x12,xzr,x12,lo + + // ARM Cortex-A57 and Cortex-A72 cores running in 32-bit mode are + // affected by silicon errata #1742098 [0] and #1655431 [1], + // respectively, where the second instruction of an aese/aesmc + // instruction pair may execute twice if an interrupt is taken right + // after the first instruction consumes an input register of which a + // single 32-bit lane has been updated the last time it was modified. + // + // This function uses a counter in one 32-bit lane. The vmov lines + // could write to v1.16b and v18.16b directly, but that trips this bugs. + // We write to v6.16b and copy to the final register as a workaround. 
+ // + // [0] ARM-EPM-049219 v23 Cortex-A57 MPCore Software Developers Errata Notice + // [1] ARM-EPM-012079 v11.0 Cortex-A72 MPCore Software Developers Errata Notice +#ifndef __AARCH64EB__ + rev w8, w8 +#endif + add w10, w8, #1 + orr v6.16b,v0.16b,v0.16b + rev w10, w10 + mov v6.s[3],w10 + add w8, w8, #2 + orr v1.16b,v6.16b,v6.16b + b.ls .Lctr32_tail + rev w12, w8 + mov v6.s[3],w12 + sub x2,x2,#3 // bias + orr v18.16b,v6.16b,v6.16b + b .Loop3x_ctr32 + +.align 4 +.Loop3x_ctr32: + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + aese v1.16b,v16.16b + aesmc v1.16b,v1.16b + aese v18.16b,v16.16b + aesmc v18.16b,v18.16b + ld1 {v16.4s},[x7],#16 + subs w6,w6,#2 + aese v0.16b,v17.16b + aesmc v0.16b,v0.16b + aese v1.16b,v17.16b + aesmc v1.16b,v1.16b + aese v18.16b,v17.16b + aesmc v18.16b,v18.16b + ld1 {v17.4s},[x7],#16 + b.gt .Loop3x_ctr32 + + aese v0.16b,v16.16b + aesmc v4.16b,v0.16b + aese v1.16b,v16.16b + aesmc v5.16b,v1.16b + ld1 {v2.16b},[x0],#16 + add w9,w8,#1 + aese v18.16b,v16.16b + aesmc v18.16b,v18.16b + ld1 {v3.16b},[x0],#16 + rev w9,w9 + aese v4.16b,v17.16b + aesmc v4.16b,v4.16b + aese v5.16b,v17.16b + aesmc v5.16b,v5.16b + ld1 {v19.16b},[x0],#16 + mov x7,x3 + aese v18.16b,v17.16b + aesmc v17.16b,v18.16b + aese v4.16b,v20.16b + aesmc v4.16b,v4.16b + aese v5.16b,v20.16b + aesmc v5.16b,v5.16b + eor v2.16b,v2.16b,v7.16b + add w10,w8,#2 + aese v17.16b,v20.16b + aesmc v17.16b,v17.16b + eor v3.16b,v3.16b,v7.16b + add w8,w8,#3 + aese v4.16b,v21.16b + aesmc v4.16b,v4.16b + aese v5.16b,v21.16b + aesmc v5.16b,v5.16b + // Note the logic to update v0.16b, v1.16b, and v1.16b is written to work + // around a bug in ARM Cortex-A57 and Cortex-A72 cores running in + // 32-bit mode. See the comment above. + eor v19.16b,v19.16b,v7.16b + mov v6.s[3], w9 + aese v17.16b,v21.16b + aesmc v17.16b,v17.16b + orr v0.16b,v6.16b,v6.16b + rev w10,w10 + aese v4.16b,v22.16b + aesmc v4.16b,v4.16b + mov v6.s[3], w10 + rev w12,w8 + aese v5.16b,v22.16b + aesmc v5.16b,v5.16b + orr v1.16b,v6.16b,v6.16b + mov v6.s[3], w12 + aese v17.16b,v22.16b + aesmc v17.16b,v17.16b + orr v18.16b,v6.16b,v6.16b + subs x2,x2,#3 + aese v4.16b,v23.16b + aese v5.16b,v23.16b + aese v17.16b,v23.16b + + eor v2.16b,v2.16b,v4.16b + ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0] + st1 {v2.16b},[x1],#16 + eor v3.16b,v3.16b,v5.16b + mov w6,w5 + st1 {v3.16b},[x1],#16 + eor v19.16b,v19.16b,v17.16b + ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1] + st1 {v19.16b},[x1],#16 + b.hs .Loop3x_ctr32 + + adds x2,x2,#3 + b.eq .Lctr32_done + cmp x2,#1 + mov x12,#16 + csel x12,xzr,x12,eq + +.Lctr32_tail: + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + aese v1.16b,v16.16b + aesmc v1.16b,v1.16b + ld1 {v16.4s},[x7],#16 + subs w6,w6,#2 + aese v0.16b,v17.16b + aesmc v0.16b,v0.16b + aese v1.16b,v17.16b + aesmc v1.16b,v1.16b + ld1 {v17.4s},[x7],#16 + b.gt .Lctr32_tail + + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + aese v1.16b,v16.16b + aesmc v1.16b,v1.16b + aese v0.16b,v17.16b + aesmc v0.16b,v0.16b + aese v1.16b,v17.16b + aesmc v1.16b,v1.16b + ld1 {v2.16b},[x0],x12 + aese v0.16b,v20.16b + aesmc v0.16b,v0.16b + aese v1.16b,v20.16b + aesmc v1.16b,v1.16b + ld1 {v3.16b},[x0] + aese v0.16b,v21.16b + aesmc v0.16b,v0.16b + aese v1.16b,v21.16b + aesmc v1.16b,v1.16b + eor v2.16b,v2.16b,v7.16b + aese v0.16b,v22.16b + aesmc v0.16b,v0.16b + aese v1.16b,v22.16b + aesmc v1.16b,v1.16b + eor v3.16b,v3.16b,v7.16b + aese v0.16b,v23.16b + aese v1.16b,v23.16b + + cmp x2,#1 + eor v2.16b,v2.16b,v0.16b + eor v3.16b,v3.16b,v1.16b + st1 {v2.16b},[x1],#16 + b.eq .Lctr32_done + st1 {v3.16b},[x1] + 
+.Lctr32_done: + ldr x29,[sp],#16 + ret +.size aes_hw_ctr32_encrypt_blocks,.-aes_hw_ctr32_encrypt_blocks +#endif +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__) diff --git a/ring-0.17.14/pregenerated/aesv8-armx-win64.S b/ring-0.17.14/pregenerated/aesv8-armx-win64.S new file mode 100644 index 0000000000..70cc6420a6 --- /dev/null +++ b/ring-0.17.14/pregenerated/aesv8-armx-win64.S @@ -0,0 +1,357 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32) +#if __ARM_MAX_ARCH__>=7 +.text +.arch armv8-a+crypto +.section .rodata +.align 5 +Lrcon: +.long 0x01,0x01,0x01,0x01 +.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat +.long 0x1b,0x1b,0x1b,0x1b + +.text + +.globl aes_hw_set_encrypt_key + +.def aes_hw_set_encrypt_key + .type 32 +.endef +.align 5 +aes_hw_set_encrypt_key: +Lenc_key: + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + AARCH64_VALID_CALL_TARGET + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + mov x3,#-2 + cmp w1,#128 + b.lt Lenc_key_abort + cmp w1,#256 + b.gt Lenc_key_abort + tst w1,#0x3f + b.ne Lenc_key_abort + + adrp x3,Lrcon + add x3,x3,:lo12:Lrcon + cmp w1,#192 + + eor v0.16b,v0.16b,v0.16b + ld1 {v3.16b},[x0],#16 + mov w1,#8 // reuse w1 + ld1 {v1.4s,v2.4s},[x3],#32 + + b.lt Loop128 + // 192-bit key support was removed. + b L256 + +.align 4 +Loop128: + tbl v6.16b,{v3.16b},v2.16b + ext v5.16b,v0.16b,v3.16b,#12 + st1 {v3.4s},[x2],#16 + aese v6.16b,v0.16b + subs w1,w1,#1 + + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v6.16b,v6.16b,v1.16b + eor v3.16b,v3.16b,v5.16b + shl v1.16b,v1.16b,#1 + eor v3.16b,v3.16b,v6.16b + b.ne Loop128 + + ld1 {v1.4s},[x3] + + tbl v6.16b,{v3.16b},v2.16b + ext v5.16b,v0.16b,v3.16b,#12 + st1 {v3.4s},[x2],#16 + aese v6.16b,v0.16b + + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v6.16b,v6.16b,v1.16b + eor v3.16b,v3.16b,v5.16b + shl v1.16b,v1.16b,#1 + eor v3.16b,v3.16b,v6.16b + + tbl v6.16b,{v3.16b},v2.16b + ext v5.16b,v0.16b,v3.16b,#12 + st1 {v3.4s},[x2],#16 + aese v6.16b,v0.16b + + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v6.16b,v6.16b,v1.16b + eor v3.16b,v3.16b,v5.16b + eor v3.16b,v3.16b,v6.16b + st1 {v3.4s},[x2] + add x2,x2,#0x50 + + mov w12,#10 + b Ldone + +// 192-bit key support was removed. 
+ +.align 4 +L256: + ld1 {v4.16b},[x0] + mov w1,#7 + mov w12,#14 + st1 {v3.4s},[x2],#16 + +Loop256: + tbl v6.16b,{v4.16b},v2.16b + ext v5.16b,v0.16b,v3.16b,#12 + st1 {v4.4s},[x2],#16 + aese v6.16b,v0.16b + subs w1,w1,#1 + + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v6.16b,v6.16b,v1.16b + eor v3.16b,v3.16b,v5.16b + shl v1.16b,v1.16b,#1 + eor v3.16b,v3.16b,v6.16b + st1 {v3.4s},[x2],#16 + b.eq Ldone + + dup v6.4s,v3.s[3] // just splat + ext v5.16b,v0.16b,v4.16b,#12 + aese v6.16b,v0.16b + + eor v4.16b,v4.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v4.16b,v4.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v4.16b,v4.16b,v5.16b + + eor v4.16b,v4.16b,v6.16b + b Loop256 + +Ldone: + str w12,[x2] + mov x3,#0 + +Lenc_key_abort: + mov x0,x3 // return value + ldr x29,[sp],#16 + ret + +.globl aes_hw_ctr32_encrypt_blocks + +.def aes_hw_ctr32_encrypt_blocks + .type 32 +.endef +.align 5 +aes_hw_ctr32_encrypt_blocks: + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + AARCH64_VALID_CALL_TARGET + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + ldr w5,[x3,#240] + + ldr w8, [x4, #12] + ld1 {v0.4s},[x4] + + ld1 {v16.4s,v17.4s},[x3] // load key schedule... + sub w5,w5,#4 + mov x12,#16 + cmp x2,#2 + add x7,x3,x5,lsl#4 // pointer to last 5 round keys + sub w5,w5,#2 + ld1 {v20.4s,v21.4s},[x7],#32 + ld1 {v22.4s,v23.4s},[x7],#32 + ld1 {v7.4s},[x7] + add x7,x3,#32 + mov w6,w5 + csel x12,xzr,x12,lo + + // ARM Cortex-A57 and Cortex-A72 cores running in 32-bit mode are + // affected by silicon errata #1742098 [0] and #1655431 [1], + // respectively, where the second instruction of an aese/aesmc + // instruction pair may execute twice if an interrupt is taken right + // after the first instruction consumes an input register of which a + // single 32-bit lane has been updated the last time it was modified. + // + // This function uses a counter in one 32-bit lane. The vmov lines + // could write to v1.16b and v18.16b directly, but that trips this bugs. + // We write to v6.16b and copy to the final register as a workaround. 
+ // + // [0] ARM-EPM-049219 v23 Cortex-A57 MPCore Software Developers Errata Notice + // [1] ARM-EPM-012079 v11.0 Cortex-A72 MPCore Software Developers Errata Notice +#ifndef __AARCH64EB__ + rev w8, w8 +#endif + add w10, w8, #1 + orr v6.16b,v0.16b,v0.16b + rev w10, w10 + mov v6.s[3],w10 + add w8, w8, #2 + orr v1.16b,v6.16b,v6.16b + b.ls Lctr32_tail + rev w12, w8 + mov v6.s[3],w12 + sub x2,x2,#3 // bias + orr v18.16b,v6.16b,v6.16b + b Loop3x_ctr32 + +.align 4 +Loop3x_ctr32: + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + aese v1.16b,v16.16b + aesmc v1.16b,v1.16b + aese v18.16b,v16.16b + aesmc v18.16b,v18.16b + ld1 {v16.4s},[x7],#16 + subs w6,w6,#2 + aese v0.16b,v17.16b + aesmc v0.16b,v0.16b + aese v1.16b,v17.16b + aesmc v1.16b,v1.16b + aese v18.16b,v17.16b + aesmc v18.16b,v18.16b + ld1 {v17.4s},[x7],#16 + b.gt Loop3x_ctr32 + + aese v0.16b,v16.16b + aesmc v4.16b,v0.16b + aese v1.16b,v16.16b + aesmc v5.16b,v1.16b + ld1 {v2.16b},[x0],#16 + add w9,w8,#1 + aese v18.16b,v16.16b + aesmc v18.16b,v18.16b + ld1 {v3.16b},[x0],#16 + rev w9,w9 + aese v4.16b,v17.16b + aesmc v4.16b,v4.16b + aese v5.16b,v17.16b + aesmc v5.16b,v5.16b + ld1 {v19.16b},[x0],#16 + mov x7,x3 + aese v18.16b,v17.16b + aesmc v17.16b,v18.16b + aese v4.16b,v20.16b + aesmc v4.16b,v4.16b + aese v5.16b,v20.16b + aesmc v5.16b,v5.16b + eor v2.16b,v2.16b,v7.16b + add w10,w8,#2 + aese v17.16b,v20.16b + aesmc v17.16b,v17.16b + eor v3.16b,v3.16b,v7.16b + add w8,w8,#3 + aese v4.16b,v21.16b + aesmc v4.16b,v4.16b + aese v5.16b,v21.16b + aesmc v5.16b,v5.16b + // Note the logic to update v0.16b, v1.16b, and v1.16b is written to work + // around a bug in ARM Cortex-A57 and Cortex-A72 cores running in + // 32-bit mode. See the comment above. + eor v19.16b,v19.16b,v7.16b + mov v6.s[3], w9 + aese v17.16b,v21.16b + aesmc v17.16b,v17.16b + orr v0.16b,v6.16b,v6.16b + rev w10,w10 + aese v4.16b,v22.16b + aesmc v4.16b,v4.16b + mov v6.s[3], w10 + rev w12,w8 + aese v5.16b,v22.16b + aesmc v5.16b,v5.16b + orr v1.16b,v6.16b,v6.16b + mov v6.s[3], w12 + aese v17.16b,v22.16b + aesmc v17.16b,v17.16b + orr v18.16b,v6.16b,v6.16b + subs x2,x2,#3 + aese v4.16b,v23.16b + aese v5.16b,v23.16b + aese v17.16b,v23.16b + + eor v2.16b,v2.16b,v4.16b + ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0] + st1 {v2.16b},[x1],#16 + eor v3.16b,v3.16b,v5.16b + mov w6,w5 + st1 {v3.16b},[x1],#16 + eor v19.16b,v19.16b,v17.16b + ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1] + st1 {v19.16b},[x1],#16 + b.hs Loop3x_ctr32 + + adds x2,x2,#3 + b.eq Lctr32_done + cmp x2,#1 + mov x12,#16 + csel x12,xzr,x12,eq + +Lctr32_tail: + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + aese v1.16b,v16.16b + aesmc v1.16b,v1.16b + ld1 {v16.4s},[x7],#16 + subs w6,w6,#2 + aese v0.16b,v17.16b + aesmc v0.16b,v0.16b + aese v1.16b,v17.16b + aesmc v1.16b,v1.16b + ld1 {v17.4s},[x7],#16 + b.gt Lctr32_tail + + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + aese v1.16b,v16.16b + aesmc v1.16b,v1.16b + aese v0.16b,v17.16b + aesmc v0.16b,v0.16b + aese v1.16b,v17.16b + aesmc v1.16b,v1.16b + ld1 {v2.16b},[x0],x12 + aese v0.16b,v20.16b + aesmc v0.16b,v0.16b + aese v1.16b,v20.16b + aesmc v1.16b,v1.16b + ld1 {v3.16b},[x0] + aese v0.16b,v21.16b + aesmc v0.16b,v0.16b + aese v1.16b,v21.16b + aesmc v1.16b,v1.16b + eor v2.16b,v2.16b,v7.16b + aese v0.16b,v22.16b + aesmc v0.16b,v0.16b + aese v1.16b,v22.16b + aesmc v1.16b,v1.16b + eor v3.16b,v3.16b,v7.16b + aese v0.16b,v23.16b + aese v1.16b,v23.16b + + cmp x2,#1 + eor v2.16b,v2.16b,v0.16b + eor v3.16b,v3.16b,v1.16b + st1 {v2.16b},[x1],#16 + b.eq Lctr32_done + st1 {v3.16b},[x1] + +Lctr32_done: + ldr 
x29,[sp],#16 + ret + +#endif +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32) diff --git a/ring-0.17.14/pregenerated/aesv8-gcm-armv8-ios64.S b/ring-0.17.14/pregenerated/aesv8-gcm-armv8-ios64.S new file mode 100644 index 0000000000..1450b89446 --- /dev/null +++ b/ring-0.17.14/pregenerated/aesv8-gcm-armv8-ios64.S @@ -0,0 +1,1554 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__) +#if __ARM_MAX_ARCH__ >= 8 + + +.text +.globl _aes_gcm_enc_kernel +.private_extern _aes_gcm_enc_kernel + +.align 4 +_aes_gcm_enc_kernel: + AARCH64_SIGN_LINK_REGISTER + stp x29, x30, [sp, #-128]! + mov x29, sp + stp x19, x20, [sp, #16] + mov x16, x4 + mov x8, x5 + stp x21, x22, [sp, #32] + stp x23, x24, [sp, #48] + stp d8, d9, [sp, #64] + stp d10, d11, [sp, #80] + stp d12, d13, [sp, #96] + stp d14, d15, [sp, #112] + ldr w17, [x8, #240] + add x19, x8, x17, lsl #4 // borrow input_l1 for last key + ldp x13, x14, [x19] // load round N keys + ldr q31, [x19, #-16] // load round N-1 keys + add x4, x0, x1, lsr #3 // end_input_ptr + lsr x5, x1, #3 // byte_len + mov x15, x5 + ldp x10, x11, [x16] // ctr96_b64, ctr96_t32 + ld1 { v0.16b}, [x16] // special case vector load initial counter so we can start first AES block as quickly as possible + sub x5, x5, #1 // byte_len - 1 + ldr q18, [x8, #0] // load rk0 + and x5, x5, #0xffffffffffffffc0 // number of bytes to be processed in main loop (at least 1 byte must be handled by tail) + ldr q25, [x8, #112] // load rk7 + add x5, x5, x0 + lsr x12, x11, #32 + fmov d2, x10 // CTR block 2 + orr w11, w11, w11 + rev w12, w12 // rev_ctr32 + fmov d1, x10 // CTR block 1 + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b // AES block 0 - round 0 + add w12, w12, #1 // increment rev_ctr32 + rev w9, w12 // CTR block 1 + fmov d3, x10 // CTR block 3 + orr x9, x11, x9, lsl #32 // CTR block 1 + add w12, w12, #1 // CTR block 1 + ldr q19, [x8, #16] // load rk1 + fmov v1.d[1], x9 // CTR block 1 + rev w9, w12 // CTR block 2 + add w12, w12, #1 // CTR block 2 + orr x9, x11, x9, lsl #32 // CTR block 2 + ldr q20, [x8, #32] // load rk2 + fmov v2.d[1], x9 // CTR block 2 + rev w9, w12 // CTR block 3 + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b // AES block 0 - round 1 + orr x9, x11, x9, lsl #32 // CTR block 3 + fmov v3.d[1], x9 // CTR block 3 + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b // AES block 1 - round 0 + ldr q21, [x8, #48] // load rk3 + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b // AES block 0 - round 2 + ldr q24, [x8, #96] // load rk6 + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b // AES block 2 - round 0 + ldr q23, [x8, #80] // load rk5 + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b // AES block 1 - round 1 + ldr q14, [x6, #48] // load h3l | h3h + ext v14.16b, v14.16b, v14.16b, #8 + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b // AES block 3 - round 0 + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b // AES block 2 - round 1 + ldr q22, [x8, #64] // load rk4 + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b // AES block 1 - round 2 + ldr q13, [x6, #32] // load h2l | h2h + ext v13.16b, v13.16b, v13.16b, #8 + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b // AES block 3 - round 1 + ldr q30, [x8, #192] // load rk12 + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b // AES block 2 - round 2 + ldr q15, [x6, #80] // load h4l | h4h + ext v15.16b, v15.16b, v15.16b, #8 + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b // AES block 1 - round 3 + ldr q29, 
[x8, #176] // load rk11 + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b // AES block 3 - round 2 + ldr q26, [x8, #128] // load rk8 + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b // AES block 2 - round 3 + add w12, w12, #1 // CTR block 3 + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b // AES block 0 - round 3 + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b // AES block 3 - round 3 + ld1 { v11.16b}, [x3] + ext v11.16b, v11.16b, v11.16b, #8 + rev64 v11.16b, v11.16b + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b // AES block 2 - round 4 + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b // AES block 0 - round 4 + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b // AES block 1 - round 4 + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b // AES block 3 - round 4 + cmp x17, #12 // setup flags for AES-128/192/256 check + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b // AES block 0 - round 5 + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b // AES block 1 - round 5 + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b // AES block 3 - round 5 + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b // AES block 2 - round 5 + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b // AES block 1 - round 6 + trn2 v17.2d, v14.2d, v15.2d // h4l | h3l + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b // AES block 3 - round 6 + ldr q27, [x8, #144] // load rk9 + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b // AES block 0 - round 6 + ldr q12, [x6] // load h1l | h1h + ext v12.16b, v12.16b, v12.16b, #8 + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b // AES block 2 - round 6 + ldr q28, [x8, #160] // load rk10 + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b // AES block 1 - round 7 + trn1 v9.2d, v14.2d, v15.2d // h4h | h3h + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b // AES block 0 - round 7 + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b // AES block 2 - round 7 + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b // AES block 3 - round 7 + trn2 v16.2d, v12.2d, v13.2d // h2l | h1l + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b // AES block 1 - round 8 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b // AES block 2 - round 8 + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b // AES block 3 - round 8 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b // AES block 0 - round 8 + b.lt Lenc_finish_first_blocks // branch if AES-128 + + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 1 - round 9 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 2 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 3 - round 9 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 0 - round 9 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 1 - round 10 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 2 - round 10 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 3 - round 10 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 0 - round 10 + b.eq Lenc_finish_first_blocks // branch if AES-192 + + aese v1.16b, v29.16b + aesmc v1.16b, v1.16b // AES block 1 - round 11 + aese v2.16b, v29.16b + aesmc v2.16b, v2.16b // AES block 2 - round 11 + aese v0.16b, v29.16b + aesmc v0.16b, v0.16b // AES block 0 - round 11 + aese v3.16b, v29.16b + aesmc v3.16b, v3.16b // AES block 3 - round 11 + aese v1.16b, v30.16b + aesmc v1.16b, v1.16b // AES block 1 - round 12 + aese v2.16b, v30.16b + aesmc v2.16b, v2.16b // AES block 2 - round 12 + aese v0.16b, v30.16b + aesmc v0.16b, v0.16b // AES block 0 - round 12 + aese v3.16b, v30.16b + aesmc v3.16b, v3.16b // AES block 3 - round 12 + +Lenc_finish_first_blocks: + cmp x0, x5 // check if we have <= 4 blocks + eor v17.16b, 
v17.16b, v9.16b // h4k | h3k + aese v2.16b, v31.16b // AES block 2 - round N-1 + trn1 v8.2d, v12.2d, v13.2d // h2h | h1h + aese v1.16b, v31.16b // AES block 1 - round N-1 + aese v0.16b, v31.16b // AES block 0 - round N-1 + aese v3.16b, v31.16b // AES block 3 - round N-1 + eor v16.16b, v16.16b, v8.16b // h2k | h1k + b.ge Lenc_tail // handle tail + + ldp x19, x20, [x0, #16] // AES block 1 - load plaintext + rev w9, w12 // CTR block 4 + ldp x6, x7, [x0, #0] // AES block 0 - load plaintext + ldp x23, x24, [x0, #48] // AES block 3 - load plaintext + ldp x21, x22, [x0, #32] // AES block 2 - load plaintext + add x0, x0, #64 // AES input_ptr update + eor x19, x19, x13 // AES block 1 - round N low + eor x20, x20, x14 // AES block 1 - round N high + fmov d5, x19 // AES block 1 - mov low + eor x6, x6, x13 // AES block 0 - round N low + eor x7, x7, x14 // AES block 0 - round N high + eor x24, x24, x14 // AES block 3 - round N high + fmov d4, x6 // AES block 0 - mov low + cmp x0, x5 // check if we have <= 8 blocks + fmov v4.d[1], x7 // AES block 0 - mov high + eor x23, x23, x13 // AES block 3 - round N low + eor x21, x21, x13 // AES block 2 - round N low + fmov v5.d[1], x20 // AES block 1 - mov high + fmov d6, x21 // AES block 2 - mov low + add w12, w12, #1 // CTR block 4 + orr x9, x11, x9, lsl #32 // CTR block 4 + fmov d7, x23 // AES block 3 - mov low + eor x22, x22, x14 // AES block 2 - round N high + fmov v6.d[1], x22 // AES block 2 - mov high + eor v4.16b, v4.16b, v0.16b // AES block 0 - result + fmov d0, x10 // CTR block 4 + fmov v0.d[1], x9 // CTR block 4 + rev w9, w12 // CTR block 5 + add w12, w12, #1 // CTR block 5 + eor v5.16b, v5.16b, v1.16b // AES block 1 - result + fmov d1, x10 // CTR block 5 + orr x9, x11, x9, lsl #32 // CTR block 5 + fmov v1.d[1], x9 // CTR block 5 + rev w9, w12 // CTR block 6 + st1 { v4.16b}, [x2], #16 // AES block 0 - store result + fmov v7.d[1], x24 // AES block 3 - mov high + orr x9, x11, x9, lsl #32 // CTR block 6 + eor v6.16b, v6.16b, v2.16b // AES block 2 - result + st1 { v5.16b}, [x2], #16 // AES block 1 - store result + add w12, w12, #1 // CTR block 6 + fmov d2, x10 // CTR block 6 + fmov v2.d[1], x9 // CTR block 6 + st1 { v6.16b}, [x2], #16 // AES block 2 - store result + rev w9, w12 // CTR block 7 + orr x9, x11, x9, lsl #32 // CTR block 7 + eor v7.16b, v7.16b, v3.16b // AES block 3 - result + st1 { v7.16b}, [x2], #16 // AES block 3 - store result + b.ge Lenc_prepretail // do prepretail + +Lenc_main_loop: // main loop start + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 0 + rev64 v4.16b, v4.16b // GHASH block 4k (only t0 is free) + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 0 + fmov d3, x10 // CTR block 4k+3 + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 0 + ext v11.16b, v11.16b, v11.16b, #8 // PRE 0 + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 1 + fmov v3.d[1], x9 // CTR block 4k+3 + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 1 + ldp x23, x24, [x0, #48] // AES block 4k+7 - load plaintext + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 1 + ldp x21, x22, [x0, #32] // AES block 4k+6 - load plaintext + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 2 + eor v4.16b, v4.16b, v11.16b // PRE 1 + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 2 + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 0 + eor x23, x23, x13 // AES block 4k+7 - round N low + aese 
v0.16b, v21.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 3 + mov d10, v17.d[1] // GHASH block 4k - mid + pmull2 v9.1q, v4.2d, v15.2d // GHASH block 4k - high + eor x22, x22, x14 // AES block 4k+6 - round N high + mov d8, v4.d[1] // GHASH block 4k - mid + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 1 + rev64 v5.16b, v5.16b // GHASH block 4k+1 (t0 and t1 free) + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 4 + pmull v11.1q, v4.1d, v15.1d // GHASH block 4k - low + eor v8.8b, v8.8b, v4.8b // GHASH block 4k - mid + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 2 + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 5 + rev64 v7.16b, v7.16b // GHASH block 4k+3 (t0, t1, t2 and t3 free) + pmull2 v4.1q, v5.2d, v14.2d // GHASH block 4k+1 - high + pmull v10.1q, v8.1d, v10.1d // GHASH block 4k - mid + rev64 v6.16b, v6.16b // GHASH block 4k+2 (t0, t1, and t2 free) + pmull v8.1q, v5.1d, v14.1d // GHASH block 4k+1 - low + eor v9.16b, v9.16b, v4.16b // GHASH block 4k+1 - high + mov d4, v5.d[1] // GHASH block 4k+1 - mid + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 3 + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 2 + eor v11.16b, v11.16b, v8.16b // GHASH block 4k+1 - low + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 3 + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 4 + mov d8, v6.d[1] // GHASH block 4k+2 - mid + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 3 + eor v4.8b, v4.8b, v5.8b // GHASH block 4k+1 - mid + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 4 + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 6 + eor v8.8b, v8.8b, v6.8b // GHASH block 4k+2 - mid + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 4 + pmull v4.1q, v4.1d, v17.1d // GHASH block 4k+1 - mid + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 7 + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 5 + ins v8.d[1], v8.d[0] // GHASH block 4k+2 - mid + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 5 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 8 + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 5 + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 6 + eor v10.16b, v10.16b, v4.16b // GHASH block 4k+1 - mid + pmull2 v4.1q, v6.2d, v13.2d // GHASH block 4k+2 - high + pmull v5.1q, v6.1d, v13.1d // GHASH block 4k+2 - low + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 7 + pmull v6.1q, v7.1d, v12.1d // GHASH block 4k+3 - low + eor v9.16b, v9.16b, v4.16b // GHASH block 4k+2 - high + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 6 + ldp x19, x20, [x0, #16] // AES block 4k+5 - load plaintext + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 8 + mov d4, v7.d[1] // GHASH block 4k+3 - mid + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 6 + eor v11.16b, v11.16b, v5.16b // GHASH block 4k+2 - low + pmull2 v8.1q, v8.2d, v16.2d // GHASH block 4k+2 - mid + pmull2 v5.1q, v7.2d, v12.2d // GHASH block 4k+3 - high + eor v4.8b, v4.8b, v7.8b // GHASH block 4k+3 - mid + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 7 + eor x19, x19, x13 // AES block 4k+5 - round N low + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b // AES block 
4k+6 - round 8 + eor v10.16b, v10.16b, v8.16b // GHASH block 4k+2 - mid + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 7 + eor x21, x21, x13 // AES block 4k+6 - round N low + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 8 + movi v8.8b, #0xc2 + pmull v4.1q, v4.1d, v16.1d // GHASH block 4k+3 - mid + eor v9.16b, v9.16b, v5.16b // GHASH block 4k+3 - high + cmp x17, #12 // setup flags for AES-128/192/256 check + fmov d5, x19 // AES block 4k+5 - mov low + ldp x6, x7, [x0, #0] // AES block 4k+4 - load plaintext + b.lt Lenc_main_loop_continue // branch if AES-128 + + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 9 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 9 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 9 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 10 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 10 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 10 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 10 + b.eq Lenc_main_loop_continue // branch if AES-192 + + aese v0.16b, v29.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 11 + aese v1.16b, v29.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 11 + aese v2.16b, v29.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 11 + aese v3.16b, v29.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 11 + aese v1.16b, v30.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 12 + aese v0.16b, v30.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 12 + aese v2.16b, v30.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 12 + aese v3.16b, v30.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 12 + +Lenc_main_loop_continue: + shl d8, d8, #56 // mod_constant + eor v11.16b, v11.16b, v6.16b // GHASH block 4k+3 - low + eor v10.16b, v10.16b, v4.16b // GHASH block 4k+3 - mid + add w12, w12, #1 // CTR block 4k+3 + eor v4.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up + add x0, x0, #64 // AES input_ptr update + pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid + rev w9, w12 // CTR block 4k+8 + ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment + eor x6, x6, x13 // AES block 4k+4 - round N low + eor v10.16b, v10.16b, v4.16b // MODULO - karatsuba tidy up + eor x7, x7, x14 // AES block 4k+4 - round N high + fmov d4, x6 // AES block 4k+4 - mov low + orr x9, x11, x9, lsl #32 // CTR block 4k+8 + eor v7.16b, v9.16b, v7.16b // MODULO - fold into mid + eor x20, x20, x14 // AES block 4k+5 - round N high + eor x24, x24, x14 // AES block 4k+7 - round N high + add w12, w12, #1 // CTR block 4k+8 + aese v0.16b, v31.16b // AES block 4k+4 - round N-1 + fmov v4.d[1], x7 // AES block 4k+4 - mov high + eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid + fmov d7, x23 // AES block 4k+7 - mov low + aese v1.16b, v31.16b // AES block 4k+5 - round N-1 + fmov v5.d[1], x20 // AES block 4k+5 - mov high + fmov d6, x21 // AES block 4k+6 - mov low + cmp x0, x5 // LOOP CONTROL + fmov v6.d[1], x22 // AES block 4k+6 - mov high + pmull v9.1q, v10.1d, v8.1d // MODULO - mid 64b align with low + eor v4.16b, v4.16b, v0.16b // AES block 4k+4 - result + fmov d0, x10 // CTR block 4k+8 + fmov v0.d[1], x9 // CTR block 4k+8 + rev w9, w12 // CTR block 4k+9 + add w12, w12, #1 // CTR block 4k+9 + eor v5.16b, v5.16b, v1.16b // AES block 4k+5 - result + fmov d1, x10 // CTR block 
4k+9 + orr x9, x11, x9, lsl #32 // CTR block 4k+9 + fmov v1.d[1], x9 // CTR block 4k+9 + aese v2.16b, v31.16b // AES block 4k+6 - round N-1 + rev w9, w12 // CTR block 4k+10 + st1 { v4.16b}, [x2], #16 // AES block 4k+4 - store result + orr x9, x11, x9, lsl #32 // CTR block 4k+10 + eor v11.16b, v11.16b, v9.16b // MODULO - fold into low + fmov v7.d[1], x24 // AES block 4k+7 - mov high + ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment + st1 { v5.16b}, [x2], #16 // AES block 4k+5 - store result + add w12, w12, #1 // CTR block 4k+10 + aese v3.16b, v31.16b // AES block 4k+7 - round N-1 + eor v6.16b, v6.16b, v2.16b // AES block 4k+6 - result + fmov d2, x10 // CTR block 4k+10 + st1 { v6.16b}, [x2], #16 // AES block 4k+6 - store result + fmov v2.d[1], x9 // CTR block 4k+10 + rev w9, w12 // CTR block 4k+11 + eor v11.16b, v11.16b, v10.16b // MODULO - fold into low + orr x9, x11, x9, lsl #32 // CTR block 4k+11 + eor v7.16b, v7.16b, v3.16b // AES block 4k+7 - result + st1 { v7.16b}, [x2], #16 // AES block 4k+7 - store result + b.lt Lenc_main_loop + +Lenc_prepretail: // PREPRETAIL + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 0 + rev64 v6.16b, v6.16b // GHASH block 4k+2 (t0, t1, and t2 free) + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 0 + fmov d3, x10 // CTR block 4k+3 + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 0 + rev64 v4.16b, v4.16b // GHASH block 4k (only t0 is free) + fmov v3.d[1], x9 // CTR block 4k+3 + ext v11.16b, v11.16b, v11.16b, #8 // PRE 0 + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 1 + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 1 + eor v4.16b, v4.16b, v11.16b // PRE 1 + rev64 v5.16b, v5.16b // GHASH block 4k+1 (t0 and t1 free) + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 2 + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 0 + mov d10, v17.d[1] // GHASH block 4k - mid + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 1 + pmull v11.1q, v4.1d, v15.1d // GHASH block 4k - low + mov d8, v4.d[1] // GHASH block 4k - mid + pmull2 v9.1q, v4.2d, v15.2d // GHASH block 4k - high + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 3 + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 2 + eor v8.8b, v8.8b, v4.8b // GHASH block 4k - mid + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 2 + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 1 + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 3 + pmull v10.1q, v8.1d, v10.1d // GHASH block 4k - mid + pmull2 v4.1q, v5.2d, v14.2d // GHASH block 4k+1 - high + pmull v8.1q, v5.1d, v14.1d // GHASH block 4k+1 - low + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 2 + eor v9.16b, v9.16b, v4.16b // GHASH block 4k+1 - high + mov d4, v5.d[1] // GHASH block 4k+1 - mid + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 3 + eor v11.16b, v11.16b, v8.16b // GHASH block 4k+1 - low + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 3 + eor v4.8b, v4.8b, v5.8b // GHASH block 4k+1 - mid + mov d8, v6.d[1] // GHASH block 4k+2 - mid + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 4 + rev64 v7.16b, v7.16b // GHASH block 4k+3 (t0, t1, t2 and t3 free) + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 4 + pmull v4.1q, v4.1d, v17.1d // GHASH block 4k+1 - 
mid + eor v8.8b, v8.8b, v6.8b // GHASH block 4k+2 - mid + add w12, w12, #1 // CTR block 4k+3 + pmull v5.1q, v6.1d, v13.1d // GHASH block 4k+2 - low + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 5 + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 4 + eor v10.16b, v10.16b, v4.16b // GHASH block 4k+1 - mid + pmull2 v4.1q, v6.2d, v13.2d // GHASH block 4k+2 - high + eor v11.16b, v11.16b, v5.16b // GHASH block 4k+2 - low + ins v8.d[1], v8.d[0] // GHASH block 4k+2 - mid + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 5 + eor v9.16b, v9.16b, v4.16b // GHASH block 4k+2 - high + mov d4, v7.d[1] // GHASH block 4k+3 - mid + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 4 + pmull2 v8.1q, v8.2d, v16.2d // GHASH block 4k+2 - mid + eor v4.8b, v4.8b, v7.8b // GHASH block 4k+3 - mid + pmull2 v5.1q, v7.2d, v12.2d // GHASH block 4k+3 - high + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 5 + pmull v4.1q, v4.1d, v16.1d // GHASH block 4k+3 - mid + eor v10.16b, v10.16b, v8.16b // GHASH block 4k+2 - mid + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 5 + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 6 + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 6 + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 6 + movi v8.8b, #0xc2 + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 6 + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 7 + eor v9.16b, v9.16b, v5.16b // GHASH block 4k+3 - high + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 7 + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 7 + shl d8, d8, #56 // mod_constant + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 8 + eor v10.16b, v10.16b, v4.16b // GHASH block 4k+3 - mid + pmull v6.1q, v7.1d, v12.1d // GHASH block 4k+3 - low + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 8 + cmp x17, #12 // setup flags for AES-128/192/256 check + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 8 + eor v11.16b, v11.16b, v6.16b // GHASH block 4k+3 - low + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 7 + eor v10.16b, v10.16b, v9.16b // karatsuba tidy up + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 8 + pmull v4.1q, v9.1d, v8.1d + ext v9.16b, v9.16b, v9.16b, #8 + eor v10.16b, v10.16b, v11.16b + b.lt Lenc_finish_prepretail // branch if AES-128 + + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 9 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 9 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 9 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 10 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 10 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 10 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 10 + b.eq Lenc_finish_prepretail // branch if AES-192 + + aese v1.16b, v29.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 11 + aese v0.16b, v29.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 11 + aese v3.16b, v29.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 11 + aese v2.16b, v29.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - 
round 11 + aese v1.16b, v30.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 12 + aese v0.16b, v30.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 12 + aese v3.16b, v30.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 12 + aese v2.16b, v30.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 12 + +Lenc_finish_prepretail: + eor v10.16b, v10.16b, v4.16b + eor v10.16b, v10.16b, v9.16b + pmull v4.1q, v10.1d, v8.1d + ext v10.16b, v10.16b, v10.16b, #8 + aese v1.16b, v31.16b // AES block 4k+5 - round N-1 + eor v11.16b, v11.16b, v4.16b + aese v3.16b, v31.16b // AES block 4k+7 - round N-1 + aese v0.16b, v31.16b // AES block 4k+4 - round N-1 + aese v2.16b, v31.16b // AES block 4k+6 - round N-1 + eor v11.16b, v11.16b, v10.16b + +Lenc_tail: // TAIL + ext v8.16b, v11.16b, v11.16b, #8 // prepare final partial tag + sub x5, x4, x0 // main_end_input_ptr is number of bytes left to process + ldp x6, x7, [x0], #16 // AES block 4k+4 - load plaintext + eor x6, x6, x13 // AES block 4k+4 - round N low + eor x7, x7, x14 // AES block 4k+4 - round N high + cmp x5, #48 + fmov d4, x6 // AES block 4k+4 - mov low + fmov v4.d[1], x7 // AES block 4k+4 - mov high + eor v5.16b, v4.16b, v0.16b // AES block 4k+4 - result + b.gt Lenc_blocks_more_than_3 + cmp x5, #32 + mov v3.16b, v2.16b + movi v11.8b, #0 + movi v9.8b, #0 + sub w12, w12, #1 + mov v2.16b, v1.16b + movi v10.8b, #0 + b.gt Lenc_blocks_more_than_2 + mov v3.16b, v1.16b + sub w12, w12, #1 + cmp x5, #16 + b.gt Lenc_blocks_more_than_1 + sub w12, w12, #1 + b Lenc_blocks_less_than_1 +Lenc_blocks_more_than_3: // blocks left > 3 + st1 { v5.16b}, [x2], #16 // AES final-3 block - store result + ldp x6, x7, [x0], #16 // AES final-2 block - load input low & high + rev64 v4.16b, v5.16b // GHASH final-3 block + eor x6, x6, x13 // AES final-2 block - round N low + eor v4.16b, v4.16b, v8.16b // feed in partial tag + eor x7, x7, x14 // AES final-2 block - round N high + mov d22, v4.d[1] // GHASH final-3 block - mid + fmov d5, x6 // AES final-2 block - mov low + fmov v5.d[1], x7 // AES final-2 block - mov high + eor v22.8b, v22.8b, v4.8b // GHASH final-3 block - mid + movi v8.8b, #0 // suppress further partial tag feed in + mov d10, v17.d[1] // GHASH final-3 block - mid + pmull v11.1q, v4.1d, v15.1d // GHASH final-3 block - low + pmull2 v9.1q, v4.2d, v15.2d // GHASH final-3 block - high + pmull v10.1q, v22.1d, v10.1d // GHASH final-3 block - mid + eor v5.16b, v5.16b, v1.16b // AES final-2 block - result +Lenc_blocks_more_than_2: // blocks left > 2 + st1 { v5.16b}, [x2], #16 // AES final-2 block - store result + ldp x6, x7, [x0], #16 // AES final-1 block - load input low & high + rev64 v4.16b, v5.16b // GHASH final-2 block + eor x6, x6, x13 // AES final-1 block - round N low + eor v4.16b, v4.16b, v8.16b // feed in partial tag + fmov d5, x6 // AES final-1 block - mov low + eor x7, x7, x14 // AES final-1 block - round N high + fmov v5.d[1], x7 // AES final-1 block - mov high + movi v8.8b, #0 // suppress further partial tag feed in + pmull2 v20.1q, v4.2d, v14.2d // GHASH final-2 block - high + mov d22, v4.d[1] // GHASH final-2 block - mid + pmull v21.1q, v4.1d, v14.1d // GHASH final-2 block - low + eor v22.8b, v22.8b, v4.8b // GHASH final-2 block - mid + eor v5.16b, v5.16b, v2.16b // AES final-1 block - result + eor v9.16b, v9.16b, v20.16b // GHASH final-2 block - high + pmull v22.1q, v22.1d, v17.1d // GHASH final-2 block - mid + eor v11.16b, v11.16b, v21.16b // GHASH final-2 block - low + eor v10.16b, v10.16b, v22.16b // GHASH final-2 block - mid +Lenc_blocks_more_than_1: 
// blocks left > 1 + st1 { v5.16b}, [x2], #16 // AES final-1 block - store result + rev64 v4.16b, v5.16b // GHASH final-1 block + ldp x6, x7, [x0], #16 // AES final block - load input low & high + eor v4.16b, v4.16b, v8.16b // feed in partial tag + movi v8.8b, #0 // suppress further partial tag feed in + eor x6, x6, x13 // AES final block - round N low + mov d22, v4.d[1] // GHASH final-1 block - mid + pmull2 v20.1q, v4.2d, v13.2d // GHASH final-1 block - high + eor x7, x7, x14 // AES final block - round N high + eor v22.8b, v22.8b, v4.8b // GHASH final-1 block - mid + eor v9.16b, v9.16b, v20.16b // GHASH final-1 block - high + ins v22.d[1], v22.d[0] // GHASH final-1 block - mid + fmov d5, x6 // AES final block - mov low + fmov v5.d[1], x7 // AES final block - mov high + pmull2 v22.1q, v22.2d, v16.2d // GHASH final-1 block - mid + pmull v21.1q, v4.1d, v13.1d // GHASH final-1 block - low + eor v5.16b, v5.16b, v3.16b // AES final block - result + eor v10.16b, v10.16b, v22.16b // GHASH final-1 block - mid + eor v11.16b, v11.16b, v21.16b // GHASH final-1 block - low +Lenc_blocks_less_than_1: // blocks left <= 1 + and x1, x1, #127 // bit_length %= 128 + mvn x13, xzr // rkN_l = 0xffffffffffffffff + sub x1, x1, #128 // bit_length -= 128 + neg x1, x1 // bit_length = 128 - #bits in input (in range [1,128]) + ld1 { v18.16b}, [x2] // load existing bytes where the possibly partial last block is to be stored + mvn x14, xzr // rkN_h = 0xffffffffffffffff + and x1, x1, #127 // bit_length %= 128 + lsr x14, x14, x1 // rkN_h is mask for top 64b of last block + cmp x1, #64 + csel x6, x13, x14, lt + csel x7, x14, xzr, lt + fmov d0, x6 // ctr0b is mask for last block + fmov v0.d[1], x7 + and v5.16b, v5.16b, v0.16b // possibly partial last block has zeroes in highest bits + rev64 v4.16b, v5.16b // GHASH final block + eor v4.16b, v4.16b, v8.16b // feed in partial tag + bif v5.16b, v18.16b, v0.16b // insert existing bytes in top end of result before storing + pmull2 v20.1q, v4.2d, v12.2d // GHASH final block - high + mov d8, v4.d[1] // GHASH final block - mid + rev w9, w12 + pmull v21.1q, v4.1d, v12.1d // GHASH final block - low + eor v9.16b, v9.16b, v20.16b // GHASH final block - high + eor v8.8b, v8.8b, v4.8b // GHASH final block - mid + pmull v8.1q, v8.1d, v16.1d // GHASH final block - mid + eor v11.16b, v11.16b, v21.16b // GHASH final block - low + eor v10.16b, v10.16b, v8.16b // GHASH final block - mid + movi v8.8b, #0xc2 + eor v4.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up + shl d8, d8, #56 // mod_constant + eor v10.16b, v10.16b, v4.16b // MODULO - karatsuba tidy up + pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid + ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment + eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid + eor v10.16b, v10.16b, v9.16b // MODULO - fold into mid + pmull v9.1q, v10.1d, v8.1d // MODULO - mid 64b align with low + ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment + str w9, [x16, #12] // store the updated counter + st1 { v5.16b}, [x2] // store all 16B + eor v11.16b, v11.16b, v9.16b // MODULO - fold into low + eor v11.16b, v11.16b, v10.16b // MODULO - fold into low + ext v11.16b, v11.16b, v11.16b, #8 + rev64 v11.16b, v11.16b + mov x0, x15 + st1 { v11.16b }, [x3] + ldp x19, x20, [sp, #16] + ldp x21, x22, [sp, #32] + ldp x23, x24, [sp, #48] + ldp d8, d9, [sp, #64] + ldp d10, d11, [sp, #80] + ldp d12, d13, [sp, #96] + ldp d14, d15, [sp, #112] + ldp x29, x30, [sp], #128 + AARCH64_VALIDATE_LINK_REGISTER + ret + +.globl _aes_gcm_dec_kernel 
+.private_extern _aes_gcm_dec_kernel + +.align 4 +_aes_gcm_dec_kernel: + AARCH64_SIGN_LINK_REGISTER + stp x29, x30, [sp, #-128]! + mov x29, sp + stp x19, x20, [sp, #16] + mov x16, x4 + mov x8, x5 + stp x21, x22, [sp, #32] + stp x23, x24, [sp, #48] + stp d8, d9, [sp, #64] + stp d10, d11, [sp, #80] + stp d12, d13, [sp, #96] + stp d14, d15, [sp, #112] + ldr w17, [x8, #240] + add x19, x8, x17, lsl #4 // borrow input_l1 for last key + ldp x13, x14, [x19] // load round N keys + ldr q31, [x19, #-16] // load round N-1 keys + lsr x5, x1, #3 // byte_len + mov x15, x5 + ldp x10, x11, [x16] // ctr96_b64, ctr96_t32 + ldr q26, [x8, #128] // load rk8 + sub x5, x5, #1 // byte_len - 1 + ldr q25, [x8, #112] // load rk7 + and x5, x5, #0xffffffffffffffc0 // number of bytes to be processed in main loop (at least 1 byte must be handled by tail) + add x4, x0, x1, lsr #3 // end_input_ptr + ldr q24, [x8, #96] // load rk6 + lsr x12, x11, #32 + ldr q23, [x8, #80] // load rk5 + orr w11, w11, w11 + ldr q21, [x8, #48] // load rk3 + add x5, x5, x0 + rev w12, w12 // rev_ctr32 + add w12, w12, #1 // increment rev_ctr32 + fmov d3, x10 // CTR block 3 + rev w9, w12 // CTR block 1 + add w12, w12, #1 // CTR block 1 + fmov d1, x10 // CTR block 1 + orr x9, x11, x9, lsl #32 // CTR block 1 + ld1 { v0.16b}, [x16] // special case vector load initial counter so we can start first AES block as quickly as possible + fmov v1.d[1], x9 // CTR block 1 + rev w9, w12 // CTR block 2 + add w12, w12, #1 // CTR block 2 + fmov d2, x10 // CTR block 2 + orr x9, x11, x9, lsl #32 // CTR block 2 + fmov v2.d[1], x9 // CTR block 2 + rev w9, w12 // CTR block 3 + orr x9, x11, x9, lsl #32 // CTR block 3 + ldr q18, [x8, #0] // load rk0 + fmov v3.d[1], x9 // CTR block 3 + add w12, w12, #1 // CTR block 3 + ldr q22, [x8, #64] // load rk4 + ldr q19, [x8, #16] // load rk1 + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b // AES block 0 - round 0 + ldr q14, [x6, #48] // load h3l | h3h + ext v14.16b, v14.16b, v14.16b, #8 + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b // AES block 3 - round 0 + ldr q15, [x6, #80] // load h4l | h4h + ext v15.16b, v15.16b, v15.16b, #8 + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b // AES block 1 - round 0 + ldr q13, [x6, #32] // load h2l | h2h + ext v13.16b, v13.16b, v13.16b, #8 + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b // AES block 2 - round 0 + ldr q20, [x8, #32] // load rk2 + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b // AES block 0 - round 1 + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b // AES block 1 - round 1 + ld1 { v11.16b}, [x3] + ext v11.16b, v11.16b, v11.16b, #8 + rev64 v11.16b, v11.16b + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b // AES block 2 - round 1 + ldr q27, [x8, #144] // load rk9 + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b // AES block 3 - round 1 + ldr q30, [x8, #192] // load rk12 + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b // AES block 0 - round 2 + ldr q12, [x6] // load h1l | h1h + ext v12.16b, v12.16b, v12.16b, #8 + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b // AES block 2 - round 2 + ldr q28, [x8, #160] // load rk10 + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b // AES block 3 - round 2 + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b // AES block 0 - round 3 + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b // AES block 1 - round 2 + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b // AES block 3 - round 3 + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b // AES block 0 - round 4 + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b // AES block 2 - round 3 + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b // AES block 1 - round 3 + 
aese v3.16b, v22.16b + aesmc v3.16b, v3.16b // AES block 3 - round 4 + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b // AES block 2 - round 4 + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b // AES block 1 - round 4 + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b // AES block 3 - round 5 + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b // AES block 0 - round 5 + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b // AES block 1 - round 5 + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b // AES block 2 - round 5 + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b // AES block 0 - round 6 + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b // AES block 3 - round 6 + cmp x17, #12 // setup flags for AES-128/192/256 check + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b // AES block 1 - round 6 + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b // AES block 2 - round 6 + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b // AES block 0 - round 7 + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b // AES block 1 - round 7 + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b // AES block 3 - round 7 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b // AES block 0 - round 8 + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b // AES block 2 - round 7 + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b // AES block 3 - round 8 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b // AES block 1 - round 8 + ldr q29, [x8, #176] // load rk11 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b // AES block 2 - round 8 + b.lt Ldec_finish_first_blocks // branch if AES-128 + + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 0 - round 9 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 1 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 3 - round 9 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 2 - round 9 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 0 - round 10 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 1 - round 10 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 3 - round 10 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 2 - round 10 + b.eq Ldec_finish_first_blocks // branch if AES-192 + + aese v0.16b, v29.16b + aesmc v0.16b, v0.16b // AES block 0 - round 11 + aese v3.16b, v29.16b + aesmc v3.16b, v3.16b // AES block 3 - round 11 + aese v1.16b, v29.16b + aesmc v1.16b, v1.16b // AES block 1 - round 11 + aese v2.16b, v29.16b + aesmc v2.16b, v2.16b // AES block 2 - round 11 + aese v1.16b, v30.16b + aesmc v1.16b, v1.16b // AES block 1 - round 12 + aese v0.16b, v30.16b + aesmc v0.16b, v0.16b // AES block 0 - round 12 + aese v2.16b, v30.16b + aesmc v2.16b, v2.16b // AES block 2 - round 12 + aese v3.16b, v30.16b + aesmc v3.16b, v3.16b // AES block 3 - round 12 + +Ldec_finish_first_blocks: + cmp x0, x5 // check if we have <= 4 blocks + trn1 v9.2d, v14.2d, v15.2d // h4h | h3h + trn2 v17.2d, v14.2d, v15.2d // h4l | h3l + trn1 v8.2d, v12.2d, v13.2d // h2h | h1h + trn2 v16.2d, v12.2d, v13.2d // h2l | h1l + eor v17.16b, v17.16b, v9.16b // h4k | h3k + aese v1.16b, v31.16b // AES block 1 - round N-1 + aese v2.16b, v31.16b // AES block 2 - round N-1 + eor v16.16b, v16.16b, v8.16b // h2k | h1k + aese v3.16b, v31.16b // AES block 3 - round N-1 + aese v0.16b, v31.16b // AES block 0 - round N-1 + b.ge Ldec_tail // handle tail + + ldr q4, [x0, #0] // AES block 0 - load ciphertext + ldr q5, [x0, #16] // AES block 1 - load ciphertext + rev w9, w12 // CTR block 4 + eor v0.16b, v4.16b, v0.16b // AES block 0 - result + eor v1.16b, v5.16b, v1.16b // AES block 1 - result + rev64 v5.16b, v5.16b // GHASH block 1 + ldr q7, 
[x0, #48] // AES block 3 - load ciphertext + mov x7, v0.d[1] // AES block 0 - mov high + mov x6, v0.d[0] // AES block 0 - mov low + rev64 v4.16b, v4.16b // GHASH block 0 + add w12, w12, #1 // CTR block 4 + fmov d0, x10 // CTR block 4 + orr x9, x11, x9, lsl #32 // CTR block 4 + fmov v0.d[1], x9 // CTR block 4 + rev w9, w12 // CTR block 5 + add w12, w12, #1 // CTR block 5 + mov x19, v1.d[0] // AES block 1 - mov low + orr x9, x11, x9, lsl #32 // CTR block 5 + mov x20, v1.d[1] // AES block 1 - mov high + eor x7, x7, x14 // AES block 0 - round N high + eor x6, x6, x13 // AES block 0 - round N low + stp x6, x7, [x2], #16 // AES block 0 - store result + fmov d1, x10 // CTR block 5 + ldr q6, [x0, #32] // AES block 2 - load ciphertext + add x0, x0, #64 // AES input_ptr update + fmov v1.d[1], x9 // CTR block 5 + rev w9, w12 // CTR block 6 + add w12, w12, #1 // CTR block 6 + eor x19, x19, x13 // AES block 1 - round N low + orr x9, x11, x9, lsl #32 // CTR block 6 + eor x20, x20, x14 // AES block 1 - round N high + stp x19, x20, [x2], #16 // AES block 1 - store result + eor v2.16b, v6.16b, v2.16b // AES block 2 - result + cmp x0, x5 // check if we have <= 8 blocks + b.ge Ldec_prepretail // do prepretail + +Ldec_main_loop: // main loop start + mov x21, v2.d[0] // AES block 4k+2 - mov low + ext v11.16b, v11.16b, v11.16b, #8 // PRE 0 + eor v3.16b, v7.16b, v3.16b // AES block 4k+3 - result + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 0 + mov x22, v2.d[1] // AES block 4k+2 - mov high + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 0 + fmov d2, x10 // CTR block 4k+6 + fmov v2.d[1], x9 // CTR block 4k+6 + eor v4.16b, v4.16b, v11.16b // PRE 1 + rev w9, w12 // CTR block 4k+7 + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 1 + mov x24, v3.d[1] // AES block 4k+3 - mov high + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 1 + mov x23, v3.d[0] // AES block 4k+3 - mov low + pmull2 v9.1q, v4.2d, v15.2d // GHASH block 4k - high + mov d8, v4.d[1] // GHASH block 4k - mid + fmov d3, x10 // CTR block 4k+7 + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 2 + orr x9, x11, x9, lsl #32 // CTR block 4k+7 + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 0 + fmov v3.d[1], x9 // CTR block 4k+7 + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 2 + eor v8.8b, v8.8b, v4.8b // GHASH block 4k - mid + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 3 + eor x22, x22, x14 // AES block 4k+2 - round N high + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 1 + mov d10, v17.d[1] // GHASH block 4k - mid + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 3 + rev64 v6.16b, v6.16b // GHASH block 4k+2 + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 0 + eor x21, x21, x13 // AES block 4k+2 - round N low + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 2 + stp x21, x22, [x2], #16 // AES block 4k+2 - store result + pmull v11.1q, v4.1d, v15.1d // GHASH block 4k - low + pmull2 v4.1q, v5.2d, v14.2d // GHASH block 4k+1 - high + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 3 + rev64 v7.16b, v7.16b // GHASH block 4k+3 + pmull v10.1q, v8.1d, v10.1d // GHASH block 4k - mid + eor x23, x23, x13 // AES block 4k+3 - round N low + pmull v8.1q, v5.1d, v14.1d // GHASH block 4k+1 - low + eor x24, x24, x14 // AES block 4k+3 - round N high + eor v9.16b, v9.16b, v4.16b // 
GHASH block 4k+1 - high + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 4 + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 1 + mov d4, v5.d[1] // GHASH block 4k+1 - mid + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 4 + eor v11.16b, v11.16b, v8.16b // GHASH block 4k+1 - low + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 5 + add w12, w12, #1 // CTR block 4k+7 + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 2 + mov d8, v6.d[1] // GHASH block 4k+2 - mid + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 4 + eor v4.8b, v4.8b, v5.8b // GHASH block 4k+1 - mid + pmull v5.1q, v6.1d, v13.1d // GHASH block 4k+2 - low + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 3 + eor v8.8b, v8.8b, v6.8b // GHASH block 4k+2 - mid + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 5 + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 5 + eor v11.16b, v11.16b, v5.16b // GHASH block 4k+2 - low + pmull v4.1q, v4.1d, v17.1d // GHASH block 4k+1 - mid + rev w9, w12 // CTR block 4k+8 + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 6 + ins v8.d[1], v8.d[0] // GHASH block 4k+2 - mid + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 6 + add w12, w12, #1 // CTR block 4k+8 + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 4 + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 7 + eor v10.16b, v10.16b, v4.16b // GHASH block 4k+1 - mid + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 7 + pmull2 v4.1q, v6.2d, v13.2d // GHASH block 4k+2 - high + mov d6, v7.d[1] // GHASH block 4k+3 - mid + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 5 + pmull2 v8.1q, v8.2d, v16.2d // GHASH block 4k+2 - mid + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 8 + eor v9.16b, v9.16b, v4.16b // GHASH block 4k+2 - high + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 6 + pmull v4.1q, v7.1d, v12.1d // GHASH block 4k+3 - low + orr x9, x11, x9, lsl #32 // CTR block 4k+8 + eor v10.16b, v10.16b, v8.16b // GHASH block 4k+2 - mid + pmull2 v5.1q, v7.2d, v12.2d // GHASH block 4k+3 - high + cmp x17, #12 // setup flags for AES-128/192/256 check + eor v6.8b, v6.8b, v7.8b // GHASH block 4k+3 - mid + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 8 + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 6 + eor v9.16b, v9.16b, v5.16b // GHASH block 4k+3 - high + pmull v6.1q, v6.1d, v16.1d // GHASH block 4k+3 - mid + movi v8.8b, #0xc2 + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 7 + eor v11.16b, v11.16b, v4.16b // GHASH block 4k+3 - low + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 7 + shl d8, d8, #56 // mod_constant + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 8 + eor v10.16b, v10.16b, v6.16b // GHASH block 4k+3 - mid + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 8 + b.lt Ldec_main_loop_continue // branch if AES-128 + + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 9 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 9 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 9 + aese v0.16b, v28.16b + aesmc 
v0.16b, v0.16b // AES block 4k+4 - round 10 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 10 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 10 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 10 + b.eq Ldec_main_loop_continue // branch if AES-192 + + aese v0.16b, v29.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 11 + aese v1.16b, v29.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 11 + aese v2.16b, v29.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 11 + aese v3.16b, v29.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 11 + aese v0.16b, v30.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 12 + aese v1.16b, v30.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 12 + aese v2.16b, v30.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 12 + aese v3.16b, v30.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 12 + +Ldec_main_loop_continue: + pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid + eor v6.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up + ldr q4, [x0, #0] // AES block 4k+4 - load ciphertext + aese v0.16b, v31.16b // AES block 4k+4 - round N-1 + ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment + eor v10.16b, v10.16b, v6.16b // MODULO - karatsuba tidy up + ldr q5, [x0, #16] // AES block 4k+5 - load ciphertext + eor v0.16b, v4.16b, v0.16b // AES block 4k+4 - result + stp x23, x24, [x2], #16 // AES block 4k+3 - store result + eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid + ldr q7, [x0, #48] // AES block 4k+7 - load ciphertext + ldr q6, [x0, #32] // AES block 4k+6 - load ciphertext + mov x7, v0.d[1] // AES block 4k+4 - mov high + eor v10.16b, v10.16b, v9.16b // MODULO - fold into mid + aese v1.16b, v31.16b // AES block 4k+5 - round N-1 + add x0, x0, #64 // AES input_ptr update + mov x6, v0.d[0] // AES block 4k+4 - mov low + fmov d0, x10 // CTR block 4k+8 + fmov v0.d[1], x9 // CTR block 4k+8 + pmull v8.1q, v10.1d, v8.1d // MODULO - mid 64b align with low + eor v1.16b, v5.16b, v1.16b // AES block 4k+5 - result + rev w9, w12 // CTR block 4k+9 + aese v2.16b, v31.16b // AES block 4k+6 - round N-1 + orr x9, x11, x9, lsl #32 // CTR block 4k+9 + cmp x0, x5 // LOOP CONTROL + add w12, w12, #1 // CTR block 4k+9 + eor x6, x6, x13 // AES block 4k+4 - round N low + eor x7, x7, x14 // AES block 4k+4 - round N high + mov x20, v1.d[1] // AES block 4k+5 - mov high + eor v2.16b, v6.16b, v2.16b // AES block 4k+6 - result + eor v11.16b, v11.16b, v8.16b // MODULO - fold into low + mov x19, v1.d[0] // AES block 4k+5 - mov low + fmov d1, x10 // CTR block 4k+9 + ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment + fmov v1.d[1], x9 // CTR block 4k+9 + rev w9, w12 // CTR block 4k+10 + add w12, w12, #1 // CTR block 4k+10 + aese v3.16b, v31.16b // AES block 4k+7 - round N-1 + orr x9, x11, x9, lsl #32 // CTR block 4k+10 + rev64 v5.16b, v5.16b // GHASH block 4k+5 + eor x20, x20, x14 // AES block 4k+5 - round N high + stp x6, x7, [x2], #16 // AES block 4k+4 - store result + eor x19, x19, x13 // AES block 4k+5 - round N low + stp x19, x20, [x2], #16 // AES block 4k+5 - store result + rev64 v4.16b, v4.16b // GHASH block 4k+4 + eor v11.16b, v11.16b, v10.16b // MODULO - fold into low + b.lt Ldec_main_loop + +Ldec_prepretail: // PREPRETAIL + ext v11.16b, v11.16b, v11.16b, #8 // PRE 0 + mov x21, v2.d[0] // AES block 4k+2 - mov low + eor v3.16b, v7.16b, v3.16b // AES block 4k+3 - result + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 0 + mov x22, 
v2.d[1] // AES block 4k+2 - mov high + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 0 + fmov d2, x10 // CTR block 4k+6 + fmov v2.d[1], x9 // CTR block 4k+6 + rev w9, w12 // CTR block 4k+7 + eor v4.16b, v4.16b, v11.16b // PRE 1 + rev64 v6.16b, v6.16b // GHASH block 4k+2 + orr x9, x11, x9, lsl #32 // CTR block 4k+7 + mov x23, v3.d[0] // AES block 4k+3 - mov low + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 1 + mov x24, v3.d[1] // AES block 4k+3 - mov high + pmull v11.1q, v4.1d, v15.1d // GHASH block 4k - low + mov d8, v4.d[1] // GHASH block 4k - mid + fmov d3, x10 // CTR block 4k+7 + pmull2 v9.1q, v4.2d, v15.2d // GHASH block 4k - high + fmov v3.d[1], x9 // CTR block 4k+7 + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 0 + mov d10, v17.d[1] // GHASH block 4k - mid + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 1 + eor v8.8b, v8.8b, v4.8b // GHASH block 4k - mid + pmull2 v4.1q, v5.2d, v14.2d // GHASH block 4k+1 - high + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 1 + rev64 v7.16b, v7.16b // GHASH block 4k+3 + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 0 + pmull v10.1q, v8.1d, v10.1d // GHASH block 4k - mid + eor v9.16b, v9.16b, v4.16b // GHASH block 4k+1 - high + pmull v8.1q, v5.1d, v14.1d // GHASH block 4k+1 - low + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 1 + mov d4, v5.d[1] // GHASH block 4k+1 - mid + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 2 + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 2 + eor v11.16b, v11.16b, v8.16b // GHASH block 4k+1 - low + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 2 + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 3 + mov d8, v6.d[1] // GHASH block 4k+2 - mid + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 2 + eor v4.8b, v4.8b, v5.8b // GHASH block 4k+1 - mid + pmull v5.1q, v6.1d, v13.1d // GHASH block 4k+2 - low + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 4 + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 3 + eor v8.8b, v8.8b, v6.8b // GHASH block 4k+2 - mid + pmull v4.1q, v4.1d, v17.1d // GHASH block 4k+1 - mid + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 5 + eor v11.16b, v11.16b, v5.16b // GHASH block 4k+2 - low + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 4 + pmull2 v5.1q, v7.2d, v12.2d // GHASH block 4k+3 - high + eor v10.16b, v10.16b, v4.16b // GHASH block 4k+1 - mid + pmull2 v4.1q, v6.2d, v13.2d // GHASH block 4k+2 - high + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 5 + ins v8.d[1], v8.d[0] // GHASH block 4k+2 - mid + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 3 + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 3 + eor v9.16b, v9.16b, v4.16b // GHASH block 4k+2 - high + pmull v4.1q, v7.1d, v12.1d // GHASH block 4k+3 - low + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 4 + mov d6, v7.d[1] // GHASH block 4k+3 - mid + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 4 + pmull2 v8.1q, v8.2d, v16.2d // GHASH block 4k+2 - mid + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 5 + eor v6.8b, v6.8b, v7.8b // GHASH block 4k+3 - mid + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 5 + aese 
v3.16b, v24.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 6 + eor v10.16b, v10.16b, v8.16b // GHASH block 4k+2 - mid + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 6 + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 6 + movi v8.8b, #0xc2 + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 6 + eor v11.16b, v11.16b, v4.16b // GHASH block 4k+3 - low + pmull v6.1q, v6.1d, v16.1d // GHASH block 4k+3 - mid + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 7 + cmp x17, #12 // setup flags for AES-128/192/256 check + eor v9.16b, v9.16b, v5.16b // GHASH block 4k+3 - high + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 7 + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 7 + eor v10.16b, v10.16b, v6.16b // GHASH block 4k+3 - mid + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 8 + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 7 + eor v6.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 8 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 8 + shl d8, d8, #56 // mod_constant + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 8 + b.lt Ldec_finish_prepretail // branch if AES-128 + + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 9 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 9 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 9 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 10 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 10 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 10 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 10 + b.eq Ldec_finish_prepretail // branch if AES-192 + + aese v2.16b, v29.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 11 + aese v0.16b, v29.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 11 + aese v1.16b, v29.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 11 + aese v2.16b, v30.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 12 + aese v3.16b, v29.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 11 + aese v1.16b, v30.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 12 + aese v0.16b, v30.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 12 + aese v3.16b, v30.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 12 + +Ldec_finish_prepretail: + eor v10.16b, v10.16b, v6.16b // MODULO - karatsuba tidy up + pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid + ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment + eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid + eor x22, x22, x14 // AES block 4k+2 - round N high + eor x23, x23, x13 // AES block 4k+3 - round N low + eor v10.16b, v10.16b, v9.16b // MODULO - fold into mid + add w12, w12, #1 // CTR block 4k+7 + eor x21, x21, x13 // AES block 4k+2 - round N low + pmull v8.1q, v10.1d, v8.1d // MODULO - mid 64b align with low + eor x24, x24, x14 // AES block 4k+3 - round N high + stp x21, x22, [x2], #16 // AES block 4k+2 - store result + ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment + stp x23, x24, [x2], #16 // AES block 4k+3 - store result + + eor v11.16b, v11.16b, v8.16b // MODULO - fold into low + aese v1.16b, v31.16b // AES 
block 4k+5 - round N-1 + aese v0.16b, v31.16b // AES block 4k+4 - round N-1 + aese v3.16b, v31.16b // AES block 4k+7 - round N-1 + aese v2.16b, v31.16b // AES block 4k+6 - round N-1 + eor v11.16b, v11.16b, v10.16b // MODULO - fold into low + +Ldec_tail: // TAIL + sub x5, x4, x0 // main_end_input_ptr is number of bytes left to process + ld1 { v5.16b}, [x0], #16 // AES block 4k+4 - load ciphertext + eor v0.16b, v5.16b, v0.16b // AES block 4k+4 - result + mov x6, v0.d[0] // AES block 4k+4 - mov low + mov x7, v0.d[1] // AES block 4k+4 - mov high + ext v8.16b, v11.16b, v11.16b, #8 // prepare final partial tag + cmp x5, #48 + eor x6, x6, x13 // AES block 4k+4 - round N low + eor x7, x7, x14 // AES block 4k+4 - round N high + b.gt Ldec_blocks_more_than_3 + sub w12, w12, #1 + mov v3.16b, v2.16b + movi v10.8b, #0 + movi v11.8b, #0 + cmp x5, #32 + movi v9.8b, #0 + mov v2.16b, v1.16b + b.gt Ldec_blocks_more_than_2 + sub w12, w12, #1 + mov v3.16b, v1.16b + cmp x5, #16 + b.gt Ldec_blocks_more_than_1 + sub w12, w12, #1 + b Ldec_blocks_less_than_1 +Ldec_blocks_more_than_3: // blocks left > 3 + rev64 v4.16b, v5.16b // GHASH final-3 block + ld1 { v5.16b}, [x0], #16 // AES final-2 block - load ciphertext + stp x6, x7, [x2], #16 // AES final-3 block - store result + mov d10, v17.d[1] // GHASH final-3 block - mid + eor v4.16b, v4.16b, v8.16b // feed in partial tag + eor v0.16b, v5.16b, v1.16b // AES final-2 block - result + mov d22, v4.d[1] // GHASH final-3 block - mid + mov x6, v0.d[0] // AES final-2 block - mov low + mov x7, v0.d[1] // AES final-2 block - mov high + eor v22.8b, v22.8b, v4.8b // GHASH final-3 block - mid + movi v8.8b, #0 // suppress further partial tag feed in + pmull2 v9.1q, v4.2d, v15.2d // GHASH final-3 block - high + pmull v10.1q, v22.1d, v10.1d // GHASH final-3 block - mid + eor x6, x6, x13 // AES final-2 block - round N low + pmull v11.1q, v4.1d, v15.1d // GHASH final-3 block - low + eor x7, x7, x14 // AES final-2 block - round N high +Ldec_blocks_more_than_2: // blocks left > 2 + rev64 v4.16b, v5.16b // GHASH final-2 block + ld1 { v5.16b}, [x0], #16 // AES final-1 block - load ciphertext + eor v4.16b, v4.16b, v8.16b // feed in partial tag + stp x6, x7, [x2], #16 // AES final-2 block - store result + eor v0.16b, v5.16b, v2.16b // AES final-1 block - result + mov d22, v4.d[1] // GHASH final-2 block - mid + pmull v21.1q, v4.1d, v14.1d // GHASH final-2 block - low + pmull2 v20.1q, v4.2d, v14.2d // GHASH final-2 block - high + eor v22.8b, v22.8b, v4.8b // GHASH final-2 block - mid + mov x6, v0.d[0] // AES final-1 block - mov low + mov x7, v0.d[1] // AES final-1 block - mov high + eor v11.16b, v11.16b, v21.16b // GHASH final-2 block - low + movi v8.8b, #0 // suppress further partial tag feed in + pmull v22.1q, v22.1d, v17.1d // GHASH final-2 block - mid + eor v9.16b, v9.16b, v20.16b // GHASH final-2 block - high + eor x6, x6, x13 // AES final-1 block - round N low + eor v10.16b, v10.16b, v22.16b // GHASH final-2 block - mid + eor x7, x7, x14 // AES final-1 block - round N high +Ldec_blocks_more_than_1: // blocks left > 1 + stp x6, x7, [x2], #16 // AES final-1 block - store result + rev64 v4.16b, v5.16b // GHASH final-1 block + ld1 { v5.16b}, [x0], #16 // AES final block - load ciphertext + eor v4.16b, v4.16b, v8.16b // feed in partial tag + movi v8.8b, #0 // suppress further partial tag feed in + mov d22, v4.d[1] // GHASH final-1 block - mid + eor v0.16b, v5.16b, v3.16b // AES final block - result + pmull2 v20.1q, v4.2d, v13.2d // GHASH final-1 block - high + eor v22.8b, v22.8b, v4.8b // 
GHASH final-1 block - mid + pmull v21.1q, v4.1d, v13.1d // GHASH final-1 block - low + mov x6, v0.d[0] // AES final block - mov low + ins v22.d[1], v22.d[0] // GHASH final-1 block - mid + mov x7, v0.d[1] // AES final block - mov high + pmull2 v22.1q, v22.2d, v16.2d // GHASH final-1 block - mid + eor x6, x6, x13 // AES final block - round N low + eor v11.16b, v11.16b, v21.16b // GHASH final-1 block - low + eor v9.16b, v9.16b, v20.16b // GHASH final-1 block - high + eor v10.16b, v10.16b, v22.16b // GHASH final-1 block - mid + eor x7, x7, x14 // AES final block - round N high +Ldec_blocks_less_than_1: // blocks left <= 1 + and x1, x1, #127 // bit_length %= 128 + mvn x14, xzr // rkN_h = 0xffffffffffffffff + sub x1, x1, #128 // bit_length -= 128 + mvn x13, xzr // rkN_l = 0xffffffffffffffff + ldp x4, x5, [x2] // load existing bytes we need to not overwrite + neg x1, x1 // bit_length = 128 - #bits in input (in range [1,128]) + and x1, x1, #127 // bit_length %= 128 + lsr x14, x14, x1 // rkN_h is mask for top 64b of last block + cmp x1, #64 + csel x9, x13, x14, lt + csel x10, x14, xzr, lt + fmov d0, x9 // ctr0b is mask for last block + and x6, x6, x9 + mov v0.d[1], x10 + bic x4, x4, x9 // mask out low existing bytes + rev w9, w12 + bic x5, x5, x10 // mask out high existing bytes + orr x6, x6, x4 + and x7, x7, x10 + orr x7, x7, x5 + and v5.16b, v5.16b, v0.16b // possibly partial last block has zeroes in highest bits + rev64 v4.16b, v5.16b // GHASH final block + eor v4.16b, v4.16b, v8.16b // feed in partial tag + pmull v21.1q, v4.1d, v12.1d // GHASH final block - low + mov d8, v4.d[1] // GHASH final block - mid + eor v8.8b, v8.8b, v4.8b // GHASH final block - mid + pmull2 v20.1q, v4.2d, v12.2d // GHASH final block - high + pmull v8.1q, v8.1d, v16.1d // GHASH final block - mid + eor v9.16b, v9.16b, v20.16b // GHASH final block - high + eor v11.16b, v11.16b, v21.16b // GHASH final block - low + eor v10.16b, v10.16b, v8.16b // GHASH final block - mid + movi v8.8b, #0xc2 + eor v6.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up + shl d8, d8, #56 // mod_constant + eor v10.16b, v10.16b, v6.16b // MODULO - karatsuba tidy up + pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid + ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment + eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid + eor v10.16b, v10.16b, v9.16b // MODULO - fold into mid + pmull v8.1q, v10.1d, v8.1d // MODULO - mid 64b align with low + ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment + eor v11.16b, v11.16b, v8.16b // MODULO - fold into low + stp x6, x7, [x2] + str w9, [x16, #12] // store the updated counter + eor v11.16b, v11.16b, v10.16b // MODULO - fold into low + ext v11.16b, v11.16b, v11.16b, #8 + rev64 v11.16b, v11.16b + mov x0, x15 + st1 { v11.16b }, [x3] + ldp x19, x20, [sp, #16] + ldp x21, x22, [sp, #32] + ldp x23, x24, [sp, #48] + ldp d8, d9, [sp, #64] + ldp d10, d11, [sp, #80] + ldp d12, d13, [sp, #96] + ldp d14, d15, [sp, #112] + ldp x29, x30, [sp], #128 + AARCH64_VALIDATE_LINK_REGISTER + ret + +#endif +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__) diff --git a/ring-0.17.14/pregenerated/aesv8-gcm-armv8-linux64.S b/ring-0.17.14/pregenerated/aesv8-gcm-armv8-linux64.S new file mode 100644 index 0000000000..4e87ce7592 --- /dev/null +++ b/ring-0.17.14/pregenerated/aesv8-gcm-armv8-linux64.S @@ -0,0 +1,1554 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__) +#if __ARM_MAX_ARCH__ >= 8 + +.arch armv8-a+crypto +.text +.globl aes_gcm_enc_kernel +.hidden aes_gcm_enc_kernel +.type aes_gcm_enc_kernel,%function +.align 4 +aes_gcm_enc_kernel: + AARCH64_SIGN_LINK_REGISTER + stp x29, x30, [sp, #-128]! + mov x29, sp + stp x19, x20, [sp, #16] + mov x16, x4 + mov x8, x5 + stp x21, x22, [sp, #32] + stp x23, x24, [sp, #48] + stp d8, d9, [sp, #64] + stp d10, d11, [sp, #80] + stp d12, d13, [sp, #96] + stp d14, d15, [sp, #112] + ldr w17, [x8, #240] + add x19, x8, x17, lsl #4 // borrow input_l1 for last key + ldp x13, x14, [x19] // load round N keys + ldr q31, [x19, #-16] // load round N-1 keys + add x4, x0, x1, lsr #3 // end_input_ptr + lsr x5, x1, #3 // byte_len + mov x15, x5 + ldp x10, x11, [x16] // ctr96_b64, ctr96_t32 + ld1 { v0.16b}, [x16] // special case vector load initial counter so we can start first AES block as quickly as possible + sub x5, x5, #1 // byte_len - 1 + ldr q18, [x8, #0] // load rk0 + and x5, x5, #0xffffffffffffffc0 // number of bytes to be processed in main loop (at least 1 byte must be handled by tail) + ldr q25, [x8, #112] // load rk7 + add x5, x5, x0 + lsr x12, x11, #32 + fmov d2, x10 // CTR block 2 + orr w11, w11, w11 + rev w12, w12 // rev_ctr32 + fmov d1, x10 // CTR block 1 + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b // AES block 0 - round 0 + add w12, w12, #1 // increment rev_ctr32 + rev w9, w12 // CTR block 1 + fmov d3, x10 // CTR block 3 + orr x9, x11, x9, lsl #32 // CTR block 1 + add w12, w12, #1 // CTR block 1 + ldr q19, [x8, #16] // load rk1 + fmov v1.d[1], x9 // CTR block 1 + rev w9, w12 // CTR block 2 + add w12, w12, #1 // CTR block 2 + orr x9, x11, x9, lsl #32 // CTR block 2 + ldr q20, [x8, #32] // load rk2 + fmov v2.d[1], x9 // CTR block 2 + rev w9, w12 // CTR block 3 + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b // AES block 0 - round 1 + orr x9, x11, x9, lsl #32 // CTR block 3 + fmov v3.d[1], x9 // CTR block 3 + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b // AES block 1 - round 0 + ldr q21, [x8, #48] // load rk3 + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b // AES block 0 - round 2 + ldr q24, [x8, #96] // load rk6 + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b // AES block 2 - round 0 + ldr q23, [x8, #80] // load rk5 + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b // AES block 1 - round 1 + ldr q14, [x6, #48] // load h3l | h3h + ext v14.16b, v14.16b, v14.16b, #8 + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b // AES block 3 - round 0 + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b // AES block 2 - round 1 + ldr q22, [x8, #64] // load rk4 + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b // AES block 1 - round 2 + ldr q13, [x6, #32] // load h2l | h2h + ext v13.16b, v13.16b, v13.16b, #8 + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b // AES block 3 - round 1 + ldr q30, [x8, #192] // load rk12 + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b // AES block 2 - round 2 + ldr q15, [x6, #80] // load h4l | h4h + ext v15.16b, v15.16b, v15.16b, #8 + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b // AES block 1 - round 3 + ldr q29, [x8, #176] // load rk11 + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b // AES block 3 - round 2 + ldr q26, [x8, #128] // load rk8 + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b // AES block 2 - round 3 + add w12, w12, #1 // CTR block 3 + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b // AES block 0 - round 3 + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b // AES block 3 - round 3 + ld1 { v11.16b}, [x3] + ext v11.16b, v11.16b, v11.16b, #8 
+ rev64 v11.16b, v11.16b + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b // AES block 2 - round 4 + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b // AES block 0 - round 4 + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b // AES block 1 - round 4 + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b // AES block 3 - round 4 + cmp x17, #12 // setup flags for AES-128/192/256 check + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b // AES block 0 - round 5 + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b // AES block 1 - round 5 + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b // AES block 3 - round 5 + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b // AES block 2 - round 5 + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b // AES block 1 - round 6 + trn2 v17.2d, v14.2d, v15.2d // h4l | h3l + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b // AES block 3 - round 6 + ldr q27, [x8, #144] // load rk9 + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b // AES block 0 - round 6 + ldr q12, [x6] // load h1l | h1h + ext v12.16b, v12.16b, v12.16b, #8 + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b // AES block 2 - round 6 + ldr q28, [x8, #160] // load rk10 + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b // AES block 1 - round 7 + trn1 v9.2d, v14.2d, v15.2d // h4h | h3h + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b // AES block 0 - round 7 + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b // AES block 2 - round 7 + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b // AES block 3 - round 7 + trn2 v16.2d, v12.2d, v13.2d // h2l | h1l + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b // AES block 1 - round 8 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b // AES block 2 - round 8 + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b // AES block 3 - round 8 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b // AES block 0 - round 8 + b.lt .Lenc_finish_first_blocks // branch if AES-128 + + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 1 - round 9 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 2 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 3 - round 9 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 0 - round 9 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 1 - round 10 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 2 - round 10 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 3 - round 10 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 0 - round 10 + b.eq .Lenc_finish_first_blocks // branch if AES-192 + + aese v1.16b, v29.16b + aesmc v1.16b, v1.16b // AES block 1 - round 11 + aese v2.16b, v29.16b + aesmc v2.16b, v2.16b // AES block 2 - round 11 + aese v0.16b, v29.16b + aesmc v0.16b, v0.16b // AES block 0 - round 11 + aese v3.16b, v29.16b + aesmc v3.16b, v3.16b // AES block 3 - round 11 + aese v1.16b, v30.16b + aesmc v1.16b, v1.16b // AES block 1 - round 12 + aese v2.16b, v30.16b + aesmc v2.16b, v2.16b // AES block 2 - round 12 + aese v0.16b, v30.16b + aesmc v0.16b, v0.16b // AES block 0 - round 12 + aese v3.16b, v30.16b + aesmc v3.16b, v3.16b // AES block 3 - round 12 + +.Lenc_finish_first_blocks: + cmp x0, x5 // check if we have <= 4 blocks + eor v17.16b, v17.16b, v9.16b // h4k | h3k + aese v2.16b, v31.16b // AES block 2 - round N-1 + trn1 v8.2d, v12.2d, v13.2d // h2h | h1h + aese v1.16b, v31.16b // AES block 1 - round N-1 + aese v0.16b, v31.16b // AES block 0 - round N-1 + aese v3.16b, v31.16b // AES block 3 - round N-1 + eor v16.16b, v16.16b, v8.16b // h2k | h1k + b.ge .Lenc_tail // handle tail + + ldp x19, x20, [x0, #16] // AES block 1 - load plaintext + rev w9, w12 // CTR block 4 
+ ldp x6, x7, [x0, #0] // AES block 0 - load plaintext + ldp x23, x24, [x0, #48] // AES block 3 - load plaintext + ldp x21, x22, [x0, #32] // AES block 2 - load plaintext + add x0, x0, #64 // AES input_ptr update + eor x19, x19, x13 // AES block 1 - round N low + eor x20, x20, x14 // AES block 1 - round N high + fmov d5, x19 // AES block 1 - mov low + eor x6, x6, x13 // AES block 0 - round N low + eor x7, x7, x14 // AES block 0 - round N high + eor x24, x24, x14 // AES block 3 - round N high + fmov d4, x6 // AES block 0 - mov low + cmp x0, x5 // check if we have <= 8 blocks + fmov v4.d[1], x7 // AES block 0 - mov high + eor x23, x23, x13 // AES block 3 - round N low + eor x21, x21, x13 // AES block 2 - round N low + fmov v5.d[1], x20 // AES block 1 - mov high + fmov d6, x21 // AES block 2 - mov low + add w12, w12, #1 // CTR block 4 + orr x9, x11, x9, lsl #32 // CTR block 4 + fmov d7, x23 // AES block 3 - mov low + eor x22, x22, x14 // AES block 2 - round N high + fmov v6.d[1], x22 // AES block 2 - mov high + eor v4.16b, v4.16b, v0.16b // AES block 0 - result + fmov d0, x10 // CTR block 4 + fmov v0.d[1], x9 // CTR block 4 + rev w9, w12 // CTR block 5 + add w12, w12, #1 // CTR block 5 + eor v5.16b, v5.16b, v1.16b // AES block 1 - result + fmov d1, x10 // CTR block 5 + orr x9, x11, x9, lsl #32 // CTR block 5 + fmov v1.d[1], x9 // CTR block 5 + rev w9, w12 // CTR block 6 + st1 { v4.16b}, [x2], #16 // AES block 0 - store result + fmov v7.d[1], x24 // AES block 3 - mov high + orr x9, x11, x9, lsl #32 // CTR block 6 + eor v6.16b, v6.16b, v2.16b // AES block 2 - result + st1 { v5.16b}, [x2], #16 // AES block 1 - store result + add w12, w12, #1 // CTR block 6 + fmov d2, x10 // CTR block 6 + fmov v2.d[1], x9 // CTR block 6 + st1 { v6.16b}, [x2], #16 // AES block 2 - store result + rev w9, w12 // CTR block 7 + orr x9, x11, x9, lsl #32 // CTR block 7 + eor v7.16b, v7.16b, v3.16b // AES block 3 - result + st1 { v7.16b}, [x2], #16 // AES block 3 - store result + b.ge .Lenc_prepretail // do prepretail + +.Lenc_main_loop: // main loop start + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 0 + rev64 v4.16b, v4.16b // GHASH block 4k (only t0 is free) + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 0 + fmov d3, x10 // CTR block 4k+3 + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 0 + ext v11.16b, v11.16b, v11.16b, #8 // PRE 0 + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 1 + fmov v3.d[1], x9 // CTR block 4k+3 + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 1 + ldp x23, x24, [x0, #48] // AES block 4k+7 - load plaintext + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 1 + ldp x21, x22, [x0, #32] // AES block 4k+6 - load plaintext + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 2 + eor v4.16b, v4.16b, v11.16b // PRE 1 + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 2 + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 0 + eor x23, x23, x13 // AES block 4k+7 - round N low + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 3 + mov d10, v17.d[1] // GHASH block 4k - mid + pmull2 v9.1q, v4.2d, v15.2d // GHASH block 4k - high + eor x22, x22, x14 // AES block 4k+6 - round N high + mov d8, v4.d[1] // GHASH block 4k - mid + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 1 + rev64 v5.16b, v5.16b // GHASH block 4k+1 (t0 and t1 free) + aese v0.16b, v22.16b + aesmc v0.16b, 
v0.16b // AES block 4k+4 - round 4 + pmull v11.1q, v4.1d, v15.1d // GHASH block 4k - low + eor v8.8b, v8.8b, v4.8b // GHASH block 4k - mid + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 2 + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 5 + rev64 v7.16b, v7.16b // GHASH block 4k+3 (t0, t1, t2 and t3 free) + pmull2 v4.1q, v5.2d, v14.2d // GHASH block 4k+1 - high + pmull v10.1q, v8.1d, v10.1d // GHASH block 4k - mid + rev64 v6.16b, v6.16b // GHASH block 4k+2 (t0, t1, and t2 free) + pmull v8.1q, v5.1d, v14.1d // GHASH block 4k+1 - low + eor v9.16b, v9.16b, v4.16b // GHASH block 4k+1 - high + mov d4, v5.d[1] // GHASH block 4k+1 - mid + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 3 + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 2 + eor v11.16b, v11.16b, v8.16b // GHASH block 4k+1 - low + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 3 + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 4 + mov d8, v6.d[1] // GHASH block 4k+2 - mid + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 3 + eor v4.8b, v4.8b, v5.8b // GHASH block 4k+1 - mid + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 4 + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 6 + eor v8.8b, v8.8b, v6.8b // GHASH block 4k+2 - mid + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 4 + pmull v4.1q, v4.1d, v17.1d // GHASH block 4k+1 - mid + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 7 + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 5 + ins v8.d[1], v8.d[0] // GHASH block 4k+2 - mid + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 5 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 8 + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 5 + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 6 + eor v10.16b, v10.16b, v4.16b // GHASH block 4k+1 - mid + pmull2 v4.1q, v6.2d, v13.2d // GHASH block 4k+2 - high + pmull v5.1q, v6.1d, v13.1d // GHASH block 4k+2 - low + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 7 + pmull v6.1q, v7.1d, v12.1d // GHASH block 4k+3 - low + eor v9.16b, v9.16b, v4.16b // GHASH block 4k+2 - high + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 6 + ldp x19, x20, [x0, #16] // AES block 4k+5 - load plaintext + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 8 + mov d4, v7.d[1] // GHASH block 4k+3 - mid + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 6 + eor v11.16b, v11.16b, v5.16b // GHASH block 4k+2 - low + pmull2 v8.1q, v8.2d, v16.2d // GHASH block 4k+2 - mid + pmull2 v5.1q, v7.2d, v12.2d // GHASH block 4k+3 - high + eor v4.8b, v4.8b, v7.8b // GHASH block 4k+3 - mid + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 7 + eor x19, x19, x13 // AES block 4k+5 - round N low + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 8 + eor v10.16b, v10.16b, v8.16b // GHASH block 4k+2 - mid + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 7 + eor x21, x21, x13 // AES block 4k+6 - round N low + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 8 + movi v8.8b, #0xc2 + pmull v4.1q, v4.1d, v16.1d // GHASH block 4k+3 - mid + eor v9.16b, v9.16b, v5.16b // GHASH block 4k+3 - high + cmp x17, #12 // setup flags 
for AES-128/192/256 check + fmov d5, x19 // AES block 4k+5 - mov low + ldp x6, x7, [x0, #0] // AES block 4k+4 - load plaintext + b.lt .Lenc_main_loop_continue // branch if AES-128 + + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 9 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 9 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 9 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 10 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 10 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 10 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 10 + b.eq .Lenc_main_loop_continue // branch if AES-192 + + aese v0.16b, v29.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 11 + aese v1.16b, v29.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 11 + aese v2.16b, v29.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 11 + aese v3.16b, v29.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 11 + aese v1.16b, v30.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 12 + aese v0.16b, v30.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 12 + aese v2.16b, v30.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 12 + aese v3.16b, v30.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 12 + +.Lenc_main_loop_continue: + shl d8, d8, #56 // mod_constant + eor v11.16b, v11.16b, v6.16b // GHASH block 4k+3 - low + eor v10.16b, v10.16b, v4.16b // GHASH block 4k+3 - mid + add w12, w12, #1 // CTR block 4k+3 + eor v4.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up + add x0, x0, #64 // AES input_ptr update + pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid + rev w9, w12 // CTR block 4k+8 + ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment + eor x6, x6, x13 // AES block 4k+4 - round N low + eor v10.16b, v10.16b, v4.16b // MODULO - karatsuba tidy up + eor x7, x7, x14 // AES block 4k+4 - round N high + fmov d4, x6 // AES block 4k+4 - mov low + orr x9, x11, x9, lsl #32 // CTR block 4k+8 + eor v7.16b, v9.16b, v7.16b // MODULO - fold into mid + eor x20, x20, x14 // AES block 4k+5 - round N high + eor x24, x24, x14 // AES block 4k+7 - round N high + add w12, w12, #1 // CTR block 4k+8 + aese v0.16b, v31.16b // AES block 4k+4 - round N-1 + fmov v4.d[1], x7 // AES block 4k+4 - mov high + eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid + fmov d7, x23 // AES block 4k+7 - mov low + aese v1.16b, v31.16b // AES block 4k+5 - round N-1 + fmov v5.d[1], x20 // AES block 4k+5 - mov high + fmov d6, x21 // AES block 4k+6 - mov low + cmp x0, x5 // .LOOP CONTROL + fmov v6.d[1], x22 // AES block 4k+6 - mov high + pmull v9.1q, v10.1d, v8.1d // MODULO - mid 64b align with low + eor v4.16b, v4.16b, v0.16b // AES block 4k+4 - result + fmov d0, x10 // CTR block 4k+8 + fmov v0.d[1], x9 // CTR block 4k+8 + rev w9, w12 // CTR block 4k+9 + add w12, w12, #1 // CTR block 4k+9 + eor v5.16b, v5.16b, v1.16b // AES block 4k+5 - result + fmov d1, x10 // CTR block 4k+9 + orr x9, x11, x9, lsl #32 // CTR block 4k+9 + fmov v1.d[1], x9 // CTR block 4k+9 + aese v2.16b, v31.16b // AES block 4k+6 - round N-1 + rev w9, w12 // CTR block 4k+10 + st1 { v4.16b}, [x2], #16 // AES block 4k+4 - store result + orr x9, x11, x9, lsl #32 // CTR block 4k+10 + eor v11.16b, v11.16b, v9.16b // MODULO - fold into low + fmov v7.d[1], x24 // AES block 4k+7 - mov high + ext v10.16b, v10.16b, v10.16b, #8 // MODULO 
- other mid alignment + st1 { v5.16b}, [x2], #16 // AES block 4k+5 - store result + add w12, w12, #1 // CTR block 4k+10 + aese v3.16b, v31.16b // AES block 4k+7 - round N-1 + eor v6.16b, v6.16b, v2.16b // AES block 4k+6 - result + fmov d2, x10 // CTR block 4k+10 + st1 { v6.16b}, [x2], #16 // AES block 4k+6 - store result + fmov v2.d[1], x9 // CTR block 4k+10 + rev w9, w12 // CTR block 4k+11 + eor v11.16b, v11.16b, v10.16b // MODULO - fold into low + orr x9, x11, x9, lsl #32 // CTR block 4k+11 + eor v7.16b, v7.16b, v3.16b // AES block 4k+7 - result + st1 { v7.16b}, [x2], #16 // AES block 4k+7 - store result + b.lt .Lenc_main_loop + +.Lenc_prepretail: // PREPRETAIL + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 0 + rev64 v6.16b, v6.16b // GHASH block 4k+2 (t0, t1, and t2 free) + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 0 + fmov d3, x10 // CTR block 4k+3 + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 0 + rev64 v4.16b, v4.16b // GHASH block 4k (only t0 is free) + fmov v3.d[1], x9 // CTR block 4k+3 + ext v11.16b, v11.16b, v11.16b, #8 // PRE 0 + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 1 + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 1 + eor v4.16b, v4.16b, v11.16b // PRE 1 + rev64 v5.16b, v5.16b // GHASH block 4k+1 (t0 and t1 free) + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 2 + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 0 + mov d10, v17.d[1] // GHASH block 4k - mid + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 1 + pmull v11.1q, v4.1d, v15.1d // GHASH block 4k - low + mov d8, v4.d[1] // GHASH block 4k - mid + pmull2 v9.1q, v4.2d, v15.2d // GHASH block 4k - high + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 3 + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 2 + eor v8.8b, v8.8b, v4.8b // GHASH block 4k - mid + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 2 + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 1 + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 3 + pmull v10.1q, v8.1d, v10.1d // GHASH block 4k - mid + pmull2 v4.1q, v5.2d, v14.2d // GHASH block 4k+1 - high + pmull v8.1q, v5.1d, v14.1d // GHASH block 4k+1 - low + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 2 + eor v9.16b, v9.16b, v4.16b // GHASH block 4k+1 - high + mov d4, v5.d[1] // GHASH block 4k+1 - mid + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 3 + eor v11.16b, v11.16b, v8.16b // GHASH block 4k+1 - low + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 3 + eor v4.8b, v4.8b, v5.8b // GHASH block 4k+1 - mid + mov d8, v6.d[1] // GHASH block 4k+2 - mid + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 4 + rev64 v7.16b, v7.16b // GHASH block 4k+3 (t0, t1, t2 and t3 free) + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 4 + pmull v4.1q, v4.1d, v17.1d // GHASH block 4k+1 - mid + eor v8.8b, v8.8b, v6.8b // GHASH block 4k+2 - mid + add w12, w12, #1 // CTR block 4k+3 + pmull v5.1q, v6.1d, v13.1d // GHASH block 4k+2 - low + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 5 + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 4 + eor v10.16b, v10.16b, v4.16b // GHASH block 4k+1 - mid + pmull2 v4.1q, v6.2d, v13.2d // GHASH block 4k+2 - high + eor v11.16b, 
v11.16b, v5.16b // GHASH block 4k+2 - low + ins v8.d[1], v8.d[0] // GHASH block 4k+2 - mid + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 5 + eor v9.16b, v9.16b, v4.16b // GHASH block 4k+2 - high + mov d4, v7.d[1] // GHASH block 4k+3 - mid + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 4 + pmull2 v8.1q, v8.2d, v16.2d // GHASH block 4k+2 - mid + eor v4.8b, v4.8b, v7.8b // GHASH block 4k+3 - mid + pmull2 v5.1q, v7.2d, v12.2d // GHASH block 4k+3 - high + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 5 + pmull v4.1q, v4.1d, v16.1d // GHASH block 4k+3 - mid + eor v10.16b, v10.16b, v8.16b // GHASH block 4k+2 - mid + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 5 + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 6 + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 6 + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 6 + movi v8.8b, #0xc2 + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 6 + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 7 + eor v9.16b, v9.16b, v5.16b // GHASH block 4k+3 - high + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 7 + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 7 + shl d8, d8, #56 // mod_constant + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 8 + eor v10.16b, v10.16b, v4.16b // GHASH block 4k+3 - mid + pmull v6.1q, v7.1d, v12.1d // GHASH block 4k+3 - low + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 8 + cmp x17, #12 // setup flags for AES-128/192/256 check + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 8 + eor v11.16b, v11.16b, v6.16b // GHASH block 4k+3 - low + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 7 + eor v10.16b, v10.16b, v9.16b // karatsuba tidy up + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 8 + pmull v4.1q, v9.1d, v8.1d + ext v9.16b, v9.16b, v9.16b, #8 + eor v10.16b, v10.16b, v11.16b + b.lt .Lenc_finish_prepretail // branch if AES-128 + + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 9 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 9 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 9 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 10 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 10 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 10 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 10 + b.eq .Lenc_finish_prepretail // branch if AES-192 + + aese v1.16b, v29.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 11 + aese v0.16b, v29.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 11 + aese v3.16b, v29.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 11 + aese v2.16b, v29.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 11 + aese v1.16b, v30.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 12 + aese v0.16b, v30.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 12 + aese v3.16b, v30.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 12 + aese v2.16b, v30.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 12 + +.Lenc_finish_prepretail: + eor v10.16b, v10.16b, v4.16b + eor v10.16b, v10.16b, v9.16b + pmull v4.1q, v10.1d, v8.1d 
+ ext v10.16b, v10.16b, v10.16b, #8 + aese v1.16b, v31.16b // AES block 4k+5 - round N-1 + eor v11.16b, v11.16b, v4.16b + aese v3.16b, v31.16b // AES block 4k+7 - round N-1 + aese v0.16b, v31.16b // AES block 4k+4 - round N-1 + aese v2.16b, v31.16b // AES block 4k+6 - round N-1 + eor v11.16b, v11.16b, v10.16b + +.Lenc_tail: // TAIL + ext v8.16b, v11.16b, v11.16b, #8 // prepare final partial tag + sub x5, x4, x0 // main_end_input_ptr is number of bytes left to process + ldp x6, x7, [x0], #16 // AES block 4k+4 - load plaintext + eor x6, x6, x13 // AES block 4k+4 - round N low + eor x7, x7, x14 // AES block 4k+4 - round N high + cmp x5, #48 + fmov d4, x6 // AES block 4k+4 - mov low + fmov v4.d[1], x7 // AES block 4k+4 - mov high + eor v5.16b, v4.16b, v0.16b // AES block 4k+4 - result + b.gt .Lenc_blocks_more_than_3 + cmp x5, #32 + mov v3.16b, v2.16b + movi v11.8b, #0 + movi v9.8b, #0 + sub w12, w12, #1 + mov v2.16b, v1.16b + movi v10.8b, #0 + b.gt .Lenc_blocks_more_than_2 + mov v3.16b, v1.16b + sub w12, w12, #1 + cmp x5, #16 + b.gt .Lenc_blocks_more_than_1 + sub w12, w12, #1 + b .Lenc_blocks_less_than_1 +.Lenc_blocks_more_than_3: // blocks left > 3 + st1 { v5.16b}, [x2], #16 // AES final-3 block - store result + ldp x6, x7, [x0], #16 // AES final-2 block - load input low & high + rev64 v4.16b, v5.16b // GHASH final-3 block + eor x6, x6, x13 // AES final-2 block - round N low + eor v4.16b, v4.16b, v8.16b // feed in partial tag + eor x7, x7, x14 // AES final-2 block - round N high + mov d22, v4.d[1] // GHASH final-3 block - mid + fmov d5, x6 // AES final-2 block - mov low + fmov v5.d[1], x7 // AES final-2 block - mov high + eor v22.8b, v22.8b, v4.8b // GHASH final-3 block - mid + movi v8.8b, #0 // suppress further partial tag feed in + mov d10, v17.d[1] // GHASH final-3 block - mid + pmull v11.1q, v4.1d, v15.1d // GHASH final-3 block - low + pmull2 v9.1q, v4.2d, v15.2d // GHASH final-3 block - high + pmull v10.1q, v22.1d, v10.1d // GHASH final-3 block - mid + eor v5.16b, v5.16b, v1.16b // AES final-2 block - result +.Lenc_blocks_more_than_2: // blocks left > 2 + st1 { v5.16b}, [x2], #16 // AES final-2 block - store result + ldp x6, x7, [x0], #16 // AES final-1 block - load input low & high + rev64 v4.16b, v5.16b // GHASH final-2 block + eor x6, x6, x13 // AES final-1 block - round N low + eor v4.16b, v4.16b, v8.16b // feed in partial tag + fmov d5, x6 // AES final-1 block - mov low + eor x7, x7, x14 // AES final-1 block - round N high + fmov v5.d[1], x7 // AES final-1 block - mov high + movi v8.8b, #0 // suppress further partial tag feed in + pmull2 v20.1q, v4.2d, v14.2d // GHASH final-2 block - high + mov d22, v4.d[1] // GHASH final-2 block - mid + pmull v21.1q, v4.1d, v14.1d // GHASH final-2 block - low + eor v22.8b, v22.8b, v4.8b // GHASH final-2 block - mid + eor v5.16b, v5.16b, v2.16b // AES final-1 block - result + eor v9.16b, v9.16b, v20.16b // GHASH final-2 block - high + pmull v22.1q, v22.1d, v17.1d // GHASH final-2 block - mid + eor v11.16b, v11.16b, v21.16b // GHASH final-2 block - low + eor v10.16b, v10.16b, v22.16b // GHASH final-2 block - mid +.Lenc_blocks_more_than_1: // blocks left > 1 + st1 { v5.16b}, [x2], #16 // AES final-1 block - store result + rev64 v4.16b, v5.16b // GHASH final-1 block + ldp x6, x7, [x0], #16 // AES final block - load input low & high + eor v4.16b, v4.16b, v8.16b // feed in partial tag + movi v8.8b, #0 // suppress further partial tag feed in + eor x6, x6, x13 // AES final block - round N low + mov d22, v4.d[1] // GHASH final-1 block - mid + pmull2 v20.1q, 
v4.2d, v13.2d // GHASH final-1 block - high + eor x7, x7, x14 // AES final block - round N high + eor v22.8b, v22.8b, v4.8b // GHASH final-1 block - mid + eor v9.16b, v9.16b, v20.16b // GHASH final-1 block - high + ins v22.d[1], v22.d[0] // GHASH final-1 block - mid + fmov d5, x6 // AES final block - mov low + fmov v5.d[1], x7 // AES final block - mov high + pmull2 v22.1q, v22.2d, v16.2d // GHASH final-1 block - mid + pmull v21.1q, v4.1d, v13.1d // GHASH final-1 block - low + eor v5.16b, v5.16b, v3.16b // AES final block - result + eor v10.16b, v10.16b, v22.16b // GHASH final-1 block - mid + eor v11.16b, v11.16b, v21.16b // GHASH final-1 block - low +.Lenc_blocks_less_than_1: // blocks left <= 1 + and x1, x1, #127 // bit_length %= 128 + mvn x13, xzr // rkN_l = 0xffffffffffffffff + sub x1, x1, #128 // bit_length -= 128 + neg x1, x1 // bit_length = 128 - #bits in input (in range [1,128]) + ld1 { v18.16b}, [x2] // load existing bytes where the possibly partial last block is to be stored + mvn x14, xzr // rkN_h = 0xffffffffffffffff + and x1, x1, #127 // bit_length %= 128 + lsr x14, x14, x1 // rkN_h is mask for top 64b of last block + cmp x1, #64 + csel x6, x13, x14, lt + csel x7, x14, xzr, lt + fmov d0, x6 // ctr0b is mask for last block + fmov v0.d[1], x7 + and v5.16b, v5.16b, v0.16b // possibly partial last block has zeroes in highest bits + rev64 v4.16b, v5.16b // GHASH final block + eor v4.16b, v4.16b, v8.16b // feed in partial tag + bif v5.16b, v18.16b, v0.16b // insert existing bytes in top end of result before storing + pmull2 v20.1q, v4.2d, v12.2d // GHASH final block - high + mov d8, v4.d[1] // GHASH final block - mid + rev w9, w12 + pmull v21.1q, v4.1d, v12.1d // GHASH final block - low + eor v9.16b, v9.16b, v20.16b // GHASH final block - high + eor v8.8b, v8.8b, v4.8b // GHASH final block - mid + pmull v8.1q, v8.1d, v16.1d // GHASH final block - mid + eor v11.16b, v11.16b, v21.16b // GHASH final block - low + eor v10.16b, v10.16b, v8.16b // GHASH final block - mid + movi v8.8b, #0xc2 + eor v4.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up + shl d8, d8, #56 // mod_constant + eor v10.16b, v10.16b, v4.16b // MODULO - karatsuba tidy up + pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid + ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment + eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid + eor v10.16b, v10.16b, v9.16b // MODULO - fold into mid + pmull v9.1q, v10.1d, v8.1d // MODULO - mid 64b align with low + ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment + str w9, [x16, #12] // store the updated counter + st1 { v5.16b}, [x2] // store all 16B + eor v11.16b, v11.16b, v9.16b // MODULO - fold into low + eor v11.16b, v11.16b, v10.16b // MODULO - fold into low + ext v11.16b, v11.16b, v11.16b, #8 + rev64 v11.16b, v11.16b + mov x0, x15 + st1 { v11.16b }, [x3] + ldp x19, x20, [sp, #16] + ldp x21, x22, [sp, #32] + ldp x23, x24, [sp, #48] + ldp d8, d9, [sp, #64] + ldp d10, d11, [sp, #80] + ldp d12, d13, [sp, #96] + ldp d14, d15, [sp, #112] + ldp x29, x30, [sp], #128 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size aes_gcm_enc_kernel,.-aes_gcm_enc_kernel +.globl aes_gcm_dec_kernel +.hidden aes_gcm_dec_kernel +.type aes_gcm_dec_kernel,%function +.align 4 +aes_gcm_dec_kernel: + AARCH64_SIGN_LINK_REGISTER + stp x29, x30, [sp, #-128]! 
+ mov x29, sp + stp x19, x20, [sp, #16] + mov x16, x4 + mov x8, x5 + stp x21, x22, [sp, #32] + stp x23, x24, [sp, #48] + stp d8, d9, [sp, #64] + stp d10, d11, [sp, #80] + stp d12, d13, [sp, #96] + stp d14, d15, [sp, #112] + ldr w17, [x8, #240] + add x19, x8, x17, lsl #4 // borrow input_l1 for last key + ldp x13, x14, [x19] // load round N keys + ldr q31, [x19, #-16] // load round N-1 keys + lsr x5, x1, #3 // byte_len + mov x15, x5 + ldp x10, x11, [x16] // ctr96_b64, ctr96_t32 + ldr q26, [x8, #128] // load rk8 + sub x5, x5, #1 // byte_len - 1 + ldr q25, [x8, #112] // load rk7 + and x5, x5, #0xffffffffffffffc0 // number of bytes to be processed in main loop (at least 1 byte must be handled by tail) + add x4, x0, x1, lsr #3 // end_input_ptr + ldr q24, [x8, #96] // load rk6 + lsr x12, x11, #32 + ldr q23, [x8, #80] // load rk5 + orr w11, w11, w11 + ldr q21, [x8, #48] // load rk3 + add x5, x5, x0 + rev w12, w12 // rev_ctr32 + add w12, w12, #1 // increment rev_ctr32 + fmov d3, x10 // CTR block 3 + rev w9, w12 // CTR block 1 + add w12, w12, #1 // CTR block 1 + fmov d1, x10 // CTR block 1 + orr x9, x11, x9, lsl #32 // CTR block 1 + ld1 { v0.16b}, [x16] // special case vector load initial counter so we can start first AES block as quickly as possible + fmov v1.d[1], x9 // CTR block 1 + rev w9, w12 // CTR block 2 + add w12, w12, #1 // CTR block 2 + fmov d2, x10 // CTR block 2 + orr x9, x11, x9, lsl #32 // CTR block 2 + fmov v2.d[1], x9 // CTR block 2 + rev w9, w12 // CTR block 3 + orr x9, x11, x9, lsl #32 // CTR block 3 + ldr q18, [x8, #0] // load rk0 + fmov v3.d[1], x9 // CTR block 3 + add w12, w12, #1 // CTR block 3 + ldr q22, [x8, #64] // load rk4 + ldr q19, [x8, #16] // load rk1 + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b // AES block 0 - round 0 + ldr q14, [x6, #48] // load h3l | h3h + ext v14.16b, v14.16b, v14.16b, #8 + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b // AES block 3 - round 0 + ldr q15, [x6, #80] // load h4l | h4h + ext v15.16b, v15.16b, v15.16b, #8 + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b // AES block 1 - round 0 + ldr q13, [x6, #32] // load h2l | h2h + ext v13.16b, v13.16b, v13.16b, #8 + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b // AES block 2 - round 0 + ldr q20, [x8, #32] // load rk2 + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b // AES block 0 - round 1 + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b // AES block 1 - round 1 + ld1 { v11.16b}, [x3] + ext v11.16b, v11.16b, v11.16b, #8 + rev64 v11.16b, v11.16b + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b // AES block 2 - round 1 + ldr q27, [x8, #144] // load rk9 + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b // AES block 3 - round 1 + ldr q30, [x8, #192] // load rk12 + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b // AES block 0 - round 2 + ldr q12, [x6] // load h1l | h1h + ext v12.16b, v12.16b, v12.16b, #8 + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b // AES block 2 - round 2 + ldr q28, [x8, #160] // load rk10 + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b // AES block 3 - round 2 + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b // AES block 0 - round 3 + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b // AES block 1 - round 2 + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b // AES block 3 - round 3 + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b // AES block 0 - round 4 + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b // AES block 2 - round 3 + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b // AES block 1 - round 3 + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b // AES block 3 - round 4 + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b // AES block 
2 - round 4 + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b // AES block 1 - round 4 + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b // AES block 3 - round 5 + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b // AES block 0 - round 5 + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b // AES block 1 - round 5 + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b // AES block 2 - round 5 + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b // AES block 0 - round 6 + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b // AES block 3 - round 6 + cmp x17, #12 // setup flags for AES-128/192/256 check + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b // AES block 1 - round 6 + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b // AES block 2 - round 6 + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b // AES block 0 - round 7 + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b // AES block 1 - round 7 + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b // AES block 3 - round 7 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b // AES block 0 - round 8 + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b // AES block 2 - round 7 + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b // AES block 3 - round 8 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b // AES block 1 - round 8 + ldr q29, [x8, #176] // load rk11 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b // AES block 2 - round 8 + b.lt .Ldec_finish_first_blocks // branch if AES-128 + + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 0 - round 9 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 1 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 3 - round 9 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 2 - round 9 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 0 - round 10 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 1 - round 10 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 3 - round 10 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 2 - round 10 + b.eq .Ldec_finish_first_blocks // branch if AES-192 + + aese v0.16b, v29.16b + aesmc v0.16b, v0.16b // AES block 0 - round 11 + aese v3.16b, v29.16b + aesmc v3.16b, v3.16b // AES block 3 - round 11 + aese v1.16b, v29.16b + aesmc v1.16b, v1.16b // AES block 1 - round 11 + aese v2.16b, v29.16b + aesmc v2.16b, v2.16b // AES block 2 - round 11 + aese v1.16b, v30.16b + aesmc v1.16b, v1.16b // AES block 1 - round 12 + aese v0.16b, v30.16b + aesmc v0.16b, v0.16b // AES block 0 - round 12 + aese v2.16b, v30.16b + aesmc v2.16b, v2.16b // AES block 2 - round 12 + aese v3.16b, v30.16b + aesmc v3.16b, v3.16b // AES block 3 - round 12 + +.Ldec_finish_first_blocks: + cmp x0, x5 // check if we have <= 4 blocks + trn1 v9.2d, v14.2d, v15.2d // h4h | h3h + trn2 v17.2d, v14.2d, v15.2d // h4l | h3l + trn1 v8.2d, v12.2d, v13.2d // h2h | h1h + trn2 v16.2d, v12.2d, v13.2d // h2l | h1l + eor v17.16b, v17.16b, v9.16b // h4k | h3k + aese v1.16b, v31.16b // AES block 1 - round N-1 + aese v2.16b, v31.16b // AES block 2 - round N-1 + eor v16.16b, v16.16b, v8.16b // h2k | h1k + aese v3.16b, v31.16b // AES block 3 - round N-1 + aese v0.16b, v31.16b // AES block 0 - round N-1 + b.ge .Ldec_tail // handle tail + + ldr q4, [x0, #0] // AES block 0 - load ciphertext + ldr q5, [x0, #16] // AES block 1 - load ciphertext + rev w9, w12 // CTR block 4 + eor v0.16b, v4.16b, v0.16b // AES block 0 - result + eor v1.16b, v5.16b, v1.16b // AES block 1 - result + rev64 v5.16b, v5.16b // GHASH block 1 + ldr q7, [x0, #48] // AES block 3 - load ciphertext + mov x7, v0.d[1] // AES block 0 - mov high + mov x6, v0.d[0] // AES block 0 - mov 
low + rev64 v4.16b, v4.16b // GHASH block 0 + add w12, w12, #1 // CTR block 4 + fmov d0, x10 // CTR block 4 + orr x9, x11, x9, lsl #32 // CTR block 4 + fmov v0.d[1], x9 // CTR block 4 + rev w9, w12 // CTR block 5 + add w12, w12, #1 // CTR block 5 + mov x19, v1.d[0] // AES block 1 - mov low + orr x9, x11, x9, lsl #32 // CTR block 5 + mov x20, v1.d[1] // AES block 1 - mov high + eor x7, x7, x14 // AES block 0 - round N high + eor x6, x6, x13 // AES block 0 - round N low + stp x6, x7, [x2], #16 // AES block 0 - store result + fmov d1, x10 // CTR block 5 + ldr q6, [x0, #32] // AES block 2 - load ciphertext + add x0, x0, #64 // AES input_ptr update + fmov v1.d[1], x9 // CTR block 5 + rev w9, w12 // CTR block 6 + add w12, w12, #1 // CTR block 6 + eor x19, x19, x13 // AES block 1 - round N low + orr x9, x11, x9, lsl #32 // CTR block 6 + eor x20, x20, x14 // AES block 1 - round N high + stp x19, x20, [x2], #16 // AES block 1 - store result + eor v2.16b, v6.16b, v2.16b // AES block 2 - result + cmp x0, x5 // check if we have <= 8 blocks + b.ge .Ldec_prepretail // do prepretail + +.Ldec_main_loop: // main loop start + mov x21, v2.d[0] // AES block 4k+2 - mov low + ext v11.16b, v11.16b, v11.16b, #8 // PRE 0 + eor v3.16b, v7.16b, v3.16b // AES block 4k+3 - result + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 0 + mov x22, v2.d[1] // AES block 4k+2 - mov high + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 0 + fmov d2, x10 // CTR block 4k+6 + fmov v2.d[1], x9 // CTR block 4k+6 + eor v4.16b, v4.16b, v11.16b // PRE 1 + rev w9, w12 // CTR block 4k+7 + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 1 + mov x24, v3.d[1] // AES block 4k+3 - mov high + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 1 + mov x23, v3.d[0] // AES block 4k+3 - mov low + pmull2 v9.1q, v4.2d, v15.2d // GHASH block 4k - high + mov d8, v4.d[1] // GHASH block 4k - mid + fmov d3, x10 // CTR block 4k+7 + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 2 + orr x9, x11, x9, lsl #32 // CTR block 4k+7 + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 0 + fmov v3.d[1], x9 // CTR block 4k+7 + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 2 + eor v8.8b, v8.8b, v4.8b // GHASH block 4k - mid + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 3 + eor x22, x22, x14 // AES block 4k+2 - round N high + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 1 + mov d10, v17.d[1] // GHASH block 4k - mid + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 3 + rev64 v6.16b, v6.16b // GHASH block 4k+2 + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 0 + eor x21, x21, x13 // AES block 4k+2 - round N low + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 2 + stp x21, x22, [x2], #16 // AES block 4k+2 - store result + pmull v11.1q, v4.1d, v15.1d // GHASH block 4k - low + pmull2 v4.1q, v5.2d, v14.2d // GHASH block 4k+1 - high + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 3 + rev64 v7.16b, v7.16b // GHASH block 4k+3 + pmull v10.1q, v8.1d, v10.1d // GHASH block 4k - mid + eor x23, x23, x13 // AES block 4k+3 - round N low + pmull v8.1q, v5.1d, v14.1d // GHASH block 4k+1 - low + eor x24, x24, x14 // AES block 4k+3 - round N high + eor v9.16b, v9.16b, v4.16b // GHASH block 4k+1 - high + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 4 + aese v3.16b, v19.16b + 
aesmc v3.16b, v3.16b // AES block 4k+7 - round 1 + mov d4, v5.d[1] // GHASH block 4k+1 - mid + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 4 + eor v11.16b, v11.16b, v8.16b // GHASH block 4k+1 - low + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 5 + add w12, w12, #1 // CTR block 4k+7 + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 2 + mov d8, v6.d[1] // GHASH block 4k+2 - mid + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 4 + eor v4.8b, v4.8b, v5.8b // GHASH block 4k+1 - mid + pmull v5.1q, v6.1d, v13.1d // GHASH block 4k+2 - low + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 3 + eor v8.8b, v8.8b, v6.8b // GHASH block 4k+2 - mid + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 5 + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 5 + eor v11.16b, v11.16b, v5.16b // GHASH block 4k+2 - low + pmull v4.1q, v4.1d, v17.1d // GHASH block 4k+1 - mid + rev w9, w12 // CTR block 4k+8 + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 6 + ins v8.d[1], v8.d[0] // GHASH block 4k+2 - mid + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 6 + add w12, w12, #1 // CTR block 4k+8 + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 4 + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 7 + eor v10.16b, v10.16b, v4.16b // GHASH block 4k+1 - mid + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 7 + pmull2 v4.1q, v6.2d, v13.2d // GHASH block 4k+2 - high + mov d6, v7.d[1] // GHASH block 4k+3 - mid + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 5 + pmull2 v8.1q, v8.2d, v16.2d // GHASH block 4k+2 - mid + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 8 + eor v9.16b, v9.16b, v4.16b // GHASH block 4k+2 - high + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 6 + pmull v4.1q, v7.1d, v12.1d // GHASH block 4k+3 - low + orr x9, x11, x9, lsl #32 // CTR block 4k+8 + eor v10.16b, v10.16b, v8.16b // GHASH block 4k+2 - mid + pmull2 v5.1q, v7.2d, v12.2d // GHASH block 4k+3 - high + cmp x17, #12 // setup flags for AES-128/192/256 check + eor v6.8b, v6.8b, v7.8b // GHASH block 4k+3 - mid + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 8 + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 6 + eor v9.16b, v9.16b, v5.16b // GHASH block 4k+3 - high + pmull v6.1q, v6.1d, v16.1d // GHASH block 4k+3 - mid + movi v8.8b, #0xc2 + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 7 + eor v11.16b, v11.16b, v4.16b // GHASH block 4k+3 - low + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 7 + shl d8, d8, #56 // mod_constant + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 8 + eor v10.16b, v10.16b, v6.16b // GHASH block 4k+3 - mid + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 8 + b.lt .Ldec_main_loop_continue // branch if AES-128 + + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 9 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 9 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 9 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 10 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 10 + aese 
v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 10 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 10 + b.eq .Ldec_main_loop_continue // branch if AES-192 + + aese v0.16b, v29.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 11 + aese v1.16b, v29.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 11 + aese v2.16b, v29.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 11 + aese v3.16b, v29.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 11 + aese v0.16b, v30.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 12 + aese v1.16b, v30.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 12 + aese v2.16b, v30.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 12 + aese v3.16b, v30.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 12 + +.Ldec_main_loop_continue: + pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid + eor v6.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up + ldr q4, [x0, #0] // AES block 4k+4 - load ciphertext + aese v0.16b, v31.16b // AES block 4k+4 - round N-1 + ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment + eor v10.16b, v10.16b, v6.16b // MODULO - karatsuba tidy up + ldr q5, [x0, #16] // AES block 4k+5 - load ciphertext + eor v0.16b, v4.16b, v0.16b // AES block 4k+4 - result + stp x23, x24, [x2], #16 // AES block 4k+3 - store result + eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid + ldr q7, [x0, #48] // AES block 4k+7 - load ciphertext + ldr q6, [x0, #32] // AES block 4k+6 - load ciphertext + mov x7, v0.d[1] // AES block 4k+4 - mov high + eor v10.16b, v10.16b, v9.16b // MODULO - fold into mid + aese v1.16b, v31.16b // AES block 4k+5 - round N-1 + add x0, x0, #64 // AES input_ptr update + mov x6, v0.d[0] // AES block 4k+4 - mov low + fmov d0, x10 // CTR block 4k+8 + fmov v0.d[1], x9 // CTR block 4k+8 + pmull v8.1q, v10.1d, v8.1d // MODULO - mid 64b align with low + eor v1.16b, v5.16b, v1.16b // AES block 4k+5 - result + rev w9, w12 // CTR block 4k+9 + aese v2.16b, v31.16b // AES block 4k+6 - round N-1 + orr x9, x11, x9, lsl #32 // CTR block 4k+9 + cmp x0, x5 // .LOOP CONTROL + add w12, w12, #1 // CTR block 4k+9 + eor x6, x6, x13 // AES block 4k+4 - round N low + eor x7, x7, x14 // AES block 4k+4 - round N high + mov x20, v1.d[1] // AES block 4k+5 - mov high + eor v2.16b, v6.16b, v2.16b // AES block 4k+6 - result + eor v11.16b, v11.16b, v8.16b // MODULO - fold into low + mov x19, v1.d[0] // AES block 4k+5 - mov low + fmov d1, x10 // CTR block 4k+9 + ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment + fmov v1.d[1], x9 // CTR block 4k+9 + rev w9, w12 // CTR block 4k+10 + add w12, w12, #1 // CTR block 4k+10 + aese v3.16b, v31.16b // AES block 4k+7 - round N-1 + orr x9, x11, x9, lsl #32 // CTR block 4k+10 + rev64 v5.16b, v5.16b // GHASH block 4k+5 + eor x20, x20, x14 // AES block 4k+5 - round N high + stp x6, x7, [x2], #16 // AES block 4k+4 - store result + eor x19, x19, x13 // AES block 4k+5 - round N low + stp x19, x20, [x2], #16 // AES block 4k+5 - store result + rev64 v4.16b, v4.16b // GHASH block 4k+4 + eor v11.16b, v11.16b, v10.16b // MODULO - fold into low + b.lt .Ldec_main_loop + +.Ldec_prepretail: // PREPRETAIL + ext v11.16b, v11.16b, v11.16b, #8 // PRE 0 + mov x21, v2.d[0] // AES block 4k+2 - mov low + eor v3.16b, v7.16b, v3.16b // AES block 4k+3 - result + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 0 + mov x22, v2.d[1] // AES block 4k+2 - mov high + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 0 + fmov d2, x10 
// CTR block 4k+6 + fmov v2.d[1], x9 // CTR block 4k+6 + rev w9, w12 // CTR block 4k+7 + eor v4.16b, v4.16b, v11.16b // PRE 1 + rev64 v6.16b, v6.16b // GHASH block 4k+2 + orr x9, x11, x9, lsl #32 // CTR block 4k+7 + mov x23, v3.d[0] // AES block 4k+3 - mov low + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 1 + mov x24, v3.d[1] // AES block 4k+3 - mov high + pmull v11.1q, v4.1d, v15.1d // GHASH block 4k - low + mov d8, v4.d[1] // GHASH block 4k - mid + fmov d3, x10 // CTR block 4k+7 + pmull2 v9.1q, v4.2d, v15.2d // GHASH block 4k - high + fmov v3.d[1], x9 // CTR block 4k+7 + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 0 + mov d10, v17.d[1] // GHASH block 4k - mid + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 1 + eor v8.8b, v8.8b, v4.8b // GHASH block 4k - mid + pmull2 v4.1q, v5.2d, v14.2d // GHASH block 4k+1 - high + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 1 + rev64 v7.16b, v7.16b // GHASH block 4k+3 + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 0 + pmull v10.1q, v8.1d, v10.1d // GHASH block 4k - mid + eor v9.16b, v9.16b, v4.16b // GHASH block 4k+1 - high + pmull v8.1q, v5.1d, v14.1d // GHASH block 4k+1 - low + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 1 + mov d4, v5.d[1] // GHASH block 4k+1 - mid + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 2 + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 2 + eor v11.16b, v11.16b, v8.16b // GHASH block 4k+1 - low + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 2 + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 3 + mov d8, v6.d[1] // GHASH block 4k+2 - mid + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 2 + eor v4.8b, v4.8b, v5.8b // GHASH block 4k+1 - mid + pmull v5.1q, v6.1d, v13.1d // GHASH block 4k+2 - low + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 4 + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 3 + eor v8.8b, v8.8b, v6.8b // GHASH block 4k+2 - mid + pmull v4.1q, v4.1d, v17.1d // GHASH block 4k+1 - mid + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 5 + eor v11.16b, v11.16b, v5.16b // GHASH block 4k+2 - low + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 4 + pmull2 v5.1q, v7.2d, v12.2d // GHASH block 4k+3 - high + eor v10.16b, v10.16b, v4.16b // GHASH block 4k+1 - mid + pmull2 v4.1q, v6.2d, v13.2d // GHASH block 4k+2 - high + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 5 + ins v8.d[1], v8.d[0] // GHASH block 4k+2 - mid + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 3 + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 3 + eor v9.16b, v9.16b, v4.16b // GHASH block 4k+2 - high + pmull v4.1q, v7.1d, v12.1d // GHASH block 4k+3 - low + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 4 + mov d6, v7.d[1] // GHASH block 4k+3 - mid + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 4 + pmull2 v8.1q, v8.2d, v16.2d // GHASH block 4k+2 - mid + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 5 + eor v6.8b, v6.8b, v7.8b // GHASH block 4k+3 - mid + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 5 + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 6 + eor v10.16b, v10.16b, v8.16b // GHASH block 4k+2 - mid + aese 
v2.16b, v24.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 6 + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 6 + movi v8.8b, #0xc2 + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 6 + eor v11.16b, v11.16b, v4.16b // GHASH block 4k+3 - low + pmull v6.1q, v6.1d, v16.1d // GHASH block 4k+3 - mid + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 7 + cmp x17, #12 // setup flags for AES-128/192/256 check + eor v9.16b, v9.16b, v5.16b // GHASH block 4k+3 - high + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 7 + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 7 + eor v10.16b, v10.16b, v6.16b // GHASH block 4k+3 - mid + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 8 + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 7 + eor v6.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 8 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 8 + shl d8, d8, #56 // mod_constant + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 8 + b.lt .Ldec_finish_prepretail // branch if AES-128 + + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 9 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 9 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 9 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 10 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 10 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 10 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 10 + b.eq .Ldec_finish_prepretail // branch if AES-192 + + aese v2.16b, v29.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 11 + aese v0.16b, v29.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 11 + aese v1.16b, v29.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 11 + aese v2.16b, v30.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 12 + aese v3.16b, v29.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 11 + aese v1.16b, v30.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 12 + aese v0.16b, v30.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 12 + aese v3.16b, v30.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 12 + +.Ldec_finish_prepretail: + eor v10.16b, v10.16b, v6.16b // MODULO - karatsuba tidy up + pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid + ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment + eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid + eor x22, x22, x14 // AES block 4k+2 - round N high + eor x23, x23, x13 // AES block 4k+3 - round N low + eor v10.16b, v10.16b, v9.16b // MODULO - fold into mid + add w12, w12, #1 // CTR block 4k+7 + eor x21, x21, x13 // AES block 4k+2 - round N low + pmull v8.1q, v10.1d, v8.1d // MODULO - mid 64b align with low + eor x24, x24, x14 // AES block 4k+3 - round N high + stp x21, x22, [x2], #16 // AES block 4k+2 - store result + ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment + stp x23, x24, [x2], #16 // AES block 4k+3 - store result + + eor v11.16b, v11.16b, v8.16b // MODULO - fold into low + aese v1.16b, v31.16b // AES block 4k+5 - round N-1 + aese v0.16b, v31.16b // AES block 4k+4 - round N-1 + aese v3.16b, v31.16b // AES block 4k+7 - round N-1 + 
aese v2.16b, v31.16b // AES block 4k+6 - round N-1 + eor v11.16b, v11.16b, v10.16b // MODULO - fold into low + +.Ldec_tail: // TAIL + sub x5, x4, x0 // main_end_input_ptr is number of bytes left to process + ld1 { v5.16b}, [x0], #16 // AES block 4k+4 - load ciphertext + eor v0.16b, v5.16b, v0.16b // AES block 4k+4 - result + mov x6, v0.d[0] // AES block 4k+4 - mov low + mov x7, v0.d[1] // AES block 4k+4 - mov high + ext v8.16b, v11.16b, v11.16b, #8 // prepare final partial tag + cmp x5, #48 + eor x6, x6, x13 // AES block 4k+4 - round N low + eor x7, x7, x14 // AES block 4k+4 - round N high + b.gt .Ldec_blocks_more_than_3 + sub w12, w12, #1 + mov v3.16b, v2.16b + movi v10.8b, #0 + movi v11.8b, #0 + cmp x5, #32 + movi v9.8b, #0 + mov v2.16b, v1.16b + b.gt .Ldec_blocks_more_than_2 + sub w12, w12, #1 + mov v3.16b, v1.16b + cmp x5, #16 + b.gt .Ldec_blocks_more_than_1 + sub w12, w12, #1 + b .Ldec_blocks_less_than_1 +.Ldec_blocks_more_than_3: // blocks left > 3 + rev64 v4.16b, v5.16b // GHASH final-3 block + ld1 { v5.16b}, [x0], #16 // AES final-2 block - load ciphertext + stp x6, x7, [x2], #16 // AES final-3 block - store result + mov d10, v17.d[1] // GHASH final-3 block - mid + eor v4.16b, v4.16b, v8.16b // feed in partial tag + eor v0.16b, v5.16b, v1.16b // AES final-2 block - result + mov d22, v4.d[1] // GHASH final-3 block - mid + mov x6, v0.d[0] // AES final-2 block - mov low + mov x7, v0.d[1] // AES final-2 block - mov high + eor v22.8b, v22.8b, v4.8b // GHASH final-3 block - mid + movi v8.8b, #0 // suppress further partial tag feed in + pmull2 v9.1q, v4.2d, v15.2d // GHASH final-3 block - high + pmull v10.1q, v22.1d, v10.1d // GHASH final-3 block - mid + eor x6, x6, x13 // AES final-2 block - round N low + pmull v11.1q, v4.1d, v15.1d // GHASH final-3 block - low + eor x7, x7, x14 // AES final-2 block - round N high +.Ldec_blocks_more_than_2: // blocks left > 2 + rev64 v4.16b, v5.16b // GHASH final-2 block + ld1 { v5.16b}, [x0], #16 // AES final-1 block - load ciphertext + eor v4.16b, v4.16b, v8.16b // feed in partial tag + stp x6, x7, [x2], #16 // AES final-2 block - store result + eor v0.16b, v5.16b, v2.16b // AES final-1 block - result + mov d22, v4.d[1] // GHASH final-2 block - mid + pmull v21.1q, v4.1d, v14.1d // GHASH final-2 block - low + pmull2 v20.1q, v4.2d, v14.2d // GHASH final-2 block - high + eor v22.8b, v22.8b, v4.8b // GHASH final-2 block - mid + mov x6, v0.d[0] // AES final-1 block - mov low + mov x7, v0.d[1] // AES final-1 block - mov high + eor v11.16b, v11.16b, v21.16b // GHASH final-2 block - low + movi v8.8b, #0 // suppress further partial tag feed in + pmull v22.1q, v22.1d, v17.1d // GHASH final-2 block - mid + eor v9.16b, v9.16b, v20.16b // GHASH final-2 block - high + eor x6, x6, x13 // AES final-1 block - round N low + eor v10.16b, v10.16b, v22.16b // GHASH final-2 block - mid + eor x7, x7, x14 // AES final-1 block - round N high +.Ldec_blocks_more_than_1: // blocks left > 1 + stp x6, x7, [x2], #16 // AES final-1 block - store result + rev64 v4.16b, v5.16b // GHASH final-1 block + ld1 { v5.16b}, [x0], #16 // AES final block - load ciphertext + eor v4.16b, v4.16b, v8.16b // feed in partial tag + movi v8.8b, #0 // suppress further partial tag feed in + mov d22, v4.d[1] // GHASH final-1 block - mid + eor v0.16b, v5.16b, v3.16b // AES final block - result + pmull2 v20.1q, v4.2d, v13.2d // GHASH final-1 block - high + eor v22.8b, v22.8b, v4.8b // GHASH final-1 block - mid + pmull v21.1q, v4.1d, v13.1d // GHASH final-1 block - low + mov x6, v0.d[0] // AES final block - 
mov low + ins v22.d[1], v22.d[0] // GHASH final-1 block - mid + mov x7, v0.d[1] // AES final block - mov high + pmull2 v22.1q, v22.2d, v16.2d // GHASH final-1 block - mid + eor x6, x6, x13 // AES final block - round N low + eor v11.16b, v11.16b, v21.16b // GHASH final-1 block - low + eor v9.16b, v9.16b, v20.16b // GHASH final-1 block - high + eor v10.16b, v10.16b, v22.16b // GHASH final-1 block - mid + eor x7, x7, x14 // AES final block - round N high +.Ldec_blocks_less_than_1: // blocks left <= 1 + and x1, x1, #127 // bit_length %= 128 + mvn x14, xzr // rkN_h = 0xffffffffffffffff + sub x1, x1, #128 // bit_length -= 128 + mvn x13, xzr // rkN_l = 0xffffffffffffffff + ldp x4, x5, [x2] // load existing bytes we need to not overwrite + neg x1, x1 // bit_length = 128 - #bits in input (in range [1,128]) + and x1, x1, #127 // bit_length %= 128 + lsr x14, x14, x1 // rkN_h is mask for top 64b of last block + cmp x1, #64 + csel x9, x13, x14, lt + csel x10, x14, xzr, lt + fmov d0, x9 // ctr0b is mask for last block + and x6, x6, x9 + mov v0.d[1], x10 + bic x4, x4, x9 // mask out low existing bytes + rev w9, w12 + bic x5, x5, x10 // mask out high existing bytes + orr x6, x6, x4 + and x7, x7, x10 + orr x7, x7, x5 + and v5.16b, v5.16b, v0.16b // possibly partial last block has zeroes in highest bits + rev64 v4.16b, v5.16b // GHASH final block + eor v4.16b, v4.16b, v8.16b // feed in partial tag + pmull v21.1q, v4.1d, v12.1d // GHASH final block - low + mov d8, v4.d[1] // GHASH final block - mid + eor v8.8b, v8.8b, v4.8b // GHASH final block - mid + pmull2 v20.1q, v4.2d, v12.2d // GHASH final block - high + pmull v8.1q, v8.1d, v16.1d // GHASH final block - mid + eor v9.16b, v9.16b, v20.16b // GHASH final block - high + eor v11.16b, v11.16b, v21.16b // GHASH final block - low + eor v10.16b, v10.16b, v8.16b // GHASH final block - mid + movi v8.8b, #0xc2 + eor v6.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up + shl d8, d8, #56 // mod_constant + eor v10.16b, v10.16b, v6.16b // MODULO - karatsuba tidy up + pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid + ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment + eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid + eor v10.16b, v10.16b, v9.16b // MODULO - fold into mid + pmull v8.1q, v10.1d, v8.1d // MODULO - mid 64b align with low + ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment + eor v11.16b, v11.16b, v8.16b // MODULO - fold into low + stp x6, x7, [x2] + str w9, [x16, #12] // store the updated counter + eor v11.16b, v11.16b, v10.16b // MODULO - fold into low + ext v11.16b, v11.16b, v11.16b, #8 + rev64 v11.16b, v11.16b + mov x0, x15 + st1 { v11.16b }, [x3] + ldp x19, x20, [sp, #16] + ldp x21, x22, [sp, #32] + ldp x23, x24, [sp, #48] + ldp d8, d9, [sp, #64] + ldp d10, d11, [sp, #80] + ldp d12, d13, [sp, #96] + ldp d14, d15, [sp, #112] + ldp x29, x30, [sp], #128 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size aes_gcm_dec_kernel,.-aes_gcm_dec_kernel +#endif +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__) diff --git a/ring-0.17.14/pregenerated/aesv8-gcm-armv8-win64.S b/ring-0.17.14/pregenerated/aesv8-gcm-armv8-win64.S new file mode 100644 index 0000000000..6f39544f29 --- /dev/null +++ b/ring-0.17.14/pregenerated/aesv8-gcm-armv8-win64.S @@ -0,0 +1,1558 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32) +#if __ARM_MAX_ARCH__ >= 8 + +.arch armv8-a+crypto +.text +.globl aes_gcm_enc_kernel + +.def aes_gcm_enc_kernel + .type 32 +.endef +.align 4 +aes_gcm_enc_kernel: + AARCH64_SIGN_LINK_REGISTER + stp x29, x30, [sp, #-128]! + mov x29, sp + stp x19, x20, [sp, #16] + mov x16, x4 + mov x8, x5 + stp x21, x22, [sp, #32] + stp x23, x24, [sp, #48] + stp d8, d9, [sp, #64] + stp d10, d11, [sp, #80] + stp d12, d13, [sp, #96] + stp d14, d15, [sp, #112] + ldr w17, [x8, #240] + add x19, x8, x17, lsl #4 // borrow input_l1 for last key + ldp x13, x14, [x19] // load round N keys + ldr q31, [x19, #-16] // load round N-1 keys + add x4, x0, x1, lsr #3 // end_input_ptr + lsr x5, x1, #3 // byte_len + mov x15, x5 + ldp x10, x11, [x16] // ctr96_b64, ctr96_t32 + ld1 { v0.16b}, [x16] // special case vector load initial counter so we can start first AES block as quickly as possible + sub x5, x5, #1 // byte_len - 1 + ldr q18, [x8, #0] // load rk0 + and x5, x5, #0xffffffffffffffc0 // number of bytes to be processed in main loop (at least 1 byte must be handled by tail) + ldr q25, [x8, #112] // load rk7 + add x5, x5, x0 + lsr x12, x11, #32 + fmov d2, x10 // CTR block 2 + orr w11, w11, w11 + rev w12, w12 // rev_ctr32 + fmov d1, x10 // CTR block 1 + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b // AES block 0 - round 0 + add w12, w12, #1 // increment rev_ctr32 + rev w9, w12 // CTR block 1 + fmov d3, x10 // CTR block 3 + orr x9, x11, x9, lsl #32 // CTR block 1 + add w12, w12, #1 // CTR block 1 + ldr q19, [x8, #16] // load rk1 + fmov v1.d[1], x9 // CTR block 1 + rev w9, w12 // CTR block 2 + add w12, w12, #1 // CTR block 2 + orr x9, x11, x9, lsl #32 // CTR block 2 + ldr q20, [x8, #32] // load rk2 + fmov v2.d[1], x9 // CTR block 2 + rev w9, w12 // CTR block 3 + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b // AES block 0 - round 1 + orr x9, x11, x9, lsl #32 // CTR block 3 + fmov v3.d[1], x9 // CTR block 3 + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b // AES block 1 - round 0 + ldr q21, [x8, #48] // load rk3 + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b // AES block 0 - round 2 + ldr q24, [x8, #96] // load rk6 + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b // AES block 2 - round 0 + ldr q23, [x8, #80] // load rk5 + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b // AES block 1 - round 1 + ldr q14, [x6, #48] // load h3l | h3h + ext v14.16b, v14.16b, v14.16b, #8 + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b // AES block 3 - round 0 + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b // AES block 2 - round 1 + ldr q22, [x8, #64] // load rk4 + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b // AES block 1 - round 2 + ldr q13, [x6, #32] // load h2l | h2h + ext v13.16b, v13.16b, v13.16b, #8 + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b // AES block 3 - round 1 + ldr q30, [x8, #192] // load rk12 + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b // AES block 2 - round 2 + ldr q15, [x6, #80] // load h4l | h4h + ext v15.16b, v15.16b, v15.16b, #8 + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b // AES block 1 - round 3 + ldr q29, [x8, #176] // load rk11 + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b // AES block 3 - round 2 + ldr q26, [x8, #128] // load rk8 + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b // AES block 2 - round 3 + add w12, w12, #1 // CTR block 3 + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b // AES block 0 - round 3 + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b // AES block 3 - round 3 + ld1 { v11.16b}, [x3] + ext v11.16b, v11.16b, v11.16b, #8 + rev64 v11.16b, 
v11.16b + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b // AES block 2 - round 4 + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b // AES block 0 - round 4 + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b // AES block 1 - round 4 + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b // AES block 3 - round 4 + cmp x17, #12 // setup flags for AES-128/192/256 check + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b // AES block 0 - round 5 + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b // AES block 1 - round 5 + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b // AES block 3 - round 5 + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b // AES block 2 - round 5 + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b // AES block 1 - round 6 + trn2 v17.2d, v14.2d, v15.2d // h4l | h3l + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b // AES block 3 - round 6 + ldr q27, [x8, #144] // load rk9 + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b // AES block 0 - round 6 + ldr q12, [x6] // load h1l | h1h + ext v12.16b, v12.16b, v12.16b, #8 + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b // AES block 2 - round 6 + ldr q28, [x8, #160] // load rk10 + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b // AES block 1 - round 7 + trn1 v9.2d, v14.2d, v15.2d // h4h | h3h + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b // AES block 0 - round 7 + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b // AES block 2 - round 7 + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b // AES block 3 - round 7 + trn2 v16.2d, v12.2d, v13.2d // h2l | h1l + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b // AES block 1 - round 8 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b // AES block 2 - round 8 + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b // AES block 3 - round 8 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b // AES block 0 - round 8 + b.lt Lenc_finish_first_blocks // branch if AES-128 + + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 1 - round 9 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 2 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 3 - round 9 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 0 - round 9 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 1 - round 10 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 2 - round 10 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 3 - round 10 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 0 - round 10 + b.eq Lenc_finish_first_blocks // branch if AES-192 + + aese v1.16b, v29.16b + aesmc v1.16b, v1.16b // AES block 1 - round 11 + aese v2.16b, v29.16b + aesmc v2.16b, v2.16b // AES block 2 - round 11 + aese v0.16b, v29.16b + aesmc v0.16b, v0.16b // AES block 0 - round 11 + aese v3.16b, v29.16b + aesmc v3.16b, v3.16b // AES block 3 - round 11 + aese v1.16b, v30.16b + aesmc v1.16b, v1.16b // AES block 1 - round 12 + aese v2.16b, v30.16b + aesmc v2.16b, v2.16b // AES block 2 - round 12 + aese v0.16b, v30.16b + aesmc v0.16b, v0.16b // AES block 0 - round 12 + aese v3.16b, v30.16b + aesmc v3.16b, v3.16b // AES block 3 - round 12 + +Lenc_finish_first_blocks: + cmp x0, x5 // check if we have <= 4 blocks + eor v17.16b, v17.16b, v9.16b // h4k | h3k + aese v2.16b, v31.16b // AES block 2 - round N-1 + trn1 v8.2d, v12.2d, v13.2d // h2h | h1h + aese v1.16b, v31.16b // AES block 1 - round N-1 + aese v0.16b, v31.16b // AES block 0 - round N-1 + aese v3.16b, v31.16b // AES block 3 - round N-1 + eor v16.16b, v16.16b, v8.16b // h2k | h1k + b.ge Lenc_tail // handle tail + + ldp x19, x20, [x0, #16] // AES block 1 - load plaintext + rev w9, w12 // CTR block 4 + ldp x6, x7, [x0, 
#0] // AES block 0 - load plaintext + ldp x23, x24, [x0, #48] // AES block 3 - load plaintext + ldp x21, x22, [x0, #32] // AES block 2 - load plaintext + add x0, x0, #64 // AES input_ptr update + eor x19, x19, x13 // AES block 1 - round N low + eor x20, x20, x14 // AES block 1 - round N high + fmov d5, x19 // AES block 1 - mov low + eor x6, x6, x13 // AES block 0 - round N low + eor x7, x7, x14 // AES block 0 - round N high + eor x24, x24, x14 // AES block 3 - round N high + fmov d4, x6 // AES block 0 - mov low + cmp x0, x5 // check if we have <= 8 blocks + fmov v4.d[1], x7 // AES block 0 - mov high + eor x23, x23, x13 // AES block 3 - round N low + eor x21, x21, x13 // AES block 2 - round N low + fmov v5.d[1], x20 // AES block 1 - mov high + fmov d6, x21 // AES block 2 - mov low + add w12, w12, #1 // CTR block 4 + orr x9, x11, x9, lsl #32 // CTR block 4 + fmov d7, x23 // AES block 3 - mov low + eor x22, x22, x14 // AES block 2 - round N high + fmov v6.d[1], x22 // AES block 2 - mov high + eor v4.16b, v4.16b, v0.16b // AES block 0 - result + fmov d0, x10 // CTR block 4 + fmov v0.d[1], x9 // CTR block 4 + rev w9, w12 // CTR block 5 + add w12, w12, #1 // CTR block 5 + eor v5.16b, v5.16b, v1.16b // AES block 1 - result + fmov d1, x10 // CTR block 5 + orr x9, x11, x9, lsl #32 // CTR block 5 + fmov v1.d[1], x9 // CTR block 5 + rev w9, w12 // CTR block 6 + st1 { v4.16b}, [x2], #16 // AES block 0 - store result + fmov v7.d[1], x24 // AES block 3 - mov high + orr x9, x11, x9, lsl #32 // CTR block 6 + eor v6.16b, v6.16b, v2.16b // AES block 2 - result + st1 { v5.16b}, [x2], #16 // AES block 1 - store result + add w12, w12, #1 // CTR block 6 + fmov d2, x10 // CTR block 6 + fmov v2.d[1], x9 // CTR block 6 + st1 { v6.16b}, [x2], #16 // AES block 2 - store result + rev w9, w12 // CTR block 7 + orr x9, x11, x9, lsl #32 // CTR block 7 + eor v7.16b, v7.16b, v3.16b // AES block 3 - result + st1 { v7.16b}, [x2], #16 // AES block 3 - store result + b.ge Lenc_prepretail // do prepretail + +Lenc_main_loop: // main loop start + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 0 + rev64 v4.16b, v4.16b // GHASH block 4k (only t0 is free) + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 0 + fmov d3, x10 // CTR block 4k+3 + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 0 + ext v11.16b, v11.16b, v11.16b, #8 // PRE 0 + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 1 + fmov v3.d[1], x9 // CTR block 4k+3 + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 1 + ldp x23, x24, [x0, #48] // AES block 4k+7 - load plaintext + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 1 + ldp x21, x22, [x0, #32] // AES block 4k+6 - load plaintext + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 2 + eor v4.16b, v4.16b, v11.16b // PRE 1 + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 2 + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 0 + eor x23, x23, x13 // AES block 4k+7 - round N low + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 3 + mov d10, v17.d[1] // GHASH block 4k - mid + pmull2 v9.1q, v4.2d, v15.2d // GHASH block 4k - high + eor x22, x22, x14 // AES block 4k+6 - round N high + mov d8, v4.d[1] // GHASH block 4k - mid + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 1 + rev64 v5.16b, v5.16b // GHASH block 4k+1 (t0 and t1 free) + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b // AES block 
4k+4 - round 4 + pmull v11.1q, v4.1d, v15.1d // GHASH block 4k - low + eor v8.8b, v8.8b, v4.8b // GHASH block 4k - mid + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 2 + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 5 + rev64 v7.16b, v7.16b // GHASH block 4k+3 (t0, t1, t2 and t3 free) + pmull2 v4.1q, v5.2d, v14.2d // GHASH block 4k+1 - high + pmull v10.1q, v8.1d, v10.1d // GHASH block 4k - mid + rev64 v6.16b, v6.16b // GHASH block 4k+2 (t0, t1, and t2 free) + pmull v8.1q, v5.1d, v14.1d // GHASH block 4k+1 - low + eor v9.16b, v9.16b, v4.16b // GHASH block 4k+1 - high + mov d4, v5.d[1] // GHASH block 4k+1 - mid + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 3 + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 2 + eor v11.16b, v11.16b, v8.16b // GHASH block 4k+1 - low + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 3 + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 4 + mov d8, v6.d[1] // GHASH block 4k+2 - mid + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 3 + eor v4.8b, v4.8b, v5.8b // GHASH block 4k+1 - mid + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 4 + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 6 + eor v8.8b, v8.8b, v6.8b // GHASH block 4k+2 - mid + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 4 + pmull v4.1q, v4.1d, v17.1d // GHASH block 4k+1 - mid + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 7 + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 5 + ins v8.d[1], v8.d[0] // GHASH block 4k+2 - mid + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 5 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 8 + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 5 + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 6 + eor v10.16b, v10.16b, v4.16b // GHASH block 4k+1 - mid + pmull2 v4.1q, v6.2d, v13.2d // GHASH block 4k+2 - high + pmull v5.1q, v6.1d, v13.1d // GHASH block 4k+2 - low + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 7 + pmull v6.1q, v7.1d, v12.1d // GHASH block 4k+3 - low + eor v9.16b, v9.16b, v4.16b // GHASH block 4k+2 - high + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 6 + ldp x19, x20, [x0, #16] // AES block 4k+5 - load plaintext + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 8 + mov d4, v7.d[1] // GHASH block 4k+3 - mid + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 6 + eor v11.16b, v11.16b, v5.16b // GHASH block 4k+2 - low + pmull2 v8.1q, v8.2d, v16.2d // GHASH block 4k+2 - mid + pmull2 v5.1q, v7.2d, v12.2d // GHASH block 4k+3 - high + eor v4.8b, v4.8b, v7.8b // GHASH block 4k+3 - mid + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 7 + eor x19, x19, x13 // AES block 4k+5 - round N low + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 8 + eor v10.16b, v10.16b, v8.16b // GHASH block 4k+2 - mid + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 7 + eor x21, x21, x13 // AES block 4k+6 - round N low + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 8 + movi v8.8b, #0xc2 + pmull v4.1q, v4.1d, v16.1d // GHASH block 4k+3 - mid + eor v9.16b, v9.16b, v5.16b // GHASH block 4k+3 - high + cmp x17, #12 // setup flags for AES-128/192/256 
check + fmov d5, x19 // AES block 4k+5 - mov low + ldp x6, x7, [x0, #0] // AES block 4k+4 - load plaintext + b.lt Lenc_main_loop_continue // branch if AES-128 + + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 9 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 9 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 9 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 10 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 10 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 10 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 10 + b.eq Lenc_main_loop_continue // branch if AES-192 + + aese v0.16b, v29.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 11 + aese v1.16b, v29.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 11 + aese v2.16b, v29.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 11 + aese v3.16b, v29.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 11 + aese v1.16b, v30.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 12 + aese v0.16b, v30.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 12 + aese v2.16b, v30.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 12 + aese v3.16b, v30.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 12 + +Lenc_main_loop_continue: + shl d8, d8, #56 // mod_constant + eor v11.16b, v11.16b, v6.16b // GHASH block 4k+3 - low + eor v10.16b, v10.16b, v4.16b // GHASH block 4k+3 - mid + add w12, w12, #1 // CTR block 4k+3 + eor v4.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up + add x0, x0, #64 // AES input_ptr update + pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid + rev w9, w12 // CTR block 4k+8 + ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment + eor x6, x6, x13 // AES block 4k+4 - round N low + eor v10.16b, v10.16b, v4.16b // MODULO - karatsuba tidy up + eor x7, x7, x14 // AES block 4k+4 - round N high + fmov d4, x6 // AES block 4k+4 - mov low + orr x9, x11, x9, lsl #32 // CTR block 4k+8 + eor v7.16b, v9.16b, v7.16b // MODULO - fold into mid + eor x20, x20, x14 // AES block 4k+5 - round N high + eor x24, x24, x14 // AES block 4k+7 - round N high + add w12, w12, #1 // CTR block 4k+8 + aese v0.16b, v31.16b // AES block 4k+4 - round N-1 + fmov v4.d[1], x7 // AES block 4k+4 - mov high + eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid + fmov d7, x23 // AES block 4k+7 - mov low + aese v1.16b, v31.16b // AES block 4k+5 - round N-1 + fmov v5.d[1], x20 // AES block 4k+5 - mov high + fmov d6, x21 // AES block 4k+6 - mov low + cmp x0, x5 // LOOP CONTROL + fmov v6.d[1], x22 // AES block 4k+6 - mov high + pmull v9.1q, v10.1d, v8.1d // MODULO - mid 64b align with low + eor v4.16b, v4.16b, v0.16b // AES block 4k+4 - result + fmov d0, x10 // CTR block 4k+8 + fmov v0.d[1], x9 // CTR block 4k+8 + rev w9, w12 // CTR block 4k+9 + add w12, w12, #1 // CTR block 4k+9 + eor v5.16b, v5.16b, v1.16b // AES block 4k+5 - result + fmov d1, x10 // CTR block 4k+9 + orr x9, x11, x9, lsl #32 // CTR block 4k+9 + fmov v1.d[1], x9 // CTR block 4k+9 + aese v2.16b, v31.16b // AES block 4k+6 - round N-1 + rev w9, w12 // CTR block 4k+10 + st1 { v4.16b}, [x2], #16 // AES block 4k+4 - store result + orr x9, x11, x9, lsl #32 // CTR block 4k+10 + eor v11.16b, v11.16b, v9.16b // MODULO - fold into low + fmov v7.d[1], x24 // AES block 4k+7 - mov high + ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment + 
st1 { v5.16b}, [x2], #16 // AES block 4k+5 - store result + add w12, w12, #1 // CTR block 4k+10 + aese v3.16b, v31.16b // AES block 4k+7 - round N-1 + eor v6.16b, v6.16b, v2.16b // AES block 4k+6 - result + fmov d2, x10 // CTR block 4k+10 + st1 { v6.16b}, [x2], #16 // AES block 4k+6 - store result + fmov v2.d[1], x9 // CTR block 4k+10 + rev w9, w12 // CTR block 4k+11 + eor v11.16b, v11.16b, v10.16b // MODULO - fold into low + orr x9, x11, x9, lsl #32 // CTR block 4k+11 + eor v7.16b, v7.16b, v3.16b // AES block 4k+7 - result + st1 { v7.16b}, [x2], #16 // AES block 4k+7 - store result + b.lt Lenc_main_loop + +Lenc_prepretail: // PREPRETAIL + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 0 + rev64 v6.16b, v6.16b // GHASH block 4k+2 (t0, t1, and t2 free) + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 0 + fmov d3, x10 // CTR block 4k+3 + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 0 + rev64 v4.16b, v4.16b // GHASH block 4k (only t0 is free) + fmov v3.d[1], x9 // CTR block 4k+3 + ext v11.16b, v11.16b, v11.16b, #8 // PRE 0 + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 1 + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 1 + eor v4.16b, v4.16b, v11.16b // PRE 1 + rev64 v5.16b, v5.16b // GHASH block 4k+1 (t0 and t1 free) + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 2 + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 0 + mov d10, v17.d[1] // GHASH block 4k - mid + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 1 + pmull v11.1q, v4.1d, v15.1d // GHASH block 4k - low + mov d8, v4.d[1] // GHASH block 4k - mid + pmull2 v9.1q, v4.2d, v15.2d // GHASH block 4k - high + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 3 + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 2 + eor v8.8b, v8.8b, v4.8b // GHASH block 4k - mid + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 2 + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 1 + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 3 + pmull v10.1q, v8.1d, v10.1d // GHASH block 4k - mid + pmull2 v4.1q, v5.2d, v14.2d // GHASH block 4k+1 - high + pmull v8.1q, v5.1d, v14.1d // GHASH block 4k+1 - low + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 2 + eor v9.16b, v9.16b, v4.16b // GHASH block 4k+1 - high + mov d4, v5.d[1] // GHASH block 4k+1 - mid + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 3 + eor v11.16b, v11.16b, v8.16b // GHASH block 4k+1 - low + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 3 + eor v4.8b, v4.8b, v5.8b // GHASH block 4k+1 - mid + mov d8, v6.d[1] // GHASH block 4k+2 - mid + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 4 + rev64 v7.16b, v7.16b // GHASH block 4k+3 (t0, t1, t2 and t3 free) + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 4 + pmull v4.1q, v4.1d, v17.1d // GHASH block 4k+1 - mid + eor v8.8b, v8.8b, v6.8b // GHASH block 4k+2 - mid + add w12, w12, #1 // CTR block 4k+3 + pmull v5.1q, v6.1d, v13.1d // GHASH block 4k+2 - low + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 5 + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 4 + eor v10.16b, v10.16b, v4.16b // GHASH block 4k+1 - mid + pmull2 v4.1q, v6.2d, v13.2d // GHASH block 4k+2 - high + eor v11.16b, v11.16b, v5.16b // GHASH block 
4k+2 - low + ins v8.d[1], v8.d[0] // GHASH block 4k+2 - mid + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 5 + eor v9.16b, v9.16b, v4.16b // GHASH block 4k+2 - high + mov d4, v7.d[1] // GHASH block 4k+3 - mid + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 4 + pmull2 v8.1q, v8.2d, v16.2d // GHASH block 4k+2 - mid + eor v4.8b, v4.8b, v7.8b // GHASH block 4k+3 - mid + pmull2 v5.1q, v7.2d, v12.2d // GHASH block 4k+3 - high + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 5 + pmull v4.1q, v4.1d, v16.1d // GHASH block 4k+3 - mid + eor v10.16b, v10.16b, v8.16b // GHASH block 4k+2 - mid + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 5 + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 6 + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 6 + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 6 + movi v8.8b, #0xc2 + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 6 + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 7 + eor v9.16b, v9.16b, v5.16b // GHASH block 4k+3 - high + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 7 + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 7 + shl d8, d8, #56 // mod_constant + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 8 + eor v10.16b, v10.16b, v4.16b // GHASH block 4k+3 - mid + pmull v6.1q, v7.1d, v12.1d // GHASH block 4k+3 - low + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 8 + cmp x17, #12 // setup flags for AES-128/192/256 check + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 8 + eor v11.16b, v11.16b, v6.16b // GHASH block 4k+3 - low + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 7 + eor v10.16b, v10.16b, v9.16b // karatsuba tidy up + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 8 + pmull v4.1q, v9.1d, v8.1d + ext v9.16b, v9.16b, v9.16b, #8 + eor v10.16b, v10.16b, v11.16b + b.lt Lenc_finish_prepretail // branch if AES-128 + + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 9 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 9 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 9 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 10 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 10 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 10 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 10 + b.eq Lenc_finish_prepretail // branch if AES-192 + + aese v1.16b, v29.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 11 + aese v0.16b, v29.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 11 + aese v3.16b, v29.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 11 + aese v2.16b, v29.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 11 + aese v1.16b, v30.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 12 + aese v0.16b, v30.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 12 + aese v3.16b, v30.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 12 + aese v2.16b, v30.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 12 + +Lenc_finish_prepretail: + eor v10.16b, v10.16b, v4.16b + eor v10.16b, v10.16b, v9.16b + pmull v4.1q, v10.1d, v8.1d + ext v10.16b, v10.16b, v10.16b, 
#8 + aese v1.16b, v31.16b // AES block 4k+5 - round N-1 + eor v11.16b, v11.16b, v4.16b + aese v3.16b, v31.16b // AES block 4k+7 - round N-1 + aese v0.16b, v31.16b // AES block 4k+4 - round N-1 + aese v2.16b, v31.16b // AES block 4k+6 - round N-1 + eor v11.16b, v11.16b, v10.16b + +Lenc_tail: // TAIL + ext v8.16b, v11.16b, v11.16b, #8 // prepare final partial tag + sub x5, x4, x0 // main_end_input_ptr is number of bytes left to process + ldp x6, x7, [x0], #16 // AES block 4k+4 - load plaintext + eor x6, x6, x13 // AES block 4k+4 - round N low + eor x7, x7, x14 // AES block 4k+4 - round N high + cmp x5, #48 + fmov d4, x6 // AES block 4k+4 - mov low + fmov v4.d[1], x7 // AES block 4k+4 - mov high + eor v5.16b, v4.16b, v0.16b // AES block 4k+4 - result + b.gt Lenc_blocks_more_than_3 + cmp x5, #32 + mov v3.16b, v2.16b + movi v11.8b, #0 + movi v9.8b, #0 + sub w12, w12, #1 + mov v2.16b, v1.16b + movi v10.8b, #0 + b.gt Lenc_blocks_more_than_2 + mov v3.16b, v1.16b + sub w12, w12, #1 + cmp x5, #16 + b.gt Lenc_blocks_more_than_1 + sub w12, w12, #1 + b Lenc_blocks_less_than_1 +Lenc_blocks_more_than_3: // blocks left > 3 + st1 { v5.16b}, [x2], #16 // AES final-3 block - store result + ldp x6, x7, [x0], #16 // AES final-2 block - load input low & high + rev64 v4.16b, v5.16b // GHASH final-3 block + eor x6, x6, x13 // AES final-2 block - round N low + eor v4.16b, v4.16b, v8.16b // feed in partial tag + eor x7, x7, x14 // AES final-2 block - round N high + mov d22, v4.d[1] // GHASH final-3 block - mid + fmov d5, x6 // AES final-2 block - mov low + fmov v5.d[1], x7 // AES final-2 block - mov high + eor v22.8b, v22.8b, v4.8b // GHASH final-3 block - mid + movi v8.8b, #0 // suppress further partial tag feed in + mov d10, v17.d[1] // GHASH final-3 block - mid + pmull v11.1q, v4.1d, v15.1d // GHASH final-3 block - low + pmull2 v9.1q, v4.2d, v15.2d // GHASH final-3 block - high + pmull v10.1q, v22.1d, v10.1d // GHASH final-3 block - mid + eor v5.16b, v5.16b, v1.16b // AES final-2 block - result +Lenc_blocks_more_than_2: // blocks left > 2 + st1 { v5.16b}, [x2], #16 // AES final-2 block - store result + ldp x6, x7, [x0], #16 // AES final-1 block - load input low & high + rev64 v4.16b, v5.16b // GHASH final-2 block + eor x6, x6, x13 // AES final-1 block - round N low + eor v4.16b, v4.16b, v8.16b // feed in partial tag + fmov d5, x6 // AES final-1 block - mov low + eor x7, x7, x14 // AES final-1 block - round N high + fmov v5.d[1], x7 // AES final-1 block - mov high + movi v8.8b, #0 // suppress further partial tag feed in + pmull2 v20.1q, v4.2d, v14.2d // GHASH final-2 block - high + mov d22, v4.d[1] // GHASH final-2 block - mid + pmull v21.1q, v4.1d, v14.1d // GHASH final-2 block - low + eor v22.8b, v22.8b, v4.8b // GHASH final-2 block - mid + eor v5.16b, v5.16b, v2.16b // AES final-1 block - result + eor v9.16b, v9.16b, v20.16b // GHASH final-2 block - high + pmull v22.1q, v22.1d, v17.1d // GHASH final-2 block - mid + eor v11.16b, v11.16b, v21.16b // GHASH final-2 block - low + eor v10.16b, v10.16b, v22.16b // GHASH final-2 block - mid +Lenc_blocks_more_than_1: // blocks left > 1 + st1 { v5.16b}, [x2], #16 // AES final-1 block - store result + rev64 v4.16b, v5.16b // GHASH final-1 block + ldp x6, x7, [x0], #16 // AES final block - load input low & high + eor v4.16b, v4.16b, v8.16b // feed in partial tag + movi v8.8b, #0 // suppress further partial tag feed in + eor x6, x6, x13 // AES final block - round N low + mov d22, v4.d[1] // GHASH final-1 block - mid + pmull2 v20.1q, v4.2d, v13.2d // GHASH final-1 block - 
high + eor x7, x7, x14 // AES final block - round N high + eor v22.8b, v22.8b, v4.8b // GHASH final-1 block - mid + eor v9.16b, v9.16b, v20.16b // GHASH final-1 block - high + ins v22.d[1], v22.d[0] // GHASH final-1 block - mid + fmov d5, x6 // AES final block - mov low + fmov v5.d[1], x7 // AES final block - mov high + pmull2 v22.1q, v22.2d, v16.2d // GHASH final-1 block - mid + pmull v21.1q, v4.1d, v13.1d // GHASH final-1 block - low + eor v5.16b, v5.16b, v3.16b // AES final block - result + eor v10.16b, v10.16b, v22.16b // GHASH final-1 block - mid + eor v11.16b, v11.16b, v21.16b // GHASH final-1 block - low +Lenc_blocks_less_than_1: // blocks left <= 1 + and x1, x1, #127 // bit_length %= 128 + mvn x13, xzr // rkN_l = 0xffffffffffffffff + sub x1, x1, #128 // bit_length -= 128 + neg x1, x1 // bit_length = 128 - #bits in input (in range [1,128]) + ld1 { v18.16b}, [x2] // load existing bytes where the possibly partial last block is to be stored + mvn x14, xzr // rkN_h = 0xffffffffffffffff + and x1, x1, #127 // bit_length %= 128 + lsr x14, x14, x1 // rkN_h is mask for top 64b of last block + cmp x1, #64 + csel x6, x13, x14, lt + csel x7, x14, xzr, lt + fmov d0, x6 // ctr0b is mask for last block + fmov v0.d[1], x7 + and v5.16b, v5.16b, v0.16b // possibly partial last block has zeroes in highest bits + rev64 v4.16b, v5.16b // GHASH final block + eor v4.16b, v4.16b, v8.16b // feed in partial tag + bif v5.16b, v18.16b, v0.16b // insert existing bytes in top end of result before storing + pmull2 v20.1q, v4.2d, v12.2d // GHASH final block - high + mov d8, v4.d[1] // GHASH final block - mid + rev w9, w12 + pmull v21.1q, v4.1d, v12.1d // GHASH final block - low + eor v9.16b, v9.16b, v20.16b // GHASH final block - high + eor v8.8b, v8.8b, v4.8b // GHASH final block - mid + pmull v8.1q, v8.1d, v16.1d // GHASH final block - mid + eor v11.16b, v11.16b, v21.16b // GHASH final block - low + eor v10.16b, v10.16b, v8.16b // GHASH final block - mid + movi v8.8b, #0xc2 + eor v4.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up + shl d8, d8, #56 // mod_constant + eor v10.16b, v10.16b, v4.16b // MODULO - karatsuba tidy up + pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid + ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment + eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid + eor v10.16b, v10.16b, v9.16b // MODULO - fold into mid + pmull v9.1q, v10.1d, v8.1d // MODULO - mid 64b align with low + ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment + str w9, [x16, #12] // store the updated counter + st1 { v5.16b}, [x2] // store all 16B + eor v11.16b, v11.16b, v9.16b // MODULO - fold into low + eor v11.16b, v11.16b, v10.16b // MODULO - fold into low + ext v11.16b, v11.16b, v11.16b, #8 + rev64 v11.16b, v11.16b + mov x0, x15 + st1 { v11.16b }, [x3] + ldp x19, x20, [sp, #16] + ldp x21, x22, [sp, #32] + ldp x23, x24, [sp, #48] + ldp d8, d9, [sp, #64] + ldp d10, d11, [sp, #80] + ldp d12, d13, [sp, #96] + ldp d14, d15, [sp, #112] + ldp x29, x30, [sp], #128 + AARCH64_VALIDATE_LINK_REGISTER + ret + +.globl aes_gcm_dec_kernel + +.def aes_gcm_dec_kernel + .type 32 +.endef +.align 4 +aes_gcm_dec_kernel: + AARCH64_SIGN_LINK_REGISTER + stp x29, x30, [sp, #-128]! 
+ mov x29, sp + stp x19, x20, [sp, #16] + mov x16, x4 + mov x8, x5 + stp x21, x22, [sp, #32] + stp x23, x24, [sp, #48] + stp d8, d9, [sp, #64] + stp d10, d11, [sp, #80] + stp d12, d13, [sp, #96] + stp d14, d15, [sp, #112] + ldr w17, [x8, #240] + add x19, x8, x17, lsl #4 // borrow input_l1 for last key + ldp x13, x14, [x19] // load round N keys + ldr q31, [x19, #-16] // load round N-1 keys + lsr x5, x1, #3 // byte_len + mov x15, x5 + ldp x10, x11, [x16] // ctr96_b64, ctr96_t32 + ldr q26, [x8, #128] // load rk8 + sub x5, x5, #1 // byte_len - 1 + ldr q25, [x8, #112] // load rk7 + and x5, x5, #0xffffffffffffffc0 // number of bytes to be processed in main loop (at least 1 byte must be handled by tail) + add x4, x0, x1, lsr #3 // end_input_ptr + ldr q24, [x8, #96] // load rk6 + lsr x12, x11, #32 + ldr q23, [x8, #80] // load rk5 + orr w11, w11, w11 + ldr q21, [x8, #48] // load rk3 + add x5, x5, x0 + rev w12, w12 // rev_ctr32 + add w12, w12, #1 // increment rev_ctr32 + fmov d3, x10 // CTR block 3 + rev w9, w12 // CTR block 1 + add w12, w12, #1 // CTR block 1 + fmov d1, x10 // CTR block 1 + orr x9, x11, x9, lsl #32 // CTR block 1 + ld1 { v0.16b}, [x16] // special case vector load initial counter so we can start first AES block as quickly as possible + fmov v1.d[1], x9 // CTR block 1 + rev w9, w12 // CTR block 2 + add w12, w12, #1 // CTR block 2 + fmov d2, x10 // CTR block 2 + orr x9, x11, x9, lsl #32 // CTR block 2 + fmov v2.d[1], x9 // CTR block 2 + rev w9, w12 // CTR block 3 + orr x9, x11, x9, lsl #32 // CTR block 3 + ldr q18, [x8, #0] // load rk0 + fmov v3.d[1], x9 // CTR block 3 + add w12, w12, #1 // CTR block 3 + ldr q22, [x8, #64] // load rk4 + ldr q19, [x8, #16] // load rk1 + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b // AES block 0 - round 0 + ldr q14, [x6, #48] // load h3l | h3h + ext v14.16b, v14.16b, v14.16b, #8 + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b // AES block 3 - round 0 + ldr q15, [x6, #80] // load h4l | h4h + ext v15.16b, v15.16b, v15.16b, #8 + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b // AES block 1 - round 0 + ldr q13, [x6, #32] // load h2l | h2h + ext v13.16b, v13.16b, v13.16b, #8 + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b // AES block 2 - round 0 + ldr q20, [x8, #32] // load rk2 + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b // AES block 0 - round 1 + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b // AES block 1 - round 1 + ld1 { v11.16b}, [x3] + ext v11.16b, v11.16b, v11.16b, #8 + rev64 v11.16b, v11.16b + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b // AES block 2 - round 1 + ldr q27, [x8, #144] // load rk9 + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b // AES block 3 - round 1 + ldr q30, [x8, #192] // load rk12 + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b // AES block 0 - round 2 + ldr q12, [x6] // load h1l | h1h + ext v12.16b, v12.16b, v12.16b, #8 + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b // AES block 2 - round 2 + ldr q28, [x8, #160] // load rk10 + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b // AES block 3 - round 2 + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b // AES block 0 - round 3 + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b // AES block 1 - round 2 + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b // AES block 3 - round 3 + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b // AES block 0 - round 4 + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b // AES block 2 - round 3 + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b // AES block 1 - round 3 + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b // AES block 3 - round 4 + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b // AES block 
2 - round 4 + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b // AES block 1 - round 4 + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b // AES block 3 - round 5 + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b // AES block 0 - round 5 + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b // AES block 1 - round 5 + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b // AES block 2 - round 5 + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b // AES block 0 - round 6 + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b // AES block 3 - round 6 + cmp x17, #12 // setup flags for AES-128/192/256 check + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b // AES block 1 - round 6 + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b // AES block 2 - round 6 + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b // AES block 0 - round 7 + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b // AES block 1 - round 7 + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b // AES block 3 - round 7 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b // AES block 0 - round 8 + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b // AES block 2 - round 7 + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b // AES block 3 - round 8 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b // AES block 1 - round 8 + ldr q29, [x8, #176] // load rk11 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b // AES block 2 - round 8 + b.lt Ldec_finish_first_blocks // branch if AES-128 + + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 0 - round 9 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 1 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 3 - round 9 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 2 - round 9 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 0 - round 10 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 1 - round 10 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 3 - round 10 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 2 - round 10 + b.eq Ldec_finish_first_blocks // branch if AES-192 + + aese v0.16b, v29.16b + aesmc v0.16b, v0.16b // AES block 0 - round 11 + aese v3.16b, v29.16b + aesmc v3.16b, v3.16b // AES block 3 - round 11 + aese v1.16b, v29.16b + aesmc v1.16b, v1.16b // AES block 1 - round 11 + aese v2.16b, v29.16b + aesmc v2.16b, v2.16b // AES block 2 - round 11 + aese v1.16b, v30.16b + aesmc v1.16b, v1.16b // AES block 1 - round 12 + aese v0.16b, v30.16b + aesmc v0.16b, v0.16b // AES block 0 - round 12 + aese v2.16b, v30.16b + aesmc v2.16b, v2.16b // AES block 2 - round 12 + aese v3.16b, v30.16b + aesmc v3.16b, v3.16b // AES block 3 - round 12 + +Ldec_finish_first_blocks: + cmp x0, x5 // check if we have <= 4 blocks + trn1 v9.2d, v14.2d, v15.2d // h4h | h3h + trn2 v17.2d, v14.2d, v15.2d // h4l | h3l + trn1 v8.2d, v12.2d, v13.2d // h2h | h1h + trn2 v16.2d, v12.2d, v13.2d // h2l | h1l + eor v17.16b, v17.16b, v9.16b // h4k | h3k + aese v1.16b, v31.16b // AES block 1 - round N-1 + aese v2.16b, v31.16b // AES block 2 - round N-1 + eor v16.16b, v16.16b, v8.16b // h2k | h1k + aese v3.16b, v31.16b // AES block 3 - round N-1 + aese v0.16b, v31.16b // AES block 0 - round N-1 + b.ge Ldec_tail // handle tail + + ldr q4, [x0, #0] // AES block 0 - load ciphertext + ldr q5, [x0, #16] // AES block 1 - load ciphertext + rev w9, w12 // CTR block 4 + eor v0.16b, v4.16b, v0.16b // AES block 0 - result + eor v1.16b, v5.16b, v1.16b // AES block 1 - result + rev64 v5.16b, v5.16b // GHASH block 1 + ldr q7, [x0, #48] // AES block 3 - load ciphertext + mov x7, v0.d[1] // AES block 0 - mov high + mov x6, v0.d[0] // AES block 0 - mov low 
+ rev64 v4.16b, v4.16b // GHASH block 0 + add w12, w12, #1 // CTR block 4 + fmov d0, x10 // CTR block 4 + orr x9, x11, x9, lsl #32 // CTR block 4 + fmov v0.d[1], x9 // CTR block 4 + rev w9, w12 // CTR block 5 + add w12, w12, #1 // CTR block 5 + mov x19, v1.d[0] // AES block 1 - mov low + orr x9, x11, x9, lsl #32 // CTR block 5 + mov x20, v1.d[1] // AES block 1 - mov high + eor x7, x7, x14 // AES block 0 - round N high + eor x6, x6, x13 // AES block 0 - round N low + stp x6, x7, [x2], #16 // AES block 0 - store result + fmov d1, x10 // CTR block 5 + ldr q6, [x0, #32] // AES block 2 - load ciphertext + add x0, x0, #64 // AES input_ptr update + fmov v1.d[1], x9 // CTR block 5 + rev w9, w12 // CTR block 6 + add w12, w12, #1 // CTR block 6 + eor x19, x19, x13 // AES block 1 - round N low + orr x9, x11, x9, lsl #32 // CTR block 6 + eor x20, x20, x14 // AES block 1 - round N high + stp x19, x20, [x2], #16 // AES block 1 - store result + eor v2.16b, v6.16b, v2.16b // AES block 2 - result + cmp x0, x5 // check if we have <= 8 blocks + b.ge Ldec_prepretail // do prepretail + +Ldec_main_loop: // main loop start + mov x21, v2.d[0] // AES block 4k+2 - mov low + ext v11.16b, v11.16b, v11.16b, #8 // PRE 0 + eor v3.16b, v7.16b, v3.16b // AES block 4k+3 - result + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 0 + mov x22, v2.d[1] // AES block 4k+2 - mov high + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 0 + fmov d2, x10 // CTR block 4k+6 + fmov v2.d[1], x9 // CTR block 4k+6 + eor v4.16b, v4.16b, v11.16b // PRE 1 + rev w9, w12 // CTR block 4k+7 + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 1 + mov x24, v3.d[1] // AES block 4k+3 - mov high + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 1 + mov x23, v3.d[0] // AES block 4k+3 - mov low + pmull2 v9.1q, v4.2d, v15.2d // GHASH block 4k - high + mov d8, v4.d[1] // GHASH block 4k - mid + fmov d3, x10 // CTR block 4k+7 + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 2 + orr x9, x11, x9, lsl #32 // CTR block 4k+7 + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 0 + fmov v3.d[1], x9 // CTR block 4k+7 + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 2 + eor v8.8b, v8.8b, v4.8b // GHASH block 4k - mid + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 3 + eor x22, x22, x14 // AES block 4k+2 - round N high + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 1 + mov d10, v17.d[1] // GHASH block 4k - mid + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 3 + rev64 v6.16b, v6.16b // GHASH block 4k+2 + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 0 + eor x21, x21, x13 // AES block 4k+2 - round N low + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 2 + stp x21, x22, [x2], #16 // AES block 4k+2 - store result + pmull v11.1q, v4.1d, v15.1d // GHASH block 4k - low + pmull2 v4.1q, v5.2d, v14.2d // GHASH block 4k+1 - high + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 3 + rev64 v7.16b, v7.16b // GHASH block 4k+3 + pmull v10.1q, v8.1d, v10.1d // GHASH block 4k - mid + eor x23, x23, x13 // AES block 4k+3 - round N low + pmull v8.1q, v5.1d, v14.1d // GHASH block 4k+1 - low + eor x24, x24, x14 // AES block 4k+3 - round N high + eor v9.16b, v9.16b, v4.16b // GHASH block 4k+1 - high + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 4 + aese v3.16b, v19.16b + aesmc 
v3.16b, v3.16b // AES block 4k+7 - round 1 + mov d4, v5.d[1] // GHASH block 4k+1 - mid + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 4 + eor v11.16b, v11.16b, v8.16b // GHASH block 4k+1 - low + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 5 + add w12, w12, #1 // CTR block 4k+7 + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 2 + mov d8, v6.d[1] // GHASH block 4k+2 - mid + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 4 + eor v4.8b, v4.8b, v5.8b // GHASH block 4k+1 - mid + pmull v5.1q, v6.1d, v13.1d // GHASH block 4k+2 - low + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 3 + eor v8.8b, v8.8b, v6.8b // GHASH block 4k+2 - mid + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 5 + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 5 + eor v11.16b, v11.16b, v5.16b // GHASH block 4k+2 - low + pmull v4.1q, v4.1d, v17.1d // GHASH block 4k+1 - mid + rev w9, w12 // CTR block 4k+8 + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 6 + ins v8.d[1], v8.d[0] // GHASH block 4k+2 - mid + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 6 + add w12, w12, #1 // CTR block 4k+8 + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 4 + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 7 + eor v10.16b, v10.16b, v4.16b // GHASH block 4k+1 - mid + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 7 + pmull2 v4.1q, v6.2d, v13.2d // GHASH block 4k+2 - high + mov d6, v7.d[1] // GHASH block 4k+3 - mid + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 5 + pmull2 v8.1q, v8.2d, v16.2d // GHASH block 4k+2 - mid + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 8 + eor v9.16b, v9.16b, v4.16b // GHASH block 4k+2 - high + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 6 + pmull v4.1q, v7.1d, v12.1d // GHASH block 4k+3 - low + orr x9, x11, x9, lsl #32 // CTR block 4k+8 + eor v10.16b, v10.16b, v8.16b // GHASH block 4k+2 - mid + pmull2 v5.1q, v7.2d, v12.2d // GHASH block 4k+3 - high + cmp x17, #12 // setup flags for AES-128/192/256 check + eor v6.8b, v6.8b, v7.8b // GHASH block 4k+3 - mid + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 8 + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 6 + eor v9.16b, v9.16b, v5.16b // GHASH block 4k+3 - high + pmull v6.1q, v6.1d, v16.1d // GHASH block 4k+3 - mid + movi v8.8b, #0xc2 + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 7 + eor v11.16b, v11.16b, v4.16b // GHASH block 4k+3 - low + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 7 + shl d8, d8, #56 // mod_constant + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 8 + eor v10.16b, v10.16b, v6.16b // GHASH block 4k+3 - mid + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 8 + b.lt Ldec_main_loop_continue // branch if AES-128 + + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 9 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 9 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 9 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 10 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 10 + aese v2.16b, 
v28.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 10 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 10 + b.eq Ldec_main_loop_continue // branch if AES-192 + + aese v0.16b, v29.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 11 + aese v1.16b, v29.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 11 + aese v2.16b, v29.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 11 + aese v3.16b, v29.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 11 + aese v0.16b, v30.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 12 + aese v1.16b, v30.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 12 + aese v2.16b, v30.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 12 + aese v3.16b, v30.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 12 + +Ldec_main_loop_continue: + pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid + eor v6.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up + ldr q4, [x0, #0] // AES block 4k+4 - load ciphertext + aese v0.16b, v31.16b // AES block 4k+4 - round N-1 + ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment + eor v10.16b, v10.16b, v6.16b // MODULO - karatsuba tidy up + ldr q5, [x0, #16] // AES block 4k+5 - load ciphertext + eor v0.16b, v4.16b, v0.16b // AES block 4k+4 - result + stp x23, x24, [x2], #16 // AES block 4k+3 - store result + eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid + ldr q7, [x0, #48] // AES block 4k+7 - load ciphertext + ldr q6, [x0, #32] // AES block 4k+6 - load ciphertext + mov x7, v0.d[1] // AES block 4k+4 - mov high + eor v10.16b, v10.16b, v9.16b // MODULO - fold into mid + aese v1.16b, v31.16b // AES block 4k+5 - round N-1 + add x0, x0, #64 // AES input_ptr update + mov x6, v0.d[0] // AES block 4k+4 - mov low + fmov d0, x10 // CTR block 4k+8 + fmov v0.d[1], x9 // CTR block 4k+8 + pmull v8.1q, v10.1d, v8.1d // MODULO - mid 64b align with low + eor v1.16b, v5.16b, v1.16b // AES block 4k+5 - result + rev w9, w12 // CTR block 4k+9 + aese v2.16b, v31.16b // AES block 4k+6 - round N-1 + orr x9, x11, x9, lsl #32 // CTR block 4k+9 + cmp x0, x5 // LOOP CONTROL + add w12, w12, #1 // CTR block 4k+9 + eor x6, x6, x13 // AES block 4k+4 - round N low + eor x7, x7, x14 // AES block 4k+4 - round N high + mov x20, v1.d[1] // AES block 4k+5 - mov high + eor v2.16b, v6.16b, v2.16b // AES block 4k+6 - result + eor v11.16b, v11.16b, v8.16b // MODULO - fold into low + mov x19, v1.d[0] // AES block 4k+5 - mov low + fmov d1, x10 // CTR block 4k+9 + ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment + fmov v1.d[1], x9 // CTR block 4k+9 + rev w9, w12 // CTR block 4k+10 + add w12, w12, #1 // CTR block 4k+10 + aese v3.16b, v31.16b // AES block 4k+7 - round N-1 + orr x9, x11, x9, lsl #32 // CTR block 4k+10 + rev64 v5.16b, v5.16b // GHASH block 4k+5 + eor x20, x20, x14 // AES block 4k+5 - round N high + stp x6, x7, [x2], #16 // AES block 4k+4 - store result + eor x19, x19, x13 // AES block 4k+5 - round N low + stp x19, x20, [x2], #16 // AES block 4k+5 - store result + rev64 v4.16b, v4.16b // GHASH block 4k+4 + eor v11.16b, v11.16b, v10.16b // MODULO - fold into low + b.lt Ldec_main_loop + +Ldec_prepretail: // PREPRETAIL + ext v11.16b, v11.16b, v11.16b, #8 // PRE 0 + mov x21, v2.d[0] // AES block 4k+2 - mov low + eor v3.16b, v7.16b, v3.16b // AES block 4k+3 - result + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 0 + mov x22, v2.d[1] // AES block 4k+2 - mov high + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 0 + fmov d2, x10 // CTR block 
4k+6 + fmov v2.d[1], x9 // CTR block 4k+6 + rev w9, w12 // CTR block 4k+7 + eor v4.16b, v4.16b, v11.16b // PRE 1 + rev64 v6.16b, v6.16b // GHASH block 4k+2 + orr x9, x11, x9, lsl #32 // CTR block 4k+7 + mov x23, v3.d[0] // AES block 4k+3 - mov low + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 1 + mov x24, v3.d[1] // AES block 4k+3 - mov high + pmull v11.1q, v4.1d, v15.1d // GHASH block 4k - low + mov d8, v4.d[1] // GHASH block 4k - mid + fmov d3, x10 // CTR block 4k+7 + pmull2 v9.1q, v4.2d, v15.2d // GHASH block 4k - high + fmov v3.d[1], x9 // CTR block 4k+7 + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 0 + mov d10, v17.d[1] // GHASH block 4k - mid + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 1 + eor v8.8b, v8.8b, v4.8b // GHASH block 4k - mid + pmull2 v4.1q, v5.2d, v14.2d // GHASH block 4k+1 - high + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 1 + rev64 v7.16b, v7.16b // GHASH block 4k+3 + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 0 + pmull v10.1q, v8.1d, v10.1d // GHASH block 4k - mid + eor v9.16b, v9.16b, v4.16b // GHASH block 4k+1 - high + pmull v8.1q, v5.1d, v14.1d // GHASH block 4k+1 - low + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 1 + mov d4, v5.d[1] // GHASH block 4k+1 - mid + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 2 + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 2 + eor v11.16b, v11.16b, v8.16b // GHASH block 4k+1 - low + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 2 + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 3 + mov d8, v6.d[1] // GHASH block 4k+2 - mid + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 2 + eor v4.8b, v4.8b, v5.8b // GHASH block 4k+1 - mid + pmull v5.1q, v6.1d, v13.1d // GHASH block 4k+2 - low + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 4 + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 3 + eor v8.8b, v8.8b, v6.8b // GHASH block 4k+2 - mid + pmull v4.1q, v4.1d, v17.1d // GHASH block 4k+1 - mid + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 5 + eor v11.16b, v11.16b, v5.16b // GHASH block 4k+2 - low + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 4 + pmull2 v5.1q, v7.2d, v12.2d // GHASH block 4k+3 - high + eor v10.16b, v10.16b, v4.16b // GHASH block 4k+1 - mid + pmull2 v4.1q, v6.2d, v13.2d // GHASH block 4k+2 - high + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 5 + ins v8.d[1], v8.d[0] // GHASH block 4k+2 - mid + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 3 + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 3 + eor v9.16b, v9.16b, v4.16b // GHASH block 4k+2 - high + pmull v4.1q, v7.1d, v12.1d // GHASH block 4k+3 - low + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 4 + mov d6, v7.d[1] // GHASH block 4k+3 - mid + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 4 + pmull2 v8.1q, v8.2d, v16.2d // GHASH block 4k+2 - mid + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 5 + eor v6.8b, v6.8b, v7.8b // GHASH block 4k+3 - mid + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 5 + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 6 + eor v10.16b, v10.16b, v8.16b // GHASH block 4k+2 - mid + aese v2.16b, 
v24.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 6 + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 6 + movi v8.8b, #0xc2 + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 6 + eor v11.16b, v11.16b, v4.16b // GHASH block 4k+3 - low + pmull v6.1q, v6.1d, v16.1d // GHASH block 4k+3 - mid + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 7 + cmp x17, #12 // setup flags for AES-128/192/256 check + eor v9.16b, v9.16b, v5.16b // GHASH block 4k+3 - high + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 7 + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 7 + eor v10.16b, v10.16b, v6.16b // GHASH block 4k+3 - mid + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 8 + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 7 + eor v6.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 8 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 8 + shl d8, d8, #56 // mod_constant + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 8 + b.lt Ldec_finish_prepretail // branch if AES-128 + + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 9 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 9 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 9 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 10 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 10 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 10 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 10 + b.eq Ldec_finish_prepretail // branch if AES-192 + + aese v2.16b, v29.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 11 + aese v0.16b, v29.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 11 + aese v1.16b, v29.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 11 + aese v2.16b, v30.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 12 + aese v3.16b, v29.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 11 + aese v1.16b, v30.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 12 + aese v0.16b, v30.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 12 + aese v3.16b, v30.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 12 + +Ldec_finish_prepretail: + eor v10.16b, v10.16b, v6.16b // MODULO - karatsuba tidy up + pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid + ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment + eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid + eor x22, x22, x14 // AES block 4k+2 - round N high + eor x23, x23, x13 // AES block 4k+3 - round N low + eor v10.16b, v10.16b, v9.16b // MODULO - fold into mid + add w12, w12, #1 // CTR block 4k+7 + eor x21, x21, x13 // AES block 4k+2 - round N low + pmull v8.1q, v10.1d, v8.1d // MODULO - mid 64b align with low + eor x24, x24, x14 // AES block 4k+3 - round N high + stp x21, x22, [x2], #16 // AES block 4k+2 - store result + ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment + stp x23, x24, [x2], #16 // AES block 4k+3 - store result + + eor v11.16b, v11.16b, v8.16b // MODULO - fold into low + aese v1.16b, v31.16b // AES block 4k+5 - round N-1 + aese v0.16b, v31.16b // AES block 4k+4 - round N-1 + aese v3.16b, v31.16b // AES block 4k+7 - round N-1 + aese 
v2.16b, v31.16b // AES block 4k+6 - round N-1 + eor v11.16b, v11.16b, v10.16b // MODULO - fold into low + +Ldec_tail: // TAIL + sub x5, x4, x0 // main_end_input_ptr is number of bytes left to process + ld1 { v5.16b}, [x0], #16 // AES block 4k+4 - load ciphertext + eor v0.16b, v5.16b, v0.16b // AES block 4k+4 - result + mov x6, v0.d[0] // AES block 4k+4 - mov low + mov x7, v0.d[1] // AES block 4k+4 - mov high + ext v8.16b, v11.16b, v11.16b, #8 // prepare final partial tag + cmp x5, #48 + eor x6, x6, x13 // AES block 4k+4 - round N low + eor x7, x7, x14 // AES block 4k+4 - round N high + b.gt Ldec_blocks_more_than_3 + sub w12, w12, #1 + mov v3.16b, v2.16b + movi v10.8b, #0 + movi v11.8b, #0 + cmp x5, #32 + movi v9.8b, #0 + mov v2.16b, v1.16b + b.gt Ldec_blocks_more_than_2 + sub w12, w12, #1 + mov v3.16b, v1.16b + cmp x5, #16 + b.gt Ldec_blocks_more_than_1 + sub w12, w12, #1 + b Ldec_blocks_less_than_1 +Ldec_blocks_more_than_3: // blocks left > 3 + rev64 v4.16b, v5.16b // GHASH final-3 block + ld1 { v5.16b}, [x0], #16 // AES final-2 block - load ciphertext + stp x6, x7, [x2], #16 // AES final-3 block - store result + mov d10, v17.d[1] // GHASH final-3 block - mid + eor v4.16b, v4.16b, v8.16b // feed in partial tag + eor v0.16b, v5.16b, v1.16b // AES final-2 block - result + mov d22, v4.d[1] // GHASH final-3 block - mid + mov x6, v0.d[0] // AES final-2 block - mov low + mov x7, v0.d[1] // AES final-2 block - mov high + eor v22.8b, v22.8b, v4.8b // GHASH final-3 block - mid + movi v8.8b, #0 // suppress further partial tag feed in + pmull2 v9.1q, v4.2d, v15.2d // GHASH final-3 block - high + pmull v10.1q, v22.1d, v10.1d // GHASH final-3 block - mid + eor x6, x6, x13 // AES final-2 block - round N low + pmull v11.1q, v4.1d, v15.1d // GHASH final-3 block - low + eor x7, x7, x14 // AES final-2 block - round N high +Ldec_blocks_more_than_2: // blocks left > 2 + rev64 v4.16b, v5.16b // GHASH final-2 block + ld1 { v5.16b}, [x0], #16 // AES final-1 block - load ciphertext + eor v4.16b, v4.16b, v8.16b // feed in partial tag + stp x6, x7, [x2], #16 // AES final-2 block - store result + eor v0.16b, v5.16b, v2.16b // AES final-1 block - result + mov d22, v4.d[1] // GHASH final-2 block - mid + pmull v21.1q, v4.1d, v14.1d // GHASH final-2 block - low + pmull2 v20.1q, v4.2d, v14.2d // GHASH final-2 block - high + eor v22.8b, v22.8b, v4.8b // GHASH final-2 block - mid + mov x6, v0.d[0] // AES final-1 block - mov low + mov x7, v0.d[1] // AES final-1 block - mov high + eor v11.16b, v11.16b, v21.16b // GHASH final-2 block - low + movi v8.8b, #0 // suppress further partial tag feed in + pmull v22.1q, v22.1d, v17.1d // GHASH final-2 block - mid + eor v9.16b, v9.16b, v20.16b // GHASH final-2 block - high + eor x6, x6, x13 // AES final-1 block - round N low + eor v10.16b, v10.16b, v22.16b // GHASH final-2 block - mid + eor x7, x7, x14 // AES final-1 block - round N high +Ldec_blocks_more_than_1: // blocks left > 1 + stp x6, x7, [x2], #16 // AES final-1 block - store result + rev64 v4.16b, v5.16b // GHASH final-1 block + ld1 { v5.16b}, [x0], #16 // AES final block - load ciphertext + eor v4.16b, v4.16b, v8.16b // feed in partial tag + movi v8.8b, #0 // suppress further partial tag feed in + mov d22, v4.d[1] // GHASH final-1 block - mid + eor v0.16b, v5.16b, v3.16b // AES final block - result + pmull2 v20.1q, v4.2d, v13.2d // GHASH final-1 block - high + eor v22.8b, v22.8b, v4.8b // GHASH final-1 block - mid + pmull v21.1q, v4.1d, v13.1d // GHASH final-1 block - low + mov x6, v0.d[0] // AES final block - mov low + ins 
v22.d[1], v22.d[0] // GHASH final-1 block - mid + mov x7, v0.d[1] // AES final block - mov high + pmull2 v22.1q, v22.2d, v16.2d // GHASH final-1 block - mid + eor x6, x6, x13 // AES final block - round N low + eor v11.16b, v11.16b, v21.16b // GHASH final-1 block - low + eor v9.16b, v9.16b, v20.16b // GHASH final-1 block - high + eor v10.16b, v10.16b, v22.16b // GHASH final-1 block - mid + eor x7, x7, x14 // AES final block - round N high +Ldec_blocks_less_than_1: // blocks left <= 1 + and x1, x1, #127 // bit_length %= 128 + mvn x14, xzr // rkN_h = 0xffffffffffffffff + sub x1, x1, #128 // bit_length -= 128 + mvn x13, xzr // rkN_l = 0xffffffffffffffff + ldp x4, x5, [x2] // load existing bytes we need to not overwrite + neg x1, x1 // bit_length = 128 - #bits in input (in range [1,128]) + and x1, x1, #127 // bit_length %= 128 + lsr x14, x14, x1 // rkN_h is mask for top 64b of last block + cmp x1, #64 + csel x9, x13, x14, lt + csel x10, x14, xzr, lt + fmov d0, x9 // ctr0b is mask for last block + and x6, x6, x9 + mov v0.d[1], x10 + bic x4, x4, x9 // mask out low existing bytes + rev w9, w12 + bic x5, x5, x10 // mask out high existing bytes + orr x6, x6, x4 + and x7, x7, x10 + orr x7, x7, x5 + and v5.16b, v5.16b, v0.16b // possibly partial last block has zeroes in highest bits + rev64 v4.16b, v5.16b // GHASH final block + eor v4.16b, v4.16b, v8.16b // feed in partial tag + pmull v21.1q, v4.1d, v12.1d // GHASH final block - low + mov d8, v4.d[1] // GHASH final block - mid + eor v8.8b, v8.8b, v4.8b // GHASH final block - mid + pmull2 v20.1q, v4.2d, v12.2d // GHASH final block - high + pmull v8.1q, v8.1d, v16.1d // GHASH final block - mid + eor v9.16b, v9.16b, v20.16b // GHASH final block - high + eor v11.16b, v11.16b, v21.16b // GHASH final block - low + eor v10.16b, v10.16b, v8.16b // GHASH final block - mid + movi v8.8b, #0xc2 + eor v6.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up + shl d8, d8, #56 // mod_constant + eor v10.16b, v10.16b, v6.16b // MODULO - karatsuba tidy up + pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid + ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment + eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid + eor v10.16b, v10.16b, v9.16b // MODULO - fold into mid + pmull v8.1q, v10.1d, v8.1d // MODULO - mid 64b align with low + ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment + eor v11.16b, v11.16b, v8.16b // MODULO - fold into low + stp x6, x7, [x2] + str w9, [x16, #12] // store the updated counter + eor v11.16b, v11.16b, v10.16b // MODULO - fold into low + ext v11.16b, v11.16b, v11.16b, #8 + rev64 v11.16b, v11.16b + mov x0, x15 + st1 { v11.16b }, [x3] + ldp x19, x20, [sp, #16] + ldp x21, x22, [sp, #32] + ldp x23, x24, [sp, #48] + ldp d8, d9, [sp, #64] + ldp d10, d11, [sp, #80] + ldp d12, d13, [sp, #96] + ldp d14, d15, [sp, #112] + ldp x29, x30, [sp], #128 + AARCH64_VALIDATE_LINK_REGISTER + ret + +#endif +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32) diff --git a/ring-0.17.14/pregenerated/armv4-mont-linux32.S b/ring-0.17.14/pregenerated/armv4-mont-linux32.S new file mode 100644 index 0000000000..e1c005501e --- /dev/null +++ b/ring-0.17.14/pregenerated/armv4-mont-linux32.S @@ -0,0 +1,937 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__ELF__) +@ Silence ARMv8 deprecated IT instruction warnings. 
This file is used by both +@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions. +.arch armv7-a + +.text +#if defined(__thumb2__) +.syntax unified +.thumb +#else +.code 32 +#endif + +.globl bn_mul_mont_nohw +.hidden bn_mul_mont_nohw +.type bn_mul_mont_nohw,%function + +.align 5 +bn_mul_mont_nohw: + ldr ip,[sp,#4] @ load num + stmdb sp!,{r0,r2} @ sp points at argument block + cmp ip,#2 + mov r0,ip @ load num +#ifdef __thumb2__ + ittt lt +#endif + movlt r0,#0 + addlt sp,sp,#2*4 + blt .Labrt + + stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} @ save 10 registers + + mov r0,r0,lsl#2 @ rescale r0 for byte count + sub sp,sp,r0 @ alloca(4*num) + sub sp,sp,#4 @ +extra dword + sub r0,r0,#4 @ "num=num-1" + add r4,r2,r0 @ &bp[num-1] + + add r0,sp,r0 @ r0 to point at &tp[num-1] + ldr r8,[r0,#14*4] @ &n0 + ldr r2,[r2] @ bp[0] + ldr r5,[r1],#4 @ ap[0],ap++ + ldr r6,[r3],#4 @ np[0],np++ + ldr r8,[r8] @ *n0 + str r4,[r0,#15*4] @ save &bp[num] + + umull r10,r11,r5,r2 @ ap[0]*bp[0] + str r8,[r0,#14*4] @ save n0 value + mul r8,r10,r8 @ "tp[0]"*n0 + mov r12,#0 + umlal r10,r12,r6,r8 @ np[0]*n0+"t[0]" + mov r4,sp + +.L1st: + ldr r5,[r1],#4 @ ap[j],ap++ + mov r10,r11 + ldr r6,[r3],#4 @ np[j],np++ + mov r11,#0 + umlal r10,r11,r5,r2 @ ap[j]*bp[0] + mov r14,#0 + umlal r12,r14,r6,r8 @ np[j]*n0 + adds r12,r12,r10 + str r12,[r4],#4 @ tp[j-1]=,tp++ + adc r12,r14,#0 + cmp r4,r0 + bne .L1st + + adds r12,r12,r11 + ldr r4,[r0,#13*4] @ restore bp + mov r14,#0 + ldr r8,[r0,#14*4] @ restore n0 + adc r14,r14,#0 + str r12,[r0] @ tp[num-1]= + mov r7,sp + str r14,[r0,#4] @ tp[num]= + +.Louter: + sub r7,r0,r7 @ "original" r0-1 value + sub r1,r1,r7 @ "rewind" ap to &ap[1] + ldr r2,[r4,#4]! @ *(++bp) + sub r3,r3,r7 @ "rewind" np to &np[1] + ldr r5,[r1,#-4] @ ap[0] + ldr r10,[sp] @ tp[0] + ldr r6,[r3,#-4] @ np[0] + ldr r7,[sp,#4] @ tp[1] + + mov r11,#0 + umlal r10,r11,r5,r2 @ ap[0]*bp[i]+tp[0] + str r4,[r0,#13*4] @ save bp + mul r8,r10,r8 + mov r12,#0 + umlal r10,r12,r6,r8 @ np[0]*n0+"tp[0]" + mov r4,sp + +.Linner: + ldr r5,[r1],#4 @ ap[j],ap++ + adds r10,r11,r7 @ +=tp[j] + ldr r6,[r3],#4 @ np[j],np++ + mov r11,#0 + umlal r10,r11,r5,r2 @ ap[j]*bp[i] + mov r14,#0 + umlal r12,r14,r6,r8 @ np[j]*n0 + adc r11,r11,#0 + ldr r7,[r4,#8] @ tp[j+1] + adds r12,r12,r10 + str r12,[r4],#4 @ tp[j-1]=,tp++ + adc r12,r14,#0 + cmp r4,r0 + bne .Linner + + adds r12,r12,r11 + mov r14,#0 + ldr r4,[r0,#13*4] @ restore bp + adc r14,r14,#0 + ldr r8,[r0,#14*4] @ restore n0 + adds r12,r12,r7 + ldr r7,[r0,#15*4] @ restore &bp[num] + adc r14,r14,#0 + str r12,[r0] @ tp[num-1]= + str r14,[r0,#4] @ tp[num]= + + cmp r4,r7 +#ifdef __thumb2__ + itt ne +#endif + movne r7,sp + bne .Louter + + ldr r2,[r0,#12*4] @ pull rp + mov r5,sp + add r0,r0,#4 @ r0 to point at &tp[num] + sub r5,r0,r5 @ "original" num value + mov r4,sp @ "rewind" r4 + mov r1,r4 @ "borrow" r1 + sub r3,r3,r5 @ "rewind" r3 to &np[0] + + subs r7,r7,r7 @ "clear" carry flag +.Lsub: ldr r7,[r4],#4 + ldr r6,[r3],#4 + sbcs r7,r7,r6 @ tp[j]-np[j] + str r7,[r2],#4 @ rp[j]= + teq r4,r0 @ preserve carry + bne .Lsub + sbcs r14,r14,#0 @ upmost carry + mov r4,sp @ "rewind" r4 + sub r2,r2,r5 @ "rewind" r2 + +.Lcopy: ldr r7,[r4] @ conditional copy + ldr r5,[r2] + str sp,[r4],#4 @ zap tp +#ifdef __thumb2__ + it cc +#endif + movcc r5,r7 + str r5,[r2],#4 + teq r4,r0 @ preserve carry + bne .Lcopy + + mov sp,r0 + add sp,sp,#4 @ skip over tp[num+1] + ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} @ restore registers + add sp,sp,#2*4 @ skip over {r0,r2} + mov r0,#1 +.Labrt: +#if __ARM_ARCH>=5 + bx lr @ bx lr +#else + 
tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet +.word 0xe12fff1e @ interoperable with Thumb ISA:-) +#endif +.size bn_mul_mont_nohw,.-bn_mul_mont_nohw +#if __ARM_MAX_ARCH__>=7 +.arch armv7-a +.fpu neon + +.globl bn_mul8x_mont_neon +.hidden bn_mul8x_mont_neon +.type bn_mul8x_mont_neon,%function +.align 5 +bn_mul8x_mont_neon: + mov ip,sp + stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11} + vstmdb sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ ABI specification says so + ldmia ip,{r4,r5} @ load rest of parameter block + mov ip,sp + + cmp r5,#8 + bhi .LNEON_8n + + @ special case for r5==8, everything is in register bank... + + vld1.32 {d28[0]}, [r2,:32]! + veor d8,d8,d8 + sub r7,sp,r5,lsl#4 + vld1.32 {d0,d1,d2,d3}, [r1]! @ can't specify :32 :-( + and r7,r7,#-64 + vld1.32 {d30[0]}, [r4,:32] + mov sp,r7 @ alloca + vzip.16 d28,d8 + + vmull.u32 q6,d28,d0[0] + vmull.u32 q7,d28,d0[1] + vmull.u32 q8,d28,d1[0] + vshl.i64 d29,d13,#16 + vmull.u32 q9,d28,d1[1] + + vadd.u64 d29,d29,d12 + veor d8,d8,d8 + vmul.u32 d29,d29,d30 + + vmull.u32 q10,d28,d2[0] + vld1.32 {d4,d5,d6,d7}, [r3]! + vmull.u32 q11,d28,d2[1] + vmull.u32 q12,d28,d3[0] + vzip.16 d29,d8 + vmull.u32 q13,d28,d3[1] + + vmlal.u32 q6,d29,d4[0] + sub r9,r5,#1 + vmlal.u32 q7,d29,d4[1] + vmlal.u32 q8,d29,d5[0] + vmlal.u32 q9,d29,d5[1] + + vmlal.u32 q10,d29,d6[0] + vmov q5,q6 + vmlal.u32 q11,d29,d6[1] + vmov q6,q7 + vmlal.u32 q12,d29,d7[0] + vmov q7,q8 + vmlal.u32 q13,d29,d7[1] + vmov q8,q9 + vmov q9,q10 + vshr.u64 d10,d10,#16 + vmov q10,q11 + vmov q11,q12 + vadd.u64 d10,d10,d11 + vmov q12,q13 + veor q13,q13 + vshr.u64 d10,d10,#16 + + b .LNEON_outer8 + +.align 4 +.LNEON_outer8: + vld1.32 {d28[0]}, [r2,:32]! + veor d8,d8,d8 + vzip.16 d28,d8 + vadd.u64 d12,d12,d10 + + vmlal.u32 q6,d28,d0[0] + vmlal.u32 q7,d28,d0[1] + vmlal.u32 q8,d28,d1[0] + vshl.i64 d29,d13,#16 + vmlal.u32 q9,d28,d1[1] + + vadd.u64 d29,d29,d12 + veor d8,d8,d8 + subs r9,r9,#1 + vmul.u32 d29,d29,d30 + + vmlal.u32 q10,d28,d2[0] + vmlal.u32 q11,d28,d2[1] + vmlal.u32 q12,d28,d3[0] + vzip.16 d29,d8 + vmlal.u32 q13,d28,d3[1] + + vmlal.u32 q6,d29,d4[0] + vmlal.u32 q7,d29,d4[1] + vmlal.u32 q8,d29,d5[0] + vmlal.u32 q9,d29,d5[1] + + vmlal.u32 q10,d29,d6[0] + vmov q5,q6 + vmlal.u32 q11,d29,d6[1] + vmov q6,q7 + vmlal.u32 q12,d29,d7[0] + vmov q7,q8 + vmlal.u32 q13,d29,d7[1] + vmov q8,q9 + vmov q9,q10 + vshr.u64 d10,d10,#16 + vmov q10,q11 + vmov q11,q12 + vadd.u64 d10,d10,d11 + vmov q12,q13 + veor q13,q13 + vshr.u64 d10,d10,#16 + + bne .LNEON_outer8 + + vadd.u64 d12,d12,d10 + mov r7,sp + vshr.u64 d10,d12,#16 + mov r8,r5 + vadd.u64 d13,d13,d10 + add r6,sp,#96 + vshr.u64 d10,d13,#16 + vzip.16 d12,d13 + + b .LNEON_tail_entry + +.align 4 +.LNEON_8n: + veor q6,q6,q6 + sub r7,sp,#128 + veor q7,q7,q7 + sub r7,r7,r5,lsl#4 + veor q8,q8,q8 + and r7,r7,#-64 + veor q9,q9,q9 + mov sp,r7 @ alloca + veor q10,q10,q10 + add r7,r7,#256 + veor q11,q11,q11 + sub r8,r5,#8 + veor q12,q12,q12 + veor q13,q13,q13 + +.LNEON_8n_init: + vst1.64 {q6,q7},[r7,:256]! + subs r8,r8,#8 + vst1.64 {q8,q9},[r7,:256]! + vst1.64 {q10,q11},[r7,:256]! + vst1.64 {q12,q13},[r7,:256]! + bne .LNEON_8n_init + + add r6,sp,#256 + vld1.32 {d0,d1,d2,d3},[r1]! + add r10,sp,#8 + vld1.32 {d30[0]},[r4,:32] + mov r9,r5 + b .LNEON_8n_outer + +.align 4 +.LNEON_8n_outer: + vld1.32 {d28[0]},[r2,:32]! @ *b++ + veor d8,d8,d8 + vzip.16 d28,d8 + add r7,sp,#128 + vld1.32 {d4,d5,d6,d7},[r3]! 
+ + vmlal.u32 q6,d28,d0[0] + vmlal.u32 q7,d28,d0[1] + veor d8,d8,d8 + vmlal.u32 q8,d28,d1[0] + vshl.i64 d29,d13,#16 + vmlal.u32 q9,d28,d1[1] + vadd.u64 d29,d29,d12 + vmlal.u32 q10,d28,d2[0] + vmul.u32 d29,d29,d30 + vmlal.u32 q11,d28,d2[1] + vst1.32 {d28},[sp,:64] @ put aside smashed b[8*i+0] + vmlal.u32 q12,d28,d3[0] + vzip.16 d29,d8 + vmlal.u32 q13,d28,d3[1] + vld1.32 {d28[0]},[r2,:32]! @ *b++ + vmlal.u32 q6,d29,d4[0] + veor d10,d10,d10 + vmlal.u32 q7,d29,d4[1] + vzip.16 d28,d10 + vmlal.u32 q8,d29,d5[0] + vshr.u64 d12,d12,#16 + vmlal.u32 q9,d29,d5[1] + vmlal.u32 q10,d29,d6[0] + vadd.u64 d12,d12,d13 + vmlal.u32 q11,d29,d6[1] + vshr.u64 d12,d12,#16 + vmlal.u32 q12,d29,d7[0] + vmlal.u32 q13,d29,d7[1] + vadd.u64 d14,d14,d12 + vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+0] + vmlal.u32 q7,d28,d0[0] + vld1.64 {q6},[r6,:128]! + vmlal.u32 q8,d28,d0[1] + veor d8,d8,d8 + vmlal.u32 q9,d28,d1[0] + vshl.i64 d29,d15,#16 + vmlal.u32 q10,d28,d1[1] + vadd.u64 d29,d29,d14 + vmlal.u32 q11,d28,d2[0] + vmul.u32 d29,d29,d30 + vmlal.u32 q12,d28,d2[1] + vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+1] + vmlal.u32 q13,d28,d3[0] + vzip.16 d29,d8 + vmlal.u32 q6,d28,d3[1] + vld1.32 {d28[0]},[r2,:32]! @ *b++ + vmlal.u32 q7,d29,d4[0] + veor d10,d10,d10 + vmlal.u32 q8,d29,d4[1] + vzip.16 d28,d10 + vmlal.u32 q9,d29,d5[0] + vshr.u64 d14,d14,#16 + vmlal.u32 q10,d29,d5[1] + vmlal.u32 q11,d29,d6[0] + vadd.u64 d14,d14,d15 + vmlal.u32 q12,d29,d6[1] + vshr.u64 d14,d14,#16 + vmlal.u32 q13,d29,d7[0] + vmlal.u32 q6,d29,d7[1] + vadd.u64 d16,d16,d14 + vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+1] + vmlal.u32 q8,d28,d0[0] + vld1.64 {q7},[r6,:128]! + vmlal.u32 q9,d28,d0[1] + veor d8,d8,d8 + vmlal.u32 q10,d28,d1[0] + vshl.i64 d29,d17,#16 + vmlal.u32 q11,d28,d1[1] + vadd.u64 d29,d29,d16 + vmlal.u32 q12,d28,d2[0] + vmul.u32 d29,d29,d30 + vmlal.u32 q13,d28,d2[1] + vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+2] + vmlal.u32 q6,d28,d3[0] + vzip.16 d29,d8 + vmlal.u32 q7,d28,d3[1] + vld1.32 {d28[0]},[r2,:32]! @ *b++ + vmlal.u32 q8,d29,d4[0] + veor d10,d10,d10 + vmlal.u32 q9,d29,d4[1] + vzip.16 d28,d10 + vmlal.u32 q10,d29,d5[0] + vshr.u64 d16,d16,#16 + vmlal.u32 q11,d29,d5[1] + vmlal.u32 q12,d29,d6[0] + vadd.u64 d16,d16,d17 + vmlal.u32 q13,d29,d6[1] + vshr.u64 d16,d16,#16 + vmlal.u32 q6,d29,d7[0] + vmlal.u32 q7,d29,d7[1] + vadd.u64 d18,d18,d16 + vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+2] + vmlal.u32 q9,d28,d0[0] + vld1.64 {q8},[r6,:128]! + vmlal.u32 q10,d28,d0[1] + veor d8,d8,d8 + vmlal.u32 q11,d28,d1[0] + vshl.i64 d29,d19,#16 + vmlal.u32 q12,d28,d1[1] + vadd.u64 d29,d29,d18 + vmlal.u32 q13,d28,d2[0] + vmul.u32 d29,d29,d30 + vmlal.u32 q6,d28,d2[1] + vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+3] + vmlal.u32 q7,d28,d3[0] + vzip.16 d29,d8 + vmlal.u32 q8,d28,d3[1] + vld1.32 {d28[0]},[r2,:32]! @ *b++ + vmlal.u32 q9,d29,d4[0] + veor d10,d10,d10 + vmlal.u32 q10,d29,d4[1] + vzip.16 d28,d10 + vmlal.u32 q11,d29,d5[0] + vshr.u64 d18,d18,#16 + vmlal.u32 q12,d29,d5[1] + vmlal.u32 q13,d29,d6[0] + vadd.u64 d18,d18,d19 + vmlal.u32 q6,d29,d6[1] + vshr.u64 d18,d18,#16 + vmlal.u32 q7,d29,d7[0] + vmlal.u32 q8,d29,d7[1] + vadd.u64 d20,d20,d18 + vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+3] + vmlal.u32 q10,d28,d0[0] + vld1.64 {q9},[r6,:128]! + vmlal.u32 q11,d28,d0[1] + veor d8,d8,d8 + vmlal.u32 q12,d28,d1[0] + vshl.i64 d29,d21,#16 + vmlal.u32 q13,d28,d1[1] + vadd.u64 d29,d29,d20 + vmlal.u32 q6,d28,d2[0] + vmul.u32 d29,d29,d30 + vmlal.u32 q7,d28,d2[1] + vst1.32 {d28},[r10,:64]! 
@ put aside smashed b[8*i+4] + vmlal.u32 q8,d28,d3[0] + vzip.16 d29,d8 + vmlal.u32 q9,d28,d3[1] + vld1.32 {d28[0]},[r2,:32]! @ *b++ + vmlal.u32 q10,d29,d4[0] + veor d10,d10,d10 + vmlal.u32 q11,d29,d4[1] + vzip.16 d28,d10 + vmlal.u32 q12,d29,d5[0] + vshr.u64 d20,d20,#16 + vmlal.u32 q13,d29,d5[1] + vmlal.u32 q6,d29,d6[0] + vadd.u64 d20,d20,d21 + vmlal.u32 q7,d29,d6[1] + vshr.u64 d20,d20,#16 + vmlal.u32 q8,d29,d7[0] + vmlal.u32 q9,d29,d7[1] + vadd.u64 d22,d22,d20 + vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+4] + vmlal.u32 q11,d28,d0[0] + vld1.64 {q10},[r6,:128]! + vmlal.u32 q12,d28,d0[1] + veor d8,d8,d8 + vmlal.u32 q13,d28,d1[0] + vshl.i64 d29,d23,#16 + vmlal.u32 q6,d28,d1[1] + vadd.u64 d29,d29,d22 + vmlal.u32 q7,d28,d2[0] + vmul.u32 d29,d29,d30 + vmlal.u32 q8,d28,d2[1] + vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+5] + vmlal.u32 q9,d28,d3[0] + vzip.16 d29,d8 + vmlal.u32 q10,d28,d3[1] + vld1.32 {d28[0]},[r2,:32]! @ *b++ + vmlal.u32 q11,d29,d4[0] + veor d10,d10,d10 + vmlal.u32 q12,d29,d4[1] + vzip.16 d28,d10 + vmlal.u32 q13,d29,d5[0] + vshr.u64 d22,d22,#16 + vmlal.u32 q6,d29,d5[1] + vmlal.u32 q7,d29,d6[0] + vadd.u64 d22,d22,d23 + vmlal.u32 q8,d29,d6[1] + vshr.u64 d22,d22,#16 + vmlal.u32 q9,d29,d7[0] + vmlal.u32 q10,d29,d7[1] + vadd.u64 d24,d24,d22 + vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+5] + vmlal.u32 q12,d28,d0[0] + vld1.64 {q11},[r6,:128]! + vmlal.u32 q13,d28,d0[1] + veor d8,d8,d8 + vmlal.u32 q6,d28,d1[0] + vshl.i64 d29,d25,#16 + vmlal.u32 q7,d28,d1[1] + vadd.u64 d29,d29,d24 + vmlal.u32 q8,d28,d2[0] + vmul.u32 d29,d29,d30 + vmlal.u32 q9,d28,d2[1] + vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+6] + vmlal.u32 q10,d28,d3[0] + vzip.16 d29,d8 + vmlal.u32 q11,d28,d3[1] + vld1.32 {d28[0]},[r2,:32]! @ *b++ + vmlal.u32 q12,d29,d4[0] + veor d10,d10,d10 + vmlal.u32 q13,d29,d4[1] + vzip.16 d28,d10 + vmlal.u32 q6,d29,d5[0] + vshr.u64 d24,d24,#16 + vmlal.u32 q7,d29,d5[1] + vmlal.u32 q8,d29,d6[0] + vadd.u64 d24,d24,d25 + vmlal.u32 q9,d29,d6[1] + vshr.u64 d24,d24,#16 + vmlal.u32 q10,d29,d7[0] + vmlal.u32 q11,d29,d7[1] + vadd.u64 d26,d26,d24 + vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+6] + vmlal.u32 q13,d28,d0[0] + vld1.64 {q12},[r6,:128]! + vmlal.u32 q6,d28,d0[1] + veor d8,d8,d8 + vmlal.u32 q7,d28,d1[0] + vshl.i64 d29,d27,#16 + vmlal.u32 q8,d28,d1[1] + vadd.u64 d29,d29,d26 + vmlal.u32 q9,d28,d2[0] + vmul.u32 d29,d29,d30 + vmlal.u32 q10,d28,d2[1] + vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+7] + vmlal.u32 q11,d28,d3[0] + vzip.16 d29,d8 + vmlal.u32 q12,d28,d3[1] + vld1.32 {d28},[sp,:64] @ pull smashed b[8*i+0] + vmlal.u32 q13,d29,d4[0] + vld1.32 {d0,d1,d2,d3},[r1]! + vmlal.u32 q6,d29,d4[1] + vmlal.u32 q7,d29,d5[0] + vshr.u64 d26,d26,#16 + vmlal.u32 q8,d29,d5[1] + vmlal.u32 q9,d29,d6[0] + vadd.u64 d26,d26,d27 + vmlal.u32 q10,d29,d6[1] + vshr.u64 d26,d26,#16 + vmlal.u32 q11,d29,d7[0] + vmlal.u32 q12,d29,d7[1] + vadd.u64 d12,d12,d26 + vst1.32 {d29},[r10,:64] @ put aside smashed m[8*i+7] + add r10,sp,#8 @ rewind + sub r8,r5,#8 + b .LNEON_8n_inner + +.align 4 +.LNEON_8n_inner: + subs r8,r8,#8 + vmlal.u32 q6,d28,d0[0] + vld1.64 {q13},[r6,:128] + vmlal.u32 q7,d28,d0[1] + vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+0] + vmlal.u32 q8,d28,d1[0] + vld1.32 {d4,d5,d6,d7},[r3]! + vmlal.u32 q9,d28,d1[1] + it ne + addne r6,r6,#16 @ don't advance in last iteration + vmlal.u32 q10,d28,d2[0] + vmlal.u32 q11,d28,d2[1] + vmlal.u32 q12,d28,d3[0] + vmlal.u32 q13,d28,d3[1] + vld1.32 {d28},[r10,:64]! 
@ pull smashed b[8*i+1] + vmlal.u32 q6,d29,d4[0] + vmlal.u32 q7,d29,d4[1] + vmlal.u32 q8,d29,d5[0] + vmlal.u32 q9,d29,d5[1] + vmlal.u32 q10,d29,d6[0] + vmlal.u32 q11,d29,d6[1] + vmlal.u32 q12,d29,d7[0] + vmlal.u32 q13,d29,d7[1] + vst1.64 {q6},[r7,:128]! + vmlal.u32 q7,d28,d0[0] + vld1.64 {q6},[r6,:128] + vmlal.u32 q8,d28,d0[1] + vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+1] + vmlal.u32 q9,d28,d1[0] + it ne + addne r6,r6,#16 @ don't advance in last iteration + vmlal.u32 q10,d28,d1[1] + vmlal.u32 q11,d28,d2[0] + vmlal.u32 q12,d28,d2[1] + vmlal.u32 q13,d28,d3[0] + vmlal.u32 q6,d28,d3[1] + vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+2] + vmlal.u32 q7,d29,d4[0] + vmlal.u32 q8,d29,d4[1] + vmlal.u32 q9,d29,d5[0] + vmlal.u32 q10,d29,d5[1] + vmlal.u32 q11,d29,d6[0] + vmlal.u32 q12,d29,d6[1] + vmlal.u32 q13,d29,d7[0] + vmlal.u32 q6,d29,d7[1] + vst1.64 {q7},[r7,:128]! + vmlal.u32 q8,d28,d0[0] + vld1.64 {q7},[r6,:128] + vmlal.u32 q9,d28,d0[1] + vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+2] + vmlal.u32 q10,d28,d1[0] + it ne + addne r6,r6,#16 @ don't advance in last iteration + vmlal.u32 q11,d28,d1[1] + vmlal.u32 q12,d28,d2[0] + vmlal.u32 q13,d28,d2[1] + vmlal.u32 q6,d28,d3[0] + vmlal.u32 q7,d28,d3[1] + vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+3] + vmlal.u32 q8,d29,d4[0] + vmlal.u32 q9,d29,d4[1] + vmlal.u32 q10,d29,d5[0] + vmlal.u32 q11,d29,d5[1] + vmlal.u32 q12,d29,d6[0] + vmlal.u32 q13,d29,d6[1] + vmlal.u32 q6,d29,d7[0] + vmlal.u32 q7,d29,d7[1] + vst1.64 {q8},[r7,:128]! + vmlal.u32 q9,d28,d0[0] + vld1.64 {q8},[r6,:128] + vmlal.u32 q10,d28,d0[1] + vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+3] + vmlal.u32 q11,d28,d1[0] + it ne + addne r6,r6,#16 @ don't advance in last iteration + vmlal.u32 q12,d28,d1[1] + vmlal.u32 q13,d28,d2[0] + vmlal.u32 q6,d28,d2[1] + vmlal.u32 q7,d28,d3[0] + vmlal.u32 q8,d28,d3[1] + vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+4] + vmlal.u32 q9,d29,d4[0] + vmlal.u32 q10,d29,d4[1] + vmlal.u32 q11,d29,d5[0] + vmlal.u32 q12,d29,d5[1] + vmlal.u32 q13,d29,d6[0] + vmlal.u32 q6,d29,d6[1] + vmlal.u32 q7,d29,d7[0] + vmlal.u32 q8,d29,d7[1] + vst1.64 {q9},[r7,:128]! + vmlal.u32 q10,d28,d0[0] + vld1.64 {q9},[r6,:128] + vmlal.u32 q11,d28,d0[1] + vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+4] + vmlal.u32 q12,d28,d1[0] + it ne + addne r6,r6,#16 @ don't advance in last iteration + vmlal.u32 q13,d28,d1[1] + vmlal.u32 q6,d28,d2[0] + vmlal.u32 q7,d28,d2[1] + vmlal.u32 q8,d28,d3[0] + vmlal.u32 q9,d28,d3[1] + vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+5] + vmlal.u32 q10,d29,d4[0] + vmlal.u32 q11,d29,d4[1] + vmlal.u32 q12,d29,d5[0] + vmlal.u32 q13,d29,d5[1] + vmlal.u32 q6,d29,d6[0] + vmlal.u32 q7,d29,d6[1] + vmlal.u32 q8,d29,d7[0] + vmlal.u32 q9,d29,d7[1] + vst1.64 {q10},[r7,:128]! + vmlal.u32 q11,d28,d0[0] + vld1.64 {q10},[r6,:128] + vmlal.u32 q12,d28,d0[1] + vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+5] + vmlal.u32 q13,d28,d1[0] + it ne + addne r6,r6,#16 @ don't advance in last iteration + vmlal.u32 q6,d28,d1[1] + vmlal.u32 q7,d28,d2[0] + vmlal.u32 q8,d28,d2[1] + vmlal.u32 q9,d28,d3[0] + vmlal.u32 q10,d28,d3[1] + vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+6] + vmlal.u32 q11,d29,d4[0] + vmlal.u32 q12,d29,d4[1] + vmlal.u32 q13,d29,d5[0] + vmlal.u32 q6,d29,d5[1] + vmlal.u32 q7,d29,d6[0] + vmlal.u32 q8,d29,d6[1] + vmlal.u32 q9,d29,d7[0] + vmlal.u32 q10,d29,d7[1] + vst1.64 {q11},[r7,:128]! + vmlal.u32 q12,d28,d0[0] + vld1.64 {q11},[r6,:128] + vmlal.u32 q13,d28,d0[1] + vld1.32 {d29},[r10,:64]! 
@ pull smashed m[8*i+6] + vmlal.u32 q6,d28,d1[0] + it ne + addne r6,r6,#16 @ don't advance in last iteration + vmlal.u32 q7,d28,d1[1] + vmlal.u32 q8,d28,d2[0] + vmlal.u32 q9,d28,d2[1] + vmlal.u32 q10,d28,d3[0] + vmlal.u32 q11,d28,d3[1] + vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+7] + vmlal.u32 q12,d29,d4[0] + vmlal.u32 q13,d29,d4[1] + vmlal.u32 q6,d29,d5[0] + vmlal.u32 q7,d29,d5[1] + vmlal.u32 q8,d29,d6[0] + vmlal.u32 q9,d29,d6[1] + vmlal.u32 q10,d29,d7[0] + vmlal.u32 q11,d29,d7[1] + vst1.64 {q12},[r7,:128]! + vmlal.u32 q13,d28,d0[0] + vld1.64 {q12},[r6,:128] + vmlal.u32 q6,d28,d0[1] + vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+7] + vmlal.u32 q7,d28,d1[0] + it ne + addne r6,r6,#16 @ don't advance in last iteration + vmlal.u32 q8,d28,d1[1] + vmlal.u32 q9,d28,d2[0] + vmlal.u32 q10,d28,d2[1] + vmlal.u32 q11,d28,d3[0] + vmlal.u32 q12,d28,d3[1] + it eq + subeq r1,r1,r5,lsl#2 @ rewind + vmlal.u32 q13,d29,d4[0] + vld1.32 {d28},[sp,:64] @ pull smashed b[8*i+0] + vmlal.u32 q6,d29,d4[1] + vld1.32 {d0,d1,d2,d3},[r1]! + vmlal.u32 q7,d29,d5[0] + add r10,sp,#8 @ rewind + vmlal.u32 q8,d29,d5[1] + vmlal.u32 q9,d29,d6[0] + vmlal.u32 q10,d29,d6[1] + vmlal.u32 q11,d29,d7[0] + vst1.64 {q13},[r7,:128]! + vmlal.u32 q12,d29,d7[1] + + bne .LNEON_8n_inner + add r6,sp,#128 + vst1.64 {q6,q7},[r7,:256]! + veor q2,q2,q2 @ d4-d5 + vst1.64 {q8,q9},[r7,:256]! + veor q3,q3,q3 @ d6-d7 + vst1.64 {q10,q11},[r7,:256]! + vst1.64 {q12},[r7,:128] + + subs r9,r9,#8 + vld1.64 {q6,q7},[r6,:256]! + vld1.64 {q8,q9},[r6,:256]! + vld1.64 {q10,q11},[r6,:256]! + vld1.64 {q12,q13},[r6,:256]! + + itt ne + subne r3,r3,r5,lsl#2 @ rewind + bne .LNEON_8n_outer + + add r7,sp,#128 + vst1.64 {q2,q3}, [sp,:256]! @ start wiping stack frame + vshr.u64 d10,d12,#16 + vst1.64 {q2,q3},[sp,:256]! + vadd.u64 d13,d13,d10 + vst1.64 {q2,q3}, [sp,:256]! + vshr.u64 d10,d13,#16 + vst1.64 {q2,q3}, [sp,:256]! + vzip.16 d12,d13 + + mov r8,r5 + b .LNEON_tail_entry + +.align 4 +.LNEON_tail: + vadd.u64 d12,d12,d10 + vshr.u64 d10,d12,#16 + vld1.64 {q8,q9}, [r6, :256]! + vadd.u64 d13,d13,d10 + vld1.64 {q10,q11}, [r6, :256]! + vshr.u64 d10,d13,#16 + vld1.64 {q12,q13}, [r6, :256]! + vzip.16 d12,d13 + +.LNEON_tail_entry: + vadd.u64 d14,d14,d10 + vst1.32 {d12[0]}, [r7, :32]! + vshr.u64 d10,d14,#16 + vadd.u64 d15,d15,d10 + vshr.u64 d10,d15,#16 + vzip.16 d14,d15 + vadd.u64 d16,d16,d10 + vst1.32 {d14[0]}, [r7, :32]! + vshr.u64 d10,d16,#16 + vadd.u64 d17,d17,d10 + vshr.u64 d10,d17,#16 + vzip.16 d16,d17 + vadd.u64 d18,d18,d10 + vst1.32 {d16[0]}, [r7, :32]! + vshr.u64 d10,d18,#16 + vadd.u64 d19,d19,d10 + vshr.u64 d10,d19,#16 + vzip.16 d18,d19 + vadd.u64 d20,d20,d10 + vst1.32 {d18[0]}, [r7, :32]! + vshr.u64 d10,d20,#16 + vadd.u64 d21,d21,d10 + vshr.u64 d10,d21,#16 + vzip.16 d20,d21 + vadd.u64 d22,d22,d10 + vst1.32 {d20[0]}, [r7, :32]! + vshr.u64 d10,d22,#16 + vadd.u64 d23,d23,d10 + vshr.u64 d10,d23,#16 + vzip.16 d22,d23 + vadd.u64 d24,d24,d10 + vst1.32 {d22[0]}, [r7, :32]! + vshr.u64 d10,d24,#16 + vadd.u64 d25,d25,d10 + vshr.u64 d10,d25,#16 + vzip.16 d24,d25 + vadd.u64 d26,d26,d10 + vst1.32 {d24[0]}, [r7, :32]! + vshr.u64 d10,d26,#16 + vadd.u64 d27,d27,d10 + vshr.u64 d10,d27,#16 + vzip.16 d26,d27 + vld1.64 {q6,q7}, [r6, :256]! + subs r8,r8,#8 + vst1.32 {d26[0]}, [r7, :32]! 
+ bne .LNEON_tail + + vst1.32 {d10[0]}, [r7, :32] @ top-most bit + sub r3,r3,r5,lsl#2 @ rewind r3 + subs r1,sp,#0 @ clear carry flag + add r2,sp,r5,lsl#2 + +.LNEON_sub: + ldmia r1!, {r4,r5,r6,r7} + ldmia r3!, {r8,r9,r10,r11} + sbcs r8, r4,r8 + sbcs r9, r5,r9 + sbcs r10,r6,r10 + sbcs r11,r7,r11 + teq r1,r2 @ preserves carry + stmia r0!, {r8,r9,r10,r11} + bne .LNEON_sub + + ldr r10, [r1] @ load top-most bit + mov r11,sp + veor q0,q0,q0 + sub r11,r2,r11 @ this is num*4 + veor q1,q1,q1 + mov r1,sp + sub r0,r0,r11 @ rewind r0 + mov r3,r2 @ second 3/4th of frame + sbcs r10,r10,#0 @ result is carry flag + +.LNEON_copy_n_zap: + ldmia r1!, {r4,r5,r6,r7} + ldmia r0, {r8,r9,r10,r11} + it cc + movcc r8, r4 + vst1.64 {q0,q1}, [r3,:256]! @ wipe + itt cc + movcc r9, r5 + movcc r10,r6 + vst1.64 {q0,q1}, [r3,:256]! @ wipe + it cc + movcc r11,r7 + ldmia r1, {r4,r5,r6,r7} + stmia r0!, {r8,r9,r10,r11} + sub r1,r1,#16 + ldmia r0, {r8,r9,r10,r11} + it cc + movcc r8, r4 + vst1.64 {q0,q1}, [r1,:256]! @ wipe + itt cc + movcc r9, r5 + movcc r10,r6 + vst1.64 {q0,q1}, [r3,:256]! @ wipe + it cc + movcc r11,r7 + teq r1,r2 @ preserves carry + stmia r0!, {r8,r9,r10,r11} + bne .LNEON_copy_n_zap + + mov sp,ip + vldmia sp!,{d8,d9,d10,d11,d12,d13,d14,d15} + ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11} + bx lr @ bx lr +.size bn_mul8x_mont_neon,.-bn_mul8x_mont_neon +#endif +.byte 77,111,110,116,103,111,109,101,114,121,32,109,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__ELF__) diff --git a/ring-0.17.14/pregenerated/armv8-mont-ios64.S b/ring-0.17.14/pregenerated/armv8-mont-ios64.S new file mode 100644 index 0000000000..ef7d5b6761 --- /dev/null +++ b/ring-0.17.14/pregenerated/armv8-mont-ios64.S @@ -0,0 +1,1416 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__) +.text + +.globl _bn_mul_mont_nohw +.private_extern _bn_mul_mont_nohw + +.align 5 +_bn_mul_mont_nohw: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-64]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + ldr x9,[x2],#8 // bp[0] + sub x22,sp,x5,lsl#3 + ldp x7,x8,[x1],#16 // ap[0..1] + lsl x5,x5,#3 + ldr x4,[x4] // *n0 + and x22,x22,#-16 // ABI says so + ldp x13,x14,[x3],#16 // np[0..1] + + mul x6,x7,x9 // ap[0]*bp[0] + sub x21,x5,#16 // j=num-2 + umulh x7,x7,x9 + mul x10,x8,x9 // ap[1]*bp[0] + umulh x11,x8,x9 + + mul x15,x6,x4 // "tp[0]"*n0 + mov sp,x22 // alloca + + // (*) mul x12,x13,x15 // np[0]*m1 + umulh x13,x13,x15 + mul x16,x14,x15 // np[1]*m1 + // (*) adds x12,x12,x6 // discarded + // (*) As for removal of first multiplication and addition + // instructions. The outcome of first addition is + // guaranteed to be zero, which leaves two computationally + // significant outcomes: it either carries or not. Then + // question is when does it carry? Is there alternative + // way to deduce it? If you follow operations, you can + // observe that condition for carry is quite simple: + // x6 being non-zero. So that carry can be calculated + // by adding -1 to x6. That's what next instruction does. 
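+	// (The flags set by the `subs xzr,x6,#1` below are exactly the carry of
+	// that discarded addition: lo(np[0]*m1) + x6 is 0 mod 2^64, so the add
+	// carries precisely when x6 is non-zero, and x6 - 1 borrows only when
+	// x6 is zero.)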
+ subs xzr,x6,#1 // (*) + umulh x17,x14,x15 + adc x13,x13,xzr + cbz x21,L1st_skip + +L1st: + ldr x8,[x1],#8 + adds x6,x10,x7 + sub x21,x21,#8 // j-- + adc x7,x11,xzr + + ldr x14,[x3],#8 + adds x12,x16,x13 + mul x10,x8,x9 // ap[j]*bp[0] + adc x13,x17,xzr + umulh x11,x8,x9 + + adds x12,x12,x6 + mul x16,x14,x15 // np[j]*m1 + adc x13,x13,xzr + umulh x17,x14,x15 + str x12,[x22],#8 // tp[j-1] + cbnz x21,L1st + +L1st_skip: + adds x6,x10,x7 + sub x1,x1,x5 // rewind x1 + adc x7,x11,xzr + + adds x12,x16,x13 + sub x3,x3,x5 // rewind x3 + adc x13,x17,xzr + + adds x12,x12,x6 + sub x20,x5,#8 // i=num-1 + adcs x13,x13,x7 + + adc x19,xzr,xzr // upmost overflow bit + stp x12,x13,[x22] + +Louter: + ldr x9,[x2],#8 // bp[i] + ldp x7,x8,[x1],#16 + ldr x23,[sp] // tp[0] + add x22,sp,#8 + + mul x6,x7,x9 // ap[0]*bp[i] + sub x21,x5,#16 // j=num-2 + umulh x7,x7,x9 + ldp x13,x14,[x3],#16 + mul x10,x8,x9 // ap[1]*bp[i] + adds x6,x6,x23 + umulh x11,x8,x9 + adc x7,x7,xzr + + mul x15,x6,x4 + sub x20,x20,#8 // i-- + + // (*) mul x12,x13,x15 // np[0]*m1 + umulh x13,x13,x15 + mul x16,x14,x15 // np[1]*m1 + // (*) adds x12,x12,x6 + subs xzr,x6,#1 // (*) + umulh x17,x14,x15 + cbz x21,Linner_skip + +Linner: + ldr x8,[x1],#8 + adc x13,x13,xzr + ldr x23,[x22],#8 // tp[j] + adds x6,x10,x7 + sub x21,x21,#8 // j-- + adc x7,x11,xzr + + adds x12,x16,x13 + ldr x14,[x3],#8 + adc x13,x17,xzr + + mul x10,x8,x9 // ap[j]*bp[i] + adds x6,x6,x23 + umulh x11,x8,x9 + adc x7,x7,xzr + + mul x16,x14,x15 // np[j]*m1 + adds x12,x12,x6 + umulh x17,x14,x15 + str x12,[x22,#-16] // tp[j-1] + cbnz x21,Linner + +Linner_skip: + ldr x23,[x22],#8 // tp[j] + adc x13,x13,xzr + adds x6,x10,x7 + sub x1,x1,x5 // rewind x1 + adc x7,x11,xzr + + adds x12,x16,x13 + sub x3,x3,x5 // rewind x3 + adcs x13,x17,x19 + adc x19,xzr,xzr + + adds x6,x6,x23 + adc x7,x7,xzr + + adds x12,x12,x6 + adcs x13,x13,x7 + adc x19,x19,xzr // upmost overflow bit + stp x12,x13,[x22,#-16] + + cbnz x20,Louter + + // Final step. We see if result is larger than modulus, and + // if it is, subtract the modulus. But comparison implies + // subtraction. So we subtract modulus, see if it borrowed, + // and conditionally copy original value. + ldr x23,[sp] // tp[0] + add x22,sp,#8 + ldr x14,[x3],#8 // np[0] + subs x21,x5,#8 // j=num-1 and clear borrow + mov x1,x0 +Lsub: + sbcs x8,x23,x14 // tp[j]-np[j] + ldr x23,[x22],#8 + sub x21,x21,#8 // j-- + ldr x14,[x3],#8 + str x8,[x1],#8 // rp[j]=tp[j]-np[j] + cbnz x21,Lsub + + sbcs x8,x23,x14 + sbcs x19,x19,xzr // did it borrow? + str x8,[x1],#8 // rp[num-1] + + ldr x23,[sp] // tp[0] + add x22,sp,#8 + ldr x8,[x0],#8 // rp[0] + sub x5,x5,#8 // num-- + nop +Lcond_copy: + sub x5,x5,#8 // num-- + csel x14,x23,x8,lo // did it borrow? + ldr x23,[x22],#8 + ldr x8,[x0],#8 + str xzr,[x22,#-16] // wipe tp + str x14,[x0,#-16] + cbnz x5,Lcond_copy + + csel x14,x23,x8,lo + str xzr,[x22,#-8] // wipe tp + str x14,[x0,#-8] + + ldp x19,x20,[x29,#16] + mov sp,x29 + ldp x21,x22,[x29,#32] + mov x0,#1 + ldp x23,x24,[x29,#48] + ldr x29,[sp],#64 + AARCH64_VALIDATE_LINK_REGISTER + ret + +.globl _bn_sqr8x_mont +.private_extern _bn_sqr8x_mont + +.align 5 +_bn_sqr8x_mont: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp x0,x3,[sp,#96] // offload rp and np + + ldp x6,x7,[x1,#8*0] + ldp x8,x9,[x1,#8*2] + ldp x10,x11,[x1,#8*4] + ldp x12,x13,[x1,#8*6] + + sub x2,sp,x5,lsl#4 + lsl x5,x5,#3 + ldr x4,[x4] // *n0 + mov sp,x2 // alloca + sub x27,x5,#8*8 + b Lsqr8x_zero_start + +Lsqr8x_zero: + sub x27,x27,#8*8 + stp xzr,xzr,[x2,#8*0] + stp xzr,xzr,[x2,#8*2] + stp xzr,xzr,[x2,#8*4] + stp xzr,xzr,[x2,#8*6] +Lsqr8x_zero_start: + stp xzr,xzr,[x2,#8*8] + stp xzr,xzr,[x2,#8*10] + stp xzr,xzr,[x2,#8*12] + stp xzr,xzr,[x2,#8*14] + add x2,x2,#8*16 + cbnz x27,Lsqr8x_zero + + add x3,x1,x5 + add x1,x1,#8*8 + mov x19,xzr + mov x20,xzr + mov x21,xzr + mov x22,xzr + mov x23,xzr + mov x24,xzr + mov x25,xzr + mov x26,xzr + mov x2,sp + str x4,[x29,#112] // offload n0 + + // Multiply everything but a[i]*a[i] +.align 4 +Lsqr8x_outer_loop: + // a[1]a[0] (i) + // a[2]a[0] + // a[3]a[0] + // a[4]a[0] + // a[5]a[0] + // a[6]a[0] + // a[7]a[0] + // a[2]a[1] (ii) + // a[3]a[1] + // a[4]a[1] + // a[5]a[1] + // a[6]a[1] + // a[7]a[1] + // a[3]a[2] (iii) + // a[4]a[2] + // a[5]a[2] + // a[6]a[2] + // a[7]a[2] + // a[4]a[3] (iv) + // a[5]a[3] + // a[6]a[3] + // a[7]a[3] + // a[5]a[4] (v) + // a[6]a[4] + // a[7]a[4] + // a[6]a[5] (vi) + // a[7]a[5] + // a[7]a[6] (vii) + + mul x14,x7,x6 // lo(a[1..7]*a[0]) (i) + mul x15,x8,x6 + mul x16,x9,x6 + mul x17,x10,x6 + adds x20,x20,x14 // t[1]+lo(a[1]*a[0]) + mul x14,x11,x6 + adcs x21,x21,x15 + mul x15,x12,x6 + adcs x22,x22,x16 + mul x16,x13,x6 + adcs x23,x23,x17 + umulh x17,x7,x6 // hi(a[1..7]*a[0]) + adcs x24,x24,x14 + umulh x14,x8,x6 + adcs x25,x25,x15 + umulh x15,x9,x6 + adcs x26,x26,x16 + umulh x16,x10,x6 + stp x19,x20,[x2],#8*2 // t[0..1] + adc x19,xzr,xzr // t[8] + adds x21,x21,x17 // t[2]+lo(a[1]*a[0]) + umulh x17,x11,x6 + adcs x22,x22,x14 + umulh x14,x12,x6 + adcs x23,x23,x15 + umulh x15,x13,x6 + adcs x24,x24,x16 + mul x16,x8,x7 // lo(a[2..7]*a[1]) (ii) + adcs x25,x25,x17 + mul x17,x9,x7 + adcs x26,x26,x14 + mul x14,x10,x7 + adc x19,x19,x15 + + mul x15,x11,x7 + adds x22,x22,x16 + mul x16,x12,x7 + adcs x23,x23,x17 + mul x17,x13,x7 + adcs x24,x24,x14 + umulh x14,x8,x7 // hi(a[2..7]*a[1]) + adcs x25,x25,x15 + umulh x15,x9,x7 + adcs x26,x26,x16 + umulh x16,x10,x7 + adcs x19,x19,x17 + umulh x17,x11,x7 + stp x21,x22,[x2],#8*2 // t[2..3] + adc x20,xzr,xzr // t[9] + adds x23,x23,x14 + umulh x14,x12,x7 + adcs x24,x24,x15 + umulh x15,x13,x7 + adcs x25,x25,x16 + mul x16,x9,x8 // lo(a[3..7]*a[2]) (iii) + adcs x26,x26,x17 + mul x17,x10,x8 + adcs x19,x19,x14 + mul x14,x11,x8 + adc x20,x20,x15 + + mul x15,x12,x8 + adds x24,x24,x16 + mul x16,x13,x8 + adcs x25,x25,x17 + umulh x17,x9,x8 // hi(a[3..7]*a[2]) + adcs x26,x26,x14 + umulh x14,x10,x8 + adcs x19,x19,x15 + umulh x15,x11,x8 + adcs x20,x20,x16 + umulh x16,x12,x8 + stp x23,x24,[x2],#8*2 // t[4..5] + adc x21,xzr,xzr // t[10] + adds x25,x25,x17 + umulh x17,x13,x8 + adcs x26,x26,x14 + mul x14,x10,x9 // lo(a[4..7]*a[3]) (iv) + adcs x19,x19,x15 + mul x15,x11,x9 + adcs x20,x20,x16 + mul x16,x12,x9 + adc x21,x21,x17 + + mul x17,x13,x9 + adds x26,x26,x14 + umulh x14,x10,x9 // hi(a[4..7]*a[3]) + adcs x19,x19,x15 + umulh x15,x11,x9 + adcs x20,x20,x16 + umulh x16,x12,x9 + adcs x21,x21,x17 + umulh x17,x13,x9 + stp x25,x26,[x2],#8*2 // t[6..7] + adc x22,xzr,xzr // t[11] + adds x19,x19,x14 + mul x14,x11,x10 // lo(a[5..7]*a[4]) (v) + adcs x20,x20,x15 + mul x15,x12,x10 + adcs x21,x21,x16 + mul x16,x13,x10 + adc x22,x22,x17 + + umulh 
x17,x11,x10 // hi(a[5..7]*a[4]) + adds x20,x20,x14 + umulh x14,x12,x10 + adcs x21,x21,x15 + umulh x15,x13,x10 + adcs x22,x22,x16 + mul x16,x12,x11 // lo(a[6..7]*a[5]) (vi) + adc x23,xzr,xzr // t[12] + adds x21,x21,x17 + mul x17,x13,x11 + adcs x22,x22,x14 + umulh x14,x12,x11 // hi(a[6..7]*a[5]) + adc x23,x23,x15 + + umulh x15,x13,x11 + adds x22,x22,x16 + mul x16,x13,x12 // lo(a[7]*a[6]) (vii) + adcs x23,x23,x17 + umulh x17,x13,x12 // hi(a[7]*a[6]) + adc x24,xzr,xzr // t[13] + adds x23,x23,x14 + sub x27,x3,x1 // done yet? + adc x24,x24,x15 + + adds x24,x24,x16 + sub x14,x3,x5 // rewinded ap + adc x25,xzr,xzr // t[14] + add x25,x25,x17 + + cbz x27,Lsqr8x_outer_break + + mov x4,x6 + ldp x6,x7,[x2,#8*0] + ldp x8,x9,[x2,#8*2] + ldp x10,x11,[x2,#8*4] + ldp x12,x13,[x2,#8*6] + adds x19,x19,x6 + adcs x20,x20,x7 + ldp x6,x7,[x1,#8*0] + adcs x21,x21,x8 + adcs x22,x22,x9 + ldp x8,x9,[x1,#8*2] + adcs x23,x23,x10 + adcs x24,x24,x11 + ldp x10,x11,[x1,#8*4] + adcs x25,x25,x12 + mov x0,x1 + adcs x26,xzr,x13 + ldp x12,x13,[x1,#8*6] + add x1,x1,#8*8 + //adc x28,xzr,xzr // moved below + mov x27,#-8*8 + + // a[8]a[0] + // a[9]a[0] + // a[a]a[0] + // a[b]a[0] + // a[c]a[0] + // a[d]a[0] + // a[e]a[0] + // a[f]a[0] + // a[8]a[1] + // a[f]a[1]........................ + // a[8]a[2] + // a[f]a[2]........................ + // a[8]a[3] + // a[f]a[3]........................ + // a[8]a[4] + // a[f]a[4]........................ + // a[8]a[5] + // a[f]a[5]........................ + // a[8]a[6] + // a[f]a[6]........................ + // a[8]a[7] + // a[f]a[7]........................ +Lsqr8x_mul: + mul x14,x6,x4 + adc x28,xzr,xzr // carry bit, modulo-scheduled + mul x15,x7,x4 + add x27,x27,#8 + mul x16,x8,x4 + mul x17,x9,x4 + adds x19,x19,x14 + mul x14,x10,x4 + adcs x20,x20,x15 + mul x15,x11,x4 + adcs x21,x21,x16 + mul x16,x12,x4 + adcs x22,x22,x17 + mul x17,x13,x4 + adcs x23,x23,x14 + umulh x14,x6,x4 + adcs x24,x24,x15 + umulh x15,x7,x4 + adcs x25,x25,x16 + umulh x16,x8,x4 + adcs x26,x26,x17 + umulh x17,x9,x4 + adc x28,x28,xzr + str x19,[x2],#8 + adds x19,x20,x14 + umulh x14,x10,x4 + adcs x20,x21,x15 + umulh x15,x11,x4 + adcs x21,x22,x16 + umulh x16,x12,x4 + adcs x22,x23,x17 + umulh x17,x13,x4 + ldr x4,[x0,x27] + adcs x23,x24,x14 + adcs x24,x25,x15 + adcs x25,x26,x16 + adcs x26,x28,x17 + //adc x28,xzr,xzr // moved above + cbnz x27,Lsqr8x_mul + // note that carry flag is guaranteed + // to be zero at this point + cmp x1,x3 // done yet? + b.eq Lsqr8x_break + + ldp x6,x7,[x2,#8*0] + ldp x8,x9,[x2,#8*2] + ldp x10,x11,[x2,#8*4] + ldp x12,x13,[x2,#8*6] + adds x19,x19,x6 + ldr x4,[x0,#-8*8] + adcs x20,x20,x7 + ldp x6,x7,[x1,#8*0] + adcs x21,x21,x8 + adcs x22,x22,x9 + ldp x8,x9,[x1,#8*2] + adcs x23,x23,x10 + adcs x24,x24,x11 + ldp x10,x11,[x1,#8*4] + adcs x25,x25,x12 + mov x27,#-8*8 + adcs x26,x26,x13 + ldp x12,x13,[x1,#8*6] + add x1,x1,#8*8 + //adc x28,xzr,xzr // moved above + b Lsqr8x_mul + +.align 4 +Lsqr8x_break: + ldp x6,x7,[x0,#8*0] + add x1,x0,#8*8 + ldp x8,x9,[x0,#8*2] + sub x14,x3,x1 // is it last iteration? 
+ ldp x10,x11,[x0,#8*4] + sub x15,x2,x14 + ldp x12,x13,[x0,#8*6] + cbz x14,Lsqr8x_outer_loop + + stp x19,x20,[x2,#8*0] + ldp x19,x20,[x15,#8*0] + stp x21,x22,[x2,#8*2] + ldp x21,x22,[x15,#8*2] + stp x23,x24,[x2,#8*4] + ldp x23,x24,[x15,#8*4] + stp x25,x26,[x2,#8*6] + mov x2,x15 + ldp x25,x26,[x15,#8*6] + b Lsqr8x_outer_loop + +.align 4 +Lsqr8x_outer_break: + // Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0] + ldp x7,x9,[x14,#8*0] // recall that x14 is &a[0] + ldp x15,x16,[sp,#8*1] + ldp x11,x13,[x14,#8*2] + add x1,x14,#8*4 + ldp x17,x14,[sp,#8*3] + + stp x19,x20,[x2,#8*0] + mul x19,x7,x7 + stp x21,x22,[x2,#8*2] + umulh x7,x7,x7 + stp x23,x24,[x2,#8*4] + mul x8,x9,x9 + stp x25,x26,[x2,#8*6] + mov x2,sp + umulh x9,x9,x9 + adds x20,x7,x15,lsl#1 + extr x15,x16,x15,#63 + sub x27,x5,#8*4 + +Lsqr4x_shift_n_add: + adcs x21,x8,x15 + extr x16,x17,x16,#63 + sub x27,x27,#8*4 + adcs x22,x9,x16 + ldp x15,x16,[x2,#8*5] + mul x10,x11,x11 + ldp x7,x9,[x1],#8*2 + umulh x11,x11,x11 + mul x12,x13,x13 + umulh x13,x13,x13 + extr x17,x14,x17,#63 + stp x19,x20,[x2,#8*0] + adcs x23,x10,x17 + extr x14,x15,x14,#63 + stp x21,x22,[x2,#8*2] + adcs x24,x11,x14 + ldp x17,x14,[x2,#8*7] + extr x15,x16,x15,#63 + adcs x25,x12,x15 + extr x16,x17,x16,#63 + adcs x26,x13,x16 + ldp x15,x16,[x2,#8*9] + mul x6,x7,x7 + ldp x11,x13,[x1],#8*2 + umulh x7,x7,x7 + mul x8,x9,x9 + umulh x9,x9,x9 + stp x23,x24,[x2,#8*4] + extr x17,x14,x17,#63 + stp x25,x26,[x2,#8*6] + add x2,x2,#8*8 + adcs x19,x6,x17 + extr x14,x15,x14,#63 + adcs x20,x7,x14 + ldp x17,x14,[x2,#8*3] + extr x15,x16,x15,#63 + cbnz x27,Lsqr4x_shift_n_add + ldp x1,x4,[x29,#104] // pull np and n0 + + adcs x21,x8,x15 + extr x16,x17,x16,#63 + adcs x22,x9,x16 + ldp x15,x16,[x2,#8*5] + mul x10,x11,x11 + umulh x11,x11,x11 + stp x19,x20,[x2,#8*0] + mul x12,x13,x13 + umulh x13,x13,x13 + stp x21,x22,[x2,#8*2] + extr x17,x14,x17,#63 + adcs x23,x10,x17 + extr x14,x15,x14,#63 + ldp x19,x20,[sp,#8*0] + adcs x24,x11,x14 + extr x15,x16,x15,#63 + ldp x6,x7,[x1,#8*0] + adcs x25,x12,x15 + extr x16,xzr,x16,#63 + ldp x8,x9,[x1,#8*2] + adc x26,x13,x16 + ldp x10,x11,[x1,#8*4] + + // Reduce by 512 bits per iteration + mul x28,x4,x19 // t[0]*n0 + ldp x12,x13,[x1,#8*6] + add x3,x1,x5 + ldp x21,x22,[sp,#8*2] + stp x23,x24,[x2,#8*4] + ldp x23,x24,[sp,#8*4] + stp x25,x26,[x2,#8*6] + ldp x25,x26,[sp,#8*6] + add x1,x1,#8*8 + mov x30,xzr // initial top-most carry + mov x2,sp + mov x27,#8 + +Lsqr8x_reduction: + // (*) mul x14,x6,x28 // lo(n[0-7])*lo(t[0]*n0) + mul x15,x7,x28 + sub x27,x27,#1 + mul x16,x8,x28 + str x28,[x2],#8 // put aside t[0]*n0 for tail processing + mul x17,x9,x28 + // (*) adds xzr,x19,x14 + subs xzr,x19,#1 // (*) + mul x14,x10,x28 + adcs x19,x20,x15 + mul x15,x11,x28 + adcs x20,x21,x16 + mul x16,x12,x28 + adcs x21,x22,x17 + mul x17,x13,x28 + adcs x22,x23,x14 + umulh x14,x6,x28 // hi(n[0-7])*lo(t[0]*n0) + adcs x23,x24,x15 + umulh x15,x7,x28 + adcs x24,x25,x16 + umulh x16,x8,x28 + adcs x25,x26,x17 + umulh x17,x9,x28 + adc x26,xzr,xzr + adds x19,x19,x14 + umulh x14,x10,x28 + adcs x20,x20,x15 + umulh x15,x11,x28 + adcs x21,x21,x16 + umulh x16,x12,x28 + adcs x22,x22,x17 + umulh x17,x13,x28 + mul x28,x4,x19 // next t[0]*n0 + adcs x23,x23,x14 + adcs x24,x24,x15 + adcs x25,x25,x16 + adc x26,x26,x17 + cbnz x27,Lsqr8x_reduction + + ldp x14,x15,[x2,#8*0] + ldp x16,x17,[x2,#8*2] + mov x0,x2 + sub x27,x3,x1 // done yet? 
+ adds x19,x19,x14 + adcs x20,x20,x15 + ldp x14,x15,[x2,#8*4] + adcs x21,x21,x16 + adcs x22,x22,x17 + ldp x16,x17,[x2,#8*6] + adcs x23,x23,x14 + adcs x24,x24,x15 + adcs x25,x25,x16 + adcs x26,x26,x17 + //adc x28,xzr,xzr // moved below + cbz x27,Lsqr8x8_post_condition + + ldr x4,[x2,#-8*8] + ldp x6,x7,[x1,#8*0] + ldp x8,x9,[x1,#8*2] + ldp x10,x11,[x1,#8*4] + mov x27,#-8*8 + ldp x12,x13,[x1,#8*6] + add x1,x1,#8*8 + +Lsqr8x_tail: + mul x14,x6,x4 + adc x28,xzr,xzr // carry bit, modulo-scheduled + mul x15,x7,x4 + add x27,x27,#8 + mul x16,x8,x4 + mul x17,x9,x4 + adds x19,x19,x14 + mul x14,x10,x4 + adcs x20,x20,x15 + mul x15,x11,x4 + adcs x21,x21,x16 + mul x16,x12,x4 + adcs x22,x22,x17 + mul x17,x13,x4 + adcs x23,x23,x14 + umulh x14,x6,x4 + adcs x24,x24,x15 + umulh x15,x7,x4 + adcs x25,x25,x16 + umulh x16,x8,x4 + adcs x26,x26,x17 + umulh x17,x9,x4 + adc x28,x28,xzr + str x19,[x2],#8 + adds x19,x20,x14 + umulh x14,x10,x4 + adcs x20,x21,x15 + umulh x15,x11,x4 + adcs x21,x22,x16 + umulh x16,x12,x4 + adcs x22,x23,x17 + umulh x17,x13,x4 + ldr x4,[x0,x27] + adcs x23,x24,x14 + adcs x24,x25,x15 + adcs x25,x26,x16 + adcs x26,x28,x17 + //adc x28,xzr,xzr // moved above + cbnz x27,Lsqr8x_tail + // note that carry flag is guaranteed + // to be zero at this point + ldp x6,x7,[x2,#8*0] + sub x27,x3,x1 // done yet? + sub x16,x3,x5 // rewinded np + ldp x8,x9,[x2,#8*2] + ldp x10,x11,[x2,#8*4] + ldp x12,x13,[x2,#8*6] + cbz x27,Lsqr8x_tail_break + + ldr x4,[x0,#-8*8] + adds x19,x19,x6 + adcs x20,x20,x7 + ldp x6,x7,[x1,#8*0] + adcs x21,x21,x8 + adcs x22,x22,x9 + ldp x8,x9,[x1,#8*2] + adcs x23,x23,x10 + adcs x24,x24,x11 + ldp x10,x11,[x1,#8*4] + adcs x25,x25,x12 + mov x27,#-8*8 + adcs x26,x26,x13 + ldp x12,x13,[x1,#8*6] + add x1,x1,#8*8 + //adc x28,xzr,xzr // moved above + b Lsqr8x_tail + +.align 4 +Lsqr8x_tail_break: + ldr x4,[x29,#112] // pull n0 + add x27,x2,#8*8 // end of current t[num] window + + subs xzr,x30,#1 // "move" top-most carry to carry bit + adcs x14,x19,x6 + adcs x15,x20,x7 + ldp x19,x20,[x0,#8*0] + adcs x21,x21,x8 + ldp x6,x7,[x16,#8*0] // recall that x16 is &n[0] + adcs x22,x22,x9 + ldp x8,x9,[x16,#8*2] + adcs x23,x23,x10 + adcs x24,x24,x11 + ldp x10,x11,[x16,#8*4] + adcs x25,x25,x12 + adcs x26,x26,x13 + ldp x12,x13,[x16,#8*6] + add x1,x16,#8*8 + adc x30,xzr,xzr // top-most carry + mul x28,x4,x19 + stp x14,x15,[x2,#8*0] + stp x21,x22,[x2,#8*2] + ldp x21,x22,[x0,#8*2] + stp x23,x24,[x2,#8*4] + ldp x23,x24,[x0,#8*4] + cmp x27,x29 // did we hit the bottom? + stp x25,x26,[x2,#8*6] + mov x2,x0 // slide the window + ldp x25,x26,[x0,#8*6] + mov x27,#8 + b.ne Lsqr8x_reduction + + // Final step. We see if result is larger than modulus, and + // if it is, subtract the modulus. But comparison implies + // subtraction. So we subtract modulus, see if it borrowed, + // and conditionally copy original value. 
+ ldr x0,[x29,#96] // pull rp + add x2,x2,#8*8 + subs x14,x19,x6 + sbcs x15,x20,x7 + sub x27,x5,#8*8 + mov x3,x0 // x0 copy + +Lsqr8x_sub: + sbcs x16,x21,x8 + ldp x6,x7,[x1,#8*0] + sbcs x17,x22,x9 + stp x14,x15,[x0,#8*0] + sbcs x14,x23,x10 + ldp x8,x9,[x1,#8*2] + sbcs x15,x24,x11 + stp x16,x17,[x0,#8*2] + sbcs x16,x25,x12 + ldp x10,x11,[x1,#8*4] + sbcs x17,x26,x13 + ldp x12,x13,[x1,#8*6] + add x1,x1,#8*8 + ldp x19,x20,[x2,#8*0] + sub x27,x27,#8*8 + ldp x21,x22,[x2,#8*2] + ldp x23,x24,[x2,#8*4] + ldp x25,x26,[x2,#8*6] + add x2,x2,#8*8 + stp x14,x15,[x0,#8*4] + sbcs x14,x19,x6 + stp x16,x17,[x0,#8*6] + add x0,x0,#8*8 + sbcs x15,x20,x7 + cbnz x27,Lsqr8x_sub + + sbcs x16,x21,x8 + mov x2,sp + add x1,sp,x5 + ldp x6,x7,[x3,#8*0] + sbcs x17,x22,x9 + stp x14,x15,[x0,#8*0] + sbcs x14,x23,x10 + ldp x8,x9,[x3,#8*2] + sbcs x15,x24,x11 + stp x16,x17,[x0,#8*2] + sbcs x16,x25,x12 + ldp x19,x20,[x1,#8*0] + sbcs x17,x26,x13 + ldp x21,x22,[x1,#8*2] + sbcs xzr,x30,xzr // did it borrow? + ldr x30,[x29,#8] // pull return address + stp x14,x15,[x0,#8*4] + stp x16,x17,[x0,#8*6] + + sub x27,x5,#8*4 +Lsqr4x_cond_copy: + sub x27,x27,#8*4 + csel x14,x19,x6,lo + stp xzr,xzr,[x2,#8*0] + csel x15,x20,x7,lo + ldp x6,x7,[x3,#8*4] + ldp x19,x20,[x1,#8*4] + csel x16,x21,x8,lo + stp xzr,xzr,[x2,#8*2] + add x2,x2,#8*4 + csel x17,x22,x9,lo + ldp x8,x9,[x3,#8*6] + ldp x21,x22,[x1,#8*6] + add x1,x1,#8*4 + stp x14,x15,[x3,#8*0] + stp x16,x17,[x3,#8*2] + add x3,x3,#8*4 + stp xzr,xzr,[x1,#8*0] + stp xzr,xzr,[x1,#8*2] + cbnz x27,Lsqr4x_cond_copy + + csel x14,x19,x6,lo + stp xzr,xzr,[x2,#8*0] + csel x15,x20,x7,lo + stp xzr,xzr,[x2,#8*2] + csel x16,x21,x8,lo + csel x17,x22,x9,lo + stp x14,x15,[x3,#8*0] + stp x16,x17,[x3,#8*2] + + b Lsqr8x_done + +.align 4 +Lsqr8x8_post_condition: + adc x28,xzr,xzr + ldr x30,[x29,#8] // pull return address + // x19-7,x28 hold result, x6-7 hold modulus + subs x6,x19,x6 + ldr x1,[x29,#96] // pull rp + sbcs x7,x20,x7 + stp xzr,xzr,[sp,#8*0] + sbcs x8,x21,x8 + stp xzr,xzr,[sp,#8*2] + sbcs x9,x22,x9 + stp xzr,xzr,[sp,#8*4] + sbcs x10,x23,x10 + stp xzr,xzr,[sp,#8*6] + sbcs x11,x24,x11 + stp xzr,xzr,[sp,#8*8] + sbcs x12,x25,x12 + stp xzr,xzr,[sp,#8*10] + sbcs x13,x26,x13 + stp xzr,xzr,[sp,#8*12] + sbcs x28,x28,xzr // did it borrow? + stp xzr,xzr,[sp,#8*14] + + // x6-7 hold result-modulus + csel x6,x19,x6,lo + csel x7,x20,x7,lo + csel x8,x21,x8,lo + csel x9,x22,x9,lo + stp x6,x7,[x1,#8*0] + csel x10,x23,x10,lo + csel x11,x24,x11,lo + stp x8,x9,[x1,#8*2] + csel x12,x25,x12,lo + csel x13,x26,x13,lo + stp x10,x11,[x1,#8*4] + stp x12,x13,[x1,#8*6] + +Lsqr8x_done: + ldp x19,x20,[x29,#16] + mov sp,x29 + ldp x21,x22,[x29,#32] + mov x0,#1 + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + // x30 is popped earlier + AARCH64_VALIDATE_LINK_REGISTER + ret + +.globl _bn_mul4x_mont +.private_extern _bn_mul4x_mont + +.align 5 +_bn_mul4x_mont: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + sub x26,sp,x5,lsl#3 + lsl x5,x5,#3 + ldr x4,[x4] // *n0 + sub sp,x26,#8*4 // alloca + + add x10,x2,x5 + add x27,x1,x5 + stp x0,x10,[x29,#96] // offload rp and &b[num] + + ldr x24,[x2,#8*0] // b[0] + ldp x6,x7,[x1,#8*0] // a[0..3] + ldp x8,x9,[x1,#8*2] + add x1,x1,#8*4 + mov x19,xzr + mov x20,xzr + mov x21,xzr + mov x22,xzr + ldp x14,x15,[x3,#8*0] // n[0..3] + ldp x16,x17,[x3,#8*2] + adds x3,x3,#8*4 // clear carry bit + mov x0,xzr + mov x28,#0 + mov x26,sp + +Loop_mul4x_1st_reduction: + mul x10,x6,x24 // lo(a[0..3]*b[0]) + adc x0,x0,xzr // modulo-scheduled + mul x11,x7,x24 + add x28,x28,#8 + mul x12,x8,x24 + and x28,x28,#31 + mul x13,x9,x24 + adds x19,x19,x10 + umulh x10,x6,x24 // hi(a[0..3]*b[0]) + adcs x20,x20,x11 + mul x25,x19,x4 // t[0]*n0 + adcs x21,x21,x12 + umulh x11,x7,x24 + adcs x22,x22,x13 + umulh x12,x8,x24 + adc x23,xzr,xzr + umulh x13,x9,x24 + ldr x24,[x2,x28] // next b[i] (or b[0]) + adds x20,x20,x10 + // (*) mul x10,x14,x25 // lo(n[0..3]*t[0]*n0) + str x25,[x26],#8 // put aside t[0]*n0 for tail processing + adcs x21,x21,x11 + mul x11,x15,x25 + adcs x22,x22,x12 + mul x12,x16,x25 + adc x23,x23,x13 // can't overflow + mul x13,x17,x25 + // (*) adds xzr,x19,x10 + subs xzr,x19,#1 // (*) + umulh x10,x14,x25 // hi(n[0..3]*t[0]*n0) + adcs x19,x20,x11 + umulh x11,x15,x25 + adcs x20,x21,x12 + umulh x12,x16,x25 + adcs x21,x22,x13 + umulh x13,x17,x25 + adcs x22,x23,x0 + adc x0,xzr,xzr + adds x19,x19,x10 + sub x10,x27,x1 + adcs x20,x20,x11 + adcs x21,x21,x12 + adcs x22,x22,x13 + //adc x0,x0,xzr + cbnz x28,Loop_mul4x_1st_reduction + + cbz x10,Lmul4x4_post_condition + + ldp x6,x7,[x1,#8*0] // a[4..7] + ldp x8,x9,[x1,#8*2] + add x1,x1,#8*4 + ldr x25,[sp] // a[0]*n0 + ldp x14,x15,[x3,#8*0] // n[4..7] + ldp x16,x17,[x3,#8*2] + add x3,x3,#8*4 + +Loop_mul4x_1st_tail: + mul x10,x6,x24 // lo(a[4..7]*b[i]) + adc x0,x0,xzr // modulo-scheduled + mul x11,x7,x24 + add x28,x28,#8 + mul x12,x8,x24 + and x28,x28,#31 + mul x13,x9,x24 + adds x19,x19,x10 + umulh x10,x6,x24 // hi(a[4..7]*b[i]) + adcs x20,x20,x11 + umulh x11,x7,x24 + adcs x21,x21,x12 + umulh x12,x8,x24 + adcs x22,x22,x13 + umulh x13,x9,x24 + adc x23,xzr,xzr + ldr x24,[x2,x28] // next b[i] (or b[0]) + adds x20,x20,x10 + mul x10,x14,x25 // lo(n[4..7]*a[0]*n0) + adcs x21,x21,x11 + mul x11,x15,x25 + adcs x22,x22,x12 + mul x12,x16,x25 + adc x23,x23,x13 // can't overflow + mul x13,x17,x25 + adds x19,x19,x10 + umulh x10,x14,x25 // hi(n[4..7]*a[0]*n0) + adcs x20,x20,x11 + umulh x11,x15,x25 + adcs x21,x21,x12 + umulh x12,x16,x25 + adcs x22,x22,x13 + adcs x23,x23,x0 + umulh x13,x17,x25 + adc x0,xzr,xzr + ldr x25,[sp,x28] // next t[0]*n0 + str x19,[x26],#8 // result!!! + adds x19,x20,x10 + sub x10,x27,x1 // done yet? + adcs x20,x21,x11 + adcs x21,x22,x12 + adcs x22,x23,x13 + //adc x0,x0,xzr + cbnz x28,Loop_mul4x_1st_tail + + sub x11,x27,x5 // rewinded x1 + cbz x10,Lmul4x_proceed + + ldp x6,x7,[x1,#8*0] + ldp x8,x9,[x1,#8*2] + add x1,x1,#8*4 + ldp x14,x15,[x3,#8*0] + ldp x16,x17,[x3,#8*2] + add x3,x3,#8*4 + b Loop_mul4x_1st_tail + +.align 5 +Lmul4x_proceed: + ldr x24,[x2,#8*4]! // *++b + adc x30,x0,xzr + ldp x6,x7,[x11,#8*0] // a[0..3] + sub x3,x3,x5 // rewind np + ldp x8,x9,[x11,#8*2] + add x1,x11,#8*4 + + stp x19,x20,[x26,#8*0] // result!!! + ldp x19,x20,[sp,#8*4] // t[0..3] + stp x21,x22,[x26,#8*2] // result!!! 
+ ldp x21,x22,[sp,#8*6] + + ldp x14,x15,[x3,#8*0] // n[0..3] + mov x26,sp + ldp x16,x17,[x3,#8*2] + adds x3,x3,#8*4 // clear carry bit + mov x0,xzr + +.align 4 +Loop_mul4x_reduction: + mul x10,x6,x24 // lo(a[0..3]*b[4]) + adc x0,x0,xzr // modulo-scheduled + mul x11,x7,x24 + add x28,x28,#8 + mul x12,x8,x24 + and x28,x28,#31 + mul x13,x9,x24 + adds x19,x19,x10 + umulh x10,x6,x24 // hi(a[0..3]*b[4]) + adcs x20,x20,x11 + mul x25,x19,x4 // t[0]*n0 + adcs x21,x21,x12 + umulh x11,x7,x24 + adcs x22,x22,x13 + umulh x12,x8,x24 + adc x23,xzr,xzr + umulh x13,x9,x24 + ldr x24,[x2,x28] // next b[i] + adds x20,x20,x10 + // (*) mul x10,x14,x25 + str x25,[x26],#8 // put aside t[0]*n0 for tail processing + adcs x21,x21,x11 + mul x11,x15,x25 // lo(n[0..3]*t[0]*n0 + adcs x22,x22,x12 + mul x12,x16,x25 + adc x23,x23,x13 // can't overflow + mul x13,x17,x25 + // (*) adds xzr,x19,x10 + subs xzr,x19,#1 // (*) + umulh x10,x14,x25 // hi(n[0..3]*t[0]*n0 + adcs x19,x20,x11 + umulh x11,x15,x25 + adcs x20,x21,x12 + umulh x12,x16,x25 + adcs x21,x22,x13 + umulh x13,x17,x25 + adcs x22,x23,x0 + adc x0,xzr,xzr + adds x19,x19,x10 + adcs x20,x20,x11 + adcs x21,x21,x12 + adcs x22,x22,x13 + //adc x0,x0,xzr + cbnz x28,Loop_mul4x_reduction + + adc x0,x0,xzr + ldp x10,x11,[x26,#8*4] // t[4..7] + ldp x12,x13,[x26,#8*6] + ldp x6,x7,[x1,#8*0] // a[4..7] + ldp x8,x9,[x1,#8*2] + add x1,x1,#8*4 + adds x19,x19,x10 + adcs x20,x20,x11 + adcs x21,x21,x12 + adcs x22,x22,x13 + //adc x0,x0,xzr + + ldr x25,[sp] // t[0]*n0 + ldp x14,x15,[x3,#8*0] // n[4..7] + ldp x16,x17,[x3,#8*2] + add x3,x3,#8*4 + +.align 4 +Loop_mul4x_tail: + mul x10,x6,x24 // lo(a[4..7]*b[4]) + adc x0,x0,xzr // modulo-scheduled + mul x11,x7,x24 + add x28,x28,#8 + mul x12,x8,x24 + and x28,x28,#31 + mul x13,x9,x24 + adds x19,x19,x10 + umulh x10,x6,x24 // hi(a[4..7]*b[4]) + adcs x20,x20,x11 + umulh x11,x7,x24 + adcs x21,x21,x12 + umulh x12,x8,x24 + adcs x22,x22,x13 + umulh x13,x9,x24 + adc x23,xzr,xzr + ldr x24,[x2,x28] // next b[i] + adds x20,x20,x10 + mul x10,x14,x25 // lo(n[4..7]*t[0]*n0) + adcs x21,x21,x11 + mul x11,x15,x25 + adcs x22,x22,x12 + mul x12,x16,x25 + adc x23,x23,x13 // can't overflow + mul x13,x17,x25 + adds x19,x19,x10 + umulh x10,x14,x25 // hi(n[4..7]*t[0]*n0) + adcs x20,x20,x11 + umulh x11,x15,x25 + adcs x21,x21,x12 + umulh x12,x16,x25 + adcs x22,x22,x13 + umulh x13,x17,x25 + adcs x23,x23,x0 + ldr x25,[sp,x28] // next a[0]*n0 + adc x0,xzr,xzr + str x19,[x26],#8 // result!!! + adds x19,x20,x10 + sub x10,x27,x1 // done yet? + adcs x20,x21,x11 + adcs x21,x22,x12 + adcs x22,x23,x13 + //adc x0,x0,xzr + cbnz x28,Loop_mul4x_tail + + sub x11,x3,x5 // rewinded np? + adc x0,x0,xzr + cbz x10,Loop_mul4x_break + + ldp x10,x11,[x26,#8*4] + ldp x12,x13,[x26,#8*6] + ldp x6,x7,[x1,#8*0] + ldp x8,x9,[x1,#8*2] + add x1,x1,#8*4 + adds x19,x19,x10 + adcs x20,x20,x11 + adcs x21,x21,x12 + adcs x22,x22,x13 + //adc x0,x0,xzr + ldp x14,x15,[x3,#8*0] + ldp x16,x17,[x3,#8*2] + add x3,x3,#8*4 + b Loop_mul4x_tail + +.align 4 +Loop_mul4x_break: + ldp x12,x13,[x29,#96] // pull rp and &b[num] + adds x19,x19,x30 + add x2,x2,#8*4 // bp++ + adcs x20,x20,xzr + sub x1,x1,x5 // rewind ap + adcs x21,x21,xzr + stp x19,x20,[x26,#8*0] // result!!! + adcs x22,x22,xzr + ldp x19,x20,[sp,#8*4] // t[0..3] + adc x30,x0,xzr + stp x21,x22,[x26,#8*2] // result!!! + cmp x2,x13 // done yet? 
+ ldp x21,x22,[sp,#8*6] + ldp x14,x15,[x11,#8*0] // n[0..3] + ldp x16,x17,[x11,#8*2] + add x3,x11,#8*4 + b.eq Lmul4x_post + + ldr x24,[x2] + ldp x6,x7,[x1,#8*0] // a[0..3] + ldp x8,x9,[x1,#8*2] + adds x1,x1,#8*4 // clear carry bit + mov x0,xzr + mov x26,sp + b Loop_mul4x_reduction + +.align 4 +Lmul4x_post: + // Final step. We see if result is larger than modulus, and + // if it is, subtract the modulus. But comparison implies + // subtraction. So we subtract modulus, see if it borrowed, + // and conditionally copy original value. + mov x0,x12 + mov x27,x12 // x0 copy + subs x10,x19,x14 + add x26,sp,#8*8 + sbcs x11,x20,x15 + sub x28,x5,#8*4 + +Lmul4x_sub: + sbcs x12,x21,x16 + ldp x14,x15,[x3,#8*0] + sub x28,x28,#8*4 + ldp x19,x20,[x26,#8*0] + sbcs x13,x22,x17 + ldp x16,x17,[x3,#8*2] + add x3,x3,#8*4 + ldp x21,x22,[x26,#8*2] + add x26,x26,#8*4 + stp x10,x11,[x0,#8*0] + sbcs x10,x19,x14 + stp x12,x13,[x0,#8*2] + add x0,x0,#8*4 + sbcs x11,x20,x15 + cbnz x28,Lmul4x_sub + + sbcs x12,x21,x16 + mov x26,sp + add x1,sp,#8*4 + ldp x6,x7,[x27,#8*0] + sbcs x13,x22,x17 + stp x10,x11,[x0,#8*0] + ldp x8,x9,[x27,#8*2] + stp x12,x13,[x0,#8*2] + ldp x19,x20,[x1,#8*0] + ldp x21,x22,[x1,#8*2] + sbcs xzr,x30,xzr // did it borrow? + ldr x30,[x29,#8] // pull return address + + sub x28,x5,#8*4 +Lmul4x_cond_copy: + sub x28,x28,#8*4 + csel x10,x19,x6,lo + stp xzr,xzr,[x26,#8*0] + csel x11,x20,x7,lo + ldp x6,x7,[x27,#8*4] + ldp x19,x20,[x1,#8*4] + csel x12,x21,x8,lo + stp xzr,xzr,[x26,#8*2] + add x26,x26,#8*4 + csel x13,x22,x9,lo + ldp x8,x9,[x27,#8*6] + ldp x21,x22,[x1,#8*6] + add x1,x1,#8*4 + stp x10,x11,[x27,#8*0] + stp x12,x13,[x27,#8*2] + add x27,x27,#8*4 + cbnz x28,Lmul4x_cond_copy + + csel x10,x19,x6,lo + stp xzr,xzr,[x26,#8*0] + csel x11,x20,x7,lo + stp xzr,xzr,[x26,#8*2] + csel x12,x21,x8,lo + stp xzr,xzr,[x26,#8*3] + csel x13,x22,x9,lo + stp xzr,xzr,[x26,#8*4] + stp x10,x11,[x27,#8*0] + stp x12,x13,[x27,#8*2] + + b Lmul4x_done + +.align 4 +Lmul4x4_post_condition: + adc x0,x0,xzr + ldr x1,[x29,#96] // pull rp + // x19-3,x0 hold result, x14-7 hold modulus + subs x6,x19,x14 + ldr x30,[x29,#8] // pull return address + sbcs x7,x20,x15 + stp xzr,xzr,[sp,#8*0] + sbcs x8,x21,x16 + stp xzr,xzr,[sp,#8*2] + sbcs x9,x22,x17 + stp xzr,xzr,[sp,#8*4] + sbcs xzr,x0,xzr // did it borrow? + stp xzr,xzr,[sp,#8*6] + + // x6-3 hold result-modulus + csel x6,x19,x6,lo + csel x7,x20,x7,lo + csel x8,x21,x8,lo + csel x9,x22,x9,lo + stp x6,x7,[x1,#8*0] + stp x8,x9,[x1,#8*2] + +Lmul4x_done: + ldp x19,x20,[x29,#16] + mov sp,x29 + ldp x21,x22,[x29,#32] + mov x0,#1 + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + // x30 is popped earlier + AARCH64_VALIDATE_LINK_REGISTER + ret + +.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 4 +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__) diff --git a/ring-0.17.14/pregenerated/armv8-mont-linux64.S b/ring-0.17.14/pregenerated/armv8-mont-linux64.S new file mode 100644 index 0000000000..4e8a5c226d --- /dev/null +++ b/ring-0.17.14/pregenerated/armv8-mont-linux64.S @@ -0,0 +1,1416 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__) +.text + +.globl bn_mul_mont_nohw +.hidden bn_mul_mont_nohw +.type bn_mul_mont_nohw,%function +.align 5 +bn_mul_mont_nohw: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-64]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + ldr x9,[x2],#8 // bp[0] + sub x22,sp,x5,lsl#3 + ldp x7,x8,[x1],#16 // ap[0..1] + lsl x5,x5,#3 + ldr x4,[x4] // *n0 + and x22,x22,#-16 // ABI says so + ldp x13,x14,[x3],#16 // np[0..1] + + mul x6,x7,x9 // ap[0]*bp[0] + sub x21,x5,#16 // j=num-2 + umulh x7,x7,x9 + mul x10,x8,x9 // ap[1]*bp[0] + umulh x11,x8,x9 + + mul x15,x6,x4 // "tp[0]"*n0 + mov sp,x22 // alloca + + // (*) mul x12,x13,x15 // np[0]*m1 + umulh x13,x13,x15 + mul x16,x14,x15 // np[1]*m1 + // (*) adds x12,x12,x6 // discarded + // (*) As for removal of first multiplication and addition + // instructions. The outcome of first addition is + // guaranteed to be zero, which leaves two computationally + // significant outcomes: it either carries or not. Then + // question is when does it carry? Is there alternative + // way to deduce it? If you follow operations, you can + // observe that condition for carry is quite simple: + // x6 being non-zero. So that carry can be calculated + // by adding -1 to x6. That's what next instruction does. + subs xzr,x6,#1 // (*) + umulh x17,x14,x15 + adc x13,x13,xzr + cbz x21,.L1st_skip + +.L1st: + ldr x8,[x1],#8 + adds x6,x10,x7 + sub x21,x21,#8 // j-- + adc x7,x11,xzr + + ldr x14,[x3],#8 + adds x12,x16,x13 + mul x10,x8,x9 // ap[j]*bp[0] + adc x13,x17,xzr + umulh x11,x8,x9 + + adds x12,x12,x6 + mul x16,x14,x15 // np[j]*m1 + adc x13,x13,xzr + umulh x17,x14,x15 + str x12,[x22],#8 // tp[j-1] + cbnz x21,.L1st + +.L1st_skip: + adds x6,x10,x7 + sub x1,x1,x5 // rewind x1 + adc x7,x11,xzr + + adds x12,x16,x13 + sub x3,x3,x5 // rewind x3 + adc x13,x17,xzr + + adds x12,x12,x6 + sub x20,x5,#8 // i=num-1 + adcs x13,x13,x7 + + adc x19,xzr,xzr // upmost overflow bit + stp x12,x13,[x22] + +.Louter: + ldr x9,[x2],#8 // bp[i] + ldp x7,x8,[x1],#16 + ldr x23,[sp] // tp[0] + add x22,sp,#8 + + mul x6,x7,x9 // ap[0]*bp[i] + sub x21,x5,#16 // j=num-2 + umulh x7,x7,x9 + ldp x13,x14,[x3],#16 + mul x10,x8,x9 // ap[1]*bp[i] + adds x6,x6,x23 + umulh x11,x8,x9 + adc x7,x7,xzr + + mul x15,x6,x4 + sub x20,x20,#8 // i-- + + // (*) mul x12,x13,x15 // np[0]*m1 + umulh x13,x13,x15 + mul x16,x14,x15 // np[1]*m1 + // (*) adds x12,x12,x6 + subs xzr,x6,#1 // (*) + umulh x17,x14,x15 + cbz x21,.Linner_skip + +.Linner: + ldr x8,[x1],#8 + adc x13,x13,xzr + ldr x23,[x22],#8 // tp[j] + adds x6,x10,x7 + sub x21,x21,#8 // j-- + adc x7,x11,xzr + + adds x12,x16,x13 + ldr x14,[x3],#8 + adc x13,x17,xzr + + mul x10,x8,x9 // ap[j]*bp[i] + adds x6,x6,x23 + umulh x11,x8,x9 + adc x7,x7,xzr + + mul x16,x14,x15 // np[j]*m1 + adds x12,x12,x6 + umulh x17,x14,x15 + str x12,[x22,#-16] // tp[j-1] + cbnz x21,.Linner + +.Linner_skip: + ldr x23,[x22],#8 // tp[j] + adc x13,x13,xzr + adds x6,x10,x7 + sub x1,x1,x5 // rewind x1 + adc x7,x11,xzr + + adds x12,x16,x13 + sub x3,x3,x5 // rewind x3 + adcs x13,x17,x19 + adc x19,xzr,xzr + + adds x6,x6,x23 + adc x7,x7,xzr + + adds x12,x12,x6 + adcs x13,x13,x7 + adc x19,x19,xzr // upmost overflow bit + stp x12,x13,[x22,#-16] + + cbnz x20,.Louter + + // Final step. We see if result is larger than modulus, and + // if it is, subtract the modulus. But comparison implies + // subtraction. So we subtract modulus, see if it borrowed, + // and conditionally copy original value. 
+ ldr x23,[sp] // tp[0] + add x22,sp,#8 + ldr x14,[x3],#8 // np[0] + subs x21,x5,#8 // j=num-1 and clear borrow + mov x1,x0 +.Lsub: + sbcs x8,x23,x14 // tp[j]-np[j] + ldr x23,[x22],#8 + sub x21,x21,#8 // j-- + ldr x14,[x3],#8 + str x8,[x1],#8 // rp[j]=tp[j]-np[j] + cbnz x21,.Lsub + + sbcs x8,x23,x14 + sbcs x19,x19,xzr // did it borrow? + str x8,[x1],#8 // rp[num-1] + + ldr x23,[sp] // tp[0] + add x22,sp,#8 + ldr x8,[x0],#8 // rp[0] + sub x5,x5,#8 // num-- + nop +.Lcond_copy: + sub x5,x5,#8 // num-- + csel x14,x23,x8,lo // did it borrow? + ldr x23,[x22],#8 + ldr x8,[x0],#8 + str xzr,[x22,#-16] // wipe tp + str x14,[x0,#-16] + cbnz x5,.Lcond_copy + + csel x14,x23,x8,lo + str xzr,[x22,#-8] // wipe tp + str x14,[x0,#-8] + + ldp x19,x20,[x29,#16] + mov sp,x29 + ldp x21,x22,[x29,#32] + mov x0,#1 + ldp x23,x24,[x29,#48] + ldr x29,[sp],#64 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size bn_mul_mont_nohw,.-bn_mul_mont_nohw +.globl bn_sqr8x_mont +.hidden bn_sqr8x_mont +.type bn_sqr8x_mont,%function +.align 5 +bn_sqr8x_mont: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp x0,x3,[sp,#96] // offload rp and np + + ldp x6,x7,[x1,#8*0] + ldp x8,x9,[x1,#8*2] + ldp x10,x11,[x1,#8*4] + ldp x12,x13,[x1,#8*6] + + sub x2,sp,x5,lsl#4 + lsl x5,x5,#3 + ldr x4,[x4] // *n0 + mov sp,x2 // alloca + sub x27,x5,#8*8 + b .Lsqr8x_zero_start + +.Lsqr8x_zero: + sub x27,x27,#8*8 + stp xzr,xzr,[x2,#8*0] + stp xzr,xzr,[x2,#8*2] + stp xzr,xzr,[x2,#8*4] + stp xzr,xzr,[x2,#8*6] +.Lsqr8x_zero_start: + stp xzr,xzr,[x2,#8*8] + stp xzr,xzr,[x2,#8*10] + stp xzr,xzr,[x2,#8*12] + stp xzr,xzr,[x2,#8*14] + add x2,x2,#8*16 + cbnz x27,.Lsqr8x_zero + + add x3,x1,x5 + add x1,x1,#8*8 + mov x19,xzr + mov x20,xzr + mov x21,xzr + mov x22,xzr + mov x23,xzr + mov x24,xzr + mov x25,xzr + mov x26,xzr + mov x2,sp + str x4,[x29,#112] // offload n0 + + // Multiply everything but a[i]*a[i] +.align 4 +.Lsqr8x_outer_loop: + // a[1]a[0] (i) + // a[2]a[0] + // a[3]a[0] + // a[4]a[0] + // a[5]a[0] + // a[6]a[0] + // a[7]a[0] + // a[2]a[1] (ii) + // a[3]a[1] + // a[4]a[1] + // a[5]a[1] + // a[6]a[1] + // a[7]a[1] + // a[3]a[2] (iii) + // a[4]a[2] + // a[5]a[2] + // a[6]a[2] + // a[7]a[2] + // a[4]a[3] (iv) + // a[5]a[3] + // a[6]a[3] + // a[7]a[3] + // a[5]a[4] (v) + // a[6]a[4] + // a[7]a[4] + // a[6]a[5] (vi) + // a[7]a[5] + // a[7]a[6] (vii) + + mul x14,x7,x6 // lo(a[1..7]*a[0]) (i) + mul x15,x8,x6 + mul x16,x9,x6 + mul x17,x10,x6 + adds x20,x20,x14 // t[1]+lo(a[1]*a[0]) + mul x14,x11,x6 + adcs x21,x21,x15 + mul x15,x12,x6 + adcs x22,x22,x16 + mul x16,x13,x6 + adcs x23,x23,x17 + umulh x17,x7,x6 // hi(a[1..7]*a[0]) + adcs x24,x24,x14 + umulh x14,x8,x6 + adcs x25,x25,x15 + umulh x15,x9,x6 + adcs x26,x26,x16 + umulh x16,x10,x6 + stp x19,x20,[x2],#8*2 // t[0..1] + adc x19,xzr,xzr // t[8] + adds x21,x21,x17 // t[2]+lo(a[1]*a[0]) + umulh x17,x11,x6 + adcs x22,x22,x14 + umulh x14,x12,x6 + adcs x23,x23,x15 + umulh x15,x13,x6 + adcs x24,x24,x16 + mul x16,x8,x7 // lo(a[2..7]*a[1]) (ii) + adcs x25,x25,x17 + mul x17,x9,x7 + adcs x26,x26,x14 + mul x14,x10,x7 + adc x19,x19,x15 + + mul x15,x11,x7 + adds x22,x22,x16 + mul x16,x12,x7 + adcs x23,x23,x17 + mul x17,x13,x7 + adcs x24,x24,x14 + umulh x14,x8,x7 // hi(a[2..7]*a[1]) + adcs x25,x25,x15 + umulh x15,x9,x7 + adcs x26,x26,x16 + umulh x16,x10,x7 + adcs x19,x19,x17 + umulh x17,x11,x7 + stp x21,x22,[x2],#8*2 // t[2..3] + adc x20,xzr,xzr // t[9] + adds x23,x23,x14 + umulh x14,x12,x7 + 
adcs x24,x24,x15 + umulh x15,x13,x7 + adcs x25,x25,x16 + mul x16,x9,x8 // lo(a[3..7]*a[2]) (iii) + adcs x26,x26,x17 + mul x17,x10,x8 + adcs x19,x19,x14 + mul x14,x11,x8 + adc x20,x20,x15 + + mul x15,x12,x8 + adds x24,x24,x16 + mul x16,x13,x8 + adcs x25,x25,x17 + umulh x17,x9,x8 // hi(a[3..7]*a[2]) + adcs x26,x26,x14 + umulh x14,x10,x8 + adcs x19,x19,x15 + umulh x15,x11,x8 + adcs x20,x20,x16 + umulh x16,x12,x8 + stp x23,x24,[x2],#8*2 // t[4..5] + adc x21,xzr,xzr // t[10] + adds x25,x25,x17 + umulh x17,x13,x8 + adcs x26,x26,x14 + mul x14,x10,x9 // lo(a[4..7]*a[3]) (iv) + adcs x19,x19,x15 + mul x15,x11,x9 + adcs x20,x20,x16 + mul x16,x12,x9 + adc x21,x21,x17 + + mul x17,x13,x9 + adds x26,x26,x14 + umulh x14,x10,x9 // hi(a[4..7]*a[3]) + adcs x19,x19,x15 + umulh x15,x11,x9 + adcs x20,x20,x16 + umulh x16,x12,x9 + adcs x21,x21,x17 + umulh x17,x13,x9 + stp x25,x26,[x2],#8*2 // t[6..7] + adc x22,xzr,xzr // t[11] + adds x19,x19,x14 + mul x14,x11,x10 // lo(a[5..7]*a[4]) (v) + adcs x20,x20,x15 + mul x15,x12,x10 + adcs x21,x21,x16 + mul x16,x13,x10 + adc x22,x22,x17 + + umulh x17,x11,x10 // hi(a[5..7]*a[4]) + adds x20,x20,x14 + umulh x14,x12,x10 + adcs x21,x21,x15 + umulh x15,x13,x10 + adcs x22,x22,x16 + mul x16,x12,x11 // lo(a[6..7]*a[5]) (vi) + adc x23,xzr,xzr // t[12] + adds x21,x21,x17 + mul x17,x13,x11 + adcs x22,x22,x14 + umulh x14,x12,x11 // hi(a[6..7]*a[5]) + adc x23,x23,x15 + + umulh x15,x13,x11 + adds x22,x22,x16 + mul x16,x13,x12 // lo(a[7]*a[6]) (vii) + adcs x23,x23,x17 + umulh x17,x13,x12 // hi(a[7]*a[6]) + adc x24,xzr,xzr // t[13] + adds x23,x23,x14 + sub x27,x3,x1 // done yet? + adc x24,x24,x15 + + adds x24,x24,x16 + sub x14,x3,x5 // rewinded ap + adc x25,xzr,xzr // t[14] + add x25,x25,x17 + + cbz x27,.Lsqr8x_outer_break + + mov x4,x6 + ldp x6,x7,[x2,#8*0] + ldp x8,x9,[x2,#8*2] + ldp x10,x11,[x2,#8*4] + ldp x12,x13,[x2,#8*6] + adds x19,x19,x6 + adcs x20,x20,x7 + ldp x6,x7,[x1,#8*0] + adcs x21,x21,x8 + adcs x22,x22,x9 + ldp x8,x9,[x1,#8*2] + adcs x23,x23,x10 + adcs x24,x24,x11 + ldp x10,x11,[x1,#8*4] + adcs x25,x25,x12 + mov x0,x1 + adcs x26,xzr,x13 + ldp x12,x13,[x1,#8*6] + add x1,x1,#8*8 + //adc x28,xzr,xzr // moved below + mov x27,#-8*8 + + // a[8]a[0] + // a[9]a[0] + // a[a]a[0] + // a[b]a[0] + // a[c]a[0] + // a[d]a[0] + // a[e]a[0] + // a[f]a[0] + // a[8]a[1] + // a[f]a[1]........................ + // a[8]a[2] + // a[f]a[2]........................ + // a[8]a[3] + // a[f]a[3]........................ + // a[8]a[4] + // a[f]a[4]........................ + // a[8]a[5] + // a[f]a[5]........................ + // a[8]a[6] + // a[f]a[6]........................ + // a[8]a[7] + // a[f]a[7]........................ +.Lsqr8x_mul: + mul x14,x6,x4 + adc x28,xzr,xzr // carry bit, modulo-scheduled + mul x15,x7,x4 + add x27,x27,#8 + mul x16,x8,x4 + mul x17,x9,x4 + adds x19,x19,x14 + mul x14,x10,x4 + adcs x20,x20,x15 + mul x15,x11,x4 + adcs x21,x21,x16 + mul x16,x12,x4 + adcs x22,x22,x17 + mul x17,x13,x4 + adcs x23,x23,x14 + umulh x14,x6,x4 + adcs x24,x24,x15 + umulh x15,x7,x4 + adcs x25,x25,x16 + umulh x16,x8,x4 + adcs x26,x26,x17 + umulh x17,x9,x4 + adc x28,x28,xzr + str x19,[x2],#8 + adds x19,x20,x14 + umulh x14,x10,x4 + adcs x20,x21,x15 + umulh x15,x11,x4 + adcs x21,x22,x16 + umulh x16,x12,x4 + adcs x22,x23,x17 + umulh x17,x13,x4 + ldr x4,[x0,x27] + adcs x23,x24,x14 + adcs x24,x25,x15 + adcs x25,x26,x16 + adcs x26,x28,x17 + //adc x28,xzr,xzr // moved above + cbnz x27,.Lsqr8x_mul + // note that carry flag is guaranteed + // to be zero at this point + cmp x1,x3 // done yet? 
+ b.eq .Lsqr8x_break + + ldp x6,x7,[x2,#8*0] + ldp x8,x9,[x2,#8*2] + ldp x10,x11,[x2,#8*4] + ldp x12,x13,[x2,#8*6] + adds x19,x19,x6 + ldr x4,[x0,#-8*8] + adcs x20,x20,x7 + ldp x6,x7,[x1,#8*0] + adcs x21,x21,x8 + adcs x22,x22,x9 + ldp x8,x9,[x1,#8*2] + adcs x23,x23,x10 + adcs x24,x24,x11 + ldp x10,x11,[x1,#8*4] + adcs x25,x25,x12 + mov x27,#-8*8 + adcs x26,x26,x13 + ldp x12,x13,[x1,#8*6] + add x1,x1,#8*8 + //adc x28,xzr,xzr // moved above + b .Lsqr8x_mul + +.align 4 +.Lsqr8x_break: + ldp x6,x7,[x0,#8*0] + add x1,x0,#8*8 + ldp x8,x9,[x0,#8*2] + sub x14,x3,x1 // is it last iteration? + ldp x10,x11,[x0,#8*4] + sub x15,x2,x14 + ldp x12,x13,[x0,#8*6] + cbz x14,.Lsqr8x_outer_loop + + stp x19,x20,[x2,#8*0] + ldp x19,x20,[x15,#8*0] + stp x21,x22,[x2,#8*2] + ldp x21,x22,[x15,#8*2] + stp x23,x24,[x2,#8*4] + ldp x23,x24,[x15,#8*4] + stp x25,x26,[x2,#8*6] + mov x2,x15 + ldp x25,x26,[x15,#8*6] + b .Lsqr8x_outer_loop + +.align 4 +.Lsqr8x_outer_break: + // Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0] + ldp x7,x9,[x14,#8*0] // recall that x14 is &a[0] + ldp x15,x16,[sp,#8*1] + ldp x11,x13,[x14,#8*2] + add x1,x14,#8*4 + ldp x17,x14,[sp,#8*3] + + stp x19,x20,[x2,#8*0] + mul x19,x7,x7 + stp x21,x22,[x2,#8*2] + umulh x7,x7,x7 + stp x23,x24,[x2,#8*4] + mul x8,x9,x9 + stp x25,x26,[x2,#8*6] + mov x2,sp + umulh x9,x9,x9 + adds x20,x7,x15,lsl#1 + extr x15,x16,x15,#63 + sub x27,x5,#8*4 + +.Lsqr4x_shift_n_add: + adcs x21,x8,x15 + extr x16,x17,x16,#63 + sub x27,x27,#8*4 + adcs x22,x9,x16 + ldp x15,x16,[x2,#8*5] + mul x10,x11,x11 + ldp x7,x9,[x1],#8*2 + umulh x11,x11,x11 + mul x12,x13,x13 + umulh x13,x13,x13 + extr x17,x14,x17,#63 + stp x19,x20,[x2,#8*0] + adcs x23,x10,x17 + extr x14,x15,x14,#63 + stp x21,x22,[x2,#8*2] + adcs x24,x11,x14 + ldp x17,x14,[x2,#8*7] + extr x15,x16,x15,#63 + adcs x25,x12,x15 + extr x16,x17,x16,#63 + adcs x26,x13,x16 + ldp x15,x16,[x2,#8*9] + mul x6,x7,x7 + ldp x11,x13,[x1],#8*2 + umulh x7,x7,x7 + mul x8,x9,x9 + umulh x9,x9,x9 + stp x23,x24,[x2,#8*4] + extr x17,x14,x17,#63 + stp x25,x26,[x2,#8*6] + add x2,x2,#8*8 + adcs x19,x6,x17 + extr x14,x15,x14,#63 + adcs x20,x7,x14 + ldp x17,x14,[x2,#8*3] + extr x15,x16,x15,#63 + cbnz x27,.Lsqr4x_shift_n_add + ldp x1,x4,[x29,#104] // pull np and n0 + + adcs x21,x8,x15 + extr x16,x17,x16,#63 + adcs x22,x9,x16 + ldp x15,x16,[x2,#8*5] + mul x10,x11,x11 + umulh x11,x11,x11 + stp x19,x20,[x2,#8*0] + mul x12,x13,x13 + umulh x13,x13,x13 + stp x21,x22,[x2,#8*2] + extr x17,x14,x17,#63 + adcs x23,x10,x17 + extr x14,x15,x14,#63 + ldp x19,x20,[sp,#8*0] + adcs x24,x11,x14 + extr x15,x16,x15,#63 + ldp x6,x7,[x1,#8*0] + adcs x25,x12,x15 + extr x16,xzr,x16,#63 + ldp x8,x9,[x1,#8*2] + adc x26,x13,x16 + ldp x10,x11,[x1,#8*4] + + // Reduce by 512 bits per iteration + mul x28,x4,x19 // t[0]*n0 + ldp x12,x13,[x1,#8*6] + add x3,x1,x5 + ldp x21,x22,[sp,#8*2] + stp x23,x24,[x2,#8*4] + ldp x23,x24,[sp,#8*4] + stp x25,x26,[x2,#8*6] + ldp x25,x26,[sp,#8*6] + add x1,x1,#8*8 + mov x30,xzr // initial top-most carry + mov x2,sp + mov x27,#8 + +.Lsqr8x_reduction: + // (*) mul x14,x6,x28 // lo(n[0-7])*lo(t[0]*n0) + mul x15,x7,x28 + sub x27,x27,#1 + mul x16,x8,x28 + str x28,[x2],#8 // put aside t[0]*n0 for tail processing + mul x17,x9,x28 + // (*) adds xzr,x19,x14 + subs xzr,x19,#1 // (*) + mul x14,x10,x28 + adcs x19,x20,x15 + mul x15,x11,x28 + adcs x20,x21,x16 + mul x16,x12,x28 + adcs x21,x22,x17 + mul x17,x13,x28 + adcs x22,x23,x14 + umulh x14,x6,x28 // hi(n[0-7])*lo(t[0]*n0) + adcs x23,x24,x15 + umulh x15,x7,x28 + adcs x24,x25,x16 + umulh x16,x8,x28 + adcs 
x25,x26,x17 + umulh x17,x9,x28 + adc x26,xzr,xzr + adds x19,x19,x14 + umulh x14,x10,x28 + adcs x20,x20,x15 + umulh x15,x11,x28 + adcs x21,x21,x16 + umulh x16,x12,x28 + adcs x22,x22,x17 + umulh x17,x13,x28 + mul x28,x4,x19 // next t[0]*n0 + adcs x23,x23,x14 + adcs x24,x24,x15 + adcs x25,x25,x16 + adc x26,x26,x17 + cbnz x27,.Lsqr8x_reduction + + ldp x14,x15,[x2,#8*0] + ldp x16,x17,[x2,#8*2] + mov x0,x2 + sub x27,x3,x1 // done yet? + adds x19,x19,x14 + adcs x20,x20,x15 + ldp x14,x15,[x2,#8*4] + adcs x21,x21,x16 + adcs x22,x22,x17 + ldp x16,x17,[x2,#8*6] + adcs x23,x23,x14 + adcs x24,x24,x15 + adcs x25,x25,x16 + adcs x26,x26,x17 + //adc x28,xzr,xzr // moved below + cbz x27,.Lsqr8x8_post_condition + + ldr x4,[x2,#-8*8] + ldp x6,x7,[x1,#8*0] + ldp x8,x9,[x1,#8*2] + ldp x10,x11,[x1,#8*4] + mov x27,#-8*8 + ldp x12,x13,[x1,#8*6] + add x1,x1,#8*8 + +.Lsqr8x_tail: + mul x14,x6,x4 + adc x28,xzr,xzr // carry bit, modulo-scheduled + mul x15,x7,x4 + add x27,x27,#8 + mul x16,x8,x4 + mul x17,x9,x4 + adds x19,x19,x14 + mul x14,x10,x4 + adcs x20,x20,x15 + mul x15,x11,x4 + adcs x21,x21,x16 + mul x16,x12,x4 + adcs x22,x22,x17 + mul x17,x13,x4 + adcs x23,x23,x14 + umulh x14,x6,x4 + adcs x24,x24,x15 + umulh x15,x7,x4 + adcs x25,x25,x16 + umulh x16,x8,x4 + adcs x26,x26,x17 + umulh x17,x9,x4 + adc x28,x28,xzr + str x19,[x2],#8 + adds x19,x20,x14 + umulh x14,x10,x4 + adcs x20,x21,x15 + umulh x15,x11,x4 + adcs x21,x22,x16 + umulh x16,x12,x4 + adcs x22,x23,x17 + umulh x17,x13,x4 + ldr x4,[x0,x27] + adcs x23,x24,x14 + adcs x24,x25,x15 + adcs x25,x26,x16 + adcs x26,x28,x17 + //adc x28,xzr,xzr // moved above + cbnz x27,.Lsqr8x_tail + // note that carry flag is guaranteed + // to be zero at this point + ldp x6,x7,[x2,#8*0] + sub x27,x3,x1 // done yet? + sub x16,x3,x5 // rewinded np + ldp x8,x9,[x2,#8*2] + ldp x10,x11,[x2,#8*4] + ldp x12,x13,[x2,#8*6] + cbz x27,.Lsqr8x_tail_break + + ldr x4,[x0,#-8*8] + adds x19,x19,x6 + adcs x20,x20,x7 + ldp x6,x7,[x1,#8*0] + adcs x21,x21,x8 + adcs x22,x22,x9 + ldp x8,x9,[x1,#8*2] + adcs x23,x23,x10 + adcs x24,x24,x11 + ldp x10,x11,[x1,#8*4] + adcs x25,x25,x12 + mov x27,#-8*8 + adcs x26,x26,x13 + ldp x12,x13,[x1,#8*6] + add x1,x1,#8*8 + //adc x28,xzr,xzr // moved above + b .Lsqr8x_tail + +.align 4 +.Lsqr8x_tail_break: + ldr x4,[x29,#112] // pull n0 + add x27,x2,#8*8 // end of current t[num] window + + subs xzr,x30,#1 // "move" top-most carry to carry bit + adcs x14,x19,x6 + adcs x15,x20,x7 + ldp x19,x20,[x0,#8*0] + adcs x21,x21,x8 + ldp x6,x7,[x16,#8*0] // recall that x16 is &n[0] + adcs x22,x22,x9 + ldp x8,x9,[x16,#8*2] + adcs x23,x23,x10 + adcs x24,x24,x11 + ldp x10,x11,[x16,#8*4] + adcs x25,x25,x12 + adcs x26,x26,x13 + ldp x12,x13,[x16,#8*6] + add x1,x16,#8*8 + adc x30,xzr,xzr // top-most carry + mul x28,x4,x19 + stp x14,x15,[x2,#8*0] + stp x21,x22,[x2,#8*2] + ldp x21,x22,[x0,#8*2] + stp x23,x24,[x2,#8*4] + ldp x23,x24,[x0,#8*4] + cmp x27,x29 // did we hit the bottom? + stp x25,x26,[x2,#8*6] + mov x2,x0 // slide the window + ldp x25,x26,[x0,#8*6] + mov x27,#8 + b.ne .Lsqr8x_reduction + + // Final step. We see if result is larger than modulus, and + // if it is, subtract the modulus. But comparison implies + // subtraction. So we subtract modulus, see if it borrowed, + // and conditionally copy original value. 
+ ldr x0,[x29,#96] // pull rp + add x2,x2,#8*8 + subs x14,x19,x6 + sbcs x15,x20,x7 + sub x27,x5,#8*8 + mov x3,x0 // x0 copy + +.Lsqr8x_sub: + sbcs x16,x21,x8 + ldp x6,x7,[x1,#8*0] + sbcs x17,x22,x9 + stp x14,x15,[x0,#8*0] + sbcs x14,x23,x10 + ldp x8,x9,[x1,#8*2] + sbcs x15,x24,x11 + stp x16,x17,[x0,#8*2] + sbcs x16,x25,x12 + ldp x10,x11,[x1,#8*4] + sbcs x17,x26,x13 + ldp x12,x13,[x1,#8*6] + add x1,x1,#8*8 + ldp x19,x20,[x2,#8*0] + sub x27,x27,#8*8 + ldp x21,x22,[x2,#8*2] + ldp x23,x24,[x2,#8*4] + ldp x25,x26,[x2,#8*6] + add x2,x2,#8*8 + stp x14,x15,[x0,#8*4] + sbcs x14,x19,x6 + stp x16,x17,[x0,#8*6] + add x0,x0,#8*8 + sbcs x15,x20,x7 + cbnz x27,.Lsqr8x_sub + + sbcs x16,x21,x8 + mov x2,sp + add x1,sp,x5 + ldp x6,x7,[x3,#8*0] + sbcs x17,x22,x9 + stp x14,x15,[x0,#8*0] + sbcs x14,x23,x10 + ldp x8,x9,[x3,#8*2] + sbcs x15,x24,x11 + stp x16,x17,[x0,#8*2] + sbcs x16,x25,x12 + ldp x19,x20,[x1,#8*0] + sbcs x17,x26,x13 + ldp x21,x22,[x1,#8*2] + sbcs xzr,x30,xzr // did it borrow? + ldr x30,[x29,#8] // pull return address + stp x14,x15,[x0,#8*4] + stp x16,x17,[x0,#8*6] + + sub x27,x5,#8*4 +.Lsqr4x_cond_copy: + sub x27,x27,#8*4 + csel x14,x19,x6,lo + stp xzr,xzr,[x2,#8*0] + csel x15,x20,x7,lo + ldp x6,x7,[x3,#8*4] + ldp x19,x20,[x1,#8*4] + csel x16,x21,x8,lo + stp xzr,xzr,[x2,#8*2] + add x2,x2,#8*4 + csel x17,x22,x9,lo + ldp x8,x9,[x3,#8*6] + ldp x21,x22,[x1,#8*6] + add x1,x1,#8*4 + stp x14,x15,[x3,#8*0] + stp x16,x17,[x3,#8*2] + add x3,x3,#8*4 + stp xzr,xzr,[x1,#8*0] + stp xzr,xzr,[x1,#8*2] + cbnz x27,.Lsqr4x_cond_copy + + csel x14,x19,x6,lo + stp xzr,xzr,[x2,#8*0] + csel x15,x20,x7,lo + stp xzr,xzr,[x2,#8*2] + csel x16,x21,x8,lo + csel x17,x22,x9,lo + stp x14,x15,[x3,#8*0] + stp x16,x17,[x3,#8*2] + + b .Lsqr8x_done + +.align 4 +.Lsqr8x8_post_condition: + adc x28,xzr,xzr + ldr x30,[x29,#8] // pull return address + // x19-7,x28 hold result, x6-7 hold modulus + subs x6,x19,x6 + ldr x1,[x29,#96] // pull rp + sbcs x7,x20,x7 + stp xzr,xzr,[sp,#8*0] + sbcs x8,x21,x8 + stp xzr,xzr,[sp,#8*2] + sbcs x9,x22,x9 + stp xzr,xzr,[sp,#8*4] + sbcs x10,x23,x10 + stp xzr,xzr,[sp,#8*6] + sbcs x11,x24,x11 + stp xzr,xzr,[sp,#8*8] + sbcs x12,x25,x12 + stp xzr,xzr,[sp,#8*10] + sbcs x13,x26,x13 + stp xzr,xzr,[sp,#8*12] + sbcs x28,x28,xzr // did it borrow? + stp xzr,xzr,[sp,#8*14] + + // x6-7 hold result-modulus + csel x6,x19,x6,lo + csel x7,x20,x7,lo + csel x8,x21,x8,lo + csel x9,x22,x9,lo + stp x6,x7,[x1,#8*0] + csel x10,x23,x10,lo + csel x11,x24,x11,lo + stp x8,x9,[x1,#8*2] + csel x12,x25,x12,lo + csel x13,x26,x13,lo + stp x10,x11,[x1,#8*4] + stp x12,x13,[x1,#8*6] + +.Lsqr8x_done: + ldp x19,x20,[x29,#16] + mov sp,x29 + ldp x21,x22,[x29,#32] + mov x0,#1 + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + // x30 is popped earlier + AARCH64_VALIDATE_LINK_REGISTER + ret +.size bn_sqr8x_mont,.-bn_sqr8x_mont +.globl bn_mul4x_mont +.hidden bn_mul4x_mont +.type bn_mul4x_mont,%function +.align 5 +bn_mul4x_mont: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + sub x26,sp,x5,lsl#3 + lsl x5,x5,#3 + ldr x4,[x4] // *n0 + sub sp,x26,#8*4 // alloca + + add x10,x2,x5 + add x27,x1,x5 + stp x0,x10,[x29,#96] // offload rp and &b[num] + + ldr x24,[x2,#8*0] // b[0] + ldp x6,x7,[x1,#8*0] // a[0..3] + ldp x8,x9,[x1,#8*2] + add x1,x1,#8*4 + mov x19,xzr + mov x20,xzr + mov x21,xzr + mov x22,xzr + ldp x14,x15,[x3,#8*0] // n[0..3] + ldp x16,x17,[x3,#8*2] + adds x3,x3,#8*4 // clear carry bit + mov x0,xzr + mov x28,#0 + mov x26,sp + +.Loop_mul4x_1st_reduction: + mul x10,x6,x24 // lo(a[0..3]*b[0]) + adc x0,x0,xzr // modulo-scheduled + mul x11,x7,x24 + add x28,x28,#8 + mul x12,x8,x24 + and x28,x28,#31 + mul x13,x9,x24 + adds x19,x19,x10 + umulh x10,x6,x24 // hi(a[0..3]*b[0]) + adcs x20,x20,x11 + mul x25,x19,x4 // t[0]*n0 + adcs x21,x21,x12 + umulh x11,x7,x24 + adcs x22,x22,x13 + umulh x12,x8,x24 + adc x23,xzr,xzr + umulh x13,x9,x24 + ldr x24,[x2,x28] // next b[i] (or b[0]) + adds x20,x20,x10 + // (*) mul x10,x14,x25 // lo(n[0..3]*t[0]*n0) + str x25,[x26],#8 // put aside t[0]*n0 for tail processing + adcs x21,x21,x11 + mul x11,x15,x25 + adcs x22,x22,x12 + mul x12,x16,x25 + adc x23,x23,x13 // can't overflow + mul x13,x17,x25 + // (*) adds xzr,x19,x10 + subs xzr,x19,#1 // (*) + umulh x10,x14,x25 // hi(n[0..3]*t[0]*n0) + adcs x19,x20,x11 + umulh x11,x15,x25 + adcs x20,x21,x12 + umulh x12,x16,x25 + adcs x21,x22,x13 + umulh x13,x17,x25 + adcs x22,x23,x0 + adc x0,xzr,xzr + adds x19,x19,x10 + sub x10,x27,x1 + adcs x20,x20,x11 + adcs x21,x21,x12 + adcs x22,x22,x13 + //adc x0,x0,xzr + cbnz x28,.Loop_mul4x_1st_reduction + + cbz x10,.Lmul4x4_post_condition + + ldp x6,x7,[x1,#8*0] // a[4..7] + ldp x8,x9,[x1,#8*2] + add x1,x1,#8*4 + ldr x25,[sp] // a[0]*n0 + ldp x14,x15,[x3,#8*0] // n[4..7] + ldp x16,x17,[x3,#8*2] + add x3,x3,#8*4 + +.Loop_mul4x_1st_tail: + mul x10,x6,x24 // lo(a[4..7]*b[i]) + adc x0,x0,xzr // modulo-scheduled + mul x11,x7,x24 + add x28,x28,#8 + mul x12,x8,x24 + and x28,x28,#31 + mul x13,x9,x24 + adds x19,x19,x10 + umulh x10,x6,x24 // hi(a[4..7]*b[i]) + adcs x20,x20,x11 + umulh x11,x7,x24 + adcs x21,x21,x12 + umulh x12,x8,x24 + adcs x22,x22,x13 + umulh x13,x9,x24 + adc x23,xzr,xzr + ldr x24,[x2,x28] // next b[i] (or b[0]) + adds x20,x20,x10 + mul x10,x14,x25 // lo(n[4..7]*a[0]*n0) + adcs x21,x21,x11 + mul x11,x15,x25 + adcs x22,x22,x12 + mul x12,x16,x25 + adc x23,x23,x13 // can't overflow + mul x13,x17,x25 + adds x19,x19,x10 + umulh x10,x14,x25 // hi(n[4..7]*a[0]*n0) + adcs x20,x20,x11 + umulh x11,x15,x25 + adcs x21,x21,x12 + umulh x12,x16,x25 + adcs x22,x22,x13 + adcs x23,x23,x0 + umulh x13,x17,x25 + adc x0,xzr,xzr + ldr x25,[sp,x28] // next t[0]*n0 + str x19,[x26],#8 // result!!! + adds x19,x20,x10 + sub x10,x27,x1 // done yet? + adcs x20,x21,x11 + adcs x21,x22,x12 + adcs x22,x23,x13 + //adc x0,x0,xzr + cbnz x28,.Loop_mul4x_1st_tail + + sub x11,x27,x5 // rewinded x1 + cbz x10,.Lmul4x_proceed + + ldp x6,x7,[x1,#8*0] + ldp x8,x9,[x1,#8*2] + add x1,x1,#8*4 + ldp x14,x15,[x3,#8*0] + ldp x16,x17,[x3,#8*2] + add x3,x3,#8*4 + b .Loop_mul4x_1st_tail + +.align 5 +.Lmul4x_proceed: + ldr x24,[x2,#8*4]! // *++b + adc x30,x0,xzr + ldp x6,x7,[x11,#8*0] // a[0..3] + sub x3,x3,x5 // rewind np + ldp x8,x9,[x11,#8*2] + add x1,x11,#8*4 + + stp x19,x20,[x26,#8*0] // result!!! + ldp x19,x20,[sp,#8*4] // t[0..3] + stp x21,x22,[x26,#8*2] // result!!! 
+ ldp x21,x22,[sp,#8*6] + + ldp x14,x15,[x3,#8*0] // n[0..3] + mov x26,sp + ldp x16,x17,[x3,#8*2] + adds x3,x3,#8*4 // clear carry bit + mov x0,xzr + +.align 4 +.Loop_mul4x_reduction: + mul x10,x6,x24 // lo(a[0..3]*b[4]) + adc x0,x0,xzr // modulo-scheduled + mul x11,x7,x24 + add x28,x28,#8 + mul x12,x8,x24 + and x28,x28,#31 + mul x13,x9,x24 + adds x19,x19,x10 + umulh x10,x6,x24 // hi(a[0..3]*b[4]) + adcs x20,x20,x11 + mul x25,x19,x4 // t[0]*n0 + adcs x21,x21,x12 + umulh x11,x7,x24 + adcs x22,x22,x13 + umulh x12,x8,x24 + adc x23,xzr,xzr + umulh x13,x9,x24 + ldr x24,[x2,x28] // next b[i] + adds x20,x20,x10 + // (*) mul x10,x14,x25 + str x25,[x26],#8 // put aside t[0]*n0 for tail processing + adcs x21,x21,x11 + mul x11,x15,x25 // lo(n[0..3]*t[0]*n0 + adcs x22,x22,x12 + mul x12,x16,x25 + adc x23,x23,x13 // can't overflow + mul x13,x17,x25 + // (*) adds xzr,x19,x10 + subs xzr,x19,#1 // (*) + umulh x10,x14,x25 // hi(n[0..3]*t[0]*n0 + adcs x19,x20,x11 + umulh x11,x15,x25 + adcs x20,x21,x12 + umulh x12,x16,x25 + adcs x21,x22,x13 + umulh x13,x17,x25 + adcs x22,x23,x0 + adc x0,xzr,xzr + adds x19,x19,x10 + adcs x20,x20,x11 + adcs x21,x21,x12 + adcs x22,x22,x13 + //adc x0,x0,xzr + cbnz x28,.Loop_mul4x_reduction + + adc x0,x0,xzr + ldp x10,x11,[x26,#8*4] // t[4..7] + ldp x12,x13,[x26,#8*6] + ldp x6,x7,[x1,#8*0] // a[4..7] + ldp x8,x9,[x1,#8*2] + add x1,x1,#8*4 + adds x19,x19,x10 + adcs x20,x20,x11 + adcs x21,x21,x12 + adcs x22,x22,x13 + //adc x0,x0,xzr + + ldr x25,[sp] // t[0]*n0 + ldp x14,x15,[x3,#8*0] // n[4..7] + ldp x16,x17,[x3,#8*2] + add x3,x3,#8*4 + +.align 4 +.Loop_mul4x_tail: + mul x10,x6,x24 // lo(a[4..7]*b[4]) + adc x0,x0,xzr // modulo-scheduled + mul x11,x7,x24 + add x28,x28,#8 + mul x12,x8,x24 + and x28,x28,#31 + mul x13,x9,x24 + adds x19,x19,x10 + umulh x10,x6,x24 // hi(a[4..7]*b[4]) + adcs x20,x20,x11 + umulh x11,x7,x24 + adcs x21,x21,x12 + umulh x12,x8,x24 + adcs x22,x22,x13 + umulh x13,x9,x24 + adc x23,xzr,xzr + ldr x24,[x2,x28] // next b[i] + adds x20,x20,x10 + mul x10,x14,x25 // lo(n[4..7]*t[0]*n0) + adcs x21,x21,x11 + mul x11,x15,x25 + adcs x22,x22,x12 + mul x12,x16,x25 + adc x23,x23,x13 // can't overflow + mul x13,x17,x25 + adds x19,x19,x10 + umulh x10,x14,x25 // hi(n[4..7]*t[0]*n0) + adcs x20,x20,x11 + umulh x11,x15,x25 + adcs x21,x21,x12 + umulh x12,x16,x25 + adcs x22,x22,x13 + umulh x13,x17,x25 + adcs x23,x23,x0 + ldr x25,[sp,x28] // next a[0]*n0 + adc x0,xzr,xzr + str x19,[x26],#8 // result!!! + adds x19,x20,x10 + sub x10,x27,x1 // done yet? + adcs x20,x21,x11 + adcs x21,x22,x12 + adcs x22,x23,x13 + //adc x0,x0,xzr + cbnz x28,.Loop_mul4x_tail + + sub x11,x3,x5 // rewinded np? + adc x0,x0,xzr + cbz x10,.Loop_mul4x_break + + ldp x10,x11,[x26,#8*4] + ldp x12,x13,[x26,#8*6] + ldp x6,x7,[x1,#8*0] + ldp x8,x9,[x1,#8*2] + add x1,x1,#8*4 + adds x19,x19,x10 + adcs x20,x20,x11 + adcs x21,x21,x12 + adcs x22,x22,x13 + //adc x0,x0,xzr + ldp x14,x15,[x3,#8*0] + ldp x16,x17,[x3,#8*2] + add x3,x3,#8*4 + b .Loop_mul4x_tail + +.align 4 +.Loop_mul4x_break: + ldp x12,x13,[x29,#96] // pull rp and &b[num] + adds x19,x19,x30 + add x2,x2,#8*4 // bp++ + adcs x20,x20,xzr + sub x1,x1,x5 // rewind ap + adcs x21,x21,xzr + stp x19,x20,[x26,#8*0] // result!!! + adcs x22,x22,xzr + ldp x19,x20,[sp,#8*4] // t[0..3] + adc x30,x0,xzr + stp x21,x22,[x26,#8*2] // result!!! + cmp x2,x13 // done yet? 
+ ldp x21,x22,[sp,#8*6] + ldp x14,x15,[x11,#8*0] // n[0..3] + ldp x16,x17,[x11,#8*2] + add x3,x11,#8*4 + b.eq .Lmul4x_post + + ldr x24,[x2] + ldp x6,x7,[x1,#8*0] // a[0..3] + ldp x8,x9,[x1,#8*2] + adds x1,x1,#8*4 // clear carry bit + mov x0,xzr + mov x26,sp + b .Loop_mul4x_reduction + +.align 4 +.Lmul4x_post: + // Final step. We see if result is larger than modulus, and + // if it is, subtract the modulus. But comparison implies + // subtraction. So we subtract modulus, see if it borrowed, + // and conditionally copy original value. + mov x0,x12 + mov x27,x12 // x0 copy + subs x10,x19,x14 + add x26,sp,#8*8 + sbcs x11,x20,x15 + sub x28,x5,#8*4 + +.Lmul4x_sub: + sbcs x12,x21,x16 + ldp x14,x15,[x3,#8*0] + sub x28,x28,#8*4 + ldp x19,x20,[x26,#8*0] + sbcs x13,x22,x17 + ldp x16,x17,[x3,#8*2] + add x3,x3,#8*4 + ldp x21,x22,[x26,#8*2] + add x26,x26,#8*4 + stp x10,x11,[x0,#8*0] + sbcs x10,x19,x14 + stp x12,x13,[x0,#8*2] + add x0,x0,#8*4 + sbcs x11,x20,x15 + cbnz x28,.Lmul4x_sub + + sbcs x12,x21,x16 + mov x26,sp + add x1,sp,#8*4 + ldp x6,x7,[x27,#8*0] + sbcs x13,x22,x17 + stp x10,x11,[x0,#8*0] + ldp x8,x9,[x27,#8*2] + stp x12,x13,[x0,#8*2] + ldp x19,x20,[x1,#8*0] + ldp x21,x22,[x1,#8*2] + sbcs xzr,x30,xzr // did it borrow? + ldr x30,[x29,#8] // pull return address + + sub x28,x5,#8*4 +.Lmul4x_cond_copy: + sub x28,x28,#8*4 + csel x10,x19,x6,lo + stp xzr,xzr,[x26,#8*0] + csel x11,x20,x7,lo + ldp x6,x7,[x27,#8*4] + ldp x19,x20,[x1,#8*4] + csel x12,x21,x8,lo + stp xzr,xzr,[x26,#8*2] + add x26,x26,#8*4 + csel x13,x22,x9,lo + ldp x8,x9,[x27,#8*6] + ldp x21,x22,[x1,#8*6] + add x1,x1,#8*4 + stp x10,x11,[x27,#8*0] + stp x12,x13,[x27,#8*2] + add x27,x27,#8*4 + cbnz x28,.Lmul4x_cond_copy + + csel x10,x19,x6,lo + stp xzr,xzr,[x26,#8*0] + csel x11,x20,x7,lo + stp xzr,xzr,[x26,#8*2] + csel x12,x21,x8,lo + stp xzr,xzr,[x26,#8*3] + csel x13,x22,x9,lo + stp xzr,xzr,[x26,#8*4] + stp x10,x11,[x27,#8*0] + stp x12,x13,[x27,#8*2] + + b .Lmul4x_done + +.align 4 +.Lmul4x4_post_condition: + adc x0,x0,xzr + ldr x1,[x29,#96] // pull rp + // x19-3,x0 hold result, x14-7 hold modulus + subs x6,x19,x14 + ldr x30,[x29,#8] // pull return address + sbcs x7,x20,x15 + stp xzr,xzr,[sp,#8*0] + sbcs x8,x21,x16 + stp xzr,xzr,[sp,#8*2] + sbcs x9,x22,x17 + stp xzr,xzr,[sp,#8*4] + sbcs xzr,x0,xzr // did it borrow? + stp xzr,xzr,[sp,#8*6] + + // x6-3 hold result-modulus + csel x6,x19,x6,lo + csel x7,x20,x7,lo + csel x8,x21,x8,lo + csel x9,x22,x9,lo + stp x6,x7,[x1,#8*0] + stp x8,x9,[x1,#8*2] + +.Lmul4x_done: + ldp x19,x20,[x29,#16] + mov sp,x29 + ldp x21,x22,[x29,#32] + mov x0,#1 + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + // x30 is popped earlier + AARCH64_VALIDATE_LINK_REGISTER + ret +.size bn_mul4x_mont,.-bn_mul4x_mont +.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 4 +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__) diff --git a/ring-0.17.14/pregenerated/armv8-mont-win64.S b/ring-0.17.14/pregenerated/armv8-mont-win64.S new file mode 100644 index 0000000000..b5f2b9eb80 --- /dev/null +++ b/ring-0.17.14/pregenerated/armv8-mont-win64.S @@ -0,0 +1,1422 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32) +.text + +.globl bn_mul_mont_nohw + +.def bn_mul_mont_nohw + .type 32 +.endef +.align 5 +bn_mul_mont_nohw: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-64]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + ldr x9,[x2],#8 // bp[0] + sub x22,sp,x5,lsl#3 + ldp x7,x8,[x1],#16 // ap[0..1] + lsl x5,x5,#3 + ldr x4,[x4] // *n0 + and x22,x22,#-16 // ABI says so + ldp x13,x14,[x3],#16 // np[0..1] + + mul x6,x7,x9 // ap[0]*bp[0] + sub x21,x5,#16 // j=num-2 + umulh x7,x7,x9 + mul x10,x8,x9 // ap[1]*bp[0] + umulh x11,x8,x9 + + mul x15,x6,x4 // "tp[0]"*n0 + mov sp,x22 // alloca + + // (*) mul x12,x13,x15 // np[0]*m1 + umulh x13,x13,x15 + mul x16,x14,x15 // np[1]*m1 + // (*) adds x12,x12,x6 // discarded + // (*) As for removal of first multiplication and addition + // instructions. The outcome of first addition is + // guaranteed to be zero, which leaves two computationally + // significant outcomes: it either carries or not. Then + // question is when does it carry? Is there alternative + // way to deduce it? If you follow operations, you can + // observe that condition for carry is quite simple: + // x6 being non-zero. So that carry can be calculated + // by adding -1 to x6. That's what next instruction does. + subs xzr,x6,#1 // (*) + umulh x17,x14,x15 + adc x13,x13,xzr + cbz x21,L1st_skip + +L1st: + ldr x8,[x1],#8 + adds x6,x10,x7 + sub x21,x21,#8 // j-- + adc x7,x11,xzr + + ldr x14,[x3],#8 + adds x12,x16,x13 + mul x10,x8,x9 // ap[j]*bp[0] + adc x13,x17,xzr + umulh x11,x8,x9 + + adds x12,x12,x6 + mul x16,x14,x15 // np[j]*m1 + adc x13,x13,xzr + umulh x17,x14,x15 + str x12,[x22],#8 // tp[j-1] + cbnz x21,L1st + +L1st_skip: + adds x6,x10,x7 + sub x1,x1,x5 // rewind x1 + adc x7,x11,xzr + + adds x12,x16,x13 + sub x3,x3,x5 // rewind x3 + adc x13,x17,xzr + + adds x12,x12,x6 + sub x20,x5,#8 // i=num-1 + adcs x13,x13,x7 + + adc x19,xzr,xzr // upmost overflow bit + stp x12,x13,[x22] + +Louter: + ldr x9,[x2],#8 // bp[i] + ldp x7,x8,[x1],#16 + ldr x23,[sp] // tp[0] + add x22,sp,#8 + + mul x6,x7,x9 // ap[0]*bp[i] + sub x21,x5,#16 // j=num-2 + umulh x7,x7,x9 + ldp x13,x14,[x3],#16 + mul x10,x8,x9 // ap[1]*bp[i] + adds x6,x6,x23 + umulh x11,x8,x9 + adc x7,x7,xzr + + mul x15,x6,x4 + sub x20,x20,#8 // i-- + + // (*) mul x12,x13,x15 // np[0]*m1 + umulh x13,x13,x15 + mul x16,x14,x15 // np[1]*m1 + // (*) adds x12,x12,x6 + subs xzr,x6,#1 // (*) + umulh x17,x14,x15 + cbz x21,Linner_skip + +Linner: + ldr x8,[x1],#8 + adc x13,x13,xzr + ldr x23,[x22],#8 // tp[j] + adds x6,x10,x7 + sub x21,x21,#8 // j-- + adc x7,x11,xzr + + adds x12,x16,x13 + ldr x14,[x3],#8 + adc x13,x17,xzr + + mul x10,x8,x9 // ap[j]*bp[i] + adds x6,x6,x23 + umulh x11,x8,x9 + adc x7,x7,xzr + + mul x16,x14,x15 // np[j]*m1 + adds x12,x12,x6 + umulh x17,x14,x15 + str x12,[x22,#-16] // tp[j-1] + cbnz x21,Linner + +Linner_skip: + ldr x23,[x22],#8 // tp[j] + adc x13,x13,xzr + adds x6,x10,x7 + sub x1,x1,x5 // rewind x1 + adc x7,x11,xzr + + adds x12,x16,x13 + sub x3,x3,x5 // rewind x3 + adcs x13,x17,x19 + adc x19,xzr,xzr + + adds x6,x6,x23 + adc x7,x7,xzr + + adds x12,x12,x6 + adcs x13,x13,x7 + adc x19,x19,xzr // upmost overflow bit + stp x12,x13,[x22,#-16] + + cbnz x20,Louter + + // Final step. We see if result is larger than modulus, and + // if it is, subtract the modulus. But comparison implies + // subtraction. So we subtract modulus, see if it borrowed, + // and conditionally copy original value. 
+ ldr x23,[sp] // tp[0] + add x22,sp,#8 + ldr x14,[x3],#8 // np[0] + subs x21,x5,#8 // j=num-1 and clear borrow + mov x1,x0 +Lsub: + sbcs x8,x23,x14 // tp[j]-np[j] + ldr x23,[x22],#8 + sub x21,x21,#8 // j-- + ldr x14,[x3],#8 + str x8,[x1],#8 // rp[j]=tp[j]-np[j] + cbnz x21,Lsub + + sbcs x8,x23,x14 + sbcs x19,x19,xzr // did it borrow? + str x8,[x1],#8 // rp[num-1] + + ldr x23,[sp] // tp[0] + add x22,sp,#8 + ldr x8,[x0],#8 // rp[0] + sub x5,x5,#8 // num-- + nop +Lcond_copy: + sub x5,x5,#8 // num-- + csel x14,x23,x8,lo // did it borrow? + ldr x23,[x22],#8 + ldr x8,[x0],#8 + str xzr,[x22,#-16] // wipe tp + str x14,[x0,#-16] + cbnz x5,Lcond_copy + + csel x14,x23,x8,lo + str xzr,[x22,#-8] // wipe tp + str x14,[x0,#-8] + + ldp x19,x20,[x29,#16] + mov sp,x29 + ldp x21,x22,[x29,#32] + mov x0,#1 + ldp x23,x24,[x29,#48] + ldr x29,[sp],#64 + AARCH64_VALIDATE_LINK_REGISTER + ret + +.globl bn_sqr8x_mont + +.def bn_sqr8x_mont + .type 32 +.endef +.align 5 +bn_sqr8x_mont: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp x0,x3,[sp,#96] // offload rp and np + + ldp x6,x7,[x1,#8*0] + ldp x8,x9,[x1,#8*2] + ldp x10,x11,[x1,#8*4] + ldp x12,x13,[x1,#8*6] + + sub x2,sp,x5,lsl#4 + lsl x5,x5,#3 + ldr x4,[x4] // *n0 + mov sp,x2 // alloca + sub x27,x5,#8*8 + b Lsqr8x_zero_start + +Lsqr8x_zero: + sub x27,x27,#8*8 + stp xzr,xzr,[x2,#8*0] + stp xzr,xzr,[x2,#8*2] + stp xzr,xzr,[x2,#8*4] + stp xzr,xzr,[x2,#8*6] +Lsqr8x_zero_start: + stp xzr,xzr,[x2,#8*8] + stp xzr,xzr,[x2,#8*10] + stp xzr,xzr,[x2,#8*12] + stp xzr,xzr,[x2,#8*14] + add x2,x2,#8*16 + cbnz x27,Lsqr8x_zero + + add x3,x1,x5 + add x1,x1,#8*8 + mov x19,xzr + mov x20,xzr + mov x21,xzr + mov x22,xzr + mov x23,xzr + mov x24,xzr + mov x25,xzr + mov x26,xzr + mov x2,sp + str x4,[x29,#112] // offload n0 + + // Multiply everything but a[i]*a[i] +.align 4 +Lsqr8x_outer_loop: + // a[1]a[0] (i) + // a[2]a[0] + // a[3]a[0] + // a[4]a[0] + // a[5]a[0] + // a[6]a[0] + // a[7]a[0] + // a[2]a[1] (ii) + // a[3]a[1] + // a[4]a[1] + // a[5]a[1] + // a[6]a[1] + // a[7]a[1] + // a[3]a[2] (iii) + // a[4]a[2] + // a[5]a[2] + // a[6]a[2] + // a[7]a[2] + // a[4]a[3] (iv) + // a[5]a[3] + // a[6]a[3] + // a[7]a[3] + // a[5]a[4] (v) + // a[6]a[4] + // a[7]a[4] + // a[6]a[5] (vi) + // a[7]a[5] + // a[7]a[6] (vii) + + mul x14,x7,x6 // lo(a[1..7]*a[0]) (i) + mul x15,x8,x6 + mul x16,x9,x6 + mul x17,x10,x6 + adds x20,x20,x14 // t[1]+lo(a[1]*a[0]) + mul x14,x11,x6 + adcs x21,x21,x15 + mul x15,x12,x6 + adcs x22,x22,x16 + mul x16,x13,x6 + adcs x23,x23,x17 + umulh x17,x7,x6 // hi(a[1..7]*a[0]) + adcs x24,x24,x14 + umulh x14,x8,x6 + adcs x25,x25,x15 + umulh x15,x9,x6 + adcs x26,x26,x16 + umulh x16,x10,x6 + stp x19,x20,[x2],#8*2 // t[0..1] + adc x19,xzr,xzr // t[8] + adds x21,x21,x17 // t[2]+lo(a[1]*a[0]) + umulh x17,x11,x6 + adcs x22,x22,x14 + umulh x14,x12,x6 + adcs x23,x23,x15 + umulh x15,x13,x6 + adcs x24,x24,x16 + mul x16,x8,x7 // lo(a[2..7]*a[1]) (ii) + adcs x25,x25,x17 + mul x17,x9,x7 + adcs x26,x26,x14 + mul x14,x10,x7 + adc x19,x19,x15 + + mul x15,x11,x7 + adds x22,x22,x16 + mul x16,x12,x7 + adcs x23,x23,x17 + mul x17,x13,x7 + adcs x24,x24,x14 + umulh x14,x8,x7 // hi(a[2..7]*a[1]) + adcs x25,x25,x15 + umulh x15,x9,x7 + adcs x26,x26,x16 + umulh x16,x10,x7 + adcs x19,x19,x17 + umulh x17,x11,x7 + stp x21,x22,[x2],#8*2 // t[2..3] + adc x20,xzr,xzr // t[9] + adds x23,x23,x14 + umulh x14,x12,x7 + adcs x24,x24,x15 + umulh x15,x13,x7 + adcs x25,x25,x16 + mul 
x16,x9,x8 // lo(a[3..7]*a[2]) (iii) + adcs x26,x26,x17 + mul x17,x10,x8 + adcs x19,x19,x14 + mul x14,x11,x8 + adc x20,x20,x15 + + mul x15,x12,x8 + adds x24,x24,x16 + mul x16,x13,x8 + adcs x25,x25,x17 + umulh x17,x9,x8 // hi(a[3..7]*a[2]) + adcs x26,x26,x14 + umulh x14,x10,x8 + adcs x19,x19,x15 + umulh x15,x11,x8 + adcs x20,x20,x16 + umulh x16,x12,x8 + stp x23,x24,[x2],#8*2 // t[4..5] + adc x21,xzr,xzr // t[10] + adds x25,x25,x17 + umulh x17,x13,x8 + adcs x26,x26,x14 + mul x14,x10,x9 // lo(a[4..7]*a[3]) (iv) + adcs x19,x19,x15 + mul x15,x11,x9 + adcs x20,x20,x16 + mul x16,x12,x9 + adc x21,x21,x17 + + mul x17,x13,x9 + adds x26,x26,x14 + umulh x14,x10,x9 // hi(a[4..7]*a[3]) + adcs x19,x19,x15 + umulh x15,x11,x9 + adcs x20,x20,x16 + umulh x16,x12,x9 + adcs x21,x21,x17 + umulh x17,x13,x9 + stp x25,x26,[x2],#8*2 // t[6..7] + adc x22,xzr,xzr // t[11] + adds x19,x19,x14 + mul x14,x11,x10 // lo(a[5..7]*a[4]) (v) + adcs x20,x20,x15 + mul x15,x12,x10 + adcs x21,x21,x16 + mul x16,x13,x10 + adc x22,x22,x17 + + umulh x17,x11,x10 // hi(a[5..7]*a[4]) + adds x20,x20,x14 + umulh x14,x12,x10 + adcs x21,x21,x15 + umulh x15,x13,x10 + adcs x22,x22,x16 + mul x16,x12,x11 // lo(a[6..7]*a[5]) (vi) + adc x23,xzr,xzr // t[12] + adds x21,x21,x17 + mul x17,x13,x11 + adcs x22,x22,x14 + umulh x14,x12,x11 // hi(a[6..7]*a[5]) + adc x23,x23,x15 + + umulh x15,x13,x11 + adds x22,x22,x16 + mul x16,x13,x12 // lo(a[7]*a[6]) (vii) + adcs x23,x23,x17 + umulh x17,x13,x12 // hi(a[7]*a[6]) + adc x24,xzr,xzr // t[13] + adds x23,x23,x14 + sub x27,x3,x1 // done yet? + adc x24,x24,x15 + + adds x24,x24,x16 + sub x14,x3,x5 // rewinded ap + adc x25,xzr,xzr // t[14] + add x25,x25,x17 + + cbz x27,Lsqr8x_outer_break + + mov x4,x6 + ldp x6,x7,[x2,#8*0] + ldp x8,x9,[x2,#8*2] + ldp x10,x11,[x2,#8*4] + ldp x12,x13,[x2,#8*6] + adds x19,x19,x6 + adcs x20,x20,x7 + ldp x6,x7,[x1,#8*0] + adcs x21,x21,x8 + adcs x22,x22,x9 + ldp x8,x9,[x1,#8*2] + adcs x23,x23,x10 + adcs x24,x24,x11 + ldp x10,x11,[x1,#8*4] + adcs x25,x25,x12 + mov x0,x1 + adcs x26,xzr,x13 + ldp x12,x13,[x1,#8*6] + add x1,x1,#8*8 + //adc x28,xzr,xzr // moved below + mov x27,#-8*8 + + // a[8]a[0] + // a[9]a[0] + // a[a]a[0] + // a[b]a[0] + // a[c]a[0] + // a[d]a[0] + // a[e]a[0] + // a[f]a[0] + // a[8]a[1] + // a[f]a[1]........................ + // a[8]a[2] + // a[f]a[2]........................ + // a[8]a[3] + // a[f]a[3]........................ + // a[8]a[4] + // a[f]a[4]........................ + // a[8]a[5] + // a[f]a[5]........................ + // a[8]a[6] + // a[f]a[6]........................ + // a[8]a[7] + // a[f]a[7]........................ +Lsqr8x_mul: + mul x14,x6,x4 + adc x28,xzr,xzr // carry bit, modulo-scheduled + mul x15,x7,x4 + add x27,x27,#8 + mul x16,x8,x4 + mul x17,x9,x4 + adds x19,x19,x14 + mul x14,x10,x4 + adcs x20,x20,x15 + mul x15,x11,x4 + adcs x21,x21,x16 + mul x16,x12,x4 + adcs x22,x22,x17 + mul x17,x13,x4 + adcs x23,x23,x14 + umulh x14,x6,x4 + adcs x24,x24,x15 + umulh x15,x7,x4 + adcs x25,x25,x16 + umulh x16,x8,x4 + adcs x26,x26,x17 + umulh x17,x9,x4 + adc x28,x28,xzr + str x19,[x2],#8 + adds x19,x20,x14 + umulh x14,x10,x4 + adcs x20,x21,x15 + umulh x15,x11,x4 + adcs x21,x22,x16 + umulh x16,x12,x4 + adcs x22,x23,x17 + umulh x17,x13,x4 + ldr x4,[x0,x27] + adcs x23,x24,x14 + adcs x24,x25,x15 + adcs x25,x26,x16 + adcs x26,x28,x17 + //adc x28,xzr,xzr // moved above + cbnz x27,Lsqr8x_mul + // note that carry flag is guaranteed + // to be zero at this point + cmp x1,x3 // done yet? 
+ b.eq Lsqr8x_break + + ldp x6,x7,[x2,#8*0] + ldp x8,x9,[x2,#8*2] + ldp x10,x11,[x2,#8*4] + ldp x12,x13,[x2,#8*6] + adds x19,x19,x6 + ldr x4,[x0,#-8*8] + adcs x20,x20,x7 + ldp x6,x7,[x1,#8*0] + adcs x21,x21,x8 + adcs x22,x22,x9 + ldp x8,x9,[x1,#8*2] + adcs x23,x23,x10 + adcs x24,x24,x11 + ldp x10,x11,[x1,#8*4] + adcs x25,x25,x12 + mov x27,#-8*8 + adcs x26,x26,x13 + ldp x12,x13,[x1,#8*6] + add x1,x1,#8*8 + //adc x28,xzr,xzr // moved above + b Lsqr8x_mul + +.align 4 +Lsqr8x_break: + ldp x6,x7,[x0,#8*0] + add x1,x0,#8*8 + ldp x8,x9,[x0,#8*2] + sub x14,x3,x1 // is it last iteration? + ldp x10,x11,[x0,#8*4] + sub x15,x2,x14 + ldp x12,x13,[x0,#8*6] + cbz x14,Lsqr8x_outer_loop + + stp x19,x20,[x2,#8*0] + ldp x19,x20,[x15,#8*0] + stp x21,x22,[x2,#8*2] + ldp x21,x22,[x15,#8*2] + stp x23,x24,[x2,#8*4] + ldp x23,x24,[x15,#8*4] + stp x25,x26,[x2,#8*6] + mov x2,x15 + ldp x25,x26,[x15,#8*6] + b Lsqr8x_outer_loop + +.align 4 +Lsqr8x_outer_break: + // Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0] + ldp x7,x9,[x14,#8*0] // recall that x14 is &a[0] + ldp x15,x16,[sp,#8*1] + ldp x11,x13,[x14,#8*2] + add x1,x14,#8*4 + ldp x17,x14,[sp,#8*3] + + stp x19,x20,[x2,#8*0] + mul x19,x7,x7 + stp x21,x22,[x2,#8*2] + umulh x7,x7,x7 + stp x23,x24,[x2,#8*4] + mul x8,x9,x9 + stp x25,x26,[x2,#8*6] + mov x2,sp + umulh x9,x9,x9 + adds x20,x7,x15,lsl#1 + extr x15,x16,x15,#63 + sub x27,x5,#8*4 + +Lsqr4x_shift_n_add: + adcs x21,x8,x15 + extr x16,x17,x16,#63 + sub x27,x27,#8*4 + adcs x22,x9,x16 + ldp x15,x16,[x2,#8*5] + mul x10,x11,x11 + ldp x7,x9,[x1],#8*2 + umulh x11,x11,x11 + mul x12,x13,x13 + umulh x13,x13,x13 + extr x17,x14,x17,#63 + stp x19,x20,[x2,#8*0] + adcs x23,x10,x17 + extr x14,x15,x14,#63 + stp x21,x22,[x2,#8*2] + adcs x24,x11,x14 + ldp x17,x14,[x2,#8*7] + extr x15,x16,x15,#63 + adcs x25,x12,x15 + extr x16,x17,x16,#63 + adcs x26,x13,x16 + ldp x15,x16,[x2,#8*9] + mul x6,x7,x7 + ldp x11,x13,[x1],#8*2 + umulh x7,x7,x7 + mul x8,x9,x9 + umulh x9,x9,x9 + stp x23,x24,[x2,#8*4] + extr x17,x14,x17,#63 + stp x25,x26,[x2,#8*6] + add x2,x2,#8*8 + adcs x19,x6,x17 + extr x14,x15,x14,#63 + adcs x20,x7,x14 + ldp x17,x14,[x2,#8*3] + extr x15,x16,x15,#63 + cbnz x27,Lsqr4x_shift_n_add + ldp x1,x4,[x29,#104] // pull np and n0 + + adcs x21,x8,x15 + extr x16,x17,x16,#63 + adcs x22,x9,x16 + ldp x15,x16,[x2,#8*5] + mul x10,x11,x11 + umulh x11,x11,x11 + stp x19,x20,[x2,#8*0] + mul x12,x13,x13 + umulh x13,x13,x13 + stp x21,x22,[x2,#8*2] + extr x17,x14,x17,#63 + adcs x23,x10,x17 + extr x14,x15,x14,#63 + ldp x19,x20,[sp,#8*0] + adcs x24,x11,x14 + extr x15,x16,x15,#63 + ldp x6,x7,[x1,#8*0] + adcs x25,x12,x15 + extr x16,xzr,x16,#63 + ldp x8,x9,[x1,#8*2] + adc x26,x13,x16 + ldp x10,x11,[x1,#8*4] + + // Reduce by 512 bits per iteration + mul x28,x4,x19 // t[0]*n0 + ldp x12,x13,[x1,#8*6] + add x3,x1,x5 + ldp x21,x22,[sp,#8*2] + stp x23,x24,[x2,#8*4] + ldp x23,x24,[sp,#8*4] + stp x25,x26,[x2,#8*6] + ldp x25,x26,[sp,#8*6] + add x1,x1,#8*8 + mov x30,xzr // initial top-most carry + mov x2,sp + mov x27,#8 + +Lsqr8x_reduction: + // (*) mul x14,x6,x28 // lo(n[0-7])*lo(t[0]*n0) + mul x15,x7,x28 + sub x27,x27,#1 + mul x16,x8,x28 + str x28,[x2],#8 // put aside t[0]*n0 for tail processing + mul x17,x9,x28 + // (*) adds xzr,x19,x14 + subs xzr,x19,#1 // (*) + mul x14,x10,x28 + adcs x19,x20,x15 + mul x15,x11,x28 + adcs x20,x21,x16 + mul x16,x12,x28 + adcs x21,x22,x17 + mul x17,x13,x28 + adcs x22,x23,x14 + umulh x14,x6,x28 // hi(n[0-7])*lo(t[0]*n0) + adcs x23,x24,x15 + umulh x15,x7,x28 + adcs x24,x25,x16 + umulh x16,x8,x28 + adcs x25,x26,x17 + 
umulh x17,x9,x28 + adc x26,xzr,xzr + adds x19,x19,x14 + umulh x14,x10,x28 + adcs x20,x20,x15 + umulh x15,x11,x28 + adcs x21,x21,x16 + umulh x16,x12,x28 + adcs x22,x22,x17 + umulh x17,x13,x28 + mul x28,x4,x19 // next t[0]*n0 + adcs x23,x23,x14 + adcs x24,x24,x15 + adcs x25,x25,x16 + adc x26,x26,x17 + cbnz x27,Lsqr8x_reduction + + ldp x14,x15,[x2,#8*0] + ldp x16,x17,[x2,#8*2] + mov x0,x2 + sub x27,x3,x1 // done yet? + adds x19,x19,x14 + adcs x20,x20,x15 + ldp x14,x15,[x2,#8*4] + adcs x21,x21,x16 + adcs x22,x22,x17 + ldp x16,x17,[x2,#8*6] + adcs x23,x23,x14 + adcs x24,x24,x15 + adcs x25,x25,x16 + adcs x26,x26,x17 + //adc x28,xzr,xzr // moved below + cbz x27,Lsqr8x8_post_condition + + ldr x4,[x2,#-8*8] + ldp x6,x7,[x1,#8*0] + ldp x8,x9,[x1,#8*2] + ldp x10,x11,[x1,#8*4] + mov x27,#-8*8 + ldp x12,x13,[x1,#8*6] + add x1,x1,#8*8 + +Lsqr8x_tail: + mul x14,x6,x4 + adc x28,xzr,xzr // carry bit, modulo-scheduled + mul x15,x7,x4 + add x27,x27,#8 + mul x16,x8,x4 + mul x17,x9,x4 + adds x19,x19,x14 + mul x14,x10,x4 + adcs x20,x20,x15 + mul x15,x11,x4 + adcs x21,x21,x16 + mul x16,x12,x4 + adcs x22,x22,x17 + mul x17,x13,x4 + adcs x23,x23,x14 + umulh x14,x6,x4 + adcs x24,x24,x15 + umulh x15,x7,x4 + adcs x25,x25,x16 + umulh x16,x8,x4 + adcs x26,x26,x17 + umulh x17,x9,x4 + adc x28,x28,xzr + str x19,[x2],#8 + adds x19,x20,x14 + umulh x14,x10,x4 + adcs x20,x21,x15 + umulh x15,x11,x4 + adcs x21,x22,x16 + umulh x16,x12,x4 + adcs x22,x23,x17 + umulh x17,x13,x4 + ldr x4,[x0,x27] + adcs x23,x24,x14 + adcs x24,x25,x15 + adcs x25,x26,x16 + adcs x26,x28,x17 + //adc x28,xzr,xzr // moved above + cbnz x27,Lsqr8x_tail + // note that carry flag is guaranteed + // to be zero at this point + ldp x6,x7,[x2,#8*0] + sub x27,x3,x1 // done yet? + sub x16,x3,x5 // rewinded np + ldp x8,x9,[x2,#8*2] + ldp x10,x11,[x2,#8*4] + ldp x12,x13,[x2,#8*6] + cbz x27,Lsqr8x_tail_break + + ldr x4,[x0,#-8*8] + adds x19,x19,x6 + adcs x20,x20,x7 + ldp x6,x7,[x1,#8*0] + adcs x21,x21,x8 + adcs x22,x22,x9 + ldp x8,x9,[x1,#8*2] + adcs x23,x23,x10 + adcs x24,x24,x11 + ldp x10,x11,[x1,#8*4] + adcs x25,x25,x12 + mov x27,#-8*8 + adcs x26,x26,x13 + ldp x12,x13,[x1,#8*6] + add x1,x1,#8*8 + //adc x28,xzr,xzr // moved above + b Lsqr8x_tail + +.align 4 +Lsqr8x_tail_break: + ldr x4,[x29,#112] // pull n0 + add x27,x2,#8*8 // end of current t[num] window + + subs xzr,x30,#1 // "move" top-most carry to carry bit + adcs x14,x19,x6 + adcs x15,x20,x7 + ldp x19,x20,[x0,#8*0] + adcs x21,x21,x8 + ldp x6,x7,[x16,#8*0] // recall that x16 is &n[0] + adcs x22,x22,x9 + ldp x8,x9,[x16,#8*2] + adcs x23,x23,x10 + adcs x24,x24,x11 + ldp x10,x11,[x16,#8*4] + adcs x25,x25,x12 + adcs x26,x26,x13 + ldp x12,x13,[x16,#8*6] + add x1,x16,#8*8 + adc x30,xzr,xzr // top-most carry + mul x28,x4,x19 + stp x14,x15,[x2,#8*0] + stp x21,x22,[x2,#8*2] + ldp x21,x22,[x0,#8*2] + stp x23,x24,[x2,#8*4] + ldp x23,x24,[x0,#8*4] + cmp x27,x29 // did we hit the bottom? + stp x25,x26,[x2,#8*6] + mov x2,x0 // slide the window + ldp x25,x26,[x0,#8*6] + mov x27,#8 + b.ne Lsqr8x_reduction + + // Final step. We see if result is larger than modulus, and + // if it is, subtract the modulus. But comparison implies + // subtraction. So we subtract modulus, see if it borrowed, + // and conditionally copy original value. 
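+	// (Descriptive note on the trick above: after "sbcs", a clear carry
+	// flag — condition "lo" — means the subtraction borrowed, i.e. the
+	// value was already smaller than the modulus. The "csel ..., lo"
+	// instructions in the cond-copy loop below therefore keep the
+	// original words when the subtraction borrowed and the subtracted
+	// words otherwise, so the final reduction avoids a secret-dependent
+	// branch.)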
+ ldr x0,[x29,#96] // pull rp + add x2,x2,#8*8 + subs x14,x19,x6 + sbcs x15,x20,x7 + sub x27,x5,#8*8 + mov x3,x0 // x0 copy + +Lsqr8x_sub: + sbcs x16,x21,x8 + ldp x6,x7,[x1,#8*0] + sbcs x17,x22,x9 + stp x14,x15,[x0,#8*0] + sbcs x14,x23,x10 + ldp x8,x9,[x1,#8*2] + sbcs x15,x24,x11 + stp x16,x17,[x0,#8*2] + sbcs x16,x25,x12 + ldp x10,x11,[x1,#8*4] + sbcs x17,x26,x13 + ldp x12,x13,[x1,#8*6] + add x1,x1,#8*8 + ldp x19,x20,[x2,#8*0] + sub x27,x27,#8*8 + ldp x21,x22,[x2,#8*2] + ldp x23,x24,[x2,#8*4] + ldp x25,x26,[x2,#8*6] + add x2,x2,#8*8 + stp x14,x15,[x0,#8*4] + sbcs x14,x19,x6 + stp x16,x17,[x0,#8*6] + add x0,x0,#8*8 + sbcs x15,x20,x7 + cbnz x27,Lsqr8x_sub + + sbcs x16,x21,x8 + mov x2,sp + add x1,sp,x5 + ldp x6,x7,[x3,#8*0] + sbcs x17,x22,x9 + stp x14,x15,[x0,#8*0] + sbcs x14,x23,x10 + ldp x8,x9,[x3,#8*2] + sbcs x15,x24,x11 + stp x16,x17,[x0,#8*2] + sbcs x16,x25,x12 + ldp x19,x20,[x1,#8*0] + sbcs x17,x26,x13 + ldp x21,x22,[x1,#8*2] + sbcs xzr,x30,xzr // did it borrow? + ldr x30,[x29,#8] // pull return address + stp x14,x15,[x0,#8*4] + stp x16,x17,[x0,#8*6] + + sub x27,x5,#8*4 +Lsqr4x_cond_copy: + sub x27,x27,#8*4 + csel x14,x19,x6,lo + stp xzr,xzr,[x2,#8*0] + csel x15,x20,x7,lo + ldp x6,x7,[x3,#8*4] + ldp x19,x20,[x1,#8*4] + csel x16,x21,x8,lo + stp xzr,xzr,[x2,#8*2] + add x2,x2,#8*4 + csel x17,x22,x9,lo + ldp x8,x9,[x3,#8*6] + ldp x21,x22,[x1,#8*6] + add x1,x1,#8*4 + stp x14,x15,[x3,#8*0] + stp x16,x17,[x3,#8*2] + add x3,x3,#8*4 + stp xzr,xzr,[x1,#8*0] + stp xzr,xzr,[x1,#8*2] + cbnz x27,Lsqr4x_cond_copy + + csel x14,x19,x6,lo + stp xzr,xzr,[x2,#8*0] + csel x15,x20,x7,lo + stp xzr,xzr,[x2,#8*2] + csel x16,x21,x8,lo + csel x17,x22,x9,lo + stp x14,x15,[x3,#8*0] + stp x16,x17,[x3,#8*2] + + b Lsqr8x_done + +.align 4 +Lsqr8x8_post_condition: + adc x28,xzr,xzr + ldr x30,[x29,#8] // pull return address + // x19-7,x28 hold result, x6-7 hold modulus + subs x6,x19,x6 + ldr x1,[x29,#96] // pull rp + sbcs x7,x20,x7 + stp xzr,xzr,[sp,#8*0] + sbcs x8,x21,x8 + stp xzr,xzr,[sp,#8*2] + sbcs x9,x22,x9 + stp xzr,xzr,[sp,#8*4] + sbcs x10,x23,x10 + stp xzr,xzr,[sp,#8*6] + sbcs x11,x24,x11 + stp xzr,xzr,[sp,#8*8] + sbcs x12,x25,x12 + stp xzr,xzr,[sp,#8*10] + sbcs x13,x26,x13 + stp xzr,xzr,[sp,#8*12] + sbcs x28,x28,xzr // did it borrow? + stp xzr,xzr,[sp,#8*14] + + // x6-7 hold result-modulus + csel x6,x19,x6,lo + csel x7,x20,x7,lo + csel x8,x21,x8,lo + csel x9,x22,x9,lo + stp x6,x7,[x1,#8*0] + csel x10,x23,x10,lo + csel x11,x24,x11,lo + stp x8,x9,[x1,#8*2] + csel x12,x25,x12,lo + csel x13,x26,x13,lo + stp x10,x11,[x1,#8*4] + stp x12,x13,[x1,#8*6] + +Lsqr8x_done: + ldp x19,x20,[x29,#16] + mov sp,x29 + ldp x21,x22,[x29,#32] + mov x0,#1 + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + // x30 is popped earlier + AARCH64_VALIDATE_LINK_REGISTER + ret + +.globl bn_mul4x_mont + +.def bn_mul4x_mont + .type 32 +.endef +.align 5 +bn_mul4x_mont: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + sub x26,sp,x5,lsl#3 + lsl x5,x5,#3 + ldr x4,[x4] // *n0 + sub sp,x26,#8*4 // alloca + + add x10,x2,x5 + add x27,x1,x5 + stp x0,x10,[x29,#96] // offload rp and &b[num] + + ldr x24,[x2,#8*0] // b[0] + ldp x6,x7,[x1,#8*0] // a[0..3] + ldp x8,x9,[x1,#8*2] + add x1,x1,#8*4 + mov x19,xzr + mov x20,xzr + mov x21,xzr + mov x22,xzr + ldp x14,x15,[x3,#8*0] // n[0..3] + ldp x16,x17,[x3,#8*2] + adds x3,x3,#8*4 // clear carry bit + mov x0,xzr + mov x28,#0 + mov x26,sp + +Loop_mul4x_1st_reduction: + mul x10,x6,x24 // lo(a[0..3]*b[0]) + adc x0,x0,xzr // modulo-scheduled + mul x11,x7,x24 + add x28,x28,#8 + mul x12,x8,x24 + and x28,x28,#31 + mul x13,x9,x24 + adds x19,x19,x10 + umulh x10,x6,x24 // hi(a[0..3]*b[0]) + adcs x20,x20,x11 + mul x25,x19,x4 // t[0]*n0 + adcs x21,x21,x12 + umulh x11,x7,x24 + adcs x22,x22,x13 + umulh x12,x8,x24 + adc x23,xzr,xzr + umulh x13,x9,x24 + ldr x24,[x2,x28] // next b[i] (or b[0]) + adds x20,x20,x10 + // (*) mul x10,x14,x25 // lo(n[0..3]*t[0]*n0) + str x25,[x26],#8 // put aside t[0]*n0 for tail processing + adcs x21,x21,x11 + mul x11,x15,x25 + adcs x22,x22,x12 + mul x12,x16,x25 + adc x23,x23,x13 // can't overflow + mul x13,x17,x25 + // (*) adds xzr,x19,x10 + subs xzr,x19,#1 // (*) + umulh x10,x14,x25 // hi(n[0..3]*t[0]*n0) + adcs x19,x20,x11 + umulh x11,x15,x25 + adcs x20,x21,x12 + umulh x12,x16,x25 + adcs x21,x22,x13 + umulh x13,x17,x25 + adcs x22,x23,x0 + adc x0,xzr,xzr + adds x19,x19,x10 + sub x10,x27,x1 + adcs x20,x20,x11 + adcs x21,x21,x12 + adcs x22,x22,x13 + //adc x0,x0,xzr + cbnz x28,Loop_mul4x_1st_reduction + + cbz x10,Lmul4x4_post_condition + + ldp x6,x7,[x1,#8*0] // a[4..7] + ldp x8,x9,[x1,#8*2] + add x1,x1,#8*4 + ldr x25,[sp] // a[0]*n0 + ldp x14,x15,[x3,#8*0] // n[4..7] + ldp x16,x17,[x3,#8*2] + add x3,x3,#8*4 + +Loop_mul4x_1st_tail: + mul x10,x6,x24 // lo(a[4..7]*b[i]) + adc x0,x0,xzr // modulo-scheduled + mul x11,x7,x24 + add x28,x28,#8 + mul x12,x8,x24 + and x28,x28,#31 + mul x13,x9,x24 + adds x19,x19,x10 + umulh x10,x6,x24 // hi(a[4..7]*b[i]) + adcs x20,x20,x11 + umulh x11,x7,x24 + adcs x21,x21,x12 + umulh x12,x8,x24 + adcs x22,x22,x13 + umulh x13,x9,x24 + adc x23,xzr,xzr + ldr x24,[x2,x28] // next b[i] (or b[0]) + adds x20,x20,x10 + mul x10,x14,x25 // lo(n[4..7]*a[0]*n0) + adcs x21,x21,x11 + mul x11,x15,x25 + adcs x22,x22,x12 + mul x12,x16,x25 + adc x23,x23,x13 // can't overflow + mul x13,x17,x25 + adds x19,x19,x10 + umulh x10,x14,x25 // hi(n[4..7]*a[0]*n0) + adcs x20,x20,x11 + umulh x11,x15,x25 + adcs x21,x21,x12 + umulh x12,x16,x25 + adcs x22,x22,x13 + adcs x23,x23,x0 + umulh x13,x17,x25 + adc x0,xzr,xzr + ldr x25,[sp,x28] // next t[0]*n0 + str x19,[x26],#8 // result!!! + adds x19,x20,x10 + sub x10,x27,x1 // done yet? + adcs x20,x21,x11 + adcs x21,x22,x12 + adcs x22,x23,x13 + //adc x0,x0,xzr + cbnz x28,Loop_mul4x_1st_tail + + sub x11,x27,x5 // rewinded x1 + cbz x10,Lmul4x_proceed + + ldp x6,x7,[x1,#8*0] + ldp x8,x9,[x1,#8*2] + add x1,x1,#8*4 + ldp x14,x15,[x3,#8*0] + ldp x16,x17,[x3,#8*2] + add x3,x3,#8*4 + b Loop_mul4x_1st_tail + +.align 5 +Lmul4x_proceed: + ldr x24,[x2,#8*4]! // *++b + adc x30,x0,xzr + ldp x6,x7,[x11,#8*0] // a[0..3] + sub x3,x3,x5 // rewind np + ldp x8,x9,[x11,#8*2] + add x1,x11,#8*4 + + stp x19,x20,[x26,#8*0] // result!!! + ldp x19,x20,[sp,#8*4] // t[0..3] + stp x21,x22,[x26,#8*2] // result!!! 
+ ldp x21,x22,[sp,#8*6] + + ldp x14,x15,[x3,#8*0] // n[0..3] + mov x26,sp + ldp x16,x17,[x3,#8*2] + adds x3,x3,#8*4 // clear carry bit + mov x0,xzr + +.align 4 +Loop_mul4x_reduction: + mul x10,x6,x24 // lo(a[0..3]*b[4]) + adc x0,x0,xzr // modulo-scheduled + mul x11,x7,x24 + add x28,x28,#8 + mul x12,x8,x24 + and x28,x28,#31 + mul x13,x9,x24 + adds x19,x19,x10 + umulh x10,x6,x24 // hi(a[0..3]*b[4]) + adcs x20,x20,x11 + mul x25,x19,x4 // t[0]*n0 + adcs x21,x21,x12 + umulh x11,x7,x24 + adcs x22,x22,x13 + umulh x12,x8,x24 + adc x23,xzr,xzr + umulh x13,x9,x24 + ldr x24,[x2,x28] // next b[i] + adds x20,x20,x10 + // (*) mul x10,x14,x25 + str x25,[x26],#8 // put aside t[0]*n0 for tail processing + adcs x21,x21,x11 + mul x11,x15,x25 // lo(n[0..3]*t[0]*n0 + adcs x22,x22,x12 + mul x12,x16,x25 + adc x23,x23,x13 // can't overflow + mul x13,x17,x25 + // (*) adds xzr,x19,x10 + subs xzr,x19,#1 // (*) + umulh x10,x14,x25 // hi(n[0..3]*t[0]*n0 + adcs x19,x20,x11 + umulh x11,x15,x25 + adcs x20,x21,x12 + umulh x12,x16,x25 + adcs x21,x22,x13 + umulh x13,x17,x25 + adcs x22,x23,x0 + adc x0,xzr,xzr + adds x19,x19,x10 + adcs x20,x20,x11 + adcs x21,x21,x12 + adcs x22,x22,x13 + //adc x0,x0,xzr + cbnz x28,Loop_mul4x_reduction + + adc x0,x0,xzr + ldp x10,x11,[x26,#8*4] // t[4..7] + ldp x12,x13,[x26,#8*6] + ldp x6,x7,[x1,#8*0] // a[4..7] + ldp x8,x9,[x1,#8*2] + add x1,x1,#8*4 + adds x19,x19,x10 + adcs x20,x20,x11 + adcs x21,x21,x12 + adcs x22,x22,x13 + //adc x0,x0,xzr + + ldr x25,[sp] // t[0]*n0 + ldp x14,x15,[x3,#8*0] // n[4..7] + ldp x16,x17,[x3,#8*2] + add x3,x3,#8*4 + +.align 4 +Loop_mul4x_tail: + mul x10,x6,x24 // lo(a[4..7]*b[4]) + adc x0,x0,xzr // modulo-scheduled + mul x11,x7,x24 + add x28,x28,#8 + mul x12,x8,x24 + and x28,x28,#31 + mul x13,x9,x24 + adds x19,x19,x10 + umulh x10,x6,x24 // hi(a[4..7]*b[4]) + adcs x20,x20,x11 + umulh x11,x7,x24 + adcs x21,x21,x12 + umulh x12,x8,x24 + adcs x22,x22,x13 + umulh x13,x9,x24 + adc x23,xzr,xzr + ldr x24,[x2,x28] // next b[i] + adds x20,x20,x10 + mul x10,x14,x25 // lo(n[4..7]*t[0]*n0) + adcs x21,x21,x11 + mul x11,x15,x25 + adcs x22,x22,x12 + mul x12,x16,x25 + adc x23,x23,x13 // can't overflow + mul x13,x17,x25 + adds x19,x19,x10 + umulh x10,x14,x25 // hi(n[4..7]*t[0]*n0) + adcs x20,x20,x11 + umulh x11,x15,x25 + adcs x21,x21,x12 + umulh x12,x16,x25 + adcs x22,x22,x13 + umulh x13,x17,x25 + adcs x23,x23,x0 + ldr x25,[sp,x28] // next a[0]*n0 + adc x0,xzr,xzr + str x19,[x26],#8 // result!!! + adds x19,x20,x10 + sub x10,x27,x1 // done yet? + adcs x20,x21,x11 + adcs x21,x22,x12 + adcs x22,x23,x13 + //adc x0,x0,xzr + cbnz x28,Loop_mul4x_tail + + sub x11,x3,x5 // rewinded np? + adc x0,x0,xzr + cbz x10,Loop_mul4x_break + + ldp x10,x11,[x26,#8*4] + ldp x12,x13,[x26,#8*6] + ldp x6,x7,[x1,#8*0] + ldp x8,x9,[x1,#8*2] + add x1,x1,#8*4 + adds x19,x19,x10 + adcs x20,x20,x11 + adcs x21,x21,x12 + adcs x22,x22,x13 + //adc x0,x0,xzr + ldp x14,x15,[x3,#8*0] + ldp x16,x17,[x3,#8*2] + add x3,x3,#8*4 + b Loop_mul4x_tail + +.align 4 +Loop_mul4x_break: + ldp x12,x13,[x29,#96] // pull rp and &b[num] + adds x19,x19,x30 + add x2,x2,#8*4 // bp++ + adcs x20,x20,xzr + sub x1,x1,x5 // rewind ap + adcs x21,x21,xzr + stp x19,x20,[x26,#8*0] // result!!! + adcs x22,x22,xzr + ldp x19,x20,[sp,#8*4] // t[0..3] + adc x30,x0,xzr + stp x21,x22,[x26,#8*2] // result!!! + cmp x2,x13 // done yet? 
+ ldp x21,x22,[sp,#8*6] + ldp x14,x15,[x11,#8*0] // n[0..3] + ldp x16,x17,[x11,#8*2] + add x3,x11,#8*4 + b.eq Lmul4x_post + + ldr x24,[x2] + ldp x6,x7,[x1,#8*0] // a[0..3] + ldp x8,x9,[x1,#8*2] + adds x1,x1,#8*4 // clear carry bit + mov x0,xzr + mov x26,sp + b Loop_mul4x_reduction + +.align 4 +Lmul4x_post: + // Final step. We see if result is larger than modulus, and + // if it is, subtract the modulus. But comparison implies + // subtraction. So we subtract modulus, see if it borrowed, + // and conditionally copy original value. + mov x0,x12 + mov x27,x12 // x0 copy + subs x10,x19,x14 + add x26,sp,#8*8 + sbcs x11,x20,x15 + sub x28,x5,#8*4 + +Lmul4x_sub: + sbcs x12,x21,x16 + ldp x14,x15,[x3,#8*0] + sub x28,x28,#8*4 + ldp x19,x20,[x26,#8*0] + sbcs x13,x22,x17 + ldp x16,x17,[x3,#8*2] + add x3,x3,#8*4 + ldp x21,x22,[x26,#8*2] + add x26,x26,#8*4 + stp x10,x11,[x0,#8*0] + sbcs x10,x19,x14 + stp x12,x13,[x0,#8*2] + add x0,x0,#8*4 + sbcs x11,x20,x15 + cbnz x28,Lmul4x_sub + + sbcs x12,x21,x16 + mov x26,sp + add x1,sp,#8*4 + ldp x6,x7,[x27,#8*0] + sbcs x13,x22,x17 + stp x10,x11,[x0,#8*0] + ldp x8,x9,[x27,#8*2] + stp x12,x13,[x0,#8*2] + ldp x19,x20,[x1,#8*0] + ldp x21,x22,[x1,#8*2] + sbcs xzr,x30,xzr // did it borrow? + ldr x30,[x29,#8] // pull return address + + sub x28,x5,#8*4 +Lmul4x_cond_copy: + sub x28,x28,#8*4 + csel x10,x19,x6,lo + stp xzr,xzr,[x26,#8*0] + csel x11,x20,x7,lo + ldp x6,x7,[x27,#8*4] + ldp x19,x20,[x1,#8*4] + csel x12,x21,x8,lo + stp xzr,xzr,[x26,#8*2] + add x26,x26,#8*4 + csel x13,x22,x9,lo + ldp x8,x9,[x27,#8*6] + ldp x21,x22,[x1,#8*6] + add x1,x1,#8*4 + stp x10,x11,[x27,#8*0] + stp x12,x13,[x27,#8*2] + add x27,x27,#8*4 + cbnz x28,Lmul4x_cond_copy + + csel x10,x19,x6,lo + stp xzr,xzr,[x26,#8*0] + csel x11,x20,x7,lo + stp xzr,xzr,[x26,#8*2] + csel x12,x21,x8,lo + stp xzr,xzr,[x26,#8*3] + csel x13,x22,x9,lo + stp xzr,xzr,[x26,#8*4] + stp x10,x11,[x27,#8*0] + stp x12,x13,[x27,#8*2] + + b Lmul4x_done + +.align 4 +Lmul4x4_post_condition: + adc x0,x0,xzr + ldr x1,[x29,#96] // pull rp + // x19-3,x0 hold result, x14-7 hold modulus + subs x6,x19,x14 + ldr x30,[x29,#8] // pull return address + sbcs x7,x20,x15 + stp xzr,xzr,[sp,#8*0] + sbcs x8,x21,x16 + stp xzr,xzr,[sp,#8*2] + sbcs x9,x22,x17 + stp xzr,xzr,[sp,#8*4] + sbcs xzr,x0,xzr // did it borrow? + stp xzr,xzr,[sp,#8*6] + + // x6-3 hold result-modulus + csel x6,x19,x6,lo + csel x7,x20,x7,lo + csel x8,x21,x8,lo + csel x9,x22,x9,lo + stp x6,x7,[x1,#8*0] + stp x8,x9,[x1,#8*2] + +Lmul4x_done: + ldp x19,x20,[x29,#16] + mov sp,x29 + ldp x21,x22,[x29,#32] + mov x0,#1 + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + // x30 is popped earlier + AARCH64_VALIDATE_LINK_REGISTER + ret + +.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 4 +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32) diff --git a/ring-0.17.14/pregenerated/bsaes-armv7-linux32.S b/ring-0.17.14/pregenerated/bsaes-armv7-linux32.S new file mode 100644 index 0000000000..771c2cdf7b --- /dev/null +++ b/ring-0.17.14/pregenerated/bsaes-armv7-linux32.S @@ -0,0 +1,796 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__ELF__) +@ Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved. +@ +@ Licensed under the Apache License, Version 2.0 (the "License"); +@ you may not use this file except in compliance with the License. +@ You may obtain a copy of the License at +@ +@ https://www.apache.org/licenses/LICENSE-2.0 +@ +@ Unless required by applicable law or agreed to in writing, software +@ distributed under the License is distributed on an "AS IS" BASIS, +@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +@ See the License for the specific language governing permissions and +@ limitations under the License. + + +@ ==================================================================== +@ Written by Andy Polyakov for the OpenSSL +@ project. +@ +@ Specific modes and adaptation for Linux kernel by Ard Biesheuvel +@ of Linaro. +@ ==================================================================== + +@ Bit-sliced AES for ARM NEON +@ +@ February 2012. +@ +@ This implementation is direct adaptation of bsaes-x86_64 module for +@ ARM NEON. Except that this module is endian-neutral [in sense that +@ it can be compiled for either endianness] by courtesy of vld1.8's +@ neutrality. Initial version doesn't implement interface to OpenSSL, +@ only low-level primitives and unsupported entry points, just enough +@ to collect performance results, which for Cortex-A8 core are: +@ +@ encrypt 19.5 cycles per byte processed with 128-bit key +@ decrypt 22.1 cycles per byte processed with 128-bit key +@ key conv. 440 cycles per 128-bit key/0.18 of 8x block +@ +@ Snapdragon S4 encrypts byte in 17.6 cycles and decrypts in 19.7, +@ which is [much] worse than anticipated (for further details see +@ http://www.openssl.org/~appro/Snapdragon-S4.html). +@ +@ Cortex-A15 manages in 14.2/16.1 cycles [when integer-only code +@ manages in 20.0 cycles]. +@ +@ When comparing to x86_64 results keep in mind that NEON unit is +@ [mostly] single-issue and thus can't [fully] benefit from +@ instruction-level parallelism. And when comparing to aes-armv4 +@ results keep in mind key schedule conversion overhead (see +@ bsaes-x86_64.pl for further details)... +@ +@ + +@ April-August 2013 +@ Add CBC, CTR and XTS subroutines and adapt for kernel use; courtesy of Ard. 
+ +#ifndef __KERNEL__ +# define VFP_ABI_PUSH vstmdb sp!,{d8-d15} +# define VFP_ABI_POP vldmia sp!,{d8-d15} +# define VFP_ABI_FRAME 0x40 +#else +# define VFP_ABI_PUSH +# define VFP_ABI_POP +# define VFP_ABI_FRAME 0 +# define BSAES_ASM_EXTENDED_KEY +# define __ARM_MAX_ARCH__ 7 +#endif + +#ifdef __thumb__ +# define adrl adr +#endif + +#if __ARM_MAX_ARCH__>=7 +.arch armv7-a +.fpu neon + +.text +.syntax unified @ ARMv7-capable assembler is expected to handle this +#if defined(__thumb2__) && !defined(__APPLE__) +.thumb +#else +.code 32 +# undef __thumb2__ +#endif + +.type _bsaes_const,%object +.align 6 +_bsaes_const: +.LM0ISR:@ InvShiftRows constants +.quad 0x0a0e0206070b0f03, 0x0004080c0d010509 +.LISR: +.quad 0x0504070602010003, 0x0f0e0d0c080b0a09 +.LISRM0: +.quad 0x01040b0e0205080f, 0x0306090c00070a0d +.LM0SR:@ ShiftRows constants +.quad 0x0a0e02060f03070b, 0x0004080c05090d01 +.LSR: +.quad 0x0504070600030201, 0x0f0e0d0c0a09080b +.LSRM0: +.quad 0x0304090e00050a0f, 0x01060b0c0207080d +.LM0: +.quad 0x02060a0e03070b0f, 0x0004080c0105090d +.LREVM0SR: +.quad 0x090d01050c000408, 0x03070b0f060a0e02 +.byte 66,105,116,45,115,108,105,99,101,100,32,65,69,83,32,102,111,114,32,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 6 +.size _bsaes_const,.-_bsaes_const + +.type _bsaes_encrypt8,%function +.align 4 +_bsaes_encrypt8: + adr r6,. + vldmia r4!, {q9} @ round 0 key +#if defined(__thumb2__) || defined(__APPLE__) + adr r6,.LM0SR +#else + sub r6,r6,#_bsaes_encrypt8-.LM0SR +#endif + + vldmia r6!, {q8} @ .LM0SR +_bsaes_encrypt8_alt: + veor q10, q0, q9 @ xor with round0 key + veor q11, q1, q9 + vtbl.8 d0, {q10}, d16 + vtbl.8 d1, {q10}, d17 + veor q12, q2, q9 + vtbl.8 d2, {q11}, d16 + vtbl.8 d3, {q11}, d17 + veor q13, q3, q9 + vtbl.8 d4, {q12}, d16 + vtbl.8 d5, {q12}, d17 + veor q14, q4, q9 + vtbl.8 d6, {q13}, d16 + vtbl.8 d7, {q13}, d17 + veor q15, q5, q9 + vtbl.8 d8, {q14}, d16 + vtbl.8 d9, {q14}, d17 + veor q10, q6, q9 + vtbl.8 d10, {q15}, d16 + vtbl.8 d11, {q15}, d17 + veor q11, q7, q9 + vtbl.8 d12, {q10}, d16 + vtbl.8 d13, {q10}, d17 + vtbl.8 d14, {q11}, d16 + vtbl.8 d15, {q11}, d17 +_bsaes_encrypt8_bitslice: + vmov.i8 q8,#0x55 @ compose .LBS0 + vmov.i8 q9,#0x33 @ compose .LBS1 + vshr.u64 q10, q6, #1 + vshr.u64 q11, q4, #1 + veor q10, q10, q7 + veor q11, q11, q5 + vand q10, q10, q8 + vand q11, q11, q8 + veor q7, q7, q10 + vshl.u64 q10, q10, #1 + veor q5, q5, q11 + vshl.u64 q11, q11, #1 + veor q6, q6, q10 + veor q4, q4, q11 + vshr.u64 q10, q2, #1 + vshr.u64 q11, q0, #1 + veor q10, q10, q3 + veor q11, q11, q1 + vand q10, q10, q8 + vand q11, q11, q8 + veor q3, q3, q10 + vshl.u64 q10, q10, #1 + veor q1, q1, q11 + vshl.u64 q11, q11, #1 + veor q2, q2, q10 + veor q0, q0, q11 + vmov.i8 q8,#0x0f @ compose .LBS2 + vshr.u64 q10, q5, #2 + vshr.u64 q11, q4, #2 + veor q10, q10, q7 + veor q11, q11, q6 + vand q10, q10, q9 + vand q11, q11, q9 + veor q7, q7, q10 + vshl.u64 q10, q10, #2 + veor q6, q6, q11 + vshl.u64 q11, q11, #2 + veor q5, q5, q10 + veor q4, q4, q11 + vshr.u64 q10, q1, #2 + vshr.u64 q11, q0, #2 + veor q10, q10, q3 + veor q11, q11, q2 + vand q10, q10, q9 + vand q11, q11, q9 + veor q3, q3, q10 + vshl.u64 q10, q10, #2 + veor q2, q2, q11 + vshl.u64 q11, q11, #2 + veor q1, q1, q10 + veor q0, q0, q11 + vshr.u64 q10, q3, #4 + vshr.u64 q11, q2, #4 + veor q10, q10, q7 + veor q11, q11, q6 + vand q10, q10, q8 + vand q11, q11, q8 + veor q7, q7, q10 + vshl.u64 q10, q10, #4 + veor q6, q6, q11 + vshl.u64 q11, q11, #4 + veor 
q3, q3, q10 + veor q2, q2, q11 + vshr.u64 q10, q1, #4 + vshr.u64 q11, q0, #4 + veor q10, q10, q5 + veor q11, q11, q4 + vand q10, q10, q8 + vand q11, q11, q8 + veor q5, q5, q10 + vshl.u64 q10, q10, #4 + veor q4, q4, q11 + vshl.u64 q11, q11, #4 + veor q1, q1, q10 + veor q0, q0, q11 + sub r5,r5,#1 + b .Lenc_sbox +.align 4 +.Lenc_loop: + vldmia r4!, {q8,q9,q10,q11} + veor q8, q8, q0 + veor q9, q9, q1 + vtbl.8 d0, {q8}, d24 + vtbl.8 d1, {q8}, d25 + vldmia r4!, {q8} + veor q10, q10, q2 + vtbl.8 d2, {q9}, d24 + vtbl.8 d3, {q9}, d25 + vldmia r4!, {q9} + veor q11, q11, q3 + vtbl.8 d4, {q10}, d24 + vtbl.8 d5, {q10}, d25 + vldmia r4!, {q10} + vtbl.8 d6, {q11}, d24 + vtbl.8 d7, {q11}, d25 + vldmia r4!, {q11} + veor q8, q8, q4 + veor q9, q9, q5 + vtbl.8 d8, {q8}, d24 + vtbl.8 d9, {q8}, d25 + veor q10, q10, q6 + vtbl.8 d10, {q9}, d24 + vtbl.8 d11, {q9}, d25 + veor q11, q11, q7 + vtbl.8 d12, {q10}, d24 + vtbl.8 d13, {q10}, d25 + vtbl.8 d14, {q11}, d24 + vtbl.8 d15, {q11}, d25 +.Lenc_sbox: + veor q2, q2, q1 + veor q5, q5, q6 + veor q3, q3, q0 + veor q6, q6, q2 + veor q5, q5, q0 + + veor q6, q6, q3 + veor q3, q3, q7 + veor q7, q7, q5 + veor q3, q3, q4 + veor q4, q4, q5 + + veor q2, q2, q7 + veor q3, q3, q1 + veor q1, q1, q5 + veor q11, q7, q4 + veor q10, q1, q2 + veor q9, q5, q3 + veor q13, q2, q4 + vmov q8, q10 + veor q12, q6, q0 + + vorr q10, q10, q9 + veor q15, q11, q8 + vand q14, q11, q12 + vorr q11, q11, q12 + veor q12, q12, q9 + vand q8, q8, q9 + veor q9, q3, q0 + vand q15, q15, q12 + vand q13, q13, q9 + veor q9, q7, q1 + veor q12, q5, q6 + veor q11, q11, q13 + veor q10, q10, q13 + vand q13, q9, q12 + vorr q9, q9, q12 + veor q11, q11, q15 + veor q8, q8, q13 + veor q10, q10, q14 + veor q9, q9, q15 + veor q8, q8, q14 + vand q12, q2, q3 + veor q9, q9, q14 + vand q13, q4, q0 + vand q14, q1, q5 + vorr q15, q7, q6 + veor q11, q11, q12 + veor q9, q9, q14 + veor q8, q8, q15 + veor q10, q10, q13 + + @ Inv_GF16 0, 1, 2, 3, s0, s1, s2, s3 + + @ new smaller inversion + + vand q14, q11, q9 + vmov q12, q8 + + veor q13, q10, q14 + veor q15, q8, q14 + veor q14, q8, q14 @ q14=q15 + + vbsl q13, q9, q8 + vbsl q15, q11, q10 + veor q11, q11, q10 + + vbsl q12, q13, q14 + vbsl q8, q14, q13 + + vand q14, q12, q15 + veor q9, q9, q8 + + veor q14, q14, q11 + veor q12, q6, q0 + veor q8, q5, q3 + veor q10, q15, q14 + vand q10, q10, q6 + veor q6, q6, q5 + vand q11, q5, q15 + vand q6, q6, q14 + veor q5, q11, q10 + veor q6, q6, q11 + veor q15, q15, q13 + veor q14, q14, q9 + veor q11, q15, q14 + veor q10, q13, q9 + vand q11, q11, q12 + vand q10, q10, q0 + veor q12, q12, q8 + veor q0, q0, q3 + vand q8, q8, q15 + vand q3, q3, q13 + vand q12, q12, q14 + vand q0, q0, q9 + veor q8, q8, q12 + veor q0, q0, q3 + veor q12, q12, q11 + veor q3, q3, q10 + veor q6, q6, q12 + veor q0, q0, q12 + veor q5, q5, q8 + veor q3, q3, q8 + + veor q12, q7, q4 + veor q8, q1, q2 + veor q11, q15, q14 + veor q10, q13, q9 + vand q11, q11, q12 + vand q10, q10, q4 + veor q12, q12, q8 + veor q4, q4, q2 + vand q8, q8, q15 + vand q2, q2, q13 + vand q12, q12, q14 + vand q4, q4, q9 + veor q8, q8, q12 + veor q4, q4, q2 + veor q12, q12, q11 + veor q2, q2, q10 + veor q15, q15, q13 + veor q14, q14, q9 + veor q10, q15, q14 + vand q10, q10, q7 + veor q7, q7, q1 + vand q11, q1, q15 + vand q7, q7, q14 + veor q1, q11, q10 + veor q7, q7, q11 + veor q7, q7, q12 + veor q4, q4, q12 + veor q1, q1, q8 + veor q2, q2, q8 + veor q7, q7, q0 + veor q1, q1, q6 + veor q6, q6, q0 + veor q4, q4, q7 + veor q0, q0, q1 + + veor q1, q1, q5 + veor q5, q5, q2 + veor q2, q2, q3 + veor q3, q3, q5 + 
veor q4, q4, q5 + + veor q6, q6, q3 + subs r5,r5,#1 + bcc .Lenc_done + vext.8 q8, q0, q0, #12 @ x0 <<< 32 + vext.8 q9, q1, q1, #12 + veor q0, q0, q8 @ x0 ^ (x0 <<< 32) + vext.8 q10, q4, q4, #12 + veor q1, q1, q9 + vext.8 q11, q6, q6, #12 + veor q4, q4, q10 + vext.8 q12, q3, q3, #12 + veor q6, q6, q11 + vext.8 q13, q7, q7, #12 + veor q3, q3, q12 + vext.8 q14, q2, q2, #12 + veor q7, q7, q13 + vext.8 q15, q5, q5, #12 + veor q2, q2, q14 + + veor q9, q9, q0 + veor q5, q5, q15 + vext.8 q0, q0, q0, #8 @ (x0 ^ (x0 <<< 32)) <<< 64) + veor q10, q10, q1 + veor q8, q8, q5 + veor q9, q9, q5 + vext.8 q1, q1, q1, #8 + veor q13, q13, q3 + veor q0, q0, q8 + veor q14, q14, q7 + veor q1, q1, q9 + vext.8 q8, q3, q3, #8 + veor q12, q12, q6 + vext.8 q9, q7, q7, #8 + veor q15, q15, q2 + vext.8 q3, q6, q6, #8 + veor q11, q11, q4 + vext.8 q7, q5, q5, #8 + veor q12, q12, q5 + vext.8 q6, q2, q2, #8 + veor q11, q11, q5 + vext.8 q2, q4, q4, #8 + veor q5, q9, q13 + veor q4, q8, q12 + veor q3, q3, q11 + veor q7, q7, q15 + veor q6, q6, q14 + @ vmov q4, q8 + veor q2, q2, q10 + @ vmov q5, q9 + vldmia r6, {q12} @ .LSR + ite eq @ Thumb2 thing, samity check in ARM + addeq r6,r6,#0x10 + bne .Lenc_loop + vldmia r6, {q12} @ .LSRM0 + b .Lenc_loop +.align 4 +.Lenc_done: + vmov.i8 q8,#0x55 @ compose .LBS0 + vmov.i8 q9,#0x33 @ compose .LBS1 + vshr.u64 q10, q2, #1 + vshr.u64 q11, q3, #1 + veor q10, q10, q5 + veor q11, q11, q7 + vand q10, q10, q8 + vand q11, q11, q8 + veor q5, q5, q10 + vshl.u64 q10, q10, #1 + veor q7, q7, q11 + vshl.u64 q11, q11, #1 + veor q2, q2, q10 + veor q3, q3, q11 + vshr.u64 q10, q4, #1 + vshr.u64 q11, q0, #1 + veor q10, q10, q6 + veor q11, q11, q1 + vand q10, q10, q8 + vand q11, q11, q8 + veor q6, q6, q10 + vshl.u64 q10, q10, #1 + veor q1, q1, q11 + vshl.u64 q11, q11, #1 + veor q4, q4, q10 + veor q0, q0, q11 + vmov.i8 q8,#0x0f @ compose .LBS2 + vshr.u64 q10, q7, #2 + vshr.u64 q11, q3, #2 + veor q10, q10, q5 + veor q11, q11, q2 + vand q10, q10, q9 + vand q11, q11, q9 + veor q5, q5, q10 + vshl.u64 q10, q10, #2 + veor q2, q2, q11 + vshl.u64 q11, q11, #2 + veor q7, q7, q10 + veor q3, q3, q11 + vshr.u64 q10, q1, #2 + vshr.u64 q11, q0, #2 + veor q10, q10, q6 + veor q11, q11, q4 + vand q10, q10, q9 + vand q11, q11, q9 + veor q6, q6, q10 + vshl.u64 q10, q10, #2 + veor q4, q4, q11 + vshl.u64 q11, q11, #2 + veor q1, q1, q10 + veor q0, q0, q11 + vshr.u64 q10, q6, #4 + vshr.u64 q11, q4, #4 + veor q10, q10, q5 + veor q11, q11, q2 + vand q10, q10, q8 + vand q11, q11, q8 + veor q5, q5, q10 + vshl.u64 q10, q10, #4 + veor q2, q2, q11 + vshl.u64 q11, q11, #4 + veor q6, q6, q10 + veor q4, q4, q11 + vshr.u64 q10, q1, #4 + vshr.u64 q11, q0, #4 + veor q10, q10, q7 + veor q11, q11, q3 + vand q10, q10, q8 + vand q11, q11, q8 + veor q7, q7, q10 + vshl.u64 q10, q10, #4 + veor q3, q3, q11 + vshl.u64 q11, q11, #4 + veor q1, q1, q10 + veor q0, q0, q11 + vldmia r4, {q8} @ last round key + veor q4, q4, q8 + veor q6, q6, q8 + veor q3, q3, q8 + veor q7, q7, q8 + veor q2, q2, q8 + veor q5, q5, q8 + veor q0, q0, q8 + veor q1, q1, q8 + bx lr +.size _bsaes_encrypt8,.-_bsaes_encrypt8 +.type _bsaes_key_convert,%function +.align 4 +_bsaes_key_convert: + adr r6,. + vld1.8 {q7}, [r4]! @ load round 0 key +#if defined(__thumb2__) || defined(__APPLE__) + adr r6,.LM0 +#else + sub r6,r6,#_bsaes_key_convert-.LM0 +#endif + vld1.8 {q15}, [r4]! 
@ load round 1 key + + vmov.i8 q8, #0x01 @ bit masks + vmov.i8 q9, #0x02 + vmov.i8 q10, #0x04 + vmov.i8 q11, #0x08 + vmov.i8 q12, #0x10 + vmov.i8 q13, #0x20 + vldmia r6, {q14} @ .LM0 + +#ifdef __ARMEL__ + vrev32.8 q7, q7 + vrev32.8 q15, q15 +#endif + sub r5,r5,#1 + vstmia r12!, {q7} @ save round 0 key + b .Lkey_loop + +.align 4 +.Lkey_loop: + vtbl.8 d14,{q15},d28 + vtbl.8 d15,{q15},d29 + vmov.i8 q6, #0x40 + vmov.i8 q15, #0x80 + + vtst.8 q0, q7, q8 + vtst.8 q1, q7, q9 + vtst.8 q2, q7, q10 + vtst.8 q3, q7, q11 + vtst.8 q4, q7, q12 + vtst.8 q5, q7, q13 + vtst.8 q6, q7, q6 + vtst.8 q7, q7, q15 + vld1.8 {q15}, [r4]! @ load next round key + vmvn q0, q0 @ "pnot" + vmvn q1, q1 + vmvn q5, q5 + vmvn q6, q6 +#ifdef __ARMEL__ + vrev32.8 q15, q15 +#endif + subs r5,r5,#1 + vstmia r12!,{q0,q1,q2,q3,q4,q5,q6,q7} @ write bit-sliced round key + bne .Lkey_loop + + vmov.i8 q7,#0x63 @ compose .L63 + @ don't save last round key + bx lr +.size _bsaes_key_convert,.-_bsaes_key_convert +.globl bsaes_ctr32_encrypt_blocks +.hidden bsaes_ctr32_encrypt_blocks +.type bsaes_ctr32_encrypt_blocks,%function +.align 5 +bsaes_ctr32_encrypt_blocks: + @ In OpenSSL, short inputs fall back to aes_nohw_* here. We patch this + @ out to retain a constant-time implementation. + mov ip, sp + stmdb sp!, {r4,r5,r6,r7,r8,r9,r10, lr} + VFP_ABI_PUSH + ldr r8, [ip] @ ctr is 1st arg on the stack + sub sp, sp, #0x10 @ scratch space to carry over the ctr + mov r9, sp @ save sp + + ldr r10, [r3, #240] @ get # of rounds +#ifndef BSAES_ASM_EXTENDED_KEY + @ allocate the key schedule on the stack + sub r12, sp, r10, lsl#7 @ 128 bytes per inner round key + add r12, #96 @ size of bit-sliced key schedule + + @ populate the key schedule + mov r4, r3 @ pass key + mov r5, r10 @ pass # of rounds + mov sp, r12 @ sp is sp + bl _bsaes_key_convert + veor q7,q7,q15 @ fix up last round key + vstmia r12, {q7} @ save last round key + + vld1.8 {q0}, [r8] @ load counter +#ifdef __APPLE__ + mov r8, #:lower16:(.LREVM0SR-.LM0) + add r8, r6, r8 +#else + add r8, r6, #.LREVM0SR-.LM0 @ borrow r8 +#endif + vldmia sp, {q4} @ load round0 key +#else + ldr r12, [r3, #244] + eors r12, #1 + beq 0f + + @ populate the key schedule + str r12, [r3, #244] + mov r4, r3 @ pass key + mov r5, r10 @ pass # of rounds + add r12, r3, #248 @ pass key schedule + bl _bsaes_key_convert + veor q7,q7,q15 @ fix up last round key + vstmia r12, {q7} @ save last round key + +.align 2 + add r12, r3, #248 + vld1.8 {q0}, [r8] @ load counter + adrl r8, .LREVM0SR @ borrow r8 + vldmia r12, {q4} @ load round0 key + sub sp, #0x10 @ place for adjusted round0 key +#endif + + vmov.i32 q8,#1 @ compose 1<<96 + veor q9,q9,q9 + vrev32.8 q0,q0 + vext.8 q8,q9,q8,#4 + vrev32.8 q4,q4 + vadd.u32 q9,q8,q8 @ compose 2<<96 + vstmia sp, {q4} @ save adjusted round0 key + b .Lctr_enc_loop + +.align 4 +.Lctr_enc_loop: + vadd.u32 q10, q8, q9 @ compose 3<<96 + vadd.u32 q1, q0, q8 @ +1 + vadd.u32 q2, q0, q9 @ +2 + vadd.u32 q3, q0, q10 @ +3 + vadd.u32 q4, q1, q10 + vadd.u32 q5, q2, q10 + vadd.u32 q6, q3, q10 + vadd.u32 q7, q4, q10 + vadd.u32 q10, q5, q10 @ next counter + + @ Borrow prologue from _bsaes_encrypt8 to use the opportunity + @ to flip byte order in 32-bit counter + + vldmia sp, {q9} @ load round0 key +#ifndef BSAES_ASM_EXTENDED_KEY + add r4, sp, #0x10 @ pass next round key +#else + add r4, r3, #264 +#endif + vldmia r8, {q8} @ .LREVM0SR + mov r5, r10 @ pass rounds + vstmia r9, {q10} @ save next counter +#ifdef __APPLE__ + mov r6, #:lower16:(.LREVM0SR-.LSR) + sub r6, r8, r6 +#else + sub r6, r8, #.LREVM0SR-.LSR @ pass 
constants +#endif + + bl _bsaes_encrypt8_alt + + subs r2, r2, #8 + blo .Lctr_enc_loop_done + + vld1.8 {q8,q9}, [r0]! @ load input + vld1.8 {q10,q11}, [r0]! + veor q0, q8 + veor q1, q9 + vld1.8 {q12,q13}, [r0]! + veor q4, q10 + veor q6, q11 + vld1.8 {q14,q15}, [r0]! + veor q3, q12 + vst1.8 {q0,q1}, [r1]! @ write output + veor q7, q13 + veor q2, q14 + vst1.8 {q4}, [r1]! + veor q5, q15 + vst1.8 {q6}, [r1]! + vmov.i32 q8, #1 @ compose 1<<96 + vst1.8 {q3}, [r1]! + veor q9, q9, q9 + vst1.8 {q7}, [r1]! + vext.8 q8, q9, q8, #4 + vst1.8 {q2}, [r1]! + vadd.u32 q9,q8,q8 @ compose 2<<96 + vst1.8 {q5}, [r1]! + vldmia r9, {q0} @ load counter + + bne .Lctr_enc_loop + b .Lctr_enc_done + +.align 4 +.Lctr_enc_loop_done: + add r2, r2, #8 + vld1.8 {q8}, [r0]! @ load input + veor q0, q8 + vst1.8 {q0}, [r1]! @ write output + cmp r2, #2 + blo .Lctr_enc_done + vld1.8 {q9}, [r0]! + veor q1, q9 + vst1.8 {q1}, [r1]! + beq .Lctr_enc_done + vld1.8 {q10}, [r0]! + veor q4, q10 + vst1.8 {q4}, [r1]! + cmp r2, #4 + blo .Lctr_enc_done + vld1.8 {q11}, [r0]! + veor q6, q11 + vst1.8 {q6}, [r1]! + beq .Lctr_enc_done + vld1.8 {q12}, [r0]! + veor q3, q12 + vst1.8 {q3}, [r1]! + cmp r2, #6 + blo .Lctr_enc_done + vld1.8 {q13}, [r0]! + veor q7, q13 + vst1.8 {q7}, [r1]! + beq .Lctr_enc_done + vld1.8 {q14}, [r0] + veor q2, q14 + vst1.8 {q2}, [r1]! + +.Lctr_enc_done: + vmov.i32 q0, #0 + vmov.i32 q1, #0 +#ifndef BSAES_ASM_EXTENDED_KEY +.Lctr_enc_bzero:@ wipe key schedule [if any] + vstmia sp!, {q0,q1} + cmp sp, r9 + bne .Lctr_enc_bzero +#else + vstmia sp, {q0,q1} +#endif + + mov sp, r9 + add sp, #0x10 @ add sp,r9,#0x10 is no good for thumb + VFP_ABI_POP + ldmia sp!, {r4,r5,r6,r7,r8,r9,r10, pc} @ return + + @ OpenSSL contains aes_nohw_* fallback code here. We patch this + @ out to retain a constant-time implementation. +.size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks +#endif +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__ELF__) diff --git a/ring-0.17.14/pregenerated/chacha-armv4-linux32.S b/ring-0.17.14/pregenerated/chacha-armv4-linux32.S new file mode 100644 index 0000000000..f058e5801e --- /dev/null +++ b/ring-0.17.14/pregenerated/chacha-armv4-linux32.S @@ -0,0 +1,1449 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__ELF__) +@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both +@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions. 
+.arch armv7-a + +.text +#if defined(__thumb2__) || defined(__clang__) +.syntax unified +#endif +#if defined(__thumb2__) +.thumb +#else +.code 32 +#endif + +#if defined(__thumb2__) || defined(__clang__) +#define ldrhsb ldrbhs +#endif + +.align 5 +.Lsigma: +.long 0x61707865,0x3320646e,0x79622d32,0x6b206574 @ endian-neutral +.Lone: +.long 1,0,0,0 + +.globl ChaCha20_ctr32_nohw +.hidden ChaCha20_ctr32_nohw +.type ChaCha20_ctr32_nohw,%function +.align 5 +ChaCha20_ctr32_nohw: + ldr r12,[sp,#0] @ pull pointer to counter and nonce + stmdb sp!,{r0,r1,r2,r4-r11,lr} + adr r14,.Lsigma + ldmia r12,{r4,r5,r6,r7} @ load counter and nonce + sub sp,sp,#4*(16) @ off-load area + stmdb sp!,{r4,r5,r6,r7} @ copy counter and nonce + ldmia r3,{r4,r5,r6,r7,r8,r9,r10,r11} @ load key + ldmia r14,{r0,r1,r2,r3} @ load sigma + stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11} @ copy key + stmdb sp!,{r0,r1,r2,r3} @ copy sigma + str r10,[sp,#4*(16+10)] @ off-load "rx" + str r11,[sp,#4*(16+11)] @ off-load "rx" + b .Loop_outer_enter + +.align 4 +.Loop_outer: + ldmia sp,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9} @ load key material + str r11,[sp,#4*(32+2)] @ save len + str r12, [sp,#4*(32+1)] @ save inp + str r14, [sp,#4*(32+0)] @ save out +.Loop_outer_enter: + ldr r11, [sp,#4*(15)] + ldr r12,[sp,#4*(12)] @ modulo-scheduled load + ldr r10, [sp,#4*(13)] + ldr r14,[sp,#4*(14)] + str r11, [sp,#4*(16+15)] + mov r11,#10 + b .Loop + +.align 4 +.Loop: + subs r11,r11,#1 + add r0,r0,r4 + mov r12,r12,ror#16 + add r1,r1,r5 + mov r10,r10,ror#16 + eor r12,r12,r0,ror#16 + eor r10,r10,r1,ror#16 + add r8,r8,r12 + mov r4,r4,ror#20 + add r9,r9,r10 + mov r5,r5,ror#20 + eor r4,r4,r8,ror#20 + eor r5,r5,r9,ror#20 + add r0,r0,r4 + mov r12,r12,ror#24 + add r1,r1,r5 + mov r10,r10,ror#24 + eor r12,r12,r0,ror#24 + eor r10,r10,r1,ror#24 + add r8,r8,r12 + mov r4,r4,ror#25 + add r9,r9,r10 + mov r5,r5,ror#25 + str r10,[sp,#4*(16+13)] + ldr r10,[sp,#4*(16+15)] + eor r4,r4,r8,ror#25 + eor r5,r5,r9,ror#25 + str r8,[sp,#4*(16+8)] + ldr r8,[sp,#4*(16+10)] + add r2,r2,r6 + mov r14,r14,ror#16 + str r9,[sp,#4*(16+9)] + ldr r9,[sp,#4*(16+11)] + add r3,r3,r7 + mov r10,r10,ror#16 + eor r14,r14,r2,ror#16 + eor r10,r10,r3,ror#16 + add r8,r8,r14 + mov r6,r6,ror#20 + add r9,r9,r10 + mov r7,r7,ror#20 + eor r6,r6,r8,ror#20 + eor r7,r7,r9,ror#20 + add r2,r2,r6 + mov r14,r14,ror#24 + add r3,r3,r7 + mov r10,r10,ror#24 + eor r14,r14,r2,ror#24 + eor r10,r10,r3,ror#24 + add r8,r8,r14 + mov r6,r6,ror#25 + add r9,r9,r10 + mov r7,r7,ror#25 + eor r6,r6,r8,ror#25 + eor r7,r7,r9,ror#25 + add r0,r0,r5 + mov r10,r10,ror#16 + add r1,r1,r6 + mov r12,r12,ror#16 + eor r10,r10,r0,ror#16 + eor r12,r12,r1,ror#16 + add r8,r8,r10 + mov r5,r5,ror#20 + add r9,r9,r12 + mov r6,r6,ror#20 + eor r5,r5,r8,ror#20 + eor r6,r6,r9,ror#20 + add r0,r0,r5 + mov r10,r10,ror#24 + add r1,r1,r6 + mov r12,r12,ror#24 + eor r10,r10,r0,ror#24 + eor r12,r12,r1,ror#24 + add r8,r8,r10 + mov r5,r5,ror#25 + str r10,[sp,#4*(16+15)] + ldr r10,[sp,#4*(16+13)] + add r9,r9,r12 + mov r6,r6,ror#25 + eor r5,r5,r8,ror#25 + eor r6,r6,r9,ror#25 + str r8,[sp,#4*(16+10)] + ldr r8,[sp,#4*(16+8)] + add r2,r2,r7 + mov r10,r10,ror#16 + str r9,[sp,#4*(16+11)] + ldr r9,[sp,#4*(16+9)] + add r3,r3,r4 + mov r14,r14,ror#16 + eor r10,r10,r2,ror#16 + eor r14,r14,r3,ror#16 + add r8,r8,r10 + mov r7,r7,ror#20 + add r9,r9,r14 + mov r4,r4,ror#20 + eor r7,r7,r8,ror#20 + eor r4,r4,r9,ror#20 + add r2,r2,r7 + mov r10,r10,ror#24 + add r3,r3,r4 + mov r14,r14,ror#24 + eor r10,r10,r2,ror#24 + eor r14,r14,r3,ror#24 + add r8,r8,r10 + mov r7,r7,ror#25 + add r9,r9,r14 + mov 
r4,r4,ror#25 + eor r7,r7,r8,ror#25 + eor r4,r4,r9,ror#25 + bne .Loop + + ldr r11,[sp,#4*(32+2)] @ load len + + str r8, [sp,#4*(16+8)] @ modulo-scheduled store + str r9, [sp,#4*(16+9)] + str r12,[sp,#4*(16+12)] + str r10, [sp,#4*(16+13)] + str r14,[sp,#4*(16+14)] + + @ at this point we have first half of 512-bit result in + @ rx and second half at sp+4*(16+8) + + cmp r11,#64 @ done yet? +#ifdef __thumb2__ + itete lo +#endif + addlo r12,sp,#4*(0) @ shortcut or ... + ldrhs r12,[sp,#4*(32+1)] @ ... load inp + addlo r14,sp,#4*(0) @ shortcut or ... + ldrhs r14,[sp,#4*(32+0)] @ ... load out + + ldr r8,[sp,#4*(0)] @ load key material + ldr r9,[sp,#4*(1)] + +#if __ARM_ARCH>=6 || !defined(__ARMEB__) +# if __ARM_ARCH<7 + orr r10,r12,r14 + tst r10,#3 @ are input and output aligned? + ldr r10,[sp,#4*(2)] + bne .Lunaligned + cmp r11,#64 @ restore flags +# else + ldr r10,[sp,#4*(2)] +# endif + ldr r11,[sp,#4*(3)] + + add r0,r0,r8 @ accumulate key material + add r1,r1,r9 +# ifdef __thumb2__ + itt hs +# endif + ldrhs r8,[r12],#16 @ load input + ldrhs r9,[r12,#-12] + + add r2,r2,r10 + add r3,r3,r11 +# ifdef __thumb2__ + itt hs +# endif + ldrhs r10,[r12,#-8] + ldrhs r11,[r12,#-4] +# if __ARM_ARCH>=6 && defined(__ARMEB__) + rev r0,r0 + rev r1,r1 + rev r2,r2 + rev r3,r3 +# endif +# ifdef __thumb2__ + itt hs +# endif + eorhs r0,r0,r8 @ xor with input + eorhs r1,r1,r9 + add r8,sp,#4*(4) + str r0,[r14],#16 @ store output +# ifdef __thumb2__ + itt hs +# endif + eorhs r2,r2,r10 + eorhs r3,r3,r11 + ldmia r8,{r8,r9,r10,r11} @ load key material + str r1,[r14,#-12] + str r2,[r14,#-8] + str r3,[r14,#-4] + + add r4,r4,r8 @ accumulate key material + add r5,r5,r9 +# ifdef __thumb2__ + itt hs +# endif + ldrhs r8,[r12],#16 @ load input + ldrhs r9,[r12,#-12] + add r6,r6,r10 + add r7,r7,r11 +# ifdef __thumb2__ + itt hs +# endif + ldrhs r10,[r12,#-8] + ldrhs r11,[r12,#-4] +# if __ARM_ARCH>=6 && defined(__ARMEB__) + rev r4,r4 + rev r5,r5 + rev r6,r6 + rev r7,r7 +# endif +# ifdef __thumb2__ + itt hs +# endif + eorhs r4,r4,r8 + eorhs r5,r5,r9 + add r8,sp,#4*(8) + str r4,[r14],#16 @ store output +# ifdef __thumb2__ + itt hs +# endif + eorhs r6,r6,r10 + eorhs r7,r7,r11 + str r5,[r14,#-12] + ldmia r8,{r8,r9,r10,r11} @ load key material + str r6,[r14,#-8] + add r0,sp,#4*(16+8) + str r7,[r14,#-4] + + ldmia r0,{r0,r1,r2,r3,r4,r5,r6,r7} @ load second half + + add r0,r0,r8 @ accumulate key material + add r1,r1,r9 +# ifdef __thumb2__ + itt hs +# endif + ldrhs r8,[r12],#16 @ load input + ldrhs r9,[r12,#-12] +# ifdef __thumb2__ + itt hi +# endif + strhi r10,[sp,#4*(16+10)] @ copy "rx" while at it + strhi r11,[sp,#4*(16+11)] @ copy "rx" while at it + add r2,r2,r10 + add r3,r3,r11 +# ifdef __thumb2__ + itt hs +# endif + ldrhs r10,[r12,#-8] + ldrhs r11,[r12,#-4] +# if __ARM_ARCH>=6 && defined(__ARMEB__) + rev r0,r0 + rev r1,r1 + rev r2,r2 + rev r3,r3 +# endif +# ifdef __thumb2__ + itt hs +# endif + eorhs r0,r0,r8 + eorhs r1,r1,r9 + add r8,sp,#4*(12) + str r0,[r14],#16 @ store output +# ifdef __thumb2__ + itt hs +# endif + eorhs r2,r2,r10 + eorhs r3,r3,r11 + str r1,[r14,#-12] + ldmia r8,{r8,r9,r10,r11} @ load key material + str r2,[r14,#-8] + str r3,[r14,#-4] + + add r4,r4,r8 @ accumulate key material + add r5,r5,r9 +# ifdef __thumb2__ + itt hi +# endif + addhi r8,r8,#1 @ next counter value + strhi r8,[sp,#4*(12)] @ save next counter value +# ifdef __thumb2__ + itt hs +# endif + ldrhs r8,[r12],#16 @ load input + ldrhs r9,[r12,#-12] + add r6,r6,r10 + add r7,r7,r11 +# ifdef __thumb2__ + itt hs +# endif + ldrhs r10,[r12,#-8] + ldrhs r11,[r12,#-4] 
+# if __ARM_ARCH>=6 && defined(__ARMEB__) + rev r4,r4 + rev r5,r5 + rev r6,r6 + rev r7,r7 +# endif +# ifdef __thumb2__ + itt hs +# endif + eorhs r4,r4,r8 + eorhs r5,r5,r9 +# ifdef __thumb2__ + it ne +# endif + ldrne r8,[sp,#4*(32+2)] @ re-load len +# ifdef __thumb2__ + itt hs +# endif + eorhs r6,r6,r10 + eorhs r7,r7,r11 + str r4,[r14],#16 @ store output + str r5,[r14,#-12] +# ifdef __thumb2__ + it hs +# endif + subhs r11,r8,#64 @ len-=64 + str r6,[r14,#-8] + str r7,[r14,#-4] + bhi .Loop_outer + + beq .Ldone +# if __ARM_ARCH<7 + b .Ltail + +.align 4 +.Lunaligned:@ unaligned endian-neutral path + cmp r11,#64 @ restore flags +# endif +#endif +#if __ARM_ARCH<7 + ldr r11,[sp,#4*(3)] + add r0,r0,r8 @ accumulate key material + add r1,r1,r9 + add r2,r2,r10 +# ifdef __thumb2__ + itete lo +# endif + eorlo r8,r8,r8 @ zero or ... + ldrhsb r8,[r12],#16 @ ... load input + eorlo r9,r9,r9 + ldrhsb r9,[r12,#-12] + + add r3,r3,r11 +# ifdef __thumb2__ + itete lo +# endif + eorlo r10,r10,r10 + ldrhsb r10,[r12,#-8] + eorlo r11,r11,r11 + ldrhsb r11,[r12,#-4] + + eor r0,r8,r0 @ xor with input (or zero) + eor r1,r9,r1 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r8,[r12,#-15] @ load more input + ldrhsb r9,[r12,#-11] + eor r2,r10,r2 + strb r0,[r14],#16 @ store output + eor r3,r11,r3 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r10,[r12,#-7] + ldrhsb r11,[r12,#-3] + strb r1,[r14,#-12] + eor r0,r8,r0,lsr#8 + strb r2,[r14,#-8] + eor r1,r9,r1,lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r8,[r12,#-14] @ load more input + ldrhsb r9,[r12,#-10] + strb r3,[r14,#-4] + eor r2,r10,r2,lsr#8 + strb r0,[r14,#-15] + eor r3,r11,r3,lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r10,[r12,#-6] + ldrhsb r11,[r12,#-2] + strb r1,[r14,#-11] + eor r0,r8,r0,lsr#8 + strb r2,[r14,#-7] + eor r1,r9,r1,lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r8,[r12,#-13] @ load more input + ldrhsb r9,[r12,#-9] + strb r3,[r14,#-3] + eor r2,r10,r2,lsr#8 + strb r0,[r14,#-14] + eor r3,r11,r3,lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r10,[r12,#-5] + ldrhsb r11,[r12,#-1] + strb r1,[r14,#-10] + strb r2,[r14,#-6] + eor r0,r8,r0,lsr#8 + strb r3,[r14,#-2] + eor r1,r9,r1,lsr#8 + strb r0,[r14,#-13] + eor r2,r10,r2,lsr#8 + strb r1,[r14,#-9] + eor r3,r11,r3,lsr#8 + strb r2,[r14,#-5] + strb r3,[r14,#-1] + add r8,sp,#4*(4+0) + ldmia r8,{r8,r9,r10,r11} @ load key material + add r0,sp,#4*(16+8) + add r4,r4,r8 @ accumulate key material + add r5,r5,r9 + add r6,r6,r10 +# ifdef __thumb2__ + itete lo +# endif + eorlo r8,r8,r8 @ zero or ... + ldrhsb r8,[r12],#16 @ ... 
load input + eorlo r9,r9,r9 + ldrhsb r9,[r12,#-12] + + add r7,r7,r11 +# ifdef __thumb2__ + itete lo +# endif + eorlo r10,r10,r10 + ldrhsb r10,[r12,#-8] + eorlo r11,r11,r11 + ldrhsb r11,[r12,#-4] + + eor r4,r8,r4 @ xor with input (or zero) + eor r5,r9,r5 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r8,[r12,#-15] @ load more input + ldrhsb r9,[r12,#-11] + eor r6,r10,r6 + strb r4,[r14],#16 @ store output + eor r7,r11,r7 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r10,[r12,#-7] + ldrhsb r11,[r12,#-3] + strb r5,[r14,#-12] + eor r4,r8,r4,lsr#8 + strb r6,[r14,#-8] + eor r5,r9,r5,lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r8,[r12,#-14] @ load more input + ldrhsb r9,[r12,#-10] + strb r7,[r14,#-4] + eor r6,r10,r6,lsr#8 + strb r4,[r14,#-15] + eor r7,r11,r7,lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r10,[r12,#-6] + ldrhsb r11,[r12,#-2] + strb r5,[r14,#-11] + eor r4,r8,r4,lsr#8 + strb r6,[r14,#-7] + eor r5,r9,r5,lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r8,[r12,#-13] @ load more input + ldrhsb r9,[r12,#-9] + strb r7,[r14,#-3] + eor r6,r10,r6,lsr#8 + strb r4,[r14,#-14] + eor r7,r11,r7,lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r10,[r12,#-5] + ldrhsb r11,[r12,#-1] + strb r5,[r14,#-10] + strb r6,[r14,#-6] + eor r4,r8,r4,lsr#8 + strb r7,[r14,#-2] + eor r5,r9,r5,lsr#8 + strb r4,[r14,#-13] + eor r6,r10,r6,lsr#8 + strb r5,[r14,#-9] + eor r7,r11,r7,lsr#8 + strb r6,[r14,#-5] + strb r7,[r14,#-1] + add r8,sp,#4*(4+4) + ldmia r8,{r8,r9,r10,r11} @ load key material + ldmia r0,{r0,r1,r2,r3,r4,r5,r6,r7} @ load second half +# ifdef __thumb2__ + itt hi +# endif + strhi r10,[sp,#4*(16+10)] @ copy "rx" + strhi r11,[sp,#4*(16+11)] @ copy "rx" + add r0,r0,r8 @ accumulate key material + add r1,r1,r9 + add r2,r2,r10 +# ifdef __thumb2__ + itete lo +# endif + eorlo r8,r8,r8 @ zero or ... + ldrhsb r8,[r12],#16 @ ... 
load input + eorlo r9,r9,r9 + ldrhsb r9,[r12,#-12] + + add r3,r3,r11 +# ifdef __thumb2__ + itete lo +# endif + eorlo r10,r10,r10 + ldrhsb r10,[r12,#-8] + eorlo r11,r11,r11 + ldrhsb r11,[r12,#-4] + + eor r0,r8,r0 @ xor with input (or zero) + eor r1,r9,r1 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r8,[r12,#-15] @ load more input + ldrhsb r9,[r12,#-11] + eor r2,r10,r2 + strb r0,[r14],#16 @ store output + eor r3,r11,r3 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r10,[r12,#-7] + ldrhsb r11,[r12,#-3] + strb r1,[r14,#-12] + eor r0,r8,r0,lsr#8 + strb r2,[r14,#-8] + eor r1,r9,r1,lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r8,[r12,#-14] @ load more input + ldrhsb r9,[r12,#-10] + strb r3,[r14,#-4] + eor r2,r10,r2,lsr#8 + strb r0,[r14,#-15] + eor r3,r11,r3,lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r10,[r12,#-6] + ldrhsb r11,[r12,#-2] + strb r1,[r14,#-11] + eor r0,r8,r0,lsr#8 + strb r2,[r14,#-7] + eor r1,r9,r1,lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r8,[r12,#-13] @ load more input + ldrhsb r9,[r12,#-9] + strb r3,[r14,#-3] + eor r2,r10,r2,lsr#8 + strb r0,[r14,#-14] + eor r3,r11,r3,lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r10,[r12,#-5] + ldrhsb r11,[r12,#-1] + strb r1,[r14,#-10] + strb r2,[r14,#-6] + eor r0,r8,r0,lsr#8 + strb r3,[r14,#-2] + eor r1,r9,r1,lsr#8 + strb r0,[r14,#-13] + eor r2,r10,r2,lsr#8 + strb r1,[r14,#-9] + eor r3,r11,r3,lsr#8 + strb r2,[r14,#-5] + strb r3,[r14,#-1] + add r8,sp,#4*(4+8) + ldmia r8,{r8,r9,r10,r11} @ load key material + add r4,r4,r8 @ accumulate key material +# ifdef __thumb2__ + itt hi +# endif + addhi r8,r8,#1 @ next counter value + strhi r8,[sp,#4*(12)] @ save next counter value + add r5,r5,r9 + add r6,r6,r10 +# ifdef __thumb2__ + itete lo +# endif + eorlo r8,r8,r8 @ zero or ... + ldrhsb r8,[r12],#16 @ ... 
load input + eorlo r9,r9,r9 + ldrhsb r9,[r12,#-12] + + add r7,r7,r11 +# ifdef __thumb2__ + itete lo +# endif + eorlo r10,r10,r10 + ldrhsb r10,[r12,#-8] + eorlo r11,r11,r11 + ldrhsb r11,[r12,#-4] + + eor r4,r8,r4 @ xor with input (or zero) + eor r5,r9,r5 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r8,[r12,#-15] @ load more input + ldrhsb r9,[r12,#-11] + eor r6,r10,r6 + strb r4,[r14],#16 @ store output + eor r7,r11,r7 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r10,[r12,#-7] + ldrhsb r11,[r12,#-3] + strb r5,[r14,#-12] + eor r4,r8,r4,lsr#8 + strb r6,[r14,#-8] + eor r5,r9,r5,lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r8,[r12,#-14] @ load more input + ldrhsb r9,[r12,#-10] + strb r7,[r14,#-4] + eor r6,r10,r6,lsr#8 + strb r4,[r14,#-15] + eor r7,r11,r7,lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r10,[r12,#-6] + ldrhsb r11,[r12,#-2] + strb r5,[r14,#-11] + eor r4,r8,r4,lsr#8 + strb r6,[r14,#-7] + eor r5,r9,r5,lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r8,[r12,#-13] @ load more input + ldrhsb r9,[r12,#-9] + strb r7,[r14,#-3] + eor r6,r10,r6,lsr#8 + strb r4,[r14,#-14] + eor r7,r11,r7,lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r10,[r12,#-5] + ldrhsb r11,[r12,#-1] + strb r5,[r14,#-10] + strb r6,[r14,#-6] + eor r4,r8,r4,lsr#8 + strb r7,[r14,#-2] + eor r5,r9,r5,lsr#8 + strb r4,[r14,#-13] + eor r6,r10,r6,lsr#8 + strb r5,[r14,#-9] + eor r7,r11,r7,lsr#8 + strb r6,[r14,#-5] + strb r7,[r14,#-1] +# ifdef __thumb2__ + it ne +# endif + ldrne r8,[sp,#4*(32+2)] @ re-load len +# ifdef __thumb2__ + it hs +# endif + subhs r11,r8,#64 @ len-=64 + bhi .Loop_outer + + beq .Ldone +#endif + +.Ltail: + ldr r12,[sp,#4*(32+1)] @ load inp + add r9,sp,#4*(0) + ldr r14,[sp,#4*(32+0)] @ load out + +.Loop_tail: + ldrb r10,[r9],#1 @ read buffer on stack + ldrb r11,[r12],#1 @ read input + subs r8,r8,#1 + eor r11,r11,r10 + strb r11,[r14],#1 @ store output + bne .Loop_tail + +.Ldone: + add sp,sp,#4*(32+3) + ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc} +.size ChaCha20_ctr32_nohw,.-ChaCha20_ctr32_nohw +#if __ARM_MAX_ARCH__>=7 +.arch armv7-a +.fpu neon + +.globl ChaCha20_ctr32_neon +.hidden ChaCha20_ctr32_neon +.type ChaCha20_ctr32_neon,%function +.align 5 +ChaCha20_ctr32_neon: + ldr r12,[sp,#0] @ pull pointer to counter and nonce + stmdb sp!,{r0,r1,r2,r4-r11,lr} + adr r14,.Lsigma + vstmdb sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ ABI spec says so + stmdb sp!,{r0,r1,r2,r3} + + vld1.32 {q1,q2},[r3] @ load key + ldmia r3,{r4,r5,r6,r7,r8,r9,r10,r11} @ load key + + sub sp,sp,#4*(16+16) + vld1.32 {q3},[r12] @ load counter and nonce + add r12,sp,#4*8 + ldmia r14,{r0,r1,r2,r3} @ load sigma + vld1.32 {q0},[r14]! 
@ load sigma + vld1.32 {q12},[r14] @ one + vst1.32 {q2,q3},[r12] @ copy 1/2key|counter|nonce + vst1.32 {q0,q1},[sp] @ copy sigma|1/2key + + str r10,[sp,#4*(16+10)] @ off-load "rx" + str r11,[sp,#4*(16+11)] @ off-load "rx" + vshl.i32 d26,d24,#1 @ two + vstr d24,[sp,#4*(16+0)] + vshl.i32 d28,d24,#2 @ four + vstr d26,[sp,#4*(16+2)] + vmov q4,q0 + vstr d28,[sp,#4*(16+4)] + vmov q8,q0 + vmov q5,q1 + vmov q9,q1 + b .Loop_neon_enter + +.align 4 +.Loop_neon_outer: + ldmia sp,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9} @ load key material + cmp r11,#64*2 @ if len<=64*2 + bls .Lbreak_neon @ switch to integer-only + vmov q4,q0 + str r11,[sp,#4*(32+2)] @ save len + vmov q8,q0 + str r12, [sp,#4*(32+1)] @ save inp + vmov q5,q1 + str r14, [sp,#4*(32+0)] @ save out + vmov q9,q1 +.Loop_neon_enter: + ldr r11, [sp,#4*(15)] + vadd.i32 q7,q3,q12 @ counter+1 + ldr r12,[sp,#4*(12)] @ modulo-scheduled load + vmov q6,q2 + ldr r10, [sp,#4*(13)] + vmov q10,q2 + ldr r14,[sp,#4*(14)] + vadd.i32 q11,q7,q12 @ counter+2 + str r11, [sp,#4*(16+15)] + mov r11,#10 + add r12,r12,#3 @ counter+3 + b .Loop_neon + +.align 4 +.Loop_neon: + subs r11,r11,#1 + vadd.i32 q0,q0,q1 + add r0,r0,r4 + vadd.i32 q4,q4,q5 + mov r12,r12,ror#16 + vadd.i32 q8,q8,q9 + add r1,r1,r5 + veor q3,q3,q0 + mov r10,r10,ror#16 + veor q7,q7,q4 + eor r12,r12,r0,ror#16 + veor q11,q11,q8 + eor r10,r10,r1,ror#16 + vrev32.16 q3,q3 + add r8,r8,r12 + vrev32.16 q7,q7 + mov r4,r4,ror#20 + vrev32.16 q11,q11 + add r9,r9,r10 + vadd.i32 q2,q2,q3 + mov r5,r5,ror#20 + vadd.i32 q6,q6,q7 + eor r4,r4,r8,ror#20 + vadd.i32 q10,q10,q11 + eor r5,r5,r9,ror#20 + veor q12,q1,q2 + add r0,r0,r4 + veor q13,q5,q6 + mov r12,r12,ror#24 + veor q14,q9,q10 + add r1,r1,r5 + vshr.u32 q1,q12,#20 + mov r10,r10,ror#24 + vshr.u32 q5,q13,#20 + eor r12,r12,r0,ror#24 + vshr.u32 q9,q14,#20 + eor r10,r10,r1,ror#24 + vsli.32 q1,q12,#12 + add r8,r8,r12 + vsli.32 q5,q13,#12 + mov r4,r4,ror#25 + vsli.32 q9,q14,#12 + add r9,r9,r10 + vadd.i32 q0,q0,q1 + mov r5,r5,ror#25 + vadd.i32 q4,q4,q5 + str r10,[sp,#4*(16+13)] + vadd.i32 q8,q8,q9 + ldr r10,[sp,#4*(16+15)] + veor q12,q3,q0 + eor r4,r4,r8,ror#25 + veor q13,q7,q4 + eor r5,r5,r9,ror#25 + veor q14,q11,q8 + str r8,[sp,#4*(16+8)] + vshr.u32 q3,q12,#24 + ldr r8,[sp,#4*(16+10)] + vshr.u32 q7,q13,#24 + add r2,r2,r6 + vshr.u32 q11,q14,#24 + mov r14,r14,ror#16 + vsli.32 q3,q12,#8 + str r9,[sp,#4*(16+9)] + vsli.32 q7,q13,#8 + ldr r9,[sp,#4*(16+11)] + vsli.32 q11,q14,#8 + add r3,r3,r7 + vadd.i32 q2,q2,q3 + mov r10,r10,ror#16 + vadd.i32 q6,q6,q7 + eor r14,r14,r2,ror#16 + vadd.i32 q10,q10,q11 + eor r10,r10,r3,ror#16 + veor q12,q1,q2 + add r8,r8,r14 + veor q13,q5,q6 + mov r6,r6,ror#20 + veor q14,q9,q10 + add r9,r9,r10 + vshr.u32 q1,q12,#25 + mov r7,r7,ror#20 + vshr.u32 q5,q13,#25 + eor r6,r6,r8,ror#20 + vshr.u32 q9,q14,#25 + eor r7,r7,r9,ror#20 + vsli.32 q1,q12,#7 + add r2,r2,r6 + vsli.32 q5,q13,#7 + mov r14,r14,ror#24 + vsli.32 q9,q14,#7 + add r3,r3,r7 + vext.8 q2,q2,q2,#8 + mov r10,r10,ror#24 + vext.8 q6,q6,q6,#8 + eor r14,r14,r2,ror#24 + vext.8 q10,q10,q10,#8 + eor r10,r10,r3,ror#24 + vext.8 q1,q1,q1,#4 + add r8,r8,r14 + vext.8 q5,q5,q5,#4 + mov r6,r6,ror#25 + vext.8 q9,q9,q9,#4 + add r9,r9,r10 + vext.8 q3,q3,q3,#12 + mov r7,r7,ror#25 + vext.8 q7,q7,q7,#12 + eor r6,r6,r8,ror#25 + vext.8 q11,q11,q11,#12 + eor r7,r7,r9,ror#25 + vadd.i32 q0,q0,q1 + add r0,r0,r5 + vadd.i32 q4,q4,q5 + mov r10,r10,ror#16 + vadd.i32 q8,q8,q9 + add r1,r1,r6 + veor q3,q3,q0 + mov r12,r12,ror#16 + veor q7,q7,q4 + eor r10,r10,r0,ror#16 + veor q11,q11,q8 + eor r12,r12,r1,ror#16 + vrev32.16 q3,q3 + add 
r8,r8,r10 + vrev32.16 q7,q7 + mov r5,r5,ror#20 + vrev32.16 q11,q11 + add r9,r9,r12 + vadd.i32 q2,q2,q3 + mov r6,r6,ror#20 + vadd.i32 q6,q6,q7 + eor r5,r5,r8,ror#20 + vadd.i32 q10,q10,q11 + eor r6,r6,r9,ror#20 + veor q12,q1,q2 + add r0,r0,r5 + veor q13,q5,q6 + mov r10,r10,ror#24 + veor q14,q9,q10 + add r1,r1,r6 + vshr.u32 q1,q12,#20 + mov r12,r12,ror#24 + vshr.u32 q5,q13,#20 + eor r10,r10,r0,ror#24 + vshr.u32 q9,q14,#20 + eor r12,r12,r1,ror#24 + vsli.32 q1,q12,#12 + add r8,r8,r10 + vsli.32 q5,q13,#12 + mov r5,r5,ror#25 + vsli.32 q9,q14,#12 + str r10,[sp,#4*(16+15)] + vadd.i32 q0,q0,q1 + ldr r10,[sp,#4*(16+13)] + vadd.i32 q4,q4,q5 + add r9,r9,r12 + vadd.i32 q8,q8,q9 + mov r6,r6,ror#25 + veor q12,q3,q0 + eor r5,r5,r8,ror#25 + veor q13,q7,q4 + eor r6,r6,r9,ror#25 + veor q14,q11,q8 + str r8,[sp,#4*(16+10)] + vshr.u32 q3,q12,#24 + ldr r8,[sp,#4*(16+8)] + vshr.u32 q7,q13,#24 + add r2,r2,r7 + vshr.u32 q11,q14,#24 + mov r10,r10,ror#16 + vsli.32 q3,q12,#8 + str r9,[sp,#4*(16+11)] + vsli.32 q7,q13,#8 + ldr r9,[sp,#4*(16+9)] + vsli.32 q11,q14,#8 + add r3,r3,r4 + vadd.i32 q2,q2,q3 + mov r14,r14,ror#16 + vadd.i32 q6,q6,q7 + eor r10,r10,r2,ror#16 + vadd.i32 q10,q10,q11 + eor r14,r14,r3,ror#16 + veor q12,q1,q2 + add r8,r8,r10 + veor q13,q5,q6 + mov r7,r7,ror#20 + veor q14,q9,q10 + add r9,r9,r14 + vshr.u32 q1,q12,#25 + mov r4,r4,ror#20 + vshr.u32 q5,q13,#25 + eor r7,r7,r8,ror#20 + vshr.u32 q9,q14,#25 + eor r4,r4,r9,ror#20 + vsli.32 q1,q12,#7 + add r2,r2,r7 + vsli.32 q5,q13,#7 + mov r10,r10,ror#24 + vsli.32 q9,q14,#7 + add r3,r3,r4 + vext.8 q2,q2,q2,#8 + mov r14,r14,ror#24 + vext.8 q6,q6,q6,#8 + eor r10,r10,r2,ror#24 + vext.8 q10,q10,q10,#8 + eor r14,r14,r3,ror#24 + vext.8 q1,q1,q1,#12 + add r8,r8,r10 + vext.8 q5,q5,q5,#12 + mov r7,r7,ror#25 + vext.8 q9,q9,q9,#12 + add r9,r9,r14 + vext.8 q3,q3,q3,#4 + mov r4,r4,ror#25 + vext.8 q7,q7,q7,#4 + eor r7,r7,r8,ror#25 + vext.8 q11,q11,q11,#4 + eor r4,r4,r9,ror#25 + bne .Loop_neon + + add r11,sp,#32 + vld1.32 {q12,q13},[sp] @ load key material + vld1.32 {q14,q15},[r11] + + ldr r11,[sp,#4*(32+2)] @ load len + + str r8, [sp,#4*(16+8)] @ modulo-scheduled store + str r9, [sp,#4*(16+9)] + str r12,[sp,#4*(16+12)] + str r10, [sp,#4*(16+13)] + str r14,[sp,#4*(16+14)] + + @ at this point we have first half of 512-bit result in + @ rx and second half at sp+4*(16+8) + + ldr r12,[sp,#4*(32+1)] @ load inp + ldr r14,[sp,#4*(32+0)] @ load out + + vadd.i32 q0,q0,q12 @ accumulate key material + vadd.i32 q4,q4,q12 + vadd.i32 q8,q8,q12 + vldr d24,[sp,#4*(16+0)] @ one + + vadd.i32 q1,q1,q13 + vadd.i32 q5,q5,q13 + vadd.i32 q9,q9,q13 + vldr d26,[sp,#4*(16+2)] @ two + + vadd.i32 q2,q2,q14 + vadd.i32 q6,q6,q14 + vadd.i32 q10,q10,q14 + vadd.i32 d14,d14,d24 @ counter+1 + vadd.i32 d22,d22,d26 @ counter+2 + + vadd.i32 q3,q3,q15 + vadd.i32 q7,q7,q15 + vadd.i32 q11,q11,q15 + + cmp r11,#64*4 + blo .Ltail_neon + + vld1.8 {q12,q13},[r12]! @ load input + mov r11,sp + vld1.8 {q14,q15},[r12]! + veor q0,q0,q12 @ xor with input + veor q1,q1,q13 + vld1.8 {q12,q13},[r12]! + veor q2,q2,q14 + veor q3,q3,q15 + vld1.8 {q14,q15},[r12]! + + veor q4,q4,q12 + vst1.8 {q0,q1},[r14]! @ store output + veor q5,q5,q13 + vld1.8 {q12,q13},[r12]! + veor q6,q6,q14 + vst1.8 {q2,q3},[r14]! + veor q7,q7,q15 + vld1.8 {q14,q15},[r12]! + + veor q8,q8,q12 + vld1.32 {q0,q1},[r11]! @ load for next iteration + veor d25,d25,d25 + vldr d24,[sp,#4*(16+4)] @ four + veor q9,q9,q13 + vld1.32 {q2,q3},[r11] + veor q10,q10,q14 + vst1.8 {q4,q5},[r14]! + veor q11,q11,q15 + vst1.8 {q6,q7},[r14]! 
+ + vadd.i32 d6,d6,d24 @ next counter value + vldr d24,[sp,#4*(16+0)] @ one + + ldmia sp,{r8,r9,r10,r11} @ load key material + add r0,r0,r8 @ accumulate key material + ldr r8,[r12],#16 @ load input + vst1.8 {q8,q9},[r14]! + add r1,r1,r9 + ldr r9,[r12,#-12] + vst1.8 {q10,q11},[r14]! + add r2,r2,r10 + ldr r10,[r12,#-8] + add r3,r3,r11 + ldr r11,[r12,#-4] +# ifdef __ARMEB__ + rev r0,r0 + rev r1,r1 + rev r2,r2 + rev r3,r3 +# endif + eor r0,r0,r8 @ xor with input + add r8,sp,#4*(4) + eor r1,r1,r9 + str r0,[r14],#16 @ store output + eor r2,r2,r10 + str r1,[r14,#-12] + eor r3,r3,r11 + ldmia r8,{r8,r9,r10,r11} @ load key material + str r2,[r14,#-8] + str r3,[r14,#-4] + + add r4,r4,r8 @ accumulate key material + ldr r8,[r12],#16 @ load input + add r5,r5,r9 + ldr r9,[r12,#-12] + add r6,r6,r10 + ldr r10,[r12,#-8] + add r7,r7,r11 + ldr r11,[r12,#-4] +# ifdef __ARMEB__ + rev r4,r4 + rev r5,r5 + rev r6,r6 + rev r7,r7 +# endif + eor r4,r4,r8 + add r8,sp,#4*(8) + eor r5,r5,r9 + str r4,[r14],#16 @ store output + eor r6,r6,r10 + str r5,[r14,#-12] + eor r7,r7,r11 + ldmia r8,{r8,r9,r10,r11} @ load key material + str r6,[r14,#-8] + add r0,sp,#4*(16+8) + str r7,[r14,#-4] + + ldmia r0,{r0,r1,r2,r3,r4,r5,r6,r7} @ load second half + + add r0,r0,r8 @ accumulate key material + ldr r8,[r12],#16 @ load input + add r1,r1,r9 + ldr r9,[r12,#-12] +# ifdef __thumb2__ + it hi +# endif + strhi r10,[sp,#4*(16+10)] @ copy "rx" while at it + add r2,r2,r10 + ldr r10,[r12,#-8] +# ifdef __thumb2__ + it hi +# endif + strhi r11,[sp,#4*(16+11)] @ copy "rx" while at it + add r3,r3,r11 + ldr r11,[r12,#-4] +# ifdef __ARMEB__ + rev r0,r0 + rev r1,r1 + rev r2,r2 + rev r3,r3 +# endif + eor r0,r0,r8 + add r8,sp,#4*(12) + eor r1,r1,r9 + str r0,[r14],#16 @ store output + eor r2,r2,r10 + str r1,[r14,#-12] + eor r3,r3,r11 + ldmia r8,{r8,r9,r10,r11} @ load key material + str r2,[r14,#-8] + str r3,[r14,#-4] + + add r4,r4,r8 @ accumulate key material + add r8,r8,#4 @ next counter value + add r5,r5,r9 + str r8,[sp,#4*(12)] @ save next counter value + ldr r8,[r12],#16 @ load input + add r6,r6,r10 + add r4,r4,#3 @ counter+3 + ldr r9,[r12,#-12] + add r7,r7,r11 + ldr r10,[r12,#-8] + ldr r11,[r12,#-4] +# ifdef __ARMEB__ + rev r4,r4 + rev r5,r5 + rev r6,r6 + rev r7,r7 +# endif + eor r4,r4,r8 +# ifdef __thumb2__ + it hi +# endif + ldrhi r8,[sp,#4*(32+2)] @ re-load len + eor r5,r5,r9 + eor r6,r6,r10 + str r4,[r14],#16 @ store output + eor r7,r7,r11 + str r5,[r14,#-12] + sub r11,r8,#64*4 @ len-=64*4 + str r6,[r14,#-8] + str r7,[r14,#-4] + bhi .Loop_neon_outer + + b .Ldone_neon + +.align 4 +.Lbreak_neon: + @ harmonize NEON and integer-only stack frames: load data + @ from NEON frame, but save to integer-only one; distance + @ between the two is 4*(32+4+16-32)=4*(20). + + str r11, [sp,#4*(20+32+2)] @ save len + add r11,sp,#4*(32+4) + str r12, [sp,#4*(20+32+1)] @ save inp + str r14, [sp,#4*(20+32+0)] @ save out + + ldr r12,[sp,#4*(16+10)] + ldr r14,[sp,#4*(16+11)] + vldmia r11,{d8,d9,d10,d11,d12,d13,d14,d15} @ fulfill ABI requirement + str r12,[sp,#4*(20+16+10)] @ copy "rx" + str r14,[sp,#4*(20+16+11)] @ copy "rx" + + ldr r11, [sp,#4*(15)] + ldr r12,[sp,#4*(12)] @ modulo-scheduled load + ldr r10, [sp,#4*(13)] + ldr r14,[sp,#4*(14)] + str r11, [sp,#4*(20+16+15)] + add r11,sp,#4*(20) + vst1.32 {q0,q1},[r11]! 
@ copy key + add sp,sp,#4*(20) @ switch frame + vst1.32 {q2,q3},[r11] + mov r11,#10 + b .Loop @ go integer-only + +.align 4 +.Ltail_neon: + cmp r11,#64*3 + bhs .L192_or_more_neon + cmp r11,#64*2 + bhs .L128_or_more_neon + cmp r11,#64*1 + bhs .L64_or_more_neon + + add r8,sp,#4*(8) + vst1.8 {q0,q1},[sp] + add r10,sp,#4*(0) + vst1.8 {q2,q3},[r8] + b .Loop_tail_neon + +.align 4 +.L64_or_more_neon: + vld1.8 {q12,q13},[r12]! + vld1.8 {q14,q15},[r12]! + veor q0,q0,q12 + veor q1,q1,q13 + veor q2,q2,q14 + veor q3,q3,q15 + vst1.8 {q0,q1},[r14]! + vst1.8 {q2,q3},[r14]! + + beq .Ldone_neon + + add r8,sp,#4*(8) + vst1.8 {q4,q5},[sp] + add r10,sp,#4*(0) + vst1.8 {q6,q7},[r8] + sub r11,r11,#64*1 @ len-=64*1 + b .Loop_tail_neon + +.align 4 +.L128_or_more_neon: + vld1.8 {q12,q13},[r12]! + vld1.8 {q14,q15},[r12]! + veor q0,q0,q12 + veor q1,q1,q13 + vld1.8 {q12,q13},[r12]! + veor q2,q2,q14 + veor q3,q3,q15 + vld1.8 {q14,q15},[r12]! + + veor q4,q4,q12 + veor q5,q5,q13 + vst1.8 {q0,q1},[r14]! + veor q6,q6,q14 + vst1.8 {q2,q3},[r14]! + veor q7,q7,q15 + vst1.8 {q4,q5},[r14]! + vst1.8 {q6,q7},[r14]! + + beq .Ldone_neon + + add r8,sp,#4*(8) + vst1.8 {q8,q9},[sp] + add r10,sp,#4*(0) + vst1.8 {q10,q11},[r8] + sub r11,r11,#64*2 @ len-=64*2 + b .Loop_tail_neon + +.align 4 +.L192_or_more_neon: + vld1.8 {q12,q13},[r12]! + vld1.8 {q14,q15},[r12]! + veor q0,q0,q12 + veor q1,q1,q13 + vld1.8 {q12,q13},[r12]! + veor q2,q2,q14 + veor q3,q3,q15 + vld1.8 {q14,q15},[r12]! + + veor q4,q4,q12 + veor q5,q5,q13 + vld1.8 {q12,q13},[r12]! + veor q6,q6,q14 + vst1.8 {q0,q1},[r14]! + veor q7,q7,q15 + vld1.8 {q14,q15},[r12]! + + veor q8,q8,q12 + vst1.8 {q2,q3},[r14]! + veor q9,q9,q13 + vst1.8 {q4,q5},[r14]! + veor q10,q10,q14 + vst1.8 {q6,q7},[r14]! + veor q11,q11,q15 + vst1.8 {q8,q9},[r14]! + vst1.8 {q10,q11},[r14]! 
+ + beq .Ldone_neon + + ldmia sp,{r8,r9,r10,r11} @ load key material + add r0,r0,r8 @ accumulate key material + add r8,sp,#4*(4) + add r1,r1,r9 + add r2,r2,r10 + add r3,r3,r11 + ldmia r8,{r8,r9,r10,r11} @ load key material + + add r4,r4,r8 @ accumulate key material + add r8,sp,#4*(8) + add r5,r5,r9 + add r6,r6,r10 + add r7,r7,r11 + ldmia r8,{r8,r9,r10,r11} @ load key material +# ifdef __ARMEB__ + rev r0,r0 + rev r1,r1 + rev r2,r2 + rev r3,r3 + rev r4,r4 + rev r5,r5 + rev r6,r6 + rev r7,r7 +# endif + stmia sp,{r0,r1,r2,r3,r4,r5,r6,r7} + add r0,sp,#4*(16+8) + + ldmia r0,{r0,r1,r2,r3,r4,r5,r6,r7} @ load second half + + add r0,r0,r8 @ accumulate key material + add r8,sp,#4*(12) + add r1,r1,r9 + add r2,r2,r10 + add r3,r3,r11 + ldmia r8,{r8,r9,r10,r11} @ load key material + + add r4,r4,r8 @ accumulate key material + add r8,sp,#4*(8) + add r5,r5,r9 + add r4,r4,#3 @ counter+3 + add r6,r6,r10 + add r7,r7,r11 + ldr r11,[sp,#4*(32+2)] @ re-load len +# ifdef __ARMEB__ + rev r0,r0 + rev r1,r1 + rev r2,r2 + rev r3,r3 + rev r4,r4 + rev r5,r5 + rev r6,r6 + rev r7,r7 +# endif + stmia r8,{r0,r1,r2,r3,r4,r5,r6,r7} + add r10,sp,#4*(0) + sub r11,r11,#64*3 @ len-=64*3 + +.Loop_tail_neon: + ldrb r8,[r10],#1 @ read buffer on stack + ldrb r9,[r12],#1 @ read input + subs r11,r11,#1 + eor r8,r8,r9 + strb r8,[r14],#1 @ store output + bne .Loop_tail_neon + +.Ldone_neon: + add sp,sp,#4*(32+4) + vldmia sp,{d8,d9,d10,d11,d12,d13,d14,d15} + add sp,sp,#4*(16+3) + ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc} +.size ChaCha20_ctr32_neon,.-ChaCha20_ctr32_neon +#endif +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__ELF__) diff --git a/ring-0.17.14/pregenerated/chacha-armv8-ios64.S b/ring-0.17.14/pregenerated/chacha-armv8-ios64.S new file mode 100644 index 0000000000..322f67e47d --- /dev/null +++ b/ring-0.17.14/pregenerated/chacha-armv8-ios64.S @@ -0,0 +1,1966 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__) +.section __TEXT,__const + +.align 5 +Lsigma: +.quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral +Lone: +.long 1,0,0,0 +.byte 67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 + +.text + +.globl _ChaCha20_ctr32_nohw +.private_extern _ChaCha20_ctr32_nohw + +.align 5 +_ChaCha20_ctr32_nohw: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-96]! 
+ add x29,sp,#0 + + adrp x5,Lsigma@PAGE + add x5,x5,Lsigma@PAGEOFF + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#64 + + ldp x22,x23,[x5] // load sigma + ldp x24,x25,[x3] // load key + ldp x26,x27,[x3,#16] + ldp x28,x30,[x4] // load counter +#ifdef __AARCH64EB__ + ror x24,x24,#32 + ror x25,x25,#32 + ror x26,x26,#32 + ror x27,x27,#32 + ror x28,x28,#32 + ror x30,x30,#32 +#endif + +Loop_outer: + mov w5,w22 // unpack key block + lsr x6,x22,#32 + mov w7,w23 + lsr x8,x23,#32 + mov w9,w24 + lsr x10,x24,#32 + mov w11,w25 + lsr x12,x25,#32 + mov w13,w26 + lsr x14,x26,#32 + mov w15,w27 + lsr x16,x27,#32 + mov w17,w28 + lsr x19,x28,#32 + mov w20,w30 + lsr x21,x30,#32 + + mov x4,#10 + subs x2,x2,#64 +Loop: + sub x4,x4,#1 + add w5,w5,w9 + add w6,w6,w10 + add w7,w7,w11 + add w8,w8,w12 + eor w17,w17,w5 + eor w19,w19,w6 + eor w20,w20,w7 + eor w21,w21,w8 + ror w17,w17,#16 + ror w19,w19,#16 + ror w20,w20,#16 + ror w21,w21,#16 + add w13,w13,w17 + add w14,w14,w19 + add w15,w15,w20 + add w16,w16,w21 + eor w9,w9,w13 + eor w10,w10,w14 + eor w11,w11,w15 + eor w12,w12,w16 + ror w9,w9,#20 + ror w10,w10,#20 + ror w11,w11,#20 + ror w12,w12,#20 + add w5,w5,w9 + add w6,w6,w10 + add w7,w7,w11 + add w8,w8,w12 + eor w17,w17,w5 + eor w19,w19,w6 + eor w20,w20,w7 + eor w21,w21,w8 + ror w17,w17,#24 + ror w19,w19,#24 + ror w20,w20,#24 + ror w21,w21,#24 + add w13,w13,w17 + add w14,w14,w19 + add w15,w15,w20 + add w16,w16,w21 + eor w9,w9,w13 + eor w10,w10,w14 + eor w11,w11,w15 + eor w12,w12,w16 + ror w9,w9,#25 + ror w10,w10,#25 + ror w11,w11,#25 + ror w12,w12,#25 + add w5,w5,w10 + add w6,w6,w11 + add w7,w7,w12 + add w8,w8,w9 + eor w21,w21,w5 + eor w17,w17,w6 + eor w19,w19,w7 + eor w20,w20,w8 + ror w21,w21,#16 + ror w17,w17,#16 + ror w19,w19,#16 + ror w20,w20,#16 + add w15,w15,w21 + add w16,w16,w17 + add w13,w13,w19 + add w14,w14,w20 + eor w10,w10,w15 + eor w11,w11,w16 + eor w12,w12,w13 + eor w9,w9,w14 + ror w10,w10,#20 + ror w11,w11,#20 + ror w12,w12,#20 + ror w9,w9,#20 + add w5,w5,w10 + add w6,w6,w11 + add w7,w7,w12 + add w8,w8,w9 + eor w21,w21,w5 + eor w17,w17,w6 + eor w19,w19,w7 + eor w20,w20,w8 + ror w21,w21,#24 + ror w17,w17,#24 + ror w19,w19,#24 + ror w20,w20,#24 + add w15,w15,w21 + add w16,w16,w17 + add w13,w13,w19 + add w14,w14,w20 + eor w10,w10,w15 + eor w11,w11,w16 + eor w12,w12,w13 + eor w9,w9,w14 + ror w10,w10,#25 + ror w11,w11,#25 + ror w12,w12,#25 + ror w9,w9,#25 + cbnz x4,Loop + + add w5,w5,w22 // accumulate key block + add x6,x6,x22,lsr#32 + add w7,w7,w23 + add x8,x8,x23,lsr#32 + add w9,w9,w24 + add x10,x10,x24,lsr#32 + add w11,w11,w25 + add x12,x12,x25,lsr#32 + add w13,w13,w26 + add x14,x14,x26,lsr#32 + add w15,w15,w27 + add x16,x16,x27,lsr#32 + add w17,w17,w28 + add x19,x19,x28,lsr#32 + add w20,w20,w30 + add x21,x21,x30,lsr#32 + + b.lo Ltail + + add x5,x5,x6,lsl#32 // pack + add x7,x7,x8,lsl#32 + ldp x6,x8,[x1,#0] // load input + add x9,x9,x10,lsl#32 + add x11,x11,x12,lsl#32 + ldp x10,x12,[x1,#16] + add x13,x13,x14,lsl#32 + add x15,x15,x16,lsl#32 + ldp x14,x16,[x1,#32] + add x17,x17,x19,lsl#32 + add x20,x20,x21,lsl#32 + ldp x19,x21,[x1,#48] + add x1,x1,#64 +#ifdef __AARCH64EB__ + rev x5,x5 + rev x7,x7 + rev x9,x9 + rev x11,x11 + rev x13,x13 + rev x15,x15 + rev x17,x17 + rev x20,x20 +#endif + eor x5,x5,x6 + eor x7,x7,x8 + eor x9,x9,x10 + eor x11,x11,x12 + eor x13,x13,x14 + eor x15,x15,x16 + eor x17,x17,x19 + eor x20,x20,x21 + + stp x5,x7,[x0,#0] // store output + add x28,x28,#1 // increment counter + stp x9,x11,[x0,#16] + stp 
x13,x15,[x0,#32] + stp x17,x20,[x0,#48] + add x0,x0,#64 + + b.hi Loop_outer + + ldp x19,x20,[x29,#16] + add sp,sp,#64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +.align 4 +Ltail: + add x2,x2,#64 +Less_than_64: + sub x0,x0,#1 + add x1,x1,x2 + add x0,x0,x2 + add x4,sp,x2 + neg x2,x2 + + add x5,x5,x6,lsl#32 // pack + add x7,x7,x8,lsl#32 + add x9,x9,x10,lsl#32 + add x11,x11,x12,lsl#32 + add x13,x13,x14,lsl#32 + add x15,x15,x16,lsl#32 + add x17,x17,x19,lsl#32 + add x20,x20,x21,lsl#32 +#ifdef __AARCH64EB__ + rev x5,x5 + rev x7,x7 + rev x9,x9 + rev x11,x11 + rev x13,x13 + rev x15,x15 + rev x17,x17 + rev x20,x20 +#endif + stp x5,x7,[sp,#0] + stp x9,x11,[sp,#16] + stp x13,x15,[sp,#32] + stp x17,x20,[sp,#48] + +Loop_tail: + ldrb w10,[x1,x2] + ldrb w11,[x4,x2] + add x2,x2,#1 + eor w10,w10,w11 + strb w10,[x0,x2] + cbnz x2,Loop_tail + + stp xzr,xzr,[sp,#0] + stp xzr,xzr,[sp,#16] + stp xzr,xzr,[sp,#32] + stp xzr,xzr,[sp,#48] + + ldp x19,x20,[x29,#16] + add sp,sp,#64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + AARCH64_VALIDATE_LINK_REGISTER + ret + + +.globl _ChaCha20_ctr32_neon +.private_extern _ChaCha20_ctr32_neon + +.align 5 +_ChaCha20_ctr32_neon: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-96]! + add x29,sp,#0 + + adrp x5,Lsigma@PAGE + add x5,x5,Lsigma@PAGEOFF + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + cmp x2,#512 + b.hs L512_or_more_neon + + sub sp,sp,#64 + + ldp x22,x23,[x5] // load sigma + ld1 {v24.4s},[x5],#16 + ldp x24,x25,[x3] // load key + ldp x26,x27,[x3,#16] + ld1 {v25.4s,v26.4s},[x3] + ldp x28,x30,[x4] // load counter + ld1 {v27.4s},[x4] + ld1 {v31.4s},[x5] +#ifdef __AARCH64EB__ + rev64 v24.4s,v24.4s + ror x24,x24,#32 + ror x25,x25,#32 + ror x26,x26,#32 + ror x27,x27,#32 + ror x28,x28,#32 + ror x30,x30,#32 +#endif + add v27.4s,v27.4s,v31.4s // += 1 + add v28.4s,v27.4s,v31.4s + add v29.4s,v28.4s,v31.4s + shl v31.4s,v31.4s,#2 // 1 -> 4 + +Loop_outer_neon: + mov w5,w22 // unpack key block + lsr x6,x22,#32 + mov v0.16b,v24.16b + mov w7,w23 + lsr x8,x23,#32 + mov v4.16b,v24.16b + mov w9,w24 + lsr x10,x24,#32 + mov v16.16b,v24.16b + mov w11,w25 + mov v1.16b,v25.16b + lsr x12,x25,#32 + mov v5.16b,v25.16b + mov w13,w26 + mov v17.16b,v25.16b + lsr x14,x26,#32 + mov v3.16b,v27.16b + mov w15,w27 + mov v7.16b,v28.16b + lsr x16,x27,#32 + mov v19.16b,v29.16b + mov w17,w28 + mov v2.16b,v26.16b + lsr x19,x28,#32 + mov v6.16b,v26.16b + mov w20,w30 + mov v18.16b,v26.16b + lsr x21,x30,#32 + + mov x4,#10 + subs x2,x2,#256 +Loop_neon: + sub x4,x4,#1 + add v0.4s,v0.4s,v1.4s + add w5,w5,w9 + add v4.4s,v4.4s,v5.4s + add w6,w6,w10 + add v16.4s,v16.4s,v17.4s + add w7,w7,w11 + eor v3.16b,v3.16b,v0.16b + add w8,w8,w12 + eor v7.16b,v7.16b,v4.16b + eor w17,w17,w5 + eor v19.16b,v19.16b,v16.16b + eor w19,w19,w6 + rev32 v3.8h,v3.8h + eor w20,w20,w7 + rev32 v7.8h,v7.8h + eor w21,w21,w8 + rev32 v19.8h,v19.8h + ror w17,w17,#16 + add v2.4s,v2.4s,v3.4s + ror w19,w19,#16 + add v6.4s,v6.4s,v7.4s + ror w20,w20,#16 + add v18.4s,v18.4s,v19.4s + ror w21,w21,#16 + eor v20.16b,v1.16b,v2.16b + add w13,w13,w17 + eor v21.16b,v5.16b,v6.16b + add w14,w14,w19 + eor v22.16b,v17.16b,v18.16b + add w15,w15,w20 + ushr v1.4s,v20.4s,#20 + add w16,w16,w21 + ushr v5.4s,v21.4s,#20 + eor w9,w9,w13 + ushr v17.4s,v22.4s,#20 + eor w10,w10,w14 + sli v1.4s,v20.4s,#12 + eor w11,w11,w15 + 
sli v5.4s,v21.4s,#12 + eor w12,w12,w16 + sli v17.4s,v22.4s,#12 + ror w9,w9,#20 + add v0.4s,v0.4s,v1.4s + ror w10,w10,#20 + add v4.4s,v4.4s,v5.4s + ror w11,w11,#20 + add v16.4s,v16.4s,v17.4s + ror w12,w12,#20 + eor v20.16b,v3.16b,v0.16b + add w5,w5,w9 + eor v21.16b,v7.16b,v4.16b + add w6,w6,w10 + eor v22.16b,v19.16b,v16.16b + add w7,w7,w11 + ushr v3.4s,v20.4s,#24 + add w8,w8,w12 + ushr v7.4s,v21.4s,#24 + eor w17,w17,w5 + ushr v19.4s,v22.4s,#24 + eor w19,w19,w6 + sli v3.4s,v20.4s,#8 + eor w20,w20,w7 + sli v7.4s,v21.4s,#8 + eor w21,w21,w8 + sli v19.4s,v22.4s,#8 + ror w17,w17,#24 + add v2.4s,v2.4s,v3.4s + ror w19,w19,#24 + add v6.4s,v6.4s,v7.4s + ror w20,w20,#24 + add v18.4s,v18.4s,v19.4s + ror w21,w21,#24 + eor v20.16b,v1.16b,v2.16b + add w13,w13,w17 + eor v21.16b,v5.16b,v6.16b + add w14,w14,w19 + eor v22.16b,v17.16b,v18.16b + add w15,w15,w20 + ushr v1.4s,v20.4s,#25 + add w16,w16,w21 + ushr v5.4s,v21.4s,#25 + eor w9,w9,w13 + ushr v17.4s,v22.4s,#25 + eor w10,w10,w14 + sli v1.4s,v20.4s,#7 + eor w11,w11,w15 + sli v5.4s,v21.4s,#7 + eor w12,w12,w16 + sli v17.4s,v22.4s,#7 + ror w9,w9,#25 + ext v2.16b,v2.16b,v2.16b,#8 + ror w10,w10,#25 + ext v6.16b,v6.16b,v6.16b,#8 + ror w11,w11,#25 + ext v18.16b,v18.16b,v18.16b,#8 + ror w12,w12,#25 + ext v3.16b,v3.16b,v3.16b,#12 + ext v7.16b,v7.16b,v7.16b,#12 + ext v19.16b,v19.16b,v19.16b,#12 + ext v1.16b,v1.16b,v1.16b,#4 + ext v5.16b,v5.16b,v5.16b,#4 + ext v17.16b,v17.16b,v17.16b,#4 + add v0.4s,v0.4s,v1.4s + add w5,w5,w10 + add v4.4s,v4.4s,v5.4s + add w6,w6,w11 + add v16.4s,v16.4s,v17.4s + add w7,w7,w12 + eor v3.16b,v3.16b,v0.16b + add w8,w8,w9 + eor v7.16b,v7.16b,v4.16b + eor w21,w21,w5 + eor v19.16b,v19.16b,v16.16b + eor w17,w17,w6 + rev32 v3.8h,v3.8h + eor w19,w19,w7 + rev32 v7.8h,v7.8h + eor w20,w20,w8 + rev32 v19.8h,v19.8h + ror w21,w21,#16 + add v2.4s,v2.4s,v3.4s + ror w17,w17,#16 + add v6.4s,v6.4s,v7.4s + ror w19,w19,#16 + add v18.4s,v18.4s,v19.4s + ror w20,w20,#16 + eor v20.16b,v1.16b,v2.16b + add w15,w15,w21 + eor v21.16b,v5.16b,v6.16b + add w16,w16,w17 + eor v22.16b,v17.16b,v18.16b + add w13,w13,w19 + ushr v1.4s,v20.4s,#20 + add w14,w14,w20 + ushr v5.4s,v21.4s,#20 + eor w10,w10,w15 + ushr v17.4s,v22.4s,#20 + eor w11,w11,w16 + sli v1.4s,v20.4s,#12 + eor w12,w12,w13 + sli v5.4s,v21.4s,#12 + eor w9,w9,w14 + sli v17.4s,v22.4s,#12 + ror w10,w10,#20 + add v0.4s,v0.4s,v1.4s + ror w11,w11,#20 + add v4.4s,v4.4s,v5.4s + ror w12,w12,#20 + add v16.4s,v16.4s,v17.4s + ror w9,w9,#20 + eor v20.16b,v3.16b,v0.16b + add w5,w5,w10 + eor v21.16b,v7.16b,v4.16b + add w6,w6,w11 + eor v22.16b,v19.16b,v16.16b + add w7,w7,w12 + ushr v3.4s,v20.4s,#24 + add w8,w8,w9 + ushr v7.4s,v21.4s,#24 + eor w21,w21,w5 + ushr v19.4s,v22.4s,#24 + eor w17,w17,w6 + sli v3.4s,v20.4s,#8 + eor w19,w19,w7 + sli v7.4s,v21.4s,#8 + eor w20,w20,w8 + sli v19.4s,v22.4s,#8 + ror w21,w21,#24 + add v2.4s,v2.4s,v3.4s + ror w17,w17,#24 + add v6.4s,v6.4s,v7.4s + ror w19,w19,#24 + add v18.4s,v18.4s,v19.4s + ror w20,w20,#24 + eor v20.16b,v1.16b,v2.16b + add w15,w15,w21 + eor v21.16b,v5.16b,v6.16b + add w16,w16,w17 + eor v22.16b,v17.16b,v18.16b + add w13,w13,w19 + ushr v1.4s,v20.4s,#25 + add w14,w14,w20 + ushr v5.4s,v21.4s,#25 + eor w10,w10,w15 + ushr v17.4s,v22.4s,#25 + eor w11,w11,w16 + sli v1.4s,v20.4s,#7 + eor w12,w12,w13 + sli v5.4s,v21.4s,#7 + eor w9,w9,w14 + sli v17.4s,v22.4s,#7 + ror w10,w10,#25 + ext v2.16b,v2.16b,v2.16b,#8 + ror w11,w11,#25 + ext v6.16b,v6.16b,v6.16b,#8 + ror w12,w12,#25 + ext v18.16b,v18.16b,v18.16b,#8 + ror w9,w9,#25 + ext v3.16b,v3.16b,v3.16b,#4 + ext v7.16b,v7.16b,v7.16b,#4 + ext 
v19.16b,v19.16b,v19.16b,#4 + ext v1.16b,v1.16b,v1.16b,#12 + ext v5.16b,v5.16b,v5.16b,#12 + ext v17.16b,v17.16b,v17.16b,#12 + cbnz x4,Loop_neon + + add w5,w5,w22 // accumulate key block + add v0.4s,v0.4s,v24.4s + add x6,x6,x22,lsr#32 + add v4.4s,v4.4s,v24.4s + add w7,w7,w23 + add v16.4s,v16.4s,v24.4s + add x8,x8,x23,lsr#32 + add v2.4s,v2.4s,v26.4s + add w9,w9,w24 + add v6.4s,v6.4s,v26.4s + add x10,x10,x24,lsr#32 + add v18.4s,v18.4s,v26.4s + add w11,w11,w25 + add v3.4s,v3.4s,v27.4s + add x12,x12,x25,lsr#32 + add w13,w13,w26 + add v7.4s,v7.4s,v28.4s + add x14,x14,x26,lsr#32 + add w15,w15,w27 + add v19.4s,v19.4s,v29.4s + add x16,x16,x27,lsr#32 + add w17,w17,w28 + add v1.4s,v1.4s,v25.4s + add x19,x19,x28,lsr#32 + add w20,w20,w30 + add v5.4s,v5.4s,v25.4s + add x21,x21,x30,lsr#32 + add v17.4s,v17.4s,v25.4s + + b.lo Ltail_neon + + add x5,x5,x6,lsl#32 // pack + add x7,x7,x8,lsl#32 + ldp x6,x8,[x1,#0] // load input + add x9,x9,x10,lsl#32 + add x11,x11,x12,lsl#32 + ldp x10,x12,[x1,#16] + add x13,x13,x14,lsl#32 + add x15,x15,x16,lsl#32 + ldp x14,x16,[x1,#32] + add x17,x17,x19,lsl#32 + add x20,x20,x21,lsl#32 + ldp x19,x21,[x1,#48] + add x1,x1,#64 +#ifdef __AARCH64EB__ + rev x5,x5 + rev x7,x7 + rev x9,x9 + rev x11,x11 + rev x13,x13 + rev x15,x15 + rev x17,x17 + rev x20,x20 +#endif + ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 + eor x5,x5,x6 + eor x7,x7,x8 + eor x9,x9,x10 + eor x11,x11,x12 + eor x13,x13,x14 + eor v0.16b,v0.16b,v20.16b + eor x15,x15,x16 + eor v1.16b,v1.16b,v21.16b + eor x17,x17,x19 + eor v2.16b,v2.16b,v22.16b + eor x20,x20,x21 + eor v3.16b,v3.16b,v23.16b + ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 + + stp x5,x7,[x0,#0] // store output + add x28,x28,#4 // increment counter + stp x9,x11,[x0,#16] + add v27.4s,v27.4s,v31.4s // += 4 + stp x13,x15,[x0,#32] + add v28.4s,v28.4s,v31.4s + stp x17,x20,[x0,#48] + add v29.4s,v29.4s,v31.4s + add x0,x0,#64 + + st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 + ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 + + eor v4.16b,v4.16b,v20.16b + eor v5.16b,v5.16b,v21.16b + eor v6.16b,v6.16b,v22.16b + eor v7.16b,v7.16b,v23.16b + st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 + + eor v16.16b,v16.16b,v0.16b + eor v17.16b,v17.16b,v1.16b + eor v18.16b,v18.16b,v2.16b + eor v19.16b,v19.16b,v3.16b + st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64 + + b.hi Loop_outer_neon + + ldp x19,x20,[x29,#16] + add sp,sp,#64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +Ltail_neon: + add x2,x2,#256 + cmp x2,#64 + b.lo Less_than_64 + + add x5,x5,x6,lsl#32 // pack + add x7,x7,x8,lsl#32 + ldp x6,x8,[x1,#0] // load input + add x9,x9,x10,lsl#32 + add x11,x11,x12,lsl#32 + ldp x10,x12,[x1,#16] + add x13,x13,x14,lsl#32 + add x15,x15,x16,lsl#32 + ldp x14,x16,[x1,#32] + add x17,x17,x19,lsl#32 + add x20,x20,x21,lsl#32 + ldp x19,x21,[x1,#48] + add x1,x1,#64 +#ifdef __AARCH64EB__ + rev x5,x5 + rev x7,x7 + rev x9,x9 + rev x11,x11 + rev x13,x13 + rev x15,x15 + rev x17,x17 + rev x20,x20 +#endif + eor x5,x5,x6 + eor x7,x7,x8 + eor x9,x9,x10 + eor x11,x11,x12 + eor x13,x13,x14 + eor x15,x15,x16 + eor x17,x17,x19 + eor x20,x20,x21 + + stp x5,x7,[x0,#0] // store output + add x28,x28,#4 // increment counter + stp x9,x11,[x0,#16] + stp x13,x15,[x0,#32] + stp x17,x20,[x0,#48] + add x0,x0,#64 + b.eq Ldone_neon + sub x2,x2,#64 + cmp x2,#64 + b.lo Less_than_128 + + ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 + eor v0.16b,v0.16b,v20.16b + eor v1.16b,v1.16b,v21.16b + eor v2.16b,v2.16b,v22.16b + eor 
v3.16b,v3.16b,v23.16b + st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 + b.eq Ldone_neon + sub x2,x2,#64 + cmp x2,#64 + b.lo Less_than_192 + + ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 + eor v4.16b,v4.16b,v20.16b + eor v5.16b,v5.16b,v21.16b + eor v6.16b,v6.16b,v22.16b + eor v7.16b,v7.16b,v23.16b + st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 + b.eq Ldone_neon + sub x2,x2,#64 + + st1 {v16.16b,v17.16b,v18.16b,v19.16b},[sp] + b Last_neon + +Less_than_128: + st1 {v0.16b,v1.16b,v2.16b,v3.16b},[sp] + b Last_neon +Less_than_192: + st1 {v4.16b,v5.16b,v6.16b,v7.16b},[sp] + b Last_neon + +.align 4 +Last_neon: + sub x0,x0,#1 + add x1,x1,x2 + add x0,x0,x2 + add x4,sp,x2 + neg x2,x2 + +Loop_tail_neon: + ldrb w10,[x1,x2] + ldrb w11,[x4,x2] + add x2,x2,#1 + eor w10,w10,w11 + strb w10,[x0,x2] + cbnz x2,Loop_tail_neon + + stp xzr,xzr,[sp,#0] + stp xzr,xzr,[sp,#16] + stp xzr,xzr,[sp,#32] + stp xzr,xzr,[sp,#48] + +Ldone_neon: + ldp x19,x20,[x29,#16] + add sp,sp,#64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + AARCH64_VALIDATE_LINK_REGISTER + ret + + +.align 5 +ChaCha20_512_neon: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-96]! + add x29,sp,#0 + + adrp x5,Lsigma@PAGE + add x5,x5,Lsigma@PAGEOFF + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + +L512_or_more_neon: + sub sp,sp,#128+64 + + ldp x22,x23,[x5] // load sigma + ld1 {v24.4s},[x5],#16 + ldp x24,x25,[x3] // load key + ldp x26,x27,[x3,#16] + ld1 {v25.4s,v26.4s},[x3] + ldp x28,x30,[x4] // load counter + ld1 {v27.4s},[x4] + ld1 {v31.4s},[x5] +#ifdef __AARCH64EB__ + rev64 v24.4s,v24.4s + ror x24,x24,#32 + ror x25,x25,#32 + ror x26,x26,#32 + ror x27,x27,#32 + ror x28,x28,#32 + ror x30,x30,#32 +#endif + add v27.4s,v27.4s,v31.4s // += 1 + stp q24,q25,[sp,#0] // off-load key block, invariant part + add v27.4s,v27.4s,v31.4s // not typo + str q26,[sp,#32] + add v28.4s,v27.4s,v31.4s + add v29.4s,v28.4s,v31.4s + add v30.4s,v29.4s,v31.4s + shl v31.4s,v31.4s,#2 // 1 -> 4 + + stp d8,d9,[sp,#128+0] // meet ABI requirements + stp d10,d11,[sp,#128+16] + stp d12,d13,[sp,#128+32] + stp d14,d15,[sp,#128+48] + + sub x2,x2,#512 // not typo + +Loop_outer_512_neon: + mov v0.16b,v24.16b + mov v4.16b,v24.16b + mov v8.16b,v24.16b + mov v12.16b,v24.16b + mov v16.16b,v24.16b + mov v20.16b,v24.16b + mov v1.16b,v25.16b + mov w5,w22 // unpack key block + mov v5.16b,v25.16b + lsr x6,x22,#32 + mov v9.16b,v25.16b + mov w7,w23 + mov v13.16b,v25.16b + lsr x8,x23,#32 + mov v17.16b,v25.16b + mov w9,w24 + mov v21.16b,v25.16b + lsr x10,x24,#32 + mov v3.16b,v27.16b + mov w11,w25 + mov v7.16b,v28.16b + lsr x12,x25,#32 + mov v11.16b,v29.16b + mov w13,w26 + mov v15.16b,v30.16b + lsr x14,x26,#32 + mov v2.16b,v26.16b + mov w15,w27 + mov v6.16b,v26.16b + lsr x16,x27,#32 + add v19.4s,v3.4s,v31.4s // +4 + mov w17,w28 + add v23.4s,v7.4s,v31.4s // +4 + lsr x19,x28,#32 + mov v10.16b,v26.16b + mov w20,w30 + mov v14.16b,v26.16b + lsr x21,x30,#32 + mov v18.16b,v26.16b + stp q27,q28,[sp,#48] // off-load key block, variable part + mov v22.16b,v26.16b + str q29,[sp,#80] + + mov x4,#5 + subs x2,x2,#512 +Loop_upper_neon: + sub x4,x4,#1 + add v0.4s,v0.4s,v1.4s + add w5,w5,w9 + add v4.4s,v4.4s,v5.4s + add w6,w6,w10 + add v8.4s,v8.4s,v9.4s + add w7,w7,w11 + add v12.4s,v12.4s,v13.4s + add w8,w8,w12 + add v16.4s,v16.4s,v17.4s + eor w17,w17,w5 + add v20.4s,v20.4s,v21.4s + eor w19,w19,w6 + eor v3.16b,v3.16b,v0.16b + eor w20,w20,w7 + eor v7.16b,v7.16b,v4.16b + eor w21,w21,w8 + 
eor v11.16b,v11.16b,v8.16b + ror w17,w17,#16 + eor v15.16b,v15.16b,v12.16b + ror w19,w19,#16 + eor v19.16b,v19.16b,v16.16b + ror w20,w20,#16 + eor v23.16b,v23.16b,v20.16b + ror w21,w21,#16 + rev32 v3.8h,v3.8h + add w13,w13,w17 + rev32 v7.8h,v7.8h + add w14,w14,w19 + rev32 v11.8h,v11.8h + add w15,w15,w20 + rev32 v15.8h,v15.8h + add w16,w16,w21 + rev32 v19.8h,v19.8h + eor w9,w9,w13 + rev32 v23.8h,v23.8h + eor w10,w10,w14 + add v2.4s,v2.4s,v3.4s + eor w11,w11,w15 + add v6.4s,v6.4s,v7.4s + eor w12,w12,w16 + add v10.4s,v10.4s,v11.4s + ror w9,w9,#20 + add v14.4s,v14.4s,v15.4s + ror w10,w10,#20 + add v18.4s,v18.4s,v19.4s + ror w11,w11,#20 + add v22.4s,v22.4s,v23.4s + ror w12,w12,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w9 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w10 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w11 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w12 + eor v28.16b,v17.16b,v18.16b + eor w17,w17,w5 + eor v29.16b,v21.16b,v22.16b + eor w19,w19,w6 + ushr v1.4s,v24.4s,#20 + eor w20,w20,w7 + ushr v5.4s,v25.4s,#20 + eor w21,w21,w8 + ushr v9.4s,v26.4s,#20 + ror w17,w17,#24 + ushr v13.4s,v27.4s,#20 + ror w19,w19,#24 + ushr v17.4s,v28.4s,#20 + ror w20,w20,#24 + ushr v21.4s,v29.4s,#20 + ror w21,w21,#24 + sli v1.4s,v24.4s,#12 + add w13,w13,w17 + sli v5.4s,v25.4s,#12 + add w14,w14,w19 + sli v9.4s,v26.4s,#12 + add w15,w15,w20 + sli v13.4s,v27.4s,#12 + add w16,w16,w21 + sli v17.4s,v28.4s,#12 + eor w9,w9,w13 + sli v21.4s,v29.4s,#12 + eor w10,w10,w14 + add v0.4s,v0.4s,v1.4s + eor w11,w11,w15 + add v4.4s,v4.4s,v5.4s + eor w12,w12,w16 + add v8.4s,v8.4s,v9.4s + ror w9,w9,#25 + add v12.4s,v12.4s,v13.4s + ror w10,w10,#25 + add v16.4s,v16.4s,v17.4s + ror w11,w11,#25 + add v20.4s,v20.4s,v21.4s + ror w12,w12,#25 + eor v24.16b,v3.16b,v0.16b + add w5,w5,w10 + eor v25.16b,v7.16b,v4.16b + add w6,w6,w11 + eor v26.16b,v11.16b,v8.16b + add w7,w7,w12 + eor v27.16b,v15.16b,v12.16b + add w8,w8,w9 + eor v28.16b,v19.16b,v16.16b + eor w21,w21,w5 + eor v29.16b,v23.16b,v20.16b + eor w17,w17,w6 + ushr v3.4s,v24.4s,#24 + eor w19,w19,w7 + ushr v7.4s,v25.4s,#24 + eor w20,w20,w8 + ushr v11.4s,v26.4s,#24 + ror w21,w21,#16 + ushr v15.4s,v27.4s,#24 + ror w17,w17,#16 + ushr v19.4s,v28.4s,#24 + ror w19,w19,#16 + ushr v23.4s,v29.4s,#24 + ror w20,w20,#16 + sli v3.4s,v24.4s,#8 + add w15,w15,w21 + sli v7.4s,v25.4s,#8 + add w16,w16,w17 + sli v11.4s,v26.4s,#8 + add w13,w13,w19 + sli v15.4s,v27.4s,#8 + add w14,w14,w20 + sli v19.4s,v28.4s,#8 + eor w10,w10,w15 + sli v23.4s,v29.4s,#8 + eor w11,w11,w16 + add v2.4s,v2.4s,v3.4s + eor w12,w12,w13 + add v6.4s,v6.4s,v7.4s + eor w9,w9,w14 + add v10.4s,v10.4s,v11.4s + ror w10,w10,#20 + add v14.4s,v14.4s,v15.4s + ror w11,w11,#20 + add v18.4s,v18.4s,v19.4s + ror w12,w12,#20 + add v22.4s,v22.4s,v23.4s + ror w9,w9,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w10 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w11 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w12 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w9 + eor v28.16b,v17.16b,v18.16b + eor w21,w21,w5 + eor v29.16b,v21.16b,v22.16b + eor w17,w17,w6 + ushr v1.4s,v24.4s,#25 + eor w19,w19,w7 + ushr v5.4s,v25.4s,#25 + eor w20,w20,w8 + ushr v9.4s,v26.4s,#25 + ror w21,w21,#24 + ushr v13.4s,v27.4s,#25 + ror w17,w17,#24 + ushr v17.4s,v28.4s,#25 + ror w19,w19,#24 + ushr v21.4s,v29.4s,#25 + ror w20,w20,#24 + sli v1.4s,v24.4s,#7 + add w15,w15,w21 + sli v5.4s,v25.4s,#7 + add w16,w16,w17 + sli v9.4s,v26.4s,#7 + add w13,w13,w19 + sli v13.4s,v27.4s,#7 + add w14,w14,w20 + sli v17.4s,v28.4s,#7 + eor w10,w10,w15 + sli v21.4s,v29.4s,#7 + eor w11,w11,w16 + ext v2.16b,v2.16b,v2.16b,#8 + eor 
w12,w12,w13 + ext v6.16b,v6.16b,v6.16b,#8 + eor w9,w9,w14 + ext v10.16b,v10.16b,v10.16b,#8 + ror w10,w10,#25 + ext v14.16b,v14.16b,v14.16b,#8 + ror w11,w11,#25 + ext v18.16b,v18.16b,v18.16b,#8 + ror w12,w12,#25 + ext v22.16b,v22.16b,v22.16b,#8 + ror w9,w9,#25 + ext v3.16b,v3.16b,v3.16b,#12 + ext v7.16b,v7.16b,v7.16b,#12 + ext v11.16b,v11.16b,v11.16b,#12 + ext v15.16b,v15.16b,v15.16b,#12 + ext v19.16b,v19.16b,v19.16b,#12 + ext v23.16b,v23.16b,v23.16b,#12 + ext v1.16b,v1.16b,v1.16b,#4 + ext v5.16b,v5.16b,v5.16b,#4 + ext v9.16b,v9.16b,v9.16b,#4 + ext v13.16b,v13.16b,v13.16b,#4 + ext v17.16b,v17.16b,v17.16b,#4 + ext v21.16b,v21.16b,v21.16b,#4 + add v0.4s,v0.4s,v1.4s + add w5,w5,w9 + add v4.4s,v4.4s,v5.4s + add w6,w6,w10 + add v8.4s,v8.4s,v9.4s + add w7,w7,w11 + add v12.4s,v12.4s,v13.4s + add w8,w8,w12 + add v16.4s,v16.4s,v17.4s + eor w17,w17,w5 + add v20.4s,v20.4s,v21.4s + eor w19,w19,w6 + eor v3.16b,v3.16b,v0.16b + eor w20,w20,w7 + eor v7.16b,v7.16b,v4.16b + eor w21,w21,w8 + eor v11.16b,v11.16b,v8.16b + ror w17,w17,#16 + eor v15.16b,v15.16b,v12.16b + ror w19,w19,#16 + eor v19.16b,v19.16b,v16.16b + ror w20,w20,#16 + eor v23.16b,v23.16b,v20.16b + ror w21,w21,#16 + rev32 v3.8h,v3.8h + add w13,w13,w17 + rev32 v7.8h,v7.8h + add w14,w14,w19 + rev32 v11.8h,v11.8h + add w15,w15,w20 + rev32 v15.8h,v15.8h + add w16,w16,w21 + rev32 v19.8h,v19.8h + eor w9,w9,w13 + rev32 v23.8h,v23.8h + eor w10,w10,w14 + add v2.4s,v2.4s,v3.4s + eor w11,w11,w15 + add v6.4s,v6.4s,v7.4s + eor w12,w12,w16 + add v10.4s,v10.4s,v11.4s + ror w9,w9,#20 + add v14.4s,v14.4s,v15.4s + ror w10,w10,#20 + add v18.4s,v18.4s,v19.4s + ror w11,w11,#20 + add v22.4s,v22.4s,v23.4s + ror w12,w12,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w9 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w10 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w11 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w12 + eor v28.16b,v17.16b,v18.16b + eor w17,w17,w5 + eor v29.16b,v21.16b,v22.16b + eor w19,w19,w6 + ushr v1.4s,v24.4s,#20 + eor w20,w20,w7 + ushr v5.4s,v25.4s,#20 + eor w21,w21,w8 + ushr v9.4s,v26.4s,#20 + ror w17,w17,#24 + ushr v13.4s,v27.4s,#20 + ror w19,w19,#24 + ushr v17.4s,v28.4s,#20 + ror w20,w20,#24 + ushr v21.4s,v29.4s,#20 + ror w21,w21,#24 + sli v1.4s,v24.4s,#12 + add w13,w13,w17 + sli v5.4s,v25.4s,#12 + add w14,w14,w19 + sli v9.4s,v26.4s,#12 + add w15,w15,w20 + sli v13.4s,v27.4s,#12 + add w16,w16,w21 + sli v17.4s,v28.4s,#12 + eor w9,w9,w13 + sli v21.4s,v29.4s,#12 + eor w10,w10,w14 + add v0.4s,v0.4s,v1.4s + eor w11,w11,w15 + add v4.4s,v4.4s,v5.4s + eor w12,w12,w16 + add v8.4s,v8.4s,v9.4s + ror w9,w9,#25 + add v12.4s,v12.4s,v13.4s + ror w10,w10,#25 + add v16.4s,v16.4s,v17.4s + ror w11,w11,#25 + add v20.4s,v20.4s,v21.4s + ror w12,w12,#25 + eor v24.16b,v3.16b,v0.16b + add w5,w5,w10 + eor v25.16b,v7.16b,v4.16b + add w6,w6,w11 + eor v26.16b,v11.16b,v8.16b + add w7,w7,w12 + eor v27.16b,v15.16b,v12.16b + add w8,w8,w9 + eor v28.16b,v19.16b,v16.16b + eor w21,w21,w5 + eor v29.16b,v23.16b,v20.16b + eor w17,w17,w6 + ushr v3.4s,v24.4s,#24 + eor w19,w19,w7 + ushr v7.4s,v25.4s,#24 + eor w20,w20,w8 + ushr v11.4s,v26.4s,#24 + ror w21,w21,#16 + ushr v15.4s,v27.4s,#24 + ror w17,w17,#16 + ushr v19.4s,v28.4s,#24 + ror w19,w19,#16 + ushr v23.4s,v29.4s,#24 + ror w20,w20,#16 + sli v3.4s,v24.4s,#8 + add w15,w15,w21 + sli v7.4s,v25.4s,#8 + add w16,w16,w17 + sli v11.4s,v26.4s,#8 + add w13,w13,w19 + sli v15.4s,v27.4s,#8 + add w14,w14,w20 + sli v19.4s,v28.4s,#8 + eor w10,w10,w15 + sli v23.4s,v29.4s,#8 + eor w11,w11,w16 + add v2.4s,v2.4s,v3.4s + eor w12,w12,w13 + add v6.4s,v6.4s,v7.4s + eor w9,w9,w14 
+ add v10.4s,v10.4s,v11.4s + ror w10,w10,#20 + add v14.4s,v14.4s,v15.4s + ror w11,w11,#20 + add v18.4s,v18.4s,v19.4s + ror w12,w12,#20 + add v22.4s,v22.4s,v23.4s + ror w9,w9,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w10 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w11 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w12 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w9 + eor v28.16b,v17.16b,v18.16b + eor w21,w21,w5 + eor v29.16b,v21.16b,v22.16b + eor w17,w17,w6 + ushr v1.4s,v24.4s,#25 + eor w19,w19,w7 + ushr v5.4s,v25.4s,#25 + eor w20,w20,w8 + ushr v9.4s,v26.4s,#25 + ror w21,w21,#24 + ushr v13.4s,v27.4s,#25 + ror w17,w17,#24 + ushr v17.4s,v28.4s,#25 + ror w19,w19,#24 + ushr v21.4s,v29.4s,#25 + ror w20,w20,#24 + sli v1.4s,v24.4s,#7 + add w15,w15,w21 + sli v5.4s,v25.4s,#7 + add w16,w16,w17 + sli v9.4s,v26.4s,#7 + add w13,w13,w19 + sli v13.4s,v27.4s,#7 + add w14,w14,w20 + sli v17.4s,v28.4s,#7 + eor w10,w10,w15 + sli v21.4s,v29.4s,#7 + eor w11,w11,w16 + ext v2.16b,v2.16b,v2.16b,#8 + eor w12,w12,w13 + ext v6.16b,v6.16b,v6.16b,#8 + eor w9,w9,w14 + ext v10.16b,v10.16b,v10.16b,#8 + ror w10,w10,#25 + ext v14.16b,v14.16b,v14.16b,#8 + ror w11,w11,#25 + ext v18.16b,v18.16b,v18.16b,#8 + ror w12,w12,#25 + ext v22.16b,v22.16b,v22.16b,#8 + ror w9,w9,#25 + ext v3.16b,v3.16b,v3.16b,#4 + ext v7.16b,v7.16b,v7.16b,#4 + ext v11.16b,v11.16b,v11.16b,#4 + ext v15.16b,v15.16b,v15.16b,#4 + ext v19.16b,v19.16b,v19.16b,#4 + ext v23.16b,v23.16b,v23.16b,#4 + ext v1.16b,v1.16b,v1.16b,#12 + ext v5.16b,v5.16b,v5.16b,#12 + ext v9.16b,v9.16b,v9.16b,#12 + ext v13.16b,v13.16b,v13.16b,#12 + ext v17.16b,v17.16b,v17.16b,#12 + ext v21.16b,v21.16b,v21.16b,#12 + cbnz x4,Loop_upper_neon + + add w5,w5,w22 // accumulate key block + add x6,x6,x22,lsr#32 + add w7,w7,w23 + add x8,x8,x23,lsr#32 + add w9,w9,w24 + add x10,x10,x24,lsr#32 + add w11,w11,w25 + add x12,x12,x25,lsr#32 + add w13,w13,w26 + add x14,x14,x26,lsr#32 + add w15,w15,w27 + add x16,x16,x27,lsr#32 + add w17,w17,w28 + add x19,x19,x28,lsr#32 + add w20,w20,w30 + add x21,x21,x30,lsr#32 + + add x5,x5,x6,lsl#32 // pack + add x7,x7,x8,lsl#32 + ldp x6,x8,[x1,#0] // load input + add x9,x9,x10,lsl#32 + add x11,x11,x12,lsl#32 + ldp x10,x12,[x1,#16] + add x13,x13,x14,lsl#32 + add x15,x15,x16,lsl#32 + ldp x14,x16,[x1,#32] + add x17,x17,x19,lsl#32 + add x20,x20,x21,lsl#32 + ldp x19,x21,[x1,#48] + add x1,x1,#64 +#ifdef __AARCH64EB__ + rev x5,x5 + rev x7,x7 + rev x9,x9 + rev x11,x11 + rev x13,x13 + rev x15,x15 + rev x17,x17 + rev x20,x20 +#endif + eor x5,x5,x6 + eor x7,x7,x8 + eor x9,x9,x10 + eor x11,x11,x12 + eor x13,x13,x14 + eor x15,x15,x16 + eor x17,x17,x19 + eor x20,x20,x21 + + stp x5,x7,[x0,#0] // store output + add x28,x28,#1 // increment counter + mov w5,w22 // unpack key block + lsr x6,x22,#32 + stp x9,x11,[x0,#16] + mov w7,w23 + lsr x8,x23,#32 + stp x13,x15,[x0,#32] + mov w9,w24 + lsr x10,x24,#32 + stp x17,x20,[x0,#48] + add x0,x0,#64 + mov w11,w25 + lsr x12,x25,#32 + mov w13,w26 + lsr x14,x26,#32 + mov w15,w27 + lsr x16,x27,#32 + mov w17,w28 + lsr x19,x28,#32 + mov w20,w30 + lsr x21,x30,#32 + + mov x4,#5 +Loop_lower_neon: + sub x4,x4,#1 + add v0.4s,v0.4s,v1.4s + add w5,w5,w9 + add v4.4s,v4.4s,v5.4s + add w6,w6,w10 + add v8.4s,v8.4s,v9.4s + add w7,w7,w11 + add v12.4s,v12.4s,v13.4s + add w8,w8,w12 + add v16.4s,v16.4s,v17.4s + eor w17,w17,w5 + add v20.4s,v20.4s,v21.4s + eor w19,w19,w6 + eor v3.16b,v3.16b,v0.16b + eor w20,w20,w7 + eor v7.16b,v7.16b,v4.16b + eor w21,w21,w8 + eor v11.16b,v11.16b,v8.16b + ror w17,w17,#16 + eor v15.16b,v15.16b,v12.16b + ror w19,w19,#16 + eor v19.16b,v19.16b,v16.16b + ror 
w20,w20,#16 + eor v23.16b,v23.16b,v20.16b + ror w21,w21,#16 + rev32 v3.8h,v3.8h + add w13,w13,w17 + rev32 v7.8h,v7.8h + add w14,w14,w19 + rev32 v11.8h,v11.8h + add w15,w15,w20 + rev32 v15.8h,v15.8h + add w16,w16,w21 + rev32 v19.8h,v19.8h + eor w9,w9,w13 + rev32 v23.8h,v23.8h + eor w10,w10,w14 + add v2.4s,v2.4s,v3.4s + eor w11,w11,w15 + add v6.4s,v6.4s,v7.4s + eor w12,w12,w16 + add v10.4s,v10.4s,v11.4s + ror w9,w9,#20 + add v14.4s,v14.4s,v15.4s + ror w10,w10,#20 + add v18.4s,v18.4s,v19.4s + ror w11,w11,#20 + add v22.4s,v22.4s,v23.4s + ror w12,w12,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w9 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w10 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w11 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w12 + eor v28.16b,v17.16b,v18.16b + eor w17,w17,w5 + eor v29.16b,v21.16b,v22.16b + eor w19,w19,w6 + ushr v1.4s,v24.4s,#20 + eor w20,w20,w7 + ushr v5.4s,v25.4s,#20 + eor w21,w21,w8 + ushr v9.4s,v26.4s,#20 + ror w17,w17,#24 + ushr v13.4s,v27.4s,#20 + ror w19,w19,#24 + ushr v17.4s,v28.4s,#20 + ror w20,w20,#24 + ushr v21.4s,v29.4s,#20 + ror w21,w21,#24 + sli v1.4s,v24.4s,#12 + add w13,w13,w17 + sli v5.4s,v25.4s,#12 + add w14,w14,w19 + sli v9.4s,v26.4s,#12 + add w15,w15,w20 + sli v13.4s,v27.4s,#12 + add w16,w16,w21 + sli v17.4s,v28.4s,#12 + eor w9,w9,w13 + sli v21.4s,v29.4s,#12 + eor w10,w10,w14 + add v0.4s,v0.4s,v1.4s + eor w11,w11,w15 + add v4.4s,v4.4s,v5.4s + eor w12,w12,w16 + add v8.4s,v8.4s,v9.4s + ror w9,w9,#25 + add v12.4s,v12.4s,v13.4s + ror w10,w10,#25 + add v16.4s,v16.4s,v17.4s + ror w11,w11,#25 + add v20.4s,v20.4s,v21.4s + ror w12,w12,#25 + eor v24.16b,v3.16b,v0.16b + add w5,w5,w10 + eor v25.16b,v7.16b,v4.16b + add w6,w6,w11 + eor v26.16b,v11.16b,v8.16b + add w7,w7,w12 + eor v27.16b,v15.16b,v12.16b + add w8,w8,w9 + eor v28.16b,v19.16b,v16.16b + eor w21,w21,w5 + eor v29.16b,v23.16b,v20.16b + eor w17,w17,w6 + ushr v3.4s,v24.4s,#24 + eor w19,w19,w7 + ushr v7.4s,v25.4s,#24 + eor w20,w20,w8 + ushr v11.4s,v26.4s,#24 + ror w21,w21,#16 + ushr v15.4s,v27.4s,#24 + ror w17,w17,#16 + ushr v19.4s,v28.4s,#24 + ror w19,w19,#16 + ushr v23.4s,v29.4s,#24 + ror w20,w20,#16 + sli v3.4s,v24.4s,#8 + add w15,w15,w21 + sli v7.4s,v25.4s,#8 + add w16,w16,w17 + sli v11.4s,v26.4s,#8 + add w13,w13,w19 + sli v15.4s,v27.4s,#8 + add w14,w14,w20 + sli v19.4s,v28.4s,#8 + eor w10,w10,w15 + sli v23.4s,v29.4s,#8 + eor w11,w11,w16 + add v2.4s,v2.4s,v3.4s + eor w12,w12,w13 + add v6.4s,v6.4s,v7.4s + eor w9,w9,w14 + add v10.4s,v10.4s,v11.4s + ror w10,w10,#20 + add v14.4s,v14.4s,v15.4s + ror w11,w11,#20 + add v18.4s,v18.4s,v19.4s + ror w12,w12,#20 + add v22.4s,v22.4s,v23.4s + ror w9,w9,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w10 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w11 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w12 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w9 + eor v28.16b,v17.16b,v18.16b + eor w21,w21,w5 + eor v29.16b,v21.16b,v22.16b + eor w17,w17,w6 + ushr v1.4s,v24.4s,#25 + eor w19,w19,w7 + ushr v5.4s,v25.4s,#25 + eor w20,w20,w8 + ushr v9.4s,v26.4s,#25 + ror w21,w21,#24 + ushr v13.4s,v27.4s,#25 + ror w17,w17,#24 + ushr v17.4s,v28.4s,#25 + ror w19,w19,#24 + ushr v21.4s,v29.4s,#25 + ror w20,w20,#24 + sli v1.4s,v24.4s,#7 + add w15,w15,w21 + sli v5.4s,v25.4s,#7 + add w16,w16,w17 + sli v9.4s,v26.4s,#7 + add w13,w13,w19 + sli v13.4s,v27.4s,#7 + add w14,w14,w20 + sli v17.4s,v28.4s,#7 + eor w10,w10,w15 + sli v21.4s,v29.4s,#7 + eor w11,w11,w16 + ext v2.16b,v2.16b,v2.16b,#8 + eor w12,w12,w13 + ext v6.16b,v6.16b,v6.16b,#8 + eor w9,w9,w14 + ext v10.16b,v10.16b,v10.16b,#8 + ror w10,w10,#25 + ext 
v14.16b,v14.16b,v14.16b,#8 + ror w11,w11,#25 + ext v18.16b,v18.16b,v18.16b,#8 + ror w12,w12,#25 + ext v22.16b,v22.16b,v22.16b,#8 + ror w9,w9,#25 + ext v3.16b,v3.16b,v3.16b,#12 + ext v7.16b,v7.16b,v7.16b,#12 + ext v11.16b,v11.16b,v11.16b,#12 + ext v15.16b,v15.16b,v15.16b,#12 + ext v19.16b,v19.16b,v19.16b,#12 + ext v23.16b,v23.16b,v23.16b,#12 + ext v1.16b,v1.16b,v1.16b,#4 + ext v5.16b,v5.16b,v5.16b,#4 + ext v9.16b,v9.16b,v9.16b,#4 + ext v13.16b,v13.16b,v13.16b,#4 + ext v17.16b,v17.16b,v17.16b,#4 + ext v21.16b,v21.16b,v21.16b,#4 + add v0.4s,v0.4s,v1.4s + add w5,w5,w9 + add v4.4s,v4.4s,v5.4s + add w6,w6,w10 + add v8.4s,v8.4s,v9.4s + add w7,w7,w11 + add v12.4s,v12.4s,v13.4s + add w8,w8,w12 + add v16.4s,v16.4s,v17.4s + eor w17,w17,w5 + add v20.4s,v20.4s,v21.4s + eor w19,w19,w6 + eor v3.16b,v3.16b,v0.16b + eor w20,w20,w7 + eor v7.16b,v7.16b,v4.16b + eor w21,w21,w8 + eor v11.16b,v11.16b,v8.16b + ror w17,w17,#16 + eor v15.16b,v15.16b,v12.16b + ror w19,w19,#16 + eor v19.16b,v19.16b,v16.16b + ror w20,w20,#16 + eor v23.16b,v23.16b,v20.16b + ror w21,w21,#16 + rev32 v3.8h,v3.8h + add w13,w13,w17 + rev32 v7.8h,v7.8h + add w14,w14,w19 + rev32 v11.8h,v11.8h + add w15,w15,w20 + rev32 v15.8h,v15.8h + add w16,w16,w21 + rev32 v19.8h,v19.8h + eor w9,w9,w13 + rev32 v23.8h,v23.8h + eor w10,w10,w14 + add v2.4s,v2.4s,v3.4s + eor w11,w11,w15 + add v6.4s,v6.4s,v7.4s + eor w12,w12,w16 + add v10.4s,v10.4s,v11.4s + ror w9,w9,#20 + add v14.4s,v14.4s,v15.4s + ror w10,w10,#20 + add v18.4s,v18.4s,v19.4s + ror w11,w11,#20 + add v22.4s,v22.4s,v23.4s + ror w12,w12,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w9 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w10 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w11 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w12 + eor v28.16b,v17.16b,v18.16b + eor w17,w17,w5 + eor v29.16b,v21.16b,v22.16b + eor w19,w19,w6 + ushr v1.4s,v24.4s,#20 + eor w20,w20,w7 + ushr v5.4s,v25.4s,#20 + eor w21,w21,w8 + ushr v9.4s,v26.4s,#20 + ror w17,w17,#24 + ushr v13.4s,v27.4s,#20 + ror w19,w19,#24 + ushr v17.4s,v28.4s,#20 + ror w20,w20,#24 + ushr v21.4s,v29.4s,#20 + ror w21,w21,#24 + sli v1.4s,v24.4s,#12 + add w13,w13,w17 + sli v5.4s,v25.4s,#12 + add w14,w14,w19 + sli v9.4s,v26.4s,#12 + add w15,w15,w20 + sli v13.4s,v27.4s,#12 + add w16,w16,w21 + sli v17.4s,v28.4s,#12 + eor w9,w9,w13 + sli v21.4s,v29.4s,#12 + eor w10,w10,w14 + add v0.4s,v0.4s,v1.4s + eor w11,w11,w15 + add v4.4s,v4.4s,v5.4s + eor w12,w12,w16 + add v8.4s,v8.4s,v9.4s + ror w9,w9,#25 + add v12.4s,v12.4s,v13.4s + ror w10,w10,#25 + add v16.4s,v16.4s,v17.4s + ror w11,w11,#25 + add v20.4s,v20.4s,v21.4s + ror w12,w12,#25 + eor v24.16b,v3.16b,v0.16b + add w5,w5,w10 + eor v25.16b,v7.16b,v4.16b + add w6,w6,w11 + eor v26.16b,v11.16b,v8.16b + add w7,w7,w12 + eor v27.16b,v15.16b,v12.16b + add w8,w8,w9 + eor v28.16b,v19.16b,v16.16b + eor w21,w21,w5 + eor v29.16b,v23.16b,v20.16b + eor w17,w17,w6 + ushr v3.4s,v24.4s,#24 + eor w19,w19,w7 + ushr v7.4s,v25.4s,#24 + eor w20,w20,w8 + ushr v11.4s,v26.4s,#24 + ror w21,w21,#16 + ushr v15.4s,v27.4s,#24 + ror w17,w17,#16 + ushr v19.4s,v28.4s,#24 + ror w19,w19,#16 + ushr v23.4s,v29.4s,#24 + ror w20,w20,#16 + sli v3.4s,v24.4s,#8 + add w15,w15,w21 + sli v7.4s,v25.4s,#8 + add w16,w16,w17 + sli v11.4s,v26.4s,#8 + add w13,w13,w19 + sli v15.4s,v27.4s,#8 + add w14,w14,w20 + sli v19.4s,v28.4s,#8 + eor w10,w10,w15 + sli v23.4s,v29.4s,#8 + eor w11,w11,w16 + add v2.4s,v2.4s,v3.4s + eor w12,w12,w13 + add v6.4s,v6.4s,v7.4s + eor w9,w9,w14 + add v10.4s,v10.4s,v11.4s + ror w10,w10,#20 + add v14.4s,v14.4s,v15.4s + ror w11,w11,#20 + add 
v18.4s,v18.4s,v19.4s + ror w12,w12,#20 + add v22.4s,v22.4s,v23.4s + ror w9,w9,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w10 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w11 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w12 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w9 + eor v28.16b,v17.16b,v18.16b + eor w21,w21,w5 + eor v29.16b,v21.16b,v22.16b + eor w17,w17,w6 + ushr v1.4s,v24.4s,#25 + eor w19,w19,w7 + ushr v5.4s,v25.4s,#25 + eor w20,w20,w8 + ushr v9.4s,v26.4s,#25 + ror w21,w21,#24 + ushr v13.4s,v27.4s,#25 + ror w17,w17,#24 + ushr v17.4s,v28.4s,#25 + ror w19,w19,#24 + ushr v21.4s,v29.4s,#25 + ror w20,w20,#24 + sli v1.4s,v24.4s,#7 + add w15,w15,w21 + sli v5.4s,v25.4s,#7 + add w16,w16,w17 + sli v9.4s,v26.4s,#7 + add w13,w13,w19 + sli v13.4s,v27.4s,#7 + add w14,w14,w20 + sli v17.4s,v28.4s,#7 + eor w10,w10,w15 + sli v21.4s,v29.4s,#7 + eor w11,w11,w16 + ext v2.16b,v2.16b,v2.16b,#8 + eor w12,w12,w13 + ext v6.16b,v6.16b,v6.16b,#8 + eor w9,w9,w14 + ext v10.16b,v10.16b,v10.16b,#8 + ror w10,w10,#25 + ext v14.16b,v14.16b,v14.16b,#8 + ror w11,w11,#25 + ext v18.16b,v18.16b,v18.16b,#8 + ror w12,w12,#25 + ext v22.16b,v22.16b,v22.16b,#8 + ror w9,w9,#25 + ext v3.16b,v3.16b,v3.16b,#4 + ext v7.16b,v7.16b,v7.16b,#4 + ext v11.16b,v11.16b,v11.16b,#4 + ext v15.16b,v15.16b,v15.16b,#4 + ext v19.16b,v19.16b,v19.16b,#4 + ext v23.16b,v23.16b,v23.16b,#4 + ext v1.16b,v1.16b,v1.16b,#12 + ext v5.16b,v5.16b,v5.16b,#12 + ext v9.16b,v9.16b,v9.16b,#12 + ext v13.16b,v13.16b,v13.16b,#12 + ext v17.16b,v17.16b,v17.16b,#12 + ext v21.16b,v21.16b,v21.16b,#12 + cbnz x4,Loop_lower_neon + + add w5,w5,w22 // accumulate key block + ldp q24,q25,[sp,#0] + add x6,x6,x22,lsr#32 + ldp q26,q27,[sp,#32] + add w7,w7,w23 + ldp q28,q29,[sp,#64] + add x8,x8,x23,lsr#32 + add v0.4s,v0.4s,v24.4s + add w9,w9,w24 + add v4.4s,v4.4s,v24.4s + add x10,x10,x24,lsr#32 + add v8.4s,v8.4s,v24.4s + add w11,w11,w25 + add v12.4s,v12.4s,v24.4s + add x12,x12,x25,lsr#32 + add v16.4s,v16.4s,v24.4s + add w13,w13,w26 + add v20.4s,v20.4s,v24.4s + add x14,x14,x26,lsr#32 + add v2.4s,v2.4s,v26.4s + add w15,w15,w27 + add v6.4s,v6.4s,v26.4s + add x16,x16,x27,lsr#32 + add v10.4s,v10.4s,v26.4s + add w17,w17,w28 + add v14.4s,v14.4s,v26.4s + add x19,x19,x28,lsr#32 + add v18.4s,v18.4s,v26.4s + add w20,w20,w30 + add v22.4s,v22.4s,v26.4s + add x21,x21,x30,lsr#32 + add v19.4s,v19.4s,v31.4s // +4 + add x5,x5,x6,lsl#32 // pack + add v23.4s,v23.4s,v31.4s // +4 + add x7,x7,x8,lsl#32 + add v3.4s,v3.4s,v27.4s + ldp x6,x8,[x1,#0] // load input + add v7.4s,v7.4s,v28.4s + add x9,x9,x10,lsl#32 + add v11.4s,v11.4s,v29.4s + add x11,x11,x12,lsl#32 + add v15.4s,v15.4s,v30.4s + ldp x10,x12,[x1,#16] + add v19.4s,v19.4s,v27.4s + add x13,x13,x14,lsl#32 + add v23.4s,v23.4s,v28.4s + add x15,x15,x16,lsl#32 + add v1.4s,v1.4s,v25.4s + ldp x14,x16,[x1,#32] + add v5.4s,v5.4s,v25.4s + add x17,x17,x19,lsl#32 + add v9.4s,v9.4s,v25.4s + add x20,x20,x21,lsl#32 + add v13.4s,v13.4s,v25.4s + ldp x19,x21,[x1,#48] + add v17.4s,v17.4s,v25.4s + add x1,x1,#64 + add v21.4s,v21.4s,v25.4s + +#ifdef __AARCH64EB__ + rev x5,x5 + rev x7,x7 + rev x9,x9 + rev x11,x11 + rev x13,x13 + rev x15,x15 + rev x17,x17 + rev x20,x20 +#endif + ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 + eor x5,x5,x6 + eor x7,x7,x8 + eor x9,x9,x10 + eor x11,x11,x12 + eor x13,x13,x14 + eor v0.16b,v0.16b,v24.16b + eor x15,x15,x16 + eor v1.16b,v1.16b,v25.16b + eor x17,x17,x19 + eor v2.16b,v2.16b,v26.16b + eor x20,x20,x21 + eor v3.16b,v3.16b,v27.16b + ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 + + stp x5,x7,[x0,#0] // store output + add x28,x28,#7 // increment 
counter + stp x9,x11,[x0,#16] + stp x13,x15,[x0,#32] + stp x17,x20,[x0,#48] + add x0,x0,#64 + st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 + + ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 + eor v4.16b,v4.16b,v24.16b + eor v5.16b,v5.16b,v25.16b + eor v6.16b,v6.16b,v26.16b + eor v7.16b,v7.16b,v27.16b + st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 + + ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 + eor v8.16b,v8.16b,v0.16b + ldp q24,q25,[sp,#0] + eor v9.16b,v9.16b,v1.16b + ldp q26,q27,[sp,#32] + eor v10.16b,v10.16b,v2.16b + eor v11.16b,v11.16b,v3.16b + st1 {v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64 + + ld1 {v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64 + eor v12.16b,v12.16b,v4.16b + eor v13.16b,v13.16b,v5.16b + eor v14.16b,v14.16b,v6.16b + eor v15.16b,v15.16b,v7.16b + st1 {v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64 + + ld1 {v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64 + eor v16.16b,v16.16b,v8.16b + eor v17.16b,v17.16b,v9.16b + eor v18.16b,v18.16b,v10.16b + eor v19.16b,v19.16b,v11.16b + st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64 + + shl v0.4s,v31.4s,#1 // 4 -> 8 + eor v20.16b,v20.16b,v12.16b + eor v21.16b,v21.16b,v13.16b + eor v22.16b,v22.16b,v14.16b + eor v23.16b,v23.16b,v15.16b + st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64 + + add v27.4s,v27.4s,v0.4s // += 8 + add v28.4s,v28.4s,v0.4s + add v29.4s,v29.4s,v0.4s + add v30.4s,v30.4s,v0.4s + + b.hs Loop_outer_512_neon + + adds x2,x2,#512 + ushr v0.4s,v31.4s,#2 // 4 -> 1 + + ldp d8,d9,[sp,#128+0] // meet ABI requirements + ldp d10,d11,[sp,#128+16] + ldp d12,d13,[sp,#128+32] + ldp d14,d15,[sp,#128+48] + + stp q24,q31,[sp,#0] // wipe off-load area + stp q24,q31,[sp,#32] + stp q24,q31,[sp,#64] + + b.eq Ldone_512_neon + + cmp x2,#192 + sub v27.4s,v27.4s,v0.4s // -= 1 + sub v28.4s,v28.4s,v0.4s + sub v29.4s,v29.4s,v0.4s + add sp,sp,#128 + b.hs Loop_outer_neon + + eor v25.16b,v25.16b,v25.16b + eor v26.16b,v26.16b,v26.16b + eor v27.16b,v27.16b,v27.16b + eor v28.16b,v28.16b,v28.16b + eor v29.16b,v29.16b,v29.16b + eor v30.16b,v30.16b,v30.16b + b Loop_outer + +Ldone_512_neon: + ldp x19,x20,[x29,#16] + add sp,sp,#128+64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__) diff --git a/ring-0.17.14/pregenerated/chacha-armv8-linux64.S b/ring-0.17.14/pregenerated/chacha-armv8-linux64.S new file mode 100644 index 0000000000..00108f0286 --- /dev/null +++ b/ring-0.17.14/pregenerated/chacha-armv8-linux64.S @@ -0,0 +1,1966 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__) +.section .rodata + +.align 5 +.Lsigma: +.quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral +.Lone: +.long 1,0,0,0 +.byte 67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 + +.text + +.globl ChaCha20_ctr32_nohw +.hidden ChaCha20_ctr32_nohw +.type ChaCha20_ctr32_nohw,%function +.align 5 +ChaCha20_ctr32_nohw: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-96]! 
+ add x29,sp,#0 + + adrp x5,.Lsigma + add x5,x5,:lo12:.Lsigma + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#64 + + ldp x22,x23,[x5] // load sigma + ldp x24,x25,[x3] // load key + ldp x26,x27,[x3,#16] + ldp x28,x30,[x4] // load counter +#ifdef __AARCH64EB__ + ror x24,x24,#32 + ror x25,x25,#32 + ror x26,x26,#32 + ror x27,x27,#32 + ror x28,x28,#32 + ror x30,x30,#32 +#endif + +.Loop_outer: + mov w5,w22 // unpack key block + lsr x6,x22,#32 + mov w7,w23 + lsr x8,x23,#32 + mov w9,w24 + lsr x10,x24,#32 + mov w11,w25 + lsr x12,x25,#32 + mov w13,w26 + lsr x14,x26,#32 + mov w15,w27 + lsr x16,x27,#32 + mov w17,w28 + lsr x19,x28,#32 + mov w20,w30 + lsr x21,x30,#32 + + mov x4,#10 + subs x2,x2,#64 +.Loop: + sub x4,x4,#1 + add w5,w5,w9 + add w6,w6,w10 + add w7,w7,w11 + add w8,w8,w12 + eor w17,w17,w5 + eor w19,w19,w6 + eor w20,w20,w7 + eor w21,w21,w8 + ror w17,w17,#16 + ror w19,w19,#16 + ror w20,w20,#16 + ror w21,w21,#16 + add w13,w13,w17 + add w14,w14,w19 + add w15,w15,w20 + add w16,w16,w21 + eor w9,w9,w13 + eor w10,w10,w14 + eor w11,w11,w15 + eor w12,w12,w16 + ror w9,w9,#20 + ror w10,w10,#20 + ror w11,w11,#20 + ror w12,w12,#20 + add w5,w5,w9 + add w6,w6,w10 + add w7,w7,w11 + add w8,w8,w12 + eor w17,w17,w5 + eor w19,w19,w6 + eor w20,w20,w7 + eor w21,w21,w8 + ror w17,w17,#24 + ror w19,w19,#24 + ror w20,w20,#24 + ror w21,w21,#24 + add w13,w13,w17 + add w14,w14,w19 + add w15,w15,w20 + add w16,w16,w21 + eor w9,w9,w13 + eor w10,w10,w14 + eor w11,w11,w15 + eor w12,w12,w16 + ror w9,w9,#25 + ror w10,w10,#25 + ror w11,w11,#25 + ror w12,w12,#25 + add w5,w5,w10 + add w6,w6,w11 + add w7,w7,w12 + add w8,w8,w9 + eor w21,w21,w5 + eor w17,w17,w6 + eor w19,w19,w7 + eor w20,w20,w8 + ror w21,w21,#16 + ror w17,w17,#16 + ror w19,w19,#16 + ror w20,w20,#16 + add w15,w15,w21 + add w16,w16,w17 + add w13,w13,w19 + add w14,w14,w20 + eor w10,w10,w15 + eor w11,w11,w16 + eor w12,w12,w13 + eor w9,w9,w14 + ror w10,w10,#20 + ror w11,w11,#20 + ror w12,w12,#20 + ror w9,w9,#20 + add w5,w5,w10 + add w6,w6,w11 + add w7,w7,w12 + add w8,w8,w9 + eor w21,w21,w5 + eor w17,w17,w6 + eor w19,w19,w7 + eor w20,w20,w8 + ror w21,w21,#24 + ror w17,w17,#24 + ror w19,w19,#24 + ror w20,w20,#24 + add w15,w15,w21 + add w16,w16,w17 + add w13,w13,w19 + add w14,w14,w20 + eor w10,w10,w15 + eor w11,w11,w16 + eor w12,w12,w13 + eor w9,w9,w14 + ror w10,w10,#25 + ror w11,w11,#25 + ror w12,w12,#25 + ror w9,w9,#25 + cbnz x4,.Loop + + add w5,w5,w22 // accumulate key block + add x6,x6,x22,lsr#32 + add w7,w7,w23 + add x8,x8,x23,lsr#32 + add w9,w9,w24 + add x10,x10,x24,lsr#32 + add w11,w11,w25 + add x12,x12,x25,lsr#32 + add w13,w13,w26 + add x14,x14,x26,lsr#32 + add w15,w15,w27 + add x16,x16,x27,lsr#32 + add w17,w17,w28 + add x19,x19,x28,lsr#32 + add w20,w20,w30 + add x21,x21,x30,lsr#32 + + b.lo .Ltail + + add x5,x5,x6,lsl#32 // pack + add x7,x7,x8,lsl#32 + ldp x6,x8,[x1,#0] // load input + add x9,x9,x10,lsl#32 + add x11,x11,x12,lsl#32 + ldp x10,x12,[x1,#16] + add x13,x13,x14,lsl#32 + add x15,x15,x16,lsl#32 + ldp x14,x16,[x1,#32] + add x17,x17,x19,lsl#32 + add x20,x20,x21,lsl#32 + ldp x19,x21,[x1,#48] + add x1,x1,#64 +#ifdef __AARCH64EB__ + rev x5,x5 + rev x7,x7 + rev x9,x9 + rev x11,x11 + rev x13,x13 + rev x15,x15 + rev x17,x17 + rev x20,x20 +#endif + eor x5,x5,x6 + eor x7,x7,x8 + eor x9,x9,x10 + eor x11,x11,x12 + eor x13,x13,x14 + eor x15,x15,x16 + eor x17,x17,x19 + eor x20,x20,x21 + + stp x5,x7,[x0,#0] // store output + add x28,x28,#1 // increment counter + stp x9,x11,[x0,#16] + stp 
x13,x15,[x0,#32] + stp x17,x20,[x0,#48] + add x0,x0,#64 + + b.hi .Loop_outer + + ldp x19,x20,[x29,#16] + add sp,sp,#64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +.align 4 +.Ltail: + add x2,x2,#64 +.Less_than_64: + sub x0,x0,#1 + add x1,x1,x2 + add x0,x0,x2 + add x4,sp,x2 + neg x2,x2 + + add x5,x5,x6,lsl#32 // pack + add x7,x7,x8,lsl#32 + add x9,x9,x10,lsl#32 + add x11,x11,x12,lsl#32 + add x13,x13,x14,lsl#32 + add x15,x15,x16,lsl#32 + add x17,x17,x19,lsl#32 + add x20,x20,x21,lsl#32 +#ifdef __AARCH64EB__ + rev x5,x5 + rev x7,x7 + rev x9,x9 + rev x11,x11 + rev x13,x13 + rev x15,x15 + rev x17,x17 + rev x20,x20 +#endif + stp x5,x7,[sp,#0] + stp x9,x11,[sp,#16] + stp x13,x15,[sp,#32] + stp x17,x20,[sp,#48] + +.Loop_tail: + ldrb w10,[x1,x2] + ldrb w11,[x4,x2] + add x2,x2,#1 + eor w10,w10,w11 + strb w10,[x0,x2] + cbnz x2,.Loop_tail + + stp xzr,xzr,[sp,#0] + stp xzr,xzr,[sp,#16] + stp xzr,xzr,[sp,#32] + stp xzr,xzr,[sp,#48] + + ldp x19,x20,[x29,#16] + add sp,sp,#64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size ChaCha20_ctr32_nohw,.-ChaCha20_ctr32_nohw + +.globl ChaCha20_ctr32_neon +.hidden ChaCha20_ctr32_neon +.type ChaCha20_ctr32_neon,%function +.align 5 +ChaCha20_ctr32_neon: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-96]! + add x29,sp,#0 + + adrp x5,.Lsigma + add x5,x5,:lo12:.Lsigma + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + cmp x2,#512 + b.hs .L512_or_more_neon + + sub sp,sp,#64 + + ldp x22,x23,[x5] // load sigma + ld1 {v24.4s},[x5],#16 + ldp x24,x25,[x3] // load key + ldp x26,x27,[x3,#16] + ld1 {v25.4s,v26.4s},[x3] + ldp x28,x30,[x4] // load counter + ld1 {v27.4s},[x4] + ld1 {v31.4s},[x5] +#ifdef __AARCH64EB__ + rev64 v24.4s,v24.4s + ror x24,x24,#32 + ror x25,x25,#32 + ror x26,x26,#32 + ror x27,x27,#32 + ror x28,x28,#32 + ror x30,x30,#32 +#endif + add v27.4s,v27.4s,v31.4s // += 1 + add v28.4s,v27.4s,v31.4s + add v29.4s,v28.4s,v31.4s + shl v31.4s,v31.4s,#2 // 1 -> 4 + +.Loop_outer_neon: + mov w5,w22 // unpack key block + lsr x6,x22,#32 + mov v0.16b,v24.16b + mov w7,w23 + lsr x8,x23,#32 + mov v4.16b,v24.16b + mov w9,w24 + lsr x10,x24,#32 + mov v16.16b,v24.16b + mov w11,w25 + mov v1.16b,v25.16b + lsr x12,x25,#32 + mov v5.16b,v25.16b + mov w13,w26 + mov v17.16b,v25.16b + lsr x14,x26,#32 + mov v3.16b,v27.16b + mov w15,w27 + mov v7.16b,v28.16b + lsr x16,x27,#32 + mov v19.16b,v29.16b + mov w17,w28 + mov v2.16b,v26.16b + lsr x19,x28,#32 + mov v6.16b,v26.16b + mov w20,w30 + mov v18.16b,v26.16b + lsr x21,x30,#32 + + mov x4,#10 + subs x2,x2,#256 +.Loop_neon: + sub x4,x4,#1 + add v0.4s,v0.4s,v1.4s + add w5,w5,w9 + add v4.4s,v4.4s,v5.4s + add w6,w6,w10 + add v16.4s,v16.4s,v17.4s + add w7,w7,w11 + eor v3.16b,v3.16b,v0.16b + add w8,w8,w12 + eor v7.16b,v7.16b,v4.16b + eor w17,w17,w5 + eor v19.16b,v19.16b,v16.16b + eor w19,w19,w6 + rev32 v3.8h,v3.8h + eor w20,w20,w7 + rev32 v7.8h,v7.8h + eor w21,w21,w8 + rev32 v19.8h,v19.8h + ror w17,w17,#16 + add v2.4s,v2.4s,v3.4s + ror w19,w19,#16 + add v6.4s,v6.4s,v7.4s + ror w20,w20,#16 + add v18.4s,v18.4s,v19.4s + ror w21,w21,#16 + eor v20.16b,v1.16b,v2.16b + add w13,w13,w17 + eor v21.16b,v5.16b,v6.16b + add w14,w14,w19 + eor v22.16b,v17.16b,v18.16b + add w15,w15,w20 + ushr v1.4s,v20.4s,#20 + add w16,w16,w21 + ushr v5.4s,v21.4s,#20 + eor w9,w9,w13 + ushr 
v17.4s,v22.4s,#20 + eor w10,w10,w14 + sli v1.4s,v20.4s,#12 + eor w11,w11,w15 + sli v5.4s,v21.4s,#12 + eor w12,w12,w16 + sli v17.4s,v22.4s,#12 + ror w9,w9,#20 + add v0.4s,v0.4s,v1.4s + ror w10,w10,#20 + add v4.4s,v4.4s,v5.4s + ror w11,w11,#20 + add v16.4s,v16.4s,v17.4s + ror w12,w12,#20 + eor v20.16b,v3.16b,v0.16b + add w5,w5,w9 + eor v21.16b,v7.16b,v4.16b + add w6,w6,w10 + eor v22.16b,v19.16b,v16.16b + add w7,w7,w11 + ushr v3.4s,v20.4s,#24 + add w8,w8,w12 + ushr v7.4s,v21.4s,#24 + eor w17,w17,w5 + ushr v19.4s,v22.4s,#24 + eor w19,w19,w6 + sli v3.4s,v20.4s,#8 + eor w20,w20,w7 + sli v7.4s,v21.4s,#8 + eor w21,w21,w8 + sli v19.4s,v22.4s,#8 + ror w17,w17,#24 + add v2.4s,v2.4s,v3.4s + ror w19,w19,#24 + add v6.4s,v6.4s,v7.4s + ror w20,w20,#24 + add v18.4s,v18.4s,v19.4s + ror w21,w21,#24 + eor v20.16b,v1.16b,v2.16b + add w13,w13,w17 + eor v21.16b,v5.16b,v6.16b + add w14,w14,w19 + eor v22.16b,v17.16b,v18.16b + add w15,w15,w20 + ushr v1.4s,v20.4s,#25 + add w16,w16,w21 + ushr v5.4s,v21.4s,#25 + eor w9,w9,w13 + ushr v17.4s,v22.4s,#25 + eor w10,w10,w14 + sli v1.4s,v20.4s,#7 + eor w11,w11,w15 + sli v5.4s,v21.4s,#7 + eor w12,w12,w16 + sli v17.4s,v22.4s,#7 + ror w9,w9,#25 + ext v2.16b,v2.16b,v2.16b,#8 + ror w10,w10,#25 + ext v6.16b,v6.16b,v6.16b,#8 + ror w11,w11,#25 + ext v18.16b,v18.16b,v18.16b,#8 + ror w12,w12,#25 + ext v3.16b,v3.16b,v3.16b,#12 + ext v7.16b,v7.16b,v7.16b,#12 + ext v19.16b,v19.16b,v19.16b,#12 + ext v1.16b,v1.16b,v1.16b,#4 + ext v5.16b,v5.16b,v5.16b,#4 + ext v17.16b,v17.16b,v17.16b,#4 + add v0.4s,v0.4s,v1.4s + add w5,w5,w10 + add v4.4s,v4.4s,v5.4s + add w6,w6,w11 + add v16.4s,v16.4s,v17.4s + add w7,w7,w12 + eor v3.16b,v3.16b,v0.16b + add w8,w8,w9 + eor v7.16b,v7.16b,v4.16b + eor w21,w21,w5 + eor v19.16b,v19.16b,v16.16b + eor w17,w17,w6 + rev32 v3.8h,v3.8h + eor w19,w19,w7 + rev32 v7.8h,v7.8h + eor w20,w20,w8 + rev32 v19.8h,v19.8h + ror w21,w21,#16 + add v2.4s,v2.4s,v3.4s + ror w17,w17,#16 + add v6.4s,v6.4s,v7.4s + ror w19,w19,#16 + add v18.4s,v18.4s,v19.4s + ror w20,w20,#16 + eor v20.16b,v1.16b,v2.16b + add w15,w15,w21 + eor v21.16b,v5.16b,v6.16b + add w16,w16,w17 + eor v22.16b,v17.16b,v18.16b + add w13,w13,w19 + ushr v1.4s,v20.4s,#20 + add w14,w14,w20 + ushr v5.4s,v21.4s,#20 + eor w10,w10,w15 + ushr v17.4s,v22.4s,#20 + eor w11,w11,w16 + sli v1.4s,v20.4s,#12 + eor w12,w12,w13 + sli v5.4s,v21.4s,#12 + eor w9,w9,w14 + sli v17.4s,v22.4s,#12 + ror w10,w10,#20 + add v0.4s,v0.4s,v1.4s + ror w11,w11,#20 + add v4.4s,v4.4s,v5.4s + ror w12,w12,#20 + add v16.4s,v16.4s,v17.4s + ror w9,w9,#20 + eor v20.16b,v3.16b,v0.16b + add w5,w5,w10 + eor v21.16b,v7.16b,v4.16b + add w6,w6,w11 + eor v22.16b,v19.16b,v16.16b + add w7,w7,w12 + ushr v3.4s,v20.4s,#24 + add w8,w8,w9 + ushr v7.4s,v21.4s,#24 + eor w21,w21,w5 + ushr v19.4s,v22.4s,#24 + eor w17,w17,w6 + sli v3.4s,v20.4s,#8 + eor w19,w19,w7 + sli v7.4s,v21.4s,#8 + eor w20,w20,w8 + sli v19.4s,v22.4s,#8 + ror w21,w21,#24 + add v2.4s,v2.4s,v3.4s + ror w17,w17,#24 + add v6.4s,v6.4s,v7.4s + ror w19,w19,#24 + add v18.4s,v18.4s,v19.4s + ror w20,w20,#24 + eor v20.16b,v1.16b,v2.16b + add w15,w15,w21 + eor v21.16b,v5.16b,v6.16b + add w16,w16,w17 + eor v22.16b,v17.16b,v18.16b + add w13,w13,w19 + ushr v1.4s,v20.4s,#25 + add w14,w14,w20 + ushr v5.4s,v21.4s,#25 + eor w10,w10,w15 + ushr v17.4s,v22.4s,#25 + eor w11,w11,w16 + sli v1.4s,v20.4s,#7 + eor w12,w12,w13 + sli v5.4s,v21.4s,#7 + eor w9,w9,w14 + sli v17.4s,v22.4s,#7 + ror w10,w10,#25 + ext v2.16b,v2.16b,v2.16b,#8 + ror w11,w11,#25 + ext v6.16b,v6.16b,v6.16b,#8 + ror w12,w12,#25 + ext v18.16b,v18.16b,v18.16b,#8 + ror 
w9,w9,#25 + ext v3.16b,v3.16b,v3.16b,#4 + ext v7.16b,v7.16b,v7.16b,#4 + ext v19.16b,v19.16b,v19.16b,#4 + ext v1.16b,v1.16b,v1.16b,#12 + ext v5.16b,v5.16b,v5.16b,#12 + ext v17.16b,v17.16b,v17.16b,#12 + cbnz x4,.Loop_neon + + add w5,w5,w22 // accumulate key block + add v0.4s,v0.4s,v24.4s + add x6,x6,x22,lsr#32 + add v4.4s,v4.4s,v24.4s + add w7,w7,w23 + add v16.4s,v16.4s,v24.4s + add x8,x8,x23,lsr#32 + add v2.4s,v2.4s,v26.4s + add w9,w9,w24 + add v6.4s,v6.4s,v26.4s + add x10,x10,x24,lsr#32 + add v18.4s,v18.4s,v26.4s + add w11,w11,w25 + add v3.4s,v3.4s,v27.4s + add x12,x12,x25,lsr#32 + add w13,w13,w26 + add v7.4s,v7.4s,v28.4s + add x14,x14,x26,lsr#32 + add w15,w15,w27 + add v19.4s,v19.4s,v29.4s + add x16,x16,x27,lsr#32 + add w17,w17,w28 + add v1.4s,v1.4s,v25.4s + add x19,x19,x28,lsr#32 + add w20,w20,w30 + add v5.4s,v5.4s,v25.4s + add x21,x21,x30,lsr#32 + add v17.4s,v17.4s,v25.4s + + b.lo .Ltail_neon + + add x5,x5,x6,lsl#32 // pack + add x7,x7,x8,lsl#32 + ldp x6,x8,[x1,#0] // load input + add x9,x9,x10,lsl#32 + add x11,x11,x12,lsl#32 + ldp x10,x12,[x1,#16] + add x13,x13,x14,lsl#32 + add x15,x15,x16,lsl#32 + ldp x14,x16,[x1,#32] + add x17,x17,x19,lsl#32 + add x20,x20,x21,lsl#32 + ldp x19,x21,[x1,#48] + add x1,x1,#64 +#ifdef __AARCH64EB__ + rev x5,x5 + rev x7,x7 + rev x9,x9 + rev x11,x11 + rev x13,x13 + rev x15,x15 + rev x17,x17 + rev x20,x20 +#endif + ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 + eor x5,x5,x6 + eor x7,x7,x8 + eor x9,x9,x10 + eor x11,x11,x12 + eor x13,x13,x14 + eor v0.16b,v0.16b,v20.16b + eor x15,x15,x16 + eor v1.16b,v1.16b,v21.16b + eor x17,x17,x19 + eor v2.16b,v2.16b,v22.16b + eor x20,x20,x21 + eor v3.16b,v3.16b,v23.16b + ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 + + stp x5,x7,[x0,#0] // store output + add x28,x28,#4 // increment counter + stp x9,x11,[x0,#16] + add v27.4s,v27.4s,v31.4s // += 4 + stp x13,x15,[x0,#32] + add v28.4s,v28.4s,v31.4s + stp x17,x20,[x0,#48] + add v29.4s,v29.4s,v31.4s + add x0,x0,#64 + + st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 + ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 + + eor v4.16b,v4.16b,v20.16b + eor v5.16b,v5.16b,v21.16b + eor v6.16b,v6.16b,v22.16b + eor v7.16b,v7.16b,v23.16b + st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 + + eor v16.16b,v16.16b,v0.16b + eor v17.16b,v17.16b,v1.16b + eor v18.16b,v18.16b,v2.16b + eor v19.16b,v19.16b,v3.16b + st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64 + + b.hi .Loop_outer_neon + + ldp x19,x20,[x29,#16] + add sp,sp,#64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +.Ltail_neon: + add x2,x2,#256 + cmp x2,#64 + b.lo .Less_than_64 + + add x5,x5,x6,lsl#32 // pack + add x7,x7,x8,lsl#32 + ldp x6,x8,[x1,#0] // load input + add x9,x9,x10,lsl#32 + add x11,x11,x12,lsl#32 + ldp x10,x12,[x1,#16] + add x13,x13,x14,lsl#32 + add x15,x15,x16,lsl#32 + ldp x14,x16,[x1,#32] + add x17,x17,x19,lsl#32 + add x20,x20,x21,lsl#32 + ldp x19,x21,[x1,#48] + add x1,x1,#64 +#ifdef __AARCH64EB__ + rev x5,x5 + rev x7,x7 + rev x9,x9 + rev x11,x11 + rev x13,x13 + rev x15,x15 + rev x17,x17 + rev x20,x20 +#endif + eor x5,x5,x6 + eor x7,x7,x8 + eor x9,x9,x10 + eor x11,x11,x12 + eor x13,x13,x14 + eor x15,x15,x16 + eor x17,x17,x19 + eor x20,x20,x21 + + stp x5,x7,[x0,#0] // store output + add x28,x28,#4 // increment counter + stp x9,x11,[x0,#16] + stp x13,x15,[x0,#32] + stp x17,x20,[x0,#48] + add x0,x0,#64 + b.eq .Ldone_neon + sub x2,x2,#64 + cmp x2,#64 + b.lo .Less_than_128 + + ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 + eor 
v0.16b,v0.16b,v20.16b + eor v1.16b,v1.16b,v21.16b + eor v2.16b,v2.16b,v22.16b + eor v3.16b,v3.16b,v23.16b + st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 + b.eq .Ldone_neon + sub x2,x2,#64 + cmp x2,#64 + b.lo .Less_than_192 + + ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 + eor v4.16b,v4.16b,v20.16b + eor v5.16b,v5.16b,v21.16b + eor v6.16b,v6.16b,v22.16b + eor v7.16b,v7.16b,v23.16b + st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 + b.eq .Ldone_neon + sub x2,x2,#64 + + st1 {v16.16b,v17.16b,v18.16b,v19.16b},[sp] + b .Last_neon + +.Less_than_128: + st1 {v0.16b,v1.16b,v2.16b,v3.16b},[sp] + b .Last_neon +.Less_than_192: + st1 {v4.16b,v5.16b,v6.16b,v7.16b},[sp] + b .Last_neon + +.align 4 +.Last_neon: + sub x0,x0,#1 + add x1,x1,x2 + add x0,x0,x2 + add x4,sp,x2 + neg x2,x2 + +.Loop_tail_neon: + ldrb w10,[x1,x2] + ldrb w11,[x4,x2] + add x2,x2,#1 + eor w10,w10,w11 + strb w10,[x0,x2] + cbnz x2,.Loop_tail_neon + + stp xzr,xzr,[sp,#0] + stp xzr,xzr,[sp,#16] + stp xzr,xzr,[sp,#32] + stp xzr,xzr,[sp,#48] + +.Ldone_neon: + ldp x19,x20,[x29,#16] + add sp,sp,#64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size ChaCha20_ctr32_neon,.-ChaCha20_ctr32_neon +.type ChaCha20_512_neon,%function +.align 5 +ChaCha20_512_neon: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-96]! + add x29,sp,#0 + + adrp x5,.Lsigma + add x5,x5,:lo12:.Lsigma + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + +.L512_or_more_neon: + sub sp,sp,#128+64 + + ldp x22,x23,[x5] // load sigma + ld1 {v24.4s},[x5],#16 + ldp x24,x25,[x3] // load key + ldp x26,x27,[x3,#16] + ld1 {v25.4s,v26.4s},[x3] + ldp x28,x30,[x4] // load counter + ld1 {v27.4s},[x4] + ld1 {v31.4s},[x5] +#ifdef __AARCH64EB__ + rev64 v24.4s,v24.4s + ror x24,x24,#32 + ror x25,x25,#32 + ror x26,x26,#32 + ror x27,x27,#32 + ror x28,x28,#32 + ror x30,x30,#32 +#endif + add v27.4s,v27.4s,v31.4s // += 1 + stp q24,q25,[sp,#0] // off-load key block, invariant part + add v27.4s,v27.4s,v31.4s // not typo + str q26,[sp,#32] + add v28.4s,v27.4s,v31.4s + add v29.4s,v28.4s,v31.4s + add v30.4s,v29.4s,v31.4s + shl v31.4s,v31.4s,#2 // 1 -> 4 + + stp d8,d9,[sp,#128+0] // meet ABI requirements + stp d10,d11,[sp,#128+16] + stp d12,d13,[sp,#128+32] + stp d14,d15,[sp,#128+48] + + sub x2,x2,#512 // not typo + +.Loop_outer_512_neon: + mov v0.16b,v24.16b + mov v4.16b,v24.16b + mov v8.16b,v24.16b + mov v12.16b,v24.16b + mov v16.16b,v24.16b + mov v20.16b,v24.16b + mov v1.16b,v25.16b + mov w5,w22 // unpack key block + mov v5.16b,v25.16b + lsr x6,x22,#32 + mov v9.16b,v25.16b + mov w7,w23 + mov v13.16b,v25.16b + lsr x8,x23,#32 + mov v17.16b,v25.16b + mov w9,w24 + mov v21.16b,v25.16b + lsr x10,x24,#32 + mov v3.16b,v27.16b + mov w11,w25 + mov v7.16b,v28.16b + lsr x12,x25,#32 + mov v11.16b,v29.16b + mov w13,w26 + mov v15.16b,v30.16b + lsr x14,x26,#32 + mov v2.16b,v26.16b + mov w15,w27 + mov v6.16b,v26.16b + lsr x16,x27,#32 + add v19.4s,v3.4s,v31.4s // +4 + mov w17,w28 + add v23.4s,v7.4s,v31.4s // +4 + lsr x19,x28,#32 + mov v10.16b,v26.16b + mov w20,w30 + mov v14.16b,v26.16b + lsr x21,x30,#32 + mov v18.16b,v26.16b + stp q27,q28,[sp,#48] // off-load key block, variable part + mov v22.16b,v26.16b + str q29,[sp,#80] + + mov x4,#5 + subs x2,x2,#512 +.Loop_upper_neon: + sub x4,x4,#1 + add v0.4s,v0.4s,v1.4s + add w5,w5,w9 + add v4.4s,v4.4s,v5.4s + add w6,w6,w10 + add v8.4s,v8.4s,v9.4s + add w7,w7,w11 + add v12.4s,v12.4s,v13.4s + add w8,w8,w12 + add 
v16.4s,v16.4s,v17.4s + eor w17,w17,w5 + add v20.4s,v20.4s,v21.4s + eor w19,w19,w6 + eor v3.16b,v3.16b,v0.16b + eor w20,w20,w7 + eor v7.16b,v7.16b,v4.16b + eor w21,w21,w8 + eor v11.16b,v11.16b,v8.16b + ror w17,w17,#16 + eor v15.16b,v15.16b,v12.16b + ror w19,w19,#16 + eor v19.16b,v19.16b,v16.16b + ror w20,w20,#16 + eor v23.16b,v23.16b,v20.16b + ror w21,w21,#16 + rev32 v3.8h,v3.8h + add w13,w13,w17 + rev32 v7.8h,v7.8h + add w14,w14,w19 + rev32 v11.8h,v11.8h + add w15,w15,w20 + rev32 v15.8h,v15.8h + add w16,w16,w21 + rev32 v19.8h,v19.8h + eor w9,w9,w13 + rev32 v23.8h,v23.8h + eor w10,w10,w14 + add v2.4s,v2.4s,v3.4s + eor w11,w11,w15 + add v6.4s,v6.4s,v7.4s + eor w12,w12,w16 + add v10.4s,v10.4s,v11.4s + ror w9,w9,#20 + add v14.4s,v14.4s,v15.4s + ror w10,w10,#20 + add v18.4s,v18.4s,v19.4s + ror w11,w11,#20 + add v22.4s,v22.4s,v23.4s + ror w12,w12,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w9 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w10 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w11 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w12 + eor v28.16b,v17.16b,v18.16b + eor w17,w17,w5 + eor v29.16b,v21.16b,v22.16b + eor w19,w19,w6 + ushr v1.4s,v24.4s,#20 + eor w20,w20,w7 + ushr v5.4s,v25.4s,#20 + eor w21,w21,w8 + ushr v9.4s,v26.4s,#20 + ror w17,w17,#24 + ushr v13.4s,v27.4s,#20 + ror w19,w19,#24 + ushr v17.4s,v28.4s,#20 + ror w20,w20,#24 + ushr v21.4s,v29.4s,#20 + ror w21,w21,#24 + sli v1.4s,v24.4s,#12 + add w13,w13,w17 + sli v5.4s,v25.4s,#12 + add w14,w14,w19 + sli v9.4s,v26.4s,#12 + add w15,w15,w20 + sli v13.4s,v27.4s,#12 + add w16,w16,w21 + sli v17.4s,v28.4s,#12 + eor w9,w9,w13 + sli v21.4s,v29.4s,#12 + eor w10,w10,w14 + add v0.4s,v0.4s,v1.4s + eor w11,w11,w15 + add v4.4s,v4.4s,v5.4s + eor w12,w12,w16 + add v8.4s,v8.4s,v9.4s + ror w9,w9,#25 + add v12.4s,v12.4s,v13.4s + ror w10,w10,#25 + add v16.4s,v16.4s,v17.4s + ror w11,w11,#25 + add v20.4s,v20.4s,v21.4s + ror w12,w12,#25 + eor v24.16b,v3.16b,v0.16b + add w5,w5,w10 + eor v25.16b,v7.16b,v4.16b + add w6,w6,w11 + eor v26.16b,v11.16b,v8.16b + add w7,w7,w12 + eor v27.16b,v15.16b,v12.16b + add w8,w8,w9 + eor v28.16b,v19.16b,v16.16b + eor w21,w21,w5 + eor v29.16b,v23.16b,v20.16b + eor w17,w17,w6 + ushr v3.4s,v24.4s,#24 + eor w19,w19,w7 + ushr v7.4s,v25.4s,#24 + eor w20,w20,w8 + ushr v11.4s,v26.4s,#24 + ror w21,w21,#16 + ushr v15.4s,v27.4s,#24 + ror w17,w17,#16 + ushr v19.4s,v28.4s,#24 + ror w19,w19,#16 + ushr v23.4s,v29.4s,#24 + ror w20,w20,#16 + sli v3.4s,v24.4s,#8 + add w15,w15,w21 + sli v7.4s,v25.4s,#8 + add w16,w16,w17 + sli v11.4s,v26.4s,#8 + add w13,w13,w19 + sli v15.4s,v27.4s,#8 + add w14,w14,w20 + sli v19.4s,v28.4s,#8 + eor w10,w10,w15 + sli v23.4s,v29.4s,#8 + eor w11,w11,w16 + add v2.4s,v2.4s,v3.4s + eor w12,w12,w13 + add v6.4s,v6.4s,v7.4s + eor w9,w9,w14 + add v10.4s,v10.4s,v11.4s + ror w10,w10,#20 + add v14.4s,v14.4s,v15.4s + ror w11,w11,#20 + add v18.4s,v18.4s,v19.4s + ror w12,w12,#20 + add v22.4s,v22.4s,v23.4s + ror w9,w9,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w10 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w11 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w12 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w9 + eor v28.16b,v17.16b,v18.16b + eor w21,w21,w5 + eor v29.16b,v21.16b,v22.16b + eor w17,w17,w6 + ushr v1.4s,v24.4s,#25 + eor w19,w19,w7 + ushr v5.4s,v25.4s,#25 + eor w20,w20,w8 + ushr v9.4s,v26.4s,#25 + ror w21,w21,#24 + ushr v13.4s,v27.4s,#25 + ror w17,w17,#24 + ushr v17.4s,v28.4s,#25 + ror w19,w19,#24 + ushr v21.4s,v29.4s,#25 + ror w20,w20,#24 + sli v1.4s,v24.4s,#7 + add w15,w15,w21 + sli v5.4s,v25.4s,#7 + add w16,w16,w17 + sli v9.4s,v26.4s,#7 + add 
w13,w13,w19 + sli v13.4s,v27.4s,#7 + add w14,w14,w20 + sli v17.4s,v28.4s,#7 + eor w10,w10,w15 + sli v21.4s,v29.4s,#7 + eor w11,w11,w16 + ext v2.16b,v2.16b,v2.16b,#8 + eor w12,w12,w13 + ext v6.16b,v6.16b,v6.16b,#8 + eor w9,w9,w14 + ext v10.16b,v10.16b,v10.16b,#8 + ror w10,w10,#25 + ext v14.16b,v14.16b,v14.16b,#8 + ror w11,w11,#25 + ext v18.16b,v18.16b,v18.16b,#8 + ror w12,w12,#25 + ext v22.16b,v22.16b,v22.16b,#8 + ror w9,w9,#25 + ext v3.16b,v3.16b,v3.16b,#12 + ext v7.16b,v7.16b,v7.16b,#12 + ext v11.16b,v11.16b,v11.16b,#12 + ext v15.16b,v15.16b,v15.16b,#12 + ext v19.16b,v19.16b,v19.16b,#12 + ext v23.16b,v23.16b,v23.16b,#12 + ext v1.16b,v1.16b,v1.16b,#4 + ext v5.16b,v5.16b,v5.16b,#4 + ext v9.16b,v9.16b,v9.16b,#4 + ext v13.16b,v13.16b,v13.16b,#4 + ext v17.16b,v17.16b,v17.16b,#4 + ext v21.16b,v21.16b,v21.16b,#4 + add v0.4s,v0.4s,v1.4s + add w5,w5,w9 + add v4.4s,v4.4s,v5.4s + add w6,w6,w10 + add v8.4s,v8.4s,v9.4s + add w7,w7,w11 + add v12.4s,v12.4s,v13.4s + add w8,w8,w12 + add v16.4s,v16.4s,v17.4s + eor w17,w17,w5 + add v20.4s,v20.4s,v21.4s + eor w19,w19,w6 + eor v3.16b,v3.16b,v0.16b + eor w20,w20,w7 + eor v7.16b,v7.16b,v4.16b + eor w21,w21,w8 + eor v11.16b,v11.16b,v8.16b + ror w17,w17,#16 + eor v15.16b,v15.16b,v12.16b + ror w19,w19,#16 + eor v19.16b,v19.16b,v16.16b + ror w20,w20,#16 + eor v23.16b,v23.16b,v20.16b + ror w21,w21,#16 + rev32 v3.8h,v3.8h + add w13,w13,w17 + rev32 v7.8h,v7.8h + add w14,w14,w19 + rev32 v11.8h,v11.8h + add w15,w15,w20 + rev32 v15.8h,v15.8h + add w16,w16,w21 + rev32 v19.8h,v19.8h + eor w9,w9,w13 + rev32 v23.8h,v23.8h + eor w10,w10,w14 + add v2.4s,v2.4s,v3.4s + eor w11,w11,w15 + add v6.4s,v6.4s,v7.4s + eor w12,w12,w16 + add v10.4s,v10.4s,v11.4s + ror w9,w9,#20 + add v14.4s,v14.4s,v15.4s + ror w10,w10,#20 + add v18.4s,v18.4s,v19.4s + ror w11,w11,#20 + add v22.4s,v22.4s,v23.4s + ror w12,w12,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w9 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w10 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w11 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w12 + eor v28.16b,v17.16b,v18.16b + eor w17,w17,w5 + eor v29.16b,v21.16b,v22.16b + eor w19,w19,w6 + ushr v1.4s,v24.4s,#20 + eor w20,w20,w7 + ushr v5.4s,v25.4s,#20 + eor w21,w21,w8 + ushr v9.4s,v26.4s,#20 + ror w17,w17,#24 + ushr v13.4s,v27.4s,#20 + ror w19,w19,#24 + ushr v17.4s,v28.4s,#20 + ror w20,w20,#24 + ushr v21.4s,v29.4s,#20 + ror w21,w21,#24 + sli v1.4s,v24.4s,#12 + add w13,w13,w17 + sli v5.4s,v25.4s,#12 + add w14,w14,w19 + sli v9.4s,v26.4s,#12 + add w15,w15,w20 + sli v13.4s,v27.4s,#12 + add w16,w16,w21 + sli v17.4s,v28.4s,#12 + eor w9,w9,w13 + sli v21.4s,v29.4s,#12 + eor w10,w10,w14 + add v0.4s,v0.4s,v1.4s + eor w11,w11,w15 + add v4.4s,v4.4s,v5.4s + eor w12,w12,w16 + add v8.4s,v8.4s,v9.4s + ror w9,w9,#25 + add v12.4s,v12.4s,v13.4s + ror w10,w10,#25 + add v16.4s,v16.4s,v17.4s + ror w11,w11,#25 + add v20.4s,v20.4s,v21.4s + ror w12,w12,#25 + eor v24.16b,v3.16b,v0.16b + add w5,w5,w10 + eor v25.16b,v7.16b,v4.16b + add w6,w6,w11 + eor v26.16b,v11.16b,v8.16b + add w7,w7,w12 + eor v27.16b,v15.16b,v12.16b + add w8,w8,w9 + eor v28.16b,v19.16b,v16.16b + eor w21,w21,w5 + eor v29.16b,v23.16b,v20.16b + eor w17,w17,w6 + ushr v3.4s,v24.4s,#24 + eor w19,w19,w7 + ushr v7.4s,v25.4s,#24 + eor w20,w20,w8 + ushr v11.4s,v26.4s,#24 + ror w21,w21,#16 + ushr v15.4s,v27.4s,#24 + ror w17,w17,#16 + ushr v19.4s,v28.4s,#24 + ror w19,w19,#16 + ushr v23.4s,v29.4s,#24 + ror w20,w20,#16 + sli v3.4s,v24.4s,#8 + add w15,w15,w21 + sli v7.4s,v25.4s,#8 + add w16,w16,w17 + sli v11.4s,v26.4s,#8 + add w13,w13,w19 + sli v15.4s,v27.4s,#8 + add 
w14,w14,w20 + sli v19.4s,v28.4s,#8 + eor w10,w10,w15 + sli v23.4s,v29.4s,#8 + eor w11,w11,w16 + add v2.4s,v2.4s,v3.4s + eor w12,w12,w13 + add v6.4s,v6.4s,v7.4s + eor w9,w9,w14 + add v10.4s,v10.4s,v11.4s + ror w10,w10,#20 + add v14.4s,v14.4s,v15.4s + ror w11,w11,#20 + add v18.4s,v18.4s,v19.4s + ror w12,w12,#20 + add v22.4s,v22.4s,v23.4s + ror w9,w9,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w10 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w11 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w12 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w9 + eor v28.16b,v17.16b,v18.16b + eor w21,w21,w5 + eor v29.16b,v21.16b,v22.16b + eor w17,w17,w6 + ushr v1.4s,v24.4s,#25 + eor w19,w19,w7 + ushr v5.4s,v25.4s,#25 + eor w20,w20,w8 + ushr v9.4s,v26.4s,#25 + ror w21,w21,#24 + ushr v13.4s,v27.4s,#25 + ror w17,w17,#24 + ushr v17.4s,v28.4s,#25 + ror w19,w19,#24 + ushr v21.4s,v29.4s,#25 + ror w20,w20,#24 + sli v1.4s,v24.4s,#7 + add w15,w15,w21 + sli v5.4s,v25.4s,#7 + add w16,w16,w17 + sli v9.4s,v26.4s,#7 + add w13,w13,w19 + sli v13.4s,v27.4s,#7 + add w14,w14,w20 + sli v17.4s,v28.4s,#7 + eor w10,w10,w15 + sli v21.4s,v29.4s,#7 + eor w11,w11,w16 + ext v2.16b,v2.16b,v2.16b,#8 + eor w12,w12,w13 + ext v6.16b,v6.16b,v6.16b,#8 + eor w9,w9,w14 + ext v10.16b,v10.16b,v10.16b,#8 + ror w10,w10,#25 + ext v14.16b,v14.16b,v14.16b,#8 + ror w11,w11,#25 + ext v18.16b,v18.16b,v18.16b,#8 + ror w12,w12,#25 + ext v22.16b,v22.16b,v22.16b,#8 + ror w9,w9,#25 + ext v3.16b,v3.16b,v3.16b,#4 + ext v7.16b,v7.16b,v7.16b,#4 + ext v11.16b,v11.16b,v11.16b,#4 + ext v15.16b,v15.16b,v15.16b,#4 + ext v19.16b,v19.16b,v19.16b,#4 + ext v23.16b,v23.16b,v23.16b,#4 + ext v1.16b,v1.16b,v1.16b,#12 + ext v5.16b,v5.16b,v5.16b,#12 + ext v9.16b,v9.16b,v9.16b,#12 + ext v13.16b,v13.16b,v13.16b,#12 + ext v17.16b,v17.16b,v17.16b,#12 + ext v21.16b,v21.16b,v21.16b,#12 + cbnz x4,.Loop_upper_neon + + add w5,w5,w22 // accumulate key block + add x6,x6,x22,lsr#32 + add w7,w7,w23 + add x8,x8,x23,lsr#32 + add w9,w9,w24 + add x10,x10,x24,lsr#32 + add w11,w11,w25 + add x12,x12,x25,lsr#32 + add w13,w13,w26 + add x14,x14,x26,lsr#32 + add w15,w15,w27 + add x16,x16,x27,lsr#32 + add w17,w17,w28 + add x19,x19,x28,lsr#32 + add w20,w20,w30 + add x21,x21,x30,lsr#32 + + add x5,x5,x6,lsl#32 // pack + add x7,x7,x8,lsl#32 + ldp x6,x8,[x1,#0] // load input + add x9,x9,x10,lsl#32 + add x11,x11,x12,lsl#32 + ldp x10,x12,[x1,#16] + add x13,x13,x14,lsl#32 + add x15,x15,x16,lsl#32 + ldp x14,x16,[x1,#32] + add x17,x17,x19,lsl#32 + add x20,x20,x21,lsl#32 + ldp x19,x21,[x1,#48] + add x1,x1,#64 +#ifdef __AARCH64EB__ + rev x5,x5 + rev x7,x7 + rev x9,x9 + rev x11,x11 + rev x13,x13 + rev x15,x15 + rev x17,x17 + rev x20,x20 +#endif + eor x5,x5,x6 + eor x7,x7,x8 + eor x9,x9,x10 + eor x11,x11,x12 + eor x13,x13,x14 + eor x15,x15,x16 + eor x17,x17,x19 + eor x20,x20,x21 + + stp x5,x7,[x0,#0] // store output + add x28,x28,#1 // increment counter + mov w5,w22 // unpack key block + lsr x6,x22,#32 + stp x9,x11,[x0,#16] + mov w7,w23 + lsr x8,x23,#32 + stp x13,x15,[x0,#32] + mov w9,w24 + lsr x10,x24,#32 + stp x17,x20,[x0,#48] + add x0,x0,#64 + mov w11,w25 + lsr x12,x25,#32 + mov w13,w26 + lsr x14,x26,#32 + mov w15,w27 + lsr x16,x27,#32 + mov w17,w28 + lsr x19,x28,#32 + mov w20,w30 + lsr x21,x30,#32 + + mov x4,#5 +.Loop_lower_neon: + sub x4,x4,#1 + add v0.4s,v0.4s,v1.4s + add w5,w5,w9 + add v4.4s,v4.4s,v5.4s + add w6,w6,w10 + add v8.4s,v8.4s,v9.4s + add w7,w7,w11 + add v12.4s,v12.4s,v13.4s + add w8,w8,w12 + add v16.4s,v16.4s,v17.4s + eor w17,w17,w5 + add v20.4s,v20.4s,v21.4s + eor w19,w19,w6 + eor v3.16b,v3.16b,v0.16b + eor 
w20,w20,w7 + eor v7.16b,v7.16b,v4.16b + eor w21,w21,w8 + eor v11.16b,v11.16b,v8.16b + ror w17,w17,#16 + eor v15.16b,v15.16b,v12.16b + ror w19,w19,#16 + eor v19.16b,v19.16b,v16.16b + ror w20,w20,#16 + eor v23.16b,v23.16b,v20.16b + ror w21,w21,#16 + rev32 v3.8h,v3.8h + add w13,w13,w17 + rev32 v7.8h,v7.8h + add w14,w14,w19 + rev32 v11.8h,v11.8h + add w15,w15,w20 + rev32 v15.8h,v15.8h + add w16,w16,w21 + rev32 v19.8h,v19.8h + eor w9,w9,w13 + rev32 v23.8h,v23.8h + eor w10,w10,w14 + add v2.4s,v2.4s,v3.4s + eor w11,w11,w15 + add v6.4s,v6.4s,v7.4s + eor w12,w12,w16 + add v10.4s,v10.4s,v11.4s + ror w9,w9,#20 + add v14.4s,v14.4s,v15.4s + ror w10,w10,#20 + add v18.4s,v18.4s,v19.4s + ror w11,w11,#20 + add v22.4s,v22.4s,v23.4s + ror w12,w12,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w9 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w10 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w11 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w12 + eor v28.16b,v17.16b,v18.16b + eor w17,w17,w5 + eor v29.16b,v21.16b,v22.16b + eor w19,w19,w6 + ushr v1.4s,v24.4s,#20 + eor w20,w20,w7 + ushr v5.4s,v25.4s,#20 + eor w21,w21,w8 + ushr v9.4s,v26.4s,#20 + ror w17,w17,#24 + ushr v13.4s,v27.4s,#20 + ror w19,w19,#24 + ushr v17.4s,v28.4s,#20 + ror w20,w20,#24 + ushr v21.4s,v29.4s,#20 + ror w21,w21,#24 + sli v1.4s,v24.4s,#12 + add w13,w13,w17 + sli v5.4s,v25.4s,#12 + add w14,w14,w19 + sli v9.4s,v26.4s,#12 + add w15,w15,w20 + sli v13.4s,v27.4s,#12 + add w16,w16,w21 + sli v17.4s,v28.4s,#12 + eor w9,w9,w13 + sli v21.4s,v29.4s,#12 + eor w10,w10,w14 + add v0.4s,v0.4s,v1.4s + eor w11,w11,w15 + add v4.4s,v4.4s,v5.4s + eor w12,w12,w16 + add v8.4s,v8.4s,v9.4s + ror w9,w9,#25 + add v12.4s,v12.4s,v13.4s + ror w10,w10,#25 + add v16.4s,v16.4s,v17.4s + ror w11,w11,#25 + add v20.4s,v20.4s,v21.4s + ror w12,w12,#25 + eor v24.16b,v3.16b,v0.16b + add w5,w5,w10 + eor v25.16b,v7.16b,v4.16b + add w6,w6,w11 + eor v26.16b,v11.16b,v8.16b + add w7,w7,w12 + eor v27.16b,v15.16b,v12.16b + add w8,w8,w9 + eor v28.16b,v19.16b,v16.16b + eor w21,w21,w5 + eor v29.16b,v23.16b,v20.16b + eor w17,w17,w6 + ushr v3.4s,v24.4s,#24 + eor w19,w19,w7 + ushr v7.4s,v25.4s,#24 + eor w20,w20,w8 + ushr v11.4s,v26.4s,#24 + ror w21,w21,#16 + ushr v15.4s,v27.4s,#24 + ror w17,w17,#16 + ushr v19.4s,v28.4s,#24 + ror w19,w19,#16 + ushr v23.4s,v29.4s,#24 + ror w20,w20,#16 + sli v3.4s,v24.4s,#8 + add w15,w15,w21 + sli v7.4s,v25.4s,#8 + add w16,w16,w17 + sli v11.4s,v26.4s,#8 + add w13,w13,w19 + sli v15.4s,v27.4s,#8 + add w14,w14,w20 + sli v19.4s,v28.4s,#8 + eor w10,w10,w15 + sli v23.4s,v29.4s,#8 + eor w11,w11,w16 + add v2.4s,v2.4s,v3.4s + eor w12,w12,w13 + add v6.4s,v6.4s,v7.4s + eor w9,w9,w14 + add v10.4s,v10.4s,v11.4s + ror w10,w10,#20 + add v14.4s,v14.4s,v15.4s + ror w11,w11,#20 + add v18.4s,v18.4s,v19.4s + ror w12,w12,#20 + add v22.4s,v22.4s,v23.4s + ror w9,w9,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w10 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w11 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w12 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w9 + eor v28.16b,v17.16b,v18.16b + eor w21,w21,w5 + eor v29.16b,v21.16b,v22.16b + eor w17,w17,w6 + ushr v1.4s,v24.4s,#25 + eor w19,w19,w7 + ushr v5.4s,v25.4s,#25 + eor w20,w20,w8 + ushr v9.4s,v26.4s,#25 + ror w21,w21,#24 + ushr v13.4s,v27.4s,#25 + ror w17,w17,#24 + ushr v17.4s,v28.4s,#25 + ror w19,w19,#24 + ushr v21.4s,v29.4s,#25 + ror w20,w20,#24 + sli v1.4s,v24.4s,#7 + add w15,w15,w21 + sli v5.4s,v25.4s,#7 + add w16,w16,w17 + sli v9.4s,v26.4s,#7 + add w13,w13,w19 + sli v13.4s,v27.4s,#7 + add w14,w14,w20 + sli v17.4s,v28.4s,#7 + eor w10,w10,w15 + sli v21.4s,v29.4s,#7 + 
eor w11,w11,w16 + ext v2.16b,v2.16b,v2.16b,#8 + eor w12,w12,w13 + ext v6.16b,v6.16b,v6.16b,#8 + eor w9,w9,w14 + ext v10.16b,v10.16b,v10.16b,#8 + ror w10,w10,#25 + ext v14.16b,v14.16b,v14.16b,#8 + ror w11,w11,#25 + ext v18.16b,v18.16b,v18.16b,#8 + ror w12,w12,#25 + ext v22.16b,v22.16b,v22.16b,#8 + ror w9,w9,#25 + ext v3.16b,v3.16b,v3.16b,#12 + ext v7.16b,v7.16b,v7.16b,#12 + ext v11.16b,v11.16b,v11.16b,#12 + ext v15.16b,v15.16b,v15.16b,#12 + ext v19.16b,v19.16b,v19.16b,#12 + ext v23.16b,v23.16b,v23.16b,#12 + ext v1.16b,v1.16b,v1.16b,#4 + ext v5.16b,v5.16b,v5.16b,#4 + ext v9.16b,v9.16b,v9.16b,#4 + ext v13.16b,v13.16b,v13.16b,#4 + ext v17.16b,v17.16b,v17.16b,#4 + ext v21.16b,v21.16b,v21.16b,#4 + add v0.4s,v0.4s,v1.4s + add w5,w5,w9 + add v4.4s,v4.4s,v5.4s + add w6,w6,w10 + add v8.4s,v8.4s,v9.4s + add w7,w7,w11 + add v12.4s,v12.4s,v13.4s + add w8,w8,w12 + add v16.4s,v16.4s,v17.4s + eor w17,w17,w5 + add v20.4s,v20.4s,v21.4s + eor w19,w19,w6 + eor v3.16b,v3.16b,v0.16b + eor w20,w20,w7 + eor v7.16b,v7.16b,v4.16b + eor w21,w21,w8 + eor v11.16b,v11.16b,v8.16b + ror w17,w17,#16 + eor v15.16b,v15.16b,v12.16b + ror w19,w19,#16 + eor v19.16b,v19.16b,v16.16b + ror w20,w20,#16 + eor v23.16b,v23.16b,v20.16b + ror w21,w21,#16 + rev32 v3.8h,v3.8h + add w13,w13,w17 + rev32 v7.8h,v7.8h + add w14,w14,w19 + rev32 v11.8h,v11.8h + add w15,w15,w20 + rev32 v15.8h,v15.8h + add w16,w16,w21 + rev32 v19.8h,v19.8h + eor w9,w9,w13 + rev32 v23.8h,v23.8h + eor w10,w10,w14 + add v2.4s,v2.4s,v3.4s + eor w11,w11,w15 + add v6.4s,v6.4s,v7.4s + eor w12,w12,w16 + add v10.4s,v10.4s,v11.4s + ror w9,w9,#20 + add v14.4s,v14.4s,v15.4s + ror w10,w10,#20 + add v18.4s,v18.4s,v19.4s + ror w11,w11,#20 + add v22.4s,v22.4s,v23.4s + ror w12,w12,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w9 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w10 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w11 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w12 + eor v28.16b,v17.16b,v18.16b + eor w17,w17,w5 + eor v29.16b,v21.16b,v22.16b + eor w19,w19,w6 + ushr v1.4s,v24.4s,#20 + eor w20,w20,w7 + ushr v5.4s,v25.4s,#20 + eor w21,w21,w8 + ushr v9.4s,v26.4s,#20 + ror w17,w17,#24 + ushr v13.4s,v27.4s,#20 + ror w19,w19,#24 + ushr v17.4s,v28.4s,#20 + ror w20,w20,#24 + ushr v21.4s,v29.4s,#20 + ror w21,w21,#24 + sli v1.4s,v24.4s,#12 + add w13,w13,w17 + sli v5.4s,v25.4s,#12 + add w14,w14,w19 + sli v9.4s,v26.4s,#12 + add w15,w15,w20 + sli v13.4s,v27.4s,#12 + add w16,w16,w21 + sli v17.4s,v28.4s,#12 + eor w9,w9,w13 + sli v21.4s,v29.4s,#12 + eor w10,w10,w14 + add v0.4s,v0.4s,v1.4s + eor w11,w11,w15 + add v4.4s,v4.4s,v5.4s + eor w12,w12,w16 + add v8.4s,v8.4s,v9.4s + ror w9,w9,#25 + add v12.4s,v12.4s,v13.4s + ror w10,w10,#25 + add v16.4s,v16.4s,v17.4s + ror w11,w11,#25 + add v20.4s,v20.4s,v21.4s + ror w12,w12,#25 + eor v24.16b,v3.16b,v0.16b + add w5,w5,w10 + eor v25.16b,v7.16b,v4.16b + add w6,w6,w11 + eor v26.16b,v11.16b,v8.16b + add w7,w7,w12 + eor v27.16b,v15.16b,v12.16b + add w8,w8,w9 + eor v28.16b,v19.16b,v16.16b + eor w21,w21,w5 + eor v29.16b,v23.16b,v20.16b + eor w17,w17,w6 + ushr v3.4s,v24.4s,#24 + eor w19,w19,w7 + ushr v7.4s,v25.4s,#24 + eor w20,w20,w8 + ushr v11.4s,v26.4s,#24 + ror w21,w21,#16 + ushr v15.4s,v27.4s,#24 + ror w17,w17,#16 + ushr v19.4s,v28.4s,#24 + ror w19,w19,#16 + ushr v23.4s,v29.4s,#24 + ror w20,w20,#16 + sli v3.4s,v24.4s,#8 + add w15,w15,w21 + sli v7.4s,v25.4s,#8 + add w16,w16,w17 + sli v11.4s,v26.4s,#8 + add w13,w13,w19 + sli v15.4s,v27.4s,#8 + add w14,w14,w20 + sli v19.4s,v28.4s,#8 + eor w10,w10,w15 + sli v23.4s,v29.4s,#8 + eor w11,w11,w16 + add v2.4s,v2.4s,v3.4s + eor 
w12,w12,w13 + add v6.4s,v6.4s,v7.4s + eor w9,w9,w14 + add v10.4s,v10.4s,v11.4s + ror w10,w10,#20 + add v14.4s,v14.4s,v15.4s + ror w11,w11,#20 + add v18.4s,v18.4s,v19.4s + ror w12,w12,#20 + add v22.4s,v22.4s,v23.4s + ror w9,w9,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w10 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w11 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w12 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w9 + eor v28.16b,v17.16b,v18.16b + eor w21,w21,w5 + eor v29.16b,v21.16b,v22.16b + eor w17,w17,w6 + ushr v1.4s,v24.4s,#25 + eor w19,w19,w7 + ushr v5.4s,v25.4s,#25 + eor w20,w20,w8 + ushr v9.4s,v26.4s,#25 + ror w21,w21,#24 + ushr v13.4s,v27.4s,#25 + ror w17,w17,#24 + ushr v17.4s,v28.4s,#25 + ror w19,w19,#24 + ushr v21.4s,v29.4s,#25 + ror w20,w20,#24 + sli v1.4s,v24.4s,#7 + add w15,w15,w21 + sli v5.4s,v25.4s,#7 + add w16,w16,w17 + sli v9.4s,v26.4s,#7 + add w13,w13,w19 + sli v13.4s,v27.4s,#7 + add w14,w14,w20 + sli v17.4s,v28.4s,#7 + eor w10,w10,w15 + sli v21.4s,v29.4s,#7 + eor w11,w11,w16 + ext v2.16b,v2.16b,v2.16b,#8 + eor w12,w12,w13 + ext v6.16b,v6.16b,v6.16b,#8 + eor w9,w9,w14 + ext v10.16b,v10.16b,v10.16b,#8 + ror w10,w10,#25 + ext v14.16b,v14.16b,v14.16b,#8 + ror w11,w11,#25 + ext v18.16b,v18.16b,v18.16b,#8 + ror w12,w12,#25 + ext v22.16b,v22.16b,v22.16b,#8 + ror w9,w9,#25 + ext v3.16b,v3.16b,v3.16b,#4 + ext v7.16b,v7.16b,v7.16b,#4 + ext v11.16b,v11.16b,v11.16b,#4 + ext v15.16b,v15.16b,v15.16b,#4 + ext v19.16b,v19.16b,v19.16b,#4 + ext v23.16b,v23.16b,v23.16b,#4 + ext v1.16b,v1.16b,v1.16b,#12 + ext v5.16b,v5.16b,v5.16b,#12 + ext v9.16b,v9.16b,v9.16b,#12 + ext v13.16b,v13.16b,v13.16b,#12 + ext v17.16b,v17.16b,v17.16b,#12 + ext v21.16b,v21.16b,v21.16b,#12 + cbnz x4,.Loop_lower_neon + + add w5,w5,w22 // accumulate key block + ldp q24,q25,[sp,#0] + add x6,x6,x22,lsr#32 + ldp q26,q27,[sp,#32] + add w7,w7,w23 + ldp q28,q29,[sp,#64] + add x8,x8,x23,lsr#32 + add v0.4s,v0.4s,v24.4s + add w9,w9,w24 + add v4.4s,v4.4s,v24.4s + add x10,x10,x24,lsr#32 + add v8.4s,v8.4s,v24.4s + add w11,w11,w25 + add v12.4s,v12.4s,v24.4s + add x12,x12,x25,lsr#32 + add v16.4s,v16.4s,v24.4s + add w13,w13,w26 + add v20.4s,v20.4s,v24.4s + add x14,x14,x26,lsr#32 + add v2.4s,v2.4s,v26.4s + add w15,w15,w27 + add v6.4s,v6.4s,v26.4s + add x16,x16,x27,lsr#32 + add v10.4s,v10.4s,v26.4s + add w17,w17,w28 + add v14.4s,v14.4s,v26.4s + add x19,x19,x28,lsr#32 + add v18.4s,v18.4s,v26.4s + add w20,w20,w30 + add v22.4s,v22.4s,v26.4s + add x21,x21,x30,lsr#32 + add v19.4s,v19.4s,v31.4s // +4 + add x5,x5,x6,lsl#32 // pack + add v23.4s,v23.4s,v31.4s // +4 + add x7,x7,x8,lsl#32 + add v3.4s,v3.4s,v27.4s + ldp x6,x8,[x1,#0] // load input + add v7.4s,v7.4s,v28.4s + add x9,x9,x10,lsl#32 + add v11.4s,v11.4s,v29.4s + add x11,x11,x12,lsl#32 + add v15.4s,v15.4s,v30.4s + ldp x10,x12,[x1,#16] + add v19.4s,v19.4s,v27.4s + add x13,x13,x14,lsl#32 + add v23.4s,v23.4s,v28.4s + add x15,x15,x16,lsl#32 + add v1.4s,v1.4s,v25.4s + ldp x14,x16,[x1,#32] + add v5.4s,v5.4s,v25.4s + add x17,x17,x19,lsl#32 + add v9.4s,v9.4s,v25.4s + add x20,x20,x21,lsl#32 + add v13.4s,v13.4s,v25.4s + ldp x19,x21,[x1,#48] + add v17.4s,v17.4s,v25.4s + add x1,x1,#64 + add v21.4s,v21.4s,v25.4s + +#ifdef __AARCH64EB__ + rev x5,x5 + rev x7,x7 + rev x9,x9 + rev x11,x11 + rev x13,x13 + rev x15,x15 + rev x17,x17 + rev x20,x20 +#endif + ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 + eor x5,x5,x6 + eor x7,x7,x8 + eor x9,x9,x10 + eor x11,x11,x12 + eor x13,x13,x14 + eor v0.16b,v0.16b,v24.16b + eor x15,x15,x16 + eor v1.16b,v1.16b,v25.16b + eor x17,x17,x19 + eor v2.16b,v2.16b,v26.16b + eor 
x20,x20,x21 + eor v3.16b,v3.16b,v27.16b + ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 + + stp x5,x7,[x0,#0] // store output + add x28,x28,#7 // increment counter + stp x9,x11,[x0,#16] + stp x13,x15,[x0,#32] + stp x17,x20,[x0,#48] + add x0,x0,#64 + st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 + + ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 + eor v4.16b,v4.16b,v24.16b + eor v5.16b,v5.16b,v25.16b + eor v6.16b,v6.16b,v26.16b + eor v7.16b,v7.16b,v27.16b + st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 + + ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 + eor v8.16b,v8.16b,v0.16b + ldp q24,q25,[sp,#0] + eor v9.16b,v9.16b,v1.16b + ldp q26,q27,[sp,#32] + eor v10.16b,v10.16b,v2.16b + eor v11.16b,v11.16b,v3.16b + st1 {v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64 + + ld1 {v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64 + eor v12.16b,v12.16b,v4.16b + eor v13.16b,v13.16b,v5.16b + eor v14.16b,v14.16b,v6.16b + eor v15.16b,v15.16b,v7.16b + st1 {v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64 + + ld1 {v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64 + eor v16.16b,v16.16b,v8.16b + eor v17.16b,v17.16b,v9.16b + eor v18.16b,v18.16b,v10.16b + eor v19.16b,v19.16b,v11.16b + st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64 + + shl v0.4s,v31.4s,#1 // 4 -> 8 + eor v20.16b,v20.16b,v12.16b + eor v21.16b,v21.16b,v13.16b + eor v22.16b,v22.16b,v14.16b + eor v23.16b,v23.16b,v15.16b + st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64 + + add v27.4s,v27.4s,v0.4s // += 8 + add v28.4s,v28.4s,v0.4s + add v29.4s,v29.4s,v0.4s + add v30.4s,v30.4s,v0.4s + + b.hs .Loop_outer_512_neon + + adds x2,x2,#512 + ushr v0.4s,v31.4s,#2 // 4 -> 1 + + ldp d8,d9,[sp,#128+0] // meet ABI requirements + ldp d10,d11,[sp,#128+16] + ldp d12,d13,[sp,#128+32] + ldp d14,d15,[sp,#128+48] + + stp q24,q31,[sp,#0] // wipe off-load area + stp q24,q31,[sp,#32] + stp q24,q31,[sp,#64] + + b.eq .Ldone_512_neon + + cmp x2,#192 + sub v27.4s,v27.4s,v0.4s // -= 1 + sub v28.4s,v28.4s,v0.4s + sub v29.4s,v29.4s,v0.4s + add sp,sp,#128 + b.hs .Loop_outer_neon + + eor v25.16b,v25.16b,v25.16b + eor v26.16b,v26.16b,v26.16b + eor v27.16b,v27.16b,v27.16b + eor v28.16b,v28.16b,v28.16b + eor v29.16b,v29.16b,v29.16b + eor v30.16b,v30.16b,v30.16b + b .Loop_outer + +.Ldone_512_neon: + ldp x19,x20,[x29,#16] + add sp,sp,#128+64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size ChaCha20_512_neon,.-ChaCha20_512_neon +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__) diff --git a/ring-0.17.14/pregenerated/chacha-armv8-win64.S b/ring-0.17.14/pregenerated/chacha-armv8-win64.S new file mode 100644 index 0000000000..12d054264e --- /dev/null +++ b/ring-0.17.14/pregenerated/chacha-armv8-win64.S @@ -0,0 +1,1972 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32) +.section .rodata + +.align 5 +Lsigma: +.quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral +Lone: +.long 1,0,0,0 +.byte 67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 + +.text + +.globl ChaCha20_ctr32_nohw + +.def ChaCha20_ctr32_nohw + .type 32 +.endef +.align 5 +ChaCha20_ctr32_nohw: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-96]! 
+ add x29,sp,#0 + + adrp x5,Lsigma + add x5,x5,:lo12:Lsigma + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#64 + + ldp x22,x23,[x5] // load sigma + ldp x24,x25,[x3] // load key + ldp x26,x27,[x3,#16] + ldp x28,x30,[x4] // load counter +#ifdef __AARCH64EB__ + ror x24,x24,#32 + ror x25,x25,#32 + ror x26,x26,#32 + ror x27,x27,#32 + ror x28,x28,#32 + ror x30,x30,#32 +#endif + +Loop_outer: + mov w5,w22 // unpack key block + lsr x6,x22,#32 + mov w7,w23 + lsr x8,x23,#32 + mov w9,w24 + lsr x10,x24,#32 + mov w11,w25 + lsr x12,x25,#32 + mov w13,w26 + lsr x14,x26,#32 + mov w15,w27 + lsr x16,x27,#32 + mov w17,w28 + lsr x19,x28,#32 + mov w20,w30 + lsr x21,x30,#32 + + mov x4,#10 + subs x2,x2,#64 +Loop: + sub x4,x4,#1 + add w5,w5,w9 + add w6,w6,w10 + add w7,w7,w11 + add w8,w8,w12 + eor w17,w17,w5 + eor w19,w19,w6 + eor w20,w20,w7 + eor w21,w21,w8 + ror w17,w17,#16 + ror w19,w19,#16 + ror w20,w20,#16 + ror w21,w21,#16 + add w13,w13,w17 + add w14,w14,w19 + add w15,w15,w20 + add w16,w16,w21 + eor w9,w9,w13 + eor w10,w10,w14 + eor w11,w11,w15 + eor w12,w12,w16 + ror w9,w9,#20 + ror w10,w10,#20 + ror w11,w11,#20 + ror w12,w12,#20 + add w5,w5,w9 + add w6,w6,w10 + add w7,w7,w11 + add w8,w8,w12 + eor w17,w17,w5 + eor w19,w19,w6 + eor w20,w20,w7 + eor w21,w21,w8 + ror w17,w17,#24 + ror w19,w19,#24 + ror w20,w20,#24 + ror w21,w21,#24 + add w13,w13,w17 + add w14,w14,w19 + add w15,w15,w20 + add w16,w16,w21 + eor w9,w9,w13 + eor w10,w10,w14 + eor w11,w11,w15 + eor w12,w12,w16 + ror w9,w9,#25 + ror w10,w10,#25 + ror w11,w11,#25 + ror w12,w12,#25 + add w5,w5,w10 + add w6,w6,w11 + add w7,w7,w12 + add w8,w8,w9 + eor w21,w21,w5 + eor w17,w17,w6 + eor w19,w19,w7 + eor w20,w20,w8 + ror w21,w21,#16 + ror w17,w17,#16 + ror w19,w19,#16 + ror w20,w20,#16 + add w15,w15,w21 + add w16,w16,w17 + add w13,w13,w19 + add w14,w14,w20 + eor w10,w10,w15 + eor w11,w11,w16 + eor w12,w12,w13 + eor w9,w9,w14 + ror w10,w10,#20 + ror w11,w11,#20 + ror w12,w12,#20 + ror w9,w9,#20 + add w5,w5,w10 + add w6,w6,w11 + add w7,w7,w12 + add w8,w8,w9 + eor w21,w21,w5 + eor w17,w17,w6 + eor w19,w19,w7 + eor w20,w20,w8 + ror w21,w21,#24 + ror w17,w17,#24 + ror w19,w19,#24 + ror w20,w20,#24 + add w15,w15,w21 + add w16,w16,w17 + add w13,w13,w19 + add w14,w14,w20 + eor w10,w10,w15 + eor w11,w11,w16 + eor w12,w12,w13 + eor w9,w9,w14 + ror w10,w10,#25 + ror w11,w11,#25 + ror w12,w12,#25 + ror w9,w9,#25 + cbnz x4,Loop + + add w5,w5,w22 // accumulate key block + add x6,x6,x22,lsr#32 + add w7,w7,w23 + add x8,x8,x23,lsr#32 + add w9,w9,w24 + add x10,x10,x24,lsr#32 + add w11,w11,w25 + add x12,x12,x25,lsr#32 + add w13,w13,w26 + add x14,x14,x26,lsr#32 + add w15,w15,w27 + add x16,x16,x27,lsr#32 + add w17,w17,w28 + add x19,x19,x28,lsr#32 + add w20,w20,w30 + add x21,x21,x30,lsr#32 + + b.lo Ltail + + add x5,x5,x6,lsl#32 // pack + add x7,x7,x8,lsl#32 + ldp x6,x8,[x1,#0] // load input + add x9,x9,x10,lsl#32 + add x11,x11,x12,lsl#32 + ldp x10,x12,[x1,#16] + add x13,x13,x14,lsl#32 + add x15,x15,x16,lsl#32 + ldp x14,x16,[x1,#32] + add x17,x17,x19,lsl#32 + add x20,x20,x21,lsl#32 + ldp x19,x21,[x1,#48] + add x1,x1,#64 +#ifdef __AARCH64EB__ + rev x5,x5 + rev x7,x7 + rev x9,x9 + rev x11,x11 + rev x13,x13 + rev x15,x15 + rev x17,x17 + rev x20,x20 +#endif + eor x5,x5,x6 + eor x7,x7,x8 + eor x9,x9,x10 + eor x11,x11,x12 + eor x13,x13,x14 + eor x15,x15,x16 + eor x17,x17,x19 + eor x20,x20,x21 + + stp x5,x7,[x0,#0] // store output + add x28,x28,#1 // increment counter + stp x9,x11,[x0,#16] + stp 
x13,x15,[x0,#32] + stp x17,x20,[x0,#48] + add x0,x0,#64 + + b.hi Loop_outer + + ldp x19,x20,[x29,#16] + add sp,sp,#64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +.align 4 +Ltail: + add x2,x2,#64 +Less_than_64: + sub x0,x0,#1 + add x1,x1,x2 + add x0,x0,x2 + add x4,sp,x2 + neg x2,x2 + + add x5,x5,x6,lsl#32 // pack + add x7,x7,x8,lsl#32 + add x9,x9,x10,lsl#32 + add x11,x11,x12,lsl#32 + add x13,x13,x14,lsl#32 + add x15,x15,x16,lsl#32 + add x17,x17,x19,lsl#32 + add x20,x20,x21,lsl#32 +#ifdef __AARCH64EB__ + rev x5,x5 + rev x7,x7 + rev x9,x9 + rev x11,x11 + rev x13,x13 + rev x15,x15 + rev x17,x17 + rev x20,x20 +#endif + stp x5,x7,[sp,#0] + stp x9,x11,[sp,#16] + stp x13,x15,[sp,#32] + stp x17,x20,[sp,#48] + +Loop_tail: + ldrb w10,[x1,x2] + ldrb w11,[x4,x2] + add x2,x2,#1 + eor w10,w10,w11 + strb w10,[x0,x2] + cbnz x2,Loop_tail + + stp xzr,xzr,[sp,#0] + stp xzr,xzr,[sp,#16] + stp xzr,xzr,[sp,#32] + stp xzr,xzr,[sp,#48] + + ldp x19,x20,[x29,#16] + add sp,sp,#64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + AARCH64_VALIDATE_LINK_REGISTER + ret + + +.globl ChaCha20_ctr32_neon + +.def ChaCha20_ctr32_neon + .type 32 +.endef +.align 5 +ChaCha20_ctr32_neon: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-96]! + add x29,sp,#0 + + adrp x5,Lsigma + add x5,x5,:lo12:Lsigma + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + cmp x2,#512 + b.hs L512_or_more_neon + + sub sp,sp,#64 + + ldp x22,x23,[x5] // load sigma + ld1 {v24.4s},[x5],#16 + ldp x24,x25,[x3] // load key + ldp x26,x27,[x3,#16] + ld1 {v25.4s,v26.4s},[x3] + ldp x28,x30,[x4] // load counter + ld1 {v27.4s},[x4] + ld1 {v31.4s},[x5] +#ifdef __AARCH64EB__ + rev64 v24.4s,v24.4s + ror x24,x24,#32 + ror x25,x25,#32 + ror x26,x26,#32 + ror x27,x27,#32 + ror x28,x28,#32 + ror x30,x30,#32 +#endif + add v27.4s,v27.4s,v31.4s // += 1 + add v28.4s,v27.4s,v31.4s + add v29.4s,v28.4s,v31.4s + shl v31.4s,v31.4s,#2 // 1 -> 4 + +Loop_outer_neon: + mov w5,w22 // unpack key block + lsr x6,x22,#32 + mov v0.16b,v24.16b + mov w7,w23 + lsr x8,x23,#32 + mov v4.16b,v24.16b + mov w9,w24 + lsr x10,x24,#32 + mov v16.16b,v24.16b + mov w11,w25 + mov v1.16b,v25.16b + lsr x12,x25,#32 + mov v5.16b,v25.16b + mov w13,w26 + mov v17.16b,v25.16b + lsr x14,x26,#32 + mov v3.16b,v27.16b + mov w15,w27 + mov v7.16b,v28.16b + lsr x16,x27,#32 + mov v19.16b,v29.16b + mov w17,w28 + mov v2.16b,v26.16b + lsr x19,x28,#32 + mov v6.16b,v26.16b + mov w20,w30 + mov v18.16b,v26.16b + lsr x21,x30,#32 + + mov x4,#10 + subs x2,x2,#256 +Loop_neon: + sub x4,x4,#1 + add v0.4s,v0.4s,v1.4s + add w5,w5,w9 + add v4.4s,v4.4s,v5.4s + add w6,w6,w10 + add v16.4s,v16.4s,v17.4s + add w7,w7,w11 + eor v3.16b,v3.16b,v0.16b + add w8,w8,w12 + eor v7.16b,v7.16b,v4.16b + eor w17,w17,w5 + eor v19.16b,v19.16b,v16.16b + eor w19,w19,w6 + rev32 v3.8h,v3.8h + eor w20,w20,w7 + rev32 v7.8h,v7.8h + eor w21,w21,w8 + rev32 v19.8h,v19.8h + ror w17,w17,#16 + add v2.4s,v2.4s,v3.4s + ror w19,w19,#16 + add v6.4s,v6.4s,v7.4s + ror w20,w20,#16 + add v18.4s,v18.4s,v19.4s + ror w21,w21,#16 + eor v20.16b,v1.16b,v2.16b + add w13,w13,w17 + eor v21.16b,v5.16b,v6.16b + add w14,w14,w19 + eor v22.16b,v17.16b,v18.16b + add w15,w15,w20 + ushr v1.4s,v20.4s,#20 + add w16,w16,w21 + ushr v5.4s,v21.4s,#20 + eor w9,w9,w13 + ushr v17.4s,v22.4s,#20 + eor w10,w10,w14 + sli v1.4s,v20.4s,#12 + eor w11,w11,w15 + sli 
v5.4s,v21.4s,#12 + eor w12,w12,w16 + sli v17.4s,v22.4s,#12 + ror w9,w9,#20 + add v0.4s,v0.4s,v1.4s + ror w10,w10,#20 + add v4.4s,v4.4s,v5.4s + ror w11,w11,#20 + add v16.4s,v16.4s,v17.4s + ror w12,w12,#20 + eor v20.16b,v3.16b,v0.16b + add w5,w5,w9 + eor v21.16b,v7.16b,v4.16b + add w6,w6,w10 + eor v22.16b,v19.16b,v16.16b + add w7,w7,w11 + ushr v3.4s,v20.4s,#24 + add w8,w8,w12 + ushr v7.4s,v21.4s,#24 + eor w17,w17,w5 + ushr v19.4s,v22.4s,#24 + eor w19,w19,w6 + sli v3.4s,v20.4s,#8 + eor w20,w20,w7 + sli v7.4s,v21.4s,#8 + eor w21,w21,w8 + sli v19.4s,v22.4s,#8 + ror w17,w17,#24 + add v2.4s,v2.4s,v3.4s + ror w19,w19,#24 + add v6.4s,v6.4s,v7.4s + ror w20,w20,#24 + add v18.4s,v18.4s,v19.4s + ror w21,w21,#24 + eor v20.16b,v1.16b,v2.16b + add w13,w13,w17 + eor v21.16b,v5.16b,v6.16b + add w14,w14,w19 + eor v22.16b,v17.16b,v18.16b + add w15,w15,w20 + ushr v1.4s,v20.4s,#25 + add w16,w16,w21 + ushr v5.4s,v21.4s,#25 + eor w9,w9,w13 + ushr v17.4s,v22.4s,#25 + eor w10,w10,w14 + sli v1.4s,v20.4s,#7 + eor w11,w11,w15 + sli v5.4s,v21.4s,#7 + eor w12,w12,w16 + sli v17.4s,v22.4s,#7 + ror w9,w9,#25 + ext v2.16b,v2.16b,v2.16b,#8 + ror w10,w10,#25 + ext v6.16b,v6.16b,v6.16b,#8 + ror w11,w11,#25 + ext v18.16b,v18.16b,v18.16b,#8 + ror w12,w12,#25 + ext v3.16b,v3.16b,v3.16b,#12 + ext v7.16b,v7.16b,v7.16b,#12 + ext v19.16b,v19.16b,v19.16b,#12 + ext v1.16b,v1.16b,v1.16b,#4 + ext v5.16b,v5.16b,v5.16b,#4 + ext v17.16b,v17.16b,v17.16b,#4 + add v0.4s,v0.4s,v1.4s + add w5,w5,w10 + add v4.4s,v4.4s,v5.4s + add w6,w6,w11 + add v16.4s,v16.4s,v17.4s + add w7,w7,w12 + eor v3.16b,v3.16b,v0.16b + add w8,w8,w9 + eor v7.16b,v7.16b,v4.16b + eor w21,w21,w5 + eor v19.16b,v19.16b,v16.16b + eor w17,w17,w6 + rev32 v3.8h,v3.8h + eor w19,w19,w7 + rev32 v7.8h,v7.8h + eor w20,w20,w8 + rev32 v19.8h,v19.8h + ror w21,w21,#16 + add v2.4s,v2.4s,v3.4s + ror w17,w17,#16 + add v6.4s,v6.4s,v7.4s + ror w19,w19,#16 + add v18.4s,v18.4s,v19.4s + ror w20,w20,#16 + eor v20.16b,v1.16b,v2.16b + add w15,w15,w21 + eor v21.16b,v5.16b,v6.16b + add w16,w16,w17 + eor v22.16b,v17.16b,v18.16b + add w13,w13,w19 + ushr v1.4s,v20.4s,#20 + add w14,w14,w20 + ushr v5.4s,v21.4s,#20 + eor w10,w10,w15 + ushr v17.4s,v22.4s,#20 + eor w11,w11,w16 + sli v1.4s,v20.4s,#12 + eor w12,w12,w13 + sli v5.4s,v21.4s,#12 + eor w9,w9,w14 + sli v17.4s,v22.4s,#12 + ror w10,w10,#20 + add v0.4s,v0.4s,v1.4s + ror w11,w11,#20 + add v4.4s,v4.4s,v5.4s + ror w12,w12,#20 + add v16.4s,v16.4s,v17.4s + ror w9,w9,#20 + eor v20.16b,v3.16b,v0.16b + add w5,w5,w10 + eor v21.16b,v7.16b,v4.16b + add w6,w6,w11 + eor v22.16b,v19.16b,v16.16b + add w7,w7,w12 + ushr v3.4s,v20.4s,#24 + add w8,w8,w9 + ushr v7.4s,v21.4s,#24 + eor w21,w21,w5 + ushr v19.4s,v22.4s,#24 + eor w17,w17,w6 + sli v3.4s,v20.4s,#8 + eor w19,w19,w7 + sli v7.4s,v21.4s,#8 + eor w20,w20,w8 + sli v19.4s,v22.4s,#8 + ror w21,w21,#24 + add v2.4s,v2.4s,v3.4s + ror w17,w17,#24 + add v6.4s,v6.4s,v7.4s + ror w19,w19,#24 + add v18.4s,v18.4s,v19.4s + ror w20,w20,#24 + eor v20.16b,v1.16b,v2.16b + add w15,w15,w21 + eor v21.16b,v5.16b,v6.16b + add w16,w16,w17 + eor v22.16b,v17.16b,v18.16b + add w13,w13,w19 + ushr v1.4s,v20.4s,#25 + add w14,w14,w20 + ushr v5.4s,v21.4s,#25 + eor w10,w10,w15 + ushr v17.4s,v22.4s,#25 + eor w11,w11,w16 + sli v1.4s,v20.4s,#7 + eor w12,w12,w13 + sli v5.4s,v21.4s,#7 + eor w9,w9,w14 + sli v17.4s,v22.4s,#7 + ror w10,w10,#25 + ext v2.16b,v2.16b,v2.16b,#8 + ror w11,w11,#25 + ext v6.16b,v6.16b,v6.16b,#8 + ror w12,w12,#25 + ext v18.16b,v18.16b,v18.16b,#8 + ror w9,w9,#25 + ext v3.16b,v3.16b,v3.16b,#4 + ext v7.16b,v7.16b,v7.16b,#4 + ext 
v19.16b,v19.16b,v19.16b,#4 + ext v1.16b,v1.16b,v1.16b,#12 + ext v5.16b,v5.16b,v5.16b,#12 + ext v17.16b,v17.16b,v17.16b,#12 + cbnz x4,Loop_neon + + add w5,w5,w22 // accumulate key block + add v0.4s,v0.4s,v24.4s + add x6,x6,x22,lsr#32 + add v4.4s,v4.4s,v24.4s + add w7,w7,w23 + add v16.4s,v16.4s,v24.4s + add x8,x8,x23,lsr#32 + add v2.4s,v2.4s,v26.4s + add w9,w9,w24 + add v6.4s,v6.4s,v26.4s + add x10,x10,x24,lsr#32 + add v18.4s,v18.4s,v26.4s + add w11,w11,w25 + add v3.4s,v3.4s,v27.4s + add x12,x12,x25,lsr#32 + add w13,w13,w26 + add v7.4s,v7.4s,v28.4s + add x14,x14,x26,lsr#32 + add w15,w15,w27 + add v19.4s,v19.4s,v29.4s + add x16,x16,x27,lsr#32 + add w17,w17,w28 + add v1.4s,v1.4s,v25.4s + add x19,x19,x28,lsr#32 + add w20,w20,w30 + add v5.4s,v5.4s,v25.4s + add x21,x21,x30,lsr#32 + add v17.4s,v17.4s,v25.4s + + b.lo Ltail_neon + + add x5,x5,x6,lsl#32 // pack + add x7,x7,x8,lsl#32 + ldp x6,x8,[x1,#0] // load input + add x9,x9,x10,lsl#32 + add x11,x11,x12,lsl#32 + ldp x10,x12,[x1,#16] + add x13,x13,x14,lsl#32 + add x15,x15,x16,lsl#32 + ldp x14,x16,[x1,#32] + add x17,x17,x19,lsl#32 + add x20,x20,x21,lsl#32 + ldp x19,x21,[x1,#48] + add x1,x1,#64 +#ifdef __AARCH64EB__ + rev x5,x5 + rev x7,x7 + rev x9,x9 + rev x11,x11 + rev x13,x13 + rev x15,x15 + rev x17,x17 + rev x20,x20 +#endif + ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 + eor x5,x5,x6 + eor x7,x7,x8 + eor x9,x9,x10 + eor x11,x11,x12 + eor x13,x13,x14 + eor v0.16b,v0.16b,v20.16b + eor x15,x15,x16 + eor v1.16b,v1.16b,v21.16b + eor x17,x17,x19 + eor v2.16b,v2.16b,v22.16b + eor x20,x20,x21 + eor v3.16b,v3.16b,v23.16b + ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 + + stp x5,x7,[x0,#0] // store output + add x28,x28,#4 // increment counter + stp x9,x11,[x0,#16] + add v27.4s,v27.4s,v31.4s // += 4 + stp x13,x15,[x0,#32] + add v28.4s,v28.4s,v31.4s + stp x17,x20,[x0,#48] + add v29.4s,v29.4s,v31.4s + add x0,x0,#64 + + st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 + ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 + + eor v4.16b,v4.16b,v20.16b + eor v5.16b,v5.16b,v21.16b + eor v6.16b,v6.16b,v22.16b + eor v7.16b,v7.16b,v23.16b + st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 + + eor v16.16b,v16.16b,v0.16b + eor v17.16b,v17.16b,v1.16b + eor v18.16b,v18.16b,v2.16b + eor v19.16b,v19.16b,v3.16b + st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64 + + b.hi Loop_outer_neon + + ldp x19,x20,[x29,#16] + add sp,sp,#64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +Ltail_neon: + add x2,x2,#256 + cmp x2,#64 + b.lo Less_than_64 + + add x5,x5,x6,lsl#32 // pack + add x7,x7,x8,lsl#32 + ldp x6,x8,[x1,#0] // load input + add x9,x9,x10,lsl#32 + add x11,x11,x12,lsl#32 + ldp x10,x12,[x1,#16] + add x13,x13,x14,lsl#32 + add x15,x15,x16,lsl#32 + ldp x14,x16,[x1,#32] + add x17,x17,x19,lsl#32 + add x20,x20,x21,lsl#32 + ldp x19,x21,[x1,#48] + add x1,x1,#64 +#ifdef __AARCH64EB__ + rev x5,x5 + rev x7,x7 + rev x9,x9 + rev x11,x11 + rev x13,x13 + rev x15,x15 + rev x17,x17 + rev x20,x20 +#endif + eor x5,x5,x6 + eor x7,x7,x8 + eor x9,x9,x10 + eor x11,x11,x12 + eor x13,x13,x14 + eor x15,x15,x16 + eor x17,x17,x19 + eor x20,x20,x21 + + stp x5,x7,[x0,#0] // store output + add x28,x28,#4 // increment counter + stp x9,x11,[x0,#16] + stp x13,x15,[x0,#32] + stp x17,x20,[x0,#48] + add x0,x0,#64 + b.eq Ldone_neon + sub x2,x2,#64 + cmp x2,#64 + b.lo Less_than_128 + + ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 + eor v0.16b,v0.16b,v20.16b + eor v1.16b,v1.16b,v21.16b + eor v2.16b,v2.16b,v22.16b + eor 
v3.16b,v3.16b,v23.16b + st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 + b.eq Ldone_neon + sub x2,x2,#64 + cmp x2,#64 + b.lo Less_than_192 + + ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 + eor v4.16b,v4.16b,v20.16b + eor v5.16b,v5.16b,v21.16b + eor v6.16b,v6.16b,v22.16b + eor v7.16b,v7.16b,v23.16b + st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 + b.eq Ldone_neon + sub x2,x2,#64 + + st1 {v16.16b,v17.16b,v18.16b,v19.16b},[sp] + b Last_neon + +Less_than_128: + st1 {v0.16b,v1.16b,v2.16b,v3.16b},[sp] + b Last_neon +Less_than_192: + st1 {v4.16b,v5.16b,v6.16b,v7.16b},[sp] + b Last_neon + +.align 4 +Last_neon: + sub x0,x0,#1 + add x1,x1,x2 + add x0,x0,x2 + add x4,sp,x2 + neg x2,x2 + +Loop_tail_neon: + ldrb w10,[x1,x2] + ldrb w11,[x4,x2] + add x2,x2,#1 + eor w10,w10,w11 + strb w10,[x0,x2] + cbnz x2,Loop_tail_neon + + stp xzr,xzr,[sp,#0] + stp xzr,xzr,[sp,#16] + stp xzr,xzr,[sp,#32] + stp xzr,xzr,[sp,#48] + +Ldone_neon: + ldp x19,x20,[x29,#16] + add sp,sp,#64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +.def ChaCha20_512_neon + .type 32 +.endef +.align 5 +ChaCha20_512_neon: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-96]! + add x29,sp,#0 + + adrp x5,Lsigma + add x5,x5,:lo12:Lsigma + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + +L512_or_more_neon: + sub sp,sp,#128+64 + + ldp x22,x23,[x5] // load sigma + ld1 {v24.4s},[x5],#16 + ldp x24,x25,[x3] // load key + ldp x26,x27,[x3,#16] + ld1 {v25.4s,v26.4s},[x3] + ldp x28,x30,[x4] // load counter + ld1 {v27.4s},[x4] + ld1 {v31.4s},[x5] +#ifdef __AARCH64EB__ + rev64 v24.4s,v24.4s + ror x24,x24,#32 + ror x25,x25,#32 + ror x26,x26,#32 + ror x27,x27,#32 + ror x28,x28,#32 + ror x30,x30,#32 +#endif + add v27.4s,v27.4s,v31.4s // += 1 + stp q24,q25,[sp,#0] // off-load key block, invariant part + add v27.4s,v27.4s,v31.4s // not typo + str q26,[sp,#32] + add v28.4s,v27.4s,v31.4s + add v29.4s,v28.4s,v31.4s + add v30.4s,v29.4s,v31.4s + shl v31.4s,v31.4s,#2 // 1 -> 4 + + stp d8,d9,[sp,#128+0] // meet ABI requirements + stp d10,d11,[sp,#128+16] + stp d12,d13,[sp,#128+32] + stp d14,d15,[sp,#128+48] + + sub x2,x2,#512 // not typo + +Loop_outer_512_neon: + mov v0.16b,v24.16b + mov v4.16b,v24.16b + mov v8.16b,v24.16b + mov v12.16b,v24.16b + mov v16.16b,v24.16b + mov v20.16b,v24.16b + mov v1.16b,v25.16b + mov w5,w22 // unpack key block + mov v5.16b,v25.16b + lsr x6,x22,#32 + mov v9.16b,v25.16b + mov w7,w23 + mov v13.16b,v25.16b + lsr x8,x23,#32 + mov v17.16b,v25.16b + mov w9,w24 + mov v21.16b,v25.16b + lsr x10,x24,#32 + mov v3.16b,v27.16b + mov w11,w25 + mov v7.16b,v28.16b + lsr x12,x25,#32 + mov v11.16b,v29.16b + mov w13,w26 + mov v15.16b,v30.16b + lsr x14,x26,#32 + mov v2.16b,v26.16b + mov w15,w27 + mov v6.16b,v26.16b + lsr x16,x27,#32 + add v19.4s,v3.4s,v31.4s // +4 + mov w17,w28 + add v23.4s,v7.4s,v31.4s // +4 + lsr x19,x28,#32 + mov v10.16b,v26.16b + mov w20,w30 + mov v14.16b,v26.16b + lsr x21,x30,#32 + mov v18.16b,v26.16b + stp q27,q28,[sp,#48] // off-load key block, variable part + mov v22.16b,v26.16b + str q29,[sp,#80] + + mov x4,#5 + subs x2,x2,#512 +Loop_upper_neon: + sub x4,x4,#1 + add v0.4s,v0.4s,v1.4s + add w5,w5,w9 + add v4.4s,v4.4s,v5.4s + add w6,w6,w10 + add v8.4s,v8.4s,v9.4s + add w7,w7,w11 + add v12.4s,v12.4s,v13.4s + add w8,w8,w12 + add v16.4s,v16.4s,v17.4s + eor w17,w17,w5 + add v20.4s,v20.4s,v21.4s + eor w19,w19,w6 + eor v3.16b,v3.16b,v0.16b + eor w20,w20,w7 + eor 
v7.16b,v7.16b,v4.16b + eor w21,w21,w8 + eor v11.16b,v11.16b,v8.16b + ror w17,w17,#16 + eor v15.16b,v15.16b,v12.16b + ror w19,w19,#16 + eor v19.16b,v19.16b,v16.16b + ror w20,w20,#16 + eor v23.16b,v23.16b,v20.16b + ror w21,w21,#16 + rev32 v3.8h,v3.8h + add w13,w13,w17 + rev32 v7.8h,v7.8h + add w14,w14,w19 + rev32 v11.8h,v11.8h + add w15,w15,w20 + rev32 v15.8h,v15.8h + add w16,w16,w21 + rev32 v19.8h,v19.8h + eor w9,w9,w13 + rev32 v23.8h,v23.8h + eor w10,w10,w14 + add v2.4s,v2.4s,v3.4s + eor w11,w11,w15 + add v6.4s,v6.4s,v7.4s + eor w12,w12,w16 + add v10.4s,v10.4s,v11.4s + ror w9,w9,#20 + add v14.4s,v14.4s,v15.4s + ror w10,w10,#20 + add v18.4s,v18.4s,v19.4s + ror w11,w11,#20 + add v22.4s,v22.4s,v23.4s + ror w12,w12,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w9 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w10 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w11 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w12 + eor v28.16b,v17.16b,v18.16b + eor w17,w17,w5 + eor v29.16b,v21.16b,v22.16b + eor w19,w19,w6 + ushr v1.4s,v24.4s,#20 + eor w20,w20,w7 + ushr v5.4s,v25.4s,#20 + eor w21,w21,w8 + ushr v9.4s,v26.4s,#20 + ror w17,w17,#24 + ushr v13.4s,v27.4s,#20 + ror w19,w19,#24 + ushr v17.4s,v28.4s,#20 + ror w20,w20,#24 + ushr v21.4s,v29.4s,#20 + ror w21,w21,#24 + sli v1.4s,v24.4s,#12 + add w13,w13,w17 + sli v5.4s,v25.4s,#12 + add w14,w14,w19 + sli v9.4s,v26.4s,#12 + add w15,w15,w20 + sli v13.4s,v27.4s,#12 + add w16,w16,w21 + sli v17.4s,v28.4s,#12 + eor w9,w9,w13 + sli v21.4s,v29.4s,#12 + eor w10,w10,w14 + add v0.4s,v0.4s,v1.4s + eor w11,w11,w15 + add v4.4s,v4.4s,v5.4s + eor w12,w12,w16 + add v8.4s,v8.4s,v9.4s + ror w9,w9,#25 + add v12.4s,v12.4s,v13.4s + ror w10,w10,#25 + add v16.4s,v16.4s,v17.4s + ror w11,w11,#25 + add v20.4s,v20.4s,v21.4s + ror w12,w12,#25 + eor v24.16b,v3.16b,v0.16b + add w5,w5,w10 + eor v25.16b,v7.16b,v4.16b + add w6,w6,w11 + eor v26.16b,v11.16b,v8.16b + add w7,w7,w12 + eor v27.16b,v15.16b,v12.16b + add w8,w8,w9 + eor v28.16b,v19.16b,v16.16b + eor w21,w21,w5 + eor v29.16b,v23.16b,v20.16b + eor w17,w17,w6 + ushr v3.4s,v24.4s,#24 + eor w19,w19,w7 + ushr v7.4s,v25.4s,#24 + eor w20,w20,w8 + ushr v11.4s,v26.4s,#24 + ror w21,w21,#16 + ushr v15.4s,v27.4s,#24 + ror w17,w17,#16 + ushr v19.4s,v28.4s,#24 + ror w19,w19,#16 + ushr v23.4s,v29.4s,#24 + ror w20,w20,#16 + sli v3.4s,v24.4s,#8 + add w15,w15,w21 + sli v7.4s,v25.4s,#8 + add w16,w16,w17 + sli v11.4s,v26.4s,#8 + add w13,w13,w19 + sli v15.4s,v27.4s,#8 + add w14,w14,w20 + sli v19.4s,v28.4s,#8 + eor w10,w10,w15 + sli v23.4s,v29.4s,#8 + eor w11,w11,w16 + add v2.4s,v2.4s,v3.4s + eor w12,w12,w13 + add v6.4s,v6.4s,v7.4s + eor w9,w9,w14 + add v10.4s,v10.4s,v11.4s + ror w10,w10,#20 + add v14.4s,v14.4s,v15.4s + ror w11,w11,#20 + add v18.4s,v18.4s,v19.4s + ror w12,w12,#20 + add v22.4s,v22.4s,v23.4s + ror w9,w9,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w10 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w11 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w12 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w9 + eor v28.16b,v17.16b,v18.16b + eor w21,w21,w5 + eor v29.16b,v21.16b,v22.16b + eor w17,w17,w6 + ushr v1.4s,v24.4s,#25 + eor w19,w19,w7 + ushr v5.4s,v25.4s,#25 + eor w20,w20,w8 + ushr v9.4s,v26.4s,#25 + ror w21,w21,#24 + ushr v13.4s,v27.4s,#25 + ror w17,w17,#24 + ushr v17.4s,v28.4s,#25 + ror w19,w19,#24 + ushr v21.4s,v29.4s,#25 + ror w20,w20,#24 + sli v1.4s,v24.4s,#7 + add w15,w15,w21 + sli v5.4s,v25.4s,#7 + add w16,w16,w17 + sli v9.4s,v26.4s,#7 + add w13,w13,w19 + sli v13.4s,v27.4s,#7 + add w14,w14,w20 + sli v17.4s,v28.4s,#7 + eor w10,w10,w15 + sli v21.4s,v29.4s,#7 + eor w11,w11,w16 
+ ext v2.16b,v2.16b,v2.16b,#8 + eor w12,w12,w13 + ext v6.16b,v6.16b,v6.16b,#8 + eor w9,w9,w14 + ext v10.16b,v10.16b,v10.16b,#8 + ror w10,w10,#25 + ext v14.16b,v14.16b,v14.16b,#8 + ror w11,w11,#25 + ext v18.16b,v18.16b,v18.16b,#8 + ror w12,w12,#25 + ext v22.16b,v22.16b,v22.16b,#8 + ror w9,w9,#25 + ext v3.16b,v3.16b,v3.16b,#12 + ext v7.16b,v7.16b,v7.16b,#12 + ext v11.16b,v11.16b,v11.16b,#12 + ext v15.16b,v15.16b,v15.16b,#12 + ext v19.16b,v19.16b,v19.16b,#12 + ext v23.16b,v23.16b,v23.16b,#12 + ext v1.16b,v1.16b,v1.16b,#4 + ext v5.16b,v5.16b,v5.16b,#4 + ext v9.16b,v9.16b,v9.16b,#4 + ext v13.16b,v13.16b,v13.16b,#4 + ext v17.16b,v17.16b,v17.16b,#4 + ext v21.16b,v21.16b,v21.16b,#4 + add v0.4s,v0.4s,v1.4s + add w5,w5,w9 + add v4.4s,v4.4s,v5.4s + add w6,w6,w10 + add v8.4s,v8.4s,v9.4s + add w7,w7,w11 + add v12.4s,v12.4s,v13.4s + add w8,w8,w12 + add v16.4s,v16.4s,v17.4s + eor w17,w17,w5 + add v20.4s,v20.4s,v21.4s + eor w19,w19,w6 + eor v3.16b,v3.16b,v0.16b + eor w20,w20,w7 + eor v7.16b,v7.16b,v4.16b + eor w21,w21,w8 + eor v11.16b,v11.16b,v8.16b + ror w17,w17,#16 + eor v15.16b,v15.16b,v12.16b + ror w19,w19,#16 + eor v19.16b,v19.16b,v16.16b + ror w20,w20,#16 + eor v23.16b,v23.16b,v20.16b + ror w21,w21,#16 + rev32 v3.8h,v3.8h + add w13,w13,w17 + rev32 v7.8h,v7.8h + add w14,w14,w19 + rev32 v11.8h,v11.8h + add w15,w15,w20 + rev32 v15.8h,v15.8h + add w16,w16,w21 + rev32 v19.8h,v19.8h + eor w9,w9,w13 + rev32 v23.8h,v23.8h + eor w10,w10,w14 + add v2.4s,v2.4s,v3.4s + eor w11,w11,w15 + add v6.4s,v6.4s,v7.4s + eor w12,w12,w16 + add v10.4s,v10.4s,v11.4s + ror w9,w9,#20 + add v14.4s,v14.4s,v15.4s + ror w10,w10,#20 + add v18.4s,v18.4s,v19.4s + ror w11,w11,#20 + add v22.4s,v22.4s,v23.4s + ror w12,w12,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w9 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w10 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w11 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w12 + eor v28.16b,v17.16b,v18.16b + eor w17,w17,w5 + eor v29.16b,v21.16b,v22.16b + eor w19,w19,w6 + ushr v1.4s,v24.4s,#20 + eor w20,w20,w7 + ushr v5.4s,v25.4s,#20 + eor w21,w21,w8 + ushr v9.4s,v26.4s,#20 + ror w17,w17,#24 + ushr v13.4s,v27.4s,#20 + ror w19,w19,#24 + ushr v17.4s,v28.4s,#20 + ror w20,w20,#24 + ushr v21.4s,v29.4s,#20 + ror w21,w21,#24 + sli v1.4s,v24.4s,#12 + add w13,w13,w17 + sli v5.4s,v25.4s,#12 + add w14,w14,w19 + sli v9.4s,v26.4s,#12 + add w15,w15,w20 + sli v13.4s,v27.4s,#12 + add w16,w16,w21 + sli v17.4s,v28.4s,#12 + eor w9,w9,w13 + sli v21.4s,v29.4s,#12 + eor w10,w10,w14 + add v0.4s,v0.4s,v1.4s + eor w11,w11,w15 + add v4.4s,v4.4s,v5.4s + eor w12,w12,w16 + add v8.4s,v8.4s,v9.4s + ror w9,w9,#25 + add v12.4s,v12.4s,v13.4s + ror w10,w10,#25 + add v16.4s,v16.4s,v17.4s + ror w11,w11,#25 + add v20.4s,v20.4s,v21.4s + ror w12,w12,#25 + eor v24.16b,v3.16b,v0.16b + add w5,w5,w10 + eor v25.16b,v7.16b,v4.16b + add w6,w6,w11 + eor v26.16b,v11.16b,v8.16b + add w7,w7,w12 + eor v27.16b,v15.16b,v12.16b + add w8,w8,w9 + eor v28.16b,v19.16b,v16.16b + eor w21,w21,w5 + eor v29.16b,v23.16b,v20.16b + eor w17,w17,w6 + ushr v3.4s,v24.4s,#24 + eor w19,w19,w7 + ushr v7.4s,v25.4s,#24 + eor w20,w20,w8 + ushr v11.4s,v26.4s,#24 + ror w21,w21,#16 + ushr v15.4s,v27.4s,#24 + ror w17,w17,#16 + ushr v19.4s,v28.4s,#24 + ror w19,w19,#16 + ushr v23.4s,v29.4s,#24 + ror w20,w20,#16 + sli v3.4s,v24.4s,#8 + add w15,w15,w21 + sli v7.4s,v25.4s,#8 + add w16,w16,w17 + sli v11.4s,v26.4s,#8 + add w13,w13,w19 + sli v15.4s,v27.4s,#8 + add w14,w14,w20 + sli v19.4s,v28.4s,#8 + eor w10,w10,w15 + sli v23.4s,v29.4s,#8 + eor w11,w11,w16 + add v2.4s,v2.4s,v3.4s + eor w12,w12,w13 + 
add v6.4s,v6.4s,v7.4s + eor w9,w9,w14 + add v10.4s,v10.4s,v11.4s + ror w10,w10,#20 + add v14.4s,v14.4s,v15.4s + ror w11,w11,#20 + add v18.4s,v18.4s,v19.4s + ror w12,w12,#20 + add v22.4s,v22.4s,v23.4s + ror w9,w9,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w10 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w11 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w12 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w9 + eor v28.16b,v17.16b,v18.16b + eor w21,w21,w5 + eor v29.16b,v21.16b,v22.16b + eor w17,w17,w6 + ushr v1.4s,v24.4s,#25 + eor w19,w19,w7 + ushr v5.4s,v25.4s,#25 + eor w20,w20,w8 + ushr v9.4s,v26.4s,#25 + ror w21,w21,#24 + ushr v13.4s,v27.4s,#25 + ror w17,w17,#24 + ushr v17.4s,v28.4s,#25 + ror w19,w19,#24 + ushr v21.4s,v29.4s,#25 + ror w20,w20,#24 + sli v1.4s,v24.4s,#7 + add w15,w15,w21 + sli v5.4s,v25.4s,#7 + add w16,w16,w17 + sli v9.4s,v26.4s,#7 + add w13,w13,w19 + sli v13.4s,v27.4s,#7 + add w14,w14,w20 + sli v17.4s,v28.4s,#7 + eor w10,w10,w15 + sli v21.4s,v29.4s,#7 + eor w11,w11,w16 + ext v2.16b,v2.16b,v2.16b,#8 + eor w12,w12,w13 + ext v6.16b,v6.16b,v6.16b,#8 + eor w9,w9,w14 + ext v10.16b,v10.16b,v10.16b,#8 + ror w10,w10,#25 + ext v14.16b,v14.16b,v14.16b,#8 + ror w11,w11,#25 + ext v18.16b,v18.16b,v18.16b,#8 + ror w12,w12,#25 + ext v22.16b,v22.16b,v22.16b,#8 + ror w9,w9,#25 + ext v3.16b,v3.16b,v3.16b,#4 + ext v7.16b,v7.16b,v7.16b,#4 + ext v11.16b,v11.16b,v11.16b,#4 + ext v15.16b,v15.16b,v15.16b,#4 + ext v19.16b,v19.16b,v19.16b,#4 + ext v23.16b,v23.16b,v23.16b,#4 + ext v1.16b,v1.16b,v1.16b,#12 + ext v5.16b,v5.16b,v5.16b,#12 + ext v9.16b,v9.16b,v9.16b,#12 + ext v13.16b,v13.16b,v13.16b,#12 + ext v17.16b,v17.16b,v17.16b,#12 + ext v21.16b,v21.16b,v21.16b,#12 + cbnz x4,Loop_upper_neon + + add w5,w5,w22 // accumulate key block + add x6,x6,x22,lsr#32 + add w7,w7,w23 + add x8,x8,x23,lsr#32 + add w9,w9,w24 + add x10,x10,x24,lsr#32 + add w11,w11,w25 + add x12,x12,x25,lsr#32 + add w13,w13,w26 + add x14,x14,x26,lsr#32 + add w15,w15,w27 + add x16,x16,x27,lsr#32 + add w17,w17,w28 + add x19,x19,x28,lsr#32 + add w20,w20,w30 + add x21,x21,x30,lsr#32 + + add x5,x5,x6,lsl#32 // pack + add x7,x7,x8,lsl#32 + ldp x6,x8,[x1,#0] // load input + add x9,x9,x10,lsl#32 + add x11,x11,x12,lsl#32 + ldp x10,x12,[x1,#16] + add x13,x13,x14,lsl#32 + add x15,x15,x16,lsl#32 + ldp x14,x16,[x1,#32] + add x17,x17,x19,lsl#32 + add x20,x20,x21,lsl#32 + ldp x19,x21,[x1,#48] + add x1,x1,#64 +#ifdef __AARCH64EB__ + rev x5,x5 + rev x7,x7 + rev x9,x9 + rev x11,x11 + rev x13,x13 + rev x15,x15 + rev x17,x17 + rev x20,x20 +#endif + eor x5,x5,x6 + eor x7,x7,x8 + eor x9,x9,x10 + eor x11,x11,x12 + eor x13,x13,x14 + eor x15,x15,x16 + eor x17,x17,x19 + eor x20,x20,x21 + + stp x5,x7,[x0,#0] // store output + add x28,x28,#1 // increment counter + mov w5,w22 // unpack key block + lsr x6,x22,#32 + stp x9,x11,[x0,#16] + mov w7,w23 + lsr x8,x23,#32 + stp x13,x15,[x0,#32] + mov w9,w24 + lsr x10,x24,#32 + stp x17,x20,[x0,#48] + add x0,x0,#64 + mov w11,w25 + lsr x12,x25,#32 + mov w13,w26 + lsr x14,x26,#32 + mov w15,w27 + lsr x16,x27,#32 + mov w17,w28 + lsr x19,x28,#32 + mov w20,w30 + lsr x21,x30,#32 + + mov x4,#5 +Loop_lower_neon: + sub x4,x4,#1 + add v0.4s,v0.4s,v1.4s + add w5,w5,w9 + add v4.4s,v4.4s,v5.4s + add w6,w6,w10 + add v8.4s,v8.4s,v9.4s + add w7,w7,w11 + add v12.4s,v12.4s,v13.4s + add w8,w8,w12 + add v16.4s,v16.4s,v17.4s + eor w17,w17,w5 + add v20.4s,v20.4s,v21.4s + eor w19,w19,w6 + eor v3.16b,v3.16b,v0.16b + eor w20,w20,w7 + eor v7.16b,v7.16b,v4.16b + eor w21,w21,w8 + eor v11.16b,v11.16b,v8.16b + ror w17,w17,#16 + eor v15.16b,v15.16b,v12.16b + ror 
w19,w19,#16 + eor v19.16b,v19.16b,v16.16b + ror w20,w20,#16 + eor v23.16b,v23.16b,v20.16b + ror w21,w21,#16 + rev32 v3.8h,v3.8h + add w13,w13,w17 + rev32 v7.8h,v7.8h + add w14,w14,w19 + rev32 v11.8h,v11.8h + add w15,w15,w20 + rev32 v15.8h,v15.8h + add w16,w16,w21 + rev32 v19.8h,v19.8h + eor w9,w9,w13 + rev32 v23.8h,v23.8h + eor w10,w10,w14 + add v2.4s,v2.4s,v3.4s + eor w11,w11,w15 + add v6.4s,v6.4s,v7.4s + eor w12,w12,w16 + add v10.4s,v10.4s,v11.4s + ror w9,w9,#20 + add v14.4s,v14.4s,v15.4s + ror w10,w10,#20 + add v18.4s,v18.4s,v19.4s + ror w11,w11,#20 + add v22.4s,v22.4s,v23.4s + ror w12,w12,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w9 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w10 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w11 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w12 + eor v28.16b,v17.16b,v18.16b + eor w17,w17,w5 + eor v29.16b,v21.16b,v22.16b + eor w19,w19,w6 + ushr v1.4s,v24.4s,#20 + eor w20,w20,w7 + ushr v5.4s,v25.4s,#20 + eor w21,w21,w8 + ushr v9.4s,v26.4s,#20 + ror w17,w17,#24 + ushr v13.4s,v27.4s,#20 + ror w19,w19,#24 + ushr v17.4s,v28.4s,#20 + ror w20,w20,#24 + ushr v21.4s,v29.4s,#20 + ror w21,w21,#24 + sli v1.4s,v24.4s,#12 + add w13,w13,w17 + sli v5.4s,v25.4s,#12 + add w14,w14,w19 + sli v9.4s,v26.4s,#12 + add w15,w15,w20 + sli v13.4s,v27.4s,#12 + add w16,w16,w21 + sli v17.4s,v28.4s,#12 + eor w9,w9,w13 + sli v21.4s,v29.4s,#12 + eor w10,w10,w14 + add v0.4s,v0.4s,v1.4s + eor w11,w11,w15 + add v4.4s,v4.4s,v5.4s + eor w12,w12,w16 + add v8.4s,v8.4s,v9.4s + ror w9,w9,#25 + add v12.4s,v12.4s,v13.4s + ror w10,w10,#25 + add v16.4s,v16.4s,v17.4s + ror w11,w11,#25 + add v20.4s,v20.4s,v21.4s + ror w12,w12,#25 + eor v24.16b,v3.16b,v0.16b + add w5,w5,w10 + eor v25.16b,v7.16b,v4.16b + add w6,w6,w11 + eor v26.16b,v11.16b,v8.16b + add w7,w7,w12 + eor v27.16b,v15.16b,v12.16b + add w8,w8,w9 + eor v28.16b,v19.16b,v16.16b + eor w21,w21,w5 + eor v29.16b,v23.16b,v20.16b + eor w17,w17,w6 + ushr v3.4s,v24.4s,#24 + eor w19,w19,w7 + ushr v7.4s,v25.4s,#24 + eor w20,w20,w8 + ushr v11.4s,v26.4s,#24 + ror w21,w21,#16 + ushr v15.4s,v27.4s,#24 + ror w17,w17,#16 + ushr v19.4s,v28.4s,#24 + ror w19,w19,#16 + ushr v23.4s,v29.4s,#24 + ror w20,w20,#16 + sli v3.4s,v24.4s,#8 + add w15,w15,w21 + sli v7.4s,v25.4s,#8 + add w16,w16,w17 + sli v11.4s,v26.4s,#8 + add w13,w13,w19 + sli v15.4s,v27.4s,#8 + add w14,w14,w20 + sli v19.4s,v28.4s,#8 + eor w10,w10,w15 + sli v23.4s,v29.4s,#8 + eor w11,w11,w16 + add v2.4s,v2.4s,v3.4s + eor w12,w12,w13 + add v6.4s,v6.4s,v7.4s + eor w9,w9,w14 + add v10.4s,v10.4s,v11.4s + ror w10,w10,#20 + add v14.4s,v14.4s,v15.4s + ror w11,w11,#20 + add v18.4s,v18.4s,v19.4s + ror w12,w12,#20 + add v22.4s,v22.4s,v23.4s + ror w9,w9,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w10 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w11 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w12 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w9 + eor v28.16b,v17.16b,v18.16b + eor w21,w21,w5 + eor v29.16b,v21.16b,v22.16b + eor w17,w17,w6 + ushr v1.4s,v24.4s,#25 + eor w19,w19,w7 + ushr v5.4s,v25.4s,#25 + eor w20,w20,w8 + ushr v9.4s,v26.4s,#25 + ror w21,w21,#24 + ushr v13.4s,v27.4s,#25 + ror w17,w17,#24 + ushr v17.4s,v28.4s,#25 + ror w19,w19,#24 + ushr v21.4s,v29.4s,#25 + ror w20,w20,#24 + sli v1.4s,v24.4s,#7 + add w15,w15,w21 + sli v5.4s,v25.4s,#7 + add w16,w16,w17 + sli v9.4s,v26.4s,#7 + add w13,w13,w19 + sli v13.4s,v27.4s,#7 + add w14,w14,w20 + sli v17.4s,v28.4s,#7 + eor w10,w10,w15 + sli v21.4s,v29.4s,#7 + eor w11,w11,w16 + ext v2.16b,v2.16b,v2.16b,#8 + eor w12,w12,w13 + ext v6.16b,v6.16b,v6.16b,#8 + eor w9,w9,w14 + ext 
v10.16b,v10.16b,v10.16b,#8 + ror w10,w10,#25 + ext v14.16b,v14.16b,v14.16b,#8 + ror w11,w11,#25 + ext v18.16b,v18.16b,v18.16b,#8 + ror w12,w12,#25 + ext v22.16b,v22.16b,v22.16b,#8 + ror w9,w9,#25 + ext v3.16b,v3.16b,v3.16b,#12 + ext v7.16b,v7.16b,v7.16b,#12 + ext v11.16b,v11.16b,v11.16b,#12 + ext v15.16b,v15.16b,v15.16b,#12 + ext v19.16b,v19.16b,v19.16b,#12 + ext v23.16b,v23.16b,v23.16b,#12 + ext v1.16b,v1.16b,v1.16b,#4 + ext v5.16b,v5.16b,v5.16b,#4 + ext v9.16b,v9.16b,v9.16b,#4 + ext v13.16b,v13.16b,v13.16b,#4 + ext v17.16b,v17.16b,v17.16b,#4 + ext v21.16b,v21.16b,v21.16b,#4 + add v0.4s,v0.4s,v1.4s + add w5,w5,w9 + add v4.4s,v4.4s,v5.4s + add w6,w6,w10 + add v8.4s,v8.4s,v9.4s + add w7,w7,w11 + add v12.4s,v12.4s,v13.4s + add w8,w8,w12 + add v16.4s,v16.4s,v17.4s + eor w17,w17,w5 + add v20.4s,v20.4s,v21.4s + eor w19,w19,w6 + eor v3.16b,v3.16b,v0.16b + eor w20,w20,w7 + eor v7.16b,v7.16b,v4.16b + eor w21,w21,w8 + eor v11.16b,v11.16b,v8.16b + ror w17,w17,#16 + eor v15.16b,v15.16b,v12.16b + ror w19,w19,#16 + eor v19.16b,v19.16b,v16.16b + ror w20,w20,#16 + eor v23.16b,v23.16b,v20.16b + ror w21,w21,#16 + rev32 v3.8h,v3.8h + add w13,w13,w17 + rev32 v7.8h,v7.8h + add w14,w14,w19 + rev32 v11.8h,v11.8h + add w15,w15,w20 + rev32 v15.8h,v15.8h + add w16,w16,w21 + rev32 v19.8h,v19.8h + eor w9,w9,w13 + rev32 v23.8h,v23.8h + eor w10,w10,w14 + add v2.4s,v2.4s,v3.4s + eor w11,w11,w15 + add v6.4s,v6.4s,v7.4s + eor w12,w12,w16 + add v10.4s,v10.4s,v11.4s + ror w9,w9,#20 + add v14.4s,v14.4s,v15.4s + ror w10,w10,#20 + add v18.4s,v18.4s,v19.4s + ror w11,w11,#20 + add v22.4s,v22.4s,v23.4s + ror w12,w12,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w9 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w10 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w11 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w12 + eor v28.16b,v17.16b,v18.16b + eor w17,w17,w5 + eor v29.16b,v21.16b,v22.16b + eor w19,w19,w6 + ushr v1.4s,v24.4s,#20 + eor w20,w20,w7 + ushr v5.4s,v25.4s,#20 + eor w21,w21,w8 + ushr v9.4s,v26.4s,#20 + ror w17,w17,#24 + ushr v13.4s,v27.4s,#20 + ror w19,w19,#24 + ushr v17.4s,v28.4s,#20 + ror w20,w20,#24 + ushr v21.4s,v29.4s,#20 + ror w21,w21,#24 + sli v1.4s,v24.4s,#12 + add w13,w13,w17 + sli v5.4s,v25.4s,#12 + add w14,w14,w19 + sli v9.4s,v26.4s,#12 + add w15,w15,w20 + sli v13.4s,v27.4s,#12 + add w16,w16,w21 + sli v17.4s,v28.4s,#12 + eor w9,w9,w13 + sli v21.4s,v29.4s,#12 + eor w10,w10,w14 + add v0.4s,v0.4s,v1.4s + eor w11,w11,w15 + add v4.4s,v4.4s,v5.4s + eor w12,w12,w16 + add v8.4s,v8.4s,v9.4s + ror w9,w9,#25 + add v12.4s,v12.4s,v13.4s + ror w10,w10,#25 + add v16.4s,v16.4s,v17.4s + ror w11,w11,#25 + add v20.4s,v20.4s,v21.4s + ror w12,w12,#25 + eor v24.16b,v3.16b,v0.16b + add w5,w5,w10 + eor v25.16b,v7.16b,v4.16b + add w6,w6,w11 + eor v26.16b,v11.16b,v8.16b + add w7,w7,w12 + eor v27.16b,v15.16b,v12.16b + add w8,w8,w9 + eor v28.16b,v19.16b,v16.16b + eor w21,w21,w5 + eor v29.16b,v23.16b,v20.16b + eor w17,w17,w6 + ushr v3.4s,v24.4s,#24 + eor w19,w19,w7 + ushr v7.4s,v25.4s,#24 + eor w20,w20,w8 + ushr v11.4s,v26.4s,#24 + ror w21,w21,#16 + ushr v15.4s,v27.4s,#24 + ror w17,w17,#16 + ushr v19.4s,v28.4s,#24 + ror w19,w19,#16 + ushr v23.4s,v29.4s,#24 + ror w20,w20,#16 + sli v3.4s,v24.4s,#8 + add w15,w15,w21 + sli v7.4s,v25.4s,#8 + add w16,w16,w17 + sli v11.4s,v26.4s,#8 + add w13,w13,w19 + sli v15.4s,v27.4s,#8 + add w14,w14,w20 + sli v19.4s,v28.4s,#8 + eor w10,w10,w15 + sli v23.4s,v29.4s,#8 + eor w11,w11,w16 + add v2.4s,v2.4s,v3.4s + eor w12,w12,w13 + add v6.4s,v6.4s,v7.4s + eor w9,w9,w14 + add v10.4s,v10.4s,v11.4s + ror w10,w10,#20 + add 
v14.4s,v14.4s,v15.4s + ror w11,w11,#20 + add v18.4s,v18.4s,v19.4s + ror w12,w12,#20 + add v22.4s,v22.4s,v23.4s + ror w9,w9,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w10 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w11 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w12 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w9 + eor v28.16b,v17.16b,v18.16b + eor w21,w21,w5 + eor v29.16b,v21.16b,v22.16b + eor w17,w17,w6 + ushr v1.4s,v24.4s,#25 + eor w19,w19,w7 + ushr v5.4s,v25.4s,#25 + eor w20,w20,w8 + ushr v9.4s,v26.4s,#25 + ror w21,w21,#24 + ushr v13.4s,v27.4s,#25 + ror w17,w17,#24 + ushr v17.4s,v28.4s,#25 + ror w19,w19,#24 + ushr v21.4s,v29.4s,#25 + ror w20,w20,#24 + sli v1.4s,v24.4s,#7 + add w15,w15,w21 + sli v5.4s,v25.4s,#7 + add w16,w16,w17 + sli v9.4s,v26.4s,#7 + add w13,w13,w19 + sli v13.4s,v27.4s,#7 + add w14,w14,w20 + sli v17.4s,v28.4s,#7 + eor w10,w10,w15 + sli v21.4s,v29.4s,#7 + eor w11,w11,w16 + ext v2.16b,v2.16b,v2.16b,#8 + eor w12,w12,w13 + ext v6.16b,v6.16b,v6.16b,#8 + eor w9,w9,w14 + ext v10.16b,v10.16b,v10.16b,#8 + ror w10,w10,#25 + ext v14.16b,v14.16b,v14.16b,#8 + ror w11,w11,#25 + ext v18.16b,v18.16b,v18.16b,#8 + ror w12,w12,#25 + ext v22.16b,v22.16b,v22.16b,#8 + ror w9,w9,#25 + ext v3.16b,v3.16b,v3.16b,#4 + ext v7.16b,v7.16b,v7.16b,#4 + ext v11.16b,v11.16b,v11.16b,#4 + ext v15.16b,v15.16b,v15.16b,#4 + ext v19.16b,v19.16b,v19.16b,#4 + ext v23.16b,v23.16b,v23.16b,#4 + ext v1.16b,v1.16b,v1.16b,#12 + ext v5.16b,v5.16b,v5.16b,#12 + ext v9.16b,v9.16b,v9.16b,#12 + ext v13.16b,v13.16b,v13.16b,#12 + ext v17.16b,v17.16b,v17.16b,#12 + ext v21.16b,v21.16b,v21.16b,#12 + cbnz x4,Loop_lower_neon + + add w5,w5,w22 // accumulate key block + ldp q24,q25,[sp,#0] + add x6,x6,x22,lsr#32 + ldp q26,q27,[sp,#32] + add w7,w7,w23 + ldp q28,q29,[sp,#64] + add x8,x8,x23,lsr#32 + add v0.4s,v0.4s,v24.4s + add w9,w9,w24 + add v4.4s,v4.4s,v24.4s + add x10,x10,x24,lsr#32 + add v8.4s,v8.4s,v24.4s + add w11,w11,w25 + add v12.4s,v12.4s,v24.4s + add x12,x12,x25,lsr#32 + add v16.4s,v16.4s,v24.4s + add w13,w13,w26 + add v20.4s,v20.4s,v24.4s + add x14,x14,x26,lsr#32 + add v2.4s,v2.4s,v26.4s + add w15,w15,w27 + add v6.4s,v6.4s,v26.4s + add x16,x16,x27,lsr#32 + add v10.4s,v10.4s,v26.4s + add w17,w17,w28 + add v14.4s,v14.4s,v26.4s + add x19,x19,x28,lsr#32 + add v18.4s,v18.4s,v26.4s + add w20,w20,w30 + add v22.4s,v22.4s,v26.4s + add x21,x21,x30,lsr#32 + add v19.4s,v19.4s,v31.4s // +4 + add x5,x5,x6,lsl#32 // pack + add v23.4s,v23.4s,v31.4s // +4 + add x7,x7,x8,lsl#32 + add v3.4s,v3.4s,v27.4s + ldp x6,x8,[x1,#0] // load input + add v7.4s,v7.4s,v28.4s + add x9,x9,x10,lsl#32 + add v11.4s,v11.4s,v29.4s + add x11,x11,x12,lsl#32 + add v15.4s,v15.4s,v30.4s + ldp x10,x12,[x1,#16] + add v19.4s,v19.4s,v27.4s + add x13,x13,x14,lsl#32 + add v23.4s,v23.4s,v28.4s + add x15,x15,x16,lsl#32 + add v1.4s,v1.4s,v25.4s + ldp x14,x16,[x1,#32] + add v5.4s,v5.4s,v25.4s + add x17,x17,x19,lsl#32 + add v9.4s,v9.4s,v25.4s + add x20,x20,x21,lsl#32 + add v13.4s,v13.4s,v25.4s + ldp x19,x21,[x1,#48] + add v17.4s,v17.4s,v25.4s + add x1,x1,#64 + add v21.4s,v21.4s,v25.4s + +#ifdef __AARCH64EB__ + rev x5,x5 + rev x7,x7 + rev x9,x9 + rev x11,x11 + rev x13,x13 + rev x15,x15 + rev x17,x17 + rev x20,x20 +#endif + ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 + eor x5,x5,x6 + eor x7,x7,x8 + eor x9,x9,x10 + eor x11,x11,x12 + eor x13,x13,x14 + eor v0.16b,v0.16b,v24.16b + eor x15,x15,x16 + eor v1.16b,v1.16b,v25.16b + eor x17,x17,x19 + eor v2.16b,v2.16b,v26.16b + eor x20,x20,x21 + eor v3.16b,v3.16b,v27.16b + ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 + + stp x5,x7,[x0,#0] // 
store output + add x28,x28,#7 // increment counter + stp x9,x11,[x0,#16] + stp x13,x15,[x0,#32] + stp x17,x20,[x0,#48] + add x0,x0,#64 + st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 + + ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 + eor v4.16b,v4.16b,v24.16b + eor v5.16b,v5.16b,v25.16b + eor v6.16b,v6.16b,v26.16b + eor v7.16b,v7.16b,v27.16b + st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 + + ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 + eor v8.16b,v8.16b,v0.16b + ldp q24,q25,[sp,#0] + eor v9.16b,v9.16b,v1.16b + ldp q26,q27,[sp,#32] + eor v10.16b,v10.16b,v2.16b + eor v11.16b,v11.16b,v3.16b + st1 {v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64 + + ld1 {v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64 + eor v12.16b,v12.16b,v4.16b + eor v13.16b,v13.16b,v5.16b + eor v14.16b,v14.16b,v6.16b + eor v15.16b,v15.16b,v7.16b + st1 {v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64 + + ld1 {v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64 + eor v16.16b,v16.16b,v8.16b + eor v17.16b,v17.16b,v9.16b + eor v18.16b,v18.16b,v10.16b + eor v19.16b,v19.16b,v11.16b + st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64 + + shl v0.4s,v31.4s,#1 // 4 -> 8 + eor v20.16b,v20.16b,v12.16b + eor v21.16b,v21.16b,v13.16b + eor v22.16b,v22.16b,v14.16b + eor v23.16b,v23.16b,v15.16b + st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64 + + add v27.4s,v27.4s,v0.4s // += 8 + add v28.4s,v28.4s,v0.4s + add v29.4s,v29.4s,v0.4s + add v30.4s,v30.4s,v0.4s + + b.hs Loop_outer_512_neon + + adds x2,x2,#512 + ushr v0.4s,v31.4s,#2 // 4 -> 1 + + ldp d8,d9,[sp,#128+0] // meet ABI requirements + ldp d10,d11,[sp,#128+16] + ldp d12,d13,[sp,#128+32] + ldp d14,d15,[sp,#128+48] + + stp q24,q31,[sp,#0] // wipe off-load area + stp q24,q31,[sp,#32] + stp q24,q31,[sp,#64] + + b.eq Ldone_512_neon + + cmp x2,#192 + sub v27.4s,v27.4s,v0.4s // -= 1 + sub v28.4s,v28.4s,v0.4s + sub v29.4s,v29.4s,v0.4s + add sp,sp,#128 + b.hs Loop_outer_neon + + eor v25.16b,v25.16b,v25.16b + eor v26.16b,v26.16b,v26.16b + eor v27.16b,v27.16b,v27.16b + eor v28.16b,v28.16b,v28.16b + eor v29.16b,v29.16b,v29.16b + eor v30.16b,v30.16b,v30.16b + b Loop_outer + +Ldone_512_neon: + ldp x19,x20,[x29,#16] + add sp,sp,#128+64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32) diff --git a/ring-0.17.14/pregenerated/chacha-x86-elf.S b/ring-0.17.14/pregenerated/chacha-x86-elf.S new file mode 100644 index 0000000000..b8b6b63c17 --- /dev/null +++ b/ring-0.17.14/pregenerated/chacha-x86-elf.S @@ -0,0 +1,601 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__) +.text +.globl ChaCha20_ctr32_ssse3 +.hidden ChaCha20_ctr32_ssse3 +.type ChaCha20_ctr32_ssse3,@function +.align 16 +ChaCha20_ctr32_ssse3: +.L_ChaCha20_ctr32_ssse3_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + call .Lpic_point +.Lpic_point: + popl %eax + movl 20(%esp),%edi + movl 24(%esp),%esi + movl 28(%esp),%ecx + movl 32(%esp),%edx + movl 36(%esp),%ebx + movl %esp,%ebp + subl $524,%esp + andl $-64,%esp + movl %ebp,512(%esp) + leal .Lssse3_data-.Lpic_point(%eax),%eax + movdqu (%ebx),%xmm3 + cmpl $256,%ecx + jb .L0001x + movl %edx,516(%esp) + movl %ebx,520(%esp) + subl $256,%ecx + leal 384(%esp),%ebp + movdqu (%edx),%xmm7 + pshufd $0,%xmm3,%xmm0 + pshufd $85,%xmm3,%xmm1 + pshufd $170,%xmm3,%xmm2 + pshufd $255,%xmm3,%xmm3 + paddd 48(%eax),%xmm0 + pshufd $0,%xmm7,%xmm4 + pshufd $85,%xmm7,%xmm5 + psubd 64(%eax),%xmm0 + pshufd $170,%xmm7,%xmm6 + pshufd $255,%xmm7,%xmm7 + movdqa %xmm0,64(%ebp) + movdqa %xmm1,80(%ebp) + movdqa %xmm2,96(%ebp) + movdqa %xmm3,112(%ebp) + movdqu 16(%edx),%xmm3 + movdqa %xmm4,-64(%ebp) + movdqa %xmm5,-48(%ebp) + movdqa %xmm6,-32(%ebp) + movdqa %xmm7,-16(%ebp) + movdqa 32(%eax),%xmm7 + leal 128(%esp),%ebx + pshufd $0,%xmm3,%xmm0 + pshufd $85,%xmm3,%xmm1 + pshufd $170,%xmm3,%xmm2 + pshufd $255,%xmm3,%xmm3 + pshufd $0,%xmm7,%xmm4 + pshufd $85,%xmm7,%xmm5 + pshufd $170,%xmm7,%xmm6 + pshufd $255,%xmm7,%xmm7 + movdqa %xmm0,(%ebp) + movdqa %xmm1,16(%ebp) + movdqa %xmm2,32(%ebp) + movdqa %xmm3,48(%ebp) + movdqa %xmm4,-128(%ebp) + movdqa %xmm5,-112(%ebp) + movdqa %xmm6,-96(%ebp) + movdqa %xmm7,-80(%ebp) + leal 128(%esi),%esi + leal 128(%edi),%edi + jmp .L001outer_loop +.align 16 +.L001outer_loop: + movdqa -112(%ebp),%xmm1 + movdqa -96(%ebp),%xmm2 + movdqa -80(%ebp),%xmm3 + movdqa -48(%ebp),%xmm5 + movdqa -32(%ebp),%xmm6 + movdqa -16(%ebp),%xmm7 + movdqa %xmm1,-112(%ebx) + movdqa %xmm2,-96(%ebx) + movdqa %xmm3,-80(%ebx) + movdqa %xmm5,-48(%ebx) + movdqa %xmm6,-32(%ebx) + movdqa %xmm7,-16(%ebx) + movdqa 32(%ebp),%xmm2 + movdqa 48(%ebp),%xmm3 + movdqa 64(%ebp),%xmm4 + movdqa 80(%ebp),%xmm5 + movdqa 96(%ebp),%xmm6 + movdqa 112(%ebp),%xmm7 + paddd 64(%eax),%xmm4 + movdqa %xmm2,32(%ebx) + movdqa %xmm3,48(%ebx) + movdqa %xmm4,64(%ebx) + movdqa %xmm5,80(%ebx) + movdqa %xmm6,96(%ebx) + movdqa %xmm7,112(%ebx) + movdqa %xmm4,64(%ebp) + movdqa -128(%ebp),%xmm0 + movdqa %xmm4,%xmm6 + movdqa -64(%ebp),%xmm3 + movdqa (%ebp),%xmm4 + movdqa 16(%ebp),%xmm5 + movl $10,%edx + nop +.align 16 +.L002loop: + paddd %xmm3,%xmm0 + movdqa %xmm3,%xmm2 + pxor %xmm0,%xmm6 + pshufb (%eax),%xmm6 + paddd %xmm6,%xmm4 + pxor %xmm4,%xmm2 + movdqa -48(%ebx),%xmm3 + movdqa %xmm2,%xmm1 + pslld $12,%xmm2 + psrld $20,%xmm1 + por %xmm1,%xmm2 + movdqa -112(%ebx),%xmm1 + paddd %xmm2,%xmm0 + movdqa 80(%ebx),%xmm7 + pxor %xmm0,%xmm6 + movdqa %xmm0,-128(%ebx) + pshufb 16(%eax),%xmm6 + paddd %xmm6,%xmm4 + movdqa %xmm6,64(%ebx) + pxor %xmm4,%xmm2 + paddd %xmm3,%xmm1 + movdqa %xmm2,%xmm0 + pslld $7,%xmm2 + psrld $25,%xmm0 + pxor %xmm1,%xmm7 + por %xmm0,%xmm2 + movdqa %xmm4,(%ebx) + pshufb (%eax),%xmm7 + movdqa %xmm2,-64(%ebx) + paddd %xmm7,%xmm5 + movdqa 32(%ebx),%xmm4 + pxor %xmm5,%xmm3 + movdqa -32(%ebx),%xmm2 + movdqa %xmm3,%xmm0 + pslld $12,%xmm3 + psrld $20,%xmm0 + por %xmm0,%xmm3 + movdqa -96(%ebx),%xmm0 + paddd %xmm3,%xmm1 + movdqa 96(%ebx),%xmm6 + pxor %xmm1,%xmm7 + movdqa %xmm1,-112(%ebx) + pshufb 16(%eax),%xmm7 + paddd %xmm7,%xmm5 + movdqa %xmm7,80(%ebx) + pxor %xmm5,%xmm3 + paddd %xmm2,%xmm0 + movdqa %xmm3,%xmm1 + 
pslld $7,%xmm3 + psrld $25,%xmm1 + pxor %xmm0,%xmm6 + por %xmm1,%xmm3 + movdqa %xmm5,16(%ebx) + pshufb (%eax),%xmm6 + movdqa %xmm3,-48(%ebx) + paddd %xmm6,%xmm4 + movdqa 48(%ebx),%xmm5 + pxor %xmm4,%xmm2 + movdqa -16(%ebx),%xmm3 + movdqa %xmm2,%xmm1 + pslld $12,%xmm2 + psrld $20,%xmm1 + por %xmm1,%xmm2 + movdqa -80(%ebx),%xmm1 + paddd %xmm2,%xmm0 + movdqa 112(%ebx),%xmm7 + pxor %xmm0,%xmm6 + movdqa %xmm0,-96(%ebx) + pshufb 16(%eax),%xmm6 + paddd %xmm6,%xmm4 + movdqa %xmm6,96(%ebx) + pxor %xmm4,%xmm2 + paddd %xmm3,%xmm1 + movdqa %xmm2,%xmm0 + pslld $7,%xmm2 + psrld $25,%xmm0 + pxor %xmm1,%xmm7 + por %xmm0,%xmm2 + pshufb (%eax),%xmm7 + movdqa %xmm2,-32(%ebx) + paddd %xmm7,%xmm5 + pxor %xmm5,%xmm3 + movdqa -48(%ebx),%xmm2 + movdqa %xmm3,%xmm0 + pslld $12,%xmm3 + psrld $20,%xmm0 + por %xmm0,%xmm3 + movdqa -128(%ebx),%xmm0 + paddd %xmm3,%xmm1 + pxor %xmm1,%xmm7 + movdqa %xmm1,-80(%ebx) + pshufb 16(%eax),%xmm7 + paddd %xmm7,%xmm5 + movdqa %xmm7,%xmm6 + pxor %xmm5,%xmm3 + paddd %xmm2,%xmm0 + movdqa %xmm3,%xmm1 + pslld $7,%xmm3 + psrld $25,%xmm1 + pxor %xmm0,%xmm6 + por %xmm1,%xmm3 + pshufb (%eax),%xmm6 + movdqa %xmm3,-16(%ebx) + paddd %xmm6,%xmm4 + pxor %xmm4,%xmm2 + movdqa -32(%ebx),%xmm3 + movdqa %xmm2,%xmm1 + pslld $12,%xmm2 + psrld $20,%xmm1 + por %xmm1,%xmm2 + movdqa -112(%ebx),%xmm1 + paddd %xmm2,%xmm0 + movdqa 64(%ebx),%xmm7 + pxor %xmm0,%xmm6 + movdqa %xmm0,-128(%ebx) + pshufb 16(%eax),%xmm6 + paddd %xmm6,%xmm4 + movdqa %xmm6,112(%ebx) + pxor %xmm4,%xmm2 + paddd %xmm3,%xmm1 + movdqa %xmm2,%xmm0 + pslld $7,%xmm2 + psrld $25,%xmm0 + pxor %xmm1,%xmm7 + por %xmm0,%xmm2 + movdqa %xmm4,32(%ebx) + pshufb (%eax),%xmm7 + movdqa %xmm2,-48(%ebx) + paddd %xmm7,%xmm5 + movdqa (%ebx),%xmm4 + pxor %xmm5,%xmm3 + movdqa -16(%ebx),%xmm2 + movdqa %xmm3,%xmm0 + pslld $12,%xmm3 + psrld $20,%xmm0 + por %xmm0,%xmm3 + movdqa -96(%ebx),%xmm0 + paddd %xmm3,%xmm1 + movdqa 80(%ebx),%xmm6 + pxor %xmm1,%xmm7 + movdqa %xmm1,-112(%ebx) + pshufb 16(%eax),%xmm7 + paddd %xmm7,%xmm5 + movdqa %xmm7,64(%ebx) + pxor %xmm5,%xmm3 + paddd %xmm2,%xmm0 + movdqa %xmm3,%xmm1 + pslld $7,%xmm3 + psrld $25,%xmm1 + pxor %xmm0,%xmm6 + por %xmm1,%xmm3 + movdqa %xmm5,48(%ebx) + pshufb (%eax),%xmm6 + movdqa %xmm3,-32(%ebx) + paddd %xmm6,%xmm4 + movdqa 16(%ebx),%xmm5 + pxor %xmm4,%xmm2 + movdqa -64(%ebx),%xmm3 + movdqa %xmm2,%xmm1 + pslld $12,%xmm2 + psrld $20,%xmm1 + por %xmm1,%xmm2 + movdqa -80(%ebx),%xmm1 + paddd %xmm2,%xmm0 + movdqa 96(%ebx),%xmm7 + pxor %xmm0,%xmm6 + movdqa %xmm0,-96(%ebx) + pshufb 16(%eax),%xmm6 + paddd %xmm6,%xmm4 + movdqa %xmm6,80(%ebx) + pxor %xmm4,%xmm2 + paddd %xmm3,%xmm1 + movdqa %xmm2,%xmm0 + pslld $7,%xmm2 + psrld $25,%xmm0 + pxor %xmm1,%xmm7 + por %xmm0,%xmm2 + pshufb (%eax),%xmm7 + movdqa %xmm2,-16(%ebx) + paddd %xmm7,%xmm5 + pxor %xmm5,%xmm3 + movdqa %xmm3,%xmm0 + pslld $12,%xmm3 + psrld $20,%xmm0 + por %xmm0,%xmm3 + movdqa -128(%ebx),%xmm0 + paddd %xmm3,%xmm1 + movdqa 64(%ebx),%xmm6 + pxor %xmm1,%xmm7 + movdqa %xmm1,-80(%ebx) + pshufb 16(%eax),%xmm7 + paddd %xmm7,%xmm5 + movdqa %xmm7,96(%ebx) + pxor %xmm5,%xmm3 + movdqa %xmm3,%xmm1 + pslld $7,%xmm3 + psrld $25,%xmm1 + por %xmm1,%xmm3 + decl %edx + jnz .L002loop + movdqa %xmm3,-64(%ebx) + movdqa %xmm4,(%ebx) + movdqa %xmm5,16(%ebx) + movdqa %xmm6,64(%ebx) + movdqa %xmm7,96(%ebx) + movdqa -112(%ebx),%xmm1 + movdqa -96(%ebx),%xmm2 + movdqa -80(%ebx),%xmm3 + paddd -128(%ebp),%xmm0 + paddd -112(%ebp),%xmm1 + paddd -96(%ebp),%xmm2 + paddd -80(%ebp),%xmm3 + movdqa %xmm0,%xmm6 + punpckldq %xmm1,%xmm0 + movdqa %xmm2,%xmm7 + punpckldq %xmm3,%xmm2 + punpckhdq 
%xmm1,%xmm6 + punpckhdq %xmm3,%xmm7 + movdqa %xmm0,%xmm1 + punpcklqdq %xmm2,%xmm0 + movdqa %xmm6,%xmm3 + punpcklqdq %xmm7,%xmm6 + punpckhqdq %xmm2,%xmm1 + punpckhqdq %xmm7,%xmm3 + movdqu -128(%esi),%xmm4 + movdqu -64(%esi),%xmm5 + movdqu (%esi),%xmm2 + movdqu 64(%esi),%xmm7 + leal 16(%esi),%esi + pxor %xmm0,%xmm4 + movdqa -64(%ebx),%xmm0 + pxor %xmm1,%xmm5 + movdqa -48(%ebx),%xmm1 + pxor %xmm2,%xmm6 + movdqa -32(%ebx),%xmm2 + pxor %xmm3,%xmm7 + movdqa -16(%ebx),%xmm3 + movdqu %xmm4,-128(%edi) + movdqu %xmm5,-64(%edi) + movdqu %xmm6,(%edi) + movdqu %xmm7,64(%edi) + leal 16(%edi),%edi + paddd -64(%ebp),%xmm0 + paddd -48(%ebp),%xmm1 + paddd -32(%ebp),%xmm2 + paddd -16(%ebp),%xmm3 + movdqa %xmm0,%xmm6 + punpckldq %xmm1,%xmm0 + movdqa %xmm2,%xmm7 + punpckldq %xmm3,%xmm2 + punpckhdq %xmm1,%xmm6 + punpckhdq %xmm3,%xmm7 + movdqa %xmm0,%xmm1 + punpcklqdq %xmm2,%xmm0 + movdqa %xmm6,%xmm3 + punpcklqdq %xmm7,%xmm6 + punpckhqdq %xmm2,%xmm1 + punpckhqdq %xmm7,%xmm3 + movdqu -128(%esi),%xmm4 + movdqu -64(%esi),%xmm5 + movdqu (%esi),%xmm2 + movdqu 64(%esi),%xmm7 + leal 16(%esi),%esi + pxor %xmm0,%xmm4 + movdqa (%ebx),%xmm0 + pxor %xmm1,%xmm5 + movdqa 16(%ebx),%xmm1 + pxor %xmm2,%xmm6 + movdqa 32(%ebx),%xmm2 + pxor %xmm3,%xmm7 + movdqa 48(%ebx),%xmm3 + movdqu %xmm4,-128(%edi) + movdqu %xmm5,-64(%edi) + movdqu %xmm6,(%edi) + movdqu %xmm7,64(%edi) + leal 16(%edi),%edi + paddd (%ebp),%xmm0 + paddd 16(%ebp),%xmm1 + paddd 32(%ebp),%xmm2 + paddd 48(%ebp),%xmm3 + movdqa %xmm0,%xmm6 + punpckldq %xmm1,%xmm0 + movdqa %xmm2,%xmm7 + punpckldq %xmm3,%xmm2 + punpckhdq %xmm1,%xmm6 + punpckhdq %xmm3,%xmm7 + movdqa %xmm0,%xmm1 + punpcklqdq %xmm2,%xmm0 + movdqa %xmm6,%xmm3 + punpcklqdq %xmm7,%xmm6 + punpckhqdq %xmm2,%xmm1 + punpckhqdq %xmm7,%xmm3 + movdqu -128(%esi),%xmm4 + movdqu -64(%esi),%xmm5 + movdqu (%esi),%xmm2 + movdqu 64(%esi),%xmm7 + leal 16(%esi),%esi + pxor %xmm0,%xmm4 + movdqa 64(%ebx),%xmm0 + pxor %xmm1,%xmm5 + movdqa 80(%ebx),%xmm1 + pxor %xmm2,%xmm6 + movdqa 96(%ebx),%xmm2 + pxor %xmm3,%xmm7 + movdqa 112(%ebx),%xmm3 + movdqu %xmm4,-128(%edi) + movdqu %xmm5,-64(%edi) + movdqu %xmm6,(%edi) + movdqu %xmm7,64(%edi) + leal 16(%edi),%edi + paddd 64(%ebp),%xmm0 + paddd 80(%ebp),%xmm1 + paddd 96(%ebp),%xmm2 + paddd 112(%ebp),%xmm3 + movdqa %xmm0,%xmm6 + punpckldq %xmm1,%xmm0 + movdqa %xmm2,%xmm7 + punpckldq %xmm3,%xmm2 + punpckhdq %xmm1,%xmm6 + punpckhdq %xmm3,%xmm7 + movdqa %xmm0,%xmm1 + punpcklqdq %xmm2,%xmm0 + movdqa %xmm6,%xmm3 + punpcklqdq %xmm7,%xmm6 + punpckhqdq %xmm2,%xmm1 + punpckhqdq %xmm7,%xmm3 + movdqu -128(%esi),%xmm4 + movdqu -64(%esi),%xmm5 + movdqu (%esi),%xmm2 + movdqu 64(%esi),%xmm7 + leal 208(%esi),%esi + pxor %xmm0,%xmm4 + pxor %xmm1,%xmm5 + pxor %xmm2,%xmm6 + pxor %xmm3,%xmm7 + movdqu %xmm4,-128(%edi) + movdqu %xmm5,-64(%edi) + movdqu %xmm6,(%edi) + movdqu %xmm7,64(%edi) + leal 208(%edi),%edi + subl $256,%ecx + jnc .L001outer_loop + addl $256,%ecx + jz .L003done + movl 520(%esp),%ebx + leal -128(%esi),%esi + movl 516(%esp),%edx + leal -128(%edi),%edi + movd 64(%ebp),%xmm2 + movdqu (%ebx),%xmm3 + paddd 96(%eax),%xmm2 + pand 112(%eax),%xmm3 + por %xmm2,%xmm3 +.L0001x: + movdqa 32(%eax),%xmm0 + movdqu (%edx),%xmm1 + movdqu 16(%edx),%xmm2 + movdqa (%eax),%xmm6 + movdqa 16(%eax),%xmm7 + movl %ebp,48(%esp) + movdqa %xmm0,(%esp) + movdqa %xmm1,16(%esp) + movdqa %xmm2,32(%esp) + movdqa %xmm3,48(%esp) + movl $10,%edx + jmp .L004loop1x +.align 16 +.L005outer1x: + movdqa 80(%eax),%xmm3 + movdqa (%esp),%xmm0 + movdqa 16(%esp),%xmm1 + movdqa 32(%esp),%xmm2 + paddd 48(%esp),%xmm3 + movl $10,%edx + movdqa 
%xmm3,48(%esp) + jmp .L004loop1x +.align 16 +.L004loop1x: + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 +.byte 102,15,56,0,222 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm4 + psrld $20,%xmm1 + pslld $12,%xmm4 + por %xmm4,%xmm1 + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 +.byte 102,15,56,0,223 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm4 + psrld $25,%xmm1 + pslld $7,%xmm4 + por %xmm4,%xmm1 + pshufd $78,%xmm2,%xmm2 + pshufd $57,%xmm1,%xmm1 + pshufd $147,%xmm3,%xmm3 + nop + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 +.byte 102,15,56,0,222 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm4 + psrld $20,%xmm1 + pslld $12,%xmm4 + por %xmm4,%xmm1 + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 +.byte 102,15,56,0,223 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm4 + psrld $25,%xmm1 + pslld $7,%xmm4 + por %xmm4,%xmm1 + pshufd $78,%xmm2,%xmm2 + pshufd $147,%xmm1,%xmm1 + pshufd $57,%xmm3,%xmm3 + decl %edx + jnz .L004loop1x + paddd (%esp),%xmm0 + paddd 16(%esp),%xmm1 + paddd 32(%esp),%xmm2 + paddd 48(%esp),%xmm3 + cmpl $64,%ecx + jb .L006tail + movdqu (%esi),%xmm4 + movdqu 16(%esi),%xmm5 + pxor %xmm4,%xmm0 + movdqu 32(%esi),%xmm4 + pxor %xmm5,%xmm1 + movdqu 48(%esi),%xmm5 + pxor %xmm4,%xmm2 + pxor %xmm5,%xmm3 + leal 64(%esi),%esi + movdqu %xmm0,(%edi) + movdqu %xmm1,16(%edi) + movdqu %xmm2,32(%edi) + movdqu %xmm3,48(%edi) + leal 64(%edi),%edi + subl $64,%ecx + jnz .L005outer1x + jmp .L003done +.L006tail: + movdqa %xmm0,(%esp) + movdqa %xmm1,16(%esp) + movdqa %xmm2,32(%esp) + movdqa %xmm3,48(%esp) + xorl %eax,%eax + xorl %edx,%edx + xorl %ebp,%ebp +.L007tail_loop: + movb (%esp,%ebp,1),%al + movb (%esi,%ebp,1),%dl + leal 1(%ebp),%ebp + xorb %dl,%al + movb %al,-1(%edi,%ebp,1) + decl %ecx + jnz .L007tail_loop +.L003done: + movl 512(%esp),%esp + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size ChaCha20_ctr32_ssse3,.-.L_ChaCha20_ctr32_ssse3_begin +.align 64 +.Lssse3_data: +.byte 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13 +.byte 3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14 +.long 1634760805,857760878,2036477234,1797285236 +.long 0,1,2,3 +.long 4,4,4,4 +.long 1,0,0,0 +.long 4,0,0,0 +.long 0,-1,-1,-1 +.align 64 +.byte 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54 +.byte 44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32 +.byte 60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111 +.byte 114,103,62,0 +#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__) diff --git a/ring-0.17.14/pregenerated/chacha-x86-win32n.asm b/ring-0.17.14/pregenerated/chacha-x86-win32n.asm new file mode 100644 index 0000000000..36e7b34234 --- /dev/null +++ b/ring-0.17.14/pregenerated/chacha-x86-win32n.asm @@ -0,0 +1,607 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. 
+ +%include "ring_core_generated/prefix_symbols_nasm.inc" +%ifidn __OUTPUT_FORMAT__, win32 +%ifidn __OUTPUT_FORMAT__,obj +section code use32 class=code align=64 +%elifidn __OUTPUT_FORMAT__,win32 +$@feat.00 equ 1 +section .text code align=64 +%else +section .text code +%endif +global _ChaCha20_ctr32_ssse3 +align 16 +_ChaCha20_ctr32_ssse3: +L$_ChaCha20_ctr32_ssse3_begin: + push ebp + push ebx + push esi + push edi + call L$pic_point +L$pic_point: + pop eax + mov edi,DWORD [20+esp] + mov esi,DWORD [24+esp] + mov ecx,DWORD [28+esp] + mov edx,DWORD [32+esp] + mov ebx,DWORD [36+esp] + mov ebp,esp + sub esp,524 + and esp,-64 + mov DWORD [512+esp],ebp + lea eax,[(L$ssse3_data-L$pic_point)+eax] + movdqu xmm3,[ebx] + cmp ecx,256 + jb NEAR L$0001x + mov DWORD [516+esp],edx + mov DWORD [520+esp],ebx + sub ecx,256 + lea ebp,[384+esp] + movdqu xmm7,[edx] + pshufd xmm0,xmm3,0 + pshufd xmm1,xmm3,85 + pshufd xmm2,xmm3,170 + pshufd xmm3,xmm3,255 + paddd xmm0,[48+eax] + pshufd xmm4,xmm7,0 + pshufd xmm5,xmm7,85 + psubd xmm0,[64+eax] + pshufd xmm6,xmm7,170 + pshufd xmm7,xmm7,255 + movdqa [64+ebp],xmm0 + movdqa [80+ebp],xmm1 + movdqa [96+ebp],xmm2 + movdqa [112+ebp],xmm3 + movdqu xmm3,[16+edx] + movdqa [ebp-64],xmm4 + movdqa [ebp-48],xmm5 + movdqa [ebp-32],xmm6 + movdqa [ebp-16],xmm7 + movdqa xmm7,[32+eax] + lea ebx,[128+esp] + pshufd xmm0,xmm3,0 + pshufd xmm1,xmm3,85 + pshufd xmm2,xmm3,170 + pshufd xmm3,xmm3,255 + pshufd xmm4,xmm7,0 + pshufd xmm5,xmm7,85 + pshufd xmm6,xmm7,170 + pshufd xmm7,xmm7,255 + movdqa [ebp],xmm0 + movdqa [16+ebp],xmm1 + movdqa [32+ebp],xmm2 + movdqa [48+ebp],xmm3 + movdqa [ebp-128],xmm4 + movdqa [ebp-112],xmm5 + movdqa [ebp-96],xmm6 + movdqa [ebp-80],xmm7 + lea esi,[128+esi] + lea edi,[128+edi] + jmp NEAR L$001outer_loop +align 16 +L$001outer_loop: + movdqa xmm1,[ebp-112] + movdqa xmm2,[ebp-96] + movdqa xmm3,[ebp-80] + movdqa xmm5,[ebp-48] + movdqa xmm6,[ebp-32] + movdqa xmm7,[ebp-16] + movdqa [ebx-112],xmm1 + movdqa [ebx-96],xmm2 + movdqa [ebx-80],xmm3 + movdqa [ebx-48],xmm5 + movdqa [ebx-32],xmm6 + movdqa [ebx-16],xmm7 + movdqa xmm2,[32+ebp] + movdqa xmm3,[48+ebp] + movdqa xmm4,[64+ebp] + movdqa xmm5,[80+ebp] + movdqa xmm6,[96+ebp] + movdqa xmm7,[112+ebp] + paddd xmm4,[64+eax] + movdqa [32+ebx],xmm2 + movdqa [48+ebx],xmm3 + movdqa [64+ebx],xmm4 + movdqa [80+ebx],xmm5 + movdqa [96+ebx],xmm6 + movdqa [112+ebx],xmm7 + movdqa [64+ebp],xmm4 + movdqa xmm0,[ebp-128] + movdqa xmm6,xmm4 + movdqa xmm3,[ebp-64] + movdqa xmm4,[ebp] + movdqa xmm5,[16+ebp] + mov edx,10 + nop +align 16 +L$002loop: + paddd xmm0,xmm3 + movdqa xmm2,xmm3 + pxor xmm6,xmm0 + pshufb xmm6,[eax] + paddd xmm4,xmm6 + pxor xmm2,xmm4 + movdqa xmm3,[ebx-48] + movdqa xmm1,xmm2 + pslld xmm2,12 + psrld xmm1,20 + por xmm2,xmm1 + movdqa xmm1,[ebx-112] + paddd xmm0,xmm2 + movdqa xmm7,[80+ebx] + pxor xmm6,xmm0 + movdqa [ebx-128],xmm0 + pshufb xmm6,[16+eax] + paddd xmm4,xmm6 + movdqa [64+ebx],xmm6 + pxor xmm2,xmm4 + paddd xmm1,xmm3 + movdqa xmm0,xmm2 + pslld xmm2,7 + psrld xmm0,25 + pxor xmm7,xmm1 + por xmm2,xmm0 + movdqa [ebx],xmm4 + pshufb xmm7,[eax] + movdqa [ebx-64],xmm2 + paddd xmm5,xmm7 + movdqa xmm4,[32+ebx] + pxor xmm3,xmm5 + movdqa xmm2,[ebx-32] + movdqa xmm0,xmm3 + pslld xmm3,12 + psrld xmm0,20 + por xmm3,xmm0 + movdqa xmm0,[ebx-96] + paddd xmm1,xmm3 + movdqa xmm6,[96+ebx] + pxor xmm7,xmm1 + movdqa [ebx-112],xmm1 + pshufb xmm7,[16+eax] + paddd xmm5,xmm7 + movdqa [80+ebx],xmm7 + pxor xmm3,xmm5 + paddd xmm0,xmm2 + movdqa xmm1,xmm3 + pslld xmm3,7 + psrld xmm1,25 + pxor xmm6,xmm0 + por xmm3,xmm1 + movdqa [16+ebx],xmm5 + pshufb 
xmm6,[eax] + movdqa [ebx-48],xmm3 + paddd xmm4,xmm6 + movdqa xmm5,[48+ebx] + pxor xmm2,xmm4 + movdqa xmm3,[ebx-16] + movdqa xmm1,xmm2 + pslld xmm2,12 + psrld xmm1,20 + por xmm2,xmm1 + movdqa xmm1,[ebx-80] + paddd xmm0,xmm2 + movdqa xmm7,[112+ebx] + pxor xmm6,xmm0 + movdqa [ebx-96],xmm0 + pshufb xmm6,[16+eax] + paddd xmm4,xmm6 + movdqa [96+ebx],xmm6 + pxor xmm2,xmm4 + paddd xmm1,xmm3 + movdqa xmm0,xmm2 + pslld xmm2,7 + psrld xmm0,25 + pxor xmm7,xmm1 + por xmm2,xmm0 + pshufb xmm7,[eax] + movdqa [ebx-32],xmm2 + paddd xmm5,xmm7 + pxor xmm3,xmm5 + movdqa xmm2,[ebx-48] + movdqa xmm0,xmm3 + pslld xmm3,12 + psrld xmm0,20 + por xmm3,xmm0 + movdqa xmm0,[ebx-128] + paddd xmm1,xmm3 + pxor xmm7,xmm1 + movdqa [ebx-80],xmm1 + pshufb xmm7,[16+eax] + paddd xmm5,xmm7 + movdqa xmm6,xmm7 + pxor xmm3,xmm5 + paddd xmm0,xmm2 + movdqa xmm1,xmm3 + pslld xmm3,7 + psrld xmm1,25 + pxor xmm6,xmm0 + por xmm3,xmm1 + pshufb xmm6,[eax] + movdqa [ebx-16],xmm3 + paddd xmm4,xmm6 + pxor xmm2,xmm4 + movdqa xmm3,[ebx-32] + movdqa xmm1,xmm2 + pslld xmm2,12 + psrld xmm1,20 + por xmm2,xmm1 + movdqa xmm1,[ebx-112] + paddd xmm0,xmm2 + movdqa xmm7,[64+ebx] + pxor xmm6,xmm0 + movdqa [ebx-128],xmm0 + pshufb xmm6,[16+eax] + paddd xmm4,xmm6 + movdqa [112+ebx],xmm6 + pxor xmm2,xmm4 + paddd xmm1,xmm3 + movdqa xmm0,xmm2 + pslld xmm2,7 + psrld xmm0,25 + pxor xmm7,xmm1 + por xmm2,xmm0 + movdqa [32+ebx],xmm4 + pshufb xmm7,[eax] + movdqa [ebx-48],xmm2 + paddd xmm5,xmm7 + movdqa xmm4,[ebx] + pxor xmm3,xmm5 + movdqa xmm2,[ebx-16] + movdqa xmm0,xmm3 + pslld xmm3,12 + psrld xmm0,20 + por xmm3,xmm0 + movdqa xmm0,[ebx-96] + paddd xmm1,xmm3 + movdqa xmm6,[80+ebx] + pxor xmm7,xmm1 + movdqa [ebx-112],xmm1 + pshufb xmm7,[16+eax] + paddd xmm5,xmm7 + movdqa [64+ebx],xmm7 + pxor xmm3,xmm5 + paddd xmm0,xmm2 + movdqa xmm1,xmm3 + pslld xmm3,7 + psrld xmm1,25 + pxor xmm6,xmm0 + por xmm3,xmm1 + movdqa [48+ebx],xmm5 + pshufb xmm6,[eax] + movdqa [ebx-32],xmm3 + paddd xmm4,xmm6 + movdqa xmm5,[16+ebx] + pxor xmm2,xmm4 + movdqa xmm3,[ebx-64] + movdqa xmm1,xmm2 + pslld xmm2,12 + psrld xmm1,20 + por xmm2,xmm1 + movdqa xmm1,[ebx-80] + paddd xmm0,xmm2 + movdqa xmm7,[96+ebx] + pxor xmm6,xmm0 + movdqa [ebx-96],xmm0 + pshufb xmm6,[16+eax] + paddd xmm4,xmm6 + movdqa [80+ebx],xmm6 + pxor xmm2,xmm4 + paddd xmm1,xmm3 + movdqa xmm0,xmm2 + pslld xmm2,7 + psrld xmm0,25 + pxor xmm7,xmm1 + por xmm2,xmm0 + pshufb xmm7,[eax] + movdqa [ebx-16],xmm2 + paddd xmm5,xmm7 + pxor xmm3,xmm5 + movdqa xmm0,xmm3 + pslld xmm3,12 + psrld xmm0,20 + por xmm3,xmm0 + movdqa xmm0,[ebx-128] + paddd xmm1,xmm3 + movdqa xmm6,[64+ebx] + pxor xmm7,xmm1 + movdqa [ebx-80],xmm1 + pshufb xmm7,[16+eax] + paddd xmm5,xmm7 + movdqa [96+ebx],xmm7 + pxor xmm3,xmm5 + movdqa xmm1,xmm3 + pslld xmm3,7 + psrld xmm1,25 + por xmm3,xmm1 + dec edx + jnz NEAR L$002loop + movdqa [ebx-64],xmm3 + movdqa [ebx],xmm4 + movdqa [16+ebx],xmm5 + movdqa [64+ebx],xmm6 + movdqa [96+ebx],xmm7 + movdqa xmm1,[ebx-112] + movdqa xmm2,[ebx-96] + movdqa xmm3,[ebx-80] + paddd xmm0,[ebp-128] + paddd xmm1,[ebp-112] + paddd xmm2,[ebp-96] + paddd xmm3,[ebp-80] + movdqa xmm6,xmm0 + punpckldq xmm0,xmm1 + movdqa xmm7,xmm2 + punpckldq xmm2,xmm3 + punpckhdq xmm6,xmm1 + punpckhdq xmm7,xmm3 + movdqa xmm1,xmm0 + punpcklqdq xmm0,xmm2 + movdqa xmm3,xmm6 + punpcklqdq xmm6,xmm7 + punpckhqdq xmm1,xmm2 + punpckhqdq xmm3,xmm7 + movdqu xmm4,[esi-128] + movdqu xmm5,[esi-64] + movdqu xmm2,[esi] + movdqu xmm7,[64+esi] + lea esi,[16+esi] + pxor xmm4,xmm0 + movdqa xmm0,[ebx-64] + pxor xmm5,xmm1 + movdqa xmm1,[ebx-48] + pxor xmm6,xmm2 + movdqa xmm2,[ebx-32] + pxor xmm7,xmm3 
+ movdqa xmm3,[ebx-16] + movdqu [edi-128],xmm4 + movdqu [edi-64],xmm5 + movdqu [edi],xmm6 + movdqu [64+edi],xmm7 + lea edi,[16+edi] + paddd xmm0,[ebp-64] + paddd xmm1,[ebp-48] + paddd xmm2,[ebp-32] + paddd xmm3,[ebp-16] + movdqa xmm6,xmm0 + punpckldq xmm0,xmm1 + movdqa xmm7,xmm2 + punpckldq xmm2,xmm3 + punpckhdq xmm6,xmm1 + punpckhdq xmm7,xmm3 + movdqa xmm1,xmm0 + punpcklqdq xmm0,xmm2 + movdqa xmm3,xmm6 + punpcklqdq xmm6,xmm7 + punpckhqdq xmm1,xmm2 + punpckhqdq xmm3,xmm7 + movdqu xmm4,[esi-128] + movdqu xmm5,[esi-64] + movdqu xmm2,[esi] + movdqu xmm7,[64+esi] + lea esi,[16+esi] + pxor xmm4,xmm0 + movdqa xmm0,[ebx] + pxor xmm5,xmm1 + movdqa xmm1,[16+ebx] + pxor xmm6,xmm2 + movdqa xmm2,[32+ebx] + pxor xmm7,xmm3 + movdqa xmm3,[48+ebx] + movdqu [edi-128],xmm4 + movdqu [edi-64],xmm5 + movdqu [edi],xmm6 + movdqu [64+edi],xmm7 + lea edi,[16+edi] + paddd xmm0,[ebp] + paddd xmm1,[16+ebp] + paddd xmm2,[32+ebp] + paddd xmm3,[48+ebp] + movdqa xmm6,xmm0 + punpckldq xmm0,xmm1 + movdqa xmm7,xmm2 + punpckldq xmm2,xmm3 + punpckhdq xmm6,xmm1 + punpckhdq xmm7,xmm3 + movdqa xmm1,xmm0 + punpcklqdq xmm0,xmm2 + movdqa xmm3,xmm6 + punpcklqdq xmm6,xmm7 + punpckhqdq xmm1,xmm2 + punpckhqdq xmm3,xmm7 + movdqu xmm4,[esi-128] + movdqu xmm5,[esi-64] + movdqu xmm2,[esi] + movdqu xmm7,[64+esi] + lea esi,[16+esi] + pxor xmm4,xmm0 + movdqa xmm0,[64+ebx] + pxor xmm5,xmm1 + movdqa xmm1,[80+ebx] + pxor xmm6,xmm2 + movdqa xmm2,[96+ebx] + pxor xmm7,xmm3 + movdqa xmm3,[112+ebx] + movdqu [edi-128],xmm4 + movdqu [edi-64],xmm5 + movdqu [edi],xmm6 + movdqu [64+edi],xmm7 + lea edi,[16+edi] + paddd xmm0,[64+ebp] + paddd xmm1,[80+ebp] + paddd xmm2,[96+ebp] + paddd xmm3,[112+ebp] + movdqa xmm6,xmm0 + punpckldq xmm0,xmm1 + movdqa xmm7,xmm2 + punpckldq xmm2,xmm3 + punpckhdq xmm6,xmm1 + punpckhdq xmm7,xmm3 + movdqa xmm1,xmm0 + punpcklqdq xmm0,xmm2 + movdqa xmm3,xmm6 + punpcklqdq xmm6,xmm7 + punpckhqdq xmm1,xmm2 + punpckhqdq xmm3,xmm7 + movdqu xmm4,[esi-128] + movdqu xmm5,[esi-64] + movdqu xmm2,[esi] + movdqu xmm7,[64+esi] + lea esi,[208+esi] + pxor xmm4,xmm0 + pxor xmm5,xmm1 + pxor xmm6,xmm2 + pxor xmm7,xmm3 + movdqu [edi-128],xmm4 + movdqu [edi-64],xmm5 + movdqu [edi],xmm6 + movdqu [64+edi],xmm7 + lea edi,[208+edi] + sub ecx,256 + jnc NEAR L$001outer_loop + add ecx,256 + jz NEAR L$003done + mov ebx,DWORD [520+esp] + lea esi,[esi-128] + mov edx,DWORD [516+esp] + lea edi,[edi-128] + movd xmm2,DWORD [64+ebp] + movdqu xmm3,[ebx] + paddd xmm2,[96+eax] + pand xmm3,[112+eax] + por xmm3,xmm2 +L$0001x: + movdqa xmm0,[32+eax] + movdqu xmm1,[edx] + movdqu xmm2,[16+edx] + movdqa xmm6,[eax] + movdqa xmm7,[16+eax] + mov DWORD [48+esp],ebp + movdqa [esp],xmm0 + movdqa [16+esp],xmm1 + movdqa [32+esp],xmm2 + movdqa [48+esp],xmm3 + mov edx,10 + jmp NEAR L$004loop1x +align 16 +L$005outer1x: + movdqa xmm3,[80+eax] + movdqa xmm0,[esp] + movdqa xmm1,[16+esp] + movdqa xmm2,[32+esp] + paddd xmm3,[48+esp] + mov edx,10 + movdqa [48+esp],xmm3 + jmp NEAR L$004loop1x +align 16 +L$004loop1x: + paddd xmm0,xmm1 + pxor xmm3,xmm0 +db 102,15,56,0,222 + paddd xmm2,xmm3 + pxor xmm1,xmm2 + movdqa xmm4,xmm1 + psrld xmm1,20 + pslld xmm4,12 + por xmm1,xmm4 + paddd xmm0,xmm1 + pxor xmm3,xmm0 +db 102,15,56,0,223 + paddd xmm2,xmm3 + pxor xmm1,xmm2 + movdqa xmm4,xmm1 + psrld xmm1,25 + pslld xmm4,7 + por xmm1,xmm4 + pshufd xmm2,xmm2,78 + pshufd xmm1,xmm1,57 + pshufd xmm3,xmm3,147 + nop + paddd xmm0,xmm1 + pxor xmm3,xmm0 +db 102,15,56,0,222 + paddd xmm2,xmm3 + pxor xmm1,xmm2 + movdqa xmm4,xmm1 + psrld xmm1,20 + pslld xmm4,12 + por xmm1,xmm4 + paddd xmm0,xmm1 + pxor xmm3,xmm0 +db 
102,15,56,0,223 + paddd xmm2,xmm3 + pxor xmm1,xmm2 + movdqa xmm4,xmm1 + psrld xmm1,25 + pslld xmm4,7 + por xmm1,xmm4 + pshufd xmm2,xmm2,78 + pshufd xmm1,xmm1,147 + pshufd xmm3,xmm3,57 + dec edx + jnz NEAR L$004loop1x + paddd xmm0,[esp] + paddd xmm1,[16+esp] + paddd xmm2,[32+esp] + paddd xmm3,[48+esp] + cmp ecx,64 + jb NEAR L$006tail + movdqu xmm4,[esi] + movdqu xmm5,[16+esi] + pxor xmm0,xmm4 + movdqu xmm4,[32+esi] + pxor xmm1,xmm5 + movdqu xmm5,[48+esi] + pxor xmm2,xmm4 + pxor xmm3,xmm5 + lea esi,[64+esi] + movdqu [edi],xmm0 + movdqu [16+edi],xmm1 + movdqu [32+edi],xmm2 + movdqu [48+edi],xmm3 + lea edi,[64+edi] + sub ecx,64 + jnz NEAR L$005outer1x + jmp NEAR L$003done +L$006tail: + movdqa [esp],xmm0 + movdqa [16+esp],xmm1 + movdqa [32+esp],xmm2 + movdqa [48+esp],xmm3 + xor eax,eax + xor edx,edx + xor ebp,ebp +L$007tail_loop: + mov al,BYTE [ebp*1+esp] + mov dl,BYTE [ebp*1+esi] + lea ebp,[1+ebp] + xor al,dl + mov BYTE [ebp*1+edi-1],al + dec ecx + jnz NEAR L$007tail_loop +L$003done: + mov esp,DWORD [512+esp] + pop edi + pop esi + pop ebx + pop ebp + ret +align 64 +L$ssse3_data: +db 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13 +db 3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14 +dd 1634760805,857760878,2036477234,1797285236 +dd 0,1,2,3 +dd 4,4,4,4 +dd 1,0,0,0 +dd 4,0,0,0 +dd 0,-1,-1,-1 +align 64 +db 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54 +db 44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32 +db 60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111 +db 114,103,62,0 +%else +; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 +ret +%endif diff --git a/ring-0.17.14/pregenerated/chacha-x86-win32n.o b/ring-0.17.14/pregenerated/chacha-x86-win32n.o new file mode 100644 index 0000000000..928f2e452d Binary files /dev/null and b/ring-0.17.14/pregenerated/chacha-x86-win32n.o differ diff --git a/ring-0.17.14/pregenerated/chacha-x86_64-elf.S b/ring-0.17.14/pregenerated/chacha-x86_64-elf.S new file mode 100644 index 0000000000..a85ae2b2bb --- /dev/null +++ b/ring-0.17.14/pregenerated/chacha-x86_64-elf.S @@ -0,0 +1,1474 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__) +.text + +.section .rodata +.align 64 +.Lzero: +.long 0,0,0,0 +.Lone: +.long 1,0,0,0 +.Linc: +.long 0,1,2,3 +.Lfour: +.long 4,4,4,4 +.Lincy: +.long 0,2,4,6,1,3,5,7 +.Leight: +.long 8,8,8,8,8,8,8,8 +.Lrot16: +.byte 0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd +.Lrot24: +.byte 0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe +.Lsigma: +.byte 101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107,0 +.align 64 +.Lzeroz: +.long 0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0 +.Lfourz: +.long 4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0 +.Lincz: +.long 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 +.Lsixteen: +.long 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16 +.byte 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.text +.globl ChaCha20_ctr32_nohw +.hidden ChaCha20_ctr32_nohw +.type ChaCha20_ctr32_nohw,@function +.align 64 +ChaCha20_ctr32_nohw: +.cfi_startproc +_CET_ENDBR + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset rbx,-16 + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset rbp,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset r15,-56 + subq $64+24,%rsp +.cfi_adjust_cfa_offset 88 +.Lctr32_body: + + + movdqu (%rcx),%xmm1 + movdqu 16(%rcx),%xmm2 + movdqu (%r8),%xmm3 + movdqa .Lone(%rip),%xmm4 + + + movdqa %xmm1,16(%rsp) + movdqa %xmm2,32(%rsp) + movdqa %xmm3,48(%rsp) + movq %rdx,%rbp + jmp .Loop_outer + +.align 32 +.Loop_outer: + movl $0x61707865,%eax + movl $0x3320646e,%ebx + movl $0x79622d32,%ecx + movl $0x6b206574,%edx + movl 16(%rsp),%r8d + movl 20(%rsp),%r9d + movl 24(%rsp),%r10d + movl 28(%rsp),%r11d + movd %xmm3,%r12d + movl 52(%rsp),%r13d + movl 56(%rsp),%r14d + movl 60(%rsp),%r15d + + movq %rbp,64+0(%rsp) + movl $10,%ebp + movq %rsi,64+8(%rsp) +.byte 102,72,15,126,214 + movq %rdi,64+16(%rsp) + movq %rsi,%rdi + shrq $32,%rdi + jmp .Loop + +.align 32 +.Loop: + addl %r8d,%eax + xorl %eax,%r12d + roll $16,%r12d + addl %r9d,%ebx + xorl %ebx,%r13d + roll $16,%r13d + addl %r12d,%esi + xorl %esi,%r8d + roll $12,%r8d + addl %r13d,%edi + xorl %edi,%r9d + roll $12,%r9d + addl %r8d,%eax + xorl %eax,%r12d + roll $8,%r12d + addl %r9d,%ebx + xorl %ebx,%r13d + roll $8,%r13d + addl %r12d,%esi + xorl %esi,%r8d + roll $7,%r8d + addl %r13d,%edi + xorl %edi,%r9d + roll $7,%r9d + movl %esi,32(%rsp) + movl %edi,36(%rsp) + movl 40(%rsp),%esi + movl 44(%rsp),%edi + addl %r10d,%ecx + xorl %ecx,%r14d + roll $16,%r14d + addl %r11d,%edx + xorl %edx,%r15d + roll $16,%r15d + addl %r14d,%esi + xorl %esi,%r10d + roll $12,%r10d + addl %r15d,%edi + xorl %edi,%r11d + roll $12,%r11d + addl %r10d,%ecx + xorl %ecx,%r14d + roll $8,%r14d + addl %r11d,%edx + xorl %edx,%r15d + roll $8,%r15d + addl %r14d,%esi + xorl %esi,%r10d + roll $7,%r10d + addl %r15d,%edi + xorl %edi,%r11d + roll $7,%r11d + addl %r9d,%eax + xorl %eax,%r15d + roll $16,%r15d + addl %r10d,%ebx + xorl %ebx,%r12d + roll $16,%r12d + addl %r15d,%esi + xorl %esi,%r9d + roll $12,%r9d + addl %r12d,%edi + xorl %edi,%r10d + roll $12,%r10d + addl %r9d,%eax + xorl %eax,%r15d + roll $8,%r15d + addl %r10d,%ebx + xorl %ebx,%r12d + roll $8,%r12d + addl %r15d,%esi + xorl %esi,%r9d + roll $7,%r9d + addl %r12d,%edi + xorl %edi,%r10d + roll $7,%r10d + 
movl %esi,40(%rsp) + movl %edi,44(%rsp) + movl 32(%rsp),%esi + movl 36(%rsp),%edi + addl %r11d,%ecx + xorl %ecx,%r13d + roll $16,%r13d + addl %r8d,%edx + xorl %edx,%r14d + roll $16,%r14d + addl %r13d,%esi + xorl %esi,%r11d + roll $12,%r11d + addl %r14d,%edi + xorl %edi,%r8d + roll $12,%r8d + addl %r11d,%ecx + xorl %ecx,%r13d + roll $8,%r13d + addl %r8d,%edx + xorl %edx,%r14d + roll $8,%r14d + addl %r13d,%esi + xorl %esi,%r11d + roll $7,%r11d + addl %r14d,%edi + xorl %edi,%r8d + roll $7,%r8d + decl %ebp + jnz .Loop + movl %edi,36(%rsp) + movl %esi,32(%rsp) + movq 64(%rsp),%rbp + movdqa %xmm2,%xmm1 + movq 64+8(%rsp),%rsi + paddd %xmm4,%xmm3 + movq 64+16(%rsp),%rdi + + addl $0x61707865,%eax + addl $0x3320646e,%ebx + addl $0x79622d32,%ecx + addl $0x6b206574,%edx + addl 16(%rsp),%r8d + addl 20(%rsp),%r9d + addl 24(%rsp),%r10d + addl 28(%rsp),%r11d + addl 48(%rsp),%r12d + addl 52(%rsp),%r13d + addl 56(%rsp),%r14d + addl 60(%rsp),%r15d + paddd 32(%rsp),%xmm1 + + cmpq $64,%rbp + jb .Ltail + + xorl 0(%rsi),%eax + xorl 4(%rsi),%ebx + xorl 8(%rsi),%ecx + xorl 12(%rsi),%edx + xorl 16(%rsi),%r8d + xorl 20(%rsi),%r9d + xorl 24(%rsi),%r10d + xorl 28(%rsi),%r11d + movdqu 32(%rsi),%xmm0 + xorl 48(%rsi),%r12d + xorl 52(%rsi),%r13d + xorl 56(%rsi),%r14d + xorl 60(%rsi),%r15d + leaq 64(%rsi),%rsi + pxor %xmm1,%xmm0 + + movdqa %xmm2,32(%rsp) + movd %xmm3,48(%rsp) + + movl %eax,0(%rdi) + movl %ebx,4(%rdi) + movl %ecx,8(%rdi) + movl %edx,12(%rdi) + movl %r8d,16(%rdi) + movl %r9d,20(%rdi) + movl %r10d,24(%rdi) + movl %r11d,28(%rdi) + movdqu %xmm0,32(%rdi) + movl %r12d,48(%rdi) + movl %r13d,52(%rdi) + movl %r14d,56(%rdi) + movl %r15d,60(%rdi) + leaq 64(%rdi),%rdi + + subq $64,%rbp + jnz .Loop_outer + + jmp .Ldone + +.align 16 +.Ltail: + movl %eax,0(%rsp) + movl %ebx,4(%rsp) + xorq %rbx,%rbx + movl %ecx,8(%rsp) + movl %edx,12(%rsp) + movl %r8d,16(%rsp) + movl %r9d,20(%rsp) + movl %r10d,24(%rsp) + movl %r11d,28(%rsp) + movdqa %xmm1,32(%rsp) + movl %r12d,48(%rsp) + movl %r13d,52(%rsp) + movl %r14d,56(%rsp) + movl %r15d,60(%rsp) + +.Loop_tail: + movzbl (%rsi,%rbx,1),%eax + movzbl (%rsp,%rbx,1),%edx + leaq 1(%rbx),%rbx + xorl %edx,%eax + movb %al,-1(%rdi,%rbx,1) + decq %rbp + jnz .Loop_tail + +.Ldone: + leaq 64+24+48(%rsp),%rsi + movq -48(%rsi),%r15 +.cfi_restore r15 + movq -40(%rsi),%r14 +.cfi_restore r14 + movq -32(%rsi),%r13 +.cfi_restore r13 + movq -24(%rsi),%r12 +.cfi_restore r12 + movq -16(%rsi),%rbp +.cfi_restore rbp + movq -8(%rsi),%rbx +.cfi_restore rbx + leaq (%rsi),%rsp +.cfi_adjust_cfa_offset -136 +.Lno_data: + ret +.cfi_endproc +.size ChaCha20_ctr32_nohw,.-ChaCha20_ctr32_nohw +.globl ChaCha20_ctr32_ssse3_4x +.hidden ChaCha20_ctr32_ssse3_4x +.type ChaCha20_ctr32_ssse3_4x,@function +.align 32 +ChaCha20_ctr32_ssse3_4x: +.cfi_startproc +_CET_ENDBR + movq %rsp,%r9 +.cfi_def_cfa_register r9 + subq $0x140+8,%rsp + movdqa .Lsigma(%rip),%xmm11 + movdqu (%rcx),%xmm15 + movdqu 16(%rcx),%xmm7 + movdqu (%r8),%xmm3 + leaq 256(%rsp),%rcx + leaq .Lrot16(%rip),%r10 + leaq .Lrot24(%rip),%r11 + + pshufd $0x00,%xmm11,%xmm8 + pshufd $0x55,%xmm11,%xmm9 + movdqa %xmm8,64(%rsp) + pshufd $0xaa,%xmm11,%xmm10 + movdqa %xmm9,80(%rsp) + pshufd $0xff,%xmm11,%xmm11 + movdqa %xmm10,96(%rsp) + movdqa %xmm11,112(%rsp) + + pshufd $0x00,%xmm15,%xmm12 + pshufd $0x55,%xmm15,%xmm13 + movdqa %xmm12,128-256(%rcx) + pshufd $0xaa,%xmm15,%xmm14 + movdqa %xmm13,144-256(%rcx) + pshufd $0xff,%xmm15,%xmm15 + movdqa %xmm14,160-256(%rcx) + movdqa %xmm15,176-256(%rcx) + + pshufd $0x00,%xmm7,%xmm4 + pshufd $0x55,%xmm7,%xmm5 + movdqa %xmm4,192-256(%rcx) + 
pshufd $0xaa,%xmm7,%xmm6 + movdqa %xmm5,208-256(%rcx) + pshufd $0xff,%xmm7,%xmm7 + movdqa %xmm6,224-256(%rcx) + movdqa %xmm7,240-256(%rcx) + + pshufd $0x00,%xmm3,%xmm0 + pshufd $0x55,%xmm3,%xmm1 + paddd .Linc(%rip),%xmm0 + pshufd $0xaa,%xmm3,%xmm2 + movdqa %xmm1,272-256(%rcx) + pshufd $0xff,%xmm3,%xmm3 + movdqa %xmm2,288-256(%rcx) + movdqa %xmm3,304-256(%rcx) + + jmp .Loop_enter4x + +.align 32 +.Loop_outer4x: + movdqa 64(%rsp),%xmm8 + movdqa 80(%rsp),%xmm9 + movdqa 96(%rsp),%xmm10 + movdqa 112(%rsp),%xmm11 + movdqa 128-256(%rcx),%xmm12 + movdqa 144-256(%rcx),%xmm13 + movdqa 160-256(%rcx),%xmm14 + movdqa 176-256(%rcx),%xmm15 + movdqa 192-256(%rcx),%xmm4 + movdqa 208-256(%rcx),%xmm5 + movdqa 224-256(%rcx),%xmm6 + movdqa 240-256(%rcx),%xmm7 + movdqa 256-256(%rcx),%xmm0 + movdqa 272-256(%rcx),%xmm1 + movdqa 288-256(%rcx),%xmm2 + movdqa 304-256(%rcx),%xmm3 + paddd .Lfour(%rip),%xmm0 + +.Loop_enter4x: + movdqa %xmm6,32(%rsp) + movdqa %xmm7,48(%rsp) + movdqa (%r10),%xmm7 + movl $10,%eax + movdqa %xmm0,256-256(%rcx) + jmp .Loop4x + +.align 32 +.Loop4x: + paddd %xmm12,%xmm8 + paddd %xmm13,%xmm9 + pxor %xmm8,%xmm0 + pxor %xmm9,%xmm1 +.byte 102,15,56,0,199 +.byte 102,15,56,0,207 + paddd %xmm0,%xmm4 + paddd %xmm1,%xmm5 + pxor %xmm4,%xmm12 + pxor %xmm5,%xmm13 + movdqa %xmm12,%xmm6 + pslld $12,%xmm12 + psrld $20,%xmm6 + movdqa %xmm13,%xmm7 + pslld $12,%xmm13 + por %xmm6,%xmm12 + psrld $20,%xmm7 + movdqa (%r11),%xmm6 + por %xmm7,%xmm13 + paddd %xmm12,%xmm8 + paddd %xmm13,%xmm9 + pxor %xmm8,%xmm0 + pxor %xmm9,%xmm1 +.byte 102,15,56,0,198 +.byte 102,15,56,0,206 + paddd %xmm0,%xmm4 + paddd %xmm1,%xmm5 + pxor %xmm4,%xmm12 + pxor %xmm5,%xmm13 + movdqa %xmm12,%xmm7 + pslld $7,%xmm12 + psrld $25,%xmm7 + movdqa %xmm13,%xmm6 + pslld $7,%xmm13 + por %xmm7,%xmm12 + psrld $25,%xmm6 + movdqa (%r10),%xmm7 + por %xmm6,%xmm13 + movdqa %xmm4,0(%rsp) + movdqa %xmm5,16(%rsp) + movdqa 32(%rsp),%xmm4 + movdqa 48(%rsp),%xmm5 + paddd %xmm14,%xmm10 + paddd %xmm15,%xmm11 + pxor %xmm10,%xmm2 + pxor %xmm11,%xmm3 +.byte 102,15,56,0,215 +.byte 102,15,56,0,223 + paddd %xmm2,%xmm4 + paddd %xmm3,%xmm5 + pxor %xmm4,%xmm14 + pxor %xmm5,%xmm15 + movdqa %xmm14,%xmm6 + pslld $12,%xmm14 + psrld $20,%xmm6 + movdqa %xmm15,%xmm7 + pslld $12,%xmm15 + por %xmm6,%xmm14 + psrld $20,%xmm7 + movdqa (%r11),%xmm6 + por %xmm7,%xmm15 + paddd %xmm14,%xmm10 + paddd %xmm15,%xmm11 + pxor %xmm10,%xmm2 + pxor %xmm11,%xmm3 +.byte 102,15,56,0,214 +.byte 102,15,56,0,222 + paddd %xmm2,%xmm4 + paddd %xmm3,%xmm5 + pxor %xmm4,%xmm14 + pxor %xmm5,%xmm15 + movdqa %xmm14,%xmm7 + pslld $7,%xmm14 + psrld $25,%xmm7 + movdqa %xmm15,%xmm6 + pslld $7,%xmm15 + por %xmm7,%xmm14 + psrld $25,%xmm6 + movdqa (%r10),%xmm7 + por %xmm6,%xmm15 + paddd %xmm13,%xmm8 + paddd %xmm14,%xmm9 + pxor %xmm8,%xmm3 + pxor %xmm9,%xmm0 +.byte 102,15,56,0,223 +.byte 102,15,56,0,199 + paddd %xmm3,%xmm4 + paddd %xmm0,%xmm5 + pxor %xmm4,%xmm13 + pxor %xmm5,%xmm14 + movdqa %xmm13,%xmm6 + pslld $12,%xmm13 + psrld $20,%xmm6 + movdqa %xmm14,%xmm7 + pslld $12,%xmm14 + por %xmm6,%xmm13 + psrld $20,%xmm7 + movdqa (%r11),%xmm6 + por %xmm7,%xmm14 + paddd %xmm13,%xmm8 + paddd %xmm14,%xmm9 + pxor %xmm8,%xmm3 + pxor %xmm9,%xmm0 +.byte 102,15,56,0,222 +.byte 102,15,56,0,198 + paddd %xmm3,%xmm4 + paddd %xmm0,%xmm5 + pxor %xmm4,%xmm13 + pxor %xmm5,%xmm14 + movdqa %xmm13,%xmm7 + pslld $7,%xmm13 + psrld $25,%xmm7 + movdqa %xmm14,%xmm6 + pslld $7,%xmm14 + por %xmm7,%xmm13 + psrld $25,%xmm6 + movdqa (%r10),%xmm7 + por %xmm6,%xmm14 + movdqa %xmm4,32(%rsp) + movdqa %xmm5,48(%rsp) + movdqa 0(%rsp),%xmm4 + movdqa 
16(%rsp),%xmm5 + paddd %xmm15,%xmm10 + paddd %xmm12,%xmm11 + pxor %xmm10,%xmm1 + pxor %xmm11,%xmm2 +.byte 102,15,56,0,207 +.byte 102,15,56,0,215 + paddd %xmm1,%xmm4 + paddd %xmm2,%xmm5 + pxor %xmm4,%xmm15 + pxor %xmm5,%xmm12 + movdqa %xmm15,%xmm6 + pslld $12,%xmm15 + psrld $20,%xmm6 + movdqa %xmm12,%xmm7 + pslld $12,%xmm12 + por %xmm6,%xmm15 + psrld $20,%xmm7 + movdqa (%r11),%xmm6 + por %xmm7,%xmm12 + paddd %xmm15,%xmm10 + paddd %xmm12,%xmm11 + pxor %xmm10,%xmm1 + pxor %xmm11,%xmm2 +.byte 102,15,56,0,206 +.byte 102,15,56,0,214 + paddd %xmm1,%xmm4 + paddd %xmm2,%xmm5 + pxor %xmm4,%xmm15 + pxor %xmm5,%xmm12 + movdqa %xmm15,%xmm7 + pslld $7,%xmm15 + psrld $25,%xmm7 + movdqa %xmm12,%xmm6 + pslld $7,%xmm12 + por %xmm7,%xmm15 + psrld $25,%xmm6 + movdqa (%r10),%xmm7 + por %xmm6,%xmm12 + decl %eax + jnz .Loop4x + + paddd 64(%rsp),%xmm8 + paddd 80(%rsp),%xmm9 + paddd 96(%rsp),%xmm10 + paddd 112(%rsp),%xmm11 + + movdqa %xmm8,%xmm6 + punpckldq %xmm9,%xmm8 + movdqa %xmm10,%xmm7 + punpckldq %xmm11,%xmm10 + punpckhdq %xmm9,%xmm6 + punpckhdq %xmm11,%xmm7 + movdqa %xmm8,%xmm9 + punpcklqdq %xmm10,%xmm8 + movdqa %xmm6,%xmm11 + punpcklqdq %xmm7,%xmm6 + punpckhqdq %xmm10,%xmm9 + punpckhqdq %xmm7,%xmm11 + paddd 128-256(%rcx),%xmm12 + paddd 144-256(%rcx),%xmm13 + paddd 160-256(%rcx),%xmm14 + paddd 176-256(%rcx),%xmm15 + + movdqa %xmm8,0(%rsp) + movdqa %xmm9,16(%rsp) + movdqa 32(%rsp),%xmm8 + movdqa 48(%rsp),%xmm9 + + movdqa %xmm12,%xmm10 + punpckldq %xmm13,%xmm12 + movdqa %xmm14,%xmm7 + punpckldq %xmm15,%xmm14 + punpckhdq %xmm13,%xmm10 + punpckhdq %xmm15,%xmm7 + movdqa %xmm12,%xmm13 + punpcklqdq %xmm14,%xmm12 + movdqa %xmm10,%xmm15 + punpcklqdq %xmm7,%xmm10 + punpckhqdq %xmm14,%xmm13 + punpckhqdq %xmm7,%xmm15 + paddd 192-256(%rcx),%xmm4 + paddd 208-256(%rcx),%xmm5 + paddd 224-256(%rcx),%xmm8 + paddd 240-256(%rcx),%xmm9 + + movdqa %xmm6,32(%rsp) + movdqa %xmm11,48(%rsp) + + movdqa %xmm4,%xmm14 + punpckldq %xmm5,%xmm4 + movdqa %xmm8,%xmm7 + punpckldq %xmm9,%xmm8 + punpckhdq %xmm5,%xmm14 + punpckhdq %xmm9,%xmm7 + movdqa %xmm4,%xmm5 + punpcklqdq %xmm8,%xmm4 + movdqa %xmm14,%xmm9 + punpcklqdq %xmm7,%xmm14 + punpckhqdq %xmm8,%xmm5 + punpckhqdq %xmm7,%xmm9 + paddd 256-256(%rcx),%xmm0 + paddd 272-256(%rcx),%xmm1 + paddd 288-256(%rcx),%xmm2 + paddd 304-256(%rcx),%xmm3 + + movdqa %xmm0,%xmm8 + punpckldq %xmm1,%xmm0 + movdqa %xmm2,%xmm7 + punpckldq %xmm3,%xmm2 + punpckhdq %xmm1,%xmm8 + punpckhdq %xmm3,%xmm7 + movdqa %xmm0,%xmm1 + punpcklqdq %xmm2,%xmm0 + movdqa %xmm8,%xmm3 + punpcklqdq %xmm7,%xmm8 + punpckhqdq %xmm2,%xmm1 + punpckhqdq %xmm7,%xmm3 + cmpq $256,%rdx + jb .Ltail4x + + movdqu 0(%rsi),%xmm6 + movdqu 16(%rsi),%xmm11 + movdqu 32(%rsi),%xmm2 + movdqu 48(%rsi),%xmm7 + pxor 0(%rsp),%xmm6 + pxor %xmm12,%xmm11 + pxor %xmm4,%xmm2 + pxor %xmm0,%xmm7 + + movdqu %xmm6,0(%rdi) + movdqu 64(%rsi),%xmm6 + movdqu %xmm11,16(%rdi) + movdqu 80(%rsi),%xmm11 + movdqu %xmm2,32(%rdi) + movdqu 96(%rsi),%xmm2 + movdqu %xmm7,48(%rdi) + movdqu 112(%rsi),%xmm7 + leaq 128(%rsi),%rsi + pxor 16(%rsp),%xmm6 + pxor %xmm13,%xmm11 + pxor %xmm5,%xmm2 + pxor %xmm1,%xmm7 + + movdqu %xmm6,64(%rdi) + movdqu 0(%rsi),%xmm6 + movdqu %xmm11,80(%rdi) + movdqu 16(%rsi),%xmm11 + movdqu %xmm2,96(%rdi) + movdqu 32(%rsi),%xmm2 + movdqu %xmm7,112(%rdi) + leaq 128(%rdi),%rdi + movdqu 48(%rsi),%xmm7 + pxor 32(%rsp),%xmm6 + pxor %xmm10,%xmm11 + pxor %xmm14,%xmm2 + pxor %xmm8,%xmm7 + + movdqu %xmm6,0(%rdi) + movdqu 64(%rsi),%xmm6 + movdqu %xmm11,16(%rdi) + movdqu 80(%rsi),%xmm11 + movdqu %xmm2,32(%rdi) + movdqu 96(%rsi),%xmm2 + movdqu %xmm7,48(%rdi) + movdqu 
112(%rsi),%xmm7 + leaq 128(%rsi),%rsi + pxor 48(%rsp),%xmm6 + pxor %xmm15,%xmm11 + pxor %xmm9,%xmm2 + pxor %xmm3,%xmm7 + movdqu %xmm6,64(%rdi) + movdqu %xmm11,80(%rdi) + movdqu %xmm2,96(%rdi) + movdqu %xmm7,112(%rdi) + leaq 128(%rdi),%rdi + + subq $256,%rdx + jnz .Loop_outer4x + + jmp .Ldone4x + +.Ltail4x: + cmpq $192,%rdx + jae .L192_or_more4x + cmpq $128,%rdx + jae .L128_or_more4x + cmpq $64,%rdx + jae .L64_or_more4x + + + xorq %r10,%r10 + + movdqa %xmm12,16(%rsp) + movdqa %xmm4,32(%rsp) + movdqa %xmm0,48(%rsp) + jmp .Loop_tail4x + +.align 32 +.L64_or_more4x: + movdqu 0(%rsi),%xmm6 + movdqu 16(%rsi),%xmm11 + movdqu 32(%rsi),%xmm2 + movdqu 48(%rsi),%xmm7 + pxor 0(%rsp),%xmm6 + pxor %xmm12,%xmm11 + pxor %xmm4,%xmm2 + pxor %xmm0,%xmm7 + movdqu %xmm6,0(%rdi) + movdqu %xmm11,16(%rdi) + movdqu %xmm2,32(%rdi) + movdqu %xmm7,48(%rdi) + je .Ldone4x + + movdqa 16(%rsp),%xmm6 + leaq 64(%rsi),%rsi + xorq %r10,%r10 + movdqa %xmm6,0(%rsp) + movdqa %xmm13,16(%rsp) + leaq 64(%rdi),%rdi + movdqa %xmm5,32(%rsp) + subq $64,%rdx + movdqa %xmm1,48(%rsp) + jmp .Loop_tail4x + +.align 32 +.L128_or_more4x: + movdqu 0(%rsi),%xmm6 + movdqu 16(%rsi),%xmm11 + movdqu 32(%rsi),%xmm2 + movdqu 48(%rsi),%xmm7 + pxor 0(%rsp),%xmm6 + pxor %xmm12,%xmm11 + pxor %xmm4,%xmm2 + pxor %xmm0,%xmm7 + + movdqu %xmm6,0(%rdi) + movdqu 64(%rsi),%xmm6 + movdqu %xmm11,16(%rdi) + movdqu 80(%rsi),%xmm11 + movdqu %xmm2,32(%rdi) + movdqu 96(%rsi),%xmm2 + movdqu %xmm7,48(%rdi) + movdqu 112(%rsi),%xmm7 + pxor 16(%rsp),%xmm6 + pxor %xmm13,%xmm11 + pxor %xmm5,%xmm2 + pxor %xmm1,%xmm7 + movdqu %xmm6,64(%rdi) + movdqu %xmm11,80(%rdi) + movdqu %xmm2,96(%rdi) + movdqu %xmm7,112(%rdi) + je .Ldone4x + + movdqa 32(%rsp),%xmm6 + leaq 128(%rsi),%rsi + xorq %r10,%r10 + movdqa %xmm6,0(%rsp) + movdqa %xmm10,16(%rsp) + leaq 128(%rdi),%rdi + movdqa %xmm14,32(%rsp) + subq $128,%rdx + movdqa %xmm8,48(%rsp) + jmp .Loop_tail4x + +.align 32 +.L192_or_more4x: + movdqu 0(%rsi),%xmm6 + movdqu 16(%rsi),%xmm11 + movdqu 32(%rsi),%xmm2 + movdqu 48(%rsi),%xmm7 + pxor 0(%rsp),%xmm6 + pxor %xmm12,%xmm11 + pxor %xmm4,%xmm2 + pxor %xmm0,%xmm7 + + movdqu %xmm6,0(%rdi) + movdqu 64(%rsi),%xmm6 + movdqu %xmm11,16(%rdi) + movdqu 80(%rsi),%xmm11 + movdqu %xmm2,32(%rdi) + movdqu 96(%rsi),%xmm2 + movdqu %xmm7,48(%rdi) + movdqu 112(%rsi),%xmm7 + leaq 128(%rsi),%rsi + pxor 16(%rsp),%xmm6 + pxor %xmm13,%xmm11 + pxor %xmm5,%xmm2 + pxor %xmm1,%xmm7 + + movdqu %xmm6,64(%rdi) + movdqu 0(%rsi),%xmm6 + movdqu %xmm11,80(%rdi) + movdqu 16(%rsi),%xmm11 + movdqu %xmm2,96(%rdi) + movdqu 32(%rsi),%xmm2 + movdqu %xmm7,112(%rdi) + leaq 128(%rdi),%rdi + movdqu 48(%rsi),%xmm7 + pxor 32(%rsp),%xmm6 + pxor %xmm10,%xmm11 + pxor %xmm14,%xmm2 + pxor %xmm8,%xmm7 + movdqu %xmm6,0(%rdi) + movdqu %xmm11,16(%rdi) + movdqu %xmm2,32(%rdi) + movdqu %xmm7,48(%rdi) + je .Ldone4x + + movdqa 48(%rsp),%xmm6 + leaq 64(%rsi),%rsi + xorq %r10,%r10 + movdqa %xmm6,0(%rsp) + movdqa %xmm15,16(%rsp) + leaq 64(%rdi),%rdi + movdqa %xmm9,32(%rsp) + subq $192,%rdx + movdqa %xmm3,48(%rsp) + +.Loop_tail4x: + movzbl (%rsi,%r10,1),%eax + movzbl (%rsp,%r10,1),%ecx + leaq 1(%r10),%r10 + xorl %ecx,%eax + movb %al,-1(%rdi,%r10,1) + decq %rdx + jnz .Loop_tail4x + +.Ldone4x: + leaq (%r9),%rsp +.cfi_def_cfa_register rsp +.L4x_epilogue: + ret +.cfi_endproc +.size ChaCha20_ctr32_ssse3_4x,.-ChaCha20_ctr32_ssse3_4x +.globl ChaCha20_ctr32_avx2 +.hidden ChaCha20_ctr32_avx2 +.type ChaCha20_ctr32_avx2,@function +.align 32 +ChaCha20_ctr32_avx2: +.cfi_startproc +_CET_ENDBR + movq %rsp,%r9 +.cfi_def_cfa_register r9 + subq $0x280+8,%rsp + andq $-32,%rsp 
+ vzeroupper + + + + + + + + + + + vbroadcasti128 .Lsigma(%rip),%ymm11 + vbroadcasti128 (%rcx),%ymm3 + vbroadcasti128 16(%rcx),%ymm15 + vbroadcasti128 (%r8),%ymm7 + leaq 256(%rsp),%rcx + leaq 512(%rsp),%rax + leaq .Lrot16(%rip),%r10 + leaq .Lrot24(%rip),%r11 + + vpshufd $0x00,%ymm11,%ymm8 + vpshufd $0x55,%ymm11,%ymm9 + vmovdqa %ymm8,128-256(%rcx) + vpshufd $0xaa,%ymm11,%ymm10 + vmovdqa %ymm9,160-256(%rcx) + vpshufd $0xff,%ymm11,%ymm11 + vmovdqa %ymm10,192-256(%rcx) + vmovdqa %ymm11,224-256(%rcx) + + vpshufd $0x00,%ymm3,%ymm0 + vpshufd $0x55,%ymm3,%ymm1 + vmovdqa %ymm0,256-256(%rcx) + vpshufd $0xaa,%ymm3,%ymm2 + vmovdqa %ymm1,288-256(%rcx) + vpshufd $0xff,%ymm3,%ymm3 + vmovdqa %ymm2,320-256(%rcx) + vmovdqa %ymm3,352-256(%rcx) + + vpshufd $0x00,%ymm15,%ymm12 + vpshufd $0x55,%ymm15,%ymm13 + vmovdqa %ymm12,384-512(%rax) + vpshufd $0xaa,%ymm15,%ymm14 + vmovdqa %ymm13,416-512(%rax) + vpshufd $0xff,%ymm15,%ymm15 + vmovdqa %ymm14,448-512(%rax) + vmovdqa %ymm15,480-512(%rax) + + vpshufd $0x00,%ymm7,%ymm4 + vpshufd $0x55,%ymm7,%ymm5 + vpaddd .Lincy(%rip),%ymm4,%ymm4 + vpshufd $0xaa,%ymm7,%ymm6 + vmovdqa %ymm5,544-512(%rax) + vpshufd $0xff,%ymm7,%ymm7 + vmovdqa %ymm6,576-512(%rax) + vmovdqa %ymm7,608-512(%rax) + + jmp .Loop_enter8x + +.align 32 +.Loop_outer8x: + vmovdqa 128-256(%rcx),%ymm8 + vmovdqa 160-256(%rcx),%ymm9 + vmovdqa 192-256(%rcx),%ymm10 + vmovdqa 224-256(%rcx),%ymm11 + vmovdqa 256-256(%rcx),%ymm0 + vmovdqa 288-256(%rcx),%ymm1 + vmovdqa 320-256(%rcx),%ymm2 + vmovdqa 352-256(%rcx),%ymm3 + vmovdqa 384-512(%rax),%ymm12 + vmovdqa 416-512(%rax),%ymm13 + vmovdqa 448-512(%rax),%ymm14 + vmovdqa 480-512(%rax),%ymm15 + vmovdqa 512-512(%rax),%ymm4 + vmovdqa 544-512(%rax),%ymm5 + vmovdqa 576-512(%rax),%ymm6 + vmovdqa 608-512(%rax),%ymm7 + vpaddd .Leight(%rip),%ymm4,%ymm4 + +.Loop_enter8x: + vmovdqa %ymm14,64(%rsp) + vmovdqa %ymm15,96(%rsp) + vbroadcasti128 (%r10),%ymm15 + vmovdqa %ymm4,512-512(%rax) + movl $10,%eax + jmp .Loop8x + +.align 32 +.Loop8x: + vpaddd %ymm0,%ymm8,%ymm8 + vpxor %ymm4,%ymm8,%ymm4 + vpshufb %ymm15,%ymm4,%ymm4 + vpaddd %ymm1,%ymm9,%ymm9 + vpxor %ymm5,%ymm9,%ymm5 + vpshufb %ymm15,%ymm5,%ymm5 + vpaddd %ymm4,%ymm12,%ymm12 + vpxor %ymm0,%ymm12,%ymm0 + vpslld $12,%ymm0,%ymm14 + vpsrld $20,%ymm0,%ymm0 + vpor %ymm0,%ymm14,%ymm0 + vbroadcasti128 (%r11),%ymm14 + vpaddd %ymm5,%ymm13,%ymm13 + vpxor %ymm1,%ymm13,%ymm1 + vpslld $12,%ymm1,%ymm15 + vpsrld $20,%ymm1,%ymm1 + vpor %ymm1,%ymm15,%ymm1 + vpaddd %ymm0,%ymm8,%ymm8 + vpxor %ymm4,%ymm8,%ymm4 + vpshufb %ymm14,%ymm4,%ymm4 + vpaddd %ymm1,%ymm9,%ymm9 + vpxor %ymm5,%ymm9,%ymm5 + vpshufb %ymm14,%ymm5,%ymm5 + vpaddd %ymm4,%ymm12,%ymm12 + vpxor %ymm0,%ymm12,%ymm0 + vpslld $7,%ymm0,%ymm15 + vpsrld $25,%ymm0,%ymm0 + vpor %ymm0,%ymm15,%ymm0 + vbroadcasti128 (%r10),%ymm15 + vpaddd %ymm5,%ymm13,%ymm13 + vpxor %ymm1,%ymm13,%ymm1 + vpslld $7,%ymm1,%ymm14 + vpsrld $25,%ymm1,%ymm1 + vpor %ymm1,%ymm14,%ymm1 + vmovdqa %ymm12,0(%rsp) + vmovdqa %ymm13,32(%rsp) + vmovdqa 64(%rsp),%ymm12 + vmovdqa 96(%rsp),%ymm13 + vpaddd %ymm2,%ymm10,%ymm10 + vpxor %ymm6,%ymm10,%ymm6 + vpshufb %ymm15,%ymm6,%ymm6 + vpaddd %ymm3,%ymm11,%ymm11 + vpxor %ymm7,%ymm11,%ymm7 + vpshufb %ymm15,%ymm7,%ymm7 + vpaddd %ymm6,%ymm12,%ymm12 + vpxor %ymm2,%ymm12,%ymm2 + vpslld $12,%ymm2,%ymm14 + vpsrld $20,%ymm2,%ymm2 + vpor %ymm2,%ymm14,%ymm2 + vbroadcasti128 (%r11),%ymm14 + vpaddd %ymm7,%ymm13,%ymm13 + vpxor %ymm3,%ymm13,%ymm3 + vpslld $12,%ymm3,%ymm15 + vpsrld $20,%ymm3,%ymm3 + vpor %ymm3,%ymm15,%ymm3 + vpaddd %ymm2,%ymm10,%ymm10 + vpxor %ymm6,%ymm10,%ymm6 + vpshufb %ymm14,%ymm6,%ymm6 + 
vpaddd %ymm3,%ymm11,%ymm11 + vpxor %ymm7,%ymm11,%ymm7 + vpshufb %ymm14,%ymm7,%ymm7 + vpaddd %ymm6,%ymm12,%ymm12 + vpxor %ymm2,%ymm12,%ymm2 + vpslld $7,%ymm2,%ymm15 + vpsrld $25,%ymm2,%ymm2 + vpor %ymm2,%ymm15,%ymm2 + vbroadcasti128 (%r10),%ymm15 + vpaddd %ymm7,%ymm13,%ymm13 + vpxor %ymm3,%ymm13,%ymm3 + vpslld $7,%ymm3,%ymm14 + vpsrld $25,%ymm3,%ymm3 + vpor %ymm3,%ymm14,%ymm3 + vpaddd %ymm1,%ymm8,%ymm8 + vpxor %ymm7,%ymm8,%ymm7 + vpshufb %ymm15,%ymm7,%ymm7 + vpaddd %ymm2,%ymm9,%ymm9 + vpxor %ymm4,%ymm9,%ymm4 + vpshufb %ymm15,%ymm4,%ymm4 + vpaddd %ymm7,%ymm12,%ymm12 + vpxor %ymm1,%ymm12,%ymm1 + vpslld $12,%ymm1,%ymm14 + vpsrld $20,%ymm1,%ymm1 + vpor %ymm1,%ymm14,%ymm1 + vbroadcasti128 (%r11),%ymm14 + vpaddd %ymm4,%ymm13,%ymm13 + vpxor %ymm2,%ymm13,%ymm2 + vpslld $12,%ymm2,%ymm15 + vpsrld $20,%ymm2,%ymm2 + vpor %ymm2,%ymm15,%ymm2 + vpaddd %ymm1,%ymm8,%ymm8 + vpxor %ymm7,%ymm8,%ymm7 + vpshufb %ymm14,%ymm7,%ymm7 + vpaddd %ymm2,%ymm9,%ymm9 + vpxor %ymm4,%ymm9,%ymm4 + vpshufb %ymm14,%ymm4,%ymm4 + vpaddd %ymm7,%ymm12,%ymm12 + vpxor %ymm1,%ymm12,%ymm1 + vpslld $7,%ymm1,%ymm15 + vpsrld $25,%ymm1,%ymm1 + vpor %ymm1,%ymm15,%ymm1 + vbroadcasti128 (%r10),%ymm15 + vpaddd %ymm4,%ymm13,%ymm13 + vpxor %ymm2,%ymm13,%ymm2 + vpslld $7,%ymm2,%ymm14 + vpsrld $25,%ymm2,%ymm2 + vpor %ymm2,%ymm14,%ymm2 + vmovdqa %ymm12,64(%rsp) + vmovdqa %ymm13,96(%rsp) + vmovdqa 0(%rsp),%ymm12 + vmovdqa 32(%rsp),%ymm13 + vpaddd %ymm3,%ymm10,%ymm10 + vpxor %ymm5,%ymm10,%ymm5 + vpshufb %ymm15,%ymm5,%ymm5 + vpaddd %ymm0,%ymm11,%ymm11 + vpxor %ymm6,%ymm11,%ymm6 + vpshufb %ymm15,%ymm6,%ymm6 + vpaddd %ymm5,%ymm12,%ymm12 + vpxor %ymm3,%ymm12,%ymm3 + vpslld $12,%ymm3,%ymm14 + vpsrld $20,%ymm3,%ymm3 + vpor %ymm3,%ymm14,%ymm3 + vbroadcasti128 (%r11),%ymm14 + vpaddd %ymm6,%ymm13,%ymm13 + vpxor %ymm0,%ymm13,%ymm0 + vpslld $12,%ymm0,%ymm15 + vpsrld $20,%ymm0,%ymm0 + vpor %ymm0,%ymm15,%ymm0 + vpaddd %ymm3,%ymm10,%ymm10 + vpxor %ymm5,%ymm10,%ymm5 + vpshufb %ymm14,%ymm5,%ymm5 + vpaddd %ymm0,%ymm11,%ymm11 + vpxor %ymm6,%ymm11,%ymm6 + vpshufb %ymm14,%ymm6,%ymm6 + vpaddd %ymm5,%ymm12,%ymm12 + vpxor %ymm3,%ymm12,%ymm3 + vpslld $7,%ymm3,%ymm15 + vpsrld $25,%ymm3,%ymm3 + vpor %ymm3,%ymm15,%ymm3 + vbroadcasti128 (%r10),%ymm15 + vpaddd %ymm6,%ymm13,%ymm13 + vpxor %ymm0,%ymm13,%ymm0 + vpslld $7,%ymm0,%ymm14 + vpsrld $25,%ymm0,%ymm0 + vpor %ymm0,%ymm14,%ymm0 + decl %eax + jnz .Loop8x + + leaq 512(%rsp),%rax + vpaddd 128-256(%rcx),%ymm8,%ymm8 + vpaddd 160-256(%rcx),%ymm9,%ymm9 + vpaddd 192-256(%rcx),%ymm10,%ymm10 + vpaddd 224-256(%rcx),%ymm11,%ymm11 + + vpunpckldq %ymm9,%ymm8,%ymm14 + vpunpckldq %ymm11,%ymm10,%ymm15 + vpunpckhdq %ymm9,%ymm8,%ymm8 + vpunpckhdq %ymm11,%ymm10,%ymm10 + vpunpcklqdq %ymm15,%ymm14,%ymm9 + vpunpckhqdq %ymm15,%ymm14,%ymm14 + vpunpcklqdq %ymm10,%ymm8,%ymm11 + vpunpckhqdq %ymm10,%ymm8,%ymm8 + vpaddd 256-256(%rcx),%ymm0,%ymm0 + vpaddd 288-256(%rcx),%ymm1,%ymm1 + vpaddd 320-256(%rcx),%ymm2,%ymm2 + vpaddd 352-256(%rcx),%ymm3,%ymm3 + + vpunpckldq %ymm1,%ymm0,%ymm10 + vpunpckldq %ymm3,%ymm2,%ymm15 + vpunpckhdq %ymm1,%ymm0,%ymm0 + vpunpckhdq %ymm3,%ymm2,%ymm2 + vpunpcklqdq %ymm15,%ymm10,%ymm1 + vpunpckhqdq %ymm15,%ymm10,%ymm10 + vpunpcklqdq %ymm2,%ymm0,%ymm3 + vpunpckhqdq %ymm2,%ymm0,%ymm0 + vperm2i128 $0x20,%ymm1,%ymm9,%ymm15 + vperm2i128 $0x31,%ymm1,%ymm9,%ymm1 + vperm2i128 $0x20,%ymm10,%ymm14,%ymm9 + vperm2i128 $0x31,%ymm10,%ymm14,%ymm10 + vperm2i128 $0x20,%ymm3,%ymm11,%ymm14 + vperm2i128 $0x31,%ymm3,%ymm11,%ymm3 + vperm2i128 $0x20,%ymm0,%ymm8,%ymm11 + vperm2i128 $0x31,%ymm0,%ymm8,%ymm0 + vmovdqa %ymm15,0(%rsp) + vmovdqa %ymm9,32(%rsp) + 
vmovdqa 64(%rsp),%ymm15 + vmovdqa 96(%rsp),%ymm9 + + vpaddd 384-512(%rax),%ymm12,%ymm12 + vpaddd 416-512(%rax),%ymm13,%ymm13 + vpaddd 448-512(%rax),%ymm15,%ymm15 + vpaddd 480-512(%rax),%ymm9,%ymm9 + + vpunpckldq %ymm13,%ymm12,%ymm2 + vpunpckldq %ymm9,%ymm15,%ymm8 + vpunpckhdq %ymm13,%ymm12,%ymm12 + vpunpckhdq %ymm9,%ymm15,%ymm15 + vpunpcklqdq %ymm8,%ymm2,%ymm13 + vpunpckhqdq %ymm8,%ymm2,%ymm2 + vpunpcklqdq %ymm15,%ymm12,%ymm9 + vpunpckhqdq %ymm15,%ymm12,%ymm12 + vpaddd 512-512(%rax),%ymm4,%ymm4 + vpaddd 544-512(%rax),%ymm5,%ymm5 + vpaddd 576-512(%rax),%ymm6,%ymm6 + vpaddd 608-512(%rax),%ymm7,%ymm7 + + vpunpckldq %ymm5,%ymm4,%ymm15 + vpunpckldq %ymm7,%ymm6,%ymm8 + vpunpckhdq %ymm5,%ymm4,%ymm4 + vpunpckhdq %ymm7,%ymm6,%ymm6 + vpunpcklqdq %ymm8,%ymm15,%ymm5 + vpunpckhqdq %ymm8,%ymm15,%ymm15 + vpunpcklqdq %ymm6,%ymm4,%ymm7 + vpunpckhqdq %ymm6,%ymm4,%ymm4 + vperm2i128 $0x20,%ymm5,%ymm13,%ymm8 + vperm2i128 $0x31,%ymm5,%ymm13,%ymm5 + vperm2i128 $0x20,%ymm15,%ymm2,%ymm13 + vperm2i128 $0x31,%ymm15,%ymm2,%ymm15 + vperm2i128 $0x20,%ymm7,%ymm9,%ymm2 + vperm2i128 $0x31,%ymm7,%ymm9,%ymm7 + vperm2i128 $0x20,%ymm4,%ymm12,%ymm9 + vperm2i128 $0x31,%ymm4,%ymm12,%ymm4 + vmovdqa 0(%rsp),%ymm6 + vmovdqa 32(%rsp),%ymm12 + + cmpq $512,%rdx + jb .Ltail8x + + vpxor 0(%rsi),%ymm6,%ymm6 + vpxor 32(%rsi),%ymm8,%ymm8 + vpxor 64(%rsi),%ymm1,%ymm1 + vpxor 96(%rsi),%ymm5,%ymm5 + leaq 128(%rsi),%rsi + vmovdqu %ymm6,0(%rdi) + vmovdqu %ymm8,32(%rdi) + vmovdqu %ymm1,64(%rdi) + vmovdqu %ymm5,96(%rdi) + leaq 128(%rdi),%rdi + + vpxor 0(%rsi),%ymm12,%ymm12 + vpxor 32(%rsi),%ymm13,%ymm13 + vpxor 64(%rsi),%ymm10,%ymm10 + vpxor 96(%rsi),%ymm15,%ymm15 + leaq 128(%rsi),%rsi + vmovdqu %ymm12,0(%rdi) + vmovdqu %ymm13,32(%rdi) + vmovdqu %ymm10,64(%rdi) + vmovdqu %ymm15,96(%rdi) + leaq 128(%rdi),%rdi + + vpxor 0(%rsi),%ymm14,%ymm14 + vpxor 32(%rsi),%ymm2,%ymm2 + vpxor 64(%rsi),%ymm3,%ymm3 + vpxor 96(%rsi),%ymm7,%ymm7 + leaq 128(%rsi),%rsi + vmovdqu %ymm14,0(%rdi) + vmovdqu %ymm2,32(%rdi) + vmovdqu %ymm3,64(%rdi) + vmovdqu %ymm7,96(%rdi) + leaq 128(%rdi),%rdi + + vpxor 0(%rsi),%ymm11,%ymm11 + vpxor 32(%rsi),%ymm9,%ymm9 + vpxor 64(%rsi),%ymm0,%ymm0 + vpxor 96(%rsi),%ymm4,%ymm4 + leaq 128(%rsi),%rsi + vmovdqu %ymm11,0(%rdi) + vmovdqu %ymm9,32(%rdi) + vmovdqu %ymm0,64(%rdi) + vmovdqu %ymm4,96(%rdi) + leaq 128(%rdi),%rdi + + subq $512,%rdx + jnz .Loop_outer8x + + jmp .Ldone8x + +.Ltail8x: + cmpq $448,%rdx + jae .L448_or_more8x + cmpq $384,%rdx + jae .L384_or_more8x + cmpq $320,%rdx + jae .L320_or_more8x + cmpq $256,%rdx + jae .L256_or_more8x + cmpq $192,%rdx + jae .L192_or_more8x + cmpq $128,%rdx + jae .L128_or_more8x + cmpq $64,%rdx + jae .L64_or_more8x + + xorq %r10,%r10 + vmovdqa %ymm6,0(%rsp) + vmovdqa %ymm8,32(%rsp) + jmp .Loop_tail8x + +.align 32 +.L64_or_more8x: + vpxor 0(%rsi),%ymm6,%ymm6 + vpxor 32(%rsi),%ymm8,%ymm8 + vmovdqu %ymm6,0(%rdi) + vmovdqu %ymm8,32(%rdi) + je .Ldone8x + + leaq 64(%rsi),%rsi + xorq %r10,%r10 + vmovdqa %ymm1,0(%rsp) + leaq 64(%rdi),%rdi + subq $64,%rdx + vmovdqa %ymm5,32(%rsp) + jmp .Loop_tail8x + +.align 32 +.L128_or_more8x: + vpxor 0(%rsi),%ymm6,%ymm6 + vpxor 32(%rsi),%ymm8,%ymm8 + vpxor 64(%rsi),%ymm1,%ymm1 + vpxor 96(%rsi),%ymm5,%ymm5 + vmovdqu %ymm6,0(%rdi) + vmovdqu %ymm8,32(%rdi) + vmovdqu %ymm1,64(%rdi) + vmovdqu %ymm5,96(%rdi) + je .Ldone8x + + leaq 128(%rsi),%rsi + xorq %r10,%r10 + vmovdqa %ymm12,0(%rsp) + leaq 128(%rdi),%rdi + subq $128,%rdx + vmovdqa %ymm13,32(%rsp) + jmp .Loop_tail8x + +.align 32 +.L192_or_more8x: + vpxor 0(%rsi),%ymm6,%ymm6 + vpxor 32(%rsi),%ymm8,%ymm8 + vpxor 
64(%rsi),%ymm1,%ymm1 + vpxor 96(%rsi),%ymm5,%ymm5 + vpxor 128(%rsi),%ymm12,%ymm12 + vpxor 160(%rsi),%ymm13,%ymm13 + vmovdqu %ymm6,0(%rdi) + vmovdqu %ymm8,32(%rdi) + vmovdqu %ymm1,64(%rdi) + vmovdqu %ymm5,96(%rdi) + vmovdqu %ymm12,128(%rdi) + vmovdqu %ymm13,160(%rdi) + je .Ldone8x + + leaq 192(%rsi),%rsi + xorq %r10,%r10 + vmovdqa %ymm10,0(%rsp) + leaq 192(%rdi),%rdi + subq $192,%rdx + vmovdqa %ymm15,32(%rsp) + jmp .Loop_tail8x + +.align 32 +.L256_or_more8x: + vpxor 0(%rsi),%ymm6,%ymm6 + vpxor 32(%rsi),%ymm8,%ymm8 + vpxor 64(%rsi),%ymm1,%ymm1 + vpxor 96(%rsi),%ymm5,%ymm5 + vpxor 128(%rsi),%ymm12,%ymm12 + vpxor 160(%rsi),%ymm13,%ymm13 + vpxor 192(%rsi),%ymm10,%ymm10 + vpxor 224(%rsi),%ymm15,%ymm15 + vmovdqu %ymm6,0(%rdi) + vmovdqu %ymm8,32(%rdi) + vmovdqu %ymm1,64(%rdi) + vmovdqu %ymm5,96(%rdi) + vmovdqu %ymm12,128(%rdi) + vmovdqu %ymm13,160(%rdi) + vmovdqu %ymm10,192(%rdi) + vmovdqu %ymm15,224(%rdi) + je .Ldone8x + + leaq 256(%rsi),%rsi + xorq %r10,%r10 + vmovdqa %ymm14,0(%rsp) + leaq 256(%rdi),%rdi + subq $256,%rdx + vmovdqa %ymm2,32(%rsp) + jmp .Loop_tail8x + +.align 32 +.L320_or_more8x: + vpxor 0(%rsi),%ymm6,%ymm6 + vpxor 32(%rsi),%ymm8,%ymm8 + vpxor 64(%rsi),%ymm1,%ymm1 + vpxor 96(%rsi),%ymm5,%ymm5 + vpxor 128(%rsi),%ymm12,%ymm12 + vpxor 160(%rsi),%ymm13,%ymm13 + vpxor 192(%rsi),%ymm10,%ymm10 + vpxor 224(%rsi),%ymm15,%ymm15 + vpxor 256(%rsi),%ymm14,%ymm14 + vpxor 288(%rsi),%ymm2,%ymm2 + vmovdqu %ymm6,0(%rdi) + vmovdqu %ymm8,32(%rdi) + vmovdqu %ymm1,64(%rdi) + vmovdqu %ymm5,96(%rdi) + vmovdqu %ymm12,128(%rdi) + vmovdqu %ymm13,160(%rdi) + vmovdqu %ymm10,192(%rdi) + vmovdqu %ymm15,224(%rdi) + vmovdqu %ymm14,256(%rdi) + vmovdqu %ymm2,288(%rdi) + je .Ldone8x + + leaq 320(%rsi),%rsi + xorq %r10,%r10 + vmovdqa %ymm3,0(%rsp) + leaq 320(%rdi),%rdi + subq $320,%rdx + vmovdqa %ymm7,32(%rsp) + jmp .Loop_tail8x + +.align 32 +.L384_or_more8x: + vpxor 0(%rsi),%ymm6,%ymm6 + vpxor 32(%rsi),%ymm8,%ymm8 + vpxor 64(%rsi),%ymm1,%ymm1 + vpxor 96(%rsi),%ymm5,%ymm5 + vpxor 128(%rsi),%ymm12,%ymm12 + vpxor 160(%rsi),%ymm13,%ymm13 + vpxor 192(%rsi),%ymm10,%ymm10 + vpxor 224(%rsi),%ymm15,%ymm15 + vpxor 256(%rsi),%ymm14,%ymm14 + vpxor 288(%rsi),%ymm2,%ymm2 + vpxor 320(%rsi),%ymm3,%ymm3 + vpxor 352(%rsi),%ymm7,%ymm7 + vmovdqu %ymm6,0(%rdi) + vmovdqu %ymm8,32(%rdi) + vmovdqu %ymm1,64(%rdi) + vmovdqu %ymm5,96(%rdi) + vmovdqu %ymm12,128(%rdi) + vmovdqu %ymm13,160(%rdi) + vmovdqu %ymm10,192(%rdi) + vmovdqu %ymm15,224(%rdi) + vmovdqu %ymm14,256(%rdi) + vmovdqu %ymm2,288(%rdi) + vmovdqu %ymm3,320(%rdi) + vmovdqu %ymm7,352(%rdi) + je .Ldone8x + + leaq 384(%rsi),%rsi + xorq %r10,%r10 + vmovdqa %ymm11,0(%rsp) + leaq 384(%rdi),%rdi + subq $384,%rdx + vmovdqa %ymm9,32(%rsp) + jmp .Loop_tail8x + +.align 32 +.L448_or_more8x: + vpxor 0(%rsi),%ymm6,%ymm6 + vpxor 32(%rsi),%ymm8,%ymm8 + vpxor 64(%rsi),%ymm1,%ymm1 + vpxor 96(%rsi),%ymm5,%ymm5 + vpxor 128(%rsi),%ymm12,%ymm12 + vpxor 160(%rsi),%ymm13,%ymm13 + vpxor 192(%rsi),%ymm10,%ymm10 + vpxor 224(%rsi),%ymm15,%ymm15 + vpxor 256(%rsi),%ymm14,%ymm14 + vpxor 288(%rsi),%ymm2,%ymm2 + vpxor 320(%rsi),%ymm3,%ymm3 + vpxor 352(%rsi),%ymm7,%ymm7 + vpxor 384(%rsi),%ymm11,%ymm11 + vpxor 416(%rsi),%ymm9,%ymm9 + vmovdqu %ymm6,0(%rdi) + vmovdqu %ymm8,32(%rdi) + vmovdqu %ymm1,64(%rdi) + vmovdqu %ymm5,96(%rdi) + vmovdqu %ymm12,128(%rdi) + vmovdqu %ymm13,160(%rdi) + vmovdqu %ymm10,192(%rdi) + vmovdqu %ymm15,224(%rdi) + vmovdqu %ymm14,256(%rdi) + vmovdqu %ymm2,288(%rdi) + vmovdqu %ymm3,320(%rdi) + vmovdqu %ymm7,352(%rdi) + vmovdqu %ymm11,384(%rdi) + vmovdqu %ymm9,416(%rdi) + je .Ldone8x + + leaq 
448(%rsi),%rsi + xorq %r10,%r10 + vmovdqa %ymm0,0(%rsp) + leaq 448(%rdi),%rdi + subq $448,%rdx + vmovdqa %ymm4,32(%rsp) + +.Loop_tail8x: + movzbl (%rsi,%r10,1),%eax + movzbl (%rsp,%r10,1),%ecx + leaq 1(%r10),%r10 + xorl %ecx,%eax + movb %al,-1(%rdi,%r10,1) + decq %rdx + jnz .Loop_tail8x + +.Ldone8x: + vzeroall + leaq (%r9),%rsp +.cfi_def_cfa_register rsp +.L8x_epilogue: + ret +.cfi_endproc +.size ChaCha20_ctr32_avx2,.-ChaCha20_ctr32_avx2 +#endif diff --git a/ring-0.17.14/pregenerated/chacha-x86_64-macosx.S b/ring-0.17.14/pregenerated/chacha-x86_64-macosx.S new file mode 100644 index 0000000000..60cc8717d1 --- /dev/null +++ b/ring-0.17.14/pregenerated/chacha-x86_64-macosx.S @@ -0,0 +1,1468 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__) +.text + +.section __DATA,__const +.p2align 6 +L$zero: +.long 0,0,0,0 +L$one: +.long 1,0,0,0 +L$inc: +.long 0,1,2,3 +L$four: +.long 4,4,4,4 +L$incy: +.long 0,2,4,6,1,3,5,7 +L$eight: +.long 8,8,8,8,8,8,8,8 +L$rot16: +.byte 0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd +L$rot24: +.byte 0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe +L$sigma: +.byte 101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107,0 +.p2align 6 +L$zeroz: +.long 0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0 +L$fourz: +.long 4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0 +L$incz: +.long 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 +L$sixteen: +.long 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16 +.byte 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.text +.globl _ChaCha20_ctr32_nohw +.private_extern _ChaCha20_ctr32_nohw + +.p2align 6 +_ChaCha20_ctr32_nohw: + +_CET_ENDBR + pushq %rbx + + pushq %rbp + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $64+24,%rsp + +L$ctr32_body: + + + movdqu (%rcx),%xmm1 + movdqu 16(%rcx),%xmm2 + movdqu (%r8),%xmm3 + movdqa L$one(%rip),%xmm4 + + + movdqa %xmm1,16(%rsp) + movdqa %xmm2,32(%rsp) + movdqa %xmm3,48(%rsp) + movq %rdx,%rbp + jmp L$oop_outer + +.p2align 5 +L$oop_outer: + movl $0x61707865,%eax + movl $0x3320646e,%ebx + movl $0x79622d32,%ecx + movl $0x6b206574,%edx + movl 16(%rsp),%r8d + movl 20(%rsp),%r9d + movl 24(%rsp),%r10d + movl 28(%rsp),%r11d + movd %xmm3,%r12d + movl 52(%rsp),%r13d + movl 56(%rsp),%r14d + movl 60(%rsp),%r15d + + movq %rbp,64+0(%rsp) + movl $10,%ebp + movq %rsi,64+8(%rsp) +.byte 102,72,15,126,214 + movq %rdi,64+16(%rsp) + movq %rsi,%rdi + shrq $32,%rdi + jmp L$oop + +.p2align 5 +L$oop: + addl %r8d,%eax + xorl %eax,%r12d + roll $16,%r12d + addl %r9d,%ebx + xorl %ebx,%r13d + roll $16,%r13d + addl %r12d,%esi + xorl %esi,%r8d + roll $12,%r8d + addl %r13d,%edi + xorl %edi,%r9d + roll $12,%r9d + addl %r8d,%eax + xorl %eax,%r12d + roll $8,%r12d + addl %r9d,%ebx + xorl %ebx,%r13d + roll $8,%r13d + addl %r12d,%esi + xorl %esi,%r8d + roll $7,%r8d + addl %r13d,%edi + xorl %edi,%r9d + roll $7,%r9d + movl %esi,32(%rsp) + movl %edi,36(%rsp) + movl 40(%rsp),%esi + movl 44(%rsp),%edi + addl %r10d,%ecx + xorl %ecx,%r14d + roll $16,%r14d + addl %r11d,%edx + xorl %edx,%r15d + roll $16,%r15d + addl %r14d,%esi + xorl %esi,%r10d + roll $12,%r10d + addl %r15d,%edi + xorl %edi,%r11d + roll $12,%r11d + addl %r10d,%ecx + xorl %ecx,%r14d + roll $8,%r14d + addl %r11d,%edx + xorl %edx,%r15d + roll $8,%r15d + addl %r14d,%esi + 
xorl %esi,%r10d + roll $7,%r10d + addl %r15d,%edi + xorl %edi,%r11d + roll $7,%r11d + addl %r9d,%eax + xorl %eax,%r15d + roll $16,%r15d + addl %r10d,%ebx + xorl %ebx,%r12d + roll $16,%r12d + addl %r15d,%esi + xorl %esi,%r9d + roll $12,%r9d + addl %r12d,%edi + xorl %edi,%r10d + roll $12,%r10d + addl %r9d,%eax + xorl %eax,%r15d + roll $8,%r15d + addl %r10d,%ebx + xorl %ebx,%r12d + roll $8,%r12d + addl %r15d,%esi + xorl %esi,%r9d + roll $7,%r9d + addl %r12d,%edi + xorl %edi,%r10d + roll $7,%r10d + movl %esi,40(%rsp) + movl %edi,44(%rsp) + movl 32(%rsp),%esi + movl 36(%rsp),%edi + addl %r11d,%ecx + xorl %ecx,%r13d + roll $16,%r13d + addl %r8d,%edx + xorl %edx,%r14d + roll $16,%r14d + addl %r13d,%esi + xorl %esi,%r11d + roll $12,%r11d + addl %r14d,%edi + xorl %edi,%r8d + roll $12,%r8d + addl %r11d,%ecx + xorl %ecx,%r13d + roll $8,%r13d + addl %r8d,%edx + xorl %edx,%r14d + roll $8,%r14d + addl %r13d,%esi + xorl %esi,%r11d + roll $7,%r11d + addl %r14d,%edi + xorl %edi,%r8d + roll $7,%r8d + decl %ebp + jnz L$oop + movl %edi,36(%rsp) + movl %esi,32(%rsp) + movq 64(%rsp),%rbp + movdqa %xmm2,%xmm1 + movq 64+8(%rsp),%rsi + paddd %xmm4,%xmm3 + movq 64+16(%rsp),%rdi + + addl $0x61707865,%eax + addl $0x3320646e,%ebx + addl $0x79622d32,%ecx + addl $0x6b206574,%edx + addl 16(%rsp),%r8d + addl 20(%rsp),%r9d + addl 24(%rsp),%r10d + addl 28(%rsp),%r11d + addl 48(%rsp),%r12d + addl 52(%rsp),%r13d + addl 56(%rsp),%r14d + addl 60(%rsp),%r15d + paddd 32(%rsp),%xmm1 + + cmpq $64,%rbp + jb L$tail + + xorl 0(%rsi),%eax + xorl 4(%rsi),%ebx + xorl 8(%rsi),%ecx + xorl 12(%rsi),%edx + xorl 16(%rsi),%r8d + xorl 20(%rsi),%r9d + xorl 24(%rsi),%r10d + xorl 28(%rsi),%r11d + movdqu 32(%rsi),%xmm0 + xorl 48(%rsi),%r12d + xorl 52(%rsi),%r13d + xorl 56(%rsi),%r14d + xorl 60(%rsi),%r15d + leaq 64(%rsi),%rsi + pxor %xmm1,%xmm0 + + movdqa %xmm2,32(%rsp) + movd %xmm3,48(%rsp) + + movl %eax,0(%rdi) + movl %ebx,4(%rdi) + movl %ecx,8(%rdi) + movl %edx,12(%rdi) + movl %r8d,16(%rdi) + movl %r9d,20(%rdi) + movl %r10d,24(%rdi) + movl %r11d,28(%rdi) + movdqu %xmm0,32(%rdi) + movl %r12d,48(%rdi) + movl %r13d,52(%rdi) + movl %r14d,56(%rdi) + movl %r15d,60(%rdi) + leaq 64(%rdi),%rdi + + subq $64,%rbp + jnz L$oop_outer + + jmp L$done + +.p2align 4 +L$tail: + movl %eax,0(%rsp) + movl %ebx,4(%rsp) + xorq %rbx,%rbx + movl %ecx,8(%rsp) + movl %edx,12(%rsp) + movl %r8d,16(%rsp) + movl %r9d,20(%rsp) + movl %r10d,24(%rsp) + movl %r11d,28(%rsp) + movdqa %xmm1,32(%rsp) + movl %r12d,48(%rsp) + movl %r13d,52(%rsp) + movl %r14d,56(%rsp) + movl %r15d,60(%rsp) + +L$oop_tail: + movzbl (%rsi,%rbx,1),%eax + movzbl (%rsp,%rbx,1),%edx + leaq 1(%rbx),%rbx + xorl %edx,%eax + movb %al,-1(%rdi,%rbx,1) + decq %rbp + jnz L$oop_tail + +L$done: + leaq 64+24+48(%rsp),%rsi + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbp + + movq -8(%rsi),%rbx + + leaq (%rsi),%rsp + +L$no_data: + ret + + +.globl _ChaCha20_ctr32_ssse3_4x +.private_extern _ChaCha20_ctr32_ssse3_4x + +.p2align 5 +_ChaCha20_ctr32_ssse3_4x: + +_CET_ENDBR + movq %rsp,%r9 + + subq $0x140+8,%rsp + movdqa L$sigma(%rip),%xmm11 + movdqu (%rcx),%xmm15 + movdqu 16(%rcx),%xmm7 + movdqu (%r8),%xmm3 + leaq 256(%rsp),%rcx + leaq L$rot16(%rip),%r10 + leaq L$rot24(%rip),%r11 + + pshufd $0x00,%xmm11,%xmm8 + pshufd $0x55,%xmm11,%xmm9 + movdqa %xmm8,64(%rsp) + pshufd $0xaa,%xmm11,%xmm10 + movdqa %xmm9,80(%rsp) + pshufd $0xff,%xmm11,%xmm11 + movdqa %xmm10,96(%rsp) + movdqa %xmm11,112(%rsp) + + pshufd $0x00,%xmm15,%xmm12 + pshufd $0x55,%xmm15,%xmm13 + movdqa 
%xmm12,128-256(%rcx) + pshufd $0xaa,%xmm15,%xmm14 + movdqa %xmm13,144-256(%rcx) + pshufd $0xff,%xmm15,%xmm15 + movdqa %xmm14,160-256(%rcx) + movdqa %xmm15,176-256(%rcx) + + pshufd $0x00,%xmm7,%xmm4 + pshufd $0x55,%xmm7,%xmm5 + movdqa %xmm4,192-256(%rcx) + pshufd $0xaa,%xmm7,%xmm6 + movdqa %xmm5,208-256(%rcx) + pshufd $0xff,%xmm7,%xmm7 + movdqa %xmm6,224-256(%rcx) + movdqa %xmm7,240-256(%rcx) + + pshufd $0x00,%xmm3,%xmm0 + pshufd $0x55,%xmm3,%xmm1 + paddd L$inc(%rip),%xmm0 + pshufd $0xaa,%xmm3,%xmm2 + movdqa %xmm1,272-256(%rcx) + pshufd $0xff,%xmm3,%xmm3 + movdqa %xmm2,288-256(%rcx) + movdqa %xmm3,304-256(%rcx) + + jmp L$oop_enter4x + +.p2align 5 +L$oop_outer4x: + movdqa 64(%rsp),%xmm8 + movdqa 80(%rsp),%xmm9 + movdqa 96(%rsp),%xmm10 + movdqa 112(%rsp),%xmm11 + movdqa 128-256(%rcx),%xmm12 + movdqa 144-256(%rcx),%xmm13 + movdqa 160-256(%rcx),%xmm14 + movdqa 176-256(%rcx),%xmm15 + movdqa 192-256(%rcx),%xmm4 + movdqa 208-256(%rcx),%xmm5 + movdqa 224-256(%rcx),%xmm6 + movdqa 240-256(%rcx),%xmm7 + movdqa 256-256(%rcx),%xmm0 + movdqa 272-256(%rcx),%xmm1 + movdqa 288-256(%rcx),%xmm2 + movdqa 304-256(%rcx),%xmm3 + paddd L$four(%rip),%xmm0 + +L$oop_enter4x: + movdqa %xmm6,32(%rsp) + movdqa %xmm7,48(%rsp) + movdqa (%r10),%xmm7 + movl $10,%eax + movdqa %xmm0,256-256(%rcx) + jmp L$oop4x + +.p2align 5 +L$oop4x: + paddd %xmm12,%xmm8 + paddd %xmm13,%xmm9 + pxor %xmm8,%xmm0 + pxor %xmm9,%xmm1 +.byte 102,15,56,0,199 +.byte 102,15,56,0,207 + paddd %xmm0,%xmm4 + paddd %xmm1,%xmm5 + pxor %xmm4,%xmm12 + pxor %xmm5,%xmm13 + movdqa %xmm12,%xmm6 + pslld $12,%xmm12 + psrld $20,%xmm6 + movdqa %xmm13,%xmm7 + pslld $12,%xmm13 + por %xmm6,%xmm12 + psrld $20,%xmm7 + movdqa (%r11),%xmm6 + por %xmm7,%xmm13 + paddd %xmm12,%xmm8 + paddd %xmm13,%xmm9 + pxor %xmm8,%xmm0 + pxor %xmm9,%xmm1 +.byte 102,15,56,0,198 +.byte 102,15,56,0,206 + paddd %xmm0,%xmm4 + paddd %xmm1,%xmm5 + pxor %xmm4,%xmm12 + pxor %xmm5,%xmm13 + movdqa %xmm12,%xmm7 + pslld $7,%xmm12 + psrld $25,%xmm7 + movdqa %xmm13,%xmm6 + pslld $7,%xmm13 + por %xmm7,%xmm12 + psrld $25,%xmm6 + movdqa (%r10),%xmm7 + por %xmm6,%xmm13 + movdqa %xmm4,0(%rsp) + movdqa %xmm5,16(%rsp) + movdqa 32(%rsp),%xmm4 + movdqa 48(%rsp),%xmm5 + paddd %xmm14,%xmm10 + paddd %xmm15,%xmm11 + pxor %xmm10,%xmm2 + pxor %xmm11,%xmm3 +.byte 102,15,56,0,215 +.byte 102,15,56,0,223 + paddd %xmm2,%xmm4 + paddd %xmm3,%xmm5 + pxor %xmm4,%xmm14 + pxor %xmm5,%xmm15 + movdqa %xmm14,%xmm6 + pslld $12,%xmm14 + psrld $20,%xmm6 + movdqa %xmm15,%xmm7 + pslld $12,%xmm15 + por %xmm6,%xmm14 + psrld $20,%xmm7 + movdqa (%r11),%xmm6 + por %xmm7,%xmm15 + paddd %xmm14,%xmm10 + paddd %xmm15,%xmm11 + pxor %xmm10,%xmm2 + pxor %xmm11,%xmm3 +.byte 102,15,56,0,214 +.byte 102,15,56,0,222 + paddd %xmm2,%xmm4 + paddd %xmm3,%xmm5 + pxor %xmm4,%xmm14 + pxor %xmm5,%xmm15 + movdqa %xmm14,%xmm7 + pslld $7,%xmm14 + psrld $25,%xmm7 + movdqa %xmm15,%xmm6 + pslld $7,%xmm15 + por %xmm7,%xmm14 + psrld $25,%xmm6 + movdqa (%r10),%xmm7 + por %xmm6,%xmm15 + paddd %xmm13,%xmm8 + paddd %xmm14,%xmm9 + pxor %xmm8,%xmm3 + pxor %xmm9,%xmm0 +.byte 102,15,56,0,223 +.byte 102,15,56,0,199 + paddd %xmm3,%xmm4 + paddd %xmm0,%xmm5 + pxor %xmm4,%xmm13 + pxor %xmm5,%xmm14 + movdqa %xmm13,%xmm6 + pslld $12,%xmm13 + psrld $20,%xmm6 + movdqa %xmm14,%xmm7 + pslld $12,%xmm14 + por %xmm6,%xmm13 + psrld $20,%xmm7 + movdqa (%r11),%xmm6 + por %xmm7,%xmm14 + paddd %xmm13,%xmm8 + paddd %xmm14,%xmm9 + pxor %xmm8,%xmm3 + pxor %xmm9,%xmm0 +.byte 102,15,56,0,222 +.byte 102,15,56,0,198 + paddd %xmm3,%xmm4 + paddd %xmm0,%xmm5 + pxor %xmm4,%xmm13 + pxor %xmm5,%xmm14 + movdqa 
%xmm13,%xmm7 + pslld $7,%xmm13 + psrld $25,%xmm7 + movdqa %xmm14,%xmm6 + pslld $7,%xmm14 + por %xmm7,%xmm13 + psrld $25,%xmm6 + movdqa (%r10),%xmm7 + por %xmm6,%xmm14 + movdqa %xmm4,32(%rsp) + movdqa %xmm5,48(%rsp) + movdqa 0(%rsp),%xmm4 + movdqa 16(%rsp),%xmm5 + paddd %xmm15,%xmm10 + paddd %xmm12,%xmm11 + pxor %xmm10,%xmm1 + pxor %xmm11,%xmm2 +.byte 102,15,56,0,207 +.byte 102,15,56,0,215 + paddd %xmm1,%xmm4 + paddd %xmm2,%xmm5 + pxor %xmm4,%xmm15 + pxor %xmm5,%xmm12 + movdqa %xmm15,%xmm6 + pslld $12,%xmm15 + psrld $20,%xmm6 + movdqa %xmm12,%xmm7 + pslld $12,%xmm12 + por %xmm6,%xmm15 + psrld $20,%xmm7 + movdqa (%r11),%xmm6 + por %xmm7,%xmm12 + paddd %xmm15,%xmm10 + paddd %xmm12,%xmm11 + pxor %xmm10,%xmm1 + pxor %xmm11,%xmm2 +.byte 102,15,56,0,206 +.byte 102,15,56,0,214 + paddd %xmm1,%xmm4 + paddd %xmm2,%xmm5 + pxor %xmm4,%xmm15 + pxor %xmm5,%xmm12 + movdqa %xmm15,%xmm7 + pslld $7,%xmm15 + psrld $25,%xmm7 + movdqa %xmm12,%xmm6 + pslld $7,%xmm12 + por %xmm7,%xmm15 + psrld $25,%xmm6 + movdqa (%r10),%xmm7 + por %xmm6,%xmm12 + decl %eax + jnz L$oop4x + + paddd 64(%rsp),%xmm8 + paddd 80(%rsp),%xmm9 + paddd 96(%rsp),%xmm10 + paddd 112(%rsp),%xmm11 + + movdqa %xmm8,%xmm6 + punpckldq %xmm9,%xmm8 + movdqa %xmm10,%xmm7 + punpckldq %xmm11,%xmm10 + punpckhdq %xmm9,%xmm6 + punpckhdq %xmm11,%xmm7 + movdqa %xmm8,%xmm9 + punpcklqdq %xmm10,%xmm8 + movdqa %xmm6,%xmm11 + punpcklqdq %xmm7,%xmm6 + punpckhqdq %xmm10,%xmm9 + punpckhqdq %xmm7,%xmm11 + paddd 128-256(%rcx),%xmm12 + paddd 144-256(%rcx),%xmm13 + paddd 160-256(%rcx),%xmm14 + paddd 176-256(%rcx),%xmm15 + + movdqa %xmm8,0(%rsp) + movdqa %xmm9,16(%rsp) + movdqa 32(%rsp),%xmm8 + movdqa 48(%rsp),%xmm9 + + movdqa %xmm12,%xmm10 + punpckldq %xmm13,%xmm12 + movdqa %xmm14,%xmm7 + punpckldq %xmm15,%xmm14 + punpckhdq %xmm13,%xmm10 + punpckhdq %xmm15,%xmm7 + movdqa %xmm12,%xmm13 + punpcklqdq %xmm14,%xmm12 + movdqa %xmm10,%xmm15 + punpcklqdq %xmm7,%xmm10 + punpckhqdq %xmm14,%xmm13 + punpckhqdq %xmm7,%xmm15 + paddd 192-256(%rcx),%xmm4 + paddd 208-256(%rcx),%xmm5 + paddd 224-256(%rcx),%xmm8 + paddd 240-256(%rcx),%xmm9 + + movdqa %xmm6,32(%rsp) + movdqa %xmm11,48(%rsp) + + movdqa %xmm4,%xmm14 + punpckldq %xmm5,%xmm4 + movdqa %xmm8,%xmm7 + punpckldq %xmm9,%xmm8 + punpckhdq %xmm5,%xmm14 + punpckhdq %xmm9,%xmm7 + movdqa %xmm4,%xmm5 + punpcklqdq %xmm8,%xmm4 + movdqa %xmm14,%xmm9 + punpcklqdq %xmm7,%xmm14 + punpckhqdq %xmm8,%xmm5 + punpckhqdq %xmm7,%xmm9 + paddd 256-256(%rcx),%xmm0 + paddd 272-256(%rcx),%xmm1 + paddd 288-256(%rcx),%xmm2 + paddd 304-256(%rcx),%xmm3 + + movdqa %xmm0,%xmm8 + punpckldq %xmm1,%xmm0 + movdqa %xmm2,%xmm7 + punpckldq %xmm3,%xmm2 + punpckhdq %xmm1,%xmm8 + punpckhdq %xmm3,%xmm7 + movdqa %xmm0,%xmm1 + punpcklqdq %xmm2,%xmm0 + movdqa %xmm8,%xmm3 + punpcklqdq %xmm7,%xmm8 + punpckhqdq %xmm2,%xmm1 + punpckhqdq %xmm7,%xmm3 + cmpq $256,%rdx + jb L$tail4x + + movdqu 0(%rsi),%xmm6 + movdqu 16(%rsi),%xmm11 + movdqu 32(%rsi),%xmm2 + movdqu 48(%rsi),%xmm7 + pxor 0(%rsp),%xmm6 + pxor %xmm12,%xmm11 + pxor %xmm4,%xmm2 + pxor %xmm0,%xmm7 + + movdqu %xmm6,0(%rdi) + movdqu 64(%rsi),%xmm6 + movdqu %xmm11,16(%rdi) + movdqu 80(%rsi),%xmm11 + movdqu %xmm2,32(%rdi) + movdqu 96(%rsi),%xmm2 + movdqu %xmm7,48(%rdi) + movdqu 112(%rsi),%xmm7 + leaq 128(%rsi),%rsi + pxor 16(%rsp),%xmm6 + pxor %xmm13,%xmm11 + pxor %xmm5,%xmm2 + pxor %xmm1,%xmm7 + + movdqu %xmm6,64(%rdi) + movdqu 0(%rsi),%xmm6 + movdqu %xmm11,80(%rdi) + movdqu 16(%rsi),%xmm11 + movdqu %xmm2,96(%rdi) + movdqu 32(%rsi),%xmm2 + movdqu %xmm7,112(%rdi) + leaq 128(%rdi),%rdi + movdqu 48(%rsi),%xmm7 + pxor 32(%rsp),%xmm6 
+ pxor %xmm10,%xmm11 + pxor %xmm14,%xmm2 + pxor %xmm8,%xmm7 + + movdqu %xmm6,0(%rdi) + movdqu 64(%rsi),%xmm6 + movdqu %xmm11,16(%rdi) + movdqu 80(%rsi),%xmm11 + movdqu %xmm2,32(%rdi) + movdqu 96(%rsi),%xmm2 + movdqu %xmm7,48(%rdi) + movdqu 112(%rsi),%xmm7 + leaq 128(%rsi),%rsi + pxor 48(%rsp),%xmm6 + pxor %xmm15,%xmm11 + pxor %xmm9,%xmm2 + pxor %xmm3,%xmm7 + movdqu %xmm6,64(%rdi) + movdqu %xmm11,80(%rdi) + movdqu %xmm2,96(%rdi) + movdqu %xmm7,112(%rdi) + leaq 128(%rdi),%rdi + + subq $256,%rdx + jnz L$oop_outer4x + + jmp L$done4x + +L$tail4x: + cmpq $192,%rdx + jae L$192_or_more4x + cmpq $128,%rdx + jae L$128_or_more4x + cmpq $64,%rdx + jae L$64_or_more4x + + + xorq %r10,%r10 + + movdqa %xmm12,16(%rsp) + movdqa %xmm4,32(%rsp) + movdqa %xmm0,48(%rsp) + jmp L$oop_tail4x + +.p2align 5 +L$64_or_more4x: + movdqu 0(%rsi),%xmm6 + movdqu 16(%rsi),%xmm11 + movdqu 32(%rsi),%xmm2 + movdqu 48(%rsi),%xmm7 + pxor 0(%rsp),%xmm6 + pxor %xmm12,%xmm11 + pxor %xmm4,%xmm2 + pxor %xmm0,%xmm7 + movdqu %xmm6,0(%rdi) + movdqu %xmm11,16(%rdi) + movdqu %xmm2,32(%rdi) + movdqu %xmm7,48(%rdi) + je L$done4x + + movdqa 16(%rsp),%xmm6 + leaq 64(%rsi),%rsi + xorq %r10,%r10 + movdqa %xmm6,0(%rsp) + movdqa %xmm13,16(%rsp) + leaq 64(%rdi),%rdi + movdqa %xmm5,32(%rsp) + subq $64,%rdx + movdqa %xmm1,48(%rsp) + jmp L$oop_tail4x + +.p2align 5 +L$128_or_more4x: + movdqu 0(%rsi),%xmm6 + movdqu 16(%rsi),%xmm11 + movdqu 32(%rsi),%xmm2 + movdqu 48(%rsi),%xmm7 + pxor 0(%rsp),%xmm6 + pxor %xmm12,%xmm11 + pxor %xmm4,%xmm2 + pxor %xmm0,%xmm7 + + movdqu %xmm6,0(%rdi) + movdqu 64(%rsi),%xmm6 + movdqu %xmm11,16(%rdi) + movdqu 80(%rsi),%xmm11 + movdqu %xmm2,32(%rdi) + movdqu 96(%rsi),%xmm2 + movdqu %xmm7,48(%rdi) + movdqu 112(%rsi),%xmm7 + pxor 16(%rsp),%xmm6 + pxor %xmm13,%xmm11 + pxor %xmm5,%xmm2 + pxor %xmm1,%xmm7 + movdqu %xmm6,64(%rdi) + movdqu %xmm11,80(%rdi) + movdqu %xmm2,96(%rdi) + movdqu %xmm7,112(%rdi) + je L$done4x + + movdqa 32(%rsp),%xmm6 + leaq 128(%rsi),%rsi + xorq %r10,%r10 + movdqa %xmm6,0(%rsp) + movdqa %xmm10,16(%rsp) + leaq 128(%rdi),%rdi + movdqa %xmm14,32(%rsp) + subq $128,%rdx + movdqa %xmm8,48(%rsp) + jmp L$oop_tail4x + +.p2align 5 +L$192_or_more4x: + movdqu 0(%rsi),%xmm6 + movdqu 16(%rsi),%xmm11 + movdqu 32(%rsi),%xmm2 + movdqu 48(%rsi),%xmm7 + pxor 0(%rsp),%xmm6 + pxor %xmm12,%xmm11 + pxor %xmm4,%xmm2 + pxor %xmm0,%xmm7 + + movdqu %xmm6,0(%rdi) + movdqu 64(%rsi),%xmm6 + movdqu %xmm11,16(%rdi) + movdqu 80(%rsi),%xmm11 + movdqu %xmm2,32(%rdi) + movdqu 96(%rsi),%xmm2 + movdqu %xmm7,48(%rdi) + movdqu 112(%rsi),%xmm7 + leaq 128(%rsi),%rsi + pxor 16(%rsp),%xmm6 + pxor %xmm13,%xmm11 + pxor %xmm5,%xmm2 + pxor %xmm1,%xmm7 + + movdqu %xmm6,64(%rdi) + movdqu 0(%rsi),%xmm6 + movdqu %xmm11,80(%rdi) + movdqu 16(%rsi),%xmm11 + movdqu %xmm2,96(%rdi) + movdqu 32(%rsi),%xmm2 + movdqu %xmm7,112(%rdi) + leaq 128(%rdi),%rdi + movdqu 48(%rsi),%xmm7 + pxor 32(%rsp),%xmm6 + pxor %xmm10,%xmm11 + pxor %xmm14,%xmm2 + pxor %xmm8,%xmm7 + movdqu %xmm6,0(%rdi) + movdqu %xmm11,16(%rdi) + movdqu %xmm2,32(%rdi) + movdqu %xmm7,48(%rdi) + je L$done4x + + movdqa 48(%rsp),%xmm6 + leaq 64(%rsi),%rsi + xorq %r10,%r10 + movdqa %xmm6,0(%rsp) + movdqa %xmm15,16(%rsp) + leaq 64(%rdi),%rdi + movdqa %xmm9,32(%rsp) + subq $192,%rdx + movdqa %xmm3,48(%rsp) + +L$oop_tail4x: + movzbl (%rsi,%r10,1),%eax + movzbl (%rsp,%r10,1),%ecx + leaq 1(%r10),%r10 + xorl %ecx,%eax + movb %al,-1(%rdi,%r10,1) + decq %rdx + jnz L$oop_tail4x + +L$done4x: + leaq (%r9),%rsp + +L$4x_epilogue: + ret + + +.globl _ChaCha20_ctr32_avx2 +.private_extern _ChaCha20_ctr32_avx2 + +.p2align 5 
+_ChaCha20_ctr32_avx2: + +_CET_ENDBR + movq %rsp,%r9 + + subq $0x280+8,%rsp + andq $-32,%rsp + vzeroupper + + + + + + + + + + + vbroadcasti128 L$sigma(%rip),%ymm11 + vbroadcasti128 (%rcx),%ymm3 + vbroadcasti128 16(%rcx),%ymm15 + vbroadcasti128 (%r8),%ymm7 + leaq 256(%rsp),%rcx + leaq 512(%rsp),%rax + leaq L$rot16(%rip),%r10 + leaq L$rot24(%rip),%r11 + + vpshufd $0x00,%ymm11,%ymm8 + vpshufd $0x55,%ymm11,%ymm9 + vmovdqa %ymm8,128-256(%rcx) + vpshufd $0xaa,%ymm11,%ymm10 + vmovdqa %ymm9,160-256(%rcx) + vpshufd $0xff,%ymm11,%ymm11 + vmovdqa %ymm10,192-256(%rcx) + vmovdqa %ymm11,224-256(%rcx) + + vpshufd $0x00,%ymm3,%ymm0 + vpshufd $0x55,%ymm3,%ymm1 + vmovdqa %ymm0,256-256(%rcx) + vpshufd $0xaa,%ymm3,%ymm2 + vmovdqa %ymm1,288-256(%rcx) + vpshufd $0xff,%ymm3,%ymm3 + vmovdqa %ymm2,320-256(%rcx) + vmovdqa %ymm3,352-256(%rcx) + + vpshufd $0x00,%ymm15,%ymm12 + vpshufd $0x55,%ymm15,%ymm13 + vmovdqa %ymm12,384-512(%rax) + vpshufd $0xaa,%ymm15,%ymm14 + vmovdqa %ymm13,416-512(%rax) + vpshufd $0xff,%ymm15,%ymm15 + vmovdqa %ymm14,448-512(%rax) + vmovdqa %ymm15,480-512(%rax) + + vpshufd $0x00,%ymm7,%ymm4 + vpshufd $0x55,%ymm7,%ymm5 + vpaddd L$incy(%rip),%ymm4,%ymm4 + vpshufd $0xaa,%ymm7,%ymm6 + vmovdqa %ymm5,544-512(%rax) + vpshufd $0xff,%ymm7,%ymm7 + vmovdqa %ymm6,576-512(%rax) + vmovdqa %ymm7,608-512(%rax) + + jmp L$oop_enter8x + +.p2align 5 +L$oop_outer8x: + vmovdqa 128-256(%rcx),%ymm8 + vmovdqa 160-256(%rcx),%ymm9 + vmovdqa 192-256(%rcx),%ymm10 + vmovdqa 224-256(%rcx),%ymm11 + vmovdqa 256-256(%rcx),%ymm0 + vmovdqa 288-256(%rcx),%ymm1 + vmovdqa 320-256(%rcx),%ymm2 + vmovdqa 352-256(%rcx),%ymm3 + vmovdqa 384-512(%rax),%ymm12 + vmovdqa 416-512(%rax),%ymm13 + vmovdqa 448-512(%rax),%ymm14 + vmovdqa 480-512(%rax),%ymm15 + vmovdqa 512-512(%rax),%ymm4 + vmovdqa 544-512(%rax),%ymm5 + vmovdqa 576-512(%rax),%ymm6 + vmovdqa 608-512(%rax),%ymm7 + vpaddd L$eight(%rip),%ymm4,%ymm4 + +L$oop_enter8x: + vmovdqa %ymm14,64(%rsp) + vmovdqa %ymm15,96(%rsp) + vbroadcasti128 (%r10),%ymm15 + vmovdqa %ymm4,512-512(%rax) + movl $10,%eax + jmp L$oop8x + +.p2align 5 +L$oop8x: + vpaddd %ymm0,%ymm8,%ymm8 + vpxor %ymm4,%ymm8,%ymm4 + vpshufb %ymm15,%ymm4,%ymm4 + vpaddd %ymm1,%ymm9,%ymm9 + vpxor %ymm5,%ymm9,%ymm5 + vpshufb %ymm15,%ymm5,%ymm5 + vpaddd %ymm4,%ymm12,%ymm12 + vpxor %ymm0,%ymm12,%ymm0 + vpslld $12,%ymm0,%ymm14 + vpsrld $20,%ymm0,%ymm0 + vpor %ymm0,%ymm14,%ymm0 + vbroadcasti128 (%r11),%ymm14 + vpaddd %ymm5,%ymm13,%ymm13 + vpxor %ymm1,%ymm13,%ymm1 + vpslld $12,%ymm1,%ymm15 + vpsrld $20,%ymm1,%ymm1 + vpor %ymm1,%ymm15,%ymm1 + vpaddd %ymm0,%ymm8,%ymm8 + vpxor %ymm4,%ymm8,%ymm4 + vpshufb %ymm14,%ymm4,%ymm4 + vpaddd %ymm1,%ymm9,%ymm9 + vpxor %ymm5,%ymm9,%ymm5 + vpshufb %ymm14,%ymm5,%ymm5 + vpaddd %ymm4,%ymm12,%ymm12 + vpxor %ymm0,%ymm12,%ymm0 + vpslld $7,%ymm0,%ymm15 + vpsrld $25,%ymm0,%ymm0 + vpor %ymm0,%ymm15,%ymm0 + vbroadcasti128 (%r10),%ymm15 + vpaddd %ymm5,%ymm13,%ymm13 + vpxor %ymm1,%ymm13,%ymm1 + vpslld $7,%ymm1,%ymm14 + vpsrld $25,%ymm1,%ymm1 + vpor %ymm1,%ymm14,%ymm1 + vmovdqa %ymm12,0(%rsp) + vmovdqa %ymm13,32(%rsp) + vmovdqa 64(%rsp),%ymm12 + vmovdqa 96(%rsp),%ymm13 + vpaddd %ymm2,%ymm10,%ymm10 + vpxor %ymm6,%ymm10,%ymm6 + vpshufb %ymm15,%ymm6,%ymm6 + vpaddd %ymm3,%ymm11,%ymm11 + vpxor %ymm7,%ymm11,%ymm7 + vpshufb %ymm15,%ymm7,%ymm7 + vpaddd %ymm6,%ymm12,%ymm12 + vpxor %ymm2,%ymm12,%ymm2 + vpslld $12,%ymm2,%ymm14 + vpsrld $20,%ymm2,%ymm2 + vpor %ymm2,%ymm14,%ymm2 + vbroadcasti128 (%r11),%ymm14 + vpaddd %ymm7,%ymm13,%ymm13 + vpxor %ymm3,%ymm13,%ymm3 + vpslld $12,%ymm3,%ymm15 + vpsrld $20,%ymm3,%ymm3 + vpor 
%ymm3,%ymm15,%ymm3 + vpaddd %ymm2,%ymm10,%ymm10 + vpxor %ymm6,%ymm10,%ymm6 + vpshufb %ymm14,%ymm6,%ymm6 + vpaddd %ymm3,%ymm11,%ymm11 + vpxor %ymm7,%ymm11,%ymm7 + vpshufb %ymm14,%ymm7,%ymm7 + vpaddd %ymm6,%ymm12,%ymm12 + vpxor %ymm2,%ymm12,%ymm2 + vpslld $7,%ymm2,%ymm15 + vpsrld $25,%ymm2,%ymm2 + vpor %ymm2,%ymm15,%ymm2 + vbroadcasti128 (%r10),%ymm15 + vpaddd %ymm7,%ymm13,%ymm13 + vpxor %ymm3,%ymm13,%ymm3 + vpslld $7,%ymm3,%ymm14 + vpsrld $25,%ymm3,%ymm3 + vpor %ymm3,%ymm14,%ymm3 + vpaddd %ymm1,%ymm8,%ymm8 + vpxor %ymm7,%ymm8,%ymm7 + vpshufb %ymm15,%ymm7,%ymm7 + vpaddd %ymm2,%ymm9,%ymm9 + vpxor %ymm4,%ymm9,%ymm4 + vpshufb %ymm15,%ymm4,%ymm4 + vpaddd %ymm7,%ymm12,%ymm12 + vpxor %ymm1,%ymm12,%ymm1 + vpslld $12,%ymm1,%ymm14 + vpsrld $20,%ymm1,%ymm1 + vpor %ymm1,%ymm14,%ymm1 + vbroadcasti128 (%r11),%ymm14 + vpaddd %ymm4,%ymm13,%ymm13 + vpxor %ymm2,%ymm13,%ymm2 + vpslld $12,%ymm2,%ymm15 + vpsrld $20,%ymm2,%ymm2 + vpor %ymm2,%ymm15,%ymm2 + vpaddd %ymm1,%ymm8,%ymm8 + vpxor %ymm7,%ymm8,%ymm7 + vpshufb %ymm14,%ymm7,%ymm7 + vpaddd %ymm2,%ymm9,%ymm9 + vpxor %ymm4,%ymm9,%ymm4 + vpshufb %ymm14,%ymm4,%ymm4 + vpaddd %ymm7,%ymm12,%ymm12 + vpxor %ymm1,%ymm12,%ymm1 + vpslld $7,%ymm1,%ymm15 + vpsrld $25,%ymm1,%ymm1 + vpor %ymm1,%ymm15,%ymm1 + vbroadcasti128 (%r10),%ymm15 + vpaddd %ymm4,%ymm13,%ymm13 + vpxor %ymm2,%ymm13,%ymm2 + vpslld $7,%ymm2,%ymm14 + vpsrld $25,%ymm2,%ymm2 + vpor %ymm2,%ymm14,%ymm2 + vmovdqa %ymm12,64(%rsp) + vmovdqa %ymm13,96(%rsp) + vmovdqa 0(%rsp),%ymm12 + vmovdqa 32(%rsp),%ymm13 + vpaddd %ymm3,%ymm10,%ymm10 + vpxor %ymm5,%ymm10,%ymm5 + vpshufb %ymm15,%ymm5,%ymm5 + vpaddd %ymm0,%ymm11,%ymm11 + vpxor %ymm6,%ymm11,%ymm6 + vpshufb %ymm15,%ymm6,%ymm6 + vpaddd %ymm5,%ymm12,%ymm12 + vpxor %ymm3,%ymm12,%ymm3 + vpslld $12,%ymm3,%ymm14 + vpsrld $20,%ymm3,%ymm3 + vpor %ymm3,%ymm14,%ymm3 + vbroadcasti128 (%r11),%ymm14 + vpaddd %ymm6,%ymm13,%ymm13 + vpxor %ymm0,%ymm13,%ymm0 + vpslld $12,%ymm0,%ymm15 + vpsrld $20,%ymm0,%ymm0 + vpor %ymm0,%ymm15,%ymm0 + vpaddd %ymm3,%ymm10,%ymm10 + vpxor %ymm5,%ymm10,%ymm5 + vpshufb %ymm14,%ymm5,%ymm5 + vpaddd %ymm0,%ymm11,%ymm11 + vpxor %ymm6,%ymm11,%ymm6 + vpshufb %ymm14,%ymm6,%ymm6 + vpaddd %ymm5,%ymm12,%ymm12 + vpxor %ymm3,%ymm12,%ymm3 + vpslld $7,%ymm3,%ymm15 + vpsrld $25,%ymm3,%ymm3 + vpor %ymm3,%ymm15,%ymm3 + vbroadcasti128 (%r10),%ymm15 + vpaddd %ymm6,%ymm13,%ymm13 + vpxor %ymm0,%ymm13,%ymm0 + vpslld $7,%ymm0,%ymm14 + vpsrld $25,%ymm0,%ymm0 + vpor %ymm0,%ymm14,%ymm0 + decl %eax + jnz L$oop8x + + leaq 512(%rsp),%rax + vpaddd 128-256(%rcx),%ymm8,%ymm8 + vpaddd 160-256(%rcx),%ymm9,%ymm9 + vpaddd 192-256(%rcx),%ymm10,%ymm10 + vpaddd 224-256(%rcx),%ymm11,%ymm11 + + vpunpckldq %ymm9,%ymm8,%ymm14 + vpunpckldq %ymm11,%ymm10,%ymm15 + vpunpckhdq %ymm9,%ymm8,%ymm8 + vpunpckhdq %ymm11,%ymm10,%ymm10 + vpunpcklqdq %ymm15,%ymm14,%ymm9 + vpunpckhqdq %ymm15,%ymm14,%ymm14 + vpunpcklqdq %ymm10,%ymm8,%ymm11 + vpunpckhqdq %ymm10,%ymm8,%ymm8 + vpaddd 256-256(%rcx),%ymm0,%ymm0 + vpaddd 288-256(%rcx),%ymm1,%ymm1 + vpaddd 320-256(%rcx),%ymm2,%ymm2 + vpaddd 352-256(%rcx),%ymm3,%ymm3 + + vpunpckldq %ymm1,%ymm0,%ymm10 + vpunpckldq %ymm3,%ymm2,%ymm15 + vpunpckhdq %ymm1,%ymm0,%ymm0 + vpunpckhdq %ymm3,%ymm2,%ymm2 + vpunpcklqdq %ymm15,%ymm10,%ymm1 + vpunpckhqdq %ymm15,%ymm10,%ymm10 + vpunpcklqdq %ymm2,%ymm0,%ymm3 + vpunpckhqdq %ymm2,%ymm0,%ymm0 + vperm2i128 $0x20,%ymm1,%ymm9,%ymm15 + vperm2i128 $0x31,%ymm1,%ymm9,%ymm1 + vperm2i128 $0x20,%ymm10,%ymm14,%ymm9 + vperm2i128 $0x31,%ymm10,%ymm14,%ymm10 + vperm2i128 $0x20,%ymm3,%ymm11,%ymm14 + vperm2i128 $0x31,%ymm3,%ymm11,%ymm3 + vperm2i128 
$0x20,%ymm0,%ymm8,%ymm11 + vperm2i128 $0x31,%ymm0,%ymm8,%ymm0 + vmovdqa %ymm15,0(%rsp) + vmovdqa %ymm9,32(%rsp) + vmovdqa 64(%rsp),%ymm15 + vmovdqa 96(%rsp),%ymm9 + + vpaddd 384-512(%rax),%ymm12,%ymm12 + vpaddd 416-512(%rax),%ymm13,%ymm13 + vpaddd 448-512(%rax),%ymm15,%ymm15 + vpaddd 480-512(%rax),%ymm9,%ymm9 + + vpunpckldq %ymm13,%ymm12,%ymm2 + vpunpckldq %ymm9,%ymm15,%ymm8 + vpunpckhdq %ymm13,%ymm12,%ymm12 + vpunpckhdq %ymm9,%ymm15,%ymm15 + vpunpcklqdq %ymm8,%ymm2,%ymm13 + vpunpckhqdq %ymm8,%ymm2,%ymm2 + vpunpcklqdq %ymm15,%ymm12,%ymm9 + vpunpckhqdq %ymm15,%ymm12,%ymm12 + vpaddd 512-512(%rax),%ymm4,%ymm4 + vpaddd 544-512(%rax),%ymm5,%ymm5 + vpaddd 576-512(%rax),%ymm6,%ymm6 + vpaddd 608-512(%rax),%ymm7,%ymm7 + + vpunpckldq %ymm5,%ymm4,%ymm15 + vpunpckldq %ymm7,%ymm6,%ymm8 + vpunpckhdq %ymm5,%ymm4,%ymm4 + vpunpckhdq %ymm7,%ymm6,%ymm6 + vpunpcklqdq %ymm8,%ymm15,%ymm5 + vpunpckhqdq %ymm8,%ymm15,%ymm15 + vpunpcklqdq %ymm6,%ymm4,%ymm7 + vpunpckhqdq %ymm6,%ymm4,%ymm4 + vperm2i128 $0x20,%ymm5,%ymm13,%ymm8 + vperm2i128 $0x31,%ymm5,%ymm13,%ymm5 + vperm2i128 $0x20,%ymm15,%ymm2,%ymm13 + vperm2i128 $0x31,%ymm15,%ymm2,%ymm15 + vperm2i128 $0x20,%ymm7,%ymm9,%ymm2 + vperm2i128 $0x31,%ymm7,%ymm9,%ymm7 + vperm2i128 $0x20,%ymm4,%ymm12,%ymm9 + vperm2i128 $0x31,%ymm4,%ymm12,%ymm4 + vmovdqa 0(%rsp),%ymm6 + vmovdqa 32(%rsp),%ymm12 + + cmpq $512,%rdx + jb L$tail8x + + vpxor 0(%rsi),%ymm6,%ymm6 + vpxor 32(%rsi),%ymm8,%ymm8 + vpxor 64(%rsi),%ymm1,%ymm1 + vpxor 96(%rsi),%ymm5,%ymm5 + leaq 128(%rsi),%rsi + vmovdqu %ymm6,0(%rdi) + vmovdqu %ymm8,32(%rdi) + vmovdqu %ymm1,64(%rdi) + vmovdqu %ymm5,96(%rdi) + leaq 128(%rdi),%rdi + + vpxor 0(%rsi),%ymm12,%ymm12 + vpxor 32(%rsi),%ymm13,%ymm13 + vpxor 64(%rsi),%ymm10,%ymm10 + vpxor 96(%rsi),%ymm15,%ymm15 + leaq 128(%rsi),%rsi + vmovdqu %ymm12,0(%rdi) + vmovdqu %ymm13,32(%rdi) + vmovdqu %ymm10,64(%rdi) + vmovdqu %ymm15,96(%rdi) + leaq 128(%rdi),%rdi + + vpxor 0(%rsi),%ymm14,%ymm14 + vpxor 32(%rsi),%ymm2,%ymm2 + vpxor 64(%rsi),%ymm3,%ymm3 + vpxor 96(%rsi),%ymm7,%ymm7 + leaq 128(%rsi),%rsi + vmovdqu %ymm14,0(%rdi) + vmovdqu %ymm2,32(%rdi) + vmovdqu %ymm3,64(%rdi) + vmovdqu %ymm7,96(%rdi) + leaq 128(%rdi),%rdi + + vpxor 0(%rsi),%ymm11,%ymm11 + vpxor 32(%rsi),%ymm9,%ymm9 + vpxor 64(%rsi),%ymm0,%ymm0 + vpxor 96(%rsi),%ymm4,%ymm4 + leaq 128(%rsi),%rsi + vmovdqu %ymm11,0(%rdi) + vmovdqu %ymm9,32(%rdi) + vmovdqu %ymm0,64(%rdi) + vmovdqu %ymm4,96(%rdi) + leaq 128(%rdi),%rdi + + subq $512,%rdx + jnz L$oop_outer8x + + jmp L$done8x + +L$tail8x: + cmpq $448,%rdx + jae L$448_or_more8x + cmpq $384,%rdx + jae L$384_or_more8x + cmpq $320,%rdx + jae L$320_or_more8x + cmpq $256,%rdx + jae L$256_or_more8x + cmpq $192,%rdx + jae L$192_or_more8x + cmpq $128,%rdx + jae L$128_or_more8x + cmpq $64,%rdx + jae L$64_or_more8x + + xorq %r10,%r10 + vmovdqa %ymm6,0(%rsp) + vmovdqa %ymm8,32(%rsp) + jmp L$oop_tail8x + +.p2align 5 +L$64_or_more8x: + vpxor 0(%rsi),%ymm6,%ymm6 + vpxor 32(%rsi),%ymm8,%ymm8 + vmovdqu %ymm6,0(%rdi) + vmovdqu %ymm8,32(%rdi) + je L$done8x + + leaq 64(%rsi),%rsi + xorq %r10,%r10 + vmovdqa %ymm1,0(%rsp) + leaq 64(%rdi),%rdi + subq $64,%rdx + vmovdqa %ymm5,32(%rsp) + jmp L$oop_tail8x + +.p2align 5 +L$128_or_more8x: + vpxor 0(%rsi),%ymm6,%ymm6 + vpxor 32(%rsi),%ymm8,%ymm8 + vpxor 64(%rsi),%ymm1,%ymm1 + vpxor 96(%rsi),%ymm5,%ymm5 + vmovdqu %ymm6,0(%rdi) + vmovdqu %ymm8,32(%rdi) + vmovdqu %ymm1,64(%rdi) + vmovdqu %ymm5,96(%rdi) + je L$done8x + + leaq 128(%rsi),%rsi + xorq %r10,%r10 + vmovdqa %ymm12,0(%rsp) + leaq 128(%rdi),%rdi + subq $128,%rdx + vmovdqa %ymm13,32(%rsp) + jmp L$oop_tail8x 
+ +.p2align 5 +L$192_or_more8x: + vpxor 0(%rsi),%ymm6,%ymm6 + vpxor 32(%rsi),%ymm8,%ymm8 + vpxor 64(%rsi),%ymm1,%ymm1 + vpxor 96(%rsi),%ymm5,%ymm5 + vpxor 128(%rsi),%ymm12,%ymm12 + vpxor 160(%rsi),%ymm13,%ymm13 + vmovdqu %ymm6,0(%rdi) + vmovdqu %ymm8,32(%rdi) + vmovdqu %ymm1,64(%rdi) + vmovdqu %ymm5,96(%rdi) + vmovdqu %ymm12,128(%rdi) + vmovdqu %ymm13,160(%rdi) + je L$done8x + + leaq 192(%rsi),%rsi + xorq %r10,%r10 + vmovdqa %ymm10,0(%rsp) + leaq 192(%rdi),%rdi + subq $192,%rdx + vmovdqa %ymm15,32(%rsp) + jmp L$oop_tail8x + +.p2align 5 +L$256_or_more8x: + vpxor 0(%rsi),%ymm6,%ymm6 + vpxor 32(%rsi),%ymm8,%ymm8 + vpxor 64(%rsi),%ymm1,%ymm1 + vpxor 96(%rsi),%ymm5,%ymm5 + vpxor 128(%rsi),%ymm12,%ymm12 + vpxor 160(%rsi),%ymm13,%ymm13 + vpxor 192(%rsi),%ymm10,%ymm10 + vpxor 224(%rsi),%ymm15,%ymm15 + vmovdqu %ymm6,0(%rdi) + vmovdqu %ymm8,32(%rdi) + vmovdqu %ymm1,64(%rdi) + vmovdqu %ymm5,96(%rdi) + vmovdqu %ymm12,128(%rdi) + vmovdqu %ymm13,160(%rdi) + vmovdqu %ymm10,192(%rdi) + vmovdqu %ymm15,224(%rdi) + je L$done8x + + leaq 256(%rsi),%rsi + xorq %r10,%r10 + vmovdqa %ymm14,0(%rsp) + leaq 256(%rdi),%rdi + subq $256,%rdx + vmovdqa %ymm2,32(%rsp) + jmp L$oop_tail8x + +.p2align 5 +L$320_or_more8x: + vpxor 0(%rsi),%ymm6,%ymm6 + vpxor 32(%rsi),%ymm8,%ymm8 + vpxor 64(%rsi),%ymm1,%ymm1 + vpxor 96(%rsi),%ymm5,%ymm5 + vpxor 128(%rsi),%ymm12,%ymm12 + vpxor 160(%rsi),%ymm13,%ymm13 + vpxor 192(%rsi),%ymm10,%ymm10 + vpxor 224(%rsi),%ymm15,%ymm15 + vpxor 256(%rsi),%ymm14,%ymm14 + vpxor 288(%rsi),%ymm2,%ymm2 + vmovdqu %ymm6,0(%rdi) + vmovdqu %ymm8,32(%rdi) + vmovdqu %ymm1,64(%rdi) + vmovdqu %ymm5,96(%rdi) + vmovdqu %ymm12,128(%rdi) + vmovdqu %ymm13,160(%rdi) + vmovdqu %ymm10,192(%rdi) + vmovdqu %ymm15,224(%rdi) + vmovdqu %ymm14,256(%rdi) + vmovdqu %ymm2,288(%rdi) + je L$done8x + + leaq 320(%rsi),%rsi + xorq %r10,%r10 + vmovdqa %ymm3,0(%rsp) + leaq 320(%rdi),%rdi + subq $320,%rdx + vmovdqa %ymm7,32(%rsp) + jmp L$oop_tail8x + +.p2align 5 +L$384_or_more8x: + vpxor 0(%rsi),%ymm6,%ymm6 + vpxor 32(%rsi),%ymm8,%ymm8 + vpxor 64(%rsi),%ymm1,%ymm1 + vpxor 96(%rsi),%ymm5,%ymm5 + vpxor 128(%rsi),%ymm12,%ymm12 + vpxor 160(%rsi),%ymm13,%ymm13 + vpxor 192(%rsi),%ymm10,%ymm10 + vpxor 224(%rsi),%ymm15,%ymm15 + vpxor 256(%rsi),%ymm14,%ymm14 + vpxor 288(%rsi),%ymm2,%ymm2 + vpxor 320(%rsi),%ymm3,%ymm3 + vpxor 352(%rsi),%ymm7,%ymm7 + vmovdqu %ymm6,0(%rdi) + vmovdqu %ymm8,32(%rdi) + vmovdqu %ymm1,64(%rdi) + vmovdqu %ymm5,96(%rdi) + vmovdqu %ymm12,128(%rdi) + vmovdqu %ymm13,160(%rdi) + vmovdqu %ymm10,192(%rdi) + vmovdqu %ymm15,224(%rdi) + vmovdqu %ymm14,256(%rdi) + vmovdqu %ymm2,288(%rdi) + vmovdqu %ymm3,320(%rdi) + vmovdqu %ymm7,352(%rdi) + je L$done8x + + leaq 384(%rsi),%rsi + xorq %r10,%r10 + vmovdqa %ymm11,0(%rsp) + leaq 384(%rdi),%rdi + subq $384,%rdx + vmovdqa %ymm9,32(%rsp) + jmp L$oop_tail8x + +.p2align 5 +L$448_or_more8x: + vpxor 0(%rsi),%ymm6,%ymm6 + vpxor 32(%rsi),%ymm8,%ymm8 + vpxor 64(%rsi),%ymm1,%ymm1 + vpxor 96(%rsi),%ymm5,%ymm5 + vpxor 128(%rsi),%ymm12,%ymm12 + vpxor 160(%rsi),%ymm13,%ymm13 + vpxor 192(%rsi),%ymm10,%ymm10 + vpxor 224(%rsi),%ymm15,%ymm15 + vpxor 256(%rsi),%ymm14,%ymm14 + vpxor 288(%rsi),%ymm2,%ymm2 + vpxor 320(%rsi),%ymm3,%ymm3 + vpxor 352(%rsi),%ymm7,%ymm7 + vpxor 384(%rsi),%ymm11,%ymm11 + vpxor 416(%rsi),%ymm9,%ymm9 + vmovdqu %ymm6,0(%rdi) + vmovdqu %ymm8,32(%rdi) + vmovdqu %ymm1,64(%rdi) + vmovdqu %ymm5,96(%rdi) + vmovdqu %ymm12,128(%rdi) + vmovdqu %ymm13,160(%rdi) + vmovdqu %ymm10,192(%rdi) + vmovdqu %ymm15,224(%rdi) + vmovdqu %ymm14,256(%rdi) + vmovdqu %ymm2,288(%rdi) + vmovdqu %ymm3,320(%rdi) + 
vmovdqu %ymm7,352(%rdi) + vmovdqu %ymm11,384(%rdi) + vmovdqu %ymm9,416(%rdi) + je L$done8x + + leaq 448(%rsi),%rsi + xorq %r10,%r10 + vmovdqa %ymm0,0(%rsp) + leaq 448(%rdi),%rdi + subq $448,%rdx + vmovdqa %ymm4,32(%rsp) + +L$oop_tail8x: + movzbl (%rsi,%r10,1),%eax + movzbl (%rsp,%r10,1),%ecx + leaq 1(%r10),%r10 + xorl %ecx,%eax + movb %al,-1(%rdi,%r10,1) + decq %rdx + jnz L$oop_tail8x + +L$done8x: + vzeroall + leaq (%r9),%rsp + +L$8x_epilogue: + ret + + +#endif diff --git a/ring-0.17.14/pregenerated/chacha-x86_64-nasm.asm b/ring-0.17.14/pregenerated/chacha-x86_64-nasm.asm new file mode 100644 index 0000000000..3a08c9500b --- /dev/null +++ b/ring-0.17.14/pregenerated/chacha-x86_64-nasm.asm @@ -0,0 +1,1752 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. + +%ifidn __OUTPUT_FORMAT__, win64 +default rel +%define XMMWORD +%define YMMWORD +%define ZMMWORD +%define _CET_ENDBR + +%include "ring_core_generated/prefix_symbols_nasm.inc" +section .text code align=64 + + +section .rdata rdata align=8 +ALIGN 64 +$L$zero: + DD 0,0,0,0 +$L$one: + DD 1,0,0,0 +$L$inc: + DD 0,1,2,3 +$L$four: + DD 4,4,4,4 +$L$incy: + DD 0,2,4,6,1,3,5,7 +$L$eight: + DD 8,8,8,8,8,8,8,8 +$L$rot16: + DB 0x2,0x3,0x0,0x1,0x6,0x7,0x4,0x5,0xa,0xb,0x8,0x9,0xe,0xf,0xc,0xd +$L$rot24: + DB 0x3,0x0,0x1,0x2,0x7,0x4,0x5,0x6,0xb,0x8,0x9,0xa,0xf,0xc,0xd,0xe +$L$sigma: + DB 101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107 + DB 0 +ALIGN 64 +$L$zeroz: + DD 0,0,0,0,1,0,0,0,2,0,0,0,3,0,0,0 +$L$fourz: + DD 4,0,0,0,4,0,0,0,4,0,0,0,4,0,0,0 +$L$incz: + DD 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 +$L$sixteen: + DD 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16 + DB 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54 + DB 95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32 + DB 98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115 + DB 108,46,111,114,103,62,0 +section .text + +global ChaCha20_ctr32_nohw + +ALIGN 64 +ChaCha20_ctr32_nohw: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ChaCha20_ctr32_nohw: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + + + +_CET_ENDBR + push rbx + + push rbp + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,64+24 + +$L$ctr32_body: + + + movdqu xmm1,XMMWORD[rcx] + movdqu xmm2,XMMWORD[16+rcx] + movdqu xmm3,XMMWORD[r8] + movdqa xmm4,XMMWORD[$L$one] + + + movdqa XMMWORD[16+rsp],xmm1 + movdqa XMMWORD[32+rsp],xmm2 + movdqa XMMWORD[48+rsp],xmm3 + mov rbp,rdx + jmp NEAR $L$oop_outer + +ALIGN 32 +$L$oop_outer: + mov eax,0x61707865 + mov ebx,0x3320646e + mov ecx,0x79622d32 + mov edx,0x6b206574 + mov r8d,DWORD[16+rsp] + mov r9d,DWORD[20+rsp] + mov r10d,DWORD[24+rsp] + mov r11d,DWORD[28+rsp] + movd r12d,xmm3 + mov r13d,DWORD[52+rsp] + mov r14d,DWORD[56+rsp] + mov r15d,DWORD[60+rsp] + + mov QWORD[((64+0))+rsp],rbp + mov ebp,10 + mov QWORD[((64+8))+rsp],rsi +DB 102,72,15,126,214 + mov QWORD[((64+16))+rsp],rdi + mov rdi,rsi + shr rdi,32 + jmp NEAR $L$oop + +ALIGN 32 +$L$oop: + add eax,r8d + xor r12d,eax + rol r12d,16 + add ebx,r9d + xor r13d,ebx + rol r13d,16 + add esi,r12d + xor r8d,esi + rol r8d,12 + add edi,r13d + xor r9d,edi + rol r9d,12 + add eax,r8d + xor r12d,eax + rol r12d,8 + add ebx,r9d + xor r13d,ebx + rol r13d,8 + add esi,r12d + xor r8d,esi + rol r8d,7 + add edi,r13d + xor r9d,edi + rol r9d,7 + mov DWORD[32+rsp],esi + mov DWORD[36+rsp],edi + mov esi,DWORD[40+rsp] + mov edi,DWORD[44+rsp] + add ecx,r10d + xor r14d,ecx + rol r14d,16 + add edx,r11d + xor r15d,edx + 
rol r15d,16 + add esi,r14d + xor r10d,esi + rol r10d,12 + add edi,r15d + xor r11d,edi + rol r11d,12 + add ecx,r10d + xor r14d,ecx + rol r14d,8 + add edx,r11d + xor r15d,edx + rol r15d,8 + add esi,r14d + xor r10d,esi + rol r10d,7 + add edi,r15d + xor r11d,edi + rol r11d,7 + add eax,r9d + xor r15d,eax + rol r15d,16 + add ebx,r10d + xor r12d,ebx + rol r12d,16 + add esi,r15d + xor r9d,esi + rol r9d,12 + add edi,r12d + xor r10d,edi + rol r10d,12 + add eax,r9d + xor r15d,eax + rol r15d,8 + add ebx,r10d + xor r12d,ebx + rol r12d,8 + add esi,r15d + xor r9d,esi + rol r9d,7 + add edi,r12d + xor r10d,edi + rol r10d,7 + mov DWORD[40+rsp],esi + mov DWORD[44+rsp],edi + mov esi,DWORD[32+rsp] + mov edi,DWORD[36+rsp] + add ecx,r11d + xor r13d,ecx + rol r13d,16 + add edx,r8d + xor r14d,edx + rol r14d,16 + add esi,r13d + xor r11d,esi + rol r11d,12 + add edi,r14d + xor r8d,edi + rol r8d,12 + add ecx,r11d + xor r13d,ecx + rol r13d,8 + add edx,r8d + xor r14d,edx + rol r14d,8 + add esi,r13d + xor r11d,esi + rol r11d,7 + add edi,r14d + xor r8d,edi + rol r8d,7 + dec ebp + jnz NEAR $L$oop + mov DWORD[36+rsp],edi + mov DWORD[32+rsp],esi + mov rbp,QWORD[64+rsp] + movdqa xmm1,xmm2 + mov rsi,QWORD[((64+8))+rsp] + paddd xmm3,xmm4 + mov rdi,QWORD[((64+16))+rsp] + + add eax,0x61707865 + add ebx,0x3320646e + add ecx,0x79622d32 + add edx,0x6b206574 + add r8d,DWORD[16+rsp] + add r9d,DWORD[20+rsp] + add r10d,DWORD[24+rsp] + add r11d,DWORD[28+rsp] + add r12d,DWORD[48+rsp] + add r13d,DWORD[52+rsp] + add r14d,DWORD[56+rsp] + add r15d,DWORD[60+rsp] + paddd xmm1,XMMWORD[32+rsp] + + cmp rbp,64 + jb NEAR $L$tail + + xor eax,DWORD[rsi] + xor ebx,DWORD[4+rsi] + xor ecx,DWORD[8+rsi] + xor edx,DWORD[12+rsi] + xor r8d,DWORD[16+rsi] + xor r9d,DWORD[20+rsi] + xor r10d,DWORD[24+rsi] + xor r11d,DWORD[28+rsi] + movdqu xmm0,XMMWORD[32+rsi] + xor r12d,DWORD[48+rsi] + xor r13d,DWORD[52+rsi] + xor r14d,DWORD[56+rsi] + xor r15d,DWORD[60+rsi] + lea rsi,[64+rsi] + pxor xmm0,xmm1 + + movdqa XMMWORD[32+rsp],xmm2 + movd DWORD[48+rsp],xmm3 + + mov DWORD[rdi],eax + mov DWORD[4+rdi],ebx + mov DWORD[8+rdi],ecx + mov DWORD[12+rdi],edx + mov DWORD[16+rdi],r8d + mov DWORD[20+rdi],r9d + mov DWORD[24+rdi],r10d + mov DWORD[28+rdi],r11d + movdqu XMMWORD[32+rdi],xmm0 + mov DWORD[48+rdi],r12d + mov DWORD[52+rdi],r13d + mov DWORD[56+rdi],r14d + mov DWORD[60+rdi],r15d + lea rdi,[64+rdi] + + sub rbp,64 + jnz NEAR $L$oop_outer + + jmp NEAR $L$done + +ALIGN 16 +$L$tail: + mov DWORD[rsp],eax + mov DWORD[4+rsp],ebx + xor rbx,rbx + mov DWORD[8+rsp],ecx + mov DWORD[12+rsp],edx + mov DWORD[16+rsp],r8d + mov DWORD[20+rsp],r9d + mov DWORD[24+rsp],r10d + mov DWORD[28+rsp],r11d + movdqa XMMWORD[32+rsp],xmm1 + mov DWORD[48+rsp],r12d + mov DWORD[52+rsp],r13d + mov DWORD[56+rsp],r14d + mov DWORD[60+rsp],r15d + +$L$oop_tail: + movzx eax,BYTE[rbx*1+rsi] + movzx edx,BYTE[rbx*1+rsp] + lea rbx,[1+rbx] + xor eax,edx + mov BYTE[((-1))+rbx*1+rdi],al + dec rbp + jnz NEAR $L$oop_tail + +$L$done: + lea rsi,[((64+24+48))+rsp] + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbp,QWORD[((-16))+rsi] + + mov rbx,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$no_data: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_ChaCha20_ctr32_nohw: +global ChaCha20_ctr32_ssse3_4x + +ALIGN 32 +ChaCha20_ctr32_ssse3_4x: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ChaCha20_ctr32_ssse3_4x: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov 
r8,QWORD[40+rsp] + + + +_CET_ENDBR + mov r9,rsp + + sub rsp,0x140+168 + movaps XMMWORD[(-168)+r9],xmm6 + movaps XMMWORD[(-152)+r9],xmm7 + movaps XMMWORD[(-136)+r9],xmm8 + movaps XMMWORD[(-120)+r9],xmm9 + movaps XMMWORD[(-104)+r9],xmm10 + movaps XMMWORD[(-88)+r9],xmm11 + movaps XMMWORD[(-72)+r9],xmm12 + movaps XMMWORD[(-56)+r9],xmm13 + movaps XMMWORD[(-40)+r9],xmm14 + movaps XMMWORD[(-24)+r9],xmm15 +$L$4x_body: + movdqa xmm11,XMMWORD[$L$sigma] + movdqu xmm15,XMMWORD[rcx] + movdqu xmm7,XMMWORD[16+rcx] + movdqu xmm3,XMMWORD[r8] + lea rcx,[256+rsp] + lea r10,[$L$rot16] + lea r11,[$L$rot24] + + pshufd xmm8,xmm11,0x00 + pshufd xmm9,xmm11,0x55 + movdqa XMMWORD[64+rsp],xmm8 + pshufd xmm10,xmm11,0xaa + movdqa XMMWORD[80+rsp],xmm9 + pshufd xmm11,xmm11,0xff + movdqa XMMWORD[96+rsp],xmm10 + movdqa XMMWORD[112+rsp],xmm11 + + pshufd xmm12,xmm15,0x00 + pshufd xmm13,xmm15,0x55 + movdqa XMMWORD[(128-256)+rcx],xmm12 + pshufd xmm14,xmm15,0xaa + movdqa XMMWORD[(144-256)+rcx],xmm13 + pshufd xmm15,xmm15,0xff + movdqa XMMWORD[(160-256)+rcx],xmm14 + movdqa XMMWORD[(176-256)+rcx],xmm15 + + pshufd xmm4,xmm7,0x00 + pshufd xmm5,xmm7,0x55 + movdqa XMMWORD[(192-256)+rcx],xmm4 + pshufd xmm6,xmm7,0xaa + movdqa XMMWORD[(208-256)+rcx],xmm5 + pshufd xmm7,xmm7,0xff + movdqa XMMWORD[(224-256)+rcx],xmm6 + movdqa XMMWORD[(240-256)+rcx],xmm7 + + pshufd xmm0,xmm3,0x00 + pshufd xmm1,xmm3,0x55 + paddd xmm0,XMMWORD[$L$inc] + pshufd xmm2,xmm3,0xaa + movdqa XMMWORD[(272-256)+rcx],xmm1 + pshufd xmm3,xmm3,0xff + movdqa XMMWORD[(288-256)+rcx],xmm2 + movdqa XMMWORD[(304-256)+rcx],xmm3 + + jmp NEAR $L$oop_enter4x + +ALIGN 32 +$L$oop_outer4x: + movdqa xmm8,XMMWORD[64+rsp] + movdqa xmm9,XMMWORD[80+rsp] + movdqa xmm10,XMMWORD[96+rsp] + movdqa xmm11,XMMWORD[112+rsp] + movdqa xmm12,XMMWORD[((128-256))+rcx] + movdqa xmm13,XMMWORD[((144-256))+rcx] + movdqa xmm14,XMMWORD[((160-256))+rcx] + movdqa xmm15,XMMWORD[((176-256))+rcx] + movdqa xmm4,XMMWORD[((192-256))+rcx] + movdqa xmm5,XMMWORD[((208-256))+rcx] + movdqa xmm6,XMMWORD[((224-256))+rcx] + movdqa xmm7,XMMWORD[((240-256))+rcx] + movdqa xmm0,XMMWORD[((256-256))+rcx] + movdqa xmm1,XMMWORD[((272-256))+rcx] + movdqa xmm2,XMMWORD[((288-256))+rcx] + movdqa xmm3,XMMWORD[((304-256))+rcx] + paddd xmm0,XMMWORD[$L$four] + +$L$oop_enter4x: + movdqa XMMWORD[32+rsp],xmm6 + movdqa XMMWORD[48+rsp],xmm7 + movdqa xmm7,XMMWORD[r10] + mov eax,10 + movdqa XMMWORD[(256-256)+rcx],xmm0 + jmp NEAR $L$oop4x + +ALIGN 32 +$L$oop4x: + paddd xmm8,xmm12 + paddd xmm9,xmm13 + pxor xmm0,xmm8 + pxor xmm1,xmm9 +DB 102,15,56,0,199 +DB 102,15,56,0,207 + paddd xmm4,xmm0 + paddd xmm5,xmm1 + pxor xmm12,xmm4 + pxor xmm13,xmm5 + movdqa xmm6,xmm12 + pslld xmm12,12 + psrld xmm6,20 + movdqa xmm7,xmm13 + pslld xmm13,12 + por xmm12,xmm6 + psrld xmm7,20 + movdqa xmm6,XMMWORD[r11] + por xmm13,xmm7 + paddd xmm8,xmm12 + paddd xmm9,xmm13 + pxor xmm0,xmm8 + pxor xmm1,xmm9 +DB 102,15,56,0,198 +DB 102,15,56,0,206 + paddd xmm4,xmm0 + paddd xmm5,xmm1 + pxor xmm12,xmm4 + pxor xmm13,xmm5 + movdqa xmm7,xmm12 + pslld xmm12,7 + psrld xmm7,25 + movdqa xmm6,xmm13 + pslld xmm13,7 + por xmm12,xmm7 + psrld xmm6,25 + movdqa xmm7,XMMWORD[r10] + por xmm13,xmm6 + movdqa XMMWORD[rsp],xmm4 + movdqa XMMWORD[16+rsp],xmm5 + movdqa xmm4,XMMWORD[32+rsp] + movdqa xmm5,XMMWORD[48+rsp] + paddd xmm10,xmm14 + paddd xmm11,xmm15 + pxor xmm2,xmm10 + pxor xmm3,xmm11 +DB 102,15,56,0,215 +DB 102,15,56,0,223 + paddd xmm4,xmm2 + paddd xmm5,xmm3 + pxor xmm14,xmm4 + pxor xmm15,xmm5 + movdqa xmm6,xmm14 + pslld xmm14,12 + psrld xmm6,20 + movdqa xmm7,xmm15 + pslld xmm15,12 + por xmm14,xmm6 
+ psrld xmm7,20 + movdqa xmm6,XMMWORD[r11] + por xmm15,xmm7 + paddd xmm10,xmm14 + paddd xmm11,xmm15 + pxor xmm2,xmm10 + pxor xmm3,xmm11 +DB 102,15,56,0,214 +DB 102,15,56,0,222 + paddd xmm4,xmm2 + paddd xmm5,xmm3 + pxor xmm14,xmm4 + pxor xmm15,xmm5 + movdqa xmm7,xmm14 + pslld xmm14,7 + psrld xmm7,25 + movdqa xmm6,xmm15 + pslld xmm15,7 + por xmm14,xmm7 + psrld xmm6,25 + movdqa xmm7,XMMWORD[r10] + por xmm15,xmm6 + paddd xmm8,xmm13 + paddd xmm9,xmm14 + pxor xmm3,xmm8 + pxor xmm0,xmm9 +DB 102,15,56,0,223 +DB 102,15,56,0,199 + paddd xmm4,xmm3 + paddd xmm5,xmm0 + pxor xmm13,xmm4 + pxor xmm14,xmm5 + movdqa xmm6,xmm13 + pslld xmm13,12 + psrld xmm6,20 + movdqa xmm7,xmm14 + pslld xmm14,12 + por xmm13,xmm6 + psrld xmm7,20 + movdqa xmm6,XMMWORD[r11] + por xmm14,xmm7 + paddd xmm8,xmm13 + paddd xmm9,xmm14 + pxor xmm3,xmm8 + pxor xmm0,xmm9 +DB 102,15,56,0,222 +DB 102,15,56,0,198 + paddd xmm4,xmm3 + paddd xmm5,xmm0 + pxor xmm13,xmm4 + pxor xmm14,xmm5 + movdqa xmm7,xmm13 + pslld xmm13,7 + psrld xmm7,25 + movdqa xmm6,xmm14 + pslld xmm14,7 + por xmm13,xmm7 + psrld xmm6,25 + movdqa xmm7,XMMWORD[r10] + por xmm14,xmm6 + movdqa XMMWORD[32+rsp],xmm4 + movdqa XMMWORD[48+rsp],xmm5 + movdqa xmm4,XMMWORD[rsp] + movdqa xmm5,XMMWORD[16+rsp] + paddd xmm10,xmm15 + paddd xmm11,xmm12 + pxor xmm1,xmm10 + pxor xmm2,xmm11 +DB 102,15,56,0,207 +DB 102,15,56,0,215 + paddd xmm4,xmm1 + paddd xmm5,xmm2 + pxor xmm15,xmm4 + pxor xmm12,xmm5 + movdqa xmm6,xmm15 + pslld xmm15,12 + psrld xmm6,20 + movdqa xmm7,xmm12 + pslld xmm12,12 + por xmm15,xmm6 + psrld xmm7,20 + movdqa xmm6,XMMWORD[r11] + por xmm12,xmm7 + paddd xmm10,xmm15 + paddd xmm11,xmm12 + pxor xmm1,xmm10 + pxor xmm2,xmm11 +DB 102,15,56,0,206 +DB 102,15,56,0,214 + paddd xmm4,xmm1 + paddd xmm5,xmm2 + pxor xmm15,xmm4 + pxor xmm12,xmm5 + movdqa xmm7,xmm15 + pslld xmm15,7 + psrld xmm7,25 + movdqa xmm6,xmm12 + pslld xmm12,7 + por xmm15,xmm7 + psrld xmm6,25 + movdqa xmm7,XMMWORD[r10] + por xmm12,xmm6 + dec eax + jnz NEAR $L$oop4x + + paddd xmm8,XMMWORD[64+rsp] + paddd xmm9,XMMWORD[80+rsp] + paddd xmm10,XMMWORD[96+rsp] + paddd xmm11,XMMWORD[112+rsp] + + movdqa xmm6,xmm8 + punpckldq xmm8,xmm9 + movdqa xmm7,xmm10 + punpckldq xmm10,xmm11 + punpckhdq xmm6,xmm9 + punpckhdq xmm7,xmm11 + movdqa xmm9,xmm8 + punpcklqdq xmm8,xmm10 + movdqa xmm11,xmm6 + punpcklqdq xmm6,xmm7 + punpckhqdq xmm9,xmm10 + punpckhqdq xmm11,xmm7 + paddd xmm12,XMMWORD[((128-256))+rcx] + paddd xmm13,XMMWORD[((144-256))+rcx] + paddd xmm14,XMMWORD[((160-256))+rcx] + paddd xmm15,XMMWORD[((176-256))+rcx] + + movdqa XMMWORD[rsp],xmm8 + movdqa XMMWORD[16+rsp],xmm9 + movdqa xmm8,XMMWORD[32+rsp] + movdqa xmm9,XMMWORD[48+rsp] + + movdqa xmm10,xmm12 + punpckldq xmm12,xmm13 + movdqa xmm7,xmm14 + punpckldq xmm14,xmm15 + punpckhdq xmm10,xmm13 + punpckhdq xmm7,xmm15 + movdqa xmm13,xmm12 + punpcklqdq xmm12,xmm14 + movdqa xmm15,xmm10 + punpcklqdq xmm10,xmm7 + punpckhqdq xmm13,xmm14 + punpckhqdq xmm15,xmm7 + paddd xmm4,XMMWORD[((192-256))+rcx] + paddd xmm5,XMMWORD[((208-256))+rcx] + paddd xmm8,XMMWORD[((224-256))+rcx] + paddd xmm9,XMMWORD[((240-256))+rcx] + + movdqa XMMWORD[32+rsp],xmm6 + movdqa XMMWORD[48+rsp],xmm11 + + movdqa xmm14,xmm4 + punpckldq xmm4,xmm5 + movdqa xmm7,xmm8 + punpckldq xmm8,xmm9 + punpckhdq xmm14,xmm5 + punpckhdq xmm7,xmm9 + movdqa xmm5,xmm4 + punpcklqdq xmm4,xmm8 + movdqa xmm9,xmm14 + punpcklqdq xmm14,xmm7 + punpckhqdq xmm5,xmm8 + punpckhqdq xmm9,xmm7 + paddd xmm0,XMMWORD[((256-256))+rcx] + paddd xmm1,XMMWORD[((272-256))+rcx] + paddd xmm2,XMMWORD[((288-256))+rcx] + paddd xmm3,XMMWORD[((304-256))+rcx] + + movdqa 
xmm8,xmm0 + punpckldq xmm0,xmm1 + movdqa xmm7,xmm2 + punpckldq xmm2,xmm3 + punpckhdq xmm8,xmm1 + punpckhdq xmm7,xmm3 + movdqa xmm1,xmm0 + punpcklqdq xmm0,xmm2 + movdqa xmm3,xmm8 + punpcklqdq xmm8,xmm7 + punpckhqdq xmm1,xmm2 + punpckhqdq xmm3,xmm7 + cmp rdx,64*4 + jb NEAR $L$tail4x + + movdqu xmm6,XMMWORD[rsi] + movdqu xmm11,XMMWORD[16+rsi] + movdqu xmm2,XMMWORD[32+rsi] + movdqu xmm7,XMMWORD[48+rsi] + pxor xmm6,XMMWORD[rsp] + pxor xmm11,xmm12 + pxor xmm2,xmm4 + pxor xmm7,xmm0 + + movdqu XMMWORD[rdi],xmm6 + movdqu xmm6,XMMWORD[64+rsi] + movdqu XMMWORD[16+rdi],xmm11 + movdqu xmm11,XMMWORD[80+rsi] + movdqu XMMWORD[32+rdi],xmm2 + movdqu xmm2,XMMWORD[96+rsi] + movdqu XMMWORD[48+rdi],xmm7 + movdqu xmm7,XMMWORD[112+rsi] + lea rsi,[128+rsi] + pxor xmm6,XMMWORD[16+rsp] + pxor xmm11,xmm13 + pxor xmm2,xmm5 + pxor xmm7,xmm1 + + movdqu XMMWORD[64+rdi],xmm6 + movdqu xmm6,XMMWORD[rsi] + movdqu XMMWORD[80+rdi],xmm11 + movdqu xmm11,XMMWORD[16+rsi] + movdqu XMMWORD[96+rdi],xmm2 + movdqu xmm2,XMMWORD[32+rsi] + movdqu XMMWORD[112+rdi],xmm7 + lea rdi,[128+rdi] + movdqu xmm7,XMMWORD[48+rsi] + pxor xmm6,XMMWORD[32+rsp] + pxor xmm11,xmm10 + pxor xmm2,xmm14 + pxor xmm7,xmm8 + + movdqu XMMWORD[rdi],xmm6 + movdqu xmm6,XMMWORD[64+rsi] + movdqu XMMWORD[16+rdi],xmm11 + movdqu xmm11,XMMWORD[80+rsi] + movdqu XMMWORD[32+rdi],xmm2 + movdqu xmm2,XMMWORD[96+rsi] + movdqu XMMWORD[48+rdi],xmm7 + movdqu xmm7,XMMWORD[112+rsi] + lea rsi,[128+rsi] + pxor xmm6,XMMWORD[48+rsp] + pxor xmm11,xmm15 + pxor xmm2,xmm9 + pxor xmm7,xmm3 + movdqu XMMWORD[64+rdi],xmm6 + movdqu XMMWORD[80+rdi],xmm11 + movdqu XMMWORD[96+rdi],xmm2 + movdqu XMMWORD[112+rdi],xmm7 + lea rdi,[128+rdi] + + sub rdx,64*4 + jnz NEAR $L$oop_outer4x + + jmp NEAR $L$done4x + +$L$tail4x: + cmp rdx,192 + jae NEAR $L$192_or_more4x + cmp rdx,128 + jae NEAR $L$128_or_more4x + cmp rdx,64 + jae NEAR $L$64_or_more4x + + + xor r10,r10 + + movdqa XMMWORD[16+rsp],xmm12 + movdqa XMMWORD[32+rsp],xmm4 + movdqa XMMWORD[48+rsp],xmm0 + jmp NEAR $L$oop_tail4x + +ALIGN 32 +$L$64_or_more4x: + movdqu xmm6,XMMWORD[rsi] + movdqu xmm11,XMMWORD[16+rsi] + movdqu xmm2,XMMWORD[32+rsi] + movdqu xmm7,XMMWORD[48+rsi] + pxor xmm6,XMMWORD[rsp] + pxor xmm11,xmm12 + pxor xmm2,xmm4 + pxor xmm7,xmm0 + movdqu XMMWORD[rdi],xmm6 + movdqu XMMWORD[16+rdi],xmm11 + movdqu XMMWORD[32+rdi],xmm2 + movdqu XMMWORD[48+rdi],xmm7 + je NEAR $L$done4x + + movdqa xmm6,XMMWORD[16+rsp] + lea rsi,[64+rsi] + xor r10,r10 + movdqa XMMWORD[rsp],xmm6 + movdqa XMMWORD[16+rsp],xmm13 + lea rdi,[64+rdi] + movdqa XMMWORD[32+rsp],xmm5 + sub rdx,64 + movdqa XMMWORD[48+rsp],xmm1 + jmp NEAR $L$oop_tail4x + +ALIGN 32 +$L$128_or_more4x: + movdqu xmm6,XMMWORD[rsi] + movdqu xmm11,XMMWORD[16+rsi] + movdqu xmm2,XMMWORD[32+rsi] + movdqu xmm7,XMMWORD[48+rsi] + pxor xmm6,XMMWORD[rsp] + pxor xmm11,xmm12 + pxor xmm2,xmm4 + pxor xmm7,xmm0 + + movdqu XMMWORD[rdi],xmm6 + movdqu xmm6,XMMWORD[64+rsi] + movdqu XMMWORD[16+rdi],xmm11 + movdqu xmm11,XMMWORD[80+rsi] + movdqu XMMWORD[32+rdi],xmm2 + movdqu xmm2,XMMWORD[96+rsi] + movdqu XMMWORD[48+rdi],xmm7 + movdqu xmm7,XMMWORD[112+rsi] + pxor xmm6,XMMWORD[16+rsp] + pxor xmm11,xmm13 + pxor xmm2,xmm5 + pxor xmm7,xmm1 + movdqu XMMWORD[64+rdi],xmm6 + movdqu XMMWORD[80+rdi],xmm11 + movdqu XMMWORD[96+rdi],xmm2 + movdqu XMMWORD[112+rdi],xmm7 + je NEAR $L$done4x + + movdqa xmm6,XMMWORD[32+rsp] + lea rsi,[128+rsi] + xor r10,r10 + movdqa XMMWORD[rsp],xmm6 + movdqa XMMWORD[16+rsp],xmm10 + lea rdi,[128+rdi] + movdqa XMMWORD[32+rsp],xmm14 + sub rdx,128 + movdqa XMMWORD[48+rsp],xmm8 + jmp NEAR $L$oop_tail4x + +ALIGN 32 
+$L$192_or_more4x: + movdqu xmm6,XMMWORD[rsi] + movdqu xmm11,XMMWORD[16+rsi] + movdqu xmm2,XMMWORD[32+rsi] + movdqu xmm7,XMMWORD[48+rsi] + pxor xmm6,XMMWORD[rsp] + pxor xmm11,xmm12 + pxor xmm2,xmm4 + pxor xmm7,xmm0 + + movdqu XMMWORD[rdi],xmm6 + movdqu xmm6,XMMWORD[64+rsi] + movdqu XMMWORD[16+rdi],xmm11 + movdqu xmm11,XMMWORD[80+rsi] + movdqu XMMWORD[32+rdi],xmm2 + movdqu xmm2,XMMWORD[96+rsi] + movdqu XMMWORD[48+rdi],xmm7 + movdqu xmm7,XMMWORD[112+rsi] + lea rsi,[128+rsi] + pxor xmm6,XMMWORD[16+rsp] + pxor xmm11,xmm13 + pxor xmm2,xmm5 + pxor xmm7,xmm1 + + movdqu XMMWORD[64+rdi],xmm6 + movdqu xmm6,XMMWORD[rsi] + movdqu XMMWORD[80+rdi],xmm11 + movdqu xmm11,XMMWORD[16+rsi] + movdqu XMMWORD[96+rdi],xmm2 + movdqu xmm2,XMMWORD[32+rsi] + movdqu XMMWORD[112+rdi],xmm7 + lea rdi,[128+rdi] + movdqu xmm7,XMMWORD[48+rsi] + pxor xmm6,XMMWORD[32+rsp] + pxor xmm11,xmm10 + pxor xmm2,xmm14 + pxor xmm7,xmm8 + movdqu XMMWORD[rdi],xmm6 + movdqu XMMWORD[16+rdi],xmm11 + movdqu XMMWORD[32+rdi],xmm2 + movdqu XMMWORD[48+rdi],xmm7 + je NEAR $L$done4x + + movdqa xmm6,XMMWORD[48+rsp] + lea rsi,[64+rsi] + xor r10,r10 + movdqa XMMWORD[rsp],xmm6 + movdqa XMMWORD[16+rsp],xmm15 + lea rdi,[64+rdi] + movdqa XMMWORD[32+rsp],xmm9 + sub rdx,192 + movdqa XMMWORD[48+rsp],xmm3 + +$L$oop_tail4x: + movzx eax,BYTE[r10*1+rsi] + movzx ecx,BYTE[r10*1+rsp] + lea r10,[1+r10] + xor eax,ecx + mov BYTE[((-1))+r10*1+rdi],al + dec rdx + jnz NEAR $L$oop_tail4x + +$L$done4x: + movaps xmm6,XMMWORD[((-168))+r9] + movaps xmm7,XMMWORD[((-152))+r9] + movaps xmm8,XMMWORD[((-136))+r9] + movaps xmm9,XMMWORD[((-120))+r9] + movaps xmm10,XMMWORD[((-104))+r9] + movaps xmm11,XMMWORD[((-88))+r9] + movaps xmm12,XMMWORD[((-72))+r9] + movaps xmm13,XMMWORD[((-56))+r9] + movaps xmm14,XMMWORD[((-40))+r9] + movaps xmm15,XMMWORD[((-24))+r9] + lea rsp,[r9] + +$L$4x_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_ChaCha20_ctr32_ssse3_4x: +global ChaCha20_ctr32_avx2 + +ALIGN 32 +ChaCha20_ctr32_avx2: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ChaCha20_ctr32_avx2: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + + + +_CET_ENDBR + mov r9,rsp + + sub rsp,0x280+168 + and rsp,-32 + movaps XMMWORD[(-168)+r9],xmm6 + movaps XMMWORD[(-152)+r9],xmm7 + movaps XMMWORD[(-136)+r9],xmm8 + movaps XMMWORD[(-120)+r9],xmm9 + movaps XMMWORD[(-104)+r9],xmm10 + movaps XMMWORD[(-88)+r9],xmm11 + movaps XMMWORD[(-72)+r9],xmm12 + movaps XMMWORD[(-56)+r9],xmm13 + movaps XMMWORD[(-40)+r9],xmm14 + movaps XMMWORD[(-24)+r9],xmm15 +$L$8x_body: + vzeroupper + + + + + + + + + + + vbroadcasti128 ymm11,XMMWORD[$L$sigma] + vbroadcasti128 ymm3,XMMWORD[rcx] + vbroadcasti128 ymm15,XMMWORD[16+rcx] + vbroadcasti128 ymm7,XMMWORD[r8] + lea rcx,[256+rsp] + lea rax,[512+rsp] + lea r10,[$L$rot16] + lea r11,[$L$rot24] + + vpshufd ymm8,ymm11,0x00 + vpshufd ymm9,ymm11,0x55 + vmovdqa YMMWORD[(128-256)+rcx],ymm8 + vpshufd ymm10,ymm11,0xaa + vmovdqa YMMWORD[(160-256)+rcx],ymm9 + vpshufd ymm11,ymm11,0xff + vmovdqa YMMWORD[(192-256)+rcx],ymm10 + vmovdqa YMMWORD[(224-256)+rcx],ymm11 + + vpshufd ymm0,ymm3,0x00 + vpshufd ymm1,ymm3,0x55 + vmovdqa YMMWORD[(256-256)+rcx],ymm0 + vpshufd ymm2,ymm3,0xaa + vmovdqa YMMWORD[(288-256)+rcx],ymm1 + vpshufd ymm3,ymm3,0xff + vmovdqa YMMWORD[(320-256)+rcx],ymm2 + vmovdqa YMMWORD[(352-256)+rcx],ymm3 + + vpshufd ymm12,ymm15,0x00 + vpshufd ymm13,ymm15,0x55 + vmovdqa YMMWORD[(384-512)+rax],ymm12 + vpshufd ymm14,ymm15,0xaa + vmovdqa YMMWORD[(416-512)+rax],ymm13 + vpshufd 
ymm15,ymm15,0xff + vmovdqa YMMWORD[(448-512)+rax],ymm14 + vmovdqa YMMWORD[(480-512)+rax],ymm15 + + vpshufd ymm4,ymm7,0x00 + vpshufd ymm5,ymm7,0x55 + vpaddd ymm4,ymm4,YMMWORD[$L$incy] + vpshufd ymm6,ymm7,0xaa + vmovdqa YMMWORD[(544-512)+rax],ymm5 + vpshufd ymm7,ymm7,0xff + vmovdqa YMMWORD[(576-512)+rax],ymm6 + vmovdqa YMMWORD[(608-512)+rax],ymm7 + + jmp NEAR $L$oop_enter8x + +ALIGN 32 +$L$oop_outer8x: + vmovdqa ymm8,YMMWORD[((128-256))+rcx] + vmovdqa ymm9,YMMWORD[((160-256))+rcx] + vmovdqa ymm10,YMMWORD[((192-256))+rcx] + vmovdqa ymm11,YMMWORD[((224-256))+rcx] + vmovdqa ymm0,YMMWORD[((256-256))+rcx] + vmovdqa ymm1,YMMWORD[((288-256))+rcx] + vmovdqa ymm2,YMMWORD[((320-256))+rcx] + vmovdqa ymm3,YMMWORD[((352-256))+rcx] + vmovdqa ymm12,YMMWORD[((384-512))+rax] + vmovdqa ymm13,YMMWORD[((416-512))+rax] + vmovdqa ymm14,YMMWORD[((448-512))+rax] + vmovdqa ymm15,YMMWORD[((480-512))+rax] + vmovdqa ymm4,YMMWORD[((512-512))+rax] + vmovdqa ymm5,YMMWORD[((544-512))+rax] + vmovdqa ymm6,YMMWORD[((576-512))+rax] + vmovdqa ymm7,YMMWORD[((608-512))+rax] + vpaddd ymm4,ymm4,YMMWORD[$L$eight] + +$L$oop_enter8x: + vmovdqa YMMWORD[64+rsp],ymm14 + vmovdqa YMMWORD[96+rsp],ymm15 + vbroadcasti128 ymm15,XMMWORD[r10] + vmovdqa YMMWORD[(512-512)+rax],ymm4 + mov eax,10 + jmp NEAR $L$oop8x + +ALIGN 32 +$L$oop8x: + vpaddd ymm8,ymm8,ymm0 + vpxor ymm4,ymm8,ymm4 + vpshufb ymm4,ymm4,ymm15 + vpaddd ymm9,ymm9,ymm1 + vpxor ymm5,ymm9,ymm5 + vpshufb ymm5,ymm5,ymm15 + vpaddd ymm12,ymm12,ymm4 + vpxor ymm0,ymm12,ymm0 + vpslld ymm14,ymm0,12 + vpsrld ymm0,ymm0,20 + vpor ymm0,ymm14,ymm0 + vbroadcasti128 ymm14,XMMWORD[r11] + vpaddd ymm13,ymm13,ymm5 + vpxor ymm1,ymm13,ymm1 + vpslld ymm15,ymm1,12 + vpsrld ymm1,ymm1,20 + vpor ymm1,ymm15,ymm1 + vpaddd ymm8,ymm8,ymm0 + vpxor ymm4,ymm8,ymm4 + vpshufb ymm4,ymm4,ymm14 + vpaddd ymm9,ymm9,ymm1 + vpxor ymm5,ymm9,ymm5 + vpshufb ymm5,ymm5,ymm14 + vpaddd ymm12,ymm12,ymm4 + vpxor ymm0,ymm12,ymm0 + vpslld ymm15,ymm0,7 + vpsrld ymm0,ymm0,25 + vpor ymm0,ymm15,ymm0 + vbroadcasti128 ymm15,XMMWORD[r10] + vpaddd ymm13,ymm13,ymm5 + vpxor ymm1,ymm13,ymm1 + vpslld ymm14,ymm1,7 + vpsrld ymm1,ymm1,25 + vpor ymm1,ymm14,ymm1 + vmovdqa YMMWORD[rsp],ymm12 + vmovdqa YMMWORD[32+rsp],ymm13 + vmovdqa ymm12,YMMWORD[64+rsp] + vmovdqa ymm13,YMMWORD[96+rsp] + vpaddd ymm10,ymm10,ymm2 + vpxor ymm6,ymm10,ymm6 + vpshufb ymm6,ymm6,ymm15 + vpaddd ymm11,ymm11,ymm3 + vpxor ymm7,ymm11,ymm7 + vpshufb ymm7,ymm7,ymm15 + vpaddd ymm12,ymm12,ymm6 + vpxor ymm2,ymm12,ymm2 + vpslld ymm14,ymm2,12 + vpsrld ymm2,ymm2,20 + vpor ymm2,ymm14,ymm2 + vbroadcasti128 ymm14,XMMWORD[r11] + vpaddd ymm13,ymm13,ymm7 + vpxor ymm3,ymm13,ymm3 + vpslld ymm15,ymm3,12 + vpsrld ymm3,ymm3,20 + vpor ymm3,ymm15,ymm3 + vpaddd ymm10,ymm10,ymm2 + vpxor ymm6,ymm10,ymm6 + vpshufb ymm6,ymm6,ymm14 + vpaddd ymm11,ymm11,ymm3 + vpxor ymm7,ymm11,ymm7 + vpshufb ymm7,ymm7,ymm14 + vpaddd ymm12,ymm12,ymm6 + vpxor ymm2,ymm12,ymm2 + vpslld ymm15,ymm2,7 + vpsrld ymm2,ymm2,25 + vpor ymm2,ymm15,ymm2 + vbroadcasti128 ymm15,XMMWORD[r10] + vpaddd ymm13,ymm13,ymm7 + vpxor ymm3,ymm13,ymm3 + vpslld ymm14,ymm3,7 + vpsrld ymm3,ymm3,25 + vpor ymm3,ymm14,ymm3 + vpaddd ymm8,ymm8,ymm1 + vpxor ymm7,ymm8,ymm7 + vpshufb ymm7,ymm7,ymm15 + vpaddd ymm9,ymm9,ymm2 + vpxor ymm4,ymm9,ymm4 + vpshufb ymm4,ymm4,ymm15 + vpaddd ymm12,ymm12,ymm7 + vpxor ymm1,ymm12,ymm1 + vpslld ymm14,ymm1,12 + vpsrld ymm1,ymm1,20 + vpor ymm1,ymm14,ymm1 + vbroadcasti128 ymm14,XMMWORD[r11] + vpaddd ymm13,ymm13,ymm4 + vpxor ymm2,ymm13,ymm2 + vpslld ymm15,ymm2,12 + vpsrld ymm2,ymm2,20 + vpor ymm2,ymm15,ymm2 + vpaddd ymm8,ymm8,ymm1 + vpxor 
ymm7,ymm8,ymm7 + vpshufb ymm7,ymm7,ymm14 + vpaddd ymm9,ymm9,ymm2 + vpxor ymm4,ymm9,ymm4 + vpshufb ymm4,ymm4,ymm14 + vpaddd ymm12,ymm12,ymm7 + vpxor ymm1,ymm12,ymm1 + vpslld ymm15,ymm1,7 + vpsrld ymm1,ymm1,25 + vpor ymm1,ymm15,ymm1 + vbroadcasti128 ymm15,XMMWORD[r10] + vpaddd ymm13,ymm13,ymm4 + vpxor ymm2,ymm13,ymm2 + vpslld ymm14,ymm2,7 + vpsrld ymm2,ymm2,25 + vpor ymm2,ymm14,ymm2 + vmovdqa YMMWORD[64+rsp],ymm12 + vmovdqa YMMWORD[96+rsp],ymm13 + vmovdqa ymm12,YMMWORD[rsp] + vmovdqa ymm13,YMMWORD[32+rsp] + vpaddd ymm10,ymm10,ymm3 + vpxor ymm5,ymm10,ymm5 + vpshufb ymm5,ymm5,ymm15 + vpaddd ymm11,ymm11,ymm0 + vpxor ymm6,ymm11,ymm6 + vpshufb ymm6,ymm6,ymm15 + vpaddd ymm12,ymm12,ymm5 + vpxor ymm3,ymm12,ymm3 + vpslld ymm14,ymm3,12 + vpsrld ymm3,ymm3,20 + vpor ymm3,ymm14,ymm3 + vbroadcasti128 ymm14,XMMWORD[r11] + vpaddd ymm13,ymm13,ymm6 + vpxor ymm0,ymm13,ymm0 + vpslld ymm15,ymm0,12 + vpsrld ymm0,ymm0,20 + vpor ymm0,ymm15,ymm0 + vpaddd ymm10,ymm10,ymm3 + vpxor ymm5,ymm10,ymm5 + vpshufb ymm5,ymm5,ymm14 + vpaddd ymm11,ymm11,ymm0 + vpxor ymm6,ymm11,ymm6 + vpshufb ymm6,ymm6,ymm14 + vpaddd ymm12,ymm12,ymm5 + vpxor ymm3,ymm12,ymm3 + vpslld ymm15,ymm3,7 + vpsrld ymm3,ymm3,25 + vpor ymm3,ymm15,ymm3 + vbroadcasti128 ymm15,XMMWORD[r10] + vpaddd ymm13,ymm13,ymm6 + vpxor ymm0,ymm13,ymm0 + vpslld ymm14,ymm0,7 + vpsrld ymm0,ymm0,25 + vpor ymm0,ymm14,ymm0 + dec eax + jnz NEAR $L$oop8x + + lea rax,[512+rsp] + vpaddd ymm8,ymm8,YMMWORD[((128-256))+rcx] + vpaddd ymm9,ymm9,YMMWORD[((160-256))+rcx] + vpaddd ymm10,ymm10,YMMWORD[((192-256))+rcx] + vpaddd ymm11,ymm11,YMMWORD[((224-256))+rcx] + + vpunpckldq ymm14,ymm8,ymm9 + vpunpckldq ymm15,ymm10,ymm11 + vpunpckhdq ymm8,ymm8,ymm9 + vpunpckhdq ymm10,ymm10,ymm11 + vpunpcklqdq ymm9,ymm14,ymm15 + vpunpckhqdq ymm14,ymm14,ymm15 + vpunpcklqdq ymm11,ymm8,ymm10 + vpunpckhqdq ymm8,ymm8,ymm10 + vpaddd ymm0,ymm0,YMMWORD[((256-256))+rcx] + vpaddd ymm1,ymm1,YMMWORD[((288-256))+rcx] + vpaddd ymm2,ymm2,YMMWORD[((320-256))+rcx] + vpaddd ymm3,ymm3,YMMWORD[((352-256))+rcx] + + vpunpckldq ymm10,ymm0,ymm1 + vpunpckldq ymm15,ymm2,ymm3 + vpunpckhdq ymm0,ymm0,ymm1 + vpunpckhdq ymm2,ymm2,ymm3 + vpunpcklqdq ymm1,ymm10,ymm15 + vpunpckhqdq ymm10,ymm10,ymm15 + vpunpcklqdq ymm3,ymm0,ymm2 + vpunpckhqdq ymm0,ymm0,ymm2 + vperm2i128 ymm15,ymm9,ymm1,0x20 + vperm2i128 ymm1,ymm9,ymm1,0x31 + vperm2i128 ymm9,ymm14,ymm10,0x20 + vperm2i128 ymm10,ymm14,ymm10,0x31 + vperm2i128 ymm14,ymm11,ymm3,0x20 + vperm2i128 ymm3,ymm11,ymm3,0x31 + vperm2i128 ymm11,ymm8,ymm0,0x20 + vperm2i128 ymm0,ymm8,ymm0,0x31 + vmovdqa YMMWORD[rsp],ymm15 + vmovdqa YMMWORD[32+rsp],ymm9 + vmovdqa ymm15,YMMWORD[64+rsp] + vmovdqa ymm9,YMMWORD[96+rsp] + + vpaddd ymm12,ymm12,YMMWORD[((384-512))+rax] + vpaddd ymm13,ymm13,YMMWORD[((416-512))+rax] + vpaddd ymm15,ymm15,YMMWORD[((448-512))+rax] + vpaddd ymm9,ymm9,YMMWORD[((480-512))+rax] + + vpunpckldq ymm2,ymm12,ymm13 + vpunpckldq ymm8,ymm15,ymm9 + vpunpckhdq ymm12,ymm12,ymm13 + vpunpckhdq ymm15,ymm15,ymm9 + vpunpcklqdq ymm13,ymm2,ymm8 + vpunpckhqdq ymm2,ymm2,ymm8 + vpunpcklqdq ymm9,ymm12,ymm15 + vpunpckhqdq ymm12,ymm12,ymm15 + vpaddd ymm4,ymm4,YMMWORD[((512-512))+rax] + vpaddd ymm5,ymm5,YMMWORD[((544-512))+rax] + vpaddd ymm6,ymm6,YMMWORD[((576-512))+rax] + vpaddd ymm7,ymm7,YMMWORD[((608-512))+rax] + + vpunpckldq ymm15,ymm4,ymm5 + vpunpckldq ymm8,ymm6,ymm7 + vpunpckhdq ymm4,ymm4,ymm5 + vpunpckhdq ymm6,ymm6,ymm7 + vpunpcklqdq ymm5,ymm15,ymm8 + vpunpckhqdq ymm15,ymm15,ymm8 + vpunpcklqdq ymm7,ymm4,ymm6 + vpunpckhqdq ymm4,ymm4,ymm6 + vperm2i128 ymm8,ymm13,ymm5,0x20 + vperm2i128 ymm5,ymm13,ymm5,0x31 + 
vperm2i128 ymm13,ymm2,ymm15,0x20 + vperm2i128 ymm15,ymm2,ymm15,0x31 + vperm2i128 ymm2,ymm9,ymm7,0x20 + vperm2i128 ymm7,ymm9,ymm7,0x31 + vperm2i128 ymm9,ymm12,ymm4,0x20 + vperm2i128 ymm4,ymm12,ymm4,0x31 + vmovdqa ymm6,YMMWORD[rsp] + vmovdqa ymm12,YMMWORD[32+rsp] + + cmp rdx,64*8 + jb NEAR $L$tail8x + + vpxor ymm6,ymm6,YMMWORD[rsi] + vpxor ymm8,ymm8,YMMWORD[32+rsi] + vpxor ymm1,ymm1,YMMWORD[64+rsi] + vpxor ymm5,ymm5,YMMWORD[96+rsi] + lea rsi,[128+rsi] + vmovdqu YMMWORD[rdi],ymm6 + vmovdqu YMMWORD[32+rdi],ymm8 + vmovdqu YMMWORD[64+rdi],ymm1 + vmovdqu YMMWORD[96+rdi],ymm5 + lea rdi,[128+rdi] + + vpxor ymm12,ymm12,YMMWORD[rsi] + vpxor ymm13,ymm13,YMMWORD[32+rsi] + vpxor ymm10,ymm10,YMMWORD[64+rsi] + vpxor ymm15,ymm15,YMMWORD[96+rsi] + lea rsi,[128+rsi] + vmovdqu YMMWORD[rdi],ymm12 + vmovdqu YMMWORD[32+rdi],ymm13 + vmovdqu YMMWORD[64+rdi],ymm10 + vmovdqu YMMWORD[96+rdi],ymm15 + lea rdi,[128+rdi] + + vpxor ymm14,ymm14,YMMWORD[rsi] + vpxor ymm2,ymm2,YMMWORD[32+rsi] + vpxor ymm3,ymm3,YMMWORD[64+rsi] + vpxor ymm7,ymm7,YMMWORD[96+rsi] + lea rsi,[128+rsi] + vmovdqu YMMWORD[rdi],ymm14 + vmovdqu YMMWORD[32+rdi],ymm2 + vmovdqu YMMWORD[64+rdi],ymm3 + vmovdqu YMMWORD[96+rdi],ymm7 + lea rdi,[128+rdi] + + vpxor ymm11,ymm11,YMMWORD[rsi] + vpxor ymm9,ymm9,YMMWORD[32+rsi] + vpxor ymm0,ymm0,YMMWORD[64+rsi] + vpxor ymm4,ymm4,YMMWORD[96+rsi] + lea rsi,[128+rsi] + vmovdqu YMMWORD[rdi],ymm11 + vmovdqu YMMWORD[32+rdi],ymm9 + vmovdqu YMMWORD[64+rdi],ymm0 + vmovdqu YMMWORD[96+rdi],ymm4 + lea rdi,[128+rdi] + + sub rdx,64*8 + jnz NEAR $L$oop_outer8x + + jmp NEAR $L$done8x + +$L$tail8x: + cmp rdx,448 + jae NEAR $L$448_or_more8x + cmp rdx,384 + jae NEAR $L$384_or_more8x + cmp rdx,320 + jae NEAR $L$320_or_more8x + cmp rdx,256 + jae NEAR $L$256_or_more8x + cmp rdx,192 + jae NEAR $L$192_or_more8x + cmp rdx,128 + jae NEAR $L$128_or_more8x + cmp rdx,64 + jae NEAR $L$64_or_more8x + + xor r10,r10 + vmovdqa YMMWORD[rsp],ymm6 + vmovdqa YMMWORD[32+rsp],ymm8 + jmp NEAR $L$oop_tail8x + +ALIGN 32 +$L$64_or_more8x: + vpxor ymm6,ymm6,YMMWORD[rsi] + vpxor ymm8,ymm8,YMMWORD[32+rsi] + vmovdqu YMMWORD[rdi],ymm6 + vmovdqu YMMWORD[32+rdi],ymm8 + je NEAR $L$done8x + + lea rsi,[64+rsi] + xor r10,r10 + vmovdqa YMMWORD[rsp],ymm1 + lea rdi,[64+rdi] + sub rdx,64 + vmovdqa YMMWORD[32+rsp],ymm5 + jmp NEAR $L$oop_tail8x + +ALIGN 32 +$L$128_or_more8x: + vpxor ymm6,ymm6,YMMWORD[rsi] + vpxor ymm8,ymm8,YMMWORD[32+rsi] + vpxor ymm1,ymm1,YMMWORD[64+rsi] + vpxor ymm5,ymm5,YMMWORD[96+rsi] + vmovdqu YMMWORD[rdi],ymm6 + vmovdqu YMMWORD[32+rdi],ymm8 + vmovdqu YMMWORD[64+rdi],ymm1 + vmovdqu YMMWORD[96+rdi],ymm5 + je NEAR $L$done8x + + lea rsi,[128+rsi] + xor r10,r10 + vmovdqa YMMWORD[rsp],ymm12 + lea rdi,[128+rdi] + sub rdx,128 + vmovdqa YMMWORD[32+rsp],ymm13 + jmp NEAR $L$oop_tail8x + +ALIGN 32 +$L$192_or_more8x: + vpxor ymm6,ymm6,YMMWORD[rsi] + vpxor ymm8,ymm8,YMMWORD[32+rsi] + vpxor ymm1,ymm1,YMMWORD[64+rsi] + vpxor ymm5,ymm5,YMMWORD[96+rsi] + vpxor ymm12,ymm12,YMMWORD[128+rsi] + vpxor ymm13,ymm13,YMMWORD[160+rsi] + vmovdqu YMMWORD[rdi],ymm6 + vmovdqu YMMWORD[32+rdi],ymm8 + vmovdqu YMMWORD[64+rdi],ymm1 + vmovdqu YMMWORD[96+rdi],ymm5 + vmovdqu YMMWORD[128+rdi],ymm12 + vmovdqu YMMWORD[160+rdi],ymm13 + je NEAR $L$done8x + + lea rsi,[192+rsi] + xor r10,r10 + vmovdqa YMMWORD[rsp],ymm10 + lea rdi,[192+rdi] + sub rdx,192 + vmovdqa YMMWORD[32+rsp],ymm15 + jmp NEAR $L$oop_tail8x + +ALIGN 32 +$L$256_or_more8x: + vpxor ymm6,ymm6,YMMWORD[rsi] + vpxor ymm8,ymm8,YMMWORD[32+rsi] + vpxor ymm1,ymm1,YMMWORD[64+rsi] + vpxor ymm5,ymm5,YMMWORD[96+rsi] + vpxor 
ymm12,ymm12,YMMWORD[128+rsi] + vpxor ymm13,ymm13,YMMWORD[160+rsi] + vpxor ymm10,ymm10,YMMWORD[192+rsi] + vpxor ymm15,ymm15,YMMWORD[224+rsi] + vmovdqu YMMWORD[rdi],ymm6 + vmovdqu YMMWORD[32+rdi],ymm8 + vmovdqu YMMWORD[64+rdi],ymm1 + vmovdqu YMMWORD[96+rdi],ymm5 + vmovdqu YMMWORD[128+rdi],ymm12 + vmovdqu YMMWORD[160+rdi],ymm13 + vmovdqu YMMWORD[192+rdi],ymm10 + vmovdqu YMMWORD[224+rdi],ymm15 + je NEAR $L$done8x + + lea rsi,[256+rsi] + xor r10,r10 + vmovdqa YMMWORD[rsp],ymm14 + lea rdi,[256+rdi] + sub rdx,256 + vmovdqa YMMWORD[32+rsp],ymm2 + jmp NEAR $L$oop_tail8x + +ALIGN 32 +$L$320_or_more8x: + vpxor ymm6,ymm6,YMMWORD[rsi] + vpxor ymm8,ymm8,YMMWORD[32+rsi] + vpxor ymm1,ymm1,YMMWORD[64+rsi] + vpxor ymm5,ymm5,YMMWORD[96+rsi] + vpxor ymm12,ymm12,YMMWORD[128+rsi] + vpxor ymm13,ymm13,YMMWORD[160+rsi] + vpxor ymm10,ymm10,YMMWORD[192+rsi] + vpxor ymm15,ymm15,YMMWORD[224+rsi] + vpxor ymm14,ymm14,YMMWORD[256+rsi] + vpxor ymm2,ymm2,YMMWORD[288+rsi] + vmovdqu YMMWORD[rdi],ymm6 + vmovdqu YMMWORD[32+rdi],ymm8 + vmovdqu YMMWORD[64+rdi],ymm1 + vmovdqu YMMWORD[96+rdi],ymm5 + vmovdqu YMMWORD[128+rdi],ymm12 + vmovdqu YMMWORD[160+rdi],ymm13 + vmovdqu YMMWORD[192+rdi],ymm10 + vmovdqu YMMWORD[224+rdi],ymm15 + vmovdqu YMMWORD[256+rdi],ymm14 + vmovdqu YMMWORD[288+rdi],ymm2 + je NEAR $L$done8x + + lea rsi,[320+rsi] + xor r10,r10 + vmovdqa YMMWORD[rsp],ymm3 + lea rdi,[320+rdi] + sub rdx,320 + vmovdqa YMMWORD[32+rsp],ymm7 + jmp NEAR $L$oop_tail8x + +ALIGN 32 +$L$384_or_more8x: + vpxor ymm6,ymm6,YMMWORD[rsi] + vpxor ymm8,ymm8,YMMWORD[32+rsi] + vpxor ymm1,ymm1,YMMWORD[64+rsi] + vpxor ymm5,ymm5,YMMWORD[96+rsi] + vpxor ymm12,ymm12,YMMWORD[128+rsi] + vpxor ymm13,ymm13,YMMWORD[160+rsi] + vpxor ymm10,ymm10,YMMWORD[192+rsi] + vpxor ymm15,ymm15,YMMWORD[224+rsi] + vpxor ymm14,ymm14,YMMWORD[256+rsi] + vpxor ymm2,ymm2,YMMWORD[288+rsi] + vpxor ymm3,ymm3,YMMWORD[320+rsi] + vpxor ymm7,ymm7,YMMWORD[352+rsi] + vmovdqu YMMWORD[rdi],ymm6 + vmovdqu YMMWORD[32+rdi],ymm8 + vmovdqu YMMWORD[64+rdi],ymm1 + vmovdqu YMMWORD[96+rdi],ymm5 + vmovdqu YMMWORD[128+rdi],ymm12 + vmovdqu YMMWORD[160+rdi],ymm13 + vmovdqu YMMWORD[192+rdi],ymm10 + vmovdqu YMMWORD[224+rdi],ymm15 + vmovdqu YMMWORD[256+rdi],ymm14 + vmovdqu YMMWORD[288+rdi],ymm2 + vmovdqu YMMWORD[320+rdi],ymm3 + vmovdqu YMMWORD[352+rdi],ymm7 + je NEAR $L$done8x + + lea rsi,[384+rsi] + xor r10,r10 + vmovdqa YMMWORD[rsp],ymm11 + lea rdi,[384+rdi] + sub rdx,384 + vmovdqa YMMWORD[32+rsp],ymm9 + jmp NEAR $L$oop_tail8x + +ALIGN 32 +$L$448_or_more8x: + vpxor ymm6,ymm6,YMMWORD[rsi] + vpxor ymm8,ymm8,YMMWORD[32+rsi] + vpxor ymm1,ymm1,YMMWORD[64+rsi] + vpxor ymm5,ymm5,YMMWORD[96+rsi] + vpxor ymm12,ymm12,YMMWORD[128+rsi] + vpxor ymm13,ymm13,YMMWORD[160+rsi] + vpxor ymm10,ymm10,YMMWORD[192+rsi] + vpxor ymm15,ymm15,YMMWORD[224+rsi] + vpxor ymm14,ymm14,YMMWORD[256+rsi] + vpxor ymm2,ymm2,YMMWORD[288+rsi] + vpxor ymm3,ymm3,YMMWORD[320+rsi] + vpxor ymm7,ymm7,YMMWORD[352+rsi] + vpxor ymm11,ymm11,YMMWORD[384+rsi] + vpxor ymm9,ymm9,YMMWORD[416+rsi] + vmovdqu YMMWORD[rdi],ymm6 + vmovdqu YMMWORD[32+rdi],ymm8 + vmovdqu YMMWORD[64+rdi],ymm1 + vmovdqu YMMWORD[96+rdi],ymm5 + vmovdqu YMMWORD[128+rdi],ymm12 + vmovdqu YMMWORD[160+rdi],ymm13 + vmovdqu YMMWORD[192+rdi],ymm10 + vmovdqu YMMWORD[224+rdi],ymm15 + vmovdqu YMMWORD[256+rdi],ymm14 + vmovdqu YMMWORD[288+rdi],ymm2 + vmovdqu YMMWORD[320+rdi],ymm3 + vmovdqu YMMWORD[352+rdi],ymm7 + vmovdqu YMMWORD[384+rdi],ymm11 + vmovdqu YMMWORD[416+rdi],ymm9 + je NEAR $L$done8x + + lea rsi,[448+rsi] + xor r10,r10 + vmovdqa YMMWORD[rsp],ymm0 + lea rdi,[448+rdi] + sub rdx,448 + 
vmovdqa YMMWORD[32+rsp],ymm4 + +$L$oop_tail8x: + movzx eax,BYTE[r10*1+rsi] + movzx ecx,BYTE[r10*1+rsp] + lea r10,[1+r10] + xor eax,ecx + mov BYTE[((-1))+r10*1+rdi],al + dec rdx + jnz NEAR $L$oop_tail8x + +$L$done8x: + vzeroall + movaps xmm6,XMMWORD[((-168))+r9] + movaps xmm7,XMMWORD[((-152))+r9] + movaps xmm8,XMMWORD[((-136))+r9] + movaps xmm9,XMMWORD[((-120))+r9] + movaps xmm10,XMMWORD[((-104))+r9] + movaps xmm11,XMMWORD[((-88))+r9] + movaps xmm12,XMMWORD[((-72))+r9] + movaps xmm13,XMMWORD[((-56))+r9] + movaps xmm14,XMMWORD[((-40))+r9] + movaps xmm15,XMMWORD[((-24))+r9] + lea rsp,[r9] + +$L$8x_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_ChaCha20_ctr32_avx2: +EXTERN __imp_RtlVirtualUnwind + +ALIGN 16 +se_handler: + push rsi + push rdi + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + pushfq + sub rsp,64 + + mov rax,QWORD[120+r8] + mov rbx,QWORD[248+r8] + + mov rsi,QWORD[8+r9] + mov r11,QWORD[56+r9] + + lea r10,[$L$ctr32_body] + cmp rbx,r10 + jb NEAR $L$common_seh_tail + + mov rax,QWORD[152+r8] + + lea r10,[$L$no_data] + cmp rbx,r10 + jae NEAR $L$common_seh_tail + + lea rax,[((64+24+48))+rax] + + mov rbx,QWORD[((-8))+rax] + mov rbp,QWORD[((-16))+rax] + mov r12,QWORD[((-24))+rax] + mov r13,QWORD[((-32))+rax] + mov r14,QWORD[((-40))+rax] + mov r15,QWORD[((-48))+rax] + mov QWORD[144+r8],rbx + mov QWORD[160+r8],rbp + mov QWORD[216+r8],r12 + mov QWORD[224+r8],r13 + mov QWORD[232+r8],r14 + mov QWORD[240+r8],r15 + +$L$common_seh_tail: + mov rdi,QWORD[8+rax] + mov rsi,QWORD[16+rax] + mov QWORD[152+r8],rax + mov QWORD[168+r8],rsi + mov QWORD[176+r8],rdi + + mov rdi,QWORD[40+r9] + mov rsi,r8 + mov ecx,154 + DD 0xa548f3fc + + mov rsi,r9 + xor rcx,rcx + mov rdx,QWORD[8+rsi] + mov r8,QWORD[rsi] + mov r9,QWORD[16+rsi] + mov r10,QWORD[40+rsi] + lea r11,[56+rsi] + lea r12,[24+rsi] + mov QWORD[32+rsp],r10 + mov QWORD[40+rsp],r11 + mov QWORD[48+rsp],r12 + mov QWORD[56+rsp],rcx + call QWORD[__imp_RtlVirtualUnwind] + + mov eax,1 + add rsp,64 + popfq + pop r15 + pop r14 + pop r13 + pop r12 + pop rbp + pop rbx + pop rdi + pop rsi + ret + + + +ALIGN 16 +ssse3_handler: + push rsi + push rdi + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + pushfq + sub rsp,64 + + mov rax,QWORD[120+r8] + mov rbx,QWORD[248+r8] + + mov rsi,QWORD[8+r9] + mov r11,QWORD[56+r9] + + mov r10d,DWORD[r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jb NEAR $L$common_seh_tail + + mov rax,QWORD[192+r8] + + mov r10d,DWORD[4+r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jae NEAR $L$common_seh_tail + + lea rsi,[((-40))+rax] + lea rdi,[512+r8] + mov ecx,4 + DD 0xa548f3fc + + jmp NEAR $L$common_seh_tail + + + +ALIGN 16 +full_handler: + push rsi + push rdi + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + pushfq + sub rsp,64 + + mov rax,QWORD[120+r8] + mov rbx,QWORD[248+r8] + + mov rsi,QWORD[8+r9] + mov r11,QWORD[56+r9] + + mov r10d,DWORD[r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jb NEAR $L$common_seh_tail + + mov rax,QWORD[192+r8] + + mov r10d,DWORD[4+r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jae NEAR $L$common_seh_tail + + lea rsi,[((-168))+rax] + lea rdi,[512+r8] + mov ecx,20 + DD 0xa548f3fc + + jmp NEAR $L$common_seh_tail + + +section .pdata rdata align=4 +ALIGN 4 + DD $L$SEH_begin_ChaCha20_ctr32_nohw wrt ..imagebase + DD $L$SEH_end_ChaCha20_ctr32_nohw wrt ..imagebase + DD $L$SEH_info_ChaCha20_ctr32_nohw wrt ..imagebase + + DD $L$SEH_begin_ChaCha20_ctr32_ssse3_4x wrt ..imagebase + DD $L$SEH_end_ChaCha20_ctr32_ssse3_4x wrt ..imagebase + DD 
$L$SEH_info_ChaCha20_ctr32_ssse3_4x wrt ..imagebase + DD $L$SEH_begin_ChaCha20_ctr32_avx2 wrt ..imagebase + DD $L$SEH_end_ChaCha20_ctr32_avx2 wrt ..imagebase + DD $L$SEH_info_ChaCha20_ctr32_avx2 wrt ..imagebase +section .xdata rdata align=8 +ALIGN 8 +$L$SEH_info_ChaCha20_ctr32_nohw: + DB 9,0,0,0 + DD se_handler wrt ..imagebase + +$L$SEH_info_ChaCha20_ctr32_ssse3_4x: + DB 9,0,0,0 + DD full_handler wrt ..imagebase + DD $L$4x_body wrt ..imagebase,$L$4x_epilogue wrt ..imagebase +$L$SEH_info_ChaCha20_ctr32_avx2: + DB 9,0,0,0 + DD full_handler wrt ..imagebase + DD $L$8x_body wrt ..imagebase,$L$8x_epilogue wrt ..imagebase +%else +; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 +ret +%endif diff --git a/ring-0.17.14/pregenerated/chacha-x86_64-nasm.o b/ring-0.17.14/pregenerated/chacha-x86_64-nasm.o new file mode 100644 index 0000000000..f606b4015f Binary files /dev/null and b/ring-0.17.14/pregenerated/chacha-x86_64-nasm.o differ diff --git a/ring-0.17.14/pregenerated/chacha20_poly1305_armv8-ios64.S b/ring-0.17.14/pregenerated/chacha20_poly1305_armv8-ios64.S new file mode 100644 index 0000000000..36e007434a --- /dev/null +++ b/ring-0.17.14/pregenerated/chacha20_poly1305_armv8-ios64.S @@ -0,0 +1,3008 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__) +.section __TEXT,__const + +.align 7 +Lchacha20_consts: +.byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k' +Linc: +.long 1,2,3,4 +Lrol8: +.byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 +Lclamp: +.quad 0x0FFFFFFC0FFFFFFF, 0x0FFFFFFC0FFFFFFC + +.text + + +.align 6 +Lpoly_hash_ad_internal: +.cfi_startproc + cbnz x4, Lpoly_hash_intro + ret + +Lpoly_hash_intro: + cmp x4, #16 + b.lt Lpoly_hash_ad_tail + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + sub x4, x4, #16 + b Lpoly_hash_ad_internal + +Lpoly_hash_ad_tail: + cbz x4, Lpoly_hash_ad_ret + + eor v20.16b, v20.16b, v20.16b // Use T0 to load the AAD + sub x4, x4, #1 + +Lpoly_hash_tail_16_compose: + ext v20.16b, v20.16b, v20.16b, #15 + ldrb w11, [x3, x4] + mov v20.b[0], w11 + subs x4, x4, #1 + b.ge Lpoly_hash_tail_16_compose + mov x11, v20.d[0] + mov x12, v20.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr 
+ and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + +Lpoly_hash_ad_ret: + ret +.cfi_endproc + + +///////////////////////////////// +// +// void chacha20_poly1305_seal(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, union open_data *seal_data); +// +.globl _chacha20_poly1305_seal +.private_extern _chacha20_poly1305_seal + +.align 6 +_chacha20_poly1305_seal: + AARCH64_SIGN_LINK_REGISTER +.cfi_startproc + stp x29, x30, [sp, #-80]! +.cfi_def_cfa_offset 80 +.cfi_offset w30, -72 +.cfi_offset w29, -80 + mov x29, sp + // We probably could do .cfi_def_cfa w29, 80 at this point, but since + // we don't actually use the frame pointer like that, it's probably not + // worth bothering. + stp d8, d9, [sp, #16] + stp d10, d11, [sp, #32] + stp d12, d13, [sp, #48] + stp d14, d15, [sp, #64] +.cfi_offset b15, -8 +.cfi_offset b14, -16 +.cfi_offset b13, -24 +.cfi_offset b12, -32 +.cfi_offset b11, -40 +.cfi_offset b10, -48 +.cfi_offset b9, -56 +.cfi_offset b8, -64 + + adrp x11, Lchacha20_consts@PAGE + add x11, x11, Lchacha20_consts@PAGEOFF + + ld1 {v24.16b - v27.16b}, [x11] // Load the CONSTS, INC, ROL8 and CLAMP values + ld1 {v28.16b - v30.16b}, [x5] + + mov x15, #1 // Prepare the Poly1305 state + mov x8, #0 + mov x9, #0 + mov x10, #0 + + ldr x12, [x5, #56] // The total cipher text length includes extra_in_len + add x12, x12, x2 + mov v31.d[0], x4 // Store the input and aad lengths + mov v31.d[1], x12 + + cmp x2, #128 + b.le Lseal_128 // Optimization for smaller buffers + + // Initially we prepare 5 ChaCha20 blocks. Four to encrypt up to 4 blocks (256 bytes) of plaintext, + // and one for the Poly1305 R and S keys. The first four blocks (A0-A3..D0-D3) are computed vertically, + // the fifth block (A4-D4) horizontally. 
+ ld4r {v0.4s,v1.4s,v2.4s,v3.4s}, [x11] + mov v4.16b, v24.16b + + ld4r {v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16 + mov v9.16b, v28.16b + + ld4r {v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16 + mov v14.16b, v29.16b + + ld4r {v15.4s,v16.4s,v17.4s,v18.4s}, [x5] + add v15.4s, v15.4s, v25.4s + mov v19.16b, v30.16b + + sub x5, x5, #32 + + mov x6, #10 + +.align 5 +Lseal_init_rounds: + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + add v3.4s, v3.4s, v8.4s + add v4.4s, v4.4s, v9.4s + + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + eor v18.16b, v18.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + rev32 v18.8h, v18.8h + rev32 v19.8h, v19.8h + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + add v13.4s, v13.4s, v18.4s + add v14.4s, v14.4s, v19.4s + + eor v5.16b, v5.16b, v10.16b + eor v6.16b, v6.16b, v11.16b + eor v7.16b, v7.16b, v12.16b + eor v8.16b, v8.16b, v13.16b + eor v9.16b, v9.16b, v14.16b + + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + ushr v5.4s, v6.4s, #20 + sli v5.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + ushr v7.4s, v8.4s, #20 + sli v7.4s, v8.4s, #12 + ushr v8.4s, v9.4s, #20 + sli v8.4s, v9.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + add v3.4s, v3.4s, v7.4s + add v4.4s, v4.4s, v8.4s + + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + eor v18.16b, v18.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + tbl v18.16b, {v18.16b}, v26.16b + tbl v19.16b, {v19.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + add v13.4s, v13.4s, v18.4s + add v14.4s, v14.4s, v19.4s + + eor v20.16b, v20.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v6.16b, v6.16b, v12.16b + eor v7.16b, v7.16b, v13.16b + eor v8.16b, v8.16b, v14.16b + + ushr v9.4s, v8.4s, #25 + sli v9.4s, v8.4s, #7 + ushr v8.4s, v7.4s, #25 + sli v8.4s, v7.4s, #7 + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v5.4s, #25 + sli v6.4s, v5.4s, #7 + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + + ext v9.16b, v9.16b, v9.16b, #4 + ext v14.16b, v14.16b, v14.16b, #8 + ext v19.16b, v19.16b, v19.16b, #12 + add v0.4s, v0.4s, v6.4s + add v1.4s, v1.4s, v7.4s + add v2.4s, v2.4s, v8.4s + add v3.4s, v3.4s, v5.4s + add v4.4s, v4.4s, v9.4s + + eor v18.16b, v18.16b, v0.16b + eor v15.16b, v15.16b, v1.16b + eor v16.16b, v16.16b, v2.16b + eor v17.16b, v17.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + rev32 v18.8h, v18.8h + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + rev32 v19.8h, v19.8h + + add v12.4s, v12.4s, v18.4s + add v13.4s, v13.4s, v15.4s + add v10.4s, v10.4s, v16.4s + add v11.4s, v11.4s, v17.4s + add v14.4s, v14.4s, v19.4s + + eor v6.16b, v6.16b, v12.16b + eor v7.16b, v7.16b, v13.16b + eor v8.16b, v8.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v9.16b, v9.16b, v14.16b + + ushr v20.4s, v6.4s, #20 + sli v20.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + ushr v7.4s, v8.4s, #20 + sli v7.4s, v8.4s, #12 + ushr v8.4s, v5.4s, #20 + sli v8.4s, v5.4s, #12 + ushr v5.4s, v9.4s, #20 + sli v5.4s, v9.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + add v3.4s, v3.4s, v8.4s + add v4.4s, v4.4s, v5.4s + + eor v18.16b, v18.16b, v0.16b + 
eor v15.16b, v15.16b, v1.16b + eor v16.16b, v16.16b, v2.16b + eor v17.16b, v17.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + tbl v18.16b, {v18.16b}, v26.16b + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + tbl v19.16b, {v19.16b}, v26.16b + + add v12.4s, v12.4s, v18.4s + add v13.4s, v13.4s, v15.4s + add v10.4s, v10.4s, v16.4s + add v11.4s, v11.4s, v17.4s + add v14.4s, v14.4s, v19.4s + + eor v20.16b, v20.16b, v12.16b + eor v6.16b, v6.16b, v13.16b + eor v7.16b, v7.16b, v10.16b + eor v8.16b, v8.16b, v11.16b + eor v5.16b, v5.16b, v14.16b + + ushr v9.4s, v5.4s, #25 + sli v9.4s, v5.4s, #7 + ushr v5.4s, v8.4s, #25 + sli v5.4s, v8.4s, #7 + ushr v8.4s, v7.4s, #25 + sli v8.4s, v7.4s, #7 + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v20.4s, #25 + sli v6.4s, v20.4s, #7 + + ext v9.16b, v9.16b, v9.16b, #12 + ext v14.16b, v14.16b, v14.16b, #8 + ext v19.16b, v19.16b, v19.16b, #4 + subs x6, x6, #1 + b.hi Lseal_init_rounds + + add v15.4s, v15.4s, v25.4s + mov x11, #4 + dup v20.4s, w11 + add v25.4s, v25.4s, v20.4s + + zip1 v20.4s, v0.4s, v1.4s + zip2 v21.4s, v0.4s, v1.4s + zip1 v22.4s, v2.4s, v3.4s + zip2 v23.4s, v2.4s, v3.4s + + zip1 v0.2d, v20.2d, v22.2d + zip2 v1.2d, v20.2d, v22.2d + zip1 v2.2d, v21.2d, v23.2d + zip2 v3.2d, v21.2d, v23.2d + + zip1 v20.4s, v5.4s, v6.4s + zip2 v21.4s, v5.4s, v6.4s + zip1 v22.4s, v7.4s, v8.4s + zip2 v23.4s, v7.4s, v8.4s + + zip1 v5.2d, v20.2d, v22.2d + zip2 v6.2d, v20.2d, v22.2d + zip1 v7.2d, v21.2d, v23.2d + zip2 v8.2d, v21.2d, v23.2d + + zip1 v20.4s, v10.4s, v11.4s + zip2 v21.4s, v10.4s, v11.4s + zip1 v22.4s, v12.4s, v13.4s + zip2 v23.4s, v12.4s, v13.4s + + zip1 v10.2d, v20.2d, v22.2d + zip2 v11.2d, v20.2d, v22.2d + zip1 v12.2d, v21.2d, v23.2d + zip2 v13.2d, v21.2d, v23.2d + + zip1 v20.4s, v15.4s, v16.4s + zip2 v21.4s, v15.4s, v16.4s + zip1 v22.4s, v17.4s, v18.4s + zip2 v23.4s, v17.4s, v18.4s + + zip1 v15.2d, v20.2d, v22.2d + zip2 v16.2d, v20.2d, v22.2d + zip1 v17.2d, v21.2d, v23.2d + zip2 v18.2d, v21.2d, v23.2d + + add v4.4s, v4.4s, v24.4s + add v9.4s, v9.4s, v28.4s + and v4.16b, v4.16b, v27.16b + + add v0.4s, v0.4s, v24.4s + add v5.4s, v5.4s, v28.4s + add v10.4s, v10.4s, v29.4s + add v15.4s, v15.4s, v30.4s + + add v1.4s, v1.4s, v24.4s + add v6.4s, v6.4s, v28.4s + add v11.4s, v11.4s, v29.4s + add v16.4s, v16.4s, v30.4s + + add v2.4s, v2.4s, v24.4s + add v7.4s, v7.4s, v28.4s + add v12.4s, v12.4s, v29.4s + add v17.4s, v17.4s, v30.4s + + add v3.4s, v3.4s, v24.4s + add v8.4s, v8.4s, v28.4s + add v13.4s, v13.4s, v29.4s + add v18.4s, v18.4s, v30.4s + + mov x16, v4.d[0] // Move the R key to GPRs + mov x17, v4.d[1] + mov v27.16b, v9.16b // Store the S key + + bl Lpoly_hash_ad_internal + + mov x3, x0 + cmp x2, #256 + b.le Lseal_tail + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v0.16b + eor v21.16b, v21.16b, v5.16b + eor v22.16b, v22.16b, v10.16b + eor v23.16b, v23.16b, v15.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v1.16b + eor v21.16b, v21.16b, v6.16b + eor v22.16b, v22.16b, v11.16b + eor v23.16b, v23.16b, v16.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v2.16b + eor v21.16b, v21.16b, v7.16b + eor v22.16b, v22.16b, v12.16b + eor v23.16b, v23.16b, v17.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v3.16b + eor v21.16b, v21.16b, v8.16b + eor v22.16b, v22.16b, v13.16b + eor v23.16b, v23.16b, v18.16b + st1 {v20.16b - 
v23.16b}, [x0], #64 + + sub x2, x2, #256 + + mov x6, #4 // In the first run of the loop we need to hash 256 bytes, therefore we hash one block for the first 4 rounds + mov x7, #6 // and two blocks for the remaining 6, for a total of (1 * 4 + 2 * 6) * 16 = 256 + +Lseal_main_loop: + adrp x11, Lchacha20_consts@PAGE + add x11, x11, Lchacha20_consts@PAGEOFF + + ld4r {v0.4s,v1.4s,v2.4s,v3.4s}, [x11] + mov v4.16b, v24.16b + + ld4r {v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16 + mov v9.16b, v28.16b + + ld4r {v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16 + mov v14.16b, v29.16b + + ld4r {v15.4s,v16.4s,v17.4s,v18.4s}, [x5] + add v15.4s, v15.4s, v25.4s + mov v19.16b, v30.16b + + eor v20.16b, v20.16b, v20.16b //zero + not v21.16b, v20.16b // -1 + sub v21.4s, v25.4s, v21.4s // Add +1 + ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter) + add v19.4s, v19.4s, v20.4s + + sub x5, x5, #32 +.align 5 +Lseal_main_loop_rounds: + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + add v3.4s, v3.4s, v8.4s + add v4.4s, v4.4s, v9.4s + + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + eor v18.16b, v18.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + rev32 v18.8h, v18.8h + rev32 v19.8h, v19.8h + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + add v13.4s, v13.4s, v18.4s + add v14.4s, v14.4s, v19.4s + + eor v5.16b, v5.16b, v10.16b + eor v6.16b, v6.16b, v11.16b + eor v7.16b, v7.16b, v12.16b + eor v8.16b, v8.16b, v13.16b + eor v9.16b, v9.16b, v14.16b + + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + ushr v5.4s, v6.4s, #20 + sli v5.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + ushr v7.4s, v8.4s, #20 + sli v7.4s, v8.4s, #12 + ushr v8.4s, v9.4s, #20 + sli v8.4s, v9.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + add v3.4s, v3.4s, v7.4s + add v4.4s, v4.4s, v8.4s + + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + eor v18.16b, v18.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + tbl v18.16b, {v18.16b}, v26.16b + tbl v19.16b, {v19.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + add v13.4s, v13.4s, v18.4s + add v14.4s, v14.4s, v19.4s + + eor v20.16b, v20.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v6.16b, v6.16b, v12.16b + eor v7.16b, v7.16b, v13.16b + eor v8.16b, v8.16b, v14.16b + + ushr v9.4s, v8.4s, #25 + sli v9.4s, v8.4s, #7 + ushr v8.4s, v7.4s, #25 + sli v8.4s, v7.4s, #7 + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v5.4s, #25 + sli v6.4s, v5.4s, #7 + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + + ext v9.16b, v9.16b, v9.16b, #4 + ext v14.16b, v14.16b, v14.16b, #8 + ext v19.16b, v19.16b, v19.16b, #12 + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most 
(value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + add v0.4s, v0.4s, v6.4s + add v1.4s, v1.4s, v7.4s + add v2.4s, v2.4s, v8.4s + add v3.4s, v3.4s, v5.4s + add v4.4s, v4.4s, v9.4s + + eor v18.16b, v18.16b, v0.16b + eor v15.16b, v15.16b, v1.16b + eor v16.16b, v16.16b, v2.16b + eor v17.16b, v17.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + rev32 v18.8h, v18.8h + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + rev32 v19.8h, v19.8h + + add v12.4s, v12.4s, v18.4s + add v13.4s, v13.4s, v15.4s + add v10.4s, v10.4s, v16.4s + add v11.4s, v11.4s, v17.4s + add v14.4s, v14.4s, v19.4s + + eor v6.16b, v6.16b, v12.16b + eor v7.16b, v7.16b, v13.16b + eor v8.16b, v8.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v9.16b, v9.16b, v14.16b + + ushr v20.4s, v6.4s, #20 + sli v20.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + ushr v7.4s, v8.4s, #20 + sli v7.4s, v8.4s, #12 + ushr v8.4s, v5.4s, #20 + sli v8.4s, v5.4s, #12 + ushr v5.4s, v9.4s, #20 + sli v5.4s, v9.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + add v3.4s, v3.4s, v8.4s + add v4.4s, v4.4s, v5.4s + + eor v18.16b, v18.16b, v0.16b + eor v15.16b, v15.16b, v1.16b + eor v16.16b, v16.16b, v2.16b + eor v17.16b, v17.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + tbl v18.16b, {v18.16b}, v26.16b + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + tbl v19.16b, {v19.16b}, v26.16b + + add v12.4s, v12.4s, v18.4s + add v13.4s, v13.4s, v15.4s + add v10.4s, v10.4s, v16.4s + add v11.4s, v11.4s, v17.4s + add v14.4s, v14.4s, v19.4s + + eor v20.16b, v20.16b, v12.16b + eor v6.16b, v6.16b, v13.16b + eor v7.16b, v7.16b, v10.16b + eor v8.16b, v8.16b, v11.16b + eor v5.16b, v5.16b, v14.16b + + ushr v9.4s, v5.4s, #25 + sli v9.4s, v5.4s, #7 + ushr v5.4s, v8.4s, #25 + sli v5.4s, v8.4s, #7 + ushr v8.4s, v7.4s, #25 + sli v8.4s, v7.4s, #7 + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v20.4s, #25 + sli v6.4s, v20.4s, #7 + + ext v9.16b, v9.16b, v9.16b, #12 + ext v14.16b, v14.16b, v14.16b, #8 + ext v19.16b, v19.16b, v19.16b, #4 + subs x6, x6, #1 + b.ge Lseal_main_loop_rounds + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + subs x7, x7, #1 + b.gt Lseal_main_loop_rounds + + eor v20.16b, v20.16b, v20.16b //zero + not v21.16b, v20.16b // -1 + sub v21.4s, v25.4s, v21.4s // Add +1 + ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter) + add v19.4s, v19.4s, v20.4s + + add v15.4s, v15.4s, v25.4s + mov x11, #5 + dup v20.4s, w11 + 
add v25.4s, v25.4s, v20.4s + + zip1 v20.4s, v0.4s, v1.4s + zip2 v21.4s, v0.4s, v1.4s + zip1 v22.4s, v2.4s, v3.4s + zip2 v23.4s, v2.4s, v3.4s + + zip1 v0.2d, v20.2d, v22.2d + zip2 v1.2d, v20.2d, v22.2d + zip1 v2.2d, v21.2d, v23.2d + zip2 v3.2d, v21.2d, v23.2d + + zip1 v20.4s, v5.4s, v6.4s + zip2 v21.4s, v5.4s, v6.4s + zip1 v22.4s, v7.4s, v8.4s + zip2 v23.4s, v7.4s, v8.4s + + zip1 v5.2d, v20.2d, v22.2d + zip2 v6.2d, v20.2d, v22.2d + zip1 v7.2d, v21.2d, v23.2d + zip2 v8.2d, v21.2d, v23.2d + + zip1 v20.4s, v10.4s, v11.4s + zip2 v21.4s, v10.4s, v11.4s + zip1 v22.4s, v12.4s, v13.4s + zip2 v23.4s, v12.4s, v13.4s + + zip1 v10.2d, v20.2d, v22.2d + zip2 v11.2d, v20.2d, v22.2d + zip1 v12.2d, v21.2d, v23.2d + zip2 v13.2d, v21.2d, v23.2d + + zip1 v20.4s, v15.4s, v16.4s + zip2 v21.4s, v15.4s, v16.4s + zip1 v22.4s, v17.4s, v18.4s + zip2 v23.4s, v17.4s, v18.4s + + zip1 v15.2d, v20.2d, v22.2d + zip2 v16.2d, v20.2d, v22.2d + zip1 v17.2d, v21.2d, v23.2d + zip2 v18.2d, v21.2d, v23.2d + + add v0.4s, v0.4s, v24.4s + add v5.4s, v5.4s, v28.4s + add v10.4s, v10.4s, v29.4s + add v15.4s, v15.4s, v30.4s + + add v1.4s, v1.4s, v24.4s + add v6.4s, v6.4s, v28.4s + add v11.4s, v11.4s, v29.4s + add v16.4s, v16.4s, v30.4s + + add v2.4s, v2.4s, v24.4s + add v7.4s, v7.4s, v28.4s + add v12.4s, v12.4s, v29.4s + add v17.4s, v17.4s, v30.4s + + add v3.4s, v3.4s, v24.4s + add v8.4s, v8.4s, v28.4s + add v13.4s, v13.4s, v29.4s + add v18.4s, v18.4s, v30.4s + + add v4.4s, v4.4s, v24.4s + add v9.4s, v9.4s, v28.4s + add v14.4s, v14.4s, v29.4s + add v19.4s, v19.4s, v30.4s + + cmp x2, #320 + b.le Lseal_tail + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v0.16b + eor v21.16b, v21.16b, v5.16b + eor v22.16b, v22.16b, v10.16b + eor v23.16b, v23.16b, v15.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v1.16b + eor v21.16b, v21.16b, v6.16b + eor v22.16b, v22.16b, v11.16b + eor v23.16b, v23.16b, v16.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v2.16b + eor v21.16b, v21.16b, v7.16b + eor v22.16b, v22.16b, v12.16b + eor v23.16b, v23.16b, v17.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v3.16b + eor v21.16b, v21.16b, v8.16b + eor v22.16b, v22.16b, v13.16b + eor v23.16b, v23.16b, v18.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v4.16b + eor v21.16b, v21.16b, v9.16b + eor v22.16b, v22.16b, v14.16b + eor v23.16b, v23.16b, v19.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + sub x2, x2, #320 + + mov x6, #0 + mov x7, #10 // For the remainder of the loop we always hash and encrypt 320 bytes per iteration + + b Lseal_main_loop + +Lseal_tail: + // This part of the function handles the storage and authentication of the last [0,320) bytes + // We assume A0-A4 ... D0-D4 hold at least inl (320 max) bytes of the stream data. 
+ cmp x2, #64 + b.lt Lseal_tail_64 + + // Store and authenticate 64B blocks per iteration + ld1 {v20.16b - v23.16b}, [x1], #64 + + eor v20.16b, v20.16b, v0.16b + eor v21.16b, v21.16b, v5.16b + eor v22.16b, v22.16b, v10.16b + eor v23.16b, v23.16b, v15.16b + mov x11, v20.d[0] + mov x12, v20.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + mov x11, v21.d[0] + mov x12, v21.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + mov x11, v22.d[0] + mov x12, v22.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + mov x11, v23.d[0] + mov x12, v23.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits 
and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + st1 {v20.16b - v23.16b}, [x0], #64 + sub x2, x2, #64 + + // Shift the state left by 64 bytes for the next iteration of the loop + mov v0.16b, v1.16b + mov v5.16b, v6.16b + mov v10.16b, v11.16b + mov v15.16b, v16.16b + + mov v1.16b, v2.16b + mov v6.16b, v7.16b + mov v11.16b, v12.16b + mov v16.16b, v17.16b + + mov v2.16b, v3.16b + mov v7.16b, v8.16b + mov v12.16b, v13.16b + mov v17.16b, v18.16b + + mov v3.16b, v4.16b + mov v8.16b, v9.16b + mov v13.16b, v14.16b + mov v18.16b, v19.16b + + b Lseal_tail + +Lseal_tail_64: + ldp x3, x4, [x5, #48] // extra_in_len and extra_in_ptr + + // Here we handle the last [0,64) bytes of plaintext + cmp x2, #16 + b.lt Lseal_tail_16 + // Each iteration encrypt and authenticate a 16B block + ld1 {v20.16b}, [x1], #16 + eor v20.16b, v20.16b, v0.16b + mov x11, v20.d[0] + mov x12, v20.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + st1 {v20.16b}, [x0], #16 + + sub x2, x2, #16 + + // Shift the state left by 16 bytes for the next iteration of the loop + mov v0.16b, v5.16b + mov v5.16b, v10.16b + mov v10.16b, v15.16b + + b Lseal_tail_64 + +Lseal_tail_16: + // Here we handle the last [0,16) bytes of ciphertext that require a padded block + cbz x2, Lseal_hash_extra + + eor v20.16b, v20.16b, v20.16b // Use T0 to load the plaintext/extra in + eor v21.16b, v21.16b, v21.16b // Use T1 to generate an AND mask that will only mask the ciphertext bytes + not v22.16b, v20.16b + + mov x6, x2 + add x1, x1, x2 + + cbz x4, Lseal_tail_16_compose // No extra data to pad with, zero padding + + mov x7, #16 // We need to load some extra_in first for padding + sub x7, x7, x2 + cmp x4, x7 + csel x7, x4, x7, lt // Load the minimum of extra_in_len and the amount needed to fill the register + mov x12, x7 + add x3, x3, x7 + sub x4, x4, x7 + +Lseal_tail16_compose_extra_in: + ext v20.16b, v20.16b, v20.16b, #15 + ldrb w11, [x3, #-1]! + mov v20.b[0], w11 + subs x7, x7, #1 + b.gt Lseal_tail16_compose_extra_in + + add x3, x3, x12 + +Lseal_tail_16_compose: + ext v20.16b, v20.16b, v20.16b, #15 + ldrb w11, [x1, #-1]! 
+ mov v20.b[0], w11 + ext v21.16b, v22.16b, v21.16b, #15 + subs x2, x2, #1 + b.gt Lseal_tail_16_compose + + and v0.16b, v0.16b, v21.16b + eor v20.16b, v20.16b, v0.16b + mov v21.16b, v20.16b + +Lseal_tail_16_store: + umov w11, v20.b[0] + strb w11, [x0], #1 + ext v20.16b, v20.16b, v20.16b, #1 + subs x6, x6, #1 + b.gt Lseal_tail_16_store + + // Hash in the final ct block concatenated with extra_in + mov x11, v21.d[0] + mov x12, v21.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + +Lseal_hash_extra: + cbz x4, Lseal_finalize + +Lseal_hash_extra_loop: + cmp x4, #16 + b.lt Lseal_hash_extra_tail + ld1 {v20.16b}, [x3], #16 + mov x11, v20.d[0] + mov x12, v20.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + sub x4, x4, #16 + b Lseal_hash_extra_loop + +Lseal_hash_extra_tail: + cbz x4, Lseal_finalize + eor v20.16b, v20.16b, v20.16b // Use T0 to load the remaining extra ciphertext + add x3, x3, x4 + +Lseal_hash_extra_load: + ext v20.16b, v20.16b, v20.16b, #15 + ldrb w11, [x3, #-1]! 
+ mov v20.b[0], w11 + subs x4, x4, #1 + b.gt Lseal_hash_extra_load + + // Hash in the final padded extra_in blcok + mov x11, v20.d[0] + mov x12, v20.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + +Lseal_finalize: + mov x11, v31.d[0] + mov x12, v31.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + // Final reduction step + sub x12, xzr, x15 + orr x13, xzr, #3 + subs x11, x8, #-5 + sbcs x12, x9, x12 + sbcs x13, x10, x13 + csel x8, x11, x8, cs + csel x9, x12, x9, cs + csel x10, x13, x10, cs + mov x11, v27.d[0] + mov x12, v27.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + + stp x8, x9, [x5] + + ldp d8, d9, [sp, #16] + ldp d10, d11, [sp, #32] + ldp d12, d13, [sp, #48] + ldp d14, d15, [sp, #64] +.cfi_restore b15 +.cfi_restore b14 +.cfi_restore b13 +.cfi_restore b12 +.cfi_restore b11 +.cfi_restore b10 +.cfi_restore b9 +.cfi_restore b8 + ldp x29, x30, [sp], 80 +.cfi_restore w29 +.cfi_restore w30 +.cfi_def_cfa_offset 0 + AARCH64_VALIDATE_LINK_REGISTER + ret + +Lseal_128: + // On some architectures preparing 5 blocks for small buffers is wasteful + eor v25.16b, v25.16b, v25.16b + mov x11, #1 + mov v25.s[0], w11 + mov v0.16b, v24.16b + mov v1.16b, v24.16b + mov v2.16b, v24.16b + mov v5.16b, v28.16b + mov v6.16b, v28.16b + mov v7.16b, v28.16b + mov v10.16b, v29.16b + mov v11.16b, v29.16b + mov v12.16b, v29.16b + mov v17.16b, v30.16b + add v15.4s, v17.4s, v25.4s + add v16.4s, v15.4s, v25.4s + + mov x6, #10 + +Lseal_128_rounds: + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v5.16b, v5.16b, v10.16b + eor v6.16b, v6.16b, v11.16b + eor v7.16b, v7.16b, v12.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + ushr v5.4s, v6.4s, #20 + sli v5.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli 
v6.4s, v7.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v20.16b, v20.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v6.16b, v6.16b, v12.16b + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v5.4s, #25 + sli v6.4s, v5.4s, #7 + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + + ext v5.16b, v5.16b, v5.16b, #4 + ext v6.16b, v6.16b, v6.16b, #4 + ext v7.16b, v7.16b, v7.16b, #4 + + ext v10.16b, v10.16b, v10.16b, #8 + ext v11.16b, v11.16b, v11.16b, #8 + ext v12.16b, v12.16b, v12.16b, #8 + + ext v15.16b, v15.16b, v15.16b, #12 + ext v16.16b, v16.16b, v16.16b, #12 + ext v17.16b, v17.16b, v17.16b, #12 + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v5.16b, v5.16b, v10.16b + eor v6.16b, v6.16b, v11.16b + eor v7.16b, v7.16b, v12.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + ushr v5.4s, v6.4s, #20 + sli v5.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v20.16b, v20.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v6.16b, v6.16b, v12.16b + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v5.4s, #25 + sli v6.4s, v5.4s, #7 + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + + ext v5.16b, v5.16b, v5.16b, #12 + ext v6.16b, v6.16b, v6.16b, #12 + ext v7.16b, v7.16b, v7.16b, #12 + + ext v10.16b, v10.16b, v10.16b, #8 + ext v11.16b, v11.16b, v11.16b, #8 + ext v12.16b, v12.16b, v12.16b, #8 + + ext v15.16b, v15.16b, v15.16b, #4 + ext v16.16b, v16.16b, v16.16b, #4 + ext v17.16b, v17.16b, v17.16b, #4 + subs x6, x6, #1 + b.hi Lseal_128_rounds + + add v0.4s, v0.4s, v24.4s + add v1.4s, v1.4s, v24.4s + add v2.4s, v2.4s, v24.4s + + add v5.4s, v5.4s, v28.4s + add v6.4s, v6.4s, v28.4s + add v7.4s, v7.4s, v28.4s + + // Only the first 32 bytes of the third block (counter = 0) are needed, + // so skip updating v12 and v17. + add v10.4s, v10.4s, v29.4s + add v11.4s, v11.4s, v29.4s + + add v30.4s, v30.4s, v25.4s + add v15.4s, v15.4s, v30.4s + add v30.4s, v30.4s, v25.4s + add v16.4s, v16.4s, v30.4s + + and v2.16b, v2.16b, v27.16b + mov x16, v2.d[0] // Move the R key to GPRs + mov x17, v2.d[1] + mov v27.16b, v7.16b // Store the S key + + bl Lpoly_hash_ad_internal + b Lseal_tail +.cfi_endproc + + +///////////////////////////////// +// +// void chacha20_poly1305_open(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, union open_data *aead_data); +// +.globl _chacha20_poly1305_open +.private_extern _chacha20_poly1305_open + +.align 6 +_chacha20_poly1305_open: + AARCH64_SIGN_LINK_REGISTER +.cfi_startproc + stp x29, x30, [sp, #-80]! 
+.cfi_def_cfa_offset 80 +.cfi_offset w30, -72 +.cfi_offset w29, -80 + mov x29, sp + // We probably could do .cfi_def_cfa w29, 80 at this point, but since + // we don't actually use the frame pointer like that, it's probably not + // worth bothering. + stp d8, d9, [sp, #16] + stp d10, d11, [sp, #32] + stp d12, d13, [sp, #48] + stp d14, d15, [sp, #64] +.cfi_offset b15, -8 +.cfi_offset b14, -16 +.cfi_offset b13, -24 +.cfi_offset b12, -32 +.cfi_offset b11, -40 +.cfi_offset b10, -48 +.cfi_offset b9, -56 +.cfi_offset b8, -64 + + adrp x11, Lchacha20_consts@PAGE + add x11, x11, Lchacha20_consts@PAGEOFF + + ld1 {v24.16b - v27.16b}, [x11] // Load the CONSTS, INC, ROL8 and CLAMP values + ld1 {v28.16b - v30.16b}, [x5] + + mov x15, #1 // Prepare the Poly1305 state + mov x8, #0 + mov x9, #0 + mov x10, #0 + + mov v31.d[0], x4 // Store the input and aad lengths + mov v31.d[1], x2 + + cmp x2, #128 + b.le Lopen_128 // Optimization for smaller buffers + + // Initially we prepare a single ChaCha20 block for the Poly1305 R and S keys + mov v0.16b, v24.16b + mov v5.16b, v28.16b + mov v10.16b, v29.16b + mov v15.16b, v30.16b + + mov x6, #10 + +.align 5 +Lopen_init_rounds: + add v0.4s, v0.4s, v5.4s + eor v15.16b, v15.16b, v0.16b + rev32 v15.8h, v15.8h + + add v10.4s, v10.4s, v15.4s + eor v5.16b, v5.16b, v10.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + add v0.4s, v0.4s, v20.4s + eor v15.16b, v15.16b, v0.16b + tbl v15.16b, {v15.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + eor v20.16b, v20.16b, v10.16b + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + ext v5.16b, v5.16b, v5.16b, #4 + ext v10.16b, v10.16b, v10.16b, #8 + ext v15.16b, v15.16b, v15.16b, #12 + add v0.4s, v0.4s, v5.4s + eor v15.16b, v15.16b, v0.16b + rev32 v15.8h, v15.8h + + add v10.4s, v10.4s, v15.4s + eor v5.16b, v5.16b, v10.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + add v0.4s, v0.4s, v20.4s + eor v15.16b, v15.16b, v0.16b + tbl v15.16b, {v15.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + eor v20.16b, v20.16b, v10.16b + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + ext v5.16b, v5.16b, v5.16b, #12 + ext v10.16b, v10.16b, v10.16b, #8 + ext v15.16b, v15.16b, v15.16b, #4 + subs x6, x6, #1 + b.hi Lopen_init_rounds + + add v0.4s, v0.4s, v24.4s + add v5.4s, v5.4s, v28.4s + + and v0.16b, v0.16b, v27.16b + mov x16, v0.d[0] // Move the R key to GPRs + mov x17, v0.d[1] + mov v27.16b, v5.16b // Store the S key + + bl Lpoly_hash_ad_internal + +Lopen_ad_done: + mov x3, x1 + +// Each iteration of the loop hash 320 bytes, and prepare stream for 320 bytes +Lopen_main_loop: + + cmp x2, #192 + b.lt Lopen_tail + + adrp x11, Lchacha20_consts@PAGE + add x11, x11, Lchacha20_consts@PAGEOFF + + ld4r {v0.4s,v1.4s,v2.4s,v3.4s}, [x11] + mov v4.16b, v24.16b + + ld4r {v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16 + mov v9.16b, v28.16b + + ld4r {v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16 + mov v14.16b, v29.16b + + ld4r {v15.4s,v16.4s,v17.4s,v18.4s}, [x5] + sub x5, x5, #32 + add v15.4s, v15.4s, v25.4s + mov v19.16b, v30.16b + + eor v20.16b, v20.16b, v20.16b //zero + not v21.16b, v20.16b // -1 + sub v21.4s, v25.4s, v21.4s // Add +1 + ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter) + add v19.4s, v19.4s, v20.4s + + lsr x4, x2, #4 // How many whole blocks we have to hash, will always be at least 12 + sub x4, x4, #10 + + mov x7, #10 + subs x6, x7, x4 + subs x6, x7, x4 // itr1 can be negative if we have more than 320 bytes to hash + csel x7, x7, x4, le // if itr1 is zero or less, itr2 should be 10 to indicate all 10 rounds are full + + cbz x7, 
Lopen_main_loop_rounds_short + +.align 5 +Lopen_main_loop_rounds: + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most +Lopen_main_loop_rounds_short: + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + add v3.4s, v3.4s, v8.4s + add v4.4s, v4.4s, v9.4s + + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + eor v18.16b, v18.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + rev32 v18.8h, v18.8h + rev32 v19.8h, v19.8h + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + add v13.4s, v13.4s, v18.4s + add v14.4s, v14.4s, v19.4s + + eor v5.16b, v5.16b, v10.16b + eor v6.16b, v6.16b, v11.16b + eor v7.16b, v7.16b, v12.16b + eor v8.16b, v8.16b, v13.16b + eor v9.16b, v9.16b, v14.16b + + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + ushr v5.4s, v6.4s, #20 + sli v5.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + ushr v7.4s, v8.4s, #20 + sli v7.4s, v8.4s, #12 + ushr v8.4s, v9.4s, #20 + sli v8.4s, v9.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + add v3.4s, v3.4s, v7.4s + add v4.4s, v4.4s, v8.4s + + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + eor v18.16b, v18.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + tbl v18.16b, {v18.16b}, v26.16b + tbl v19.16b, {v19.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + add v13.4s, v13.4s, v18.4s + add v14.4s, v14.4s, v19.4s + + eor v20.16b, v20.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v6.16b, v6.16b, v12.16b + eor v7.16b, v7.16b, v13.16b + eor v8.16b, v8.16b, v14.16b + + ushr v9.4s, v8.4s, #25 + sli v9.4s, v8.4s, #7 + ushr v8.4s, v7.4s, #25 + sli v8.4s, v7.4s, #7 + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v5.4s, #25 + sli v6.4s, v5.4s, #7 + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + + ext v9.16b, v9.16b, v9.16b, #4 + ext v14.16b, v14.16b, v14.16b, #8 + ext v19.16b, v19.16b, v19.16b, #12 + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 
is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + add v0.4s, v0.4s, v6.4s + add v1.4s, v1.4s, v7.4s + add v2.4s, v2.4s, v8.4s + add v3.4s, v3.4s, v5.4s + add v4.4s, v4.4s, v9.4s + + eor v18.16b, v18.16b, v0.16b + eor v15.16b, v15.16b, v1.16b + eor v16.16b, v16.16b, v2.16b + eor v17.16b, v17.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + rev32 v18.8h, v18.8h + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + rev32 v19.8h, v19.8h + + add v12.4s, v12.4s, v18.4s + add v13.4s, v13.4s, v15.4s + add v10.4s, v10.4s, v16.4s + add v11.4s, v11.4s, v17.4s + add v14.4s, v14.4s, v19.4s + + eor v6.16b, v6.16b, v12.16b + eor v7.16b, v7.16b, v13.16b + eor v8.16b, v8.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v9.16b, v9.16b, v14.16b + + ushr v20.4s, v6.4s, #20 + sli v20.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + ushr v7.4s, v8.4s, #20 + sli v7.4s, v8.4s, #12 + ushr v8.4s, v5.4s, #20 + sli v8.4s, v5.4s, #12 + ushr v5.4s, v9.4s, #20 + sli v5.4s, v9.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + add v3.4s, v3.4s, v8.4s + add v4.4s, v4.4s, v5.4s + + eor v18.16b, v18.16b, v0.16b + eor v15.16b, v15.16b, v1.16b + eor v16.16b, v16.16b, v2.16b + eor v17.16b, v17.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + tbl v18.16b, {v18.16b}, v26.16b + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + tbl v19.16b, {v19.16b}, v26.16b + + add v12.4s, v12.4s, v18.4s + add v13.4s, v13.4s, v15.4s + add v10.4s, v10.4s, v16.4s + add v11.4s, v11.4s, v17.4s + add v14.4s, v14.4s, v19.4s + + eor v20.16b, v20.16b, v12.16b + eor v6.16b, v6.16b, v13.16b + eor v7.16b, v7.16b, v10.16b + eor v8.16b, v8.16b, v11.16b + eor v5.16b, v5.16b, v14.16b + + ushr v9.4s, v5.4s, #25 + sli v9.4s, v5.4s, #7 + ushr v5.4s, v8.4s, #25 + sli v5.4s, v8.4s, #7 + ushr v8.4s, v7.4s, #25 + sli v8.4s, v7.4s, #7 + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v20.4s, #25 + sli v6.4s, v20.4s, #7 + + ext v9.16b, v9.16b, v9.16b, #12 + ext v14.16b, v14.16b, v14.16b, #8 + ext v19.16b, v19.16b, v19.16b, #4 + subs x7, x7, #1 + b.gt Lopen_main_loop_rounds + subs x6, x6, #1 + b.ge Lopen_main_loop_rounds_short + + eor v20.16b, v20.16b, v20.16b //zero + not v21.16b, v20.16b // -1 + sub v21.4s, v25.4s, v21.4s // Add +1 + ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter) + add v19.4s, v19.4s, v20.4s + + add v15.4s, v15.4s, v25.4s + mov x11, #5 + dup v20.4s, w11 + add v25.4s, v25.4s, v20.4s + + zip1 v20.4s, v0.4s, v1.4s + zip2 v21.4s, v0.4s, v1.4s + zip1 v22.4s, v2.4s, v3.4s + zip2 v23.4s, v2.4s, v3.4s + + zip1 v0.2d, v20.2d, v22.2d + zip2 v1.2d, v20.2d, v22.2d + zip1 v2.2d, v21.2d, v23.2d + zip2 v3.2d, v21.2d, v23.2d + + zip1 v20.4s, v5.4s, v6.4s + zip2 v21.4s, v5.4s, v6.4s + zip1 v22.4s, v7.4s, v8.4s + zip2 v23.4s, v7.4s, v8.4s + + zip1 v5.2d, v20.2d, v22.2d + zip2 v6.2d, v20.2d, v22.2d + zip1 v7.2d, v21.2d, v23.2d + zip2 v8.2d, v21.2d, v23.2d + + zip1 v20.4s, v10.4s, v11.4s + zip2 v21.4s, v10.4s, v11.4s + zip1 v22.4s, v12.4s, v13.4s + zip2 v23.4s, v12.4s, v13.4s + + zip1 v10.2d, v20.2d, v22.2d + zip2 v11.2d, v20.2d, v22.2d + zip1 v12.2d, v21.2d, v23.2d + zip2 v13.2d, v21.2d, v23.2d + + zip1 v20.4s, v15.4s, v16.4s + zip2 v21.4s, v15.4s, v16.4s + zip1 v22.4s, 
v17.4s, v18.4s + zip2 v23.4s, v17.4s, v18.4s + + zip1 v15.2d, v20.2d, v22.2d + zip2 v16.2d, v20.2d, v22.2d + zip1 v17.2d, v21.2d, v23.2d + zip2 v18.2d, v21.2d, v23.2d + + add v0.4s, v0.4s, v24.4s + add v5.4s, v5.4s, v28.4s + add v10.4s, v10.4s, v29.4s + add v15.4s, v15.4s, v30.4s + + add v1.4s, v1.4s, v24.4s + add v6.4s, v6.4s, v28.4s + add v11.4s, v11.4s, v29.4s + add v16.4s, v16.4s, v30.4s + + add v2.4s, v2.4s, v24.4s + add v7.4s, v7.4s, v28.4s + add v12.4s, v12.4s, v29.4s + add v17.4s, v17.4s, v30.4s + + add v3.4s, v3.4s, v24.4s + add v8.4s, v8.4s, v28.4s + add v13.4s, v13.4s, v29.4s + add v18.4s, v18.4s, v30.4s + + add v4.4s, v4.4s, v24.4s + add v9.4s, v9.4s, v28.4s + add v14.4s, v14.4s, v29.4s + add v19.4s, v19.4s, v30.4s + + // We can always safely store 192 bytes + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v0.16b + eor v21.16b, v21.16b, v5.16b + eor v22.16b, v22.16b, v10.16b + eor v23.16b, v23.16b, v15.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v1.16b + eor v21.16b, v21.16b, v6.16b + eor v22.16b, v22.16b, v11.16b + eor v23.16b, v23.16b, v16.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v2.16b + eor v21.16b, v21.16b, v7.16b + eor v22.16b, v22.16b, v12.16b + eor v23.16b, v23.16b, v17.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + sub x2, x2, #192 + + mov v0.16b, v3.16b + mov v5.16b, v8.16b + mov v10.16b, v13.16b + mov v15.16b, v18.16b + + cmp x2, #64 + b.lt Lopen_tail_64_store + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v3.16b + eor v21.16b, v21.16b, v8.16b + eor v22.16b, v22.16b, v13.16b + eor v23.16b, v23.16b, v18.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + sub x2, x2, #64 + + mov v0.16b, v4.16b + mov v5.16b, v9.16b + mov v10.16b, v14.16b + mov v15.16b, v19.16b + + cmp x2, #64 + b.lt Lopen_tail_64_store + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v4.16b + eor v21.16b, v21.16b, v9.16b + eor v22.16b, v22.16b, v14.16b + eor v23.16b, v23.16b, v19.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + sub x2, x2, #64 + b Lopen_main_loop + +Lopen_tail: + + cbz x2, Lopen_finalize + + lsr x4, x2, #4 // How many whole blocks we have to hash + + cmp x2, #64 + b.le Lopen_tail_64 + cmp x2, #128 + b.le Lopen_tail_128 + +Lopen_tail_192: + // We need three more blocks + mov v0.16b, v24.16b + mov v1.16b, v24.16b + mov v2.16b, v24.16b + mov v5.16b, v28.16b + mov v6.16b, v28.16b + mov v7.16b, v28.16b + mov v10.16b, v29.16b + mov v11.16b, v29.16b + mov v12.16b, v29.16b + mov v15.16b, v30.16b + mov v16.16b, v30.16b + mov v17.16b, v30.16b + eor v23.16b, v23.16b, v23.16b + eor v21.16b, v21.16b, v21.16b + ins v23.s[0], v25.s[0] + ins v21.d[0], x15 + + add v22.4s, v23.4s, v21.4s + add v21.4s, v22.4s, v21.4s + + add v15.4s, v15.4s, v21.4s + add v16.4s, v16.4s, v23.4s + add v17.4s, v17.4s, v22.4s + + mov x7, #10 + subs x6, x7, x4 // itr1 can be negative if we have more than 160 bytes to hash + csel x7, x7, x4, le // if itr1 is zero or less, itr2 should be 10 to indicate all 10 rounds are hashing + sub x4, x4, x7 + + cbz x7, Lopen_tail_192_rounds_no_hash + +Lopen_tail_192_rounds: + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul 
x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most +Lopen_tail_192_rounds_no_hash: + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v5.16b, v5.16b, v10.16b + eor v6.16b, v6.16b, v11.16b + eor v7.16b, v7.16b, v12.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + ushr v5.4s, v6.4s, #20 + sli v5.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v20.16b, v20.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v6.16b, v6.16b, v12.16b + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v5.4s, #25 + sli v6.4s, v5.4s, #7 + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + + ext v5.16b, v5.16b, v5.16b, #4 + ext v6.16b, v6.16b, v6.16b, #4 + ext v7.16b, v7.16b, v7.16b, #4 + + ext v10.16b, v10.16b, v10.16b, #8 + ext v11.16b, v11.16b, v11.16b, #8 + ext v12.16b, v12.16b, v12.16b, #8 + + ext v15.16b, v15.16b, v15.16b, #12 + ext v16.16b, v16.16b, v16.16b, #12 + ext v17.16b, v17.16b, v17.16b, #12 + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v5.16b, v5.16b, v10.16b + eor v6.16b, v6.16b, v11.16b + eor v7.16b, v7.16b, v12.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + ushr v5.4s, v6.4s, #20 + sli v5.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v20.16b, v20.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v6.16b, v6.16b, v12.16b + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v5.4s, #25 + sli v6.4s, v5.4s, #7 + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + + ext v5.16b, v5.16b, v5.16b, #12 + ext v6.16b, v6.16b, v6.16b, #12 + ext v7.16b, v7.16b, v7.16b, #12 + + ext v10.16b, v10.16b, v10.16b, #8 + ext v11.16b, v11.16b, v11.16b, #8 + ext v12.16b, v12.16b, v12.16b, #8 + + ext v15.16b, v15.16b, v15.16b, #4 + ext v16.16b, v16.16b, v16.16b, #4 + ext v17.16b, v17.16b, v17.16b, #4 + subs x7, x7, #1 + b.gt Lopen_tail_192_rounds + subs x6, x6, #1 
+ b.ge Lopen_tail_192_rounds_no_hash + + // We hashed 160 bytes at most, may still have 32 bytes left +Lopen_tail_192_hash: + cbz x4, Lopen_tail_192_hash_done + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + sub x4, x4, #1 + b Lopen_tail_192_hash + +Lopen_tail_192_hash_done: + + add v0.4s, v0.4s, v24.4s + add v1.4s, v1.4s, v24.4s + add v2.4s, v2.4s, v24.4s + add v5.4s, v5.4s, v28.4s + add v6.4s, v6.4s, v28.4s + add v7.4s, v7.4s, v28.4s + add v10.4s, v10.4s, v29.4s + add v11.4s, v11.4s, v29.4s + add v12.4s, v12.4s, v29.4s + add v15.4s, v15.4s, v30.4s + add v16.4s, v16.4s, v30.4s + add v17.4s, v17.4s, v30.4s + + add v15.4s, v15.4s, v21.4s + add v16.4s, v16.4s, v23.4s + add v17.4s, v17.4s, v22.4s + + ld1 {v20.16b - v23.16b}, [x1], #64 + + eor v20.16b, v20.16b, v1.16b + eor v21.16b, v21.16b, v6.16b + eor v22.16b, v22.16b, v11.16b + eor v23.16b, v23.16b, v16.16b + + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + + eor v20.16b, v20.16b, v2.16b + eor v21.16b, v21.16b, v7.16b + eor v22.16b, v22.16b, v12.16b + eor v23.16b, v23.16b, v17.16b + + st1 {v20.16b - v23.16b}, [x0], #64 + + sub x2, x2, #128 + b Lopen_tail_64_store + +Lopen_tail_128: + // We need two more blocks + mov v0.16b, v24.16b + mov v1.16b, v24.16b + mov v5.16b, v28.16b + mov v6.16b, v28.16b + mov v10.16b, v29.16b + mov v11.16b, v29.16b + mov v15.16b, v30.16b + mov v16.16b, v30.16b + eor v23.16b, v23.16b, v23.16b + eor v22.16b, v22.16b, v22.16b + ins v23.s[0], v25.s[0] + ins v22.d[0], x15 + add v22.4s, v22.4s, v23.4s + + add v15.4s, v15.4s, v22.4s + add v16.4s, v16.4s, v23.4s + + mov x6, #10 + sub x6, x6, x4 + +Lopen_tail_128_rounds: + add v0.4s, v0.4s, v5.4s + eor v15.16b, v15.16b, v0.16b + rev32 v15.8h, v15.8h + + add v10.4s, v10.4s, v15.4s + eor v5.16b, v5.16b, v10.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + add v0.4s, v0.4s, v20.4s + eor v15.16b, v15.16b, v0.16b + tbl v15.16b, {v15.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + eor v20.16b, v20.16b, v10.16b + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + ext v5.16b, v5.16b, v5.16b, #4 + ext v10.16b, v10.16b, v10.16b, #8 + ext v15.16b, v15.16b, v15.16b, #12 + add v1.4s, v1.4s, v6.4s + eor v16.16b, v16.16b, v1.16b + rev32 v16.8h, v16.8h + + add v11.4s, v11.4s, v16.4s + eor v6.16b, v6.16b, v11.16b + ushr v20.4s, v6.4s, #20 + sli v20.4s, v6.4s, #12 + add v1.4s, v1.4s, v20.4s + eor v16.16b, v16.16b, v1.16b + tbl v16.16b, {v16.16b}, v26.16b + + add v11.4s, v11.4s, v16.4s + eor v20.16b, v20.16b, v11.16b + ushr v6.4s, v20.4s, #25 + sli v6.4s, v20.4s, #7 + ext v6.16b, v6.16b, v6.16b, #4 + ext v11.16b, v11.16b, v11.16b, #8 + ext v16.16b, v16.16b, v16.16b, #12 + add v0.4s, v0.4s, v5.4s + eor v15.16b, v15.16b, v0.16b + rev32 v15.8h, v15.8h + + add v10.4s, v10.4s, 
v15.4s + eor v5.16b, v5.16b, v10.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + add v0.4s, v0.4s, v20.4s + eor v15.16b, v15.16b, v0.16b + tbl v15.16b, {v15.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + eor v20.16b, v20.16b, v10.16b + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + ext v5.16b, v5.16b, v5.16b, #12 + ext v10.16b, v10.16b, v10.16b, #8 + ext v15.16b, v15.16b, v15.16b, #4 + add v1.4s, v1.4s, v6.4s + eor v16.16b, v16.16b, v1.16b + rev32 v16.8h, v16.8h + + add v11.4s, v11.4s, v16.4s + eor v6.16b, v6.16b, v11.16b + ushr v20.4s, v6.4s, #20 + sli v20.4s, v6.4s, #12 + add v1.4s, v1.4s, v20.4s + eor v16.16b, v16.16b, v1.16b + tbl v16.16b, {v16.16b}, v26.16b + + add v11.4s, v11.4s, v16.4s + eor v20.16b, v20.16b, v11.16b + ushr v6.4s, v20.4s, #25 + sli v6.4s, v20.4s, #7 + ext v6.16b, v6.16b, v6.16b, #12 + ext v11.16b, v11.16b, v11.16b, #8 + ext v16.16b, v16.16b, v16.16b, #4 + subs x6, x6, #1 + b.gt Lopen_tail_128_rounds + cbz x4, Lopen_tail_128_rounds_done + subs x4, x4, #1 + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + b Lopen_tail_128_rounds + +Lopen_tail_128_rounds_done: + add v0.4s, v0.4s, v24.4s + add v1.4s, v1.4s, v24.4s + add v5.4s, v5.4s, v28.4s + add v6.4s, v6.4s, v28.4s + add v10.4s, v10.4s, v29.4s + add v11.4s, v11.4s, v29.4s + add v15.4s, v15.4s, v30.4s + add v16.4s, v16.4s, v30.4s + add v15.4s, v15.4s, v22.4s + add v16.4s, v16.4s, v23.4s + + ld1 {v20.16b - v23.16b}, [x1], #64 + + eor v20.16b, v20.16b, v1.16b + eor v21.16b, v21.16b, v6.16b + eor v22.16b, v22.16b, v11.16b + eor v23.16b, v23.16b, v16.16b + + st1 {v20.16b - v23.16b}, [x0], #64 + sub x2, x2, #64 + + b Lopen_tail_64_store + +Lopen_tail_64: + // We just need a single block + mov v0.16b, v24.16b + mov v5.16b, v28.16b + mov v10.16b, v29.16b + mov v15.16b, v30.16b + eor v23.16b, v23.16b, v23.16b + ins v23.s[0], v25.s[0] + add v15.4s, v15.4s, v23.4s + + mov x6, #10 + sub x6, x6, x4 + +Lopen_tail_64_rounds: + add v0.4s, v0.4s, v5.4s + eor v15.16b, v15.16b, v0.16b + rev32 v15.8h, v15.8h + + add v10.4s, v10.4s, v15.4s + eor v5.16b, v5.16b, v10.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + add v0.4s, v0.4s, v20.4s + eor v15.16b, v15.16b, v0.16b + tbl v15.16b, {v15.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + eor v20.16b, v20.16b, v10.16b + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + ext v5.16b, v5.16b, v5.16b, #4 + ext v10.16b, v10.16b, v10.16b, #8 + ext v15.16b, v15.16b, v15.16b, #12 + add v0.4s, v0.4s, v5.4s + eor v15.16b, v15.16b, v0.16b + rev32 v15.8h, v15.8h + + add v10.4s, v10.4s, v15.4s + eor v5.16b, v5.16b, v10.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + add v0.4s, v0.4s, v20.4s + eor v15.16b, v15.16b, v0.16b + tbl v15.16b, {v15.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + eor v20.16b, 
v20.16b, v10.16b + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + ext v5.16b, v5.16b, v5.16b, #12 + ext v10.16b, v10.16b, v10.16b, #8 + ext v15.16b, v15.16b, v15.16b, #4 + subs x6, x6, #1 + b.gt Lopen_tail_64_rounds + cbz x4, Lopen_tail_64_rounds_done + subs x4, x4, #1 + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + b Lopen_tail_64_rounds + +Lopen_tail_64_rounds_done: + add v0.4s, v0.4s, v24.4s + add v5.4s, v5.4s, v28.4s + add v10.4s, v10.4s, v29.4s + add v15.4s, v15.4s, v30.4s + add v15.4s, v15.4s, v23.4s + +Lopen_tail_64_store: + cmp x2, #16 + b.lt Lopen_tail_16 + + ld1 {v20.16b}, [x1], #16 + eor v20.16b, v20.16b, v0.16b + st1 {v20.16b}, [x0], #16 + mov v0.16b, v5.16b + mov v5.16b, v10.16b + mov v10.16b, v15.16b + sub x2, x2, #16 + b Lopen_tail_64_store + +Lopen_tail_16: + // Here we handle the last [0,16) bytes that require a padded block + cbz x2, Lopen_finalize + + eor v20.16b, v20.16b, v20.16b // Use T0 to load the ciphertext + eor v21.16b, v21.16b, v21.16b // Use T1 to generate an AND mask + not v22.16b, v20.16b + + add x7, x1, x2 + mov x6, x2 + +Lopen_tail_16_compose: + ext v20.16b, v20.16b, v20.16b, #15 + ldrb w11, [x7, #-1]! 
+ mov v20.b[0], w11 + ext v21.16b, v22.16b, v21.16b, #15 + subs x2, x2, #1 + b.gt Lopen_tail_16_compose + + and v20.16b, v20.16b, v21.16b + // Hash in the final padded block + mov x11, v20.d[0] + mov x12, v20.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + eor v20.16b, v20.16b, v0.16b + +Lopen_tail_16_store: + umov w11, v20.b[0] + strb w11, [x0], #1 + ext v20.16b, v20.16b, v20.16b, #1 + subs x6, x6, #1 + b.gt Lopen_tail_16_store + +Lopen_finalize: + mov x11, v31.d[0] + mov x12, v31.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + // Final reduction step + sub x12, xzr, x15 + orr x13, xzr, #3 + subs x11, x8, #-5 + sbcs x12, x9, x12 + sbcs x13, x10, x13 + csel x8, x11, x8, cs + csel x9, x12, x9, cs + csel x10, x13, x10, cs + mov x11, v27.d[0] + mov x12, v27.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + + stp x8, x9, [x5] + + ldp d8, d9, [sp, #16] + ldp d10, d11, [sp, #32] + ldp d12, d13, [sp, #48] + ldp d14, d15, [sp, #64] +.cfi_restore b15 +.cfi_restore b14 +.cfi_restore b13 +.cfi_restore b12 +.cfi_restore b11 +.cfi_restore b10 +.cfi_restore b9 +.cfi_restore b8 + ldp x29, x30, [sp], 80 +.cfi_restore w29 +.cfi_restore w30 +.cfi_def_cfa_offset 0 + AARCH64_VALIDATE_LINK_REGISTER + ret + +Lopen_128: + // On some architectures preparing 5 blocks for small buffers is wasteful + eor v25.16b, v25.16b, v25.16b + mov x11, #1 + mov v25.s[0], w11 + mov v0.16b, v24.16b + mov v1.16b, v24.16b + mov v2.16b, v24.16b + mov v5.16b, v28.16b + mov v6.16b, v28.16b + mov v7.16b, v28.16b + mov v10.16b, v29.16b + mov v11.16b, v29.16b + mov v12.16b, v29.16b + mov v17.16b, v30.16b + add v15.4s, v17.4s, v25.4s + add v16.4s, v15.4s, v25.4s + + mov x6, #10 + +Lopen_128_rounds: + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add 
v12.4s, v12.4s, v17.4s + eor v5.16b, v5.16b, v10.16b + eor v6.16b, v6.16b, v11.16b + eor v7.16b, v7.16b, v12.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + ushr v5.4s, v6.4s, #20 + sli v5.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v20.16b, v20.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v6.16b, v6.16b, v12.16b + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v5.4s, #25 + sli v6.4s, v5.4s, #7 + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + + ext v5.16b, v5.16b, v5.16b, #4 + ext v6.16b, v6.16b, v6.16b, #4 + ext v7.16b, v7.16b, v7.16b, #4 + + ext v10.16b, v10.16b, v10.16b, #8 + ext v11.16b, v11.16b, v11.16b, #8 + ext v12.16b, v12.16b, v12.16b, #8 + + ext v15.16b, v15.16b, v15.16b, #12 + ext v16.16b, v16.16b, v16.16b, #12 + ext v17.16b, v17.16b, v17.16b, #12 + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v5.16b, v5.16b, v10.16b + eor v6.16b, v6.16b, v11.16b + eor v7.16b, v7.16b, v12.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + ushr v5.4s, v6.4s, #20 + sli v5.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v20.16b, v20.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v6.16b, v6.16b, v12.16b + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v5.4s, #25 + sli v6.4s, v5.4s, #7 + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + + ext v5.16b, v5.16b, v5.16b, #12 + ext v6.16b, v6.16b, v6.16b, #12 + ext v7.16b, v7.16b, v7.16b, #12 + + ext v10.16b, v10.16b, v10.16b, #8 + ext v11.16b, v11.16b, v11.16b, #8 + ext v12.16b, v12.16b, v12.16b, #8 + + ext v15.16b, v15.16b, v15.16b, #4 + ext v16.16b, v16.16b, v16.16b, #4 + ext v17.16b, v17.16b, v17.16b, #4 + subs x6, x6, #1 + b.hi Lopen_128_rounds + + add v0.4s, v0.4s, v24.4s + add v1.4s, v1.4s, v24.4s + add v2.4s, v2.4s, v24.4s + + add v5.4s, v5.4s, v28.4s + add v6.4s, v6.4s, v28.4s + add v7.4s, v7.4s, v28.4s + + add v10.4s, v10.4s, v29.4s + add v11.4s, v11.4s, v29.4s + + add v30.4s, v30.4s, v25.4s + add v15.4s, v15.4s, v30.4s + add v30.4s, v30.4s, v25.4s + add v16.4s, v16.4s, v30.4s + + and v2.16b, v2.16b, v27.16b + mov x16, v2.d[0] // Move the R key to GPRs + mov x17, v2.d[1] + mov v27.16b, v7.16b // Store the S key + + bl Lpoly_hash_ad_internal + +Lopen_128_store: + cmp x2, #64 + b.lt Lopen_128_store_64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + + mov x11, v20.d[0] + mov x12, v20.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 
+ adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + mov x11, v21.d[0] + mov x12, v21.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + mov x11, v22.d[0] + mov x12, v22.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + mov x11, v23.d[0] + mov x12, v23.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + + eor v20.16b, v20.16b, v0.16b + eor v21.16b, v21.16b, v5.16b + eor v22.16b, v22.16b, v10.16b + eor v23.16b, v23.16b, v15.16b + + st1 {v20.16b - v23.16b}, [x0], #64 + + sub x2, x2, #64 + + mov v0.16b, v1.16b + mov v5.16b, v6.16b + mov v10.16b, v11.16b + mov v15.16b, v16.16b + +Lopen_128_store_64: + + lsr x4, x2, #4 + mov x3, x1 + 
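+	// The loop below is a plain Poly1305 absorb of each remaining whole 16-byte
+	// ciphertext block; informally (an illustrative formula, not generator output):
+	//
+	//   acc = ((acc + block + 2^128) * r) mod (2^130 - 5)
+	//
+	// with the clamped r key in x16:x17, the accumulator in x8:x9:x10, and the
+	// partial reduction done by folding the bits above 2^130 back in as *5
+	// (computed as *4 + *1).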
+Lopen_128_hash_64: + cbz x4, Lopen_tail_64_store + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + sub x4, x4, #1 + b Lopen_128_hash_64 +.cfi_endproc + +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__) diff --git a/ring-0.17.14/pregenerated/chacha20_poly1305_armv8-linux64.S b/ring-0.17.14/pregenerated/chacha20_poly1305_armv8-linux64.S new file mode 100644 index 0000000000..611f4366e0 --- /dev/null +++ b/ring-0.17.14/pregenerated/chacha20_poly1305_armv8-linux64.S @@ -0,0 +1,3008 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__) +.section .rodata + +.align 7 +.Lchacha20_consts: +.byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k' +.Linc: +.long 1,2,3,4 +.Lrol8: +.byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 +.Lclamp: +.quad 0x0FFFFFFC0FFFFFFF, 0x0FFFFFFC0FFFFFFC + +.text + +.type .Lpoly_hash_ad_internal,%function +.align 6 +.Lpoly_hash_ad_internal: +.cfi_startproc + cbnz x4, .Lpoly_hash_intro + ret + +.Lpoly_hash_intro: + cmp x4, #16 + b.lt .Lpoly_hash_ad_tail + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + sub x4, x4, #16 + b .Lpoly_hash_ad_internal + +.Lpoly_hash_ad_tail: + cbz x4, .Lpoly_hash_ad_ret + + eor v20.16b, v20.16b, v20.16b // Use T0 to load the AAD + sub x4, x4, #1 + +.Lpoly_hash_tail_16_compose: + ext v20.16b, v20.16b, v20.16b, #15 + ldrb w11, [x3, x4] + mov v20.b[0], w11 + subs x4, x4, #1 + b.ge .Lpoly_hash_tail_16_compose + mov x11, v20.d[0] + mov x12, v20.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, 
x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + +.Lpoly_hash_ad_ret: + ret +.cfi_endproc +.size .Lpoly_hash_ad_internal, .-.Lpoly_hash_ad_internal + +///////////////////////////////// +// +// void chacha20_poly1305_seal(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, union open_data *seal_data); +// +.globl chacha20_poly1305_seal +.hidden chacha20_poly1305_seal +.type chacha20_poly1305_seal,%function +.align 6 +chacha20_poly1305_seal: + AARCH64_SIGN_LINK_REGISTER +.cfi_startproc + stp x29, x30, [sp, #-80]! +.cfi_def_cfa_offset 80 +.cfi_offset w30, -72 +.cfi_offset w29, -80 + mov x29, sp + // We probably could do .cfi_def_cfa w29, 80 at this point, but since + // we don't actually use the frame pointer like that, it's probably not + // worth bothering. + stp d8, d9, [sp, #16] + stp d10, d11, [sp, #32] + stp d12, d13, [sp, #48] + stp d14, d15, [sp, #64] +.cfi_offset b15, -8 +.cfi_offset b14, -16 +.cfi_offset b13, -24 +.cfi_offset b12, -32 +.cfi_offset b11, -40 +.cfi_offset b10, -48 +.cfi_offset b9, -56 +.cfi_offset b8, -64 + + adrp x11, .Lchacha20_consts + add x11, x11, :lo12:.Lchacha20_consts + + ld1 {v24.16b - v27.16b}, [x11] // .Load the CONSTS, INC, ROL8 and CLAMP values + ld1 {v28.16b - v30.16b}, [x5] + + mov x15, #1 // Prepare the Poly1305 state + mov x8, #0 + mov x9, #0 + mov x10, #0 + + ldr x12, [x5, #56] // The total cipher text length includes extra_in_len + add x12, x12, x2 + mov v31.d[0], x4 // Store the input and aad lengths + mov v31.d[1], x12 + + cmp x2, #128 + b.le .Lseal_128 // Optimization for smaller buffers + + // Initially we prepare 5 ChaCha20 blocks. Four to encrypt up to 4 blocks (256 bytes) of plaintext, + // and one for the Poly1305 R and S keys. The first four blocks (A0-A3..D0-D3) are computed vertically, + // the fifth block (A4-D4) horizontally. 
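+	// Roughly, assuming the standard ChaCha20 state layout (an illustrative sketch,
+	// not generator output): the ld4r loads below transpose the state so that each
+	// of v0-v3 / v5-v8 / v10-v13 / v15-v18 holds one 32-bit state word across the
+	// four "vertical" blocks (whose counters become ctr+1..ctr+4 via the .Linc add),
+	// while v4 / v9 / v14 / v19 hold the ordinary row-wise state of the fifth block
+	// with counter ctr+0. The first 32 bytes of that fifth block's keystream supply
+	// the Poly1305 key pair, approximately:
+	//
+	//   r = keystream[0..16] & .Lclamp   // ends up in x16:x17
+	//   s = keystream[16..32]            // ends up in v27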
+ ld4r {v0.4s,v1.4s,v2.4s,v3.4s}, [x11] + mov v4.16b, v24.16b + + ld4r {v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16 + mov v9.16b, v28.16b + + ld4r {v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16 + mov v14.16b, v29.16b + + ld4r {v15.4s,v16.4s,v17.4s,v18.4s}, [x5] + add v15.4s, v15.4s, v25.4s + mov v19.16b, v30.16b + + sub x5, x5, #32 + + mov x6, #10 + +.align 5 +.Lseal_init_rounds: + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + add v3.4s, v3.4s, v8.4s + add v4.4s, v4.4s, v9.4s + + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + eor v18.16b, v18.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + rev32 v18.8h, v18.8h + rev32 v19.8h, v19.8h + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + add v13.4s, v13.4s, v18.4s + add v14.4s, v14.4s, v19.4s + + eor v5.16b, v5.16b, v10.16b + eor v6.16b, v6.16b, v11.16b + eor v7.16b, v7.16b, v12.16b + eor v8.16b, v8.16b, v13.16b + eor v9.16b, v9.16b, v14.16b + + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + ushr v5.4s, v6.4s, #20 + sli v5.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + ushr v7.4s, v8.4s, #20 + sli v7.4s, v8.4s, #12 + ushr v8.4s, v9.4s, #20 + sli v8.4s, v9.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + add v3.4s, v3.4s, v7.4s + add v4.4s, v4.4s, v8.4s + + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + eor v18.16b, v18.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + tbl v18.16b, {v18.16b}, v26.16b + tbl v19.16b, {v19.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + add v13.4s, v13.4s, v18.4s + add v14.4s, v14.4s, v19.4s + + eor v20.16b, v20.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v6.16b, v6.16b, v12.16b + eor v7.16b, v7.16b, v13.16b + eor v8.16b, v8.16b, v14.16b + + ushr v9.4s, v8.4s, #25 + sli v9.4s, v8.4s, #7 + ushr v8.4s, v7.4s, #25 + sli v8.4s, v7.4s, #7 + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v5.4s, #25 + sli v6.4s, v5.4s, #7 + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + + ext v9.16b, v9.16b, v9.16b, #4 + ext v14.16b, v14.16b, v14.16b, #8 + ext v19.16b, v19.16b, v19.16b, #12 + add v0.4s, v0.4s, v6.4s + add v1.4s, v1.4s, v7.4s + add v2.4s, v2.4s, v8.4s + add v3.4s, v3.4s, v5.4s + add v4.4s, v4.4s, v9.4s + + eor v18.16b, v18.16b, v0.16b + eor v15.16b, v15.16b, v1.16b + eor v16.16b, v16.16b, v2.16b + eor v17.16b, v17.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + rev32 v18.8h, v18.8h + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + rev32 v19.8h, v19.8h + + add v12.4s, v12.4s, v18.4s + add v13.4s, v13.4s, v15.4s + add v10.4s, v10.4s, v16.4s + add v11.4s, v11.4s, v17.4s + add v14.4s, v14.4s, v19.4s + + eor v6.16b, v6.16b, v12.16b + eor v7.16b, v7.16b, v13.16b + eor v8.16b, v8.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v9.16b, v9.16b, v14.16b + + ushr v20.4s, v6.4s, #20 + sli v20.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + ushr v7.4s, v8.4s, #20 + sli v7.4s, v8.4s, #12 + ushr v8.4s, v5.4s, #20 + sli v8.4s, v5.4s, #12 + ushr v5.4s, v9.4s, #20 + sli v5.4s, v9.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + add v3.4s, v3.4s, v8.4s + add v4.4s, v4.4s, v5.4s + + eor v18.16b, v18.16b, v0.16b 
+ eor v15.16b, v15.16b, v1.16b + eor v16.16b, v16.16b, v2.16b + eor v17.16b, v17.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + tbl v18.16b, {v18.16b}, v26.16b + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + tbl v19.16b, {v19.16b}, v26.16b + + add v12.4s, v12.4s, v18.4s + add v13.4s, v13.4s, v15.4s + add v10.4s, v10.4s, v16.4s + add v11.4s, v11.4s, v17.4s + add v14.4s, v14.4s, v19.4s + + eor v20.16b, v20.16b, v12.16b + eor v6.16b, v6.16b, v13.16b + eor v7.16b, v7.16b, v10.16b + eor v8.16b, v8.16b, v11.16b + eor v5.16b, v5.16b, v14.16b + + ushr v9.4s, v5.4s, #25 + sli v9.4s, v5.4s, #7 + ushr v5.4s, v8.4s, #25 + sli v5.4s, v8.4s, #7 + ushr v8.4s, v7.4s, #25 + sli v8.4s, v7.4s, #7 + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v20.4s, #25 + sli v6.4s, v20.4s, #7 + + ext v9.16b, v9.16b, v9.16b, #12 + ext v14.16b, v14.16b, v14.16b, #8 + ext v19.16b, v19.16b, v19.16b, #4 + subs x6, x6, #1 + b.hi .Lseal_init_rounds + + add v15.4s, v15.4s, v25.4s + mov x11, #4 + dup v20.4s, w11 + add v25.4s, v25.4s, v20.4s + + zip1 v20.4s, v0.4s, v1.4s + zip2 v21.4s, v0.4s, v1.4s + zip1 v22.4s, v2.4s, v3.4s + zip2 v23.4s, v2.4s, v3.4s + + zip1 v0.2d, v20.2d, v22.2d + zip2 v1.2d, v20.2d, v22.2d + zip1 v2.2d, v21.2d, v23.2d + zip2 v3.2d, v21.2d, v23.2d + + zip1 v20.4s, v5.4s, v6.4s + zip2 v21.4s, v5.4s, v6.4s + zip1 v22.4s, v7.4s, v8.4s + zip2 v23.4s, v7.4s, v8.4s + + zip1 v5.2d, v20.2d, v22.2d + zip2 v6.2d, v20.2d, v22.2d + zip1 v7.2d, v21.2d, v23.2d + zip2 v8.2d, v21.2d, v23.2d + + zip1 v20.4s, v10.4s, v11.4s + zip2 v21.4s, v10.4s, v11.4s + zip1 v22.4s, v12.4s, v13.4s + zip2 v23.4s, v12.4s, v13.4s + + zip1 v10.2d, v20.2d, v22.2d + zip2 v11.2d, v20.2d, v22.2d + zip1 v12.2d, v21.2d, v23.2d + zip2 v13.2d, v21.2d, v23.2d + + zip1 v20.4s, v15.4s, v16.4s + zip2 v21.4s, v15.4s, v16.4s + zip1 v22.4s, v17.4s, v18.4s + zip2 v23.4s, v17.4s, v18.4s + + zip1 v15.2d, v20.2d, v22.2d + zip2 v16.2d, v20.2d, v22.2d + zip1 v17.2d, v21.2d, v23.2d + zip2 v18.2d, v21.2d, v23.2d + + add v4.4s, v4.4s, v24.4s + add v9.4s, v9.4s, v28.4s + and v4.16b, v4.16b, v27.16b + + add v0.4s, v0.4s, v24.4s + add v5.4s, v5.4s, v28.4s + add v10.4s, v10.4s, v29.4s + add v15.4s, v15.4s, v30.4s + + add v1.4s, v1.4s, v24.4s + add v6.4s, v6.4s, v28.4s + add v11.4s, v11.4s, v29.4s + add v16.4s, v16.4s, v30.4s + + add v2.4s, v2.4s, v24.4s + add v7.4s, v7.4s, v28.4s + add v12.4s, v12.4s, v29.4s + add v17.4s, v17.4s, v30.4s + + add v3.4s, v3.4s, v24.4s + add v8.4s, v8.4s, v28.4s + add v13.4s, v13.4s, v29.4s + add v18.4s, v18.4s, v30.4s + + mov x16, v4.d[0] // Move the R key to GPRs + mov x17, v4.d[1] + mov v27.16b, v9.16b // Store the S key + + bl .Lpoly_hash_ad_internal + + mov x3, x0 + cmp x2, #256 + b.le .Lseal_tail + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v0.16b + eor v21.16b, v21.16b, v5.16b + eor v22.16b, v22.16b, v10.16b + eor v23.16b, v23.16b, v15.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v1.16b + eor v21.16b, v21.16b, v6.16b + eor v22.16b, v22.16b, v11.16b + eor v23.16b, v23.16b, v16.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v2.16b + eor v21.16b, v21.16b, v7.16b + eor v22.16b, v22.16b, v12.16b + eor v23.16b, v23.16b, v17.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v3.16b + eor v21.16b, v21.16b, v8.16b + eor v22.16b, v22.16b, v13.16b + eor v23.16b, v23.16b, v18.16b + st1 {v20.16b - 
v23.16b}, [x0], #64 + + sub x2, x2, #256 + + mov x6, #4 // In the first run of the loop we need to hash 256 bytes, therefore we hash one block for the first 4 rounds + mov x7, #6 // and two blocks for the remaining 6, for a total of (1 * 4 + 2 * 6) * 16 = 256 + +.Lseal_main_loop: + adrp x11, .Lchacha20_consts + add x11, x11, :lo12:.Lchacha20_consts + + ld4r {v0.4s,v1.4s,v2.4s,v3.4s}, [x11] + mov v4.16b, v24.16b + + ld4r {v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16 + mov v9.16b, v28.16b + + ld4r {v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16 + mov v14.16b, v29.16b + + ld4r {v15.4s,v16.4s,v17.4s,v18.4s}, [x5] + add v15.4s, v15.4s, v25.4s + mov v19.16b, v30.16b + + eor v20.16b, v20.16b, v20.16b //zero + not v21.16b, v20.16b // -1 + sub v21.4s, v25.4s, v21.4s // Add +1 + ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter) + add v19.4s, v19.4s, v20.4s + + sub x5, x5, #32 +.align 5 +.Lseal_main_loop_rounds: + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + add v3.4s, v3.4s, v8.4s + add v4.4s, v4.4s, v9.4s + + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + eor v18.16b, v18.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + rev32 v18.8h, v18.8h + rev32 v19.8h, v19.8h + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + add v13.4s, v13.4s, v18.4s + add v14.4s, v14.4s, v19.4s + + eor v5.16b, v5.16b, v10.16b + eor v6.16b, v6.16b, v11.16b + eor v7.16b, v7.16b, v12.16b + eor v8.16b, v8.16b, v13.16b + eor v9.16b, v9.16b, v14.16b + + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + ushr v5.4s, v6.4s, #20 + sli v5.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + ushr v7.4s, v8.4s, #20 + sli v7.4s, v8.4s, #12 + ushr v8.4s, v9.4s, #20 + sli v8.4s, v9.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + add v3.4s, v3.4s, v7.4s + add v4.4s, v4.4s, v8.4s + + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + eor v18.16b, v18.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + tbl v18.16b, {v18.16b}, v26.16b + tbl v19.16b, {v19.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + add v13.4s, v13.4s, v18.4s + add v14.4s, v14.4s, v19.4s + + eor v20.16b, v20.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v6.16b, v6.16b, v12.16b + eor v7.16b, v7.16b, v13.16b + eor v8.16b, v8.16b, v14.16b + + ushr v9.4s, v8.4s, #25 + sli v9.4s, v8.4s, #7 + ushr v8.4s, v7.4s, #25 + sli v8.4s, v7.4s, #7 + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v5.4s, #25 + sli v6.4s, v5.4s, #7 + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + + ext v9.16b, v9.16b, v9.16b, #4 + ext v14.16b, v14.16b, v14.16b, #8 + ext v19.16b, v19.16b, v19.16b, #12 + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most 
(value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + add v0.4s, v0.4s, v6.4s + add v1.4s, v1.4s, v7.4s + add v2.4s, v2.4s, v8.4s + add v3.4s, v3.4s, v5.4s + add v4.4s, v4.4s, v9.4s + + eor v18.16b, v18.16b, v0.16b + eor v15.16b, v15.16b, v1.16b + eor v16.16b, v16.16b, v2.16b + eor v17.16b, v17.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + rev32 v18.8h, v18.8h + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + rev32 v19.8h, v19.8h + + add v12.4s, v12.4s, v18.4s + add v13.4s, v13.4s, v15.4s + add v10.4s, v10.4s, v16.4s + add v11.4s, v11.4s, v17.4s + add v14.4s, v14.4s, v19.4s + + eor v6.16b, v6.16b, v12.16b + eor v7.16b, v7.16b, v13.16b + eor v8.16b, v8.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v9.16b, v9.16b, v14.16b + + ushr v20.4s, v6.4s, #20 + sli v20.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + ushr v7.4s, v8.4s, #20 + sli v7.4s, v8.4s, #12 + ushr v8.4s, v5.4s, #20 + sli v8.4s, v5.4s, #12 + ushr v5.4s, v9.4s, #20 + sli v5.4s, v9.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + add v3.4s, v3.4s, v8.4s + add v4.4s, v4.4s, v5.4s + + eor v18.16b, v18.16b, v0.16b + eor v15.16b, v15.16b, v1.16b + eor v16.16b, v16.16b, v2.16b + eor v17.16b, v17.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + tbl v18.16b, {v18.16b}, v26.16b + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + tbl v19.16b, {v19.16b}, v26.16b + + add v12.4s, v12.4s, v18.4s + add v13.4s, v13.4s, v15.4s + add v10.4s, v10.4s, v16.4s + add v11.4s, v11.4s, v17.4s + add v14.4s, v14.4s, v19.4s + + eor v20.16b, v20.16b, v12.16b + eor v6.16b, v6.16b, v13.16b + eor v7.16b, v7.16b, v10.16b + eor v8.16b, v8.16b, v11.16b + eor v5.16b, v5.16b, v14.16b + + ushr v9.4s, v5.4s, #25 + sli v9.4s, v5.4s, #7 + ushr v5.4s, v8.4s, #25 + sli v5.4s, v8.4s, #7 + ushr v8.4s, v7.4s, #25 + sli v8.4s, v7.4s, #7 + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v20.4s, #25 + sli v6.4s, v20.4s, #7 + + ext v9.16b, v9.16b, v9.16b, #12 + ext v14.16b, v14.16b, v14.16b, #8 + ext v19.16b, v19.16b, v19.16b, #4 + subs x6, x6, #1 + b.ge .Lseal_main_loop_rounds + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + subs x7, x7, #1 + b.gt .Lseal_main_loop_rounds + + eor v20.16b, v20.16b, v20.16b //zero + not v21.16b, v20.16b // -1 + sub v21.4s, v25.4s, v21.4s // Add +1 + ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter) + add v19.4s, v19.4s, v20.4s + + add v15.4s, v15.4s, v25.4s + mov x11, #5 + dup v20.4s, w11 + 
add v25.4s, v25.4s, v20.4s + + zip1 v20.4s, v0.4s, v1.4s + zip2 v21.4s, v0.4s, v1.4s + zip1 v22.4s, v2.4s, v3.4s + zip2 v23.4s, v2.4s, v3.4s + + zip1 v0.2d, v20.2d, v22.2d + zip2 v1.2d, v20.2d, v22.2d + zip1 v2.2d, v21.2d, v23.2d + zip2 v3.2d, v21.2d, v23.2d + + zip1 v20.4s, v5.4s, v6.4s + zip2 v21.4s, v5.4s, v6.4s + zip1 v22.4s, v7.4s, v8.4s + zip2 v23.4s, v7.4s, v8.4s + + zip1 v5.2d, v20.2d, v22.2d + zip2 v6.2d, v20.2d, v22.2d + zip1 v7.2d, v21.2d, v23.2d + zip2 v8.2d, v21.2d, v23.2d + + zip1 v20.4s, v10.4s, v11.4s + zip2 v21.4s, v10.4s, v11.4s + zip1 v22.4s, v12.4s, v13.4s + zip2 v23.4s, v12.4s, v13.4s + + zip1 v10.2d, v20.2d, v22.2d + zip2 v11.2d, v20.2d, v22.2d + zip1 v12.2d, v21.2d, v23.2d + zip2 v13.2d, v21.2d, v23.2d + + zip1 v20.4s, v15.4s, v16.4s + zip2 v21.4s, v15.4s, v16.4s + zip1 v22.4s, v17.4s, v18.4s + zip2 v23.4s, v17.4s, v18.4s + + zip1 v15.2d, v20.2d, v22.2d + zip2 v16.2d, v20.2d, v22.2d + zip1 v17.2d, v21.2d, v23.2d + zip2 v18.2d, v21.2d, v23.2d + + add v0.4s, v0.4s, v24.4s + add v5.4s, v5.4s, v28.4s + add v10.4s, v10.4s, v29.4s + add v15.4s, v15.4s, v30.4s + + add v1.4s, v1.4s, v24.4s + add v6.4s, v6.4s, v28.4s + add v11.4s, v11.4s, v29.4s + add v16.4s, v16.4s, v30.4s + + add v2.4s, v2.4s, v24.4s + add v7.4s, v7.4s, v28.4s + add v12.4s, v12.4s, v29.4s + add v17.4s, v17.4s, v30.4s + + add v3.4s, v3.4s, v24.4s + add v8.4s, v8.4s, v28.4s + add v13.4s, v13.4s, v29.4s + add v18.4s, v18.4s, v30.4s + + add v4.4s, v4.4s, v24.4s + add v9.4s, v9.4s, v28.4s + add v14.4s, v14.4s, v29.4s + add v19.4s, v19.4s, v30.4s + + cmp x2, #320 + b.le .Lseal_tail + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v0.16b + eor v21.16b, v21.16b, v5.16b + eor v22.16b, v22.16b, v10.16b + eor v23.16b, v23.16b, v15.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v1.16b + eor v21.16b, v21.16b, v6.16b + eor v22.16b, v22.16b, v11.16b + eor v23.16b, v23.16b, v16.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v2.16b + eor v21.16b, v21.16b, v7.16b + eor v22.16b, v22.16b, v12.16b + eor v23.16b, v23.16b, v17.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v3.16b + eor v21.16b, v21.16b, v8.16b + eor v22.16b, v22.16b, v13.16b + eor v23.16b, v23.16b, v18.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v4.16b + eor v21.16b, v21.16b, v9.16b + eor v22.16b, v22.16b, v14.16b + eor v23.16b, v23.16b, v19.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + sub x2, x2, #320 + + mov x6, #0 + mov x7, #10 // For the remainder of the loop we always hash and encrypt 320 bytes per iteration + + b .Lseal_main_loop + +.Lseal_tail: + // This part of the function handles the storage and authentication of the last [0,320) bytes + // We assume A0-A4 ... D0-D4 hold at least inl (320 max) bytes of the stream data. 
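+ // The tail is handled in three stages: the loop below encrypts and hashes whole
+ // 64 byte blocks, .Lseal_tail_64 then processes whole 16 byte blocks, and
+ // .Lseal_tail_16 zero-pads the last [0,16) bytes (mixing in extra_in where present) before hashing.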
+ cmp x2, #64 + b.lt .Lseal_tail_64 + + // Store and authenticate 64B blocks per iteration + ld1 {v20.16b - v23.16b}, [x1], #64 + + eor v20.16b, v20.16b, v0.16b + eor v21.16b, v21.16b, v5.16b + eor v22.16b, v22.16b, v10.16b + eor v23.16b, v23.16b, v15.16b + mov x11, v20.d[0] + mov x12, v20.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + mov x11, v21.d[0] + mov x12, v21.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + mov x11, v22.d[0] + mov x12, v22.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + mov x11, v23.d[0] + mov x12, v23.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits 
and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + st1 {v20.16b - v23.16b}, [x0], #64 + sub x2, x2, #64 + + // Shift the state left by 64 bytes for the next iteration of the loop + mov v0.16b, v1.16b + mov v5.16b, v6.16b + mov v10.16b, v11.16b + mov v15.16b, v16.16b + + mov v1.16b, v2.16b + mov v6.16b, v7.16b + mov v11.16b, v12.16b + mov v16.16b, v17.16b + + mov v2.16b, v3.16b + mov v7.16b, v8.16b + mov v12.16b, v13.16b + mov v17.16b, v18.16b + + mov v3.16b, v4.16b + mov v8.16b, v9.16b + mov v13.16b, v14.16b + mov v18.16b, v19.16b + + b .Lseal_tail + +.Lseal_tail_64: + ldp x3, x4, [x5, #48] // extra_in_len and extra_in_ptr + + // Here we handle the last [0,64) bytes of plaintext + cmp x2, #16 + b.lt .Lseal_tail_16 + // Each iteration encrypt and authenticate a 16B block + ld1 {v20.16b}, [x1], #16 + eor v20.16b, v20.16b, v0.16b + mov x11, v20.d[0] + mov x12, v20.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + st1 {v20.16b}, [x0], #16 + + sub x2, x2, #16 + + // Shift the state left by 16 bytes for the next iteration of the loop + mov v0.16b, v5.16b + mov v5.16b, v10.16b + mov v10.16b, v15.16b + + b .Lseal_tail_64 + +.Lseal_tail_16: + // Here we handle the last [0,16) bytes of ciphertext that require a padded block + cbz x2, .Lseal_hash_extra + + eor v20.16b, v20.16b, v20.16b // Use T0 to load the plaintext/extra in + eor v21.16b, v21.16b, v21.16b // Use T1 to generate an AND mask that will only mask the ciphertext bytes + not v22.16b, v20.16b + + mov x6, x2 + add x1, x1, x2 + + cbz x4, .Lseal_tail_16_compose // No extra data to pad with, zero padding + + mov x7, #16 // We need to load some extra_in first for padding + sub x7, x7, x2 + cmp x4, x7 + csel x7, x4, x7, lt // .Load the minimum of extra_in_len and the amount needed to fill the register + mov x12, x7 + add x3, x3, x7 + sub x4, x4, x7 + +.Lseal_tail16_compose_extra_in: + ext v20.16b, v20.16b, v20.16b, #15 + ldrb w11, [x3, #-1]! + mov v20.b[0], w11 + subs x7, x7, #1 + b.gt .Lseal_tail16_compose_extra_in + + add x3, x3, x12 + +.Lseal_tail_16_compose: + ext v20.16b, v20.16b, v20.16b, #15 + ldrb w11, [x1, #-1]! 
+ mov v20.b[0], w11 + ext v21.16b, v22.16b, v21.16b, #15 + subs x2, x2, #1 + b.gt .Lseal_tail_16_compose + + and v0.16b, v0.16b, v21.16b + eor v20.16b, v20.16b, v0.16b + mov v21.16b, v20.16b + +.Lseal_tail_16_store: + umov w11, v20.b[0] + strb w11, [x0], #1 + ext v20.16b, v20.16b, v20.16b, #1 + subs x6, x6, #1 + b.gt .Lseal_tail_16_store + + // Hash in the final ct block concatenated with extra_in + mov x11, v21.d[0] + mov x12, v21.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + +.Lseal_hash_extra: + cbz x4, .Lseal_finalize + +.Lseal_hash_extra_loop: + cmp x4, #16 + b.lt .Lseal_hash_extra_tail + ld1 {v20.16b}, [x3], #16 + mov x11, v20.d[0] + mov x12, v20.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + sub x4, x4, #16 + b .Lseal_hash_extra_loop + +.Lseal_hash_extra_tail: + cbz x4, .Lseal_finalize + eor v20.16b, v20.16b, v20.16b // Use T0 to load the remaining extra ciphertext + add x3, x3, x4 + +.Lseal_hash_extra_load: + ext v20.16b, v20.16b, v20.16b, #15 + ldrb w11, [x3, #-1]! 
+ mov v20.b[0], w11 + subs x4, x4, #1 + b.gt .Lseal_hash_extra_load + + // Hash in the final padded extra_in blcok + mov x11, v20.d[0] + mov x12, v20.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + +.Lseal_finalize: + mov x11, v31.d[0] + mov x12, v31.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + // Final reduction step + sub x12, xzr, x15 + orr x13, xzr, #3 + subs x11, x8, #-5 + sbcs x12, x9, x12 + sbcs x13, x10, x13 + csel x8, x11, x8, cs + csel x9, x12, x9, cs + csel x10, x13, x10, cs + mov x11, v27.d[0] + mov x12, v27.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + + stp x8, x9, [x5] + + ldp d8, d9, [sp, #16] + ldp d10, d11, [sp, #32] + ldp d12, d13, [sp, #48] + ldp d14, d15, [sp, #64] +.cfi_restore b15 +.cfi_restore b14 +.cfi_restore b13 +.cfi_restore b12 +.cfi_restore b11 +.cfi_restore b10 +.cfi_restore b9 +.cfi_restore b8 + ldp x29, x30, [sp], 80 +.cfi_restore w29 +.cfi_restore w30 +.cfi_def_cfa_offset 0 + AARCH64_VALIDATE_LINK_REGISTER + ret + +.Lseal_128: + // On some architectures preparing 5 blocks for small buffers is wasteful + eor v25.16b, v25.16b, v25.16b + mov x11, #1 + mov v25.s[0], w11 + mov v0.16b, v24.16b + mov v1.16b, v24.16b + mov v2.16b, v24.16b + mov v5.16b, v28.16b + mov v6.16b, v28.16b + mov v7.16b, v28.16b + mov v10.16b, v29.16b + mov v11.16b, v29.16b + mov v12.16b, v29.16b + mov v17.16b, v30.16b + add v15.4s, v17.4s, v25.4s + add v16.4s, v15.4s, v25.4s + + mov x6, #10 + +.Lseal_128_rounds: + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v5.16b, v5.16b, v10.16b + eor v6.16b, v6.16b, v11.16b + eor v7.16b, v7.16b, v12.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + ushr v5.4s, v6.4s, #20 + sli v5.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + 
sli v6.4s, v7.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v20.16b, v20.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v6.16b, v6.16b, v12.16b + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v5.4s, #25 + sli v6.4s, v5.4s, #7 + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + + ext v5.16b, v5.16b, v5.16b, #4 + ext v6.16b, v6.16b, v6.16b, #4 + ext v7.16b, v7.16b, v7.16b, #4 + + ext v10.16b, v10.16b, v10.16b, #8 + ext v11.16b, v11.16b, v11.16b, #8 + ext v12.16b, v12.16b, v12.16b, #8 + + ext v15.16b, v15.16b, v15.16b, #12 + ext v16.16b, v16.16b, v16.16b, #12 + ext v17.16b, v17.16b, v17.16b, #12 + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v5.16b, v5.16b, v10.16b + eor v6.16b, v6.16b, v11.16b + eor v7.16b, v7.16b, v12.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + ushr v5.4s, v6.4s, #20 + sli v5.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v20.16b, v20.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v6.16b, v6.16b, v12.16b + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v5.4s, #25 + sli v6.4s, v5.4s, #7 + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + + ext v5.16b, v5.16b, v5.16b, #12 + ext v6.16b, v6.16b, v6.16b, #12 + ext v7.16b, v7.16b, v7.16b, #12 + + ext v10.16b, v10.16b, v10.16b, #8 + ext v11.16b, v11.16b, v11.16b, #8 + ext v12.16b, v12.16b, v12.16b, #8 + + ext v15.16b, v15.16b, v15.16b, #4 + ext v16.16b, v16.16b, v16.16b, #4 + ext v17.16b, v17.16b, v17.16b, #4 + subs x6, x6, #1 + b.hi .Lseal_128_rounds + + add v0.4s, v0.4s, v24.4s + add v1.4s, v1.4s, v24.4s + add v2.4s, v2.4s, v24.4s + + add v5.4s, v5.4s, v28.4s + add v6.4s, v6.4s, v28.4s + add v7.4s, v7.4s, v28.4s + + // Only the first 32 bytes of the third block (counter = 0) are needed, + // so skip updating v12 and v17. 
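+ // That block only feeds the Poly1305 key: v2 (after clamping) becomes R and v7 becomes S,
+ // while the blocks at counters 1 and 2 provide key stream for up to 128 bytes of data.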
+ add v10.4s, v10.4s, v29.4s + add v11.4s, v11.4s, v29.4s + + add v30.4s, v30.4s, v25.4s + add v15.4s, v15.4s, v30.4s + add v30.4s, v30.4s, v25.4s + add v16.4s, v16.4s, v30.4s + + and v2.16b, v2.16b, v27.16b + mov x16, v2.d[0] // Move the R key to GPRs + mov x17, v2.d[1] + mov v27.16b, v7.16b // Store the S key + + bl .Lpoly_hash_ad_internal + b .Lseal_tail +.cfi_endproc +.size chacha20_poly1305_seal,.-chacha20_poly1305_seal + +///////////////////////////////// +// +// void chacha20_poly1305_open(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, union open_data *aead_data); +// +.globl chacha20_poly1305_open +.hidden chacha20_poly1305_open +.type chacha20_poly1305_open,%function +.align 6 +chacha20_poly1305_open: + AARCH64_SIGN_LINK_REGISTER +.cfi_startproc + stp x29, x30, [sp, #-80]! +.cfi_def_cfa_offset 80 +.cfi_offset w30, -72 +.cfi_offset w29, -80 + mov x29, sp + // We probably could do .cfi_def_cfa w29, 80 at this point, but since + // we don't actually use the frame pointer like that, it's probably not + // worth bothering. + stp d8, d9, [sp, #16] + stp d10, d11, [sp, #32] + stp d12, d13, [sp, #48] + stp d14, d15, [sp, #64] +.cfi_offset b15, -8 +.cfi_offset b14, -16 +.cfi_offset b13, -24 +.cfi_offset b12, -32 +.cfi_offset b11, -40 +.cfi_offset b10, -48 +.cfi_offset b9, -56 +.cfi_offset b8, -64 + + adrp x11, .Lchacha20_consts + add x11, x11, :lo12:.Lchacha20_consts + + ld1 {v24.16b - v27.16b}, [x11] // .Load the CONSTS, INC, ROL8 and CLAMP values + ld1 {v28.16b - v30.16b}, [x5] + + mov x15, #1 // Prepare the Poly1305 state + mov x8, #0 + mov x9, #0 + mov x10, #0 + + mov v31.d[0], x4 // Store the input and aad lengths + mov v31.d[1], x2 + + cmp x2, #128 + b.le .Lopen_128 // Optimization for smaller buffers + + // Initially we prepare a single ChaCha20 block for the Poly1305 R and S keys + mov v0.16b, v24.16b + mov v5.16b, v28.16b + mov v10.16b, v29.16b + mov v15.16b, v30.16b + + mov x6, #10 + +.align 5 +.Lopen_init_rounds: + add v0.4s, v0.4s, v5.4s + eor v15.16b, v15.16b, v0.16b + rev32 v15.8h, v15.8h + + add v10.4s, v10.4s, v15.4s + eor v5.16b, v5.16b, v10.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + add v0.4s, v0.4s, v20.4s + eor v15.16b, v15.16b, v0.16b + tbl v15.16b, {v15.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + eor v20.16b, v20.16b, v10.16b + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + ext v5.16b, v5.16b, v5.16b, #4 + ext v10.16b, v10.16b, v10.16b, #8 + ext v15.16b, v15.16b, v15.16b, #12 + add v0.4s, v0.4s, v5.4s + eor v15.16b, v15.16b, v0.16b + rev32 v15.8h, v15.8h + + add v10.4s, v10.4s, v15.4s + eor v5.16b, v5.16b, v10.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + add v0.4s, v0.4s, v20.4s + eor v15.16b, v15.16b, v0.16b + tbl v15.16b, {v15.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + eor v20.16b, v20.16b, v10.16b + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + ext v5.16b, v5.16b, v5.16b, #12 + ext v10.16b, v10.16b, v10.16b, #8 + ext v15.16b, v15.16b, v15.16b, #4 + subs x6, x6, #1 + b.hi .Lopen_init_rounds + + add v0.4s, v0.4s, v24.4s + add v5.4s, v5.4s, v28.4s + + and v0.16b, v0.16b, v27.16b + mov x16, v0.d[0] // Move the R key to GPRs + mov x17, v0.d[1] + mov v27.16b, v5.16b // Store the S key + + bl .Lpoly_hash_ad_internal + +.Lopen_ad_done: + mov x3, x1 + +// Each iteration of the loop hash 320 bytes, and prepare stream for 320 bytes +.Lopen_main_loop: + + cmp x2, #192 + b.lt .Lopen_tail + + adrp x11, .Lchacha20_consts + add x11, x11, :lo12:.Lchacha20_consts + + ld4r {v0.4s,v1.4s,v2.4s,v3.4s}, [x11] + mov v4.16b, 
v24.16b + + ld4r {v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16 + mov v9.16b, v28.16b + + ld4r {v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16 + mov v14.16b, v29.16b + + ld4r {v15.4s,v16.4s,v17.4s,v18.4s}, [x5] + sub x5, x5, #32 + add v15.4s, v15.4s, v25.4s + mov v19.16b, v30.16b + + eor v20.16b, v20.16b, v20.16b //zero + not v21.16b, v20.16b // -1 + sub v21.4s, v25.4s, v21.4s // Add +1 + ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter) + add v19.4s, v19.4s, v20.4s + + lsr x4, x2, #4 // How many whole blocks we have to hash, will always be at least 12 + sub x4, x4, #10 + + mov x7, #10 + subs x6, x7, x4 + subs x6, x7, x4 // itr1 can be negative if we have more than 320 bytes to hash + csel x7, x7, x4, le // if itr1 is zero or less, itr2 should be 10 to indicate all 10 rounds are full + + cbz x7, .Lopen_main_loop_rounds_short + +.align 5 +.Lopen_main_loop_rounds: + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most +.Lopen_main_loop_rounds_short: + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + add v3.4s, v3.4s, v8.4s + add v4.4s, v4.4s, v9.4s + + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + eor v18.16b, v18.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + rev32 v18.8h, v18.8h + rev32 v19.8h, v19.8h + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + add v13.4s, v13.4s, v18.4s + add v14.4s, v14.4s, v19.4s + + eor v5.16b, v5.16b, v10.16b + eor v6.16b, v6.16b, v11.16b + eor v7.16b, v7.16b, v12.16b + eor v8.16b, v8.16b, v13.16b + eor v9.16b, v9.16b, v14.16b + + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + ushr v5.4s, v6.4s, #20 + sli v5.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + ushr v7.4s, v8.4s, #20 + sli v7.4s, v8.4s, #12 + ushr v8.4s, v9.4s, #20 + sli v8.4s, v9.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + add v3.4s, v3.4s, v7.4s + add v4.4s, v4.4s, v8.4s + + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + eor v18.16b, v18.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + tbl v18.16b, {v18.16b}, v26.16b + tbl v19.16b, {v19.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + add v13.4s, v13.4s, v18.4s + add v14.4s, v14.4s, v19.4s + + eor v20.16b, v20.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v6.16b, v6.16b, v12.16b + eor v7.16b, v7.16b, v13.16b + eor v8.16b, v8.16b, v14.16b + + ushr v9.4s, v8.4s, #25 + sli v9.4s, v8.4s, #7 + ushr v8.4s, v7.4s, #25 + sli v8.4s, 
v7.4s, #7 + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v5.4s, #25 + sli v6.4s, v5.4s, #7 + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + + ext v9.16b, v9.16b, v9.16b, #4 + ext v14.16b, v14.16b, v14.16b, #8 + ext v19.16b, v19.16b, v19.16b, #12 + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + add v0.4s, v0.4s, v6.4s + add v1.4s, v1.4s, v7.4s + add v2.4s, v2.4s, v8.4s + add v3.4s, v3.4s, v5.4s + add v4.4s, v4.4s, v9.4s + + eor v18.16b, v18.16b, v0.16b + eor v15.16b, v15.16b, v1.16b + eor v16.16b, v16.16b, v2.16b + eor v17.16b, v17.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + rev32 v18.8h, v18.8h + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + rev32 v19.8h, v19.8h + + add v12.4s, v12.4s, v18.4s + add v13.4s, v13.4s, v15.4s + add v10.4s, v10.4s, v16.4s + add v11.4s, v11.4s, v17.4s + add v14.4s, v14.4s, v19.4s + + eor v6.16b, v6.16b, v12.16b + eor v7.16b, v7.16b, v13.16b + eor v8.16b, v8.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v9.16b, v9.16b, v14.16b + + ushr v20.4s, v6.4s, #20 + sli v20.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + ushr v7.4s, v8.4s, #20 + sli v7.4s, v8.4s, #12 + ushr v8.4s, v5.4s, #20 + sli v8.4s, v5.4s, #12 + ushr v5.4s, v9.4s, #20 + sli v5.4s, v9.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + add v3.4s, v3.4s, v8.4s + add v4.4s, v4.4s, v5.4s + + eor v18.16b, v18.16b, v0.16b + eor v15.16b, v15.16b, v1.16b + eor v16.16b, v16.16b, v2.16b + eor v17.16b, v17.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + tbl v18.16b, {v18.16b}, v26.16b + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + tbl v19.16b, {v19.16b}, v26.16b + + add v12.4s, v12.4s, v18.4s + add v13.4s, v13.4s, v15.4s + add v10.4s, v10.4s, v16.4s + add v11.4s, v11.4s, v17.4s + add v14.4s, v14.4s, v19.4s + + eor v20.16b, v20.16b, v12.16b + eor v6.16b, v6.16b, v13.16b + eor v7.16b, v7.16b, v10.16b + eor v8.16b, v8.16b, v11.16b + eor v5.16b, v5.16b, v14.16b + + ushr v9.4s, v5.4s, #25 + sli v9.4s, v5.4s, #7 + ushr v5.4s, v8.4s, #25 + sli v5.4s, v8.4s, #7 + ushr v8.4s, v7.4s, #25 + sli v8.4s, v7.4s, #7 + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v20.4s, #25 + sli v6.4s, v20.4s, #7 + + ext v9.16b, v9.16b, v9.16b, #12 + ext v14.16b, v14.16b, v14.16b, #8 + ext v19.16b, v19.16b, v19.16b, #4 + subs x7, x7, #1 + b.gt .Lopen_main_loop_rounds + subs x6, x6, #1 + b.ge .Lopen_main_loop_rounds_short + + eor v20.16b, v20.16b, v20.16b //zero + not v21.16b, v20.16b // -1 + sub v21.4s, v25.4s, v21.4s // Add +1 + ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter) + add v19.4s, v19.4s, v20.4s + + add v15.4s, v15.4s, v25.4s + mov x11, #5 + dup v20.4s, w11 + add 
v25.4s, v25.4s, v20.4s + + zip1 v20.4s, v0.4s, v1.4s + zip2 v21.4s, v0.4s, v1.4s + zip1 v22.4s, v2.4s, v3.4s + zip2 v23.4s, v2.4s, v3.4s + + zip1 v0.2d, v20.2d, v22.2d + zip2 v1.2d, v20.2d, v22.2d + zip1 v2.2d, v21.2d, v23.2d + zip2 v3.2d, v21.2d, v23.2d + + zip1 v20.4s, v5.4s, v6.4s + zip2 v21.4s, v5.4s, v6.4s + zip1 v22.4s, v7.4s, v8.4s + zip2 v23.4s, v7.4s, v8.4s + + zip1 v5.2d, v20.2d, v22.2d + zip2 v6.2d, v20.2d, v22.2d + zip1 v7.2d, v21.2d, v23.2d + zip2 v8.2d, v21.2d, v23.2d + + zip1 v20.4s, v10.4s, v11.4s + zip2 v21.4s, v10.4s, v11.4s + zip1 v22.4s, v12.4s, v13.4s + zip2 v23.4s, v12.4s, v13.4s + + zip1 v10.2d, v20.2d, v22.2d + zip2 v11.2d, v20.2d, v22.2d + zip1 v12.2d, v21.2d, v23.2d + zip2 v13.2d, v21.2d, v23.2d + + zip1 v20.4s, v15.4s, v16.4s + zip2 v21.4s, v15.4s, v16.4s + zip1 v22.4s, v17.4s, v18.4s + zip2 v23.4s, v17.4s, v18.4s + + zip1 v15.2d, v20.2d, v22.2d + zip2 v16.2d, v20.2d, v22.2d + zip1 v17.2d, v21.2d, v23.2d + zip2 v18.2d, v21.2d, v23.2d + + add v0.4s, v0.4s, v24.4s + add v5.4s, v5.4s, v28.4s + add v10.4s, v10.4s, v29.4s + add v15.4s, v15.4s, v30.4s + + add v1.4s, v1.4s, v24.4s + add v6.4s, v6.4s, v28.4s + add v11.4s, v11.4s, v29.4s + add v16.4s, v16.4s, v30.4s + + add v2.4s, v2.4s, v24.4s + add v7.4s, v7.4s, v28.4s + add v12.4s, v12.4s, v29.4s + add v17.4s, v17.4s, v30.4s + + add v3.4s, v3.4s, v24.4s + add v8.4s, v8.4s, v28.4s + add v13.4s, v13.4s, v29.4s + add v18.4s, v18.4s, v30.4s + + add v4.4s, v4.4s, v24.4s + add v9.4s, v9.4s, v28.4s + add v14.4s, v14.4s, v29.4s + add v19.4s, v19.4s, v30.4s + + // We can always safely store 192 bytes + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v0.16b + eor v21.16b, v21.16b, v5.16b + eor v22.16b, v22.16b, v10.16b + eor v23.16b, v23.16b, v15.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v1.16b + eor v21.16b, v21.16b, v6.16b + eor v22.16b, v22.16b, v11.16b + eor v23.16b, v23.16b, v16.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v2.16b + eor v21.16b, v21.16b, v7.16b + eor v22.16b, v22.16b, v12.16b + eor v23.16b, v23.16b, v17.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + sub x2, x2, #192 + + mov v0.16b, v3.16b + mov v5.16b, v8.16b + mov v10.16b, v13.16b + mov v15.16b, v18.16b + + cmp x2, #64 + b.lt .Lopen_tail_64_store + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v3.16b + eor v21.16b, v21.16b, v8.16b + eor v22.16b, v22.16b, v13.16b + eor v23.16b, v23.16b, v18.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + sub x2, x2, #64 + + mov v0.16b, v4.16b + mov v5.16b, v9.16b + mov v10.16b, v14.16b + mov v15.16b, v19.16b + + cmp x2, #64 + b.lt .Lopen_tail_64_store + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v4.16b + eor v21.16b, v21.16b, v9.16b + eor v22.16b, v22.16b, v14.16b + eor v23.16b, v23.16b, v19.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + sub x2, x2, #64 + b .Lopen_main_loop + +.Lopen_tail: + + cbz x2, .Lopen_finalize + + lsr x4, x2, #4 // How many whole blocks we have to hash + + cmp x2, #64 + b.le .Lopen_tail_64 + cmp x2, #128 + b.le .Lopen_tail_128 + +.Lopen_tail_192: + // We need three more blocks + mov v0.16b, v24.16b + mov v1.16b, v24.16b + mov v2.16b, v24.16b + mov v5.16b, v28.16b + mov v6.16b, v28.16b + mov v7.16b, v28.16b + mov v10.16b, v29.16b + mov v11.16b, v29.16b + mov v12.16b, v29.16b + mov v15.16b, v30.16b + mov v16.16b, v30.16b + mov v17.16b, v30.16b + eor v23.16b, v23.16b, v23.16b + eor v21.16b, v21.16b, v21.16b + ins v23.s[0], v25.s[0] + ins v21.d[0], 
x15 + + add v22.4s, v23.4s, v21.4s + add v21.4s, v22.4s, v21.4s + + add v15.4s, v15.4s, v21.4s + add v16.4s, v16.4s, v23.4s + add v17.4s, v17.4s, v22.4s + + mov x7, #10 + subs x6, x7, x4 // itr1 can be negative if we have more than 160 bytes to hash + csel x7, x7, x4, le // if itr1 is zero or less, itr2 should be 10 to indicate all 10 rounds are hashing + sub x4, x4, x7 + + cbz x7, .Lopen_tail_192_rounds_no_hash + +.Lopen_tail_192_rounds: + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most +.Lopen_tail_192_rounds_no_hash: + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v5.16b, v5.16b, v10.16b + eor v6.16b, v6.16b, v11.16b + eor v7.16b, v7.16b, v12.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + ushr v5.4s, v6.4s, #20 + sli v5.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v20.16b, v20.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v6.16b, v6.16b, v12.16b + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v5.4s, #25 + sli v6.4s, v5.4s, #7 + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + + ext v5.16b, v5.16b, v5.16b, #4 + ext v6.16b, v6.16b, v6.16b, #4 + ext v7.16b, v7.16b, v7.16b, #4 + + ext v10.16b, v10.16b, v10.16b, #8 + ext v11.16b, v11.16b, v11.16b, #8 + ext v12.16b, v12.16b, v12.16b, #8 + + ext v15.16b, v15.16b, v15.16b, #12 + ext v16.16b, v16.16b, v16.16b, #12 + ext v17.16b, v17.16b, v17.16b, #12 + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v5.16b, v5.16b, v10.16b + eor v6.16b, v6.16b, v11.16b + eor v7.16b, v7.16b, v12.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + ushr v5.4s, v6.4s, #20 + sli v5.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b 
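+ // tbl with the ROL8 table in v26 rotates each 32-bit lane left by 8 bits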
+ tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v20.16b, v20.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v6.16b, v6.16b, v12.16b + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v5.4s, #25 + sli v6.4s, v5.4s, #7 + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + + ext v5.16b, v5.16b, v5.16b, #12 + ext v6.16b, v6.16b, v6.16b, #12 + ext v7.16b, v7.16b, v7.16b, #12 + + ext v10.16b, v10.16b, v10.16b, #8 + ext v11.16b, v11.16b, v11.16b, #8 + ext v12.16b, v12.16b, v12.16b, #8 + + ext v15.16b, v15.16b, v15.16b, #4 + ext v16.16b, v16.16b, v16.16b, #4 + ext v17.16b, v17.16b, v17.16b, #4 + subs x7, x7, #1 + b.gt .Lopen_tail_192_rounds + subs x6, x6, #1 + b.ge .Lopen_tail_192_rounds_no_hash + + // We hashed 160 bytes at most, may still have 32 bytes left +.Lopen_tail_192_hash: + cbz x4, .Lopen_tail_192_hash_done + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + sub x4, x4, #1 + b .Lopen_tail_192_hash + +.Lopen_tail_192_hash_done: + + add v0.4s, v0.4s, v24.4s + add v1.4s, v1.4s, v24.4s + add v2.4s, v2.4s, v24.4s + add v5.4s, v5.4s, v28.4s + add v6.4s, v6.4s, v28.4s + add v7.4s, v7.4s, v28.4s + add v10.4s, v10.4s, v29.4s + add v11.4s, v11.4s, v29.4s + add v12.4s, v12.4s, v29.4s + add v15.4s, v15.4s, v30.4s + add v16.4s, v16.4s, v30.4s + add v17.4s, v17.4s, v30.4s + + add v15.4s, v15.4s, v21.4s + add v16.4s, v16.4s, v23.4s + add v17.4s, v17.4s, v22.4s + + ld1 {v20.16b - v23.16b}, [x1], #64 + + eor v20.16b, v20.16b, v1.16b + eor v21.16b, v21.16b, v6.16b + eor v22.16b, v22.16b, v11.16b + eor v23.16b, v23.16b, v16.16b + + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + + eor v20.16b, v20.16b, v2.16b + eor v21.16b, v21.16b, v7.16b + eor v22.16b, v22.16b, v12.16b + eor v23.16b, v23.16b, v17.16b + + st1 {v20.16b - v23.16b}, [x0], #64 + + sub x2, x2, #128 + b .Lopen_tail_64_store + +.Lopen_tail_128: + // We need two more blocks + mov v0.16b, v24.16b + mov v1.16b, v24.16b + mov v5.16b, v28.16b + mov v6.16b, v28.16b + mov v10.16b, v29.16b + mov v11.16b, v29.16b + mov v15.16b, v30.16b + mov v16.16b, v30.16b + eor v23.16b, v23.16b, v23.16b + eor v22.16b, v22.16b, v22.16b + ins v23.s[0], v25.s[0] + ins v22.d[0], x15 + add v22.4s, v22.4s, v23.4s + + add v15.4s, v15.4s, v22.4s + add v16.4s, v16.4s, v23.4s + + mov x6, #10 + sub x6, x6, x4 + +.Lopen_tail_128_rounds: + add v0.4s, v0.4s, v5.4s + eor v15.16b, v15.16b, v0.16b + rev32 v15.8h, v15.8h + + add v10.4s, v10.4s, v15.4s + eor v5.16b, v5.16b, v10.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + add v0.4s, v0.4s, v20.4s + eor v15.16b, v15.16b, v0.16b + tbl v15.16b, {v15.16b}, v26.16b + + 
add v10.4s, v10.4s, v15.4s + eor v20.16b, v20.16b, v10.16b + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + ext v5.16b, v5.16b, v5.16b, #4 + ext v10.16b, v10.16b, v10.16b, #8 + ext v15.16b, v15.16b, v15.16b, #12 + add v1.4s, v1.4s, v6.4s + eor v16.16b, v16.16b, v1.16b + rev32 v16.8h, v16.8h + + add v11.4s, v11.4s, v16.4s + eor v6.16b, v6.16b, v11.16b + ushr v20.4s, v6.4s, #20 + sli v20.4s, v6.4s, #12 + add v1.4s, v1.4s, v20.4s + eor v16.16b, v16.16b, v1.16b + tbl v16.16b, {v16.16b}, v26.16b + + add v11.4s, v11.4s, v16.4s + eor v20.16b, v20.16b, v11.16b + ushr v6.4s, v20.4s, #25 + sli v6.4s, v20.4s, #7 + ext v6.16b, v6.16b, v6.16b, #4 + ext v11.16b, v11.16b, v11.16b, #8 + ext v16.16b, v16.16b, v16.16b, #12 + add v0.4s, v0.4s, v5.4s + eor v15.16b, v15.16b, v0.16b + rev32 v15.8h, v15.8h + + add v10.4s, v10.4s, v15.4s + eor v5.16b, v5.16b, v10.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + add v0.4s, v0.4s, v20.4s + eor v15.16b, v15.16b, v0.16b + tbl v15.16b, {v15.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + eor v20.16b, v20.16b, v10.16b + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + ext v5.16b, v5.16b, v5.16b, #12 + ext v10.16b, v10.16b, v10.16b, #8 + ext v15.16b, v15.16b, v15.16b, #4 + add v1.4s, v1.4s, v6.4s + eor v16.16b, v16.16b, v1.16b + rev32 v16.8h, v16.8h + + add v11.4s, v11.4s, v16.4s + eor v6.16b, v6.16b, v11.16b + ushr v20.4s, v6.4s, #20 + sli v20.4s, v6.4s, #12 + add v1.4s, v1.4s, v20.4s + eor v16.16b, v16.16b, v1.16b + tbl v16.16b, {v16.16b}, v26.16b + + add v11.4s, v11.4s, v16.4s + eor v20.16b, v20.16b, v11.16b + ushr v6.4s, v20.4s, #25 + sli v6.4s, v20.4s, #7 + ext v6.16b, v6.16b, v6.16b, #12 + ext v11.16b, v11.16b, v11.16b, #8 + ext v16.16b, v16.16b, v16.16b, #4 + subs x6, x6, #1 + b.gt .Lopen_tail_128_rounds + cbz x4, .Lopen_tail_128_rounds_done + subs x4, x4, #1 + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + b .Lopen_tail_128_rounds + +.Lopen_tail_128_rounds_done: + add v0.4s, v0.4s, v24.4s + add v1.4s, v1.4s, v24.4s + add v5.4s, v5.4s, v28.4s + add v6.4s, v6.4s, v28.4s + add v10.4s, v10.4s, v29.4s + add v11.4s, v11.4s, v29.4s + add v15.4s, v15.4s, v30.4s + add v16.4s, v16.4s, v30.4s + add v15.4s, v15.4s, v22.4s + add v16.4s, v16.4s, v23.4s + + ld1 {v20.16b - v23.16b}, [x1], #64 + + eor v20.16b, v20.16b, v1.16b + eor v21.16b, v21.16b, v6.16b + eor v22.16b, v22.16b, v11.16b + eor v23.16b, v23.16b, v16.16b + + st1 {v20.16b - v23.16b}, [x0], #64 + sub x2, x2, #64 + + b .Lopen_tail_64_store + +.Lopen_tail_64: + // We just need a single block + mov v0.16b, v24.16b + mov v5.16b, v28.16b + mov v10.16b, v29.16b + mov v15.16b, v30.16b + eor v23.16b, v23.16b, v23.16b + ins v23.s[0], v25.s[0] + add v15.4s, v15.4s, v23.4s + + mov x6, #10 + sub x6, x6, x4 + +.Lopen_tail_64_rounds: + add 
v0.4s, v0.4s, v5.4s + eor v15.16b, v15.16b, v0.16b + rev32 v15.8h, v15.8h + + add v10.4s, v10.4s, v15.4s + eor v5.16b, v5.16b, v10.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + add v0.4s, v0.4s, v20.4s + eor v15.16b, v15.16b, v0.16b + tbl v15.16b, {v15.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + eor v20.16b, v20.16b, v10.16b + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + ext v5.16b, v5.16b, v5.16b, #4 + ext v10.16b, v10.16b, v10.16b, #8 + ext v15.16b, v15.16b, v15.16b, #12 + add v0.4s, v0.4s, v5.4s + eor v15.16b, v15.16b, v0.16b + rev32 v15.8h, v15.8h + + add v10.4s, v10.4s, v15.4s + eor v5.16b, v5.16b, v10.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + add v0.4s, v0.4s, v20.4s + eor v15.16b, v15.16b, v0.16b + tbl v15.16b, {v15.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + eor v20.16b, v20.16b, v10.16b + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + ext v5.16b, v5.16b, v5.16b, #12 + ext v10.16b, v10.16b, v10.16b, #8 + ext v15.16b, v15.16b, v15.16b, #4 + subs x6, x6, #1 + b.gt .Lopen_tail_64_rounds + cbz x4, .Lopen_tail_64_rounds_done + subs x4, x4, #1 + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + b .Lopen_tail_64_rounds + +.Lopen_tail_64_rounds_done: + add v0.4s, v0.4s, v24.4s + add v5.4s, v5.4s, v28.4s + add v10.4s, v10.4s, v29.4s + add v15.4s, v15.4s, v30.4s + add v15.4s, v15.4s, v23.4s + +.Lopen_tail_64_store: + cmp x2, #16 + b.lt .Lopen_tail_16 + + ld1 {v20.16b}, [x1], #16 + eor v20.16b, v20.16b, v0.16b + st1 {v20.16b}, [x0], #16 + mov v0.16b, v5.16b + mov v5.16b, v10.16b + mov v10.16b, v15.16b + sub x2, x2, #16 + b .Lopen_tail_64_store + +.Lopen_tail_16: + // Here we handle the last [0,16) bytes that require a padded block + cbz x2, .Lopen_finalize + + eor v20.16b, v20.16b, v20.16b // Use T0 to load the ciphertext + eor v21.16b, v21.16b, v21.16b // Use T1 to generate an AND mask + not v22.16b, v20.16b + + add x7, x1, x2 + mov x6, x2 + +.Lopen_tail_16_compose: + ext v20.16b, v20.16b, v20.16b, #15 + ldrb w11, [x7, #-1]! 
+ mov v20.b[0], w11 + ext v21.16b, v22.16b, v21.16b, #15 + subs x2, x2, #1 + b.gt .Lopen_tail_16_compose + + and v20.16b, v20.16b, v21.16b + // Hash in the final padded block + mov x11, v20.d[0] + mov x12, v20.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + eor v20.16b, v20.16b, v0.16b + +.Lopen_tail_16_store: + umov w11, v20.b[0] + strb w11, [x0], #1 + ext v20.16b, v20.16b, v20.16b, #1 + subs x6, x6, #1 + b.gt .Lopen_tail_16_store + +.Lopen_finalize: + mov x11, v31.d[0] + mov x12, v31.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + // Final reduction step + sub x12, xzr, x15 + orr x13, xzr, #3 + subs x11, x8, #-5 + sbcs x12, x9, x12 + sbcs x13, x10, x13 + csel x8, x11, x8, cs + csel x9, x12, x9, cs + csel x10, x13, x10, cs + mov x11, v27.d[0] + mov x12, v27.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + + stp x8, x9, [x5] + + ldp d8, d9, [sp, #16] + ldp d10, d11, [sp, #32] + ldp d12, d13, [sp, #48] + ldp d14, d15, [sp, #64] +.cfi_restore b15 +.cfi_restore b14 +.cfi_restore b13 +.cfi_restore b12 +.cfi_restore b11 +.cfi_restore b10 +.cfi_restore b9 +.cfi_restore b8 + ldp x29, x30, [sp], 80 +.cfi_restore w29 +.cfi_restore w30 +.cfi_def_cfa_offset 0 + AARCH64_VALIDATE_LINK_REGISTER + ret + +.Lopen_128: + // On some architectures preparing 5 blocks for small buffers is wasteful + eor v25.16b, v25.16b, v25.16b + mov x11, #1 + mov v25.s[0], w11 + mov v0.16b, v24.16b + mov v1.16b, v24.16b + mov v2.16b, v24.16b + mov v5.16b, v28.16b + mov v6.16b, v28.16b + mov v7.16b, v28.16b + mov v10.16b, v29.16b + mov v11.16b, v29.16b + mov v12.16b, v29.16b + mov v17.16b, v30.16b + add v15.4s, v17.4s, v25.4s + add v16.4s, v15.4s, v25.4s + + mov x6, #10 + +.Lopen_128_rounds: + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add 
v12.4s, v12.4s, v17.4s + eor v5.16b, v5.16b, v10.16b + eor v6.16b, v6.16b, v11.16b + eor v7.16b, v7.16b, v12.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + ushr v5.4s, v6.4s, #20 + sli v5.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v20.16b, v20.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v6.16b, v6.16b, v12.16b + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v5.4s, #25 + sli v6.4s, v5.4s, #7 + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + + ext v5.16b, v5.16b, v5.16b, #4 + ext v6.16b, v6.16b, v6.16b, #4 + ext v7.16b, v7.16b, v7.16b, #4 + + ext v10.16b, v10.16b, v10.16b, #8 + ext v11.16b, v11.16b, v11.16b, #8 + ext v12.16b, v12.16b, v12.16b, #8 + + ext v15.16b, v15.16b, v15.16b, #12 + ext v16.16b, v16.16b, v16.16b, #12 + ext v17.16b, v17.16b, v17.16b, #12 + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v5.16b, v5.16b, v10.16b + eor v6.16b, v6.16b, v11.16b + eor v7.16b, v7.16b, v12.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + ushr v5.4s, v6.4s, #20 + sli v5.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v20.16b, v20.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v6.16b, v6.16b, v12.16b + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v5.4s, #25 + sli v6.4s, v5.4s, #7 + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + + ext v5.16b, v5.16b, v5.16b, #12 + ext v6.16b, v6.16b, v6.16b, #12 + ext v7.16b, v7.16b, v7.16b, #12 + + ext v10.16b, v10.16b, v10.16b, #8 + ext v11.16b, v11.16b, v11.16b, #8 + ext v12.16b, v12.16b, v12.16b, #8 + + ext v15.16b, v15.16b, v15.16b, #4 + ext v16.16b, v16.16b, v16.16b, #4 + ext v17.16b, v17.16b, v17.16b, #4 + subs x6, x6, #1 + b.hi .Lopen_128_rounds + + add v0.4s, v0.4s, v24.4s + add v1.4s, v1.4s, v24.4s + add v2.4s, v2.4s, v24.4s + + add v5.4s, v5.4s, v28.4s + add v6.4s, v6.4s, v28.4s + add v7.4s, v7.4s, v28.4s + + add v10.4s, v10.4s, v29.4s + add v11.4s, v11.4s, v29.4s + + add v30.4s, v30.4s, v25.4s + add v15.4s, v15.4s, v30.4s + add v30.4s, v30.4s, v25.4s + add v16.4s, v16.4s, v30.4s + + and v2.16b, v2.16b, v27.16b + mov x16, v2.d[0] // Move the R key to GPRs + mov x17, v2.d[1] + mov v27.16b, v7.16b // Store the S key + + bl .Lpoly_hash_ad_internal + +.Lopen_128_store: + cmp x2, #64 + b.lt .Lopen_128_store_64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + + mov x11, v20.d[0] + mov x12, v20.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, 
x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + mov x11, v21.d[0] + mov x12, v21.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + mov x11, v22.d[0] + mov x12, v22.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + mov x11, v23.d[0] + mov x12, v23.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + + eor v20.16b, v20.16b, v0.16b + eor v21.16b, v21.16b, v5.16b + eor v22.16b, v22.16b, v10.16b + eor v23.16b, v23.16b, v15.16b + + st1 {v20.16b - v23.16b}, [x0], #64 + + sub x2, x2, #64 + + mov v0.16b, v1.16b + mov v5.16b, v6.16b + mov v10.16b, v11.16b + mov v15.16b, v16.16b + +.Lopen_128_store_64: + + lsr x4, x2, #4 + mov x3, x1 + 
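+ // x4 = number of whole 16-byte ciphertext blocks still to be absorbed into
+ // Poly1305 before the remaining bytes are decrypted and written out.
+ // The accumulator is kept in x10:x9:x8 (a 2-bit top limb over two 64-bit
+ // limbs) and the clamped r key in x17:x16; after each multiply the bits
+ // above 2^130 are folded back in as 4*H + H, using 2^130 = 5 (mod 2^130 - 5).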
+.Lopen_128_hash_64: + cbz x4, .Lopen_tail_64_store + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + sub x4, x4, #1 + b .Lopen_128_hash_64 +.cfi_endproc +.size chacha20_poly1305_open,.-chacha20_poly1305_open +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__) diff --git a/ring-0.17.14/pregenerated/chacha20_poly1305_armv8-win64.S b/ring-0.17.14/pregenerated/chacha20_poly1305_armv8-win64.S new file mode 100644 index 0000000000..75f34c5e6b --- /dev/null +++ b/ring-0.17.14/pregenerated/chacha20_poly1305_armv8-win64.S @@ -0,0 +1,3014 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32) +.section .rodata + +.align 7 +Lchacha20_consts: +.byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k' +Linc: +.long 1,2,3,4 +Lrol8: +.byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 +Lclamp: +.quad 0x0FFFFFFC0FFFFFFF, 0x0FFFFFFC0FFFFFFC + +.text + +.def Lpoly_hash_ad_internal + .type 32 +.endef +.align 6 +Lpoly_hash_ad_internal: +.cfi_startproc + cbnz x4, Lpoly_hash_intro + ret + +Lpoly_hash_intro: + cmp x4, #16 + b.lt Lpoly_hash_ad_tail + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + sub x4, x4, #16 + b Lpoly_hash_ad_internal + +Lpoly_hash_ad_tail: + cbz x4, Lpoly_hash_ad_ret + + eor v20.16b, v20.16b, v20.16b // Use T0 to load the AAD + sub x4, x4, #1 + +Lpoly_hash_tail_16_compose: + ext v20.16b, v20.16b, v20.16b, #15 + ldrb w11, [x3, x4] + mov v20.b[0], w11 + subs x4, x4, #1 + b.ge Lpoly_hash_tail_16_compose + mov x11, v20.d[0] + mov x12, v20.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, 
x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + +Lpoly_hash_ad_ret: + ret +.cfi_endproc + + +///////////////////////////////// +// +// void chacha20_poly1305_seal(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, union open_data *seal_data); +// +.globl chacha20_poly1305_seal + +.def chacha20_poly1305_seal + .type 32 +.endef +.align 6 +chacha20_poly1305_seal: + AARCH64_SIGN_LINK_REGISTER +.cfi_startproc + stp x29, x30, [sp, #-80]! +.cfi_def_cfa_offset 80 +.cfi_offset w30, -72 +.cfi_offset w29, -80 + mov x29, sp + // We probably could do .cfi_def_cfa w29, 80 at this point, but since + // we don't actually use the frame pointer like that, it's probably not + // worth bothering. + stp d8, d9, [sp, #16] + stp d10, d11, [sp, #32] + stp d12, d13, [sp, #48] + stp d14, d15, [sp, #64] +.cfi_offset b15, -8 +.cfi_offset b14, -16 +.cfi_offset b13, -24 +.cfi_offset b12, -32 +.cfi_offset b11, -40 +.cfi_offset b10, -48 +.cfi_offset b9, -56 +.cfi_offset b8, -64 + + adrp x11, Lchacha20_consts + add x11, x11, :lo12:Lchacha20_consts + + ld1 {v24.16b - v27.16b}, [x11] // Load the CONSTS, INC, ROL8 and CLAMP values + ld1 {v28.16b - v30.16b}, [x5] + + mov x15, #1 // Prepare the Poly1305 state + mov x8, #0 + mov x9, #0 + mov x10, #0 + + ldr x12, [x5, #56] // The total cipher text length includes extra_in_len + add x12, x12, x2 + mov v31.d[0], x4 // Store the input and aad lengths + mov v31.d[1], x12 + + cmp x2, #128 + b.le Lseal_128 // Optimization for smaller buffers + + // Initially we prepare 5 ChaCha20 blocks. Four to encrypt up to 4 blocks (256 bytes) of plaintext, + // and one for the Poly1305 R and S keys. The first four blocks (A0-A3..D0-D3) are computed vertically, + // the fifth block (A4-D4) horizontally. 
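+ // "Vertically" here means the ld4r loads below replicate each 32-bit state
+ // word across a whole vector, so lane i of the A0-A3/B0-B3/C0-C3/D0-D3
+ // registers belongs to block i and one NEON instruction advances all four
+ // blocks at once (the four counters in v15 are differentiated by adding Linc).
+ // The fifth block (v4, v9, v14, v19) is a conventional 4x4 state run through
+ // the same rounds; its first 32 bytes of keystream supply the clamped
+ // Poly1305 r key and the s key.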
+ ld4r {v0.4s,v1.4s,v2.4s,v3.4s}, [x11] + mov v4.16b, v24.16b + + ld4r {v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16 + mov v9.16b, v28.16b + + ld4r {v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16 + mov v14.16b, v29.16b + + ld4r {v15.4s,v16.4s,v17.4s,v18.4s}, [x5] + add v15.4s, v15.4s, v25.4s + mov v19.16b, v30.16b + + sub x5, x5, #32 + + mov x6, #10 + +.align 5 +Lseal_init_rounds: + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + add v3.4s, v3.4s, v8.4s + add v4.4s, v4.4s, v9.4s + + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + eor v18.16b, v18.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + rev32 v18.8h, v18.8h + rev32 v19.8h, v19.8h + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + add v13.4s, v13.4s, v18.4s + add v14.4s, v14.4s, v19.4s + + eor v5.16b, v5.16b, v10.16b + eor v6.16b, v6.16b, v11.16b + eor v7.16b, v7.16b, v12.16b + eor v8.16b, v8.16b, v13.16b + eor v9.16b, v9.16b, v14.16b + + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + ushr v5.4s, v6.4s, #20 + sli v5.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + ushr v7.4s, v8.4s, #20 + sli v7.4s, v8.4s, #12 + ushr v8.4s, v9.4s, #20 + sli v8.4s, v9.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + add v3.4s, v3.4s, v7.4s + add v4.4s, v4.4s, v8.4s + + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + eor v18.16b, v18.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + tbl v18.16b, {v18.16b}, v26.16b + tbl v19.16b, {v19.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + add v13.4s, v13.4s, v18.4s + add v14.4s, v14.4s, v19.4s + + eor v20.16b, v20.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v6.16b, v6.16b, v12.16b + eor v7.16b, v7.16b, v13.16b + eor v8.16b, v8.16b, v14.16b + + ushr v9.4s, v8.4s, #25 + sli v9.4s, v8.4s, #7 + ushr v8.4s, v7.4s, #25 + sli v8.4s, v7.4s, #7 + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v5.4s, #25 + sli v6.4s, v5.4s, #7 + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + + ext v9.16b, v9.16b, v9.16b, #4 + ext v14.16b, v14.16b, v14.16b, #8 + ext v19.16b, v19.16b, v19.16b, #12 + add v0.4s, v0.4s, v6.4s + add v1.4s, v1.4s, v7.4s + add v2.4s, v2.4s, v8.4s + add v3.4s, v3.4s, v5.4s + add v4.4s, v4.4s, v9.4s + + eor v18.16b, v18.16b, v0.16b + eor v15.16b, v15.16b, v1.16b + eor v16.16b, v16.16b, v2.16b + eor v17.16b, v17.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + rev32 v18.8h, v18.8h + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + rev32 v19.8h, v19.8h + + add v12.4s, v12.4s, v18.4s + add v13.4s, v13.4s, v15.4s + add v10.4s, v10.4s, v16.4s + add v11.4s, v11.4s, v17.4s + add v14.4s, v14.4s, v19.4s + + eor v6.16b, v6.16b, v12.16b + eor v7.16b, v7.16b, v13.16b + eor v8.16b, v8.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v9.16b, v9.16b, v14.16b + + ushr v20.4s, v6.4s, #20 + sli v20.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + ushr v7.4s, v8.4s, #20 + sli v7.4s, v8.4s, #12 + ushr v8.4s, v5.4s, #20 + sli v8.4s, v5.4s, #12 + ushr v5.4s, v9.4s, #20 + sli v5.4s, v9.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + add v3.4s, v3.4s, v8.4s + add v4.4s, v4.4s, v5.4s + + eor v18.16b, v18.16b, v0.16b + 
eor v15.16b, v15.16b, v1.16b + eor v16.16b, v16.16b, v2.16b + eor v17.16b, v17.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + tbl v18.16b, {v18.16b}, v26.16b + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + tbl v19.16b, {v19.16b}, v26.16b + + add v12.4s, v12.4s, v18.4s + add v13.4s, v13.4s, v15.4s + add v10.4s, v10.4s, v16.4s + add v11.4s, v11.4s, v17.4s + add v14.4s, v14.4s, v19.4s + + eor v20.16b, v20.16b, v12.16b + eor v6.16b, v6.16b, v13.16b + eor v7.16b, v7.16b, v10.16b + eor v8.16b, v8.16b, v11.16b + eor v5.16b, v5.16b, v14.16b + + ushr v9.4s, v5.4s, #25 + sli v9.4s, v5.4s, #7 + ushr v5.4s, v8.4s, #25 + sli v5.4s, v8.4s, #7 + ushr v8.4s, v7.4s, #25 + sli v8.4s, v7.4s, #7 + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v20.4s, #25 + sli v6.4s, v20.4s, #7 + + ext v9.16b, v9.16b, v9.16b, #12 + ext v14.16b, v14.16b, v14.16b, #8 + ext v19.16b, v19.16b, v19.16b, #4 + subs x6, x6, #1 + b.hi Lseal_init_rounds + + add v15.4s, v15.4s, v25.4s + mov x11, #4 + dup v20.4s, w11 + add v25.4s, v25.4s, v20.4s + + zip1 v20.4s, v0.4s, v1.4s + zip2 v21.4s, v0.4s, v1.4s + zip1 v22.4s, v2.4s, v3.4s + zip2 v23.4s, v2.4s, v3.4s + + zip1 v0.2d, v20.2d, v22.2d + zip2 v1.2d, v20.2d, v22.2d + zip1 v2.2d, v21.2d, v23.2d + zip2 v3.2d, v21.2d, v23.2d + + zip1 v20.4s, v5.4s, v6.4s + zip2 v21.4s, v5.4s, v6.4s + zip1 v22.4s, v7.4s, v8.4s + zip2 v23.4s, v7.4s, v8.4s + + zip1 v5.2d, v20.2d, v22.2d + zip2 v6.2d, v20.2d, v22.2d + zip1 v7.2d, v21.2d, v23.2d + zip2 v8.2d, v21.2d, v23.2d + + zip1 v20.4s, v10.4s, v11.4s + zip2 v21.4s, v10.4s, v11.4s + zip1 v22.4s, v12.4s, v13.4s + zip2 v23.4s, v12.4s, v13.4s + + zip1 v10.2d, v20.2d, v22.2d + zip2 v11.2d, v20.2d, v22.2d + zip1 v12.2d, v21.2d, v23.2d + zip2 v13.2d, v21.2d, v23.2d + + zip1 v20.4s, v15.4s, v16.4s + zip2 v21.4s, v15.4s, v16.4s + zip1 v22.4s, v17.4s, v18.4s + zip2 v23.4s, v17.4s, v18.4s + + zip1 v15.2d, v20.2d, v22.2d + zip2 v16.2d, v20.2d, v22.2d + zip1 v17.2d, v21.2d, v23.2d + zip2 v18.2d, v21.2d, v23.2d + + add v4.4s, v4.4s, v24.4s + add v9.4s, v9.4s, v28.4s + and v4.16b, v4.16b, v27.16b + + add v0.4s, v0.4s, v24.4s + add v5.4s, v5.4s, v28.4s + add v10.4s, v10.4s, v29.4s + add v15.4s, v15.4s, v30.4s + + add v1.4s, v1.4s, v24.4s + add v6.4s, v6.4s, v28.4s + add v11.4s, v11.4s, v29.4s + add v16.4s, v16.4s, v30.4s + + add v2.4s, v2.4s, v24.4s + add v7.4s, v7.4s, v28.4s + add v12.4s, v12.4s, v29.4s + add v17.4s, v17.4s, v30.4s + + add v3.4s, v3.4s, v24.4s + add v8.4s, v8.4s, v28.4s + add v13.4s, v13.4s, v29.4s + add v18.4s, v18.4s, v30.4s + + mov x16, v4.d[0] // Move the R key to GPRs + mov x17, v4.d[1] + mov v27.16b, v9.16b // Store the S key + + bl Lpoly_hash_ad_internal + + mov x3, x0 + cmp x2, #256 + b.le Lseal_tail + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v0.16b + eor v21.16b, v21.16b, v5.16b + eor v22.16b, v22.16b, v10.16b + eor v23.16b, v23.16b, v15.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v1.16b + eor v21.16b, v21.16b, v6.16b + eor v22.16b, v22.16b, v11.16b + eor v23.16b, v23.16b, v16.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v2.16b + eor v21.16b, v21.16b, v7.16b + eor v22.16b, v22.16b, v12.16b + eor v23.16b, v23.16b, v17.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v3.16b + eor v21.16b, v21.16b, v8.16b + eor v22.16b, v22.16b, v13.16b + eor v23.16b, v23.16b, v18.16b + st1 {v20.16b - 
v23.16b}, [x0], #64 + + sub x2, x2, #256 + + mov x6, #4 // In the first run of the loop we need to hash 256 bytes, therefore we hash one block for the first 4 rounds + mov x7, #6 // and two blocks for the remaining 6, for a total of (1 * 4 + 2 * 6) * 16 = 256 + +Lseal_main_loop: + adrp x11, Lchacha20_consts + add x11, x11, :lo12:Lchacha20_consts + + ld4r {v0.4s,v1.4s,v2.4s,v3.4s}, [x11] + mov v4.16b, v24.16b + + ld4r {v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16 + mov v9.16b, v28.16b + + ld4r {v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16 + mov v14.16b, v29.16b + + ld4r {v15.4s,v16.4s,v17.4s,v18.4s}, [x5] + add v15.4s, v15.4s, v25.4s + mov v19.16b, v30.16b + + eor v20.16b, v20.16b, v20.16b //zero + not v21.16b, v20.16b // -1 + sub v21.4s, v25.4s, v21.4s // Add +1 + ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter) + add v19.4s, v19.4s, v20.4s + + sub x5, x5, #32 +.align 5 +Lseal_main_loop_rounds: + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + add v3.4s, v3.4s, v8.4s + add v4.4s, v4.4s, v9.4s + + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + eor v18.16b, v18.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + rev32 v18.8h, v18.8h + rev32 v19.8h, v19.8h + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + add v13.4s, v13.4s, v18.4s + add v14.4s, v14.4s, v19.4s + + eor v5.16b, v5.16b, v10.16b + eor v6.16b, v6.16b, v11.16b + eor v7.16b, v7.16b, v12.16b + eor v8.16b, v8.16b, v13.16b + eor v9.16b, v9.16b, v14.16b + + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + ushr v5.4s, v6.4s, #20 + sli v5.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + ushr v7.4s, v8.4s, #20 + sli v7.4s, v8.4s, #12 + ushr v8.4s, v9.4s, #20 + sli v8.4s, v9.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + add v3.4s, v3.4s, v7.4s + add v4.4s, v4.4s, v8.4s + + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + eor v18.16b, v18.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + tbl v18.16b, {v18.16b}, v26.16b + tbl v19.16b, {v19.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + add v13.4s, v13.4s, v18.4s + add v14.4s, v14.4s, v19.4s + + eor v20.16b, v20.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v6.16b, v6.16b, v12.16b + eor v7.16b, v7.16b, v13.16b + eor v8.16b, v8.16b, v14.16b + + ushr v9.4s, v8.4s, #25 + sli v9.4s, v8.4s, #7 + ushr v8.4s, v7.4s, #25 + sli v8.4s, v7.4s, #7 + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v5.4s, #25 + sli v6.4s, v5.4s, #7 + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + + ext v9.16b, v9.16b, v9.16b, #4 + ext v14.16b, v14.16b, v14.16b, #8 + ext v19.16b, v19.16b, v19.16b, #12 + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value 
of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + add v0.4s, v0.4s, v6.4s + add v1.4s, v1.4s, v7.4s + add v2.4s, v2.4s, v8.4s + add v3.4s, v3.4s, v5.4s + add v4.4s, v4.4s, v9.4s + + eor v18.16b, v18.16b, v0.16b + eor v15.16b, v15.16b, v1.16b + eor v16.16b, v16.16b, v2.16b + eor v17.16b, v17.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + rev32 v18.8h, v18.8h + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + rev32 v19.8h, v19.8h + + add v12.4s, v12.4s, v18.4s + add v13.4s, v13.4s, v15.4s + add v10.4s, v10.4s, v16.4s + add v11.4s, v11.4s, v17.4s + add v14.4s, v14.4s, v19.4s + + eor v6.16b, v6.16b, v12.16b + eor v7.16b, v7.16b, v13.16b + eor v8.16b, v8.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v9.16b, v9.16b, v14.16b + + ushr v20.4s, v6.4s, #20 + sli v20.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + ushr v7.4s, v8.4s, #20 + sli v7.4s, v8.4s, #12 + ushr v8.4s, v5.4s, #20 + sli v8.4s, v5.4s, #12 + ushr v5.4s, v9.4s, #20 + sli v5.4s, v9.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + add v3.4s, v3.4s, v8.4s + add v4.4s, v4.4s, v5.4s + + eor v18.16b, v18.16b, v0.16b + eor v15.16b, v15.16b, v1.16b + eor v16.16b, v16.16b, v2.16b + eor v17.16b, v17.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + tbl v18.16b, {v18.16b}, v26.16b + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + tbl v19.16b, {v19.16b}, v26.16b + + add v12.4s, v12.4s, v18.4s + add v13.4s, v13.4s, v15.4s + add v10.4s, v10.4s, v16.4s + add v11.4s, v11.4s, v17.4s + add v14.4s, v14.4s, v19.4s + + eor v20.16b, v20.16b, v12.16b + eor v6.16b, v6.16b, v13.16b + eor v7.16b, v7.16b, v10.16b + eor v8.16b, v8.16b, v11.16b + eor v5.16b, v5.16b, v14.16b + + ushr v9.4s, v5.4s, #25 + sli v9.4s, v5.4s, #7 + ushr v5.4s, v8.4s, #25 + sli v5.4s, v8.4s, #7 + ushr v8.4s, v7.4s, #25 + sli v8.4s, v7.4s, #7 + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v20.4s, #25 + sli v6.4s, v20.4s, #7 + + ext v9.16b, v9.16b, v9.16b, #12 + ext v14.16b, v14.16b, v14.16b, #8 + ext v19.16b, v19.16b, v19.16b, #4 + subs x6, x6, #1 + b.ge Lseal_main_loop_rounds + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + subs x7, x7, #1 + b.gt Lseal_main_loop_rounds + + eor v20.16b, v20.16b, v20.16b //zero + not v21.16b, v20.16b // -1 + sub v21.4s, v25.4s, v21.4s // Add +1 + ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter) + add v19.4s, v19.4s, v20.4s + + add v15.4s, v15.4s, v25.4s + mov x11, #5 + dup v20.4s, w11 + add 
v25.4s, v25.4s, v20.4s + + zip1 v20.4s, v0.4s, v1.4s + zip2 v21.4s, v0.4s, v1.4s + zip1 v22.4s, v2.4s, v3.4s + zip2 v23.4s, v2.4s, v3.4s + + zip1 v0.2d, v20.2d, v22.2d + zip2 v1.2d, v20.2d, v22.2d + zip1 v2.2d, v21.2d, v23.2d + zip2 v3.2d, v21.2d, v23.2d + + zip1 v20.4s, v5.4s, v6.4s + zip2 v21.4s, v5.4s, v6.4s + zip1 v22.4s, v7.4s, v8.4s + zip2 v23.4s, v7.4s, v8.4s + + zip1 v5.2d, v20.2d, v22.2d + zip2 v6.2d, v20.2d, v22.2d + zip1 v7.2d, v21.2d, v23.2d + zip2 v8.2d, v21.2d, v23.2d + + zip1 v20.4s, v10.4s, v11.4s + zip2 v21.4s, v10.4s, v11.4s + zip1 v22.4s, v12.4s, v13.4s + zip2 v23.4s, v12.4s, v13.4s + + zip1 v10.2d, v20.2d, v22.2d + zip2 v11.2d, v20.2d, v22.2d + zip1 v12.2d, v21.2d, v23.2d + zip2 v13.2d, v21.2d, v23.2d + + zip1 v20.4s, v15.4s, v16.4s + zip2 v21.4s, v15.4s, v16.4s + zip1 v22.4s, v17.4s, v18.4s + zip2 v23.4s, v17.4s, v18.4s + + zip1 v15.2d, v20.2d, v22.2d + zip2 v16.2d, v20.2d, v22.2d + zip1 v17.2d, v21.2d, v23.2d + zip2 v18.2d, v21.2d, v23.2d + + add v0.4s, v0.4s, v24.4s + add v5.4s, v5.4s, v28.4s + add v10.4s, v10.4s, v29.4s + add v15.4s, v15.4s, v30.4s + + add v1.4s, v1.4s, v24.4s + add v6.4s, v6.4s, v28.4s + add v11.4s, v11.4s, v29.4s + add v16.4s, v16.4s, v30.4s + + add v2.4s, v2.4s, v24.4s + add v7.4s, v7.4s, v28.4s + add v12.4s, v12.4s, v29.4s + add v17.4s, v17.4s, v30.4s + + add v3.4s, v3.4s, v24.4s + add v8.4s, v8.4s, v28.4s + add v13.4s, v13.4s, v29.4s + add v18.4s, v18.4s, v30.4s + + add v4.4s, v4.4s, v24.4s + add v9.4s, v9.4s, v28.4s + add v14.4s, v14.4s, v29.4s + add v19.4s, v19.4s, v30.4s + + cmp x2, #320 + b.le Lseal_tail + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v0.16b + eor v21.16b, v21.16b, v5.16b + eor v22.16b, v22.16b, v10.16b + eor v23.16b, v23.16b, v15.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v1.16b + eor v21.16b, v21.16b, v6.16b + eor v22.16b, v22.16b, v11.16b + eor v23.16b, v23.16b, v16.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v2.16b + eor v21.16b, v21.16b, v7.16b + eor v22.16b, v22.16b, v12.16b + eor v23.16b, v23.16b, v17.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v3.16b + eor v21.16b, v21.16b, v8.16b + eor v22.16b, v22.16b, v13.16b + eor v23.16b, v23.16b, v18.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v4.16b + eor v21.16b, v21.16b, v9.16b + eor v22.16b, v22.16b, v14.16b + eor v23.16b, v23.16b, v19.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + sub x2, x2, #320 + + mov x6, #0 + mov x7, #10 // For the remainder of the loop we always hash and encrypt 320 bytes per iteration + + b Lseal_main_loop + +Lseal_tail: + // This part of the function handles the storage and authentication of the last [0,320) bytes + // We assume A0-A4 ... D0-D4 hold at least inl (320 max) bytes of the stream data. 
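+ // The tail drains in three stages: whole 64-byte chunks here, whole 16-byte
+ // blocks in Lseal_tail_64, and a final partial block in Lseal_tail_16 that
+ // is padded out with bytes taken from extra_in. Any extra_in left over is
+ // absorbed in Lseal_hash_extra before the lengths are hashed and the tag is
+ // written in Lseal_finalize.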
+ cmp x2, #64 + b.lt Lseal_tail_64 + + // Store and authenticate 64B blocks per iteration + ld1 {v20.16b - v23.16b}, [x1], #64 + + eor v20.16b, v20.16b, v0.16b + eor v21.16b, v21.16b, v5.16b + eor v22.16b, v22.16b, v10.16b + eor v23.16b, v23.16b, v15.16b + mov x11, v20.d[0] + mov x12, v20.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + mov x11, v21.d[0] + mov x12, v21.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + mov x11, v22.d[0] + mov x12, v22.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + mov x11, v23.d[0] + mov x12, v23.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits 
and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + st1 {v20.16b - v23.16b}, [x0], #64 + sub x2, x2, #64 + + // Shift the state left by 64 bytes for the next iteration of the loop + mov v0.16b, v1.16b + mov v5.16b, v6.16b + mov v10.16b, v11.16b + mov v15.16b, v16.16b + + mov v1.16b, v2.16b + mov v6.16b, v7.16b + mov v11.16b, v12.16b + mov v16.16b, v17.16b + + mov v2.16b, v3.16b + mov v7.16b, v8.16b + mov v12.16b, v13.16b + mov v17.16b, v18.16b + + mov v3.16b, v4.16b + mov v8.16b, v9.16b + mov v13.16b, v14.16b + mov v18.16b, v19.16b + + b Lseal_tail + +Lseal_tail_64: + ldp x3, x4, [x5, #48] // extra_in_len and extra_in_ptr + + // Here we handle the last [0,64) bytes of plaintext + cmp x2, #16 + b.lt Lseal_tail_16 + // Each iteration encrypt and authenticate a 16B block + ld1 {v20.16b}, [x1], #16 + eor v20.16b, v20.16b, v0.16b + mov x11, v20.d[0] + mov x12, v20.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + st1 {v20.16b}, [x0], #16 + + sub x2, x2, #16 + + // Shift the state left by 16 bytes for the next iteration of the loop + mov v0.16b, v5.16b + mov v5.16b, v10.16b + mov v10.16b, v15.16b + + b Lseal_tail_64 + +Lseal_tail_16: + // Here we handle the last [0,16) bytes of ciphertext that require a padded block + cbz x2, Lseal_hash_extra + + eor v20.16b, v20.16b, v20.16b // Use T0 to load the plaintext/extra in + eor v21.16b, v21.16b, v21.16b // Use T1 to generate an AND mask that will only mask the ciphertext bytes + not v22.16b, v20.16b + + mov x6, x2 + add x1, x1, x2 + + cbz x4, Lseal_tail_16_compose // No extra data to pad with, zero padding + + mov x7, #16 // We need to load some extra_in first for padding + sub x7, x7, x2 + cmp x4, x7 + csel x7, x4, x7, lt // Load the minimum of extra_in_len and the amount needed to fill the register + mov x12, x7 + add x3, x3, x7 + sub x4, x4, x7 + +Lseal_tail16_compose_extra_in: + ext v20.16b, v20.16b, v20.16b, #15 + ldrb w11, [x3, #-1]! + mov v20.b[0], w11 + subs x7, x7, #1 + b.gt Lseal_tail16_compose_extra_in + + add x3, x3, x12 + +Lseal_tail_16_compose: + ext v20.16b, v20.16b, v20.16b, #15 + ldrb w11, [x1, #-1]! 
+ mov v20.b[0], w11 + ext v21.16b, v22.16b, v21.16b, #15 + subs x2, x2, #1 + b.gt Lseal_tail_16_compose + + and v0.16b, v0.16b, v21.16b + eor v20.16b, v20.16b, v0.16b + mov v21.16b, v20.16b + +Lseal_tail_16_store: + umov w11, v20.b[0] + strb w11, [x0], #1 + ext v20.16b, v20.16b, v20.16b, #1 + subs x6, x6, #1 + b.gt Lseal_tail_16_store + + // Hash in the final ct block concatenated with extra_in + mov x11, v21.d[0] + mov x12, v21.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + +Lseal_hash_extra: + cbz x4, Lseal_finalize + +Lseal_hash_extra_loop: + cmp x4, #16 + b.lt Lseal_hash_extra_tail + ld1 {v20.16b}, [x3], #16 + mov x11, v20.d[0] + mov x12, v20.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + sub x4, x4, #16 + b Lseal_hash_extra_loop + +Lseal_hash_extra_tail: + cbz x4, Lseal_finalize + eor v20.16b, v20.16b, v20.16b // Use T0 to load the remaining extra ciphertext + add x3, x3, x4 + +Lseal_hash_extra_load: + ext v20.16b, v20.16b, v20.16b, #15 + ldrb w11, [x3, #-1]! 
+ mov v20.b[0], w11 + subs x4, x4, #1 + b.gt Lseal_hash_extra_load + + // Hash in the final padded extra_in blcok + mov x11, v20.d[0] + mov x12, v20.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + +Lseal_finalize: + mov x11, v31.d[0] + mov x12, v31.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + // Final reduction step + sub x12, xzr, x15 + orr x13, xzr, #3 + subs x11, x8, #-5 + sbcs x12, x9, x12 + sbcs x13, x10, x13 + csel x8, x11, x8, cs + csel x9, x12, x9, cs + csel x10, x13, x10, cs + mov x11, v27.d[0] + mov x12, v27.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + + stp x8, x9, [x5] + + ldp d8, d9, [sp, #16] + ldp d10, d11, [sp, #32] + ldp d12, d13, [sp, #48] + ldp d14, d15, [sp, #64] +.cfi_restore b15 +.cfi_restore b14 +.cfi_restore b13 +.cfi_restore b12 +.cfi_restore b11 +.cfi_restore b10 +.cfi_restore b9 +.cfi_restore b8 + ldp x29, x30, [sp], 80 +.cfi_restore w29 +.cfi_restore w30 +.cfi_def_cfa_offset 0 + AARCH64_VALIDATE_LINK_REGISTER + ret + +Lseal_128: + // On some architectures preparing 5 blocks for small buffers is wasteful + eor v25.16b, v25.16b, v25.16b + mov x11, #1 + mov v25.s[0], w11 + mov v0.16b, v24.16b + mov v1.16b, v24.16b + mov v2.16b, v24.16b + mov v5.16b, v28.16b + mov v6.16b, v28.16b + mov v7.16b, v28.16b + mov v10.16b, v29.16b + mov v11.16b, v29.16b + mov v12.16b, v29.16b + mov v17.16b, v30.16b + add v15.4s, v17.4s, v25.4s + add v16.4s, v15.4s, v25.4s + + mov x6, #10 + +Lseal_128_rounds: + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v5.16b, v5.16b, v10.16b + eor v6.16b, v6.16b, v11.16b + eor v7.16b, v7.16b, v12.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + ushr v5.4s, v6.4s, #20 + sli v5.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli 
v6.4s, v7.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v20.16b, v20.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v6.16b, v6.16b, v12.16b + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v5.4s, #25 + sli v6.4s, v5.4s, #7 + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + + ext v5.16b, v5.16b, v5.16b, #4 + ext v6.16b, v6.16b, v6.16b, #4 + ext v7.16b, v7.16b, v7.16b, #4 + + ext v10.16b, v10.16b, v10.16b, #8 + ext v11.16b, v11.16b, v11.16b, #8 + ext v12.16b, v12.16b, v12.16b, #8 + + ext v15.16b, v15.16b, v15.16b, #12 + ext v16.16b, v16.16b, v16.16b, #12 + ext v17.16b, v17.16b, v17.16b, #12 + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v5.16b, v5.16b, v10.16b + eor v6.16b, v6.16b, v11.16b + eor v7.16b, v7.16b, v12.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + ushr v5.4s, v6.4s, #20 + sli v5.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v20.16b, v20.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v6.16b, v6.16b, v12.16b + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v5.4s, #25 + sli v6.4s, v5.4s, #7 + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + + ext v5.16b, v5.16b, v5.16b, #12 + ext v6.16b, v6.16b, v6.16b, #12 + ext v7.16b, v7.16b, v7.16b, #12 + + ext v10.16b, v10.16b, v10.16b, #8 + ext v11.16b, v11.16b, v11.16b, #8 + ext v12.16b, v12.16b, v12.16b, #8 + + ext v15.16b, v15.16b, v15.16b, #4 + ext v16.16b, v16.16b, v16.16b, #4 + ext v17.16b, v17.16b, v17.16b, #4 + subs x6, x6, #1 + b.hi Lseal_128_rounds + + add v0.4s, v0.4s, v24.4s + add v1.4s, v1.4s, v24.4s + add v2.4s, v2.4s, v24.4s + + add v5.4s, v5.4s, v28.4s + add v6.4s, v6.4s, v28.4s + add v7.4s, v7.4s, v28.4s + + // Only the first 32 bytes of the third block (counter = 0) are needed, + // so skip updating v12 and v17. + add v10.4s, v10.4s, v29.4s + add v11.4s, v11.4s, v29.4s + + add v30.4s, v30.4s, v25.4s + add v15.4s, v15.4s, v30.4s + add v30.4s, v30.4s, v25.4s + add v16.4s, v16.4s, v30.4s + + and v2.16b, v2.16b, v27.16b + mov x16, v2.d[0] // Move the R key to GPRs + mov x17, v2.d[1] + mov v27.16b, v7.16b // Store the S key + + bl Lpoly_hash_ad_internal + b Lseal_tail +.cfi_endproc + + +///////////////////////////////// +// +// void chacha20_poly1305_open(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, union open_data *aead_data); +// +.globl chacha20_poly1305_open + +.def chacha20_poly1305_open + .type 32 +.endef +.align 6 +chacha20_poly1305_open: + AARCH64_SIGN_LINK_REGISTER +.cfi_startproc + stp x29, x30, [sp, #-80]! 
+.cfi_def_cfa_offset 80 +.cfi_offset w30, -72 +.cfi_offset w29, -80 + mov x29, sp + // We probably could do .cfi_def_cfa w29, 80 at this point, but since + // we don't actually use the frame pointer like that, it's probably not + // worth bothering. + stp d8, d9, [sp, #16] + stp d10, d11, [sp, #32] + stp d12, d13, [sp, #48] + stp d14, d15, [sp, #64] +.cfi_offset b15, -8 +.cfi_offset b14, -16 +.cfi_offset b13, -24 +.cfi_offset b12, -32 +.cfi_offset b11, -40 +.cfi_offset b10, -48 +.cfi_offset b9, -56 +.cfi_offset b8, -64 + + adrp x11, Lchacha20_consts + add x11, x11, :lo12:Lchacha20_consts + + ld1 {v24.16b - v27.16b}, [x11] // Load the CONSTS, INC, ROL8 and CLAMP values + ld1 {v28.16b - v30.16b}, [x5] + + mov x15, #1 // Prepare the Poly1305 state + mov x8, #0 + mov x9, #0 + mov x10, #0 + + mov v31.d[0], x4 // Store the input and aad lengths + mov v31.d[1], x2 + + cmp x2, #128 + b.le Lopen_128 // Optimization for smaller buffers + + // Initially we prepare a single ChaCha20 block for the Poly1305 R and S keys + mov v0.16b, v24.16b + mov v5.16b, v28.16b + mov v10.16b, v29.16b + mov v15.16b, v30.16b + + mov x6, #10 + +.align 5 +Lopen_init_rounds: + add v0.4s, v0.4s, v5.4s + eor v15.16b, v15.16b, v0.16b + rev32 v15.8h, v15.8h + + add v10.4s, v10.4s, v15.4s + eor v5.16b, v5.16b, v10.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + add v0.4s, v0.4s, v20.4s + eor v15.16b, v15.16b, v0.16b + tbl v15.16b, {v15.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + eor v20.16b, v20.16b, v10.16b + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + ext v5.16b, v5.16b, v5.16b, #4 + ext v10.16b, v10.16b, v10.16b, #8 + ext v15.16b, v15.16b, v15.16b, #12 + add v0.4s, v0.4s, v5.4s + eor v15.16b, v15.16b, v0.16b + rev32 v15.8h, v15.8h + + add v10.4s, v10.4s, v15.4s + eor v5.16b, v5.16b, v10.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + add v0.4s, v0.4s, v20.4s + eor v15.16b, v15.16b, v0.16b + tbl v15.16b, {v15.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + eor v20.16b, v20.16b, v10.16b + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + ext v5.16b, v5.16b, v5.16b, #12 + ext v10.16b, v10.16b, v10.16b, #8 + ext v15.16b, v15.16b, v15.16b, #4 + subs x6, x6, #1 + b.hi Lopen_init_rounds + + add v0.4s, v0.4s, v24.4s + add v5.4s, v5.4s, v28.4s + + and v0.16b, v0.16b, v27.16b + mov x16, v0.d[0] // Move the R key to GPRs + mov x17, v0.d[1] + mov v27.16b, v5.16b // Store the S key + + bl Lpoly_hash_ad_internal + +Lopen_ad_done: + mov x3, x1 + +// Each iteration of the loop hash 320 bytes, and prepare stream for 320 bytes +Lopen_main_loop: + + cmp x2, #192 + b.lt Lopen_tail + + adrp x11, Lchacha20_consts + add x11, x11, :lo12:Lchacha20_consts + + ld4r {v0.4s,v1.4s,v2.4s,v3.4s}, [x11] + mov v4.16b, v24.16b + + ld4r {v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16 + mov v9.16b, v28.16b + + ld4r {v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16 + mov v14.16b, v29.16b + + ld4r {v15.4s,v16.4s,v17.4s,v18.4s}, [x5] + sub x5, x5, #32 + add v15.4s, v15.4s, v25.4s + mov v19.16b, v30.16b + + eor v20.16b, v20.16b, v20.16b //zero + not v21.16b, v20.16b // -1 + sub v21.4s, v25.4s, v21.4s // Add +1 + ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter) + add v19.4s, v19.4s, v20.4s + + lsr x4, x2, #4 // How many whole blocks we have to hash, will always be at least 12 + sub x4, x4, #10 + + mov x7, #10 + subs x6, x7, x4 + subs x6, x7, x4 // itr1 can be negative if we have more than 320 bytes to hash + csel x7, x7, x4, le // if itr1 is zero or less, itr2 should be 10 to indicate all 10 rounds are full + + cbz x7, 
Lopen_main_loop_rounds_short + +.align 5 +Lopen_main_loop_rounds: + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most +Lopen_main_loop_rounds_short: + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + add v3.4s, v3.4s, v8.4s + add v4.4s, v4.4s, v9.4s + + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + eor v18.16b, v18.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + rev32 v18.8h, v18.8h + rev32 v19.8h, v19.8h + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + add v13.4s, v13.4s, v18.4s + add v14.4s, v14.4s, v19.4s + + eor v5.16b, v5.16b, v10.16b + eor v6.16b, v6.16b, v11.16b + eor v7.16b, v7.16b, v12.16b + eor v8.16b, v8.16b, v13.16b + eor v9.16b, v9.16b, v14.16b + + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + ushr v5.4s, v6.4s, #20 + sli v5.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + ushr v7.4s, v8.4s, #20 + sli v7.4s, v8.4s, #12 + ushr v8.4s, v9.4s, #20 + sli v8.4s, v9.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + add v3.4s, v3.4s, v7.4s + add v4.4s, v4.4s, v8.4s + + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + eor v18.16b, v18.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + tbl v18.16b, {v18.16b}, v26.16b + tbl v19.16b, {v19.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + add v13.4s, v13.4s, v18.4s + add v14.4s, v14.4s, v19.4s + + eor v20.16b, v20.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v6.16b, v6.16b, v12.16b + eor v7.16b, v7.16b, v13.16b + eor v8.16b, v8.16b, v14.16b + + ushr v9.4s, v8.4s, #25 + sli v9.4s, v8.4s, #7 + ushr v8.4s, v7.4s, #25 + sli v8.4s, v7.4s, #7 + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v5.4s, #25 + sli v6.4s, v5.4s, #7 + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + + ext v9.16b, v9.16b, v9.16b, #4 + ext v14.16b, v14.16b, v14.16b, #8 + ext v19.16b, v19.16b, v19.16b, #12 + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 
is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + add v0.4s, v0.4s, v6.4s + add v1.4s, v1.4s, v7.4s + add v2.4s, v2.4s, v8.4s + add v3.4s, v3.4s, v5.4s + add v4.4s, v4.4s, v9.4s + + eor v18.16b, v18.16b, v0.16b + eor v15.16b, v15.16b, v1.16b + eor v16.16b, v16.16b, v2.16b + eor v17.16b, v17.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + rev32 v18.8h, v18.8h + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + rev32 v19.8h, v19.8h + + add v12.4s, v12.4s, v18.4s + add v13.4s, v13.4s, v15.4s + add v10.4s, v10.4s, v16.4s + add v11.4s, v11.4s, v17.4s + add v14.4s, v14.4s, v19.4s + + eor v6.16b, v6.16b, v12.16b + eor v7.16b, v7.16b, v13.16b + eor v8.16b, v8.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v9.16b, v9.16b, v14.16b + + ushr v20.4s, v6.4s, #20 + sli v20.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + ushr v7.4s, v8.4s, #20 + sli v7.4s, v8.4s, #12 + ushr v8.4s, v5.4s, #20 + sli v8.4s, v5.4s, #12 + ushr v5.4s, v9.4s, #20 + sli v5.4s, v9.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + add v3.4s, v3.4s, v8.4s + add v4.4s, v4.4s, v5.4s + + eor v18.16b, v18.16b, v0.16b + eor v15.16b, v15.16b, v1.16b + eor v16.16b, v16.16b, v2.16b + eor v17.16b, v17.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + tbl v18.16b, {v18.16b}, v26.16b + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + tbl v19.16b, {v19.16b}, v26.16b + + add v12.4s, v12.4s, v18.4s + add v13.4s, v13.4s, v15.4s + add v10.4s, v10.4s, v16.4s + add v11.4s, v11.4s, v17.4s + add v14.4s, v14.4s, v19.4s + + eor v20.16b, v20.16b, v12.16b + eor v6.16b, v6.16b, v13.16b + eor v7.16b, v7.16b, v10.16b + eor v8.16b, v8.16b, v11.16b + eor v5.16b, v5.16b, v14.16b + + ushr v9.4s, v5.4s, #25 + sli v9.4s, v5.4s, #7 + ushr v5.4s, v8.4s, #25 + sli v5.4s, v8.4s, #7 + ushr v8.4s, v7.4s, #25 + sli v8.4s, v7.4s, #7 + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v20.4s, #25 + sli v6.4s, v20.4s, #7 + + ext v9.16b, v9.16b, v9.16b, #12 + ext v14.16b, v14.16b, v14.16b, #8 + ext v19.16b, v19.16b, v19.16b, #4 + subs x7, x7, #1 + b.gt Lopen_main_loop_rounds + subs x6, x6, #1 + b.ge Lopen_main_loop_rounds_short + + eor v20.16b, v20.16b, v20.16b //zero + not v21.16b, v20.16b // -1 + sub v21.4s, v25.4s, v21.4s // Add +1 + ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter) + add v19.4s, v19.4s, v20.4s + + add v15.4s, v15.4s, v25.4s + mov x11, #5 + dup v20.4s, w11 + add v25.4s, v25.4s, v20.4s + + zip1 v20.4s, v0.4s, v1.4s + zip2 v21.4s, v0.4s, v1.4s + zip1 v22.4s, v2.4s, v3.4s + zip2 v23.4s, v2.4s, v3.4s + + zip1 v0.2d, v20.2d, v22.2d + zip2 v1.2d, v20.2d, v22.2d + zip1 v2.2d, v21.2d, v23.2d + zip2 v3.2d, v21.2d, v23.2d + + zip1 v20.4s, v5.4s, v6.4s + zip2 v21.4s, v5.4s, v6.4s + zip1 v22.4s, v7.4s, v8.4s + zip2 v23.4s, v7.4s, v8.4s + + zip1 v5.2d, v20.2d, v22.2d + zip2 v6.2d, v20.2d, v22.2d + zip1 v7.2d, v21.2d, v23.2d + zip2 v8.2d, v21.2d, v23.2d + + zip1 v20.4s, v10.4s, v11.4s + zip2 v21.4s, v10.4s, v11.4s + zip1 v22.4s, v12.4s, v13.4s + zip2 v23.4s, v12.4s, v13.4s + + zip1 v10.2d, v20.2d, v22.2d + zip2 v11.2d, v20.2d, v22.2d + zip1 v12.2d, v21.2d, v23.2d + zip2 v13.2d, v21.2d, v23.2d + + zip1 v20.4s, v15.4s, v16.4s + zip2 v21.4s, v15.4s, v16.4s + zip1 v22.4s, 
v17.4s, v18.4s + zip2 v23.4s, v17.4s, v18.4s + + zip1 v15.2d, v20.2d, v22.2d + zip2 v16.2d, v20.2d, v22.2d + zip1 v17.2d, v21.2d, v23.2d + zip2 v18.2d, v21.2d, v23.2d + + add v0.4s, v0.4s, v24.4s + add v5.4s, v5.4s, v28.4s + add v10.4s, v10.4s, v29.4s + add v15.4s, v15.4s, v30.4s + + add v1.4s, v1.4s, v24.4s + add v6.4s, v6.4s, v28.4s + add v11.4s, v11.4s, v29.4s + add v16.4s, v16.4s, v30.4s + + add v2.4s, v2.4s, v24.4s + add v7.4s, v7.4s, v28.4s + add v12.4s, v12.4s, v29.4s + add v17.4s, v17.4s, v30.4s + + add v3.4s, v3.4s, v24.4s + add v8.4s, v8.4s, v28.4s + add v13.4s, v13.4s, v29.4s + add v18.4s, v18.4s, v30.4s + + add v4.4s, v4.4s, v24.4s + add v9.4s, v9.4s, v28.4s + add v14.4s, v14.4s, v29.4s + add v19.4s, v19.4s, v30.4s + + // We can always safely store 192 bytes + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v0.16b + eor v21.16b, v21.16b, v5.16b + eor v22.16b, v22.16b, v10.16b + eor v23.16b, v23.16b, v15.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v1.16b + eor v21.16b, v21.16b, v6.16b + eor v22.16b, v22.16b, v11.16b + eor v23.16b, v23.16b, v16.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v2.16b + eor v21.16b, v21.16b, v7.16b + eor v22.16b, v22.16b, v12.16b + eor v23.16b, v23.16b, v17.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + sub x2, x2, #192 + + mov v0.16b, v3.16b + mov v5.16b, v8.16b + mov v10.16b, v13.16b + mov v15.16b, v18.16b + + cmp x2, #64 + b.lt Lopen_tail_64_store + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v3.16b + eor v21.16b, v21.16b, v8.16b + eor v22.16b, v22.16b, v13.16b + eor v23.16b, v23.16b, v18.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + sub x2, x2, #64 + + mov v0.16b, v4.16b + mov v5.16b, v9.16b + mov v10.16b, v14.16b + mov v15.16b, v19.16b + + cmp x2, #64 + b.lt Lopen_tail_64_store + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v4.16b + eor v21.16b, v21.16b, v9.16b + eor v22.16b, v22.16b, v14.16b + eor v23.16b, v23.16b, v19.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + sub x2, x2, #64 + b Lopen_main_loop + +Lopen_tail: + + cbz x2, Lopen_finalize + + lsr x4, x2, #4 // How many whole blocks we have to hash + + cmp x2, #64 + b.le Lopen_tail_64 + cmp x2, #128 + b.le Lopen_tail_128 + +Lopen_tail_192: + // We need three more blocks + mov v0.16b, v24.16b + mov v1.16b, v24.16b + mov v2.16b, v24.16b + mov v5.16b, v28.16b + mov v6.16b, v28.16b + mov v7.16b, v28.16b + mov v10.16b, v29.16b + mov v11.16b, v29.16b + mov v12.16b, v29.16b + mov v15.16b, v30.16b + mov v16.16b, v30.16b + mov v17.16b, v30.16b + eor v23.16b, v23.16b, v23.16b + eor v21.16b, v21.16b, v21.16b + ins v23.s[0], v25.s[0] + ins v21.d[0], x15 + + add v22.4s, v23.4s, v21.4s + add v21.4s, v22.4s, v21.4s + + add v15.4s, v15.4s, v21.4s + add v16.4s, v16.4s, v23.4s + add v17.4s, v17.4s, v22.4s + + mov x7, #10 + subs x6, x7, x4 // itr1 can be negative if we have more than 160 bytes to hash + csel x7, x7, x4, le // if itr1 is zero or less, itr2 should be 10 to indicate all 10 rounds are hashing + sub x4, x4, x7 + + cbz x7, Lopen_tail_192_rounds_no_hash + +Lopen_tail_192_rounds: + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul 
x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most +Lopen_tail_192_rounds_no_hash: + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v5.16b, v5.16b, v10.16b + eor v6.16b, v6.16b, v11.16b + eor v7.16b, v7.16b, v12.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + ushr v5.4s, v6.4s, #20 + sli v5.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v20.16b, v20.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v6.16b, v6.16b, v12.16b + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v5.4s, #25 + sli v6.4s, v5.4s, #7 + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + + ext v5.16b, v5.16b, v5.16b, #4 + ext v6.16b, v6.16b, v6.16b, #4 + ext v7.16b, v7.16b, v7.16b, #4 + + ext v10.16b, v10.16b, v10.16b, #8 + ext v11.16b, v11.16b, v11.16b, #8 + ext v12.16b, v12.16b, v12.16b, #8 + + ext v15.16b, v15.16b, v15.16b, #12 + ext v16.16b, v16.16b, v16.16b, #12 + ext v17.16b, v17.16b, v17.16b, #12 + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v5.16b, v5.16b, v10.16b + eor v6.16b, v6.16b, v11.16b + eor v7.16b, v7.16b, v12.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + ushr v5.4s, v6.4s, #20 + sli v5.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v20.16b, v20.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v6.16b, v6.16b, v12.16b + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v5.4s, #25 + sli v6.4s, v5.4s, #7 + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + + ext v5.16b, v5.16b, v5.16b, #12 + ext v6.16b, v6.16b, v6.16b, #12 + ext v7.16b, v7.16b, v7.16b, #12 + + ext v10.16b, v10.16b, v10.16b, #8 + ext v11.16b, v11.16b, v11.16b, #8 + ext v12.16b, v12.16b, v12.16b, #8 + + ext v15.16b, v15.16b, v15.16b, #4 + ext v16.16b, v16.16b, v16.16b, #4 + ext v17.16b, v17.16b, v17.16b, #4 + subs x7, x7, #1 + b.gt Lopen_tail_192_rounds + subs x6, x6, #1 
+ b.ge Lopen_tail_192_rounds_no_hash + + // We hashed 160 bytes at most, may still have 32 bytes left +Lopen_tail_192_hash: + cbz x4, Lopen_tail_192_hash_done + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + sub x4, x4, #1 + b Lopen_tail_192_hash + +Lopen_tail_192_hash_done: + + add v0.4s, v0.4s, v24.4s + add v1.4s, v1.4s, v24.4s + add v2.4s, v2.4s, v24.4s + add v5.4s, v5.4s, v28.4s + add v6.4s, v6.4s, v28.4s + add v7.4s, v7.4s, v28.4s + add v10.4s, v10.4s, v29.4s + add v11.4s, v11.4s, v29.4s + add v12.4s, v12.4s, v29.4s + add v15.4s, v15.4s, v30.4s + add v16.4s, v16.4s, v30.4s + add v17.4s, v17.4s, v30.4s + + add v15.4s, v15.4s, v21.4s + add v16.4s, v16.4s, v23.4s + add v17.4s, v17.4s, v22.4s + + ld1 {v20.16b - v23.16b}, [x1], #64 + + eor v20.16b, v20.16b, v1.16b + eor v21.16b, v21.16b, v6.16b + eor v22.16b, v22.16b, v11.16b + eor v23.16b, v23.16b, v16.16b + + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + + eor v20.16b, v20.16b, v2.16b + eor v21.16b, v21.16b, v7.16b + eor v22.16b, v22.16b, v12.16b + eor v23.16b, v23.16b, v17.16b + + st1 {v20.16b - v23.16b}, [x0], #64 + + sub x2, x2, #128 + b Lopen_tail_64_store + +Lopen_tail_128: + // We need two more blocks + mov v0.16b, v24.16b + mov v1.16b, v24.16b + mov v5.16b, v28.16b + mov v6.16b, v28.16b + mov v10.16b, v29.16b + mov v11.16b, v29.16b + mov v15.16b, v30.16b + mov v16.16b, v30.16b + eor v23.16b, v23.16b, v23.16b + eor v22.16b, v22.16b, v22.16b + ins v23.s[0], v25.s[0] + ins v22.d[0], x15 + add v22.4s, v22.4s, v23.4s + + add v15.4s, v15.4s, v22.4s + add v16.4s, v16.4s, v23.4s + + mov x6, #10 + sub x6, x6, x4 + +Lopen_tail_128_rounds: + add v0.4s, v0.4s, v5.4s + eor v15.16b, v15.16b, v0.16b + rev32 v15.8h, v15.8h + + add v10.4s, v10.4s, v15.4s + eor v5.16b, v5.16b, v10.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + add v0.4s, v0.4s, v20.4s + eor v15.16b, v15.16b, v0.16b + tbl v15.16b, {v15.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + eor v20.16b, v20.16b, v10.16b + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + ext v5.16b, v5.16b, v5.16b, #4 + ext v10.16b, v10.16b, v10.16b, #8 + ext v15.16b, v15.16b, v15.16b, #12 + add v1.4s, v1.4s, v6.4s + eor v16.16b, v16.16b, v1.16b + rev32 v16.8h, v16.8h + + add v11.4s, v11.4s, v16.4s + eor v6.16b, v6.16b, v11.16b + ushr v20.4s, v6.4s, #20 + sli v20.4s, v6.4s, #12 + add v1.4s, v1.4s, v20.4s + eor v16.16b, v16.16b, v1.16b + tbl v16.16b, {v16.16b}, v26.16b + + add v11.4s, v11.4s, v16.4s + eor v20.16b, v20.16b, v11.16b + ushr v6.4s, v20.4s, #25 + sli v6.4s, v20.4s, #7 + ext v6.16b, v6.16b, v6.16b, #4 + ext v11.16b, v11.16b, v11.16b, #8 + ext v16.16b, v16.16b, v16.16b, #12 + add v0.4s, v0.4s, v5.4s + eor v15.16b, v15.16b, v0.16b + rev32 v15.8h, v15.8h + + add v10.4s, v10.4s, 
v15.4s + eor v5.16b, v5.16b, v10.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + add v0.4s, v0.4s, v20.4s + eor v15.16b, v15.16b, v0.16b + tbl v15.16b, {v15.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + eor v20.16b, v20.16b, v10.16b + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + ext v5.16b, v5.16b, v5.16b, #12 + ext v10.16b, v10.16b, v10.16b, #8 + ext v15.16b, v15.16b, v15.16b, #4 + add v1.4s, v1.4s, v6.4s + eor v16.16b, v16.16b, v1.16b + rev32 v16.8h, v16.8h + + add v11.4s, v11.4s, v16.4s + eor v6.16b, v6.16b, v11.16b + ushr v20.4s, v6.4s, #20 + sli v20.4s, v6.4s, #12 + add v1.4s, v1.4s, v20.4s + eor v16.16b, v16.16b, v1.16b + tbl v16.16b, {v16.16b}, v26.16b + + add v11.4s, v11.4s, v16.4s + eor v20.16b, v20.16b, v11.16b + ushr v6.4s, v20.4s, #25 + sli v6.4s, v20.4s, #7 + ext v6.16b, v6.16b, v6.16b, #12 + ext v11.16b, v11.16b, v11.16b, #8 + ext v16.16b, v16.16b, v16.16b, #4 + subs x6, x6, #1 + b.gt Lopen_tail_128_rounds + cbz x4, Lopen_tail_128_rounds_done + subs x4, x4, #1 + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + b Lopen_tail_128_rounds + +Lopen_tail_128_rounds_done: + add v0.4s, v0.4s, v24.4s + add v1.4s, v1.4s, v24.4s + add v5.4s, v5.4s, v28.4s + add v6.4s, v6.4s, v28.4s + add v10.4s, v10.4s, v29.4s + add v11.4s, v11.4s, v29.4s + add v15.4s, v15.4s, v30.4s + add v16.4s, v16.4s, v30.4s + add v15.4s, v15.4s, v22.4s + add v16.4s, v16.4s, v23.4s + + ld1 {v20.16b - v23.16b}, [x1], #64 + + eor v20.16b, v20.16b, v1.16b + eor v21.16b, v21.16b, v6.16b + eor v22.16b, v22.16b, v11.16b + eor v23.16b, v23.16b, v16.16b + + st1 {v20.16b - v23.16b}, [x0], #64 + sub x2, x2, #64 + + b Lopen_tail_64_store + +Lopen_tail_64: + // We just need a single block + mov v0.16b, v24.16b + mov v5.16b, v28.16b + mov v10.16b, v29.16b + mov v15.16b, v30.16b + eor v23.16b, v23.16b, v23.16b + ins v23.s[0], v25.s[0] + add v15.4s, v15.4s, v23.4s + + mov x6, #10 + sub x6, x6, x4 + +Lopen_tail_64_rounds: + add v0.4s, v0.4s, v5.4s + eor v15.16b, v15.16b, v0.16b + rev32 v15.8h, v15.8h + + add v10.4s, v10.4s, v15.4s + eor v5.16b, v5.16b, v10.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + add v0.4s, v0.4s, v20.4s + eor v15.16b, v15.16b, v0.16b + tbl v15.16b, {v15.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + eor v20.16b, v20.16b, v10.16b + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + ext v5.16b, v5.16b, v5.16b, #4 + ext v10.16b, v10.16b, v10.16b, #8 + ext v15.16b, v15.16b, v15.16b, #12 + add v0.4s, v0.4s, v5.4s + eor v15.16b, v15.16b, v0.16b + rev32 v15.8h, v15.8h + + add v10.4s, v10.4s, v15.4s + eor v5.16b, v5.16b, v10.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + add v0.4s, v0.4s, v20.4s + eor v15.16b, v15.16b, v0.16b + tbl v15.16b, {v15.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + eor v20.16b, 
v20.16b, v10.16b + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + ext v5.16b, v5.16b, v5.16b, #12 + ext v10.16b, v10.16b, v10.16b, #8 + ext v15.16b, v15.16b, v15.16b, #4 + subs x6, x6, #1 + b.gt Lopen_tail_64_rounds + cbz x4, Lopen_tail_64_rounds_done + subs x4, x4, #1 + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + b Lopen_tail_64_rounds + +Lopen_tail_64_rounds_done: + add v0.4s, v0.4s, v24.4s + add v5.4s, v5.4s, v28.4s + add v10.4s, v10.4s, v29.4s + add v15.4s, v15.4s, v30.4s + add v15.4s, v15.4s, v23.4s + +Lopen_tail_64_store: + cmp x2, #16 + b.lt Lopen_tail_16 + + ld1 {v20.16b}, [x1], #16 + eor v20.16b, v20.16b, v0.16b + st1 {v20.16b}, [x0], #16 + mov v0.16b, v5.16b + mov v5.16b, v10.16b + mov v10.16b, v15.16b + sub x2, x2, #16 + b Lopen_tail_64_store + +Lopen_tail_16: + // Here we handle the last [0,16) bytes that require a padded block + cbz x2, Lopen_finalize + + eor v20.16b, v20.16b, v20.16b // Use T0 to load the ciphertext + eor v21.16b, v21.16b, v21.16b // Use T1 to generate an AND mask + not v22.16b, v20.16b + + add x7, x1, x2 + mov x6, x2 + +Lopen_tail_16_compose: + ext v20.16b, v20.16b, v20.16b, #15 + ldrb w11, [x7, #-1]! 
+ mov v20.b[0], w11 + ext v21.16b, v22.16b, v21.16b, #15 + subs x2, x2, #1 + b.gt Lopen_tail_16_compose + + and v20.16b, v20.16b, v21.16b + // Hash in the final padded block + mov x11, v20.d[0] + mov x12, v20.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + eor v20.16b, v20.16b, v0.16b + +Lopen_tail_16_store: + umov w11, v20.b[0] + strb w11, [x0], #1 + ext v20.16b, v20.16b, v20.16b, #1 + subs x6, x6, #1 + b.gt Lopen_tail_16_store + +Lopen_finalize: + mov x11, v31.d[0] + mov x12, v31.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + // Final reduction step + sub x12, xzr, x15 + orr x13, xzr, #3 + subs x11, x8, #-5 + sbcs x12, x9, x12 + sbcs x13, x10, x13 + csel x8, x11, x8, cs + csel x9, x12, x9, cs + csel x10, x13, x10, cs + mov x11, v27.d[0] + mov x12, v27.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + + stp x8, x9, [x5] + + ldp d8, d9, [sp, #16] + ldp d10, d11, [sp, #32] + ldp d12, d13, [sp, #48] + ldp d14, d15, [sp, #64] +.cfi_restore b15 +.cfi_restore b14 +.cfi_restore b13 +.cfi_restore b12 +.cfi_restore b11 +.cfi_restore b10 +.cfi_restore b9 +.cfi_restore b8 + ldp x29, x30, [sp], 80 +.cfi_restore w29 +.cfi_restore w30 +.cfi_def_cfa_offset 0 + AARCH64_VALIDATE_LINK_REGISTER + ret + +Lopen_128: + // On some architectures preparing 5 blocks for small buffers is wasteful + eor v25.16b, v25.16b, v25.16b + mov x11, #1 + mov v25.s[0], w11 + mov v0.16b, v24.16b + mov v1.16b, v24.16b + mov v2.16b, v24.16b + mov v5.16b, v28.16b + mov v6.16b, v28.16b + mov v7.16b, v28.16b + mov v10.16b, v29.16b + mov v11.16b, v29.16b + mov v12.16b, v29.16b + mov v17.16b, v30.16b + add v15.4s, v17.4s, v25.4s + add v16.4s, v15.4s, v25.4s + + mov x6, #10 + +Lopen_128_rounds: + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add 
v12.4s, v12.4s, v17.4s + eor v5.16b, v5.16b, v10.16b + eor v6.16b, v6.16b, v11.16b + eor v7.16b, v7.16b, v12.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + ushr v5.4s, v6.4s, #20 + sli v5.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v20.16b, v20.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v6.16b, v6.16b, v12.16b + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v5.4s, #25 + sli v6.4s, v5.4s, #7 + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + + ext v5.16b, v5.16b, v5.16b, #4 + ext v6.16b, v6.16b, v6.16b, #4 + ext v7.16b, v7.16b, v7.16b, #4 + + ext v10.16b, v10.16b, v10.16b, #8 + ext v11.16b, v11.16b, v11.16b, #8 + ext v12.16b, v12.16b, v12.16b, #8 + + ext v15.16b, v15.16b, v15.16b, #12 + ext v16.16b, v16.16b, v16.16b, #12 + ext v17.16b, v17.16b, v17.16b, #12 + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v5.16b, v5.16b, v10.16b + eor v6.16b, v6.16b, v11.16b + eor v7.16b, v7.16b, v12.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + ushr v5.4s, v6.4s, #20 + sli v5.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v20.16b, v20.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v6.16b, v6.16b, v12.16b + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v5.4s, #25 + sli v6.4s, v5.4s, #7 + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + + ext v5.16b, v5.16b, v5.16b, #12 + ext v6.16b, v6.16b, v6.16b, #12 + ext v7.16b, v7.16b, v7.16b, #12 + + ext v10.16b, v10.16b, v10.16b, #8 + ext v11.16b, v11.16b, v11.16b, #8 + ext v12.16b, v12.16b, v12.16b, #8 + + ext v15.16b, v15.16b, v15.16b, #4 + ext v16.16b, v16.16b, v16.16b, #4 + ext v17.16b, v17.16b, v17.16b, #4 + subs x6, x6, #1 + b.hi Lopen_128_rounds + + add v0.4s, v0.4s, v24.4s + add v1.4s, v1.4s, v24.4s + add v2.4s, v2.4s, v24.4s + + add v5.4s, v5.4s, v28.4s + add v6.4s, v6.4s, v28.4s + add v7.4s, v7.4s, v28.4s + + add v10.4s, v10.4s, v29.4s + add v11.4s, v11.4s, v29.4s + + add v30.4s, v30.4s, v25.4s + add v15.4s, v15.4s, v30.4s + add v30.4s, v30.4s, v25.4s + add v16.4s, v16.4s, v30.4s + + and v2.16b, v2.16b, v27.16b + mov x16, v2.d[0] // Move the R key to GPRs + mov x17, v2.d[1] + mov v27.16b, v7.16b // Store the S key + + bl Lpoly_hash_ad_internal + +Lopen_128_store: + cmp x2, #64 + b.lt Lopen_128_store_64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + + mov x11, v20.d[0] + mov x12, v20.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 
+ adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + mov x11, v21.d[0] + mov x12, v21.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + mov x11, v22.d[0] + mov x12, v22.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + mov x11, v23.d[0] + mov x12, v23.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + + eor v20.16b, v20.16b, v0.16b + eor v21.16b, v21.16b, v5.16b + eor v22.16b, v22.16b, v10.16b + eor v23.16b, v23.16b, v15.16b + + st1 {v20.16b - v23.16b}, [x0], #64 + + sub x2, x2, #64 + + mov v0.16b, v1.16b + mov v5.16b, v6.16b + mov v10.16b, v11.16b + mov v15.16b, v16.16b + +Lopen_128_store_64: + + lsr x4, x2, #4 + mov x3, x1 + 
+Lopen_128_hash_64: + cbz x4, Lopen_tail_64_store + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + sub x4, x4, #1 + b Lopen_128_hash_64 +.cfi_endproc + +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32) diff --git a/ring-0.17.14/pregenerated/chacha20_poly1305_x86_64-elf.S b/ring-0.17.14/pregenerated/chacha20_poly1305_x86_64-elf.S new file mode 100644 index 0000000000..499d70cc83 --- /dev/null +++ b/ring-0.17.14/pregenerated/chacha20_poly1305_x86_64-elf.S @@ -0,0 +1,8940 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__) +.section .rodata +.align 64 +chacha20_poly1305_constants: +.Lchacha20_consts: +.byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k' +.byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k' +.Lrol8: +.byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 +.byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 +.Lrol16: +.byte 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13 +.byte 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13 +.Lavx2_init: +.long 0,0,0,0 +.Lsse_inc: +.long 1,0,0,0 +.Lavx2_inc: +.long 2,0,0,0,2,0,0,0 +.Lclamp: +.quad 0x0FFFFFFC0FFFFFFF, 0x0FFFFFFC0FFFFFFC +.quad 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF +.align 16 +.Land_masks: +.byte 0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff +.text + 
+.type poly_hash_ad_internal,@function +.align 64 +poly_hash_ad_internal: +.cfi_startproc +.cfi_def_cfa rsp, 8 + xorq %r10,%r10 + xorq %r11,%r11 + xorq %r12,%r12 + cmpq $13,%r8 + jne .Lhash_ad_loop +.Lpoly_fast_tls_ad: + + movq (%rcx),%r10 + movq 5(%rcx),%r11 + shrq $24,%r11 + movq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + ret +.Lhash_ad_loop: + + cmpq $16,%r8 + jb .Lhash_ad_tail + addq 0+0(%rcx),%r10 + adcq 8+0(%rcx),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rcx),%rcx + subq $16,%r8 + jmp .Lhash_ad_loop +.Lhash_ad_tail: + cmpq $0,%r8 + je .Lhash_ad_done + + xorq %r13,%r13 + xorq %r14,%r14 + xorq %r15,%r15 + addq %r8,%rcx +.Lhash_ad_tail_loop: + shldq $8,%r13,%r14 + shlq $8,%r13 + movzbq -1(%rcx),%r15 + xorq %r15,%r13 + decq %rcx + decq %r8 + jne .Lhash_ad_tail_loop + + addq %r13,%r10 + adcq %r14,%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + +.Lhash_ad_done: + ret +.cfi_endproc +.size poly_hash_ad_internal, .-poly_hash_ad_internal + +.globl chacha20_poly1305_open_sse41 +.hidden chacha20_poly1305_open_sse41 +.type chacha20_poly1305_open_sse41,@function +.align 64 +chacha20_poly1305_open_sse41: +.cfi_startproc +_CET_ENDBR + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + + + pushq %r9 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r9,-64 + subq $288 + 0 + 32,%rsp +.cfi_adjust_cfa_offset 288 + 32 + + leaq 32(%rsp),%rbp + andq $-32,%rbp + + movq %rdx,%rbx + movq %r8,0+0+32(%rbp) + movq %rbx,8+0+32(%rbp) + + cmpq 
$128,%rbx + jbe .Lopen_sse_128 + + movdqa .Lchacha20_consts(%rip),%xmm0 + movdqu 0(%r9),%xmm4 + movdqu 16(%r9),%xmm8 + movdqu 32(%r9),%xmm12 + + movdqa %xmm12,%xmm7 + + movdqa %xmm4,0+48(%rbp) + movdqa %xmm8,0+64(%rbp) + movdqa %xmm12,0+96(%rbp) + movq $10,%r10 +.Lopen_sse_init_rounds: + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,4 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,12 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,12 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,4 + + decq %r10 + jne .Lopen_sse_init_rounds + + paddd .Lchacha20_consts(%rip),%xmm0 + paddd 0+48(%rbp),%xmm4 + + pand .Lclamp(%rip),%xmm0 + movdqa %xmm0,0+0(%rbp) + movdqa %xmm4,0+16(%rbp) + + movq %r8,%r8 + call poly_hash_ad_internal +.Lopen_sse_main_loop: + cmpq $256,%rbx + jb .Lopen_sse_tail + + movdqa .Lchacha20_consts(%rip),%xmm0 + movdqa 0+48(%rbp),%xmm4 + movdqa 0+64(%rbp),%xmm8 + movdqa %xmm0,%xmm1 + movdqa %xmm4,%xmm5 + movdqa %xmm8,%xmm9 + movdqa %xmm0,%xmm2 + movdqa %xmm4,%xmm6 + movdqa %xmm8,%xmm10 + movdqa %xmm0,%xmm3 + movdqa %xmm4,%xmm7 + movdqa %xmm8,%xmm11 + movdqa 0+96(%rbp),%xmm15 + paddd .Lsse_inc(%rip),%xmm15 + movdqa %xmm15,%xmm14 + paddd .Lsse_inc(%rip),%xmm14 + movdqa %xmm14,%xmm13 + paddd .Lsse_inc(%rip),%xmm13 + movdqa %xmm13,%xmm12 + paddd .Lsse_inc(%rip),%xmm12 + movdqa %xmm12,0+96(%rbp) + movdqa %xmm13,0+112(%rbp) + movdqa %xmm14,0+128(%rbp) + movdqa %xmm15,0+144(%rbp) + + + + movq $4,%rcx + movq %rsi,%r8 +.Lopen_sse_main_loop_rounds: + movdqa %xmm8,0+80(%rbp) + movdqa .Lrol16(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 +.byte 102,69,15,56,0,248 +.byte 102,69,15,56,0,240 +.byte 102,69,15,56,0,232 +.byte 102,69,15,56,0,224 + movdqa 0+80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + addq 0+0(%r8),%r10 + adcq 8+0(%r8),%r11 + adcq $1,%r12 + + leaq 16(%r8),%r8 + pxor %xmm10,%xmm6 + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,0+80(%rbp) + movdqa %xmm7,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm4 + pxor %xmm8,%xmm4 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movdqa .Lrol8(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 +.byte 102,69,15,56,0,248 +.byte 102,69,15,56,0,240 +.byte 102,69,15,56,0,232 +.byte 
102,69,15,56,0,224 + movdqa 0+80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + pxor %xmm10,%xmm6 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,0+80(%rbp) + movdqa %xmm7,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm4 + pxor %xmm8,%xmm4 + movdqa 0+80(%rbp),%xmm8 + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 +.byte 102,15,58,15,255,4 +.byte 102,69,15,58,15,219,8 +.byte 102,69,15,58,15,255,12 +.byte 102,15,58,15,246,4 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,12 +.byte 102,15,58,15,237,4 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,12 +.byte 102,15,58,15,228,4 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,12 + movdqa %xmm8,0+80(%rbp) + movdqa .Lrol16(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 +.byte 102,69,15,56,0,248 +.byte 102,69,15,56,0,240 +.byte 102,69,15,56,0,232 +.byte 102,69,15,56,0,224 + movdqa 0+80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + pxor %xmm10,%xmm6 + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,0+80(%rbp) + movdqa %xmm7,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm4 + pxor %xmm8,%xmm4 + movdqa .Lrol8(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 +.byte 102,69,15,56,0,248 +.byte 102,69,15,56,0,240 +.byte 102,69,15,56,0,232 +.byte 102,69,15,56,0,224 + movdqa 0+80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + pxor %xmm10,%xmm6 + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,0+80(%rbp) + movdqa %xmm7,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm4 + pxor %xmm8,%xmm4 + movdqa 0+80(%rbp),%xmm8 +.byte 102,15,58,15,255,12 +.byte 102,69,15,58,15,219,8 +.byte 102,69,15,58,15,255,4 +.byte 102,15,58,15,246,12 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,4 +.byte 102,15,58,15,237,12 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,4 +.byte 102,15,58,15,228,12 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,4 + + decq %rcx + jge .Lopen_sse_main_loop_rounds + addq 0+0(%r8),%r10 + adcq 8+0(%r8),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + 
mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%r8),%r8 + cmpq $-6,%rcx + jg .Lopen_sse_main_loop_rounds + paddd .Lchacha20_consts(%rip),%xmm3 + paddd 0+48(%rbp),%xmm7 + paddd 0+64(%rbp),%xmm11 + paddd 0+144(%rbp),%xmm15 + paddd .Lchacha20_consts(%rip),%xmm2 + paddd 0+48(%rbp),%xmm6 + paddd 0+64(%rbp),%xmm10 + paddd 0+128(%rbp),%xmm14 + paddd .Lchacha20_consts(%rip),%xmm1 + paddd 0+48(%rbp),%xmm5 + paddd 0+64(%rbp),%xmm9 + paddd 0+112(%rbp),%xmm13 + paddd .Lchacha20_consts(%rip),%xmm0 + paddd 0+48(%rbp),%xmm4 + paddd 0+64(%rbp),%xmm8 + paddd 0+96(%rbp),%xmm12 + movdqa %xmm12,0+80(%rbp) + movdqu 0 + 0(%rsi),%xmm12 + pxor %xmm3,%xmm12 + movdqu %xmm12,0 + 0(%rdi) + movdqu 16 + 0(%rsi),%xmm12 + pxor %xmm7,%xmm12 + movdqu %xmm12,16 + 0(%rdi) + movdqu 32 + 0(%rsi),%xmm12 + pxor %xmm11,%xmm12 + movdqu %xmm12,32 + 0(%rdi) + movdqu 48 + 0(%rsi),%xmm12 + pxor %xmm15,%xmm12 + movdqu %xmm12,48 + 0(%rdi) + movdqu 0 + 64(%rsi),%xmm3 + movdqu 16 + 64(%rsi),%xmm7 + movdqu 32 + 64(%rsi),%xmm11 + movdqu 48 + 64(%rsi),%xmm15 + pxor %xmm3,%xmm2 + pxor %xmm7,%xmm6 + pxor %xmm11,%xmm10 + pxor %xmm14,%xmm15 + movdqu %xmm2,0 + 64(%rdi) + movdqu %xmm6,16 + 64(%rdi) + movdqu %xmm10,32 + 64(%rdi) + movdqu %xmm15,48 + 64(%rdi) + movdqu 0 + 128(%rsi),%xmm3 + movdqu 16 + 128(%rsi),%xmm7 + movdqu 32 + 128(%rsi),%xmm11 + movdqu 48 + 128(%rsi),%xmm15 + pxor %xmm3,%xmm1 + pxor %xmm7,%xmm5 + pxor %xmm11,%xmm9 + pxor %xmm13,%xmm15 + movdqu %xmm1,0 + 128(%rdi) + movdqu %xmm5,16 + 128(%rdi) + movdqu %xmm9,32 + 128(%rdi) + movdqu %xmm15,48 + 128(%rdi) + movdqu 0 + 192(%rsi),%xmm3 + movdqu 16 + 192(%rsi),%xmm7 + movdqu 32 + 192(%rsi),%xmm11 + movdqu 48 + 192(%rsi),%xmm15 + pxor %xmm3,%xmm0 + pxor %xmm7,%xmm4 + pxor %xmm11,%xmm8 + pxor 0+80(%rbp),%xmm15 + movdqu %xmm0,0 + 192(%rdi) + movdqu %xmm4,16 + 192(%rdi) + movdqu %xmm8,32 + 192(%rdi) + movdqu %xmm15,48 + 192(%rdi) + + leaq 256(%rsi),%rsi + leaq 256(%rdi),%rdi + subq $256,%rbx + jmp .Lopen_sse_main_loop +.Lopen_sse_tail: + + testq %rbx,%rbx + jz .Lopen_sse_finalize + cmpq $192,%rbx + ja .Lopen_sse_tail_256 + cmpq $128,%rbx + ja .Lopen_sse_tail_192 + cmpq $64,%rbx + ja .Lopen_sse_tail_128 + movdqa .Lchacha20_consts(%rip),%xmm0 + movdqa 0+48(%rbp),%xmm4 + movdqa 0+64(%rbp),%xmm8 + movdqa 0+96(%rbp),%xmm12 + paddd .Lsse_inc(%rip),%xmm12 + movdqa %xmm12,0+96(%rbp) + + xorq %r8,%r8 + movq %rbx,%rcx + cmpq $16,%rcx + jb .Lopen_sse_tail_64_rounds +.Lopen_sse_tail_64_rounds_and_x1hash: + addq 0+0(%rsi,%r8,1),%r10 + adcq 8+0(%rsi,%r8,1),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq 
%r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + subq $16,%rcx +.Lopen_sse_tail_64_rounds: + addq $16,%r8 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,4 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,12 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,12 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,4 + + cmpq $16,%rcx + jae .Lopen_sse_tail_64_rounds_and_x1hash + cmpq $160,%r8 + jne .Lopen_sse_tail_64_rounds + paddd .Lchacha20_consts(%rip),%xmm0 + paddd 0+48(%rbp),%xmm4 + paddd 0+64(%rbp),%xmm8 + paddd 0+96(%rbp),%xmm12 + + jmp .Lopen_sse_tail_64_dec_loop + +.Lopen_sse_tail_128: + movdqa .Lchacha20_consts(%rip),%xmm0 + movdqa 0+48(%rbp),%xmm4 + movdqa 0+64(%rbp),%xmm8 + movdqa %xmm0,%xmm1 + movdqa %xmm4,%xmm5 + movdqa %xmm8,%xmm9 + movdqa 0+96(%rbp),%xmm13 + paddd .Lsse_inc(%rip),%xmm13 + movdqa %xmm13,%xmm12 + paddd .Lsse_inc(%rip),%xmm12 + movdqa %xmm12,0+96(%rbp) + movdqa %xmm13,0+112(%rbp) + + movq %rbx,%rcx + andq $-16,%rcx + xorq %r8,%r8 +.Lopen_sse_tail_128_rounds_and_x1hash: + addq 0+0(%rsi,%r8,1),%r10 + adcq 8+0(%rsi,%r8,1),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + +.Lopen_sse_tail_128_rounds: + addq $16,%r8 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,4 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,12 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .Lrol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .Lrol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 +.byte 102,15,58,15,237,4 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,12 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + 
movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,12 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,4 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .Lrol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .Lrol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 +.byte 102,15,58,15,237,12 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,4 + + cmpq %rcx,%r8 + jb .Lopen_sse_tail_128_rounds_and_x1hash + cmpq $160,%r8 + jne .Lopen_sse_tail_128_rounds + paddd .Lchacha20_consts(%rip),%xmm1 + paddd 0+48(%rbp),%xmm5 + paddd 0+64(%rbp),%xmm9 + paddd 0+112(%rbp),%xmm13 + paddd .Lchacha20_consts(%rip),%xmm0 + paddd 0+48(%rbp),%xmm4 + paddd 0+64(%rbp),%xmm8 + paddd 0+96(%rbp),%xmm12 + movdqu 0 + 0(%rsi),%xmm3 + movdqu 16 + 0(%rsi),%xmm7 + movdqu 32 + 0(%rsi),%xmm11 + movdqu 48 + 0(%rsi),%xmm15 + pxor %xmm3,%xmm1 + pxor %xmm7,%xmm5 + pxor %xmm11,%xmm9 + pxor %xmm13,%xmm15 + movdqu %xmm1,0 + 0(%rdi) + movdqu %xmm5,16 + 0(%rdi) + movdqu %xmm9,32 + 0(%rdi) + movdqu %xmm15,48 + 0(%rdi) + + subq $64,%rbx + leaq 64(%rsi),%rsi + leaq 64(%rdi),%rdi + jmp .Lopen_sse_tail_64_dec_loop + +.Lopen_sse_tail_192: + movdqa .Lchacha20_consts(%rip),%xmm0 + movdqa 0+48(%rbp),%xmm4 + movdqa 0+64(%rbp),%xmm8 + movdqa %xmm0,%xmm1 + movdqa %xmm4,%xmm5 + movdqa %xmm8,%xmm9 + movdqa %xmm0,%xmm2 + movdqa %xmm4,%xmm6 + movdqa %xmm8,%xmm10 + movdqa 0+96(%rbp),%xmm14 + paddd .Lsse_inc(%rip),%xmm14 + movdqa %xmm14,%xmm13 + paddd .Lsse_inc(%rip),%xmm13 + movdqa %xmm13,%xmm12 + paddd .Lsse_inc(%rip),%xmm12 + movdqa %xmm12,0+96(%rbp) + movdqa %xmm13,0+112(%rbp) + movdqa %xmm14,0+128(%rbp) + + movq %rbx,%rcx + movq $160,%r8 + cmpq $160,%rcx + cmovgq %r8,%rcx + andq $-16,%rcx + xorq %r8,%r8 +.Lopen_sse_tail_192_rounds_and_x1hash: + addq 0+0(%rsi,%r8,1),%r10 + adcq 8+0(%rsi,%r8,1),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + +.Lopen_sse_tail_192_rounds: + addq $16,%r8 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,4 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,12 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .Lrol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + 
paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .Lrol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 +.byte 102,15,58,15,237,4 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,12 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .Lrol16(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm6 + pxor %xmm3,%xmm6 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .Lrol8(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm6 + pxor %xmm3,%xmm6 +.byte 102,15,58,15,246,4 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,12 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,12 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,4 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .Lrol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .Lrol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 +.byte 102,15,58,15,237,12 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,4 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .Lrol16(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm6 + pxor %xmm3,%xmm6 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .Lrol8(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm6 + pxor %xmm3,%xmm6 +.byte 102,15,58,15,246,12 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,4 + + cmpq %rcx,%r8 + jb .Lopen_sse_tail_192_rounds_and_x1hash + cmpq $160,%r8 + jne .Lopen_sse_tail_192_rounds + cmpq $176,%rbx + jb .Lopen_sse_tail_192_finish + addq 0+160(%rsi),%r10 + adcq 8+160(%rsi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + cmpq $192,%rbx + jb .Lopen_sse_tail_192_finish + addq 0+176(%rsi),%r10 + adcq 8+176(%rsi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq 
%r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + +.Lopen_sse_tail_192_finish: + paddd .Lchacha20_consts(%rip),%xmm2 + paddd 0+48(%rbp),%xmm6 + paddd 0+64(%rbp),%xmm10 + paddd 0+128(%rbp),%xmm14 + paddd .Lchacha20_consts(%rip),%xmm1 + paddd 0+48(%rbp),%xmm5 + paddd 0+64(%rbp),%xmm9 + paddd 0+112(%rbp),%xmm13 + paddd .Lchacha20_consts(%rip),%xmm0 + paddd 0+48(%rbp),%xmm4 + paddd 0+64(%rbp),%xmm8 + paddd 0+96(%rbp),%xmm12 + movdqu 0 + 0(%rsi),%xmm3 + movdqu 16 + 0(%rsi),%xmm7 + movdqu 32 + 0(%rsi),%xmm11 + movdqu 48 + 0(%rsi),%xmm15 + pxor %xmm3,%xmm2 + pxor %xmm7,%xmm6 + pxor %xmm11,%xmm10 + pxor %xmm14,%xmm15 + movdqu %xmm2,0 + 0(%rdi) + movdqu %xmm6,16 + 0(%rdi) + movdqu %xmm10,32 + 0(%rdi) + movdqu %xmm15,48 + 0(%rdi) + movdqu 0 + 64(%rsi),%xmm3 + movdqu 16 + 64(%rsi),%xmm7 + movdqu 32 + 64(%rsi),%xmm11 + movdqu 48 + 64(%rsi),%xmm15 + pxor %xmm3,%xmm1 + pxor %xmm7,%xmm5 + pxor %xmm11,%xmm9 + pxor %xmm13,%xmm15 + movdqu %xmm1,0 + 64(%rdi) + movdqu %xmm5,16 + 64(%rdi) + movdqu %xmm9,32 + 64(%rdi) + movdqu %xmm15,48 + 64(%rdi) + + subq $128,%rbx + leaq 128(%rsi),%rsi + leaq 128(%rdi),%rdi + jmp .Lopen_sse_tail_64_dec_loop + +.Lopen_sse_tail_256: + movdqa .Lchacha20_consts(%rip),%xmm0 + movdqa 0+48(%rbp),%xmm4 + movdqa 0+64(%rbp),%xmm8 + movdqa %xmm0,%xmm1 + movdqa %xmm4,%xmm5 + movdqa %xmm8,%xmm9 + movdqa %xmm0,%xmm2 + movdqa %xmm4,%xmm6 + movdqa %xmm8,%xmm10 + movdqa %xmm0,%xmm3 + movdqa %xmm4,%xmm7 + movdqa %xmm8,%xmm11 + movdqa 0+96(%rbp),%xmm15 + paddd .Lsse_inc(%rip),%xmm15 + movdqa %xmm15,%xmm14 + paddd .Lsse_inc(%rip),%xmm14 + movdqa %xmm14,%xmm13 + paddd .Lsse_inc(%rip),%xmm13 + movdqa %xmm13,%xmm12 + paddd .Lsse_inc(%rip),%xmm12 + movdqa %xmm12,0+96(%rbp) + movdqa %xmm13,0+112(%rbp) + movdqa %xmm14,0+128(%rbp) + movdqa %xmm15,0+144(%rbp) + + xorq %r8,%r8 +.Lopen_sse_tail_256_rounds_and_x1hash: + addq 0+0(%rsi,%r8,1),%r10 + adcq 8+0(%rsi,%r8,1),%r11 + adcq $1,%r12 + movdqa %xmm11,0+80(%rbp) + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm11 + pslld $12,%xmm11 + psrld $20,%xmm4 + pxor %xmm11,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm11 + pslld $7,%xmm11 + psrld $25,%xmm4 + pxor %xmm11,%xmm4 +.byte 102,15,58,15,228,4 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,12 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .Lrol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm11 + pslld $12,%xmm11 + psrld $20,%xmm5 + pxor %xmm11,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .Lrol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm11 + pslld $7,%xmm11 + psrld $25,%xmm5 + pxor %xmm11,%xmm5 +.byte 102,15,58,15,237,4 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,12 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .Lrol16(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm11 + pslld $12,%xmm11 + psrld $20,%xmm6 + pxor %xmm11,%xmm6 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .Lrol8(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm11 + pslld $7,%xmm11 + psrld $25,%xmm6 + pxor %xmm11,%xmm6 +.byte 102,15,58,15,246,4 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,12 + movdqa 0+80(%rbp),%xmm11 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq 
%r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movdqa %xmm9,0+80(%rbp) + paddd %xmm7,%xmm3 + pxor %xmm3,%xmm15 + pshufb .Lrol16(%rip),%xmm15 + paddd %xmm15,%xmm11 + pxor %xmm11,%xmm7 + movdqa %xmm7,%xmm9 + pslld $12,%xmm9 + psrld $20,%xmm7 + pxor %xmm9,%xmm7 + paddd %xmm7,%xmm3 + pxor %xmm3,%xmm15 + pshufb .Lrol8(%rip),%xmm15 + paddd %xmm15,%xmm11 + pxor %xmm11,%xmm7 + movdqa %xmm7,%xmm9 + pslld $7,%xmm9 + psrld $25,%xmm7 + pxor %xmm9,%xmm7 +.byte 102,15,58,15,255,4 +.byte 102,69,15,58,15,219,8 +.byte 102,69,15,58,15,255,12 + movdqa 0+80(%rbp),%xmm9 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + movdqa %xmm11,0+80(%rbp) + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm11 + pslld $12,%xmm11 + psrld $20,%xmm4 + pxor %xmm11,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm11 + pslld $7,%xmm11 + psrld $25,%xmm4 + pxor %xmm11,%xmm4 +.byte 102,15,58,15,228,12 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,4 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .Lrol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm11 + pslld $12,%xmm11 + psrld $20,%xmm5 + pxor %xmm11,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .Lrol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm11 + pslld $7,%xmm11 + psrld $25,%xmm5 + pxor %xmm11,%xmm5 +.byte 102,15,58,15,237,12 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,4 + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .Lrol16(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm11 + pslld $12,%xmm11 + psrld $20,%xmm6 + pxor %xmm11,%xmm6 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .Lrol8(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm11 + pslld $7,%xmm11 + psrld $25,%xmm6 + pxor %xmm11,%xmm6 +.byte 102,15,58,15,246,12 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,4 + movdqa 0+80(%rbp),%xmm11 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + movdqa %xmm9,0+80(%rbp) + paddd %xmm7,%xmm3 + pxor %xmm3,%xmm15 + pshufb .Lrol16(%rip),%xmm15 + paddd %xmm15,%xmm11 + pxor %xmm11,%xmm7 + movdqa %xmm7,%xmm9 + pslld $12,%xmm9 + psrld $20,%xmm7 + pxor %xmm9,%xmm7 + paddd %xmm7,%xmm3 + pxor %xmm3,%xmm15 + pshufb .Lrol8(%rip),%xmm15 + paddd %xmm15,%xmm11 + pxor %xmm11,%xmm7 + movdqa %xmm7,%xmm9 + pslld $7,%xmm9 + psrld $25,%xmm7 + pxor %xmm9,%xmm7 +.byte 102,15,58,15,255,12 +.byte 102,69,15,58,15,219,8 +.byte 102,69,15,58,15,255,4 + movdqa 0+80(%rbp),%xmm9 + + addq $16,%r8 + cmpq $160,%r8 + jb .Lopen_sse_tail_256_rounds_and_x1hash + + movq %rbx,%rcx + andq $-16,%rcx +.Lopen_sse_tail_256_hash: + addq 0+0(%rsi,%r8,1),%r10 + adcq 8+0(%rsi,%r8,1),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq 
%rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + addq $16,%r8 + cmpq %rcx,%r8 + jb .Lopen_sse_tail_256_hash + paddd .Lchacha20_consts(%rip),%xmm3 + paddd 0+48(%rbp),%xmm7 + paddd 0+64(%rbp),%xmm11 + paddd 0+144(%rbp),%xmm15 + paddd .Lchacha20_consts(%rip),%xmm2 + paddd 0+48(%rbp),%xmm6 + paddd 0+64(%rbp),%xmm10 + paddd 0+128(%rbp),%xmm14 + paddd .Lchacha20_consts(%rip),%xmm1 + paddd 0+48(%rbp),%xmm5 + paddd 0+64(%rbp),%xmm9 + paddd 0+112(%rbp),%xmm13 + paddd .Lchacha20_consts(%rip),%xmm0 + paddd 0+48(%rbp),%xmm4 + paddd 0+64(%rbp),%xmm8 + paddd 0+96(%rbp),%xmm12 + movdqa %xmm12,0+80(%rbp) + movdqu 0 + 0(%rsi),%xmm12 + pxor %xmm3,%xmm12 + movdqu %xmm12,0 + 0(%rdi) + movdqu 16 + 0(%rsi),%xmm12 + pxor %xmm7,%xmm12 + movdqu %xmm12,16 + 0(%rdi) + movdqu 32 + 0(%rsi),%xmm12 + pxor %xmm11,%xmm12 + movdqu %xmm12,32 + 0(%rdi) + movdqu 48 + 0(%rsi),%xmm12 + pxor %xmm15,%xmm12 + movdqu %xmm12,48 + 0(%rdi) + movdqu 0 + 64(%rsi),%xmm3 + movdqu 16 + 64(%rsi),%xmm7 + movdqu 32 + 64(%rsi),%xmm11 + movdqu 48 + 64(%rsi),%xmm15 + pxor %xmm3,%xmm2 + pxor %xmm7,%xmm6 + pxor %xmm11,%xmm10 + pxor %xmm14,%xmm15 + movdqu %xmm2,0 + 64(%rdi) + movdqu %xmm6,16 + 64(%rdi) + movdqu %xmm10,32 + 64(%rdi) + movdqu %xmm15,48 + 64(%rdi) + movdqu 0 + 128(%rsi),%xmm3 + movdqu 16 + 128(%rsi),%xmm7 + movdqu 32 + 128(%rsi),%xmm11 + movdqu 48 + 128(%rsi),%xmm15 + pxor %xmm3,%xmm1 + pxor %xmm7,%xmm5 + pxor %xmm11,%xmm9 + pxor %xmm13,%xmm15 + movdqu %xmm1,0 + 128(%rdi) + movdqu %xmm5,16 + 128(%rdi) + movdqu %xmm9,32 + 128(%rdi) + movdqu %xmm15,48 + 128(%rdi) + + movdqa 0+80(%rbp),%xmm12 + subq $192,%rbx + leaq 192(%rsi),%rsi + leaq 192(%rdi),%rdi + + +.Lopen_sse_tail_64_dec_loop: + cmpq $16,%rbx + jb .Lopen_sse_tail_16_init + subq $16,%rbx + movdqu (%rsi),%xmm3 + pxor %xmm3,%xmm0 + movdqu %xmm0,(%rdi) + leaq 16(%rsi),%rsi + leaq 16(%rdi),%rdi + movdqa %xmm4,%xmm0 + movdqa %xmm8,%xmm4 + movdqa %xmm12,%xmm8 + jmp .Lopen_sse_tail_64_dec_loop +.Lopen_sse_tail_16_init: + movdqa %xmm0,%xmm1 + + +.Lopen_sse_tail_16: + testq %rbx,%rbx + jz .Lopen_sse_finalize + + + + pxor %xmm3,%xmm3 + leaq -1(%rsi,%rbx,1),%rsi + movq %rbx,%r8 +.Lopen_sse_tail_16_compose: + pslldq $1,%xmm3 + pinsrb $0,(%rsi),%xmm3 + subq $1,%rsi + subq $1,%r8 + jnz .Lopen_sse_tail_16_compose + +.byte 102,73,15,126,221 + pextrq $1,%xmm3,%r14 + + pxor %xmm1,%xmm3 + + +.Lopen_sse_tail_16_extract: + pextrb $0,%xmm3,(%rdi) + psrldq $1,%xmm3 + addq $1,%rdi + subq $1,%rbx + jne .Lopen_sse_tail_16_extract + + addq %r13,%r10 + adcq %r14,%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + +.Lopen_sse_finalize: + addq 0+0+32(%rbp),%r10 + adcq 8+0+32(%rbp),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq 
%r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + + movq %r10,%r13 + movq %r11,%r14 + movq %r12,%r15 + subq $-5,%r10 + sbbq $-1,%r11 + sbbq $3,%r12 + cmovcq %r13,%r10 + cmovcq %r14,%r11 + cmovcq %r15,%r12 + + addq 0+0+16(%rbp),%r10 + adcq 8+0+16(%rbp),%r11 + +.cfi_remember_state + addq $288 + 0 + 32,%rsp +.cfi_adjust_cfa_offset -(288 + 32) + + popq %r9 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r9 + movq %r10,(%r9) + movq %r11,8(%r9) + popq %r15 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r15 + popq %r14 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r14 + popq %r13 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r13 + popq %r12 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r12 + popq %rbx +.cfi_adjust_cfa_offset -8 +.cfi_restore %rbx + popq %rbp +.cfi_adjust_cfa_offset -8 +.cfi_restore %rbp + ret + +.Lopen_sse_128: +.cfi_restore_state + movdqu .Lchacha20_consts(%rip),%xmm0 + movdqa %xmm0,%xmm1 + movdqa %xmm0,%xmm2 + movdqu 0(%r9),%xmm4 + movdqa %xmm4,%xmm5 + movdqa %xmm4,%xmm6 + movdqu 16(%r9),%xmm8 + movdqa %xmm8,%xmm9 + movdqa %xmm8,%xmm10 + movdqu 32(%r9),%xmm12 + movdqa %xmm12,%xmm13 + paddd .Lsse_inc(%rip),%xmm13 + movdqa %xmm13,%xmm14 + paddd .Lsse_inc(%rip),%xmm14 + movdqa %xmm4,%xmm7 + movdqa %xmm8,%xmm11 + movdqa %xmm13,%xmm15 + movq $10,%r10 + +.Lopen_sse_128_rounds: + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,4 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,12 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .Lrol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .Lrol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 +.byte 102,15,58,15,237,4 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,12 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .Lrol16(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm6 + pxor %xmm3,%xmm6 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .Lrol8(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm6 + pxor %xmm3,%xmm6 +.byte 102,15,58,15,246,4 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,12 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,12 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,4 + 
paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .Lrol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .Lrol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 +.byte 102,15,58,15,237,12 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,4 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .Lrol16(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm6 + pxor %xmm3,%xmm6 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .Lrol8(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm6 + pxor %xmm3,%xmm6 +.byte 102,15,58,15,246,12 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,4 + + decq %r10 + jnz .Lopen_sse_128_rounds + paddd .Lchacha20_consts(%rip),%xmm0 + paddd .Lchacha20_consts(%rip),%xmm1 + paddd .Lchacha20_consts(%rip),%xmm2 + paddd %xmm7,%xmm4 + paddd %xmm7,%xmm5 + paddd %xmm7,%xmm6 + paddd %xmm11,%xmm9 + paddd %xmm11,%xmm10 + paddd %xmm15,%xmm13 + paddd .Lsse_inc(%rip),%xmm15 + paddd %xmm15,%xmm14 + + pand .Lclamp(%rip),%xmm0 + movdqa %xmm0,0+0(%rbp) + movdqa %xmm4,0+16(%rbp) + + movq %r8,%r8 + call poly_hash_ad_internal +.Lopen_sse_128_xor_hash: + cmpq $16,%rbx + jb .Lopen_sse_tail_16 + subq $16,%rbx + addq 0+0(%rsi),%r10 + adcq 8+0(%rsi),%r11 + adcq $1,%r12 + + + movdqu 0(%rsi),%xmm3 + pxor %xmm3,%xmm1 + movdqu %xmm1,0(%rdi) + leaq 16(%rsi),%rsi + leaq 16(%rdi),%rdi + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + + movdqa %xmm5,%xmm1 + movdqa %xmm9,%xmm5 + movdqa %xmm13,%xmm9 + movdqa %xmm2,%xmm13 + movdqa %xmm6,%xmm2 + movdqa %xmm10,%xmm6 + movdqa %xmm14,%xmm10 + jmp .Lopen_sse_128_xor_hash +.size chacha20_poly1305_open_sse41, .-chacha20_poly1305_open_sse41 +.cfi_endproc + + + + + + + +.globl chacha20_poly1305_seal_sse41 +.hidden chacha20_poly1305_seal_sse41 +.type chacha20_poly1305_seal_sse41,@function +.align 64 +chacha20_poly1305_seal_sse41: +.cfi_startproc +_CET_ENDBR + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + + + pushq %r9 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r9,-64 + subq $288 + 0 + 32,%rsp +.cfi_adjust_cfa_offset 288 + 32 + leaq 32(%rsp),%rbp + andq $-32,%rbp + + movq 56(%r9),%rbx + addq %rdx,%rbx + movq %r8,0+0+32(%rbp) + movq %rbx,8+0+32(%rbp) + movq %rdx,%rbx + + cmpq $128,%rbx + jbe .Lseal_sse_128 + + movdqa .Lchacha20_consts(%rip),%xmm0 + movdqu 0(%r9),%xmm4 + movdqu 16(%r9),%xmm8 + movdqu 32(%r9),%xmm12 + + movdqa %xmm0,%xmm1 + movdqa %xmm0,%xmm2 + movdqa 
%xmm0,%xmm3 + movdqa %xmm4,%xmm5 + movdqa %xmm4,%xmm6 + movdqa %xmm4,%xmm7 + movdqa %xmm8,%xmm9 + movdqa %xmm8,%xmm10 + movdqa %xmm8,%xmm11 + movdqa %xmm12,%xmm15 + paddd .Lsse_inc(%rip),%xmm12 + movdqa %xmm12,%xmm14 + paddd .Lsse_inc(%rip),%xmm12 + movdqa %xmm12,%xmm13 + paddd .Lsse_inc(%rip),%xmm12 + + movdqa %xmm4,0+48(%rbp) + movdqa %xmm8,0+64(%rbp) + movdqa %xmm12,0+96(%rbp) + movdqa %xmm13,0+112(%rbp) + movdqa %xmm14,0+128(%rbp) + movdqa %xmm15,0+144(%rbp) + movq $10,%r10 +.Lseal_sse_init_rounds: + movdqa %xmm8,0+80(%rbp) + movdqa .Lrol16(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 +.byte 102,69,15,56,0,248 +.byte 102,69,15,56,0,240 +.byte 102,69,15,56,0,232 +.byte 102,69,15,56,0,224 + movdqa 0+80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + pxor %xmm10,%xmm6 + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,0+80(%rbp) + movdqa %xmm7,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm4 + pxor %xmm8,%xmm4 + movdqa .Lrol8(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 +.byte 102,69,15,56,0,248 +.byte 102,69,15,56,0,240 +.byte 102,69,15,56,0,232 +.byte 102,69,15,56,0,224 + movdqa 0+80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + pxor %xmm10,%xmm6 + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,0+80(%rbp) + movdqa %xmm7,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm4 + pxor %xmm8,%xmm4 + movdqa 0+80(%rbp),%xmm8 +.byte 102,15,58,15,255,4 +.byte 102,69,15,58,15,219,8 +.byte 102,69,15,58,15,255,12 +.byte 102,15,58,15,246,4 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,12 +.byte 102,15,58,15,237,4 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,12 +.byte 102,15,58,15,228,4 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,12 + movdqa %xmm8,0+80(%rbp) + movdqa .Lrol16(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 +.byte 102,69,15,56,0,248 +.byte 102,69,15,56,0,240 +.byte 102,69,15,56,0,232 +.byte 102,69,15,56,0,224 + movdqa 0+80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + pxor %xmm10,%xmm6 + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,0+80(%rbp) + movdqa %xmm7,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm4 + pxor %xmm8,%xmm4 + movdqa .Lrol8(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + pxor 
%xmm1,%xmm13 + pxor %xmm0,%xmm12 +.byte 102,69,15,56,0,248 +.byte 102,69,15,56,0,240 +.byte 102,69,15,56,0,232 +.byte 102,69,15,56,0,224 + movdqa 0+80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + pxor %xmm10,%xmm6 + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,0+80(%rbp) + movdqa %xmm7,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm4 + pxor %xmm8,%xmm4 + movdqa 0+80(%rbp),%xmm8 +.byte 102,15,58,15,255,12 +.byte 102,69,15,58,15,219,8 +.byte 102,69,15,58,15,255,4 +.byte 102,15,58,15,246,12 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,4 +.byte 102,15,58,15,237,12 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,4 +.byte 102,15,58,15,228,12 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,4 + + decq %r10 + jnz .Lseal_sse_init_rounds + paddd .Lchacha20_consts(%rip),%xmm3 + paddd 0+48(%rbp),%xmm7 + paddd 0+64(%rbp),%xmm11 + paddd 0+144(%rbp),%xmm15 + paddd .Lchacha20_consts(%rip),%xmm2 + paddd 0+48(%rbp),%xmm6 + paddd 0+64(%rbp),%xmm10 + paddd 0+128(%rbp),%xmm14 + paddd .Lchacha20_consts(%rip),%xmm1 + paddd 0+48(%rbp),%xmm5 + paddd 0+64(%rbp),%xmm9 + paddd 0+112(%rbp),%xmm13 + paddd .Lchacha20_consts(%rip),%xmm0 + paddd 0+48(%rbp),%xmm4 + paddd 0+64(%rbp),%xmm8 + paddd 0+96(%rbp),%xmm12 + + + pand .Lclamp(%rip),%xmm3 + movdqa %xmm3,0+0(%rbp) + movdqa %xmm7,0+16(%rbp) + + movq %r8,%r8 + call poly_hash_ad_internal + movdqu 0 + 0(%rsi),%xmm3 + movdqu 16 + 0(%rsi),%xmm7 + movdqu 32 + 0(%rsi),%xmm11 + movdqu 48 + 0(%rsi),%xmm15 + pxor %xmm3,%xmm2 + pxor %xmm7,%xmm6 + pxor %xmm11,%xmm10 + pxor %xmm14,%xmm15 + movdqu %xmm2,0 + 0(%rdi) + movdqu %xmm6,16 + 0(%rdi) + movdqu %xmm10,32 + 0(%rdi) + movdqu %xmm15,48 + 0(%rdi) + movdqu 0 + 64(%rsi),%xmm3 + movdqu 16 + 64(%rsi),%xmm7 + movdqu 32 + 64(%rsi),%xmm11 + movdqu 48 + 64(%rsi),%xmm15 + pxor %xmm3,%xmm1 + pxor %xmm7,%xmm5 + pxor %xmm11,%xmm9 + pxor %xmm13,%xmm15 + movdqu %xmm1,0 + 64(%rdi) + movdqu %xmm5,16 + 64(%rdi) + movdqu %xmm9,32 + 64(%rdi) + movdqu %xmm15,48 + 64(%rdi) + + cmpq $192,%rbx + ja .Lseal_sse_main_init + movq $128,%rcx + subq $128,%rbx + leaq 128(%rsi),%rsi + jmp .Lseal_sse_128_tail_hash +.Lseal_sse_main_init: + movdqu 0 + 128(%rsi),%xmm3 + movdqu 16 + 128(%rsi),%xmm7 + movdqu 32 + 128(%rsi),%xmm11 + movdqu 48 + 128(%rsi),%xmm15 + pxor %xmm3,%xmm0 + pxor %xmm7,%xmm4 + pxor %xmm11,%xmm8 + pxor %xmm12,%xmm15 + movdqu %xmm0,0 + 128(%rdi) + movdqu %xmm4,16 + 128(%rdi) + movdqu %xmm8,32 + 128(%rdi) + movdqu %xmm15,48 + 128(%rdi) + + movq $192,%rcx + subq $192,%rbx + leaq 192(%rsi),%rsi + movq $2,%rcx + movq $8,%r8 + cmpq $64,%rbx + jbe .Lseal_sse_tail_64 + cmpq $128,%rbx + jbe .Lseal_sse_tail_128 + cmpq $192,%rbx + jbe .Lseal_sse_tail_192 + +.Lseal_sse_main_loop: + movdqa .Lchacha20_consts(%rip),%xmm0 + movdqa 0+48(%rbp),%xmm4 + movdqa 0+64(%rbp),%xmm8 + movdqa %xmm0,%xmm1 + movdqa %xmm4,%xmm5 + movdqa %xmm8,%xmm9 + movdqa %xmm0,%xmm2 + movdqa %xmm4,%xmm6 + movdqa %xmm8,%xmm10 + movdqa %xmm0,%xmm3 + movdqa %xmm4,%xmm7 + movdqa %xmm8,%xmm11 + movdqa 0+96(%rbp),%xmm15 + paddd .Lsse_inc(%rip),%xmm15 + movdqa %xmm15,%xmm14 + paddd .Lsse_inc(%rip),%xmm14 + movdqa %xmm14,%xmm13 + paddd .Lsse_inc(%rip),%xmm13 + movdqa %xmm13,%xmm12 + paddd .Lsse_inc(%rip),%xmm12 + movdqa %xmm12,0+96(%rbp) + movdqa 
%xmm13,0+112(%rbp) + movdqa %xmm14,0+128(%rbp) + movdqa %xmm15,0+144(%rbp) + +.align 32 +.Lseal_sse_main_rounds: + movdqa %xmm8,0+80(%rbp) + movdqa .Lrol16(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 +.byte 102,69,15,56,0,248 +.byte 102,69,15,56,0,240 +.byte 102,69,15,56,0,232 +.byte 102,69,15,56,0,224 + movdqa 0+80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + pxor %xmm10,%xmm6 + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,0+80(%rbp) + movdqa %xmm7,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm4 + pxor %xmm8,%xmm4 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movdqa .Lrol8(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 +.byte 102,69,15,56,0,248 +.byte 102,69,15,56,0,240 +.byte 102,69,15,56,0,232 +.byte 102,69,15,56,0,224 + movdqa 0+80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + pxor %xmm10,%xmm6 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,0+80(%rbp) + movdqa %xmm7,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm4 + pxor %xmm8,%xmm4 + movdqa 0+80(%rbp),%xmm8 + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 +.byte 102,15,58,15,255,4 +.byte 102,69,15,58,15,219,8 +.byte 102,69,15,58,15,255,12 +.byte 102,15,58,15,246,4 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,12 +.byte 102,15,58,15,237,4 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,12 +.byte 102,15,58,15,228,4 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,12 + movdqa %xmm8,0+80(%rbp) + movdqa .Lrol16(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 +.byte 102,69,15,56,0,248 +.byte 102,69,15,56,0,240 +.byte 102,69,15,56,0,232 +.byte 102,69,15,56,0,224 + movdqa 0+80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + pxor %xmm10,%xmm6 + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,0+80(%rbp) + movdqa %xmm7,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + 
psrld $20,%xmm8 + pslld $32-20,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm4 + pxor %xmm8,%xmm4 + movdqa .Lrol8(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 +.byte 102,69,15,56,0,248 +.byte 102,69,15,56,0,240 +.byte 102,69,15,56,0,232 +.byte 102,69,15,56,0,224 + movdqa 0+80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + pxor %xmm10,%xmm6 + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,0+80(%rbp) + movdqa %xmm7,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm4 + pxor %xmm8,%xmm4 + movdqa 0+80(%rbp),%xmm8 +.byte 102,15,58,15,255,12 +.byte 102,69,15,58,15,219,8 +.byte 102,69,15,58,15,255,4 +.byte 102,15,58,15,246,12 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,4 +.byte 102,15,58,15,237,12 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,4 +.byte 102,15,58,15,228,12 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,4 + + leaq 16(%rdi),%rdi + decq %r8 + jge .Lseal_sse_main_rounds + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rdi),%rdi + decq %rcx + jg .Lseal_sse_main_rounds + paddd .Lchacha20_consts(%rip),%xmm3 + paddd 0+48(%rbp),%xmm7 + paddd 0+64(%rbp),%xmm11 + paddd 0+144(%rbp),%xmm15 + paddd .Lchacha20_consts(%rip),%xmm2 + paddd 0+48(%rbp),%xmm6 + paddd 0+64(%rbp),%xmm10 + paddd 0+128(%rbp),%xmm14 + paddd .Lchacha20_consts(%rip),%xmm1 + paddd 0+48(%rbp),%xmm5 + paddd 0+64(%rbp),%xmm9 + paddd 0+112(%rbp),%xmm13 + paddd .Lchacha20_consts(%rip),%xmm0 + paddd 0+48(%rbp),%xmm4 + paddd 0+64(%rbp),%xmm8 + paddd 0+96(%rbp),%xmm12 + + movdqa %xmm14,0+80(%rbp) + movdqa %xmm14,0+80(%rbp) + movdqu 0 + 0(%rsi),%xmm14 + pxor %xmm3,%xmm14 + movdqu %xmm14,0 + 0(%rdi) + movdqu 16 + 0(%rsi),%xmm14 + pxor %xmm7,%xmm14 + movdqu %xmm14,16 + 0(%rdi) + movdqu 32 + 0(%rsi),%xmm14 + pxor %xmm11,%xmm14 + movdqu %xmm14,32 + 0(%rdi) + movdqu 48 + 0(%rsi),%xmm14 + pxor %xmm15,%xmm14 + movdqu %xmm14,48 + 0(%rdi) + + movdqa 0+80(%rbp),%xmm14 + movdqu 0 + 64(%rsi),%xmm3 + movdqu 16 + 64(%rsi),%xmm7 + movdqu 32 + 64(%rsi),%xmm11 + movdqu 48 + 64(%rsi),%xmm15 + pxor %xmm3,%xmm2 + pxor %xmm7,%xmm6 + pxor %xmm11,%xmm10 + pxor %xmm14,%xmm15 + movdqu %xmm2,0 + 64(%rdi) + movdqu %xmm6,16 + 64(%rdi) + movdqu %xmm10,32 + 64(%rdi) + movdqu %xmm15,48 + 64(%rdi) + movdqu 0 + 128(%rsi),%xmm3 + movdqu 16 + 128(%rsi),%xmm7 + movdqu 32 + 128(%rsi),%xmm11 + movdqu 48 + 128(%rsi),%xmm15 + pxor %xmm3,%xmm1 + pxor %xmm7,%xmm5 + pxor %xmm11,%xmm9 + pxor %xmm13,%xmm15 + movdqu %xmm1,0 + 128(%rdi) + movdqu %xmm5,16 + 
128(%rdi) + movdqu %xmm9,32 + 128(%rdi) + movdqu %xmm15,48 + 128(%rdi) + + cmpq $256,%rbx + ja .Lseal_sse_main_loop_xor + + movq $192,%rcx + subq $192,%rbx + leaq 192(%rsi),%rsi + jmp .Lseal_sse_128_tail_hash +.Lseal_sse_main_loop_xor: + movdqu 0 + 192(%rsi),%xmm3 + movdqu 16 + 192(%rsi),%xmm7 + movdqu 32 + 192(%rsi),%xmm11 + movdqu 48 + 192(%rsi),%xmm15 + pxor %xmm3,%xmm0 + pxor %xmm7,%xmm4 + pxor %xmm11,%xmm8 + pxor %xmm12,%xmm15 + movdqu %xmm0,0 + 192(%rdi) + movdqu %xmm4,16 + 192(%rdi) + movdqu %xmm8,32 + 192(%rdi) + movdqu %xmm15,48 + 192(%rdi) + + leaq 256(%rsi),%rsi + subq $256,%rbx + movq $6,%rcx + movq $4,%r8 + cmpq $192,%rbx + jg .Lseal_sse_main_loop + movq %rbx,%rcx + testq %rbx,%rbx + je .Lseal_sse_128_tail_hash + movq $6,%rcx + cmpq $128,%rbx + ja .Lseal_sse_tail_192 + cmpq $64,%rbx + ja .Lseal_sse_tail_128 + +.Lseal_sse_tail_64: + movdqa .Lchacha20_consts(%rip),%xmm0 + movdqa 0+48(%rbp),%xmm4 + movdqa 0+64(%rbp),%xmm8 + movdqa 0+96(%rbp),%xmm12 + paddd .Lsse_inc(%rip),%xmm12 + movdqa %xmm12,0+96(%rbp) + +.Lseal_sse_tail_64_rounds_and_x2hash: + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rdi),%rdi +.Lseal_sse_tail_64_rounds_and_x1hash: + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,4 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,12 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,12 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,4 + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rdi),%rdi + decq %rcx + jg .Lseal_sse_tail_64_rounds_and_x2hash + decq %r8 + jge .Lseal_sse_tail_64_rounds_and_x1hash + paddd 
.Lchacha20_consts(%rip),%xmm0 + paddd 0+48(%rbp),%xmm4 + paddd 0+64(%rbp),%xmm8 + paddd 0+96(%rbp),%xmm12 + + jmp .Lseal_sse_128_tail_xor + +.Lseal_sse_tail_128: + movdqa .Lchacha20_consts(%rip),%xmm0 + movdqa 0+48(%rbp),%xmm4 + movdqa 0+64(%rbp),%xmm8 + movdqa %xmm0,%xmm1 + movdqa %xmm4,%xmm5 + movdqa %xmm8,%xmm9 + movdqa 0+96(%rbp),%xmm13 + paddd .Lsse_inc(%rip),%xmm13 + movdqa %xmm13,%xmm12 + paddd .Lsse_inc(%rip),%xmm12 + movdqa %xmm12,0+96(%rbp) + movdqa %xmm13,0+112(%rbp) + +.Lseal_sse_tail_128_rounds_and_x2hash: + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rdi),%rdi +.Lseal_sse_tail_128_rounds_and_x1hash: + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,4 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,12 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .Lrol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .Lrol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 +.byte 102,15,58,15,237,4 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,12 + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,12 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,4 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .Lrol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb 
.Lrol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 +.byte 102,15,58,15,237,12 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,4 + + leaq 16(%rdi),%rdi + decq %rcx + jg .Lseal_sse_tail_128_rounds_and_x2hash + decq %r8 + jge .Lseal_sse_tail_128_rounds_and_x1hash + paddd .Lchacha20_consts(%rip),%xmm1 + paddd 0+48(%rbp),%xmm5 + paddd 0+64(%rbp),%xmm9 + paddd 0+112(%rbp),%xmm13 + paddd .Lchacha20_consts(%rip),%xmm0 + paddd 0+48(%rbp),%xmm4 + paddd 0+64(%rbp),%xmm8 + paddd 0+96(%rbp),%xmm12 + movdqu 0 + 0(%rsi),%xmm3 + movdqu 16 + 0(%rsi),%xmm7 + movdqu 32 + 0(%rsi),%xmm11 + movdqu 48 + 0(%rsi),%xmm15 + pxor %xmm3,%xmm1 + pxor %xmm7,%xmm5 + pxor %xmm11,%xmm9 + pxor %xmm13,%xmm15 + movdqu %xmm1,0 + 0(%rdi) + movdqu %xmm5,16 + 0(%rdi) + movdqu %xmm9,32 + 0(%rdi) + movdqu %xmm15,48 + 0(%rdi) + + movq $64,%rcx + subq $64,%rbx + leaq 64(%rsi),%rsi + jmp .Lseal_sse_128_tail_hash + +.Lseal_sse_tail_192: + movdqa .Lchacha20_consts(%rip),%xmm0 + movdqa 0+48(%rbp),%xmm4 + movdqa 0+64(%rbp),%xmm8 + movdqa %xmm0,%xmm1 + movdqa %xmm4,%xmm5 + movdqa %xmm8,%xmm9 + movdqa %xmm0,%xmm2 + movdqa %xmm4,%xmm6 + movdqa %xmm8,%xmm10 + movdqa 0+96(%rbp),%xmm14 + paddd .Lsse_inc(%rip),%xmm14 + movdqa %xmm14,%xmm13 + paddd .Lsse_inc(%rip),%xmm13 + movdqa %xmm13,%xmm12 + paddd .Lsse_inc(%rip),%xmm12 + movdqa %xmm12,0+96(%rbp) + movdqa %xmm13,0+112(%rbp) + movdqa %xmm14,0+128(%rbp) + +.Lseal_sse_tail_192_rounds_and_x2hash: + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rdi),%rdi +.Lseal_sse_tail_192_rounds_and_x1hash: + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,4 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,12 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .Lrol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .Lrol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 +.byte 102,15,58,15,237,4 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,12 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .Lrol16(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm6 + pxor %xmm3,%xmm6 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .Lrol8(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm6 + pxor %xmm3,%xmm6 +.byte 
102,15,58,15,246,4 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,12 + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,12 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,4 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .Lrol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .Lrol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 +.byte 102,15,58,15,237,12 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,4 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .Lrol16(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm6 + pxor %xmm3,%xmm6 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .Lrol8(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm6 + pxor %xmm3,%xmm6 +.byte 102,15,58,15,246,12 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,4 + + leaq 16(%rdi),%rdi + decq %rcx + jg .Lseal_sse_tail_192_rounds_and_x2hash + decq %r8 + jge .Lseal_sse_tail_192_rounds_and_x1hash + paddd .Lchacha20_consts(%rip),%xmm2 + paddd 0+48(%rbp),%xmm6 + paddd 0+64(%rbp),%xmm10 + paddd 0+128(%rbp),%xmm14 + paddd .Lchacha20_consts(%rip),%xmm1 + paddd 0+48(%rbp),%xmm5 + paddd 0+64(%rbp),%xmm9 + paddd 0+112(%rbp),%xmm13 + paddd .Lchacha20_consts(%rip),%xmm0 + paddd 0+48(%rbp),%xmm4 + paddd 0+64(%rbp),%xmm8 + paddd 0+96(%rbp),%xmm12 + movdqu 0 + 0(%rsi),%xmm3 + movdqu 16 + 0(%rsi),%xmm7 + movdqu 32 + 0(%rsi),%xmm11 + movdqu 48 + 0(%rsi),%xmm15 + pxor %xmm3,%xmm2 + pxor %xmm7,%xmm6 + pxor %xmm11,%xmm10 + pxor %xmm14,%xmm15 + movdqu %xmm2,0 + 0(%rdi) + movdqu %xmm6,16 + 0(%rdi) + movdqu %xmm10,32 + 0(%rdi) + movdqu %xmm15,48 + 0(%rdi) + movdqu 0 + 64(%rsi),%xmm3 + movdqu 16 + 64(%rsi),%xmm7 + movdqu 32 + 64(%rsi),%xmm11 + movdqu 48 + 64(%rsi),%xmm15 + pxor %xmm3,%xmm1 + pxor %xmm7,%xmm5 + pxor %xmm11,%xmm9 + pxor %xmm13,%xmm15 + movdqu %xmm1,0 + 64(%rdi) + movdqu %xmm5,16 + 64(%rdi) + movdqu %xmm9,32 + 64(%rdi) + movdqu %xmm15,48 + 64(%rdi) + + movq $128,%rcx + subq $128,%rbx + leaq 128(%rsi),%rsi + +.Lseal_sse_128_tail_hash: + cmpq $16,%rcx + jb .Lseal_sse_128_tail_xor + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq 
%r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + subq $16,%rcx + leaq 16(%rdi),%rdi + jmp .Lseal_sse_128_tail_hash + +.Lseal_sse_128_tail_xor: + cmpq $16,%rbx + jb .Lseal_sse_tail_16 + subq $16,%rbx + + movdqu 0(%rsi),%xmm3 + pxor %xmm3,%xmm0 + movdqu %xmm0,0(%rdi) + + addq 0(%rdi),%r10 + adcq 8(%rdi),%r11 + adcq $1,%r12 + leaq 16(%rsi),%rsi + leaq 16(%rdi),%rdi + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + + movdqa %xmm4,%xmm0 + movdqa %xmm8,%xmm4 + movdqa %xmm12,%xmm8 + movdqa %xmm1,%xmm12 + movdqa %xmm5,%xmm1 + movdqa %xmm9,%xmm5 + movdqa %xmm13,%xmm9 + jmp .Lseal_sse_128_tail_xor + +.Lseal_sse_tail_16: + testq %rbx,%rbx + jz .Lprocess_blocks_of_extra_in + + movq %rbx,%r8 + movq %rbx,%rcx + leaq -1(%rsi,%rbx,1),%rsi + pxor %xmm15,%xmm15 +.Lseal_sse_tail_16_compose: + pslldq $1,%xmm15 + pinsrb $0,(%rsi),%xmm15 + leaq -1(%rsi),%rsi + decq %rcx + jne .Lseal_sse_tail_16_compose + + + pxor %xmm0,%xmm15 + + + movq %rbx,%rcx + movdqu %xmm15,%xmm0 +.Lseal_sse_tail_16_extract: + pextrb $0,%xmm0,(%rdi) + psrldq $1,%xmm0 + addq $1,%rdi + subq $1,%rcx + jnz .Lseal_sse_tail_16_extract + + + + + + + + + movq 288 + 0 + 32(%rsp),%r9 + movq 56(%r9),%r14 + movq 48(%r9),%r13 + testq %r14,%r14 + jz .Lprocess_partial_block + + movq $16,%r15 + subq %rbx,%r15 + cmpq %r15,%r14 + + jge .Lload_extra_in + movq %r14,%r15 + +.Lload_extra_in: + + + leaq -1(%r13,%r15,1),%rsi + + + addq %r15,%r13 + subq %r15,%r14 + movq %r13,48(%r9) + movq %r14,56(%r9) + + + + addq %r15,%r8 + + + pxor %xmm11,%xmm11 +.Lload_extra_load_loop: + pslldq $1,%xmm11 + pinsrb $0,(%rsi),%xmm11 + leaq -1(%rsi),%rsi + subq $1,%r15 + jnz .Lload_extra_load_loop + + + + + movq %rbx,%r15 + +.Lload_extra_shift_loop: + pslldq $1,%xmm11 + subq $1,%r15 + jnz .Lload_extra_shift_loop + + + + + leaq .Land_masks(%rip),%r15 + shlq $4,%rbx + pand -16(%r15,%rbx,1),%xmm15 + + + por %xmm11,%xmm15 + + + +.byte 102,77,15,126,253 + pextrq $1,%xmm15,%r14 + addq %r13,%r10 + adcq %r14,%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + 
adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + +.Lprocess_blocks_of_extra_in: + + movq 288+32+0 (%rsp),%r9 + movq 48(%r9),%rsi + movq 56(%r9),%r8 + movq %r8,%rcx + shrq $4,%r8 + +.Lprocess_extra_hash_loop: + jz process_extra_in_trailer + addq 0+0(%rsi),%r10 + adcq 8+0(%rsi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rsi),%rsi + subq $1,%r8 + jmp .Lprocess_extra_hash_loop +process_extra_in_trailer: + andq $15,%rcx + movq %rcx,%rbx + jz .Ldo_length_block + leaq -1(%rsi,%rcx,1),%rsi + +.Lprocess_extra_in_trailer_load: + pslldq $1,%xmm15 + pinsrb $0,(%rsi),%xmm15 + leaq -1(%rsi),%rsi + subq $1,%rcx + jnz .Lprocess_extra_in_trailer_load + +.Lprocess_partial_block: + + leaq .Land_masks(%rip),%r15 + shlq $4,%rbx + pand -16(%r15,%rbx,1),%xmm15 +.byte 102,77,15,126,253 + pextrq $1,%xmm15,%r14 + addq %r13,%r10 + adcq %r14,%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + +.Ldo_length_block: + addq 0+0+32(%rbp),%r10 + adcq 8+0+32(%rbp),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + + movq %r10,%r13 + movq %r11,%r14 + movq %r12,%r15 + subq $-5,%r10 + sbbq $-1,%r11 + sbbq $3,%r12 + cmovcq %r13,%r10 + cmovcq %r14,%r11 + cmovcq %r15,%r12 + + addq 0+0+16(%rbp),%r10 + adcq 8+0+16(%rbp),%r11 + +.cfi_remember_state + addq $288 + 0 + 32,%rsp +.cfi_adjust_cfa_offset -(288 + 32) + + popq %r9 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r9 + movq %r10,(%r9) + movq %r11,8(%r9) + popq %r15 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r15 + popq %r14 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r14 + popq %r13 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r13 + popq %r12 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r12 + popq %rbx +.cfi_adjust_cfa_offset -8 +.cfi_restore %rbx + popq %rbp +.cfi_adjust_cfa_offset -8 +.cfi_restore %rbp + 
ret + +.Lseal_sse_128: +.cfi_restore_state + movdqu .Lchacha20_consts(%rip),%xmm0 + movdqa %xmm0,%xmm1 + movdqa %xmm0,%xmm2 + movdqu 0(%r9),%xmm4 + movdqa %xmm4,%xmm5 + movdqa %xmm4,%xmm6 + movdqu 16(%r9),%xmm8 + movdqa %xmm8,%xmm9 + movdqa %xmm8,%xmm10 + movdqu 32(%r9),%xmm14 + movdqa %xmm14,%xmm12 + paddd .Lsse_inc(%rip),%xmm12 + movdqa %xmm12,%xmm13 + paddd .Lsse_inc(%rip),%xmm13 + movdqa %xmm4,%xmm7 + movdqa %xmm8,%xmm11 + movdqa %xmm12,%xmm15 + movq $10,%r10 + +.Lseal_sse_128_rounds: + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,4 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,12 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .Lrol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .Lrol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 +.byte 102,15,58,15,237,4 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,12 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .Lrol16(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm6 + pxor %xmm3,%xmm6 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .Lrol8(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm6 + pxor %xmm3,%xmm6 +.byte 102,15,58,15,246,4 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,12 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,12 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,4 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .Lrol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .Lrol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 +.byte 102,15,58,15,237,12 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,4 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .Lrol16(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm6 + pxor %xmm3,%xmm6 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .Lrol8(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm6 + pxor %xmm3,%xmm6 +.byte 102,15,58,15,246,12 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,4 + + decq %r10 + jnz .Lseal_sse_128_rounds + paddd .Lchacha20_consts(%rip),%xmm0 + paddd .Lchacha20_consts(%rip),%xmm1 + paddd .Lchacha20_consts(%rip),%xmm2 + paddd %xmm7,%xmm4 + paddd %xmm7,%xmm5 + paddd %xmm7,%xmm6 + paddd %xmm11,%xmm8 + paddd %xmm11,%xmm9 + paddd %xmm15,%xmm12 + paddd .Lsse_inc(%rip),%xmm15 + paddd 
%xmm15,%xmm13 + + pand .Lclamp(%rip),%xmm2 + movdqa %xmm2,0+0(%rbp) + movdqa %xmm6,0+16(%rbp) + + movq %r8,%r8 + call poly_hash_ad_internal + jmp .Lseal_sse_128_tail_xor +.size chacha20_poly1305_seal_sse41, .-chacha20_poly1305_seal_sse41 +.cfi_endproc + + +.globl chacha20_poly1305_open_avx2 +.hidden chacha20_poly1305_open_avx2 +.type chacha20_poly1305_open_avx2,@function +.align 64 +chacha20_poly1305_open_avx2: +.cfi_startproc +_CET_ENDBR + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + + + pushq %r9 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r9,-64 + subq $288 + 0 + 32,%rsp +.cfi_adjust_cfa_offset 288 + 32 + + leaq 32(%rsp),%rbp + andq $-32,%rbp + + movq %rdx,%rbx + movq %r8,0+0+32(%rbp) + movq %rbx,8+0+32(%rbp) + + vzeroupper + vmovdqa .Lchacha20_consts(%rip),%ymm0 + vbroadcasti128 0(%r9),%ymm4 + vbroadcasti128 16(%r9),%ymm8 + vbroadcasti128 32(%r9),%ymm12 + vpaddd .Lavx2_init(%rip),%ymm12,%ymm12 + cmpq $192,%rbx + jbe .Lopen_avx2_192 + cmpq $320,%rbx + jbe .Lopen_avx2_320 + + vmovdqa %ymm4,0+64(%rbp) + vmovdqa %ymm8,0+96(%rbp) + vmovdqa %ymm12,0+160(%rbp) + movq $10,%r10 +.Lopen_avx2_init_rounds: + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm4,%ymm4,%ymm4 + + decq %r10 + jne .Lopen_avx2_init_rounds + vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 0+64(%rbp),%ymm4,%ymm4 + vpaddd 0+96(%rbp),%ymm8,%ymm8 + vpaddd 0+160(%rbp),%ymm12,%ymm12 + + vperm2i128 $0x02,%ymm0,%ymm4,%ymm3 + + vpand .Lclamp(%rip),%ymm3,%ymm3 + vmovdqa %ymm3,0+0(%rbp) + + vperm2i128 $0x13,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm4 + + movq %r8,%r8 + call poly_hash_ad_internal + + xorq %rcx,%rcx +.Lopen_avx2_init_hash: + addq 0+0(%rsi,%rcx,1),%r10 + adcq 8+0(%rsi,%rcx,1),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq 
%r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + addq $16,%rcx + cmpq $64,%rcx + jne .Lopen_avx2_init_hash + + vpxor 0(%rsi),%ymm0,%ymm0 + vpxor 32(%rsi),%ymm4,%ymm4 + + vmovdqu %ymm0,0(%rdi) + vmovdqu %ymm4,32(%rdi) + leaq 64(%rsi),%rsi + leaq 64(%rdi),%rdi + subq $64,%rbx +.Lopen_avx2_main_loop: + + cmpq $512,%rbx + jb .Lopen_avx2_main_loop_done + vmovdqa .Lchacha20_consts(%rip),%ymm0 + vmovdqa 0+64(%rbp),%ymm4 + vmovdqa 0+96(%rbp),%ymm8 + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm8,%ymm9 + vmovdqa %ymm0,%ymm2 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm8,%ymm10 + vmovdqa %ymm0,%ymm3 + vmovdqa %ymm4,%ymm7 + vmovdqa %ymm8,%ymm11 + vmovdqa .Lavx2_inc(%rip),%ymm12 + vpaddd 0+160(%rbp),%ymm12,%ymm15 + vpaddd %ymm15,%ymm12,%ymm14 + vpaddd %ymm14,%ymm12,%ymm13 + vpaddd %ymm13,%ymm12,%ymm12 + vmovdqa %ymm15,0+256(%rbp) + vmovdqa %ymm14,0+224(%rbp) + vmovdqa %ymm13,0+192(%rbp) + vmovdqa %ymm12,0+160(%rbp) + + xorq %rcx,%rcx +.Lopen_avx2_main_loop_rounds: + addq 0+0(%rsi,%rcx,1),%r10 + adcq 8+0(%rsi,%rcx,1),%r11 + adcq $1,%r12 + vmovdqa %ymm8,0+128(%rbp) + vmovdqa .Lrol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + addq %rax,%r15 + adcq %rdx,%r9 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa .Lrol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + addq 0+16(%rsi,%rcx,1),%r10 + adcq 8+16(%rsi,%rcx,1),%r11 + adcq $1,%r12 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $25,%ymm7,%ymm8 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + vpslld $32-25,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld 
$25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 0+128(%rbp),%ymm8 + vpalignr $4,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $12,%ymm15,%ymm15,%ymm15 + vpalignr $4,%ymm6,%ymm6,%ymm6 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $12,%ymm14,%ymm14,%ymm14 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + vpalignr $4,%ymm5,%ymm5,%ymm5 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vmovdqa %ymm8,0+128(%rbp) + vmovdqa .Lrol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + addq %rax,%r15 + adcq %rdx,%r9 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + addq 0+32(%rsi,%rcx,1),%r10 + adcq 8+32(%rsi,%rcx,1),%r11 + adcq $1,%r12 + + leaq 48(%rcx),%rcx + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa .Lrol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $25,%ymm7,%ymm8 + vpslld $32-25,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + addq %rax,%r15 + adcq %rdx,%r9 + vpsrld $25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 0+128(%rbp),%ymm8 + vpalignr $12,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $4,%ymm15,%ymm15,%ymm15 + vpalignr $12,%ymm6,%ymm6,%ymm6 + vpalignr 
$8,%ymm10,%ymm10,%ymm10 + vpalignr $4,%ymm14,%ymm14,%ymm14 + vpalignr $12,%ymm5,%ymm5,%ymm5 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpalignr $8,%ymm8,%ymm8,%ymm8 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vpalignr $4,%ymm12,%ymm12,%ymm12 + + cmpq $60*8,%rcx + jne .Lopen_avx2_main_loop_rounds + vpaddd .Lchacha20_consts(%rip),%ymm3,%ymm3 + vpaddd 0+64(%rbp),%ymm7,%ymm7 + vpaddd 0+96(%rbp),%ymm11,%ymm11 + vpaddd 0+256(%rbp),%ymm15,%ymm15 + vpaddd .Lchacha20_consts(%rip),%ymm2,%ymm2 + vpaddd 0+64(%rbp),%ymm6,%ymm6 + vpaddd 0+96(%rbp),%ymm10,%ymm10 + vpaddd 0+224(%rbp),%ymm14,%ymm14 + vpaddd .Lchacha20_consts(%rip),%ymm1,%ymm1 + vpaddd 0+64(%rbp),%ymm5,%ymm5 + vpaddd 0+96(%rbp),%ymm9,%ymm9 + vpaddd 0+192(%rbp),%ymm13,%ymm13 + vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 0+64(%rbp),%ymm4,%ymm4 + vpaddd 0+96(%rbp),%ymm8,%ymm8 + vpaddd 0+160(%rbp),%ymm12,%ymm12 + + vmovdqa %ymm0,0+128(%rbp) + addq 0+60*8(%rsi),%r10 + adcq 8+60*8(%rsi),%r11 + adcq $1,%r12 + vperm2i128 $0x02,%ymm3,%ymm7,%ymm0 + vperm2i128 $0x13,%ymm3,%ymm7,%ymm7 + vperm2i128 $0x02,%ymm11,%ymm15,%ymm3 + vperm2i128 $0x13,%ymm11,%ymm15,%ymm11 + vpxor 0+0(%rsi),%ymm0,%ymm0 + vpxor 32+0(%rsi),%ymm3,%ymm3 + vpxor 64+0(%rsi),%ymm7,%ymm7 + vpxor 96+0(%rsi),%ymm11,%ymm11 + vmovdqu %ymm0,0+0(%rdi) + vmovdqu %ymm3,32+0(%rdi) + vmovdqu %ymm7,64+0(%rdi) + vmovdqu %ymm11,96+0(%rdi) + + vmovdqa 0+128(%rbp),%ymm0 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vperm2i128 $0x02,%ymm2,%ymm6,%ymm3 + vperm2i128 $0x13,%ymm2,%ymm6,%ymm6 + vperm2i128 $0x02,%ymm10,%ymm14,%ymm2 + vperm2i128 $0x13,%ymm10,%ymm14,%ymm10 + vpxor 0+128(%rsi),%ymm3,%ymm3 + vpxor 32+128(%rsi),%ymm2,%ymm2 + vpxor 64+128(%rsi),%ymm6,%ymm6 + vpxor 96+128(%rsi),%ymm10,%ymm10 + vmovdqu %ymm3,0+128(%rdi) + vmovdqu %ymm2,32+128(%rdi) + vmovdqu %ymm6,64+128(%rdi) + vmovdqu %ymm10,96+128(%rdi) + addq 0+60*8+16(%rsi),%r10 + adcq 8+60*8+16(%rsi),%r11 + adcq $1,%r12 + vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 + vpxor 0+256(%rsi),%ymm3,%ymm3 + vpxor 32+256(%rsi),%ymm1,%ymm1 + vpxor 64+256(%rsi),%ymm5,%ymm5 + vpxor 96+256(%rsi),%ymm9,%ymm9 + vmovdqu %ymm3,0+256(%rdi) + vmovdqu %ymm1,32+256(%rdi) + vmovdqu %ymm5,64+256(%rdi) + vmovdqu %ymm9,96+256(%rdi) + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq 
%r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm3 + vperm2i128 $0x13,%ymm0,%ymm4,%ymm4 + vperm2i128 $0x02,%ymm8,%ymm12,%ymm0 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm8 + vpxor 0+384(%rsi),%ymm3,%ymm3 + vpxor 32+384(%rsi),%ymm0,%ymm0 + vpxor 64+384(%rsi),%ymm4,%ymm4 + vpxor 96+384(%rsi),%ymm8,%ymm8 + vmovdqu %ymm3,0+384(%rdi) + vmovdqu %ymm0,32+384(%rdi) + vmovdqu %ymm4,64+384(%rdi) + vmovdqu %ymm8,96+384(%rdi) + + leaq 512(%rsi),%rsi + leaq 512(%rdi),%rdi + subq $512,%rbx + jmp .Lopen_avx2_main_loop +.Lopen_avx2_main_loop_done: + testq %rbx,%rbx + vzeroupper + je .Lopen_sse_finalize + + cmpq $384,%rbx + ja .Lopen_avx2_tail_512 + cmpq $256,%rbx + ja .Lopen_avx2_tail_384 + cmpq $128,%rbx + ja .Lopen_avx2_tail_256 + vmovdqa .Lchacha20_consts(%rip),%ymm0 + vmovdqa 0+64(%rbp),%ymm4 + vmovdqa 0+96(%rbp),%ymm8 + vmovdqa .Lavx2_inc(%rip),%ymm12 + vpaddd 0+160(%rbp),%ymm12,%ymm12 + vmovdqa %ymm12,0+160(%rbp) + + xorq %r8,%r8 + movq %rbx,%rcx + andq $-16,%rcx + testq %rcx,%rcx + je .Lopen_avx2_tail_128_rounds +.Lopen_avx2_tail_128_rounds_and_x1hash: + addq 0+0(%rsi,%r8,1),%r10 + adcq 8+0(%rsi,%r8,1),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + +.Lopen_avx2_tail_128_rounds: + addq $16,%r8 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm4,%ymm4,%ymm4 + + cmpq %rcx,%r8 + jb .Lopen_avx2_tail_128_rounds_and_x1hash + cmpq $160,%r8 + jne .Lopen_avx2_tail_128_rounds + vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 0+64(%rbp),%ymm4,%ymm4 + vpaddd 0+96(%rbp),%ymm8,%ymm8 + vpaddd 0+160(%rbp),%ymm12,%ymm12 + vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 + vmovdqa %ymm3,%ymm8 + + jmp .Lopen_avx2_tail_128_xor + 
+.Lopen_avx2_tail_256: + vmovdqa .Lchacha20_consts(%rip),%ymm0 + vmovdqa 0+64(%rbp),%ymm4 + vmovdqa 0+96(%rbp),%ymm8 + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm8,%ymm9 + vmovdqa .Lavx2_inc(%rip),%ymm12 + vpaddd 0+160(%rbp),%ymm12,%ymm13 + vpaddd %ymm13,%ymm12,%ymm12 + vmovdqa %ymm12,0+160(%rbp) + vmovdqa %ymm13,0+192(%rbp) + + movq %rbx,0+128(%rbp) + movq %rbx,%rcx + subq $128,%rcx + shrq $4,%rcx + movq $10,%r8 + cmpq $10,%rcx + cmovgq %r8,%rcx + movq %rsi,%rbx + xorq %r8,%r8 +.Lopen_avx2_tail_256_rounds_and_x1hash: + addq 0+0(%rbx),%r10 + adcq 8+0(%rbx),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rbx),%rbx +.Lopen_avx2_tail_256_rounds: + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .Lrol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .Lrol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm5,%ymm5,%ymm5 + + incq %r8 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .Lrol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .Lrol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm5,%ymm5,%ymm5 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb .Lrol16(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpsrld $20,%ymm6,%ymm3 + 
vpslld $12,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb .Lrol8(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpslld $7,%ymm6,%ymm3 + vpsrld $25,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpalignr $4,%ymm14,%ymm14,%ymm14 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $12,%ymm6,%ymm6,%ymm6 + + cmpq %rcx,%r8 + jb .Lopen_avx2_tail_256_rounds_and_x1hash + cmpq $10,%r8 + jne .Lopen_avx2_tail_256_rounds + movq %rbx,%r8 + subq %rsi,%rbx + movq %rbx,%rcx + movq 0+128(%rbp),%rbx +.Lopen_avx2_tail_256_hash: + addq $16,%rcx + cmpq %rbx,%rcx + jg .Lopen_avx2_tail_256_done + addq 0+0(%r8),%r10 + adcq 8+0(%r8),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%r8),%r8 + jmp .Lopen_avx2_tail_256_hash +.Lopen_avx2_tail_256_done: + vpaddd .Lchacha20_consts(%rip),%ymm1,%ymm1 + vpaddd 0+64(%rbp),%ymm5,%ymm5 + vpaddd 0+96(%rbp),%ymm9,%ymm9 + vpaddd 0+192(%rbp),%ymm13,%ymm13 + vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 0+64(%rbp),%ymm4,%ymm4 + vpaddd 0+96(%rbp),%ymm8,%ymm8 + vpaddd 0+160(%rbp),%ymm12,%ymm12 + vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 + vpxor 0+0(%rsi),%ymm3,%ymm3 + vpxor 32+0(%rsi),%ymm1,%ymm1 + vpxor 64+0(%rsi),%ymm5,%ymm5 + vpxor 96+0(%rsi),%ymm9,%ymm9 + vmovdqu %ymm3,0+0(%rdi) + vmovdqu %ymm1,32+0(%rdi) + vmovdqu %ymm5,64+0(%rdi) + vmovdqu %ymm9,96+0(%rdi) + vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 + vmovdqa %ymm3,%ymm8 + + leaq 128(%rsi),%rsi + leaq 128(%rdi),%rdi + subq $128,%rbx + jmp .Lopen_avx2_tail_128_xor + +.Lopen_avx2_tail_384: + vmovdqa .Lchacha20_consts(%rip),%ymm0 + vmovdqa 0+64(%rbp),%ymm4 + vmovdqa 0+96(%rbp),%ymm8 + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm8,%ymm9 + vmovdqa %ymm0,%ymm2 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm8,%ymm10 + vmovdqa .Lavx2_inc(%rip),%ymm12 + vpaddd 0+160(%rbp),%ymm12,%ymm14 + vpaddd %ymm14,%ymm12,%ymm13 + vpaddd %ymm13,%ymm12,%ymm12 + vmovdqa %ymm12,0+160(%rbp) + vmovdqa %ymm13,0+192(%rbp) + vmovdqa %ymm14,0+224(%rbp) + + movq %rbx,0+128(%rbp) + movq %rbx,%rcx + subq $256,%rcx + shrq $4,%rcx + addq $6,%rcx + movq $10,%r8 + cmpq $10,%rcx + cmovgq %r8,%rcx + movq %rsi,%rbx + xorq %r8,%r8 +.Lopen_avx2_tail_384_rounds_and_x2hash: + addq 0+0(%rbx),%r10 + adcq 8+0(%rbx),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 
16(%rbx),%rbx +.Lopen_avx2_tail_384_rounds_and_x1hash: + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb .Lrol16(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpsrld $20,%ymm6,%ymm3 + vpslld $12,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb .Lrol8(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpslld $7,%ymm6,%ymm3 + vpsrld $25,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpalignr $12,%ymm14,%ymm14,%ymm14 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $4,%ymm6,%ymm6,%ymm6 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .Lrol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .Lrol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm5,%ymm5,%ymm5 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm4,%ymm4,%ymm4 + addq 0+0(%rbx),%r10 + adcq 8+0(%rbx),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rbx),%rbx + incq %r8 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb .Lrol16(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpsrld $20,%ymm6,%ymm3 + vpslld $12,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb .Lrol8(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpslld $7,%ymm6,%ymm3 + vpsrld $25,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpalignr $4,%ymm14,%ymm14,%ymm14 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $12,%ymm6,%ymm6,%ymm6 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .Lrol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .Lrol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm5,%ymm5,%ymm5 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor 
%ymm0,%ymm12,%ymm12 + vpshufb .Lrol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm4,%ymm4,%ymm4 + + cmpq %rcx,%r8 + jb .Lopen_avx2_tail_384_rounds_and_x2hash + cmpq $10,%r8 + jne .Lopen_avx2_tail_384_rounds_and_x1hash + movq %rbx,%r8 + subq %rsi,%rbx + movq %rbx,%rcx + movq 0+128(%rbp),%rbx +.Lopen_avx2_384_tail_hash: + addq $16,%rcx + cmpq %rbx,%rcx + jg .Lopen_avx2_384_tail_done + addq 0+0(%r8),%r10 + adcq 8+0(%r8),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%r8),%r8 + jmp .Lopen_avx2_384_tail_hash +.Lopen_avx2_384_tail_done: + vpaddd .Lchacha20_consts(%rip),%ymm2,%ymm2 + vpaddd 0+64(%rbp),%ymm6,%ymm6 + vpaddd 0+96(%rbp),%ymm10,%ymm10 + vpaddd 0+224(%rbp),%ymm14,%ymm14 + vpaddd .Lchacha20_consts(%rip),%ymm1,%ymm1 + vpaddd 0+64(%rbp),%ymm5,%ymm5 + vpaddd 0+96(%rbp),%ymm9,%ymm9 + vpaddd 0+192(%rbp),%ymm13,%ymm13 + vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 0+64(%rbp),%ymm4,%ymm4 + vpaddd 0+96(%rbp),%ymm8,%ymm8 + vpaddd 0+160(%rbp),%ymm12,%ymm12 + vperm2i128 $0x02,%ymm2,%ymm6,%ymm3 + vperm2i128 $0x13,%ymm2,%ymm6,%ymm6 + vperm2i128 $0x02,%ymm10,%ymm14,%ymm2 + vperm2i128 $0x13,%ymm10,%ymm14,%ymm10 + vpxor 0+0(%rsi),%ymm3,%ymm3 + vpxor 32+0(%rsi),%ymm2,%ymm2 + vpxor 64+0(%rsi),%ymm6,%ymm6 + vpxor 96+0(%rsi),%ymm10,%ymm10 + vmovdqu %ymm3,0+0(%rdi) + vmovdqu %ymm2,32+0(%rdi) + vmovdqu %ymm6,64+0(%rdi) + vmovdqu %ymm10,96+0(%rdi) + vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 + vpxor 0+128(%rsi),%ymm3,%ymm3 + vpxor 32+128(%rsi),%ymm1,%ymm1 + vpxor 64+128(%rsi),%ymm5,%ymm5 + vpxor 96+128(%rsi),%ymm9,%ymm9 + vmovdqu %ymm3,0+128(%rdi) + vmovdqu %ymm1,32+128(%rdi) + vmovdqu %ymm5,64+128(%rdi) + vmovdqu %ymm9,96+128(%rdi) + vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 + vmovdqa %ymm3,%ymm8 + + leaq 256(%rsi),%rsi + leaq 256(%rdi),%rdi + subq $256,%rbx + jmp .Lopen_avx2_tail_128_xor + +.Lopen_avx2_tail_512: + vmovdqa .Lchacha20_consts(%rip),%ymm0 + vmovdqa 0+64(%rbp),%ymm4 + vmovdqa 0+96(%rbp),%ymm8 + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm8,%ymm9 + vmovdqa %ymm0,%ymm2 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm8,%ymm10 + vmovdqa %ymm0,%ymm3 + vmovdqa %ymm4,%ymm7 + vmovdqa %ymm8,%ymm11 + vmovdqa .Lavx2_inc(%rip),%ymm12 + vpaddd 0+160(%rbp),%ymm12,%ymm15 + vpaddd %ymm15,%ymm12,%ymm14 + vpaddd %ymm14,%ymm12,%ymm13 + vpaddd %ymm13,%ymm12,%ymm12 + vmovdqa %ymm15,0+256(%rbp) + vmovdqa %ymm14,0+224(%rbp) + vmovdqa %ymm13,0+192(%rbp) + vmovdqa %ymm12,0+160(%rbp) + + 
xorq %rcx,%rcx + movq %rsi,%r8 +.Lopen_avx2_tail_512_rounds_and_x2hash: + addq 0+0(%r8),%r10 + adcq 8+0(%r8),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%r8),%r8 +.Lopen_avx2_tail_512_rounds_and_x1hash: + vmovdqa %ymm8,0+128(%rbp) + vmovdqa .Lrol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa .Lrol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + addq 0+0(%r8),%r10 + adcq 8+0(%r8),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $25,%ymm7,%ymm8 + vpslld $32-25,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 0+128(%rbp),%ymm8 + vpalignr $4,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $12,%ymm15,%ymm15,%ymm15 + vpalignr $4,%ymm6,%ymm6,%ymm6 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $12,%ymm14,%ymm14,%ymm14 
+ vpalignr $4,%ymm5,%ymm5,%ymm5 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vmovdqa %ymm8,0+128(%rbp) + vmovdqa .Lrol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + addq 0+16(%r8),%r10 + adcq 8+16(%r8),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 32(%r8),%r8 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa .Lrol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $25,%ymm7,%ymm8 + vpslld $32-25,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 0+128(%rbp),%ymm8 + vpalignr $12,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $4,%ymm15,%ymm15,%ymm15 + vpalignr $12,%ymm6,%ymm6,%ymm6 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $4,%ymm14,%ymm14,%ymm14 + vpalignr $12,%ymm5,%ymm5,%ymm5 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm12,%ymm12,%ymm12 + + incq %rcx + cmpq $4,%rcx + jl .Lopen_avx2_tail_512_rounds_and_x2hash + cmpq $10,%rcx + jne .Lopen_avx2_tail_512_rounds_and_x1hash + movq %rbx,%rcx + subq $384,%rcx + andq $-16,%rcx +.Lopen_avx2_tail_512_hash: + testq %rcx,%rcx + je .Lopen_avx2_tail_512_done + addq 0+0(%r8),%r10 + adcq 8+0(%r8),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq 
%r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%r8),%r8 + subq $16,%rcx + jmp .Lopen_avx2_tail_512_hash +.Lopen_avx2_tail_512_done: + vpaddd .Lchacha20_consts(%rip),%ymm3,%ymm3 + vpaddd 0+64(%rbp),%ymm7,%ymm7 + vpaddd 0+96(%rbp),%ymm11,%ymm11 + vpaddd 0+256(%rbp),%ymm15,%ymm15 + vpaddd .Lchacha20_consts(%rip),%ymm2,%ymm2 + vpaddd 0+64(%rbp),%ymm6,%ymm6 + vpaddd 0+96(%rbp),%ymm10,%ymm10 + vpaddd 0+224(%rbp),%ymm14,%ymm14 + vpaddd .Lchacha20_consts(%rip),%ymm1,%ymm1 + vpaddd 0+64(%rbp),%ymm5,%ymm5 + vpaddd 0+96(%rbp),%ymm9,%ymm9 + vpaddd 0+192(%rbp),%ymm13,%ymm13 + vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 0+64(%rbp),%ymm4,%ymm4 + vpaddd 0+96(%rbp),%ymm8,%ymm8 + vpaddd 0+160(%rbp),%ymm12,%ymm12 + + vmovdqa %ymm0,0+128(%rbp) + vperm2i128 $0x02,%ymm3,%ymm7,%ymm0 + vperm2i128 $0x13,%ymm3,%ymm7,%ymm7 + vperm2i128 $0x02,%ymm11,%ymm15,%ymm3 + vperm2i128 $0x13,%ymm11,%ymm15,%ymm11 + vpxor 0+0(%rsi),%ymm0,%ymm0 + vpxor 32+0(%rsi),%ymm3,%ymm3 + vpxor 64+0(%rsi),%ymm7,%ymm7 + vpxor 96+0(%rsi),%ymm11,%ymm11 + vmovdqu %ymm0,0+0(%rdi) + vmovdqu %ymm3,32+0(%rdi) + vmovdqu %ymm7,64+0(%rdi) + vmovdqu %ymm11,96+0(%rdi) + + vmovdqa 0+128(%rbp),%ymm0 + vperm2i128 $0x02,%ymm2,%ymm6,%ymm3 + vperm2i128 $0x13,%ymm2,%ymm6,%ymm6 + vperm2i128 $0x02,%ymm10,%ymm14,%ymm2 + vperm2i128 $0x13,%ymm10,%ymm14,%ymm10 + vpxor 0+128(%rsi),%ymm3,%ymm3 + vpxor 32+128(%rsi),%ymm2,%ymm2 + vpxor 64+128(%rsi),%ymm6,%ymm6 + vpxor 96+128(%rsi),%ymm10,%ymm10 + vmovdqu %ymm3,0+128(%rdi) + vmovdqu %ymm2,32+128(%rdi) + vmovdqu %ymm6,64+128(%rdi) + vmovdqu %ymm10,96+128(%rdi) + vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 + vpxor 0+256(%rsi),%ymm3,%ymm3 + vpxor 32+256(%rsi),%ymm1,%ymm1 + vpxor 64+256(%rsi),%ymm5,%ymm5 + vpxor 96+256(%rsi),%ymm9,%ymm9 + vmovdqu %ymm3,0+256(%rdi) + vmovdqu %ymm1,32+256(%rdi) + vmovdqu %ymm5,64+256(%rdi) + vmovdqu %ymm9,96+256(%rdi) + vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 + vmovdqa %ymm3,%ymm8 + + leaq 384(%rsi),%rsi + leaq 384(%rdi),%rdi + subq $384,%rbx +.Lopen_avx2_tail_128_xor: + cmpq $32,%rbx + jb .Lopen_avx2_tail_32_xor + subq $32,%rbx + vpxor (%rsi),%ymm0,%ymm0 + vmovdqu %ymm0,(%rdi) + leaq 32(%rsi),%rsi + leaq 32(%rdi),%rdi + vmovdqa %ymm4,%ymm0 + vmovdqa %ymm8,%ymm4 + vmovdqa %ymm12,%ymm8 + jmp .Lopen_avx2_tail_128_xor +.Lopen_avx2_tail_32_xor: + cmpq $16,%rbx + vmovdqa %xmm0,%xmm1 + jb .Lopen_avx2_exit + subq $16,%rbx + + vpxor (%rsi),%xmm0,%xmm1 + vmovdqu %xmm1,(%rdi) + leaq 16(%rsi),%rsi + leaq 16(%rdi),%rdi + vperm2i128 $0x11,%ymm0,%ymm0,%ymm0 + vmovdqa %xmm0,%xmm1 +.Lopen_avx2_exit: + vzeroupper + jmp .Lopen_sse_tail_16 + +.Lopen_avx2_192: + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm0,%ymm2 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm8,%ymm9 + vmovdqa %ymm8,%ymm10 + vpaddd .Lavx2_inc(%rip),%ymm12,%ymm13 + vmovdqa %ymm12,%ymm11 + vmovdqa %ymm13,%ymm15 + movq $10,%r10 +.Lopen_avx2_192_rounds: + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + 
vpshufb .Lrol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .Lrol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .Lrol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm5,%ymm5,%ymm5 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .Lrol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .Lrol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm5,%ymm5,%ymm5 + + decq %r10 + jne .Lopen_avx2_192_rounds + vpaddd %ymm2,%ymm0,%ymm0 + vpaddd %ymm2,%ymm1,%ymm1 + vpaddd %ymm6,%ymm4,%ymm4 + vpaddd %ymm6,%ymm5,%ymm5 + vpaddd %ymm10,%ymm8,%ymm8 + vpaddd %ymm10,%ymm9,%ymm9 + vpaddd %ymm11,%ymm12,%ymm12 + vpaddd %ymm15,%ymm13,%ymm13 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm3 + + vpand .Lclamp(%rip),%ymm3,%ymm3 + vmovdqa %ymm3,0+0(%rbp) + + vperm2i128 $0x13,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x02,%ymm1,%ymm5,%ymm8 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm12 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm5 +.Lopen_avx2_short: + movq %r8,%r8 + call poly_hash_ad_internal +.Lopen_avx2_short_hash_and_xor_loop: + cmpq $32,%rbx + jb .Lopen_avx2_short_tail_32 + subq $32,%rbx + addq 0+0(%rsi),%r10 + adcq 8+0(%rsi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq 
$0,%r12 + addq 0+16(%rsi),%r10 + adcq 8+16(%rsi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + + vpxor (%rsi),%ymm0,%ymm0 + vmovdqu %ymm0,(%rdi) + leaq 32(%rsi),%rsi + leaq 32(%rdi),%rdi + + vmovdqa %ymm4,%ymm0 + vmovdqa %ymm8,%ymm4 + vmovdqa %ymm12,%ymm8 + vmovdqa %ymm1,%ymm12 + vmovdqa %ymm5,%ymm1 + vmovdqa %ymm9,%ymm5 + vmovdqa %ymm13,%ymm9 + vmovdqa %ymm2,%ymm13 + vmovdqa %ymm6,%ymm2 + jmp .Lopen_avx2_short_hash_and_xor_loop +.Lopen_avx2_short_tail_32: + cmpq $16,%rbx + vmovdqa %xmm0,%xmm1 + jb .Lopen_avx2_short_tail_32_exit + subq $16,%rbx + addq 0+0(%rsi),%r10 + adcq 8+0(%rsi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + vpxor (%rsi),%xmm0,%xmm3 + vmovdqu %xmm3,(%rdi) + leaq 16(%rsi),%rsi + leaq 16(%rdi),%rdi + vextracti128 $1,%ymm0,%xmm1 +.Lopen_avx2_short_tail_32_exit: + vzeroupper + jmp .Lopen_sse_tail_16 + +.Lopen_avx2_320: + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm0,%ymm2 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm8,%ymm9 + vmovdqa %ymm8,%ymm10 + vpaddd .Lavx2_inc(%rip),%ymm12,%ymm13 + vpaddd .Lavx2_inc(%rip),%ymm13,%ymm14 + vmovdqa %ymm4,%ymm7 + vmovdqa %ymm8,%ymm11 + vmovdqa %ymm12,0+160(%rbp) + vmovdqa %ymm13,0+192(%rbp) + vmovdqa %ymm14,0+224(%rbp) + movq $10,%r10 +.Lopen_avx2_320_rounds: + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .Lrol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .Lrol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm5,%ymm5,%ymm5 + vpaddd %ymm6,%ymm2,%ymm2 + 
vpxor %ymm2,%ymm14,%ymm14 + vpshufb .Lrol16(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpsrld $20,%ymm6,%ymm3 + vpslld $12,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb .Lrol8(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpslld $7,%ymm6,%ymm3 + vpsrld $25,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpalignr $12,%ymm14,%ymm14,%ymm14 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $4,%ymm6,%ymm6,%ymm6 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .Lrol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .Lrol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm5,%ymm5,%ymm5 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb .Lrol16(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpsrld $20,%ymm6,%ymm3 + vpslld $12,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb .Lrol8(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpslld $7,%ymm6,%ymm3 + vpsrld $25,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpalignr $4,%ymm14,%ymm14,%ymm14 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $12,%ymm6,%ymm6,%ymm6 + + decq %r10 + jne .Lopen_avx2_320_rounds + vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0 + vpaddd .Lchacha20_consts(%rip),%ymm1,%ymm1 + vpaddd .Lchacha20_consts(%rip),%ymm2,%ymm2 + vpaddd %ymm7,%ymm4,%ymm4 + vpaddd %ymm7,%ymm5,%ymm5 + vpaddd %ymm7,%ymm6,%ymm6 + vpaddd %ymm11,%ymm8,%ymm8 + vpaddd %ymm11,%ymm9,%ymm9 + vpaddd %ymm11,%ymm10,%ymm10 + vpaddd 0+160(%rbp),%ymm12,%ymm12 + vpaddd 0+192(%rbp),%ymm13,%ymm13 + vpaddd 0+224(%rbp),%ymm14,%ymm14 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm3 + + vpand .Lclamp(%rip),%ymm3,%ymm3 + vmovdqa %ymm3,0+0(%rbp) + + vperm2i128 $0x13,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x02,%ymm1,%ymm5,%ymm8 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm12 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm5 + vperm2i128 $0x02,%ymm2,%ymm6,%ymm9 + vperm2i128 $0x02,%ymm10,%ymm14,%ymm13 + vperm2i128 $0x13,%ymm2,%ymm6,%ymm2 + vperm2i128 $0x13,%ymm10,%ymm14,%ymm6 + jmp .Lopen_avx2_short +.size chacha20_poly1305_open_avx2, .-chacha20_poly1305_open_avx2 +.cfi_endproc + + +.globl chacha20_poly1305_seal_avx2 +.hidden chacha20_poly1305_seal_avx2 +.type chacha20_poly1305_seal_avx2,@function +.align 64 +chacha20_poly1305_seal_avx2: +.cfi_startproc +_CET_ENDBR + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 
+.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + + + pushq %r9 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r9,-64 + subq $288 + 0 + 32,%rsp +.cfi_adjust_cfa_offset 288 + 32 + leaq 32(%rsp),%rbp + andq $-32,%rbp + + movq 56(%r9),%rbx + addq %rdx,%rbx + movq %r8,0+0+32(%rbp) + movq %rbx,8+0+32(%rbp) + movq %rdx,%rbx + + vzeroupper + vmovdqa .Lchacha20_consts(%rip),%ymm0 + vbroadcasti128 0(%r9),%ymm4 + vbroadcasti128 16(%r9),%ymm8 + vbroadcasti128 32(%r9),%ymm12 + vpaddd .Lavx2_init(%rip),%ymm12,%ymm12 + cmpq $192,%rbx + jbe .Lseal_avx2_192 + cmpq $320,%rbx + jbe .Lseal_avx2_320 + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm0,%ymm2 + vmovdqa %ymm0,%ymm3 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm4,%ymm7 + vmovdqa %ymm4,0+64(%rbp) + vmovdqa %ymm8,%ymm9 + vmovdqa %ymm8,%ymm10 + vmovdqa %ymm8,%ymm11 + vmovdqa %ymm8,0+96(%rbp) + vmovdqa %ymm12,%ymm15 + vpaddd .Lavx2_inc(%rip),%ymm15,%ymm14 + vpaddd .Lavx2_inc(%rip),%ymm14,%ymm13 + vpaddd .Lavx2_inc(%rip),%ymm13,%ymm12 + vmovdqa %ymm12,0+160(%rbp) + vmovdqa %ymm13,0+192(%rbp) + vmovdqa %ymm14,0+224(%rbp) + vmovdqa %ymm15,0+256(%rbp) + movq $10,%r10 +.Lseal_avx2_init_rounds: + vmovdqa %ymm8,0+128(%rbp) + vmovdqa .Lrol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa .Lrol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $25,%ymm7,%ymm8 + vpslld $32-25,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 0+128(%rbp),%ymm8 + vpalignr $4,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $12,%ymm15,%ymm15,%ymm15 + vpalignr $4,%ymm6,%ymm6,%ymm6 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $12,%ymm14,%ymm14,%ymm14 + vpalignr $4,%ymm5,%ymm5,%ymm5 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr 
$4,%ymm4,%ymm4,%ymm4 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vmovdqa %ymm8,0+128(%rbp) + vmovdqa .Lrol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa .Lrol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $25,%ymm7,%ymm8 + vpslld $32-25,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 0+128(%rbp),%ymm8 + vpalignr $12,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $4,%ymm15,%ymm15,%ymm15 + vpalignr $12,%ymm6,%ymm6,%ymm6 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $4,%ymm14,%ymm14,%ymm14 + vpalignr $12,%ymm5,%ymm5,%ymm5 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm12,%ymm12,%ymm12 + + decq %r10 + jnz .Lseal_avx2_init_rounds + vpaddd .Lchacha20_consts(%rip),%ymm3,%ymm3 + vpaddd 0+64(%rbp),%ymm7,%ymm7 + vpaddd 0+96(%rbp),%ymm11,%ymm11 + vpaddd 0+256(%rbp),%ymm15,%ymm15 + vpaddd .Lchacha20_consts(%rip),%ymm2,%ymm2 + vpaddd 0+64(%rbp),%ymm6,%ymm6 + vpaddd 0+96(%rbp),%ymm10,%ymm10 + vpaddd 0+224(%rbp),%ymm14,%ymm14 + vpaddd .Lchacha20_consts(%rip),%ymm1,%ymm1 + vpaddd 0+64(%rbp),%ymm5,%ymm5 + vpaddd 0+96(%rbp),%ymm9,%ymm9 + vpaddd 0+192(%rbp),%ymm13,%ymm13 + vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 0+64(%rbp),%ymm4,%ymm4 + vpaddd 0+96(%rbp),%ymm8,%ymm8 + vpaddd 0+160(%rbp),%ymm12,%ymm12 + + vperm2i128 $0x13,%ymm11,%ymm15,%ymm11 + vperm2i128 $0x02,%ymm3,%ymm7,%ymm15 + vperm2i128 $0x13,%ymm3,%ymm7,%ymm3 + vpand .Lclamp(%rip),%ymm15,%ymm15 + vmovdqa %ymm15,0+0(%rbp) + movq %r8,%r8 + call poly_hash_ad_internal + + vpxor 0(%rsi),%ymm3,%ymm3 + vpxor 32(%rsi),%ymm11,%ymm11 + vmovdqu %ymm3,0(%rdi) + vmovdqu %ymm11,32(%rdi) + vperm2i128 $0x02,%ymm2,%ymm6,%ymm15 + vperm2i128 $0x13,%ymm2,%ymm6,%ymm6 + vperm2i128 $0x02,%ymm10,%ymm14,%ymm2 + vperm2i128 $0x13,%ymm10,%ymm14,%ymm10 + vpxor 
0+64(%rsi),%ymm15,%ymm15 + vpxor 32+64(%rsi),%ymm2,%ymm2 + vpxor 64+64(%rsi),%ymm6,%ymm6 + vpxor 96+64(%rsi),%ymm10,%ymm10 + vmovdqu %ymm15,0+64(%rdi) + vmovdqu %ymm2,32+64(%rdi) + vmovdqu %ymm6,64+64(%rdi) + vmovdqu %ymm10,96+64(%rdi) + vperm2i128 $0x02,%ymm1,%ymm5,%ymm15 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 + vpxor 0+192(%rsi),%ymm15,%ymm15 + vpxor 32+192(%rsi),%ymm1,%ymm1 + vpxor 64+192(%rsi),%ymm5,%ymm5 + vpxor 96+192(%rsi),%ymm9,%ymm9 + vmovdqu %ymm15,0+192(%rdi) + vmovdqu %ymm1,32+192(%rdi) + vmovdqu %ymm5,64+192(%rdi) + vmovdqu %ymm9,96+192(%rdi) + vperm2i128 $0x13,%ymm0,%ymm4,%ymm15 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 + vmovdqa %ymm15,%ymm8 + + leaq 320(%rsi),%rsi + subq $320,%rbx + movq $320,%rcx + cmpq $128,%rbx + jbe .Lseal_avx2_short_hash_remainder + vpxor 0(%rsi),%ymm0,%ymm0 + vpxor 32(%rsi),%ymm4,%ymm4 + vpxor 64(%rsi),%ymm8,%ymm8 + vpxor 96(%rsi),%ymm12,%ymm12 + vmovdqu %ymm0,320(%rdi) + vmovdqu %ymm4,352(%rdi) + vmovdqu %ymm8,384(%rdi) + vmovdqu %ymm12,416(%rdi) + leaq 128(%rsi),%rsi + subq $128,%rbx + movq $8,%rcx + movq $2,%r8 + cmpq $128,%rbx + jbe .Lseal_avx2_tail_128 + cmpq $256,%rbx + jbe .Lseal_avx2_tail_256 + cmpq $384,%rbx + jbe .Lseal_avx2_tail_384 + cmpq $512,%rbx + jbe .Lseal_avx2_tail_512 + vmovdqa .Lchacha20_consts(%rip),%ymm0 + vmovdqa 0+64(%rbp),%ymm4 + vmovdqa 0+96(%rbp),%ymm8 + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm8,%ymm9 + vmovdqa %ymm0,%ymm2 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm8,%ymm10 + vmovdqa %ymm0,%ymm3 + vmovdqa %ymm4,%ymm7 + vmovdqa %ymm8,%ymm11 + vmovdqa .Lavx2_inc(%rip),%ymm12 + vpaddd 0+160(%rbp),%ymm12,%ymm15 + vpaddd %ymm15,%ymm12,%ymm14 + vpaddd %ymm14,%ymm12,%ymm13 + vpaddd %ymm13,%ymm12,%ymm12 + vmovdqa %ymm15,0+256(%rbp) + vmovdqa %ymm14,0+224(%rbp) + vmovdqa %ymm13,0+192(%rbp) + vmovdqa %ymm12,0+160(%rbp) + vmovdqa %ymm8,0+128(%rbp) + vmovdqa .Lrol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa .Lrol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) 
+ vpsrld $25,%ymm7,%ymm8 + vpslld $32-25,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 0+128(%rbp),%ymm8 + vpalignr $4,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $12,%ymm15,%ymm15,%ymm15 + vpalignr $4,%ymm6,%ymm6,%ymm6 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $12,%ymm14,%ymm14,%ymm14 + vpalignr $4,%ymm5,%ymm5,%ymm5 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vmovdqa %ymm8,0+128(%rbp) + vmovdqa .Lrol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa .Lrol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $25,%ymm7,%ymm8 + vpslld $32-25,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 0+128(%rbp),%ymm8 + vpalignr $12,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $4,%ymm15,%ymm15,%ymm15 + vpalignr $12,%ymm6,%ymm6,%ymm6 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $4,%ymm14,%ymm14,%ymm14 + vpalignr $12,%ymm5,%ymm5,%ymm5 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vmovdqa %ymm8,0+128(%rbp) + vmovdqa .Lrol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + 
vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa .Lrol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + + subq $16,%rdi + movq $9,%rcx + jmp .Lseal_avx2_main_loop_rounds_entry +.align 32 +.Lseal_avx2_main_loop: + vmovdqa .Lchacha20_consts(%rip),%ymm0 + vmovdqa 0+64(%rbp),%ymm4 + vmovdqa 0+96(%rbp),%ymm8 + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm8,%ymm9 + vmovdqa %ymm0,%ymm2 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm8,%ymm10 + vmovdqa %ymm0,%ymm3 + vmovdqa %ymm4,%ymm7 + vmovdqa %ymm8,%ymm11 + vmovdqa .Lavx2_inc(%rip),%ymm12 + vpaddd 0+160(%rbp),%ymm12,%ymm15 + vpaddd %ymm15,%ymm12,%ymm14 + vpaddd %ymm14,%ymm12,%ymm13 + vpaddd %ymm13,%ymm12,%ymm12 + vmovdqa %ymm15,0+256(%rbp) + vmovdqa %ymm14,0+224(%rbp) + vmovdqa %ymm13,0+192(%rbp) + vmovdqa %ymm12,0+160(%rbp) + + movq $10,%rcx +.align 32 +.Lseal_avx2_main_loop_rounds: + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + vmovdqa %ymm8,0+128(%rbp) + vmovdqa .Lrol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + addq %rax,%r15 + adcq %rdx,%r9 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa .Lrol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + +.Lseal_avx2_main_loop_rounds_entry: + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + addq 0+16(%rdi),%r10 + adcq 8+16(%rdi),%r11 + adcq $1,%r12 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 
0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $25,%ymm7,%ymm8 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + vpslld $32-25,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 0+128(%rbp),%ymm8 + vpalignr $4,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $12,%ymm15,%ymm15,%ymm15 + vpalignr $4,%ymm6,%ymm6,%ymm6 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $12,%ymm14,%ymm14,%ymm14 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + vpalignr $4,%ymm5,%ymm5,%ymm5 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vmovdqa %ymm8,0+128(%rbp) + vmovdqa .Lrol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + addq %rax,%r15 + adcq %rdx,%r9 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + addq 0+32(%rdi),%r10 + adcq 8+32(%rdi),%r11 + adcq $1,%r12 + + leaq 48(%rdi),%rdi + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa .Lrol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $25,%ymm7,%ymm8 + vpslld $32-25,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor 
%ymm8,%ymm6,%ymm6 + addq %rax,%r15 + adcq %rdx,%r9 + vpsrld $25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 0+128(%rbp),%ymm8 + vpalignr $12,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $4,%ymm15,%ymm15,%ymm15 + vpalignr $12,%ymm6,%ymm6,%ymm6 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $4,%ymm14,%ymm14,%ymm14 + vpalignr $12,%ymm5,%ymm5,%ymm5 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpalignr $8,%ymm8,%ymm8,%ymm8 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vpalignr $4,%ymm12,%ymm12,%ymm12 + + decq %rcx + jne .Lseal_avx2_main_loop_rounds + vpaddd .Lchacha20_consts(%rip),%ymm3,%ymm3 + vpaddd 0+64(%rbp),%ymm7,%ymm7 + vpaddd 0+96(%rbp),%ymm11,%ymm11 + vpaddd 0+256(%rbp),%ymm15,%ymm15 + vpaddd .Lchacha20_consts(%rip),%ymm2,%ymm2 + vpaddd 0+64(%rbp),%ymm6,%ymm6 + vpaddd 0+96(%rbp),%ymm10,%ymm10 + vpaddd 0+224(%rbp),%ymm14,%ymm14 + vpaddd .Lchacha20_consts(%rip),%ymm1,%ymm1 + vpaddd 0+64(%rbp),%ymm5,%ymm5 + vpaddd 0+96(%rbp),%ymm9,%ymm9 + vpaddd 0+192(%rbp),%ymm13,%ymm13 + vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 0+64(%rbp),%ymm4,%ymm4 + vpaddd 0+96(%rbp),%ymm8,%ymm8 + vpaddd 0+160(%rbp),%ymm12,%ymm12 + + vmovdqa %ymm0,0+128(%rbp) + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + addq 0+16(%rdi),%r10 + adcq 8+16(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 32(%rdi),%rdi + vperm2i128 $0x02,%ymm3,%ymm7,%ymm0 + vperm2i128 $0x13,%ymm3,%ymm7,%ymm7 + vperm2i128 $0x02,%ymm11,%ymm15,%ymm3 + vperm2i128 $0x13,%ymm11,%ymm15,%ymm11 + vpxor 0+0(%rsi),%ymm0,%ymm0 + vpxor 32+0(%rsi),%ymm3,%ymm3 + vpxor 64+0(%rsi),%ymm7,%ymm7 + vpxor 96+0(%rsi),%ymm11,%ymm11 + vmovdqu %ymm0,0+0(%rdi) + vmovdqu %ymm3,32+0(%rdi) + vmovdqu %ymm7,64+0(%rdi) + vmovdqu %ymm11,96+0(%rdi) + + vmovdqa 0+128(%rbp),%ymm0 + vperm2i128 $0x02,%ymm2,%ymm6,%ymm3 + vperm2i128 $0x13,%ymm2,%ymm6,%ymm6 + vperm2i128 $0x02,%ymm10,%ymm14,%ymm2 + vperm2i128 $0x13,%ymm10,%ymm14,%ymm10 + vpxor 0+128(%rsi),%ymm3,%ymm3 + vpxor 32+128(%rsi),%ymm2,%ymm2 + vpxor 64+128(%rsi),%ymm6,%ymm6 + vpxor 96+128(%rsi),%ymm10,%ymm10 + vmovdqu %ymm3,0+128(%rdi) + vmovdqu %ymm2,32+128(%rdi) + vmovdqu %ymm6,64+128(%rdi) + vmovdqu 
%ymm10,96+128(%rdi) + vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 + vpxor 0+256(%rsi),%ymm3,%ymm3 + vpxor 32+256(%rsi),%ymm1,%ymm1 + vpxor 64+256(%rsi),%ymm5,%ymm5 + vpxor 96+256(%rsi),%ymm9,%ymm9 + vmovdqu %ymm3,0+256(%rdi) + vmovdqu %ymm1,32+256(%rdi) + vmovdqu %ymm5,64+256(%rdi) + vmovdqu %ymm9,96+256(%rdi) + vperm2i128 $0x02,%ymm0,%ymm4,%ymm3 + vperm2i128 $0x13,%ymm0,%ymm4,%ymm4 + vperm2i128 $0x02,%ymm8,%ymm12,%ymm0 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm8 + vpxor 0+384(%rsi),%ymm3,%ymm3 + vpxor 32+384(%rsi),%ymm0,%ymm0 + vpxor 64+384(%rsi),%ymm4,%ymm4 + vpxor 96+384(%rsi),%ymm8,%ymm8 + vmovdqu %ymm3,0+384(%rdi) + vmovdqu %ymm0,32+384(%rdi) + vmovdqu %ymm4,64+384(%rdi) + vmovdqu %ymm8,96+384(%rdi) + + leaq 512(%rsi),%rsi + subq $512,%rbx + cmpq $512,%rbx + jg .Lseal_avx2_main_loop + + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + addq 0+16(%rdi),%r10 + adcq 8+16(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 32(%rdi),%rdi + movq $10,%rcx + xorq %r8,%r8 + + cmpq $384,%rbx + ja .Lseal_avx2_tail_512 + cmpq $256,%rbx + ja .Lseal_avx2_tail_384 + cmpq $128,%rbx + ja .Lseal_avx2_tail_256 + +.Lseal_avx2_tail_128: + vmovdqa .Lchacha20_consts(%rip),%ymm0 + vmovdqa 0+64(%rbp),%ymm4 + vmovdqa 0+96(%rbp),%ymm8 + vmovdqa .Lavx2_inc(%rip),%ymm12 + vpaddd 0+160(%rbp),%ymm12,%ymm12 + vmovdqa %ymm12,0+160(%rbp) + +.Lseal_avx2_tail_128_rounds_and_3xhash: + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rdi),%rdi +.Lseal_avx2_tail_128_rounds_and_2xhash: + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor 
%ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm4,%ymm4,%ymm4 + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm4,%ymm4,%ymm4 + addq 0+16(%rdi),%r10 + adcq 8+16(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 32(%rdi),%rdi + decq %rcx + jg .Lseal_avx2_tail_128_rounds_and_3xhash + decq %r8 + jge .Lseal_avx2_tail_128_rounds_and_2xhash + vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 0+64(%rbp),%ymm4,%ymm4 + vpaddd 0+96(%rbp),%ymm8,%ymm8 + vpaddd 0+160(%rbp),%ymm12,%ymm12 + vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 + vmovdqa %ymm3,%ymm8 + + jmp .Lseal_avx2_short_loop + +.Lseal_avx2_tail_256: + vmovdqa .Lchacha20_consts(%rip),%ymm0 + vmovdqa 0+64(%rbp),%ymm4 + vmovdqa 0+96(%rbp),%ymm8 + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm8,%ymm9 + vmovdqa .Lavx2_inc(%rip),%ymm12 + vpaddd 0+160(%rbp),%ymm12,%ymm13 + vpaddd %ymm13,%ymm12,%ymm12 + vmovdqa %ymm12,0+160(%rbp) + vmovdqa %ymm13,0+192(%rbp) + +.Lseal_avx2_tail_256_rounds_and_3xhash: + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rdi),%rdi +.Lseal_avx2_tail_256_rounds_and_2xhash: + vpaddd 
%ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .Lrol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .Lrol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm5,%ymm5,%ymm5 + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .Lrol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .Lrol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm5,%ymm5,%ymm5 + addq 0+16(%rdi),%r10 + adcq 8+16(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 
32(%rdi),%rdi + decq %rcx + jg .Lseal_avx2_tail_256_rounds_and_3xhash + decq %r8 + jge .Lseal_avx2_tail_256_rounds_and_2xhash + vpaddd .Lchacha20_consts(%rip),%ymm1,%ymm1 + vpaddd 0+64(%rbp),%ymm5,%ymm5 + vpaddd 0+96(%rbp),%ymm9,%ymm9 + vpaddd 0+192(%rbp),%ymm13,%ymm13 + vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 0+64(%rbp),%ymm4,%ymm4 + vpaddd 0+96(%rbp),%ymm8,%ymm8 + vpaddd 0+160(%rbp),%ymm12,%ymm12 + vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 + vpxor 0+0(%rsi),%ymm3,%ymm3 + vpxor 32+0(%rsi),%ymm1,%ymm1 + vpxor 64+0(%rsi),%ymm5,%ymm5 + vpxor 96+0(%rsi),%ymm9,%ymm9 + vmovdqu %ymm3,0+0(%rdi) + vmovdqu %ymm1,32+0(%rdi) + vmovdqu %ymm5,64+0(%rdi) + vmovdqu %ymm9,96+0(%rdi) + vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 + vmovdqa %ymm3,%ymm8 + + movq $128,%rcx + leaq 128(%rsi),%rsi + subq $128,%rbx + jmp .Lseal_avx2_short_hash_remainder + +.Lseal_avx2_tail_384: + vmovdqa .Lchacha20_consts(%rip),%ymm0 + vmovdqa 0+64(%rbp),%ymm4 + vmovdqa 0+96(%rbp),%ymm8 + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm8,%ymm9 + vmovdqa %ymm0,%ymm2 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm8,%ymm10 + vmovdqa .Lavx2_inc(%rip),%ymm12 + vpaddd 0+160(%rbp),%ymm12,%ymm14 + vpaddd %ymm14,%ymm12,%ymm13 + vpaddd %ymm13,%ymm12,%ymm12 + vmovdqa %ymm12,0+160(%rbp) + vmovdqa %ymm13,0+192(%rbp) + vmovdqa %ymm14,0+224(%rbp) + +.Lseal_avx2_tail_384_rounds_and_3xhash: + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rdi),%rdi +.Lseal_avx2_tail_384_rounds_and_2xhash: + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .Lrol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .Lrol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm5,%ymm5,%ymm5 + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq 
%rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb .Lrol16(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpsrld $20,%ymm6,%ymm3 + vpslld $12,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb .Lrol8(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpslld $7,%ymm6,%ymm3 + vpsrld $25,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpalignr $12,%ymm14,%ymm14,%ymm14 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $4,%ymm6,%ymm6,%ymm6 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm4,%ymm4,%ymm4 + addq 0+16(%rdi),%r10 + adcq 8+16(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .Lrol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .Lrol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm5,%ymm5,%ymm5 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb .Lrol16(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpsrld $20,%ymm6,%ymm3 + vpslld $12,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb .Lrol8(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpslld $7,%ymm6,%ymm3 + vpsrld $25,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpalignr $4,%ymm14,%ymm14,%ymm14 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $12,%ymm6,%ymm6,%ymm6 + + leaq 32(%rdi),%rdi + decq %rcx + jg .Lseal_avx2_tail_384_rounds_and_3xhash + decq %r8 + jge 
.Lseal_avx2_tail_384_rounds_and_2xhash + vpaddd .Lchacha20_consts(%rip),%ymm2,%ymm2 + vpaddd 0+64(%rbp),%ymm6,%ymm6 + vpaddd 0+96(%rbp),%ymm10,%ymm10 + vpaddd 0+224(%rbp),%ymm14,%ymm14 + vpaddd .Lchacha20_consts(%rip),%ymm1,%ymm1 + vpaddd 0+64(%rbp),%ymm5,%ymm5 + vpaddd 0+96(%rbp),%ymm9,%ymm9 + vpaddd 0+192(%rbp),%ymm13,%ymm13 + vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 0+64(%rbp),%ymm4,%ymm4 + vpaddd 0+96(%rbp),%ymm8,%ymm8 + vpaddd 0+160(%rbp),%ymm12,%ymm12 + vperm2i128 $0x02,%ymm2,%ymm6,%ymm3 + vperm2i128 $0x13,%ymm2,%ymm6,%ymm6 + vperm2i128 $0x02,%ymm10,%ymm14,%ymm2 + vperm2i128 $0x13,%ymm10,%ymm14,%ymm10 + vpxor 0+0(%rsi),%ymm3,%ymm3 + vpxor 32+0(%rsi),%ymm2,%ymm2 + vpxor 64+0(%rsi),%ymm6,%ymm6 + vpxor 96+0(%rsi),%ymm10,%ymm10 + vmovdqu %ymm3,0+0(%rdi) + vmovdqu %ymm2,32+0(%rdi) + vmovdqu %ymm6,64+0(%rdi) + vmovdqu %ymm10,96+0(%rdi) + vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 + vpxor 0+128(%rsi),%ymm3,%ymm3 + vpxor 32+128(%rsi),%ymm1,%ymm1 + vpxor 64+128(%rsi),%ymm5,%ymm5 + vpxor 96+128(%rsi),%ymm9,%ymm9 + vmovdqu %ymm3,0+128(%rdi) + vmovdqu %ymm1,32+128(%rdi) + vmovdqu %ymm5,64+128(%rdi) + vmovdqu %ymm9,96+128(%rdi) + vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 + vmovdqa %ymm3,%ymm8 + + movq $256,%rcx + leaq 256(%rsi),%rsi + subq $256,%rbx + jmp .Lseal_avx2_short_hash_remainder + +.Lseal_avx2_tail_512: + vmovdqa .Lchacha20_consts(%rip),%ymm0 + vmovdqa 0+64(%rbp),%ymm4 + vmovdqa 0+96(%rbp),%ymm8 + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm8,%ymm9 + vmovdqa %ymm0,%ymm2 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm8,%ymm10 + vmovdqa %ymm0,%ymm3 + vmovdqa %ymm4,%ymm7 + vmovdqa %ymm8,%ymm11 + vmovdqa .Lavx2_inc(%rip),%ymm12 + vpaddd 0+160(%rbp),%ymm12,%ymm15 + vpaddd %ymm15,%ymm12,%ymm14 + vpaddd %ymm14,%ymm12,%ymm13 + vpaddd %ymm13,%ymm12,%ymm12 + vmovdqa %ymm15,0+256(%rbp) + vmovdqa %ymm14,0+224(%rbp) + vmovdqa %ymm13,0+192(%rbp) + vmovdqa %ymm12,0+160(%rbp) + +.Lseal_avx2_tail_512_rounds_and_3xhash: + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rdi),%rdi +.Lseal_avx2_tail_512_rounds_and_2xhash: + vmovdqa %ymm8,0+128(%rbp) + vmovdqa .Lrol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld 
$20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa .Lrol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $25,%ymm7,%ymm8 + vpslld $32-25,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + vpsrld $25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 0+128(%rbp),%ymm8 + vpalignr $4,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $12,%ymm15,%ymm15,%ymm15 + vpalignr $4,%ymm6,%ymm6,%ymm6 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $12,%ymm14,%ymm14,%ymm14 + vpalignr $4,%ymm5,%ymm5,%ymm5 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $4,%ymm4,%ymm4,%ymm4 + addq %rax,%r15 + adcq %rdx,%r9 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vmovdqa %ymm8,0+128(%rbp) + vmovdqa .Lrol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa .Lrol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + addq 0+16(%rdi),%r10 + adcq 8+16(%rdi),%r11 + adcq $1,%r12 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb 
%ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $25,%ymm7,%ymm8 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + vpslld $32-25,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 0+128(%rbp),%ymm8 + vpalignr $12,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $4,%ymm15,%ymm15,%ymm15 + vpalignr $12,%ymm6,%ymm6,%ymm6 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $4,%ymm14,%ymm14,%ymm14 + vpalignr $12,%ymm5,%ymm5,%ymm5 + vpalignr $8,%ymm9,%ymm9,%ymm9 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm12,%ymm12,%ymm12 + + + + + + + + + + + + + + + + + addq %rax,%r15 + adcq %rdx,%r9 + + + + + + + + + + + + + + + + + + + + + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 32(%rdi),%rdi + decq %rcx + jg .Lseal_avx2_tail_512_rounds_and_3xhash + decq %r8 + jge .Lseal_avx2_tail_512_rounds_and_2xhash + vpaddd .Lchacha20_consts(%rip),%ymm3,%ymm3 + vpaddd 0+64(%rbp),%ymm7,%ymm7 + vpaddd 0+96(%rbp),%ymm11,%ymm11 + vpaddd 0+256(%rbp),%ymm15,%ymm15 + vpaddd .Lchacha20_consts(%rip),%ymm2,%ymm2 + vpaddd 0+64(%rbp),%ymm6,%ymm6 + vpaddd 0+96(%rbp),%ymm10,%ymm10 + vpaddd 0+224(%rbp),%ymm14,%ymm14 + vpaddd .Lchacha20_consts(%rip),%ymm1,%ymm1 + vpaddd 0+64(%rbp),%ymm5,%ymm5 + vpaddd 0+96(%rbp),%ymm9,%ymm9 + vpaddd 0+192(%rbp),%ymm13,%ymm13 + vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 0+64(%rbp),%ymm4,%ymm4 + vpaddd 0+96(%rbp),%ymm8,%ymm8 + vpaddd 0+160(%rbp),%ymm12,%ymm12 + + vmovdqa %ymm0,0+128(%rbp) + vperm2i128 $0x02,%ymm3,%ymm7,%ymm0 + vperm2i128 $0x13,%ymm3,%ymm7,%ymm7 + vperm2i128 $0x02,%ymm11,%ymm15,%ymm3 + vperm2i128 $0x13,%ymm11,%ymm15,%ymm11 + vpxor 0+0(%rsi),%ymm0,%ymm0 + vpxor 32+0(%rsi),%ymm3,%ymm3 + vpxor 64+0(%rsi),%ymm7,%ymm7 + vpxor 96+0(%rsi),%ymm11,%ymm11 + vmovdqu %ymm0,0+0(%rdi) + vmovdqu %ymm3,32+0(%rdi) + vmovdqu %ymm7,64+0(%rdi) + vmovdqu %ymm11,96+0(%rdi) + + vmovdqa 0+128(%rbp),%ymm0 + vperm2i128 $0x02,%ymm2,%ymm6,%ymm3 + vperm2i128 $0x13,%ymm2,%ymm6,%ymm6 + vperm2i128 $0x02,%ymm10,%ymm14,%ymm2 + vperm2i128 $0x13,%ymm10,%ymm14,%ymm10 + vpxor 0+128(%rsi),%ymm3,%ymm3 + vpxor 32+128(%rsi),%ymm2,%ymm2 + vpxor 64+128(%rsi),%ymm6,%ymm6 + vpxor 96+128(%rsi),%ymm10,%ymm10 + vmovdqu %ymm3,0+128(%rdi) + vmovdqu %ymm2,32+128(%rdi) + vmovdqu %ymm6,64+128(%rdi) + vmovdqu %ymm10,96+128(%rdi) + vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 + vpxor 0+256(%rsi),%ymm3,%ymm3 + vpxor 32+256(%rsi),%ymm1,%ymm1 + vpxor 64+256(%rsi),%ymm5,%ymm5 + vpxor 96+256(%rsi),%ymm9,%ymm9 + vmovdqu 
%ymm3,0+256(%rdi) + vmovdqu %ymm1,32+256(%rdi) + vmovdqu %ymm5,64+256(%rdi) + vmovdqu %ymm9,96+256(%rdi) + vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 + vmovdqa %ymm3,%ymm8 + + movq $384,%rcx + leaq 384(%rsi),%rsi + subq $384,%rbx + jmp .Lseal_avx2_short_hash_remainder + +.Lseal_avx2_320: + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm0,%ymm2 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm8,%ymm9 + vmovdqa %ymm8,%ymm10 + vpaddd .Lavx2_inc(%rip),%ymm12,%ymm13 + vpaddd .Lavx2_inc(%rip),%ymm13,%ymm14 + vmovdqa %ymm4,%ymm7 + vmovdqa %ymm8,%ymm11 + vmovdqa %ymm12,0+160(%rbp) + vmovdqa %ymm13,0+192(%rbp) + vmovdqa %ymm14,0+224(%rbp) + movq $10,%r10 +.Lseal_avx2_320_rounds: + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .Lrol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .Lrol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm5,%ymm5,%ymm5 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb .Lrol16(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpsrld $20,%ymm6,%ymm3 + vpslld $12,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb .Lrol8(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpslld $7,%ymm6,%ymm3 + vpsrld $25,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpalignr $12,%ymm14,%ymm14,%ymm14 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $4,%ymm6,%ymm6,%ymm6 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .Lrol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .Lrol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm5,%ymm5,%ymm5 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor 
%ymm2,%ymm14,%ymm14 + vpshufb .Lrol16(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpsrld $20,%ymm6,%ymm3 + vpslld $12,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb .Lrol8(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpslld $7,%ymm6,%ymm3 + vpsrld $25,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpalignr $4,%ymm14,%ymm14,%ymm14 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $12,%ymm6,%ymm6,%ymm6 + + decq %r10 + jne .Lseal_avx2_320_rounds + vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0 + vpaddd .Lchacha20_consts(%rip),%ymm1,%ymm1 + vpaddd .Lchacha20_consts(%rip),%ymm2,%ymm2 + vpaddd %ymm7,%ymm4,%ymm4 + vpaddd %ymm7,%ymm5,%ymm5 + vpaddd %ymm7,%ymm6,%ymm6 + vpaddd %ymm11,%ymm8,%ymm8 + vpaddd %ymm11,%ymm9,%ymm9 + vpaddd %ymm11,%ymm10,%ymm10 + vpaddd 0+160(%rbp),%ymm12,%ymm12 + vpaddd 0+192(%rbp),%ymm13,%ymm13 + vpaddd 0+224(%rbp),%ymm14,%ymm14 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm3 + + vpand .Lclamp(%rip),%ymm3,%ymm3 + vmovdqa %ymm3,0+0(%rbp) + + vperm2i128 $0x13,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x02,%ymm1,%ymm5,%ymm8 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm12 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm5 + vperm2i128 $0x02,%ymm2,%ymm6,%ymm9 + vperm2i128 $0x02,%ymm10,%ymm14,%ymm13 + vperm2i128 $0x13,%ymm2,%ymm6,%ymm2 + vperm2i128 $0x13,%ymm10,%ymm14,%ymm6 + jmp .Lseal_avx2_short + +.Lseal_avx2_192: + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm0,%ymm2 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm8,%ymm9 + vmovdqa %ymm8,%ymm10 + vpaddd .Lavx2_inc(%rip),%ymm12,%ymm13 + vmovdqa %ymm12,%ymm11 + vmovdqa %ymm13,%ymm15 + movq $10,%r10 +.Lseal_avx2_192_rounds: + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .Lrol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .Lrol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm5,%ymm5,%ymm5 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .Lrol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld 
$20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .Lrol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm5,%ymm5,%ymm5 + + decq %r10 + jne .Lseal_avx2_192_rounds + vpaddd %ymm2,%ymm0,%ymm0 + vpaddd %ymm2,%ymm1,%ymm1 + vpaddd %ymm6,%ymm4,%ymm4 + vpaddd %ymm6,%ymm5,%ymm5 + vpaddd %ymm10,%ymm8,%ymm8 + vpaddd %ymm10,%ymm9,%ymm9 + vpaddd %ymm11,%ymm12,%ymm12 + vpaddd %ymm15,%ymm13,%ymm13 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm3 + + vpand .Lclamp(%rip),%ymm3,%ymm3 + vmovdqa %ymm3,0+0(%rbp) + + vperm2i128 $0x13,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x02,%ymm1,%ymm5,%ymm8 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm12 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm5 +.Lseal_avx2_short: + movq %r8,%r8 + call poly_hash_ad_internal + xorq %rcx,%rcx +.Lseal_avx2_short_hash_remainder: + cmpq $16,%rcx + jb .Lseal_avx2_short_loop + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + subq $16,%rcx + addq $16,%rdi + jmp .Lseal_avx2_short_hash_remainder +.Lseal_avx2_short_loop: + cmpq $32,%rbx + jb .Lseal_avx2_short_tail + subq $32,%rbx + + vpxor (%rsi),%ymm0,%ymm0 + vmovdqu %ymm0,(%rdi) + leaq 32(%rsi),%rsi + + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + addq 0+16(%rdi),%r10 + adcq 8+16(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 32(%rdi),%rdi + + vmovdqa %ymm4,%ymm0 + vmovdqa %ymm8,%ymm4 + vmovdqa %ymm12,%ymm8 + vmovdqa %ymm1,%ymm12 + vmovdqa 
%ymm5,%ymm1 + vmovdqa %ymm9,%ymm5 + vmovdqa %ymm13,%ymm9 + vmovdqa %ymm2,%ymm13 + vmovdqa %ymm6,%ymm2 + jmp .Lseal_avx2_short_loop +.Lseal_avx2_short_tail: + cmpq $16,%rbx + jb .Lseal_avx2_exit + subq $16,%rbx + vpxor (%rsi),%xmm0,%xmm3 + vmovdqu %xmm3,(%rdi) + leaq 16(%rsi),%rsi + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rdi),%rdi + vextracti128 $1,%ymm0,%xmm0 +.Lseal_avx2_exit: + vzeroupper + jmp .Lseal_sse_tail_16 +.cfi_endproc +.size chacha20_poly1305_seal_avx2, .-chacha20_poly1305_seal_avx2 +#endif diff --git a/ring-0.17.14/pregenerated/chacha20_poly1305_x86_64-macosx.S b/ring-0.17.14/pregenerated/chacha20_poly1305_x86_64-macosx.S new file mode 100644 index 0000000000..d8040ddcfb --- /dev/null +++ b/ring-0.17.14/pregenerated/chacha20_poly1305_x86_64-macosx.S @@ -0,0 +1,8898 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__) +.section __DATA,__const +.p2align 6 +chacha20_poly1305_constants: +L$chacha20_consts: +.byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k' +.byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k' +L$rol8: +.byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 +.byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 +L$rol16: +.byte 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13 +.byte 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13 +L$avx2_init: +.long 0,0,0,0 +L$sse_inc: +.long 1,0,0,0 +L$avx2_inc: +.long 2,0,0,0,2,0,0,0 +L$clamp: +.quad 0x0FFFFFFC0FFFFFFF, 0x0FFFFFFC0FFFFFFC +.quad 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF +.p2align 4 +L$and_masks: +.byte 0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00 +.byte 
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff +.text + + +.p2align 6 +poly_hash_ad_internal: + + + xorq %r10,%r10 + xorq %r11,%r11 + xorq %r12,%r12 + cmpq $13,%r8 + jne L$hash_ad_loop +L$poly_fast_tls_ad: + + movq (%rcx),%r10 + movq 5(%rcx),%r11 + shrq $24,%r11 + movq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + ret +L$hash_ad_loop: + + cmpq $16,%r8 + jb L$hash_ad_tail + addq 0+0(%rcx),%r10 + adcq 8+0(%rcx),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rcx),%rcx + subq $16,%r8 + jmp L$hash_ad_loop +L$hash_ad_tail: + cmpq $0,%r8 + je L$hash_ad_done + + xorq %r13,%r13 + xorq %r14,%r14 + xorq %r15,%r15 + addq %r8,%rcx +L$hash_ad_tail_loop: + shldq $8,%r13,%r14 + shlq $8,%r13 + movzbq -1(%rcx),%r15 + xorq %r15,%r13 + decq %rcx + decq %r8 + jne L$hash_ad_tail_loop + + addq %r13,%r10 + adcq %r14,%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + +L$hash_ad_done: + ret + + + +.globl _chacha20_poly1305_open_sse41 +.private_extern _chacha20_poly1305_open_sse41 + +.p2align 6 +_chacha20_poly1305_open_sse41: + +_CET_ENDBR + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + + + pushq %r9 + + subq $288 + 0 + 32,%rsp + + + leaq 32(%rsp),%rbp + andq $-32,%rbp + + movq %rdx,%rbx + movq %r8,0+0+32(%rbp) + movq %rbx,8+0+32(%rbp) + + cmpq $128,%rbx + jbe L$open_sse_128 + + movdqa L$chacha20_consts(%rip),%xmm0 + movdqu 0(%r9),%xmm4 + movdqu 16(%r9),%xmm8 + movdqu 32(%r9),%xmm12 + + movdqa %xmm12,%xmm7 + + movdqa %xmm4,0+48(%rbp) + movdqa %xmm8,0+64(%rbp) + movdqa %xmm12,0+96(%rbp) + movq $10,%r10 +L$open_sse_init_rounds: + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb 
L$rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,4 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,12 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,12 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,4 + + decq %r10 + jne L$open_sse_init_rounds + + paddd L$chacha20_consts(%rip),%xmm0 + paddd 0+48(%rbp),%xmm4 + + pand L$clamp(%rip),%xmm0 + movdqa %xmm0,0+0(%rbp) + movdqa %xmm4,0+16(%rbp) + + movq %r8,%r8 + call poly_hash_ad_internal +L$open_sse_main_loop: + cmpq $256,%rbx + jb L$open_sse_tail + + movdqa L$chacha20_consts(%rip),%xmm0 + movdqa 0+48(%rbp),%xmm4 + movdqa 0+64(%rbp),%xmm8 + movdqa %xmm0,%xmm1 + movdqa %xmm4,%xmm5 + movdqa %xmm8,%xmm9 + movdqa %xmm0,%xmm2 + movdqa %xmm4,%xmm6 + movdqa %xmm8,%xmm10 + movdqa %xmm0,%xmm3 + movdqa %xmm4,%xmm7 + movdqa %xmm8,%xmm11 + movdqa 0+96(%rbp),%xmm15 + paddd L$sse_inc(%rip),%xmm15 + movdqa %xmm15,%xmm14 + paddd L$sse_inc(%rip),%xmm14 + movdqa %xmm14,%xmm13 + paddd L$sse_inc(%rip),%xmm13 + movdqa %xmm13,%xmm12 + paddd L$sse_inc(%rip),%xmm12 + movdqa %xmm12,0+96(%rbp) + movdqa %xmm13,0+112(%rbp) + movdqa %xmm14,0+128(%rbp) + movdqa %xmm15,0+144(%rbp) + + + + movq $4,%rcx + movq %rsi,%r8 +L$open_sse_main_loop_rounds: + movdqa %xmm8,0+80(%rbp) + movdqa L$rol16(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 +.byte 102,69,15,56,0,248 +.byte 102,69,15,56,0,240 +.byte 102,69,15,56,0,232 +.byte 102,69,15,56,0,224 + movdqa 0+80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + addq 0+0(%r8),%r10 + adcq 8+0(%r8),%r11 + adcq $1,%r12 + + leaq 16(%r8),%r8 + pxor %xmm10,%xmm6 + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,0+80(%rbp) + movdqa %xmm7,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm4 + pxor %xmm8,%xmm4 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movdqa L$rol8(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 +.byte 102,69,15,56,0,248 +.byte 102,69,15,56,0,240 +.byte 102,69,15,56,0,232 +.byte 102,69,15,56,0,224 + movdqa 0+80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + pxor %xmm10,%xmm6 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx 
+ pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,0+80(%rbp) + movdqa %xmm7,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm4 + pxor %xmm8,%xmm4 + movdqa 0+80(%rbp),%xmm8 + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 +.byte 102,15,58,15,255,4 +.byte 102,69,15,58,15,219,8 +.byte 102,69,15,58,15,255,12 +.byte 102,15,58,15,246,4 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,12 +.byte 102,15,58,15,237,4 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,12 +.byte 102,15,58,15,228,4 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,12 + movdqa %xmm8,0+80(%rbp) + movdqa L$rol16(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 +.byte 102,69,15,56,0,248 +.byte 102,69,15,56,0,240 +.byte 102,69,15,56,0,232 +.byte 102,69,15,56,0,224 + movdqa 0+80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + pxor %xmm10,%xmm6 + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,0+80(%rbp) + movdqa %xmm7,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm4 + pxor %xmm8,%xmm4 + movdqa L$rol8(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 +.byte 102,69,15,56,0,248 +.byte 102,69,15,56,0,240 +.byte 102,69,15,56,0,232 +.byte 102,69,15,56,0,224 + movdqa 0+80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + pxor %xmm10,%xmm6 + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,0+80(%rbp) + movdqa %xmm7,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm4 + pxor %xmm8,%xmm4 + movdqa 0+80(%rbp),%xmm8 +.byte 102,15,58,15,255,12 +.byte 102,69,15,58,15,219,8 +.byte 102,69,15,58,15,255,4 +.byte 102,15,58,15,246,12 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,4 +.byte 102,15,58,15,237,12 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,4 +.byte 102,15,58,15,228,12 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,4 + + decq %rcx + jge L$open_sse_main_loop_rounds + addq 0+0(%r8),%r10 + adcq 8+0(%r8),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + 
adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%r8),%r8 + cmpq $-6,%rcx + jg L$open_sse_main_loop_rounds + paddd L$chacha20_consts(%rip),%xmm3 + paddd 0+48(%rbp),%xmm7 + paddd 0+64(%rbp),%xmm11 + paddd 0+144(%rbp),%xmm15 + paddd L$chacha20_consts(%rip),%xmm2 + paddd 0+48(%rbp),%xmm6 + paddd 0+64(%rbp),%xmm10 + paddd 0+128(%rbp),%xmm14 + paddd L$chacha20_consts(%rip),%xmm1 + paddd 0+48(%rbp),%xmm5 + paddd 0+64(%rbp),%xmm9 + paddd 0+112(%rbp),%xmm13 + paddd L$chacha20_consts(%rip),%xmm0 + paddd 0+48(%rbp),%xmm4 + paddd 0+64(%rbp),%xmm8 + paddd 0+96(%rbp),%xmm12 + movdqa %xmm12,0+80(%rbp) + movdqu 0 + 0(%rsi),%xmm12 + pxor %xmm3,%xmm12 + movdqu %xmm12,0 + 0(%rdi) + movdqu 16 + 0(%rsi),%xmm12 + pxor %xmm7,%xmm12 + movdqu %xmm12,16 + 0(%rdi) + movdqu 32 + 0(%rsi),%xmm12 + pxor %xmm11,%xmm12 + movdqu %xmm12,32 + 0(%rdi) + movdqu 48 + 0(%rsi),%xmm12 + pxor %xmm15,%xmm12 + movdqu %xmm12,48 + 0(%rdi) + movdqu 0 + 64(%rsi),%xmm3 + movdqu 16 + 64(%rsi),%xmm7 + movdqu 32 + 64(%rsi),%xmm11 + movdqu 48 + 64(%rsi),%xmm15 + pxor %xmm3,%xmm2 + pxor %xmm7,%xmm6 + pxor %xmm11,%xmm10 + pxor %xmm14,%xmm15 + movdqu %xmm2,0 + 64(%rdi) + movdqu %xmm6,16 + 64(%rdi) + movdqu %xmm10,32 + 64(%rdi) + movdqu %xmm15,48 + 64(%rdi) + movdqu 0 + 128(%rsi),%xmm3 + movdqu 16 + 128(%rsi),%xmm7 + movdqu 32 + 128(%rsi),%xmm11 + movdqu 48 + 128(%rsi),%xmm15 + pxor %xmm3,%xmm1 + pxor %xmm7,%xmm5 + pxor %xmm11,%xmm9 + pxor %xmm13,%xmm15 + movdqu %xmm1,0 + 128(%rdi) + movdqu %xmm5,16 + 128(%rdi) + movdqu %xmm9,32 + 128(%rdi) + movdqu %xmm15,48 + 128(%rdi) + movdqu 0 + 192(%rsi),%xmm3 + movdqu 16 + 192(%rsi),%xmm7 + movdqu 32 + 192(%rsi),%xmm11 + movdqu 48 + 192(%rsi),%xmm15 + pxor %xmm3,%xmm0 + pxor %xmm7,%xmm4 + pxor %xmm11,%xmm8 + pxor 0+80(%rbp),%xmm15 + movdqu %xmm0,0 + 192(%rdi) + movdqu %xmm4,16 + 192(%rdi) + movdqu %xmm8,32 + 192(%rdi) + movdqu %xmm15,48 + 192(%rdi) + + leaq 256(%rsi),%rsi + leaq 256(%rdi),%rdi + subq $256,%rbx + jmp L$open_sse_main_loop +L$open_sse_tail: + + testq %rbx,%rbx + jz L$open_sse_finalize + cmpq $192,%rbx + ja L$open_sse_tail_256 + cmpq $128,%rbx + ja L$open_sse_tail_192 + cmpq $64,%rbx + ja L$open_sse_tail_128 + movdqa L$chacha20_consts(%rip),%xmm0 + movdqa 0+48(%rbp),%xmm4 + movdqa 0+64(%rbp),%xmm8 + movdqa 0+96(%rbp),%xmm12 + paddd L$sse_inc(%rip),%xmm12 + movdqa %xmm12,0+96(%rbp) + + xorq %r8,%r8 + movq %rbx,%rcx + cmpq $16,%rcx + jb L$open_sse_tail_64_rounds +L$open_sse_tail_64_rounds_and_x1hash: + addq 0+0(%rsi,%r8,1),%r10 + adcq 8+0(%rsi,%r8,1),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + subq $16,%rcx +L$open_sse_tail_64_rounds: + addq $16,%r8 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + 
psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,4 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,12 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,12 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,4 + + cmpq $16,%rcx + jae L$open_sse_tail_64_rounds_and_x1hash + cmpq $160,%r8 + jne L$open_sse_tail_64_rounds + paddd L$chacha20_consts(%rip),%xmm0 + paddd 0+48(%rbp),%xmm4 + paddd 0+64(%rbp),%xmm8 + paddd 0+96(%rbp),%xmm12 + + jmp L$open_sse_tail_64_dec_loop + +L$open_sse_tail_128: + movdqa L$chacha20_consts(%rip),%xmm0 + movdqa 0+48(%rbp),%xmm4 + movdqa 0+64(%rbp),%xmm8 + movdqa %xmm0,%xmm1 + movdqa %xmm4,%xmm5 + movdqa %xmm8,%xmm9 + movdqa 0+96(%rbp),%xmm13 + paddd L$sse_inc(%rip),%xmm13 + movdqa %xmm13,%xmm12 + paddd L$sse_inc(%rip),%xmm12 + movdqa %xmm12,0+96(%rbp) + movdqa %xmm13,0+112(%rbp) + + movq %rbx,%rcx + andq $-16,%rcx + xorq %r8,%r8 +L$open_sse_tail_128_rounds_and_x1hash: + addq 0+0(%rsi,%r8,1),%r10 + adcq 8+0(%rsi,%r8,1),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + +L$open_sse_tail_128_rounds: + addq $16,%r8 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,4 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,12 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb L$rol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb L$rol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 +.byte 102,15,58,15,237,4 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,12 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,12 +.byte 102,69,15,58,15,192,8 +.byte 
102,69,15,58,15,228,4 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb L$rol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb L$rol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 +.byte 102,15,58,15,237,12 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,4 + + cmpq %rcx,%r8 + jb L$open_sse_tail_128_rounds_and_x1hash + cmpq $160,%r8 + jne L$open_sse_tail_128_rounds + paddd L$chacha20_consts(%rip),%xmm1 + paddd 0+48(%rbp),%xmm5 + paddd 0+64(%rbp),%xmm9 + paddd 0+112(%rbp),%xmm13 + paddd L$chacha20_consts(%rip),%xmm0 + paddd 0+48(%rbp),%xmm4 + paddd 0+64(%rbp),%xmm8 + paddd 0+96(%rbp),%xmm12 + movdqu 0 + 0(%rsi),%xmm3 + movdqu 16 + 0(%rsi),%xmm7 + movdqu 32 + 0(%rsi),%xmm11 + movdqu 48 + 0(%rsi),%xmm15 + pxor %xmm3,%xmm1 + pxor %xmm7,%xmm5 + pxor %xmm11,%xmm9 + pxor %xmm13,%xmm15 + movdqu %xmm1,0 + 0(%rdi) + movdqu %xmm5,16 + 0(%rdi) + movdqu %xmm9,32 + 0(%rdi) + movdqu %xmm15,48 + 0(%rdi) + + subq $64,%rbx + leaq 64(%rsi),%rsi + leaq 64(%rdi),%rdi + jmp L$open_sse_tail_64_dec_loop + +L$open_sse_tail_192: + movdqa L$chacha20_consts(%rip),%xmm0 + movdqa 0+48(%rbp),%xmm4 + movdqa 0+64(%rbp),%xmm8 + movdqa %xmm0,%xmm1 + movdqa %xmm4,%xmm5 + movdqa %xmm8,%xmm9 + movdqa %xmm0,%xmm2 + movdqa %xmm4,%xmm6 + movdqa %xmm8,%xmm10 + movdqa 0+96(%rbp),%xmm14 + paddd L$sse_inc(%rip),%xmm14 + movdqa %xmm14,%xmm13 + paddd L$sse_inc(%rip),%xmm13 + movdqa %xmm13,%xmm12 + paddd L$sse_inc(%rip),%xmm12 + movdqa %xmm12,0+96(%rbp) + movdqa %xmm13,0+112(%rbp) + movdqa %xmm14,0+128(%rbp) + + movq %rbx,%rcx + movq $160,%r8 + cmpq $160,%rcx + cmovgq %r8,%rcx + andq $-16,%rcx + xorq %r8,%r8 +L$open_sse_tail_192_rounds_and_x1hash: + addq 0+0(%rsi,%r8,1),%r10 + adcq 8+0(%rsi,%r8,1),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + +L$open_sse_tail_192_rounds: + addq $16,%r8 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,4 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,12 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb L$rol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb L$rol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 +.byte 102,15,58,15,237,4 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,12 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb 
L$rol16(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm6 + pxor %xmm3,%xmm6 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb L$rol8(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm6 + pxor %xmm3,%xmm6 +.byte 102,15,58,15,246,4 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,12 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,12 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,4 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb L$rol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb L$rol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 +.byte 102,15,58,15,237,12 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,4 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb L$rol16(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm6 + pxor %xmm3,%xmm6 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb L$rol8(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm6 + pxor %xmm3,%xmm6 +.byte 102,15,58,15,246,12 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,4 + + cmpq %rcx,%r8 + jb L$open_sse_tail_192_rounds_and_x1hash + cmpq $160,%r8 + jne L$open_sse_tail_192_rounds + cmpq $176,%rbx + jb L$open_sse_tail_192_finish + addq 0+160(%rsi),%r10 + adcq 8+160(%rsi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + cmpq $192,%rbx + jb L$open_sse_tail_192_finish + addq 0+176(%rsi),%r10 + adcq 8+176(%rsi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + +L$open_sse_tail_192_finish: + paddd L$chacha20_consts(%rip),%xmm2 + paddd 0+48(%rbp),%xmm6 + paddd 0+64(%rbp),%xmm10 + paddd 0+128(%rbp),%xmm14 + paddd L$chacha20_consts(%rip),%xmm1 + 
paddd 0+48(%rbp),%xmm5 + paddd 0+64(%rbp),%xmm9 + paddd 0+112(%rbp),%xmm13 + paddd L$chacha20_consts(%rip),%xmm0 + paddd 0+48(%rbp),%xmm4 + paddd 0+64(%rbp),%xmm8 + paddd 0+96(%rbp),%xmm12 + movdqu 0 + 0(%rsi),%xmm3 + movdqu 16 + 0(%rsi),%xmm7 + movdqu 32 + 0(%rsi),%xmm11 + movdqu 48 + 0(%rsi),%xmm15 + pxor %xmm3,%xmm2 + pxor %xmm7,%xmm6 + pxor %xmm11,%xmm10 + pxor %xmm14,%xmm15 + movdqu %xmm2,0 + 0(%rdi) + movdqu %xmm6,16 + 0(%rdi) + movdqu %xmm10,32 + 0(%rdi) + movdqu %xmm15,48 + 0(%rdi) + movdqu 0 + 64(%rsi),%xmm3 + movdqu 16 + 64(%rsi),%xmm7 + movdqu 32 + 64(%rsi),%xmm11 + movdqu 48 + 64(%rsi),%xmm15 + pxor %xmm3,%xmm1 + pxor %xmm7,%xmm5 + pxor %xmm11,%xmm9 + pxor %xmm13,%xmm15 + movdqu %xmm1,0 + 64(%rdi) + movdqu %xmm5,16 + 64(%rdi) + movdqu %xmm9,32 + 64(%rdi) + movdqu %xmm15,48 + 64(%rdi) + + subq $128,%rbx + leaq 128(%rsi),%rsi + leaq 128(%rdi),%rdi + jmp L$open_sse_tail_64_dec_loop + +L$open_sse_tail_256: + movdqa L$chacha20_consts(%rip),%xmm0 + movdqa 0+48(%rbp),%xmm4 + movdqa 0+64(%rbp),%xmm8 + movdqa %xmm0,%xmm1 + movdqa %xmm4,%xmm5 + movdqa %xmm8,%xmm9 + movdqa %xmm0,%xmm2 + movdqa %xmm4,%xmm6 + movdqa %xmm8,%xmm10 + movdqa %xmm0,%xmm3 + movdqa %xmm4,%xmm7 + movdqa %xmm8,%xmm11 + movdqa 0+96(%rbp),%xmm15 + paddd L$sse_inc(%rip),%xmm15 + movdqa %xmm15,%xmm14 + paddd L$sse_inc(%rip),%xmm14 + movdqa %xmm14,%xmm13 + paddd L$sse_inc(%rip),%xmm13 + movdqa %xmm13,%xmm12 + paddd L$sse_inc(%rip),%xmm12 + movdqa %xmm12,0+96(%rbp) + movdqa %xmm13,0+112(%rbp) + movdqa %xmm14,0+128(%rbp) + movdqa %xmm15,0+144(%rbp) + + xorq %r8,%r8 +L$open_sse_tail_256_rounds_and_x1hash: + addq 0+0(%rsi,%r8,1),%r10 + adcq 8+0(%rsi,%r8,1),%r11 + adcq $1,%r12 + movdqa %xmm11,0+80(%rbp) + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm11 + pslld $12,%xmm11 + psrld $20,%xmm4 + pxor %xmm11,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm11 + pslld $7,%xmm11 + psrld $25,%xmm4 + pxor %xmm11,%xmm4 +.byte 102,15,58,15,228,4 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,12 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb L$rol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm11 + pslld $12,%xmm11 + psrld $20,%xmm5 + pxor %xmm11,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb L$rol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm11 + pslld $7,%xmm11 + psrld $25,%xmm5 + pxor %xmm11,%xmm5 +.byte 102,15,58,15,237,4 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,12 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb L$rol16(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm11 + pslld $12,%xmm11 + psrld $20,%xmm6 + pxor %xmm11,%xmm6 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb L$rol8(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm11 + pslld $7,%xmm11 + psrld $25,%xmm6 + pxor %xmm11,%xmm6 +.byte 102,15,58,15,246,4 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,12 + movdqa 0+80(%rbp),%xmm11 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movdqa %xmm9,0+80(%rbp) + paddd %xmm7,%xmm3 + pxor %xmm3,%xmm15 + pshufb L$rol16(%rip),%xmm15 + paddd %xmm15,%xmm11 + pxor %xmm11,%xmm7 + movdqa %xmm7,%xmm9 + pslld $12,%xmm9 + psrld $20,%xmm7 + pxor %xmm9,%xmm7 + paddd %xmm7,%xmm3 + pxor %xmm3,%xmm15 + pshufb 
L$rol8(%rip),%xmm15 + paddd %xmm15,%xmm11 + pxor %xmm11,%xmm7 + movdqa %xmm7,%xmm9 + pslld $7,%xmm9 + psrld $25,%xmm7 + pxor %xmm9,%xmm7 +.byte 102,15,58,15,255,4 +.byte 102,69,15,58,15,219,8 +.byte 102,69,15,58,15,255,12 + movdqa 0+80(%rbp),%xmm9 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + movdqa %xmm11,0+80(%rbp) + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm11 + pslld $12,%xmm11 + psrld $20,%xmm4 + pxor %xmm11,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm11 + pslld $7,%xmm11 + psrld $25,%xmm4 + pxor %xmm11,%xmm4 +.byte 102,15,58,15,228,12 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,4 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb L$rol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm11 + pslld $12,%xmm11 + psrld $20,%xmm5 + pxor %xmm11,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb L$rol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm11 + pslld $7,%xmm11 + psrld $25,%xmm5 + pxor %xmm11,%xmm5 +.byte 102,15,58,15,237,12 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,4 + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb L$rol16(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm11 + pslld $12,%xmm11 + psrld $20,%xmm6 + pxor %xmm11,%xmm6 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb L$rol8(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm11 + pslld $7,%xmm11 + psrld $25,%xmm6 + pxor %xmm11,%xmm6 +.byte 102,15,58,15,246,12 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,4 + movdqa 0+80(%rbp),%xmm11 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + movdqa %xmm9,0+80(%rbp) + paddd %xmm7,%xmm3 + pxor %xmm3,%xmm15 + pshufb L$rol16(%rip),%xmm15 + paddd %xmm15,%xmm11 + pxor %xmm11,%xmm7 + movdqa %xmm7,%xmm9 + pslld $12,%xmm9 + psrld $20,%xmm7 + pxor %xmm9,%xmm7 + paddd %xmm7,%xmm3 + pxor %xmm3,%xmm15 + pshufb L$rol8(%rip),%xmm15 + paddd %xmm15,%xmm11 + pxor %xmm11,%xmm7 + movdqa %xmm7,%xmm9 + pslld $7,%xmm9 + psrld $25,%xmm7 + pxor %xmm9,%xmm7 +.byte 102,15,58,15,255,12 +.byte 102,69,15,58,15,219,8 +.byte 102,69,15,58,15,255,4 + movdqa 0+80(%rbp),%xmm9 + + addq $16,%r8 + cmpq $160,%r8 + jb L$open_sse_tail_256_rounds_and_x1hash + + movq %rbx,%rcx + andq $-16,%rcx +L$open_sse_tail_256_hash: + addq 0+0(%rsi,%r8,1),%r10 + adcq 8+0(%rsi,%r8,1),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + addq $16,%r8 + cmpq %rcx,%r8 + jb L$open_sse_tail_256_hash + paddd 
L$chacha20_consts(%rip),%xmm3 + paddd 0+48(%rbp),%xmm7 + paddd 0+64(%rbp),%xmm11 + paddd 0+144(%rbp),%xmm15 + paddd L$chacha20_consts(%rip),%xmm2 + paddd 0+48(%rbp),%xmm6 + paddd 0+64(%rbp),%xmm10 + paddd 0+128(%rbp),%xmm14 + paddd L$chacha20_consts(%rip),%xmm1 + paddd 0+48(%rbp),%xmm5 + paddd 0+64(%rbp),%xmm9 + paddd 0+112(%rbp),%xmm13 + paddd L$chacha20_consts(%rip),%xmm0 + paddd 0+48(%rbp),%xmm4 + paddd 0+64(%rbp),%xmm8 + paddd 0+96(%rbp),%xmm12 + movdqa %xmm12,0+80(%rbp) + movdqu 0 + 0(%rsi),%xmm12 + pxor %xmm3,%xmm12 + movdqu %xmm12,0 + 0(%rdi) + movdqu 16 + 0(%rsi),%xmm12 + pxor %xmm7,%xmm12 + movdqu %xmm12,16 + 0(%rdi) + movdqu 32 + 0(%rsi),%xmm12 + pxor %xmm11,%xmm12 + movdqu %xmm12,32 + 0(%rdi) + movdqu 48 + 0(%rsi),%xmm12 + pxor %xmm15,%xmm12 + movdqu %xmm12,48 + 0(%rdi) + movdqu 0 + 64(%rsi),%xmm3 + movdqu 16 + 64(%rsi),%xmm7 + movdqu 32 + 64(%rsi),%xmm11 + movdqu 48 + 64(%rsi),%xmm15 + pxor %xmm3,%xmm2 + pxor %xmm7,%xmm6 + pxor %xmm11,%xmm10 + pxor %xmm14,%xmm15 + movdqu %xmm2,0 + 64(%rdi) + movdqu %xmm6,16 + 64(%rdi) + movdqu %xmm10,32 + 64(%rdi) + movdqu %xmm15,48 + 64(%rdi) + movdqu 0 + 128(%rsi),%xmm3 + movdqu 16 + 128(%rsi),%xmm7 + movdqu 32 + 128(%rsi),%xmm11 + movdqu 48 + 128(%rsi),%xmm15 + pxor %xmm3,%xmm1 + pxor %xmm7,%xmm5 + pxor %xmm11,%xmm9 + pxor %xmm13,%xmm15 + movdqu %xmm1,0 + 128(%rdi) + movdqu %xmm5,16 + 128(%rdi) + movdqu %xmm9,32 + 128(%rdi) + movdqu %xmm15,48 + 128(%rdi) + + movdqa 0+80(%rbp),%xmm12 + subq $192,%rbx + leaq 192(%rsi),%rsi + leaq 192(%rdi),%rdi + + +L$open_sse_tail_64_dec_loop: + cmpq $16,%rbx + jb L$open_sse_tail_16_init + subq $16,%rbx + movdqu (%rsi),%xmm3 + pxor %xmm3,%xmm0 + movdqu %xmm0,(%rdi) + leaq 16(%rsi),%rsi + leaq 16(%rdi),%rdi + movdqa %xmm4,%xmm0 + movdqa %xmm8,%xmm4 + movdqa %xmm12,%xmm8 + jmp L$open_sse_tail_64_dec_loop +L$open_sse_tail_16_init: + movdqa %xmm0,%xmm1 + + +L$open_sse_tail_16: + testq %rbx,%rbx + jz L$open_sse_finalize + + + + pxor %xmm3,%xmm3 + leaq -1(%rsi,%rbx,1),%rsi + movq %rbx,%r8 +L$open_sse_tail_16_compose: + pslldq $1,%xmm3 + pinsrb $0,(%rsi),%xmm3 + subq $1,%rsi + subq $1,%r8 + jnz L$open_sse_tail_16_compose + +.byte 102,73,15,126,221 + pextrq $1,%xmm3,%r14 + + pxor %xmm1,%xmm3 + + +L$open_sse_tail_16_extract: + pextrb $0,%xmm3,(%rdi) + psrldq $1,%xmm3 + addq $1,%rdi + subq $1,%rbx + jne L$open_sse_tail_16_extract + + addq %r13,%r10 + adcq %r14,%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + +L$open_sse_finalize: + addq 0+0+32(%rbp),%r10 + adcq 8+0+32(%rbp),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + 
andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + + movq %r10,%r13 + movq %r11,%r14 + movq %r12,%r15 + subq $-5,%r10 + sbbq $-1,%r11 + sbbq $3,%r12 + cmovcq %r13,%r10 + cmovcq %r14,%r11 + cmovcq %r15,%r12 + + addq 0+0+16(%rbp),%r10 + adcq 8+0+16(%rbp),%r11 + + + addq $288 + 0 + 32,%rsp + + + popq %r9 + + movq %r10,(%r9) + movq %r11,8(%r9) + popq %r15 + + popq %r14 + + popq %r13 + + popq %r12 + + popq %rbx + + popq %rbp + + ret + +L$open_sse_128: + + movdqu L$chacha20_consts(%rip),%xmm0 + movdqa %xmm0,%xmm1 + movdqa %xmm0,%xmm2 + movdqu 0(%r9),%xmm4 + movdqa %xmm4,%xmm5 + movdqa %xmm4,%xmm6 + movdqu 16(%r9),%xmm8 + movdqa %xmm8,%xmm9 + movdqa %xmm8,%xmm10 + movdqu 32(%r9),%xmm12 + movdqa %xmm12,%xmm13 + paddd L$sse_inc(%rip),%xmm13 + movdqa %xmm13,%xmm14 + paddd L$sse_inc(%rip),%xmm14 + movdqa %xmm4,%xmm7 + movdqa %xmm8,%xmm11 + movdqa %xmm13,%xmm15 + movq $10,%r10 + +L$open_sse_128_rounds: + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,4 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,12 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb L$rol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb L$rol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 +.byte 102,15,58,15,237,4 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,12 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb L$rol16(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm6 + pxor %xmm3,%xmm6 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb L$rol8(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm6 + pxor %xmm3,%xmm6 +.byte 102,15,58,15,246,4 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,12 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,12 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,4 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb L$rol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb L$rol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 +.byte 102,15,58,15,237,12 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,4 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb L$rol16(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm6 + pxor %xmm3,%xmm6 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb 
L$rol8(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm6 + pxor %xmm3,%xmm6 +.byte 102,15,58,15,246,12 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,4 + + decq %r10 + jnz L$open_sse_128_rounds + paddd L$chacha20_consts(%rip),%xmm0 + paddd L$chacha20_consts(%rip),%xmm1 + paddd L$chacha20_consts(%rip),%xmm2 + paddd %xmm7,%xmm4 + paddd %xmm7,%xmm5 + paddd %xmm7,%xmm6 + paddd %xmm11,%xmm9 + paddd %xmm11,%xmm10 + paddd %xmm15,%xmm13 + paddd L$sse_inc(%rip),%xmm15 + paddd %xmm15,%xmm14 + + pand L$clamp(%rip),%xmm0 + movdqa %xmm0,0+0(%rbp) + movdqa %xmm4,0+16(%rbp) + + movq %r8,%r8 + call poly_hash_ad_internal +L$open_sse_128_xor_hash: + cmpq $16,%rbx + jb L$open_sse_tail_16 + subq $16,%rbx + addq 0+0(%rsi),%r10 + adcq 8+0(%rsi),%r11 + adcq $1,%r12 + + + movdqu 0(%rsi),%xmm3 + pxor %xmm3,%xmm1 + movdqu %xmm1,0(%rdi) + leaq 16(%rsi),%rsi + leaq 16(%rdi),%rdi + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + + movdqa %xmm5,%xmm1 + movdqa %xmm9,%xmm5 + movdqa %xmm13,%xmm9 + movdqa %xmm2,%xmm13 + movdqa %xmm6,%xmm2 + movdqa %xmm10,%xmm6 + movdqa %xmm14,%xmm10 + jmp L$open_sse_128_xor_hash + + + + + + + + + +.globl _chacha20_poly1305_seal_sse41 +.private_extern _chacha20_poly1305_seal_sse41 + +.p2align 6 +_chacha20_poly1305_seal_sse41: + +_CET_ENDBR + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + + + pushq %r9 + + subq $288 + 0 + 32,%rsp + + leaq 32(%rsp),%rbp + andq $-32,%rbp + + movq 56(%r9),%rbx + addq %rdx,%rbx + movq %r8,0+0+32(%rbp) + movq %rbx,8+0+32(%rbp) + movq %rdx,%rbx + + cmpq $128,%rbx + jbe L$seal_sse_128 + + movdqa L$chacha20_consts(%rip),%xmm0 + movdqu 0(%r9),%xmm4 + movdqu 16(%r9),%xmm8 + movdqu 32(%r9),%xmm12 + + movdqa %xmm0,%xmm1 + movdqa %xmm0,%xmm2 + movdqa %xmm0,%xmm3 + movdqa %xmm4,%xmm5 + movdqa %xmm4,%xmm6 + movdqa %xmm4,%xmm7 + movdqa %xmm8,%xmm9 + movdqa %xmm8,%xmm10 + movdqa %xmm8,%xmm11 + movdqa %xmm12,%xmm15 + paddd L$sse_inc(%rip),%xmm12 + movdqa %xmm12,%xmm14 + paddd L$sse_inc(%rip),%xmm12 + movdqa %xmm12,%xmm13 + paddd L$sse_inc(%rip),%xmm12 + + movdqa %xmm4,0+48(%rbp) + movdqa %xmm8,0+64(%rbp) + movdqa %xmm12,0+96(%rbp) + movdqa %xmm13,0+112(%rbp) + movdqa %xmm14,0+128(%rbp) + movdqa %xmm15,0+144(%rbp) + movq $10,%r10 +L$seal_sse_init_rounds: + movdqa %xmm8,0+80(%rbp) + movdqa L$rol16(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 +.byte 102,69,15,56,0,248 +.byte 102,69,15,56,0,240 +.byte 102,69,15,56,0,232 +.byte 102,69,15,56,0,224 + movdqa 0+80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + pxor %xmm10,%xmm6 + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,0+80(%rbp) + movdqa %xmm7,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $20,%xmm8 + 
pslld $32-20,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm4 + pxor %xmm8,%xmm4 + movdqa L$rol8(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 +.byte 102,69,15,56,0,248 +.byte 102,69,15,56,0,240 +.byte 102,69,15,56,0,232 +.byte 102,69,15,56,0,224 + movdqa 0+80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + pxor %xmm10,%xmm6 + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,0+80(%rbp) + movdqa %xmm7,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm4 + pxor %xmm8,%xmm4 + movdqa 0+80(%rbp),%xmm8 +.byte 102,15,58,15,255,4 +.byte 102,69,15,58,15,219,8 +.byte 102,69,15,58,15,255,12 +.byte 102,15,58,15,246,4 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,12 +.byte 102,15,58,15,237,4 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,12 +.byte 102,15,58,15,228,4 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,12 + movdqa %xmm8,0+80(%rbp) + movdqa L$rol16(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 +.byte 102,69,15,56,0,248 +.byte 102,69,15,56,0,240 +.byte 102,69,15,56,0,232 +.byte 102,69,15,56,0,224 + movdqa 0+80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + pxor %xmm10,%xmm6 + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,0+80(%rbp) + movdqa %xmm7,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm4 + pxor %xmm8,%xmm4 + movdqa L$rol8(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 +.byte 102,69,15,56,0,248 +.byte 102,69,15,56,0,240 +.byte 102,69,15,56,0,232 +.byte 102,69,15,56,0,224 + movdqa 0+80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + pxor %xmm10,%xmm6 + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,0+80(%rbp) + movdqa %xmm7,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm4 + pxor %xmm8,%xmm4 + movdqa 0+80(%rbp),%xmm8 +.byte 102,15,58,15,255,12 +.byte 102,69,15,58,15,219,8 +.byte 102,69,15,58,15,255,4 +.byte 102,15,58,15,246,12 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,4 +.byte 102,15,58,15,237,12 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,4 +.byte 102,15,58,15,228,12 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,4 + + decq %r10 + jnz L$seal_sse_init_rounds + paddd L$chacha20_consts(%rip),%xmm3 + paddd 0+48(%rbp),%xmm7 + paddd 
0+64(%rbp),%xmm11 + paddd 0+144(%rbp),%xmm15 + paddd L$chacha20_consts(%rip),%xmm2 + paddd 0+48(%rbp),%xmm6 + paddd 0+64(%rbp),%xmm10 + paddd 0+128(%rbp),%xmm14 + paddd L$chacha20_consts(%rip),%xmm1 + paddd 0+48(%rbp),%xmm5 + paddd 0+64(%rbp),%xmm9 + paddd 0+112(%rbp),%xmm13 + paddd L$chacha20_consts(%rip),%xmm0 + paddd 0+48(%rbp),%xmm4 + paddd 0+64(%rbp),%xmm8 + paddd 0+96(%rbp),%xmm12 + + + pand L$clamp(%rip),%xmm3 + movdqa %xmm3,0+0(%rbp) + movdqa %xmm7,0+16(%rbp) + + movq %r8,%r8 + call poly_hash_ad_internal + movdqu 0 + 0(%rsi),%xmm3 + movdqu 16 + 0(%rsi),%xmm7 + movdqu 32 + 0(%rsi),%xmm11 + movdqu 48 + 0(%rsi),%xmm15 + pxor %xmm3,%xmm2 + pxor %xmm7,%xmm6 + pxor %xmm11,%xmm10 + pxor %xmm14,%xmm15 + movdqu %xmm2,0 + 0(%rdi) + movdqu %xmm6,16 + 0(%rdi) + movdqu %xmm10,32 + 0(%rdi) + movdqu %xmm15,48 + 0(%rdi) + movdqu 0 + 64(%rsi),%xmm3 + movdqu 16 + 64(%rsi),%xmm7 + movdqu 32 + 64(%rsi),%xmm11 + movdqu 48 + 64(%rsi),%xmm15 + pxor %xmm3,%xmm1 + pxor %xmm7,%xmm5 + pxor %xmm11,%xmm9 + pxor %xmm13,%xmm15 + movdqu %xmm1,0 + 64(%rdi) + movdqu %xmm5,16 + 64(%rdi) + movdqu %xmm9,32 + 64(%rdi) + movdqu %xmm15,48 + 64(%rdi) + + cmpq $192,%rbx + ja L$seal_sse_main_init + movq $128,%rcx + subq $128,%rbx + leaq 128(%rsi),%rsi + jmp L$seal_sse_128_tail_hash +L$seal_sse_main_init: + movdqu 0 + 128(%rsi),%xmm3 + movdqu 16 + 128(%rsi),%xmm7 + movdqu 32 + 128(%rsi),%xmm11 + movdqu 48 + 128(%rsi),%xmm15 + pxor %xmm3,%xmm0 + pxor %xmm7,%xmm4 + pxor %xmm11,%xmm8 + pxor %xmm12,%xmm15 + movdqu %xmm0,0 + 128(%rdi) + movdqu %xmm4,16 + 128(%rdi) + movdqu %xmm8,32 + 128(%rdi) + movdqu %xmm15,48 + 128(%rdi) + + movq $192,%rcx + subq $192,%rbx + leaq 192(%rsi),%rsi + movq $2,%rcx + movq $8,%r8 + cmpq $64,%rbx + jbe L$seal_sse_tail_64 + cmpq $128,%rbx + jbe L$seal_sse_tail_128 + cmpq $192,%rbx + jbe L$seal_sse_tail_192 + +L$seal_sse_main_loop: + movdqa L$chacha20_consts(%rip),%xmm0 + movdqa 0+48(%rbp),%xmm4 + movdqa 0+64(%rbp),%xmm8 + movdqa %xmm0,%xmm1 + movdqa %xmm4,%xmm5 + movdqa %xmm8,%xmm9 + movdqa %xmm0,%xmm2 + movdqa %xmm4,%xmm6 + movdqa %xmm8,%xmm10 + movdqa %xmm0,%xmm3 + movdqa %xmm4,%xmm7 + movdqa %xmm8,%xmm11 + movdqa 0+96(%rbp),%xmm15 + paddd L$sse_inc(%rip),%xmm15 + movdqa %xmm15,%xmm14 + paddd L$sse_inc(%rip),%xmm14 + movdqa %xmm14,%xmm13 + paddd L$sse_inc(%rip),%xmm13 + movdqa %xmm13,%xmm12 + paddd L$sse_inc(%rip),%xmm12 + movdqa %xmm12,0+96(%rbp) + movdqa %xmm13,0+112(%rbp) + movdqa %xmm14,0+128(%rbp) + movdqa %xmm15,0+144(%rbp) + +.p2align 5 +L$seal_sse_main_rounds: + movdqa %xmm8,0+80(%rbp) + movdqa L$rol16(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 +.byte 102,69,15,56,0,248 +.byte 102,69,15,56,0,240 +.byte 102,69,15,56,0,232 +.byte 102,69,15,56,0,224 + movdqa 0+80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + pxor %xmm10,%xmm6 + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,0+80(%rbp) + movdqa %xmm7,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm4 + pxor %xmm8,%xmm4 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + 
addq %rax,%r14 + adcq %rdx,%r15 + movdqa L$rol8(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 +.byte 102,69,15,56,0,248 +.byte 102,69,15,56,0,240 +.byte 102,69,15,56,0,232 +.byte 102,69,15,56,0,224 + movdqa 0+80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + pxor %xmm10,%xmm6 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,0+80(%rbp) + movdqa %xmm7,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm4 + pxor %xmm8,%xmm4 + movdqa 0+80(%rbp),%xmm8 + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 +.byte 102,15,58,15,255,4 +.byte 102,69,15,58,15,219,8 +.byte 102,69,15,58,15,255,12 +.byte 102,15,58,15,246,4 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,12 +.byte 102,15,58,15,237,4 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,12 +.byte 102,15,58,15,228,4 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,12 + movdqa %xmm8,0+80(%rbp) + movdqa L$rol16(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 +.byte 102,69,15,56,0,248 +.byte 102,69,15,56,0,240 +.byte 102,69,15,56,0,232 +.byte 102,69,15,56,0,224 + movdqa 0+80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + pxor %xmm10,%xmm6 + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,0+80(%rbp) + movdqa %xmm7,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm4 + pxor %xmm8,%xmm4 + movdqa L$rol8(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 +.byte 102,69,15,56,0,248 +.byte 102,69,15,56,0,240 +.byte 102,69,15,56,0,232 +.byte 102,69,15,56,0,224 + movdqa 0+80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + pxor %xmm10,%xmm6 + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,0+80(%rbp) + movdqa %xmm7,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm4 + pxor %xmm8,%xmm4 + movdqa 0+80(%rbp),%xmm8 +.byte 102,15,58,15,255,12 +.byte 102,69,15,58,15,219,8 +.byte 102,69,15,58,15,255,4 +.byte 102,15,58,15,246,12 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,4 +.byte 
102,15,58,15,237,12 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,4 +.byte 102,15,58,15,228,12 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,4 + + leaq 16(%rdi),%rdi + decq %r8 + jge L$seal_sse_main_rounds + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rdi),%rdi + decq %rcx + jg L$seal_sse_main_rounds + paddd L$chacha20_consts(%rip),%xmm3 + paddd 0+48(%rbp),%xmm7 + paddd 0+64(%rbp),%xmm11 + paddd 0+144(%rbp),%xmm15 + paddd L$chacha20_consts(%rip),%xmm2 + paddd 0+48(%rbp),%xmm6 + paddd 0+64(%rbp),%xmm10 + paddd 0+128(%rbp),%xmm14 + paddd L$chacha20_consts(%rip),%xmm1 + paddd 0+48(%rbp),%xmm5 + paddd 0+64(%rbp),%xmm9 + paddd 0+112(%rbp),%xmm13 + paddd L$chacha20_consts(%rip),%xmm0 + paddd 0+48(%rbp),%xmm4 + paddd 0+64(%rbp),%xmm8 + paddd 0+96(%rbp),%xmm12 + + movdqa %xmm14,0+80(%rbp) + movdqa %xmm14,0+80(%rbp) + movdqu 0 + 0(%rsi),%xmm14 + pxor %xmm3,%xmm14 + movdqu %xmm14,0 + 0(%rdi) + movdqu 16 + 0(%rsi),%xmm14 + pxor %xmm7,%xmm14 + movdqu %xmm14,16 + 0(%rdi) + movdqu 32 + 0(%rsi),%xmm14 + pxor %xmm11,%xmm14 + movdqu %xmm14,32 + 0(%rdi) + movdqu 48 + 0(%rsi),%xmm14 + pxor %xmm15,%xmm14 + movdqu %xmm14,48 + 0(%rdi) + + movdqa 0+80(%rbp),%xmm14 + movdqu 0 + 64(%rsi),%xmm3 + movdqu 16 + 64(%rsi),%xmm7 + movdqu 32 + 64(%rsi),%xmm11 + movdqu 48 + 64(%rsi),%xmm15 + pxor %xmm3,%xmm2 + pxor %xmm7,%xmm6 + pxor %xmm11,%xmm10 + pxor %xmm14,%xmm15 + movdqu %xmm2,0 + 64(%rdi) + movdqu %xmm6,16 + 64(%rdi) + movdqu %xmm10,32 + 64(%rdi) + movdqu %xmm15,48 + 64(%rdi) + movdqu 0 + 128(%rsi),%xmm3 + movdqu 16 + 128(%rsi),%xmm7 + movdqu 32 + 128(%rsi),%xmm11 + movdqu 48 + 128(%rsi),%xmm15 + pxor %xmm3,%xmm1 + pxor %xmm7,%xmm5 + pxor %xmm11,%xmm9 + pxor %xmm13,%xmm15 + movdqu %xmm1,0 + 128(%rdi) + movdqu %xmm5,16 + 128(%rdi) + movdqu %xmm9,32 + 128(%rdi) + movdqu %xmm15,48 + 128(%rdi) + + cmpq $256,%rbx + ja L$seal_sse_main_loop_xor + + movq $192,%rcx + subq $192,%rbx + leaq 192(%rsi),%rsi + jmp L$seal_sse_128_tail_hash +L$seal_sse_main_loop_xor: + movdqu 0 + 192(%rsi),%xmm3 + movdqu 16 + 192(%rsi),%xmm7 + movdqu 32 + 192(%rsi),%xmm11 + movdqu 48 + 192(%rsi),%xmm15 + pxor %xmm3,%xmm0 + pxor %xmm7,%xmm4 + pxor %xmm11,%xmm8 + pxor %xmm12,%xmm15 + movdqu %xmm0,0 + 192(%rdi) + movdqu %xmm4,16 + 192(%rdi) + movdqu %xmm8,32 + 192(%rdi) + movdqu %xmm15,48 + 192(%rdi) + + leaq 256(%rsi),%rsi + subq $256,%rbx + movq $6,%rcx + movq $4,%r8 + cmpq $192,%rbx + jg L$seal_sse_main_loop + movq %rbx,%rcx + testq %rbx,%rbx + je L$seal_sse_128_tail_hash + movq $6,%rcx + cmpq $128,%rbx + ja L$seal_sse_tail_192 + cmpq $64,%rbx + ja L$seal_sse_tail_128 + +L$seal_sse_tail_64: + movdqa L$chacha20_consts(%rip),%xmm0 + movdqa 0+48(%rbp),%xmm4 + movdqa 0+64(%rbp),%xmm8 + movdqa 0+96(%rbp),%xmm12 + paddd L$sse_inc(%rip),%xmm12 + movdqa %xmm12,0+96(%rbp) + +L$seal_sse_tail_64_rounds_and_x2hash: + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq 
%rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rdi),%rdi +L$seal_sse_tail_64_rounds_and_x1hash: + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,4 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,12 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,12 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,4 + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rdi),%rdi + decq %rcx + jg L$seal_sse_tail_64_rounds_and_x2hash + decq %r8 + jge L$seal_sse_tail_64_rounds_and_x1hash + paddd L$chacha20_consts(%rip),%xmm0 + paddd 0+48(%rbp),%xmm4 + paddd 0+64(%rbp),%xmm8 + paddd 0+96(%rbp),%xmm12 + + jmp L$seal_sse_128_tail_xor + +L$seal_sse_tail_128: + movdqa L$chacha20_consts(%rip),%xmm0 + movdqa 0+48(%rbp),%xmm4 + movdqa 0+64(%rbp),%xmm8 + movdqa %xmm0,%xmm1 + movdqa %xmm4,%xmm5 + movdqa %xmm8,%xmm9 + movdqa 0+96(%rbp),%xmm13 + paddd L$sse_inc(%rip),%xmm13 + movdqa %xmm13,%xmm12 + paddd L$sse_inc(%rip),%xmm12 + movdqa %xmm12,0+96(%rbp) + movdqa %xmm13,0+112(%rbp) + +L$seal_sse_tail_128_rounds_and_x2hash: + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq 
%r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rdi),%rdi +L$seal_sse_tail_128_rounds_and_x1hash: + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,4 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,12 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb L$rol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb L$rol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 +.byte 102,15,58,15,237,4 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,12 + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,12 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,4 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb L$rol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb L$rol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 +.byte 102,15,58,15,237,12 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,4 + + leaq 16(%rdi),%rdi + decq %rcx + jg L$seal_sse_tail_128_rounds_and_x2hash + decq %r8 + jge L$seal_sse_tail_128_rounds_and_x1hash + paddd L$chacha20_consts(%rip),%xmm1 + paddd 0+48(%rbp),%xmm5 + paddd 0+64(%rbp),%xmm9 + paddd 0+112(%rbp),%xmm13 + paddd L$chacha20_consts(%rip),%xmm0 + paddd 0+48(%rbp),%xmm4 + paddd 0+64(%rbp),%xmm8 + paddd 0+96(%rbp),%xmm12 + movdqu 0 + 0(%rsi),%xmm3 + movdqu 16 + 0(%rsi),%xmm7 + movdqu 32 + 0(%rsi),%xmm11 + movdqu 48 + 0(%rsi),%xmm15 + pxor %xmm3,%xmm1 + pxor %xmm7,%xmm5 + pxor %xmm11,%xmm9 + pxor %xmm13,%xmm15 + movdqu %xmm1,0 + 0(%rdi) + movdqu %xmm5,16 + 0(%rdi) + movdqu %xmm9,32 + 0(%rdi) + movdqu %xmm15,48 + 0(%rdi) + + movq $64,%rcx + subq $64,%rbx + leaq 64(%rsi),%rsi + jmp L$seal_sse_128_tail_hash + +L$seal_sse_tail_192: + movdqa L$chacha20_consts(%rip),%xmm0 + movdqa 0+48(%rbp),%xmm4 + movdqa 0+64(%rbp),%xmm8 + movdqa %xmm0,%xmm1 + movdqa %xmm4,%xmm5 + movdqa %xmm8,%xmm9 + movdqa 
%xmm0,%xmm2 + movdqa %xmm4,%xmm6 + movdqa %xmm8,%xmm10 + movdqa 0+96(%rbp),%xmm14 + paddd L$sse_inc(%rip),%xmm14 + movdqa %xmm14,%xmm13 + paddd L$sse_inc(%rip),%xmm13 + movdqa %xmm13,%xmm12 + paddd L$sse_inc(%rip),%xmm12 + movdqa %xmm12,0+96(%rbp) + movdqa %xmm13,0+112(%rbp) + movdqa %xmm14,0+128(%rbp) + +L$seal_sse_tail_192_rounds_and_x2hash: + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rdi),%rdi +L$seal_sse_tail_192_rounds_and_x1hash: + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,4 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,12 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb L$rol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb L$rol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 +.byte 102,15,58,15,237,4 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,12 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb L$rol16(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm6 + pxor %xmm3,%xmm6 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb L$rol8(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm6 + pxor %xmm3,%xmm6 +.byte 102,15,58,15,246,4 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,12 + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,12 +.byte 
102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,4 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb L$rol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb L$rol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 +.byte 102,15,58,15,237,12 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,4 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb L$rol16(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm6 + pxor %xmm3,%xmm6 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb L$rol8(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm6 + pxor %xmm3,%xmm6 +.byte 102,15,58,15,246,12 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,4 + + leaq 16(%rdi),%rdi + decq %rcx + jg L$seal_sse_tail_192_rounds_and_x2hash + decq %r8 + jge L$seal_sse_tail_192_rounds_and_x1hash + paddd L$chacha20_consts(%rip),%xmm2 + paddd 0+48(%rbp),%xmm6 + paddd 0+64(%rbp),%xmm10 + paddd 0+128(%rbp),%xmm14 + paddd L$chacha20_consts(%rip),%xmm1 + paddd 0+48(%rbp),%xmm5 + paddd 0+64(%rbp),%xmm9 + paddd 0+112(%rbp),%xmm13 + paddd L$chacha20_consts(%rip),%xmm0 + paddd 0+48(%rbp),%xmm4 + paddd 0+64(%rbp),%xmm8 + paddd 0+96(%rbp),%xmm12 + movdqu 0 + 0(%rsi),%xmm3 + movdqu 16 + 0(%rsi),%xmm7 + movdqu 32 + 0(%rsi),%xmm11 + movdqu 48 + 0(%rsi),%xmm15 + pxor %xmm3,%xmm2 + pxor %xmm7,%xmm6 + pxor %xmm11,%xmm10 + pxor %xmm14,%xmm15 + movdqu %xmm2,0 + 0(%rdi) + movdqu %xmm6,16 + 0(%rdi) + movdqu %xmm10,32 + 0(%rdi) + movdqu %xmm15,48 + 0(%rdi) + movdqu 0 + 64(%rsi),%xmm3 + movdqu 16 + 64(%rsi),%xmm7 + movdqu 32 + 64(%rsi),%xmm11 + movdqu 48 + 64(%rsi),%xmm15 + pxor %xmm3,%xmm1 + pxor %xmm7,%xmm5 + pxor %xmm11,%xmm9 + pxor %xmm13,%xmm15 + movdqu %xmm1,0 + 64(%rdi) + movdqu %xmm5,16 + 64(%rdi) + movdqu %xmm9,32 + 64(%rdi) + movdqu %xmm15,48 + 64(%rdi) + + movq $128,%rcx + subq $128,%rbx + leaq 128(%rsi),%rsi + +L$seal_sse_128_tail_hash: + cmpq $16,%rcx + jb L$seal_sse_128_tail_xor + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + subq $16,%rcx + leaq 16(%rdi),%rdi + jmp L$seal_sse_128_tail_hash + +L$seal_sse_128_tail_xor: + cmpq $16,%rbx + jb L$seal_sse_tail_16 + subq $16,%rbx + + movdqu 0(%rsi),%xmm3 + pxor %xmm3,%xmm0 + movdqu %xmm0,0(%rdi) + + addq 0(%rdi),%r10 + adcq 8(%rdi),%r11 + adcq $1,%r12 + leaq 16(%rsi),%rsi + leaq 16(%rdi),%rdi + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq 
%r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + + movdqa %xmm4,%xmm0 + movdqa %xmm8,%xmm4 + movdqa %xmm12,%xmm8 + movdqa %xmm1,%xmm12 + movdqa %xmm5,%xmm1 + movdqa %xmm9,%xmm5 + movdqa %xmm13,%xmm9 + jmp L$seal_sse_128_tail_xor + +L$seal_sse_tail_16: + testq %rbx,%rbx + jz L$process_blocks_of_extra_in + + movq %rbx,%r8 + movq %rbx,%rcx + leaq -1(%rsi,%rbx,1),%rsi + pxor %xmm15,%xmm15 +L$seal_sse_tail_16_compose: + pslldq $1,%xmm15 + pinsrb $0,(%rsi),%xmm15 + leaq -1(%rsi),%rsi + decq %rcx + jne L$seal_sse_tail_16_compose + + + pxor %xmm0,%xmm15 + + + movq %rbx,%rcx + movdqu %xmm15,%xmm0 +L$seal_sse_tail_16_extract: + pextrb $0,%xmm0,(%rdi) + psrldq $1,%xmm0 + addq $1,%rdi + subq $1,%rcx + jnz L$seal_sse_tail_16_extract + + + + + + + + + movq 288 + 0 + 32(%rsp),%r9 + movq 56(%r9),%r14 + movq 48(%r9),%r13 + testq %r14,%r14 + jz L$process_partial_block + + movq $16,%r15 + subq %rbx,%r15 + cmpq %r15,%r14 + + jge L$load_extra_in + movq %r14,%r15 + +L$load_extra_in: + + + leaq -1(%r13,%r15,1),%rsi + + + addq %r15,%r13 + subq %r15,%r14 + movq %r13,48(%r9) + movq %r14,56(%r9) + + + + addq %r15,%r8 + + + pxor %xmm11,%xmm11 +L$load_extra_load_loop: + pslldq $1,%xmm11 + pinsrb $0,(%rsi),%xmm11 + leaq -1(%rsi),%rsi + subq $1,%r15 + jnz L$load_extra_load_loop + + + + + movq %rbx,%r15 + +L$load_extra_shift_loop: + pslldq $1,%xmm11 + subq $1,%r15 + jnz L$load_extra_shift_loop + + + + + leaq L$and_masks(%rip),%r15 + shlq $4,%rbx + pand -16(%r15,%rbx,1),%xmm15 + + + por %xmm11,%xmm15 + + + +.byte 102,77,15,126,253 + pextrq $1,%xmm15,%r14 + addq %r13,%r10 + adcq %r14,%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + +L$process_blocks_of_extra_in: + + movq 288+32+0 (%rsp),%r9 + movq 48(%r9),%rsi + movq 56(%r9),%r8 + movq %r8,%rcx + shrq $4,%r8 + +L$process_extra_hash_loop: + jz process_extra_in_trailer + addq 0+0(%rsi),%r10 + adcq 8+0(%rsi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rsi),%rsi + subq $1,%r8 + jmp L$process_extra_hash_loop +process_extra_in_trailer: + andq $15,%rcx + movq %rcx,%rbx + jz L$do_length_block + leaq -1(%rsi,%rcx,1),%rsi + +L$process_extra_in_trailer_load: + pslldq 
$1,%xmm15 + pinsrb $0,(%rsi),%xmm15 + leaq -1(%rsi),%rsi + subq $1,%rcx + jnz L$process_extra_in_trailer_load + +L$process_partial_block: + + leaq L$and_masks(%rip),%r15 + shlq $4,%rbx + pand -16(%r15,%rbx,1),%xmm15 +.byte 102,77,15,126,253 + pextrq $1,%xmm15,%r14 + addq %r13,%r10 + adcq %r14,%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + +L$do_length_block: + addq 0+0+32(%rbp),%r10 + adcq 8+0+32(%rbp),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + + movq %r10,%r13 + movq %r11,%r14 + movq %r12,%r15 + subq $-5,%r10 + sbbq $-1,%r11 + sbbq $3,%r12 + cmovcq %r13,%r10 + cmovcq %r14,%r11 + cmovcq %r15,%r12 + + addq 0+0+16(%rbp),%r10 + adcq 8+0+16(%rbp),%r11 + + + addq $288 + 0 + 32,%rsp + + + popq %r9 + + movq %r10,(%r9) + movq %r11,8(%r9) + popq %r15 + + popq %r14 + + popq %r13 + + popq %r12 + + popq %rbx + + popq %rbp + + ret + +L$seal_sse_128: + + movdqu L$chacha20_consts(%rip),%xmm0 + movdqa %xmm0,%xmm1 + movdqa %xmm0,%xmm2 + movdqu 0(%r9),%xmm4 + movdqa %xmm4,%xmm5 + movdqa %xmm4,%xmm6 + movdqu 16(%r9),%xmm8 + movdqa %xmm8,%xmm9 + movdqa %xmm8,%xmm10 + movdqu 32(%r9),%xmm14 + movdqa %xmm14,%xmm12 + paddd L$sse_inc(%rip),%xmm12 + movdqa %xmm12,%xmm13 + paddd L$sse_inc(%rip),%xmm13 + movdqa %xmm4,%xmm7 + movdqa %xmm8,%xmm11 + movdqa %xmm12,%xmm15 + movq $10,%r10 + +L$seal_sse_128_rounds: + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,4 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,12 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb L$rol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb L$rol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 +.byte 102,15,58,15,237,4 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,12 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb L$rol16(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + 
pslld $12,%xmm3 + psrld $20,%xmm6 + pxor %xmm3,%xmm6 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb L$rol8(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm6 + pxor %xmm3,%xmm6 +.byte 102,15,58,15,246,4 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,12 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,12 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,4 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb L$rol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb L$rol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 +.byte 102,15,58,15,237,12 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,4 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb L$rol16(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm6 + pxor %xmm3,%xmm6 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb L$rol8(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm6 + pxor %xmm3,%xmm6 +.byte 102,15,58,15,246,12 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,4 + + decq %r10 + jnz L$seal_sse_128_rounds + paddd L$chacha20_consts(%rip),%xmm0 + paddd L$chacha20_consts(%rip),%xmm1 + paddd L$chacha20_consts(%rip),%xmm2 + paddd %xmm7,%xmm4 + paddd %xmm7,%xmm5 + paddd %xmm7,%xmm6 + paddd %xmm11,%xmm8 + paddd %xmm11,%xmm9 + paddd %xmm15,%xmm12 + paddd L$sse_inc(%rip),%xmm15 + paddd %xmm15,%xmm13 + + pand L$clamp(%rip),%xmm2 + movdqa %xmm2,0+0(%rbp) + movdqa %xmm6,0+16(%rbp) + + movq %r8,%r8 + call poly_hash_ad_internal + jmp L$seal_sse_128_tail_xor + + + + +.globl _chacha20_poly1305_open_avx2 +.private_extern _chacha20_poly1305_open_avx2 + +.p2align 6 +_chacha20_poly1305_open_avx2: + +_CET_ENDBR + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + + + pushq %r9 + + subq $288 + 0 + 32,%rsp + + + leaq 32(%rsp),%rbp + andq $-32,%rbp + + movq %rdx,%rbx + movq %r8,0+0+32(%rbp) + movq %rbx,8+0+32(%rbp) + + vzeroupper + vmovdqa L$chacha20_consts(%rip),%ymm0 + vbroadcasti128 0(%r9),%ymm4 + vbroadcasti128 16(%r9),%ymm8 + vbroadcasti128 32(%r9),%ymm12 + vpaddd L$avx2_init(%rip),%ymm12,%ymm12 + cmpq $192,%rbx + jbe L$open_avx2_192 + cmpq $320,%rbx + jbe L$open_avx2_320 + + vmovdqa %ymm4,0+64(%rbp) + vmovdqa %ymm8,0+96(%rbp) + vmovdqa %ymm12,0+160(%rbp) + movq $10,%r10 +L$open_avx2_init_rounds: + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor 
%ymm0,%ymm12,%ymm12 + vpshufb L$rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm4,%ymm4,%ymm4 + + decq %r10 + jne L$open_avx2_init_rounds + vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 0+64(%rbp),%ymm4,%ymm4 + vpaddd 0+96(%rbp),%ymm8,%ymm8 + vpaddd 0+160(%rbp),%ymm12,%ymm12 + + vperm2i128 $0x02,%ymm0,%ymm4,%ymm3 + + vpand L$clamp(%rip),%ymm3,%ymm3 + vmovdqa %ymm3,0+0(%rbp) + + vperm2i128 $0x13,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm4 + + movq %r8,%r8 + call poly_hash_ad_internal + + xorq %rcx,%rcx +L$open_avx2_init_hash: + addq 0+0(%rsi,%rcx,1),%r10 + adcq 8+0(%rsi,%rcx,1),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + addq $16,%rcx + cmpq $64,%rcx + jne L$open_avx2_init_hash + + vpxor 0(%rsi),%ymm0,%ymm0 + vpxor 32(%rsi),%ymm4,%ymm4 + + vmovdqu %ymm0,0(%rdi) + vmovdqu %ymm4,32(%rdi) + leaq 64(%rsi),%rsi + leaq 64(%rdi),%rdi + subq $64,%rbx +L$open_avx2_main_loop: + + cmpq $512,%rbx + jb L$open_avx2_main_loop_done + vmovdqa L$chacha20_consts(%rip),%ymm0 + vmovdqa 0+64(%rbp),%ymm4 + vmovdqa 0+96(%rbp),%ymm8 + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm8,%ymm9 + vmovdqa %ymm0,%ymm2 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm8,%ymm10 + vmovdqa %ymm0,%ymm3 + vmovdqa %ymm4,%ymm7 + vmovdqa %ymm8,%ymm11 + vmovdqa L$avx2_inc(%rip),%ymm12 + vpaddd 0+160(%rbp),%ymm12,%ymm15 + vpaddd %ymm15,%ymm12,%ymm14 + vpaddd %ymm14,%ymm12,%ymm13 + vpaddd %ymm13,%ymm12,%ymm12 + vmovdqa %ymm15,0+256(%rbp) + vmovdqa %ymm14,0+224(%rbp) + vmovdqa %ymm13,0+192(%rbp) + vmovdqa %ymm12,0+160(%rbp) + + xorq %rcx,%rcx +L$open_avx2_main_loop_rounds: + addq 0+0(%rsi,%rcx,1),%r10 + adcq 8+0(%rsi,%rcx,1),%r11 + adcq $1,%r12 + vmovdqa %ymm8,0+128(%rbp) + vmovdqa L$rol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor 
%ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + addq %rax,%r15 + adcq %rdx,%r9 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa L$rol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + addq 0+16(%rsi,%rcx,1),%r10 + adcq 8+16(%rsi,%rcx,1),%r11 + adcq $1,%r12 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $25,%ymm7,%ymm8 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + vpslld $32-25,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 0+128(%rbp),%ymm8 + vpalignr $4,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $12,%ymm15,%ymm15,%ymm15 + vpalignr $4,%ymm6,%ymm6,%ymm6 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $12,%ymm14,%ymm14,%ymm14 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + vpalignr $4,%ymm5,%ymm5,%ymm5 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vmovdqa %ymm8,0+128(%rbp) + vmovdqa L$rol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + addq %rax,%r15 + adcq %rdx,%r9 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + addq 0+32(%rsi,%rcx,1),%r10 + adcq 8+32(%rsi,%rcx,1),%r11 + adcq $1,%r12 + + leaq 48(%rcx),%rcx + vpsrld $20,%ymm5,%ymm8 + 
vpslld $32-20,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa L$rol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $25,%ymm7,%ymm8 + vpslld $32-25,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + addq %rax,%r15 + adcq %rdx,%r9 + vpsrld $25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 0+128(%rbp),%ymm8 + vpalignr $12,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $4,%ymm15,%ymm15,%ymm15 + vpalignr $12,%ymm6,%ymm6,%ymm6 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $4,%ymm14,%ymm14,%ymm14 + vpalignr $12,%ymm5,%ymm5,%ymm5 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpalignr $8,%ymm8,%ymm8,%ymm8 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vpalignr $4,%ymm12,%ymm12,%ymm12 + + cmpq $60*8,%rcx + jne L$open_avx2_main_loop_rounds + vpaddd L$chacha20_consts(%rip),%ymm3,%ymm3 + vpaddd 0+64(%rbp),%ymm7,%ymm7 + vpaddd 0+96(%rbp),%ymm11,%ymm11 + vpaddd 0+256(%rbp),%ymm15,%ymm15 + vpaddd L$chacha20_consts(%rip),%ymm2,%ymm2 + vpaddd 0+64(%rbp),%ymm6,%ymm6 + vpaddd 0+96(%rbp),%ymm10,%ymm10 + vpaddd 0+224(%rbp),%ymm14,%ymm14 + vpaddd L$chacha20_consts(%rip),%ymm1,%ymm1 + vpaddd 0+64(%rbp),%ymm5,%ymm5 + vpaddd 0+96(%rbp),%ymm9,%ymm9 + vpaddd 0+192(%rbp),%ymm13,%ymm13 + vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 0+64(%rbp),%ymm4,%ymm4 + vpaddd 0+96(%rbp),%ymm8,%ymm8 + vpaddd 0+160(%rbp),%ymm12,%ymm12 + + vmovdqa %ymm0,0+128(%rbp) + addq 0+60*8(%rsi),%r10 + adcq 8+60*8(%rsi),%r11 + adcq $1,%r12 + vperm2i128 $0x02,%ymm3,%ymm7,%ymm0 + vperm2i128 $0x13,%ymm3,%ymm7,%ymm7 + vperm2i128 $0x02,%ymm11,%ymm15,%ymm3 + vperm2i128 $0x13,%ymm11,%ymm15,%ymm11 + vpxor 0+0(%rsi),%ymm0,%ymm0 + vpxor 32+0(%rsi),%ymm3,%ymm3 + vpxor 64+0(%rsi),%ymm7,%ymm7 + vpxor 96+0(%rsi),%ymm11,%ymm11 + vmovdqu %ymm0,0+0(%rdi) + vmovdqu %ymm3,32+0(%rdi) + vmovdqu %ymm7,64+0(%rdi) + vmovdqu %ymm11,96+0(%rdi) + + vmovdqa 0+128(%rbp),%ymm0 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + 
addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vperm2i128 $0x02,%ymm2,%ymm6,%ymm3 + vperm2i128 $0x13,%ymm2,%ymm6,%ymm6 + vperm2i128 $0x02,%ymm10,%ymm14,%ymm2 + vperm2i128 $0x13,%ymm10,%ymm14,%ymm10 + vpxor 0+128(%rsi),%ymm3,%ymm3 + vpxor 32+128(%rsi),%ymm2,%ymm2 + vpxor 64+128(%rsi),%ymm6,%ymm6 + vpxor 96+128(%rsi),%ymm10,%ymm10 + vmovdqu %ymm3,0+128(%rdi) + vmovdqu %ymm2,32+128(%rdi) + vmovdqu %ymm6,64+128(%rdi) + vmovdqu %ymm10,96+128(%rdi) + addq 0+60*8+16(%rsi),%r10 + adcq 8+60*8+16(%rsi),%r11 + adcq $1,%r12 + vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 + vpxor 0+256(%rsi),%ymm3,%ymm3 + vpxor 32+256(%rsi),%ymm1,%ymm1 + vpxor 64+256(%rsi),%ymm5,%ymm5 + vpxor 96+256(%rsi),%ymm9,%ymm9 + vmovdqu %ymm3,0+256(%rdi) + vmovdqu %ymm1,32+256(%rdi) + vmovdqu %ymm5,64+256(%rdi) + vmovdqu %ymm9,96+256(%rdi) + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm3 + vperm2i128 $0x13,%ymm0,%ymm4,%ymm4 + vperm2i128 $0x02,%ymm8,%ymm12,%ymm0 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm8 + vpxor 0+384(%rsi),%ymm3,%ymm3 + vpxor 32+384(%rsi),%ymm0,%ymm0 + vpxor 64+384(%rsi),%ymm4,%ymm4 + vpxor 96+384(%rsi),%ymm8,%ymm8 + vmovdqu %ymm3,0+384(%rdi) + vmovdqu %ymm0,32+384(%rdi) + vmovdqu %ymm4,64+384(%rdi) + vmovdqu %ymm8,96+384(%rdi) + + leaq 512(%rsi),%rsi + leaq 512(%rdi),%rdi + subq $512,%rbx + jmp L$open_avx2_main_loop +L$open_avx2_main_loop_done: + testq %rbx,%rbx + vzeroupper + je L$open_sse_finalize + + cmpq $384,%rbx + ja L$open_avx2_tail_512 + cmpq $256,%rbx + ja L$open_avx2_tail_384 + cmpq $128,%rbx + ja L$open_avx2_tail_256 + vmovdqa L$chacha20_consts(%rip),%ymm0 + vmovdqa 0+64(%rbp),%ymm4 + vmovdqa 0+96(%rbp),%ymm8 + vmovdqa L$avx2_inc(%rip),%ymm12 + vpaddd 0+160(%rbp),%ymm12,%ymm12 + vmovdqa %ymm12,0+160(%rbp) + + xorq %r8,%r8 + movq %rbx,%rcx + andq $-16,%rcx + testq %rcx,%rcx + je L$open_avx2_tail_128_rounds +L$open_avx2_tail_128_rounds_and_x1hash: + addq 0+0(%rsi,%r8,1),%r10 + adcq 8+0(%rsi,%r8,1),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + +L$open_avx2_tail_128_rounds: + addq $16,%r8 + vpaddd 
%ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm4,%ymm4,%ymm4 + + cmpq %rcx,%r8 + jb L$open_avx2_tail_128_rounds_and_x1hash + cmpq $160,%r8 + jne L$open_avx2_tail_128_rounds + vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 0+64(%rbp),%ymm4,%ymm4 + vpaddd 0+96(%rbp),%ymm8,%ymm8 + vpaddd 0+160(%rbp),%ymm12,%ymm12 + vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 + vmovdqa %ymm3,%ymm8 + + jmp L$open_avx2_tail_128_xor + +L$open_avx2_tail_256: + vmovdqa L$chacha20_consts(%rip),%ymm0 + vmovdqa 0+64(%rbp),%ymm4 + vmovdqa 0+96(%rbp),%ymm8 + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm8,%ymm9 + vmovdqa L$avx2_inc(%rip),%ymm12 + vpaddd 0+160(%rbp),%ymm12,%ymm13 + vpaddd %ymm13,%ymm12,%ymm12 + vmovdqa %ymm12,0+160(%rbp) + vmovdqa %ymm13,0+192(%rbp) + + movq %rbx,0+128(%rbp) + movq %rbx,%rcx + subq $128,%rcx + shrq $4,%rcx + movq $10,%r8 + cmpq $10,%rcx + cmovgq %r8,%rcx + movq %rsi,%rbx + xorq %r8,%r8 +L$open_avx2_tail_256_rounds_and_x1hash: + addq 0+0(%rbx),%r10 + adcq 8+0(%rbx),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rbx),%rbx +L$open_avx2_tail_256_rounds: + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb L$rol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + 
vpshufb L$rol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm5,%ymm5,%ymm5 + + incq %r8 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb L$rol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb L$rol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm5,%ymm5,%ymm5 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb L$rol16(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpsrld $20,%ymm6,%ymm3 + vpslld $12,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb L$rol8(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpslld $7,%ymm6,%ymm3 + vpsrld $25,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpalignr $4,%ymm14,%ymm14,%ymm14 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $12,%ymm6,%ymm6,%ymm6 + + cmpq %rcx,%r8 + jb L$open_avx2_tail_256_rounds_and_x1hash + cmpq $10,%r8 + jne L$open_avx2_tail_256_rounds + movq %rbx,%r8 + subq %rsi,%rbx + movq %rbx,%rcx + movq 0+128(%rbp),%rbx +L$open_avx2_tail_256_hash: + addq $16,%rcx + cmpq %rbx,%rcx + jg L$open_avx2_tail_256_done + addq 0+0(%r8),%r10 + adcq 8+0(%r8),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%r8),%r8 + jmp L$open_avx2_tail_256_hash +L$open_avx2_tail_256_done: + vpaddd L$chacha20_consts(%rip),%ymm1,%ymm1 + vpaddd 0+64(%rbp),%ymm5,%ymm5 + vpaddd 0+96(%rbp),%ymm9,%ymm9 + vpaddd 0+192(%rbp),%ymm13,%ymm13 + vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 0+64(%rbp),%ymm4,%ymm4 + vpaddd 0+96(%rbp),%ymm8,%ymm8 + vpaddd 0+160(%rbp),%ymm12,%ymm12 + vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 + vpxor 0+0(%rsi),%ymm3,%ymm3 + vpxor 32+0(%rsi),%ymm1,%ymm1 + vpxor 64+0(%rsi),%ymm5,%ymm5 + vpxor 96+0(%rsi),%ymm9,%ymm9 + vmovdqu %ymm3,0+0(%rdi) + vmovdqu %ymm1,32+0(%rdi) + vmovdqu %ymm5,64+0(%rdi) + vmovdqu %ymm9,96+0(%rdi) + vperm2i128 
$0x13,%ymm0,%ymm4,%ymm3 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 + vmovdqa %ymm3,%ymm8 + + leaq 128(%rsi),%rsi + leaq 128(%rdi),%rdi + subq $128,%rbx + jmp L$open_avx2_tail_128_xor + +L$open_avx2_tail_384: + vmovdqa L$chacha20_consts(%rip),%ymm0 + vmovdqa 0+64(%rbp),%ymm4 + vmovdqa 0+96(%rbp),%ymm8 + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm8,%ymm9 + vmovdqa %ymm0,%ymm2 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm8,%ymm10 + vmovdqa L$avx2_inc(%rip),%ymm12 + vpaddd 0+160(%rbp),%ymm12,%ymm14 + vpaddd %ymm14,%ymm12,%ymm13 + vpaddd %ymm13,%ymm12,%ymm12 + vmovdqa %ymm12,0+160(%rbp) + vmovdqa %ymm13,0+192(%rbp) + vmovdqa %ymm14,0+224(%rbp) + + movq %rbx,0+128(%rbp) + movq %rbx,%rcx + subq $256,%rcx + shrq $4,%rcx + addq $6,%rcx + movq $10,%r8 + cmpq $10,%rcx + cmovgq %r8,%rcx + movq %rsi,%rbx + xorq %r8,%r8 +L$open_avx2_tail_384_rounds_and_x2hash: + addq 0+0(%rbx),%r10 + adcq 8+0(%rbx),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rbx),%rbx +L$open_avx2_tail_384_rounds_and_x1hash: + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb L$rol16(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpsrld $20,%ymm6,%ymm3 + vpslld $12,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb L$rol8(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpslld $7,%ymm6,%ymm3 + vpsrld $25,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpalignr $12,%ymm14,%ymm14,%ymm14 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $4,%ymm6,%ymm6,%ymm6 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb L$rol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb L$rol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm5,%ymm5,%ymm5 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm4,%ymm4,%ymm4 + addq 0+0(%rbx),%r10 + adcq 8+0(%rbx),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + 
movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rbx),%rbx + incq %r8 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb L$rol16(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpsrld $20,%ymm6,%ymm3 + vpslld $12,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb L$rol8(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpslld $7,%ymm6,%ymm3 + vpsrld $25,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpalignr $4,%ymm14,%ymm14,%ymm14 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $12,%ymm6,%ymm6,%ymm6 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb L$rol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb L$rol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm5,%ymm5,%ymm5 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm4,%ymm4,%ymm4 + + cmpq %rcx,%r8 + jb L$open_avx2_tail_384_rounds_and_x2hash + cmpq $10,%r8 + jne L$open_avx2_tail_384_rounds_and_x1hash + movq %rbx,%r8 + subq %rsi,%rbx + movq %rbx,%rcx + movq 0+128(%rbp),%rbx +L$open_avx2_384_tail_hash: + addq $16,%rcx + cmpq %rbx,%rcx + jg L$open_avx2_384_tail_done + addq 0+0(%r8),%r10 + adcq 8+0(%r8),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%r8),%r8 + jmp L$open_avx2_384_tail_hash +L$open_avx2_384_tail_done: + vpaddd L$chacha20_consts(%rip),%ymm2,%ymm2 + vpaddd 0+64(%rbp),%ymm6,%ymm6 + vpaddd 0+96(%rbp),%ymm10,%ymm10 + vpaddd 0+224(%rbp),%ymm14,%ymm14 + vpaddd L$chacha20_consts(%rip),%ymm1,%ymm1 + vpaddd 0+64(%rbp),%ymm5,%ymm5 + vpaddd 0+96(%rbp),%ymm9,%ymm9 + vpaddd 0+192(%rbp),%ymm13,%ymm13 + vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 0+64(%rbp),%ymm4,%ymm4 + vpaddd 0+96(%rbp),%ymm8,%ymm8 + vpaddd 0+160(%rbp),%ymm12,%ymm12 + vperm2i128 $0x02,%ymm2,%ymm6,%ymm3 + vperm2i128 $0x13,%ymm2,%ymm6,%ymm6 + vperm2i128 $0x02,%ymm10,%ymm14,%ymm2 + 
vperm2i128 $0x13,%ymm10,%ymm14,%ymm10 + vpxor 0+0(%rsi),%ymm3,%ymm3 + vpxor 32+0(%rsi),%ymm2,%ymm2 + vpxor 64+0(%rsi),%ymm6,%ymm6 + vpxor 96+0(%rsi),%ymm10,%ymm10 + vmovdqu %ymm3,0+0(%rdi) + vmovdqu %ymm2,32+0(%rdi) + vmovdqu %ymm6,64+0(%rdi) + vmovdqu %ymm10,96+0(%rdi) + vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 + vpxor 0+128(%rsi),%ymm3,%ymm3 + vpxor 32+128(%rsi),%ymm1,%ymm1 + vpxor 64+128(%rsi),%ymm5,%ymm5 + vpxor 96+128(%rsi),%ymm9,%ymm9 + vmovdqu %ymm3,0+128(%rdi) + vmovdqu %ymm1,32+128(%rdi) + vmovdqu %ymm5,64+128(%rdi) + vmovdqu %ymm9,96+128(%rdi) + vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 + vmovdqa %ymm3,%ymm8 + + leaq 256(%rsi),%rsi + leaq 256(%rdi),%rdi + subq $256,%rbx + jmp L$open_avx2_tail_128_xor + +L$open_avx2_tail_512: + vmovdqa L$chacha20_consts(%rip),%ymm0 + vmovdqa 0+64(%rbp),%ymm4 + vmovdqa 0+96(%rbp),%ymm8 + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm8,%ymm9 + vmovdqa %ymm0,%ymm2 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm8,%ymm10 + vmovdqa %ymm0,%ymm3 + vmovdqa %ymm4,%ymm7 + vmovdqa %ymm8,%ymm11 + vmovdqa L$avx2_inc(%rip),%ymm12 + vpaddd 0+160(%rbp),%ymm12,%ymm15 + vpaddd %ymm15,%ymm12,%ymm14 + vpaddd %ymm14,%ymm12,%ymm13 + vpaddd %ymm13,%ymm12,%ymm12 + vmovdqa %ymm15,0+256(%rbp) + vmovdqa %ymm14,0+224(%rbp) + vmovdqa %ymm13,0+192(%rbp) + vmovdqa %ymm12,0+160(%rbp) + + xorq %rcx,%rcx + movq %rsi,%r8 +L$open_avx2_tail_512_rounds_and_x2hash: + addq 0+0(%r8),%r10 + adcq 8+0(%r8),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%r8),%r8 +L$open_avx2_tail_512_rounds_and_x1hash: + vmovdqa %ymm8,0+128(%rbp) + vmovdqa L$rol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa L$rol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + addq 0+0(%r8),%r10 + adcq 8+0(%r8),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + 
adcq %rdx,%r15 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $25,%ymm7,%ymm8 + vpslld $32-25,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 0+128(%rbp),%ymm8 + vpalignr $4,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $12,%ymm15,%ymm15,%ymm15 + vpalignr $4,%ymm6,%ymm6,%ymm6 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $12,%ymm14,%ymm14,%ymm14 + vpalignr $4,%ymm5,%ymm5,%ymm5 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vmovdqa %ymm8,0+128(%rbp) + vmovdqa L$rol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + addq 0+16(%r8),%r10 + adcq 8+16(%r8),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 32(%r8),%r8 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa L$rol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor 
%ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $25,%ymm7,%ymm8 + vpslld $32-25,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 0+128(%rbp),%ymm8 + vpalignr $12,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $4,%ymm15,%ymm15,%ymm15 + vpalignr $12,%ymm6,%ymm6,%ymm6 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $4,%ymm14,%ymm14,%ymm14 + vpalignr $12,%ymm5,%ymm5,%ymm5 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm12,%ymm12,%ymm12 + + incq %rcx + cmpq $4,%rcx + jl L$open_avx2_tail_512_rounds_and_x2hash + cmpq $10,%rcx + jne L$open_avx2_tail_512_rounds_and_x1hash + movq %rbx,%rcx + subq $384,%rcx + andq $-16,%rcx +L$open_avx2_tail_512_hash: + testq %rcx,%rcx + je L$open_avx2_tail_512_done + addq 0+0(%r8),%r10 + adcq 8+0(%r8),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%r8),%r8 + subq $16,%rcx + jmp L$open_avx2_tail_512_hash +L$open_avx2_tail_512_done: + vpaddd L$chacha20_consts(%rip),%ymm3,%ymm3 + vpaddd 0+64(%rbp),%ymm7,%ymm7 + vpaddd 0+96(%rbp),%ymm11,%ymm11 + vpaddd 0+256(%rbp),%ymm15,%ymm15 + vpaddd L$chacha20_consts(%rip),%ymm2,%ymm2 + vpaddd 0+64(%rbp),%ymm6,%ymm6 + vpaddd 0+96(%rbp),%ymm10,%ymm10 + vpaddd 0+224(%rbp),%ymm14,%ymm14 + vpaddd L$chacha20_consts(%rip),%ymm1,%ymm1 + vpaddd 0+64(%rbp),%ymm5,%ymm5 + vpaddd 0+96(%rbp),%ymm9,%ymm9 + vpaddd 0+192(%rbp),%ymm13,%ymm13 + vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 0+64(%rbp),%ymm4,%ymm4 + vpaddd 0+96(%rbp),%ymm8,%ymm8 + vpaddd 0+160(%rbp),%ymm12,%ymm12 + + vmovdqa %ymm0,0+128(%rbp) + vperm2i128 $0x02,%ymm3,%ymm7,%ymm0 + vperm2i128 $0x13,%ymm3,%ymm7,%ymm7 + vperm2i128 $0x02,%ymm11,%ymm15,%ymm3 + vperm2i128 $0x13,%ymm11,%ymm15,%ymm11 + vpxor 0+0(%rsi),%ymm0,%ymm0 + vpxor 32+0(%rsi),%ymm3,%ymm3 + vpxor 64+0(%rsi),%ymm7,%ymm7 + vpxor 96+0(%rsi),%ymm11,%ymm11 + vmovdqu %ymm0,0+0(%rdi) + vmovdqu %ymm3,32+0(%rdi) + vmovdqu %ymm7,64+0(%rdi) + vmovdqu %ymm11,96+0(%rdi) + + vmovdqa 0+128(%rbp),%ymm0 + vperm2i128 $0x02,%ymm2,%ymm6,%ymm3 + vperm2i128 $0x13,%ymm2,%ymm6,%ymm6 + vperm2i128 $0x02,%ymm10,%ymm14,%ymm2 + vperm2i128 $0x13,%ymm10,%ymm14,%ymm10 + vpxor 0+128(%rsi),%ymm3,%ymm3 + vpxor 32+128(%rsi),%ymm2,%ymm2 + vpxor 64+128(%rsi),%ymm6,%ymm6 + vpxor 96+128(%rsi),%ymm10,%ymm10 + vmovdqu %ymm3,0+128(%rdi) + vmovdqu %ymm2,32+128(%rdi) + vmovdqu %ymm6,64+128(%rdi) + vmovdqu %ymm10,96+128(%rdi) + vperm2i128 
$0x02,%ymm1,%ymm5,%ymm3 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 + vpxor 0+256(%rsi),%ymm3,%ymm3 + vpxor 32+256(%rsi),%ymm1,%ymm1 + vpxor 64+256(%rsi),%ymm5,%ymm5 + vpxor 96+256(%rsi),%ymm9,%ymm9 + vmovdqu %ymm3,0+256(%rdi) + vmovdqu %ymm1,32+256(%rdi) + vmovdqu %ymm5,64+256(%rdi) + vmovdqu %ymm9,96+256(%rdi) + vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 + vmovdqa %ymm3,%ymm8 + + leaq 384(%rsi),%rsi + leaq 384(%rdi),%rdi + subq $384,%rbx +L$open_avx2_tail_128_xor: + cmpq $32,%rbx + jb L$open_avx2_tail_32_xor + subq $32,%rbx + vpxor (%rsi),%ymm0,%ymm0 + vmovdqu %ymm0,(%rdi) + leaq 32(%rsi),%rsi + leaq 32(%rdi),%rdi + vmovdqa %ymm4,%ymm0 + vmovdqa %ymm8,%ymm4 + vmovdqa %ymm12,%ymm8 + jmp L$open_avx2_tail_128_xor +L$open_avx2_tail_32_xor: + cmpq $16,%rbx + vmovdqa %xmm0,%xmm1 + jb L$open_avx2_exit + subq $16,%rbx + + vpxor (%rsi),%xmm0,%xmm1 + vmovdqu %xmm1,(%rdi) + leaq 16(%rsi),%rsi + leaq 16(%rdi),%rdi + vperm2i128 $0x11,%ymm0,%ymm0,%ymm0 + vmovdqa %xmm0,%xmm1 +L$open_avx2_exit: + vzeroupper + jmp L$open_sse_tail_16 + +L$open_avx2_192: + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm0,%ymm2 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm8,%ymm9 + vmovdqa %ymm8,%ymm10 + vpaddd L$avx2_inc(%rip),%ymm12,%ymm13 + vmovdqa %ymm12,%ymm11 + vmovdqa %ymm13,%ymm15 + movq $10,%r10 +L$open_avx2_192_rounds: + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb L$rol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb L$rol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm5,%ymm5,%ymm5 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb L$rol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb L$rol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor 
%ymm3,%ymm5,%ymm5 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm5,%ymm5,%ymm5 + + decq %r10 + jne L$open_avx2_192_rounds + vpaddd %ymm2,%ymm0,%ymm0 + vpaddd %ymm2,%ymm1,%ymm1 + vpaddd %ymm6,%ymm4,%ymm4 + vpaddd %ymm6,%ymm5,%ymm5 + vpaddd %ymm10,%ymm8,%ymm8 + vpaddd %ymm10,%ymm9,%ymm9 + vpaddd %ymm11,%ymm12,%ymm12 + vpaddd %ymm15,%ymm13,%ymm13 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm3 + + vpand L$clamp(%rip),%ymm3,%ymm3 + vmovdqa %ymm3,0+0(%rbp) + + vperm2i128 $0x13,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x02,%ymm1,%ymm5,%ymm8 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm12 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm5 +L$open_avx2_short: + movq %r8,%r8 + call poly_hash_ad_internal +L$open_avx2_short_hash_and_xor_loop: + cmpq $32,%rbx + jb L$open_avx2_short_tail_32 + subq $32,%rbx + addq 0+0(%rsi),%r10 + adcq 8+0(%rsi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + addq 0+16(%rsi),%r10 + adcq 8+16(%rsi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + + vpxor (%rsi),%ymm0,%ymm0 + vmovdqu %ymm0,(%rdi) + leaq 32(%rsi),%rsi + leaq 32(%rdi),%rdi + + vmovdqa %ymm4,%ymm0 + vmovdqa %ymm8,%ymm4 + vmovdqa %ymm12,%ymm8 + vmovdqa %ymm1,%ymm12 + vmovdqa %ymm5,%ymm1 + vmovdqa %ymm9,%ymm5 + vmovdqa %ymm13,%ymm9 + vmovdqa %ymm2,%ymm13 + vmovdqa %ymm6,%ymm2 + jmp L$open_avx2_short_hash_and_xor_loop +L$open_avx2_short_tail_32: + cmpq $16,%rbx + vmovdqa %xmm0,%xmm1 + jb L$open_avx2_short_tail_32_exit + subq $16,%rbx + addq 0+0(%rsi),%r10 + adcq 8+0(%rsi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + vpxor (%rsi),%xmm0,%xmm3 + vmovdqu %xmm3,(%rdi) + leaq 16(%rsi),%rsi + leaq 16(%rdi),%rdi + vextracti128 $1,%ymm0,%xmm1 +L$open_avx2_short_tail_32_exit: + 
vzeroupper + jmp L$open_sse_tail_16 + +L$open_avx2_320: + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm0,%ymm2 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm8,%ymm9 + vmovdqa %ymm8,%ymm10 + vpaddd L$avx2_inc(%rip),%ymm12,%ymm13 + vpaddd L$avx2_inc(%rip),%ymm13,%ymm14 + vmovdqa %ymm4,%ymm7 + vmovdqa %ymm8,%ymm11 + vmovdqa %ymm12,0+160(%rbp) + vmovdqa %ymm13,0+192(%rbp) + vmovdqa %ymm14,0+224(%rbp) + movq $10,%r10 +L$open_avx2_320_rounds: + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb L$rol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb L$rol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm5,%ymm5,%ymm5 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb L$rol16(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpsrld $20,%ymm6,%ymm3 + vpslld $12,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb L$rol8(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpslld $7,%ymm6,%ymm3 + vpsrld $25,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpalignr $12,%ymm14,%ymm14,%ymm14 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $4,%ymm6,%ymm6,%ymm6 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb L$rol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb L$rol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm5,%ymm5,%ymm5 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb L$rol16(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpsrld $20,%ymm6,%ymm3 + vpslld $12,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb L$rol8(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpslld 
$7,%ymm6,%ymm3 + vpsrld $25,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpalignr $4,%ymm14,%ymm14,%ymm14 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $12,%ymm6,%ymm6,%ymm6 + + decq %r10 + jne L$open_avx2_320_rounds + vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0 + vpaddd L$chacha20_consts(%rip),%ymm1,%ymm1 + vpaddd L$chacha20_consts(%rip),%ymm2,%ymm2 + vpaddd %ymm7,%ymm4,%ymm4 + vpaddd %ymm7,%ymm5,%ymm5 + vpaddd %ymm7,%ymm6,%ymm6 + vpaddd %ymm11,%ymm8,%ymm8 + vpaddd %ymm11,%ymm9,%ymm9 + vpaddd %ymm11,%ymm10,%ymm10 + vpaddd 0+160(%rbp),%ymm12,%ymm12 + vpaddd 0+192(%rbp),%ymm13,%ymm13 + vpaddd 0+224(%rbp),%ymm14,%ymm14 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm3 + + vpand L$clamp(%rip),%ymm3,%ymm3 + vmovdqa %ymm3,0+0(%rbp) + + vperm2i128 $0x13,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x02,%ymm1,%ymm5,%ymm8 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm12 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm5 + vperm2i128 $0x02,%ymm2,%ymm6,%ymm9 + vperm2i128 $0x02,%ymm10,%ymm14,%ymm13 + vperm2i128 $0x13,%ymm2,%ymm6,%ymm2 + vperm2i128 $0x13,%ymm10,%ymm14,%ymm6 + jmp L$open_avx2_short + + + + +.globl _chacha20_poly1305_seal_avx2 +.private_extern _chacha20_poly1305_seal_avx2 + +.p2align 6 +_chacha20_poly1305_seal_avx2: + +_CET_ENDBR + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + + + pushq %r9 + + subq $288 + 0 + 32,%rsp + + leaq 32(%rsp),%rbp + andq $-32,%rbp + + movq 56(%r9),%rbx + addq %rdx,%rbx + movq %r8,0+0+32(%rbp) + movq %rbx,8+0+32(%rbp) + movq %rdx,%rbx + + vzeroupper + vmovdqa L$chacha20_consts(%rip),%ymm0 + vbroadcasti128 0(%r9),%ymm4 + vbroadcasti128 16(%r9),%ymm8 + vbroadcasti128 32(%r9),%ymm12 + vpaddd L$avx2_init(%rip),%ymm12,%ymm12 + cmpq $192,%rbx + jbe L$seal_avx2_192 + cmpq $320,%rbx + jbe L$seal_avx2_320 + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm0,%ymm2 + vmovdqa %ymm0,%ymm3 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm4,%ymm7 + vmovdqa %ymm4,0+64(%rbp) + vmovdqa %ymm8,%ymm9 + vmovdqa %ymm8,%ymm10 + vmovdqa %ymm8,%ymm11 + vmovdqa %ymm8,0+96(%rbp) + vmovdqa %ymm12,%ymm15 + vpaddd L$avx2_inc(%rip),%ymm15,%ymm14 + vpaddd L$avx2_inc(%rip),%ymm14,%ymm13 + vpaddd L$avx2_inc(%rip),%ymm13,%ymm12 + vmovdqa %ymm12,0+160(%rbp) + vmovdqa %ymm13,0+192(%rbp) + vmovdqa %ymm14,0+224(%rbp) + vmovdqa %ymm15,0+256(%rbp) + movq $10,%r10 +L$seal_avx2_init_rounds: + vmovdqa %ymm8,0+128(%rbp) + vmovdqa L$rol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa L$rol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor 
%ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $25,%ymm7,%ymm8 + vpslld $32-25,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 0+128(%rbp),%ymm8 + vpalignr $4,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $12,%ymm15,%ymm15,%ymm15 + vpalignr $4,%ymm6,%ymm6,%ymm6 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $12,%ymm14,%ymm14,%ymm14 + vpalignr $4,%ymm5,%ymm5,%ymm5 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vmovdqa %ymm8,0+128(%rbp) + vmovdqa L$rol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa L$rol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $25,%ymm7,%ymm8 + vpslld $32-25,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 0+128(%rbp),%ymm8 + vpalignr $12,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $4,%ymm15,%ymm15,%ymm15 + vpalignr $12,%ymm6,%ymm6,%ymm6 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $4,%ymm14,%ymm14,%ymm14 + vpalignr $12,%ymm5,%ymm5,%ymm5 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm12,%ymm12,%ymm12 + + decq %r10 + jnz 
L$seal_avx2_init_rounds + vpaddd L$chacha20_consts(%rip),%ymm3,%ymm3 + vpaddd 0+64(%rbp),%ymm7,%ymm7 + vpaddd 0+96(%rbp),%ymm11,%ymm11 + vpaddd 0+256(%rbp),%ymm15,%ymm15 + vpaddd L$chacha20_consts(%rip),%ymm2,%ymm2 + vpaddd 0+64(%rbp),%ymm6,%ymm6 + vpaddd 0+96(%rbp),%ymm10,%ymm10 + vpaddd 0+224(%rbp),%ymm14,%ymm14 + vpaddd L$chacha20_consts(%rip),%ymm1,%ymm1 + vpaddd 0+64(%rbp),%ymm5,%ymm5 + vpaddd 0+96(%rbp),%ymm9,%ymm9 + vpaddd 0+192(%rbp),%ymm13,%ymm13 + vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 0+64(%rbp),%ymm4,%ymm4 + vpaddd 0+96(%rbp),%ymm8,%ymm8 + vpaddd 0+160(%rbp),%ymm12,%ymm12 + + vperm2i128 $0x13,%ymm11,%ymm15,%ymm11 + vperm2i128 $0x02,%ymm3,%ymm7,%ymm15 + vperm2i128 $0x13,%ymm3,%ymm7,%ymm3 + vpand L$clamp(%rip),%ymm15,%ymm15 + vmovdqa %ymm15,0+0(%rbp) + movq %r8,%r8 + call poly_hash_ad_internal + + vpxor 0(%rsi),%ymm3,%ymm3 + vpxor 32(%rsi),%ymm11,%ymm11 + vmovdqu %ymm3,0(%rdi) + vmovdqu %ymm11,32(%rdi) + vperm2i128 $0x02,%ymm2,%ymm6,%ymm15 + vperm2i128 $0x13,%ymm2,%ymm6,%ymm6 + vperm2i128 $0x02,%ymm10,%ymm14,%ymm2 + vperm2i128 $0x13,%ymm10,%ymm14,%ymm10 + vpxor 0+64(%rsi),%ymm15,%ymm15 + vpxor 32+64(%rsi),%ymm2,%ymm2 + vpxor 64+64(%rsi),%ymm6,%ymm6 + vpxor 96+64(%rsi),%ymm10,%ymm10 + vmovdqu %ymm15,0+64(%rdi) + vmovdqu %ymm2,32+64(%rdi) + vmovdqu %ymm6,64+64(%rdi) + vmovdqu %ymm10,96+64(%rdi) + vperm2i128 $0x02,%ymm1,%ymm5,%ymm15 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 + vpxor 0+192(%rsi),%ymm15,%ymm15 + vpxor 32+192(%rsi),%ymm1,%ymm1 + vpxor 64+192(%rsi),%ymm5,%ymm5 + vpxor 96+192(%rsi),%ymm9,%ymm9 + vmovdqu %ymm15,0+192(%rdi) + vmovdqu %ymm1,32+192(%rdi) + vmovdqu %ymm5,64+192(%rdi) + vmovdqu %ymm9,96+192(%rdi) + vperm2i128 $0x13,%ymm0,%ymm4,%ymm15 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 + vmovdqa %ymm15,%ymm8 + + leaq 320(%rsi),%rsi + subq $320,%rbx + movq $320,%rcx + cmpq $128,%rbx + jbe L$seal_avx2_short_hash_remainder + vpxor 0(%rsi),%ymm0,%ymm0 + vpxor 32(%rsi),%ymm4,%ymm4 + vpxor 64(%rsi),%ymm8,%ymm8 + vpxor 96(%rsi),%ymm12,%ymm12 + vmovdqu %ymm0,320(%rdi) + vmovdqu %ymm4,352(%rdi) + vmovdqu %ymm8,384(%rdi) + vmovdqu %ymm12,416(%rdi) + leaq 128(%rsi),%rsi + subq $128,%rbx + movq $8,%rcx + movq $2,%r8 + cmpq $128,%rbx + jbe L$seal_avx2_tail_128 + cmpq $256,%rbx + jbe L$seal_avx2_tail_256 + cmpq $384,%rbx + jbe L$seal_avx2_tail_384 + cmpq $512,%rbx + jbe L$seal_avx2_tail_512 + vmovdqa L$chacha20_consts(%rip),%ymm0 + vmovdqa 0+64(%rbp),%ymm4 + vmovdqa 0+96(%rbp),%ymm8 + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm8,%ymm9 + vmovdqa %ymm0,%ymm2 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm8,%ymm10 + vmovdqa %ymm0,%ymm3 + vmovdqa %ymm4,%ymm7 + vmovdqa %ymm8,%ymm11 + vmovdqa L$avx2_inc(%rip),%ymm12 + vpaddd 0+160(%rbp),%ymm12,%ymm15 + vpaddd %ymm15,%ymm12,%ymm14 + vpaddd %ymm14,%ymm12,%ymm13 + vpaddd %ymm13,%ymm12,%ymm12 + vmovdqa %ymm15,0+256(%rbp) + vmovdqa %ymm14,0+224(%rbp) + vmovdqa %ymm13,0+192(%rbp) + vmovdqa %ymm12,0+160(%rbp) + vmovdqa %ymm8,0+128(%rbp) + vmovdqa L$rol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + 
vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa L$rol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $25,%ymm7,%ymm8 + vpslld $32-25,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 0+128(%rbp),%ymm8 + vpalignr $4,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $12,%ymm15,%ymm15,%ymm15 + vpalignr $4,%ymm6,%ymm6,%ymm6 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $12,%ymm14,%ymm14,%ymm14 + vpalignr $4,%ymm5,%ymm5,%ymm5 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vmovdqa %ymm8,0+128(%rbp) + vmovdqa L$rol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa L$rol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $25,%ymm7,%ymm8 + vpslld $32-25,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld 
$25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 0+128(%rbp),%ymm8 + vpalignr $12,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $4,%ymm15,%ymm15,%ymm15 + vpalignr $12,%ymm6,%ymm6,%ymm6 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $4,%ymm14,%ymm14,%ymm14 + vpalignr $12,%ymm5,%ymm5,%ymm5 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vmovdqa %ymm8,0+128(%rbp) + vmovdqa L$rol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa L$rol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + + subq $16,%rdi + movq $9,%rcx + jmp L$seal_avx2_main_loop_rounds_entry +.p2align 5 +L$seal_avx2_main_loop: + vmovdqa L$chacha20_consts(%rip),%ymm0 + vmovdqa 0+64(%rbp),%ymm4 + vmovdqa 0+96(%rbp),%ymm8 + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm8,%ymm9 + vmovdqa %ymm0,%ymm2 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm8,%ymm10 + vmovdqa %ymm0,%ymm3 + vmovdqa %ymm4,%ymm7 + vmovdqa %ymm8,%ymm11 + vmovdqa L$avx2_inc(%rip),%ymm12 + vpaddd 0+160(%rbp),%ymm12,%ymm15 + vpaddd %ymm15,%ymm12,%ymm14 + vpaddd %ymm14,%ymm12,%ymm13 + vpaddd %ymm13,%ymm12,%ymm12 + vmovdqa %ymm15,0+256(%rbp) + vmovdqa %ymm14,0+224(%rbp) + vmovdqa %ymm13,0+192(%rbp) + vmovdqa %ymm12,0+160(%rbp) + + movq $10,%rcx +.p2align 5 +L$seal_avx2_main_loop_rounds: + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + vmovdqa %ymm8,0+128(%rbp) + vmovdqa L$rol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $20,%ymm7,%ymm8 + 
vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + addq %rax,%r15 + adcq %rdx,%r9 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa L$rol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + +L$seal_avx2_main_loop_rounds_entry: + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + addq 0+16(%rdi),%r10 + adcq 8+16(%rdi),%r11 + adcq $1,%r12 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $25,%ymm7,%ymm8 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + vpslld $32-25,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 0+128(%rbp),%ymm8 + vpalignr $4,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $12,%ymm15,%ymm15,%ymm15 + vpalignr $4,%ymm6,%ymm6,%ymm6 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $12,%ymm14,%ymm14,%ymm14 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + vpalignr $4,%ymm5,%ymm5,%ymm5 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vmovdqa %ymm8,0+128(%rbp) + vmovdqa L$rol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + addq %rax,%r15 + adcq %rdx,%r9 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + addq 0+32(%rdi),%r10 + adcq 8+32(%rdi),%r11 + adcq $1,%r12 + + leaq 48(%rdi),%rdi + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld 
$20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa L$rol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $25,%ymm7,%ymm8 + vpslld $32-25,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + addq %rax,%r15 + adcq %rdx,%r9 + vpsrld $25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 0+128(%rbp),%ymm8 + vpalignr $12,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $4,%ymm15,%ymm15,%ymm15 + vpalignr $12,%ymm6,%ymm6,%ymm6 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $4,%ymm14,%ymm14,%ymm14 + vpalignr $12,%ymm5,%ymm5,%ymm5 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpalignr $8,%ymm8,%ymm8,%ymm8 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vpalignr $4,%ymm12,%ymm12,%ymm12 + + decq %rcx + jne L$seal_avx2_main_loop_rounds + vpaddd L$chacha20_consts(%rip),%ymm3,%ymm3 + vpaddd 0+64(%rbp),%ymm7,%ymm7 + vpaddd 0+96(%rbp),%ymm11,%ymm11 + vpaddd 0+256(%rbp),%ymm15,%ymm15 + vpaddd L$chacha20_consts(%rip),%ymm2,%ymm2 + vpaddd 0+64(%rbp),%ymm6,%ymm6 + vpaddd 0+96(%rbp),%ymm10,%ymm10 + vpaddd 0+224(%rbp),%ymm14,%ymm14 + vpaddd L$chacha20_consts(%rip),%ymm1,%ymm1 + vpaddd 0+64(%rbp),%ymm5,%ymm5 + vpaddd 0+96(%rbp),%ymm9,%ymm9 + vpaddd 0+192(%rbp),%ymm13,%ymm13 + vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 0+64(%rbp),%ymm4,%ymm4 + vpaddd 0+96(%rbp),%ymm8,%ymm8 + vpaddd 0+160(%rbp),%ymm12,%ymm12 + + vmovdqa %ymm0,0+128(%rbp) + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + addq 0+16(%rdi),%r10 + adcq 8+16(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 
+ imulq %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 32(%rdi),%rdi + vperm2i128 $0x02,%ymm3,%ymm7,%ymm0 + vperm2i128 $0x13,%ymm3,%ymm7,%ymm7 + vperm2i128 $0x02,%ymm11,%ymm15,%ymm3 + vperm2i128 $0x13,%ymm11,%ymm15,%ymm11 + vpxor 0+0(%rsi),%ymm0,%ymm0 + vpxor 32+0(%rsi),%ymm3,%ymm3 + vpxor 64+0(%rsi),%ymm7,%ymm7 + vpxor 96+0(%rsi),%ymm11,%ymm11 + vmovdqu %ymm0,0+0(%rdi) + vmovdqu %ymm3,32+0(%rdi) + vmovdqu %ymm7,64+0(%rdi) + vmovdqu %ymm11,96+0(%rdi) + + vmovdqa 0+128(%rbp),%ymm0 + vperm2i128 $0x02,%ymm2,%ymm6,%ymm3 + vperm2i128 $0x13,%ymm2,%ymm6,%ymm6 + vperm2i128 $0x02,%ymm10,%ymm14,%ymm2 + vperm2i128 $0x13,%ymm10,%ymm14,%ymm10 + vpxor 0+128(%rsi),%ymm3,%ymm3 + vpxor 32+128(%rsi),%ymm2,%ymm2 + vpxor 64+128(%rsi),%ymm6,%ymm6 + vpxor 96+128(%rsi),%ymm10,%ymm10 + vmovdqu %ymm3,0+128(%rdi) + vmovdqu %ymm2,32+128(%rdi) + vmovdqu %ymm6,64+128(%rdi) + vmovdqu %ymm10,96+128(%rdi) + vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 + vpxor 0+256(%rsi),%ymm3,%ymm3 + vpxor 32+256(%rsi),%ymm1,%ymm1 + vpxor 64+256(%rsi),%ymm5,%ymm5 + vpxor 96+256(%rsi),%ymm9,%ymm9 + vmovdqu %ymm3,0+256(%rdi) + vmovdqu %ymm1,32+256(%rdi) + vmovdqu %ymm5,64+256(%rdi) + vmovdqu %ymm9,96+256(%rdi) + vperm2i128 $0x02,%ymm0,%ymm4,%ymm3 + vperm2i128 $0x13,%ymm0,%ymm4,%ymm4 + vperm2i128 $0x02,%ymm8,%ymm12,%ymm0 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm8 + vpxor 0+384(%rsi),%ymm3,%ymm3 + vpxor 32+384(%rsi),%ymm0,%ymm0 + vpxor 64+384(%rsi),%ymm4,%ymm4 + vpxor 96+384(%rsi),%ymm8,%ymm8 + vmovdqu %ymm3,0+384(%rdi) + vmovdqu %ymm0,32+384(%rdi) + vmovdqu %ymm4,64+384(%rdi) + vmovdqu %ymm8,96+384(%rdi) + + leaq 512(%rsi),%rsi + subq $512,%rbx + cmpq $512,%rbx + jg L$seal_avx2_main_loop + + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + addq 0+16(%rdi),%r10 + adcq 8+16(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 32(%rdi),%rdi + movq $10,%rcx + xorq %r8,%r8 + + cmpq $384,%rbx + ja L$seal_avx2_tail_512 + cmpq $256,%rbx + ja L$seal_avx2_tail_384 + cmpq $128,%rbx + ja L$seal_avx2_tail_256 + +L$seal_avx2_tail_128: + vmovdqa L$chacha20_consts(%rip),%ymm0 + vmovdqa 0+64(%rbp),%ymm4 + vmovdqa 0+96(%rbp),%ymm8 + vmovdqa L$avx2_inc(%rip),%ymm12 + vpaddd 
0+160(%rbp),%ymm12,%ymm12 + vmovdqa %ymm12,0+160(%rbp) + +L$seal_avx2_tail_128_rounds_and_3xhash: + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rdi),%rdi +L$seal_avx2_tail_128_rounds_and_2xhash: + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm4,%ymm4,%ymm4 + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm4,%ymm4,%ymm4 + addq 0+16(%rdi),%r10 + adcq 8+16(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 32(%rdi),%rdi + decq %rcx + jg L$seal_avx2_tail_128_rounds_and_3xhash + decq %r8 + jge L$seal_avx2_tail_128_rounds_and_2xhash + vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 0+64(%rbp),%ymm4,%ymm4 + vpaddd 0+96(%rbp),%ymm8,%ymm8 + vpaddd 0+160(%rbp),%ymm12,%ymm12 + vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 + vmovdqa %ymm3,%ymm8 + + jmp L$seal_avx2_short_loop + +L$seal_avx2_tail_256: + 
vmovdqa L$chacha20_consts(%rip),%ymm0 + vmovdqa 0+64(%rbp),%ymm4 + vmovdqa 0+96(%rbp),%ymm8 + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm8,%ymm9 + vmovdqa L$avx2_inc(%rip),%ymm12 + vpaddd 0+160(%rbp),%ymm12,%ymm13 + vpaddd %ymm13,%ymm12,%ymm12 + vmovdqa %ymm12,0+160(%rbp) + vmovdqa %ymm13,0+192(%rbp) + +L$seal_avx2_tail_256_rounds_and_3xhash: + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rdi),%rdi +L$seal_avx2_tail_256_rounds_and_2xhash: + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb L$rol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb L$rol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm5,%ymm5,%ymm5 + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb L$rol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor 
%ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb L$rol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm5,%ymm5,%ymm5 + addq 0+16(%rdi),%r10 + adcq 8+16(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 32(%rdi),%rdi + decq %rcx + jg L$seal_avx2_tail_256_rounds_and_3xhash + decq %r8 + jge L$seal_avx2_tail_256_rounds_and_2xhash + vpaddd L$chacha20_consts(%rip),%ymm1,%ymm1 + vpaddd 0+64(%rbp),%ymm5,%ymm5 + vpaddd 0+96(%rbp),%ymm9,%ymm9 + vpaddd 0+192(%rbp),%ymm13,%ymm13 + vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 0+64(%rbp),%ymm4,%ymm4 + vpaddd 0+96(%rbp),%ymm8,%ymm8 + vpaddd 0+160(%rbp),%ymm12,%ymm12 + vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 + vpxor 0+0(%rsi),%ymm3,%ymm3 + vpxor 32+0(%rsi),%ymm1,%ymm1 + vpxor 64+0(%rsi),%ymm5,%ymm5 + vpxor 96+0(%rsi),%ymm9,%ymm9 + vmovdqu %ymm3,0+0(%rdi) + vmovdqu %ymm1,32+0(%rdi) + vmovdqu %ymm5,64+0(%rdi) + vmovdqu %ymm9,96+0(%rdi) + vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 + vmovdqa %ymm3,%ymm8 + + movq $128,%rcx + leaq 128(%rsi),%rsi + subq $128,%rbx + jmp L$seal_avx2_short_hash_remainder + +L$seal_avx2_tail_384: + vmovdqa L$chacha20_consts(%rip),%ymm0 + vmovdqa 0+64(%rbp),%ymm4 + vmovdqa 0+96(%rbp),%ymm8 + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm8,%ymm9 + vmovdqa %ymm0,%ymm2 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm8,%ymm10 + vmovdqa L$avx2_inc(%rip),%ymm12 + vpaddd 0+160(%rbp),%ymm12,%ymm14 + vpaddd %ymm14,%ymm12,%ymm13 + vpaddd %ymm13,%ymm12,%ymm12 + vmovdqa %ymm12,0+160(%rbp) + vmovdqa %ymm13,0+192(%rbp) + vmovdqa %ymm14,0+224(%rbp) + +L$seal_avx2_tail_384_rounds_and_3xhash: + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rdi),%rdi +L$seal_avx2_tail_384_rounds_and_2xhash: + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + 
vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb L$rol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb L$rol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm5,%ymm5,%ymm5 + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb L$rol16(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpsrld $20,%ymm6,%ymm3 + vpslld $12,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb L$rol8(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpslld $7,%ymm6,%ymm3 + vpsrld $25,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpalignr $12,%ymm14,%ymm14,%ymm14 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $4,%ymm6,%ymm6,%ymm6 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm4,%ymm4,%ymm4 + addq 0+16(%rdi),%r10 + adcq 8+16(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb L$rol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + 
vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb L$rol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm5,%ymm5,%ymm5 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb L$rol16(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpsrld $20,%ymm6,%ymm3 + vpslld $12,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb L$rol8(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpslld $7,%ymm6,%ymm3 + vpsrld $25,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpalignr $4,%ymm14,%ymm14,%ymm14 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $12,%ymm6,%ymm6,%ymm6 + + leaq 32(%rdi),%rdi + decq %rcx + jg L$seal_avx2_tail_384_rounds_and_3xhash + decq %r8 + jge L$seal_avx2_tail_384_rounds_and_2xhash + vpaddd L$chacha20_consts(%rip),%ymm2,%ymm2 + vpaddd 0+64(%rbp),%ymm6,%ymm6 + vpaddd 0+96(%rbp),%ymm10,%ymm10 + vpaddd 0+224(%rbp),%ymm14,%ymm14 + vpaddd L$chacha20_consts(%rip),%ymm1,%ymm1 + vpaddd 0+64(%rbp),%ymm5,%ymm5 + vpaddd 0+96(%rbp),%ymm9,%ymm9 + vpaddd 0+192(%rbp),%ymm13,%ymm13 + vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 0+64(%rbp),%ymm4,%ymm4 + vpaddd 0+96(%rbp),%ymm8,%ymm8 + vpaddd 0+160(%rbp),%ymm12,%ymm12 + vperm2i128 $0x02,%ymm2,%ymm6,%ymm3 + vperm2i128 $0x13,%ymm2,%ymm6,%ymm6 + vperm2i128 $0x02,%ymm10,%ymm14,%ymm2 + vperm2i128 $0x13,%ymm10,%ymm14,%ymm10 + vpxor 0+0(%rsi),%ymm3,%ymm3 + vpxor 32+0(%rsi),%ymm2,%ymm2 + vpxor 64+0(%rsi),%ymm6,%ymm6 + vpxor 96+0(%rsi),%ymm10,%ymm10 + vmovdqu %ymm3,0+0(%rdi) + vmovdqu %ymm2,32+0(%rdi) + vmovdqu %ymm6,64+0(%rdi) + vmovdqu %ymm10,96+0(%rdi) + vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 + vpxor 0+128(%rsi),%ymm3,%ymm3 + vpxor 32+128(%rsi),%ymm1,%ymm1 + vpxor 64+128(%rsi),%ymm5,%ymm5 + vpxor 96+128(%rsi),%ymm9,%ymm9 + vmovdqu %ymm3,0+128(%rdi) + vmovdqu %ymm1,32+128(%rdi) + vmovdqu %ymm5,64+128(%rdi) + vmovdqu %ymm9,96+128(%rdi) + vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 + vmovdqa %ymm3,%ymm8 + + movq $256,%rcx + leaq 256(%rsi),%rsi + subq $256,%rbx + jmp L$seal_avx2_short_hash_remainder + +L$seal_avx2_tail_512: + vmovdqa L$chacha20_consts(%rip),%ymm0 + vmovdqa 0+64(%rbp),%ymm4 + vmovdqa 0+96(%rbp),%ymm8 + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm8,%ymm9 + vmovdqa %ymm0,%ymm2 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm8,%ymm10 + vmovdqa %ymm0,%ymm3 + vmovdqa %ymm4,%ymm7 + vmovdqa %ymm8,%ymm11 + vmovdqa L$avx2_inc(%rip),%ymm12 + vpaddd 0+160(%rbp),%ymm12,%ymm15 + vpaddd %ymm15,%ymm12,%ymm14 + vpaddd %ymm14,%ymm12,%ymm13 + vpaddd %ymm13,%ymm12,%ymm12 + vmovdqa %ymm15,0+256(%rbp) + vmovdqa %ymm14,0+224(%rbp) + vmovdqa %ymm13,0+192(%rbp) + vmovdqa %ymm12,0+160(%rbp) + +L$seal_avx2_tail_512_rounds_and_3xhash: + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq 
%r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rdi),%rdi +L$seal_avx2_tail_512_rounds_and_2xhash: + vmovdqa %ymm8,0+128(%rbp) + vmovdqa L$rol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa L$rol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $25,%ymm7,%ymm8 + vpslld $32-25,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + vpsrld $25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 0+128(%rbp),%ymm8 + vpalignr $4,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $12,%ymm15,%ymm15,%ymm15 + vpalignr $4,%ymm6,%ymm6,%ymm6 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $12,%ymm14,%ymm14,%ymm14 + vpalignr $4,%ymm5,%ymm5,%ymm5 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $4,%ymm4,%ymm4,%ymm4 + addq %rax,%r15 + adcq %rdx,%r9 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vmovdqa %ymm8,0+128(%rbp) + vmovdqa L$rol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 
0+128(%rbp),%ymm12,%ymm8 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa L$rol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + addq 0+16(%rdi),%r10 + adcq 8+16(%rdi),%r11 + adcq $1,%r12 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $25,%ymm7,%ymm8 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + vpslld $32-25,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 0+128(%rbp),%ymm8 + vpalignr $12,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $4,%ymm15,%ymm15,%ymm15 + vpalignr $12,%ymm6,%ymm6,%ymm6 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $4,%ymm14,%ymm14,%ymm14 + vpalignr $12,%ymm5,%ymm5,%ymm5 + vpalignr $8,%ymm9,%ymm9,%ymm9 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm12,%ymm12,%ymm12 + + + + + + + + + + + + + + + + + addq %rax,%r15 + adcq %rdx,%r9 + + + + + + + + + + + + + + + + + + + + + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 32(%rdi),%rdi + decq %rcx + jg L$seal_avx2_tail_512_rounds_and_3xhash + decq %r8 + jge L$seal_avx2_tail_512_rounds_and_2xhash + vpaddd L$chacha20_consts(%rip),%ymm3,%ymm3 + vpaddd 0+64(%rbp),%ymm7,%ymm7 + vpaddd 0+96(%rbp),%ymm11,%ymm11 + vpaddd 0+256(%rbp),%ymm15,%ymm15 + vpaddd L$chacha20_consts(%rip),%ymm2,%ymm2 + vpaddd 0+64(%rbp),%ymm6,%ymm6 + vpaddd 0+96(%rbp),%ymm10,%ymm10 + vpaddd 0+224(%rbp),%ymm14,%ymm14 + vpaddd L$chacha20_consts(%rip),%ymm1,%ymm1 + vpaddd 0+64(%rbp),%ymm5,%ymm5 + vpaddd 0+96(%rbp),%ymm9,%ymm9 + vpaddd 0+192(%rbp),%ymm13,%ymm13 + vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 0+64(%rbp),%ymm4,%ymm4 + vpaddd 0+96(%rbp),%ymm8,%ymm8 + vpaddd 0+160(%rbp),%ymm12,%ymm12 + + vmovdqa %ymm0,0+128(%rbp) + vperm2i128 $0x02,%ymm3,%ymm7,%ymm0 + vperm2i128 
$0x13,%ymm3,%ymm7,%ymm7 + vperm2i128 $0x02,%ymm11,%ymm15,%ymm3 + vperm2i128 $0x13,%ymm11,%ymm15,%ymm11 + vpxor 0+0(%rsi),%ymm0,%ymm0 + vpxor 32+0(%rsi),%ymm3,%ymm3 + vpxor 64+0(%rsi),%ymm7,%ymm7 + vpxor 96+0(%rsi),%ymm11,%ymm11 + vmovdqu %ymm0,0+0(%rdi) + vmovdqu %ymm3,32+0(%rdi) + vmovdqu %ymm7,64+0(%rdi) + vmovdqu %ymm11,96+0(%rdi) + + vmovdqa 0+128(%rbp),%ymm0 + vperm2i128 $0x02,%ymm2,%ymm6,%ymm3 + vperm2i128 $0x13,%ymm2,%ymm6,%ymm6 + vperm2i128 $0x02,%ymm10,%ymm14,%ymm2 + vperm2i128 $0x13,%ymm10,%ymm14,%ymm10 + vpxor 0+128(%rsi),%ymm3,%ymm3 + vpxor 32+128(%rsi),%ymm2,%ymm2 + vpxor 64+128(%rsi),%ymm6,%ymm6 + vpxor 96+128(%rsi),%ymm10,%ymm10 + vmovdqu %ymm3,0+128(%rdi) + vmovdqu %ymm2,32+128(%rdi) + vmovdqu %ymm6,64+128(%rdi) + vmovdqu %ymm10,96+128(%rdi) + vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 + vpxor 0+256(%rsi),%ymm3,%ymm3 + vpxor 32+256(%rsi),%ymm1,%ymm1 + vpxor 64+256(%rsi),%ymm5,%ymm5 + vpxor 96+256(%rsi),%ymm9,%ymm9 + vmovdqu %ymm3,0+256(%rdi) + vmovdqu %ymm1,32+256(%rdi) + vmovdqu %ymm5,64+256(%rdi) + vmovdqu %ymm9,96+256(%rdi) + vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 + vmovdqa %ymm3,%ymm8 + + movq $384,%rcx + leaq 384(%rsi),%rsi + subq $384,%rbx + jmp L$seal_avx2_short_hash_remainder + +L$seal_avx2_320: + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm0,%ymm2 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm8,%ymm9 + vmovdqa %ymm8,%ymm10 + vpaddd L$avx2_inc(%rip),%ymm12,%ymm13 + vpaddd L$avx2_inc(%rip),%ymm13,%ymm14 + vmovdqa %ymm4,%ymm7 + vmovdqa %ymm8,%ymm11 + vmovdqa %ymm12,0+160(%rbp) + vmovdqa %ymm13,0+192(%rbp) + vmovdqa %ymm14,0+224(%rbp) + movq $10,%r10 +L$seal_avx2_320_rounds: + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb L$rol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb L$rol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm5,%ymm5,%ymm5 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb L$rol16(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpsrld $20,%ymm6,%ymm3 + vpslld $12,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb L$rol8(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpslld $7,%ymm6,%ymm3 + vpsrld $25,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpalignr $12,%ymm14,%ymm14,%ymm14 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $4,%ymm6,%ymm6,%ymm6 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb 
L$rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb L$rol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb L$rol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm5,%ymm5,%ymm5 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb L$rol16(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpsrld $20,%ymm6,%ymm3 + vpslld $12,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb L$rol8(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpslld $7,%ymm6,%ymm3 + vpsrld $25,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpalignr $4,%ymm14,%ymm14,%ymm14 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $12,%ymm6,%ymm6,%ymm6 + + decq %r10 + jne L$seal_avx2_320_rounds + vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0 + vpaddd L$chacha20_consts(%rip),%ymm1,%ymm1 + vpaddd L$chacha20_consts(%rip),%ymm2,%ymm2 + vpaddd %ymm7,%ymm4,%ymm4 + vpaddd %ymm7,%ymm5,%ymm5 + vpaddd %ymm7,%ymm6,%ymm6 + vpaddd %ymm11,%ymm8,%ymm8 + vpaddd %ymm11,%ymm9,%ymm9 + vpaddd %ymm11,%ymm10,%ymm10 + vpaddd 0+160(%rbp),%ymm12,%ymm12 + vpaddd 0+192(%rbp),%ymm13,%ymm13 + vpaddd 0+224(%rbp),%ymm14,%ymm14 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm3 + + vpand L$clamp(%rip),%ymm3,%ymm3 + vmovdqa %ymm3,0+0(%rbp) + + vperm2i128 $0x13,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x02,%ymm1,%ymm5,%ymm8 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm12 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm5 + vperm2i128 $0x02,%ymm2,%ymm6,%ymm9 + vperm2i128 $0x02,%ymm10,%ymm14,%ymm13 + vperm2i128 $0x13,%ymm2,%ymm6,%ymm2 + vperm2i128 $0x13,%ymm10,%ymm14,%ymm6 + jmp L$seal_avx2_short + +L$seal_avx2_192: + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm0,%ymm2 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm8,%ymm9 + vmovdqa %ymm8,%ymm10 + vpaddd L$avx2_inc(%rip),%ymm12,%ymm13 + vmovdqa %ymm12,%ymm11 + vmovdqa %ymm13,%ymm15 + movq $10,%r10 +L$seal_avx2_192_rounds: + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb L$rol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + 
vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb L$rol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm5,%ymm5,%ymm5 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb L$rol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb L$rol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm5,%ymm5,%ymm5 + + decq %r10 + jne L$seal_avx2_192_rounds + vpaddd %ymm2,%ymm0,%ymm0 + vpaddd %ymm2,%ymm1,%ymm1 + vpaddd %ymm6,%ymm4,%ymm4 + vpaddd %ymm6,%ymm5,%ymm5 + vpaddd %ymm10,%ymm8,%ymm8 + vpaddd %ymm10,%ymm9,%ymm9 + vpaddd %ymm11,%ymm12,%ymm12 + vpaddd %ymm15,%ymm13,%ymm13 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm3 + + vpand L$clamp(%rip),%ymm3,%ymm3 + vmovdqa %ymm3,0+0(%rbp) + + vperm2i128 $0x13,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x02,%ymm1,%ymm5,%ymm8 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm12 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm5 +L$seal_avx2_short: + movq %r8,%r8 + call poly_hash_ad_internal + xorq %rcx,%rcx +L$seal_avx2_short_hash_remainder: + cmpq $16,%rcx + jb L$seal_avx2_short_loop + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + subq $16,%rcx + addq $16,%rdi + jmp L$seal_avx2_short_hash_remainder +L$seal_avx2_short_loop: + cmpq $32,%rbx + jb L$seal_avx2_short_tail + subq $32,%rbx + + vpxor (%rsi),%ymm0,%ymm0 + vmovdqu %ymm0,(%rdi) + leaq 32(%rsi),%rsi + + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq 
%rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + addq 0+16(%rdi),%r10 + adcq 8+16(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 32(%rdi),%rdi + + vmovdqa %ymm4,%ymm0 + vmovdqa %ymm8,%ymm4 + vmovdqa %ymm12,%ymm8 + vmovdqa %ymm1,%ymm12 + vmovdqa %ymm5,%ymm1 + vmovdqa %ymm9,%ymm5 + vmovdqa %ymm13,%ymm9 + vmovdqa %ymm2,%ymm13 + vmovdqa %ymm6,%ymm2 + jmp L$seal_avx2_short_loop +L$seal_avx2_short_tail: + cmpq $16,%rbx + jb L$seal_avx2_exit + subq $16,%rbx + vpxor (%rsi),%xmm0,%xmm3 + vmovdqu %xmm3,(%rdi) + leaq 16(%rsi),%rsi + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rdi),%rdi + vextracti128 $1,%ymm0,%xmm0 +L$seal_avx2_exit: + vzeroupper + jmp L$seal_sse_tail_16 + + +#endif diff --git a/ring-0.17.14/pregenerated/chacha20_poly1305_x86_64-nasm.asm b/ring-0.17.14/pregenerated/chacha20_poly1305_x86_64-nasm.asm new file mode 100644 index 0000000000..127dbe30f2 --- /dev/null +++ b/ring-0.17.14/pregenerated/chacha20_poly1305_x86_64-nasm.asm @@ -0,0 +1,9021 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. 
+ +%ifidn __OUTPUT_FORMAT__, win64 +default rel +%define XMMWORD +%define YMMWORD +%define ZMMWORD +%define _CET_ENDBR + +%include "ring_core_generated/prefix_symbols_nasm.inc" +section .rdata rdata align=8 +ALIGN 64 +chacha20_poly1305_constants: +$L$chacha20_consts: + DB 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k' + DB 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k' +$L$rol8: + DB 3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14 + DB 3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14 +$L$rol16: + DB 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13 + DB 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13 +$L$avx2_init: + DD 0,0,0,0 +$L$sse_inc: + DD 1,0,0,0 +$L$avx2_inc: + DD 2,0,0,0,2,0,0,0 +$L$clamp: + DQ 0x0FFFFFFC0FFFFFFF,0x0FFFFFFC0FFFFFFC + DQ 0xFFFFFFFFFFFFFFFF,0xFFFFFFFFFFFFFFFF +ALIGN 16 +$L$and_masks: + DB 0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 + DB 0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 + DB 0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 + DB 0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 + DB 0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 + DB 0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 + DB 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 + DB 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 + DB 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00 + DB 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00 + DB 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00 + DB 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00 + DB 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00 + DB 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00 + DB 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00 + DB 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff +section .text code align=64 + + + +ALIGN 64 +poly_hash_ad_internal: + + + xor r10,r10 + xor r11,r11 + xor r12,r12 + cmp r8,13 + jne NEAR $L$hash_ad_loop +$L$poly_fast_tls_ad: + + mov r10,QWORD[rcx] + mov r11,QWORD[5+rcx] + shr r11,24 + mov r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + ret +$L$hash_ad_loop: + + cmp r8,16 + jb NEAR $L$hash_ad_tail + add r10,QWORD[((0+0))+rcx] + adc r11,QWORD[((8+0))+rcx] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov 
r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea rcx,[16+rcx] + sub r8,16 + jmp NEAR $L$hash_ad_loop +$L$hash_ad_tail: + cmp r8,0 + je NEAR $L$hash_ad_done + + xor r13,r13 + xor r14,r14 + xor r15,r15 + add rcx,r8 +$L$hash_ad_tail_loop: + shld r14,r13,8 + shl r13,8 + movzx r15,BYTE[((-1))+rcx] + xor r13,r15 + dec rcx + dec r8 + jne NEAR $L$hash_ad_tail_loop + + add r10,r13 + adc r11,r14 + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + +$L$hash_ad_done: + ret + + + +global chacha20_poly1305_open_sse41 + +ALIGN 64 +chacha20_poly1305_open_sse41: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_chacha20_poly1305_open_sse41: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + mov r9,QWORD[48+rsp] + + + +_CET_ENDBR + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + + + push r9 + + sub rsp,288 + 160 + 32 + + + lea rbp,[32+rsp] + and rbp,-32 + + movaps XMMWORD[(0+0)+rbp],xmm6 + movaps XMMWORD[(16+0)+rbp],xmm7 + movaps XMMWORD[(32+0)+rbp],xmm8 + movaps XMMWORD[(48+0)+rbp],xmm9 + movaps XMMWORD[(64+0)+rbp],xmm10 + movaps XMMWORD[(80+0)+rbp],xmm11 + movaps XMMWORD[(96+0)+rbp],xmm12 + movaps XMMWORD[(112+0)+rbp],xmm13 + movaps XMMWORD[(128+0)+rbp],xmm14 + movaps XMMWORD[(144+0)+rbp],xmm15 + + mov rbx,rdx + mov QWORD[((0+160+32))+rbp],r8 + mov QWORD[((8+160+32))+rbp],rbx + + cmp rbx,128 + jbe NEAR $L$open_sse_128 + + movdqa xmm0,XMMWORD[$L$chacha20_consts] + movdqu xmm4,XMMWORD[r9] + movdqu xmm8,XMMWORD[16+r9] + movdqu xmm12,XMMWORD[32+r9] + + movdqa xmm7,xmm12 + + movdqa XMMWORD[(160+48)+rbp],xmm4 + movdqa XMMWORD[(160+64)+rbp],xmm8 + movdqa XMMWORD[(160+96)+rbp],xmm12 + mov r10,10 +$L$open_sse_init_rounds: + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol16] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,12 + psrld xmm4,20 + pxor xmm4,xmm3 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol8] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,7 + psrld xmm4,25 + pxor xmm4,xmm3 +DB 102,15,58,15,228,4 +DB 102,69,15,58,15,192,8 +DB 102,69,15,58,15,228,12 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol16] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,12 + psrld xmm4,20 + pxor xmm4,xmm3 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol8] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,7 + psrld xmm4,25 + pxor xmm4,xmm3 +DB 102,15,58,15,228,12 +DB 102,69,15,58,15,192,8 +DB 102,69,15,58,15,228,4 + + dec r10 + jne NEAR $L$open_sse_init_rounds + + paddd xmm0,XMMWORD[$L$chacha20_consts] + paddd xmm4,XMMWORD[((160+48))+rbp] + + pand xmm0,XMMWORD[$L$clamp] + movdqa XMMWORD[(160+0)+rbp],xmm0 + movdqa XMMWORD[(160+16)+rbp],xmm4 + + mov r8,r8 + call poly_hash_ad_internal +$L$open_sse_main_loop: + cmp rbx,16*16 + jb NEAR $L$open_sse_tail + + movdqa 
xmm0,XMMWORD[$L$chacha20_consts] + movdqa xmm4,XMMWORD[((160+48))+rbp] + movdqa xmm8,XMMWORD[((160+64))+rbp] + movdqa xmm1,xmm0 + movdqa xmm5,xmm4 + movdqa xmm9,xmm8 + movdqa xmm2,xmm0 + movdqa xmm6,xmm4 + movdqa xmm10,xmm8 + movdqa xmm3,xmm0 + movdqa xmm7,xmm4 + movdqa xmm11,xmm8 + movdqa xmm15,XMMWORD[((160+96))+rbp] + paddd xmm15,XMMWORD[$L$sse_inc] + movdqa xmm14,xmm15 + paddd xmm14,XMMWORD[$L$sse_inc] + movdqa xmm13,xmm14 + paddd xmm13,XMMWORD[$L$sse_inc] + movdqa xmm12,xmm13 + paddd xmm12,XMMWORD[$L$sse_inc] + movdqa XMMWORD[(160+96)+rbp],xmm12 + movdqa XMMWORD[(160+112)+rbp],xmm13 + movdqa XMMWORD[(160+128)+rbp],xmm14 + movdqa XMMWORD[(160+144)+rbp],xmm15 + + + + mov rcx,4 + mov r8,rsi +$L$open_sse_main_loop_rounds: + movdqa XMMWORD[(160+80)+rbp],xmm8 + movdqa xmm8,XMMWORD[$L$rol16] + paddd xmm3,xmm7 + paddd xmm2,xmm6 + paddd xmm1,xmm5 + paddd xmm0,xmm4 + pxor xmm15,xmm3 + pxor xmm14,xmm2 + pxor xmm13,xmm1 + pxor xmm12,xmm0 +DB 102,69,15,56,0,248 +DB 102,69,15,56,0,240 +DB 102,69,15,56,0,232 +DB 102,69,15,56,0,224 + movdqa xmm8,XMMWORD[((160+80))+rbp] + paddd xmm11,xmm15 + paddd xmm10,xmm14 + paddd xmm9,xmm13 + paddd xmm8,xmm12 + pxor xmm7,xmm11 + add r10,QWORD[((0+0))+r8] + adc r11,QWORD[((8+0))+r8] + adc r12,1 + + lea r8,[16+r8] + pxor xmm6,xmm10 + pxor xmm5,xmm9 + pxor xmm4,xmm8 + movdqa XMMWORD[(160+80)+rbp],xmm8 + movdqa xmm8,xmm7 + psrld xmm8,20 + pslld xmm7,32-20 + pxor xmm7,xmm8 + movdqa xmm8,xmm6 + psrld xmm8,20 + pslld xmm6,32-20 + pxor xmm6,xmm8 + movdqa xmm8,xmm5 + psrld xmm8,20 + pslld xmm5,32-20 + pxor xmm5,xmm8 + movdqa xmm8,xmm4 + psrld xmm8,20 + pslld xmm4,32-20 + pxor xmm4,xmm8 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + movdqa xmm8,XMMWORD[$L$rol8] + paddd xmm3,xmm7 + paddd xmm2,xmm6 + paddd xmm1,xmm5 + paddd xmm0,xmm4 + pxor xmm15,xmm3 + pxor xmm14,xmm2 + pxor xmm13,xmm1 + pxor xmm12,xmm0 +DB 102,69,15,56,0,248 +DB 102,69,15,56,0,240 +DB 102,69,15,56,0,232 +DB 102,69,15,56,0,224 + movdqa xmm8,XMMWORD[((160+80))+rbp] + paddd xmm11,xmm15 + paddd xmm10,xmm14 + paddd xmm9,xmm13 + paddd xmm8,xmm12 + pxor xmm7,xmm11 + pxor xmm6,xmm10 + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + pxor xmm5,xmm9 + pxor xmm4,xmm8 + movdqa XMMWORD[(160+80)+rbp],xmm8 + movdqa xmm8,xmm7 + psrld xmm8,25 + pslld xmm7,32-25 + pxor xmm7,xmm8 + movdqa xmm8,xmm6 + psrld xmm8,25 + pslld xmm6,32-25 + pxor xmm6,xmm8 + movdqa xmm8,xmm5 + psrld xmm8,25 + pslld xmm5,32-25 + pxor xmm5,xmm8 + movdqa xmm8,xmm4 + psrld xmm8,25 + pslld xmm4,32-25 + pxor xmm4,xmm8 + movdqa xmm8,XMMWORD[((160+80))+rbp] + imul r9,r12 + add r15,r10 + adc r9,rdx +DB 102,15,58,15,255,4 +DB 102,69,15,58,15,219,8 +DB 102,69,15,58,15,255,12 +DB 102,15,58,15,246,4 +DB 102,69,15,58,15,210,8 +DB 102,69,15,58,15,246,12 +DB 102,15,58,15,237,4 +DB 102,69,15,58,15,201,8 +DB 102,69,15,58,15,237,12 +DB 102,15,58,15,228,4 +DB 102,69,15,58,15,192,8 +DB 102,69,15,58,15,228,12 + movdqa XMMWORD[(160+80)+rbp],xmm8 + movdqa xmm8,XMMWORD[$L$rol16] + paddd xmm3,xmm7 + paddd xmm2,xmm6 + paddd xmm1,xmm5 + paddd xmm0,xmm4 + pxor xmm15,xmm3 + pxor xmm14,xmm2 + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + pxor xmm13,xmm1 + pxor xmm12,xmm0 +DB 102,69,15,56,0,248 +DB 
102,69,15,56,0,240 +DB 102,69,15,56,0,232 +DB 102,69,15,56,0,224 + movdqa xmm8,XMMWORD[((160+80))+rbp] + paddd xmm11,xmm15 + paddd xmm10,xmm14 + paddd xmm9,xmm13 + paddd xmm8,xmm12 + pxor xmm7,xmm11 + pxor xmm6,xmm10 + pxor xmm5,xmm9 + pxor xmm4,xmm8 + movdqa XMMWORD[(160+80)+rbp],xmm8 + movdqa xmm8,xmm7 + psrld xmm8,20 + pslld xmm7,32-20 + pxor xmm7,xmm8 + movdqa xmm8,xmm6 + psrld xmm8,20 + pslld xmm6,32-20 + pxor xmm6,xmm8 + movdqa xmm8,xmm5 + psrld xmm8,20 + pslld xmm5,32-20 + pxor xmm5,xmm8 + movdqa xmm8,xmm4 + psrld xmm8,20 + pslld xmm4,32-20 + pxor xmm4,xmm8 + movdqa xmm8,XMMWORD[$L$rol8] + paddd xmm3,xmm7 + paddd xmm2,xmm6 + paddd xmm1,xmm5 + paddd xmm0,xmm4 + pxor xmm15,xmm3 + pxor xmm14,xmm2 + pxor xmm13,xmm1 + pxor xmm12,xmm0 +DB 102,69,15,56,0,248 +DB 102,69,15,56,0,240 +DB 102,69,15,56,0,232 +DB 102,69,15,56,0,224 + movdqa xmm8,XMMWORD[((160+80))+rbp] + paddd xmm11,xmm15 + paddd xmm10,xmm14 + paddd xmm9,xmm13 + paddd xmm8,xmm12 + pxor xmm7,xmm11 + pxor xmm6,xmm10 + pxor xmm5,xmm9 + pxor xmm4,xmm8 + movdqa XMMWORD[(160+80)+rbp],xmm8 + movdqa xmm8,xmm7 + psrld xmm8,25 + pslld xmm7,32-25 + pxor xmm7,xmm8 + movdqa xmm8,xmm6 + psrld xmm8,25 + pslld xmm6,32-25 + pxor xmm6,xmm8 + movdqa xmm8,xmm5 + psrld xmm8,25 + pslld xmm5,32-25 + pxor xmm5,xmm8 + movdqa xmm8,xmm4 + psrld xmm8,25 + pslld xmm4,32-25 + pxor xmm4,xmm8 + movdqa xmm8,XMMWORD[((160+80))+rbp] +DB 102,15,58,15,255,12 +DB 102,69,15,58,15,219,8 +DB 102,69,15,58,15,255,4 +DB 102,15,58,15,246,12 +DB 102,69,15,58,15,210,8 +DB 102,69,15,58,15,246,4 +DB 102,15,58,15,237,12 +DB 102,69,15,58,15,201,8 +DB 102,69,15,58,15,237,4 +DB 102,15,58,15,228,12 +DB 102,69,15,58,15,192,8 +DB 102,69,15,58,15,228,4 + + dec rcx + jge NEAR $L$open_sse_main_loop_rounds + add r10,QWORD[((0+0))+r8] + adc r11,QWORD[((8+0))+r8] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea r8,[16+r8] + cmp rcx,-6 + jg NEAR $L$open_sse_main_loop_rounds + paddd xmm3,XMMWORD[$L$chacha20_consts] + paddd xmm7,XMMWORD[((160+48))+rbp] + paddd xmm11,XMMWORD[((160+64))+rbp] + paddd xmm15,XMMWORD[((160+144))+rbp] + paddd xmm2,XMMWORD[$L$chacha20_consts] + paddd xmm6,XMMWORD[((160+48))+rbp] + paddd xmm10,XMMWORD[((160+64))+rbp] + paddd xmm14,XMMWORD[((160+128))+rbp] + paddd xmm1,XMMWORD[$L$chacha20_consts] + paddd xmm5,XMMWORD[((160+48))+rbp] + paddd xmm9,XMMWORD[((160+64))+rbp] + paddd xmm13,XMMWORD[((160+112))+rbp] + paddd xmm0,XMMWORD[$L$chacha20_consts] + paddd xmm4,XMMWORD[((160+48))+rbp] + paddd xmm8,XMMWORD[((160+64))+rbp] + paddd xmm12,XMMWORD[((160+96))+rbp] + movdqa XMMWORD[(160+80)+rbp],xmm12 + movdqu xmm12,XMMWORD[((0 + 0))+rsi] + pxor xmm12,xmm3 + movdqu XMMWORD[(0 + 0)+rdi],xmm12 + movdqu xmm12,XMMWORD[((16 + 0))+rsi] + pxor xmm12,xmm7 + movdqu XMMWORD[(16 + 0)+rdi],xmm12 + movdqu xmm12,XMMWORD[((32 + 0))+rsi] + pxor xmm12,xmm11 + movdqu XMMWORD[(32 + 0)+rdi],xmm12 + movdqu xmm12,XMMWORD[((48 + 0))+rsi] + pxor xmm12,xmm15 + movdqu XMMWORD[(48 + 0)+rdi],xmm12 + movdqu xmm3,XMMWORD[((0 + 64))+rsi] + movdqu xmm7,XMMWORD[((16 + 64))+rsi] + 
movdqu xmm11,XMMWORD[((32 + 64))+rsi] + movdqu xmm15,XMMWORD[((48 + 64))+rsi] + pxor xmm2,xmm3 + pxor xmm6,xmm7 + pxor xmm10,xmm11 + pxor xmm15,xmm14 + movdqu XMMWORD[(0 + 64)+rdi],xmm2 + movdqu XMMWORD[(16 + 64)+rdi],xmm6 + movdqu XMMWORD[(32 + 64)+rdi],xmm10 + movdqu XMMWORD[(48 + 64)+rdi],xmm15 + movdqu xmm3,XMMWORD[((0 + 128))+rsi] + movdqu xmm7,XMMWORD[((16 + 128))+rsi] + movdqu xmm11,XMMWORD[((32 + 128))+rsi] + movdqu xmm15,XMMWORD[((48 + 128))+rsi] + pxor xmm1,xmm3 + pxor xmm5,xmm7 + pxor xmm9,xmm11 + pxor xmm15,xmm13 + movdqu XMMWORD[(0 + 128)+rdi],xmm1 + movdqu XMMWORD[(16 + 128)+rdi],xmm5 + movdqu XMMWORD[(32 + 128)+rdi],xmm9 + movdqu XMMWORD[(48 + 128)+rdi],xmm15 + movdqu xmm3,XMMWORD[((0 + 192))+rsi] + movdqu xmm7,XMMWORD[((16 + 192))+rsi] + movdqu xmm11,XMMWORD[((32 + 192))+rsi] + movdqu xmm15,XMMWORD[((48 + 192))+rsi] + pxor xmm0,xmm3 + pxor xmm4,xmm7 + pxor xmm8,xmm11 + pxor xmm15,XMMWORD[((160+80))+rbp] + movdqu XMMWORD[(0 + 192)+rdi],xmm0 + movdqu XMMWORD[(16 + 192)+rdi],xmm4 + movdqu XMMWORD[(32 + 192)+rdi],xmm8 + movdqu XMMWORD[(48 + 192)+rdi],xmm15 + + lea rsi,[256+rsi] + lea rdi,[256+rdi] + sub rbx,16*16 + jmp NEAR $L$open_sse_main_loop +$L$open_sse_tail: + + test rbx,rbx + jz NEAR $L$open_sse_finalize + cmp rbx,12*16 + ja NEAR $L$open_sse_tail_256 + cmp rbx,8*16 + ja NEAR $L$open_sse_tail_192 + cmp rbx,4*16 + ja NEAR $L$open_sse_tail_128 + movdqa xmm0,XMMWORD[$L$chacha20_consts] + movdqa xmm4,XMMWORD[((160+48))+rbp] + movdqa xmm8,XMMWORD[((160+64))+rbp] + movdqa xmm12,XMMWORD[((160+96))+rbp] + paddd xmm12,XMMWORD[$L$sse_inc] + movdqa XMMWORD[(160+96)+rbp],xmm12 + + xor r8,r8 + mov rcx,rbx + cmp rcx,16 + jb NEAR $L$open_sse_tail_64_rounds +$L$open_sse_tail_64_rounds_and_x1hash: + add r10,QWORD[((0+0))+r8*1+rsi] + adc r11,QWORD[((8+0))+r8*1+rsi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + sub rcx,16 +$L$open_sse_tail_64_rounds: + add r8,16 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol16] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,12 + psrld xmm4,20 + pxor xmm4,xmm3 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol8] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,7 + psrld xmm4,25 + pxor xmm4,xmm3 +DB 102,15,58,15,228,4 +DB 102,69,15,58,15,192,8 +DB 102,69,15,58,15,228,12 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol16] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,12 + psrld xmm4,20 + pxor xmm4,xmm3 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol8] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,7 + psrld xmm4,25 + pxor xmm4,xmm3 +DB 102,15,58,15,228,12 +DB 102,69,15,58,15,192,8 +DB 102,69,15,58,15,228,4 + + cmp rcx,16 + jae NEAR $L$open_sse_tail_64_rounds_and_x1hash + cmp r8,10*16 + jne NEAR $L$open_sse_tail_64_rounds + paddd xmm0,XMMWORD[$L$chacha20_consts] + paddd xmm4,XMMWORD[((160+48))+rbp] + paddd xmm8,XMMWORD[((160+64))+rbp] + paddd xmm12,XMMWORD[((160+96))+rbp] + + jmp NEAR 
$L$open_sse_tail_64_dec_loop + +$L$open_sse_tail_128: + movdqa xmm0,XMMWORD[$L$chacha20_consts] + movdqa xmm4,XMMWORD[((160+48))+rbp] + movdqa xmm8,XMMWORD[((160+64))+rbp] + movdqa xmm1,xmm0 + movdqa xmm5,xmm4 + movdqa xmm9,xmm8 + movdqa xmm13,XMMWORD[((160+96))+rbp] + paddd xmm13,XMMWORD[$L$sse_inc] + movdqa xmm12,xmm13 + paddd xmm12,XMMWORD[$L$sse_inc] + movdqa XMMWORD[(160+96)+rbp],xmm12 + movdqa XMMWORD[(160+112)+rbp],xmm13 + + mov rcx,rbx + and rcx,-16 + xor r8,r8 +$L$open_sse_tail_128_rounds_and_x1hash: + add r10,QWORD[((0+0))+r8*1+rsi] + adc r11,QWORD[((8+0))+r8*1+rsi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + +$L$open_sse_tail_128_rounds: + add r8,16 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol16] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,12 + psrld xmm4,20 + pxor xmm4,xmm3 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol8] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,7 + psrld xmm4,25 + pxor xmm4,xmm3 +DB 102,15,58,15,228,4 +DB 102,69,15,58,15,192,8 +DB 102,69,15,58,15,228,12 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol16] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm3,xmm5 + pslld xmm3,12 + psrld xmm5,20 + pxor xmm5,xmm3 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol8] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm3,xmm5 + pslld xmm3,7 + psrld xmm5,25 + pxor xmm5,xmm3 +DB 102,15,58,15,237,4 +DB 102,69,15,58,15,201,8 +DB 102,69,15,58,15,237,12 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol16] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,12 + psrld xmm4,20 + pxor xmm4,xmm3 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol8] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,7 + psrld xmm4,25 + pxor xmm4,xmm3 +DB 102,15,58,15,228,12 +DB 102,69,15,58,15,192,8 +DB 102,69,15,58,15,228,4 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol16] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm3,xmm5 + pslld xmm3,12 + psrld xmm5,20 + pxor xmm5,xmm3 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol8] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm3,xmm5 + pslld xmm3,7 + psrld xmm5,25 + pxor xmm5,xmm3 +DB 102,15,58,15,237,12 +DB 102,69,15,58,15,201,8 +DB 102,69,15,58,15,237,4 + + cmp r8,rcx + jb NEAR $L$open_sse_tail_128_rounds_and_x1hash + cmp r8,10*16 + jne NEAR $L$open_sse_tail_128_rounds + paddd xmm1,XMMWORD[$L$chacha20_consts] + paddd xmm5,XMMWORD[((160+48))+rbp] + paddd xmm9,XMMWORD[((160+64))+rbp] + paddd xmm13,XMMWORD[((160+112))+rbp] + paddd xmm0,XMMWORD[$L$chacha20_consts] + paddd xmm4,XMMWORD[((160+48))+rbp] + paddd xmm8,XMMWORD[((160+64))+rbp] + paddd xmm12,XMMWORD[((160+96))+rbp] + movdqu xmm3,XMMWORD[((0 + 0))+rsi] + movdqu xmm7,XMMWORD[((16 + 0))+rsi] + movdqu xmm11,XMMWORD[((32 + 0))+rsi] + movdqu xmm15,XMMWORD[((48 + 0))+rsi] + pxor xmm1,xmm3 + pxor xmm5,xmm7 + pxor xmm9,xmm11 + pxor xmm15,xmm13 + movdqu XMMWORD[(0 + 
0)+rdi],xmm1 + movdqu XMMWORD[(16 + 0)+rdi],xmm5 + movdqu XMMWORD[(32 + 0)+rdi],xmm9 + movdqu XMMWORD[(48 + 0)+rdi],xmm15 + + sub rbx,4*16 + lea rsi,[64+rsi] + lea rdi,[64+rdi] + jmp NEAR $L$open_sse_tail_64_dec_loop + +$L$open_sse_tail_192: + movdqa xmm0,XMMWORD[$L$chacha20_consts] + movdqa xmm4,XMMWORD[((160+48))+rbp] + movdqa xmm8,XMMWORD[((160+64))+rbp] + movdqa xmm1,xmm0 + movdqa xmm5,xmm4 + movdqa xmm9,xmm8 + movdqa xmm2,xmm0 + movdqa xmm6,xmm4 + movdqa xmm10,xmm8 + movdqa xmm14,XMMWORD[((160+96))+rbp] + paddd xmm14,XMMWORD[$L$sse_inc] + movdqa xmm13,xmm14 + paddd xmm13,XMMWORD[$L$sse_inc] + movdqa xmm12,xmm13 + paddd xmm12,XMMWORD[$L$sse_inc] + movdqa XMMWORD[(160+96)+rbp],xmm12 + movdqa XMMWORD[(160+112)+rbp],xmm13 + movdqa XMMWORD[(160+128)+rbp],xmm14 + + mov rcx,rbx + mov r8,10*16 + cmp rcx,10*16 + cmovg rcx,r8 + and rcx,-16 + xor r8,r8 +$L$open_sse_tail_192_rounds_and_x1hash: + add r10,QWORD[((0+0))+r8*1+rsi] + adc r11,QWORD[((8+0))+r8*1+rsi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + +$L$open_sse_tail_192_rounds: + add r8,16 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol16] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,12 + psrld xmm4,20 + pxor xmm4,xmm3 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol8] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,7 + psrld xmm4,25 + pxor xmm4,xmm3 +DB 102,15,58,15,228,4 +DB 102,69,15,58,15,192,8 +DB 102,69,15,58,15,228,12 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol16] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm3,xmm5 + pslld xmm3,12 + psrld xmm5,20 + pxor xmm5,xmm3 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol8] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm3,xmm5 + pslld xmm3,7 + psrld xmm5,25 + pxor xmm5,xmm3 +DB 102,15,58,15,237,4 +DB 102,69,15,58,15,201,8 +DB 102,69,15,58,15,237,12 + paddd xmm2,xmm6 + pxor xmm14,xmm2 + pshufb xmm14,XMMWORD[$L$rol16] + paddd xmm10,xmm14 + pxor xmm6,xmm10 + movdqa xmm3,xmm6 + pslld xmm3,12 + psrld xmm6,20 + pxor xmm6,xmm3 + paddd xmm2,xmm6 + pxor xmm14,xmm2 + pshufb xmm14,XMMWORD[$L$rol8] + paddd xmm10,xmm14 + pxor xmm6,xmm10 + movdqa xmm3,xmm6 + pslld xmm3,7 + psrld xmm6,25 + pxor xmm6,xmm3 +DB 102,15,58,15,246,4 +DB 102,69,15,58,15,210,8 +DB 102,69,15,58,15,246,12 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol16] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,12 + psrld xmm4,20 + pxor xmm4,xmm3 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol8] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,7 + psrld xmm4,25 + pxor xmm4,xmm3 +DB 102,15,58,15,228,12 +DB 102,69,15,58,15,192,8 +DB 102,69,15,58,15,228,4 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol16] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm3,xmm5 + pslld xmm3,12 + psrld xmm5,20 + pxor xmm5,xmm3 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol8] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + 
movdqa xmm3,xmm5 + pslld xmm3,7 + psrld xmm5,25 + pxor xmm5,xmm3 +DB 102,15,58,15,237,12 +DB 102,69,15,58,15,201,8 +DB 102,69,15,58,15,237,4 + paddd xmm2,xmm6 + pxor xmm14,xmm2 + pshufb xmm14,XMMWORD[$L$rol16] + paddd xmm10,xmm14 + pxor xmm6,xmm10 + movdqa xmm3,xmm6 + pslld xmm3,12 + psrld xmm6,20 + pxor xmm6,xmm3 + paddd xmm2,xmm6 + pxor xmm14,xmm2 + pshufb xmm14,XMMWORD[$L$rol8] + paddd xmm10,xmm14 + pxor xmm6,xmm10 + movdqa xmm3,xmm6 + pslld xmm3,7 + psrld xmm6,25 + pxor xmm6,xmm3 +DB 102,15,58,15,246,12 +DB 102,69,15,58,15,210,8 +DB 102,69,15,58,15,246,4 + + cmp r8,rcx + jb NEAR $L$open_sse_tail_192_rounds_and_x1hash + cmp r8,10*16 + jne NEAR $L$open_sse_tail_192_rounds + cmp rbx,11*16 + jb NEAR $L$open_sse_tail_192_finish + add r10,QWORD[((0+160))+rsi] + adc r11,QWORD[((8+160))+rsi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + cmp rbx,12*16 + jb NEAR $L$open_sse_tail_192_finish + add r10,QWORD[((0+176))+rsi] + adc r11,QWORD[((8+176))+rsi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + +$L$open_sse_tail_192_finish: + paddd xmm2,XMMWORD[$L$chacha20_consts] + paddd xmm6,XMMWORD[((160+48))+rbp] + paddd xmm10,XMMWORD[((160+64))+rbp] + paddd xmm14,XMMWORD[((160+128))+rbp] + paddd xmm1,XMMWORD[$L$chacha20_consts] + paddd xmm5,XMMWORD[((160+48))+rbp] + paddd xmm9,XMMWORD[((160+64))+rbp] + paddd xmm13,XMMWORD[((160+112))+rbp] + paddd xmm0,XMMWORD[$L$chacha20_consts] + paddd xmm4,XMMWORD[((160+48))+rbp] + paddd xmm8,XMMWORD[((160+64))+rbp] + paddd xmm12,XMMWORD[((160+96))+rbp] + movdqu xmm3,XMMWORD[((0 + 0))+rsi] + movdqu xmm7,XMMWORD[((16 + 0))+rsi] + movdqu xmm11,XMMWORD[((32 + 0))+rsi] + movdqu xmm15,XMMWORD[((48 + 0))+rsi] + pxor xmm2,xmm3 + pxor xmm6,xmm7 + pxor xmm10,xmm11 + pxor xmm15,xmm14 + movdqu XMMWORD[(0 + 0)+rdi],xmm2 + movdqu XMMWORD[(16 + 0)+rdi],xmm6 + movdqu XMMWORD[(32 + 0)+rdi],xmm10 + movdqu XMMWORD[(48 + 0)+rdi],xmm15 + movdqu xmm3,XMMWORD[((0 + 64))+rsi] + movdqu xmm7,XMMWORD[((16 + 64))+rsi] + movdqu xmm11,XMMWORD[((32 + 64))+rsi] + movdqu xmm15,XMMWORD[((48 + 64))+rsi] + pxor xmm1,xmm3 + pxor xmm5,xmm7 + pxor xmm9,xmm11 + pxor xmm15,xmm13 + movdqu XMMWORD[(0 + 64)+rdi],xmm1 + movdqu XMMWORD[(16 + 64)+rdi],xmm5 + movdqu XMMWORD[(32 + 64)+rdi],xmm9 + movdqu XMMWORD[(48 + 64)+rdi],xmm15 + + sub rbx,8*16 + lea rsi,[128+rsi] + lea rdi,[128+rdi] + jmp NEAR $L$open_sse_tail_64_dec_loop + +$L$open_sse_tail_256: + movdqa xmm0,XMMWORD[$L$chacha20_consts] + movdqa xmm4,XMMWORD[((160+48))+rbp] + movdqa xmm8,XMMWORD[((160+64))+rbp] + 
movdqa xmm1,xmm0 + movdqa xmm5,xmm4 + movdqa xmm9,xmm8 + movdqa xmm2,xmm0 + movdqa xmm6,xmm4 + movdqa xmm10,xmm8 + movdqa xmm3,xmm0 + movdqa xmm7,xmm4 + movdqa xmm11,xmm8 + movdqa xmm15,XMMWORD[((160+96))+rbp] + paddd xmm15,XMMWORD[$L$sse_inc] + movdqa xmm14,xmm15 + paddd xmm14,XMMWORD[$L$sse_inc] + movdqa xmm13,xmm14 + paddd xmm13,XMMWORD[$L$sse_inc] + movdqa xmm12,xmm13 + paddd xmm12,XMMWORD[$L$sse_inc] + movdqa XMMWORD[(160+96)+rbp],xmm12 + movdqa XMMWORD[(160+112)+rbp],xmm13 + movdqa XMMWORD[(160+128)+rbp],xmm14 + movdqa XMMWORD[(160+144)+rbp],xmm15 + + xor r8,r8 +$L$open_sse_tail_256_rounds_and_x1hash: + add r10,QWORD[((0+0))+r8*1+rsi] + adc r11,QWORD[((8+0))+r8*1+rsi] + adc r12,1 + movdqa XMMWORD[(160+80)+rbp],xmm11 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol16] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm11,xmm4 + pslld xmm11,12 + psrld xmm4,20 + pxor xmm4,xmm11 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol8] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm11,xmm4 + pslld xmm11,7 + psrld xmm4,25 + pxor xmm4,xmm11 +DB 102,15,58,15,228,4 +DB 102,69,15,58,15,192,8 +DB 102,69,15,58,15,228,12 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol16] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm11,xmm5 + pslld xmm11,12 + psrld xmm5,20 + pxor xmm5,xmm11 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol8] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm11,xmm5 + pslld xmm11,7 + psrld xmm5,25 + pxor xmm5,xmm11 +DB 102,15,58,15,237,4 +DB 102,69,15,58,15,201,8 +DB 102,69,15,58,15,237,12 + paddd xmm2,xmm6 + pxor xmm14,xmm2 + pshufb xmm14,XMMWORD[$L$rol16] + paddd xmm10,xmm14 + pxor xmm6,xmm10 + movdqa xmm11,xmm6 + pslld xmm11,12 + psrld xmm6,20 + pxor xmm6,xmm11 + paddd xmm2,xmm6 + pxor xmm14,xmm2 + pshufb xmm14,XMMWORD[$L$rol8] + paddd xmm10,xmm14 + pxor xmm6,xmm10 + movdqa xmm11,xmm6 + pslld xmm11,7 + psrld xmm6,25 + pxor xmm6,xmm11 +DB 102,15,58,15,246,4 +DB 102,69,15,58,15,210,8 +DB 102,69,15,58,15,246,12 + movdqa xmm11,XMMWORD[((160+80))+rbp] + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + movdqa XMMWORD[(160+80)+rbp],xmm9 + paddd xmm3,xmm7 + pxor xmm15,xmm3 + pshufb xmm15,XMMWORD[$L$rol16] + paddd xmm11,xmm15 + pxor xmm7,xmm11 + movdqa xmm9,xmm7 + pslld xmm9,12 + psrld xmm7,20 + pxor xmm7,xmm9 + paddd xmm3,xmm7 + pxor xmm15,xmm3 + pshufb xmm15,XMMWORD[$L$rol8] + paddd xmm11,xmm15 + pxor xmm7,xmm11 + movdqa xmm9,xmm7 + pslld xmm9,7 + psrld xmm7,25 + pxor xmm7,xmm9 +DB 102,15,58,15,255,4 +DB 102,69,15,58,15,219,8 +DB 102,69,15,58,15,255,12 + movdqa xmm9,XMMWORD[((160+80))+rbp] + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + movdqa XMMWORD[(160+80)+rbp],xmm11 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol16] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm11,xmm4 + pslld xmm11,12 + psrld xmm4,20 + pxor xmm4,xmm11 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol8] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm11,xmm4 + pslld xmm11,7 + psrld xmm4,25 + pxor xmm4,xmm11 +DB 102,15,58,15,228,12 +DB 102,69,15,58,15,192,8 +DB 102,69,15,58,15,228,4 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol16] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm11,xmm5 + pslld xmm11,12 + psrld xmm5,20 + pxor xmm5,xmm11 + paddd xmm1,xmm5 + pxor 
xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol8] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm11,xmm5 + pslld xmm11,7 + psrld xmm5,25 + pxor xmm5,xmm11 +DB 102,15,58,15,237,12 +DB 102,69,15,58,15,201,8 +DB 102,69,15,58,15,237,4 + imul r9,r12 + add r15,r10 + adc r9,rdx + paddd xmm2,xmm6 + pxor xmm14,xmm2 + pshufb xmm14,XMMWORD[$L$rol16] + paddd xmm10,xmm14 + pxor xmm6,xmm10 + movdqa xmm11,xmm6 + pslld xmm11,12 + psrld xmm6,20 + pxor xmm6,xmm11 + paddd xmm2,xmm6 + pxor xmm14,xmm2 + pshufb xmm14,XMMWORD[$L$rol8] + paddd xmm10,xmm14 + pxor xmm6,xmm10 + movdqa xmm11,xmm6 + pslld xmm11,7 + psrld xmm6,25 + pxor xmm6,xmm11 +DB 102,15,58,15,246,12 +DB 102,69,15,58,15,210,8 +DB 102,69,15,58,15,246,4 + movdqa xmm11,XMMWORD[((160+80))+rbp] + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + movdqa XMMWORD[(160+80)+rbp],xmm9 + paddd xmm3,xmm7 + pxor xmm15,xmm3 + pshufb xmm15,XMMWORD[$L$rol16] + paddd xmm11,xmm15 + pxor xmm7,xmm11 + movdqa xmm9,xmm7 + pslld xmm9,12 + psrld xmm7,20 + pxor xmm7,xmm9 + paddd xmm3,xmm7 + pxor xmm15,xmm3 + pshufb xmm15,XMMWORD[$L$rol8] + paddd xmm11,xmm15 + pxor xmm7,xmm11 + movdqa xmm9,xmm7 + pslld xmm9,7 + psrld xmm7,25 + pxor xmm7,xmm9 +DB 102,15,58,15,255,12 +DB 102,69,15,58,15,219,8 +DB 102,69,15,58,15,255,4 + movdqa xmm9,XMMWORD[((160+80))+rbp] + + add r8,16 + cmp r8,10*16 + jb NEAR $L$open_sse_tail_256_rounds_and_x1hash + + mov rcx,rbx + and rcx,-16 +$L$open_sse_tail_256_hash: + add r10,QWORD[((0+0))+r8*1+rsi] + adc r11,QWORD[((8+0))+r8*1+rsi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + add r8,16 + cmp r8,rcx + jb NEAR $L$open_sse_tail_256_hash + paddd xmm3,XMMWORD[$L$chacha20_consts] + paddd xmm7,XMMWORD[((160+48))+rbp] + paddd xmm11,XMMWORD[((160+64))+rbp] + paddd xmm15,XMMWORD[((160+144))+rbp] + paddd xmm2,XMMWORD[$L$chacha20_consts] + paddd xmm6,XMMWORD[((160+48))+rbp] + paddd xmm10,XMMWORD[((160+64))+rbp] + paddd xmm14,XMMWORD[((160+128))+rbp] + paddd xmm1,XMMWORD[$L$chacha20_consts] + paddd xmm5,XMMWORD[((160+48))+rbp] + paddd xmm9,XMMWORD[((160+64))+rbp] + paddd xmm13,XMMWORD[((160+112))+rbp] + paddd xmm0,XMMWORD[$L$chacha20_consts] + paddd xmm4,XMMWORD[((160+48))+rbp] + paddd xmm8,XMMWORD[((160+64))+rbp] + paddd xmm12,XMMWORD[((160+96))+rbp] + movdqa XMMWORD[(160+80)+rbp],xmm12 + movdqu xmm12,XMMWORD[((0 + 0))+rsi] + pxor xmm12,xmm3 + movdqu XMMWORD[(0 + 0)+rdi],xmm12 + movdqu xmm12,XMMWORD[((16 + 0))+rsi] + pxor xmm12,xmm7 + movdqu XMMWORD[(16 + 0)+rdi],xmm12 + movdqu xmm12,XMMWORD[((32 + 0))+rsi] + pxor xmm12,xmm11 + movdqu XMMWORD[(32 + 0)+rdi],xmm12 + movdqu xmm12,XMMWORD[((48 + 0))+rsi] + pxor xmm12,xmm15 + movdqu XMMWORD[(48 + 0)+rdi],xmm12 + movdqu xmm3,XMMWORD[((0 + 64))+rsi] + movdqu xmm7,XMMWORD[((16 + 64))+rsi] + movdqu xmm11,XMMWORD[((32 + 64))+rsi] + movdqu xmm15,XMMWORD[((48 + 64))+rsi] + pxor xmm2,xmm3 + pxor xmm6,xmm7 + pxor xmm10,xmm11 + pxor xmm15,xmm14 + movdqu XMMWORD[(0 + 
64)+rdi],xmm2 + movdqu XMMWORD[(16 + 64)+rdi],xmm6 + movdqu XMMWORD[(32 + 64)+rdi],xmm10 + movdqu XMMWORD[(48 + 64)+rdi],xmm15 + movdqu xmm3,XMMWORD[((0 + 128))+rsi] + movdqu xmm7,XMMWORD[((16 + 128))+rsi] + movdqu xmm11,XMMWORD[((32 + 128))+rsi] + movdqu xmm15,XMMWORD[((48 + 128))+rsi] + pxor xmm1,xmm3 + pxor xmm5,xmm7 + pxor xmm9,xmm11 + pxor xmm15,xmm13 + movdqu XMMWORD[(0 + 128)+rdi],xmm1 + movdqu XMMWORD[(16 + 128)+rdi],xmm5 + movdqu XMMWORD[(32 + 128)+rdi],xmm9 + movdqu XMMWORD[(48 + 128)+rdi],xmm15 + + movdqa xmm12,XMMWORD[((160+80))+rbp] + sub rbx,12*16 + lea rsi,[192+rsi] + lea rdi,[192+rdi] + + +$L$open_sse_tail_64_dec_loop: + cmp rbx,16 + jb NEAR $L$open_sse_tail_16_init + sub rbx,16 + movdqu xmm3,XMMWORD[rsi] + pxor xmm0,xmm3 + movdqu XMMWORD[rdi],xmm0 + lea rsi,[16+rsi] + lea rdi,[16+rdi] + movdqa xmm0,xmm4 + movdqa xmm4,xmm8 + movdqa xmm8,xmm12 + jmp NEAR $L$open_sse_tail_64_dec_loop +$L$open_sse_tail_16_init: + movdqa xmm1,xmm0 + + +$L$open_sse_tail_16: + test rbx,rbx + jz NEAR $L$open_sse_finalize + + + + pxor xmm3,xmm3 + lea rsi,[((-1))+rbx*1+rsi] + mov r8,rbx +$L$open_sse_tail_16_compose: + pslldq xmm3,1 + pinsrb xmm3,BYTE[rsi],0 + sub rsi,1 + sub r8,1 + jnz NEAR $L$open_sse_tail_16_compose + +DB 102,73,15,126,221 + pextrq r14,xmm3,1 + + pxor xmm3,xmm1 + + +$L$open_sse_tail_16_extract: + pextrb XMMWORD[rdi],xmm3,0 + psrldq xmm3,1 + add rdi,1 + sub rbx,1 + jne NEAR $L$open_sse_tail_16_extract + + add r10,r13 + adc r11,r14 + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + +$L$open_sse_finalize: + add r10,QWORD[((0+160+32))+rbp] + adc r11,QWORD[((8+160+32))+rbp] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + + mov r13,r10 + mov r14,r11 + mov r15,r12 + sub r10,-5 + sbb r11,-1 + sbb r12,3 + cmovc r10,r13 + cmovc r11,r14 + cmovc r12,r15 + + add r10,QWORD[((0+160+16))+rbp] + adc r11,QWORD[((8+160+16))+rbp] + + movaps xmm6,XMMWORD[((0+0))+rbp] + movaps xmm7,XMMWORD[((16+0))+rbp] + movaps xmm8,XMMWORD[((32+0))+rbp] + movaps xmm9,XMMWORD[((48+0))+rbp] + movaps xmm10,XMMWORD[((64+0))+rbp] + movaps xmm11,XMMWORD[((80+0))+rbp] + movaps xmm12,XMMWORD[((96+0))+rbp] + movaps xmm13,XMMWORD[((112+0))+rbp] + movaps xmm14,XMMWORD[((128+0))+rbp] + movaps xmm15,XMMWORD[((144+0))+rbp] + + + add rsp,288 + 160 + 32 + + + pop r9 + + mov QWORD[r9],r10 + mov QWORD[8+r9],r11 + pop r15 + + pop r14 + + pop r13 + + pop r12 + + pop rbx + + pop rbp + + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$open_sse_128: + + movdqu 
xmm0,XMMWORD[$L$chacha20_consts] + movdqa xmm1,xmm0 + movdqa xmm2,xmm0 + movdqu xmm4,XMMWORD[r9] + movdqa xmm5,xmm4 + movdqa xmm6,xmm4 + movdqu xmm8,XMMWORD[16+r9] + movdqa xmm9,xmm8 + movdqa xmm10,xmm8 + movdqu xmm12,XMMWORD[32+r9] + movdqa xmm13,xmm12 + paddd xmm13,XMMWORD[$L$sse_inc] + movdqa xmm14,xmm13 + paddd xmm14,XMMWORD[$L$sse_inc] + movdqa xmm7,xmm4 + movdqa xmm11,xmm8 + movdqa xmm15,xmm13 + mov r10,10 + +$L$open_sse_128_rounds: + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol16] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,12 + psrld xmm4,20 + pxor xmm4,xmm3 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol8] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,7 + psrld xmm4,25 + pxor xmm4,xmm3 +DB 102,15,58,15,228,4 +DB 102,69,15,58,15,192,8 +DB 102,69,15,58,15,228,12 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol16] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm3,xmm5 + pslld xmm3,12 + psrld xmm5,20 + pxor xmm5,xmm3 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol8] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm3,xmm5 + pslld xmm3,7 + psrld xmm5,25 + pxor xmm5,xmm3 +DB 102,15,58,15,237,4 +DB 102,69,15,58,15,201,8 +DB 102,69,15,58,15,237,12 + paddd xmm2,xmm6 + pxor xmm14,xmm2 + pshufb xmm14,XMMWORD[$L$rol16] + paddd xmm10,xmm14 + pxor xmm6,xmm10 + movdqa xmm3,xmm6 + pslld xmm3,12 + psrld xmm6,20 + pxor xmm6,xmm3 + paddd xmm2,xmm6 + pxor xmm14,xmm2 + pshufb xmm14,XMMWORD[$L$rol8] + paddd xmm10,xmm14 + pxor xmm6,xmm10 + movdqa xmm3,xmm6 + pslld xmm3,7 + psrld xmm6,25 + pxor xmm6,xmm3 +DB 102,15,58,15,246,4 +DB 102,69,15,58,15,210,8 +DB 102,69,15,58,15,246,12 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol16] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,12 + psrld xmm4,20 + pxor xmm4,xmm3 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol8] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,7 + psrld xmm4,25 + pxor xmm4,xmm3 +DB 102,15,58,15,228,12 +DB 102,69,15,58,15,192,8 +DB 102,69,15,58,15,228,4 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol16] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm3,xmm5 + pslld xmm3,12 + psrld xmm5,20 + pxor xmm5,xmm3 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol8] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm3,xmm5 + pslld xmm3,7 + psrld xmm5,25 + pxor xmm5,xmm3 +DB 102,15,58,15,237,12 +DB 102,69,15,58,15,201,8 +DB 102,69,15,58,15,237,4 + paddd xmm2,xmm6 + pxor xmm14,xmm2 + pshufb xmm14,XMMWORD[$L$rol16] + paddd xmm10,xmm14 + pxor xmm6,xmm10 + movdqa xmm3,xmm6 + pslld xmm3,12 + psrld xmm6,20 + pxor xmm6,xmm3 + paddd xmm2,xmm6 + pxor xmm14,xmm2 + pshufb xmm14,XMMWORD[$L$rol8] + paddd xmm10,xmm14 + pxor xmm6,xmm10 + movdqa xmm3,xmm6 + pslld xmm3,7 + psrld xmm6,25 + pxor xmm6,xmm3 +DB 102,15,58,15,246,12 +DB 102,69,15,58,15,210,8 +DB 102,69,15,58,15,246,4 + + dec r10 + jnz NEAR $L$open_sse_128_rounds + paddd xmm0,XMMWORD[$L$chacha20_consts] + paddd xmm1,XMMWORD[$L$chacha20_consts] + paddd xmm2,XMMWORD[$L$chacha20_consts] + paddd xmm4,xmm7 + paddd xmm5,xmm7 + paddd xmm6,xmm7 + paddd xmm9,xmm11 + paddd xmm10,xmm11 + paddd xmm13,xmm15 + paddd xmm15,XMMWORD[$L$sse_inc] + paddd xmm14,xmm15 + + pand xmm0,XMMWORD[$L$clamp] + movdqa XMMWORD[(160+0)+rbp],xmm0 + movdqa XMMWORD[(160+16)+rbp],xmm4 + + mov r8,r8 + call poly_hash_ad_internal +$L$open_sse_128_xor_hash: + cmp rbx,16 + jb NEAR $L$open_sse_tail_16 + sub rbx,16 + add 
r10,QWORD[((0+0))+rsi] + adc r11,QWORD[((8+0))+rsi] + adc r12,1 + + + movdqu xmm3,XMMWORD[rsi] + pxor xmm1,xmm3 + movdqu XMMWORD[rdi],xmm1 + lea rsi,[16+rsi] + lea rdi,[16+rdi] + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + + movdqa xmm1,xmm5 + movdqa xmm5,xmm9 + movdqa xmm9,xmm13 + movdqa xmm13,xmm2 + movdqa xmm2,xmm6 + movdqa xmm6,xmm10 + movdqa xmm10,xmm14 + jmp NEAR $L$open_sse_128_xor_hash +$L$SEH_end_chacha20_poly1305_open_sse41: + + + + + + + + +global chacha20_poly1305_seal_sse41 + +ALIGN 64 +chacha20_poly1305_seal_sse41: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_chacha20_poly1305_seal_sse41: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + mov r9,QWORD[48+rsp] + + + +_CET_ENDBR + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + + + push r9 + + sub rsp,288 + 160 + 32 + + lea rbp,[32+rsp] + and rbp,-32 + + movaps XMMWORD[(0+0)+rbp],xmm6 + movaps XMMWORD[(16+0)+rbp],xmm7 + movaps XMMWORD[(32+0)+rbp],xmm8 + movaps XMMWORD[(48+0)+rbp],xmm9 + movaps XMMWORD[(64+0)+rbp],xmm10 + movaps XMMWORD[(80+0)+rbp],xmm11 + movaps XMMWORD[(96+0)+rbp],xmm12 + movaps XMMWORD[(112+0)+rbp],xmm13 + movaps XMMWORD[(128+0)+rbp],xmm14 + movaps XMMWORD[(144+0)+rbp],xmm15 + + mov rbx,QWORD[56+r9] + add rbx,rdx + mov QWORD[((0+160+32))+rbp],r8 + mov QWORD[((8+160+32))+rbp],rbx + mov rbx,rdx + + cmp rbx,128 + jbe NEAR $L$seal_sse_128 + + movdqa xmm0,XMMWORD[$L$chacha20_consts] + movdqu xmm4,XMMWORD[r9] + movdqu xmm8,XMMWORD[16+r9] + movdqu xmm12,XMMWORD[32+r9] + + movdqa xmm1,xmm0 + movdqa xmm2,xmm0 + movdqa xmm3,xmm0 + movdqa xmm5,xmm4 + movdqa xmm6,xmm4 + movdqa xmm7,xmm4 + movdqa xmm9,xmm8 + movdqa xmm10,xmm8 + movdqa xmm11,xmm8 + movdqa xmm15,xmm12 + paddd xmm12,XMMWORD[$L$sse_inc] + movdqa xmm14,xmm12 + paddd xmm12,XMMWORD[$L$sse_inc] + movdqa xmm13,xmm12 + paddd xmm12,XMMWORD[$L$sse_inc] + + movdqa XMMWORD[(160+48)+rbp],xmm4 + movdqa XMMWORD[(160+64)+rbp],xmm8 + movdqa XMMWORD[(160+96)+rbp],xmm12 + movdqa XMMWORD[(160+112)+rbp],xmm13 + movdqa XMMWORD[(160+128)+rbp],xmm14 + movdqa XMMWORD[(160+144)+rbp],xmm15 + mov r10,10 +$L$seal_sse_init_rounds: + movdqa XMMWORD[(160+80)+rbp],xmm8 + movdqa xmm8,XMMWORD[$L$rol16] + paddd xmm3,xmm7 + paddd xmm2,xmm6 + paddd xmm1,xmm5 + paddd xmm0,xmm4 + pxor xmm15,xmm3 + pxor xmm14,xmm2 + pxor xmm13,xmm1 + pxor xmm12,xmm0 +DB 102,69,15,56,0,248 +DB 102,69,15,56,0,240 +DB 102,69,15,56,0,232 +DB 102,69,15,56,0,224 + movdqa xmm8,XMMWORD[((160+80))+rbp] + paddd xmm11,xmm15 + paddd xmm10,xmm14 + paddd xmm9,xmm13 + paddd xmm8,xmm12 + pxor xmm7,xmm11 + pxor xmm6,xmm10 + pxor xmm5,xmm9 + pxor xmm4,xmm8 + movdqa XMMWORD[(160+80)+rbp],xmm8 + movdqa xmm8,xmm7 + psrld xmm8,20 + pslld xmm7,32-20 + pxor xmm7,xmm8 + movdqa xmm8,xmm6 + psrld xmm8,20 + pslld xmm6,32-20 + pxor xmm6,xmm8 + movdqa xmm8,xmm5 + psrld xmm8,20 + pslld xmm5,32-20 + pxor xmm5,xmm8 + movdqa xmm8,xmm4 + psrld xmm8,20 + pslld xmm4,32-20 + pxor xmm4,xmm8 + movdqa xmm8,XMMWORD[$L$rol8] + paddd xmm3,xmm7 
+ paddd xmm2,xmm6 + paddd xmm1,xmm5 + paddd xmm0,xmm4 + pxor xmm15,xmm3 + pxor xmm14,xmm2 + pxor xmm13,xmm1 + pxor xmm12,xmm0 +DB 102,69,15,56,0,248 +DB 102,69,15,56,0,240 +DB 102,69,15,56,0,232 +DB 102,69,15,56,0,224 + movdqa xmm8,XMMWORD[((160+80))+rbp] + paddd xmm11,xmm15 + paddd xmm10,xmm14 + paddd xmm9,xmm13 + paddd xmm8,xmm12 + pxor xmm7,xmm11 + pxor xmm6,xmm10 + pxor xmm5,xmm9 + pxor xmm4,xmm8 + movdqa XMMWORD[(160+80)+rbp],xmm8 + movdqa xmm8,xmm7 + psrld xmm8,25 + pslld xmm7,32-25 + pxor xmm7,xmm8 + movdqa xmm8,xmm6 + psrld xmm8,25 + pslld xmm6,32-25 + pxor xmm6,xmm8 + movdqa xmm8,xmm5 + psrld xmm8,25 + pslld xmm5,32-25 + pxor xmm5,xmm8 + movdqa xmm8,xmm4 + psrld xmm8,25 + pslld xmm4,32-25 + pxor xmm4,xmm8 + movdqa xmm8,XMMWORD[((160+80))+rbp] +DB 102,15,58,15,255,4 +DB 102,69,15,58,15,219,8 +DB 102,69,15,58,15,255,12 +DB 102,15,58,15,246,4 +DB 102,69,15,58,15,210,8 +DB 102,69,15,58,15,246,12 +DB 102,15,58,15,237,4 +DB 102,69,15,58,15,201,8 +DB 102,69,15,58,15,237,12 +DB 102,15,58,15,228,4 +DB 102,69,15,58,15,192,8 +DB 102,69,15,58,15,228,12 + movdqa XMMWORD[(160+80)+rbp],xmm8 + movdqa xmm8,XMMWORD[$L$rol16] + paddd xmm3,xmm7 + paddd xmm2,xmm6 + paddd xmm1,xmm5 + paddd xmm0,xmm4 + pxor xmm15,xmm3 + pxor xmm14,xmm2 + pxor xmm13,xmm1 + pxor xmm12,xmm0 +DB 102,69,15,56,0,248 +DB 102,69,15,56,0,240 +DB 102,69,15,56,0,232 +DB 102,69,15,56,0,224 + movdqa xmm8,XMMWORD[((160+80))+rbp] + paddd xmm11,xmm15 + paddd xmm10,xmm14 + paddd xmm9,xmm13 + paddd xmm8,xmm12 + pxor xmm7,xmm11 + pxor xmm6,xmm10 + pxor xmm5,xmm9 + pxor xmm4,xmm8 + movdqa XMMWORD[(160+80)+rbp],xmm8 + movdqa xmm8,xmm7 + psrld xmm8,20 + pslld xmm7,32-20 + pxor xmm7,xmm8 + movdqa xmm8,xmm6 + psrld xmm8,20 + pslld xmm6,32-20 + pxor xmm6,xmm8 + movdqa xmm8,xmm5 + psrld xmm8,20 + pslld xmm5,32-20 + pxor xmm5,xmm8 + movdqa xmm8,xmm4 + psrld xmm8,20 + pslld xmm4,32-20 + pxor xmm4,xmm8 + movdqa xmm8,XMMWORD[$L$rol8] + paddd xmm3,xmm7 + paddd xmm2,xmm6 + paddd xmm1,xmm5 + paddd xmm0,xmm4 + pxor xmm15,xmm3 + pxor xmm14,xmm2 + pxor xmm13,xmm1 + pxor xmm12,xmm0 +DB 102,69,15,56,0,248 +DB 102,69,15,56,0,240 +DB 102,69,15,56,0,232 +DB 102,69,15,56,0,224 + movdqa xmm8,XMMWORD[((160+80))+rbp] + paddd xmm11,xmm15 + paddd xmm10,xmm14 + paddd xmm9,xmm13 + paddd xmm8,xmm12 + pxor xmm7,xmm11 + pxor xmm6,xmm10 + pxor xmm5,xmm9 + pxor xmm4,xmm8 + movdqa XMMWORD[(160+80)+rbp],xmm8 + movdqa xmm8,xmm7 + psrld xmm8,25 + pslld xmm7,32-25 + pxor xmm7,xmm8 + movdqa xmm8,xmm6 + psrld xmm8,25 + pslld xmm6,32-25 + pxor xmm6,xmm8 + movdqa xmm8,xmm5 + psrld xmm8,25 + pslld xmm5,32-25 + pxor xmm5,xmm8 + movdqa xmm8,xmm4 + psrld xmm8,25 + pslld xmm4,32-25 + pxor xmm4,xmm8 + movdqa xmm8,XMMWORD[((160+80))+rbp] +DB 102,15,58,15,255,12 +DB 102,69,15,58,15,219,8 +DB 102,69,15,58,15,255,4 +DB 102,15,58,15,246,12 +DB 102,69,15,58,15,210,8 +DB 102,69,15,58,15,246,4 +DB 102,15,58,15,237,12 +DB 102,69,15,58,15,201,8 +DB 102,69,15,58,15,237,4 +DB 102,15,58,15,228,12 +DB 102,69,15,58,15,192,8 +DB 102,69,15,58,15,228,4 + + dec r10 + jnz NEAR $L$seal_sse_init_rounds + paddd xmm3,XMMWORD[$L$chacha20_consts] + paddd xmm7,XMMWORD[((160+48))+rbp] + paddd xmm11,XMMWORD[((160+64))+rbp] + paddd xmm15,XMMWORD[((160+144))+rbp] + paddd xmm2,XMMWORD[$L$chacha20_consts] + paddd xmm6,XMMWORD[((160+48))+rbp] + paddd xmm10,XMMWORD[((160+64))+rbp] + paddd xmm14,XMMWORD[((160+128))+rbp] + paddd xmm1,XMMWORD[$L$chacha20_consts] + paddd xmm5,XMMWORD[((160+48))+rbp] + paddd xmm9,XMMWORD[((160+64))+rbp] + paddd xmm13,XMMWORD[((160+112))+rbp] + paddd xmm0,XMMWORD[$L$chacha20_consts] + paddd 
xmm4,XMMWORD[((160+48))+rbp] + paddd xmm8,XMMWORD[((160+64))+rbp] + paddd xmm12,XMMWORD[((160+96))+rbp] + + + pand xmm3,XMMWORD[$L$clamp] + movdqa XMMWORD[(160+0)+rbp],xmm3 + movdqa XMMWORD[(160+16)+rbp],xmm7 + + mov r8,r8 + call poly_hash_ad_internal + movdqu xmm3,XMMWORD[((0 + 0))+rsi] + movdqu xmm7,XMMWORD[((16 + 0))+rsi] + movdqu xmm11,XMMWORD[((32 + 0))+rsi] + movdqu xmm15,XMMWORD[((48 + 0))+rsi] + pxor xmm2,xmm3 + pxor xmm6,xmm7 + pxor xmm10,xmm11 + pxor xmm15,xmm14 + movdqu XMMWORD[(0 + 0)+rdi],xmm2 + movdqu XMMWORD[(16 + 0)+rdi],xmm6 + movdqu XMMWORD[(32 + 0)+rdi],xmm10 + movdqu XMMWORD[(48 + 0)+rdi],xmm15 + movdqu xmm3,XMMWORD[((0 + 64))+rsi] + movdqu xmm7,XMMWORD[((16 + 64))+rsi] + movdqu xmm11,XMMWORD[((32 + 64))+rsi] + movdqu xmm15,XMMWORD[((48 + 64))+rsi] + pxor xmm1,xmm3 + pxor xmm5,xmm7 + pxor xmm9,xmm11 + pxor xmm15,xmm13 + movdqu XMMWORD[(0 + 64)+rdi],xmm1 + movdqu XMMWORD[(16 + 64)+rdi],xmm5 + movdqu XMMWORD[(32 + 64)+rdi],xmm9 + movdqu XMMWORD[(48 + 64)+rdi],xmm15 + + cmp rbx,12*16 + ja NEAR $L$seal_sse_main_init + mov rcx,8*16 + sub rbx,8*16 + lea rsi,[128+rsi] + jmp NEAR $L$seal_sse_128_tail_hash +$L$seal_sse_main_init: + movdqu xmm3,XMMWORD[((0 + 128))+rsi] + movdqu xmm7,XMMWORD[((16 + 128))+rsi] + movdqu xmm11,XMMWORD[((32 + 128))+rsi] + movdqu xmm15,XMMWORD[((48 + 128))+rsi] + pxor xmm0,xmm3 + pxor xmm4,xmm7 + pxor xmm8,xmm11 + pxor xmm15,xmm12 + movdqu XMMWORD[(0 + 128)+rdi],xmm0 + movdqu XMMWORD[(16 + 128)+rdi],xmm4 + movdqu XMMWORD[(32 + 128)+rdi],xmm8 + movdqu XMMWORD[(48 + 128)+rdi],xmm15 + + mov rcx,12*16 + sub rbx,12*16 + lea rsi,[192+rsi] + mov rcx,2 + mov r8,8 + cmp rbx,4*16 + jbe NEAR $L$seal_sse_tail_64 + cmp rbx,8*16 + jbe NEAR $L$seal_sse_tail_128 + cmp rbx,12*16 + jbe NEAR $L$seal_sse_tail_192 + +$L$seal_sse_main_loop: + movdqa xmm0,XMMWORD[$L$chacha20_consts] + movdqa xmm4,XMMWORD[((160+48))+rbp] + movdqa xmm8,XMMWORD[((160+64))+rbp] + movdqa xmm1,xmm0 + movdqa xmm5,xmm4 + movdqa xmm9,xmm8 + movdqa xmm2,xmm0 + movdqa xmm6,xmm4 + movdqa xmm10,xmm8 + movdqa xmm3,xmm0 + movdqa xmm7,xmm4 + movdqa xmm11,xmm8 + movdqa xmm15,XMMWORD[((160+96))+rbp] + paddd xmm15,XMMWORD[$L$sse_inc] + movdqa xmm14,xmm15 + paddd xmm14,XMMWORD[$L$sse_inc] + movdqa xmm13,xmm14 + paddd xmm13,XMMWORD[$L$sse_inc] + movdqa xmm12,xmm13 + paddd xmm12,XMMWORD[$L$sse_inc] + movdqa XMMWORD[(160+96)+rbp],xmm12 + movdqa XMMWORD[(160+112)+rbp],xmm13 + movdqa XMMWORD[(160+128)+rbp],xmm14 + movdqa XMMWORD[(160+144)+rbp],xmm15 + +ALIGN 32 +$L$seal_sse_main_rounds: + movdqa XMMWORD[(160+80)+rbp],xmm8 + movdqa xmm8,XMMWORD[$L$rol16] + paddd xmm3,xmm7 + paddd xmm2,xmm6 + paddd xmm1,xmm5 + paddd xmm0,xmm4 + pxor xmm15,xmm3 + pxor xmm14,xmm2 + pxor xmm13,xmm1 + pxor xmm12,xmm0 +DB 102,69,15,56,0,248 +DB 102,69,15,56,0,240 +DB 102,69,15,56,0,232 +DB 102,69,15,56,0,224 + movdqa xmm8,XMMWORD[((160+80))+rbp] + paddd xmm11,xmm15 + paddd xmm10,xmm14 + paddd xmm9,xmm13 + paddd xmm8,xmm12 + pxor xmm7,xmm11 + add r10,QWORD[((0+0))+rdi] + adc r11,QWORD[((8+0))+rdi] + adc r12,1 + pxor xmm6,xmm10 + pxor xmm5,xmm9 + pxor xmm4,xmm8 + movdqa XMMWORD[(160+80)+rbp],xmm8 + movdqa xmm8,xmm7 + psrld xmm8,20 + pslld xmm7,32-20 + pxor xmm7,xmm8 + movdqa xmm8,xmm6 + psrld xmm8,20 + pslld xmm6,32-20 + pxor xmm6,xmm8 + movdqa xmm8,xmm5 + psrld xmm8,20 + pslld xmm5,32-20 + pxor xmm5,xmm8 + movdqa xmm8,xmm4 + psrld xmm8,20 + pslld xmm4,32-20 + pxor xmm4,xmm8 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc 
r15,rdx + movdqa xmm8,XMMWORD[$L$rol8] + paddd xmm3,xmm7 + paddd xmm2,xmm6 + paddd xmm1,xmm5 + paddd xmm0,xmm4 + pxor xmm15,xmm3 + pxor xmm14,xmm2 + pxor xmm13,xmm1 + pxor xmm12,xmm0 +DB 102,69,15,56,0,248 +DB 102,69,15,56,0,240 +DB 102,69,15,56,0,232 +DB 102,69,15,56,0,224 + movdqa xmm8,XMMWORD[((160+80))+rbp] + paddd xmm11,xmm15 + paddd xmm10,xmm14 + paddd xmm9,xmm13 + paddd xmm8,xmm12 + pxor xmm7,xmm11 + pxor xmm6,xmm10 + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + pxor xmm5,xmm9 + pxor xmm4,xmm8 + movdqa XMMWORD[(160+80)+rbp],xmm8 + movdqa xmm8,xmm7 + psrld xmm8,25 + pslld xmm7,32-25 + pxor xmm7,xmm8 + movdqa xmm8,xmm6 + psrld xmm8,25 + pslld xmm6,32-25 + pxor xmm6,xmm8 + movdqa xmm8,xmm5 + psrld xmm8,25 + pslld xmm5,32-25 + pxor xmm5,xmm8 + movdqa xmm8,xmm4 + psrld xmm8,25 + pslld xmm4,32-25 + pxor xmm4,xmm8 + movdqa xmm8,XMMWORD[((160+80))+rbp] + imul r9,r12 + add r15,r10 + adc r9,rdx +DB 102,15,58,15,255,4 +DB 102,69,15,58,15,219,8 +DB 102,69,15,58,15,255,12 +DB 102,15,58,15,246,4 +DB 102,69,15,58,15,210,8 +DB 102,69,15,58,15,246,12 +DB 102,15,58,15,237,4 +DB 102,69,15,58,15,201,8 +DB 102,69,15,58,15,237,12 +DB 102,15,58,15,228,4 +DB 102,69,15,58,15,192,8 +DB 102,69,15,58,15,228,12 + movdqa XMMWORD[(160+80)+rbp],xmm8 + movdqa xmm8,XMMWORD[$L$rol16] + paddd xmm3,xmm7 + paddd xmm2,xmm6 + paddd xmm1,xmm5 + paddd xmm0,xmm4 + pxor xmm15,xmm3 + pxor xmm14,xmm2 + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + pxor xmm13,xmm1 + pxor xmm12,xmm0 +DB 102,69,15,56,0,248 +DB 102,69,15,56,0,240 +DB 102,69,15,56,0,232 +DB 102,69,15,56,0,224 + movdqa xmm8,XMMWORD[((160+80))+rbp] + paddd xmm11,xmm15 + paddd xmm10,xmm14 + paddd xmm9,xmm13 + paddd xmm8,xmm12 + pxor xmm7,xmm11 + pxor xmm6,xmm10 + pxor xmm5,xmm9 + pxor xmm4,xmm8 + movdqa XMMWORD[(160+80)+rbp],xmm8 + movdqa xmm8,xmm7 + psrld xmm8,20 + pslld xmm7,32-20 + pxor xmm7,xmm8 + movdqa xmm8,xmm6 + psrld xmm8,20 + pslld xmm6,32-20 + pxor xmm6,xmm8 + movdqa xmm8,xmm5 + psrld xmm8,20 + pslld xmm5,32-20 + pxor xmm5,xmm8 + movdqa xmm8,xmm4 + psrld xmm8,20 + pslld xmm4,32-20 + pxor xmm4,xmm8 + movdqa xmm8,XMMWORD[$L$rol8] + paddd xmm3,xmm7 + paddd xmm2,xmm6 + paddd xmm1,xmm5 + paddd xmm0,xmm4 + pxor xmm15,xmm3 + pxor xmm14,xmm2 + pxor xmm13,xmm1 + pxor xmm12,xmm0 +DB 102,69,15,56,0,248 +DB 102,69,15,56,0,240 +DB 102,69,15,56,0,232 +DB 102,69,15,56,0,224 + movdqa xmm8,XMMWORD[((160+80))+rbp] + paddd xmm11,xmm15 + paddd xmm10,xmm14 + paddd xmm9,xmm13 + paddd xmm8,xmm12 + pxor xmm7,xmm11 + pxor xmm6,xmm10 + pxor xmm5,xmm9 + pxor xmm4,xmm8 + movdqa XMMWORD[(160+80)+rbp],xmm8 + movdqa xmm8,xmm7 + psrld xmm8,25 + pslld xmm7,32-25 + pxor xmm7,xmm8 + movdqa xmm8,xmm6 + psrld xmm8,25 + pslld xmm6,32-25 + pxor xmm6,xmm8 + movdqa xmm8,xmm5 + psrld xmm8,25 + pslld xmm5,32-25 + pxor xmm5,xmm8 + movdqa xmm8,xmm4 + psrld xmm8,25 + pslld xmm4,32-25 + pxor xmm4,xmm8 + movdqa xmm8,XMMWORD[((160+80))+rbp] +DB 102,15,58,15,255,12 +DB 102,69,15,58,15,219,8 +DB 102,69,15,58,15,255,4 +DB 102,15,58,15,246,12 +DB 102,69,15,58,15,210,8 +DB 102,69,15,58,15,246,4 +DB 102,15,58,15,237,12 +DB 102,69,15,58,15,201,8 +DB 102,69,15,58,15,237,4 +DB 102,15,58,15,228,12 +DB 102,69,15,58,15,192,8 +DB 102,69,15,58,15,228,4 + + lea rdi,[16+rdi] + dec r8 + jge NEAR $L$seal_sse_main_rounds + add r10,QWORD[((0+0))+rdi] + adc 
r11,QWORD[((8+0))+rdi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea rdi,[16+rdi] + dec rcx + jg NEAR $L$seal_sse_main_rounds + paddd xmm3,XMMWORD[$L$chacha20_consts] + paddd xmm7,XMMWORD[((160+48))+rbp] + paddd xmm11,XMMWORD[((160+64))+rbp] + paddd xmm15,XMMWORD[((160+144))+rbp] + paddd xmm2,XMMWORD[$L$chacha20_consts] + paddd xmm6,XMMWORD[((160+48))+rbp] + paddd xmm10,XMMWORD[((160+64))+rbp] + paddd xmm14,XMMWORD[((160+128))+rbp] + paddd xmm1,XMMWORD[$L$chacha20_consts] + paddd xmm5,XMMWORD[((160+48))+rbp] + paddd xmm9,XMMWORD[((160+64))+rbp] + paddd xmm13,XMMWORD[((160+112))+rbp] + paddd xmm0,XMMWORD[$L$chacha20_consts] + paddd xmm4,XMMWORD[((160+48))+rbp] + paddd xmm8,XMMWORD[((160+64))+rbp] + paddd xmm12,XMMWORD[((160+96))+rbp] + + movdqa XMMWORD[(160+80)+rbp],xmm14 + movdqa XMMWORD[(160+80)+rbp],xmm14 + movdqu xmm14,XMMWORD[((0 + 0))+rsi] + pxor xmm14,xmm3 + movdqu XMMWORD[(0 + 0)+rdi],xmm14 + movdqu xmm14,XMMWORD[((16 + 0))+rsi] + pxor xmm14,xmm7 + movdqu XMMWORD[(16 + 0)+rdi],xmm14 + movdqu xmm14,XMMWORD[((32 + 0))+rsi] + pxor xmm14,xmm11 + movdqu XMMWORD[(32 + 0)+rdi],xmm14 + movdqu xmm14,XMMWORD[((48 + 0))+rsi] + pxor xmm14,xmm15 + movdqu XMMWORD[(48 + 0)+rdi],xmm14 + + movdqa xmm14,XMMWORD[((160+80))+rbp] + movdqu xmm3,XMMWORD[((0 + 64))+rsi] + movdqu xmm7,XMMWORD[((16 + 64))+rsi] + movdqu xmm11,XMMWORD[((32 + 64))+rsi] + movdqu xmm15,XMMWORD[((48 + 64))+rsi] + pxor xmm2,xmm3 + pxor xmm6,xmm7 + pxor xmm10,xmm11 + pxor xmm15,xmm14 + movdqu XMMWORD[(0 + 64)+rdi],xmm2 + movdqu XMMWORD[(16 + 64)+rdi],xmm6 + movdqu XMMWORD[(32 + 64)+rdi],xmm10 + movdqu XMMWORD[(48 + 64)+rdi],xmm15 + movdqu xmm3,XMMWORD[((0 + 128))+rsi] + movdqu xmm7,XMMWORD[((16 + 128))+rsi] + movdqu xmm11,XMMWORD[((32 + 128))+rsi] + movdqu xmm15,XMMWORD[((48 + 128))+rsi] + pxor xmm1,xmm3 + pxor xmm5,xmm7 + pxor xmm9,xmm11 + pxor xmm15,xmm13 + movdqu XMMWORD[(0 + 128)+rdi],xmm1 + movdqu XMMWORD[(16 + 128)+rdi],xmm5 + movdqu XMMWORD[(32 + 128)+rdi],xmm9 + movdqu XMMWORD[(48 + 128)+rdi],xmm15 + + cmp rbx,16*16 + ja NEAR $L$seal_sse_main_loop_xor + + mov rcx,12*16 + sub rbx,12*16 + lea rsi,[192+rsi] + jmp NEAR $L$seal_sse_128_tail_hash +$L$seal_sse_main_loop_xor: + movdqu xmm3,XMMWORD[((0 + 192))+rsi] + movdqu xmm7,XMMWORD[((16 + 192))+rsi] + movdqu xmm11,XMMWORD[((32 + 192))+rsi] + movdqu xmm15,XMMWORD[((48 + 192))+rsi] + pxor xmm0,xmm3 + pxor xmm4,xmm7 + pxor xmm8,xmm11 + pxor xmm15,xmm12 + movdqu XMMWORD[(0 + 192)+rdi],xmm0 + movdqu XMMWORD[(16 + 192)+rdi],xmm4 + movdqu XMMWORD[(32 + 192)+rdi],xmm8 + movdqu XMMWORD[(48 + 192)+rdi],xmm15 + + lea rsi,[256+rsi] + sub rbx,16*16 + mov rcx,6 + mov r8,4 + cmp rbx,12*16 + jg NEAR $L$seal_sse_main_loop + mov rcx,rbx + test rbx,rbx + je NEAR $L$seal_sse_128_tail_hash + mov rcx,6 + cmp rbx,8*16 + ja NEAR $L$seal_sse_tail_192 + cmp rbx,4*16 + ja NEAR $L$seal_sse_tail_128 + +$L$seal_sse_tail_64: + movdqa xmm0,XMMWORD[$L$chacha20_consts] + movdqa xmm4,XMMWORD[((160+48))+rbp] + movdqa xmm8,XMMWORD[((160+64))+rbp] + movdqa xmm12,XMMWORD[((160+96))+rbp] + 
paddd xmm12,XMMWORD[$L$sse_inc] + movdqa XMMWORD[(160+96)+rbp],xmm12 + +$L$seal_sse_tail_64_rounds_and_x2hash: + add r10,QWORD[((0+0))+rdi] + adc r11,QWORD[((8+0))+rdi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea rdi,[16+rdi] +$L$seal_sse_tail_64_rounds_and_x1hash: + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol16] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,12 + psrld xmm4,20 + pxor xmm4,xmm3 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol8] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,7 + psrld xmm4,25 + pxor xmm4,xmm3 +DB 102,15,58,15,228,4 +DB 102,69,15,58,15,192,8 +DB 102,69,15,58,15,228,12 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol16] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,12 + psrld xmm4,20 + pxor xmm4,xmm3 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol8] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,7 + psrld xmm4,25 + pxor xmm4,xmm3 +DB 102,15,58,15,228,12 +DB 102,69,15,58,15,192,8 +DB 102,69,15,58,15,228,4 + add r10,QWORD[((0+0))+rdi] + adc r11,QWORD[((8+0))+rdi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea rdi,[16+rdi] + dec rcx + jg NEAR $L$seal_sse_tail_64_rounds_and_x2hash + dec r8 + jge NEAR $L$seal_sse_tail_64_rounds_and_x1hash + paddd xmm0,XMMWORD[$L$chacha20_consts] + paddd xmm4,XMMWORD[((160+48))+rbp] + paddd xmm8,XMMWORD[((160+64))+rbp] + paddd xmm12,XMMWORD[((160+96))+rbp] + + jmp NEAR $L$seal_sse_128_tail_xor + +$L$seal_sse_tail_128: + movdqa xmm0,XMMWORD[$L$chacha20_consts] + movdqa xmm4,XMMWORD[((160+48))+rbp] + movdqa xmm8,XMMWORD[((160+64))+rbp] + movdqa xmm1,xmm0 + movdqa xmm5,xmm4 + movdqa xmm9,xmm8 + movdqa xmm13,XMMWORD[((160+96))+rbp] + paddd xmm13,XMMWORD[$L$sse_inc] + movdqa xmm12,xmm13 + paddd xmm12,XMMWORD[$L$sse_inc] + movdqa XMMWORD[(160+96)+rbp],xmm12 + movdqa XMMWORD[(160+112)+rbp],xmm13 + +$L$seal_sse_tail_128_rounds_and_x2hash: + add r10,QWORD[((0+0))+rdi] + adc r11,QWORD[((8+0))+rdi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 
+ mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea rdi,[16+rdi] +$L$seal_sse_tail_128_rounds_and_x1hash: + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol16] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,12 + psrld xmm4,20 + pxor xmm4,xmm3 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol8] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,7 + psrld xmm4,25 + pxor xmm4,xmm3 +DB 102,15,58,15,228,4 +DB 102,69,15,58,15,192,8 +DB 102,69,15,58,15,228,12 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol16] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm3,xmm5 + pslld xmm3,12 + psrld xmm5,20 + pxor xmm5,xmm3 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol8] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm3,xmm5 + pslld xmm3,7 + psrld xmm5,25 + pxor xmm5,xmm3 +DB 102,15,58,15,237,4 +DB 102,69,15,58,15,201,8 +DB 102,69,15,58,15,237,12 + add r10,QWORD[((0+0))+rdi] + adc r11,QWORD[((8+0))+rdi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol16] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,12 + psrld xmm4,20 + pxor xmm4,xmm3 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol8] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,7 + psrld xmm4,25 + pxor xmm4,xmm3 +DB 102,15,58,15,228,12 +DB 102,69,15,58,15,192,8 +DB 102,69,15,58,15,228,4 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol16] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm3,xmm5 + pslld xmm3,12 + psrld xmm5,20 + pxor xmm5,xmm3 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol8] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm3,xmm5 + pslld xmm3,7 + psrld xmm5,25 + pxor xmm5,xmm3 +DB 102,15,58,15,237,12 +DB 102,69,15,58,15,201,8 +DB 102,69,15,58,15,237,4 + + lea rdi,[16+rdi] + dec rcx + jg NEAR $L$seal_sse_tail_128_rounds_and_x2hash + dec r8 + jge NEAR $L$seal_sse_tail_128_rounds_and_x1hash + paddd xmm1,XMMWORD[$L$chacha20_consts] + paddd xmm5,XMMWORD[((160+48))+rbp] + paddd xmm9,XMMWORD[((160+64))+rbp] + paddd xmm13,XMMWORD[((160+112))+rbp] + paddd xmm0,XMMWORD[$L$chacha20_consts] + paddd xmm4,XMMWORD[((160+48))+rbp] + paddd xmm8,XMMWORD[((160+64))+rbp] + paddd xmm12,XMMWORD[((160+96))+rbp] + movdqu xmm3,XMMWORD[((0 + 0))+rsi] + movdqu xmm7,XMMWORD[((16 + 0))+rsi] + movdqu xmm11,XMMWORD[((32 + 0))+rsi] + movdqu xmm15,XMMWORD[((48 + 0))+rsi] + pxor xmm1,xmm3 + pxor xmm5,xmm7 + pxor xmm9,xmm11 + pxor xmm15,xmm13 + movdqu XMMWORD[(0 + 0)+rdi],xmm1 + movdqu XMMWORD[(16 + 0)+rdi],xmm5 + movdqu XMMWORD[(32 + 0)+rdi],xmm9 + movdqu XMMWORD[(48 + 0)+rdi],xmm15 + + mov rcx,4*16 + sub rbx,4*16 + lea rsi,[64+rsi] + jmp NEAR $L$seal_sse_128_tail_hash + +$L$seal_sse_tail_192: + movdqa xmm0,XMMWORD[$L$chacha20_consts] + movdqa xmm4,XMMWORD[((160+48))+rbp] + movdqa xmm8,XMMWORD[((160+64))+rbp] 
+ movdqa xmm1,xmm0 + movdqa xmm5,xmm4 + movdqa xmm9,xmm8 + movdqa xmm2,xmm0 + movdqa xmm6,xmm4 + movdqa xmm10,xmm8 + movdqa xmm14,XMMWORD[((160+96))+rbp] + paddd xmm14,XMMWORD[$L$sse_inc] + movdqa xmm13,xmm14 + paddd xmm13,XMMWORD[$L$sse_inc] + movdqa xmm12,xmm13 + paddd xmm12,XMMWORD[$L$sse_inc] + movdqa XMMWORD[(160+96)+rbp],xmm12 + movdqa XMMWORD[(160+112)+rbp],xmm13 + movdqa XMMWORD[(160+128)+rbp],xmm14 + +$L$seal_sse_tail_192_rounds_and_x2hash: + add r10,QWORD[((0+0))+rdi] + adc r11,QWORD[((8+0))+rdi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea rdi,[16+rdi] +$L$seal_sse_tail_192_rounds_and_x1hash: + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol16] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,12 + psrld xmm4,20 + pxor xmm4,xmm3 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol8] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,7 + psrld xmm4,25 + pxor xmm4,xmm3 +DB 102,15,58,15,228,4 +DB 102,69,15,58,15,192,8 +DB 102,69,15,58,15,228,12 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol16] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm3,xmm5 + pslld xmm3,12 + psrld xmm5,20 + pxor xmm5,xmm3 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol8] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm3,xmm5 + pslld xmm3,7 + psrld xmm5,25 + pxor xmm5,xmm3 +DB 102,15,58,15,237,4 +DB 102,69,15,58,15,201,8 +DB 102,69,15,58,15,237,12 + paddd xmm2,xmm6 + pxor xmm14,xmm2 + pshufb xmm14,XMMWORD[$L$rol16] + paddd xmm10,xmm14 + pxor xmm6,xmm10 + movdqa xmm3,xmm6 + pslld xmm3,12 + psrld xmm6,20 + pxor xmm6,xmm3 + paddd xmm2,xmm6 + pxor xmm14,xmm2 + pshufb xmm14,XMMWORD[$L$rol8] + paddd xmm10,xmm14 + pxor xmm6,xmm10 + movdqa xmm3,xmm6 + pslld xmm3,7 + psrld xmm6,25 + pxor xmm6,xmm3 +DB 102,15,58,15,246,4 +DB 102,69,15,58,15,210,8 +DB 102,69,15,58,15,246,12 + add r10,QWORD[((0+0))+rdi] + adc r11,QWORD[((8+0))+rdi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol16] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,12 + psrld xmm4,20 + pxor xmm4,xmm3 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol8] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,7 + psrld xmm4,25 + pxor xmm4,xmm3 +DB 102,15,58,15,228,12 +DB 102,69,15,58,15,192,8 +DB 102,69,15,58,15,228,4 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol16] + paddd 
xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm3,xmm5 + pslld xmm3,12 + psrld xmm5,20 + pxor xmm5,xmm3 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol8] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm3,xmm5 + pslld xmm3,7 + psrld xmm5,25 + pxor xmm5,xmm3 +DB 102,15,58,15,237,12 +DB 102,69,15,58,15,201,8 +DB 102,69,15,58,15,237,4 + paddd xmm2,xmm6 + pxor xmm14,xmm2 + pshufb xmm14,XMMWORD[$L$rol16] + paddd xmm10,xmm14 + pxor xmm6,xmm10 + movdqa xmm3,xmm6 + pslld xmm3,12 + psrld xmm6,20 + pxor xmm6,xmm3 + paddd xmm2,xmm6 + pxor xmm14,xmm2 + pshufb xmm14,XMMWORD[$L$rol8] + paddd xmm10,xmm14 + pxor xmm6,xmm10 + movdqa xmm3,xmm6 + pslld xmm3,7 + psrld xmm6,25 + pxor xmm6,xmm3 +DB 102,15,58,15,246,12 +DB 102,69,15,58,15,210,8 +DB 102,69,15,58,15,246,4 + + lea rdi,[16+rdi] + dec rcx + jg NEAR $L$seal_sse_tail_192_rounds_and_x2hash + dec r8 + jge NEAR $L$seal_sse_tail_192_rounds_and_x1hash + paddd xmm2,XMMWORD[$L$chacha20_consts] + paddd xmm6,XMMWORD[((160+48))+rbp] + paddd xmm10,XMMWORD[((160+64))+rbp] + paddd xmm14,XMMWORD[((160+128))+rbp] + paddd xmm1,XMMWORD[$L$chacha20_consts] + paddd xmm5,XMMWORD[((160+48))+rbp] + paddd xmm9,XMMWORD[((160+64))+rbp] + paddd xmm13,XMMWORD[((160+112))+rbp] + paddd xmm0,XMMWORD[$L$chacha20_consts] + paddd xmm4,XMMWORD[((160+48))+rbp] + paddd xmm8,XMMWORD[((160+64))+rbp] + paddd xmm12,XMMWORD[((160+96))+rbp] + movdqu xmm3,XMMWORD[((0 + 0))+rsi] + movdqu xmm7,XMMWORD[((16 + 0))+rsi] + movdqu xmm11,XMMWORD[((32 + 0))+rsi] + movdqu xmm15,XMMWORD[((48 + 0))+rsi] + pxor xmm2,xmm3 + pxor xmm6,xmm7 + pxor xmm10,xmm11 + pxor xmm15,xmm14 + movdqu XMMWORD[(0 + 0)+rdi],xmm2 + movdqu XMMWORD[(16 + 0)+rdi],xmm6 + movdqu XMMWORD[(32 + 0)+rdi],xmm10 + movdqu XMMWORD[(48 + 0)+rdi],xmm15 + movdqu xmm3,XMMWORD[((0 + 64))+rsi] + movdqu xmm7,XMMWORD[((16 + 64))+rsi] + movdqu xmm11,XMMWORD[((32 + 64))+rsi] + movdqu xmm15,XMMWORD[((48 + 64))+rsi] + pxor xmm1,xmm3 + pxor xmm5,xmm7 + pxor xmm9,xmm11 + pxor xmm15,xmm13 + movdqu XMMWORD[(0 + 64)+rdi],xmm1 + movdqu XMMWORD[(16 + 64)+rdi],xmm5 + movdqu XMMWORD[(32 + 64)+rdi],xmm9 + movdqu XMMWORD[(48 + 64)+rdi],xmm15 + + mov rcx,8*16 + sub rbx,8*16 + lea rsi,[128+rsi] + +$L$seal_sse_128_tail_hash: + cmp rcx,16 + jb NEAR $L$seal_sse_128_tail_xor + add r10,QWORD[((0+0))+rdi] + adc r11,QWORD[((8+0))+rdi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + sub rcx,16 + lea rdi,[16+rdi] + jmp NEAR $L$seal_sse_128_tail_hash + +$L$seal_sse_128_tail_xor: + cmp rbx,16 + jb NEAR $L$seal_sse_tail_16 + sub rbx,16 + + movdqu xmm3,XMMWORD[rsi] + pxor xmm0,xmm3 + movdqu XMMWORD[rdi],xmm0 + + add r10,QWORD[rdi] + adc r11,QWORD[8+rdi] + adc r12,1 + lea rsi,[16+rsi] + lea rdi,[16+rdi] + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + 
add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + + movdqa xmm0,xmm4 + movdqa xmm4,xmm8 + movdqa xmm8,xmm12 + movdqa xmm12,xmm1 + movdqa xmm1,xmm5 + movdqa xmm5,xmm9 + movdqa xmm9,xmm13 + jmp NEAR $L$seal_sse_128_tail_xor + +$L$seal_sse_tail_16: + test rbx,rbx + jz NEAR $L$process_blocks_of_extra_in + + mov r8,rbx + mov rcx,rbx + lea rsi,[((-1))+rbx*1+rsi] + pxor xmm15,xmm15 +$L$seal_sse_tail_16_compose: + pslldq xmm15,1 + pinsrb xmm15,BYTE[rsi],0 + lea rsi,[((-1))+rsi] + dec rcx + jne NEAR $L$seal_sse_tail_16_compose + + + pxor xmm15,xmm0 + + + mov rcx,rbx + movdqu xmm0,xmm15 +$L$seal_sse_tail_16_extract: + pextrb XMMWORD[rdi],xmm0,0 + psrldq xmm0,1 + add rdi,1 + sub rcx,1 + jnz NEAR $L$seal_sse_tail_16_extract + + + + + + + + + mov r9,QWORD[((288 + 160 + 32))+rsp] + mov r14,QWORD[56+r9] + mov r13,QWORD[48+r9] + test r14,r14 + jz NEAR $L$process_partial_block + + mov r15,16 + sub r15,rbx + cmp r14,r15 + + jge NEAR $L$load_extra_in + mov r15,r14 + +$L$load_extra_in: + + + lea rsi,[((-1))+r15*1+r13] + + + add r13,r15 + sub r14,r15 + mov QWORD[48+r9],r13 + mov QWORD[56+r9],r14 + + + + add r8,r15 + + + pxor xmm11,xmm11 +$L$load_extra_load_loop: + pslldq xmm11,1 + pinsrb xmm11,BYTE[rsi],0 + lea rsi,[((-1))+rsi] + sub r15,1 + jnz NEAR $L$load_extra_load_loop + + + + + mov r15,rbx + +$L$load_extra_shift_loop: + pslldq xmm11,1 + sub r15,1 + jnz NEAR $L$load_extra_shift_loop + + + + + lea r15,[$L$and_masks] + shl rbx,4 + pand xmm15,XMMWORD[((-16))+rbx*1+r15] + + + por xmm15,xmm11 + + + +DB 102,77,15,126,253 + pextrq r14,xmm15,1 + add r10,r13 + adc r11,r14 + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + +$L$process_blocks_of_extra_in: + + mov r9,QWORD[((288+32+160 ))+rsp] + mov rsi,QWORD[48+r9] + mov r8,QWORD[56+r9] + mov rcx,r8 + shr r8,4 + +$L$process_extra_hash_loop: + jz NEAR process_extra_in_trailer + add r10,QWORD[((0+0))+rsi] + adc r11,QWORD[((8+0))+rsi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea rsi,[16+rsi] + sub r8,1 + jmp NEAR $L$process_extra_hash_loop +process_extra_in_trailer: + and rcx,15 + mov rbx,rcx + jz NEAR $L$do_length_block + lea rsi,[((-1))+rcx*1+rsi] + +$L$process_extra_in_trailer_load: + pslldq xmm15,1 + pinsrb xmm15,BYTE[rsi],0 + lea rsi,[((-1))+rsi] + sub rcx,1 + jnz NEAR $L$process_extra_in_trailer_load + +$L$process_partial_block: + + lea 
r15,[$L$and_masks] + shl rbx,4 + pand xmm15,XMMWORD[((-16))+rbx*1+r15] +DB 102,77,15,126,253 + pextrq r14,xmm15,1 + add r10,r13 + adc r11,r14 + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + +$L$do_length_block: + add r10,QWORD[((0+160+32))+rbp] + adc r11,QWORD[((8+160+32))+rbp] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + + mov r13,r10 + mov r14,r11 + mov r15,r12 + sub r10,-5 + sbb r11,-1 + sbb r12,3 + cmovc r10,r13 + cmovc r11,r14 + cmovc r12,r15 + + add r10,QWORD[((0+160+16))+rbp] + adc r11,QWORD[((8+160+16))+rbp] + + movaps xmm6,XMMWORD[((0+0))+rbp] + movaps xmm7,XMMWORD[((16+0))+rbp] + movaps xmm8,XMMWORD[((32+0))+rbp] + movaps xmm9,XMMWORD[((48+0))+rbp] + movaps xmm10,XMMWORD[((64+0))+rbp] + movaps xmm11,XMMWORD[((80+0))+rbp] + movaps xmm12,XMMWORD[((96+0))+rbp] + movaps xmm13,XMMWORD[((112+0))+rbp] + movaps xmm14,XMMWORD[((128+0))+rbp] + movaps xmm15,XMMWORD[((144+0))+rbp] + + + add rsp,288 + 160 + 32 + + + pop r9 + + mov QWORD[r9],r10 + mov QWORD[8+r9],r11 + pop r15 + + pop r14 + + pop r13 + + pop r12 + + pop rbx + + pop rbp + + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$seal_sse_128: + + movdqu xmm0,XMMWORD[$L$chacha20_consts] + movdqa xmm1,xmm0 + movdqa xmm2,xmm0 + movdqu xmm4,XMMWORD[r9] + movdqa xmm5,xmm4 + movdqa xmm6,xmm4 + movdqu xmm8,XMMWORD[16+r9] + movdqa xmm9,xmm8 + movdqa xmm10,xmm8 + movdqu xmm14,XMMWORD[32+r9] + movdqa xmm12,xmm14 + paddd xmm12,XMMWORD[$L$sse_inc] + movdqa xmm13,xmm12 + paddd xmm13,XMMWORD[$L$sse_inc] + movdqa xmm7,xmm4 + movdqa xmm11,xmm8 + movdqa xmm15,xmm12 + mov r10,10 + +$L$seal_sse_128_rounds: + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol16] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,12 + psrld xmm4,20 + pxor xmm4,xmm3 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol8] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,7 + psrld xmm4,25 + pxor xmm4,xmm3 +DB 102,15,58,15,228,4 +DB 102,69,15,58,15,192,8 +DB 102,69,15,58,15,228,12 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol16] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm3,xmm5 + pslld xmm3,12 + psrld xmm5,20 + pxor xmm5,xmm3 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol8] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm3,xmm5 + pslld xmm3,7 + psrld xmm5,25 + pxor xmm5,xmm3 +DB 102,15,58,15,237,4 +DB 102,69,15,58,15,201,8 +DB 102,69,15,58,15,237,12 + paddd xmm2,xmm6 + pxor xmm14,xmm2 + pshufb 
xmm14,XMMWORD[$L$rol16] + paddd xmm10,xmm14 + pxor xmm6,xmm10 + movdqa xmm3,xmm6 + pslld xmm3,12 + psrld xmm6,20 + pxor xmm6,xmm3 + paddd xmm2,xmm6 + pxor xmm14,xmm2 + pshufb xmm14,XMMWORD[$L$rol8] + paddd xmm10,xmm14 + pxor xmm6,xmm10 + movdqa xmm3,xmm6 + pslld xmm3,7 + psrld xmm6,25 + pxor xmm6,xmm3 +DB 102,15,58,15,246,4 +DB 102,69,15,58,15,210,8 +DB 102,69,15,58,15,246,12 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol16] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,12 + psrld xmm4,20 + pxor xmm4,xmm3 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol8] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,7 + psrld xmm4,25 + pxor xmm4,xmm3 +DB 102,15,58,15,228,12 +DB 102,69,15,58,15,192,8 +DB 102,69,15,58,15,228,4 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol16] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm3,xmm5 + pslld xmm3,12 + psrld xmm5,20 + pxor xmm5,xmm3 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol8] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm3,xmm5 + pslld xmm3,7 + psrld xmm5,25 + pxor xmm5,xmm3 +DB 102,15,58,15,237,12 +DB 102,69,15,58,15,201,8 +DB 102,69,15,58,15,237,4 + paddd xmm2,xmm6 + pxor xmm14,xmm2 + pshufb xmm14,XMMWORD[$L$rol16] + paddd xmm10,xmm14 + pxor xmm6,xmm10 + movdqa xmm3,xmm6 + pslld xmm3,12 + psrld xmm6,20 + pxor xmm6,xmm3 + paddd xmm2,xmm6 + pxor xmm14,xmm2 + pshufb xmm14,XMMWORD[$L$rol8] + paddd xmm10,xmm14 + pxor xmm6,xmm10 + movdqa xmm3,xmm6 + pslld xmm3,7 + psrld xmm6,25 + pxor xmm6,xmm3 +DB 102,15,58,15,246,12 +DB 102,69,15,58,15,210,8 +DB 102,69,15,58,15,246,4 + + dec r10 + jnz NEAR $L$seal_sse_128_rounds + paddd xmm0,XMMWORD[$L$chacha20_consts] + paddd xmm1,XMMWORD[$L$chacha20_consts] + paddd xmm2,XMMWORD[$L$chacha20_consts] + paddd xmm4,xmm7 + paddd xmm5,xmm7 + paddd xmm6,xmm7 + paddd xmm8,xmm11 + paddd xmm9,xmm11 + paddd xmm12,xmm15 + paddd xmm15,XMMWORD[$L$sse_inc] + paddd xmm13,xmm15 + + pand xmm2,XMMWORD[$L$clamp] + movdqa XMMWORD[(160+0)+rbp],xmm2 + movdqa XMMWORD[(160+16)+rbp],xmm6 + + mov r8,r8 + call poly_hash_ad_internal + jmp NEAR $L$seal_sse_128_tail_xor +$L$SEH_end_chacha20_poly1305_seal_sse41: + + + +global chacha20_poly1305_open_avx2 + +ALIGN 64 +chacha20_poly1305_open_avx2: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_chacha20_poly1305_open_avx2: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + mov r9,QWORD[48+rsp] + + + +_CET_ENDBR + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + + + push r9 + + sub rsp,288 + 160 + 32 + + + lea rbp,[32+rsp] + and rbp,-32 + + movaps XMMWORD[(0+0)+rbp],xmm6 + movaps XMMWORD[(16+0)+rbp],xmm7 + movaps XMMWORD[(32+0)+rbp],xmm8 + movaps XMMWORD[(48+0)+rbp],xmm9 + movaps XMMWORD[(64+0)+rbp],xmm10 + movaps XMMWORD[(80+0)+rbp],xmm11 + movaps XMMWORD[(96+0)+rbp],xmm12 + movaps XMMWORD[(112+0)+rbp],xmm13 + movaps XMMWORD[(128+0)+rbp],xmm14 + movaps XMMWORD[(144+0)+rbp],xmm15 + + mov rbx,rdx + mov QWORD[((0+160+32))+rbp],r8 + mov QWORD[((8+160+32))+rbp],rbx + + vzeroupper + vmovdqa ymm0,YMMWORD[$L$chacha20_consts] + vbroadcasti128 ymm4,XMMWORD[r9] + vbroadcasti128 ymm8,XMMWORD[16+r9] + vbroadcasti128 ymm12,XMMWORD[32+r9] + vpaddd ymm12,ymm12,YMMWORD[$L$avx2_init] + cmp rbx,6*32 + jbe NEAR $L$open_avx2_192 + cmp rbx,10*32 + jbe NEAR $L$open_avx2_320 + + vmovdqa YMMWORD[(160+64)+rbp],ymm4 + vmovdqa YMMWORD[(160+96)+rbp],ymm8 + vmovdqa YMMWORD[(160+160)+rbp],ymm12 + mov r10,10 
+$L$open_avx2_init_rounds: + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol16] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpsrld ymm3,ymm4,20 + vpslld ymm4,ymm4,12 + vpxor ymm4,ymm4,ymm3 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol8] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpslld ymm3,ymm4,7 + vpsrld ymm4,ymm4,25 + vpxor ymm4,ymm4,ymm3 + vpalignr ymm12,ymm12,ymm12,12 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm4,ymm4,ymm4,4 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol16] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpsrld ymm3,ymm4,20 + vpslld ymm4,ymm4,12 + vpxor ymm4,ymm4,ymm3 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol8] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpslld ymm3,ymm4,7 + vpsrld ymm4,ymm4,25 + vpxor ymm4,ymm4,ymm3 + vpalignr ymm12,ymm12,ymm12,4 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm4,ymm4,ymm4,12 + + dec r10 + jne NEAR $L$open_avx2_init_rounds + vpaddd ymm0,ymm0,YMMWORD[$L$chacha20_consts] + vpaddd ymm4,ymm4,YMMWORD[((160+64))+rbp] + vpaddd ymm8,ymm8,YMMWORD[((160+96))+rbp] + vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp] + + vperm2i128 ymm3,ymm4,ymm0,0x02 + + vpand ymm3,ymm3,YMMWORD[$L$clamp] + vmovdqa YMMWORD[(160+0)+rbp],ymm3 + + vperm2i128 ymm0,ymm4,ymm0,0x13 + vperm2i128 ymm4,ymm12,ymm8,0x13 + + mov r8,r8 + call poly_hash_ad_internal + + xor rcx,rcx +$L$open_avx2_init_hash: + add r10,QWORD[((0+0))+rcx*1+rsi] + adc r11,QWORD[((8+0))+rcx*1+rsi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + add rcx,16 + cmp rcx,2*32 + jne NEAR $L$open_avx2_init_hash + + vpxor ymm0,ymm0,YMMWORD[rsi] + vpxor ymm4,ymm4,YMMWORD[32+rsi] + + vmovdqu YMMWORD[rdi],ymm0 + vmovdqu YMMWORD[32+rdi],ymm4 + lea rsi,[64+rsi] + lea rdi,[64+rdi] + sub rbx,2*32 +$L$open_avx2_main_loop: + + cmp rbx,16*32 + jb NEAR $L$open_avx2_main_loop_done + vmovdqa ymm0,YMMWORD[$L$chacha20_consts] + vmovdqa ymm4,YMMWORD[((160+64))+rbp] + vmovdqa ymm8,YMMWORD[((160+96))+rbp] + vmovdqa ymm1,ymm0 + vmovdqa ymm5,ymm4 + vmovdqa ymm9,ymm8 + vmovdqa ymm2,ymm0 + vmovdqa ymm6,ymm4 + vmovdqa ymm10,ymm8 + vmovdqa ymm3,ymm0 + vmovdqa ymm7,ymm4 + vmovdqa ymm11,ymm8 + vmovdqa ymm12,YMMWORD[$L$avx2_inc] + vpaddd ymm15,ymm12,YMMWORD[((160+160))+rbp] + vpaddd ymm14,ymm12,ymm15 + vpaddd ymm13,ymm12,ymm14 + vpaddd ymm12,ymm12,ymm13 + vmovdqa YMMWORD[(160+256)+rbp],ymm15 + vmovdqa YMMWORD[(160+224)+rbp],ymm14 + vmovdqa YMMWORD[(160+192)+rbp],ymm13 + vmovdqa YMMWORD[(160+160)+rbp],ymm12 + + xor rcx,rcx +$L$open_avx2_main_loop_rounds: + add r10,QWORD[((0+0))+rcx*1+rsi] + adc r11,QWORD[((8+0))+rcx*1+rsi] + adc r12,1 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vmovdqa ymm8,YMMWORD[$L$rol16] + vpaddd ymm3,ymm3,ymm7 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm15,ymm15,ymm3 + vpxor ymm14,ymm14,ymm2 + vpxor ymm13,ymm13,ymm1 + vpxor ymm12,ymm12,ymm0 + mov rdx,QWORD[((0+160+0))+rbp] + mov r15,rdx + mulx 
r14,r13,r10 + mulx rdx,rax,r11 + imul r15,r12 + add r14,rax + adc r15,rdx + vpshufb ymm15,ymm15,ymm8 + vpshufb ymm14,ymm14,ymm8 + vpshufb ymm13,ymm13,ymm8 + vpshufb ymm12,ymm12,ymm8 + vpaddd ymm11,ymm11,ymm15 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] + vpxor ymm7,ymm7,ymm11 + mov rdx,QWORD[((8+160+0))+rbp] + mulx rax,r10,r10 + add r14,r10 + mulx r9,r11,r11 + adc r15,r11 + adc r9,0 + imul rdx,r12 + vpxor ymm6,ymm6,ymm10 + vpxor ymm5,ymm5,ymm9 + vpxor ymm4,ymm4,ymm8 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vpsrld ymm8,ymm7,20 + vpslld ymm7,ymm7,32-20 + vpxor ymm7,ymm7,ymm8 + vpsrld ymm8,ymm6,20 + vpslld ymm6,ymm6,32-20 + vpxor ymm6,ymm6,ymm8 + vpsrld ymm8,ymm5,20 + vpslld ymm5,ymm5,32-20 + add r15,rax + adc r9,rdx + vpxor ymm5,ymm5,ymm8 + vpsrld ymm8,ymm4,20 + vpslld ymm4,ymm4,32-20 + vpxor ymm4,ymm4,ymm8 + vmovdqa ymm8,YMMWORD[$L$rol8] + vpaddd ymm3,ymm3,ymm7 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm15,ymm15,ymm3 + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + vpxor ymm14,ymm14,ymm2 + vpxor ymm13,ymm13,ymm1 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm15,ymm15,ymm8 + vpshufb ymm14,ymm14,ymm8 + vpshufb ymm13,ymm13,ymm8 + vpshufb ymm12,ymm12,ymm8 + vpaddd ymm11,ymm11,ymm15 + vpaddd ymm10,ymm10,ymm14 + add r10,QWORD[((0+16))+rcx*1+rsi] + adc r11,QWORD[((8+16))+rcx*1+rsi] + adc r12,1 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] + vpxor ymm7,ymm7,ymm11 + vpxor ymm6,ymm6,ymm10 + vpxor ymm5,ymm5,ymm9 + vpxor ymm4,ymm4,ymm8 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vpsrld ymm8,ymm7,25 + mov rdx,QWORD[((0+160+0))+rbp] + mov r15,rdx + mulx r14,r13,r10 + mulx rdx,rax,r11 + imul r15,r12 + add r14,rax + adc r15,rdx + vpslld ymm7,ymm7,32-25 + vpxor ymm7,ymm7,ymm8 + vpsrld ymm8,ymm6,25 + vpslld ymm6,ymm6,32-25 + vpxor ymm6,ymm6,ymm8 + vpsrld ymm8,ymm5,25 + vpslld ymm5,ymm5,32-25 + vpxor ymm5,ymm5,ymm8 + vpsrld ymm8,ymm4,25 + vpslld ymm4,ymm4,32-25 + vpxor ymm4,ymm4,ymm8 + vmovdqa ymm8,YMMWORD[((160+128))+rbp] + vpalignr ymm7,ymm7,ymm7,4 + vpalignr ymm11,ymm11,ymm11,8 + vpalignr ymm15,ymm15,ymm15,12 + vpalignr ymm6,ymm6,ymm6,4 + vpalignr ymm10,ymm10,ymm10,8 + vpalignr ymm14,ymm14,ymm14,12 + mov rdx,QWORD[((8+160+0))+rbp] + mulx rax,r10,r10 + add r14,r10 + mulx r9,r11,r11 + adc r15,r11 + adc r9,0 + imul rdx,r12 + vpalignr ymm5,ymm5,ymm5,4 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm13,ymm13,ymm13,12 + vpalignr ymm4,ymm4,ymm4,4 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm12,ymm12,ymm12,12 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vmovdqa ymm8,YMMWORD[$L$rol16] + vpaddd ymm3,ymm3,ymm7 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm15,ymm15,ymm3 + vpxor ymm14,ymm14,ymm2 + vpxor ymm13,ymm13,ymm1 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm15,ymm15,ymm8 + vpshufb ymm14,ymm14,ymm8 + add r15,rax + adc r9,rdx + vpshufb ymm13,ymm13,ymm8 + vpshufb ymm12,ymm12,ymm8 + vpaddd ymm11,ymm11,ymm15 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] + vpxor ymm7,ymm7,ymm11 + vpxor ymm6,ymm6,ymm10 + vpxor ymm5,ymm5,ymm9 + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + vpxor ymm4,ymm4,ymm8 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vpsrld ymm8,ymm7,20 + vpslld ymm7,ymm7,32-20 
+ vpxor ymm7,ymm7,ymm8 + vpsrld ymm8,ymm6,20 + vpslld ymm6,ymm6,32-20 + vpxor ymm6,ymm6,ymm8 + add r10,QWORD[((0+32))+rcx*1+rsi] + adc r11,QWORD[((8+32))+rcx*1+rsi] + adc r12,1 + + lea rcx,[48+rcx] + vpsrld ymm8,ymm5,20 + vpslld ymm5,ymm5,32-20 + vpxor ymm5,ymm5,ymm8 + vpsrld ymm8,ymm4,20 + vpslld ymm4,ymm4,32-20 + vpxor ymm4,ymm4,ymm8 + vmovdqa ymm8,YMMWORD[$L$rol8] + vpaddd ymm3,ymm3,ymm7 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm15,ymm15,ymm3 + vpxor ymm14,ymm14,ymm2 + vpxor ymm13,ymm13,ymm1 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm15,ymm15,ymm8 + vpshufb ymm14,ymm14,ymm8 + vpshufb ymm13,ymm13,ymm8 + mov rdx,QWORD[((0+160+0))+rbp] + mov r15,rdx + mulx r14,r13,r10 + mulx rdx,rax,r11 + imul r15,r12 + add r14,rax + adc r15,rdx + vpshufb ymm12,ymm12,ymm8 + vpaddd ymm11,ymm11,ymm15 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] + vpxor ymm7,ymm7,ymm11 + vpxor ymm6,ymm6,ymm10 + vpxor ymm5,ymm5,ymm9 + mov rdx,QWORD[((8+160+0))+rbp] + mulx rax,r10,r10 + add r14,r10 + mulx r9,r11,r11 + adc r15,r11 + adc r9,0 + imul rdx,r12 + vpxor ymm4,ymm4,ymm8 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vpsrld ymm8,ymm7,25 + vpslld ymm7,ymm7,32-25 + vpxor ymm7,ymm7,ymm8 + vpsrld ymm8,ymm6,25 + vpslld ymm6,ymm6,32-25 + vpxor ymm6,ymm6,ymm8 + add r15,rax + adc r9,rdx + vpsrld ymm8,ymm5,25 + vpslld ymm5,ymm5,32-25 + vpxor ymm5,ymm5,ymm8 + vpsrld ymm8,ymm4,25 + vpslld ymm4,ymm4,32-25 + vpxor ymm4,ymm4,ymm8 + vmovdqa ymm8,YMMWORD[((160+128))+rbp] + vpalignr ymm7,ymm7,ymm7,12 + vpalignr ymm11,ymm11,ymm11,8 + vpalignr ymm15,ymm15,ymm15,4 + vpalignr ymm6,ymm6,ymm6,12 + vpalignr ymm10,ymm10,ymm10,8 + vpalignr ymm14,ymm14,ymm14,4 + vpalignr ymm5,ymm5,ymm5,12 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm13,ymm13,ymm13,4 + vpalignr ymm4,ymm4,ymm4,12 + vpalignr ymm8,ymm8,ymm8,8 + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + vpalignr ymm12,ymm12,ymm12,4 + + cmp rcx,10*6*8 + jne NEAR $L$open_avx2_main_loop_rounds + vpaddd ymm3,ymm3,YMMWORD[$L$chacha20_consts] + vpaddd ymm7,ymm7,YMMWORD[((160+64))+rbp] + vpaddd ymm11,ymm11,YMMWORD[((160+96))+rbp] + vpaddd ymm15,ymm15,YMMWORD[((160+256))+rbp] + vpaddd ymm2,ymm2,YMMWORD[$L$chacha20_consts] + vpaddd ymm6,ymm6,YMMWORD[((160+64))+rbp] + vpaddd ymm10,ymm10,YMMWORD[((160+96))+rbp] + vpaddd ymm14,ymm14,YMMWORD[((160+224))+rbp] + vpaddd ymm1,ymm1,YMMWORD[$L$chacha20_consts] + vpaddd ymm5,ymm5,YMMWORD[((160+64))+rbp] + vpaddd ymm9,ymm9,YMMWORD[((160+96))+rbp] + vpaddd ymm13,ymm13,YMMWORD[((160+192))+rbp] + vpaddd ymm0,ymm0,YMMWORD[$L$chacha20_consts] + vpaddd ymm4,ymm4,YMMWORD[((160+64))+rbp] + vpaddd ymm8,ymm8,YMMWORD[((160+96))+rbp] + vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp] + + vmovdqa YMMWORD[(160+128)+rbp],ymm0 + add r10,QWORD[((0+480))+rsi] + adc r11,QWORD[((8+480))+rsi] + adc r12,1 + vperm2i128 ymm0,ymm7,ymm3,0x02 + vperm2i128 ymm7,ymm7,ymm3,0x13 + vperm2i128 ymm3,ymm15,ymm11,0x02 + vperm2i128 ymm11,ymm15,ymm11,0x13 + vpxor ymm0,ymm0,YMMWORD[((0+0))+rsi] + vpxor ymm3,ymm3,YMMWORD[((32+0))+rsi] + vpxor ymm7,ymm7,YMMWORD[((64+0))+rsi] + vpxor ymm11,ymm11,YMMWORD[((96+0))+rsi] + vmovdqu YMMWORD[(0+0)+rdi],ymm0 + vmovdqu YMMWORD[(32+0)+rdi],ymm3 + vmovdqu YMMWORD[(64+0)+rdi],ymm7 + vmovdqu YMMWORD[(96+0)+rdi],ymm11 + + vmovdqa ymm0,YMMWORD[((160+128))+rbp] + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov 
rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + vperm2i128 ymm3,ymm6,ymm2,0x02 + vperm2i128 ymm6,ymm6,ymm2,0x13 + vperm2i128 ymm2,ymm14,ymm10,0x02 + vperm2i128 ymm10,ymm14,ymm10,0x13 + vpxor ymm3,ymm3,YMMWORD[((0+128))+rsi] + vpxor ymm2,ymm2,YMMWORD[((32+128))+rsi] + vpxor ymm6,ymm6,YMMWORD[((64+128))+rsi] + vpxor ymm10,ymm10,YMMWORD[((96+128))+rsi] + vmovdqu YMMWORD[(0+128)+rdi],ymm3 + vmovdqu YMMWORD[(32+128)+rdi],ymm2 + vmovdqu YMMWORD[(64+128)+rdi],ymm6 + vmovdqu YMMWORD[(96+128)+rdi],ymm10 + add r10,QWORD[((0+480+16))+rsi] + adc r11,QWORD[((8+480+16))+rsi] + adc r12,1 + vperm2i128 ymm3,ymm5,ymm1,0x02 + vperm2i128 ymm5,ymm5,ymm1,0x13 + vperm2i128 ymm1,ymm13,ymm9,0x02 + vperm2i128 ymm9,ymm13,ymm9,0x13 + vpxor ymm3,ymm3,YMMWORD[((0+256))+rsi] + vpxor ymm1,ymm1,YMMWORD[((32+256))+rsi] + vpxor ymm5,ymm5,YMMWORD[((64+256))+rsi] + vpxor ymm9,ymm9,YMMWORD[((96+256))+rsi] + vmovdqu YMMWORD[(0+256)+rdi],ymm3 + vmovdqu YMMWORD[(32+256)+rdi],ymm1 + vmovdqu YMMWORD[(64+256)+rdi],ymm5 + vmovdqu YMMWORD[(96+256)+rdi],ymm9 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + vperm2i128 ymm3,ymm4,ymm0,0x02 + vperm2i128 ymm4,ymm4,ymm0,0x13 + vperm2i128 ymm0,ymm12,ymm8,0x02 + vperm2i128 ymm8,ymm12,ymm8,0x13 + vpxor ymm3,ymm3,YMMWORD[((0+384))+rsi] + vpxor ymm0,ymm0,YMMWORD[((32+384))+rsi] + vpxor ymm4,ymm4,YMMWORD[((64+384))+rsi] + vpxor ymm8,ymm8,YMMWORD[((96+384))+rsi] + vmovdqu YMMWORD[(0+384)+rdi],ymm3 + vmovdqu YMMWORD[(32+384)+rdi],ymm0 + vmovdqu YMMWORD[(64+384)+rdi],ymm4 + vmovdqu YMMWORD[(96+384)+rdi],ymm8 + + lea rsi,[512+rsi] + lea rdi,[512+rdi] + sub rbx,16*32 + jmp NEAR $L$open_avx2_main_loop +$L$open_avx2_main_loop_done: + test rbx,rbx + vzeroupper + je NEAR $L$open_sse_finalize + + cmp rbx,12*32 + ja NEAR $L$open_avx2_tail_512 + cmp rbx,8*32 + ja NEAR $L$open_avx2_tail_384 + cmp rbx,4*32 + ja NEAR $L$open_avx2_tail_256 + vmovdqa ymm0,YMMWORD[$L$chacha20_consts] + vmovdqa ymm4,YMMWORD[((160+64))+rbp] + vmovdqa ymm8,YMMWORD[((160+96))+rbp] + vmovdqa ymm12,YMMWORD[$L$avx2_inc] + vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp] + vmovdqa YMMWORD[(160+160)+rbp],ymm12 + + xor r8,r8 + mov rcx,rbx + and rcx,-16 + test rcx,rcx + je NEAR $L$open_avx2_tail_128_rounds +$L$open_avx2_tail_128_rounds_and_x1hash: + add r10,QWORD[((0+0))+r8*1+rsi] + adc r11,QWORD[((8+0))+r8*1+rsi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + 
mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + +$L$open_avx2_tail_128_rounds: + add r8,16 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol16] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpsrld ymm3,ymm4,20 + vpslld ymm4,ymm4,12 + vpxor ymm4,ymm4,ymm3 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol8] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpslld ymm3,ymm4,7 + vpsrld ymm4,ymm4,25 + vpxor ymm4,ymm4,ymm3 + vpalignr ymm12,ymm12,ymm12,12 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm4,ymm4,ymm4,4 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol16] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpsrld ymm3,ymm4,20 + vpslld ymm4,ymm4,12 + vpxor ymm4,ymm4,ymm3 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol8] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpslld ymm3,ymm4,7 + vpsrld ymm4,ymm4,25 + vpxor ymm4,ymm4,ymm3 + vpalignr ymm12,ymm12,ymm12,4 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm4,ymm4,ymm4,12 + + cmp r8,rcx + jb NEAR $L$open_avx2_tail_128_rounds_and_x1hash + cmp r8,160 + jne NEAR $L$open_avx2_tail_128_rounds + vpaddd ymm0,ymm0,YMMWORD[$L$chacha20_consts] + vpaddd ymm4,ymm4,YMMWORD[((160+64))+rbp] + vpaddd ymm8,ymm8,YMMWORD[((160+96))+rbp] + vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp] + vperm2i128 ymm3,ymm4,ymm0,0x13 + vperm2i128 ymm0,ymm4,ymm0,0x02 + vperm2i128 ymm4,ymm12,ymm8,0x02 + vperm2i128 ymm12,ymm12,ymm8,0x13 + vmovdqa ymm8,ymm3 + + jmp NEAR $L$open_avx2_tail_128_xor + +$L$open_avx2_tail_256: + vmovdqa ymm0,YMMWORD[$L$chacha20_consts] + vmovdqa ymm4,YMMWORD[((160+64))+rbp] + vmovdqa ymm8,YMMWORD[((160+96))+rbp] + vmovdqa ymm1,ymm0 + vmovdqa ymm5,ymm4 + vmovdqa ymm9,ymm8 + vmovdqa ymm12,YMMWORD[$L$avx2_inc] + vpaddd ymm13,ymm12,YMMWORD[((160+160))+rbp] + vpaddd ymm12,ymm12,ymm13 + vmovdqa YMMWORD[(160+160)+rbp],ymm12 + vmovdqa YMMWORD[(160+192)+rbp],ymm13 + + mov QWORD[((160+128))+rbp],rbx + mov rcx,rbx + sub rcx,4*32 + shr rcx,4 + mov r8,10 + cmp rcx,10 + cmovg rcx,r8 + mov rbx,rsi + xor r8,r8 +$L$open_avx2_tail_256_rounds_and_x1hash: + add r10,QWORD[((0+0))+rbx] + adc r11,QWORD[((8+0))+rbx] + adc r12,1 + mov rdx,QWORD[((0+160+0))+rbp] + mov r15,rdx + mulx r14,r13,r10 + mulx rdx,rax,r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rdx,QWORD[((8+160+0))+rbp] + mulx rax,r10,r10 + add r14,r10 + mulx r9,r11,r11 + adc r15,r11 + adc r9,0 + imul rdx,r12 + add r15,rax + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea rbx,[16+rbx] +$L$open_avx2_tail_256_rounds: + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol16] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpsrld ymm3,ymm4,20 + vpslld ymm4,ymm4,12 + vpxor ymm4,ymm4,ymm3 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol8] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpslld ymm3,ymm4,7 + vpsrld ymm4,ymm4,25 + vpxor ymm4,ymm4,ymm3 + vpalignr ymm12,ymm12,ymm12,12 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm4,ymm4,ymm4,4 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol16] + vpaddd 
ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpsrld ymm3,ymm5,20 + vpslld ymm5,ymm5,12 + vpxor ymm5,ymm5,ymm3 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol8] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpslld ymm3,ymm5,7 + vpsrld ymm5,ymm5,25 + vpxor ymm5,ymm5,ymm3 + vpalignr ymm13,ymm13,ymm13,12 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm5,ymm5,ymm5,4 + + inc r8 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol16] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpsrld ymm3,ymm4,20 + vpslld ymm4,ymm4,12 + vpxor ymm4,ymm4,ymm3 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol8] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpslld ymm3,ymm4,7 + vpsrld ymm4,ymm4,25 + vpxor ymm4,ymm4,ymm3 + vpalignr ymm12,ymm12,ymm12,4 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm4,ymm4,ymm4,12 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol16] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpsrld ymm3,ymm5,20 + vpslld ymm5,ymm5,12 + vpxor ymm5,ymm5,ymm3 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol8] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpslld ymm3,ymm5,7 + vpsrld ymm5,ymm5,25 + vpxor ymm5,ymm5,ymm3 + vpalignr ymm13,ymm13,ymm13,4 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm5,ymm5,ymm5,12 + vpaddd ymm2,ymm2,ymm6 + vpxor ymm14,ymm14,ymm2 + vpshufb ymm14,ymm14,YMMWORD[$L$rol16] + vpaddd ymm10,ymm10,ymm14 + vpxor ymm6,ymm6,ymm10 + vpsrld ymm3,ymm6,20 + vpslld ymm6,ymm6,12 + vpxor ymm6,ymm6,ymm3 + vpaddd ymm2,ymm2,ymm6 + vpxor ymm14,ymm14,ymm2 + vpshufb ymm14,ymm14,YMMWORD[$L$rol8] + vpaddd ymm10,ymm10,ymm14 + vpxor ymm6,ymm6,ymm10 + vpslld ymm3,ymm6,7 + vpsrld ymm6,ymm6,25 + vpxor ymm6,ymm6,ymm3 + vpalignr ymm14,ymm14,ymm14,4 + vpalignr ymm10,ymm10,ymm10,8 + vpalignr ymm6,ymm6,ymm6,12 + + cmp r8,rcx + jb NEAR $L$open_avx2_tail_256_rounds_and_x1hash + cmp r8,10 + jne NEAR $L$open_avx2_tail_256_rounds + mov r8,rbx + sub rbx,rsi + mov rcx,rbx + mov rbx,QWORD[((160+128))+rbp] +$L$open_avx2_tail_256_hash: + add rcx,16 + cmp rcx,rbx + jg NEAR $L$open_avx2_tail_256_done + add r10,QWORD[((0+0))+r8] + adc r11,QWORD[((8+0))+r8] + adc r12,1 + mov rdx,QWORD[((0+160+0))+rbp] + mov r15,rdx + mulx r14,r13,r10 + mulx rdx,rax,r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rdx,QWORD[((8+160+0))+rbp] + mulx rax,r10,r10 + add r14,r10 + mulx r9,r11,r11 + adc r15,r11 + adc r9,0 + imul rdx,r12 + add r15,rax + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea r8,[16+r8] + jmp NEAR $L$open_avx2_tail_256_hash +$L$open_avx2_tail_256_done: + vpaddd ymm1,ymm1,YMMWORD[$L$chacha20_consts] + vpaddd ymm5,ymm5,YMMWORD[((160+64))+rbp] + vpaddd ymm9,ymm9,YMMWORD[((160+96))+rbp] + vpaddd ymm13,ymm13,YMMWORD[((160+192))+rbp] + vpaddd ymm0,ymm0,YMMWORD[$L$chacha20_consts] + vpaddd ymm4,ymm4,YMMWORD[((160+64))+rbp] + vpaddd ymm8,ymm8,YMMWORD[((160+96))+rbp] + vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp] + vperm2i128 ymm3,ymm5,ymm1,0x02 + vperm2i128 ymm5,ymm5,ymm1,0x13 + vperm2i128 ymm1,ymm13,ymm9,0x02 + vperm2i128 ymm9,ymm13,ymm9,0x13 + vpxor ymm3,ymm3,YMMWORD[((0+0))+rsi] + vpxor ymm1,ymm1,YMMWORD[((32+0))+rsi] + vpxor ymm5,ymm5,YMMWORD[((64+0))+rsi] + vpxor ymm9,ymm9,YMMWORD[((96+0))+rsi] + vmovdqu YMMWORD[(0+0)+rdi],ymm3 + vmovdqu YMMWORD[(32+0)+rdi],ymm1 + vmovdqu 
YMMWORD[(64+0)+rdi],ymm5 + vmovdqu YMMWORD[(96+0)+rdi],ymm9 + vperm2i128 ymm3,ymm4,ymm0,0x13 + vperm2i128 ymm0,ymm4,ymm0,0x02 + vperm2i128 ymm4,ymm12,ymm8,0x02 + vperm2i128 ymm12,ymm12,ymm8,0x13 + vmovdqa ymm8,ymm3 + + lea rsi,[128+rsi] + lea rdi,[128+rdi] + sub rbx,4*32 + jmp NEAR $L$open_avx2_tail_128_xor + +$L$open_avx2_tail_384: + vmovdqa ymm0,YMMWORD[$L$chacha20_consts] + vmovdqa ymm4,YMMWORD[((160+64))+rbp] + vmovdqa ymm8,YMMWORD[((160+96))+rbp] + vmovdqa ymm1,ymm0 + vmovdqa ymm5,ymm4 + vmovdqa ymm9,ymm8 + vmovdqa ymm2,ymm0 + vmovdqa ymm6,ymm4 + vmovdqa ymm10,ymm8 + vmovdqa ymm12,YMMWORD[$L$avx2_inc] + vpaddd ymm14,ymm12,YMMWORD[((160+160))+rbp] + vpaddd ymm13,ymm12,ymm14 + vpaddd ymm12,ymm12,ymm13 + vmovdqa YMMWORD[(160+160)+rbp],ymm12 + vmovdqa YMMWORD[(160+192)+rbp],ymm13 + vmovdqa YMMWORD[(160+224)+rbp],ymm14 + + mov QWORD[((160+128))+rbp],rbx + mov rcx,rbx + sub rcx,8*32 + shr rcx,4 + add rcx,6 + mov r8,10 + cmp rcx,10 + cmovg rcx,r8 + mov rbx,rsi + xor r8,r8 +$L$open_avx2_tail_384_rounds_and_x2hash: + add r10,QWORD[((0+0))+rbx] + adc r11,QWORD[((8+0))+rbx] + adc r12,1 + mov rdx,QWORD[((0+160+0))+rbp] + mov r15,rdx + mulx r14,r13,r10 + mulx rdx,rax,r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rdx,QWORD[((8+160+0))+rbp] + mulx rax,r10,r10 + add r14,r10 + mulx r9,r11,r11 + adc r15,r11 + adc r9,0 + imul rdx,r12 + add r15,rax + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea rbx,[16+rbx] +$L$open_avx2_tail_384_rounds_and_x1hash: + vpaddd ymm2,ymm2,ymm6 + vpxor ymm14,ymm14,ymm2 + vpshufb ymm14,ymm14,YMMWORD[$L$rol16] + vpaddd ymm10,ymm10,ymm14 + vpxor ymm6,ymm6,ymm10 + vpsrld ymm3,ymm6,20 + vpslld ymm6,ymm6,12 + vpxor ymm6,ymm6,ymm3 + vpaddd ymm2,ymm2,ymm6 + vpxor ymm14,ymm14,ymm2 + vpshufb ymm14,ymm14,YMMWORD[$L$rol8] + vpaddd ymm10,ymm10,ymm14 + vpxor ymm6,ymm6,ymm10 + vpslld ymm3,ymm6,7 + vpsrld ymm6,ymm6,25 + vpxor ymm6,ymm6,ymm3 + vpalignr ymm14,ymm14,ymm14,12 + vpalignr ymm10,ymm10,ymm10,8 + vpalignr ymm6,ymm6,ymm6,4 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol16] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpsrld ymm3,ymm5,20 + vpslld ymm5,ymm5,12 + vpxor ymm5,ymm5,ymm3 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol8] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpslld ymm3,ymm5,7 + vpsrld ymm5,ymm5,25 + vpxor ymm5,ymm5,ymm3 + vpalignr ymm13,ymm13,ymm13,12 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm5,ymm5,ymm5,4 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol16] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpsrld ymm3,ymm4,20 + vpslld ymm4,ymm4,12 + vpxor ymm4,ymm4,ymm3 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol8] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpslld ymm3,ymm4,7 + vpsrld ymm4,ymm4,25 + vpxor ymm4,ymm4,ymm3 + vpalignr ymm12,ymm12,ymm12,12 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm4,ymm4,ymm4,4 + add r10,QWORD[((0+0))+rbx] + adc r11,QWORD[((8+0))+rbx] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + 
adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea rbx,[16+rbx] + inc r8 + vpaddd ymm2,ymm2,ymm6 + vpxor ymm14,ymm14,ymm2 + vpshufb ymm14,ymm14,YMMWORD[$L$rol16] + vpaddd ymm10,ymm10,ymm14 + vpxor ymm6,ymm6,ymm10 + vpsrld ymm3,ymm6,20 + vpslld ymm6,ymm6,12 + vpxor ymm6,ymm6,ymm3 + vpaddd ymm2,ymm2,ymm6 + vpxor ymm14,ymm14,ymm2 + vpshufb ymm14,ymm14,YMMWORD[$L$rol8] + vpaddd ymm10,ymm10,ymm14 + vpxor ymm6,ymm6,ymm10 + vpslld ymm3,ymm6,7 + vpsrld ymm6,ymm6,25 + vpxor ymm6,ymm6,ymm3 + vpalignr ymm14,ymm14,ymm14,4 + vpalignr ymm10,ymm10,ymm10,8 + vpalignr ymm6,ymm6,ymm6,12 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol16] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpsrld ymm3,ymm5,20 + vpslld ymm5,ymm5,12 + vpxor ymm5,ymm5,ymm3 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol8] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpslld ymm3,ymm5,7 + vpsrld ymm5,ymm5,25 + vpxor ymm5,ymm5,ymm3 + vpalignr ymm13,ymm13,ymm13,4 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm5,ymm5,ymm5,12 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol16] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpsrld ymm3,ymm4,20 + vpslld ymm4,ymm4,12 + vpxor ymm4,ymm4,ymm3 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol8] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpslld ymm3,ymm4,7 + vpsrld ymm4,ymm4,25 + vpxor ymm4,ymm4,ymm3 + vpalignr ymm12,ymm12,ymm12,4 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm4,ymm4,ymm4,12 + + cmp r8,rcx + jb NEAR $L$open_avx2_tail_384_rounds_and_x2hash + cmp r8,10 + jne NEAR $L$open_avx2_tail_384_rounds_and_x1hash + mov r8,rbx + sub rbx,rsi + mov rcx,rbx + mov rbx,QWORD[((160+128))+rbp] +$L$open_avx2_384_tail_hash: + add rcx,16 + cmp rcx,rbx + jg NEAR $L$open_avx2_384_tail_done + add r10,QWORD[((0+0))+r8] + adc r11,QWORD[((8+0))+r8] + adc r12,1 + mov rdx,QWORD[((0+160+0))+rbp] + mov r15,rdx + mulx r14,r13,r10 + mulx rdx,rax,r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rdx,QWORD[((8+160+0))+rbp] + mulx rax,r10,r10 + add r14,r10 + mulx r9,r11,r11 + adc r15,r11 + adc r9,0 + imul rdx,r12 + add r15,rax + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea r8,[16+r8] + jmp NEAR $L$open_avx2_384_tail_hash +$L$open_avx2_384_tail_done: + vpaddd ymm2,ymm2,YMMWORD[$L$chacha20_consts] + vpaddd ymm6,ymm6,YMMWORD[((160+64))+rbp] + vpaddd ymm10,ymm10,YMMWORD[((160+96))+rbp] + vpaddd ymm14,ymm14,YMMWORD[((160+224))+rbp] + vpaddd ymm1,ymm1,YMMWORD[$L$chacha20_consts] + vpaddd ymm5,ymm5,YMMWORD[((160+64))+rbp] + vpaddd ymm9,ymm9,YMMWORD[((160+96))+rbp] + vpaddd ymm13,ymm13,YMMWORD[((160+192))+rbp] + vpaddd ymm0,ymm0,YMMWORD[$L$chacha20_consts] + vpaddd ymm4,ymm4,YMMWORD[((160+64))+rbp] + vpaddd ymm8,ymm8,YMMWORD[((160+96))+rbp] + vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp] + vperm2i128 ymm3,ymm6,ymm2,0x02 + vperm2i128 ymm6,ymm6,ymm2,0x13 + vperm2i128 ymm2,ymm14,ymm10,0x02 + vperm2i128 ymm10,ymm14,ymm10,0x13 + vpxor ymm3,ymm3,YMMWORD[((0+0))+rsi] + vpxor ymm2,ymm2,YMMWORD[((32+0))+rsi] + vpxor ymm6,ymm6,YMMWORD[((64+0))+rsi] + vpxor ymm10,ymm10,YMMWORD[((96+0))+rsi] + vmovdqu YMMWORD[(0+0)+rdi],ymm3 + vmovdqu YMMWORD[(32+0)+rdi],ymm2 + vmovdqu 
YMMWORD[(64+0)+rdi],ymm6 + vmovdqu YMMWORD[(96+0)+rdi],ymm10 + vperm2i128 ymm3,ymm5,ymm1,0x02 + vperm2i128 ymm5,ymm5,ymm1,0x13 + vperm2i128 ymm1,ymm13,ymm9,0x02 + vperm2i128 ymm9,ymm13,ymm9,0x13 + vpxor ymm3,ymm3,YMMWORD[((0+128))+rsi] + vpxor ymm1,ymm1,YMMWORD[((32+128))+rsi] + vpxor ymm5,ymm5,YMMWORD[((64+128))+rsi] + vpxor ymm9,ymm9,YMMWORD[((96+128))+rsi] + vmovdqu YMMWORD[(0+128)+rdi],ymm3 + vmovdqu YMMWORD[(32+128)+rdi],ymm1 + vmovdqu YMMWORD[(64+128)+rdi],ymm5 + vmovdqu YMMWORD[(96+128)+rdi],ymm9 + vperm2i128 ymm3,ymm4,ymm0,0x13 + vperm2i128 ymm0,ymm4,ymm0,0x02 + vperm2i128 ymm4,ymm12,ymm8,0x02 + vperm2i128 ymm12,ymm12,ymm8,0x13 + vmovdqa ymm8,ymm3 + + lea rsi,[256+rsi] + lea rdi,[256+rdi] + sub rbx,8*32 + jmp NEAR $L$open_avx2_tail_128_xor + +$L$open_avx2_tail_512: + vmovdqa ymm0,YMMWORD[$L$chacha20_consts] + vmovdqa ymm4,YMMWORD[((160+64))+rbp] + vmovdqa ymm8,YMMWORD[((160+96))+rbp] + vmovdqa ymm1,ymm0 + vmovdqa ymm5,ymm4 + vmovdqa ymm9,ymm8 + vmovdqa ymm2,ymm0 + vmovdqa ymm6,ymm4 + vmovdqa ymm10,ymm8 + vmovdqa ymm3,ymm0 + vmovdqa ymm7,ymm4 + vmovdqa ymm11,ymm8 + vmovdqa ymm12,YMMWORD[$L$avx2_inc] + vpaddd ymm15,ymm12,YMMWORD[((160+160))+rbp] + vpaddd ymm14,ymm12,ymm15 + vpaddd ymm13,ymm12,ymm14 + vpaddd ymm12,ymm12,ymm13 + vmovdqa YMMWORD[(160+256)+rbp],ymm15 + vmovdqa YMMWORD[(160+224)+rbp],ymm14 + vmovdqa YMMWORD[(160+192)+rbp],ymm13 + vmovdqa YMMWORD[(160+160)+rbp],ymm12 + + xor rcx,rcx + mov r8,rsi +$L$open_avx2_tail_512_rounds_and_x2hash: + add r10,QWORD[((0+0))+r8] + adc r11,QWORD[((8+0))+r8] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea r8,[16+r8] +$L$open_avx2_tail_512_rounds_and_x1hash: + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vmovdqa ymm8,YMMWORD[$L$rol16] + vpaddd ymm3,ymm3,ymm7 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm15,ymm15,ymm3 + vpxor ymm14,ymm14,ymm2 + vpxor ymm13,ymm13,ymm1 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm15,ymm15,ymm8 + vpshufb ymm14,ymm14,ymm8 + vpshufb ymm13,ymm13,ymm8 + vpshufb ymm12,ymm12,ymm8 + vpaddd ymm11,ymm11,ymm15 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] + vpxor ymm7,ymm7,ymm11 + vpxor ymm6,ymm6,ymm10 + vpxor ymm5,ymm5,ymm9 + vpxor ymm4,ymm4,ymm8 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vpsrld ymm8,ymm7,20 + vpslld ymm7,ymm7,32-20 + vpxor ymm7,ymm7,ymm8 + vpsrld ymm8,ymm6,20 + vpslld ymm6,ymm6,32-20 + vpxor ymm6,ymm6,ymm8 + vpsrld ymm8,ymm5,20 + vpslld ymm5,ymm5,32-20 + vpxor ymm5,ymm5,ymm8 + vpsrld ymm8,ymm4,20 + vpslld ymm4,ymm4,32-20 + vpxor ymm4,ymm4,ymm8 + vmovdqa ymm8,YMMWORD[$L$rol8] + vpaddd ymm3,ymm3,ymm7 + add r10,QWORD[((0+0))+r8] + adc r11,QWORD[((8+0))+r8] + adc r12,1 + mov rdx,QWORD[((0+160+0))+rbp] + mov r15,rdx + mulx r14,r13,r10 + mulx rdx,rax,r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rdx,QWORD[((8+160+0))+rbp] + mulx rax,r10,r10 + add r14,r10 + mulx r9,r11,r11 + adc r15,r11 + adc r9,0 + imul rdx,r12 + add r15,rax + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + 
and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm15,ymm15,ymm3 + vpxor ymm14,ymm14,ymm2 + vpxor ymm13,ymm13,ymm1 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm15,ymm15,ymm8 + vpshufb ymm14,ymm14,ymm8 + vpshufb ymm13,ymm13,ymm8 + vpshufb ymm12,ymm12,ymm8 + vpaddd ymm11,ymm11,ymm15 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] + vpxor ymm7,ymm7,ymm11 + vpxor ymm6,ymm6,ymm10 + vpxor ymm5,ymm5,ymm9 + vpxor ymm4,ymm4,ymm8 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vpsrld ymm8,ymm7,25 + vpslld ymm7,ymm7,32-25 + vpxor ymm7,ymm7,ymm8 + vpsrld ymm8,ymm6,25 + vpslld ymm6,ymm6,32-25 + vpxor ymm6,ymm6,ymm8 + vpsrld ymm8,ymm5,25 + vpslld ymm5,ymm5,32-25 + vpxor ymm5,ymm5,ymm8 + vpsrld ymm8,ymm4,25 + vpslld ymm4,ymm4,32-25 + vpxor ymm4,ymm4,ymm8 + vmovdqa ymm8,YMMWORD[((160+128))+rbp] + vpalignr ymm7,ymm7,ymm7,4 + vpalignr ymm11,ymm11,ymm11,8 + vpalignr ymm15,ymm15,ymm15,12 + vpalignr ymm6,ymm6,ymm6,4 + vpalignr ymm10,ymm10,ymm10,8 + vpalignr ymm14,ymm14,ymm14,12 + vpalignr ymm5,ymm5,ymm5,4 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm13,ymm13,ymm13,12 + vpalignr ymm4,ymm4,ymm4,4 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm12,ymm12,ymm12,12 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vmovdqa ymm8,YMMWORD[$L$rol16] + vpaddd ymm3,ymm3,ymm7 + add r10,QWORD[((0+16))+r8] + adc r11,QWORD[((8+16))+r8] + adc r12,1 + mov rdx,QWORD[((0+160+0))+rbp] + mov r15,rdx + mulx r14,r13,r10 + mulx rdx,rax,r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rdx,QWORD[((8+160+0))+rbp] + mulx rax,r10,r10 + add r14,r10 + mulx r9,r11,r11 + adc r15,r11 + adc r9,0 + imul rdx,r12 + add r15,rax + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea r8,[32+r8] + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm15,ymm15,ymm3 + vpxor ymm14,ymm14,ymm2 + vpxor ymm13,ymm13,ymm1 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm15,ymm15,ymm8 + vpshufb ymm14,ymm14,ymm8 + vpshufb ymm13,ymm13,ymm8 + vpshufb ymm12,ymm12,ymm8 + vpaddd ymm11,ymm11,ymm15 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] + vpxor ymm7,ymm7,ymm11 + vpxor ymm6,ymm6,ymm10 + vpxor ymm5,ymm5,ymm9 + vpxor ymm4,ymm4,ymm8 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vpsrld ymm8,ymm7,20 + vpslld ymm7,ymm7,32-20 + vpxor ymm7,ymm7,ymm8 + vpsrld ymm8,ymm6,20 + vpslld ymm6,ymm6,32-20 + vpxor ymm6,ymm6,ymm8 + vpsrld ymm8,ymm5,20 + vpslld ymm5,ymm5,32-20 + vpxor ymm5,ymm5,ymm8 + vpsrld ymm8,ymm4,20 + vpslld ymm4,ymm4,32-20 + vpxor ymm4,ymm4,ymm8 + vmovdqa ymm8,YMMWORD[$L$rol8] + vpaddd ymm3,ymm3,ymm7 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm15,ymm15,ymm3 + vpxor ymm14,ymm14,ymm2 + vpxor ymm13,ymm13,ymm1 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm15,ymm15,ymm8 + vpshufb ymm14,ymm14,ymm8 + vpshufb ymm13,ymm13,ymm8 + vpshufb ymm12,ymm12,ymm8 + vpaddd ymm11,ymm11,ymm15 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] + vpxor ymm7,ymm7,ymm11 + vpxor ymm6,ymm6,ymm10 + vpxor ymm5,ymm5,ymm9 + vpxor ymm4,ymm4,ymm8 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vpsrld ymm8,ymm7,25 + vpslld ymm7,ymm7,32-25 + vpxor ymm7,ymm7,ymm8 + vpsrld ymm8,ymm6,25 + vpslld ymm6,ymm6,32-25 + vpxor ymm6,ymm6,ymm8 + vpsrld ymm8,ymm5,25 + 
vpslld ymm5,ymm5,32-25 + vpxor ymm5,ymm5,ymm8 + vpsrld ymm8,ymm4,25 + vpslld ymm4,ymm4,32-25 + vpxor ymm4,ymm4,ymm8 + vmovdqa ymm8,YMMWORD[((160+128))+rbp] + vpalignr ymm7,ymm7,ymm7,12 + vpalignr ymm11,ymm11,ymm11,8 + vpalignr ymm15,ymm15,ymm15,4 + vpalignr ymm6,ymm6,ymm6,12 + vpalignr ymm10,ymm10,ymm10,8 + vpalignr ymm14,ymm14,ymm14,4 + vpalignr ymm5,ymm5,ymm5,12 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm13,ymm13,ymm13,4 + vpalignr ymm4,ymm4,ymm4,12 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm12,ymm12,ymm12,4 + + inc rcx + cmp rcx,4 + jl NEAR $L$open_avx2_tail_512_rounds_and_x2hash + cmp rcx,10 + jne NEAR $L$open_avx2_tail_512_rounds_and_x1hash + mov rcx,rbx + sub rcx,12*32 + and rcx,-16 +$L$open_avx2_tail_512_hash: + test rcx,rcx + je NEAR $L$open_avx2_tail_512_done + add r10,QWORD[((0+0))+r8] + adc r11,QWORD[((8+0))+r8] + adc r12,1 + mov rdx,QWORD[((0+160+0))+rbp] + mov r15,rdx + mulx r14,r13,r10 + mulx rdx,rax,r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rdx,QWORD[((8+160+0))+rbp] + mulx rax,r10,r10 + add r14,r10 + mulx r9,r11,r11 + adc r15,r11 + adc r9,0 + imul rdx,r12 + add r15,rax + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea r8,[16+r8] + sub rcx,2*8 + jmp NEAR $L$open_avx2_tail_512_hash +$L$open_avx2_tail_512_done: + vpaddd ymm3,ymm3,YMMWORD[$L$chacha20_consts] + vpaddd ymm7,ymm7,YMMWORD[((160+64))+rbp] + vpaddd ymm11,ymm11,YMMWORD[((160+96))+rbp] + vpaddd ymm15,ymm15,YMMWORD[((160+256))+rbp] + vpaddd ymm2,ymm2,YMMWORD[$L$chacha20_consts] + vpaddd ymm6,ymm6,YMMWORD[((160+64))+rbp] + vpaddd ymm10,ymm10,YMMWORD[((160+96))+rbp] + vpaddd ymm14,ymm14,YMMWORD[((160+224))+rbp] + vpaddd ymm1,ymm1,YMMWORD[$L$chacha20_consts] + vpaddd ymm5,ymm5,YMMWORD[((160+64))+rbp] + vpaddd ymm9,ymm9,YMMWORD[((160+96))+rbp] + vpaddd ymm13,ymm13,YMMWORD[((160+192))+rbp] + vpaddd ymm0,ymm0,YMMWORD[$L$chacha20_consts] + vpaddd ymm4,ymm4,YMMWORD[((160+64))+rbp] + vpaddd ymm8,ymm8,YMMWORD[((160+96))+rbp] + vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp] + + vmovdqa YMMWORD[(160+128)+rbp],ymm0 + vperm2i128 ymm0,ymm7,ymm3,0x02 + vperm2i128 ymm7,ymm7,ymm3,0x13 + vperm2i128 ymm3,ymm15,ymm11,0x02 + vperm2i128 ymm11,ymm15,ymm11,0x13 + vpxor ymm0,ymm0,YMMWORD[((0+0))+rsi] + vpxor ymm3,ymm3,YMMWORD[((32+0))+rsi] + vpxor ymm7,ymm7,YMMWORD[((64+0))+rsi] + vpxor ymm11,ymm11,YMMWORD[((96+0))+rsi] + vmovdqu YMMWORD[(0+0)+rdi],ymm0 + vmovdqu YMMWORD[(32+0)+rdi],ymm3 + vmovdqu YMMWORD[(64+0)+rdi],ymm7 + vmovdqu YMMWORD[(96+0)+rdi],ymm11 + + vmovdqa ymm0,YMMWORD[((160+128))+rbp] + vperm2i128 ymm3,ymm6,ymm2,0x02 + vperm2i128 ymm6,ymm6,ymm2,0x13 + vperm2i128 ymm2,ymm14,ymm10,0x02 + vperm2i128 ymm10,ymm14,ymm10,0x13 + vpxor ymm3,ymm3,YMMWORD[((0+128))+rsi] + vpxor ymm2,ymm2,YMMWORD[((32+128))+rsi] + vpxor ymm6,ymm6,YMMWORD[((64+128))+rsi] + vpxor ymm10,ymm10,YMMWORD[((96+128))+rsi] + vmovdqu YMMWORD[(0+128)+rdi],ymm3 + vmovdqu YMMWORD[(32+128)+rdi],ymm2 + vmovdqu YMMWORD[(64+128)+rdi],ymm6 + vmovdqu YMMWORD[(96+128)+rdi],ymm10 + vperm2i128 ymm3,ymm5,ymm1,0x02 + vperm2i128 ymm5,ymm5,ymm1,0x13 + vperm2i128 ymm1,ymm13,ymm9,0x02 + vperm2i128 ymm9,ymm13,ymm9,0x13 + vpxor ymm3,ymm3,YMMWORD[((0+256))+rsi] + vpxor ymm1,ymm1,YMMWORD[((32+256))+rsi] + vpxor ymm5,ymm5,YMMWORD[((64+256))+rsi] + vpxor ymm9,ymm9,YMMWORD[((96+256))+rsi] + vmovdqu YMMWORD[(0+256)+rdi],ymm3 + vmovdqu YMMWORD[(32+256)+rdi],ymm1 + vmovdqu YMMWORD[(64+256)+rdi],ymm5 + vmovdqu YMMWORD[(96+256)+rdi],ymm9 + 
vperm2i128 ymm3,ymm4,ymm0,0x13 + vperm2i128 ymm0,ymm4,ymm0,0x02 + vperm2i128 ymm4,ymm12,ymm8,0x02 + vperm2i128 ymm12,ymm12,ymm8,0x13 + vmovdqa ymm8,ymm3 + + lea rsi,[384+rsi] + lea rdi,[384+rdi] + sub rbx,12*32 +$L$open_avx2_tail_128_xor: + cmp rbx,32 + jb NEAR $L$open_avx2_tail_32_xor + sub rbx,32 + vpxor ymm0,ymm0,YMMWORD[rsi] + vmovdqu YMMWORD[rdi],ymm0 + lea rsi,[32+rsi] + lea rdi,[32+rdi] + vmovdqa ymm0,ymm4 + vmovdqa ymm4,ymm8 + vmovdqa ymm8,ymm12 + jmp NEAR $L$open_avx2_tail_128_xor +$L$open_avx2_tail_32_xor: + cmp rbx,16 + vmovdqa xmm1,xmm0 + jb NEAR $L$open_avx2_exit + sub rbx,16 + + vpxor xmm1,xmm0,XMMWORD[rsi] + vmovdqu XMMWORD[rdi],xmm1 + lea rsi,[16+rsi] + lea rdi,[16+rdi] + vperm2i128 ymm0,ymm0,ymm0,0x11 + vmovdqa xmm1,xmm0 +$L$open_avx2_exit: + vzeroupper + jmp NEAR $L$open_sse_tail_16 + +$L$open_avx2_192: + vmovdqa ymm1,ymm0 + vmovdqa ymm2,ymm0 + vmovdqa ymm5,ymm4 + vmovdqa ymm6,ymm4 + vmovdqa ymm9,ymm8 + vmovdqa ymm10,ymm8 + vpaddd ymm13,ymm12,YMMWORD[$L$avx2_inc] + vmovdqa ymm11,ymm12 + vmovdqa ymm15,ymm13 + mov r10,10 +$L$open_avx2_192_rounds: + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol16] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpsrld ymm3,ymm4,20 + vpslld ymm4,ymm4,12 + vpxor ymm4,ymm4,ymm3 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol8] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpslld ymm3,ymm4,7 + vpsrld ymm4,ymm4,25 + vpxor ymm4,ymm4,ymm3 + vpalignr ymm12,ymm12,ymm12,12 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm4,ymm4,ymm4,4 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol16] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpsrld ymm3,ymm5,20 + vpslld ymm5,ymm5,12 + vpxor ymm5,ymm5,ymm3 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol8] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpslld ymm3,ymm5,7 + vpsrld ymm5,ymm5,25 + vpxor ymm5,ymm5,ymm3 + vpalignr ymm13,ymm13,ymm13,12 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm5,ymm5,ymm5,4 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol16] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpsrld ymm3,ymm4,20 + vpslld ymm4,ymm4,12 + vpxor ymm4,ymm4,ymm3 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol8] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpslld ymm3,ymm4,7 + vpsrld ymm4,ymm4,25 + vpxor ymm4,ymm4,ymm3 + vpalignr ymm12,ymm12,ymm12,4 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm4,ymm4,ymm4,12 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol16] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpsrld ymm3,ymm5,20 + vpslld ymm5,ymm5,12 + vpxor ymm5,ymm5,ymm3 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol8] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpslld ymm3,ymm5,7 + vpsrld ymm5,ymm5,25 + vpxor ymm5,ymm5,ymm3 + vpalignr ymm13,ymm13,ymm13,4 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm5,ymm5,ymm5,12 + + dec r10 + jne NEAR $L$open_avx2_192_rounds + vpaddd ymm0,ymm0,ymm2 + vpaddd ymm1,ymm1,ymm2 + vpaddd ymm4,ymm4,ymm6 + vpaddd ymm5,ymm5,ymm6 + vpaddd ymm8,ymm8,ymm10 + vpaddd ymm9,ymm9,ymm10 + vpaddd ymm12,ymm12,ymm11 + vpaddd ymm13,ymm13,ymm15 + vperm2i128 ymm3,ymm4,ymm0,0x02 + + vpand ymm3,ymm3,YMMWORD[$L$clamp] + vmovdqa YMMWORD[(160+0)+rbp],ymm3 + + vperm2i128 ymm0,ymm4,ymm0,0x13 + vperm2i128 ymm4,ymm12,ymm8,0x13 + vperm2i128 ymm8,ymm5,ymm1,0x02 + vperm2i128 ymm12,ymm13,ymm9,0x02 + 
vperm2i128 ymm1,ymm5,ymm1,0x13 + vperm2i128 ymm5,ymm13,ymm9,0x13 +$L$open_avx2_short: + mov r8,r8 + call poly_hash_ad_internal +$L$open_avx2_short_hash_and_xor_loop: + cmp rbx,32 + jb NEAR $L$open_avx2_short_tail_32 + sub rbx,32 + add r10,QWORD[((0+0))+rsi] + adc r11,QWORD[((8+0))+rsi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + add r10,QWORD[((0+16))+rsi] + adc r11,QWORD[((8+16))+rsi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + + vpxor ymm0,ymm0,YMMWORD[rsi] + vmovdqu YMMWORD[rdi],ymm0 + lea rsi,[32+rsi] + lea rdi,[32+rdi] + + vmovdqa ymm0,ymm4 + vmovdqa ymm4,ymm8 + vmovdqa ymm8,ymm12 + vmovdqa ymm12,ymm1 + vmovdqa ymm1,ymm5 + vmovdqa ymm5,ymm9 + vmovdqa ymm9,ymm13 + vmovdqa ymm13,ymm2 + vmovdqa ymm2,ymm6 + jmp NEAR $L$open_avx2_short_hash_and_xor_loop +$L$open_avx2_short_tail_32: + cmp rbx,16 + vmovdqa xmm1,xmm0 + jb NEAR $L$open_avx2_short_tail_32_exit + sub rbx,16 + add r10,QWORD[((0+0))+rsi] + adc r11,QWORD[((8+0))+rsi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + vpxor xmm3,xmm0,XMMWORD[rsi] + vmovdqu XMMWORD[rdi],xmm3 + lea rsi,[16+rsi] + lea rdi,[16+rdi] + vextracti128 xmm1,ymm0,1 +$L$open_avx2_short_tail_32_exit: + vzeroupper + jmp NEAR $L$open_sse_tail_16 + +$L$open_avx2_320: + vmovdqa ymm1,ymm0 + vmovdqa ymm2,ymm0 + vmovdqa ymm5,ymm4 + vmovdqa ymm6,ymm4 + vmovdqa ymm9,ymm8 + vmovdqa ymm10,ymm8 + vpaddd ymm13,ymm12,YMMWORD[$L$avx2_inc] + vpaddd ymm14,ymm13,YMMWORD[$L$avx2_inc] + vmovdqa ymm7,ymm4 + vmovdqa ymm11,ymm8 + vmovdqa YMMWORD[(160+160)+rbp],ymm12 + vmovdqa YMMWORD[(160+192)+rbp],ymm13 + vmovdqa YMMWORD[(160+224)+rbp],ymm14 + mov r10,10 +$L$open_avx2_320_rounds: + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol16] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpsrld ymm3,ymm4,20 + vpslld ymm4,ymm4,12 + vpxor ymm4,ymm4,ymm3 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol8] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + 
vpslld ymm3,ymm4,7 + vpsrld ymm4,ymm4,25 + vpxor ymm4,ymm4,ymm3 + vpalignr ymm12,ymm12,ymm12,12 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm4,ymm4,ymm4,4 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol16] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpsrld ymm3,ymm5,20 + vpslld ymm5,ymm5,12 + vpxor ymm5,ymm5,ymm3 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol8] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpslld ymm3,ymm5,7 + vpsrld ymm5,ymm5,25 + vpxor ymm5,ymm5,ymm3 + vpalignr ymm13,ymm13,ymm13,12 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm5,ymm5,ymm5,4 + vpaddd ymm2,ymm2,ymm6 + vpxor ymm14,ymm14,ymm2 + vpshufb ymm14,ymm14,YMMWORD[$L$rol16] + vpaddd ymm10,ymm10,ymm14 + vpxor ymm6,ymm6,ymm10 + vpsrld ymm3,ymm6,20 + vpslld ymm6,ymm6,12 + vpxor ymm6,ymm6,ymm3 + vpaddd ymm2,ymm2,ymm6 + vpxor ymm14,ymm14,ymm2 + vpshufb ymm14,ymm14,YMMWORD[$L$rol8] + vpaddd ymm10,ymm10,ymm14 + vpxor ymm6,ymm6,ymm10 + vpslld ymm3,ymm6,7 + vpsrld ymm6,ymm6,25 + vpxor ymm6,ymm6,ymm3 + vpalignr ymm14,ymm14,ymm14,12 + vpalignr ymm10,ymm10,ymm10,8 + vpalignr ymm6,ymm6,ymm6,4 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol16] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpsrld ymm3,ymm4,20 + vpslld ymm4,ymm4,12 + vpxor ymm4,ymm4,ymm3 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol8] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpslld ymm3,ymm4,7 + vpsrld ymm4,ymm4,25 + vpxor ymm4,ymm4,ymm3 + vpalignr ymm12,ymm12,ymm12,4 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm4,ymm4,ymm4,12 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol16] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpsrld ymm3,ymm5,20 + vpslld ymm5,ymm5,12 + vpxor ymm5,ymm5,ymm3 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol8] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpslld ymm3,ymm5,7 + vpsrld ymm5,ymm5,25 + vpxor ymm5,ymm5,ymm3 + vpalignr ymm13,ymm13,ymm13,4 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm5,ymm5,ymm5,12 + vpaddd ymm2,ymm2,ymm6 + vpxor ymm14,ymm14,ymm2 + vpshufb ymm14,ymm14,YMMWORD[$L$rol16] + vpaddd ymm10,ymm10,ymm14 + vpxor ymm6,ymm6,ymm10 + vpsrld ymm3,ymm6,20 + vpslld ymm6,ymm6,12 + vpxor ymm6,ymm6,ymm3 + vpaddd ymm2,ymm2,ymm6 + vpxor ymm14,ymm14,ymm2 + vpshufb ymm14,ymm14,YMMWORD[$L$rol8] + vpaddd ymm10,ymm10,ymm14 + vpxor ymm6,ymm6,ymm10 + vpslld ymm3,ymm6,7 + vpsrld ymm6,ymm6,25 + vpxor ymm6,ymm6,ymm3 + vpalignr ymm14,ymm14,ymm14,4 + vpalignr ymm10,ymm10,ymm10,8 + vpalignr ymm6,ymm6,ymm6,12 + + dec r10 + jne NEAR $L$open_avx2_320_rounds + vpaddd ymm0,ymm0,YMMWORD[$L$chacha20_consts] + vpaddd ymm1,ymm1,YMMWORD[$L$chacha20_consts] + vpaddd ymm2,ymm2,YMMWORD[$L$chacha20_consts] + vpaddd ymm4,ymm4,ymm7 + vpaddd ymm5,ymm5,ymm7 + vpaddd ymm6,ymm6,ymm7 + vpaddd ymm8,ymm8,ymm11 + vpaddd ymm9,ymm9,ymm11 + vpaddd ymm10,ymm10,ymm11 + vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp] + vpaddd ymm13,ymm13,YMMWORD[((160+192))+rbp] + vpaddd ymm14,ymm14,YMMWORD[((160+224))+rbp] + vperm2i128 ymm3,ymm4,ymm0,0x02 + + vpand ymm3,ymm3,YMMWORD[$L$clamp] + vmovdqa YMMWORD[(160+0)+rbp],ymm3 + + vperm2i128 ymm0,ymm4,ymm0,0x13 + vperm2i128 ymm4,ymm12,ymm8,0x13 + vperm2i128 ymm8,ymm5,ymm1,0x02 + vperm2i128 ymm12,ymm13,ymm9,0x02 + vperm2i128 ymm1,ymm5,ymm1,0x13 + vperm2i128 ymm5,ymm13,ymm9,0x13 + vperm2i128 ymm9,ymm6,ymm2,0x02 + vperm2i128 ymm13,ymm14,ymm10,0x02 + vperm2i128 ymm2,ymm6,ymm2,0x13 + vperm2i128 
ymm6,ymm14,ymm10,0x13 + jmp NEAR $L$open_avx2_short +$L$SEH_end_chacha20_poly1305_open_avx2: + + + +global chacha20_poly1305_seal_avx2 + +ALIGN 64 +chacha20_poly1305_seal_avx2: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_chacha20_poly1305_seal_avx2: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + mov r9,QWORD[48+rsp] + + + +_CET_ENDBR + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + + + push r9 + + sub rsp,288 + 160 + 32 + + lea rbp,[32+rsp] + and rbp,-32 + + movaps XMMWORD[(0+0)+rbp],xmm6 + movaps XMMWORD[(16+0)+rbp],xmm7 + movaps XMMWORD[(32+0)+rbp],xmm8 + movaps XMMWORD[(48+0)+rbp],xmm9 + movaps XMMWORD[(64+0)+rbp],xmm10 + movaps XMMWORD[(80+0)+rbp],xmm11 + movaps XMMWORD[(96+0)+rbp],xmm12 + movaps XMMWORD[(112+0)+rbp],xmm13 + movaps XMMWORD[(128+0)+rbp],xmm14 + movaps XMMWORD[(144+0)+rbp],xmm15 + + mov rbx,QWORD[56+r9] + add rbx,rdx + mov QWORD[((0+160+32))+rbp],r8 + mov QWORD[((8+160+32))+rbp],rbx + mov rbx,rdx + + vzeroupper + vmovdqa ymm0,YMMWORD[$L$chacha20_consts] + vbroadcasti128 ymm4,XMMWORD[r9] + vbroadcasti128 ymm8,XMMWORD[16+r9] + vbroadcasti128 ymm12,XMMWORD[32+r9] + vpaddd ymm12,ymm12,YMMWORD[$L$avx2_init] + cmp rbx,6*32 + jbe NEAR $L$seal_avx2_192 + cmp rbx,10*32 + jbe NEAR $L$seal_avx2_320 + vmovdqa ymm1,ymm0 + vmovdqa ymm2,ymm0 + vmovdqa ymm3,ymm0 + vmovdqa ymm5,ymm4 + vmovdqa ymm6,ymm4 + vmovdqa ymm7,ymm4 + vmovdqa YMMWORD[(160+64)+rbp],ymm4 + vmovdqa ymm9,ymm8 + vmovdqa ymm10,ymm8 + vmovdqa ymm11,ymm8 + vmovdqa YMMWORD[(160+96)+rbp],ymm8 + vmovdqa ymm15,ymm12 + vpaddd ymm14,ymm15,YMMWORD[$L$avx2_inc] + vpaddd ymm13,ymm14,YMMWORD[$L$avx2_inc] + vpaddd ymm12,ymm13,YMMWORD[$L$avx2_inc] + vmovdqa YMMWORD[(160+160)+rbp],ymm12 + vmovdqa YMMWORD[(160+192)+rbp],ymm13 + vmovdqa YMMWORD[(160+224)+rbp],ymm14 + vmovdqa YMMWORD[(160+256)+rbp],ymm15 + mov r10,10 +$L$seal_avx2_init_rounds: + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vmovdqa ymm8,YMMWORD[$L$rol16] + vpaddd ymm3,ymm3,ymm7 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm15,ymm15,ymm3 + vpxor ymm14,ymm14,ymm2 + vpxor ymm13,ymm13,ymm1 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm15,ymm15,ymm8 + vpshufb ymm14,ymm14,ymm8 + vpshufb ymm13,ymm13,ymm8 + vpshufb ymm12,ymm12,ymm8 + vpaddd ymm11,ymm11,ymm15 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] + vpxor ymm7,ymm7,ymm11 + vpxor ymm6,ymm6,ymm10 + vpxor ymm5,ymm5,ymm9 + vpxor ymm4,ymm4,ymm8 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vpsrld ymm8,ymm7,20 + vpslld ymm7,ymm7,32-20 + vpxor ymm7,ymm7,ymm8 + vpsrld ymm8,ymm6,20 + vpslld ymm6,ymm6,32-20 + vpxor ymm6,ymm6,ymm8 + vpsrld ymm8,ymm5,20 + vpslld ymm5,ymm5,32-20 + vpxor ymm5,ymm5,ymm8 + vpsrld ymm8,ymm4,20 + vpslld ymm4,ymm4,32-20 + vpxor ymm4,ymm4,ymm8 + vmovdqa ymm8,YMMWORD[$L$rol8] + vpaddd ymm3,ymm3,ymm7 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm15,ymm15,ymm3 + vpxor ymm14,ymm14,ymm2 + vpxor ymm13,ymm13,ymm1 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm15,ymm15,ymm8 + vpshufb ymm14,ymm14,ymm8 + vpshufb ymm13,ymm13,ymm8 + vpshufb ymm12,ymm12,ymm8 + vpaddd ymm11,ymm11,ymm15 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] + vpxor ymm7,ymm7,ymm11 + vpxor ymm6,ymm6,ymm10 + vpxor ymm5,ymm5,ymm9 + vpxor ymm4,ymm4,ymm8 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vpsrld ymm8,ymm7,25 + vpslld ymm7,ymm7,32-25 + vpxor ymm7,ymm7,ymm8 + vpsrld ymm8,ymm6,25 + vpslld 
ymm6,ymm6,32-25 + vpxor ymm6,ymm6,ymm8 + vpsrld ymm8,ymm5,25 + vpslld ymm5,ymm5,32-25 + vpxor ymm5,ymm5,ymm8 + vpsrld ymm8,ymm4,25 + vpslld ymm4,ymm4,32-25 + vpxor ymm4,ymm4,ymm8 + vmovdqa ymm8,YMMWORD[((160+128))+rbp] + vpalignr ymm7,ymm7,ymm7,4 + vpalignr ymm11,ymm11,ymm11,8 + vpalignr ymm15,ymm15,ymm15,12 + vpalignr ymm6,ymm6,ymm6,4 + vpalignr ymm10,ymm10,ymm10,8 + vpalignr ymm14,ymm14,ymm14,12 + vpalignr ymm5,ymm5,ymm5,4 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm13,ymm13,ymm13,12 + vpalignr ymm4,ymm4,ymm4,4 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm12,ymm12,ymm12,12 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vmovdqa ymm8,YMMWORD[$L$rol16] + vpaddd ymm3,ymm3,ymm7 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm15,ymm15,ymm3 + vpxor ymm14,ymm14,ymm2 + vpxor ymm13,ymm13,ymm1 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm15,ymm15,ymm8 + vpshufb ymm14,ymm14,ymm8 + vpshufb ymm13,ymm13,ymm8 + vpshufb ymm12,ymm12,ymm8 + vpaddd ymm11,ymm11,ymm15 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] + vpxor ymm7,ymm7,ymm11 + vpxor ymm6,ymm6,ymm10 + vpxor ymm5,ymm5,ymm9 + vpxor ymm4,ymm4,ymm8 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vpsrld ymm8,ymm7,20 + vpslld ymm7,ymm7,32-20 + vpxor ymm7,ymm7,ymm8 + vpsrld ymm8,ymm6,20 + vpslld ymm6,ymm6,32-20 + vpxor ymm6,ymm6,ymm8 + vpsrld ymm8,ymm5,20 + vpslld ymm5,ymm5,32-20 + vpxor ymm5,ymm5,ymm8 + vpsrld ymm8,ymm4,20 + vpslld ymm4,ymm4,32-20 + vpxor ymm4,ymm4,ymm8 + vmovdqa ymm8,YMMWORD[$L$rol8] + vpaddd ymm3,ymm3,ymm7 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm15,ymm15,ymm3 + vpxor ymm14,ymm14,ymm2 + vpxor ymm13,ymm13,ymm1 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm15,ymm15,ymm8 + vpshufb ymm14,ymm14,ymm8 + vpshufb ymm13,ymm13,ymm8 + vpshufb ymm12,ymm12,ymm8 + vpaddd ymm11,ymm11,ymm15 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] + vpxor ymm7,ymm7,ymm11 + vpxor ymm6,ymm6,ymm10 + vpxor ymm5,ymm5,ymm9 + vpxor ymm4,ymm4,ymm8 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vpsrld ymm8,ymm7,25 + vpslld ymm7,ymm7,32-25 + vpxor ymm7,ymm7,ymm8 + vpsrld ymm8,ymm6,25 + vpslld ymm6,ymm6,32-25 + vpxor ymm6,ymm6,ymm8 + vpsrld ymm8,ymm5,25 + vpslld ymm5,ymm5,32-25 + vpxor ymm5,ymm5,ymm8 + vpsrld ymm8,ymm4,25 + vpslld ymm4,ymm4,32-25 + vpxor ymm4,ymm4,ymm8 + vmovdqa ymm8,YMMWORD[((160+128))+rbp] + vpalignr ymm7,ymm7,ymm7,12 + vpalignr ymm11,ymm11,ymm11,8 + vpalignr ymm15,ymm15,ymm15,4 + vpalignr ymm6,ymm6,ymm6,12 + vpalignr ymm10,ymm10,ymm10,8 + vpalignr ymm14,ymm14,ymm14,4 + vpalignr ymm5,ymm5,ymm5,12 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm13,ymm13,ymm13,4 + vpalignr ymm4,ymm4,ymm4,12 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm12,ymm12,ymm12,4 + + dec r10 + jnz NEAR $L$seal_avx2_init_rounds + vpaddd ymm3,ymm3,YMMWORD[$L$chacha20_consts] + vpaddd ymm7,ymm7,YMMWORD[((160+64))+rbp] + vpaddd ymm11,ymm11,YMMWORD[((160+96))+rbp] + vpaddd ymm15,ymm15,YMMWORD[((160+256))+rbp] + vpaddd ymm2,ymm2,YMMWORD[$L$chacha20_consts] + vpaddd ymm6,ymm6,YMMWORD[((160+64))+rbp] + vpaddd ymm10,ymm10,YMMWORD[((160+96))+rbp] + vpaddd ymm14,ymm14,YMMWORD[((160+224))+rbp] + vpaddd ymm1,ymm1,YMMWORD[$L$chacha20_consts] + vpaddd ymm5,ymm5,YMMWORD[((160+64))+rbp] + vpaddd ymm9,ymm9,YMMWORD[((160+96))+rbp] + vpaddd ymm13,ymm13,YMMWORD[((160+192))+rbp] + vpaddd ymm0,ymm0,YMMWORD[$L$chacha20_consts] + vpaddd ymm4,ymm4,YMMWORD[((160+64))+rbp] + vpaddd ymm8,ymm8,YMMWORD[((160+96))+rbp] + vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp] + + vperm2i128 
ymm11,ymm15,ymm11,0x13 + vperm2i128 ymm15,ymm7,ymm3,0x02 + vperm2i128 ymm3,ymm7,ymm3,0x13 + vpand ymm15,ymm15,YMMWORD[$L$clamp] + vmovdqa YMMWORD[(160+0)+rbp],ymm15 + mov r8,r8 + call poly_hash_ad_internal + + vpxor ymm3,ymm3,YMMWORD[rsi] + vpxor ymm11,ymm11,YMMWORD[32+rsi] + vmovdqu YMMWORD[rdi],ymm3 + vmovdqu YMMWORD[32+rdi],ymm11 + vperm2i128 ymm15,ymm6,ymm2,0x02 + vperm2i128 ymm6,ymm6,ymm2,0x13 + vperm2i128 ymm2,ymm14,ymm10,0x02 + vperm2i128 ymm10,ymm14,ymm10,0x13 + vpxor ymm15,ymm15,YMMWORD[((0+64))+rsi] + vpxor ymm2,ymm2,YMMWORD[((32+64))+rsi] + vpxor ymm6,ymm6,YMMWORD[((64+64))+rsi] + vpxor ymm10,ymm10,YMMWORD[((96+64))+rsi] + vmovdqu YMMWORD[(0+64)+rdi],ymm15 + vmovdqu YMMWORD[(32+64)+rdi],ymm2 + vmovdqu YMMWORD[(64+64)+rdi],ymm6 + vmovdqu YMMWORD[(96+64)+rdi],ymm10 + vperm2i128 ymm15,ymm5,ymm1,0x02 + vperm2i128 ymm5,ymm5,ymm1,0x13 + vperm2i128 ymm1,ymm13,ymm9,0x02 + vperm2i128 ymm9,ymm13,ymm9,0x13 + vpxor ymm15,ymm15,YMMWORD[((0+192))+rsi] + vpxor ymm1,ymm1,YMMWORD[((32+192))+rsi] + vpxor ymm5,ymm5,YMMWORD[((64+192))+rsi] + vpxor ymm9,ymm9,YMMWORD[((96+192))+rsi] + vmovdqu YMMWORD[(0+192)+rdi],ymm15 + vmovdqu YMMWORD[(32+192)+rdi],ymm1 + vmovdqu YMMWORD[(64+192)+rdi],ymm5 + vmovdqu YMMWORD[(96+192)+rdi],ymm9 + vperm2i128 ymm15,ymm4,ymm0,0x13 + vperm2i128 ymm0,ymm4,ymm0,0x02 + vperm2i128 ymm4,ymm12,ymm8,0x02 + vperm2i128 ymm12,ymm12,ymm8,0x13 + vmovdqa ymm8,ymm15 + + lea rsi,[320+rsi] + sub rbx,10*32 + mov rcx,10*32 + cmp rbx,4*32 + jbe NEAR $L$seal_avx2_short_hash_remainder + vpxor ymm0,ymm0,YMMWORD[rsi] + vpxor ymm4,ymm4,YMMWORD[32+rsi] + vpxor ymm8,ymm8,YMMWORD[64+rsi] + vpxor ymm12,ymm12,YMMWORD[96+rsi] + vmovdqu YMMWORD[320+rdi],ymm0 + vmovdqu YMMWORD[352+rdi],ymm4 + vmovdqu YMMWORD[384+rdi],ymm8 + vmovdqu YMMWORD[416+rdi],ymm12 + lea rsi,[128+rsi] + sub rbx,4*32 + mov rcx,8 + mov r8,2 + cmp rbx,4*32 + jbe NEAR $L$seal_avx2_tail_128 + cmp rbx,8*32 + jbe NEAR $L$seal_avx2_tail_256 + cmp rbx,12*32 + jbe NEAR $L$seal_avx2_tail_384 + cmp rbx,16*32 + jbe NEAR $L$seal_avx2_tail_512 + vmovdqa ymm0,YMMWORD[$L$chacha20_consts] + vmovdqa ymm4,YMMWORD[((160+64))+rbp] + vmovdqa ymm8,YMMWORD[((160+96))+rbp] + vmovdqa ymm1,ymm0 + vmovdqa ymm5,ymm4 + vmovdqa ymm9,ymm8 + vmovdqa ymm2,ymm0 + vmovdqa ymm6,ymm4 + vmovdqa ymm10,ymm8 + vmovdqa ymm3,ymm0 + vmovdqa ymm7,ymm4 + vmovdqa ymm11,ymm8 + vmovdqa ymm12,YMMWORD[$L$avx2_inc] + vpaddd ymm15,ymm12,YMMWORD[((160+160))+rbp] + vpaddd ymm14,ymm12,ymm15 + vpaddd ymm13,ymm12,ymm14 + vpaddd ymm12,ymm12,ymm13 + vmovdqa YMMWORD[(160+256)+rbp],ymm15 + vmovdqa YMMWORD[(160+224)+rbp],ymm14 + vmovdqa YMMWORD[(160+192)+rbp],ymm13 + vmovdqa YMMWORD[(160+160)+rbp],ymm12 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vmovdqa ymm8,YMMWORD[$L$rol16] + vpaddd ymm3,ymm3,ymm7 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm15,ymm15,ymm3 + vpxor ymm14,ymm14,ymm2 + vpxor ymm13,ymm13,ymm1 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm15,ymm15,ymm8 + vpshufb ymm14,ymm14,ymm8 + vpshufb ymm13,ymm13,ymm8 + vpshufb ymm12,ymm12,ymm8 + vpaddd ymm11,ymm11,ymm15 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] + vpxor ymm7,ymm7,ymm11 + vpxor ymm6,ymm6,ymm10 + vpxor ymm5,ymm5,ymm9 + vpxor ymm4,ymm4,ymm8 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vpsrld ymm8,ymm7,20 + vpslld ymm7,ymm7,32-20 + vpxor ymm7,ymm7,ymm8 + vpsrld ymm8,ymm6,20 + vpslld ymm6,ymm6,32-20 + vpxor ymm6,ymm6,ymm8 + vpsrld ymm8,ymm5,20 + vpslld ymm5,ymm5,32-20 + vpxor ymm5,ymm5,ymm8 + vpsrld ymm8,ymm4,20 + vpslld ymm4,ymm4,32-20 + vpxor 
ymm4,ymm4,ymm8 + vmovdqa ymm8,YMMWORD[$L$rol8] + vpaddd ymm3,ymm3,ymm7 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm15,ymm15,ymm3 + vpxor ymm14,ymm14,ymm2 + vpxor ymm13,ymm13,ymm1 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm15,ymm15,ymm8 + vpshufb ymm14,ymm14,ymm8 + vpshufb ymm13,ymm13,ymm8 + vpshufb ymm12,ymm12,ymm8 + vpaddd ymm11,ymm11,ymm15 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] + vpxor ymm7,ymm7,ymm11 + vpxor ymm6,ymm6,ymm10 + vpxor ymm5,ymm5,ymm9 + vpxor ymm4,ymm4,ymm8 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vpsrld ymm8,ymm7,25 + vpslld ymm7,ymm7,32-25 + vpxor ymm7,ymm7,ymm8 + vpsrld ymm8,ymm6,25 + vpslld ymm6,ymm6,32-25 + vpxor ymm6,ymm6,ymm8 + vpsrld ymm8,ymm5,25 + vpslld ymm5,ymm5,32-25 + vpxor ymm5,ymm5,ymm8 + vpsrld ymm8,ymm4,25 + vpslld ymm4,ymm4,32-25 + vpxor ymm4,ymm4,ymm8 + vmovdqa ymm8,YMMWORD[((160+128))+rbp] + vpalignr ymm7,ymm7,ymm7,4 + vpalignr ymm11,ymm11,ymm11,8 + vpalignr ymm15,ymm15,ymm15,12 + vpalignr ymm6,ymm6,ymm6,4 + vpalignr ymm10,ymm10,ymm10,8 + vpalignr ymm14,ymm14,ymm14,12 + vpalignr ymm5,ymm5,ymm5,4 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm13,ymm13,ymm13,12 + vpalignr ymm4,ymm4,ymm4,4 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm12,ymm12,ymm12,12 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vmovdqa ymm8,YMMWORD[$L$rol16] + vpaddd ymm3,ymm3,ymm7 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm15,ymm15,ymm3 + vpxor ymm14,ymm14,ymm2 + vpxor ymm13,ymm13,ymm1 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm15,ymm15,ymm8 + vpshufb ymm14,ymm14,ymm8 + vpshufb ymm13,ymm13,ymm8 + vpshufb ymm12,ymm12,ymm8 + vpaddd ymm11,ymm11,ymm15 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] + vpxor ymm7,ymm7,ymm11 + vpxor ymm6,ymm6,ymm10 + vpxor ymm5,ymm5,ymm9 + vpxor ymm4,ymm4,ymm8 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vpsrld ymm8,ymm7,20 + vpslld ymm7,ymm7,32-20 + vpxor ymm7,ymm7,ymm8 + vpsrld ymm8,ymm6,20 + vpslld ymm6,ymm6,32-20 + vpxor ymm6,ymm6,ymm8 + vpsrld ymm8,ymm5,20 + vpslld ymm5,ymm5,32-20 + vpxor ymm5,ymm5,ymm8 + vpsrld ymm8,ymm4,20 + vpslld ymm4,ymm4,32-20 + vpxor ymm4,ymm4,ymm8 + vmovdqa ymm8,YMMWORD[$L$rol8] + vpaddd ymm3,ymm3,ymm7 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm15,ymm15,ymm3 + vpxor ymm14,ymm14,ymm2 + vpxor ymm13,ymm13,ymm1 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm15,ymm15,ymm8 + vpshufb ymm14,ymm14,ymm8 + vpshufb ymm13,ymm13,ymm8 + vpshufb ymm12,ymm12,ymm8 + vpaddd ymm11,ymm11,ymm15 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] + vpxor ymm7,ymm7,ymm11 + vpxor ymm6,ymm6,ymm10 + vpxor ymm5,ymm5,ymm9 + vpxor ymm4,ymm4,ymm8 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vpsrld ymm8,ymm7,25 + vpslld ymm7,ymm7,32-25 + vpxor ymm7,ymm7,ymm8 + vpsrld ymm8,ymm6,25 + vpslld ymm6,ymm6,32-25 + vpxor ymm6,ymm6,ymm8 + vpsrld ymm8,ymm5,25 + vpslld ymm5,ymm5,32-25 + vpxor ymm5,ymm5,ymm8 + vpsrld ymm8,ymm4,25 + vpslld ymm4,ymm4,32-25 + vpxor ymm4,ymm4,ymm8 + vmovdqa ymm8,YMMWORD[((160+128))+rbp] + vpalignr ymm7,ymm7,ymm7,12 + vpalignr ymm11,ymm11,ymm11,8 + vpalignr ymm15,ymm15,ymm15,4 + vpalignr ymm6,ymm6,ymm6,12 + vpalignr ymm10,ymm10,ymm10,8 + vpalignr ymm14,ymm14,ymm14,4 + vpalignr ymm5,ymm5,ymm5,12 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm13,ymm13,ymm13,4 + vpalignr ymm4,ymm4,ymm4,12 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm12,ymm12,ymm12,4 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vmovdqa ymm8,YMMWORD[$L$rol16] + vpaddd 
ymm3,ymm3,ymm7 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm15,ymm15,ymm3 + vpxor ymm14,ymm14,ymm2 + vpxor ymm13,ymm13,ymm1 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm15,ymm15,ymm8 + vpshufb ymm14,ymm14,ymm8 + vpshufb ymm13,ymm13,ymm8 + vpshufb ymm12,ymm12,ymm8 + vpaddd ymm11,ymm11,ymm15 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] + vpxor ymm7,ymm7,ymm11 + vpxor ymm6,ymm6,ymm10 + vpxor ymm5,ymm5,ymm9 + vpxor ymm4,ymm4,ymm8 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vpsrld ymm8,ymm7,20 + vpslld ymm7,ymm7,32-20 + vpxor ymm7,ymm7,ymm8 + vpsrld ymm8,ymm6,20 + vpslld ymm6,ymm6,32-20 + vpxor ymm6,ymm6,ymm8 + vpsrld ymm8,ymm5,20 + vpslld ymm5,ymm5,32-20 + vpxor ymm5,ymm5,ymm8 + vpsrld ymm8,ymm4,20 + vpslld ymm4,ymm4,32-20 + vpxor ymm4,ymm4,ymm8 + vmovdqa ymm8,YMMWORD[$L$rol8] + vpaddd ymm3,ymm3,ymm7 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm15,ymm15,ymm3 + + sub rdi,16 + mov rcx,9 + jmp NEAR $L$seal_avx2_main_loop_rounds_entry +ALIGN 32 +$L$seal_avx2_main_loop: + vmovdqa ymm0,YMMWORD[$L$chacha20_consts] + vmovdqa ymm4,YMMWORD[((160+64))+rbp] + vmovdqa ymm8,YMMWORD[((160+96))+rbp] + vmovdqa ymm1,ymm0 + vmovdqa ymm5,ymm4 + vmovdqa ymm9,ymm8 + vmovdqa ymm2,ymm0 + vmovdqa ymm6,ymm4 + vmovdqa ymm10,ymm8 + vmovdqa ymm3,ymm0 + vmovdqa ymm7,ymm4 + vmovdqa ymm11,ymm8 + vmovdqa ymm12,YMMWORD[$L$avx2_inc] + vpaddd ymm15,ymm12,YMMWORD[((160+160))+rbp] + vpaddd ymm14,ymm12,ymm15 + vpaddd ymm13,ymm12,ymm14 + vpaddd ymm12,ymm12,ymm13 + vmovdqa YMMWORD[(160+256)+rbp],ymm15 + vmovdqa YMMWORD[(160+224)+rbp],ymm14 + vmovdqa YMMWORD[(160+192)+rbp],ymm13 + vmovdqa YMMWORD[(160+160)+rbp],ymm12 + + mov rcx,10 +ALIGN 32 +$L$seal_avx2_main_loop_rounds: + add r10,QWORD[((0+0))+rdi] + adc r11,QWORD[((8+0))+rdi] + adc r12,1 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vmovdqa ymm8,YMMWORD[$L$rol16] + vpaddd ymm3,ymm3,ymm7 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm15,ymm15,ymm3 + vpxor ymm14,ymm14,ymm2 + vpxor ymm13,ymm13,ymm1 + vpxor ymm12,ymm12,ymm0 + mov rdx,QWORD[((0+160+0))+rbp] + mov r15,rdx + mulx r14,r13,r10 + mulx rdx,rax,r11 + imul r15,r12 + add r14,rax + adc r15,rdx + vpshufb ymm15,ymm15,ymm8 + vpshufb ymm14,ymm14,ymm8 + vpshufb ymm13,ymm13,ymm8 + vpshufb ymm12,ymm12,ymm8 + vpaddd ymm11,ymm11,ymm15 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] + vpxor ymm7,ymm7,ymm11 + mov rdx,QWORD[((8+160+0))+rbp] + mulx rax,r10,r10 + add r14,r10 + mulx r9,r11,r11 + adc r15,r11 + adc r9,0 + imul rdx,r12 + vpxor ymm6,ymm6,ymm10 + vpxor ymm5,ymm5,ymm9 + vpxor ymm4,ymm4,ymm8 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vpsrld ymm8,ymm7,20 + vpslld ymm7,ymm7,32-20 + vpxor ymm7,ymm7,ymm8 + vpsrld ymm8,ymm6,20 + vpslld ymm6,ymm6,32-20 + vpxor ymm6,ymm6,ymm8 + vpsrld ymm8,ymm5,20 + vpslld ymm5,ymm5,32-20 + add r15,rax + adc r9,rdx + vpxor ymm5,ymm5,ymm8 + vpsrld ymm8,ymm4,20 + vpslld ymm4,ymm4,32-20 + vpxor ymm4,ymm4,ymm8 + vmovdqa ymm8,YMMWORD[$L$rol8] + vpaddd ymm3,ymm3,ymm7 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm15,ymm15,ymm3 + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + +$L$seal_avx2_main_loop_rounds_entry: + vpxor ymm14,ymm14,ymm2 + vpxor ymm13,ymm13,ymm1 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm15,ymm15,ymm8 + vpshufb ymm14,ymm14,ymm8 + vpshufb 
ymm13,ymm13,ymm8 + vpshufb ymm12,ymm12,ymm8 + vpaddd ymm11,ymm11,ymm15 + vpaddd ymm10,ymm10,ymm14 + add r10,QWORD[((0+16))+rdi] + adc r11,QWORD[((8+16))+rdi] + adc r12,1 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] + vpxor ymm7,ymm7,ymm11 + vpxor ymm6,ymm6,ymm10 + vpxor ymm5,ymm5,ymm9 + vpxor ymm4,ymm4,ymm8 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vpsrld ymm8,ymm7,25 + mov rdx,QWORD[((0+160+0))+rbp] + mov r15,rdx + mulx r14,r13,r10 + mulx rdx,rax,r11 + imul r15,r12 + add r14,rax + adc r15,rdx + vpslld ymm7,ymm7,32-25 + vpxor ymm7,ymm7,ymm8 + vpsrld ymm8,ymm6,25 + vpslld ymm6,ymm6,32-25 + vpxor ymm6,ymm6,ymm8 + vpsrld ymm8,ymm5,25 + vpslld ymm5,ymm5,32-25 + vpxor ymm5,ymm5,ymm8 + vpsrld ymm8,ymm4,25 + vpslld ymm4,ymm4,32-25 + vpxor ymm4,ymm4,ymm8 + vmovdqa ymm8,YMMWORD[((160+128))+rbp] + vpalignr ymm7,ymm7,ymm7,4 + vpalignr ymm11,ymm11,ymm11,8 + vpalignr ymm15,ymm15,ymm15,12 + vpalignr ymm6,ymm6,ymm6,4 + vpalignr ymm10,ymm10,ymm10,8 + vpalignr ymm14,ymm14,ymm14,12 + mov rdx,QWORD[((8+160+0))+rbp] + mulx rax,r10,r10 + add r14,r10 + mulx r9,r11,r11 + adc r15,r11 + adc r9,0 + imul rdx,r12 + vpalignr ymm5,ymm5,ymm5,4 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm13,ymm13,ymm13,12 + vpalignr ymm4,ymm4,ymm4,4 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm12,ymm12,ymm12,12 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vmovdqa ymm8,YMMWORD[$L$rol16] + vpaddd ymm3,ymm3,ymm7 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm15,ymm15,ymm3 + vpxor ymm14,ymm14,ymm2 + vpxor ymm13,ymm13,ymm1 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm15,ymm15,ymm8 + vpshufb ymm14,ymm14,ymm8 + add r15,rax + adc r9,rdx + vpshufb ymm13,ymm13,ymm8 + vpshufb ymm12,ymm12,ymm8 + vpaddd ymm11,ymm11,ymm15 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] + vpxor ymm7,ymm7,ymm11 + vpxor ymm6,ymm6,ymm10 + vpxor ymm5,ymm5,ymm9 + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + vpxor ymm4,ymm4,ymm8 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vpsrld ymm8,ymm7,20 + vpslld ymm7,ymm7,32-20 + vpxor ymm7,ymm7,ymm8 + vpsrld ymm8,ymm6,20 + vpslld ymm6,ymm6,32-20 + vpxor ymm6,ymm6,ymm8 + add r10,QWORD[((0+32))+rdi] + adc r11,QWORD[((8+32))+rdi] + adc r12,1 + + lea rdi,[48+rdi] + vpsrld ymm8,ymm5,20 + vpslld ymm5,ymm5,32-20 + vpxor ymm5,ymm5,ymm8 + vpsrld ymm8,ymm4,20 + vpslld ymm4,ymm4,32-20 + vpxor ymm4,ymm4,ymm8 + vmovdqa ymm8,YMMWORD[$L$rol8] + vpaddd ymm3,ymm3,ymm7 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm15,ymm15,ymm3 + vpxor ymm14,ymm14,ymm2 + vpxor ymm13,ymm13,ymm1 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm15,ymm15,ymm8 + vpshufb ymm14,ymm14,ymm8 + vpshufb ymm13,ymm13,ymm8 + mov rdx,QWORD[((0+160+0))+rbp] + mov r15,rdx + mulx r14,r13,r10 + mulx rdx,rax,r11 + imul r15,r12 + add r14,rax + adc r15,rdx + vpshufb ymm12,ymm12,ymm8 + vpaddd ymm11,ymm11,ymm15 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] + vpxor ymm7,ymm7,ymm11 + vpxor ymm6,ymm6,ymm10 + vpxor ymm5,ymm5,ymm9 + mov rdx,QWORD[((8+160+0))+rbp] + mulx rax,r10,r10 + add r14,r10 + mulx r9,r11,r11 + adc r15,r11 + adc r9,0 + imul rdx,r12 + vpxor ymm4,ymm4,ymm8 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vpsrld ymm8,ymm7,25 + vpslld ymm7,ymm7,32-25 + vpxor ymm7,ymm7,ymm8 + vpsrld ymm8,ymm6,25 + vpslld ymm6,ymm6,32-25 + vpxor ymm6,ymm6,ymm8 + add r15,rax + adc r9,rdx + vpsrld ymm8,ymm5,25 + 
vpslld ymm5,ymm5,32-25 + vpxor ymm5,ymm5,ymm8 + vpsrld ymm8,ymm4,25 + vpslld ymm4,ymm4,32-25 + vpxor ymm4,ymm4,ymm8 + vmovdqa ymm8,YMMWORD[((160+128))+rbp] + vpalignr ymm7,ymm7,ymm7,12 + vpalignr ymm11,ymm11,ymm11,8 + vpalignr ymm15,ymm15,ymm15,4 + vpalignr ymm6,ymm6,ymm6,12 + vpalignr ymm10,ymm10,ymm10,8 + vpalignr ymm14,ymm14,ymm14,4 + vpalignr ymm5,ymm5,ymm5,12 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm13,ymm13,ymm13,4 + vpalignr ymm4,ymm4,ymm4,12 + vpalignr ymm8,ymm8,ymm8,8 + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + vpalignr ymm12,ymm12,ymm12,4 + + dec rcx + jne NEAR $L$seal_avx2_main_loop_rounds + vpaddd ymm3,ymm3,YMMWORD[$L$chacha20_consts] + vpaddd ymm7,ymm7,YMMWORD[((160+64))+rbp] + vpaddd ymm11,ymm11,YMMWORD[((160+96))+rbp] + vpaddd ymm15,ymm15,YMMWORD[((160+256))+rbp] + vpaddd ymm2,ymm2,YMMWORD[$L$chacha20_consts] + vpaddd ymm6,ymm6,YMMWORD[((160+64))+rbp] + vpaddd ymm10,ymm10,YMMWORD[((160+96))+rbp] + vpaddd ymm14,ymm14,YMMWORD[((160+224))+rbp] + vpaddd ymm1,ymm1,YMMWORD[$L$chacha20_consts] + vpaddd ymm5,ymm5,YMMWORD[((160+64))+rbp] + vpaddd ymm9,ymm9,YMMWORD[((160+96))+rbp] + vpaddd ymm13,ymm13,YMMWORD[((160+192))+rbp] + vpaddd ymm0,ymm0,YMMWORD[$L$chacha20_consts] + vpaddd ymm4,ymm4,YMMWORD[((160+64))+rbp] + vpaddd ymm8,ymm8,YMMWORD[((160+96))+rbp] + vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp] + + vmovdqa YMMWORD[(160+128)+rbp],ymm0 + add r10,QWORD[((0+0))+rdi] + adc r11,QWORD[((8+0))+rdi] + adc r12,1 + mov rdx,QWORD[((0+160+0))+rbp] + mov r15,rdx + mulx r14,r13,r10 + mulx rdx,rax,r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rdx,QWORD[((8+160+0))+rbp] + mulx rax,r10,r10 + add r14,r10 + mulx r9,r11,r11 + adc r15,r11 + adc r9,0 + imul rdx,r12 + add r15,rax + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + add r10,QWORD[((0+16))+rdi] + adc r11,QWORD[((8+16))+rdi] + adc r12,1 + mov rdx,QWORD[((0+160+0))+rbp] + mov r15,rdx + mulx r14,r13,r10 + mulx rdx,rax,r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rdx,QWORD[((8+160+0))+rbp] + mulx rax,r10,r10 + add r14,r10 + mulx r9,r11,r11 + adc r15,r11 + adc r9,0 + imul rdx,r12 + add r15,rax + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea rdi,[32+rdi] + vperm2i128 ymm0,ymm7,ymm3,0x02 + vperm2i128 ymm7,ymm7,ymm3,0x13 + vperm2i128 ymm3,ymm15,ymm11,0x02 + vperm2i128 ymm11,ymm15,ymm11,0x13 + vpxor ymm0,ymm0,YMMWORD[((0+0))+rsi] + vpxor ymm3,ymm3,YMMWORD[((32+0))+rsi] + vpxor ymm7,ymm7,YMMWORD[((64+0))+rsi] + vpxor ymm11,ymm11,YMMWORD[((96+0))+rsi] + vmovdqu YMMWORD[(0+0)+rdi],ymm0 + vmovdqu YMMWORD[(32+0)+rdi],ymm3 + vmovdqu YMMWORD[(64+0)+rdi],ymm7 + vmovdqu YMMWORD[(96+0)+rdi],ymm11 + + vmovdqa ymm0,YMMWORD[((160+128))+rbp] + vperm2i128 ymm3,ymm6,ymm2,0x02 + vperm2i128 ymm6,ymm6,ymm2,0x13 + vperm2i128 ymm2,ymm14,ymm10,0x02 + vperm2i128 ymm10,ymm14,ymm10,0x13 + vpxor ymm3,ymm3,YMMWORD[((0+128))+rsi] + vpxor ymm2,ymm2,YMMWORD[((32+128))+rsi] + vpxor ymm6,ymm6,YMMWORD[((64+128))+rsi] + vpxor ymm10,ymm10,YMMWORD[((96+128))+rsi] + vmovdqu YMMWORD[(0+128)+rdi],ymm3 + vmovdqu YMMWORD[(32+128)+rdi],ymm2 + vmovdqu YMMWORD[(64+128)+rdi],ymm6 + vmovdqu YMMWORD[(96+128)+rdi],ymm10 + 
vperm2i128 ymm3,ymm5,ymm1,0x02 + vperm2i128 ymm5,ymm5,ymm1,0x13 + vperm2i128 ymm1,ymm13,ymm9,0x02 + vperm2i128 ymm9,ymm13,ymm9,0x13 + vpxor ymm3,ymm3,YMMWORD[((0+256))+rsi] + vpxor ymm1,ymm1,YMMWORD[((32+256))+rsi] + vpxor ymm5,ymm5,YMMWORD[((64+256))+rsi] + vpxor ymm9,ymm9,YMMWORD[((96+256))+rsi] + vmovdqu YMMWORD[(0+256)+rdi],ymm3 + vmovdqu YMMWORD[(32+256)+rdi],ymm1 + vmovdqu YMMWORD[(64+256)+rdi],ymm5 + vmovdqu YMMWORD[(96+256)+rdi],ymm9 + vperm2i128 ymm3,ymm4,ymm0,0x02 + vperm2i128 ymm4,ymm4,ymm0,0x13 + vperm2i128 ymm0,ymm12,ymm8,0x02 + vperm2i128 ymm8,ymm12,ymm8,0x13 + vpxor ymm3,ymm3,YMMWORD[((0+384))+rsi] + vpxor ymm0,ymm0,YMMWORD[((32+384))+rsi] + vpxor ymm4,ymm4,YMMWORD[((64+384))+rsi] + vpxor ymm8,ymm8,YMMWORD[((96+384))+rsi] + vmovdqu YMMWORD[(0+384)+rdi],ymm3 + vmovdqu YMMWORD[(32+384)+rdi],ymm0 + vmovdqu YMMWORD[(64+384)+rdi],ymm4 + vmovdqu YMMWORD[(96+384)+rdi],ymm8 + + lea rsi,[512+rsi] + sub rbx,16*32 + cmp rbx,16*32 + jg NEAR $L$seal_avx2_main_loop + + add r10,QWORD[((0+0))+rdi] + adc r11,QWORD[((8+0))+rdi] + adc r12,1 + mov rdx,QWORD[((0+160+0))+rbp] + mov r15,rdx + mulx r14,r13,r10 + mulx rdx,rax,r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rdx,QWORD[((8+160+0))+rbp] + mulx rax,r10,r10 + add r14,r10 + mulx r9,r11,r11 + adc r15,r11 + adc r9,0 + imul rdx,r12 + add r15,rax + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + add r10,QWORD[((0+16))+rdi] + adc r11,QWORD[((8+16))+rdi] + adc r12,1 + mov rdx,QWORD[((0+160+0))+rbp] + mov r15,rdx + mulx r14,r13,r10 + mulx rdx,rax,r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rdx,QWORD[((8+160+0))+rbp] + mulx rax,r10,r10 + add r14,r10 + mulx r9,r11,r11 + adc r15,r11 + adc r9,0 + imul rdx,r12 + add r15,rax + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea rdi,[32+rdi] + mov rcx,10 + xor r8,r8 + + cmp rbx,12*32 + ja NEAR $L$seal_avx2_tail_512 + cmp rbx,8*32 + ja NEAR $L$seal_avx2_tail_384 + cmp rbx,4*32 + ja NEAR $L$seal_avx2_tail_256 + +$L$seal_avx2_tail_128: + vmovdqa ymm0,YMMWORD[$L$chacha20_consts] + vmovdqa ymm4,YMMWORD[((160+64))+rbp] + vmovdqa ymm8,YMMWORD[((160+96))+rbp] + vmovdqa ymm12,YMMWORD[$L$avx2_inc] + vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp] + vmovdqa YMMWORD[(160+160)+rbp],ymm12 + +$L$seal_avx2_tail_128_rounds_and_3xhash: + add r10,QWORD[((0+0))+rdi] + adc r11,QWORD[((8+0))+rdi] + adc r12,1 + mov rdx,QWORD[((0+160+0))+rbp] + mov r15,rdx + mulx r14,r13,r10 + mulx rdx,rax,r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rdx,QWORD[((8+160+0))+rbp] + mulx rax,r10,r10 + add r14,r10 + mulx r9,r11,r11 + adc r15,r11 + adc r9,0 + imul rdx,r12 + add r15,rax + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea rdi,[16+rdi] +$L$seal_avx2_tail_128_rounds_and_2xhash: + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol16] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpsrld ymm3,ymm4,20 + vpslld ymm4,ymm4,12 + vpxor ymm4,ymm4,ymm3 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol8] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpslld ymm3,ymm4,7 + vpsrld ymm4,ymm4,25 + vpxor 
ymm4,ymm4,ymm3 + vpalignr ymm12,ymm12,ymm12,12 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm4,ymm4,ymm4,4 + add r10,QWORD[((0+0))+rdi] + adc r11,QWORD[((8+0))+rdi] + adc r12,1 + mov rdx,QWORD[((0+160+0))+rbp] + mov r15,rdx + mulx r14,r13,r10 + mulx rdx,rax,r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rdx,QWORD[((8+160+0))+rbp] + mulx rax,r10,r10 + add r14,r10 + mulx r9,r11,r11 + adc r15,r11 + adc r9,0 + imul rdx,r12 + add r15,rax + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol16] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpsrld ymm3,ymm4,20 + vpslld ymm4,ymm4,12 + vpxor ymm4,ymm4,ymm3 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol8] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpslld ymm3,ymm4,7 + vpsrld ymm4,ymm4,25 + vpxor ymm4,ymm4,ymm3 + vpalignr ymm12,ymm12,ymm12,4 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm4,ymm4,ymm4,12 + add r10,QWORD[((0+16))+rdi] + adc r11,QWORD[((8+16))+rdi] + adc r12,1 + mov rdx,QWORD[((0+160+0))+rbp] + mov r15,rdx + mulx r14,r13,r10 + mulx rdx,rax,r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rdx,QWORD[((8+160+0))+rbp] + mulx rax,r10,r10 + add r14,r10 + mulx r9,r11,r11 + adc r15,r11 + adc r9,0 + imul rdx,r12 + add r15,rax + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea rdi,[32+rdi] + dec rcx + jg NEAR $L$seal_avx2_tail_128_rounds_and_3xhash + dec r8 + jge NEAR $L$seal_avx2_tail_128_rounds_and_2xhash + vpaddd ymm0,ymm0,YMMWORD[$L$chacha20_consts] + vpaddd ymm4,ymm4,YMMWORD[((160+64))+rbp] + vpaddd ymm8,ymm8,YMMWORD[((160+96))+rbp] + vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp] + vperm2i128 ymm3,ymm4,ymm0,0x13 + vperm2i128 ymm0,ymm4,ymm0,0x02 + vperm2i128 ymm4,ymm12,ymm8,0x02 + vperm2i128 ymm12,ymm12,ymm8,0x13 + vmovdqa ymm8,ymm3 + + jmp NEAR $L$seal_avx2_short_loop + +$L$seal_avx2_tail_256: + vmovdqa ymm0,YMMWORD[$L$chacha20_consts] + vmovdqa ymm4,YMMWORD[((160+64))+rbp] + vmovdqa ymm8,YMMWORD[((160+96))+rbp] + vmovdqa ymm1,ymm0 + vmovdqa ymm5,ymm4 + vmovdqa ymm9,ymm8 + vmovdqa ymm12,YMMWORD[$L$avx2_inc] + vpaddd ymm13,ymm12,YMMWORD[((160+160))+rbp] + vpaddd ymm12,ymm12,ymm13 + vmovdqa YMMWORD[(160+160)+rbp],ymm12 + vmovdqa YMMWORD[(160+192)+rbp],ymm13 + +$L$seal_avx2_tail_256_rounds_and_3xhash: + add r10,QWORD[((0+0))+rdi] + adc r11,QWORD[((8+0))+rdi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea rdi,[16+rdi] +$L$seal_avx2_tail_256_rounds_and_2xhash: + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol16] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpsrld ymm3,ymm4,20 + vpslld ymm4,ymm4,12 + vpxor ymm4,ymm4,ymm3 + vpaddd ymm0,ymm0,ymm4 + vpxor 
ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol8] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpslld ymm3,ymm4,7 + vpsrld ymm4,ymm4,25 + vpxor ymm4,ymm4,ymm3 + vpalignr ymm12,ymm12,ymm12,12 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm4,ymm4,ymm4,4 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol16] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpsrld ymm3,ymm5,20 + vpslld ymm5,ymm5,12 + vpxor ymm5,ymm5,ymm3 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol8] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpslld ymm3,ymm5,7 + vpsrld ymm5,ymm5,25 + vpxor ymm5,ymm5,ymm3 + vpalignr ymm13,ymm13,ymm13,12 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm5,ymm5,ymm5,4 + add r10,QWORD[((0+0))+rdi] + adc r11,QWORD[((8+0))+rdi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol16] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpsrld ymm3,ymm4,20 + vpslld ymm4,ymm4,12 + vpxor ymm4,ymm4,ymm3 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol8] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpslld ymm3,ymm4,7 + vpsrld ymm4,ymm4,25 + vpxor ymm4,ymm4,ymm3 + vpalignr ymm12,ymm12,ymm12,4 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm4,ymm4,ymm4,12 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol16] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpsrld ymm3,ymm5,20 + vpslld ymm5,ymm5,12 + vpxor ymm5,ymm5,ymm3 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol8] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpslld ymm3,ymm5,7 + vpsrld ymm5,ymm5,25 + vpxor ymm5,ymm5,ymm3 + vpalignr ymm13,ymm13,ymm13,4 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm5,ymm5,ymm5,12 + add r10,QWORD[((0+16))+rdi] + adc r11,QWORD[((8+16))+rdi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea rdi,[32+rdi] + dec rcx + jg NEAR $L$seal_avx2_tail_256_rounds_and_3xhash + dec r8 + jge NEAR $L$seal_avx2_tail_256_rounds_and_2xhash + vpaddd ymm1,ymm1,YMMWORD[$L$chacha20_consts] + vpaddd ymm5,ymm5,YMMWORD[((160+64))+rbp] + vpaddd ymm9,ymm9,YMMWORD[((160+96))+rbp] + vpaddd ymm13,ymm13,YMMWORD[((160+192))+rbp] + vpaddd ymm0,ymm0,YMMWORD[$L$chacha20_consts] + vpaddd ymm4,ymm4,YMMWORD[((160+64))+rbp] + vpaddd ymm8,ymm8,YMMWORD[((160+96))+rbp] + vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp] + vperm2i128 ymm3,ymm5,ymm1,0x02 + vperm2i128 
ymm5,ymm5,ymm1,0x13 + vperm2i128 ymm1,ymm13,ymm9,0x02 + vperm2i128 ymm9,ymm13,ymm9,0x13 + vpxor ymm3,ymm3,YMMWORD[((0+0))+rsi] + vpxor ymm1,ymm1,YMMWORD[((32+0))+rsi] + vpxor ymm5,ymm5,YMMWORD[((64+0))+rsi] + vpxor ymm9,ymm9,YMMWORD[((96+0))+rsi] + vmovdqu YMMWORD[(0+0)+rdi],ymm3 + vmovdqu YMMWORD[(32+0)+rdi],ymm1 + vmovdqu YMMWORD[(64+0)+rdi],ymm5 + vmovdqu YMMWORD[(96+0)+rdi],ymm9 + vperm2i128 ymm3,ymm4,ymm0,0x13 + vperm2i128 ymm0,ymm4,ymm0,0x02 + vperm2i128 ymm4,ymm12,ymm8,0x02 + vperm2i128 ymm12,ymm12,ymm8,0x13 + vmovdqa ymm8,ymm3 + + mov rcx,4*32 + lea rsi,[128+rsi] + sub rbx,4*32 + jmp NEAR $L$seal_avx2_short_hash_remainder + +$L$seal_avx2_tail_384: + vmovdqa ymm0,YMMWORD[$L$chacha20_consts] + vmovdqa ymm4,YMMWORD[((160+64))+rbp] + vmovdqa ymm8,YMMWORD[((160+96))+rbp] + vmovdqa ymm1,ymm0 + vmovdqa ymm5,ymm4 + vmovdqa ymm9,ymm8 + vmovdqa ymm2,ymm0 + vmovdqa ymm6,ymm4 + vmovdqa ymm10,ymm8 + vmovdqa ymm12,YMMWORD[$L$avx2_inc] + vpaddd ymm14,ymm12,YMMWORD[((160+160))+rbp] + vpaddd ymm13,ymm12,ymm14 + vpaddd ymm12,ymm12,ymm13 + vmovdqa YMMWORD[(160+160)+rbp],ymm12 + vmovdqa YMMWORD[(160+192)+rbp],ymm13 + vmovdqa YMMWORD[(160+224)+rbp],ymm14 + +$L$seal_avx2_tail_384_rounds_and_3xhash: + add r10,QWORD[((0+0))+rdi] + adc r11,QWORD[((8+0))+rdi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea rdi,[16+rdi] +$L$seal_avx2_tail_384_rounds_and_2xhash: + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol16] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpsrld ymm3,ymm4,20 + vpslld ymm4,ymm4,12 + vpxor ymm4,ymm4,ymm3 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol8] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpslld ymm3,ymm4,7 + vpsrld ymm4,ymm4,25 + vpxor ymm4,ymm4,ymm3 + vpalignr ymm12,ymm12,ymm12,12 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm4,ymm4,ymm4,4 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol16] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpsrld ymm3,ymm5,20 + vpslld ymm5,ymm5,12 + vpxor ymm5,ymm5,ymm3 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol8] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpslld ymm3,ymm5,7 + vpsrld ymm5,ymm5,25 + vpxor ymm5,ymm5,ymm3 + vpalignr ymm13,ymm13,ymm13,12 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm5,ymm5,ymm5,4 + add r10,QWORD[((0+0))+rdi] + adc r11,QWORD[((8+0))+rdi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + vpaddd ymm2,ymm2,ymm6 + vpxor 
ymm14,ymm14,ymm2 + vpshufb ymm14,ymm14,YMMWORD[$L$rol16] + vpaddd ymm10,ymm10,ymm14 + vpxor ymm6,ymm6,ymm10 + vpsrld ymm3,ymm6,20 + vpslld ymm6,ymm6,12 + vpxor ymm6,ymm6,ymm3 + vpaddd ymm2,ymm2,ymm6 + vpxor ymm14,ymm14,ymm2 + vpshufb ymm14,ymm14,YMMWORD[$L$rol8] + vpaddd ymm10,ymm10,ymm14 + vpxor ymm6,ymm6,ymm10 + vpslld ymm3,ymm6,7 + vpsrld ymm6,ymm6,25 + vpxor ymm6,ymm6,ymm3 + vpalignr ymm14,ymm14,ymm14,12 + vpalignr ymm10,ymm10,ymm10,8 + vpalignr ymm6,ymm6,ymm6,4 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol16] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpsrld ymm3,ymm4,20 + vpslld ymm4,ymm4,12 + vpxor ymm4,ymm4,ymm3 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol8] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpslld ymm3,ymm4,7 + vpsrld ymm4,ymm4,25 + vpxor ymm4,ymm4,ymm3 + vpalignr ymm12,ymm12,ymm12,4 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm4,ymm4,ymm4,12 + add r10,QWORD[((0+16))+rdi] + adc r11,QWORD[((8+16))+rdi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol16] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpsrld ymm3,ymm5,20 + vpslld ymm5,ymm5,12 + vpxor ymm5,ymm5,ymm3 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol8] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpslld ymm3,ymm5,7 + vpsrld ymm5,ymm5,25 + vpxor ymm5,ymm5,ymm3 + vpalignr ymm13,ymm13,ymm13,4 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm5,ymm5,ymm5,12 + vpaddd ymm2,ymm2,ymm6 + vpxor ymm14,ymm14,ymm2 + vpshufb ymm14,ymm14,YMMWORD[$L$rol16] + vpaddd ymm10,ymm10,ymm14 + vpxor ymm6,ymm6,ymm10 + vpsrld ymm3,ymm6,20 + vpslld ymm6,ymm6,12 + vpxor ymm6,ymm6,ymm3 + vpaddd ymm2,ymm2,ymm6 + vpxor ymm14,ymm14,ymm2 + vpshufb ymm14,ymm14,YMMWORD[$L$rol8] + vpaddd ymm10,ymm10,ymm14 + vpxor ymm6,ymm6,ymm10 + vpslld ymm3,ymm6,7 + vpsrld ymm6,ymm6,25 + vpxor ymm6,ymm6,ymm3 + vpalignr ymm14,ymm14,ymm14,4 + vpalignr ymm10,ymm10,ymm10,8 + vpalignr ymm6,ymm6,ymm6,12 + + lea rdi,[32+rdi] + dec rcx + jg NEAR $L$seal_avx2_tail_384_rounds_and_3xhash + dec r8 + jge NEAR $L$seal_avx2_tail_384_rounds_and_2xhash + vpaddd ymm2,ymm2,YMMWORD[$L$chacha20_consts] + vpaddd ymm6,ymm6,YMMWORD[((160+64))+rbp] + vpaddd ymm10,ymm10,YMMWORD[((160+96))+rbp] + vpaddd ymm14,ymm14,YMMWORD[((160+224))+rbp] + vpaddd ymm1,ymm1,YMMWORD[$L$chacha20_consts] + vpaddd ymm5,ymm5,YMMWORD[((160+64))+rbp] + vpaddd ymm9,ymm9,YMMWORD[((160+96))+rbp] + vpaddd ymm13,ymm13,YMMWORD[((160+192))+rbp] + vpaddd ymm0,ymm0,YMMWORD[$L$chacha20_consts] + vpaddd ymm4,ymm4,YMMWORD[((160+64))+rbp] + vpaddd ymm8,ymm8,YMMWORD[((160+96))+rbp] + vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp] + vperm2i128 ymm3,ymm6,ymm2,0x02 + vperm2i128 ymm6,ymm6,ymm2,0x13 + vperm2i128 ymm2,ymm14,ymm10,0x02 + vperm2i128 ymm10,ymm14,ymm10,0x13 + vpxor ymm3,ymm3,YMMWORD[((0+0))+rsi] + vpxor ymm2,ymm2,YMMWORD[((32+0))+rsi] + vpxor ymm6,ymm6,YMMWORD[((64+0))+rsi] + vpxor 
ymm10,ymm10,YMMWORD[((96+0))+rsi] + vmovdqu YMMWORD[(0+0)+rdi],ymm3 + vmovdqu YMMWORD[(32+0)+rdi],ymm2 + vmovdqu YMMWORD[(64+0)+rdi],ymm6 + vmovdqu YMMWORD[(96+0)+rdi],ymm10 + vperm2i128 ymm3,ymm5,ymm1,0x02 + vperm2i128 ymm5,ymm5,ymm1,0x13 + vperm2i128 ymm1,ymm13,ymm9,0x02 + vperm2i128 ymm9,ymm13,ymm9,0x13 + vpxor ymm3,ymm3,YMMWORD[((0+128))+rsi] + vpxor ymm1,ymm1,YMMWORD[((32+128))+rsi] + vpxor ymm5,ymm5,YMMWORD[((64+128))+rsi] + vpxor ymm9,ymm9,YMMWORD[((96+128))+rsi] + vmovdqu YMMWORD[(0+128)+rdi],ymm3 + vmovdqu YMMWORD[(32+128)+rdi],ymm1 + vmovdqu YMMWORD[(64+128)+rdi],ymm5 + vmovdqu YMMWORD[(96+128)+rdi],ymm9 + vperm2i128 ymm3,ymm4,ymm0,0x13 + vperm2i128 ymm0,ymm4,ymm0,0x02 + vperm2i128 ymm4,ymm12,ymm8,0x02 + vperm2i128 ymm12,ymm12,ymm8,0x13 + vmovdqa ymm8,ymm3 + + mov rcx,8*32 + lea rsi,[256+rsi] + sub rbx,8*32 + jmp NEAR $L$seal_avx2_short_hash_remainder + +$L$seal_avx2_tail_512: + vmovdqa ymm0,YMMWORD[$L$chacha20_consts] + vmovdqa ymm4,YMMWORD[((160+64))+rbp] + vmovdqa ymm8,YMMWORD[((160+96))+rbp] + vmovdqa ymm1,ymm0 + vmovdqa ymm5,ymm4 + vmovdqa ymm9,ymm8 + vmovdqa ymm2,ymm0 + vmovdqa ymm6,ymm4 + vmovdqa ymm10,ymm8 + vmovdqa ymm3,ymm0 + vmovdqa ymm7,ymm4 + vmovdqa ymm11,ymm8 + vmovdqa ymm12,YMMWORD[$L$avx2_inc] + vpaddd ymm15,ymm12,YMMWORD[((160+160))+rbp] + vpaddd ymm14,ymm12,ymm15 + vpaddd ymm13,ymm12,ymm14 + vpaddd ymm12,ymm12,ymm13 + vmovdqa YMMWORD[(160+256)+rbp],ymm15 + vmovdqa YMMWORD[(160+224)+rbp],ymm14 + vmovdqa YMMWORD[(160+192)+rbp],ymm13 + vmovdqa YMMWORD[(160+160)+rbp],ymm12 + +$L$seal_avx2_tail_512_rounds_and_3xhash: + add r10,QWORD[((0+0))+rdi] + adc r11,QWORD[((8+0))+rdi] + adc r12,1 + mov rdx,QWORD[((0+160+0))+rbp] + mov r15,rdx + mulx r14,r13,r10 + mulx rdx,rax,r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rdx,QWORD[((8+160+0))+rbp] + mulx rax,r10,r10 + add r14,r10 + mulx r9,r11,r11 + adc r15,r11 + adc r9,0 + imul rdx,r12 + add r15,rax + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea rdi,[16+rdi] +$L$seal_avx2_tail_512_rounds_and_2xhash: + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vmovdqa ymm8,YMMWORD[$L$rol16] + vpaddd ymm3,ymm3,ymm7 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm15,ymm15,ymm3 + vpxor ymm14,ymm14,ymm2 + vpxor ymm13,ymm13,ymm1 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm15,ymm15,ymm8 + vpshufb ymm14,ymm14,ymm8 + vpshufb ymm13,ymm13,ymm8 + vpshufb ymm12,ymm12,ymm8 + vpaddd ymm11,ymm11,ymm15 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] + vpxor ymm7,ymm7,ymm11 + vpxor ymm6,ymm6,ymm10 + add r10,QWORD[((0+0))+rdi] + adc r11,QWORD[((8+0))+rdi] + adc r12,1 + vpxor ymm5,ymm5,ymm9 + vpxor ymm4,ymm4,ymm8 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vpsrld ymm8,ymm7,20 + vpslld ymm7,ymm7,32-20 + vpxor ymm7,ymm7,ymm8 + vpsrld ymm8,ymm6,20 + vpslld ymm6,ymm6,32-20 + vpxor ymm6,ymm6,ymm8 + vpsrld ymm8,ymm5,20 + vpslld ymm5,ymm5,32-20 + vpxor ymm5,ymm5,ymm8 + vpsrld ymm8,ymm4,20 + vpslld ymm4,ymm4,32-20 + vpxor ymm4,ymm4,ymm8 + vmovdqa ymm8,YMMWORD[$L$rol8] + vpaddd ymm3,ymm3,ymm7 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm0,ymm0,ymm4 + mov rdx,QWORD[((0+160+0))+rbp] + mov r15,rdx + mulx r14,r13,r10 + mulx rdx,rax,r11 + imul r15,r12 + add r14,rax + adc r15,rdx + vpxor ymm15,ymm15,ymm3 + vpxor ymm14,ymm14,ymm2 + vpxor ymm13,ymm13,ymm1 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm15,ymm15,ymm8 + vpshufb 
ymm14,ymm14,ymm8 + vpshufb ymm13,ymm13,ymm8 + vpshufb ymm12,ymm12,ymm8 + vpaddd ymm11,ymm11,ymm15 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] + vpxor ymm7,ymm7,ymm11 + vpxor ymm6,ymm6,ymm10 + vpxor ymm5,ymm5,ymm9 + vpxor ymm4,ymm4,ymm8 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vpsrld ymm8,ymm7,25 + vpslld ymm7,ymm7,32-25 + vpxor ymm7,ymm7,ymm8 + mov rdx,QWORD[((8+160+0))+rbp] + mulx rax,r10,r10 + add r14,r10 + mulx r9,r11,r11 + adc r15,r11 + adc r9,0 + imul rdx,r12 + vpsrld ymm8,ymm6,25 + vpslld ymm6,ymm6,32-25 + vpxor ymm6,ymm6,ymm8 + vpsrld ymm8,ymm5,25 + vpslld ymm5,ymm5,32-25 + vpxor ymm5,ymm5,ymm8 + vpsrld ymm8,ymm4,25 + vpslld ymm4,ymm4,32-25 + vpxor ymm4,ymm4,ymm8 + vmovdqa ymm8,YMMWORD[((160+128))+rbp] + vpalignr ymm7,ymm7,ymm7,4 + vpalignr ymm11,ymm11,ymm11,8 + vpalignr ymm15,ymm15,ymm15,12 + vpalignr ymm6,ymm6,ymm6,4 + vpalignr ymm10,ymm10,ymm10,8 + vpalignr ymm14,ymm14,ymm14,12 + vpalignr ymm5,ymm5,ymm5,4 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm13,ymm13,ymm13,12 + vpalignr ymm4,ymm4,ymm4,4 + add r15,rax + adc r9,rdx + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm12,ymm12,ymm12,12 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vmovdqa ymm8,YMMWORD[$L$rol16] + vpaddd ymm3,ymm3,ymm7 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm15,ymm15,ymm3 + vpxor ymm14,ymm14,ymm2 + vpxor ymm13,ymm13,ymm1 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm15,ymm15,ymm8 + vpshufb ymm14,ymm14,ymm8 + vpshufb ymm13,ymm13,ymm8 + vpshufb ymm12,ymm12,ymm8 + vpaddd ymm11,ymm11,ymm15 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + vpxor ymm7,ymm7,ymm11 + vpxor ymm6,ymm6,ymm10 + vpxor ymm5,ymm5,ymm9 + vpxor ymm4,ymm4,ymm8 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vpsrld ymm8,ymm7,20 + vpslld ymm7,ymm7,32-20 + vpxor ymm7,ymm7,ymm8 + vpsrld ymm8,ymm6,20 + vpslld ymm6,ymm6,32-20 + vpxor ymm6,ymm6,ymm8 + vpsrld ymm8,ymm5,20 + vpslld ymm5,ymm5,32-20 + vpxor ymm5,ymm5,ymm8 + vpsrld ymm8,ymm4,20 + vpslld ymm4,ymm4,32-20 + vpxor ymm4,ymm4,ymm8 + vmovdqa ymm8,YMMWORD[$L$rol8] + vpaddd ymm3,ymm3,ymm7 + vpaddd ymm2,ymm2,ymm6 + add r10,QWORD[((0+16))+rdi] + adc r11,QWORD[((8+16))+rdi] + adc r12,1 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm15,ymm15,ymm3 + vpxor ymm14,ymm14,ymm2 + vpxor ymm13,ymm13,ymm1 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm15,ymm15,ymm8 + vpshufb ymm14,ymm14,ymm8 + vpshufb ymm13,ymm13,ymm8 + vpshufb ymm12,ymm12,ymm8 + vpaddd ymm11,ymm11,ymm15 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] + vpxor ymm7,ymm7,ymm11 + vpxor ymm6,ymm6,ymm10 + vpxor ymm5,ymm5,ymm9 + vpxor ymm4,ymm4,ymm8 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vpsrld ymm8,ymm7,25 + mov rdx,QWORD[((0+160+0))+rbp] + mov r15,rdx + mulx r14,r13,r10 + mulx rdx,rax,r11 + imul r15,r12 + add r14,rax + adc r15,rdx + vpslld ymm7,ymm7,32-25 + vpxor ymm7,ymm7,ymm8 + vpsrld ymm8,ymm6,25 + vpslld ymm6,ymm6,32-25 + vpxor ymm6,ymm6,ymm8 + vpsrld ymm8,ymm5,25 + vpslld ymm5,ymm5,32-25 + vpxor ymm5,ymm5,ymm8 + vpsrld ymm8,ymm4,25 + vpslld ymm4,ymm4,32-25 + vpxor ymm4,ymm4,ymm8 + vmovdqa ymm8,YMMWORD[((160+128))+rbp] + vpalignr ymm7,ymm7,ymm7,12 + vpalignr ymm11,ymm11,ymm11,8 + vpalignr ymm15,ymm15,ymm15,4 + vpalignr ymm6,ymm6,ymm6,12 + vpalignr ymm10,ymm10,ymm10,8 + vpalignr ymm14,ymm14,ymm14,4 + 
vpalignr ymm5,ymm5,ymm5,12 + vpalignr ymm9,ymm9,ymm9,8 + mov rdx,QWORD[((8+160+0))+rbp] + mulx rax,r10,r10 + add r14,r10 + mulx r9,r11,r11 + adc r15,r11 + adc r9,0 + imul rdx,r12 + vpalignr ymm13,ymm13,ymm13,4 + vpalignr ymm4,ymm4,ymm4,12 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm12,ymm12,ymm12,4 + + + + + + + + + + + + + + + + + add r15,rax + adc r9,rdx + + + + + + + + + + + + + + + + + + + + + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea rdi,[32+rdi] + dec rcx + jg NEAR $L$seal_avx2_tail_512_rounds_and_3xhash + dec r8 + jge NEAR $L$seal_avx2_tail_512_rounds_and_2xhash + vpaddd ymm3,ymm3,YMMWORD[$L$chacha20_consts] + vpaddd ymm7,ymm7,YMMWORD[((160+64))+rbp] + vpaddd ymm11,ymm11,YMMWORD[((160+96))+rbp] + vpaddd ymm15,ymm15,YMMWORD[((160+256))+rbp] + vpaddd ymm2,ymm2,YMMWORD[$L$chacha20_consts] + vpaddd ymm6,ymm6,YMMWORD[((160+64))+rbp] + vpaddd ymm10,ymm10,YMMWORD[((160+96))+rbp] + vpaddd ymm14,ymm14,YMMWORD[((160+224))+rbp] + vpaddd ymm1,ymm1,YMMWORD[$L$chacha20_consts] + vpaddd ymm5,ymm5,YMMWORD[((160+64))+rbp] + vpaddd ymm9,ymm9,YMMWORD[((160+96))+rbp] + vpaddd ymm13,ymm13,YMMWORD[((160+192))+rbp] + vpaddd ymm0,ymm0,YMMWORD[$L$chacha20_consts] + vpaddd ymm4,ymm4,YMMWORD[((160+64))+rbp] + vpaddd ymm8,ymm8,YMMWORD[((160+96))+rbp] + vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp] + + vmovdqa YMMWORD[(160+128)+rbp],ymm0 + vperm2i128 ymm0,ymm7,ymm3,0x02 + vperm2i128 ymm7,ymm7,ymm3,0x13 + vperm2i128 ymm3,ymm15,ymm11,0x02 + vperm2i128 ymm11,ymm15,ymm11,0x13 + vpxor ymm0,ymm0,YMMWORD[((0+0))+rsi] + vpxor ymm3,ymm3,YMMWORD[((32+0))+rsi] + vpxor ymm7,ymm7,YMMWORD[((64+0))+rsi] + vpxor ymm11,ymm11,YMMWORD[((96+0))+rsi] + vmovdqu YMMWORD[(0+0)+rdi],ymm0 + vmovdqu YMMWORD[(32+0)+rdi],ymm3 + vmovdqu YMMWORD[(64+0)+rdi],ymm7 + vmovdqu YMMWORD[(96+0)+rdi],ymm11 + + vmovdqa ymm0,YMMWORD[((160+128))+rbp] + vperm2i128 ymm3,ymm6,ymm2,0x02 + vperm2i128 ymm6,ymm6,ymm2,0x13 + vperm2i128 ymm2,ymm14,ymm10,0x02 + vperm2i128 ymm10,ymm14,ymm10,0x13 + vpxor ymm3,ymm3,YMMWORD[((0+128))+rsi] + vpxor ymm2,ymm2,YMMWORD[((32+128))+rsi] + vpxor ymm6,ymm6,YMMWORD[((64+128))+rsi] + vpxor ymm10,ymm10,YMMWORD[((96+128))+rsi] + vmovdqu YMMWORD[(0+128)+rdi],ymm3 + vmovdqu YMMWORD[(32+128)+rdi],ymm2 + vmovdqu YMMWORD[(64+128)+rdi],ymm6 + vmovdqu YMMWORD[(96+128)+rdi],ymm10 + vperm2i128 ymm3,ymm5,ymm1,0x02 + vperm2i128 ymm5,ymm5,ymm1,0x13 + vperm2i128 ymm1,ymm13,ymm9,0x02 + vperm2i128 ymm9,ymm13,ymm9,0x13 + vpxor ymm3,ymm3,YMMWORD[((0+256))+rsi] + vpxor ymm1,ymm1,YMMWORD[((32+256))+rsi] + vpxor ymm5,ymm5,YMMWORD[((64+256))+rsi] + vpxor ymm9,ymm9,YMMWORD[((96+256))+rsi] + vmovdqu YMMWORD[(0+256)+rdi],ymm3 + vmovdqu YMMWORD[(32+256)+rdi],ymm1 + vmovdqu YMMWORD[(64+256)+rdi],ymm5 + vmovdqu YMMWORD[(96+256)+rdi],ymm9 + vperm2i128 ymm3,ymm4,ymm0,0x13 + vperm2i128 ymm0,ymm4,ymm0,0x02 + vperm2i128 ymm4,ymm12,ymm8,0x02 + vperm2i128 ymm12,ymm12,ymm8,0x13 + vmovdqa ymm8,ymm3 + + mov rcx,12*32 + lea rsi,[384+rsi] + sub rbx,12*32 + jmp NEAR $L$seal_avx2_short_hash_remainder + +$L$seal_avx2_320: + vmovdqa ymm1,ymm0 + vmovdqa ymm2,ymm0 + vmovdqa ymm5,ymm4 + vmovdqa ymm6,ymm4 + vmovdqa ymm9,ymm8 + vmovdqa ymm10,ymm8 + vpaddd ymm13,ymm12,YMMWORD[$L$avx2_inc] + vpaddd ymm14,ymm13,YMMWORD[$L$avx2_inc] + vmovdqa ymm7,ymm4 + vmovdqa ymm11,ymm8 + vmovdqa YMMWORD[(160+160)+rbp],ymm12 + vmovdqa YMMWORD[(160+192)+rbp],ymm13 + vmovdqa YMMWORD[(160+224)+rbp],ymm14 + mov r10,10 +$L$seal_avx2_320_rounds: + vpaddd 
ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol16] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpsrld ymm3,ymm4,20 + vpslld ymm4,ymm4,12 + vpxor ymm4,ymm4,ymm3 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol8] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpslld ymm3,ymm4,7 + vpsrld ymm4,ymm4,25 + vpxor ymm4,ymm4,ymm3 + vpalignr ymm12,ymm12,ymm12,12 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm4,ymm4,ymm4,4 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol16] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpsrld ymm3,ymm5,20 + vpslld ymm5,ymm5,12 + vpxor ymm5,ymm5,ymm3 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol8] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpslld ymm3,ymm5,7 + vpsrld ymm5,ymm5,25 + vpxor ymm5,ymm5,ymm3 + vpalignr ymm13,ymm13,ymm13,12 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm5,ymm5,ymm5,4 + vpaddd ymm2,ymm2,ymm6 + vpxor ymm14,ymm14,ymm2 + vpshufb ymm14,ymm14,YMMWORD[$L$rol16] + vpaddd ymm10,ymm10,ymm14 + vpxor ymm6,ymm6,ymm10 + vpsrld ymm3,ymm6,20 + vpslld ymm6,ymm6,12 + vpxor ymm6,ymm6,ymm3 + vpaddd ymm2,ymm2,ymm6 + vpxor ymm14,ymm14,ymm2 + vpshufb ymm14,ymm14,YMMWORD[$L$rol8] + vpaddd ymm10,ymm10,ymm14 + vpxor ymm6,ymm6,ymm10 + vpslld ymm3,ymm6,7 + vpsrld ymm6,ymm6,25 + vpxor ymm6,ymm6,ymm3 + vpalignr ymm14,ymm14,ymm14,12 + vpalignr ymm10,ymm10,ymm10,8 + vpalignr ymm6,ymm6,ymm6,4 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol16] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpsrld ymm3,ymm4,20 + vpslld ymm4,ymm4,12 + vpxor ymm4,ymm4,ymm3 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol8] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpslld ymm3,ymm4,7 + vpsrld ymm4,ymm4,25 + vpxor ymm4,ymm4,ymm3 + vpalignr ymm12,ymm12,ymm12,4 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm4,ymm4,ymm4,12 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol16] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpsrld ymm3,ymm5,20 + vpslld ymm5,ymm5,12 + vpxor ymm5,ymm5,ymm3 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol8] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpslld ymm3,ymm5,7 + vpsrld ymm5,ymm5,25 + vpxor ymm5,ymm5,ymm3 + vpalignr ymm13,ymm13,ymm13,4 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm5,ymm5,ymm5,12 + vpaddd ymm2,ymm2,ymm6 + vpxor ymm14,ymm14,ymm2 + vpshufb ymm14,ymm14,YMMWORD[$L$rol16] + vpaddd ymm10,ymm10,ymm14 + vpxor ymm6,ymm6,ymm10 + vpsrld ymm3,ymm6,20 + vpslld ymm6,ymm6,12 + vpxor ymm6,ymm6,ymm3 + vpaddd ymm2,ymm2,ymm6 + vpxor ymm14,ymm14,ymm2 + vpshufb ymm14,ymm14,YMMWORD[$L$rol8] + vpaddd ymm10,ymm10,ymm14 + vpxor ymm6,ymm6,ymm10 + vpslld ymm3,ymm6,7 + vpsrld ymm6,ymm6,25 + vpxor ymm6,ymm6,ymm3 + vpalignr ymm14,ymm14,ymm14,4 + vpalignr ymm10,ymm10,ymm10,8 + vpalignr ymm6,ymm6,ymm6,12 + + dec r10 + jne NEAR $L$seal_avx2_320_rounds + vpaddd ymm0,ymm0,YMMWORD[$L$chacha20_consts] + vpaddd ymm1,ymm1,YMMWORD[$L$chacha20_consts] + vpaddd ymm2,ymm2,YMMWORD[$L$chacha20_consts] + vpaddd ymm4,ymm4,ymm7 + vpaddd ymm5,ymm5,ymm7 + vpaddd ymm6,ymm6,ymm7 + vpaddd ymm8,ymm8,ymm11 + vpaddd ymm9,ymm9,ymm11 + vpaddd ymm10,ymm10,ymm11 + vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp] + vpaddd ymm13,ymm13,YMMWORD[((160+192))+rbp] + vpaddd ymm14,ymm14,YMMWORD[((160+224))+rbp] + vperm2i128 ymm3,ymm4,ymm0,0x02 + + vpand ymm3,ymm3,YMMWORD[$L$clamp] + vmovdqa YMMWORD[(160+0)+rbp],ymm3 
+ + vperm2i128 ymm0,ymm4,ymm0,0x13 + vperm2i128 ymm4,ymm12,ymm8,0x13 + vperm2i128 ymm8,ymm5,ymm1,0x02 + vperm2i128 ymm12,ymm13,ymm9,0x02 + vperm2i128 ymm1,ymm5,ymm1,0x13 + vperm2i128 ymm5,ymm13,ymm9,0x13 + vperm2i128 ymm9,ymm6,ymm2,0x02 + vperm2i128 ymm13,ymm14,ymm10,0x02 + vperm2i128 ymm2,ymm6,ymm2,0x13 + vperm2i128 ymm6,ymm14,ymm10,0x13 + jmp NEAR $L$seal_avx2_short + +$L$seal_avx2_192: + vmovdqa ymm1,ymm0 + vmovdqa ymm2,ymm0 + vmovdqa ymm5,ymm4 + vmovdqa ymm6,ymm4 + vmovdqa ymm9,ymm8 + vmovdqa ymm10,ymm8 + vpaddd ymm13,ymm12,YMMWORD[$L$avx2_inc] + vmovdqa ymm11,ymm12 + vmovdqa ymm15,ymm13 + mov r10,10 +$L$seal_avx2_192_rounds: + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol16] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpsrld ymm3,ymm4,20 + vpslld ymm4,ymm4,12 + vpxor ymm4,ymm4,ymm3 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol8] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpslld ymm3,ymm4,7 + vpsrld ymm4,ymm4,25 + vpxor ymm4,ymm4,ymm3 + vpalignr ymm12,ymm12,ymm12,12 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm4,ymm4,ymm4,4 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol16] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpsrld ymm3,ymm5,20 + vpslld ymm5,ymm5,12 + vpxor ymm5,ymm5,ymm3 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol8] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpslld ymm3,ymm5,7 + vpsrld ymm5,ymm5,25 + vpxor ymm5,ymm5,ymm3 + vpalignr ymm13,ymm13,ymm13,12 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm5,ymm5,ymm5,4 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol16] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpsrld ymm3,ymm4,20 + vpslld ymm4,ymm4,12 + vpxor ymm4,ymm4,ymm3 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol8] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpslld ymm3,ymm4,7 + vpsrld ymm4,ymm4,25 + vpxor ymm4,ymm4,ymm3 + vpalignr ymm12,ymm12,ymm12,4 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm4,ymm4,ymm4,12 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol16] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpsrld ymm3,ymm5,20 + vpslld ymm5,ymm5,12 + vpxor ymm5,ymm5,ymm3 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol8] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpslld ymm3,ymm5,7 + vpsrld ymm5,ymm5,25 + vpxor ymm5,ymm5,ymm3 + vpalignr ymm13,ymm13,ymm13,4 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm5,ymm5,ymm5,12 + + dec r10 + jne NEAR $L$seal_avx2_192_rounds + vpaddd ymm0,ymm0,ymm2 + vpaddd ymm1,ymm1,ymm2 + vpaddd ymm4,ymm4,ymm6 + vpaddd ymm5,ymm5,ymm6 + vpaddd ymm8,ymm8,ymm10 + vpaddd ymm9,ymm9,ymm10 + vpaddd ymm12,ymm12,ymm11 + vpaddd ymm13,ymm13,ymm15 + vperm2i128 ymm3,ymm4,ymm0,0x02 + + vpand ymm3,ymm3,YMMWORD[$L$clamp] + vmovdqa YMMWORD[(160+0)+rbp],ymm3 + + vperm2i128 ymm0,ymm4,ymm0,0x13 + vperm2i128 ymm4,ymm12,ymm8,0x13 + vperm2i128 ymm8,ymm5,ymm1,0x02 + vperm2i128 ymm12,ymm13,ymm9,0x02 + vperm2i128 ymm1,ymm5,ymm1,0x13 + vperm2i128 ymm5,ymm13,ymm9,0x13 +$L$seal_avx2_short: + mov r8,r8 + call poly_hash_ad_internal + xor rcx,rcx +$L$seal_avx2_short_hash_remainder: + cmp rcx,16 + jb NEAR $L$seal_avx2_short_loop + add r10,QWORD[((0+0))+rdi] + adc r11,QWORD[((8+0))+rdi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add 
r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + sub rcx,16 + add rdi,16 + jmp NEAR $L$seal_avx2_short_hash_remainder +$L$seal_avx2_short_loop: + cmp rbx,32 + jb NEAR $L$seal_avx2_short_tail + sub rbx,32 + + vpxor ymm0,ymm0,YMMWORD[rsi] + vmovdqu YMMWORD[rdi],ymm0 + lea rsi,[32+rsi] + + add r10,QWORD[((0+0))+rdi] + adc r11,QWORD[((8+0))+rdi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + add r10,QWORD[((0+16))+rdi] + adc r11,QWORD[((8+16))+rdi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea rdi,[32+rdi] + + vmovdqa ymm0,ymm4 + vmovdqa ymm4,ymm8 + vmovdqa ymm8,ymm12 + vmovdqa ymm12,ymm1 + vmovdqa ymm1,ymm5 + vmovdqa ymm5,ymm9 + vmovdqa ymm9,ymm13 + vmovdqa ymm13,ymm2 + vmovdqa ymm2,ymm6 + jmp NEAR $L$seal_avx2_short_loop +$L$seal_avx2_short_tail: + cmp rbx,16 + jb NEAR $L$seal_avx2_exit + sub rbx,16 + vpxor xmm3,xmm0,XMMWORD[rsi] + vmovdqu XMMWORD[rdi],xmm3 + lea rsi,[16+rsi] + add r10,QWORD[((0+0))+rdi] + adc r11,QWORD[((8+0))+rdi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea rdi,[16+rdi] + vextracti128 xmm0,ymm0,1 +$L$seal_avx2_exit: + vzeroupper + jmp NEAR $L$seal_sse_tail_16 + +$L$SEH_end_chacha20_poly1305_seal_avx2: +%else +; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 +ret +%endif diff --git a/ring-0.17.14/pregenerated/chacha20_poly1305_x86_64-nasm.o b/ring-0.17.14/pregenerated/chacha20_poly1305_x86_64-nasm.o new file mode 100644 index 0000000000..08c6c5d2bc Binary files /dev/null and b/ring-0.17.14/pregenerated/chacha20_poly1305_x86_64-nasm.o differ diff --git a/ring-0.17.14/pregenerated/ghash-armv4-linux32.S 
b/ring-0.17.14/pregenerated/ghash-armv4-linux32.S new file mode 100644 index 0000000000..1209e5ea16 --- /dev/null +++ b/ring-0.17.14/pregenerated/ghash-armv4-linux32.S @@ -0,0 +1,242 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__ELF__) +@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both +@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions. (ARMv8 PMULL +@ instructions are in aesv8-armx.pl.) +.arch armv7-a + +.text +#if defined(__thumb2__) || defined(__clang__) +.syntax unified +#define ldrplb ldrbpl +#define ldrneb ldrbne +#endif +#if defined(__thumb2__) +.thumb +#else +.code 32 +#endif +#if __ARM_MAX_ARCH__>=7 +.arch armv7-a +.fpu neon + +.globl gcm_init_neon +.hidden gcm_init_neon +.type gcm_init_neon,%function +.align 4 +gcm_init_neon: + vld1.64 d7,[r1]! @ load H + vmov.i8 q8,#0xe1 + vld1.64 d6,[r1] + vshl.i64 d17,#57 + vshr.u64 d16,#63 @ t0=0xc2....01 + vdup.8 q9,d7[7] + vshr.u64 d26,d6,#63 + vshr.s8 q9,#7 @ broadcast carry bit + vshl.i64 q3,q3,#1 + vand q8,q8,q9 + vorr d7,d26 @ H<<<=1 + veor q3,q3,q8 @ twisted H + vstmia r0,{q3} + + bx lr @ bx lr +.size gcm_init_neon,.-gcm_init_neon + +.globl gcm_gmult_neon +.hidden gcm_gmult_neon +.type gcm_gmult_neon,%function +.align 4 +gcm_gmult_neon: + vld1.64 d7,[r0]! @ load Xi + vld1.64 d6,[r0]! + vmov.i64 d29,#0x0000ffffffffffff + vldmia r1,{d26,d27} @ load twisted H + vmov.i64 d30,#0x00000000ffffffff +#ifdef __ARMEL__ + vrev64.8 q3,q3 +#endif + vmov.i64 d31,#0x000000000000ffff + veor d28,d26,d27 @ Karatsuba pre-processing + mov r3,#16 + b .Lgmult_neon +.size gcm_gmult_neon,.-gcm_gmult_neon + +.globl gcm_ghash_neon +.hidden gcm_ghash_neon +.type gcm_ghash_neon,%function +.align 4 +gcm_ghash_neon: + vld1.64 d1,[r0]! @ load Xi + vld1.64 d0,[r0]! + vmov.i64 d29,#0x0000ffffffffffff + vldmia r1,{d26,d27} @ load twisted H + vmov.i64 d30,#0x00000000ffffffff +#ifdef __ARMEL__ + vrev64.8 q0,q0 +#endif + vmov.i64 d31,#0x000000000000ffff + veor d28,d26,d27 @ Karatsuba pre-processing + +.Loop_neon: + vld1.64 d7,[r2]! @ load inp + vld1.64 d6,[r2]! 
+#ifdef __ARMEL__ + vrev64.8 q3,q3 +#endif + veor q3,q0 @ inp^=Xi +.Lgmult_neon: + vext.8 d16, d26, d26, #1 @ A1 + vmull.p8 q8, d16, d6 @ F = A1*B + vext.8 d0, d6, d6, #1 @ B1 + vmull.p8 q0, d26, d0 @ E = A*B1 + vext.8 d18, d26, d26, #2 @ A2 + vmull.p8 q9, d18, d6 @ H = A2*B + vext.8 d22, d6, d6, #2 @ B2 + vmull.p8 q11, d26, d22 @ G = A*B2 + vext.8 d20, d26, d26, #3 @ A3 + veor q8, q8, q0 @ L = E + F + vmull.p8 q10, d20, d6 @ J = A3*B + vext.8 d0, d6, d6, #3 @ B3 + veor q9, q9, q11 @ M = G + H + vmull.p8 q0, d26, d0 @ I = A*B3 + veor d16, d16, d17 @ t0 = (L) (P0 + P1) << 8 + vand d17, d17, d29 + vext.8 d22, d6, d6, #4 @ B4 + veor d18, d18, d19 @ t1 = (M) (P2 + P3) << 16 + vand d19, d19, d30 + vmull.p8 q11, d26, d22 @ K = A*B4 + veor q10, q10, q0 @ N = I + J + veor d16, d16, d17 + veor d18, d18, d19 + veor d20, d20, d21 @ t2 = (N) (P4 + P5) << 24 + vand d21, d21, d31 + vext.8 q8, q8, q8, #15 + veor d22, d22, d23 @ t3 = (K) (P6 + P7) << 32 + vmov.i64 d23, #0 + vext.8 q9, q9, q9, #14 + veor d20, d20, d21 + vmull.p8 q0, d26, d6 @ D = A*B + vext.8 q11, q11, q11, #12 + vext.8 q10, q10, q10, #13 + veor q8, q8, q9 + veor q10, q10, q11 + veor q0, q0, q8 + veor q0, q0, q10 + veor d6,d6,d7 @ Karatsuba pre-processing + vext.8 d16, d28, d28, #1 @ A1 + vmull.p8 q8, d16, d6 @ F = A1*B + vext.8 d2, d6, d6, #1 @ B1 + vmull.p8 q1, d28, d2 @ E = A*B1 + vext.8 d18, d28, d28, #2 @ A2 + vmull.p8 q9, d18, d6 @ H = A2*B + vext.8 d22, d6, d6, #2 @ B2 + vmull.p8 q11, d28, d22 @ G = A*B2 + vext.8 d20, d28, d28, #3 @ A3 + veor q8, q8, q1 @ L = E + F + vmull.p8 q10, d20, d6 @ J = A3*B + vext.8 d2, d6, d6, #3 @ B3 + veor q9, q9, q11 @ M = G + H + vmull.p8 q1, d28, d2 @ I = A*B3 + veor d16, d16, d17 @ t0 = (L) (P0 + P1) << 8 + vand d17, d17, d29 + vext.8 d22, d6, d6, #4 @ B4 + veor d18, d18, d19 @ t1 = (M) (P2 + P3) << 16 + vand d19, d19, d30 + vmull.p8 q11, d28, d22 @ K = A*B4 + veor q10, q10, q1 @ N = I + J + veor d16, d16, d17 + veor d18, d18, d19 + veor d20, d20, d21 @ t2 = (N) (P4 + P5) << 24 + vand d21, d21, d31 + vext.8 q8, q8, q8, #15 + veor d22, d22, d23 @ t3 = (K) (P6 + P7) << 32 + vmov.i64 d23, #0 + vext.8 q9, q9, q9, #14 + veor d20, d20, d21 + vmull.p8 q1, d28, d6 @ D = A*B + vext.8 q11, q11, q11, #12 + vext.8 q10, q10, q10, #13 + veor q8, q8, q9 + veor q10, q10, q11 + veor q1, q1, q8 + veor q1, q1, q10 + vext.8 d16, d27, d27, #1 @ A1 + vmull.p8 q8, d16, d7 @ F = A1*B + vext.8 d4, d7, d7, #1 @ B1 + vmull.p8 q2, d27, d4 @ E = A*B1 + vext.8 d18, d27, d27, #2 @ A2 + vmull.p8 q9, d18, d7 @ H = A2*B + vext.8 d22, d7, d7, #2 @ B2 + vmull.p8 q11, d27, d22 @ G = A*B2 + vext.8 d20, d27, d27, #3 @ A3 + veor q8, q8, q2 @ L = E + F + vmull.p8 q10, d20, d7 @ J = A3*B + vext.8 d4, d7, d7, #3 @ B3 + veor q9, q9, q11 @ M = G + H + vmull.p8 q2, d27, d4 @ I = A*B3 + veor d16, d16, d17 @ t0 = (L) (P0 + P1) << 8 + vand d17, d17, d29 + vext.8 d22, d7, d7, #4 @ B4 + veor d18, d18, d19 @ t1 = (M) (P2 + P3) << 16 + vand d19, d19, d30 + vmull.p8 q11, d27, d22 @ K = A*B4 + veor q10, q10, q2 @ N = I + J + veor d16, d16, d17 + veor d18, d18, d19 + veor d20, d20, d21 @ t2 = (N) (P4 + P5) << 24 + vand d21, d21, d31 + vext.8 q8, q8, q8, #15 + veor d22, d22, d23 @ t3 = (K) (P6 + P7) << 32 + vmov.i64 d23, #0 + vext.8 q9, q9, q9, #14 + veor d20, d20, d21 + vmull.p8 q2, d27, d7 @ D = A*B + vext.8 q11, q11, q11, #12 + vext.8 q10, q10, q10, #13 + veor q8, q8, q9 + veor q10, q10, q11 + veor q2, q2, q8 + veor q2, q2, q10 + veor q1,q1,q0 @ Karatsuba post-processing + veor q1,q1,q2 + veor d1,d1,d2 + veor d4,d4,d3 @ Xh|Xl - 256-bit result + + @ 
equivalent of reduction_avx from ghash-x86_64.pl + vshl.i64 q9,q0,#57 @ 1st phase + vshl.i64 q10,q0,#62 + veor q10,q10,q9 @ + vshl.i64 q9,q0,#63 + veor q10, q10, q9 @ + veor d1,d1,d20 @ + veor d4,d4,d21 + + vshr.u64 q10,q0,#1 @ 2nd phase + veor q2,q2,q0 + veor q0,q0,q10 @ + vshr.u64 q10,q10,#6 + vshr.u64 q0,q0,#1 @ + veor q0,q0,q2 @ + veor q0,q0,q10 @ + + subs r3,#16 + bne .Loop_neon + +#ifdef __ARMEL__ + vrev64.8 q0,q0 +#endif + sub r0,#16 + vst1.64 d1,[r0]! @ write out Xi + vst1.64 d0,[r0] + + bx lr @ bx lr +.size gcm_ghash_neon,.-gcm_ghash_neon +#endif +.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 2 +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__ELF__) diff --git a/ring-0.17.14/pregenerated/ghash-neon-armv8-ios64.S b/ring-0.17.14/pregenerated/ghash-neon-armv8-ios64.S new file mode 100644 index 0000000000..790cf2fe39 --- /dev/null +++ b/ring-0.17.14/pregenerated/ghash-neon-armv8-ios64.S @@ -0,0 +1,333 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__) +.text + +.globl _gcm_init_neon +.private_extern _gcm_init_neon + +.align 4 +_gcm_init_neon: + AARCH64_VALID_CALL_TARGET + // This function is adapted from gcm_init_v8. xC2 is t3. + ld1 {v17.2d}, [x1] // load H + movi v19.16b, #0xe1 + shl v19.2d, v19.2d, #57 // 0xc2.0 + ext v3.16b, v17.16b, v17.16b, #8 + ushr v18.2d, v19.2d, #63 + dup v17.4s, v17.s[1] + ext v16.16b, v18.16b, v19.16b, #8 // t0=0xc2....01 + ushr v18.2d, v3.2d, #63 + sshr v17.4s, v17.4s, #31 // broadcast carry bit + and v18.16b, v18.16b, v16.16b + shl v3.2d, v3.2d, #1 + ext v18.16b, v18.16b, v18.16b, #8 + and v16.16b, v16.16b, v17.16b + orr v3.16b, v3.16b, v18.16b // H<<<=1 + eor v5.16b, v3.16b, v16.16b // twisted H + st1 {v5.2d}, [x0] // store Htable[0] + ret + + +.globl _gcm_gmult_neon +.private_extern _gcm_gmult_neon + +.align 4 +_gcm_gmult_neon: + AARCH64_VALID_CALL_TARGET + ld1 {v3.16b}, [x0] // load Xi + ld1 {v5.1d}, [x1], #8 // load twisted H + ld1 {v6.1d}, [x1] + adrp x9, Lmasks@PAGE // load constants + add x9, x9, Lmasks@PAGEOFF + ld1 {v24.2d, v25.2d}, [x9] + rev64 v3.16b, v3.16b // byteswap Xi + ext v3.16b, v3.16b, v3.16b, #8 + eor v7.8b, v5.8b, v6.8b // Karatsuba pre-processing + + mov x3, #16 + b Lgmult_neon + + +.globl _gcm_ghash_neon +.private_extern _gcm_ghash_neon + +.align 4 +_gcm_ghash_neon: + AARCH64_VALID_CALL_TARGET + ld1 {v0.16b}, [x0] // load Xi + ld1 {v5.1d}, [x1], #8 // load twisted H + ld1 {v6.1d}, [x1] + adrp x9, Lmasks@PAGE // load constants + add x9, x9, Lmasks@PAGEOFF + ld1 {v24.2d, v25.2d}, [x9] + rev64 v0.16b, v0.16b // byteswap Xi + ext v0.16b, v0.16b, v0.16b, #8 + eor v7.8b, v5.8b, v6.8b // Karatsuba pre-processing + +Loop_neon: + ld1 {v3.16b}, [x2], #16 // load inp + rev64 v3.16b, v3.16b // byteswap inp + ext v3.16b, v3.16b, v3.16b, #8 + eor v3.16b, v3.16b, v0.16b // inp ^= Xi + +Lgmult_neon: + // Split the input into v3 and v4. (The upper halves are unused, + // so it is okay to leave them alone.) 
+ ins v4.d[0], v3.d[1] + ext v16.8b, v5.8b, v5.8b, #1 // A1 + pmull v16.8h, v16.8b, v3.8b // F = A1*B + ext v0.8b, v3.8b, v3.8b, #1 // B1 + pmull v0.8h, v5.8b, v0.8b // E = A*B1 + ext v17.8b, v5.8b, v5.8b, #2 // A2 + pmull v17.8h, v17.8b, v3.8b // H = A2*B + ext v19.8b, v3.8b, v3.8b, #2 // B2 + pmull v19.8h, v5.8b, v19.8b // G = A*B2 + ext v18.8b, v5.8b, v5.8b, #3 // A3 + eor v16.16b, v16.16b, v0.16b // L = E + F + pmull v18.8h, v18.8b, v3.8b // J = A3*B + ext v0.8b, v3.8b, v3.8b, #3 // B3 + eor v17.16b, v17.16b, v19.16b // M = G + H + pmull v0.8h, v5.8b, v0.8b // I = A*B3 + + // Here we diverge from the 32-bit version. It computes the following + // (instructions reordered for clarity): + // + // veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L) + // vand $t0#hi, $t0#hi, $k48 + // veor $t0#lo, $t0#lo, $t0#hi + // + // veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M) + // vand $t1#hi, $t1#hi, $k32 + // veor $t1#lo, $t1#lo, $t1#hi + // + // veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N) + // vand $t2#hi, $t2#hi, $k16 + // veor $t2#lo, $t2#lo, $t2#hi + // + // veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K) + // vmov.i64 $t3#hi, #0 + // + // $kN is a mask with the bottom N bits set. AArch64 cannot compute on + // upper halves of SIMD registers, so we must split each half into + // separate registers. To compensate, we pair computations up and + // parallelize. + + ext v19.8b, v3.8b, v3.8b, #4 // B4 + eor v18.16b, v18.16b, v0.16b // N = I + J + pmull v19.8h, v5.8b, v19.8b // K = A*B4 + + // This can probably be scheduled more efficiently. For now, we just + // pair up independent instructions. + zip1 v20.2d, v16.2d, v17.2d + zip1 v22.2d, v18.2d, v19.2d + zip2 v21.2d, v16.2d, v17.2d + zip2 v23.2d, v18.2d, v19.2d + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + and v21.16b, v21.16b, v24.16b + and v23.16b, v23.16b, v25.16b + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + zip1 v16.2d, v20.2d, v21.2d + zip1 v18.2d, v22.2d, v23.2d + zip2 v17.2d, v20.2d, v21.2d + zip2 v19.2d, v22.2d, v23.2d + + ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8 + ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16 + pmull v0.8h, v5.8b, v3.8b // D = A*B + ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32 + ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24 + eor v16.16b, v16.16b, v17.16b + eor v18.16b, v18.16b, v19.16b + eor v0.16b, v0.16b, v16.16b + eor v0.16b, v0.16b, v18.16b + eor v3.8b, v3.8b, v4.8b // Karatsuba pre-processing + ext v16.8b, v7.8b, v7.8b, #1 // A1 + pmull v16.8h, v16.8b, v3.8b // F = A1*B + ext v1.8b, v3.8b, v3.8b, #1 // B1 + pmull v1.8h, v7.8b, v1.8b // E = A*B1 + ext v17.8b, v7.8b, v7.8b, #2 // A2 + pmull v17.8h, v17.8b, v3.8b // H = A2*B + ext v19.8b, v3.8b, v3.8b, #2 // B2 + pmull v19.8h, v7.8b, v19.8b // G = A*B2 + ext v18.8b, v7.8b, v7.8b, #3 // A3 + eor v16.16b, v16.16b, v1.16b // L = E + F + pmull v18.8h, v18.8b, v3.8b // J = A3*B + ext v1.8b, v3.8b, v3.8b, #3 // B3 + eor v17.16b, v17.16b, v19.16b // M = G + H + pmull v1.8h, v7.8b, v1.8b // I = A*B3 + + // Here we diverge from the 32-bit version. 
It computes the following + // (instructions reordered for clarity): + // + // veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L) + // vand $t0#hi, $t0#hi, $k48 + // veor $t0#lo, $t0#lo, $t0#hi + // + // veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M) + // vand $t1#hi, $t1#hi, $k32 + // veor $t1#lo, $t1#lo, $t1#hi + // + // veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N) + // vand $t2#hi, $t2#hi, $k16 + // veor $t2#lo, $t2#lo, $t2#hi + // + // veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K) + // vmov.i64 $t3#hi, #0 + // + // $kN is a mask with the bottom N bits set. AArch64 cannot compute on + // upper halves of SIMD registers, so we must split each half into + // separate registers. To compensate, we pair computations up and + // parallelize. + + ext v19.8b, v3.8b, v3.8b, #4 // B4 + eor v18.16b, v18.16b, v1.16b // N = I + J + pmull v19.8h, v7.8b, v19.8b // K = A*B4 + + // This can probably be scheduled more efficiently. For now, we just + // pair up independent instructions. + zip1 v20.2d, v16.2d, v17.2d + zip1 v22.2d, v18.2d, v19.2d + zip2 v21.2d, v16.2d, v17.2d + zip2 v23.2d, v18.2d, v19.2d + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + and v21.16b, v21.16b, v24.16b + and v23.16b, v23.16b, v25.16b + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + zip1 v16.2d, v20.2d, v21.2d + zip1 v18.2d, v22.2d, v23.2d + zip2 v17.2d, v20.2d, v21.2d + zip2 v19.2d, v22.2d, v23.2d + + ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8 + ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16 + pmull v1.8h, v7.8b, v3.8b // D = A*B + ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32 + ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24 + eor v16.16b, v16.16b, v17.16b + eor v18.16b, v18.16b, v19.16b + eor v1.16b, v1.16b, v16.16b + eor v1.16b, v1.16b, v18.16b + ext v16.8b, v6.8b, v6.8b, #1 // A1 + pmull v16.8h, v16.8b, v4.8b // F = A1*B + ext v2.8b, v4.8b, v4.8b, #1 // B1 + pmull v2.8h, v6.8b, v2.8b // E = A*B1 + ext v17.8b, v6.8b, v6.8b, #2 // A2 + pmull v17.8h, v17.8b, v4.8b // H = A2*B + ext v19.8b, v4.8b, v4.8b, #2 // B2 + pmull v19.8h, v6.8b, v19.8b // G = A*B2 + ext v18.8b, v6.8b, v6.8b, #3 // A3 + eor v16.16b, v16.16b, v2.16b // L = E + F + pmull v18.8h, v18.8b, v4.8b // J = A3*B + ext v2.8b, v4.8b, v4.8b, #3 // B3 + eor v17.16b, v17.16b, v19.16b // M = G + H + pmull v2.8h, v6.8b, v2.8b // I = A*B3 + + // Here we diverge from the 32-bit version. It computes the following + // (instructions reordered for clarity): + // + // veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L) + // vand $t0#hi, $t0#hi, $k48 + // veor $t0#lo, $t0#lo, $t0#hi + // + // veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M) + // vand $t1#hi, $t1#hi, $k32 + // veor $t1#lo, $t1#lo, $t1#hi + // + // veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N) + // vand $t2#hi, $t2#hi, $k16 + // veor $t2#lo, $t2#lo, $t2#hi + // + // veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K) + // vmov.i64 $t3#hi, #0 + // + // $kN is a mask with the bottom N bits set. AArch64 cannot compute on + // upper halves of SIMD registers, so we must split each half into + // separate registers. To compensate, we pair computations up and + // parallelize. + + ext v19.8b, v4.8b, v4.8b, #4 // B4 + eor v18.16b, v18.16b, v2.16b // N = I + J + pmull v19.8h, v6.8b, v19.8b // K = A*B4 + + // This can probably be scheduled more efficiently. For now, we just + // pair up independent instructions. 
+ zip1 v20.2d, v16.2d, v17.2d + zip1 v22.2d, v18.2d, v19.2d + zip2 v21.2d, v16.2d, v17.2d + zip2 v23.2d, v18.2d, v19.2d + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + and v21.16b, v21.16b, v24.16b + and v23.16b, v23.16b, v25.16b + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + zip1 v16.2d, v20.2d, v21.2d + zip1 v18.2d, v22.2d, v23.2d + zip2 v17.2d, v20.2d, v21.2d + zip2 v19.2d, v22.2d, v23.2d + + ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8 + ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16 + pmull v2.8h, v6.8b, v4.8b // D = A*B + ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32 + ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24 + eor v16.16b, v16.16b, v17.16b + eor v18.16b, v18.16b, v19.16b + eor v2.16b, v2.16b, v16.16b + eor v2.16b, v2.16b, v18.16b + ext v16.16b, v0.16b, v2.16b, #8 + eor v1.16b, v1.16b, v0.16b // Karatsuba post-processing + eor v1.16b, v1.16b, v2.16b + eor v1.16b, v1.16b, v16.16b // Xm overlaps Xh.lo and Xl.hi + ins v0.d[1], v1.d[0] // Xh|Xl - 256-bit result + // This is a no-op due to the ins instruction below. + // ins v2.d[0], v1.d[1] + + // equivalent of reduction_avx from ghash-x86_64.pl + shl v17.2d, v0.2d, #57 // 1st phase + shl v18.2d, v0.2d, #62 + eor v18.16b, v18.16b, v17.16b // + shl v17.2d, v0.2d, #63 + eor v18.16b, v18.16b, v17.16b // + // Note Xm contains {Xl.d[1], Xh.d[0]}. + eor v18.16b, v18.16b, v1.16b + ins v0.d[1], v18.d[0] // Xl.d[1] ^= t2.d[0] + ins v2.d[0], v18.d[1] // Xh.d[0] ^= t2.d[1] + + ushr v18.2d, v0.2d, #1 // 2nd phase + eor v2.16b, v2.16b,v0.16b + eor v0.16b, v0.16b,v18.16b // + ushr v18.2d, v18.2d, #6 + ushr v0.2d, v0.2d, #1 // + eor v0.16b, v0.16b, v2.16b // + eor v0.16b, v0.16b, v18.16b // + + subs x3, x3, #16 + bne Loop_neon + + rev64 v0.16b, v0.16b // byteswap Xi and write + ext v0.16b, v0.16b, v0.16b, #8 + st1 {v0.16b}, [x0] + + ret + + +.section __TEXT,__const +.align 4 +Lmasks: +.quad 0x0000ffffffffffff // k48 +.quad 0x00000000ffffffff // k32 +.quad 0x000000000000ffff // k16 +.quad 0x0000000000000000 // k0 +.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,100,101,114,105,118,101,100,32,102,114,111,109,32,65,82,77,118,52,32,118,101,114,115,105,111,110,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 2 +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__) diff --git a/ring-0.17.14/pregenerated/ghash-neon-armv8-linux64.S b/ring-0.17.14/pregenerated/ghash-neon-armv8-linux64.S new file mode 100644 index 0000000000..7ddf782779 --- /dev/null +++ b/ring-0.17.14/pregenerated/ghash-neon-armv8-linux64.S @@ -0,0 +1,333 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__) +.text + +.globl gcm_init_neon +.hidden gcm_init_neon +.type gcm_init_neon,%function +.align 4 +gcm_init_neon: + AARCH64_VALID_CALL_TARGET + // This function is adapted from gcm_init_v8. xC2 is t3. 
+ ld1 {v17.2d}, [x1] // load H + movi v19.16b, #0xe1 + shl v19.2d, v19.2d, #57 // 0xc2.0 + ext v3.16b, v17.16b, v17.16b, #8 + ushr v18.2d, v19.2d, #63 + dup v17.4s, v17.s[1] + ext v16.16b, v18.16b, v19.16b, #8 // t0=0xc2....01 + ushr v18.2d, v3.2d, #63 + sshr v17.4s, v17.4s, #31 // broadcast carry bit + and v18.16b, v18.16b, v16.16b + shl v3.2d, v3.2d, #1 + ext v18.16b, v18.16b, v18.16b, #8 + and v16.16b, v16.16b, v17.16b + orr v3.16b, v3.16b, v18.16b // H<<<=1 + eor v5.16b, v3.16b, v16.16b // twisted H + st1 {v5.2d}, [x0] // store Htable[0] + ret +.size gcm_init_neon,.-gcm_init_neon + +.globl gcm_gmult_neon +.hidden gcm_gmult_neon +.type gcm_gmult_neon,%function +.align 4 +gcm_gmult_neon: + AARCH64_VALID_CALL_TARGET + ld1 {v3.16b}, [x0] // load Xi + ld1 {v5.1d}, [x1], #8 // load twisted H + ld1 {v6.1d}, [x1] + adrp x9, .Lmasks // load constants + add x9, x9, :lo12:.Lmasks + ld1 {v24.2d, v25.2d}, [x9] + rev64 v3.16b, v3.16b // byteswap Xi + ext v3.16b, v3.16b, v3.16b, #8 + eor v7.8b, v5.8b, v6.8b // Karatsuba pre-processing + + mov x3, #16 + b .Lgmult_neon +.size gcm_gmult_neon,.-gcm_gmult_neon + +.globl gcm_ghash_neon +.hidden gcm_ghash_neon +.type gcm_ghash_neon,%function +.align 4 +gcm_ghash_neon: + AARCH64_VALID_CALL_TARGET + ld1 {v0.16b}, [x0] // load Xi + ld1 {v5.1d}, [x1], #8 // load twisted H + ld1 {v6.1d}, [x1] + adrp x9, .Lmasks // load constants + add x9, x9, :lo12:.Lmasks + ld1 {v24.2d, v25.2d}, [x9] + rev64 v0.16b, v0.16b // byteswap Xi + ext v0.16b, v0.16b, v0.16b, #8 + eor v7.8b, v5.8b, v6.8b // Karatsuba pre-processing + +.Loop_neon: + ld1 {v3.16b}, [x2], #16 // load inp + rev64 v3.16b, v3.16b // byteswap inp + ext v3.16b, v3.16b, v3.16b, #8 + eor v3.16b, v3.16b, v0.16b // inp ^= Xi + +.Lgmult_neon: + // Split the input into v3 and v4. (The upper halves are unused, + // so it is okay to leave them alone.) + ins v4.d[0], v3.d[1] + ext v16.8b, v5.8b, v5.8b, #1 // A1 + pmull v16.8h, v16.8b, v3.8b // F = A1*B + ext v0.8b, v3.8b, v3.8b, #1 // B1 + pmull v0.8h, v5.8b, v0.8b // E = A*B1 + ext v17.8b, v5.8b, v5.8b, #2 // A2 + pmull v17.8h, v17.8b, v3.8b // H = A2*B + ext v19.8b, v3.8b, v3.8b, #2 // B2 + pmull v19.8h, v5.8b, v19.8b // G = A*B2 + ext v18.8b, v5.8b, v5.8b, #3 // A3 + eor v16.16b, v16.16b, v0.16b // L = E + F + pmull v18.8h, v18.8b, v3.8b // J = A3*B + ext v0.8b, v3.8b, v3.8b, #3 // B3 + eor v17.16b, v17.16b, v19.16b // M = G + H + pmull v0.8h, v5.8b, v0.8b // I = A*B3 + + // Here we diverge from the 32-bit version. It computes the following + // (instructions reordered for clarity): + // + // veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L) + // vand $t0#hi, $t0#hi, $k48 + // veor $t0#lo, $t0#lo, $t0#hi + // + // veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M) + // vand $t1#hi, $t1#hi, $k32 + // veor $t1#lo, $t1#lo, $t1#hi + // + // veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N) + // vand $t2#hi, $t2#hi, $k16 + // veor $t2#lo, $t2#lo, $t2#hi + // + // veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K) + // vmov.i64 $t3#hi, #0 + // + // $kN is a mask with the bottom N bits set. AArch64 cannot compute on + // upper halves of SIMD registers, so we must split each half into + // separate registers. To compensate, we pair computations up and + // parallelize. + + ext v19.8b, v3.8b, v3.8b, #4 // B4 + eor v18.16b, v18.16b, v0.16b // N = I + J + pmull v19.8h, v5.8b, v19.8b // K = A*B4 + + // This can probably be scheduled more efficiently. For now, we just + // pair up independent instructions. 
+ zip1 v20.2d, v16.2d, v17.2d + zip1 v22.2d, v18.2d, v19.2d + zip2 v21.2d, v16.2d, v17.2d + zip2 v23.2d, v18.2d, v19.2d + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + and v21.16b, v21.16b, v24.16b + and v23.16b, v23.16b, v25.16b + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + zip1 v16.2d, v20.2d, v21.2d + zip1 v18.2d, v22.2d, v23.2d + zip2 v17.2d, v20.2d, v21.2d + zip2 v19.2d, v22.2d, v23.2d + + ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8 + ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16 + pmull v0.8h, v5.8b, v3.8b // D = A*B + ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32 + ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24 + eor v16.16b, v16.16b, v17.16b + eor v18.16b, v18.16b, v19.16b + eor v0.16b, v0.16b, v16.16b + eor v0.16b, v0.16b, v18.16b + eor v3.8b, v3.8b, v4.8b // Karatsuba pre-processing + ext v16.8b, v7.8b, v7.8b, #1 // A1 + pmull v16.8h, v16.8b, v3.8b // F = A1*B + ext v1.8b, v3.8b, v3.8b, #1 // B1 + pmull v1.8h, v7.8b, v1.8b // E = A*B1 + ext v17.8b, v7.8b, v7.8b, #2 // A2 + pmull v17.8h, v17.8b, v3.8b // H = A2*B + ext v19.8b, v3.8b, v3.8b, #2 // B2 + pmull v19.8h, v7.8b, v19.8b // G = A*B2 + ext v18.8b, v7.8b, v7.8b, #3 // A3 + eor v16.16b, v16.16b, v1.16b // L = E + F + pmull v18.8h, v18.8b, v3.8b // J = A3*B + ext v1.8b, v3.8b, v3.8b, #3 // B3 + eor v17.16b, v17.16b, v19.16b // M = G + H + pmull v1.8h, v7.8b, v1.8b // I = A*B3 + + // Here we diverge from the 32-bit version. It computes the following + // (instructions reordered for clarity): + // + // veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L) + // vand $t0#hi, $t0#hi, $k48 + // veor $t0#lo, $t0#lo, $t0#hi + // + // veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M) + // vand $t1#hi, $t1#hi, $k32 + // veor $t1#lo, $t1#lo, $t1#hi + // + // veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N) + // vand $t2#hi, $t2#hi, $k16 + // veor $t2#lo, $t2#lo, $t2#hi + // + // veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K) + // vmov.i64 $t3#hi, #0 + // + // $kN is a mask with the bottom N bits set. AArch64 cannot compute on + // upper halves of SIMD registers, so we must split each half into + // separate registers. To compensate, we pair computations up and + // parallelize. + + ext v19.8b, v3.8b, v3.8b, #4 // B4 + eor v18.16b, v18.16b, v1.16b // N = I + J + pmull v19.8h, v7.8b, v19.8b // K = A*B4 + + // This can probably be scheduled more efficiently. For now, we just + // pair up independent instructions. 
+ zip1 v20.2d, v16.2d, v17.2d + zip1 v22.2d, v18.2d, v19.2d + zip2 v21.2d, v16.2d, v17.2d + zip2 v23.2d, v18.2d, v19.2d + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + and v21.16b, v21.16b, v24.16b + and v23.16b, v23.16b, v25.16b + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + zip1 v16.2d, v20.2d, v21.2d + zip1 v18.2d, v22.2d, v23.2d + zip2 v17.2d, v20.2d, v21.2d + zip2 v19.2d, v22.2d, v23.2d + + ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8 + ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16 + pmull v1.8h, v7.8b, v3.8b // D = A*B + ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32 + ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24 + eor v16.16b, v16.16b, v17.16b + eor v18.16b, v18.16b, v19.16b + eor v1.16b, v1.16b, v16.16b + eor v1.16b, v1.16b, v18.16b + ext v16.8b, v6.8b, v6.8b, #1 // A1 + pmull v16.8h, v16.8b, v4.8b // F = A1*B + ext v2.8b, v4.8b, v4.8b, #1 // B1 + pmull v2.8h, v6.8b, v2.8b // E = A*B1 + ext v17.8b, v6.8b, v6.8b, #2 // A2 + pmull v17.8h, v17.8b, v4.8b // H = A2*B + ext v19.8b, v4.8b, v4.8b, #2 // B2 + pmull v19.8h, v6.8b, v19.8b // G = A*B2 + ext v18.8b, v6.8b, v6.8b, #3 // A3 + eor v16.16b, v16.16b, v2.16b // L = E + F + pmull v18.8h, v18.8b, v4.8b // J = A3*B + ext v2.8b, v4.8b, v4.8b, #3 // B3 + eor v17.16b, v17.16b, v19.16b // M = G + H + pmull v2.8h, v6.8b, v2.8b // I = A*B3 + + // Here we diverge from the 32-bit version. It computes the following + // (instructions reordered for clarity): + // + // veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L) + // vand $t0#hi, $t0#hi, $k48 + // veor $t0#lo, $t0#lo, $t0#hi + // + // veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M) + // vand $t1#hi, $t1#hi, $k32 + // veor $t1#lo, $t1#lo, $t1#hi + // + // veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N) + // vand $t2#hi, $t2#hi, $k16 + // veor $t2#lo, $t2#lo, $t2#hi + // + // veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K) + // vmov.i64 $t3#hi, #0 + // + // $kN is a mask with the bottom N bits set. AArch64 cannot compute on + // upper halves of SIMD registers, so we must split each half into + // separate registers. To compensate, we pair computations up and + // parallelize. + + ext v19.8b, v4.8b, v4.8b, #4 // B4 + eor v18.16b, v18.16b, v2.16b // N = I + J + pmull v19.8h, v6.8b, v19.8b // K = A*B4 + + // This can probably be scheduled more efficiently. For now, we just + // pair up independent instructions. + zip1 v20.2d, v16.2d, v17.2d + zip1 v22.2d, v18.2d, v19.2d + zip2 v21.2d, v16.2d, v17.2d + zip2 v23.2d, v18.2d, v19.2d + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + and v21.16b, v21.16b, v24.16b + and v23.16b, v23.16b, v25.16b + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + zip1 v16.2d, v20.2d, v21.2d + zip1 v18.2d, v22.2d, v23.2d + zip2 v17.2d, v20.2d, v21.2d + zip2 v19.2d, v22.2d, v23.2d + + ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8 + ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16 + pmull v2.8h, v6.8b, v4.8b // D = A*B + ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32 + ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24 + eor v16.16b, v16.16b, v17.16b + eor v18.16b, v18.16b, v19.16b + eor v2.16b, v2.16b, v16.16b + eor v2.16b, v2.16b, v18.16b + ext v16.16b, v0.16b, v2.16b, #8 + eor v1.16b, v1.16b, v0.16b // Karatsuba post-processing + eor v1.16b, v1.16b, v2.16b + eor v1.16b, v1.16b, v16.16b // Xm overlaps Xh.lo and Xl.hi + ins v0.d[1], v1.d[0] // Xh|Xl - 256-bit result + // This is a no-op due to the ins instruction below. 
+ // ins v2.d[0], v1.d[1] + + // equivalent of reduction_avx from ghash-x86_64.pl + shl v17.2d, v0.2d, #57 // 1st phase + shl v18.2d, v0.2d, #62 + eor v18.16b, v18.16b, v17.16b // + shl v17.2d, v0.2d, #63 + eor v18.16b, v18.16b, v17.16b // + // Note Xm contains {Xl.d[1], Xh.d[0]}. + eor v18.16b, v18.16b, v1.16b + ins v0.d[1], v18.d[0] // Xl.d[1] ^= t2.d[0] + ins v2.d[0], v18.d[1] // Xh.d[0] ^= t2.d[1] + + ushr v18.2d, v0.2d, #1 // 2nd phase + eor v2.16b, v2.16b,v0.16b + eor v0.16b, v0.16b,v18.16b // + ushr v18.2d, v18.2d, #6 + ushr v0.2d, v0.2d, #1 // + eor v0.16b, v0.16b, v2.16b // + eor v0.16b, v0.16b, v18.16b // + + subs x3, x3, #16 + bne .Loop_neon + + rev64 v0.16b, v0.16b // byteswap Xi and write + ext v0.16b, v0.16b, v0.16b, #8 + st1 {v0.16b}, [x0] + + ret +.size gcm_ghash_neon,.-gcm_ghash_neon + +.section .rodata +.align 4 +.Lmasks: +.quad 0x0000ffffffffffff // k48 +.quad 0x00000000ffffffff // k32 +.quad 0x000000000000ffff // k16 +.quad 0x0000000000000000 // k0 +.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,100,101,114,105,118,101,100,32,102,114,111,109,32,65,82,77,118,52,32,118,101,114,115,105,111,110,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 2 +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__) diff --git a/ring-0.17.14/pregenerated/ghash-neon-armv8-win64.S b/ring-0.17.14/pregenerated/ghash-neon-armv8-win64.S new file mode 100644 index 0000000000..db3d424f11 --- /dev/null +++ b/ring-0.17.14/pregenerated/ghash-neon-armv8-win64.S @@ -0,0 +1,339 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32) +.text + +.globl gcm_init_neon + +.def gcm_init_neon + .type 32 +.endef +.align 4 +gcm_init_neon: + AARCH64_VALID_CALL_TARGET + // This function is adapted from gcm_init_v8. xC2 is t3. 
+ ld1 {v17.2d}, [x1] // load H + movi v19.16b, #0xe1 + shl v19.2d, v19.2d, #57 // 0xc2.0 + ext v3.16b, v17.16b, v17.16b, #8 + ushr v18.2d, v19.2d, #63 + dup v17.4s, v17.s[1] + ext v16.16b, v18.16b, v19.16b, #8 // t0=0xc2....01 + ushr v18.2d, v3.2d, #63 + sshr v17.4s, v17.4s, #31 // broadcast carry bit + and v18.16b, v18.16b, v16.16b + shl v3.2d, v3.2d, #1 + ext v18.16b, v18.16b, v18.16b, #8 + and v16.16b, v16.16b, v17.16b + orr v3.16b, v3.16b, v18.16b // H<<<=1 + eor v5.16b, v3.16b, v16.16b // twisted H + st1 {v5.2d}, [x0] // store Htable[0] + ret + + +.globl gcm_gmult_neon + +.def gcm_gmult_neon + .type 32 +.endef +.align 4 +gcm_gmult_neon: + AARCH64_VALID_CALL_TARGET + ld1 {v3.16b}, [x0] // load Xi + ld1 {v5.1d}, [x1], #8 // load twisted H + ld1 {v6.1d}, [x1] + adrp x9, Lmasks // load constants + add x9, x9, :lo12:Lmasks + ld1 {v24.2d, v25.2d}, [x9] + rev64 v3.16b, v3.16b // byteswap Xi + ext v3.16b, v3.16b, v3.16b, #8 + eor v7.8b, v5.8b, v6.8b // Karatsuba pre-processing + + mov x3, #16 + b Lgmult_neon + + +.globl gcm_ghash_neon + +.def gcm_ghash_neon + .type 32 +.endef +.align 4 +gcm_ghash_neon: + AARCH64_VALID_CALL_TARGET + ld1 {v0.16b}, [x0] // load Xi + ld1 {v5.1d}, [x1], #8 // load twisted H + ld1 {v6.1d}, [x1] + adrp x9, Lmasks // load constants + add x9, x9, :lo12:Lmasks + ld1 {v24.2d, v25.2d}, [x9] + rev64 v0.16b, v0.16b // byteswap Xi + ext v0.16b, v0.16b, v0.16b, #8 + eor v7.8b, v5.8b, v6.8b // Karatsuba pre-processing + +Loop_neon: + ld1 {v3.16b}, [x2], #16 // load inp + rev64 v3.16b, v3.16b // byteswap inp + ext v3.16b, v3.16b, v3.16b, #8 + eor v3.16b, v3.16b, v0.16b // inp ^= Xi + +Lgmult_neon: + // Split the input into v3 and v4. (The upper halves are unused, + // so it is okay to leave them alone.) + ins v4.d[0], v3.d[1] + ext v16.8b, v5.8b, v5.8b, #1 // A1 + pmull v16.8h, v16.8b, v3.8b // F = A1*B + ext v0.8b, v3.8b, v3.8b, #1 // B1 + pmull v0.8h, v5.8b, v0.8b // E = A*B1 + ext v17.8b, v5.8b, v5.8b, #2 // A2 + pmull v17.8h, v17.8b, v3.8b // H = A2*B + ext v19.8b, v3.8b, v3.8b, #2 // B2 + pmull v19.8h, v5.8b, v19.8b // G = A*B2 + ext v18.8b, v5.8b, v5.8b, #3 // A3 + eor v16.16b, v16.16b, v0.16b // L = E + F + pmull v18.8h, v18.8b, v3.8b // J = A3*B + ext v0.8b, v3.8b, v3.8b, #3 // B3 + eor v17.16b, v17.16b, v19.16b // M = G + H + pmull v0.8h, v5.8b, v0.8b // I = A*B3 + + // Here we diverge from the 32-bit version. It computes the following + // (instructions reordered for clarity): + // + // veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L) + // vand $t0#hi, $t0#hi, $k48 + // veor $t0#lo, $t0#lo, $t0#hi + // + // veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M) + // vand $t1#hi, $t1#hi, $k32 + // veor $t1#lo, $t1#lo, $t1#hi + // + // veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N) + // vand $t2#hi, $t2#hi, $k16 + // veor $t2#lo, $t2#lo, $t2#hi + // + // veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K) + // vmov.i64 $t3#hi, #0 + // + // $kN is a mask with the bottom N bits set. AArch64 cannot compute on + // upper halves of SIMD registers, so we must split each half into + // separate registers. To compensate, we pair computations up and + // parallelize. + + ext v19.8b, v3.8b, v3.8b, #4 // B4 + eor v18.16b, v18.16b, v0.16b // N = I + J + pmull v19.8h, v5.8b, v19.8b // K = A*B4 + + // This can probably be scheduled more efficiently. For now, we just + // pair up independent instructions. 
+ zip1 v20.2d, v16.2d, v17.2d + zip1 v22.2d, v18.2d, v19.2d + zip2 v21.2d, v16.2d, v17.2d + zip2 v23.2d, v18.2d, v19.2d + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + and v21.16b, v21.16b, v24.16b + and v23.16b, v23.16b, v25.16b + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + zip1 v16.2d, v20.2d, v21.2d + zip1 v18.2d, v22.2d, v23.2d + zip2 v17.2d, v20.2d, v21.2d + zip2 v19.2d, v22.2d, v23.2d + + ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8 + ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16 + pmull v0.8h, v5.8b, v3.8b // D = A*B + ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32 + ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24 + eor v16.16b, v16.16b, v17.16b + eor v18.16b, v18.16b, v19.16b + eor v0.16b, v0.16b, v16.16b + eor v0.16b, v0.16b, v18.16b + eor v3.8b, v3.8b, v4.8b // Karatsuba pre-processing + ext v16.8b, v7.8b, v7.8b, #1 // A1 + pmull v16.8h, v16.8b, v3.8b // F = A1*B + ext v1.8b, v3.8b, v3.8b, #1 // B1 + pmull v1.8h, v7.8b, v1.8b // E = A*B1 + ext v17.8b, v7.8b, v7.8b, #2 // A2 + pmull v17.8h, v17.8b, v3.8b // H = A2*B + ext v19.8b, v3.8b, v3.8b, #2 // B2 + pmull v19.8h, v7.8b, v19.8b // G = A*B2 + ext v18.8b, v7.8b, v7.8b, #3 // A3 + eor v16.16b, v16.16b, v1.16b // L = E + F + pmull v18.8h, v18.8b, v3.8b // J = A3*B + ext v1.8b, v3.8b, v3.8b, #3 // B3 + eor v17.16b, v17.16b, v19.16b // M = G + H + pmull v1.8h, v7.8b, v1.8b // I = A*B3 + + // Here we diverge from the 32-bit version. It computes the following + // (instructions reordered for clarity): + // + // veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L) + // vand $t0#hi, $t0#hi, $k48 + // veor $t0#lo, $t0#lo, $t0#hi + // + // veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M) + // vand $t1#hi, $t1#hi, $k32 + // veor $t1#lo, $t1#lo, $t1#hi + // + // veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N) + // vand $t2#hi, $t2#hi, $k16 + // veor $t2#lo, $t2#lo, $t2#hi + // + // veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K) + // vmov.i64 $t3#hi, #0 + // + // $kN is a mask with the bottom N bits set. AArch64 cannot compute on + // upper halves of SIMD registers, so we must split each half into + // separate registers. To compensate, we pair computations up and + // parallelize. + + ext v19.8b, v3.8b, v3.8b, #4 // B4 + eor v18.16b, v18.16b, v1.16b // N = I + J + pmull v19.8h, v7.8b, v19.8b // K = A*B4 + + // This can probably be scheduled more efficiently. For now, we just + // pair up independent instructions. 
+ zip1 v20.2d, v16.2d, v17.2d + zip1 v22.2d, v18.2d, v19.2d + zip2 v21.2d, v16.2d, v17.2d + zip2 v23.2d, v18.2d, v19.2d + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + and v21.16b, v21.16b, v24.16b + and v23.16b, v23.16b, v25.16b + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + zip1 v16.2d, v20.2d, v21.2d + zip1 v18.2d, v22.2d, v23.2d + zip2 v17.2d, v20.2d, v21.2d + zip2 v19.2d, v22.2d, v23.2d + + ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8 + ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16 + pmull v1.8h, v7.8b, v3.8b // D = A*B + ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32 + ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24 + eor v16.16b, v16.16b, v17.16b + eor v18.16b, v18.16b, v19.16b + eor v1.16b, v1.16b, v16.16b + eor v1.16b, v1.16b, v18.16b + ext v16.8b, v6.8b, v6.8b, #1 // A1 + pmull v16.8h, v16.8b, v4.8b // F = A1*B + ext v2.8b, v4.8b, v4.8b, #1 // B1 + pmull v2.8h, v6.8b, v2.8b // E = A*B1 + ext v17.8b, v6.8b, v6.8b, #2 // A2 + pmull v17.8h, v17.8b, v4.8b // H = A2*B + ext v19.8b, v4.8b, v4.8b, #2 // B2 + pmull v19.8h, v6.8b, v19.8b // G = A*B2 + ext v18.8b, v6.8b, v6.8b, #3 // A3 + eor v16.16b, v16.16b, v2.16b // L = E + F + pmull v18.8h, v18.8b, v4.8b // J = A3*B + ext v2.8b, v4.8b, v4.8b, #3 // B3 + eor v17.16b, v17.16b, v19.16b // M = G + H + pmull v2.8h, v6.8b, v2.8b // I = A*B3 + + // Here we diverge from the 32-bit version. It computes the following + // (instructions reordered for clarity): + // + // veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L) + // vand $t0#hi, $t0#hi, $k48 + // veor $t0#lo, $t0#lo, $t0#hi + // + // veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M) + // vand $t1#hi, $t1#hi, $k32 + // veor $t1#lo, $t1#lo, $t1#hi + // + // veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N) + // vand $t2#hi, $t2#hi, $k16 + // veor $t2#lo, $t2#lo, $t2#hi + // + // veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K) + // vmov.i64 $t3#hi, #0 + // + // $kN is a mask with the bottom N bits set. AArch64 cannot compute on + // upper halves of SIMD registers, so we must split each half into + // separate registers. To compensate, we pair computations up and + // parallelize. + + ext v19.8b, v4.8b, v4.8b, #4 // B4 + eor v18.16b, v18.16b, v2.16b // N = I + J + pmull v19.8h, v6.8b, v19.8b // K = A*B4 + + // This can probably be scheduled more efficiently. For now, we just + // pair up independent instructions. + zip1 v20.2d, v16.2d, v17.2d + zip1 v22.2d, v18.2d, v19.2d + zip2 v21.2d, v16.2d, v17.2d + zip2 v23.2d, v18.2d, v19.2d + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + and v21.16b, v21.16b, v24.16b + and v23.16b, v23.16b, v25.16b + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + zip1 v16.2d, v20.2d, v21.2d + zip1 v18.2d, v22.2d, v23.2d + zip2 v17.2d, v20.2d, v21.2d + zip2 v19.2d, v22.2d, v23.2d + + ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8 + ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16 + pmull v2.8h, v6.8b, v4.8b // D = A*B + ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32 + ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24 + eor v16.16b, v16.16b, v17.16b + eor v18.16b, v18.16b, v19.16b + eor v2.16b, v2.16b, v16.16b + eor v2.16b, v2.16b, v18.16b + ext v16.16b, v0.16b, v2.16b, #8 + eor v1.16b, v1.16b, v0.16b // Karatsuba post-processing + eor v1.16b, v1.16b, v2.16b + eor v1.16b, v1.16b, v16.16b // Xm overlaps Xh.lo and Xl.hi + ins v0.d[1], v1.d[0] // Xh|Xl - 256-bit result + // This is a no-op due to the ins instruction below. 
+ // ins v2.d[0], v1.d[1] + + // equivalent of reduction_avx from ghash-x86_64.pl + shl v17.2d, v0.2d, #57 // 1st phase + shl v18.2d, v0.2d, #62 + eor v18.16b, v18.16b, v17.16b // + shl v17.2d, v0.2d, #63 + eor v18.16b, v18.16b, v17.16b // + // Note Xm contains {Xl.d[1], Xh.d[0]}. + eor v18.16b, v18.16b, v1.16b + ins v0.d[1], v18.d[0] // Xl.d[1] ^= t2.d[0] + ins v2.d[0], v18.d[1] // Xh.d[0] ^= t2.d[1] + + ushr v18.2d, v0.2d, #1 // 2nd phase + eor v2.16b, v2.16b,v0.16b + eor v0.16b, v0.16b,v18.16b // + ushr v18.2d, v18.2d, #6 + ushr v0.2d, v0.2d, #1 // + eor v0.16b, v0.16b, v2.16b // + eor v0.16b, v0.16b, v18.16b // + + subs x3, x3, #16 + bne Loop_neon + + rev64 v0.16b, v0.16b // byteswap Xi and write + ext v0.16b, v0.16b, v0.16b, #8 + st1 {v0.16b}, [x0] + + ret + + +.section .rodata +.align 4 +Lmasks: +.quad 0x0000ffffffffffff // k48 +.quad 0x00000000ffffffff // k32 +.quad 0x000000000000ffff // k16 +.quad 0x0000000000000000 // k0 +.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,100,101,114,105,118,101,100,32,102,114,111,109,32,65,82,77,118,52,32,118,101,114,115,105,111,110,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 2 +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32) diff --git a/ring-0.17.14/pregenerated/ghash-x86-elf.S b/ring-0.17.14/pregenerated/ghash-x86-elf.S new file mode 100644 index 0000000000..6c1753d421 --- /dev/null +++ b/ring-0.17.14/pregenerated/ghash-x86-elf.S @@ -0,0 +1,274 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__) +.text +.globl gcm_init_clmul +.hidden gcm_init_clmul +.type gcm_init_clmul,@function +.align 16 +gcm_init_clmul: +.L_gcm_init_clmul_begin: + movl 4(%esp),%edx + movl 8(%esp),%eax + call .L000pic +.L000pic: + popl %ecx + leal .Lbswap-.L000pic(%ecx),%ecx + movdqu (%eax),%xmm2 + pshufd $78,%xmm2,%xmm2 + pshufd $255,%xmm2,%xmm4 + movdqa %xmm2,%xmm3 + psllq $1,%xmm2 + pxor %xmm5,%xmm5 + psrlq $63,%xmm3 + pcmpgtd %xmm4,%xmm5 + pslldq $8,%xmm3 + por %xmm3,%xmm2 + pand 16(%ecx),%xmm5 + pxor %xmm5,%xmm2 + movdqa %xmm2,%xmm0 + movdqa %xmm0,%xmm1 + pshufd $78,%xmm0,%xmm3 + pshufd $78,%xmm2,%xmm4 + pxor %xmm0,%xmm3 + pxor %xmm2,%xmm4 +.byte 102,15,58,68,194,0 +.byte 102,15,58,68,202,17 +.byte 102,15,58,68,220,0 + xorps %xmm0,%xmm3 + xorps %xmm1,%xmm3 + movdqa %xmm3,%xmm4 + psrldq $8,%xmm3 + pslldq $8,%xmm4 + pxor %xmm3,%xmm1 + pxor %xmm4,%xmm0 + movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 + psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 + pxor %xmm3,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 + movdqa %xmm0,%xmm4 + psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 + pshufd $78,%xmm2,%xmm3 + pshufd $78,%xmm0,%xmm4 + pxor %xmm2,%xmm3 + movdqu %xmm2,(%edx) + pxor %xmm0,%xmm4 + movdqu %xmm0,16(%edx) +.byte 102,15,58,15,227,8 + movdqu %xmm4,32(%edx) + ret +.size gcm_init_clmul,.-.L_gcm_init_clmul_begin +.globl gcm_ghash_clmul +.hidden gcm_ghash_clmul +.type gcm_ghash_clmul,@function +.align 16 +gcm_ghash_clmul: +.L_gcm_ghash_clmul_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%eax + movl 24(%esp),%edx + movl 28(%esp),%esi + movl 32(%esp),%ebx + call .L001pic +.L001pic: + popl %ecx + leal .Lbswap-.L001pic(%ecx),%ecx + movdqu (%eax),%xmm0 + movdqa 
(%ecx),%xmm5 + movdqu (%edx),%xmm2 +.byte 102,15,56,0,197 + subl $16,%ebx + jz .L002odd_tail + movdqu (%esi),%xmm3 + movdqu 16(%esi),%xmm6 +.byte 102,15,56,0,221 +.byte 102,15,56,0,245 + movdqu 32(%edx),%xmm5 + pxor %xmm3,%xmm0 + pshufd $78,%xmm6,%xmm3 + movdqa %xmm6,%xmm7 + pxor %xmm6,%xmm3 + leal 32(%esi),%esi +.byte 102,15,58,68,242,0 +.byte 102,15,58,68,250,17 +.byte 102,15,58,68,221,0 + movups 16(%edx),%xmm2 + nop + subl $32,%ebx + jbe .L003even_tail + jmp .L004mod_loop +.align 32 +.L004mod_loop: + pshufd $78,%xmm0,%xmm4 + movdqa %xmm0,%xmm1 + pxor %xmm0,%xmm4 + nop +.byte 102,15,58,68,194,0 +.byte 102,15,58,68,202,17 +.byte 102,15,58,68,229,16 + movups (%edx),%xmm2 + xorps %xmm6,%xmm0 + movdqa (%ecx),%xmm5 + xorps %xmm7,%xmm1 + movdqu (%esi),%xmm7 + pxor %xmm0,%xmm3 + movdqu 16(%esi),%xmm6 + pxor %xmm1,%xmm3 +.byte 102,15,56,0,253 + pxor %xmm3,%xmm4 + movdqa %xmm4,%xmm3 + psrldq $8,%xmm4 + pslldq $8,%xmm3 + pxor %xmm4,%xmm1 + pxor %xmm3,%xmm0 +.byte 102,15,56,0,245 + pxor %xmm7,%xmm1 + movdqa %xmm6,%xmm7 + movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 + psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 + pxor %xmm3,%xmm0 +.byte 102,15,58,68,242,0 + movups 32(%edx),%xmm5 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 + pshufd $78,%xmm7,%xmm3 + movdqa %xmm0,%xmm4 + psrlq $1,%xmm0 + pxor %xmm7,%xmm3 + pxor %xmm4,%xmm1 +.byte 102,15,58,68,250,17 + movups 16(%edx),%xmm2 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 +.byte 102,15,58,68,221,0 + leal 32(%esi),%esi + subl $32,%ebx + ja .L004mod_loop +.L003even_tail: + pshufd $78,%xmm0,%xmm4 + movdqa %xmm0,%xmm1 + pxor %xmm0,%xmm4 +.byte 102,15,58,68,194,0 +.byte 102,15,58,68,202,17 +.byte 102,15,58,68,229,16 + movdqa (%ecx),%xmm5 + xorps %xmm6,%xmm0 + xorps %xmm7,%xmm1 + pxor %xmm0,%xmm3 + pxor %xmm1,%xmm3 + pxor %xmm3,%xmm4 + movdqa %xmm4,%xmm3 + psrldq $8,%xmm4 + pslldq $8,%xmm3 + pxor %xmm4,%xmm1 + pxor %xmm3,%xmm0 + movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 + psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 + pxor %xmm3,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 + movdqa %xmm0,%xmm4 + psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 + testl %ebx,%ebx + jnz .L005done + movups (%edx),%xmm2 +.L002odd_tail: + movdqu (%esi),%xmm3 +.byte 102,15,56,0,221 + pxor %xmm3,%xmm0 + movdqa %xmm0,%xmm1 + pshufd $78,%xmm0,%xmm3 + pshufd $78,%xmm2,%xmm4 + pxor %xmm0,%xmm3 + pxor %xmm2,%xmm4 +.byte 102,15,58,68,194,0 +.byte 102,15,58,68,202,17 +.byte 102,15,58,68,220,0 + xorps %xmm0,%xmm3 + xorps %xmm1,%xmm3 + movdqa %xmm3,%xmm4 + psrldq $8,%xmm3 + pslldq $8,%xmm4 + pxor %xmm3,%xmm1 + pxor %xmm4,%xmm0 + movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 + psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 + pxor %xmm3,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 + movdqa %xmm0,%xmm4 + psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 +.L005done: +.byte 102,15,56,0,197 + movdqu %xmm0,(%eax) + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size gcm_ghash_clmul,.-.L_gcm_ghash_clmul_begin +.align 64 +.Lbswap: +.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 +.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,194 +.byte 71,72,65,83,72,32,102,111,114,32,120,56,54,44,32,67 +.byte 
82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112 +.byte 112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62 +.byte 0 +#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__) diff --git a/ring-0.17.14/pregenerated/ghash-x86-win32n.asm b/ring-0.17.14/pregenerated/ghash-x86-win32n.asm new file mode 100644 index 0000000000..5b1cd80f36 --- /dev/null +++ b/ring-0.17.14/pregenerated/ghash-x86-win32n.asm @@ -0,0 +1,277 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. + +%include "ring_core_generated/prefix_symbols_nasm.inc" +%ifidn __OUTPUT_FORMAT__, win32 +%ifidn __OUTPUT_FORMAT__,obj +section code use32 class=code align=64 +%elifidn __OUTPUT_FORMAT__,win32 +$@feat.00 equ 1 +section .text code align=64 +%else +section .text code +%endif +global _gcm_init_clmul +align 16 +_gcm_init_clmul: +L$_gcm_init_clmul_begin: + mov edx,DWORD [4+esp] + mov eax,DWORD [8+esp] + call L$000pic +L$000pic: + pop ecx + lea ecx,[(L$bswap-L$000pic)+ecx] + movdqu xmm2,[eax] + pshufd xmm2,xmm2,78 + pshufd xmm4,xmm2,255 + movdqa xmm3,xmm2 + psllq xmm2,1 + pxor xmm5,xmm5 + psrlq xmm3,63 + pcmpgtd xmm5,xmm4 + pslldq xmm3,8 + por xmm2,xmm3 + pand xmm5,[16+ecx] + pxor xmm2,xmm5 + movdqa xmm0,xmm2 + movdqa xmm1,xmm0 + pshufd xmm3,xmm0,78 + pshufd xmm4,xmm2,78 + pxor xmm3,xmm0 + pxor xmm4,xmm2 +db 102,15,58,68,194,0 +db 102,15,58,68,202,17 +db 102,15,58,68,220,0 + xorps xmm3,xmm0 + xorps xmm3,xmm1 + movdqa xmm4,xmm3 + psrldq xmm3,8 + pslldq xmm4,8 + pxor xmm1,xmm3 + pxor xmm0,xmm4 + movdqa xmm4,xmm0 + movdqa xmm3,xmm0 + psllq xmm0,5 + pxor xmm3,xmm0 + psllq xmm0,1 + pxor xmm0,xmm3 + psllq xmm0,57 + movdqa xmm3,xmm0 + pslldq xmm0,8 + psrldq xmm3,8 + pxor xmm0,xmm4 + pxor xmm1,xmm3 + movdqa xmm4,xmm0 + psrlq xmm0,1 + pxor xmm1,xmm4 + pxor xmm4,xmm0 + psrlq xmm0,5 + pxor xmm0,xmm4 + psrlq xmm0,1 + pxor xmm0,xmm1 + pshufd xmm3,xmm2,78 + pshufd xmm4,xmm0,78 + pxor xmm3,xmm2 + movdqu [edx],xmm2 + pxor xmm4,xmm0 + movdqu [16+edx],xmm0 +db 102,15,58,15,227,8 + movdqu [32+edx],xmm4 + ret +global _gcm_ghash_clmul +align 16 +_gcm_ghash_clmul: +L$_gcm_ghash_clmul_begin: + push ebp + push ebx + push esi + push edi + mov eax,DWORD [20+esp] + mov edx,DWORD [24+esp] + mov esi,DWORD [28+esp] + mov ebx,DWORD [32+esp] + call L$001pic +L$001pic: + pop ecx + lea ecx,[(L$bswap-L$001pic)+ecx] + movdqu xmm0,[eax] + movdqa xmm5,[ecx] + movdqu xmm2,[edx] +db 102,15,56,0,197 + sub ebx,16 + jz NEAR L$002odd_tail + movdqu xmm3,[esi] + movdqu xmm6,[16+esi] +db 102,15,56,0,221 +db 102,15,56,0,245 + movdqu xmm5,[32+edx] + pxor xmm0,xmm3 + pshufd xmm3,xmm6,78 + movdqa xmm7,xmm6 + pxor xmm3,xmm6 + lea esi,[32+esi] +db 102,15,58,68,242,0 +db 102,15,58,68,250,17 +db 102,15,58,68,221,0 + movups xmm2,[16+edx] + nop + sub ebx,32 + jbe NEAR L$003even_tail + jmp NEAR L$004mod_loop +align 32 +L$004mod_loop: + pshufd xmm4,xmm0,78 + movdqa xmm1,xmm0 + pxor xmm4,xmm0 + nop +db 102,15,58,68,194,0 +db 102,15,58,68,202,17 +db 102,15,58,68,229,16 + movups xmm2,[edx] + xorps xmm0,xmm6 + movdqa xmm5,[ecx] + xorps xmm1,xmm7 + movdqu xmm7,[esi] + pxor xmm3,xmm0 + movdqu xmm6,[16+esi] + pxor xmm3,xmm1 +db 102,15,56,0,253 + pxor xmm4,xmm3 + movdqa xmm3,xmm4 + psrldq xmm4,8 + pslldq xmm3,8 + pxor xmm1,xmm4 + pxor xmm0,xmm3 +db 102,15,56,0,245 + pxor xmm1,xmm7 + movdqa xmm7,xmm6 + movdqa xmm4,xmm0 + movdqa xmm3,xmm0 + psllq xmm0,5 + pxor xmm3,xmm0 + psllq xmm0,1 + pxor xmm0,xmm3 +db 102,15,58,68,242,0 + movups xmm5,[32+edx] + psllq xmm0,57 + movdqa xmm3,xmm0 + pslldq xmm0,8 + psrldq 
xmm3,8 + pxor xmm0,xmm4 + pxor xmm1,xmm3 + pshufd xmm3,xmm7,78 + movdqa xmm4,xmm0 + psrlq xmm0,1 + pxor xmm3,xmm7 + pxor xmm1,xmm4 +db 102,15,58,68,250,17 + movups xmm2,[16+edx] + pxor xmm4,xmm0 + psrlq xmm0,5 + pxor xmm0,xmm4 + psrlq xmm0,1 + pxor xmm0,xmm1 +db 102,15,58,68,221,0 + lea esi,[32+esi] + sub ebx,32 + ja NEAR L$004mod_loop +L$003even_tail: + pshufd xmm4,xmm0,78 + movdqa xmm1,xmm0 + pxor xmm4,xmm0 +db 102,15,58,68,194,0 +db 102,15,58,68,202,17 +db 102,15,58,68,229,16 + movdqa xmm5,[ecx] + xorps xmm0,xmm6 + xorps xmm1,xmm7 + pxor xmm3,xmm0 + pxor xmm3,xmm1 + pxor xmm4,xmm3 + movdqa xmm3,xmm4 + psrldq xmm4,8 + pslldq xmm3,8 + pxor xmm1,xmm4 + pxor xmm0,xmm3 + movdqa xmm4,xmm0 + movdqa xmm3,xmm0 + psllq xmm0,5 + pxor xmm3,xmm0 + psllq xmm0,1 + pxor xmm0,xmm3 + psllq xmm0,57 + movdqa xmm3,xmm0 + pslldq xmm0,8 + psrldq xmm3,8 + pxor xmm0,xmm4 + pxor xmm1,xmm3 + movdqa xmm4,xmm0 + psrlq xmm0,1 + pxor xmm1,xmm4 + pxor xmm4,xmm0 + psrlq xmm0,5 + pxor xmm0,xmm4 + psrlq xmm0,1 + pxor xmm0,xmm1 + test ebx,ebx + jnz NEAR L$005done + movups xmm2,[edx] +L$002odd_tail: + movdqu xmm3,[esi] +db 102,15,56,0,221 + pxor xmm0,xmm3 + movdqa xmm1,xmm0 + pshufd xmm3,xmm0,78 + pshufd xmm4,xmm2,78 + pxor xmm3,xmm0 + pxor xmm4,xmm2 +db 102,15,58,68,194,0 +db 102,15,58,68,202,17 +db 102,15,58,68,220,0 + xorps xmm3,xmm0 + xorps xmm3,xmm1 + movdqa xmm4,xmm3 + psrldq xmm3,8 + pslldq xmm4,8 + pxor xmm1,xmm3 + pxor xmm0,xmm4 + movdqa xmm4,xmm0 + movdqa xmm3,xmm0 + psllq xmm0,5 + pxor xmm3,xmm0 + psllq xmm0,1 + pxor xmm0,xmm3 + psllq xmm0,57 + movdqa xmm3,xmm0 + pslldq xmm0,8 + psrldq xmm3,8 + pxor xmm0,xmm4 + pxor xmm1,xmm3 + movdqa xmm4,xmm0 + psrlq xmm0,1 + pxor xmm1,xmm4 + pxor xmm4,xmm0 + psrlq xmm0,5 + pxor xmm0,xmm4 + psrlq xmm0,1 + pxor xmm0,xmm1 +L$005done: +db 102,15,56,0,197 + movdqu [eax],xmm0 + pop edi + pop esi + pop ebx + pop ebp + ret +align 64 +L$bswap: +db 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 +db 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,194 +db 71,72,65,83,72,32,102,111,114,32,120,56,54,44,32,67 +db 82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112 +db 112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62 +db 0 +%else +; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 +ret +%endif diff --git a/ring-0.17.14/pregenerated/ghash-x86-win32n.o b/ring-0.17.14/pregenerated/ghash-x86-win32n.o new file mode 100644 index 0000000000..f78a0eb0fe Binary files /dev/null and b/ring-0.17.14/pregenerated/ghash-x86-win32n.o differ diff --git a/ring-0.17.14/pregenerated/ghash-x86_64-elf.S b/ring-0.17.14/pregenerated/ghash-x86_64-elf.S new file mode 100644 index 0000000000..4c957b4dde --- /dev/null +++ b/ring-0.17.14/pregenerated/ghash-x86_64-elf.S @@ -0,0 +1,1062 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__) +.text +.globl gcm_init_clmul +.hidden gcm_init_clmul +.type gcm_init_clmul,@function +.align 16 +gcm_init_clmul: +.cfi_startproc + +_CET_ENDBR +.L_init_clmul: + movdqu (%rsi),%xmm2 + pshufd $78,%xmm2,%xmm2 + + + pshufd $255,%xmm2,%xmm4 + movdqa %xmm2,%xmm3 + psllq $1,%xmm2 + pxor %xmm5,%xmm5 + psrlq $63,%xmm3 + pcmpgtd %xmm4,%xmm5 + pslldq $8,%xmm3 + por %xmm3,%xmm2 + + + pand .L0x1c2_polynomial(%rip),%xmm5 + pxor %xmm5,%xmm2 + + + pshufd $78,%xmm2,%xmm6 + movdqa %xmm2,%xmm0 + pxor %xmm2,%xmm6 + movdqa %xmm0,%xmm1 + pshufd $78,%xmm0,%xmm3 + pxor %xmm0,%xmm3 +.byte 102,15,58,68,194,0 +.byte 102,15,58,68,202,17 +.byte 102,15,58,68,222,0 + pxor %xmm0,%xmm3 + pxor %xmm1,%xmm3 + + movdqa %xmm3,%xmm4 + psrldq $8,%xmm3 + pslldq $8,%xmm4 + pxor %xmm3,%xmm1 + pxor %xmm4,%xmm0 + + movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 + psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 + pxor %xmm3,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 + + + movdqa %xmm0,%xmm4 + psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 + pshufd $78,%xmm2,%xmm3 + pshufd $78,%xmm0,%xmm4 + pxor %xmm2,%xmm3 + movdqu %xmm2,0(%rdi) + pxor %xmm0,%xmm4 + movdqu %xmm0,16(%rdi) +.byte 102,15,58,15,227,8 + movdqu %xmm4,32(%rdi) + movdqa %xmm0,%xmm1 + pshufd $78,%xmm0,%xmm3 + pxor %xmm0,%xmm3 +.byte 102,15,58,68,194,0 +.byte 102,15,58,68,202,17 +.byte 102,15,58,68,222,0 + pxor %xmm0,%xmm3 + pxor %xmm1,%xmm3 + + movdqa %xmm3,%xmm4 + psrldq $8,%xmm3 + pslldq $8,%xmm4 + pxor %xmm3,%xmm1 + pxor %xmm4,%xmm0 + + movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 + psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 + pxor %xmm3,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 + + + movdqa %xmm0,%xmm4 + psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 + movdqa %xmm0,%xmm5 + movdqa %xmm0,%xmm1 + pshufd $78,%xmm0,%xmm3 + pxor %xmm0,%xmm3 +.byte 102,15,58,68,194,0 +.byte 102,15,58,68,202,17 +.byte 102,15,58,68,222,0 + pxor %xmm0,%xmm3 + pxor %xmm1,%xmm3 + + movdqa %xmm3,%xmm4 + psrldq $8,%xmm3 + pslldq $8,%xmm4 + pxor %xmm3,%xmm1 + pxor %xmm4,%xmm0 + + movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 + psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 + pxor %xmm3,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 + + + movdqa %xmm0,%xmm4 + psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 + pshufd $78,%xmm5,%xmm3 + pshufd $78,%xmm0,%xmm4 + pxor %xmm5,%xmm3 + movdqu %xmm5,48(%rdi) + pxor %xmm0,%xmm4 + movdqu %xmm0,64(%rdi) +.byte 102,15,58,15,227,8 + movdqu %xmm4,80(%rdi) + ret +.cfi_endproc + +.size gcm_init_clmul,.-gcm_init_clmul +.globl gcm_ghash_clmul +.hidden gcm_ghash_clmul +.type gcm_ghash_clmul,@function +.align 32 +gcm_ghash_clmul: +.cfi_startproc + +_CET_ENDBR +.L_ghash_clmul: + movdqa .Lbswap_mask(%rip),%xmm10 + + movdqu (%rdi),%xmm0 + movdqu (%rsi),%xmm2 + movdqu 32(%rsi),%xmm7 +.byte 102,65,15,56,0,194 + + subq $0x10,%rcx + jz .Lodd_tail + + movdqu 16(%rsi),%xmm6 + cmpq $0x30,%rcx + jb .Lskip4x + + subq $0x30,%rcx + movq $0xA040608020C0E000,%rax + movdqu 48(%rsi),%xmm14 + movdqu 64(%rsi),%xmm15 + + + + + movdqu 48(%rdx),%xmm3 + movdqu 
32(%rdx),%xmm11 +.byte 102,65,15,56,0,218 +.byte 102,69,15,56,0,218 + movdqa %xmm3,%xmm5 + pshufd $78,%xmm3,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,68,218,0 +.byte 102,15,58,68,234,17 +.byte 102,15,58,68,231,0 + + movdqa %xmm11,%xmm13 + pshufd $78,%xmm11,%xmm12 + pxor %xmm11,%xmm12 +.byte 102,68,15,58,68,222,0 +.byte 102,68,15,58,68,238,17 +.byte 102,68,15,58,68,231,16 + xorps %xmm11,%xmm3 + xorps %xmm13,%xmm5 + movups 80(%rsi),%xmm7 + xorps %xmm12,%xmm4 + + movdqu 16(%rdx),%xmm11 + movdqu 0(%rdx),%xmm8 +.byte 102,69,15,56,0,218 +.byte 102,69,15,56,0,194 + movdqa %xmm11,%xmm13 + pshufd $78,%xmm11,%xmm12 + pxor %xmm8,%xmm0 + pxor %xmm11,%xmm12 +.byte 102,69,15,58,68,222,0 + movdqa %xmm0,%xmm1 + pshufd $78,%xmm0,%xmm8 + pxor %xmm0,%xmm8 +.byte 102,69,15,58,68,238,17 +.byte 102,68,15,58,68,231,0 + xorps %xmm11,%xmm3 + xorps %xmm13,%xmm5 + + leaq 64(%rdx),%rdx + subq $0x40,%rcx + jc .Ltail4x + + jmp .Lmod4_loop +.align 32 +.Lmod4_loop: +.byte 102,65,15,58,68,199,0 + xorps %xmm12,%xmm4 + movdqu 48(%rdx),%xmm11 +.byte 102,69,15,56,0,218 +.byte 102,65,15,58,68,207,17 + xorps %xmm3,%xmm0 + movdqu 32(%rdx),%xmm3 + movdqa %xmm11,%xmm13 +.byte 102,68,15,58,68,199,16 + pshufd $78,%xmm11,%xmm12 + xorps %xmm5,%xmm1 + pxor %xmm11,%xmm12 +.byte 102,65,15,56,0,218 + movups 32(%rsi),%xmm7 + xorps %xmm4,%xmm8 +.byte 102,68,15,58,68,218,0 + pshufd $78,%xmm3,%xmm4 + + pxor %xmm0,%xmm8 + movdqa %xmm3,%xmm5 + pxor %xmm1,%xmm8 + pxor %xmm3,%xmm4 + movdqa %xmm8,%xmm9 +.byte 102,68,15,58,68,234,17 + pslldq $8,%xmm8 + psrldq $8,%xmm9 + pxor %xmm8,%xmm0 + movdqa .L7_mask(%rip),%xmm8 + pxor %xmm9,%xmm1 +.byte 102,76,15,110,200 + + pand %xmm0,%xmm8 +.byte 102,69,15,56,0,200 + pxor %xmm0,%xmm9 +.byte 102,68,15,58,68,231,0 + psllq $57,%xmm9 + movdqa %xmm9,%xmm8 + pslldq $8,%xmm9 +.byte 102,15,58,68,222,0 + psrldq $8,%xmm8 + pxor %xmm9,%xmm0 + pxor %xmm8,%xmm1 + movdqu 0(%rdx),%xmm8 + + movdqa %xmm0,%xmm9 + psrlq $1,%xmm0 +.byte 102,15,58,68,238,17 + xorps %xmm11,%xmm3 + movdqu 16(%rdx),%xmm11 +.byte 102,69,15,56,0,218 +.byte 102,15,58,68,231,16 + xorps %xmm13,%xmm5 + movups 80(%rsi),%xmm7 +.byte 102,69,15,56,0,194 + pxor %xmm9,%xmm1 + pxor %xmm0,%xmm9 + psrlq $5,%xmm0 + + movdqa %xmm11,%xmm13 + pxor %xmm12,%xmm4 + pshufd $78,%xmm11,%xmm12 + pxor %xmm9,%xmm0 + pxor %xmm8,%xmm1 + pxor %xmm11,%xmm12 +.byte 102,69,15,58,68,222,0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 + movdqa %xmm0,%xmm1 +.byte 102,69,15,58,68,238,17 + xorps %xmm11,%xmm3 + pshufd $78,%xmm0,%xmm8 + pxor %xmm0,%xmm8 + +.byte 102,68,15,58,68,231,0 + xorps %xmm13,%xmm5 + + leaq 64(%rdx),%rdx + subq $0x40,%rcx + jnc .Lmod4_loop + +.Ltail4x: +.byte 102,65,15,58,68,199,0 +.byte 102,65,15,58,68,207,17 +.byte 102,68,15,58,68,199,16 + xorps %xmm12,%xmm4 + xorps %xmm3,%xmm0 + xorps %xmm5,%xmm1 + pxor %xmm0,%xmm1 + pxor %xmm4,%xmm8 + + pxor %xmm1,%xmm8 + pxor %xmm0,%xmm1 + + movdqa %xmm8,%xmm9 + psrldq $8,%xmm8 + pslldq $8,%xmm9 + pxor %xmm8,%xmm1 + pxor %xmm9,%xmm0 + + movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 + psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 + pxor %xmm3,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 + + + movdqa %xmm0,%xmm4 + psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 + addq $0x40,%rcx + jz .Ldone + movdqu 32(%rsi),%xmm7 + subq $0x10,%rcx + jz .Lodd_tail +.Lskip4x: + + + + + + movdqu (%rdx),%xmm8 + movdqu 16(%rdx),%xmm3 +.byte 102,69,15,56,0,194 +.byte 102,65,15,56,0,218 + pxor %xmm8,%xmm0 + + 
movdqa %xmm3,%xmm5 + pshufd $78,%xmm3,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,68,218,0 +.byte 102,15,58,68,234,17 +.byte 102,15,58,68,231,0 + + leaq 32(%rdx),%rdx + nop + subq $0x20,%rcx + jbe .Leven_tail + nop + jmp .Lmod_loop + +.align 32 +.Lmod_loop: + movdqa %xmm0,%xmm1 + movdqa %xmm4,%xmm8 + pshufd $78,%xmm0,%xmm4 + pxor %xmm0,%xmm4 + +.byte 102,15,58,68,198,0 +.byte 102,15,58,68,206,17 +.byte 102,15,58,68,231,16 + + pxor %xmm3,%xmm0 + pxor %xmm5,%xmm1 + movdqu (%rdx),%xmm9 + pxor %xmm0,%xmm8 +.byte 102,69,15,56,0,202 + movdqu 16(%rdx),%xmm3 + + pxor %xmm1,%xmm8 + pxor %xmm9,%xmm1 + pxor %xmm8,%xmm4 +.byte 102,65,15,56,0,218 + movdqa %xmm4,%xmm8 + psrldq $8,%xmm8 + pslldq $8,%xmm4 + pxor %xmm8,%xmm1 + pxor %xmm4,%xmm0 + + movdqa %xmm3,%xmm5 + + movdqa %xmm0,%xmm9 + movdqa %xmm0,%xmm8 + psllq $5,%xmm0 + pxor %xmm0,%xmm8 +.byte 102,15,58,68,218,0 + psllq $1,%xmm0 + pxor %xmm8,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm8 + pslldq $8,%xmm0 + psrldq $8,%xmm8 + pxor %xmm9,%xmm0 + pshufd $78,%xmm5,%xmm4 + pxor %xmm8,%xmm1 + pxor %xmm5,%xmm4 + + movdqa %xmm0,%xmm9 + psrlq $1,%xmm0 +.byte 102,15,58,68,234,17 + pxor %xmm9,%xmm1 + pxor %xmm0,%xmm9 + psrlq $5,%xmm0 + pxor %xmm9,%xmm0 + leaq 32(%rdx),%rdx + psrlq $1,%xmm0 +.byte 102,15,58,68,231,0 + pxor %xmm1,%xmm0 + + subq $0x20,%rcx + ja .Lmod_loop + +.Leven_tail: + movdqa %xmm0,%xmm1 + movdqa %xmm4,%xmm8 + pshufd $78,%xmm0,%xmm4 + pxor %xmm0,%xmm4 + +.byte 102,15,58,68,198,0 +.byte 102,15,58,68,206,17 +.byte 102,15,58,68,231,16 + + pxor %xmm3,%xmm0 + pxor %xmm5,%xmm1 + pxor %xmm0,%xmm8 + pxor %xmm1,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm8 + psrldq $8,%xmm8 + pslldq $8,%xmm4 + pxor %xmm8,%xmm1 + pxor %xmm4,%xmm0 + + movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 + psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 + pxor %xmm3,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 + + + movdqa %xmm0,%xmm4 + psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 + testq %rcx,%rcx + jnz .Ldone + +.Lodd_tail: + movdqu (%rdx),%xmm8 +.byte 102,69,15,56,0,194 + pxor %xmm8,%xmm0 + movdqa %xmm0,%xmm1 + pshufd $78,%xmm0,%xmm3 + pxor %xmm0,%xmm3 +.byte 102,15,58,68,194,0 +.byte 102,15,58,68,202,17 +.byte 102,15,58,68,223,0 + pxor %xmm0,%xmm3 + pxor %xmm1,%xmm3 + + movdqa %xmm3,%xmm4 + psrldq $8,%xmm3 + pslldq $8,%xmm4 + pxor %xmm3,%xmm1 + pxor %xmm4,%xmm0 + + movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 + psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 + pxor %xmm3,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 + + + movdqa %xmm0,%xmm4 + psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 +.Ldone: +.byte 102,65,15,56,0,194 + movdqu %xmm0,(%rdi) + ret +.cfi_endproc + +.size gcm_ghash_clmul,.-gcm_ghash_clmul +.globl gcm_init_avx +.hidden gcm_init_avx +.type gcm_init_avx,@function +.align 32 +gcm_init_avx: +.cfi_startproc + +_CET_ENDBR + vzeroupper + + vmovdqu (%rsi),%xmm2 + vpshufd $78,%xmm2,%xmm2 + + + vpshufd $255,%xmm2,%xmm4 + vpsrlq $63,%xmm2,%xmm3 + vpsllq $1,%xmm2,%xmm2 + vpxor %xmm5,%xmm5,%xmm5 + vpcmpgtd %xmm4,%xmm5,%xmm5 + vpslldq $8,%xmm3,%xmm3 + vpor %xmm3,%xmm2,%xmm2 + + + vpand .L0x1c2_polynomial(%rip),%xmm5,%xmm5 + vpxor %xmm5,%xmm2,%xmm2 + + vpunpckhqdq %xmm2,%xmm2,%xmm6 + vmovdqa %xmm2,%xmm0 + vpxor %xmm2,%xmm6,%xmm6 + movq $4,%r10 + jmp .Linit_start_avx +.align 32 
+.Linit_loop_avx: + vpalignr $8,%xmm3,%xmm4,%xmm5 + vmovdqu %xmm5,-16(%rdi) + vpunpckhqdq %xmm0,%xmm0,%xmm3 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1 + vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0 + vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3 + vpxor %xmm0,%xmm1,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + + vpslldq $8,%xmm3,%xmm4 + vpsrldq $8,%xmm3,%xmm3 + vpxor %xmm4,%xmm0,%xmm0 + vpxor %xmm3,%xmm1,%xmm1 + vpsllq $57,%xmm0,%xmm3 + vpsllq $62,%xmm0,%xmm4 + vpxor %xmm3,%xmm4,%xmm4 + vpsllq $63,%xmm0,%xmm3 + vpxor %xmm3,%xmm4,%xmm4 + vpslldq $8,%xmm4,%xmm3 + vpsrldq $8,%xmm4,%xmm4 + vpxor %xmm3,%xmm0,%xmm0 + vpxor %xmm4,%xmm1,%xmm1 + + vpsrlq $1,%xmm0,%xmm4 + vpxor %xmm0,%xmm1,%xmm1 + vpxor %xmm4,%xmm0,%xmm0 + vpsrlq $5,%xmm4,%xmm4 + vpxor %xmm4,%xmm0,%xmm0 + vpsrlq $1,%xmm0,%xmm0 + vpxor %xmm1,%xmm0,%xmm0 +.Linit_start_avx: + vmovdqa %xmm0,%xmm5 + vpunpckhqdq %xmm0,%xmm0,%xmm3 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1 + vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0 + vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3 + vpxor %xmm0,%xmm1,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + + vpslldq $8,%xmm3,%xmm4 + vpsrldq $8,%xmm3,%xmm3 + vpxor %xmm4,%xmm0,%xmm0 + vpxor %xmm3,%xmm1,%xmm1 + vpsllq $57,%xmm0,%xmm3 + vpsllq $62,%xmm0,%xmm4 + vpxor %xmm3,%xmm4,%xmm4 + vpsllq $63,%xmm0,%xmm3 + vpxor %xmm3,%xmm4,%xmm4 + vpslldq $8,%xmm4,%xmm3 + vpsrldq $8,%xmm4,%xmm4 + vpxor %xmm3,%xmm0,%xmm0 + vpxor %xmm4,%xmm1,%xmm1 + + vpsrlq $1,%xmm0,%xmm4 + vpxor %xmm0,%xmm1,%xmm1 + vpxor %xmm4,%xmm0,%xmm0 + vpsrlq $5,%xmm4,%xmm4 + vpxor %xmm4,%xmm0,%xmm0 + vpsrlq $1,%xmm0,%xmm0 + vpxor %xmm1,%xmm0,%xmm0 + vpshufd $78,%xmm5,%xmm3 + vpshufd $78,%xmm0,%xmm4 + vpxor %xmm5,%xmm3,%xmm3 + vmovdqu %xmm5,0(%rdi) + vpxor %xmm0,%xmm4,%xmm4 + vmovdqu %xmm0,16(%rdi) + leaq 48(%rdi),%rdi + subq $1,%r10 + jnz .Linit_loop_avx + + vpalignr $8,%xmm4,%xmm3,%xmm5 + vmovdqu %xmm5,-16(%rdi) + + vzeroupper + ret + +.cfi_endproc +.size gcm_init_avx,.-gcm_init_avx +.globl gcm_ghash_avx +.hidden gcm_ghash_avx +.type gcm_ghash_avx,@function +.align 32 +gcm_ghash_avx: +.cfi_startproc + +_CET_ENDBR + vzeroupper + + vmovdqu (%rdi),%xmm10 + leaq .L0x1c2_polynomial(%rip),%r10 + leaq 64(%rsi),%rsi + vmovdqu .Lbswap_mask(%rip),%xmm13 + vpshufb %xmm13,%xmm10,%xmm10 + cmpq $0x80,%rcx + jb .Lshort_avx + subq $0x80,%rcx + + vmovdqu 112(%rdx),%xmm14 + vmovdqu 0-64(%rsi),%xmm6 + vpshufb %xmm13,%xmm14,%xmm14 + vmovdqu 32-64(%rsi),%xmm7 + + vpunpckhqdq %xmm14,%xmm14,%xmm9 + vmovdqu 96(%rdx),%xmm15 + vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 + vpxor %xmm14,%xmm9,%xmm9 + vpshufb %xmm13,%xmm15,%xmm15 + vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 + vmovdqu 16-64(%rsi),%xmm6 + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vmovdqu 80(%rdx),%xmm14 + vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 + vpxor %xmm15,%xmm8,%xmm8 + + vpshufb %xmm13,%xmm14,%xmm14 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 + vpunpckhqdq %xmm14,%xmm14,%xmm9 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 + vmovdqu 48-64(%rsi),%xmm6 + vpxor %xmm14,%xmm9,%xmm9 + vmovdqu 64(%rdx),%xmm15 + vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 + vmovdqu 80-64(%rsi),%xmm7 + + vpshufb %xmm13,%xmm15,%xmm15 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 + vpxor %xmm1,%xmm4,%xmm4 + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 + vmovdqu 64-64(%rsi),%xmm6 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 + vpxor %xmm15,%xmm8,%xmm8 + + vmovdqu 48(%rdx),%xmm14 + vpxor %xmm3,%xmm0,%xmm0 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 + vpxor %xmm4,%xmm1,%xmm1 + vpshufb %xmm13,%xmm14,%xmm14 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 + vmovdqu 96-64(%rsi),%xmm6 
+ vpxor %xmm5,%xmm2,%xmm2 + vpunpckhqdq %xmm14,%xmm14,%xmm9 + vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 + vmovdqu 128-64(%rsi),%xmm7 + vpxor %xmm14,%xmm9,%xmm9 + + vmovdqu 32(%rdx),%xmm15 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 + vpxor %xmm1,%xmm4,%xmm4 + vpshufb %xmm13,%xmm15,%xmm15 + vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 + vmovdqu 112-64(%rsi),%xmm6 + vpxor %xmm2,%xmm5,%xmm5 + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 + vpxor %xmm15,%xmm8,%xmm8 + + vmovdqu 16(%rdx),%xmm14 + vpxor %xmm3,%xmm0,%xmm0 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 + vpxor %xmm4,%xmm1,%xmm1 + vpshufb %xmm13,%xmm14,%xmm14 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 + vmovdqu 144-64(%rsi),%xmm6 + vpxor %xmm5,%xmm2,%xmm2 + vpunpckhqdq %xmm14,%xmm14,%xmm9 + vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 + vmovdqu 176-64(%rsi),%xmm7 + vpxor %xmm14,%xmm9,%xmm9 + + vmovdqu (%rdx),%xmm15 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 + vpxor %xmm1,%xmm4,%xmm4 + vpshufb %xmm13,%xmm15,%xmm15 + vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 + vmovdqu 160-64(%rsi),%xmm6 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2 + + leaq 128(%rdx),%rdx + cmpq $0x80,%rcx + jb .Ltail_avx + + vpxor %xmm10,%xmm15,%xmm15 + subq $0x80,%rcx + jmp .Loop8x_avx + +.align 32 +.Loop8x_avx: + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vmovdqu 112(%rdx),%xmm14 + vpxor %xmm0,%xmm3,%xmm3 + vpxor %xmm15,%xmm8,%xmm8 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm10 + vpshufb %xmm13,%xmm14,%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm11 + vmovdqu 0-64(%rsi),%xmm6 + vpunpckhqdq %xmm14,%xmm14,%xmm9 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm8,%xmm12 + vmovdqu 32-64(%rsi),%xmm7 + vpxor %xmm14,%xmm9,%xmm9 + + vmovdqu 96(%rdx),%xmm15 + vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 + vpxor %xmm3,%xmm10,%xmm10 + vpshufb %xmm13,%xmm15,%xmm15 + vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 + vxorps %xmm4,%xmm11,%xmm11 + vmovdqu 16-64(%rsi),%xmm6 + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 + vpxor %xmm5,%xmm12,%xmm12 + vxorps %xmm15,%xmm8,%xmm8 + + vmovdqu 80(%rdx),%xmm14 + vpxor %xmm10,%xmm12,%xmm12 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 + vpxor %xmm11,%xmm12,%xmm12 + vpslldq $8,%xmm12,%xmm9 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 + vpsrldq $8,%xmm12,%xmm12 + vpxor %xmm9,%xmm10,%xmm10 + vmovdqu 48-64(%rsi),%xmm6 + vpshufb %xmm13,%xmm14,%xmm14 + vxorps %xmm12,%xmm11,%xmm11 + vpxor %xmm1,%xmm4,%xmm4 + vpunpckhqdq %xmm14,%xmm14,%xmm9 + vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 + vmovdqu 80-64(%rsi),%xmm7 + vpxor %xmm14,%xmm9,%xmm9 + vpxor %xmm2,%xmm5,%xmm5 + + vmovdqu 64(%rdx),%xmm15 + vpalignr $8,%xmm10,%xmm10,%xmm12 + vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 + vpshufb %xmm13,%xmm15,%xmm15 + vpxor %xmm3,%xmm0,%xmm0 + vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 + vmovdqu 64-64(%rsi),%xmm6 + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 + vxorps %xmm15,%xmm8,%xmm8 + vpxor %xmm5,%xmm2,%xmm2 + + vmovdqu 48(%rdx),%xmm14 + vpclmulqdq $0x10,(%r10),%xmm10,%xmm10 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 + vpshufb %xmm13,%xmm14,%xmm14 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 + vmovdqu 96-64(%rsi),%xmm6 + vpunpckhqdq %xmm14,%xmm14,%xmm9 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 + vmovdqu 128-64(%rsi),%xmm7 + vpxor %xmm14,%xmm9,%xmm9 + vpxor %xmm2,%xmm5,%xmm5 + + vmovdqu 32(%rdx),%xmm15 + vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 + vpshufb %xmm13,%xmm15,%xmm15 + vpxor %xmm3,%xmm0,%xmm0 + vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 + 
vmovdqu 112-64(%rsi),%xmm6 + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 + vpxor %xmm15,%xmm8,%xmm8 + vpxor %xmm5,%xmm2,%xmm2 + vxorps %xmm12,%xmm10,%xmm10 + + vmovdqu 16(%rdx),%xmm14 + vpalignr $8,%xmm10,%xmm10,%xmm12 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 + vpshufb %xmm13,%xmm14,%xmm14 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 + vmovdqu 144-64(%rsi),%xmm6 + vpclmulqdq $0x10,(%r10),%xmm10,%xmm10 + vxorps %xmm11,%xmm12,%xmm12 + vpunpckhqdq %xmm14,%xmm14,%xmm9 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 + vmovdqu 176-64(%rsi),%xmm7 + vpxor %xmm14,%xmm9,%xmm9 + vpxor %xmm2,%xmm5,%xmm5 + + vmovdqu (%rdx),%xmm15 + vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 + vpshufb %xmm13,%xmm15,%xmm15 + vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 + vmovdqu 160-64(%rsi),%xmm6 + vpxor %xmm12,%xmm15,%xmm15 + vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2 + vpxor %xmm10,%xmm15,%xmm15 + + leaq 128(%rdx),%rdx + subq $0x80,%rcx + jnc .Loop8x_avx + + addq $0x80,%rcx + jmp .Ltail_no_xor_avx + +.align 32 +.Lshort_avx: + vmovdqu -16(%rdx,%rcx,1),%xmm14 + leaq (%rdx,%rcx,1),%rdx + vmovdqu 0-64(%rsi),%xmm6 + vmovdqu 32-64(%rsi),%xmm7 + vpshufb %xmm13,%xmm14,%xmm15 + + vmovdqa %xmm0,%xmm3 + vmovdqa %xmm1,%xmm4 + vmovdqa %xmm2,%xmm5 + subq $0x10,%rcx + jz .Ltail_avx + + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 + vpxor %xmm15,%xmm8,%xmm8 + vmovdqu -32(%rdx),%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 + vmovdqu 16-64(%rsi),%xmm6 + vpshufb %xmm13,%xmm14,%xmm15 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 + vpsrldq $8,%xmm7,%xmm7 + subq $0x10,%rcx + jz .Ltail_avx + + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 + vpxor %xmm15,%xmm8,%xmm8 + vmovdqu -48(%rdx),%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 + vmovdqu 48-64(%rsi),%xmm6 + vpshufb %xmm13,%xmm14,%xmm15 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 + vmovdqu 80-64(%rsi),%xmm7 + subq $0x10,%rcx + jz .Ltail_avx + + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 + vpxor %xmm15,%xmm8,%xmm8 + vmovdqu -64(%rdx),%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 + vmovdqu 64-64(%rsi),%xmm6 + vpshufb %xmm13,%xmm14,%xmm15 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 + vpsrldq $8,%xmm7,%xmm7 + subq $0x10,%rcx + jz .Ltail_avx + + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 + vpxor %xmm15,%xmm8,%xmm8 + vmovdqu -80(%rdx),%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 + vmovdqu 96-64(%rsi),%xmm6 + vpshufb %xmm13,%xmm14,%xmm15 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 + vmovdqu 128-64(%rsi),%xmm7 + subq $0x10,%rcx + jz .Ltail_avx + + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 + vpxor %xmm15,%xmm8,%xmm8 + vmovdqu -96(%rdx),%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 + vmovdqu 112-64(%rsi),%xmm6 + vpshufb %xmm13,%xmm14,%xmm15 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 + vpsrldq $8,%xmm7,%xmm7 + subq $0x10,%rcx + jz .Ltail_avx + + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 + vpxor %xmm15,%xmm8,%xmm8 + vmovdqu -112(%rdx),%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 + vmovdqu 
144-64(%rsi),%xmm6 + vpshufb %xmm13,%xmm14,%xmm15 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 + vmovq 184-64(%rsi),%xmm7 + subq $0x10,%rcx + jmp .Ltail_avx + +.align 32 +.Ltail_avx: + vpxor %xmm10,%xmm15,%xmm15 +.Ltail_no_xor_avx: + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 + vpxor %xmm15,%xmm8,%xmm8 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 + + vmovdqu (%r10),%xmm12 + + vpxor %xmm0,%xmm3,%xmm10 + vpxor %xmm1,%xmm4,%xmm11 + vpxor %xmm2,%xmm5,%xmm5 + + vpxor %xmm10,%xmm5,%xmm5 + vpxor %xmm11,%xmm5,%xmm5 + vpslldq $8,%xmm5,%xmm9 + vpsrldq $8,%xmm5,%xmm5 + vpxor %xmm9,%xmm10,%xmm10 + vpxor %xmm5,%xmm11,%xmm11 + + vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9 + vpalignr $8,%xmm10,%xmm10,%xmm10 + vpxor %xmm9,%xmm10,%xmm10 + + vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9 + vpalignr $8,%xmm10,%xmm10,%xmm10 + vpxor %xmm11,%xmm10,%xmm10 + vpxor %xmm9,%xmm10,%xmm10 + + cmpq $0,%rcx + jne .Lshort_avx + + vpshufb %xmm13,%xmm10,%xmm10 + vmovdqu %xmm10,(%rdi) + vzeroupper + ret +.cfi_endproc + +.size gcm_ghash_avx,.-gcm_ghash_avx +.section .rodata +.align 64 +.Lbswap_mask: +.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 +.L0x1c2_polynomial: +.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 +.L7_mask: +.long 7,0,7,0 +.align 64 + +.byte 71,72,65,83,72,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 64 +.text +#endif diff --git a/ring-0.17.14/pregenerated/ghash-x86_64-macosx.S b/ring-0.17.14/pregenerated/ghash-x86_64-macosx.S new file mode 100644 index 0000000000..dc9786ad8c --- /dev/null +++ b/ring-0.17.14/pregenerated/ghash-x86_64-macosx.S @@ -0,0 +1,1062 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__) +.text +.globl _gcm_init_clmul +.private_extern _gcm_init_clmul + +.p2align 4 +_gcm_init_clmul: + + +_CET_ENDBR +L$_init_clmul: + movdqu (%rsi),%xmm2 + pshufd $78,%xmm2,%xmm2 + + + pshufd $255,%xmm2,%xmm4 + movdqa %xmm2,%xmm3 + psllq $1,%xmm2 + pxor %xmm5,%xmm5 + psrlq $63,%xmm3 + pcmpgtd %xmm4,%xmm5 + pslldq $8,%xmm3 + por %xmm3,%xmm2 + + + pand L$0x1c2_polynomial(%rip),%xmm5 + pxor %xmm5,%xmm2 + + + pshufd $78,%xmm2,%xmm6 + movdqa %xmm2,%xmm0 + pxor %xmm2,%xmm6 + movdqa %xmm0,%xmm1 + pshufd $78,%xmm0,%xmm3 + pxor %xmm0,%xmm3 +.byte 102,15,58,68,194,0 +.byte 102,15,58,68,202,17 +.byte 102,15,58,68,222,0 + pxor %xmm0,%xmm3 + pxor %xmm1,%xmm3 + + movdqa %xmm3,%xmm4 + psrldq $8,%xmm3 + pslldq $8,%xmm4 + pxor %xmm3,%xmm1 + pxor %xmm4,%xmm0 + + movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 + psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 + pxor %xmm3,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 + + + movdqa %xmm0,%xmm4 + psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 + pshufd $78,%xmm2,%xmm3 + pshufd $78,%xmm0,%xmm4 + pxor %xmm2,%xmm3 + movdqu %xmm2,0(%rdi) + pxor %xmm0,%xmm4 + movdqu %xmm0,16(%rdi) +.byte 102,15,58,15,227,8 + movdqu %xmm4,32(%rdi) + movdqa %xmm0,%xmm1 + pshufd $78,%xmm0,%xmm3 + pxor %xmm0,%xmm3 +.byte 102,15,58,68,194,0 +.byte 102,15,58,68,202,17 +.byte 102,15,58,68,222,0 + pxor %xmm0,%xmm3 + pxor %xmm1,%xmm3 + + movdqa %xmm3,%xmm4 + psrldq $8,%xmm3 + pslldq $8,%xmm4 + pxor %xmm3,%xmm1 + pxor %xmm4,%xmm0 + + movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 + psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 + pxor %xmm3,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 + + + movdqa %xmm0,%xmm4 + psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 + movdqa %xmm0,%xmm5 + movdqa %xmm0,%xmm1 + pshufd $78,%xmm0,%xmm3 + pxor %xmm0,%xmm3 +.byte 102,15,58,68,194,0 +.byte 102,15,58,68,202,17 +.byte 102,15,58,68,222,0 + pxor %xmm0,%xmm3 + pxor %xmm1,%xmm3 + + movdqa %xmm3,%xmm4 + psrldq $8,%xmm3 + pslldq $8,%xmm4 + pxor %xmm3,%xmm1 + pxor %xmm4,%xmm0 + + movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 + psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 + pxor %xmm3,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 + + + movdqa %xmm0,%xmm4 + psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 + pshufd $78,%xmm5,%xmm3 + pshufd $78,%xmm0,%xmm4 + pxor %xmm5,%xmm3 + movdqu %xmm5,48(%rdi) + pxor %xmm0,%xmm4 + movdqu %xmm0,64(%rdi) +.byte 102,15,58,15,227,8 + movdqu %xmm4,80(%rdi) + ret + + + +.globl _gcm_ghash_clmul +.private_extern _gcm_ghash_clmul + +.p2align 5 +_gcm_ghash_clmul: + + +_CET_ENDBR +L$_ghash_clmul: + movdqa L$bswap_mask(%rip),%xmm10 + + movdqu (%rdi),%xmm0 + movdqu (%rsi),%xmm2 + movdqu 32(%rsi),%xmm7 +.byte 102,65,15,56,0,194 + + subq $0x10,%rcx + jz L$odd_tail + + movdqu 16(%rsi),%xmm6 + cmpq $0x30,%rcx + jb L$skip4x + + subq $0x30,%rcx + movq $0xA040608020C0E000,%rax + movdqu 48(%rsi),%xmm14 + movdqu 64(%rsi),%xmm15 + + + + + movdqu 48(%rdx),%xmm3 + movdqu 32(%rdx),%xmm11 +.byte 102,65,15,56,0,218 +.byte 102,69,15,56,0,218 + movdqa %xmm3,%xmm5 + pshufd $78,%xmm3,%xmm4 + 
pxor %xmm3,%xmm4 +.byte 102,15,58,68,218,0 +.byte 102,15,58,68,234,17 +.byte 102,15,58,68,231,0 + + movdqa %xmm11,%xmm13 + pshufd $78,%xmm11,%xmm12 + pxor %xmm11,%xmm12 +.byte 102,68,15,58,68,222,0 +.byte 102,68,15,58,68,238,17 +.byte 102,68,15,58,68,231,16 + xorps %xmm11,%xmm3 + xorps %xmm13,%xmm5 + movups 80(%rsi),%xmm7 + xorps %xmm12,%xmm4 + + movdqu 16(%rdx),%xmm11 + movdqu 0(%rdx),%xmm8 +.byte 102,69,15,56,0,218 +.byte 102,69,15,56,0,194 + movdqa %xmm11,%xmm13 + pshufd $78,%xmm11,%xmm12 + pxor %xmm8,%xmm0 + pxor %xmm11,%xmm12 +.byte 102,69,15,58,68,222,0 + movdqa %xmm0,%xmm1 + pshufd $78,%xmm0,%xmm8 + pxor %xmm0,%xmm8 +.byte 102,69,15,58,68,238,17 +.byte 102,68,15,58,68,231,0 + xorps %xmm11,%xmm3 + xorps %xmm13,%xmm5 + + leaq 64(%rdx),%rdx + subq $0x40,%rcx + jc L$tail4x + + jmp L$mod4_loop +.p2align 5 +L$mod4_loop: +.byte 102,65,15,58,68,199,0 + xorps %xmm12,%xmm4 + movdqu 48(%rdx),%xmm11 +.byte 102,69,15,56,0,218 +.byte 102,65,15,58,68,207,17 + xorps %xmm3,%xmm0 + movdqu 32(%rdx),%xmm3 + movdqa %xmm11,%xmm13 +.byte 102,68,15,58,68,199,16 + pshufd $78,%xmm11,%xmm12 + xorps %xmm5,%xmm1 + pxor %xmm11,%xmm12 +.byte 102,65,15,56,0,218 + movups 32(%rsi),%xmm7 + xorps %xmm4,%xmm8 +.byte 102,68,15,58,68,218,0 + pshufd $78,%xmm3,%xmm4 + + pxor %xmm0,%xmm8 + movdqa %xmm3,%xmm5 + pxor %xmm1,%xmm8 + pxor %xmm3,%xmm4 + movdqa %xmm8,%xmm9 +.byte 102,68,15,58,68,234,17 + pslldq $8,%xmm8 + psrldq $8,%xmm9 + pxor %xmm8,%xmm0 + movdqa L$7_mask(%rip),%xmm8 + pxor %xmm9,%xmm1 +.byte 102,76,15,110,200 + + pand %xmm0,%xmm8 +.byte 102,69,15,56,0,200 + pxor %xmm0,%xmm9 +.byte 102,68,15,58,68,231,0 + psllq $57,%xmm9 + movdqa %xmm9,%xmm8 + pslldq $8,%xmm9 +.byte 102,15,58,68,222,0 + psrldq $8,%xmm8 + pxor %xmm9,%xmm0 + pxor %xmm8,%xmm1 + movdqu 0(%rdx),%xmm8 + + movdqa %xmm0,%xmm9 + psrlq $1,%xmm0 +.byte 102,15,58,68,238,17 + xorps %xmm11,%xmm3 + movdqu 16(%rdx),%xmm11 +.byte 102,69,15,56,0,218 +.byte 102,15,58,68,231,16 + xorps %xmm13,%xmm5 + movups 80(%rsi),%xmm7 +.byte 102,69,15,56,0,194 + pxor %xmm9,%xmm1 + pxor %xmm0,%xmm9 + psrlq $5,%xmm0 + + movdqa %xmm11,%xmm13 + pxor %xmm12,%xmm4 + pshufd $78,%xmm11,%xmm12 + pxor %xmm9,%xmm0 + pxor %xmm8,%xmm1 + pxor %xmm11,%xmm12 +.byte 102,69,15,58,68,222,0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 + movdqa %xmm0,%xmm1 +.byte 102,69,15,58,68,238,17 + xorps %xmm11,%xmm3 + pshufd $78,%xmm0,%xmm8 + pxor %xmm0,%xmm8 + +.byte 102,68,15,58,68,231,0 + xorps %xmm13,%xmm5 + + leaq 64(%rdx),%rdx + subq $0x40,%rcx + jnc L$mod4_loop + +L$tail4x: +.byte 102,65,15,58,68,199,0 +.byte 102,65,15,58,68,207,17 +.byte 102,68,15,58,68,199,16 + xorps %xmm12,%xmm4 + xorps %xmm3,%xmm0 + xorps %xmm5,%xmm1 + pxor %xmm0,%xmm1 + pxor %xmm4,%xmm8 + + pxor %xmm1,%xmm8 + pxor %xmm0,%xmm1 + + movdqa %xmm8,%xmm9 + psrldq $8,%xmm8 + pslldq $8,%xmm9 + pxor %xmm8,%xmm1 + pxor %xmm9,%xmm0 + + movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 + psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 + pxor %xmm3,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 + + + movdqa %xmm0,%xmm4 + psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 + addq $0x40,%rcx + jz L$done + movdqu 32(%rsi),%xmm7 + subq $0x10,%rcx + jz L$odd_tail +L$skip4x: + + + + + + movdqu (%rdx),%xmm8 + movdqu 16(%rdx),%xmm3 +.byte 102,69,15,56,0,194 +.byte 102,65,15,56,0,218 + pxor %xmm8,%xmm0 + + movdqa %xmm3,%xmm5 + pshufd $78,%xmm3,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,68,218,0 +.byte 102,15,58,68,234,17 
+.byte 102,15,58,68,231,0 + + leaq 32(%rdx),%rdx + nop + subq $0x20,%rcx + jbe L$even_tail + nop + jmp L$mod_loop + +.p2align 5 +L$mod_loop: + movdqa %xmm0,%xmm1 + movdqa %xmm4,%xmm8 + pshufd $78,%xmm0,%xmm4 + pxor %xmm0,%xmm4 + +.byte 102,15,58,68,198,0 +.byte 102,15,58,68,206,17 +.byte 102,15,58,68,231,16 + + pxor %xmm3,%xmm0 + pxor %xmm5,%xmm1 + movdqu (%rdx),%xmm9 + pxor %xmm0,%xmm8 +.byte 102,69,15,56,0,202 + movdqu 16(%rdx),%xmm3 + + pxor %xmm1,%xmm8 + pxor %xmm9,%xmm1 + pxor %xmm8,%xmm4 +.byte 102,65,15,56,0,218 + movdqa %xmm4,%xmm8 + psrldq $8,%xmm8 + pslldq $8,%xmm4 + pxor %xmm8,%xmm1 + pxor %xmm4,%xmm0 + + movdqa %xmm3,%xmm5 + + movdqa %xmm0,%xmm9 + movdqa %xmm0,%xmm8 + psllq $5,%xmm0 + pxor %xmm0,%xmm8 +.byte 102,15,58,68,218,0 + psllq $1,%xmm0 + pxor %xmm8,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm8 + pslldq $8,%xmm0 + psrldq $8,%xmm8 + pxor %xmm9,%xmm0 + pshufd $78,%xmm5,%xmm4 + pxor %xmm8,%xmm1 + pxor %xmm5,%xmm4 + + movdqa %xmm0,%xmm9 + psrlq $1,%xmm0 +.byte 102,15,58,68,234,17 + pxor %xmm9,%xmm1 + pxor %xmm0,%xmm9 + psrlq $5,%xmm0 + pxor %xmm9,%xmm0 + leaq 32(%rdx),%rdx + psrlq $1,%xmm0 +.byte 102,15,58,68,231,0 + pxor %xmm1,%xmm0 + + subq $0x20,%rcx + ja L$mod_loop + +L$even_tail: + movdqa %xmm0,%xmm1 + movdqa %xmm4,%xmm8 + pshufd $78,%xmm0,%xmm4 + pxor %xmm0,%xmm4 + +.byte 102,15,58,68,198,0 +.byte 102,15,58,68,206,17 +.byte 102,15,58,68,231,16 + + pxor %xmm3,%xmm0 + pxor %xmm5,%xmm1 + pxor %xmm0,%xmm8 + pxor %xmm1,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm8 + psrldq $8,%xmm8 + pslldq $8,%xmm4 + pxor %xmm8,%xmm1 + pxor %xmm4,%xmm0 + + movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 + psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 + pxor %xmm3,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 + + + movdqa %xmm0,%xmm4 + psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 + testq %rcx,%rcx + jnz L$done + +L$odd_tail: + movdqu (%rdx),%xmm8 +.byte 102,69,15,56,0,194 + pxor %xmm8,%xmm0 + movdqa %xmm0,%xmm1 + pshufd $78,%xmm0,%xmm3 + pxor %xmm0,%xmm3 +.byte 102,15,58,68,194,0 +.byte 102,15,58,68,202,17 +.byte 102,15,58,68,223,0 + pxor %xmm0,%xmm3 + pxor %xmm1,%xmm3 + + movdqa %xmm3,%xmm4 + psrldq $8,%xmm3 + pslldq $8,%xmm4 + pxor %xmm3,%xmm1 + pxor %xmm4,%xmm0 + + movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 + psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 + pxor %xmm3,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 + + + movdqa %xmm0,%xmm4 + psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 +L$done: +.byte 102,65,15,56,0,194 + movdqu %xmm0,(%rdi) + ret + + + +.globl _gcm_init_avx +.private_extern _gcm_init_avx + +.p2align 5 +_gcm_init_avx: + + +_CET_ENDBR + vzeroupper + + vmovdqu (%rsi),%xmm2 + vpshufd $78,%xmm2,%xmm2 + + + vpshufd $255,%xmm2,%xmm4 + vpsrlq $63,%xmm2,%xmm3 + vpsllq $1,%xmm2,%xmm2 + vpxor %xmm5,%xmm5,%xmm5 + vpcmpgtd %xmm4,%xmm5,%xmm5 + vpslldq $8,%xmm3,%xmm3 + vpor %xmm3,%xmm2,%xmm2 + + + vpand L$0x1c2_polynomial(%rip),%xmm5,%xmm5 + vpxor %xmm5,%xmm2,%xmm2 + + vpunpckhqdq %xmm2,%xmm2,%xmm6 + vmovdqa %xmm2,%xmm0 + vpxor %xmm2,%xmm6,%xmm6 + movq $4,%r10 + jmp L$init_start_avx +.p2align 5 +L$init_loop_avx: + vpalignr $8,%xmm3,%xmm4,%xmm5 + vmovdqu %xmm5,-16(%rdi) + vpunpckhqdq %xmm0,%xmm0,%xmm3 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1 + vpclmulqdq 
$0x00,%xmm2,%xmm0,%xmm0 + vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3 + vpxor %xmm0,%xmm1,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + + vpslldq $8,%xmm3,%xmm4 + vpsrldq $8,%xmm3,%xmm3 + vpxor %xmm4,%xmm0,%xmm0 + vpxor %xmm3,%xmm1,%xmm1 + vpsllq $57,%xmm0,%xmm3 + vpsllq $62,%xmm0,%xmm4 + vpxor %xmm3,%xmm4,%xmm4 + vpsllq $63,%xmm0,%xmm3 + vpxor %xmm3,%xmm4,%xmm4 + vpslldq $8,%xmm4,%xmm3 + vpsrldq $8,%xmm4,%xmm4 + vpxor %xmm3,%xmm0,%xmm0 + vpxor %xmm4,%xmm1,%xmm1 + + vpsrlq $1,%xmm0,%xmm4 + vpxor %xmm0,%xmm1,%xmm1 + vpxor %xmm4,%xmm0,%xmm0 + vpsrlq $5,%xmm4,%xmm4 + vpxor %xmm4,%xmm0,%xmm0 + vpsrlq $1,%xmm0,%xmm0 + vpxor %xmm1,%xmm0,%xmm0 +L$init_start_avx: + vmovdqa %xmm0,%xmm5 + vpunpckhqdq %xmm0,%xmm0,%xmm3 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1 + vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0 + vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3 + vpxor %xmm0,%xmm1,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + + vpslldq $8,%xmm3,%xmm4 + vpsrldq $8,%xmm3,%xmm3 + vpxor %xmm4,%xmm0,%xmm0 + vpxor %xmm3,%xmm1,%xmm1 + vpsllq $57,%xmm0,%xmm3 + vpsllq $62,%xmm0,%xmm4 + vpxor %xmm3,%xmm4,%xmm4 + vpsllq $63,%xmm0,%xmm3 + vpxor %xmm3,%xmm4,%xmm4 + vpslldq $8,%xmm4,%xmm3 + vpsrldq $8,%xmm4,%xmm4 + vpxor %xmm3,%xmm0,%xmm0 + vpxor %xmm4,%xmm1,%xmm1 + + vpsrlq $1,%xmm0,%xmm4 + vpxor %xmm0,%xmm1,%xmm1 + vpxor %xmm4,%xmm0,%xmm0 + vpsrlq $5,%xmm4,%xmm4 + vpxor %xmm4,%xmm0,%xmm0 + vpsrlq $1,%xmm0,%xmm0 + vpxor %xmm1,%xmm0,%xmm0 + vpshufd $78,%xmm5,%xmm3 + vpshufd $78,%xmm0,%xmm4 + vpxor %xmm5,%xmm3,%xmm3 + vmovdqu %xmm5,0(%rdi) + vpxor %xmm0,%xmm4,%xmm4 + vmovdqu %xmm0,16(%rdi) + leaq 48(%rdi),%rdi + subq $1,%r10 + jnz L$init_loop_avx + + vpalignr $8,%xmm4,%xmm3,%xmm5 + vmovdqu %xmm5,-16(%rdi) + + vzeroupper + ret + + + +.globl _gcm_ghash_avx +.private_extern _gcm_ghash_avx + +.p2align 5 +_gcm_ghash_avx: + + +_CET_ENDBR + vzeroupper + + vmovdqu (%rdi),%xmm10 + leaq L$0x1c2_polynomial(%rip),%r10 + leaq 64(%rsi),%rsi + vmovdqu L$bswap_mask(%rip),%xmm13 + vpshufb %xmm13,%xmm10,%xmm10 + cmpq $0x80,%rcx + jb L$short_avx + subq $0x80,%rcx + + vmovdqu 112(%rdx),%xmm14 + vmovdqu 0-64(%rsi),%xmm6 + vpshufb %xmm13,%xmm14,%xmm14 + vmovdqu 32-64(%rsi),%xmm7 + + vpunpckhqdq %xmm14,%xmm14,%xmm9 + vmovdqu 96(%rdx),%xmm15 + vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 + vpxor %xmm14,%xmm9,%xmm9 + vpshufb %xmm13,%xmm15,%xmm15 + vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 + vmovdqu 16-64(%rsi),%xmm6 + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vmovdqu 80(%rdx),%xmm14 + vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 + vpxor %xmm15,%xmm8,%xmm8 + + vpshufb %xmm13,%xmm14,%xmm14 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 + vpunpckhqdq %xmm14,%xmm14,%xmm9 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 + vmovdqu 48-64(%rsi),%xmm6 + vpxor %xmm14,%xmm9,%xmm9 + vmovdqu 64(%rdx),%xmm15 + vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 + vmovdqu 80-64(%rsi),%xmm7 + + vpshufb %xmm13,%xmm15,%xmm15 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 + vpxor %xmm1,%xmm4,%xmm4 + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 + vmovdqu 64-64(%rsi),%xmm6 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 + vpxor %xmm15,%xmm8,%xmm8 + + vmovdqu 48(%rdx),%xmm14 + vpxor %xmm3,%xmm0,%xmm0 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 + vpxor %xmm4,%xmm1,%xmm1 + vpshufb %xmm13,%xmm14,%xmm14 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 + vmovdqu 96-64(%rsi),%xmm6 + vpxor %xmm5,%xmm2,%xmm2 + vpunpckhqdq %xmm14,%xmm14,%xmm9 + vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 + vmovdqu 128-64(%rsi),%xmm7 + vpxor %xmm14,%xmm9,%xmm9 + + vmovdqu 32(%rdx),%xmm15 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 + vpxor 
%xmm1,%xmm4,%xmm4 + vpshufb %xmm13,%xmm15,%xmm15 + vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 + vmovdqu 112-64(%rsi),%xmm6 + vpxor %xmm2,%xmm5,%xmm5 + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 + vpxor %xmm15,%xmm8,%xmm8 + + vmovdqu 16(%rdx),%xmm14 + vpxor %xmm3,%xmm0,%xmm0 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 + vpxor %xmm4,%xmm1,%xmm1 + vpshufb %xmm13,%xmm14,%xmm14 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 + vmovdqu 144-64(%rsi),%xmm6 + vpxor %xmm5,%xmm2,%xmm2 + vpunpckhqdq %xmm14,%xmm14,%xmm9 + vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 + vmovdqu 176-64(%rsi),%xmm7 + vpxor %xmm14,%xmm9,%xmm9 + + vmovdqu (%rdx),%xmm15 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 + vpxor %xmm1,%xmm4,%xmm4 + vpshufb %xmm13,%xmm15,%xmm15 + vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 + vmovdqu 160-64(%rsi),%xmm6 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2 + + leaq 128(%rdx),%rdx + cmpq $0x80,%rcx + jb L$tail_avx + + vpxor %xmm10,%xmm15,%xmm15 + subq $0x80,%rcx + jmp L$oop8x_avx + +.p2align 5 +L$oop8x_avx: + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vmovdqu 112(%rdx),%xmm14 + vpxor %xmm0,%xmm3,%xmm3 + vpxor %xmm15,%xmm8,%xmm8 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm10 + vpshufb %xmm13,%xmm14,%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm11 + vmovdqu 0-64(%rsi),%xmm6 + vpunpckhqdq %xmm14,%xmm14,%xmm9 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm8,%xmm12 + vmovdqu 32-64(%rsi),%xmm7 + vpxor %xmm14,%xmm9,%xmm9 + + vmovdqu 96(%rdx),%xmm15 + vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 + vpxor %xmm3,%xmm10,%xmm10 + vpshufb %xmm13,%xmm15,%xmm15 + vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 + vxorps %xmm4,%xmm11,%xmm11 + vmovdqu 16-64(%rsi),%xmm6 + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 + vpxor %xmm5,%xmm12,%xmm12 + vxorps %xmm15,%xmm8,%xmm8 + + vmovdqu 80(%rdx),%xmm14 + vpxor %xmm10,%xmm12,%xmm12 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 + vpxor %xmm11,%xmm12,%xmm12 + vpslldq $8,%xmm12,%xmm9 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 + vpsrldq $8,%xmm12,%xmm12 + vpxor %xmm9,%xmm10,%xmm10 + vmovdqu 48-64(%rsi),%xmm6 + vpshufb %xmm13,%xmm14,%xmm14 + vxorps %xmm12,%xmm11,%xmm11 + vpxor %xmm1,%xmm4,%xmm4 + vpunpckhqdq %xmm14,%xmm14,%xmm9 + vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 + vmovdqu 80-64(%rsi),%xmm7 + vpxor %xmm14,%xmm9,%xmm9 + vpxor %xmm2,%xmm5,%xmm5 + + vmovdqu 64(%rdx),%xmm15 + vpalignr $8,%xmm10,%xmm10,%xmm12 + vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 + vpshufb %xmm13,%xmm15,%xmm15 + vpxor %xmm3,%xmm0,%xmm0 + vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 + vmovdqu 64-64(%rsi),%xmm6 + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 + vxorps %xmm15,%xmm8,%xmm8 + vpxor %xmm5,%xmm2,%xmm2 + + vmovdqu 48(%rdx),%xmm14 + vpclmulqdq $0x10,(%r10),%xmm10,%xmm10 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 + vpshufb %xmm13,%xmm14,%xmm14 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 + vmovdqu 96-64(%rsi),%xmm6 + vpunpckhqdq %xmm14,%xmm14,%xmm9 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 + vmovdqu 128-64(%rsi),%xmm7 + vpxor %xmm14,%xmm9,%xmm9 + vpxor %xmm2,%xmm5,%xmm5 + + vmovdqu 32(%rdx),%xmm15 + vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 + vpshufb %xmm13,%xmm15,%xmm15 + vpxor %xmm3,%xmm0,%xmm0 + vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 + vmovdqu 112-64(%rsi),%xmm6 + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 + vpxor %xmm15,%xmm8,%xmm8 + vpxor %xmm5,%xmm2,%xmm2 + vxorps %xmm12,%xmm10,%xmm10 + + vmovdqu 16(%rdx),%xmm14 + vpalignr 
$8,%xmm10,%xmm10,%xmm12 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 + vpshufb %xmm13,%xmm14,%xmm14 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 + vmovdqu 144-64(%rsi),%xmm6 + vpclmulqdq $0x10,(%r10),%xmm10,%xmm10 + vxorps %xmm11,%xmm12,%xmm12 + vpunpckhqdq %xmm14,%xmm14,%xmm9 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 + vmovdqu 176-64(%rsi),%xmm7 + vpxor %xmm14,%xmm9,%xmm9 + vpxor %xmm2,%xmm5,%xmm5 + + vmovdqu (%rdx),%xmm15 + vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 + vpshufb %xmm13,%xmm15,%xmm15 + vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 + vmovdqu 160-64(%rsi),%xmm6 + vpxor %xmm12,%xmm15,%xmm15 + vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2 + vpxor %xmm10,%xmm15,%xmm15 + + leaq 128(%rdx),%rdx + subq $0x80,%rcx + jnc L$oop8x_avx + + addq $0x80,%rcx + jmp L$tail_no_xor_avx + +.p2align 5 +L$short_avx: + vmovdqu -16(%rdx,%rcx,1),%xmm14 + leaq (%rdx,%rcx,1),%rdx + vmovdqu 0-64(%rsi),%xmm6 + vmovdqu 32-64(%rsi),%xmm7 + vpshufb %xmm13,%xmm14,%xmm15 + + vmovdqa %xmm0,%xmm3 + vmovdqa %xmm1,%xmm4 + vmovdqa %xmm2,%xmm5 + subq $0x10,%rcx + jz L$tail_avx + + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 + vpxor %xmm15,%xmm8,%xmm8 + vmovdqu -32(%rdx),%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 + vmovdqu 16-64(%rsi),%xmm6 + vpshufb %xmm13,%xmm14,%xmm15 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 + vpsrldq $8,%xmm7,%xmm7 + subq $0x10,%rcx + jz L$tail_avx + + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 + vpxor %xmm15,%xmm8,%xmm8 + vmovdqu -48(%rdx),%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 + vmovdqu 48-64(%rsi),%xmm6 + vpshufb %xmm13,%xmm14,%xmm15 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 + vmovdqu 80-64(%rsi),%xmm7 + subq $0x10,%rcx + jz L$tail_avx + + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 + vpxor %xmm15,%xmm8,%xmm8 + vmovdqu -64(%rdx),%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 + vmovdqu 64-64(%rsi),%xmm6 + vpshufb %xmm13,%xmm14,%xmm15 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 + vpsrldq $8,%xmm7,%xmm7 + subq $0x10,%rcx + jz L$tail_avx + + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 + vpxor %xmm15,%xmm8,%xmm8 + vmovdqu -80(%rdx),%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 + vmovdqu 96-64(%rsi),%xmm6 + vpshufb %xmm13,%xmm14,%xmm15 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 + vmovdqu 128-64(%rsi),%xmm7 + subq $0x10,%rcx + jz L$tail_avx + + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 + vpxor %xmm15,%xmm8,%xmm8 + vmovdqu -96(%rdx),%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 + vmovdqu 112-64(%rsi),%xmm6 + vpshufb %xmm13,%xmm14,%xmm15 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 + vpsrldq $8,%xmm7,%xmm7 + subq $0x10,%rcx + jz L$tail_avx + + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 + vpxor %xmm15,%xmm8,%xmm8 + vmovdqu -112(%rdx),%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 + vmovdqu 144-64(%rsi),%xmm6 + vpshufb %xmm13,%xmm14,%xmm15 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 + vmovq 184-64(%rsi),%xmm7 + subq $0x10,%rcx + jmp L$tail_avx + +.p2align 5 +L$tail_avx: + vpxor %xmm10,%xmm15,%xmm15 +L$tail_no_xor_avx: 
+ vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 + vpxor %xmm15,%xmm8,%xmm8 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 + + vmovdqu (%r10),%xmm12 + + vpxor %xmm0,%xmm3,%xmm10 + vpxor %xmm1,%xmm4,%xmm11 + vpxor %xmm2,%xmm5,%xmm5 + + vpxor %xmm10,%xmm5,%xmm5 + vpxor %xmm11,%xmm5,%xmm5 + vpslldq $8,%xmm5,%xmm9 + vpsrldq $8,%xmm5,%xmm5 + vpxor %xmm9,%xmm10,%xmm10 + vpxor %xmm5,%xmm11,%xmm11 + + vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9 + vpalignr $8,%xmm10,%xmm10,%xmm10 + vpxor %xmm9,%xmm10,%xmm10 + + vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9 + vpalignr $8,%xmm10,%xmm10,%xmm10 + vpxor %xmm11,%xmm10,%xmm10 + vpxor %xmm9,%xmm10,%xmm10 + + cmpq $0,%rcx + jne L$short_avx + + vpshufb %xmm13,%xmm10,%xmm10 + vmovdqu %xmm10,(%rdi) + vzeroupper + ret + + + +.section __DATA,__const +.p2align 6 +L$bswap_mask: +.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 +L$0x1c2_polynomial: +.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 +L$7_mask: +.long 7,0,7,0 +.p2align 6 + +.byte 71,72,65,83,72,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.p2align 6 +.text +#endif diff --git a/ring-0.17.14/pregenerated/ghash-x86_64-nasm.asm b/ring-0.17.14/pregenerated/ghash-x86_64-nasm.asm new file mode 100644 index 0000000000..eb04eed2d6 --- /dev/null +++ b/ring-0.17.14/pregenerated/ghash-x86_64-nasm.asm @@ -0,0 +1,1277 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. + +%ifidn __OUTPUT_FORMAT__, win64 +default rel +%define XMMWORD +%define YMMWORD +%define ZMMWORD +%define _CET_ENDBR + +%include "ring_core_generated/prefix_symbols_nasm.inc" +section .text code align=64 + +global gcm_init_clmul + +ALIGN 16 +gcm_init_clmul: + +$L$SEH_begin_gcm_init_clmul_1: +_CET_ENDBR +$L$_init_clmul: + sub rsp,0x18 +$L$SEH_prologue_gcm_init_clmul_2: + movaps XMMWORD[rsp],xmm6 +$L$SEH_prologue_gcm_init_clmul_3: +$L$SEH_endprologue_gcm_init_clmul_4: + movdqu xmm2,XMMWORD[rdx] + pshufd xmm2,xmm2,78 + + + pshufd xmm4,xmm2,255 + movdqa xmm3,xmm2 + psllq xmm2,1 + pxor xmm5,xmm5 + psrlq xmm3,63 + pcmpgtd xmm5,xmm4 + pslldq xmm3,8 + por xmm2,xmm3 + + + pand xmm5,XMMWORD[$L$0x1c2_polynomial] + pxor xmm2,xmm5 + + + pshufd xmm6,xmm2,78 + movdqa xmm0,xmm2 + pxor xmm6,xmm2 + movdqa xmm1,xmm0 + pshufd xmm3,xmm0,78 + pxor xmm3,xmm0 +DB 102,15,58,68,194,0 +DB 102,15,58,68,202,17 +DB 102,15,58,68,222,0 + pxor xmm3,xmm0 + pxor xmm3,xmm1 + + movdqa xmm4,xmm3 + psrldq xmm3,8 + pslldq xmm4,8 + pxor xmm1,xmm3 + pxor xmm0,xmm4 + + movdqa xmm4,xmm0 + movdqa xmm3,xmm0 + psllq xmm0,5 + pxor xmm3,xmm0 + psllq xmm0,1 + pxor xmm0,xmm3 + psllq xmm0,57 + movdqa xmm3,xmm0 + pslldq xmm0,8 + psrldq xmm3,8 + pxor xmm0,xmm4 + pxor xmm1,xmm3 + + + movdqa xmm4,xmm0 + psrlq xmm0,1 + pxor xmm1,xmm4 + pxor xmm4,xmm0 + psrlq xmm0,5 + pxor xmm0,xmm4 + psrlq xmm0,1 + pxor xmm0,xmm1 + pshufd xmm3,xmm2,78 + pshufd xmm4,xmm0,78 + pxor xmm3,xmm2 + movdqu XMMWORD[rcx],xmm2 + pxor xmm4,xmm0 + movdqu XMMWORD[16+rcx],xmm0 +DB 102,15,58,15,227,8 + movdqu XMMWORD[32+rcx],xmm4 + movdqa xmm1,xmm0 + pshufd xmm3,xmm0,78 + pxor xmm3,xmm0 +DB 102,15,58,68,194,0 +DB 102,15,58,68,202,17 +DB 102,15,58,68,222,0 + pxor xmm3,xmm0 + pxor xmm3,xmm1 + + movdqa xmm4,xmm3 + psrldq xmm3,8 + pslldq xmm4,8 + pxor xmm1,xmm3 + pxor xmm0,xmm4 + + movdqa xmm4,xmm0 + movdqa xmm3,xmm0 + psllq xmm0,5 + pxor xmm3,xmm0 + psllq xmm0,1 + 
pxor xmm0,xmm3 + psllq xmm0,57 + movdqa xmm3,xmm0 + pslldq xmm0,8 + psrldq xmm3,8 + pxor xmm0,xmm4 + pxor xmm1,xmm3 + + + movdqa xmm4,xmm0 + psrlq xmm0,1 + pxor xmm1,xmm4 + pxor xmm4,xmm0 + psrlq xmm0,5 + pxor xmm0,xmm4 + psrlq xmm0,1 + pxor xmm0,xmm1 + movdqa xmm5,xmm0 + movdqa xmm1,xmm0 + pshufd xmm3,xmm0,78 + pxor xmm3,xmm0 +DB 102,15,58,68,194,0 +DB 102,15,58,68,202,17 +DB 102,15,58,68,222,0 + pxor xmm3,xmm0 + pxor xmm3,xmm1 + + movdqa xmm4,xmm3 + psrldq xmm3,8 + pslldq xmm4,8 + pxor xmm1,xmm3 + pxor xmm0,xmm4 + + movdqa xmm4,xmm0 + movdqa xmm3,xmm0 + psllq xmm0,5 + pxor xmm3,xmm0 + psllq xmm0,1 + pxor xmm0,xmm3 + psllq xmm0,57 + movdqa xmm3,xmm0 + pslldq xmm0,8 + psrldq xmm3,8 + pxor xmm0,xmm4 + pxor xmm1,xmm3 + + + movdqa xmm4,xmm0 + psrlq xmm0,1 + pxor xmm1,xmm4 + pxor xmm4,xmm0 + psrlq xmm0,5 + pxor xmm0,xmm4 + psrlq xmm0,1 + pxor xmm0,xmm1 + pshufd xmm3,xmm5,78 + pshufd xmm4,xmm0,78 + pxor xmm3,xmm5 + movdqu XMMWORD[48+rcx],xmm5 + pxor xmm4,xmm0 + movdqu XMMWORD[64+rcx],xmm0 +DB 102,15,58,15,227,8 + movdqu XMMWORD[80+rcx],xmm4 + movaps xmm6,XMMWORD[rsp] + lea rsp,[24+rsp] + ret + +$L$SEH_end_gcm_init_clmul_5: + +global gcm_ghash_clmul + +ALIGN 32 +gcm_ghash_clmul: + +$L$SEH_begin_gcm_ghash_clmul_1: +_CET_ENDBR +$L$_ghash_clmul: + lea rax,[((-136))+rsp] + lea rsp,[((-32))+rax] +$L$SEH_prologue_gcm_ghash_clmul_2: + movaps XMMWORD[(-32)+rax],xmm6 +$L$SEH_prologue_gcm_ghash_clmul_3: + movaps XMMWORD[(-16)+rax],xmm7 +$L$SEH_prologue_gcm_ghash_clmul_4: + movaps XMMWORD[rax],xmm8 +$L$SEH_prologue_gcm_ghash_clmul_5: + movaps XMMWORD[16+rax],xmm9 +$L$SEH_prologue_gcm_ghash_clmul_6: + movaps XMMWORD[32+rax],xmm10 +$L$SEH_prologue_gcm_ghash_clmul_7: + movaps XMMWORD[48+rax],xmm11 +$L$SEH_prologue_gcm_ghash_clmul_8: + movaps XMMWORD[64+rax],xmm12 +$L$SEH_prologue_gcm_ghash_clmul_9: + movaps XMMWORD[80+rax],xmm13 +$L$SEH_prologue_gcm_ghash_clmul_10: + movaps XMMWORD[96+rax],xmm14 +$L$SEH_prologue_gcm_ghash_clmul_11: + movaps XMMWORD[112+rax],xmm15 +$L$SEH_prologue_gcm_ghash_clmul_12: +$L$SEH_endprologue_gcm_ghash_clmul_13: + movdqa xmm10,XMMWORD[$L$bswap_mask] + + movdqu xmm0,XMMWORD[rcx] + movdqu xmm2,XMMWORD[rdx] + movdqu xmm7,XMMWORD[32+rdx] +DB 102,65,15,56,0,194 + + sub r9,0x10 + jz NEAR $L$odd_tail + + movdqu xmm6,XMMWORD[16+rdx] + cmp r9,0x30 + jb NEAR $L$skip4x + + sub r9,0x30 + mov rax,0xA040608020C0E000 + movdqu xmm14,XMMWORD[48+rdx] + movdqu xmm15,XMMWORD[64+rdx] + + + + + movdqu xmm3,XMMWORD[48+r8] + movdqu xmm11,XMMWORD[32+r8] +DB 102,65,15,56,0,218 +DB 102,69,15,56,0,218 + movdqa xmm5,xmm3 + pshufd xmm4,xmm3,78 + pxor xmm4,xmm3 +DB 102,15,58,68,218,0 +DB 102,15,58,68,234,17 +DB 102,15,58,68,231,0 + + movdqa xmm13,xmm11 + pshufd xmm12,xmm11,78 + pxor xmm12,xmm11 +DB 102,68,15,58,68,222,0 +DB 102,68,15,58,68,238,17 +DB 102,68,15,58,68,231,16 + xorps xmm3,xmm11 + xorps xmm5,xmm13 + movups xmm7,XMMWORD[80+rdx] + xorps xmm4,xmm12 + + movdqu xmm11,XMMWORD[16+r8] + movdqu xmm8,XMMWORD[r8] +DB 102,69,15,56,0,218 +DB 102,69,15,56,0,194 + movdqa xmm13,xmm11 + pshufd xmm12,xmm11,78 + pxor xmm0,xmm8 + pxor xmm12,xmm11 +DB 102,69,15,58,68,222,0 + movdqa xmm1,xmm0 + pshufd xmm8,xmm0,78 + pxor xmm8,xmm0 +DB 102,69,15,58,68,238,17 +DB 102,68,15,58,68,231,0 + xorps xmm3,xmm11 + xorps xmm5,xmm13 + + lea r8,[64+r8] + sub r9,0x40 + jc NEAR $L$tail4x + + jmp NEAR $L$mod4_loop +ALIGN 32 +$L$mod4_loop: +DB 102,65,15,58,68,199,0 + xorps xmm4,xmm12 + movdqu xmm11,XMMWORD[48+r8] +DB 102,69,15,56,0,218 +DB 102,65,15,58,68,207,17 + xorps xmm0,xmm3 + movdqu xmm3,XMMWORD[32+r8] + movdqa xmm13,xmm11 +DB 
102,68,15,58,68,199,16 + pshufd xmm12,xmm11,78 + xorps xmm1,xmm5 + pxor xmm12,xmm11 +DB 102,65,15,56,0,218 + movups xmm7,XMMWORD[32+rdx] + xorps xmm8,xmm4 +DB 102,68,15,58,68,218,0 + pshufd xmm4,xmm3,78 + + pxor xmm8,xmm0 + movdqa xmm5,xmm3 + pxor xmm8,xmm1 + pxor xmm4,xmm3 + movdqa xmm9,xmm8 +DB 102,68,15,58,68,234,17 + pslldq xmm8,8 + psrldq xmm9,8 + pxor xmm0,xmm8 + movdqa xmm8,XMMWORD[$L$7_mask] + pxor xmm1,xmm9 +DB 102,76,15,110,200 + + pand xmm8,xmm0 +DB 102,69,15,56,0,200 + pxor xmm9,xmm0 +DB 102,68,15,58,68,231,0 + psllq xmm9,57 + movdqa xmm8,xmm9 + pslldq xmm9,8 +DB 102,15,58,68,222,0 + psrldq xmm8,8 + pxor xmm0,xmm9 + pxor xmm1,xmm8 + movdqu xmm8,XMMWORD[r8] + + movdqa xmm9,xmm0 + psrlq xmm0,1 +DB 102,15,58,68,238,17 + xorps xmm3,xmm11 + movdqu xmm11,XMMWORD[16+r8] +DB 102,69,15,56,0,218 +DB 102,15,58,68,231,16 + xorps xmm5,xmm13 + movups xmm7,XMMWORD[80+rdx] +DB 102,69,15,56,0,194 + pxor xmm1,xmm9 + pxor xmm9,xmm0 + psrlq xmm0,5 + + movdqa xmm13,xmm11 + pxor xmm4,xmm12 + pshufd xmm12,xmm11,78 + pxor xmm0,xmm9 + pxor xmm1,xmm8 + pxor xmm12,xmm11 +DB 102,69,15,58,68,222,0 + psrlq xmm0,1 + pxor xmm0,xmm1 + movdqa xmm1,xmm0 +DB 102,69,15,58,68,238,17 + xorps xmm3,xmm11 + pshufd xmm8,xmm0,78 + pxor xmm8,xmm0 + +DB 102,68,15,58,68,231,0 + xorps xmm5,xmm13 + + lea r8,[64+r8] + sub r9,0x40 + jnc NEAR $L$mod4_loop + +$L$tail4x: +DB 102,65,15,58,68,199,0 +DB 102,65,15,58,68,207,17 +DB 102,68,15,58,68,199,16 + xorps xmm4,xmm12 + xorps xmm0,xmm3 + xorps xmm1,xmm5 + pxor xmm1,xmm0 + pxor xmm8,xmm4 + + pxor xmm8,xmm1 + pxor xmm1,xmm0 + + movdqa xmm9,xmm8 + psrldq xmm8,8 + pslldq xmm9,8 + pxor xmm1,xmm8 + pxor xmm0,xmm9 + + movdqa xmm4,xmm0 + movdqa xmm3,xmm0 + psllq xmm0,5 + pxor xmm3,xmm0 + psllq xmm0,1 + pxor xmm0,xmm3 + psllq xmm0,57 + movdqa xmm3,xmm0 + pslldq xmm0,8 + psrldq xmm3,8 + pxor xmm0,xmm4 + pxor xmm1,xmm3 + + + movdqa xmm4,xmm0 + psrlq xmm0,1 + pxor xmm1,xmm4 + pxor xmm4,xmm0 + psrlq xmm0,5 + pxor xmm0,xmm4 + psrlq xmm0,1 + pxor xmm0,xmm1 + add r9,0x40 + jz NEAR $L$done + movdqu xmm7,XMMWORD[32+rdx] + sub r9,0x10 + jz NEAR $L$odd_tail +$L$skip4x: + + + + + + movdqu xmm8,XMMWORD[r8] + movdqu xmm3,XMMWORD[16+r8] +DB 102,69,15,56,0,194 +DB 102,65,15,56,0,218 + pxor xmm0,xmm8 + + movdqa xmm5,xmm3 + pshufd xmm4,xmm3,78 + pxor xmm4,xmm3 +DB 102,15,58,68,218,0 +DB 102,15,58,68,234,17 +DB 102,15,58,68,231,0 + + lea r8,[32+r8] + nop + sub r9,0x20 + jbe NEAR $L$even_tail + nop + jmp NEAR $L$mod_loop + +ALIGN 32 +$L$mod_loop: + movdqa xmm1,xmm0 + movdqa xmm8,xmm4 + pshufd xmm4,xmm0,78 + pxor xmm4,xmm0 + +DB 102,15,58,68,198,0 +DB 102,15,58,68,206,17 +DB 102,15,58,68,231,16 + + pxor xmm0,xmm3 + pxor xmm1,xmm5 + movdqu xmm9,XMMWORD[r8] + pxor xmm8,xmm0 +DB 102,69,15,56,0,202 + movdqu xmm3,XMMWORD[16+r8] + + pxor xmm8,xmm1 + pxor xmm1,xmm9 + pxor xmm4,xmm8 +DB 102,65,15,56,0,218 + movdqa xmm8,xmm4 + psrldq xmm8,8 + pslldq xmm4,8 + pxor xmm1,xmm8 + pxor xmm0,xmm4 + + movdqa xmm5,xmm3 + + movdqa xmm9,xmm0 + movdqa xmm8,xmm0 + psllq xmm0,5 + pxor xmm8,xmm0 +DB 102,15,58,68,218,0 + psllq xmm0,1 + pxor xmm0,xmm8 + psllq xmm0,57 + movdqa xmm8,xmm0 + pslldq xmm0,8 + psrldq xmm8,8 + pxor xmm0,xmm9 + pshufd xmm4,xmm5,78 + pxor xmm1,xmm8 + pxor xmm4,xmm5 + + movdqa xmm9,xmm0 + psrlq xmm0,1 +DB 102,15,58,68,234,17 + pxor xmm1,xmm9 + pxor xmm9,xmm0 + psrlq xmm0,5 + pxor xmm0,xmm9 + lea r8,[32+r8] + psrlq xmm0,1 +DB 102,15,58,68,231,0 + pxor xmm0,xmm1 + + sub r9,0x20 + ja NEAR $L$mod_loop + +$L$even_tail: + movdqa xmm1,xmm0 + movdqa xmm8,xmm4 + pshufd xmm4,xmm0,78 + pxor xmm4,xmm0 + +DB 
102,15,58,68,198,0 +DB 102,15,58,68,206,17 +DB 102,15,58,68,231,16 + + pxor xmm0,xmm3 + pxor xmm1,xmm5 + pxor xmm8,xmm0 + pxor xmm8,xmm1 + pxor xmm4,xmm8 + movdqa xmm8,xmm4 + psrldq xmm8,8 + pslldq xmm4,8 + pxor xmm1,xmm8 + pxor xmm0,xmm4 + + movdqa xmm4,xmm0 + movdqa xmm3,xmm0 + psllq xmm0,5 + pxor xmm3,xmm0 + psllq xmm0,1 + pxor xmm0,xmm3 + psllq xmm0,57 + movdqa xmm3,xmm0 + pslldq xmm0,8 + psrldq xmm3,8 + pxor xmm0,xmm4 + pxor xmm1,xmm3 + + + movdqa xmm4,xmm0 + psrlq xmm0,1 + pxor xmm1,xmm4 + pxor xmm4,xmm0 + psrlq xmm0,5 + pxor xmm0,xmm4 + psrlq xmm0,1 + pxor xmm0,xmm1 + test r9,r9 + jnz NEAR $L$done + +$L$odd_tail: + movdqu xmm8,XMMWORD[r8] +DB 102,69,15,56,0,194 + pxor xmm0,xmm8 + movdqa xmm1,xmm0 + pshufd xmm3,xmm0,78 + pxor xmm3,xmm0 +DB 102,15,58,68,194,0 +DB 102,15,58,68,202,17 +DB 102,15,58,68,223,0 + pxor xmm3,xmm0 + pxor xmm3,xmm1 + + movdqa xmm4,xmm3 + psrldq xmm3,8 + pslldq xmm4,8 + pxor xmm1,xmm3 + pxor xmm0,xmm4 + + movdqa xmm4,xmm0 + movdqa xmm3,xmm0 + psllq xmm0,5 + pxor xmm3,xmm0 + psllq xmm0,1 + pxor xmm0,xmm3 + psllq xmm0,57 + movdqa xmm3,xmm0 + pslldq xmm0,8 + psrldq xmm3,8 + pxor xmm0,xmm4 + pxor xmm1,xmm3 + + + movdqa xmm4,xmm0 + psrlq xmm0,1 + pxor xmm1,xmm4 + pxor xmm4,xmm0 + psrlq xmm0,5 + pxor xmm0,xmm4 + psrlq xmm0,1 + pxor xmm0,xmm1 +$L$done: +DB 102,65,15,56,0,194 + movdqu XMMWORD[rcx],xmm0 + movaps xmm6,XMMWORD[rsp] + movaps xmm7,XMMWORD[16+rsp] + movaps xmm8,XMMWORD[32+rsp] + movaps xmm9,XMMWORD[48+rsp] + movaps xmm10,XMMWORD[64+rsp] + movaps xmm11,XMMWORD[80+rsp] + movaps xmm12,XMMWORD[96+rsp] + movaps xmm13,XMMWORD[112+rsp] + movaps xmm14,XMMWORD[128+rsp] + movaps xmm15,XMMWORD[144+rsp] + lea rsp,[168+rsp] + ret + +$L$SEH_end_gcm_ghash_clmul_14: + +global gcm_init_avx + +ALIGN 32 +gcm_init_avx: + +$L$SEH_begin_gcm_init_avx_1: +_CET_ENDBR + sub rsp,0x18 +$L$SEH_prologue_gcm_init_avx_2: + movaps XMMWORD[rsp],xmm6 +$L$SEH_prologue_gcm_init_avx_3: +$L$SEH_endprologue_gcm_init_avx_4: + vzeroupper + + vmovdqu xmm2,XMMWORD[rdx] + vpshufd xmm2,xmm2,78 + + + vpshufd xmm4,xmm2,255 + vpsrlq xmm3,xmm2,63 + vpsllq xmm2,xmm2,1 + vpxor xmm5,xmm5,xmm5 + vpcmpgtd xmm5,xmm5,xmm4 + vpslldq xmm3,xmm3,8 + vpor xmm2,xmm2,xmm3 + + + vpand xmm5,xmm5,XMMWORD[$L$0x1c2_polynomial] + vpxor xmm2,xmm2,xmm5 + + vpunpckhqdq xmm6,xmm2,xmm2 + vmovdqa xmm0,xmm2 + vpxor xmm6,xmm6,xmm2 + mov r10,4 + jmp NEAR $L$init_start_avx +ALIGN 32 +$L$init_loop_avx: + vpalignr xmm5,xmm4,xmm3,8 + vmovdqu XMMWORD[(-16)+rcx],xmm5 + vpunpckhqdq xmm3,xmm0,xmm0 + vpxor xmm3,xmm3,xmm0 + vpclmulqdq xmm1,xmm0,xmm2,0x11 + vpclmulqdq xmm0,xmm0,xmm2,0x00 + vpclmulqdq xmm3,xmm3,xmm6,0x00 + vpxor xmm4,xmm1,xmm0 + vpxor xmm3,xmm3,xmm4 + + vpslldq xmm4,xmm3,8 + vpsrldq xmm3,xmm3,8 + vpxor xmm0,xmm0,xmm4 + vpxor xmm1,xmm1,xmm3 + vpsllq xmm3,xmm0,57 + vpsllq xmm4,xmm0,62 + vpxor xmm4,xmm4,xmm3 + vpsllq xmm3,xmm0,63 + vpxor xmm4,xmm4,xmm3 + vpslldq xmm3,xmm4,8 + vpsrldq xmm4,xmm4,8 + vpxor xmm0,xmm0,xmm3 + vpxor xmm1,xmm1,xmm4 + + vpsrlq xmm4,xmm0,1 + vpxor xmm1,xmm1,xmm0 + vpxor xmm0,xmm0,xmm4 + vpsrlq xmm4,xmm4,5 + vpxor xmm0,xmm0,xmm4 + vpsrlq xmm0,xmm0,1 + vpxor xmm0,xmm0,xmm1 +$L$init_start_avx: + vmovdqa xmm5,xmm0 + vpunpckhqdq xmm3,xmm0,xmm0 + vpxor xmm3,xmm3,xmm0 + vpclmulqdq xmm1,xmm0,xmm2,0x11 + vpclmulqdq xmm0,xmm0,xmm2,0x00 + vpclmulqdq xmm3,xmm3,xmm6,0x00 + vpxor xmm4,xmm1,xmm0 + vpxor xmm3,xmm3,xmm4 + + vpslldq xmm4,xmm3,8 + vpsrldq xmm3,xmm3,8 + vpxor xmm0,xmm0,xmm4 + vpxor xmm1,xmm1,xmm3 + vpsllq xmm3,xmm0,57 + vpsllq xmm4,xmm0,62 + vpxor xmm4,xmm4,xmm3 + vpsllq xmm3,xmm0,63 + vpxor xmm4,xmm4,xmm3 + 
vpslldq xmm3,xmm4,8 + vpsrldq xmm4,xmm4,8 + vpxor xmm0,xmm0,xmm3 + vpxor xmm1,xmm1,xmm4 + + vpsrlq xmm4,xmm0,1 + vpxor xmm1,xmm1,xmm0 + vpxor xmm0,xmm0,xmm4 + vpsrlq xmm4,xmm4,5 + vpxor xmm0,xmm0,xmm4 + vpsrlq xmm0,xmm0,1 + vpxor xmm0,xmm0,xmm1 + vpshufd xmm3,xmm5,78 + vpshufd xmm4,xmm0,78 + vpxor xmm3,xmm3,xmm5 + vmovdqu XMMWORD[rcx],xmm5 + vpxor xmm4,xmm4,xmm0 + vmovdqu XMMWORD[16+rcx],xmm0 + lea rcx,[48+rcx] + sub r10,1 + jnz NEAR $L$init_loop_avx + + vpalignr xmm5,xmm3,xmm4,8 + vmovdqu XMMWORD[(-16)+rcx],xmm5 + + vzeroupper + movaps xmm6,XMMWORD[rsp] + lea rsp,[24+rsp] + ret +$L$SEH_end_gcm_init_avx_5: + + +global gcm_ghash_avx + +ALIGN 32 +gcm_ghash_avx: + +$L$SEH_begin_gcm_ghash_avx_1: +_CET_ENDBR + lea rax,[((-136))+rsp] + lea rsp,[((-32))+rax] +$L$SEH_prologue_gcm_ghash_avx_2: + movaps XMMWORD[(-32)+rax],xmm6 +$L$SEH_prologue_gcm_ghash_avx_3: + movaps XMMWORD[(-16)+rax],xmm7 +$L$SEH_prologue_gcm_ghash_avx_4: + movaps XMMWORD[rax],xmm8 +$L$SEH_prologue_gcm_ghash_avx_5: + movaps XMMWORD[16+rax],xmm9 +$L$SEH_prologue_gcm_ghash_avx_6: + movaps XMMWORD[32+rax],xmm10 +$L$SEH_prologue_gcm_ghash_avx_7: + movaps XMMWORD[48+rax],xmm11 +$L$SEH_prologue_gcm_ghash_avx_8: + movaps XMMWORD[64+rax],xmm12 +$L$SEH_prologue_gcm_ghash_avx_9: + movaps XMMWORD[80+rax],xmm13 +$L$SEH_prologue_gcm_ghash_avx_10: + movaps XMMWORD[96+rax],xmm14 +$L$SEH_prologue_gcm_ghash_avx_11: + movaps XMMWORD[112+rax],xmm15 +$L$SEH_prologue_gcm_ghash_avx_12: +$L$SEH_endprologue_gcm_ghash_avx_13: + vzeroupper + + vmovdqu xmm10,XMMWORD[rcx] + lea r10,[$L$0x1c2_polynomial] + lea rdx,[64+rdx] + vmovdqu xmm13,XMMWORD[$L$bswap_mask] + vpshufb xmm10,xmm10,xmm13 + cmp r9,0x80 + jb NEAR $L$short_avx + sub r9,0x80 + + vmovdqu xmm14,XMMWORD[112+r8] + vmovdqu xmm6,XMMWORD[((0-64))+rdx] + vpshufb xmm14,xmm14,xmm13 + vmovdqu xmm7,XMMWORD[((32-64))+rdx] + + vpunpckhqdq xmm9,xmm14,xmm14 + vmovdqu xmm15,XMMWORD[96+r8] + vpclmulqdq xmm0,xmm14,xmm6,0x00 + vpxor xmm9,xmm9,xmm14 + vpshufb xmm15,xmm15,xmm13 + vpclmulqdq xmm1,xmm14,xmm6,0x11 + vmovdqu xmm6,XMMWORD[((16-64))+rdx] + vpunpckhqdq xmm8,xmm15,xmm15 + vmovdqu xmm14,XMMWORD[80+r8] + vpclmulqdq xmm2,xmm9,xmm7,0x00 + vpxor xmm8,xmm8,xmm15 + + vpshufb xmm14,xmm14,xmm13 + vpclmulqdq xmm3,xmm15,xmm6,0x00 + vpunpckhqdq xmm9,xmm14,xmm14 + vpclmulqdq xmm4,xmm15,xmm6,0x11 + vmovdqu xmm6,XMMWORD[((48-64))+rdx] + vpxor xmm9,xmm9,xmm14 + vmovdqu xmm15,XMMWORD[64+r8] + vpclmulqdq xmm5,xmm8,xmm7,0x10 + vmovdqu xmm7,XMMWORD[((80-64))+rdx] + + vpshufb xmm15,xmm15,xmm13 + vpxor xmm3,xmm3,xmm0 + vpclmulqdq xmm0,xmm14,xmm6,0x00 + vpxor xmm4,xmm4,xmm1 + vpunpckhqdq xmm8,xmm15,xmm15 + vpclmulqdq xmm1,xmm14,xmm6,0x11 + vmovdqu xmm6,XMMWORD[((64-64))+rdx] + vpxor xmm5,xmm5,xmm2 + vpclmulqdq xmm2,xmm9,xmm7,0x00 + vpxor xmm8,xmm8,xmm15 + + vmovdqu xmm14,XMMWORD[48+r8] + vpxor xmm0,xmm0,xmm3 + vpclmulqdq xmm3,xmm15,xmm6,0x00 + vpxor xmm1,xmm1,xmm4 + vpshufb xmm14,xmm14,xmm13 + vpclmulqdq xmm4,xmm15,xmm6,0x11 + vmovdqu xmm6,XMMWORD[((96-64))+rdx] + vpxor xmm2,xmm2,xmm5 + vpunpckhqdq xmm9,xmm14,xmm14 + vpclmulqdq xmm5,xmm8,xmm7,0x10 + vmovdqu xmm7,XMMWORD[((128-64))+rdx] + vpxor xmm9,xmm9,xmm14 + + vmovdqu xmm15,XMMWORD[32+r8] + vpxor xmm3,xmm3,xmm0 + vpclmulqdq xmm0,xmm14,xmm6,0x00 + vpxor xmm4,xmm4,xmm1 + vpshufb xmm15,xmm15,xmm13 + vpclmulqdq xmm1,xmm14,xmm6,0x11 + vmovdqu xmm6,XMMWORD[((112-64))+rdx] + vpxor xmm5,xmm5,xmm2 + vpunpckhqdq xmm8,xmm15,xmm15 + vpclmulqdq xmm2,xmm9,xmm7,0x00 + vpxor xmm8,xmm8,xmm15 + + vmovdqu xmm14,XMMWORD[16+r8] + vpxor xmm0,xmm0,xmm3 + vpclmulqdq xmm3,xmm15,xmm6,0x00 + vpxor 
xmm1,xmm1,xmm4 + vpshufb xmm14,xmm14,xmm13 + vpclmulqdq xmm4,xmm15,xmm6,0x11 + vmovdqu xmm6,XMMWORD[((144-64))+rdx] + vpxor xmm2,xmm2,xmm5 + vpunpckhqdq xmm9,xmm14,xmm14 + vpclmulqdq xmm5,xmm8,xmm7,0x10 + vmovdqu xmm7,XMMWORD[((176-64))+rdx] + vpxor xmm9,xmm9,xmm14 + + vmovdqu xmm15,XMMWORD[r8] + vpxor xmm3,xmm3,xmm0 + vpclmulqdq xmm0,xmm14,xmm6,0x00 + vpxor xmm4,xmm4,xmm1 + vpshufb xmm15,xmm15,xmm13 + vpclmulqdq xmm1,xmm14,xmm6,0x11 + vmovdqu xmm6,XMMWORD[((160-64))+rdx] + vpxor xmm5,xmm5,xmm2 + vpclmulqdq xmm2,xmm9,xmm7,0x10 + + lea r8,[128+r8] + cmp r9,0x80 + jb NEAR $L$tail_avx + + vpxor xmm15,xmm15,xmm10 + sub r9,0x80 + jmp NEAR $L$oop8x_avx + +ALIGN 32 +$L$oop8x_avx: + vpunpckhqdq xmm8,xmm15,xmm15 + vmovdqu xmm14,XMMWORD[112+r8] + vpxor xmm3,xmm3,xmm0 + vpxor xmm8,xmm8,xmm15 + vpclmulqdq xmm10,xmm15,xmm6,0x00 + vpshufb xmm14,xmm14,xmm13 + vpxor xmm4,xmm4,xmm1 + vpclmulqdq xmm11,xmm15,xmm6,0x11 + vmovdqu xmm6,XMMWORD[((0-64))+rdx] + vpunpckhqdq xmm9,xmm14,xmm14 + vpxor xmm5,xmm5,xmm2 + vpclmulqdq xmm12,xmm8,xmm7,0x00 + vmovdqu xmm7,XMMWORD[((32-64))+rdx] + vpxor xmm9,xmm9,xmm14 + + vmovdqu xmm15,XMMWORD[96+r8] + vpclmulqdq xmm0,xmm14,xmm6,0x00 + vpxor xmm10,xmm10,xmm3 + vpshufb xmm15,xmm15,xmm13 + vpclmulqdq xmm1,xmm14,xmm6,0x11 + vxorps xmm11,xmm11,xmm4 + vmovdqu xmm6,XMMWORD[((16-64))+rdx] + vpunpckhqdq xmm8,xmm15,xmm15 + vpclmulqdq xmm2,xmm9,xmm7,0x00 + vpxor xmm12,xmm12,xmm5 + vxorps xmm8,xmm8,xmm15 + + vmovdqu xmm14,XMMWORD[80+r8] + vpxor xmm12,xmm12,xmm10 + vpclmulqdq xmm3,xmm15,xmm6,0x00 + vpxor xmm12,xmm12,xmm11 + vpslldq xmm9,xmm12,8 + vpxor xmm3,xmm3,xmm0 + vpclmulqdq xmm4,xmm15,xmm6,0x11 + vpsrldq xmm12,xmm12,8 + vpxor xmm10,xmm10,xmm9 + vmovdqu xmm6,XMMWORD[((48-64))+rdx] + vpshufb xmm14,xmm14,xmm13 + vxorps xmm11,xmm11,xmm12 + vpxor xmm4,xmm4,xmm1 + vpunpckhqdq xmm9,xmm14,xmm14 + vpclmulqdq xmm5,xmm8,xmm7,0x10 + vmovdqu xmm7,XMMWORD[((80-64))+rdx] + vpxor xmm9,xmm9,xmm14 + vpxor xmm5,xmm5,xmm2 + + vmovdqu xmm15,XMMWORD[64+r8] + vpalignr xmm12,xmm10,xmm10,8 + vpclmulqdq xmm0,xmm14,xmm6,0x00 + vpshufb xmm15,xmm15,xmm13 + vpxor xmm0,xmm0,xmm3 + vpclmulqdq xmm1,xmm14,xmm6,0x11 + vmovdqu xmm6,XMMWORD[((64-64))+rdx] + vpunpckhqdq xmm8,xmm15,xmm15 + vpxor xmm1,xmm1,xmm4 + vpclmulqdq xmm2,xmm9,xmm7,0x00 + vxorps xmm8,xmm8,xmm15 + vpxor xmm2,xmm2,xmm5 + + vmovdqu xmm14,XMMWORD[48+r8] + vpclmulqdq xmm10,xmm10,XMMWORD[r10],0x10 + vpclmulqdq xmm3,xmm15,xmm6,0x00 + vpshufb xmm14,xmm14,xmm13 + vpxor xmm3,xmm3,xmm0 + vpclmulqdq xmm4,xmm15,xmm6,0x11 + vmovdqu xmm6,XMMWORD[((96-64))+rdx] + vpunpckhqdq xmm9,xmm14,xmm14 + vpxor xmm4,xmm4,xmm1 + vpclmulqdq xmm5,xmm8,xmm7,0x10 + vmovdqu xmm7,XMMWORD[((128-64))+rdx] + vpxor xmm9,xmm9,xmm14 + vpxor xmm5,xmm5,xmm2 + + vmovdqu xmm15,XMMWORD[32+r8] + vpclmulqdq xmm0,xmm14,xmm6,0x00 + vpshufb xmm15,xmm15,xmm13 + vpxor xmm0,xmm0,xmm3 + vpclmulqdq xmm1,xmm14,xmm6,0x11 + vmovdqu xmm6,XMMWORD[((112-64))+rdx] + vpunpckhqdq xmm8,xmm15,xmm15 + vpxor xmm1,xmm1,xmm4 + vpclmulqdq xmm2,xmm9,xmm7,0x00 + vpxor xmm8,xmm8,xmm15 + vpxor xmm2,xmm2,xmm5 + vxorps xmm10,xmm10,xmm12 + + vmovdqu xmm14,XMMWORD[16+r8] + vpalignr xmm12,xmm10,xmm10,8 + vpclmulqdq xmm3,xmm15,xmm6,0x00 + vpshufb xmm14,xmm14,xmm13 + vpxor xmm3,xmm3,xmm0 + vpclmulqdq xmm4,xmm15,xmm6,0x11 + vmovdqu xmm6,XMMWORD[((144-64))+rdx] + vpclmulqdq xmm10,xmm10,XMMWORD[r10],0x10 + vxorps xmm12,xmm12,xmm11 + vpunpckhqdq xmm9,xmm14,xmm14 + vpxor xmm4,xmm4,xmm1 + vpclmulqdq xmm5,xmm8,xmm7,0x10 + vmovdqu xmm7,XMMWORD[((176-64))+rdx] + vpxor xmm9,xmm9,xmm14 + vpxor xmm5,xmm5,xmm2 + + vmovdqu xmm15,XMMWORD[r8] 
+ vpclmulqdq xmm0,xmm14,xmm6,0x00 + vpshufb xmm15,xmm15,xmm13 + vpclmulqdq xmm1,xmm14,xmm6,0x11 + vmovdqu xmm6,XMMWORD[((160-64))+rdx] + vpxor xmm15,xmm15,xmm12 + vpclmulqdq xmm2,xmm9,xmm7,0x10 + vpxor xmm15,xmm15,xmm10 + + lea r8,[128+r8] + sub r9,0x80 + jnc NEAR $L$oop8x_avx + + add r9,0x80 + jmp NEAR $L$tail_no_xor_avx + +ALIGN 32 +$L$short_avx: + vmovdqu xmm14,XMMWORD[((-16))+r9*1+r8] + lea r8,[r9*1+r8] + vmovdqu xmm6,XMMWORD[((0-64))+rdx] + vmovdqu xmm7,XMMWORD[((32-64))+rdx] + vpshufb xmm15,xmm14,xmm13 + + vmovdqa xmm3,xmm0 + vmovdqa xmm4,xmm1 + vmovdqa xmm5,xmm2 + sub r9,0x10 + jz NEAR $L$tail_avx + + vpunpckhqdq xmm8,xmm15,xmm15 + vpxor xmm3,xmm3,xmm0 + vpclmulqdq xmm0,xmm15,xmm6,0x00 + vpxor xmm8,xmm8,xmm15 + vmovdqu xmm14,XMMWORD[((-32))+r8] + vpxor xmm4,xmm4,xmm1 + vpclmulqdq xmm1,xmm15,xmm6,0x11 + vmovdqu xmm6,XMMWORD[((16-64))+rdx] + vpshufb xmm15,xmm14,xmm13 + vpxor xmm5,xmm5,xmm2 + vpclmulqdq xmm2,xmm8,xmm7,0x00 + vpsrldq xmm7,xmm7,8 + sub r9,0x10 + jz NEAR $L$tail_avx + + vpunpckhqdq xmm8,xmm15,xmm15 + vpxor xmm3,xmm3,xmm0 + vpclmulqdq xmm0,xmm15,xmm6,0x00 + vpxor xmm8,xmm8,xmm15 + vmovdqu xmm14,XMMWORD[((-48))+r8] + vpxor xmm4,xmm4,xmm1 + vpclmulqdq xmm1,xmm15,xmm6,0x11 + vmovdqu xmm6,XMMWORD[((48-64))+rdx] + vpshufb xmm15,xmm14,xmm13 + vpxor xmm5,xmm5,xmm2 + vpclmulqdq xmm2,xmm8,xmm7,0x00 + vmovdqu xmm7,XMMWORD[((80-64))+rdx] + sub r9,0x10 + jz NEAR $L$tail_avx + + vpunpckhqdq xmm8,xmm15,xmm15 + vpxor xmm3,xmm3,xmm0 + vpclmulqdq xmm0,xmm15,xmm6,0x00 + vpxor xmm8,xmm8,xmm15 + vmovdqu xmm14,XMMWORD[((-64))+r8] + vpxor xmm4,xmm4,xmm1 + vpclmulqdq xmm1,xmm15,xmm6,0x11 + vmovdqu xmm6,XMMWORD[((64-64))+rdx] + vpshufb xmm15,xmm14,xmm13 + vpxor xmm5,xmm5,xmm2 + vpclmulqdq xmm2,xmm8,xmm7,0x00 + vpsrldq xmm7,xmm7,8 + sub r9,0x10 + jz NEAR $L$tail_avx + + vpunpckhqdq xmm8,xmm15,xmm15 + vpxor xmm3,xmm3,xmm0 + vpclmulqdq xmm0,xmm15,xmm6,0x00 + vpxor xmm8,xmm8,xmm15 + vmovdqu xmm14,XMMWORD[((-80))+r8] + vpxor xmm4,xmm4,xmm1 + vpclmulqdq xmm1,xmm15,xmm6,0x11 + vmovdqu xmm6,XMMWORD[((96-64))+rdx] + vpshufb xmm15,xmm14,xmm13 + vpxor xmm5,xmm5,xmm2 + vpclmulqdq xmm2,xmm8,xmm7,0x00 + vmovdqu xmm7,XMMWORD[((128-64))+rdx] + sub r9,0x10 + jz NEAR $L$tail_avx + + vpunpckhqdq xmm8,xmm15,xmm15 + vpxor xmm3,xmm3,xmm0 + vpclmulqdq xmm0,xmm15,xmm6,0x00 + vpxor xmm8,xmm8,xmm15 + vmovdqu xmm14,XMMWORD[((-96))+r8] + vpxor xmm4,xmm4,xmm1 + vpclmulqdq xmm1,xmm15,xmm6,0x11 + vmovdqu xmm6,XMMWORD[((112-64))+rdx] + vpshufb xmm15,xmm14,xmm13 + vpxor xmm5,xmm5,xmm2 + vpclmulqdq xmm2,xmm8,xmm7,0x00 + vpsrldq xmm7,xmm7,8 + sub r9,0x10 + jz NEAR $L$tail_avx + + vpunpckhqdq xmm8,xmm15,xmm15 + vpxor xmm3,xmm3,xmm0 + vpclmulqdq xmm0,xmm15,xmm6,0x00 + vpxor xmm8,xmm8,xmm15 + vmovdqu xmm14,XMMWORD[((-112))+r8] + vpxor xmm4,xmm4,xmm1 + vpclmulqdq xmm1,xmm15,xmm6,0x11 + vmovdqu xmm6,XMMWORD[((144-64))+rdx] + vpshufb xmm15,xmm14,xmm13 + vpxor xmm5,xmm5,xmm2 + vpclmulqdq xmm2,xmm8,xmm7,0x00 + vmovq xmm7,QWORD[((184-64))+rdx] + sub r9,0x10 + jmp NEAR $L$tail_avx + +ALIGN 32 +$L$tail_avx: + vpxor xmm15,xmm15,xmm10 +$L$tail_no_xor_avx: + vpunpckhqdq xmm8,xmm15,xmm15 + vpxor xmm3,xmm3,xmm0 + vpclmulqdq xmm0,xmm15,xmm6,0x00 + vpxor xmm8,xmm8,xmm15 + vpxor xmm4,xmm4,xmm1 + vpclmulqdq xmm1,xmm15,xmm6,0x11 + vpxor xmm5,xmm5,xmm2 + vpclmulqdq xmm2,xmm8,xmm7,0x00 + + vmovdqu xmm12,XMMWORD[r10] + + vpxor xmm10,xmm3,xmm0 + vpxor xmm11,xmm4,xmm1 + vpxor xmm5,xmm5,xmm2 + + vpxor xmm5,xmm5,xmm10 + vpxor xmm5,xmm5,xmm11 + vpslldq xmm9,xmm5,8 + vpsrldq xmm5,xmm5,8 + vpxor xmm10,xmm10,xmm9 + vpxor xmm11,xmm11,xmm5 + + vpclmulqdq 
xmm9,xmm10,xmm12,0x10 + vpalignr xmm10,xmm10,xmm10,8 + vpxor xmm10,xmm10,xmm9 + + vpclmulqdq xmm9,xmm10,xmm12,0x10 + vpalignr xmm10,xmm10,xmm10,8 + vpxor xmm10,xmm10,xmm11 + vpxor xmm10,xmm10,xmm9 + + cmp r9,0 + jne NEAR $L$short_avx + + vpshufb xmm10,xmm10,xmm13 + vmovdqu XMMWORD[rcx],xmm10 + vzeroupper + movaps xmm6,XMMWORD[rsp] + movaps xmm7,XMMWORD[16+rsp] + movaps xmm8,XMMWORD[32+rsp] + movaps xmm9,XMMWORD[48+rsp] + movaps xmm10,XMMWORD[64+rsp] + movaps xmm11,XMMWORD[80+rsp] + movaps xmm12,XMMWORD[96+rsp] + movaps xmm13,XMMWORD[112+rsp] + movaps xmm14,XMMWORD[128+rsp] + movaps xmm15,XMMWORD[144+rsp] + lea rsp,[168+rsp] + ret + +$L$SEH_end_gcm_ghash_avx_14: + +section .rdata rdata align=8 +ALIGN 64 +$L$bswap_mask: + DB 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 +$L$0x1c2_polynomial: + DB 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 +$L$7_mask: + DD 7,0,7,0 +ALIGN 64 + + DB 71,72,65,83,72,32,102,111,114,32,120,56,54,95,54,52 + DB 44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32 + DB 60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111 + DB 114,103,62,0 +ALIGN 64 +section .text + +section .pdata rdata align=4 +ALIGN 4 + DD $L$SEH_begin_gcm_init_clmul_1 wrt ..imagebase + DD $L$SEH_end_gcm_init_clmul_5 wrt ..imagebase + DD $L$SEH_info_gcm_init_clmul_0 wrt ..imagebase + + DD $L$SEH_begin_gcm_ghash_clmul_1 wrt ..imagebase + DD $L$SEH_end_gcm_ghash_clmul_14 wrt ..imagebase + DD $L$SEH_info_gcm_ghash_clmul_0 wrt ..imagebase + + DD $L$SEH_begin_gcm_init_avx_1 wrt ..imagebase + DD $L$SEH_end_gcm_init_avx_5 wrt ..imagebase + DD $L$SEH_info_gcm_init_avx_0 wrt ..imagebase + + DD $L$SEH_begin_gcm_ghash_avx_1 wrt ..imagebase + DD $L$SEH_end_gcm_ghash_avx_14 wrt ..imagebase + DD $L$SEH_info_gcm_ghash_avx_0 wrt ..imagebase + + +section .xdata rdata align=8 +ALIGN 4 +$L$SEH_info_gcm_init_clmul_0: + DB 1 + DB $L$SEH_endprologue_gcm_init_clmul_4-$L$SEH_begin_gcm_init_clmul_1 + DB 3 + DB 0 + DB $L$SEH_prologue_gcm_init_clmul_3-$L$SEH_begin_gcm_init_clmul_1 + DB 104 + DW 0 + DB $L$SEH_prologue_gcm_init_clmul_2-$L$SEH_begin_gcm_init_clmul_1 + DB 34 + + DW 0 +$L$SEH_info_gcm_ghash_clmul_0: + DB 1 + DB $L$SEH_endprologue_gcm_ghash_clmul_13-$L$SEH_begin_gcm_ghash_clmul_1 + DB 22 + DB 0 + DB $L$SEH_prologue_gcm_ghash_clmul_12-$L$SEH_begin_gcm_ghash_clmul_1 + DB 248 + DW 9 + DB $L$SEH_prologue_gcm_ghash_clmul_11-$L$SEH_begin_gcm_ghash_clmul_1 + DB 232 + DW 8 + DB $L$SEH_prologue_gcm_ghash_clmul_10-$L$SEH_begin_gcm_ghash_clmul_1 + DB 216 + DW 7 + DB $L$SEH_prologue_gcm_ghash_clmul_9-$L$SEH_begin_gcm_ghash_clmul_1 + DB 200 + DW 6 + DB $L$SEH_prologue_gcm_ghash_clmul_8-$L$SEH_begin_gcm_ghash_clmul_1 + DB 184 + DW 5 + DB $L$SEH_prologue_gcm_ghash_clmul_7-$L$SEH_begin_gcm_ghash_clmul_1 + DB 168 + DW 4 + DB $L$SEH_prologue_gcm_ghash_clmul_6-$L$SEH_begin_gcm_ghash_clmul_1 + DB 152 + DW 3 + DB $L$SEH_prologue_gcm_ghash_clmul_5-$L$SEH_begin_gcm_ghash_clmul_1 + DB 136 + DW 2 + DB $L$SEH_prologue_gcm_ghash_clmul_4-$L$SEH_begin_gcm_ghash_clmul_1 + DB 120 + DW 1 + DB $L$SEH_prologue_gcm_ghash_clmul_3-$L$SEH_begin_gcm_ghash_clmul_1 + DB 104 + DW 0 + DB $L$SEH_prologue_gcm_ghash_clmul_2-$L$SEH_begin_gcm_ghash_clmul_1 + DB 1 + DW 21 + +$L$SEH_info_gcm_init_avx_0: + DB 1 + DB $L$SEH_endprologue_gcm_init_avx_4-$L$SEH_begin_gcm_init_avx_1 + DB 3 + DB 0 + DB $L$SEH_prologue_gcm_init_avx_3-$L$SEH_begin_gcm_init_avx_1 + DB 104 + DW 0 + DB $L$SEH_prologue_gcm_init_avx_2-$L$SEH_begin_gcm_init_avx_1 + DB 34 + + DW 0 +$L$SEH_info_gcm_ghash_avx_0: + DB 1 + DB $L$SEH_endprologue_gcm_ghash_avx_13-$L$SEH_begin_gcm_ghash_avx_1 + DB 22 + DB 0 + DB 
$L$SEH_prologue_gcm_ghash_avx_12-$L$SEH_begin_gcm_ghash_avx_1 + DB 248 + DW 9 + DB $L$SEH_prologue_gcm_ghash_avx_11-$L$SEH_begin_gcm_ghash_avx_1 + DB 232 + DW 8 + DB $L$SEH_prologue_gcm_ghash_avx_10-$L$SEH_begin_gcm_ghash_avx_1 + DB 216 + DW 7 + DB $L$SEH_prologue_gcm_ghash_avx_9-$L$SEH_begin_gcm_ghash_avx_1 + DB 200 + DW 6 + DB $L$SEH_prologue_gcm_ghash_avx_8-$L$SEH_begin_gcm_ghash_avx_1 + DB 184 + DW 5 + DB $L$SEH_prologue_gcm_ghash_avx_7-$L$SEH_begin_gcm_ghash_avx_1 + DB 168 + DW 4 + DB $L$SEH_prologue_gcm_ghash_avx_6-$L$SEH_begin_gcm_ghash_avx_1 + DB 152 + DW 3 + DB $L$SEH_prologue_gcm_ghash_avx_5-$L$SEH_begin_gcm_ghash_avx_1 + DB 136 + DW 2 + DB $L$SEH_prologue_gcm_ghash_avx_4-$L$SEH_begin_gcm_ghash_avx_1 + DB 120 + DW 1 + DB $L$SEH_prologue_gcm_ghash_avx_3-$L$SEH_begin_gcm_ghash_avx_1 + DB 104 + DW 0 + DB $L$SEH_prologue_gcm_ghash_avx_2-$L$SEH_begin_gcm_ghash_avx_1 + DB 1 + DW 21 +%else +; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 +ret +%endif diff --git a/ring-0.17.14/pregenerated/ghash-x86_64-nasm.o b/ring-0.17.14/pregenerated/ghash-x86_64-nasm.o new file mode 100644 index 0000000000..2908f94555 Binary files /dev/null and b/ring-0.17.14/pregenerated/ghash-x86_64-nasm.o differ diff --git a/ring-0.17.14/pregenerated/ghashv8-armx-ios64.S b/ring-0.17.14/pregenerated/ghashv8-armx-ios64.S new file mode 100644 index 0000000000..0747a5fc6e --- /dev/null +++ b/ring-0.17.14/pregenerated/ghashv8-armx-ios64.S @@ -0,0 +1,149 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__) +#if __ARM_MAX_ARCH__>=7 +.text + +.globl _gcm_init_clmul +.private_extern _gcm_init_clmul + +.align 4 +_gcm_init_clmul: + AARCH64_VALID_CALL_TARGET + ld1 {v17.2d},[x1] //load input H + movi v19.16b,#0xe1 + shl v19.2d,v19.2d,#57 //0xc2.0 + ext v3.16b,v17.16b,v17.16b,#8 + ushr v18.2d,v19.2d,#63 + dup v17.4s,v17.s[1] + ext v16.16b,v18.16b,v19.16b,#8 //t0=0xc2....01 + ushr v18.2d,v3.2d,#63 + sshr v17.4s,v17.4s,#31 //broadcast carry bit + and v18.16b,v18.16b,v16.16b + shl v3.2d,v3.2d,#1 + ext v18.16b,v18.16b,v18.16b,#8 + and v16.16b,v16.16b,v17.16b + orr v3.16b,v3.16b,v18.16b //H<<<=1 + eor v20.16b,v3.16b,v16.16b //twisted H + st1 {v20.2d},[x0],#16 //store Htable[0] + + //calculate H^2 + ext v16.16b,v20.16b,v20.16b,#8 //Karatsuba pre-processing + pmull v0.1q,v20.1d,v20.1d + eor v16.16b,v16.16b,v20.16b + pmull2 v2.1q,v20.2d,v20.2d + pmull v1.1q,v16.1d,v16.1d + + ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing + eor v18.16b,v0.16b,v2.16b + eor v1.16b,v1.16b,v17.16b + eor v1.16b,v1.16b,v18.16b + pmull v18.1q,v0.1d,v19.1d //1st phase + + ins v2.d[0],v1.d[1] + ins v1.d[1],v0.d[0] + eor v0.16b,v1.16b,v18.16b + + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase + pmull v0.1q,v0.1d,v19.1d + eor v18.16b,v18.16b,v2.16b + eor v22.16b,v0.16b,v18.16b + + ext v17.16b,v22.16b,v22.16b,#8 //Karatsuba pre-processing + eor v17.16b,v17.16b,v22.16b + ext v21.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed + st1 {v21.2d,v22.2d},[x0],#32 //store Htable[1..2] + //calculate H^3 and H^4 + pmull v0.1q,v20.1d, v22.1d + pmull v5.1q,v22.1d,v22.1d + pmull2 v2.1q,v20.2d, v22.2d + pmull2 v7.1q,v22.2d,v22.2d + pmull v1.1q,v16.1d,v17.1d + pmull v6.1q,v17.1d,v17.1d + + ext v16.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing + ext v17.16b,v5.16b,v7.16b,#8 + eor v18.16b,v0.16b,v2.16b + eor v1.16b,v1.16b,v16.16b + eor v4.16b,v5.16b,v7.16b + eor v6.16b,v6.16b,v17.16b + eor 
v1.16b,v1.16b,v18.16b + pmull v18.1q,v0.1d,v19.1d //1st phase + eor v6.16b,v6.16b,v4.16b + pmull v4.1q,v5.1d,v19.1d + + ins v2.d[0],v1.d[1] + ins v7.d[0],v6.d[1] + ins v1.d[1],v0.d[0] + ins v6.d[1],v5.d[0] + eor v0.16b,v1.16b,v18.16b + eor v5.16b,v6.16b,v4.16b + + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase + ext v4.16b,v5.16b,v5.16b,#8 + pmull v0.1q,v0.1d,v19.1d + pmull v5.1q,v5.1d,v19.1d + eor v18.16b,v18.16b,v2.16b + eor v4.16b,v4.16b,v7.16b + eor v20.16b, v0.16b,v18.16b //H^3 + eor v22.16b,v5.16b,v4.16b //H^4 + + ext v16.16b,v20.16b, v20.16b,#8 //Karatsuba pre-processing + ext v17.16b,v22.16b,v22.16b,#8 + eor v16.16b,v16.16b,v20.16b + eor v17.16b,v17.16b,v22.16b + ext v21.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed + st1 {v20.2d,v21.2d,v22.2d},[x0] //store Htable[3..5] + ret + +.globl _gcm_gmult_clmul +.private_extern _gcm_gmult_clmul + +.align 4 +_gcm_gmult_clmul: + AARCH64_VALID_CALL_TARGET + ld1 {v17.2d},[x0] //load Xi + movi v19.16b,#0xe1 + ld1 {v20.2d,v21.2d},[x1] //load twisted H, ... + shl v19.2d,v19.2d,#57 +#ifndef __AARCH64EB__ + rev64 v17.16b,v17.16b +#endif + ext v3.16b,v17.16b,v17.16b,#8 + + pmull v0.1q,v20.1d,v3.1d //H.lo·Xi.lo + eor v17.16b,v17.16b,v3.16b //Karatsuba pre-processing + pmull2 v2.1q,v20.2d,v3.2d //H.hi·Xi.hi + pmull v1.1q,v21.1d,v17.1d //(H.lo+H.hi)·(Xi.lo+Xi.hi) + + ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing + eor v18.16b,v0.16b,v2.16b + eor v1.16b,v1.16b,v17.16b + eor v1.16b,v1.16b,v18.16b + pmull v18.1q,v0.1d,v19.1d //1st phase of reduction + + ins v2.d[0],v1.d[1] + ins v1.d[1],v0.d[0] + eor v0.16b,v1.16b,v18.16b + + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction + pmull v0.1q,v0.1d,v19.1d + eor v18.16b,v18.16b,v2.16b + eor v0.16b,v0.16b,v18.16b + +#ifndef __AARCH64EB__ + rev64 v0.16b,v0.16b +#endif + ext v0.16b,v0.16b,v0.16b,#8 + st1 {v0.2d},[x0] //write out Xi + + ret + +.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 2 +#endif +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__) diff --git a/ring-0.17.14/pregenerated/ghashv8-armx-linux64.S b/ring-0.17.14/pregenerated/ghashv8-armx-linux64.S new file mode 100644 index 0000000000..4e811b7a5c --- /dev/null +++ b/ring-0.17.14/pregenerated/ghashv8-armx-linux64.S @@ -0,0 +1,149 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__) +#if __ARM_MAX_ARCH__>=7 +.text +.arch armv8-a+crypto +.globl gcm_init_clmul +.hidden gcm_init_clmul +.type gcm_init_clmul,%function +.align 4 +gcm_init_clmul: + AARCH64_VALID_CALL_TARGET + ld1 {v17.2d},[x1] //load input H + movi v19.16b,#0xe1 + shl v19.2d,v19.2d,#57 //0xc2.0 + ext v3.16b,v17.16b,v17.16b,#8 + ushr v18.2d,v19.2d,#63 + dup v17.4s,v17.s[1] + ext v16.16b,v18.16b,v19.16b,#8 //t0=0xc2....01 + ushr v18.2d,v3.2d,#63 + sshr v17.4s,v17.4s,#31 //broadcast carry bit + and v18.16b,v18.16b,v16.16b + shl v3.2d,v3.2d,#1 + ext v18.16b,v18.16b,v18.16b,#8 + and v16.16b,v16.16b,v17.16b + orr v3.16b,v3.16b,v18.16b //H<<<=1 + eor v20.16b,v3.16b,v16.16b //twisted H + st1 {v20.2d},[x0],#16 //store Htable[0] + + //calculate H^2 + ext v16.16b,v20.16b,v20.16b,#8 //Karatsuba pre-processing + pmull v0.1q,v20.1d,v20.1d + eor v16.16b,v16.16b,v20.16b + pmull2 v2.1q,v20.2d,v20.2d + pmull v1.1q,v16.1d,v16.1d + + ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing + eor v18.16b,v0.16b,v2.16b + eor v1.16b,v1.16b,v17.16b + eor v1.16b,v1.16b,v18.16b + pmull v18.1q,v0.1d,v19.1d //1st phase + + ins v2.d[0],v1.d[1] + ins v1.d[1],v0.d[0] + eor v0.16b,v1.16b,v18.16b + + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase + pmull v0.1q,v0.1d,v19.1d + eor v18.16b,v18.16b,v2.16b + eor v22.16b,v0.16b,v18.16b + + ext v17.16b,v22.16b,v22.16b,#8 //Karatsuba pre-processing + eor v17.16b,v17.16b,v22.16b + ext v21.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed + st1 {v21.2d,v22.2d},[x0],#32 //store Htable[1..2] + //calculate H^3 and H^4 + pmull v0.1q,v20.1d, v22.1d + pmull v5.1q,v22.1d,v22.1d + pmull2 v2.1q,v20.2d, v22.2d + pmull2 v7.1q,v22.2d,v22.2d + pmull v1.1q,v16.1d,v17.1d + pmull v6.1q,v17.1d,v17.1d + + ext v16.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing + ext v17.16b,v5.16b,v7.16b,#8 + eor v18.16b,v0.16b,v2.16b + eor v1.16b,v1.16b,v16.16b + eor v4.16b,v5.16b,v7.16b + eor v6.16b,v6.16b,v17.16b + eor v1.16b,v1.16b,v18.16b + pmull v18.1q,v0.1d,v19.1d //1st phase + eor v6.16b,v6.16b,v4.16b + pmull v4.1q,v5.1d,v19.1d + + ins v2.d[0],v1.d[1] + ins v7.d[0],v6.d[1] + ins v1.d[1],v0.d[0] + ins v6.d[1],v5.d[0] + eor v0.16b,v1.16b,v18.16b + eor v5.16b,v6.16b,v4.16b + + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase + ext v4.16b,v5.16b,v5.16b,#8 + pmull v0.1q,v0.1d,v19.1d + pmull v5.1q,v5.1d,v19.1d + eor v18.16b,v18.16b,v2.16b + eor v4.16b,v4.16b,v7.16b + eor v20.16b, v0.16b,v18.16b //H^3 + eor v22.16b,v5.16b,v4.16b //H^4 + + ext v16.16b,v20.16b, v20.16b,#8 //Karatsuba pre-processing + ext v17.16b,v22.16b,v22.16b,#8 + eor v16.16b,v16.16b,v20.16b + eor v17.16b,v17.16b,v22.16b + ext v21.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed + st1 {v20.2d,v21.2d,v22.2d},[x0] //store Htable[3..5] + ret +.size gcm_init_clmul,.-gcm_init_clmul +.globl gcm_gmult_clmul +.hidden gcm_gmult_clmul +.type gcm_gmult_clmul,%function +.align 4 +gcm_gmult_clmul: + AARCH64_VALID_CALL_TARGET + ld1 {v17.2d},[x0] //load Xi + movi v19.16b,#0xe1 + ld1 {v20.2d,v21.2d},[x1] //load twisted H, ... 
+ shl v19.2d,v19.2d,#57 +#ifndef __AARCH64EB__ + rev64 v17.16b,v17.16b +#endif + ext v3.16b,v17.16b,v17.16b,#8 + + pmull v0.1q,v20.1d,v3.1d //H.lo·Xi.lo + eor v17.16b,v17.16b,v3.16b //Karatsuba pre-processing + pmull2 v2.1q,v20.2d,v3.2d //H.hi·Xi.hi + pmull v1.1q,v21.1d,v17.1d //(H.lo+H.hi)·(Xi.lo+Xi.hi) + + ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing + eor v18.16b,v0.16b,v2.16b + eor v1.16b,v1.16b,v17.16b + eor v1.16b,v1.16b,v18.16b + pmull v18.1q,v0.1d,v19.1d //1st phase of reduction + + ins v2.d[0],v1.d[1] + ins v1.d[1],v0.d[0] + eor v0.16b,v1.16b,v18.16b + + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction + pmull v0.1q,v0.1d,v19.1d + eor v18.16b,v18.16b,v2.16b + eor v0.16b,v0.16b,v18.16b + +#ifndef __AARCH64EB__ + rev64 v0.16b,v0.16b +#endif + ext v0.16b,v0.16b,v0.16b,#8 + st1 {v0.2d},[x0] //write out Xi + + ret +.size gcm_gmult_clmul,.-gcm_gmult_clmul +.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 2 +#endif +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__) diff --git a/ring-0.17.14/pregenerated/ghashv8-armx-win64.S b/ring-0.17.14/pregenerated/ghashv8-armx-win64.S new file mode 100644 index 0000000000..86386ac7be --- /dev/null +++ b/ring-0.17.14/pregenerated/ghashv8-armx-win64.S @@ -0,0 +1,153 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32) +#if __ARM_MAX_ARCH__>=7 +.text +.arch armv8-a+crypto +.globl gcm_init_clmul + +.def gcm_init_clmul + .type 32 +.endef +.align 4 +gcm_init_clmul: + AARCH64_VALID_CALL_TARGET + ld1 {v17.2d},[x1] //load input H + movi v19.16b,#0xe1 + shl v19.2d,v19.2d,#57 //0xc2.0 + ext v3.16b,v17.16b,v17.16b,#8 + ushr v18.2d,v19.2d,#63 + dup v17.4s,v17.s[1] + ext v16.16b,v18.16b,v19.16b,#8 //t0=0xc2....01 + ushr v18.2d,v3.2d,#63 + sshr v17.4s,v17.4s,#31 //broadcast carry bit + and v18.16b,v18.16b,v16.16b + shl v3.2d,v3.2d,#1 + ext v18.16b,v18.16b,v18.16b,#8 + and v16.16b,v16.16b,v17.16b + orr v3.16b,v3.16b,v18.16b //H<<<=1 + eor v20.16b,v3.16b,v16.16b //twisted H + st1 {v20.2d},[x0],#16 //store Htable[0] + + //calculate H^2 + ext v16.16b,v20.16b,v20.16b,#8 //Karatsuba pre-processing + pmull v0.1q,v20.1d,v20.1d + eor v16.16b,v16.16b,v20.16b + pmull2 v2.1q,v20.2d,v20.2d + pmull v1.1q,v16.1d,v16.1d + + ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing + eor v18.16b,v0.16b,v2.16b + eor v1.16b,v1.16b,v17.16b + eor v1.16b,v1.16b,v18.16b + pmull v18.1q,v0.1d,v19.1d //1st phase + + ins v2.d[0],v1.d[1] + ins v1.d[1],v0.d[0] + eor v0.16b,v1.16b,v18.16b + + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase + pmull v0.1q,v0.1d,v19.1d + eor v18.16b,v18.16b,v2.16b + eor v22.16b,v0.16b,v18.16b + + ext v17.16b,v22.16b,v22.16b,#8 //Karatsuba pre-processing + eor v17.16b,v17.16b,v22.16b + ext v21.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed + st1 {v21.2d,v22.2d},[x0],#32 //store Htable[1..2] + //calculate H^3 and H^4 + pmull v0.1q,v20.1d, v22.1d + pmull v5.1q,v22.1d,v22.1d + pmull2 v2.1q,v20.2d, v22.2d + pmull2 v7.1q,v22.2d,v22.2d + pmull v1.1q,v16.1d,v17.1d + pmull v6.1q,v17.1d,v17.1d + + ext v16.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing + ext v17.16b,v5.16b,v7.16b,#8 + eor v18.16b,v0.16b,v2.16b + eor v1.16b,v1.16b,v16.16b + eor v4.16b,v5.16b,v7.16b + eor v6.16b,v6.16b,v17.16b + eor v1.16b,v1.16b,v18.16b + pmull 
v18.1q,v0.1d,v19.1d //1st phase + eor v6.16b,v6.16b,v4.16b + pmull v4.1q,v5.1d,v19.1d + + ins v2.d[0],v1.d[1] + ins v7.d[0],v6.d[1] + ins v1.d[1],v0.d[0] + ins v6.d[1],v5.d[0] + eor v0.16b,v1.16b,v18.16b + eor v5.16b,v6.16b,v4.16b + + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase + ext v4.16b,v5.16b,v5.16b,#8 + pmull v0.1q,v0.1d,v19.1d + pmull v5.1q,v5.1d,v19.1d + eor v18.16b,v18.16b,v2.16b + eor v4.16b,v4.16b,v7.16b + eor v20.16b, v0.16b,v18.16b //H^3 + eor v22.16b,v5.16b,v4.16b //H^4 + + ext v16.16b,v20.16b, v20.16b,#8 //Karatsuba pre-processing + ext v17.16b,v22.16b,v22.16b,#8 + eor v16.16b,v16.16b,v20.16b + eor v17.16b,v17.16b,v22.16b + ext v21.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed + st1 {v20.2d,v21.2d,v22.2d},[x0] //store Htable[3..5] + ret + +.globl gcm_gmult_clmul + +.def gcm_gmult_clmul + .type 32 +.endef +.align 4 +gcm_gmult_clmul: + AARCH64_VALID_CALL_TARGET + ld1 {v17.2d},[x0] //load Xi + movi v19.16b,#0xe1 + ld1 {v20.2d,v21.2d},[x1] //load twisted H, ... + shl v19.2d,v19.2d,#57 +#ifndef __AARCH64EB__ + rev64 v17.16b,v17.16b +#endif + ext v3.16b,v17.16b,v17.16b,#8 + + pmull v0.1q,v20.1d,v3.1d //H.lo·Xi.lo + eor v17.16b,v17.16b,v3.16b //Karatsuba pre-processing + pmull2 v2.1q,v20.2d,v3.2d //H.hi·Xi.hi + pmull v1.1q,v21.1d,v17.1d //(H.lo+H.hi)·(Xi.lo+Xi.hi) + + ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing + eor v18.16b,v0.16b,v2.16b + eor v1.16b,v1.16b,v17.16b + eor v1.16b,v1.16b,v18.16b + pmull v18.1q,v0.1d,v19.1d //1st phase of reduction + + ins v2.d[0],v1.d[1] + ins v1.d[1],v0.d[0] + eor v0.16b,v1.16b,v18.16b + + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction + pmull v0.1q,v0.1d,v19.1d + eor v18.16b,v18.16b,v2.16b + eor v0.16b,v0.16b,v18.16b + +#ifndef __AARCH64EB__ + rev64 v0.16b,v0.16b +#endif + ext v0.16b,v0.16b,v0.16b,#8 + st1 {v0.2d},[x0] //write out Xi + + ret + +.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 2 +#endif +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32) diff --git a/ring-0.17.14/pregenerated/p256-armv8-asm-ios64.S b/ring-0.17.14/pregenerated/p256-armv8-asm-ios64.S new file mode 100644 index 0000000000..88355ca462 --- /dev/null +++ b/ring-0.17.14/pregenerated/p256-armv8-asm-ios64.S @@ -0,0 +1,1608 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__) +.section __TEXT,__const +.align 5 +Lpoly: +.quad 0xffffffffffffffff,0x00000000ffffffff,0x0000000000000000,0xffffffff00000001 +LRR: // 2^512 mod P precomputed for NIST P256 polynomial +.quad 0x0000000000000003,0xfffffffbffffffff,0xfffffffffffffffe,0x00000004fffffffd +Lone_mont: +.quad 0x0000000000000001,0xffffffff00000000,0xffffffffffffffff,0x00000000fffffffe +Lone: +.quad 1,0,0,0 +Lord: +.quad 0xf3b9cac2fc632551,0xbce6faada7179e84,0xffffffffffffffff,0xffffffff00000000 +LordK: +.quad 0xccd1c8aaee00bc4f +.byte 69,67,80,95,78,73,83,84,90,50,53,54,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.text + +// void ecp_nistz256_mul_mont(BN_ULONG x0[4],const BN_ULONG x1[4], +// const BN_ULONG x2[4]); +.globl _ecp_nistz256_mul_mont +.private_extern _ecp_nistz256_mul_mont + +.align 4 +_ecp_nistz256_mul_mont: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-32]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + + ldr x3,[x2] // bp[0] + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + adrp x13,Lpoly@PAGE + add x13,x13,Lpoly@PAGEOFF + ldr x12,[x13,#8] + ldr x13,[x13,#24] + + bl __ecp_nistz256_mul_mont + + ldp x19,x20,[sp,#16] + ldp x29,x30,[sp],#32 + AARCH64_VALIDATE_LINK_REGISTER + ret + + +// void ecp_nistz256_sqr_mont(BN_ULONG x0[4],const BN_ULONG x1[4]); +.globl _ecp_nistz256_sqr_mont +.private_extern _ecp_nistz256_sqr_mont + +.align 4 +_ecp_nistz256_sqr_mont: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-32]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + adrp x13,Lpoly@PAGE + add x13,x13,Lpoly@PAGEOFF + ldr x12,[x13,#8] + ldr x13,[x13,#24] + + bl __ecp_nistz256_sqr_mont + + ldp x19,x20,[sp,#16] + ldp x29,x30,[sp],#32 + AARCH64_VALIDATE_LINK_REGISTER + ret + + +// void ecp_nistz256_neg(BN_ULONG x0[4],const BN_ULONG x1[4]); +.globl _ecp_nistz256_neg +.private_extern _ecp_nistz256_neg + +.align 4 +_ecp_nistz256_neg: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 + + mov x2,x1 + mov x14,xzr // a = 0 + mov x15,xzr + mov x16,xzr + mov x17,xzr + adrp x13,Lpoly@PAGE + add x13,x13,Lpoly@PAGEOFF + ldr x12,[x13,#8] + ldr x13,[x13,#24] + + bl __ecp_nistz256_sub_from + + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret + + +// note that __ecp_nistz256_mul_mont expects a[0-3] input pre-loaded +// to x4-x7 and b[0] - to x3 + +.align 4 +__ecp_nistz256_mul_mont: + mul x14,x4,x3 // a[0]*b[0] + umulh x8,x4,x3 + + mul x15,x5,x3 // a[1]*b[0] + umulh x9,x5,x3 + + mul x16,x6,x3 // a[2]*b[0] + umulh x10,x6,x3 + + mul x17,x7,x3 // a[3]*b[0] + umulh x11,x7,x3 + ldr x3,[x2,#8] // b[1] + + adds x15,x15,x8 // accumulate high parts of multiplication + lsl x8,x14,#32 + adcs x16,x16,x9 + lsr x9,x14,#32 + adcs x17,x17,x10 + adc x19,xzr,x11 + mov x20,xzr + subs x10,x14,x8 // "*0xffff0001" + sbc x11,x14,x9 + adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] + mul x8,x4,x3 // lo(a[0]*b[i]) + adcs x15,x16,x9 + mul x9,x5,x3 // lo(a[1]*b[i]) + adcs x16,x17,x10 // +=acc[0]*0xffff0001 + mul x10,x6,x3 // lo(a[2]*b[i]) + adcs x17,x19,x11 + mul x11,x7,x3 // lo(a[3]*b[i]) + adc x19,x20,xzr + + adds x14,x14,x8 // accumulate low parts of multiplication + umulh x8,x4,x3 // hi(a[0]*b[i]) + adcs x15,x15,x9 + umulh x9,x5,x3 // hi(a[1]*b[i]) + adcs x16,x16,x10 + umulh x10,x6,x3 // hi(a[2]*b[i]) + adcs x17,x17,x11 + umulh x11,x7,x3 // hi(a[3]*b[i]) + adc x19,x19,xzr + ldr x3,[x2,#8*(1+1)] // b[1+1] + adds x15,x15,x8 // accumulate high parts of multiplication + lsl x8,x14,#32 + adcs x16,x16,x9 + lsr x9,x14,#32 + adcs x17,x17,x10 + adcs x19,x19,x11 + adc x20,xzr,xzr + subs x10,x14,x8 // "*0xffff0001" + sbc x11,x14,x9 + adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] + mul x8,x4,x3 // lo(a[0]*b[i]) + adcs x15,x16,x9 + mul x9,x5,x3 // lo(a[1]*b[i]) + adcs x16,x17,x10 // +=acc[0]*0xffff0001 + mul x10,x6,x3 // lo(a[2]*b[i]) + adcs x17,x19,x11 + mul x11,x7,x3 // lo(a[3]*b[i]) + adc x19,x20,xzr + + adds x14,x14,x8 // accumulate low parts of multiplication + umulh x8,x4,x3 // hi(a[0]*b[i]) + adcs x15,x15,x9 + umulh x9,x5,x3 // hi(a[1]*b[i]) + adcs x16,x16,x10 + umulh x10,x6,x3 // hi(a[2]*b[i]) + adcs x17,x17,x11 + umulh x11,x7,x3 // hi(a[3]*b[i]) + adc x19,x19,xzr + ldr x3,[x2,#8*(2+1)] // b[2+1] + adds x15,x15,x8 // accumulate high parts of multiplication + lsl x8,x14,#32 + adcs x16,x16,x9 + lsr x9,x14,#32 + adcs x17,x17,x10 + adcs x19,x19,x11 + adc x20,xzr,xzr + subs x10,x14,x8 // "*0xffff0001" + sbc x11,x14,x9 + adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] + mul x8,x4,x3 // lo(a[0]*b[i]) + adcs x15,x16,x9 + mul x9,x5,x3 // lo(a[1]*b[i]) + adcs x16,x17,x10 // +=acc[0]*0xffff0001 + mul x10,x6,x3 // lo(a[2]*b[i]) + adcs x17,x19,x11 + mul x11,x7,x3 // lo(a[3]*b[i]) + adc x19,x20,xzr + + adds x14,x14,x8 // accumulate low parts of multiplication + umulh x8,x4,x3 // hi(a[0]*b[i]) + adcs x15,x15,x9 + umulh x9,x5,x3 // hi(a[1]*b[i]) + adcs x16,x16,x10 + umulh x10,x6,x3 // hi(a[2]*b[i]) + adcs x17,x17,x11 + umulh x11,x7,x3 // hi(a[3]*b[i]) + adc x19,x19,xzr + adds x15,x15,x8 // accumulate high parts of multiplication + lsl x8,x14,#32 + adcs x16,x16,x9 + lsr x9,x14,#32 + adcs x17,x17,x10 + adcs x19,x19,x11 + adc x20,xzr,xzr + // last reduction + subs x10,x14,x8 // "*0xffff0001" + sbc x11,x14,x9 + adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] + adcs x15,x16,x9 + adcs x16,x17,x10 // +=acc[0]*0xffff0001 + adcs x17,x19,x11 + adc x19,x20,xzr + + adds x8,x14,#1 // subs x8,x14,#-1 // tmp = ret-modulus + sbcs x9,x15,x12 + sbcs x10,x16,xzr + sbcs x11,x17,x13 + sbcs xzr,x19,xzr // did it 
borrow? + + csel x14,x14,x8,lo // ret = borrow ? ret : ret-modulus + csel x15,x15,x9,lo + csel x16,x16,x10,lo + stp x14,x15,[x0] + csel x17,x17,x11,lo + stp x16,x17,[x0,#16] + + ret + + +// note that __ecp_nistz256_sqr_mont expects a[0-3] input pre-loaded +// to x4-x7 + +.align 4 +__ecp_nistz256_sqr_mont: + // | | | | | |a1*a0| | + // | | | | |a2*a0| | | + // | |a3*a2|a3*a0| | | | + // | | | |a2*a1| | | | + // | | |a3*a1| | | | | + // *| | | | | | | | 2| + // +|a3*a3|a2*a2|a1*a1|a0*a0| + // |--+--+--+--+--+--+--+--| + // |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is , i.e. follow + // + // "can't overflow" below mark carrying into high part of + // multiplication result, which can't overflow, because it + // can never be all ones. + + mul x15,x5,x4 // a[1]*a[0] + umulh x9,x5,x4 + mul x16,x6,x4 // a[2]*a[0] + umulh x10,x6,x4 + mul x17,x7,x4 // a[3]*a[0] + umulh x19,x7,x4 + + adds x16,x16,x9 // accumulate high parts of multiplication + mul x8,x6,x5 // a[2]*a[1] + umulh x9,x6,x5 + adcs x17,x17,x10 + mul x10,x7,x5 // a[3]*a[1] + umulh x11,x7,x5 + adc x19,x19,xzr // can't overflow + + mul x20,x7,x6 // a[3]*a[2] + umulh x1,x7,x6 + + adds x9,x9,x10 // accumulate high parts of multiplication + mul x14,x4,x4 // a[0]*a[0] + adc x10,x11,xzr // can't overflow + + adds x17,x17,x8 // accumulate low parts of multiplication + umulh x4,x4,x4 + adcs x19,x19,x9 + mul x9,x5,x5 // a[1]*a[1] + adcs x20,x20,x10 + umulh x5,x5,x5 + adc x1,x1,xzr // can't overflow + + adds x15,x15,x15 // acc[1-6]*=2 + mul x10,x6,x6 // a[2]*a[2] + adcs x16,x16,x16 + umulh x6,x6,x6 + adcs x17,x17,x17 + mul x11,x7,x7 // a[3]*a[3] + adcs x19,x19,x19 + umulh x7,x7,x7 + adcs x20,x20,x20 + adcs x1,x1,x1 + adc x2,xzr,xzr + + adds x15,x15,x4 // +a[i]*a[i] + adcs x16,x16,x9 + adcs x17,x17,x5 + adcs x19,x19,x10 + adcs x20,x20,x6 + lsl x8,x14,#32 + adcs x1,x1,x11 + lsr x9,x14,#32 + adc x2,x2,x7 + subs x10,x14,x8 // "*0xffff0001" + sbc x11,x14,x9 + adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] + adcs x15,x16,x9 + lsl x8,x14,#32 + adcs x16,x17,x10 // +=acc[0]*0xffff0001 + lsr x9,x14,#32 + adc x17,x11,xzr // can't overflow + subs x10,x14,x8 // "*0xffff0001" + sbc x11,x14,x9 + adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] + adcs x15,x16,x9 + lsl x8,x14,#32 + adcs x16,x17,x10 // +=acc[0]*0xffff0001 + lsr x9,x14,#32 + adc x17,x11,xzr // can't overflow + subs x10,x14,x8 // "*0xffff0001" + sbc x11,x14,x9 + adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] + adcs x15,x16,x9 + lsl x8,x14,#32 + adcs x16,x17,x10 // +=acc[0]*0xffff0001 + lsr x9,x14,#32 + adc x17,x11,xzr // can't overflow + subs x10,x14,x8 // "*0xffff0001" + sbc x11,x14,x9 + adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] + adcs x15,x16,x9 + adcs x16,x17,x10 // +=acc[0]*0xffff0001 + adc x17,x11,xzr // can't overflow + + adds x14,x14,x19 // accumulate upper half + adcs x15,x15,x20 + adcs x16,x16,x1 + adcs x17,x17,x2 + adc x19,xzr,xzr + + adds x8,x14,#1 // subs x8,x14,#-1 // tmp = ret-modulus + sbcs x9,x15,x12 + sbcs x10,x16,xzr + sbcs x11,x17,x13 + sbcs xzr,x19,xzr // did it borrow? + + csel x14,x14,x8,lo // ret = borrow ? ret : ret-modulus + csel x15,x15,x9,lo + csel x16,x16,x10,lo + stp x14,x15,[x0] + csel x17,x17,x11,lo + stp x16,x17,[x0,#16] + + ret + + +// Note that __ecp_nistz256_add_to expects both input vectors pre-loaded to +// x4-x7 and x8-x11. This is done because it's used in multiple +// contexts, e.g. in multiplication by 2 and 3... 
+ +.align 4 +__ecp_nistz256_add_to: + adds x14,x14,x8 // ret = a+b + adcs x15,x15,x9 + adcs x16,x16,x10 + adcs x17,x17,x11 + adc x1,xzr,xzr // zap x1 + + adds x8,x14,#1 // subs x8,x4,#-1 // tmp = ret-modulus + sbcs x9,x15,x12 + sbcs x10,x16,xzr + sbcs x11,x17,x13 + sbcs xzr,x1,xzr // did subtraction borrow? + + csel x14,x14,x8,lo // ret = borrow ? ret : ret-modulus + csel x15,x15,x9,lo + csel x16,x16,x10,lo + stp x14,x15,[x0] + csel x17,x17,x11,lo + stp x16,x17,[x0,#16] + + ret + + + +.align 4 +__ecp_nistz256_sub_from: + ldp x8,x9,[x2] + ldp x10,x11,[x2,#16] + subs x14,x14,x8 // ret = a-b + sbcs x15,x15,x9 + sbcs x16,x16,x10 + sbcs x17,x17,x11 + sbc x1,xzr,xzr // zap x1 + + subs x8,x14,#1 // adds x8,x4,#-1 // tmp = ret+modulus + adcs x9,x15,x12 + adcs x10,x16,xzr + adc x11,x17,x13 + cmp x1,xzr // did subtraction borrow? + + csel x14,x14,x8,eq // ret = borrow ? ret+modulus : ret + csel x15,x15,x9,eq + csel x16,x16,x10,eq + stp x14,x15,[x0] + csel x17,x17,x11,eq + stp x16,x17,[x0,#16] + + ret + + + +.align 4 +__ecp_nistz256_sub_morf: + ldp x8,x9,[x2] + ldp x10,x11,[x2,#16] + subs x14,x8,x14 // ret = b-a + sbcs x15,x9,x15 + sbcs x16,x10,x16 + sbcs x17,x11,x17 + sbc x1,xzr,xzr // zap x1 + + subs x8,x14,#1 // adds x8,x4,#-1 // tmp = ret+modulus + adcs x9,x15,x12 + adcs x10,x16,xzr + adc x11,x17,x13 + cmp x1,xzr // did subtraction borrow? + + csel x14,x14,x8,eq // ret = borrow ? ret+modulus : ret + csel x15,x15,x9,eq + csel x16,x16,x10,eq + stp x14,x15,[x0] + csel x17,x17,x11,eq + stp x16,x17,[x0,#16] + + ret + + + +.align 4 +__ecp_nistz256_div_by_2: + subs x8,x14,#1 // adds x8,x4,#-1 // tmp = a+modulus + adcs x9,x15,x12 + adcs x10,x16,xzr + adcs x11,x17,x13 + adc x1,xzr,xzr // zap x1 + tst x14,#1 // is a even? + + csel x14,x14,x8,eq // ret = even ? a : a+modulus + csel x15,x15,x9,eq + csel x16,x16,x10,eq + csel x17,x17,x11,eq + csel x1,xzr,x1,eq + + lsr x14,x14,#1 // ret >>= 1 + orr x14,x14,x15,lsl#63 + lsr x15,x15,#1 + orr x15,x15,x16,lsl#63 + lsr x16,x16,#1 + orr x16,x16,x17,lsl#63 + lsr x17,x17,#1 + stp x14,x15,[x0] + orr x17,x17,x1,lsl#63 + stp x16,x17,[x0,#16] + + ret + +.globl _ecp_nistz256_point_double +.private_extern _ecp_nistz256_point_double + +.align 5 +_ecp_nistz256_point_double: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-96]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + sub sp,sp,#32*4 + +Ldouble_shortcut: + ldp x14,x15,[x1,#32] + mov x21,x0 + ldp x16,x17,[x1,#48] + mov x22,x1 + adrp x13,Lpoly@PAGE + add x13,x13,Lpoly@PAGEOFF + ldr x12,[x13,#8] + mov x8,x14 + ldr x13,[x13,#24] + mov x9,x15 + ldp x4,x5,[x22,#64] // forward load for p256_sqr_mont + mov x10,x16 + mov x11,x17 + ldp x6,x7,[x22,#64+16] + add x0,sp,#0 + bl __ecp_nistz256_add_to // p256_mul_by_2(S, in_y); + + add x0,sp,#64 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Zsqr, in_z); + + ldp x8,x9,[x22] + ldp x10,x11,[x22,#16] + mov x4,x14 // put Zsqr aside for p256_sub + mov x5,x15 + mov x6,x16 + mov x7,x17 + add x0,sp,#32 + bl __ecp_nistz256_add_to // p256_add(M, Zsqr, in_x); + + add x2,x22,#0 + mov x14,x4 // restore Zsqr + mov x15,x5 + ldp x4,x5,[sp,#0] // forward load for p256_sqr_mont + mov x16,x6 + mov x17,x7 + ldp x6,x7,[sp,#0+16] + add x0,sp,#64 + bl __ecp_nistz256_sub_morf // p256_sub(Zsqr, in_x, Zsqr); + + add x0,sp,#0 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(S, S); + + ldr x3,[x22,#32] + ldp x4,x5,[x22,#64] + ldp x6,x7,[x22,#64+16] + add x2,x22,#32 + add x0,sp,#96 + bl __ecp_nistz256_mul_mont // p256_mul_mont(tmp0, in_z, in_y); + + mov x8,x14 + mov x9,x15 + ldp x4,x5,[sp,#0] // forward load for p256_sqr_mont + mov x10,x16 + mov x11,x17 + ldp x6,x7,[sp,#0+16] + add x0,x21,#64 + bl __ecp_nistz256_add_to // p256_mul_by_2(res_z, tmp0); + + add x0,sp,#96 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(tmp0, S); + + ldr x3,[sp,#64] // forward load for p256_mul_mont + ldp x4,x5,[sp,#32] + ldp x6,x7,[sp,#32+16] + add x0,x21,#32 + bl __ecp_nistz256_div_by_2 // p256_div_by_2(res_y, tmp0); + + add x2,sp,#64 + add x0,sp,#32 + bl __ecp_nistz256_mul_mont // p256_mul_mont(M, M, Zsqr); + + mov x8,x14 // duplicate M + mov x9,x15 + mov x10,x16 + mov x11,x17 + mov x4,x14 // put M aside + mov x5,x15 + mov x6,x16 + mov x7,x17 + add x0,sp,#32 + bl __ecp_nistz256_add_to + mov x8,x4 // restore M + mov x9,x5 + ldr x3,[x22] // forward load for p256_mul_mont + mov x10,x6 + ldp x4,x5,[sp,#0] + mov x11,x7 + ldp x6,x7,[sp,#0+16] + bl __ecp_nistz256_add_to // p256_mul_by_3(M, M); + + add x2,x22,#0 + add x0,sp,#0 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S, S, in_x); + + mov x8,x14 + mov x9,x15 + ldp x4,x5,[sp,#32] // forward load for p256_sqr_mont + mov x10,x16 + mov x11,x17 + ldp x6,x7,[sp,#32+16] + add x0,sp,#96 + bl __ecp_nistz256_add_to // p256_mul_by_2(tmp0, S); + + add x0,x21,#0 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(res_x, M); + + add x2,sp,#96 + bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, tmp0); + + add x2,sp,#0 + add x0,sp,#0 + bl __ecp_nistz256_sub_morf // p256_sub(S, S, res_x); + + ldr x3,[sp,#32] + mov x4,x14 // copy S + mov x5,x15 + mov x6,x16 + mov x7,x17 + add x2,sp,#32 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S, S, M); + + add x2,x21,#32 + add x0,x21,#32 + bl __ecp_nistz256_sub_from // p256_sub(res_y, S, res_y); + + add sp,x29,#0 // destroy frame + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x29,x30,[sp],#96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +.globl _ecp_nistz256_point_add +.private_extern _ecp_nistz256_point_add + +.align 5 +_ecp_nistz256_point_add: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-96]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#32*12 + + ldp x4,x5,[x2,#64] // in2_z + ldp x6,x7,[x2,#64+16] + mov x21,x0 + mov x22,x1 + mov x23,x2 + adrp x13,Lpoly@PAGE + add x13,x13,Lpoly@PAGEOFF + ldr x12,[x13,#8] + ldr x13,[x13,#24] + orr x8,x4,x5 + orr x10,x6,x7 + orr x25,x8,x10 + cmp x25,#0 + csetm x25,ne // ~in2infty + add x0,sp,#192 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z2sqr, in2_z); + + ldp x4,x5,[x22,#64] // in1_z + ldp x6,x7,[x22,#64+16] + orr x8,x4,x5 + orr x10,x6,x7 + orr x24,x8,x10 + cmp x24,#0 + csetm x24,ne // ~in1infty + add x0,sp,#128 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z1sqr, in1_z); + + ldr x3,[x23,#64] + ldp x4,x5,[sp,#192] + ldp x6,x7,[sp,#192+16] + add x2,x23,#64 + add x0,sp,#320 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S1, Z2sqr, in2_z); + + ldr x3,[x22,#64] + ldp x4,x5,[sp,#128] + ldp x6,x7,[sp,#128+16] + add x2,x22,#64 + add x0,sp,#352 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, Z1sqr, in1_z); + + ldr x3,[x22,#32] + ldp x4,x5,[sp,#320] + ldp x6,x7,[sp,#320+16] + add x2,x22,#32 + add x0,sp,#320 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S1, S1, in1_y); + + ldr x3,[x23,#32] + ldp x4,x5,[sp,#352] + ldp x6,x7,[sp,#352+16] + add x2,x23,#32 + add x0,sp,#352 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S2, in2_y); + + add x2,sp,#320 + ldr x3,[sp,#192] // forward load for p256_mul_mont + ldp x4,x5,[x22] + ldp x6,x7,[x22,#16] + add x0,sp,#160 + bl __ecp_nistz256_sub_from // p256_sub(R, S2, S1); + + orr x14,x14,x15 // see if result is zero + orr x16,x16,x17 + orr x26,x14,x16 // ~is_equal(S1,S2) + + add x2,sp,#192 + add x0,sp,#256 + bl __ecp_nistz256_mul_mont // p256_mul_mont(U1, in1_x, Z2sqr); + + ldr x3,[sp,#128] + ldp x4,x5,[x23] + ldp x6,x7,[x23,#16] + add x2,sp,#128 + add x0,sp,#288 + bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, in2_x, Z1sqr); + + add x2,sp,#256 + ldp x4,x5,[sp,#160] // forward load for p256_sqr_mont + ldp x6,x7,[sp,#160+16] + add x0,sp,#96 + bl __ecp_nistz256_sub_from // p256_sub(H, U2, U1); + + orr x14,x14,x15 // see if result is zero + orr x16,x16,x17 + orr x14,x14,x16 // ~is_equal(U1,U2) + + mvn x27,x24 // -1/0 -> 0/-1 + mvn x28,x25 // -1/0 -> 0/-1 + orr x14,x14,x27 + orr x14,x14,x28 + orr x14,x14,x26 + cbnz x14,Ladd_proceed // if(~is_equal(U1,U2) | in1infty | in2infty | ~is_equal(S1,S2)) + +Ladd_double: + mov x1,x22 + mov x0,x21 + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + add sp,sp,#256 // #256 is from #32*(12-4). 
difference in stack frames + b Ldouble_shortcut + +.align 4 +Ladd_proceed: + add x0,sp,#192 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Rsqr, R); + + ldr x3,[x22,#64] + ldp x4,x5,[sp,#96] + ldp x6,x7,[sp,#96+16] + add x2,x22,#64 + add x0,sp,#64 + bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, H, in1_z); + + ldp x4,x5,[sp,#96] + ldp x6,x7,[sp,#96+16] + add x0,sp,#128 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Hsqr, H); + + ldr x3,[x23,#64] + ldp x4,x5,[sp,#64] + ldp x6,x7,[sp,#64+16] + add x2,x23,#64 + add x0,sp,#64 + bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, res_z, in2_z); + + ldr x3,[sp,#96] + ldp x4,x5,[sp,#128] + ldp x6,x7,[sp,#128+16] + add x2,sp,#96 + add x0,sp,#224 + bl __ecp_nistz256_mul_mont // p256_mul_mont(Hcub, Hsqr, H); + + ldr x3,[sp,#128] + ldp x4,x5,[sp,#256] + ldp x6,x7,[sp,#256+16] + add x2,sp,#128 + add x0,sp,#288 + bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, U1, Hsqr); + + mov x8,x14 + mov x9,x15 + mov x10,x16 + mov x11,x17 + add x0,sp,#128 + bl __ecp_nistz256_add_to // p256_mul_by_2(Hsqr, U2); + + add x2,sp,#192 + add x0,sp,#0 + bl __ecp_nistz256_sub_morf // p256_sub(res_x, Rsqr, Hsqr); + + add x2,sp,#224 + bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, Hcub); + + add x2,sp,#288 + ldr x3,[sp,#224] // forward load for p256_mul_mont + ldp x4,x5,[sp,#320] + ldp x6,x7,[sp,#320+16] + add x0,sp,#32 + bl __ecp_nistz256_sub_morf // p256_sub(res_y, U2, res_x); + + add x2,sp,#224 + add x0,sp,#352 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S1, Hcub); + + ldr x3,[sp,#160] + ldp x4,x5,[sp,#32] + ldp x6,x7,[sp,#32+16] + add x2,sp,#160 + add x0,sp,#32 + bl __ecp_nistz256_mul_mont // p256_mul_mont(res_y, res_y, R); + + add x2,sp,#352 + bl __ecp_nistz256_sub_from // p256_sub(res_y, res_y, S2); + + ldp x4,x5,[sp,#0] // res + ldp x6,x7,[sp,#0+16] + ldp x8,x9,[x23] // in2 + ldp x10,x11,[x23,#16] + ldp x14,x15,[x22,#0] // in1 + cmp x24,#0 // ~, remember? + ldp x16,x17,[x22,#0+16] + csel x8,x4,x8,ne + csel x9,x5,x9,ne + ldp x4,x5,[sp,#0+0+32] // res + csel x10,x6,x10,ne + csel x11,x7,x11,ne + cmp x25,#0 // ~, remember? + ldp x6,x7,[sp,#0+0+48] + csel x14,x8,x14,ne + csel x15,x9,x15,ne + ldp x8,x9,[x23,#0+32] // in2 + csel x16,x10,x16,ne + csel x17,x11,x17,ne + ldp x10,x11,[x23,#0+48] + stp x14,x15,[x21,#0] + stp x16,x17,[x21,#0+16] + ldp x14,x15,[x22,#32] // in1 + cmp x24,#0 // ~, remember? + ldp x16,x17,[x22,#32+16] + csel x8,x4,x8,ne + csel x9,x5,x9,ne + ldp x4,x5,[sp,#0+32+32] // res + csel x10,x6,x10,ne + csel x11,x7,x11,ne + cmp x25,#0 // ~, remember? + ldp x6,x7,[sp,#0+32+48] + csel x14,x8,x14,ne + csel x15,x9,x15,ne + ldp x8,x9,[x23,#32+32] // in2 + csel x16,x10,x16,ne + csel x17,x11,x17,ne + ldp x10,x11,[x23,#32+48] + stp x14,x15,[x21,#32] + stp x16,x17,[x21,#32+16] + ldp x14,x15,[x22,#64] // in1 + cmp x24,#0 // ~, remember? + ldp x16,x17,[x22,#64+16] + csel x8,x4,x8,ne + csel x9,x5,x9,ne + csel x10,x6,x10,ne + csel x11,x7,x11,ne + cmp x25,#0 // ~, remember? + csel x14,x8,x14,ne + csel x15,x9,x15,ne + csel x16,x10,x16,ne + csel x17,x11,x17,ne + stp x14,x15,[x21,#64] + stp x16,x17,[x21,#64+16] + +Ladd_done: + add sp,x29,#0 // destroy frame + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +.globl _ecp_nistz256_point_add_affine +.private_extern _ecp_nistz256_point_add_affine + +.align 5 +_ecp_nistz256_point_add_affine: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-80]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + sub sp,sp,#32*10 + + mov x21,x0 + mov x22,x1 + mov x23,x2 + adrp x13,Lpoly@PAGE + add x13,x13,Lpoly@PAGEOFF + ldr x12,[x13,#8] + ldr x13,[x13,#24] + + ldp x4,x5,[x1,#64] // in1_z + ldp x6,x7,[x1,#64+16] + orr x8,x4,x5 + orr x10,x6,x7 + orr x24,x8,x10 + cmp x24,#0 + csetm x24,ne // ~in1infty + + ldp x14,x15,[x2] // in2_x + ldp x16,x17,[x2,#16] + ldp x8,x9,[x2,#32] // in2_y + ldp x10,x11,[x2,#48] + orr x14,x14,x15 + orr x16,x16,x17 + orr x8,x8,x9 + orr x10,x10,x11 + orr x14,x14,x16 + orr x8,x8,x10 + orr x25,x14,x8 + cmp x25,#0 + csetm x25,ne // ~in2infty + + add x0,sp,#128 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z1sqr, in1_z); + + mov x4,x14 + mov x5,x15 + mov x6,x16 + mov x7,x17 + ldr x3,[x23] + add x2,x23,#0 + add x0,sp,#96 + bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, Z1sqr, in2_x); + + add x2,x22,#0 + ldr x3,[x22,#64] // forward load for p256_mul_mont + ldp x4,x5,[sp,#128] + ldp x6,x7,[sp,#128+16] + add x0,sp,#160 + bl __ecp_nistz256_sub_from // p256_sub(H, U2, in1_x); + + add x2,x22,#64 + add x0,sp,#128 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, Z1sqr, in1_z); + + ldr x3,[x22,#64] + ldp x4,x5,[sp,#160] + ldp x6,x7,[sp,#160+16] + add x2,x22,#64 + add x0,sp,#64 + bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, H, in1_z); + + ldr x3,[x23,#32] + ldp x4,x5,[sp,#128] + ldp x6,x7,[sp,#128+16] + add x2,x23,#32 + add x0,sp,#128 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S2, in2_y); + + add x2,x22,#32 + ldp x4,x5,[sp,#160] // forward load for p256_sqr_mont + ldp x6,x7,[sp,#160+16] + add x0,sp,#192 + bl __ecp_nistz256_sub_from // p256_sub(R, S2, in1_y); + + add x0,sp,#224 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Hsqr, H); + + ldp x4,x5,[sp,#192] + ldp x6,x7,[sp,#192+16] + add x0,sp,#288 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Rsqr, R); + + ldr x3,[sp,#160] + ldp x4,x5,[sp,#224] + ldp x6,x7,[sp,#224+16] + add x2,sp,#160 + add x0,sp,#256 + bl __ecp_nistz256_mul_mont // p256_mul_mont(Hcub, Hsqr, H); + + ldr x3,[x22] + ldp x4,x5,[sp,#224] + ldp x6,x7,[sp,#224+16] + add x2,x22,#0 + add x0,sp,#96 + bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, in1_x, Hsqr); + + mov x8,x14 + mov x9,x15 + mov x10,x16 + mov x11,x17 + add x0,sp,#224 + bl __ecp_nistz256_add_to // p256_mul_by_2(Hsqr, U2); + + add x2,sp,#288 + add x0,sp,#0 + bl __ecp_nistz256_sub_morf // p256_sub(res_x, Rsqr, Hsqr); + + add x2,sp,#256 + bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, Hcub); + + add x2,sp,#96 + ldr x3,[x22,#32] // forward load for p256_mul_mont + ldp x4,x5,[sp,#256] + ldp x6,x7,[sp,#256+16] + add x0,sp,#32 + bl __ecp_nistz256_sub_morf // p256_sub(res_y, U2, res_x); + + add x2,x22,#32 + add x0,sp,#128 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, in1_y, Hcub); + + ldr x3,[sp,#192] + ldp x4,x5,[sp,#32] + ldp x6,x7,[sp,#32+16] + add x2,sp,#192 + add x0,sp,#32 + bl __ecp_nistz256_mul_mont // p256_mul_mont(res_y, res_y, R); + + add x2,sp,#128 + bl __ecp_nistz256_sub_from // p256_sub(res_y, res_y, S2); + + ldp x4,x5,[sp,#0] // res + ldp x6,x7,[sp,#0+16] + ldp x8,x9,[x23] // in2 + ldp x10,x11,[x23,#16] + ldp x14,x15,[x22,#0] // in1 + cmp x24,#0 // ~, remember? + ldp x16,x17,[x22,#0+16] + csel x8,x4,x8,ne + csel x9,x5,x9,ne + ldp x4,x5,[sp,#0+0+32] // res + csel x10,x6,x10,ne + csel x11,x7,x11,ne + cmp x25,#0 // ~, remember? 
+ ldp x6,x7,[sp,#0+0+48] + csel x14,x8,x14,ne + csel x15,x9,x15,ne + ldp x8,x9,[x23,#0+32] // in2 + csel x16,x10,x16,ne + csel x17,x11,x17,ne + ldp x10,x11,[x23,#0+48] + stp x14,x15,[x21,#0] + stp x16,x17,[x21,#0+16] + adrp x23,Lone_mont@PAGE-64 + add x23,x23,Lone_mont@PAGEOFF-64 + ldp x14,x15,[x22,#32] // in1 + cmp x24,#0 // ~, remember? + ldp x16,x17,[x22,#32+16] + csel x8,x4,x8,ne + csel x9,x5,x9,ne + ldp x4,x5,[sp,#0+32+32] // res + csel x10,x6,x10,ne + csel x11,x7,x11,ne + cmp x25,#0 // ~, remember? + ldp x6,x7,[sp,#0+32+48] + csel x14,x8,x14,ne + csel x15,x9,x15,ne + ldp x8,x9,[x23,#32+32] // in2 + csel x16,x10,x16,ne + csel x17,x11,x17,ne + ldp x10,x11,[x23,#32+48] + stp x14,x15,[x21,#32] + stp x16,x17,[x21,#32+16] + ldp x14,x15,[x22,#64] // in1 + cmp x24,#0 // ~, remember? + ldp x16,x17,[x22,#64+16] + csel x8,x4,x8,ne + csel x9,x5,x9,ne + csel x10,x6,x10,ne + csel x11,x7,x11,ne + cmp x25,#0 // ~, remember? + csel x14,x8,x14,ne + csel x15,x9,x15,ne + csel x16,x10,x16,ne + csel x17,x11,x17,ne + stp x14,x15,[x21,#64] + stp x16,x17,[x21,#64+16] + + add sp,x29,#0 // destroy frame + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x29,x30,[sp],#80 + AARCH64_VALIDATE_LINK_REGISTER + ret + +//////////////////////////////////////////////////////////////////////// +// void ecp_nistz256_ord_mul_mont(uint64_t res[4], uint64_t a[4], +// uint64_t b[4]); +.globl _ecp_nistz256_ord_mul_mont +.private_extern _ecp_nistz256_ord_mul_mont + +.align 4 +_ecp_nistz256_ord_mul_mont: + AARCH64_VALID_CALL_TARGET + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + stp x29,x30,[sp,#-64]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + adrp x23,Lord@PAGE + add x23,x23,Lord@PAGEOFF + ldr x3,[x2] // bp[0] + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + + ldp x12,x13,[x23,#0] + ldp x21,x22,[x23,#16] + ldr x23,[x23,#32] + + mul x14,x4,x3 // a[0]*b[0] + umulh x8,x4,x3 + + mul x15,x5,x3 // a[1]*b[0] + umulh x9,x5,x3 + + mul x16,x6,x3 // a[2]*b[0] + umulh x10,x6,x3 + + mul x17,x7,x3 // a[3]*b[0] + umulh x19,x7,x3 + + mul x24,x14,x23 + + adds x15,x15,x8 // accumulate high parts of multiplication + adcs x16,x16,x9 + adcs x17,x17,x10 + adc x19,x19,xzr + mov x20,xzr + ldr x3,[x2,#8*1] // b[i] + + lsl x8,x24,#32 + subs x16,x16,x24 + lsr x9,x24,#32 + sbcs x17,x17,x8 + sbcs x19,x19,x9 + sbc x20,x20,xzr + + subs xzr,x14,#1 + umulh x9,x12,x24 + mul x10,x13,x24 + umulh x11,x13,x24 + + adcs x10,x10,x9 + mul x8,x4,x3 + adc x11,x11,xzr + mul x9,x5,x3 + + adds x14,x15,x10 + mul x10,x6,x3 + adcs x15,x16,x11 + mul x11,x7,x3 + adcs x16,x17,x24 + adcs x17,x19,x24 + adc x19,x20,xzr + + adds x14,x14,x8 // accumulate low parts + umulh x8,x4,x3 + adcs x15,x15,x9 + umulh x9,x5,x3 + adcs x16,x16,x10 + umulh x10,x6,x3 + adcs x17,x17,x11 + umulh x11,x7,x3 + adc x19,x19,xzr + mul x24,x14,x23 + adds x15,x15,x8 // accumulate high parts + adcs x16,x16,x9 + adcs x17,x17,x10 + adcs x19,x19,x11 + adc x20,xzr,xzr + ldr x3,[x2,#8*2] // b[i] + + lsl x8,x24,#32 + subs x16,x16,x24 + lsr x9,x24,#32 + sbcs x17,x17,x8 + sbcs x19,x19,x9 + sbc x20,x20,xzr + + subs xzr,x14,#1 + umulh x9,x12,x24 + mul x10,x13,x24 + umulh x11,x13,x24 + + adcs x10,x10,x9 + mul x8,x4,x3 + adc x11,x11,xzr + mul x9,x5,x3 + + adds x14,x15,x10 + mul x10,x6,x3 + adcs x15,x16,x11 + mul x11,x7,x3 + adcs x16,x17,x24 + adcs x17,x19,x24 + adc x19,x20,xzr + + adds x14,x14,x8 // accumulate low parts + umulh x8,x4,x3 + adcs x15,x15,x9 + umulh x9,x5,x3 + adcs x16,x16,x10 + umulh 
x10,x6,x3 + adcs x17,x17,x11 + umulh x11,x7,x3 + adc x19,x19,xzr + mul x24,x14,x23 + adds x15,x15,x8 // accumulate high parts + adcs x16,x16,x9 + adcs x17,x17,x10 + adcs x19,x19,x11 + adc x20,xzr,xzr + ldr x3,[x2,#8*3] // b[i] + + lsl x8,x24,#32 + subs x16,x16,x24 + lsr x9,x24,#32 + sbcs x17,x17,x8 + sbcs x19,x19,x9 + sbc x20,x20,xzr + + subs xzr,x14,#1 + umulh x9,x12,x24 + mul x10,x13,x24 + umulh x11,x13,x24 + + adcs x10,x10,x9 + mul x8,x4,x3 + adc x11,x11,xzr + mul x9,x5,x3 + + adds x14,x15,x10 + mul x10,x6,x3 + adcs x15,x16,x11 + mul x11,x7,x3 + adcs x16,x17,x24 + adcs x17,x19,x24 + adc x19,x20,xzr + + adds x14,x14,x8 // accumulate low parts + umulh x8,x4,x3 + adcs x15,x15,x9 + umulh x9,x5,x3 + adcs x16,x16,x10 + umulh x10,x6,x3 + adcs x17,x17,x11 + umulh x11,x7,x3 + adc x19,x19,xzr + mul x24,x14,x23 + adds x15,x15,x8 // accumulate high parts + adcs x16,x16,x9 + adcs x17,x17,x10 + adcs x19,x19,x11 + adc x20,xzr,xzr + lsl x8,x24,#32 // last reduction + subs x16,x16,x24 + lsr x9,x24,#32 + sbcs x17,x17,x8 + sbcs x19,x19,x9 + sbc x20,x20,xzr + + subs xzr,x14,#1 + umulh x9,x12,x24 + mul x10,x13,x24 + umulh x11,x13,x24 + + adcs x10,x10,x9 + adc x11,x11,xzr + + adds x14,x15,x10 + adcs x15,x16,x11 + adcs x16,x17,x24 + adcs x17,x19,x24 + adc x19,x20,xzr + + subs x8,x14,x12 // ret -= modulus + sbcs x9,x15,x13 + sbcs x10,x16,x21 + sbcs x11,x17,x22 + sbcs xzr,x19,xzr + + csel x14,x14,x8,lo // ret = borrow ? ret : ret-modulus + csel x15,x15,x9,lo + csel x16,x16,x10,lo + stp x14,x15,[x0] + csel x17,x17,x11,lo + stp x16,x17,[x0,#16] + + ldp x19,x20,[sp,#16] + ldp x21,x22,[sp,#32] + ldp x23,x24,[sp,#48] + ldr x29,[sp],#64 + ret + + +//////////////////////////////////////////////////////////////////////// +// void ecp_nistz256_ord_sqr_mont(uint64_t res[4], uint64_t a[4], +// uint64_t rep); +.globl _ecp_nistz256_ord_sqr_mont +.private_extern _ecp_nistz256_ord_sqr_mont + +.align 4 +_ecp_nistz256_ord_sqr_mont: + AARCH64_VALID_CALL_TARGET + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + stp x29,x30,[sp,#-64]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + adrp x23,Lord@PAGE + add x23,x23,Lord@PAGEOFF + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + + ldp x12,x13,[x23,#0] + ldp x21,x22,[x23,#16] + ldr x23,[x23,#32] + b Loop_ord_sqr + +.align 4 +Loop_ord_sqr: + sub x2,x2,#1 + //////////////////////////////////////////////////////////////// + // | | | | | |a1*a0| | + // | | | | |a2*a0| | | + // | |a3*a2|a3*a0| | | | + // | | | |a2*a1| | | | + // | | |a3*a1| | | | | + // *| | | | | | | | 2| + // +|a3*a3|a2*a2|a1*a1|a0*a0| + // |--+--+--+--+--+--+--+--| + // |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is , i.e. follow + // + // "can't overflow" below mark carrying into high part of + // multiplication result, which can't overflow, because it + // can never be all ones. 
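The diagram above is ordinary schoolbook squaring: each cross product a[i]*a[j] with i < j is formed once, the accumulated cross terms are doubled, and the diagonal squares a[i]*a[i] are added on top. A rough C equivalent of just that 256x256 -> 512-bit step, leaving out the Montgomery reduction modulo the group order that the assembly interleaves afterwards (the helper names and the unsigned __int128 type are assumptions, not the project's code):

#include <stdint.h>

/* Add the 128-bit product (hi:lo) into r at limb position pos and
 * propagate the carry through the remaining limbs of the 512-bit result. */
static void add_at(uint64_t r[8], int pos, uint64_t lo, uint64_t hi) {
    unsigned __int128 t = (unsigned __int128)r[pos] + lo;
    r[pos] = (uint64_t)t;
    t = (t >> 64) + r[pos + 1] + hi;
    r[pos + 1] = (uint64_t)t;
    uint64_t carry = (uint64_t)(t >> 64);
    for (int k = pos + 2; carry && k < 8; k++) {
        t = (unsigned __int128)r[k] + carry;
        r[k] = (uint64_t)t;
        carry = (uint64_t)(t >> 64);
    }
}

/* r = a^2 for a 4-limb a: cross terms once, doubled, plus the diagonal. */
static void u256_sqr_sketch(uint64_t r[8], const uint64_t a[4]) {
    for (int i = 0; i < 8; i++) r[i] = 0;

    for (int i = 0; i < 4; i++)                 /* cross terms a[i]*a[j], i<j */
        for (int j = i + 1; j < 4; j++) {
            unsigned __int128 p = (unsigned __int128)a[i] * a[j];
            add_at(r, i + j, (uint64_t)p, (uint64_t)(p >> 64));
        }

    uint64_t top = 0;                           /* double the cross terms */
    for (int i = 0; i < 8; i++) {
        uint64_t next = r[i] >> 63;
        r[i] = (r[i] << 1) | top;
        top = next;
    }

    for (int i = 0; i < 4; i++) {               /* add the squares a[i]^2 */
        unsigned __int128 p = (unsigned __int128)a[i] * a[i];
        add_at(r, 2 * i, (uint64_t)p, (uint64_t)(p >> 64));
    }
}

Squaring this way needs ten limb products instead of the sixteen a general 4x4 multiplication would, which is why the code keeps a dedicated squaring path.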
+ + mul x15,x5,x4 // a[1]*a[0] + umulh x9,x5,x4 + mul x16,x6,x4 // a[2]*a[0] + umulh x10,x6,x4 + mul x17,x7,x4 // a[3]*a[0] + umulh x19,x7,x4 + + adds x16,x16,x9 // accumulate high parts of multiplication + mul x8,x6,x5 // a[2]*a[1] + umulh x9,x6,x5 + adcs x17,x17,x10 + mul x10,x7,x5 // a[3]*a[1] + umulh x11,x7,x5 + adc x19,x19,xzr // can't overflow + + mul x20,x7,x6 // a[3]*a[2] + umulh x1,x7,x6 + + adds x9,x9,x10 // accumulate high parts of multiplication + mul x14,x4,x4 // a[0]*a[0] + adc x10,x11,xzr // can't overflow + + adds x17,x17,x8 // accumulate low parts of multiplication + umulh x4,x4,x4 + adcs x19,x19,x9 + mul x9,x5,x5 // a[1]*a[1] + adcs x20,x20,x10 + umulh x5,x5,x5 + adc x1,x1,xzr // can't overflow + + adds x15,x15,x15 // acc[1-6]*=2 + mul x10,x6,x6 // a[2]*a[2] + adcs x16,x16,x16 + umulh x6,x6,x6 + adcs x17,x17,x17 + mul x11,x7,x7 // a[3]*a[3] + adcs x19,x19,x19 + umulh x7,x7,x7 + adcs x20,x20,x20 + adcs x1,x1,x1 + adc x3,xzr,xzr + + adds x15,x15,x4 // +a[i]*a[i] + mul x24,x14,x23 + adcs x16,x16,x9 + adcs x17,x17,x5 + adcs x19,x19,x10 + adcs x20,x20,x6 + adcs x1,x1,x11 + adc x3,x3,x7 + subs xzr,x14,#1 + umulh x9,x12,x24 + mul x10,x13,x24 + umulh x11,x13,x24 + + adcs x10,x10,x9 + adc x11,x11,xzr + + adds x14,x15,x10 + adcs x15,x16,x11 + adcs x16,x17,x24 + adc x17,xzr,x24 // can't overflow + mul x11,x14,x23 + lsl x8,x24,#32 + subs x15,x15,x24 + lsr x9,x24,#32 + sbcs x16,x16,x8 + sbc x17,x17,x9 // can't borrow + subs xzr,x14,#1 + umulh x9,x12,x11 + mul x10,x13,x11 + umulh x24,x13,x11 + + adcs x10,x10,x9 + adc x24,x24,xzr + + adds x14,x15,x10 + adcs x15,x16,x24 + adcs x16,x17,x11 + adc x17,xzr,x11 // can't overflow + mul x24,x14,x23 + lsl x8,x11,#32 + subs x15,x15,x11 + lsr x9,x11,#32 + sbcs x16,x16,x8 + sbc x17,x17,x9 // can't borrow + subs xzr,x14,#1 + umulh x9,x12,x24 + mul x10,x13,x24 + umulh x11,x13,x24 + + adcs x10,x10,x9 + adc x11,x11,xzr + + adds x14,x15,x10 + adcs x15,x16,x11 + adcs x16,x17,x24 + adc x17,xzr,x24 // can't overflow + mul x11,x14,x23 + lsl x8,x24,#32 + subs x15,x15,x24 + lsr x9,x24,#32 + sbcs x16,x16,x8 + sbc x17,x17,x9 // can't borrow + subs xzr,x14,#1 + umulh x9,x12,x11 + mul x10,x13,x11 + umulh x24,x13,x11 + + adcs x10,x10,x9 + adc x24,x24,xzr + + adds x14,x15,x10 + adcs x15,x16,x24 + adcs x16,x17,x11 + adc x17,xzr,x11 // can't overflow + lsl x8,x11,#32 + subs x15,x15,x11 + lsr x9,x11,#32 + sbcs x16,x16,x8 + sbc x17,x17,x9 // can't borrow + adds x14,x14,x19 // accumulate upper half + adcs x15,x15,x20 + adcs x16,x16,x1 + adcs x17,x17,x3 + adc x19,xzr,xzr + + subs x8,x14,x12 // ret -= modulus + sbcs x9,x15,x13 + sbcs x10,x16,x21 + sbcs x11,x17,x22 + sbcs xzr,x19,xzr + + csel x4,x14,x8,lo // ret = borrow ? ret : ret-modulus + csel x5,x15,x9,lo + csel x6,x16,x10,lo + csel x7,x17,x11,lo + + cbnz x2,Loop_ord_sqr + + stp x4,x5,[x0] + stp x6,x7,[x0,#16] + + ldp x19,x20,[sp,#16] + ldp x21,x22,[sp,#32] + ldp x23,x24,[sp,#48] + ldr x29,[sp],#64 + ret + +//////////////////////////////////////////////////////////////////////// +// void ecp_nistz256_select_w5(uint64_t *val, uint64_t *in_t, int index); +.globl _ecp_nistz256_select_w5 +.private_extern _ecp_nistz256_select_w5 + +.align 4 +_ecp_nistz256_select_w5: + AARCH64_VALID_CALL_TARGET + + // x10 := x0 + // w9 := 0; loop counter and incremented internal index + mov x10, x0 + mov w9, #0 + + // [v16-v21] := 0 + movi v16.16b, #0 + movi v17.16b, #0 + movi v18.16b, #0 + movi v19.16b, #0 + movi v20.16b, #0 + movi v21.16b, #0 + +Lselect_w5_loop: + // Loop 16 times. 
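In C terms, the loop below is a constant-time table lookup: all 16 entries are read unconditionally, and only the entry whose position matches the requested index survives, selected with an all-ones/all-zero mask rather than a branch. A minimal sketch of the same idea (the function name and the plain-C masking are illustrative; the actual routine keeps the accumulators in NEON registers and selects with BIT):

#include <stdint.h>
#include <string.h>

/* Copy entry `index` (1-based) out of a table of 16 entries of 12 limbs
 * (3 x 256 bits) without letting the memory access pattern depend on index.
 * index == 0 leaves the output all zero, matching the assembly. */
static void select_w5_sketch(uint64_t val[12],
                             const uint64_t table[16][12], unsigned index) {
    memset(val, 0, 12 * sizeof(uint64_t));
    for (unsigned i = 0; i < 16; i++) {
        uint64_t diff = (uint64_t)(i + 1) ^ (uint64_t)index;
        /* mask = all ones iff i + 1 == index, computed without branching */
        uint64_t mask = ((diff | (0 - diff)) >> 63) - 1;
        for (int j = 0; j < 12; j++)
            val[j] |= table[i][j] & mask;
    }
}

Touching every entry regardless of the index is the point: the scalar window being looked up is secret, so the load addresses must not depend on it.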
+ + // Increment index (loop counter); tested at the end of the loop + add w9, w9, #1 + + // [v22-v27] := Load a (3*256-bit = 6*128-bit) table entry starting at x1 + // and advance x1 to point to the next entry + ld1 {v22.2d, v23.2d, v24.2d, v25.2d}, [x1],#64 + + // x11 := (w9 == w2)? All 1s : All 0s + cmp w9, w2 + csetm x11, eq + + // continue loading ... + ld1 {v26.2d, v27.2d}, [x1],#32 + + // duplicate mask_64 into Mask (all 0s or all 1s) + dup v3.2d, x11 + + // [v16-v19] := (Mask == all 1s)? [v22-v25] : [v16-v19] + // i.e., values in output registers will remain the same if w9 != w2 + bit v16.16b, v22.16b, v3.16b + bit v17.16b, v23.16b, v3.16b + + bit v18.16b, v24.16b, v3.16b + bit v19.16b, v25.16b, v3.16b + + bit v20.16b, v26.16b, v3.16b + bit v21.16b, v27.16b, v3.16b + + // If bit #4 is not 0 (i.e. idx_ctr < 16) loop back + tbz w9, #4, Lselect_w5_loop + + // Write [v16-v21] to memory at the output pointer + st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x10],#64 + st1 {v20.2d, v21.2d}, [x10] + + ret + + + +//////////////////////////////////////////////////////////////////////// +// void ecp_nistz256_select_w7(uint64_t *val, uint64_t *in_t, int index); +.globl _ecp_nistz256_select_w7 +.private_extern _ecp_nistz256_select_w7 + +.align 4 +_ecp_nistz256_select_w7: + AARCH64_VALID_CALL_TARGET + + // w9 := 0; loop counter and incremented internal index + mov w9, #0 + + // [v16-v21] := 0 + movi v16.16b, #0 + movi v17.16b, #0 + movi v18.16b, #0 + movi v19.16b, #0 + +Lselect_w7_loop: + // Loop 64 times. + + // Increment index (loop counter); tested at the end of the loop + add w9, w9, #1 + + // [v22-v25] := Load a (2*256-bit = 4*128-bit) table entry starting at x1 + // and advance x1 to point to the next entry + ld1 {v22.2d, v23.2d, v24.2d, v25.2d}, [x1],#64 + + // x11 := (w9 == w2)? All 1s : All 0s + cmp w9, w2 + csetm x11, eq + + // duplicate mask_64 into Mask (all 0s or all 1s) + dup v3.2d, x11 + + // [v16-v19] := (Mask == all 1s)? [v22-v25] : [v16-v19] + // i.e., values in output registers will remain the same if w9 != w2 + bit v16.16b, v22.16b, v3.16b + bit v17.16b, v23.16b, v3.16b + + bit v18.16b, v24.16b, v3.16b + bit v19.16b, v25.16b, v3.16b + + // If bit #6 is not 0 (i.e. idx_ctr < 64) loop back + tbz w9, #6, Lselect_w7_loop + + // Write [v16-v19] to memory at the output pointer + st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x0] + + ret + +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__) diff --git a/ring-0.17.14/pregenerated/p256-armv8-asm-linux64.S b/ring-0.17.14/pregenerated/p256-armv8-asm-linux64.S new file mode 100644 index 0000000000..e9a8c521f4 --- /dev/null +++ b/ring-0.17.14/pregenerated/p256-armv8-asm-linux64.S @@ -0,0 +1,1608 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__) +.section .rodata +.align 5 +.Lpoly: +.quad 0xffffffffffffffff,0x00000000ffffffff,0x0000000000000000,0xffffffff00000001 +.LRR: // 2^512 mod P precomputed for NIST P256 polynomial +.quad 0x0000000000000003,0xfffffffbffffffff,0xfffffffffffffffe,0x00000004fffffffd +.Lone_mont: +.quad 0x0000000000000001,0xffffffff00000000,0xffffffffffffffff,0x00000000fffffffe +.Lone: +.quad 1,0,0,0 +.Lord: +.quad 0xf3b9cac2fc632551,0xbce6faada7179e84,0xffffffffffffffff,0xffffffff00000000 +.LordK: +.quad 0xccd1c8aaee00bc4f +.byte 69,67,80,95,78,73,83,84,90,50,53,54,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.text + +// void ecp_nistz256_mul_mont(BN_ULONG x0[4],const BN_ULONG x1[4], +// const BN_ULONG x2[4]); +.globl ecp_nistz256_mul_mont +.hidden ecp_nistz256_mul_mont +.type ecp_nistz256_mul_mont,%function +.align 4 +ecp_nistz256_mul_mont: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-32]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + + ldr x3,[x2] // bp[0] + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + adrp x13,.Lpoly + add x13,x13,:lo12:.Lpoly + ldr x12,[x13,#8] + ldr x13,[x13,#24] + + bl __ecp_nistz256_mul_mont + + ldp x19,x20,[sp,#16] + ldp x29,x30,[sp],#32 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont + +// void ecp_nistz256_sqr_mont(BN_ULONG x0[4],const BN_ULONG x1[4]); +.globl ecp_nistz256_sqr_mont +.hidden ecp_nistz256_sqr_mont +.type ecp_nistz256_sqr_mont,%function +.align 4 +ecp_nistz256_sqr_mont: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-32]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + adrp x13,.Lpoly + add x13,x13,:lo12:.Lpoly + ldr x12,[x13,#8] + ldr x13,[x13,#24] + + bl __ecp_nistz256_sqr_mont + + ldp x19,x20,[sp,#16] + ldp x29,x30,[sp],#32 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont + +// void ecp_nistz256_neg(BN_ULONG x0[4],const BN_ULONG x1[4]); +.globl ecp_nistz256_neg +.hidden ecp_nistz256_neg +.type ecp_nistz256_neg,%function +.align 4 +ecp_nistz256_neg: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 + + mov x2,x1 + mov x14,xzr // a = 0 + mov x15,xzr + mov x16,xzr + mov x17,xzr + adrp x13,.Lpoly + add x13,x13,:lo12:.Lpoly + ldr x12,[x13,#8] + ldr x13,[x13,#24] + + bl __ecp_nistz256_sub_from + + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size ecp_nistz256_neg,.-ecp_nistz256_neg + +// note that __ecp_nistz256_mul_mont expects a[0-3] input pre-loaded +// to x4-x7 and b[0] - to x3 +.type __ecp_nistz256_mul_mont,%function +.align 4 +__ecp_nistz256_mul_mont: + mul x14,x4,x3 // a[0]*b[0] + umulh x8,x4,x3 + + mul x15,x5,x3 // a[1]*b[0] + umulh x9,x5,x3 + + mul x16,x6,x3 // a[2]*b[0] + umulh x10,x6,x3 + + mul x17,x7,x3 // a[3]*b[0] + umulh x11,x7,x3 + ldr x3,[x2,#8] // b[1] + + adds x15,x15,x8 // accumulate high parts of multiplication + lsl x8,x14,#32 + adcs x16,x16,x9 + lsr x9,x14,#32 + adcs x17,x17,x10 + adc x19,xzr,x11 + mov x20,xzr + subs x10,x14,x8 // "*0xffff0001" + sbc x11,x14,x9 + adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] + mul x8,x4,x3 // lo(a[0]*b[i]) + adcs x15,x16,x9 + mul x9,x5,x3 // lo(a[1]*b[i]) + adcs x16,x17,x10 // +=acc[0]*0xffff0001 + mul x10,x6,x3 // lo(a[2]*b[i]) + adcs x17,x19,x11 + mul x11,x7,x3 // lo(a[3]*b[i]) + adc x19,x20,xzr + + adds x14,x14,x8 // accumulate low parts of multiplication + umulh x8,x4,x3 // hi(a[0]*b[i]) + adcs x15,x15,x9 + umulh x9,x5,x3 // hi(a[1]*b[i]) + adcs x16,x16,x10 + umulh x10,x6,x3 // hi(a[2]*b[i]) + adcs x17,x17,x11 + umulh x11,x7,x3 // hi(a[3]*b[i]) + adc x19,x19,xzr + ldr x3,[x2,#8*(1+1)] // b[1+1] + adds x15,x15,x8 // accumulate high parts of multiplication + lsl x8,x14,#32 + adcs x16,x16,x9 + lsr x9,x14,#32 + adcs x17,x17,x10 + adcs x19,x19,x11 + adc x20,xzr,xzr + subs x10,x14,x8 // "*0xffff0001" + sbc x11,x14,x9 + adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] + mul x8,x4,x3 // lo(a[0]*b[i]) + adcs x15,x16,x9 + mul x9,x5,x3 // lo(a[1]*b[i]) + adcs x16,x17,x10 // +=acc[0]*0xffff0001 + mul x10,x6,x3 // lo(a[2]*b[i]) + adcs x17,x19,x11 + mul x11,x7,x3 // lo(a[3]*b[i]) + adc x19,x20,xzr + + adds x14,x14,x8 // accumulate low parts of multiplication + umulh x8,x4,x3 // hi(a[0]*b[i]) + adcs x15,x15,x9 + umulh x9,x5,x3 // hi(a[1]*b[i]) + adcs x16,x16,x10 + umulh x10,x6,x3 // hi(a[2]*b[i]) + adcs x17,x17,x11 + umulh x11,x7,x3 // hi(a[3]*b[i]) + adc x19,x19,xzr + ldr x3,[x2,#8*(2+1)] // b[2+1] + adds x15,x15,x8 // accumulate high parts of multiplication + lsl x8,x14,#32 + adcs x16,x16,x9 + lsr x9,x14,#32 + adcs x17,x17,x10 + adcs x19,x19,x11 + adc x20,xzr,xzr + subs x10,x14,x8 // "*0xffff0001" + sbc x11,x14,x9 + adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] + mul x8,x4,x3 // lo(a[0]*b[i]) + adcs x15,x16,x9 + mul x9,x5,x3 // lo(a[1]*b[i]) + adcs x16,x17,x10 // +=acc[0]*0xffff0001 + mul x10,x6,x3 // lo(a[2]*b[i]) + adcs x17,x19,x11 + mul x11,x7,x3 // lo(a[3]*b[i]) + adc x19,x20,xzr + + adds x14,x14,x8 // accumulate low parts of multiplication + umulh x8,x4,x3 // hi(a[0]*b[i]) + adcs x15,x15,x9 + umulh x9,x5,x3 // hi(a[1]*b[i]) + adcs x16,x16,x10 + umulh x10,x6,x3 // hi(a[2]*b[i]) + adcs x17,x17,x11 + umulh x11,x7,x3 // hi(a[3]*b[i]) + adc x19,x19,xzr + adds x15,x15,x8 // accumulate high parts of multiplication + lsl x8,x14,#32 + adcs x16,x16,x9 + lsr x9,x14,#32 + adcs x17,x17,x10 + adcs x19,x19,x11 + adc x20,xzr,xzr + // last reduction + subs x10,x14,x8 // "*0xffff0001" + sbc x11,x14,x9 + adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] + adcs x15,x16,x9 + adcs x16,x17,x10 // +=acc[0]*0xffff0001 + adcs x17,x19,x11 + adc x19,x20,xzr + + adds x8,x14,#1 // subs x8,x14,#-1 // tmp = ret-modulus + sbcs 
x9,x15,x12 + sbcs x10,x16,xzr + sbcs x11,x17,x13 + sbcs xzr,x19,xzr // did it borrow? + + csel x14,x14,x8,lo // ret = borrow ? ret : ret-modulus + csel x15,x15,x9,lo + csel x16,x16,x10,lo + stp x14,x15,[x0] + csel x17,x17,x11,lo + stp x16,x17,[x0,#16] + + ret +.size __ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont + +// note that __ecp_nistz256_sqr_mont expects a[0-3] input pre-loaded +// to x4-x7 +.type __ecp_nistz256_sqr_mont,%function +.align 4 +__ecp_nistz256_sqr_mont: + // | | | | | |a1*a0| | + // | | | | |a2*a0| | | + // | |a3*a2|a3*a0| | | | + // | | | |a2*a1| | | | + // | | |a3*a1| | | | | + // *| | | | | | | | 2| + // +|a3*a3|a2*a2|a1*a1|a0*a0| + // |--+--+--+--+--+--+--+--| + // |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is , i.e. follow + // + // "can't overflow" below mark carrying into high part of + // multiplication result, which can't overflow, because it + // can never be all ones. + + mul x15,x5,x4 // a[1]*a[0] + umulh x9,x5,x4 + mul x16,x6,x4 // a[2]*a[0] + umulh x10,x6,x4 + mul x17,x7,x4 // a[3]*a[0] + umulh x19,x7,x4 + + adds x16,x16,x9 // accumulate high parts of multiplication + mul x8,x6,x5 // a[2]*a[1] + umulh x9,x6,x5 + adcs x17,x17,x10 + mul x10,x7,x5 // a[3]*a[1] + umulh x11,x7,x5 + adc x19,x19,xzr // can't overflow + + mul x20,x7,x6 // a[3]*a[2] + umulh x1,x7,x6 + + adds x9,x9,x10 // accumulate high parts of multiplication + mul x14,x4,x4 // a[0]*a[0] + adc x10,x11,xzr // can't overflow + + adds x17,x17,x8 // accumulate low parts of multiplication + umulh x4,x4,x4 + adcs x19,x19,x9 + mul x9,x5,x5 // a[1]*a[1] + adcs x20,x20,x10 + umulh x5,x5,x5 + adc x1,x1,xzr // can't overflow + + adds x15,x15,x15 // acc[1-6]*=2 + mul x10,x6,x6 // a[2]*a[2] + adcs x16,x16,x16 + umulh x6,x6,x6 + adcs x17,x17,x17 + mul x11,x7,x7 // a[3]*a[3] + adcs x19,x19,x19 + umulh x7,x7,x7 + adcs x20,x20,x20 + adcs x1,x1,x1 + adc x2,xzr,xzr + + adds x15,x15,x4 // +a[i]*a[i] + adcs x16,x16,x9 + adcs x17,x17,x5 + adcs x19,x19,x10 + adcs x20,x20,x6 + lsl x8,x14,#32 + adcs x1,x1,x11 + lsr x9,x14,#32 + adc x2,x2,x7 + subs x10,x14,x8 // "*0xffff0001" + sbc x11,x14,x9 + adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] + adcs x15,x16,x9 + lsl x8,x14,#32 + adcs x16,x17,x10 // +=acc[0]*0xffff0001 + lsr x9,x14,#32 + adc x17,x11,xzr // can't overflow + subs x10,x14,x8 // "*0xffff0001" + sbc x11,x14,x9 + adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] + adcs x15,x16,x9 + lsl x8,x14,#32 + adcs x16,x17,x10 // +=acc[0]*0xffff0001 + lsr x9,x14,#32 + adc x17,x11,xzr // can't overflow + subs x10,x14,x8 // "*0xffff0001" + sbc x11,x14,x9 + adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] + adcs x15,x16,x9 + lsl x8,x14,#32 + adcs x16,x17,x10 // +=acc[0]*0xffff0001 + lsr x9,x14,#32 + adc x17,x11,xzr // can't overflow + subs x10,x14,x8 // "*0xffff0001" + sbc x11,x14,x9 + adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] + adcs x15,x16,x9 + adcs x16,x17,x10 // +=acc[0]*0xffff0001 + adc x17,x11,xzr // can't overflow + + adds x14,x14,x19 // accumulate upper half + adcs x15,x15,x20 + adcs x16,x16,x1 + adcs x17,x17,x2 + adc x19,xzr,xzr + + adds x8,x14,#1 // subs x8,x14,#-1 // tmp = ret-modulus + sbcs x9,x15,x12 + sbcs x10,x16,xzr + sbcs x11,x17,x13 + sbcs xzr,x19,xzr // did it borrow? + + csel x14,x14,x8,lo // ret = borrow ? ret : ret-modulus + csel x15,x15,x9,lo + csel x16,x16,x10,lo + stp x14,x15,[x0] + csel x17,x17,x11,lo + stp x16,x17,[x0,#16] + + ret +.size __ecp_nistz256_sqr_mont,.-__ecp_nistz256_sqr_mont + +// Note that __ecp_nistz256_add_to expects both input vectors pre-loaded to +// x4-x7 and x8-x11. 
This is done because it's used in multiple +// contexts, e.g. in multiplication by 2 and 3... +.type __ecp_nistz256_add_to,%function +.align 4 +__ecp_nistz256_add_to: + adds x14,x14,x8 // ret = a+b + adcs x15,x15,x9 + adcs x16,x16,x10 + adcs x17,x17,x11 + adc x1,xzr,xzr // zap x1 + + adds x8,x14,#1 // subs x8,x4,#-1 // tmp = ret-modulus + sbcs x9,x15,x12 + sbcs x10,x16,xzr + sbcs x11,x17,x13 + sbcs xzr,x1,xzr // did subtraction borrow? + + csel x14,x14,x8,lo // ret = borrow ? ret : ret-modulus + csel x15,x15,x9,lo + csel x16,x16,x10,lo + stp x14,x15,[x0] + csel x17,x17,x11,lo + stp x16,x17,[x0,#16] + + ret +.size __ecp_nistz256_add_to,.-__ecp_nistz256_add_to + +.type __ecp_nistz256_sub_from,%function +.align 4 +__ecp_nistz256_sub_from: + ldp x8,x9,[x2] + ldp x10,x11,[x2,#16] + subs x14,x14,x8 // ret = a-b + sbcs x15,x15,x9 + sbcs x16,x16,x10 + sbcs x17,x17,x11 + sbc x1,xzr,xzr // zap x1 + + subs x8,x14,#1 // adds x8,x4,#-1 // tmp = ret+modulus + adcs x9,x15,x12 + adcs x10,x16,xzr + adc x11,x17,x13 + cmp x1,xzr // did subtraction borrow? + + csel x14,x14,x8,eq // ret = borrow ? ret+modulus : ret + csel x15,x15,x9,eq + csel x16,x16,x10,eq + stp x14,x15,[x0] + csel x17,x17,x11,eq + stp x16,x17,[x0,#16] + + ret +.size __ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from + +.type __ecp_nistz256_sub_morf,%function +.align 4 +__ecp_nistz256_sub_morf: + ldp x8,x9,[x2] + ldp x10,x11,[x2,#16] + subs x14,x8,x14 // ret = b-a + sbcs x15,x9,x15 + sbcs x16,x10,x16 + sbcs x17,x11,x17 + sbc x1,xzr,xzr // zap x1 + + subs x8,x14,#1 // adds x8,x4,#-1 // tmp = ret+modulus + adcs x9,x15,x12 + adcs x10,x16,xzr + adc x11,x17,x13 + cmp x1,xzr // did subtraction borrow? + + csel x14,x14,x8,eq // ret = borrow ? ret+modulus : ret + csel x15,x15,x9,eq + csel x16,x16,x10,eq + stp x14,x15,[x0] + csel x17,x17,x11,eq + stp x16,x17,[x0,#16] + + ret +.size __ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf + +.type __ecp_nistz256_div_by_2,%function +.align 4 +__ecp_nistz256_div_by_2: + subs x8,x14,#1 // adds x8,x4,#-1 // tmp = a+modulus + adcs x9,x15,x12 + adcs x10,x16,xzr + adcs x11,x17,x13 + adc x1,xzr,xzr // zap x1 + tst x14,#1 // is a even? + + csel x14,x14,x8,eq // ret = even ? a : a+modulus + csel x15,x15,x9,eq + csel x16,x16,x10,eq + csel x17,x17,x11,eq + csel x1,xzr,x1,eq + + lsr x14,x14,#1 // ret >>= 1 + orr x14,x14,x15,lsl#63 + lsr x15,x15,#1 + orr x15,x15,x16,lsl#63 + lsr x16,x16,#1 + orr x16,x16,x17,lsl#63 + lsr x17,x17,#1 + stp x14,x15,[x0] + orr x17,x17,x1,lsl#63 + stp x16,x17,[x0,#16] + + ret +.size __ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2 +.globl ecp_nistz256_point_double +.hidden ecp_nistz256_point_double +.type ecp_nistz256_point_double,%function +.align 5 +ecp_nistz256_point_double: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-96]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + sub sp,sp,#32*4 + +.Ldouble_shortcut: + ldp x14,x15,[x1,#32] + mov x21,x0 + ldp x16,x17,[x1,#48] + mov x22,x1 + adrp x13,.Lpoly + add x13,x13,:lo12:.Lpoly + ldr x12,[x13,#8] + mov x8,x14 + ldr x13,[x13,#24] + mov x9,x15 + ldp x4,x5,[x22,#64] // forward load for p256_sqr_mont + mov x10,x16 + mov x11,x17 + ldp x6,x7,[x22,#64+16] + add x0,sp,#0 + bl __ecp_nistz256_add_to // p256_mul_by_2(S, in_y); + + add x0,sp,#64 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Zsqr, in_z); + + ldp x8,x9,[x22] + ldp x10,x11,[x22,#16] + mov x4,x14 // put Zsqr aside for p256_sub + mov x5,x15 + mov x6,x16 + mov x7,x17 + add x0,sp,#32 + bl __ecp_nistz256_add_to // p256_add(M, Zsqr, in_x); + + add x2,x22,#0 + mov x14,x4 // restore Zsqr + mov x15,x5 + ldp x4,x5,[sp,#0] // forward load for p256_sqr_mont + mov x16,x6 + mov x17,x7 + ldp x6,x7,[sp,#0+16] + add x0,sp,#64 + bl __ecp_nistz256_sub_morf // p256_sub(Zsqr, in_x, Zsqr); + + add x0,sp,#0 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(S, S); + + ldr x3,[x22,#32] + ldp x4,x5,[x22,#64] + ldp x6,x7,[x22,#64+16] + add x2,x22,#32 + add x0,sp,#96 + bl __ecp_nistz256_mul_mont // p256_mul_mont(tmp0, in_z, in_y); + + mov x8,x14 + mov x9,x15 + ldp x4,x5,[sp,#0] // forward load for p256_sqr_mont + mov x10,x16 + mov x11,x17 + ldp x6,x7,[sp,#0+16] + add x0,x21,#64 + bl __ecp_nistz256_add_to // p256_mul_by_2(res_z, tmp0); + + add x0,sp,#96 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(tmp0, S); + + ldr x3,[sp,#64] // forward load for p256_mul_mont + ldp x4,x5,[sp,#32] + ldp x6,x7,[sp,#32+16] + add x0,x21,#32 + bl __ecp_nistz256_div_by_2 // p256_div_by_2(res_y, tmp0); + + add x2,sp,#64 + add x0,sp,#32 + bl __ecp_nistz256_mul_mont // p256_mul_mont(M, M, Zsqr); + + mov x8,x14 // duplicate M + mov x9,x15 + mov x10,x16 + mov x11,x17 + mov x4,x14 // put M aside + mov x5,x15 + mov x6,x16 + mov x7,x17 + add x0,sp,#32 + bl __ecp_nistz256_add_to + mov x8,x4 // restore M + mov x9,x5 + ldr x3,[x22] // forward load for p256_mul_mont + mov x10,x6 + ldp x4,x5,[sp,#0] + mov x11,x7 + ldp x6,x7,[sp,#0+16] + bl __ecp_nistz256_add_to // p256_mul_by_3(M, M); + + add x2,x22,#0 + add x0,sp,#0 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S, S, in_x); + + mov x8,x14 + mov x9,x15 + ldp x4,x5,[sp,#32] // forward load for p256_sqr_mont + mov x10,x16 + mov x11,x17 + ldp x6,x7,[sp,#32+16] + add x0,sp,#96 + bl __ecp_nistz256_add_to // p256_mul_by_2(tmp0, S); + + add x0,x21,#0 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(res_x, M); + + add x2,sp,#96 + bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, tmp0); + + add x2,sp,#0 + add x0,sp,#0 + bl __ecp_nistz256_sub_morf // p256_sub(S, S, res_x); + + ldr x3,[sp,#32] + mov x4,x14 // copy S + mov x5,x15 + mov x6,x16 + mov x7,x17 + add x2,sp,#32 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S, S, M); + + add x2,x21,#32 + add x0,x21,#32 + bl __ecp_nistz256_sub_from // p256_sub(res_y, S, res_y); + + add sp,x29,#0 // destroy frame + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x29,x30,[sp],#96 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size ecp_nistz256_point_double,.-ecp_nistz256_point_double +.globl ecp_nistz256_point_add +.hidden ecp_nistz256_point_add +.type ecp_nistz256_point_add,%function +.align 5 +ecp_nistz256_point_add: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-96]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#32*12 + + ldp x4,x5,[x2,#64] // in2_z + ldp x6,x7,[x2,#64+16] + mov x21,x0 + mov x22,x1 + mov x23,x2 + adrp x13,.Lpoly + add x13,x13,:lo12:.Lpoly + ldr x12,[x13,#8] + ldr x13,[x13,#24] + orr x8,x4,x5 + orr x10,x6,x7 + orr x25,x8,x10 + cmp x25,#0 + csetm x25,ne // ~in2infty + add x0,sp,#192 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z2sqr, in2_z); + + ldp x4,x5,[x22,#64] // in1_z + ldp x6,x7,[x22,#64+16] + orr x8,x4,x5 + orr x10,x6,x7 + orr x24,x8,x10 + cmp x24,#0 + csetm x24,ne // ~in1infty + add x0,sp,#128 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z1sqr, in1_z); + + ldr x3,[x23,#64] + ldp x4,x5,[sp,#192] + ldp x6,x7,[sp,#192+16] + add x2,x23,#64 + add x0,sp,#320 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S1, Z2sqr, in2_z); + + ldr x3,[x22,#64] + ldp x4,x5,[sp,#128] + ldp x6,x7,[sp,#128+16] + add x2,x22,#64 + add x0,sp,#352 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, Z1sqr, in1_z); + + ldr x3,[x22,#32] + ldp x4,x5,[sp,#320] + ldp x6,x7,[sp,#320+16] + add x2,x22,#32 + add x0,sp,#320 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S1, S1, in1_y); + + ldr x3,[x23,#32] + ldp x4,x5,[sp,#352] + ldp x6,x7,[sp,#352+16] + add x2,x23,#32 + add x0,sp,#352 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S2, in2_y); + + add x2,sp,#320 + ldr x3,[sp,#192] // forward load for p256_mul_mont + ldp x4,x5,[x22] + ldp x6,x7,[x22,#16] + add x0,sp,#160 + bl __ecp_nistz256_sub_from // p256_sub(R, S2, S1); + + orr x14,x14,x15 // see if result is zero + orr x16,x16,x17 + orr x26,x14,x16 // ~is_equal(S1,S2) + + add x2,sp,#192 + add x0,sp,#256 + bl __ecp_nistz256_mul_mont // p256_mul_mont(U1, in1_x, Z2sqr); + + ldr x3,[sp,#128] + ldp x4,x5,[x23] + ldp x6,x7,[x23,#16] + add x2,sp,#128 + add x0,sp,#288 + bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, in2_x, Z1sqr); + + add x2,sp,#256 + ldp x4,x5,[sp,#160] // forward load for p256_sqr_mont + ldp x6,x7,[sp,#160+16] + add x0,sp,#96 + bl __ecp_nistz256_sub_from // p256_sub(H, U2, U1); + + orr x14,x14,x15 // see if result is zero + orr x16,x16,x17 + orr x14,x14,x16 // ~is_equal(U1,U2) + + mvn x27,x24 // -1/0 -> 0/-1 + mvn x28,x25 // -1/0 -> 0/-1 + orr x14,x14,x27 + orr x14,x14,x28 + orr x14,x14,x26 + cbnz x14,.Ladd_proceed // if(~is_equal(U1,U2) | in1infty | in2infty | ~is_equal(S1,S2)) + +.Ladd_double: + mov x1,x22 + mov x0,x21 + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + add sp,sp,#256 // #256 is from #32*(12-4). 
difference in stack frames + b .Ldouble_shortcut + +.align 4 +.Ladd_proceed: + add x0,sp,#192 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Rsqr, R); + + ldr x3,[x22,#64] + ldp x4,x5,[sp,#96] + ldp x6,x7,[sp,#96+16] + add x2,x22,#64 + add x0,sp,#64 + bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, H, in1_z); + + ldp x4,x5,[sp,#96] + ldp x6,x7,[sp,#96+16] + add x0,sp,#128 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Hsqr, H); + + ldr x3,[x23,#64] + ldp x4,x5,[sp,#64] + ldp x6,x7,[sp,#64+16] + add x2,x23,#64 + add x0,sp,#64 + bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, res_z, in2_z); + + ldr x3,[sp,#96] + ldp x4,x5,[sp,#128] + ldp x6,x7,[sp,#128+16] + add x2,sp,#96 + add x0,sp,#224 + bl __ecp_nistz256_mul_mont // p256_mul_mont(Hcub, Hsqr, H); + + ldr x3,[sp,#128] + ldp x4,x5,[sp,#256] + ldp x6,x7,[sp,#256+16] + add x2,sp,#128 + add x0,sp,#288 + bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, U1, Hsqr); + + mov x8,x14 + mov x9,x15 + mov x10,x16 + mov x11,x17 + add x0,sp,#128 + bl __ecp_nistz256_add_to // p256_mul_by_2(Hsqr, U2); + + add x2,sp,#192 + add x0,sp,#0 + bl __ecp_nistz256_sub_morf // p256_sub(res_x, Rsqr, Hsqr); + + add x2,sp,#224 + bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, Hcub); + + add x2,sp,#288 + ldr x3,[sp,#224] // forward load for p256_mul_mont + ldp x4,x5,[sp,#320] + ldp x6,x7,[sp,#320+16] + add x0,sp,#32 + bl __ecp_nistz256_sub_morf // p256_sub(res_y, U2, res_x); + + add x2,sp,#224 + add x0,sp,#352 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S1, Hcub); + + ldr x3,[sp,#160] + ldp x4,x5,[sp,#32] + ldp x6,x7,[sp,#32+16] + add x2,sp,#160 + add x0,sp,#32 + bl __ecp_nistz256_mul_mont // p256_mul_mont(res_y, res_y, R); + + add x2,sp,#352 + bl __ecp_nistz256_sub_from // p256_sub(res_y, res_y, S2); + + ldp x4,x5,[sp,#0] // res + ldp x6,x7,[sp,#0+16] + ldp x8,x9,[x23] // in2 + ldp x10,x11,[x23,#16] + ldp x14,x15,[x22,#0] // in1 + cmp x24,#0 // ~, remember? + ldp x16,x17,[x22,#0+16] + csel x8,x4,x8,ne + csel x9,x5,x9,ne + ldp x4,x5,[sp,#0+0+32] // res + csel x10,x6,x10,ne + csel x11,x7,x11,ne + cmp x25,#0 // ~, remember? + ldp x6,x7,[sp,#0+0+48] + csel x14,x8,x14,ne + csel x15,x9,x15,ne + ldp x8,x9,[x23,#0+32] // in2 + csel x16,x10,x16,ne + csel x17,x11,x17,ne + ldp x10,x11,[x23,#0+48] + stp x14,x15,[x21,#0] + stp x16,x17,[x21,#0+16] + ldp x14,x15,[x22,#32] // in1 + cmp x24,#0 // ~, remember? + ldp x16,x17,[x22,#32+16] + csel x8,x4,x8,ne + csel x9,x5,x9,ne + ldp x4,x5,[sp,#0+32+32] // res + csel x10,x6,x10,ne + csel x11,x7,x11,ne + cmp x25,#0 // ~, remember? + ldp x6,x7,[sp,#0+32+48] + csel x14,x8,x14,ne + csel x15,x9,x15,ne + ldp x8,x9,[x23,#32+32] // in2 + csel x16,x10,x16,ne + csel x17,x11,x17,ne + ldp x10,x11,[x23,#32+48] + stp x14,x15,[x21,#32] + stp x16,x17,[x21,#32+16] + ldp x14,x15,[x22,#64] // in1 + cmp x24,#0 // ~, remember? + ldp x16,x17,[x22,#64+16] + csel x8,x4,x8,ne + csel x9,x5,x9,ne + csel x10,x6,x10,ne + csel x11,x7,x11,ne + cmp x25,#0 // ~, remember? 
+ csel x14,x8,x14,ne + csel x15,x9,x15,ne + csel x16,x10,x16,ne + csel x17,x11,x17,ne + stp x14,x15,[x21,#64] + stp x16,x17,[x21,#64+16] + +.Ladd_done: + add sp,x29,#0 // destroy frame + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size ecp_nistz256_point_add,.-ecp_nistz256_point_add +.globl ecp_nistz256_point_add_affine +.hidden ecp_nistz256_point_add_affine +.type ecp_nistz256_point_add_affine,%function +.align 5 +ecp_nistz256_point_add_affine: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-80]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + sub sp,sp,#32*10 + + mov x21,x0 + mov x22,x1 + mov x23,x2 + adrp x13,.Lpoly + add x13,x13,:lo12:.Lpoly + ldr x12,[x13,#8] + ldr x13,[x13,#24] + + ldp x4,x5,[x1,#64] // in1_z + ldp x6,x7,[x1,#64+16] + orr x8,x4,x5 + orr x10,x6,x7 + orr x24,x8,x10 + cmp x24,#0 + csetm x24,ne // ~in1infty + + ldp x14,x15,[x2] // in2_x + ldp x16,x17,[x2,#16] + ldp x8,x9,[x2,#32] // in2_y + ldp x10,x11,[x2,#48] + orr x14,x14,x15 + orr x16,x16,x17 + orr x8,x8,x9 + orr x10,x10,x11 + orr x14,x14,x16 + orr x8,x8,x10 + orr x25,x14,x8 + cmp x25,#0 + csetm x25,ne // ~in2infty + + add x0,sp,#128 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z1sqr, in1_z); + + mov x4,x14 + mov x5,x15 + mov x6,x16 + mov x7,x17 + ldr x3,[x23] + add x2,x23,#0 + add x0,sp,#96 + bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, Z1sqr, in2_x); + + add x2,x22,#0 + ldr x3,[x22,#64] // forward load for p256_mul_mont + ldp x4,x5,[sp,#128] + ldp x6,x7,[sp,#128+16] + add x0,sp,#160 + bl __ecp_nistz256_sub_from // p256_sub(H, U2, in1_x); + + add x2,x22,#64 + add x0,sp,#128 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, Z1sqr, in1_z); + + ldr x3,[x22,#64] + ldp x4,x5,[sp,#160] + ldp x6,x7,[sp,#160+16] + add x2,x22,#64 + add x0,sp,#64 + bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, H, in1_z); + + ldr x3,[x23,#32] + ldp x4,x5,[sp,#128] + ldp x6,x7,[sp,#128+16] + add x2,x23,#32 + add x0,sp,#128 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S2, in2_y); + + add x2,x22,#32 + ldp x4,x5,[sp,#160] // forward load for p256_sqr_mont + ldp x6,x7,[sp,#160+16] + add x0,sp,#192 + bl __ecp_nistz256_sub_from // p256_sub(R, S2, in1_y); + + add x0,sp,#224 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Hsqr, H); + + ldp x4,x5,[sp,#192] + ldp x6,x7,[sp,#192+16] + add x0,sp,#288 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Rsqr, R); + + ldr x3,[sp,#160] + ldp x4,x5,[sp,#224] + ldp x6,x7,[sp,#224+16] + add x2,sp,#160 + add x0,sp,#256 + bl __ecp_nistz256_mul_mont // p256_mul_mont(Hcub, Hsqr, H); + + ldr x3,[x22] + ldp x4,x5,[sp,#224] + ldp x6,x7,[sp,#224+16] + add x2,x22,#0 + add x0,sp,#96 + bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, in1_x, Hsqr); + + mov x8,x14 + mov x9,x15 + mov x10,x16 + mov x11,x17 + add x0,sp,#224 + bl __ecp_nistz256_add_to // p256_mul_by_2(Hsqr, U2); + + add x2,sp,#288 + add x0,sp,#0 + bl __ecp_nistz256_sub_morf // p256_sub(res_x, Rsqr, Hsqr); + + add x2,sp,#256 + bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, Hcub); + + add x2,sp,#96 + ldr x3,[x22,#32] // forward load for p256_mul_mont + ldp x4,x5,[sp,#256] + ldp x6,x7,[sp,#256+16] + add x0,sp,#32 + bl __ecp_nistz256_sub_morf // p256_sub(res_y, U2, res_x); + + add x2,x22,#32 + add x0,sp,#128 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, in1_y, Hcub); + + ldr x3,[sp,#192] + ldp x4,x5,[sp,#32] + ldp x6,x7,[sp,#32+16] + add x2,sp,#192 + add 
x0,sp,#32 + bl __ecp_nistz256_mul_mont // p256_mul_mont(res_y, res_y, R); + + add x2,sp,#128 + bl __ecp_nistz256_sub_from // p256_sub(res_y, res_y, S2); + + ldp x4,x5,[sp,#0] // res + ldp x6,x7,[sp,#0+16] + ldp x8,x9,[x23] // in2 + ldp x10,x11,[x23,#16] + ldp x14,x15,[x22,#0] // in1 + cmp x24,#0 // ~, remember? + ldp x16,x17,[x22,#0+16] + csel x8,x4,x8,ne + csel x9,x5,x9,ne + ldp x4,x5,[sp,#0+0+32] // res + csel x10,x6,x10,ne + csel x11,x7,x11,ne + cmp x25,#0 // ~, remember? + ldp x6,x7,[sp,#0+0+48] + csel x14,x8,x14,ne + csel x15,x9,x15,ne + ldp x8,x9,[x23,#0+32] // in2 + csel x16,x10,x16,ne + csel x17,x11,x17,ne + ldp x10,x11,[x23,#0+48] + stp x14,x15,[x21,#0] + stp x16,x17,[x21,#0+16] + adrp x23,.Lone_mont-64 + add x23,x23,:lo12:.Lone_mont-64 + ldp x14,x15,[x22,#32] // in1 + cmp x24,#0 // ~, remember? + ldp x16,x17,[x22,#32+16] + csel x8,x4,x8,ne + csel x9,x5,x9,ne + ldp x4,x5,[sp,#0+32+32] // res + csel x10,x6,x10,ne + csel x11,x7,x11,ne + cmp x25,#0 // ~, remember? + ldp x6,x7,[sp,#0+32+48] + csel x14,x8,x14,ne + csel x15,x9,x15,ne + ldp x8,x9,[x23,#32+32] // in2 + csel x16,x10,x16,ne + csel x17,x11,x17,ne + ldp x10,x11,[x23,#32+48] + stp x14,x15,[x21,#32] + stp x16,x17,[x21,#32+16] + ldp x14,x15,[x22,#64] // in1 + cmp x24,#0 // ~, remember? + ldp x16,x17,[x22,#64+16] + csel x8,x4,x8,ne + csel x9,x5,x9,ne + csel x10,x6,x10,ne + csel x11,x7,x11,ne + cmp x25,#0 // ~, remember? + csel x14,x8,x14,ne + csel x15,x9,x15,ne + csel x16,x10,x16,ne + csel x17,x11,x17,ne + stp x14,x15,[x21,#64] + stp x16,x17,[x21,#64+16] + + add sp,x29,#0 // destroy frame + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x29,x30,[sp],#80 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine +//////////////////////////////////////////////////////////////////////// +// void ecp_nistz256_ord_mul_mont(uint64_t res[4], uint64_t a[4], +// uint64_t b[4]); +.globl ecp_nistz256_ord_mul_mont +.hidden ecp_nistz256_ord_mul_mont +.type ecp_nistz256_ord_mul_mont,%function +.align 4 +ecp_nistz256_ord_mul_mont: + AARCH64_VALID_CALL_TARGET + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + stp x29,x30,[sp,#-64]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + adrp x23,.Lord + add x23,x23,:lo12:.Lord + ldr x3,[x2] // bp[0] + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + + ldp x12,x13,[x23,#0] + ldp x21,x22,[x23,#16] + ldr x23,[x23,#32] + + mul x14,x4,x3 // a[0]*b[0] + umulh x8,x4,x3 + + mul x15,x5,x3 // a[1]*b[0] + umulh x9,x5,x3 + + mul x16,x6,x3 // a[2]*b[0] + umulh x10,x6,x3 + + mul x17,x7,x3 // a[3]*b[0] + umulh x19,x7,x3 + + mul x24,x14,x23 + + adds x15,x15,x8 // accumulate high parts of multiplication + adcs x16,x16,x9 + adcs x17,x17,x10 + adc x19,x19,xzr + mov x20,xzr + ldr x3,[x2,#8*1] // b[i] + + lsl x8,x24,#32 + subs x16,x16,x24 + lsr x9,x24,#32 + sbcs x17,x17,x8 + sbcs x19,x19,x9 + sbc x20,x20,xzr + + subs xzr,x14,#1 + umulh x9,x12,x24 + mul x10,x13,x24 + umulh x11,x13,x24 + + adcs x10,x10,x9 + mul x8,x4,x3 + adc x11,x11,xzr + mul x9,x5,x3 + + adds x14,x15,x10 + mul x10,x6,x3 + adcs x15,x16,x11 + mul x11,x7,x3 + adcs x16,x17,x24 + adcs x17,x19,x24 + adc x19,x20,xzr + + adds x14,x14,x8 // accumulate low parts + umulh x8,x4,x3 + adcs x15,x15,x9 + umulh x9,x5,x3 + adcs x16,x16,x10 + umulh x10,x6,x3 + adcs x17,x17,x11 + umulh x11,x7,x3 + adc x19,x19,xzr + mul x24,x14,x23 + adds x15,x15,x8 // accumulate high parts + adcs x16,x16,x9 + adcs x17,x17,x10 + adcs x19,x19,x11 + adc x20,xzr,xzr + ldr x3,[x2,#8*2] // b[i] + + lsl x8,x24,#32 + subs x16,x16,x24 + lsr x9,x24,#32 + sbcs x17,x17,x8 + sbcs x19,x19,x9 + sbc x20,x20,xzr + + subs xzr,x14,#1 + umulh x9,x12,x24 + mul x10,x13,x24 + umulh x11,x13,x24 + + adcs x10,x10,x9 + mul x8,x4,x3 + adc x11,x11,xzr + mul x9,x5,x3 + + adds x14,x15,x10 + mul x10,x6,x3 + adcs x15,x16,x11 + mul x11,x7,x3 + adcs x16,x17,x24 + adcs x17,x19,x24 + adc x19,x20,xzr + + adds x14,x14,x8 // accumulate low parts + umulh x8,x4,x3 + adcs x15,x15,x9 + umulh x9,x5,x3 + adcs x16,x16,x10 + umulh x10,x6,x3 + adcs x17,x17,x11 + umulh x11,x7,x3 + adc x19,x19,xzr + mul x24,x14,x23 + adds x15,x15,x8 // accumulate high parts + adcs x16,x16,x9 + adcs x17,x17,x10 + adcs x19,x19,x11 + adc x20,xzr,xzr + ldr x3,[x2,#8*3] // b[i] + + lsl x8,x24,#32 + subs x16,x16,x24 + lsr x9,x24,#32 + sbcs x17,x17,x8 + sbcs x19,x19,x9 + sbc x20,x20,xzr + + subs xzr,x14,#1 + umulh x9,x12,x24 + mul x10,x13,x24 + umulh x11,x13,x24 + + adcs x10,x10,x9 + mul x8,x4,x3 + adc x11,x11,xzr + mul x9,x5,x3 + + adds x14,x15,x10 + mul x10,x6,x3 + adcs x15,x16,x11 + mul x11,x7,x3 + adcs x16,x17,x24 + adcs x17,x19,x24 + adc x19,x20,xzr + + adds x14,x14,x8 // accumulate low parts + umulh x8,x4,x3 + adcs x15,x15,x9 + umulh x9,x5,x3 + adcs x16,x16,x10 + umulh x10,x6,x3 + adcs x17,x17,x11 + umulh x11,x7,x3 + adc x19,x19,xzr + mul x24,x14,x23 + adds x15,x15,x8 // accumulate high parts + adcs x16,x16,x9 + adcs x17,x17,x10 + adcs x19,x19,x11 + adc x20,xzr,xzr + lsl x8,x24,#32 // last reduction + subs x16,x16,x24 + lsr x9,x24,#32 + sbcs x17,x17,x8 + sbcs x19,x19,x9 + sbc x20,x20,xzr + + subs xzr,x14,#1 + umulh x9,x12,x24 + mul x10,x13,x24 + umulh x11,x13,x24 + + adcs x10,x10,x9 + adc x11,x11,xzr + + adds x14,x15,x10 + adcs x15,x16,x11 + adcs x16,x17,x24 + adcs x17,x19,x24 + adc x19,x20,xzr + + subs x8,x14,x12 // ret -= modulus + sbcs x9,x15,x13 + sbcs x10,x16,x21 + sbcs x11,x17,x22 + sbcs xzr,x19,xzr + + csel x14,x14,x8,lo // ret = borrow ? 
ret : ret-modulus + csel x15,x15,x9,lo + csel x16,x16,x10,lo + stp x14,x15,[x0] + csel x17,x17,x11,lo + stp x16,x17,[x0,#16] + + ldp x19,x20,[sp,#16] + ldp x21,x22,[sp,#32] + ldp x23,x24,[sp,#48] + ldr x29,[sp],#64 + ret +.size ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont + +//////////////////////////////////////////////////////////////////////// +// void ecp_nistz256_ord_sqr_mont(uint64_t res[4], uint64_t a[4], +// uint64_t rep); +.globl ecp_nistz256_ord_sqr_mont +.hidden ecp_nistz256_ord_sqr_mont +.type ecp_nistz256_ord_sqr_mont,%function +.align 4 +ecp_nistz256_ord_sqr_mont: + AARCH64_VALID_CALL_TARGET + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + stp x29,x30,[sp,#-64]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + adrp x23,.Lord + add x23,x23,:lo12:.Lord + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + + ldp x12,x13,[x23,#0] + ldp x21,x22,[x23,#16] + ldr x23,[x23,#32] + b .Loop_ord_sqr + +.align 4 +.Loop_ord_sqr: + sub x2,x2,#1 + //////////////////////////////////////////////////////////////// + // | | | | | |a1*a0| | + // | | | | |a2*a0| | | + // | |a3*a2|a3*a0| | | | + // | | | |a2*a1| | | | + // | | |a3*a1| | | | | + // *| | | | | | | | 2| + // +|a3*a3|a2*a2|a1*a1|a0*a0| + // |--+--+--+--+--+--+--+--| + // |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is , i.e. follow + // + // "can't overflow" below mark carrying into high part of + // multiplication result, which can't overflow, because it + // can never be all ones. + + mul x15,x5,x4 // a[1]*a[0] + umulh x9,x5,x4 + mul x16,x6,x4 // a[2]*a[0] + umulh x10,x6,x4 + mul x17,x7,x4 // a[3]*a[0] + umulh x19,x7,x4 + + adds x16,x16,x9 // accumulate high parts of multiplication + mul x8,x6,x5 // a[2]*a[1] + umulh x9,x6,x5 + adcs x17,x17,x10 + mul x10,x7,x5 // a[3]*a[1] + umulh x11,x7,x5 + adc x19,x19,xzr // can't overflow + + mul x20,x7,x6 // a[3]*a[2] + umulh x1,x7,x6 + + adds x9,x9,x10 // accumulate high parts of multiplication + mul x14,x4,x4 // a[0]*a[0] + adc x10,x11,xzr // can't overflow + + adds x17,x17,x8 // accumulate low parts of multiplication + umulh x4,x4,x4 + adcs x19,x19,x9 + mul x9,x5,x5 // a[1]*a[1] + adcs x20,x20,x10 + umulh x5,x5,x5 + adc x1,x1,xzr // can't overflow + + adds x15,x15,x15 // acc[1-6]*=2 + mul x10,x6,x6 // a[2]*a[2] + adcs x16,x16,x16 + umulh x6,x6,x6 + adcs x17,x17,x17 + mul x11,x7,x7 // a[3]*a[3] + adcs x19,x19,x19 + umulh x7,x7,x7 + adcs x20,x20,x20 + adcs x1,x1,x1 + adc x3,xzr,xzr + + adds x15,x15,x4 // +a[i]*a[i] + mul x24,x14,x23 + adcs x16,x16,x9 + adcs x17,x17,x5 + adcs x19,x19,x10 + adcs x20,x20,x6 + adcs x1,x1,x11 + adc x3,x3,x7 + subs xzr,x14,#1 + umulh x9,x12,x24 + mul x10,x13,x24 + umulh x11,x13,x24 + + adcs x10,x10,x9 + adc x11,x11,xzr + + adds x14,x15,x10 + adcs x15,x16,x11 + adcs x16,x17,x24 + adc x17,xzr,x24 // can't overflow + mul x11,x14,x23 + lsl x8,x24,#32 + subs x15,x15,x24 + lsr x9,x24,#32 + sbcs x16,x16,x8 + sbc x17,x17,x9 // can't borrow + subs xzr,x14,#1 + umulh x9,x12,x11 + mul x10,x13,x11 + umulh x24,x13,x11 + + adcs x10,x10,x9 + adc x24,x24,xzr + + adds x14,x15,x10 + adcs x15,x16,x24 + adcs x16,x17,x11 + adc x17,xzr,x11 // can't overflow + mul x24,x14,x23 + lsl x8,x11,#32 + subs x15,x15,x11 + lsr x9,x11,#32 + sbcs x16,x16,x8 + sbc x17,x17,x9 // can't borrow + subs xzr,x14,#1 + umulh x9,x12,x24 + mul x10,x13,x24 + umulh x11,x13,x24 + + adcs x10,x10,x9 + adc x11,x11,xzr + + adds x14,x15,x10 + adcs x15,x16,x11 + adcs x16,x17,x24 + adc x17,xzr,x24 // can't overflow + mul x11,x14,x23 + lsl x8,x24,#32 + subs 
x15,x15,x24 + lsr x9,x24,#32 + sbcs x16,x16,x8 + sbc x17,x17,x9 // can't borrow + subs xzr,x14,#1 + umulh x9,x12,x11 + mul x10,x13,x11 + umulh x24,x13,x11 + + adcs x10,x10,x9 + adc x24,x24,xzr + + adds x14,x15,x10 + adcs x15,x16,x24 + adcs x16,x17,x11 + adc x17,xzr,x11 // can't overflow + lsl x8,x11,#32 + subs x15,x15,x11 + lsr x9,x11,#32 + sbcs x16,x16,x8 + sbc x17,x17,x9 // can't borrow + adds x14,x14,x19 // accumulate upper half + adcs x15,x15,x20 + adcs x16,x16,x1 + adcs x17,x17,x3 + adc x19,xzr,xzr + + subs x8,x14,x12 // ret -= modulus + sbcs x9,x15,x13 + sbcs x10,x16,x21 + sbcs x11,x17,x22 + sbcs xzr,x19,xzr + + csel x4,x14,x8,lo // ret = borrow ? ret : ret-modulus + csel x5,x15,x9,lo + csel x6,x16,x10,lo + csel x7,x17,x11,lo + + cbnz x2,.Loop_ord_sqr + + stp x4,x5,[x0] + stp x6,x7,[x0,#16] + + ldp x19,x20,[sp,#16] + ldp x21,x22,[sp,#32] + ldp x23,x24,[sp,#48] + ldr x29,[sp],#64 + ret +.size ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont +//////////////////////////////////////////////////////////////////////// +// void ecp_nistz256_select_w5(uint64_t *val, uint64_t *in_t, int index); +.globl ecp_nistz256_select_w5 +.hidden ecp_nistz256_select_w5 +.type ecp_nistz256_select_w5,%function +.align 4 +ecp_nistz256_select_w5: + AARCH64_VALID_CALL_TARGET + + // x10 := x0 + // w9 := 0; loop counter and incremented internal index + mov x10, x0 + mov w9, #0 + + // [v16-v21] := 0 + movi v16.16b, #0 + movi v17.16b, #0 + movi v18.16b, #0 + movi v19.16b, #0 + movi v20.16b, #0 + movi v21.16b, #0 + +.Lselect_w5_loop: + // Loop 16 times. + + // Increment index (loop counter); tested at the end of the loop + add w9, w9, #1 + + // [v22-v27] := Load a (3*256-bit = 6*128-bit) table entry starting at x1 + // and advance x1 to point to the next entry + ld1 {v22.2d, v23.2d, v24.2d, v25.2d}, [x1],#64 + + // x11 := (w9 == w2)? All 1s : All 0s + cmp w9, w2 + csetm x11, eq + + // continue loading ... + ld1 {v26.2d, v27.2d}, [x1],#32 + + // duplicate mask_64 into Mask (all 0s or all 1s) + dup v3.2d, x11 + + // [v16-v19] := (Mask == all 1s)? [v22-v25] : [v16-v19] + // i.e., values in output registers will remain the same if w9 != w2 + bit v16.16b, v22.16b, v3.16b + bit v17.16b, v23.16b, v3.16b + + bit v18.16b, v24.16b, v3.16b + bit v19.16b, v25.16b, v3.16b + + bit v20.16b, v26.16b, v3.16b + bit v21.16b, v27.16b, v3.16b + + // If bit #4 is not 0 (i.e. idx_ctr < 16) loop back + tbz w9, #4, .Lselect_w5_loop + + // Write [v16-v21] to memory at the output pointer + st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x10],#64 + st1 {v20.2d, v21.2d}, [x10] + + ret +.size ecp_nistz256_select_w5,.-ecp_nistz256_select_w5 + + +//////////////////////////////////////////////////////////////////////// +// void ecp_nistz256_select_w7(uint64_t *val, uint64_t *in_t, int index); +.globl ecp_nistz256_select_w7 +.hidden ecp_nistz256_select_w7 +.type ecp_nistz256_select_w7,%function +.align 4 +ecp_nistz256_select_w7: + AARCH64_VALID_CALL_TARGET + + // w9 := 0; loop counter and incremented internal index + mov w9, #0 + + // [v16-v21] := 0 + movi v16.16b, #0 + movi v17.16b, #0 + movi v18.16b, #0 + movi v19.16b, #0 + +.Lselect_w7_loop: + // Loop 64 times. + + // Increment index (loop counter); tested at the end of the loop + add w9, w9, #1 + + // [v22-v25] := Load a (2*256-bit = 4*128-bit) table entry starting at x1 + // and advance x1 to point to the next entry + ld1 {v22.2d, v23.2d, v24.2d, v25.2d}, [x1],#64 + + // x11 := (w9 == w2)? 
All 1s : All 0s + cmp w9, w2 + csetm x11, eq + + // duplicate mask_64 into Mask (all 0s or all 1s) + dup v3.2d, x11 + + // [v16-v19] := (Mask == all 1s)? [v22-v25] : [v16-v19] + // i.e., values in output registers will remain the same if w9 != w2 + bit v16.16b, v22.16b, v3.16b + bit v17.16b, v23.16b, v3.16b + + bit v18.16b, v24.16b, v3.16b + bit v19.16b, v25.16b, v3.16b + + // If bit #6 is not 0 (i.e. idx_ctr < 64) loop back + tbz w9, #6, .Lselect_w7_loop + + // Write [v16-v19] to memory at the output pointer + st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x0] + + ret +.size ecp_nistz256_select_w7,.-ecp_nistz256_select_w7 +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__) diff --git a/ring-0.17.14/pregenerated/p256-armv8-asm-win64.S b/ring-0.17.14/pregenerated/p256-armv8-asm-win64.S new file mode 100644 index 0000000000..efc4581039 --- /dev/null +++ b/ring-0.17.14/pregenerated/p256-armv8-asm-win64.S @@ -0,0 +1,1640 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32) +.section .rodata +.align 5 +Lpoly: +.quad 0xffffffffffffffff,0x00000000ffffffff,0x0000000000000000,0xffffffff00000001 +LRR: // 2^512 mod P precomputed for NIST P256 polynomial +.quad 0x0000000000000003,0xfffffffbffffffff,0xfffffffffffffffe,0x00000004fffffffd +Lone_mont: +.quad 0x0000000000000001,0xffffffff00000000,0xffffffffffffffff,0x00000000fffffffe +Lone: +.quad 1,0,0,0 +Lord: +.quad 0xf3b9cac2fc632551,0xbce6faada7179e84,0xffffffffffffffff,0xffffffff00000000 +LordK: +.quad 0xccd1c8aaee00bc4f +.byte 69,67,80,95,78,73,83,84,90,50,53,54,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.text + +// void ecp_nistz256_mul_mont(BN_ULONG x0[4],const BN_ULONG x1[4], +// const BN_ULONG x2[4]); +.globl ecp_nistz256_mul_mont + +.def ecp_nistz256_mul_mont + .type 32 +.endef +.align 4 +ecp_nistz256_mul_mont: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-32]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + + ldr x3,[x2] // bp[0] + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + adrp x13,Lpoly + add x13,x13,:lo12:Lpoly + ldr x12,[x13,#8] + ldr x13,[x13,#24] + + bl __ecp_nistz256_mul_mont + + ldp x19,x20,[sp,#16] + ldp x29,x30,[sp],#32 + AARCH64_VALIDATE_LINK_REGISTER + ret + + +// void ecp_nistz256_sqr_mont(BN_ULONG x0[4],const BN_ULONG x1[4]); +.globl ecp_nistz256_sqr_mont + +.def ecp_nistz256_sqr_mont + .type 32 +.endef +.align 4 +ecp_nistz256_sqr_mont: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-32]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + adrp x13,Lpoly + add x13,x13,:lo12:Lpoly + ldr x12,[x13,#8] + ldr x13,[x13,#24] + + bl __ecp_nistz256_sqr_mont + + ldp x19,x20,[sp,#16] + ldp x29,x30,[sp],#32 + AARCH64_VALIDATE_LINK_REGISTER + ret + + +// void ecp_nistz256_neg(BN_ULONG x0[4],const BN_ULONG x1[4]); +.globl ecp_nistz256_neg + +.def ecp_nistz256_neg + .type 32 +.endef +.align 4 +ecp_nistz256_neg: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 + + mov x2,x1 + mov x14,xzr // a = 0 + mov x15,xzr + mov x16,xzr + mov x17,xzr + adrp x13,Lpoly + add x13,x13,:lo12:Lpoly + ldr x12,[x13,#8] + ldr x13,[x13,#24] + + bl __ecp_nistz256_sub_from + + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret + + +// note that __ecp_nistz256_mul_mont expects a[0-3] input pre-loaded +// to x4-x7 and b[0] - to x3 +.def __ecp_nistz256_mul_mont + .type 32 +.endef +.align 4 +__ecp_nistz256_mul_mont: + mul x14,x4,x3 // a[0]*b[0] + umulh x8,x4,x3 + + mul x15,x5,x3 // a[1]*b[0] + umulh x9,x5,x3 + + mul x16,x6,x3 // a[2]*b[0] + umulh x10,x6,x3 + + mul x17,x7,x3 // a[3]*b[0] + umulh x11,x7,x3 + ldr x3,[x2,#8] // b[1] + + adds x15,x15,x8 // accumulate high parts of multiplication + lsl x8,x14,#32 + adcs x16,x16,x9 + lsr x9,x14,#32 + adcs x17,x17,x10 + adc x19,xzr,x11 + mov x20,xzr + subs x10,x14,x8 // "*0xffff0001" + sbc x11,x14,x9 + adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] + mul x8,x4,x3 // lo(a[0]*b[i]) + adcs x15,x16,x9 + mul x9,x5,x3 // lo(a[1]*b[i]) + adcs x16,x17,x10 // +=acc[0]*0xffff0001 + mul x10,x6,x3 // lo(a[2]*b[i]) + adcs x17,x19,x11 + mul x11,x7,x3 // lo(a[3]*b[i]) + adc x19,x20,xzr + + adds x14,x14,x8 // accumulate low parts of multiplication + umulh x8,x4,x3 // hi(a[0]*b[i]) + adcs x15,x15,x9 + umulh x9,x5,x3 // hi(a[1]*b[i]) + adcs x16,x16,x10 + umulh x10,x6,x3 // hi(a[2]*b[i]) + adcs x17,x17,x11 + umulh x11,x7,x3 // hi(a[3]*b[i]) + adc x19,x19,xzr + ldr x3,[x2,#8*(1+1)] // b[1+1] + adds x15,x15,x8 // accumulate high parts of multiplication + lsl x8,x14,#32 + adcs x16,x16,x9 + lsr x9,x14,#32 + adcs x17,x17,x10 + adcs x19,x19,x11 + adc x20,xzr,xzr + subs x10,x14,x8 // "*0xffff0001" + sbc x11,x14,x9 + adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] + mul x8,x4,x3 // lo(a[0]*b[i]) + adcs x15,x16,x9 + mul x9,x5,x3 // lo(a[1]*b[i]) + adcs x16,x17,x10 // +=acc[0]*0xffff0001 + mul x10,x6,x3 // lo(a[2]*b[i]) + adcs x17,x19,x11 + mul x11,x7,x3 // lo(a[3]*b[i]) + adc x19,x20,xzr + + adds x14,x14,x8 // accumulate low parts of multiplication + umulh x8,x4,x3 // hi(a[0]*b[i]) + adcs x15,x15,x9 + umulh x9,x5,x3 // hi(a[1]*b[i]) + adcs x16,x16,x10 + umulh x10,x6,x3 // hi(a[2]*b[i]) + adcs x17,x17,x11 + umulh x11,x7,x3 // hi(a[3]*b[i]) + adc x19,x19,xzr + ldr x3,[x2,#8*(2+1)] // b[2+1] + adds x15,x15,x8 // accumulate high parts of multiplication + lsl x8,x14,#32 + adcs x16,x16,x9 + lsr x9,x14,#32 + adcs x17,x17,x10 + adcs x19,x19,x11 + adc x20,xzr,xzr + subs x10,x14,x8 // "*0xffff0001" + sbc x11,x14,x9 + adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] + mul x8,x4,x3 // lo(a[0]*b[i]) + adcs x15,x16,x9 + mul x9,x5,x3 // lo(a[1]*b[i]) + adcs x16,x17,x10 // +=acc[0]*0xffff0001 + mul x10,x6,x3 // lo(a[2]*b[i]) + adcs x17,x19,x11 + mul x11,x7,x3 // lo(a[3]*b[i]) + adc x19,x20,xzr + + adds x14,x14,x8 // accumulate low parts of multiplication + umulh x8,x4,x3 // hi(a[0]*b[i]) + adcs x15,x15,x9 + umulh x9,x5,x3 // hi(a[1]*b[i]) + adcs x16,x16,x10 + umulh x10,x6,x3 // hi(a[2]*b[i]) + adcs x17,x17,x11 + umulh x11,x7,x3 // hi(a[3]*b[i]) + adc x19,x19,xzr + adds x15,x15,x8 // accumulate high parts of multiplication + lsl x8,x14,#32 + adcs x16,x16,x9 + lsr x9,x14,#32 + adcs x17,x17,x10 + adcs x19,x19,x11 + adc x20,xzr,xzr + // last reduction + subs x10,x14,x8 // "*0xffff0001" + sbc x11,x14,x9 + adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] + adcs x15,x16,x9 + adcs x16,x17,x10 // +=acc[0]*0xffff0001 + adcs x17,x19,x11 + adc x19,x20,xzr + + adds x8,x14,#1 // subs x8,x14,#-1 // tmp = ret-modulus + sbcs x9,x15,x12 + sbcs x10,x16,xzr + sbcs 
x11,x17,x13 + sbcs xzr,x19,xzr // did it borrow? + + csel x14,x14,x8,lo // ret = borrow ? ret : ret-modulus + csel x15,x15,x9,lo + csel x16,x16,x10,lo + stp x14,x15,[x0] + csel x17,x17,x11,lo + stp x16,x17,[x0,#16] + + ret + + +// note that __ecp_nistz256_sqr_mont expects a[0-3] input pre-loaded +// to x4-x7 +.def __ecp_nistz256_sqr_mont + .type 32 +.endef +.align 4 +__ecp_nistz256_sqr_mont: + // | | | | | |a1*a0| | + // | | | | |a2*a0| | | + // | |a3*a2|a3*a0| | | | + // | | | |a2*a1| | | | + // | | |a3*a1| | | | | + // *| | | | | | | | 2| + // +|a3*a3|a2*a2|a1*a1|a0*a0| + // |--+--+--+--+--+--+--+--| + // |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is , i.e. follow + // + // "can't overflow" below mark carrying into high part of + // multiplication result, which can't overflow, because it + // can never be all ones. + + mul x15,x5,x4 // a[1]*a[0] + umulh x9,x5,x4 + mul x16,x6,x4 // a[2]*a[0] + umulh x10,x6,x4 + mul x17,x7,x4 // a[3]*a[0] + umulh x19,x7,x4 + + adds x16,x16,x9 // accumulate high parts of multiplication + mul x8,x6,x5 // a[2]*a[1] + umulh x9,x6,x5 + adcs x17,x17,x10 + mul x10,x7,x5 // a[3]*a[1] + umulh x11,x7,x5 + adc x19,x19,xzr // can't overflow + + mul x20,x7,x6 // a[3]*a[2] + umulh x1,x7,x6 + + adds x9,x9,x10 // accumulate high parts of multiplication + mul x14,x4,x4 // a[0]*a[0] + adc x10,x11,xzr // can't overflow + + adds x17,x17,x8 // accumulate low parts of multiplication + umulh x4,x4,x4 + adcs x19,x19,x9 + mul x9,x5,x5 // a[1]*a[1] + adcs x20,x20,x10 + umulh x5,x5,x5 + adc x1,x1,xzr // can't overflow + + adds x15,x15,x15 // acc[1-6]*=2 + mul x10,x6,x6 // a[2]*a[2] + adcs x16,x16,x16 + umulh x6,x6,x6 + adcs x17,x17,x17 + mul x11,x7,x7 // a[3]*a[3] + adcs x19,x19,x19 + umulh x7,x7,x7 + adcs x20,x20,x20 + adcs x1,x1,x1 + adc x2,xzr,xzr + + adds x15,x15,x4 // +a[i]*a[i] + adcs x16,x16,x9 + adcs x17,x17,x5 + adcs x19,x19,x10 + adcs x20,x20,x6 + lsl x8,x14,#32 + adcs x1,x1,x11 + lsr x9,x14,#32 + adc x2,x2,x7 + subs x10,x14,x8 // "*0xffff0001" + sbc x11,x14,x9 + adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] + adcs x15,x16,x9 + lsl x8,x14,#32 + adcs x16,x17,x10 // +=acc[0]*0xffff0001 + lsr x9,x14,#32 + adc x17,x11,xzr // can't overflow + subs x10,x14,x8 // "*0xffff0001" + sbc x11,x14,x9 + adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] + adcs x15,x16,x9 + lsl x8,x14,#32 + adcs x16,x17,x10 // +=acc[0]*0xffff0001 + lsr x9,x14,#32 + adc x17,x11,xzr // can't overflow + subs x10,x14,x8 // "*0xffff0001" + sbc x11,x14,x9 + adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] + adcs x15,x16,x9 + lsl x8,x14,#32 + adcs x16,x17,x10 // +=acc[0]*0xffff0001 + lsr x9,x14,#32 + adc x17,x11,xzr // can't overflow + subs x10,x14,x8 // "*0xffff0001" + sbc x11,x14,x9 + adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] + adcs x15,x16,x9 + adcs x16,x17,x10 // +=acc[0]*0xffff0001 + adc x17,x11,xzr // can't overflow + + adds x14,x14,x19 // accumulate upper half + adcs x15,x15,x20 + adcs x16,x16,x1 + adcs x17,x17,x2 + adc x19,xzr,xzr + + adds x8,x14,#1 // subs x8,x14,#-1 // tmp = ret-modulus + sbcs x9,x15,x12 + sbcs x10,x16,xzr + sbcs x11,x17,x13 + sbcs xzr,x19,xzr // did it borrow? + + csel x14,x14,x8,lo // ret = borrow ? ret : ret-modulus + csel x15,x15,x9,lo + csel x16,x16,x10,lo + stp x14,x15,[x0] + csel x17,x17,x11,lo + stp x16,x17,[x0,#16] + + ret + + +// Note that __ecp_nistz256_add_to expects both input vectors pre-loaded to +// x4-x7 and x8-x11. This is done because it's used in multiple +// contexts, e.g. in multiplication by 2 and 3... 
+.def __ecp_nistz256_add_to + .type 32 +.endef +.align 4 +__ecp_nistz256_add_to: + adds x14,x14,x8 // ret = a+b + adcs x15,x15,x9 + adcs x16,x16,x10 + adcs x17,x17,x11 + adc x1,xzr,xzr // zap x1 + + adds x8,x14,#1 // subs x8,x4,#-1 // tmp = ret-modulus + sbcs x9,x15,x12 + sbcs x10,x16,xzr + sbcs x11,x17,x13 + sbcs xzr,x1,xzr // did subtraction borrow? + + csel x14,x14,x8,lo // ret = borrow ? ret : ret-modulus + csel x15,x15,x9,lo + csel x16,x16,x10,lo + stp x14,x15,[x0] + csel x17,x17,x11,lo + stp x16,x17,[x0,#16] + + ret + + +.def __ecp_nistz256_sub_from + .type 32 +.endef +.align 4 +__ecp_nistz256_sub_from: + ldp x8,x9,[x2] + ldp x10,x11,[x2,#16] + subs x14,x14,x8 // ret = a-b + sbcs x15,x15,x9 + sbcs x16,x16,x10 + sbcs x17,x17,x11 + sbc x1,xzr,xzr // zap x1 + + subs x8,x14,#1 // adds x8,x4,#-1 // tmp = ret+modulus + adcs x9,x15,x12 + adcs x10,x16,xzr + adc x11,x17,x13 + cmp x1,xzr // did subtraction borrow? + + csel x14,x14,x8,eq // ret = borrow ? ret+modulus : ret + csel x15,x15,x9,eq + csel x16,x16,x10,eq + stp x14,x15,[x0] + csel x17,x17,x11,eq + stp x16,x17,[x0,#16] + + ret + + +.def __ecp_nistz256_sub_morf + .type 32 +.endef +.align 4 +__ecp_nistz256_sub_morf: + ldp x8,x9,[x2] + ldp x10,x11,[x2,#16] + subs x14,x8,x14 // ret = b-a + sbcs x15,x9,x15 + sbcs x16,x10,x16 + sbcs x17,x11,x17 + sbc x1,xzr,xzr // zap x1 + + subs x8,x14,#1 // adds x8,x4,#-1 // tmp = ret+modulus + adcs x9,x15,x12 + adcs x10,x16,xzr + adc x11,x17,x13 + cmp x1,xzr // did subtraction borrow? + + csel x14,x14,x8,eq // ret = borrow ? ret+modulus : ret + csel x15,x15,x9,eq + csel x16,x16,x10,eq + stp x14,x15,[x0] + csel x17,x17,x11,eq + stp x16,x17,[x0,#16] + + ret + + +.def __ecp_nistz256_div_by_2 + .type 32 +.endef +.align 4 +__ecp_nistz256_div_by_2: + subs x8,x14,#1 // adds x8,x4,#-1 // tmp = a+modulus + adcs x9,x15,x12 + adcs x10,x16,xzr + adcs x11,x17,x13 + adc x1,xzr,xzr // zap x1 + tst x14,#1 // is a even? + + csel x14,x14,x8,eq // ret = even ? a : a+modulus + csel x15,x15,x9,eq + csel x16,x16,x10,eq + csel x17,x17,x11,eq + csel x1,xzr,x1,eq + + lsr x14,x14,#1 // ret >>= 1 + orr x14,x14,x15,lsl#63 + lsr x15,x15,#1 + orr x15,x15,x16,lsl#63 + lsr x16,x16,#1 + orr x16,x16,x17,lsl#63 + lsr x17,x17,#1 + stp x14,x15,[x0] + orr x17,x17,x1,lsl#63 + stp x16,x17,[x0,#16] + + ret + +.globl ecp_nistz256_point_double + +.def ecp_nistz256_point_double + .type 32 +.endef +.align 5 +ecp_nistz256_point_double: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-96]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + sub sp,sp,#32*4 + +Ldouble_shortcut: + ldp x14,x15,[x1,#32] + mov x21,x0 + ldp x16,x17,[x1,#48] + mov x22,x1 + adrp x13,Lpoly + add x13,x13,:lo12:Lpoly + ldr x12,[x13,#8] + mov x8,x14 + ldr x13,[x13,#24] + mov x9,x15 + ldp x4,x5,[x22,#64] // forward load for p256_sqr_mont + mov x10,x16 + mov x11,x17 + ldp x6,x7,[x22,#64+16] + add x0,sp,#0 + bl __ecp_nistz256_add_to // p256_mul_by_2(S, in_y); + + add x0,sp,#64 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Zsqr, in_z); + + ldp x8,x9,[x22] + ldp x10,x11,[x22,#16] + mov x4,x14 // put Zsqr aside for p256_sub + mov x5,x15 + mov x6,x16 + mov x7,x17 + add x0,sp,#32 + bl __ecp_nistz256_add_to // p256_add(M, Zsqr, in_x); + + add x2,x22,#0 + mov x14,x4 // restore Zsqr + mov x15,x5 + ldp x4,x5,[sp,#0] // forward load for p256_sqr_mont + mov x16,x6 + mov x17,x7 + ldp x6,x7,[sp,#0+16] + add x0,sp,#64 + bl __ecp_nistz256_sub_morf // p256_sub(Zsqr, in_x, Zsqr); + + add x0,sp,#0 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(S, S); + + ldr x3,[x22,#32] + ldp x4,x5,[x22,#64] + ldp x6,x7,[x22,#64+16] + add x2,x22,#32 + add x0,sp,#96 + bl __ecp_nistz256_mul_mont // p256_mul_mont(tmp0, in_z, in_y); + + mov x8,x14 + mov x9,x15 + ldp x4,x5,[sp,#0] // forward load for p256_sqr_mont + mov x10,x16 + mov x11,x17 + ldp x6,x7,[sp,#0+16] + add x0,x21,#64 + bl __ecp_nistz256_add_to // p256_mul_by_2(res_z, tmp0); + + add x0,sp,#96 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(tmp0, S); + + ldr x3,[sp,#64] // forward load for p256_mul_mont + ldp x4,x5,[sp,#32] + ldp x6,x7,[sp,#32+16] + add x0,x21,#32 + bl __ecp_nistz256_div_by_2 // p256_div_by_2(res_y, tmp0); + + add x2,sp,#64 + add x0,sp,#32 + bl __ecp_nistz256_mul_mont // p256_mul_mont(M, M, Zsqr); + + mov x8,x14 // duplicate M + mov x9,x15 + mov x10,x16 + mov x11,x17 + mov x4,x14 // put M aside + mov x5,x15 + mov x6,x16 + mov x7,x17 + add x0,sp,#32 + bl __ecp_nistz256_add_to + mov x8,x4 // restore M + mov x9,x5 + ldr x3,[x22] // forward load for p256_mul_mont + mov x10,x6 + ldp x4,x5,[sp,#0] + mov x11,x7 + ldp x6,x7,[sp,#0+16] + bl __ecp_nistz256_add_to // p256_mul_by_3(M, M); + + add x2,x22,#0 + add x0,sp,#0 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S, S, in_x); + + mov x8,x14 + mov x9,x15 + ldp x4,x5,[sp,#32] // forward load for p256_sqr_mont + mov x10,x16 + mov x11,x17 + ldp x6,x7,[sp,#32+16] + add x0,sp,#96 + bl __ecp_nistz256_add_to // p256_mul_by_2(tmp0, S); + + add x0,x21,#0 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(res_x, M); + + add x2,sp,#96 + bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, tmp0); + + add x2,sp,#0 + add x0,sp,#0 + bl __ecp_nistz256_sub_morf // p256_sub(S, S, res_x); + + ldr x3,[sp,#32] + mov x4,x14 // copy S + mov x5,x15 + mov x6,x16 + mov x7,x17 + add x2,sp,#32 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S, S, M); + + add x2,x21,#32 + add x0,x21,#32 + bl __ecp_nistz256_sub_from // p256_sub(res_y, S, res_y); + + add sp,x29,#0 // destroy frame + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x29,x30,[sp],#96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +.globl ecp_nistz256_point_add + +.def ecp_nistz256_point_add + .type 32 +.endef +.align 5 +ecp_nistz256_point_add: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-96]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#32*12 + + ldp x4,x5,[x2,#64] // in2_z + ldp x6,x7,[x2,#64+16] + mov x21,x0 + mov x22,x1 + mov x23,x2 + adrp x13,Lpoly + add x13,x13,:lo12:Lpoly + ldr x12,[x13,#8] + ldr x13,[x13,#24] + orr x8,x4,x5 + orr x10,x6,x7 + orr x25,x8,x10 + cmp x25,#0 + csetm x25,ne // ~in2infty + add x0,sp,#192 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z2sqr, in2_z); + + ldp x4,x5,[x22,#64] // in1_z + ldp x6,x7,[x22,#64+16] + orr x8,x4,x5 + orr x10,x6,x7 + orr x24,x8,x10 + cmp x24,#0 + csetm x24,ne // ~in1infty + add x0,sp,#128 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z1sqr, in1_z); + + ldr x3,[x23,#64] + ldp x4,x5,[sp,#192] + ldp x6,x7,[sp,#192+16] + add x2,x23,#64 + add x0,sp,#320 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S1, Z2sqr, in2_z); + + ldr x3,[x22,#64] + ldp x4,x5,[sp,#128] + ldp x6,x7,[sp,#128+16] + add x2,x22,#64 + add x0,sp,#352 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, Z1sqr, in1_z); + + ldr x3,[x22,#32] + ldp x4,x5,[sp,#320] + ldp x6,x7,[sp,#320+16] + add x2,x22,#32 + add x0,sp,#320 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S1, S1, in1_y); + + ldr x3,[x23,#32] + ldp x4,x5,[sp,#352] + ldp x6,x7,[sp,#352+16] + add x2,x23,#32 + add x0,sp,#352 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S2, in2_y); + + add x2,sp,#320 + ldr x3,[sp,#192] // forward load for p256_mul_mont + ldp x4,x5,[x22] + ldp x6,x7,[x22,#16] + add x0,sp,#160 + bl __ecp_nistz256_sub_from // p256_sub(R, S2, S1); + + orr x14,x14,x15 // see if result is zero + orr x16,x16,x17 + orr x26,x14,x16 // ~is_equal(S1,S2) + + add x2,sp,#192 + add x0,sp,#256 + bl __ecp_nistz256_mul_mont // p256_mul_mont(U1, in1_x, Z2sqr); + + ldr x3,[sp,#128] + ldp x4,x5,[x23] + ldp x6,x7,[x23,#16] + add x2,sp,#128 + add x0,sp,#288 + bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, in2_x, Z1sqr); + + add x2,sp,#256 + ldp x4,x5,[sp,#160] // forward load for p256_sqr_mont + ldp x6,x7,[sp,#160+16] + add x0,sp,#96 + bl __ecp_nistz256_sub_from // p256_sub(H, U2, U1); + + orr x14,x14,x15 // see if result is zero + orr x16,x16,x17 + orr x14,x14,x16 // ~is_equal(U1,U2) + + mvn x27,x24 // -1/0 -> 0/-1 + mvn x28,x25 // -1/0 -> 0/-1 + orr x14,x14,x27 + orr x14,x14,x28 + orr x14,x14,x26 + cbnz x14,Ladd_proceed // if(~is_equal(U1,U2) | in1infty | in2infty | ~is_equal(S1,S2)) + +Ladd_double: + mov x1,x22 + mov x0,x21 + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + add sp,sp,#256 // #256 is from #32*(12-4). 
difference in stack frames + b Ldouble_shortcut + +.align 4 +Ladd_proceed: + add x0,sp,#192 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Rsqr, R); + + ldr x3,[x22,#64] + ldp x4,x5,[sp,#96] + ldp x6,x7,[sp,#96+16] + add x2,x22,#64 + add x0,sp,#64 + bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, H, in1_z); + + ldp x4,x5,[sp,#96] + ldp x6,x7,[sp,#96+16] + add x0,sp,#128 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Hsqr, H); + + ldr x3,[x23,#64] + ldp x4,x5,[sp,#64] + ldp x6,x7,[sp,#64+16] + add x2,x23,#64 + add x0,sp,#64 + bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, res_z, in2_z); + + ldr x3,[sp,#96] + ldp x4,x5,[sp,#128] + ldp x6,x7,[sp,#128+16] + add x2,sp,#96 + add x0,sp,#224 + bl __ecp_nistz256_mul_mont // p256_mul_mont(Hcub, Hsqr, H); + + ldr x3,[sp,#128] + ldp x4,x5,[sp,#256] + ldp x6,x7,[sp,#256+16] + add x2,sp,#128 + add x0,sp,#288 + bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, U1, Hsqr); + + mov x8,x14 + mov x9,x15 + mov x10,x16 + mov x11,x17 + add x0,sp,#128 + bl __ecp_nistz256_add_to // p256_mul_by_2(Hsqr, U2); + + add x2,sp,#192 + add x0,sp,#0 + bl __ecp_nistz256_sub_morf // p256_sub(res_x, Rsqr, Hsqr); + + add x2,sp,#224 + bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, Hcub); + + add x2,sp,#288 + ldr x3,[sp,#224] // forward load for p256_mul_mont + ldp x4,x5,[sp,#320] + ldp x6,x7,[sp,#320+16] + add x0,sp,#32 + bl __ecp_nistz256_sub_morf // p256_sub(res_y, U2, res_x); + + add x2,sp,#224 + add x0,sp,#352 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S1, Hcub); + + ldr x3,[sp,#160] + ldp x4,x5,[sp,#32] + ldp x6,x7,[sp,#32+16] + add x2,sp,#160 + add x0,sp,#32 + bl __ecp_nistz256_mul_mont // p256_mul_mont(res_y, res_y, R); + + add x2,sp,#352 + bl __ecp_nistz256_sub_from // p256_sub(res_y, res_y, S2); + + ldp x4,x5,[sp,#0] // res + ldp x6,x7,[sp,#0+16] + ldp x8,x9,[x23] // in2 + ldp x10,x11,[x23,#16] + ldp x14,x15,[x22,#0] // in1 + cmp x24,#0 // ~, remember? + ldp x16,x17,[x22,#0+16] + csel x8,x4,x8,ne + csel x9,x5,x9,ne + ldp x4,x5,[sp,#0+0+32] // res + csel x10,x6,x10,ne + csel x11,x7,x11,ne + cmp x25,#0 // ~, remember? + ldp x6,x7,[sp,#0+0+48] + csel x14,x8,x14,ne + csel x15,x9,x15,ne + ldp x8,x9,[x23,#0+32] // in2 + csel x16,x10,x16,ne + csel x17,x11,x17,ne + ldp x10,x11,[x23,#0+48] + stp x14,x15,[x21,#0] + stp x16,x17,[x21,#0+16] + ldp x14,x15,[x22,#32] // in1 + cmp x24,#0 // ~, remember? + ldp x16,x17,[x22,#32+16] + csel x8,x4,x8,ne + csel x9,x5,x9,ne + ldp x4,x5,[sp,#0+32+32] // res + csel x10,x6,x10,ne + csel x11,x7,x11,ne + cmp x25,#0 // ~, remember? + ldp x6,x7,[sp,#0+32+48] + csel x14,x8,x14,ne + csel x15,x9,x15,ne + ldp x8,x9,[x23,#32+32] // in2 + csel x16,x10,x16,ne + csel x17,x11,x17,ne + ldp x10,x11,[x23,#32+48] + stp x14,x15,[x21,#32] + stp x16,x17,[x21,#32+16] + ldp x14,x15,[x22,#64] // in1 + cmp x24,#0 // ~, remember? + ldp x16,x17,[x22,#64+16] + csel x8,x4,x8,ne + csel x9,x5,x9,ne + csel x10,x6,x10,ne + csel x11,x7,x11,ne + cmp x25,#0 // ~, remember? + csel x14,x8,x14,ne + csel x15,x9,x15,ne + csel x16,x10,x16,ne + csel x17,x11,x17,ne + stp x14,x15,[x21,#64] + stp x16,x17,[x21,#64+16] + +Ladd_done: + add sp,x29,#0 // destroy frame + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +.globl ecp_nistz256_point_add_affine + +.def ecp_nistz256_point_add_affine + .type 32 +.endef +.align 5 +ecp_nistz256_point_add_affine: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-80]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + sub sp,sp,#32*10 + + mov x21,x0 + mov x22,x1 + mov x23,x2 + adrp x13,Lpoly + add x13,x13,:lo12:Lpoly + ldr x12,[x13,#8] + ldr x13,[x13,#24] + + ldp x4,x5,[x1,#64] // in1_z + ldp x6,x7,[x1,#64+16] + orr x8,x4,x5 + orr x10,x6,x7 + orr x24,x8,x10 + cmp x24,#0 + csetm x24,ne // ~in1infty + + ldp x14,x15,[x2] // in2_x + ldp x16,x17,[x2,#16] + ldp x8,x9,[x2,#32] // in2_y + ldp x10,x11,[x2,#48] + orr x14,x14,x15 + orr x16,x16,x17 + orr x8,x8,x9 + orr x10,x10,x11 + orr x14,x14,x16 + orr x8,x8,x10 + orr x25,x14,x8 + cmp x25,#0 + csetm x25,ne // ~in2infty + + add x0,sp,#128 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z1sqr, in1_z); + + mov x4,x14 + mov x5,x15 + mov x6,x16 + mov x7,x17 + ldr x3,[x23] + add x2,x23,#0 + add x0,sp,#96 + bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, Z1sqr, in2_x); + + add x2,x22,#0 + ldr x3,[x22,#64] // forward load for p256_mul_mont + ldp x4,x5,[sp,#128] + ldp x6,x7,[sp,#128+16] + add x0,sp,#160 + bl __ecp_nistz256_sub_from // p256_sub(H, U2, in1_x); + + add x2,x22,#64 + add x0,sp,#128 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, Z1sqr, in1_z); + + ldr x3,[x22,#64] + ldp x4,x5,[sp,#160] + ldp x6,x7,[sp,#160+16] + add x2,x22,#64 + add x0,sp,#64 + bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, H, in1_z); + + ldr x3,[x23,#32] + ldp x4,x5,[sp,#128] + ldp x6,x7,[sp,#128+16] + add x2,x23,#32 + add x0,sp,#128 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S2, in2_y); + + add x2,x22,#32 + ldp x4,x5,[sp,#160] // forward load for p256_sqr_mont + ldp x6,x7,[sp,#160+16] + add x0,sp,#192 + bl __ecp_nistz256_sub_from // p256_sub(R, S2, in1_y); + + add x0,sp,#224 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Hsqr, H); + + ldp x4,x5,[sp,#192] + ldp x6,x7,[sp,#192+16] + add x0,sp,#288 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Rsqr, R); + + ldr x3,[sp,#160] + ldp x4,x5,[sp,#224] + ldp x6,x7,[sp,#224+16] + add x2,sp,#160 + add x0,sp,#256 + bl __ecp_nistz256_mul_mont // p256_mul_mont(Hcub, Hsqr, H); + + ldr x3,[x22] + ldp x4,x5,[sp,#224] + ldp x6,x7,[sp,#224+16] + add x2,x22,#0 + add x0,sp,#96 + bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, in1_x, Hsqr); + + mov x8,x14 + mov x9,x15 + mov x10,x16 + mov x11,x17 + add x0,sp,#224 + bl __ecp_nistz256_add_to // p256_mul_by_2(Hsqr, U2); + + add x2,sp,#288 + add x0,sp,#0 + bl __ecp_nistz256_sub_morf // p256_sub(res_x, Rsqr, Hsqr); + + add x2,sp,#256 + bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, Hcub); + + add x2,sp,#96 + ldr x3,[x22,#32] // forward load for p256_mul_mont + ldp x4,x5,[sp,#256] + ldp x6,x7,[sp,#256+16] + add x0,sp,#32 + bl __ecp_nistz256_sub_morf // p256_sub(res_y, U2, res_x); + + add x2,x22,#32 + add x0,sp,#128 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, in1_y, Hcub); + + ldr x3,[sp,#192] + ldp x4,x5,[sp,#32] + ldp x6,x7,[sp,#32+16] + add x2,sp,#192 + add x0,sp,#32 + bl __ecp_nistz256_mul_mont // p256_mul_mont(res_y, res_y, R); + + add x2,sp,#128 + bl __ecp_nistz256_sub_from // p256_sub(res_y, res_y, S2); + + ldp x4,x5,[sp,#0] // res + ldp x6,x7,[sp,#0+16] + ldp x8,x9,[x23] // in2 + ldp x10,x11,[x23,#16] + ldp x14,x15,[x22,#0] // in1 + cmp x24,#0 // ~, remember? + ldp x16,x17,[x22,#0+16] + csel x8,x4,x8,ne + csel x9,x5,x9,ne + ldp x4,x5,[sp,#0+0+32] // res + csel x10,x6,x10,ne + csel x11,x7,x11,ne + cmp x25,#0 // ~, remember? 
+ ldp x6,x7,[sp,#0+0+48] + csel x14,x8,x14,ne + csel x15,x9,x15,ne + ldp x8,x9,[x23,#0+32] // in2 + csel x16,x10,x16,ne + csel x17,x11,x17,ne + ldp x10,x11,[x23,#0+48] + stp x14,x15,[x21,#0] + stp x16,x17,[x21,#0+16] + adrp x23,Lone_mont-64 + add x23,x23,:lo12:Lone_mont-64 + ldp x14,x15,[x22,#32] // in1 + cmp x24,#0 // ~, remember? + ldp x16,x17,[x22,#32+16] + csel x8,x4,x8,ne + csel x9,x5,x9,ne + ldp x4,x5,[sp,#0+32+32] // res + csel x10,x6,x10,ne + csel x11,x7,x11,ne + cmp x25,#0 // ~, remember? + ldp x6,x7,[sp,#0+32+48] + csel x14,x8,x14,ne + csel x15,x9,x15,ne + ldp x8,x9,[x23,#32+32] // in2 + csel x16,x10,x16,ne + csel x17,x11,x17,ne + ldp x10,x11,[x23,#32+48] + stp x14,x15,[x21,#32] + stp x16,x17,[x21,#32+16] + ldp x14,x15,[x22,#64] // in1 + cmp x24,#0 // ~, remember? + ldp x16,x17,[x22,#64+16] + csel x8,x4,x8,ne + csel x9,x5,x9,ne + csel x10,x6,x10,ne + csel x11,x7,x11,ne + cmp x25,#0 // ~, remember? + csel x14,x8,x14,ne + csel x15,x9,x15,ne + csel x16,x10,x16,ne + csel x17,x11,x17,ne + stp x14,x15,[x21,#64] + stp x16,x17,[x21,#64+16] + + add sp,x29,#0 // destroy frame + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x29,x30,[sp],#80 + AARCH64_VALIDATE_LINK_REGISTER + ret + +//////////////////////////////////////////////////////////////////////// +// void ecp_nistz256_ord_mul_mont(uint64_t res[4], uint64_t a[4], +// uint64_t b[4]); +.globl ecp_nistz256_ord_mul_mont + +.def ecp_nistz256_ord_mul_mont + .type 32 +.endef +.align 4 +ecp_nistz256_ord_mul_mont: + AARCH64_VALID_CALL_TARGET + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + stp x29,x30,[sp,#-64]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + adrp x23,Lord + add x23,x23,:lo12:Lord + ldr x3,[x2] // bp[0] + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + + ldp x12,x13,[x23,#0] + ldp x21,x22,[x23,#16] + ldr x23,[x23,#32] + + mul x14,x4,x3 // a[0]*b[0] + umulh x8,x4,x3 + + mul x15,x5,x3 // a[1]*b[0] + umulh x9,x5,x3 + + mul x16,x6,x3 // a[2]*b[0] + umulh x10,x6,x3 + + mul x17,x7,x3 // a[3]*b[0] + umulh x19,x7,x3 + + mul x24,x14,x23 + + adds x15,x15,x8 // accumulate high parts of multiplication + adcs x16,x16,x9 + adcs x17,x17,x10 + adc x19,x19,xzr + mov x20,xzr + ldr x3,[x2,#8*1] // b[i] + + lsl x8,x24,#32 + subs x16,x16,x24 + lsr x9,x24,#32 + sbcs x17,x17,x8 + sbcs x19,x19,x9 + sbc x20,x20,xzr + + subs xzr,x14,#1 + umulh x9,x12,x24 + mul x10,x13,x24 + umulh x11,x13,x24 + + adcs x10,x10,x9 + mul x8,x4,x3 + adc x11,x11,xzr + mul x9,x5,x3 + + adds x14,x15,x10 + mul x10,x6,x3 + adcs x15,x16,x11 + mul x11,x7,x3 + adcs x16,x17,x24 + adcs x17,x19,x24 + adc x19,x20,xzr + + adds x14,x14,x8 // accumulate low parts + umulh x8,x4,x3 + adcs x15,x15,x9 + umulh x9,x5,x3 + adcs x16,x16,x10 + umulh x10,x6,x3 + adcs x17,x17,x11 + umulh x11,x7,x3 + adc x19,x19,xzr + mul x24,x14,x23 + adds x15,x15,x8 // accumulate high parts + adcs x16,x16,x9 + adcs x17,x17,x10 + adcs x19,x19,x11 + adc x20,xzr,xzr + ldr x3,[x2,#8*2] // b[i] + + lsl x8,x24,#32 + subs x16,x16,x24 + lsr x9,x24,#32 + sbcs x17,x17,x8 + sbcs x19,x19,x9 + sbc x20,x20,xzr + + subs xzr,x14,#1 + umulh x9,x12,x24 + mul x10,x13,x24 + umulh x11,x13,x24 + + adcs x10,x10,x9 + mul x8,x4,x3 + adc x11,x11,xzr + mul x9,x5,x3 + + adds x14,x15,x10 + mul x10,x6,x3 + adcs x15,x16,x11 + mul x11,x7,x3 + adcs x16,x17,x24 + adcs x17,x19,x24 + adc x19,x20,xzr + + adds x14,x14,x8 // accumulate low parts + umulh x8,x4,x3 + adcs x15,x15,x9 + umulh x9,x5,x3 + adcs x16,x16,x10 + umulh x10,x6,x3 + 
adcs x17,x17,x11 + umulh x11,x7,x3 + adc x19,x19,xzr + mul x24,x14,x23 + adds x15,x15,x8 // accumulate high parts + adcs x16,x16,x9 + adcs x17,x17,x10 + adcs x19,x19,x11 + adc x20,xzr,xzr + ldr x3,[x2,#8*3] // b[i] + + lsl x8,x24,#32 + subs x16,x16,x24 + lsr x9,x24,#32 + sbcs x17,x17,x8 + sbcs x19,x19,x9 + sbc x20,x20,xzr + + subs xzr,x14,#1 + umulh x9,x12,x24 + mul x10,x13,x24 + umulh x11,x13,x24 + + adcs x10,x10,x9 + mul x8,x4,x3 + adc x11,x11,xzr + mul x9,x5,x3 + + adds x14,x15,x10 + mul x10,x6,x3 + adcs x15,x16,x11 + mul x11,x7,x3 + adcs x16,x17,x24 + adcs x17,x19,x24 + adc x19,x20,xzr + + adds x14,x14,x8 // accumulate low parts + umulh x8,x4,x3 + adcs x15,x15,x9 + umulh x9,x5,x3 + adcs x16,x16,x10 + umulh x10,x6,x3 + adcs x17,x17,x11 + umulh x11,x7,x3 + adc x19,x19,xzr + mul x24,x14,x23 + adds x15,x15,x8 // accumulate high parts + adcs x16,x16,x9 + adcs x17,x17,x10 + adcs x19,x19,x11 + adc x20,xzr,xzr + lsl x8,x24,#32 // last reduction + subs x16,x16,x24 + lsr x9,x24,#32 + sbcs x17,x17,x8 + sbcs x19,x19,x9 + sbc x20,x20,xzr + + subs xzr,x14,#1 + umulh x9,x12,x24 + mul x10,x13,x24 + umulh x11,x13,x24 + + adcs x10,x10,x9 + adc x11,x11,xzr + + adds x14,x15,x10 + adcs x15,x16,x11 + adcs x16,x17,x24 + adcs x17,x19,x24 + adc x19,x20,xzr + + subs x8,x14,x12 // ret -= modulus + sbcs x9,x15,x13 + sbcs x10,x16,x21 + sbcs x11,x17,x22 + sbcs xzr,x19,xzr + + csel x14,x14,x8,lo // ret = borrow ? ret : ret-modulus + csel x15,x15,x9,lo + csel x16,x16,x10,lo + stp x14,x15,[x0] + csel x17,x17,x11,lo + stp x16,x17,[x0,#16] + + ldp x19,x20,[sp,#16] + ldp x21,x22,[sp,#32] + ldp x23,x24,[sp,#48] + ldr x29,[sp],#64 + ret + + +//////////////////////////////////////////////////////////////////////// +// void ecp_nistz256_ord_sqr_mont(uint64_t res[4], uint64_t a[4], +// uint64_t rep); +.globl ecp_nistz256_ord_sqr_mont + +.def ecp_nistz256_ord_sqr_mont + .type 32 +.endef +.align 4 +ecp_nistz256_ord_sqr_mont: + AARCH64_VALID_CALL_TARGET + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + stp x29,x30,[sp,#-64]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + adrp x23,Lord + add x23,x23,:lo12:Lord + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + + ldp x12,x13,[x23,#0] + ldp x21,x22,[x23,#16] + ldr x23,[x23,#32] + b Loop_ord_sqr + +.align 4 +Loop_ord_sqr: + sub x2,x2,#1 + //////////////////////////////////////////////////////////////// + // | | | | | |a1*a0| | + // | | | | |a2*a0| | | + // | |a3*a2|a3*a0| | | | + // | | | |a2*a1| | | | + // | | |a3*a1| | | | | + // *| | | | | | | | 2| + // +|a3*a3|a2*a2|a1*a1|a0*a0| + // |--+--+--+--+--+--+--+--| + // |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is , i.e. follow + // + // "can't overflow" below mark carrying into high part of + // multiplication result, which can't overflow, because it + // can never be all ones. 
+ + mul x15,x5,x4 // a[1]*a[0] + umulh x9,x5,x4 + mul x16,x6,x4 // a[2]*a[0] + umulh x10,x6,x4 + mul x17,x7,x4 // a[3]*a[0] + umulh x19,x7,x4 + + adds x16,x16,x9 // accumulate high parts of multiplication + mul x8,x6,x5 // a[2]*a[1] + umulh x9,x6,x5 + adcs x17,x17,x10 + mul x10,x7,x5 // a[3]*a[1] + umulh x11,x7,x5 + adc x19,x19,xzr // can't overflow + + mul x20,x7,x6 // a[3]*a[2] + umulh x1,x7,x6 + + adds x9,x9,x10 // accumulate high parts of multiplication + mul x14,x4,x4 // a[0]*a[0] + adc x10,x11,xzr // can't overflow + + adds x17,x17,x8 // accumulate low parts of multiplication + umulh x4,x4,x4 + adcs x19,x19,x9 + mul x9,x5,x5 // a[1]*a[1] + adcs x20,x20,x10 + umulh x5,x5,x5 + adc x1,x1,xzr // can't overflow + + adds x15,x15,x15 // acc[1-6]*=2 + mul x10,x6,x6 // a[2]*a[2] + adcs x16,x16,x16 + umulh x6,x6,x6 + adcs x17,x17,x17 + mul x11,x7,x7 // a[3]*a[3] + adcs x19,x19,x19 + umulh x7,x7,x7 + adcs x20,x20,x20 + adcs x1,x1,x1 + adc x3,xzr,xzr + + adds x15,x15,x4 // +a[i]*a[i] + mul x24,x14,x23 + adcs x16,x16,x9 + adcs x17,x17,x5 + adcs x19,x19,x10 + adcs x20,x20,x6 + adcs x1,x1,x11 + adc x3,x3,x7 + subs xzr,x14,#1 + umulh x9,x12,x24 + mul x10,x13,x24 + umulh x11,x13,x24 + + adcs x10,x10,x9 + adc x11,x11,xzr + + adds x14,x15,x10 + adcs x15,x16,x11 + adcs x16,x17,x24 + adc x17,xzr,x24 // can't overflow + mul x11,x14,x23 + lsl x8,x24,#32 + subs x15,x15,x24 + lsr x9,x24,#32 + sbcs x16,x16,x8 + sbc x17,x17,x9 // can't borrow + subs xzr,x14,#1 + umulh x9,x12,x11 + mul x10,x13,x11 + umulh x24,x13,x11 + + adcs x10,x10,x9 + adc x24,x24,xzr + + adds x14,x15,x10 + adcs x15,x16,x24 + adcs x16,x17,x11 + adc x17,xzr,x11 // can't overflow + mul x24,x14,x23 + lsl x8,x11,#32 + subs x15,x15,x11 + lsr x9,x11,#32 + sbcs x16,x16,x8 + sbc x17,x17,x9 // can't borrow + subs xzr,x14,#1 + umulh x9,x12,x24 + mul x10,x13,x24 + umulh x11,x13,x24 + + adcs x10,x10,x9 + adc x11,x11,xzr + + adds x14,x15,x10 + adcs x15,x16,x11 + adcs x16,x17,x24 + adc x17,xzr,x24 // can't overflow + mul x11,x14,x23 + lsl x8,x24,#32 + subs x15,x15,x24 + lsr x9,x24,#32 + sbcs x16,x16,x8 + sbc x17,x17,x9 // can't borrow + subs xzr,x14,#1 + umulh x9,x12,x11 + mul x10,x13,x11 + umulh x24,x13,x11 + + adcs x10,x10,x9 + adc x24,x24,xzr + + adds x14,x15,x10 + adcs x15,x16,x24 + adcs x16,x17,x11 + adc x17,xzr,x11 // can't overflow + lsl x8,x11,#32 + subs x15,x15,x11 + lsr x9,x11,#32 + sbcs x16,x16,x8 + sbc x17,x17,x9 // can't borrow + adds x14,x14,x19 // accumulate upper half + adcs x15,x15,x20 + adcs x16,x16,x1 + adcs x17,x17,x3 + adc x19,xzr,xzr + + subs x8,x14,x12 // ret -= modulus + sbcs x9,x15,x13 + sbcs x10,x16,x21 + sbcs x11,x17,x22 + sbcs xzr,x19,xzr + + csel x4,x14,x8,lo // ret = borrow ? ret : ret-modulus + csel x5,x15,x9,lo + csel x6,x16,x10,lo + csel x7,x17,x11,lo + + cbnz x2,Loop_ord_sqr + + stp x4,x5,[x0] + stp x6,x7,[x0,#16] + + ldp x19,x20,[sp,#16] + ldp x21,x22,[sp,#32] + ldp x23,x24,[sp,#48] + ldr x29,[sp],#64 + ret + +//////////////////////////////////////////////////////////////////////// +// void ecp_nistz256_select_w5(uint64_t *val, uint64_t *in_t, int index); +.globl ecp_nistz256_select_w5 + +.def ecp_nistz256_select_w5 + .type 32 +.endef +.align 4 +ecp_nistz256_select_w5: + AARCH64_VALID_CALL_TARGET + + // x10 := x0 + // w9 := 0; loop counter and incremented internal index + mov x10, x0 + mov w9, #0 + + // [v16-v21] := 0 + movi v16.16b, #0 + movi v17.16b, #0 + movi v18.16b, #0 + movi v19.16b, #0 + movi v20.16b, #0 + movi v21.16b, #0 + +Lselect_w5_loop: + // Loop 16 times. 
+ + // Increment index (loop counter); tested at the end of the loop + add w9, w9, #1 + + // [v22-v27] := Load a (3*256-bit = 6*128-bit) table entry starting at x1 + // and advance x1 to point to the next entry + ld1 {v22.2d, v23.2d, v24.2d, v25.2d}, [x1],#64 + + // x11 := (w9 == w2)? All 1s : All 0s + cmp w9, w2 + csetm x11, eq + + // continue loading ... + ld1 {v26.2d, v27.2d}, [x1],#32 + + // duplicate mask_64 into Mask (all 0s or all 1s) + dup v3.2d, x11 + + // [v16-v19] := (Mask == all 1s)? [v22-v25] : [v16-v19] + // i.e., values in output registers will remain the same if w9 != w2 + bit v16.16b, v22.16b, v3.16b + bit v17.16b, v23.16b, v3.16b + + bit v18.16b, v24.16b, v3.16b + bit v19.16b, v25.16b, v3.16b + + bit v20.16b, v26.16b, v3.16b + bit v21.16b, v27.16b, v3.16b + + // If bit #4 is not 0 (i.e. idx_ctr < 16) loop back + tbz w9, #4, Lselect_w5_loop + + // Write [v16-v21] to memory at the output pointer + st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x10],#64 + st1 {v20.2d, v21.2d}, [x10] + + ret + + + +//////////////////////////////////////////////////////////////////////// +// void ecp_nistz256_select_w7(uint64_t *val, uint64_t *in_t, int index); +.globl ecp_nistz256_select_w7 + +.def ecp_nistz256_select_w7 + .type 32 +.endef +.align 4 +ecp_nistz256_select_w7: + AARCH64_VALID_CALL_TARGET + + // w9 := 0; loop counter and incremented internal index + mov w9, #0 + + // [v16-v21] := 0 + movi v16.16b, #0 + movi v17.16b, #0 + movi v18.16b, #0 + movi v19.16b, #0 + +Lselect_w7_loop: + // Loop 64 times. + + // Increment index (loop counter); tested at the end of the loop + add w9, w9, #1 + + // [v22-v25] := Load a (2*256-bit = 4*128-bit) table entry starting at x1 + // and advance x1 to point to the next entry + ld1 {v22.2d, v23.2d, v24.2d, v25.2d}, [x1],#64 + + // x11 := (w9 == w2)? All 1s : All 0s + cmp w9, w2 + csetm x11, eq + + // duplicate mask_64 into Mask (all 0s or all 1s) + dup v3.2d, x11 + + // [v16-v19] := (Mask == all 1s)? [v22-v25] : [v16-v19] + // i.e., values in output registers will remain the same if w9 != w2 + bit v16.16b, v22.16b, v3.16b + bit v17.16b, v23.16b, v3.16b + + bit v18.16b, v24.16b, v3.16b + bit v19.16b, v25.16b, v3.16b + + // If bit #6 is not 0 (i.e. idx_ctr < 64) loop back + tbz w9, #6, Lselect_w7_loop + + // Write [v16-v19] to memory at the output pointer + st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x0] + + ret + +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32) diff --git a/ring-0.17.14/pregenerated/p256-x86_64-asm-elf.S b/ring-0.17.14/pregenerated/p256-x86_64-asm-elf.S new file mode 100644 index 0000000000..deeec078b2 --- /dev/null +++ b/ring-0.17.14/pregenerated/p256-x86_64-asm-elf.S @@ -0,0 +1,4599 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__) +.text + + +.section .rodata +.align 64 +.Lpoly: +.quad 0xffffffffffffffff, 0x00000000ffffffff, 0x0000000000000000, 0xffffffff00000001 + +.LOne: +.long 1,1,1,1,1,1,1,1 +.LTwo: +.long 2,2,2,2,2,2,2,2 +.LThree: +.long 3,3,3,3,3,3,3,3 +.LONE_mont: +.quad 0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe + + +.Lord: +.quad 0xf3b9cac2fc632551, 0xbce6faada7179e84, 0xffffffffffffffff, 0xffffffff00000000 +.LordK: +.quad 0xccd1c8aaee00bc4f +.text + + + +.globl ecp_nistz256_neg +.hidden ecp_nistz256_neg +.type ecp_nistz256_neg,@function +.align 32 +ecp_nistz256_neg: +.cfi_startproc +_CET_ENDBR + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-16 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-24 +.Lneg_body: + + xorq %r8,%r8 + xorq %r9,%r9 + xorq %r10,%r10 + xorq %r11,%r11 + xorq %r13,%r13 + + subq 0(%rsi),%r8 + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + movq %r8,%rax + sbbq 24(%rsi),%r11 + leaq .Lpoly(%rip),%rsi + movq %r9,%rdx + sbbq $0,%r13 + + addq 0(%rsi),%r8 + movq %r10,%rcx + adcq 8(%rsi),%r9 + adcq 16(%rsi),%r10 + movq %r11,%r12 + adcq 24(%rsi),%r11 + testq %r13,%r13 + + cmovzq %rax,%r8 + cmovzq %rdx,%r9 + movq %r8,0(%rdi) + cmovzq %rcx,%r10 + movq %r9,8(%rdi) + cmovzq %r12,%r11 + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + movq 0(%rsp),%r13 +.cfi_restore %r13 + movq 8(%rsp),%r12 +.cfi_restore %r12 + leaq 16(%rsp),%rsp +.cfi_adjust_cfa_offset -16 +.Lneg_epilogue: + ret +.cfi_endproc +.size ecp_nistz256_neg,.-ecp_nistz256_neg + + + + + + +.globl ecp_nistz256_ord_mul_mont_nohw +.hidden ecp_nistz256_ord_mul_mont_nohw +.type ecp_nistz256_ord_mul_mont_nohw,@function +.align 32 +ecp_nistz256_ord_mul_mont_nohw: +.cfi_startproc +_CET_ENDBR + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 +.Lord_mul_body: + + movq 0(%rdx),%rax + movq %rdx,%rbx + leaq .Lord(%rip),%r14 + movq .LordK(%rip),%r15 + + + movq %rax,%rcx + mulq 0(%rsi) + movq %rax,%r8 + movq %rcx,%rax + movq %rdx,%r9 + + mulq 8(%rsi) + addq %rax,%r9 + movq %rcx,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq 16(%rsi) + addq %rax,%r10 + movq %rcx,%rax + adcq $0,%rdx + + movq %r8,%r13 + imulq %r15,%r8 + + movq %rdx,%r11 + mulq 24(%rsi) + addq %rax,%r11 + movq %r8,%rax + adcq $0,%rdx + movq %rdx,%r12 + + + mulq 0(%r14) + movq %r8,%rbp + addq %rax,%r13 + movq %r8,%rax + adcq $0,%rdx + movq %rdx,%rcx + + subq %r8,%r10 + sbbq $0,%r8 + + mulq 8(%r14) + addq %rcx,%r9 + adcq $0,%rdx + addq %rax,%r9 + movq %rbp,%rax + adcq %rdx,%r10 + movq %rbp,%rdx + adcq $0,%r8 + + shlq $32,%rax + shrq $32,%rdx + subq %rax,%r11 + movq 8(%rbx),%rax + sbbq %rdx,%rbp + + addq %r8,%r11 + adcq %rbp,%r12 + adcq $0,%r13 + + + movq %rax,%rcx + mulq 0(%rsi) + addq %rax,%r9 + movq %rcx,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 8(%rsi) + addq %rbp,%r10 + adcq $0,%rdx + addq %rax,%r10 + movq %rcx,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rsi) + addq %rbp,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq %rcx,%rax + adcq $0,%rdx + + movq %r9,%rcx + imulq %r15,%r9 + + movq %rdx,%rbp + mulq 24(%rsi) + addq %rbp,%r12 + adcq $0,%rdx + xorq %r8,%r8 + addq %rax,%r12 + movq %r9,%rax + adcq %rdx,%r13 + adcq $0,%r8 + + + mulq 0(%r14) + movq 
%r9,%rbp + addq %rax,%rcx + movq %r9,%rax + adcq %rdx,%rcx + + subq %r9,%r11 + sbbq $0,%r9 + + mulq 8(%r14) + addq %rcx,%r10 + adcq $0,%rdx + addq %rax,%r10 + movq %rbp,%rax + adcq %rdx,%r11 + movq %rbp,%rdx + adcq $0,%r9 + + shlq $32,%rax + shrq $32,%rdx + subq %rax,%r12 + movq 16(%rbx),%rax + sbbq %rdx,%rbp + + addq %r9,%r12 + adcq %rbp,%r13 + adcq $0,%r8 + + + movq %rax,%rcx + mulq 0(%rsi) + addq %rax,%r10 + movq %rcx,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 8(%rsi) + addq %rbp,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq %rcx,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rsi) + addq %rbp,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %rcx,%rax + adcq $0,%rdx + + movq %r10,%rcx + imulq %r15,%r10 + + movq %rdx,%rbp + mulq 24(%rsi) + addq %rbp,%r13 + adcq $0,%rdx + xorq %r9,%r9 + addq %rax,%r13 + movq %r10,%rax + adcq %rdx,%r8 + adcq $0,%r9 + + + mulq 0(%r14) + movq %r10,%rbp + addq %rax,%rcx + movq %r10,%rax + adcq %rdx,%rcx + + subq %r10,%r12 + sbbq $0,%r10 + + mulq 8(%r14) + addq %rcx,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq %rbp,%rax + adcq %rdx,%r12 + movq %rbp,%rdx + adcq $0,%r10 + + shlq $32,%rax + shrq $32,%rdx + subq %rax,%r13 + movq 24(%rbx),%rax + sbbq %rdx,%rbp + + addq %r10,%r13 + adcq %rbp,%r8 + adcq $0,%r9 + + + movq %rax,%rcx + mulq 0(%rsi) + addq %rax,%r11 + movq %rcx,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 8(%rsi) + addq %rbp,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %rcx,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rsi) + addq %rbp,%r13 + adcq $0,%rdx + addq %rax,%r13 + movq %rcx,%rax + adcq $0,%rdx + + movq %r11,%rcx + imulq %r15,%r11 + + movq %rdx,%rbp + mulq 24(%rsi) + addq %rbp,%r8 + adcq $0,%rdx + xorq %r10,%r10 + addq %rax,%r8 + movq %r11,%rax + adcq %rdx,%r9 + adcq $0,%r10 + + + mulq 0(%r14) + movq %r11,%rbp + addq %rax,%rcx + movq %r11,%rax + adcq %rdx,%rcx + + subq %r11,%r13 + sbbq $0,%r11 + + mulq 8(%r14) + addq %rcx,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %rbp,%rax + adcq %rdx,%r13 + movq %rbp,%rdx + adcq $0,%r11 + + shlq $32,%rax + shrq $32,%rdx + subq %rax,%r8 + sbbq %rdx,%rbp + + addq %r11,%r8 + adcq %rbp,%r9 + adcq $0,%r10 + + + movq %r12,%rsi + subq 0(%r14),%r12 + movq %r13,%r11 + sbbq 8(%r14),%r13 + movq %r8,%rcx + sbbq 16(%r14),%r8 + movq %r9,%rbp + sbbq 24(%r14),%r9 + sbbq $0,%r10 + + cmovcq %rsi,%r12 + cmovcq %r11,%r13 + cmovcq %rcx,%r8 + cmovcq %rbp,%r9 + + movq %r12,0(%rdi) + movq %r13,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + movq 0(%rsp),%r15 +.cfi_restore %r15 + movq 8(%rsp),%r14 +.cfi_restore %r14 + movq 16(%rsp),%r13 +.cfi_restore %r13 + movq 24(%rsp),%r12 +.cfi_restore %r12 + movq 32(%rsp),%rbx +.cfi_restore %rbx + movq 40(%rsp),%rbp +.cfi_restore %rbp + leaq 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lord_mul_epilogue: + ret +.cfi_endproc +.size ecp_nistz256_ord_mul_mont_nohw,.-ecp_nistz256_ord_mul_mont_nohw + + + + + + + +.globl ecp_nistz256_ord_sqr_mont_nohw +.hidden ecp_nistz256_ord_sqr_mont_nohw +.type ecp_nistz256_ord_sqr_mont_nohw,@function +.align 32 +ecp_nistz256_ord_sqr_mont_nohw: +.cfi_startproc +_CET_ENDBR + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 +.Lord_sqr_body: + + movq 0(%rsi),%r8 + movq 8(%rsi),%rax + movq 16(%rsi),%r14 + movq 24(%rsi),%r15 + leaq .Lord(%rip),%rsi + movq 
%rdx,%rbx + jmp .Loop_ord_sqr + +.align 32 +.Loop_ord_sqr: + + movq %rax,%rbp + mulq %r8 + movq %rax,%r9 +.byte 102,72,15,110,205 + movq %r14,%rax + movq %rdx,%r10 + + mulq %r8 + addq %rax,%r10 + movq %r15,%rax +.byte 102,73,15,110,214 + adcq $0,%rdx + movq %rdx,%r11 + + mulq %r8 + addq %rax,%r11 + movq %r15,%rax +.byte 102,73,15,110,223 + adcq $0,%rdx + movq %rdx,%r12 + + + mulq %r14 + movq %rax,%r13 + movq %r14,%rax + movq %rdx,%r14 + + + mulq %rbp + addq %rax,%r11 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r15 + + mulq %rbp + addq %rax,%r12 + adcq $0,%rdx + + addq %r15,%r12 + adcq %rdx,%r13 + adcq $0,%r14 + + + xorq %r15,%r15 + movq %r8,%rax + addq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq %r12,%r12 + adcq %r13,%r13 + adcq %r14,%r14 + adcq $0,%r15 + + + mulq %rax + movq %rax,%r8 +.byte 102,72,15,126,200 + movq %rdx,%rbp + + mulq %rax + addq %rbp,%r9 + adcq %rax,%r10 +.byte 102,72,15,126,208 + adcq $0,%rdx + movq %rdx,%rbp + + mulq %rax + addq %rbp,%r11 + adcq %rax,%r12 +.byte 102,72,15,126,216 + adcq $0,%rdx + movq %rdx,%rbp + + movq %r8,%rcx + imulq 32(%rsi),%r8 + + mulq %rax + addq %rbp,%r13 + adcq %rax,%r14 + movq 0(%rsi),%rax + adcq %rdx,%r15 + + + mulq %r8 + movq %r8,%rbp + addq %rax,%rcx + movq 8(%rsi),%rax + adcq %rdx,%rcx + + subq %r8,%r10 + sbbq $0,%rbp + + mulq %r8 + addq %rcx,%r9 + adcq $0,%rdx + addq %rax,%r9 + movq %r8,%rax + adcq %rdx,%r10 + movq %r8,%rdx + adcq $0,%rbp + + movq %r9,%rcx + imulq 32(%rsi),%r9 + + shlq $32,%rax + shrq $32,%rdx + subq %rax,%r11 + movq 0(%rsi),%rax + sbbq %rdx,%r8 + + addq %rbp,%r11 + adcq $0,%r8 + + + mulq %r9 + movq %r9,%rbp + addq %rax,%rcx + movq 8(%rsi),%rax + adcq %rdx,%rcx + + subq %r9,%r11 + sbbq $0,%rbp + + mulq %r9 + addq %rcx,%r10 + adcq $0,%rdx + addq %rax,%r10 + movq %r9,%rax + adcq %rdx,%r11 + movq %r9,%rdx + adcq $0,%rbp + + movq %r10,%rcx + imulq 32(%rsi),%r10 + + shlq $32,%rax + shrq $32,%rdx + subq %rax,%r8 + movq 0(%rsi),%rax + sbbq %rdx,%r9 + + addq %rbp,%r8 + adcq $0,%r9 + + + mulq %r10 + movq %r10,%rbp + addq %rax,%rcx + movq 8(%rsi),%rax + adcq %rdx,%rcx + + subq %r10,%r8 + sbbq $0,%rbp + + mulq %r10 + addq %rcx,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq %r10,%rax + adcq %rdx,%r8 + movq %r10,%rdx + adcq $0,%rbp + + movq %r11,%rcx + imulq 32(%rsi),%r11 + + shlq $32,%rax + shrq $32,%rdx + subq %rax,%r9 + movq 0(%rsi),%rax + sbbq %rdx,%r10 + + addq %rbp,%r9 + adcq $0,%r10 + + + mulq %r11 + movq %r11,%rbp + addq %rax,%rcx + movq 8(%rsi),%rax + adcq %rdx,%rcx + + subq %r11,%r9 + sbbq $0,%rbp + + mulq %r11 + addq %rcx,%r8 + adcq $0,%rdx + addq %rax,%r8 + movq %r11,%rax + adcq %rdx,%r9 + movq %r11,%rdx + adcq $0,%rbp + + shlq $32,%rax + shrq $32,%rdx + subq %rax,%r10 + sbbq %rdx,%r11 + + addq %rbp,%r10 + adcq $0,%r11 + + + xorq %rdx,%rdx + addq %r12,%r8 + adcq %r13,%r9 + movq %r8,%r12 + adcq %r14,%r10 + adcq %r15,%r11 + movq %r9,%rax + adcq $0,%rdx + + + subq 0(%rsi),%r8 + movq %r10,%r14 + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + movq %r11,%r15 + sbbq 24(%rsi),%r11 + sbbq $0,%rdx + + cmovcq %r12,%r8 + cmovncq %r9,%rax + cmovncq %r10,%r14 + cmovncq %r11,%r15 + + decq %rbx + jnz .Loop_ord_sqr + + movq %r8,0(%rdi) + movq %rax,8(%rdi) + pxor %xmm1,%xmm1 + movq %r14,16(%rdi) + pxor %xmm2,%xmm2 + movq %r15,24(%rdi) + pxor %xmm3,%xmm3 + + movq 0(%rsp),%r15 +.cfi_restore %r15 + movq 8(%rsp),%r14 +.cfi_restore %r14 + movq 16(%rsp),%r13 +.cfi_restore %r13 + movq 24(%rsp),%r12 +.cfi_restore %r12 + movq 32(%rsp),%rbx +.cfi_restore %rbx + movq 40(%rsp),%rbp +.cfi_restore %rbp + leaq 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 
+.Lord_sqr_epilogue: + ret +.cfi_endproc +.size ecp_nistz256_ord_sqr_mont_nohw,.-ecp_nistz256_ord_sqr_mont_nohw + +.globl ecp_nistz256_ord_mul_mont_adx +.hidden ecp_nistz256_ord_mul_mont_adx +.type ecp_nistz256_ord_mul_mont_adx,@function +.align 32 +ecp_nistz256_ord_mul_mont_adx: +.cfi_startproc +.Lecp_nistz256_ord_mul_mont_adx: +_CET_ENDBR + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 +.Lord_mulx_body: + + movq %rdx,%rbx + movq 0(%rdx),%rdx + movq 0(%rsi),%r9 + movq 8(%rsi),%r10 + movq 16(%rsi),%r11 + movq 24(%rsi),%r12 + leaq -128(%rsi),%rsi + leaq .Lord-128(%rip),%r14 + movq .LordK(%rip),%r15 + + + mulxq %r9,%r8,%r9 + mulxq %r10,%rcx,%r10 + mulxq %r11,%rbp,%r11 + addq %rcx,%r9 + mulxq %r12,%rcx,%r12 + movq %r8,%rdx + mulxq %r15,%rdx,%rax + adcq %rbp,%r10 + adcq %rcx,%r11 + adcq $0,%r12 + + + xorq %r13,%r13 + mulxq 0+128(%r14),%rcx,%rbp + adcxq %rcx,%r8 + adoxq %rbp,%r9 + + mulxq 8+128(%r14),%rcx,%rbp + adcxq %rcx,%r9 + adoxq %rbp,%r10 + + mulxq 16+128(%r14),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + + mulxq 24+128(%r14),%rcx,%rbp + movq 8(%rbx),%rdx + adcxq %rcx,%r11 + adoxq %rbp,%r12 + adcxq %r8,%r12 + adoxq %r8,%r13 + adcq $0,%r13 + + + mulxq 0+128(%rsi),%rcx,%rbp + adcxq %rcx,%r9 + adoxq %rbp,%r10 + + mulxq 8+128(%rsi),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + + mulxq 16+128(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 24+128(%rsi),%rcx,%rbp + movq %r9,%rdx + mulxq %r15,%rdx,%rax + adcxq %rcx,%r12 + adoxq %rbp,%r13 + + adcxq %r8,%r13 + adoxq %r8,%r8 + adcq $0,%r8 + + + mulxq 0+128(%r14),%rcx,%rbp + adcxq %rcx,%r9 + adoxq %rbp,%r10 + + mulxq 8+128(%r14),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + + mulxq 16+128(%r14),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 24+128(%r14),%rcx,%rbp + movq 16(%rbx),%rdx + adcxq %rcx,%r12 + adoxq %rbp,%r13 + adcxq %r9,%r13 + adoxq %r9,%r8 + adcq $0,%r8 + + + mulxq 0+128(%rsi),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + + mulxq 8+128(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 16+128(%rsi),%rcx,%rbp + adcxq %rcx,%r12 + adoxq %rbp,%r13 + + mulxq 24+128(%rsi),%rcx,%rbp + movq %r10,%rdx + mulxq %r15,%rdx,%rax + adcxq %rcx,%r13 + adoxq %rbp,%r8 + + adcxq %r9,%r8 + adoxq %r9,%r9 + adcq $0,%r9 + + + mulxq 0+128(%r14),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + + mulxq 8+128(%r14),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 16+128(%r14),%rcx,%rbp + adcxq %rcx,%r12 + adoxq %rbp,%r13 + + mulxq 24+128(%r14),%rcx,%rbp + movq 24(%rbx),%rdx + adcxq %rcx,%r13 + adoxq %rbp,%r8 + adcxq %r10,%r8 + adoxq %r10,%r9 + adcq $0,%r9 + + + mulxq 0+128(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 8+128(%rsi),%rcx,%rbp + adcxq %rcx,%r12 + adoxq %rbp,%r13 + + mulxq 16+128(%rsi),%rcx,%rbp + adcxq %rcx,%r13 + adoxq %rbp,%r8 + + mulxq 24+128(%rsi),%rcx,%rbp + movq %r11,%rdx + mulxq %r15,%rdx,%rax + adcxq %rcx,%r8 + adoxq %rbp,%r9 + + adcxq %r10,%r9 + adoxq %r10,%r10 + adcq $0,%r10 + + + mulxq 0+128(%r14),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 8+128(%r14),%rcx,%rbp + adcxq %rcx,%r12 + adoxq %rbp,%r13 + + mulxq 16+128(%r14),%rcx,%rbp + adcxq %rcx,%r13 + adoxq %rbp,%r8 + + mulxq 24+128(%r14),%rcx,%rbp + leaq 128(%r14),%r14 + movq %r12,%rbx + adcxq %rcx,%r8 + 
adoxq %rbp,%r9 + movq %r13,%rdx + adcxq %r11,%r9 + adoxq %r11,%r10 + adcq $0,%r10 + + + + movq %r8,%rcx + subq 0(%r14),%r12 + sbbq 8(%r14),%r13 + sbbq 16(%r14),%r8 + movq %r9,%rbp + sbbq 24(%r14),%r9 + sbbq $0,%r10 + + cmovcq %rbx,%r12 + cmovcq %rdx,%r13 + cmovcq %rcx,%r8 + cmovcq %rbp,%r9 + + movq %r12,0(%rdi) + movq %r13,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + movq 0(%rsp),%r15 +.cfi_restore %r15 + movq 8(%rsp),%r14 +.cfi_restore %r14 + movq 16(%rsp),%r13 +.cfi_restore %r13 + movq 24(%rsp),%r12 +.cfi_restore %r12 + movq 32(%rsp),%rbx +.cfi_restore %rbx + movq 40(%rsp),%rbp +.cfi_restore %rbp + leaq 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lord_mulx_epilogue: + ret +.cfi_endproc +.size ecp_nistz256_ord_mul_mont_adx,.-ecp_nistz256_ord_mul_mont_adx + +.globl ecp_nistz256_ord_sqr_mont_adx +.hidden ecp_nistz256_ord_sqr_mont_adx +.type ecp_nistz256_ord_sqr_mont_adx,@function +.align 32 +ecp_nistz256_ord_sqr_mont_adx: +.cfi_startproc +_CET_ENDBR +.Lecp_nistz256_ord_sqr_mont_adx: + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 +.Lord_sqrx_body: + + movq %rdx,%rbx + movq 0(%rsi),%rdx + movq 8(%rsi),%r14 + movq 16(%rsi),%r15 + movq 24(%rsi),%r8 + leaq .Lord(%rip),%rsi + jmp .Loop_ord_sqrx + +.align 32 +.Loop_ord_sqrx: + mulxq %r14,%r9,%r10 + mulxq %r15,%rcx,%r11 + movq %rdx,%rax +.byte 102,73,15,110,206 + mulxq %r8,%rbp,%r12 + movq %r14,%rdx + addq %rcx,%r10 +.byte 102,73,15,110,215 + adcq %rbp,%r11 + adcq $0,%r12 + xorq %r13,%r13 + + mulxq %r15,%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq %r8,%rcx,%rbp + movq %r15,%rdx + adcxq %rcx,%r12 + adoxq %rbp,%r13 + adcq $0,%r13 + + mulxq %r8,%rcx,%r14 + movq %rax,%rdx +.byte 102,73,15,110,216 + xorq %r15,%r15 + adcxq %r9,%r9 + adoxq %rcx,%r13 + adcxq %r10,%r10 + adoxq %r15,%r14 + + + mulxq %rdx,%r8,%rbp +.byte 102,72,15,126,202 + adcxq %r11,%r11 + adoxq %rbp,%r9 + adcxq %r12,%r12 + mulxq %rdx,%rcx,%rax +.byte 102,72,15,126,210 + adcxq %r13,%r13 + adoxq %rcx,%r10 + adcxq %r14,%r14 + mulxq %rdx,%rcx,%rbp +.byte 0x67 +.byte 102,72,15,126,218 + adoxq %rax,%r11 + adcxq %r15,%r15 + adoxq %rcx,%r12 + adoxq %rbp,%r13 + mulxq %rdx,%rcx,%rax + adoxq %rcx,%r14 + adoxq %rax,%r15 + + + movq %r8,%rdx + mulxq 32(%rsi),%rdx,%rcx + + xorq %rax,%rax + mulxq 0(%rsi),%rcx,%rbp + adcxq %rcx,%r8 + adoxq %rbp,%r9 + mulxq 8(%rsi),%rcx,%rbp + adcxq %rcx,%r9 + adoxq %rbp,%r10 + mulxq 16(%rsi),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + mulxq 24(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r8 + adcxq %rax,%r8 + + + movq %r9,%rdx + mulxq 32(%rsi),%rdx,%rcx + + mulxq 0(%rsi),%rcx,%rbp + adoxq %rcx,%r9 + adcxq %rbp,%r10 + mulxq 8(%rsi),%rcx,%rbp + adoxq %rcx,%r10 + adcxq %rbp,%r11 + mulxq 16(%rsi),%rcx,%rbp + adoxq %rcx,%r11 + adcxq %rbp,%r8 + mulxq 24(%rsi),%rcx,%rbp + adoxq %rcx,%r8 + adcxq %rbp,%r9 + adoxq %rax,%r9 + + + movq %r10,%rdx + mulxq 32(%rsi),%rdx,%rcx + + mulxq 0(%rsi),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + mulxq 8(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r8 + mulxq 16(%rsi),%rcx,%rbp + adcxq %rcx,%r8 + adoxq %rbp,%r9 + mulxq 24(%rsi),%rcx,%rbp + adcxq %rcx,%r9 + adoxq %rbp,%r10 + adcxq %rax,%r10 + + + movq %r11,%rdx + mulxq 32(%rsi),%rdx,%rcx + + mulxq 0(%rsi),%rcx,%rbp + adoxq %rcx,%r11 + adcxq %rbp,%r8 + 
mulxq 8(%rsi),%rcx,%rbp + adoxq %rcx,%r8 + adcxq %rbp,%r9 + mulxq 16(%rsi),%rcx,%rbp + adoxq %rcx,%r9 + adcxq %rbp,%r10 + mulxq 24(%rsi),%rcx,%rbp + adoxq %rcx,%r10 + adcxq %rbp,%r11 + adoxq %rax,%r11 + + + addq %r8,%r12 + adcq %r13,%r9 + movq %r12,%rdx + adcq %r14,%r10 + adcq %r15,%r11 + movq %r9,%r14 + adcq $0,%rax + + + subq 0(%rsi),%r12 + movq %r10,%r15 + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + movq %r11,%r8 + sbbq 24(%rsi),%r11 + sbbq $0,%rax + + cmovncq %r12,%rdx + cmovncq %r9,%r14 + cmovncq %r10,%r15 + cmovncq %r11,%r8 + + decq %rbx + jnz .Loop_ord_sqrx + + movq %rdx,0(%rdi) + movq %r14,8(%rdi) + pxor %xmm1,%xmm1 + movq %r15,16(%rdi) + pxor %xmm2,%xmm2 + movq %r8,24(%rdi) + pxor %xmm3,%xmm3 + + movq 0(%rsp),%r15 +.cfi_restore %r15 + movq 8(%rsp),%r14 +.cfi_restore %r14 + movq 16(%rsp),%r13 +.cfi_restore %r13 + movq 24(%rsp),%r12 +.cfi_restore %r12 + movq 32(%rsp),%rbx +.cfi_restore %rbx + movq 40(%rsp),%rbp +.cfi_restore %rbp + leaq 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lord_sqrx_epilogue: + ret +.cfi_endproc +.size ecp_nistz256_ord_sqr_mont_adx,.-ecp_nistz256_ord_sqr_mont_adx + + + + + + +.globl ecp_nistz256_mul_mont_nohw +.hidden ecp_nistz256_mul_mont_nohw +.type ecp_nistz256_mul_mont_nohw,@function +.align 32 +ecp_nistz256_mul_mont_nohw: +.cfi_startproc +_CET_ENDBR + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 +.Lmul_body: + movq %rdx,%rbx + movq 0(%rdx),%rax + movq 0(%rsi),%r9 + movq 8(%rsi),%r10 + movq 16(%rsi),%r11 + movq 24(%rsi),%r12 + + call __ecp_nistz256_mul_montq + + movq 0(%rsp),%r15 +.cfi_restore %r15 + movq 8(%rsp),%r14 +.cfi_restore %r14 + movq 16(%rsp),%r13 +.cfi_restore %r13 + movq 24(%rsp),%r12 +.cfi_restore %r12 + movq 32(%rsp),%rbx +.cfi_restore %rbx + movq 40(%rsp),%rbp +.cfi_restore %rbp + leaq 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lmul_epilogue: + ret +.cfi_endproc +.size ecp_nistz256_mul_mont_nohw,.-ecp_nistz256_mul_mont_nohw + +.type __ecp_nistz256_mul_montq,@function +.align 32 +__ecp_nistz256_mul_montq: +.cfi_startproc + + + movq %rax,%rbp + mulq %r9 + movq .Lpoly+8(%rip),%r14 + movq %rax,%r8 + movq %rbp,%rax + movq %rdx,%r9 + + mulq %r10 + movq .Lpoly+24(%rip),%r15 + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %r11 + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq %r12 + addq %rax,%r11 + movq %r8,%rax + adcq $0,%rdx + xorq %r13,%r13 + movq %rdx,%r12 + + + + + + + + + + + movq %r8,%rbp + shlq $32,%r8 + mulq %r15 + shrq $32,%rbp + addq %r8,%r9 + adcq %rbp,%r10 + adcq %rax,%r11 + movq 8(%rbx),%rax + adcq %rdx,%r12 + adcq $0,%r13 + xorq %r8,%r8 + + + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rcx,%r10 + adcq $0,%rdx + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 16(%rsi) + addq %rcx,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 24(%rsi) + addq %rcx,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %r9,%rax + adcq %rdx,%r13 + adcq $0,%r8 + + + + movq %r9,%rbp + shlq $32,%r9 + mulq %r15 + shrq $32,%rbp + addq %r9,%r10 + adcq %rbp,%r11 + adcq %rax,%r12 + movq 16(%rbx),%rax + adcq %rdx,%r13 + adcq $0,%r8 + xorq %r9,%r9 + + + + movq 
%rax,%rbp + mulq 0(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rcx,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 16(%rsi) + addq %rcx,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 24(%rsi) + addq %rcx,%r13 + adcq $0,%rdx + addq %rax,%r13 + movq %r10,%rax + adcq %rdx,%r8 + adcq $0,%r9 + + + + movq %r10,%rbp + shlq $32,%r10 + mulq %r15 + shrq $32,%rbp + addq %r10,%r11 + adcq %rbp,%r12 + adcq %rax,%r13 + movq 24(%rbx),%rax + adcq %rdx,%r8 + adcq $0,%r9 + xorq %r10,%r10 + + + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rcx,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 16(%rsi) + addq %rcx,%r13 + adcq $0,%rdx + addq %rax,%r13 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 24(%rsi) + addq %rcx,%r8 + adcq $0,%rdx + addq %rax,%r8 + movq %r11,%rax + adcq %rdx,%r9 + adcq $0,%r10 + + + + movq %r11,%rbp + shlq $32,%r11 + mulq %r15 + shrq $32,%rbp + addq %r11,%r12 + adcq %rbp,%r13 + movq %r12,%rcx + adcq %rax,%r8 + adcq %rdx,%r9 + movq %r13,%rbp + adcq $0,%r10 + + + + subq $-1,%r12 + movq %r8,%rbx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%rdx + sbbq %r15,%r9 + sbbq $0,%r10 + + cmovcq %rcx,%r12 + cmovcq %rbp,%r13 + movq %r12,0(%rdi) + cmovcq %rbx,%r8 + movq %r13,8(%rdi) + cmovcq %rdx,%r9 + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + ret +.cfi_endproc +.size __ecp_nistz256_mul_montq,.-__ecp_nistz256_mul_montq + + + + + + + + +.globl ecp_nistz256_sqr_mont_nohw +.hidden ecp_nistz256_sqr_mont_nohw +.type ecp_nistz256_sqr_mont_nohw,@function +.align 32 +ecp_nistz256_sqr_mont_nohw: +.cfi_startproc +_CET_ENDBR + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 +.Lsqr_body: + movq 0(%rsi),%rax + movq 8(%rsi),%r14 + movq 16(%rsi),%r15 + movq 24(%rsi),%r8 + + call __ecp_nistz256_sqr_montq + + movq 0(%rsp),%r15 +.cfi_restore %r15 + movq 8(%rsp),%r14 +.cfi_restore %r14 + movq 16(%rsp),%r13 +.cfi_restore %r13 + movq 24(%rsp),%r12 +.cfi_restore %r12 + movq 32(%rsp),%rbx +.cfi_restore %rbx + movq 40(%rsp),%rbp +.cfi_restore %rbp + leaq 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lsqr_epilogue: + ret +.cfi_endproc +.size ecp_nistz256_sqr_mont_nohw,.-ecp_nistz256_sqr_mont_nohw + +.type __ecp_nistz256_sqr_montq,@function +.align 32 +__ecp_nistz256_sqr_montq: +.cfi_startproc + movq %rax,%r13 + mulq %r14 + movq %rax,%r9 + movq %r15,%rax + movq %rdx,%r10 + + mulq %r13 + addq %rax,%r10 + movq %r8,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq %r13 + addq %rax,%r11 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r12 + + + mulq %r14 + addq %rax,%r11 + movq %r8,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq %r14 + addq %rax,%r12 + movq %r8,%rax + adcq $0,%rdx + addq %rbp,%r12 + movq %rdx,%r13 + adcq $0,%r13 + + + mulq %r15 + xorq %r15,%r15 + addq %rax,%r13 + movq 0(%rsi),%rax + movq %rdx,%r14 + adcq $0,%r14 + + addq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq %r12,%r12 + adcq %r13,%r13 + adcq %r14,%r14 + adcq $0,%r15 + + mulq %rax + movq %rax,%r8 + movq 8(%rsi),%rax + movq %rdx,%rcx + + mulq %rax + addq %rcx,%r9 + adcq %rax,%r10 + movq 
16(%rsi),%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq %rax + addq %rcx,%r11 + adcq %rax,%r12 + movq 24(%rsi),%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq %rax + addq %rcx,%r13 + adcq %rax,%r14 + movq %r8,%rax + adcq %rdx,%r15 + + movq .Lpoly+8(%rip),%rsi + movq .Lpoly+24(%rip),%rbp + + + + + movq %r8,%rcx + shlq $32,%r8 + mulq %rbp + shrq $32,%rcx + addq %r8,%r9 + adcq %rcx,%r10 + adcq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + + + + movq %r9,%rcx + shlq $32,%r9 + movq %rdx,%r8 + mulq %rbp + shrq $32,%rcx + addq %r9,%r10 + adcq %rcx,%r11 + adcq %rax,%r8 + movq %r10,%rax + adcq $0,%rdx + + + + movq %r10,%rcx + shlq $32,%r10 + movq %rdx,%r9 + mulq %rbp + shrq $32,%rcx + addq %r10,%r11 + adcq %rcx,%r8 + adcq %rax,%r9 + movq %r11,%rax + adcq $0,%rdx + + + + movq %r11,%rcx + shlq $32,%r11 + movq %rdx,%r10 + mulq %rbp + shrq $32,%rcx + addq %r11,%r8 + adcq %rcx,%r9 + adcq %rax,%r10 + adcq $0,%rdx + xorq %r11,%r11 + + + + addq %r8,%r12 + adcq %r9,%r13 + movq %r12,%r8 + adcq %r10,%r14 + adcq %rdx,%r15 + movq %r13,%r9 + adcq $0,%r11 + + subq $-1,%r12 + movq %r14,%r10 + sbbq %rsi,%r13 + sbbq $0,%r14 + movq %r15,%rcx + sbbq %rbp,%r15 + sbbq $0,%r11 + + cmovcq %r8,%r12 + cmovcq %r9,%r13 + movq %r12,0(%rdi) + cmovcq %r10,%r14 + movq %r13,8(%rdi) + cmovcq %rcx,%r15 + movq %r14,16(%rdi) + movq %r15,24(%rdi) + + ret +.cfi_endproc +.size __ecp_nistz256_sqr_montq,.-__ecp_nistz256_sqr_montq +.globl ecp_nistz256_mul_mont_adx +.hidden ecp_nistz256_mul_mont_adx +.type ecp_nistz256_mul_mont_adx,@function +.align 32 +ecp_nistz256_mul_mont_adx: +.cfi_startproc +_CET_ENDBR + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 +.Lmulx_body: + movq %rdx,%rbx + movq 0(%rdx),%rdx + movq 0(%rsi),%r9 + movq 8(%rsi),%r10 + movq 16(%rsi),%r11 + movq 24(%rsi),%r12 + leaq -128(%rsi),%rsi + + call __ecp_nistz256_mul_montx + + movq 0(%rsp),%r15 +.cfi_restore %r15 + movq 8(%rsp),%r14 +.cfi_restore %r14 + movq 16(%rsp),%r13 +.cfi_restore %r13 + movq 24(%rsp),%r12 +.cfi_restore %r12 + movq 32(%rsp),%rbx +.cfi_restore %rbx + movq 40(%rsp),%rbp +.cfi_restore %rbp + leaq 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lmulx_epilogue: + ret +.cfi_endproc +.size ecp_nistz256_mul_mont_adx,.-ecp_nistz256_mul_mont_adx + +.type __ecp_nistz256_mul_montx,@function +.align 32 +__ecp_nistz256_mul_montx: +.cfi_startproc + + + mulxq %r9,%r8,%r9 + mulxq %r10,%rcx,%r10 + movq $32,%r14 + xorq %r13,%r13 + mulxq %r11,%rbp,%r11 + movq .Lpoly+24(%rip),%r15 + adcq %rcx,%r9 + mulxq %r12,%rcx,%r12 + movq %r8,%rdx + adcq %rbp,%r10 + shlxq %r14,%r8,%rbp + adcq %rcx,%r11 + shrxq %r14,%r8,%rcx + adcq $0,%r12 + + + + addq %rbp,%r9 + adcq %rcx,%r10 + + mulxq %r15,%rcx,%rbp + movq 8(%rbx),%rdx + adcq %rcx,%r11 + adcq %rbp,%r12 + adcq $0,%r13 + xorq %r8,%r8 + + + + mulxq 0+128(%rsi),%rcx,%rbp + adcxq %rcx,%r9 + adoxq %rbp,%r10 + + mulxq 8+128(%rsi),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + + mulxq 16+128(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 24+128(%rsi),%rcx,%rbp + movq %r9,%rdx + adcxq %rcx,%r12 + shlxq %r14,%r9,%rcx + adoxq %rbp,%r13 + shrxq %r14,%r9,%rbp + + adcxq %r8,%r13 + adoxq %r8,%r8 + adcq $0,%r8 + + + + addq %rcx,%r10 + adcq %rbp,%r11 + + mulxq %r15,%rcx,%rbp + movq 16(%rbx),%rdx + adcq %rcx,%r12 + adcq %rbp,%r13 + 
adcq $0,%r8 + xorq %r9,%r9 + + + + mulxq 0+128(%rsi),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + + mulxq 8+128(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 16+128(%rsi),%rcx,%rbp + adcxq %rcx,%r12 + adoxq %rbp,%r13 + + mulxq 24+128(%rsi),%rcx,%rbp + movq %r10,%rdx + adcxq %rcx,%r13 + shlxq %r14,%r10,%rcx + adoxq %rbp,%r8 + shrxq %r14,%r10,%rbp + + adcxq %r9,%r8 + adoxq %r9,%r9 + adcq $0,%r9 + + + + addq %rcx,%r11 + adcq %rbp,%r12 + + mulxq %r15,%rcx,%rbp + movq 24(%rbx),%rdx + adcq %rcx,%r13 + adcq %rbp,%r8 + adcq $0,%r9 + xorq %r10,%r10 + + + + mulxq 0+128(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 8+128(%rsi),%rcx,%rbp + adcxq %rcx,%r12 + adoxq %rbp,%r13 + + mulxq 16+128(%rsi),%rcx,%rbp + adcxq %rcx,%r13 + adoxq %rbp,%r8 + + mulxq 24+128(%rsi),%rcx,%rbp + movq %r11,%rdx + adcxq %rcx,%r8 + shlxq %r14,%r11,%rcx + adoxq %rbp,%r9 + shrxq %r14,%r11,%rbp + + adcxq %r10,%r9 + adoxq %r10,%r10 + adcq $0,%r10 + + + + addq %rcx,%r12 + adcq %rbp,%r13 + + mulxq %r15,%rcx,%rbp + movq %r12,%rbx + movq .Lpoly+8(%rip),%r14 + adcq %rcx,%r8 + movq %r13,%rdx + adcq %rbp,%r9 + adcq $0,%r10 + + + + xorl %eax,%eax + movq %r8,%rcx + sbbq $-1,%r12 + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%rbp + sbbq %r15,%r9 + sbbq $0,%r10 + + cmovcq %rbx,%r12 + cmovcq %rdx,%r13 + movq %r12,0(%rdi) + cmovcq %rcx,%r8 + movq %r13,8(%rdi) + cmovcq %rbp,%r9 + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + ret +.cfi_endproc +.size __ecp_nistz256_mul_montx,.-__ecp_nistz256_mul_montx + +.globl ecp_nistz256_sqr_mont_adx +.hidden ecp_nistz256_sqr_mont_adx +.type ecp_nistz256_sqr_mont_adx,@function +.align 32 +ecp_nistz256_sqr_mont_adx: +.cfi_startproc +_CET_ENDBR + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 +.Lsqrx_body: + movq 0(%rsi),%rdx + movq 8(%rsi),%r14 + movq 16(%rsi),%r15 + movq 24(%rsi),%r8 + leaq -128(%rsi),%rsi + + call __ecp_nistz256_sqr_montx + + movq 0(%rsp),%r15 +.cfi_restore %r15 + movq 8(%rsp),%r14 +.cfi_restore %r14 + movq 16(%rsp),%r13 +.cfi_restore %r13 + movq 24(%rsp),%r12 +.cfi_restore %r12 + movq 32(%rsp),%rbx +.cfi_restore %rbx + movq 40(%rsp),%rbp +.cfi_restore %rbp + leaq 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lsqrx_epilogue: + ret +.cfi_endproc +.size ecp_nistz256_sqr_mont_adx,.-ecp_nistz256_sqr_mont_adx + +.type __ecp_nistz256_sqr_montx,@function +.align 32 +__ecp_nistz256_sqr_montx: +.cfi_startproc + mulxq %r14,%r9,%r10 + mulxq %r15,%rcx,%r11 + xorl %eax,%eax + adcq %rcx,%r10 + mulxq %r8,%rbp,%r12 + movq %r14,%rdx + adcq %rbp,%r11 + adcq $0,%r12 + xorq %r13,%r13 + + + mulxq %r15,%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq %r8,%rcx,%rbp + movq %r15,%rdx + adcxq %rcx,%r12 + adoxq %rbp,%r13 + adcq $0,%r13 + + + mulxq %r8,%rcx,%r14 + movq 0+128(%rsi),%rdx + xorq %r15,%r15 + adcxq %r9,%r9 + adoxq %rcx,%r13 + adcxq %r10,%r10 + adoxq %r15,%r14 + + mulxq %rdx,%r8,%rbp + movq 8+128(%rsi),%rdx + adcxq %r11,%r11 + adoxq %rbp,%r9 + adcxq %r12,%r12 + mulxq %rdx,%rcx,%rax + movq 16+128(%rsi),%rdx + adcxq %r13,%r13 + adoxq %rcx,%r10 + adcxq %r14,%r14 +.byte 0x67 + mulxq %rdx,%rcx,%rbp + movq 24+128(%rsi),%rdx + adoxq %rax,%r11 + adcxq %r15,%r15 + adoxq %rcx,%r12 + movq $32,%rsi + adoxq %rbp,%r13 +.byte 0x67,0x67 + mulxq %rdx,%rcx,%rax + movq 
.Lpoly+24(%rip),%rdx + adoxq %rcx,%r14 + shlxq %rsi,%r8,%rcx + adoxq %rax,%r15 + shrxq %rsi,%r8,%rax + movq %rdx,%rbp + + + addq %rcx,%r9 + adcq %rax,%r10 + + mulxq %r8,%rcx,%r8 + adcq %rcx,%r11 + shlxq %rsi,%r9,%rcx + adcq $0,%r8 + shrxq %rsi,%r9,%rax + + + addq %rcx,%r10 + adcq %rax,%r11 + + mulxq %r9,%rcx,%r9 + adcq %rcx,%r8 + shlxq %rsi,%r10,%rcx + adcq $0,%r9 + shrxq %rsi,%r10,%rax + + + addq %rcx,%r11 + adcq %rax,%r8 + + mulxq %r10,%rcx,%r10 + adcq %rcx,%r9 + shlxq %rsi,%r11,%rcx + adcq $0,%r10 + shrxq %rsi,%r11,%rax + + + addq %rcx,%r8 + adcq %rax,%r9 + + mulxq %r11,%rcx,%r11 + adcq %rcx,%r10 + adcq $0,%r11 + + xorq %rdx,%rdx + addq %r8,%r12 + movq .Lpoly+8(%rip),%rsi + adcq %r9,%r13 + movq %r12,%r8 + adcq %r10,%r14 + adcq %r11,%r15 + movq %r13,%r9 + adcq $0,%rdx + + subq $-1,%r12 + movq %r14,%r10 + sbbq %rsi,%r13 + sbbq $0,%r14 + movq %r15,%r11 + sbbq %rbp,%r15 + sbbq $0,%rdx + + cmovcq %r8,%r12 + cmovcq %r9,%r13 + movq %r12,0(%rdi) + cmovcq %r10,%r14 + movq %r13,8(%rdi) + cmovcq %r11,%r15 + movq %r14,16(%rdi) + movq %r15,24(%rdi) + + ret +.cfi_endproc +.size __ecp_nistz256_sqr_montx,.-__ecp_nistz256_sqr_montx + + +.globl ecp_nistz256_select_w5_nohw +.hidden ecp_nistz256_select_w5_nohw +.type ecp_nistz256_select_w5_nohw,@function +.align 32 +ecp_nistz256_select_w5_nohw: +.cfi_startproc +_CET_ENDBR + movdqa .LOne(%rip),%xmm0 + movd %edx,%xmm1 + + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + + movdqa %xmm0,%xmm8 + pshufd $0,%xmm1,%xmm1 + + movq $16,%rax +.Lselect_loop_sse_w5: + + movdqa %xmm8,%xmm15 + paddd %xmm0,%xmm8 + pcmpeqd %xmm1,%xmm15 + + movdqa 0(%rsi),%xmm9 + movdqa 16(%rsi),%xmm10 + movdqa 32(%rsi),%xmm11 + movdqa 48(%rsi),%xmm12 + movdqa 64(%rsi),%xmm13 + movdqa 80(%rsi),%xmm14 + leaq 96(%rsi),%rsi + + pand %xmm15,%xmm9 + pand %xmm15,%xmm10 + por %xmm9,%xmm2 + pand %xmm15,%xmm11 + por %xmm10,%xmm3 + pand %xmm15,%xmm12 + por %xmm11,%xmm4 + pand %xmm15,%xmm13 + por %xmm12,%xmm5 + pand %xmm15,%xmm14 + por %xmm13,%xmm6 + por %xmm14,%xmm7 + + decq %rax + jnz .Lselect_loop_sse_w5 + + movdqu %xmm2,0(%rdi) + movdqu %xmm3,16(%rdi) + movdqu %xmm4,32(%rdi) + movdqu %xmm5,48(%rdi) + movdqu %xmm6,64(%rdi) + movdqu %xmm7,80(%rdi) + ret +.cfi_endproc +.LSEH_end_ecp_nistz256_select_w5_nohw: +.size ecp_nistz256_select_w5_nohw,.-ecp_nistz256_select_w5_nohw + + + +.globl ecp_nistz256_select_w7_nohw +.hidden ecp_nistz256_select_w7_nohw +.type ecp_nistz256_select_w7_nohw,@function +.align 32 +ecp_nistz256_select_w7_nohw: +.cfi_startproc +_CET_ENDBR + movdqa .LOne(%rip),%xmm8 + movd %edx,%xmm1 + + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + + movdqa %xmm8,%xmm0 + pshufd $0,%xmm1,%xmm1 + movq $64,%rax + +.Lselect_loop_sse_w7: + movdqa %xmm8,%xmm15 + paddd %xmm0,%xmm8 + movdqa 0(%rsi),%xmm9 + movdqa 16(%rsi),%xmm10 + pcmpeqd %xmm1,%xmm15 + movdqa 32(%rsi),%xmm11 + movdqa 48(%rsi),%xmm12 + leaq 64(%rsi),%rsi + + pand %xmm15,%xmm9 + pand %xmm15,%xmm10 + por %xmm9,%xmm2 + pand %xmm15,%xmm11 + por %xmm10,%xmm3 + pand %xmm15,%xmm12 + por %xmm11,%xmm4 + prefetcht0 255(%rsi) + por %xmm12,%xmm5 + + decq %rax + jnz .Lselect_loop_sse_w7 + + movdqu %xmm2,0(%rdi) + movdqu %xmm3,16(%rdi) + movdqu %xmm4,32(%rdi) + movdqu %xmm5,48(%rdi) + ret +.cfi_endproc +.LSEH_end_ecp_nistz256_select_w7_nohw: +.size ecp_nistz256_select_w7_nohw,.-ecp_nistz256_select_w7_nohw + + +.globl ecp_nistz256_select_w5_avx2 +.hidden ecp_nistz256_select_w5_avx2 +.type ecp_nistz256_select_w5_avx2,@function +.align 32 
+ecp_nistz256_select_w5_avx2: +.cfi_startproc +_CET_ENDBR + vzeroupper + vmovdqa .LTwo(%rip),%ymm0 + + vpxor %ymm2,%ymm2,%ymm2 + vpxor %ymm3,%ymm3,%ymm3 + vpxor %ymm4,%ymm4,%ymm4 + + vmovdqa .LOne(%rip),%ymm5 + vmovdqa .LTwo(%rip),%ymm10 + + vmovd %edx,%xmm1 + vpermd %ymm1,%ymm2,%ymm1 + + movq $8,%rax +.Lselect_loop_avx2_w5: + + vmovdqa 0(%rsi),%ymm6 + vmovdqa 32(%rsi),%ymm7 + vmovdqa 64(%rsi),%ymm8 + + vmovdqa 96(%rsi),%ymm11 + vmovdqa 128(%rsi),%ymm12 + vmovdqa 160(%rsi),%ymm13 + + vpcmpeqd %ymm1,%ymm5,%ymm9 + vpcmpeqd %ymm1,%ymm10,%ymm14 + + vpaddd %ymm0,%ymm5,%ymm5 + vpaddd %ymm0,%ymm10,%ymm10 + leaq 192(%rsi),%rsi + + vpand %ymm9,%ymm6,%ymm6 + vpand %ymm9,%ymm7,%ymm7 + vpand %ymm9,%ymm8,%ymm8 + vpand %ymm14,%ymm11,%ymm11 + vpand %ymm14,%ymm12,%ymm12 + vpand %ymm14,%ymm13,%ymm13 + + vpxor %ymm6,%ymm2,%ymm2 + vpxor %ymm7,%ymm3,%ymm3 + vpxor %ymm8,%ymm4,%ymm4 + vpxor %ymm11,%ymm2,%ymm2 + vpxor %ymm12,%ymm3,%ymm3 + vpxor %ymm13,%ymm4,%ymm4 + + decq %rax + jnz .Lselect_loop_avx2_w5 + + vmovdqu %ymm2,0(%rdi) + vmovdqu %ymm3,32(%rdi) + vmovdqu %ymm4,64(%rdi) + vzeroupper + ret +.cfi_endproc +.LSEH_end_ecp_nistz256_select_w5_avx2: +.size ecp_nistz256_select_w5_avx2,.-ecp_nistz256_select_w5_avx2 + + + +.globl ecp_nistz256_select_w7_avx2 +.hidden ecp_nistz256_select_w7_avx2 +.type ecp_nistz256_select_w7_avx2,@function +.align 32 +ecp_nistz256_select_w7_avx2: +.cfi_startproc +_CET_ENDBR + vzeroupper + vmovdqa .LThree(%rip),%ymm0 + + vpxor %ymm2,%ymm2,%ymm2 + vpxor %ymm3,%ymm3,%ymm3 + + vmovdqa .LOne(%rip),%ymm4 + vmovdqa .LTwo(%rip),%ymm8 + vmovdqa .LThree(%rip),%ymm12 + + vmovd %edx,%xmm1 + vpermd %ymm1,%ymm2,%ymm1 + + + movq $21,%rax +.Lselect_loop_avx2_w7: + + vmovdqa 0(%rsi),%ymm5 + vmovdqa 32(%rsi),%ymm6 + + vmovdqa 64(%rsi),%ymm9 + vmovdqa 96(%rsi),%ymm10 + + vmovdqa 128(%rsi),%ymm13 + vmovdqa 160(%rsi),%ymm14 + + vpcmpeqd %ymm1,%ymm4,%ymm7 + vpcmpeqd %ymm1,%ymm8,%ymm11 + vpcmpeqd %ymm1,%ymm12,%ymm15 + + vpaddd %ymm0,%ymm4,%ymm4 + vpaddd %ymm0,%ymm8,%ymm8 + vpaddd %ymm0,%ymm12,%ymm12 + leaq 192(%rsi),%rsi + + vpand %ymm7,%ymm5,%ymm5 + vpand %ymm7,%ymm6,%ymm6 + vpand %ymm11,%ymm9,%ymm9 + vpand %ymm11,%ymm10,%ymm10 + vpand %ymm15,%ymm13,%ymm13 + vpand %ymm15,%ymm14,%ymm14 + + vpxor %ymm5,%ymm2,%ymm2 + vpxor %ymm6,%ymm3,%ymm3 + vpxor %ymm9,%ymm2,%ymm2 + vpxor %ymm10,%ymm3,%ymm3 + vpxor %ymm13,%ymm2,%ymm2 + vpxor %ymm14,%ymm3,%ymm3 + + decq %rax + jnz .Lselect_loop_avx2_w7 + + + vmovdqa 0(%rsi),%ymm5 + vmovdqa 32(%rsi),%ymm6 + + vpcmpeqd %ymm1,%ymm4,%ymm7 + + vpand %ymm7,%ymm5,%ymm5 + vpand %ymm7,%ymm6,%ymm6 + + vpxor %ymm5,%ymm2,%ymm2 + vpxor %ymm6,%ymm3,%ymm3 + + vmovdqu %ymm2,0(%rdi) + vmovdqu %ymm3,32(%rdi) + vzeroupper + ret +.cfi_endproc +.LSEH_end_ecp_nistz256_select_w7_avx2: +.size ecp_nistz256_select_w7_avx2,.-ecp_nistz256_select_w7_avx2 +.type __ecp_nistz256_add_toq,@function +.align 32 +__ecp_nistz256_add_toq: +.cfi_startproc + xorq %r11,%r11 + addq 0(%rbx),%r12 + adcq 8(%rbx),%r13 + movq %r12,%rax + adcq 16(%rbx),%r8 + adcq 24(%rbx),%r9 + movq %r13,%rbp + adcq $0,%r11 + + subq $-1,%r12 + movq %r8,%rcx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%r10 + sbbq %r15,%r9 + sbbq $0,%r11 + + cmovcq %rax,%r12 + cmovcq %rbp,%r13 + movq %r12,0(%rdi) + cmovcq %rcx,%r8 + movq %r13,8(%rdi) + cmovcq %r10,%r9 + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + ret +.cfi_endproc +.size __ecp_nistz256_add_toq,.-__ecp_nistz256_add_toq + +.type __ecp_nistz256_sub_fromq,@function +.align 32 +__ecp_nistz256_sub_fromq: +.cfi_startproc + subq 0(%rbx),%r12 + sbbq 8(%rbx),%r13 + movq %r12,%rax + sbbq 16(%rbx),%r8 + 
sbbq 24(%rbx),%r9 + movq %r13,%rbp + sbbq %r11,%r11 + + addq $-1,%r12 + movq %r8,%rcx + adcq %r14,%r13 + adcq $0,%r8 + movq %r9,%r10 + adcq %r15,%r9 + testq %r11,%r11 + + cmovzq %rax,%r12 + cmovzq %rbp,%r13 + movq %r12,0(%rdi) + cmovzq %rcx,%r8 + movq %r13,8(%rdi) + cmovzq %r10,%r9 + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + ret +.cfi_endproc +.size __ecp_nistz256_sub_fromq,.-__ecp_nistz256_sub_fromq + +.type __ecp_nistz256_subq,@function +.align 32 +__ecp_nistz256_subq: +.cfi_startproc + subq %r12,%rax + sbbq %r13,%rbp + movq %rax,%r12 + sbbq %r8,%rcx + sbbq %r9,%r10 + movq %rbp,%r13 + sbbq %r11,%r11 + + addq $-1,%rax + movq %rcx,%r8 + adcq %r14,%rbp + adcq $0,%rcx + movq %r10,%r9 + adcq %r15,%r10 + testq %r11,%r11 + + cmovnzq %rax,%r12 + cmovnzq %rbp,%r13 + cmovnzq %rcx,%r8 + cmovnzq %r10,%r9 + + ret +.cfi_endproc +.size __ecp_nistz256_subq,.-__ecp_nistz256_subq + +.type __ecp_nistz256_mul_by_2q,@function +.align 32 +__ecp_nistz256_mul_by_2q: +.cfi_startproc + xorq %r11,%r11 + addq %r12,%r12 + adcq %r13,%r13 + movq %r12,%rax + adcq %r8,%r8 + adcq %r9,%r9 + movq %r13,%rbp + adcq $0,%r11 + + subq $-1,%r12 + movq %r8,%rcx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%r10 + sbbq %r15,%r9 + sbbq $0,%r11 + + cmovcq %rax,%r12 + cmovcq %rbp,%r13 + movq %r12,0(%rdi) + cmovcq %rcx,%r8 + movq %r13,8(%rdi) + cmovcq %r10,%r9 + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + ret +.cfi_endproc +.size __ecp_nistz256_mul_by_2q,.-__ecp_nistz256_mul_by_2q +.globl ecp_nistz256_point_double_nohw +.hidden ecp_nistz256_point_double_nohw +.type ecp_nistz256_point_double_nohw,@function +.align 32 +ecp_nistz256_point_double_nohw: +.cfi_startproc +_CET_ENDBR + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $160+8,%rsp +.cfi_adjust_cfa_offset 32*5+8 +.Lpoint_doubleq_body: + +.Lpoint_double_shortcutq: + movdqu 0(%rsi),%xmm0 + movq %rsi,%rbx + movdqu 16(%rsi),%xmm1 + movq 32+0(%rsi),%r12 + movq 32+8(%rsi),%r13 + movq 32+16(%rsi),%r8 + movq 32+24(%rsi),%r9 + movq .Lpoly+8(%rip),%r14 + movq .Lpoly+24(%rip),%r15 + movdqa %xmm0,96(%rsp) + movdqa %xmm1,96+16(%rsp) + leaq 32(%rdi),%r10 + leaq 64(%rdi),%r11 +.byte 102,72,15,110,199 +.byte 102,73,15,110,202 +.byte 102,73,15,110,211 + + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_by_2q + + movq 64+0(%rsi),%rax + movq 64+8(%rsi),%r14 + movq 64+16(%rsi),%r15 + movq 64+24(%rsi),%r8 + leaq 64-0(%rsi),%rsi + leaq 64(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + movq 0+0(%rsp),%rax + movq 8+0(%rsp),%r14 + leaq 0+0(%rsp),%rsi + movq 16+0(%rsp),%r15 + movq 24+0(%rsp),%r8 + leaq 0(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + movq 32(%rbx),%rax + movq 64+0(%rbx),%r9 + movq 64+8(%rbx),%r10 + movq 64+16(%rbx),%r11 + movq 64+24(%rbx),%r12 + leaq 64-0(%rbx),%rsi + leaq 32(%rbx),%rbx +.byte 102,72,15,126,215 + call __ecp_nistz256_mul_montq + call __ecp_nistz256_mul_by_2q + + movq 96+0(%rsp),%r12 + movq 96+8(%rsp),%r13 + leaq 64(%rsp),%rbx + movq 96+16(%rsp),%r8 + movq 96+24(%rsp),%r9 + leaq 32(%rsp),%rdi + call __ecp_nistz256_add_toq + + movq 96+0(%rsp),%r12 + movq 96+8(%rsp),%r13 + leaq 64(%rsp),%rbx + movq 96+16(%rsp),%r8 + movq 96+24(%rsp),%r9 + leaq 64(%rsp),%rdi + call __ecp_nistz256_sub_fromq + + movq 0+0(%rsp),%rax + movq 8+0(%rsp),%r14 + leaq 0+0(%rsp),%rsi + movq 
16+0(%rsp),%r15 + movq 24+0(%rsp),%r8 +.byte 102,72,15,126,207 + call __ecp_nistz256_sqr_montq + xorq %r9,%r9 + movq %r12,%rax + addq $-1,%r12 + movq %r13,%r10 + adcq %rsi,%r13 + movq %r14,%rcx + adcq $0,%r14 + movq %r15,%r8 + adcq %rbp,%r15 + adcq $0,%r9 + xorq %rsi,%rsi + testq $1,%rax + + cmovzq %rax,%r12 + cmovzq %r10,%r13 + cmovzq %rcx,%r14 + cmovzq %r8,%r15 + cmovzq %rsi,%r9 + + movq %r13,%rax + shrq $1,%r12 + shlq $63,%rax + movq %r14,%r10 + shrq $1,%r13 + orq %rax,%r12 + shlq $63,%r10 + movq %r15,%rcx + shrq $1,%r14 + orq %r10,%r13 + shlq $63,%rcx + movq %r12,0(%rdi) + shrq $1,%r15 + movq %r13,8(%rdi) + shlq $63,%r9 + orq %rcx,%r14 + orq %r9,%r15 + movq %r14,16(%rdi) + movq %r15,24(%rdi) + movq 64(%rsp),%rax + leaq 64(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq 0+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 32(%rsp),%rdi + call __ecp_nistz256_mul_montq + + leaq 128(%rsp),%rdi + call __ecp_nistz256_mul_by_2q + + leaq 32(%rsp),%rbx + leaq 32(%rsp),%rdi + call __ecp_nistz256_add_toq + + movq 96(%rsp),%rax + leaq 96(%rsp),%rbx + movq 0+0(%rsp),%r9 + movq 8+0(%rsp),%r10 + leaq 0+0(%rsp),%rsi + movq 16+0(%rsp),%r11 + movq 24+0(%rsp),%r12 + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_montq + + leaq 128(%rsp),%rdi + call __ecp_nistz256_mul_by_2q + + movq 0+32(%rsp),%rax + movq 8+32(%rsp),%r14 + leaq 0+32(%rsp),%rsi + movq 16+32(%rsp),%r15 + movq 24+32(%rsp),%r8 +.byte 102,72,15,126,199 + call __ecp_nistz256_sqr_montq + + leaq 128(%rsp),%rbx + movq %r14,%r8 + movq %r15,%r9 + movq %rsi,%r14 + movq %rbp,%r15 + call __ecp_nistz256_sub_fromq + + movq 0+0(%rsp),%rax + movq 0+8(%rsp),%rbp + movq 0+16(%rsp),%rcx + movq 0+24(%rsp),%r10 + leaq 0(%rsp),%rdi + call __ecp_nistz256_subq + + movq 32(%rsp),%rax + leaq 32(%rsp),%rbx + movq %r12,%r14 + xorl %ecx,%ecx + movq %r12,0+0(%rsp) + movq %r13,%r10 + movq %r13,0+8(%rsp) + cmovzq %r8,%r11 + movq %r8,0+16(%rsp) + leaq 0-0(%rsp),%rsi + cmovzq %r9,%r12 + movq %r9,0+24(%rsp) + movq %r14,%r9 + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_montq + +.byte 102,72,15,126,203 +.byte 102,72,15,126,207 + call __ecp_nistz256_sub_fromq + + leaq 160+56(%rsp),%rsi +.cfi_def_cfa %rsi,8 + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbx +.cfi_restore %rbx + movq -8(%rsi),%rbp +.cfi_restore %rbp + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lpoint_doubleq_epilogue: + ret +.cfi_endproc +.size ecp_nistz256_point_double_nohw,.-ecp_nistz256_point_double_nohw +.globl ecp_nistz256_point_add_nohw +.hidden ecp_nistz256_point_add_nohw +.type ecp_nistz256_point_add_nohw,@function +.align 32 +ecp_nistz256_point_add_nohw: +.cfi_startproc +_CET_ENDBR + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $576+8,%rsp +.cfi_adjust_cfa_offset 32*18+8 +.Lpoint_addq_body: + + movdqu 0(%rsi),%xmm0 + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 + movdqu 48(%rsi),%xmm3 + movdqu 64(%rsi),%xmm4 + movdqu 80(%rsi),%xmm5 + movq %rsi,%rbx + movq %rdx,%rsi + movdqa %xmm0,384(%rsp) + movdqa %xmm1,384+16(%rsp) + movdqa %xmm2,416(%rsp) + movdqa %xmm3,416+16(%rsp) + movdqa %xmm4,448(%rsp) + movdqa 
%xmm5,448+16(%rsp) + por %xmm4,%xmm5 + + movdqu 0(%rsi),%xmm0 + pshufd $0xb1,%xmm5,%xmm3 + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 + por %xmm3,%xmm5 + movdqu 48(%rsi),%xmm3 + movq 64+0(%rsi),%rax + movq 64+8(%rsi),%r14 + movq 64+16(%rsi),%r15 + movq 64+24(%rsi),%r8 + movdqa %xmm0,480(%rsp) + pshufd $0x1e,%xmm5,%xmm4 + movdqa %xmm1,480+16(%rsp) + movdqu 64(%rsi),%xmm0 + movdqu 80(%rsi),%xmm1 + movdqa %xmm2,512(%rsp) + movdqa %xmm3,512+16(%rsp) + por %xmm4,%xmm5 + pxor %xmm4,%xmm4 + por %xmm0,%xmm1 +.byte 102,72,15,110,199 + + leaq 64-0(%rsi),%rsi + movq %rax,544+0(%rsp) + movq %r14,544+8(%rsp) + movq %r15,544+16(%rsp) + movq %r8,544+24(%rsp) + leaq 96(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + pcmpeqd %xmm4,%xmm5 + pshufd $0xb1,%xmm1,%xmm4 + por %xmm1,%xmm4 + pshufd $0,%xmm5,%xmm5 + pshufd $0x1e,%xmm4,%xmm3 + por %xmm3,%xmm4 + pxor %xmm3,%xmm3 + pcmpeqd %xmm3,%xmm4 + pshufd $0,%xmm4,%xmm4 + movq 64+0(%rbx),%rax + movq 64+8(%rbx),%r14 + movq 64+16(%rbx),%r15 + movq 64+24(%rbx),%r8 +.byte 102,72,15,110,203 + + leaq 64-0(%rbx),%rsi + leaq 32(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + movq 544(%rsp),%rax + leaq 544(%rsp),%rbx + movq 0+96(%rsp),%r9 + movq 8+96(%rsp),%r10 + leaq 0+96(%rsp),%rsi + movq 16+96(%rsp),%r11 + movq 24+96(%rsp),%r12 + leaq 224(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 448(%rsp),%rax + leaq 448(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq 0+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 256(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 416(%rsp),%rax + leaq 416(%rsp),%rbx + movq 0+224(%rsp),%r9 + movq 8+224(%rsp),%r10 + leaq 0+224(%rsp),%rsi + movq 16+224(%rsp),%r11 + movq 24+224(%rsp),%r12 + leaq 224(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 512(%rsp),%rax + leaq 512(%rsp),%rbx + movq 0+256(%rsp),%r9 + movq 8+256(%rsp),%r10 + leaq 0+256(%rsp),%rsi + movq 16+256(%rsp),%r11 + movq 24+256(%rsp),%r12 + leaq 256(%rsp),%rdi + call __ecp_nistz256_mul_montq + + leaq 224(%rsp),%rbx + leaq 64(%rsp),%rdi + call __ecp_nistz256_sub_fromq + + orq %r13,%r12 + movdqa %xmm4,%xmm2 + orq %r8,%r12 + orq %r9,%r12 + por %xmm5,%xmm2 +.byte 102,73,15,110,220 + + movq 384(%rsp),%rax + leaq 384(%rsp),%rbx + movq 0+96(%rsp),%r9 + movq 8+96(%rsp),%r10 + leaq 0+96(%rsp),%rsi + movq 16+96(%rsp),%r11 + movq 24+96(%rsp),%r12 + leaq 160(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 480(%rsp),%rax + leaq 480(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq 0+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 192(%rsp),%rdi + call __ecp_nistz256_mul_montq + + leaq 160(%rsp),%rbx + leaq 0(%rsp),%rdi + call __ecp_nistz256_sub_fromq + + orq %r13,%r12 + orq %r8,%r12 + orq %r9,%r12 + +.byte 102,73,15,126,208 +.byte 102,73,15,126,217 + orq %r8,%r12 +.byte 0x3e + jnz .Ladd_proceedq + + + + testq %r9,%r9 + jz .Ladd_doubleq + + + + + + +.byte 102,72,15,126,199 + pxor %xmm0,%xmm0 + movdqu %xmm0,0(%rdi) + movdqu %xmm0,16(%rdi) + movdqu %xmm0,32(%rdi) + movdqu %xmm0,48(%rdi) + movdqu %xmm0,64(%rdi) + movdqu %xmm0,80(%rdi) + jmp .Ladd_doneq + +.align 32 +.Ladd_doubleq: +.byte 102,72,15,126,206 +.byte 102,72,15,126,199 + addq $416,%rsp +.cfi_adjust_cfa_offset -416 + jmp .Lpoint_double_shortcutq +.cfi_adjust_cfa_offset 416 + +.align 32 +.Ladd_proceedq: + movq 0+64(%rsp),%rax + movq 8+64(%rsp),%r14 + leaq 0+64(%rsp),%rsi + movq 16+64(%rsp),%r15 + movq 24+64(%rsp),%r8 + leaq 96(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + movq 448(%rsp),%rax + leaq 448(%rsp),%rbx + movq 0+0(%rsp),%r9 + movq 
8+0(%rsp),%r10 + leaq 0+0(%rsp),%rsi + movq 16+0(%rsp),%r11 + movq 24+0(%rsp),%r12 + leaq 352(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 0+0(%rsp),%rax + movq 8+0(%rsp),%r14 + leaq 0+0(%rsp),%rsi + movq 16+0(%rsp),%r15 + movq 24+0(%rsp),%r8 + leaq 32(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + movq 544(%rsp),%rax + leaq 544(%rsp),%rbx + movq 0+352(%rsp),%r9 + movq 8+352(%rsp),%r10 + leaq 0+352(%rsp),%rsi + movq 16+352(%rsp),%r11 + movq 24+352(%rsp),%r12 + leaq 352(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 0(%rsp),%rax + leaq 0(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq 0+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 128(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 160(%rsp),%rax + leaq 160(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq 0+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 192(%rsp),%rdi + call __ecp_nistz256_mul_montq + + + + + xorq %r11,%r11 + addq %r12,%r12 + leaq 96(%rsp),%rsi + adcq %r13,%r13 + movq %r12,%rax + adcq %r8,%r8 + adcq %r9,%r9 + movq %r13,%rbp + adcq $0,%r11 + + subq $-1,%r12 + movq %r8,%rcx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%r10 + sbbq %r15,%r9 + sbbq $0,%r11 + + cmovcq %rax,%r12 + movq 0(%rsi),%rax + cmovcq %rbp,%r13 + movq 8(%rsi),%rbp + cmovcq %rcx,%r8 + movq 16(%rsi),%rcx + cmovcq %r10,%r9 + movq 24(%rsi),%r10 + + call __ecp_nistz256_subq + + leaq 128(%rsp),%rbx + leaq 288(%rsp),%rdi + call __ecp_nistz256_sub_fromq + + movq 192+0(%rsp),%rax + movq 192+8(%rsp),%rbp + movq 192+16(%rsp),%rcx + movq 192+24(%rsp),%r10 + leaq 320(%rsp),%rdi + + call __ecp_nistz256_subq + + movq %r12,0(%rdi) + movq %r13,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + movq 128(%rsp),%rax + leaq 128(%rsp),%rbx + movq 0+224(%rsp),%r9 + movq 8+224(%rsp),%r10 + leaq 0+224(%rsp),%rsi + movq 16+224(%rsp),%r11 + movq 24+224(%rsp),%r12 + leaq 256(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 320(%rsp),%rax + leaq 320(%rsp),%rbx + movq 0+64(%rsp),%r9 + movq 8+64(%rsp),%r10 + leaq 0+64(%rsp),%rsi + movq 16+64(%rsp),%r11 + movq 24+64(%rsp),%r12 + leaq 320(%rsp),%rdi + call __ecp_nistz256_mul_montq + + leaq 256(%rsp),%rbx + leaq 320(%rsp),%rdi + call __ecp_nistz256_sub_fromq + +.byte 102,72,15,126,199 + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 352(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 352+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 544(%rsp),%xmm2 + pand 544+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 448(%rsp),%xmm2 + pand 448+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,64(%rdi) + movdqu %xmm3,80(%rdi) + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 288(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 288+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 480(%rsp),%xmm2 + pand 480+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 384(%rsp),%xmm2 + pand 384+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,0(%rdi) + movdqu %xmm3,16(%rdi) + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 320(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 320+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 512(%rsp),%xmm2 + pand 512+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + 
movdqa %xmm4,%xmm3 + pand 416(%rsp),%xmm2 + pand 416+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,32(%rdi) + movdqu %xmm3,48(%rdi) + +.Ladd_doneq: + leaq 576+56(%rsp),%rsi +.cfi_def_cfa %rsi,8 + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbx +.cfi_restore %rbx + movq -8(%rsi),%rbp +.cfi_restore %rbp + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lpoint_addq_epilogue: + ret +.cfi_endproc +.size ecp_nistz256_point_add_nohw,.-ecp_nistz256_point_add_nohw +.globl ecp_nistz256_point_add_affine_nohw +.hidden ecp_nistz256_point_add_affine_nohw +.type ecp_nistz256_point_add_affine_nohw,@function +.align 32 +ecp_nistz256_point_add_affine_nohw: +.cfi_startproc +_CET_ENDBR + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $480+8,%rsp +.cfi_adjust_cfa_offset 32*15+8 +.Ladd_affineq_body: + + movdqu 0(%rsi),%xmm0 + movq %rdx,%rbx + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 + movdqu 48(%rsi),%xmm3 + movdqu 64(%rsi),%xmm4 + movdqu 80(%rsi),%xmm5 + movq 64+0(%rsi),%rax + movq 64+8(%rsi),%r14 + movq 64+16(%rsi),%r15 + movq 64+24(%rsi),%r8 + movdqa %xmm0,320(%rsp) + movdqa %xmm1,320+16(%rsp) + movdqa %xmm2,352(%rsp) + movdqa %xmm3,352+16(%rsp) + movdqa %xmm4,384(%rsp) + movdqa %xmm5,384+16(%rsp) + por %xmm4,%xmm5 + + movdqu 0(%rbx),%xmm0 + pshufd $0xb1,%xmm5,%xmm3 + movdqu 16(%rbx),%xmm1 + movdqu 32(%rbx),%xmm2 + por %xmm3,%xmm5 + movdqu 48(%rbx),%xmm3 + movdqa %xmm0,416(%rsp) + pshufd $0x1e,%xmm5,%xmm4 + movdqa %xmm1,416+16(%rsp) + por %xmm0,%xmm1 +.byte 102,72,15,110,199 + movdqa %xmm2,448(%rsp) + movdqa %xmm3,448+16(%rsp) + por %xmm2,%xmm3 + por %xmm4,%xmm5 + pxor %xmm4,%xmm4 + por %xmm1,%xmm3 + + leaq 64-0(%rsi),%rsi + leaq 32(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + pcmpeqd %xmm4,%xmm5 + pshufd $0xb1,%xmm3,%xmm4 + movq 0(%rbx),%rax + + movq %r12,%r9 + por %xmm3,%xmm4 + pshufd $0,%xmm5,%xmm5 + pshufd $0x1e,%xmm4,%xmm3 + movq %r13,%r10 + por %xmm3,%xmm4 + pxor %xmm3,%xmm3 + movq %r14,%r11 + pcmpeqd %xmm3,%xmm4 + pshufd $0,%xmm4,%xmm4 + + leaq 32-0(%rsp),%rsi + movq %r15,%r12 + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_montq + + leaq 320(%rsp),%rbx + leaq 64(%rsp),%rdi + call __ecp_nistz256_sub_fromq + + movq 384(%rsp),%rax + leaq 384(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq 0+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 32(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 384(%rsp),%rax + leaq 384(%rsp),%rbx + movq 0+64(%rsp),%r9 + movq 8+64(%rsp),%r10 + leaq 0+64(%rsp),%rsi + movq 16+64(%rsp),%r11 + movq 24+64(%rsp),%r12 + leaq 288(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 448(%rsp),%rax + leaq 448(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq 0+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 32(%rsp),%rdi + call __ecp_nistz256_mul_montq + + leaq 352(%rsp),%rbx + leaq 96(%rsp),%rdi + call __ecp_nistz256_sub_fromq + + movq 0+64(%rsp),%rax + movq 8+64(%rsp),%r14 + leaq 0+64(%rsp),%rsi + movq 16+64(%rsp),%r15 + movq 24+64(%rsp),%r8 + leaq 128(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + movq 0+96(%rsp),%rax + movq 
8+96(%rsp),%r14 + leaq 0+96(%rsp),%rsi + movq 16+96(%rsp),%r15 + movq 24+96(%rsp),%r8 + leaq 192(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + movq 128(%rsp),%rax + leaq 128(%rsp),%rbx + movq 0+64(%rsp),%r9 + movq 8+64(%rsp),%r10 + leaq 0+64(%rsp),%rsi + movq 16+64(%rsp),%r11 + movq 24+64(%rsp),%r12 + leaq 160(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 320(%rsp),%rax + leaq 320(%rsp),%rbx + movq 0+128(%rsp),%r9 + movq 8+128(%rsp),%r10 + leaq 0+128(%rsp),%rsi + movq 16+128(%rsp),%r11 + movq 24+128(%rsp),%r12 + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_montq + + + + + xorq %r11,%r11 + addq %r12,%r12 + leaq 192(%rsp),%rsi + adcq %r13,%r13 + movq %r12,%rax + adcq %r8,%r8 + adcq %r9,%r9 + movq %r13,%rbp + adcq $0,%r11 + + subq $-1,%r12 + movq %r8,%rcx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%r10 + sbbq %r15,%r9 + sbbq $0,%r11 + + cmovcq %rax,%r12 + movq 0(%rsi),%rax + cmovcq %rbp,%r13 + movq 8(%rsi),%rbp + cmovcq %rcx,%r8 + movq 16(%rsi),%rcx + cmovcq %r10,%r9 + movq 24(%rsi),%r10 + + call __ecp_nistz256_subq + + leaq 160(%rsp),%rbx + leaq 224(%rsp),%rdi + call __ecp_nistz256_sub_fromq + + movq 0+0(%rsp),%rax + movq 0+8(%rsp),%rbp + movq 0+16(%rsp),%rcx + movq 0+24(%rsp),%r10 + leaq 64(%rsp),%rdi + + call __ecp_nistz256_subq + + movq %r12,0(%rdi) + movq %r13,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + movq 352(%rsp),%rax + leaq 352(%rsp),%rbx + movq 0+160(%rsp),%r9 + movq 8+160(%rsp),%r10 + leaq 0+160(%rsp),%rsi + movq 16+160(%rsp),%r11 + movq 24+160(%rsp),%r12 + leaq 32(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 96(%rsp),%rax + leaq 96(%rsp),%rbx + movq 0+64(%rsp),%r9 + movq 8+64(%rsp),%r10 + leaq 0+64(%rsp),%rsi + movq 16+64(%rsp),%r11 + movq 24+64(%rsp),%r12 + leaq 64(%rsp),%rdi + call __ecp_nistz256_mul_montq + + leaq 32(%rsp),%rbx + leaq 256(%rsp),%rdi + call __ecp_nistz256_sub_fromq + +.byte 102,72,15,126,199 + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 288(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 288+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand .LONE_mont(%rip),%xmm2 + pand .LONE_mont+16(%rip),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 384(%rsp),%xmm2 + pand 384+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,64(%rdi) + movdqu %xmm3,80(%rdi) + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 224(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 224+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 416(%rsp),%xmm2 + pand 416+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 320(%rsp),%xmm2 + pand 320+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,0(%rdi) + movdqu %xmm3,16(%rdi) + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 256(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 256+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 448(%rsp),%xmm2 + pand 448+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 352(%rsp),%xmm2 + pand 352+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,32(%rdi) + movdqu %xmm3,48(%rdi) + + leaq 480+56(%rsp),%rsi +.cfi_def_cfa %rsi,8 + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq 
-16(%rsi),%rbx +.cfi_restore %rbx + movq -8(%rsi),%rbp +.cfi_restore %rbp + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Ladd_affineq_epilogue: + ret +.cfi_endproc +.size ecp_nistz256_point_add_affine_nohw,.-ecp_nistz256_point_add_affine_nohw +.type __ecp_nistz256_add_tox,@function +.align 32 +__ecp_nistz256_add_tox: +.cfi_startproc + xorq %r11,%r11 + adcq 0(%rbx),%r12 + adcq 8(%rbx),%r13 + movq %r12,%rax + adcq 16(%rbx),%r8 + adcq 24(%rbx),%r9 + movq %r13,%rbp + adcq $0,%r11 + + xorq %r10,%r10 + sbbq $-1,%r12 + movq %r8,%rcx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%r10 + sbbq %r15,%r9 + sbbq $0,%r11 + + cmovcq %rax,%r12 + cmovcq %rbp,%r13 + movq %r12,0(%rdi) + cmovcq %rcx,%r8 + movq %r13,8(%rdi) + cmovcq %r10,%r9 + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + ret +.cfi_endproc +.size __ecp_nistz256_add_tox,.-__ecp_nistz256_add_tox + +.type __ecp_nistz256_sub_fromx,@function +.align 32 +__ecp_nistz256_sub_fromx: +.cfi_startproc + xorq %r11,%r11 + sbbq 0(%rbx),%r12 + sbbq 8(%rbx),%r13 + movq %r12,%rax + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + movq %r13,%rbp + sbbq $0,%r11 + + xorq %r10,%r10 + adcq $-1,%r12 + movq %r8,%rcx + adcq %r14,%r13 + adcq $0,%r8 + movq %r9,%r10 + adcq %r15,%r9 + + btq $0,%r11 + cmovncq %rax,%r12 + cmovncq %rbp,%r13 + movq %r12,0(%rdi) + cmovncq %rcx,%r8 + movq %r13,8(%rdi) + cmovncq %r10,%r9 + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + ret +.cfi_endproc +.size __ecp_nistz256_sub_fromx,.-__ecp_nistz256_sub_fromx + +.type __ecp_nistz256_subx,@function +.align 32 +__ecp_nistz256_subx: +.cfi_startproc + xorq %r11,%r11 + sbbq %r12,%rax + sbbq %r13,%rbp + movq %rax,%r12 + sbbq %r8,%rcx + sbbq %r9,%r10 + movq %rbp,%r13 + sbbq $0,%r11 + + xorq %r9,%r9 + adcq $-1,%rax + movq %rcx,%r8 + adcq %r14,%rbp + adcq $0,%rcx + movq %r10,%r9 + adcq %r15,%r10 + + btq $0,%r11 + cmovcq %rax,%r12 + cmovcq %rbp,%r13 + cmovcq %rcx,%r8 + cmovcq %r10,%r9 + + ret +.cfi_endproc +.size __ecp_nistz256_subx,.-__ecp_nistz256_subx + +.type __ecp_nistz256_mul_by_2x,@function +.align 32 +__ecp_nistz256_mul_by_2x: +.cfi_startproc + xorq %r11,%r11 + adcq %r12,%r12 + adcq %r13,%r13 + movq %r12,%rax + adcq %r8,%r8 + adcq %r9,%r9 + movq %r13,%rbp + adcq $0,%r11 + + xorq %r10,%r10 + sbbq $-1,%r12 + movq %r8,%rcx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%r10 + sbbq %r15,%r9 + sbbq $0,%r11 + + cmovcq %rax,%r12 + cmovcq %rbp,%r13 + movq %r12,0(%rdi) + cmovcq %rcx,%r8 + movq %r13,8(%rdi) + cmovcq %r10,%r9 + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + ret +.cfi_endproc +.size __ecp_nistz256_mul_by_2x,.-__ecp_nistz256_mul_by_2x +.globl ecp_nistz256_point_double_adx +.hidden ecp_nistz256_point_double_adx +.type ecp_nistz256_point_double_adx,@function +.align 32 +ecp_nistz256_point_double_adx: +.cfi_startproc +_CET_ENDBR + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $160+8,%rsp +.cfi_adjust_cfa_offset 32*5+8 +.Lpoint_doublex_body: + +.Lpoint_double_shortcutx: + movdqu 0(%rsi),%xmm0 + movq %rsi,%rbx + movdqu 16(%rsi),%xmm1 + movq 32+0(%rsi),%r12 + movq 32+8(%rsi),%r13 + movq 32+16(%rsi),%r8 + movq 32+24(%rsi),%r9 + movq .Lpoly+8(%rip),%r14 + movq .Lpoly+24(%rip),%r15 + movdqa %xmm0,96(%rsp) + movdqa %xmm1,96+16(%rsp) + leaq 32(%rdi),%r10 + leaq 64(%rdi),%r11 +.byte 102,72,15,110,199 +.byte 
102,73,15,110,202 +.byte 102,73,15,110,211 + + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_by_2x + + movq 64+0(%rsi),%rdx + movq 64+8(%rsi),%r14 + movq 64+16(%rsi),%r15 + movq 64+24(%rsi),%r8 + leaq 64-128(%rsi),%rsi + leaq 64(%rsp),%rdi + call __ecp_nistz256_sqr_montx + + movq 0+0(%rsp),%rdx + movq 8+0(%rsp),%r14 + leaq -128+0(%rsp),%rsi + movq 16+0(%rsp),%r15 + movq 24+0(%rsp),%r8 + leaq 0(%rsp),%rdi + call __ecp_nistz256_sqr_montx + + movq 32(%rbx),%rdx + movq 64+0(%rbx),%r9 + movq 64+8(%rbx),%r10 + movq 64+16(%rbx),%r11 + movq 64+24(%rbx),%r12 + leaq 64-128(%rbx),%rsi + leaq 32(%rbx),%rbx +.byte 102,72,15,126,215 + call __ecp_nistz256_mul_montx + call __ecp_nistz256_mul_by_2x + + movq 96+0(%rsp),%r12 + movq 96+8(%rsp),%r13 + leaq 64(%rsp),%rbx + movq 96+16(%rsp),%r8 + movq 96+24(%rsp),%r9 + leaq 32(%rsp),%rdi + call __ecp_nistz256_add_tox + + movq 96+0(%rsp),%r12 + movq 96+8(%rsp),%r13 + leaq 64(%rsp),%rbx + movq 96+16(%rsp),%r8 + movq 96+24(%rsp),%r9 + leaq 64(%rsp),%rdi + call __ecp_nistz256_sub_fromx + + movq 0+0(%rsp),%rdx + movq 8+0(%rsp),%r14 + leaq -128+0(%rsp),%rsi + movq 16+0(%rsp),%r15 + movq 24+0(%rsp),%r8 +.byte 102,72,15,126,207 + call __ecp_nistz256_sqr_montx + xorq %r9,%r9 + movq %r12,%rax + addq $-1,%r12 + movq %r13,%r10 + adcq %rsi,%r13 + movq %r14,%rcx + adcq $0,%r14 + movq %r15,%r8 + adcq %rbp,%r15 + adcq $0,%r9 + xorq %rsi,%rsi + testq $1,%rax + + cmovzq %rax,%r12 + cmovzq %r10,%r13 + cmovzq %rcx,%r14 + cmovzq %r8,%r15 + cmovzq %rsi,%r9 + + movq %r13,%rax + shrq $1,%r12 + shlq $63,%rax + movq %r14,%r10 + shrq $1,%r13 + orq %rax,%r12 + shlq $63,%r10 + movq %r15,%rcx + shrq $1,%r14 + orq %r10,%r13 + shlq $63,%rcx + movq %r12,0(%rdi) + shrq $1,%r15 + movq %r13,8(%rdi) + shlq $63,%r9 + orq %rcx,%r14 + orq %r9,%r15 + movq %r14,16(%rdi) + movq %r15,24(%rdi) + movq 64(%rsp),%rdx + leaq 64(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq -128+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 32(%rsp),%rdi + call __ecp_nistz256_mul_montx + + leaq 128(%rsp),%rdi + call __ecp_nistz256_mul_by_2x + + leaq 32(%rsp),%rbx + leaq 32(%rsp),%rdi + call __ecp_nistz256_add_tox + + movq 96(%rsp),%rdx + leaq 96(%rsp),%rbx + movq 0+0(%rsp),%r9 + movq 8+0(%rsp),%r10 + leaq -128+0(%rsp),%rsi + movq 16+0(%rsp),%r11 + movq 24+0(%rsp),%r12 + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_montx + + leaq 128(%rsp),%rdi + call __ecp_nistz256_mul_by_2x + + movq 0+32(%rsp),%rdx + movq 8+32(%rsp),%r14 + leaq -128+32(%rsp),%rsi + movq 16+32(%rsp),%r15 + movq 24+32(%rsp),%r8 +.byte 102,72,15,126,199 + call __ecp_nistz256_sqr_montx + + leaq 128(%rsp),%rbx + movq %r14,%r8 + movq %r15,%r9 + movq %rsi,%r14 + movq %rbp,%r15 + call __ecp_nistz256_sub_fromx + + movq 0+0(%rsp),%rax + movq 0+8(%rsp),%rbp + movq 0+16(%rsp),%rcx + movq 0+24(%rsp),%r10 + leaq 0(%rsp),%rdi + call __ecp_nistz256_subx + + movq 32(%rsp),%rdx + leaq 32(%rsp),%rbx + movq %r12,%r14 + xorl %ecx,%ecx + movq %r12,0+0(%rsp) + movq %r13,%r10 + movq %r13,0+8(%rsp) + cmovzq %r8,%r11 + movq %r8,0+16(%rsp) + leaq 0-128(%rsp),%rsi + cmovzq %r9,%r12 + movq %r9,0+24(%rsp) + movq %r14,%r9 + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_montx + +.byte 102,72,15,126,203 +.byte 102,72,15,126,207 + call __ecp_nistz256_sub_fromx + + leaq 160+56(%rsp),%rsi +.cfi_def_cfa %rsi,8 + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbx +.cfi_restore %rbx + movq -8(%rsi),%rbp +.cfi_restore %rbp + leaq 
(%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lpoint_doublex_epilogue: + ret +.cfi_endproc +.size ecp_nistz256_point_double_adx,.-ecp_nistz256_point_double_adx +.globl ecp_nistz256_point_add_adx +.hidden ecp_nistz256_point_add_adx +.type ecp_nistz256_point_add_adx,@function +.align 32 +ecp_nistz256_point_add_adx: +.cfi_startproc +_CET_ENDBR + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $576+8,%rsp +.cfi_adjust_cfa_offset 32*18+8 +.Lpoint_addx_body: + + movdqu 0(%rsi),%xmm0 + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 + movdqu 48(%rsi),%xmm3 + movdqu 64(%rsi),%xmm4 + movdqu 80(%rsi),%xmm5 + movq %rsi,%rbx + movq %rdx,%rsi + movdqa %xmm0,384(%rsp) + movdqa %xmm1,384+16(%rsp) + movdqa %xmm2,416(%rsp) + movdqa %xmm3,416+16(%rsp) + movdqa %xmm4,448(%rsp) + movdqa %xmm5,448+16(%rsp) + por %xmm4,%xmm5 + + movdqu 0(%rsi),%xmm0 + pshufd $0xb1,%xmm5,%xmm3 + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 + por %xmm3,%xmm5 + movdqu 48(%rsi),%xmm3 + movq 64+0(%rsi),%rdx + movq 64+8(%rsi),%r14 + movq 64+16(%rsi),%r15 + movq 64+24(%rsi),%r8 + movdqa %xmm0,480(%rsp) + pshufd $0x1e,%xmm5,%xmm4 + movdqa %xmm1,480+16(%rsp) + movdqu 64(%rsi),%xmm0 + movdqu 80(%rsi),%xmm1 + movdqa %xmm2,512(%rsp) + movdqa %xmm3,512+16(%rsp) + por %xmm4,%xmm5 + pxor %xmm4,%xmm4 + por %xmm0,%xmm1 +.byte 102,72,15,110,199 + + leaq 64-128(%rsi),%rsi + movq %rdx,544+0(%rsp) + movq %r14,544+8(%rsp) + movq %r15,544+16(%rsp) + movq %r8,544+24(%rsp) + leaq 96(%rsp),%rdi + call __ecp_nistz256_sqr_montx + + pcmpeqd %xmm4,%xmm5 + pshufd $0xb1,%xmm1,%xmm4 + por %xmm1,%xmm4 + pshufd $0,%xmm5,%xmm5 + pshufd $0x1e,%xmm4,%xmm3 + por %xmm3,%xmm4 + pxor %xmm3,%xmm3 + pcmpeqd %xmm3,%xmm4 + pshufd $0,%xmm4,%xmm4 + movq 64+0(%rbx),%rdx + movq 64+8(%rbx),%r14 + movq 64+16(%rbx),%r15 + movq 64+24(%rbx),%r8 +.byte 102,72,15,110,203 + + leaq 64-128(%rbx),%rsi + leaq 32(%rsp),%rdi + call __ecp_nistz256_sqr_montx + + movq 544(%rsp),%rdx + leaq 544(%rsp),%rbx + movq 0+96(%rsp),%r9 + movq 8+96(%rsp),%r10 + leaq -128+96(%rsp),%rsi + movq 16+96(%rsp),%r11 + movq 24+96(%rsp),%r12 + leaq 224(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 448(%rsp),%rdx + leaq 448(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq -128+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 256(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 416(%rsp),%rdx + leaq 416(%rsp),%rbx + movq 0+224(%rsp),%r9 + movq 8+224(%rsp),%r10 + leaq -128+224(%rsp),%rsi + movq 16+224(%rsp),%r11 + movq 24+224(%rsp),%r12 + leaq 224(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 512(%rsp),%rdx + leaq 512(%rsp),%rbx + movq 0+256(%rsp),%r9 + movq 8+256(%rsp),%r10 + leaq -128+256(%rsp),%rsi + movq 16+256(%rsp),%r11 + movq 24+256(%rsp),%r12 + leaq 256(%rsp),%rdi + call __ecp_nistz256_mul_montx + + leaq 224(%rsp),%rbx + leaq 64(%rsp),%rdi + call __ecp_nistz256_sub_fromx + + orq %r13,%r12 + movdqa %xmm4,%xmm2 + orq %r8,%r12 + orq %r9,%r12 + por %xmm5,%xmm2 +.byte 102,73,15,110,220 + + movq 384(%rsp),%rdx + leaq 384(%rsp),%rbx + movq 0+96(%rsp),%r9 + movq 8+96(%rsp),%r10 + leaq -128+96(%rsp),%rsi + movq 16+96(%rsp),%r11 + movq 24+96(%rsp),%r12 + leaq 160(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 480(%rsp),%rdx + leaq 480(%rsp),%rbx + movq 
0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq -128+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 192(%rsp),%rdi + call __ecp_nistz256_mul_montx + + leaq 160(%rsp),%rbx + leaq 0(%rsp),%rdi + call __ecp_nistz256_sub_fromx + + orq %r13,%r12 + orq %r8,%r12 + orq %r9,%r12 + +.byte 102,73,15,126,208 +.byte 102,73,15,126,217 + orq %r8,%r12 +.byte 0x3e + jnz .Ladd_proceedx + + + + testq %r9,%r9 + jz .Ladd_doublex + + + + + + +.byte 102,72,15,126,199 + pxor %xmm0,%xmm0 + movdqu %xmm0,0(%rdi) + movdqu %xmm0,16(%rdi) + movdqu %xmm0,32(%rdi) + movdqu %xmm0,48(%rdi) + movdqu %xmm0,64(%rdi) + movdqu %xmm0,80(%rdi) + jmp .Ladd_donex + +.align 32 +.Ladd_doublex: +.byte 102,72,15,126,206 +.byte 102,72,15,126,199 + addq $416,%rsp +.cfi_adjust_cfa_offset -416 + jmp .Lpoint_double_shortcutx +.cfi_adjust_cfa_offset 416 + +.align 32 +.Ladd_proceedx: + movq 0+64(%rsp),%rdx + movq 8+64(%rsp),%r14 + leaq -128+64(%rsp),%rsi + movq 16+64(%rsp),%r15 + movq 24+64(%rsp),%r8 + leaq 96(%rsp),%rdi + call __ecp_nistz256_sqr_montx + + movq 448(%rsp),%rdx + leaq 448(%rsp),%rbx + movq 0+0(%rsp),%r9 + movq 8+0(%rsp),%r10 + leaq -128+0(%rsp),%rsi + movq 16+0(%rsp),%r11 + movq 24+0(%rsp),%r12 + leaq 352(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 0+0(%rsp),%rdx + movq 8+0(%rsp),%r14 + leaq -128+0(%rsp),%rsi + movq 16+0(%rsp),%r15 + movq 24+0(%rsp),%r8 + leaq 32(%rsp),%rdi + call __ecp_nistz256_sqr_montx + + movq 544(%rsp),%rdx + leaq 544(%rsp),%rbx + movq 0+352(%rsp),%r9 + movq 8+352(%rsp),%r10 + leaq -128+352(%rsp),%rsi + movq 16+352(%rsp),%r11 + movq 24+352(%rsp),%r12 + leaq 352(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 0(%rsp),%rdx + leaq 0(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq -128+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 128(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 160(%rsp),%rdx + leaq 160(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq -128+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 192(%rsp),%rdi + call __ecp_nistz256_mul_montx + + + + + xorq %r11,%r11 + addq %r12,%r12 + leaq 96(%rsp),%rsi + adcq %r13,%r13 + movq %r12,%rax + adcq %r8,%r8 + adcq %r9,%r9 + movq %r13,%rbp + adcq $0,%r11 + + subq $-1,%r12 + movq %r8,%rcx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%r10 + sbbq %r15,%r9 + sbbq $0,%r11 + + cmovcq %rax,%r12 + movq 0(%rsi),%rax + cmovcq %rbp,%r13 + movq 8(%rsi),%rbp + cmovcq %rcx,%r8 + movq 16(%rsi),%rcx + cmovcq %r10,%r9 + movq 24(%rsi),%r10 + + call __ecp_nistz256_subx + + leaq 128(%rsp),%rbx + leaq 288(%rsp),%rdi + call __ecp_nistz256_sub_fromx + + movq 192+0(%rsp),%rax + movq 192+8(%rsp),%rbp + movq 192+16(%rsp),%rcx + movq 192+24(%rsp),%r10 + leaq 320(%rsp),%rdi + + call __ecp_nistz256_subx + + movq %r12,0(%rdi) + movq %r13,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + movq 128(%rsp),%rdx + leaq 128(%rsp),%rbx + movq 0+224(%rsp),%r9 + movq 8+224(%rsp),%r10 + leaq -128+224(%rsp),%rsi + movq 16+224(%rsp),%r11 + movq 24+224(%rsp),%r12 + leaq 256(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 320(%rsp),%rdx + leaq 320(%rsp),%rbx + movq 0+64(%rsp),%r9 + movq 8+64(%rsp),%r10 + leaq -128+64(%rsp),%rsi + movq 16+64(%rsp),%r11 + movq 24+64(%rsp),%r12 + leaq 320(%rsp),%rdi + call __ecp_nistz256_mul_montx + + leaq 256(%rsp),%rbx + leaq 320(%rsp),%rdi + call __ecp_nistz256_sub_fromx + +.byte 102,72,15,126,199 + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 352(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 352+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 544(%rsp),%xmm2 + 
pand 544+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 448(%rsp),%xmm2 + pand 448+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,64(%rdi) + movdqu %xmm3,80(%rdi) + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 288(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 288+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 480(%rsp),%xmm2 + pand 480+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 384(%rsp),%xmm2 + pand 384+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,0(%rdi) + movdqu %xmm3,16(%rdi) + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 320(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 320+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 512(%rsp),%xmm2 + pand 512+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 416(%rsp),%xmm2 + pand 416+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,32(%rdi) + movdqu %xmm3,48(%rdi) + +.Ladd_donex: + leaq 576+56(%rsp),%rsi +.cfi_def_cfa %rsi,8 + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbx +.cfi_restore %rbx + movq -8(%rsi),%rbp +.cfi_restore %rbp + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lpoint_addx_epilogue: + ret +.cfi_endproc +.size ecp_nistz256_point_add_adx,.-ecp_nistz256_point_add_adx +.globl ecp_nistz256_point_add_affine_adx +.hidden ecp_nistz256_point_add_affine_adx +.type ecp_nistz256_point_add_affine_adx,@function +.align 32 +ecp_nistz256_point_add_affine_adx: +.cfi_startproc +_CET_ENDBR + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $480+8,%rsp +.cfi_adjust_cfa_offset 32*15+8 +.Ladd_affinex_body: + + movdqu 0(%rsi),%xmm0 + movq %rdx,%rbx + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 + movdqu 48(%rsi),%xmm3 + movdqu 64(%rsi),%xmm4 + movdqu 80(%rsi),%xmm5 + movq 64+0(%rsi),%rdx + movq 64+8(%rsi),%r14 + movq 64+16(%rsi),%r15 + movq 64+24(%rsi),%r8 + movdqa %xmm0,320(%rsp) + movdqa %xmm1,320+16(%rsp) + movdqa %xmm2,352(%rsp) + movdqa %xmm3,352+16(%rsp) + movdqa %xmm4,384(%rsp) + movdqa %xmm5,384+16(%rsp) + por %xmm4,%xmm5 + + movdqu 0(%rbx),%xmm0 + pshufd $0xb1,%xmm5,%xmm3 + movdqu 16(%rbx),%xmm1 + movdqu 32(%rbx),%xmm2 + por %xmm3,%xmm5 + movdqu 48(%rbx),%xmm3 + movdqa %xmm0,416(%rsp) + pshufd $0x1e,%xmm5,%xmm4 + movdqa %xmm1,416+16(%rsp) + por %xmm0,%xmm1 +.byte 102,72,15,110,199 + movdqa %xmm2,448(%rsp) + movdqa %xmm3,448+16(%rsp) + por %xmm2,%xmm3 + por %xmm4,%xmm5 + pxor %xmm4,%xmm4 + por %xmm1,%xmm3 + + leaq 64-128(%rsi),%rsi + leaq 32(%rsp),%rdi + call __ecp_nistz256_sqr_montx + + pcmpeqd %xmm4,%xmm5 + pshufd $0xb1,%xmm3,%xmm4 + movq 0(%rbx),%rdx + + movq %r12,%r9 + por %xmm3,%xmm4 + pshufd $0,%xmm5,%xmm5 + pshufd $0x1e,%xmm4,%xmm3 + movq %r13,%r10 + por %xmm3,%xmm4 + pxor %xmm3,%xmm3 + movq %r14,%r11 + pcmpeqd %xmm3,%xmm4 + pshufd 
$0,%xmm4,%xmm4 + + leaq 32-128(%rsp),%rsi + movq %r15,%r12 + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_montx + + leaq 320(%rsp),%rbx + leaq 64(%rsp),%rdi + call __ecp_nistz256_sub_fromx + + movq 384(%rsp),%rdx + leaq 384(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq -128+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 32(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 384(%rsp),%rdx + leaq 384(%rsp),%rbx + movq 0+64(%rsp),%r9 + movq 8+64(%rsp),%r10 + leaq -128+64(%rsp),%rsi + movq 16+64(%rsp),%r11 + movq 24+64(%rsp),%r12 + leaq 288(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 448(%rsp),%rdx + leaq 448(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq -128+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 32(%rsp),%rdi + call __ecp_nistz256_mul_montx + + leaq 352(%rsp),%rbx + leaq 96(%rsp),%rdi + call __ecp_nistz256_sub_fromx + + movq 0+64(%rsp),%rdx + movq 8+64(%rsp),%r14 + leaq -128+64(%rsp),%rsi + movq 16+64(%rsp),%r15 + movq 24+64(%rsp),%r8 + leaq 128(%rsp),%rdi + call __ecp_nistz256_sqr_montx + + movq 0+96(%rsp),%rdx + movq 8+96(%rsp),%r14 + leaq -128+96(%rsp),%rsi + movq 16+96(%rsp),%r15 + movq 24+96(%rsp),%r8 + leaq 192(%rsp),%rdi + call __ecp_nistz256_sqr_montx + + movq 128(%rsp),%rdx + leaq 128(%rsp),%rbx + movq 0+64(%rsp),%r9 + movq 8+64(%rsp),%r10 + leaq -128+64(%rsp),%rsi + movq 16+64(%rsp),%r11 + movq 24+64(%rsp),%r12 + leaq 160(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 320(%rsp),%rdx + leaq 320(%rsp),%rbx + movq 0+128(%rsp),%r9 + movq 8+128(%rsp),%r10 + leaq -128+128(%rsp),%rsi + movq 16+128(%rsp),%r11 + movq 24+128(%rsp),%r12 + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_montx + + + + + xorq %r11,%r11 + addq %r12,%r12 + leaq 192(%rsp),%rsi + adcq %r13,%r13 + movq %r12,%rax + adcq %r8,%r8 + adcq %r9,%r9 + movq %r13,%rbp + adcq $0,%r11 + + subq $-1,%r12 + movq %r8,%rcx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%r10 + sbbq %r15,%r9 + sbbq $0,%r11 + + cmovcq %rax,%r12 + movq 0(%rsi),%rax + cmovcq %rbp,%r13 + movq 8(%rsi),%rbp + cmovcq %rcx,%r8 + movq 16(%rsi),%rcx + cmovcq %r10,%r9 + movq 24(%rsi),%r10 + + call __ecp_nistz256_subx + + leaq 160(%rsp),%rbx + leaq 224(%rsp),%rdi + call __ecp_nistz256_sub_fromx + + movq 0+0(%rsp),%rax + movq 0+8(%rsp),%rbp + movq 0+16(%rsp),%rcx + movq 0+24(%rsp),%r10 + leaq 64(%rsp),%rdi + + call __ecp_nistz256_subx + + movq %r12,0(%rdi) + movq %r13,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + movq 352(%rsp),%rdx + leaq 352(%rsp),%rbx + movq 0+160(%rsp),%r9 + movq 8+160(%rsp),%r10 + leaq -128+160(%rsp),%rsi + movq 16+160(%rsp),%r11 + movq 24+160(%rsp),%r12 + leaq 32(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 96(%rsp),%rdx + leaq 96(%rsp),%rbx + movq 0+64(%rsp),%r9 + movq 8+64(%rsp),%r10 + leaq -128+64(%rsp),%rsi + movq 16+64(%rsp),%r11 + movq 24+64(%rsp),%r12 + leaq 64(%rsp),%rdi + call __ecp_nistz256_mul_montx + + leaq 32(%rsp),%rbx + leaq 256(%rsp),%rdi + call __ecp_nistz256_sub_fromx + +.byte 102,72,15,126,199 + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 288(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 288+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand .LONE_mont(%rip),%xmm2 + pand .LONE_mont+16(%rip),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 384(%rsp),%xmm2 + pand 384+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,64(%rdi) + movdqu %xmm3,80(%rdi) + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + 
pandn 224(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 224+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 416(%rsp),%xmm2 + pand 416+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 320(%rsp),%xmm2 + pand 320+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,0(%rdi) + movdqu %xmm3,16(%rdi) + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 256(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 256+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 448(%rsp),%xmm2 + pand 448+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 352(%rsp),%xmm2 + pand 352+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,32(%rdi) + movdqu %xmm3,48(%rdi) + + leaq 480+56(%rsp),%rsi +.cfi_def_cfa %rsi,8 + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbx +.cfi_restore %rbx + movq -8(%rsi),%rbp +.cfi_restore %rbp + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Ladd_affinex_epilogue: + ret +.cfi_endproc +.size ecp_nistz256_point_add_affine_adx,.-ecp_nistz256_point_add_affine_adx +#endif diff --git a/ring-0.17.14/pregenerated/p256-x86_64-asm-macosx.S b/ring-0.17.14/pregenerated/p256-x86_64-asm-macosx.S new file mode 100644 index 0000000000..035c37fc95 --- /dev/null +++ b/ring-0.17.14/pregenerated/p256-x86_64-asm-macosx.S @@ -0,0 +1,4513 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__) +.text + + +.section __DATA,__const +.p2align 6 +L$poly: +.quad 0xffffffffffffffff, 0x00000000ffffffff, 0x0000000000000000, 0xffffffff00000001 + +L$One: +.long 1,1,1,1,1,1,1,1 +L$Two: +.long 2,2,2,2,2,2,2,2 +L$Three: +.long 3,3,3,3,3,3,3,3 +L$ONE_mont: +.quad 0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe + + +L$ord: +.quad 0xf3b9cac2fc632551, 0xbce6faada7179e84, 0xffffffffffffffff, 0xffffffff00000000 +L$ordK: +.quad 0xccd1c8aaee00bc4f +.text + + + +.globl _ecp_nistz256_neg +.private_extern _ecp_nistz256_neg + +.p2align 5 +_ecp_nistz256_neg: + +_CET_ENDBR + pushq %r12 + + pushq %r13 + +L$neg_body: + + xorq %r8,%r8 + xorq %r9,%r9 + xorq %r10,%r10 + xorq %r11,%r11 + xorq %r13,%r13 + + subq 0(%rsi),%r8 + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + movq %r8,%rax + sbbq 24(%rsi),%r11 + leaq L$poly(%rip),%rsi + movq %r9,%rdx + sbbq $0,%r13 + + addq 0(%rsi),%r8 + movq %r10,%rcx + adcq 8(%rsi),%r9 + adcq 16(%rsi),%r10 + movq %r11,%r12 + adcq 24(%rsi),%r11 + testq %r13,%r13 + + cmovzq %rax,%r8 + cmovzq %rdx,%r9 + movq %r8,0(%rdi) + cmovzq %rcx,%r10 + movq %r9,8(%rdi) + cmovzq %r12,%r11 + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + movq 0(%rsp),%r13 + + movq 8(%rsp),%r12 + + leaq 16(%rsp),%rsp + +L$neg_epilogue: + ret + + + + + + + + +.globl _ecp_nistz256_ord_mul_mont_nohw +.private_extern _ecp_nistz256_ord_mul_mont_nohw + +.p2align 5 +_ecp_nistz256_ord_mul_mont_nohw: + +_CET_ENDBR + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + +L$ord_mul_body: + + movq 0(%rdx),%rax + movq %rdx,%rbx + leaq L$ord(%rip),%r14 + movq L$ordK(%rip),%r15 + + + movq %rax,%rcx + mulq 0(%rsi) + movq %rax,%r8 + movq %rcx,%rax + movq %rdx,%r9 
+ + mulq 8(%rsi) + addq %rax,%r9 + movq %rcx,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq 16(%rsi) + addq %rax,%r10 + movq %rcx,%rax + adcq $0,%rdx + + movq %r8,%r13 + imulq %r15,%r8 + + movq %rdx,%r11 + mulq 24(%rsi) + addq %rax,%r11 + movq %r8,%rax + adcq $0,%rdx + movq %rdx,%r12 + + + mulq 0(%r14) + movq %r8,%rbp + addq %rax,%r13 + movq %r8,%rax + adcq $0,%rdx + movq %rdx,%rcx + + subq %r8,%r10 + sbbq $0,%r8 + + mulq 8(%r14) + addq %rcx,%r9 + adcq $0,%rdx + addq %rax,%r9 + movq %rbp,%rax + adcq %rdx,%r10 + movq %rbp,%rdx + adcq $0,%r8 + + shlq $32,%rax + shrq $32,%rdx + subq %rax,%r11 + movq 8(%rbx),%rax + sbbq %rdx,%rbp + + addq %r8,%r11 + adcq %rbp,%r12 + adcq $0,%r13 + + + movq %rax,%rcx + mulq 0(%rsi) + addq %rax,%r9 + movq %rcx,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 8(%rsi) + addq %rbp,%r10 + adcq $0,%rdx + addq %rax,%r10 + movq %rcx,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rsi) + addq %rbp,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq %rcx,%rax + adcq $0,%rdx + + movq %r9,%rcx + imulq %r15,%r9 + + movq %rdx,%rbp + mulq 24(%rsi) + addq %rbp,%r12 + adcq $0,%rdx + xorq %r8,%r8 + addq %rax,%r12 + movq %r9,%rax + adcq %rdx,%r13 + adcq $0,%r8 + + + mulq 0(%r14) + movq %r9,%rbp + addq %rax,%rcx + movq %r9,%rax + adcq %rdx,%rcx + + subq %r9,%r11 + sbbq $0,%r9 + + mulq 8(%r14) + addq %rcx,%r10 + adcq $0,%rdx + addq %rax,%r10 + movq %rbp,%rax + adcq %rdx,%r11 + movq %rbp,%rdx + adcq $0,%r9 + + shlq $32,%rax + shrq $32,%rdx + subq %rax,%r12 + movq 16(%rbx),%rax + sbbq %rdx,%rbp + + addq %r9,%r12 + adcq %rbp,%r13 + adcq $0,%r8 + + + movq %rax,%rcx + mulq 0(%rsi) + addq %rax,%r10 + movq %rcx,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 8(%rsi) + addq %rbp,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq %rcx,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rsi) + addq %rbp,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %rcx,%rax + adcq $0,%rdx + + movq %r10,%rcx + imulq %r15,%r10 + + movq %rdx,%rbp + mulq 24(%rsi) + addq %rbp,%r13 + adcq $0,%rdx + xorq %r9,%r9 + addq %rax,%r13 + movq %r10,%rax + adcq %rdx,%r8 + adcq $0,%r9 + + + mulq 0(%r14) + movq %r10,%rbp + addq %rax,%rcx + movq %r10,%rax + adcq %rdx,%rcx + + subq %r10,%r12 + sbbq $0,%r10 + + mulq 8(%r14) + addq %rcx,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq %rbp,%rax + adcq %rdx,%r12 + movq %rbp,%rdx + adcq $0,%r10 + + shlq $32,%rax + shrq $32,%rdx + subq %rax,%r13 + movq 24(%rbx),%rax + sbbq %rdx,%rbp + + addq %r10,%r13 + adcq %rbp,%r8 + adcq $0,%r9 + + + movq %rax,%rcx + mulq 0(%rsi) + addq %rax,%r11 + movq %rcx,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 8(%rsi) + addq %rbp,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %rcx,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rsi) + addq %rbp,%r13 + adcq $0,%rdx + addq %rax,%r13 + movq %rcx,%rax + adcq $0,%rdx + + movq %r11,%rcx + imulq %r15,%r11 + + movq %rdx,%rbp + mulq 24(%rsi) + addq %rbp,%r8 + adcq $0,%rdx + xorq %r10,%r10 + addq %rax,%r8 + movq %r11,%rax + adcq %rdx,%r9 + adcq $0,%r10 + + + mulq 0(%r14) + movq %r11,%rbp + addq %rax,%rcx + movq %r11,%rax + adcq %rdx,%rcx + + subq %r11,%r13 + sbbq $0,%r11 + + mulq 8(%r14) + addq %rcx,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %rbp,%rax + adcq %rdx,%r13 + movq %rbp,%rdx + adcq $0,%r11 + + shlq $32,%rax + shrq $32,%rdx + subq %rax,%r8 + sbbq %rdx,%rbp + + addq %r11,%r8 + adcq %rbp,%r9 + adcq $0,%r10 + + + movq %r12,%rsi + subq 0(%r14),%r12 + movq %r13,%r11 + sbbq 8(%r14),%r13 + movq %r8,%rcx + sbbq 16(%r14),%r8 + movq %r9,%rbp + sbbq 24(%r14),%r9 + sbbq $0,%r10 + + cmovcq %rsi,%r12 + cmovcq %r11,%r13 + cmovcq %rcx,%r8 + 
cmovcq %rbp,%r9 + + movq %r12,0(%rdi) + movq %r13,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + movq 0(%rsp),%r15 + + movq 8(%rsp),%r14 + + movq 16(%rsp),%r13 + + movq 24(%rsp),%r12 + + movq 32(%rsp),%rbx + + movq 40(%rsp),%rbp + + leaq 48(%rsp),%rsp + +L$ord_mul_epilogue: + ret + + + + + + + + + +.globl _ecp_nistz256_ord_sqr_mont_nohw +.private_extern _ecp_nistz256_ord_sqr_mont_nohw + +.p2align 5 +_ecp_nistz256_ord_sqr_mont_nohw: + +_CET_ENDBR + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + +L$ord_sqr_body: + + movq 0(%rsi),%r8 + movq 8(%rsi),%rax + movq 16(%rsi),%r14 + movq 24(%rsi),%r15 + leaq L$ord(%rip),%rsi + movq %rdx,%rbx + jmp L$oop_ord_sqr + +.p2align 5 +L$oop_ord_sqr: + + movq %rax,%rbp + mulq %r8 + movq %rax,%r9 +.byte 102,72,15,110,205 + movq %r14,%rax + movq %rdx,%r10 + + mulq %r8 + addq %rax,%r10 + movq %r15,%rax +.byte 102,73,15,110,214 + adcq $0,%rdx + movq %rdx,%r11 + + mulq %r8 + addq %rax,%r11 + movq %r15,%rax +.byte 102,73,15,110,223 + adcq $0,%rdx + movq %rdx,%r12 + + + mulq %r14 + movq %rax,%r13 + movq %r14,%rax + movq %rdx,%r14 + + + mulq %rbp + addq %rax,%r11 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r15 + + mulq %rbp + addq %rax,%r12 + adcq $0,%rdx + + addq %r15,%r12 + adcq %rdx,%r13 + adcq $0,%r14 + + + xorq %r15,%r15 + movq %r8,%rax + addq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq %r12,%r12 + adcq %r13,%r13 + adcq %r14,%r14 + adcq $0,%r15 + + + mulq %rax + movq %rax,%r8 +.byte 102,72,15,126,200 + movq %rdx,%rbp + + mulq %rax + addq %rbp,%r9 + adcq %rax,%r10 +.byte 102,72,15,126,208 + adcq $0,%rdx + movq %rdx,%rbp + + mulq %rax + addq %rbp,%r11 + adcq %rax,%r12 +.byte 102,72,15,126,216 + adcq $0,%rdx + movq %rdx,%rbp + + movq %r8,%rcx + imulq 32(%rsi),%r8 + + mulq %rax + addq %rbp,%r13 + adcq %rax,%r14 + movq 0(%rsi),%rax + adcq %rdx,%r15 + + + mulq %r8 + movq %r8,%rbp + addq %rax,%rcx + movq 8(%rsi),%rax + adcq %rdx,%rcx + + subq %r8,%r10 + sbbq $0,%rbp + + mulq %r8 + addq %rcx,%r9 + adcq $0,%rdx + addq %rax,%r9 + movq %r8,%rax + adcq %rdx,%r10 + movq %r8,%rdx + adcq $0,%rbp + + movq %r9,%rcx + imulq 32(%rsi),%r9 + + shlq $32,%rax + shrq $32,%rdx + subq %rax,%r11 + movq 0(%rsi),%rax + sbbq %rdx,%r8 + + addq %rbp,%r11 + adcq $0,%r8 + + + mulq %r9 + movq %r9,%rbp + addq %rax,%rcx + movq 8(%rsi),%rax + adcq %rdx,%rcx + + subq %r9,%r11 + sbbq $0,%rbp + + mulq %r9 + addq %rcx,%r10 + adcq $0,%rdx + addq %rax,%r10 + movq %r9,%rax + adcq %rdx,%r11 + movq %r9,%rdx + adcq $0,%rbp + + movq %r10,%rcx + imulq 32(%rsi),%r10 + + shlq $32,%rax + shrq $32,%rdx + subq %rax,%r8 + movq 0(%rsi),%rax + sbbq %rdx,%r9 + + addq %rbp,%r8 + adcq $0,%r9 + + + mulq %r10 + movq %r10,%rbp + addq %rax,%rcx + movq 8(%rsi),%rax + adcq %rdx,%rcx + + subq %r10,%r8 + sbbq $0,%rbp + + mulq %r10 + addq %rcx,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq %r10,%rax + adcq %rdx,%r8 + movq %r10,%rdx + adcq $0,%rbp + + movq %r11,%rcx + imulq 32(%rsi),%r11 + + shlq $32,%rax + shrq $32,%rdx + subq %rax,%r9 + movq 0(%rsi),%rax + sbbq %rdx,%r10 + + addq %rbp,%r9 + adcq $0,%r10 + + + mulq %r11 + movq %r11,%rbp + addq %rax,%rcx + movq 8(%rsi),%rax + adcq %rdx,%rcx + + subq %r11,%r9 + sbbq $0,%rbp + + mulq %r11 + addq %rcx,%r8 + adcq $0,%rdx + addq %rax,%r8 + movq %r11,%rax + adcq %rdx,%r9 + movq %r11,%rdx + adcq $0,%rbp + + shlq $32,%rax + shrq $32,%rdx + subq %rax,%r10 + sbbq %rdx,%r11 + + addq %rbp,%r10 + adcq $0,%r11 + + + xorq %rdx,%rdx + addq %r12,%r8 + adcq %r13,%r9 + movq %r8,%r12 + adcq %r14,%r10 + adcq %r15,%r11 + movq %r9,%rax + adcq $0,%rdx + + + 
subq 0(%rsi),%r8 + movq %r10,%r14 + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + movq %r11,%r15 + sbbq 24(%rsi),%r11 + sbbq $0,%rdx + + cmovcq %r12,%r8 + cmovncq %r9,%rax + cmovncq %r10,%r14 + cmovncq %r11,%r15 + + decq %rbx + jnz L$oop_ord_sqr + + movq %r8,0(%rdi) + movq %rax,8(%rdi) + pxor %xmm1,%xmm1 + movq %r14,16(%rdi) + pxor %xmm2,%xmm2 + movq %r15,24(%rdi) + pxor %xmm3,%xmm3 + + movq 0(%rsp),%r15 + + movq 8(%rsp),%r14 + + movq 16(%rsp),%r13 + + movq 24(%rsp),%r12 + + movq 32(%rsp),%rbx + + movq 40(%rsp),%rbp + + leaq 48(%rsp),%rsp + +L$ord_sqr_epilogue: + ret + + + +.globl _ecp_nistz256_ord_mul_mont_adx +.private_extern _ecp_nistz256_ord_mul_mont_adx + +.p2align 5 +_ecp_nistz256_ord_mul_mont_adx: + +L$ecp_nistz256_ord_mul_mont_adx: +_CET_ENDBR + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + +L$ord_mulx_body: + + movq %rdx,%rbx + movq 0(%rdx),%rdx + movq 0(%rsi),%r9 + movq 8(%rsi),%r10 + movq 16(%rsi),%r11 + movq 24(%rsi),%r12 + leaq -128(%rsi),%rsi + leaq L$ord-128(%rip),%r14 + movq L$ordK(%rip),%r15 + + + mulxq %r9,%r8,%r9 + mulxq %r10,%rcx,%r10 + mulxq %r11,%rbp,%r11 + addq %rcx,%r9 + mulxq %r12,%rcx,%r12 + movq %r8,%rdx + mulxq %r15,%rdx,%rax + adcq %rbp,%r10 + adcq %rcx,%r11 + adcq $0,%r12 + + + xorq %r13,%r13 + mulxq 0+128(%r14),%rcx,%rbp + adcxq %rcx,%r8 + adoxq %rbp,%r9 + + mulxq 8+128(%r14),%rcx,%rbp + adcxq %rcx,%r9 + adoxq %rbp,%r10 + + mulxq 16+128(%r14),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + + mulxq 24+128(%r14),%rcx,%rbp + movq 8(%rbx),%rdx + adcxq %rcx,%r11 + adoxq %rbp,%r12 + adcxq %r8,%r12 + adoxq %r8,%r13 + adcq $0,%r13 + + + mulxq 0+128(%rsi),%rcx,%rbp + adcxq %rcx,%r9 + adoxq %rbp,%r10 + + mulxq 8+128(%rsi),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + + mulxq 16+128(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 24+128(%rsi),%rcx,%rbp + movq %r9,%rdx + mulxq %r15,%rdx,%rax + adcxq %rcx,%r12 + adoxq %rbp,%r13 + + adcxq %r8,%r13 + adoxq %r8,%r8 + adcq $0,%r8 + + + mulxq 0+128(%r14),%rcx,%rbp + adcxq %rcx,%r9 + adoxq %rbp,%r10 + + mulxq 8+128(%r14),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + + mulxq 16+128(%r14),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 24+128(%r14),%rcx,%rbp + movq 16(%rbx),%rdx + adcxq %rcx,%r12 + adoxq %rbp,%r13 + adcxq %r9,%r13 + adoxq %r9,%r8 + adcq $0,%r8 + + + mulxq 0+128(%rsi),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + + mulxq 8+128(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 16+128(%rsi),%rcx,%rbp + adcxq %rcx,%r12 + adoxq %rbp,%r13 + + mulxq 24+128(%rsi),%rcx,%rbp + movq %r10,%rdx + mulxq %r15,%rdx,%rax + adcxq %rcx,%r13 + adoxq %rbp,%r8 + + adcxq %r9,%r8 + adoxq %r9,%r9 + adcq $0,%r9 + + + mulxq 0+128(%r14),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + + mulxq 8+128(%r14),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 16+128(%r14),%rcx,%rbp + adcxq %rcx,%r12 + adoxq %rbp,%r13 + + mulxq 24+128(%r14),%rcx,%rbp + movq 24(%rbx),%rdx + adcxq %rcx,%r13 + adoxq %rbp,%r8 + adcxq %r10,%r8 + adoxq %r10,%r9 + adcq $0,%r9 + + + mulxq 0+128(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 8+128(%rsi),%rcx,%rbp + adcxq %rcx,%r12 + adoxq %rbp,%r13 + + mulxq 16+128(%rsi),%rcx,%rbp + adcxq %rcx,%r13 + adoxq %rbp,%r8 + + mulxq 24+128(%rsi),%rcx,%rbp + movq %r11,%rdx + mulxq %r15,%rdx,%rax + adcxq %rcx,%r8 + adoxq %rbp,%r9 + + adcxq %r10,%r9 + adoxq %r10,%r10 + adcq $0,%r10 + + + mulxq 0+128(%r14),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 8+128(%r14),%rcx,%rbp + adcxq %rcx,%r12 + adoxq %rbp,%r13 + + mulxq 
16+128(%r14),%rcx,%rbp + adcxq %rcx,%r13 + adoxq %rbp,%r8 + + mulxq 24+128(%r14),%rcx,%rbp + leaq 128(%r14),%r14 + movq %r12,%rbx + adcxq %rcx,%r8 + adoxq %rbp,%r9 + movq %r13,%rdx + adcxq %r11,%r9 + adoxq %r11,%r10 + adcq $0,%r10 + + + + movq %r8,%rcx + subq 0(%r14),%r12 + sbbq 8(%r14),%r13 + sbbq 16(%r14),%r8 + movq %r9,%rbp + sbbq 24(%r14),%r9 + sbbq $0,%r10 + + cmovcq %rbx,%r12 + cmovcq %rdx,%r13 + cmovcq %rcx,%r8 + cmovcq %rbp,%r9 + + movq %r12,0(%rdi) + movq %r13,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + movq 0(%rsp),%r15 + + movq 8(%rsp),%r14 + + movq 16(%rsp),%r13 + + movq 24(%rsp),%r12 + + movq 32(%rsp),%rbx + + movq 40(%rsp),%rbp + + leaq 48(%rsp),%rsp + +L$ord_mulx_epilogue: + ret + + + +.globl _ecp_nistz256_ord_sqr_mont_adx +.private_extern _ecp_nistz256_ord_sqr_mont_adx + +.p2align 5 +_ecp_nistz256_ord_sqr_mont_adx: + +_CET_ENDBR +L$ecp_nistz256_ord_sqr_mont_adx: + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + +L$ord_sqrx_body: + + movq %rdx,%rbx + movq 0(%rsi),%rdx + movq 8(%rsi),%r14 + movq 16(%rsi),%r15 + movq 24(%rsi),%r8 + leaq L$ord(%rip),%rsi + jmp L$oop_ord_sqrx + +.p2align 5 +L$oop_ord_sqrx: + mulxq %r14,%r9,%r10 + mulxq %r15,%rcx,%r11 + movq %rdx,%rax +.byte 102,73,15,110,206 + mulxq %r8,%rbp,%r12 + movq %r14,%rdx + addq %rcx,%r10 +.byte 102,73,15,110,215 + adcq %rbp,%r11 + adcq $0,%r12 + xorq %r13,%r13 + + mulxq %r15,%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq %r8,%rcx,%rbp + movq %r15,%rdx + adcxq %rcx,%r12 + adoxq %rbp,%r13 + adcq $0,%r13 + + mulxq %r8,%rcx,%r14 + movq %rax,%rdx +.byte 102,73,15,110,216 + xorq %r15,%r15 + adcxq %r9,%r9 + adoxq %rcx,%r13 + adcxq %r10,%r10 + adoxq %r15,%r14 + + + mulxq %rdx,%r8,%rbp +.byte 102,72,15,126,202 + adcxq %r11,%r11 + adoxq %rbp,%r9 + adcxq %r12,%r12 + mulxq %rdx,%rcx,%rax +.byte 102,72,15,126,210 + adcxq %r13,%r13 + adoxq %rcx,%r10 + adcxq %r14,%r14 + mulxq %rdx,%rcx,%rbp +.byte 0x67 +.byte 102,72,15,126,218 + adoxq %rax,%r11 + adcxq %r15,%r15 + adoxq %rcx,%r12 + adoxq %rbp,%r13 + mulxq %rdx,%rcx,%rax + adoxq %rcx,%r14 + adoxq %rax,%r15 + + + movq %r8,%rdx + mulxq 32(%rsi),%rdx,%rcx + + xorq %rax,%rax + mulxq 0(%rsi),%rcx,%rbp + adcxq %rcx,%r8 + adoxq %rbp,%r9 + mulxq 8(%rsi),%rcx,%rbp + adcxq %rcx,%r9 + adoxq %rbp,%r10 + mulxq 16(%rsi),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + mulxq 24(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r8 + adcxq %rax,%r8 + + + movq %r9,%rdx + mulxq 32(%rsi),%rdx,%rcx + + mulxq 0(%rsi),%rcx,%rbp + adoxq %rcx,%r9 + adcxq %rbp,%r10 + mulxq 8(%rsi),%rcx,%rbp + adoxq %rcx,%r10 + adcxq %rbp,%r11 + mulxq 16(%rsi),%rcx,%rbp + adoxq %rcx,%r11 + adcxq %rbp,%r8 + mulxq 24(%rsi),%rcx,%rbp + adoxq %rcx,%r8 + adcxq %rbp,%r9 + adoxq %rax,%r9 + + + movq %r10,%rdx + mulxq 32(%rsi),%rdx,%rcx + + mulxq 0(%rsi),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + mulxq 8(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r8 + mulxq 16(%rsi),%rcx,%rbp + adcxq %rcx,%r8 + adoxq %rbp,%r9 + mulxq 24(%rsi),%rcx,%rbp + adcxq %rcx,%r9 + adoxq %rbp,%r10 + adcxq %rax,%r10 + + + movq %r11,%rdx + mulxq 32(%rsi),%rdx,%rcx + + mulxq 0(%rsi),%rcx,%rbp + adoxq %rcx,%r11 + adcxq %rbp,%r8 + mulxq 8(%rsi),%rcx,%rbp + adoxq %rcx,%r8 + adcxq %rbp,%r9 + mulxq 16(%rsi),%rcx,%rbp + adoxq %rcx,%r9 + adcxq %rbp,%r10 + mulxq 24(%rsi),%rcx,%rbp + adoxq %rcx,%r10 + adcxq %rbp,%r11 + adoxq %rax,%r11 + + + addq %r8,%r12 + adcq %r13,%r9 + movq %r12,%rdx + adcq %r14,%r10 + adcq %r15,%r11 + movq %r9,%r14 + adcq $0,%rax + + + subq 0(%rsi),%r12 + movq %r10,%r15 + sbbq 8(%rsi),%r9 + 
sbbq 16(%rsi),%r10 + movq %r11,%r8 + sbbq 24(%rsi),%r11 + sbbq $0,%rax + + cmovncq %r12,%rdx + cmovncq %r9,%r14 + cmovncq %r10,%r15 + cmovncq %r11,%r8 + + decq %rbx + jnz L$oop_ord_sqrx + + movq %rdx,0(%rdi) + movq %r14,8(%rdi) + pxor %xmm1,%xmm1 + movq %r15,16(%rdi) + pxor %xmm2,%xmm2 + movq %r8,24(%rdi) + pxor %xmm3,%xmm3 + + movq 0(%rsp),%r15 + + movq 8(%rsp),%r14 + + movq 16(%rsp),%r13 + + movq 24(%rsp),%r12 + + movq 32(%rsp),%rbx + + movq 40(%rsp),%rbp + + leaq 48(%rsp),%rsp + +L$ord_sqrx_epilogue: + ret + + + + + + + + +.globl _ecp_nistz256_mul_mont_nohw +.private_extern _ecp_nistz256_mul_mont_nohw + +.p2align 5 +_ecp_nistz256_mul_mont_nohw: + +_CET_ENDBR + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + +L$mul_body: + movq %rdx,%rbx + movq 0(%rdx),%rax + movq 0(%rsi),%r9 + movq 8(%rsi),%r10 + movq 16(%rsi),%r11 + movq 24(%rsi),%r12 + + call __ecp_nistz256_mul_montq + + movq 0(%rsp),%r15 + + movq 8(%rsp),%r14 + + movq 16(%rsp),%r13 + + movq 24(%rsp),%r12 + + movq 32(%rsp),%rbx + + movq 40(%rsp),%rbp + + leaq 48(%rsp),%rsp + +L$mul_epilogue: + ret + + + + +.p2align 5 +__ecp_nistz256_mul_montq: + + + + movq %rax,%rbp + mulq %r9 + movq L$poly+8(%rip),%r14 + movq %rax,%r8 + movq %rbp,%rax + movq %rdx,%r9 + + mulq %r10 + movq L$poly+24(%rip),%r15 + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %r11 + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq %r12 + addq %rax,%r11 + movq %r8,%rax + adcq $0,%rdx + xorq %r13,%r13 + movq %rdx,%r12 + + + + + + + + + + + movq %r8,%rbp + shlq $32,%r8 + mulq %r15 + shrq $32,%rbp + addq %r8,%r9 + adcq %rbp,%r10 + adcq %rax,%r11 + movq 8(%rbx),%rax + adcq %rdx,%r12 + adcq $0,%r13 + xorq %r8,%r8 + + + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rcx,%r10 + adcq $0,%rdx + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 16(%rsi) + addq %rcx,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 24(%rsi) + addq %rcx,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %r9,%rax + adcq %rdx,%r13 + adcq $0,%r8 + + + + movq %r9,%rbp + shlq $32,%r9 + mulq %r15 + shrq $32,%rbp + addq %r9,%r10 + adcq %rbp,%r11 + adcq %rax,%r12 + movq 16(%rbx),%rax + adcq %rdx,%r13 + adcq $0,%r8 + xorq %r9,%r9 + + + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rcx,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 16(%rsi) + addq %rcx,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 24(%rsi) + addq %rcx,%r13 + adcq $0,%rdx + addq %rax,%r13 + movq %r10,%rax + adcq %rdx,%r8 + adcq $0,%r9 + + + + movq %r10,%rbp + shlq $32,%r10 + mulq %r15 + shrq $32,%rbp + addq %r10,%r11 + adcq %rbp,%r12 + adcq %rax,%r13 + movq 24(%rbx),%rax + adcq %rdx,%r8 + adcq $0,%r9 + xorq %r10,%r10 + + + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rcx,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 16(%rsi) + addq %rcx,%r13 + adcq $0,%rdx + addq %rax,%r13 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 24(%rsi) + addq %rcx,%r8 + adcq $0,%rdx + addq %rax,%r8 + movq %r11,%rax + adcq %rdx,%r9 + adcq $0,%r10 + + + + movq %r11,%rbp + shlq $32,%r11 + mulq %r15 + shrq $32,%rbp + addq %r11,%r12 + adcq 
%rbp,%r13 + movq %r12,%rcx + adcq %rax,%r8 + adcq %rdx,%r9 + movq %r13,%rbp + adcq $0,%r10 + + + + subq $-1,%r12 + movq %r8,%rbx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%rdx + sbbq %r15,%r9 + sbbq $0,%r10 + + cmovcq %rcx,%r12 + cmovcq %rbp,%r13 + movq %r12,0(%rdi) + cmovcq %rbx,%r8 + movq %r13,8(%rdi) + cmovcq %rdx,%r9 + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + ret + + + + + + + + + + +.globl _ecp_nistz256_sqr_mont_nohw +.private_extern _ecp_nistz256_sqr_mont_nohw + +.p2align 5 +_ecp_nistz256_sqr_mont_nohw: + +_CET_ENDBR + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + +L$sqr_body: + movq 0(%rsi),%rax + movq 8(%rsi),%r14 + movq 16(%rsi),%r15 + movq 24(%rsi),%r8 + + call __ecp_nistz256_sqr_montq + + movq 0(%rsp),%r15 + + movq 8(%rsp),%r14 + + movq 16(%rsp),%r13 + + movq 24(%rsp),%r12 + + movq 32(%rsp),%rbx + + movq 40(%rsp),%rbp + + leaq 48(%rsp),%rsp + +L$sqr_epilogue: + ret + + + + +.p2align 5 +__ecp_nistz256_sqr_montq: + + movq %rax,%r13 + mulq %r14 + movq %rax,%r9 + movq %r15,%rax + movq %rdx,%r10 + + mulq %r13 + addq %rax,%r10 + movq %r8,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq %r13 + addq %rax,%r11 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r12 + + + mulq %r14 + addq %rax,%r11 + movq %r8,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq %r14 + addq %rax,%r12 + movq %r8,%rax + adcq $0,%rdx + addq %rbp,%r12 + movq %rdx,%r13 + adcq $0,%r13 + + + mulq %r15 + xorq %r15,%r15 + addq %rax,%r13 + movq 0(%rsi),%rax + movq %rdx,%r14 + adcq $0,%r14 + + addq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq %r12,%r12 + adcq %r13,%r13 + adcq %r14,%r14 + adcq $0,%r15 + + mulq %rax + movq %rax,%r8 + movq 8(%rsi),%rax + movq %rdx,%rcx + + mulq %rax + addq %rcx,%r9 + adcq %rax,%r10 + movq 16(%rsi),%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq %rax + addq %rcx,%r11 + adcq %rax,%r12 + movq 24(%rsi),%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq %rax + addq %rcx,%r13 + adcq %rax,%r14 + movq %r8,%rax + adcq %rdx,%r15 + + movq L$poly+8(%rip),%rsi + movq L$poly+24(%rip),%rbp + + + + + movq %r8,%rcx + shlq $32,%r8 + mulq %rbp + shrq $32,%rcx + addq %r8,%r9 + adcq %rcx,%r10 + adcq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + + + + movq %r9,%rcx + shlq $32,%r9 + movq %rdx,%r8 + mulq %rbp + shrq $32,%rcx + addq %r9,%r10 + adcq %rcx,%r11 + adcq %rax,%r8 + movq %r10,%rax + adcq $0,%rdx + + + + movq %r10,%rcx + shlq $32,%r10 + movq %rdx,%r9 + mulq %rbp + shrq $32,%rcx + addq %r10,%r11 + adcq %rcx,%r8 + adcq %rax,%r9 + movq %r11,%rax + adcq $0,%rdx + + + + movq %r11,%rcx + shlq $32,%r11 + movq %rdx,%r10 + mulq %rbp + shrq $32,%rcx + addq %r11,%r8 + adcq %rcx,%r9 + adcq %rax,%r10 + adcq $0,%rdx + xorq %r11,%r11 + + + + addq %r8,%r12 + adcq %r9,%r13 + movq %r12,%r8 + adcq %r10,%r14 + adcq %rdx,%r15 + movq %r13,%r9 + adcq $0,%r11 + + subq $-1,%r12 + movq %r14,%r10 + sbbq %rsi,%r13 + sbbq $0,%r14 + movq %r15,%rcx + sbbq %rbp,%r15 + sbbq $0,%r11 + + cmovcq %r8,%r12 + cmovcq %r9,%r13 + movq %r12,0(%rdi) + cmovcq %r10,%r14 + movq %r13,8(%rdi) + cmovcq %rcx,%r15 + movq %r14,16(%rdi) + movq %r15,24(%rdi) + + ret + + +.globl _ecp_nistz256_mul_mont_adx +.private_extern _ecp_nistz256_mul_mont_adx + +.p2align 5 +_ecp_nistz256_mul_mont_adx: + +_CET_ENDBR + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + +L$mulx_body: + movq %rdx,%rbx + movq 0(%rdx),%rdx + movq 0(%rsi),%r9 + movq 8(%rsi),%r10 + movq 16(%rsi),%r11 + movq 24(%rsi),%r12 + leaq -128(%rsi),%rsi + + call __ecp_nistz256_mul_montx + + movq 0(%rsp),%r15 + + movq 8(%rsp),%r14 + + movq 
16(%rsp),%r13 + + movq 24(%rsp),%r12 + + movq 32(%rsp),%rbx + + movq 40(%rsp),%rbp + + leaq 48(%rsp),%rsp + +L$mulx_epilogue: + ret + + + + +.p2align 5 +__ecp_nistz256_mul_montx: + + + + mulxq %r9,%r8,%r9 + mulxq %r10,%rcx,%r10 + movq $32,%r14 + xorq %r13,%r13 + mulxq %r11,%rbp,%r11 + movq L$poly+24(%rip),%r15 + adcq %rcx,%r9 + mulxq %r12,%rcx,%r12 + movq %r8,%rdx + adcq %rbp,%r10 + shlxq %r14,%r8,%rbp + adcq %rcx,%r11 + shrxq %r14,%r8,%rcx + adcq $0,%r12 + + + + addq %rbp,%r9 + adcq %rcx,%r10 + + mulxq %r15,%rcx,%rbp + movq 8(%rbx),%rdx + adcq %rcx,%r11 + adcq %rbp,%r12 + adcq $0,%r13 + xorq %r8,%r8 + + + + mulxq 0+128(%rsi),%rcx,%rbp + adcxq %rcx,%r9 + adoxq %rbp,%r10 + + mulxq 8+128(%rsi),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + + mulxq 16+128(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 24+128(%rsi),%rcx,%rbp + movq %r9,%rdx + adcxq %rcx,%r12 + shlxq %r14,%r9,%rcx + adoxq %rbp,%r13 + shrxq %r14,%r9,%rbp + + adcxq %r8,%r13 + adoxq %r8,%r8 + adcq $0,%r8 + + + + addq %rcx,%r10 + adcq %rbp,%r11 + + mulxq %r15,%rcx,%rbp + movq 16(%rbx),%rdx + adcq %rcx,%r12 + adcq %rbp,%r13 + adcq $0,%r8 + xorq %r9,%r9 + + + + mulxq 0+128(%rsi),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + + mulxq 8+128(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 16+128(%rsi),%rcx,%rbp + adcxq %rcx,%r12 + adoxq %rbp,%r13 + + mulxq 24+128(%rsi),%rcx,%rbp + movq %r10,%rdx + adcxq %rcx,%r13 + shlxq %r14,%r10,%rcx + adoxq %rbp,%r8 + shrxq %r14,%r10,%rbp + + adcxq %r9,%r8 + adoxq %r9,%r9 + adcq $0,%r9 + + + + addq %rcx,%r11 + adcq %rbp,%r12 + + mulxq %r15,%rcx,%rbp + movq 24(%rbx),%rdx + adcq %rcx,%r13 + adcq %rbp,%r8 + adcq $0,%r9 + xorq %r10,%r10 + + + + mulxq 0+128(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 8+128(%rsi),%rcx,%rbp + adcxq %rcx,%r12 + adoxq %rbp,%r13 + + mulxq 16+128(%rsi),%rcx,%rbp + adcxq %rcx,%r13 + adoxq %rbp,%r8 + + mulxq 24+128(%rsi),%rcx,%rbp + movq %r11,%rdx + adcxq %rcx,%r8 + shlxq %r14,%r11,%rcx + adoxq %rbp,%r9 + shrxq %r14,%r11,%rbp + + adcxq %r10,%r9 + adoxq %r10,%r10 + adcq $0,%r10 + + + + addq %rcx,%r12 + adcq %rbp,%r13 + + mulxq %r15,%rcx,%rbp + movq %r12,%rbx + movq L$poly+8(%rip),%r14 + adcq %rcx,%r8 + movq %r13,%rdx + adcq %rbp,%r9 + adcq $0,%r10 + + + + xorl %eax,%eax + movq %r8,%rcx + sbbq $-1,%r12 + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%rbp + sbbq %r15,%r9 + sbbq $0,%r10 + + cmovcq %rbx,%r12 + cmovcq %rdx,%r13 + movq %r12,0(%rdi) + cmovcq %rcx,%r8 + movq %r13,8(%rdi) + cmovcq %rbp,%r9 + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + ret + + + +.globl _ecp_nistz256_sqr_mont_adx +.private_extern _ecp_nistz256_sqr_mont_adx + +.p2align 5 +_ecp_nistz256_sqr_mont_adx: + +_CET_ENDBR + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + +L$sqrx_body: + movq 0(%rsi),%rdx + movq 8(%rsi),%r14 + movq 16(%rsi),%r15 + movq 24(%rsi),%r8 + leaq -128(%rsi),%rsi + + call __ecp_nistz256_sqr_montx + + movq 0(%rsp),%r15 + + movq 8(%rsp),%r14 + + movq 16(%rsp),%r13 + + movq 24(%rsp),%r12 + + movq 32(%rsp),%rbx + + movq 40(%rsp),%rbp + + leaq 48(%rsp),%rsp + +L$sqrx_epilogue: + ret + + + + +.p2align 5 +__ecp_nistz256_sqr_montx: + + mulxq %r14,%r9,%r10 + mulxq %r15,%rcx,%r11 + xorl %eax,%eax + adcq %rcx,%r10 + mulxq %r8,%rbp,%r12 + movq %r14,%rdx + adcq %rbp,%r11 + adcq $0,%r12 + xorq %r13,%r13 + + + mulxq %r15,%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq %r8,%rcx,%rbp + movq %r15,%rdx + adcxq %rcx,%r12 + adoxq %rbp,%r13 + adcq $0,%r13 + + + mulxq %r8,%rcx,%r14 + movq 0+128(%rsi),%rdx + xorq %r15,%r15 + 
adcxq %r9,%r9 + adoxq %rcx,%r13 + adcxq %r10,%r10 + adoxq %r15,%r14 + + mulxq %rdx,%r8,%rbp + movq 8+128(%rsi),%rdx + adcxq %r11,%r11 + adoxq %rbp,%r9 + adcxq %r12,%r12 + mulxq %rdx,%rcx,%rax + movq 16+128(%rsi),%rdx + adcxq %r13,%r13 + adoxq %rcx,%r10 + adcxq %r14,%r14 +.byte 0x67 + mulxq %rdx,%rcx,%rbp + movq 24+128(%rsi),%rdx + adoxq %rax,%r11 + adcxq %r15,%r15 + adoxq %rcx,%r12 + movq $32,%rsi + adoxq %rbp,%r13 +.byte 0x67,0x67 + mulxq %rdx,%rcx,%rax + movq L$poly+24(%rip),%rdx + adoxq %rcx,%r14 + shlxq %rsi,%r8,%rcx + adoxq %rax,%r15 + shrxq %rsi,%r8,%rax + movq %rdx,%rbp + + + addq %rcx,%r9 + adcq %rax,%r10 + + mulxq %r8,%rcx,%r8 + adcq %rcx,%r11 + shlxq %rsi,%r9,%rcx + adcq $0,%r8 + shrxq %rsi,%r9,%rax + + + addq %rcx,%r10 + adcq %rax,%r11 + + mulxq %r9,%rcx,%r9 + adcq %rcx,%r8 + shlxq %rsi,%r10,%rcx + adcq $0,%r9 + shrxq %rsi,%r10,%rax + + + addq %rcx,%r11 + adcq %rax,%r8 + + mulxq %r10,%rcx,%r10 + adcq %rcx,%r9 + shlxq %rsi,%r11,%rcx + adcq $0,%r10 + shrxq %rsi,%r11,%rax + + + addq %rcx,%r8 + adcq %rax,%r9 + + mulxq %r11,%rcx,%r11 + adcq %rcx,%r10 + adcq $0,%r11 + + xorq %rdx,%rdx + addq %r8,%r12 + movq L$poly+8(%rip),%rsi + adcq %r9,%r13 + movq %r12,%r8 + adcq %r10,%r14 + adcq %r11,%r15 + movq %r13,%r9 + adcq $0,%rdx + + subq $-1,%r12 + movq %r14,%r10 + sbbq %rsi,%r13 + sbbq $0,%r14 + movq %r15,%r11 + sbbq %rbp,%r15 + sbbq $0,%rdx + + cmovcq %r8,%r12 + cmovcq %r9,%r13 + movq %r12,0(%rdi) + cmovcq %r10,%r14 + movq %r13,8(%rdi) + cmovcq %r11,%r15 + movq %r14,16(%rdi) + movq %r15,24(%rdi) + + ret + + + + +.globl _ecp_nistz256_select_w5_nohw +.private_extern _ecp_nistz256_select_w5_nohw + +.p2align 5 +_ecp_nistz256_select_w5_nohw: + +_CET_ENDBR + movdqa L$One(%rip),%xmm0 + movd %edx,%xmm1 + + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + + movdqa %xmm0,%xmm8 + pshufd $0,%xmm1,%xmm1 + + movq $16,%rax +L$select_loop_sse_w5: + + movdqa %xmm8,%xmm15 + paddd %xmm0,%xmm8 + pcmpeqd %xmm1,%xmm15 + + movdqa 0(%rsi),%xmm9 + movdqa 16(%rsi),%xmm10 + movdqa 32(%rsi),%xmm11 + movdqa 48(%rsi),%xmm12 + movdqa 64(%rsi),%xmm13 + movdqa 80(%rsi),%xmm14 + leaq 96(%rsi),%rsi + + pand %xmm15,%xmm9 + pand %xmm15,%xmm10 + por %xmm9,%xmm2 + pand %xmm15,%xmm11 + por %xmm10,%xmm3 + pand %xmm15,%xmm12 + por %xmm11,%xmm4 + pand %xmm15,%xmm13 + por %xmm12,%xmm5 + pand %xmm15,%xmm14 + por %xmm13,%xmm6 + por %xmm14,%xmm7 + + decq %rax + jnz L$select_loop_sse_w5 + + movdqu %xmm2,0(%rdi) + movdqu %xmm3,16(%rdi) + movdqu %xmm4,32(%rdi) + movdqu %xmm5,48(%rdi) + movdqu %xmm6,64(%rdi) + movdqu %xmm7,80(%rdi) + ret + +L$SEH_end_ecp_nistz256_select_w5_nohw: + + + + +.globl _ecp_nistz256_select_w7_nohw +.private_extern _ecp_nistz256_select_w7_nohw + +.p2align 5 +_ecp_nistz256_select_w7_nohw: + +_CET_ENDBR + movdqa L$One(%rip),%xmm8 + movd %edx,%xmm1 + + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + + movdqa %xmm8,%xmm0 + pshufd $0,%xmm1,%xmm1 + movq $64,%rax + +L$select_loop_sse_w7: + movdqa %xmm8,%xmm15 + paddd %xmm0,%xmm8 + movdqa 0(%rsi),%xmm9 + movdqa 16(%rsi),%xmm10 + pcmpeqd %xmm1,%xmm15 + movdqa 32(%rsi),%xmm11 + movdqa 48(%rsi),%xmm12 + leaq 64(%rsi),%rsi + + pand %xmm15,%xmm9 + pand %xmm15,%xmm10 + por %xmm9,%xmm2 + pand %xmm15,%xmm11 + por %xmm10,%xmm3 + pand %xmm15,%xmm12 + por %xmm11,%xmm4 + prefetcht0 255(%rsi) + por %xmm12,%xmm5 + + decq %rax + jnz L$select_loop_sse_w7 + + movdqu %xmm2,0(%rdi) + movdqu %xmm3,16(%rdi) + movdqu %xmm4,32(%rdi) + movdqu %xmm5,48(%rdi) + ret + +L$SEH_end_ecp_nistz256_select_w7_nohw: 
+ + + +.globl _ecp_nistz256_select_w5_avx2 +.private_extern _ecp_nistz256_select_w5_avx2 + +.p2align 5 +_ecp_nistz256_select_w5_avx2: + +_CET_ENDBR + vzeroupper + vmovdqa L$Two(%rip),%ymm0 + + vpxor %ymm2,%ymm2,%ymm2 + vpxor %ymm3,%ymm3,%ymm3 + vpxor %ymm4,%ymm4,%ymm4 + + vmovdqa L$One(%rip),%ymm5 + vmovdqa L$Two(%rip),%ymm10 + + vmovd %edx,%xmm1 + vpermd %ymm1,%ymm2,%ymm1 + + movq $8,%rax +L$select_loop_avx2_w5: + + vmovdqa 0(%rsi),%ymm6 + vmovdqa 32(%rsi),%ymm7 + vmovdqa 64(%rsi),%ymm8 + + vmovdqa 96(%rsi),%ymm11 + vmovdqa 128(%rsi),%ymm12 + vmovdqa 160(%rsi),%ymm13 + + vpcmpeqd %ymm1,%ymm5,%ymm9 + vpcmpeqd %ymm1,%ymm10,%ymm14 + + vpaddd %ymm0,%ymm5,%ymm5 + vpaddd %ymm0,%ymm10,%ymm10 + leaq 192(%rsi),%rsi + + vpand %ymm9,%ymm6,%ymm6 + vpand %ymm9,%ymm7,%ymm7 + vpand %ymm9,%ymm8,%ymm8 + vpand %ymm14,%ymm11,%ymm11 + vpand %ymm14,%ymm12,%ymm12 + vpand %ymm14,%ymm13,%ymm13 + + vpxor %ymm6,%ymm2,%ymm2 + vpxor %ymm7,%ymm3,%ymm3 + vpxor %ymm8,%ymm4,%ymm4 + vpxor %ymm11,%ymm2,%ymm2 + vpxor %ymm12,%ymm3,%ymm3 + vpxor %ymm13,%ymm4,%ymm4 + + decq %rax + jnz L$select_loop_avx2_w5 + + vmovdqu %ymm2,0(%rdi) + vmovdqu %ymm3,32(%rdi) + vmovdqu %ymm4,64(%rdi) + vzeroupper + ret + +L$SEH_end_ecp_nistz256_select_w5_avx2: + + + + +.globl _ecp_nistz256_select_w7_avx2 +.private_extern _ecp_nistz256_select_w7_avx2 + +.p2align 5 +_ecp_nistz256_select_w7_avx2: + +_CET_ENDBR + vzeroupper + vmovdqa L$Three(%rip),%ymm0 + + vpxor %ymm2,%ymm2,%ymm2 + vpxor %ymm3,%ymm3,%ymm3 + + vmovdqa L$One(%rip),%ymm4 + vmovdqa L$Two(%rip),%ymm8 + vmovdqa L$Three(%rip),%ymm12 + + vmovd %edx,%xmm1 + vpermd %ymm1,%ymm2,%ymm1 + + + movq $21,%rax +L$select_loop_avx2_w7: + + vmovdqa 0(%rsi),%ymm5 + vmovdqa 32(%rsi),%ymm6 + + vmovdqa 64(%rsi),%ymm9 + vmovdqa 96(%rsi),%ymm10 + + vmovdqa 128(%rsi),%ymm13 + vmovdqa 160(%rsi),%ymm14 + + vpcmpeqd %ymm1,%ymm4,%ymm7 + vpcmpeqd %ymm1,%ymm8,%ymm11 + vpcmpeqd %ymm1,%ymm12,%ymm15 + + vpaddd %ymm0,%ymm4,%ymm4 + vpaddd %ymm0,%ymm8,%ymm8 + vpaddd %ymm0,%ymm12,%ymm12 + leaq 192(%rsi),%rsi + + vpand %ymm7,%ymm5,%ymm5 + vpand %ymm7,%ymm6,%ymm6 + vpand %ymm11,%ymm9,%ymm9 + vpand %ymm11,%ymm10,%ymm10 + vpand %ymm15,%ymm13,%ymm13 + vpand %ymm15,%ymm14,%ymm14 + + vpxor %ymm5,%ymm2,%ymm2 + vpxor %ymm6,%ymm3,%ymm3 + vpxor %ymm9,%ymm2,%ymm2 + vpxor %ymm10,%ymm3,%ymm3 + vpxor %ymm13,%ymm2,%ymm2 + vpxor %ymm14,%ymm3,%ymm3 + + decq %rax + jnz L$select_loop_avx2_w7 + + + vmovdqa 0(%rsi),%ymm5 + vmovdqa 32(%rsi),%ymm6 + + vpcmpeqd %ymm1,%ymm4,%ymm7 + + vpand %ymm7,%ymm5,%ymm5 + vpand %ymm7,%ymm6,%ymm6 + + vpxor %ymm5,%ymm2,%ymm2 + vpxor %ymm6,%ymm3,%ymm3 + + vmovdqu %ymm2,0(%rdi) + vmovdqu %ymm3,32(%rdi) + vzeroupper + ret + +L$SEH_end_ecp_nistz256_select_w7_avx2: + + +.p2align 5 +__ecp_nistz256_add_toq: + + xorq %r11,%r11 + addq 0(%rbx),%r12 + adcq 8(%rbx),%r13 + movq %r12,%rax + adcq 16(%rbx),%r8 + adcq 24(%rbx),%r9 + movq %r13,%rbp + adcq $0,%r11 + + subq $-1,%r12 + movq %r8,%rcx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%r10 + sbbq %r15,%r9 + sbbq $0,%r11 + + cmovcq %rax,%r12 + cmovcq %rbp,%r13 + movq %r12,0(%rdi) + cmovcq %rcx,%r8 + movq %r13,8(%rdi) + cmovcq %r10,%r9 + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + ret + + + + +.p2align 5 +__ecp_nistz256_sub_fromq: + + subq 0(%rbx),%r12 + sbbq 8(%rbx),%r13 + movq %r12,%rax + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + movq %r13,%rbp + sbbq %r11,%r11 + + addq $-1,%r12 + movq %r8,%rcx + adcq %r14,%r13 + adcq $0,%r8 + movq %r9,%r10 + adcq %r15,%r9 + testq %r11,%r11 + + cmovzq %rax,%r12 + cmovzq %rbp,%r13 + movq %r12,0(%rdi) + cmovzq %rcx,%r8 + movq %r13,8(%rdi) + cmovzq 
%r10,%r9 + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + ret + + + + +.p2align 5 +__ecp_nistz256_subq: + + subq %r12,%rax + sbbq %r13,%rbp + movq %rax,%r12 + sbbq %r8,%rcx + sbbq %r9,%r10 + movq %rbp,%r13 + sbbq %r11,%r11 + + addq $-1,%rax + movq %rcx,%r8 + adcq %r14,%rbp + adcq $0,%rcx + movq %r10,%r9 + adcq %r15,%r10 + testq %r11,%r11 + + cmovnzq %rax,%r12 + cmovnzq %rbp,%r13 + cmovnzq %rcx,%r8 + cmovnzq %r10,%r9 + + ret + + + + +.p2align 5 +__ecp_nistz256_mul_by_2q: + + xorq %r11,%r11 + addq %r12,%r12 + adcq %r13,%r13 + movq %r12,%rax + adcq %r8,%r8 + adcq %r9,%r9 + movq %r13,%rbp + adcq $0,%r11 + + subq $-1,%r12 + movq %r8,%rcx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%r10 + sbbq %r15,%r9 + sbbq $0,%r11 + + cmovcq %rax,%r12 + cmovcq %rbp,%r13 + movq %r12,0(%rdi) + cmovcq %rcx,%r8 + movq %r13,8(%rdi) + cmovcq %r10,%r9 + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + ret + + +.globl _ecp_nistz256_point_double_nohw +.private_extern _ecp_nistz256_point_double_nohw + +.p2align 5 +_ecp_nistz256_point_double_nohw: + +_CET_ENDBR + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $160+8,%rsp + +L$point_doubleq_body: + +L$point_double_shortcutq: + movdqu 0(%rsi),%xmm0 + movq %rsi,%rbx + movdqu 16(%rsi),%xmm1 + movq 32+0(%rsi),%r12 + movq 32+8(%rsi),%r13 + movq 32+16(%rsi),%r8 + movq 32+24(%rsi),%r9 + movq L$poly+8(%rip),%r14 + movq L$poly+24(%rip),%r15 + movdqa %xmm0,96(%rsp) + movdqa %xmm1,96+16(%rsp) + leaq 32(%rdi),%r10 + leaq 64(%rdi),%r11 +.byte 102,72,15,110,199 +.byte 102,73,15,110,202 +.byte 102,73,15,110,211 + + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_by_2q + + movq 64+0(%rsi),%rax + movq 64+8(%rsi),%r14 + movq 64+16(%rsi),%r15 + movq 64+24(%rsi),%r8 + leaq 64-0(%rsi),%rsi + leaq 64(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + movq 0+0(%rsp),%rax + movq 8+0(%rsp),%r14 + leaq 0+0(%rsp),%rsi + movq 16+0(%rsp),%r15 + movq 24+0(%rsp),%r8 + leaq 0(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + movq 32(%rbx),%rax + movq 64+0(%rbx),%r9 + movq 64+8(%rbx),%r10 + movq 64+16(%rbx),%r11 + movq 64+24(%rbx),%r12 + leaq 64-0(%rbx),%rsi + leaq 32(%rbx),%rbx +.byte 102,72,15,126,215 + call __ecp_nistz256_mul_montq + call __ecp_nistz256_mul_by_2q + + movq 96+0(%rsp),%r12 + movq 96+8(%rsp),%r13 + leaq 64(%rsp),%rbx + movq 96+16(%rsp),%r8 + movq 96+24(%rsp),%r9 + leaq 32(%rsp),%rdi + call __ecp_nistz256_add_toq + + movq 96+0(%rsp),%r12 + movq 96+8(%rsp),%r13 + leaq 64(%rsp),%rbx + movq 96+16(%rsp),%r8 + movq 96+24(%rsp),%r9 + leaq 64(%rsp),%rdi + call __ecp_nistz256_sub_fromq + + movq 0+0(%rsp),%rax + movq 8+0(%rsp),%r14 + leaq 0+0(%rsp),%rsi + movq 16+0(%rsp),%r15 + movq 24+0(%rsp),%r8 +.byte 102,72,15,126,207 + call __ecp_nistz256_sqr_montq + xorq %r9,%r9 + movq %r12,%rax + addq $-1,%r12 + movq %r13,%r10 + adcq %rsi,%r13 + movq %r14,%rcx + adcq $0,%r14 + movq %r15,%r8 + adcq %rbp,%r15 + adcq $0,%r9 + xorq %rsi,%rsi + testq $1,%rax + + cmovzq %rax,%r12 + cmovzq %r10,%r13 + cmovzq %rcx,%r14 + cmovzq %r8,%r15 + cmovzq %rsi,%r9 + + movq %r13,%rax + shrq $1,%r12 + shlq $63,%rax + movq %r14,%r10 + shrq $1,%r13 + orq %rax,%r12 + shlq $63,%r10 + movq %r15,%rcx + shrq $1,%r14 + orq %r10,%r13 + shlq $63,%rcx + movq %r12,0(%rdi) + shrq $1,%r15 + movq %r13,8(%rdi) + shlq $63,%r9 + orq %rcx,%r14 + orq %r9,%r15 + movq %r14,16(%rdi) + movq %r15,24(%rdi) + movq 64(%rsp),%rax + leaq 64(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq 0+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 32(%rsp),%rdi + call __ecp_nistz256_mul_montq + + leaq 
128(%rsp),%rdi + call __ecp_nistz256_mul_by_2q + + leaq 32(%rsp),%rbx + leaq 32(%rsp),%rdi + call __ecp_nistz256_add_toq + + movq 96(%rsp),%rax + leaq 96(%rsp),%rbx + movq 0+0(%rsp),%r9 + movq 8+0(%rsp),%r10 + leaq 0+0(%rsp),%rsi + movq 16+0(%rsp),%r11 + movq 24+0(%rsp),%r12 + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_montq + + leaq 128(%rsp),%rdi + call __ecp_nistz256_mul_by_2q + + movq 0+32(%rsp),%rax + movq 8+32(%rsp),%r14 + leaq 0+32(%rsp),%rsi + movq 16+32(%rsp),%r15 + movq 24+32(%rsp),%r8 +.byte 102,72,15,126,199 + call __ecp_nistz256_sqr_montq + + leaq 128(%rsp),%rbx + movq %r14,%r8 + movq %r15,%r9 + movq %rsi,%r14 + movq %rbp,%r15 + call __ecp_nistz256_sub_fromq + + movq 0+0(%rsp),%rax + movq 0+8(%rsp),%rbp + movq 0+16(%rsp),%rcx + movq 0+24(%rsp),%r10 + leaq 0(%rsp),%rdi + call __ecp_nistz256_subq + + movq 32(%rsp),%rax + leaq 32(%rsp),%rbx + movq %r12,%r14 + xorl %ecx,%ecx + movq %r12,0+0(%rsp) + movq %r13,%r10 + movq %r13,0+8(%rsp) + cmovzq %r8,%r11 + movq %r8,0+16(%rsp) + leaq 0-0(%rsp),%rsi + cmovzq %r9,%r12 + movq %r9,0+24(%rsp) + movq %r14,%r9 + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_montq + +.byte 102,72,15,126,203 +.byte 102,72,15,126,207 + call __ecp_nistz256_sub_fromq + + leaq 160+56(%rsp),%rsi + + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbx + + movq -8(%rsi),%rbp + + leaq (%rsi),%rsp + +L$point_doubleq_epilogue: + ret + + +.globl _ecp_nistz256_point_add_nohw +.private_extern _ecp_nistz256_point_add_nohw + +.p2align 5 +_ecp_nistz256_point_add_nohw: + +_CET_ENDBR + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $576+8,%rsp + +L$point_addq_body: + + movdqu 0(%rsi),%xmm0 + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 + movdqu 48(%rsi),%xmm3 + movdqu 64(%rsi),%xmm4 + movdqu 80(%rsi),%xmm5 + movq %rsi,%rbx + movq %rdx,%rsi + movdqa %xmm0,384(%rsp) + movdqa %xmm1,384+16(%rsp) + movdqa %xmm2,416(%rsp) + movdqa %xmm3,416+16(%rsp) + movdqa %xmm4,448(%rsp) + movdqa %xmm5,448+16(%rsp) + por %xmm4,%xmm5 + + movdqu 0(%rsi),%xmm0 + pshufd $0xb1,%xmm5,%xmm3 + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 + por %xmm3,%xmm5 + movdqu 48(%rsi),%xmm3 + movq 64+0(%rsi),%rax + movq 64+8(%rsi),%r14 + movq 64+16(%rsi),%r15 + movq 64+24(%rsi),%r8 + movdqa %xmm0,480(%rsp) + pshufd $0x1e,%xmm5,%xmm4 + movdqa %xmm1,480+16(%rsp) + movdqu 64(%rsi),%xmm0 + movdqu 80(%rsi),%xmm1 + movdqa %xmm2,512(%rsp) + movdqa %xmm3,512+16(%rsp) + por %xmm4,%xmm5 + pxor %xmm4,%xmm4 + por %xmm0,%xmm1 +.byte 102,72,15,110,199 + + leaq 64-0(%rsi),%rsi + movq %rax,544+0(%rsp) + movq %r14,544+8(%rsp) + movq %r15,544+16(%rsp) + movq %r8,544+24(%rsp) + leaq 96(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + pcmpeqd %xmm4,%xmm5 + pshufd $0xb1,%xmm1,%xmm4 + por %xmm1,%xmm4 + pshufd $0,%xmm5,%xmm5 + pshufd $0x1e,%xmm4,%xmm3 + por %xmm3,%xmm4 + pxor %xmm3,%xmm3 + pcmpeqd %xmm3,%xmm4 + pshufd $0,%xmm4,%xmm4 + movq 64+0(%rbx),%rax + movq 64+8(%rbx),%r14 + movq 64+16(%rbx),%r15 + movq 64+24(%rbx),%r8 +.byte 102,72,15,110,203 + + leaq 64-0(%rbx),%rsi + leaq 32(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + movq 544(%rsp),%rax + leaq 544(%rsp),%rbx + movq 0+96(%rsp),%r9 + movq 8+96(%rsp),%r10 + leaq 0+96(%rsp),%rsi + movq 16+96(%rsp),%r11 + movq 24+96(%rsp),%r12 + leaq 224(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 448(%rsp),%rax + leaq 448(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq 0+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 256(%rsp),%rdi + call 
__ecp_nistz256_mul_montq + + movq 416(%rsp),%rax + leaq 416(%rsp),%rbx + movq 0+224(%rsp),%r9 + movq 8+224(%rsp),%r10 + leaq 0+224(%rsp),%rsi + movq 16+224(%rsp),%r11 + movq 24+224(%rsp),%r12 + leaq 224(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 512(%rsp),%rax + leaq 512(%rsp),%rbx + movq 0+256(%rsp),%r9 + movq 8+256(%rsp),%r10 + leaq 0+256(%rsp),%rsi + movq 16+256(%rsp),%r11 + movq 24+256(%rsp),%r12 + leaq 256(%rsp),%rdi + call __ecp_nistz256_mul_montq + + leaq 224(%rsp),%rbx + leaq 64(%rsp),%rdi + call __ecp_nistz256_sub_fromq + + orq %r13,%r12 + movdqa %xmm4,%xmm2 + orq %r8,%r12 + orq %r9,%r12 + por %xmm5,%xmm2 +.byte 102,73,15,110,220 + + movq 384(%rsp),%rax + leaq 384(%rsp),%rbx + movq 0+96(%rsp),%r9 + movq 8+96(%rsp),%r10 + leaq 0+96(%rsp),%rsi + movq 16+96(%rsp),%r11 + movq 24+96(%rsp),%r12 + leaq 160(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 480(%rsp),%rax + leaq 480(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq 0+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 192(%rsp),%rdi + call __ecp_nistz256_mul_montq + + leaq 160(%rsp),%rbx + leaq 0(%rsp),%rdi + call __ecp_nistz256_sub_fromq + + orq %r13,%r12 + orq %r8,%r12 + orq %r9,%r12 + +.byte 102,73,15,126,208 +.byte 102,73,15,126,217 + orq %r8,%r12 +.byte 0x3e + jnz L$add_proceedq + + + + testq %r9,%r9 + jz L$add_doubleq + + + + + + +.byte 102,72,15,126,199 + pxor %xmm0,%xmm0 + movdqu %xmm0,0(%rdi) + movdqu %xmm0,16(%rdi) + movdqu %xmm0,32(%rdi) + movdqu %xmm0,48(%rdi) + movdqu %xmm0,64(%rdi) + movdqu %xmm0,80(%rdi) + jmp L$add_doneq + +.p2align 5 +L$add_doubleq: +.byte 102,72,15,126,206 +.byte 102,72,15,126,199 + addq $416,%rsp + + jmp L$point_double_shortcutq + + +.p2align 5 +L$add_proceedq: + movq 0+64(%rsp),%rax + movq 8+64(%rsp),%r14 + leaq 0+64(%rsp),%rsi + movq 16+64(%rsp),%r15 + movq 24+64(%rsp),%r8 + leaq 96(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + movq 448(%rsp),%rax + leaq 448(%rsp),%rbx + movq 0+0(%rsp),%r9 + movq 8+0(%rsp),%r10 + leaq 0+0(%rsp),%rsi + movq 16+0(%rsp),%r11 + movq 24+0(%rsp),%r12 + leaq 352(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 0+0(%rsp),%rax + movq 8+0(%rsp),%r14 + leaq 0+0(%rsp),%rsi + movq 16+0(%rsp),%r15 + movq 24+0(%rsp),%r8 + leaq 32(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + movq 544(%rsp),%rax + leaq 544(%rsp),%rbx + movq 0+352(%rsp),%r9 + movq 8+352(%rsp),%r10 + leaq 0+352(%rsp),%rsi + movq 16+352(%rsp),%r11 + movq 24+352(%rsp),%r12 + leaq 352(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 0(%rsp),%rax + leaq 0(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq 0+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 128(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 160(%rsp),%rax + leaq 160(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq 0+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 192(%rsp),%rdi + call __ecp_nistz256_mul_montq + + + + + xorq %r11,%r11 + addq %r12,%r12 + leaq 96(%rsp),%rsi + adcq %r13,%r13 + movq %r12,%rax + adcq %r8,%r8 + adcq %r9,%r9 + movq %r13,%rbp + adcq $0,%r11 + + subq $-1,%r12 + movq %r8,%rcx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%r10 + sbbq %r15,%r9 + sbbq $0,%r11 + + cmovcq %rax,%r12 + movq 0(%rsi),%rax + cmovcq %rbp,%r13 + movq 8(%rsi),%rbp + cmovcq %rcx,%r8 + movq 16(%rsi),%rcx + cmovcq %r10,%r9 + movq 24(%rsi),%r10 + + call __ecp_nistz256_subq + + leaq 128(%rsp),%rbx + leaq 288(%rsp),%rdi + call __ecp_nistz256_sub_fromq + + movq 192+0(%rsp),%rax + movq 192+8(%rsp),%rbp + movq 192+16(%rsp),%rcx + movq 
192+24(%rsp),%r10 + leaq 320(%rsp),%rdi + + call __ecp_nistz256_subq + + movq %r12,0(%rdi) + movq %r13,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + movq 128(%rsp),%rax + leaq 128(%rsp),%rbx + movq 0+224(%rsp),%r9 + movq 8+224(%rsp),%r10 + leaq 0+224(%rsp),%rsi + movq 16+224(%rsp),%r11 + movq 24+224(%rsp),%r12 + leaq 256(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 320(%rsp),%rax + leaq 320(%rsp),%rbx + movq 0+64(%rsp),%r9 + movq 8+64(%rsp),%r10 + leaq 0+64(%rsp),%rsi + movq 16+64(%rsp),%r11 + movq 24+64(%rsp),%r12 + leaq 320(%rsp),%rdi + call __ecp_nistz256_mul_montq + + leaq 256(%rsp),%rbx + leaq 320(%rsp),%rdi + call __ecp_nistz256_sub_fromq + +.byte 102,72,15,126,199 + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 352(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 352+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 544(%rsp),%xmm2 + pand 544+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 448(%rsp),%xmm2 + pand 448+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,64(%rdi) + movdqu %xmm3,80(%rdi) + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 288(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 288+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 480(%rsp),%xmm2 + pand 480+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 384(%rsp),%xmm2 + pand 384+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,0(%rdi) + movdqu %xmm3,16(%rdi) + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 320(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 320+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 512(%rsp),%xmm2 + pand 512+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 416(%rsp),%xmm2 + pand 416+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,32(%rdi) + movdqu %xmm3,48(%rdi) + +L$add_doneq: + leaq 576+56(%rsp),%rsi + + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbx + + movq -8(%rsi),%rbp + + leaq (%rsi),%rsp + +L$point_addq_epilogue: + ret + + +.globl _ecp_nistz256_point_add_affine_nohw +.private_extern _ecp_nistz256_point_add_affine_nohw + +.p2align 5 +_ecp_nistz256_point_add_affine_nohw: + +_CET_ENDBR + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $480+8,%rsp + +L$add_affineq_body: + + movdqu 0(%rsi),%xmm0 + movq %rdx,%rbx + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 + movdqu 48(%rsi),%xmm3 + movdqu 64(%rsi),%xmm4 + movdqu 80(%rsi),%xmm5 + movq 64+0(%rsi),%rax + movq 64+8(%rsi),%r14 + movq 64+16(%rsi),%r15 + movq 64+24(%rsi),%r8 + movdqa %xmm0,320(%rsp) + movdqa %xmm1,320+16(%rsp) + movdqa %xmm2,352(%rsp) + movdqa %xmm3,352+16(%rsp) + movdqa %xmm4,384(%rsp) + movdqa %xmm5,384+16(%rsp) + por %xmm4,%xmm5 + + movdqu 0(%rbx),%xmm0 + pshufd $0xb1,%xmm5,%xmm3 + movdqu 16(%rbx),%xmm1 + movdqu 32(%rbx),%xmm2 + por %xmm3,%xmm5 + movdqu 48(%rbx),%xmm3 + movdqa %xmm0,416(%rsp) + pshufd $0x1e,%xmm5,%xmm4 + movdqa %xmm1,416+16(%rsp) + por %xmm0,%xmm1 +.byte 102,72,15,110,199 + movdqa %xmm2,448(%rsp) + movdqa %xmm3,448+16(%rsp) + por %xmm2,%xmm3 + por %xmm4,%xmm5 + pxor %xmm4,%xmm4 + por %xmm1,%xmm3 + + leaq 64-0(%rsi),%rsi + leaq 32(%rsp),%rdi + call 
__ecp_nistz256_sqr_montq + + pcmpeqd %xmm4,%xmm5 + pshufd $0xb1,%xmm3,%xmm4 + movq 0(%rbx),%rax + + movq %r12,%r9 + por %xmm3,%xmm4 + pshufd $0,%xmm5,%xmm5 + pshufd $0x1e,%xmm4,%xmm3 + movq %r13,%r10 + por %xmm3,%xmm4 + pxor %xmm3,%xmm3 + movq %r14,%r11 + pcmpeqd %xmm3,%xmm4 + pshufd $0,%xmm4,%xmm4 + + leaq 32-0(%rsp),%rsi + movq %r15,%r12 + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_montq + + leaq 320(%rsp),%rbx + leaq 64(%rsp),%rdi + call __ecp_nistz256_sub_fromq + + movq 384(%rsp),%rax + leaq 384(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq 0+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 32(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 384(%rsp),%rax + leaq 384(%rsp),%rbx + movq 0+64(%rsp),%r9 + movq 8+64(%rsp),%r10 + leaq 0+64(%rsp),%rsi + movq 16+64(%rsp),%r11 + movq 24+64(%rsp),%r12 + leaq 288(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 448(%rsp),%rax + leaq 448(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq 0+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 32(%rsp),%rdi + call __ecp_nistz256_mul_montq + + leaq 352(%rsp),%rbx + leaq 96(%rsp),%rdi + call __ecp_nistz256_sub_fromq + + movq 0+64(%rsp),%rax + movq 8+64(%rsp),%r14 + leaq 0+64(%rsp),%rsi + movq 16+64(%rsp),%r15 + movq 24+64(%rsp),%r8 + leaq 128(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + movq 0+96(%rsp),%rax + movq 8+96(%rsp),%r14 + leaq 0+96(%rsp),%rsi + movq 16+96(%rsp),%r15 + movq 24+96(%rsp),%r8 + leaq 192(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + movq 128(%rsp),%rax + leaq 128(%rsp),%rbx + movq 0+64(%rsp),%r9 + movq 8+64(%rsp),%r10 + leaq 0+64(%rsp),%rsi + movq 16+64(%rsp),%r11 + movq 24+64(%rsp),%r12 + leaq 160(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 320(%rsp),%rax + leaq 320(%rsp),%rbx + movq 0+128(%rsp),%r9 + movq 8+128(%rsp),%r10 + leaq 0+128(%rsp),%rsi + movq 16+128(%rsp),%r11 + movq 24+128(%rsp),%r12 + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_montq + + + + + xorq %r11,%r11 + addq %r12,%r12 + leaq 192(%rsp),%rsi + adcq %r13,%r13 + movq %r12,%rax + adcq %r8,%r8 + adcq %r9,%r9 + movq %r13,%rbp + adcq $0,%r11 + + subq $-1,%r12 + movq %r8,%rcx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%r10 + sbbq %r15,%r9 + sbbq $0,%r11 + + cmovcq %rax,%r12 + movq 0(%rsi),%rax + cmovcq %rbp,%r13 + movq 8(%rsi),%rbp + cmovcq %rcx,%r8 + movq 16(%rsi),%rcx + cmovcq %r10,%r9 + movq 24(%rsi),%r10 + + call __ecp_nistz256_subq + + leaq 160(%rsp),%rbx + leaq 224(%rsp),%rdi + call __ecp_nistz256_sub_fromq + + movq 0+0(%rsp),%rax + movq 0+8(%rsp),%rbp + movq 0+16(%rsp),%rcx + movq 0+24(%rsp),%r10 + leaq 64(%rsp),%rdi + + call __ecp_nistz256_subq + + movq %r12,0(%rdi) + movq %r13,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + movq 352(%rsp),%rax + leaq 352(%rsp),%rbx + movq 0+160(%rsp),%r9 + movq 8+160(%rsp),%r10 + leaq 0+160(%rsp),%rsi + movq 16+160(%rsp),%r11 + movq 24+160(%rsp),%r12 + leaq 32(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 96(%rsp),%rax + leaq 96(%rsp),%rbx + movq 0+64(%rsp),%r9 + movq 8+64(%rsp),%r10 + leaq 0+64(%rsp),%rsi + movq 16+64(%rsp),%r11 + movq 24+64(%rsp),%r12 + leaq 64(%rsp),%rdi + call __ecp_nistz256_mul_montq + + leaq 32(%rsp),%rbx + leaq 256(%rsp),%rdi + call __ecp_nistz256_sub_fromq + +.byte 102,72,15,126,199 + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 288(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 288+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand L$ONE_mont(%rip),%xmm2 + pand L$ONE_mont+16(%rip),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + 
pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 384(%rsp),%xmm2 + pand 384+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,64(%rdi) + movdqu %xmm3,80(%rdi) + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 224(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 224+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 416(%rsp),%xmm2 + pand 416+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 320(%rsp),%xmm2 + pand 320+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,0(%rdi) + movdqu %xmm3,16(%rdi) + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 256(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 256+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 448(%rsp),%xmm2 + pand 448+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 352(%rsp),%xmm2 + pand 352+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,32(%rdi) + movdqu %xmm3,48(%rdi) + + leaq 480+56(%rsp),%rsi + + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbx + + movq -8(%rsi),%rbp + + leaq (%rsi),%rsp + +L$add_affineq_epilogue: + ret + + + +.p2align 5 +__ecp_nistz256_add_tox: + + xorq %r11,%r11 + adcq 0(%rbx),%r12 + adcq 8(%rbx),%r13 + movq %r12,%rax + adcq 16(%rbx),%r8 + adcq 24(%rbx),%r9 + movq %r13,%rbp + adcq $0,%r11 + + xorq %r10,%r10 + sbbq $-1,%r12 + movq %r8,%rcx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%r10 + sbbq %r15,%r9 + sbbq $0,%r11 + + cmovcq %rax,%r12 + cmovcq %rbp,%r13 + movq %r12,0(%rdi) + cmovcq %rcx,%r8 + movq %r13,8(%rdi) + cmovcq %r10,%r9 + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + ret + + + + +.p2align 5 +__ecp_nistz256_sub_fromx: + + xorq %r11,%r11 + sbbq 0(%rbx),%r12 + sbbq 8(%rbx),%r13 + movq %r12,%rax + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + movq %r13,%rbp + sbbq $0,%r11 + + xorq %r10,%r10 + adcq $-1,%r12 + movq %r8,%rcx + adcq %r14,%r13 + adcq $0,%r8 + movq %r9,%r10 + adcq %r15,%r9 + + btq $0,%r11 + cmovncq %rax,%r12 + cmovncq %rbp,%r13 + movq %r12,0(%rdi) + cmovncq %rcx,%r8 + movq %r13,8(%rdi) + cmovncq %r10,%r9 + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + ret + + + + +.p2align 5 +__ecp_nistz256_subx: + + xorq %r11,%r11 + sbbq %r12,%rax + sbbq %r13,%rbp + movq %rax,%r12 + sbbq %r8,%rcx + sbbq %r9,%r10 + movq %rbp,%r13 + sbbq $0,%r11 + + xorq %r9,%r9 + adcq $-1,%rax + movq %rcx,%r8 + adcq %r14,%rbp + adcq $0,%rcx + movq %r10,%r9 + adcq %r15,%r10 + + btq $0,%r11 + cmovcq %rax,%r12 + cmovcq %rbp,%r13 + cmovcq %rcx,%r8 + cmovcq %r10,%r9 + + ret + + + + +.p2align 5 +__ecp_nistz256_mul_by_2x: + + xorq %r11,%r11 + adcq %r12,%r12 + adcq %r13,%r13 + movq %r12,%rax + adcq %r8,%r8 + adcq %r9,%r9 + movq %r13,%rbp + adcq $0,%r11 + + xorq %r10,%r10 + sbbq $-1,%r12 + movq %r8,%rcx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%r10 + sbbq %r15,%r9 + sbbq $0,%r11 + + cmovcq %rax,%r12 + cmovcq %rbp,%r13 + movq %r12,0(%rdi) + cmovcq %rcx,%r8 + movq %r13,8(%rdi) + cmovcq %r10,%r9 + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + ret + + +.globl _ecp_nistz256_point_double_adx +.private_extern _ecp_nistz256_point_double_adx + +.p2align 5 +_ecp_nistz256_point_double_adx: + +_CET_ENDBR + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $160+8,%rsp + +L$point_doublex_body: + +L$point_double_shortcutx: + 
movdqu 0(%rsi),%xmm0 + movq %rsi,%rbx + movdqu 16(%rsi),%xmm1 + movq 32+0(%rsi),%r12 + movq 32+8(%rsi),%r13 + movq 32+16(%rsi),%r8 + movq 32+24(%rsi),%r9 + movq L$poly+8(%rip),%r14 + movq L$poly+24(%rip),%r15 + movdqa %xmm0,96(%rsp) + movdqa %xmm1,96+16(%rsp) + leaq 32(%rdi),%r10 + leaq 64(%rdi),%r11 +.byte 102,72,15,110,199 +.byte 102,73,15,110,202 +.byte 102,73,15,110,211 + + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_by_2x + + movq 64+0(%rsi),%rdx + movq 64+8(%rsi),%r14 + movq 64+16(%rsi),%r15 + movq 64+24(%rsi),%r8 + leaq 64-128(%rsi),%rsi + leaq 64(%rsp),%rdi + call __ecp_nistz256_sqr_montx + + movq 0+0(%rsp),%rdx + movq 8+0(%rsp),%r14 + leaq -128+0(%rsp),%rsi + movq 16+0(%rsp),%r15 + movq 24+0(%rsp),%r8 + leaq 0(%rsp),%rdi + call __ecp_nistz256_sqr_montx + + movq 32(%rbx),%rdx + movq 64+0(%rbx),%r9 + movq 64+8(%rbx),%r10 + movq 64+16(%rbx),%r11 + movq 64+24(%rbx),%r12 + leaq 64-128(%rbx),%rsi + leaq 32(%rbx),%rbx +.byte 102,72,15,126,215 + call __ecp_nistz256_mul_montx + call __ecp_nistz256_mul_by_2x + + movq 96+0(%rsp),%r12 + movq 96+8(%rsp),%r13 + leaq 64(%rsp),%rbx + movq 96+16(%rsp),%r8 + movq 96+24(%rsp),%r9 + leaq 32(%rsp),%rdi + call __ecp_nistz256_add_tox + + movq 96+0(%rsp),%r12 + movq 96+8(%rsp),%r13 + leaq 64(%rsp),%rbx + movq 96+16(%rsp),%r8 + movq 96+24(%rsp),%r9 + leaq 64(%rsp),%rdi + call __ecp_nistz256_sub_fromx + + movq 0+0(%rsp),%rdx + movq 8+0(%rsp),%r14 + leaq -128+0(%rsp),%rsi + movq 16+0(%rsp),%r15 + movq 24+0(%rsp),%r8 +.byte 102,72,15,126,207 + call __ecp_nistz256_sqr_montx + xorq %r9,%r9 + movq %r12,%rax + addq $-1,%r12 + movq %r13,%r10 + adcq %rsi,%r13 + movq %r14,%rcx + adcq $0,%r14 + movq %r15,%r8 + adcq %rbp,%r15 + adcq $0,%r9 + xorq %rsi,%rsi + testq $1,%rax + + cmovzq %rax,%r12 + cmovzq %r10,%r13 + cmovzq %rcx,%r14 + cmovzq %r8,%r15 + cmovzq %rsi,%r9 + + movq %r13,%rax + shrq $1,%r12 + shlq $63,%rax + movq %r14,%r10 + shrq $1,%r13 + orq %rax,%r12 + shlq $63,%r10 + movq %r15,%rcx + shrq $1,%r14 + orq %r10,%r13 + shlq $63,%rcx + movq %r12,0(%rdi) + shrq $1,%r15 + movq %r13,8(%rdi) + shlq $63,%r9 + orq %rcx,%r14 + orq %r9,%r15 + movq %r14,16(%rdi) + movq %r15,24(%rdi) + movq 64(%rsp),%rdx + leaq 64(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq -128+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 32(%rsp),%rdi + call __ecp_nistz256_mul_montx + + leaq 128(%rsp),%rdi + call __ecp_nistz256_mul_by_2x + + leaq 32(%rsp),%rbx + leaq 32(%rsp),%rdi + call __ecp_nistz256_add_tox + + movq 96(%rsp),%rdx + leaq 96(%rsp),%rbx + movq 0+0(%rsp),%r9 + movq 8+0(%rsp),%r10 + leaq -128+0(%rsp),%rsi + movq 16+0(%rsp),%r11 + movq 24+0(%rsp),%r12 + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_montx + + leaq 128(%rsp),%rdi + call __ecp_nistz256_mul_by_2x + + movq 0+32(%rsp),%rdx + movq 8+32(%rsp),%r14 + leaq -128+32(%rsp),%rsi + movq 16+32(%rsp),%r15 + movq 24+32(%rsp),%r8 +.byte 102,72,15,126,199 + call __ecp_nistz256_sqr_montx + + leaq 128(%rsp),%rbx + movq %r14,%r8 + movq %r15,%r9 + movq %rsi,%r14 + movq %rbp,%r15 + call __ecp_nistz256_sub_fromx + + movq 0+0(%rsp),%rax + movq 0+8(%rsp),%rbp + movq 0+16(%rsp),%rcx + movq 0+24(%rsp),%r10 + leaq 0(%rsp),%rdi + call __ecp_nistz256_subx + + movq 32(%rsp),%rdx + leaq 32(%rsp),%rbx + movq %r12,%r14 + xorl %ecx,%ecx + movq %r12,0+0(%rsp) + movq %r13,%r10 + movq %r13,0+8(%rsp) + cmovzq %r8,%r11 + movq %r8,0+16(%rsp) + leaq 0-128(%rsp),%rsi + cmovzq %r9,%r12 + movq %r9,0+24(%rsp) + movq %r14,%r9 + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_montx + +.byte 102,72,15,126,203 +.byte 102,72,15,126,207 + 
call __ecp_nistz256_sub_fromx + + leaq 160+56(%rsp),%rsi + + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbx + + movq -8(%rsi),%rbp + + leaq (%rsi),%rsp + +L$point_doublex_epilogue: + ret + + +.globl _ecp_nistz256_point_add_adx +.private_extern _ecp_nistz256_point_add_adx + +.p2align 5 +_ecp_nistz256_point_add_adx: + +_CET_ENDBR + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $576+8,%rsp + +L$point_addx_body: + + movdqu 0(%rsi),%xmm0 + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 + movdqu 48(%rsi),%xmm3 + movdqu 64(%rsi),%xmm4 + movdqu 80(%rsi),%xmm5 + movq %rsi,%rbx + movq %rdx,%rsi + movdqa %xmm0,384(%rsp) + movdqa %xmm1,384+16(%rsp) + movdqa %xmm2,416(%rsp) + movdqa %xmm3,416+16(%rsp) + movdqa %xmm4,448(%rsp) + movdqa %xmm5,448+16(%rsp) + por %xmm4,%xmm5 + + movdqu 0(%rsi),%xmm0 + pshufd $0xb1,%xmm5,%xmm3 + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 + por %xmm3,%xmm5 + movdqu 48(%rsi),%xmm3 + movq 64+0(%rsi),%rdx + movq 64+8(%rsi),%r14 + movq 64+16(%rsi),%r15 + movq 64+24(%rsi),%r8 + movdqa %xmm0,480(%rsp) + pshufd $0x1e,%xmm5,%xmm4 + movdqa %xmm1,480+16(%rsp) + movdqu 64(%rsi),%xmm0 + movdqu 80(%rsi),%xmm1 + movdqa %xmm2,512(%rsp) + movdqa %xmm3,512+16(%rsp) + por %xmm4,%xmm5 + pxor %xmm4,%xmm4 + por %xmm0,%xmm1 +.byte 102,72,15,110,199 + + leaq 64-128(%rsi),%rsi + movq %rdx,544+0(%rsp) + movq %r14,544+8(%rsp) + movq %r15,544+16(%rsp) + movq %r8,544+24(%rsp) + leaq 96(%rsp),%rdi + call __ecp_nistz256_sqr_montx + + pcmpeqd %xmm4,%xmm5 + pshufd $0xb1,%xmm1,%xmm4 + por %xmm1,%xmm4 + pshufd $0,%xmm5,%xmm5 + pshufd $0x1e,%xmm4,%xmm3 + por %xmm3,%xmm4 + pxor %xmm3,%xmm3 + pcmpeqd %xmm3,%xmm4 + pshufd $0,%xmm4,%xmm4 + movq 64+0(%rbx),%rdx + movq 64+8(%rbx),%r14 + movq 64+16(%rbx),%r15 + movq 64+24(%rbx),%r8 +.byte 102,72,15,110,203 + + leaq 64-128(%rbx),%rsi + leaq 32(%rsp),%rdi + call __ecp_nistz256_sqr_montx + + movq 544(%rsp),%rdx + leaq 544(%rsp),%rbx + movq 0+96(%rsp),%r9 + movq 8+96(%rsp),%r10 + leaq -128+96(%rsp),%rsi + movq 16+96(%rsp),%r11 + movq 24+96(%rsp),%r12 + leaq 224(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 448(%rsp),%rdx + leaq 448(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq -128+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 256(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 416(%rsp),%rdx + leaq 416(%rsp),%rbx + movq 0+224(%rsp),%r9 + movq 8+224(%rsp),%r10 + leaq -128+224(%rsp),%rsi + movq 16+224(%rsp),%r11 + movq 24+224(%rsp),%r12 + leaq 224(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 512(%rsp),%rdx + leaq 512(%rsp),%rbx + movq 0+256(%rsp),%r9 + movq 8+256(%rsp),%r10 + leaq -128+256(%rsp),%rsi + movq 16+256(%rsp),%r11 + movq 24+256(%rsp),%r12 + leaq 256(%rsp),%rdi + call __ecp_nistz256_mul_montx + + leaq 224(%rsp),%rbx + leaq 64(%rsp),%rdi + call __ecp_nistz256_sub_fromx + + orq %r13,%r12 + movdqa %xmm4,%xmm2 + orq %r8,%r12 + orq %r9,%r12 + por %xmm5,%xmm2 +.byte 102,73,15,110,220 + + movq 384(%rsp),%rdx + leaq 384(%rsp),%rbx + movq 0+96(%rsp),%r9 + movq 8+96(%rsp),%r10 + leaq -128+96(%rsp),%rsi + movq 16+96(%rsp),%r11 + movq 24+96(%rsp),%r12 + leaq 160(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 480(%rsp),%rdx + leaq 480(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq -128+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 192(%rsp),%rdi + call __ecp_nistz256_mul_montx + + leaq 160(%rsp),%rbx + leaq 0(%rsp),%rdi + call __ecp_nistz256_sub_fromx + + 
orq %r13,%r12 + orq %r8,%r12 + orq %r9,%r12 + +.byte 102,73,15,126,208 +.byte 102,73,15,126,217 + orq %r8,%r12 +.byte 0x3e + jnz L$add_proceedx + + + + testq %r9,%r9 + jz L$add_doublex + + + + + + +.byte 102,72,15,126,199 + pxor %xmm0,%xmm0 + movdqu %xmm0,0(%rdi) + movdqu %xmm0,16(%rdi) + movdqu %xmm0,32(%rdi) + movdqu %xmm0,48(%rdi) + movdqu %xmm0,64(%rdi) + movdqu %xmm0,80(%rdi) + jmp L$add_donex + +.p2align 5 +L$add_doublex: +.byte 102,72,15,126,206 +.byte 102,72,15,126,199 + addq $416,%rsp + + jmp L$point_double_shortcutx + + +.p2align 5 +L$add_proceedx: + movq 0+64(%rsp),%rdx + movq 8+64(%rsp),%r14 + leaq -128+64(%rsp),%rsi + movq 16+64(%rsp),%r15 + movq 24+64(%rsp),%r8 + leaq 96(%rsp),%rdi + call __ecp_nistz256_sqr_montx + + movq 448(%rsp),%rdx + leaq 448(%rsp),%rbx + movq 0+0(%rsp),%r9 + movq 8+0(%rsp),%r10 + leaq -128+0(%rsp),%rsi + movq 16+0(%rsp),%r11 + movq 24+0(%rsp),%r12 + leaq 352(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 0+0(%rsp),%rdx + movq 8+0(%rsp),%r14 + leaq -128+0(%rsp),%rsi + movq 16+0(%rsp),%r15 + movq 24+0(%rsp),%r8 + leaq 32(%rsp),%rdi + call __ecp_nistz256_sqr_montx + + movq 544(%rsp),%rdx + leaq 544(%rsp),%rbx + movq 0+352(%rsp),%r9 + movq 8+352(%rsp),%r10 + leaq -128+352(%rsp),%rsi + movq 16+352(%rsp),%r11 + movq 24+352(%rsp),%r12 + leaq 352(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 0(%rsp),%rdx + leaq 0(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq -128+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 128(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 160(%rsp),%rdx + leaq 160(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq -128+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 192(%rsp),%rdi + call __ecp_nistz256_mul_montx + + + + + xorq %r11,%r11 + addq %r12,%r12 + leaq 96(%rsp),%rsi + adcq %r13,%r13 + movq %r12,%rax + adcq %r8,%r8 + adcq %r9,%r9 + movq %r13,%rbp + adcq $0,%r11 + + subq $-1,%r12 + movq %r8,%rcx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%r10 + sbbq %r15,%r9 + sbbq $0,%r11 + + cmovcq %rax,%r12 + movq 0(%rsi),%rax + cmovcq %rbp,%r13 + movq 8(%rsi),%rbp + cmovcq %rcx,%r8 + movq 16(%rsi),%rcx + cmovcq %r10,%r9 + movq 24(%rsi),%r10 + + call __ecp_nistz256_subx + + leaq 128(%rsp),%rbx + leaq 288(%rsp),%rdi + call __ecp_nistz256_sub_fromx + + movq 192+0(%rsp),%rax + movq 192+8(%rsp),%rbp + movq 192+16(%rsp),%rcx + movq 192+24(%rsp),%r10 + leaq 320(%rsp),%rdi + + call __ecp_nistz256_subx + + movq %r12,0(%rdi) + movq %r13,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + movq 128(%rsp),%rdx + leaq 128(%rsp),%rbx + movq 0+224(%rsp),%r9 + movq 8+224(%rsp),%r10 + leaq -128+224(%rsp),%rsi + movq 16+224(%rsp),%r11 + movq 24+224(%rsp),%r12 + leaq 256(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 320(%rsp),%rdx + leaq 320(%rsp),%rbx + movq 0+64(%rsp),%r9 + movq 8+64(%rsp),%r10 + leaq -128+64(%rsp),%rsi + movq 16+64(%rsp),%r11 + movq 24+64(%rsp),%r12 + leaq 320(%rsp),%rdi + call __ecp_nistz256_mul_montx + + leaq 256(%rsp),%rbx + leaq 320(%rsp),%rdi + call __ecp_nistz256_sub_fromx + +.byte 102,72,15,126,199 + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 352(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 352+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 544(%rsp),%xmm2 + pand 544+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 448(%rsp),%xmm2 + pand 448+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,64(%rdi) + 
movdqu %xmm3,80(%rdi) + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 288(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 288+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 480(%rsp),%xmm2 + pand 480+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 384(%rsp),%xmm2 + pand 384+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,0(%rdi) + movdqu %xmm3,16(%rdi) + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 320(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 320+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 512(%rsp),%xmm2 + pand 512+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 416(%rsp),%xmm2 + pand 416+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,32(%rdi) + movdqu %xmm3,48(%rdi) + +L$add_donex: + leaq 576+56(%rsp),%rsi + + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbx + + movq -8(%rsi),%rbp + + leaq (%rsi),%rsp + +L$point_addx_epilogue: + ret + + +.globl _ecp_nistz256_point_add_affine_adx +.private_extern _ecp_nistz256_point_add_affine_adx + +.p2align 5 +_ecp_nistz256_point_add_affine_adx: + +_CET_ENDBR + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $480+8,%rsp + +L$add_affinex_body: + + movdqu 0(%rsi),%xmm0 + movq %rdx,%rbx + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 + movdqu 48(%rsi),%xmm3 + movdqu 64(%rsi),%xmm4 + movdqu 80(%rsi),%xmm5 + movq 64+0(%rsi),%rdx + movq 64+8(%rsi),%r14 + movq 64+16(%rsi),%r15 + movq 64+24(%rsi),%r8 + movdqa %xmm0,320(%rsp) + movdqa %xmm1,320+16(%rsp) + movdqa %xmm2,352(%rsp) + movdqa %xmm3,352+16(%rsp) + movdqa %xmm4,384(%rsp) + movdqa %xmm5,384+16(%rsp) + por %xmm4,%xmm5 + + movdqu 0(%rbx),%xmm0 + pshufd $0xb1,%xmm5,%xmm3 + movdqu 16(%rbx),%xmm1 + movdqu 32(%rbx),%xmm2 + por %xmm3,%xmm5 + movdqu 48(%rbx),%xmm3 + movdqa %xmm0,416(%rsp) + pshufd $0x1e,%xmm5,%xmm4 + movdqa %xmm1,416+16(%rsp) + por %xmm0,%xmm1 +.byte 102,72,15,110,199 + movdqa %xmm2,448(%rsp) + movdqa %xmm3,448+16(%rsp) + por %xmm2,%xmm3 + por %xmm4,%xmm5 + pxor %xmm4,%xmm4 + por %xmm1,%xmm3 + + leaq 64-128(%rsi),%rsi + leaq 32(%rsp),%rdi + call __ecp_nistz256_sqr_montx + + pcmpeqd %xmm4,%xmm5 + pshufd $0xb1,%xmm3,%xmm4 + movq 0(%rbx),%rdx + + movq %r12,%r9 + por %xmm3,%xmm4 + pshufd $0,%xmm5,%xmm5 + pshufd $0x1e,%xmm4,%xmm3 + movq %r13,%r10 + por %xmm3,%xmm4 + pxor %xmm3,%xmm3 + movq %r14,%r11 + pcmpeqd %xmm3,%xmm4 + pshufd $0,%xmm4,%xmm4 + + leaq 32-128(%rsp),%rsi + movq %r15,%r12 + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_montx + + leaq 320(%rsp),%rbx + leaq 64(%rsp),%rdi + call __ecp_nistz256_sub_fromx + + movq 384(%rsp),%rdx + leaq 384(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq -128+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 32(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 384(%rsp),%rdx + leaq 384(%rsp),%rbx + movq 0+64(%rsp),%r9 + movq 8+64(%rsp),%r10 + leaq -128+64(%rsp),%rsi + movq 16+64(%rsp),%r11 + movq 24+64(%rsp),%r12 + leaq 288(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 448(%rsp),%rdx + leaq 448(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq -128+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 32(%rsp),%rdi + call __ecp_nistz256_mul_montx + + leaq 352(%rsp),%rbx + leaq 
96(%rsp),%rdi + call __ecp_nistz256_sub_fromx + + movq 0+64(%rsp),%rdx + movq 8+64(%rsp),%r14 + leaq -128+64(%rsp),%rsi + movq 16+64(%rsp),%r15 + movq 24+64(%rsp),%r8 + leaq 128(%rsp),%rdi + call __ecp_nistz256_sqr_montx + + movq 0+96(%rsp),%rdx + movq 8+96(%rsp),%r14 + leaq -128+96(%rsp),%rsi + movq 16+96(%rsp),%r15 + movq 24+96(%rsp),%r8 + leaq 192(%rsp),%rdi + call __ecp_nistz256_sqr_montx + + movq 128(%rsp),%rdx + leaq 128(%rsp),%rbx + movq 0+64(%rsp),%r9 + movq 8+64(%rsp),%r10 + leaq -128+64(%rsp),%rsi + movq 16+64(%rsp),%r11 + movq 24+64(%rsp),%r12 + leaq 160(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 320(%rsp),%rdx + leaq 320(%rsp),%rbx + movq 0+128(%rsp),%r9 + movq 8+128(%rsp),%r10 + leaq -128+128(%rsp),%rsi + movq 16+128(%rsp),%r11 + movq 24+128(%rsp),%r12 + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_montx + + + + + xorq %r11,%r11 + addq %r12,%r12 + leaq 192(%rsp),%rsi + adcq %r13,%r13 + movq %r12,%rax + adcq %r8,%r8 + adcq %r9,%r9 + movq %r13,%rbp + adcq $0,%r11 + + subq $-1,%r12 + movq %r8,%rcx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%r10 + sbbq %r15,%r9 + sbbq $0,%r11 + + cmovcq %rax,%r12 + movq 0(%rsi),%rax + cmovcq %rbp,%r13 + movq 8(%rsi),%rbp + cmovcq %rcx,%r8 + movq 16(%rsi),%rcx + cmovcq %r10,%r9 + movq 24(%rsi),%r10 + + call __ecp_nistz256_subx + + leaq 160(%rsp),%rbx + leaq 224(%rsp),%rdi + call __ecp_nistz256_sub_fromx + + movq 0+0(%rsp),%rax + movq 0+8(%rsp),%rbp + movq 0+16(%rsp),%rcx + movq 0+24(%rsp),%r10 + leaq 64(%rsp),%rdi + + call __ecp_nistz256_subx + + movq %r12,0(%rdi) + movq %r13,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + movq 352(%rsp),%rdx + leaq 352(%rsp),%rbx + movq 0+160(%rsp),%r9 + movq 8+160(%rsp),%r10 + leaq -128+160(%rsp),%rsi + movq 16+160(%rsp),%r11 + movq 24+160(%rsp),%r12 + leaq 32(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 96(%rsp),%rdx + leaq 96(%rsp),%rbx + movq 0+64(%rsp),%r9 + movq 8+64(%rsp),%r10 + leaq -128+64(%rsp),%rsi + movq 16+64(%rsp),%r11 + movq 24+64(%rsp),%r12 + leaq 64(%rsp),%rdi + call __ecp_nistz256_mul_montx + + leaq 32(%rsp),%rbx + leaq 256(%rsp),%rdi + call __ecp_nistz256_sub_fromx + +.byte 102,72,15,126,199 + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 288(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 288+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand L$ONE_mont(%rip),%xmm2 + pand L$ONE_mont+16(%rip),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 384(%rsp),%xmm2 + pand 384+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,64(%rdi) + movdqu %xmm3,80(%rdi) + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 224(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 224+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 416(%rsp),%xmm2 + pand 416+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 320(%rsp),%xmm2 + pand 320+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,0(%rdi) + movdqu %xmm3,16(%rdi) + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 256(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 256+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 448(%rsp),%xmm2 + pand 448+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 352(%rsp),%xmm2 + pand 352+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + 
movdqu %xmm2,32(%rdi) + movdqu %xmm3,48(%rdi) + + leaq 480+56(%rsp),%rsi + + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbx + + movq -8(%rsi),%rbp + + leaq (%rsi),%rsp + +L$add_affinex_epilogue: + ret + + +#endif diff --git a/ring-0.17.14/pregenerated/p256-x86_64-asm-nasm.asm b/ring-0.17.14/pregenerated/p256-x86_64-asm-nasm.asm new file mode 100644 index 0000000000..e837575870 --- /dev/null +++ b/ring-0.17.14/pregenerated/p256-x86_64-asm-nasm.asm @@ -0,0 +1,5071 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. + +%ifidn __OUTPUT_FORMAT__, win64 +default rel +%define XMMWORD +%define YMMWORD +%define ZMMWORD +%define _CET_ENDBR + +%include "ring_core_generated/prefix_symbols_nasm.inc" +section .text code align=64 + + + +section .rdata rdata align=8 +ALIGN 64 +$L$poly: + DQ 0xffffffffffffffff,0x00000000ffffffff,0x0000000000000000,0xffffffff00000001 + +$L$One: + DD 1,1,1,1,1,1,1,1 +$L$Two: + DD 2,2,2,2,2,2,2,2 +$L$Three: + DD 3,3,3,3,3,3,3,3 +$L$ONE_mont: + DQ 0x0000000000000001,0xffffffff00000000,0xffffffffffffffff,0x00000000fffffffe + + +$L$ord: + DQ 0xf3b9cac2fc632551,0xbce6faada7179e84,0xffffffffffffffff,0xffffffff00000000 +$L$ordK: + DQ 0xccd1c8aaee00bc4f +section .text + + + + +global ecp_nistz256_neg + +ALIGN 32 +ecp_nistz256_neg: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ecp_nistz256_neg: + mov rdi,rcx + mov rsi,rdx + + + +_CET_ENDBR + push r12 + + push r13 + +$L$neg_body: + + xor r8,r8 + xor r9,r9 + xor r10,r10 + xor r11,r11 + xor r13,r13 + + sub r8,QWORD[rsi] + sbb r9,QWORD[8+rsi] + sbb r10,QWORD[16+rsi] + mov rax,r8 + sbb r11,QWORD[24+rsi] + lea rsi,[$L$poly] + mov rdx,r9 + sbb r13,0 + + add r8,QWORD[rsi] + mov rcx,r10 + adc r9,QWORD[8+rsi] + adc r10,QWORD[16+rsi] + mov r12,r11 + adc r11,QWORD[24+rsi] + test r13,r13 + + cmovz r8,rax + cmovz r9,rdx + mov QWORD[rdi],r8 + cmovz r10,rcx + mov QWORD[8+rdi],r9 + cmovz r11,r12 + mov QWORD[16+rdi],r10 + mov QWORD[24+rdi],r11 + + mov r13,QWORD[rsp] + + mov r12,QWORD[8+rsp] + + lea rsp,[16+rsp] + +$L$neg_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_ecp_nistz256_neg: + + + + + + +global ecp_nistz256_ord_mul_mont_nohw + +ALIGN 32 +ecp_nistz256_ord_mul_mont_nohw: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ecp_nistz256_ord_mul_mont_nohw: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + +_CET_ENDBR + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + +$L$ord_mul_body: + + mov rax,QWORD[rdx] + mov rbx,rdx + lea r14,[$L$ord] + mov r15,QWORD[$L$ordK] + + + mov rcx,rax + mul QWORD[rsi] + mov r8,rax + mov rax,rcx + mov r9,rdx + + mul QWORD[8+rsi] + add r9,rax + mov rax,rcx + adc rdx,0 + mov r10,rdx + + mul QWORD[16+rsi] + add r10,rax + mov rax,rcx + adc rdx,0 + + mov r13,r8 + imul r8,r15 + + mov r11,rdx + mul QWORD[24+rsi] + add r11,rax + mov rax,r8 + adc rdx,0 + mov r12,rdx + + + mul QWORD[r14] + mov rbp,r8 + add r13,rax + mov rax,r8 + adc rdx,0 + mov rcx,rdx + + sub r10,r8 + sbb r8,0 + + mul QWORD[8+r14] + add r9,rcx + adc rdx,0 + add r9,rax + mov rax,rbp + adc r10,rdx + mov rdx,rbp + adc r8,0 + + shl rax,32 + shr rdx,32 + sub r11,rax + mov rax,QWORD[8+rbx] + sbb rbp,rdx + + add r11,r8 + adc r12,rbp + adc r13,0 + + + mov rcx,rax + mul QWORD[rsi] + add r9,rax + mov rax,rcx + adc rdx,0 + mov rbp,rdx + + mul QWORD[8+rsi] + add r10,rbp + adc rdx,0 + add 
r10,rax + mov rax,rcx + adc rdx,0 + mov rbp,rdx + + mul QWORD[16+rsi] + add r11,rbp + adc rdx,0 + add r11,rax + mov rax,rcx + adc rdx,0 + + mov rcx,r9 + imul r9,r15 + + mov rbp,rdx + mul QWORD[24+rsi] + add r12,rbp + adc rdx,0 + xor r8,r8 + add r12,rax + mov rax,r9 + adc r13,rdx + adc r8,0 + + + mul QWORD[r14] + mov rbp,r9 + add rcx,rax + mov rax,r9 + adc rcx,rdx + + sub r11,r9 + sbb r9,0 + + mul QWORD[8+r14] + add r10,rcx + adc rdx,0 + add r10,rax + mov rax,rbp + adc r11,rdx + mov rdx,rbp + adc r9,0 + + shl rax,32 + shr rdx,32 + sub r12,rax + mov rax,QWORD[16+rbx] + sbb rbp,rdx + + add r12,r9 + adc r13,rbp + adc r8,0 + + + mov rcx,rax + mul QWORD[rsi] + add r10,rax + mov rax,rcx + adc rdx,0 + mov rbp,rdx + + mul QWORD[8+rsi] + add r11,rbp + adc rdx,0 + add r11,rax + mov rax,rcx + adc rdx,0 + mov rbp,rdx + + mul QWORD[16+rsi] + add r12,rbp + adc rdx,0 + add r12,rax + mov rax,rcx + adc rdx,0 + + mov rcx,r10 + imul r10,r15 + + mov rbp,rdx + mul QWORD[24+rsi] + add r13,rbp + adc rdx,0 + xor r9,r9 + add r13,rax + mov rax,r10 + adc r8,rdx + adc r9,0 + + + mul QWORD[r14] + mov rbp,r10 + add rcx,rax + mov rax,r10 + adc rcx,rdx + + sub r12,r10 + sbb r10,0 + + mul QWORD[8+r14] + add r11,rcx + adc rdx,0 + add r11,rax + mov rax,rbp + adc r12,rdx + mov rdx,rbp + adc r10,0 + + shl rax,32 + shr rdx,32 + sub r13,rax + mov rax,QWORD[24+rbx] + sbb rbp,rdx + + add r13,r10 + adc r8,rbp + adc r9,0 + + + mov rcx,rax + mul QWORD[rsi] + add r11,rax + mov rax,rcx + adc rdx,0 + mov rbp,rdx + + mul QWORD[8+rsi] + add r12,rbp + adc rdx,0 + add r12,rax + mov rax,rcx + adc rdx,0 + mov rbp,rdx + + mul QWORD[16+rsi] + add r13,rbp + adc rdx,0 + add r13,rax + mov rax,rcx + adc rdx,0 + + mov rcx,r11 + imul r11,r15 + + mov rbp,rdx + mul QWORD[24+rsi] + add r8,rbp + adc rdx,0 + xor r10,r10 + add r8,rax + mov rax,r11 + adc r9,rdx + adc r10,0 + + + mul QWORD[r14] + mov rbp,r11 + add rcx,rax + mov rax,r11 + adc rcx,rdx + + sub r13,r11 + sbb r11,0 + + mul QWORD[8+r14] + add r12,rcx + adc rdx,0 + add r12,rax + mov rax,rbp + adc r13,rdx + mov rdx,rbp + adc r11,0 + + shl rax,32 + shr rdx,32 + sub r8,rax + sbb rbp,rdx + + add r8,r11 + adc r9,rbp + adc r10,0 + + + mov rsi,r12 + sub r12,QWORD[r14] + mov r11,r13 + sbb r13,QWORD[8+r14] + mov rcx,r8 + sbb r8,QWORD[16+r14] + mov rbp,r9 + sbb r9,QWORD[24+r14] + sbb r10,0 + + cmovc r12,rsi + cmovc r13,r11 + cmovc r8,rcx + cmovc r9,rbp + + mov QWORD[rdi],r12 + mov QWORD[8+rdi],r13 + mov QWORD[16+rdi],r8 + mov QWORD[24+rdi],r9 + + mov r15,QWORD[rsp] + + mov r14,QWORD[8+rsp] + + mov r13,QWORD[16+rsp] + + mov r12,QWORD[24+rsp] + + mov rbx,QWORD[32+rsp] + + mov rbp,QWORD[40+rsp] + + lea rsp,[48+rsp] + +$L$ord_mul_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_ecp_nistz256_ord_mul_mont_nohw: + + + + + + + +global ecp_nistz256_ord_sqr_mont_nohw + +ALIGN 32 +ecp_nistz256_ord_sqr_mont_nohw: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ecp_nistz256_ord_sqr_mont_nohw: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + +_CET_ENDBR + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + +$L$ord_sqr_body: + + mov r8,QWORD[rsi] + mov rax,QWORD[8+rsi] + mov r14,QWORD[16+rsi] + mov r15,QWORD[24+rsi] + lea rsi,[$L$ord] + mov rbx,rdx + jmp NEAR $L$oop_ord_sqr + +ALIGN 32 +$L$oop_ord_sqr: + + mov rbp,rax + mul r8 + mov r9,rax +DB 102,72,15,110,205 + mov rax,r14 + mov r10,rdx + + mul r8 + add r10,rax + mov rax,r15 +DB 102,73,15,110,214 + adc rdx,0 + mov r11,rdx + + mul r8 + add r11,rax + mov rax,r15 +DB 
102,73,15,110,223 + adc rdx,0 + mov r12,rdx + + + mul r14 + mov r13,rax + mov rax,r14 + mov r14,rdx + + + mul rbp + add r11,rax + mov rax,r15 + adc rdx,0 + mov r15,rdx + + mul rbp + add r12,rax + adc rdx,0 + + add r12,r15 + adc r13,rdx + adc r14,0 + + + xor r15,r15 + mov rax,r8 + add r9,r9 + adc r10,r10 + adc r11,r11 + adc r12,r12 + adc r13,r13 + adc r14,r14 + adc r15,0 + + + mul rax + mov r8,rax +DB 102,72,15,126,200 + mov rbp,rdx + + mul rax + add r9,rbp + adc r10,rax +DB 102,72,15,126,208 + adc rdx,0 + mov rbp,rdx + + mul rax + add r11,rbp + adc r12,rax +DB 102,72,15,126,216 + adc rdx,0 + mov rbp,rdx + + mov rcx,r8 + imul r8,QWORD[32+rsi] + + mul rax + add r13,rbp + adc r14,rax + mov rax,QWORD[rsi] + adc r15,rdx + + + mul r8 + mov rbp,r8 + add rcx,rax + mov rax,QWORD[8+rsi] + adc rcx,rdx + + sub r10,r8 + sbb rbp,0 + + mul r8 + add r9,rcx + adc rdx,0 + add r9,rax + mov rax,r8 + adc r10,rdx + mov rdx,r8 + adc rbp,0 + + mov rcx,r9 + imul r9,QWORD[32+rsi] + + shl rax,32 + shr rdx,32 + sub r11,rax + mov rax,QWORD[rsi] + sbb r8,rdx + + add r11,rbp + adc r8,0 + + + mul r9 + mov rbp,r9 + add rcx,rax + mov rax,QWORD[8+rsi] + adc rcx,rdx + + sub r11,r9 + sbb rbp,0 + + mul r9 + add r10,rcx + adc rdx,0 + add r10,rax + mov rax,r9 + adc r11,rdx + mov rdx,r9 + adc rbp,0 + + mov rcx,r10 + imul r10,QWORD[32+rsi] + + shl rax,32 + shr rdx,32 + sub r8,rax + mov rax,QWORD[rsi] + sbb r9,rdx + + add r8,rbp + adc r9,0 + + + mul r10 + mov rbp,r10 + add rcx,rax + mov rax,QWORD[8+rsi] + adc rcx,rdx + + sub r8,r10 + sbb rbp,0 + + mul r10 + add r11,rcx + adc rdx,0 + add r11,rax + mov rax,r10 + adc r8,rdx + mov rdx,r10 + adc rbp,0 + + mov rcx,r11 + imul r11,QWORD[32+rsi] + + shl rax,32 + shr rdx,32 + sub r9,rax + mov rax,QWORD[rsi] + sbb r10,rdx + + add r9,rbp + adc r10,0 + + + mul r11 + mov rbp,r11 + add rcx,rax + mov rax,QWORD[8+rsi] + adc rcx,rdx + + sub r9,r11 + sbb rbp,0 + + mul r11 + add r8,rcx + adc rdx,0 + add r8,rax + mov rax,r11 + adc r9,rdx + mov rdx,r11 + adc rbp,0 + + shl rax,32 + shr rdx,32 + sub r10,rax + sbb r11,rdx + + add r10,rbp + adc r11,0 + + + xor rdx,rdx + add r8,r12 + adc r9,r13 + mov r12,r8 + adc r10,r14 + adc r11,r15 + mov rax,r9 + adc rdx,0 + + + sub r8,QWORD[rsi] + mov r14,r10 + sbb r9,QWORD[8+rsi] + sbb r10,QWORD[16+rsi] + mov r15,r11 + sbb r11,QWORD[24+rsi] + sbb rdx,0 + + cmovc r8,r12 + cmovnc rax,r9 + cmovnc r14,r10 + cmovnc r15,r11 + + dec rbx + jnz NEAR $L$oop_ord_sqr + + mov QWORD[rdi],r8 + mov QWORD[8+rdi],rax + pxor xmm1,xmm1 + mov QWORD[16+rdi],r14 + pxor xmm2,xmm2 + mov QWORD[24+rdi],r15 + pxor xmm3,xmm3 + + mov r15,QWORD[rsp] + + mov r14,QWORD[8+rsp] + + mov r13,QWORD[16+rsp] + + mov r12,QWORD[24+rsp] + + mov rbx,QWORD[32+rsp] + + mov rbp,QWORD[40+rsp] + + lea rsp,[48+rsp] + +$L$ord_sqr_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_ecp_nistz256_ord_sqr_mont_nohw: + +global ecp_nistz256_ord_mul_mont_adx + +ALIGN 32 +ecp_nistz256_ord_mul_mont_adx: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ecp_nistz256_ord_mul_mont_adx: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + +$L$ecp_nistz256_ord_mul_mont_adx: +_CET_ENDBR + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + +$L$ord_mulx_body: + + mov rbx,rdx + mov rdx,QWORD[rdx] + mov r9,QWORD[rsi] + mov r10,QWORD[8+rsi] + mov r11,QWORD[16+rsi] + mov r12,QWORD[24+rsi] + lea rsi,[((-128))+rsi] + lea r14,[(($L$ord-128))] + mov r15,QWORD[$L$ordK] + + + mulx r9,r8,r9 + mulx r10,rcx,r10 + mulx r11,rbp,r11 + add r9,rcx + mulx 
r12,rcx,r12 + mov rdx,r8 + mulx rax,rdx,r15 + adc r10,rbp + adc r11,rcx + adc r12,0 + + + xor r13,r13 + mulx rbp,rcx,QWORD[((0+128))+r14] + adcx r8,rcx + adox r9,rbp + + mulx rbp,rcx,QWORD[((8+128))+r14] + adcx r9,rcx + adox r10,rbp + + mulx rbp,rcx,QWORD[((16+128))+r14] + adcx r10,rcx + adox r11,rbp + + mulx rbp,rcx,QWORD[((24+128))+r14] + mov rdx,QWORD[8+rbx] + adcx r11,rcx + adox r12,rbp + adcx r12,r8 + adox r13,r8 + adc r13,0 + + + mulx rbp,rcx,QWORD[((0+128))+rsi] + adcx r9,rcx + adox r10,rbp + + mulx rbp,rcx,QWORD[((8+128))+rsi] + adcx r10,rcx + adox r11,rbp + + mulx rbp,rcx,QWORD[((16+128))+rsi] + adcx r11,rcx + adox r12,rbp + + mulx rbp,rcx,QWORD[((24+128))+rsi] + mov rdx,r9 + mulx rax,rdx,r15 + adcx r12,rcx + adox r13,rbp + + adcx r13,r8 + adox r8,r8 + adc r8,0 + + + mulx rbp,rcx,QWORD[((0+128))+r14] + adcx r9,rcx + adox r10,rbp + + mulx rbp,rcx,QWORD[((8+128))+r14] + adcx r10,rcx + adox r11,rbp + + mulx rbp,rcx,QWORD[((16+128))+r14] + adcx r11,rcx + adox r12,rbp + + mulx rbp,rcx,QWORD[((24+128))+r14] + mov rdx,QWORD[16+rbx] + adcx r12,rcx + adox r13,rbp + adcx r13,r9 + adox r8,r9 + adc r8,0 + + + mulx rbp,rcx,QWORD[((0+128))+rsi] + adcx r10,rcx + adox r11,rbp + + mulx rbp,rcx,QWORD[((8+128))+rsi] + adcx r11,rcx + adox r12,rbp + + mulx rbp,rcx,QWORD[((16+128))+rsi] + adcx r12,rcx + adox r13,rbp + + mulx rbp,rcx,QWORD[((24+128))+rsi] + mov rdx,r10 + mulx rax,rdx,r15 + adcx r13,rcx + adox r8,rbp + + adcx r8,r9 + adox r9,r9 + adc r9,0 + + + mulx rbp,rcx,QWORD[((0+128))+r14] + adcx r10,rcx + adox r11,rbp + + mulx rbp,rcx,QWORD[((8+128))+r14] + adcx r11,rcx + adox r12,rbp + + mulx rbp,rcx,QWORD[((16+128))+r14] + adcx r12,rcx + adox r13,rbp + + mulx rbp,rcx,QWORD[((24+128))+r14] + mov rdx,QWORD[24+rbx] + adcx r13,rcx + adox r8,rbp + adcx r8,r10 + adox r9,r10 + adc r9,0 + + + mulx rbp,rcx,QWORD[((0+128))+rsi] + adcx r11,rcx + adox r12,rbp + + mulx rbp,rcx,QWORD[((8+128))+rsi] + adcx r12,rcx + adox r13,rbp + + mulx rbp,rcx,QWORD[((16+128))+rsi] + adcx r13,rcx + adox r8,rbp + + mulx rbp,rcx,QWORD[((24+128))+rsi] + mov rdx,r11 + mulx rax,rdx,r15 + adcx r8,rcx + adox r9,rbp + + adcx r9,r10 + adox r10,r10 + adc r10,0 + + + mulx rbp,rcx,QWORD[((0+128))+r14] + adcx r11,rcx + adox r12,rbp + + mulx rbp,rcx,QWORD[((8+128))+r14] + adcx r12,rcx + adox r13,rbp + + mulx rbp,rcx,QWORD[((16+128))+r14] + adcx r13,rcx + adox r8,rbp + + mulx rbp,rcx,QWORD[((24+128))+r14] + lea r14,[128+r14] + mov rbx,r12 + adcx r8,rcx + adox r9,rbp + mov rdx,r13 + adcx r9,r11 + adox r10,r11 + adc r10,0 + + + + mov rcx,r8 + sub r12,QWORD[r14] + sbb r13,QWORD[8+r14] + sbb r8,QWORD[16+r14] + mov rbp,r9 + sbb r9,QWORD[24+r14] + sbb r10,0 + + cmovc r12,rbx + cmovc r13,rdx + cmovc r8,rcx + cmovc r9,rbp + + mov QWORD[rdi],r12 + mov QWORD[8+rdi],r13 + mov QWORD[16+rdi],r8 + mov QWORD[24+rdi],r9 + + mov r15,QWORD[rsp] + + mov r14,QWORD[8+rsp] + + mov r13,QWORD[16+rsp] + + mov r12,QWORD[24+rsp] + + mov rbx,QWORD[32+rsp] + + mov rbp,QWORD[40+rsp] + + lea rsp,[48+rsp] + +$L$ord_mulx_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_ecp_nistz256_ord_mul_mont_adx: + +global ecp_nistz256_ord_sqr_mont_adx + +ALIGN 32 +ecp_nistz256_ord_sqr_mont_adx: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ecp_nistz256_ord_sqr_mont_adx: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + +_CET_ENDBR +$L$ecp_nistz256_ord_sqr_mont_adx: + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + +$L$ord_sqrx_body: + + mov rbx,rdx + mov rdx,QWORD[rsi] + 
mov r14,QWORD[8+rsi] + mov r15,QWORD[16+rsi] + mov r8,QWORD[24+rsi] + lea rsi,[$L$ord] + jmp NEAR $L$oop_ord_sqrx + +ALIGN 32 +$L$oop_ord_sqrx: + mulx r10,r9,r14 + mulx r11,rcx,r15 + mov rax,rdx +DB 102,73,15,110,206 + mulx r12,rbp,r8 + mov rdx,r14 + add r10,rcx +DB 102,73,15,110,215 + adc r11,rbp + adc r12,0 + xor r13,r13 + + mulx rbp,rcx,r15 + adcx r11,rcx + adox r12,rbp + + mulx rbp,rcx,r8 + mov rdx,r15 + adcx r12,rcx + adox r13,rbp + adc r13,0 + + mulx r14,rcx,r8 + mov rdx,rax +DB 102,73,15,110,216 + xor r15,r15 + adcx r9,r9 + adox r13,rcx + adcx r10,r10 + adox r14,r15 + + + mulx rbp,r8,rdx +DB 102,72,15,126,202 + adcx r11,r11 + adox r9,rbp + adcx r12,r12 + mulx rax,rcx,rdx +DB 102,72,15,126,210 + adcx r13,r13 + adox r10,rcx + adcx r14,r14 + mulx rbp,rcx,rdx + DB 0x67 +DB 102,72,15,126,218 + adox r11,rax + adcx r15,r15 + adox r12,rcx + adox r13,rbp + mulx rax,rcx,rdx + adox r14,rcx + adox r15,rax + + + mov rdx,r8 + mulx rcx,rdx,QWORD[32+rsi] + + xor rax,rax + mulx rbp,rcx,QWORD[rsi] + adcx r8,rcx + adox r9,rbp + mulx rbp,rcx,QWORD[8+rsi] + adcx r9,rcx + adox r10,rbp + mulx rbp,rcx,QWORD[16+rsi] + adcx r10,rcx + adox r11,rbp + mulx rbp,rcx,QWORD[24+rsi] + adcx r11,rcx + adox r8,rbp + adcx r8,rax + + + mov rdx,r9 + mulx rcx,rdx,QWORD[32+rsi] + + mulx rbp,rcx,QWORD[rsi] + adox r9,rcx + adcx r10,rbp + mulx rbp,rcx,QWORD[8+rsi] + adox r10,rcx + adcx r11,rbp + mulx rbp,rcx,QWORD[16+rsi] + adox r11,rcx + adcx r8,rbp + mulx rbp,rcx,QWORD[24+rsi] + adox r8,rcx + adcx r9,rbp + adox r9,rax + + + mov rdx,r10 + mulx rcx,rdx,QWORD[32+rsi] + + mulx rbp,rcx,QWORD[rsi] + adcx r10,rcx + adox r11,rbp + mulx rbp,rcx,QWORD[8+rsi] + adcx r11,rcx + adox r8,rbp + mulx rbp,rcx,QWORD[16+rsi] + adcx r8,rcx + adox r9,rbp + mulx rbp,rcx,QWORD[24+rsi] + adcx r9,rcx + adox r10,rbp + adcx r10,rax + + + mov rdx,r11 + mulx rcx,rdx,QWORD[32+rsi] + + mulx rbp,rcx,QWORD[rsi] + adox r11,rcx + adcx r8,rbp + mulx rbp,rcx,QWORD[8+rsi] + adox r8,rcx + adcx r9,rbp + mulx rbp,rcx,QWORD[16+rsi] + adox r9,rcx + adcx r10,rbp + mulx rbp,rcx,QWORD[24+rsi] + adox r10,rcx + adcx r11,rbp + adox r11,rax + + + add r12,r8 + adc r9,r13 + mov rdx,r12 + adc r10,r14 + adc r11,r15 + mov r14,r9 + adc rax,0 + + + sub r12,QWORD[rsi] + mov r15,r10 + sbb r9,QWORD[8+rsi] + sbb r10,QWORD[16+rsi] + mov r8,r11 + sbb r11,QWORD[24+rsi] + sbb rax,0 + + cmovnc rdx,r12 + cmovnc r14,r9 + cmovnc r15,r10 + cmovnc r8,r11 + + dec rbx + jnz NEAR $L$oop_ord_sqrx + + mov QWORD[rdi],rdx + mov QWORD[8+rdi],r14 + pxor xmm1,xmm1 + mov QWORD[16+rdi],r15 + pxor xmm2,xmm2 + mov QWORD[24+rdi],r8 + pxor xmm3,xmm3 + + mov r15,QWORD[rsp] + + mov r14,QWORD[8+rsp] + + mov r13,QWORD[16+rsp] + + mov r12,QWORD[24+rsp] + + mov rbx,QWORD[32+rsp] + + mov rbp,QWORD[40+rsp] + + lea rsp,[48+rsp] + +$L$ord_sqrx_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_ecp_nistz256_ord_sqr_mont_adx: + + + + + + +global ecp_nistz256_mul_mont_nohw + +ALIGN 32 +ecp_nistz256_mul_mont_nohw: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ecp_nistz256_mul_mont_nohw: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + +_CET_ENDBR + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + +$L$mul_body: + mov rbx,rdx + mov rax,QWORD[rdx] + mov r9,QWORD[rsi] + mov r10,QWORD[8+rsi] + mov r11,QWORD[16+rsi] + mov r12,QWORD[24+rsi] + + call __ecp_nistz256_mul_montq + + mov r15,QWORD[rsp] + + mov r14,QWORD[8+rsp] + + mov r13,QWORD[16+rsp] + + mov r12,QWORD[24+rsp] + + mov rbx,QWORD[32+rsp] + + mov 
rbp,QWORD[40+rsp] + + lea rsp,[48+rsp] + +$L$mul_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_ecp_nistz256_mul_mont_nohw: + + +ALIGN 32 +__ecp_nistz256_mul_montq: + + + + mov rbp,rax + mul r9 + mov r14,QWORD[(($L$poly+8))] + mov r8,rax + mov rax,rbp + mov r9,rdx + + mul r10 + mov r15,QWORD[(($L$poly+24))] + add r9,rax + mov rax,rbp + adc rdx,0 + mov r10,rdx + + mul r11 + add r10,rax + mov rax,rbp + adc rdx,0 + mov r11,rdx + + mul r12 + add r11,rax + mov rax,r8 + adc rdx,0 + xor r13,r13 + mov r12,rdx + + + + + + + + + + + mov rbp,r8 + shl r8,32 + mul r15 + shr rbp,32 + add r9,r8 + adc r10,rbp + adc r11,rax + mov rax,QWORD[8+rbx] + adc r12,rdx + adc r13,0 + xor r8,r8 + + + + mov rbp,rax + mul QWORD[rsi] + add r9,rax + mov rax,rbp + adc rdx,0 + mov rcx,rdx + + mul QWORD[8+rsi] + add r10,rcx + adc rdx,0 + add r10,rax + mov rax,rbp + adc rdx,0 + mov rcx,rdx + + mul QWORD[16+rsi] + add r11,rcx + adc rdx,0 + add r11,rax + mov rax,rbp + adc rdx,0 + mov rcx,rdx + + mul QWORD[24+rsi] + add r12,rcx + adc rdx,0 + add r12,rax + mov rax,r9 + adc r13,rdx + adc r8,0 + + + + mov rbp,r9 + shl r9,32 + mul r15 + shr rbp,32 + add r10,r9 + adc r11,rbp + adc r12,rax + mov rax,QWORD[16+rbx] + adc r13,rdx + adc r8,0 + xor r9,r9 + + + + mov rbp,rax + mul QWORD[rsi] + add r10,rax + mov rax,rbp + adc rdx,0 + mov rcx,rdx + + mul QWORD[8+rsi] + add r11,rcx + adc rdx,0 + add r11,rax + mov rax,rbp + adc rdx,0 + mov rcx,rdx + + mul QWORD[16+rsi] + add r12,rcx + adc rdx,0 + add r12,rax + mov rax,rbp + adc rdx,0 + mov rcx,rdx + + mul QWORD[24+rsi] + add r13,rcx + adc rdx,0 + add r13,rax + mov rax,r10 + adc r8,rdx + adc r9,0 + + + + mov rbp,r10 + shl r10,32 + mul r15 + shr rbp,32 + add r11,r10 + adc r12,rbp + adc r13,rax + mov rax,QWORD[24+rbx] + adc r8,rdx + adc r9,0 + xor r10,r10 + + + + mov rbp,rax + mul QWORD[rsi] + add r11,rax + mov rax,rbp + adc rdx,0 + mov rcx,rdx + + mul QWORD[8+rsi] + add r12,rcx + adc rdx,0 + add r12,rax + mov rax,rbp + adc rdx,0 + mov rcx,rdx + + mul QWORD[16+rsi] + add r13,rcx + adc rdx,0 + add r13,rax + mov rax,rbp + adc rdx,0 + mov rcx,rdx + + mul QWORD[24+rsi] + add r8,rcx + adc rdx,0 + add r8,rax + mov rax,r11 + adc r9,rdx + adc r10,0 + + + + mov rbp,r11 + shl r11,32 + mul r15 + shr rbp,32 + add r12,r11 + adc r13,rbp + mov rcx,r12 + adc r8,rax + adc r9,rdx + mov rbp,r13 + adc r10,0 + + + + sub r12,-1 + mov rbx,r8 + sbb r13,r14 + sbb r8,0 + mov rdx,r9 + sbb r9,r15 + sbb r10,0 + + cmovc r12,rcx + cmovc r13,rbp + mov QWORD[rdi],r12 + cmovc r8,rbx + mov QWORD[8+rdi],r13 + cmovc r9,rdx + mov QWORD[16+rdi],r8 + mov QWORD[24+rdi],r9 + + ret + + + + + + + + + + +global ecp_nistz256_sqr_mont_nohw + +ALIGN 32 +ecp_nistz256_sqr_mont_nohw: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ecp_nistz256_sqr_mont_nohw: + mov rdi,rcx + mov rsi,rdx + + + +_CET_ENDBR + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + +$L$sqr_body: + mov rax,QWORD[rsi] + mov r14,QWORD[8+rsi] + mov r15,QWORD[16+rsi] + mov r8,QWORD[24+rsi] + + call __ecp_nistz256_sqr_montq + + mov r15,QWORD[rsp] + + mov r14,QWORD[8+rsp] + + mov r13,QWORD[16+rsp] + + mov r12,QWORD[24+rsp] + + mov rbx,QWORD[32+rsp] + + mov rbp,QWORD[40+rsp] + + lea rsp,[48+rsp] + +$L$sqr_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_ecp_nistz256_sqr_mont_nohw: + + +ALIGN 32 +__ecp_nistz256_sqr_montq: + + mov r13,rax + mul r14 + mov r9,rax + mov rax,r15 + mov r10,rdx + + mul r13 + add r10,rax + mov rax,r8 
+ adc rdx,0 + mov r11,rdx + + mul r13 + add r11,rax + mov rax,r15 + adc rdx,0 + mov r12,rdx + + + mul r14 + add r11,rax + mov rax,r8 + adc rdx,0 + mov rbp,rdx + + mul r14 + add r12,rax + mov rax,r8 + adc rdx,0 + add r12,rbp + mov r13,rdx + adc r13,0 + + + mul r15 + xor r15,r15 + add r13,rax + mov rax,QWORD[rsi] + mov r14,rdx + adc r14,0 + + add r9,r9 + adc r10,r10 + adc r11,r11 + adc r12,r12 + adc r13,r13 + adc r14,r14 + adc r15,0 + + mul rax + mov r8,rax + mov rax,QWORD[8+rsi] + mov rcx,rdx + + mul rax + add r9,rcx + adc r10,rax + mov rax,QWORD[16+rsi] + adc rdx,0 + mov rcx,rdx + + mul rax + add r11,rcx + adc r12,rax + mov rax,QWORD[24+rsi] + adc rdx,0 + mov rcx,rdx + + mul rax + add r13,rcx + adc r14,rax + mov rax,r8 + adc r15,rdx + + mov rsi,QWORD[(($L$poly+8))] + mov rbp,QWORD[(($L$poly+24))] + + + + + mov rcx,r8 + shl r8,32 + mul rbp + shr rcx,32 + add r9,r8 + adc r10,rcx + adc r11,rax + mov rax,r9 + adc rdx,0 + + + + mov rcx,r9 + shl r9,32 + mov r8,rdx + mul rbp + shr rcx,32 + add r10,r9 + adc r11,rcx + adc r8,rax + mov rax,r10 + adc rdx,0 + + + + mov rcx,r10 + shl r10,32 + mov r9,rdx + mul rbp + shr rcx,32 + add r11,r10 + adc r8,rcx + adc r9,rax + mov rax,r11 + adc rdx,0 + + + + mov rcx,r11 + shl r11,32 + mov r10,rdx + mul rbp + shr rcx,32 + add r8,r11 + adc r9,rcx + adc r10,rax + adc rdx,0 + xor r11,r11 + + + + add r12,r8 + adc r13,r9 + mov r8,r12 + adc r14,r10 + adc r15,rdx + mov r9,r13 + adc r11,0 + + sub r12,-1 + mov r10,r14 + sbb r13,rsi + sbb r14,0 + mov rcx,r15 + sbb r15,rbp + sbb r11,0 + + cmovc r12,r8 + cmovc r13,r9 + mov QWORD[rdi],r12 + cmovc r14,r10 + mov QWORD[8+rdi],r13 + cmovc r15,rcx + mov QWORD[16+rdi],r14 + mov QWORD[24+rdi],r15 + + ret + + +global ecp_nistz256_mul_mont_adx + +ALIGN 32 +ecp_nistz256_mul_mont_adx: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ecp_nistz256_mul_mont_adx: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + +_CET_ENDBR + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + +$L$mulx_body: + mov rbx,rdx + mov rdx,QWORD[rdx] + mov r9,QWORD[rsi] + mov r10,QWORD[8+rsi] + mov r11,QWORD[16+rsi] + mov r12,QWORD[24+rsi] + lea rsi,[((-128))+rsi] + + call __ecp_nistz256_mul_montx + + mov r15,QWORD[rsp] + + mov r14,QWORD[8+rsp] + + mov r13,QWORD[16+rsp] + + mov r12,QWORD[24+rsp] + + mov rbx,QWORD[32+rsp] + + mov rbp,QWORD[40+rsp] + + lea rsp,[48+rsp] + +$L$mulx_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_ecp_nistz256_mul_mont_adx: + + +ALIGN 32 +__ecp_nistz256_mul_montx: + + + + mulx r9,r8,r9 + mulx r10,rcx,r10 + mov r14,32 + xor r13,r13 + mulx r11,rbp,r11 + mov r15,QWORD[(($L$poly+24))] + adc r9,rcx + mulx r12,rcx,r12 + mov rdx,r8 + adc r10,rbp + shlx rbp,r8,r14 + adc r11,rcx + shrx rcx,r8,r14 + adc r12,0 + + + + add r9,rbp + adc r10,rcx + + mulx rbp,rcx,r15 + mov rdx,QWORD[8+rbx] + adc r11,rcx + adc r12,rbp + adc r13,0 + xor r8,r8 + + + + mulx rbp,rcx,QWORD[((0+128))+rsi] + adcx r9,rcx + adox r10,rbp + + mulx rbp,rcx,QWORD[((8+128))+rsi] + adcx r10,rcx + adox r11,rbp + + mulx rbp,rcx,QWORD[((16+128))+rsi] + adcx r11,rcx + adox r12,rbp + + mulx rbp,rcx,QWORD[((24+128))+rsi] + mov rdx,r9 + adcx r12,rcx + shlx rcx,r9,r14 + adox r13,rbp + shrx rbp,r9,r14 + + adcx r13,r8 + adox r8,r8 + adc r8,0 + + + + add r10,rcx + adc r11,rbp + + mulx rbp,rcx,r15 + mov rdx,QWORD[16+rbx] + adc r12,rcx + adc r13,rbp + adc r8,0 + xor r9,r9 + + + + mulx rbp,rcx,QWORD[((0+128))+rsi] + adcx r10,rcx + adox r11,rbp + + mulx rbp,rcx,QWORD[((8+128))+rsi] + adcx 
r11,rcx + adox r12,rbp + + mulx rbp,rcx,QWORD[((16+128))+rsi] + adcx r12,rcx + adox r13,rbp + + mulx rbp,rcx,QWORD[((24+128))+rsi] + mov rdx,r10 + adcx r13,rcx + shlx rcx,r10,r14 + adox r8,rbp + shrx rbp,r10,r14 + + adcx r8,r9 + adox r9,r9 + adc r9,0 + + + + add r11,rcx + adc r12,rbp + + mulx rbp,rcx,r15 + mov rdx,QWORD[24+rbx] + adc r13,rcx + adc r8,rbp + adc r9,0 + xor r10,r10 + + + + mulx rbp,rcx,QWORD[((0+128))+rsi] + adcx r11,rcx + adox r12,rbp + + mulx rbp,rcx,QWORD[((8+128))+rsi] + adcx r12,rcx + adox r13,rbp + + mulx rbp,rcx,QWORD[((16+128))+rsi] + adcx r13,rcx + adox r8,rbp + + mulx rbp,rcx,QWORD[((24+128))+rsi] + mov rdx,r11 + adcx r8,rcx + shlx rcx,r11,r14 + adox r9,rbp + shrx rbp,r11,r14 + + adcx r9,r10 + adox r10,r10 + adc r10,0 + + + + add r12,rcx + adc r13,rbp + + mulx rbp,rcx,r15 + mov rbx,r12 + mov r14,QWORD[(($L$poly+8))] + adc r8,rcx + mov rdx,r13 + adc r9,rbp + adc r10,0 + + + + xor eax,eax + mov rcx,r8 + sbb r12,-1 + sbb r13,r14 + sbb r8,0 + mov rbp,r9 + sbb r9,r15 + sbb r10,0 + + cmovc r12,rbx + cmovc r13,rdx + mov QWORD[rdi],r12 + cmovc r8,rcx + mov QWORD[8+rdi],r13 + cmovc r9,rbp + mov QWORD[16+rdi],r8 + mov QWORD[24+rdi],r9 + + ret + + + +global ecp_nistz256_sqr_mont_adx + +ALIGN 32 +ecp_nistz256_sqr_mont_adx: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ecp_nistz256_sqr_mont_adx: + mov rdi,rcx + mov rsi,rdx + + + +_CET_ENDBR + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + +$L$sqrx_body: + mov rdx,QWORD[rsi] + mov r14,QWORD[8+rsi] + mov r15,QWORD[16+rsi] + mov r8,QWORD[24+rsi] + lea rsi,[((-128))+rsi] + + call __ecp_nistz256_sqr_montx + + mov r15,QWORD[rsp] + + mov r14,QWORD[8+rsp] + + mov r13,QWORD[16+rsp] + + mov r12,QWORD[24+rsp] + + mov rbx,QWORD[32+rsp] + + mov rbp,QWORD[40+rsp] + + lea rsp,[48+rsp] + +$L$sqrx_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_ecp_nistz256_sqr_mont_adx: + + +ALIGN 32 +__ecp_nistz256_sqr_montx: + + mulx r10,r9,r14 + mulx r11,rcx,r15 + xor eax,eax + adc r10,rcx + mulx r12,rbp,r8 + mov rdx,r14 + adc r11,rbp + adc r12,0 + xor r13,r13 + + + mulx rbp,rcx,r15 + adcx r11,rcx + adox r12,rbp + + mulx rbp,rcx,r8 + mov rdx,r15 + adcx r12,rcx + adox r13,rbp + adc r13,0 + + + mulx r14,rcx,r8 + mov rdx,QWORD[((0+128))+rsi] + xor r15,r15 + adcx r9,r9 + adox r13,rcx + adcx r10,r10 + adox r14,r15 + + mulx rbp,r8,rdx + mov rdx,QWORD[((8+128))+rsi] + adcx r11,r11 + adox r9,rbp + adcx r12,r12 + mulx rax,rcx,rdx + mov rdx,QWORD[((16+128))+rsi] + adcx r13,r13 + adox r10,rcx + adcx r14,r14 + DB 0x67 + mulx rbp,rcx,rdx + mov rdx,QWORD[((24+128))+rsi] + adox r11,rax + adcx r15,r15 + adox r12,rcx + mov rsi,32 + adox r13,rbp + DB 0x67,0x67 + mulx rax,rcx,rdx + mov rdx,QWORD[(($L$poly+24))] + adox r14,rcx + shlx rcx,r8,rsi + adox r15,rax + shrx rax,r8,rsi + mov rbp,rdx + + + add r9,rcx + adc r10,rax + + mulx r8,rcx,r8 + adc r11,rcx + shlx rcx,r9,rsi + adc r8,0 + shrx rax,r9,rsi + + + add r10,rcx + adc r11,rax + + mulx r9,rcx,r9 + adc r8,rcx + shlx rcx,r10,rsi + adc r9,0 + shrx rax,r10,rsi + + + add r11,rcx + adc r8,rax + + mulx r10,rcx,r10 + adc r9,rcx + shlx rcx,r11,rsi + adc r10,0 + shrx rax,r11,rsi + + + add r8,rcx + adc r9,rax + + mulx r11,rcx,r11 + adc r10,rcx + adc r11,0 + + xor rdx,rdx + add r12,r8 + mov rsi,QWORD[(($L$poly+8))] + adc r13,r9 + mov r8,r12 + adc r14,r10 + adc r15,r11 + mov r9,r13 + adc rdx,0 + + sub r12,-1 + mov r10,r14 + sbb r13,rsi + sbb r14,0 + mov r11,r15 + sbb r15,rbp + sbb rdx,0 + + cmovc r12,r8 + cmovc r13,r9 + 
mov QWORD[rdi],r12 + cmovc r14,r10 + mov QWORD[8+rdi],r13 + cmovc r15,r11 + mov QWORD[16+rdi],r14 + mov QWORD[24+rdi],r15 + + ret + + + + +global ecp_nistz256_select_w5_nohw + +ALIGN 32 +ecp_nistz256_select_w5_nohw: + +_CET_ENDBR + lea rax,[((-136))+rsp] +$L$SEH_begin_ecp_nistz256_select_w5_nohw: + DB 0x48,0x8d,0x60,0xe0 + DB 0x0f,0x29,0x70,0xe0 + DB 0x0f,0x29,0x78,0xf0 + DB 0x44,0x0f,0x29,0x00 + DB 0x44,0x0f,0x29,0x48,0x10 + DB 0x44,0x0f,0x29,0x50,0x20 + DB 0x44,0x0f,0x29,0x58,0x30 + DB 0x44,0x0f,0x29,0x60,0x40 + DB 0x44,0x0f,0x29,0x68,0x50 + DB 0x44,0x0f,0x29,0x70,0x60 + DB 0x44,0x0f,0x29,0x78,0x70 + movdqa xmm0,XMMWORD[$L$One] + movd xmm1,r8d + + pxor xmm2,xmm2 + pxor xmm3,xmm3 + pxor xmm4,xmm4 + pxor xmm5,xmm5 + pxor xmm6,xmm6 + pxor xmm7,xmm7 + + movdqa xmm8,xmm0 + pshufd xmm1,xmm1,0 + + mov rax,16 +$L$select_loop_sse_w5: + + movdqa xmm15,xmm8 + paddd xmm8,xmm0 + pcmpeqd xmm15,xmm1 + + movdqa xmm9,XMMWORD[rdx] + movdqa xmm10,XMMWORD[16+rdx] + movdqa xmm11,XMMWORD[32+rdx] + movdqa xmm12,XMMWORD[48+rdx] + movdqa xmm13,XMMWORD[64+rdx] + movdqa xmm14,XMMWORD[80+rdx] + lea rdx,[96+rdx] + + pand xmm9,xmm15 + pand xmm10,xmm15 + por xmm2,xmm9 + pand xmm11,xmm15 + por xmm3,xmm10 + pand xmm12,xmm15 + por xmm4,xmm11 + pand xmm13,xmm15 + por xmm5,xmm12 + pand xmm14,xmm15 + por xmm6,xmm13 + por xmm7,xmm14 + + dec rax + jnz NEAR $L$select_loop_sse_w5 + + movdqu XMMWORD[rcx],xmm2 + movdqu XMMWORD[16+rcx],xmm3 + movdqu XMMWORD[32+rcx],xmm4 + movdqu XMMWORD[48+rcx],xmm5 + movdqu XMMWORD[64+rcx],xmm6 + movdqu XMMWORD[80+rcx],xmm7 + movaps xmm6,XMMWORD[rsp] + movaps xmm7,XMMWORD[16+rsp] + movaps xmm8,XMMWORD[32+rsp] + movaps xmm9,XMMWORD[48+rsp] + movaps xmm10,XMMWORD[64+rsp] + movaps xmm11,XMMWORD[80+rsp] + movaps xmm12,XMMWORD[96+rsp] + movaps xmm13,XMMWORD[112+rsp] + movaps xmm14,XMMWORD[128+rsp] + movaps xmm15,XMMWORD[144+rsp] + lea rsp,[168+rsp] + ret + +$L$SEH_end_ecp_nistz256_select_w5_nohw: + + + + +global ecp_nistz256_select_w7_nohw + +ALIGN 32 +ecp_nistz256_select_w7_nohw: + +_CET_ENDBR + lea rax,[((-136))+rsp] +$L$SEH_begin_ecp_nistz256_select_w7_nohw: + DB 0x48,0x8d,0x60,0xe0 + DB 0x0f,0x29,0x70,0xe0 + DB 0x0f,0x29,0x78,0xf0 + DB 0x44,0x0f,0x29,0x00 + DB 0x44,0x0f,0x29,0x48,0x10 + DB 0x44,0x0f,0x29,0x50,0x20 + DB 0x44,0x0f,0x29,0x58,0x30 + DB 0x44,0x0f,0x29,0x60,0x40 + DB 0x44,0x0f,0x29,0x68,0x50 + DB 0x44,0x0f,0x29,0x70,0x60 + DB 0x44,0x0f,0x29,0x78,0x70 + movdqa xmm8,XMMWORD[$L$One] + movd xmm1,r8d + + pxor xmm2,xmm2 + pxor xmm3,xmm3 + pxor xmm4,xmm4 + pxor xmm5,xmm5 + + movdqa xmm0,xmm8 + pshufd xmm1,xmm1,0 + mov rax,64 + +$L$select_loop_sse_w7: + movdqa xmm15,xmm8 + paddd xmm8,xmm0 + movdqa xmm9,XMMWORD[rdx] + movdqa xmm10,XMMWORD[16+rdx] + pcmpeqd xmm15,xmm1 + movdqa xmm11,XMMWORD[32+rdx] + movdqa xmm12,XMMWORD[48+rdx] + lea rdx,[64+rdx] + + pand xmm9,xmm15 + pand xmm10,xmm15 + por xmm2,xmm9 + pand xmm11,xmm15 + por xmm3,xmm10 + pand xmm12,xmm15 + por xmm4,xmm11 + prefetcht0 [255+rdx] + por xmm5,xmm12 + + dec rax + jnz NEAR $L$select_loop_sse_w7 + + movdqu XMMWORD[rcx],xmm2 + movdqu XMMWORD[16+rcx],xmm3 + movdqu XMMWORD[32+rcx],xmm4 + movdqu XMMWORD[48+rcx],xmm5 + movaps xmm6,XMMWORD[rsp] + movaps xmm7,XMMWORD[16+rsp] + movaps xmm8,XMMWORD[32+rsp] + movaps xmm9,XMMWORD[48+rsp] + movaps xmm10,XMMWORD[64+rsp] + movaps xmm11,XMMWORD[80+rsp] + movaps xmm12,XMMWORD[96+rsp] + movaps xmm13,XMMWORD[112+rsp] + movaps xmm14,XMMWORD[128+rsp] + movaps xmm15,XMMWORD[144+rsp] + lea rsp,[168+rsp] + ret + +$L$SEH_end_ecp_nistz256_select_w7_nohw: + + + +global ecp_nistz256_select_w5_avx2 + +ALIGN 32 
+ecp_nistz256_select_w5_avx2: + +_CET_ENDBR + vzeroupper + lea rax,[((-136))+rsp] + mov r11,rsp +$L$SEH_begin_ecp_nistz256_select_w5_avx2: + DB 0x48,0x8d,0x60,0xe0 + DB 0xc5,0xf8,0x29,0x70,0xe0 + DB 0xc5,0xf8,0x29,0x78,0xf0 + DB 0xc5,0x78,0x29,0x40,0x00 + DB 0xc5,0x78,0x29,0x48,0x10 + DB 0xc5,0x78,0x29,0x50,0x20 + DB 0xc5,0x78,0x29,0x58,0x30 + DB 0xc5,0x78,0x29,0x60,0x40 + DB 0xc5,0x78,0x29,0x68,0x50 + DB 0xc5,0x78,0x29,0x70,0x60 + DB 0xc5,0x78,0x29,0x78,0x70 + vmovdqa ymm0,YMMWORD[$L$Two] + + vpxor ymm2,ymm2,ymm2 + vpxor ymm3,ymm3,ymm3 + vpxor ymm4,ymm4,ymm4 + + vmovdqa ymm5,YMMWORD[$L$One] + vmovdqa ymm10,YMMWORD[$L$Two] + + vmovd xmm1,r8d + vpermd ymm1,ymm2,ymm1 + + mov rax,8 +$L$select_loop_avx2_w5: + + vmovdqa ymm6,YMMWORD[rdx] + vmovdqa ymm7,YMMWORD[32+rdx] + vmovdqa ymm8,YMMWORD[64+rdx] + + vmovdqa ymm11,YMMWORD[96+rdx] + vmovdqa ymm12,YMMWORD[128+rdx] + vmovdqa ymm13,YMMWORD[160+rdx] + + vpcmpeqd ymm9,ymm5,ymm1 + vpcmpeqd ymm14,ymm10,ymm1 + + vpaddd ymm5,ymm5,ymm0 + vpaddd ymm10,ymm10,ymm0 + lea rdx,[192+rdx] + + vpand ymm6,ymm6,ymm9 + vpand ymm7,ymm7,ymm9 + vpand ymm8,ymm8,ymm9 + vpand ymm11,ymm11,ymm14 + vpand ymm12,ymm12,ymm14 + vpand ymm13,ymm13,ymm14 + + vpxor ymm2,ymm2,ymm6 + vpxor ymm3,ymm3,ymm7 + vpxor ymm4,ymm4,ymm8 + vpxor ymm2,ymm2,ymm11 + vpxor ymm3,ymm3,ymm12 + vpxor ymm4,ymm4,ymm13 + + dec rax + jnz NEAR $L$select_loop_avx2_w5 + + vmovdqu YMMWORD[rcx],ymm2 + vmovdqu YMMWORD[32+rcx],ymm3 + vmovdqu YMMWORD[64+rcx],ymm4 + vzeroupper + movaps xmm6,XMMWORD[rsp] + movaps xmm7,XMMWORD[16+rsp] + movaps xmm8,XMMWORD[32+rsp] + movaps xmm9,XMMWORD[48+rsp] + movaps xmm10,XMMWORD[64+rsp] + movaps xmm11,XMMWORD[80+rsp] + movaps xmm12,XMMWORD[96+rsp] + movaps xmm13,XMMWORD[112+rsp] + movaps xmm14,XMMWORD[128+rsp] + movaps xmm15,XMMWORD[144+rsp] + lea rsp,[r11] + ret + +$L$SEH_end_ecp_nistz256_select_w5_avx2: + + + + +global ecp_nistz256_select_w7_avx2 + +ALIGN 32 +ecp_nistz256_select_w7_avx2: + +_CET_ENDBR + vzeroupper + mov r11,rsp + lea rax,[((-136))+rsp] +$L$SEH_begin_ecp_nistz256_select_w7_avx2: + DB 0x48,0x8d,0x60,0xe0 + DB 0xc5,0xf8,0x29,0x70,0xe0 + DB 0xc5,0xf8,0x29,0x78,0xf0 + DB 0xc5,0x78,0x29,0x40,0x00 + DB 0xc5,0x78,0x29,0x48,0x10 + DB 0xc5,0x78,0x29,0x50,0x20 + DB 0xc5,0x78,0x29,0x58,0x30 + DB 0xc5,0x78,0x29,0x60,0x40 + DB 0xc5,0x78,0x29,0x68,0x50 + DB 0xc5,0x78,0x29,0x70,0x60 + DB 0xc5,0x78,0x29,0x78,0x70 + vmovdqa ymm0,YMMWORD[$L$Three] + + vpxor ymm2,ymm2,ymm2 + vpxor ymm3,ymm3,ymm3 + + vmovdqa ymm4,YMMWORD[$L$One] + vmovdqa ymm8,YMMWORD[$L$Two] + vmovdqa ymm12,YMMWORD[$L$Three] + + vmovd xmm1,r8d + vpermd ymm1,ymm2,ymm1 + + + mov rax,21 +$L$select_loop_avx2_w7: + + vmovdqa ymm5,YMMWORD[rdx] + vmovdqa ymm6,YMMWORD[32+rdx] + + vmovdqa ymm9,YMMWORD[64+rdx] + vmovdqa ymm10,YMMWORD[96+rdx] + + vmovdqa ymm13,YMMWORD[128+rdx] + vmovdqa ymm14,YMMWORD[160+rdx] + + vpcmpeqd ymm7,ymm4,ymm1 + vpcmpeqd ymm11,ymm8,ymm1 + vpcmpeqd ymm15,ymm12,ymm1 + + vpaddd ymm4,ymm4,ymm0 + vpaddd ymm8,ymm8,ymm0 + vpaddd ymm12,ymm12,ymm0 + lea rdx,[192+rdx] + + vpand ymm5,ymm5,ymm7 + vpand ymm6,ymm6,ymm7 + vpand ymm9,ymm9,ymm11 + vpand ymm10,ymm10,ymm11 + vpand ymm13,ymm13,ymm15 + vpand ymm14,ymm14,ymm15 + + vpxor ymm2,ymm2,ymm5 + vpxor ymm3,ymm3,ymm6 + vpxor ymm2,ymm2,ymm9 + vpxor ymm3,ymm3,ymm10 + vpxor ymm2,ymm2,ymm13 + vpxor ymm3,ymm3,ymm14 + + dec rax + jnz NEAR $L$select_loop_avx2_w7 + + + vmovdqa ymm5,YMMWORD[rdx] + vmovdqa ymm6,YMMWORD[32+rdx] + + vpcmpeqd ymm7,ymm4,ymm1 + + vpand ymm5,ymm5,ymm7 + vpand ymm6,ymm6,ymm7 + + vpxor ymm2,ymm2,ymm5 + vpxor ymm3,ymm3,ymm6 + + vmovdqu 
YMMWORD[rcx],ymm2 + vmovdqu YMMWORD[32+rcx],ymm3 + vzeroupper + movaps xmm6,XMMWORD[rsp] + movaps xmm7,XMMWORD[16+rsp] + movaps xmm8,XMMWORD[32+rsp] + movaps xmm9,XMMWORD[48+rsp] + movaps xmm10,XMMWORD[64+rsp] + movaps xmm11,XMMWORD[80+rsp] + movaps xmm12,XMMWORD[96+rsp] + movaps xmm13,XMMWORD[112+rsp] + movaps xmm14,XMMWORD[128+rsp] + movaps xmm15,XMMWORD[144+rsp] + lea rsp,[r11] + ret + +$L$SEH_end_ecp_nistz256_select_w7_avx2: + + +ALIGN 32 +__ecp_nistz256_add_toq: + + xor r11,r11 + add r12,QWORD[rbx] + adc r13,QWORD[8+rbx] + mov rax,r12 + adc r8,QWORD[16+rbx] + adc r9,QWORD[24+rbx] + mov rbp,r13 + adc r11,0 + + sub r12,-1 + mov rcx,r8 + sbb r13,r14 + sbb r8,0 + mov r10,r9 + sbb r9,r15 + sbb r11,0 + + cmovc r12,rax + cmovc r13,rbp + mov QWORD[rdi],r12 + cmovc r8,rcx + mov QWORD[8+rdi],r13 + cmovc r9,r10 + mov QWORD[16+rdi],r8 + mov QWORD[24+rdi],r9 + + ret + + + + +ALIGN 32 +__ecp_nistz256_sub_fromq: + + sub r12,QWORD[rbx] + sbb r13,QWORD[8+rbx] + mov rax,r12 + sbb r8,QWORD[16+rbx] + sbb r9,QWORD[24+rbx] + mov rbp,r13 + sbb r11,r11 + + add r12,-1 + mov rcx,r8 + adc r13,r14 + adc r8,0 + mov r10,r9 + adc r9,r15 + test r11,r11 + + cmovz r12,rax + cmovz r13,rbp + mov QWORD[rdi],r12 + cmovz r8,rcx + mov QWORD[8+rdi],r13 + cmovz r9,r10 + mov QWORD[16+rdi],r8 + mov QWORD[24+rdi],r9 + + ret + + + + +ALIGN 32 +__ecp_nistz256_subq: + + sub rax,r12 + sbb rbp,r13 + mov r12,rax + sbb rcx,r8 + sbb r10,r9 + mov r13,rbp + sbb r11,r11 + + add rax,-1 + mov r8,rcx + adc rbp,r14 + adc rcx,0 + mov r9,r10 + adc r10,r15 + test r11,r11 + + cmovnz r12,rax + cmovnz r13,rbp + cmovnz r8,rcx + cmovnz r9,r10 + + ret + + + + +ALIGN 32 +__ecp_nistz256_mul_by_2q: + + xor r11,r11 + add r12,r12 + adc r13,r13 + mov rax,r12 + adc r8,r8 + adc r9,r9 + mov rbp,r13 + adc r11,0 + + sub r12,-1 + mov rcx,r8 + sbb r13,r14 + sbb r8,0 + mov r10,r9 + sbb r9,r15 + sbb r11,0 + + cmovc r12,rax + cmovc r13,rbp + mov QWORD[rdi],r12 + cmovc r8,rcx + mov QWORD[8+rdi],r13 + cmovc r9,r10 + mov QWORD[16+rdi],r8 + mov QWORD[24+rdi],r9 + + ret + + +global ecp_nistz256_point_double_nohw + +ALIGN 32 +ecp_nistz256_point_double_nohw: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ecp_nistz256_point_double_nohw: + mov rdi,rcx + mov rsi,rdx + + + +_CET_ENDBR + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,32*5+8 + +$L$point_doubleq_body: + +$L$point_double_shortcutq: + movdqu xmm0,XMMWORD[rsi] + mov rbx,rsi + movdqu xmm1,XMMWORD[16+rsi] + mov r12,QWORD[((32+0))+rsi] + mov r13,QWORD[((32+8))+rsi] + mov r8,QWORD[((32+16))+rsi] + mov r9,QWORD[((32+24))+rsi] + mov r14,QWORD[(($L$poly+8))] + mov r15,QWORD[(($L$poly+24))] + movdqa XMMWORD[96+rsp],xmm0 + movdqa XMMWORD[(96+16)+rsp],xmm1 + lea r10,[32+rdi] + lea r11,[64+rdi] +DB 102,72,15,110,199 +DB 102,73,15,110,202 +DB 102,73,15,110,211 + + lea rdi,[rsp] + call __ecp_nistz256_mul_by_2q + + mov rax,QWORD[((64+0))+rsi] + mov r14,QWORD[((64+8))+rsi] + mov r15,QWORD[((64+16))+rsi] + mov r8,QWORD[((64+24))+rsi] + lea rsi,[((64-0))+rsi] + lea rdi,[64+rsp] + call __ecp_nistz256_sqr_montq + + mov rax,QWORD[((0+0))+rsp] + mov r14,QWORD[((8+0))+rsp] + lea rsi,[((0+0))+rsp] + mov r15,QWORD[((16+0))+rsp] + mov r8,QWORD[((24+0))+rsp] + lea rdi,[rsp] + call __ecp_nistz256_sqr_montq + + mov rax,QWORD[32+rbx] + mov r9,QWORD[((64+0))+rbx] + mov r10,QWORD[((64+8))+rbx] + mov r11,QWORD[((64+16))+rbx] + mov r12,QWORD[((64+24))+rbx] + lea rsi,[((64-0))+rbx] + lea rbx,[32+rbx] +DB 102,72,15,126,215 + call __ecp_nistz256_mul_montq + call 
__ecp_nistz256_mul_by_2q + + mov r12,QWORD[((96+0))+rsp] + mov r13,QWORD[((96+8))+rsp] + lea rbx,[64+rsp] + mov r8,QWORD[((96+16))+rsp] + mov r9,QWORD[((96+24))+rsp] + lea rdi,[32+rsp] + call __ecp_nistz256_add_toq + + mov r12,QWORD[((96+0))+rsp] + mov r13,QWORD[((96+8))+rsp] + lea rbx,[64+rsp] + mov r8,QWORD[((96+16))+rsp] + mov r9,QWORD[((96+24))+rsp] + lea rdi,[64+rsp] + call __ecp_nistz256_sub_fromq + + mov rax,QWORD[((0+0))+rsp] + mov r14,QWORD[((8+0))+rsp] + lea rsi,[((0+0))+rsp] + mov r15,QWORD[((16+0))+rsp] + mov r8,QWORD[((24+0))+rsp] +DB 102,72,15,126,207 + call __ecp_nistz256_sqr_montq + xor r9,r9 + mov rax,r12 + add r12,-1 + mov r10,r13 + adc r13,rsi + mov rcx,r14 + adc r14,0 + mov r8,r15 + adc r15,rbp + adc r9,0 + xor rsi,rsi + test rax,1 + + cmovz r12,rax + cmovz r13,r10 + cmovz r14,rcx + cmovz r15,r8 + cmovz r9,rsi + + mov rax,r13 + shr r12,1 + shl rax,63 + mov r10,r14 + shr r13,1 + or r12,rax + shl r10,63 + mov rcx,r15 + shr r14,1 + or r13,r10 + shl rcx,63 + mov QWORD[rdi],r12 + shr r15,1 + mov QWORD[8+rdi],r13 + shl r9,63 + or r14,rcx + or r15,r9 + mov QWORD[16+rdi],r14 + mov QWORD[24+rdi],r15 + mov rax,QWORD[64+rsp] + lea rbx,[64+rsp] + mov r9,QWORD[((0+32))+rsp] + mov r10,QWORD[((8+32))+rsp] + lea rsi,[((0+32))+rsp] + mov r11,QWORD[((16+32))+rsp] + mov r12,QWORD[((24+32))+rsp] + lea rdi,[32+rsp] + call __ecp_nistz256_mul_montq + + lea rdi,[128+rsp] + call __ecp_nistz256_mul_by_2q + + lea rbx,[32+rsp] + lea rdi,[32+rsp] + call __ecp_nistz256_add_toq + + mov rax,QWORD[96+rsp] + lea rbx,[96+rsp] + mov r9,QWORD[((0+0))+rsp] + mov r10,QWORD[((8+0))+rsp] + lea rsi,[((0+0))+rsp] + mov r11,QWORD[((16+0))+rsp] + mov r12,QWORD[((24+0))+rsp] + lea rdi,[rsp] + call __ecp_nistz256_mul_montq + + lea rdi,[128+rsp] + call __ecp_nistz256_mul_by_2q + + mov rax,QWORD[((0+32))+rsp] + mov r14,QWORD[((8+32))+rsp] + lea rsi,[((0+32))+rsp] + mov r15,QWORD[((16+32))+rsp] + mov r8,QWORD[((24+32))+rsp] +DB 102,72,15,126,199 + call __ecp_nistz256_sqr_montq + + lea rbx,[128+rsp] + mov r8,r14 + mov r9,r15 + mov r14,rsi + mov r15,rbp + call __ecp_nistz256_sub_fromq + + mov rax,QWORD[((0+0))+rsp] + mov rbp,QWORD[((0+8))+rsp] + mov rcx,QWORD[((0+16))+rsp] + mov r10,QWORD[((0+24))+rsp] + lea rdi,[rsp] + call __ecp_nistz256_subq + + mov rax,QWORD[32+rsp] + lea rbx,[32+rsp] + mov r14,r12 + xor ecx,ecx + mov QWORD[((0+0))+rsp],r12 + mov r10,r13 + mov QWORD[((0+8))+rsp],r13 + cmovz r11,r8 + mov QWORD[((0+16))+rsp],r8 + lea rsi,[((0-0))+rsp] + cmovz r12,r9 + mov QWORD[((0+24))+rsp],r9 + mov r9,r14 + lea rdi,[rsp] + call __ecp_nistz256_mul_montq + +DB 102,72,15,126,203 +DB 102,72,15,126,207 + call __ecp_nistz256_sub_fromq + + lea rsi,[((160+56))+rsp] + + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbx,QWORD[((-16))+rsi] + + mov rbp,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$point_doubleq_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_ecp_nistz256_point_double_nohw: +global ecp_nistz256_point_add_nohw + +ALIGN 32 +ecp_nistz256_point_add_nohw: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ecp_nistz256_point_add_nohw: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + +_CET_ENDBR + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,32*18+8 + +$L$point_addq_body: + + movdqu xmm0,XMMWORD[rsi] + movdqu xmm1,XMMWORD[16+rsi] + movdqu xmm2,XMMWORD[32+rsi] + movdqu xmm3,XMMWORD[48+rsi] + movdqu xmm4,XMMWORD[64+rsi] + 
movdqu xmm5,XMMWORD[80+rsi] + mov rbx,rsi + mov rsi,rdx + movdqa XMMWORD[384+rsp],xmm0 + movdqa XMMWORD[(384+16)+rsp],xmm1 + movdqa XMMWORD[416+rsp],xmm2 + movdqa XMMWORD[(416+16)+rsp],xmm3 + movdqa XMMWORD[448+rsp],xmm4 + movdqa XMMWORD[(448+16)+rsp],xmm5 + por xmm5,xmm4 + + movdqu xmm0,XMMWORD[rsi] + pshufd xmm3,xmm5,0xb1 + movdqu xmm1,XMMWORD[16+rsi] + movdqu xmm2,XMMWORD[32+rsi] + por xmm5,xmm3 + movdqu xmm3,XMMWORD[48+rsi] + mov rax,QWORD[((64+0))+rsi] + mov r14,QWORD[((64+8))+rsi] + mov r15,QWORD[((64+16))+rsi] + mov r8,QWORD[((64+24))+rsi] + movdqa XMMWORD[480+rsp],xmm0 + pshufd xmm4,xmm5,0x1e + movdqa XMMWORD[(480+16)+rsp],xmm1 + movdqu xmm0,XMMWORD[64+rsi] + movdqu xmm1,XMMWORD[80+rsi] + movdqa XMMWORD[512+rsp],xmm2 + movdqa XMMWORD[(512+16)+rsp],xmm3 + por xmm5,xmm4 + pxor xmm4,xmm4 + por xmm1,xmm0 +DB 102,72,15,110,199 + + lea rsi,[((64-0))+rsi] + mov QWORD[((544+0))+rsp],rax + mov QWORD[((544+8))+rsp],r14 + mov QWORD[((544+16))+rsp],r15 + mov QWORD[((544+24))+rsp],r8 + lea rdi,[96+rsp] + call __ecp_nistz256_sqr_montq + + pcmpeqd xmm5,xmm4 + pshufd xmm4,xmm1,0xb1 + por xmm4,xmm1 + pshufd xmm5,xmm5,0 + pshufd xmm3,xmm4,0x1e + por xmm4,xmm3 + pxor xmm3,xmm3 + pcmpeqd xmm4,xmm3 + pshufd xmm4,xmm4,0 + mov rax,QWORD[((64+0))+rbx] + mov r14,QWORD[((64+8))+rbx] + mov r15,QWORD[((64+16))+rbx] + mov r8,QWORD[((64+24))+rbx] +DB 102,72,15,110,203 + + lea rsi,[((64-0))+rbx] + lea rdi,[32+rsp] + call __ecp_nistz256_sqr_montq + + mov rax,QWORD[544+rsp] + lea rbx,[544+rsp] + mov r9,QWORD[((0+96))+rsp] + mov r10,QWORD[((8+96))+rsp] + lea rsi,[((0+96))+rsp] + mov r11,QWORD[((16+96))+rsp] + mov r12,QWORD[((24+96))+rsp] + lea rdi,[224+rsp] + call __ecp_nistz256_mul_montq + + mov rax,QWORD[448+rsp] + lea rbx,[448+rsp] + mov r9,QWORD[((0+32))+rsp] + mov r10,QWORD[((8+32))+rsp] + lea rsi,[((0+32))+rsp] + mov r11,QWORD[((16+32))+rsp] + mov r12,QWORD[((24+32))+rsp] + lea rdi,[256+rsp] + call __ecp_nistz256_mul_montq + + mov rax,QWORD[416+rsp] + lea rbx,[416+rsp] + mov r9,QWORD[((0+224))+rsp] + mov r10,QWORD[((8+224))+rsp] + lea rsi,[((0+224))+rsp] + mov r11,QWORD[((16+224))+rsp] + mov r12,QWORD[((24+224))+rsp] + lea rdi,[224+rsp] + call __ecp_nistz256_mul_montq + + mov rax,QWORD[512+rsp] + lea rbx,[512+rsp] + mov r9,QWORD[((0+256))+rsp] + mov r10,QWORD[((8+256))+rsp] + lea rsi,[((0+256))+rsp] + mov r11,QWORD[((16+256))+rsp] + mov r12,QWORD[((24+256))+rsp] + lea rdi,[256+rsp] + call __ecp_nistz256_mul_montq + + lea rbx,[224+rsp] + lea rdi,[64+rsp] + call __ecp_nistz256_sub_fromq + + or r12,r13 + movdqa xmm2,xmm4 + or r12,r8 + or r12,r9 + por xmm2,xmm5 +DB 102,73,15,110,220 + + mov rax,QWORD[384+rsp] + lea rbx,[384+rsp] + mov r9,QWORD[((0+96))+rsp] + mov r10,QWORD[((8+96))+rsp] + lea rsi,[((0+96))+rsp] + mov r11,QWORD[((16+96))+rsp] + mov r12,QWORD[((24+96))+rsp] + lea rdi,[160+rsp] + call __ecp_nistz256_mul_montq + + mov rax,QWORD[480+rsp] + lea rbx,[480+rsp] + mov r9,QWORD[((0+32))+rsp] + mov r10,QWORD[((8+32))+rsp] + lea rsi,[((0+32))+rsp] + mov r11,QWORD[((16+32))+rsp] + mov r12,QWORD[((24+32))+rsp] + lea rdi,[192+rsp] + call __ecp_nistz256_mul_montq + + lea rbx,[160+rsp] + lea rdi,[rsp] + call __ecp_nistz256_sub_fromq + + or r12,r13 + or r12,r8 + or r12,r9 + +DB 102,73,15,126,208 +DB 102,73,15,126,217 + or r12,r8 + DB 0x3e + jnz NEAR $L$add_proceedq + + + + test r9,r9 + jz NEAR $L$add_doubleq + + + + + + +DB 102,72,15,126,199 + pxor xmm0,xmm0 + movdqu XMMWORD[rdi],xmm0 + movdqu XMMWORD[16+rdi],xmm0 + movdqu XMMWORD[32+rdi],xmm0 + movdqu XMMWORD[48+rdi],xmm0 + movdqu XMMWORD[64+rdi],xmm0 + movdqu 
XMMWORD[80+rdi],xmm0 + jmp NEAR $L$add_doneq + +ALIGN 32 +$L$add_doubleq: +DB 102,72,15,126,206 +DB 102,72,15,126,199 + add rsp,416 + + jmp NEAR $L$point_double_shortcutq + + +ALIGN 32 +$L$add_proceedq: + mov rax,QWORD[((0+64))+rsp] + mov r14,QWORD[((8+64))+rsp] + lea rsi,[((0+64))+rsp] + mov r15,QWORD[((16+64))+rsp] + mov r8,QWORD[((24+64))+rsp] + lea rdi,[96+rsp] + call __ecp_nistz256_sqr_montq + + mov rax,QWORD[448+rsp] + lea rbx,[448+rsp] + mov r9,QWORD[((0+0))+rsp] + mov r10,QWORD[((8+0))+rsp] + lea rsi,[((0+0))+rsp] + mov r11,QWORD[((16+0))+rsp] + mov r12,QWORD[((24+0))+rsp] + lea rdi,[352+rsp] + call __ecp_nistz256_mul_montq + + mov rax,QWORD[((0+0))+rsp] + mov r14,QWORD[((8+0))+rsp] + lea rsi,[((0+0))+rsp] + mov r15,QWORD[((16+0))+rsp] + mov r8,QWORD[((24+0))+rsp] + lea rdi,[32+rsp] + call __ecp_nistz256_sqr_montq + + mov rax,QWORD[544+rsp] + lea rbx,[544+rsp] + mov r9,QWORD[((0+352))+rsp] + mov r10,QWORD[((8+352))+rsp] + lea rsi,[((0+352))+rsp] + mov r11,QWORD[((16+352))+rsp] + mov r12,QWORD[((24+352))+rsp] + lea rdi,[352+rsp] + call __ecp_nistz256_mul_montq + + mov rax,QWORD[rsp] + lea rbx,[rsp] + mov r9,QWORD[((0+32))+rsp] + mov r10,QWORD[((8+32))+rsp] + lea rsi,[((0+32))+rsp] + mov r11,QWORD[((16+32))+rsp] + mov r12,QWORD[((24+32))+rsp] + lea rdi,[128+rsp] + call __ecp_nistz256_mul_montq + + mov rax,QWORD[160+rsp] + lea rbx,[160+rsp] + mov r9,QWORD[((0+32))+rsp] + mov r10,QWORD[((8+32))+rsp] + lea rsi,[((0+32))+rsp] + mov r11,QWORD[((16+32))+rsp] + mov r12,QWORD[((24+32))+rsp] + lea rdi,[192+rsp] + call __ecp_nistz256_mul_montq + + + + + xor r11,r11 + add r12,r12 + lea rsi,[96+rsp] + adc r13,r13 + mov rax,r12 + adc r8,r8 + adc r9,r9 + mov rbp,r13 + adc r11,0 + + sub r12,-1 + mov rcx,r8 + sbb r13,r14 + sbb r8,0 + mov r10,r9 + sbb r9,r15 + sbb r11,0 + + cmovc r12,rax + mov rax,QWORD[rsi] + cmovc r13,rbp + mov rbp,QWORD[8+rsi] + cmovc r8,rcx + mov rcx,QWORD[16+rsi] + cmovc r9,r10 + mov r10,QWORD[24+rsi] + + call __ecp_nistz256_subq + + lea rbx,[128+rsp] + lea rdi,[288+rsp] + call __ecp_nistz256_sub_fromq + + mov rax,QWORD[((192+0))+rsp] + mov rbp,QWORD[((192+8))+rsp] + mov rcx,QWORD[((192+16))+rsp] + mov r10,QWORD[((192+24))+rsp] + lea rdi,[320+rsp] + + call __ecp_nistz256_subq + + mov QWORD[rdi],r12 + mov QWORD[8+rdi],r13 + mov QWORD[16+rdi],r8 + mov QWORD[24+rdi],r9 + mov rax,QWORD[128+rsp] + lea rbx,[128+rsp] + mov r9,QWORD[((0+224))+rsp] + mov r10,QWORD[((8+224))+rsp] + lea rsi,[((0+224))+rsp] + mov r11,QWORD[((16+224))+rsp] + mov r12,QWORD[((24+224))+rsp] + lea rdi,[256+rsp] + call __ecp_nistz256_mul_montq + + mov rax,QWORD[320+rsp] + lea rbx,[320+rsp] + mov r9,QWORD[((0+64))+rsp] + mov r10,QWORD[((8+64))+rsp] + lea rsi,[((0+64))+rsp] + mov r11,QWORD[((16+64))+rsp] + mov r12,QWORD[((24+64))+rsp] + lea rdi,[320+rsp] + call __ecp_nistz256_mul_montq + + lea rbx,[256+rsp] + lea rdi,[320+rsp] + call __ecp_nistz256_sub_fromq + +DB 102,72,15,126,199 + + movdqa xmm0,xmm5 + movdqa xmm1,xmm5 + pandn xmm0,XMMWORD[352+rsp] + movdqa xmm2,xmm5 + pandn xmm1,XMMWORD[((352+16))+rsp] + movdqa xmm3,xmm5 + pand xmm2,XMMWORD[544+rsp] + pand xmm3,XMMWORD[((544+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + + movdqa xmm0,xmm4 + movdqa xmm1,xmm4 + pandn xmm0,xmm2 + movdqa xmm2,xmm4 + pandn xmm1,xmm3 + movdqa xmm3,xmm4 + pand xmm2,XMMWORD[448+rsp] + pand xmm3,XMMWORD[((448+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + movdqu XMMWORD[64+rdi],xmm2 + movdqu XMMWORD[80+rdi],xmm3 + + movdqa xmm0,xmm5 + movdqa xmm1,xmm5 + pandn xmm0,XMMWORD[288+rsp] + movdqa xmm2,xmm5 + pandn xmm1,XMMWORD[((288+16))+rsp] + 
movdqa xmm3,xmm5 + pand xmm2,XMMWORD[480+rsp] + pand xmm3,XMMWORD[((480+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + + movdqa xmm0,xmm4 + movdqa xmm1,xmm4 + pandn xmm0,xmm2 + movdqa xmm2,xmm4 + pandn xmm1,xmm3 + movdqa xmm3,xmm4 + pand xmm2,XMMWORD[384+rsp] + pand xmm3,XMMWORD[((384+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + movdqu XMMWORD[rdi],xmm2 + movdqu XMMWORD[16+rdi],xmm3 + + movdqa xmm0,xmm5 + movdqa xmm1,xmm5 + pandn xmm0,XMMWORD[320+rsp] + movdqa xmm2,xmm5 + pandn xmm1,XMMWORD[((320+16))+rsp] + movdqa xmm3,xmm5 + pand xmm2,XMMWORD[512+rsp] + pand xmm3,XMMWORD[((512+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + + movdqa xmm0,xmm4 + movdqa xmm1,xmm4 + pandn xmm0,xmm2 + movdqa xmm2,xmm4 + pandn xmm1,xmm3 + movdqa xmm3,xmm4 + pand xmm2,XMMWORD[416+rsp] + pand xmm3,XMMWORD[((416+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + movdqu XMMWORD[32+rdi],xmm2 + movdqu XMMWORD[48+rdi],xmm3 + +$L$add_doneq: + lea rsi,[((576+56))+rsp] + + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbx,QWORD[((-16))+rsi] + + mov rbp,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$point_addq_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_ecp_nistz256_point_add_nohw: +global ecp_nistz256_point_add_affine_nohw + +ALIGN 32 +ecp_nistz256_point_add_affine_nohw: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ecp_nistz256_point_add_affine_nohw: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + +_CET_ENDBR + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,32*15+8 + +$L$add_affineq_body: + + movdqu xmm0,XMMWORD[rsi] + mov rbx,rdx + movdqu xmm1,XMMWORD[16+rsi] + movdqu xmm2,XMMWORD[32+rsi] + movdqu xmm3,XMMWORD[48+rsi] + movdqu xmm4,XMMWORD[64+rsi] + movdqu xmm5,XMMWORD[80+rsi] + mov rax,QWORD[((64+0))+rsi] + mov r14,QWORD[((64+8))+rsi] + mov r15,QWORD[((64+16))+rsi] + mov r8,QWORD[((64+24))+rsi] + movdqa XMMWORD[320+rsp],xmm0 + movdqa XMMWORD[(320+16)+rsp],xmm1 + movdqa XMMWORD[352+rsp],xmm2 + movdqa XMMWORD[(352+16)+rsp],xmm3 + movdqa XMMWORD[384+rsp],xmm4 + movdqa XMMWORD[(384+16)+rsp],xmm5 + por xmm5,xmm4 + + movdqu xmm0,XMMWORD[rbx] + pshufd xmm3,xmm5,0xb1 + movdqu xmm1,XMMWORD[16+rbx] + movdqu xmm2,XMMWORD[32+rbx] + por xmm5,xmm3 + movdqu xmm3,XMMWORD[48+rbx] + movdqa XMMWORD[416+rsp],xmm0 + pshufd xmm4,xmm5,0x1e + movdqa XMMWORD[(416+16)+rsp],xmm1 + por xmm1,xmm0 +DB 102,72,15,110,199 + movdqa XMMWORD[448+rsp],xmm2 + movdqa XMMWORD[(448+16)+rsp],xmm3 + por xmm3,xmm2 + por xmm5,xmm4 + pxor xmm4,xmm4 + por xmm3,xmm1 + + lea rsi,[((64-0))+rsi] + lea rdi,[32+rsp] + call __ecp_nistz256_sqr_montq + + pcmpeqd xmm5,xmm4 + pshufd xmm4,xmm3,0xb1 + mov rax,QWORD[rbx] + + mov r9,r12 + por xmm4,xmm3 + pshufd xmm5,xmm5,0 + pshufd xmm3,xmm4,0x1e + mov r10,r13 + por xmm4,xmm3 + pxor xmm3,xmm3 + mov r11,r14 + pcmpeqd xmm4,xmm3 + pshufd xmm4,xmm4,0 + + lea rsi,[((32-0))+rsp] + mov r12,r15 + lea rdi,[rsp] + call __ecp_nistz256_mul_montq + + lea rbx,[320+rsp] + lea rdi,[64+rsp] + call __ecp_nistz256_sub_fromq + + mov rax,QWORD[384+rsp] + lea rbx,[384+rsp] + mov r9,QWORD[((0+32))+rsp] + mov r10,QWORD[((8+32))+rsp] + lea rsi,[((0+32))+rsp] + mov r11,QWORD[((16+32))+rsp] + mov r12,QWORD[((24+32))+rsp] + lea rdi,[32+rsp] + call __ecp_nistz256_mul_montq + + mov rax,QWORD[384+rsp] + lea rbx,[384+rsp] + mov r9,QWORD[((0+64))+rsp] + mov r10,QWORD[((8+64))+rsp] + lea rsi,[((0+64))+rsp] + mov r11,QWORD[((16+64))+rsp] + mov r12,QWORD[((24+64))+rsp] + 
lea rdi,[288+rsp] + call __ecp_nistz256_mul_montq + + mov rax,QWORD[448+rsp] + lea rbx,[448+rsp] + mov r9,QWORD[((0+32))+rsp] + mov r10,QWORD[((8+32))+rsp] + lea rsi,[((0+32))+rsp] + mov r11,QWORD[((16+32))+rsp] + mov r12,QWORD[((24+32))+rsp] + lea rdi,[32+rsp] + call __ecp_nistz256_mul_montq + + lea rbx,[352+rsp] + lea rdi,[96+rsp] + call __ecp_nistz256_sub_fromq + + mov rax,QWORD[((0+64))+rsp] + mov r14,QWORD[((8+64))+rsp] + lea rsi,[((0+64))+rsp] + mov r15,QWORD[((16+64))+rsp] + mov r8,QWORD[((24+64))+rsp] + lea rdi,[128+rsp] + call __ecp_nistz256_sqr_montq + + mov rax,QWORD[((0+96))+rsp] + mov r14,QWORD[((8+96))+rsp] + lea rsi,[((0+96))+rsp] + mov r15,QWORD[((16+96))+rsp] + mov r8,QWORD[((24+96))+rsp] + lea rdi,[192+rsp] + call __ecp_nistz256_sqr_montq + + mov rax,QWORD[128+rsp] + lea rbx,[128+rsp] + mov r9,QWORD[((0+64))+rsp] + mov r10,QWORD[((8+64))+rsp] + lea rsi,[((0+64))+rsp] + mov r11,QWORD[((16+64))+rsp] + mov r12,QWORD[((24+64))+rsp] + lea rdi,[160+rsp] + call __ecp_nistz256_mul_montq + + mov rax,QWORD[320+rsp] + lea rbx,[320+rsp] + mov r9,QWORD[((0+128))+rsp] + mov r10,QWORD[((8+128))+rsp] + lea rsi,[((0+128))+rsp] + mov r11,QWORD[((16+128))+rsp] + mov r12,QWORD[((24+128))+rsp] + lea rdi,[rsp] + call __ecp_nistz256_mul_montq + + + + + xor r11,r11 + add r12,r12 + lea rsi,[192+rsp] + adc r13,r13 + mov rax,r12 + adc r8,r8 + adc r9,r9 + mov rbp,r13 + adc r11,0 + + sub r12,-1 + mov rcx,r8 + sbb r13,r14 + sbb r8,0 + mov r10,r9 + sbb r9,r15 + sbb r11,0 + + cmovc r12,rax + mov rax,QWORD[rsi] + cmovc r13,rbp + mov rbp,QWORD[8+rsi] + cmovc r8,rcx + mov rcx,QWORD[16+rsi] + cmovc r9,r10 + mov r10,QWORD[24+rsi] + + call __ecp_nistz256_subq + + lea rbx,[160+rsp] + lea rdi,[224+rsp] + call __ecp_nistz256_sub_fromq + + mov rax,QWORD[((0+0))+rsp] + mov rbp,QWORD[((0+8))+rsp] + mov rcx,QWORD[((0+16))+rsp] + mov r10,QWORD[((0+24))+rsp] + lea rdi,[64+rsp] + + call __ecp_nistz256_subq + + mov QWORD[rdi],r12 + mov QWORD[8+rdi],r13 + mov QWORD[16+rdi],r8 + mov QWORD[24+rdi],r9 + mov rax,QWORD[352+rsp] + lea rbx,[352+rsp] + mov r9,QWORD[((0+160))+rsp] + mov r10,QWORD[((8+160))+rsp] + lea rsi,[((0+160))+rsp] + mov r11,QWORD[((16+160))+rsp] + mov r12,QWORD[((24+160))+rsp] + lea rdi,[32+rsp] + call __ecp_nistz256_mul_montq + + mov rax,QWORD[96+rsp] + lea rbx,[96+rsp] + mov r9,QWORD[((0+64))+rsp] + mov r10,QWORD[((8+64))+rsp] + lea rsi,[((0+64))+rsp] + mov r11,QWORD[((16+64))+rsp] + mov r12,QWORD[((24+64))+rsp] + lea rdi,[64+rsp] + call __ecp_nistz256_mul_montq + + lea rbx,[32+rsp] + lea rdi,[256+rsp] + call __ecp_nistz256_sub_fromq + +DB 102,72,15,126,199 + + movdqa xmm0,xmm5 + movdqa xmm1,xmm5 + pandn xmm0,XMMWORD[288+rsp] + movdqa xmm2,xmm5 + pandn xmm1,XMMWORD[((288+16))+rsp] + movdqa xmm3,xmm5 + pand xmm2,XMMWORD[$L$ONE_mont] + pand xmm3,XMMWORD[(($L$ONE_mont+16))] + por xmm2,xmm0 + por xmm3,xmm1 + + movdqa xmm0,xmm4 + movdqa xmm1,xmm4 + pandn xmm0,xmm2 + movdqa xmm2,xmm4 + pandn xmm1,xmm3 + movdqa xmm3,xmm4 + pand xmm2,XMMWORD[384+rsp] + pand xmm3,XMMWORD[((384+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + movdqu XMMWORD[64+rdi],xmm2 + movdqu XMMWORD[80+rdi],xmm3 + + movdqa xmm0,xmm5 + movdqa xmm1,xmm5 + pandn xmm0,XMMWORD[224+rsp] + movdqa xmm2,xmm5 + pandn xmm1,XMMWORD[((224+16))+rsp] + movdqa xmm3,xmm5 + pand xmm2,XMMWORD[416+rsp] + pand xmm3,XMMWORD[((416+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + + movdqa xmm0,xmm4 + movdqa xmm1,xmm4 + pandn xmm0,xmm2 + movdqa xmm2,xmm4 + pandn xmm1,xmm3 + movdqa xmm3,xmm4 + pand xmm2,XMMWORD[320+rsp] + pand xmm3,XMMWORD[((320+16))+rsp] + por xmm2,xmm0 + por 
xmm3,xmm1 + movdqu XMMWORD[rdi],xmm2 + movdqu XMMWORD[16+rdi],xmm3 + + movdqa xmm0,xmm5 + movdqa xmm1,xmm5 + pandn xmm0,XMMWORD[256+rsp] + movdqa xmm2,xmm5 + pandn xmm1,XMMWORD[((256+16))+rsp] + movdqa xmm3,xmm5 + pand xmm2,XMMWORD[448+rsp] + pand xmm3,XMMWORD[((448+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + + movdqa xmm0,xmm4 + movdqa xmm1,xmm4 + pandn xmm0,xmm2 + movdqa xmm2,xmm4 + pandn xmm1,xmm3 + movdqa xmm3,xmm4 + pand xmm2,XMMWORD[352+rsp] + pand xmm3,XMMWORD[((352+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + movdqu XMMWORD[32+rdi],xmm2 + movdqu XMMWORD[48+rdi],xmm3 + + lea rsi,[((480+56))+rsp] + + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbx,QWORD[((-16))+rsi] + + mov rbp,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$add_affineq_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_ecp_nistz256_point_add_affine_nohw: + +ALIGN 32 +__ecp_nistz256_add_tox: + + xor r11,r11 + adc r12,QWORD[rbx] + adc r13,QWORD[8+rbx] + mov rax,r12 + adc r8,QWORD[16+rbx] + adc r9,QWORD[24+rbx] + mov rbp,r13 + adc r11,0 + + xor r10,r10 + sbb r12,-1 + mov rcx,r8 + sbb r13,r14 + sbb r8,0 + mov r10,r9 + sbb r9,r15 + sbb r11,0 + + cmovc r12,rax + cmovc r13,rbp + mov QWORD[rdi],r12 + cmovc r8,rcx + mov QWORD[8+rdi],r13 + cmovc r9,r10 + mov QWORD[16+rdi],r8 + mov QWORD[24+rdi],r9 + + ret + + + + +ALIGN 32 +__ecp_nistz256_sub_fromx: + + xor r11,r11 + sbb r12,QWORD[rbx] + sbb r13,QWORD[8+rbx] + mov rax,r12 + sbb r8,QWORD[16+rbx] + sbb r9,QWORD[24+rbx] + mov rbp,r13 + sbb r11,0 + + xor r10,r10 + adc r12,-1 + mov rcx,r8 + adc r13,r14 + adc r8,0 + mov r10,r9 + adc r9,r15 + + bt r11,0 + cmovnc r12,rax + cmovnc r13,rbp + mov QWORD[rdi],r12 + cmovnc r8,rcx + mov QWORD[8+rdi],r13 + cmovnc r9,r10 + mov QWORD[16+rdi],r8 + mov QWORD[24+rdi],r9 + + ret + + + + +ALIGN 32 +__ecp_nistz256_subx: + + xor r11,r11 + sbb rax,r12 + sbb rbp,r13 + mov r12,rax + sbb rcx,r8 + sbb r10,r9 + mov r13,rbp + sbb r11,0 + + xor r9,r9 + adc rax,-1 + mov r8,rcx + adc rbp,r14 + adc rcx,0 + mov r9,r10 + adc r10,r15 + + bt r11,0 + cmovc r12,rax + cmovc r13,rbp + cmovc r8,rcx + cmovc r9,r10 + + ret + + + + +ALIGN 32 +__ecp_nistz256_mul_by_2x: + + xor r11,r11 + adc r12,r12 + adc r13,r13 + mov rax,r12 + adc r8,r8 + adc r9,r9 + mov rbp,r13 + adc r11,0 + + xor r10,r10 + sbb r12,-1 + mov rcx,r8 + sbb r13,r14 + sbb r8,0 + mov r10,r9 + sbb r9,r15 + sbb r11,0 + + cmovc r12,rax + cmovc r13,rbp + mov QWORD[rdi],r12 + cmovc r8,rcx + mov QWORD[8+rdi],r13 + cmovc r9,r10 + mov QWORD[16+rdi],r8 + mov QWORD[24+rdi],r9 + + ret + + +global ecp_nistz256_point_double_adx + +ALIGN 32 +ecp_nistz256_point_double_adx: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ecp_nistz256_point_double_adx: + mov rdi,rcx + mov rsi,rdx + + + +_CET_ENDBR + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,32*5+8 + +$L$point_doublex_body: + +$L$point_double_shortcutx: + movdqu xmm0,XMMWORD[rsi] + mov rbx,rsi + movdqu xmm1,XMMWORD[16+rsi] + mov r12,QWORD[((32+0))+rsi] + mov r13,QWORD[((32+8))+rsi] + mov r8,QWORD[((32+16))+rsi] + mov r9,QWORD[((32+24))+rsi] + mov r14,QWORD[(($L$poly+8))] + mov r15,QWORD[(($L$poly+24))] + movdqa XMMWORD[96+rsp],xmm0 + movdqa XMMWORD[(96+16)+rsp],xmm1 + lea r10,[32+rdi] + lea r11,[64+rdi] +DB 102,72,15,110,199 +DB 102,73,15,110,202 +DB 102,73,15,110,211 + + lea rdi,[rsp] + call __ecp_nistz256_mul_by_2x + + mov rdx,QWORD[((64+0))+rsi] + mov 
r14,QWORD[((64+8))+rsi] + mov r15,QWORD[((64+16))+rsi] + mov r8,QWORD[((64+24))+rsi] + lea rsi,[((64-128))+rsi] + lea rdi,[64+rsp] + call __ecp_nistz256_sqr_montx + + mov rdx,QWORD[((0+0))+rsp] + mov r14,QWORD[((8+0))+rsp] + lea rsi,[((-128+0))+rsp] + mov r15,QWORD[((16+0))+rsp] + mov r8,QWORD[((24+0))+rsp] + lea rdi,[rsp] + call __ecp_nistz256_sqr_montx + + mov rdx,QWORD[32+rbx] + mov r9,QWORD[((64+0))+rbx] + mov r10,QWORD[((64+8))+rbx] + mov r11,QWORD[((64+16))+rbx] + mov r12,QWORD[((64+24))+rbx] + lea rsi,[((64-128))+rbx] + lea rbx,[32+rbx] +DB 102,72,15,126,215 + call __ecp_nistz256_mul_montx + call __ecp_nistz256_mul_by_2x + + mov r12,QWORD[((96+0))+rsp] + mov r13,QWORD[((96+8))+rsp] + lea rbx,[64+rsp] + mov r8,QWORD[((96+16))+rsp] + mov r9,QWORD[((96+24))+rsp] + lea rdi,[32+rsp] + call __ecp_nistz256_add_tox + + mov r12,QWORD[((96+0))+rsp] + mov r13,QWORD[((96+8))+rsp] + lea rbx,[64+rsp] + mov r8,QWORD[((96+16))+rsp] + mov r9,QWORD[((96+24))+rsp] + lea rdi,[64+rsp] + call __ecp_nistz256_sub_fromx + + mov rdx,QWORD[((0+0))+rsp] + mov r14,QWORD[((8+0))+rsp] + lea rsi,[((-128+0))+rsp] + mov r15,QWORD[((16+0))+rsp] + mov r8,QWORD[((24+0))+rsp] +DB 102,72,15,126,207 + call __ecp_nistz256_sqr_montx + xor r9,r9 + mov rax,r12 + add r12,-1 + mov r10,r13 + adc r13,rsi + mov rcx,r14 + adc r14,0 + mov r8,r15 + adc r15,rbp + adc r9,0 + xor rsi,rsi + test rax,1 + + cmovz r12,rax + cmovz r13,r10 + cmovz r14,rcx + cmovz r15,r8 + cmovz r9,rsi + + mov rax,r13 + shr r12,1 + shl rax,63 + mov r10,r14 + shr r13,1 + or r12,rax + shl r10,63 + mov rcx,r15 + shr r14,1 + or r13,r10 + shl rcx,63 + mov QWORD[rdi],r12 + shr r15,1 + mov QWORD[8+rdi],r13 + shl r9,63 + or r14,rcx + or r15,r9 + mov QWORD[16+rdi],r14 + mov QWORD[24+rdi],r15 + mov rdx,QWORD[64+rsp] + lea rbx,[64+rsp] + mov r9,QWORD[((0+32))+rsp] + mov r10,QWORD[((8+32))+rsp] + lea rsi,[((-128+32))+rsp] + mov r11,QWORD[((16+32))+rsp] + mov r12,QWORD[((24+32))+rsp] + lea rdi,[32+rsp] + call __ecp_nistz256_mul_montx + + lea rdi,[128+rsp] + call __ecp_nistz256_mul_by_2x + + lea rbx,[32+rsp] + lea rdi,[32+rsp] + call __ecp_nistz256_add_tox + + mov rdx,QWORD[96+rsp] + lea rbx,[96+rsp] + mov r9,QWORD[((0+0))+rsp] + mov r10,QWORD[((8+0))+rsp] + lea rsi,[((-128+0))+rsp] + mov r11,QWORD[((16+0))+rsp] + mov r12,QWORD[((24+0))+rsp] + lea rdi,[rsp] + call __ecp_nistz256_mul_montx + + lea rdi,[128+rsp] + call __ecp_nistz256_mul_by_2x + + mov rdx,QWORD[((0+32))+rsp] + mov r14,QWORD[((8+32))+rsp] + lea rsi,[((-128+32))+rsp] + mov r15,QWORD[((16+32))+rsp] + mov r8,QWORD[((24+32))+rsp] +DB 102,72,15,126,199 + call __ecp_nistz256_sqr_montx + + lea rbx,[128+rsp] + mov r8,r14 + mov r9,r15 + mov r14,rsi + mov r15,rbp + call __ecp_nistz256_sub_fromx + + mov rax,QWORD[((0+0))+rsp] + mov rbp,QWORD[((0+8))+rsp] + mov rcx,QWORD[((0+16))+rsp] + mov r10,QWORD[((0+24))+rsp] + lea rdi,[rsp] + call __ecp_nistz256_subx + + mov rdx,QWORD[32+rsp] + lea rbx,[32+rsp] + mov r14,r12 + xor ecx,ecx + mov QWORD[((0+0))+rsp],r12 + mov r10,r13 + mov QWORD[((0+8))+rsp],r13 + cmovz r11,r8 + mov QWORD[((0+16))+rsp],r8 + lea rsi,[((0-128))+rsp] + cmovz r12,r9 + mov QWORD[((0+24))+rsp],r9 + mov r9,r14 + lea rdi,[rsp] + call __ecp_nistz256_mul_montx + +DB 102,72,15,126,203 +DB 102,72,15,126,207 + call __ecp_nistz256_sub_fromx + + lea rsi,[((160+56))+rsp] + + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbx,QWORD[((-16))+rsi] + + mov rbp,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$point_doublex_epilogue: + mov 
rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_ecp_nistz256_point_double_adx: +global ecp_nistz256_point_add_adx + +ALIGN 32 +ecp_nistz256_point_add_adx: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ecp_nistz256_point_add_adx: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + +_CET_ENDBR + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,32*18+8 + +$L$point_addx_body: + + movdqu xmm0,XMMWORD[rsi] + movdqu xmm1,XMMWORD[16+rsi] + movdqu xmm2,XMMWORD[32+rsi] + movdqu xmm3,XMMWORD[48+rsi] + movdqu xmm4,XMMWORD[64+rsi] + movdqu xmm5,XMMWORD[80+rsi] + mov rbx,rsi + mov rsi,rdx + movdqa XMMWORD[384+rsp],xmm0 + movdqa XMMWORD[(384+16)+rsp],xmm1 + movdqa XMMWORD[416+rsp],xmm2 + movdqa XMMWORD[(416+16)+rsp],xmm3 + movdqa XMMWORD[448+rsp],xmm4 + movdqa XMMWORD[(448+16)+rsp],xmm5 + por xmm5,xmm4 + + movdqu xmm0,XMMWORD[rsi] + pshufd xmm3,xmm5,0xb1 + movdqu xmm1,XMMWORD[16+rsi] + movdqu xmm2,XMMWORD[32+rsi] + por xmm5,xmm3 + movdqu xmm3,XMMWORD[48+rsi] + mov rdx,QWORD[((64+0))+rsi] + mov r14,QWORD[((64+8))+rsi] + mov r15,QWORD[((64+16))+rsi] + mov r8,QWORD[((64+24))+rsi] + movdqa XMMWORD[480+rsp],xmm0 + pshufd xmm4,xmm5,0x1e + movdqa XMMWORD[(480+16)+rsp],xmm1 + movdqu xmm0,XMMWORD[64+rsi] + movdqu xmm1,XMMWORD[80+rsi] + movdqa XMMWORD[512+rsp],xmm2 + movdqa XMMWORD[(512+16)+rsp],xmm3 + por xmm5,xmm4 + pxor xmm4,xmm4 + por xmm1,xmm0 +DB 102,72,15,110,199 + + lea rsi,[((64-128))+rsi] + mov QWORD[((544+0))+rsp],rdx + mov QWORD[((544+8))+rsp],r14 + mov QWORD[((544+16))+rsp],r15 + mov QWORD[((544+24))+rsp],r8 + lea rdi,[96+rsp] + call __ecp_nistz256_sqr_montx + + pcmpeqd xmm5,xmm4 + pshufd xmm4,xmm1,0xb1 + por xmm4,xmm1 + pshufd xmm5,xmm5,0 + pshufd xmm3,xmm4,0x1e + por xmm4,xmm3 + pxor xmm3,xmm3 + pcmpeqd xmm4,xmm3 + pshufd xmm4,xmm4,0 + mov rdx,QWORD[((64+0))+rbx] + mov r14,QWORD[((64+8))+rbx] + mov r15,QWORD[((64+16))+rbx] + mov r8,QWORD[((64+24))+rbx] +DB 102,72,15,110,203 + + lea rsi,[((64-128))+rbx] + lea rdi,[32+rsp] + call __ecp_nistz256_sqr_montx + + mov rdx,QWORD[544+rsp] + lea rbx,[544+rsp] + mov r9,QWORD[((0+96))+rsp] + mov r10,QWORD[((8+96))+rsp] + lea rsi,[((-128+96))+rsp] + mov r11,QWORD[((16+96))+rsp] + mov r12,QWORD[((24+96))+rsp] + lea rdi,[224+rsp] + call __ecp_nistz256_mul_montx + + mov rdx,QWORD[448+rsp] + lea rbx,[448+rsp] + mov r9,QWORD[((0+32))+rsp] + mov r10,QWORD[((8+32))+rsp] + lea rsi,[((-128+32))+rsp] + mov r11,QWORD[((16+32))+rsp] + mov r12,QWORD[((24+32))+rsp] + lea rdi,[256+rsp] + call __ecp_nistz256_mul_montx + + mov rdx,QWORD[416+rsp] + lea rbx,[416+rsp] + mov r9,QWORD[((0+224))+rsp] + mov r10,QWORD[((8+224))+rsp] + lea rsi,[((-128+224))+rsp] + mov r11,QWORD[((16+224))+rsp] + mov r12,QWORD[((24+224))+rsp] + lea rdi,[224+rsp] + call __ecp_nistz256_mul_montx + + mov rdx,QWORD[512+rsp] + lea rbx,[512+rsp] + mov r9,QWORD[((0+256))+rsp] + mov r10,QWORD[((8+256))+rsp] + lea rsi,[((-128+256))+rsp] + mov r11,QWORD[((16+256))+rsp] + mov r12,QWORD[((24+256))+rsp] + lea rdi,[256+rsp] + call __ecp_nistz256_mul_montx + + lea rbx,[224+rsp] + lea rdi,[64+rsp] + call __ecp_nistz256_sub_fromx + + or r12,r13 + movdqa xmm2,xmm4 + or r12,r8 + or r12,r9 + por xmm2,xmm5 +DB 102,73,15,110,220 + + mov rdx,QWORD[384+rsp] + lea rbx,[384+rsp] + mov r9,QWORD[((0+96))+rsp] + mov r10,QWORD[((8+96))+rsp] + lea rsi,[((-128+96))+rsp] + mov r11,QWORD[((16+96))+rsp] + mov r12,QWORD[((24+96))+rsp] + lea rdi,[160+rsp] + call __ecp_nistz256_mul_montx + + mov rdx,QWORD[480+rsp] + lea rbx,[480+rsp] + mov 
r9,QWORD[((0+32))+rsp] + mov r10,QWORD[((8+32))+rsp] + lea rsi,[((-128+32))+rsp] + mov r11,QWORD[((16+32))+rsp] + mov r12,QWORD[((24+32))+rsp] + lea rdi,[192+rsp] + call __ecp_nistz256_mul_montx + + lea rbx,[160+rsp] + lea rdi,[rsp] + call __ecp_nistz256_sub_fromx + + or r12,r13 + or r12,r8 + or r12,r9 + +DB 102,73,15,126,208 +DB 102,73,15,126,217 + or r12,r8 + DB 0x3e + jnz NEAR $L$add_proceedx + + + + test r9,r9 + jz NEAR $L$add_doublex + + + + + + +DB 102,72,15,126,199 + pxor xmm0,xmm0 + movdqu XMMWORD[rdi],xmm0 + movdqu XMMWORD[16+rdi],xmm0 + movdqu XMMWORD[32+rdi],xmm0 + movdqu XMMWORD[48+rdi],xmm0 + movdqu XMMWORD[64+rdi],xmm0 + movdqu XMMWORD[80+rdi],xmm0 + jmp NEAR $L$add_donex + +ALIGN 32 +$L$add_doublex: +DB 102,72,15,126,206 +DB 102,72,15,126,199 + add rsp,416 + + jmp NEAR $L$point_double_shortcutx + + +ALIGN 32 +$L$add_proceedx: + mov rdx,QWORD[((0+64))+rsp] + mov r14,QWORD[((8+64))+rsp] + lea rsi,[((-128+64))+rsp] + mov r15,QWORD[((16+64))+rsp] + mov r8,QWORD[((24+64))+rsp] + lea rdi,[96+rsp] + call __ecp_nistz256_sqr_montx + + mov rdx,QWORD[448+rsp] + lea rbx,[448+rsp] + mov r9,QWORD[((0+0))+rsp] + mov r10,QWORD[((8+0))+rsp] + lea rsi,[((-128+0))+rsp] + mov r11,QWORD[((16+0))+rsp] + mov r12,QWORD[((24+0))+rsp] + lea rdi,[352+rsp] + call __ecp_nistz256_mul_montx + + mov rdx,QWORD[((0+0))+rsp] + mov r14,QWORD[((8+0))+rsp] + lea rsi,[((-128+0))+rsp] + mov r15,QWORD[((16+0))+rsp] + mov r8,QWORD[((24+0))+rsp] + lea rdi,[32+rsp] + call __ecp_nistz256_sqr_montx + + mov rdx,QWORD[544+rsp] + lea rbx,[544+rsp] + mov r9,QWORD[((0+352))+rsp] + mov r10,QWORD[((8+352))+rsp] + lea rsi,[((-128+352))+rsp] + mov r11,QWORD[((16+352))+rsp] + mov r12,QWORD[((24+352))+rsp] + lea rdi,[352+rsp] + call __ecp_nistz256_mul_montx + + mov rdx,QWORD[rsp] + lea rbx,[rsp] + mov r9,QWORD[((0+32))+rsp] + mov r10,QWORD[((8+32))+rsp] + lea rsi,[((-128+32))+rsp] + mov r11,QWORD[((16+32))+rsp] + mov r12,QWORD[((24+32))+rsp] + lea rdi,[128+rsp] + call __ecp_nistz256_mul_montx + + mov rdx,QWORD[160+rsp] + lea rbx,[160+rsp] + mov r9,QWORD[((0+32))+rsp] + mov r10,QWORD[((8+32))+rsp] + lea rsi,[((-128+32))+rsp] + mov r11,QWORD[((16+32))+rsp] + mov r12,QWORD[((24+32))+rsp] + lea rdi,[192+rsp] + call __ecp_nistz256_mul_montx + + + + + xor r11,r11 + add r12,r12 + lea rsi,[96+rsp] + adc r13,r13 + mov rax,r12 + adc r8,r8 + adc r9,r9 + mov rbp,r13 + adc r11,0 + + sub r12,-1 + mov rcx,r8 + sbb r13,r14 + sbb r8,0 + mov r10,r9 + sbb r9,r15 + sbb r11,0 + + cmovc r12,rax + mov rax,QWORD[rsi] + cmovc r13,rbp + mov rbp,QWORD[8+rsi] + cmovc r8,rcx + mov rcx,QWORD[16+rsi] + cmovc r9,r10 + mov r10,QWORD[24+rsi] + + call __ecp_nistz256_subx + + lea rbx,[128+rsp] + lea rdi,[288+rsp] + call __ecp_nistz256_sub_fromx + + mov rax,QWORD[((192+0))+rsp] + mov rbp,QWORD[((192+8))+rsp] + mov rcx,QWORD[((192+16))+rsp] + mov r10,QWORD[((192+24))+rsp] + lea rdi,[320+rsp] + + call __ecp_nistz256_subx + + mov QWORD[rdi],r12 + mov QWORD[8+rdi],r13 + mov QWORD[16+rdi],r8 + mov QWORD[24+rdi],r9 + mov rdx,QWORD[128+rsp] + lea rbx,[128+rsp] + mov r9,QWORD[((0+224))+rsp] + mov r10,QWORD[((8+224))+rsp] + lea rsi,[((-128+224))+rsp] + mov r11,QWORD[((16+224))+rsp] + mov r12,QWORD[((24+224))+rsp] + lea rdi,[256+rsp] + call __ecp_nistz256_mul_montx + + mov rdx,QWORD[320+rsp] + lea rbx,[320+rsp] + mov r9,QWORD[((0+64))+rsp] + mov r10,QWORD[((8+64))+rsp] + lea rsi,[((-128+64))+rsp] + mov r11,QWORD[((16+64))+rsp] + mov r12,QWORD[((24+64))+rsp] + lea rdi,[320+rsp] + call __ecp_nistz256_mul_montx + + lea rbx,[256+rsp] + lea rdi,[320+rsp] + call 
__ecp_nistz256_sub_fromx + +DB 102,72,15,126,199 + + movdqa xmm0,xmm5 + movdqa xmm1,xmm5 + pandn xmm0,XMMWORD[352+rsp] + movdqa xmm2,xmm5 + pandn xmm1,XMMWORD[((352+16))+rsp] + movdqa xmm3,xmm5 + pand xmm2,XMMWORD[544+rsp] + pand xmm3,XMMWORD[((544+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + + movdqa xmm0,xmm4 + movdqa xmm1,xmm4 + pandn xmm0,xmm2 + movdqa xmm2,xmm4 + pandn xmm1,xmm3 + movdqa xmm3,xmm4 + pand xmm2,XMMWORD[448+rsp] + pand xmm3,XMMWORD[((448+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + movdqu XMMWORD[64+rdi],xmm2 + movdqu XMMWORD[80+rdi],xmm3 + + movdqa xmm0,xmm5 + movdqa xmm1,xmm5 + pandn xmm0,XMMWORD[288+rsp] + movdqa xmm2,xmm5 + pandn xmm1,XMMWORD[((288+16))+rsp] + movdqa xmm3,xmm5 + pand xmm2,XMMWORD[480+rsp] + pand xmm3,XMMWORD[((480+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + + movdqa xmm0,xmm4 + movdqa xmm1,xmm4 + pandn xmm0,xmm2 + movdqa xmm2,xmm4 + pandn xmm1,xmm3 + movdqa xmm3,xmm4 + pand xmm2,XMMWORD[384+rsp] + pand xmm3,XMMWORD[((384+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + movdqu XMMWORD[rdi],xmm2 + movdqu XMMWORD[16+rdi],xmm3 + + movdqa xmm0,xmm5 + movdqa xmm1,xmm5 + pandn xmm0,XMMWORD[320+rsp] + movdqa xmm2,xmm5 + pandn xmm1,XMMWORD[((320+16))+rsp] + movdqa xmm3,xmm5 + pand xmm2,XMMWORD[512+rsp] + pand xmm3,XMMWORD[((512+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + + movdqa xmm0,xmm4 + movdqa xmm1,xmm4 + pandn xmm0,xmm2 + movdqa xmm2,xmm4 + pandn xmm1,xmm3 + movdqa xmm3,xmm4 + pand xmm2,XMMWORD[416+rsp] + pand xmm3,XMMWORD[((416+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + movdqu XMMWORD[32+rdi],xmm2 + movdqu XMMWORD[48+rdi],xmm3 + +$L$add_donex: + lea rsi,[((576+56))+rsp] + + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbx,QWORD[((-16))+rsi] + + mov rbp,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$point_addx_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_ecp_nistz256_point_add_adx: +global ecp_nistz256_point_add_affine_adx + +ALIGN 32 +ecp_nistz256_point_add_affine_adx: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ecp_nistz256_point_add_affine_adx: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + +_CET_ENDBR + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,32*15+8 + +$L$add_affinex_body: + + movdqu xmm0,XMMWORD[rsi] + mov rbx,rdx + movdqu xmm1,XMMWORD[16+rsi] + movdqu xmm2,XMMWORD[32+rsi] + movdqu xmm3,XMMWORD[48+rsi] + movdqu xmm4,XMMWORD[64+rsi] + movdqu xmm5,XMMWORD[80+rsi] + mov rdx,QWORD[((64+0))+rsi] + mov r14,QWORD[((64+8))+rsi] + mov r15,QWORD[((64+16))+rsi] + mov r8,QWORD[((64+24))+rsi] + movdqa XMMWORD[320+rsp],xmm0 + movdqa XMMWORD[(320+16)+rsp],xmm1 + movdqa XMMWORD[352+rsp],xmm2 + movdqa XMMWORD[(352+16)+rsp],xmm3 + movdqa XMMWORD[384+rsp],xmm4 + movdqa XMMWORD[(384+16)+rsp],xmm5 + por xmm5,xmm4 + + movdqu xmm0,XMMWORD[rbx] + pshufd xmm3,xmm5,0xb1 + movdqu xmm1,XMMWORD[16+rbx] + movdqu xmm2,XMMWORD[32+rbx] + por xmm5,xmm3 + movdqu xmm3,XMMWORD[48+rbx] + movdqa XMMWORD[416+rsp],xmm0 + pshufd xmm4,xmm5,0x1e + movdqa XMMWORD[(416+16)+rsp],xmm1 + por xmm1,xmm0 +DB 102,72,15,110,199 + movdqa XMMWORD[448+rsp],xmm2 + movdqa XMMWORD[(448+16)+rsp],xmm3 + por xmm3,xmm2 + por xmm5,xmm4 + pxor xmm4,xmm4 + por xmm3,xmm1 + + lea rsi,[((64-128))+rsi] + lea rdi,[32+rsp] + call __ecp_nistz256_sqr_montx + + pcmpeqd xmm5,xmm4 + pshufd xmm4,xmm3,0xb1 + mov rdx,QWORD[rbx] + + mov r9,r12 + por xmm4,xmm3 + pshufd xmm5,xmm5,0 + pshufd xmm3,xmm4,0x1e + mov r10,r13 + 
por xmm4,xmm3 + pxor xmm3,xmm3 + mov r11,r14 + pcmpeqd xmm4,xmm3 + pshufd xmm4,xmm4,0 + + lea rsi,[((32-128))+rsp] + mov r12,r15 + lea rdi,[rsp] + call __ecp_nistz256_mul_montx + + lea rbx,[320+rsp] + lea rdi,[64+rsp] + call __ecp_nistz256_sub_fromx + + mov rdx,QWORD[384+rsp] + lea rbx,[384+rsp] + mov r9,QWORD[((0+32))+rsp] + mov r10,QWORD[((8+32))+rsp] + lea rsi,[((-128+32))+rsp] + mov r11,QWORD[((16+32))+rsp] + mov r12,QWORD[((24+32))+rsp] + lea rdi,[32+rsp] + call __ecp_nistz256_mul_montx + + mov rdx,QWORD[384+rsp] + lea rbx,[384+rsp] + mov r9,QWORD[((0+64))+rsp] + mov r10,QWORD[((8+64))+rsp] + lea rsi,[((-128+64))+rsp] + mov r11,QWORD[((16+64))+rsp] + mov r12,QWORD[((24+64))+rsp] + lea rdi,[288+rsp] + call __ecp_nistz256_mul_montx + + mov rdx,QWORD[448+rsp] + lea rbx,[448+rsp] + mov r9,QWORD[((0+32))+rsp] + mov r10,QWORD[((8+32))+rsp] + lea rsi,[((-128+32))+rsp] + mov r11,QWORD[((16+32))+rsp] + mov r12,QWORD[((24+32))+rsp] + lea rdi,[32+rsp] + call __ecp_nistz256_mul_montx + + lea rbx,[352+rsp] + lea rdi,[96+rsp] + call __ecp_nistz256_sub_fromx + + mov rdx,QWORD[((0+64))+rsp] + mov r14,QWORD[((8+64))+rsp] + lea rsi,[((-128+64))+rsp] + mov r15,QWORD[((16+64))+rsp] + mov r8,QWORD[((24+64))+rsp] + lea rdi,[128+rsp] + call __ecp_nistz256_sqr_montx + + mov rdx,QWORD[((0+96))+rsp] + mov r14,QWORD[((8+96))+rsp] + lea rsi,[((-128+96))+rsp] + mov r15,QWORD[((16+96))+rsp] + mov r8,QWORD[((24+96))+rsp] + lea rdi,[192+rsp] + call __ecp_nistz256_sqr_montx + + mov rdx,QWORD[128+rsp] + lea rbx,[128+rsp] + mov r9,QWORD[((0+64))+rsp] + mov r10,QWORD[((8+64))+rsp] + lea rsi,[((-128+64))+rsp] + mov r11,QWORD[((16+64))+rsp] + mov r12,QWORD[((24+64))+rsp] + lea rdi,[160+rsp] + call __ecp_nistz256_mul_montx + + mov rdx,QWORD[320+rsp] + lea rbx,[320+rsp] + mov r9,QWORD[((0+128))+rsp] + mov r10,QWORD[((8+128))+rsp] + lea rsi,[((-128+128))+rsp] + mov r11,QWORD[((16+128))+rsp] + mov r12,QWORD[((24+128))+rsp] + lea rdi,[rsp] + call __ecp_nistz256_mul_montx + + + + + xor r11,r11 + add r12,r12 + lea rsi,[192+rsp] + adc r13,r13 + mov rax,r12 + adc r8,r8 + adc r9,r9 + mov rbp,r13 + adc r11,0 + + sub r12,-1 + mov rcx,r8 + sbb r13,r14 + sbb r8,0 + mov r10,r9 + sbb r9,r15 + sbb r11,0 + + cmovc r12,rax + mov rax,QWORD[rsi] + cmovc r13,rbp + mov rbp,QWORD[8+rsi] + cmovc r8,rcx + mov rcx,QWORD[16+rsi] + cmovc r9,r10 + mov r10,QWORD[24+rsi] + + call __ecp_nistz256_subx + + lea rbx,[160+rsp] + lea rdi,[224+rsp] + call __ecp_nistz256_sub_fromx + + mov rax,QWORD[((0+0))+rsp] + mov rbp,QWORD[((0+8))+rsp] + mov rcx,QWORD[((0+16))+rsp] + mov r10,QWORD[((0+24))+rsp] + lea rdi,[64+rsp] + + call __ecp_nistz256_subx + + mov QWORD[rdi],r12 + mov QWORD[8+rdi],r13 + mov QWORD[16+rdi],r8 + mov QWORD[24+rdi],r9 + mov rdx,QWORD[352+rsp] + lea rbx,[352+rsp] + mov r9,QWORD[((0+160))+rsp] + mov r10,QWORD[((8+160))+rsp] + lea rsi,[((-128+160))+rsp] + mov r11,QWORD[((16+160))+rsp] + mov r12,QWORD[((24+160))+rsp] + lea rdi,[32+rsp] + call __ecp_nistz256_mul_montx + + mov rdx,QWORD[96+rsp] + lea rbx,[96+rsp] + mov r9,QWORD[((0+64))+rsp] + mov r10,QWORD[((8+64))+rsp] + lea rsi,[((-128+64))+rsp] + mov r11,QWORD[((16+64))+rsp] + mov r12,QWORD[((24+64))+rsp] + lea rdi,[64+rsp] + call __ecp_nistz256_mul_montx + + lea rbx,[32+rsp] + lea rdi,[256+rsp] + call __ecp_nistz256_sub_fromx + +DB 102,72,15,126,199 + + movdqa xmm0,xmm5 + movdqa xmm1,xmm5 + pandn xmm0,XMMWORD[288+rsp] + movdqa xmm2,xmm5 + pandn xmm1,XMMWORD[((288+16))+rsp] + movdqa xmm3,xmm5 + pand xmm2,XMMWORD[$L$ONE_mont] + pand xmm3,XMMWORD[(($L$ONE_mont+16))] + por xmm2,xmm0 + por xmm3,xmm1 
+ + movdqa xmm0,xmm4 + movdqa xmm1,xmm4 + pandn xmm0,xmm2 + movdqa xmm2,xmm4 + pandn xmm1,xmm3 + movdqa xmm3,xmm4 + pand xmm2,XMMWORD[384+rsp] + pand xmm3,XMMWORD[((384+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + movdqu XMMWORD[64+rdi],xmm2 + movdqu XMMWORD[80+rdi],xmm3 + + movdqa xmm0,xmm5 + movdqa xmm1,xmm5 + pandn xmm0,XMMWORD[224+rsp] + movdqa xmm2,xmm5 + pandn xmm1,XMMWORD[((224+16))+rsp] + movdqa xmm3,xmm5 + pand xmm2,XMMWORD[416+rsp] + pand xmm3,XMMWORD[((416+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + + movdqa xmm0,xmm4 + movdqa xmm1,xmm4 + pandn xmm0,xmm2 + movdqa xmm2,xmm4 + pandn xmm1,xmm3 + movdqa xmm3,xmm4 + pand xmm2,XMMWORD[320+rsp] + pand xmm3,XMMWORD[((320+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + movdqu XMMWORD[rdi],xmm2 + movdqu XMMWORD[16+rdi],xmm3 + + movdqa xmm0,xmm5 + movdqa xmm1,xmm5 + pandn xmm0,XMMWORD[256+rsp] + movdqa xmm2,xmm5 + pandn xmm1,XMMWORD[((256+16))+rsp] + movdqa xmm3,xmm5 + pand xmm2,XMMWORD[448+rsp] + pand xmm3,XMMWORD[((448+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + + movdqa xmm0,xmm4 + movdqa xmm1,xmm4 + pandn xmm0,xmm2 + movdqa xmm2,xmm4 + pandn xmm1,xmm3 + movdqa xmm3,xmm4 + pand xmm2,XMMWORD[352+rsp] + pand xmm3,XMMWORD[((352+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + movdqu XMMWORD[32+rdi],xmm2 + movdqu XMMWORD[48+rdi],xmm3 + + lea rsi,[((480+56))+rsp] + + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbx,QWORD[((-16))+rsi] + + mov rbp,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$add_affinex_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_ecp_nistz256_point_add_affine_adx: +EXTERN __imp_RtlVirtualUnwind + + +ALIGN 16 +short_handler: + push rsi + push rdi + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + pushfq + sub rsp,64 + + mov rax,QWORD[120+r8] + mov rbx,QWORD[248+r8] + + mov rsi,QWORD[8+r9] + mov r11,QWORD[56+r9] + + mov r10d,DWORD[r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jb NEAR $L$common_seh_tail + + mov rax,QWORD[152+r8] + + mov r10d,DWORD[4+r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jae NEAR $L$common_seh_tail + + lea rax,[16+rax] + + mov r12,QWORD[((-8))+rax] + mov r13,QWORD[((-16))+rax] + mov QWORD[216+r8],r12 + mov QWORD[224+r8],r13 + + jmp NEAR $L$common_seh_tail + + + +ALIGN 16 +full_handler: + push rsi + push rdi + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + pushfq + sub rsp,64 + + mov rax,QWORD[120+r8] + mov rbx,QWORD[248+r8] + + mov rsi,QWORD[8+r9] + mov r11,QWORD[56+r9] + + mov r10d,DWORD[r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jb NEAR $L$common_seh_tail + + mov rax,QWORD[152+r8] + + mov r10d,DWORD[4+r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jae NEAR $L$common_seh_tail + + mov r10d,DWORD[8+r11] + lea rax,[r10*1+rax] + + mov rbp,QWORD[((-8))+rax] + mov rbx,QWORD[((-16))+rax] + mov r12,QWORD[((-24))+rax] + mov r13,QWORD[((-32))+rax] + mov r14,QWORD[((-40))+rax] + mov r15,QWORD[((-48))+rax] + mov QWORD[144+r8],rbx + mov QWORD[160+r8],rbp + mov QWORD[216+r8],r12 + mov QWORD[224+r8],r13 + mov QWORD[232+r8],r14 + mov QWORD[240+r8],r15 + +$L$common_seh_tail: + mov rdi,QWORD[8+rax] + mov rsi,QWORD[16+rax] + mov QWORD[152+r8],rax + mov QWORD[168+r8],rsi + mov QWORD[176+r8],rdi + + mov rdi,QWORD[40+r9] + mov rsi,r8 + mov ecx,154 + DD 0xa548f3fc + + mov rsi,r9 + xor rcx,rcx + mov rdx,QWORD[8+rsi] + mov r8,QWORD[rsi] + mov r9,QWORD[16+rsi] + mov r10,QWORD[40+rsi] + lea r11,[56+rsi] + lea r12,[24+rsi] + mov QWORD[32+rsp],r10 + mov QWORD[40+rsp],r11 + mov 
QWORD[48+rsp],r12 + mov QWORD[56+rsp],rcx + call QWORD[__imp_RtlVirtualUnwind] + + mov eax,1 + add rsp,64 + popfq + pop r15 + pop r14 + pop r13 + pop r12 + pop rbp + pop rbx + pop rdi + pop rsi + ret + + +section .pdata rdata align=4 +ALIGN 4 + DD $L$SEH_begin_ecp_nistz256_neg wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_neg wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_neg wrt ..imagebase + + DD $L$SEH_begin_ecp_nistz256_ord_mul_mont_nohw wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_ord_mul_mont_nohw wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_ord_mul_mont_nohw wrt ..imagebase + + DD $L$SEH_begin_ecp_nistz256_ord_sqr_mont_nohw wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_ord_sqr_mont_nohw wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_ord_sqr_mont_nohw wrt ..imagebase + DD $L$SEH_begin_ecp_nistz256_ord_mul_mont_adx wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_ord_mul_mont_adx wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_ord_mul_mont_adx wrt ..imagebase + + DD $L$SEH_begin_ecp_nistz256_ord_sqr_mont_adx wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_ord_sqr_mont_adx wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_ord_sqr_mont_adx wrt ..imagebase + DD $L$SEH_begin_ecp_nistz256_mul_mont_nohw wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_mul_mont_nohw wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_mul_mont_nohw wrt ..imagebase + + DD $L$SEH_begin_ecp_nistz256_sqr_mont_nohw wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_sqr_mont_nohw wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_sqr_mont_nohw wrt ..imagebase + DD $L$SEH_begin_ecp_nistz256_mul_mont_adx wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_mul_mont_adx wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_mul_mont_adx wrt ..imagebase + + DD $L$SEH_begin_ecp_nistz256_sqr_mont_adx wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_sqr_mont_adx wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_sqr_mont_adx wrt ..imagebase + DD $L$SEH_begin_ecp_nistz256_select_w5_nohw wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_select_w5_nohw wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_select_wX_nohw wrt ..imagebase + + DD $L$SEH_begin_ecp_nistz256_select_w7_nohw wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_select_w7_nohw wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_select_wX_nohw wrt ..imagebase + DD $L$SEH_begin_ecp_nistz256_select_w5_avx2 wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_select_w5_avx2 wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_select_wX_avx2 wrt ..imagebase + + DD $L$SEH_begin_ecp_nistz256_select_w7_avx2 wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_select_w7_avx2 wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_select_wX_avx2 wrt ..imagebase + DD $L$SEH_begin_ecp_nistz256_point_double_nohw wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_point_double_nohw wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_point_double_nohw wrt ..imagebase + + DD $L$SEH_begin_ecp_nistz256_point_add_nohw wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_point_add_nohw wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_point_add_nohw wrt ..imagebase + + DD $L$SEH_begin_ecp_nistz256_point_add_affine_nohw wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_point_add_affine_nohw wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_point_add_affine_nohw wrt ..imagebase + DD $L$SEH_begin_ecp_nistz256_point_double_adx wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_point_double_adx wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_point_double_adx wrt ..imagebase + + DD $L$SEH_begin_ecp_nistz256_point_add_adx wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_point_add_adx wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_point_add_adx 
wrt ..imagebase + + DD $L$SEH_begin_ecp_nistz256_point_add_affine_adx wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_point_add_affine_adx wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_point_add_affine_adx wrt ..imagebase + +section .xdata rdata align=8 +ALIGN 8 +$L$SEH_info_ecp_nistz256_neg: + DB 9,0,0,0 + DD short_handler wrt ..imagebase + DD $L$neg_body wrt ..imagebase,$L$neg_epilogue wrt ..imagebase +$L$SEH_info_ecp_nistz256_ord_mul_mont_nohw: + DB 9,0,0,0 + DD full_handler wrt ..imagebase + DD $L$ord_mul_body wrt ..imagebase,$L$ord_mul_epilogue wrt ..imagebase + DD 48,0 +$L$SEH_info_ecp_nistz256_ord_sqr_mont_nohw: + DB 9,0,0,0 + DD full_handler wrt ..imagebase + DD $L$ord_sqr_body wrt ..imagebase,$L$ord_sqr_epilogue wrt ..imagebase + DD 48,0 +$L$SEH_info_ecp_nistz256_ord_mul_mont_adx: + DB 9,0,0,0 + DD full_handler wrt ..imagebase + DD $L$ord_mulx_body wrt ..imagebase,$L$ord_mulx_epilogue wrt ..imagebase + DD 48,0 +$L$SEH_info_ecp_nistz256_ord_sqr_mont_adx: + DB 9,0,0,0 + DD full_handler wrt ..imagebase + DD $L$ord_sqrx_body wrt ..imagebase,$L$ord_sqrx_epilogue wrt ..imagebase + DD 48,0 +$L$SEH_info_ecp_nistz256_mul_mont_nohw: + DB 9,0,0,0 + DD full_handler wrt ..imagebase + DD $L$mul_body wrt ..imagebase,$L$mul_epilogue wrt ..imagebase + DD 48,0 +$L$SEH_info_ecp_nistz256_sqr_mont_nohw: + DB 9,0,0,0 + DD full_handler wrt ..imagebase + DD $L$sqr_body wrt ..imagebase,$L$sqr_epilogue wrt ..imagebase + DD 48,0 +$L$SEH_info_ecp_nistz256_mul_mont_adx: + DB 9,0,0,0 + DD full_handler wrt ..imagebase + DD $L$mulx_body wrt ..imagebase,$L$mulx_epilogue wrt ..imagebase + DD 48,0 +$L$SEH_info_ecp_nistz256_sqr_mont_adx: + DB 9,0,0,0 + DD full_handler wrt ..imagebase + DD $L$sqrx_body wrt ..imagebase,$L$sqrx_epilogue wrt ..imagebase + DD 48,0 +$L$SEH_info_ecp_nistz256_select_wX_nohw: + DB 0x01,0x33,0x16,0x00 + DB 0x33,0xf8,0x09,0x00 + DB 0x2e,0xe8,0x08,0x00 + DB 0x29,0xd8,0x07,0x00 + DB 0x24,0xc8,0x06,0x00 + DB 0x1f,0xb8,0x05,0x00 + DB 0x1a,0xa8,0x04,0x00 + DB 0x15,0x98,0x03,0x00 + DB 0x10,0x88,0x02,0x00 + DB 0x0c,0x78,0x01,0x00 + DB 0x08,0x68,0x00,0x00 + DB 0x04,0x01,0x15,0x00 +ALIGN 8 +$L$SEH_info_ecp_nistz256_select_wX_avx2: + DB 0x01,0x36,0x17,0x0b + DB 0x36,0xf8,0x09,0x00 + DB 0x31,0xe8,0x08,0x00 + DB 0x2c,0xd8,0x07,0x00 + DB 0x27,0xc8,0x06,0x00 + DB 0x22,0xb8,0x05,0x00 + DB 0x1d,0xa8,0x04,0x00 + DB 0x18,0x98,0x03,0x00 + DB 0x13,0x88,0x02,0x00 + DB 0x0e,0x78,0x01,0x00 + DB 0x09,0x68,0x00,0x00 + DB 0x04,0x01,0x15,0x00 + DB 0x00,0xb3,0x00,0x00 +ALIGN 8 +$L$SEH_info_ecp_nistz256_point_double_nohw: + DB 9,0,0,0 + DD full_handler wrt ..imagebase + DD $L$point_doubleq_body wrt ..imagebase,$L$point_doubleq_epilogue wrt ..imagebase + DD 32*5+56,0 +$L$SEH_info_ecp_nistz256_point_add_nohw: + DB 9,0,0,0 + DD full_handler wrt ..imagebase + DD $L$point_addq_body wrt ..imagebase,$L$point_addq_epilogue wrt ..imagebase + DD 32*18+56,0 +$L$SEH_info_ecp_nistz256_point_add_affine_nohw: + DB 9,0,0,0 + DD full_handler wrt ..imagebase + DD $L$add_affineq_body wrt ..imagebase,$L$add_affineq_epilogue wrt ..imagebase + DD 32*15+56,0 +ALIGN 8 +$L$SEH_info_ecp_nistz256_point_double_adx: + DB 9,0,0,0 + DD full_handler wrt ..imagebase + DD $L$point_doublex_body wrt ..imagebase,$L$point_doublex_epilogue wrt ..imagebase + DD 32*5+56,0 +$L$SEH_info_ecp_nistz256_point_add_adx: + DB 9,0,0,0 + DD full_handler wrt ..imagebase + DD $L$point_addx_body wrt ..imagebase,$L$point_addx_epilogue wrt ..imagebase + DD 32*18+56,0 +$L$SEH_info_ecp_nistz256_point_add_affine_adx: + DB 9,0,0,0 + DD full_handler wrt ..imagebase + DD 
$L$add_affinex_body wrt ..imagebase,$L$add_affinex_epilogue wrt ..imagebase + DD 32*15+56,0 +%else +; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 +ret +%endif diff --git a/ring-0.17.14/pregenerated/p256-x86_64-asm-nasm.o b/ring-0.17.14/pregenerated/p256-x86_64-asm-nasm.o new file mode 100644 index 0000000000..7c7647eb5e Binary files /dev/null and b/ring-0.17.14/pregenerated/p256-x86_64-asm-nasm.o differ diff --git a/ring-0.17.14/pregenerated/ring_core_generated/prefix_symbols.h b/ring-0.17.14/pregenerated/ring_core_generated/prefix_symbols.h new file mode 100644 index 0000000000..da4b6fad47 --- /dev/null +++ b/ring-0.17.14/pregenerated/ring_core_generated/prefix_symbols.h @@ -0,0 +1,168 @@ + +#ifndef ring_core_generated_PREFIX_SYMBOLS_H +#define ring_core_generated_PREFIX_SYMBOLS_H + +#define ecp_nistz256_point_double p256_point_double +#define ecp_nistz256_point_add p256_point_add +#define ecp_nistz256_point_add_affine p256_point_add_affine +#define ecp_nistz256_ord_mul_mont p256_scalar_mul_mont +#define ecp_nistz256_ord_sqr_mont p256_scalar_sqr_rep_mont +#define ecp_nistz256_mul_mont p256_mul_mont +#define ecp_nistz256_sqr_mont p256_sqr_mont +#define adx_bmi2_available ring_core_0_17_14__adx_bmi2_available +#define avx2_available ring_core_0_17_14__avx2_available +#define CRYPTO_memcmp ring_core_0_17_14__CRYPTO_memcmp +#define CRYPTO_poly1305_finish ring_core_0_17_14__CRYPTO_poly1305_finish +#define CRYPTO_poly1305_finish_neon ring_core_0_17_14__CRYPTO_poly1305_finish_neon +#define CRYPTO_poly1305_init ring_core_0_17_14__CRYPTO_poly1305_init +#define CRYPTO_poly1305_init_neon ring_core_0_17_14__CRYPTO_poly1305_init_neon +#define CRYPTO_poly1305_update ring_core_0_17_14__CRYPTO_poly1305_update +#define CRYPTO_poly1305_update_neon ring_core_0_17_14__CRYPTO_poly1305_update_neon +#define ChaCha20_ctr32 ring_core_0_17_14__ChaCha20_ctr32 +#define ChaCha20_ctr32_avx2 ring_core_0_17_14__ChaCha20_ctr32_avx2 +#define ChaCha20_ctr32_neon ring_core_0_17_14__ChaCha20_ctr32_neon +#define ChaCha20_ctr32_nohw ring_core_0_17_14__ChaCha20_ctr32_nohw +#define ChaCha20_ctr32_ssse3 ring_core_0_17_14__ChaCha20_ctr32_ssse3 +#define ChaCha20_ctr32_ssse3_4x ring_core_0_17_14__ChaCha20_ctr32_ssse3_4x +#define LIMB_is_zero ring_core_0_17_14__LIMB_is_zero +#define LIMBS_add_mod ring_core_0_17_14__LIMBS_add_mod +#define LIMBS_are_zero ring_core_0_17_14__LIMBS_are_zero +#define LIMBS_equal ring_core_0_17_14__LIMBS_equal +#define LIMBS_less_than ring_core_0_17_14__LIMBS_less_than +#define LIMBS_reduce_once ring_core_0_17_14__LIMBS_reduce_once +#define LIMBS_select_512_32 ring_core_0_17_14__LIMBS_select_512_32 +#define LIMBS_shl_mod ring_core_0_17_14__LIMBS_shl_mod +#define LIMBS_sub_mod ring_core_0_17_14__LIMBS_sub_mod +#define LIMBS_window5_split_window ring_core_0_17_14__LIMBS_window5_split_window +#define LIMBS_window5_unsplit_window ring_core_0_17_14__LIMBS_window5_unsplit_window +#define LIMB_shr ring_core_0_17_14__LIMB_shr +#define OPENSSL_cpuid_setup ring_core_0_17_14__OPENSSL_cpuid_setup +#define aes_gcm_dec_kernel ring_core_0_17_14__aes_gcm_dec_kernel +#define aes_gcm_dec_update_vaes_avx2 ring_core_0_17_14__aes_gcm_dec_update_vaes_avx2 +#define aes_gcm_enc_kernel ring_core_0_17_14__aes_gcm_enc_kernel +#define aes_gcm_enc_update_vaes_avx2 ring_core_0_17_14__aes_gcm_enc_update_vaes_avx2 +#define aes_hw_ctr32_encrypt_blocks ring_core_0_17_14__aes_hw_ctr32_encrypt_blocks +#define aes_hw_set_encrypt_key ring_core_0_17_14__aes_hw_set_encrypt_key +#define aes_hw_set_encrypt_key_alt 
ring_core_0_17_14__aes_hw_set_encrypt_key_alt +#define aes_hw_set_encrypt_key_base ring_core_0_17_14__aes_hw_set_encrypt_key_base +#define aes_nohw_ctr32_encrypt_blocks ring_core_0_17_14__aes_nohw_ctr32_encrypt_blocks +#define aes_nohw_encrypt ring_core_0_17_14__aes_nohw_encrypt +#define aes_nohw_set_encrypt_key ring_core_0_17_14__aes_nohw_set_encrypt_key +#define aesni_gcm_decrypt ring_core_0_17_14__aesni_gcm_decrypt +#define aesni_gcm_encrypt ring_core_0_17_14__aesni_gcm_encrypt +#define bn_from_montgomery_in_place ring_core_0_17_14__bn_from_montgomery_in_place +#define bn_gather5 ring_core_0_17_14__bn_gather5 +#define bn_mul_mont ring_core_0_17_14__bn_mul_mont +#define bn_mul_mont_nohw ring_core_0_17_14__bn_mul_mont_nohw +#define bn_mul4x_mont ring_core_0_17_14__bn_mul4x_mont +#define bn_mulx4x_mont ring_core_0_17_14__bn_mulx4x_mont +#define bn_mul8x_mont_neon ring_core_0_17_14__bn_mul8x_mont_neon +#define bn_mul4x_mont_gather5 ring_core_0_17_14__bn_mul4x_mont_gather5 +#define bn_mulx4x_mont_gather5 ring_core_0_17_14__bn_mulx4x_mont_gather5 +#define bn_neg_inv_mod_r_u64 ring_core_0_17_14__bn_neg_inv_mod_r_u64 +#define bn_power5_nohw ring_core_0_17_14__bn_power5_nohw +#define bn_powerx5 ring_core_0_17_14__bn_powerx5 +#define bn_scatter5 ring_core_0_17_14__bn_scatter5 +#define bn_sqr8x_internal ring_core_0_17_14__bn_sqr8x_internal +#define bn_sqr8x_mont ring_core_0_17_14__bn_sqr8x_mont +#define bn_sqrx8x_internal ring_core_0_17_14__bn_sqrx8x_internal +#define bsaes_ctr32_encrypt_blocks ring_core_0_17_14__bsaes_ctr32_encrypt_blocks +#define bssl_constant_time_test_conditional_memcpy ring_core_0_17_14__bssl_constant_time_test_conditional_memcpy +#define bssl_constant_time_test_conditional_memxor ring_core_0_17_14__bssl_constant_time_test_conditional_memxor +#define bssl_constant_time_test_main ring_core_0_17_14__bssl_constant_time_test_main +#define chacha20_poly1305_open ring_core_0_17_14__chacha20_poly1305_open +#define chacha20_poly1305_open_avx2 ring_core_0_17_14__chacha20_poly1305_open_avx2 +#define chacha20_poly1305_open_sse41 ring_core_0_17_14__chacha20_poly1305_open_sse41 +#define chacha20_poly1305_seal ring_core_0_17_14__chacha20_poly1305_seal +#define chacha20_poly1305_seal_avx2 ring_core_0_17_14__chacha20_poly1305_seal_avx2 +#define chacha20_poly1305_seal_sse41 ring_core_0_17_14__chacha20_poly1305_seal_sse41 +#define ecp_nistz256_mul_mont_adx ring_core_0_17_14__ecp_nistz256_mul_mont_adx +#define ecp_nistz256_mul_mont_nohw ring_core_0_17_14__ecp_nistz256_mul_mont_nohw +#define ecp_nistz256_ord_mul_mont_adx ring_core_0_17_14__ecp_nistz256_ord_mul_mont_adx +#define ecp_nistz256_ord_mul_mont_nohw ring_core_0_17_14__ecp_nistz256_ord_mul_mont_nohw +#define ecp_nistz256_ord_sqr_mont_adx ring_core_0_17_14__ecp_nistz256_ord_sqr_mont_adx +#define ecp_nistz256_ord_sqr_mont_nohw ring_core_0_17_14__ecp_nistz256_ord_sqr_mont_nohw +#define ecp_nistz256_point_add_adx ring_core_0_17_14__ecp_nistz256_point_add_adx +#define ecp_nistz256_point_add_nohw ring_core_0_17_14__ecp_nistz256_point_add_nohw +#define ecp_nistz256_point_add_affine_adx ring_core_0_17_14__ecp_nistz256_point_add_affine_adx +#define ecp_nistz256_point_add_affine_nohw ring_core_0_17_14__ecp_nistz256_point_add_affine_nohw +#define ecp_nistz256_point_double_adx ring_core_0_17_14__ecp_nistz256_point_double_adx +#define ecp_nistz256_point_double_nohw ring_core_0_17_14__ecp_nistz256_point_double_nohw +#define ecp_nistz256_select_w5_avx2 ring_core_0_17_14__ecp_nistz256_select_w5_avx2 +#define ecp_nistz256_select_w5_nohw 
ring_core_0_17_14__ecp_nistz256_select_w5_nohw +#define ecp_nistz256_select_w7_avx2 ring_core_0_17_14__ecp_nistz256_select_w7_avx2 +#define ecp_nistz256_select_w7_nohw ring_core_0_17_14__ecp_nistz256_select_w7_nohw +#define ecp_nistz256_sqr_mont_adx ring_core_0_17_14__ecp_nistz256_sqr_mont_adx +#define ecp_nistz256_sqr_mont_nohw ring_core_0_17_14__ecp_nistz256_sqr_mont_nohw +#define fiat_curve25519_adx_mul ring_core_0_17_14__fiat_curve25519_adx_mul +#define fiat_curve25519_adx_square ring_core_0_17_14__fiat_curve25519_adx_square +#define gcm_ghash_avx ring_core_0_17_14__gcm_ghash_avx +#define gcm_ghash_clmul ring_core_0_17_14__gcm_ghash_clmul +#define gcm_ghash_neon ring_core_0_17_14__gcm_ghash_neon +#define gcm_ghash_vpclmulqdq_avx2_1 ring_core_0_17_14__gcm_ghash_vpclmulqdq_avx2_1 +#define gcm_gmult_clmul ring_core_0_17_14__gcm_gmult_clmul +#define gcm_gmult_neon ring_core_0_17_14__gcm_gmult_neon +#define gcm_init_avx ring_core_0_17_14__gcm_init_avx +#define gcm_init_clmul ring_core_0_17_14__gcm_init_clmul +#define gcm_init_neon ring_core_0_17_14__gcm_init_neon +#define gcm_init_vpclmulqdq_avx2 ring_core_0_17_14__gcm_init_vpclmulqdq_avx2 +#define k25519Precomp ring_core_0_17_14__k25519Precomp +#define limbs_mul_add_limb ring_core_0_17_14__limbs_mul_add_limb +#define little_endian_bytes_from_scalar ring_core_0_17_14__little_endian_bytes_from_scalar +#define ecp_nistz256_neg ring_core_0_17_14__ecp_nistz256_neg +#define ecp_nistz256_select_w5 ring_core_0_17_14__ecp_nistz256_select_w5 +#define ecp_nistz256_select_w7 ring_core_0_17_14__ecp_nistz256_select_w7 +#define neon_available ring_core_0_17_14__neon_available +#define p256_mul_mont ring_core_0_17_14__p256_mul_mont +#define p256_point_add ring_core_0_17_14__p256_point_add +#define p256_point_add_affine ring_core_0_17_14__p256_point_add_affine +#define p256_point_double ring_core_0_17_14__p256_point_double +#define p256_point_mul ring_core_0_17_14__p256_point_mul +#define p256_point_mul_base ring_core_0_17_14__p256_point_mul_base +#define p256_point_mul_base_vartime ring_core_0_17_14__p256_point_mul_base_vartime +#define p256_scalar_mul_mont ring_core_0_17_14__p256_scalar_mul_mont +#define p256_scalar_sqr_rep_mont ring_core_0_17_14__p256_scalar_sqr_rep_mont +#define p256_sqr_mont ring_core_0_17_14__p256_sqr_mont +#define p384_elem_div_by_2 ring_core_0_17_14__p384_elem_div_by_2 +#define p384_elem_mul_mont ring_core_0_17_14__p384_elem_mul_mont +#define p384_elem_neg ring_core_0_17_14__p384_elem_neg +#define p384_elem_sub ring_core_0_17_14__p384_elem_sub +#define p384_point_add ring_core_0_17_14__p384_point_add +#define p384_point_double ring_core_0_17_14__p384_point_double +#define p384_point_mul ring_core_0_17_14__p384_point_mul +#define p384_scalar_mul_mont ring_core_0_17_14__p384_scalar_mul_mont +#define openssl_poly1305_neon2_addmulmod ring_core_0_17_14__openssl_poly1305_neon2_addmulmod +#define openssl_poly1305_neon2_blocks ring_core_0_17_14__openssl_poly1305_neon2_blocks +#define sha256_block_data_order ring_core_0_17_14__sha256_block_data_order +#define sha256_block_data_order_avx ring_core_0_17_14__sha256_block_data_order_avx +#define sha256_block_data_order_ssse3 ring_core_0_17_14__sha256_block_data_order_ssse3 +#define sha256_block_data_order_hw ring_core_0_17_14__sha256_block_data_order_hw +#define sha256_block_data_order_neon ring_core_0_17_14__sha256_block_data_order_neon +#define sha256_block_data_order_nohw ring_core_0_17_14__sha256_block_data_order_nohw +#define sha512_block_data_order 
ring_core_0_17_14__sha512_block_data_order +#define sha512_block_data_order_avx ring_core_0_17_14__sha512_block_data_order_avx +#define sha512_block_data_order_hw ring_core_0_17_14__sha512_block_data_order_hw +#define sha512_block_data_order_neon ring_core_0_17_14__sha512_block_data_order_neon +#define sha512_block_data_order_nohw ring_core_0_17_14__sha512_block_data_order_nohw +#define vpaes_ctr32_encrypt_blocks ring_core_0_17_14__vpaes_ctr32_encrypt_blocks +#define vpaes_encrypt ring_core_0_17_14__vpaes_encrypt +#define vpaes_encrypt_key_to_bsaes ring_core_0_17_14__vpaes_encrypt_key_to_bsaes +#define vpaes_set_encrypt_key ring_core_0_17_14__vpaes_set_encrypt_key +#define x25519_NEON ring_core_0_17_14__x25519_NEON +#define x25519_fe_invert ring_core_0_17_14__x25519_fe_invert +#define x25519_fe_isnegative ring_core_0_17_14__x25519_fe_isnegative +#define x25519_fe_mul_ttt ring_core_0_17_14__x25519_fe_mul_ttt +#define x25519_fe_neg ring_core_0_17_14__x25519_fe_neg +#define x25519_fe_tobytes ring_core_0_17_14__x25519_fe_tobytes +#define x25519_ge_double_scalarmult_vartime ring_core_0_17_14__x25519_ge_double_scalarmult_vartime +#define x25519_ge_frombytes_vartime ring_core_0_17_14__x25519_ge_frombytes_vartime +#define x25519_ge_scalarmult_base ring_core_0_17_14__x25519_ge_scalarmult_base +#define x25519_ge_scalarmult_base_adx ring_core_0_17_14__x25519_ge_scalarmult_base_adx +#define x25519_public_from_private_generic_masked ring_core_0_17_14__x25519_public_from_private_generic_masked +#define x25519_sc_mask ring_core_0_17_14__x25519_sc_mask +#define x25519_sc_muladd ring_core_0_17_14__x25519_sc_muladd +#define x25519_sc_reduce ring_core_0_17_14__x25519_sc_reduce +#define x25519_scalar_mult_adx ring_core_0_17_14__x25519_scalar_mult_adx +#define x25519_scalar_mult_generic_masked ring_core_0_17_14__x25519_scalar_mult_generic_masked + +#endif diff --git a/ring-0.17.14/pregenerated/ring_core_generated/prefix_symbols_asm.h b/ring-0.17.14/pregenerated/ring_core_generated/prefix_symbols_asm.h new file mode 100644 index 0000000000..5db19eb659 --- /dev/null +++ b/ring-0.17.14/pregenerated/ring_core_generated/prefix_symbols_asm.h @@ -0,0 +1,334 @@ + +#ifndef ring_core_generated_PREFIX_SYMBOLS_ASM_H +#define ring_core_generated_PREFIX_SYMBOLS_ASM_H + +#if defined(__APPLE__) +#define _ecp_nistz256_point_double _p256_point_double +#define _ecp_nistz256_point_add _p256_point_add +#define _ecp_nistz256_point_add_affine _p256_point_add_affine +#define _ecp_nistz256_ord_mul_mont _p256_scalar_mul_mont +#define _ecp_nistz256_ord_sqr_mont _p256_scalar_sqr_rep_mont +#define _ecp_nistz256_mul_mont _p256_mul_mont +#define _ecp_nistz256_sqr_mont _p256_sqr_mont +#define _adx_bmi2_available _ring_core_0_17_14__adx_bmi2_available +#define _avx2_available _ring_core_0_17_14__avx2_available +#define _CRYPTO_memcmp _ring_core_0_17_14__CRYPTO_memcmp +#define _CRYPTO_poly1305_finish _ring_core_0_17_14__CRYPTO_poly1305_finish +#define _CRYPTO_poly1305_finish_neon _ring_core_0_17_14__CRYPTO_poly1305_finish_neon +#define _CRYPTO_poly1305_init _ring_core_0_17_14__CRYPTO_poly1305_init +#define _CRYPTO_poly1305_init_neon _ring_core_0_17_14__CRYPTO_poly1305_init_neon +#define _CRYPTO_poly1305_update _ring_core_0_17_14__CRYPTO_poly1305_update +#define _CRYPTO_poly1305_update_neon _ring_core_0_17_14__CRYPTO_poly1305_update_neon +#define _ChaCha20_ctr32 _ring_core_0_17_14__ChaCha20_ctr32 +#define _ChaCha20_ctr32_avx2 _ring_core_0_17_14__ChaCha20_ctr32_avx2 +#define _ChaCha20_ctr32_neon _ring_core_0_17_14__ChaCha20_ctr32_neon +#define 
_ChaCha20_ctr32_nohw _ring_core_0_17_14__ChaCha20_ctr32_nohw +#define _ChaCha20_ctr32_ssse3 _ring_core_0_17_14__ChaCha20_ctr32_ssse3 +#define _ChaCha20_ctr32_ssse3_4x _ring_core_0_17_14__ChaCha20_ctr32_ssse3_4x +#define _LIMB_is_zero _ring_core_0_17_14__LIMB_is_zero +#define _LIMBS_add_mod _ring_core_0_17_14__LIMBS_add_mod +#define _LIMBS_are_zero _ring_core_0_17_14__LIMBS_are_zero +#define _LIMBS_equal _ring_core_0_17_14__LIMBS_equal +#define _LIMBS_less_than _ring_core_0_17_14__LIMBS_less_than +#define _LIMBS_reduce_once _ring_core_0_17_14__LIMBS_reduce_once +#define _LIMBS_select_512_32 _ring_core_0_17_14__LIMBS_select_512_32 +#define _LIMBS_shl_mod _ring_core_0_17_14__LIMBS_shl_mod +#define _LIMBS_sub_mod _ring_core_0_17_14__LIMBS_sub_mod +#define _LIMBS_window5_split_window _ring_core_0_17_14__LIMBS_window5_split_window +#define _LIMBS_window5_unsplit_window _ring_core_0_17_14__LIMBS_window5_unsplit_window +#define _LIMB_shr _ring_core_0_17_14__LIMB_shr +#define _OPENSSL_cpuid_setup _ring_core_0_17_14__OPENSSL_cpuid_setup +#define _aes_gcm_dec_kernel _ring_core_0_17_14__aes_gcm_dec_kernel +#define _aes_gcm_dec_update_vaes_avx2 _ring_core_0_17_14__aes_gcm_dec_update_vaes_avx2 +#define _aes_gcm_enc_kernel _ring_core_0_17_14__aes_gcm_enc_kernel +#define _aes_gcm_enc_update_vaes_avx2 _ring_core_0_17_14__aes_gcm_enc_update_vaes_avx2 +#define _aes_hw_ctr32_encrypt_blocks _ring_core_0_17_14__aes_hw_ctr32_encrypt_blocks +#define _aes_hw_set_encrypt_key _ring_core_0_17_14__aes_hw_set_encrypt_key +#define _aes_hw_set_encrypt_key_alt _ring_core_0_17_14__aes_hw_set_encrypt_key_alt +#define _aes_hw_set_encrypt_key_base _ring_core_0_17_14__aes_hw_set_encrypt_key_base +#define _aes_nohw_ctr32_encrypt_blocks _ring_core_0_17_14__aes_nohw_ctr32_encrypt_blocks +#define _aes_nohw_encrypt _ring_core_0_17_14__aes_nohw_encrypt +#define _aes_nohw_set_encrypt_key _ring_core_0_17_14__aes_nohw_set_encrypt_key +#define _aesni_gcm_decrypt _ring_core_0_17_14__aesni_gcm_decrypt +#define _aesni_gcm_encrypt _ring_core_0_17_14__aesni_gcm_encrypt +#define _bn_from_montgomery_in_place _ring_core_0_17_14__bn_from_montgomery_in_place +#define _bn_gather5 _ring_core_0_17_14__bn_gather5 +#define _bn_mul_mont _ring_core_0_17_14__bn_mul_mont +#define _bn_mul_mont_nohw _ring_core_0_17_14__bn_mul_mont_nohw +#define _bn_mul4x_mont _ring_core_0_17_14__bn_mul4x_mont +#define _bn_mulx4x_mont _ring_core_0_17_14__bn_mulx4x_mont +#define _bn_mul8x_mont_neon _ring_core_0_17_14__bn_mul8x_mont_neon +#define _bn_mul4x_mont_gather5 _ring_core_0_17_14__bn_mul4x_mont_gather5 +#define _bn_mulx4x_mont_gather5 _ring_core_0_17_14__bn_mulx4x_mont_gather5 +#define _bn_neg_inv_mod_r_u64 _ring_core_0_17_14__bn_neg_inv_mod_r_u64 +#define _bn_power5_nohw _ring_core_0_17_14__bn_power5_nohw +#define _bn_powerx5 _ring_core_0_17_14__bn_powerx5 +#define _bn_scatter5 _ring_core_0_17_14__bn_scatter5 +#define _bn_sqr8x_internal _ring_core_0_17_14__bn_sqr8x_internal +#define _bn_sqr8x_mont _ring_core_0_17_14__bn_sqr8x_mont +#define _bn_sqrx8x_internal _ring_core_0_17_14__bn_sqrx8x_internal +#define _bsaes_ctr32_encrypt_blocks _ring_core_0_17_14__bsaes_ctr32_encrypt_blocks +#define _bssl_constant_time_test_conditional_memcpy _ring_core_0_17_14__bssl_constant_time_test_conditional_memcpy +#define _bssl_constant_time_test_conditional_memxor _ring_core_0_17_14__bssl_constant_time_test_conditional_memxor +#define _bssl_constant_time_test_main _ring_core_0_17_14__bssl_constant_time_test_main +#define _chacha20_poly1305_open _ring_core_0_17_14__chacha20_poly1305_open 
+#define _chacha20_poly1305_open_avx2 _ring_core_0_17_14__chacha20_poly1305_open_avx2 +#define _chacha20_poly1305_open_sse41 _ring_core_0_17_14__chacha20_poly1305_open_sse41 +#define _chacha20_poly1305_seal _ring_core_0_17_14__chacha20_poly1305_seal +#define _chacha20_poly1305_seal_avx2 _ring_core_0_17_14__chacha20_poly1305_seal_avx2 +#define _chacha20_poly1305_seal_sse41 _ring_core_0_17_14__chacha20_poly1305_seal_sse41 +#define _ecp_nistz256_mul_mont_adx _ring_core_0_17_14__ecp_nistz256_mul_mont_adx +#define _ecp_nistz256_mul_mont_nohw _ring_core_0_17_14__ecp_nistz256_mul_mont_nohw +#define _ecp_nistz256_ord_mul_mont_adx _ring_core_0_17_14__ecp_nistz256_ord_mul_mont_adx +#define _ecp_nistz256_ord_mul_mont_nohw _ring_core_0_17_14__ecp_nistz256_ord_mul_mont_nohw +#define _ecp_nistz256_ord_sqr_mont_adx _ring_core_0_17_14__ecp_nistz256_ord_sqr_mont_adx +#define _ecp_nistz256_ord_sqr_mont_nohw _ring_core_0_17_14__ecp_nistz256_ord_sqr_mont_nohw +#define _ecp_nistz256_point_add_adx _ring_core_0_17_14__ecp_nistz256_point_add_adx +#define _ecp_nistz256_point_add_nohw _ring_core_0_17_14__ecp_nistz256_point_add_nohw +#define _ecp_nistz256_point_add_affine_adx _ring_core_0_17_14__ecp_nistz256_point_add_affine_adx +#define _ecp_nistz256_point_add_affine_nohw _ring_core_0_17_14__ecp_nistz256_point_add_affine_nohw +#define _ecp_nistz256_point_double_adx _ring_core_0_17_14__ecp_nistz256_point_double_adx +#define _ecp_nistz256_point_double_nohw _ring_core_0_17_14__ecp_nistz256_point_double_nohw +#define _ecp_nistz256_select_w5_avx2 _ring_core_0_17_14__ecp_nistz256_select_w5_avx2 +#define _ecp_nistz256_select_w5_nohw _ring_core_0_17_14__ecp_nistz256_select_w5_nohw +#define _ecp_nistz256_select_w7_avx2 _ring_core_0_17_14__ecp_nistz256_select_w7_avx2 +#define _ecp_nistz256_select_w7_nohw _ring_core_0_17_14__ecp_nistz256_select_w7_nohw +#define _ecp_nistz256_sqr_mont_adx _ring_core_0_17_14__ecp_nistz256_sqr_mont_adx +#define _ecp_nistz256_sqr_mont_nohw _ring_core_0_17_14__ecp_nistz256_sqr_mont_nohw +#define _fiat_curve25519_adx_mul _ring_core_0_17_14__fiat_curve25519_adx_mul +#define _fiat_curve25519_adx_square _ring_core_0_17_14__fiat_curve25519_adx_square +#define _gcm_ghash_avx _ring_core_0_17_14__gcm_ghash_avx +#define _gcm_ghash_clmul _ring_core_0_17_14__gcm_ghash_clmul +#define _gcm_ghash_neon _ring_core_0_17_14__gcm_ghash_neon +#define _gcm_ghash_vpclmulqdq_avx2_1 _ring_core_0_17_14__gcm_ghash_vpclmulqdq_avx2_1 +#define _gcm_gmult_clmul _ring_core_0_17_14__gcm_gmult_clmul +#define _gcm_gmult_neon _ring_core_0_17_14__gcm_gmult_neon +#define _gcm_init_avx _ring_core_0_17_14__gcm_init_avx +#define _gcm_init_clmul _ring_core_0_17_14__gcm_init_clmul +#define _gcm_init_neon _ring_core_0_17_14__gcm_init_neon +#define _gcm_init_vpclmulqdq_avx2 _ring_core_0_17_14__gcm_init_vpclmulqdq_avx2 +#define _k25519Precomp _ring_core_0_17_14__k25519Precomp +#define _limbs_mul_add_limb _ring_core_0_17_14__limbs_mul_add_limb +#define _little_endian_bytes_from_scalar _ring_core_0_17_14__little_endian_bytes_from_scalar +#define _ecp_nistz256_neg _ring_core_0_17_14__ecp_nistz256_neg +#define _ecp_nistz256_select_w5 _ring_core_0_17_14__ecp_nistz256_select_w5 +#define _ecp_nistz256_select_w7 _ring_core_0_17_14__ecp_nistz256_select_w7 +#define _neon_available _ring_core_0_17_14__neon_available +#define _p256_mul_mont _ring_core_0_17_14__p256_mul_mont +#define _p256_point_add _ring_core_0_17_14__p256_point_add +#define _p256_point_add_affine _ring_core_0_17_14__p256_point_add_affine +#define _p256_point_double 
_ring_core_0_17_14__p256_point_double +#define _p256_point_mul _ring_core_0_17_14__p256_point_mul +#define _p256_point_mul_base _ring_core_0_17_14__p256_point_mul_base +#define _p256_point_mul_base_vartime _ring_core_0_17_14__p256_point_mul_base_vartime +#define _p256_scalar_mul_mont _ring_core_0_17_14__p256_scalar_mul_mont +#define _p256_scalar_sqr_rep_mont _ring_core_0_17_14__p256_scalar_sqr_rep_mont +#define _p256_sqr_mont _ring_core_0_17_14__p256_sqr_mont +#define _p384_elem_div_by_2 _ring_core_0_17_14__p384_elem_div_by_2 +#define _p384_elem_mul_mont _ring_core_0_17_14__p384_elem_mul_mont +#define _p384_elem_neg _ring_core_0_17_14__p384_elem_neg +#define _p384_elem_sub _ring_core_0_17_14__p384_elem_sub +#define _p384_point_add _ring_core_0_17_14__p384_point_add +#define _p384_point_double _ring_core_0_17_14__p384_point_double +#define _p384_point_mul _ring_core_0_17_14__p384_point_mul +#define _p384_scalar_mul_mont _ring_core_0_17_14__p384_scalar_mul_mont +#define _openssl_poly1305_neon2_addmulmod _ring_core_0_17_14__openssl_poly1305_neon2_addmulmod +#define _openssl_poly1305_neon2_blocks _ring_core_0_17_14__openssl_poly1305_neon2_blocks +#define _sha256_block_data_order _ring_core_0_17_14__sha256_block_data_order +#define _sha256_block_data_order_avx _ring_core_0_17_14__sha256_block_data_order_avx +#define _sha256_block_data_order_ssse3 _ring_core_0_17_14__sha256_block_data_order_ssse3 +#define _sha256_block_data_order_hw _ring_core_0_17_14__sha256_block_data_order_hw +#define _sha256_block_data_order_neon _ring_core_0_17_14__sha256_block_data_order_neon +#define _sha256_block_data_order_nohw _ring_core_0_17_14__sha256_block_data_order_nohw +#define _sha512_block_data_order _ring_core_0_17_14__sha512_block_data_order +#define _sha512_block_data_order_avx _ring_core_0_17_14__sha512_block_data_order_avx +#define _sha512_block_data_order_hw _ring_core_0_17_14__sha512_block_data_order_hw +#define _sha512_block_data_order_neon _ring_core_0_17_14__sha512_block_data_order_neon +#define _sha512_block_data_order_nohw _ring_core_0_17_14__sha512_block_data_order_nohw +#define _vpaes_ctr32_encrypt_blocks _ring_core_0_17_14__vpaes_ctr32_encrypt_blocks +#define _vpaes_encrypt _ring_core_0_17_14__vpaes_encrypt +#define _vpaes_encrypt_key_to_bsaes _ring_core_0_17_14__vpaes_encrypt_key_to_bsaes +#define _vpaes_set_encrypt_key _ring_core_0_17_14__vpaes_set_encrypt_key +#define _x25519_NEON _ring_core_0_17_14__x25519_NEON +#define _x25519_fe_invert _ring_core_0_17_14__x25519_fe_invert +#define _x25519_fe_isnegative _ring_core_0_17_14__x25519_fe_isnegative +#define _x25519_fe_mul_ttt _ring_core_0_17_14__x25519_fe_mul_ttt +#define _x25519_fe_neg _ring_core_0_17_14__x25519_fe_neg +#define _x25519_fe_tobytes _ring_core_0_17_14__x25519_fe_tobytes +#define _x25519_ge_double_scalarmult_vartime _ring_core_0_17_14__x25519_ge_double_scalarmult_vartime +#define _x25519_ge_frombytes_vartime _ring_core_0_17_14__x25519_ge_frombytes_vartime +#define _x25519_ge_scalarmult_base _ring_core_0_17_14__x25519_ge_scalarmult_base +#define _x25519_ge_scalarmult_base_adx _ring_core_0_17_14__x25519_ge_scalarmult_base_adx +#define _x25519_public_from_private_generic_masked _ring_core_0_17_14__x25519_public_from_private_generic_masked +#define _x25519_sc_mask _ring_core_0_17_14__x25519_sc_mask +#define _x25519_sc_muladd _ring_core_0_17_14__x25519_sc_muladd +#define _x25519_sc_reduce _ring_core_0_17_14__x25519_sc_reduce +#define _x25519_scalar_mult_adx _ring_core_0_17_14__x25519_scalar_mult_adx +#define 
_x25519_scalar_mult_generic_masked _ring_core_0_17_14__x25519_scalar_mult_generic_masked + +#else +#define ecp_nistz256_point_double p256_point_double +#define ecp_nistz256_point_add p256_point_add +#define ecp_nistz256_point_add_affine p256_point_add_affine +#define ecp_nistz256_ord_mul_mont p256_scalar_mul_mont +#define ecp_nistz256_ord_sqr_mont p256_scalar_sqr_rep_mont +#define ecp_nistz256_mul_mont p256_mul_mont +#define ecp_nistz256_sqr_mont p256_sqr_mont +#define adx_bmi2_available ring_core_0_17_14__adx_bmi2_available +#define avx2_available ring_core_0_17_14__avx2_available +#define CRYPTO_memcmp ring_core_0_17_14__CRYPTO_memcmp +#define CRYPTO_poly1305_finish ring_core_0_17_14__CRYPTO_poly1305_finish +#define CRYPTO_poly1305_finish_neon ring_core_0_17_14__CRYPTO_poly1305_finish_neon +#define CRYPTO_poly1305_init ring_core_0_17_14__CRYPTO_poly1305_init +#define CRYPTO_poly1305_init_neon ring_core_0_17_14__CRYPTO_poly1305_init_neon +#define CRYPTO_poly1305_update ring_core_0_17_14__CRYPTO_poly1305_update +#define CRYPTO_poly1305_update_neon ring_core_0_17_14__CRYPTO_poly1305_update_neon +#define ChaCha20_ctr32 ring_core_0_17_14__ChaCha20_ctr32 +#define ChaCha20_ctr32_avx2 ring_core_0_17_14__ChaCha20_ctr32_avx2 +#define ChaCha20_ctr32_neon ring_core_0_17_14__ChaCha20_ctr32_neon +#define ChaCha20_ctr32_nohw ring_core_0_17_14__ChaCha20_ctr32_nohw +#define ChaCha20_ctr32_ssse3 ring_core_0_17_14__ChaCha20_ctr32_ssse3 +#define ChaCha20_ctr32_ssse3_4x ring_core_0_17_14__ChaCha20_ctr32_ssse3_4x +#define LIMB_is_zero ring_core_0_17_14__LIMB_is_zero +#define LIMBS_add_mod ring_core_0_17_14__LIMBS_add_mod +#define LIMBS_are_zero ring_core_0_17_14__LIMBS_are_zero +#define LIMBS_equal ring_core_0_17_14__LIMBS_equal +#define LIMBS_less_than ring_core_0_17_14__LIMBS_less_than +#define LIMBS_reduce_once ring_core_0_17_14__LIMBS_reduce_once +#define LIMBS_select_512_32 ring_core_0_17_14__LIMBS_select_512_32 +#define LIMBS_shl_mod ring_core_0_17_14__LIMBS_shl_mod +#define LIMBS_sub_mod ring_core_0_17_14__LIMBS_sub_mod +#define LIMBS_window5_split_window ring_core_0_17_14__LIMBS_window5_split_window +#define LIMBS_window5_unsplit_window ring_core_0_17_14__LIMBS_window5_unsplit_window +#define LIMB_shr ring_core_0_17_14__LIMB_shr +#define OPENSSL_cpuid_setup ring_core_0_17_14__OPENSSL_cpuid_setup +#define aes_gcm_dec_kernel ring_core_0_17_14__aes_gcm_dec_kernel +#define aes_gcm_dec_update_vaes_avx2 ring_core_0_17_14__aes_gcm_dec_update_vaes_avx2 +#define aes_gcm_enc_kernel ring_core_0_17_14__aes_gcm_enc_kernel +#define aes_gcm_enc_update_vaes_avx2 ring_core_0_17_14__aes_gcm_enc_update_vaes_avx2 +#define aes_hw_ctr32_encrypt_blocks ring_core_0_17_14__aes_hw_ctr32_encrypt_blocks +#define aes_hw_set_encrypt_key ring_core_0_17_14__aes_hw_set_encrypt_key +#define aes_hw_set_encrypt_key_alt ring_core_0_17_14__aes_hw_set_encrypt_key_alt +#define aes_hw_set_encrypt_key_base ring_core_0_17_14__aes_hw_set_encrypt_key_base +#define aes_nohw_ctr32_encrypt_blocks ring_core_0_17_14__aes_nohw_ctr32_encrypt_blocks +#define aes_nohw_encrypt ring_core_0_17_14__aes_nohw_encrypt +#define aes_nohw_set_encrypt_key ring_core_0_17_14__aes_nohw_set_encrypt_key +#define aesni_gcm_decrypt ring_core_0_17_14__aesni_gcm_decrypt +#define aesni_gcm_encrypt ring_core_0_17_14__aesni_gcm_encrypt +#define bn_from_montgomery_in_place ring_core_0_17_14__bn_from_montgomery_in_place +#define bn_gather5 ring_core_0_17_14__bn_gather5 +#define bn_mul_mont ring_core_0_17_14__bn_mul_mont +#define bn_mul_mont_nohw 
ring_core_0_17_14__bn_mul_mont_nohw +#define bn_mul4x_mont ring_core_0_17_14__bn_mul4x_mont +#define bn_mulx4x_mont ring_core_0_17_14__bn_mulx4x_mont +#define bn_mul8x_mont_neon ring_core_0_17_14__bn_mul8x_mont_neon +#define bn_mul4x_mont_gather5 ring_core_0_17_14__bn_mul4x_mont_gather5 +#define bn_mulx4x_mont_gather5 ring_core_0_17_14__bn_mulx4x_mont_gather5 +#define bn_neg_inv_mod_r_u64 ring_core_0_17_14__bn_neg_inv_mod_r_u64 +#define bn_power5_nohw ring_core_0_17_14__bn_power5_nohw +#define bn_powerx5 ring_core_0_17_14__bn_powerx5 +#define bn_scatter5 ring_core_0_17_14__bn_scatter5 +#define bn_sqr8x_internal ring_core_0_17_14__bn_sqr8x_internal +#define bn_sqr8x_mont ring_core_0_17_14__bn_sqr8x_mont +#define bn_sqrx8x_internal ring_core_0_17_14__bn_sqrx8x_internal +#define bsaes_ctr32_encrypt_blocks ring_core_0_17_14__bsaes_ctr32_encrypt_blocks +#define bssl_constant_time_test_conditional_memcpy ring_core_0_17_14__bssl_constant_time_test_conditional_memcpy +#define bssl_constant_time_test_conditional_memxor ring_core_0_17_14__bssl_constant_time_test_conditional_memxor +#define bssl_constant_time_test_main ring_core_0_17_14__bssl_constant_time_test_main +#define chacha20_poly1305_open ring_core_0_17_14__chacha20_poly1305_open +#define chacha20_poly1305_open_avx2 ring_core_0_17_14__chacha20_poly1305_open_avx2 +#define chacha20_poly1305_open_sse41 ring_core_0_17_14__chacha20_poly1305_open_sse41 +#define chacha20_poly1305_seal ring_core_0_17_14__chacha20_poly1305_seal +#define chacha20_poly1305_seal_avx2 ring_core_0_17_14__chacha20_poly1305_seal_avx2 +#define chacha20_poly1305_seal_sse41 ring_core_0_17_14__chacha20_poly1305_seal_sse41 +#define ecp_nistz256_mul_mont_adx ring_core_0_17_14__ecp_nistz256_mul_mont_adx +#define ecp_nistz256_mul_mont_nohw ring_core_0_17_14__ecp_nistz256_mul_mont_nohw +#define ecp_nistz256_ord_mul_mont_adx ring_core_0_17_14__ecp_nistz256_ord_mul_mont_adx +#define ecp_nistz256_ord_mul_mont_nohw ring_core_0_17_14__ecp_nistz256_ord_mul_mont_nohw +#define ecp_nistz256_ord_sqr_mont_adx ring_core_0_17_14__ecp_nistz256_ord_sqr_mont_adx +#define ecp_nistz256_ord_sqr_mont_nohw ring_core_0_17_14__ecp_nistz256_ord_sqr_mont_nohw +#define ecp_nistz256_point_add_adx ring_core_0_17_14__ecp_nistz256_point_add_adx +#define ecp_nistz256_point_add_nohw ring_core_0_17_14__ecp_nistz256_point_add_nohw +#define ecp_nistz256_point_add_affine_adx ring_core_0_17_14__ecp_nistz256_point_add_affine_adx +#define ecp_nistz256_point_add_affine_nohw ring_core_0_17_14__ecp_nistz256_point_add_affine_nohw +#define ecp_nistz256_point_double_adx ring_core_0_17_14__ecp_nistz256_point_double_adx +#define ecp_nistz256_point_double_nohw ring_core_0_17_14__ecp_nistz256_point_double_nohw +#define ecp_nistz256_select_w5_avx2 ring_core_0_17_14__ecp_nistz256_select_w5_avx2 +#define ecp_nistz256_select_w5_nohw ring_core_0_17_14__ecp_nistz256_select_w5_nohw +#define ecp_nistz256_select_w7_avx2 ring_core_0_17_14__ecp_nistz256_select_w7_avx2 +#define ecp_nistz256_select_w7_nohw ring_core_0_17_14__ecp_nistz256_select_w7_nohw +#define ecp_nistz256_sqr_mont_adx ring_core_0_17_14__ecp_nistz256_sqr_mont_adx +#define ecp_nistz256_sqr_mont_nohw ring_core_0_17_14__ecp_nistz256_sqr_mont_nohw +#define fiat_curve25519_adx_mul ring_core_0_17_14__fiat_curve25519_adx_mul +#define fiat_curve25519_adx_square ring_core_0_17_14__fiat_curve25519_adx_square +#define gcm_ghash_avx ring_core_0_17_14__gcm_ghash_avx +#define gcm_ghash_clmul ring_core_0_17_14__gcm_ghash_clmul +#define gcm_ghash_neon ring_core_0_17_14__gcm_ghash_neon 
+#define gcm_ghash_vpclmulqdq_avx2_1 ring_core_0_17_14__gcm_ghash_vpclmulqdq_avx2_1 +#define gcm_gmult_clmul ring_core_0_17_14__gcm_gmult_clmul +#define gcm_gmult_neon ring_core_0_17_14__gcm_gmult_neon +#define gcm_init_avx ring_core_0_17_14__gcm_init_avx +#define gcm_init_clmul ring_core_0_17_14__gcm_init_clmul +#define gcm_init_neon ring_core_0_17_14__gcm_init_neon +#define gcm_init_vpclmulqdq_avx2 ring_core_0_17_14__gcm_init_vpclmulqdq_avx2 +#define k25519Precomp ring_core_0_17_14__k25519Precomp +#define limbs_mul_add_limb ring_core_0_17_14__limbs_mul_add_limb +#define little_endian_bytes_from_scalar ring_core_0_17_14__little_endian_bytes_from_scalar +#define ecp_nistz256_neg ring_core_0_17_14__ecp_nistz256_neg +#define ecp_nistz256_select_w5 ring_core_0_17_14__ecp_nistz256_select_w5 +#define ecp_nistz256_select_w7 ring_core_0_17_14__ecp_nistz256_select_w7 +#define neon_available ring_core_0_17_14__neon_available +#define p256_mul_mont ring_core_0_17_14__p256_mul_mont +#define p256_point_add ring_core_0_17_14__p256_point_add +#define p256_point_add_affine ring_core_0_17_14__p256_point_add_affine +#define p256_point_double ring_core_0_17_14__p256_point_double +#define p256_point_mul ring_core_0_17_14__p256_point_mul +#define p256_point_mul_base ring_core_0_17_14__p256_point_mul_base +#define p256_point_mul_base_vartime ring_core_0_17_14__p256_point_mul_base_vartime +#define p256_scalar_mul_mont ring_core_0_17_14__p256_scalar_mul_mont +#define p256_scalar_sqr_rep_mont ring_core_0_17_14__p256_scalar_sqr_rep_mont +#define p256_sqr_mont ring_core_0_17_14__p256_sqr_mont +#define p384_elem_div_by_2 ring_core_0_17_14__p384_elem_div_by_2 +#define p384_elem_mul_mont ring_core_0_17_14__p384_elem_mul_mont +#define p384_elem_neg ring_core_0_17_14__p384_elem_neg +#define p384_elem_sub ring_core_0_17_14__p384_elem_sub +#define p384_point_add ring_core_0_17_14__p384_point_add +#define p384_point_double ring_core_0_17_14__p384_point_double +#define p384_point_mul ring_core_0_17_14__p384_point_mul +#define p384_scalar_mul_mont ring_core_0_17_14__p384_scalar_mul_mont +#define openssl_poly1305_neon2_addmulmod ring_core_0_17_14__openssl_poly1305_neon2_addmulmod +#define openssl_poly1305_neon2_blocks ring_core_0_17_14__openssl_poly1305_neon2_blocks +#define sha256_block_data_order ring_core_0_17_14__sha256_block_data_order +#define sha256_block_data_order_avx ring_core_0_17_14__sha256_block_data_order_avx +#define sha256_block_data_order_ssse3 ring_core_0_17_14__sha256_block_data_order_ssse3 +#define sha256_block_data_order_hw ring_core_0_17_14__sha256_block_data_order_hw +#define sha256_block_data_order_neon ring_core_0_17_14__sha256_block_data_order_neon +#define sha256_block_data_order_nohw ring_core_0_17_14__sha256_block_data_order_nohw +#define sha512_block_data_order ring_core_0_17_14__sha512_block_data_order +#define sha512_block_data_order_avx ring_core_0_17_14__sha512_block_data_order_avx +#define sha512_block_data_order_hw ring_core_0_17_14__sha512_block_data_order_hw +#define sha512_block_data_order_neon ring_core_0_17_14__sha512_block_data_order_neon +#define sha512_block_data_order_nohw ring_core_0_17_14__sha512_block_data_order_nohw +#define vpaes_ctr32_encrypt_blocks ring_core_0_17_14__vpaes_ctr32_encrypt_blocks +#define vpaes_encrypt ring_core_0_17_14__vpaes_encrypt +#define vpaes_encrypt_key_to_bsaes ring_core_0_17_14__vpaes_encrypt_key_to_bsaes +#define vpaes_set_encrypt_key ring_core_0_17_14__vpaes_set_encrypt_key +#define x25519_NEON ring_core_0_17_14__x25519_NEON +#define 
x25519_fe_invert ring_core_0_17_14__x25519_fe_invert +#define x25519_fe_isnegative ring_core_0_17_14__x25519_fe_isnegative +#define x25519_fe_mul_ttt ring_core_0_17_14__x25519_fe_mul_ttt +#define x25519_fe_neg ring_core_0_17_14__x25519_fe_neg +#define x25519_fe_tobytes ring_core_0_17_14__x25519_fe_tobytes +#define x25519_ge_double_scalarmult_vartime ring_core_0_17_14__x25519_ge_double_scalarmult_vartime +#define x25519_ge_frombytes_vartime ring_core_0_17_14__x25519_ge_frombytes_vartime +#define x25519_ge_scalarmult_base ring_core_0_17_14__x25519_ge_scalarmult_base +#define x25519_ge_scalarmult_base_adx ring_core_0_17_14__x25519_ge_scalarmult_base_adx +#define x25519_public_from_private_generic_masked ring_core_0_17_14__x25519_public_from_private_generic_masked +#define x25519_sc_mask ring_core_0_17_14__x25519_sc_mask +#define x25519_sc_muladd ring_core_0_17_14__x25519_sc_muladd +#define x25519_sc_reduce ring_core_0_17_14__x25519_sc_reduce +#define x25519_scalar_mult_adx ring_core_0_17_14__x25519_scalar_mult_adx +#define x25519_scalar_mult_generic_masked ring_core_0_17_14__x25519_scalar_mult_generic_masked + +#endif +#endif diff --git a/ring-0.17.14/pregenerated/ring_core_generated/prefix_symbols_nasm.inc b/ring-0.17.14/pregenerated/ring_core_generated/prefix_symbols_nasm.inc new file mode 100644 index 0000000000..900abb03b5 --- /dev/null +++ b/ring-0.17.14/pregenerated/ring_core_generated/prefix_symbols_nasm.inc @@ -0,0 +1,334 @@ + +%ifndef ring_core_generated_PREFIX_SYMBOLS_NASM_INC +%define ring_core_generated_PREFIX_SYMBOLS_NASM_INC + +%ifidn __OUTPUT_FORMAT__,win32 +%define _ecp_nistz256_point_double _p256_point_double +%define _ecp_nistz256_point_add _p256_point_add +%define _ecp_nistz256_point_add_affine _p256_point_add_affine +%define _ecp_nistz256_ord_mul_mont _p256_scalar_mul_mont +%define _ecp_nistz256_ord_sqr_mont _p256_scalar_sqr_rep_mont +%define _ecp_nistz256_mul_mont _p256_mul_mont +%define _ecp_nistz256_sqr_mont _p256_sqr_mont +%define _adx_bmi2_available _ring_core_0_17_14__adx_bmi2_available +%define _avx2_available _ring_core_0_17_14__avx2_available +%define _CRYPTO_memcmp _ring_core_0_17_14__CRYPTO_memcmp +%define _CRYPTO_poly1305_finish _ring_core_0_17_14__CRYPTO_poly1305_finish +%define _CRYPTO_poly1305_finish_neon _ring_core_0_17_14__CRYPTO_poly1305_finish_neon +%define _CRYPTO_poly1305_init _ring_core_0_17_14__CRYPTO_poly1305_init +%define _CRYPTO_poly1305_init_neon _ring_core_0_17_14__CRYPTO_poly1305_init_neon +%define _CRYPTO_poly1305_update _ring_core_0_17_14__CRYPTO_poly1305_update +%define _CRYPTO_poly1305_update_neon _ring_core_0_17_14__CRYPTO_poly1305_update_neon +%define _ChaCha20_ctr32 _ring_core_0_17_14__ChaCha20_ctr32 +%define _ChaCha20_ctr32_avx2 _ring_core_0_17_14__ChaCha20_ctr32_avx2 +%define _ChaCha20_ctr32_neon _ring_core_0_17_14__ChaCha20_ctr32_neon +%define _ChaCha20_ctr32_nohw _ring_core_0_17_14__ChaCha20_ctr32_nohw +%define _ChaCha20_ctr32_ssse3 _ring_core_0_17_14__ChaCha20_ctr32_ssse3 +%define _ChaCha20_ctr32_ssse3_4x _ring_core_0_17_14__ChaCha20_ctr32_ssse3_4x +%define _LIMB_is_zero _ring_core_0_17_14__LIMB_is_zero +%define _LIMBS_add_mod _ring_core_0_17_14__LIMBS_add_mod +%define _LIMBS_are_zero _ring_core_0_17_14__LIMBS_are_zero +%define _LIMBS_equal _ring_core_0_17_14__LIMBS_equal +%define _LIMBS_less_than _ring_core_0_17_14__LIMBS_less_than +%define _LIMBS_reduce_once _ring_core_0_17_14__LIMBS_reduce_once +%define _LIMBS_select_512_32 _ring_core_0_17_14__LIMBS_select_512_32 +%define _LIMBS_shl_mod _ring_core_0_17_14__LIMBS_shl_mod +%define 
_LIMBS_sub_mod _ring_core_0_17_14__LIMBS_sub_mod +%define _LIMBS_window5_split_window _ring_core_0_17_14__LIMBS_window5_split_window +%define _LIMBS_window5_unsplit_window _ring_core_0_17_14__LIMBS_window5_unsplit_window +%define _LIMB_shr _ring_core_0_17_14__LIMB_shr +%define _OPENSSL_cpuid_setup _ring_core_0_17_14__OPENSSL_cpuid_setup +%define _aes_gcm_dec_kernel _ring_core_0_17_14__aes_gcm_dec_kernel +%define _aes_gcm_dec_update_vaes_avx2 _ring_core_0_17_14__aes_gcm_dec_update_vaes_avx2 +%define _aes_gcm_enc_kernel _ring_core_0_17_14__aes_gcm_enc_kernel +%define _aes_gcm_enc_update_vaes_avx2 _ring_core_0_17_14__aes_gcm_enc_update_vaes_avx2 +%define _aes_hw_ctr32_encrypt_blocks _ring_core_0_17_14__aes_hw_ctr32_encrypt_blocks +%define _aes_hw_set_encrypt_key _ring_core_0_17_14__aes_hw_set_encrypt_key +%define _aes_hw_set_encrypt_key_alt _ring_core_0_17_14__aes_hw_set_encrypt_key_alt +%define _aes_hw_set_encrypt_key_base _ring_core_0_17_14__aes_hw_set_encrypt_key_base +%define _aes_nohw_ctr32_encrypt_blocks _ring_core_0_17_14__aes_nohw_ctr32_encrypt_blocks +%define _aes_nohw_encrypt _ring_core_0_17_14__aes_nohw_encrypt +%define _aes_nohw_set_encrypt_key _ring_core_0_17_14__aes_nohw_set_encrypt_key +%define _aesni_gcm_decrypt _ring_core_0_17_14__aesni_gcm_decrypt +%define _aesni_gcm_encrypt _ring_core_0_17_14__aesni_gcm_encrypt +%define _bn_from_montgomery_in_place _ring_core_0_17_14__bn_from_montgomery_in_place +%define _bn_gather5 _ring_core_0_17_14__bn_gather5 +%define _bn_mul_mont _ring_core_0_17_14__bn_mul_mont +%define _bn_mul_mont_nohw _ring_core_0_17_14__bn_mul_mont_nohw +%define _bn_mul4x_mont _ring_core_0_17_14__bn_mul4x_mont +%define _bn_mulx4x_mont _ring_core_0_17_14__bn_mulx4x_mont +%define _bn_mul8x_mont_neon _ring_core_0_17_14__bn_mul8x_mont_neon +%define _bn_mul4x_mont_gather5 _ring_core_0_17_14__bn_mul4x_mont_gather5 +%define _bn_mulx4x_mont_gather5 _ring_core_0_17_14__bn_mulx4x_mont_gather5 +%define _bn_neg_inv_mod_r_u64 _ring_core_0_17_14__bn_neg_inv_mod_r_u64 +%define _bn_power5_nohw _ring_core_0_17_14__bn_power5_nohw +%define _bn_powerx5 _ring_core_0_17_14__bn_powerx5 +%define _bn_scatter5 _ring_core_0_17_14__bn_scatter5 +%define _bn_sqr8x_internal _ring_core_0_17_14__bn_sqr8x_internal +%define _bn_sqr8x_mont _ring_core_0_17_14__bn_sqr8x_mont +%define _bn_sqrx8x_internal _ring_core_0_17_14__bn_sqrx8x_internal +%define _bsaes_ctr32_encrypt_blocks _ring_core_0_17_14__bsaes_ctr32_encrypt_blocks +%define _bssl_constant_time_test_conditional_memcpy _ring_core_0_17_14__bssl_constant_time_test_conditional_memcpy +%define _bssl_constant_time_test_conditional_memxor _ring_core_0_17_14__bssl_constant_time_test_conditional_memxor +%define _bssl_constant_time_test_main _ring_core_0_17_14__bssl_constant_time_test_main +%define _chacha20_poly1305_open _ring_core_0_17_14__chacha20_poly1305_open +%define _chacha20_poly1305_open_avx2 _ring_core_0_17_14__chacha20_poly1305_open_avx2 +%define _chacha20_poly1305_open_sse41 _ring_core_0_17_14__chacha20_poly1305_open_sse41 +%define _chacha20_poly1305_seal _ring_core_0_17_14__chacha20_poly1305_seal +%define _chacha20_poly1305_seal_avx2 _ring_core_0_17_14__chacha20_poly1305_seal_avx2 +%define _chacha20_poly1305_seal_sse41 _ring_core_0_17_14__chacha20_poly1305_seal_sse41 +%define _ecp_nistz256_mul_mont_adx _ring_core_0_17_14__ecp_nistz256_mul_mont_adx +%define _ecp_nistz256_mul_mont_nohw _ring_core_0_17_14__ecp_nistz256_mul_mont_nohw +%define _ecp_nistz256_ord_mul_mont_adx _ring_core_0_17_14__ecp_nistz256_ord_mul_mont_adx +%define 
_ecp_nistz256_ord_mul_mont_nohw _ring_core_0_17_14__ecp_nistz256_ord_mul_mont_nohw +%define _ecp_nistz256_ord_sqr_mont_adx _ring_core_0_17_14__ecp_nistz256_ord_sqr_mont_adx +%define _ecp_nistz256_ord_sqr_mont_nohw _ring_core_0_17_14__ecp_nistz256_ord_sqr_mont_nohw +%define _ecp_nistz256_point_add_adx _ring_core_0_17_14__ecp_nistz256_point_add_adx +%define _ecp_nistz256_point_add_nohw _ring_core_0_17_14__ecp_nistz256_point_add_nohw +%define _ecp_nistz256_point_add_affine_adx _ring_core_0_17_14__ecp_nistz256_point_add_affine_adx +%define _ecp_nistz256_point_add_affine_nohw _ring_core_0_17_14__ecp_nistz256_point_add_affine_nohw +%define _ecp_nistz256_point_double_adx _ring_core_0_17_14__ecp_nistz256_point_double_adx +%define _ecp_nistz256_point_double_nohw _ring_core_0_17_14__ecp_nistz256_point_double_nohw +%define _ecp_nistz256_select_w5_avx2 _ring_core_0_17_14__ecp_nistz256_select_w5_avx2 +%define _ecp_nistz256_select_w5_nohw _ring_core_0_17_14__ecp_nistz256_select_w5_nohw +%define _ecp_nistz256_select_w7_avx2 _ring_core_0_17_14__ecp_nistz256_select_w7_avx2 +%define _ecp_nistz256_select_w7_nohw _ring_core_0_17_14__ecp_nistz256_select_w7_nohw +%define _ecp_nistz256_sqr_mont_adx _ring_core_0_17_14__ecp_nistz256_sqr_mont_adx +%define _ecp_nistz256_sqr_mont_nohw _ring_core_0_17_14__ecp_nistz256_sqr_mont_nohw +%define _fiat_curve25519_adx_mul _ring_core_0_17_14__fiat_curve25519_adx_mul +%define _fiat_curve25519_adx_square _ring_core_0_17_14__fiat_curve25519_adx_square +%define _gcm_ghash_avx _ring_core_0_17_14__gcm_ghash_avx +%define _gcm_ghash_clmul _ring_core_0_17_14__gcm_ghash_clmul +%define _gcm_ghash_neon _ring_core_0_17_14__gcm_ghash_neon +%define _gcm_ghash_vpclmulqdq_avx2_1 _ring_core_0_17_14__gcm_ghash_vpclmulqdq_avx2_1 +%define _gcm_gmult_clmul _ring_core_0_17_14__gcm_gmult_clmul +%define _gcm_gmult_neon _ring_core_0_17_14__gcm_gmult_neon +%define _gcm_init_avx _ring_core_0_17_14__gcm_init_avx +%define _gcm_init_clmul _ring_core_0_17_14__gcm_init_clmul +%define _gcm_init_neon _ring_core_0_17_14__gcm_init_neon +%define _gcm_init_vpclmulqdq_avx2 _ring_core_0_17_14__gcm_init_vpclmulqdq_avx2 +%define _k25519Precomp _ring_core_0_17_14__k25519Precomp +%define _limbs_mul_add_limb _ring_core_0_17_14__limbs_mul_add_limb +%define _little_endian_bytes_from_scalar _ring_core_0_17_14__little_endian_bytes_from_scalar +%define _ecp_nistz256_neg _ring_core_0_17_14__ecp_nistz256_neg +%define _ecp_nistz256_select_w5 _ring_core_0_17_14__ecp_nistz256_select_w5 +%define _ecp_nistz256_select_w7 _ring_core_0_17_14__ecp_nistz256_select_w7 +%define _neon_available _ring_core_0_17_14__neon_available +%define _p256_mul_mont _ring_core_0_17_14__p256_mul_mont +%define _p256_point_add _ring_core_0_17_14__p256_point_add +%define _p256_point_add_affine _ring_core_0_17_14__p256_point_add_affine +%define _p256_point_double _ring_core_0_17_14__p256_point_double +%define _p256_point_mul _ring_core_0_17_14__p256_point_mul +%define _p256_point_mul_base _ring_core_0_17_14__p256_point_mul_base +%define _p256_point_mul_base_vartime _ring_core_0_17_14__p256_point_mul_base_vartime +%define _p256_scalar_mul_mont _ring_core_0_17_14__p256_scalar_mul_mont +%define _p256_scalar_sqr_rep_mont _ring_core_0_17_14__p256_scalar_sqr_rep_mont +%define _p256_sqr_mont _ring_core_0_17_14__p256_sqr_mont +%define _p384_elem_div_by_2 _ring_core_0_17_14__p384_elem_div_by_2 +%define _p384_elem_mul_mont _ring_core_0_17_14__p384_elem_mul_mont +%define _p384_elem_neg _ring_core_0_17_14__p384_elem_neg +%define _p384_elem_sub 
_ring_core_0_17_14__p384_elem_sub +%define _p384_point_add _ring_core_0_17_14__p384_point_add +%define _p384_point_double _ring_core_0_17_14__p384_point_double +%define _p384_point_mul _ring_core_0_17_14__p384_point_mul +%define _p384_scalar_mul_mont _ring_core_0_17_14__p384_scalar_mul_mont +%define _openssl_poly1305_neon2_addmulmod _ring_core_0_17_14__openssl_poly1305_neon2_addmulmod +%define _openssl_poly1305_neon2_blocks _ring_core_0_17_14__openssl_poly1305_neon2_blocks +%define _sha256_block_data_order _ring_core_0_17_14__sha256_block_data_order +%define _sha256_block_data_order_avx _ring_core_0_17_14__sha256_block_data_order_avx +%define _sha256_block_data_order_ssse3 _ring_core_0_17_14__sha256_block_data_order_ssse3 +%define _sha256_block_data_order_hw _ring_core_0_17_14__sha256_block_data_order_hw +%define _sha256_block_data_order_neon _ring_core_0_17_14__sha256_block_data_order_neon +%define _sha256_block_data_order_nohw _ring_core_0_17_14__sha256_block_data_order_nohw +%define _sha512_block_data_order _ring_core_0_17_14__sha512_block_data_order +%define _sha512_block_data_order_avx _ring_core_0_17_14__sha512_block_data_order_avx +%define _sha512_block_data_order_hw _ring_core_0_17_14__sha512_block_data_order_hw +%define _sha512_block_data_order_neon _ring_core_0_17_14__sha512_block_data_order_neon +%define _sha512_block_data_order_nohw _ring_core_0_17_14__sha512_block_data_order_nohw +%define _vpaes_ctr32_encrypt_blocks _ring_core_0_17_14__vpaes_ctr32_encrypt_blocks +%define _vpaes_encrypt _ring_core_0_17_14__vpaes_encrypt +%define _vpaes_encrypt_key_to_bsaes _ring_core_0_17_14__vpaes_encrypt_key_to_bsaes +%define _vpaes_set_encrypt_key _ring_core_0_17_14__vpaes_set_encrypt_key +%define _x25519_NEON _ring_core_0_17_14__x25519_NEON +%define _x25519_fe_invert _ring_core_0_17_14__x25519_fe_invert +%define _x25519_fe_isnegative _ring_core_0_17_14__x25519_fe_isnegative +%define _x25519_fe_mul_ttt _ring_core_0_17_14__x25519_fe_mul_ttt +%define _x25519_fe_neg _ring_core_0_17_14__x25519_fe_neg +%define _x25519_fe_tobytes _ring_core_0_17_14__x25519_fe_tobytes +%define _x25519_ge_double_scalarmult_vartime _ring_core_0_17_14__x25519_ge_double_scalarmult_vartime +%define _x25519_ge_frombytes_vartime _ring_core_0_17_14__x25519_ge_frombytes_vartime +%define _x25519_ge_scalarmult_base _ring_core_0_17_14__x25519_ge_scalarmult_base +%define _x25519_ge_scalarmult_base_adx _ring_core_0_17_14__x25519_ge_scalarmult_base_adx +%define _x25519_public_from_private_generic_masked _ring_core_0_17_14__x25519_public_from_private_generic_masked +%define _x25519_sc_mask _ring_core_0_17_14__x25519_sc_mask +%define _x25519_sc_muladd _ring_core_0_17_14__x25519_sc_muladd +%define _x25519_sc_reduce _ring_core_0_17_14__x25519_sc_reduce +%define _x25519_scalar_mult_adx _ring_core_0_17_14__x25519_scalar_mult_adx +%define _x25519_scalar_mult_generic_masked _ring_core_0_17_14__x25519_scalar_mult_generic_masked + +%else +%define ecp_nistz256_point_double p256_point_double +%define ecp_nistz256_point_add p256_point_add +%define ecp_nistz256_point_add_affine p256_point_add_affine +%define ecp_nistz256_ord_mul_mont p256_scalar_mul_mont +%define ecp_nistz256_ord_sqr_mont p256_scalar_sqr_rep_mont +%define ecp_nistz256_mul_mont p256_mul_mont +%define ecp_nistz256_sqr_mont p256_sqr_mont +%define adx_bmi2_available ring_core_0_17_14__adx_bmi2_available +%define avx2_available ring_core_0_17_14__avx2_available +%define CRYPTO_memcmp ring_core_0_17_14__CRYPTO_memcmp +%define CRYPTO_poly1305_finish 
ring_core_0_17_14__CRYPTO_poly1305_finish +%define CRYPTO_poly1305_finish_neon ring_core_0_17_14__CRYPTO_poly1305_finish_neon +%define CRYPTO_poly1305_init ring_core_0_17_14__CRYPTO_poly1305_init +%define CRYPTO_poly1305_init_neon ring_core_0_17_14__CRYPTO_poly1305_init_neon +%define CRYPTO_poly1305_update ring_core_0_17_14__CRYPTO_poly1305_update +%define CRYPTO_poly1305_update_neon ring_core_0_17_14__CRYPTO_poly1305_update_neon +%define ChaCha20_ctr32 ring_core_0_17_14__ChaCha20_ctr32 +%define ChaCha20_ctr32_avx2 ring_core_0_17_14__ChaCha20_ctr32_avx2 +%define ChaCha20_ctr32_neon ring_core_0_17_14__ChaCha20_ctr32_neon +%define ChaCha20_ctr32_nohw ring_core_0_17_14__ChaCha20_ctr32_nohw +%define ChaCha20_ctr32_ssse3 ring_core_0_17_14__ChaCha20_ctr32_ssse3 +%define ChaCha20_ctr32_ssse3_4x ring_core_0_17_14__ChaCha20_ctr32_ssse3_4x +%define LIMB_is_zero ring_core_0_17_14__LIMB_is_zero +%define LIMBS_add_mod ring_core_0_17_14__LIMBS_add_mod +%define LIMBS_are_zero ring_core_0_17_14__LIMBS_are_zero +%define LIMBS_equal ring_core_0_17_14__LIMBS_equal +%define LIMBS_less_than ring_core_0_17_14__LIMBS_less_than +%define LIMBS_reduce_once ring_core_0_17_14__LIMBS_reduce_once +%define LIMBS_select_512_32 ring_core_0_17_14__LIMBS_select_512_32 +%define LIMBS_shl_mod ring_core_0_17_14__LIMBS_shl_mod +%define LIMBS_sub_mod ring_core_0_17_14__LIMBS_sub_mod +%define LIMBS_window5_split_window ring_core_0_17_14__LIMBS_window5_split_window +%define LIMBS_window5_unsplit_window ring_core_0_17_14__LIMBS_window5_unsplit_window +%define LIMB_shr ring_core_0_17_14__LIMB_shr +%define OPENSSL_cpuid_setup ring_core_0_17_14__OPENSSL_cpuid_setup +%define aes_gcm_dec_kernel ring_core_0_17_14__aes_gcm_dec_kernel +%define aes_gcm_dec_update_vaes_avx2 ring_core_0_17_14__aes_gcm_dec_update_vaes_avx2 +%define aes_gcm_enc_kernel ring_core_0_17_14__aes_gcm_enc_kernel +%define aes_gcm_enc_update_vaes_avx2 ring_core_0_17_14__aes_gcm_enc_update_vaes_avx2 +%define aes_hw_ctr32_encrypt_blocks ring_core_0_17_14__aes_hw_ctr32_encrypt_blocks +%define aes_hw_set_encrypt_key ring_core_0_17_14__aes_hw_set_encrypt_key +%define aes_hw_set_encrypt_key_alt ring_core_0_17_14__aes_hw_set_encrypt_key_alt +%define aes_hw_set_encrypt_key_base ring_core_0_17_14__aes_hw_set_encrypt_key_base +%define aes_nohw_ctr32_encrypt_blocks ring_core_0_17_14__aes_nohw_ctr32_encrypt_blocks +%define aes_nohw_encrypt ring_core_0_17_14__aes_nohw_encrypt +%define aes_nohw_set_encrypt_key ring_core_0_17_14__aes_nohw_set_encrypt_key +%define aesni_gcm_decrypt ring_core_0_17_14__aesni_gcm_decrypt +%define aesni_gcm_encrypt ring_core_0_17_14__aesni_gcm_encrypt +%define bn_from_montgomery_in_place ring_core_0_17_14__bn_from_montgomery_in_place +%define bn_gather5 ring_core_0_17_14__bn_gather5 +%define bn_mul_mont ring_core_0_17_14__bn_mul_mont +%define bn_mul_mont_nohw ring_core_0_17_14__bn_mul_mont_nohw +%define bn_mul4x_mont ring_core_0_17_14__bn_mul4x_mont +%define bn_mulx4x_mont ring_core_0_17_14__bn_mulx4x_mont +%define bn_mul8x_mont_neon ring_core_0_17_14__bn_mul8x_mont_neon +%define bn_mul4x_mont_gather5 ring_core_0_17_14__bn_mul4x_mont_gather5 +%define bn_mulx4x_mont_gather5 ring_core_0_17_14__bn_mulx4x_mont_gather5 +%define bn_neg_inv_mod_r_u64 ring_core_0_17_14__bn_neg_inv_mod_r_u64 +%define bn_power5_nohw ring_core_0_17_14__bn_power5_nohw +%define bn_powerx5 ring_core_0_17_14__bn_powerx5 +%define bn_scatter5 ring_core_0_17_14__bn_scatter5 +%define bn_sqr8x_internal ring_core_0_17_14__bn_sqr8x_internal +%define bn_sqr8x_mont 
ring_core_0_17_14__bn_sqr8x_mont +%define bn_sqrx8x_internal ring_core_0_17_14__bn_sqrx8x_internal +%define bsaes_ctr32_encrypt_blocks ring_core_0_17_14__bsaes_ctr32_encrypt_blocks +%define bssl_constant_time_test_conditional_memcpy ring_core_0_17_14__bssl_constant_time_test_conditional_memcpy +%define bssl_constant_time_test_conditional_memxor ring_core_0_17_14__bssl_constant_time_test_conditional_memxor +%define bssl_constant_time_test_main ring_core_0_17_14__bssl_constant_time_test_main +%define chacha20_poly1305_open ring_core_0_17_14__chacha20_poly1305_open +%define chacha20_poly1305_open_avx2 ring_core_0_17_14__chacha20_poly1305_open_avx2 +%define chacha20_poly1305_open_sse41 ring_core_0_17_14__chacha20_poly1305_open_sse41 +%define chacha20_poly1305_seal ring_core_0_17_14__chacha20_poly1305_seal +%define chacha20_poly1305_seal_avx2 ring_core_0_17_14__chacha20_poly1305_seal_avx2 +%define chacha20_poly1305_seal_sse41 ring_core_0_17_14__chacha20_poly1305_seal_sse41 +%define ecp_nistz256_mul_mont_adx ring_core_0_17_14__ecp_nistz256_mul_mont_adx +%define ecp_nistz256_mul_mont_nohw ring_core_0_17_14__ecp_nistz256_mul_mont_nohw +%define ecp_nistz256_ord_mul_mont_adx ring_core_0_17_14__ecp_nistz256_ord_mul_mont_adx +%define ecp_nistz256_ord_mul_mont_nohw ring_core_0_17_14__ecp_nistz256_ord_mul_mont_nohw +%define ecp_nistz256_ord_sqr_mont_adx ring_core_0_17_14__ecp_nistz256_ord_sqr_mont_adx +%define ecp_nistz256_ord_sqr_mont_nohw ring_core_0_17_14__ecp_nistz256_ord_sqr_mont_nohw +%define ecp_nistz256_point_add_adx ring_core_0_17_14__ecp_nistz256_point_add_adx +%define ecp_nistz256_point_add_nohw ring_core_0_17_14__ecp_nistz256_point_add_nohw +%define ecp_nistz256_point_add_affine_adx ring_core_0_17_14__ecp_nistz256_point_add_affine_adx +%define ecp_nistz256_point_add_affine_nohw ring_core_0_17_14__ecp_nistz256_point_add_affine_nohw +%define ecp_nistz256_point_double_adx ring_core_0_17_14__ecp_nistz256_point_double_adx +%define ecp_nistz256_point_double_nohw ring_core_0_17_14__ecp_nistz256_point_double_nohw +%define ecp_nistz256_select_w5_avx2 ring_core_0_17_14__ecp_nistz256_select_w5_avx2 +%define ecp_nistz256_select_w5_nohw ring_core_0_17_14__ecp_nistz256_select_w5_nohw +%define ecp_nistz256_select_w7_avx2 ring_core_0_17_14__ecp_nistz256_select_w7_avx2 +%define ecp_nistz256_select_w7_nohw ring_core_0_17_14__ecp_nistz256_select_w7_nohw +%define ecp_nistz256_sqr_mont_adx ring_core_0_17_14__ecp_nistz256_sqr_mont_adx +%define ecp_nistz256_sqr_mont_nohw ring_core_0_17_14__ecp_nistz256_sqr_mont_nohw +%define fiat_curve25519_adx_mul ring_core_0_17_14__fiat_curve25519_adx_mul +%define fiat_curve25519_adx_square ring_core_0_17_14__fiat_curve25519_adx_square +%define gcm_ghash_avx ring_core_0_17_14__gcm_ghash_avx +%define gcm_ghash_clmul ring_core_0_17_14__gcm_ghash_clmul +%define gcm_ghash_neon ring_core_0_17_14__gcm_ghash_neon +%define gcm_ghash_vpclmulqdq_avx2_1 ring_core_0_17_14__gcm_ghash_vpclmulqdq_avx2_1 +%define gcm_gmult_clmul ring_core_0_17_14__gcm_gmult_clmul +%define gcm_gmult_neon ring_core_0_17_14__gcm_gmult_neon +%define gcm_init_avx ring_core_0_17_14__gcm_init_avx +%define gcm_init_clmul ring_core_0_17_14__gcm_init_clmul +%define gcm_init_neon ring_core_0_17_14__gcm_init_neon +%define gcm_init_vpclmulqdq_avx2 ring_core_0_17_14__gcm_init_vpclmulqdq_avx2 +%define k25519Precomp ring_core_0_17_14__k25519Precomp +%define limbs_mul_add_limb ring_core_0_17_14__limbs_mul_add_limb +%define little_endian_bytes_from_scalar ring_core_0_17_14__little_endian_bytes_from_scalar +%define 
ecp_nistz256_neg ring_core_0_17_14__ecp_nistz256_neg +%define ecp_nistz256_select_w5 ring_core_0_17_14__ecp_nistz256_select_w5 +%define ecp_nistz256_select_w7 ring_core_0_17_14__ecp_nistz256_select_w7 +%define neon_available ring_core_0_17_14__neon_available +%define p256_mul_mont ring_core_0_17_14__p256_mul_mont +%define p256_point_add ring_core_0_17_14__p256_point_add +%define p256_point_add_affine ring_core_0_17_14__p256_point_add_affine +%define p256_point_double ring_core_0_17_14__p256_point_double +%define p256_point_mul ring_core_0_17_14__p256_point_mul +%define p256_point_mul_base ring_core_0_17_14__p256_point_mul_base +%define p256_point_mul_base_vartime ring_core_0_17_14__p256_point_mul_base_vartime +%define p256_scalar_mul_mont ring_core_0_17_14__p256_scalar_mul_mont +%define p256_scalar_sqr_rep_mont ring_core_0_17_14__p256_scalar_sqr_rep_mont +%define p256_sqr_mont ring_core_0_17_14__p256_sqr_mont +%define p384_elem_div_by_2 ring_core_0_17_14__p384_elem_div_by_2 +%define p384_elem_mul_mont ring_core_0_17_14__p384_elem_mul_mont +%define p384_elem_neg ring_core_0_17_14__p384_elem_neg +%define p384_elem_sub ring_core_0_17_14__p384_elem_sub +%define p384_point_add ring_core_0_17_14__p384_point_add +%define p384_point_double ring_core_0_17_14__p384_point_double +%define p384_point_mul ring_core_0_17_14__p384_point_mul +%define p384_scalar_mul_mont ring_core_0_17_14__p384_scalar_mul_mont +%define openssl_poly1305_neon2_addmulmod ring_core_0_17_14__openssl_poly1305_neon2_addmulmod +%define openssl_poly1305_neon2_blocks ring_core_0_17_14__openssl_poly1305_neon2_blocks +%define sha256_block_data_order ring_core_0_17_14__sha256_block_data_order +%define sha256_block_data_order_avx ring_core_0_17_14__sha256_block_data_order_avx +%define sha256_block_data_order_ssse3 ring_core_0_17_14__sha256_block_data_order_ssse3 +%define sha256_block_data_order_hw ring_core_0_17_14__sha256_block_data_order_hw +%define sha256_block_data_order_neon ring_core_0_17_14__sha256_block_data_order_neon +%define sha256_block_data_order_nohw ring_core_0_17_14__sha256_block_data_order_nohw +%define sha512_block_data_order ring_core_0_17_14__sha512_block_data_order +%define sha512_block_data_order_avx ring_core_0_17_14__sha512_block_data_order_avx +%define sha512_block_data_order_hw ring_core_0_17_14__sha512_block_data_order_hw +%define sha512_block_data_order_neon ring_core_0_17_14__sha512_block_data_order_neon +%define sha512_block_data_order_nohw ring_core_0_17_14__sha512_block_data_order_nohw +%define vpaes_ctr32_encrypt_blocks ring_core_0_17_14__vpaes_ctr32_encrypt_blocks +%define vpaes_encrypt ring_core_0_17_14__vpaes_encrypt +%define vpaes_encrypt_key_to_bsaes ring_core_0_17_14__vpaes_encrypt_key_to_bsaes +%define vpaes_set_encrypt_key ring_core_0_17_14__vpaes_set_encrypt_key +%define x25519_NEON ring_core_0_17_14__x25519_NEON +%define x25519_fe_invert ring_core_0_17_14__x25519_fe_invert +%define x25519_fe_isnegative ring_core_0_17_14__x25519_fe_isnegative +%define x25519_fe_mul_ttt ring_core_0_17_14__x25519_fe_mul_ttt +%define x25519_fe_neg ring_core_0_17_14__x25519_fe_neg +%define x25519_fe_tobytes ring_core_0_17_14__x25519_fe_tobytes +%define x25519_ge_double_scalarmult_vartime ring_core_0_17_14__x25519_ge_double_scalarmult_vartime +%define x25519_ge_frombytes_vartime ring_core_0_17_14__x25519_ge_frombytes_vartime +%define x25519_ge_scalarmult_base ring_core_0_17_14__x25519_ge_scalarmult_base +%define x25519_ge_scalarmult_base_adx ring_core_0_17_14__x25519_ge_scalarmult_base_adx +%define 
x25519_public_from_private_generic_masked ring_core_0_17_14__x25519_public_from_private_generic_masked +%define x25519_sc_mask ring_core_0_17_14__x25519_sc_mask +%define x25519_sc_muladd ring_core_0_17_14__x25519_sc_muladd +%define x25519_sc_reduce ring_core_0_17_14__x25519_sc_reduce +%define x25519_scalar_mult_adx ring_core_0_17_14__x25519_scalar_mult_adx +%define x25519_scalar_mult_generic_masked ring_core_0_17_14__x25519_scalar_mult_generic_masked + +%endif +%endif diff --git a/ring-0.17.14/pregenerated/sha256-armv4-linux32.S b/ring-0.17.14/pregenerated/sha256-armv4-linux32.S new file mode 100644 index 0000000000..b8b33352df --- /dev/null +++ b/ring-0.17.14/pregenerated/sha256-armv4-linux32.S @@ -0,0 +1,2678 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__ELF__) +@ Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved. +@ +@ Licensed under the Apache License, Version 2.0 (the "License"); +@ you may not use this file except in compliance with the License. +@ You may obtain a copy of the License at +@ +@ https://www.apache.org/licenses/LICENSE-2.0 +@ +@ Unless required by applicable law or agreed to in writing, software +@ distributed under the License is distributed on an "AS IS" BASIS, +@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +@ See the License for the specific language governing permissions and +@ limitations under the License. + + +@ ==================================================================== +@ Written by Andy Polyakov for the OpenSSL +@ project. +@ ==================================================================== + +@ SHA256 block procedure for ARMv4. May 2007. + +@ Performance is ~2x better than gcc 3.4 generated code and in "abso- +@ lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per +@ byte [on single-issue Xscale PXA250 core]. + +@ July 2010. +@ +@ Rescheduling for dual-issue pipeline resulted in 22% improvement on +@ Cortex A8 core and ~20 cycles per processed byte. + +@ February 2011. +@ +@ Profiler-assisted and platform-specific optimization resulted in 16% +@ improvement on Cortex A8 core and ~15.4 cycles per processed byte. + +@ September 2013. +@ +@ Add NEON implementation. On Cortex A8 it was measured to process one +@ byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon +@ S4 does it in 12.5 cycles too, but it's 50% faster than integer-only +@ code (meaning that latter performs sub-optimally, nothing was done +@ about it). + +@ May 2014. +@ +@ Add ARMv8 code path performing at 2.0 cpb on Apple A7. + +#ifdef __KERNEL__ +# define __ARM_ARCH __LINUX_ARM_ARCH__ +# define __ARM_MAX_ARCH__ 7 +#endif + +@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both +@ ARMv7 and ARMv8 processors. It does have ARMv8-only code, but those +@ instructions are manually-encoded. (See unsha256.) 
+.arch armv7-a + +.text +#if defined(__thumb2__) +.syntax unified +.thumb +#else +.code 32 +#endif + +.type K256,%object +.align 5 +K256: +.word 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.word 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.word 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.word 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.word 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.word 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.word 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.word 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.word 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.word 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.word 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.word 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.word 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.word 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.word 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 +.size K256,.-K256 +.word 0 @ terminator +.align 5 + +.globl sha256_block_data_order_nohw +.hidden sha256_block_data_order_nohw +.type sha256_block_data_order_nohw,%function +sha256_block_data_order_nohw: + add r2,r1,r2,lsl#6 @ len to point at the end of inp + stmdb sp!,{r0,r1,r2,r4-r11,lr} + ldmia r0,{r4,r5,r6,r7,r8,r9,r10,r11} + adr r14,K256 + sub sp,sp,#16*4 @ alloca(X[16]) +.Loop: +# if __ARM_ARCH>=7 + ldr r2,[r1],#4 +# else + ldrb r2,[r1,#3] +# endif + eor r3,r5,r6 @ magic + eor r12,r12,r12 +#if __ARM_ARCH>=7 + @ ldr r2,[r1],#4 @ 0 +# if 0==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r8,r8,ror#5 + add r4,r4,r12 @ h+=Maj(a,b,c) from the past + eor r0,r0,r8,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 0 + add r4,r4,r12 @ h+=Maj(a,b,c) from the past + ldrb r12,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r12,lsl#8 + ldrb r12,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 0==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r8,r8,ror#5 + orr r2,r2,r12,lsl#24 + eor r0,r0,r8,ror#19 @ Sigma1(e) +#endif + ldr r12,[r14],#4 @ *K256++ + add r11,r11,r2 @ h+=X[i] + str r2,[sp,#0*4] + eor r2,r9,r10 + add r11,r11,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r8 + add r11,r11,r12 @ h+=K256[i] + eor r2,r2,r10 @ Ch(e,f,g) + eor r0,r4,r4,ror#11 + add r11,r11,r2 @ h+=Ch(e,f,g) +#if 0==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 0<15 +# if __ARM_ARCH>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r4,r5 @ a^b, b^c in next round +#else + ldr r2,[sp,#2*4] @ from future BODY_16_xx + eor r12,r4,r5 @ a^b, b^c in next round + ldr r1,[sp,#15*4] @ from future BODY_16_xx +#endif + eor r0,r0,r4,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r7,r7,r11 @ d+=h + eor r3,r3,r5 @ Maj(a,b,c) + add r11,r11,r0,ror#2 @ h+=Sigma0(a) + @ add r11,r11,r3 @ h+=Maj(a,b,c) +#if __ARM_ARCH>=7 + @ ldr r2,[r1],#4 @ 1 +# if 1==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r7,r7,ror#5 + add r11,r11,r3 @ h+=Maj(a,b,c) from the past + eor r0,r0,r7,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 1 + add r11,r11,r3 @ h+=Maj(a,b,c) from the past + ldrb r3,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r3,lsl#8 + ldrb r3,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 1==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r7,r7,ror#5 + orr r2,r2,r3,lsl#24 + eor r0,r0,r7,ror#19 @ Sigma1(e) +#endif + ldr r3,[r14],#4 @ *K256++ + add r10,r10,r2 @ h+=X[i] + str r2,[sp,#1*4] + eor r2,r8,r9 + add r10,r10,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r7 + add r10,r10,r3 @ h+=K256[i] + eor r2,r2,r9 @ Ch(e,f,g) + eor r0,r11,r11,ror#11 + add r10,r10,r2 @ h+=Ch(e,f,g) +#if 1==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 1<15 +# if __ARM_ARCH>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r11,r4 @ a^b, b^c in next round +#else + ldr r2,[sp,#3*4] @ from future BODY_16_xx + eor r3,r11,r4 @ a^b, b^c in next round + ldr r1,[sp,#0*4] @ from future BODY_16_xx +#endif + eor r0,r0,r11,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r6,r6,r10 @ d+=h + eor r12,r12,r4 @ Maj(a,b,c) + add r10,r10,r0,ror#2 @ h+=Sigma0(a) + @ add r10,r10,r12 @ h+=Maj(a,b,c) +#if __ARM_ARCH>=7 + @ ldr r2,[r1],#4 @ 2 +# if 2==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r6,r6,ror#5 + add r10,r10,r12 @ h+=Maj(a,b,c) from the past + eor r0,r0,r6,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 2 + add r10,r10,r12 @ h+=Maj(a,b,c) from the past + ldrb r12,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r12,lsl#8 + ldrb r12,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 2==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r6,r6,ror#5 + orr r2,r2,r12,lsl#24 + eor r0,r0,r6,ror#19 @ Sigma1(e) +#endif + ldr r12,[r14],#4 @ *K256++ + add r9,r9,r2 @ h+=X[i] + str r2,[sp,#2*4] + eor r2,r7,r8 + add r9,r9,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r6 + add r9,r9,r12 @ h+=K256[i] + eor r2,r2,r8 @ Ch(e,f,g) + eor r0,r10,r10,ror#11 + add r9,r9,r2 @ h+=Ch(e,f,g) +#if 2==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 2<15 +# if __ARM_ARCH>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r10,r11 @ a^b, b^c in next round +#else + ldr r2,[sp,#4*4] @ from future BODY_16_xx + eor r12,r10,r11 @ a^b, b^c in next round + ldr r1,[sp,#1*4] @ from future BODY_16_xx +#endif + eor r0,r0,r10,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r5,r5,r9 @ d+=h + eor r3,r3,r11 @ Maj(a,b,c) + add r9,r9,r0,ror#2 @ h+=Sigma0(a) + @ add r9,r9,r3 @ h+=Maj(a,b,c) +#if __ARM_ARCH>=7 + @ ldr r2,[r1],#4 @ 3 +# if 3==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r5,r5,ror#5 + add r9,r9,r3 @ h+=Maj(a,b,c) from the past + eor r0,r0,r5,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 3 + add r9,r9,r3 @ h+=Maj(a,b,c) from the past + ldrb r3,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r3,lsl#8 + ldrb r3,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 3==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r5,r5,ror#5 + orr r2,r2,r3,lsl#24 + eor r0,r0,r5,ror#19 @ Sigma1(e) +#endif + ldr r3,[r14],#4 @ *K256++ + add r8,r8,r2 @ h+=X[i] + str r2,[sp,#3*4] + eor r2,r6,r7 + add r8,r8,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r5 + add r8,r8,r3 @ h+=K256[i] + eor r2,r2,r7 @ Ch(e,f,g) + eor r0,r9,r9,ror#11 + add r8,r8,r2 @ h+=Ch(e,f,g) +#if 3==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 3<15 +# if __ARM_ARCH>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r9,r10 @ a^b, b^c in next round +#else + ldr r2,[sp,#5*4] @ from future BODY_16_xx + eor r3,r9,r10 @ a^b, b^c in next round + ldr r1,[sp,#2*4] @ from future BODY_16_xx +#endif + eor r0,r0,r9,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r4,r4,r8 @ d+=h + eor r12,r12,r10 @ Maj(a,b,c) + add r8,r8,r0,ror#2 @ h+=Sigma0(a) + @ add r8,r8,r12 @ h+=Maj(a,b,c) +#if __ARM_ARCH>=7 + @ ldr r2,[r1],#4 @ 4 +# if 4==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r4,r4,ror#5 + add r8,r8,r12 @ h+=Maj(a,b,c) from the past + eor r0,r0,r4,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 4 + add r8,r8,r12 @ h+=Maj(a,b,c) from the past + ldrb r12,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r12,lsl#8 + ldrb r12,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 4==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r4,r4,ror#5 + orr r2,r2,r12,lsl#24 + eor r0,r0,r4,ror#19 @ Sigma1(e) +#endif + ldr r12,[r14],#4 @ *K256++ + add r7,r7,r2 @ h+=X[i] + str r2,[sp,#4*4] + eor r2,r5,r6 + add r7,r7,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r4 + add r7,r7,r12 @ h+=K256[i] + eor r2,r2,r6 @ Ch(e,f,g) + eor r0,r8,r8,ror#11 + add r7,r7,r2 @ h+=Ch(e,f,g) +#if 4==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 4<15 +# if __ARM_ARCH>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r8,r9 @ a^b, b^c in next round +#else + ldr r2,[sp,#6*4] @ from future BODY_16_xx + eor r12,r8,r9 @ a^b, b^c in next round + ldr r1,[sp,#3*4] @ from future BODY_16_xx +#endif + eor r0,r0,r8,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r11,r11,r7 @ d+=h + eor r3,r3,r9 @ Maj(a,b,c) + add r7,r7,r0,ror#2 @ h+=Sigma0(a) + @ add r7,r7,r3 @ h+=Maj(a,b,c) +#if __ARM_ARCH>=7 + @ ldr r2,[r1],#4 @ 5 +# if 5==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r11,r11,ror#5 + add r7,r7,r3 @ h+=Maj(a,b,c) from the past + eor r0,r0,r11,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 5 + add r7,r7,r3 @ h+=Maj(a,b,c) from the past + ldrb r3,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r3,lsl#8 + ldrb r3,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 5==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r11,r11,ror#5 + orr r2,r2,r3,lsl#24 + eor r0,r0,r11,ror#19 @ Sigma1(e) +#endif + ldr r3,[r14],#4 @ *K256++ + add r6,r6,r2 @ h+=X[i] + str r2,[sp,#5*4] + eor r2,r4,r5 + add r6,r6,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r11 + add r6,r6,r3 @ h+=K256[i] + eor r2,r2,r5 @ Ch(e,f,g) + eor r0,r7,r7,ror#11 + add r6,r6,r2 @ h+=Ch(e,f,g) +#if 5==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 5<15 +# if __ARM_ARCH>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r7,r8 @ a^b, b^c in next round +#else + ldr r2,[sp,#7*4] @ from future BODY_16_xx + eor r3,r7,r8 @ a^b, b^c in next round + ldr r1,[sp,#4*4] @ from future BODY_16_xx +#endif + eor r0,r0,r7,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r10,r10,r6 @ d+=h + eor r12,r12,r8 @ Maj(a,b,c) + add r6,r6,r0,ror#2 @ h+=Sigma0(a) + @ add r6,r6,r12 @ h+=Maj(a,b,c) +#if __ARM_ARCH>=7 + @ ldr r2,[r1],#4 @ 6 +# if 6==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r10,r10,ror#5 + add r6,r6,r12 @ h+=Maj(a,b,c) from the past + eor r0,r0,r10,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 6 + add r6,r6,r12 @ h+=Maj(a,b,c) from the past + ldrb r12,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r12,lsl#8 + ldrb r12,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 6==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r10,r10,ror#5 + orr r2,r2,r12,lsl#24 + eor r0,r0,r10,ror#19 @ Sigma1(e) +#endif + ldr r12,[r14],#4 @ *K256++ + add r5,r5,r2 @ h+=X[i] + str r2,[sp,#6*4] + eor r2,r11,r4 + add r5,r5,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r10 + add r5,r5,r12 @ h+=K256[i] + eor r2,r2,r4 @ Ch(e,f,g) + eor r0,r6,r6,ror#11 + add r5,r5,r2 @ h+=Ch(e,f,g) +#if 6==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 6<15 +# if __ARM_ARCH>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r6,r7 @ a^b, b^c in next round +#else + ldr r2,[sp,#8*4] @ from future BODY_16_xx + eor r12,r6,r7 @ a^b, b^c in next round + ldr r1,[sp,#5*4] @ from future BODY_16_xx +#endif + eor r0,r0,r6,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r9,r9,r5 @ d+=h + eor r3,r3,r7 @ Maj(a,b,c) + add r5,r5,r0,ror#2 @ h+=Sigma0(a) + @ add r5,r5,r3 @ h+=Maj(a,b,c) +#if __ARM_ARCH>=7 + @ ldr r2,[r1],#4 @ 7 +# if 7==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r9,r9,ror#5 + add r5,r5,r3 @ h+=Maj(a,b,c) from the past + eor r0,r0,r9,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 7 + add r5,r5,r3 @ h+=Maj(a,b,c) from the past + ldrb r3,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r3,lsl#8 + ldrb r3,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 7==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r9,r9,ror#5 + orr r2,r2,r3,lsl#24 + eor r0,r0,r9,ror#19 @ Sigma1(e) +#endif + ldr r3,[r14],#4 @ *K256++ + add r4,r4,r2 @ h+=X[i] + str r2,[sp,#7*4] + eor r2,r10,r11 + add r4,r4,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r9 + add r4,r4,r3 @ h+=K256[i] + eor r2,r2,r11 @ Ch(e,f,g) + eor r0,r5,r5,ror#11 + add r4,r4,r2 @ h+=Ch(e,f,g) +#if 7==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 7<15 +# if __ARM_ARCH>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r5,r6 @ a^b, b^c in next round +#else + ldr r2,[sp,#9*4] @ from future BODY_16_xx + eor r3,r5,r6 @ a^b, b^c in next round + ldr r1,[sp,#6*4] @ from future BODY_16_xx +#endif + eor r0,r0,r5,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r8,r8,r4 @ d+=h + eor r12,r12,r6 @ Maj(a,b,c) + add r4,r4,r0,ror#2 @ h+=Sigma0(a) + @ add r4,r4,r12 @ h+=Maj(a,b,c) +#if __ARM_ARCH>=7 + @ ldr r2,[r1],#4 @ 8 +# if 8==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r8,r8,ror#5 + add r4,r4,r12 @ h+=Maj(a,b,c) from the past + eor r0,r0,r8,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 8 + add r4,r4,r12 @ h+=Maj(a,b,c) from the past + ldrb r12,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r12,lsl#8 + ldrb r12,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 8==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r8,r8,ror#5 + orr r2,r2,r12,lsl#24 + eor r0,r0,r8,ror#19 @ Sigma1(e) +#endif + ldr r12,[r14],#4 @ *K256++ + add r11,r11,r2 @ h+=X[i] + str r2,[sp,#8*4] + eor r2,r9,r10 + add r11,r11,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r8 + add r11,r11,r12 @ h+=K256[i] + eor r2,r2,r10 @ Ch(e,f,g) + eor r0,r4,r4,ror#11 + add r11,r11,r2 @ h+=Ch(e,f,g) +#if 8==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 8<15 +# if __ARM_ARCH>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r4,r5 @ a^b, b^c in next round +#else + ldr r2,[sp,#10*4] @ from future BODY_16_xx + eor r12,r4,r5 @ a^b, b^c in next round + ldr r1,[sp,#7*4] @ from future BODY_16_xx +#endif + eor r0,r0,r4,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r7,r7,r11 @ d+=h + eor r3,r3,r5 @ Maj(a,b,c) + add r11,r11,r0,ror#2 @ h+=Sigma0(a) + @ add r11,r11,r3 @ h+=Maj(a,b,c) +#if __ARM_ARCH>=7 + @ ldr r2,[r1],#4 @ 9 +# if 9==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r7,r7,ror#5 + add r11,r11,r3 @ h+=Maj(a,b,c) from the past + eor r0,r0,r7,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 9 + add r11,r11,r3 @ h+=Maj(a,b,c) from the past + ldrb r3,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r3,lsl#8 + ldrb r3,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 9==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r7,r7,ror#5 + orr r2,r2,r3,lsl#24 + eor r0,r0,r7,ror#19 @ Sigma1(e) +#endif + ldr r3,[r14],#4 @ *K256++ + add r10,r10,r2 @ h+=X[i] + str r2,[sp,#9*4] + eor r2,r8,r9 + add r10,r10,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r7 + add r10,r10,r3 @ h+=K256[i] + eor r2,r2,r9 @ Ch(e,f,g) + eor r0,r11,r11,ror#11 + add r10,r10,r2 @ h+=Ch(e,f,g) +#if 9==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 9<15 +# if __ARM_ARCH>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r11,r4 @ a^b, b^c in next round +#else + ldr r2,[sp,#11*4] @ from future BODY_16_xx + eor r3,r11,r4 @ a^b, b^c in next round + ldr r1,[sp,#8*4] @ from future BODY_16_xx +#endif + eor r0,r0,r11,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r6,r6,r10 @ d+=h + eor r12,r12,r4 @ Maj(a,b,c) + add r10,r10,r0,ror#2 @ h+=Sigma0(a) + @ add r10,r10,r12 @ h+=Maj(a,b,c) +#if __ARM_ARCH>=7 + @ ldr r2,[r1],#4 @ 10 +# if 10==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r6,r6,ror#5 + add r10,r10,r12 @ h+=Maj(a,b,c) from the past + eor r0,r0,r6,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 10 + add r10,r10,r12 @ h+=Maj(a,b,c) from the past + ldrb r12,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r12,lsl#8 + ldrb r12,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 10==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r6,r6,ror#5 + orr r2,r2,r12,lsl#24 + eor r0,r0,r6,ror#19 @ Sigma1(e) +#endif + ldr r12,[r14],#4 @ *K256++ + add r9,r9,r2 @ h+=X[i] + str r2,[sp,#10*4] + eor r2,r7,r8 + add r9,r9,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r6 + add r9,r9,r12 @ h+=K256[i] + eor r2,r2,r8 @ Ch(e,f,g) + eor r0,r10,r10,ror#11 + add r9,r9,r2 @ h+=Ch(e,f,g) +#if 10==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 10<15 +# if __ARM_ARCH>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r10,r11 @ a^b, b^c in next round +#else + ldr r2,[sp,#12*4] @ from future BODY_16_xx + eor r12,r10,r11 @ a^b, b^c in next round + ldr r1,[sp,#9*4] @ from future BODY_16_xx +#endif + eor r0,r0,r10,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r5,r5,r9 @ d+=h + eor r3,r3,r11 @ Maj(a,b,c) + add r9,r9,r0,ror#2 @ h+=Sigma0(a) + @ add r9,r9,r3 @ h+=Maj(a,b,c) +#if __ARM_ARCH>=7 + @ ldr r2,[r1],#4 @ 11 +# if 11==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r5,r5,ror#5 + add r9,r9,r3 @ h+=Maj(a,b,c) from the past + eor r0,r0,r5,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 11 + add r9,r9,r3 @ h+=Maj(a,b,c) from the past + ldrb r3,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r3,lsl#8 + ldrb r3,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 11==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r5,r5,ror#5 + orr r2,r2,r3,lsl#24 + eor r0,r0,r5,ror#19 @ Sigma1(e) +#endif + ldr r3,[r14],#4 @ *K256++ + add r8,r8,r2 @ h+=X[i] + str r2,[sp,#11*4] + eor r2,r6,r7 + add r8,r8,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r5 + add r8,r8,r3 @ h+=K256[i] + eor r2,r2,r7 @ Ch(e,f,g) + eor r0,r9,r9,ror#11 + add r8,r8,r2 @ h+=Ch(e,f,g) +#if 11==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 11<15 +# if __ARM_ARCH>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r9,r10 @ a^b, b^c in next round +#else + ldr r2,[sp,#13*4] @ from future BODY_16_xx + eor r3,r9,r10 @ a^b, b^c in next round + ldr r1,[sp,#10*4] @ from future BODY_16_xx +#endif + eor r0,r0,r9,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r4,r4,r8 @ d+=h + eor r12,r12,r10 @ Maj(a,b,c) + add r8,r8,r0,ror#2 @ h+=Sigma0(a) + @ add r8,r8,r12 @ h+=Maj(a,b,c) +#if __ARM_ARCH>=7 + @ ldr r2,[r1],#4 @ 12 +# if 12==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r4,r4,ror#5 + add r8,r8,r12 @ h+=Maj(a,b,c) from the past + eor r0,r0,r4,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 12 + add r8,r8,r12 @ h+=Maj(a,b,c) from the past + ldrb r12,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r12,lsl#8 + ldrb r12,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 12==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r4,r4,ror#5 + orr r2,r2,r12,lsl#24 + eor r0,r0,r4,ror#19 @ Sigma1(e) +#endif + ldr r12,[r14],#4 @ *K256++ + add r7,r7,r2 @ h+=X[i] + str r2,[sp,#12*4] + eor r2,r5,r6 + add r7,r7,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r4 + add r7,r7,r12 @ h+=K256[i] + eor r2,r2,r6 @ Ch(e,f,g) + eor r0,r8,r8,ror#11 + add r7,r7,r2 @ h+=Ch(e,f,g) +#if 12==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 12<15 +# if __ARM_ARCH>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r8,r9 @ a^b, b^c in next round +#else + ldr r2,[sp,#14*4] @ from future BODY_16_xx + eor r12,r8,r9 @ a^b, b^c in next round + ldr r1,[sp,#11*4] @ from future BODY_16_xx +#endif + eor r0,r0,r8,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r11,r11,r7 @ d+=h + eor r3,r3,r9 @ Maj(a,b,c) + add r7,r7,r0,ror#2 @ h+=Sigma0(a) + @ add r7,r7,r3 @ h+=Maj(a,b,c) +#if __ARM_ARCH>=7 + @ ldr r2,[r1],#4 @ 13 +# if 13==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r11,r11,ror#5 + add r7,r7,r3 @ h+=Maj(a,b,c) from the past + eor r0,r0,r11,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 13 + add r7,r7,r3 @ h+=Maj(a,b,c) from the past + ldrb r3,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r3,lsl#8 + ldrb r3,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 13==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r11,r11,ror#5 + orr r2,r2,r3,lsl#24 + eor r0,r0,r11,ror#19 @ Sigma1(e) +#endif + ldr r3,[r14],#4 @ *K256++ + add r6,r6,r2 @ h+=X[i] + str r2,[sp,#13*4] + eor r2,r4,r5 + add r6,r6,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r11 + add r6,r6,r3 @ h+=K256[i] + eor r2,r2,r5 @ Ch(e,f,g) + eor r0,r7,r7,ror#11 + add r6,r6,r2 @ h+=Ch(e,f,g) +#if 13==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 13<15 +# if __ARM_ARCH>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r7,r8 @ a^b, b^c in next round +#else + ldr r2,[sp,#15*4] @ from future BODY_16_xx + eor r3,r7,r8 @ a^b, b^c in next round + ldr r1,[sp,#12*4] @ from future BODY_16_xx +#endif + eor r0,r0,r7,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r10,r10,r6 @ d+=h + eor r12,r12,r8 @ Maj(a,b,c) + add r6,r6,r0,ror#2 @ h+=Sigma0(a) + @ add r6,r6,r12 @ h+=Maj(a,b,c) +#if __ARM_ARCH>=7 + @ ldr r2,[r1],#4 @ 14 +# if 14==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r10,r10,ror#5 + add r6,r6,r12 @ h+=Maj(a,b,c) from the past + eor r0,r0,r10,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 14 + add r6,r6,r12 @ h+=Maj(a,b,c) from the past + ldrb r12,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r12,lsl#8 + ldrb r12,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 14==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r10,r10,ror#5 + orr r2,r2,r12,lsl#24 + eor r0,r0,r10,ror#19 @ Sigma1(e) +#endif + ldr r12,[r14],#4 @ *K256++ + add r5,r5,r2 @ h+=X[i] + str r2,[sp,#14*4] + eor r2,r11,r4 + add r5,r5,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r10 + add r5,r5,r12 @ h+=K256[i] + eor r2,r2,r4 @ Ch(e,f,g) + eor r0,r6,r6,ror#11 + add r5,r5,r2 @ h+=Ch(e,f,g) +#if 14==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 14<15 +# if __ARM_ARCH>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r6,r7 @ a^b, b^c in next round +#else + ldr r2,[sp,#0*4] @ from future BODY_16_xx + eor r12,r6,r7 @ a^b, b^c in next round + ldr r1,[sp,#13*4] @ from future BODY_16_xx +#endif + eor r0,r0,r6,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r9,r9,r5 @ d+=h + eor r3,r3,r7 @ Maj(a,b,c) + add r5,r5,r0,ror#2 @ h+=Sigma0(a) + @ add r5,r5,r3 @ h+=Maj(a,b,c) +#if __ARM_ARCH>=7 + @ ldr r2,[r1],#4 @ 15 +# if 15==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r9,r9,ror#5 + add r5,r5,r3 @ h+=Maj(a,b,c) from the past + eor r0,r0,r9,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 15 + add r5,r5,r3 @ h+=Maj(a,b,c) from the past + ldrb r3,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r3,lsl#8 + ldrb r3,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 15==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r9,r9,ror#5 + orr r2,r2,r3,lsl#24 + eor r0,r0,r9,ror#19 @ Sigma1(e) +#endif + ldr r3,[r14],#4 @ *K256++ + add r4,r4,r2 @ h+=X[i] + str r2,[sp,#15*4] + eor r2,r10,r11 + add r4,r4,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r9 + add r4,r4,r3 @ h+=K256[i] + eor r2,r2,r11 @ Ch(e,f,g) + eor r0,r5,r5,ror#11 + add r4,r4,r2 @ h+=Ch(e,f,g) +#if 15==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 15<15 +# if __ARM_ARCH>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r5,r6 @ a^b, b^c in next round +#else + ldr r2,[sp,#1*4] @ from future BODY_16_xx + eor r3,r5,r6 @ a^b, b^c in next round + ldr r1,[sp,#14*4] @ from future BODY_16_xx +#endif + eor r0,r0,r5,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r8,r8,r4 @ d+=h + eor r12,r12,r6 @ Maj(a,b,c) + add r4,r4,r0,ror#2 @ h+=Sigma0(a) + @ add r4,r4,r12 @ h+=Maj(a,b,c) +.Lrounds_16_xx: + @ ldr r2,[sp,#1*4] @ 16 + @ ldr r1,[sp,#14*4] + mov r0,r2,ror#7 + add r4,r4,r12 @ h+=Maj(a,b,c) from the past + mov r12,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r12,r12,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#0*4] + eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#9*4] + + add r12,r12,r0 + eor r0,r8,r8,ror#5 @ from BODY_00_15 + add r2,r2,r12 + eor r0,r0,r8,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r12,[r14],#4 @ *K256++ + add r11,r11,r2 @ h+=X[i] + str r2,[sp,#0*4] + eor r2,r9,r10 + add r11,r11,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r8 + add r11,r11,r12 @ h+=K256[i] + eor r2,r2,r10 @ Ch(e,f,g) + eor r0,r4,r4,ror#11 + add r11,r11,r2 @ h+=Ch(e,f,g) +#if 16==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 16<15 +# if __ARM_ARCH>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r4,r5 @ a^b, b^c in next round +#else + ldr r2,[sp,#2*4] @ from future BODY_16_xx + eor r12,r4,r5 @ a^b, b^c in next round + ldr r1,[sp,#15*4] @ from future BODY_16_xx +#endif + eor r0,r0,r4,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r7,r7,r11 @ d+=h + eor r3,r3,r5 @ Maj(a,b,c) + add r11,r11,r0,ror#2 @ h+=Sigma0(a) + @ add r11,r11,r3 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#2*4] @ 17 + @ ldr r1,[sp,#15*4] + mov r0,r2,ror#7 + add r11,r11,r3 @ h+=Maj(a,b,c) from the past + mov r3,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r3,r3,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#1*4] + eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#10*4] + + add r3,r3,r0 + eor r0,r7,r7,ror#5 @ from BODY_00_15 + add r2,r2,r3 + eor r0,r0,r7,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r3,[r14],#4 @ *K256++ + add r10,r10,r2 @ h+=X[i] + str r2,[sp,#1*4] + eor r2,r8,r9 + add r10,r10,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r7 + add r10,r10,r3 @ h+=K256[i] + eor r2,r2,r9 @ Ch(e,f,g) + eor r0,r11,r11,ror#11 + add r10,r10,r2 @ h+=Ch(e,f,g) +#if 17==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 17<15 +# if __ARM_ARCH>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r11,r4 @ a^b, b^c in next round +#else + ldr r2,[sp,#3*4] @ from future BODY_16_xx + eor r3,r11,r4 @ a^b, b^c in next round + ldr r1,[sp,#0*4] @ from future BODY_16_xx +#endif + eor r0,r0,r11,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r6,r6,r10 @ d+=h + eor r12,r12,r4 @ Maj(a,b,c) + add r10,r10,r0,ror#2 @ h+=Sigma0(a) + @ add r10,r10,r12 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#3*4] @ 18 + @ ldr r1,[sp,#0*4] + mov r0,r2,ror#7 + add r10,r10,r12 @ h+=Maj(a,b,c) from the past + mov r12,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r12,r12,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#2*4] + eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#11*4] + + add r12,r12,r0 + eor r0,r6,r6,ror#5 @ from BODY_00_15 + add r2,r2,r12 + eor r0,r0,r6,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r12,[r14],#4 @ *K256++ + add r9,r9,r2 @ h+=X[i] + str r2,[sp,#2*4] + eor r2,r7,r8 + add r9,r9,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r6 + add r9,r9,r12 @ h+=K256[i] + eor r2,r2,r8 @ Ch(e,f,g) + eor r0,r10,r10,ror#11 + add r9,r9,r2 @ h+=Ch(e,f,g) +#if 18==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 18<15 +# if __ARM_ARCH>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r10,r11 @ a^b, b^c in next round +#else + ldr r2,[sp,#4*4] @ from future BODY_16_xx + eor r12,r10,r11 @ a^b, b^c in next round + ldr r1,[sp,#1*4] @ from future BODY_16_xx +#endif + eor r0,r0,r10,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r5,r5,r9 @ d+=h + eor r3,r3,r11 @ Maj(a,b,c) + add r9,r9,r0,ror#2 @ h+=Sigma0(a) + @ add r9,r9,r3 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#4*4] @ 19 + @ ldr r1,[sp,#1*4] + mov r0,r2,ror#7 + add r9,r9,r3 @ h+=Maj(a,b,c) from the past + mov r3,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r3,r3,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#3*4] + eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#12*4] + + add r3,r3,r0 + eor r0,r5,r5,ror#5 @ from BODY_00_15 + add r2,r2,r3 + eor r0,r0,r5,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r3,[r14],#4 @ *K256++ + add r8,r8,r2 @ h+=X[i] + str r2,[sp,#3*4] + eor r2,r6,r7 + add r8,r8,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r5 + add r8,r8,r3 @ h+=K256[i] + eor r2,r2,r7 @ Ch(e,f,g) + eor r0,r9,r9,ror#11 + add r8,r8,r2 @ h+=Ch(e,f,g) +#if 19==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 19<15 +# if __ARM_ARCH>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r9,r10 @ a^b, b^c in next round +#else + ldr r2,[sp,#5*4] @ from future BODY_16_xx + eor r3,r9,r10 @ a^b, b^c in next round + ldr r1,[sp,#2*4] @ from future BODY_16_xx +#endif + eor r0,r0,r9,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r4,r4,r8 @ d+=h + eor r12,r12,r10 @ Maj(a,b,c) + add r8,r8,r0,ror#2 @ h+=Sigma0(a) + @ add r8,r8,r12 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#5*4] @ 20 + @ ldr r1,[sp,#2*4] + mov r0,r2,ror#7 + add r8,r8,r12 @ h+=Maj(a,b,c) from the past + mov r12,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r12,r12,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#4*4] + eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#13*4] + + add r12,r12,r0 + eor r0,r4,r4,ror#5 @ from BODY_00_15 + add r2,r2,r12 + eor r0,r0,r4,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r12,[r14],#4 @ *K256++ + add r7,r7,r2 @ h+=X[i] + str r2,[sp,#4*4] + eor r2,r5,r6 + add r7,r7,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r4 + add r7,r7,r12 @ h+=K256[i] + eor r2,r2,r6 @ Ch(e,f,g) + eor r0,r8,r8,ror#11 + add r7,r7,r2 @ h+=Ch(e,f,g) +#if 20==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 20<15 +# if __ARM_ARCH>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r8,r9 @ a^b, b^c in next round +#else + ldr r2,[sp,#6*4] @ from future BODY_16_xx + eor r12,r8,r9 @ a^b, b^c in next round + ldr r1,[sp,#3*4] @ from future BODY_16_xx +#endif + eor r0,r0,r8,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r11,r11,r7 @ d+=h + eor r3,r3,r9 @ Maj(a,b,c) + add r7,r7,r0,ror#2 @ h+=Sigma0(a) + @ add r7,r7,r3 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#6*4] @ 21 + @ ldr r1,[sp,#3*4] + mov r0,r2,ror#7 + add r7,r7,r3 @ h+=Maj(a,b,c) from the past + mov r3,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r3,r3,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#5*4] + eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#14*4] + + add r3,r3,r0 + eor r0,r11,r11,ror#5 @ from BODY_00_15 + add r2,r2,r3 + eor r0,r0,r11,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r3,[r14],#4 @ *K256++ + add r6,r6,r2 @ h+=X[i] + str r2,[sp,#5*4] + eor r2,r4,r5 + add r6,r6,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r11 + add r6,r6,r3 @ h+=K256[i] + eor r2,r2,r5 @ Ch(e,f,g) + eor r0,r7,r7,ror#11 + add r6,r6,r2 @ h+=Ch(e,f,g) +#if 21==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 21<15 +# if __ARM_ARCH>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r7,r8 @ a^b, b^c in next round +#else + ldr r2,[sp,#7*4] @ from future BODY_16_xx + eor r3,r7,r8 @ a^b, b^c in next round + ldr r1,[sp,#4*4] @ from future BODY_16_xx +#endif + eor r0,r0,r7,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r10,r10,r6 @ d+=h + eor r12,r12,r8 @ Maj(a,b,c) + add r6,r6,r0,ror#2 @ h+=Sigma0(a) + @ add r6,r6,r12 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#7*4] @ 22 + @ ldr r1,[sp,#4*4] + mov r0,r2,ror#7 + add r6,r6,r12 @ h+=Maj(a,b,c) from the past + mov r12,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r12,r12,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#6*4] + eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#15*4] + + add r12,r12,r0 + eor r0,r10,r10,ror#5 @ from BODY_00_15 + add r2,r2,r12 + eor r0,r0,r10,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r12,[r14],#4 @ *K256++ + add r5,r5,r2 @ h+=X[i] + str r2,[sp,#6*4] + eor r2,r11,r4 + add r5,r5,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r10 + add r5,r5,r12 @ h+=K256[i] + eor r2,r2,r4 @ Ch(e,f,g) + eor r0,r6,r6,ror#11 + add r5,r5,r2 @ h+=Ch(e,f,g) +#if 22==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 22<15 +# if __ARM_ARCH>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r6,r7 @ a^b, b^c in next round +#else + ldr r2,[sp,#8*4] @ from future BODY_16_xx + eor r12,r6,r7 @ a^b, b^c in next round + ldr r1,[sp,#5*4] @ from future BODY_16_xx +#endif + eor r0,r0,r6,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r9,r9,r5 @ d+=h + eor r3,r3,r7 @ Maj(a,b,c) + add r5,r5,r0,ror#2 @ h+=Sigma0(a) + @ add r5,r5,r3 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#8*4] @ 23 + @ ldr r1,[sp,#5*4] + mov r0,r2,ror#7 + add r5,r5,r3 @ h+=Maj(a,b,c) from the past + mov r3,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r3,r3,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#7*4] + eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#0*4] + + add r3,r3,r0 + eor r0,r9,r9,ror#5 @ from BODY_00_15 + add r2,r2,r3 + eor r0,r0,r9,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r3,[r14],#4 @ *K256++ + add r4,r4,r2 @ h+=X[i] + str r2,[sp,#7*4] + eor r2,r10,r11 + add r4,r4,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r9 + add r4,r4,r3 @ h+=K256[i] + eor r2,r2,r11 @ Ch(e,f,g) + eor r0,r5,r5,ror#11 + add r4,r4,r2 @ h+=Ch(e,f,g) +#if 23==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 23<15 +# if __ARM_ARCH>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r5,r6 @ a^b, b^c in next round +#else + ldr r2,[sp,#9*4] @ from future BODY_16_xx + eor r3,r5,r6 @ a^b, b^c in next round + ldr r1,[sp,#6*4] @ from future BODY_16_xx +#endif + eor r0,r0,r5,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r8,r8,r4 @ d+=h + eor r12,r12,r6 @ Maj(a,b,c) + add r4,r4,r0,ror#2 @ h+=Sigma0(a) + @ add r4,r4,r12 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#9*4] @ 24 + @ ldr r1,[sp,#6*4] + mov r0,r2,ror#7 + add r4,r4,r12 @ h+=Maj(a,b,c) from the past + mov r12,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r12,r12,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#8*4] + eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#1*4] + + add r12,r12,r0 + eor r0,r8,r8,ror#5 @ from BODY_00_15 + add r2,r2,r12 + eor r0,r0,r8,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r12,[r14],#4 @ *K256++ + add r11,r11,r2 @ h+=X[i] + str r2,[sp,#8*4] + eor r2,r9,r10 + add r11,r11,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r8 + add r11,r11,r12 @ h+=K256[i] + eor r2,r2,r10 @ Ch(e,f,g) + eor r0,r4,r4,ror#11 + add r11,r11,r2 @ h+=Ch(e,f,g) +#if 24==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 24<15 +# if __ARM_ARCH>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r4,r5 @ a^b, b^c in next round +#else + ldr r2,[sp,#10*4] @ from future BODY_16_xx + eor r12,r4,r5 @ a^b, b^c in next round + ldr r1,[sp,#7*4] @ from future BODY_16_xx +#endif + eor r0,r0,r4,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r7,r7,r11 @ d+=h + eor r3,r3,r5 @ Maj(a,b,c) + add r11,r11,r0,ror#2 @ h+=Sigma0(a) + @ add r11,r11,r3 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#10*4] @ 25 + @ ldr r1,[sp,#7*4] + mov r0,r2,ror#7 + add r11,r11,r3 @ h+=Maj(a,b,c) from the past + mov r3,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r3,r3,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#9*4] + eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#2*4] + + add r3,r3,r0 + eor r0,r7,r7,ror#5 @ from BODY_00_15 + add r2,r2,r3 + eor r0,r0,r7,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r3,[r14],#4 @ *K256++ + add r10,r10,r2 @ h+=X[i] + str r2,[sp,#9*4] + eor r2,r8,r9 + add r10,r10,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r7 + add r10,r10,r3 @ h+=K256[i] + eor r2,r2,r9 @ Ch(e,f,g) + eor r0,r11,r11,ror#11 + add r10,r10,r2 @ h+=Ch(e,f,g) +#if 25==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 25<15 +# if __ARM_ARCH>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r11,r4 @ a^b, b^c in next round +#else + ldr r2,[sp,#11*4] @ from future BODY_16_xx + eor r3,r11,r4 @ a^b, b^c in next round + ldr r1,[sp,#8*4] @ from future BODY_16_xx +#endif + eor r0,r0,r11,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r6,r6,r10 @ d+=h + eor r12,r12,r4 @ Maj(a,b,c) + add r10,r10,r0,ror#2 @ h+=Sigma0(a) + @ add r10,r10,r12 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#11*4] @ 26 + @ ldr r1,[sp,#8*4] + mov r0,r2,ror#7 + add r10,r10,r12 @ h+=Maj(a,b,c) from the past + mov r12,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r12,r12,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#10*4] + eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#3*4] + + add r12,r12,r0 + eor r0,r6,r6,ror#5 @ from BODY_00_15 + add r2,r2,r12 + eor r0,r0,r6,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r12,[r14],#4 @ *K256++ + add r9,r9,r2 @ h+=X[i] + str r2,[sp,#10*4] + eor r2,r7,r8 + add r9,r9,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r6 + add r9,r9,r12 @ h+=K256[i] + eor r2,r2,r8 @ Ch(e,f,g) + eor r0,r10,r10,ror#11 + add r9,r9,r2 @ h+=Ch(e,f,g) +#if 26==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 26<15 +# if __ARM_ARCH>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r10,r11 @ a^b, b^c in next round +#else + ldr r2,[sp,#12*4] @ from future BODY_16_xx + eor r12,r10,r11 @ a^b, b^c in next round + ldr r1,[sp,#9*4] @ from future BODY_16_xx +#endif + eor r0,r0,r10,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r5,r5,r9 @ d+=h + eor r3,r3,r11 @ Maj(a,b,c) + add r9,r9,r0,ror#2 @ h+=Sigma0(a) + @ add r9,r9,r3 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#12*4] @ 27 + @ ldr r1,[sp,#9*4] + mov r0,r2,ror#7 + add r9,r9,r3 @ h+=Maj(a,b,c) from the past + mov r3,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r3,r3,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#11*4] + eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#4*4] + + add r3,r3,r0 + eor r0,r5,r5,ror#5 @ from BODY_00_15 + add r2,r2,r3 + eor r0,r0,r5,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r3,[r14],#4 @ *K256++ + add r8,r8,r2 @ h+=X[i] + str r2,[sp,#11*4] + eor r2,r6,r7 + add r8,r8,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r5 + add r8,r8,r3 @ h+=K256[i] + eor r2,r2,r7 @ Ch(e,f,g) + eor r0,r9,r9,ror#11 + add r8,r8,r2 @ h+=Ch(e,f,g) +#if 27==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 27<15 +# if __ARM_ARCH>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r9,r10 @ a^b, b^c in next round +#else + ldr r2,[sp,#13*4] @ from future BODY_16_xx + eor r3,r9,r10 @ a^b, b^c in next round + ldr r1,[sp,#10*4] @ from future BODY_16_xx +#endif + eor r0,r0,r9,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r4,r4,r8 @ d+=h + eor r12,r12,r10 @ Maj(a,b,c) + add r8,r8,r0,ror#2 @ h+=Sigma0(a) + @ add r8,r8,r12 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#13*4] @ 28 + @ ldr r1,[sp,#10*4] + mov r0,r2,ror#7 + add r8,r8,r12 @ h+=Maj(a,b,c) from the past + mov r12,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r12,r12,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#12*4] + eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#5*4] + + add r12,r12,r0 + eor r0,r4,r4,ror#5 @ from BODY_00_15 + add r2,r2,r12 + eor r0,r0,r4,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r12,[r14],#4 @ *K256++ + add r7,r7,r2 @ h+=X[i] + str r2,[sp,#12*4] + eor r2,r5,r6 + add r7,r7,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r4 + add r7,r7,r12 @ h+=K256[i] + eor r2,r2,r6 @ Ch(e,f,g) + eor r0,r8,r8,ror#11 + add r7,r7,r2 @ h+=Ch(e,f,g) +#if 28==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 28<15 +# if __ARM_ARCH>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r8,r9 @ a^b, b^c in next round +#else + ldr r2,[sp,#14*4] @ from future BODY_16_xx + eor r12,r8,r9 @ a^b, b^c in next round + ldr r1,[sp,#11*4] @ from future BODY_16_xx +#endif + eor r0,r0,r8,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r11,r11,r7 @ d+=h + eor r3,r3,r9 @ Maj(a,b,c) + add r7,r7,r0,ror#2 @ h+=Sigma0(a) + @ add r7,r7,r3 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#14*4] @ 29 + @ ldr r1,[sp,#11*4] + mov r0,r2,ror#7 + add r7,r7,r3 @ h+=Maj(a,b,c) from the past + mov r3,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r3,r3,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#13*4] + eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#6*4] + + add r3,r3,r0 + eor r0,r11,r11,ror#5 @ from BODY_00_15 + add r2,r2,r3 + eor r0,r0,r11,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r3,[r14],#4 @ *K256++ + add r6,r6,r2 @ h+=X[i] + str r2,[sp,#13*4] + eor r2,r4,r5 + add r6,r6,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r11 + add r6,r6,r3 @ h+=K256[i] + eor r2,r2,r5 @ Ch(e,f,g) + eor r0,r7,r7,ror#11 + add r6,r6,r2 @ h+=Ch(e,f,g) +#if 29==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 29<15 +# if __ARM_ARCH>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r7,r8 @ a^b, b^c in next round +#else + ldr r2,[sp,#15*4] @ from future BODY_16_xx + eor r3,r7,r8 @ a^b, b^c in next round + ldr r1,[sp,#12*4] @ from future BODY_16_xx +#endif + eor r0,r0,r7,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r10,r10,r6 @ d+=h + eor r12,r12,r8 @ Maj(a,b,c) + add r6,r6,r0,ror#2 @ h+=Sigma0(a) + @ add r6,r6,r12 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#15*4] @ 30 + @ ldr r1,[sp,#12*4] + mov r0,r2,ror#7 + add r6,r6,r12 @ h+=Maj(a,b,c) from the past + mov r12,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r12,r12,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#14*4] + eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#7*4] + + add r12,r12,r0 + eor r0,r10,r10,ror#5 @ from BODY_00_15 + add r2,r2,r12 + eor r0,r0,r10,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r12,[r14],#4 @ *K256++ + add r5,r5,r2 @ h+=X[i] + str r2,[sp,#14*4] + eor r2,r11,r4 + add r5,r5,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r10 + add r5,r5,r12 @ h+=K256[i] + eor r2,r2,r4 @ Ch(e,f,g) + eor r0,r6,r6,ror#11 + add r5,r5,r2 @ h+=Ch(e,f,g) +#if 30==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 30<15 +# if __ARM_ARCH>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r6,r7 @ a^b, b^c in next round +#else + ldr r2,[sp,#0*4] @ from future BODY_16_xx + eor r12,r6,r7 @ a^b, b^c in next round + ldr r1,[sp,#13*4] @ from future BODY_16_xx +#endif + eor r0,r0,r6,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r9,r9,r5 @ d+=h + eor r3,r3,r7 @ Maj(a,b,c) + add r5,r5,r0,ror#2 @ h+=Sigma0(a) + @ add r5,r5,r3 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#0*4] @ 31 + @ ldr r1,[sp,#13*4] + mov r0,r2,ror#7 + add r5,r5,r3 @ h+=Maj(a,b,c) from the past + mov r3,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r3,r3,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#15*4] + eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#8*4] + + add r3,r3,r0 + eor r0,r9,r9,ror#5 @ from BODY_00_15 + add r2,r2,r3 + eor r0,r0,r9,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r3,[r14],#4 @ *K256++ + add r4,r4,r2 @ h+=X[i] + str r2,[sp,#15*4] + eor r2,r10,r11 + add r4,r4,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r9 + add r4,r4,r3 @ h+=K256[i] + eor r2,r2,r11 @ Ch(e,f,g) + eor r0,r5,r5,ror#11 + add r4,r4,r2 @ h+=Ch(e,f,g) +#if 31==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 31<15 +# if __ARM_ARCH>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r5,r6 @ a^b, b^c in next round +#else + ldr r2,[sp,#1*4] @ from future BODY_16_xx + eor r3,r5,r6 @ a^b, b^c in next round + ldr r1,[sp,#14*4] @ from future BODY_16_xx +#endif + eor r0,r0,r5,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r8,r8,r4 @ d+=h + eor r12,r12,r6 @ Maj(a,b,c) + add r4,r4,r0,ror#2 @ h+=Sigma0(a) + @ add r4,r4,r12 @ h+=Maj(a,b,c) +#if __ARM_ARCH>=7 + ite eq @ Thumb2 thing, sanity check in ARM +#endif + ldreq r3,[sp,#16*4] @ pull ctx + bne .Lrounds_16_xx + + add r4,r4,r12 @ h+=Maj(a,b,c) from the past + ldr r0,[r3,#0] + ldr r2,[r3,#4] + ldr r12,[r3,#8] + add r4,r4,r0 + ldr r0,[r3,#12] + add r5,r5,r2 + ldr r2,[r3,#16] + add r6,r6,r12 + ldr r12,[r3,#20] + add r7,r7,r0 + ldr r0,[r3,#24] + add r8,r8,r2 + ldr r2,[r3,#28] + add r9,r9,r12 + ldr r1,[sp,#17*4] @ pull inp + ldr r12,[sp,#18*4] @ pull inp+len + add r10,r10,r0 + add r11,r11,r2 + stmia r3,{r4,r5,r6,r7,r8,r9,r10,r11} + cmp r1,r12 + sub r14,r14,#256 @ rewind Ktbl + bne .Loop + + add sp,sp,#19*4 @ destroy frame +#if __ARM_ARCH>=5 + ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc} +#else + ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,lr} + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet +.word 0xe12fff1e @ interoperable with Thumb ISA:-) +#endif +.size sha256_block_data_order_nohw,.-sha256_block_data_order_nohw +#if __ARM_MAX_ARCH__>=7 +.arch armv7-a +.fpu neon + +.LK256_shortcut_neon: +@ PC is 8 bytes ahead in Arm mode and 4 bytes ahead in Thumb mode. +#if defined(__thumb2__) +.word K256-(.LK256_add_neon+4) +#else +.word K256-(.LK256_add_neon+8) +#endif + +.globl sha256_block_data_order_neon +.hidden sha256_block_data_order_neon +.type sha256_block_data_order_neon,%function +.align 5 +.skip 16 +sha256_block_data_order_neon: + stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} + + sub r11,sp,#16*4+16 + + @ K256 is just at the boundary of being easily referenced by an ADR from + @ this function. In Arm mode, when building with __ARM_ARCH=6, it does + @ not fit. By moving code around, we could make it fit, but this is too + @ fragile. For simplicity, just load the offset from + @ .LK256_shortcut_neon. + @ + @ TODO(davidben): adrl would avoid a load, but clang-assembler does not + @ support it. 
We might be able to emulate it with a macro, but Android's + @ did not work when I tried it. + @ https://android.googlesource.com/platform/ndk/+/refs/heads/main/docs/ClangMigration.md#arm + ldr r14,.LK256_shortcut_neon +.LK256_add_neon: + add r14,pc,r14 + + bic r11,r11,#15 @ align for 128-bit stores + mov r12,sp + mov sp,r11 @ alloca + add r2,r1,r2,lsl#6 @ len to point at the end of inp + + vld1.8 {q0},[r1]! + vld1.8 {q1},[r1]! + vld1.8 {q2},[r1]! + vld1.8 {q3},[r1]! + vld1.32 {q8},[r14,:128]! + vld1.32 {q9},[r14,:128]! + vld1.32 {q10},[r14,:128]! + vld1.32 {q11},[r14,:128]! + vrev32.8 q0,q0 @ yes, even on + str r0,[sp,#64] + vrev32.8 q1,q1 @ big-endian + str r1,[sp,#68] + mov r1,sp + vrev32.8 q2,q2 + str r2,[sp,#72] + vrev32.8 q3,q3 + str r12,[sp,#76] @ save original sp + vadd.i32 q8,q8,q0 + vadd.i32 q9,q9,q1 + vst1.32 {q8},[r1,:128]! + vadd.i32 q10,q10,q2 + vst1.32 {q9},[r1,:128]! + vadd.i32 q11,q11,q3 + vst1.32 {q10},[r1,:128]! + vst1.32 {q11},[r1,:128]! + + ldmia r0,{r4,r5,r6,r7,r8,r9,r10,r11} + sub r1,r1,#64 + ldr r2,[sp,#0] + eor r12,r12,r12 + eor r3,r5,r6 + b .L_00_48 + +.align 4 +.L_00_48: + vext.8 q8,q0,q1,#4 + add r11,r11,r2 + eor r2,r9,r10 + eor r0,r8,r8,ror#5 + vext.8 q9,q2,q3,#4 + add r4,r4,r12 + and r2,r2,r8 + eor r12,r0,r8,ror#19 + vshr.u32 q10,q8,#7 + eor r0,r4,r4,ror#11 + eor r2,r2,r10 + vadd.i32 q0,q0,q9 + add r11,r11,r12,ror#6 + eor r12,r4,r5 + vshr.u32 q9,q8,#3 + eor r0,r0,r4,ror#20 + add r11,r11,r2 + vsli.32 q10,q8,#25 + ldr r2,[sp,#4] + and r3,r3,r12 + vshr.u32 q11,q8,#18 + add r7,r7,r11 + add r11,r11,r0,ror#2 + eor r3,r3,r5 + veor q9,q9,q10 + add r10,r10,r2 + vsli.32 q11,q8,#14 + eor r2,r8,r9 + eor r0,r7,r7,ror#5 + vshr.u32 d24,d7,#17 + add r11,r11,r3 + and r2,r2,r7 + veor q9,q9,q11 + eor r3,r0,r7,ror#19 + eor r0,r11,r11,ror#11 + vsli.32 d24,d7,#15 + eor r2,r2,r9 + add r10,r10,r3,ror#6 + vshr.u32 d25,d7,#10 + eor r3,r11,r4 + eor r0,r0,r11,ror#20 + vadd.i32 q0,q0,q9 + add r10,r10,r2 + ldr r2,[sp,#8] + veor d25,d25,d24 + and r12,r12,r3 + add r6,r6,r10 + vshr.u32 d24,d7,#19 + add r10,r10,r0,ror#2 + eor r12,r12,r4 + vsli.32 d24,d7,#13 + add r9,r9,r2 + eor r2,r7,r8 + veor d25,d25,d24 + eor r0,r6,r6,ror#5 + add r10,r10,r12 + vadd.i32 d0,d0,d25 + and r2,r2,r6 + eor r12,r0,r6,ror#19 + vshr.u32 d24,d0,#17 + eor r0,r10,r10,ror#11 + eor r2,r2,r8 + vsli.32 d24,d0,#15 + add r9,r9,r12,ror#6 + eor r12,r10,r11 + vshr.u32 d25,d0,#10 + eor r0,r0,r10,ror#20 + add r9,r9,r2 + veor d25,d25,d24 + ldr r2,[sp,#12] + and r3,r3,r12 + vshr.u32 d24,d0,#19 + add r5,r5,r9 + add r9,r9,r0,ror#2 + eor r3,r3,r11 + vld1.32 {q8},[r14,:128]! + add r8,r8,r2 + vsli.32 d24,d0,#13 + eor r2,r6,r7 + eor r0,r5,r5,ror#5 + veor d25,d25,d24 + add r9,r9,r3 + and r2,r2,r5 + vadd.i32 d1,d1,d25 + eor r3,r0,r5,ror#19 + eor r0,r9,r9,ror#11 + vadd.i32 q8,q8,q0 + eor r2,r2,r7 + add r8,r8,r3,ror#6 + eor r3,r9,r10 + eor r0,r0,r9,ror#20 + add r8,r8,r2 + ldr r2,[sp,#16] + and r12,r12,r3 + add r4,r4,r8 + vst1.32 {q8},[r1,:128]! 
+ add r8,r8,r0,ror#2 + eor r12,r12,r10 + vext.8 q8,q1,q2,#4 + add r7,r7,r2 + eor r2,r5,r6 + eor r0,r4,r4,ror#5 + vext.8 q9,q3,q0,#4 + add r8,r8,r12 + and r2,r2,r4 + eor r12,r0,r4,ror#19 + vshr.u32 q10,q8,#7 + eor r0,r8,r8,ror#11 + eor r2,r2,r6 + vadd.i32 q1,q1,q9 + add r7,r7,r12,ror#6 + eor r12,r8,r9 + vshr.u32 q9,q8,#3 + eor r0,r0,r8,ror#20 + add r7,r7,r2 + vsli.32 q10,q8,#25 + ldr r2,[sp,#20] + and r3,r3,r12 + vshr.u32 q11,q8,#18 + add r11,r11,r7 + add r7,r7,r0,ror#2 + eor r3,r3,r9 + veor q9,q9,q10 + add r6,r6,r2 + vsli.32 q11,q8,#14 + eor r2,r4,r5 + eor r0,r11,r11,ror#5 + vshr.u32 d24,d1,#17 + add r7,r7,r3 + and r2,r2,r11 + veor q9,q9,q11 + eor r3,r0,r11,ror#19 + eor r0,r7,r7,ror#11 + vsli.32 d24,d1,#15 + eor r2,r2,r5 + add r6,r6,r3,ror#6 + vshr.u32 d25,d1,#10 + eor r3,r7,r8 + eor r0,r0,r7,ror#20 + vadd.i32 q1,q1,q9 + add r6,r6,r2 + ldr r2,[sp,#24] + veor d25,d25,d24 + and r12,r12,r3 + add r10,r10,r6 + vshr.u32 d24,d1,#19 + add r6,r6,r0,ror#2 + eor r12,r12,r8 + vsli.32 d24,d1,#13 + add r5,r5,r2 + eor r2,r11,r4 + veor d25,d25,d24 + eor r0,r10,r10,ror#5 + add r6,r6,r12 + vadd.i32 d2,d2,d25 + and r2,r2,r10 + eor r12,r0,r10,ror#19 + vshr.u32 d24,d2,#17 + eor r0,r6,r6,ror#11 + eor r2,r2,r4 + vsli.32 d24,d2,#15 + add r5,r5,r12,ror#6 + eor r12,r6,r7 + vshr.u32 d25,d2,#10 + eor r0,r0,r6,ror#20 + add r5,r5,r2 + veor d25,d25,d24 + ldr r2,[sp,#28] + and r3,r3,r12 + vshr.u32 d24,d2,#19 + add r9,r9,r5 + add r5,r5,r0,ror#2 + eor r3,r3,r7 + vld1.32 {q8},[r14,:128]! + add r4,r4,r2 + vsli.32 d24,d2,#13 + eor r2,r10,r11 + eor r0,r9,r9,ror#5 + veor d25,d25,d24 + add r5,r5,r3 + and r2,r2,r9 + vadd.i32 d3,d3,d25 + eor r3,r0,r9,ror#19 + eor r0,r5,r5,ror#11 + vadd.i32 q8,q8,q1 + eor r2,r2,r11 + add r4,r4,r3,ror#6 + eor r3,r5,r6 + eor r0,r0,r5,ror#20 + add r4,r4,r2 + ldr r2,[sp,#32] + and r12,r12,r3 + add r8,r8,r4 + vst1.32 {q8},[r1,:128]! + add r4,r4,r0,ror#2 + eor r12,r12,r6 + vext.8 q8,q2,q3,#4 + add r11,r11,r2 + eor r2,r9,r10 + eor r0,r8,r8,ror#5 + vext.8 q9,q0,q1,#4 + add r4,r4,r12 + and r2,r2,r8 + eor r12,r0,r8,ror#19 + vshr.u32 q10,q8,#7 + eor r0,r4,r4,ror#11 + eor r2,r2,r10 + vadd.i32 q2,q2,q9 + add r11,r11,r12,ror#6 + eor r12,r4,r5 + vshr.u32 q9,q8,#3 + eor r0,r0,r4,ror#20 + add r11,r11,r2 + vsli.32 q10,q8,#25 + ldr r2,[sp,#36] + and r3,r3,r12 + vshr.u32 q11,q8,#18 + add r7,r7,r11 + add r11,r11,r0,ror#2 + eor r3,r3,r5 + veor q9,q9,q10 + add r10,r10,r2 + vsli.32 q11,q8,#14 + eor r2,r8,r9 + eor r0,r7,r7,ror#5 + vshr.u32 d24,d3,#17 + add r11,r11,r3 + and r2,r2,r7 + veor q9,q9,q11 + eor r3,r0,r7,ror#19 + eor r0,r11,r11,ror#11 + vsli.32 d24,d3,#15 + eor r2,r2,r9 + add r10,r10,r3,ror#6 + vshr.u32 d25,d3,#10 + eor r3,r11,r4 + eor r0,r0,r11,ror#20 + vadd.i32 q2,q2,q9 + add r10,r10,r2 + ldr r2,[sp,#40] + veor d25,d25,d24 + and r12,r12,r3 + add r6,r6,r10 + vshr.u32 d24,d3,#19 + add r10,r10,r0,ror#2 + eor r12,r12,r4 + vsli.32 d24,d3,#13 + add r9,r9,r2 + eor r2,r7,r8 + veor d25,d25,d24 + eor r0,r6,r6,ror#5 + add r10,r10,r12 + vadd.i32 d4,d4,d25 + and r2,r2,r6 + eor r12,r0,r6,ror#19 + vshr.u32 d24,d4,#17 + eor r0,r10,r10,ror#11 + eor r2,r2,r8 + vsli.32 d24,d4,#15 + add r9,r9,r12,ror#6 + eor r12,r10,r11 + vshr.u32 d25,d4,#10 + eor r0,r0,r10,ror#20 + add r9,r9,r2 + veor d25,d25,d24 + ldr r2,[sp,#44] + and r3,r3,r12 + vshr.u32 d24,d4,#19 + add r5,r5,r9 + add r9,r9,r0,ror#2 + eor r3,r3,r11 + vld1.32 {q8},[r14,:128]! 
+ add r8,r8,r2 + vsli.32 d24,d4,#13 + eor r2,r6,r7 + eor r0,r5,r5,ror#5 + veor d25,d25,d24 + add r9,r9,r3 + and r2,r2,r5 + vadd.i32 d5,d5,d25 + eor r3,r0,r5,ror#19 + eor r0,r9,r9,ror#11 + vadd.i32 q8,q8,q2 + eor r2,r2,r7 + add r8,r8,r3,ror#6 + eor r3,r9,r10 + eor r0,r0,r9,ror#20 + add r8,r8,r2 + ldr r2,[sp,#48] + and r12,r12,r3 + add r4,r4,r8 + vst1.32 {q8},[r1,:128]! + add r8,r8,r0,ror#2 + eor r12,r12,r10 + vext.8 q8,q3,q0,#4 + add r7,r7,r2 + eor r2,r5,r6 + eor r0,r4,r4,ror#5 + vext.8 q9,q1,q2,#4 + add r8,r8,r12 + and r2,r2,r4 + eor r12,r0,r4,ror#19 + vshr.u32 q10,q8,#7 + eor r0,r8,r8,ror#11 + eor r2,r2,r6 + vadd.i32 q3,q3,q9 + add r7,r7,r12,ror#6 + eor r12,r8,r9 + vshr.u32 q9,q8,#3 + eor r0,r0,r8,ror#20 + add r7,r7,r2 + vsli.32 q10,q8,#25 + ldr r2,[sp,#52] + and r3,r3,r12 + vshr.u32 q11,q8,#18 + add r11,r11,r7 + add r7,r7,r0,ror#2 + eor r3,r3,r9 + veor q9,q9,q10 + add r6,r6,r2 + vsli.32 q11,q8,#14 + eor r2,r4,r5 + eor r0,r11,r11,ror#5 + vshr.u32 d24,d5,#17 + add r7,r7,r3 + and r2,r2,r11 + veor q9,q9,q11 + eor r3,r0,r11,ror#19 + eor r0,r7,r7,ror#11 + vsli.32 d24,d5,#15 + eor r2,r2,r5 + add r6,r6,r3,ror#6 + vshr.u32 d25,d5,#10 + eor r3,r7,r8 + eor r0,r0,r7,ror#20 + vadd.i32 q3,q3,q9 + add r6,r6,r2 + ldr r2,[sp,#56] + veor d25,d25,d24 + and r12,r12,r3 + add r10,r10,r6 + vshr.u32 d24,d5,#19 + add r6,r6,r0,ror#2 + eor r12,r12,r8 + vsli.32 d24,d5,#13 + add r5,r5,r2 + eor r2,r11,r4 + veor d25,d25,d24 + eor r0,r10,r10,ror#5 + add r6,r6,r12 + vadd.i32 d6,d6,d25 + and r2,r2,r10 + eor r12,r0,r10,ror#19 + vshr.u32 d24,d6,#17 + eor r0,r6,r6,ror#11 + eor r2,r2,r4 + vsli.32 d24,d6,#15 + add r5,r5,r12,ror#6 + eor r12,r6,r7 + vshr.u32 d25,d6,#10 + eor r0,r0,r6,ror#20 + add r5,r5,r2 + veor d25,d25,d24 + ldr r2,[sp,#60] + and r3,r3,r12 + vshr.u32 d24,d6,#19 + add r9,r9,r5 + add r5,r5,r0,ror#2 + eor r3,r3,r7 + vld1.32 {q8},[r14,:128]! + add r4,r4,r2 + vsli.32 d24,d6,#13 + eor r2,r10,r11 + eor r0,r9,r9,ror#5 + veor d25,d25,d24 + add r5,r5,r3 + and r2,r2,r9 + vadd.i32 d7,d7,d25 + eor r3,r0,r9,ror#19 + eor r0,r5,r5,ror#11 + vadd.i32 q8,q8,q3 + eor r2,r2,r11 + add r4,r4,r3,ror#6 + eor r3,r5,r6 + eor r0,r0,r5,ror#20 + add r4,r4,r2 + ldr r2,[r14] + and r12,r12,r3 + add r8,r8,r4 + vst1.32 {q8},[r1,:128]! + add r4,r4,r0,ror#2 + eor r12,r12,r6 + teq r2,#0 @ check for K256 terminator + ldr r2,[sp,#0] + sub r1,r1,#64 + bne .L_00_48 + + ldr r1,[sp,#68] + ldr r0,[sp,#72] + sub r14,r14,#256 @ rewind r14 + teq r1,r0 + it eq + subeq r1,r1,#64 @ avoid SEGV + vld1.8 {q0},[r1]! @ load next input block + vld1.8 {q1},[r1]! + vld1.8 {q2},[r1]! + vld1.8 {q3},[r1]! + it ne + strne r1,[sp,#68] + mov r1,sp + add r11,r11,r2 + eor r2,r9,r10 + eor r0,r8,r8,ror#5 + add r4,r4,r12 + vld1.32 {q8},[r14,:128]! 
+ and r2,r2,r8 + eor r12,r0,r8,ror#19 + eor r0,r4,r4,ror#11 + eor r2,r2,r10 + vrev32.8 q0,q0 + add r11,r11,r12,ror#6 + eor r12,r4,r5 + eor r0,r0,r4,ror#20 + add r11,r11,r2 + vadd.i32 q8,q8,q0 + ldr r2,[sp,#4] + and r3,r3,r12 + add r7,r7,r11 + add r11,r11,r0,ror#2 + eor r3,r3,r5 + add r10,r10,r2 + eor r2,r8,r9 + eor r0,r7,r7,ror#5 + add r11,r11,r3 + and r2,r2,r7 + eor r3,r0,r7,ror#19 + eor r0,r11,r11,ror#11 + eor r2,r2,r9 + add r10,r10,r3,ror#6 + eor r3,r11,r4 + eor r0,r0,r11,ror#20 + add r10,r10,r2 + ldr r2,[sp,#8] + and r12,r12,r3 + add r6,r6,r10 + add r10,r10,r0,ror#2 + eor r12,r12,r4 + add r9,r9,r2 + eor r2,r7,r8 + eor r0,r6,r6,ror#5 + add r10,r10,r12 + and r2,r2,r6 + eor r12,r0,r6,ror#19 + eor r0,r10,r10,ror#11 + eor r2,r2,r8 + add r9,r9,r12,ror#6 + eor r12,r10,r11 + eor r0,r0,r10,ror#20 + add r9,r9,r2 + ldr r2,[sp,#12] + and r3,r3,r12 + add r5,r5,r9 + add r9,r9,r0,ror#2 + eor r3,r3,r11 + add r8,r8,r2 + eor r2,r6,r7 + eor r0,r5,r5,ror#5 + add r9,r9,r3 + and r2,r2,r5 + eor r3,r0,r5,ror#19 + eor r0,r9,r9,ror#11 + eor r2,r2,r7 + add r8,r8,r3,ror#6 + eor r3,r9,r10 + eor r0,r0,r9,ror#20 + add r8,r8,r2 + ldr r2,[sp,#16] + and r12,r12,r3 + add r4,r4,r8 + add r8,r8,r0,ror#2 + eor r12,r12,r10 + vst1.32 {q8},[r1,:128]! + add r7,r7,r2 + eor r2,r5,r6 + eor r0,r4,r4,ror#5 + add r8,r8,r12 + vld1.32 {q8},[r14,:128]! + and r2,r2,r4 + eor r12,r0,r4,ror#19 + eor r0,r8,r8,ror#11 + eor r2,r2,r6 + vrev32.8 q1,q1 + add r7,r7,r12,ror#6 + eor r12,r8,r9 + eor r0,r0,r8,ror#20 + add r7,r7,r2 + vadd.i32 q8,q8,q1 + ldr r2,[sp,#20] + and r3,r3,r12 + add r11,r11,r7 + add r7,r7,r0,ror#2 + eor r3,r3,r9 + add r6,r6,r2 + eor r2,r4,r5 + eor r0,r11,r11,ror#5 + add r7,r7,r3 + and r2,r2,r11 + eor r3,r0,r11,ror#19 + eor r0,r7,r7,ror#11 + eor r2,r2,r5 + add r6,r6,r3,ror#6 + eor r3,r7,r8 + eor r0,r0,r7,ror#20 + add r6,r6,r2 + ldr r2,[sp,#24] + and r12,r12,r3 + add r10,r10,r6 + add r6,r6,r0,ror#2 + eor r12,r12,r8 + add r5,r5,r2 + eor r2,r11,r4 + eor r0,r10,r10,ror#5 + add r6,r6,r12 + and r2,r2,r10 + eor r12,r0,r10,ror#19 + eor r0,r6,r6,ror#11 + eor r2,r2,r4 + add r5,r5,r12,ror#6 + eor r12,r6,r7 + eor r0,r0,r6,ror#20 + add r5,r5,r2 + ldr r2,[sp,#28] + and r3,r3,r12 + add r9,r9,r5 + add r5,r5,r0,ror#2 + eor r3,r3,r7 + add r4,r4,r2 + eor r2,r10,r11 + eor r0,r9,r9,ror#5 + add r5,r5,r3 + and r2,r2,r9 + eor r3,r0,r9,ror#19 + eor r0,r5,r5,ror#11 + eor r2,r2,r11 + add r4,r4,r3,ror#6 + eor r3,r5,r6 + eor r0,r0,r5,ror#20 + add r4,r4,r2 + ldr r2,[sp,#32] + and r12,r12,r3 + add r8,r8,r4 + add r4,r4,r0,ror#2 + eor r12,r12,r6 + vst1.32 {q8},[r1,:128]! + add r11,r11,r2 + eor r2,r9,r10 + eor r0,r8,r8,ror#5 + add r4,r4,r12 + vld1.32 {q8},[r14,:128]! 
+ and r2,r2,r8 + eor r12,r0,r8,ror#19 + eor r0,r4,r4,ror#11 + eor r2,r2,r10 + vrev32.8 q2,q2 + add r11,r11,r12,ror#6 + eor r12,r4,r5 + eor r0,r0,r4,ror#20 + add r11,r11,r2 + vadd.i32 q8,q8,q2 + ldr r2,[sp,#36] + and r3,r3,r12 + add r7,r7,r11 + add r11,r11,r0,ror#2 + eor r3,r3,r5 + add r10,r10,r2 + eor r2,r8,r9 + eor r0,r7,r7,ror#5 + add r11,r11,r3 + and r2,r2,r7 + eor r3,r0,r7,ror#19 + eor r0,r11,r11,ror#11 + eor r2,r2,r9 + add r10,r10,r3,ror#6 + eor r3,r11,r4 + eor r0,r0,r11,ror#20 + add r10,r10,r2 + ldr r2,[sp,#40] + and r12,r12,r3 + add r6,r6,r10 + add r10,r10,r0,ror#2 + eor r12,r12,r4 + add r9,r9,r2 + eor r2,r7,r8 + eor r0,r6,r6,ror#5 + add r10,r10,r12 + and r2,r2,r6 + eor r12,r0,r6,ror#19 + eor r0,r10,r10,ror#11 + eor r2,r2,r8 + add r9,r9,r12,ror#6 + eor r12,r10,r11 + eor r0,r0,r10,ror#20 + add r9,r9,r2 + ldr r2,[sp,#44] + and r3,r3,r12 + add r5,r5,r9 + add r9,r9,r0,ror#2 + eor r3,r3,r11 + add r8,r8,r2 + eor r2,r6,r7 + eor r0,r5,r5,ror#5 + add r9,r9,r3 + and r2,r2,r5 + eor r3,r0,r5,ror#19 + eor r0,r9,r9,ror#11 + eor r2,r2,r7 + add r8,r8,r3,ror#6 + eor r3,r9,r10 + eor r0,r0,r9,ror#20 + add r8,r8,r2 + ldr r2,[sp,#48] + and r12,r12,r3 + add r4,r4,r8 + add r8,r8,r0,ror#2 + eor r12,r12,r10 + vst1.32 {q8},[r1,:128]! + add r7,r7,r2 + eor r2,r5,r6 + eor r0,r4,r4,ror#5 + add r8,r8,r12 + vld1.32 {q8},[r14,:128]! + and r2,r2,r4 + eor r12,r0,r4,ror#19 + eor r0,r8,r8,ror#11 + eor r2,r2,r6 + vrev32.8 q3,q3 + add r7,r7,r12,ror#6 + eor r12,r8,r9 + eor r0,r0,r8,ror#20 + add r7,r7,r2 + vadd.i32 q8,q8,q3 + ldr r2,[sp,#52] + and r3,r3,r12 + add r11,r11,r7 + add r7,r7,r0,ror#2 + eor r3,r3,r9 + add r6,r6,r2 + eor r2,r4,r5 + eor r0,r11,r11,ror#5 + add r7,r7,r3 + and r2,r2,r11 + eor r3,r0,r11,ror#19 + eor r0,r7,r7,ror#11 + eor r2,r2,r5 + add r6,r6,r3,ror#6 + eor r3,r7,r8 + eor r0,r0,r7,ror#20 + add r6,r6,r2 + ldr r2,[sp,#56] + and r12,r12,r3 + add r10,r10,r6 + add r6,r6,r0,ror#2 + eor r12,r12,r8 + add r5,r5,r2 + eor r2,r11,r4 + eor r0,r10,r10,ror#5 + add r6,r6,r12 + and r2,r2,r10 + eor r12,r0,r10,ror#19 + eor r0,r6,r6,ror#11 + eor r2,r2,r4 + add r5,r5,r12,ror#6 + eor r12,r6,r7 + eor r0,r0,r6,ror#20 + add r5,r5,r2 + ldr r2,[sp,#60] + and r3,r3,r12 + add r9,r9,r5 + add r5,r5,r0,ror#2 + eor r3,r3,r7 + add r4,r4,r2 + eor r2,r10,r11 + eor r0,r9,r9,ror#5 + add r5,r5,r3 + and r2,r2,r9 + eor r3,r0,r9,ror#19 + eor r0,r5,r5,ror#11 + eor r2,r2,r11 + add r4,r4,r3,ror#6 + eor r3,r5,r6 + eor r0,r0,r5,ror#20 + add r4,r4,r2 + ldr r2,[sp,#64] + and r12,r12,r3 + add r8,r8,r4 + add r4,r4,r0,ror#2 + eor r12,r12,r6 + vst1.32 {q8},[r1,:128]! 
+ ldr r0,[r2,#0] + add r4,r4,r12 @ h+=Maj(a,b,c) from the past + ldr r12,[r2,#4] + ldr r3,[r2,#8] + ldr r1,[r2,#12] + add r4,r4,r0 @ accumulate + ldr r0,[r2,#16] + add r5,r5,r12 + ldr r12,[r2,#20] + add r6,r6,r3 + ldr r3,[r2,#24] + add r7,r7,r1 + ldr r1,[r2,#28] + add r8,r8,r0 + str r4,[r2],#4 + add r9,r9,r12 + str r5,[r2],#4 + add r10,r10,r3 + str r6,[r2],#4 + add r11,r11,r1 + str r7,[r2],#4 + stmia r2,{r8,r9,r10,r11} + + ittte ne + movne r1,sp + ldrne r2,[sp,#0] + eorne r12,r12,r12 + ldreq sp,[sp,#76] @ restore original sp + itt ne + eorne r3,r5,r6 + bne .L_00_48 + + ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc} +.size sha256_block_data_order_neon,.-sha256_block_data_order_neon +#endif +.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,47,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 2 +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__ELF__) diff --git a/ring-0.17.14/pregenerated/sha256-armv8-ios64.S b/ring-0.17.14/pregenerated/sha256-armv8-ios64.S new file mode 100644 index 0000000000..8ecafb46a2 --- /dev/null +++ b/ring-0.17.14/pregenerated/sha256-armv8-ios64.S @@ -0,0 +1,1195 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__) +// Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// ==================================================================== +// Written by Andy Polyakov for the OpenSSL +// project. +// ==================================================================== +// +// SHA256/512 for ARMv8. +// +// Performance in cycles per processed byte and improvement coefficient +// over code generated with "default" compiler: +// +// SHA256-hw SHA256(*) SHA512 +// Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**)) +// Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***)) +// Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***)) +// Denver 2.01 10.5 (+26%) 6.70 (+8%) +// X-Gene 20.0 (+100%) 12.8 (+300%(***)) +// Mongoose 2.36 13.0 (+50%) 8.36 (+33%) +// Kryo 1.92 17.4 (+30%) 11.2 (+8%) +// +// (*) Software SHA256 results are of lesser relevance, presented +// mostly for informational purposes. +// (**) The result is a trade-off: it's possible to improve it by +// 10% (or by 1 cycle per round), but at the cost of 20% loss +// on Cortex-A53 (or by 4 cycles per round). +// (***) Super-impressive coefficients over gcc-generated code are +// indication of some compiler "pathology", most notably code +// generated with -mgeneral-regs-only is significantly faster +// and the gap is only 40-90%. 
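The per-round annotations in both listings (Sigma1(e), Ch(e,f,g), Maj(a,b,c), sigma0(X[i+1]), sigma1(X[i+14]), h+=K256[i]) are the standard FIPS 180-4 SHA-256 operations; the scalar path unrolls them with rotated-operand tricks, and the NEON path computes the message schedule four words at a time with vext/vshr/vsli. A minimal Rust sketch of the same compression step, as a reference for what the register-level code computes, might look as follows; the function names and the caller-supplied k table are illustrative only and are not part of the generated .S files or of ring's API.

    // Illustrative sketch (not from the generated sources): scalar SHA-256
    // compression as annotated in the listings above. `k` is the caller's
    // K256 round-constant table (64 words).
    fn sigma0_big(a: u32) -> u32 { a.rotate_right(2) ^ a.rotate_right(13) ^ a.rotate_right(22) }
    fn sigma1_big(e: u32) -> u32 { e.rotate_right(6) ^ e.rotate_right(11) ^ e.rotate_right(25) }
    fn sigma0_small(x: u32) -> u32 { x.rotate_right(7) ^ x.rotate_right(18) ^ (x >> 3) }
    fn sigma1_small(x: u32) -> u32 { x.rotate_right(17) ^ x.rotate_right(19) ^ (x >> 10) }
    fn ch(e: u32, f: u32, g: u32) -> u32 { (e & f) ^ (!e & g) }
    fn maj(a: u32, b: u32, c: u32) -> u32 { (a & b) ^ (a & c) ^ (b & c) }

    fn compress_block(state: &mut [u32; 8], block: &[u32; 16], k: &[u32; 64]) {
        // Message schedule: rounds 16..63 reuse earlier words, matching the
        // "from future BODY_16_xx" loads and the sigma0/sigma1 comments.
        let mut w = [0u32; 64];
        w[..16].copy_from_slice(block);
        for i in 16..64 {
            w[i] = sigma1_small(w[i - 2])
                .wrapping_add(w[i - 7])
                .wrapping_add(sigma0_small(w[i - 15]))
                .wrapping_add(w[i - 16]);
        }
        let [mut a, mut b, mut c, mut d, mut e, mut f, mut g, mut h] = *state;
        for i in 0..64 {
            // "h+=X[i]", "h+=K256[i]", "h+=Sigma1(e)", "h+=Ch(e,f,g)" in the listings.
            let t1 = h
                .wrapping_add(sigma1_big(e))
                .wrapping_add(ch(e, f, g))
                .wrapping_add(k[i])
                .wrapping_add(w[i]);
            // "h+=Sigma0(a)" and "h+=Maj(a,b,c)"; the assembly defers the Maj
            // add by one round ("h+=Maj(a,b,c) from the past").
            let t2 = sigma0_big(a).wrapping_add(maj(a, b, c));
            h = g; g = f; f = e; e = d.wrapping_add(t1);
            d = c; c = b; b = a; a = t1.wrapping_add(t2);
        }
        // Accumulate into the hash context, as in the epilogue that reloads
        // the ctx words, adds the old state, and stores the result back.
        let out = [a, b, c, d, e, f, g, h];
        for i in 0..8 {
            state[i] = state[i].wrapping_add(out[i]);
        }
    }

The rotation constants above explain the listings' seemingly different shift amounts: for example, Sigma1(e) = (e ror 6) ^ (e ror 11) ^ (e ror 25) is realized as a single trailing ror #6 applied to e ^ (e ror 5) ^ (e ror 19), which is exactly the eor/ror pattern marked "Sigma1(e)" in the scalar code.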
+ +#ifndef __KERNEL__ +#endif + +.text + +.globl _sha256_block_data_order_nohw +.private_extern _sha256_block_data_order_nohw + +.align 6 +_sha256_block_data_order_nohw: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#4*4 + + ldp w20,w21,[x0] // load context + ldp w22,w23,[x0,#2*4] + ldp w24,w25,[x0,#4*4] + add x2,x1,x2,lsl#6 // end of input + ldp w26,w27,[x0,#6*4] + adrp x30,LK256@PAGE + add x30,x30,LK256@PAGEOFF + stp x0,x2,[x29,#96] + +Loop: + ldp w3,w4,[x1],#2*4 + ldr w19,[x30],#4 // *K++ + eor w28,w21,w22 // magic seed + str x1,[x29,#112] +#ifndef __AARCH64EB__ + rev w3,w3 // 0 +#endif + ror w16,w24,#6 + add w27,w27,w19 // h+=K[i] + eor w6,w24,w24,ror#14 + and w17,w25,w24 + bic w19,w26,w24 + add w27,w27,w3 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w20,w21 // a^b, b^c in next round + eor w16,w16,w6,ror#11 // Sigma1(e) + ror w6,w20,#2 + add w27,w27,w17 // h+=Ch(e,f,g) + eor w17,w20,w20,ror#9 + add w27,w27,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w23,w23,w27 // d+=h + eor w28,w28,w21 // Maj(a,b,c) + eor w17,w6,w17,ror#13 // Sigma0(a) + add w27,w27,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w27,w27,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w4,w4 // 1 +#endif + ldp w5,w6,[x1],#2*4 + add w27,w27,w17 // h+=Sigma0(a) + ror w16,w23,#6 + add w26,w26,w28 // h+=K[i] + eor w7,w23,w23,ror#14 + and w17,w24,w23 + bic w28,w25,w23 + add w26,w26,w4 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w27,w20 // a^b, b^c in next round + eor w16,w16,w7,ror#11 // Sigma1(e) + ror w7,w27,#2 + add w26,w26,w17 // h+=Ch(e,f,g) + eor w17,w27,w27,ror#9 + add w26,w26,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w22,w22,w26 // d+=h + eor w19,w19,w20 // Maj(a,b,c) + eor w17,w7,w17,ror#13 // Sigma0(a) + add w26,w26,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w26,w26,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w5,w5 // 2 +#endif + add w26,w26,w17 // h+=Sigma0(a) + ror w16,w22,#6 + add w25,w25,w19 // h+=K[i] + eor w8,w22,w22,ror#14 + and w17,w23,w22 + bic w19,w24,w22 + add w25,w25,w5 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w26,w27 // a^b, b^c in next round + eor w16,w16,w8,ror#11 // Sigma1(e) + ror w8,w26,#2 + add w25,w25,w17 // h+=Ch(e,f,g) + eor w17,w26,w26,ror#9 + add w25,w25,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w21,w21,w25 // d+=h + eor w28,w28,w27 // Maj(a,b,c) + eor w17,w8,w17,ror#13 // Sigma0(a) + add w25,w25,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w25,w25,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w6,w6 // 3 +#endif + ldp w7,w8,[x1],#2*4 + add w25,w25,w17 // h+=Sigma0(a) + ror w16,w21,#6 + add w24,w24,w28 // h+=K[i] + eor w9,w21,w21,ror#14 + and w17,w22,w21 + bic w28,w23,w21 + add w24,w24,w6 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w25,w26 // a^b, b^c in next round + eor w16,w16,w9,ror#11 // Sigma1(e) + ror w9,w25,#2 + add w24,w24,w17 // h+=Ch(e,f,g) + eor w17,w25,w25,ror#9 + add w24,w24,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w20,w20,w24 // d+=h + eor w19,w19,w26 // Maj(a,b,c) + eor w17,w9,w17,ror#13 // Sigma0(a) + add w24,w24,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w24,w24,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w7,w7 // 4 +#endif + add w24,w24,w17 // h+=Sigma0(a) + ror w16,w20,#6 + add 
w23,w23,w19 // h+=K[i] + eor w10,w20,w20,ror#14 + and w17,w21,w20 + bic w19,w22,w20 + add w23,w23,w7 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w24,w25 // a^b, b^c in next round + eor w16,w16,w10,ror#11 // Sigma1(e) + ror w10,w24,#2 + add w23,w23,w17 // h+=Ch(e,f,g) + eor w17,w24,w24,ror#9 + add w23,w23,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w27,w27,w23 // d+=h + eor w28,w28,w25 // Maj(a,b,c) + eor w17,w10,w17,ror#13 // Sigma0(a) + add w23,w23,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w23,w23,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w8,w8 // 5 +#endif + ldp w9,w10,[x1],#2*4 + add w23,w23,w17 // h+=Sigma0(a) + ror w16,w27,#6 + add w22,w22,w28 // h+=K[i] + eor w11,w27,w27,ror#14 + and w17,w20,w27 + bic w28,w21,w27 + add w22,w22,w8 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w23,w24 // a^b, b^c in next round + eor w16,w16,w11,ror#11 // Sigma1(e) + ror w11,w23,#2 + add w22,w22,w17 // h+=Ch(e,f,g) + eor w17,w23,w23,ror#9 + add w22,w22,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w26,w26,w22 // d+=h + eor w19,w19,w24 // Maj(a,b,c) + eor w17,w11,w17,ror#13 // Sigma0(a) + add w22,w22,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w22,w22,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w9,w9 // 6 +#endif + add w22,w22,w17 // h+=Sigma0(a) + ror w16,w26,#6 + add w21,w21,w19 // h+=K[i] + eor w12,w26,w26,ror#14 + and w17,w27,w26 + bic w19,w20,w26 + add w21,w21,w9 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w22,w23 // a^b, b^c in next round + eor w16,w16,w12,ror#11 // Sigma1(e) + ror w12,w22,#2 + add w21,w21,w17 // h+=Ch(e,f,g) + eor w17,w22,w22,ror#9 + add w21,w21,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w25,w25,w21 // d+=h + eor w28,w28,w23 // Maj(a,b,c) + eor w17,w12,w17,ror#13 // Sigma0(a) + add w21,w21,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w21,w21,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w10,w10 // 7 +#endif + ldp w11,w12,[x1],#2*4 + add w21,w21,w17 // h+=Sigma0(a) + ror w16,w25,#6 + add w20,w20,w28 // h+=K[i] + eor w13,w25,w25,ror#14 + and w17,w26,w25 + bic w28,w27,w25 + add w20,w20,w10 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w21,w22 // a^b, b^c in next round + eor w16,w16,w13,ror#11 // Sigma1(e) + ror w13,w21,#2 + add w20,w20,w17 // h+=Ch(e,f,g) + eor w17,w21,w21,ror#9 + add w20,w20,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w24,w24,w20 // d+=h + eor w19,w19,w22 // Maj(a,b,c) + eor w17,w13,w17,ror#13 // Sigma0(a) + add w20,w20,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w20,w20,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w11,w11 // 8 +#endif + add w20,w20,w17 // h+=Sigma0(a) + ror w16,w24,#6 + add w27,w27,w19 // h+=K[i] + eor w14,w24,w24,ror#14 + and w17,w25,w24 + bic w19,w26,w24 + add w27,w27,w11 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w20,w21 // a^b, b^c in next round + eor w16,w16,w14,ror#11 // Sigma1(e) + ror w14,w20,#2 + add w27,w27,w17 // h+=Ch(e,f,g) + eor w17,w20,w20,ror#9 + add w27,w27,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w23,w23,w27 // d+=h + eor w28,w28,w21 // Maj(a,b,c) + eor w17,w14,w17,ror#13 // Sigma0(a) + add w27,w27,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w27,w27,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w12,w12 // 9 +#endif + ldp w13,w14,[x1],#2*4 + add w27,w27,w17 // h+=Sigma0(a) + ror w16,w23,#6 + add w26,w26,w28 // h+=K[i] + eor 
w15,w23,w23,ror#14 + and w17,w24,w23 + bic w28,w25,w23 + add w26,w26,w12 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w27,w20 // a^b, b^c in next round + eor w16,w16,w15,ror#11 // Sigma1(e) + ror w15,w27,#2 + add w26,w26,w17 // h+=Ch(e,f,g) + eor w17,w27,w27,ror#9 + add w26,w26,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w22,w22,w26 // d+=h + eor w19,w19,w20 // Maj(a,b,c) + eor w17,w15,w17,ror#13 // Sigma0(a) + add w26,w26,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w26,w26,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w13,w13 // 10 +#endif + add w26,w26,w17 // h+=Sigma0(a) + ror w16,w22,#6 + add w25,w25,w19 // h+=K[i] + eor w0,w22,w22,ror#14 + and w17,w23,w22 + bic w19,w24,w22 + add w25,w25,w13 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w26,w27 // a^b, b^c in next round + eor w16,w16,w0,ror#11 // Sigma1(e) + ror w0,w26,#2 + add w25,w25,w17 // h+=Ch(e,f,g) + eor w17,w26,w26,ror#9 + add w25,w25,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w21,w21,w25 // d+=h + eor w28,w28,w27 // Maj(a,b,c) + eor w17,w0,w17,ror#13 // Sigma0(a) + add w25,w25,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w25,w25,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w14,w14 // 11 +#endif + ldp w15,w0,[x1],#2*4 + add w25,w25,w17 // h+=Sigma0(a) + str w6,[sp,#12] + ror w16,w21,#6 + add w24,w24,w28 // h+=K[i] + eor w6,w21,w21,ror#14 + and w17,w22,w21 + bic w28,w23,w21 + add w24,w24,w14 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w25,w26 // a^b, b^c in next round + eor w16,w16,w6,ror#11 // Sigma1(e) + ror w6,w25,#2 + add w24,w24,w17 // h+=Ch(e,f,g) + eor w17,w25,w25,ror#9 + add w24,w24,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w20,w20,w24 // d+=h + eor w19,w19,w26 // Maj(a,b,c) + eor w17,w6,w17,ror#13 // Sigma0(a) + add w24,w24,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w24,w24,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w15,w15 // 12 +#endif + add w24,w24,w17 // h+=Sigma0(a) + str w7,[sp,#0] + ror w16,w20,#6 + add w23,w23,w19 // h+=K[i] + eor w7,w20,w20,ror#14 + and w17,w21,w20 + bic w19,w22,w20 + add w23,w23,w15 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w24,w25 // a^b, b^c in next round + eor w16,w16,w7,ror#11 // Sigma1(e) + ror w7,w24,#2 + add w23,w23,w17 // h+=Ch(e,f,g) + eor w17,w24,w24,ror#9 + add w23,w23,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w27,w27,w23 // d+=h + eor w28,w28,w25 // Maj(a,b,c) + eor w17,w7,w17,ror#13 // Sigma0(a) + add w23,w23,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w23,w23,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w0,w0 // 13 +#endif + ldp w1,w2,[x1] + add w23,w23,w17 // h+=Sigma0(a) + str w8,[sp,#4] + ror w16,w27,#6 + add w22,w22,w28 // h+=K[i] + eor w8,w27,w27,ror#14 + and w17,w20,w27 + bic w28,w21,w27 + add w22,w22,w0 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w23,w24 // a^b, b^c in next round + eor w16,w16,w8,ror#11 // Sigma1(e) + ror w8,w23,#2 + add w22,w22,w17 // h+=Ch(e,f,g) + eor w17,w23,w23,ror#9 + add w22,w22,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w26,w26,w22 // d+=h + eor w19,w19,w24 // Maj(a,b,c) + eor w17,w8,w17,ror#13 // Sigma0(a) + add w22,w22,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w22,w22,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w1,w1 // 14 +#endif + ldr w6,[sp,#12] + add w22,w22,w17 // h+=Sigma0(a) + str w9,[sp,#8] + ror w16,w26,#6 + add w21,w21,w19 // h+=K[i] + 
eor w9,w26,w26,ror#14 + and w17,w27,w26 + bic w19,w20,w26 + add w21,w21,w1 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w22,w23 // a^b, b^c in next round + eor w16,w16,w9,ror#11 // Sigma1(e) + ror w9,w22,#2 + add w21,w21,w17 // h+=Ch(e,f,g) + eor w17,w22,w22,ror#9 + add w21,w21,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w25,w25,w21 // d+=h + eor w28,w28,w23 // Maj(a,b,c) + eor w17,w9,w17,ror#13 // Sigma0(a) + add w21,w21,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w21,w21,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w2,w2 // 15 +#endif + ldr w7,[sp,#0] + add w21,w21,w17 // h+=Sigma0(a) + str w10,[sp,#12] + ror w16,w25,#6 + add w20,w20,w28 // h+=K[i] + ror w9,w4,#7 + and w17,w26,w25 + ror w8,w1,#17 + bic w28,w27,w25 + ror w10,w21,#2 + add w20,w20,w2 // h+=X[i] + eor w16,w16,w25,ror#11 + eor w9,w9,w4,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w21,w22 // a^b, b^c in next round + eor w16,w16,w25,ror#25 // Sigma1(e) + eor w10,w10,w21,ror#13 + add w20,w20,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w8,w8,w1,ror#19 + eor w9,w9,w4,lsr#3 // sigma0(X[i+1]) + add w20,w20,w16 // h+=Sigma1(e) + eor w19,w19,w22 // Maj(a,b,c) + eor w17,w10,w21,ror#22 // Sigma0(a) + eor w8,w8,w1,lsr#10 // sigma1(X[i+14]) + add w3,w3,w12 + add w24,w24,w20 // d+=h + add w20,w20,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w3,w3,w9 + add w20,w20,w17 // h+=Sigma0(a) + add w3,w3,w8 +Loop_16_xx: + ldr w8,[sp,#4] + str w11,[sp,#0] + ror w16,w24,#6 + add w27,w27,w19 // h+=K[i] + ror w10,w5,#7 + and w17,w25,w24 + ror w9,w2,#17 + bic w19,w26,w24 + ror w11,w20,#2 + add w27,w27,w3 // h+=X[i] + eor w16,w16,w24,ror#11 + eor w10,w10,w5,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w20,w21 // a^b, b^c in next round + eor w16,w16,w24,ror#25 // Sigma1(e) + eor w11,w11,w20,ror#13 + add w27,w27,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w9,w9,w2,ror#19 + eor w10,w10,w5,lsr#3 // sigma0(X[i+1]) + add w27,w27,w16 // h+=Sigma1(e) + eor w28,w28,w21 // Maj(a,b,c) + eor w17,w11,w20,ror#22 // Sigma0(a) + eor w9,w9,w2,lsr#10 // sigma1(X[i+14]) + add w4,w4,w13 + add w23,w23,w27 // d+=h + add w27,w27,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w4,w4,w10 + add w27,w27,w17 // h+=Sigma0(a) + add w4,w4,w9 + ldr w9,[sp,#8] + str w12,[sp,#4] + ror w16,w23,#6 + add w26,w26,w28 // h+=K[i] + ror w11,w6,#7 + and w17,w24,w23 + ror w10,w3,#17 + bic w28,w25,w23 + ror w12,w27,#2 + add w26,w26,w4 // h+=X[i] + eor w16,w16,w23,ror#11 + eor w11,w11,w6,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w27,w20 // a^b, b^c in next round + eor w16,w16,w23,ror#25 // Sigma1(e) + eor w12,w12,w27,ror#13 + add w26,w26,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w10,w10,w3,ror#19 + eor w11,w11,w6,lsr#3 // sigma0(X[i+1]) + add w26,w26,w16 // h+=Sigma1(e) + eor w19,w19,w20 // Maj(a,b,c) + eor w17,w12,w27,ror#22 // Sigma0(a) + eor w10,w10,w3,lsr#10 // sigma1(X[i+14]) + add w5,w5,w14 + add w22,w22,w26 // d+=h + add w26,w26,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w5,w5,w11 + add w26,w26,w17 // h+=Sigma0(a) + add w5,w5,w10 + ldr w10,[sp,#12] + str w13,[sp,#8] + ror w16,w22,#6 + add w25,w25,w19 // h+=K[i] + ror w12,w7,#7 + and w17,w23,w22 + ror w11,w4,#17 + bic w19,w24,w22 + ror w13,w26,#2 + add w25,w25,w5 // h+=X[i] + eor w16,w16,w22,ror#11 + eor w12,w12,w7,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w26,w27 // a^b, b^c in next round + eor w16,w16,w22,ror#25 // Sigma1(e) + eor 
w13,w13,w26,ror#13 + add w25,w25,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w11,w11,w4,ror#19 + eor w12,w12,w7,lsr#3 // sigma0(X[i+1]) + add w25,w25,w16 // h+=Sigma1(e) + eor w28,w28,w27 // Maj(a,b,c) + eor w17,w13,w26,ror#22 // Sigma0(a) + eor w11,w11,w4,lsr#10 // sigma1(X[i+14]) + add w6,w6,w15 + add w21,w21,w25 // d+=h + add w25,w25,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w6,w6,w12 + add w25,w25,w17 // h+=Sigma0(a) + add w6,w6,w11 + ldr w11,[sp,#0] + str w14,[sp,#12] + ror w16,w21,#6 + add w24,w24,w28 // h+=K[i] + ror w13,w8,#7 + and w17,w22,w21 + ror w12,w5,#17 + bic w28,w23,w21 + ror w14,w25,#2 + add w24,w24,w6 // h+=X[i] + eor w16,w16,w21,ror#11 + eor w13,w13,w8,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w25,w26 // a^b, b^c in next round + eor w16,w16,w21,ror#25 // Sigma1(e) + eor w14,w14,w25,ror#13 + add w24,w24,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w12,w12,w5,ror#19 + eor w13,w13,w8,lsr#3 // sigma0(X[i+1]) + add w24,w24,w16 // h+=Sigma1(e) + eor w19,w19,w26 // Maj(a,b,c) + eor w17,w14,w25,ror#22 // Sigma0(a) + eor w12,w12,w5,lsr#10 // sigma1(X[i+14]) + add w7,w7,w0 + add w20,w20,w24 // d+=h + add w24,w24,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w7,w7,w13 + add w24,w24,w17 // h+=Sigma0(a) + add w7,w7,w12 + ldr w12,[sp,#4] + str w15,[sp,#0] + ror w16,w20,#6 + add w23,w23,w19 // h+=K[i] + ror w14,w9,#7 + and w17,w21,w20 + ror w13,w6,#17 + bic w19,w22,w20 + ror w15,w24,#2 + add w23,w23,w7 // h+=X[i] + eor w16,w16,w20,ror#11 + eor w14,w14,w9,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w24,w25 // a^b, b^c in next round + eor w16,w16,w20,ror#25 // Sigma1(e) + eor w15,w15,w24,ror#13 + add w23,w23,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w13,w13,w6,ror#19 + eor w14,w14,w9,lsr#3 // sigma0(X[i+1]) + add w23,w23,w16 // h+=Sigma1(e) + eor w28,w28,w25 // Maj(a,b,c) + eor w17,w15,w24,ror#22 // Sigma0(a) + eor w13,w13,w6,lsr#10 // sigma1(X[i+14]) + add w8,w8,w1 + add w27,w27,w23 // d+=h + add w23,w23,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w8,w8,w14 + add w23,w23,w17 // h+=Sigma0(a) + add w8,w8,w13 + ldr w13,[sp,#8] + str w0,[sp,#4] + ror w16,w27,#6 + add w22,w22,w28 // h+=K[i] + ror w15,w10,#7 + and w17,w20,w27 + ror w14,w7,#17 + bic w28,w21,w27 + ror w0,w23,#2 + add w22,w22,w8 // h+=X[i] + eor w16,w16,w27,ror#11 + eor w15,w15,w10,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w23,w24 // a^b, b^c in next round + eor w16,w16,w27,ror#25 // Sigma1(e) + eor w0,w0,w23,ror#13 + add w22,w22,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w14,w14,w7,ror#19 + eor w15,w15,w10,lsr#3 // sigma0(X[i+1]) + add w22,w22,w16 // h+=Sigma1(e) + eor w19,w19,w24 // Maj(a,b,c) + eor w17,w0,w23,ror#22 // Sigma0(a) + eor w14,w14,w7,lsr#10 // sigma1(X[i+14]) + add w9,w9,w2 + add w26,w26,w22 // d+=h + add w22,w22,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w9,w9,w15 + add w22,w22,w17 // h+=Sigma0(a) + add w9,w9,w14 + ldr w14,[sp,#12] + str w1,[sp,#8] + ror w16,w26,#6 + add w21,w21,w19 // h+=K[i] + ror w0,w11,#7 + and w17,w27,w26 + ror w15,w8,#17 + bic w19,w20,w26 + ror w1,w22,#2 + add w21,w21,w9 // h+=X[i] + eor w16,w16,w26,ror#11 + eor w0,w0,w11,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w22,w23 // a^b, b^c in next round + eor w16,w16,w26,ror#25 // Sigma1(e) + eor w1,w1,w22,ror#13 + add w21,w21,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w15,w15,w8,ror#19 + eor w0,w0,w11,lsr#3 // sigma0(X[i+1]) + 
add w21,w21,w16 // h+=Sigma1(e) + eor w28,w28,w23 // Maj(a,b,c) + eor w17,w1,w22,ror#22 // Sigma0(a) + eor w15,w15,w8,lsr#10 // sigma1(X[i+14]) + add w10,w10,w3 + add w25,w25,w21 // d+=h + add w21,w21,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w10,w10,w0 + add w21,w21,w17 // h+=Sigma0(a) + add w10,w10,w15 + ldr w15,[sp,#0] + str w2,[sp,#12] + ror w16,w25,#6 + add w20,w20,w28 // h+=K[i] + ror w1,w12,#7 + and w17,w26,w25 + ror w0,w9,#17 + bic w28,w27,w25 + ror w2,w21,#2 + add w20,w20,w10 // h+=X[i] + eor w16,w16,w25,ror#11 + eor w1,w1,w12,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w21,w22 // a^b, b^c in next round + eor w16,w16,w25,ror#25 // Sigma1(e) + eor w2,w2,w21,ror#13 + add w20,w20,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w0,w0,w9,ror#19 + eor w1,w1,w12,lsr#3 // sigma0(X[i+1]) + add w20,w20,w16 // h+=Sigma1(e) + eor w19,w19,w22 // Maj(a,b,c) + eor w17,w2,w21,ror#22 // Sigma0(a) + eor w0,w0,w9,lsr#10 // sigma1(X[i+14]) + add w11,w11,w4 + add w24,w24,w20 // d+=h + add w20,w20,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w11,w11,w1 + add w20,w20,w17 // h+=Sigma0(a) + add w11,w11,w0 + ldr w0,[sp,#4] + str w3,[sp,#0] + ror w16,w24,#6 + add w27,w27,w19 // h+=K[i] + ror w2,w13,#7 + and w17,w25,w24 + ror w1,w10,#17 + bic w19,w26,w24 + ror w3,w20,#2 + add w27,w27,w11 // h+=X[i] + eor w16,w16,w24,ror#11 + eor w2,w2,w13,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w20,w21 // a^b, b^c in next round + eor w16,w16,w24,ror#25 // Sigma1(e) + eor w3,w3,w20,ror#13 + add w27,w27,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w1,w1,w10,ror#19 + eor w2,w2,w13,lsr#3 // sigma0(X[i+1]) + add w27,w27,w16 // h+=Sigma1(e) + eor w28,w28,w21 // Maj(a,b,c) + eor w17,w3,w20,ror#22 // Sigma0(a) + eor w1,w1,w10,lsr#10 // sigma1(X[i+14]) + add w12,w12,w5 + add w23,w23,w27 // d+=h + add w27,w27,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w12,w12,w2 + add w27,w27,w17 // h+=Sigma0(a) + add w12,w12,w1 + ldr w1,[sp,#8] + str w4,[sp,#4] + ror w16,w23,#6 + add w26,w26,w28 // h+=K[i] + ror w3,w14,#7 + and w17,w24,w23 + ror w2,w11,#17 + bic w28,w25,w23 + ror w4,w27,#2 + add w26,w26,w12 // h+=X[i] + eor w16,w16,w23,ror#11 + eor w3,w3,w14,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w27,w20 // a^b, b^c in next round + eor w16,w16,w23,ror#25 // Sigma1(e) + eor w4,w4,w27,ror#13 + add w26,w26,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w2,w2,w11,ror#19 + eor w3,w3,w14,lsr#3 // sigma0(X[i+1]) + add w26,w26,w16 // h+=Sigma1(e) + eor w19,w19,w20 // Maj(a,b,c) + eor w17,w4,w27,ror#22 // Sigma0(a) + eor w2,w2,w11,lsr#10 // sigma1(X[i+14]) + add w13,w13,w6 + add w22,w22,w26 // d+=h + add w26,w26,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w13,w13,w3 + add w26,w26,w17 // h+=Sigma0(a) + add w13,w13,w2 + ldr w2,[sp,#12] + str w5,[sp,#8] + ror w16,w22,#6 + add w25,w25,w19 // h+=K[i] + ror w4,w15,#7 + and w17,w23,w22 + ror w3,w12,#17 + bic w19,w24,w22 + ror w5,w26,#2 + add w25,w25,w13 // h+=X[i] + eor w16,w16,w22,ror#11 + eor w4,w4,w15,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w26,w27 // a^b, b^c in next round + eor w16,w16,w22,ror#25 // Sigma1(e) + eor w5,w5,w26,ror#13 + add w25,w25,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w3,w3,w12,ror#19 + eor w4,w4,w15,lsr#3 // sigma0(X[i+1]) + add w25,w25,w16 // h+=Sigma1(e) + eor w28,w28,w27 // Maj(a,b,c) + eor w17,w5,w26,ror#22 // Sigma0(a) + eor w3,w3,w12,lsr#10 // sigma1(X[i+14]) + add w14,w14,w7 + add 
w21,w21,w25 // d+=h + add w25,w25,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w14,w14,w4 + add w25,w25,w17 // h+=Sigma0(a) + add w14,w14,w3 + ldr w3,[sp,#0] + str w6,[sp,#12] + ror w16,w21,#6 + add w24,w24,w28 // h+=K[i] + ror w5,w0,#7 + and w17,w22,w21 + ror w4,w13,#17 + bic w28,w23,w21 + ror w6,w25,#2 + add w24,w24,w14 // h+=X[i] + eor w16,w16,w21,ror#11 + eor w5,w5,w0,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w25,w26 // a^b, b^c in next round + eor w16,w16,w21,ror#25 // Sigma1(e) + eor w6,w6,w25,ror#13 + add w24,w24,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w4,w4,w13,ror#19 + eor w5,w5,w0,lsr#3 // sigma0(X[i+1]) + add w24,w24,w16 // h+=Sigma1(e) + eor w19,w19,w26 // Maj(a,b,c) + eor w17,w6,w25,ror#22 // Sigma0(a) + eor w4,w4,w13,lsr#10 // sigma1(X[i+14]) + add w15,w15,w8 + add w20,w20,w24 // d+=h + add w24,w24,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w15,w15,w5 + add w24,w24,w17 // h+=Sigma0(a) + add w15,w15,w4 + ldr w4,[sp,#4] + str w7,[sp,#0] + ror w16,w20,#6 + add w23,w23,w19 // h+=K[i] + ror w6,w1,#7 + and w17,w21,w20 + ror w5,w14,#17 + bic w19,w22,w20 + ror w7,w24,#2 + add w23,w23,w15 // h+=X[i] + eor w16,w16,w20,ror#11 + eor w6,w6,w1,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w24,w25 // a^b, b^c in next round + eor w16,w16,w20,ror#25 // Sigma1(e) + eor w7,w7,w24,ror#13 + add w23,w23,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w5,w5,w14,ror#19 + eor w6,w6,w1,lsr#3 // sigma0(X[i+1]) + add w23,w23,w16 // h+=Sigma1(e) + eor w28,w28,w25 // Maj(a,b,c) + eor w17,w7,w24,ror#22 // Sigma0(a) + eor w5,w5,w14,lsr#10 // sigma1(X[i+14]) + add w0,w0,w9 + add w27,w27,w23 // d+=h + add w23,w23,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w0,w0,w6 + add w23,w23,w17 // h+=Sigma0(a) + add w0,w0,w5 + ldr w5,[sp,#8] + str w8,[sp,#4] + ror w16,w27,#6 + add w22,w22,w28 // h+=K[i] + ror w7,w2,#7 + and w17,w20,w27 + ror w6,w15,#17 + bic w28,w21,w27 + ror w8,w23,#2 + add w22,w22,w0 // h+=X[i] + eor w16,w16,w27,ror#11 + eor w7,w7,w2,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w23,w24 // a^b, b^c in next round + eor w16,w16,w27,ror#25 // Sigma1(e) + eor w8,w8,w23,ror#13 + add w22,w22,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w6,w6,w15,ror#19 + eor w7,w7,w2,lsr#3 // sigma0(X[i+1]) + add w22,w22,w16 // h+=Sigma1(e) + eor w19,w19,w24 // Maj(a,b,c) + eor w17,w8,w23,ror#22 // Sigma0(a) + eor w6,w6,w15,lsr#10 // sigma1(X[i+14]) + add w1,w1,w10 + add w26,w26,w22 // d+=h + add w22,w22,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w1,w1,w7 + add w22,w22,w17 // h+=Sigma0(a) + add w1,w1,w6 + ldr w6,[sp,#12] + str w9,[sp,#8] + ror w16,w26,#6 + add w21,w21,w19 // h+=K[i] + ror w8,w3,#7 + and w17,w27,w26 + ror w7,w0,#17 + bic w19,w20,w26 + ror w9,w22,#2 + add w21,w21,w1 // h+=X[i] + eor w16,w16,w26,ror#11 + eor w8,w8,w3,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w22,w23 // a^b, b^c in next round + eor w16,w16,w26,ror#25 // Sigma1(e) + eor w9,w9,w22,ror#13 + add w21,w21,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w7,w7,w0,ror#19 + eor w8,w8,w3,lsr#3 // sigma0(X[i+1]) + add w21,w21,w16 // h+=Sigma1(e) + eor w28,w28,w23 // Maj(a,b,c) + eor w17,w9,w22,ror#22 // Sigma0(a) + eor w7,w7,w0,lsr#10 // sigma1(X[i+14]) + add w2,w2,w11 + add w25,w25,w21 // d+=h + add w21,w21,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w2,w2,w8 + add w21,w21,w17 // h+=Sigma0(a) + add w2,w2,w7 + ldr w7,[sp,#0] + str 
w10,[sp,#12] + ror w16,w25,#6 + add w20,w20,w28 // h+=K[i] + ror w9,w4,#7 + and w17,w26,w25 + ror w8,w1,#17 + bic w28,w27,w25 + ror w10,w21,#2 + add w20,w20,w2 // h+=X[i] + eor w16,w16,w25,ror#11 + eor w9,w9,w4,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w21,w22 // a^b, b^c in next round + eor w16,w16,w25,ror#25 // Sigma1(e) + eor w10,w10,w21,ror#13 + add w20,w20,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w8,w8,w1,ror#19 + eor w9,w9,w4,lsr#3 // sigma0(X[i+1]) + add w20,w20,w16 // h+=Sigma1(e) + eor w19,w19,w22 // Maj(a,b,c) + eor w17,w10,w21,ror#22 // Sigma0(a) + eor w8,w8,w1,lsr#10 // sigma1(X[i+14]) + add w3,w3,w12 + add w24,w24,w20 // d+=h + add w20,w20,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w3,w3,w9 + add w20,w20,w17 // h+=Sigma0(a) + add w3,w3,w8 + cbnz w19,Loop_16_xx + + ldp x0,x2,[x29,#96] + ldr x1,[x29,#112] + sub x30,x30,#260 // rewind + + ldp w3,w4,[x0] + ldp w5,w6,[x0,#2*4] + add x1,x1,#14*4 // advance input pointer + ldp w7,w8,[x0,#4*4] + add w20,w20,w3 + ldp w9,w10,[x0,#6*4] + add w21,w21,w4 + add w22,w22,w5 + add w23,w23,w6 + stp w20,w21,[x0] + add w24,w24,w7 + add w25,w25,w8 + stp w22,w23,[x0,#2*4] + add w26,w26,w9 + add w27,w27,w10 + cmp x1,x2 + stp w24,w25,[x0,#4*4] + stp w26,w27,[x0,#6*4] + b.ne Loop + + ldp x19,x20,[x29,#16] + add sp,sp,#4*4 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#128 + AARCH64_VALIDATE_LINK_REGISTER + ret + + +.section __TEXT,__const +.align 6 + +LK256: +.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 +.long 0 //terminator + +.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 2 +.text +#ifndef __KERNEL__ +.globl _sha256_block_data_order_hw +.private_extern _sha256_block_data_order_hw + +.align 6 +_sha256_block_data_order_hw: + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + AARCH64_VALID_CALL_TARGET + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 + + ld1 {v0.4s,v1.4s},[x0] + adrp x3,LK256@PAGE + add x3,x3,LK256@PAGEOFF + +Loop_hw: + ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 + sub x2,x2,#1 + ld1 {v16.4s},[x3],#16 + rev32 v4.16b,v4.16b + rev32 v5.16b,v5.16b + rev32 v6.16b,v6.16b + rev32 v7.16b,v7.16b + orr v18.16b,v0.16b,v0.16b // offload + orr v19.16b,v1.16b,v1.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s +.long 0x5e2828a4 //sha256su0 v4.16b,v5.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.long 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s +.long 0x5e2828c5 //sha256su0 v5.16b,v6.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.long 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s +.long 0x5e2828e6 //sha256su0 v6.16b,v7.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.long 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s +.long 0x5e282887 //sha256su0 v7.16b,v4.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.long 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s +.long 0x5e2828a4 //sha256su0 v4.16b,v5.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.long 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s +.long 0x5e2828c5 //sha256su0 v5.16b,v6.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.long 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s +.long 0x5e2828e6 //sha256su0 v6.16b,v7.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.long 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s +.long 0x5e282887 //sha256su0 v7.16b,v4.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.long 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s +.long 0x5e2828a4 //sha256su0 v4.16b,v5.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.long 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s +.long 0x5e2828c5 //sha256su0 v5.16b,v6.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.long 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s +.long 0x5e2828e6 //sha256su0 v6.16b,v7.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.long 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s +.long 0x5e282887 //sha256su0 v7.16b,v4.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.long 
0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + + ld1 {v17.4s},[x3] + add v16.4s,v16.4s,v6.4s + sub x3,x3,#64*4-16 // rewind + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + + add v17.4s,v17.4s,v7.4s + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + + add v0.4s,v0.4s,v18.4s + add v1.4s,v1.4s,v19.4s + + cbnz x2,Loop_hw + + st1 {v0.4s,v1.4s},[x0] + + ldr x29,[sp],#16 + ret + +#endif +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__) diff --git a/ring-0.17.14/pregenerated/sha256-armv8-linux64.S b/ring-0.17.14/pregenerated/sha256-armv8-linux64.S new file mode 100644 index 0000000000..94c0682a95 --- /dev/null +++ b/ring-0.17.14/pregenerated/sha256-armv8-linux64.S @@ -0,0 +1,1195 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__) +// Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// ==================================================================== +// Written by Andy Polyakov for the OpenSSL +// project. +// ==================================================================== +// +// SHA256/512 for ARMv8. +// +// Performance in cycles per processed byte and improvement coefficient +// over code generated with "default" compiler: +// +// SHA256-hw SHA256(*) SHA512 +// Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**)) +// Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***)) +// Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***)) +// Denver 2.01 10.5 (+26%) 6.70 (+8%) +// X-Gene 20.0 (+100%) 12.8 (+300%(***)) +// Mongoose 2.36 13.0 (+50%) 8.36 (+33%) +// Kryo 1.92 17.4 (+30%) 11.2 (+8%) +// +// (*) Software SHA256 results are of lesser relevance, presented +// mostly for informational purposes. +// (**) The result is a trade-off: it's possible to improve it by +// 10% (or by 1 cycle per round), but at the cost of 20% loss +// on Cortex-A53 (or by 4 cycles per round). +// (***) Super-impressive coefficients over gcc-generated code are +// indication of some compiler "pathology", most notably code +// generated with -mgeneral-regs-only is significantly faster +// and the gap is only 40-90%. + +#ifndef __KERNEL__ +#endif + +.text + +.globl sha256_block_data_order_nohw +.hidden sha256_block_data_order_nohw +.type sha256_block_data_order_nohw,%function +.align 6 +sha256_block_data_order_nohw: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#4*4 + + ldp w20,w21,[x0] // load context + ldp w22,w23,[x0,#2*4] + ldp w24,w25,[x0,#4*4] + add x2,x1,x2,lsl#6 // end of input + ldp w26,w27,[x0,#6*4] + adrp x30,.LK256 + add x30,x30,:lo12:.LK256 + stp x0,x2,[x29,#96] + +.Loop: + ldp w3,w4,[x1],#2*4 + ldr w19,[x30],#4 // *K++ + eor w28,w21,w22 // magic seed + str x1,[x29,#112] +#ifndef __AARCH64EB__ + rev w3,w3 // 0 +#endif + ror w16,w24,#6 + add w27,w27,w19 // h+=K[i] + eor w6,w24,w24,ror#14 + and w17,w25,w24 + bic w19,w26,w24 + add w27,w27,w3 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w20,w21 // a^b, b^c in next round + eor w16,w16,w6,ror#11 // Sigma1(e) + ror w6,w20,#2 + add w27,w27,w17 // h+=Ch(e,f,g) + eor w17,w20,w20,ror#9 + add w27,w27,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w23,w23,w27 // d+=h + eor w28,w28,w21 // Maj(a,b,c) + eor w17,w6,w17,ror#13 // Sigma0(a) + add w27,w27,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w27,w27,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w4,w4 // 1 +#endif + ldp w5,w6,[x1],#2*4 + add w27,w27,w17 // h+=Sigma0(a) + ror w16,w23,#6 + add w26,w26,w28 // h+=K[i] + eor w7,w23,w23,ror#14 + and w17,w24,w23 + bic w28,w25,w23 + add w26,w26,w4 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w27,w20 // a^b, b^c in next round + eor w16,w16,w7,ror#11 // Sigma1(e) + ror w7,w27,#2 + add w26,w26,w17 // h+=Ch(e,f,g) + eor w17,w27,w27,ror#9 + add w26,w26,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w22,w22,w26 // d+=h + eor w19,w19,w20 // Maj(a,b,c) + eor w17,w7,w17,ror#13 // Sigma0(a) + add w26,w26,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w26,w26,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w5,w5 // 2 +#endif + add w26,w26,w17 // h+=Sigma0(a) + ror w16,w22,#6 + add w25,w25,w19 // h+=K[i] + eor w8,w22,w22,ror#14 + and w17,w23,w22 + bic w19,w24,w22 + add w25,w25,w5 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w26,w27 // a^b, b^c in next round + eor w16,w16,w8,ror#11 // Sigma1(e) + ror w8,w26,#2 + add w25,w25,w17 // h+=Ch(e,f,g) + eor w17,w26,w26,ror#9 + add w25,w25,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w21,w21,w25 // d+=h + eor w28,w28,w27 // Maj(a,b,c) + eor w17,w8,w17,ror#13 // Sigma0(a) + add w25,w25,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w25,w25,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w6,w6 // 3 +#endif + ldp w7,w8,[x1],#2*4 + add w25,w25,w17 // h+=Sigma0(a) + ror w16,w21,#6 + add w24,w24,w28 // h+=K[i] + eor w9,w21,w21,ror#14 + and w17,w22,w21 + bic w28,w23,w21 + add w24,w24,w6 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w25,w26 // a^b, b^c in next round + eor w16,w16,w9,ror#11 // Sigma1(e) + ror w9,w25,#2 + add w24,w24,w17 // h+=Ch(e,f,g) + eor w17,w25,w25,ror#9 + add w24,w24,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w20,w20,w24 // d+=h + eor w19,w19,w26 // Maj(a,b,c) + eor w17,w9,w17,ror#13 // Sigma0(a) + add w24,w24,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w24,w24,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w7,w7 // 4 +#endif + add w24,w24,w17 // h+=Sigma0(a) + ror w16,w20,#6 + add w23,w23,w19 // h+=K[i] + eor w10,w20,w20,ror#14 + and w17,w21,w20 + bic w19,w22,w20 + add w23,w23,w7 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w24,w25 // a^b, b^c in next round + eor w16,w16,w10,ror#11 // Sigma1(e) + ror 
w10,w24,#2 + add w23,w23,w17 // h+=Ch(e,f,g) + eor w17,w24,w24,ror#9 + add w23,w23,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w27,w27,w23 // d+=h + eor w28,w28,w25 // Maj(a,b,c) + eor w17,w10,w17,ror#13 // Sigma0(a) + add w23,w23,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w23,w23,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w8,w8 // 5 +#endif + ldp w9,w10,[x1],#2*4 + add w23,w23,w17 // h+=Sigma0(a) + ror w16,w27,#6 + add w22,w22,w28 // h+=K[i] + eor w11,w27,w27,ror#14 + and w17,w20,w27 + bic w28,w21,w27 + add w22,w22,w8 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w23,w24 // a^b, b^c in next round + eor w16,w16,w11,ror#11 // Sigma1(e) + ror w11,w23,#2 + add w22,w22,w17 // h+=Ch(e,f,g) + eor w17,w23,w23,ror#9 + add w22,w22,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w26,w26,w22 // d+=h + eor w19,w19,w24 // Maj(a,b,c) + eor w17,w11,w17,ror#13 // Sigma0(a) + add w22,w22,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w22,w22,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w9,w9 // 6 +#endif + add w22,w22,w17 // h+=Sigma0(a) + ror w16,w26,#6 + add w21,w21,w19 // h+=K[i] + eor w12,w26,w26,ror#14 + and w17,w27,w26 + bic w19,w20,w26 + add w21,w21,w9 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w22,w23 // a^b, b^c in next round + eor w16,w16,w12,ror#11 // Sigma1(e) + ror w12,w22,#2 + add w21,w21,w17 // h+=Ch(e,f,g) + eor w17,w22,w22,ror#9 + add w21,w21,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w25,w25,w21 // d+=h + eor w28,w28,w23 // Maj(a,b,c) + eor w17,w12,w17,ror#13 // Sigma0(a) + add w21,w21,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w21,w21,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w10,w10 // 7 +#endif + ldp w11,w12,[x1],#2*4 + add w21,w21,w17 // h+=Sigma0(a) + ror w16,w25,#6 + add w20,w20,w28 // h+=K[i] + eor w13,w25,w25,ror#14 + and w17,w26,w25 + bic w28,w27,w25 + add w20,w20,w10 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w21,w22 // a^b, b^c in next round + eor w16,w16,w13,ror#11 // Sigma1(e) + ror w13,w21,#2 + add w20,w20,w17 // h+=Ch(e,f,g) + eor w17,w21,w21,ror#9 + add w20,w20,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w24,w24,w20 // d+=h + eor w19,w19,w22 // Maj(a,b,c) + eor w17,w13,w17,ror#13 // Sigma0(a) + add w20,w20,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w20,w20,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w11,w11 // 8 +#endif + add w20,w20,w17 // h+=Sigma0(a) + ror w16,w24,#6 + add w27,w27,w19 // h+=K[i] + eor w14,w24,w24,ror#14 + and w17,w25,w24 + bic w19,w26,w24 + add w27,w27,w11 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w20,w21 // a^b, b^c in next round + eor w16,w16,w14,ror#11 // Sigma1(e) + ror w14,w20,#2 + add w27,w27,w17 // h+=Ch(e,f,g) + eor w17,w20,w20,ror#9 + add w27,w27,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w23,w23,w27 // d+=h + eor w28,w28,w21 // Maj(a,b,c) + eor w17,w14,w17,ror#13 // Sigma0(a) + add w27,w27,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w27,w27,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w12,w12 // 9 +#endif + ldp w13,w14,[x1],#2*4 + add w27,w27,w17 // h+=Sigma0(a) + ror w16,w23,#6 + add w26,w26,w28 // h+=K[i] + eor w15,w23,w23,ror#14 + and w17,w24,w23 + bic w28,w25,w23 + add w26,w26,w12 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w27,w20 // a^b, b^c in next round + eor w16,w16,w15,ror#11 // Sigma1(e) + ror w15,w27,#2 + add w26,w26,w17 // h+=Ch(e,f,g) 
+ eor w17,w27,w27,ror#9 + add w26,w26,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w22,w22,w26 // d+=h + eor w19,w19,w20 // Maj(a,b,c) + eor w17,w15,w17,ror#13 // Sigma0(a) + add w26,w26,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w26,w26,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w13,w13 // 10 +#endif + add w26,w26,w17 // h+=Sigma0(a) + ror w16,w22,#6 + add w25,w25,w19 // h+=K[i] + eor w0,w22,w22,ror#14 + and w17,w23,w22 + bic w19,w24,w22 + add w25,w25,w13 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w26,w27 // a^b, b^c in next round + eor w16,w16,w0,ror#11 // Sigma1(e) + ror w0,w26,#2 + add w25,w25,w17 // h+=Ch(e,f,g) + eor w17,w26,w26,ror#9 + add w25,w25,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w21,w21,w25 // d+=h + eor w28,w28,w27 // Maj(a,b,c) + eor w17,w0,w17,ror#13 // Sigma0(a) + add w25,w25,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w25,w25,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w14,w14 // 11 +#endif + ldp w15,w0,[x1],#2*4 + add w25,w25,w17 // h+=Sigma0(a) + str w6,[sp,#12] + ror w16,w21,#6 + add w24,w24,w28 // h+=K[i] + eor w6,w21,w21,ror#14 + and w17,w22,w21 + bic w28,w23,w21 + add w24,w24,w14 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w25,w26 // a^b, b^c in next round + eor w16,w16,w6,ror#11 // Sigma1(e) + ror w6,w25,#2 + add w24,w24,w17 // h+=Ch(e,f,g) + eor w17,w25,w25,ror#9 + add w24,w24,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w20,w20,w24 // d+=h + eor w19,w19,w26 // Maj(a,b,c) + eor w17,w6,w17,ror#13 // Sigma0(a) + add w24,w24,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w24,w24,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w15,w15 // 12 +#endif + add w24,w24,w17 // h+=Sigma0(a) + str w7,[sp,#0] + ror w16,w20,#6 + add w23,w23,w19 // h+=K[i] + eor w7,w20,w20,ror#14 + and w17,w21,w20 + bic w19,w22,w20 + add w23,w23,w15 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w24,w25 // a^b, b^c in next round + eor w16,w16,w7,ror#11 // Sigma1(e) + ror w7,w24,#2 + add w23,w23,w17 // h+=Ch(e,f,g) + eor w17,w24,w24,ror#9 + add w23,w23,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w27,w27,w23 // d+=h + eor w28,w28,w25 // Maj(a,b,c) + eor w17,w7,w17,ror#13 // Sigma0(a) + add w23,w23,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w23,w23,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w0,w0 // 13 +#endif + ldp w1,w2,[x1] + add w23,w23,w17 // h+=Sigma0(a) + str w8,[sp,#4] + ror w16,w27,#6 + add w22,w22,w28 // h+=K[i] + eor w8,w27,w27,ror#14 + and w17,w20,w27 + bic w28,w21,w27 + add w22,w22,w0 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w23,w24 // a^b, b^c in next round + eor w16,w16,w8,ror#11 // Sigma1(e) + ror w8,w23,#2 + add w22,w22,w17 // h+=Ch(e,f,g) + eor w17,w23,w23,ror#9 + add w22,w22,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w26,w26,w22 // d+=h + eor w19,w19,w24 // Maj(a,b,c) + eor w17,w8,w17,ror#13 // Sigma0(a) + add w22,w22,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w22,w22,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w1,w1 // 14 +#endif + ldr w6,[sp,#12] + add w22,w22,w17 // h+=Sigma0(a) + str w9,[sp,#8] + ror w16,w26,#6 + add w21,w21,w19 // h+=K[i] + eor w9,w26,w26,ror#14 + and w17,w27,w26 + bic w19,w20,w26 + add w21,w21,w1 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w22,w23 // a^b, b^c in next round + eor w16,w16,w9,ror#11 // Sigma1(e) + ror w9,w22,#2 + add w21,w21,w17 // h+=Ch(e,f,g) + 
eor w17,w22,w22,ror#9 + add w21,w21,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w25,w25,w21 // d+=h + eor w28,w28,w23 // Maj(a,b,c) + eor w17,w9,w17,ror#13 // Sigma0(a) + add w21,w21,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w21,w21,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w2,w2 // 15 +#endif + ldr w7,[sp,#0] + add w21,w21,w17 // h+=Sigma0(a) + str w10,[sp,#12] + ror w16,w25,#6 + add w20,w20,w28 // h+=K[i] + ror w9,w4,#7 + and w17,w26,w25 + ror w8,w1,#17 + bic w28,w27,w25 + ror w10,w21,#2 + add w20,w20,w2 // h+=X[i] + eor w16,w16,w25,ror#11 + eor w9,w9,w4,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w21,w22 // a^b, b^c in next round + eor w16,w16,w25,ror#25 // Sigma1(e) + eor w10,w10,w21,ror#13 + add w20,w20,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w8,w8,w1,ror#19 + eor w9,w9,w4,lsr#3 // sigma0(X[i+1]) + add w20,w20,w16 // h+=Sigma1(e) + eor w19,w19,w22 // Maj(a,b,c) + eor w17,w10,w21,ror#22 // Sigma0(a) + eor w8,w8,w1,lsr#10 // sigma1(X[i+14]) + add w3,w3,w12 + add w24,w24,w20 // d+=h + add w20,w20,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w3,w3,w9 + add w20,w20,w17 // h+=Sigma0(a) + add w3,w3,w8 +.Loop_16_xx: + ldr w8,[sp,#4] + str w11,[sp,#0] + ror w16,w24,#6 + add w27,w27,w19 // h+=K[i] + ror w10,w5,#7 + and w17,w25,w24 + ror w9,w2,#17 + bic w19,w26,w24 + ror w11,w20,#2 + add w27,w27,w3 // h+=X[i] + eor w16,w16,w24,ror#11 + eor w10,w10,w5,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w20,w21 // a^b, b^c in next round + eor w16,w16,w24,ror#25 // Sigma1(e) + eor w11,w11,w20,ror#13 + add w27,w27,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w9,w9,w2,ror#19 + eor w10,w10,w5,lsr#3 // sigma0(X[i+1]) + add w27,w27,w16 // h+=Sigma1(e) + eor w28,w28,w21 // Maj(a,b,c) + eor w17,w11,w20,ror#22 // Sigma0(a) + eor w9,w9,w2,lsr#10 // sigma1(X[i+14]) + add w4,w4,w13 + add w23,w23,w27 // d+=h + add w27,w27,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w4,w4,w10 + add w27,w27,w17 // h+=Sigma0(a) + add w4,w4,w9 + ldr w9,[sp,#8] + str w12,[sp,#4] + ror w16,w23,#6 + add w26,w26,w28 // h+=K[i] + ror w11,w6,#7 + and w17,w24,w23 + ror w10,w3,#17 + bic w28,w25,w23 + ror w12,w27,#2 + add w26,w26,w4 // h+=X[i] + eor w16,w16,w23,ror#11 + eor w11,w11,w6,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w27,w20 // a^b, b^c in next round + eor w16,w16,w23,ror#25 // Sigma1(e) + eor w12,w12,w27,ror#13 + add w26,w26,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w10,w10,w3,ror#19 + eor w11,w11,w6,lsr#3 // sigma0(X[i+1]) + add w26,w26,w16 // h+=Sigma1(e) + eor w19,w19,w20 // Maj(a,b,c) + eor w17,w12,w27,ror#22 // Sigma0(a) + eor w10,w10,w3,lsr#10 // sigma1(X[i+14]) + add w5,w5,w14 + add w22,w22,w26 // d+=h + add w26,w26,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w5,w5,w11 + add w26,w26,w17 // h+=Sigma0(a) + add w5,w5,w10 + ldr w10,[sp,#12] + str w13,[sp,#8] + ror w16,w22,#6 + add w25,w25,w19 // h+=K[i] + ror w12,w7,#7 + and w17,w23,w22 + ror w11,w4,#17 + bic w19,w24,w22 + ror w13,w26,#2 + add w25,w25,w5 // h+=X[i] + eor w16,w16,w22,ror#11 + eor w12,w12,w7,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w26,w27 // a^b, b^c in next round + eor w16,w16,w22,ror#25 // Sigma1(e) + eor w13,w13,w26,ror#13 + add w25,w25,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w11,w11,w4,ror#19 + eor w12,w12,w7,lsr#3 // sigma0(X[i+1]) + add w25,w25,w16 // h+=Sigma1(e) + eor w28,w28,w27 // Maj(a,b,c) + eor w17,w13,w26,ror#22 // 
Sigma0(a) + eor w11,w11,w4,lsr#10 // sigma1(X[i+14]) + add w6,w6,w15 + add w21,w21,w25 // d+=h + add w25,w25,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w6,w6,w12 + add w25,w25,w17 // h+=Sigma0(a) + add w6,w6,w11 + ldr w11,[sp,#0] + str w14,[sp,#12] + ror w16,w21,#6 + add w24,w24,w28 // h+=K[i] + ror w13,w8,#7 + and w17,w22,w21 + ror w12,w5,#17 + bic w28,w23,w21 + ror w14,w25,#2 + add w24,w24,w6 // h+=X[i] + eor w16,w16,w21,ror#11 + eor w13,w13,w8,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w25,w26 // a^b, b^c in next round + eor w16,w16,w21,ror#25 // Sigma1(e) + eor w14,w14,w25,ror#13 + add w24,w24,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w12,w12,w5,ror#19 + eor w13,w13,w8,lsr#3 // sigma0(X[i+1]) + add w24,w24,w16 // h+=Sigma1(e) + eor w19,w19,w26 // Maj(a,b,c) + eor w17,w14,w25,ror#22 // Sigma0(a) + eor w12,w12,w5,lsr#10 // sigma1(X[i+14]) + add w7,w7,w0 + add w20,w20,w24 // d+=h + add w24,w24,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w7,w7,w13 + add w24,w24,w17 // h+=Sigma0(a) + add w7,w7,w12 + ldr w12,[sp,#4] + str w15,[sp,#0] + ror w16,w20,#6 + add w23,w23,w19 // h+=K[i] + ror w14,w9,#7 + and w17,w21,w20 + ror w13,w6,#17 + bic w19,w22,w20 + ror w15,w24,#2 + add w23,w23,w7 // h+=X[i] + eor w16,w16,w20,ror#11 + eor w14,w14,w9,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w24,w25 // a^b, b^c in next round + eor w16,w16,w20,ror#25 // Sigma1(e) + eor w15,w15,w24,ror#13 + add w23,w23,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w13,w13,w6,ror#19 + eor w14,w14,w9,lsr#3 // sigma0(X[i+1]) + add w23,w23,w16 // h+=Sigma1(e) + eor w28,w28,w25 // Maj(a,b,c) + eor w17,w15,w24,ror#22 // Sigma0(a) + eor w13,w13,w6,lsr#10 // sigma1(X[i+14]) + add w8,w8,w1 + add w27,w27,w23 // d+=h + add w23,w23,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w8,w8,w14 + add w23,w23,w17 // h+=Sigma0(a) + add w8,w8,w13 + ldr w13,[sp,#8] + str w0,[sp,#4] + ror w16,w27,#6 + add w22,w22,w28 // h+=K[i] + ror w15,w10,#7 + and w17,w20,w27 + ror w14,w7,#17 + bic w28,w21,w27 + ror w0,w23,#2 + add w22,w22,w8 // h+=X[i] + eor w16,w16,w27,ror#11 + eor w15,w15,w10,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w23,w24 // a^b, b^c in next round + eor w16,w16,w27,ror#25 // Sigma1(e) + eor w0,w0,w23,ror#13 + add w22,w22,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w14,w14,w7,ror#19 + eor w15,w15,w10,lsr#3 // sigma0(X[i+1]) + add w22,w22,w16 // h+=Sigma1(e) + eor w19,w19,w24 // Maj(a,b,c) + eor w17,w0,w23,ror#22 // Sigma0(a) + eor w14,w14,w7,lsr#10 // sigma1(X[i+14]) + add w9,w9,w2 + add w26,w26,w22 // d+=h + add w22,w22,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w9,w9,w15 + add w22,w22,w17 // h+=Sigma0(a) + add w9,w9,w14 + ldr w14,[sp,#12] + str w1,[sp,#8] + ror w16,w26,#6 + add w21,w21,w19 // h+=K[i] + ror w0,w11,#7 + and w17,w27,w26 + ror w15,w8,#17 + bic w19,w20,w26 + ror w1,w22,#2 + add w21,w21,w9 // h+=X[i] + eor w16,w16,w26,ror#11 + eor w0,w0,w11,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w22,w23 // a^b, b^c in next round + eor w16,w16,w26,ror#25 // Sigma1(e) + eor w1,w1,w22,ror#13 + add w21,w21,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w15,w15,w8,ror#19 + eor w0,w0,w11,lsr#3 // sigma0(X[i+1]) + add w21,w21,w16 // h+=Sigma1(e) + eor w28,w28,w23 // Maj(a,b,c) + eor w17,w1,w22,ror#22 // Sigma0(a) + eor w15,w15,w8,lsr#10 // sigma1(X[i+14]) + add w10,w10,w3 + add w25,w25,w21 // d+=h + add w21,w21,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // 
*K++, w19 in next round + add w10,w10,w0 + add w21,w21,w17 // h+=Sigma0(a) + add w10,w10,w15 + ldr w15,[sp,#0] + str w2,[sp,#12] + ror w16,w25,#6 + add w20,w20,w28 // h+=K[i] + ror w1,w12,#7 + and w17,w26,w25 + ror w0,w9,#17 + bic w28,w27,w25 + ror w2,w21,#2 + add w20,w20,w10 // h+=X[i] + eor w16,w16,w25,ror#11 + eor w1,w1,w12,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w21,w22 // a^b, b^c in next round + eor w16,w16,w25,ror#25 // Sigma1(e) + eor w2,w2,w21,ror#13 + add w20,w20,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w0,w0,w9,ror#19 + eor w1,w1,w12,lsr#3 // sigma0(X[i+1]) + add w20,w20,w16 // h+=Sigma1(e) + eor w19,w19,w22 // Maj(a,b,c) + eor w17,w2,w21,ror#22 // Sigma0(a) + eor w0,w0,w9,lsr#10 // sigma1(X[i+14]) + add w11,w11,w4 + add w24,w24,w20 // d+=h + add w20,w20,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w11,w11,w1 + add w20,w20,w17 // h+=Sigma0(a) + add w11,w11,w0 + ldr w0,[sp,#4] + str w3,[sp,#0] + ror w16,w24,#6 + add w27,w27,w19 // h+=K[i] + ror w2,w13,#7 + and w17,w25,w24 + ror w1,w10,#17 + bic w19,w26,w24 + ror w3,w20,#2 + add w27,w27,w11 // h+=X[i] + eor w16,w16,w24,ror#11 + eor w2,w2,w13,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w20,w21 // a^b, b^c in next round + eor w16,w16,w24,ror#25 // Sigma1(e) + eor w3,w3,w20,ror#13 + add w27,w27,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w1,w1,w10,ror#19 + eor w2,w2,w13,lsr#3 // sigma0(X[i+1]) + add w27,w27,w16 // h+=Sigma1(e) + eor w28,w28,w21 // Maj(a,b,c) + eor w17,w3,w20,ror#22 // Sigma0(a) + eor w1,w1,w10,lsr#10 // sigma1(X[i+14]) + add w12,w12,w5 + add w23,w23,w27 // d+=h + add w27,w27,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w12,w12,w2 + add w27,w27,w17 // h+=Sigma0(a) + add w12,w12,w1 + ldr w1,[sp,#8] + str w4,[sp,#4] + ror w16,w23,#6 + add w26,w26,w28 // h+=K[i] + ror w3,w14,#7 + and w17,w24,w23 + ror w2,w11,#17 + bic w28,w25,w23 + ror w4,w27,#2 + add w26,w26,w12 // h+=X[i] + eor w16,w16,w23,ror#11 + eor w3,w3,w14,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w27,w20 // a^b, b^c in next round + eor w16,w16,w23,ror#25 // Sigma1(e) + eor w4,w4,w27,ror#13 + add w26,w26,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w2,w2,w11,ror#19 + eor w3,w3,w14,lsr#3 // sigma0(X[i+1]) + add w26,w26,w16 // h+=Sigma1(e) + eor w19,w19,w20 // Maj(a,b,c) + eor w17,w4,w27,ror#22 // Sigma0(a) + eor w2,w2,w11,lsr#10 // sigma1(X[i+14]) + add w13,w13,w6 + add w22,w22,w26 // d+=h + add w26,w26,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w13,w13,w3 + add w26,w26,w17 // h+=Sigma0(a) + add w13,w13,w2 + ldr w2,[sp,#12] + str w5,[sp,#8] + ror w16,w22,#6 + add w25,w25,w19 // h+=K[i] + ror w4,w15,#7 + and w17,w23,w22 + ror w3,w12,#17 + bic w19,w24,w22 + ror w5,w26,#2 + add w25,w25,w13 // h+=X[i] + eor w16,w16,w22,ror#11 + eor w4,w4,w15,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w26,w27 // a^b, b^c in next round + eor w16,w16,w22,ror#25 // Sigma1(e) + eor w5,w5,w26,ror#13 + add w25,w25,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w3,w3,w12,ror#19 + eor w4,w4,w15,lsr#3 // sigma0(X[i+1]) + add w25,w25,w16 // h+=Sigma1(e) + eor w28,w28,w27 // Maj(a,b,c) + eor w17,w5,w26,ror#22 // Sigma0(a) + eor w3,w3,w12,lsr#10 // sigma1(X[i+14]) + add w14,w14,w7 + add w21,w21,w25 // d+=h + add w25,w25,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w14,w14,w4 + add w25,w25,w17 // h+=Sigma0(a) + add w14,w14,w3 + ldr w3,[sp,#0] + str w6,[sp,#12] + ror w16,w21,#6 + add w24,w24,w28 // 
h+=K[i] + ror w5,w0,#7 + and w17,w22,w21 + ror w4,w13,#17 + bic w28,w23,w21 + ror w6,w25,#2 + add w24,w24,w14 // h+=X[i] + eor w16,w16,w21,ror#11 + eor w5,w5,w0,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w25,w26 // a^b, b^c in next round + eor w16,w16,w21,ror#25 // Sigma1(e) + eor w6,w6,w25,ror#13 + add w24,w24,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w4,w4,w13,ror#19 + eor w5,w5,w0,lsr#3 // sigma0(X[i+1]) + add w24,w24,w16 // h+=Sigma1(e) + eor w19,w19,w26 // Maj(a,b,c) + eor w17,w6,w25,ror#22 // Sigma0(a) + eor w4,w4,w13,lsr#10 // sigma1(X[i+14]) + add w15,w15,w8 + add w20,w20,w24 // d+=h + add w24,w24,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w15,w15,w5 + add w24,w24,w17 // h+=Sigma0(a) + add w15,w15,w4 + ldr w4,[sp,#4] + str w7,[sp,#0] + ror w16,w20,#6 + add w23,w23,w19 // h+=K[i] + ror w6,w1,#7 + and w17,w21,w20 + ror w5,w14,#17 + bic w19,w22,w20 + ror w7,w24,#2 + add w23,w23,w15 // h+=X[i] + eor w16,w16,w20,ror#11 + eor w6,w6,w1,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w24,w25 // a^b, b^c in next round + eor w16,w16,w20,ror#25 // Sigma1(e) + eor w7,w7,w24,ror#13 + add w23,w23,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w5,w5,w14,ror#19 + eor w6,w6,w1,lsr#3 // sigma0(X[i+1]) + add w23,w23,w16 // h+=Sigma1(e) + eor w28,w28,w25 // Maj(a,b,c) + eor w17,w7,w24,ror#22 // Sigma0(a) + eor w5,w5,w14,lsr#10 // sigma1(X[i+14]) + add w0,w0,w9 + add w27,w27,w23 // d+=h + add w23,w23,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w0,w0,w6 + add w23,w23,w17 // h+=Sigma0(a) + add w0,w0,w5 + ldr w5,[sp,#8] + str w8,[sp,#4] + ror w16,w27,#6 + add w22,w22,w28 // h+=K[i] + ror w7,w2,#7 + and w17,w20,w27 + ror w6,w15,#17 + bic w28,w21,w27 + ror w8,w23,#2 + add w22,w22,w0 // h+=X[i] + eor w16,w16,w27,ror#11 + eor w7,w7,w2,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w23,w24 // a^b, b^c in next round + eor w16,w16,w27,ror#25 // Sigma1(e) + eor w8,w8,w23,ror#13 + add w22,w22,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w6,w6,w15,ror#19 + eor w7,w7,w2,lsr#3 // sigma0(X[i+1]) + add w22,w22,w16 // h+=Sigma1(e) + eor w19,w19,w24 // Maj(a,b,c) + eor w17,w8,w23,ror#22 // Sigma0(a) + eor w6,w6,w15,lsr#10 // sigma1(X[i+14]) + add w1,w1,w10 + add w26,w26,w22 // d+=h + add w22,w22,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w1,w1,w7 + add w22,w22,w17 // h+=Sigma0(a) + add w1,w1,w6 + ldr w6,[sp,#12] + str w9,[sp,#8] + ror w16,w26,#6 + add w21,w21,w19 // h+=K[i] + ror w8,w3,#7 + and w17,w27,w26 + ror w7,w0,#17 + bic w19,w20,w26 + ror w9,w22,#2 + add w21,w21,w1 // h+=X[i] + eor w16,w16,w26,ror#11 + eor w8,w8,w3,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w22,w23 // a^b, b^c in next round + eor w16,w16,w26,ror#25 // Sigma1(e) + eor w9,w9,w22,ror#13 + add w21,w21,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w7,w7,w0,ror#19 + eor w8,w8,w3,lsr#3 // sigma0(X[i+1]) + add w21,w21,w16 // h+=Sigma1(e) + eor w28,w28,w23 // Maj(a,b,c) + eor w17,w9,w22,ror#22 // Sigma0(a) + eor w7,w7,w0,lsr#10 // sigma1(X[i+14]) + add w2,w2,w11 + add w25,w25,w21 // d+=h + add w21,w21,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w2,w2,w8 + add w21,w21,w17 // h+=Sigma0(a) + add w2,w2,w7 + ldr w7,[sp,#0] + str w10,[sp,#12] + ror w16,w25,#6 + add w20,w20,w28 // h+=K[i] + ror w9,w4,#7 + and w17,w26,w25 + ror w8,w1,#17 + bic w28,w27,w25 + ror w10,w21,#2 + add w20,w20,w2 // h+=X[i] + eor w16,w16,w25,ror#11 + eor w9,w9,w4,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + 
eor w28,w21,w22 // a^b, b^c in next round + eor w16,w16,w25,ror#25 // Sigma1(e) + eor w10,w10,w21,ror#13 + add w20,w20,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w8,w8,w1,ror#19 + eor w9,w9,w4,lsr#3 // sigma0(X[i+1]) + add w20,w20,w16 // h+=Sigma1(e) + eor w19,w19,w22 // Maj(a,b,c) + eor w17,w10,w21,ror#22 // Sigma0(a) + eor w8,w8,w1,lsr#10 // sigma1(X[i+14]) + add w3,w3,w12 + add w24,w24,w20 // d+=h + add w20,w20,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w3,w3,w9 + add w20,w20,w17 // h+=Sigma0(a) + add w3,w3,w8 + cbnz w19,.Loop_16_xx + + ldp x0,x2,[x29,#96] + ldr x1,[x29,#112] + sub x30,x30,#260 // rewind + + ldp w3,w4,[x0] + ldp w5,w6,[x0,#2*4] + add x1,x1,#14*4 // advance input pointer + ldp w7,w8,[x0,#4*4] + add w20,w20,w3 + ldp w9,w10,[x0,#6*4] + add w21,w21,w4 + add w22,w22,w5 + add w23,w23,w6 + stp w20,w21,[x0] + add w24,w24,w7 + add w25,w25,w8 + stp w22,w23,[x0,#2*4] + add w26,w26,w9 + add w27,w27,w10 + cmp x1,x2 + stp w24,w25,[x0,#4*4] + stp w26,w27,[x0,#6*4] + b.ne .Loop + + ldp x19,x20,[x29,#16] + add sp,sp,#4*4 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#128 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size sha256_block_data_order_nohw,.-sha256_block_data_order_nohw + +.section .rodata +.align 6 +.type .LK256,%object +.LK256: +.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 +.long 0 //terminator +.size .LK256,.-.LK256 +.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 2 +.text +#ifndef __KERNEL__ +.globl sha256_block_data_order_hw +.hidden sha256_block_data_order_hw +.type sha256_block_data_order_hw,%function +.align 6 +sha256_block_data_order_hw: + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + AARCH64_VALID_CALL_TARGET + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 + + ld1 {v0.4s,v1.4s},[x0] + adrp x3,.LK256 + add x3,x3,:lo12:.LK256 + +.Loop_hw: + ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 + sub x2,x2,#1 + ld1 {v16.4s},[x3],#16 + rev32 v4.16b,v4.16b + rev32 v5.16b,v5.16b + rev32 v6.16b,v6.16b + rev32 v7.16b,v7.16b + orr v18.16b,v0.16b,v0.16b // offload + orr v19.16b,v1.16b,v1.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s +.inst 0x5e2828a4 //sha256su0 v4.16b,v5.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.inst 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s +.inst 0x5e2828c5 //sha256su0 v5.16b,v6.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.inst 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s +.inst 0x5e2828e6 //sha256su0 v6.16b,v7.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.inst 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s +.inst 0x5e282887 //sha256su0 v7.16b,v4.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.inst 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s +.inst 0x5e2828a4 //sha256su0 v4.16b,v5.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.inst 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s +.inst 0x5e2828c5 //sha256su0 v5.16b,v6.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.inst 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s +.inst 0x5e2828e6 //sha256su0 v6.16b,v7.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.inst 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s +.inst 0x5e282887 //sha256su0 v7.16b,v4.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.inst 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s +.inst 0x5e2828a4 //sha256su0 v4.16b,v5.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.inst 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s +.inst 0x5e2828c5 //sha256su0 v5.16b,v6.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.inst 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s +.inst 0x5e2828e6 //sha256su0 v6.16b,v7.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.inst 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s +.inst 0x5e282887 //sha256su0 v7.16b,v4.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.inst 
0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s + orr v2.16b,v0.16b,v0.16b +.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s + orr v2.16b,v0.16b,v0.16b +.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + + ld1 {v17.4s},[x3] + add v16.4s,v16.4s,v6.4s + sub x3,x3,#64*4-16 // rewind + orr v2.16b,v0.16b,v0.16b +.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + + add v17.4s,v17.4s,v7.4s + orr v2.16b,v0.16b,v0.16b +.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + + add v0.4s,v0.4s,v18.4s + add v1.4s,v1.4s,v19.4s + + cbnz x2,.Loop_hw + + st1 {v0.4s,v1.4s},[x0] + + ldr x29,[sp],#16 + ret +.size sha256_block_data_order_hw,.-sha256_block_data_order_hw +#endif +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__) diff --git a/ring-0.17.14/pregenerated/sha256-armv8-win64.S b/ring-0.17.14/pregenerated/sha256-armv8-win64.S new file mode 100644 index 0000000000..3049ef3eb1 --- /dev/null +++ b/ring-0.17.14/pregenerated/sha256-armv8-win64.S @@ -0,0 +1,1199 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32) +// Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// ==================================================================== +// Written by Andy Polyakov for the OpenSSL +// project. +// ==================================================================== +// +// SHA256/512 for ARMv8. +// +// Performance in cycles per processed byte and improvement coefficient +// over code generated with "default" compiler: +// +// SHA256-hw SHA256(*) SHA512 +// Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**)) +// Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***)) +// Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***)) +// Denver 2.01 10.5 (+26%) 6.70 (+8%) +// X-Gene 20.0 (+100%) 12.8 (+300%(***)) +// Mongoose 2.36 13.0 (+50%) 8.36 (+33%) +// Kryo 1.92 17.4 (+30%) 11.2 (+8%) +// +// (*) Software SHA256 results are of lesser relevance, presented +// mostly for informational purposes. +// (**) The result is a trade-off: it's possible to improve it by +// 10% (or by 1 cycle per round), but at the cost of 20% loss +// on Cortex-A53 (or by 4 cycles per round). +// (***) Super-impressive coefficients over gcc-generated code are +// indication of some compiler "pathology", most notably code +// generated with -mgeneral-regs-only is significantly faster +// and the gap is only 40-90%. 
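For reference while reading the scalar sha256_block_data_order_nohw routine below: its per-instruction comments (Sigma1(e), Ch(e,f,g), Maj(a,b,c), sigma0, sigma1) name the standard FIPS 180-4 SHA-256 building blocks. The following minimal Rust sketch shows what one compression round and the Loop_16_xx message-schedule extension compute; it is illustrative only, the helper names are mine, and it is not code from the ring crate or BoringSSL.

// Illustrative FIPS 180-4 SHA-256 round logic; not part of this file or of ring.
fn big_sigma0(x: u32) -> u32 { x.rotate_right(2) ^ x.rotate_right(13) ^ x.rotate_right(22) }
fn big_sigma1(x: u32) -> u32 { x.rotate_right(6) ^ x.rotate_right(11) ^ x.rotate_right(25) }
fn small_sigma0(x: u32) -> u32 { x.rotate_right(7) ^ x.rotate_right(18) ^ (x >> 3) }
fn small_sigma1(x: u32) -> u32 { x.rotate_right(17) ^ x.rotate_right(19) ^ (x >> 10) }
fn ch(e: u32, f: u32, g: u32) -> u32 { (e & f) ^ (!e & g) }
fn maj(a: u32, b: u32, c: u32) -> u32 { (a & b) ^ (a & c) ^ (b & c) }

// One round: state is [a,b,c,d,e,f,g,h], k is K256[i], w is W[i].
fn round(state: &mut [u32; 8], k: u32, w: u32) {
    let [a, b, c, d, e, f, g, h] = *state;
    let t1 = h
        .wrapping_add(big_sigma1(e))
        .wrapping_add(ch(e, f, g))
        .wrapping_add(k)
        .wrapping_add(w);
    let t2 = big_sigma0(a).wrapping_add(maj(a, b, c));
    *state = [t1.wrapping_add(t2), a, b, c, d.wrapping_add(t1), e, f, g];
}

// Message-schedule extension for rounds 16..63 (the Loop_16_xx section below).
fn extend(w: &mut [u32; 64]) {
    for i in 16..64 {
        w[i] = small_sigma1(w[i - 2])
            .wrapping_add(w[i - 7])
            .wrapping_add(small_sigma0(w[i - 15]))
            .wrapping_add(w[i - 16]);
    }
}

In the assembly, the eight working variables a..h live in w20-w27, while the 16-word schedule window rotates through w3-w15 and w0-w2 with four spill slots at sp+0..sp+12, so the schedule extension is interleaved with the rounds instead of being precomputed.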
+ +#ifndef __KERNEL__ +#endif + +.text + +.globl sha256_block_data_order_nohw + +.def sha256_block_data_order_nohw + .type 32 +.endef +.align 6 +sha256_block_data_order_nohw: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#4*4 + + ldp w20,w21,[x0] // load context + ldp w22,w23,[x0,#2*4] + ldp w24,w25,[x0,#4*4] + add x2,x1,x2,lsl#6 // end of input + ldp w26,w27,[x0,#6*4] + adrp x30,LK256 + add x30,x30,:lo12:LK256 + stp x0,x2,[x29,#96] + +Loop: + ldp w3,w4,[x1],#2*4 + ldr w19,[x30],#4 // *K++ + eor w28,w21,w22 // magic seed + str x1,[x29,#112] +#ifndef __AARCH64EB__ + rev w3,w3 // 0 +#endif + ror w16,w24,#6 + add w27,w27,w19 // h+=K[i] + eor w6,w24,w24,ror#14 + and w17,w25,w24 + bic w19,w26,w24 + add w27,w27,w3 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w20,w21 // a^b, b^c in next round + eor w16,w16,w6,ror#11 // Sigma1(e) + ror w6,w20,#2 + add w27,w27,w17 // h+=Ch(e,f,g) + eor w17,w20,w20,ror#9 + add w27,w27,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w23,w23,w27 // d+=h + eor w28,w28,w21 // Maj(a,b,c) + eor w17,w6,w17,ror#13 // Sigma0(a) + add w27,w27,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w27,w27,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w4,w4 // 1 +#endif + ldp w5,w6,[x1],#2*4 + add w27,w27,w17 // h+=Sigma0(a) + ror w16,w23,#6 + add w26,w26,w28 // h+=K[i] + eor w7,w23,w23,ror#14 + and w17,w24,w23 + bic w28,w25,w23 + add w26,w26,w4 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w27,w20 // a^b, b^c in next round + eor w16,w16,w7,ror#11 // Sigma1(e) + ror w7,w27,#2 + add w26,w26,w17 // h+=Ch(e,f,g) + eor w17,w27,w27,ror#9 + add w26,w26,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w22,w22,w26 // d+=h + eor w19,w19,w20 // Maj(a,b,c) + eor w17,w7,w17,ror#13 // Sigma0(a) + add w26,w26,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w26,w26,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w5,w5 // 2 +#endif + add w26,w26,w17 // h+=Sigma0(a) + ror w16,w22,#6 + add w25,w25,w19 // h+=K[i] + eor w8,w22,w22,ror#14 + and w17,w23,w22 + bic w19,w24,w22 + add w25,w25,w5 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w26,w27 // a^b, b^c in next round + eor w16,w16,w8,ror#11 // Sigma1(e) + ror w8,w26,#2 + add w25,w25,w17 // h+=Ch(e,f,g) + eor w17,w26,w26,ror#9 + add w25,w25,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w21,w21,w25 // d+=h + eor w28,w28,w27 // Maj(a,b,c) + eor w17,w8,w17,ror#13 // Sigma0(a) + add w25,w25,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w25,w25,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w6,w6 // 3 +#endif + ldp w7,w8,[x1],#2*4 + add w25,w25,w17 // h+=Sigma0(a) + ror w16,w21,#6 + add w24,w24,w28 // h+=K[i] + eor w9,w21,w21,ror#14 + and w17,w22,w21 + bic w28,w23,w21 + add w24,w24,w6 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w25,w26 // a^b, b^c in next round + eor w16,w16,w9,ror#11 // Sigma1(e) + ror w9,w25,#2 + add w24,w24,w17 // h+=Ch(e,f,g) + eor w17,w25,w25,ror#9 + add w24,w24,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w20,w20,w24 // d+=h + eor w19,w19,w26 // Maj(a,b,c) + eor w17,w9,w17,ror#13 // Sigma0(a) + add w24,w24,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w24,w24,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w7,w7 // 4 +#endif + add w24,w24,w17 // h+=Sigma0(a) + ror w16,w20,#6 + add 
w23,w23,w19 // h+=K[i] + eor w10,w20,w20,ror#14 + and w17,w21,w20 + bic w19,w22,w20 + add w23,w23,w7 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w24,w25 // a^b, b^c in next round + eor w16,w16,w10,ror#11 // Sigma1(e) + ror w10,w24,#2 + add w23,w23,w17 // h+=Ch(e,f,g) + eor w17,w24,w24,ror#9 + add w23,w23,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w27,w27,w23 // d+=h + eor w28,w28,w25 // Maj(a,b,c) + eor w17,w10,w17,ror#13 // Sigma0(a) + add w23,w23,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w23,w23,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w8,w8 // 5 +#endif + ldp w9,w10,[x1],#2*4 + add w23,w23,w17 // h+=Sigma0(a) + ror w16,w27,#6 + add w22,w22,w28 // h+=K[i] + eor w11,w27,w27,ror#14 + and w17,w20,w27 + bic w28,w21,w27 + add w22,w22,w8 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w23,w24 // a^b, b^c in next round + eor w16,w16,w11,ror#11 // Sigma1(e) + ror w11,w23,#2 + add w22,w22,w17 // h+=Ch(e,f,g) + eor w17,w23,w23,ror#9 + add w22,w22,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w26,w26,w22 // d+=h + eor w19,w19,w24 // Maj(a,b,c) + eor w17,w11,w17,ror#13 // Sigma0(a) + add w22,w22,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w22,w22,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w9,w9 // 6 +#endif + add w22,w22,w17 // h+=Sigma0(a) + ror w16,w26,#6 + add w21,w21,w19 // h+=K[i] + eor w12,w26,w26,ror#14 + and w17,w27,w26 + bic w19,w20,w26 + add w21,w21,w9 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w22,w23 // a^b, b^c in next round + eor w16,w16,w12,ror#11 // Sigma1(e) + ror w12,w22,#2 + add w21,w21,w17 // h+=Ch(e,f,g) + eor w17,w22,w22,ror#9 + add w21,w21,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w25,w25,w21 // d+=h + eor w28,w28,w23 // Maj(a,b,c) + eor w17,w12,w17,ror#13 // Sigma0(a) + add w21,w21,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w21,w21,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w10,w10 // 7 +#endif + ldp w11,w12,[x1],#2*4 + add w21,w21,w17 // h+=Sigma0(a) + ror w16,w25,#6 + add w20,w20,w28 // h+=K[i] + eor w13,w25,w25,ror#14 + and w17,w26,w25 + bic w28,w27,w25 + add w20,w20,w10 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w21,w22 // a^b, b^c in next round + eor w16,w16,w13,ror#11 // Sigma1(e) + ror w13,w21,#2 + add w20,w20,w17 // h+=Ch(e,f,g) + eor w17,w21,w21,ror#9 + add w20,w20,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w24,w24,w20 // d+=h + eor w19,w19,w22 // Maj(a,b,c) + eor w17,w13,w17,ror#13 // Sigma0(a) + add w20,w20,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w20,w20,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w11,w11 // 8 +#endif + add w20,w20,w17 // h+=Sigma0(a) + ror w16,w24,#6 + add w27,w27,w19 // h+=K[i] + eor w14,w24,w24,ror#14 + and w17,w25,w24 + bic w19,w26,w24 + add w27,w27,w11 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w20,w21 // a^b, b^c in next round + eor w16,w16,w14,ror#11 // Sigma1(e) + ror w14,w20,#2 + add w27,w27,w17 // h+=Ch(e,f,g) + eor w17,w20,w20,ror#9 + add w27,w27,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w23,w23,w27 // d+=h + eor w28,w28,w21 // Maj(a,b,c) + eor w17,w14,w17,ror#13 // Sigma0(a) + add w27,w27,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w27,w27,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w12,w12 // 9 +#endif + ldp w13,w14,[x1],#2*4 + add w27,w27,w17 // h+=Sigma0(a) + ror w16,w23,#6 + add w26,w26,w28 // h+=K[i] + eor 
w15,w23,w23,ror#14 + and w17,w24,w23 + bic w28,w25,w23 + add w26,w26,w12 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w27,w20 // a^b, b^c in next round + eor w16,w16,w15,ror#11 // Sigma1(e) + ror w15,w27,#2 + add w26,w26,w17 // h+=Ch(e,f,g) + eor w17,w27,w27,ror#9 + add w26,w26,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w22,w22,w26 // d+=h + eor w19,w19,w20 // Maj(a,b,c) + eor w17,w15,w17,ror#13 // Sigma0(a) + add w26,w26,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w26,w26,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w13,w13 // 10 +#endif + add w26,w26,w17 // h+=Sigma0(a) + ror w16,w22,#6 + add w25,w25,w19 // h+=K[i] + eor w0,w22,w22,ror#14 + and w17,w23,w22 + bic w19,w24,w22 + add w25,w25,w13 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w26,w27 // a^b, b^c in next round + eor w16,w16,w0,ror#11 // Sigma1(e) + ror w0,w26,#2 + add w25,w25,w17 // h+=Ch(e,f,g) + eor w17,w26,w26,ror#9 + add w25,w25,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w21,w21,w25 // d+=h + eor w28,w28,w27 // Maj(a,b,c) + eor w17,w0,w17,ror#13 // Sigma0(a) + add w25,w25,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w25,w25,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w14,w14 // 11 +#endif + ldp w15,w0,[x1],#2*4 + add w25,w25,w17 // h+=Sigma0(a) + str w6,[sp,#12] + ror w16,w21,#6 + add w24,w24,w28 // h+=K[i] + eor w6,w21,w21,ror#14 + and w17,w22,w21 + bic w28,w23,w21 + add w24,w24,w14 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w25,w26 // a^b, b^c in next round + eor w16,w16,w6,ror#11 // Sigma1(e) + ror w6,w25,#2 + add w24,w24,w17 // h+=Ch(e,f,g) + eor w17,w25,w25,ror#9 + add w24,w24,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w20,w20,w24 // d+=h + eor w19,w19,w26 // Maj(a,b,c) + eor w17,w6,w17,ror#13 // Sigma0(a) + add w24,w24,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w24,w24,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w15,w15 // 12 +#endif + add w24,w24,w17 // h+=Sigma0(a) + str w7,[sp,#0] + ror w16,w20,#6 + add w23,w23,w19 // h+=K[i] + eor w7,w20,w20,ror#14 + and w17,w21,w20 + bic w19,w22,w20 + add w23,w23,w15 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w24,w25 // a^b, b^c in next round + eor w16,w16,w7,ror#11 // Sigma1(e) + ror w7,w24,#2 + add w23,w23,w17 // h+=Ch(e,f,g) + eor w17,w24,w24,ror#9 + add w23,w23,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w27,w27,w23 // d+=h + eor w28,w28,w25 // Maj(a,b,c) + eor w17,w7,w17,ror#13 // Sigma0(a) + add w23,w23,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w23,w23,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w0,w0 // 13 +#endif + ldp w1,w2,[x1] + add w23,w23,w17 // h+=Sigma0(a) + str w8,[sp,#4] + ror w16,w27,#6 + add w22,w22,w28 // h+=K[i] + eor w8,w27,w27,ror#14 + and w17,w20,w27 + bic w28,w21,w27 + add w22,w22,w0 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w23,w24 // a^b, b^c in next round + eor w16,w16,w8,ror#11 // Sigma1(e) + ror w8,w23,#2 + add w22,w22,w17 // h+=Ch(e,f,g) + eor w17,w23,w23,ror#9 + add w22,w22,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w26,w26,w22 // d+=h + eor w19,w19,w24 // Maj(a,b,c) + eor w17,w8,w17,ror#13 // Sigma0(a) + add w22,w22,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w22,w22,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w1,w1 // 14 +#endif + ldr w6,[sp,#12] + add w22,w22,w17 // h+=Sigma0(a) + str w9,[sp,#8] + ror w16,w26,#6 + add w21,w21,w19 // h+=K[i] + 
eor w9,w26,w26,ror#14 + and w17,w27,w26 + bic w19,w20,w26 + add w21,w21,w1 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w22,w23 // a^b, b^c in next round + eor w16,w16,w9,ror#11 // Sigma1(e) + ror w9,w22,#2 + add w21,w21,w17 // h+=Ch(e,f,g) + eor w17,w22,w22,ror#9 + add w21,w21,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w25,w25,w21 // d+=h + eor w28,w28,w23 // Maj(a,b,c) + eor w17,w9,w17,ror#13 // Sigma0(a) + add w21,w21,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w21,w21,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w2,w2 // 15 +#endif + ldr w7,[sp,#0] + add w21,w21,w17 // h+=Sigma0(a) + str w10,[sp,#12] + ror w16,w25,#6 + add w20,w20,w28 // h+=K[i] + ror w9,w4,#7 + and w17,w26,w25 + ror w8,w1,#17 + bic w28,w27,w25 + ror w10,w21,#2 + add w20,w20,w2 // h+=X[i] + eor w16,w16,w25,ror#11 + eor w9,w9,w4,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w21,w22 // a^b, b^c in next round + eor w16,w16,w25,ror#25 // Sigma1(e) + eor w10,w10,w21,ror#13 + add w20,w20,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w8,w8,w1,ror#19 + eor w9,w9,w4,lsr#3 // sigma0(X[i+1]) + add w20,w20,w16 // h+=Sigma1(e) + eor w19,w19,w22 // Maj(a,b,c) + eor w17,w10,w21,ror#22 // Sigma0(a) + eor w8,w8,w1,lsr#10 // sigma1(X[i+14]) + add w3,w3,w12 + add w24,w24,w20 // d+=h + add w20,w20,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w3,w3,w9 + add w20,w20,w17 // h+=Sigma0(a) + add w3,w3,w8 +Loop_16_xx: + ldr w8,[sp,#4] + str w11,[sp,#0] + ror w16,w24,#6 + add w27,w27,w19 // h+=K[i] + ror w10,w5,#7 + and w17,w25,w24 + ror w9,w2,#17 + bic w19,w26,w24 + ror w11,w20,#2 + add w27,w27,w3 // h+=X[i] + eor w16,w16,w24,ror#11 + eor w10,w10,w5,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w20,w21 // a^b, b^c in next round + eor w16,w16,w24,ror#25 // Sigma1(e) + eor w11,w11,w20,ror#13 + add w27,w27,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w9,w9,w2,ror#19 + eor w10,w10,w5,lsr#3 // sigma0(X[i+1]) + add w27,w27,w16 // h+=Sigma1(e) + eor w28,w28,w21 // Maj(a,b,c) + eor w17,w11,w20,ror#22 // Sigma0(a) + eor w9,w9,w2,lsr#10 // sigma1(X[i+14]) + add w4,w4,w13 + add w23,w23,w27 // d+=h + add w27,w27,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w4,w4,w10 + add w27,w27,w17 // h+=Sigma0(a) + add w4,w4,w9 + ldr w9,[sp,#8] + str w12,[sp,#4] + ror w16,w23,#6 + add w26,w26,w28 // h+=K[i] + ror w11,w6,#7 + and w17,w24,w23 + ror w10,w3,#17 + bic w28,w25,w23 + ror w12,w27,#2 + add w26,w26,w4 // h+=X[i] + eor w16,w16,w23,ror#11 + eor w11,w11,w6,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w27,w20 // a^b, b^c in next round + eor w16,w16,w23,ror#25 // Sigma1(e) + eor w12,w12,w27,ror#13 + add w26,w26,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w10,w10,w3,ror#19 + eor w11,w11,w6,lsr#3 // sigma0(X[i+1]) + add w26,w26,w16 // h+=Sigma1(e) + eor w19,w19,w20 // Maj(a,b,c) + eor w17,w12,w27,ror#22 // Sigma0(a) + eor w10,w10,w3,lsr#10 // sigma1(X[i+14]) + add w5,w5,w14 + add w22,w22,w26 // d+=h + add w26,w26,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w5,w5,w11 + add w26,w26,w17 // h+=Sigma0(a) + add w5,w5,w10 + ldr w10,[sp,#12] + str w13,[sp,#8] + ror w16,w22,#6 + add w25,w25,w19 // h+=K[i] + ror w12,w7,#7 + and w17,w23,w22 + ror w11,w4,#17 + bic w19,w24,w22 + ror w13,w26,#2 + add w25,w25,w5 // h+=X[i] + eor w16,w16,w22,ror#11 + eor w12,w12,w7,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w26,w27 // a^b, b^c in next round + eor w16,w16,w22,ror#25 // Sigma1(e) + eor 
w13,w13,w26,ror#13 + add w25,w25,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w11,w11,w4,ror#19 + eor w12,w12,w7,lsr#3 // sigma0(X[i+1]) + add w25,w25,w16 // h+=Sigma1(e) + eor w28,w28,w27 // Maj(a,b,c) + eor w17,w13,w26,ror#22 // Sigma0(a) + eor w11,w11,w4,lsr#10 // sigma1(X[i+14]) + add w6,w6,w15 + add w21,w21,w25 // d+=h + add w25,w25,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w6,w6,w12 + add w25,w25,w17 // h+=Sigma0(a) + add w6,w6,w11 + ldr w11,[sp,#0] + str w14,[sp,#12] + ror w16,w21,#6 + add w24,w24,w28 // h+=K[i] + ror w13,w8,#7 + and w17,w22,w21 + ror w12,w5,#17 + bic w28,w23,w21 + ror w14,w25,#2 + add w24,w24,w6 // h+=X[i] + eor w16,w16,w21,ror#11 + eor w13,w13,w8,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w25,w26 // a^b, b^c in next round + eor w16,w16,w21,ror#25 // Sigma1(e) + eor w14,w14,w25,ror#13 + add w24,w24,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w12,w12,w5,ror#19 + eor w13,w13,w8,lsr#3 // sigma0(X[i+1]) + add w24,w24,w16 // h+=Sigma1(e) + eor w19,w19,w26 // Maj(a,b,c) + eor w17,w14,w25,ror#22 // Sigma0(a) + eor w12,w12,w5,lsr#10 // sigma1(X[i+14]) + add w7,w7,w0 + add w20,w20,w24 // d+=h + add w24,w24,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w7,w7,w13 + add w24,w24,w17 // h+=Sigma0(a) + add w7,w7,w12 + ldr w12,[sp,#4] + str w15,[sp,#0] + ror w16,w20,#6 + add w23,w23,w19 // h+=K[i] + ror w14,w9,#7 + and w17,w21,w20 + ror w13,w6,#17 + bic w19,w22,w20 + ror w15,w24,#2 + add w23,w23,w7 // h+=X[i] + eor w16,w16,w20,ror#11 + eor w14,w14,w9,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w24,w25 // a^b, b^c in next round + eor w16,w16,w20,ror#25 // Sigma1(e) + eor w15,w15,w24,ror#13 + add w23,w23,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w13,w13,w6,ror#19 + eor w14,w14,w9,lsr#3 // sigma0(X[i+1]) + add w23,w23,w16 // h+=Sigma1(e) + eor w28,w28,w25 // Maj(a,b,c) + eor w17,w15,w24,ror#22 // Sigma0(a) + eor w13,w13,w6,lsr#10 // sigma1(X[i+14]) + add w8,w8,w1 + add w27,w27,w23 // d+=h + add w23,w23,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w8,w8,w14 + add w23,w23,w17 // h+=Sigma0(a) + add w8,w8,w13 + ldr w13,[sp,#8] + str w0,[sp,#4] + ror w16,w27,#6 + add w22,w22,w28 // h+=K[i] + ror w15,w10,#7 + and w17,w20,w27 + ror w14,w7,#17 + bic w28,w21,w27 + ror w0,w23,#2 + add w22,w22,w8 // h+=X[i] + eor w16,w16,w27,ror#11 + eor w15,w15,w10,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w23,w24 // a^b, b^c in next round + eor w16,w16,w27,ror#25 // Sigma1(e) + eor w0,w0,w23,ror#13 + add w22,w22,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w14,w14,w7,ror#19 + eor w15,w15,w10,lsr#3 // sigma0(X[i+1]) + add w22,w22,w16 // h+=Sigma1(e) + eor w19,w19,w24 // Maj(a,b,c) + eor w17,w0,w23,ror#22 // Sigma0(a) + eor w14,w14,w7,lsr#10 // sigma1(X[i+14]) + add w9,w9,w2 + add w26,w26,w22 // d+=h + add w22,w22,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w9,w9,w15 + add w22,w22,w17 // h+=Sigma0(a) + add w9,w9,w14 + ldr w14,[sp,#12] + str w1,[sp,#8] + ror w16,w26,#6 + add w21,w21,w19 // h+=K[i] + ror w0,w11,#7 + and w17,w27,w26 + ror w15,w8,#17 + bic w19,w20,w26 + ror w1,w22,#2 + add w21,w21,w9 // h+=X[i] + eor w16,w16,w26,ror#11 + eor w0,w0,w11,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w22,w23 // a^b, b^c in next round + eor w16,w16,w26,ror#25 // Sigma1(e) + eor w1,w1,w22,ror#13 + add w21,w21,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w15,w15,w8,ror#19 + eor w0,w0,w11,lsr#3 // sigma0(X[i+1]) + 
add w21,w21,w16 // h+=Sigma1(e) + eor w28,w28,w23 // Maj(a,b,c) + eor w17,w1,w22,ror#22 // Sigma0(a) + eor w15,w15,w8,lsr#10 // sigma1(X[i+14]) + add w10,w10,w3 + add w25,w25,w21 // d+=h + add w21,w21,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w10,w10,w0 + add w21,w21,w17 // h+=Sigma0(a) + add w10,w10,w15 + ldr w15,[sp,#0] + str w2,[sp,#12] + ror w16,w25,#6 + add w20,w20,w28 // h+=K[i] + ror w1,w12,#7 + and w17,w26,w25 + ror w0,w9,#17 + bic w28,w27,w25 + ror w2,w21,#2 + add w20,w20,w10 // h+=X[i] + eor w16,w16,w25,ror#11 + eor w1,w1,w12,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w21,w22 // a^b, b^c in next round + eor w16,w16,w25,ror#25 // Sigma1(e) + eor w2,w2,w21,ror#13 + add w20,w20,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w0,w0,w9,ror#19 + eor w1,w1,w12,lsr#3 // sigma0(X[i+1]) + add w20,w20,w16 // h+=Sigma1(e) + eor w19,w19,w22 // Maj(a,b,c) + eor w17,w2,w21,ror#22 // Sigma0(a) + eor w0,w0,w9,lsr#10 // sigma1(X[i+14]) + add w11,w11,w4 + add w24,w24,w20 // d+=h + add w20,w20,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w11,w11,w1 + add w20,w20,w17 // h+=Sigma0(a) + add w11,w11,w0 + ldr w0,[sp,#4] + str w3,[sp,#0] + ror w16,w24,#6 + add w27,w27,w19 // h+=K[i] + ror w2,w13,#7 + and w17,w25,w24 + ror w1,w10,#17 + bic w19,w26,w24 + ror w3,w20,#2 + add w27,w27,w11 // h+=X[i] + eor w16,w16,w24,ror#11 + eor w2,w2,w13,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w20,w21 // a^b, b^c in next round + eor w16,w16,w24,ror#25 // Sigma1(e) + eor w3,w3,w20,ror#13 + add w27,w27,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w1,w1,w10,ror#19 + eor w2,w2,w13,lsr#3 // sigma0(X[i+1]) + add w27,w27,w16 // h+=Sigma1(e) + eor w28,w28,w21 // Maj(a,b,c) + eor w17,w3,w20,ror#22 // Sigma0(a) + eor w1,w1,w10,lsr#10 // sigma1(X[i+14]) + add w12,w12,w5 + add w23,w23,w27 // d+=h + add w27,w27,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w12,w12,w2 + add w27,w27,w17 // h+=Sigma0(a) + add w12,w12,w1 + ldr w1,[sp,#8] + str w4,[sp,#4] + ror w16,w23,#6 + add w26,w26,w28 // h+=K[i] + ror w3,w14,#7 + and w17,w24,w23 + ror w2,w11,#17 + bic w28,w25,w23 + ror w4,w27,#2 + add w26,w26,w12 // h+=X[i] + eor w16,w16,w23,ror#11 + eor w3,w3,w14,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w27,w20 // a^b, b^c in next round + eor w16,w16,w23,ror#25 // Sigma1(e) + eor w4,w4,w27,ror#13 + add w26,w26,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w2,w2,w11,ror#19 + eor w3,w3,w14,lsr#3 // sigma0(X[i+1]) + add w26,w26,w16 // h+=Sigma1(e) + eor w19,w19,w20 // Maj(a,b,c) + eor w17,w4,w27,ror#22 // Sigma0(a) + eor w2,w2,w11,lsr#10 // sigma1(X[i+14]) + add w13,w13,w6 + add w22,w22,w26 // d+=h + add w26,w26,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w13,w13,w3 + add w26,w26,w17 // h+=Sigma0(a) + add w13,w13,w2 + ldr w2,[sp,#12] + str w5,[sp,#8] + ror w16,w22,#6 + add w25,w25,w19 // h+=K[i] + ror w4,w15,#7 + and w17,w23,w22 + ror w3,w12,#17 + bic w19,w24,w22 + ror w5,w26,#2 + add w25,w25,w13 // h+=X[i] + eor w16,w16,w22,ror#11 + eor w4,w4,w15,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w26,w27 // a^b, b^c in next round + eor w16,w16,w22,ror#25 // Sigma1(e) + eor w5,w5,w26,ror#13 + add w25,w25,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w3,w3,w12,ror#19 + eor w4,w4,w15,lsr#3 // sigma0(X[i+1]) + add w25,w25,w16 // h+=Sigma1(e) + eor w28,w28,w27 // Maj(a,b,c) + eor w17,w5,w26,ror#22 // Sigma0(a) + eor w3,w3,w12,lsr#10 // sigma1(X[i+14]) + add w14,w14,w7 + add 
w21,w21,w25 // d+=h + add w25,w25,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w14,w14,w4 + add w25,w25,w17 // h+=Sigma0(a) + add w14,w14,w3 + ldr w3,[sp,#0] + str w6,[sp,#12] + ror w16,w21,#6 + add w24,w24,w28 // h+=K[i] + ror w5,w0,#7 + and w17,w22,w21 + ror w4,w13,#17 + bic w28,w23,w21 + ror w6,w25,#2 + add w24,w24,w14 // h+=X[i] + eor w16,w16,w21,ror#11 + eor w5,w5,w0,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w25,w26 // a^b, b^c in next round + eor w16,w16,w21,ror#25 // Sigma1(e) + eor w6,w6,w25,ror#13 + add w24,w24,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w4,w4,w13,ror#19 + eor w5,w5,w0,lsr#3 // sigma0(X[i+1]) + add w24,w24,w16 // h+=Sigma1(e) + eor w19,w19,w26 // Maj(a,b,c) + eor w17,w6,w25,ror#22 // Sigma0(a) + eor w4,w4,w13,lsr#10 // sigma1(X[i+14]) + add w15,w15,w8 + add w20,w20,w24 // d+=h + add w24,w24,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w15,w15,w5 + add w24,w24,w17 // h+=Sigma0(a) + add w15,w15,w4 + ldr w4,[sp,#4] + str w7,[sp,#0] + ror w16,w20,#6 + add w23,w23,w19 // h+=K[i] + ror w6,w1,#7 + and w17,w21,w20 + ror w5,w14,#17 + bic w19,w22,w20 + ror w7,w24,#2 + add w23,w23,w15 // h+=X[i] + eor w16,w16,w20,ror#11 + eor w6,w6,w1,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w24,w25 // a^b, b^c in next round + eor w16,w16,w20,ror#25 // Sigma1(e) + eor w7,w7,w24,ror#13 + add w23,w23,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w5,w5,w14,ror#19 + eor w6,w6,w1,lsr#3 // sigma0(X[i+1]) + add w23,w23,w16 // h+=Sigma1(e) + eor w28,w28,w25 // Maj(a,b,c) + eor w17,w7,w24,ror#22 // Sigma0(a) + eor w5,w5,w14,lsr#10 // sigma1(X[i+14]) + add w0,w0,w9 + add w27,w27,w23 // d+=h + add w23,w23,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w0,w0,w6 + add w23,w23,w17 // h+=Sigma0(a) + add w0,w0,w5 + ldr w5,[sp,#8] + str w8,[sp,#4] + ror w16,w27,#6 + add w22,w22,w28 // h+=K[i] + ror w7,w2,#7 + and w17,w20,w27 + ror w6,w15,#17 + bic w28,w21,w27 + ror w8,w23,#2 + add w22,w22,w0 // h+=X[i] + eor w16,w16,w27,ror#11 + eor w7,w7,w2,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w23,w24 // a^b, b^c in next round + eor w16,w16,w27,ror#25 // Sigma1(e) + eor w8,w8,w23,ror#13 + add w22,w22,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w6,w6,w15,ror#19 + eor w7,w7,w2,lsr#3 // sigma0(X[i+1]) + add w22,w22,w16 // h+=Sigma1(e) + eor w19,w19,w24 // Maj(a,b,c) + eor w17,w8,w23,ror#22 // Sigma0(a) + eor w6,w6,w15,lsr#10 // sigma1(X[i+14]) + add w1,w1,w10 + add w26,w26,w22 // d+=h + add w22,w22,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w1,w1,w7 + add w22,w22,w17 // h+=Sigma0(a) + add w1,w1,w6 + ldr w6,[sp,#12] + str w9,[sp,#8] + ror w16,w26,#6 + add w21,w21,w19 // h+=K[i] + ror w8,w3,#7 + and w17,w27,w26 + ror w7,w0,#17 + bic w19,w20,w26 + ror w9,w22,#2 + add w21,w21,w1 // h+=X[i] + eor w16,w16,w26,ror#11 + eor w8,w8,w3,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w22,w23 // a^b, b^c in next round + eor w16,w16,w26,ror#25 // Sigma1(e) + eor w9,w9,w22,ror#13 + add w21,w21,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w7,w7,w0,ror#19 + eor w8,w8,w3,lsr#3 // sigma0(X[i+1]) + add w21,w21,w16 // h+=Sigma1(e) + eor w28,w28,w23 // Maj(a,b,c) + eor w17,w9,w22,ror#22 // Sigma0(a) + eor w7,w7,w0,lsr#10 // sigma1(X[i+14]) + add w2,w2,w11 + add w25,w25,w21 // d+=h + add w21,w21,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w2,w2,w8 + add w21,w21,w17 // h+=Sigma0(a) + add w2,w2,w7 + ldr w7,[sp,#0] + str 
w10,[sp,#12] + ror w16,w25,#6 + add w20,w20,w28 // h+=K[i] + ror w9,w4,#7 + and w17,w26,w25 + ror w8,w1,#17 + bic w28,w27,w25 + ror w10,w21,#2 + add w20,w20,w2 // h+=X[i] + eor w16,w16,w25,ror#11 + eor w9,w9,w4,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w21,w22 // a^b, b^c in next round + eor w16,w16,w25,ror#25 // Sigma1(e) + eor w10,w10,w21,ror#13 + add w20,w20,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w8,w8,w1,ror#19 + eor w9,w9,w4,lsr#3 // sigma0(X[i+1]) + add w20,w20,w16 // h+=Sigma1(e) + eor w19,w19,w22 // Maj(a,b,c) + eor w17,w10,w21,ror#22 // Sigma0(a) + eor w8,w8,w1,lsr#10 // sigma1(X[i+14]) + add w3,w3,w12 + add w24,w24,w20 // d+=h + add w20,w20,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w3,w3,w9 + add w20,w20,w17 // h+=Sigma0(a) + add w3,w3,w8 + cbnz w19,Loop_16_xx + + ldp x0,x2,[x29,#96] + ldr x1,[x29,#112] + sub x30,x30,#260 // rewind + + ldp w3,w4,[x0] + ldp w5,w6,[x0,#2*4] + add x1,x1,#14*4 // advance input pointer + ldp w7,w8,[x0,#4*4] + add w20,w20,w3 + ldp w9,w10,[x0,#6*4] + add w21,w21,w4 + add w22,w22,w5 + add w23,w23,w6 + stp w20,w21,[x0] + add w24,w24,w7 + add w25,w25,w8 + stp w22,w23,[x0,#2*4] + add w26,w26,w9 + add w27,w27,w10 + cmp x1,x2 + stp w24,w25,[x0,#4*4] + stp w26,w27,[x0,#6*4] + b.ne Loop + + ldp x19,x20,[x29,#16] + add sp,sp,#4*4 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#128 + AARCH64_VALIDATE_LINK_REGISTER + ret + + +.section .rodata +.align 6 + +LK256: +.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 +.long 0 //terminator + +.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 2 +.text +#ifndef __KERNEL__ +.globl sha256_block_data_order_hw + +.def sha256_block_data_order_hw + .type 32 +.endef +.align 6 +sha256_block_data_order_hw: + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + AARCH64_VALID_CALL_TARGET + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 + + ld1 {v0.4s,v1.4s},[x0] + adrp x3,LK256 + add x3,x3,:lo12:LK256 + +Loop_hw: + ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 + sub x2,x2,#1 + ld1 {v16.4s},[x3],#16 + rev32 v4.16b,v4.16b + rev32 v5.16b,v5.16b + rev32 v6.16b,v6.16b + rev32 v7.16b,v7.16b + orr v18.16b,v0.16b,v0.16b // offload + orr v19.16b,v1.16b,v1.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s +.long 0x5e2828a4 //sha256su0 v4.16b,v5.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.long 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s +.long 0x5e2828c5 //sha256su0 v5.16b,v6.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.long 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s +.long 0x5e2828e6 //sha256su0 v6.16b,v7.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.long 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s +.long 0x5e282887 //sha256su0 v7.16b,v4.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.long 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s +.long 0x5e2828a4 //sha256su0 v4.16b,v5.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.long 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s +.long 0x5e2828c5 //sha256su0 v5.16b,v6.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.long 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s +.long 0x5e2828e6 //sha256su0 v6.16b,v7.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.long 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s +.long 0x5e282887 //sha256su0 v7.16b,v4.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.long 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s +.long 0x5e2828a4 //sha256su0 v4.16b,v5.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.long 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s +.long 0x5e2828c5 //sha256su0 v5.16b,v6.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.long 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s +.long 0x5e2828e6 //sha256su0 v6.16b,v7.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.long 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s +.long 0x5e282887 //sha256su0 v7.16b,v4.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.long 0x5e0660a7 
//sha256su1 v7.16b,v5.16b,v6.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + + ld1 {v17.4s},[x3] + add v16.4s,v16.4s,v6.4s + sub x3,x3,#64*4-16 // rewind + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + + add v17.4s,v17.4s,v7.4s + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + + add v0.4s,v0.4s,v18.4s + add v1.4s,v1.4s,v19.4s + + cbnz x2,Loop_hw + + st1 {v0.4s,v1.4s},[x0] + + ldr x29,[sp],#16 + ret + +#endif +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32) diff --git a/ring-0.17.14/pregenerated/sha256-x86_64-elf.S b/ring-0.17.14/pregenerated/sha256-x86_64-elf.S new file mode 100644 index 0000000000..c800f8ee39 --- /dev/null +++ b/ring-0.17.14/pregenerated/sha256-x86_64-elf.S @@ -0,0 +1,4170 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__) +.text + +.globl sha256_block_data_order_nohw +.hidden sha256_block_data_order_nohw +.type sha256_block_data_order_nohw,@function +.align 16 +sha256_block_data_order_nohw: +.cfi_startproc +_CET_ENDBR + movq %rsp,%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 + shlq $4,%rdx + subq $64+32,%rsp + leaq (%rsi,%rdx,4),%rdx + andq $-64,%rsp + movq %rdi,64+0(%rsp) + movq %rsi,64+8(%rsp) + movq %rdx,64+16(%rsp) + movq %rax,88(%rsp) +.cfi_escape 0x0f,0x06,0x77,0xd8,0x00,0x06,0x23,0x08 +.Lprologue: + + movl 0(%rdi),%eax + movl 4(%rdi),%ebx + movl 8(%rdi),%ecx + movl 12(%rdi),%edx + movl 16(%rdi),%r8d + movl 20(%rdi),%r9d + movl 24(%rdi),%r10d + movl 28(%rdi),%r11d + jmp .Lloop + +.align 16 +.Lloop: + movl %ebx,%edi + leaq K256(%rip),%rbp + xorl %ecx,%edi + movl 0(%rsi),%r12d + movl %r8d,%r13d + movl %eax,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r9d,%r15d + + xorl %r8d,%r13d + rorl $9,%r14d + xorl %r10d,%r15d + + movl %r12d,0(%rsp) + xorl %eax,%r14d + andl %r8d,%r15d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d + + rorl $11,%r14d + xorl %r8d,%r13d + addl %r15d,%r12d + + movl %eax,%r15d + addl (%rbp),%r12d + xorl %eax,%r14d + + xorl %ebx,%r15d + rorl $6,%r13d + movl %ebx,%r11d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r11d + addl %r12d,%edx + addl %r12d,%r11d + + leaq 4(%rbp),%rbp + addl %r14d,%r11d + movl 4(%rsi),%r12d + movl %edx,%r13d + movl %r11d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r8d,%edi + + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%edi + + movl %r12d,4(%rsp) + xorl %r11d,%r14d + andl %edx,%edi + + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi + + rorl $11,%r14d + xorl %edx,%r13d + addl %edi,%r12d + + movl %r11d,%edi + addl (%rbp),%r12d + xorl %r11d,%r14d + + xorl %eax,%edi + rorl $6,%r13d + movl %eax,%r10d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r10d + addl %r12d,%ecx + addl %r12d,%r10d + + leaq 4(%rbp),%rbp + addl %r14d,%r10d + movl 
8(%rsi),%r12d + movl %ecx,%r13d + movl %r10d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %edx,%r15d + + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r15d + + movl %r12d,8(%rsp) + xorl %r10d,%r14d + andl %ecx,%r15d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d + + rorl $11,%r14d + xorl %ecx,%r13d + addl %r15d,%r12d + + movl %r10d,%r15d + addl (%rbp),%r12d + xorl %r10d,%r14d + + xorl %r11d,%r15d + rorl $6,%r13d + movl %r11d,%r9d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r9d + addl %r12d,%ebx + addl %r12d,%r9d + + leaq 4(%rbp),%rbp + addl %r14d,%r9d + movl 12(%rsi),%r12d + movl %ebx,%r13d + movl %r9d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %ecx,%edi + + xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%edi + + movl %r12d,12(%rsp) + xorl %r9d,%r14d + andl %ebx,%edi + + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi + + rorl $11,%r14d + xorl %ebx,%r13d + addl %edi,%r12d + + movl %r9d,%edi + addl (%rbp),%r12d + xorl %r9d,%r14d + + xorl %r10d,%edi + rorl $6,%r13d + movl %r10d,%r8d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r8d + addl %r12d,%eax + addl %r12d,%r8d + + leaq 20(%rbp),%rbp + addl %r14d,%r8d + movl 16(%rsi),%r12d + movl %eax,%r13d + movl %r8d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %ebx,%r15d + + xorl %eax,%r13d + rorl $9,%r14d + xorl %ecx,%r15d + + movl %r12d,16(%rsp) + xorl %r8d,%r14d + andl %eax,%r15d + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d + + rorl $11,%r14d + xorl %eax,%r13d + addl %r15d,%r12d + + movl %r8d,%r15d + addl (%rbp),%r12d + xorl %r8d,%r14d + + xorl %r9d,%r15d + rorl $6,%r13d + movl %r9d,%edx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%edx + addl %r12d,%r11d + addl %r12d,%edx + + leaq 4(%rbp),%rbp + addl %r14d,%edx + movl 20(%rsi),%r12d + movl %r11d,%r13d + movl %edx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %eax,%edi + + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%edi + + movl %r12d,20(%rsp) + xorl %edx,%r14d + andl %r11d,%edi + + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi + + rorl $11,%r14d + xorl %r11d,%r13d + addl %edi,%r12d + + movl %edx,%edi + addl (%rbp),%r12d + xorl %edx,%r14d + + xorl %r8d,%edi + rorl $6,%r13d + movl %r8d,%ecx + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%ecx + addl %r12d,%r10d + addl %r12d,%ecx + + leaq 4(%rbp),%rbp + addl %r14d,%ecx + movl 24(%rsi),%r12d + movl %r10d,%r13d + movl %ecx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r11d,%r15d + + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r15d + + movl %r12d,24(%rsp) + xorl %ecx,%r14d + andl %r10d,%r15d + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d + + rorl $11,%r14d + xorl %r10d,%r13d + addl %r15d,%r12d + + movl %ecx,%r15d + addl (%rbp),%r12d + xorl %ecx,%r14d + + xorl %edx,%r15d + rorl $6,%r13d + movl %edx,%ebx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%ebx + addl %r12d,%r9d + addl %r12d,%ebx + + leaq 4(%rbp),%rbp + addl %r14d,%ebx + movl 28(%rsi),%r12d + movl %r9d,%r13d + movl %ebx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r10d,%edi + + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%edi + + movl %r12d,28(%rsp) + xorl %ebx,%r14d + andl %r9d,%edi + + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi + + rorl $11,%r14d + xorl %r9d,%r13d + addl %edi,%r12d + + movl %ebx,%edi + addl (%rbp),%r12d + xorl %ebx,%r14d + + xorl %ecx,%edi + rorl $6,%r13d + movl %ecx,%eax + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%eax + addl %r12d,%r8d + addl %r12d,%eax + + leaq 20(%rbp),%rbp + addl %r14d,%eax 
+ movl 32(%rsi),%r12d + movl %r8d,%r13d + movl %eax,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r9d,%r15d + + xorl %r8d,%r13d + rorl $9,%r14d + xorl %r10d,%r15d + + movl %r12d,32(%rsp) + xorl %eax,%r14d + andl %r8d,%r15d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d + + rorl $11,%r14d + xorl %r8d,%r13d + addl %r15d,%r12d + + movl %eax,%r15d + addl (%rbp),%r12d + xorl %eax,%r14d + + xorl %ebx,%r15d + rorl $6,%r13d + movl %ebx,%r11d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r11d + addl %r12d,%edx + addl %r12d,%r11d + + leaq 4(%rbp),%rbp + addl %r14d,%r11d + movl 36(%rsi),%r12d + movl %edx,%r13d + movl %r11d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r8d,%edi + + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%edi + + movl %r12d,36(%rsp) + xorl %r11d,%r14d + andl %edx,%edi + + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi + + rorl $11,%r14d + xorl %edx,%r13d + addl %edi,%r12d + + movl %r11d,%edi + addl (%rbp),%r12d + xorl %r11d,%r14d + + xorl %eax,%edi + rorl $6,%r13d + movl %eax,%r10d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r10d + addl %r12d,%ecx + addl %r12d,%r10d + + leaq 4(%rbp),%rbp + addl %r14d,%r10d + movl 40(%rsi),%r12d + movl %ecx,%r13d + movl %r10d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %edx,%r15d + + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r15d + + movl %r12d,40(%rsp) + xorl %r10d,%r14d + andl %ecx,%r15d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d + + rorl $11,%r14d + xorl %ecx,%r13d + addl %r15d,%r12d + + movl %r10d,%r15d + addl (%rbp),%r12d + xorl %r10d,%r14d + + xorl %r11d,%r15d + rorl $6,%r13d + movl %r11d,%r9d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r9d + addl %r12d,%ebx + addl %r12d,%r9d + + leaq 4(%rbp),%rbp + addl %r14d,%r9d + movl 44(%rsi),%r12d + movl %ebx,%r13d + movl %r9d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %ecx,%edi + + xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%edi + + movl %r12d,44(%rsp) + xorl %r9d,%r14d + andl %ebx,%edi + + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi + + rorl $11,%r14d + xorl %ebx,%r13d + addl %edi,%r12d + + movl %r9d,%edi + addl (%rbp),%r12d + xorl %r9d,%r14d + + xorl %r10d,%edi + rorl $6,%r13d + movl %r10d,%r8d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r8d + addl %r12d,%eax + addl %r12d,%r8d + + leaq 20(%rbp),%rbp + addl %r14d,%r8d + movl 48(%rsi),%r12d + movl %eax,%r13d + movl %r8d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %ebx,%r15d + + xorl %eax,%r13d + rorl $9,%r14d + xorl %ecx,%r15d + + movl %r12d,48(%rsp) + xorl %r8d,%r14d + andl %eax,%r15d + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d + + rorl $11,%r14d + xorl %eax,%r13d + addl %r15d,%r12d + + movl %r8d,%r15d + addl (%rbp),%r12d + xorl %r8d,%r14d + + xorl %r9d,%r15d + rorl $6,%r13d + movl %r9d,%edx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%edx + addl %r12d,%r11d + addl %r12d,%edx + + leaq 4(%rbp),%rbp + addl %r14d,%edx + movl 52(%rsi),%r12d + movl %r11d,%r13d + movl %edx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %eax,%edi + + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%edi + + movl %r12d,52(%rsp) + xorl %edx,%r14d + andl %r11d,%edi + + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi + + rorl $11,%r14d + xorl %r11d,%r13d + addl %edi,%r12d + + movl %edx,%edi + addl (%rbp),%r12d + xorl %edx,%r14d + + xorl %r8d,%edi + rorl $6,%r13d + movl %r8d,%ecx + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%ecx + addl %r12d,%r10d + addl %r12d,%ecx + + leaq 4(%rbp),%rbp + 
addl %r14d,%ecx + movl 56(%rsi),%r12d + movl %r10d,%r13d + movl %ecx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r11d,%r15d + + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r15d + + movl %r12d,56(%rsp) + xorl %ecx,%r14d + andl %r10d,%r15d + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d + + rorl $11,%r14d + xorl %r10d,%r13d + addl %r15d,%r12d + + movl %ecx,%r15d + addl (%rbp),%r12d + xorl %ecx,%r14d + + xorl %edx,%r15d + rorl $6,%r13d + movl %edx,%ebx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%ebx + addl %r12d,%r9d + addl %r12d,%ebx + + leaq 4(%rbp),%rbp + addl %r14d,%ebx + movl 60(%rsi),%r12d + movl %r9d,%r13d + movl %ebx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r10d,%edi + + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%edi + + movl %r12d,60(%rsp) + xorl %ebx,%r14d + andl %r9d,%edi + + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi + + rorl $11,%r14d + xorl %r9d,%r13d + addl %edi,%r12d + + movl %ebx,%edi + addl (%rbp),%r12d + xorl %ebx,%r14d + + xorl %ecx,%edi + rorl $6,%r13d + movl %ecx,%eax + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%eax + addl %r12d,%r8d + addl %r12d,%eax + + leaq 20(%rbp),%rbp + jmp .Lrounds_16_xx +.align 16 +.Lrounds_16_xx: + movl 4(%rsp),%r13d + movl 56(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%eax + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 36(%rsp),%r12d + + addl 0(%rsp),%r12d + movl %r8d,%r13d + addl %r15d,%r12d + movl %eax,%r14d + rorl $14,%r13d + movl %r9d,%r15d + + xorl %r8d,%r13d + rorl $9,%r14d + xorl %r10d,%r15d + + movl %r12d,0(%rsp) + xorl %eax,%r14d + andl %r8d,%r15d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d + + rorl $11,%r14d + xorl %r8d,%r13d + addl %r15d,%r12d + + movl %eax,%r15d + addl (%rbp),%r12d + xorl %eax,%r14d + + xorl %ebx,%r15d + rorl $6,%r13d + movl %ebx,%r11d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r11d + addl %r12d,%edx + addl %r12d,%r11d + + leaq 4(%rbp),%rbp + movl 8(%rsp),%r13d + movl 60(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r11d + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 40(%rsp),%r12d + + addl 4(%rsp),%r12d + movl %edx,%r13d + addl %edi,%r12d + movl %r11d,%r14d + rorl $14,%r13d + movl %r8d,%edi + + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%edi + + movl %r12d,4(%rsp) + xorl %r11d,%r14d + andl %edx,%edi + + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi + + rorl $11,%r14d + xorl %edx,%r13d + addl %edi,%r12d + + movl %r11d,%edi + addl (%rbp),%r12d + xorl %r11d,%r14d + + xorl %eax,%edi + rorl $6,%r13d + movl %eax,%r10d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r10d + addl %r12d,%ecx + addl %r12d,%r10d + + leaq 4(%rbp),%rbp + movl 12(%rsp),%r13d + movl 0(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r10d + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 44(%rsp),%r12d + + addl 8(%rsp),%r12d + movl %ecx,%r13d + addl %r15d,%r12d + movl %r10d,%r14d + rorl $14,%r13d + movl %edx,%r15d + + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r15d + + movl %r12d,8(%rsp) + xorl %r10d,%r14d + andl %ecx,%r15d + + rorl $5,%r13d + 
addl %r9d,%r12d + xorl %r8d,%r15d + + rorl $11,%r14d + xorl %ecx,%r13d + addl %r15d,%r12d + + movl %r10d,%r15d + addl (%rbp),%r12d + xorl %r10d,%r14d + + xorl %r11d,%r15d + rorl $6,%r13d + movl %r11d,%r9d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r9d + addl %r12d,%ebx + addl %r12d,%r9d + + leaq 4(%rbp),%rbp + movl 16(%rsp),%r13d + movl 4(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r9d + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 48(%rsp),%r12d + + addl 12(%rsp),%r12d + movl %ebx,%r13d + addl %edi,%r12d + movl %r9d,%r14d + rorl $14,%r13d + movl %ecx,%edi + + xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%edi + + movl %r12d,12(%rsp) + xorl %r9d,%r14d + andl %ebx,%edi + + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi + + rorl $11,%r14d + xorl %ebx,%r13d + addl %edi,%r12d + + movl %r9d,%edi + addl (%rbp),%r12d + xorl %r9d,%r14d + + xorl %r10d,%edi + rorl $6,%r13d + movl %r10d,%r8d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r8d + addl %r12d,%eax + addl %r12d,%r8d + + leaq 20(%rbp),%rbp + movl 20(%rsp),%r13d + movl 8(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r8d + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 52(%rsp),%r12d + + addl 16(%rsp),%r12d + movl %eax,%r13d + addl %r15d,%r12d + movl %r8d,%r14d + rorl $14,%r13d + movl %ebx,%r15d + + xorl %eax,%r13d + rorl $9,%r14d + xorl %ecx,%r15d + + movl %r12d,16(%rsp) + xorl %r8d,%r14d + andl %eax,%r15d + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d + + rorl $11,%r14d + xorl %eax,%r13d + addl %r15d,%r12d + + movl %r8d,%r15d + addl (%rbp),%r12d + xorl %r8d,%r14d + + xorl %r9d,%r15d + rorl $6,%r13d + movl %r9d,%edx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%edx + addl %r12d,%r11d + addl %r12d,%edx + + leaq 4(%rbp),%rbp + movl 24(%rsp),%r13d + movl 12(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%edx + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 56(%rsp),%r12d + + addl 20(%rsp),%r12d + movl %r11d,%r13d + addl %edi,%r12d + movl %edx,%r14d + rorl $14,%r13d + movl %eax,%edi + + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%edi + + movl %r12d,20(%rsp) + xorl %edx,%r14d + andl %r11d,%edi + + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi + + rorl $11,%r14d + xorl %r11d,%r13d + addl %edi,%r12d + + movl %edx,%edi + addl (%rbp),%r12d + xorl %edx,%r14d + + xorl %r8d,%edi + rorl $6,%r13d + movl %r8d,%ecx + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%ecx + addl %r12d,%r10d + addl %r12d,%ecx + + leaq 4(%rbp),%rbp + movl 28(%rsp),%r13d + movl 16(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ecx + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 60(%rsp),%r12d + + addl 24(%rsp),%r12d + movl %r10d,%r13d + addl %r15d,%r12d + movl %ecx,%r14d + rorl $14,%r13d + movl %r11d,%r15d + + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r15d + + movl %r12d,24(%rsp) + xorl %ecx,%r14d + andl %r10d,%r15d + + rorl $5,%r13d + addl %ebx,%r12d + xorl 
%eax,%r15d + + rorl $11,%r14d + xorl %r10d,%r13d + addl %r15d,%r12d + + movl %ecx,%r15d + addl (%rbp),%r12d + xorl %ecx,%r14d + + xorl %edx,%r15d + rorl $6,%r13d + movl %edx,%ebx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%ebx + addl %r12d,%r9d + addl %r12d,%ebx + + leaq 4(%rbp),%rbp + movl 32(%rsp),%r13d + movl 20(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ebx + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 0(%rsp),%r12d + + addl 28(%rsp),%r12d + movl %r9d,%r13d + addl %edi,%r12d + movl %ebx,%r14d + rorl $14,%r13d + movl %r10d,%edi + + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%edi + + movl %r12d,28(%rsp) + xorl %ebx,%r14d + andl %r9d,%edi + + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi + + rorl $11,%r14d + xorl %r9d,%r13d + addl %edi,%r12d + + movl %ebx,%edi + addl (%rbp),%r12d + xorl %ebx,%r14d + + xorl %ecx,%edi + rorl $6,%r13d + movl %ecx,%eax + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%eax + addl %r12d,%r8d + addl %r12d,%eax + + leaq 20(%rbp),%rbp + movl 36(%rsp),%r13d + movl 24(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%eax + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 4(%rsp),%r12d + + addl 32(%rsp),%r12d + movl %r8d,%r13d + addl %r15d,%r12d + movl %eax,%r14d + rorl $14,%r13d + movl %r9d,%r15d + + xorl %r8d,%r13d + rorl $9,%r14d + xorl %r10d,%r15d + + movl %r12d,32(%rsp) + xorl %eax,%r14d + andl %r8d,%r15d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d + + rorl $11,%r14d + xorl %r8d,%r13d + addl %r15d,%r12d + + movl %eax,%r15d + addl (%rbp),%r12d + xorl %eax,%r14d + + xorl %ebx,%r15d + rorl $6,%r13d + movl %ebx,%r11d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r11d + addl %r12d,%edx + addl %r12d,%r11d + + leaq 4(%rbp),%rbp + movl 40(%rsp),%r13d + movl 28(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r11d + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 8(%rsp),%r12d + + addl 36(%rsp),%r12d + movl %edx,%r13d + addl %edi,%r12d + movl %r11d,%r14d + rorl $14,%r13d + movl %r8d,%edi + + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%edi + + movl %r12d,36(%rsp) + xorl %r11d,%r14d + andl %edx,%edi + + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi + + rorl $11,%r14d + xorl %edx,%r13d + addl %edi,%r12d + + movl %r11d,%edi + addl (%rbp),%r12d + xorl %r11d,%r14d + + xorl %eax,%edi + rorl $6,%r13d + movl %eax,%r10d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r10d + addl %r12d,%ecx + addl %r12d,%r10d + + leaq 4(%rbp),%rbp + movl 44(%rsp),%r13d + movl 32(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r10d + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 12(%rsp),%r12d + + addl 40(%rsp),%r12d + movl %ecx,%r13d + addl %r15d,%r12d + movl %r10d,%r14d + rorl $14,%r13d + movl %edx,%r15d + + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r15d + + movl %r12d,40(%rsp) + xorl %r10d,%r14d + andl %ecx,%r15d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d + + rorl 
$11,%r14d + xorl %ecx,%r13d + addl %r15d,%r12d + + movl %r10d,%r15d + addl (%rbp),%r12d + xorl %r10d,%r14d + + xorl %r11d,%r15d + rorl $6,%r13d + movl %r11d,%r9d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r9d + addl %r12d,%ebx + addl %r12d,%r9d + + leaq 4(%rbp),%rbp + movl 48(%rsp),%r13d + movl 36(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r9d + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 16(%rsp),%r12d + + addl 44(%rsp),%r12d + movl %ebx,%r13d + addl %edi,%r12d + movl %r9d,%r14d + rorl $14,%r13d + movl %ecx,%edi + + xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%edi + + movl %r12d,44(%rsp) + xorl %r9d,%r14d + andl %ebx,%edi + + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi + + rorl $11,%r14d + xorl %ebx,%r13d + addl %edi,%r12d + + movl %r9d,%edi + addl (%rbp),%r12d + xorl %r9d,%r14d + + xorl %r10d,%edi + rorl $6,%r13d + movl %r10d,%r8d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r8d + addl %r12d,%eax + addl %r12d,%r8d + + leaq 20(%rbp),%rbp + movl 52(%rsp),%r13d + movl 40(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r8d + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 20(%rsp),%r12d + + addl 48(%rsp),%r12d + movl %eax,%r13d + addl %r15d,%r12d + movl %r8d,%r14d + rorl $14,%r13d + movl %ebx,%r15d + + xorl %eax,%r13d + rorl $9,%r14d + xorl %ecx,%r15d + + movl %r12d,48(%rsp) + xorl %r8d,%r14d + andl %eax,%r15d + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d + + rorl $11,%r14d + xorl %eax,%r13d + addl %r15d,%r12d + + movl %r8d,%r15d + addl (%rbp),%r12d + xorl %r8d,%r14d + + xorl %r9d,%r15d + rorl $6,%r13d + movl %r9d,%edx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%edx + addl %r12d,%r11d + addl %r12d,%edx + + leaq 4(%rbp),%rbp + movl 56(%rsp),%r13d + movl 44(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%edx + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 24(%rsp),%r12d + + addl 52(%rsp),%r12d + movl %r11d,%r13d + addl %edi,%r12d + movl %edx,%r14d + rorl $14,%r13d + movl %eax,%edi + + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%edi + + movl %r12d,52(%rsp) + xorl %edx,%r14d + andl %r11d,%edi + + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi + + rorl $11,%r14d + xorl %r11d,%r13d + addl %edi,%r12d + + movl %edx,%edi + addl (%rbp),%r12d + xorl %edx,%r14d + + xorl %r8d,%edi + rorl $6,%r13d + movl %r8d,%ecx + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%ecx + addl %r12d,%r10d + addl %r12d,%ecx + + leaq 4(%rbp),%rbp + movl 60(%rsp),%r13d + movl 48(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ecx + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 28(%rsp),%r12d + + addl 56(%rsp),%r12d + movl %r10d,%r13d + addl %r15d,%r12d + movl %ecx,%r14d + rorl $14,%r13d + movl %r11d,%r15d + + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r15d + + movl %r12d,56(%rsp) + xorl %ecx,%r14d + andl %r10d,%r15d + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d + + rorl $11,%r14d + xorl 
%r10d,%r13d + addl %r15d,%r12d + + movl %ecx,%r15d + addl (%rbp),%r12d + xorl %ecx,%r14d + + xorl %edx,%r15d + rorl $6,%r13d + movl %edx,%ebx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%ebx + addl %r12d,%r9d + addl %r12d,%ebx + + leaq 4(%rbp),%rbp + movl 0(%rsp),%r13d + movl 52(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ebx + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 32(%rsp),%r12d + + addl 60(%rsp),%r12d + movl %r9d,%r13d + addl %edi,%r12d + movl %ebx,%r14d + rorl $14,%r13d + movl %r10d,%edi + + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%edi + + movl %r12d,60(%rsp) + xorl %ebx,%r14d + andl %r9d,%edi + + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi + + rorl $11,%r14d + xorl %r9d,%r13d + addl %edi,%r12d + + movl %ebx,%edi + addl (%rbp),%r12d + xorl %ebx,%r14d + + xorl %ecx,%edi + rorl $6,%r13d + movl %ecx,%eax + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%eax + addl %r12d,%r8d + addl %r12d,%eax + + leaq 20(%rbp),%rbp + cmpb $0,3(%rbp) + jnz .Lrounds_16_xx + + movq 64+0(%rsp),%rdi + addl %r14d,%eax + leaq 64(%rsi),%rsi + + addl 0(%rdi),%eax + addl 4(%rdi),%ebx + addl 8(%rdi),%ecx + addl 12(%rdi),%edx + addl 16(%rdi),%r8d + addl 20(%rdi),%r9d + addl 24(%rdi),%r10d + addl 28(%rdi),%r11d + + cmpq 64+16(%rsp),%rsi + + movl %eax,0(%rdi) + movl %ebx,4(%rdi) + movl %ecx,8(%rdi) + movl %edx,12(%rdi) + movl %r8d,16(%rdi) + movl %r9d,20(%rdi) + movl %r10d,24(%rdi) + movl %r11d,28(%rdi) + jb .Lloop + + movq 88(%rsp),%rsi +.cfi_def_cfa %rsi,8 + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lepilogue: + ret +.cfi_endproc +.size sha256_block_data_order_nohw,.-sha256_block_data_order_nohw +.section .rodata +.align 64 +.type K256,@object +K256: +.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.long 
0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 +.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + +.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f +.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f +.long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff +.long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff +.long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 +.long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 +.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.text +.globl sha256_block_data_order_hw +.hidden sha256_block_data_order_hw +.type sha256_block_data_order_hw,@function +.align 64 +sha256_block_data_order_hw: +.cfi_startproc +_CET_ENDBR + leaq K256+128(%rip),%rcx + movdqu (%rdi),%xmm1 + movdqu 16(%rdi),%xmm2 + movdqa 512-128(%rcx),%xmm7 + + pshufd $0x1b,%xmm1,%xmm0 + pshufd $0xb1,%xmm1,%xmm1 + pshufd $0x1b,%xmm2,%xmm2 + movdqa %xmm7,%xmm8 +.byte 102,15,58,15,202,8 + punpcklqdq %xmm0,%xmm2 + jmp .Loop_shaext + +.align 16 +.Loop_shaext: + movdqu (%rsi),%xmm3 + movdqu 16(%rsi),%xmm4 + movdqu 32(%rsi),%xmm5 +.byte 102,15,56,0,223 + movdqu 48(%rsi),%xmm6 + + movdqa 0-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 102,15,56,0,231 + movdqa %xmm2,%xmm10 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + nop + movdqa %xmm1,%xmm9 +.byte 15,56,203,202 + + movdqa 32-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 102,15,56,0,239 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + leaq 64(%rsi),%rsi +.byte 15,56,204,220 +.byte 15,56,203,202 + + movdqa 64-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 102,15,56,0,247 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 +.byte 102,15,58,15,253,4 + nop + paddd %xmm7,%xmm3 +.byte 15,56,204,229 +.byte 15,56,203,202 + + movdqa 96-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 +.byte 15,56,205,222 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 +.byte 102,15,58,15,254,4 + nop + paddd %xmm7,%xmm4 +.byte 15,56,204,238 +.byte 15,56,203,202 + movdqa 128-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 15,56,205,227 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 +.byte 102,15,58,15,251,4 + nop + paddd %xmm7,%xmm5 +.byte 15,56,204,243 +.byte 15,56,203,202 + movdqa 160-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 15,56,205,236 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 +.byte 102,15,58,15,252,4 + nop + paddd %xmm7,%xmm6 +.byte 15,56,204,220 +.byte 15,56,203,202 + movdqa 192-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 15,56,205,245 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 +.byte 102,15,58,15,253,4 + nop + paddd %xmm7,%xmm3 +.byte 15,56,204,229 +.byte 15,56,203,202 + movdqa 224-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 +.byte 15,56,205,222 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 +.byte 102,15,58,15,254,4 + nop + paddd %xmm7,%xmm4 +.byte 15,56,204,238 +.byte 15,56,203,202 + movdqa 256-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 15,56,205,227 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 +.byte 102,15,58,15,251,4 + nop + paddd %xmm7,%xmm5 +.byte 15,56,204,243 +.byte 15,56,203,202 + movdqa 288-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 15,56,205,236 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 +.byte 102,15,58,15,252,4 + nop + paddd 
%xmm7,%xmm6 +.byte 15,56,204,220 +.byte 15,56,203,202 + movdqa 320-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 15,56,205,245 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 +.byte 102,15,58,15,253,4 + nop + paddd %xmm7,%xmm3 +.byte 15,56,204,229 +.byte 15,56,203,202 + movdqa 352-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 +.byte 15,56,205,222 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 +.byte 102,15,58,15,254,4 + nop + paddd %xmm7,%xmm4 +.byte 15,56,204,238 +.byte 15,56,203,202 + movdqa 384-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 15,56,205,227 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 +.byte 102,15,58,15,251,4 + nop + paddd %xmm7,%xmm5 +.byte 15,56,204,243 +.byte 15,56,203,202 + movdqa 416-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 15,56,205,236 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 +.byte 102,15,58,15,252,4 +.byte 15,56,203,202 + paddd %xmm7,%xmm6 + + movdqa 448-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 +.byte 15,56,205,245 + movdqa %xmm8,%xmm7 +.byte 15,56,203,202 + + movdqa 480-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 + nop +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + decq %rdx + nop +.byte 15,56,203,202 + + paddd %xmm10,%xmm2 + paddd %xmm9,%xmm1 + jnz .Loop_shaext + + pshufd $0xb1,%xmm2,%xmm2 + pshufd $0x1b,%xmm1,%xmm7 + pshufd $0xb1,%xmm1,%xmm1 + punpckhqdq %xmm2,%xmm1 +.byte 102,15,58,15,215,8 + + movdqu %xmm1,(%rdi) + movdqu %xmm2,16(%rdi) + ret +.cfi_endproc +.size sha256_block_data_order_hw,.-sha256_block_data_order_hw +.globl sha256_block_data_order_ssse3 +.hidden sha256_block_data_order_ssse3 +.type sha256_block_data_order_ssse3,@function +.align 64 +sha256_block_data_order_ssse3: +.cfi_startproc +_CET_ENDBR + movq %rsp,%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 + shlq $4,%rdx + subq $96,%rsp + leaq (%rsi,%rdx,4),%rdx + andq $-64,%rsp + movq %rdi,64+0(%rsp) + movq %rsi,64+8(%rsp) + movq %rdx,64+16(%rsp) + movq %rax,88(%rsp) +.cfi_escape 0x0f,0x06,0x77,0xd8,0x00,0x06,0x23,0x08 +.Lprologue_ssse3: + + movl 0(%rdi),%eax + movl 4(%rdi),%ebx + movl 8(%rdi),%ecx + movl 12(%rdi),%edx + movl 16(%rdi),%r8d + movl 20(%rdi),%r9d + movl 24(%rdi),%r10d + movl 28(%rdi),%r11d + + + jmp .Lloop_ssse3 +.align 16 +.Lloop_ssse3: + movdqa K256+512(%rip),%xmm7 + movdqu 0(%rsi),%xmm0 + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 +.byte 102,15,56,0,199 + movdqu 48(%rsi),%xmm3 + leaq K256(%rip),%rbp +.byte 102,15,56,0,207 + movdqa 0(%rbp),%xmm4 + movdqa 32(%rbp),%xmm5 +.byte 102,15,56,0,215 + paddd %xmm0,%xmm4 + movdqa 64(%rbp),%xmm6 +.byte 102,15,56,0,223 + movdqa 96(%rbp),%xmm7 + paddd %xmm1,%xmm5 + paddd %xmm2,%xmm6 + paddd %xmm3,%xmm7 + movdqa %xmm4,0(%rsp) + movl %eax,%r14d + movdqa %xmm5,16(%rsp) + movl %ebx,%edi + movdqa %xmm6,32(%rsp) + xorl %ecx,%edi + movdqa %xmm7,48(%rsp) + movl %r8d,%r13d + jmp .Lssse3_00_47 + +.align 16 +.Lssse3_00_47: + subq $-128,%rbp + rorl $14,%r13d + movdqa %xmm1,%xmm4 + movl %r14d,%eax + movl %r9d,%r12d + movdqa %xmm3,%xmm7 + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d +.byte 102,15,58,15,224,4 + andl %r8d,%r12d + xorl %r8d,%r13d +.byte 102,15,58,15,250,4 + addl 0(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %ebx,%r15d + addl %r12d,%r11d + 
movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + paddd %xmm7,%xmm0 + rorl $2,%r14d + addl %r11d,%edx + psrld $7,%xmm6 + addl %edi,%r11d + movl %edx,%r13d + pshufd $250,%xmm3,%xmm7 + addl %r11d,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%r11d + movl %r8d,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %r11d,%r14d + pxor %xmm5,%xmm4 + andl %edx,%r12d + xorl %edx,%r13d + pslld $11,%xmm5 + addl 4(%rsp),%r10d + movl %r11d,%edi + pxor %xmm6,%xmm4 + xorl %r9d,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %eax,%edi + addl %r12d,%r10d + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + psrld $10,%xmm7 + addl %r13d,%r10d + xorl %eax,%r15d + paddd %xmm4,%xmm0 + rorl $2,%r14d + addl %r10d,%ecx + psrlq $17,%xmm6 + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %ecx,%r13d + xorl %r8d,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %ecx,%r13d + addl 8(%rsp),%r9d + movl %r10d,%r15d + psrldq $8,%xmm7 + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + paddd %xmm7,%xmm0 + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + pshufd $80,%xmm0,%xmm7 + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + movdqa %xmm7,%xmm6 + addl %edi,%r9d + movl %ebx,%r13d + psrld $10,%xmm7 + addl %r9d,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%r9d + movl %ecx,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + psrlq $2,%xmm6 + andl %ebx,%r12d + xorl %ebx,%r13d + addl 12(%rsp),%r8d + pxor %xmm6,%xmm7 + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %r10d,%edi + addl %r12d,%r8d + movdqa 0(%rbp),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + paddd %xmm7,%xmm0 + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + paddd %xmm0,%xmm6 + movl %eax,%r13d + addl %r8d,%r14d + movdqa %xmm6,0(%rsp) + rorl $14,%r13d + movdqa %xmm2,%xmm4 + movl %r14d,%r8d + movl %ebx,%r12d + movdqa %xmm0,%xmm7 + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d +.byte 102,15,58,15,225,4 + andl %eax,%r12d + xorl %eax,%r13d +.byte 102,15,58,15,251,4 + addl 16(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %r9d,%r15d + addl %r12d,%edx + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + paddd %xmm7,%xmm1 + rorl $2,%r14d + addl %edx,%r11d + psrld $7,%xmm6 + addl %edi,%edx + movl %r11d,%r13d + pshufd $250,%xmm0,%xmm7 + addl %edx,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%edx + movl %eax,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %edx,%r14d + pxor %xmm5,%xmm4 + andl %r11d,%r12d + xorl %r11d,%r13d + pslld $11,%xmm5 + addl 20(%rsp),%ecx + movl %edx,%edi + pxor %xmm6,%xmm4 + xorl %ebx,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %r8d,%edi + addl %r12d,%ecx + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + psrld $10,%xmm7 + addl %r13d,%ecx + xorl %r8d,%r15d + paddd %xmm4,%xmm1 + rorl $2,%r14d + addl %ecx,%r10d + psrlq $17,%xmm6 + addl 
%r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %r10d,%r13d + xorl %eax,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %r10d,%r13d + addl 24(%rsp),%ebx + movl %ecx,%r15d + psrldq $8,%xmm7 + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + paddd %xmm7,%xmm1 + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + pshufd $80,%xmm1,%xmm7 + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + movdqa %xmm7,%xmm6 + addl %edi,%ebx + movl %r9d,%r13d + psrld $10,%xmm7 + addl %ebx,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%ebx + movl %r10d,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + psrlq $2,%xmm6 + andl %r9d,%r12d + xorl %r9d,%r13d + addl 28(%rsp),%eax + pxor %xmm6,%xmm7 + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %ecx,%edi + addl %r12d,%eax + movdqa 32(%rbp),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + paddd %xmm7,%xmm1 + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + paddd %xmm1,%xmm6 + movl %r8d,%r13d + addl %eax,%r14d + movdqa %xmm6,16(%rsp) + rorl $14,%r13d + movdqa %xmm3,%xmm4 + movl %r14d,%eax + movl %r9d,%r12d + movdqa %xmm1,%xmm7 + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d +.byte 102,15,58,15,226,4 + andl %r8d,%r12d + xorl %r8d,%r13d +.byte 102,15,58,15,248,4 + addl 32(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %ebx,%r15d + addl %r12d,%r11d + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + paddd %xmm7,%xmm2 + rorl $2,%r14d + addl %r11d,%edx + psrld $7,%xmm6 + addl %edi,%r11d + movl %edx,%r13d + pshufd $250,%xmm1,%xmm7 + addl %r11d,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%r11d + movl %r8d,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %r11d,%r14d + pxor %xmm5,%xmm4 + andl %edx,%r12d + xorl %edx,%r13d + pslld $11,%xmm5 + addl 36(%rsp),%r10d + movl %r11d,%edi + pxor %xmm6,%xmm4 + xorl %r9d,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %eax,%edi + addl %r12d,%r10d + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + psrld $10,%xmm7 + addl %r13d,%r10d + xorl %eax,%r15d + paddd %xmm4,%xmm2 + rorl $2,%r14d + addl %r10d,%ecx + psrlq $17,%xmm6 + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %ecx,%r13d + xorl %r8d,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %ecx,%r13d + addl 40(%rsp),%r9d + movl %r10d,%r15d + psrldq $8,%xmm7 + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + paddd %xmm7,%xmm2 + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + pshufd $80,%xmm2,%xmm7 + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + movdqa %xmm7,%xmm6 + addl %edi,%r9d + movl %ebx,%r13d + psrld $10,%xmm7 + addl %r9d,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%r9d + movl %ecx,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + psrlq $2,%xmm6 
+ andl %ebx,%r12d + xorl %ebx,%r13d + addl 44(%rsp),%r8d + pxor %xmm6,%xmm7 + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %r10d,%edi + addl %r12d,%r8d + movdqa 64(%rbp),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + paddd %xmm7,%xmm2 + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + paddd %xmm2,%xmm6 + movl %eax,%r13d + addl %r8d,%r14d + movdqa %xmm6,32(%rsp) + rorl $14,%r13d + movdqa %xmm0,%xmm4 + movl %r14d,%r8d + movl %ebx,%r12d + movdqa %xmm2,%xmm7 + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d +.byte 102,15,58,15,227,4 + andl %eax,%r12d + xorl %eax,%r13d +.byte 102,15,58,15,249,4 + addl 48(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %r9d,%r15d + addl %r12d,%edx + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + paddd %xmm7,%xmm3 + rorl $2,%r14d + addl %edx,%r11d + psrld $7,%xmm6 + addl %edi,%edx + movl %r11d,%r13d + pshufd $250,%xmm2,%xmm7 + addl %edx,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%edx + movl %eax,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %edx,%r14d + pxor %xmm5,%xmm4 + andl %r11d,%r12d + xorl %r11d,%r13d + pslld $11,%xmm5 + addl 52(%rsp),%ecx + movl %edx,%edi + pxor %xmm6,%xmm4 + xorl %ebx,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %r8d,%edi + addl %r12d,%ecx + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + psrld $10,%xmm7 + addl %r13d,%ecx + xorl %r8d,%r15d + paddd %xmm4,%xmm3 + rorl $2,%r14d + addl %ecx,%r10d + psrlq $17,%xmm6 + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %r10d,%r13d + xorl %eax,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %r10d,%r13d + addl 56(%rsp),%ebx + movl %ecx,%r15d + psrldq $8,%xmm7 + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + paddd %xmm7,%xmm3 + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + pshufd $80,%xmm3,%xmm7 + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + movdqa %xmm7,%xmm6 + addl %edi,%ebx + movl %r9d,%r13d + psrld $10,%xmm7 + addl %ebx,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%ebx + movl %r10d,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + psrlq $2,%xmm6 + andl %r9d,%r12d + xorl %r9d,%r13d + addl 60(%rsp),%eax + pxor %xmm6,%xmm7 + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %ecx,%edi + addl %r12d,%eax + movdqa 96(%rbp),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + paddd %xmm7,%xmm3 + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + paddd %xmm3,%xmm6 + movl %r8d,%r13d + addl %eax,%r14d + movdqa %xmm6,48(%rsp) + cmpb $0,131(%rbp) + jne .Lssse3_00_47 + rorl $14,%r13d + movl %r14d,%eax + movl %r9d,%r12d + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + xorl %r8d,%r13d + addl 0(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + xorl %ebx,%r15d + addl %r12d,%r11d + rorl $6,%r13d + andl %r15d,%edi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + rorl 
$2,%r14d + addl %r11d,%edx + addl %edi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + rorl $14,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + addl 4(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + rorl $11,%r14d + xorl %eax,%edi + addl %r12d,%r10d + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + rorl $2,%r14d + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + xorl %ecx,%r13d + xorl %r8d,%r12d + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + xorl %ecx,%r13d + addl 8(%rsp),%r9d + movl %r10d,%r15d + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + rorl $14,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + xorl %ebx,%r13d + addl 12(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + rorl $6,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + rorl $14,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + xorl %eax,%r13d + addl 16(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + xorl %r9d,%r15d + addl %r12d,%edx + rorl $6,%r13d + andl %r15d,%edi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + rorl $2,%r14d + addl %edx,%r11d + addl %edi,%edx + movl %r11d,%r13d + addl %edx,%r14d + rorl $14,%r13d + movl %r14d,%edx + movl %eax,%r12d + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + addl 20(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + rorl $11,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + rorl $2,%r14d + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + xorl %r10d,%r13d + xorl %eax,%r12d + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + xorl %r10d,%r13d + addl 24(%rsp),%ebx + movl %ecx,%r15d + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + rorl $14,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + xorl %r9d,%r13d + addl 28(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + xorl %ecx,%edi + addl %r12d,%eax + rorl $6,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + rorl $14,%r13d + movl %r14d,%eax + movl %r9d,%r12d + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + xorl %r8d,%r13d + addl 32(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + 
rorl $11,%r14d + xorl %ebx,%r15d + addl %r12d,%r11d + rorl $6,%r13d + andl %r15d,%edi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + rorl $2,%r14d + addl %r11d,%edx + addl %edi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + rorl $14,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + addl 36(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + rorl $11,%r14d + xorl %eax,%edi + addl %r12d,%r10d + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + rorl $2,%r14d + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + xorl %ecx,%r13d + xorl %r8d,%r12d + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + xorl %ecx,%r13d + addl 40(%rsp),%r9d + movl %r10d,%r15d + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + rorl $14,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + xorl %ebx,%r13d + addl 44(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + rorl $6,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + rorl $14,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + xorl %eax,%r13d + addl 48(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + xorl %r9d,%r15d + addl %r12d,%edx + rorl $6,%r13d + andl %r15d,%edi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + rorl $2,%r14d + addl %edx,%r11d + addl %edi,%edx + movl %r11d,%r13d + addl %edx,%r14d + rorl $14,%r13d + movl %r14d,%edx + movl %eax,%r12d + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + addl 52(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + rorl $11,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + rorl $2,%r14d + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + xorl %r10d,%r13d + xorl %eax,%r12d + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + xorl %r10d,%r13d + addl 56(%rsp),%ebx + movl %ecx,%r15d + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + rorl $14,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + xorl %r9d,%r13d + addl 60(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + xorl %ecx,%edi + addl %r12d,%eax + rorl $6,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + movq 64+0(%rsp),%rdi + movl %r14d,%eax + + addl 0(%rdi),%eax + leaq 64(%rsi),%rsi + 
addl 4(%rdi),%ebx + addl 8(%rdi),%ecx + addl 12(%rdi),%edx + addl 16(%rdi),%r8d + addl 20(%rdi),%r9d + addl 24(%rdi),%r10d + addl 28(%rdi),%r11d + + cmpq 64+16(%rsp),%rsi + + movl %eax,0(%rdi) + movl %ebx,4(%rdi) + movl %ecx,8(%rdi) + movl %edx,12(%rdi) + movl %r8d,16(%rdi) + movl %r9d,20(%rdi) + movl %r10d,24(%rdi) + movl %r11d,28(%rdi) + jb .Lloop_ssse3 + + movq 88(%rsp),%rsi +.cfi_def_cfa %rsi,8 + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lepilogue_ssse3: + ret +.cfi_endproc +.size sha256_block_data_order_ssse3,.-sha256_block_data_order_ssse3 +.globl sha256_block_data_order_avx +.hidden sha256_block_data_order_avx +.type sha256_block_data_order_avx,@function +.align 64 +sha256_block_data_order_avx: +.cfi_startproc +_CET_ENDBR + movq %rsp,%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 + shlq $4,%rdx + subq $96,%rsp + leaq (%rsi,%rdx,4),%rdx + andq $-64,%rsp + movq %rdi,64+0(%rsp) + movq %rsi,64+8(%rsp) + movq %rdx,64+16(%rsp) + movq %rax,88(%rsp) +.cfi_escape 0x0f,0x06,0x77,0xd8,0x00,0x06,0x23,0x08 +.Lprologue_avx: + + vzeroupper + movl 0(%rdi),%eax + movl 4(%rdi),%ebx + movl 8(%rdi),%ecx + movl 12(%rdi),%edx + movl 16(%rdi),%r8d + movl 20(%rdi),%r9d + movl 24(%rdi),%r10d + movl 28(%rdi),%r11d + vmovdqa K256+512+32(%rip),%xmm8 + vmovdqa K256+512+64(%rip),%xmm9 + jmp .Lloop_avx +.align 16 +.Lloop_avx: + vmovdqa K256+512(%rip),%xmm7 + vmovdqu 0(%rsi),%xmm0 + vmovdqu 16(%rsi),%xmm1 + vmovdqu 32(%rsi),%xmm2 + vmovdqu 48(%rsi),%xmm3 + vpshufb %xmm7,%xmm0,%xmm0 + leaq K256(%rip),%rbp + vpshufb %xmm7,%xmm1,%xmm1 + vpshufb %xmm7,%xmm2,%xmm2 + vpaddd 0(%rbp),%xmm0,%xmm4 + vpshufb %xmm7,%xmm3,%xmm3 + vpaddd 32(%rbp),%xmm1,%xmm5 + vpaddd 64(%rbp),%xmm2,%xmm6 + vpaddd 96(%rbp),%xmm3,%xmm7 + vmovdqa %xmm4,0(%rsp) + movl %eax,%r14d + vmovdqa %xmm5,16(%rsp) + movl %ebx,%edi + vmovdqa %xmm6,32(%rsp) + xorl %ecx,%edi + vmovdqa %xmm7,48(%rsp) + movl %r8d,%r13d + jmp .Lavx_00_47 + +.align 16 +.Lavx_00_47: + subq $-128,%rbp + vpalignr $4,%xmm0,%xmm1,%xmm4 + shrdl $14,%r13d,%r13d + movl %r14d,%eax + movl %r9d,%r12d + vpalignr $4,%xmm2,%xmm3,%xmm7 + shrdl $9,%r14d,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%r13d,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + vpaddd %xmm7,%xmm0,%xmm0 + xorl %r8d,%r13d + addl 0(%rsp),%r11d + movl %eax,%r15d + vpsrld $3,%xmm4,%xmm7 + xorl %r10d,%r12d + shrdl $11,%r14d,%r14d + xorl %ebx,%r15d + vpslld $14,%xmm4,%xmm5 + addl %r12d,%r11d + shrdl $6,%r13d,%r13d + andl %r15d,%edi + vpxor %xmm6,%xmm7,%xmm4 + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + vpshufd $250,%xmm3,%xmm7 + shrdl $2,%r14d,%r14d + addl %r11d,%edx + addl %edi,%r11d + vpsrld $11,%xmm6,%xmm6 + movl %edx,%r13d + addl %r11d,%r14d + shrdl $14,%r13d,%r13d + vpxor %xmm5,%xmm4,%xmm4 + movl %r14d,%r11d + movl %r8d,%r12d + shrdl $9,%r14d,%r14d + vpslld $11,%xmm5,%xmm5 + xorl %edx,%r13d + xorl %r9d,%r12d + shrdl $5,%r13d,%r13d + vpxor %xmm6,%xmm4,%xmm4 + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + vpsrld $10,%xmm7,%xmm6 + addl 4(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + vpxor %xmm5,%xmm4,%xmm4 + shrdl $11,%r14d,%r14d + xorl 
%eax,%edi + addl %r12d,%r10d + vpsrlq $17,%xmm7,%xmm7 + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + vpaddd %xmm4,%xmm0,%xmm0 + addl %r13d,%r10d + xorl %eax,%r15d + shrdl $2,%r14d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + vpsrlq $2,%xmm7,%xmm7 + addl %r10d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r10d + vpxor %xmm7,%xmm6,%xmm6 + movl %edx,%r12d + shrdl $9,%r14d,%r14d + xorl %ecx,%r13d + vpshufb %xmm8,%xmm6,%xmm6 + xorl %r8d,%r12d + shrdl $5,%r13d,%r13d + xorl %r10d,%r14d + vpaddd %xmm6,%xmm0,%xmm0 + andl %ecx,%r12d + xorl %ecx,%r13d + addl 8(%rsp),%r9d + vpshufd $80,%xmm0,%xmm7 + movl %r10d,%r15d + xorl %r8d,%r12d + shrdl $11,%r14d,%r14d + vpsrld $10,%xmm7,%xmm6 + xorl %r11d,%r15d + addl %r12d,%r9d + shrdl $6,%r13d,%r13d + vpsrlq $17,%xmm7,%xmm7 + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + vpxor %xmm7,%xmm6,%xmm6 + xorl %r11d,%edi + shrdl $2,%r14d,%r14d + addl %r9d,%ebx + vpsrlq $2,%xmm7,%xmm7 + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + shrdl $14,%r13d,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + vpshufb %xmm9,%xmm6,%xmm6 + shrdl $9,%r14d,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + vpaddd %xmm6,%xmm0,%xmm0 + shrdl $5,%r13d,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + vpaddd 0(%rbp),%xmm0,%xmm6 + xorl %ebx,%r13d + addl 12(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + shrdl $11,%r14d,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + shrdl $2,%r14d,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + vmovdqa %xmm6,0(%rsp) + vpalignr $4,%xmm1,%xmm2,%xmm4 + shrdl $14,%r13d,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + vpalignr $4,%xmm3,%xmm0,%xmm7 + shrdl $9,%r14d,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%r13d,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + vpaddd %xmm7,%xmm1,%xmm1 + xorl %eax,%r13d + addl 16(%rsp),%edx + movl %r8d,%r15d + vpsrld $3,%xmm4,%xmm7 + xorl %ecx,%r12d + shrdl $11,%r14d,%r14d + xorl %r9d,%r15d + vpslld $14,%xmm4,%xmm5 + addl %r12d,%edx + shrdl $6,%r13d,%r13d + andl %r15d,%edi + vpxor %xmm6,%xmm7,%xmm4 + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + vpshufd $250,%xmm0,%xmm7 + shrdl $2,%r14d,%r14d + addl %edx,%r11d + addl %edi,%edx + vpsrld $11,%xmm6,%xmm6 + movl %r11d,%r13d + addl %edx,%r14d + shrdl $14,%r13d,%r13d + vpxor %xmm5,%xmm4,%xmm4 + movl %r14d,%edx + movl %eax,%r12d + shrdl $9,%r14d,%r14d + vpslld $11,%xmm5,%xmm5 + xorl %r11d,%r13d + xorl %ebx,%r12d + shrdl $5,%r13d,%r13d + vpxor %xmm6,%xmm4,%xmm4 + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + vpsrld $10,%xmm7,%xmm6 + addl 20(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + vpxor %xmm5,%xmm4,%xmm4 + shrdl $11,%r14d,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + vpsrlq $17,%xmm7,%xmm7 + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %edx,%r14d + vpaddd %xmm4,%xmm1,%xmm1 + addl %r13d,%ecx + xorl %r8d,%r15d + shrdl $2,%r14d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + vpsrlq $2,%xmm7,%xmm7 + addl %ecx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ecx + vpxor %xmm7,%xmm6,%xmm6 + movl %r11d,%r12d + shrdl $9,%r14d,%r14d + xorl %r10d,%r13d + vpshufb %xmm8,%xmm6,%xmm6 + xorl %eax,%r12d + shrdl $5,%r13d,%r13d + xorl %ecx,%r14d + vpaddd %xmm6,%xmm1,%xmm1 + andl %r10d,%r12d + xorl %r10d,%r13d + addl 24(%rsp),%ebx + vpshufd $80,%xmm1,%xmm7 + movl %ecx,%r15d + xorl %eax,%r12d + shrdl $11,%r14d,%r14d + vpsrld 
$10,%xmm7,%xmm6 + xorl %edx,%r15d + addl %r12d,%ebx + shrdl $6,%r13d,%r13d + vpsrlq $17,%xmm7,%xmm7 + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + vpxor %xmm7,%xmm6,%xmm6 + xorl %edx,%edi + shrdl $2,%r14d,%r14d + addl %ebx,%r9d + vpsrlq $2,%xmm7,%xmm7 + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + vpxor %xmm7,%xmm6,%xmm6 + shrdl $14,%r13d,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + vpshufb %xmm9,%xmm6,%xmm6 + shrdl $9,%r14d,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + vpaddd %xmm6,%xmm1,%xmm1 + shrdl $5,%r13d,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + vpaddd 32(%rbp),%xmm1,%xmm6 + xorl %r9d,%r13d + addl 28(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + shrdl $11,%r14d,%r14d + xorl %ecx,%edi + addl %r12d,%eax + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + shrdl $2,%r14d,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + vmovdqa %xmm6,16(%rsp) + vpalignr $4,%xmm2,%xmm3,%xmm4 + shrdl $14,%r13d,%r13d + movl %r14d,%eax + movl %r9d,%r12d + vpalignr $4,%xmm0,%xmm1,%xmm7 + shrdl $9,%r14d,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%r13d,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + vpaddd %xmm7,%xmm2,%xmm2 + xorl %r8d,%r13d + addl 32(%rsp),%r11d + movl %eax,%r15d + vpsrld $3,%xmm4,%xmm7 + xorl %r10d,%r12d + shrdl $11,%r14d,%r14d + xorl %ebx,%r15d + vpslld $14,%xmm4,%xmm5 + addl %r12d,%r11d + shrdl $6,%r13d,%r13d + andl %r15d,%edi + vpxor %xmm6,%xmm7,%xmm4 + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + vpshufd $250,%xmm1,%xmm7 + shrdl $2,%r14d,%r14d + addl %r11d,%edx + addl %edi,%r11d + vpsrld $11,%xmm6,%xmm6 + movl %edx,%r13d + addl %r11d,%r14d + shrdl $14,%r13d,%r13d + vpxor %xmm5,%xmm4,%xmm4 + movl %r14d,%r11d + movl %r8d,%r12d + shrdl $9,%r14d,%r14d + vpslld $11,%xmm5,%xmm5 + xorl %edx,%r13d + xorl %r9d,%r12d + shrdl $5,%r13d,%r13d + vpxor %xmm6,%xmm4,%xmm4 + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + vpsrld $10,%xmm7,%xmm6 + addl 36(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + vpxor %xmm5,%xmm4,%xmm4 + shrdl $11,%r14d,%r14d + xorl %eax,%edi + addl %r12d,%r10d + vpsrlq $17,%xmm7,%xmm7 + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + vpaddd %xmm4,%xmm2,%xmm2 + addl %r13d,%r10d + xorl %eax,%r15d + shrdl $2,%r14d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + vpsrlq $2,%xmm7,%xmm7 + addl %r10d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r10d + vpxor %xmm7,%xmm6,%xmm6 + movl %edx,%r12d + shrdl $9,%r14d,%r14d + xorl %ecx,%r13d + vpshufb %xmm8,%xmm6,%xmm6 + xorl %r8d,%r12d + shrdl $5,%r13d,%r13d + xorl %r10d,%r14d + vpaddd %xmm6,%xmm2,%xmm2 + andl %ecx,%r12d + xorl %ecx,%r13d + addl 40(%rsp),%r9d + vpshufd $80,%xmm2,%xmm7 + movl %r10d,%r15d + xorl %r8d,%r12d + shrdl $11,%r14d,%r14d + vpsrld $10,%xmm7,%xmm6 + xorl %r11d,%r15d + addl %r12d,%r9d + shrdl $6,%r13d,%r13d + vpsrlq $17,%xmm7,%xmm7 + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + vpxor %xmm7,%xmm6,%xmm6 + xorl %r11d,%edi + shrdl $2,%r14d,%r14d + addl %r9d,%ebx + vpsrlq $2,%xmm7,%xmm7 + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + shrdl $14,%r13d,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + vpshufb %xmm9,%xmm6,%xmm6 + shrdl $9,%r14d,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + vpaddd %xmm6,%xmm2,%xmm2 + shrdl $5,%r13d,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + vpaddd 64(%rbp),%xmm2,%xmm6 + xorl %ebx,%r13d + addl 44(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + shrdl $11,%r14d,%r14d + xorl 
%r10d,%edi + addl %r12d,%r8d + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + shrdl $2,%r14d,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + vmovdqa %xmm6,32(%rsp) + vpalignr $4,%xmm3,%xmm0,%xmm4 + shrdl $14,%r13d,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + vpalignr $4,%xmm1,%xmm2,%xmm7 + shrdl $9,%r14d,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%r13d,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + vpaddd %xmm7,%xmm3,%xmm3 + xorl %eax,%r13d + addl 48(%rsp),%edx + movl %r8d,%r15d + vpsrld $3,%xmm4,%xmm7 + xorl %ecx,%r12d + shrdl $11,%r14d,%r14d + xorl %r9d,%r15d + vpslld $14,%xmm4,%xmm5 + addl %r12d,%edx + shrdl $6,%r13d,%r13d + andl %r15d,%edi + vpxor %xmm6,%xmm7,%xmm4 + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + vpshufd $250,%xmm2,%xmm7 + shrdl $2,%r14d,%r14d + addl %edx,%r11d + addl %edi,%edx + vpsrld $11,%xmm6,%xmm6 + movl %r11d,%r13d + addl %edx,%r14d + shrdl $14,%r13d,%r13d + vpxor %xmm5,%xmm4,%xmm4 + movl %r14d,%edx + movl %eax,%r12d + shrdl $9,%r14d,%r14d + vpslld $11,%xmm5,%xmm5 + xorl %r11d,%r13d + xorl %ebx,%r12d + shrdl $5,%r13d,%r13d + vpxor %xmm6,%xmm4,%xmm4 + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + vpsrld $10,%xmm7,%xmm6 + addl 52(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + vpxor %xmm5,%xmm4,%xmm4 + shrdl $11,%r14d,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + vpsrlq $17,%xmm7,%xmm7 + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %edx,%r14d + vpaddd %xmm4,%xmm3,%xmm3 + addl %r13d,%ecx + xorl %r8d,%r15d + shrdl $2,%r14d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + vpsrlq $2,%xmm7,%xmm7 + addl %ecx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ecx + vpxor %xmm7,%xmm6,%xmm6 + movl %r11d,%r12d + shrdl $9,%r14d,%r14d + xorl %r10d,%r13d + vpshufb %xmm8,%xmm6,%xmm6 + xorl %eax,%r12d + shrdl $5,%r13d,%r13d + xorl %ecx,%r14d + vpaddd %xmm6,%xmm3,%xmm3 + andl %r10d,%r12d + xorl %r10d,%r13d + addl 56(%rsp),%ebx + vpshufd $80,%xmm3,%xmm7 + movl %ecx,%r15d + xorl %eax,%r12d + shrdl $11,%r14d,%r14d + vpsrld $10,%xmm7,%xmm6 + xorl %edx,%r15d + addl %r12d,%ebx + shrdl $6,%r13d,%r13d + vpsrlq $17,%xmm7,%xmm7 + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + vpxor %xmm7,%xmm6,%xmm6 + xorl %edx,%edi + shrdl $2,%r14d,%r14d + addl %ebx,%r9d + vpsrlq $2,%xmm7,%xmm7 + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + vpxor %xmm7,%xmm6,%xmm6 + shrdl $14,%r13d,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + vpshufb %xmm9,%xmm6,%xmm6 + shrdl $9,%r14d,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + vpaddd %xmm6,%xmm3,%xmm3 + shrdl $5,%r13d,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + vpaddd 96(%rbp),%xmm3,%xmm6 + xorl %r9d,%r13d + addl 60(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + shrdl $11,%r14d,%r14d + xorl %ecx,%edi + addl %r12d,%eax + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + shrdl $2,%r14d,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + vmovdqa %xmm6,48(%rsp) + cmpb $0,131(%rbp) + jne .Lavx_00_47 + shrdl $14,%r13d,%r13d + movl %r14d,%eax + movl %r9d,%r12d + shrdl $9,%r14d,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + shrdl $5,%r13d,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + xorl %r8d,%r13d + addl 0(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + shrdl $11,%r14d,%r14d + xorl %ebx,%r15d + addl %r12d,%r11d + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + shrdl $2,%r14d,%r14d + addl 
%r11d,%edx + addl %edi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + shrdl $9,%r14d,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + shrdl $5,%r13d,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + addl 4(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + shrdl $11,%r14d,%r14d + xorl %eax,%edi + addl %r12d,%r10d + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + shrdl $2,%r14d,%r14d + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r10d + movl %edx,%r12d + shrdl $9,%r14d,%r14d + xorl %ecx,%r13d + xorl %r8d,%r12d + shrdl $5,%r13d,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + xorl %ecx,%r13d + addl 8(%rsp),%r9d + movl %r10d,%r15d + xorl %r8d,%r12d + shrdl $11,%r14d,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%edi + shrdl $2,%r14d,%r14d + addl %r9d,%ebx + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + shrdl $9,%r14d,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + shrdl $5,%r13d,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + xorl %ebx,%r13d + addl 12(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + shrdl $11,%r14d,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + shrdl $2,%r14d,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + shrdl $9,%r14d,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + shrdl $5,%r13d,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + xorl %eax,%r13d + addl 16(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + shrdl $11,%r14d,%r14d + xorl %r9d,%r15d + addl %r12d,%edx + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + shrdl $2,%r14d,%r14d + addl %edx,%r11d + addl %edi,%edx + movl %r11d,%r13d + addl %edx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%edx + movl %eax,%r12d + shrdl $9,%r14d,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + shrdl $5,%r13d,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + addl 20(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + shrdl $11,%r14d,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + shrdl $2,%r14d,%r14d + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + shrdl $9,%r14d,%r14d + xorl %r10d,%r13d + xorl %eax,%r12d + shrdl $5,%r13d,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + xorl %r10d,%r13d + addl 24(%rsp),%ebx + movl %ecx,%r15d + xorl %eax,%r12d + shrdl $11,%r14d,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%edi + shrdl $2,%r14d,%r14d + addl %ebx,%r9d + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + shrdl $9,%r14d,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + shrdl $5,%r13d,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + xorl %r9d,%r13d + addl 28(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + shrdl $11,%r14d,%r14d + xorl %ecx,%edi + addl %r12d,%eax + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + shrdl $2,%r14d,%r14d + addl %eax,%r8d + addl 
%r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%eax + movl %r9d,%r12d + shrdl $9,%r14d,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + shrdl $5,%r13d,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + xorl %r8d,%r13d + addl 32(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + shrdl $11,%r14d,%r14d + xorl %ebx,%r15d + addl %r12d,%r11d + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + shrdl $2,%r14d,%r14d + addl %r11d,%edx + addl %edi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + shrdl $9,%r14d,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + shrdl $5,%r13d,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + addl 36(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + shrdl $11,%r14d,%r14d + xorl %eax,%edi + addl %r12d,%r10d + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + shrdl $2,%r14d,%r14d + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r10d + movl %edx,%r12d + shrdl $9,%r14d,%r14d + xorl %ecx,%r13d + xorl %r8d,%r12d + shrdl $5,%r13d,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + xorl %ecx,%r13d + addl 40(%rsp),%r9d + movl %r10d,%r15d + xorl %r8d,%r12d + shrdl $11,%r14d,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%edi + shrdl $2,%r14d,%r14d + addl %r9d,%ebx + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + shrdl $9,%r14d,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + shrdl $5,%r13d,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + xorl %ebx,%r13d + addl 44(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + shrdl $11,%r14d,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + shrdl $2,%r14d,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + shrdl $9,%r14d,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + shrdl $5,%r13d,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + xorl %eax,%r13d + addl 48(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + shrdl $11,%r14d,%r14d + xorl %r9d,%r15d + addl %r12d,%edx + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + shrdl $2,%r14d,%r14d + addl %edx,%r11d + addl %edi,%edx + movl %r11d,%r13d + addl %edx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%edx + movl %eax,%r12d + shrdl $9,%r14d,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + shrdl $5,%r13d,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + addl 52(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + shrdl $11,%r14d,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + shrdl $2,%r14d,%r14d + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + shrdl $9,%r14d,%r14d + xorl %r10d,%r13d + xorl %eax,%r12d + shrdl $5,%r13d,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + xorl %r10d,%r13d + addl 56(%rsp),%ebx + movl %ecx,%r15d + xorl %eax,%r12d + shrdl $11,%r14d,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%edi + shrdl $2,%r14d,%r14d + addl %ebx,%r9d + addl %edi,%ebx + movl 
%r9d,%r13d + addl %ebx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + shrdl $9,%r14d,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + shrdl $5,%r13d,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + xorl %r9d,%r13d + addl 60(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + shrdl $11,%r14d,%r14d + xorl %ecx,%edi + addl %r12d,%eax + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + shrdl $2,%r14d,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + movq 64+0(%rsp),%rdi + movl %r14d,%eax + + addl 0(%rdi),%eax + leaq 64(%rsi),%rsi + addl 4(%rdi),%ebx + addl 8(%rdi),%ecx + addl 12(%rdi),%edx + addl 16(%rdi),%r8d + addl 20(%rdi),%r9d + addl 24(%rdi),%r10d + addl 28(%rdi),%r11d + + cmpq 64+16(%rsp),%rsi + + movl %eax,0(%rdi) + movl %ebx,4(%rdi) + movl %ecx,8(%rdi) + movl %edx,12(%rdi) + movl %r8d,16(%rdi) + movl %r9d,20(%rdi) + movl %r10d,24(%rdi) + movl %r11d,28(%rdi) + jb .Lloop_avx + + movq 88(%rsp),%rsi +.cfi_def_cfa %rsi,8 + vzeroupper + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lepilogue_avx: + ret +.cfi_endproc +.size sha256_block_data_order_avx,.-sha256_block_data_order_avx +#endif diff --git a/ring-0.17.14/pregenerated/sha256-x86_64-macosx.S b/ring-0.17.14/pregenerated/sha256-x86_64-macosx.S new file mode 100644 index 0000000000..fe5d347a59 --- /dev/null +++ b/ring-0.17.14/pregenerated/sha256-x86_64-macosx.S @@ -0,0 +1,4170 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__) +.text + +.globl _sha256_block_data_order_nohw +.private_extern _sha256_block_data_order_nohw + +.p2align 4 +_sha256_block_data_order_nohw: + +_CET_ENDBR + movq %rsp,%rax + + pushq %rbx + + pushq %rbp + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + shlq $4,%rdx + subq $64+32,%rsp + leaq (%rsi,%rdx,4),%rdx + andq $-64,%rsp + movq %rdi,64+0(%rsp) + movq %rsi,64+8(%rsp) + movq %rdx,64+16(%rsp) + movq %rax,88(%rsp) + +L$prologue: + + movl 0(%rdi),%eax + movl 4(%rdi),%ebx + movl 8(%rdi),%ecx + movl 12(%rdi),%edx + movl 16(%rdi),%r8d + movl 20(%rdi),%r9d + movl 24(%rdi),%r10d + movl 28(%rdi),%r11d + jmp L$loop + +.p2align 4 +L$loop: + movl %ebx,%edi + leaq K256(%rip),%rbp + xorl %ecx,%edi + movl 0(%rsi),%r12d + movl %r8d,%r13d + movl %eax,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r9d,%r15d + + xorl %r8d,%r13d + rorl $9,%r14d + xorl %r10d,%r15d + + movl %r12d,0(%rsp) + xorl %eax,%r14d + andl %r8d,%r15d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d + + rorl $11,%r14d + xorl %r8d,%r13d + addl %r15d,%r12d + + movl %eax,%r15d + addl (%rbp),%r12d + xorl %eax,%r14d + + xorl %ebx,%r15d + rorl $6,%r13d + movl %ebx,%r11d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r11d + addl %r12d,%edx + addl %r12d,%r11d + + leaq 4(%rbp),%rbp + addl %r14d,%r11d + movl 4(%rsi),%r12d + movl %edx,%r13d + movl %r11d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r8d,%edi + + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%edi + + movl %r12d,4(%rsp) + xorl %r11d,%r14d + andl %edx,%edi + + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi + + rorl $11,%r14d + xorl %edx,%r13d + addl %edi,%r12d + + movl %r11d,%edi + addl (%rbp),%r12d + xorl %r11d,%r14d + + xorl %eax,%edi + rorl $6,%r13d + movl %eax,%r10d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r10d + addl %r12d,%ecx + addl %r12d,%r10d + + leaq 4(%rbp),%rbp + addl %r14d,%r10d + movl 8(%rsi),%r12d + movl %ecx,%r13d + movl %r10d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %edx,%r15d + + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r15d + + movl %r12d,8(%rsp) + xorl %r10d,%r14d + andl %ecx,%r15d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d + + rorl $11,%r14d + xorl %ecx,%r13d + addl %r15d,%r12d + + movl %r10d,%r15d + addl (%rbp),%r12d + xorl %r10d,%r14d + + xorl %r11d,%r15d + rorl $6,%r13d + movl %r11d,%r9d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r9d + addl %r12d,%ebx + addl %r12d,%r9d + + leaq 4(%rbp),%rbp + addl %r14d,%r9d + movl 12(%rsi),%r12d + movl %ebx,%r13d + movl %r9d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %ecx,%edi + + xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%edi + + movl %r12d,12(%rsp) + xorl %r9d,%r14d + andl %ebx,%edi + + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi + + rorl $11,%r14d + xorl %ebx,%r13d + addl %edi,%r12d + + movl %r9d,%edi + addl (%rbp),%r12d + xorl %r9d,%r14d + + xorl %r10d,%edi + rorl $6,%r13d + movl %r10d,%r8d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r8d + addl %r12d,%eax + addl %r12d,%r8d + + leaq 20(%rbp),%rbp + addl %r14d,%r8d + movl 16(%rsi),%r12d + movl %eax,%r13d + movl %r8d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %ebx,%r15d + + xorl %eax,%r13d + rorl $9,%r14d + xorl %ecx,%r15d + + movl %r12d,16(%rsp) + xorl %r8d,%r14d + andl %eax,%r15d + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d + + rorl $11,%r14d + xorl %eax,%r13d + addl %r15d,%r12d + + movl %r8d,%r15d + addl (%rbp),%r12d + 
xorl %r8d,%r14d + + xorl %r9d,%r15d + rorl $6,%r13d + movl %r9d,%edx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%edx + addl %r12d,%r11d + addl %r12d,%edx + + leaq 4(%rbp),%rbp + addl %r14d,%edx + movl 20(%rsi),%r12d + movl %r11d,%r13d + movl %edx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %eax,%edi + + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%edi + + movl %r12d,20(%rsp) + xorl %edx,%r14d + andl %r11d,%edi + + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi + + rorl $11,%r14d + xorl %r11d,%r13d + addl %edi,%r12d + + movl %edx,%edi + addl (%rbp),%r12d + xorl %edx,%r14d + + xorl %r8d,%edi + rorl $6,%r13d + movl %r8d,%ecx + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%ecx + addl %r12d,%r10d + addl %r12d,%ecx + + leaq 4(%rbp),%rbp + addl %r14d,%ecx + movl 24(%rsi),%r12d + movl %r10d,%r13d + movl %ecx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r11d,%r15d + + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r15d + + movl %r12d,24(%rsp) + xorl %ecx,%r14d + andl %r10d,%r15d + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d + + rorl $11,%r14d + xorl %r10d,%r13d + addl %r15d,%r12d + + movl %ecx,%r15d + addl (%rbp),%r12d + xorl %ecx,%r14d + + xorl %edx,%r15d + rorl $6,%r13d + movl %edx,%ebx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%ebx + addl %r12d,%r9d + addl %r12d,%ebx + + leaq 4(%rbp),%rbp + addl %r14d,%ebx + movl 28(%rsi),%r12d + movl %r9d,%r13d + movl %ebx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r10d,%edi + + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%edi + + movl %r12d,28(%rsp) + xorl %ebx,%r14d + andl %r9d,%edi + + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi + + rorl $11,%r14d + xorl %r9d,%r13d + addl %edi,%r12d + + movl %ebx,%edi + addl (%rbp),%r12d + xorl %ebx,%r14d + + xorl %ecx,%edi + rorl $6,%r13d + movl %ecx,%eax + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%eax + addl %r12d,%r8d + addl %r12d,%eax + + leaq 20(%rbp),%rbp + addl %r14d,%eax + movl 32(%rsi),%r12d + movl %r8d,%r13d + movl %eax,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r9d,%r15d + + xorl %r8d,%r13d + rorl $9,%r14d + xorl %r10d,%r15d + + movl %r12d,32(%rsp) + xorl %eax,%r14d + andl %r8d,%r15d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d + + rorl $11,%r14d + xorl %r8d,%r13d + addl %r15d,%r12d + + movl %eax,%r15d + addl (%rbp),%r12d + xorl %eax,%r14d + + xorl %ebx,%r15d + rorl $6,%r13d + movl %ebx,%r11d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r11d + addl %r12d,%edx + addl %r12d,%r11d + + leaq 4(%rbp),%rbp + addl %r14d,%r11d + movl 36(%rsi),%r12d + movl %edx,%r13d + movl %r11d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r8d,%edi + + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%edi + + movl %r12d,36(%rsp) + xorl %r11d,%r14d + andl %edx,%edi + + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi + + rorl $11,%r14d + xorl %edx,%r13d + addl %edi,%r12d + + movl %r11d,%edi + addl (%rbp),%r12d + xorl %r11d,%r14d + + xorl %eax,%edi + rorl $6,%r13d + movl %eax,%r10d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r10d + addl %r12d,%ecx + addl %r12d,%r10d + + leaq 4(%rbp),%rbp + addl %r14d,%r10d + movl 40(%rsi),%r12d + movl %ecx,%r13d + movl %r10d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %edx,%r15d + + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r15d + + movl %r12d,40(%rsp) + xorl %r10d,%r14d + andl %ecx,%r15d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d + + rorl $11,%r14d + xorl %ecx,%r13d + addl %r15d,%r12d + + movl %r10d,%r15d + 
addl (%rbp),%r12d + xorl %r10d,%r14d + + xorl %r11d,%r15d + rorl $6,%r13d + movl %r11d,%r9d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r9d + addl %r12d,%ebx + addl %r12d,%r9d + + leaq 4(%rbp),%rbp + addl %r14d,%r9d + movl 44(%rsi),%r12d + movl %ebx,%r13d + movl %r9d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %ecx,%edi + + xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%edi + + movl %r12d,44(%rsp) + xorl %r9d,%r14d + andl %ebx,%edi + + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi + + rorl $11,%r14d + xorl %ebx,%r13d + addl %edi,%r12d + + movl %r9d,%edi + addl (%rbp),%r12d + xorl %r9d,%r14d + + xorl %r10d,%edi + rorl $6,%r13d + movl %r10d,%r8d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r8d + addl %r12d,%eax + addl %r12d,%r8d + + leaq 20(%rbp),%rbp + addl %r14d,%r8d + movl 48(%rsi),%r12d + movl %eax,%r13d + movl %r8d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %ebx,%r15d + + xorl %eax,%r13d + rorl $9,%r14d + xorl %ecx,%r15d + + movl %r12d,48(%rsp) + xorl %r8d,%r14d + andl %eax,%r15d + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d + + rorl $11,%r14d + xorl %eax,%r13d + addl %r15d,%r12d + + movl %r8d,%r15d + addl (%rbp),%r12d + xorl %r8d,%r14d + + xorl %r9d,%r15d + rorl $6,%r13d + movl %r9d,%edx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%edx + addl %r12d,%r11d + addl %r12d,%edx + + leaq 4(%rbp),%rbp + addl %r14d,%edx + movl 52(%rsi),%r12d + movl %r11d,%r13d + movl %edx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %eax,%edi + + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%edi + + movl %r12d,52(%rsp) + xorl %edx,%r14d + andl %r11d,%edi + + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi + + rorl $11,%r14d + xorl %r11d,%r13d + addl %edi,%r12d + + movl %edx,%edi + addl (%rbp),%r12d + xorl %edx,%r14d + + xorl %r8d,%edi + rorl $6,%r13d + movl %r8d,%ecx + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%ecx + addl %r12d,%r10d + addl %r12d,%ecx + + leaq 4(%rbp),%rbp + addl %r14d,%ecx + movl 56(%rsi),%r12d + movl %r10d,%r13d + movl %ecx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r11d,%r15d + + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r15d + + movl %r12d,56(%rsp) + xorl %ecx,%r14d + andl %r10d,%r15d + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d + + rorl $11,%r14d + xorl %r10d,%r13d + addl %r15d,%r12d + + movl %ecx,%r15d + addl (%rbp),%r12d + xorl %ecx,%r14d + + xorl %edx,%r15d + rorl $6,%r13d + movl %edx,%ebx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%ebx + addl %r12d,%r9d + addl %r12d,%ebx + + leaq 4(%rbp),%rbp + addl %r14d,%ebx + movl 60(%rsi),%r12d + movl %r9d,%r13d + movl %ebx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r10d,%edi + + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%edi + + movl %r12d,60(%rsp) + xorl %ebx,%r14d + andl %r9d,%edi + + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi + + rorl $11,%r14d + xorl %r9d,%r13d + addl %edi,%r12d + + movl %ebx,%edi + addl (%rbp),%r12d + xorl %ebx,%r14d + + xorl %ecx,%edi + rorl $6,%r13d + movl %ecx,%eax + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%eax + addl %r12d,%r8d + addl %r12d,%eax + + leaq 20(%rbp),%rbp + jmp L$rounds_16_xx +.p2align 4 +L$rounds_16_xx: + movl 4(%rsp),%r13d + movl 56(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%eax + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 36(%rsp),%r12d + + addl 
0(%rsp),%r12d + movl %r8d,%r13d + addl %r15d,%r12d + movl %eax,%r14d + rorl $14,%r13d + movl %r9d,%r15d + + xorl %r8d,%r13d + rorl $9,%r14d + xorl %r10d,%r15d + + movl %r12d,0(%rsp) + xorl %eax,%r14d + andl %r8d,%r15d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d + + rorl $11,%r14d + xorl %r8d,%r13d + addl %r15d,%r12d + + movl %eax,%r15d + addl (%rbp),%r12d + xorl %eax,%r14d + + xorl %ebx,%r15d + rorl $6,%r13d + movl %ebx,%r11d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r11d + addl %r12d,%edx + addl %r12d,%r11d + + leaq 4(%rbp),%rbp + movl 8(%rsp),%r13d + movl 60(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r11d + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 40(%rsp),%r12d + + addl 4(%rsp),%r12d + movl %edx,%r13d + addl %edi,%r12d + movl %r11d,%r14d + rorl $14,%r13d + movl %r8d,%edi + + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%edi + + movl %r12d,4(%rsp) + xorl %r11d,%r14d + andl %edx,%edi + + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi + + rorl $11,%r14d + xorl %edx,%r13d + addl %edi,%r12d + + movl %r11d,%edi + addl (%rbp),%r12d + xorl %r11d,%r14d + + xorl %eax,%edi + rorl $6,%r13d + movl %eax,%r10d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r10d + addl %r12d,%ecx + addl %r12d,%r10d + + leaq 4(%rbp),%rbp + movl 12(%rsp),%r13d + movl 0(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r10d + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 44(%rsp),%r12d + + addl 8(%rsp),%r12d + movl %ecx,%r13d + addl %r15d,%r12d + movl %r10d,%r14d + rorl $14,%r13d + movl %edx,%r15d + + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r15d + + movl %r12d,8(%rsp) + xorl %r10d,%r14d + andl %ecx,%r15d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d + + rorl $11,%r14d + xorl %ecx,%r13d + addl %r15d,%r12d + + movl %r10d,%r15d + addl (%rbp),%r12d + xorl %r10d,%r14d + + xorl %r11d,%r15d + rorl $6,%r13d + movl %r11d,%r9d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r9d + addl %r12d,%ebx + addl %r12d,%r9d + + leaq 4(%rbp),%rbp + movl 16(%rsp),%r13d + movl 4(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r9d + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 48(%rsp),%r12d + + addl 12(%rsp),%r12d + movl %ebx,%r13d + addl %edi,%r12d + movl %r9d,%r14d + rorl $14,%r13d + movl %ecx,%edi + + xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%edi + + movl %r12d,12(%rsp) + xorl %r9d,%r14d + andl %ebx,%edi + + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi + + rorl $11,%r14d + xorl %ebx,%r13d + addl %edi,%r12d + + movl %r9d,%edi + addl (%rbp),%r12d + xorl %r9d,%r14d + + xorl %r10d,%edi + rorl $6,%r13d + movl %r10d,%r8d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r8d + addl %r12d,%eax + addl %r12d,%r8d + + leaq 20(%rbp),%rbp + movl 20(%rsp),%r13d + movl 8(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r8d + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 52(%rsp),%r12d + + addl 16(%rsp),%r12d + movl 
%eax,%r13d + addl %r15d,%r12d + movl %r8d,%r14d + rorl $14,%r13d + movl %ebx,%r15d + + xorl %eax,%r13d + rorl $9,%r14d + xorl %ecx,%r15d + + movl %r12d,16(%rsp) + xorl %r8d,%r14d + andl %eax,%r15d + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d + + rorl $11,%r14d + xorl %eax,%r13d + addl %r15d,%r12d + + movl %r8d,%r15d + addl (%rbp),%r12d + xorl %r8d,%r14d + + xorl %r9d,%r15d + rorl $6,%r13d + movl %r9d,%edx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%edx + addl %r12d,%r11d + addl %r12d,%edx + + leaq 4(%rbp),%rbp + movl 24(%rsp),%r13d + movl 12(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%edx + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 56(%rsp),%r12d + + addl 20(%rsp),%r12d + movl %r11d,%r13d + addl %edi,%r12d + movl %edx,%r14d + rorl $14,%r13d + movl %eax,%edi + + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%edi + + movl %r12d,20(%rsp) + xorl %edx,%r14d + andl %r11d,%edi + + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi + + rorl $11,%r14d + xorl %r11d,%r13d + addl %edi,%r12d + + movl %edx,%edi + addl (%rbp),%r12d + xorl %edx,%r14d + + xorl %r8d,%edi + rorl $6,%r13d + movl %r8d,%ecx + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%ecx + addl %r12d,%r10d + addl %r12d,%ecx + + leaq 4(%rbp),%rbp + movl 28(%rsp),%r13d + movl 16(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ecx + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 60(%rsp),%r12d + + addl 24(%rsp),%r12d + movl %r10d,%r13d + addl %r15d,%r12d + movl %ecx,%r14d + rorl $14,%r13d + movl %r11d,%r15d + + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r15d + + movl %r12d,24(%rsp) + xorl %ecx,%r14d + andl %r10d,%r15d + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d + + rorl $11,%r14d + xorl %r10d,%r13d + addl %r15d,%r12d + + movl %ecx,%r15d + addl (%rbp),%r12d + xorl %ecx,%r14d + + xorl %edx,%r15d + rorl $6,%r13d + movl %edx,%ebx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%ebx + addl %r12d,%r9d + addl %r12d,%ebx + + leaq 4(%rbp),%rbp + movl 32(%rsp),%r13d + movl 20(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ebx + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 0(%rsp),%r12d + + addl 28(%rsp),%r12d + movl %r9d,%r13d + addl %edi,%r12d + movl %ebx,%r14d + rorl $14,%r13d + movl %r10d,%edi + + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%edi + + movl %r12d,28(%rsp) + xorl %ebx,%r14d + andl %r9d,%edi + + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi + + rorl $11,%r14d + xorl %r9d,%r13d + addl %edi,%r12d + + movl %ebx,%edi + addl (%rbp),%r12d + xorl %ebx,%r14d + + xorl %ecx,%edi + rorl $6,%r13d + movl %ecx,%eax + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%eax + addl %r12d,%r8d + addl %r12d,%eax + + leaq 20(%rbp),%rbp + movl 36(%rsp),%r13d + movl 24(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%eax + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 4(%rsp),%r12d + + addl 32(%rsp),%r12d + movl %r8d,%r13d + addl %r15d,%r12d + 
movl %eax,%r14d + rorl $14,%r13d + movl %r9d,%r15d + + xorl %r8d,%r13d + rorl $9,%r14d + xorl %r10d,%r15d + + movl %r12d,32(%rsp) + xorl %eax,%r14d + andl %r8d,%r15d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d + + rorl $11,%r14d + xorl %r8d,%r13d + addl %r15d,%r12d + + movl %eax,%r15d + addl (%rbp),%r12d + xorl %eax,%r14d + + xorl %ebx,%r15d + rorl $6,%r13d + movl %ebx,%r11d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r11d + addl %r12d,%edx + addl %r12d,%r11d + + leaq 4(%rbp),%rbp + movl 40(%rsp),%r13d + movl 28(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r11d + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 8(%rsp),%r12d + + addl 36(%rsp),%r12d + movl %edx,%r13d + addl %edi,%r12d + movl %r11d,%r14d + rorl $14,%r13d + movl %r8d,%edi + + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%edi + + movl %r12d,36(%rsp) + xorl %r11d,%r14d + andl %edx,%edi + + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi + + rorl $11,%r14d + xorl %edx,%r13d + addl %edi,%r12d + + movl %r11d,%edi + addl (%rbp),%r12d + xorl %r11d,%r14d + + xorl %eax,%edi + rorl $6,%r13d + movl %eax,%r10d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r10d + addl %r12d,%ecx + addl %r12d,%r10d + + leaq 4(%rbp),%rbp + movl 44(%rsp),%r13d + movl 32(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r10d + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 12(%rsp),%r12d + + addl 40(%rsp),%r12d + movl %ecx,%r13d + addl %r15d,%r12d + movl %r10d,%r14d + rorl $14,%r13d + movl %edx,%r15d + + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r15d + + movl %r12d,40(%rsp) + xorl %r10d,%r14d + andl %ecx,%r15d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d + + rorl $11,%r14d + xorl %ecx,%r13d + addl %r15d,%r12d + + movl %r10d,%r15d + addl (%rbp),%r12d + xorl %r10d,%r14d + + xorl %r11d,%r15d + rorl $6,%r13d + movl %r11d,%r9d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r9d + addl %r12d,%ebx + addl %r12d,%r9d + + leaq 4(%rbp),%rbp + movl 48(%rsp),%r13d + movl 36(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r9d + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 16(%rsp),%r12d + + addl 44(%rsp),%r12d + movl %ebx,%r13d + addl %edi,%r12d + movl %r9d,%r14d + rorl $14,%r13d + movl %ecx,%edi + + xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%edi + + movl %r12d,44(%rsp) + xorl %r9d,%r14d + andl %ebx,%edi + + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi + + rorl $11,%r14d + xorl %ebx,%r13d + addl %edi,%r12d + + movl %r9d,%edi + addl (%rbp),%r12d + xorl %r9d,%r14d + + xorl %r10d,%edi + rorl $6,%r13d + movl %r10d,%r8d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r8d + addl %r12d,%eax + addl %r12d,%r8d + + leaq 20(%rbp),%rbp + movl 52(%rsp),%r13d + movl 40(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r8d + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 20(%rsp),%r12d + + addl 48(%rsp),%r12d + movl %eax,%r13d + addl %r15d,%r12d + movl %r8d,%r14d + 
rorl $14,%r13d + movl %ebx,%r15d + + xorl %eax,%r13d + rorl $9,%r14d + xorl %ecx,%r15d + + movl %r12d,48(%rsp) + xorl %r8d,%r14d + andl %eax,%r15d + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d + + rorl $11,%r14d + xorl %eax,%r13d + addl %r15d,%r12d + + movl %r8d,%r15d + addl (%rbp),%r12d + xorl %r8d,%r14d + + xorl %r9d,%r15d + rorl $6,%r13d + movl %r9d,%edx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%edx + addl %r12d,%r11d + addl %r12d,%edx + + leaq 4(%rbp),%rbp + movl 56(%rsp),%r13d + movl 44(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%edx + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 24(%rsp),%r12d + + addl 52(%rsp),%r12d + movl %r11d,%r13d + addl %edi,%r12d + movl %edx,%r14d + rorl $14,%r13d + movl %eax,%edi + + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%edi + + movl %r12d,52(%rsp) + xorl %edx,%r14d + andl %r11d,%edi + + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi + + rorl $11,%r14d + xorl %r11d,%r13d + addl %edi,%r12d + + movl %edx,%edi + addl (%rbp),%r12d + xorl %edx,%r14d + + xorl %r8d,%edi + rorl $6,%r13d + movl %r8d,%ecx + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%ecx + addl %r12d,%r10d + addl %r12d,%ecx + + leaq 4(%rbp),%rbp + movl 60(%rsp),%r13d + movl 48(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ecx + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 28(%rsp),%r12d + + addl 56(%rsp),%r12d + movl %r10d,%r13d + addl %r15d,%r12d + movl %ecx,%r14d + rorl $14,%r13d + movl %r11d,%r15d + + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r15d + + movl %r12d,56(%rsp) + xorl %ecx,%r14d + andl %r10d,%r15d + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d + + rorl $11,%r14d + xorl %r10d,%r13d + addl %r15d,%r12d + + movl %ecx,%r15d + addl (%rbp),%r12d + xorl %ecx,%r14d + + xorl %edx,%r15d + rorl $6,%r13d + movl %edx,%ebx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%ebx + addl %r12d,%r9d + addl %r12d,%ebx + + leaq 4(%rbp),%rbp + movl 0(%rsp),%r13d + movl 52(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ebx + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 32(%rsp),%r12d + + addl 60(%rsp),%r12d + movl %r9d,%r13d + addl %edi,%r12d + movl %ebx,%r14d + rorl $14,%r13d + movl %r10d,%edi + + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%edi + + movl %r12d,60(%rsp) + xorl %ebx,%r14d + andl %r9d,%edi + + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi + + rorl $11,%r14d + xorl %r9d,%r13d + addl %edi,%r12d + + movl %ebx,%edi + addl (%rbp),%r12d + xorl %ebx,%r14d + + xorl %ecx,%edi + rorl $6,%r13d + movl %ecx,%eax + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%eax + addl %r12d,%r8d + addl %r12d,%eax + + leaq 20(%rbp),%rbp + cmpb $0,3(%rbp) + jnz L$rounds_16_xx + + movq 64+0(%rsp),%rdi + addl %r14d,%eax + leaq 64(%rsi),%rsi + + addl 0(%rdi),%eax + addl 4(%rdi),%ebx + addl 8(%rdi),%ecx + addl 12(%rdi),%edx + addl 16(%rdi),%r8d + addl 20(%rdi),%r9d + addl 24(%rdi),%r10d + addl 28(%rdi),%r11d + + cmpq 64+16(%rsp),%rsi + + movl %eax,0(%rdi) + movl %ebx,4(%rdi) + movl %ecx,8(%rdi) + movl %edx,12(%rdi) + movl %r8d,16(%rdi) + movl 
%r9d,20(%rdi) + movl %r10d,24(%rdi) + movl %r11d,28(%rdi) + jb L$loop + + movq 88(%rsp),%rsi + + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbp + + movq -8(%rsi),%rbx + + leaq (%rsi),%rsp + +L$epilogue: + ret + + +.section __DATA,__const +.p2align 6 + +K256: +.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 +.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + +.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f +.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f +.long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff +.long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff +.long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 +.long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 +.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.text +.globl _sha256_block_data_order_hw +.private_extern _sha256_block_data_order_hw + +.p2align 6 +_sha256_block_data_order_hw: + +_CET_ENDBR + leaq K256+128(%rip),%rcx + movdqu (%rdi),%xmm1 + movdqu 16(%rdi),%xmm2 + movdqa 512-128(%rcx),%xmm7 + + pshufd $0x1b,%xmm1,%xmm0 + pshufd $0xb1,%xmm1,%xmm1 + pshufd $0x1b,%xmm2,%xmm2 + movdqa %xmm7,%xmm8 +.byte 102,15,58,15,202,8 + punpcklqdq %xmm0,%xmm2 + jmp L$oop_shaext + +.p2align 4 +L$oop_shaext: + movdqu (%rsi),%xmm3 + movdqu 16(%rsi),%xmm4 + movdqu 32(%rsi),%xmm5 +.byte 102,15,56,0,223 + movdqu 48(%rsi),%xmm6 + + movdqa 0-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 102,15,56,0,231 + movdqa %xmm2,%xmm10 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + nop + movdqa %xmm1,%xmm9 +.byte 15,56,203,202 + + movdqa 32-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 102,15,56,0,239 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + leaq 64(%rsi),%rsi +.byte 15,56,204,220 +.byte 15,56,203,202 + + movdqa 64-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 102,15,56,0,247 +.byte 15,56,203,209 + 
pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 +.byte 102,15,58,15,253,4 + nop + paddd %xmm7,%xmm3 +.byte 15,56,204,229 +.byte 15,56,203,202 + + movdqa 96-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 +.byte 15,56,205,222 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 +.byte 102,15,58,15,254,4 + nop + paddd %xmm7,%xmm4 +.byte 15,56,204,238 +.byte 15,56,203,202 + movdqa 128-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 15,56,205,227 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 +.byte 102,15,58,15,251,4 + nop + paddd %xmm7,%xmm5 +.byte 15,56,204,243 +.byte 15,56,203,202 + movdqa 160-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 15,56,205,236 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 +.byte 102,15,58,15,252,4 + nop + paddd %xmm7,%xmm6 +.byte 15,56,204,220 +.byte 15,56,203,202 + movdqa 192-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 15,56,205,245 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 +.byte 102,15,58,15,253,4 + nop + paddd %xmm7,%xmm3 +.byte 15,56,204,229 +.byte 15,56,203,202 + movdqa 224-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 +.byte 15,56,205,222 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 +.byte 102,15,58,15,254,4 + nop + paddd %xmm7,%xmm4 +.byte 15,56,204,238 +.byte 15,56,203,202 + movdqa 256-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 15,56,205,227 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 +.byte 102,15,58,15,251,4 + nop + paddd %xmm7,%xmm5 +.byte 15,56,204,243 +.byte 15,56,203,202 + movdqa 288-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 15,56,205,236 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 +.byte 102,15,58,15,252,4 + nop + paddd %xmm7,%xmm6 +.byte 15,56,204,220 +.byte 15,56,203,202 + movdqa 320-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 15,56,205,245 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 +.byte 102,15,58,15,253,4 + nop + paddd %xmm7,%xmm3 +.byte 15,56,204,229 +.byte 15,56,203,202 + movdqa 352-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 +.byte 15,56,205,222 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 +.byte 102,15,58,15,254,4 + nop + paddd %xmm7,%xmm4 +.byte 15,56,204,238 +.byte 15,56,203,202 + movdqa 384-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 15,56,205,227 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 +.byte 102,15,58,15,251,4 + nop + paddd %xmm7,%xmm5 +.byte 15,56,204,243 +.byte 15,56,203,202 + movdqa 416-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 15,56,205,236 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 +.byte 102,15,58,15,252,4 +.byte 15,56,203,202 + paddd %xmm7,%xmm6 + + movdqa 448-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 +.byte 15,56,205,245 + movdqa %xmm8,%xmm7 +.byte 15,56,203,202 + + movdqa 480-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 + nop +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + decq %rdx + nop +.byte 15,56,203,202 + + paddd %xmm10,%xmm2 + paddd %xmm9,%xmm1 + jnz L$oop_shaext + + pshufd $0xb1,%xmm2,%xmm2 + pshufd $0x1b,%xmm1,%xmm7 + pshufd $0xb1,%xmm1,%xmm1 + punpckhqdq %xmm2,%xmm1 +.byte 102,15,58,15,215,8 + + movdqu %xmm1,(%rdi) + movdqu %xmm2,16(%rdi) + ret + + +.globl _sha256_block_data_order_ssse3 +.private_extern _sha256_block_data_order_ssse3 + +.p2align 6 +_sha256_block_data_order_ssse3: + +_CET_ENDBR + movq %rsp,%rax + + pushq %rbx + + pushq %rbp + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + shlq $4,%rdx + subq $96,%rsp + leaq (%rsi,%rdx,4),%rdx + 
andq $-64,%rsp + movq %rdi,64+0(%rsp) + movq %rsi,64+8(%rsp) + movq %rdx,64+16(%rsp) + movq %rax,88(%rsp) + +L$prologue_ssse3: + + movl 0(%rdi),%eax + movl 4(%rdi),%ebx + movl 8(%rdi),%ecx + movl 12(%rdi),%edx + movl 16(%rdi),%r8d + movl 20(%rdi),%r9d + movl 24(%rdi),%r10d + movl 28(%rdi),%r11d + + + jmp L$loop_ssse3 +.p2align 4 +L$loop_ssse3: + movdqa K256+512(%rip),%xmm7 + movdqu 0(%rsi),%xmm0 + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 +.byte 102,15,56,0,199 + movdqu 48(%rsi),%xmm3 + leaq K256(%rip),%rbp +.byte 102,15,56,0,207 + movdqa 0(%rbp),%xmm4 + movdqa 32(%rbp),%xmm5 +.byte 102,15,56,0,215 + paddd %xmm0,%xmm4 + movdqa 64(%rbp),%xmm6 +.byte 102,15,56,0,223 + movdqa 96(%rbp),%xmm7 + paddd %xmm1,%xmm5 + paddd %xmm2,%xmm6 + paddd %xmm3,%xmm7 + movdqa %xmm4,0(%rsp) + movl %eax,%r14d + movdqa %xmm5,16(%rsp) + movl %ebx,%edi + movdqa %xmm6,32(%rsp) + xorl %ecx,%edi + movdqa %xmm7,48(%rsp) + movl %r8d,%r13d + jmp L$ssse3_00_47 + +.p2align 4 +L$ssse3_00_47: + subq $-128,%rbp + rorl $14,%r13d + movdqa %xmm1,%xmm4 + movl %r14d,%eax + movl %r9d,%r12d + movdqa %xmm3,%xmm7 + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d +.byte 102,15,58,15,224,4 + andl %r8d,%r12d + xorl %r8d,%r13d +.byte 102,15,58,15,250,4 + addl 0(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %ebx,%r15d + addl %r12d,%r11d + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + paddd %xmm7,%xmm0 + rorl $2,%r14d + addl %r11d,%edx + psrld $7,%xmm6 + addl %edi,%r11d + movl %edx,%r13d + pshufd $250,%xmm3,%xmm7 + addl %r11d,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%r11d + movl %r8d,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %r11d,%r14d + pxor %xmm5,%xmm4 + andl %edx,%r12d + xorl %edx,%r13d + pslld $11,%xmm5 + addl 4(%rsp),%r10d + movl %r11d,%edi + pxor %xmm6,%xmm4 + xorl %r9d,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %eax,%edi + addl %r12d,%r10d + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + psrld $10,%xmm7 + addl %r13d,%r10d + xorl %eax,%r15d + paddd %xmm4,%xmm0 + rorl $2,%r14d + addl %r10d,%ecx + psrlq $17,%xmm6 + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %ecx,%r13d + xorl %r8d,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %ecx,%r13d + addl 8(%rsp),%r9d + movl %r10d,%r15d + psrldq $8,%xmm7 + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + paddd %xmm7,%xmm0 + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + pshufd $80,%xmm0,%xmm7 + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + movdqa %xmm7,%xmm6 + addl %edi,%r9d + movl %ebx,%r13d + psrld $10,%xmm7 + addl %r9d,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%r9d + movl %ecx,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + psrlq $2,%xmm6 + andl %ebx,%r12d + xorl %ebx,%r13d + addl 12(%rsp),%r8d + pxor %xmm6,%xmm7 + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %r10d,%edi + addl %r12d,%r8d + movdqa 0(%rbp),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + paddd %xmm7,%xmm0 + rorl $2,%r14d + addl 
%r8d,%eax + addl %r15d,%r8d + paddd %xmm0,%xmm6 + movl %eax,%r13d + addl %r8d,%r14d + movdqa %xmm6,0(%rsp) + rorl $14,%r13d + movdqa %xmm2,%xmm4 + movl %r14d,%r8d + movl %ebx,%r12d + movdqa %xmm0,%xmm7 + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d +.byte 102,15,58,15,225,4 + andl %eax,%r12d + xorl %eax,%r13d +.byte 102,15,58,15,251,4 + addl 16(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %r9d,%r15d + addl %r12d,%edx + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + paddd %xmm7,%xmm1 + rorl $2,%r14d + addl %edx,%r11d + psrld $7,%xmm6 + addl %edi,%edx + movl %r11d,%r13d + pshufd $250,%xmm0,%xmm7 + addl %edx,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%edx + movl %eax,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %edx,%r14d + pxor %xmm5,%xmm4 + andl %r11d,%r12d + xorl %r11d,%r13d + pslld $11,%xmm5 + addl 20(%rsp),%ecx + movl %edx,%edi + pxor %xmm6,%xmm4 + xorl %ebx,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %r8d,%edi + addl %r12d,%ecx + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + psrld $10,%xmm7 + addl %r13d,%ecx + xorl %r8d,%r15d + paddd %xmm4,%xmm1 + rorl $2,%r14d + addl %ecx,%r10d + psrlq $17,%xmm6 + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %r10d,%r13d + xorl %eax,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %r10d,%r13d + addl 24(%rsp),%ebx + movl %ecx,%r15d + psrldq $8,%xmm7 + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + paddd %xmm7,%xmm1 + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + pshufd $80,%xmm1,%xmm7 + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + movdqa %xmm7,%xmm6 + addl %edi,%ebx + movl %r9d,%r13d + psrld $10,%xmm7 + addl %ebx,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%ebx + movl %r10d,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + psrlq $2,%xmm6 + andl %r9d,%r12d + xorl %r9d,%r13d + addl 28(%rsp),%eax + pxor %xmm6,%xmm7 + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %ecx,%edi + addl %r12d,%eax + movdqa 32(%rbp),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + paddd %xmm7,%xmm1 + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + paddd %xmm1,%xmm6 + movl %r8d,%r13d + addl %eax,%r14d + movdqa %xmm6,16(%rsp) + rorl $14,%r13d + movdqa %xmm3,%xmm4 + movl %r14d,%eax + movl %r9d,%r12d + movdqa %xmm1,%xmm7 + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d +.byte 102,15,58,15,226,4 + andl %r8d,%r12d + xorl %r8d,%r13d +.byte 102,15,58,15,248,4 + addl 32(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %ebx,%r15d + addl %r12d,%r11d + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + paddd %xmm7,%xmm2 + rorl $2,%r14d + addl %r11d,%edx + psrld $7,%xmm6 + addl %edi,%r11d + movl %edx,%r13d + pshufd $250,%xmm1,%xmm7 + addl %r11d,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%r11d + movl %r8d,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + 
xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %r11d,%r14d + pxor %xmm5,%xmm4 + andl %edx,%r12d + xorl %edx,%r13d + pslld $11,%xmm5 + addl 36(%rsp),%r10d + movl %r11d,%edi + pxor %xmm6,%xmm4 + xorl %r9d,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %eax,%edi + addl %r12d,%r10d + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + psrld $10,%xmm7 + addl %r13d,%r10d + xorl %eax,%r15d + paddd %xmm4,%xmm2 + rorl $2,%r14d + addl %r10d,%ecx + psrlq $17,%xmm6 + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %ecx,%r13d + xorl %r8d,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %ecx,%r13d + addl 40(%rsp),%r9d + movl %r10d,%r15d + psrldq $8,%xmm7 + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + paddd %xmm7,%xmm2 + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + pshufd $80,%xmm2,%xmm7 + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + movdqa %xmm7,%xmm6 + addl %edi,%r9d + movl %ebx,%r13d + psrld $10,%xmm7 + addl %r9d,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%r9d + movl %ecx,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + psrlq $2,%xmm6 + andl %ebx,%r12d + xorl %ebx,%r13d + addl 44(%rsp),%r8d + pxor %xmm6,%xmm7 + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %r10d,%edi + addl %r12d,%r8d + movdqa 64(%rbp),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + paddd %xmm7,%xmm2 + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + paddd %xmm2,%xmm6 + movl %eax,%r13d + addl %r8d,%r14d + movdqa %xmm6,32(%rsp) + rorl $14,%r13d + movdqa %xmm0,%xmm4 + movl %r14d,%r8d + movl %ebx,%r12d + movdqa %xmm2,%xmm7 + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d +.byte 102,15,58,15,227,4 + andl %eax,%r12d + xorl %eax,%r13d +.byte 102,15,58,15,249,4 + addl 48(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %r9d,%r15d + addl %r12d,%edx + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + paddd %xmm7,%xmm3 + rorl $2,%r14d + addl %edx,%r11d + psrld $7,%xmm6 + addl %edi,%edx + movl %r11d,%r13d + pshufd $250,%xmm2,%xmm7 + addl %edx,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%edx + movl %eax,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %edx,%r14d + pxor %xmm5,%xmm4 + andl %r11d,%r12d + xorl %r11d,%r13d + pslld $11,%xmm5 + addl 52(%rsp),%ecx + movl %edx,%edi + pxor %xmm6,%xmm4 + xorl %ebx,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %r8d,%edi + addl %r12d,%ecx + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + psrld $10,%xmm7 + addl %r13d,%ecx + xorl %r8d,%r15d + paddd %xmm4,%xmm3 + rorl $2,%r14d + addl %ecx,%r10d + psrlq $17,%xmm6 + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %r10d,%r13d + xorl %eax,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %r10d,%r13d + addl 56(%rsp),%ebx + movl %ecx,%r15d + psrldq $8,%xmm7 + xorl 
%eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + paddd %xmm7,%xmm3 + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + pshufd $80,%xmm3,%xmm7 + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + movdqa %xmm7,%xmm6 + addl %edi,%ebx + movl %r9d,%r13d + psrld $10,%xmm7 + addl %ebx,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%ebx + movl %r10d,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + psrlq $2,%xmm6 + andl %r9d,%r12d + xorl %r9d,%r13d + addl 60(%rsp),%eax + pxor %xmm6,%xmm7 + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %ecx,%edi + addl %r12d,%eax + movdqa 96(%rbp),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + paddd %xmm7,%xmm3 + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + paddd %xmm3,%xmm6 + movl %r8d,%r13d + addl %eax,%r14d + movdqa %xmm6,48(%rsp) + cmpb $0,131(%rbp) + jne L$ssse3_00_47 + rorl $14,%r13d + movl %r14d,%eax + movl %r9d,%r12d + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + xorl %r8d,%r13d + addl 0(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + xorl %ebx,%r15d + addl %r12d,%r11d + rorl $6,%r13d + andl %r15d,%edi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + rorl $2,%r14d + addl %r11d,%edx + addl %edi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + rorl $14,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + addl 4(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + rorl $11,%r14d + xorl %eax,%edi + addl %r12d,%r10d + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + rorl $2,%r14d + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + xorl %ecx,%r13d + xorl %r8d,%r12d + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + xorl %ecx,%r13d + addl 8(%rsp),%r9d + movl %r10d,%r15d + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + rorl $14,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + xorl %ebx,%r13d + addl 12(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + rorl $6,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + rorl $14,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + xorl %eax,%r13d + addl 16(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + xorl %r9d,%r15d + addl %r12d,%edx + rorl $6,%r13d + andl %r15d,%edi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + rorl $2,%r14d + addl %edx,%r11d + addl %edi,%edx + movl %r11d,%r13d + addl %edx,%r14d + rorl $14,%r13d + movl %r14d,%edx + movl %eax,%r12d + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + addl 20(%rsp),%ecx + movl 
%edx,%edi + xorl %ebx,%r12d + rorl $11,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + rorl $2,%r14d + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + xorl %r10d,%r13d + xorl %eax,%r12d + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + xorl %r10d,%r13d + addl 24(%rsp),%ebx + movl %ecx,%r15d + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + rorl $14,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + xorl %r9d,%r13d + addl 28(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + xorl %ecx,%edi + addl %r12d,%eax + rorl $6,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + rorl $14,%r13d + movl %r14d,%eax + movl %r9d,%r12d + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + xorl %r8d,%r13d + addl 32(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + xorl %ebx,%r15d + addl %r12d,%r11d + rorl $6,%r13d + andl %r15d,%edi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + rorl $2,%r14d + addl %r11d,%edx + addl %edi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + rorl $14,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + addl 36(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + rorl $11,%r14d + xorl %eax,%edi + addl %r12d,%r10d + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + rorl $2,%r14d + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + xorl %ecx,%r13d + xorl %r8d,%r12d + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + xorl %ecx,%r13d + addl 40(%rsp),%r9d + movl %r10d,%r15d + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + rorl $14,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + xorl %ebx,%r13d + addl 44(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + rorl $6,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + rorl $14,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + xorl %eax,%r13d + addl 48(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + xorl %r9d,%r15d + addl %r12d,%edx + rorl $6,%r13d + andl %r15d,%edi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + rorl $2,%r14d + addl %edx,%r11d + addl %edi,%edx + movl %r11d,%r13d + addl %edx,%r14d + rorl $14,%r13d + movl %r14d,%edx + movl %eax,%r12d + 
rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + addl 52(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + rorl $11,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + rorl $2,%r14d + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + xorl %r10d,%r13d + xorl %eax,%r12d + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + xorl %r10d,%r13d + addl 56(%rsp),%ebx + movl %ecx,%r15d + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + rorl $14,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + xorl %r9d,%r13d + addl 60(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + xorl %ecx,%edi + addl %r12d,%eax + rorl $6,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + movq 64+0(%rsp),%rdi + movl %r14d,%eax + + addl 0(%rdi),%eax + leaq 64(%rsi),%rsi + addl 4(%rdi),%ebx + addl 8(%rdi),%ecx + addl 12(%rdi),%edx + addl 16(%rdi),%r8d + addl 20(%rdi),%r9d + addl 24(%rdi),%r10d + addl 28(%rdi),%r11d + + cmpq 64+16(%rsp),%rsi + + movl %eax,0(%rdi) + movl %ebx,4(%rdi) + movl %ecx,8(%rdi) + movl %edx,12(%rdi) + movl %r8d,16(%rdi) + movl %r9d,20(%rdi) + movl %r10d,24(%rdi) + movl %r11d,28(%rdi) + jb L$loop_ssse3 + + movq 88(%rsp),%rsi + + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbp + + movq -8(%rsi),%rbx + + leaq (%rsi),%rsp + +L$epilogue_ssse3: + ret + + +.globl _sha256_block_data_order_avx +.private_extern _sha256_block_data_order_avx + +.p2align 6 +_sha256_block_data_order_avx: + +_CET_ENDBR + movq %rsp,%rax + + pushq %rbx + + pushq %rbp + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + shlq $4,%rdx + subq $96,%rsp + leaq (%rsi,%rdx,4),%rdx + andq $-64,%rsp + movq %rdi,64+0(%rsp) + movq %rsi,64+8(%rsp) + movq %rdx,64+16(%rsp) + movq %rax,88(%rsp) + +L$prologue_avx: + + vzeroupper + movl 0(%rdi),%eax + movl 4(%rdi),%ebx + movl 8(%rdi),%ecx + movl 12(%rdi),%edx + movl 16(%rdi),%r8d + movl 20(%rdi),%r9d + movl 24(%rdi),%r10d + movl 28(%rdi),%r11d + vmovdqa K256+512+32(%rip),%xmm8 + vmovdqa K256+512+64(%rip),%xmm9 + jmp L$loop_avx +.p2align 4 +L$loop_avx: + vmovdqa K256+512(%rip),%xmm7 + vmovdqu 0(%rsi),%xmm0 + vmovdqu 16(%rsi),%xmm1 + vmovdqu 32(%rsi),%xmm2 + vmovdqu 48(%rsi),%xmm3 + vpshufb %xmm7,%xmm0,%xmm0 + leaq K256(%rip),%rbp + vpshufb %xmm7,%xmm1,%xmm1 + vpshufb %xmm7,%xmm2,%xmm2 + vpaddd 0(%rbp),%xmm0,%xmm4 + vpshufb %xmm7,%xmm3,%xmm3 + vpaddd 32(%rbp),%xmm1,%xmm5 + vpaddd 64(%rbp),%xmm2,%xmm6 + vpaddd 96(%rbp),%xmm3,%xmm7 + vmovdqa %xmm4,0(%rsp) + movl %eax,%r14d + vmovdqa %xmm5,16(%rsp) + movl %ebx,%edi + vmovdqa %xmm6,32(%rsp) + xorl %ecx,%edi + vmovdqa %xmm7,48(%rsp) + movl %r8d,%r13d + jmp L$avx_00_47 + +.p2align 4 +L$avx_00_47: + subq $-128,%rbp + vpalignr $4,%xmm0,%xmm1,%xmm4 + shrdl $14,%r13d,%r13d + movl %r14d,%eax + movl %r9d,%r12d + vpalignr $4,%xmm2,%xmm3,%xmm7 + shrdl $9,%r14d,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + vpsrld $7,%xmm4,%xmm6 + shrdl 
$5,%r13d,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + vpaddd %xmm7,%xmm0,%xmm0 + xorl %r8d,%r13d + addl 0(%rsp),%r11d + movl %eax,%r15d + vpsrld $3,%xmm4,%xmm7 + xorl %r10d,%r12d + shrdl $11,%r14d,%r14d + xorl %ebx,%r15d + vpslld $14,%xmm4,%xmm5 + addl %r12d,%r11d + shrdl $6,%r13d,%r13d + andl %r15d,%edi + vpxor %xmm6,%xmm7,%xmm4 + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + vpshufd $250,%xmm3,%xmm7 + shrdl $2,%r14d,%r14d + addl %r11d,%edx + addl %edi,%r11d + vpsrld $11,%xmm6,%xmm6 + movl %edx,%r13d + addl %r11d,%r14d + shrdl $14,%r13d,%r13d + vpxor %xmm5,%xmm4,%xmm4 + movl %r14d,%r11d + movl %r8d,%r12d + shrdl $9,%r14d,%r14d + vpslld $11,%xmm5,%xmm5 + xorl %edx,%r13d + xorl %r9d,%r12d + shrdl $5,%r13d,%r13d + vpxor %xmm6,%xmm4,%xmm4 + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + vpsrld $10,%xmm7,%xmm6 + addl 4(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + vpxor %xmm5,%xmm4,%xmm4 + shrdl $11,%r14d,%r14d + xorl %eax,%edi + addl %r12d,%r10d + vpsrlq $17,%xmm7,%xmm7 + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + vpaddd %xmm4,%xmm0,%xmm0 + addl %r13d,%r10d + xorl %eax,%r15d + shrdl $2,%r14d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + vpsrlq $2,%xmm7,%xmm7 + addl %r10d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r10d + vpxor %xmm7,%xmm6,%xmm6 + movl %edx,%r12d + shrdl $9,%r14d,%r14d + xorl %ecx,%r13d + vpshufb %xmm8,%xmm6,%xmm6 + xorl %r8d,%r12d + shrdl $5,%r13d,%r13d + xorl %r10d,%r14d + vpaddd %xmm6,%xmm0,%xmm0 + andl %ecx,%r12d + xorl %ecx,%r13d + addl 8(%rsp),%r9d + vpshufd $80,%xmm0,%xmm7 + movl %r10d,%r15d + xorl %r8d,%r12d + shrdl $11,%r14d,%r14d + vpsrld $10,%xmm7,%xmm6 + xorl %r11d,%r15d + addl %r12d,%r9d + shrdl $6,%r13d,%r13d + vpsrlq $17,%xmm7,%xmm7 + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + vpxor %xmm7,%xmm6,%xmm6 + xorl %r11d,%edi + shrdl $2,%r14d,%r14d + addl %r9d,%ebx + vpsrlq $2,%xmm7,%xmm7 + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + shrdl $14,%r13d,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + vpshufb %xmm9,%xmm6,%xmm6 + shrdl $9,%r14d,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + vpaddd %xmm6,%xmm0,%xmm0 + shrdl $5,%r13d,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + vpaddd 0(%rbp),%xmm0,%xmm6 + xorl %ebx,%r13d + addl 12(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + shrdl $11,%r14d,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + shrdl $2,%r14d,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + vmovdqa %xmm6,0(%rsp) + vpalignr $4,%xmm1,%xmm2,%xmm4 + shrdl $14,%r13d,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + vpalignr $4,%xmm3,%xmm0,%xmm7 + shrdl $9,%r14d,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%r13d,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + vpaddd %xmm7,%xmm1,%xmm1 + xorl %eax,%r13d + addl 16(%rsp),%edx + movl %r8d,%r15d + vpsrld $3,%xmm4,%xmm7 + xorl %ecx,%r12d + shrdl $11,%r14d,%r14d + xorl %r9d,%r15d + vpslld $14,%xmm4,%xmm5 + addl %r12d,%edx + shrdl $6,%r13d,%r13d + andl %r15d,%edi + vpxor %xmm6,%xmm7,%xmm4 + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + vpshufd $250,%xmm0,%xmm7 + shrdl $2,%r14d,%r14d + addl %edx,%r11d + addl %edi,%edx + vpsrld $11,%xmm6,%xmm6 + movl %r11d,%r13d + addl %edx,%r14d + shrdl $14,%r13d,%r13d + vpxor %xmm5,%xmm4,%xmm4 + movl %r14d,%edx + movl %eax,%r12d + shrdl $9,%r14d,%r14d + vpslld $11,%xmm5,%xmm5 + xorl %r11d,%r13d + xorl %ebx,%r12d + shrdl 
$5,%r13d,%r13d + vpxor %xmm6,%xmm4,%xmm4 + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + vpsrld $10,%xmm7,%xmm6 + addl 20(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + vpxor %xmm5,%xmm4,%xmm4 + shrdl $11,%r14d,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + vpsrlq $17,%xmm7,%xmm7 + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %edx,%r14d + vpaddd %xmm4,%xmm1,%xmm1 + addl %r13d,%ecx + xorl %r8d,%r15d + shrdl $2,%r14d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + vpsrlq $2,%xmm7,%xmm7 + addl %ecx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ecx + vpxor %xmm7,%xmm6,%xmm6 + movl %r11d,%r12d + shrdl $9,%r14d,%r14d + xorl %r10d,%r13d + vpshufb %xmm8,%xmm6,%xmm6 + xorl %eax,%r12d + shrdl $5,%r13d,%r13d + xorl %ecx,%r14d + vpaddd %xmm6,%xmm1,%xmm1 + andl %r10d,%r12d + xorl %r10d,%r13d + addl 24(%rsp),%ebx + vpshufd $80,%xmm1,%xmm7 + movl %ecx,%r15d + xorl %eax,%r12d + shrdl $11,%r14d,%r14d + vpsrld $10,%xmm7,%xmm6 + xorl %edx,%r15d + addl %r12d,%ebx + shrdl $6,%r13d,%r13d + vpsrlq $17,%xmm7,%xmm7 + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + vpxor %xmm7,%xmm6,%xmm6 + xorl %edx,%edi + shrdl $2,%r14d,%r14d + addl %ebx,%r9d + vpsrlq $2,%xmm7,%xmm7 + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + vpxor %xmm7,%xmm6,%xmm6 + shrdl $14,%r13d,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + vpshufb %xmm9,%xmm6,%xmm6 + shrdl $9,%r14d,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + vpaddd %xmm6,%xmm1,%xmm1 + shrdl $5,%r13d,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + vpaddd 32(%rbp),%xmm1,%xmm6 + xorl %r9d,%r13d + addl 28(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + shrdl $11,%r14d,%r14d + xorl %ecx,%edi + addl %r12d,%eax + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + shrdl $2,%r14d,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + vmovdqa %xmm6,16(%rsp) + vpalignr $4,%xmm2,%xmm3,%xmm4 + shrdl $14,%r13d,%r13d + movl %r14d,%eax + movl %r9d,%r12d + vpalignr $4,%xmm0,%xmm1,%xmm7 + shrdl $9,%r14d,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%r13d,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + vpaddd %xmm7,%xmm2,%xmm2 + xorl %r8d,%r13d + addl 32(%rsp),%r11d + movl %eax,%r15d + vpsrld $3,%xmm4,%xmm7 + xorl %r10d,%r12d + shrdl $11,%r14d,%r14d + xorl %ebx,%r15d + vpslld $14,%xmm4,%xmm5 + addl %r12d,%r11d + shrdl $6,%r13d,%r13d + andl %r15d,%edi + vpxor %xmm6,%xmm7,%xmm4 + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + vpshufd $250,%xmm1,%xmm7 + shrdl $2,%r14d,%r14d + addl %r11d,%edx + addl %edi,%r11d + vpsrld $11,%xmm6,%xmm6 + movl %edx,%r13d + addl %r11d,%r14d + shrdl $14,%r13d,%r13d + vpxor %xmm5,%xmm4,%xmm4 + movl %r14d,%r11d + movl %r8d,%r12d + shrdl $9,%r14d,%r14d + vpslld $11,%xmm5,%xmm5 + xorl %edx,%r13d + xorl %r9d,%r12d + shrdl $5,%r13d,%r13d + vpxor %xmm6,%xmm4,%xmm4 + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + vpsrld $10,%xmm7,%xmm6 + addl 36(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + vpxor %xmm5,%xmm4,%xmm4 + shrdl $11,%r14d,%r14d + xorl %eax,%edi + addl %r12d,%r10d + vpsrlq $17,%xmm7,%xmm7 + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + vpaddd %xmm4,%xmm2,%xmm2 + addl %r13d,%r10d + xorl %eax,%r15d + shrdl $2,%r14d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + vpsrlq $2,%xmm7,%xmm7 + addl %r10d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r10d + vpxor %xmm7,%xmm6,%xmm6 + movl %edx,%r12d + shrdl $9,%r14d,%r14d + xorl %ecx,%r13d + vpshufb %xmm8,%xmm6,%xmm6 + 
xorl %r8d,%r12d + shrdl $5,%r13d,%r13d + xorl %r10d,%r14d + vpaddd %xmm6,%xmm2,%xmm2 + andl %ecx,%r12d + xorl %ecx,%r13d + addl 40(%rsp),%r9d + vpshufd $80,%xmm2,%xmm7 + movl %r10d,%r15d + xorl %r8d,%r12d + shrdl $11,%r14d,%r14d + vpsrld $10,%xmm7,%xmm6 + xorl %r11d,%r15d + addl %r12d,%r9d + shrdl $6,%r13d,%r13d + vpsrlq $17,%xmm7,%xmm7 + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + vpxor %xmm7,%xmm6,%xmm6 + xorl %r11d,%edi + shrdl $2,%r14d,%r14d + addl %r9d,%ebx + vpsrlq $2,%xmm7,%xmm7 + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + shrdl $14,%r13d,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + vpshufb %xmm9,%xmm6,%xmm6 + shrdl $9,%r14d,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + vpaddd %xmm6,%xmm2,%xmm2 + shrdl $5,%r13d,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + vpaddd 64(%rbp),%xmm2,%xmm6 + xorl %ebx,%r13d + addl 44(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + shrdl $11,%r14d,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + shrdl $2,%r14d,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + vmovdqa %xmm6,32(%rsp) + vpalignr $4,%xmm3,%xmm0,%xmm4 + shrdl $14,%r13d,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + vpalignr $4,%xmm1,%xmm2,%xmm7 + shrdl $9,%r14d,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%r13d,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + vpaddd %xmm7,%xmm3,%xmm3 + xorl %eax,%r13d + addl 48(%rsp),%edx + movl %r8d,%r15d + vpsrld $3,%xmm4,%xmm7 + xorl %ecx,%r12d + shrdl $11,%r14d,%r14d + xorl %r9d,%r15d + vpslld $14,%xmm4,%xmm5 + addl %r12d,%edx + shrdl $6,%r13d,%r13d + andl %r15d,%edi + vpxor %xmm6,%xmm7,%xmm4 + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + vpshufd $250,%xmm2,%xmm7 + shrdl $2,%r14d,%r14d + addl %edx,%r11d + addl %edi,%edx + vpsrld $11,%xmm6,%xmm6 + movl %r11d,%r13d + addl %edx,%r14d + shrdl $14,%r13d,%r13d + vpxor %xmm5,%xmm4,%xmm4 + movl %r14d,%edx + movl %eax,%r12d + shrdl $9,%r14d,%r14d + vpslld $11,%xmm5,%xmm5 + xorl %r11d,%r13d + xorl %ebx,%r12d + shrdl $5,%r13d,%r13d + vpxor %xmm6,%xmm4,%xmm4 + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + vpsrld $10,%xmm7,%xmm6 + addl 52(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + vpxor %xmm5,%xmm4,%xmm4 + shrdl $11,%r14d,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + vpsrlq $17,%xmm7,%xmm7 + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %edx,%r14d + vpaddd %xmm4,%xmm3,%xmm3 + addl %r13d,%ecx + xorl %r8d,%r15d + shrdl $2,%r14d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + vpsrlq $2,%xmm7,%xmm7 + addl %ecx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ecx + vpxor %xmm7,%xmm6,%xmm6 + movl %r11d,%r12d + shrdl $9,%r14d,%r14d + xorl %r10d,%r13d + vpshufb %xmm8,%xmm6,%xmm6 + xorl %eax,%r12d + shrdl $5,%r13d,%r13d + xorl %ecx,%r14d + vpaddd %xmm6,%xmm3,%xmm3 + andl %r10d,%r12d + xorl %r10d,%r13d + addl 56(%rsp),%ebx + vpshufd $80,%xmm3,%xmm7 + movl %ecx,%r15d + xorl %eax,%r12d + shrdl $11,%r14d,%r14d + vpsrld $10,%xmm7,%xmm6 + xorl %edx,%r15d + addl %r12d,%ebx + shrdl $6,%r13d,%r13d + vpsrlq $17,%xmm7,%xmm7 + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + vpxor %xmm7,%xmm6,%xmm6 + xorl %edx,%edi + shrdl $2,%r14d,%r14d + addl %ebx,%r9d + vpsrlq $2,%xmm7,%xmm7 + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + vpxor %xmm7,%xmm6,%xmm6 + shrdl $14,%r13d,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + vpshufb %xmm9,%xmm6,%xmm6 + shrdl $9,%r14d,%r14d + xorl %r9d,%r13d + xorl 
%r11d,%r12d + vpaddd %xmm6,%xmm3,%xmm3 + shrdl $5,%r13d,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + vpaddd 96(%rbp),%xmm3,%xmm6 + xorl %r9d,%r13d + addl 60(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + shrdl $11,%r14d,%r14d + xorl %ecx,%edi + addl %r12d,%eax + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + shrdl $2,%r14d,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + vmovdqa %xmm6,48(%rsp) + cmpb $0,131(%rbp) + jne L$avx_00_47 + shrdl $14,%r13d,%r13d + movl %r14d,%eax + movl %r9d,%r12d + shrdl $9,%r14d,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + shrdl $5,%r13d,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + xorl %r8d,%r13d + addl 0(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + shrdl $11,%r14d,%r14d + xorl %ebx,%r15d + addl %r12d,%r11d + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + shrdl $2,%r14d,%r14d + addl %r11d,%edx + addl %edi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + shrdl $9,%r14d,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + shrdl $5,%r13d,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + addl 4(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + shrdl $11,%r14d,%r14d + xorl %eax,%edi + addl %r12d,%r10d + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + shrdl $2,%r14d,%r14d + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r10d + movl %edx,%r12d + shrdl $9,%r14d,%r14d + xorl %ecx,%r13d + xorl %r8d,%r12d + shrdl $5,%r13d,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + xorl %ecx,%r13d + addl 8(%rsp),%r9d + movl %r10d,%r15d + xorl %r8d,%r12d + shrdl $11,%r14d,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%edi + shrdl $2,%r14d,%r14d + addl %r9d,%ebx + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + shrdl $9,%r14d,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + shrdl $5,%r13d,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + xorl %ebx,%r13d + addl 12(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + shrdl $11,%r14d,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + shrdl $2,%r14d,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + shrdl $9,%r14d,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + shrdl $5,%r13d,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + xorl %eax,%r13d + addl 16(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + shrdl $11,%r14d,%r14d + xorl %r9d,%r15d + addl %r12d,%edx + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + shrdl $2,%r14d,%r14d + addl %edx,%r11d + addl %edi,%edx + movl %r11d,%r13d + addl %edx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%edx + movl %eax,%r12d + shrdl $9,%r14d,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + shrdl $5,%r13d,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + addl 20(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + shrdl $11,%r14d,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + shrdl $2,%r14d,%r14d + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + shrdl 
$14,%r13d,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + shrdl $9,%r14d,%r14d + xorl %r10d,%r13d + xorl %eax,%r12d + shrdl $5,%r13d,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + xorl %r10d,%r13d + addl 24(%rsp),%ebx + movl %ecx,%r15d + xorl %eax,%r12d + shrdl $11,%r14d,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%edi + shrdl $2,%r14d,%r14d + addl %ebx,%r9d + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + shrdl $9,%r14d,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + shrdl $5,%r13d,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + xorl %r9d,%r13d + addl 28(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + shrdl $11,%r14d,%r14d + xorl %ecx,%edi + addl %r12d,%eax + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + shrdl $2,%r14d,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%eax + movl %r9d,%r12d + shrdl $9,%r14d,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + shrdl $5,%r13d,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + xorl %r8d,%r13d + addl 32(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + shrdl $11,%r14d,%r14d + xorl %ebx,%r15d + addl %r12d,%r11d + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + shrdl $2,%r14d,%r14d + addl %r11d,%edx + addl %edi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + shrdl $9,%r14d,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + shrdl $5,%r13d,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + addl 36(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + shrdl $11,%r14d,%r14d + xorl %eax,%edi + addl %r12d,%r10d + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + shrdl $2,%r14d,%r14d + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r10d + movl %edx,%r12d + shrdl $9,%r14d,%r14d + xorl %ecx,%r13d + xorl %r8d,%r12d + shrdl $5,%r13d,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + xorl %ecx,%r13d + addl 40(%rsp),%r9d + movl %r10d,%r15d + xorl %r8d,%r12d + shrdl $11,%r14d,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%edi + shrdl $2,%r14d,%r14d + addl %r9d,%ebx + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + shrdl $9,%r14d,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + shrdl $5,%r13d,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + xorl %ebx,%r13d + addl 44(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + shrdl $11,%r14d,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + shrdl $2,%r14d,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + shrdl $9,%r14d,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + shrdl $5,%r13d,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + xorl %eax,%r13d + addl 48(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + shrdl $11,%r14d,%r14d + xorl %r9d,%r15d + addl %r12d,%edx + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + shrdl $2,%r14d,%r14d + addl %edx,%r11d + addl %edi,%edx + movl %r11d,%r13d + addl %edx,%r14d + shrdl $14,%r13d,%r13d + 
movl %r14d,%edx + movl %eax,%r12d + shrdl $9,%r14d,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + shrdl $5,%r13d,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + addl 52(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + shrdl $11,%r14d,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + shrdl $2,%r14d,%r14d + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + shrdl $9,%r14d,%r14d + xorl %r10d,%r13d + xorl %eax,%r12d + shrdl $5,%r13d,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + xorl %r10d,%r13d + addl 56(%rsp),%ebx + movl %ecx,%r15d + xorl %eax,%r12d + shrdl $11,%r14d,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%edi + shrdl $2,%r14d,%r14d + addl %ebx,%r9d + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + shrdl $9,%r14d,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + shrdl $5,%r13d,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + xorl %r9d,%r13d + addl 60(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + shrdl $11,%r14d,%r14d + xorl %ecx,%edi + addl %r12d,%eax + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + shrdl $2,%r14d,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + movq 64+0(%rsp),%rdi + movl %r14d,%eax + + addl 0(%rdi),%eax + leaq 64(%rsi),%rsi + addl 4(%rdi),%ebx + addl 8(%rdi),%ecx + addl 12(%rdi),%edx + addl 16(%rdi),%r8d + addl 20(%rdi),%r9d + addl 24(%rdi),%r10d + addl 28(%rdi),%r11d + + cmpq 64+16(%rsp),%rsi + + movl %eax,0(%rdi) + movl %ebx,4(%rdi) + movl %ecx,8(%rdi) + movl %edx,12(%rdi) + movl %r8d,16(%rdi) + movl %r9d,20(%rdi) + movl %r10d,24(%rdi) + movl %r11d,28(%rdi) + jb L$loop_avx + + movq 88(%rsp),%rsi + + vzeroupper + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbp + + movq -8(%rsi),%rbx + + leaq (%rsi),%rsp + +L$epilogue_avx: + ret + + +#endif diff --git a/ring-0.17.14/pregenerated/sha256-x86_64-nasm.asm b/ring-0.17.14/pregenerated/sha256-x86_64-nasm.asm new file mode 100644 index 0000000000..ba0da615be --- /dev/null +++ b/ring-0.17.14/pregenerated/sha256-x86_64-nasm.asm @@ -0,0 +1,4413 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. 
+ +%ifidn __OUTPUT_FORMAT__, win64 +default rel +%define XMMWORD +%define YMMWORD +%define ZMMWORD +%define _CET_ENDBR + +%include "ring_core_generated/prefix_symbols_nasm.inc" +section .text code align=64 + + +global sha256_block_data_order_nohw + +ALIGN 16 +sha256_block_data_order_nohw: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_sha256_block_data_order_nohw: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + +_CET_ENDBR + mov rax,rsp + + push rbx + + push rbp + + push r12 + + push r13 + + push r14 + + push r15 + + shl rdx,4 + sub rsp,16*4+4*8 + lea rdx,[rdx*4+rsi] + and rsp,-64 + mov QWORD[((64+0))+rsp],rdi + mov QWORD[((64+8))+rsp],rsi + mov QWORD[((64+16))+rsp],rdx + mov QWORD[88+rsp],rax + +$L$prologue: + + mov eax,DWORD[rdi] + mov ebx,DWORD[4+rdi] + mov ecx,DWORD[8+rdi] + mov edx,DWORD[12+rdi] + mov r8d,DWORD[16+rdi] + mov r9d,DWORD[20+rdi] + mov r10d,DWORD[24+rdi] + mov r11d,DWORD[28+rdi] + jmp NEAR $L$loop + +ALIGN 16 +$L$loop: + mov edi,ebx + lea rbp,[K256] + xor edi,ecx + mov r12d,DWORD[rsi] + mov r13d,r8d + mov r14d,eax + bswap r12d + ror r13d,14 + mov r15d,r9d + + xor r13d,r8d + ror r14d,9 + xor r15d,r10d + + mov DWORD[rsp],r12d + xor r14d,eax + and r15d,r8d + + ror r13d,5 + add r12d,r11d + xor r15d,r10d + + ror r14d,11 + xor r13d,r8d + add r12d,r15d + + mov r15d,eax + add r12d,DWORD[rbp] + xor r14d,eax + + xor r15d,ebx + ror r13d,6 + mov r11d,ebx + + and edi,r15d + ror r14d,2 + add r12d,r13d + + xor r11d,edi + add edx,r12d + add r11d,r12d + + lea rbp,[4+rbp] + add r11d,r14d + mov r12d,DWORD[4+rsi] + mov r13d,edx + mov r14d,r11d + bswap r12d + ror r13d,14 + mov edi,r8d + + xor r13d,edx + ror r14d,9 + xor edi,r9d + + mov DWORD[4+rsp],r12d + xor r14d,r11d + and edi,edx + + ror r13d,5 + add r12d,r10d + xor edi,r9d + + ror r14d,11 + xor r13d,edx + add r12d,edi + + mov edi,r11d + add r12d,DWORD[rbp] + xor r14d,r11d + + xor edi,eax + ror r13d,6 + mov r10d,eax + + and r15d,edi + ror r14d,2 + add r12d,r13d + + xor r10d,r15d + add ecx,r12d + add r10d,r12d + + lea rbp,[4+rbp] + add r10d,r14d + mov r12d,DWORD[8+rsi] + mov r13d,ecx + mov r14d,r10d + bswap r12d + ror r13d,14 + mov r15d,edx + + xor r13d,ecx + ror r14d,9 + xor r15d,r8d + + mov DWORD[8+rsp],r12d + xor r14d,r10d + and r15d,ecx + + ror r13d,5 + add r12d,r9d + xor r15d,r8d + + ror r14d,11 + xor r13d,ecx + add r12d,r15d + + mov r15d,r10d + add r12d,DWORD[rbp] + xor r14d,r10d + + xor r15d,r11d + ror r13d,6 + mov r9d,r11d + + and edi,r15d + ror r14d,2 + add r12d,r13d + + xor r9d,edi + add ebx,r12d + add r9d,r12d + + lea rbp,[4+rbp] + add r9d,r14d + mov r12d,DWORD[12+rsi] + mov r13d,ebx + mov r14d,r9d + bswap r12d + ror r13d,14 + mov edi,ecx + + xor r13d,ebx + ror r14d,9 + xor edi,edx + + mov DWORD[12+rsp],r12d + xor r14d,r9d + and edi,ebx + + ror r13d,5 + add r12d,r8d + xor edi,edx + + ror r14d,11 + xor r13d,ebx + add r12d,edi + + mov edi,r9d + add r12d,DWORD[rbp] + xor r14d,r9d + + xor edi,r10d + ror r13d,6 + mov r8d,r10d + + and r15d,edi + ror r14d,2 + add r12d,r13d + + xor r8d,r15d + add eax,r12d + add r8d,r12d + + lea rbp,[20+rbp] + add r8d,r14d + mov r12d,DWORD[16+rsi] + mov r13d,eax + mov r14d,r8d + bswap r12d + ror r13d,14 + mov r15d,ebx + + xor r13d,eax + ror r14d,9 + xor r15d,ecx + + mov DWORD[16+rsp],r12d + xor r14d,r8d + and r15d,eax + + ror r13d,5 + add r12d,edx + xor r15d,ecx + + ror r14d,11 + xor r13d,eax + add r12d,r15d + + mov r15d,r8d + add r12d,DWORD[rbp] + xor r14d,r8d + + xor r15d,r9d + ror r13d,6 + mov edx,r9d + + and edi,r15d + ror r14d,2 + add r12d,r13d + + xor 
edx,edi + add r11d,r12d + add edx,r12d + + lea rbp,[4+rbp] + add edx,r14d + mov r12d,DWORD[20+rsi] + mov r13d,r11d + mov r14d,edx + bswap r12d + ror r13d,14 + mov edi,eax + + xor r13d,r11d + ror r14d,9 + xor edi,ebx + + mov DWORD[20+rsp],r12d + xor r14d,edx + and edi,r11d + + ror r13d,5 + add r12d,ecx + xor edi,ebx + + ror r14d,11 + xor r13d,r11d + add r12d,edi + + mov edi,edx + add r12d,DWORD[rbp] + xor r14d,edx + + xor edi,r8d + ror r13d,6 + mov ecx,r8d + + and r15d,edi + ror r14d,2 + add r12d,r13d + + xor ecx,r15d + add r10d,r12d + add ecx,r12d + + lea rbp,[4+rbp] + add ecx,r14d + mov r12d,DWORD[24+rsi] + mov r13d,r10d + mov r14d,ecx + bswap r12d + ror r13d,14 + mov r15d,r11d + + xor r13d,r10d + ror r14d,9 + xor r15d,eax + + mov DWORD[24+rsp],r12d + xor r14d,ecx + and r15d,r10d + + ror r13d,5 + add r12d,ebx + xor r15d,eax + + ror r14d,11 + xor r13d,r10d + add r12d,r15d + + mov r15d,ecx + add r12d,DWORD[rbp] + xor r14d,ecx + + xor r15d,edx + ror r13d,6 + mov ebx,edx + + and edi,r15d + ror r14d,2 + add r12d,r13d + + xor ebx,edi + add r9d,r12d + add ebx,r12d + + lea rbp,[4+rbp] + add ebx,r14d + mov r12d,DWORD[28+rsi] + mov r13d,r9d + mov r14d,ebx + bswap r12d + ror r13d,14 + mov edi,r10d + + xor r13d,r9d + ror r14d,9 + xor edi,r11d + + mov DWORD[28+rsp],r12d + xor r14d,ebx + and edi,r9d + + ror r13d,5 + add r12d,eax + xor edi,r11d + + ror r14d,11 + xor r13d,r9d + add r12d,edi + + mov edi,ebx + add r12d,DWORD[rbp] + xor r14d,ebx + + xor edi,ecx + ror r13d,6 + mov eax,ecx + + and r15d,edi + ror r14d,2 + add r12d,r13d + + xor eax,r15d + add r8d,r12d + add eax,r12d + + lea rbp,[20+rbp] + add eax,r14d + mov r12d,DWORD[32+rsi] + mov r13d,r8d + mov r14d,eax + bswap r12d + ror r13d,14 + mov r15d,r9d + + xor r13d,r8d + ror r14d,9 + xor r15d,r10d + + mov DWORD[32+rsp],r12d + xor r14d,eax + and r15d,r8d + + ror r13d,5 + add r12d,r11d + xor r15d,r10d + + ror r14d,11 + xor r13d,r8d + add r12d,r15d + + mov r15d,eax + add r12d,DWORD[rbp] + xor r14d,eax + + xor r15d,ebx + ror r13d,6 + mov r11d,ebx + + and edi,r15d + ror r14d,2 + add r12d,r13d + + xor r11d,edi + add edx,r12d + add r11d,r12d + + lea rbp,[4+rbp] + add r11d,r14d + mov r12d,DWORD[36+rsi] + mov r13d,edx + mov r14d,r11d + bswap r12d + ror r13d,14 + mov edi,r8d + + xor r13d,edx + ror r14d,9 + xor edi,r9d + + mov DWORD[36+rsp],r12d + xor r14d,r11d + and edi,edx + + ror r13d,5 + add r12d,r10d + xor edi,r9d + + ror r14d,11 + xor r13d,edx + add r12d,edi + + mov edi,r11d + add r12d,DWORD[rbp] + xor r14d,r11d + + xor edi,eax + ror r13d,6 + mov r10d,eax + + and r15d,edi + ror r14d,2 + add r12d,r13d + + xor r10d,r15d + add ecx,r12d + add r10d,r12d + + lea rbp,[4+rbp] + add r10d,r14d + mov r12d,DWORD[40+rsi] + mov r13d,ecx + mov r14d,r10d + bswap r12d + ror r13d,14 + mov r15d,edx + + xor r13d,ecx + ror r14d,9 + xor r15d,r8d + + mov DWORD[40+rsp],r12d + xor r14d,r10d + and r15d,ecx + + ror r13d,5 + add r12d,r9d + xor r15d,r8d + + ror r14d,11 + xor r13d,ecx + add r12d,r15d + + mov r15d,r10d + add r12d,DWORD[rbp] + xor r14d,r10d + + xor r15d,r11d + ror r13d,6 + mov r9d,r11d + + and edi,r15d + ror r14d,2 + add r12d,r13d + + xor r9d,edi + add ebx,r12d + add r9d,r12d + + lea rbp,[4+rbp] + add r9d,r14d + mov r12d,DWORD[44+rsi] + mov r13d,ebx + mov r14d,r9d + bswap r12d + ror r13d,14 + mov edi,ecx + + xor r13d,ebx + ror r14d,9 + xor edi,edx + + mov DWORD[44+rsp],r12d + xor r14d,r9d + and edi,ebx + + ror r13d,5 + add r12d,r8d + xor edi,edx + + ror r14d,11 + xor r13d,ebx + add r12d,edi + + mov edi,r9d + add r12d,DWORD[rbp] + xor r14d,r9d + + xor edi,r10d + ror 
r13d,6 + mov r8d,r10d + + and r15d,edi + ror r14d,2 + add r12d,r13d + + xor r8d,r15d + add eax,r12d + add r8d,r12d + + lea rbp,[20+rbp] + add r8d,r14d + mov r12d,DWORD[48+rsi] + mov r13d,eax + mov r14d,r8d + bswap r12d + ror r13d,14 + mov r15d,ebx + + xor r13d,eax + ror r14d,9 + xor r15d,ecx + + mov DWORD[48+rsp],r12d + xor r14d,r8d + and r15d,eax + + ror r13d,5 + add r12d,edx + xor r15d,ecx + + ror r14d,11 + xor r13d,eax + add r12d,r15d + + mov r15d,r8d + add r12d,DWORD[rbp] + xor r14d,r8d + + xor r15d,r9d + ror r13d,6 + mov edx,r9d + + and edi,r15d + ror r14d,2 + add r12d,r13d + + xor edx,edi + add r11d,r12d + add edx,r12d + + lea rbp,[4+rbp] + add edx,r14d + mov r12d,DWORD[52+rsi] + mov r13d,r11d + mov r14d,edx + bswap r12d + ror r13d,14 + mov edi,eax + + xor r13d,r11d + ror r14d,9 + xor edi,ebx + + mov DWORD[52+rsp],r12d + xor r14d,edx + and edi,r11d + + ror r13d,5 + add r12d,ecx + xor edi,ebx + + ror r14d,11 + xor r13d,r11d + add r12d,edi + + mov edi,edx + add r12d,DWORD[rbp] + xor r14d,edx + + xor edi,r8d + ror r13d,6 + mov ecx,r8d + + and r15d,edi + ror r14d,2 + add r12d,r13d + + xor ecx,r15d + add r10d,r12d + add ecx,r12d + + lea rbp,[4+rbp] + add ecx,r14d + mov r12d,DWORD[56+rsi] + mov r13d,r10d + mov r14d,ecx + bswap r12d + ror r13d,14 + mov r15d,r11d + + xor r13d,r10d + ror r14d,9 + xor r15d,eax + + mov DWORD[56+rsp],r12d + xor r14d,ecx + and r15d,r10d + + ror r13d,5 + add r12d,ebx + xor r15d,eax + + ror r14d,11 + xor r13d,r10d + add r12d,r15d + + mov r15d,ecx + add r12d,DWORD[rbp] + xor r14d,ecx + + xor r15d,edx + ror r13d,6 + mov ebx,edx + + and edi,r15d + ror r14d,2 + add r12d,r13d + + xor ebx,edi + add r9d,r12d + add ebx,r12d + + lea rbp,[4+rbp] + add ebx,r14d + mov r12d,DWORD[60+rsi] + mov r13d,r9d + mov r14d,ebx + bswap r12d + ror r13d,14 + mov edi,r10d + + xor r13d,r9d + ror r14d,9 + xor edi,r11d + + mov DWORD[60+rsp],r12d + xor r14d,ebx + and edi,r9d + + ror r13d,5 + add r12d,eax + xor edi,r11d + + ror r14d,11 + xor r13d,r9d + add r12d,edi + + mov edi,ebx + add r12d,DWORD[rbp] + xor r14d,ebx + + xor edi,ecx + ror r13d,6 + mov eax,ecx + + and r15d,edi + ror r14d,2 + add r12d,r13d + + xor eax,r15d + add r8d,r12d + add eax,r12d + + lea rbp,[20+rbp] + jmp NEAR $L$rounds_16_xx +ALIGN 16 +$L$rounds_16_xx: + mov r13d,DWORD[4+rsp] + mov r15d,DWORD[56+rsp] + + mov r12d,r13d + ror r13d,11 + add eax,r14d + mov r14d,r15d + ror r15d,2 + + xor r13d,r12d + shr r12d,3 + ror r13d,7 + xor r15d,r14d + shr r14d,10 + + ror r15d,17 + xor r12d,r13d + xor r15d,r14d + add r12d,DWORD[36+rsp] + + add r12d,DWORD[rsp] + mov r13d,r8d + add r12d,r15d + mov r14d,eax + ror r13d,14 + mov r15d,r9d + + xor r13d,r8d + ror r14d,9 + xor r15d,r10d + + mov DWORD[rsp],r12d + xor r14d,eax + and r15d,r8d + + ror r13d,5 + add r12d,r11d + xor r15d,r10d + + ror r14d,11 + xor r13d,r8d + add r12d,r15d + + mov r15d,eax + add r12d,DWORD[rbp] + xor r14d,eax + + xor r15d,ebx + ror r13d,6 + mov r11d,ebx + + and edi,r15d + ror r14d,2 + add r12d,r13d + + xor r11d,edi + add edx,r12d + add r11d,r12d + + lea rbp,[4+rbp] + mov r13d,DWORD[8+rsp] + mov edi,DWORD[60+rsp] + + mov r12d,r13d + ror r13d,11 + add r11d,r14d + mov r14d,edi + ror edi,2 + + xor r13d,r12d + shr r12d,3 + ror r13d,7 + xor edi,r14d + shr r14d,10 + + ror edi,17 + xor r12d,r13d + xor edi,r14d + add r12d,DWORD[40+rsp] + + add r12d,DWORD[4+rsp] + mov r13d,edx + add r12d,edi + mov r14d,r11d + ror r13d,14 + mov edi,r8d + + xor r13d,edx + ror r14d,9 + xor edi,r9d + + mov DWORD[4+rsp],r12d + xor r14d,r11d + and edi,edx + + ror r13d,5 + add r12d,r10d + xor edi,r9d + + 
ror r14d,11 + xor r13d,edx + add r12d,edi + + mov edi,r11d + add r12d,DWORD[rbp] + xor r14d,r11d + + xor edi,eax + ror r13d,6 + mov r10d,eax + + and r15d,edi + ror r14d,2 + add r12d,r13d + + xor r10d,r15d + add ecx,r12d + add r10d,r12d + + lea rbp,[4+rbp] + mov r13d,DWORD[12+rsp] + mov r15d,DWORD[rsp] + + mov r12d,r13d + ror r13d,11 + add r10d,r14d + mov r14d,r15d + ror r15d,2 + + xor r13d,r12d + shr r12d,3 + ror r13d,7 + xor r15d,r14d + shr r14d,10 + + ror r15d,17 + xor r12d,r13d + xor r15d,r14d + add r12d,DWORD[44+rsp] + + add r12d,DWORD[8+rsp] + mov r13d,ecx + add r12d,r15d + mov r14d,r10d + ror r13d,14 + mov r15d,edx + + xor r13d,ecx + ror r14d,9 + xor r15d,r8d + + mov DWORD[8+rsp],r12d + xor r14d,r10d + and r15d,ecx + + ror r13d,5 + add r12d,r9d + xor r15d,r8d + + ror r14d,11 + xor r13d,ecx + add r12d,r15d + + mov r15d,r10d + add r12d,DWORD[rbp] + xor r14d,r10d + + xor r15d,r11d + ror r13d,6 + mov r9d,r11d + + and edi,r15d + ror r14d,2 + add r12d,r13d + + xor r9d,edi + add ebx,r12d + add r9d,r12d + + lea rbp,[4+rbp] + mov r13d,DWORD[16+rsp] + mov edi,DWORD[4+rsp] + + mov r12d,r13d + ror r13d,11 + add r9d,r14d + mov r14d,edi + ror edi,2 + + xor r13d,r12d + shr r12d,3 + ror r13d,7 + xor edi,r14d + shr r14d,10 + + ror edi,17 + xor r12d,r13d + xor edi,r14d + add r12d,DWORD[48+rsp] + + add r12d,DWORD[12+rsp] + mov r13d,ebx + add r12d,edi + mov r14d,r9d + ror r13d,14 + mov edi,ecx + + xor r13d,ebx + ror r14d,9 + xor edi,edx + + mov DWORD[12+rsp],r12d + xor r14d,r9d + and edi,ebx + + ror r13d,5 + add r12d,r8d + xor edi,edx + + ror r14d,11 + xor r13d,ebx + add r12d,edi + + mov edi,r9d + add r12d,DWORD[rbp] + xor r14d,r9d + + xor edi,r10d + ror r13d,6 + mov r8d,r10d + + and r15d,edi + ror r14d,2 + add r12d,r13d + + xor r8d,r15d + add eax,r12d + add r8d,r12d + + lea rbp,[20+rbp] + mov r13d,DWORD[20+rsp] + mov r15d,DWORD[8+rsp] + + mov r12d,r13d + ror r13d,11 + add r8d,r14d + mov r14d,r15d + ror r15d,2 + + xor r13d,r12d + shr r12d,3 + ror r13d,7 + xor r15d,r14d + shr r14d,10 + + ror r15d,17 + xor r12d,r13d + xor r15d,r14d + add r12d,DWORD[52+rsp] + + add r12d,DWORD[16+rsp] + mov r13d,eax + add r12d,r15d + mov r14d,r8d + ror r13d,14 + mov r15d,ebx + + xor r13d,eax + ror r14d,9 + xor r15d,ecx + + mov DWORD[16+rsp],r12d + xor r14d,r8d + and r15d,eax + + ror r13d,5 + add r12d,edx + xor r15d,ecx + + ror r14d,11 + xor r13d,eax + add r12d,r15d + + mov r15d,r8d + add r12d,DWORD[rbp] + xor r14d,r8d + + xor r15d,r9d + ror r13d,6 + mov edx,r9d + + and edi,r15d + ror r14d,2 + add r12d,r13d + + xor edx,edi + add r11d,r12d + add edx,r12d + + lea rbp,[4+rbp] + mov r13d,DWORD[24+rsp] + mov edi,DWORD[12+rsp] + + mov r12d,r13d + ror r13d,11 + add edx,r14d + mov r14d,edi + ror edi,2 + + xor r13d,r12d + shr r12d,3 + ror r13d,7 + xor edi,r14d + shr r14d,10 + + ror edi,17 + xor r12d,r13d + xor edi,r14d + add r12d,DWORD[56+rsp] + + add r12d,DWORD[20+rsp] + mov r13d,r11d + add r12d,edi + mov r14d,edx + ror r13d,14 + mov edi,eax + + xor r13d,r11d + ror r14d,9 + xor edi,ebx + + mov DWORD[20+rsp],r12d + xor r14d,edx + and edi,r11d + + ror r13d,5 + add r12d,ecx + xor edi,ebx + + ror r14d,11 + xor r13d,r11d + add r12d,edi + + mov edi,edx + add r12d,DWORD[rbp] + xor r14d,edx + + xor edi,r8d + ror r13d,6 + mov ecx,r8d + + and r15d,edi + ror r14d,2 + add r12d,r13d + + xor ecx,r15d + add r10d,r12d + add ecx,r12d + + lea rbp,[4+rbp] + mov r13d,DWORD[28+rsp] + mov r15d,DWORD[16+rsp] + + mov r12d,r13d + ror r13d,11 + add ecx,r14d + mov r14d,r15d + ror r15d,2 + + xor r13d,r12d + shr r12d,3 + ror r13d,7 + xor r15d,r14d + shr r14d,10 
+ + ror r15d,17 + xor r12d,r13d + xor r15d,r14d + add r12d,DWORD[60+rsp] + + add r12d,DWORD[24+rsp] + mov r13d,r10d + add r12d,r15d + mov r14d,ecx + ror r13d,14 + mov r15d,r11d + + xor r13d,r10d + ror r14d,9 + xor r15d,eax + + mov DWORD[24+rsp],r12d + xor r14d,ecx + and r15d,r10d + + ror r13d,5 + add r12d,ebx + xor r15d,eax + + ror r14d,11 + xor r13d,r10d + add r12d,r15d + + mov r15d,ecx + add r12d,DWORD[rbp] + xor r14d,ecx + + xor r15d,edx + ror r13d,6 + mov ebx,edx + + and edi,r15d + ror r14d,2 + add r12d,r13d + + xor ebx,edi + add r9d,r12d + add ebx,r12d + + lea rbp,[4+rbp] + mov r13d,DWORD[32+rsp] + mov edi,DWORD[20+rsp] + + mov r12d,r13d + ror r13d,11 + add ebx,r14d + mov r14d,edi + ror edi,2 + + xor r13d,r12d + shr r12d,3 + ror r13d,7 + xor edi,r14d + shr r14d,10 + + ror edi,17 + xor r12d,r13d + xor edi,r14d + add r12d,DWORD[rsp] + + add r12d,DWORD[28+rsp] + mov r13d,r9d + add r12d,edi + mov r14d,ebx + ror r13d,14 + mov edi,r10d + + xor r13d,r9d + ror r14d,9 + xor edi,r11d + + mov DWORD[28+rsp],r12d + xor r14d,ebx + and edi,r9d + + ror r13d,5 + add r12d,eax + xor edi,r11d + + ror r14d,11 + xor r13d,r9d + add r12d,edi + + mov edi,ebx + add r12d,DWORD[rbp] + xor r14d,ebx + + xor edi,ecx + ror r13d,6 + mov eax,ecx + + and r15d,edi + ror r14d,2 + add r12d,r13d + + xor eax,r15d + add r8d,r12d + add eax,r12d + + lea rbp,[20+rbp] + mov r13d,DWORD[36+rsp] + mov r15d,DWORD[24+rsp] + + mov r12d,r13d + ror r13d,11 + add eax,r14d + mov r14d,r15d + ror r15d,2 + + xor r13d,r12d + shr r12d,3 + ror r13d,7 + xor r15d,r14d + shr r14d,10 + + ror r15d,17 + xor r12d,r13d + xor r15d,r14d + add r12d,DWORD[4+rsp] + + add r12d,DWORD[32+rsp] + mov r13d,r8d + add r12d,r15d + mov r14d,eax + ror r13d,14 + mov r15d,r9d + + xor r13d,r8d + ror r14d,9 + xor r15d,r10d + + mov DWORD[32+rsp],r12d + xor r14d,eax + and r15d,r8d + + ror r13d,5 + add r12d,r11d + xor r15d,r10d + + ror r14d,11 + xor r13d,r8d + add r12d,r15d + + mov r15d,eax + add r12d,DWORD[rbp] + xor r14d,eax + + xor r15d,ebx + ror r13d,6 + mov r11d,ebx + + and edi,r15d + ror r14d,2 + add r12d,r13d + + xor r11d,edi + add edx,r12d + add r11d,r12d + + lea rbp,[4+rbp] + mov r13d,DWORD[40+rsp] + mov edi,DWORD[28+rsp] + + mov r12d,r13d + ror r13d,11 + add r11d,r14d + mov r14d,edi + ror edi,2 + + xor r13d,r12d + shr r12d,3 + ror r13d,7 + xor edi,r14d + shr r14d,10 + + ror edi,17 + xor r12d,r13d + xor edi,r14d + add r12d,DWORD[8+rsp] + + add r12d,DWORD[36+rsp] + mov r13d,edx + add r12d,edi + mov r14d,r11d + ror r13d,14 + mov edi,r8d + + xor r13d,edx + ror r14d,9 + xor edi,r9d + + mov DWORD[36+rsp],r12d + xor r14d,r11d + and edi,edx + + ror r13d,5 + add r12d,r10d + xor edi,r9d + + ror r14d,11 + xor r13d,edx + add r12d,edi + + mov edi,r11d + add r12d,DWORD[rbp] + xor r14d,r11d + + xor edi,eax + ror r13d,6 + mov r10d,eax + + and r15d,edi + ror r14d,2 + add r12d,r13d + + xor r10d,r15d + add ecx,r12d + add r10d,r12d + + lea rbp,[4+rbp] + mov r13d,DWORD[44+rsp] + mov r15d,DWORD[32+rsp] + + mov r12d,r13d + ror r13d,11 + add r10d,r14d + mov r14d,r15d + ror r15d,2 + + xor r13d,r12d + shr r12d,3 + ror r13d,7 + xor r15d,r14d + shr r14d,10 + + ror r15d,17 + xor r12d,r13d + xor r15d,r14d + add r12d,DWORD[12+rsp] + + add r12d,DWORD[40+rsp] + mov r13d,ecx + add r12d,r15d + mov r14d,r10d + ror r13d,14 + mov r15d,edx + + xor r13d,ecx + ror r14d,9 + xor r15d,r8d + + mov DWORD[40+rsp],r12d + xor r14d,r10d + and r15d,ecx + + ror r13d,5 + add r12d,r9d + xor r15d,r8d + + ror r14d,11 + xor r13d,ecx + add r12d,r15d + + mov r15d,r10d + add r12d,DWORD[rbp] + xor r14d,r10d + + xor r15d,r11d 
+ ror r13d,6 + mov r9d,r11d + + and edi,r15d + ror r14d,2 + add r12d,r13d + + xor r9d,edi + add ebx,r12d + add r9d,r12d + + lea rbp,[4+rbp] + mov r13d,DWORD[48+rsp] + mov edi,DWORD[36+rsp] + + mov r12d,r13d + ror r13d,11 + add r9d,r14d + mov r14d,edi + ror edi,2 + + xor r13d,r12d + shr r12d,3 + ror r13d,7 + xor edi,r14d + shr r14d,10 + + ror edi,17 + xor r12d,r13d + xor edi,r14d + add r12d,DWORD[16+rsp] + + add r12d,DWORD[44+rsp] + mov r13d,ebx + add r12d,edi + mov r14d,r9d + ror r13d,14 + mov edi,ecx + + xor r13d,ebx + ror r14d,9 + xor edi,edx + + mov DWORD[44+rsp],r12d + xor r14d,r9d + and edi,ebx + + ror r13d,5 + add r12d,r8d + xor edi,edx + + ror r14d,11 + xor r13d,ebx + add r12d,edi + + mov edi,r9d + add r12d,DWORD[rbp] + xor r14d,r9d + + xor edi,r10d + ror r13d,6 + mov r8d,r10d + + and r15d,edi + ror r14d,2 + add r12d,r13d + + xor r8d,r15d + add eax,r12d + add r8d,r12d + + lea rbp,[20+rbp] + mov r13d,DWORD[52+rsp] + mov r15d,DWORD[40+rsp] + + mov r12d,r13d + ror r13d,11 + add r8d,r14d + mov r14d,r15d + ror r15d,2 + + xor r13d,r12d + shr r12d,3 + ror r13d,7 + xor r15d,r14d + shr r14d,10 + + ror r15d,17 + xor r12d,r13d + xor r15d,r14d + add r12d,DWORD[20+rsp] + + add r12d,DWORD[48+rsp] + mov r13d,eax + add r12d,r15d + mov r14d,r8d + ror r13d,14 + mov r15d,ebx + + xor r13d,eax + ror r14d,9 + xor r15d,ecx + + mov DWORD[48+rsp],r12d + xor r14d,r8d + and r15d,eax + + ror r13d,5 + add r12d,edx + xor r15d,ecx + + ror r14d,11 + xor r13d,eax + add r12d,r15d + + mov r15d,r8d + add r12d,DWORD[rbp] + xor r14d,r8d + + xor r15d,r9d + ror r13d,6 + mov edx,r9d + + and edi,r15d + ror r14d,2 + add r12d,r13d + + xor edx,edi + add r11d,r12d + add edx,r12d + + lea rbp,[4+rbp] + mov r13d,DWORD[56+rsp] + mov edi,DWORD[44+rsp] + + mov r12d,r13d + ror r13d,11 + add edx,r14d + mov r14d,edi + ror edi,2 + + xor r13d,r12d + shr r12d,3 + ror r13d,7 + xor edi,r14d + shr r14d,10 + + ror edi,17 + xor r12d,r13d + xor edi,r14d + add r12d,DWORD[24+rsp] + + add r12d,DWORD[52+rsp] + mov r13d,r11d + add r12d,edi + mov r14d,edx + ror r13d,14 + mov edi,eax + + xor r13d,r11d + ror r14d,9 + xor edi,ebx + + mov DWORD[52+rsp],r12d + xor r14d,edx + and edi,r11d + + ror r13d,5 + add r12d,ecx + xor edi,ebx + + ror r14d,11 + xor r13d,r11d + add r12d,edi + + mov edi,edx + add r12d,DWORD[rbp] + xor r14d,edx + + xor edi,r8d + ror r13d,6 + mov ecx,r8d + + and r15d,edi + ror r14d,2 + add r12d,r13d + + xor ecx,r15d + add r10d,r12d + add ecx,r12d + + lea rbp,[4+rbp] + mov r13d,DWORD[60+rsp] + mov r15d,DWORD[48+rsp] + + mov r12d,r13d + ror r13d,11 + add ecx,r14d + mov r14d,r15d + ror r15d,2 + + xor r13d,r12d + shr r12d,3 + ror r13d,7 + xor r15d,r14d + shr r14d,10 + + ror r15d,17 + xor r12d,r13d + xor r15d,r14d + add r12d,DWORD[28+rsp] + + add r12d,DWORD[56+rsp] + mov r13d,r10d + add r12d,r15d + mov r14d,ecx + ror r13d,14 + mov r15d,r11d + + xor r13d,r10d + ror r14d,9 + xor r15d,eax + + mov DWORD[56+rsp],r12d + xor r14d,ecx + and r15d,r10d + + ror r13d,5 + add r12d,ebx + xor r15d,eax + + ror r14d,11 + xor r13d,r10d + add r12d,r15d + + mov r15d,ecx + add r12d,DWORD[rbp] + xor r14d,ecx + + xor r15d,edx + ror r13d,6 + mov ebx,edx + + and edi,r15d + ror r14d,2 + add r12d,r13d + + xor ebx,edi + add r9d,r12d + add ebx,r12d + + lea rbp,[4+rbp] + mov r13d,DWORD[rsp] + mov edi,DWORD[52+rsp] + + mov r12d,r13d + ror r13d,11 + add ebx,r14d + mov r14d,edi + ror edi,2 + + xor r13d,r12d + shr r12d,3 + ror r13d,7 + xor edi,r14d + shr r14d,10 + + ror edi,17 + xor r12d,r13d + xor edi,r14d + add r12d,DWORD[32+rsp] + + add r12d,DWORD[60+rsp] + mov r13d,r9d + 
add r12d,edi + mov r14d,ebx + ror r13d,14 + mov edi,r10d + + xor r13d,r9d + ror r14d,9 + xor edi,r11d + + mov DWORD[60+rsp],r12d + xor r14d,ebx + and edi,r9d + + ror r13d,5 + add r12d,eax + xor edi,r11d + + ror r14d,11 + xor r13d,r9d + add r12d,edi + + mov edi,ebx + add r12d,DWORD[rbp] + xor r14d,ebx + + xor edi,ecx + ror r13d,6 + mov eax,ecx + + and r15d,edi + ror r14d,2 + add r12d,r13d + + xor eax,r15d + add r8d,r12d + add eax,r12d + + lea rbp,[20+rbp] + cmp BYTE[3+rbp],0 + jnz NEAR $L$rounds_16_xx + + mov rdi,QWORD[((64+0))+rsp] + add eax,r14d + lea rsi,[64+rsi] + + add eax,DWORD[rdi] + add ebx,DWORD[4+rdi] + add ecx,DWORD[8+rdi] + add edx,DWORD[12+rdi] + add r8d,DWORD[16+rdi] + add r9d,DWORD[20+rdi] + add r10d,DWORD[24+rdi] + add r11d,DWORD[28+rdi] + + cmp rsi,QWORD[((64+16))+rsp] + + mov DWORD[rdi],eax + mov DWORD[4+rdi],ebx + mov DWORD[8+rdi],ecx + mov DWORD[12+rdi],edx + mov DWORD[16+rdi],r8d + mov DWORD[20+rdi],r9d + mov DWORD[24+rdi],r10d + mov DWORD[28+rdi],r11d + jb NEAR $L$loop + + mov rsi,QWORD[88+rsp] + + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbp,QWORD[((-16))+rsi] + + mov rbx,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_sha256_block_data_order_nohw: +section .rdata rdata align=8 +ALIGN 64 + +K256: + DD 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + DD 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + DD 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + DD 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + DD 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + DD 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + DD 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + DD 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + DD 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + DD 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + DD 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + DD 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + DD 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + DD 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + DD 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + DD 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + DD 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + DD 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + DD 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + DD 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + DD 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + DD 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + DD 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + DD 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + DD 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + DD 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + DD 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + DD 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + DD 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + DD 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + DD 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + DD 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + + DD 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f + DD 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f + DD 0x03020100,0x0b0a0908,0xffffffff,0xffffffff + DD 0x03020100,0x0b0a0908,0xffffffff,0xffffffff + DD 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 + DD 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 + DB 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97 + DB 110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54 + DB 52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121 + DB 32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46 + DB 111,114,103,62,0 +section .text + 
+global sha256_block_data_order_hw + +ALIGN 64 +sha256_block_data_order_hw: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_sha256_block_data_order_hw: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + +_CET_ENDBR + lea rsp,[((-88))+rsp] + movaps XMMWORD[(-8-80)+rax],xmm6 + movaps XMMWORD[(-8-64)+rax],xmm7 + movaps XMMWORD[(-8-48)+rax],xmm8 + movaps XMMWORD[(-8-32)+rax],xmm9 + movaps XMMWORD[(-8-16)+rax],xmm10 +$L$prologue_shaext: + lea rcx,[((K256+128))] + movdqu xmm1,XMMWORD[rdi] + movdqu xmm2,XMMWORD[16+rdi] + movdqa xmm7,XMMWORD[((512-128))+rcx] + + pshufd xmm0,xmm1,0x1b + pshufd xmm1,xmm1,0xb1 + pshufd xmm2,xmm2,0x1b + movdqa xmm8,xmm7 +DB 102,15,58,15,202,8 + punpcklqdq xmm2,xmm0 + jmp NEAR $L$oop_shaext + +ALIGN 16 +$L$oop_shaext: + movdqu xmm3,XMMWORD[rsi] + movdqu xmm4,XMMWORD[16+rsi] + movdqu xmm5,XMMWORD[32+rsi] +DB 102,15,56,0,223 + movdqu xmm6,XMMWORD[48+rsi] + + movdqa xmm0,XMMWORD[((0-128))+rcx] + paddd xmm0,xmm3 +DB 102,15,56,0,231 + movdqa xmm10,xmm2 + DB 15,56,203,209 + pshufd xmm0,xmm0,0x0e + nop + movdqa xmm9,xmm1 + DB 15,56,203,202 + + movdqa xmm0,XMMWORD[((32-128))+rcx] + paddd xmm0,xmm4 +DB 102,15,56,0,239 + DB 15,56,203,209 + pshufd xmm0,xmm0,0x0e + lea rsi,[64+rsi] + DB 15,56,204,220 + DB 15,56,203,202 + + movdqa xmm0,XMMWORD[((64-128))+rcx] + paddd xmm0,xmm5 +DB 102,15,56,0,247 + DB 15,56,203,209 + pshufd xmm0,xmm0,0x0e + movdqa xmm7,xmm6 +DB 102,15,58,15,253,4 + nop + paddd xmm3,xmm7 + DB 15,56,204,229 + DB 15,56,203,202 + + movdqa xmm0,XMMWORD[((96-128))+rcx] + paddd xmm0,xmm6 + DB 15,56,205,222 + DB 15,56,203,209 + pshufd xmm0,xmm0,0x0e + movdqa xmm7,xmm3 +DB 102,15,58,15,254,4 + nop + paddd xmm4,xmm7 + DB 15,56,204,238 + DB 15,56,203,202 + movdqa xmm0,XMMWORD[((128-128))+rcx] + paddd xmm0,xmm3 + DB 15,56,205,227 + DB 15,56,203,209 + pshufd xmm0,xmm0,0x0e + movdqa xmm7,xmm4 +DB 102,15,58,15,251,4 + nop + paddd xmm5,xmm7 + DB 15,56,204,243 + DB 15,56,203,202 + movdqa xmm0,XMMWORD[((160-128))+rcx] + paddd xmm0,xmm4 + DB 15,56,205,236 + DB 15,56,203,209 + pshufd xmm0,xmm0,0x0e + movdqa xmm7,xmm5 +DB 102,15,58,15,252,4 + nop + paddd xmm6,xmm7 + DB 15,56,204,220 + DB 15,56,203,202 + movdqa xmm0,XMMWORD[((192-128))+rcx] + paddd xmm0,xmm5 + DB 15,56,205,245 + DB 15,56,203,209 + pshufd xmm0,xmm0,0x0e + movdqa xmm7,xmm6 +DB 102,15,58,15,253,4 + nop + paddd xmm3,xmm7 + DB 15,56,204,229 + DB 15,56,203,202 + movdqa xmm0,XMMWORD[((224-128))+rcx] + paddd xmm0,xmm6 + DB 15,56,205,222 + DB 15,56,203,209 + pshufd xmm0,xmm0,0x0e + movdqa xmm7,xmm3 +DB 102,15,58,15,254,4 + nop + paddd xmm4,xmm7 + DB 15,56,204,238 + DB 15,56,203,202 + movdqa xmm0,XMMWORD[((256-128))+rcx] + paddd xmm0,xmm3 + DB 15,56,205,227 + DB 15,56,203,209 + pshufd xmm0,xmm0,0x0e + movdqa xmm7,xmm4 +DB 102,15,58,15,251,4 + nop + paddd xmm5,xmm7 + DB 15,56,204,243 + DB 15,56,203,202 + movdqa xmm0,XMMWORD[((288-128))+rcx] + paddd xmm0,xmm4 + DB 15,56,205,236 + DB 15,56,203,209 + pshufd xmm0,xmm0,0x0e + movdqa xmm7,xmm5 +DB 102,15,58,15,252,4 + nop + paddd xmm6,xmm7 + DB 15,56,204,220 + DB 15,56,203,202 + movdqa xmm0,XMMWORD[((320-128))+rcx] + paddd xmm0,xmm5 + DB 15,56,205,245 + DB 15,56,203,209 + pshufd xmm0,xmm0,0x0e + movdqa xmm7,xmm6 +DB 102,15,58,15,253,4 + nop + paddd xmm3,xmm7 + DB 15,56,204,229 + DB 15,56,203,202 + movdqa xmm0,XMMWORD[((352-128))+rcx] + paddd xmm0,xmm6 + DB 15,56,205,222 + DB 15,56,203,209 + pshufd xmm0,xmm0,0x0e + movdqa xmm7,xmm3 +DB 102,15,58,15,254,4 + nop + paddd xmm4,xmm7 + DB 15,56,204,238 + DB 15,56,203,202 + movdqa 
xmm0,XMMWORD[((384-128))+rcx] + paddd xmm0,xmm3 + DB 15,56,205,227 + DB 15,56,203,209 + pshufd xmm0,xmm0,0x0e + movdqa xmm7,xmm4 +DB 102,15,58,15,251,4 + nop + paddd xmm5,xmm7 + DB 15,56,204,243 + DB 15,56,203,202 + movdqa xmm0,XMMWORD[((416-128))+rcx] + paddd xmm0,xmm4 + DB 15,56,205,236 + DB 15,56,203,209 + pshufd xmm0,xmm0,0x0e + movdqa xmm7,xmm5 +DB 102,15,58,15,252,4 + DB 15,56,203,202 + paddd xmm6,xmm7 + + movdqa xmm0,XMMWORD[((448-128))+rcx] + paddd xmm0,xmm5 + DB 15,56,203,209 + pshufd xmm0,xmm0,0x0e + DB 15,56,205,245 + movdqa xmm7,xmm8 + DB 15,56,203,202 + + movdqa xmm0,XMMWORD[((480-128))+rcx] + paddd xmm0,xmm6 + nop + DB 15,56,203,209 + pshufd xmm0,xmm0,0x0e + dec rdx + nop + DB 15,56,203,202 + + paddd xmm2,xmm10 + paddd xmm1,xmm9 + jnz NEAR $L$oop_shaext + + pshufd xmm2,xmm2,0xb1 + pshufd xmm7,xmm1,0x1b + pshufd xmm1,xmm1,0xb1 + punpckhqdq xmm1,xmm2 +DB 102,15,58,15,215,8 + + movdqu XMMWORD[rdi],xmm1 + movdqu XMMWORD[16+rdi],xmm2 + movaps xmm6,XMMWORD[((-8-80))+rax] + movaps xmm7,XMMWORD[((-8-64))+rax] + movaps xmm8,XMMWORD[((-8-48))+rax] + movaps xmm9,XMMWORD[((-8-32))+rax] + movaps xmm10,XMMWORD[((-8-16))+rax] + mov rsp,rax +$L$epilogue_shaext: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_sha256_block_data_order_hw: +global sha256_block_data_order_ssse3 + +ALIGN 64 +sha256_block_data_order_ssse3: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_sha256_block_data_order_ssse3: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + +_CET_ENDBR + mov rax,rsp + + push rbx + + push rbp + + push r12 + + push r13 + + push r14 + + push r15 + + shl rdx,4 + sub rsp,160 + lea rdx,[rdx*4+rsi] + and rsp,-64 + mov QWORD[((64+0))+rsp],rdi + mov QWORD[((64+8))+rsp],rsi + mov QWORD[((64+16))+rsp],rdx + mov QWORD[88+rsp],rax + + movaps XMMWORD[(64+32)+rsp],xmm6 + movaps XMMWORD[(64+48)+rsp],xmm7 + movaps XMMWORD[(64+64)+rsp],xmm8 + movaps XMMWORD[(64+80)+rsp],xmm9 +$L$prologue_ssse3: + + mov eax,DWORD[rdi] + mov ebx,DWORD[4+rdi] + mov ecx,DWORD[8+rdi] + mov edx,DWORD[12+rdi] + mov r8d,DWORD[16+rdi] + mov r9d,DWORD[20+rdi] + mov r10d,DWORD[24+rdi] + mov r11d,DWORD[28+rdi] + + + jmp NEAR $L$loop_ssse3 +ALIGN 16 +$L$loop_ssse3: + movdqa xmm7,XMMWORD[((K256+512))] + movdqu xmm0,XMMWORD[rsi] + movdqu xmm1,XMMWORD[16+rsi] + movdqu xmm2,XMMWORD[32+rsi] +DB 102,15,56,0,199 + movdqu xmm3,XMMWORD[48+rsi] + lea rbp,[K256] +DB 102,15,56,0,207 + movdqa xmm4,XMMWORD[rbp] + movdqa xmm5,XMMWORD[32+rbp] +DB 102,15,56,0,215 + paddd xmm4,xmm0 + movdqa xmm6,XMMWORD[64+rbp] +DB 102,15,56,0,223 + movdqa xmm7,XMMWORD[96+rbp] + paddd xmm5,xmm1 + paddd xmm6,xmm2 + paddd xmm7,xmm3 + movdqa XMMWORD[rsp],xmm4 + mov r14d,eax + movdqa XMMWORD[16+rsp],xmm5 + mov edi,ebx + movdqa XMMWORD[32+rsp],xmm6 + xor edi,ecx + movdqa XMMWORD[48+rsp],xmm7 + mov r13d,r8d + jmp NEAR $L$ssse3_00_47 + +ALIGN 16 +$L$ssse3_00_47: + sub rbp,-128 + ror r13d,14 + movdqa xmm4,xmm1 + mov eax,r14d + mov r12d,r9d + movdqa xmm7,xmm3 + ror r14d,9 + xor r13d,r8d + xor r12d,r10d + ror r13d,5 + xor r14d,eax +DB 102,15,58,15,224,4 + and r12d,r8d + xor r13d,r8d +DB 102,15,58,15,250,4 + add r11d,DWORD[rsp] + mov r15d,eax + xor r12d,r10d + ror r14d,11 + movdqa xmm5,xmm4 + xor r15d,ebx + add r11d,r12d + movdqa xmm6,xmm4 + ror r13d,6 + and edi,r15d + psrld xmm4,3 + xor r14d,eax + add r11d,r13d + xor edi,ebx + paddd xmm0,xmm7 + ror r14d,2 + add edx,r11d + psrld xmm6,7 + add r11d,edi + mov r13d,edx + pshufd xmm7,xmm3,250 + add r14d,r11d + ror r13d,14 + pslld xmm5,14 + mov r11d,r14d + mov 
r12d,r8d + pxor xmm4,xmm6 + ror r14d,9 + xor r13d,edx + xor r12d,r9d + ror r13d,5 + psrld xmm6,11 + xor r14d,r11d + pxor xmm4,xmm5 + and r12d,edx + xor r13d,edx + pslld xmm5,11 + add r10d,DWORD[4+rsp] + mov edi,r11d + pxor xmm4,xmm6 + xor r12d,r9d + ror r14d,11 + movdqa xmm6,xmm7 + xor edi,eax + add r10d,r12d + pxor xmm4,xmm5 + ror r13d,6 + and r15d,edi + xor r14d,r11d + psrld xmm7,10 + add r10d,r13d + xor r15d,eax + paddd xmm0,xmm4 + ror r14d,2 + add ecx,r10d + psrlq xmm6,17 + add r10d,r15d + mov r13d,ecx + add r14d,r10d + pxor xmm7,xmm6 + ror r13d,14 + mov r10d,r14d + mov r12d,edx + ror r14d,9 + psrlq xmm6,2 + xor r13d,ecx + xor r12d,r8d + pxor xmm7,xmm6 + ror r13d,5 + xor r14d,r10d + and r12d,ecx + pshufd xmm7,xmm7,128 + xor r13d,ecx + add r9d,DWORD[8+rsp] + mov r15d,r10d + psrldq xmm7,8 + xor r12d,r8d + ror r14d,11 + xor r15d,r11d + add r9d,r12d + ror r13d,6 + paddd xmm0,xmm7 + and edi,r15d + xor r14d,r10d + add r9d,r13d + pshufd xmm7,xmm0,80 + xor edi,r11d + ror r14d,2 + add ebx,r9d + movdqa xmm6,xmm7 + add r9d,edi + mov r13d,ebx + psrld xmm7,10 + add r14d,r9d + ror r13d,14 + psrlq xmm6,17 + mov r9d,r14d + mov r12d,ecx + pxor xmm7,xmm6 + ror r14d,9 + xor r13d,ebx + xor r12d,edx + ror r13d,5 + xor r14d,r9d + psrlq xmm6,2 + and r12d,ebx + xor r13d,ebx + add r8d,DWORD[12+rsp] + pxor xmm7,xmm6 + mov edi,r9d + xor r12d,edx + ror r14d,11 + pshufd xmm7,xmm7,8 + xor edi,r10d + add r8d,r12d + movdqa xmm6,XMMWORD[rbp] + ror r13d,6 + and r15d,edi + pslldq xmm7,8 + xor r14d,r9d + add r8d,r13d + xor r15d,r10d + paddd xmm0,xmm7 + ror r14d,2 + add eax,r8d + add r8d,r15d + paddd xmm6,xmm0 + mov r13d,eax + add r14d,r8d + movdqa XMMWORD[rsp],xmm6 + ror r13d,14 + movdqa xmm4,xmm2 + mov r8d,r14d + mov r12d,ebx + movdqa xmm7,xmm0 + ror r14d,9 + xor r13d,eax + xor r12d,ecx + ror r13d,5 + xor r14d,r8d +DB 102,15,58,15,225,4 + and r12d,eax + xor r13d,eax +DB 102,15,58,15,251,4 + add edx,DWORD[16+rsp] + mov r15d,r8d + xor r12d,ecx + ror r14d,11 + movdqa xmm5,xmm4 + xor r15d,r9d + add edx,r12d + movdqa xmm6,xmm4 + ror r13d,6 + and edi,r15d + psrld xmm4,3 + xor r14d,r8d + add edx,r13d + xor edi,r9d + paddd xmm1,xmm7 + ror r14d,2 + add r11d,edx + psrld xmm6,7 + add edx,edi + mov r13d,r11d + pshufd xmm7,xmm0,250 + add r14d,edx + ror r13d,14 + pslld xmm5,14 + mov edx,r14d + mov r12d,eax + pxor xmm4,xmm6 + ror r14d,9 + xor r13d,r11d + xor r12d,ebx + ror r13d,5 + psrld xmm6,11 + xor r14d,edx + pxor xmm4,xmm5 + and r12d,r11d + xor r13d,r11d + pslld xmm5,11 + add ecx,DWORD[20+rsp] + mov edi,edx + pxor xmm4,xmm6 + xor r12d,ebx + ror r14d,11 + movdqa xmm6,xmm7 + xor edi,r8d + add ecx,r12d + pxor xmm4,xmm5 + ror r13d,6 + and r15d,edi + xor r14d,edx + psrld xmm7,10 + add ecx,r13d + xor r15d,r8d + paddd xmm1,xmm4 + ror r14d,2 + add r10d,ecx + psrlq xmm6,17 + add ecx,r15d + mov r13d,r10d + add r14d,ecx + pxor xmm7,xmm6 + ror r13d,14 + mov ecx,r14d + mov r12d,r11d + ror r14d,9 + psrlq xmm6,2 + xor r13d,r10d + xor r12d,eax + pxor xmm7,xmm6 + ror r13d,5 + xor r14d,ecx + and r12d,r10d + pshufd xmm7,xmm7,128 + xor r13d,r10d + add ebx,DWORD[24+rsp] + mov r15d,ecx + psrldq xmm7,8 + xor r12d,eax + ror r14d,11 + xor r15d,edx + add ebx,r12d + ror r13d,6 + paddd xmm1,xmm7 + and edi,r15d + xor r14d,ecx + add ebx,r13d + pshufd xmm7,xmm1,80 + xor edi,edx + ror r14d,2 + add r9d,ebx + movdqa xmm6,xmm7 + add ebx,edi + mov r13d,r9d + psrld xmm7,10 + add r14d,ebx + ror r13d,14 + psrlq xmm6,17 + mov ebx,r14d + mov r12d,r10d + pxor xmm7,xmm6 + ror r14d,9 + xor r13d,r9d + xor r12d,r11d + ror r13d,5 + xor r14d,ebx + psrlq xmm6,2 + and r12d,r9d + 
xor r13d,r9d + add eax,DWORD[28+rsp] + pxor xmm7,xmm6 + mov edi,ebx + xor r12d,r11d + ror r14d,11 + pshufd xmm7,xmm7,8 + xor edi,ecx + add eax,r12d + movdqa xmm6,XMMWORD[32+rbp] + ror r13d,6 + and r15d,edi + pslldq xmm7,8 + xor r14d,ebx + add eax,r13d + xor r15d,ecx + paddd xmm1,xmm7 + ror r14d,2 + add r8d,eax + add eax,r15d + paddd xmm6,xmm1 + mov r13d,r8d + add r14d,eax + movdqa XMMWORD[16+rsp],xmm6 + ror r13d,14 + movdqa xmm4,xmm3 + mov eax,r14d + mov r12d,r9d + movdqa xmm7,xmm1 + ror r14d,9 + xor r13d,r8d + xor r12d,r10d + ror r13d,5 + xor r14d,eax +DB 102,15,58,15,226,4 + and r12d,r8d + xor r13d,r8d +DB 102,15,58,15,248,4 + add r11d,DWORD[32+rsp] + mov r15d,eax + xor r12d,r10d + ror r14d,11 + movdqa xmm5,xmm4 + xor r15d,ebx + add r11d,r12d + movdqa xmm6,xmm4 + ror r13d,6 + and edi,r15d + psrld xmm4,3 + xor r14d,eax + add r11d,r13d + xor edi,ebx + paddd xmm2,xmm7 + ror r14d,2 + add edx,r11d + psrld xmm6,7 + add r11d,edi + mov r13d,edx + pshufd xmm7,xmm1,250 + add r14d,r11d + ror r13d,14 + pslld xmm5,14 + mov r11d,r14d + mov r12d,r8d + pxor xmm4,xmm6 + ror r14d,9 + xor r13d,edx + xor r12d,r9d + ror r13d,5 + psrld xmm6,11 + xor r14d,r11d + pxor xmm4,xmm5 + and r12d,edx + xor r13d,edx + pslld xmm5,11 + add r10d,DWORD[36+rsp] + mov edi,r11d + pxor xmm4,xmm6 + xor r12d,r9d + ror r14d,11 + movdqa xmm6,xmm7 + xor edi,eax + add r10d,r12d + pxor xmm4,xmm5 + ror r13d,6 + and r15d,edi + xor r14d,r11d + psrld xmm7,10 + add r10d,r13d + xor r15d,eax + paddd xmm2,xmm4 + ror r14d,2 + add ecx,r10d + psrlq xmm6,17 + add r10d,r15d + mov r13d,ecx + add r14d,r10d + pxor xmm7,xmm6 + ror r13d,14 + mov r10d,r14d + mov r12d,edx + ror r14d,9 + psrlq xmm6,2 + xor r13d,ecx + xor r12d,r8d + pxor xmm7,xmm6 + ror r13d,5 + xor r14d,r10d + and r12d,ecx + pshufd xmm7,xmm7,128 + xor r13d,ecx + add r9d,DWORD[40+rsp] + mov r15d,r10d + psrldq xmm7,8 + xor r12d,r8d + ror r14d,11 + xor r15d,r11d + add r9d,r12d + ror r13d,6 + paddd xmm2,xmm7 + and edi,r15d + xor r14d,r10d + add r9d,r13d + pshufd xmm7,xmm2,80 + xor edi,r11d + ror r14d,2 + add ebx,r9d + movdqa xmm6,xmm7 + add r9d,edi + mov r13d,ebx + psrld xmm7,10 + add r14d,r9d + ror r13d,14 + psrlq xmm6,17 + mov r9d,r14d + mov r12d,ecx + pxor xmm7,xmm6 + ror r14d,9 + xor r13d,ebx + xor r12d,edx + ror r13d,5 + xor r14d,r9d + psrlq xmm6,2 + and r12d,ebx + xor r13d,ebx + add r8d,DWORD[44+rsp] + pxor xmm7,xmm6 + mov edi,r9d + xor r12d,edx + ror r14d,11 + pshufd xmm7,xmm7,8 + xor edi,r10d + add r8d,r12d + movdqa xmm6,XMMWORD[64+rbp] + ror r13d,6 + and r15d,edi + pslldq xmm7,8 + xor r14d,r9d + add r8d,r13d + xor r15d,r10d + paddd xmm2,xmm7 + ror r14d,2 + add eax,r8d + add r8d,r15d + paddd xmm6,xmm2 + mov r13d,eax + add r14d,r8d + movdqa XMMWORD[32+rsp],xmm6 + ror r13d,14 + movdqa xmm4,xmm0 + mov r8d,r14d + mov r12d,ebx + movdqa xmm7,xmm2 + ror r14d,9 + xor r13d,eax + xor r12d,ecx + ror r13d,5 + xor r14d,r8d +DB 102,15,58,15,227,4 + and r12d,eax + xor r13d,eax +DB 102,15,58,15,249,4 + add edx,DWORD[48+rsp] + mov r15d,r8d + xor r12d,ecx + ror r14d,11 + movdqa xmm5,xmm4 + xor r15d,r9d + add edx,r12d + movdqa xmm6,xmm4 + ror r13d,6 + and edi,r15d + psrld xmm4,3 + xor r14d,r8d + add edx,r13d + xor edi,r9d + paddd xmm3,xmm7 + ror r14d,2 + add r11d,edx + psrld xmm6,7 + add edx,edi + mov r13d,r11d + pshufd xmm7,xmm2,250 + add r14d,edx + ror r13d,14 + pslld xmm5,14 + mov edx,r14d + mov r12d,eax + pxor xmm4,xmm6 + ror r14d,9 + xor r13d,r11d + xor r12d,ebx + ror r13d,5 + psrld xmm6,11 + xor r14d,edx + pxor xmm4,xmm5 + and r12d,r11d + xor r13d,r11d + pslld xmm5,11 + add ecx,DWORD[52+rsp] + mov 
edi,edx + pxor xmm4,xmm6 + xor r12d,ebx + ror r14d,11 + movdqa xmm6,xmm7 + xor edi,r8d + add ecx,r12d + pxor xmm4,xmm5 + ror r13d,6 + and r15d,edi + xor r14d,edx + psrld xmm7,10 + add ecx,r13d + xor r15d,r8d + paddd xmm3,xmm4 + ror r14d,2 + add r10d,ecx + psrlq xmm6,17 + add ecx,r15d + mov r13d,r10d + add r14d,ecx + pxor xmm7,xmm6 + ror r13d,14 + mov ecx,r14d + mov r12d,r11d + ror r14d,9 + psrlq xmm6,2 + xor r13d,r10d + xor r12d,eax + pxor xmm7,xmm6 + ror r13d,5 + xor r14d,ecx + and r12d,r10d + pshufd xmm7,xmm7,128 + xor r13d,r10d + add ebx,DWORD[56+rsp] + mov r15d,ecx + psrldq xmm7,8 + xor r12d,eax + ror r14d,11 + xor r15d,edx + add ebx,r12d + ror r13d,6 + paddd xmm3,xmm7 + and edi,r15d + xor r14d,ecx + add ebx,r13d + pshufd xmm7,xmm3,80 + xor edi,edx + ror r14d,2 + add r9d,ebx + movdqa xmm6,xmm7 + add ebx,edi + mov r13d,r9d + psrld xmm7,10 + add r14d,ebx + ror r13d,14 + psrlq xmm6,17 + mov ebx,r14d + mov r12d,r10d + pxor xmm7,xmm6 + ror r14d,9 + xor r13d,r9d + xor r12d,r11d + ror r13d,5 + xor r14d,ebx + psrlq xmm6,2 + and r12d,r9d + xor r13d,r9d + add eax,DWORD[60+rsp] + pxor xmm7,xmm6 + mov edi,ebx + xor r12d,r11d + ror r14d,11 + pshufd xmm7,xmm7,8 + xor edi,ecx + add eax,r12d + movdqa xmm6,XMMWORD[96+rbp] + ror r13d,6 + and r15d,edi + pslldq xmm7,8 + xor r14d,ebx + add eax,r13d + xor r15d,ecx + paddd xmm3,xmm7 + ror r14d,2 + add r8d,eax + add eax,r15d + paddd xmm6,xmm3 + mov r13d,r8d + add r14d,eax + movdqa XMMWORD[48+rsp],xmm6 + cmp BYTE[131+rbp],0 + jne NEAR $L$ssse3_00_47 + ror r13d,14 + mov eax,r14d + mov r12d,r9d + ror r14d,9 + xor r13d,r8d + xor r12d,r10d + ror r13d,5 + xor r14d,eax + and r12d,r8d + xor r13d,r8d + add r11d,DWORD[rsp] + mov r15d,eax + xor r12d,r10d + ror r14d,11 + xor r15d,ebx + add r11d,r12d + ror r13d,6 + and edi,r15d + xor r14d,eax + add r11d,r13d + xor edi,ebx + ror r14d,2 + add edx,r11d + add r11d,edi + mov r13d,edx + add r14d,r11d + ror r13d,14 + mov r11d,r14d + mov r12d,r8d + ror r14d,9 + xor r13d,edx + xor r12d,r9d + ror r13d,5 + xor r14d,r11d + and r12d,edx + xor r13d,edx + add r10d,DWORD[4+rsp] + mov edi,r11d + xor r12d,r9d + ror r14d,11 + xor edi,eax + add r10d,r12d + ror r13d,6 + and r15d,edi + xor r14d,r11d + add r10d,r13d + xor r15d,eax + ror r14d,2 + add ecx,r10d + add r10d,r15d + mov r13d,ecx + add r14d,r10d + ror r13d,14 + mov r10d,r14d + mov r12d,edx + ror r14d,9 + xor r13d,ecx + xor r12d,r8d + ror r13d,5 + xor r14d,r10d + and r12d,ecx + xor r13d,ecx + add r9d,DWORD[8+rsp] + mov r15d,r10d + xor r12d,r8d + ror r14d,11 + xor r15d,r11d + add r9d,r12d + ror r13d,6 + and edi,r15d + xor r14d,r10d + add r9d,r13d + xor edi,r11d + ror r14d,2 + add ebx,r9d + add r9d,edi + mov r13d,ebx + add r14d,r9d + ror r13d,14 + mov r9d,r14d + mov r12d,ecx + ror r14d,9 + xor r13d,ebx + xor r12d,edx + ror r13d,5 + xor r14d,r9d + and r12d,ebx + xor r13d,ebx + add r8d,DWORD[12+rsp] + mov edi,r9d + xor r12d,edx + ror r14d,11 + xor edi,r10d + add r8d,r12d + ror r13d,6 + and r15d,edi + xor r14d,r9d + add r8d,r13d + xor r15d,r10d + ror r14d,2 + add eax,r8d + add r8d,r15d + mov r13d,eax + add r14d,r8d + ror r13d,14 + mov r8d,r14d + mov r12d,ebx + ror r14d,9 + xor r13d,eax + xor r12d,ecx + ror r13d,5 + xor r14d,r8d + and r12d,eax + xor r13d,eax + add edx,DWORD[16+rsp] + mov r15d,r8d + xor r12d,ecx + ror r14d,11 + xor r15d,r9d + add edx,r12d + ror r13d,6 + and edi,r15d + xor r14d,r8d + add edx,r13d + xor edi,r9d + ror r14d,2 + add r11d,edx + add edx,edi + mov r13d,r11d + add r14d,edx + ror r13d,14 + mov edx,r14d + mov r12d,eax + ror r14d,9 + xor r13d,r11d + xor r12d,ebx + ror 
r13d,5 + xor r14d,edx + and r12d,r11d + xor r13d,r11d + add ecx,DWORD[20+rsp] + mov edi,edx + xor r12d,ebx + ror r14d,11 + xor edi,r8d + add ecx,r12d + ror r13d,6 + and r15d,edi + xor r14d,edx + add ecx,r13d + xor r15d,r8d + ror r14d,2 + add r10d,ecx + add ecx,r15d + mov r13d,r10d + add r14d,ecx + ror r13d,14 + mov ecx,r14d + mov r12d,r11d + ror r14d,9 + xor r13d,r10d + xor r12d,eax + ror r13d,5 + xor r14d,ecx + and r12d,r10d + xor r13d,r10d + add ebx,DWORD[24+rsp] + mov r15d,ecx + xor r12d,eax + ror r14d,11 + xor r15d,edx + add ebx,r12d + ror r13d,6 + and edi,r15d + xor r14d,ecx + add ebx,r13d + xor edi,edx + ror r14d,2 + add r9d,ebx + add ebx,edi + mov r13d,r9d + add r14d,ebx + ror r13d,14 + mov ebx,r14d + mov r12d,r10d + ror r14d,9 + xor r13d,r9d + xor r12d,r11d + ror r13d,5 + xor r14d,ebx + and r12d,r9d + xor r13d,r9d + add eax,DWORD[28+rsp] + mov edi,ebx + xor r12d,r11d + ror r14d,11 + xor edi,ecx + add eax,r12d + ror r13d,6 + and r15d,edi + xor r14d,ebx + add eax,r13d + xor r15d,ecx + ror r14d,2 + add r8d,eax + add eax,r15d + mov r13d,r8d + add r14d,eax + ror r13d,14 + mov eax,r14d + mov r12d,r9d + ror r14d,9 + xor r13d,r8d + xor r12d,r10d + ror r13d,5 + xor r14d,eax + and r12d,r8d + xor r13d,r8d + add r11d,DWORD[32+rsp] + mov r15d,eax + xor r12d,r10d + ror r14d,11 + xor r15d,ebx + add r11d,r12d + ror r13d,6 + and edi,r15d + xor r14d,eax + add r11d,r13d + xor edi,ebx + ror r14d,2 + add edx,r11d + add r11d,edi + mov r13d,edx + add r14d,r11d + ror r13d,14 + mov r11d,r14d + mov r12d,r8d + ror r14d,9 + xor r13d,edx + xor r12d,r9d + ror r13d,5 + xor r14d,r11d + and r12d,edx + xor r13d,edx + add r10d,DWORD[36+rsp] + mov edi,r11d + xor r12d,r9d + ror r14d,11 + xor edi,eax + add r10d,r12d + ror r13d,6 + and r15d,edi + xor r14d,r11d + add r10d,r13d + xor r15d,eax + ror r14d,2 + add ecx,r10d + add r10d,r15d + mov r13d,ecx + add r14d,r10d + ror r13d,14 + mov r10d,r14d + mov r12d,edx + ror r14d,9 + xor r13d,ecx + xor r12d,r8d + ror r13d,5 + xor r14d,r10d + and r12d,ecx + xor r13d,ecx + add r9d,DWORD[40+rsp] + mov r15d,r10d + xor r12d,r8d + ror r14d,11 + xor r15d,r11d + add r9d,r12d + ror r13d,6 + and edi,r15d + xor r14d,r10d + add r9d,r13d + xor edi,r11d + ror r14d,2 + add ebx,r9d + add r9d,edi + mov r13d,ebx + add r14d,r9d + ror r13d,14 + mov r9d,r14d + mov r12d,ecx + ror r14d,9 + xor r13d,ebx + xor r12d,edx + ror r13d,5 + xor r14d,r9d + and r12d,ebx + xor r13d,ebx + add r8d,DWORD[44+rsp] + mov edi,r9d + xor r12d,edx + ror r14d,11 + xor edi,r10d + add r8d,r12d + ror r13d,6 + and r15d,edi + xor r14d,r9d + add r8d,r13d + xor r15d,r10d + ror r14d,2 + add eax,r8d + add r8d,r15d + mov r13d,eax + add r14d,r8d + ror r13d,14 + mov r8d,r14d + mov r12d,ebx + ror r14d,9 + xor r13d,eax + xor r12d,ecx + ror r13d,5 + xor r14d,r8d + and r12d,eax + xor r13d,eax + add edx,DWORD[48+rsp] + mov r15d,r8d + xor r12d,ecx + ror r14d,11 + xor r15d,r9d + add edx,r12d + ror r13d,6 + and edi,r15d + xor r14d,r8d + add edx,r13d + xor edi,r9d + ror r14d,2 + add r11d,edx + add edx,edi + mov r13d,r11d + add r14d,edx + ror r13d,14 + mov edx,r14d + mov r12d,eax + ror r14d,9 + xor r13d,r11d + xor r12d,ebx + ror r13d,5 + xor r14d,edx + and r12d,r11d + xor r13d,r11d + add ecx,DWORD[52+rsp] + mov edi,edx + xor r12d,ebx + ror r14d,11 + xor edi,r8d + add ecx,r12d + ror r13d,6 + and r15d,edi + xor r14d,edx + add ecx,r13d + xor r15d,r8d + ror r14d,2 + add r10d,ecx + add ecx,r15d + mov r13d,r10d + add r14d,ecx + ror r13d,14 + mov ecx,r14d + mov r12d,r11d + ror r14d,9 + xor r13d,r10d + xor r12d,eax + ror r13d,5 + xor r14d,ecx + and 
r12d,r10d + xor r13d,r10d + add ebx,DWORD[56+rsp] + mov r15d,ecx + xor r12d,eax + ror r14d,11 + xor r15d,edx + add ebx,r12d + ror r13d,6 + and edi,r15d + xor r14d,ecx + add ebx,r13d + xor edi,edx + ror r14d,2 + add r9d,ebx + add ebx,edi + mov r13d,r9d + add r14d,ebx + ror r13d,14 + mov ebx,r14d + mov r12d,r10d + ror r14d,9 + xor r13d,r9d + xor r12d,r11d + ror r13d,5 + xor r14d,ebx + and r12d,r9d + xor r13d,r9d + add eax,DWORD[60+rsp] + mov edi,ebx + xor r12d,r11d + ror r14d,11 + xor edi,ecx + add eax,r12d + ror r13d,6 + and r15d,edi + xor r14d,ebx + add eax,r13d + xor r15d,ecx + ror r14d,2 + add r8d,eax + add eax,r15d + mov r13d,r8d + add r14d,eax + mov rdi,QWORD[((64+0))+rsp] + mov eax,r14d + + add eax,DWORD[rdi] + lea rsi,[64+rsi] + add ebx,DWORD[4+rdi] + add ecx,DWORD[8+rdi] + add edx,DWORD[12+rdi] + add r8d,DWORD[16+rdi] + add r9d,DWORD[20+rdi] + add r10d,DWORD[24+rdi] + add r11d,DWORD[28+rdi] + + cmp rsi,QWORD[((64+16))+rsp] + + mov DWORD[rdi],eax + mov DWORD[4+rdi],ebx + mov DWORD[8+rdi],ecx + mov DWORD[12+rdi],edx + mov DWORD[16+rdi],r8d + mov DWORD[20+rdi],r9d + mov DWORD[24+rdi],r10d + mov DWORD[28+rdi],r11d + jb NEAR $L$loop_ssse3 + + mov rsi,QWORD[88+rsp] + + movaps xmm6,XMMWORD[((64+32))+rsp] + movaps xmm7,XMMWORD[((64+48))+rsp] + movaps xmm8,XMMWORD[((64+64))+rsp] + movaps xmm9,XMMWORD[((64+80))+rsp] + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbp,QWORD[((-16))+rsi] + + mov rbx,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$epilogue_ssse3: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_sha256_block_data_order_ssse3: +global sha256_block_data_order_avx + +ALIGN 64 +sha256_block_data_order_avx: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_sha256_block_data_order_avx: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + +_CET_ENDBR + mov rax,rsp + + push rbx + + push rbp + + push r12 + + push r13 + + push r14 + + push r15 + + shl rdx,4 + sub rsp,160 + lea rdx,[rdx*4+rsi] + and rsp,-64 + mov QWORD[((64+0))+rsp],rdi + mov QWORD[((64+8))+rsp],rsi + mov QWORD[((64+16))+rsp],rdx + mov QWORD[88+rsp],rax + + movaps XMMWORD[(64+32)+rsp],xmm6 + movaps XMMWORD[(64+48)+rsp],xmm7 + movaps XMMWORD[(64+64)+rsp],xmm8 + movaps XMMWORD[(64+80)+rsp],xmm9 +$L$prologue_avx: + + vzeroupper + mov eax,DWORD[rdi] + mov ebx,DWORD[4+rdi] + mov ecx,DWORD[8+rdi] + mov edx,DWORD[12+rdi] + mov r8d,DWORD[16+rdi] + mov r9d,DWORD[20+rdi] + mov r10d,DWORD[24+rdi] + mov r11d,DWORD[28+rdi] + vmovdqa xmm8,XMMWORD[((K256+512+32))] + vmovdqa xmm9,XMMWORD[((K256+512+64))] + jmp NEAR $L$loop_avx +ALIGN 16 +$L$loop_avx: + vmovdqa xmm7,XMMWORD[((K256+512))] + vmovdqu xmm0,XMMWORD[rsi] + vmovdqu xmm1,XMMWORD[16+rsi] + vmovdqu xmm2,XMMWORD[32+rsi] + vmovdqu xmm3,XMMWORD[48+rsi] + vpshufb xmm0,xmm0,xmm7 + lea rbp,[K256] + vpshufb xmm1,xmm1,xmm7 + vpshufb xmm2,xmm2,xmm7 + vpaddd xmm4,xmm0,XMMWORD[rbp] + vpshufb xmm3,xmm3,xmm7 + vpaddd xmm5,xmm1,XMMWORD[32+rbp] + vpaddd xmm6,xmm2,XMMWORD[64+rbp] + vpaddd xmm7,xmm3,XMMWORD[96+rbp] + vmovdqa XMMWORD[rsp],xmm4 + mov r14d,eax + vmovdqa XMMWORD[16+rsp],xmm5 + mov edi,ebx + vmovdqa XMMWORD[32+rsp],xmm6 + xor edi,ecx + vmovdqa XMMWORD[48+rsp],xmm7 + mov r13d,r8d + jmp NEAR $L$avx_00_47 + +ALIGN 16 +$L$avx_00_47: + sub rbp,-128 + vpalignr xmm4,xmm1,xmm0,4 + shrd r13d,r13d,14 + mov eax,r14d + mov r12d,r9d + vpalignr xmm7,xmm3,xmm2,4 + shrd r14d,r14d,9 + xor r13d,r8d + xor r12d,r10d + vpsrld xmm6,xmm4,7 + shrd r13d,r13d,5 + xor 
r14d,eax + and r12d,r8d + vpaddd xmm0,xmm0,xmm7 + xor r13d,r8d + add r11d,DWORD[rsp] + mov r15d,eax + vpsrld xmm7,xmm4,3 + xor r12d,r10d + shrd r14d,r14d,11 + xor r15d,ebx + vpslld xmm5,xmm4,14 + add r11d,r12d + shrd r13d,r13d,6 + and edi,r15d + vpxor xmm4,xmm7,xmm6 + xor r14d,eax + add r11d,r13d + xor edi,ebx + vpshufd xmm7,xmm3,250 + shrd r14d,r14d,2 + add edx,r11d + add r11d,edi + vpsrld xmm6,xmm6,11 + mov r13d,edx + add r14d,r11d + shrd r13d,r13d,14 + vpxor xmm4,xmm4,xmm5 + mov r11d,r14d + mov r12d,r8d + shrd r14d,r14d,9 + vpslld xmm5,xmm5,11 + xor r13d,edx + xor r12d,r9d + shrd r13d,r13d,5 + vpxor xmm4,xmm4,xmm6 + xor r14d,r11d + and r12d,edx + xor r13d,edx + vpsrld xmm6,xmm7,10 + add r10d,DWORD[4+rsp] + mov edi,r11d + xor r12d,r9d + vpxor xmm4,xmm4,xmm5 + shrd r14d,r14d,11 + xor edi,eax + add r10d,r12d + vpsrlq xmm7,xmm7,17 + shrd r13d,r13d,6 + and r15d,edi + xor r14d,r11d + vpaddd xmm0,xmm0,xmm4 + add r10d,r13d + xor r15d,eax + shrd r14d,r14d,2 + vpxor xmm6,xmm6,xmm7 + add ecx,r10d + add r10d,r15d + mov r13d,ecx + vpsrlq xmm7,xmm7,2 + add r14d,r10d + shrd r13d,r13d,14 + mov r10d,r14d + vpxor xmm6,xmm6,xmm7 + mov r12d,edx + shrd r14d,r14d,9 + xor r13d,ecx + vpshufb xmm6,xmm6,xmm8 + xor r12d,r8d + shrd r13d,r13d,5 + xor r14d,r10d + vpaddd xmm0,xmm0,xmm6 + and r12d,ecx + xor r13d,ecx + add r9d,DWORD[8+rsp] + vpshufd xmm7,xmm0,80 + mov r15d,r10d + xor r12d,r8d + shrd r14d,r14d,11 + vpsrld xmm6,xmm7,10 + xor r15d,r11d + add r9d,r12d + shrd r13d,r13d,6 + vpsrlq xmm7,xmm7,17 + and edi,r15d + xor r14d,r10d + add r9d,r13d + vpxor xmm6,xmm6,xmm7 + xor edi,r11d + shrd r14d,r14d,2 + add ebx,r9d + vpsrlq xmm7,xmm7,2 + add r9d,edi + mov r13d,ebx + add r14d,r9d + vpxor xmm6,xmm6,xmm7 + shrd r13d,r13d,14 + mov r9d,r14d + mov r12d,ecx + vpshufb xmm6,xmm6,xmm9 + shrd r14d,r14d,9 + xor r13d,ebx + xor r12d,edx + vpaddd xmm0,xmm0,xmm6 + shrd r13d,r13d,5 + xor r14d,r9d + and r12d,ebx + vpaddd xmm6,xmm0,XMMWORD[rbp] + xor r13d,ebx + add r8d,DWORD[12+rsp] + mov edi,r9d + xor r12d,edx + shrd r14d,r14d,11 + xor edi,r10d + add r8d,r12d + shrd r13d,r13d,6 + and r15d,edi + xor r14d,r9d + add r8d,r13d + xor r15d,r10d + shrd r14d,r14d,2 + add eax,r8d + add r8d,r15d + mov r13d,eax + add r14d,r8d + vmovdqa XMMWORD[rsp],xmm6 + vpalignr xmm4,xmm2,xmm1,4 + shrd r13d,r13d,14 + mov r8d,r14d + mov r12d,ebx + vpalignr xmm7,xmm0,xmm3,4 + shrd r14d,r14d,9 + xor r13d,eax + xor r12d,ecx + vpsrld xmm6,xmm4,7 + shrd r13d,r13d,5 + xor r14d,r8d + and r12d,eax + vpaddd xmm1,xmm1,xmm7 + xor r13d,eax + add edx,DWORD[16+rsp] + mov r15d,r8d + vpsrld xmm7,xmm4,3 + xor r12d,ecx + shrd r14d,r14d,11 + xor r15d,r9d + vpslld xmm5,xmm4,14 + add edx,r12d + shrd r13d,r13d,6 + and edi,r15d + vpxor xmm4,xmm7,xmm6 + xor r14d,r8d + add edx,r13d + xor edi,r9d + vpshufd xmm7,xmm0,250 + shrd r14d,r14d,2 + add r11d,edx + add edx,edi + vpsrld xmm6,xmm6,11 + mov r13d,r11d + add r14d,edx + shrd r13d,r13d,14 + vpxor xmm4,xmm4,xmm5 + mov edx,r14d + mov r12d,eax + shrd r14d,r14d,9 + vpslld xmm5,xmm5,11 + xor r13d,r11d + xor r12d,ebx + shrd r13d,r13d,5 + vpxor xmm4,xmm4,xmm6 + xor r14d,edx + and r12d,r11d + xor r13d,r11d + vpsrld xmm6,xmm7,10 + add ecx,DWORD[20+rsp] + mov edi,edx + xor r12d,ebx + vpxor xmm4,xmm4,xmm5 + shrd r14d,r14d,11 + xor edi,r8d + add ecx,r12d + vpsrlq xmm7,xmm7,17 + shrd r13d,r13d,6 + and r15d,edi + xor r14d,edx + vpaddd xmm1,xmm1,xmm4 + add ecx,r13d + xor r15d,r8d + shrd r14d,r14d,2 + vpxor xmm6,xmm6,xmm7 + add r10d,ecx + add ecx,r15d + mov r13d,r10d + vpsrlq xmm7,xmm7,2 + add r14d,ecx + shrd r13d,r13d,14 + mov ecx,r14d + vpxor 
xmm6,xmm6,xmm7 + mov r12d,r11d + shrd r14d,r14d,9 + xor r13d,r10d + vpshufb xmm6,xmm6,xmm8 + xor r12d,eax + shrd r13d,r13d,5 + xor r14d,ecx + vpaddd xmm1,xmm1,xmm6 + and r12d,r10d + xor r13d,r10d + add ebx,DWORD[24+rsp] + vpshufd xmm7,xmm1,80 + mov r15d,ecx + xor r12d,eax + shrd r14d,r14d,11 + vpsrld xmm6,xmm7,10 + xor r15d,edx + add ebx,r12d + shrd r13d,r13d,6 + vpsrlq xmm7,xmm7,17 + and edi,r15d + xor r14d,ecx + add ebx,r13d + vpxor xmm6,xmm6,xmm7 + xor edi,edx + shrd r14d,r14d,2 + add r9d,ebx + vpsrlq xmm7,xmm7,2 + add ebx,edi + mov r13d,r9d + add r14d,ebx + vpxor xmm6,xmm6,xmm7 + shrd r13d,r13d,14 + mov ebx,r14d + mov r12d,r10d + vpshufb xmm6,xmm6,xmm9 + shrd r14d,r14d,9 + xor r13d,r9d + xor r12d,r11d + vpaddd xmm1,xmm1,xmm6 + shrd r13d,r13d,5 + xor r14d,ebx + and r12d,r9d + vpaddd xmm6,xmm1,XMMWORD[32+rbp] + xor r13d,r9d + add eax,DWORD[28+rsp] + mov edi,ebx + xor r12d,r11d + shrd r14d,r14d,11 + xor edi,ecx + add eax,r12d + shrd r13d,r13d,6 + and r15d,edi + xor r14d,ebx + add eax,r13d + xor r15d,ecx + shrd r14d,r14d,2 + add r8d,eax + add eax,r15d + mov r13d,r8d + add r14d,eax + vmovdqa XMMWORD[16+rsp],xmm6 + vpalignr xmm4,xmm3,xmm2,4 + shrd r13d,r13d,14 + mov eax,r14d + mov r12d,r9d + vpalignr xmm7,xmm1,xmm0,4 + shrd r14d,r14d,9 + xor r13d,r8d + xor r12d,r10d + vpsrld xmm6,xmm4,7 + shrd r13d,r13d,5 + xor r14d,eax + and r12d,r8d + vpaddd xmm2,xmm2,xmm7 + xor r13d,r8d + add r11d,DWORD[32+rsp] + mov r15d,eax + vpsrld xmm7,xmm4,3 + xor r12d,r10d + shrd r14d,r14d,11 + xor r15d,ebx + vpslld xmm5,xmm4,14 + add r11d,r12d + shrd r13d,r13d,6 + and edi,r15d + vpxor xmm4,xmm7,xmm6 + xor r14d,eax + add r11d,r13d + xor edi,ebx + vpshufd xmm7,xmm1,250 + shrd r14d,r14d,2 + add edx,r11d + add r11d,edi + vpsrld xmm6,xmm6,11 + mov r13d,edx + add r14d,r11d + shrd r13d,r13d,14 + vpxor xmm4,xmm4,xmm5 + mov r11d,r14d + mov r12d,r8d + shrd r14d,r14d,9 + vpslld xmm5,xmm5,11 + xor r13d,edx + xor r12d,r9d + shrd r13d,r13d,5 + vpxor xmm4,xmm4,xmm6 + xor r14d,r11d + and r12d,edx + xor r13d,edx + vpsrld xmm6,xmm7,10 + add r10d,DWORD[36+rsp] + mov edi,r11d + xor r12d,r9d + vpxor xmm4,xmm4,xmm5 + shrd r14d,r14d,11 + xor edi,eax + add r10d,r12d + vpsrlq xmm7,xmm7,17 + shrd r13d,r13d,6 + and r15d,edi + xor r14d,r11d + vpaddd xmm2,xmm2,xmm4 + add r10d,r13d + xor r15d,eax + shrd r14d,r14d,2 + vpxor xmm6,xmm6,xmm7 + add ecx,r10d + add r10d,r15d + mov r13d,ecx + vpsrlq xmm7,xmm7,2 + add r14d,r10d + shrd r13d,r13d,14 + mov r10d,r14d + vpxor xmm6,xmm6,xmm7 + mov r12d,edx + shrd r14d,r14d,9 + xor r13d,ecx + vpshufb xmm6,xmm6,xmm8 + xor r12d,r8d + shrd r13d,r13d,5 + xor r14d,r10d + vpaddd xmm2,xmm2,xmm6 + and r12d,ecx + xor r13d,ecx + add r9d,DWORD[40+rsp] + vpshufd xmm7,xmm2,80 + mov r15d,r10d + xor r12d,r8d + shrd r14d,r14d,11 + vpsrld xmm6,xmm7,10 + xor r15d,r11d + add r9d,r12d + shrd r13d,r13d,6 + vpsrlq xmm7,xmm7,17 + and edi,r15d + xor r14d,r10d + add r9d,r13d + vpxor xmm6,xmm6,xmm7 + xor edi,r11d + shrd r14d,r14d,2 + add ebx,r9d + vpsrlq xmm7,xmm7,2 + add r9d,edi + mov r13d,ebx + add r14d,r9d + vpxor xmm6,xmm6,xmm7 + shrd r13d,r13d,14 + mov r9d,r14d + mov r12d,ecx + vpshufb xmm6,xmm6,xmm9 + shrd r14d,r14d,9 + xor r13d,ebx + xor r12d,edx + vpaddd xmm2,xmm2,xmm6 + shrd r13d,r13d,5 + xor r14d,r9d + and r12d,ebx + vpaddd xmm6,xmm2,XMMWORD[64+rbp] + xor r13d,ebx + add r8d,DWORD[44+rsp] + mov edi,r9d + xor r12d,edx + shrd r14d,r14d,11 + xor edi,r10d + add r8d,r12d + shrd r13d,r13d,6 + and r15d,edi + xor r14d,r9d + add r8d,r13d + xor r15d,r10d + shrd r14d,r14d,2 + add eax,r8d + add r8d,r15d + mov r13d,eax + add r14d,r8d + 
vmovdqa XMMWORD[32+rsp],xmm6 + vpalignr xmm4,xmm0,xmm3,4 + shrd r13d,r13d,14 + mov r8d,r14d + mov r12d,ebx + vpalignr xmm7,xmm2,xmm1,4 + shrd r14d,r14d,9 + xor r13d,eax + xor r12d,ecx + vpsrld xmm6,xmm4,7 + shrd r13d,r13d,5 + xor r14d,r8d + and r12d,eax + vpaddd xmm3,xmm3,xmm7 + xor r13d,eax + add edx,DWORD[48+rsp] + mov r15d,r8d + vpsrld xmm7,xmm4,3 + xor r12d,ecx + shrd r14d,r14d,11 + xor r15d,r9d + vpslld xmm5,xmm4,14 + add edx,r12d + shrd r13d,r13d,6 + and edi,r15d + vpxor xmm4,xmm7,xmm6 + xor r14d,r8d + add edx,r13d + xor edi,r9d + vpshufd xmm7,xmm2,250 + shrd r14d,r14d,2 + add r11d,edx + add edx,edi + vpsrld xmm6,xmm6,11 + mov r13d,r11d + add r14d,edx + shrd r13d,r13d,14 + vpxor xmm4,xmm4,xmm5 + mov edx,r14d + mov r12d,eax + shrd r14d,r14d,9 + vpslld xmm5,xmm5,11 + xor r13d,r11d + xor r12d,ebx + shrd r13d,r13d,5 + vpxor xmm4,xmm4,xmm6 + xor r14d,edx + and r12d,r11d + xor r13d,r11d + vpsrld xmm6,xmm7,10 + add ecx,DWORD[52+rsp] + mov edi,edx + xor r12d,ebx + vpxor xmm4,xmm4,xmm5 + shrd r14d,r14d,11 + xor edi,r8d + add ecx,r12d + vpsrlq xmm7,xmm7,17 + shrd r13d,r13d,6 + and r15d,edi + xor r14d,edx + vpaddd xmm3,xmm3,xmm4 + add ecx,r13d + xor r15d,r8d + shrd r14d,r14d,2 + vpxor xmm6,xmm6,xmm7 + add r10d,ecx + add ecx,r15d + mov r13d,r10d + vpsrlq xmm7,xmm7,2 + add r14d,ecx + shrd r13d,r13d,14 + mov ecx,r14d + vpxor xmm6,xmm6,xmm7 + mov r12d,r11d + shrd r14d,r14d,9 + xor r13d,r10d + vpshufb xmm6,xmm6,xmm8 + xor r12d,eax + shrd r13d,r13d,5 + xor r14d,ecx + vpaddd xmm3,xmm3,xmm6 + and r12d,r10d + xor r13d,r10d + add ebx,DWORD[56+rsp] + vpshufd xmm7,xmm3,80 + mov r15d,ecx + xor r12d,eax + shrd r14d,r14d,11 + vpsrld xmm6,xmm7,10 + xor r15d,edx + add ebx,r12d + shrd r13d,r13d,6 + vpsrlq xmm7,xmm7,17 + and edi,r15d + xor r14d,ecx + add ebx,r13d + vpxor xmm6,xmm6,xmm7 + xor edi,edx + shrd r14d,r14d,2 + add r9d,ebx + vpsrlq xmm7,xmm7,2 + add ebx,edi + mov r13d,r9d + add r14d,ebx + vpxor xmm6,xmm6,xmm7 + shrd r13d,r13d,14 + mov ebx,r14d + mov r12d,r10d + vpshufb xmm6,xmm6,xmm9 + shrd r14d,r14d,9 + xor r13d,r9d + xor r12d,r11d + vpaddd xmm3,xmm3,xmm6 + shrd r13d,r13d,5 + xor r14d,ebx + and r12d,r9d + vpaddd xmm6,xmm3,XMMWORD[96+rbp] + xor r13d,r9d + add eax,DWORD[60+rsp] + mov edi,ebx + xor r12d,r11d + shrd r14d,r14d,11 + xor edi,ecx + add eax,r12d + shrd r13d,r13d,6 + and r15d,edi + xor r14d,ebx + add eax,r13d + xor r15d,ecx + shrd r14d,r14d,2 + add r8d,eax + add eax,r15d + mov r13d,r8d + add r14d,eax + vmovdqa XMMWORD[48+rsp],xmm6 + cmp BYTE[131+rbp],0 + jne NEAR $L$avx_00_47 + shrd r13d,r13d,14 + mov eax,r14d + mov r12d,r9d + shrd r14d,r14d,9 + xor r13d,r8d + xor r12d,r10d + shrd r13d,r13d,5 + xor r14d,eax + and r12d,r8d + xor r13d,r8d + add r11d,DWORD[rsp] + mov r15d,eax + xor r12d,r10d + shrd r14d,r14d,11 + xor r15d,ebx + add r11d,r12d + shrd r13d,r13d,6 + and edi,r15d + xor r14d,eax + add r11d,r13d + xor edi,ebx + shrd r14d,r14d,2 + add edx,r11d + add r11d,edi + mov r13d,edx + add r14d,r11d + shrd r13d,r13d,14 + mov r11d,r14d + mov r12d,r8d + shrd r14d,r14d,9 + xor r13d,edx + xor r12d,r9d + shrd r13d,r13d,5 + xor r14d,r11d + and r12d,edx + xor r13d,edx + add r10d,DWORD[4+rsp] + mov edi,r11d + xor r12d,r9d + shrd r14d,r14d,11 + xor edi,eax + add r10d,r12d + shrd r13d,r13d,6 + and r15d,edi + xor r14d,r11d + add r10d,r13d + xor r15d,eax + shrd r14d,r14d,2 + add ecx,r10d + add r10d,r15d + mov r13d,ecx + add r14d,r10d + shrd r13d,r13d,14 + mov r10d,r14d + mov r12d,edx + shrd r14d,r14d,9 + xor r13d,ecx + xor r12d,r8d + shrd r13d,r13d,5 + xor r14d,r10d + and r12d,ecx + xor r13d,ecx + add 
r9d,DWORD[8+rsp] + mov r15d,r10d + xor r12d,r8d + shrd r14d,r14d,11 + xor r15d,r11d + add r9d,r12d + shrd r13d,r13d,6 + and edi,r15d + xor r14d,r10d + add r9d,r13d + xor edi,r11d + shrd r14d,r14d,2 + add ebx,r9d + add r9d,edi + mov r13d,ebx + add r14d,r9d + shrd r13d,r13d,14 + mov r9d,r14d + mov r12d,ecx + shrd r14d,r14d,9 + xor r13d,ebx + xor r12d,edx + shrd r13d,r13d,5 + xor r14d,r9d + and r12d,ebx + xor r13d,ebx + add r8d,DWORD[12+rsp] + mov edi,r9d + xor r12d,edx + shrd r14d,r14d,11 + xor edi,r10d + add r8d,r12d + shrd r13d,r13d,6 + and r15d,edi + xor r14d,r9d + add r8d,r13d + xor r15d,r10d + shrd r14d,r14d,2 + add eax,r8d + add r8d,r15d + mov r13d,eax + add r14d,r8d + shrd r13d,r13d,14 + mov r8d,r14d + mov r12d,ebx + shrd r14d,r14d,9 + xor r13d,eax + xor r12d,ecx + shrd r13d,r13d,5 + xor r14d,r8d + and r12d,eax + xor r13d,eax + add edx,DWORD[16+rsp] + mov r15d,r8d + xor r12d,ecx + shrd r14d,r14d,11 + xor r15d,r9d + add edx,r12d + shrd r13d,r13d,6 + and edi,r15d + xor r14d,r8d + add edx,r13d + xor edi,r9d + shrd r14d,r14d,2 + add r11d,edx + add edx,edi + mov r13d,r11d + add r14d,edx + shrd r13d,r13d,14 + mov edx,r14d + mov r12d,eax + shrd r14d,r14d,9 + xor r13d,r11d + xor r12d,ebx + shrd r13d,r13d,5 + xor r14d,edx + and r12d,r11d + xor r13d,r11d + add ecx,DWORD[20+rsp] + mov edi,edx + xor r12d,ebx + shrd r14d,r14d,11 + xor edi,r8d + add ecx,r12d + shrd r13d,r13d,6 + and r15d,edi + xor r14d,edx + add ecx,r13d + xor r15d,r8d + shrd r14d,r14d,2 + add r10d,ecx + add ecx,r15d + mov r13d,r10d + add r14d,ecx + shrd r13d,r13d,14 + mov ecx,r14d + mov r12d,r11d + shrd r14d,r14d,9 + xor r13d,r10d + xor r12d,eax + shrd r13d,r13d,5 + xor r14d,ecx + and r12d,r10d + xor r13d,r10d + add ebx,DWORD[24+rsp] + mov r15d,ecx + xor r12d,eax + shrd r14d,r14d,11 + xor r15d,edx + add ebx,r12d + shrd r13d,r13d,6 + and edi,r15d + xor r14d,ecx + add ebx,r13d + xor edi,edx + shrd r14d,r14d,2 + add r9d,ebx + add ebx,edi + mov r13d,r9d + add r14d,ebx + shrd r13d,r13d,14 + mov ebx,r14d + mov r12d,r10d + shrd r14d,r14d,9 + xor r13d,r9d + xor r12d,r11d + shrd r13d,r13d,5 + xor r14d,ebx + and r12d,r9d + xor r13d,r9d + add eax,DWORD[28+rsp] + mov edi,ebx + xor r12d,r11d + shrd r14d,r14d,11 + xor edi,ecx + add eax,r12d + shrd r13d,r13d,6 + and r15d,edi + xor r14d,ebx + add eax,r13d + xor r15d,ecx + shrd r14d,r14d,2 + add r8d,eax + add eax,r15d + mov r13d,r8d + add r14d,eax + shrd r13d,r13d,14 + mov eax,r14d + mov r12d,r9d + shrd r14d,r14d,9 + xor r13d,r8d + xor r12d,r10d + shrd r13d,r13d,5 + xor r14d,eax + and r12d,r8d + xor r13d,r8d + add r11d,DWORD[32+rsp] + mov r15d,eax + xor r12d,r10d + shrd r14d,r14d,11 + xor r15d,ebx + add r11d,r12d + shrd r13d,r13d,6 + and edi,r15d + xor r14d,eax + add r11d,r13d + xor edi,ebx + shrd r14d,r14d,2 + add edx,r11d + add r11d,edi + mov r13d,edx + add r14d,r11d + shrd r13d,r13d,14 + mov r11d,r14d + mov r12d,r8d + shrd r14d,r14d,9 + xor r13d,edx + xor r12d,r9d + shrd r13d,r13d,5 + xor r14d,r11d + and r12d,edx + xor r13d,edx + add r10d,DWORD[36+rsp] + mov edi,r11d + xor r12d,r9d + shrd r14d,r14d,11 + xor edi,eax + add r10d,r12d + shrd r13d,r13d,6 + and r15d,edi + xor r14d,r11d + add r10d,r13d + xor r15d,eax + shrd r14d,r14d,2 + add ecx,r10d + add r10d,r15d + mov r13d,ecx + add r14d,r10d + shrd r13d,r13d,14 + mov r10d,r14d + mov r12d,edx + shrd r14d,r14d,9 + xor r13d,ecx + xor r12d,r8d + shrd r13d,r13d,5 + xor r14d,r10d + and r12d,ecx + xor r13d,ecx + add r9d,DWORD[40+rsp] + mov r15d,r10d + xor r12d,r8d + shrd r14d,r14d,11 + xor r15d,r11d + add r9d,r12d + shrd r13d,r13d,6 + and edi,r15d + xor 
r14d,r10d + add r9d,r13d + xor edi,r11d + shrd r14d,r14d,2 + add ebx,r9d + add r9d,edi + mov r13d,ebx + add r14d,r9d + shrd r13d,r13d,14 + mov r9d,r14d + mov r12d,ecx + shrd r14d,r14d,9 + xor r13d,ebx + xor r12d,edx + shrd r13d,r13d,5 + xor r14d,r9d + and r12d,ebx + xor r13d,ebx + add r8d,DWORD[44+rsp] + mov edi,r9d + xor r12d,edx + shrd r14d,r14d,11 + xor edi,r10d + add r8d,r12d + shrd r13d,r13d,6 + and r15d,edi + xor r14d,r9d + add r8d,r13d + xor r15d,r10d + shrd r14d,r14d,2 + add eax,r8d + add r8d,r15d + mov r13d,eax + add r14d,r8d + shrd r13d,r13d,14 + mov r8d,r14d + mov r12d,ebx + shrd r14d,r14d,9 + xor r13d,eax + xor r12d,ecx + shrd r13d,r13d,5 + xor r14d,r8d + and r12d,eax + xor r13d,eax + add edx,DWORD[48+rsp] + mov r15d,r8d + xor r12d,ecx + shrd r14d,r14d,11 + xor r15d,r9d + add edx,r12d + shrd r13d,r13d,6 + and edi,r15d + xor r14d,r8d + add edx,r13d + xor edi,r9d + shrd r14d,r14d,2 + add r11d,edx + add edx,edi + mov r13d,r11d + add r14d,edx + shrd r13d,r13d,14 + mov edx,r14d + mov r12d,eax + shrd r14d,r14d,9 + xor r13d,r11d + xor r12d,ebx + shrd r13d,r13d,5 + xor r14d,edx + and r12d,r11d + xor r13d,r11d + add ecx,DWORD[52+rsp] + mov edi,edx + xor r12d,ebx + shrd r14d,r14d,11 + xor edi,r8d + add ecx,r12d + shrd r13d,r13d,6 + and r15d,edi + xor r14d,edx + add ecx,r13d + xor r15d,r8d + shrd r14d,r14d,2 + add r10d,ecx + add ecx,r15d + mov r13d,r10d + add r14d,ecx + shrd r13d,r13d,14 + mov ecx,r14d + mov r12d,r11d + shrd r14d,r14d,9 + xor r13d,r10d + xor r12d,eax + shrd r13d,r13d,5 + xor r14d,ecx + and r12d,r10d + xor r13d,r10d + add ebx,DWORD[56+rsp] + mov r15d,ecx + xor r12d,eax + shrd r14d,r14d,11 + xor r15d,edx + add ebx,r12d + shrd r13d,r13d,6 + and edi,r15d + xor r14d,ecx + add ebx,r13d + xor edi,edx + shrd r14d,r14d,2 + add r9d,ebx + add ebx,edi + mov r13d,r9d + add r14d,ebx + shrd r13d,r13d,14 + mov ebx,r14d + mov r12d,r10d + shrd r14d,r14d,9 + xor r13d,r9d + xor r12d,r11d + shrd r13d,r13d,5 + xor r14d,ebx + and r12d,r9d + xor r13d,r9d + add eax,DWORD[60+rsp] + mov edi,ebx + xor r12d,r11d + shrd r14d,r14d,11 + xor edi,ecx + add eax,r12d + shrd r13d,r13d,6 + and r15d,edi + xor r14d,ebx + add eax,r13d + xor r15d,ecx + shrd r14d,r14d,2 + add r8d,eax + add eax,r15d + mov r13d,r8d + add r14d,eax + mov rdi,QWORD[((64+0))+rsp] + mov eax,r14d + + add eax,DWORD[rdi] + lea rsi,[64+rsi] + add ebx,DWORD[4+rdi] + add ecx,DWORD[8+rdi] + add edx,DWORD[12+rdi] + add r8d,DWORD[16+rdi] + add r9d,DWORD[20+rdi] + add r10d,DWORD[24+rdi] + add r11d,DWORD[28+rdi] + + cmp rsi,QWORD[((64+16))+rsp] + + mov DWORD[rdi],eax + mov DWORD[4+rdi],ebx + mov DWORD[8+rdi],ecx + mov DWORD[12+rdi],edx + mov DWORD[16+rdi],r8d + mov DWORD[20+rdi],r9d + mov DWORD[24+rdi],r10d + mov DWORD[28+rdi],r11d + jb NEAR $L$loop_avx + + mov rsi,QWORD[88+rsp] + + vzeroupper + movaps xmm6,XMMWORD[((64+32))+rsp] + movaps xmm7,XMMWORD[((64+48))+rsp] + movaps xmm8,XMMWORD[((64+64))+rsp] + movaps xmm9,XMMWORD[((64+80))+rsp] + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbp,QWORD[((-16))+rsi] + + mov rbx,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$epilogue_avx: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_sha256_block_data_order_avx: +EXTERN __imp_RtlVirtualUnwind + +ALIGN 16 +se_handler: + push rsi + push rdi + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + pushfq + sub rsp,64 + + mov rax,QWORD[120+r8] + mov rbx,QWORD[248+r8] + + mov rsi,QWORD[8+r9] + mov r11,QWORD[56+r9] + + mov r10d,DWORD[r11] + 
lea r10,[r10*1+rsi] + cmp rbx,r10 + jb NEAR $L$in_prologue + + mov rax,QWORD[152+r8] + + mov r10d,DWORD[4+r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jae NEAR $L$in_prologue + mov rsi,rax + mov rax,QWORD[((64+24))+rax] + + mov rbx,QWORD[((-8))+rax] + mov rbp,QWORD[((-16))+rax] + mov r12,QWORD[((-24))+rax] + mov r13,QWORD[((-32))+rax] + mov r14,QWORD[((-40))+rax] + mov r15,QWORD[((-48))+rax] + mov QWORD[144+r8],rbx + mov QWORD[160+r8],rbp + mov QWORD[216+r8],r12 + mov QWORD[224+r8],r13 + mov QWORD[232+r8],r14 + mov QWORD[240+r8],r15 + + lea r10,[$L$epilogue] + cmp rbx,r10 + jb NEAR $L$in_prologue + + lea rsi,[((64+32))+rsi] + lea rdi,[512+r8] + mov ecx,8 + DD 0xa548f3fc + +$L$in_prologue: + mov rdi,QWORD[8+rax] + mov rsi,QWORD[16+rax] + mov QWORD[152+r8],rax + mov QWORD[168+r8],rsi + mov QWORD[176+r8],rdi + + mov rdi,QWORD[40+r9] + mov rsi,r8 + mov ecx,154 + DD 0xa548f3fc + + mov rsi,r9 + xor rcx,rcx + mov rdx,QWORD[8+rsi] + mov r8,QWORD[rsi] + mov r9,QWORD[16+rsi] + mov r10,QWORD[40+rsi] + lea r11,[56+rsi] + lea r12,[24+rsi] + mov QWORD[32+rsp],r10 + mov QWORD[40+rsp],r11 + mov QWORD[48+rsp],r12 + mov QWORD[56+rsp],rcx + call QWORD[__imp_RtlVirtualUnwind] + + mov eax,1 + add rsp,64 + popfq + pop r15 + pop r14 + pop r13 + pop r12 + pop rbp + pop rbx + pop rdi + pop rsi + ret + + +ALIGN 16 +shaext_handler: + push rsi + push rdi + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + pushfq + sub rsp,64 + + mov rax,QWORD[120+r8] + mov rbx,QWORD[248+r8] + + lea r10,[$L$prologue_shaext] + cmp rbx,r10 + jb NEAR $L$in_prologue + + lea r10,[$L$epilogue_shaext] + cmp rbx,r10 + jae NEAR $L$in_prologue + + lea rsi,[((-8-80))+rax] + lea rdi,[512+r8] + mov ecx,10 + DD 0xa548f3fc + + jmp NEAR $L$in_prologue + +section .pdata rdata align=4 +ALIGN 4 + DD $L$SEH_begin_sha256_block_data_order_nohw wrt ..imagebase + DD $L$SEH_end_sha256_block_data_order_nohw wrt ..imagebase + DD $L$SEH_info_sha256_block_data_order_nohw wrt ..imagebase + DD $L$SEH_begin_sha256_block_data_order_hw wrt ..imagebase + DD $L$SEH_end_sha256_block_data_order_hw wrt ..imagebase + DD $L$SEH_info_sha256_block_data_order_hw wrt ..imagebase + DD $L$SEH_begin_sha256_block_data_order_ssse3 wrt ..imagebase + DD $L$SEH_end_sha256_block_data_order_ssse3 wrt ..imagebase + DD $L$SEH_info_sha256_block_data_order_ssse3 wrt ..imagebase + DD $L$SEH_begin_sha256_block_data_order_avx wrt ..imagebase + DD $L$SEH_end_sha256_block_data_order_avx wrt ..imagebase + DD $L$SEH_info_sha256_block_data_order_avx wrt ..imagebase +section .xdata rdata align=8 +ALIGN 8 +$L$SEH_info_sha256_block_data_order_nohw: + DB 9,0,0,0 + DD se_handler wrt ..imagebase + DD $L$prologue wrt ..imagebase,$L$epilogue wrt ..imagebase +$L$SEH_info_sha256_block_data_order_hw: + DB 9,0,0,0 + DD shaext_handler wrt ..imagebase +$L$SEH_info_sha256_block_data_order_ssse3: + DB 9,0,0,0 + DD se_handler wrt ..imagebase + DD $L$prologue_ssse3 wrt ..imagebase,$L$epilogue_ssse3 wrt ..imagebase +$L$SEH_info_sha256_block_data_order_avx: + DB 9,0,0,0 + DD se_handler wrt ..imagebase + DD $L$prologue_avx wrt ..imagebase,$L$epilogue_avx wrt ..imagebase +%else +; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 +ret +%endif diff --git a/ring-0.17.14/pregenerated/sha256-x86_64-nasm.o b/ring-0.17.14/pregenerated/sha256-x86_64-nasm.o new file mode 100644 index 0000000000..fcd5723481 Binary files /dev/null and b/ring-0.17.14/pregenerated/sha256-x86_64-nasm.o differ diff --git a/ring-0.17.14/pregenerated/sha512-armv4-linux32.S b/ring-0.17.14/pregenerated/sha512-armv4-linux32.S new 
file mode 100644 index 0000000000..9150b0f405 --- /dev/null +++ b/ring-0.17.14/pregenerated/sha512-armv4-linux32.S @@ -0,0 +1,1857 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__ELF__) +@ Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved. +@ +@ Licensed under the Apache License, Version 2.0 (the "License"); +@ you may not use this file except in compliance with the License. +@ You may obtain a copy of the License at +@ +@ https://www.apache.org/licenses/LICENSE-2.0 +@ +@ Unless required by applicable law or agreed to in writing, software +@ distributed under the License is distributed on an "AS IS" BASIS, +@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +@ See the License for the specific language governing permissions and +@ limitations under the License. + + +@ ==================================================================== +@ Written by Andy Polyakov for the OpenSSL +@ project. +@ ==================================================================== + +@ SHA512 block procedure for ARMv4. September 2007. + +@ This code is ~4.5 (four and a half) times faster than code generated +@ by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue +@ Xscale PXA250 core]. +@ +@ July 2010. +@ +@ Rescheduling for dual-issue pipeline resulted in 6% improvement on +@ Cortex A8 core and ~40 cycles per processed byte. + +@ February 2011. +@ +@ Profiler-assisted and platform-specific optimization resulted in 7% +@ improvement on Coxtex A8 core and ~38 cycles per byte. + +@ March 2011. +@ +@ Add NEON implementation. On Cortex A8 it was measured to process +@ one byte in 23.3 cycles or ~60% faster than integer-only code. + +@ August 2012. +@ +@ Improve NEON performance by 12% on Snapdragon S4. In absolute +@ terms it's 22.6 cycles per byte, which is disappointing result. +@ Technical writers asserted that 3-way S4 pipeline can sustain +@ multiple NEON instructions per cycle, but dual NEON issue could +@ not be observed, see http://www.openssl.org/~appro/Snapdragon-S4.html +@ for further details. On side note Cortex-A15 processes one byte in +@ 16 cycles. + +@ Byte order [in]dependence. ========================================= +@ +@ Originally caller was expected to maintain specific *dword* order in +@ h[0-7], namely with most significant dword at *lower* address, which +@ was reflected in below two parameters as 0 and 4. Now caller is +@ expected to maintain native byte order for whole 64-bit values. +#ifndef __KERNEL__ +# define VFP_ABI_PUSH vstmdb sp!,{d8-d15} +# define VFP_ABI_POP vldmia sp!,{d8-d15} +#else +# define __ARM_MAX_ARCH__ 7 +# define VFP_ABI_PUSH +# define VFP_ABI_POP +#endif + +@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both +@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions. 
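The integer-only path in this file never touches 64-bit registers: every 64-bit rotation in the SHA-512 round function is rebuilt from 32-bit shifts and XORs over the value's low and high halves, exactly as the "LO/HI" comments inside sha512_block_data_order_nohw spell out for Sigma1(x) = ROTR(x,14) ^ ROTR(x,18) ^ ROTR(x,41). Below is a minimal Rust sketch of that decomposition (the helper names and the test value are illustrative assumptions, not taken from the vendored source); it checks the half-word formula against a plain 64-bit rotate.

    // Illustrative sketch of the 32-bit decomposition used by the ARMv4
    // integer-only SHA-512 code; names and test value are hypothetical.

    fn sigma1_64(x: u64) -> u64 {
        // FIPS 180-4: Sigma1(x) = ROTR(x,14) ^ ROTR(x,18) ^ ROTR(x,41)
        x.rotate_right(14) ^ x.rotate_right(18) ^ x.rotate_right(41)
    }

    // Same value computed from 32-bit halves, the way the assembly does it:
    //   ROTR64(x,n) for 0<n<32:  lo' = lo>>n | hi<<(32-n),  hi' = hi>>n | lo<<(32-n)
    //   ROTR64(x,41) = rotate by 32 (swap halves), then rotate by 9.
    fn sigma1_halves(lo: u32, hi: u32) -> (u32, u32) {
        let rotr = |lo: u32, hi: u32, n: u32| {
            (lo >> n | hi << (32 - n), hi >> n | lo << (32 - n))
        };
        let (a_lo, a_hi) = rotr(lo, hi, 14);
        let (b_lo, b_hi) = rotr(lo, hi, 18);
        let (c_lo, c_hi) = rotr(hi, lo, 9); // halves swapped = rotate by 32
        (a_lo ^ b_lo ^ c_lo, a_hi ^ b_hi ^ c_hi)
    }

    fn main() {
        let x: u64 = 0x0123_4567_89ab_cdef; // arbitrary test value
        let (lo, hi) = (x as u32, (x >> 32) as u32);
        let (s_lo, s_hi) = sigma1_halves(lo, hi);
        assert_eq!(sigma1_64(x), ((s_hi as u64) << 32) | s_lo as u64);
    }

The low-half result (lo>>14 ^ hi<<18) ^ (lo>>18 ^ hi<<14) ^ (hi>>9 ^ lo<<23) matches the "LO" comment in the round code below, and the high half matches the "HI" comment, which is why the generated code can run on cores without 64-bit integer support.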
+.arch armv7-a + +#ifdef __ARMEL__ +# define LO 0 +# define HI 4 +# define WORD64(hi0,lo0,hi1,lo1) .word lo0,hi0, lo1,hi1 +#else +# define HI 0 +# define LO 4 +# define WORD64(hi0,lo0,hi1,lo1) .word hi0,lo0, hi1,lo1 +#endif + +.text +#if defined(__thumb2__) +.syntax unified +.thumb +# define adrl adr +#else +.code 32 +#endif + +.type K512,%object +.align 5 +K512: + WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd) + WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc) + WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019) + WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118) + WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe) + WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2) + WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1) + WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694) + WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3) + WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65) + WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483) + WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5) + WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210) + WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4) + WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725) + WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70) + WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926) + WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df) + WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8) + WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b) + WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001) + WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30) + WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910) + WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8) + WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53) + WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8) + WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb) + WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3) + WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60) + WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec) + WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9) + WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b) + WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207) + WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178) + WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6) + WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b) + WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493) + WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c) + WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a) + WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817) +.size K512,.-K512 + +.globl sha512_block_data_order_nohw +.hidden sha512_block_data_order_nohw +.type sha512_block_data_order_nohw,%function +sha512_block_data_order_nohw: + add r2,r1,r2,lsl#7 @ len to point at the end of inp + stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} + adr r14,K512 + sub sp,sp,#9*8 + + ldr r7,[r0,#32+LO] + ldr r8,[r0,#32+HI] + ldr r9, [r0,#48+LO] + ldr r10, [r0,#48+HI] + ldr r11, [r0,#56+LO] + ldr r12, [r0,#56+HI] +.Loop: + str r9, [sp,#48+0] + str r10, [sp,#48+4] + str r11, [sp,#56+0] + str r12, [sp,#56+4] + ldr r5,[r0,#0+LO] + ldr r6,[r0,#0+HI] + ldr r3,[r0,#8+LO] + ldr r4,[r0,#8+HI] + ldr r9, [r0,#16+LO] + ldr r10, [r0,#16+HI] + ldr r11, [r0,#24+LO] + ldr r12, [r0,#24+HI] + str r3,[sp,#8+0] + str r4,[sp,#8+4] + str r9, [sp,#16+0] + str r10, [sp,#16+4] + str r11, [sp,#24+0] + str r12, [sp,#24+4] + ldr r3,[r0,#40+LO] + ldr r4,[r0,#40+HI] + str r3,[sp,#40+0] + str r4,[sp,#40+4] + +.L00_15: +#if __ARM_ARCH<7 + ldrb r3,[r1,#7] + ldrb r9, [r1,#6] + ldrb 
r10, [r1,#5] + ldrb r11, [r1,#4] + ldrb r4,[r1,#3] + ldrb r12, [r1,#2] + orr r3,r3,r9,lsl#8 + ldrb r9, [r1,#1] + orr r3,r3,r10,lsl#16 + ldrb r10, [r1],#8 + orr r3,r3,r11,lsl#24 + orr r4,r4,r12,lsl#8 + orr r4,r4,r9,lsl#16 + orr r4,r4,r10,lsl#24 +#else + ldr r3,[r1,#4] + ldr r4,[r1],#8 +#ifdef __ARMEL__ + rev r3,r3 + rev r4,r4 +#endif +#endif + @ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41)) + @ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23 + @ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23 + mov r9,r7,lsr#14 + str r3,[sp,#64+0] + mov r10,r8,lsr#14 + str r4,[sp,#64+4] + eor r9,r9,r8,lsl#18 + ldr r11,[sp,#56+0] @ h.lo + eor r10,r10,r7,lsl#18 + ldr r12,[sp,#56+4] @ h.hi + eor r9,r9,r7,lsr#18 + eor r10,r10,r8,lsr#18 + eor r9,r9,r8,lsl#14 + eor r10,r10,r7,lsl#14 + eor r9,r9,r8,lsr#9 + eor r10,r10,r7,lsr#9 + eor r9,r9,r7,lsl#23 + eor r10,r10,r8,lsl#23 @ Sigma1(e) + adds r3,r3,r9 + ldr r9,[sp,#40+0] @ f.lo + adc r4,r4,r10 @ T += Sigma1(e) + ldr r10,[sp,#40+4] @ f.hi + adds r3,r3,r11 + ldr r11,[sp,#48+0] @ g.lo + adc r4,r4,r12 @ T += h + ldr r12,[sp,#48+4] @ g.hi + + eor r9,r9,r11 + str r7,[sp,#32+0] + eor r10,r10,r12 + str r8,[sp,#32+4] + and r9,r9,r7 + str r5,[sp,#0+0] + and r10,r10,r8 + str r6,[sp,#0+4] + eor r9,r9,r11 + ldr r11,[r14,#LO] @ K[i].lo + eor r10,r10,r12 @ Ch(e,f,g) + ldr r12,[r14,#HI] @ K[i].hi + + adds r3,r3,r9 + ldr r7,[sp,#24+0] @ d.lo + adc r4,r4,r10 @ T += Ch(e,f,g) + ldr r8,[sp,#24+4] @ d.hi + adds r3,r3,r11 + and r9,r11,#0xff + adc r4,r4,r12 @ T += K[i] + adds r7,r7,r3 + ldr r11,[sp,#8+0] @ b.lo + adc r8,r8,r4 @ d += T + teq r9,#148 + + ldr r12,[sp,#16+0] @ c.lo +#if __ARM_ARCH>=7 + it eq @ Thumb2 thing, sanity check in ARM +#endif + orreq r14,r14,#1 + @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39)) + @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25 + @ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25 + mov r9,r5,lsr#28 + mov r10,r6,lsr#28 + eor r9,r9,r6,lsl#4 + eor r10,r10,r5,lsl#4 + eor r9,r9,r6,lsr#2 + eor r10,r10,r5,lsr#2 + eor r9,r9,r5,lsl#30 + eor r10,r10,r6,lsl#30 + eor r9,r9,r6,lsr#7 + eor r10,r10,r5,lsr#7 + eor r9,r9,r5,lsl#25 + eor r10,r10,r6,lsl#25 @ Sigma0(a) + adds r3,r3,r9 + and r9,r5,r11 + adc r4,r4,r10 @ T += Sigma0(a) + + ldr r10,[sp,#8+4] @ b.hi + orr r5,r5,r11 + ldr r11,[sp,#16+4] @ c.hi + and r5,r5,r12 + and r12,r6,r10 + orr r6,r6,r10 + orr r5,r5,r9 @ Maj(a,b,c).lo + and r6,r6,r11 + adds r5,r5,r3 + orr r6,r6,r12 @ Maj(a,b,c).hi + sub sp,sp,#8 + adc r6,r6,r4 @ h += T + tst r14,#1 + add r14,r14,#8 + tst r14,#1 + beq .L00_15 + ldr r9,[sp,#184+0] + ldr r10,[sp,#184+4] + bic r14,r14,#1 +.L16_79: + @ sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7)) + @ LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25 + @ HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7 + mov r3,r9,lsr#1 + ldr r11,[sp,#80+0] + mov r4,r10,lsr#1 + ldr r12,[sp,#80+4] + eor r3,r3,r10,lsl#31 + eor r4,r4,r9,lsl#31 + eor r3,r3,r9,lsr#8 + eor r4,r4,r10,lsr#8 + eor r3,r3,r10,lsl#24 + eor r4,r4,r9,lsl#24 + eor r3,r3,r9,lsr#7 + eor r4,r4,r10,lsr#7 + eor r3,r3,r10,lsl#25 + + @ sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6)) + @ LO lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26 + @ HI hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6 + mov r9,r11,lsr#19 + mov r10,r12,lsr#19 + eor r9,r9,r12,lsl#13 + eor r10,r10,r11,lsl#13 + eor r9,r9,r12,lsr#29 + eor r10,r10,r11,lsr#29 + eor r9,r9,r11,lsl#3 + eor r10,r10,r12,lsl#3 + eor r9,r9,r11,lsr#6 + eor r10,r10,r12,lsr#6 + ldr r11,[sp,#120+0] + eor r9,r9,r12,lsl#26 + + ldr r12,[sp,#120+4] + adds r3,r3,r9 + ldr r9,[sp,#192+0] + adc r4,r4,r10 + + ldr r10,[sp,#192+4] + adds r3,r3,r11 
+ adc r4,r4,r12 + adds r3,r3,r9 + adc r4,r4,r10 + @ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41)) + @ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23 + @ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23 + mov r9,r7,lsr#14 + str r3,[sp,#64+0] + mov r10,r8,lsr#14 + str r4,[sp,#64+4] + eor r9,r9,r8,lsl#18 + ldr r11,[sp,#56+0] @ h.lo + eor r10,r10,r7,lsl#18 + ldr r12,[sp,#56+4] @ h.hi + eor r9,r9,r7,lsr#18 + eor r10,r10,r8,lsr#18 + eor r9,r9,r8,lsl#14 + eor r10,r10,r7,lsl#14 + eor r9,r9,r8,lsr#9 + eor r10,r10,r7,lsr#9 + eor r9,r9,r7,lsl#23 + eor r10,r10,r8,lsl#23 @ Sigma1(e) + adds r3,r3,r9 + ldr r9,[sp,#40+0] @ f.lo + adc r4,r4,r10 @ T += Sigma1(e) + ldr r10,[sp,#40+4] @ f.hi + adds r3,r3,r11 + ldr r11,[sp,#48+0] @ g.lo + adc r4,r4,r12 @ T += h + ldr r12,[sp,#48+4] @ g.hi + + eor r9,r9,r11 + str r7,[sp,#32+0] + eor r10,r10,r12 + str r8,[sp,#32+4] + and r9,r9,r7 + str r5,[sp,#0+0] + and r10,r10,r8 + str r6,[sp,#0+4] + eor r9,r9,r11 + ldr r11,[r14,#LO] @ K[i].lo + eor r10,r10,r12 @ Ch(e,f,g) + ldr r12,[r14,#HI] @ K[i].hi + + adds r3,r3,r9 + ldr r7,[sp,#24+0] @ d.lo + adc r4,r4,r10 @ T += Ch(e,f,g) + ldr r8,[sp,#24+4] @ d.hi + adds r3,r3,r11 + and r9,r11,#0xff + adc r4,r4,r12 @ T += K[i] + adds r7,r7,r3 + ldr r11,[sp,#8+0] @ b.lo + adc r8,r8,r4 @ d += T + teq r9,#23 + + ldr r12,[sp,#16+0] @ c.lo +#if __ARM_ARCH>=7 + it eq @ Thumb2 thing, sanity check in ARM +#endif + orreq r14,r14,#1 + @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39)) + @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25 + @ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25 + mov r9,r5,lsr#28 + mov r10,r6,lsr#28 + eor r9,r9,r6,lsl#4 + eor r10,r10,r5,lsl#4 + eor r9,r9,r6,lsr#2 + eor r10,r10,r5,lsr#2 + eor r9,r9,r5,lsl#30 + eor r10,r10,r6,lsl#30 + eor r9,r9,r6,lsr#7 + eor r10,r10,r5,lsr#7 + eor r9,r9,r5,lsl#25 + eor r10,r10,r6,lsl#25 @ Sigma0(a) + adds r3,r3,r9 + and r9,r5,r11 + adc r4,r4,r10 @ T += Sigma0(a) + + ldr r10,[sp,#8+4] @ b.hi + orr r5,r5,r11 + ldr r11,[sp,#16+4] @ c.hi + and r5,r5,r12 + and r12,r6,r10 + orr r6,r6,r10 + orr r5,r5,r9 @ Maj(a,b,c).lo + and r6,r6,r11 + adds r5,r5,r3 + orr r6,r6,r12 @ Maj(a,b,c).hi + sub sp,sp,#8 + adc r6,r6,r4 @ h += T + tst r14,#1 + add r14,r14,#8 +#if __ARM_ARCH>=7 + ittt eq @ Thumb2 thing, sanity check in ARM +#endif + ldreq r9,[sp,#184+0] + ldreq r10,[sp,#184+4] + beq .L16_79 + bic r14,r14,#1 + + ldr r3,[sp,#8+0] + ldr r4,[sp,#8+4] + ldr r9, [r0,#0+LO] + ldr r10, [r0,#0+HI] + ldr r11, [r0,#8+LO] + ldr r12, [r0,#8+HI] + adds r9,r5,r9 + str r9, [r0,#0+LO] + adc r10,r6,r10 + str r10, [r0,#0+HI] + adds r11,r3,r11 + str r11, [r0,#8+LO] + adc r12,r4,r12 + str r12, [r0,#8+HI] + + ldr r5,[sp,#16+0] + ldr r6,[sp,#16+4] + ldr r3,[sp,#24+0] + ldr r4,[sp,#24+4] + ldr r9, [r0,#16+LO] + ldr r10, [r0,#16+HI] + ldr r11, [r0,#24+LO] + ldr r12, [r0,#24+HI] + adds r9,r5,r9 + str r9, [r0,#16+LO] + adc r10,r6,r10 + str r10, [r0,#16+HI] + adds r11,r3,r11 + str r11, [r0,#24+LO] + adc r12,r4,r12 + str r12, [r0,#24+HI] + + ldr r3,[sp,#40+0] + ldr r4,[sp,#40+4] + ldr r9, [r0,#32+LO] + ldr r10, [r0,#32+HI] + ldr r11, [r0,#40+LO] + ldr r12, [r0,#40+HI] + adds r7,r7,r9 + str r7,[r0,#32+LO] + adc r8,r8,r10 + str r8,[r0,#32+HI] + adds r11,r3,r11 + str r11, [r0,#40+LO] + adc r12,r4,r12 + str r12, [r0,#40+HI] + + ldr r5,[sp,#48+0] + ldr r6,[sp,#48+4] + ldr r3,[sp,#56+0] + ldr r4,[sp,#56+4] + ldr r9, [r0,#48+LO] + ldr r10, [r0,#48+HI] + ldr r11, [r0,#56+LO] + ldr r12, [r0,#56+HI] + adds r9,r5,r9 + str r9, [r0,#48+LO] + adc r10,r6,r10 + str r10, [r0,#48+HI] + adds r11,r3,r11 + str r11, [r0,#56+LO] + adc 
r12,r4,r12 + str r12, [r0,#56+HI] + + add sp,sp,#640 + sub r14,r14,#640 + + teq r1,r2 + bne .Loop + + add sp,sp,#8*9 @ destroy frame +#if __ARM_ARCH>=5 + ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc} +#else + ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet +.word 0xe12fff1e @ interoperable with Thumb ISA:-) +#endif +.size sha512_block_data_order_nohw,.-sha512_block_data_order_nohw +#if __ARM_MAX_ARCH__>=7 +.arch armv7-a +.fpu neon + +.globl sha512_block_data_order_neon +.hidden sha512_block_data_order_neon +.type sha512_block_data_order_neon,%function +.align 4 +sha512_block_data_order_neon: + dmb @ errata #451034 on early Cortex A8 + add r2,r1,r2,lsl#7 @ len to point at the end of inp + adr r3,K512 + VFP_ABI_PUSH + vldmia r0,{d16,d17,d18,d19,d20,d21,d22,d23} @ load context +.Loop_neon: + vshr.u64 d24,d20,#14 @ 0 +#if 0<16 + vld1.64 {d0},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d20,#18 +#if 0>0 + vadd.i64 d16,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d20,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d20,#50 + vsli.64 d25,d20,#46 + vmov d29,d20 + vsli.64 d26,d20,#23 +#if 0<16 && defined(__ARMEL__) + vrev64.8 d0,d0 +#endif + veor d25,d24 + vbsl d29,d21,d22 @ Ch(e,f,g) + vshr.u64 d24,d16,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d23 + vshr.u64 d25,d16,#34 + vsli.64 d24,d16,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d16,#39 + vadd.i64 d28,d0 + vsli.64 d25,d16,#30 + veor d30,d16,d17 + vsli.64 d26,d16,#25 + veor d23,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d18,d17 @ Maj(a,b,c) + veor d23,d26 @ Sigma0(a) + vadd.i64 d19,d27 + vadd.i64 d30,d27 + @ vadd.i64 d23,d30 + vshr.u64 d24,d19,#14 @ 1 +#if 1<16 + vld1.64 {d1},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d19,#18 +#if 1>0 + vadd.i64 d23,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d19,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d19,#50 + vsli.64 d25,d19,#46 + vmov d29,d19 + vsli.64 d26,d19,#23 +#if 1<16 && defined(__ARMEL__) + vrev64.8 d1,d1 +#endif + veor d25,d24 + vbsl d29,d20,d21 @ Ch(e,f,g) + vshr.u64 d24,d23,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d22 + vshr.u64 d25,d23,#34 + vsli.64 d24,d23,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d23,#39 + vadd.i64 d28,d1 + vsli.64 d25,d23,#30 + veor d30,d23,d16 + vsli.64 d26,d23,#25 + veor d22,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d17,d16 @ Maj(a,b,c) + veor d22,d26 @ Sigma0(a) + vadd.i64 d18,d27 + vadd.i64 d30,d27 + @ vadd.i64 d22,d30 + vshr.u64 d24,d18,#14 @ 2 +#if 2<16 + vld1.64 {d2},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d18,#18 +#if 2>0 + vadd.i64 d22,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d18,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d18,#50 + vsli.64 d25,d18,#46 + vmov d29,d18 + vsli.64 d26,d18,#23 +#if 2<16 && defined(__ARMEL__) + vrev64.8 d2,d2 +#endif + veor d25,d24 + vbsl d29,d19,d20 @ Ch(e,f,g) + vshr.u64 d24,d22,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d21 + vshr.u64 d25,d22,#34 + vsli.64 d24,d22,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d22,#39 + vadd.i64 d28,d2 + vsli.64 d25,d22,#30 + veor d30,d22,d23 + vsli.64 d26,d22,#25 + veor d21,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d16,d23 @ Maj(a,b,c) + veor d21,d26 @ Sigma0(a) + vadd.i64 d17,d27 + vadd.i64 d30,d27 + @ vadd.i64 d21,d30 + vshr.u64 d24,d17,#14 @ 3 +#if 3<16 + vld1.64 {d3},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d17,#18 +#if 3>0 + vadd.i64 d21,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d17,#41 + vld1.64 {d28},[r3,:64]! 
@ K[i++] + vsli.64 d24,d17,#50 + vsli.64 d25,d17,#46 + vmov d29,d17 + vsli.64 d26,d17,#23 +#if 3<16 && defined(__ARMEL__) + vrev64.8 d3,d3 +#endif + veor d25,d24 + vbsl d29,d18,d19 @ Ch(e,f,g) + vshr.u64 d24,d21,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d20 + vshr.u64 d25,d21,#34 + vsli.64 d24,d21,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d21,#39 + vadd.i64 d28,d3 + vsli.64 d25,d21,#30 + veor d30,d21,d22 + vsli.64 d26,d21,#25 + veor d20,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d23,d22 @ Maj(a,b,c) + veor d20,d26 @ Sigma0(a) + vadd.i64 d16,d27 + vadd.i64 d30,d27 + @ vadd.i64 d20,d30 + vshr.u64 d24,d16,#14 @ 4 +#if 4<16 + vld1.64 {d4},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d16,#18 +#if 4>0 + vadd.i64 d20,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d16,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d16,#50 + vsli.64 d25,d16,#46 + vmov d29,d16 + vsli.64 d26,d16,#23 +#if 4<16 && defined(__ARMEL__) + vrev64.8 d4,d4 +#endif + veor d25,d24 + vbsl d29,d17,d18 @ Ch(e,f,g) + vshr.u64 d24,d20,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d19 + vshr.u64 d25,d20,#34 + vsli.64 d24,d20,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d20,#39 + vadd.i64 d28,d4 + vsli.64 d25,d20,#30 + veor d30,d20,d21 + vsli.64 d26,d20,#25 + veor d19,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d22,d21 @ Maj(a,b,c) + veor d19,d26 @ Sigma0(a) + vadd.i64 d23,d27 + vadd.i64 d30,d27 + @ vadd.i64 d19,d30 + vshr.u64 d24,d23,#14 @ 5 +#if 5<16 + vld1.64 {d5},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d23,#18 +#if 5>0 + vadd.i64 d19,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d23,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d23,#50 + vsli.64 d25,d23,#46 + vmov d29,d23 + vsli.64 d26,d23,#23 +#if 5<16 && defined(__ARMEL__) + vrev64.8 d5,d5 +#endif + veor d25,d24 + vbsl d29,d16,d17 @ Ch(e,f,g) + vshr.u64 d24,d19,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d18 + vshr.u64 d25,d19,#34 + vsli.64 d24,d19,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d19,#39 + vadd.i64 d28,d5 + vsli.64 d25,d19,#30 + veor d30,d19,d20 + vsli.64 d26,d19,#25 + veor d18,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d21,d20 @ Maj(a,b,c) + veor d18,d26 @ Sigma0(a) + vadd.i64 d22,d27 + vadd.i64 d30,d27 + @ vadd.i64 d18,d30 + vshr.u64 d24,d22,#14 @ 6 +#if 6<16 + vld1.64 {d6},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d22,#18 +#if 6>0 + vadd.i64 d18,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d22,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d22,#50 + vsli.64 d25,d22,#46 + vmov d29,d22 + vsli.64 d26,d22,#23 +#if 6<16 && defined(__ARMEL__) + vrev64.8 d6,d6 +#endif + veor d25,d24 + vbsl d29,d23,d16 @ Ch(e,f,g) + vshr.u64 d24,d18,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d17 + vshr.u64 d25,d18,#34 + vsli.64 d24,d18,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d18,#39 + vadd.i64 d28,d6 + vsli.64 d25,d18,#30 + veor d30,d18,d19 + vsli.64 d26,d18,#25 + veor d17,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d20,d19 @ Maj(a,b,c) + veor d17,d26 @ Sigma0(a) + vadd.i64 d21,d27 + vadd.i64 d30,d27 + @ vadd.i64 d17,d30 + vshr.u64 d24,d21,#14 @ 7 +#if 7<16 + vld1.64 {d7},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d21,#18 +#if 7>0 + vadd.i64 d17,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d21,#41 + vld1.64 {d28},[r3,:64]! 
@ K[i++] + vsli.64 d24,d21,#50 + vsli.64 d25,d21,#46 + vmov d29,d21 + vsli.64 d26,d21,#23 +#if 7<16 && defined(__ARMEL__) + vrev64.8 d7,d7 +#endif + veor d25,d24 + vbsl d29,d22,d23 @ Ch(e,f,g) + vshr.u64 d24,d17,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d16 + vshr.u64 d25,d17,#34 + vsli.64 d24,d17,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d17,#39 + vadd.i64 d28,d7 + vsli.64 d25,d17,#30 + veor d30,d17,d18 + vsli.64 d26,d17,#25 + veor d16,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d19,d18 @ Maj(a,b,c) + veor d16,d26 @ Sigma0(a) + vadd.i64 d20,d27 + vadd.i64 d30,d27 + @ vadd.i64 d16,d30 + vshr.u64 d24,d20,#14 @ 8 +#if 8<16 + vld1.64 {d8},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d20,#18 +#if 8>0 + vadd.i64 d16,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d20,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d20,#50 + vsli.64 d25,d20,#46 + vmov d29,d20 + vsli.64 d26,d20,#23 +#if 8<16 && defined(__ARMEL__) + vrev64.8 d8,d8 +#endif + veor d25,d24 + vbsl d29,d21,d22 @ Ch(e,f,g) + vshr.u64 d24,d16,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d23 + vshr.u64 d25,d16,#34 + vsli.64 d24,d16,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d16,#39 + vadd.i64 d28,d8 + vsli.64 d25,d16,#30 + veor d30,d16,d17 + vsli.64 d26,d16,#25 + veor d23,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d18,d17 @ Maj(a,b,c) + veor d23,d26 @ Sigma0(a) + vadd.i64 d19,d27 + vadd.i64 d30,d27 + @ vadd.i64 d23,d30 + vshr.u64 d24,d19,#14 @ 9 +#if 9<16 + vld1.64 {d9},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d19,#18 +#if 9>0 + vadd.i64 d23,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d19,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d19,#50 + vsli.64 d25,d19,#46 + vmov d29,d19 + vsli.64 d26,d19,#23 +#if 9<16 && defined(__ARMEL__) + vrev64.8 d9,d9 +#endif + veor d25,d24 + vbsl d29,d20,d21 @ Ch(e,f,g) + vshr.u64 d24,d23,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d22 + vshr.u64 d25,d23,#34 + vsli.64 d24,d23,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d23,#39 + vadd.i64 d28,d9 + vsli.64 d25,d23,#30 + veor d30,d23,d16 + vsli.64 d26,d23,#25 + veor d22,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d17,d16 @ Maj(a,b,c) + veor d22,d26 @ Sigma0(a) + vadd.i64 d18,d27 + vadd.i64 d30,d27 + @ vadd.i64 d22,d30 + vshr.u64 d24,d18,#14 @ 10 +#if 10<16 + vld1.64 {d10},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d18,#18 +#if 10>0 + vadd.i64 d22,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d18,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d18,#50 + vsli.64 d25,d18,#46 + vmov d29,d18 + vsli.64 d26,d18,#23 +#if 10<16 && defined(__ARMEL__) + vrev64.8 d10,d10 +#endif + veor d25,d24 + vbsl d29,d19,d20 @ Ch(e,f,g) + vshr.u64 d24,d22,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d21 + vshr.u64 d25,d22,#34 + vsli.64 d24,d22,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d22,#39 + vadd.i64 d28,d10 + vsli.64 d25,d22,#30 + veor d30,d22,d23 + vsli.64 d26,d22,#25 + veor d21,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d16,d23 @ Maj(a,b,c) + veor d21,d26 @ Sigma0(a) + vadd.i64 d17,d27 + vadd.i64 d30,d27 + @ vadd.i64 d21,d30 + vshr.u64 d24,d17,#14 @ 11 +#if 11<16 + vld1.64 {d11},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d17,#18 +#if 11>0 + vadd.i64 d21,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d17,#41 + vld1.64 {d28},[r3,:64]! 
@ K[i++] + vsli.64 d24,d17,#50 + vsli.64 d25,d17,#46 + vmov d29,d17 + vsli.64 d26,d17,#23 +#if 11<16 && defined(__ARMEL__) + vrev64.8 d11,d11 +#endif + veor d25,d24 + vbsl d29,d18,d19 @ Ch(e,f,g) + vshr.u64 d24,d21,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d20 + vshr.u64 d25,d21,#34 + vsli.64 d24,d21,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d21,#39 + vadd.i64 d28,d11 + vsli.64 d25,d21,#30 + veor d30,d21,d22 + vsli.64 d26,d21,#25 + veor d20,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d23,d22 @ Maj(a,b,c) + veor d20,d26 @ Sigma0(a) + vadd.i64 d16,d27 + vadd.i64 d30,d27 + @ vadd.i64 d20,d30 + vshr.u64 d24,d16,#14 @ 12 +#if 12<16 + vld1.64 {d12},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d16,#18 +#if 12>0 + vadd.i64 d20,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d16,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d16,#50 + vsli.64 d25,d16,#46 + vmov d29,d16 + vsli.64 d26,d16,#23 +#if 12<16 && defined(__ARMEL__) + vrev64.8 d12,d12 +#endif + veor d25,d24 + vbsl d29,d17,d18 @ Ch(e,f,g) + vshr.u64 d24,d20,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d19 + vshr.u64 d25,d20,#34 + vsli.64 d24,d20,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d20,#39 + vadd.i64 d28,d12 + vsli.64 d25,d20,#30 + veor d30,d20,d21 + vsli.64 d26,d20,#25 + veor d19,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d22,d21 @ Maj(a,b,c) + veor d19,d26 @ Sigma0(a) + vadd.i64 d23,d27 + vadd.i64 d30,d27 + @ vadd.i64 d19,d30 + vshr.u64 d24,d23,#14 @ 13 +#if 13<16 + vld1.64 {d13},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d23,#18 +#if 13>0 + vadd.i64 d19,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d23,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d23,#50 + vsli.64 d25,d23,#46 + vmov d29,d23 + vsli.64 d26,d23,#23 +#if 13<16 && defined(__ARMEL__) + vrev64.8 d13,d13 +#endif + veor d25,d24 + vbsl d29,d16,d17 @ Ch(e,f,g) + vshr.u64 d24,d19,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d18 + vshr.u64 d25,d19,#34 + vsli.64 d24,d19,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d19,#39 + vadd.i64 d28,d13 + vsli.64 d25,d19,#30 + veor d30,d19,d20 + vsli.64 d26,d19,#25 + veor d18,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d21,d20 @ Maj(a,b,c) + veor d18,d26 @ Sigma0(a) + vadd.i64 d22,d27 + vadd.i64 d30,d27 + @ vadd.i64 d18,d30 + vshr.u64 d24,d22,#14 @ 14 +#if 14<16 + vld1.64 {d14},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d22,#18 +#if 14>0 + vadd.i64 d18,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d22,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d22,#50 + vsli.64 d25,d22,#46 + vmov d29,d22 + vsli.64 d26,d22,#23 +#if 14<16 && defined(__ARMEL__) + vrev64.8 d14,d14 +#endif + veor d25,d24 + vbsl d29,d23,d16 @ Ch(e,f,g) + vshr.u64 d24,d18,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d17 + vshr.u64 d25,d18,#34 + vsli.64 d24,d18,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d18,#39 + vadd.i64 d28,d14 + vsli.64 d25,d18,#30 + veor d30,d18,d19 + vsli.64 d26,d18,#25 + veor d17,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d20,d19 @ Maj(a,b,c) + veor d17,d26 @ Sigma0(a) + vadd.i64 d21,d27 + vadd.i64 d30,d27 + @ vadd.i64 d17,d30 + vshr.u64 d24,d21,#14 @ 15 +#if 15<16 + vld1.64 {d15},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d21,#18 +#if 15>0 + vadd.i64 d17,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d21,#41 + vld1.64 {d28},[r3,:64]! 
@ K[i++] + vsli.64 d24,d21,#50 + vsli.64 d25,d21,#46 + vmov d29,d21 + vsli.64 d26,d21,#23 +#if 15<16 && defined(__ARMEL__) + vrev64.8 d15,d15 +#endif + veor d25,d24 + vbsl d29,d22,d23 @ Ch(e,f,g) + vshr.u64 d24,d17,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d16 + vshr.u64 d25,d17,#34 + vsli.64 d24,d17,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d17,#39 + vadd.i64 d28,d15 + vsli.64 d25,d17,#30 + veor d30,d17,d18 + vsli.64 d26,d17,#25 + veor d16,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d19,d18 @ Maj(a,b,c) + veor d16,d26 @ Sigma0(a) + vadd.i64 d20,d27 + vadd.i64 d30,d27 + @ vadd.i64 d16,d30 + mov r12,#4 +.L16_79_neon: + subs r12,#1 + vshr.u64 q12,q7,#19 + vshr.u64 q13,q7,#61 + vadd.i64 d16,d30 @ h+=Maj from the past + vshr.u64 q15,q7,#6 + vsli.64 q12,q7,#45 + vext.8 q14,q0,q1,#8 @ X[i+1] + vsli.64 q13,q7,#3 + veor q15,q12 + vshr.u64 q12,q14,#1 + veor q15,q13 @ sigma1(X[i+14]) + vshr.u64 q13,q14,#8 + vadd.i64 q0,q15 + vshr.u64 q15,q14,#7 + vsli.64 q12,q14,#63 + vsli.64 q13,q14,#56 + vext.8 q14,q4,q5,#8 @ X[i+9] + veor q15,q12 + vshr.u64 d24,d20,#14 @ from NEON_00_15 + vadd.i64 q0,q14 + vshr.u64 d25,d20,#18 @ from NEON_00_15 + veor q15,q13 @ sigma0(X[i+1]) + vshr.u64 d26,d20,#41 @ from NEON_00_15 + vadd.i64 q0,q15 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d20,#50 + vsli.64 d25,d20,#46 + vmov d29,d20 + vsli.64 d26,d20,#23 +#if 16<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d21,d22 @ Ch(e,f,g) + vshr.u64 d24,d16,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d23 + vshr.u64 d25,d16,#34 + vsli.64 d24,d16,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d16,#39 + vadd.i64 d28,d0 + vsli.64 d25,d16,#30 + veor d30,d16,d17 + vsli.64 d26,d16,#25 + veor d23,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d18,d17 @ Maj(a,b,c) + veor d23,d26 @ Sigma0(a) + vadd.i64 d19,d27 + vadd.i64 d30,d27 + @ vadd.i64 d23,d30 + vshr.u64 d24,d19,#14 @ 17 +#if 17<16 + vld1.64 {d1},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d19,#18 +#if 17>0 + vadd.i64 d23,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d19,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d19,#50 + vsli.64 d25,d19,#46 + vmov d29,d19 + vsli.64 d26,d19,#23 +#if 17<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d20,d21 @ Ch(e,f,g) + vshr.u64 d24,d23,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d22 + vshr.u64 d25,d23,#34 + vsli.64 d24,d23,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d23,#39 + vadd.i64 d28,d1 + vsli.64 d25,d23,#30 + veor d30,d23,d16 + vsli.64 d26,d23,#25 + veor d22,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d17,d16 @ Maj(a,b,c) + veor d22,d26 @ Sigma0(a) + vadd.i64 d18,d27 + vadd.i64 d30,d27 + @ vadd.i64 d22,d30 + vshr.u64 q12,q0,#19 + vshr.u64 q13,q0,#61 + vadd.i64 d22,d30 @ h+=Maj from the past + vshr.u64 q15,q0,#6 + vsli.64 q12,q0,#45 + vext.8 q14,q1,q2,#8 @ X[i+1] + vsli.64 q13,q0,#3 + veor q15,q12 + vshr.u64 q12,q14,#1 + veor q15,q13 @ sigma1(X[i+14]) + vshr.u64 q13,q14,#8 + vadd.i64 q1,q15 + vshr.u64 q15,q14,#7 + vsli.64 q12,q14,#63 + vsli.64 q13,q14,#56 + vext.8 q14,q5,q6,#8 @ X[i+9] + veor q15,q12 + vshr.u64 d24,d18,#14 @ from NEON_00_15 + vadd.i64 q1,q14 + vshr.u64 d25,d18,#18 @ from NEON_00_15 + veor q15,q13 @ sigma0(X[i+1]) + vshr.u64 d26,d18,#41 @ from NEON_00_15 + vadd.i64 q1,q15 + vld1.64 {d28},[r3,:64]! 
@ K[i++] + vsli.64 d24,d18,#50 + vsli.64 d25,d18,#46 + vmov d29,d18 + vsli.64 d26,d18,#23 +#if 18<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d19,d20 @ Ch(e,f,g) + vshr.u64 d24,d22,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d21 + vshr.u64 d25,d22,#34 + vsli.64 d24,d22,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d22,#39 + vadd.i64 d28,d2 + vsli.64 d25,d22,#30 + veor d30,d22,d23 + vsli.64 d26,d22,#25 + veor d21,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d16,d23 @ Maj(a,b,c) + veor d21,d26 @ Sigma0(a) + vadd.i64 d17,d27 + vadd.i64 d30,d27 + @ vadd.i64 d21,d30 + vshr.u64 d24,d17,#14 @ 19 +#if 19<16 + vld1.64 {d3},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d17,#18 +#if 19>0 + vadd.i64 d21,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d17,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d17,#50 + vsli.64 d25,d17,#46 + vmov d29,d17 + vsli.64 d26,d17,#23 +#if 19<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d18,d19 @ Ch(e,f,g) + vshr.u64 d24,d21,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d20 + vshr.u64 d25,d21,#34 + vsli.64 d24,d21,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d21,#39 + vadd.i64 d28,d3 + vsli.64 d25,d21,#30 + veor d30,d21,d22 + vsli.64 d26,d21,#25 + veor d20,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d23,d22 @ Maj(a,b,c) + veor d20,d26 @ Sigma0(a) + vadd.i64 d16,d27 + vadd.i64 d30,d27 + @ vadd.i64 d20,d30 + vshr.u64 q12,q1,#19 + vshr.u64 q13,q1,#61 + vadd.i64 d20,d30 @ h+=Maj from the past + vshr.u64 q15,q1,#6 + vsli.64 q12,q1,#45 + vext.8 q14,q2,q3,#8 @ X[i+1] + vsli.64 q13,q1,#3 + veor q15,q12 + vshr.u64 q12,q14,#1 + veor q15,q13 @ sigma1(X[i+14]) + vshr.u64 q13,q14,#8 + vadd.i64 q2,q15 + vshr.u64 q15,q14,#7 + vsli.64 q12,q14,#63 + vsli.64 q13,q14,#56 + vext.8 q14,q6,q7,#8 @ X[i+9] + veor q15,q12 + vshr.u64 d24,d16,#14 @ from NEON_00_15 + vadd.i64 q2,q14 + vshr.u64 d25,d16,#18 @ from NEON_00_15 + veor q15,q13 @ sigma0(X[i+1]) + vshr.u64 d26,d16,#41 @ from NEON_00_15 + vadd.i64 q2,q15 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d16,#50 + vsli.64 d25,d16,#46 + vmov d29,d16 + vsli.64 d26,d16,#23 +#if 20<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d17,d18 @ Ch(e,f,g) + vshr.u64 d24,d20,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d19 + vshr.u64 d25,d20,#34 + vsli.64 d24,d20,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d20,#39 + vadd.i64 d28,d4 + vsli.64 d25,d20,#30 + veor d30,d20,d21 + vsli.64 d26,d20,#25 + veor d19,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d22,d21 @ Maj(a,b,c) + veor d19,d26 @ Sigma0(a) + vadd.i64 d23,d27 + vadd.i64 d30,d27 + @ vadd.i64 d19,d30 + vshr.u64 d24,d23,#14 @ 21 +#if 21<16 + vld1.64 {d5},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d23,#18 +#if 21>0 + vadd.i64 d19,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d23,#41 + vld1.64 {d28},[r3,:64]! 
@ K[i++] + vsli.64 d24,d23,#50 + vsli.64 d25,d23,#46 + vmov d29,d23 + vsli.64 d26,d23,#23 +#if 21<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d16,d17 @ Ch(e,f,g) + vshr.u64 d24,d19,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d18 + vshr.u64 d25,d19,#34 + vsli.64 d24,d19,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d19,#39 + vadd.i64 d28,d5 + vsli.64 d25,d19,#30 + veor d30,d19,d20 + vsli.64 d26,d19,#25 + veor d18,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d21,d20 @ Maj(a,b,c) + veor d18,d26 @ Sigma0(a) + vadd.i64 d22,d27 + vadd.i64 d30,d27 + @ vadd.i64 d18,d30 + vshr.u64 q12,q2,#19 + vshr.u64 q13,q2,#61 + vadd.i64 d18,d30 @ h+=Maj from the past + vshr.u64 q15,q2,#6 + vsli.64 q12,q2,#45 + vext.8 q14,q3,q4,#8 @ X[i+1] + vsli.64 q13,q2,#3 + veor q15,q12 + vshr.u64 q12,q14,#1 + veor q15,q13 @ sigma1(X[i+14]) + vshr.u64 q13,q14,#8 + vadd.i64 q3,q15 + vshr.u64 q15,q14,#7 + vsli.64 q12,q14,#63 + vsli.64 q13,q14,#56 + vext.8 q14,q7,q0,#8 @ X[i+9] + veor q15,q12 + vshr.u64 d24,d22,#14 @ from NEON_00_15 + vadd.i64 q3,q14 + vshr.u64 d25,d22,#18 @ from NEON_00_15 + veor q15,q13 @ sigma0(X[i+1]) + vshr.u64 d26,d22,#41 @ from NEON_00_15 + vadd.i64 q3,q15 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d22,#50 + vsli.64 d25,d22,#46 + vmov d29,d22 + vsli.64 d26,d22,#23 +#if 22<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d23,d16 @ Ch(e,f,g) + vshr.u64 d24,d18,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d17 + vshr.u64 d25,d18,#34 + vsli.64 d24,d18,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d18,#39 + vadd.i64 d28,d6 + vsli.64 d25,d18,#30 + veor d30,d18,d19 + vsli.64 d26,d18,#25 + veor d17,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d20,d19 @ Maj(a,b,c) + veor d17,d26 @ Sigma0(a) + vadd.i64 d21,d27 + vadd.i64 d30,d27 + @ vadd.i64 d17,d30 + vshr.u64 d24,d21,#14 @ 23 +#if 23<16 + vld1.64 {d7},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d21,#18 +#if 23>0 + vadd.i64 d17,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d21,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d21,#50 + vsli.64 d25,d21,#46 + vmov d29,d21 + vsli.64 d26,d21,#23 +#if 23<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d22,d23 @ Ch(e,f,g) + vshr.u64 d24,d17,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d16 + vshr.u64 d25,d17,#34 + vsli.64 d24,d17,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d17,#39 + vadd.i64 d28,d7 + vsli.64 d25,d17,#30 + veor d30,d17,d18 + vsli.64 d26,d17,#25 + veor d16,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d19,d18 @ Maj(a,b,c) + veor d16,d26 @ Sigma0(a) + vadd.i64 d20,d27 + vadd.i64 d30,d27 + @ vadd.i64 d16,d30 + vshr.u64 q12,q3,#19 + vshr.u64 q13,q3,#61 + vadd.i64 d16,d30 @ h+=Maj from the past + vshr.u64 q15,q3,#6 + vsli.64 q12,q3,#45 + vext.8 q14,q4,q5,#8 @ X[i+1] + vsli.64 q13,q3,#3 + veor q15,q12 + vshr.u64 q12,q14,#1 + veor q15,q13 @ sigma1(X[i+14]) + vshr.u64 q13,q14,#8 + vadd.i64 q4,q15 + vshr.u64 q15,q14,#7 + vsli.64 q12,q14,#63 + vsli.64 q13,q14,#56 + vext.8 q14,q0,q1,#8 @ X[i+9] + veor q15,q12 + vshr.u64 d24,d20,#14 @ from NEON_00_15 + vadd.i64 q4,q14 + vshr.u64 d25,d20,#18 @ from NEON_00_15 + veor q15,q13 @ sigma0(X[i+1]) + vshr.u64 d26,d20,#41 @ from NEON_00_15 + vadd.i64 q4,q15 + vld1.64 {d28},[r3,:64]! 
@ K[i++] + vsli.64 d24,d20,#50 + vsli.64 d25,d20,#46 + vmov d29,d20 + vsli.64 d26,d20,#23 +#if 24<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d21,d22 @ Ch(e,f,g) + vshr.u64 d24,d16,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d23 + vshr.u64 d25,d16,#34 + vsli.64 d24,d16,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d16,#39 + vadd.i64 d28,d8 + vsli.64 d25,d16,#30 + veor d30,d16,d17 + vsli.64 d26,d16,#25 + veor d23,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d18,d17 @ Maj(a,b,c) + veor d23,d26 @ Sigma0(a) + vadd.i64 d19,d27 + vadd.i64 d30,d27 + @ vadd.i64 d23,d30 + vshr.u64 d24,d19,#14 @ 25 +#if 25<16 + vld1.64 {d9},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d19,#18 +#if 25>0 + vadd.i64 d23,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d19,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d19,#50 + vsli.64 d25,d19,#46 + vmov d29,d19 + vsli.64 d26,d19,#23 +#if 25<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d20,d21 @ Ch(e,f,g) + vshr.u64 d24,d23,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d22 + vshr.u64 d25,d23,#34 + vsli.64 d24,d23,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d23,#39 + vadd.i64 d28,d9 + vsli.64 d25,d23,#30 + veor d30,d23,d16 + vsli.64 d26,d23,#25 + veor d22,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d17,d16 @ Maj(a,b,c) + veor d22,d26 @ Sigma0(a) + vadd.i64 d18,d27 + vadd.i64 d30,d27 + @ vadd.i64 d22,d30 + vshr.u64 q12,q4,#19 + vshr.u64 q13,q4,#61 + vadd.i64 d22,d30 @ h+=Maj from the past + vshr.u64 q15,q4,#6 + vsli.64 q12,q4,#45 + vext.8 q14,q5,q6,#8 @ X[i+1] + vsli.64 q13,q4,#3 + veor q15,q12 + vshr.u64 q12,q14,#1 + veor q15,q13 @ sigma1(X[i+14]) + vshr.u64 q13,q14,#8 + vadd.i64 q5,q15 + vshr.u64 q15,q14,#7 + vsli.64 q12,q14,#63 + vsli.64 q13,q14,#56 + vext.8 q14,q1,q2,#8 @ X[i+9] + veor q15,q12 + vshr.u64 d24,d18,#14 @ from NEON_00_15 + vadd.i64 q5,q14 + vshr.u64 d25,d18,#18 @ from NEON_00_15 + veor q15,q13 @ sigma0(X[i+1]) + vshr.u64 d26,d18,#41 @ from NEON_00_15 + vadd.i64 q5,q15 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d18,#50 + vsli.64 d25,d18,#46 + vmov d29,d18 + vsli.64 d26,d18,#23 +#if 26<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d19,d20 @ Ch(e,f,g) + vshr.u64 d24,d22,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d21 + vshr.u64 d25,d22,#34 + vsli.64 d24,d22,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d22,#39 + vadd.i64 d28,d10 + vsli.64 d25,d22,#30 + veor d30,d22,d23 + vsli.64 d26,d22,#25 + veor d21,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d16,d23 @ Maj(a,b,c) + veor d21,d26 @ Sigma0(a) + vadd.i64 d17,d27 + vadd.i64 d30,d27 + @ vadd.i64 d21,d30 + vshr.u64 d24,d17,#14 @ 27 +#if 27<16 + vld1.64 {d11},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d17,#18 +#if 27>0 + vadd.i64 d21,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d17,#41 + vld1.64 {d28},[r3,:64]! 
@ K[i++] + vsli.64 d24,d17,#50 + vsli.64 d25,d17,#46 + vmov d29,d17 + vsli.64 d26,d17,#23 +#if 27<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d18,d19 @ Ch(e,f,g) + vshr.u64 d24,d21,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d20 + vshr.u64 d25,d21,#34 + vsli.64 d24,d21,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d21,#39 + vadd.i64 d28,d11 + vsli.64 d25,d21,#30 + veor d30,d21,d22 + vsli.64 d26,d21,#25 + veor d20,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d23,d22 @ Maj(a,b,c) + veor d20,d26 @ Sigma0(a) + vadd.i64 d16,d27 + vadd.i64 d30,d27 + @ vadd.i64 d20,d30 + vshr.u64 q12,q5,#19 + vshr.u64 q13,q5,#61 + vadd.i64 d20,d30 @ h+=Maj from the past + vshr.u64 q15,q5,#6 + vsli.64 q12,q5,#45 + vext.8 q14,q6,q7,#8 @ X[i+1] + vsli.64 q13,q5,#3 + veor q15,q12 + vshr.u64 q12,q14,#1 + veor q15,q13 @ sigma1(X[i+14]) + vshr.u64 q13,q14,#8 + vadd.i64 q6,q15 + vshr.u64 q15,q14,#7 + vsli.64 q12,q14,#63 + vsli.64 q13,q14,#56 + vext.8 q14,q2,q3,#8 @ X[i+9] + veor q15,q12 + vshr.u64 d24,d16,#14 @ from NEON_00_15 + vadd.i64 q6,q14 + vshr.u64 d25,d16,#18 @ from NEON_00_15 + veor q15,q13 @ sigma0(X[i+1]) + vshr.u64 d26,d16,#41 @ from NEON_00_15 + vadd.i64 q6,q15 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d16,#50 + vsli.64 d25,d16,#46 + vmov d29,d16 + vsli.64 d26,d16,#23 +#if 28<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d17,d18 @ Ch(e,f,g) + vshr.u64 d24,d20,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d19 + vshr.u64 d25,d20,#34 + vsli.64 d24,d20,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d20,#39 + vadd.i64 d28,d12 + vsli.64 d25,d20,#30 + veor d30,d20,d21 + vsli.64 d26,d20,#25 + veor d19,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d22,d21 @ Maj(a,b,c) + veor d19,d26 @ Sigma0(a) + vadd.i64 d23,d27 + vadd.i64 d30,d27 + @ vadd.i64 d19,d30 + vshr.u64 d24,d23,#14 @ 29 +#if 29<16 + vld1.64 {d13},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d23,#18 +#if 29>0 + vadd.i64 d19,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d23,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d23,#50 + vsli.64 d25,d23,#46 + vmov d29,d23 + vsli.64 d26,d23,#23 +#if 29<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d16,d17 @ Ch(e,f,g) + vshr.u64 d24,d19,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d18 + vshr.u64 d25,d19,#34 + vsli.64 d24,d19,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d19,#39 + vadd.i64 d28,d13 + vsli.64 d25,d19,#30 + veor d30,d19,d20 + vsli.64 d26,d19,#25 + veor d18,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d21,d20 @ Maj(a,b,c) + veor d18,d26 @ Sigma0(a) + vadd.i64 d22,d27 + vadd.i64 d30,d27 + @ vadd.i64 d18,d30 + vshr.u64 q12,q6,#19 + vshr.u64 q13,q6,#61 + vadd.i64 d18,d30 @ h+=Maj from the past + vshr.u64 q15,q6,#6 + vsli.64 q12,q6,#45 + vext.8 q14,q7,q0,#8 @ X[i+1] + vsli.64 q13,q6,#3 + veor q15,q12 + vshr.u64 q12,q14,#1 + veor q15,q13 @ sigma1(X[i+14]) + vshr.u64 q13,q14,#8 + vadd.i64 q7,q15 + vshr.u64 q15,q14,#7 + vsli.64 q12,q14,#63 + vsli.64 q13,q14,#56 + vext.8 q14,q3,q4,#8 @ X[i+9] + veor q15,q12 + vshr.u64 d24,d22,#14 @ from NEON_00_15 + vadd.i64 q7,q14 + vshr.u64 d25,d22,#18 @ from NEON_00_15 + veor q15,q13 @ sigma0(X[i+1]) + vshr.u64 d26,d22,#41 @ from NEON_00_15 + vadd.i64 q7,q15 + vld1.64 {d28},[r3,:64]! 
@ K[i++] + vsli.64 d24,d22,#50 + vsli.64 d25,d22,#46 + vmov d29,d22 + vsli.64 d26,d22,#23 +#if 30<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d23,d16 @ Ch(e,f,g) + vshr.u64 d24,d18,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d17 + vshr.u64 d25,d18,#34 + vsli.64 d24,d18,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d18,#39 + vadd.i64 d28,d14 + vsli.64 d25,d18,#30 + veor d30,d18,d19 + vsli.64 d26,d18,#25 + veor d17,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d20,d19 @ Maj(a,b,c) + veor d17,d26 @ Sigma0(a) + vadd.i64 d21,d27 + vadd.i64 d30,d27 + @ vadd.i64 d17,d30 + vshr.u64 d24,d21,#14 @ 31 +#if 31<16 + vld1.64 {d15},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d21,#18 +#if 31>0 + vadd.i64 d17,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d21,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d21,#50 + vsli.64 d25,d21,#46 + vmov d29,d21 + vsli.64 d26,d21,#23 +#if 31<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d22,d23 @ Ch(e,f,g) + vshr.u64 d24,d17,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d16 + vshr.u64 d25,d17,#34 + vsli.64 d24,d17,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d17,#39 + vadd.i64 d28,d15 + vsli.64 d25,d17,#30 + veor d30,d17,d18 + vsli.64 d26,d17,#25 + veor d16,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d19,d18 @ Maj(a,b,c) + veor d16,d26 @ Sigma0(a) + vadd.i64 d20,d27 + vadd.i64 d30,d27 + @ vadd.i64 d16,d30 + bne .L16_79_neon + + vadd.i64 d16,d30 @ h+=Maj from the past + vldmia r0,{d24,d25,d26,d27,d28,d29,d30,d31} @ load context to temp + vadd.i64 q8,q12 @ vectorized accumulate + vadd.i64 q9,q13 + vadd.i64 q10,q14 + vadd.i64 q11,q15 + vstmia r0,{d16,d17,d18,d19,d20,d21,d22,d23} @ save context + teq r1,r2 + sub r3,#640 @ rewind K512 + bne .Loop_neon + + VFP_ABI_POP + bx lr @ .word 0xe12fff1e +.size sha512_block_data_order_neon,.-sha512_block_data_order_neon +#endif +.byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 2 +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__ELF__) diff --git a/ring-0.17.14/pregenerated/sha512-armv8-ios64.S b/ring-0.17.14/pregenerated/sha512-armv8-ios64.S new file mode 100644 index 0000000000..06745ec50d --- /dev/null +++ b/ring-0.17.14/pregenerated/sha512-armv8-ios64.S @@ -0,0 +1,1598 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__) +// Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// ==================================================================== +// Written by Andy Polyakov for the OpenSSL +// project. +// ==================================================================== +// +// SHA256/512 for ARMv8. 
+// +// Performance in cycles per processed byte and improvement coefficient +// over code generated with "default" compiler: +// +// SHA256-hw SHA256(*) SHA512 +// Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**)) +// Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***)) +// Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***)) +// Denver 2.01 10.5 (+26%) 6.70 (+8%) +// X-Gene 20.0 (+100%) 12.8 (+300%(***)) +// Mongoose 2.36 13.0 (+50%) 8.36 (+33%) +// Kryo 1.92 17.4 (+30%) 11.2 (+8%) +// +// (*) Software SHA256 results are of lesser relevance, presented +// mostly for informational purposes. +// (**) The result is a trade-off: it's possible to improve it by +// 10% (or by 1 cycle per round), but at the cost of 20% loss +// on Cortex-A53 (or by 4 cycles per round). +// (***) Super-impressive coefficients over gcc-generated code are +// indication of some compiler "pathology", most notably code +// generated with -mgeneral-regs-only is significantly faster +// and the gap is only 40-90%. + +#ifndef __KERNEL__ +#endif + +.text + +.globl _sha512_block_data_order_nohw +.private_extern _sha512_block_data_order_nohw + +.align 6 +_sha512_block_data_order_nohw: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#4*8 + + ldp x20,x21,[x0] // load context + ldp x22,x23,[x0,#2*8] + ldp x24,x25,[x0,#4*8] + add x2,x1,x2,lsl#7 // end of input + ldp x26,x27,[x0,#6*8] + adrp x30,LK512@PAGE + add x30,x30,LK512@PAGEOFF + stp x0,x2,[x29,#96] + +Loop: + ldp x3,x4,[x1],#2*8 + ldr x19,[x30],#8 // *K++ + eor x28,x21,x22 // magic seed + str x1,[x29,#112] +#ifndef __AARCH64EB__ + rev x3,x3 // 0 +#endif + ror x16,x24,#14 + add x27,x27,x19 // h+=K[i] + eor x6,x24,x24,ror#23 + and x17,x25,x24 + bic x19,x26,x24 + add x27,x27,x3 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x20,x21 // a^b, b^c in next round + eor x16,x16,x6,ror#18 // Sigma1(e) + ror x6,x20,#28 + add x27,x27,x17 // h+=Ch(e,f,g) + eor x17,x20,x20,ror#5 + add x27,x27,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x23,x23,x27 // d+=h + eor x28,x28,x21 // Maj(a,b,c) + eor x17,x6,x17,ror#34 // Sigma0(a) + add x27,x27,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x27,x27,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x4,x4 // 1 +#endif + ldp x5,x6,[x1],#2*8 + add x27,x27,x17 // h+=Sigma0(a) + ror x16,x23,#14 + add x26,x26,x28 // h+=K[i] + eor x7,x23,x23,ror#23 + and x17,x24,x23 + bic x28,x25,x23 + add x26,x26,x4 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x27,x20 // a^b, b^c in next round + eor x16,x16,x7,ror#18 // Sigma1(e) + ror x7,x27,#28 + add x26,x26,x17 // h+=Ch(e,f,g) + eor x17,x27,x27,ror#5 + add x26,x26,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x22,x22,x26 // d+=h + eor x19,x19,x20 // Maj(a,b,c) + eor x17,x7,x17,ror#34 // Sigma0(a) + add x26,x26,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x26,x26,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x5,x5 // 2 +#endif + add x26,x26,x17 // h+=Sigma0(a) + ror x16,x22,#14 + add x25,x25,x19 // h+=K[i] + eor x8,x22,x22,ror#23 + and x17,x23,x22 + bic x19,x24,x22 + add x25,x25,x5 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x26,x27 // a^b, b^c in next round + eor x16,x16,x8,ror#18 // Sigma1(e) + ror x8,x26,#28 + add x25,x25,x17 // h+=Ch(e,f,g) + eor x17,x26,x26,ror#5 + add x25,x25,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x21,x21,x25 // d+=h + eor x28,x28,x27 
// Maj(a,b,c) + eor x17,x8,x17,ror#34 // Sigma0(a) + add x25,x25,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x25,x25,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x6,x6 // 3 +#endif + ldp x7,x8,[x1],#2*8 + add x25,x25,x17 // h+=Sigma0(a) + ror x16,x21,#14 + add x24,x24,x28 // h+=K[i] + eor x9,x21,x21,ror#23 + and x17,x22,x21 + bic x28,x23,x21 + add x24,x24,x6 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x25,x26 // a^b, b^c in next round + eor x16,x16,x9,ror#18 // Sigma1(e) + ror x9,x25,#28 + add x24,x24,x17 // h+=Ch(e,f,g) + eor x17,x25,x25,ror#5 + add x24,x24,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x20,x20,x24 // d+=h + eor x19,x19,x26 // Maj(a,b,c) + eor x17,x9,x17,ror#34 // Sigma0(a) + add x24,x24,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x24,x24,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x7,x7 // 4 +#endif + add x24,x24,x17 // h+=Sigma0(a) + ror x16,x20,#14 + add x23,x23,x19 // h+=K[i] + eor x10,x20,x20,ror#23 + and x17,x21,x20 + bic x19,x22,x20 + add x23,x23,x7 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x24,x25 // a^b, b^c in next round + eor x16,x16,x10,ror#18 // Sigma1(e) + ror x10,x24,#28 + add x23,x23,x17 // h+=Ch(e,f,g) + eor x17,x24,x24,ror#5 + add x23,x23,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x27,x27,x23 // d+=h + eor x28,x28,x25 // Maj(a,b,c) + eor x17,x10,x17,ror#34 // Sigma0(a) + add x23,x23,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x23,x23,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x8,x8 // 5 +#endif + ldp x9,x10,[x1],#2*8 + add x23,x23,x17 // h+=Sigma0(a) + ror x16,x27,#14 + add x22,x22,x28 // h+=K[i] + eor x11,x27,x27,ror#23 + and x17,x20,x27 + bic x28,x21,x27 + add x22,x22,x8 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x23,x24 // a^b, b^c in next round + eor x16,x16,x11,ror#18 // Sigma1(e) + ror x11,x23,#28 + add x22,x22,x17 // h+=Ch(e,f,g) + eor x17,x23,x23,ror#5 + add x22,x22,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x26,x26,x22 // d+=h + eor x19,x19,x24 // Maj(a,b,c) + eor x17,x11,x17,ror#34 // Sigma0(a) + add x22,x22,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x22,x22,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x9,x9 // 6 +#endif + add x22,x22,x17 // h+=Sigma0(a) + ror x16,x26,#14 + add x21,x21,x19 // h+=K[i] + eor x12,x26,x26,ror#23 + and x17,x27,x26 + bic x19,x20,x26 + add x21,x21,x9 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x22,x23 // a^b, b^c in next round + eor x16,x16,x12,ror#18 // Sigma1(e) + ror x12,x22,#28 + add x21,x21,x17 // h+=Ch(e,f,g) + eor x17,x22,x22,ror#5 + add x21,x21,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x25,x25,x21 // d+=h + eor x28,x28,x23 // Maj(a,b,c) + eor x17,x12,x17,ror#34 // Sigma0(a) + add x21,x21,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x21,x21,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x10,x10 // 7 +#endif + ldp x11,x12,[x1],#2*8 + add x21,x21,x17 // h+=Sigma0(a) + ror x16,x25,#14 + add x20,x20,x28 // h+=K[i] + eor x13,x25,x25,ror#23 + and x17,x26,x25 + bic x28,x27,x25 + add x20,x20,x10 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x21,x22 // a^b, b^c in next round + eor x16,x16,x13,ror#18 // Sigma1(e) + ror x13,x21,#28 + add x20,x20,x17 // h+=Ch(e,f,g) + eor x17,x21,x21,ror#5 + add x20,x20,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x24,x24,x20 // d+=h + eor x19,x19,x22 // Maj(a,b,c) + eor x17,x13,x17,ror#34 // 
Sigma0(a) + add x20,x20,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x20,x20,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x11,x11 // 8 +#endif + add x20,x20,x17 // h+=Sigma0(a) + ror x16,x24,#14 + add x27,x27,x19 // h+=K[i] + eor x14,x24,x24,ror#23 + and x17,x25,x24 + bic x19,x26,x24 + add x27,x27,x11 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x20,x21 // a^b, b^c in next round + eor x16,x16,x14,ror#18 // Sigma1(e) + ror x14,x20,#28 + add x27,x27,x17 // h+=Ch(e,f,g) + eor x17,x20,x20,ror#5 + add x27,x27,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x23,x23,x27 // d+=h + eor x28,x28,x21 // Maj(a,b,c) + eor x17,x14,x17,ror#34 // Sigma0(a) + add x27,x27,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x27,x27,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x12,x12 // 9 +#endif + ldp x13,x14,[x1],#2*8 + add x27,x27,x17 // h+=Sigma0(a) + ror x16,x23,#14 + add x26,x26,x28 // h+=K[i] + eor x15,x23,x23,ror#23 + and x17,x24,x23 + bic x28,x25,x23 + add x26,x26,x12 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x27,x20 // a^b, b^c in next round + eor x16,x16,x15,ror#18 // Sigma1(e) + ror x15,x27,#28 + add x26,x26,x17 // h+=Ch(e,f,g) + eor x17,x27,x27,ror#5 + add x26,x26,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x22,x22,x26 // d+=h + eor x19,x19,x20 // Maj(a,b,c) + eor x17,x15,x17,ror#34 // Sigma0(a) + add x26,x26,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x26,x26,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x13,x13 // 10 +#endif + add x26,x26,x17 // h+=Sigma0(a) + ror x16,x22,#14 + add x25,x25,x19 // h+=K[i] + eor x0,x22,x22,ror#23 + and x17,x23,x22 + bic x19,x24,x22 + add x25,x25,x13 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x26,x27 // a^b, b^c in next round + eor x16,x16,x0,ror#18 // Sigma1(e) + ror x0,x26,#28 + add x25,x25,x17 // h+=Ch(e,f,g) + eor x17,x26,x26,ror#5 + add x25,x25,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x21,x21,x25 // d+=h + eor x28,x28,x27 // Maj(a,b,c) + eor x17,x0,x17,ror#34 // Sigma0(a) + add x25,x25,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x25,x25,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x14,x14 // 11 +#endif + ldp x15,x0,[x1],#2*8 + add x25,x25,x17 // h+=Sigma0(a) + str x6,[sp,#24] + ror x16,x21,#14 + add x24,x24,x28 // h+=K[i] + eor x6,x21,x21,ror#23 + and x17,x22,x21 + bic x28,x23,x21 + add x24,x24,x14 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x25,x26 // a^b, b^c in next round + eor x16,x16,x6,ror#18 // Sigma1(e) + ror x6,x25,#28 + add x24,x24,x17 // h+=Ch(e,f,g) + eor x17,x25,x25,ror#5 + add x24,x24,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x20,x20,x24 // d+=h + eor x19,x19,x26 // Maj(a,b,c) + eor x17,x6,x17,ror#34 // Sigma0(a) + add x24,x24,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x24,x24,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x15,x15 // 12 +#endif + add x24,x24,x17 // h+=Sigma0(a) + str x7,[sp,#0] + ror x16,x20,#14 + add x23,x23,x19 // h+=K[i] + eor x7,x20,x20,ror#23 + and x17,x21,x20 + bic x19,x22,x20 + add x23,x23,x15 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x24,x25 // a^b, b^c in next round + eor x16,x16,x7,ror#18 // Sigma1(e) + ror x7,x24,#28 + add x23,x23,x17 // h+=Ch(e,f,g) + eor x17,x24,x24,ror#5 + add x23,x23,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x27,x27,x23 // d+=h + eor x28,x28,x25 // Maj(a,b,c) + eor x17,x7,x17,ror#34 // Sigma0(a) + add 
x23,x23,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x23,x23,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x0,x0 // 13 +#endif + ldp x1,x2,[x1] + add x23,x23,x17 // h+=Sigma0(a) + str x8,[sp,#8] + ror x16,x27,#14 + add x22,x22,x28 // h+=K[i] + eor x8,x27,x27,ror#23 + and x17,x20,x27 + bic x28,x21,x27 + add x22,x22,x0 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x23,x24 // a^b, b^c in next round + eor x16,x16,x8,ror#18 // Sigma1(e) + ror x8,x23,#28 + add x22,x22,x17 // h+=Ch(e,f,g) + eor x17,x23,x23,ror#5 + add x22,x22,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x26,x26,x22 // d+=h + eor x19,x19,x24 // Maj(a,b,c) + eor x17,x8,x17,ror#34 // Sigma0(a) + add x22,x22,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x22,x22,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x1,x1 // 14 +#endif + ldr x6,[sp,#24] + add x22,x22,x17 // h+=Sigma0(a) + str x9,[sp,#16] + ror x16,x26,#14 + add x21,x21,x19 // h+=K[i] + eor x9,x26,x26,ror#23 + and x17,x27,x26 + bic x19,x20,x26 + add x21,x21,x1 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x22,x23 // a^b, b^c in next round + eor x16,x16,x9,ror#18 // Sigma1(e) + ror x9,x22,#28 + add x21,x21,x17 // h+=Ch(e,f,g) + eor x17,x22,x22,ror#5 + add x21,x21,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x25,x25,x21 // d+=h + eor x28,x28,x23 // Maj(a,b,c) + eor x17,x9,x17,ror#34 // Sigma0(a) + add x21,x21,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x21,x21,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x2,x2 // 15 +#endif + ldr x7,[sp,#0] + add x21,x21,x17 // h+=Sigma0(a) + str x10,[sp,#24] + ror x16,x25,#14 + add x20,x20,x28 // h+=K[i] + ror x9,x4,#1 + and x17,x26,x25 + ror x8,x1,#19 + bic x28,x27,x25 + ror x10,x21,#28 + add x20,x20,x2 // h+=X[i] + eor x16,x16,x25,ror#18 + eor x9,x9,x4,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x21,x22 // a^b, b^c in next round + eor x16,x16,x25,ror#41 // Sigma1(e) + eor x10,x10,x21,ror#34 + add x20,x20,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x8,x8,x1,ror#61 + eor x9,x9,x4,lsr#7 // sigma0(X[i+1]) + add x20,x20,x16 // h+=Sigma1(e) + eor x19,x19,x22 // Maj(a,b,c) + eor x17,x10,x21,ror#39 // Sigma0(a) + eor x8,x8,x1,lsr#6 // sigma1(X[i+14]) + add x3,x3,x12 + add x24,x24,x20 // d+=h + add x20,x20,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x3,x3,x9 + add x20,x20,x17 // h+=Sigma0(a) + add x3,x3,x8 +Loop_16_xx: + ldr x8,[sp,#8] + str x11,[sp,#0] + ror x16,x24,#14 + add x27,x27,x19 // h+=K[i] + ror x10,x5,#1 + and x17,x25,x24 + ror x9,x2,#19 + bic x19,x26,x24 + ror x11,x20,#28 + add x27,x27,x3 // h+=X[i] + eor x16,x16,x24,ror#18 + eor x10,x10,x5,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x20,x21 // a^b, b^c in next round + eor x16,x16,x24,ror#41 // Sigma1(e) + eor x11,x11,x20,ror#34 + add x27,x27,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x9,x9,x2,ror#61 + eor x10,x10,x5,lsr#7 // sigma0(X[i+1]) + add x27,x27,x16 // h+=Sigma1(e) + eor x28,x28,x21 // Maj(a,b,c) + eor x17,x11,x20,ror#39 // Sigma0(a) + eor x9,x9,x2,lsr#6 // sigma1(X[i+14]) + add x4,x4,x13 + add x23,x23,x27 // d+=h + add x27,x27,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x4,x4,x10 + add x27,x27,x17 // h+=Sigma0(a) + add x4,x4,x9 + ldr x9,[sp,#16] + str x12,[sp,#8] + ror x16,x23,#14 + add x26,x26,x28 // h+=K[i] + ror x11,x6,#1 + and x17,x24,x23 + ror x10,x3,#19 + bic x28,x25,x23 + ror x12,x27,#28 + add x26,x26,x4 // h+=X[i] + eor x16,x16,x23,ror#18 
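
For readers tracing the annotated rounds above (the Ch, Maj, Sigma0/Sigma1, sigma0/sigma1 comments and the running "h+=..." notes), the scalar logic is the standard FIPS 180-4 SHA-512 round; the ror/lsr immediates in the assembly correspond to the rotation and shift counts below. The Rust sketch that follows is a plain reference model for orientation only; the function names are illustrative and are not part of ring's API or of this generated code.

// Reference formulas for the round functions named in the comments above,
// per FIPS 180-4. Rotation counts match the assembly's ror/lsr immediates.
fn ch(e: u64, f: u64, g: u64) -> u64 { (e & f) ^ (!e & g) }
fn maj(a: u64, b: u64, c: u64) -> u64 { (a & b) ^ (a & c) ^ (b & c) }
fn big_sigma0(a: u64) -> u64 { a.rotate_right(28) ^ a.rotate_right(34) ^ a.rotate_right(39) }
fn big_sigma1(e: u64) -> u64 { e.rotate_right(14) ^ e.rotate_right(18) ^ e.rotate_right(41) }
fn small_sigma0(x: u64) -> u64 { x.rotate_right(1) ^ x.rotate_right(8) ^ (x >> 7) }  // sigma0(X[i+1])
fn small_sigma1(x: u64) -> u64 { x.rotate_right(19) ^ x.rotate_right(61) ^ (x >> 6) } // sigma1(X[i+14])

// One round over the working variables [a,b,c,d,e,f,g,h], mirroring the
// "h+=K[i]", "h+=X[i]", "h+=Ch(e,f,g)", "h+=Sigma1(e)", "d+=h",
// "h+=Maj(a,b,c)", "h+=Sigma0(a)" steps annotated in the assembly.
fn round(w: &mut [u64; 8], k: u64, x: u64) {
    let [a, b, c, d, e, f, g, h] = *w;
    let t1 = h
        .wrapping_add(big_sigma1(e))
        .wrapping_add(ch(e, f, g))
        .wrapping_add(k)
        .wrapping_add(x);
    let t2 = big_sigma0(a).wrapping_add(maj(a, b, c));
    *w = [t1.wrapping_add(t2), a, b, c, d.wrapping_add(t1), e, f, g];
}

The assembly avoids the variable rotation by renaming registers each round, but the arithmetic is the same; the message-schedule lines marked sigma0(X[i+1]) and sigma1(X[i+14]) correspond to small_sigma0 and small_sigma1 above.
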
+ eor x11,x11,x6,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x27,x20 // a^b, b^c in next round + eor x16,x16,x23,ror#41 // Sigma1(e) + eor x12,x12,x27,ror#34 + add x26,x26,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x10,x10,x3,ror#61 + eor x11,x11,x6,lsr#7 // sigma0(X[i+1]) + add x26,x26,x16 // h+=Sigma1(e) + eor x19,x19,x20 // Maj(a,b,c) + eor x17,x12,x27,ror#39 // Sigma0(a) + eor x10,x10,x3,lsr#6 // sigma1(X[i+14]) + add x5,x5,x14 + add x22,x22,x26 // d+=h + add x26,x26,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x5,x5,x11 + add x26,x26,x17 // h+=Sigma0(a) + add x5,x5,x10 + ldr x10,[sp,#24] + str x13,[sp,#16] + ror x16,x22,#14 + add x25,x25,x19 // h+=K[i] + ror x12,x7,#1 + and x17,x23,x22 + ror x11,x4,#19 + bic x19,x24,x22 + ror x13,x26,#28 + add x25,x25,x5 // h+=X[i] + eor x16,x16,x22,ror#18 + eor x12,x12,x7,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x26,x27 // a^b, b^c in next round + eor x16,x16,x22,ror#41 // Sigma1(e) + eor x13,x13,x26,ror#34 + add x25,x25,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x11,x11,x4,ror#61 + eor x12,x12,x7,lsr#7 // sigma0(X[i+1]) + add x25,x25,x16 // h+=Sigma1(e) + eor x28,x28,x27 // Maj(a,b,c) + eor x17,x13,x26,ror#39 // Sigma0(a) + eor x11,x11,x4,lsr#6 // sigma1(X[i+14]) + add x6,x6,x15 + add x21,x21,x25 // d+=h + add x25,x25,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x6,x6,x12 + add x25,x25,x17 // h+=Sigma0(a) + add x6,x6,x11 + ldr x11,[sp,#0] + str x14,[sp,#24] + ror x16,x21,#14 + add x24,x24,x28 // h+=K[i] + ror x13,x8,#1 + and x17,x22,x21 + ror x12,x5,#19 + bic x28,x23,x21 + ror x14,x25,#28 + add x24,x24,x6 // h+=X[i] + eor x16,x16,x21,ror#18 + eor x13,x13,x8,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x25,x26 // a^b, b^c in next round + eor x16,x16,x21,ror#41 // Sigma1(e) + eor x14,x14,x25,ror#34 + add x24,x24,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x12,x12,x5,ror#61 + eor x13,x13,x8,lsr#7 // sigma0(X[i+1]) + add x24,x24,x16 // h+=Sigma1(e) + eor x19,x19,x26 // Maj(a,b,c) + eor x17,x14,x25,ror#39 // Sigma0(a) + eor x12,x12,x5,lsr#6 // sigma1(X[i+14]) + add x7,x7,x0 + add x20,x20,x24 // d+=h + add x24,x24,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x7,x7,x13 + add x24,x24,x17 // h+=Sigma0(a) + add x7,x7,x12 + ldr x12,[sp,#8] + str x15,[sp,#0] + ror x16,x20,#14 + add x23,x23,x19 // h+=K[i] + ror x14,x9,#1 + and x17,x21,x20 + ror x13,x6,#19 + bic x19,x22,x20 + ror x15,x24,#28 + add x23,x23,x7 // h+=X[i] + eor x16,x16,x20,ror#18 + eor x14,x14,x9,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x24,x25 // a^b, b^c in next round + eor x16,x16,x20,ror#41 // Sigma1(e) + eor x15,x15,x24,ror#34 + add x23,x23,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x13,x13,x6,ror#61 + eor x14,x14,x9,lsr#7 // sigma0(X[i+1]) + add x23,x23,x16 // h+=Sigma1(e) + eor x28,x28,x25 // Maj(a,b,c) + eor x17,x15,x24,ror#39 // Sigma0(a) + eor x13,x13,x6,lsr#6 // sigma1(X[i+14]) + add x8,x8,x1 + add x27,x27,x23 // d+=h + add x23,x23,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x8,x8,x14 + add x23,x23,x17 // h+=Sigma0(a) + add x8,x8,x13 + ldr x13,[sp,#16] + str x0,[sp,#8] + ror x16,x27,#14 + add x22,x22,x28 // h+=K[i] + ror x15,x10,#1 + and x17,x20,x27 + ror x14,x7,#19 + bic x28,x21,x27 + ror x0,x23,#28 + add x22,x22,x8 // h+=X[i] + eor x16,x16,x27,ror#18 + eor x15,x15,x10,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x23,x24 // a^b, b^c in next round + eor x16,x16,x27,ror#41 // Sigma1(e) + eor 
x0,x0,x23,ror#34 + add x22,x22,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x14,x14,x7,ror#61 + eor x15,x15,x10,lsr#7 // sigma0(X[i+1]) + add x22,x22,x16 // h+=Sigma1(e) + eor x19,x19,x24 // Maj(a,b,c) + eor x17,x0,x23,ror#39 // Sigma0(a) + eor x14,x14,x7,lsr#6 // sigma1(X[i+14]) + add x9,x9,x2 + add x26,x26,x22 // d+=h + add x22,x22,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x9,x9,x15 + add x22,x22,x17 // h+=Sigma0(a) + add x9,x9,x14 + ldr x14,[sp,#24] + str x1,[sp,#16] + ror x16,x26,#14 + add x21,x21,x19 // h+=K[i] + ror x0,x11,#1 + and x17,x27,x26 + ror x15,x8,#19 + bic x19,x20,x26 + ror x1,x22,#28 + add x21,x21,x9 // h+=X[i] + eor x16,x16,x26,ror#18 + eor x0,x0,x11,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x22,x23 // a^b, b^c in next round + eor x16,x16,x26,ror#41 // Sigma1(e) + eor x1,x1,x22,ror#34 + add x21,x21,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x15,x15,x8,ror#61 + eor x0,x0,x11,lsr#7 // sigma0(X[i+1]) + add x21,x21,x16 // h+=Sigma1(e) + eor x28,x28,x23 // Maj(a,b,c) + eor x17,x1,x22,ror#39 // Sigma0(a) + eor x15,x15,x8,lsr#6 // sigma1(X[i+14]) + add x10,x10,x3 + add x25,x25,x21 // d+=h + add x21,x21,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x10,x10,x0 + add x21,x21,x17 // h+=Sigma0(a) + add x10,x10,x15 + ldr x15,[sp,#0] + str x2,[sp,#24] + ror x16,x25,#14 + add x20,x20,x28 // h+=K[i] + ror x1,x12,#1 + and x17,x26,x25 + ror x0,x9,#19 + bic x28,x27,x25 + ror x2,x21,#28 + add x20,x20,x10 // h+=X[i] + eor x16,x16,x25,ror#18 + eor x1,x1,x12,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x21,x22 // a^b, b^c in next round + eor x16,x16,x25,ror#41 // Sigma1(e) + eor x2,x2,x21,ror#34 + add x20,x20,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x0,x0,x9,ror#61 + eor x1,x1,x12,lsr#7 // sigma0(X[i+1]) + add x20,x20,x16 // h+=Sigma1(e) + eor x19,x19,x22 // Maj(a,b,c) + eor x17,x2,x21,ror#39 // Sigma0(a) + eor x0,x0,x9,lsr#6 // sigma1(X[i+14]) + add x11,x11,x4 + add x24,x24,x20 // d+=h + add x20,x20,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x11,x11,x1 + add x20,x20,x17 // h+=Sigma0(a) + add x11,x11,x0 + ldr x0,[sp,#8] + str x3,[sp,#0] + ror x16,x24,#14 + add x27,x27,x19 // h+=K[i] + ror x2,x13,#1 + and x17,x25,x24 + ror x1,x10,#19 + bic x19,x26,x24 + ror x3,x20,#28 + add x27,x27,x11 // h+=X[i] + eor x16,x16,x24,ror#18 + eor x2,x2,x13,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x20,x21 // a^b, b^c in next round + eor x16,x16,x24,ror#41 // Sigma1(e) + eor x3,x3,x20,ror#34 + add x27,x27,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x1,x1,x10,ror#61 + eor x2,x2,x13,lsr#7 // sigma0(X[i+1]) + add x27,x27,x16 // h+=Sigma1(e) + eor x28,x28,x21 // Maj(a,b,c) + eor x17,x3,x20,ror#39 // Sigma0(a) + eor x1,x1,x10,lsr#6 // sigma1(X[i+14]) + add x12,x12,x5 + add x23,x23,x27 // d+=h + add x27,x27,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x12,x12,x2 + add x27,x27,x17 // h+=Sigma0(a) + add x12,x12,x1 + ldr x1,[sp,#16] + str x4,[sp,#8] + ror x16,x23,#14 + add x26,x26,x28 // h+=K[i] + ror x3,x14,#1 + and x17,x24,x23 + ror x2,x11,#19 + bic x28,x25,x23 + ror x4,x27,#28 + add x26,x26,x12 // h+=X[i] + eor x16,x16,x23,ror#18 + eor x3,x3,x14,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x27,x20 // a^b, b^c in next round + eor x16,x16,x23,ror#41 // Sigma1(e) + eor x4,x4,x27,ror#34 + add x26,x26,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x2,x2,x11,ror#61 + eor x3,x3,x14,lsr#7 // sigma0(X[i+1]) + add x26,x26,x16 
// h+=Sigma1(e) + eor x19,x19,x20 // Maj(a,b,c) + eor x17,x4,x27,ror#39 // Sigma0(a) + eor x2,x2,x11,lsr#6 // sigma1(X[i+14]) + add x13,x13,x6 + add x22,x22,x26 // d+=h + add x26,x26,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x13,x13,x3 + add x26,x26,x17 // h+=Sigma0(a) + add x13,x13,x2 + ldr x2,[sp,#24] + str x5,[sp,#16] + ror x16,x22,#14 + add x25,x25,x19 // h+=K[i] + ror x4,x15,#1 + and x17,x23,x22 + ror x3,x12,#19 + bic x19,x24,x22 + ror x5,x26,#28 + add x25,x25,x13 // h+=X[i] + eor x16,x16,x22,ror#18 + eor x4,x4,x15,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x26,x27 // a^b, b^c in next round + eor x16,x16,x22,ror#41 // Sigma1(e) + eor x5,x5,x26,ror#34 + add x25,x25,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x3,x3,x12,ror#61 + eor x4,x4,x15,lsr#7 // sigma0(X[i+1]) + add x25,x25,x16 // h+=Sigma1(e) + eor x28,x28,x27 // Maj(a,b,c) + eor x17,x5,x26,ror#39 // Sigma0(a) + eor x3,x3,x12,lsr#6 // sigma1(X[i+14]) + add x14,x14,x7 + add x21,x21,x25 // d+=h + add x25,x25,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x14,x14,x4 + add x25,x25,x17 // h+=Sigma0(a) + add x14,x14,x3 + ldr x3,[sp,#0] + str x6,[sp,#24] + ror x16,x21,#14 + add x24,x24,x28 // h+=K[i] + ror x5,x0,#1 + and x17,x22,x21 + ror x4,x13,#19 + bic x28,x23,x21 + ror x6,x25,#28 + add x24,x24,x14 // h+=X[i] + eor x16,x16,x21,ror#18 + eor x5,x5,x0,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x25,x26 // a^b, b^c in next round + eor x16,x16,x21,ror#41 // Sigma1(e) + eor x6,x6,x25,ror#34 + add x24,x24,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x4,x4,x13,ror#61 + eor x5,x5,x0,lsr#7 // sigma0(X[i+1]) + add x24,x24,x16 // h+=Sigma1(e) + eor x19,x19,x26 // Maj(a,b,c) + eor x17,x6,x25,ror#39 // Sigma0(a) + eor x4,x4,x13,lsr#6 // sigma1(X[i+14]) + add x15,x15,x8 + add x20,x20,x24 // d+=h + add x24,x24,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x15,x15,x5 + add x24,x24,x17 // h+=Sigma0(a) + add x15,x15,x4 + ldr x4,[sp,#8] + str x7,[sp,#0] + ror x16,x20,#14 + add x23,x23,x19 // h+=K[i] + ror x6,x1,#1 + and x17,x21,x20 + ror x5,x14,#19 + bic x19,x22,x20 + ror x7,x24,#28 + add x23,x23,x15 // h+=X[i] + eor x16,x16,x20,ror#18 + eor x6,x6,x1,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x24,x25 // a^b, b^c in next round + eor x16,x16,x20,ror#41 // Sigma1(e) + eor x7,x7,x24,ror#34 + add x23,x23,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x5,x5,x14,ror#61 + eor x6,x6,x1,lsr#7 // sigma0(X[i+1]) + add x23,x23,x16 // h+=Sigma1(e) + eor x28,x28,x25 // Maj(a,b,c) + eor x17,x7,x24,ror#39 // Sigma0(a) + eor x5,x5,x14,lsr#6 // sigma1(X[i+14]) + add x0,x0,x9 + add x27,x27,x23 // d+=h + add x23,x23,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x0,x0,x6 + add x23,x23,x17 // h+=Sigma0(a) + add x0,x0,x5 + ldr x5,[sp,#16] + str x8,[sp,#8] + ror x16,x27,#14 + add x22,x22,x28 // h+=K[i] + ror x7,x2,#1 + and x17,x20,x27 + ror x6,x15,#19 + bic x28,x21,x27 + ror x8,x23,#28 + add x22,x22,x0 // h+=X[i] + eor x16,x16,x27,ror#18 + eor x7,x7,x2,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x23,x24 // a^b, b^c in next round + eor x16,x16,x27,ror#41 // Sigma1(e) + eor x8,x8,x23,ror#34 + add x22,x22,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x6,x6,x15,ror#61 + eor x7,x7,x2,lsr#7 // sigma0(X[i+1]) + add x22,x22,x16 // h+=Sigma1(e) + eor x19,x19,x24 // Maj(a,b,c) + eor x17,x8,x23,ror#39 // Sigma0(a) + eor x6,x6,x15,lsr#6 // sigma1(X[i+14]) + add x1,x1,x10 + add x26,x26,x22 // d+=h + add 
x22,x22,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x1,x1,x7 + add x22,x22,x17 // h+=Sigma0(a) + add x1,x1,x6 + ldr x6,[sp,#24] + str x9,[sp,#16] + ror x16,x26,#14 + add x21,x21,x19 // h+=K[i] + ror x8,x3,#1 + and x17,x27,x26 + ror x7,x0,#19 + bic x19,x20,x26 + ror x9,x22,#28 + add x21,x21,x1 // h+=X[i] + eor x16,x16,x26,ror#18 + eor x8,x8,x3,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x22,x23 // a^b, b^c in next round + eor x16,x16,x26,ror#41 // Sigma1(e) + eor x9,x9,x22,ror#34 + add x21,x21,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x7,x7,x0,ror#61 + eor x8,x8,x3,lsr#7 // sigma0(X[i+1]) + add x21,x21,x16 // h+=Sigma1(e) + eor x28,x28,x23 // Maj(a,b,c) + eor x17,x9,x22,ror#39 // Sigma0(a) + eor x7,x7,x0,lsr#6 // sigma1(X[i+14]) + add x2,x2,x11 + add x25,x25,x21 // d+=h + add x21,x21,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x2,x2,x8 + add x21,x21,x17 // h+=Sigma0(a) + add x2,x2,x7 + ldr x7,[sp,#0] + str x10,[sp,#24] + ror x16,x25,#14 + add x20,x20,x28 // h+=K[i] + ror x9,x4,#1 + and x17,x26,x25 + ror x8,x1,#19 + bic x28,x27,x25 + ror x10,x21,#28 + add x20,x20,x2 // h+=X[i] + eor x16,x16,x25,ror#18 + eor x9,x9,x4,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x21,x22 // a^b, b^c in next round + eor x16,x16,x25,ror#41 // Sigma1(e) + eor x10,x10,x21,ror#34 + add x20,x20,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x8,x8,x1,ror#61 + eor x9,x9,x4,lsr#7 // sigma0(X[i+1]) + add x20,x20,x16 // h+=Sigma1(e) + eor x19,x19,x22 // Maj(a,b,c) + eor x17,x10,x21,ror#39 // Sigma0(a) + eor x8,x8,x1,lsr#6 // sigma1(X[i+14]) + add x3,x3,x12 + add x24,x24,x20 // d+=h + add x20,x20,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x3,x3,x9 + add x20,x20,x17 // h+=Sigma0(a) + add x3,x3,x8 + cbnz x19,Loop_16_xx + + ldp x0,x2,[x29,#96] + ldr x1,[x29,#112] + sub x30,x30,#648 // rewind + + ldp x3,x4,[x0] + ldp x5,x6,[x0,#2*8] + add x1,x1,#14*8 // advance input pointer + ldp x7,x8,[x0,#4*8] + add x20,x20,x3 + ldp x9,x10,[x0,#6*8] + add x21,x21,x4 + add x22,x22,x5 + add x23,x23,x6 + stp x20,x21,[x0] + add x24,x24,x7 + add x25,x25,x8 + stp x22,x23,[x0,#2*8] + add x26,x26,x9 + add x27,x27,x10 + cmp x1,x2 + stp x24,x25,[x0,#4*8] + stp x26,x27,[x0,#6*8] + b.ne Loop + + ldp x19,x20,[x29,#16] + add sp,sp,#4*8 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#128 + AARCH64_VALIDATE_LINK_REGISTER + ret + + +.section __TEXT,__const +.align 6 + +LK512: +.quad 0x428a2f98d728ae22,0x7137449123ef65cd +.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc +.quad 0x3956c25bf348b538,0x59f111f1b605d019 +.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 +.quad 0xd807aa98a3030242,0x12835b0145706fbe +.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 +.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 +.quad 0x9bdc06a725c71235,0xc19bf174cf692694 +.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 +.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 +.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 +.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 +.quad 0x983e5152ee66dfab,0xa831c66d2db43210 +.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 +.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 +.quad 0x06ca6351e003826f,0x142929670a0e6e70 +.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 +.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df +.quad 0x650a73548baf63de,0x766a0abb3c77b2a8 +.quad 0x81c2c92e47edaee6,0x92722c851482353b +.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 +.quad 0xc24b8b70d0f89791,0xc76c51a30654be30 +.quad 
0xd192e819d6ef5218,0xd69906245565a910 +.quad 0xf40e35855771202a,0x106aa07032bbd1b8 +.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 +.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 +.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb +.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 +.quad 0x748f82ee5defb2fc,0x78a5636f43172f60 +.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec +.quad 0x90befffa23631e28,0xa4506cebde82bde9 +.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b +.quad 0xca273eceea26619c,0xd186b8c721c0c207 +.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 +.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 +.quad 0x113f9804bef90dae,0x1b710b35131c471b +.quad 0x28db77f523047d84,0x32caab7b40c72493 +.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c +.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a +.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 +.quad 0 // terminator + +.byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 2 +.text +#ifndef __KERNEL__ +.globl _sha512_block_data_order_hw +.private_extern _sha512_block_data_order_hw + +.align 6 +_sha512_block_data_order_hw: + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + AARCH64_VALID_CALL_TARGET + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[x1],#64 // load input + ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 + + ld1 {v0.2d,v1.2d,v2.2d,v3.2d},[x0] // load context + adrp x3,LK512@PAGE + add x3,x3,LK512@PAGEOFF + + rev64 v16.16b,v16.16b + rev64 v17.16b,v17.16b + rev64 v18.16b,v18.16b + rev64 v19.16b,v19.16b + rev64 v20.16b,v20.16b + rev64 v21.16b,v21.16b + rev64 v22.16b,v22.16b + rev64 v23.16b,v23.16b + b Loop_hw + +.align 4 +Loop_hw: + ld1 {v24.2d},[x3],#16 + subs x2,x2,#1 + sub x4,x1,#128 + orr v26.16b,v0.16b,v0.16b // offload + orr v27.16b,v1.16b,v1.16b + orr v28.16b,v2.16b,v2.16b + orr v29.16b,v3.16b,v3.16b + csel x1,x1,x4,ne // conditional rewind + add v24.2d,v24.2d,v16.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 + add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec08230 //sha512su0 v16.16b,v17.16b + ext v7.16b,v20.16b,v21.16b,#8 +.long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b +.long 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b + add v4.2d,v1.2d,v3.2d // "D + T1" +.long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b + add v25.2d,v25.2d,v17.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v4.16b,v2.16b,#8 + ext v6.16b,v0.16b,v4.16b,#8 + add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec08251 //sha512su0 v17.16b,v18.16b + ext v7.16b,v21.16b,v22.16b,#8 +.long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b +.long 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b + add v1.2d,v0.2d,v2.2d // "D + T1" +.long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b + add v24.2d,v24.2d,v18.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec08272 //sha512su0 v18.16b,v19.16b + ext v7.16b,v22.16b,v23.16b,#8 +.long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b +.long 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b + add v0.2d,v3.2d,v4.2d // "D + T1" +.long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b + add v25.2d,v25.2d,v19.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext 
v6.16b,v2.16b,v0.16b,#8 + add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec08293 //sha512su0 v19.16b,v20.16b + ext v7.16b,v23.16b,v16.16b,#8 +.long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b +.long 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b + add v3.2d,v2.2d,v1.2d // "D + T1" +.long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b + add v24.2d,v24.2d,v20.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec082b4 //sha512su0 v20.16b,v21.16b + ext v7.16b,v16.16b,v17.16b,#8 +.long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b +.long 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b + add v2.2d,v4.2d,v0.2d // "D + T1" +.long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b + add v25.2d,v25.2d,v21.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 + add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec082d5 //sha512su0 v21.16b,v22.16b + ext v7.16b,v17.16b,v18.16b,#8 +.long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b +.long 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b + add v4.2d,v1.2d,v3.2d // "D + T1" +.long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b + add v24.2d,v24.2d,v22.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v4.16b,v2.16b,#8 + ext v6.16b,v0.16b,v4.16b,#8 + add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec082f6 //sha512su0 v22.16b,v23.16b + ext v7.16b,v18.16b,v19.16b,#8 +.long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b +.long 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b + add v1.2d,v0.2d,v2.2d // "D + T1" +.long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b + add v25.2d,v25.2d,v23.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec08217 //sha512su0 v23.16b,v16.16b + ext v7.16b,v19.16b,v20.16b,#8 +.long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b +.long 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b + add v0.2d,v3.2d,v4.2d // "D + T1" +.long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b + add v24.2d,v24.2d,v16.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext v6.16b,v2.16b,v0.16b,#8 + add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec08230 //sha512su0 v16.16b,v17.16b + ext v7.16b,v20.16b,v21.16b,#8 +.long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b +.long 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b + add v3.2d,v2.2d,v1.2d // "D + T1" +.long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b + add v25.2d,v25.2d,v17.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec08251 //sha512su0 v17.16b,v18.16b + ext v7.16b,v21.16b,v22.16b,#8 +.long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b +.long 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b + add v2.2d,v4.2d,v0.2d // "D + T1" +.long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b + add v24.2d,v24.2d,v18.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 + add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec08272 //sha512su0 v18.16b,v19.16b + ext v7.16b,v22.16b,v23.16b,#8 +.long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b +.long 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b + add v4.2d,v1.2d,v3.2d // "D + T1" +.long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b + add 
v25.2d,v25.2d,v19.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v4.16b,v2.16b,#8 + ext v6.16b,v0.16b,v4.16b,#8 + add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec08293 //sha512su0 v19.16b,v20.16b + ext v7.16b,v23.16b,v16.16b,#8 +.long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b +.long 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b + add v1.2d,v0.2d,v2.2d // "D + T1" +.long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b + add v24.2d,v24.2d,v20.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec082b4 //sha512su0 v20.16b,v21.16b + ext v7.16b,v16.16b,v17.16b,#8 +.long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b +.long 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b + add v0.2d,v3.2d,v4.2d // "D + T1" +.long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b + add v25.2d,v25.2d,v21.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext v6.16b,v2.16b,v0.16b,#8 + add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec082d5 //sha512su0 v21.16b,v22.16b + ext v7.16b,v17.16b,v18.16b,#8 +.long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b +.long 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b + add v3.2d,v2.2d,v1.2d // "D + T1" +.long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b + add v24.2d,v24.2d,v22.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec082f6 //sha512su0 v22.16b,v23.16b + ext v7.16b,v18.16b,v19.16b,#8 +.long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b +.long 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b + add v2.2d,v4.2d,v0.2d // "D + T1" +.long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b + add v25.2d,v25.2d,v23.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 + add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec08217 //sha512su0 v23.16b,v16.16b + ext v7.16b,v19.16b,v20.16b,#8 +.long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b +.long 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b + add v4.2d,v1.2d,v3.2d // "D + T1" +.long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b + add v24.2d,v24.2d,v16.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v4.16b,v2.16b,#8 + ext v6.16b,v0.16b,v4.16b,#8 + add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec08230 //sha512su0 v16.16b,v17.16b + ext v7.16b,v20.16b,v21.16b,#8 +.long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b +.long 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b + add v1.2d,v0.2d,v2.2d // "D + T1" +.long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b + add v25.2d,v25.2d,v17.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec08251 //sha512su0 v17.16b,v18.16b + ext v7.16b,v21.16b,v22.16b,#8 +.long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b +.long 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b + add v0.2d,v3.2d,v4.2d // "D + T1" +.long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b + add v24.2d,v24.2d,v18.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext v6.16b,v2.16b,v0.16b,#8 + add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec08272 //sha512su0 v18.16b,v19.16b + ext v7.16b,v22.16b,v23.16b,#8 +.long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b +.long 0xce678a32 //sha512su1 
v18.16b,v17.16b,v7.16b + add v3.2d,v2.2d,v1.2d // "D + T1" +.long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b + add v25.2d,v25.2d,v19.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec08293 //sha512su0 v19.16b,v20.16b + ext v7.16b,v23.16b,v16.16b,#8 +.long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b +.long 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b + add v2.2d,v4.2d,v0.2d // "D + T1" +.long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b + add v24.2d,v24.2d,v20.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 + add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec082b4 //sha512su0 v20.16b,v21.16b + ext v7.16b,v16.16b,v17.16b,#8 +.long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b +.long 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b + add v4.2d,v1.2d,v3.2d // "D + T1" +.long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b + add v25.2d,v25.2d,v21.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v4.16b,v2.16b,#8 + ext v6.16b,v0.16b,v4.16b,#8 + add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec082d5 //sha512su0 v21.16b,v22.16b + ext v7.16b,v17.16b,v18.16b,#8 +.long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b +.long 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b + add v1.2d,v0.2d,v2.2d // "D + T1" +.long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b + add v24.2d,v24.2d,v22.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec082f6 //sha512su0 v22.16b,v23.16b + ext v7.16b,v18.16b,v19.16b,#8 +.long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b +.long 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b + add v0.2d,v3.2d,v4.2d // "D + T1" +.long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b + add v25.2d,v25.2d,v23.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext v6.16b,v2.16b,v0.16b,#8 + add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec08217 //sha512su0 v23.16b,v16.16b + ext v7.16b,v19.16b,v20.16b,#8 +.long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b +.long 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b + add v3.2d,v2.2d,v1.2d // "D + T1" +.long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b + add v24.2d,v24.2d,v16.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec08230 //sha512su0 v16.16b,v17.16b + ext v7.16b,v20.16b,v21.16b,#8 +.long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b +.long 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b + add v2.2d,v4.2d,v0.2d // "D + T1" +.long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b + add v25.2d,v25.2d,v17.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 + add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec08251 //sha512su0 v17.16b,v18.16b + ext v7.16b,v21.16b,v22.16b,#8 +.long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b +.long 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b + add v4.2d,v1.2d,v3.2d // "D + T1" +.long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b + add v24.2d,v24.2d,v18.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v4.16b,v2.16b,#8 + ext v6.16b,v0.16b,v4.16b,#8 + add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec08272 //sha512su0 v18.16b,v19.16b + ext 
v7.16b,v22.16b,v23.16b,#8 +.long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b +.long 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b + add v1.2d,v0.2d,v2.2d // "D + T1" +.long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b + add v25.2d,v25.2d,v19.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec08293 //sha512su0 v19.16b,v20.16b + ext v7.16b,v23.16b,v16.16b,#8 +.long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b +.long 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b + add v0.2d,v3.2d,v4.2d // "D + T1" +.long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b + add v24.2d,v24.2d,v20.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext v6.16b,v2.16b,v0.16b,#8 + add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec082b4 //sha512su0 v20.16b,v21.16b + ext v7.16b,v16.16b,v17.16b,#8 +.long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b +.long 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b + add v3.2d,v2.2d,v1.2d // "D + T1" +.long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b + add v25.2d,v25.2d,v21.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec082d5 //sha512su0 v21.16b,v22.16b + ext v7.16b,v17.16b,v18.16b,#8 +.long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b +.long 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b + add v2.2d,v4.2d,v0.2d // "D + T1" +.long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b + add v24.2d,v24.2d,v22.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 + add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec082f6 //sha512su0 v22.16b,v23.16b + ext v7.16b,v18.16b,v19.16b,#8 +.long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b +.long 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b + add v4.2d,v1.2d,v3.2d // "D + T1" +.long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b + add v25.2d,v25.2d,v23.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v4.16b,v2.16b,#8 + ext v6.16b,v0.16b,v4.16b,#8 + add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec08217 //sha512su0 v23.16b,v16.16b + ext v7.16b,v19.16b,v20.16b,#8 +.long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b +.long 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b + add v1.2d,v0.2d,v2.2d // "D + T1" +.long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b + ld1 {v25.2d},[x3],#16 + add v24.2d,v24.2d,v16.2d + ld1 {v16.16b},[x1],#16 // load next input + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]" +.long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b + rev64 v16.16b,v16.16b + add v0.2d,v3.2d,v4.2d // "D + T1" +.long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b + ld1 {v24.2d},[x3],#16 + add v25.2d,v25.2d,v17.2d + ld1 {v17.16b},[x1],#16 // load next input + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext v6.16b,v2.16b,v0.16b,#8 + add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]" +.long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b + rev64 v17.16b,v17.16b + add v3.2d,v2.2d,v1.2d // "D + T1" +.long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b + ld1 {v25.2d},[x3],#16 + add v24.2d,v24.2d,v18.2d + ld1 {v18.16b},[x1],#16 // load next input + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]" +.long 0xce6680a0 //sha512h 
v0.16b,v5.16b,v6.16b + rev64 v18.16b,v18.16b + add v2.2d,v4.2d,v0.2d // "D + T1" +.long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b + ld1 {v24.2d},[x3],#16 + add v25.2d,v25.2d,v19.2d + ld1 {v19.16b},[x1],#16 // load next input + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 + add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]" +.long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b + rev64 v19.16b,v19.16b + add v4.2d,v1.2d,v3.2d // "D + T1" +.long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b + ld1 {v25.2d},[x3],#16 + add v24.2d,v24.2d,v20.2d + ld1 {v20.16b},[x1],#16 // load next input + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v4.16b,v2.16b,#8 + ext v6.16b,v0.16b,v4.16b,#8 + add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]" +.long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b + rev64 v20.16b,v20.16b + add v1.2d,v0.2d,v2.2d // "D + T1" +.long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b + ld1 {v24.2d},[x3],#16 + add v25.2d,v25.2d,v21.2d + ld1 {v21.16b},[x1],#16 // load next input + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]" +.long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b + rev64 v21.16b,v21.16b + add v0.2d,v3.2d,v4.2d // "D + T1" +.long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b + ld1 {v25.2d},[x3],#16 + add v24.2d,v24.2d,v22.2d + ld1 {v22.16b},[x1],#16 // load next input + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext v6.16b,v2.16b,v0.16b,#8 + add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]" +.long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b + rev64 v22.16b,v22.16b + add v3.2d,v2.2d,v1.2d // "D + T1" +.long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b + sub x3,x3,#80*8 // rewind + add v25.2d,v25.2d,v23.2d + ld1 {v23.16b},[x1],#16 // load next input + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]" +.long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b + rev64 v23.16b,v23.16b + add v2.2d,v4.2d,v0.2d // "D + T1" +.long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b + add v0.2d,v0.2d,v26.2d // accumulate + add v1.2d,v1.2d,v27.2d + add v2.2d,v2.2d,v28.2d + add v3.2d,v3.2d,v29.2d + + cbnz x2,Loop_hw + + st1 {v0.2d,v1.2d,v2.2d,v3.2d},[x0] // store context + + ldr x29,[sp],#16 + ret + +#endif +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__) diff --git a/ring-0.17.14/pregenerated/sha512-armv8-linux64.S b/ring-0.17.14/pregenerated/sha512-armv8-linux64.S new file mode 100644 index 0000000000..c1aa3797a6 --- /dev/null +++ b/ring-0.17.14/pregenerated/sha512-armv8-linux64.S @@ -0,0 +1,1598 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__) +// Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
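
The ELF (linux64) file introduced below appears to track the Mach-O (ios64) variant above instruction for instruction; the visible differences are confined to symbol decoration (.globl/.hidden/.type with adrp/:lo12: addressing for ELF, versus .private_extern with LK512@PAGE/@PAGEOFF for Mach-O). Judging from the prologues, x0 holds the eight-word hash state, x1 the input pointer, and x2 the block count (cf. "add x2,x1,x2,lsl#7 // end of input", i.e. 128-byte blocks), so the entry points share one C-style signature. The sketch below shows how such an entry point could be declared and called from Rust; the declaration is inferred from the assembly's register use, and the wrapper is hypothetical, not ring's actual binding or dispatch code.

// Hypothetical FFI declaration inferred from the assembly's argument use;
// ring's real wrapper may differ in name, visibility, and CPU dispatch.
extern "C" {
    // state: 8 x u64 hash words (a..h); data: message bytes; num: count of 128-byte blocks
    fn sha512_block_data_order_nohw(state: *mut u64, data: *const u8, num: usize);
}

// Illustrative safe wrapper (assumption, not ring API): feed whole 128-byte blocks only.
fn sha512_compress(state: &mut [u64; 8], blocks: &[u8]) {
    assert_eq!(blocks.len() % 128, 0, "input must be whole 128-byte blocks");
    let num = blocks.len() / 128;
    if num > 0 {
        unsafe { sha512_block_data_order_nohw(state.as_mut_ptr(), blocks.as_ptr(), num) };
    }
}
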
+ +// ==================================================================== +// Written by Andy Polyakov for the OpenSSL +// project. +// ==================================================================== +// +// SHA256/512 for ARMv8. +// +// Performance in cycles per processed byte and improvement coefficient +// over code generated with "default" compiler: +// +// SHA256-hw SHA256(*) SHA512 +// Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**)) +// Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***)) +// Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***)) +// Denver 2.01 10.5 (+26%) 6.70 (+8%) +// X-Gene 20.0 (+100%) 12.8 (+300%(***)) +// Mongoose 2.36 13.0 (+50%) 8.36 (+33%) +// Kryo 1.92 17.4 (+30%) 11.2 (+8%) +// +// (*) Software SHA256 results are of lesser relevance, presented +// mostly for informational purposes. +// (**) The result is a trade-off: it's possible to improve it by +// 10% (or by 1 cycle per round), but at the cost of 20% loss +// on Cortex-A53 (or by 4 cycles per round). +// (***) Super-impressive coefficients over gcc-generated code are +// indication of some compiler "pathology", most notably code +// generated with -mgeneral-regs-only is significantly faster +// and the gap is only 40-90%. + +#ifndef __KERNEL__ +#endif + +.text + +.globl sha512_block_data_order_nohw +.hidden sha512_block_data_order_nohw +.type sha512_block_data_order_nohw,%function +.align 6 +sha512_block_data_order_nohw: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#4*8 + + ldp x20,x21,[x0] // load context + ldp x22,x23,[x0,#2*8] + ldp x24,x25,[x0,#4*8] + add x2,x1,x2,lsl#7 // end of input + ldp x26,x27,[x0,#6*8] + adrp x30,.LK512 + add x30,x30,:lo12:.LK512 + stp x0,x2,[x29,#96] + +.Loop: + ldp x3,x4,[x1],#2*8 + ldr x19,[x30],#8 // *K++ + eor x28,x21,x22 // magic seed + str x1,[x29,#112] +#ifndef __AARCH64EB__ + rev x3,x3 // 0 +#endif + ror x16,x24,#14 + add x27,x27,x19 // h+=K[i] + eor x6,x24,x24,ror#23 + and x17,x25,x24 + bic x19,x26,x24 + add x27,x27,x3 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x20,x21 // a^b, b^c in next round + eor x16,x16,x6,ror#18 // Sigma1(e) + ror x6,x20,#28 + add x27,x27,x17 // h+=Ch(e,f,g) + eor x17,x20,x20,ror#5 + add x27,x27,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x23,x23,x27 // d+=h + eor x28,x28,x21 // Maj(a,b,c) + eor x17,x6,x17,ror#34 // Sigma0(a) + add x27,x27,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x27,x27,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x4,x4 // 1 +#endif + ldp x5,x6,[x1],#2*8 + add x27,x27,x17 // h+=Sigma0(a) + ror x16,x23,#14 + add x26,x26,x28 // h+=K[i] + eor x7,x23,x23,ror#23 + and x17,x24,x23 + bic x28,x25,x23 + add x26,x26,x4 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x27,x20 // a^b, b^c in next round + eor x16,x16,x7,ror#18 // Sigma1(e) + ror x7,x27,#28 + add x26,x26,x17 // h+=Ch(e,f,g) + eor x17,x27,x27,ror#5 + add x26,x26,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x22,x22,x26 // d+=h + eor x19,x19,x20 // Maj(a,b,c) + eor x17,x7,x17,ror#34 // Sigma0(a) + add x26,x26,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x26,x26,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x5,x5 // 2 +#endif + add x26,x26,x17 // h+=Sigma0(a) + ror x16,x22,#14 + add x25,x25,x19 // h+=K[i] + eor x8,x22,x22,ror#23 + and x17,x23,x22 + bic x19,x24,x22 + add x25,x25,x5 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + 
eor x19,x26,x27 // a^b, b^c in next round + eor x16,x16,x8,ror#18 // Sigma1(e) + ror x8,x26,#28 + add x25,x25,x17 // h+=Ch(e,f,g) + eor x17,x26,x26,ror#5 + add x25,x25,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x21,x21,x25 // d+=h + eor x28,x28,x27 // Maj(a,b,c) + eor x17,x8,x17,ror#34 // Sigma0(a) + add x25,x25,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x25,x25,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x6,x6 // 3 +#endif + ldp x7,x8,[x1],#2*8 + add x25,x25,x17 // h+=Sigma0(a) + ror x16,x21,#14 + add x24,x24,x28 // h+=K[i] + eor x9,x21,x21,ror#23 + and x17,x22,x21 + bic x28,x23,x21 + add x24,x24,x6 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x25,x26 // a^b, b^c in next round + eor x16,x16,x9,ror#18 // Sigma1(e) + ror x9,x25,#28 + add x24,x24,x17 // h+=Ch(e,f,g) + eor x17,x25,x25,ror#5 + add x24,x24,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x20,x20,x24 // d+=h + eor x19,x19,x26 // Maj(a,b,c) + eor x17,x9,x17,ror#34 // Sigma0(a) + add x24,x24,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x24,x24,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x7,x7 // 4 +#endif + add x24,x24,x17 // h+=Sigma0(a) + ror x16,x20,#14 + add x23,x23,x19 // h+=K[i] + eor x10,x20,x20,ror#23 + and x17,x21,x20 + bic x19,x22,x20 + add x23,x23,x7 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x24,x25 // a^b, b^c in next round + eor x16,x16,x10,ror#18 // Sigma1(e) + ror x10,x24,#28 + add x23,x23,x17 // h+=Ch(e,f,g) + eor x17,x24,x24,ror#5 + add x23,x23,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x27,x27,x23 // d+=h + eor x28,x28,x25 // Maj(a,b,c) + eor x17,x10,x17,ror#34 // Sigma0(a) + add x23,x23,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x23,x23,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x8,x8 // 5 +#endif + ldp x9,x10,[x1],#2*8 + add x23,x23,x17 // h+=Sigma0(a) + ror x16,x27,#14 + add x22,x22,x28 // h+=K[i] + eor x11,x27,x27,ror#23 + and x17,x20,x27 + bic x28,x21,x27 + add x22,x22,x8 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x23,x24 // a^b, b^c in next round + eor x16,x16,x11,ror#18 // Sigma1(e) + ror x11,x23,#28 + add x22,x22,x17 // h+=Ch(e,f,g) + eor x17,x23,x23,ror#5 + add x22,x22,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x26,x26,x22 // d+=h + eor x19,x19,x24 // Maj(a,b,c) + eor x17,x11,x17,ror#34 // Sigma0(a) + add x22,x22,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x22,x22,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x9,x9 // 6 +#endif + add x22,x22,x17 // h+=Sigma0(a) + ror x16,x26,#14 + add x21,x21,x19 // h+=K[i] + eor x12,x26,x26,ror#23 + and x17,x27,x26 + bic x19,x20,x26 + add x21,x21,x9 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x22,x23 // a^b, b^c in next round + eor x16,x16,x12,ror#18 // Sigma1(e) + ror x12,x22,#28 + add x21,x21,x17 // h+=Ch(e,f,g) + eor x17,x22,x22,ror#5 + add x21,x21,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x25,x25,x21 // d+=h + eor x28,x28,x23 // Maj(a,b,c) + eor x17,x12,x17,ror#34 // Sigma0(a) + add x21,x21,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x21,x21,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x10,x10 // 7 +#endif + ldp x11,x12,[x1],#2*8 + add x21,x21,x17 // h+=Sigma0(a) + ror x16,x25,#14 + add x20,x20,x28 // h+=K[i] + eor x13,x25,x25,ror#23 + and x17,x26,x25 + bic x28,x27,x25 + add x20,x20,x10 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x21,x22 // a^b, b^c in next round + eor 
x16,x16,x13,ror#18 // Sigma1(e) + ror x13,x21,#28 + add x20,x20,x17 // h+=Ch(e,f,g) + eor x17,x21,x21,ror#5 + add x20,x20,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x24,x24,x20 // d+=h + eor x19,x19,x22 // Maj(a,b,c) + eor x17,x13,x17,ror#34 // Sigma0(a) + add x20,x20,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x20,x20,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x11,x11 // 8 +#endif + add x20,x20,x17 // h+=Sigma0(a) + ror x16,x24,#14 + add x27,x27,x19 // h+=K[i] + eor x14,x24,x24,ror#23 + and x17,x25,x24 + bic x19,x26,x24 + add x27,x27,x11 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x20,x21 // a^b, b^c in next round + eor x16,x16,x14,ror#18 // Sigma1(e) + ror x14,x20,#28 + add x27,x27,x17 // h+=Ch(e,f,g) + eor x17,x20,x20,ror#5 + add x27,x27,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x23,x23,x27 // d+=h + eor x28,x28,x21 // Maj(a,b,c) + eor x17,x14,x17,ror#34 // Sigma0(a) + add x27,x27,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x27,x27,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x12,x12 // 9 +#endif + ldp x13,x14,[x1],#2*8 + add x27,x27,x17 // h+=Sigma0(a) + ror x16,x23,#14 + add x26,x26,x28 // h+=K[i] + eor x15,x23,x23,ror#23 + and x17,x24,x23 + bic x28,x25,x23 + add x26,x26,x12 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x27,x20 // a^b, b^c in next round + eor x16,x16,x15,ror#18 // Sigma1(e) + ror x15,x27,#28 + add x26,x26,x17 // h+=Ch(e,f,g) + eor x17,x27,x27,ror#5 + add x26,x26,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x22,x22,x26 // d+=h + eor x19,x19,x20 // Maj(a,b,c) + eor x17,x15,x17,ror#34 // Sigma0(a) + add x26,x26,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x26,x26,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x13,x13 // 10 +#endif + add x26,x26,x17 // h+=Sigma0(a) + ror x16,x22,#14 + add x25,x25,x19 // h+=K[i] + eor x0,x22,x22,ror#23 + and x17,x23,x22 + bic x19,x24,x22 + add x25,x25,x13 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x26,x27 // a^b, b^c in next round + eor x16,x16,x0,ror#18 // Sigma1(e) + ror x0,x26,#28 + add x25,x25,x17 // h+=Ch(e,f,g) + eor x17,x26,x26,ror#5 + add x25,x25,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x21,x21,x25 // d+=h + eor x28,x28,x27 // Maj(a,b,c) + eor x17,x0,x17,ror#34 // Sigma0(a) + add x25,x25,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x25,x25,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x14,x14 // 11 +#endif + ldp x15,x0,[x1],#2*8 + add x25,x25,x17 // h+=Sigma0(a) + str x6,[sp,#24] + ror x16,x21,#14 + add x24,x24,x28 // h+=K[i] + eor x6,x21,x21,ror#23 + and x17,x22,x21 + bic x28,x23,x21 + add x24,x24,x14 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x25,x26 // a^b, b^c in next round + eor x16,x16,x6,ror#18 // Sigma1(e) + ror x6,x25,#28 + add x24,x24,x17 // h+=Ch(e,f,g) + eor x17,x25,x25,ror#5 + add x24,x24,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x20,x20,x24 // d+=h + eor x19,x19,x26 // Maj(a,b,c) + eor x17,x6,x17,ror#34 // Sigma0(a) + add x24,x24,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x24,x24,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x15,x15 // 12 +#endif + add x24,x24,x17 // h+=Sigma0(a) + str x7,[sp,#0] + ror x16,x20,#14 + add x23,x23,x19 // h+=K[i] + eor x7,x20,x20,ror#23 + and x17,x21,x20 + bic x19,x22,x20 + add x23,x23,x15 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x24,x25 // a^b, b^c in next round + eor x16,x16,x7,ror#18 // 
Sigma1(e) + ror x7,x24,#28 + add x23,x23,x17 // h+=Ch(e,f,g) + eor x17,x24,x24,ror#5 + add x23,x23,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x27,x27,x23 // d+=h + eor x28,x28,x25 // Maj(a,b,c) + eor x17,x7,x17,ror#34 // Sigma0(a) + add x23,x23,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x23,x23,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x0,x0 // 13 +#endif + ldp x1,x2,[x1] + add x23,x23,x17 // h+=Sigma0(a) + str x8,[sp,#8] + ror x16,x27,#14 + add x22,x22,x28 // h+=K[i] + eor x8,x27,x27,ror#23 + and x17,x20,x27 + bic x28,x21,x27 + add x22,x22,x0 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x23,x24 // a^b, b^c in next round + eor x16,x16,x8,ror#18 // Sigma1(e) + ror x8,x23,#28 + add x22,x22,x17 // h+=Ch(e,f,g) + eor x17,x23,x23,ror#5 + add x22,x22,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x26,x26,x22 // d+=h + eor x19,x19,x24 // Maj(a,b,c) + eor x17,x8,x17,ror#34 // Sigma0(a) + add x22,x22,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x22,x22,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x1,x1 // 14 +#endif + ldr x6,[sp,#24] + add x22,x22,x17 // h+=Sigma0(a) + str x9,[sp,#16] + ror x16,x26,#14 + add x21,x21,x19 // h+=K[i] + eor x9,x26,x26,ror#23 + and x17,x27,x26 + bic x19,x20,x26 + add x21,x21,x1 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x22,x23 // a^b, b^c in next round + eor x16,x16,x9,ror#18 // Sigma1(e) + ror x9,x22,#28 + add x21,x21,x17 // h+=Ch(e,f,g) + eor x17,x22,x22,ror#5 + add x21,x21,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x25,x25,x21 // d+=h + eor x28,x28,x23 // Maj(a,b,c) + eor x17,x9,x17,ror#34 // Sigma0(a) + add x21,x21,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x21,x21,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x2,x2 // 15 +#endif + ldr x7,[sp,#0] + add x21,x21,x17 // h+=Sigma0(a) + str x10,[sp,#24] + ror x16,x25,#14 + add x20,x20,x28 // h+=K[i] + ror x9,x4,#1 + and x17,x26,x25 + ror x8,x1,#19 + bic x28,x27,x25 + ror x10,x21,#28 + add x20,x20,x2 // h+=X[i] + eor x16,x16,x25,ror#18 + eor x9,x9,x4,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x21,x22 // a^b, b^c in next round + eor x16,x16,x25,ror#41 // Sigma1(e) + eor x10,x10,x21,ror#34 + add x20,x20,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x8,x8,x1,ror#61 + eor x9,x9,x4,lsr#7 // sigma0(X[i+1]) + add x20,x20,x16 // h+=Sigma1(e) + eor x19,x19,x22 // Maj(a,b,c) + eor x17,x10,x21,ror#39 // Sigma0(a) + eor x8,x8,x1,lsr#6 // sigma1(X[i+14]) + add x3,x3,x12 + add x24,x24,x20 // d+=h + add x20,x20,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x3,x3,x9 + add x20,x20,x17 // h+=Sigma0(a) + add x3,x3,x8 +.Loop_16_xx: + ldr x8,[sp,#8] + str x11,[sp,#0] + ror x16,x24,#14 + add x27,x27,x19 // h+=K[i] + ror x10,x5,#1 + and x17,x25,x24 + ror x9,x2,#19 + bic x19,x26,x24 + ror x11,x20,#28 + add x27,x27,x3 // h+=X[i] + eor x16,x16,x24,ror#18 + eor x10,x10,x5,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x20,x21 // a^b, b^c in next round + eor x16,x16,x24,ror#41 // Sigma1(e) + eor x11,x11,x20,ror#34 + add x27,x27,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x9,x9,x2,ror#61 + eor x10,x10,x5,lsr#7 // sigma0(X[i+1]) + add x27,x27,x16 // h+=Sigma1(e) + eor x28,x28,x21 // Maj(a,b,c) + eor x17,x11,x20,ror#39 // Sigma0(a) + eor x9,x9,x2,lsr#6 // sigma1(X[i+14]) + add x4,x4,x13 + add x23,x23,x27 // d+=h + add x27,x27,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x4,x4,x10 + add 
x27,x27,x17 // h+=Sigma0(a) + add x4,x4,x9 + ldr x9,[sp,#16] + str x12,[sp,#8] + ror x16,x23,#14 + add x26,x26,x28 // h+=K[i] + ror x11,x6,#1 + and x17,x24,x23 + ror x10,x3,#19 + bic x28,x25,x23 + ror x12,x27,#28 + add x26,x26,x4 // h+=X[i] + eor x16,x16,x23,ror#18 + eor x11,x11,x6,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x27,x20 // a^b, b^c in next round + eor x16,x16,x23,ror#41 // Sigma1(e) + eor x12,x12,x27,ror#34 + add x26,x26,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x10,x10,x3,ror#61 + eor x11,x11,x6,lsr#7 // sigma0(X[i+1]) + add x26,x26,x16 // h+=Sigma1(e) + eor x19,x19,x20 // Maj(a,b,c) + eor x17,x12,x27,ror#39 // Sigma0(a) + eor x10,x10,x3,lsr#6 // sigma1(X[i+14]) + add x5,x5,x14 + add x22,x22,x26 // d+=h + add x26,x26,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x5,x5,x11 + add x26,x26,x17 // h+=Sigma0(a) + add x5,x5,x10 + ldr x10,[sp,#24] + str x13,[sp,#16] + ror x16,x22,#14 + add x25,x25,x19 // h+=K[i] + ror x12,x7,#1 + and x17,x23,x22 + ror x11,x4,#19 + bic x19,x24,x22 + ror x13,x26,#28 + add x25,x25,x5 // h+=X[i] + eor x16,x16,x22,ror#18 + eor x12,x12,x7,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x26,x27 // a^b, b^c in next round + eor x16,x16,x22,ror#41 // Sigma1(e) + eor x13,x13,x26,ror#34 + add x25,x25,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x11,x11,x4,ror#61 + eor x12,x12,x7,lsr#7 // sigma0(X[i+1]) + add x25,x25,x16 // h+=Sigma1(e) + eor x28,x28,x27 // Maj(a,b,c) + eor x17,x13,x26,ror#39 // Sigma0(a) + eor x11,x11,x4,lsr#6 // sigma1(X[i+14]) + add x6,x6,x15 + add x21,x21,x25 // d+=h + add x25,x25,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x6,x6,x12 + add x25,x25,x17 // h+=Sigma0(a) + add x6,x6,x11 + ldr x11,[sp,#0] + str x14,[sp,#24] + ror x16,x21,#14 + add x24,x24,x28 // h+=K[i] + ror x13,x8,#1 + and x17,x22,x21 + ror x12,x5,#19 + bic x28,x23,x21 + ror x14,x25,#28 + add x24,x24,x6 // h+=X[i] + eor x16,x16,x21,ror#18 + eor x13,x13,x8,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x25,x26 // a^b, b^c in next round + eor x16,x16,x21,ror#41 // Sigma1(e) + eor x14,x14,x25,ror#34 + add x24,x24,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x12,x12,x5,ror#61 + eor x13,x13,x8,lsr#7 // sigma0(X[i+1]) + add x24,x24,x16 // h+=Sigma1(e) + eor x19,x19,x26 // Maj(a,b,c) + eor x17,x14,x25,ror#39 // Sigma0(a) + eor x12,x12,x5,lsr#6 // sigma1(X[i+14]) + add x7,x7,x0 + add x20,x20,x24 // d+=h + add x24,x24,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x7,x7,x13 + add x24,x24,x17 // h+=Sigma0(a) + add x7,x7,x12 + ldr x12,[sp,#8] + str x15,[sp,#0] + ror x16,x20,#14 + add x23,x23,x19 // h+=K[i] + ror x14,x9,#1 + and x17,x21,x20 + ror x13,x6,#19 + bic x19,x22,x20 + ror x15,x24,#28 + add x23,x23,x7 // h+=X[i] + eor x16,x16,x20,ror#18 + eor x14,x14,x9,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x24,x25 // a^b, b^c in next round + eor x16,x16,x20,ror#41 // Sigma1(e) + eor x15,x15,x24,ror#34 + add x23,x23,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x13,x13,x6,ror#61 + eor x14,x14,x9,lsr#7 // sigma0(X[i+1]) + add x23,x23,x16 // h+=Sigma1(e) + eor x28,x28,x25 // Maj(a,b,c) + eor x17,x15,x24,ror#39 // Sigma0(a) + eor x13,x13,x6,lsr#6 // sigma1(X[i+14]) + add x8,x8,x1 + add x27,x27,x23 // d+=h + add x23,x23,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x8,x8,x14 + add x23,x23,x17 // h+=Sigma0(a) + add x8,x8,x13 + ldr x13,[sp,#16] + str x0,[sp,#8] + ror x16,x27,#14 + add x22,x22,x28 // h+=K[i] + ror x15,x10,#1 + 
and x17,x20,x27 + ror x14,x7,#19 + bic x28,x21,x27 + ror x0,x23,#28 + add x22,x22,x8 // h+=X[i] + eor x16,x16,x27,ror#18 + eor x15,x15,x10,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x23,x24 // a^b, b^c in next round + eor x16,x16,x27,ror#41 // Sigma1(e) + eor x0,x0,x23,ror#34 + add x22,x22,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x14,x14,x7,ror#61 + eor x15,x15,x10,lsr#7 // sigma0(X[i+1]) + add x22,x22,x16 // h+=Sigma1(e) + eor x19,x19,x24 // Maj(a,b,c) + eor x17,x0,x23,ror#39 // Sigma0(a) + eor x14,x14,x7,lsr#6 // sigma1(X[i+14]) + add x9,x9,x2 + add x26,x26,x22 // d+=h + add x22,x22,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x9,x9,x15 + add x22,x22,x17 // h+=Sigma0(a) + add x9,x9,x14 + ldr x14,[sp,#24] + str x1,[sp,#16] + ror x16,x26,#14 + add x21,x21,x19 // h+=K[i] + ror x0,x11,#1 + and x17,x27,x26 + ror x15,x8,#19 + bic x19,x20,x26 + ror x1,x22,#28 + add x21,x21,x9 // h+=X[i] + eor x16,x16,x26,ror#18 + eor x0,x0,x11,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x22,x23 // a^b, b^c in next round + eor x16,x16,x26,ror#41 // Sigma1(e) + eor x1,x1,x22,ror#34 + add x21,x21,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x15,x15,x8,ror#61 + eor x0,x0,x11,lsr#7 // sigma0(X[i+1]) + add x21,x21,x16 // h+=Sigma1(e) + eor x28,x28,x23 // Maj(a,b,c) + eor x17,x1,x22,ror#39 // Sigma0(a) + eor x15,x15,x8,lsr#6 // sigma1(X[i+14]) + add x10,x10,x3 + add x25,x25,x21 // d+=h + add x21,x21,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x10,x10,x0 + add x21,x21,x17 // h+=Sigma0(a) + add x10,x10,x15 + ldr x15,[sp,#0] + str x2,[sp,#24] + ror x16,x25,#14 + add x20,x20,x28 // h+=K[i] + ror x1,x12,#1 + and x17,x26,x25 + ror x0,x9,#19 + bic x28,x27,x25 + ror x2,x21,#28 + add x20,x20,x10 // h+=X[i] + eor x16,x16,x25,ror#18 + eor x1,x1,x12,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x21,x22 // a^b, b^c in next round + eor x16,x16,x25,ror#41 // Sigma1(e) + eor x2,x2,x21,ror#34 + add x20,x20,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x0,x0,x9,ror#61 + eor x1,x1,x12,lsr#7 // sigma0(X[i+1]) + add x20,x20,x16 // h+=Sigma1(e) + eor x19,x19,x22 // Maj(a,b,c) + eor x17,x2,x21,ror#39 // Sigma0(a) + eor x0,x0,x9,lsr#6 // sigma1(X[i+14]) + add x11,x11,x4 + add x24,x24,x20 // d+=h + add x20,x20,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x11,x11,x1 + add x20,x20,x17 // h+=Sigma0(a) + add x11,x11,x0 + ldr x0,[sp,#8] + str x3,[sp,#0] + ror x16,x24,#14 + add x27,x27,x19 // h+=K[i] + ror x2,x13,#1 + and x17,x25,x24 + ror x1,x10,#19 + bic x19,x26,x24 + ror x3,x20,#28 + add x27,x27,x11 // h+=X[i] + eor x16,x16,x24,ror#18 + eor x2,x2,x13,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x20,x21 // a^b, b^c in next round + eor x16,x16,x24,ror#41 // Sigma1(e) + eor x3,x3,x20,ror#34 + add x27,x27,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x1,x1,x10,ror#61 + eor x2,x2,x13,lsr#7 // sigma0(X[i+1]) + add x27,x27,x16 // h+=Sigma1(e) + eor x28,x28,x21 // Maj(a,b,c) + eor x17,x3,x20,ror#39 // Sigma0(a) + eor x1,x1,x10,lsr#6 // sigma1(X[i+14]) + add x12,x12,x5 + add x23,x23,x27 // d+=h + add x27,x27,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x12,x12,x2 + add x27,x27,x17 // h+=Sigma0(a) + add x12,x12,x1 + ldr x1,[sp,#16] + str x4,[sp,#8] + ror x16,x23,#14 + add x26,x26,x28 // h+=K[i] + ror x3,x14,#1 + and x17,x24,x23 + ror x2,x11,#19 + bic x28,x25,x23 + ror x4,x27,#28 + add x26,x26,x12 // h+=X[i] + eor x16,x16,x23,ror#18 + eor x3,x3,x14,ror#8 + orr x17,x17,x28 // 
Ch(e,f,g) + eor x28,x27,x20 // a^b, b^c in next round + eor x16,x16,x23,ror#41 // Sigma1(e) + eor x4,x4,x27,ror#34 + add x26,x26,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x2,x2,x11,ror#61 + eor x3,x3,x14,lsr#7 // sigma0(X[i+1]) + add x26,x26,x16 // h+=Sigma1(e) + eor x19,x19,x20 // Maj(a,b,c) + eor x17,x4,x27,ror#39 // Sigma0(a) + eor x2,x2,x11,lsr#6 // sigma1(X[i+14]) + add x13,x13,x6 + add x22,x22,x26 // d+=h + add x26,x26,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x13,x13,x3 + add x26,x26,x17 // h+=Sigma0(a) + add x13,x13,x2 + ldr x2,[sp,#24] + str x5,[sp,#16] + ror x16,x22,#14 + add x25,x25,x19 // h+=K[i] + ror x4,x15,#1 + and x17,x23,x22 + ror x3,x12,#19 + bic x19,x24,x22 + ror x5,x26,#28 + add x25,x25,x13 // h+=X[i] + eor x16,x16,x22,ror#18 + eor x4,x4,x15,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x26,x27 // a^b, b^c in next round + eor x16,x16,x22,ror#41 // Sigma1(e) + eor x5,x5,x26,ror#34 + add x25,x25,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x3,x3,x12,ror#61 + eor x4,x4,x15,lsr#7 // sigma0(X[i+1]) + add x25,x25,x16 // h+=Sigma1(e) + eor x28,x28,x27 // Maj(a,b,c) + eor x17,x5,x26,ror#39 // Sigma0(a) + eor x3,x3,x12,lsr#6 // sigma1(X[i+14]) + add x14,x14,x7 + add x21,x21,x25 // d+=h + add x25,x25,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x14,x14,x4 + add x25,x25,x17 // h+=Sigma0(a) + add x14,x14,x3 + ldr x3,[sp,#0] + str x6,[sp,#24] + ror x16,x21,#14 + add x24,x24,x28 // h+=K[i] + ror x5,x0,#1 + and x17,x22,x21 + ror x4,x13,#19 + bic x28,x23,x21 + ror x6,x25,#28 + add x24,x24,x14 // h+=X[i] + eor x16,x16,x21,ror#18 + eor x5,x5,x0,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x25,x26 // a^b, b^c in next round + eor x16,x16,x21,ror#41 // Sigma1(e) + eor x6,x6,x25,ror#34 + add x24,x24,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x4,x4,x13,ror#61 + eor x5,x5,x0,lsr#7 // sigma0(X[i+1]) + add x24,x24,x16 // h+=Sigma1(e) + eor x19,x19,x26 // Maj(a,b,c) + eor x17,x6,x25,ror#39 // Sigma0(a) + eor x4,x4,x13,lsr#6 // sigma1(X[i+14]) + add x15,x15,x8 + add x20,x20,x24 // d+=h + add x24,x24,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x15,x15,x5 + add x24,x24,x17 // h+=Sigma0(a) + add x15,x15,x4 + ldr x4,[sp,#8] + str x7,[sp,#0] + ror x16,x20,#14 + add x23,x23,x19 // h+=K[i] + ror x6,x1,#1 + and x17,x21,x20 + ror x5,x14,#19 + bic x19,x22,x20 + ror x7,x24,#28 + add x23,x23,x15 // h+=X[i] + eor x16,x16,x20,ror#18 + eor x6,x6,x1,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x24,x25 // a^b, b^c in next round + eor x16,x16,x20,ror#41 // Sigma1(e) + eor x7,x7,x24,ror#34 + add x23,x23,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x5,x5,x14,ror#61 + eor x6,x6,x1,lsr#7 // sigma0(X[i+1]) + add x23,x23,x16 // h+=Sigma1(e) + eor x28,x28,x25 // Maj(a,b,c) + eor x17,x7,x24,ror#39 // Sigma0(a) + eor x5,x5,x14,lsr#6 // sigma1(X[i+14]) + add x0,x0,x9 + add x27,x27,x23 // d+=h + add x23,x23,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x0,x0,x6 + add x23,x23,x17 // h+=Sigma0(a) + add x0,x0,x5 + ldr x5,[sp,#16] + str x8,[sp,#8] + ror x16,x27,#14 + add x22,x22,x28 // h+=K[i] + ror x7,x2,#1 + and x17,x20,x27 + ror x6,x15,#19 + bic x28,x21,x27 + ror x8,x23,#28 + add x22,x22,x0 // h+=X[i] + eor x16,x16,x27,ror#18 + eor x7,x7,x2,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x23,x24 // a^b, b^c in next round + eor x16,x16,x27,ror#41 // Sigma1(e) + eor x8,x8,x23,ror#34 + add x22,x22,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // 
(b^c)&=(a^b) + eor x6,x6,x15,ror#61 + eor x7,x7,x2,lsr#7 // sigma0(X[i+1]) + add x22,x22,x16 // h+=Sigma1(e) + eor x19,x19,x24 // Maj(a,b,c) + eor x17,x8,x23,ror#39 // Sigma0(a) + eor x6,x6,x15,lsr#6 // sigma1(X[i+14]) + add x1,x1,x10 + add x26,x26,x22 // d+=h + add x22,x22,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x1,x1,x7 + add x22,x22,x17 // h+=Sigma0(a) + add x1,x1,x6 + ldr x6,[sp,#24] + str x9,[sp,#16] + ror x16,x26,#14 + add x21,x21,x19 // h+=K[i] + ror x8,x3,#1 + and x17,x27,x26 + ror x7,x0,#19 + bic x19,x20,x26 + ror x9,x22,#28 + add x21,x21,x1 // h+=X[i] + eor x16,x16,x26,ror#18 + eor x8,x8,x3,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x22,x23 // a^b, b^c in next round + eor x16,x16,x26,ror#41 // Sigma1(e) + eor x9,x9,x22,ror#34 + add x21,x21,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x7,x7,x0,ror#61 + eor x8,x8,x3,lsr#7 // sigma0(X[i+1]) + add x21,x21,x16 // h+=Sigma1(e) + eor x28,x28,x23 // Maj(a,b,c) + eor x17,x9,x22,ror#39 // Sigma0(a) + eor x7,x7,x0,lsr#6 // sigma1(X[i+14]) + add x2,x2,x11 + add x25,x25,x21 // d+=h + add x21,x21,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x2,x2,x8 + add x21,x21,x17 // h+=Sigma0(a) + add x2,x2,x7 + ldr x7,[sp,#0] + str x10,[sp,#24] + ror x16,x25,#14 + add x20,x20,x28 // h+=K[i] + ror x9,x4,#1 + and x17,x26,x25 + ror x8,x1,#19 + bic x28,x27,x25 + ror x10,x21,#28 + add x20,x20,x2 // h+=X[i] + eor x16,x16,x25,ror#18 + eor x9,x9,x4,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x21,x22 // a^b, b^c in next round + eor x16,x16,x25,ror#41 // Sigma1(e) + eor x10,x10,x21,ror#34 + add x20,x20,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x8,x8,x1,ror#61 + eor x9,x9,x4,lsr#7 // sigma0(X[i+1]) + add x20,x20,x16 // h+=Sigma1(e) + eor x19,x19,x22 // Maj(a,b,c) + eor x17,x10,x21,ror#39 // Sigma0(a) + eor x8,x8,x1,lsr#6 // sigma1(X[i+14]) + add x3,x3,x12 + add x24,x24,x20 // d+=h + add x20,x20,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x3,x3,x9 + add x20,x20,x17 // h+=Sigma0(a) + add x3,x3,x8 + cbnz x19,.Loop_16_xx + + ldp x0,x2,[x29,#96] + ldr x1,[x29,#112] + sub x30,x30,#648 // rewind + + ldp x3,x4,[x0] + ldp x5,x6,[x0,#2*8] + add x1,x1,#14*8 // advance input pointer + ldp x7,x8,[x0,#4*8] + add x20,x20,x3 + ldp x9,x10,[x0,#6*8] + add x21,x21,x4 + add x22,x22,x5 + add x23,x23,x6 + stp x20,x21,[x0] + add x24,x24,x7 + add x25,x25,x8 + stp x22,x23,[x0,#2*8] + add x26,x26,x9 + add x27,x27,x10 + cmp x1,x2 + stp x24,x25,[x0,#4*8] + stp x26,x27,[x0,#6*8] + b.ne .Loop + + ldp x19,x20,[x29,#16] + add sp,sp,#4*8 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#128 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size sha512_block_data_order_nohw,.-sha512_block_data_order_nohw + +.section .rodata +.align 6 +.type .LK512,%object +.LK512: +.quad 0x428a2f98d728ae22,0x7137449123ef65cd +.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc +.quad 0x3956c25bf348b538,0x59f111f1b605d019 +.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 +.quad 0xd807aa98a3030242,0x12835b0145706fbe +.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 +.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 +.quad 0x9bdc06a725c71235,0xc19bf174cf692694 +.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 +.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 +.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 +.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 +.quad 0x983e5152ee66dfab,0xa831c66d2db43210 +.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 +.quad 
0xc6e00bf33da88fc2,0xd5a79147930aa725 +.quad 0x06ca6351e003826f,0x142929670a0e6e70 +.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 +.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df +.quad 0x650a73548baf63de,0x766a0abb3c77b2a8 +.quad 0x81c2c92e47edaee6,0x92722c851482353b +.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 +.quad 0xc24b8b70d0f89791,0xc76c51a30654be30 +.quad 0xd192e819d6ef5218,0xd69906245565a910 +.quad 0xf40e35855771202a,0x106aa07032bbd1b8 +.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 +.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 +.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb +.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 +.quad 0x748f82ee5defb2fc,0x78a5636f43172f60 +.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec +.quad 0x90befffa23631e28,0xa4506cebde82bde9 +.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b +.quad 0xca273eceea26619c,0xd186b8c721c0c207 +.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 +.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 +.quad 0x113f9804bef90dae,0x1b710b35131c471b +.quad 0x28db77f523047d84,0x32caab7b40c72493 +.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c +.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a +.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 +.quad 0 // terminator +.size .LK512,.-.LK512 +.byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 2 +.text +#ifndef __KERNEL__ +.globl sha512_block_data_order_hw +.hidden sha512_block_data_order_hw +.type sha512_block_data_order_hw,%function +.align 6 +sha512_block_data_order_hw: + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + AARCH64_VALID_CALL_TARGET + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 + + ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[x1],#64 // load input + ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 + + ld1 {v0.2d,v1.2d,v2.2d,v3.2d},[x0] // load context + adrp x3,.LK512 + add x3,x3,:lo12:.LK512 + + rev64 v16.16b,v16.16b + rev64 v17.16b,v17.16b + rev64 v18.16b,v18.16b + rev64 v19.16b,v19.16b + rev64 v20.16b,v20.16b + rev64 v21.16b,v21.16b + rev64 v22.16b,v22.16b + rev64 v23.16b,v23.16b + b .Loop_hw + +.align 4 +.Loop_hw: + ld1 {v24.2d},[x3],#16 + subs x2,x2,#1 + sub x4,x1,#128 + orr v26.16b,v0.16b,v0.16b // offload + orr v27.16b,v1.16b,v1.16b + orr v28.16b,v2.16b,v2.16b + orr v29.16b,v3.16b,v3.16b + csel x1,x1,x4,ne // conditional rewind + add v24.2d,v24.2d,v16.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 + add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]" +.inst 0xcec08230 //sha512su0 v16.16b,v17.16b + ext v7.16b,v20.16b,v21.16b,#8 +.inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b +.inst 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b + add v4.2d,v1.2d,v3.2d // "D + T1" +.inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b + add v25.2d,v25.2d,v17.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v4.16b,v2.16b,#8 + ext v6.16b,v0.16b,v4.16b,#8 + add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]" +.inst 0xcec08251 //sha512su0 v17.16b,v18.16b + ext v7.16b,v21.16b,v22.16b,#8 +.inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b +.inst 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b + add v1.2d,v0.2d,v2.2d // "D + T1" +.inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b + add v24.2d,v24.2d,v18.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]" +.inst 0xcec08272 //sha512su0 v18.16b,v19.16b + ext v7.16b,v22.16b,v23.16b,#8 +.inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b +.inst 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b + add v0.2d,v3.2d,v4.2d // "D + T1" +.inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b + add v25.2d,v25.2d,v19.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext v6.16b,v2.16b,v0.16b,#8 + add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]" +.inst 0xcec08293 //sha512su0 v19.16b,v20.16b + ext v7.16b,v23.16b,v16.16b,#8 +.inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b +.inst 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b + add v3.2d,v2.2d,v1.2d // "D + T1" +.inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b + add v24.2d,v24.2d,v20.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]" +.inst 0xcec082b4 //sha512su0 v20.16b,v21.16b + ext v7.16b,v16.16b,v17.16b,#8 +.inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b +.inst 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b + add v2.2d,v4.2d,v0.2d // "D + T1" +.inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b + add v25.2d,v25.2d,v21.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 + add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]" +.inst 0xcec082d5 //sha512su0 v21.16b,v22.16b + ext v7.16b,v17.16b,v18.16b,#8 +.inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b +.inst 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b + add v4.2d,v1.2d,v3.2d // "D + T1" +.inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b + add v24.2d,v24.2d,v22.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v4.16b,v2.16b,#8 + ext 
v6.16b,v0.16b,v4.16b,#8 + add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]" +.inst 0xcec082f6 //sha512su0 v22.16b,v23.16b + ext v7.16b,v18.16b,v19.16b,#8 +.inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b +.inst 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b + add v1.2d,v0.2d,v2.2d // "D + T1" +.inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b + add v25.2d,v25.2d,v23.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]" +.inst 0xcec08217 //sha512su0 v23.16b,v16.16b + ext v7.16b,v19.16b,v20.16b,#8 +.inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b +.inst 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b + add v0.2d,v3.2d,v4.2d // "D + T1" +.inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b + add v24.2d,v24.2d,v16.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext v6.16b,v2.16b,v0.16b,#8 + add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]" +.inst 0xcec08230 //sha512su0 v16.16b,v17.16b + ext v7.16b,v20.16b,v21.16b,#8 +.inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b +.inst 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b + add v3.2d,v2.2d,v1.2d // "D + T1" +.inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b + add v25.2d,v25.2d,v17.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]" +.inst 0xcec08251 //sha512su0 v17.16b,v18.16b + ext v7.16b,v21.16b,v22.16b,#8 +.inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b +.inst 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b + add v2.2d,v4.2d,v0.2d // "D + T1" +.inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b + add v24.2d,v24.2d,v18.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 + add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]" +.inst 0xcec08272 //sha512su0 v18.16b,v19.16b + ext v7.16b,v22.16b,v23.16b,#8 +.inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b +.inst 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b + add v4.2d,v1.2d,v3.2d // "D + T1" +.inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b + add v25.2d,v25.2d,v19.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v4.16b,v2.16b,#8 + ext v6.16b,v0.16b,v4.16b,#8 + add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]" +.inst 0xcec08293 //sha512su0 v19.16b,v20.16b + ext v7.16b,v23.16b,v16.16b,#8 +.inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b +.inst 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b + add v1.2d,v0.2d,v2.2d // "D + T1" +.inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b + add v24.2d,v24.2d,v20.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]" +.inst 0xcec082b4 //sha512su0 v20.16b,v21.16b + ext v7.16b,v16.16b,v17.16b,#8 +.inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b +.inst 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b + add v0.2d,v3.2d,v4.2d // "D + T1" +.inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b + add v25.2d,v25.2d,v21.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext v6.16b,v2.16b,v0.16b,#8 + add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]" +.inst 0xcec082d5 //sha512su0 v21.16b,v22.16b + ext v7.16b,v17.16b,v18.16b,#8 +.inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b +.inst 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b + add v3.2d,v2.2d,v1.2d // "D + T1" +.inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b + add 
v24.2d,v24.2d,v22.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]" +.inst 0xcec082f6 //sha512su0 v22.16b,v23.16b + ext v7.16b,v18.16b,v19.16b,#8 +.inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b +.inst 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b + add v2.2d,v4.2d,v0.2d // "D + T1" +.inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b + add v25.2d,v25.2d,v23.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 + add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]" +.inst 0xcec08217 //sha512su0 v23.16b,v16.16b + ext v7.16b,v19.16b,v20.16b,#8 +.inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b +.inst 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b + add v4.2d,v1.2d,v3.2d // "D + T1" +.inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b + add v24.2d,v24.2d,v16.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v4.16b,v2.16b,#8 + ext v6.16b,v0.16b,v4.16b,#8 + add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]" +.inst 0xcec08230 //sha512su0 v16.16b,v17.16b + ext v7.16b,v20.16b,v21.16b,#8 +.inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b +.inst 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b + add v1.2d,v0.2d,v2.2d // "D + T1" +.inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b + add v25.2d,v25.2d,v17.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]" +.inst 0xcec08251 //sha512su0 v17.16b,v18.16b + ext v7.16b,v21.16b,v22.16b,#8 +.inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b +.inst 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b + add v0.2d,v3.2d,v4.2d // "D + T1" +.inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b + add v24.2d,v24.2d,v18.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext v6.16b,v2.16b,v0.16b,#8 + add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]" +.inst 0xcec08272 //sha512su0 v18.16b,v19.16b + ext v7.16b,v22.16b,v23.16b,#8 +.inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b +.inst 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b + add v3.2d,v2.2d,v1.2d // "D + T1" +.inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b + add v25.2d,v25.2d,v19.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]" +.inst 0xcec08293 //sha512su0 v19.16b,v20.16b + ext v7.16b,v23.16b,v16.16b,#8 +.inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b +.inst 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b + add v2.2d,v4.2d,v0.2d // "D + T1" +.inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b + add v24.2d,v24.2d,v20.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 + add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]" +.inst 0xcec082b4 //sha512su0 v20.16b,v21.16b + ext v7.16b,v16.16b,v17.16b,#8 +.inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b +.inst 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b + add v4.2d,v1.2d,v3.2d // "D + T1" +.inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b + add v25.2d,v25.2d,v21.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v4.16b,v2.16b,#8 + ext v6.16b,v0.16b,v4.16b,#8 + add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]" +.inst 0xcec082d5 //sha512su0 v21.16b,v22.16b + ext v7.16b,v17.16b,v18.16b,#8 +.inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b +.inst 0xce678a95 //sha512su1 
v21.16b,v20.16b,v7.16b + add v1.2d,v0.2d,v2.2d // "D + T1" +.inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b + add v24.2d,v24.2d,v22.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]" +.inst 0xcec082f6 //sha512su0 v22.16b,v23.16b + ext v7.16b,v18.16b,v19.16b,#8 +.inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b +.inst 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b + add v0.2d,v3.2d,v4.2d // "D + T1" +.inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b + add v25.2d,v25.2d,v23.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext v6.16b,v2.16b,v0.16b,#8 + add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]" +.inst 0xcec08217 //sha512su0 v23.16b,v16.16b + ext v7.16b,v19.16b,v20.16b,#8 +.inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b +.inst 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b + add v3.2d,v2.2d,v1.2d // "D + T1" +.inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b + add v24.2d,v24.2d,v16.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]" +.inst 0xcec08230 //sha512su0 v16.16b,v17.16b + ext v7.16b,v20.16b,v21.16b,#8 +.inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b +.inst 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b + add v2.2d,v4.2d,v0.2d // "D + T1" +.inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b + add v25.2d,v25.2d,v17.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 + add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]" +.inst 0xcec08251 //sha512su0 v17.16b,v18.16b + ext v7.16b,v21.16b,v22.16b,#8 +.inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b +.inst 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b + add v4.2d,v1.2d,v3.2d // "D + T1" +.inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b + add v24.2d,v24.2d,v18.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v4.16b,v2.16b,#8 + ext v6.16b,v0.16b,v4.16b,#8 + add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]" +.inst 0xcec08272 //sha512su0 v18.16b,v19.16b + ext v7.16b,v22.16b,v23.16b,#8 +.inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b +.inst 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b + add v1.2d,v0.2d,v2.2d // "D + T1" +.inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b + add v25.2d,v25.2d,v19.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]" +.inst 0xcec08293 //sha512su0 v19.16b,v20.16b + ext v7.16b,v23.16b,v16.16b,#8 +.inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b +.inst 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b + add v0.2d,v3.2d,v4.2d // "D + T1" +.inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b + add v24.2d,v24.2d,v20.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext v6.16b,v2.16b,v0.16b,#8 + add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]" +.inst 0xcec082b4 //sha512su0 v20.16b,v21.16b + ext v7.16b,v16.16b,v17.16b,#8 +.inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b +.inst 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b + add v3.2d,v2.2d,v1.2d // "D + T1" +.inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b + add v25.2d,v25.2d,v21.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]" +.inst 0xcec082d5 //sha512su0 v21.16b,v22.16b + ext 
v7.16b,v17.16b,v18.16b,#8 +.inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b +.inst 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b + add v2.2d,v4.2d,v0.2d // "D + T1" +.inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b + add v24.2d,v24.2d,v22.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 + add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]" +.inst 0xcec082f6 //sha512su0 v22.16b,v23.16b + ext v7.16b,v18.16b,v19.16b,#8 +.inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b +.inst 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b + add v4.2d,v1.2d,v3.2d // "D + T1" +.inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b + add v25.2d,v25.2d,v23.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v4.16b,v2.16b,#8 + ext v6.16b,v0.16b,v4.16b,#8 + add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]" +.inst 0xcec08217 //sha512su0 v23.16b,v16.16b + ext v7.16b,v19.16b,v20.16b,#8 +.inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b +.inst 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b + add v1.2d,v0.2d,v2.2d // "D + T1" +.inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b + ld1 {v25.2d},[x3],#16 + add v24.2d,v24.2d,v16.2d + ld1 {v16.16b},[x1],#16 // load next input + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]" +.inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b + rev64 v16.16b,v16.16b + add v0.2d,v3.2d,v4.2d // "D + T1" +.inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b + ld1 {v24.2d},[x3],#16 + add v25.2d,v25.2d,v17.2d + ld1 {v17.16b},[x1],#16 // load next input + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext v6.16b,v2.16b,v0.16b,#8 + add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]" +.inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b + rev64 v17.16b,v17.16b + add v3.2d,v2.2d,v1.2d // "D + T1" +.inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b + ld1 {v25.2d},[x3],#16 + add v24.2d,v24.2d,v18.2d + ld1 {v18.16b},[x1],#16 // load next input + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]" +.inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b + rev64 v18.16b,v18.16b + add v2.2d,v4.2d,v0.2d // "D + T1" +.inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b + ld1 {v24.2d},[x3],#16 + add v25.2d,v25.2d,v19.2d + ld1 {v19.16b},[x1],#16 // load next input + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 + add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]" +.inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b + rev64 v19.16b,v19.16b + add v4.2d,v1.2d,v3.2d // "D + T1" +.inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b + ld1 {v25.2d},[x3],#16 + add v24.2d,v24.2d,v20.2d + ld1 {v20.16b},[x1],#16 // load next input + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v4.16b,v2.16b,#8 + ext v6.16b,v0.16b,v4.16b,#8 + add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]" +.inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b + rev64 v20.16b,v20.16b + add v1.2d,v0.2d,v2.2d // "D + T1" +.inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b + ld1 {v24.2d},[x3],#16 + add v25.2d,v25.2d,v21.2d + ld1 {v21.16b},[x1],#16 // load next input + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]" +.inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b + rev64 v21.16b,v21.16b + add v0.2d,v3.2d,v4.2d // "D + T1" +.inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b + ld1 {v25.2d},[x3],#16 + add v24.2d,v24.2d,v22.2d + ld1 
{v22.16b},[x1],#16 // load next input + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext v6.16b,v2.16b,v0.16b,#8 + add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]" +.inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b + rev64 v22.16b,v22.16b + add v3.2d,v2.2d,v1.2d // "D + T1" +.inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b + sub x3,x3,#80*8 // rewind + add v25.2d,v25.2d,v23.2d + ld1 {v23.16b},[x1],#16 // load next input + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]" +.inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b + rev64 v23.16b,v23.16b + add v2.2d,v4.2d,v0.2d // "D + T1" +.inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b + add v0.2d,v0.2d,v26.2d // accumulate + add v1.2d,v1.2d,v27.2d + add v2.2d,v2.2d,v28.2d + add v3.2d,v3.2d,v29.2d + + cbnz x2,.Loop_hw + + st1 {v0.2d,v1.2d,v2.2d,v3.2d},[x0] // store context + + ldr x29,[sp],#16 + ret +.size sha512_block_data_order_hw,.-sha512_block_data_order_hw +#endif +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__) diff --git a/ring-0.17.14/pregenerated/sha512-armv8-win64.S b/ring-0.17.14/pregenerated/sha512-armv8-win64.S new file mode 100644 index 0000000000..340b2b0ff5 --- /dev/null +++ b/ring-0.17.14/pregenerated/sha512-armv8-win64.S @@ -0,0 +1,1602 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32) +// Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// ==================================================================== +// Written by Andy Polyakov for the OpenSSL +// project. +// ==================================================================== +// +// SHA256/512 for ARMv8. +// +// Performance in cycles per processed byte and improvement coefficient +// over code generated with "default" compiler: +// +// SHA256-hw SHA256(*) SHA512 +// Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**)) +// Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***)) +// Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***)) +// Denver 2.01 10.5 (+26%) 6.70 (+8%) +// X-Gene 20.0 (+100%) 12.8 (+300%(***)) +// Mongoose 2.36 13.0 (+50%) 8.36 (+33%) +// Kryo 1.92 17.4 (+30%) 11.2 (+8%) +// +// (*) Software SHA256 results are of lesser relevance, presented +// mostly for informational purposes. +// (**) The result is a trade-off: it's possible to improve it by +// 10% (or by 1 cycle per round), but at the cost of 20% loss +// on Cortex-A53 (or by 4 cycles per round). +// (***) Super-impressive coefficients over gcc-generated code are +// indication of some compiler "pathology", most notably code +// generated with -mgeneral-regs-only is significantly faster +// and the gap is only 40-90%. 
+ +#ifndef __KERNEL__ +#endif + +.text + +.globl sha512_block_data_order_nohw + +.def sha512_block_data_order_nohw + .type 32 +.endef +.align 6 +sha512_block_data_order_nohw: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#4*8 + + ldp x20,x21,[x0] // load context + ldp x22,x23,[x0,#2*8] + ldp x24,x25,[x0,#4*8] + add x2,x1,x2,lsl#7 // end of input + ldp x26,x27,[x0,#6*8] + adrp x30,LK512 + add x30,x30,:lo12:LK512 + stp x0,x2,[x29,#96] + +Loop: + ldp x3,x4,[x1],#2*8 + ldr x19,[x30],#8 // *K++ + eor x28,x21,x22 // magic seed + str x1,[x29,#112] +#ifndef __AARCH64EB__ + rev x3,x3 // 0 +#endif + ror x16,x24,#14 + add x27,x27,x19 // h+=K[i] + eor x6,x24,x24,ror#23 + and x17,x25,x24 + bic x19,x26,x24 + add x27,x27,x3 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x20,x21 // a^b, b^c in next round + eor x16,x16,x6,ror#18 // Sigma1(e) + ror x6,x20,#28 + add x27,x27,x17 // h+=Ch(e,f,g) + eor x17,x20,x20,ror#5 + add x27,x27,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x23,x23,x27 // d+=h + eor x28,x28,x21 // Maj(a,b,c) + eor x17,x6,x17,ror#34 // Sigma0(a) + add x27,x27,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x27,x27,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x4,x4 // 1 +#endif + ldp x5,x6,[x1],#2*8 + add x27,x27,x17 // h+=Sigma0(a) + ror x16,x23,#14 + add x26,x26,x28 // h+=K[i] + eor x7,x23,x23,ror#23 + and x17,x24,x23 + bic x28,x25,x23 + add x26,x26,x4 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x27,x20 // a^b, b^c in next round + eor x16,x16,x7,ror#18 // Sigma1(e) + ror x7,x27,#28 + add x26,x26,x17 // h+=Ch(e,f,g) + eor x17,x27,x27,ror#5 + add x26,x26,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x22,x22,x26 // d+=h + eor x19,x19,x20 // Maj(a,b,c) + eor x17,x7,x17,ror#34 // Sigma0(a) + add x26,x26,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x26,x26,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x5,x5 // 2 +#endif + add x26,x26,x17 // h+=Sigma0(a) + ror x16,x22,#14 + add x25,x25,x19 // h+=K[i] + eor x8,x22,x22,ror#23 + and x17,x23,x22 + bic x19,x24,x22 + add x25,x25,x5 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x26,x27 // a^b, b^c in next round + eor x16,x16,x8,ror#18 // Sigma1(e) + ror x8,x26,#28 + add x25,x25,x17 // h+=Ch(e,f,g) + eor x17,x26,x26,ror#5 + add x25,x25,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x21,x21,x25 // d+=h + eor x28,x28,x27 // Maj(a,b,c) + eor x17,x8,x17,ror#34 // Sigma0(a) + add x25,x25,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x25,x25,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x6,x6 // 3 +#endif + ldp x7,x8,[x1],#2*8 + add x25,x25,x17 // h+=Sigma0(a) + ror x16,x21,#14 + add x24,x24,x28 // h+=K[i] + eor x9,x21,x21,ror#23 + and x17,x22,x21 + bic x28,x23,x21 + add x24,x24,x6 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x25,x26 // a^b, b^c in next round + eor x16,x16,x9,ror#18 // Sigma1(e) + ror x9,x25,#28 + add x24,x24,x17 // h+=Ch(e,f,g) + eor x17,x25,x25,ror#5 + add x24,x24,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x20,x20,x24 // d+=h + eor x19,x19,x26 // Maj(a,b,c) + eor x17,x9,x17,ror#34 // Sigma0(a) + add x24,x24,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x24,x24,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x7,x7 // 4 +#endif + add x24,x24,x17 // h+=Sigma0(a) + ror x16,x20,#14 + add 
x23,x23,x19 // h+=K[i] + eor x10,x20,x20,ror#23 + and x17,x21,x20 + bic x19,x22,x20 + add x23,x23,x7 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x24,x25 // a^b, b^c in next round + eor x16,x16,x10,ror#18 // Sigma1(e) + ror x10,x24,#28 + add x23,x23,x17 // h+=Ch(e,f,g) + eor x17,x24,x24,ror#5 + add x23,x23,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x27,x27,x23 // d+=h + eor x28,x28,x25 // Maj(a,b,c) + eor x17,x10,x17,ror#34 // Sigma0(a) + add x23,x23,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x23,x23,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x8,x8 // 5 +#endif + ldp x9,x10,[x1],#2*8 + add x23,x23,x17 // h+=Sigma0(a) + ror x16,x27,#14 + add x22,x22,x28 // h+=K[i] + eor x11,x27,x27,ror#23 + and x17,x20,x27 + bic x28,x21,x27 + add x22,x22,x8 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x23,x24 // a^b, b^c in next round + eor x16,x16,x11,ror#18 // Sigma1(e) + ror x11,x23,#28 + add x22,x22,x17 // h+=Ch(e,f,g) + eor x17,x23,x23,ror#5 + add x22,x22,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x26,x26,x22 // d+=h + eor x19,x19,x24 // Maj(a,b,c) + eor x17,x11,x17,ror#34 // Sigma0(a) + add x22,x22,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x22,x22,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x9,x9 // 6 +#endif + add x22,x22,x17 // h+=Sigma0(a) + ror x16,x26,#14 + add x21,x21,x19 // h+=K[i] + eor x12,x26,x26,ror#23 + and x17,x27,x26 + bic x19,x20,x26 + add x21,x21,x9 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x22,x23 // a^b, b^c in next round + eor x16,x16,x12,ror#18 // Sigma1(e) + ror x12,x22,#28 + add x21,x21,x17 // h+=Ch(e,f,g) + eor x17,x22,x22,ror#5 + add x21,x21,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x25,x25,x21 // d+=h + eor x28,x28,x23 // Maj(a,b,c) + eor x17,x12,x17,ror#34 // Sigma0(a) + add x21,x21,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x21,x21,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x10,x10 // 7 +#endif + ldp x11,x12,[x1],#2*8 + add x21,x21,x17 // h+=Sigma0(a) + ror x16,x25,#14 + add x20,x20,x28 // h+=K[i] + eor x13,x25,x25,ror#23 + and x17,x26,x25 + bic x28,x27,x25 + add x20,x20,x10 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x21,x22 // a^b, b^c in next round + eor x16,x16,x13,ror#18 // Sigma1(e) + ror x13,x21,#28 + add x20,x20,x17 // h+=Ch(e,f,g) + eor x17,x21,x21,ror#5 + add x20,x20,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x24,x24,x20 // d+=h + eor x19,x19,x22 // Maj(a,b,c) + eor x17,x13,x17,ror#34 // Sigma0(a) + add x20,x20,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x20,x20,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x11,x11 // 8 +#endif + add x20,x20,x17 // h+=Sigma0(a) + ror x16,x24,#14 + add x27,x27,x19 // h+=K[i] + eor x14,x24,x24,ror#23 + and x17,x25,x24 + bic x19,x26,x24 + add x27,x27,x11 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x20,x21 // a^b, b^c in next round + eor x16,x16,x14,ror#18 // Sigma1(e) + ror x14,x20,#28 + add x27,x27,x17 // h+=Ch(e,f,g) + eor x17,x20,x20,ror#5 + add x27,x27,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x23,x23,x27 // d+=h + eor x28,x28,x21 // Maj(a,b,c) + eor x17,x14,x17,ror#34 // Sigma0(a) + add x27,x27,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x27,x27,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x12,x12 // 9 +#endif + ldp x13,x14,[x1],#2*8 + add x27,x27,x17 // h+=Sigma0(a) + ror x16,x23,#14 + add x26,x26,x28 // h+=K[i] + eor 
x15,x23,x23,ror#23 + and x17,x24,x23 + bic x28,x25,x23 + add x26,x26,x12 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x27,x20 // a^b, b^c in next round + eor x16,x16,x15,ror#18 // Sigma1(e) + ror x15,x27,#28 + add x26,x26,x17 // h+=Ch(e,f,g) + eor x17,x27,x27,ror#5 + add x26,x26,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x22,x22,x26 // d+=h + eor x19,x19,x20 // Maj(a,b,c) + eor x17,x15,x17,ror#34 // Sigma0(a) + add x26,x26,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x26,x26,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x13,x13 // 10 +#endif + add x26,x26,x17 // h+=Sigma0(a) + ror x16,x22,#14 + add x25,x25,x19 // h+=K[i] + eor x0,x22,x22,ror#23 + and x17,x23,x22 + bic x19,x24,x22 + add x25,x25,x13 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x26,x27 // a^b, b^c in next round + eor x16,x16,x0,ror#18 // Sigma1(e) + ror x0,x26,#28 + add x25,x25,x17 // h+=Ch(e,f,g) + eor x17,x26,x26,ror#5 + add x25,x25,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x21,x21,x25 // d+=h + eor x28,x28,x27 // Maj(a,b,c) + eor x17,x0,x17,ror#34 // Sigma0(a) + add x25,x25,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x25,x25,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x14,x14 // 11 +#endif + ldp x15,x0,[x1],#2*8 + add x25,x25,x17 // h+=Sigma0(a) + str x6,[sp,#24] + ror x16,x21,#14 + add x24,x24,x28 // h+=K[i] + eor x6,x21,x21,ror#23 + and x17,x22,x21 + bic x28,x23,x21 + add x24,x24,x14 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x25,x26 // a^b, b^c in next round + eor x16,x16,x6,ror#18 // Sigma1(e) + ror x6,x25,#28 + add x24,x24,x17 // h+=Ch(e,f,g) + eor x17,x25,x25,ror#5 + add x24,x24,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x20,x20,x24 // d+=h + eor x19,x19,x26 // Maj(a,b,c) + eor x17,x6,x17,ror#34 // Sigma0(a) + add x24,x24,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x24,x24,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x15,x15 // 12 +#endif + add x24,x24,x17 // h+=Sigma0(a) + str x7,[sp,#0] + ror x16,x20,#14 + add x23,x23,x19 // h+=K[i] + eor x7,x20,x20,ror#23 + and x17,x21,x20 + bic x19,x22,x20 + add x23,x23,x15 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x24,x25 // a^b, b^c in next round + eor x16,x16,x7,ror#18 // Sigma1(e) + ror x7,x24,#28 + add x23,x23,x17 // h+=Ch(e,f,g) + eor x17,x24,x24,ror#5 + add x23,x23,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x27,x27,x23 // d+=h + eor x28,x28,x25 // Maj(a,b,c) + eor x17,x7,x17,ror#34 // Sigma0(a) + add x23,x23,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x23,x23,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x0,x0 // 13 +#endif + ldp x1,x2,[x1] + add x23,x23,x17 // h+=Sigma0(a) + str x8,[sp,#8] + ror x16,x27,#14 + add x22,x22,x28 // h+=K[i] + eor x8,x27,x27,ror#23 + and x17,x20,x27 + bic x28,x21,x27 + add x22,x22,x0 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x23,x24 // a^b, b^c in next round + eor x16,x16,x8,ror#18 // Sigma1(e) + ror x8,x23,#28 + add x22,x22,x17 // h+=Ch(e,f,g) + eor x17,x23,x23,ror#5 + add x22,x22,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x26,x26,x22 // d+=h + eor x19,x19,x24 // Maj(a,b,c) + eor x17,x8,x17,ror#34 // Sigma0(a) + add x22,x22,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x22,x22,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x1,x1 // 14 +#endif + ldr x6,[sp,#24] + add x22,x22,x17 // h+=Sigma0(a) + str x9,[sp,#16] + ror x16,x26,#14 + add x21,x21,x19 // 
h+=K[i] + eor x9,x26,x26,ror#23 + and x17,x27,x26 + bic x19,x20,x26 + add x21,x21,x1 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x22,x23 // a^b, b^c in next round + eor x16,x16,x9,ror#18 // Sigma1(e) + ror x9,x22,#28 + add x21,x21,x17 // h+=Ch(e,f,g) + eor x17,x22,x22,ror#5 + add x21,x21,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x25,x25,x21 // d+=h + eor x28,x28,x23 // Maj(a,b,c) + eor x17,x9,x17,ror#34 // Sigma0(a) + add x21,x21,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x21,x21,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x2,x2 // 15 +#endif + ldr x7,[sp,#0] + add x21,x21,x17 // h+=Sigma0(a) + str x10,[sp,#24] + ror x16,x25,#14 + add x20,x20,x28 // h+=K[i] + ror x9,x4,#1 + and x17,x26,x25 + ror x8,x1,#19 + bic x28,x27,x25 + ror x10,x21,#28 + add x20,x20,x2 // h+=X[i] + eor x16,x16,x25,ror#18 + eor x9,x9,x4,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x21,x22 // a^b, b^c in next round + eor x16,x16,x25,ror#41 // Sigma1(e) + eor x10,x10,x21,ror#34 + add x20,x20,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x8,x8,x1,ror#61 + eor x9,x9,x4,lsr#7 // sigma0(X[i+1]) + add x20,x20,x16 // h+=Sigma1(e) + eor x19,x19,x22 // Maj(a,b,c) + eor x17,x10,x21,ror#39 // Sigma0(a) + eor x8,x8,x1,lsr#6 // sigma1(X[i+14]) + add x3,x3,x12 + add x24,x24,x20 // d+=h + add x20,x20,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x3,x3,x9 + add x20,x20,x17 // h+=Sigma0(a) + add x3,x3,x8 +Loop_16_xx: + ldr x8,[sp,#8] + str x11,[sp,#0] + ror x16,x24,#14 + add x27,x27,x19 // h+=K[i] + ror x10,x5,#1 + and x17,x25,x24 + ror x9,x2,#19 + bic x19,x26,x24 + ror x11,x20,#28 + add x27,x27,x3 // h+=X[i] + eor x16,x16,x24,ror#18 + eor x10,x10,x5,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x20,x21 // a^b, b^c in next round + eor x16,x16,x24,ror#41 // Sigma1(e) + eor x11,x11,x20,ror#34 + add x27,x27,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x9,x9,x2,ror#61 + eor x10,x10,x5,lsr#7 // sigma0(X[i+1]) + add x27,x27,x16 // h+=Sigma1(e) + eor x28,x28,x21 // Maj(a,b,c) + eor x17,x11,x20,ror#39 // Sigma0(a) + eor x9,x9,x2,lsr#6 // sigma1(X[i+14]) + add x4,x4,x13 + add x23,x23,x27 // d+=h + add x27,x27,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x4,x4,x10 + add x27,x27,x17 // h+=Sigma0(a) + add x4,x4,x9 + ldr x9,[sp,#16] + str x12,[sp,#8] + ror x16,x23,#14 + add x26,x26,x28 // h+=K[i] + ror x11,x6,#1 + and x17,x24,x23 + ror x10,x3,#19 + bic x28,x25,x23 + ror x12,x27,#28 + add x26,x26,x4 // h+=X[i] + eor x16,x16,x23,ror#18 + eor x11,x11,x6,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x27,x20 // a^b, b^c in next round + eor x16,x16,x23,ror#41 // Sigma1(e) + eor x12,x12,x27,ror#34 + add x26,x26,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x10,x10,x3,ror#61 + eor x11,x11,x6,lsr#7 // sigma0(X[i+1]) + add x26,x26,x16 // h+=Sigma1(e) + eor x19,x19,x20 // Maj(a,b,c) + eor x17,x12,x27,ror#39 // Sigma0(a) + eor x10,x10,x3,lsr#6 // sigma1(X[i+14]) + add x5,x5,x14 + add x22,x22,x26 // d+=h + add x26,x26,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x5,x5,x11 + add x26,x26,x17 // h+=Sigma0(a) + add x5,x5,x10 + ldr x10,[sp,#24] + str x13,[sp,#16] + ror x16,x22,#14 + add x25,x25,x19 // h+=K[i] + ror x12,x7,#1 + and x17,x23,x22 + ror x11,x4,#19 + bic x19,x24,x22 + ror x13,x26,#28 + add x25,x25,x5 // h+=X[i] + eor x16,x16,x22,ror#18 + eor x12,x12,x7,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x26,x27 // a^b, b^c in next round + eor x16,x16,x22,ror#41 // 
Sigma1(e) + eor x13,x13,x26,ror#34 + add x25,x25,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x11,x11,x4,ror#61 + eor x12,x12,x7,lsr#7 // sigma0(X[i+1]) + add x25,x25,x16 // h+=Sigma1(e) + eor x28,x28,x27 // Maj(a,b,c) + eor x17,x13,x26,ror#39 // Sigma0(a) + eor x11,x11,x4,lsr#6 // sigma1(X[i+14]) + add x6,x6,x15 + add x21,x21,x25 // d+=h + add x25,x25,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x6,x6,x12 + add x25,x25,x17 // h+=Sigma0(a) + add x6,x6,x11 + ldr x11,[sp,#0] + str x14,[sp,#24] + ror x16,x21,#14 + add x24,x24,x28 // h+=K[i] + ror x13,x8,#1 + and x17,x22,x21 + ror x12,x5,#19 + bic x28,x23,x21 + ror x14,x25,#28 + add x24,x24,x6 // h+=X[i] + eor x16,x16,x21,ror#18 + eor x13,x13,x8,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x25,x26 // a^b, b^c in next round + eor x16,x16,x21,ror#41 // Sigma1(e) + eor x14,x14,x25,ror#34 + add x24,x24,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x12,x12,x5,ror#61 + eor x13,x13,x8,lsr#7 // sigma0(X[i+1]) + add x24,x24,x16 // h+=Sigma1(e) + eor x19,x19,x26 // Maj(a,b,c) + eor x17,x14,x25,ror#39 // Sigma0(a) + eor x12,x12,x5,lsr#6 // sigma1(X[i+14]) + add x7,x7,x0 + add x20,x20,x24 // d+=h + add x24,x24,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x7,x7,x13 + add x24,x24,x17 // h+=Sigma0(a) + add x7,x7,x12 + ldr x12,[sp,#8] + str x15,[sp,#0] + ror x16,x20,#14 + add x23,x23,x19 // h+=K[i] + ror x14,x9,#1 + and x17,x21,x20 + ror x13,x6,#19 + bic x19,x22,x20 + ror x15,x24,#28 + add x23,x23,x7 // h+=X[i] + eor x16,x16,x20,ror#18 + eor x14,x14,x9,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x24,x25 // a^b, b^c in next round + eor x16,x16,x20,ror#41 // Sigma1(e) + eor x15,x15,x24,ror#34 + add x23,x23,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x13,x13,x6,ror#61 + eor x14,x14,x9,lsr#7 // sigma0(X[i+1]) + add x23,x23,x16 // h+=Sigma1(e) + eor x28,x28,x25 // Maj(a,b,c) + eor x17,x15,x24,ror#39 // Sigma0(a) + eor x13,x13,x6,lsr#6 // sigma1(X[i+14]) + add x8,x8,x1 + add x27,x27,x23 // d+=h + add x23,x23,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x8,x8,x14 + add x23,x23,x17 // h+=Sigma0(a) + add x8,x8,x13 + ldr x13,[sp,#16] + str x0,[sp,#8] + ror x16,x27,#14 + add x22,x22,x28 // h+=K[i] + ror x15,x10,#1 + and x17,x20,x27 + ror x14,x7,#19 + bic x28,x21,x27 + ror x0,x23,#28 + add x22,x22,x8 // h+=X[i] + eor x16,x16,x27,ror#18 + eor x15,x15,x10,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x23,x24 // a^b, b^c in next round + eor x16,x16,x27,ror#41 // Sigma1(e) + eor x0,x0,x23,ror#34 + add x22,x22,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x14,x14,x7,ror#61 + eor x15,x15,x10,lsr#7 // sigma0(X[i+1]) + add x22,x22,x16 // h+=Sigma1(e) + eor x19,x19,x24 // Maj(a,b,c) + eor x17,x0,x23,ror#39 // Sigma0(a) + eor x14,x14,x7,lsr#6 // sigma1(X[i+14]) + add x9,x9,x2 + add x26,x26,x22 // d+=h + add x22,x22,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x9,x9,x15 + add x22,x22,x17 // h+=Sigma0(a) + add x9,x9,x14 + ldr x14,[sp,#24] + str x1,[sp,#16] + ror x16,x26,#14 + add x21,x21,x19 // h+=K[i] + ror x0,x11,#1 + and x17,x27,x26 + ror x15,x8,#19 + bic x19,x20,x26 + ror x1,x22,#28 + add x21,x21,x9 // h+=X[i] + eor x16,x16,x26,ror#18 + eor x0,x0,x11,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x22,x23 // a^b, b^c in next round + eor x16,x16,x26,ror#41 // Sigma1(e) + eor x1,x1,x22,ror#34 + add x21,x21,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x15,x15,x8,ror#61 + eor x0,x0,x11,lsr#7 // 
sigma0(X[i+1]) + add x21,x21,x16 // h+=Sigma1(e) + eor x28,x28,x23 // Maj(a,b,c) + eor x17,x1,x22,ror#39 // Sigma0(a) + eor x15,x15,x8,lsr#6 // sigma1(X[i+14]) + add x10,x10,x3 + add x25,x25,x21 // d+=h + add x21,x21,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x10,x10,x0 + add x21,x21,x17 // h+=Sigma0(a) + add x10,x10,x15 + ldr x15,[sp,#0] + str x2,[sp,#24] + ror x16,x25,#14 + add x20,x20,x28 // h+=K[i] + ror x1,x12,#1 + and x17,x26,x25 + ror x0,x9,#19 + bic x28,x27,x25 + ror x2,x21,#28 + add x20,x20,x10 // h+=X[i] + eor x16,x16,x25,ror#18 + eor x1,x1,x12,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x21,x22 // a^b, b^c in next round + eor x16,x16,x25,ror#41 // Sigma1(e) + eor x2,x2,x21,ror#34 + add x20,x20,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x0,x0,x9,ror#61 + eor x1,x1,x12,lsr#7 // sigma0(X[i+1]) + add x20,x20,x16 // h+=Sigma1(e) + eor x19,x19,x22 // Maj(a,b,c) + eor x17,x2,x21,ror#39 // Sigma0(a) + eor x0,x0,x9,lsr#6 // sigma1(X[i+14]) + add x11,x11,x4 + add x24,x24,x20 // d+=h + add x20,x20,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x11,x11,x1 + add x20,x20,x17 // h+=Sigma0(a) + add x11,x11,x0 + ldr x0,[sp,#8] + str x3,[sp,#0] + ror x16,x24,#14 + add x27,x27,x19 // h+=K[i] + ror x2,x13,#1 + and x17,x25,x24 + ror x1,x10,#19 + bic x19,x26,x24 + ror x3,x20,#28 + add x27,x27,x11 // h+=X[i] + eor x16,x16,x24,ror#18 + eor x2,x2,x13,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x20,x21 // a^b, b^c in next round + eor x16,x16,x24,ror#41 // Sigma1(e) + eor x3,x3,x20,ror#34 + add x27,x27,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x1,x1,x10,ror#61 + eor x2,x2,x13,lsr#7 // sigma0(X[i+1]) + add x27,x27,x16 // h+=Sigma1(e) + eor x28,x28,x21 // Maj(a,b,c) + eor x17,x3,x20,ror#39 // Sigma0(a) + eor x1,x1,x10,lsr#6 // sigma1(X[i+14]) + add x12,x12,x5 + add x23,x23,x27 // d+=h + add x27,x27,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x12,x12,x2 + add x27,x27,x17 // h+=Sigma0(a) + add x12,x12,x1 + ldr x1,[sp,#16] + str x4,[sp,#8] + ror x16,x23,#14 + add x26,x26,x28 // h+=K[i] + ror x3,x14,#1 + and x17,x24,x23 + ror x2,x11,#19 + bic x28,x25,x23 + ror x4,x27,#28 + add x26,x26,x12 // h+=X[i] + eor x16,x16,x23,ror#18 + eor x3,x3,x14,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x27,x20 // a^b, b^c in next round + eor x16,x16,x23,ror#41 // Sigma1(e) + eor x4,x4,x27,ror#34 + add x26,x26,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x2,x2,x11,ror#61 + eor x3,x3,x14,lsr#7 // sigma0(X[i+1]) + add x26,x26,x16 // h+=Sigma1(e) + eor x19,x19,x20 // Maj(a,b,c) + eor x17,x4,x27,ror#39 // Sigma0(a) + eor x2,x2,x11,lsr#6 // sigma1(X[i+14]) + add x13,x13,x6 + add x22,x22,x26 // d+=h + add x26,x26,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x13,x13,x3 + add x26,x26,x17 // h+=Sigma0(a) + add x13,x13,x2 + ldr x2,[sp,#24] + str x5,[sp,#16] + ror x16,x22,#14 + add x25,x25,x19 // h+=K[i] + ror x4,x15,#1 + and x17,x23,x22 + ror x3,x12,#19 + bic x19,x24,x22 + ror x5,x26,#28 + add x25,x25,x13 // h+=X[i] + eor x16,x16,x22,ror#18 + eor x4,x4,x15,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x26,x27 // a^b, b^c in next round + eor x16,x16,x22,ror#41 // Sigma1(e) + eor x5,x5,x26,ror#34 + add x25,x25,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x3,x3,x12,ror#61 + eor x4,x4,x15,lsr#7 // sigma0(X[i+1]) + add x25,x25,x16 // h+=Sigma1(e) + eor x28,x28,x27 // Maj(a,b,c) + eor x17,x5,x26,ror#39 // Sigma0(a) + eor x3,x3,x12,lsr#6 // sigma1(X[i+14]) + add 
x14,x14,x7 + add x21,x21,x25 // d+=h + add x25,x25,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x14,x14,x4 + add x25,x25,x17 // h+=Sigma0(a) + add x14,x14,x3 + ldr x3,[sp,#0] + str x6,[sp,#24] + ror x16,x21,#14 + add x24,x24,x28 // h+=K[i] + ror x5,x0,#1 + and x17,x22,x21 + ror x4,x13,#19 + bic x28,x23,x21 + ror x6,x25,#28 + add x24,x24,x14 // h+=X[i] + eor x16,x16,x21,ror#18 + eor x5,x5,x0,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x25,x26 // a^b, b^c in next round + eor x16,x16,x21,ror#41 // Sigma1(e) + eor x6,x6,x25,ror#34 + add x24,x24,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x4,x4,x13,ror#61 + eor x5,x5,x0,lsr#7 // sigma0(X[i+1]) + add x24,x24,x16 // h+=Sigma1(e) + eor x19,x19,x26 // Maj(a,b,c) + eor x17,x6,x25,ror#39 // Sigma0(a) + eor x4,x4,x13,lsr#6 // sigma1(X[i+14]) + add x15,x15,x8 + add x20,x20,x24 // d+=h + add x24,x24,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x15,x15,x5 + add x24,x24,x17 // h+=Sigma0(a) + add x15,x15,x4 + ldr x4,[sp,#8] + str x7,[sp,#0] + ror x16,x20,#14 + add x23,x23,x19 // h+=K[i] + ror x6,x1,#1 + and x17,x21,x20 + ror x5,x14,#19 + bic x19,x22,x20 + ror x7,x24,#28 + add x23,x23,x15 // h+=X[i] + eor x16,x16,x20,ror#18 + eor x6,x6,x1,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x24,x25 // a^b, b^c in next round + eor x16,x16,x20,ror#41 // Sigma1(e) + eor x7,x7,x24,ror#34 + add x23,x23,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x5,x5,x14,ror#61 + eor x6,x6,x1,lsr#7 // sigma0(X[i+1]) + add x23,x23,x16 // h+=Sigma1(e) + eor x28,x28,x25 // Maj(a,b,c) + eor x17,x7,x24,ror#39 // Sigma0(a) + eor x5,x5,x14,lsr#6 // sigma1(X[i+14]) + add x0,x0,x9 + add x27,x27,x23 // d+=h + add x23,x23,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x0,x0,x6 + add x23,x23,x17 // h+=Sigma0(a) + add x0,x0,x5 + ldr x5,[sp,#16] + str x8,[sp,#8] + ror x16,x27,#14 + add x22,x22,x28 // h+=K[i] + ror x7,x2,#1 + and x17,x20,x27 + ror x6,x15,#19 + bic x28,x21,x27 + ror x8,x23,#28 + add x22,x22,x0 // h+=X[i] + eor x16,x16,x27,ror#18 + eor x7,x7,x2,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x23,x24 // a^b, b^c in next round + eor x16,x16,x27,ror#41 // Sigma1(e) + eor x8,x8,x23,ror#34 + add x22,x22,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x6,x6,x15,ror#61 + eor x7,x7,x2,lsr#7 // sigma0(X[i+1]) + add x22,x22,x16 // h+=Sigma1(e) + eor x19,x19,x24 // Maj(a,b,c) + eor x17,x8,x23,ror#39 // Sigma0(a) + eor x6,x6,x15,lsr#6 // sigma1(X[i+14]) + add x1,x1,x10 + add x26,x26,x22 // d+=h + add x22,x22,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x1,x1,x7 + add x22,x22,x17 // h+=Sigma0(a) + add x1,x1,x6 + ldr x6,[sp,#24] + str x9,[sp,#16] + ror x16,x26,#14 + add x21,x21,x19 // h+=K[i] + ror x8,x3,#1 + and x17,x27,x26 + ror x7,x0,#19 + bic x19,x20,x26 + ror x9,x22,#28 + add x21,x21,x1 // h+=X[i] + eor x16,x16,x26,ror#18 + eor x8,x8,x3,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x22,x23 // a^b, b^c in next round + eor x16,x16,x26,ror#41 // Sigma1(e) + eor x9,x9,x22,ror#34 + add x21,x21,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x7,x7,x0,ror#61 + eor x8,x8,x3,lsr#7 // sigma0(X[i+1]) + add x21,x21,x16 // h+=Sigma1(e) + eor x28,x28,x23 // Maj(a,b,c) + eor x17,x9,x22,ror#39 // Sigma0(a) + eor x7,x7,x0,lsr#6 // sigma1(X[i+14]) + add x2,x2,x11 + add x25,x25,x21 // d+=h + add x21,x21,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x2,x2,x8 + add x21,x21,x17 // h+=Sigma0(a) + add x2,x2,x7 + ldr 
x7,[sp,#0] + str x10,[sp,#24] + ror x16,x25,#14 + add x20,x20,x28 // h+=K[i] + ror x9,x4,#1 + and x17,x26,x25 + ror x8,x1,#19 + bic x28,x27,x25 + ror x10,x21,#28 + add x20,x20,x2 // h+=X[i] + eor x16,x16,x25,ror#18 + eor x9,x9,x4,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x21,x22 // a^b, b^c in next round + eor x16,x16,x25,ror#41 // Sigma1(e) + eor x10,x10,x21,ror#34 + add x20,x20,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x8,x8,x1,ror#61 + eor x9,x9,x4,lsr#7 // sigma0(X[i+1]) + add x20,x20,x16 // h+=Sigma1(e) + eor x19,x19,x22 // Maj(a,b,c) + eor x17,x10,x21,ror#39 // Sigma0(a) + eor x8,x8,x1,lsr#6 // sigma1(X[i+14]) + add x3,x3,x12 + add x24,x24,x20 // d+=h + add x20,x20,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x3,x3,x9 + add x20,x20,x17 // h+=Sigma0(a) + add x3,x3,x8 + cbnz x19,Loop_16_xx + + ldp x0,x2,[x29,#96] + ldr x1,[x29,#112] + sub x30,x30,#648 // rewind + + ldp x3,x4,[x0] + ldp x5,x6,[x0,#2*8] + add x1,x1,#14*8 // advance input pointer + ldp x7,x8,[x0,#4*8] + add x20,x20,x3 + ldp x9,x10,[x0,#6*8] + add x21,x21,x4 + add x22,x22,x5 + add x23,x23,x6 + stp x20,x21,[x0] + add x24,x24,x7 + add x25,x25,x8 + stp x22,x23,[x0,#2*8] + add x26,x26,x9 + add x27,x27,x10 + cmp x1,x2 + stp x24,x25,[x0,#4*8] + stp x26,x27,[x0,#6*8] + b.ne Loop + + ldp x19,x20,[x29,#16] + add sp,sp,#4*8 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#128 + AARCH64_VALIDATE_LINK_REGISTER + ret + + +.section .rodata +.align 6 + +LK512: +.quad 0x428a2f98d728ae22,0x7137449123ef65cd +.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc +.quad 0x3956c25bf348b538,0x59f111f1b605d019 +.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 +.quad 0xd807aa98a3030242,0x12835b0145706fbe +.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 +.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 +.quad 0x9bdc06a725c71235,0xc19bf174cf692694 +.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 +.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 +.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 +.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 +.quad 0x983e5152ee66dfab,0xa831c66d2db43210 +.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 +.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 +.quad 0x06ca6351e003826f,0x142929670a0e6e70 +.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 +.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df +.quad 0x650a73548baf63de,0x766a0abb3c77b2a8 +.quad 0x81c2c92e47edaee6,0x92722c851482353b +.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 +.quad 0xc24b8b70d0f89791,0xc76c51a30654be30 +.quad 0xd192e819d6ef5218,0xd69906245565a910 +.quad 0xf40e35855771202a,0x106aa07032bbd1b8 +.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 +.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 +.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb +.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 +.quad 0x748f82ee5defb2fc,0x78a5636f43172f60 +.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec +.quad 0x90befffa23631e28,0xa4506cebde82bde9 +.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b +.quad 0xca273eceea26619c,0xd186b8c721c0c207 +.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 +.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 +.quad 0x113f9804bef90dae,0x1b710b35131c471b +.quad 0x28db77f523047d84,0x32caab7b40c72493 +.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c +.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a +.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 +.quad 0 // terminator + +.byte 
83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 2 +.text +#ifndef __KERNEL__ +.globl sha512_block_data_order_hw + +.def sha512_block_data_order_hw + .type 32 +.endef +.align 6 +sha512_block_data_order_hw: + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + AARCH64_VALID_CALL_TARGET + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[x1],#64 // load input + ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 + + ld1 {v0.2d,v1.2d,v2.2d,v3.2d},[x0] // load context + adrp x3,LK512 + add x3,x3,:lo12:LK512 + + rev64 v16.16b,v16.16b + rev64 v17.16b,v17.16b + rev64 v18.16b,v18.16b + rev64 v19.16b,v19.16b + rev64 v20.16b,v20.16b + rev64 v21.16b,v21.16b + rev64 v22.16b,v22.16b + rev64 v23.16b,v23.16b + b Loop_hw + +.align 4 +Loop_hw: + ld1 {v24.2d},[x3],#16 + subs x2,x2,#1 + sub x4,x1,#128 + orr v26.16b,v0.16b,v0.16b // offload + orr v27.16b,v1.16b,v1.16b + orr v28.16b,v2.16b,v2.16b + orr v29.16b,v3.16b,v3.16b + csel x1,x1,x4,ne // conditional rewind + add v24.2d,v24.2d,v16.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 + add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec08230 //sha512su0 v16.16b,v17.16b + ext v7.16b,v20.16b,v21.16b,#8 +.long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b +.long 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b + add v4.2d,v1.2d,v3.2d // "D + T1" +.long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b + add v25.2d,v25.2d,v17.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v4.16b,v2.16b,#8 + ext v6.16b,v0.16b,v4.16b,#8 + add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec08251 //sha512su0 v17.16b,v18.16b + ext v7.16b,v21.16b,v22.16b,#8 +.long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b +.long 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b + add v1.2d,v0.2d,v2.2d // "D + T1" +.long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b + add v24.2d,v24.2d,v18.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec08272 //sha512su0 v18.16b,v19.16b + ext v7.16b,v22.16b,v23.16b,#8 +.long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b +.long 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b + add v0.2d,v3.2d,v4.2d // "D + T1" +.long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b + add v25.2d,v25.2d,v19.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext v6.16b,v2.16b,v0.16b,#8 + add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec08293 //sha512su0 v19.16b,v20.16b + ext v7.16b,v23.16b,v16.16b,#8 +.long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b +.long 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b + add v3.2d,v2.2d,v1.2d // "D + T1" +.long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b + add v24.2d,v24.2d,v20.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec082b4 //sha512su0 v20.16b,v21.16b + ext v7.16b,v16.16b,v17.16b,#8 +.long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b +.long 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b + add v2.2d,v4.2d,v0.2d // "D + T1" +.long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b + add v25.2d,v25.2d,v21.2d + ld1 {v24.2d},[x3],#16 + 
ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 + add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec082d5 //sha512su0 v21.16b,v22.16b + ext v7.16b,v17.16b,v18.16b,#8 +.long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b +.long 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b + add v4.2d,v1.2d,v3.2d // "D + T1" +.long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b + add v24.2d,v24.2d,v22.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v4.16b,v2.16b,#8 + ext v6.16b,v0.16b,v4.16b,#8 + add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec082f6 //sha512su0 v22.16b,v23.16b + ext v7.16b,v18.16b,v19.16b,#8 +.long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b +.long 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b + add v1.2d,v0.2d,v2.2d // "D + T1" +.long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b + add v25.2d,v25.2d,v23.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec08217 //sha512su0 v23.16b,v16.16b + ext v7.16b,v19.16b,v20.16b,#8 +.long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b +.long 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b + add v0.2d,v3.2d,v4.2d // "D + T1" +.long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b + add v24.2d,v24.2d,v16.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext v6.16b,v2.16b,v0.16b,#8 + add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec08230 //sha512su0 v16.16b,v17.16b + ext v7.16b,v20.16b,v21.16b,#8 +.long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b +.long 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b + add v3.2d,v2.2d,v1.2d // "D + T1" +.long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b + add v25.2d,v25.2d,v17.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec08251 //sha512su0 v17.16b,v18.16b + ext v7.16b,v21.16b,v22.16b,#8 +.long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b +.long 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b + add v2.2d,v4.2d,v0.2d // "D + T1" +.long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b + add v24.2d,v24.2d,v18.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 + add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec08272 //sha512su0 v18.16b,v19.16b + ext v7.16b,v22.16b,v23.16b,#8 +.long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b +.long 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b + add v4.2d,v1.2d,v3.2d // "D + T1" +.long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b + add v25.2d,v25.2d,v19.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v4.16b,v2.16b,#8 + ext v6.16b,v0.16b,v4.16b,#8 + add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec08293 //sha512su0 v19.16b,v20.16b + ext v7.16b,v23.16b,v16.16b,#8 +.long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b +.long 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b + add v1.2d,v0.2d,v2.2d // "D + T1" +.long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b + add v24.2d,v24.2d,v20.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec082b4 //sha512su0 v20.16b,v21.16b + ext v7.16b,v16.16b,v17.16b,#8 +.long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b +.long 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b + add v0.2d,v3.2d,v4.2d // "D + T1" 
+.long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b + add v25.2d,v25.2d,v21.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext v6.16b,v2.16b,v0.16b,#8 + add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec082d5 //sha512su0 v21.16b,v22.16b + ext v7.16b,v17.16b,v18.16b,#8 +.long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b +.long 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b + add v3.2d,v2.2d,v1.2d // "D + T1" +.long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b + add v24.2d,v24.2d,v22.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec082f6 //sha512su0 v22.16b,v23.16b + ext v7.16b,v18.16b,v19.16b,#8 +.long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b +.long 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b + add v2.2d,v4.2d,v0.2d // "D + T1" +.long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b + add v25.2d,v25.2d,v23.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 + add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec08217 //sha512su0 v23.16b,v16.16b + ext v7.16b,v19.16b,v20.16b,#8 +.long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b +.long 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b + add v4.2d,v1.2d,v3.2d // "D + T1" +.long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b + add v24.2d,v24.2d,v16.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v4.16b,v2.16b,#8 + ext v6.16b,v0.16b,v4.16b,#8 + add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec08230 //sha512su0 v16.16b,v17.16b + ext v7.16b,v20.16b,v21.16b,#8 +.long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b +.long 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b + add v1.2d,v0.2d,v2.2d // "D + T1" +.long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b + add v25.2d,v25.2d,v17.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec08251 //sha512su0 v17.16b,v18.16b + ext v7.16b,v21.16b,v22.16b,#8 +.long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b +.long 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b + add v0.2d,v3.2d,v4.2d // "D + T1" +.long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b + add v24.2d,v24.2d,v18.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext v6.16b,v2.16b,v0.16b,#8 + add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec08272 //sha512su0 v18.16b,v19.16b + ext v7.16b,v22.16b,v23.16b,#8 +.long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b +.long 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b + add v3.2d,v2.2d,v1.2d // "D + T1" +.long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b + add v25.2d,v25.2d,v19.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec08293 //sha512su0 v19.16b,v20.16b + ext v7.16b,v23.16b,v16.16b,#8 +.long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b +.long 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b + add v2.2d,v4.2d,v0.2d // "D + T1" +.long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b + add v24.2d,v24.2d,v20.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 + add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec082b4 //sha512su0 v20.16b,v21.16b + ext v7.16b,v16.16b,v17.16b,#8 +.long 0xce6680a3 //sha512h 
v3.16b,v5.16b,v6.16b +.long 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b + add v4.2d,v1.2d,v3.2d // "D + T1" +.long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b + add v25.2d,v25.2d,v21.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v4.16b,v2.16b,#8 + ext v6.16b,v0.16b,v4.16b,#8 + add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec082d5 //sha512su0 v21.16b,v22.16b + ext v7.16b,v17.16b,v18.16b,#8 +.long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b +.long 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b + add v1.2d,v0.2d,v2.2d // "D + T1" +.long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b + add v24.2d,v24.2d,v22.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec082f6 //sha512su0 v22.16b,v23.16b + ext v7.16b,v18.16b,v19.16b,#8 +.long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b +.long 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b + add v0.2d,v3.2d,v4.2d // "D + T1" +.long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b + add v25.2d,v25.2d,v23.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext v6.16b,v2.16b,v0.16b,#8 + add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec08217 //sha512su0 v23.16b,v16.16b + ext v7.16b,v19.16b,v20.16b,#8 +.long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b +.long 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b + add v3.2d,v2.2d,v1.2d // "D + T1" +.long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b + add v24.2d,v24.2d,v16.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec08230 //sha512su0 v16.16b,v17.16b + ext v7.16b,v20.16b,v21.16b,#8 +.long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b +.long 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b + add v2.2d,v4.2d,v0.2d // "D + T1" +.long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b + add v25.2d,v25.2d,v17.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 + add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec08251 //sha512su0 v17.16b,v18.16b + ext v7.16b,v21.16b,v22.16b,#8 +.long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b +.long 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b + add v4.2d,v1.2d,v3.2d // "D + T1" +.long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b + add v24.2d,v24.2d,v18.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v4.16b,v2.16b,#8 + ext v6.16b,v0.16b,v4.16b,#8 + add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec08272 //sha512su0 v18.16b,v19.16b + ext v7.16b,v22.16b,v23.16b,#8 +.long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b +.long 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b + add v1.2d,v0.2d,v2.2d // "D + T1" +.long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b + add v25.2d,v25.2d,v19.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec08293 //sha512su0 v19.16b,v20.16b + ext v7.16b,v23.16b,v16.16b,#8 +.long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b +.long 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b + add v0.2d,v3.2d,v4.2d // "D + T1" +.long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b + add v24.2d,v24.2d,v20.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext v6.16b,v2.16b,v0.16b,#8 + add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]" 
+.long 0xcec082b4 //sha512su0 v20.16b,v21.16b + ext v7.16b,v16.16b,v17.16b,#8 +.long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b +.long 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b + add v3.2d,v2.2d,v1.2d // "D + T1" +.long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b + add v25.2d,v25.2d,v21.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec082d5 //sha512su0 v21.16b,v22.16b + ext v7.16b,v17.16b,v18.16b,#8 +.long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b +.long 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b + add v2.2d,v4.2d,v0.2d // "D + T1" +.long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b + add v24.2d,v24.2d,v22.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 + add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec082f6 //sha512su0 v22.16b,v23.16b + ext v7.16b,v18.16b,v19.16b,#8 +.long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b +.long 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b + add v4.2d,v1.2d,v3.2d // "D + T1" +.long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b + add v25.2d,v25.2d,v23.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v4.16b,v2.16b,#8 + ext v6.16b,v0.16b,v4.16b,#8 + add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec08217 //sha512su0 v23.16b,v16.16b + ext v7.16b,v19.16b,v20.16b,#8 +.long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b +.long 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b + add v1.2d,v0.2d,v2.2d // "D + T1" +.long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b + ld1 {v25.2d},[x3],#16 + add v24.2d,v24.2d,v16.2d + ld1 {v16.16b},[x1],#16 // load next input + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]" +.long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b + rev64 v16.16b,v16.16b + add v0.2d,v3.2d,v4.2d // "D + T1" +.long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b + ld1 {v24.2d},[x3],#16 + add v25.2d,v25.2d,v17.2d + ld1 {v17.16b},[x1],#16 // load next input + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext v6.16b,v2.16b,v0.16b,#8 + add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]" +.long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b + rev64 v17.16b,v17.16b + add v3.2d,v2.2d,v1.2d // "D + T1" +.long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b + ld1 {v25.2d},[x3],#16 + add v24.2d,v24.2d,v18.2d + ld1 {v18.16b},[x1],#16 // load next input + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]" +.long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b + rev64 v18.16b,v18.16b + add v2.2d,v4.2d,v0.2d // "D + T1" +.long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b + ld1 {v24.2d},[x3],#16 + add v25.2d,v25.2d,v19.2d + ld1 {v19.16b},[x1],#16 // load next input + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 + add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]" +.long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b + rev64 v19.16b,v19.16b + add v4.2d,v1.2d,v3.2d // "D + T1" +.long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b + ld1 {v25.2d},[x3],#16 + add v24.2d,v24.2d,v20.2d + ld1 {v20.16b},[x1],#16 // load next input + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v4.16b,v2.16b,#8 + ext v6.16b,v0.16b,v4.16b,#8 + add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]" +.long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b + rev64 v20.16b,v20.16b + add v1.2d,v0.2d,v2.2d // "D + 
T1" +.long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b + ld1 {v24.2d},[x3],#16 + add v25.2d,v25.2d,v21.2d + ld1 {v21.16b},[x1],#16 // load next input + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]" +.long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b + rev64 v21.16b,v21.16b + add v0.2d,v3.2d,v4.2d // "D + T1" +.long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b + ld1 {v25.2d},[x3],#16 + add v24.2d,v24.2d,v22.2d + ld1 {v22.16b},[x1],#16 // load next input + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext v6.16b,v2.16b,v0.16b,#8 + add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]" +.long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b + rev64 v22.16b,v22.16b + add v3.2d,v2.2d,v1.2d // "D + T1" +.long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b + sub x3,x3,#80*8 // rewind + add v25.2d,v25.2d,v23.2d + ld1 {v23.16b},[x1],#16 // load next input + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]" +.long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b + rev64 v23.16b,v23.16b + add v2.2d,v4.2d,v0.2d // "D + T1" +.long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b + add v0.2d,v0.2d,v26.2d // accumulate + add v1.2d,v1.2d,v27.2d + add v2.2d,v2.2d,v28.2d + add v3.2d,v3.2d,v29.2d + + cbnz x2,Loop_hw + + st1 {v0.2d,v1.2d,v2.2d,v3.2d},[x0] // store context + + ldr x29,[sp],#16 + ret + +#endif +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32) diff --git a/ring-0.17.14/pregenerated/sha512-x86_64-elf.S b/ring-0.17.14/pregenerated/sha512-x86_64-elf.S new file mode 100644 index 0000000000..7e7727757d --- /dev/null +++ b/ring-0.17.14/pregenerated/sha512-x86_64-elf.S @@ -0,0 +1,2978 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__) +.text + +.globl sha512_block_data_order_nohw +.hidden sha512_block_data_order_nohw +.type sha512_block_data_order_nohw,@function +.align 16 +sha512_block_data_order_nohw: +.cfi_startproc +_CET_ENDBR + movq %rsp,%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 + shlq $4,%rdx + subq $128+32,%rsp + leaq (%rsi,%rdx,8),%rdx + andq $-64,%rsp + movq %rdi,128+0(%rsp) + movq %rsi,128+8(%rsp) + movq %rdx,128+16(%rsp) + movq %rax,152(%rsp) +.cfi_escape 0x0f,0x06,0x77,0x98,0x01,0x06,0x23,0x08 +.Lprologue: + + movq 0(%rdi),%rax + movq 8(%rdi),%rbx + movq 16(%rdi),%rcx + movq 24(%rdi),%rdx + movq 32(%rdi),%r8 + movq 40(%rdi),%r9 + movq 48(%rdi),%r10 + movq 56(%rdi),%r11 + jmp .Lloop + +.align 16 +.Lloop: + movq %rbx,%rdi + leaq K512(%rip),%rbp + xorq %rcx,%rdi + movq 0(%rsi),%r12 + movq %r8,%r13 + movq %rax,%r14 + bswapq %r12 + rorq $23,%r13 + movq %r9,%r15 + + xorq %r8,%r13 + rorq $5,%r14 + xorq %r10,%r15 + + movq %r12,0(%rsp) + xorq %rax,%r14 + andq %r8,%r15 + + rorq $4,%r13 + addq %r11,%r12 + xorq %r10,%r15 + + rorq $6,%r14 + xorq %r8,%r13 + addq %r15,%r12 + + movq %rax,%r15 + addq (%rbp),%r12 + xorq %rax,%r14 + + xorq %rbx,%r15 + rorq $14,%r13 + movq %rbx,%r11 + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%r11 + addq %r12,%rdx + addq %r12,%r11 + + leaq 8(%rbp),%rbp + addq %r14,%r11 + movq 8(%rsi),%r12 + movq %rdx,%r13 + movq %r11,%r14 + bswapq %r12 + rorq $23,%r13 + movq %r8,%rdi + + xorq %rdx,%r13 + rorq $5,%r14 + xorq %r9,%rdi + + movq %r12,8(%rsp) + xorq %r11,%r14 + andq %rdx,%rdi + + rorq $4,%r13 + addq %r10,%r12 + xorq %r9,%rdi + + rorq $6,%r14 + xorq %rdx,%r13 + addq %rdi,%r12 + + movq %r11,%rdi + addq (%rbp),%r12 + xorq %r11,%r14 + + xorq %rax,%rdi + rorq $14,%r13 + movq %rax,%r10 + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%r10 + addq %r12,%rcx + addq %r12,%r10 + + leaq 24(%rbp),%rbp + addq %r14,%r10 + movq 16(%rsi),%r12 + movq %rcx,%r13 + movq %r10,%r14 + bswapq %r12 + rorq $23,%r13 + movq %rdx,%r15 + + xorq %rcx,%r13 + rorq $5,%r14 + xorq %r8,%r15 + + movq %r12,16(%rsp) + xorq %r10,%r14 + andq %rcx,%r15 + + rorq $4,%r13 + addq %r9,%r12 + xorq %r8,%r15 + + rorq $6,%r14 + xorq %rcx,%r13 + addq %r15,%r12 + + movq %r10,%r15 + addq (%rbp),%r12 + xorq %r10,%r14 + + xorq %r11,%r15 + rorq $14,%r13 + movq %r11,%r9 + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%r9 + addq %r12,%rbx + addq %r12,%r9 + + leaq 8(%rbp),%rbp + addq %r14,%r9 + movq 24(%rsi),%r12 + movq %rbx,%r13 + movq %r9,%r14 + bswapq %r12 + rorq $23,%r13 + movq %rcx,%rdi + + xorq %rbx,%r13 + rorq $5,%r14 + xorq %rdx,%rdi + + movq %r12,24(%rsp) + xorq %r9,%r14 + andq %rbx,%rdi + + rorq $4,%r13 + addq %r8,%r12 + xorq %rdx,%rdi + + rorq $6,%r14 + xorq %rbx,%r13 + addq %rdi,%r12 + + movq %r9,%rdi + addq (%rbp),%r12 + xorq %r9,%r14 + + xorq %r10,%rdi + rorq $14,%r13 + movq %r10,%r8 + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%r8 + addq %r12,%rax + addq %r12,%r8 + + leaq 24(%rbp),%rbp + addq %r14,%r8 + movq 32(%rsi),%r12 + movq %rax,%r13 + movq %r8,%r14 + bswapq %r12 + rorq $23,%r13 + movq %rbx,%r15 + + xorq %rax,%r13 + rorq $5,%r14 + xorq %rcx,%r15 + + movq %r12,32(%rsp) + xorq %r8,%r14 + andq %rax,%r15 + + rorq $4,%r13 + addq %rdx,%r12 + xorq %rcx,%r15 + + rorq $6,%r14 + xorq 
%rax,%r13 + addq %r15,%r12 + + movq %r8,%r15 + addq (%rbp),%r12 + xorq %r8,%r14 + + xorq %r9,%r15 + rorq $14,%r13 + movq %r9,%rdx + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%rdx + addq %r12,%r11 + addq %r12,%rdx + + leaq 8(%rbp),%rbp + addq %r14,%rdx + movq 40(%rsi),%r12 + movq %r11,%r13 + movq %rdx,%r14 + bswapq %r12 + rorq $23,%r13 + movq %rax,%rdi + + xorq %r11,%r13 + rorq $5,%r14 + xorq %rbx,%rdi + + movq %r12,40(%rsp) + xorq %rdx,%r14 + andq %r11,%rdi + + rorq $4,%r13 + addq %rcx,%r12 + xorq %rbx,%rdi + + rorq $6,%r14 + xorq %r11,%r13 + addq %rdi,%r12 + + movq %rdx,%rdi + addq (%rbp),%r12 + xorq %rdx,%r14 + + xorq %r8,%rdi + rorq $14,%r13 + movq %r8,%rcx + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%rcx + addq %r12,%r10 + addq %r12,%rcx + + leaq 24(%rbp),%rbp + addq %r14,%rcx + movq 48(%rsi),%r12 + movq %r10,%r13 + movq %rcx,%r14 + bswapq %r12 + rorq $23,%r13 + movq %r11,%r15 + + xorq %r10,%r13 + rorq $5,%r14 + xorq %rax,%r15 + + movq %r12,48(%rsp) + xorq %rcx,%r14 + andq %r10,%r15 + + rorq $4,%r13 + addq %rbx,%r12 + xorq %rax,%r15 + + rorq $6,%r14 + xorq %r10,%r13 + addq %r15,%r12 + + movq %rcx,%r15 + addq (%rbp),%r12 + xorq %rcx,%r14 + + xorq %rdx,%r15 + rorq $14,%r13 + movq %rdx,%rbx + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%rbx + addq %r12,%r9 + addq %r12,%rbx + + leaq 8(%rbp),%rbp + addq %r14,%rbx + movq 56(%rsi),%r12 + movq %r9,%r13 + movq %rbx,%r14 + bswapq %r12 + rorq $23,%r13 + movq %r10,%rdi + + xorq %r9,%r13 + rorq $5,%r14 + xorq %r11,%rdi + + movq %r12,56(%rsp) + xorq %rbx,%r14 + andq %r9,%rdi + + rorq $4,%r13 + addq %rax,%r12 + xorq %r11,%rdi + + rorq $6,%r14 + xorq %r9,%r13 + addq %rdi,%r12 + + movq %rbx,%rdi + addq (%rbp),%r12 + xorq %rbx,%r14 + + xorq %rcx,%rdi + rorq $14,%r13 + movq %rcx,%rax + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%rax + addq %r12,%r8 + addq %r12,%rax + + leaq 24(%rbp),%rbp + addq %r14,%rax + movq 64(%rsi),%r12 + movq %r8,%r13 + movq %rax,%r14 + bswapq %r12 + rorq $23,%r13 + movq %r9,%r15 + + xorq %r8,%r13 + rorq $5,%r14 + xorq %r10,%r15 + + movq %r12,64(%rsp) + xorq %rax,%r14 + andq %r8,%r15 + + rorq $4,%r13 + addq %r11,%r12 + xorq %r10,%r15 + + rorq $6,%r14 + xorq %r8,%r13 + addq %r15,%r12 + + movq %rax,%r15 + addq (%rbp),%r12 + xorq %rax,%r14 + + xorq %rbx,%r15 + rorq $14,%r13 + movq %rbx,%r11 + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%r11 + addq %r12,%rdx + addq %r12,%r11 + + leaq 8(%rbp),%rbp + addq %r14,%r11 + movq 72(%rsi),%r12 + movq %rdx,%r13 + movq %r11,%r14 + bswapq %r12 + rorq $23,%r13 + movq %r8,%rdi + + xorq %rdx,%r13 + rorq $5,%r14 + xorq %r9,%rdi + + movq %r12,72(%rsp) + xorq %r11,%r14 + andq %rdx,%rdi + + rorq $4,%r13 + addq %r10,%r12 + xorq %r9,%rdi + + rorq $6,%r14 + xorq %rdx,%r13 + addq %rdi,%r12 + + movq %r11,%rdi + addq (%rbp),%r12 + xorq %r11,%r14 + + xorq %rax,%rdi + rorq $14,%r13 + movq %rax,%r10 + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%r10 + addq %r12,%rcx + addq %r12,%r10 + + leaq 24(%rbp),%rbp + addq %r14,%r10 + movq 80(%rsi),%r12 + movq %rcx,%r13 + movq %r10,%r14 + bswapq %r12 + rorq $23,%r13 + movq %rdx,%r15 + + xorq %rcx,%r13 + rorq $5,%r14 + xorq %r8,%r15 + + movq %r12,80(%rsp) + xorq %r10,%r14 + andq %rcx,%r15 + + rorq $4,%r13 + addq %r9,%r12 + xorq %r8,%r15 + + rorq $6,%r14 + xorq %rcx,%r13 + addq %r15,%r12 + + movq %r10,%r15 + addq (%rbp),%r12 + xorq %r10,%r14 + + xorq %r11,%r15 + rorq $14,%r13 + movq %r11,%r9 + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq 
%rdi,%r9 + addq %r12,%rbx + addq %r12,%r9 + + leaq 8(%rbp),%rbp + addq %r14,%r9 + movq 88(%rsi),%r12 + movq %rbx,%r13 + movq %r9,%r14 + bswapq %r12 + rorq $23,%r13 + movq %rcx,%rdi + + xorq %rbx,%r13 + rorq $5,%r14 + xorq %rdx,%rdi + + movq %r12,88(%rsp) + xorq %r9,%r14 + andq %rbx,%rdi + + rorq $4,%r13 + addq %r8,%r12 + xorq %rdx,%rdi + + rorq $6,%r14 + xorq %rbx,%r13 + addq %rdi,%r12 + + movq %r9,%rdi + addq (%rbp),%r12 + xorq %r9,%r14 + + xorq %r10,%rdi + rorq $14,%r13 + movq %r10,%r8 + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%r8 + addq %r12,%rax + addq %r12,%r8 + + leaq 24(%rbp),%rbp + addq %r14,%r8 + movq 96(%rsi),%r12 + movq %rax,%r13 + movq %r8,%r14 + bswapq %r12 + rorq $23,%r13 + movq %rbx,%r15 + + xorq %rax,%r13 + rorq $5,%r14 + xorq %rcx,%r15 + + movq %r12,96(%rsp) + xorq %r8,%r14 + andq %rax,%r15 + + rorq $4,%r13 + addq %rdx,%r12 + xorq %rcx,%r15 + + rorq $6,%r14 + xorq %rax,%r13 + addq %r15,%r12 + + movq %r8,%r15 + addq (%rbp),%r12 + xorq %r8,%r14 + + xorq %r9,%r15 + rorq $14,%r13 + movq %r9,%rdx + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%rdx + addq %r12,%r11 + addq %r12,%rdx + + leaq 8(%rbp),%rbp + addq %r14,%rdx + movq 104(%rsi),%r12 + movq %r11,%r13 + movq %rdx,%r14 + bswapq %r12 + rorq $23,%r13 + movq %rax,%rdi + + xorq %r11,%r13 + rorq $5,%r14 + xorq %rbx,%rdi + + movq %r12,104(%rsp) + xorq %rdx,%r14 + andq %r11,%rdi + + rorq $4,%r13 + addq %rcx,%r12 + xorq %rbx,%rdi + + rorq $6,%r14 + xorq %r11,%r13 + addq %rdi,%r12 + + movq %rdx,%rdi + addq (%rbp),%r12 + xorq %rdx,%r14 + + xorq %r8,%rdi + rorq $14,%r13 + movq %r8,%rcx + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%rcx + addq %r12,%r10 + addq %r12,%rcx + + leaq 24(%rbp),%rbp + addq %r14,%rcx + movq 112(%rsi),%r12 + movq %r10,%r13 + movq %rcx,%r14 + bswapq %r12 + rorq $23,%r13 + movq %r11,%r15 + + xorq %r10,%r13 + rorq $5,%r14 + xorq %rax,%r15 + + movq %r12,112(%rsp) + xorq %rcx,%r14 + andq %r10,%r15 + + rorq $4,%r13 + addq %rbx,%r12 + xorq %rax,%r15 + + rorq $6,%r14 + xorq %r10,%r13 + addq %r15,%r12 + + movq %rcx,%r15 + addq (%rbp),%r12 + xorq %rcx,%r14 + + xorq %rdx,%r15 + rorq $14,%r13 + movq %rdx,%rbx + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%rbx + addq %r12,%r9 + addq %r12,%rbx + + leaq 8(%rbp),%rbp + addq %r14,%rbx + movq 120(%rsi),%r12 + movq %r9,%r13 + movq %rbx,%r14 + bswapq %r12 + rorq $23,%r13 + movq %r10,%rdi + + xorq %r9,%r13 + rorq $5,%r14 + xorq %r11,%rdi + + movq %r12,120(%rsp) + xorq %rbx,%r14 + andq %r9,%rdi + + rorq $4,%r13 + addq %rax,%r12 + xorq %r11,%rdi + + rorq $6,%r14 + xorq %r9,%r13 + addq %rdi,%r12 + + movq %rbx,%rdi + addq (%rbp),%r12 + xorq %rbx,%r14 + + xorq %rcx,%rdi + rorq $14,%r13 + movq %rcx,%rax + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%rax + addq %r12,%r8 + addq %r12,%rax + + leaq 24(%rbp),%rbp + jmp .Lrounds_16_xx +.align 16 +.Lrounds_16_xx: + movq 8(%rsp),%r13 + movq 112(%rsp),%r15 + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%rax + movq %r15,%r14 + rorq $42,%r15 + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%r15 + shrq $6,%r14 + + rorq $19,%r15 + xorq %r13,%r12 + xorq %r14,%r15 + addq 72(%rsp),%r12 + + addq 0(%rsp),%r12 + movq %r8,%r13 + addq %r15,%r12 + movq %rax,%r14 + rorq $23,%r13 + movq %r9,%r15 + + xorq %r8,%r13 + rorq $5,%r14 + xorq %r10,%r15 + + movq %r12,0(%rsp) + xorq %rax,%r14 + andq %r8,%r15 + + rorq $4,%r13 + addq %r11,%r12 + xorq %r10,%r15 + + rorq $6,%r14 + xorq %r8,%r13 + addq %r15,%r12 + + movq %rax,%r15 + addq (%rbp),%r12 + xorq 
%rax,%r14 + + xorq %rbx,%r15 + rorq $14,%r13 + movq %rbx,%r11 + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%r11 + addq %r12,%rdx + addq %r12,%r11 + + leaq 8(%rbp),%rbp + movq 16(%rsp),%r13 + movq 120(%rsp),%rdi + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%r11 + movq %rdi,%r14 + rorq $42,%rdi + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%rdi + shrq $6,%r14 + + rorq $19,%rdi + xorq %r13,%r12 + xorq %r14,%rdi + addq 80(%rsp),%r12 + + addq 8(%rsp),%r12 + movq %rdx,%r13 + addq %rdi,%r12 + movq %r11,%r14 + rorq $23,%r13 + movq %r8,%rdi + + xorq %rdx,%r13 + rorq $5,%r14 + xorq %r9,%rdi + + movq %r12,8(%rsp) + xorq %r11,%r14 + andq %rdx,%rdi + + rorq $4,%r13 + addq %r10,%r12 + xorq %r9,%rdi + + rorq $6,%r14 + xorq %rdx,%r13 + addq %rdi,%r12 + + movq %r11,%rdi + addq (%rbp),%r12 + xorq %r11,%r14 + + xorq %rax,%rdi + rorq $14,%r13 + movq %rax,%r10 + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%r10 + addq %r12,%rcx + addq %r12,%r10 + + leaq 24(%rbp),%rbp + movq 24(%rsp),%r13 + movq 0(%rsp),%r15 + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%r10 + movq %r15,%r14 + rorq $42,%r15 + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%r15 + shrq $6,%r14 + + rorq $19,%r15 + xorq %r13,%r12 + xorq %r14,%r15 + addq 88(%rsp),%r12 + + addq 16(%rsp),%r12 + movq %rcx,%r13 + addq %r15,%r12 + movq %r10,%r14 + rorq $23,%r13 + movq %rdx,%r15 + + xorq %rcx,%r13 + rorq $5,%r14 + xorq %r8,%r15 + + movq %r12,16(%rsp) + xorq %r10,%r14 + andq %rcx,%r15 + + rorq $4,%r13 + addq %r9,%r12 + xorq %r8,%r15 + + rorq $6,%r14 + xorq %rcx,%r13 + addq %r15,%r12 + + movq %r10,%r15 + addq (%rbp),%r12 + xorq %r10,%r14 + + xorq %r11,%r15 + rorq $14,%r13 + movq %r11,%r9 + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%r9 + addq %r12,%rbx + addq %r12,%r9 + + leaq 8(%rbp),%rbp + movq 32(%rsp),%r13 + movq 8(%rsp),%rdi + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%r9 + movq %rdi,%r14 + rorq $42,%rdi + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%rdi + shrq $6,%r14 + + rorq $19,%rdi + xorq %r13,%r12 + xorq %r14,%rdi + addq 96(%rsp),%r12 + + addq 24(%rsp),%r12 + movq %rbx,%r13 + addq %rdi,%r12 + movq %r9,%r14 + rorq $23,%r13 + movq %rcx,%rdi + + xorq %rbx,%r13 + rorq $5,%r14 + xorq %rdx,%rdi + + movq %r12,24(%rsp) + xorq %r9,%r14 + andq %rbx,%rdi + + rorq $4,%r13 + addq %r8,%r12 + xorq %rdx,%rdi + + rorq $6,%r14 + xorq %rbx,%r13 + addq %rdi,%r12 + + movq %r9,%rdi + addq (%rbp),%r12 + xorq %r9,%r14 + + xorq %r10,%rdi + rorq $14,%r13 + movq %r10,%r8 + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%r8 + addq %r12,%rax + addq %r12,%r8 + + leaq 24(%rbp),%rbp + movq 40(%rsp),%r13 + movq 16(%rsp),%r15 + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%r8 + movq %r15,%r14 + rorq $42,%r15 + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%r15 + shrq $6,%r14 + + rorq $19,%r15 + xorq %r13,%r12 + xorq %r14,%r15 + addq 104(%rsp),%r12 + + addq 32(%rsp),%r12 + movq %rax,%r13 + addq %r15,%r12 + movq %r8,%r14 + rorq $23,%r13 + movq %rbx,%r15 + + xorq %rax,%r13 + rorq $5,%r14 + xorq %rcx,%r15 + + movq %r12,32(%rsp) + xorq %r8,%r14 + andq %rax,%r15 + + rorq $4,%r13 + addq %rdx,%r12 + xorq %rcx,%r15 + + rorq $6,%r14 + xorq %rax,%r13 + addq %r15,%r12 + + movq %r8,%r15 + addq (%rbp),%r12 + xorq %r8,%r14 + + xorq %r9,%r15 + rorq $14,%r13 + movq %r9,%rdx + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%rdx + addq %r12,%r11 + addq %r12,%rdx + + leaq 8(%rbp),%rbp + movq 48(%rsp),%r13 + movq 24(%rsp),%rdi + + movq %r13,%r12 + 
rorq $7,%r13 + addq %r14,%rdx + movq %rdi,%r14 + rorq $42,%rdi + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%rdi + shrq $6,%r14 + + rorq $19,%rdi + xorq %r13,%r12 + xorq %r14,%rdi + addq 112(%rsp),%r12 + + addq 40(%rsp),%r12 + movq %r11,%r13 + addq %rdi,%r12 + movq %rdx,%r14 + rorq $23,%r13 + movq %rax,%rdi + + xorq %r11,%r13 + rorq $5,%r14 + xorq %rbx,%rdi + + movq %r12,40(%rsp) + xorq %rdx,%r14 + andq %r11,%rdi + + rorq $4,%r13 + addq %rcx,%r12 + xorq %rbx,%rdi + + rorq $6,%r14 + xorq %r11,%r13 + addq %rdi,%r12 + + movq %rdx,%rdi + addq (%rbp),%r12 + xorq %rdx,%r14 + + xorq %r8,%rdi + rorq $14,%r13 + movq %r8,%rcx + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%rcx + addq %r12,%r10 + addq %r12,%rcx + + leaq 24(%rbp),%rbp + movq 56(%rsp),%r13 + movq 32(%rsp),%r15 + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%rcx + movq %r15,%r14 + rorq $42,%r15 + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%r15 + shrq $6,%r14 + + rorq $19,%r15 + xorq %r13,%r12 + xorq %r14,%r15 + addq 120(%rsp),%r12 + + addq 48(%rsp),%r12 + movq %r10,%r13 + addq %r15,%r12 + movq %rcx,%r14 + rorq $23,%r13 + movq %r11,%r15 + + xorq %r10,%r13 + rorq $5,%r14 + xorq %rax,%r15 + + movq %r12,48(%rsp) + xorq %rcx,%r14 + andq %r10,%r15 + + rorq $4,%r13 + addq %rbx,%r12 + xorq %rax,%r15 + + rorq $6,%r14 + xorq %r10,%r13 + addq %r15,%r12 + + movq %rcx,%r15 + addq (%rbp),%r12 + xorq %rcx,%r14 + + xorq %rdx,%r15 + rorq $14,%r13 + movq %rdx,%rbx + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%rbx + addq %r12,%r9 + addq %r12,%rbx + + leaq 8(%rbp),%rbp + movq 64(%rsp),%r13 + movq 40(%rsp),%rdi + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%rbx + movq %rdi,%r14 + rorq $42,%rdi + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%rdi + shrq $6,%r14 + + rorq $19,%rdi + xorq %r13,%r12 + xorq %r14,%rdi + addq 0(%rsp),%r12 + + addq 56(%rsp),%r12 + movq %r9,%r13 + addq %rdi,%r12 + movq %rbx,%r14 + rorq $23,%r13 + movq %r10,%rdi + + xorq %r9,%r13 + rorq $5,%r14 + xorq %r11,%rdi + + movq %r12,56(%rsp) + xorq %rbx,%r14 + andq %r9,%rdi + + rorq $4,%r13 + addq %rax,%r12 + xorq %r11,%rdi + + rorq $6,%r14 + xorq %r9,%r13 + addq %rdi,%r12 + + movq %rbx,%rdi + addq (%rbp),%r12 + xorq %rbx,%r14 + + xorq %rcx,%rdi + rorq $14,%r13 + movq %rcx,%rax + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%rax + addq %r12,%r8 + addq %r12,%rax + + leaq 24(%rbp),%rbp + movq 72(%rsp),%r13 + movq 48(%rsp),%r15 + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%rax + movq %r15,%r14 + rorq $42,%r15 + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%r15 + shrq $6,%r14 + + rorq $19,%r15 + xorq %r13,%r12 + xorq %r14,%r15 + addq 8(%rsp),%r12 + + addq 64(%rsp),%r12 + movq %r8,%r13 + addq %r15,%r12 + movq %rax,%r14 + rorq $23,%r13 + movq %r9,%r15 + + xorq %r8,%r13 + rorq $5,%r14 + xorq %r10,%r15 + + movq %r12,64(%rsp) + xorq %rax,%r14 + andq %r8,%r15 + + rorq $4,%r13 + addq %r11,%r12 + xorq %r10,%r15 + + rorq $6,%r14 + xorq %r8,%r13 + addq %r15,%r12 + + movq %rax,%r15 + addq (%rbp),%r12 + xorq %rax,%r14 + + xorq %rbx,%r15 + rorq $14,%r13 + movq %rbx,%r11 + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%r11 + addq %r12,%rdx + addq %r12,%r11 + + leaq 8(%rbp),%rbp + movq 80(%rsp),%r13 + movq 56(%rsp),%rdi + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%r11 + movq %rdi,%r14 + rorq $42,%rdi + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%rdi + shrq $6,%r14 + + rorq $19,%rdi + xorq %r13,%r12 + xorq %r14,%rdi + addq 16(%rsp),%r12 + + addq 72(%rsp),%r12 + 
movq %rdx,%r13 + addq %rdi,%r12 + movq %r11,%r14 + rorq $23,%r13 + movq %r8,%rdi + + xorq %rdx,%r13 + rorq $5,%r14 + xorq %r9,%rdi + + movq %r12,72(%rsp) + xorq %r11,%r14 + andq %rdx,%rdi + + rorq $4,%r13 + addq %r10,%r12 + xorq %r9,%rdi + + rorq $6,%r14 + xorq %rdx,%r13 + addq %rdi,%r12 + + movq %r11,%rdi + addq (%rbp),%r12 + xorq %r11,%r14 + + xorq %rax,%rdi + rorq $14,%r13 + movq %rax,%r10 + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%r10 + addq %r12,%rcx + addq %r12,%r10 + + leaq 24(%rbp),%rbp + movq 88(%rsp),%r13 + movq 64(%rsp),%r15 + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%r10 + movq %r15,%r14 + rorq $42,%r15 + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%r15 + shrq $6,%r14 + + rorq $19,%r15 + xorq %r13,%r12 + xorq %r14,%r15 + addq 24(%rsp),%r12 + + addq 80(%rsp),%r12 + movq %rcx,%r13 + addq %r15,%r12 + movq %r10,%r14 + rorq $23,%r13 + movq %rdx,%r15 + + xorq %rcx,%r13 + rorq $5,%r14 + xorq %r8,%r15 + + movq %r12,80(%rsp) + xorq %r10,%r14 + andq %rcx,%r15 + + rorq $4,%r13 + addq %r9,%r12 + xorq %r8,%r15 + + rorq $6,%r14 + xorq %rcx,%r13 + addq %r15,%r12 + + movq %r10,%r15 + addq (%rbp),%r12 + xorq %r10,%r14 + + xorq %r11,%r15 + rorq $14,%r13 + movq %r11,%r9 + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%r9 + addq %r12,%rbx + addq %r12,%r9 + + leaq 8(%rbp),%rbp + movq 96(%rsp),%r13 + movq 72(%rsp),%rdi + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%r9 + movq %rdi,%r14 + rorq $42,%rdi + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%rdi + shrq $6,%r14 + + rorq $19,%rdi + xorq %r13,%r12 + xorq %r14,%rdi + addq 32(%rsp),%r12 + + addq 88(%rsp),%r12 + movq %rbx,%r13 + addq %rdi,%r12 + movq %r9,%r14 + rorq $23,%r13 + movq %rcx,%rdi + + xorq %rbx,%r13 + rorq $5,%r14 + xorq %rdx,%rdi + + movq %r12,88(%rsp) + xorq %r9,%r14 + andq %rbx,%rdi + + rorq $4,%r13 + addq %r8,%r12 + xorq %rdx,%rdi + + rorq $6,%r14 + xorq %rbx,%r13 + addq %rdi,%r12 + + movq %r9,%rdi + addq (%rbp),%r12 + xorq %r9,%r14 + + xorq %r10,%rdi + rorq $14,%r13 + movq %r10,%r8 + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%r8 + addq %r12,%rax + addq %r12,%r8 + + leaq 24(%rbp),%rbp + movq 104(%rsp),%r13 + movq 80(%rsp),%r15 + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%r8 + movq %r15,%r14 + rorq $42,%r15 + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%r15 + shrq $6,%r14 + + rorq $19,%r15 + xorq %r13,%r12 + xorq %r14,%r15 + addq 40(%rsp),%r12 + + addq 96(%rsp),%r12 + movq %rax,%r13 + addq %r15,%r12 + movq %r8,%r14 + rorq $23,%r13 + movq %rbx,%r15 + + xorq %rax,%r13 + rorq $5,%r14 + xorq %rcx,%r15 + + movq %r12,96(%rsp) + xorq %r8,%r14 + andq %rax,%r15 + + rorq $4,%r13 + addq %rdx,%r12 + xorq %rcx,%r15 + + rorq $6,%r14 + xorq %rax,%r13 + addq %r15,%r12 + + movq %r8,%r15 + addq (%rbp),%r12 + xorq %r8,%r14 + + xorq %r9,%r15 + rorq $14,%r13 + movq %r9,%rdx + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%rdx + addq %r12,%r11 + addq %r12,%rdx + + leaq 8(%rbp),%rbp + movq 112(%rsp),%r13 + movq 88(%rsp),%rdi + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%rdx + movq %rdi,%r14 + rorq $42,%rdi + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%rdi + shrq $6,%r14 + + rorq $19,%rdi + xorq %r13,%r12 + xorq %r14,%rdi + addq 48(%rsp),%r12 + + addq 104(%rsp),%r12 + movq %r11,%r13 + addq %rdi,%r12 + movq %rdx,%r14 + rorq $23,%r13 + movq %rax,%rdi + + xorq %r11,%r13 + rorq $5,%r14 + xorq %rbx,%rdi + + movq %r12,104(%rsp) + xorq %rdx,%r14 + andq %r11,%rdi + + rorq $4,%r13 + addq %rcx,%r12 + xorq %rbx,%rdi + + rorq 
$6,%r14 + xorq %r11,%r13 + addq %rdi,%r12 + + movq %rdx,%rdi + addq (%rbp),%r12 + xorq %rdx,%r14 + + xorq %r8,%rdi + rorq $14,%r13 + movq %r8,%rcx + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%rcx + addq %r12,%r10 + addq %r12,%rcx + + leaq 24(%rbp),%rbp + movq 120(%rsp),%r13 + movq 96(%rsp),%r15 + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%rcx + movq %r15,%r14 + rorq $42,%r15 + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%r15 + shrq $6,%r14 + + rorq $19,%r15 + xorq %r13,%r12 + xorq %r14,%r15 + addq 56(%rsp),%r12 + + addq 112(%rsp),%r12 + movq %r10,%r13 + addq %r15,%r12 + movq %rcx,%r14 + rorq $23,%r13 + movq %r11,%r15 + + xorq %r10,%r13 + rorq $5,%r14 + xorq %rax,%r15 + + movq %r12,112(%rsp) + xorq %rcx,%r14 + andq %r10,%r15 + + rorq $4,%r13 + addq %rbx,%r12 + xorq %rax,%r15 + + rorq $6,%r14 + xorq %r10,%r13 + addq %r15,%r12 + + movq %rcx,%r15 + addq (%rbp),%r12 + xorq %rcx,%r14 + + xorq %rdx,%r15 + rorq $14,%r13 + movq %rdx,%rbx + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%rbx + addq %r12,%r9 + addq %r12,%rbx + + leaq 8(%rbp),%rbp + movq 0(%rsp),%r13 + movq 104(%rsp),%rdi + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%rbx + movq %rdi,%r14 + rorq $42,%rdi + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%rdi + shrq $6,%r14 + + rorq $19,%rdi + xorq %r13,%r12 + xorq %r14,%rdi + addq 64(%rsp),%r12 + + addq 120(%rsp),%r12 + movq %r9,%r13 + addq %rdi,%r12 + movq %rbx,%r14 + rorq $23,%r13 + movq %r10,%rdi + + xorq %r9,%r13 + rorq $5,%r14 + xorq %r11,%rdi + + movq %r12,120(%rsp) + xorq %rbx,%r14 + andq %r9,%rdi + + rorq $4,%r13 + addq %rax,%r12 + xorq %r11,%rdi + + rorq $6,%r14 + xorq %r9,%r13 + addq %rdi,%r12 + + movq %rbx,%rdi + addq (%rbp),%r12 + xorq %rbx,%r14 + + xorq %rcx,%rdi + rorq $14,%r13 + movq %rcx,%rax + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%rax + addq %r12,%r8 + addq %r12,%rax + + leaq 24(%rbp),%rbp + cmpb $0,7(%rbp) + jnz .Lrounds_16_xx + + movq 128+0(%rsp),%rdi + addq %r14,%rax + leaq 128(%rsi),%rsi + + addq 0(%rdi),%rax + addq 8(%rdi),%rbx + addq 16(%rdi),%rcx + addq 24(%rdi),%rdx + addq 32(%rdi),%r8 + addq 40(%rdi),%r9 + addq 48(%rdi),%r10 + addq 56(%rdi),%r11 + + cmpq 128+16(%rsp),%rsi + + movq %rax,0(%rdi) + movq %rbx,8(%rdi) + movq %rcx,16(%rdi) + movq %rdx,24(%rdi) + movq %r8,32(%rdi) + movq %r9,40(%rdi) + movq %r10,48(%rdi) + movq %r11,56(%rdi) + jb .Lloop + + movq 152(%rsp),%rsi +.cfi_def_cfa %rsi,8 + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lepilogue: + ret +.cfi_endproc +.size sha512_block_data_order_nohw,.-sha512_block_data_order_nohw +.section .rodata +.align 64 +.type K512,@object +K512: +.quad 0x428a2f98d728ae22,0x7137449123ef65cd +.quad 0x428a2f98d728ae22,0x7137449123ef65cd +.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc +.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc +.quad 0x3956c25bf348b538,0x59f111f1b605d019 +.quad 0x3956c25bf348b538,0x59f111f1b605d019 +.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 +.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 +.quad 0xd807aa98a3030242,0x12835b0145706fbe +.quad 0xd807aa98a3030242,0x12835b0145706fbe +.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 +.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 +.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 +.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 +.quad 
0x9bdc06a725c71235,0xc19bf174cf692694 +.quad 0x9bdc06a725c71235,0xc19bf174cf692694 +.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 +.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 +.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 +.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 +.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 +.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 +.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 +.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 +.quad 0x983e5152ee66dfab,0xa831c66d2db43210 +.quad 0x983e5152ee66dfab,0xa831c66d2db43210 +.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 +.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 +.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 +.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 +.quad 0x06ca6351e003826f,0x142929670a0e6e70 +.quad 0x06ca6351e003826f,0x142929670a0e6e70 +.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 +.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 +.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df +.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df +.quad 0x650a73548baf63de,0x766a0abb3c77b2a8 +.quad 0x650a73548baf63de,0x766a0abb3c77b2a8 +.quad 0x81c2c92e47edaee6,0x92722c851482353b +.quad 0x81c2c92e47edaee6,0x92722c851482353b +.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 +.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 +.quad 0xc24b8b70d0f89791,0xc76c51a30654be30 +.quad 0xc24b8b70d0f89791,0xc76c51a30654be30 +.quad 0xd192e819d6ef5218,0xd69906245565a910 +.quad 0xd192e819d6ef5218,0xd69906245565a910 +.quad 0xf40e35855771202a,0x106aa07032bbd1b8 +.quad 0xf40e35855771202a,0x106aa07032bbd1b8 +.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 +.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 +.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 +.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 +.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb +.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb +.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 +.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 +.quad 0x748f82ee5defb2fc,0x78a5636f43172f60 +.quad 0x748f82ee5defb2fc,0x78a5636f43172f60 +.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec +.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec +.quad 0x90befffa23631e28,0xa4506cebde82bde9 +.quad 0x90befffa23631e28,0xa4506cebde82bde9 +.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b +.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b +.quad 0xca273eceea26619c,0xd186b8c721c0c207 +.quad 0xca273eceea26619c,0xd186b8c721c0c207 +.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 +.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 +.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 +.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 +.quad 0x113f9804bef90dae,0x1b710b35131c471b +.quad 0x113f9804bef90dae,0x1b710b35131c471b +.quad 0x28db77f523047d84,0x32caab7b40c72493 +.quad 0x28db77f523047d84,0x32caab7b40c72493 +.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c +.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c +.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a +.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a +.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 +.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 + +.quad 0x0001020304050607,0x08090a0b0c0d0e0f +.quad 0x0001020304050607,0x08090a0b0c0d0e0f +.byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.text +.globl sha512_block_data_order_avx +.hidden sha512_block_data_order_avx +.type sha512_block_data_order_avx,@function +.align 64 +sha512_block_data_order_avx: +.cfi_startproc +_CET_ENDBR + movq %rsp,%rax +.cfi_def_cfa_register %rax + pushq %rbx 
+.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 + shlq $4,%rdx + subq $160,%rsp + leaq (%rsi,%rdx,8),%rdx + andq $-64,%rsp + movq %rdi,128+0(%rsp) + movq %rsi,128+8(%rsp) + movq %rdx,128+16(%rsp) + movq %rax,152(%rsp) +.cfi_escape 0x0f,0x06,0x77,0x98,0x01,0x06,0x23,0x08 +.Lprologue_avx: + + vzeroupper + movq 0(%rdi),%rax + movq 8(%rdi),%rbx + movq 16(%rdi),%rcx + movq 24(%rdi),%rdx + movq 32(%rdi),%r8 + movq 40(%rdi),%r9 + movq 48(%rdi),%r10 + movq 56(%rdi),%r11 + jmp .Lloop_avx +.align 16 +.Lloop_avx: + vmovdqa K512+1280(%rip),%xmm11 + vmovdqu 0(%rsi),%xmm0 + leaq K512+128(%rip),%rbp + vmovdqu 16(%rsi),%xmm1 + vmovdqu 32(%rsi),%xmm2 + vpshufb %xmm11,%xmm0,%xmm0 + vmovdqu 48(%rsi),%xmm3 + vpshufb %xmm11,%xmm1,%xmm1 + vmovdqu 64(%rsi),%xmm4 + vpshufb %xmm11,%xmm2,%xmm2 + vmovdqu 80(%rsi),%xmm5 + vpshufb %xmm11,%xmm3,%xmm3 + vmovdqu 96(%rsi),%xmm6 + vpshufb %xmm11,%xmm4,%xmm4 + vmovdqu 112(%rsi),%xmm7 + vpshufb %xmm11,%xmm5,%xmm5 + vpaddq -128(%rbp),%xmm0,%xmm8 + vpshufb %xmm11,%xmm6,%xmm6 + vpaddq -96(%rbp),%xmm1,%xmm9 + vpshufb %xmm11,%xmm7,%xmm7 + vpaddq -64(%rbp),%xmm2,%xmm10 + vpaddq -32(%rbp),%xmm3,%xmm11 + vmovdqa %xmm8,0(%rsp) + vpaddq 0(%rbp),%xmm4,%xmm8 + vmovdqa %xmm9,16(%rsp) + vpaddq 32(%rbp),%xmm5,%xmm9 + vmovdqa %xmm10,32(%rsp) + vpaddq 64(%rbp),%xmm6,%xmm10 + vmovdqa %xmm11,48(%rsp) + vpaddq 96(%rbp),%xmm7,%xmm11 + vmovdqa %xmm8,64(%rsp) + movq %rax,%r14 + vmovdqa %xmm9,80(%rsp) + movq %rbx,%rdi + vmovdqa %xmm10,96(%rsp) + xorq %rcx,%rdi + vmovdqa %xmm11,112(%rsp) + movq %r8,%r13 + jmp .Lavx_00_47 + +.align 16 +.Lavx_00_47: + addq $256,%rbp + vpalignr $8,%xmm0,%xmm1,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%rax + vpalignr $8,%xmm4,%xmm5,%xmm11 + movq %r9,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %r8,%r13 + xorq %r10,%r12 + vpaddq %xmm11,%xmm0,%xmm0 + shrdq $4,%r13,%r13 + xorq %rax,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %r8,%r12 + xorq %r8,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 0(%rsp),%r11 + movq %rax,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %r10,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %rbx,%r15 + addq %r12,%r11 + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %rax,%r14 + addq %r13,%r11 + vpxor %xmm10,%xmm8,%xmm8 + xorq %rbx,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm7,%xmm11 + addq %r11,%rdx + addq %rdi,%r11 + vpxor %xmm9,%xmm8,%xmm8 + movq %rdx,%r13 + addq %r11,%r14 + vpsllq $3,%xmm7,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%r11 + vpaddq %xmm8,%xmm0,%xmm0 + movq %r8,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm7,%xmm9 + xorq %rdx,%r13 + xorq %r9,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %r11,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %rdx,%r12 + xorq %rdx,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 8(%rsp),%r10 + movq %r11,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %r9,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %rax,%rdi + addq %r12,%r10 + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm0,%xmm0 + xorq %r11,%r14 + addq %r13,%r10 + vpaddq -128(%rbp),%xmm0,%xmm10 + xorq %rax,%r15 + shrdq $28,%r14,%r14 + addq %r10,%rcx + addq %r15,%r10 + movq %rcx,%r13 + addq %r10,%r14 + vmovdqa %xmm10,0(%rsp) + vpalignr $8,%xmm1,%xmm2,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%r10 + vpalignr $8,%xmm5,%xmm6,%xmm11 + movq %rdx,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %rcx,%r13 + xorq %r8,%r12 + vpaddq 
%xmm11,%xmm1,%xmm1 + shrdq $4,%r13,%r13 + xorq %r10,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %rcx,%r12 + xorq %rcx,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 16(%rsp),%r9 + movq %r10,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %r8,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %r11,%r15 + addq %r12,%r9 + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %r10,%r14 + addq %r13,%r9 + vpxor %xmm10,%xmm8,%xmm8 + xorq %r11,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm0,%xmm11 + addq %r9,%rbx + addq %rdi,%r9 + vpxor %xmm9,%xmm8,%xmm8 + movq %rbx,%r13 + addq %r9,%r14 + vpsllq $3,%xmm0,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%r9 + vpaddq %xmm8,%xmm1,%xmm1 + movq %rcx,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm0,%xmm9 + xorq %rbx,%r13 + xorq %rdx,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %r9,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %rbx,%r12 + xorq %rbx,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 24(%rsp),%r8 + movq %r9,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %rdx,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %r10,%rdi + addq %r12,%r8 + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm1,%xmm1 + xorq %r9,%r14 + addq %r13,%r8 + vpaddq -96(%rbp),%xmm1,%xmm10 + xorq %r10,%r15 + shrdq $28,%r14,%r14 + addq %r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq %r8,%r14 + vmovdqa %xmm10,16(%rsp) + vpalignr $8,%xmm2,%xmm3,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%r8 + vpalignr $8,%xmm6,%xmm7,%xmm11 + movq %rbx,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %rax,%r13 + xorq %rcx,%r12 + vpaddq %xmm11,%xmm2,%xmm2 + shrdq $4,%r13,%r13 + xorq %r8,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %rax,%r12 + xorq %rax,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 32(%rsp),%rdx + movq %r8,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %rcx,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %r9,%r15 + addq %r12,%rdx + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %r8,%r14 + addq %r13,%rdx + vpxor %xmm10,%xmm8,%xmm8 + xorq %r9,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm1,%xmm11 + addq %rdx,%r11 + addq %rdi,%rdx + vpxor %xmm9,%xmm8,%xmm8 + movq %r11,%r13 + addq %rdx,%r14 + vpsllq $3,%xmm1,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%rdx + vpaddq %xmm8,%xmm2,%xmm2 + movq %rax,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm1,%xmm9 + xorq %r11,%r13 + xorq %rbx,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %rdx,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %r11,%r12 + xorq %r11,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 40(%rsp),%rcx + movq %rdx,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %rbx,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %r8,%rdi + addq %r12,%rcx + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm2,%xmm2 + xorq %rdx,%r14 + addq %r13,%rcx + vpaddq -64(%rbp),%xmm2,%xmm10 + xorq %r8,%r15 + shrdq $28,%r14,%r14 + addq %rcx,%r10 + addq %r15,%rcx + movq %r10,%r13 + addq %rcx,%r14 + vmovdqa %xmm10,32(%rsp) + vpalignr $8,%xmm3,%xmm4,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%rcx + vpalignr $8,%xmm7,%xmm0,%xmm11 + movq %r11,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %r10,%r13 + xorq %rax,%r12 + vpaddq %xmm11,%xmm3,%xmm3 + shrdq $4,%r13,%r13 + xorq %rcx,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %r10,%r12 + xorq %r10,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 48(%rsp),%rbx + movq %rcx,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %rax,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %rdx,%r15 + addq %r12,%rbx + vpxor 
%xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %rcx,%r14 + addq %r13,%rbx + vpxor %xmm10,%xmm8,%xmm8 + xorq %rdx,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm2,%xmm11 + addq %rbx,%r9 + addq %rdi,%rbx + vpxor %xmm9,%xmm8,%xmm8 + movq %r9,%r13 + addq %rbx,%r14 + vpsllq $3,%xmm2,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%rbx + vpaddq %xmm8,%xmm3,%xmm3 + movq %r10,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm2,%xmm9 + xorq %r9,%r13 + xorq %r11,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %rbx,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %r9,%r12 + xorq %r9,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 56(%rsp),%rax + movq %rbx,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %r11,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %rcx,%rdi + addq %r12,%rax + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm3,%xmm3 + xorq %rbx,%r14 + addq %r13,%rax + vpaddq -32(%rbp),%xmm3,%xmm10 + xorq %rcx,%r15 + shrdq $28,%r14,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + vmovdqa %xmm10,48(%rsp) + vpalignr $8,%xmm4,%xmm5,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%rax + vpalignr $8,%xmm0,%xmm1,%xmm11 + movq %r9,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %r8,%r13 + xorq %r10,%r12 + vpaddq %xmm11,%xmm4,%xmm4 + shrdq $4,%r13,%r13 + xorq %rax,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %r8,%r12 + xorq %r8,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 64(%rsp),%r11 + movq %rax,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %r10,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %rbx,%r15 + addq %r12,%r11 + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %rax,%r14 + addq %r13,%r11 + vpxor %xmm10,%xmm8,%xmm8 + xorq %rbx,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm3,%xmm11 + addq %r11,%rdx + addq %rdi,%r11 + vpxor %xmm9,%xmm8,%xmm8 + movq %rdx,%r13 + addq %r11,%r14 + vpsllq $3,%xmm3,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%r11 + vpaddq %xmm8,%xmm4,%xmm4 + movq %r8,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm3,%xmm9 + xorq %rdx,%r13 + xorq %r9,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %r11,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %rdx,%r12 + xorq %rdx,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 72(%rsp),%r10 + movq %r11,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %r9,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %rax,%rdi + addq %r12,%r10 + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm4,%xmm4 + xorq %r11,%r14 + addq %r13,%r10 + vpaddq 0(%rbp),%xmm4,%xmm10 + xorq %rax,%r15 + shrdq $28,%r14,%r14 + addq %r10,%rcx + addq %r15,%r10 + movq %rcx,%r13 + addq %r10,%r14 + vmovdqa %xmm10,64(%rsp) + vpalignr $8,%xmm5,%xmm6,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%r10 + vpalignr $8,%xmm1,%xmm2,%xmm11 + movq %rdx,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %rcx,%r13 + xorq %r8,%r12 + vpaddq %xmm11,%xmm5,%xmm5 + shrdq $4,%r13,%r13 + xorq %r10,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %rcx,%r12 + xorq %rcx,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 80(%rsp),%r9 + movq %r10,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %r8,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %r11,%r15 + addq %r12,%r9 + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %r10,%r14 + addq %r13,%r9 + vpxor %xmm10,%xmm8,%xmm8 + xorq %r11,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm4,%xmm11 + addq %r9,%rbx + addq %rdi,%r9 + vpxor %xmm9,%xmm8,%xmm8 + movq %rbx,%r13 + addq %r9,%r14 + vpsllq $3,%xmm4,%xmm10 + 
shrdq $23,%r13,%r13 + movq %r14,%r9 + vpaddq %xmm8,%xmm5,%xmm5 + movq %rcx,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm4,%xmm9 + xorq %rbx,%r13 + xorq %rdx,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %r9,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %rbx,%r12 + xorq %rbx,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 88(%rsp),%r8 + movq %r9,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %rdx,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %r10,%rdi + addq %r12,%r8 + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm5,%xmm5 + xorq %r9,%r14 + addq %r13,%r8 + vpaddq 32(%rbp),%xmm5,%xmm10 + xorq %r10,%r15 + shrdq $28,%r14,%r14 + addq %r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq %r8,%r14 + vmovdqa %xmm10,80(%rsp) + vpalignr $8,%xmm6,%xmm7,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%r8 + vpalignr $8,%xmm2,%xmm3,%xmm11 + movq %rbx,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %rax,%r13 + xorq %rcx,%r12 + vpaddq %xmm11,%xmm6,%xmm6 + shrdq $4,%r13,%r13 + xorq %r8,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %rax,%r12 + xorq %rax,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 96(%rsp),%rdx + movq %r8,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %rcx,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %r9,%r15 + addq %r12,%rdx + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %r8,%r14 + addq %r13,%rdx + vpxor %xmm10,%xmm8,%xmm8 + xorq %r9,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm5,%xmm11 + addq %rdx,%r11 + addq %rdi,%rdx + vpxor %xmm9,%xmm8,%xmm8 + movq %r11,%r13 + addq %rdx,%r14 + vpsllq $3,%xmm5,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%rdx + vpaddq %xmm8,%xmm6,%xmm6 + movq %rax,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm5,%xmm9 + xorq %r11,%r13 + xorq %rbx,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %rdx,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %r11,%r12 + xorq %r11,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 104(%rsp),%rcx + movq %rdx,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %rbx,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %r8,%rdi + addq %r12,%rcx + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm6,%xmm6 + xorq %rdx,%r14 + addq %r13,%rcx + vpaddq 64(%rbp),%xmm6,%xmm10 + xorq %r8,%r15 + shrdq $28,%r14,%r14 + addq %rcx,%r10 + addq %r15,%rcx + movq %r10,%r13 + addq %rcx,%r14 + vmovdqa %xmm10,96(%rsp) + vpalignr $8,%xmm7,%xmm0,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%rcx + vpalignr $8,%xmm3,%xmm4,%xmm11 + movq %r11,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %r10,%r13 + xorq %rax,%r12 + vpaddq %xmm11,%xmm7,%xmm7 + shrdq $4,%r13,%r13 + xorq %rcx,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %r10,%r12 + xorq %r10,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 112(%rsp),%rbx + movq %rcx,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %rax,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %rdx,%r15 + addq %r12,%rbx + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %rcx,%r14 + addq %r13,%rbx + vpxor %xmm10,%xmm8,%xmm8 + xorq %rdx,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm6,%xmm11 + addq %rbx,%r9 + addq %rdi,%rbx + vpxor %xmm9,%xmm8,%xmm8 + movq %r9,%r13 + addq %rbx,%r14 + vpsllq $3,%xmm6,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%rbx + vpaddq %xmm8,%xmm7,%xmm7 + movq %r10,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm6,%xmm9 + xorq %r9,%r13 + xorq %r11,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %rbx,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %r9,%r12 + xorq %r9,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 
120(%rsp),%rax + movq %rbx,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %r11,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %rcx,%rdi + addq %r12,%rax + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm7,%xmm7 + xorq %rbx,%r14 + addq %r13,%rax + vpaddq 96(%rbp),%xmm7,%xmm10 + xorq %rcx,%r15 + shrdq $28,%r14,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + vmovdqa %xmm10,112(%rsp) + cmpb $0,135(%rbp) + jne .Lavx_00_47 + shrdq $23,%r13,%r13 + movq %r14,%rax + movq %r9,%r12 + shrdq $5,%r14,%r14 + xorq %r8,%r13 + xorq %r10,%r12 + shrdq $4,%r13,%r13 + xorq %rax,%r14 + andq %r8,%r12 + xorq %r8,%r13 + addq 0(%rsp),%r11 + movq %rax,%r15 + xorq %r10,%r12 + shrdq $6,%r14,%r14 + xorq %rbx,%r15 + addq %r12,%r11 + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %rax,%r14 + addq %r13,%r11 + xorq %rbx,%rdi + shrdq $28,%r14,%r14 + addq %r11,%rdx + addq %rdi,%r11 + movq %rdx,%r13 + addq %r11,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r11 + movq %r8,%r12 + shrdq $5,%r14,%r14 + xorq %rdx,%r13 + xorq %r9,%r12 + shrdq $4,%r13,%r13 + xorq %r11,%r14 + andq %rdx,%r12 + xorq %rdx,%r13 + addq 8(%rsp),%r10 + movq %r11,%rdi + xorq %r9,%r12 + shrdq $6,%r14,%r14 + xorq %rax,%rdi + addq %r12,%r10 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %r11,%r14 + addq %r13,%r10 + xorq %rax,%r15 + shrdq $28,%r14,%r14 + addq %r10,%rcx + addq %r15,%r10 + movq %rcx,%r13 + addq %r10,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r10 + movq %rdx,%r12 + shrdq $5,%r14,%r14 + xorq %rcx,%r13 + xorq %r8,%r12 + shrdq $4,%r13,%r13 + xorq %r10,%r14 + andq %rcx,%r12 + xorq %rcx,%r13 + addq 16(%rsp),%r9 + movq %r10,%r15 + xorq %r8,%r12 + shrdq $6,%r14,%r14 + xorq %r11,%r15 + addq %r12,%r9 + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %r10,%r14 + addq %r13,%r9 + xorq %r11,%rdi + shrdq $28,%r14,%r14 + addq %r9,%rbx + addq %rdi,%r9 + movq %rbx,%r13 + addq %r9,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r9 + movq %rcx,%r12 + shrdq $5,%r14,%r14 + xorq %rbx,%r13 + xorq %rdx,%r12 + shrdq $4,%r13,%r13 + xorq %r9,%r14 + andq %rbx,%r12 + xorq %rbx,%r13 + addq 24(%rsp),%r8 + movq %r9,%rdi + xorq %rdx,%r12 + shrdq $6,%r14,%r14 + xorq %r10,%rdi + addq %r12,%r8 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %r9,%r14 + addq %r13,%r8 + xorq %r10,%r15 + shrdq $28,%r14,%r14 + addq %r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq %r8,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r8 + movq %rbx,%r12 + shrdq $5,%r14,%r14 + xorq %rax,%r13 + xorq %rcx,%r12 + shrdq $4,%r13,%r13 + xorq %r8,%r14 + andq %rax,%r12 + xorq %rax,%r13 + addq 32(%rsp),%rdx + movq %r8,%r15 + xorq %rcx,%r12 + shrdq $6,%r14,%r14 + xorq %r9,%r15 + addq %r12,%rdx + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %r8,%r14 + addq %r13,%rdx + xorq %r9,%rdi + shrdq $28,%r14,%r14 + addq %rdx,%r11 + addq %rdi,%rdx + movq %r11,%r13 + addq %rdx,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rdx + movq %rax,%r12 + shrdq $5,%r14,%r14 + xorq %r11,%r13 + xorq %rbx,%r12 + shrdq $4,%r13,%r13 + xorq %rdx,%r14 + andq %r11,%r12 + xorq %r11,%r13 + addq 40(%rsp),%rcx + movq %rdx,%rdi + xorq %rbx,%r12 + shrdq $6,%r14,%r14 + xorq %r8,%rdi + addq %r12,%rcx + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %rdx,%r14 + addq %r13,%rcx + xorq %r8,%r15 + shrdq $28,%r14,%r14 + addq %rcx,%r10 + addq %r15,%rcx + movq %r10,%r13 + addq %rcx,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rcx + movq %r11,%r12 + shrdq $5,%r14,%r14 + xorq %r10,%r13 + xorq %rax,%r12 + shrdq $4,%r13,%r13 + xorq %rcx,%r14 + andq %r10,%r12 + xorq %r10,%r13 + addq 48(%rsp),%rbx + movq %rcx,%r15 + xorq %rax,%r12 + shrdq $6,%r14,%r14 + 
xorq %rdx,%r15 + addq %r12,%rbx + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %rcx,%r14 + addq %r13,%rbx + xorq %rdx,%rdi + shrdq $28,%r14,%r14 + addq %rbx,%r9 + addq %rdi,%rbx + movq %r9,%r13 + addq %rbx,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rbx + movq %r10,%r12 + shrdq $5,%r14,%r14 + xorq %r9,%r13 + xorq %r11,%r12 + shrdq $4,%r13,%r13 + xorq %rbx,%r14 + andq %r9,%r12 + xorq %r9,%r13 + addq 56(%rsp),%rax + movq %rbx,%rdi + xorq %r11,%r12 + shrdq $6,%r14,%r14 + xorq %rcx,%rdi + addq %r12,%rax + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %rbx,%r14 + addq %r13,%rax + xorq %rcx,%r15 + shrdq $28,%r14,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rax + movq %r9,%r12 + shrdq $5,%r14,%r14 + xorq %r8,%r13 + xorq %r10,%r12 + shrdq $4,%r13,%r13 + xorq %rax,%r14 + andq %r8,%r12 + xorq %r8,%r13 + addq 64(%rsp),%r11 + movq %rax,%r15 + xorq %r10,%r12 + shrdq $6,%r14,%r14 + xorq %rbx,%r15 + addq %r12,%r11 + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %rax,%r14 + addq %r13,%r11 + xorq %rbx,%rdi + shrdq $28,%r14,%r14 + addq %r11,%rdx + addq %rdi,%r11 + movq %rdx,%r13 + addq %r11,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r11 + movq %r8,%r12 + shrdq $5,%r14,%r14 + xorq %rdx,%r13 + xorq %r9,%r12 + shrdq $4,%r13,%r13 + xorq %r11,%r14 + andq %rdx,%r12 + xorq %rdx,%r13 + addq 72(%rsp),%r10 + movq %r11,%rdi + xorq %r9,%r12 + shrdq $6,%r14,%r14 + xorq %rax,%rdi + addq %r12,%r10 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %r11,%r14 + addq %r13,%r10 + xorq %rax,%r15 + shrdq $28,%r14,%r14 + addq %r10,%rcx + addq %r15,%r10 + movq %rcx,%r13 + addq %r10,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r10 + movq %rdx,%r12 + shrdq $5,%r14,%r14 + xorq %rcx,%r13 + xorq %r8,%r12 + shrdq $4,%r13,%r13 + xorq %r10,%r14 + andq %rcx,%r12 + xorq %rcx,%r13 + addq 80(%rsp),%r9 + movq %r10,%r15 + xorq %r8,%r12 + shrdq $6,%r14,%r14 + xorq %r11,%r15 + addq %r12,%r9 + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %r10,%r14 + addq %r13,%r9 + xorq %r11,%rdi + shrdq $28,%r14,%r14 + addq %r9,%rbx + addq %rdi,%r9 + movq %rbx,%r13 + addq %r9,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r9 + movq %rcx,%r12 + shrdq $5,%r14,%r14 + xorq %rbx,%r13 + xorq %rdx,%r12 + shrdq $4,%r13,%r13 + xorq %r9,%r14 + andq %rbx,%r12 + xorq %rbx,%r13 + addq 88(%rsp),%r8 + movq %r9,%rdi + xorq %rdx,%r12 + shrdq $6,%r14,%r14 + xorq %r10,%rdi + addq %r12,%r8 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %r9,%r14 + addq %r13,%r8 + xorq %r10,%r15 + shrdq $28,%r14,%r14 + addq %r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq %r8,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r8 + movq %rbx,%r12 + shrdq $5,%r14,%r14 + xorq %rax,%r13 + xorq %rcx,%r12 + shrdq $4,%r13,%r13 + xorq %r8,%r14 + andq %rax,%r12 + xorq %rax,%r13 + addq 96(%rsp),%rdx + movq %r8,%r15 + xorq %rcx,%r12 + shrdq $6,%r14,%r14 + xorq %r9,%r15 + addq %r12,%rdx + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %r8,%r14 + addq %r13,%rdx + xorq %r9,%rdi + shrdq $28,%r14,%r14 + addq %rdx,%r11 + addq %rdi,%rdx + movq %r11,%r13 + addq %rdx,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rdx + movq %rax,%r12 + shrdq $5,%r14,%r14 + xorq %r11,%r13 + xorq %rbx,%r12 + shrdq $4,%r13,%r13 + xorq %rdx,%r14 + andq %r11,%r12 + xorq %r11,%r13 + addq 104(%rsp),%rcx + movq %rdx,%rdi + xorq %rbx,%r12 + shrdq $6,%r14,%r14 + xorq %r8,%rdi + addq %r12,%rcx + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %rdx,%r14 + addq %r13,%rcx + xorq %r8,%r15 + shrdq $28,%r14,%r14 + addq %rcx,%r10 + addq %r15,%rcx + movq %r10,%r13 + addq %rcx,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rcx + movq %r11,%r12 + shrdq 
$5,%r14,%r14 + xorq %r10,%r13 + xorq %rax,%r12 + shrdq $4,%r13,%r13 + xorq %rcx,%r14 + andq %r10,%r12 + xorq %r10,%r13 + addq 112(%rsp),%rbx + movq %rcx,%r15 + xorq %rax,%r12 + shrdq $6,%r14,%r14 + xorq %rdx,%r15 + addq %r12,%rbx + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %rcx,%r14 + addq %r13,%rbx + xorq %rdx,%rdi + shrdq $28,%r14,%r14 + addq %rbx,%r9 + addq %rdi,%rbx + movq %r9,%r13 + addq %rbx,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rbx + movq %r10,%r12 + shrdq $5,%r14,%r14 + xorq %r9,%r13 + xorq %r11,%r12 + shrdq $4,%r13,%r13 + xorq %rbx,%r14 + andq %r9,%r12 + xorq %r9,%r13 + addq 120(%rsp),%rax + movq %rbx,%rdi + xorq %r11,%r12 + shrdq $6,%r14,%r14 + xorq %rcx,%rdi + addq %r12,%rax + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %rbx,%r14 + addq %r13,%rax + xorq %rcx,%r15 + shrdq $28,%r14,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + movq 128+0(%rsp),%rdi + movq %r14,%rax + + addq 0(%rdi),%rax + leaq 128(%rsi),%rsi + addq 8(%rdi),%rbx + addq 16(%rdi),%rcx + addq 24(%rdi),%rdx + addq 32(%rdi),%r8 + addq 40(%rdi),%r9 + addq 48(%rdi),%r10 + addq 56(%rdi),%r11 + + cmpq 128+16(%rsp),%rsi + + movq %rax,0(%rdi) + movq %rbx,8(%rdi) + movq %rcx,16(%rdi) + movq %rdx,24(%rdi) + movq %r8,32(%rdi) + movq %r9,40(%rdi) + movq %r10,48(%rdi) + movq %r11,56(%rdi) + jb .Lloop_avx + + movq 152(%rsp),%rsi +.cfi_def_cfa %rsi,8 + vzeroupper + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lepilogue_avx: + ret +.cfi_endproc +.size sha512_block_data_order_avx,.-sha512_block_data_order_avx +#endif diff --git a/ring-0.17.14/pregenerated/sha512-x86_64-macosx.S b/ring-0.17.14/pregenerated/sha512-x86_64-macosx.S new file mode 100644 index 0000000000..f882ce0193 --- /dev/null +++ b/ring-0.17.14/pregenerated/sha512-x86_64-macosx.S @@ -0,0 +1,2978 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__) +.text + +.globl _sha512_block_data_order_nohw +.private_extern _sha512_block_data_order_nohw + +.p2align 4 +_sha512_block_data_order_nohw: + +_CET_ENDBR + movq %rsp,%rax + + pushq %rbx + + pushq %rbp + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + shlq $4,%rdx + subq $128+32,%rsp + leaq (%rsi,%rdx,8),%rdx + andq $-64,%rsp + movq %rdi,128+0(%rsp) + movq %rsi,128+8(%rsp) + movq %rdx,128+16(%rsp) + movq %rax,152(%rsp) + +L$prologue: + + movq 0(%rdi),%rax + movq 8(%rdi),%rbx + movq 16(%rdi),%rcx + movq 24(%rdi),%rdx + movq 32(%rdi),%r8 + movq 40(%rdi),%r9 + movq 48(%rdi),%r10 + movq 56(%rdi),%r11 + jmp L$loop + +.p2align 4 +L$loop: + movq %rbx,%rdi + leaq K512(%rip),%rbp + xorq %rcx,%rdi + movq 0(%rsi),%r12 + movq %r8,%r13 + movq %rax,%r14 + bswapq %r12 + rorq $23,%r13 + movq %r9,%r15 + + xorq %r8,%r13 + rorq $5,%r14 + xorq %r10,%r15 + + movq %r12,0(%rsp) + xorq %rax,%r14 + andq %r8,%r15 + + rorq $4,%r13 + addq %r11,%r12 + xorq %r10,%r15 + + rorq $6,%r14 + xorq %r8,%r13 + addq %r15,%r12 + + movq %rax,%r15 + addq (%rbp),%r12 + xorq %rax,%r14 + + xorq %rbx,%r15 + rorq $14,%r13 + movq %rbx,%r11 + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%r11 + addq %r12,%rdx + addq %r12,%r11 + + leaq 8(%rbp),%rbp + addq %r14,%r11 + movq 8(%rsi),%r12 + movq %rdx,%r13 + movq %r11,%r14 + bswapq %r12 + rorq $23,%r13 + movq %r8,%rdi + + xorq %rdx,%r13 + rorq $5,%r14 + xorq %r9,%rdi + + movq %r12,8(%rsp) + xorq %r11,%r14 + andq %rdx,%rdi + + rorq $4,%r13 + addq %r10,%r12 + xorq %r9,%rdi + + rorq $6,%r14 + xorq %rdx,%r13 + addq %rdi,%r12 + + movq %r11,%rdi + addq (%rbp),%r12 + xorq %r11,%r14 + + xorq %rax,%rdi + rorq $14,%r13 + movq %rax,%r10 + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%r10 + addq %r12,%rcx + addq %r12,%r10 + + leaq 24(%rbp),%rbp + addq %r14,%r10 + movq 16(%rsi),%r12 + movq %rcx,%r13 + movq %r10,%r14 + bswapq %r12 + rorq $23,%r13 + movq %rdx,%r15 + + xorq %rcx,%r13 + rorq $5,%r14 + xorq %r8,%r15 + + movq %r12,16(%rsp) + xorq %r10,%r14 + andq %rcx,%r15 + + rorq $4,%r13 + addq %r9,%r12 + xorq %r8,%r15 + + rorq $6,%r14 + xorq %rcx,%r13 + addq %r15,%r12 + + movq %r10,%r15 + addq (%rbp),%r12 + xorq %r10,%r14 + + xorq %r11,%r15 + rorq $14,%r13 + movq %r11,%r9 + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%r9 + addq %r12,%rbx + addq %r12,%r9 + + leaq 8(%rbp),%rbp + addq %r14,%r9 + movq 24(%rsi),%r12 + movq %rbx,%r13 + movq %r9,%r14 + bswapq %r12 + rorq $23,%r13 + movq %rcx,%rdi + + xorq %rbx,%r13 + rorq $5,%r14 + xorq %rdx,%rdi + + movq %r12,24(%rsp) + xorq %r9,%r14 + andq %rbx,%rdi + + rorq $4,%r13 + addq %r8,%r12 + xorq %rdx,%rdi + + rorq $6,%r14 + xorq %rbx,%r13 + addq %rdi,%r12 + + movq %r9,%rdi + addq (%rbp),%r12 + xorq %r9,%r14 + + xorq %r10,%rdi + rorq $14,%r13 + movq %r10,%r8 + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%r8 + addq %r12,%rax + addq %r12,%r8 + + leaq 24(%rbp),%rbp + addq %r14,%r8 + movq 32(%rsi),%r12 + movq %rax,%r13 + movq %r8,%r14 + bswapq %r12 + rorq $23,%r13 + movq %rbx,%r15 + + xorq %rax,%r13 + rorq $5,%r14 + xorq %rcx,%r15 + + movq %r12,32(%rsp) + xorq %r8,%r14 + andq %rax,%r15 + + rorq $4,%r13 + addq %rdx,%r12 + xorq %rcx,%r15 + + rorq $6,%r14 + xorq %rax,%r13 + addq %r15,%r12 + + movq %r8,%r15 + addq (%rbp),%r12 + xorq %r8,%r14 + + xorq %r9,%r15 + rorq $14,%r13 + movq %r9,%rdx + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%rdx + addq %r12,%r11 + addq %r12,%rdx + + 
leaq 8(%rbp),%rbp + addq %r14,%rdx + movq 40(%rsi),%r12 + movq %r11,%r13 + movq %rdx,%r14 + bswapq %r12 + rorq $23,%r13 + movq %rax,%rdi + + xorq %r11,%r13 + rorq $5,%r14 + xorq %rbx,%rdi + + movq %r12,40(%rsp) + xorq %rdx,%r14 + andq %r11,%rdi + + rorq $4,%r13 + addq %rcx,%r12 + xorq %rbx,%rdi + + rorq $6,%r14 + xorq %r11,%r13 + addq %rdi,%r12 + + movq %rdx,%rdi + addq (%rbp),%r12 + xorq %rdx,%r14 + + xorq %r8,%rdi + rorq $14,%r13 + movq %r8,%rcx + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%rcx + addq %r12,%r10 + addq %r12,%rcx + + leaq 24(%rbp),%rbp + addq %r14,%rcx + movq 48(%rsi),%r12 + movq %r10,%r13 + movq %rcx,%r14 + bswapq %r12 + rorq $23,%r13 + movq %r11,%r15 + + xorq %r10,%r13 + rorq $5,%r14 + xorq %rax,%r15 + + movq %r12,48(%rsp) + xorq %rcx,%r14 + andq %r10,%r15 + + rorq $4,%r13 + addq %rbx,%r12 + xorq %rax,%r15 + + rorq $6,%r14 + xorq %r10,%r13 + addq %r15,%r12 + + movq %rcx,%r15 + addq (%rbp),%r12 + xorq %rcx,%r14 + + xorq %rdx,%r15 + rorq $14,%r13 + movq %rdx,%rbx + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%rbx + addq %r12,%r9 + addq %r12,%rbx + + leaq 8(%rbp),%rbp + addq %r14,%rbx + movq 56(%rsi),%r12 + movq %r9,%r13 + movq %rbx,%r14 + bswapq %r12 + rorq $23,%r13 + movq %r10,%rdi + + xorq %r9,%r13 + rorq $5,%r14 + xorq %r11,%rdi + + movq %r12,56(%rsp) + xorq %rbx,%r14 + andq %r9,%rdi + + rorq $4,%r13 + addq %rax,%r12 + xorq %r11,%rdi + + rorq $6,%r14 + xorq %r9,%r13 + addq %rdi,%r12 + + movq %rbx,%rdi + addq (%rbp),%r12 + xorq %rbx,%r14 + + xorq %rcx,%rdi + rorq $14,%r13 + movq %rcx,%rax + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%rax + addq %r12,%r8 + addq %r12,%rax + + leaq 24(%rbp),%rbp + addq %r14,%rax + movq 64(%rsi),%r12 + movq %r8,%r13 + movq %rax,%r14 + bswapq %r12 + rorq $23,%r13 + movq %r9,%r15 + + xorq %r8,%r13 + rorq $5,%r14 + xorq %r10,%r15 + + movq %r12,64(%rsp) + xorq %rax,%r14 + andq %r8,%r15 + + rorq $4,%r13 + addq %r11,%r12 + xorq %r10,%r15 + + rorq $6,%r14 + xorq %r8,%r13 + addq %r15,%r12 + + movq %rax,%r15 + addq (%rbp),%r12 + xorq %rax,%r14 + + xorq %rbx,%r15 + rorq $14,%r13 + movq %rbx,%r11 + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%r11 + addq %r12,%rdx + addq %r12,%r11 + + leaq 8(%rbp),%rbp + addq %r14,%r11 + movq 72(%rsi),%r12 + movq %rdx,%r13 + movq %r11,%r14 + bswapq %r12 + rorq $23,%r13 + movq %r8,%rdi + + xorq %rdx,%r13 + rorq $5,%r14 + xorq %r9,%rdi + + movq %r12,72(%rsp) + xorq %r11,%r14 + andq %rdx,%rdi + + rorq $4,%r13 + addq %r10,%r12 + xorq %r9,%rdi + + rorq $6,%r14 + xorq %rdx,%r13 + addq %rdi,%r12 + + movq %r11,%rdi + addq (%rbp),%r12 + xorq %r11,%r14 + + xorq %rax,%rdi + rorq $14,%r13 + movq %rax,%r10 + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%r10 + addq %r12,%rcx + addq %r12,%r10 + + leaq 24(%rbp),%rbp + addq %r14,%r10 + movq 80(%rsi),%r12 + movq %rcx,%r13 + movq %r10,%r14 + bswapq %r12 + rorq $23,%r13 + movq %rdx,%r15 + + xorq %rcx,%r13 + rorq $5,%r14 + xorq %r8,%r15 + + movq %r12,80(%rsp) + xorq %r10,%r14 + andq %rcx,%r15 + + rorq $4,%r13 + addq %r9,%r12 + xorq %r8,%r15 + + rorq $6,%r14 + xorq %rcx,%r13 + addq %r15,%r12 + + movq %r10,%r15 + addq (%rbp),%r12 + xorq %r10,%r14 + + xorq %r11,%r15 + rorq $14,%r13 + movq %r11,%r9 + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%r9 + addq %r12,%rbx + addq %r12,%r9 + + leaq 8(%rbp),%rbp + addq %r14,%r9 + movq 88(%rsi),%r12 + movq %rbx,%r13 + movq %r9,%r14 + bswapq %r12 + rorq $23,%r13 + movq %rcx,%rdi + + xorq %rbx,%r13 + rorq $5,%r14 + xorq %rdx,%rdi + + movq 
%r12,88(%rsp) + xorq %r9,%r14 + andq %rbx,%rdi + + rorq $4,%r13 + addq %r8,%r12 + xorq %rdx,%rdi + + rorq $6,%r14 + xorq %rbx,%r13 + addq %rdi,%r12 + + movq %r9,%rdi + addq (%rbp),%r12 + xorq %r9,%r14 + + xorq %r10,%rdi + rorq $14,%r13 + movq %r10,%r8 + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%r8 + addq %r12,%rax + addq %r12,%r8 + + leaq 24(%rbp),%rbp + addq %r14,%r8 + movq 96(%rsi),%r12 + movq %rax,%r13 + movq %r8,%r14 + bswapq %r12 + rorq $23,%r13 + movq %rbx,%r15 + + xorq %rax,%r13 + rorq $5,%r14 + xorq %rcx,%r15 + + movq %r12,96(%rsp) + xorq %r8,%r14 + andq %rax,%r15 + + rorq $4,%r13 + addq %rdx,%r12 + xorq %rcx,%r15 + + rorq $6,%r14 + xorq %rax,%r13 + addq %r15,%r12 + + movq %r8,%r15 + addq (%rbp),%r12 + xorq %r8,%r14 + + xorq %r9,%r15 + rorq $14,%r13 + movq %r9,%rdx + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%rdx + addq %r12,%r11 + addq %r12,%rdx + + leaq 8(%rbp),%rbp + addq %r14,%rdx + movq 104(%rsi),%r12 + movq %r11,%r13 + movq %rdx,%r14 + bswapq %r12 + rorq $23,%r13 + movq %rax,%rdi + + xorq %r11,%r13 + rorq $5,%r14 + xorq %rbx,%rdi + + movq %r12,104(%rsp) + xorq %rdx,%r14 + andq %r11,%rdi + + rorq $4,%r13 + addq %rcx,%r12 + xorq %rbx,%rdi + + rorq $6,%r14 + xorq %r11,%r13 + addq %rdi,%r12 + + movq %rdx,%rdi + addq (%rbp),%r12 + xorq %rdx,%r14 + + xorq %r8,%rdi + rorq $14,%r13 + movq %r8,%rcx + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%rcx + addq %r12,%r10 + addq %r12,%rcx + + leaq 24(%rbp),%rbp + addq %r14,%rcx + movq 112(%rsi),%r12 + movq %r10,%r13 + movq %rcx,%r14 + bswapq %r12 + rorq $23,%r13 + movq %r11,%r15 + + xorq %r10,%r13 + rorq $5,%r14 + xorq %rax,%r15 + + movq %r12,112(%rsp) + xorq %rcx,%r14 + andq %r10,%r15 + + rorq $4,%r13 + addq %rbx,%r12 + xorq %rax,%r15 + + rorq $6,%r14 + xorq %r10,%r13 + addq %r15,%r12 + + movq %rcx,%r15 + addq (%rbp),%r12 + xorq %rcx,%r14 + + xorq %rdx,%r15 + rorq $14,%r13 + movq %rdx,%rbx + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%rbx + addq %r12,%r9 + addq %r12,%rbx + + leaq 8(%rbp),%rbp + addq %r14,%rbx + movq 120(%rsi),%r12 + movq %r9,%r13 + movq %rbx,%r14 + bswapq %r12 + rorq $23,%r13 + movq %r10,%rdi + + xorq %r9,%r13 + rorq $5,%r14 + xorq %r11,%rdi + + movq %r12,120(%rsp) + xorq %rbx,%r14 + andq %r9,%rdi + + rorq $4,%r13 + addq %rax,%r12 + xorq %r11,%rdi + + rorq $6,%r14 + xorq %r9,%r13 + addq %rdi,%r12 + + movq %rbx,%rdi + addq (%rbp),%r12 + xorq %rbx,%r14 + + xorq %rcx,%rdi + rorq $14,%r13 + movq %rcx,%rax + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%rax + addq %r12,%r8 + addq %r12,%rax + + leaq 24(%rbp),%rbp + jmp L$rounds_16_xx +.p2align 4 +L$rounds_16_xx: + movq 8(%rsp),%r13 + movq 112(%rsp),%r15 + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%rax + movq %r15,%r14 + rorq $42,%r15 + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%r15 + shrq $6,%r14 + + rorq $19,%r15 + xorq %r13,%r12 + xorq %r14,%r15 + addq 72(%rsp),%r12 + + addq 0(%rsp),%r12 + movq %r8,%r13 + addq %r15,%r12 + movq %rax,%r14 + rorq $23,%r13 + movq %r9,%r15 + + xorq %r8,%r13 + rorq $5,%r14 + xorq %r10,%r15 + + movq %r12,0(%rsp) + xorq %rax,%r14 + andq %r8,%r15 + + rorq $4,%r13 + addq %r11,%r12 + xorq %r10,%r15 + + rorq $6,%r14 + xorq %r8,%r13 + addq %r15,%r12 + + movq %rax,%r15 + addq (%rbp),%r12 + xorq %rax,%r14 + + xorq %rbx,%r15 + rorq $14,%r13 + movq %rbx,%r11 + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%r11 + addq %r12,%rdx + addq %r12,%r11 + + leaq 8(%rbp),%rbp + movq 16(%rsp),%r13 + movq 120(%rsp),%rdi + + movq 
%r13,%r12 + rorq $7,%r13 + addq %r14,%r11 + movq %rdi,%r14 + rorq $42,%rdi + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%rdi + shrq $6,%r14 + + rorq $19,%rdi + xorq %r13,%r12 + xorq %r14,%rdi + addq 80(%rsp),%r12 + + addq 8(%rsp),%r12 + movq %rdx,%r13 + addq %rdi,%r12 + movq %r11,%r14 + rorq $23,%r13 + movq %r8,%rdi + + xorq %rdx,%r13 + rorq $5,%r14 + xorq %r9,%rdi + + movq %r12,8(%rsp) + xorq %r11,%r14 + andq %rdx,%rdi + + rorq $4,%r13 + addq %r10,%r12 + xorq %r9,%rdi + + rorq $6,%r14 + xorq %rdx,%r13 + addq %rdi,%r12 + + movq %r11,%rdi + addq (%rbp),%r12 + xorq %r11,%r14 + + xorq %rax,%rdi + rorq $14,%r13 + movq %rax,%r10 + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%r10 + addq %r12,%rcx + addq %r12,%r10 + + leaq 24(%rbp),%rbp + movq 24(%rsp),%r13 + movq 0(%rsp),%r15 + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%r10 + movq %r15,%r14 + rorq $42,%r15 + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%r15 + shrq $6,%r14 + + rorq $19,%r15 + xorq %r13,%r12 + xorq %r14,%r15 + addq 88(%rsp),%r12 + + addq 16(%rsp),%r12 + movq %rcx,%r13 + addq %r15,%r12 + movq %r10,%r14 + rorq $23,%r13 + movq %rdx,%r15 + + xorq %rcx,%r13 + rorq $5,%r14 + xorq %r8,%r15 + + movq %r12,16(%rsp) + xorq %r10,%r14 + andq %rcx,%r15 + + rorq $4,%r13 + addq %r9,%r12 + xorq %r8,%r15 + + rorq $6,%r14 + xorq %rcx,%r13 + addq %r15,%r12 + + movq %r10,%r15 + addq (%rbp),%r12 + xorq %r10,%r14 + + xorq %r11,%r15 + rorq $14,%r13 + movq %r11,%r9 + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%r9 + addq %r12,%rbx + addq %r12,%r9 + + leaq 8(%rbp),%rbp + movq 32(%rsp),%r13 + movq 8(%rsp),%rdi + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%r9 + movq %rdi,%r14 + rorq $42,%rdi + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%rdi + shrq $6,%r14 + + rorq $19,%rdi + xorq %r13,%r12 + xorq %r14,%rdi + addq 96(%rsp),%r12 + + addq 24(%rsp),%r12 + movq %rbx,%r13 + addq %rdi,%r12 + movq %r9,%r14 + rorq $23,%r13 + movq %rcx,%rdi + + xorq %rbx,%r13 + rorq $5,%r14 + xorq %rdx,%rdi + + movq %r12,24(%rsp) + xorq %r9,%r14 + andq %rbx,%rdi + + rorq $4,%r13 + addq %r8,%r12 + xorq %rdx,%rdi + + rorq $6,%r14 + xorq %rbx,%r13 + addq %rdi,%r12 + + movq %r9,%rdi + addq (%rbp),%r12 + xorq %r9,%r14 + + xorq %r10,%rdi + rorq $14,%r13 + movq %r10,%r8 + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%r8 + addq %r12,%rax + addq %r12,%r8 + + leaq 24(%rbp),%rbp + movq 40(%rsp),%r13 + movq 16(%rsp),%r15 + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%r8 + movq %r15,%r14 + rorq $42,%r15 + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%r15 + shrq $6,%r14 + + rorq $19,%r15 + xorq %r13,%r12 + xorq %r14,%r15 + addq 104(%rsp),%r12 + + addq 32(%rsp),%r12 + movq %rax,%r13 + addq %r15,%r12 + movq %r8,%r14 + rorq $23,%r13 + movq %rbx,%r15 + + xorq %rax,%r13 + rorq $5,%r14 + xorq %rcx,%r15 + + movq %r12,32(%rsp) + xorq %r8,%r14 + andq %rax,%r15 + + rorq $4,%r13 + addq %rdx,%r12 + xorq %rcx,%r15 + + rorq $6,%r14 + xorq %rax,%r13 + addq %r15,%r12 + + movq %r8,%r15 + addq (%rbp),%r12 + xorq %r8,%r14 + + xorq %r9,%r15 + rorq $14,%r13 + movq %r9,%rdx + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%rdx + addq %r12,%r11 + addq %r12,%rdx + + leaq 8(%rbp),%rbp + movq 48(%rsp),%r13 + movq 24(%rsp),%rdi + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%rdx + movq %rdi,%r14 + rorq $42,%rdi + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%rdi + shrq $6,%r14 + + rorq $19,%rdi + xorq %r13,%r12 + xorq %r14,%rdi + addq 112(%rsp),%r12 + + addq 40(%rsp),%r12 + 
movq %r11,%r13 + addq %rdi,%r12 + movq %rdx,%r14 + rorq $23,%r13 + movq %rax,%rdi + + xorq %r11,%r13 + rorq $5,%r14 + xorq %rbx,%rdi + + movq %r12,40(%rsp) + xorq %rdx,%r14 + andq %r11,%rdi + + rorq $4,%r13 + addq %rcx,%r12 + xorq %rbx,%rdi + + rorq $6,%r14 + xorq %r11,%r13 + addq %rdi,%r12 + + movq %rdx,%rdi + addq (%rbp),%r12 + xorq %rdx,%r14 + + xorq %r8,%rdi + rorq $14,%r13 + movq %r8,%rcx + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%rcx + addq %r12,%r10 + addq %r12,%rcx + + leaq 24(%rbp),%rbp + movq 56(%rsp),%r13 + movq 32(%rsp),%r15 + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%rcx + movq %r15,%r14 + rorq $42,%r15 + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%r15 + shrq $6,%r14 + + rorq $19,%r15 + xorq %r13,%r12 + xorq %r14,%r15 + addq 120(%rsp),%r12 + + addq 48(%rsp),%r12 + movq %r10,%r13 + addq %r15,%r12 + movq %rcx,%r14 + rorq $23,%r13 + movq %r11,%r15 + + xorq %r10,%r13 + rorq $5,%r14 + xorq %rax,%r15 + + movq %r12,48(%rsp) + xorq %rcx,%r14 + andq %r10,%r15 + + rorq $4,%r13 + addq %rbx,%r12 + xorq %rax,%r15 + + rorq $6,%r14 + xorq %r10,%r13 + addq %r15,%r12 + + movq %rcx,%r15 + addq (%rbp),%r12 + xorq %rcx,%r14 + + xorq %rdx,%r15 + rorq $14,%r13 + movq %rdx,%rbx + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%rbx + addq %r12,%r9 + addq %r12,%rbx + + leaq 8(%rbp),%rbp + movq 64(%rsp),%r13 + movq 40(%rsp),%rdi + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%rbx + movq %rdi,%r14 + rorq $42,%rdi + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%rdi + shrq $6,%r14 + + rorq $19,%rdi + xorq %r13,%r12 + xorq %r14,%rdi + addq 0(%rsp),%r12 + + addq 56(%rsp),%r12 + movq %r9,%r13 + addq %rdi,%r12 + movq %rbx,%r14 + rorq $23,%r13 + movq %r10,%rdi + + xorq %r9,%r13 + rorq $5,%r14 + xorq %r11,%rdi + + movq %r12,56(%rsp) + xorq %rbx,%r14 + andq %r9,%rdi + + rorq $4,%r13 + addq %rax,%r12 + xorq %r11,%rdi + + rorq $6,%r14 + xorq %r9,%r13 + addq %rdi,%r12 + + movq %rbx,%rdi + addq (%rbp),%r12 + xorq %rbx,%r14 + + xorq %rcx,%rdi + rorq $14,%r13 + movq %rcx,%rax + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%rax + addq %r12,%r8 + addq %r12,%rax + + leaq 24(%rbp),%rbp + movq 72(%rsp),%r13 + movq 48(%rsp),%r15 + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%rax + movq %r15,%r14 + rorq $42,%r15 + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%r15 + shrq $6,%r14 + + rorq $19,%r15 + xorq %r13,%r12 + xorq %r14,%r15 + addq 8(%rsp),%r12 + + addq 64(%rsp),%r12 + movq %r8,%r13 + addq %r15,%r12 + movq %rax,%r14 + rorq $23,%r13 + movq %r9,%r15 + + xorq %r8,%r13 + rorq $5,%r14 + xorq %r10,%r15 + + movq %r12,64(%rsp) + xorq %rax,%r14 + andq %r8,%r15 + + rorq $4,%r13 + addq %r11,%r12 + xorq %r10,%r15 + + rorq $6,%r14 + xorq %r8,%r13 + addq %r15,%r12 + + movq %rax,%r15 + addq (%rbp),%r12 + xorq %rax,%r14 + + xorq %rbx,%r15 + rorq $14,%r13 + movq %rbx,%r11 + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%r11 + addq %r12,%rdx + addq %r12,%r11 + + leaq 8(%rbp),%rbp + movq 80(%rsp),%r13 + movq 56(%rsp),%rdi + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%r11 + movq %rdi,%r14 + rorq $42,%rdi + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%rdi + shrq $6,%r14 + + rorq $19,%rdi + xorq %r13,%r12 + xorq %r14,%rdi + addq 16(%rsp),%r12 + + addq 72(%rsp),%r12 + movq %rdx,%r13 + addq %rdi,%r12 + movq %r11,%r14 + rorq $23,%r13 + movq %r8,%rdi + + xorq %rdx,%r13 + rorq $5,%r14 + xorq %r9,%rdi + + movq %r12,72(%rsp) + xorq %r11,%r14 + andq %rdx,%rdi + + rorq $4,%r13 + addq %r10,%r12 + xorq %r9,%rdi + + 
rorq $6,%r14 + xorq %rdx,%r13 + addq %rdi,%r12 + + movq %r11,%rdi + addq (%rbp),%r12 + xorq %r11,%r14 + + xorq %rax,%rdi + rorq $14,%r13 + movq %rax,%r10 + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%r10 + addq %r12,%rcx + addq %r12,%r10 + + leaq 24(%rbp),%rbp + movq 88(%rsp),%r13 + movq 64(%rsp),%r15 + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%r10 + movq %r15,%r14 + rorq $42,%r15 + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%r15 + shrq $6,%r14 + + rorq $19,%r15 + xorq %r13,%r12 + xorq %r14,%r15 + addq 24(%rsp),%r12 + + addq 80(%rsp),%r12 + movq %rcx,%r13 + addq %r15,%r12 + movq %r10,%r14 + rorq $23,%r13 + movq %rdx,%r15 + + xorq %rcx,%r13 + rorq $5,%r14 + xorq %r8,%r15 + + movq %r12,80(%rsp) + xorq %r10,%r14 + andq %rcx,%r15 + + rorq $4,%r13 + addq %r9,%r12 + xorq %r8,%r15 + + rorq $6,%r14 + xorq %rcx,%r13 + addq %r15,%r12 + + movq %r10,%r15 + addq (%rbp),%r12 + xorq %r10,%r14 + + xorq %r11,%r15 + rorq $14,%r13 + movq %r11,%r9 + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%r9 + addq %r12,%rbx + addq %r12,%r9 + + leaq 8(%rbp),%rbp + movq 96(%rsp),%r13 + movq 72(%rsp),%rdi + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%r9 + movq %rdi,%r14 + rorq $42,%rdi + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%rdi + shrq $6,%r14 + + rorq $19,%rdi + xorq %r13,%r12 + xorq %r14,%rdi + addq 32(%rsp),%r12 + + addq 88(%rsp),%r12 + movq %rbx,%r13 + addq %rdi,%r12 + movq %r9,%r14 + rorq $23,%r13 + movq %rcx,%rdi + + xorq %rbx,%r13 + rorq $5,%r14 + xorq %rdx,%rdi + + movq %r12,88(%rsp) + xorq %r9,%r14 + andq %rbx,%rdi + + rorq $4,%r13 + addq %r8,%r12 + xorq %rdx,%rdi + + rorq $6,%r14 + xorq %rbx,%r13 + addq %rdi,%r12 + + movq %r9,%rdi + addq (%rbp),%r12 + xorq %r9,%r14 + + xorq %r10,%rdi + rorq $14,%r13 + movq %r10,%r8 + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%r8 + addq %r12,%rax + addq %r12,%r8 + + leaq 24(%rbp),%rbp + movq 104(%rsp),%r13 + movq 80(%rsp),%r15 + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%r8 + movq %r15,%r14 + rorq $42,%r15 + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%r15 + shrq $6,%r14 + + rorq $19,%r15 + xorq %r13,%r12 + xorq %r14,%r15 + addq 40(%rsp),%r12 + + addq 96(%rsp),%r12 + movq %rax,%r13 + addq %r15,%r12 + movq %r8,%r14 + rorq $23,%r13 + movq %rbx,%r15 + + xorq %rax,%r13 + rorq $5,%r14 + xorq %rcx,%r15 + + movq %r12,96(%rsp) + xorq %r8,%r14 + andq %rax,%r15 + + rorq $4,%r13 + addq %rdx,%r12 + xorq %rcx,%r15 + + rorq $6,%r14 + xorq %rax,%r13 + addq %r15,%r12 + + movq %r8,%r15 + addq (%rbp),%r12 + xorq %r8,%r14 + + xorq %r9,%r15 + rorq $14,%r13 + movq %r9,%rdx + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%rdx + addq %r12,%r11 + addq %r12,%rdx + + leaq 8(%rbp),%rbp + movq 112(%rsp),%r13 + movq 88(%rsp),%rdi + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%rdx + movq %rdi,%r14 + rorq $42,%rdi + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%rdi + shrq $6,%r14 + + rorq $19,%rdi + xorq %r13,%r12 + xorq %r14,%rdi + addq 48(%rsp),%r12 + + addq 104(%rsp),%r12 + movq %r11,%r13 + addq %rdi,%r12 + movq %rdx,%r14 + rorq $23,%r13 + movq %rax,%rdi + + xorq %r11,%r13 + rorq $5,%r14 + xorq %rbx,%rdi + + movq %r12,104(%rsp) + xorq %rdx,%r14 + andq %r11,%rdi + + rorq $4,%r13 + addq %rcx,%r12 + xorq %rbx,%rdi + + rorq $6,%r14 + xorq %r11,%r13 + addq %rdi,%r12 + + movq %rdx,%rdi + addq (%rbp),%r12 + xorq %rdx,%r14 + + xorq %r8,%rdi + rorq $14,%r13 + movq %r8,%rcx + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%rcx + addq %r12,%r10 + addq 
%r12,%rcx + + leaq 24(%rbp),%rbp + movq 120(%rsp),%r13 + movq 96(%rsp),%r15 + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%rcx + movq %r15,%r14 + rorq $42,%r15 + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%r15 + shrq $6,%r14 + + rorq $19,%r15 + xorq %r13,%r12 + xorq %r14,%r15 + addq 56(%rsp),%r12 + + addq 112(%rsp),%r12 + movq %r10,%r13 + addq %r15,%r12 + movq %rcx,%r14 + rorq $23,%r13 + movq %r11,%r15 + + xorq %r10,%r13 + rorq $5,%r14 + xorq %rax,%r15 + + movq %r12,112(%rsp) + xorq %rcx,%r14 + andq %r10,%r15 + + rorq $4,%r13 + addq %rbx,%r12 + xorq %rax,%r15 + + rorq $6,%r14 + xorq %r10,%r13 + addq %r15,%r12 + + movq %rcx,%r15 + addq (%rbp),%r12 + xorq %rcx,%r14 + + xorq %rdx,%r15 + rorq $14,%r13 + movq %rdx,%rbx + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%rbx + addq %r12,%r9 + addq %r12,%rbx + + leaq 8(%rbp),%rbp + movq 0(%rsp),%r13 + movq 104(%rsp),%rdi + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%rbx + movq %rdi,%r14 + rorq $42,%rdi + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%rdi + shrq $6,%r14 + + rorq $19,%rdi + xorq %r13,%r12 + xorq %r14,%rdi + addq 64(%rsp),%r12 + + addq 120(%rsp),%r12 + movq %r9,%r13 + addq %rdi,%r12 + movq %rbx,%r14 + rorq $23,%r13 + movq %r10,%rdi + + xorq %r9,%r13 + rorq $5,%r14 + xorq %r11,%rdi + + movq %r12,120(%rsp) + xorq %rbx,%r14 + andq %r9,%rdi + + rorq $4,%r13 + addq %rax,%r12 + xorq %r11,%rdi + + rorq $6,%r14 + xorq %r9,%r13 + addq %rdi,%r12 + + movq %rbx,%rdi + addq (%rbp),%r12 + xorq %rbx,%r14 + + xorq %rcx,%rdi + rorq $14,%r13 + movq %rcx,%rax + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%rax + addq %r12,%r8 + addq %r12,%rax + + leaq 24(%rbp),%rbp + cmpb $0,7(%rbp) + jnz L$rounds_16_xx + + movq 128+0(%rsp),%rdi + addq %r14,%rax + leaq 128(%rsi),%rsi + + addq 0(%rdi),%rax + addq 8(%rdi),%rbx + addq 16(%rdi),%rcx + addq 24(%rdi),%rdx + addq 32(%rdi),%r8 + addq 40(%rdi),%r9 + addq 48(%rdi),%r10 + addq 56(%rdi),%r11 + + cmpq 128+16(%rsp),%rsi + + movq %rax,0(%rdi) + movq %rbx,8(%rdi) + movq %rcx,16(%rdi) + movq %rdx,24(%rdi) + movq %r8,32(%rdi) + movq %r9,40(%rdi) + movq %r10,48(%rdi) + movq %r11,56(%rdi) + jb L$loop + + movq 152(%rsp),%rsi + + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbp + + movq -8(%rsi),%rbx + + leaq (%rsi),%rsp + +L$epilogue: + ret + + +.section __DATA,__const +.p2align 6 + +K512: +.quad 0x428a2f98d728ae22,0x7137449123ef65cd +.quad 0x428a2f98d728ae22,0x7137449123ef65cd +.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc +.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc +.quad 0x3956c25bf348b538,0x59f111f1b605d019 +.quad 0x3956c25bf348b538,0x59f111f1b605d019 +.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 +.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 +.quad 0xd807aa98a3030242,0x12835b0145706fbe +.quad 0xd807aa98a3030242,0x12835b0145706fbe +.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 +.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 +.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 +.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 +.quad 0x9bdc06a725c71235,0xc19bf174cf692694 +.quad 0x9bdc06a725c71235,0xc19bf174cf692694 +.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 +.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 +.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 +.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 +.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 +.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 +.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 +.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 +.quad 0x983e5152ee66dfab,0xa831c66d2db43210 
+.quad 0x983e5152ee66dfab,0xa831c66d2db43210 +.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 +.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 +.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 +.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 +.quad 0x06ca6351e003826f,0x142929670a0e6e70 +.quad 0x06ca6351e003826f,0x142929670a0e6e70 +.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 +.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 +.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df +.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df +.quad 0x650a73548baf63de,0x766a0abb3c77b2a8 +.quad 0x650a73548baf63de,0x766a0abb3c77b2a8 +.quad 0x81c2c92e47edaee6,0x92722c851482353b +.quad 0x81c2c92e47edaee6,0x92722c851482353b +.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 +.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 +.quad 0xc24b8b70d0f89791,0xc76c51a30654be30 +.quad 0xc24b8b70d0f89791,0xc76c51a30654be30 +.quad 0xd192e819d6ef5218,0xd69906245565a910 +.quad 0xd192e819d6ef5218,0xd69906245565a910 +.quad 0xf40e35855771202a,0x106aa07032bbd1b8 +.quad 0xf40e35855771202a,0x106aa07032bbd1b8 +.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 +.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 +.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 +.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 +.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb +.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb +.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 +.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 +.quad 0x748f82ee5defb2fc,0x78a5636f43172f60 +.quad 0x748f82ee5defb2fc,0x78a5636f43172f60 +.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec +.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec +.quad 0x90befffa23631e28,0xa4506cebde82bde9 +.quad 0x90befffa23631e28,0xa4506cebde82bde9 +.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b +.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b +.quad 0xca273eceea26619c,0xd186b8c721c0c207 +.quad 0xca273eceea26619c,0xd186b8c721c0c207 +.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 +.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 +.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 +.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 +.quad 0x113f9804bef90dae,0x1b710b35131c471b +.quad 0x113f9804bef90dae,0x1b710b35131c471b +.quad 0x28db77f523047d84,0x32caab7b40c72493 +.quad 0x28db77f523047d84,0x32caab7b40c72493 +.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c +.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c +.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a +.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a +.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 +.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 + +.quad 0x0001020304050607,0x08090a0b0c0d0e0f +.quad 0x0001020304050607,0x08090a0b0c0d0e0f +.byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.text +.globl _sha512_block_data_order_avx +.private_extern _sha512_block_data_order_avx + +.p2align 6 +_sha512_block_data_order_avx: + +_CET_ENDBR + movq %rsp,%rax + + pushq %rbx + + pushq %rbp + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + shlq $4,%rdx + subq $160,%rsp + leaq (%rsi,%rdx,8),%rdx + andq $-64,%rsp + movq %rdi,128+0(%rsp) + movq %rsi,128+8(%rsp) + movq %rdx,128+16(%rsp) + movq %rax,152(%rsp) + +L$prologue_avx: + + vzeroupper + movq 0(%rdi),%rax + movq 8(%rdi),%rbx + movq 16(%rdi),%rcx + movq 24(%rdi),%rdx + movq 32(%rdi),%r8 + movq 40(%rdi),%r9 + movq 48(%rdi),%r10 + movq 56(%rdi),%r11 + jmp L$loop_avx +.p2align 4 +L$loop_avx: + vmovdqa K512+1280(%rip),%xmm11 + vmovdqu 0(%rsi),%xmm0 + leaq 
K512+128(%rip),%rbp + vmovdqu 16(%rsi),%xmm1 + vmovdqu 32(%rsi),%xmm2 + vpshufb %xmm11,%xmm0,%xmm0 + vmovdqu 48(%rsi),%xmm3 + vpshufb %xmm11,%xmm1,%xmm1 + vmovdqu 64(%rsi),%xmm4 + vpshufb %xmm11,%xmm2,%xmm2 + vmovdqu 80(%rsi),%xmm5 + vpshufb %xmm11,%xmm3,%xmm3 + vmovdqu 96(%rsi),%xmm6 + vpshufb %xmm11,%xmm4,%xmm4 + vmovdqu 112(%rsi),%xmm7 + vpshufb %xmm11,%xmm5,%xmm5 + vpaddq -128(%rbp),%xmm0,%xmm8 + vpshufb %xmm11,%xmm6,%xmm6 + vpaddq -96(%rbp),%xmm1,%xmm9 + vpshufb %xmm11,%xmm7,%xmm7 + vpaddq -64(%rbp),%xmm2,%xmm10 + vpaddq -32(%rbp),%xmm3,%xmm11 + vmovdqa %xmm8,0(%rsp) + vpaddq 0(%rbp),%xmm4,%xmm8 + vmovdqa %xmm9,16(%rsp) + vpaddq 32(%rbp),%xmm5,%xmm9 + vmovdqa %xmm10,32(%rsp) + vpaddq 64(%rbp),%xmm6,%xmm10 + vmovdqa %xmm11,48(%rsp) + vpaddq 96(%rbp),%xmm7,%xmm11 + vmovdqa %xmm8,64(%rsp) + movq %rax,%r14 + vmovdqa %xmm9,80(%rsp) + movq %rbx,%rdi + vmovdqa %xmm10,96(%rsp) + xorq %rcx,%rdi + vmovdqa %xmm11,112(%rsp) + movq %r8,%r13 + jmp L$avx_00_47 + +.p2align 4 +L$avx_00_47: + addq $256,%rbp + vpalignr $8,%xmm0,%xmm1,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%rax + vpalignr $8,%xmm4,%xmm5,%xmm11 + movq %r9,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %r8,%r13 + xorq %r10,%r12 + vpaddq %xmm11,%xmm0,%xmm0 + shrdq $4,%r13,%r13 + xorq %rax,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %r8,%r12 + xorq %r8,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 0(%rsp),%r11 + movq %rax,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %r10,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %rbx,%r15 + addq %r12,%r11 + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %rax,%r14 + addq %r13,%r11 + vpxor %xmm10,%xmm8,%xmm8 + xorq %rbx,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm7,%xmm11 + addq %r11,%rdx + addq %rdi,%r11 + vpxor %xmm9,%xmm8,%xmm8 + movq %rdx,%r13 + addq %r11,%r14 + vpsllq $3,%xmm7,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%r11 + vpaddq %xmm8,%xmm0,%xmm0 + movq %r8,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm7,%xmm9 + xorq %rdx,%r13 + xorq %r9,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %r11,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %rdx,%r12 + xorq %rdx,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 8(%rsp),%r10 + movq %r11,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %r9,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %rax,%rdi + addq %r12,%r10 + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm0,%xmm0 + xorq %r11,%r14 + addq %r13,%r10 + vpaddq -128(%rbp),%xmm0,%xmm10 + xorq %rax,%r15 + shrdq $28,%r14,%r14 + addq %r10,%rcx + addq %r15,%r10 + movq %rcx,%r13 + addq %r10,%r14 + vmovdqa %xmm10,0(%rsp) + vpalignr $8,%xmm1,%xmm2,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%r10 + vpalignr $8,%xmm5,%xmm6,%xmm11 + movq %rdx,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %rcx,%r13 + xorq %r8,%r12 + vpaddq %xmm11,%xmm1,%xmm1 + shrdq $4,%r13,%r13 + xorq %r10,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %rcx,%r12 + xorq %rcx,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 16(%rsp),%r9 + movq %r10,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %r8,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %r11,%r15 + addq %r12,%r9 + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %r10,%r14 + addq %r13,%r9 + vpxor %xmm10,%xmm8,%xmm8 + xorq %r11,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm0,%xmm11 + addq %r9,%rbx + addq %rdi,%r9 + vpxor %xmm9,%xmm8,%xmm8 + movq %rbx,%r13 + addq %r9,%r14 + vpsllq $3,%xmm0,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%r9 + vpaddq %xmm8,%xmm1,%xmm1 + movq %rcx,%r12 + 
shrdq $5,%r14,%r14 + vpsrlq $19,%xmm0,%xmm9 + xorq %rbx,%r13 + xorq %rdx,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %r9,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %rbx,%r12 + xorq %rbx,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 24(%rsp),%r8 + movq %r9,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %rdx,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %r10,%rdi + addq %r12,%r8 + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm1,%xmm1 + xorq %r9,%r14 + addq %r13,%r8 + vpaddq -96(%rbp),%xmm1,%xmm10 + xorq %r10,%r15 + shrdq $28,%r14,%r14 + addq %r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq %r8,%r14 + vmovdqa %xmm10,16(%rsp) + vpalignr $8,%xmm2,%xmm3,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%r8 + vpalignr $8,%xmm6,%xmm7,%xmm11 + movq %rbx,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %rax,%r13 + xorq %rcx,%r12 + vpaddq %xmm11,%xmm2,%xmm2 + shrdq $4,%r13,%r13 + xorq %r8,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %rax,%r12 + xorq %rax,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 32(%rsp),%rdx + movq %r8,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %rcx,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %r9,%r15 + addq %r12,%rdx + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %r8,%r14 + addq %r13,%rdx + vpxor %xmm10,%xmm8,%xmm8 + xorq %r9,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm1,%xmm11 + addq %rdx,%r11 + addq %rdi,%rdx + vpxor %xmm9,%xmm8,%xmm8 + movq %r11,%r13 + addq %rdx,%r14 + vpsllq $3,%xmm1,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%rdx + vpaddq %xmm8,%xmm2,%xmm2 + movq %rax,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm1,%xmm9 + xorq %r11,%r13 + xorq %rbx,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %rdx,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %r11,%r12 + xorq %r11,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 40(%rsp),%rcx + movq %rdx,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %rbx,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %r8,%rdi + addq %r12,%rcx + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm2,%xmm2 + xorq %rdx,%r14 + addq %r13,%rcx + vpaddq -64(%rbp),%xmm2,%xmm10 + xorq %r8,%r15 + shrdq $28,%r14,%r14 + addq %rcx,%r10 + addq %r15,%rcx + movq %r10,%r13 + addq %rcx,%r14 + vmovdqa %xmm10,32(%rsp) + vpalignr $8,%xmm3,%xmm4,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%rcx + vpalignr $8,%xmm7,%xmm0,%xmm11 + movq %r11,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %r10,%r13 + xorq %rax,%r12 + vpaddq %xmm11,%xmm3,%xmm3 + shrdq $4,%r13,%r13 + xorq %rcx,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %r10,%r12 + xorq %r10,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 48(%rsp),%rbx + movq %rcx,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %rax,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %rdx,%r15 + addq %r12,%rbx + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %rcx,%r14 + addq %r13,%rbx + vpxor %xmm10,%xmm8,%xmm8 + xorq %rdx,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm2,%xmm11 + addq %rbx,%r9 + addq %rdi,%rbx + vpxor %xmm9,%xmm8,%xmm8 + movq %r9,%r13 + addq %rbx,%r14 + vpsllq $3,%xmm2,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%rbx + vpaddq %xmm8,%xmm3,%xmm3 + movq %r10,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm2,%xmm9 + xorq %r9,%r13 + xorq %r11,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %rbx,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %r9,%r12 + xorq %r9,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 56(%rsp),%rax + movq %rbx,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %r11,%r12 + shrdq 
$6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %rcx,%rdi + addq %r12,%rax + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm3,%xmm3 + xorq %rbx,%r14 + addq %r13,%rax + vpaddq -32(%rbp),%xmm3,%xmm10 + xorq %rcx,%r15 + shrdq $28,%r14,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + vmovdqa %xmm10,48(%rsp) + vpalignr $8,%xmm4,%xmm5,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%rax + vpalignr $8,%xmm0,%xmm1,%xmm11 + movq %r9,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %r8,%r13 + xorq %r10,%r12 + vpaddq %xmm11,%xmm4,%xmm4 + shrdq $4,%r13,%r13 + xorq %rax,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %r8,%r12 + xorq %r8,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 64(%rsp),%r11 + movq %rax,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %r10,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %rbx,%r15 + addq %r12,%r11 + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %rax,%r14 + addq %r13,%r11 + vpxor %xmm10,%xmm8,%xmm8 + xorq %rbx,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm3,%xmm11 + addq %r11,%rdx + addq %rdi,%r11 + vpxor %xmm9,%xmm8,%xmm8 + movq %rdx,%r13 + addq %r11,%r14 + vpsllq $3,%xmm3,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%r11 + vpaddq %xmm8,%xmm4,%xmm4 + movq %r8,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm3,%xmm9 + xorq %rdx,%r13 + xorq %r9,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %r11,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %rdx,%r12 + xorq %rdx,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 72(%rsp),%r10 + movq %r11,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %r9,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %rax,%rdi + addq %r12,%r10 + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm4,%xmm4 + xorq %r11,%r14 + addq %r13,%r10 + vpaddq 0(%rbp),%xmm4,%xmm10 + xorq %rax,%r15 + shrdq $28,%r14,%r14 + addq %r10,%rcx + addq %r15,%r10 + movq %rcx,%r13 + addq %r10,%r14 + vmovdqa %xmm10,64(%rsp) + vpalignr $8,%xmm5,%xmm6,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%r10 + vpalignr $8,%xmm1,%xmm2,%xmm11 + movq %rdx,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %rcx,%r13 + xorq %r8,%r12 + vpaddq %xmm11,%xmm5,%xmm5 + shrdq $4,%r13,%r13 + xorq %r10,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %rcx,%r12 + xorq %rcx,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 80(%rsp),%r9 + movq %r10,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %r8,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %r11,%r15 + addq %r12,%r9 + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %r10,%r14 + addq %r13,%r9 + vpxor %xmm10,%xmm8,%xmm8 + xorq %r11,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm4,%xmm11 + addq %r9,%rbx + addq %rdi,%r9 + vpxor %xmm9,%xmm8,%xmm8 + movq %rbx,%r13 + addq %r9,%r14 + vpsllq $3,%xmm4,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%r9 + vpaddq %xmm8,%xmm5,%xmm5 + movq %rcx,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm4,%xmm9 + xorq %rbx,%r13 + xorq %rdx,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %r9,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %rbx,%r12 + xorq %rbx,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 88(%rsp),%r8 + movq %r9,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %rdx,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %r10,%rdi + addq %r12,%r8 + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm5,%xmm5 + xorq %r9,%r14 + addq %r13,%r8 + vpaddq 32(%rbp),%xmm5,%xmm10 + xorq %r10,%r15 + shrdq $28,%r14,%r14 + addq %r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq 
%r8,%r14 + vmovdqa %xmm10,80(%rsp) + vpalignr $8,%xmm6,%xmm7,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%r8 + vpalignr $8,%xmm2,%xmm3,%xmm11 + movq %rbx,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %rax,%r13 + xorq %rcx,%r12 + vpaddq %xmm11,%xmm6,%xmm6 + shrdq $4,%r13,%r13 + xorq %r8,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %rax,%r12 + xorq %rax,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 96(%rsp),%rdx + movq %r8,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %rcx,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %r9,%r15 + addq %r12,%rdx + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %r8,%r14 + addq %r13,%rdx + vpxor %xmm10,%xmm8,%xmm8 + xorq %r9,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm5,%xmm11 + addq %rdx,%r11 + addq %rdi,%rdx + vpxor %xmm9,%xmm8,%xmm8 + movq %r11,%r13 + addq %rdx,%r14 + vpsllq $3,%xmm5,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%rdx + vpaddq %xmm8,%xmm6,%xmm6 + movq %rax,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm5,%xmm9 + xorq %r11,%r13 + xorq %rbx,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %rdx,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %r11,%r12 + xorq %r11,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 104(%rsp),%rcx + movq %rdx,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %rbx,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %r8,%rdi + addq %r12,%rcx + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm6,%xmm6 + xorq %rdx,%r14 + addq %r13,%rcx + vpaddq 64(%rbp),%xmm6,%xmm10 + xorq %r8,%r15 + shrdq $28,%r14,%r14 + addq %rcx,%r10 + addq %r15,%rcx + movq %r10,%r13 + addq %rcx,%r14 + vmovdqa %xmm10,96(%rsp) + vpalignr $8,%xmm7,%xmm0,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%rcx + vpalignr $8,%xmm3,%xmm4,%xmm11 + movq %r11,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %r10,%r13 + xorq %rax,%r12 + vpaddq %xmm11,%xmm7,%xmm7 + shrdq $4,%r13,%r13 + xorq %rcx,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %r10,%r12 + xorq %r10,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 112(%rsp),%rbx + movq %rcx,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %rax,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %rdx,%r15 + addq %r12,%rbx + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %rcx,%r14 + addq %r13,%rbx + vpxor %xmm10,%xmm8,%xmm8 + xorq %rdx,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm6,%xmm11 + addq %rbx,%r9 + addq %rdi,%rbx + vpxor %xmm9,%xmm8,%xmm8 + movq %r9,%r13 + addq %rbx,%r14 + vpsllq $3,%xmm6,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%rbx + vpaddq %xmm8,%xmm7,%xmm7 + movq %r10,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm6,%xmm9 + xorq %r9,%r13 + xorq %r11,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %rbx,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %r9,%r12 + xorq %r9,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 120(%rsp),%rax + movq %rbx,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %r11,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %rcx,%rdi + addq %r12,%rax + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm7,%xmm7 + xorq %rbx,%r14 + addq %r13,%rax + vpaddq 96(%rbp),%xmm7,%xmm10 + xorq %rcx,%r15 + shrdq $28,%r14,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + vmovdqa %xmm10,112(%rsp) + cmpb $0,135(%rbp) + jne L$avx_00_47 + shrdq $23,%r13,%r13 + movq %r14,%rax + movq %r9,%r12 + shrdq $5,%r14,%r14 + xorq %r8,%r13 + xorq %r10,%r12 + shrdq $4,%r13,%r13 + xorq %rax,%r14 + andq %r8,%r12 + xorq %r8,%r13 + addq 0(%rsp),%r11 + movq %rax,%r15 + xorq %r10,%r12 + shrdq 
$6,%r14,%r14 + xorq %rbx,%r15 + addq %r12,%r11 + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %rax,%r14 + addq %r13,%r11 + xorq %rbx,%rdi + shrdq $28,%r14,%r14 + addq %r11,%rdx + addq %rdi,%r11 + movq %rdx,%r13 + addq %r11,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r11 + movq %r8,%r12 + shrdq $5,%r14,%r14 + xorq %rdx,%r13 + xorq %r9,%r12 + shrdq $4,%r13,%r13 + xorq %r11,%r14 + andq %rdx,%r12 + xorq %rdx,%r13 + addq 8(%rsp),%r10 + movq %r11,%rdi + xorq %r9,%r12 + shrdq $6,%r14,%r14 + xorq %rax,%rdi + addq %r12,%r10 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %r11,%r14 + addq %r13,%r10 + xorq %rax,%r15 + shrdq $28,%r14,%r14 + addq %r10,%rcx + addq %r15,%r10 + movq %rcx,%r13 + addq %r10,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r10 + movq %rdx,%r12 + shrdq $5,%r14,%r14 + xorq %rcx,%r13 + xorq %r8,%r12 + shrdq $4,%r13,%r13 + xorq %r10,%r14 + andq %rcx,%r12 + xorq %rcx,%r13 + addq 16(%rsp),%r9 + movq %r10,%r15 + xorq %r8,%r12 + shrdq $6,%r14,%r14 + xorq %r11,%r15 + addq %r12,%r9 + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %r10,%r14 + addq %r13,%r9 + xorq %r11,%rdi + shrdq $28,%r14,%r14 + addq %r9,%rbx + addq %rdi,%r9 + movq %rbx,%r13 + addq %r9,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r9 + movq %rcx,%r12 + shrdq $5,%r14,%r14 + xorq %rbx,%r13 + xorq %rdx,%r12 + shrdq $4,%r13,%r13 + xorq %r9,%r14 + andq %rbx,%r12 + xorq %rbx,%r13 + addq 24(%rsp),%r8 + movq %r9,%rdi + xorq %rdx,%r12 + shrdq $6,%r14,%r14 + xorq %r10,%rdi + addq %r12,%r8 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %r9,%r14 + addq %r13,%r8 + xorq %r10,%r15 + shrdq $28,%r14,%r14 + addq %r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq %r8,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r8 + movq %rbx,%r12 + shrdq $5,%r14,%r14 + xorq %rax,%r13 + xorq %rcx,%r12 + shrdq $4,%r13,%r13 + xorq %r8,%r14 + andq %rax,%r12 + xorq %rax,%r13 + addq 32(%rsp),%rdx + movq %r8,%r15 + xorq %rcx,%r12 + shrdq $6,%r14,%r14 + xorq %r9,%r15 + addq %r12,%rdx + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %r8,%r14 + addq %r13,%rdx + xorq %r9,%rdi + shrdq $28,%r14,%r14 + addq %rdx,%r11 + addq %rdi,%rdx + movq %r11,%r13 + addq %rdx,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rdx + movq %rax,%r12 + shrdq $5,%r14,%r14 + xorq %r11,%r13 + xorq %rbx,%r12 + shrdq $4,%r13,%r13 + xorq %rdx,%r14 + andq %r11,%r12 + xorq %r11,%r13 + addq 40(%rsp),%rcx + movq %rdx,%rdi + xorq %rbx,%r12 + shrdq $6,%r14,%r14 + xorq %r8,%rdi + addq %r12,%rcx + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %rdx,%r14 + addq %r13,%rcx + xorq %r8,%r15 + shrdq $28,%r14,%r14 + addq %rcx,%r10 + addq %r15,%rcx + movq %r10,%r13 + addq %rcx,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rcx + movq %r11,%r12 + shrdq $5,%r14,%r14 + xorq %r10,%r13 + xorq %rax,%r12 + shrdq $4,%r13,%r13 + xorq %rcx,%r14 + andq %r10,%r12 + xorq %r10,%r13 + addq 48(%rsp),%rbx + movq %rcx,%r15 + xorq %rax,%r12 + shrdq $6,%r14,%r14 + xorq %rdx,%r15 + addq %r12,%rbx + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %rcx,%r14 + addq %r13,%rbx + xorq %rdx,%rdi + shrdq $28,%r14,%r14 + addq %rbx,%r9 + addq %rdi,%rbx + movq %r9,%r13 + addq %rbx,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rbx + movq %r10,%r12 + shrdq $5,%r14,%r14 + xorq %r9,%r13 + xorq %r11,%r12 + shrdq $4,%r13,%r13 + xorq %rbx,%r14 + andq %r9,%r12 + xorq %r9,%r13 + addq 56(%rsp),%rax + movq %rbx,%rdi + xorq %r11,%r12 + shrdq $6,%r14,%r14 + xorq %rcx,%rdi + addq %r12,%rax + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %rbx,%r14 + addq %r13,%rax + xorq %rcx,%r15 + shrdq $28,%r14,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rax + movq 
%r9,%r12 + shrdq $5,%r14,%r14 + xorq %r8,%r13 + xorq %r10,%r12 + shrdq $4,%r13,%r13 + xorq %rax,%r14 + andq %r8,%r12 + xorq %r8,%r13 + addq 64(%rsp),%r11 + movq %rax,%r15 + xorq %r10,%r12 + shrdq $6,%r14,%r14 + xorq %rbx,%r15 + addq %r12,%r11 + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %rax,%r14 + addq %r13,%r11 + xorq %rbx,%rdi + shrdq $28,%r14,%r14 + addq %r11,%rdx + addq %rdi,%r11 + movq %rdx,%r13 + addq %r11,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r11 + movq %r8,%r12 + shrdq $5,%r14,%r14 + xorq %rdx,%r13 + xorq %r9,%r12 + shrdq $4,%r13,%r13 + xorq %r11,%r14 + andq %rdx,%r12 + xorq %rdx,%r13 + addq 72(%rsp),%r10 + movq %r11,%rdi + xorq %r9,%r12 + shrdq $6,%r14,%r14 + xorq %rax,%rdi + addq %r12,%r10 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %r11,%r14 + addq %r13,%r10 + xorq %rax,%r15 + shrdq $28,%r14,%r14 + addq %r10,%rcx + addq %r15,%r10 + movq %rcx,%r13 + addq %r10,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r10 + movq %rdx,%r12 + shrdq $5,%r14,%r14 + xorq %rcx,%r13 + xorq %r8,%r12 + shrdq $4,%r13,%r13 + xorq %r10,%r14 + andq %rcx,%r12 + xorq %rcx,%r13 + addq 80(%rsp),%r9 + movq %r10,%r15 + xorq %r8,%r12 + shrdq $6,%r14,%r14 + xorq %r11,%r15 + addq %r12,%r9 + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %r10,%r14 + addq %r13,%r9 + xorq %r11,%rdi + shrdq $28,%r14,%r14 + addq %r9,%rbx + addq %rdi,%r9 + movq %rbx,%r13 + addq %r9,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r9 + movq %rcx,%r12 + shrdq $5,%r14,%r14 + xorq %rbx,%r13 + xorq %rdx,%r12 + shrdq $4,%r13,%r13 + xorq %r9,%r14 + andq %rbx,%r12 + xorq %rbx,%r13 + addq 88(%rsp),%r8 + movq %r9,%rdi + xorq %rdx,%r12 + shrdq $6,%r14,%r14 + xorq %r10,%rdi + addq %r12,%r8 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %r9,%r14 + addq %r13,%r8 + xorq %r10,%r15 + shrdq $28,%r14,%r14 + addq %r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq %r8,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r8 + movq %rbx,%r12 + shrdq $5,%r14,%r14 + xorq %rax,%r13 + xorq %rcx,%r12 + shrdq $4,%r13,%r13 + xorq %r8,%r14 + andq %rax,%r12 + xorq %rax,%r13 + addq 96(%rsp),%rdx + movq %r8,%r15 + xorq %rcx,%r12 + shrdq $6,%r14,%r14 + xorq %r9,%r15 + addq %r12,%rdx + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %r8,%r14 + addq %r13,%rdx + xorq %r9,%rdi + shrdq $28,%r14,%r14 + addq %rdx,%r11 + addq %rdi,%rdx + movq %r11,%r13 + addq %rdx,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rdx + movq %rax,%r12 + shrdq $5,%r14,%r14 + xorq %r11,%r13 + xorq %rbx,%r12 + shrdq $4,%r13,%r13 + xorq %rdx,%r14 + andq %r11,%r12 + xorq %r11,%r13 + addq 104(%rsp),%rcx + movq %rdx,%rdi + xorq %rbx,%r12 + shrdq $6,%r14,%r14 + xorq %r8,%rdi + addq %r12,%rcx + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %rdx,%r14 + addq %r13,%rcx + xorq %r8,%r15 + shrdq $28,%r14,%r14 + addq %rcx,%r10 + addq %r15,%rcx + movq %r10,%r13 + addq %rcx,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rcx + movq %r11,%r12 + shrdq $5,%r14,%r14 + xorq %r10,%r13 + xorq %rax,%r12 + shrdq $4,%r13,%r13 + xorq %rcx,%r14 + andq %r10,%r12 + xorq %r10,%r13 + addq 112(%rsp),%rbx + movq %rcx,%r15 + xorq %rax,%r12 + shrdq $6,%r14,%r14 + xorq %rdx,%r15 + addq %r12,%rbx + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %rcx,%r14 + addq %r13,%rbx + xorq %rdx,%rdi + shrdq $28,%r14,%r14 + addq %rbx,%r9 + addq %rdi,%rbx + movq %r9,%r13 + addq %rbx,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rbx + movq %r10,%r12 + shrdq $5,%r14,%r14 + xorq %r9,%r13 + xorq %r11,%r12 + shrdq $4,%r13,%r13 + xorq %rbx,%r14 + andq %r9,%r12 + xorq %r9,%r13 + addq 120(%rsp),%rax + movq %rbx,%rdi + xorq %r11,%r12 + shrdq $6,%r14,%r14 + xorq %rcx,%rdi + addq %r12,%rax + shrdq $14,%r13,%r13 + andq 
%rdi,%r15 + xorq %rbx,%r14 + addq %r13,%rax + xorq %rcx,%r15 + shrdq $28,%r14,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + movq 128+0(%rsp),%rdi + movq %r14,%rax + + addq 0(%rdi),%rax + leaq 128(%rsi),%rsi + addq 8(%rdi),%rbx + addq 16(%rdi),%rcx + addq 24(%rdi),%rdx + addq 32(%rdi),%r8 + addq 40(%rdi),%r9 + addq 48(%rdi),%r10 + addq 56(%rdi),%r11 + + cmpq 128+16(%rsp),%rsi + + movq %rax,0(%rdi) + movq %rbx,8(%rdi) + movq %rcx,16(%rdi) + movq %rdx,24(%rdi) + movq %r8,32(%rdi) + movq %r9,40(%rdi) + movq %r10,48(%rdi) + movq %r11,56(%rdi) + jb L$loop_avx + + movq 152(%rsp),%rsi + + vzeroupper + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbp + + movq -8(%rsi),%rbx + + leaq (%rsi),%rsp + +L$epilogue_avx: + ret + + +#endif diff --git a/ring-0.17.14/pregenerated/sha512-x86_64-nasm.asm b/ring-0.17.14/pregenerated/sha512-x86_64-nasm.asm new file mode 100644 index 0000000000..96e1ad9f45 --- /dev/null +++ b/ring-0.17.14/pregenerated/sha512-x86_64-nasm.asm @@ -0,0 +1,3138 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. + +%ifidn __OUTPUT_FORMAT__, win64 +default rel +%define XMMWORD +%define YMMWORD +%define ZMMWORD +%define _CET_ENDBR + +%include "ring_core_generated/prefix_symbols_nasm.inc" +section .text code align=64 + + +global sha512_block_data_order_nohw + +ALIGN 16 +sha512_block_data_order_nohw: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_sha512_block_data_order_nohw: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + +_CET_ENDBR + mov rax,rsp + + push rbx + + push rbp + + push r12 + + push r13 + + push r14 + + push r15 + + shl rdx,4 + sub rsp,16*8+4*8 + lea rdx,[rdx*8+rsi] + and rsp,-64 + mov QWORD[((128+0))+rsp],rdi + mov QWORD[((128+8))+rsp],rsi + mov QWORD[((128+16))+rsp],rdx + mov QWORD[152+rsp],rax + +$L$prologue: + + mov rax,QWORD[rdi] + mov rbx,QWORD[8+rdi] + mov rcx,QWORD[16+rdi] + mov rdx,QWORD[24+rdi] + mov r8,QWORD[32+rdi] + mov r9,QWORD[40+rdi] + mov r10,QWORD[48+rdi] + mov r11,QWORD[56+rdi] + jmp NEAR $L$loop + +ALIGN 16 +$L$loop: + mov rdi,rbx + lea rbp,[K512] + xor rdi,rcx + mov r12,QWORD[rsi] + mov r13,r8 + mov r14,rax + bswap r12 + ror r13,23 + mov r15,r9 + + xor r13,r8 + ror r14,5 + xor r15,r10 + + mov QWORD[rsp],r12 + xor r14,rax + and r15,r8 + + ror r13,4 + add r12,r11 + xor r15,r10 + + ror r14,6 + xor r13,r8 + add r12,r15 + + mov r15,rax + add r12,QWORD[rbp] + xor r14,rax + + xor r15,rbx + ror r13,14 + mov r11,rbx + + and rdi,r15 + ror r14,28 + add r12,r13 + + xor r11,rdi + add rdx,r12 + add r11,r12 + + lea rbp,[8+rbp] + add r11,r14 + mov r12,QWORD[8+rsi] + mov r13,rdx + mov r14,r11 + bswap r12 + ror r13,23 + mov rdi,r8 + + xor r13,rdx + ror r14,5 + xor rdi,r9 + + mov QWORD[8+rsp],r12 + xor r14,r11 + and rdi,rdx + + ror r13,4 + add r12,r10 + xor rdi,r9 + + ror r14,6 + xor r13,rdx + add r12,rdi + + mov rdi,r11 + add r12,QWORD[rbp] + xor r14,r11 + + xor rdi,rax + ror r13,14 + mov r10,rax + + and r15,rdi + ror r14,28 + add r12,r13 + + xor r10,r15 + add rcx,r12 + add r10,r12 + + lea rbp,[24+rbp] + add r10,r14 + mov r12,QWORD[16+rsi] + mov r13,rcx + mov r14,r10 + bswap r12 + ror r13,23 + mov r15,rdx + + xor r13,rcx + ror r14,5 + xor r15,r8 + + mov QWORD[16+rsp],r12 + xor r14,r10 + and r15,rcx + + ror r13,4 + add r12,r9 + xor r15,r8 + + ror r14,6 + xor r13,rcx + add r12,r15 + + mov r15,r10 + add r12,QWORD[rbp] + xor r14,r10 + + xor r15,r11 + ror r13,14 + mov r9,r11 + + 
and rdi,r15 + ror r14,28 + add r12,r13 + + xor r9,rdi + add rbx,r12 + add r9,r12 + + lea rbp,[8+rbp] + add r9,r14 + mov r12,QWORD[24+rsi] + mov r13,rbx + mov r14,r9 + bswap r12 + ror r13,23 + mov rdi,rcx + + xor r13,rbx + ror r14,5 + xor rdi,rdx + + mov QWORD[24+rsp],r12 + xor r14,r9 + and rdi,rbx + + ror r13,4 + add r12,r8 + xor rdi,rdx + + ror r14,6 + xor r13,rbx + add r12,rdi + + mov rdi,r9 + add r12,QWORD[rbp] + xor r14,r9 + + xor rdi,r10 + ror r13,14 + mov r8,r10 + + and r15,rdi + ror r14,28 + add r12,r13 + + xor r8,r15 + add rax,r12 + add r8,r12 + + lea rbp,[24+rbp] + add r8,r14 + mov r12,QWORD[32+rsi] + mov r13,rax + mov r14,r8 + bswap r12 + ror r13,23 + mov r15,rbx + + xor r13,rax + ror r14,5 + xor r15,rcx + + mov QWORD[32+rsp],r12 + xor r14,r8 + and r15,rax + + ror r13,4 + add r12,rdx + xor r15,rcx + + ror r14,6 + xor r13,rax + add r12,r15 + + mov r15,r8 + add r12,QWORD[rbp] + xor r14,r8 + + xor r15,r9 + ror r13,14 + mov rdx,r9 + + and rdi,r15 + ror r14,28 + add r12,r13 + + xor rdx,rdi + add r11,r12 + add rdx,r12 + + lea rbp,[8+rbp] + add rdx,r14 + mov r12,QWORD[40+rsi] + mov r13,r11 + mov r14,rdx + bswap r12 + ror r13,23 + mov rdi,rax + + xor r13,r11 + ror r14,5 + xor rdi,rbx + + mov QWORD[40+rsp],r12 + xor r14,rdx + and rdi,r11 + + ror r13,4 + add r12,rcx + xor rdi,rbx + + ror r14,6 + xor r13,r11 + add r12,rdi + + mov rdi,rdx + add r12,QWORD[rbp] + xor r14,rdx + + xor rdi,r8 + ror r13,14 + mov rcx,r8 + + and r15,rdi + ror r14,28 + add r12,r13 + + xor rcx,r15 + add r10,r12 + add rcx,r12 + + lea rbp,[24+rbp] + add rcx,r14 + mov r12,QWORD[48+rsi] + mov r13,r10 + mov r14,rcx + bswap r12 + ror r13,23 + mov r15,r11 + + xor r13,r10 + ror r14,5 + xor r15,rax + + mov QWORD[48+rsp],r12 + xor r14,rcx + and r15,r10 + + ror r13,4 + add r12,rbx + xor r15,rax + + ror r14,6 + xor r13,r10 + add r12,r15 + + mov r15,rcx + add r12,QWORD[rbp] + xor r14,rcx + + xor r15,rdx + ror r13,14 + mov rbx,rdx + + and rdi,r15 + ror r14,28 + add r12,r13 + + xor rbx,rdi + add r9,r12 + add rbx,r12 + + lea rbp,[8+rbp] + add rbx,r14 + mov r12,QWORD[56+rsi] + mov r13,r9 + mov r14,rbx + bswap r12 + ror r13,23 + mov rdi,r10 + + xor r13,r9 + ror r14,5 + xor rdi,r11 + + mov QWORD[56+rsp],r12 + xor r14,rbx + and rdi,r9 + + ror r13,4 + add r12,rax + xor rdi,r11 + + ror r14,6 + xor r13,r9 + add r12,rdi + + mov rdi,rbx + add r12,QWORD[rbp] + xor r14,rbx + + xor rdi,rcx + ror r13,14 + mov rax,rcx + + and r15,rdi + ror r14,28 + add r12,r13 + + xor rax,r15 + add r8,r12 + add rax,r12 + + lea rbp,[24+rbp] + add rax,r14 + mov r12,QWORD[64+rsi] + mov r13,r8 + mov r14,rax + bswap r12 + ror r13,23 + mov r15,r9 + + xor r13,r8 + ror r14,5 + xor r15,r10 + + mov QWORD[64+rsp],r12 + xor r14,rax + and r15,r8 + + ror r13,4 + add r12,r11 + xor r15,r10 + + ror r14,6 + xor r13,r8 + add r12,r15 + + mov r15,rax + add r12,QWORD[rbp] + xor r14,rax + + xor r15,rbx + ror r13,14 + mov r11,rbx + + and rdi,r15 + ror r14,28 + add r12,r13 + + xor r11,rdi + add rdx,r12 + add r11,r12 + + lea rbp,[8+rbp] + add r11,r14 + mov r12,QWORD[72+rsi] + mov r13,rdx + mov r14,r11 + bswap r12 + ror r13,23 + mov rdi,r8 + + xor r13,rdx + ror r14,5 + xor rdi,r9 + + mov QWORD[72+rsp],r12 + xor r14,r11 + and rdi,rdx + + ror r13,4 + add r12,r10 + xor rdi,r9 + + ror r14,6 + xor r13,rdx + add r12,rdi + + mov rdi,r11 + add r12,QWORD[rbp] + xor r14,r11 + + xor rdi,rax + ror r13,14 + mov r10,rax + + and r15,rdi + ror r14,28 + add r12,r13 + + xor r10,r15 + add rcx,r12 + add r10,r12 + + lea rbp,[24+rbp] + add r10,r14 + mov r12,QWORD[80+rsi] + mov r13,rcx + mov r14,r10 + bswap r12 + 
ror r13,23 + mov r15,rdx + + xor r13,rcx + ror r14,5 + xor r15,r8 + + mov QWORD[80+rsp],r12 + xor r14,r10 + and r15,rcx + + ror r13,4 + add r12,r9 + xor r15,r8 + + ror r14,6 + xor r13,rcx + add r12,r15 + + mov r15,r10 + add r12,QWORD[rbp] + xor r14,r10 + + xor r15,r11 + ror r13,14 + mov r9,r11 + + and rdi,r15 + ror r14,28 + add r12,r13 + + xor r9,rdi + add rbx,r12 + add r9,r12 + + lea rbp,[8+rbp] + add r9,r14 + mov r12,QWORD[88+rsi] + mov r13,rbx + mov r14,r9 + bswap r12 + ror r13,23 + mov rdi,rcx + + xor r13,rbx + ror r14,5 + xor rdi,rdx + + mov QWORD[88+rsp],r12 + xor r14,r9 + and rdi,rbx + + ror r13,4 + add r12,r8 + xor rdi,rdx + + ror r14,6 + xor r13,rbx + add r12,rdi + + mov rdi,r9 + add r12,QWORD[rbp] + xor r14,r9 + + xor rdi,r10 + ror r13,14 + mov r8,r10 + + and r15,rdi + ror r14,28 + add r12,r13 + + xor r8,r15 + add rax,r12 + add r8,r12 + + lea rbp,[24+rbp] + add r8,r14 + mov r12,QWORD[96+rsi] + mov r13,rax + mov r14,r8 + bswap r12 + ror r13,23 + mov r15,rbx + + xor r13,rax + ror r14,5 + xor r15,rcx + + mov QWORD[96+rsp],r12 + xor r14,r8 + and r15,rax + + ror r13,4 + add r12,rdx + xor r15,rcx + + ror r14,6 + xor r13,rax + add r12,r15 + + mov r15,r8 + add r12,QWORD[rbp] + xor r14,r8 + + xor r15,r9 + ror r13,14 + mov rdx,r9 + + and rdi,r15 + ror r14,28 + add r12,r13 + + xor rdx,rdi + add r11,r12 + add rdx,r12 + + lea rbp,[8+rbp] + add rdx,r14 + mov r12,QWORD[104+rsi] + mov r13,r11 + mov r14,rdx + bswap r12 + ror r13,23 + mov rdi,rax + + xor r13,r11 + ror r14,5 + xor rdi,rbx + + mov QWORD[104+rsp],r12 + xor r14,rdx + and rdi,r11 + + ror r13,4 + add r12,rcx + xor rdi,rbx + + ror r14,6 + xor r13,r11 + add r12,rdi + + mov rdi,rdx + add r12,QWORD[rbp] + xor r14,rdx + + xor rdi,r8 + ror r13,14 + mov rcx,r8 + + and r15,rdi + ror r14,28 + add r12,r13 + + xor rcx,r15 + add r10,r12 + add rcx,r12 + + lea rbp,[24+rbp] + add rcx,r14 + mov r12,QWORD[112+rsi] + mov r13,r10 + mov r14,rcx + bswap r12 + ror r13,23 + mov r15,r11 + + xor r13,r10 + ror r14,5 + xor r15,rax + + mov QWORD[112+rsp],r12 + xor r14,rcx + and r15,r10 + + ror r13,4 + add r12,rbx + xor r15,rax + + ror r14,6 + xor r13,r10 + add r12,r15 + + mov r15,rcx + add r12,QWORD[rbp] + xor r14,rcx + + xor r15,rdx + ror r13,14 + mov rbx,rdx + + and rdi,r15 + ror r14,28 + add r12,r13 + + xor rbx,rdi + add r9,r12 + add rbx,r12 + + lea rbp,[8+rbp] + add rbx,r14 + mov r12,QWORD[120+rsi] + mov r13,r9 + mov r14,rbx + bswap r12 + ror r13,23 + mov rdi,r10 + + xor r13,r9 + ror r14,5 + xor rdi,r11 + + mov QWORD[120+rsp],r12 + xor r14,rbx + and rdi,r9 + + ror r13,4 + add r12,rax + xor rdi,r11 + + ror r14,6 + xor r13,r9 + add r12,rdi + + mov rdi,rbx + add r12,QWORD[rbp] + xor r14,rbx + + xor rdi,rcx + ror r13,14 + mov rax,rcx + + and r15,rdi + ror r14,28 + add r12,r13 + + xor rax,r15 + add r8,r12 + add rax,r12 + + lea rbp,[24+rbp] + jmp NEAR $L$rounds_16_xx +ALIGN 16 +$L$rounds_16_xx: + mov r13,QWORD[8+rsp] + mov r15,QWORD[112+rsp] + + mov r12,r13 + ror r13,7 + add rax,r14 + mov r14,r15 + ror r15,42 + + xor r13,r12 + shr r12,7 + ror r13,1 + xor r15,r14 + shr r14,6 + + ror r15,19 + xor r12,r13 + xor r15,r14 + add r12,QWORD[72+rsp] + + add r12,QWORD[rsp] + mov r13,r8 + add r12,r15 + mov r14,rax + ror r13,23 + mov r15,r9 + + xor r13,r8 + ror r14,5 + xor r15,r10 + + mov QWORD[rsp],r12 + xor r14,rax + and r15,r8 + + ror r13,4 + add r12,r11 + xor r15,r10 + + ror r14,6 + xor r13,r8 + add r12,r15 + + mov r15,rax + add r12,QWORD[rbp] + xor r14,rax + + xor r15,rbx + ror r13,14 + mov r11,rbx + + and rdi,r15 + ror r14,28 + add r12,r13 + + xor r11,rdi + add rdx,r12 + 
add r11,r12 + + lea rbp,[8+rbp] + mov r13,QWORD[16+rsp] + mov rdi,QWORD[120+rsp] + + mov r12,r13 + ror r13,7 + add r11,r14 + mov r14,rdi + ror rdi,42 + + xor r13,r12 + shr r12,7 + ror r13,1 + xor rdi,r14 + shr r14,6 + + ror rdi,19 + xor r12,r13 + xor rdi,r14 + add r12,QWORD[80+rsp] + + add r12,QWORD[8+rsp] + mov r13,rdx + add r12,rdi + mov r14,r11 + ror r13,23 + mov rdi,r8 + + xor r13,rdx + ror r14,5 + xor rdi,r9 + + mov QWORD[8+rsp],r12 + xor r14,r11 + and rdi,rdx + + ror r13,4 + add r12,r10 + xor rdi,r9 + + ror r14,6 + xor r13,rdx + add r12,rdi + + mov rdi,r11 + add r12,QWORD[rbp] + xor r14,r11 + + xor rdi,rax + ror r13,14 + mov r10,rax + + and r15,rdi + ror r14,28 + add r12,r13 + + xor r10,r15 + add rcx,r12 + add r10,r12 + + lea rbp,[24+rbp] + mov r13,QWORD[24+rsp] + mov r15,QWORD[rsp] + + mov r12,r13 + ror r13,7 + add r10,r14 + mov r14,r15 + ror r15,42 + + xor r13,r12 + shr r12,7 + ror r13,1 + xor r15,r14 + shr r14,6 + + ror r15,19 + xor r12,r13 + xor r15,r14 + add r12,QWORD[88+rsp] + + add r12,QWORD[16+rsp] + mov r13,rcx + add r12,r15 + mov r14,r10 + ror r13,23 + mov r15,rdx + + xor r13,rcx + ror r14,5 + xor r15,r8 + + mov QWORD[16+rsp],r12 + xor r14,r10 + and r15,rcx + + ror r13,4 + add r12,r9 + xor r15,r8 + + ror r14,6 + xor r13,rcx + add r12,r15 + + mov r15,r10 + add r12,QWORD[rbp] + xor r14,r10 + + xor r15,r11 + ror r13,14 + mov r9,r11 + + and rdi,r15 + ror r14,28 + add r12,r13 + + xor r9,rdi + add rbx,r12 + add r9,r12 + + lea rbp,[8+rbp] + mov r13,QWORD[32+rsp] + mov rdi,QWORD[8+rsp] + + mov r12,r13 + ror r13,7 + add r9,r14 + mov r14,rdi + ror rdi,42 + + xor r13,r12 + shr r12,7 + ror r13,1 + xor rdi,r14 + shr r14,6 + + ror rdi,19 + xor r12,r13 + xor rdi,r14 + add r12,QWORD[96+rsp] + + add r12,QWORD[24+rsp] + mov r13,rbx + add r12,rdi + mov r14,r9 + ror r13,23 + mov rdi,rcx + + xor r13,rbx + ror r14,5 + xor rdi,rdx + + mov QWORD[24+rsp],r12 + xor r14,r9 + and rdi,rbx + + ror r13,4 + add r12,r8 + xor rdi,rdx + + ror r14,6 + xor r13,rbx + add r12,rdi + + mov rdi,r9 + add r12,QWORD[rbp] + xor r14,r9 + + xor rdi,r10 + ror r13,14 + mov r8,r10 + + and r15,rdi + ror r14,28 + add r12,r13 + + xor r8,r15 + add rax,r12 + add r8,r12 + + lea rbp,[24+rbp] + mov r13,QWORD[40+rsp] + mov r15,QWORD[16+rsp] + + mov r12,r13 + ror r13,7 + add r8,r14 + mov r14,r15 + ror r15,42 + + xor r13,r12 + shr r12,7 + ror r13,1 + xor r15,r14 + shr r14,6 + + ror r15,19 + xor r12,r13 + xor r15,r14 + add r12,QWORD[104+rsp] + + add r12,QWORD[32+rsp] + mov r13,rax + add r12,r15 + mov r14,r8 + ror r13,23 + mov r15,rbx + + xor r13,rax + ror r14,5 + xor r15,rcx + + mov QWORD[32+rsp],r12 + xor r14,r8 + and r15,rax + + ror r13,4 + add r12,rdx + xor r15,rcx + + ror r14,6 + xor r13,rax + add r12,r15 + + mov r15,r8 + add r12,QWORD[rbp] + xor r14,r8 + + xor r15,r9 + ror r13,14 + mov rdx,r9 + + and rdi,r15 + ror r14,28 + add r12,r13 + + xor rdx,rdi + add r11,r12 + add rdx,r12 + + lea rbp,[8+rbp] + mov r13,QWORD[48+rsp] + mov rdi,QWORD[24+rsp] + + mov r12,r13 + ror r13,7 + add rdx,r14 + mov r14,rdi + ror rdi,42 + + xor r13,r12 + shr r12,7 + ror r13,1 + xor rdi,r14 + shr r14,6 + + ror rdi,19 + xor r12,r13 + xor rdi,r14 + add r12,QWORD[112+rsp] + + add r12,QWORD[40+rsp] + mov r13,r11 + add r12,rdi + mov r14,rdx + ror r13,23 + mov rdi,rax + + xor r13,r11 + ror r14,5 + xor rdi,rbx + + mov QWORD[40+rsp],r12 + xor r14,rdx + and rdi,r11 + + ror r13,4 + add r12,rcx + xor rdi,rbx + + ror r14,6 + xor r13,r11 + add r12,rdi + + mov rdi,rdx + add r12,QWORD[rbp] + xor r14,rdx + + xor rdi,r8 + ror r13,14 + mov rcx,r8 + + and r15,rdi + ror 
r14,28 + add r12,r13 + + xor rcx,r15 + add r10,r12 + add rcx,r12 + + lea rbp,[24+rbp] + mov r13,QWORD[56+rsp] + mov r15,QWORD[32+rsp] + + mov r12,r13 + ror r13,7 + add rcx,r14 + mov r14,r15 + ror r15,42 + + xor r13,r12 + shr r12,7 + ror r13,1 + xor r15,r14 + shr r14,6 + + ror r15,19 + xor r12,r13 + xor r15,r14 + add r12,QWORD[120+rsp] + + add r12,QWORD[48+rsp] + mov r13,r10 + add r12,r15 + mov r14,rcx + ror r13,23 + mov r15,r11 + + xor r13,r10 + ror r14,5 + xor r15,rax + + mov QWORD[48+rsp],r12 + xor r14,rcx + and r15,r10 + + ror r13,4 + add r12,rbx + xor r15,rax + + ror r14,6 + xor r13,r10 + add r12,r15 + + mov r15,rcx + add r12,QWORD[rbp] + xor r14,rcx + + xor r15,rdx + ror r13,14 + mov rbx,rdx + + and rdi,r15 + ror r14,28 + add r12,r13 + + xor rbx,rdi + add r9,r12 + add rbx,r12 + + lea rbp,[8+rbp] + mov r13,QWORD[64+rsp] + mov rdi,QWORD[40+rsp] + + mov r12,r13 + ror r13,7 + add rbx,r14 + mov r14,rdi + ror rdi,42 + + xor r13,r12 + shr r12,7 + ror r13,1 + xor rdi,r14 + shr r14,6 + + ror rdi,19 + xor r12,r13 + xor rdi,r14 + add r12,QWORD[rsp] + + add r12,QWORD[56+rsp] + mov r13,r9 + add r12,rdi + mov r14,rbx + ror r13,23 + mov rdi,r10 + + xor r13,r9 + ror r14,5 + xor rdi,r11 + + mov QWORD[56+rsp],r12 + xor r14,rbx + and rdi,r9 + + ror r13,4 + add r12,rax + xor rdi,r11 + + ror r14,6 + xor r13,r9 + add r12,rdi + + mov rdi,rbx + add r12,QWORD[rbp] + xor r14,rbx + + xor rdi,rcx + ror r13,14 + mov rax,rcx + + and r15,rdi + ror r14,28 + add r12,r13 + + xor rax,r15 + add r8,r12 + add rax,r12 + + lea rbp,[24+rbp] + mov r13,QWORD[72+rsp] + mov r15,QWORD[48+rsp] + + mov r12,r13 + ror r13,7 + add rax,r14 + mov r14,r15 + ror r15,42 + + xor r13,r12 + shr r12,7 + ror r13,1 + xor r15,r14 + shr r14,6 + + ror r15,19 + xor r12,r13 + xor r15,r14 + add r12,QWORD[8+rsp] + + add r12,QWORD[64+rsp] + mov r13,r8 + add r12,r15 + mov r14,rax + ror r13,23 + mov r15,r9 + + xor r13,r8 + ror r14,5 + xor r15,r10 + + mov QWORD[64+rsp],r12 + xor r14,rax + and r15,r8 + + ror r13,4 + add r12,r11 + xor r15,r10 + + ror r14,6 + xor r13,r8 + add r12,r15 + + mov r15,rax + add r12,QWORD[rbp] + xor r14,rax + + xor r15,rbx + ror r13,14 + mov r11,rbx + + and rdi,r15 + ror r14,28 + add r12,r13 + + xor r11,rdi + add rdx,r12 + add r11,r12 + + lea rbp,[8+rbp] + mov r13,QWORD[80+rsp] + mov rdi,QWORD[56+rsp] + + mov r12,r13 + ror r13,7 + add r11,r14 + mov r14,rdi + ror rdi,42 + + xor r13,r12 + shr r12,7 + ror r13,1 + xor rdi,r14 + shr r14,6 + + ror rdi,19 + xor r12,r13 + xor rdi,r14 + add r12,QWORD[16+rsp] + + add r12,QWORD[72+rsp] + mov r13,rdx + add r12,rdi + mov r14,r11 + ror r13,23 + mov rdi,r8 + + xor r13,rdx + ror r14,5 + xor rdi,r9 + + mov QWORD[72+rsp],r12 + xor r14,r11 + and rdi,rdx + + ror r13,4 + add r12,r10 + xor rdi,r9 + + ror r14,6 + xor r13,rdx + add r12,rdi + + mov rdi,r11 + add r12,QWORD[rbp] + xor r14,r11 + + xor rdi,rax + ror r13,14 + mov r10,rax + + and r15,rdi + ror r14,28 + add r12,r13 + + xor r10,r15 + add rcx,r12 + add r10,r12 + + lea rbp,[24+rbp] + mov r13,QWORD[88+rsp] + mov r15,QWORD[64+rsp] + + mov r12,r13 + ror r13,7 + add r10,r14 + mov r14,r15 + ror r15,42 + + xor r13,r12 + shr r12,7 + ror r13,1 + xor r15,r14 + shr r14,6 + + ror r15,19 + xor r12,r13 + xor r15,r14 + add r12,QWORD[24+rsp] + + add r12,QWORD[80+rsp] + mov r13,rcx + add r12,r15 + mov r14,r10 + ror r13,23 + mov r15,rdx + + xor r13,rcx + ror r14,5 + xor r15,r8 + + mov QWORD[80+rsp],r12 + xor r14,r10 + and r15,rcx + + ror r13,4 + add r12,r9 + xor r15,r8 + + ror r14,6 + xor r13,rcx + add r12,r15 + + mov r15,r10 + add r12,QWORD[rbp] + xor r14,r10 + + 
xor r15,r11 + ror r13,14 + mov r9,r11 + + and rdi,r15 + ror r14,28 + add r12,r13 + + xor r9,rdi + add rbx,r12 + add r9,r12 + + lea rbp,[8+rbp] + mov r13,QWORD[96+rsp] + mov rdi,QWORD[72+rsp] + + mov r12,r13 + ror r13,7 + add r9,r14 + mov r14,rdi + ror rdi,42 + + xor r13,r12 + shr r12,7 + ror r13,1 + xor rdi,r14 + shr r14,6 + + ror rdi,19 + xor r12,r13 + xor rdi,r14 + add r12,QWORD[32+rsp] + + add r12,QWORD[88+rsp] + mov r13,rbx + add r12,rdi + mov r14,r9 + ror r13,23 + mov rdi,rcx + + xor r13,rbx + ror r14,5 + xor rdi,rdx + + mov QWORD[88+rsp],r12 + xor r14,r9 + and rdi,rbx + + ror r13,4 + add r12,r8 + xor rdi,rdx + + ror r14,6 + xor r13,rbx + add r12,rdi + + mov rdi,r9 + add r12,QWORD[rbp] + xor r14,r9 + + xor rdi,r10 + ror r13,14 + mov r8,r10 + + and r15,rdi + ror r14,28 + add r12,r13 + + xor r8,r15 + add rax,r12 + add r8,r12 + + lea rbp,[24+rbp] + mov r13,QWORD[104+rsp] + mov r15,QWORD[80+rsp] + + mov r12,r13 + ror r13,7 + add r8,r14 + mov r14,r15 + ror r15,42 + + xor r13,r12 + shr r12,7 + ror r13,1 + xor r15,r14 + shr r14,6 + + ror r15,19 + xor r12,r13 + xor r15,r14 + add r12,QWORD[40+rsp] + + add r12,QWORD[96+rsp] + mov r13,rax + add r12,r15 + mov r14,r8 + ror r13,23 + mov r15,rbx + + xor r13,rax + ror r14,5 + xor r15,rcx + + mov QWORD[96+rsp],r12 + xor r14,r8 + and r15,rax + + ror r13,4 + add r12,rdx + xor r15,rcx + + ror r14,6 + xor r13,rax + add r12,r15 + + mov r15,r8 + add r12,QWORD[rbp] + xor r14,r8 + + xor r15,r9 + ror r13,14 + mov rdx,r9 + + and rdi,r15 + ror r14,28 + add r12,r13 + + xor rdx,rdi + add r11,r12 + add rdx,r12 + + lea rbp,[8+rbp] + mov r13,QWORD[112+rsp] + mov rdi,QWORD[88+rsp] + + mov r12,r13 + ror r13,7 + add rdx,r14 + mov r14,rdi + ror rdi,42 + + xor r13,r12 + shr r12,7 + ror r13,1 + xor rdi,r14 + shr r14,6 + + ror rdi,19 + xor r12,r13 + xor rdi,r14 + add r12,QWORD[48+rsp] + + add r12,QWORD[104+rsp] + mov r13,r11 + add r12,rdi + mov r14,rdx + ror r13,23 + mov rdi,rax + + xor r13,r11 + ror r14,5 + xor rdi,rbx + + mov QWORD[104+rsp],r12 + xor r14,rdx + and rdi,r11 + + ror r13,4 + add r12,rcx + xor rdi,rbx + + ror r14,6 + xor r13,r11 + add r12,rdi + + mov rdi,rdx + add r12,QWORD[rbp] + xor r14,rdx + + xor rdi,r8 + ror r13,14 + mov rcx,r8 + + and r15,rdi + ror r14,28 + add r12,r13 + + xor rcx,r15 + add r10,r12 + add rcx,r12 + + lea rbp,[24+rbp] + mov r13,QWORD[120+rsp] + mov r15,QWORD[96+rsp] + + mov r12,r13 + ror r13,7 + add rcx,r14 + mov r14,r15 + ror r15,42 + + xor r13,r12 + shr r12,7 + ror r13,1 + xor r15,r14 + shr r14,6 + + ror r15,19 + xor r12,r13 + xor r15,r14 + add r12,QWORD[56+rsp] + + add r12,QWORD[112+rsp] + mov r13,r10 + add r12,r15 + mov r14,rcx + ror r13,23 + mov r15,r11 + + xor r13,r10 + ror r14,5 + xor r15,rax + + mov QWORD[112+rsp],r12 + xor r14,rcx + and r15,r10 + + ror r13,4 + add r12,rbx + xor r15,rax + + ror r14,6 + xor r13,r10 + add r12,r15 + + mov r15,rcx + add r12,QWORD[rbp] + xor r14,rcx + + xor r15,rdx + ror r13,14 + mov rbx,rdx + + and rdi,r15 + ror r14,28 + add r12,r13 + + xor rbx,rdi + add r9,r12 + add rbx,r12 + + lea rbp,[8+rbp] + mov r13,QWORD[rsp] + mov rdi,QWORD[104+rsp] + + mov r12,r13 + ror r13,7 + add rbx,r14 + mov r14,rdi + ror rdi,42 + + xor r13,r12 + shr r12,7 + ror r13,1 + xor rdi,r14 + shr r14,6 + + ror rdi,19 + xor r12,r13 + xor rdi,r14 + add r12,QWORD[64+rsp] + + add r12,QWORD[120+rsp] + mov r13,r9 + add r12,rdi + mov r14,rbx + ror r13,23 + mov rdi,r10 + + xor r13,r9 + ror r14,5 + xor rdi,r11 + + mov QWORD[120+rsp],r12 + xor r14,rbx + and rdi,r9 + + ror r13,4 + add r12,rax + xor rdi,r11 + + ror r14,6 + xor r13,r9 + add 
r12,rdi + + mov rdi,rbx + add r12,QWORD[rbp] + xor r14,rbx + + xor rdi,rcx + ror r13,14 + mov rax,rcx + + and r15,rdi + ror r14,28 + add r12,r13 + + xor rax,r15 + add r8,r12 + add rax,r12 + + lea rbp,[24+rbp] + cmp BYTE[7+rbp],0 + jnz NEAR $L$rounds_16_xx + + mov rdi,QWORD[((128+0))+rsp] + add rax,r14 + lea rsi,[128+rsi] + + add rax,QWORD[rdi] + add rbx,QWORD[8+rdi] + add rcx,QWORD[16+rdi] + add rdx,QWORD[24+rdi] + add r8,QWORD[32+rdi] + add r9,QWORD[40+rdi] + add r10,QWORD[48+rdi] + add r11,QWORD[56+rdi] + + cmp rsi,QWORD[((128+16))+rsp] + + mov QWORD[rdi],rax + mov QWORD[8+rdi],rbx + mov QWORD[16+rdi],rcx + mov QWORD[24+rdi],rdx + mov QWORD[32+rdi],r8 + mov QWORD[40+rdi],r9 + mov QWORD[48+rdi],r10 + mov QWORD[56+rdi],r11 + jb NEAR $L$loop + + mov rsi,QWORD[152+rsp] + + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbp,QWORD[((-16))+rsi] + + mov rbx,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_sha512_block_data_order_nohw: +section .rdata rdata align=8 +ALIGN 64 + +K512: + DQ 0x428a2f98d728ae22,0x7137449123ef65cd + DQ 0x428a2f98d728ae22,0x7137449123ef65cd + DQ 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc + DQ 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc + DQ 0x3956c25bf348b538,0x59f111f1b605d019 + DQ 0x3956c25bf348b538,0x59f111f1b605d019 + DQ 0x923f82a4af194f9b,0xab1c5ed5da6d8118 + DQ 0x923f82a4af194f9b,0xab1c5ed5da6d8118 + DQ 0xd807aa98a3030242,0x12835b0145706fbe + DQ 0xd807aa98a3030242,0x12835b0145706fbe + DQ 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 + DQ 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 + DQ 0x72be5d74f27b896f,0x80deb1fe3b1696b1 + DQ 0x72be5d74f27b896f,0x80deb1fe3b1696b1 + DQ 0x9bdc06a725c71235,0xc19bf174cf692694 + DQ 0x9bdc06a725c71235,0xc19bf174cf692694 + DQ 0xe49b69c19ef14ad2,0xefbe4786384f25e3 + DQ 0xe49b69c19ef14ad2,0xefbe4786384f25e3 + DQ 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 + DQ 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 + DQ 0x2de92c6f592b0275,0x4a7484aa6ea6e483 + DQ 0x2de92c6f592b0275,0x4a7484aa6ea6e483 + DQ 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 + DQ 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 + DQ 0x983e5152ee66dfab,0xa831c66d2db43210 + DQ 0x983e5152ee66dfab,0xa831c66d2db43210 + DQ 0xb00327c898fb213f,0xbf597fc7beef0ee4 + DQ 0xb00327c898fb213f,0xbf597fc7beef0ee4 + DQ 0xc6e00bf33da88fc2,0xd5a79147930aa725 + DQ 0xc6e00bf33da88fc2,0xd5a79147930aa725 + DQ 0x06ca6351e003826f,0x142929670a0e6e70 + DQ 0x06ca6351e003826f,0x142929670a0e6e70 + DQ 0x27b70a8546d22ffc,0x2e1b21385c26c926 + DQ 0x27b70a8546d22ffc,0x2e1b21385c26c926 + DQ 0x4d2c6dfc5ac42aed,0x53380d139d95b3df + DQ 0x4d2c6dfc5ac42aed,0x53380d139d95b3df + DQ 0x650a73548baf63de,0x766a0abb3c77b2a8 + DQ 0x650a73548baf63de,0x766a0abb3c77b2a8 + DQ 0x81c2c92e47edaee6,0x92722c851482353b + DQ 0x81c2c92e47edaee6,0x92722c851482353b + DQ 0xa2bfe8a14cf10364,0xa81a664bbc423001 + DQ 0xa2bfe8a14cf10364,0xa81a664bbc423001 + DQ 0xc24b8b70d0f89791,0xc76c51a30654be30 + DQ 0xc24b8b70d0f89791,0xc76c51a30654be30 + DQ 0xd192e819d6ef5218,0xd69906245565a910 + DQ 0xd192e819d6ef5218,0xd69906245565a910 + DQ 0xf40e35855771202a,0x106aa07032bbd1b8 + DQ 0xf40e35855771202a,0x106aa07032bbd1b8 + DQ 0x19a4c116b8d2d0c8,0x1e376c085141ab53 + DQ 0x19a4c116b8d2d0c8,0x1e376c085141ab53 + DQ 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 + DQ 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 + DQ 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb + DQ 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb + DQ 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 + DQ 
0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 + DQ 0x748f82ee5defb2fc,0x78a5636f43172f60 + DQ 0x748f82ee5defb2fc,0x78a5636f43172f60 + DQ 0x84c87814a1f0ab72,0x8cc702081a6439ec + DQ 0x84c87814a1f0ab72,0x8cc702081a6439ec + DQ 0x90befffa23631e28,0xa4506cebde82bde9 + DQ 0x90befffa23631e28,0xa4506cebde82bde9 + DQ 0xbef9a3f7b2c67915,0xc67178f2e372532b + DQ 0xbef9a3f7b2c67915,0xc67178f2e372532b + DQ 0xca273eceea26619c,0xd186b8c721c0c207 + DQ 0xca273eceea26619c,0xd186b8c721c0c207 + DQ 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 + DQ 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 + DQ 0x06f067aa72176fba,0x0a637dc5a2c898a6 + DQ 0x06f067aa72176fba,0x0a637dc5a2c898a6 + DQ 0x113f9804bef90dae,0x1b710b35131c471b + DQ 0x113f9804bef90dae,0x1b710b35131c471b + DQ 0x28db77f523047d84,0x32caab7b40c72493 + DQ 0x28db77f523047d84,0x32caab7b40c72493 + DQ 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c + DQ 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c + DQ 0x4cc5d4becb3e42b6,0x597f299cfc657e2a + DQ 0x4cc5d4becb3e42b6,0x597f299cfc657e2a + DQ 0x5fcb6fab3ad6faec,0x6c44198c4a475817 + DQ 0x5fcb6fab3ad6faec,0x6c44198c4a475817 + + DQ 0x0001020304050607,0x08090a0b0c0d0e0f + DQ 0x0001020304050607,0x08090a0b0c0d0e0f + DB 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97 + DB 110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54 + DB 52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121 + DB 32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46 + DB 111,114,103,62,0 +section .text + +global sha512_block_data_order_avx + +ALIGN 64 +sha512_block_data_order_avx: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_sha512_block_data_order_avx: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + +_CET_ENDBR + mov rax,rsp + + push rbx + + push rbp + + push r12 + + push r13 + + push r14 + + push r15 + + shl rdx,4 + sub rsp,256 + lea rdx,[rdx*8+rsi] + and rsp,-64 + mov QWORD[((128+0))+rsp],rdi + mov QWORD[((128+8))+rsp],rsi + mov QWORD[((128+16))+rsp],rdx + mov QWORD[152+rsp],rax + + movaps XMMWORD[(128+32)+rsp],xmm6 + movaps XMMWORD[(128+48)+rsp],xmm7 + movaps XMMWORD[(128+64)+rsp],xmm8 + movaps XMMWORD[(128+80)+rsp],xmm9 + movaps XMMWORD[(128+96)+rsp],xmm10 + movaps XMMWORD[(128+112)+rsp],xmm11 +$L$prologue_avx: + + vzeroupper + mov rax,QWORD[rdi] + mov rbx,QWORD[8+rdi] + mov rcx,QWORD[16+rdi] + mov rdx,QWORD[24+rdi] + mov r8,QWORD[32+rdi] + mov r9,QWORD[40+rdi] + mov r10,QWORD[48+rdi] + mov r11,QWORD[56+rdi] + jmp NEAR $L$loop_avx +ALIGN 16 +$L$loop_avx: + vmovdqa xmm11,XMMWORD[((K512+1280))] + vmovdqu xmm0,XMMWORD[rsi] + lea rbp,[((K512+128))] + vmovdqu xmm1,XMMWORD[16+rsi] + vmovdqu xmm2,XMMWORD[32+rsi] + vpshufb xmm0,xmm0,xmm11 + vmovdqu xmm3,XMMWORD[48+rsi] + vpshufb xmm1,xmm1,xmm11 + vmovdqu xmm4,XMMWORD[64+rsi] + vpshufb xmm2,xmm2,xmm11 + vmovdqu xmm5,XMMWORD[80+rsi] + vpshufb xmm3,xmm3,xmm11 + vmovdqu xmm6,XMMWORD[96+rsi] + vpshufb xmm4,xmm4,xmm11 + vmovdqu xmm7,XMMWORD[112+rsi] + vpshufb xmm5,xmm5,xmm11 + vpaddq xmm8,xmm0,XMMWORD[((-128))+rbp] + vpshufb xmm6,xmm6,xmm11 + vpaddq xmm9,xmm1,XMMWORD[((-96))+rbp] + vpshufb xmm7,xmm7,xmm11 + vpaddq xmm10,xmm2,XMMWORD[((-64))+rbp] + vpaddq xmm11,xmm3,XMMWORD[((-32))+rbp] + vmovdqa XMMWORD[rsp],xmm8 + vpaddq xmm8,xmm4,XMMWORD[rbp] + vmovdqa XMMWORD[16+rsp],xmm9 + vpaddq xmm9,xmm5,XMMWORD[32+rbp] + vmovdqa XMMWORD[32+rsp],xmm10 + vpaddq xmm10,xmm6,XMMWORD[64+rbp] + vmovdqa XMMWORD[48+rsp],xmm11 + vpaddq xmm11,xmm7,XMMWORD[96+rbp] + vmovdqa XMMWORD[64+rsp],xmm8 + mov r14,rax + vmovdqa XMMWORD[80+rsp],xmm9 + mov rdi,rbx + vmovdqa XMMWORD[96+rsp],xmm10 + xor rdi,rcx + vmovdqa 
XMMWORD[112+rsp],xmm11 + mov r13,r8 + jmp NEAR $L$avx_00_47 + +ALIGN 16 +$L$avx_00_47: + add rbp,256 + vpalignr xmm8,xmm1,xmm0,8 + shrd r13,r13,23 + mov rax,r14 + vpalignr xmm11,xmm5,xmm4,8 + mov r12,r9 + shrd r14,r14,5 + vpsrlq xmm10,xmm8,1 + xor r13,r8 + xor r12,r10 + vpaddq xmm0,xmm0,xmm11 + shrd r13,r13,4 + xor r14,rax + vpsrlq xmm11,xmm8,7 + and r12,r8 + xor r13,r8 + vpsllq xmm9,xmm8,56 + add r11,QWORD[rsp] + mov r15,rax + vpxor xmm8,xmm11,xmm10 + xor r12,r10 + shrd r14,r14,6 + vpsrlq xmm10,xmm10,7 + xor r15,rbx + add r11,r12 + vpxor xmm8,xmm8,xmm9 + shrd r13,r13,14 + and rdi,r15 + vpsllq xmm9,xmm9,7 + xor r14,rax + add r11,r13 + vpxor xmm8,xmm8,xmm10 + xor rdi,rbx + shrd r14,r14,28 + vpsrlq xmm11,xmm7,6 + add rdx,r11 + add r11,rdi + vpxor xmm8,xmm8,xmm9 + mov r13,rdx + add r14,r11 + vpsllq xmm10,xmm7,3 + shrd r13,r13,23 + mov r11,r14 + vpaddq xmm0,xmm0,xmm8 + mov r12,r8 + shrd r14,r14,5 + vpsrlq xmm9,xmm7,19 + xor r13,rdx + xor r12,r9 + vpxor xmm11,xmm11,xmm10 + shrd r13,r13,4 + xor r14,r11 + vpsllq xmm10,xmm10,42 + and r12,rdx + xor r13,rdx + vpxor xmm11,xmm11,xmm9 + add r10,QWORD[8+rsp] + mov rdi,r11 + vpsrlq xmm9,xmm9,42 + xor r12,r9 + shrd r14,r14,6 + vpxor xmm11,xmm11,xmm10 + xor rdi,rax + add r10,r12 + vpxor xmm11,xmm11,xmm9 + shrd r13,r13,14 + and r15,rdi + vpaddq xmm0,xmm0,xmm11 + xor r14,r11 + add r10,r13 + vpaddq xmm10,xmm0,XMMWORD[((-128))+rbp] + xor r15,rax + shrd r14,r14,28 + add rcx,r10 + add r10,r15 + mov r13,rcx + add r14,r10 + vmovdqa XMMWORD[rsp],xmm10 + vpalignr xmm8,xmm2,xmm1,8 + shrd r13,r13,23 + mov r10,r14 + vpalignr xmm11,xmm6,xmm5,8 + mov r12,rdx + shrd r14,r14,5 + vpsrlq xmm10,xmm8,1 + xor r13,rcx + xor r12,r8 + vpaddq xmm1,xmm1,xmm11 + shrd r13,r13,4 + xor r14,r10 + vpsrlq xmm11,xmm8,7 + and r12,rcx + xor r13,rcx + vpsllq xmm9,xmm8,56 + add r9,QWORD[16+rsp] + mov r15,r10 + vpxor xmm8,xmm11,xmm10 + xor r12,r8 + shrd r14,r14,6 + vpsrlq xmm10,xmm10,7 + xor r15,r11 + add r9,r12 + vpxor xmm8,xmm8,xmm9 + shrd r13,r13,14 + and rdi,r15 + vpsllq xmm9,xmm9,7 + xor r14,r10 + add r9,r13 + vpxor xmm8,xmm8,xmm10 + xor rdi,r11 + shrd r14,r14,28 + vpsrlq xmm11,xmm0,6 + add rbx,r9 + add r9,rdi + vpxor xmm8,xmm8,xmm9 + mov r13,rbx + add r14,r9 + vpsllq xmm10,xmm0,3 + shrd r13,r13,23 + mov r9,r14 + vpaddq xmm1,xmm1,xmm8 + mov r12,rcx + shrd r14,r14,5 + vpsrlq xmm9,xmm0,19 + xor r13,rbx + xor r12,rdx + vpxor xmm11,xmm11,xmm10 + shrd r13,r13,4 + xor r14,r9 + vpsllq xmm10,xmm10,42 + and r12,rbx + xor r13,rbx + vpxor xmm11,xmm11,xmm9 + add r8,QWORD[24+rsp] + mov rdi,r9 + vpsrlq xmm9,xmm9,42 + xor r12,rdx + shrd r14,r14,6 + vpxor xmm11,xmm11,xmm10 + xor rdi,r10 + add r8,r12 + vpxor xmm11,xmm11,xmm9 + shrd r13,r13,14 + and r15,rdi + vpaddq xmm1,xmm1,xmm11 + xor r14,r9 + add r8,r13 + vpaddq xmm10,xmm1,XMMWORD[((-96))+rbp] + xor r15,r10 + shrd r14,r14,28 + add rax,r8 + add r8,r15 + mov r13,rax + add r14,r8 + vmovdqa XMMWORD[16+rsp],xmm10 + vpalignr xmm8,xmm3,xmm2,8 + shrd r13,r13,23 + mov r8,r14 + vpalignr xmm11,xmm7,xmm6,8 + mov r12,rbx + shrd r14,r14,5 + vpsrlq xmm10,xmm8,1 + xor r13,rax + xor r12,rcx + vpaddq xmm2,xmm2,xmm11 + shrd r13,r13,4 + xor r14,r8 + vpsrlq xmm11,xmm8,7 + and r12,rax + xor r13,rax + vpsllq xmm9,xmm8,56 + add rdx,QWORD[32+rsp] + mov r15,r8 + vpxor xmm8,xmm11,xmm10 + xor r12,rcx + shrd r14,r14,6 + vpsrlq xmm10,xmm10,7 + xor r15,r9 + add rdx,r12 + vpxor xmm8,xmm8,xmm9 + shrd r13,r13,14 + and rdi,r15 + vpsllq xmm9,xmm9,7 + xor r14,r8 + add rdx,r13 + vpxor xmm8,xmm8,xmm10 + xor rdi,r9 + shrd r14,r14,28 + vpsrlq xmm11,xmm1,6 + add r11,rdx + add rdx,rdi + vpxor 
xmm8,xmm8,xmm9 + mov r13,r11 + add r14,rdx + vpsllq xmm10,xmm1,3 + shrd r13,r13,23 + mov rdx,r14 + vpaddq xmm2,xmm2,xmm8 + mov r12,rax + shrd r14,r14,5 + vpsrlq xmm9,xmm1,19 + xor r13,r11 + xor r12,rbx + vpxor xmm11,xmm11,xmm10 + shrd r13,r13,4 + xor r14,rdx + vpsllq xmm10,xmm10,42 + and r12,r11 + xor r13,r11 + vpxor xmm11,xmm11,xmm9 + add rcx,QWORD[40+rsp] + mov rdi,rdx + vpsrlq xmm9,xmm9,42 + xor r12,rbx + shrd r14,r14,6 + vpxor xmm11,xmm11,xmm10 + xor rdi,r8 + add rcx,r12 + vpxor xmm11,xmm11,xmm9 + shrd r13,r13,14 + and r15,rdi + vpaddq xmm2,xmm2,xmm11 + xor r14,rdx + add rcx,r13 + vpaddq xmm10,xmm2,XMMWORD[((-64))+rbp] + xor r15,r8 + shrd r14,r14,28 + add r10,rcx + add rcx,r15 + mov r13,r10 + add r14,rcx + vmovdqa XMMWORD[32+rsp],xmm10 + vpalignr xmm8,xmm4,xmm3,8 + shrd r13,r13,23 + mov rcx,r14 + vpalignr xmm11,xmm0,xmm7,8 + mov r12,r11 + shrd r14,r14,5 + vpsrlq xmm10,xmm8,1 + xor r13,r10 + xor r12,rax + vpaddq xmm3,xmm3,xmm11 + shrd r13,r13,4 + xor r14,rcx + vpsrlq xmm11,xmm8,7 + and r12,r10 + xor r13,r10 + vpsllq xmm9,xmm8,56 + add rbx,QWORD[48+rsp] + mov r15,rcx + vpxor xmm8,xmm11,xmm10 + xor r12,rax + shrd r14,r14,6 + vpsrlq xmm10,xmm10,7 + xor r15,rdx + add rbx,r12 + vpxor xmm8,xmm8,xmm9 + shrd r13,r13,14 + and rdi,r15 + vpsllq xmm9,xmm9,7 + xor r14,rcx + add rbx,r13 + vpxor xmm8,xmm8,xmm10 + xor rdi,rdx + shrd r14,r14,28 + vpsrlq xmm11,xmm2,6 + add r9,rbx + add rbx,rdi + vpxor xmm8,xmm8,xmm9 + mov r13,r9 + add r14,rbx + vpsllq xmm10,xmm2,3 + shrd r13,r13,23 + mov rbx,r14 + vpaddq xmm3,xmm3,xmm8 + mov r12,r10 + shrd r14,r14,5 + vpsrlq xmm9,xmm2,19 + xor r13,r9 + xor r12,r11 + vpxor xmm11,xmm11,xmm10 + shrd r13,r13,4 + xor r14,rbx + vpsllq xmm10,xmm10,42 + and r12,r9 + xor r13,r9 + vpxor xmm11,xmm11,xmm9 + add rax,QWORD[56+rsp] + mov rdi,rbx + vpsrlq xmm9,xmm9,42 + xor r12,r11 + shrd r14,r14,6 + vpxor xmm11,xmm11,xmm10 + xor rdi,rcx + add rax,r12 + vpxor xmm11,xmm11,xmm9 + shrd r13,r13,14 + and r15,rdi + vpaddq xmm3,xmm3,xmm11 + xor r14,rbx + add rax,r13 + vpaddq xmm10,xmm3,XMMWORD[((-32))+rbp] + xor r15,rcx + shrd r14,r14,28 + add r8,rax + add rax,r15 + mov r13,r8 + add r14,rax + vmovdqa XMMWORD[48+rsp],xmm10 + vpalignr xmm8,xmm5,xmm4,8 + shrd r13,r13,23 + mov rax,r14 + vpalignr xmm11,xmm1,xmm0,8 + mov r12,r9 + shrd r14,r14,5 + vpsrlq xmm10,xmm8,1 + xor r13,r8 + xor r12,r10 + vpaddq xmm4,xmm4,xmm11 + shrd r13,r13,4 + xor r14,rax + vpsrlq xmm11,xmm8,7 + and r12,r8 + xor r13,r8 + vpsllq xmm9,xmm8,56 + add r11,QWORD[64+rsp] + mov r15,rax + vpxor xmm8,xmm11,xmm10 + xor r12,r10 + shrd r14,r14,6 + vpsrlq xmm10,xmm10,7 + xor r15,rbx + add r11,r12 + vpxor xmm8,xmm8,xmm9 + shrd r13,r13,14 + and rdi,r15 + vpsllq xmm9,xmm9,7 + xor r14,rax + add r11,r13 + vpxor xmm8,xmm8,xmm10 + xor rdi,rbx + shrd r14,r14,28 + vpsrlq xmm11,xmm3,6 + add rdx,r11 + add r11,rdi + vpxor xmm8,xmm8,xmm9 + mov r13,rdx + add r14,r11 + vpsllq xmm10,xmm3,3 + shrd r13,r13,23 + mov r11,r14 + vpaddq xmm4,xmm4,xmm8 + mov r12,r8 + shrd r14,r14,5 + vpsrlq xmm9,xmm3,19 + xor r13,rdx + xor r12,r9 + vpxor xmm11,xmm11,xmm10 + shrd r13,r13,4 + xor r14,r11 + vpsllq xmm10,xmm10,42 + and r12,rdx + xor r13,rdx + vpxor xmm11,xmm11,xmm9 + add r10,QWORD[72+rsp] + mov rdi,r11 + vpsrlq xmm9,xmm9,42 + xor r12,r9 + shrd r14,r14,6 + vpxor xmm11,xmm11,xmm10 + xor rdi,rax + add r10,r12 + vpxor xmm11,xmm11,xmm9 + shrd r13,r13,14 + and r15,rdi + vpaddq xmm4,xmm4,xmm11 + xor r14,r11 + add r10,r13 + vpaddq xmm10,xmm4,XMMWORD[rbp] + xor r15,rax + shrd r14,r14,28 + add rcx,r10 + add r10,r15 + mov r13,rcx + add r14,r10 + vmovdqa XMMWORD[64+rsp],xmm10 
+ vpalignr xmm8,xmm6,xmm5,8 + shrd r13,r13,23 + mov r10,r14 + vpalignr xmm11,xmm2,xmm1,8 + mov r12,rdx + shrd r14,r14,5 + vpsrlq xmm10,xmm8,1 + xor r13,rcx + xor r12,r8 + vpaddq xmm5,xmm5,xmm11 + shrd r13,r13,4 + xor r14,r10 + vpsrlq xmm11,xmm8,7 + and r12,rcx + xor r13,rcx + vpsllq xmm9,xmm8,56 + add r9,QWORD[80+rsp] + mov r15,r10 + vpxor xmm8,xmm11,xmm10 + xor r12,r8 + shrd r14,r14,6 + vpsrlq xmm10,xmm10,7 + xor r15,r11 + add r9,r12 + vpxor xmm8,xmm8,xmm9 + shrd r13,r13,14 + and rdi,r15 + vpsllq xmm9,xmm9,7 + xor r14,r10 + add r9,r13 + vpxor xmm8,xmm8,xmm10 + xor rdi,r11 + shrd r14,r14,28 + vpsrlq xmm11,xmm4,6 + add rbx,r9 + add r9,rdi + vpxor xmm8,xmm8,xmm9 + mov r13,rbx + add r14,r9 + vpsllq xmm10,xmm4,3 + shrd r13,r13,23 + mov r9,r14 + vpaddq xmm5,xmm5,xmm8 + mov r12,rcx + shrd r14,r14,5 + vpsrlq xmm9,xmm4,19 + xor r13,rbx + xor r12,rdx + vpxor xmm11,xmm11,xmm10 + shrd r13,r13,4 + xor r14,r9 + vpsllq xmm10,xmm10,42 + and r12,rbx + xor r13,rbx + vpxor xmm11,xmm11,xmm9 + add r8,QWORD[88+rsp] + mov rdi,r9 + vpsrlq xmm9,xmm9,42 + xor r12,rdx + shrd r14,r14,6 + vpxor xmm11,xmm11,xmm10 + xor rdi,r10 + add r8,r12 + vpxor xmm11,xmm11,xmm9 + shrd r13,r13,14 + and r15,rdi + vpaddq xmm5,xmm5,xmm11 + xor r14,r9 + add r8,r13 + vpaddq xmm10,xmm5,XMMWORD[32+rbp] + xor r15,r10 + shrd r14,r14,28 + add rax,r8 + add r8,r15 + mov r13,rax + add r14,r8 + vmovdqa XMMWORD[80+rsp],xmm10 + vpalignr xmm8,xmm7,xmm6,8 + shrd r13,r13,23 + mov r8,r14 + vpalignr xmm11,xmm3,xmm2,8 + mov r12,rbx + shrd r14,r14,5 + vpsrlq xmm10,xmm8,1 + xor r13,rax + xor r12,rcx + vpaddq xmm6,xmm6,xmm11 + shrd r13,r13,4 + xor r14,r8 + vpsrlq xmm11,xmm8,7 + and r12,rax + xor r13,rax + vpsllq xmm9,xmm8,56 + add rdx,QWORD[96+rsp] + mov r15,r8 + vpxor xmm8,xmm11,xmm10 + xor r12,rcx + shrd r14,r14,6 + vpsrlq xmm10,xmm10,7 + xor r15,r9 + add rdx,r12 + vpxor xmm8,xmm8,xmm9 + shrd r13,r13,14 + and rdi,r15 + vpsllq xmm9,xmm9,7 + xor r14,r8 + add rdx,r13 + vpxor xmm8,xmm8,xmm10 + xor rdi,r9 + shrd r14,r14,28 + vpsrlq xmm11,xmm5,6 + add r11,rdx + add rdx,rdi + vpxor xmm8,xmm8,xmm9 + mov r13,r11 + add r14,rdx + vpsllq xmm10,xmm5,3 + shrd r13,r13,23 + mov rdx,r14 + vpaddq xmm6,xmm6,xmm8 + mov r12,rax + shrd r14,r14,5 + vpsrlq xmm9,xmm5,19 + xor r13,r11 + xor r12,rbx + vpxor xmm11,xmm11,xmm10 + shrd r13,r13,4 + xor r14,rdx + vpsllq xmm10,xmm10,42 + and r12,r11 + xor r13,r11 + vpxor xmm11,xmm11,xmm9 + add rcx,QWORD[104+rsp] + mov rdi,rdx + vpsrlq xmm9,xmm9,42 + xor r12,rbx + shrd r14,r14,6 + vpxor xmm11,xmm11,xmm10 + xor rdi,r8 + add rcx,r12 + vpxor xmm11,xmm11,xmm9 + shrd r13,r13,14 + and r15,rdi + vpaddq xmm6,xmm6,xmm11 + xor r14,rdx + add rcx,r13 + vpaddq xmm10,xmm6,XMMWORD[64+rbp] + xor r15,r8 + shrd r14,r14,28 + add r10,rcx + add rcx,r15 + mov r13,r10 + add r14,rcx + vmovdqa XMMWORD[96+rsp],xmm10 + vpalignr xmm8,xmm0,xmm7,8 + shrd r13,r13,23 + mov rcx,r14 + vpalignr xmm11,xmm4,xmm3,8 + mov r12,r11 + shrd r14,r14,5 + vpsrlq xmm10,xmm8,1 + xor r13,r10 + xor r12,rax + vpaddq xmm7,xmm7,xmm11 + shrd r13,r13,4 + xor r14,rcx + vpsrlq xmm11,xmm8,7 + and r12,r10 + xor r13,r10 + vpsllq xmm9,xmm8,56 + add rbx,QWORD[112+rsp] + mov r15,rcx + vpxor xmm8,xmm11,xmm10 + xor r12,rax + shrd r14,r14,6 + vpsrlq xmm10,xmm10,7 + xor r15,rdx + add rbx,r12 + vpxor xmm8,xmm8,xmm9 + shrd r13,r13,14 + and rdi,r15 + vpsllq xmm9,xmm9,7 + xor r14,rcx + add rbx,r13 + vpxor xmm8,xmm8,xmm10 + xor rdi,rdx + shrd r14,r14,28 + vpsrlq xmm11,xmm6,6 + add r9,rbx + add rbx,rdi + vpxor xmm8,xmm8,xmm9 + mov r13,r9 + add r14,rbx + vpsllq xmm10,xmm6,3 + shrd r13,r13,23 + mov rbx,r14 + 
vpaddq xmm7,xmm7,xmm8 + mov r12,r10 + shrd r14,r14,5 + vpsrlq xmm9,xmm6,19 + xor r13,r9 + xor r12,r11 + vpxor xmm11,xmm11,xmm10 + shrd r13,r13,4 + xor r14,rbx + vpsllq xmm10,xmm10,42 + and r12,r9 + xor r13,r9 + vpxor xmm11,xmm11,xmm9 + add rax,QWORD[120+rsp] + mov rdi,rbx + vpsrlq xmm9,xmm9,42 + xor r12,r11 + shrd r14,r14,6 + vpxor xmm11,xmm11,xmm10 + xor rdi,rcx + add rax,r12 + vpxor xmm11,xmm11,xmm9 + shrd r13,r13,14 + and r15,rdi + vpaddq xmm7,xmm7,xmm11 + xor r14,rbx + add rax,r13 + vpaddq xmm10,xmm7,XMMWORD[96+rbp] + xor r15,rcx + shrd r14,r14,28 + add r8,rax + add rax,r15 + mov r13,r8 + add r14,rax + vmovdqa XMMWORD[112+rsp],xmm10 + cmp BYTE[135+rbp],0 + jne NEAR $L$avx_00_47 + shrd r13,r13,23 + mov rax,r14 + mov r12,r9 + shrd r14,r14,5 + xor r13,r8 + xor r12,r10 + shrd r13,r13,4 + xor r14,rax + and r12,r8 + xor r13,r8 + add r11,QWORD[rsp] + mov r15,rax + xor r12,r10 + shrd r14,r14,6 + xor r15,rbx + add r11,r12 + shrd r13,r13,14 + and rdi,r15 + xor r14,rax + add r11,r13 + xor rdi,rbx + shrd r14,r14,28 + add rdx,r11 + add r11,rdi + mov r13,rdx + add r14,r11 + shrd r13,r13,23 + mov r11,r14 + mov r12,r8 + shrd r14,r14,5 + xor r13,rdx + xor r12,r9 + shrd r13,r13,4 + xor r14,r11 + and r12,rdx + xor r13,rdx + add r10,QWORD[8+rsp] + mov rdi,r11 + xor r12,r9 + shrd r14,r14,6 + xor rdi,rax + add r10,r12 + shrd r13,r13,14 + and r15,rdi + xor r14,r11 + add r10,r13 + xor r15,rax + shrd r14,r14,28 + add rcx,r10 + add r10,r15 + mov r13,rcx + add r14,r10 + shrd r13,r13,23 + mov r10,r14 + mov r12,rdx + shrd r14,r14,5 + xor r13,rcx + xor r12,r8 + shrd r13,r13,4 + xor r14,r10 + and r12,rcx + xor r13,rcx + add r9,QWORD[16+rsp] + mov r15,r10 + xor r12,r8 + shrd r14,r14,6 + xor r15,r11 + add r9,r12 + shrd r13,r13,14 + and rdi,r15 + xor r14,r10 + add r9,r13 + xor rdi,r11 + shrd r14,r14,28 + add rbx,r9 + add r9,rdi + mov r13,rbx + add r14,r9 + shrd r13,r13,23 + mov r9,r14 + mov r12,rcx + shrd r14,r14,5 + xor r13,rbx + xor r12,rdx + shrd r13,r13,4 + xor r14,r9 + and r12,rbx + xor r13,rbx + add r8,QWORD[24+rsp] + mov rdi,r9 + xor r12,rdx + shrd r14,r14,6 + xor rdi,r10 + add r8,r12 + shrd r13,r13,14 + and r15,rdi + xor r14,r9 + add r8,r13 + xor r15,r10 + shrd r14,r14,28 + add rax,r8 + add r8,r15 + mov r13,rax + add r14,r8 + shrd r13,r13,23 + mov r8,r14 + mov r12,rbx + shrd r14,r14,5 + xor r13,rax + xor r12,rcx + shrd r13,r13,4 + xor r14,r8 + and r12,rax + xor r13,rax + add rdx,QWORD[32+rsp] + mov r15,r8 + xor r12,rcx + shrd r14,r14,6 + xor r15,r9 + add rdx,r12 + shrd r13,r13,14 + and rdi,r15 + xor r14,r8 + add rdx,r13 + xor rdi,r9 + shrd r14,r14,28 + add r11,rdx + add rdx,rdi + mov r13,r11 + add r14,rdx + shrd r13,r13,23 + mov rdx,r14 + mov r12,rax + shrd r14,r14,5 + xor r13,r11 + xor r12,rbx + shrd r13,r13,4 + xor r14,rdx + and r12,r11 + xor r13,r11 + add rcx,QWORD[40+rsp] + mov rdi,rdx + xor r12,rbx + shrd r14,r14,6 + xor rdi,r8 + add rcx,r12 + shrd r13,r13,14 + and r15,rdi + xor r14,rdx + add rcx,r13 + xor r15,r8 + shrd r14,r14,28 + add r10,rcx + add rcx,r15 + mov r13,r10 + add r14,rcx + shrd r13,r13,23 + mov rcx,r14 + mov r12,r11 + shrd r14,r14,5 + xor r13,r10 + xor r12,rax + shrd r13,r13,4 + xor r14,rcx + and r12,r10 + xor r13,r10 + add rbx,QWORD[48+rsp] + mov r15,rcx + xor r12,rax + shrd r14,r14,6 + xor r15,rdx + add rbx,r12 + shrd r13,r13,14 + and rdi,r15 + xor r14,rcx + add rbx,r13 + xor rdi,rdx + shrd r14,r14,28 + add r9,rbx + add rbx,rdi + mov r13,r9 + add r14,rbx + shrd r13,r13,23 + mov rbx,r14 + mov r12,r10 + shrd r14,r14,5 + xor r13,r9 + xor r12,r11 + shrd r13,r13,4 + xor r14,rbx + and r12,r9 + 
xor r13,r9 + add rax,QWORD[56+rsp] + mov rdi,rbx + xor r12,r11 + shrd r14,r14,6 + xor rdi,rcx + add rax,r12 + shrd r13,r13,14 + and r15,rdi + xor r14,rbx + add rax,r13 + xor r15,rcx + shrd r14,r14,28 + add r8,rax + add rax,r15 + mov r13,r8 + add r14,rax + shrd r13,r13,23 + mov rax,r14 + mov r12,r9 + shrd r14,r14,5 + xor r13,r8 + xor r12,r10 + shrd r13,r13,4 + xor r14,rax + and r12,r8 + xor r13,r8 + add r11,QWORD[64+rsp] + mov r15,rax + xor r12,r10 + shrd r14,r14,6 + xor r15,rbx + add r11,r12 + shrd r13,r13,14 + and rdi,r15 + xor r14,rax + add r11,r13 + xor rdi,rbx + shrd r14,r14,28 + add rdx,r11 + add r11,rdi + mov r13,rdx + add r14,r11 + shrd r13,r13,23 + mov r11,r14 + mov r12,r8 + shrd r14,r14,5 + xor r13,rdx + xor r12,r9 + shrd r13,r13,4 + xor r14,r11 + and r12,rdx + xor r13,rdx + add r10,QWORD[72+rsp] + mov rdi,r11 + xor r12,r9 + shrd r14,r14,6 + xor rdi,rax + add r10,r12 + shrd r13,r13,14 + and r15,rdi + xor r14,r11 + add r10,r13 + xor r15,rax + shrd r14,r14,28 + add rcx,r10 + add r10,r15 + mov r13,rcx + add r14,r10 + shrd r13,r13,23 + mov r10,r14 + mov r12,rdx + shrd r14,r14,5 + xor r13,rcx + xor r12,r8 + shrd r13,r13,4 + xor r14,r10 + and r12,rcx + xor r13,rcx + add r9,QWORD[80+rsp] + mov r15,r10 + xor r12,r8 + shrd r14,r14,6 + xor r15,r11 + add r9,r12 + shrd r13,r13,14 + and rdi,r15 + xor r14,r10 + add r9,r13 + xor rdi,r11 + shrd r14,r14,28 + add rbx,r9 + add r9,rdi + mov r13,rbx + add r14,r9 + shrd r13,r13,23 + mov r9,r14 + mov r12,rcx + shrd r14,r14,5 + xor r13,rbx + xor r12,rdx + shrd r13,r13,4 + xor r14,r9 + and r12,rbx + xor r13,rbx + add r8,QWORD[88+rsp] + mov rdi,r9 + xor r12,rdx + shrd r14,r14,6 + xor rdi,r10 + add r8,r12 + shrd r13,r13,14 + and r15,rdi + xor r14,r9 + add r8,r13 + xor r15,r10 + shrd r14,r14,28 + add rax,r8 + add r8,r15 + mov r13,rax + add r14,r8 + shrd r13,r13,23 + mov r8,r14 + mov r12,rbx + shrd r14,r14,5 + xor r13,rax + xor r12,rcx + shrd r13,r13,4 + xor r14,r8 + and r12,rax + xor r13,rax + add rdx,QWORD[96+rsp] + mov r15,r8 + xor r12,rcx + shrd r14,r14,6 + xor r15,r9 + add rdx,r12 + shrd r13,r13,14 + and rdi,r15 + xor r14,r8 + add rdx,r13 + xor rdi,r9 + shrd r14,r14,28 + add r11,rdx + add rdx,rdi + mov r13,r11 + add r14,rdx + shrd r13,r13,23 + mov rdx,r14 + mov r12,rax + shrd r14,r14,5 + xor r13,r11 + xor r12,rbx + shrd r13,r13,4 + xor r14,rdx + and r12,r11 + xor r13,r11 + add rcx,QWORD[104+rsp] + mov rdi,rdx + xor r12,rbx + shrd r14,r14,6 + xor rdi,r8 + add rcx,r12 + shrd r13,r13,14 + and r15,rdi + xor r14,rdx + add rcx,r13 + xor r15,r8 + shrd r14,r14,28 + add r10,rcx + add rcx,r15 + mov r13,r10 + add r14,rcx + shrd r13,r13,23 + mov rcx,r14 + mov r12,r11 + shrd r14,r14,5 + xor r13,r10 + xor r12,rax + shrd r13,r13,4 + xor r14,rcx + and r12,r10 + xor r13,r10 + add rbx,QWORD[112+rsp] + mov r15,rcx + xor r12,rax + shrd r14,r14,6 + xor r15,rdx + add rbx,r12 + shrd r13,r13,14 + and rdi,r15 + xor r14,rcx + add rbx,r13 + xor rdi,rdx + shrd r14,r14,28 + add r9,rbx + add rbx,rdi + mov r13,r9 + add r14,rbx + shrd r13,r13,23 + mov rbx,r14 + mov r12,r10 + shrd r14,r14,5 + xor r13,r9 + xor r12,r11 + shrd r13,r13,4 + xor r14,rbx + and r12,r9 + xor r13,r9 + add rax,QWORD[120+rsp] + mov rdi,rbx + xor r12,r11 + shrd r14,r14,6 + xor rdi,rcx + add rax,r12 + shrd r13,r13,14 + and r15,rdi + xor r14,rbx + add rax,r13 + xor r15,rcx + shrd r14,r14,28 + add r8,rax + add rax,r15 + mov r13,r8 + add r14,rax + mov rdi,QWORD[((128+0))+rsp] + mov rax,r14 + + add rax,QWORD[rdi] + lea rsi,[128+rsi] + add rbx,QWORD[8+rdi] + add rcx,QWORD[16+rdi] + add rdx,QWORD[24+rdi] + add 
r8,QWORD[32+rdi] + add r9,QWORD[40+rdi] + add r10,QWORD[48+rdi] + add r11,QWORD[56+rdi] + + cmp rsi,QWORD[((128+16))+rsp] + + mov QWORD[rdi],rax + mov QWORD[8+rdi],rbx + mov QWORD[16+rdi],rcx + mov QWORD[24+rdi],rdx + mov QWORD[32+rdi],r8 + mov QWORD[40+rdi],r9 + mov QWORD[48+rdi],r10 + mov QWORD[56+rdi],r11 + jb NEAR $L$loop_avx + + mov rsi,QWORD[152+rsp] + + vzeroupper + movaps xmm6,XMMWORD[((128+32))+rsp] + movaps xmm7,XMMWORD[((128+48))+rsp] + movaps xmm8,XMMWORD[((128+64))+rsp] + movaps xmm9,XMMWORD[((128+80))+rsp] + movaps xmm10,XMMWORD[((128+96))+rsp] + movaps xmm11,XMMWORD[((128+112))+rsp] + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbp,QWORD[((-16))+rsi] + + mov rbx,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$epilogue_avx: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_sha512_block_data_order_avx: +EXTERN __imp_RtlVirtualUnwind + +ALIGN 16 +se_handler: + push rsi + push rdi + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + pushfq + sub rsp,64 + + mov rax,QWORD[120+r8] + mov rbx,QWORD[248+r8] + + mov rsi,QWORD[8+r9] + mov r11,QWORD[56+r9] + + mov r10d,DWORD[r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jb NEAR $L$in_prologue + + mov rax,QWORD[152+r8] + + mov r10d,DWORD[4+r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jae NEAR $L$in_prologue + mov rsi,rax + mov rax,QWORD[((128+24))+rax] + + mov rbx,QWORD[((-8))+rax] + mov rbp,QWORD[((-16))+rax] + mov r12,QWORD[((-24))+rax] + mov r13,QWORD[((-32))+rax] + mov r14,QWORD[((-40))+rax] + mov r15,QWORD[((-48))+rax] + mov QWORD[144+r8],rbx + mov QWORD[160+r8],rbp + mov QWORD[216+r8],r12 + mov QWORD[224+r8],r13 + mov QWORD[232+r8],r14 + mov QWORD[240+r8],r15 + + lea r10,[$L$epilogue] + cmp rbx,r10 + jb NEAR $L$in_prologue + + lea rsi,[((128+32))+rsi] + lea rdi,[512+r8] + mov ecx,12 + DD 0xa548f3fc + +$L$in_prologue: + mov rdi,QWORD[8+rax] + mov rsi,QWORD[16+rax] + mov QWORD[152+r8],rax + mov QWORD[168+r8],rsi + mov QWORD[176+r8],rdi + + mov rdi,QWORD[40+r9] + mov rsi,r8 + mov ecx,154 + DD 0xa548f3fc + + mov rsi,r9 + xor rcx,rcx + mov rdx,QWORD[8+rsi] + mov r8,QWORD[rsi] + mov r9,QWORD[16+rsi] + mov r10,QWORD[40+rsi] + lea r11,[56+rsi] + lea r12,[24+rsi] + mov QWORD[32+rsp],r10 + mov QWORD[40+rsp],r11 + mov QWORD[48+rsp],r12 + mov QWORD[56+rsp],rcx + call QWORD[__imp_RtlVirtualUnwind] + + mov eax,1 + add rsp,64 + popfq + pop r15 + pop r14 + pop r13 + pop r12 + pop rbp + pop rbx + pop rdi + pop rsi + ret + +section .pdata rdata align=4 +ALIGN 4 + DD $L$SEH_begin_sha512_block_data_order_nohw wrt ..imagebase + DD $L$SEH_end_sha512_block_data_order_nohw wrt ..imagebase + DD $L$SEH_info_sha512_block_data_order_nohw wrt ..imagebase + DD $L$SEH_begin_sha512_block_data_order_avx wrt ..imagebase + DD $L$SEH_end_sha512_block_data_order_avx wrt ..imagebase + DD $L$SEH_info_sha512_block_data_order_avx wrt ..imagebase +section .xdata rdata align=8 +ALIGN 8 +$L$SEH_info_sha512_block_data_order_nohw: + DB 9,0,0,0 + DD se_handler wrt ..imagebase + DD $L$prologue wrt ..imagebase,$L$epilogue wrt ..imagebase +$L$SEH_info_sha512_block_data_order_avx: + DB 9,0,0,0 + DD se_handler wrt ..imagebase + DD $L$prologue_avx wrt ..imagebase,$L$epilogue_avx wrt ..imagebase +%else +; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 +ret +%endif diff --git a/ring-0.17.14/pregenerated/sha512-x86_64-nasm.o b/ring-0.17.14/pregenerated/sha512-x86_64-nasm.o new file mode 100644 index 0000000000..94e2ffe0f2 Binary files /dev/null and 
b/ring-0.17.14/pregenerated/sha512-x86_64-nasm.o differ diff --git a/ring-0.17.14/pregenerated/vpaes-armv7-linux32.S b/ring-0.17.14/pregenerated/vpaes-armv7-linux32.S new file mode 100644 index 0000000000..bdf08a2741 --- /dev/null +++ b/ring-0.17.14/pregenerated/vpaes-armv7-linux32.S @@ -0,0 +1,722 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__ELF__) +.syntax unified + +.arch armv7-a +.fpu neon + +#if defined(__thumb2__) +.thumb +#else +.code 32 +#endif + +.text + +.type _vpaes_consts,%object +.align 7 @ totally strategic alignment +_vpaes_consts: +.Lk_mc_forward:@ mc_forward +.quad 0x0407060500030201, 0x0C0F0E0D080B0A09 +.quad 0x080B0A0904070605, 0x000302010C0F0E0D +.quad 0x0C0F0E0D080B0A09, 0x0407060500030201 +.quad 0x000302010C0F0E0D, 0x080B0A0904070605 +.Lk_mc_backward:@ mc_backward +.quad 0x0605040702010003, 0x0E0D0C0F0A09080B +.quad 0x020100030E0D0C0F, 0x0A09080B06050407 +.quad 0x0E0D0C0F0A09080B, 0x0605040702010003 +.quad 0x0A09080B06050407, 0x020100030E0D0C0F +.Lk_sr:@ sr +.quad 0x0706050403020100, 0x0F0E0D0C0B0A0908 +.quad 0x030E09040F0A0500, 0x0B06010C07020D08 +.quad 0x0F060D040B020900, 0x070E050C030A0108 +.quad 0x0B0E0104070A0D00, 0x0306090C0F020508 + +@ +@ "Hot" constants +@ +.Lk_inv:@ inv, inva +.quad 0x0E05060F0D080180, 0x040703090A0B0C02 +.quad 0x01040A060F0B0780, 0x030D0E0C02050809 +.Lk_ipt:@ input transform (lo, hi) +.quad 0xC2B2E8985A2A7000, 0xCABAE09052227808 +.quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81 +.Lk_sbo:@ sbou, sbot +.quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878 +.quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA +.Lk_sb1:@ sb1u, sb1t +.quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF +.quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544 +.Lk_sb2:@ sb2u, sb2t +.quad 0x69EB88400AE12900, 0xC2A163C8AB82234A +.quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD + +.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,65,82,77,118,55,32,78,69,79,78,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0 +.align 2 +.size _vpaes_consts,.-_vpaes_consts +.align 6 +@@ +@@ _aes_preheat +@@ +@@ Fills q9-q15 as specified below. +@@ +.type _vpaes_preheat,%function +.align 4 +_vpaes_preheat: + adr r10, .Lk_inv + vmov.i8 q9, #0x0f @ .Lk_s0F + vld1.64 {q10,q11}, [r10]! @ .Lk_inv + add r10, r10, #64 @ Skip .Lk_ipt, .Lk_sbo + vld1.64 {q12,q13}, [r10]! @ .Lk_sb1 + vld1.64 {q14,q15}, [r10] @ .Lk_sb2 + bx lr + +@@ +@@ _aes_encrypt_core +@@ +@@ AES-encrypt q0. +@@ +@@ Inputs: +@@ q0 = input +@@ q9-q15 as in _vpaes_preheat +@@ [r2] = scheduled keys +@@ +@@ Output in q0 +@@ Clobbers q1-q5, r8-r11 +@@ Preserves q6-q8 so you get some local vectors +@@ +@@ +.type _vpaes_encrypt_core,%function +.align 4 +_vpaes_encrypt_core: + mov r9, r2 + ldr r8, [r2,#240] @ pull rounds + adr r11, .Lk_ipt + @ vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo + @ vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi + vld1.64 {q2, q3}, [r11] + adr r11, .Lk_mc_forward+16 + vld1.64 {q5}, [r9]! 
@ vmovdqu (%r9), %xmm5 # round0 key + vand q1, q0, q9 @ vpand %xmm9, %xmm0, %xmm1 + vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0 + vtbl.8 d2, {q2}, d2 @ vpshufb %xmm1, %xmm2, %xmm1 + vtbl.8 d3, {q2}, d3 + vtbl.8 d4, {q3}, d0 @ vpshufb %xmm0, %xmm3, %xmm2 + vtbl.8 d5, {q3}, d1 + veor q0, q1, q5 @ vpxor %xmm5, %xmm1, %xmm0 + veor q0, q0, q2 @ vpxor %xmm2, %xmm0, %xmm0 + + @ .Lenc_entry ends with a bnz instruction which is normally paired with + @ subs in .Lenc_loop. + tst r8, r8 + b .Lenc_entry + +.align 4 +.Lenc_loop: + @ middle of middle round + add r10, r11, #0x40 + vtbl.8 d8, {q13}, d4 @ vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u + vtbl.8 d9, {q13}, d5 + vld1.64 {q1}, [r11]! @ vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[] + vtbl.8 d0, {q12}, d6 @ vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t + vtbl.8 d1, {q12}, d7 + veor q4, q4, q5 @ vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + vtbl.8 d10, {q15}, d4 @ vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u + vtbl.8 d11, {q15}, d5 + veor q0, q0, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 0 = A + vtbl.8 d4, {q14}, d6 @ vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t + vtbl.8 d5, {q14}, d7 + vld1.64 {q4}, [r10] @ vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[] + vtbl.8 d6, {q0}, d2 @ vpshufb %xmm1, %xmm0, %xmm3 # 0 = B + vtbl.8 d7, {q0}, d3 + veor q2, q2, q5 @ vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A + @ Write to q5 instead of q0, so the table and destination registers do + @ not overlap. + vtbl.8 d10, {q0}, d8 @ vpshufb %xmm4, %xmm0, %xmm0 # 3 = D + vtbl.8 d11, {q0}, d9 + veor q3, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B + vtbl.8 d8, {q3}, d2 @ vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C + vtbl.8 d9, {q3}, d3 + @ Here we restore the original q0/q5 usage. + veor q0, q5, q3 @ vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D + and r11, r11, #~(1<<6) @ and $0x30, %r11 # ... mod 4 + veor q0, q0, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D + subs r8, r8, #1 @ nr-- + +.Lenc_entry: + @ top of round + vand q1, q0, q9 @ vpand %xmm0, %xmm9, %xmm1 # 0 = k + vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0 # 1 = i + vtbl.8 d10, {q11}, d2 @ vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k + vtbl.8 d11, {q11}, d3 + veor q1, q1, q0 @ vpxor %xmm0, %xmm1, %xmm1 # 0 = j + vtbl.8 d6, {q10}, d0 @ vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + vtbl.8 d7, {q10}, d1 + vtbl.8 d8, {q10}, d2 @ vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + vtbl.8 d9, {q10}, d3 + veor q3, q3, q5 @ vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + veor q4, q4, q5 @ vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + vtbl.8 d4, {q10}, d6 @ vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak + vtbl.8 d5, {q10}, d7 + vtbl.8 d6, {q10}, d8 @ vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak + vtbl.8 d7, {q10}, d9 + veor q2, q2, q1 @ vpxor %xmm1, %xmm2, %xmm2 # 2 = io + veor q3, q3, q0 @ vpxor %xmm0, %xmm3, %xmm3 # 3 = jo + vld1.64 {q5}, [r9]! @ vmovdqu (%r9), %xmm5 + bne .Lenc_loop + + @ middle of last round + add r10, r11, #0x80 + + adr r11, .Lk_sbo + @ Read to q1 instead of q4, so the vtbl.8 instruction below does not + @ overlap table and destination registers. + vld1.64 {q1}, [r11]! @ vmovdqa -0x60(%r10), %xmm4 # 3 : sbou + vld1.64 {q0}, [r11] @ vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 + vtbl.8 d8, {q1}, d4 @ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou + vtbl.8 d9, {q1}, d5 + vld1.64 {q1}, [r10] @ vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[] + @ Write to q2 instead of q0 below, to avoid overlapping table and + @ destination registers. 
+ vtbl.8 d4, {q0}, d6 @ vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t + vtbl.8 d5, {q0}, d7 + veor q4, q4, q5 @ vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + veor q2, q2, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 0 = A + @ Here we restore the original q0/q2 usage. + vtbl.8 d0, {q2}, d2 @ vpshufb %xmm1, %xmm0, %xmm0 + vtbl.8 d1, {q2}, d3 + bx lr +.size _vpaes_encrypt_core,.-_vpaes_encrypt_core +@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ +@@ @@ +@@ AES key schedule @@ +@@ @@ +@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + +@ This function diverges from both x86_64 and armv7 in which constants are +@ pinned. x86_64 has a common preheat function for all operations. aarch64 +@ separates them because it has enough registers to pin nearly all constants. +@ armv7 does not have enough registers, but needing explicit loads and stores +@ also complicates using x86_64's register allocation directly. +@ +@ We pin some constants for convenience and leave q14 and q15 free to load +@ others on demand. + +@ +@ Key schedule constants +@ +.type _vpaes_key_consts,%object +.align 4 +_vpaes_key_consts: +.Lk_rcon:@ rcon +.quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81 + +.Lk_opt:@ output transform +.quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808 +.quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0 +.Lk_deskew:@ deskew tables: inverts the sbox's "skew" +.quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A +.quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77 +.size _vpaes_key_consts,.-_vpaes_key_consts + +.type _vpaes_key_preheat,%function +.align 4 +_vpaes_key_preheat: + adr r11, .Lk_rcon + vmov.i8 q12, #0x5b @ .Lk_s63 + adr r10, .Lk_inv @ Must be aligned to 8 mod 16. + vmov.i8 q9, #0x0f @ .Lk_s0F + vld1.64 {q10,q11}, [r10] @ .Lk_inv + vld1.64 {q8}, [r11] @ .Lk_rcon + bx lr +.size _vpaes_key_preheat,.-_vpaes_key_preheat + +.type _vpaes_schedule_core,%function +.align 4 +_vpaes_schedule_core: + @ We only need to save lr, but ARM requires an 8-byte stack alignment, + @ so save an extra register. + stmdb sp!, {r3,lr} + + bl _vpaes_key_preheat @ load the tables + + adr r11, .Lk_ipt @ Must be aligned to 8 mod 16. + vld1.64 {q0}, [r0]! @ vmovdqu (%rdi), %xmm0 # load key (unaligned) + + @ input transform + @ Use q4 here rather than q3 so .Lschedule_am_decrypting does not + @ overlap table and destination. + vmov q4, q0 @ vmovdqa %xmm0, %xmm3 + bl _vpaes_schedule_transform + adr r10, .Lk_sr @ Must be aligned to 8 mod 16. + vmov q7, q0 @ vmovdqa %xmm0, %xmm7 + + add r8, r8, r10 + + @ encrypting, output zeroth round key after transform + vst1.64 {q0}, [r2] @ vmovdqu %xmm0, (%rdx) + + @ *ring*: Decryption removed. + +.Lschedule_go: + cmp r1, #192 @ cmp $192, %esi + bhi .Lschedule_256 + @ 128: fall though + +@@ +@@ .schedule_128 +@@ +@@ 128-bit specific part of key schedule. +@@ +@@ This schedule is really simple, because all its parts +@@ are accomplished by the subroutines. +@@ +.Lschedule_128: + mov r0, #10 @ mov $10, %esi + +.Loop_schedule_128: + bl _vpaes_schedule_round + subs r0, r0, #1 @ dec %esi + beq .Lschedule_mangle_last + bl _vpaes_schedule_mangle @ write output + b .Loop_schedule_128 + +@@ +@@ .aes_schedule_256 +@@ +@@ 256-bit specific part of key schedule. +@@ +@@ The structure here is very similar to the 128-bit +@@ schedule, but with an additional "low side" in +@@ q6. The low side's rounds are the same as the +@@ high side's, except no rcon and no rotation. 
+@@ +.align 4 +.Lschedule_256: + vld1.64 {q0}, [r0] @ vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned) + bl _vpaes_schedule_transform @ input transform + mov r0, #7 @ mov $7, %esi + +.Loop_schedule_256: + bl _vpaes_schedule_mangle @ output low result + vmov q6, q0 @ vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6 + + @ high round + bl _vpaes_schedule_round + subs r0, r0, #1 @ dec %esi + beq .Lschedule_mangle_last + bl _vpaes_schedule_mangle + + @ low round. swap xmm7 and xmm6 + vdup.32 q0, d1[1] @ vpshufd $0xFF, %xmm0, %xmm0 + vmov.i8 q4, #0 + vmov q5, q7 @ vmovdqa %xmm7, %xmm5 + vmov q7, q6 @ vmovdqa %xmm6, %xmm7 + bl _vpaes_schedule_low_round + vmov q7, q5 @ vmovdqa %xmm5, %xmm7 + + b .Loop_schedule_256 + +@@ +@@ .aes_schedule_mangle_last +@@ +@@ Mangler for last round of key schedule +@@ Mangles q0 +@@ when encrypting, outputs out(q0) ^ 63 +@@ when decrypting, outputs unskew(q0) +@@ +@@ Always called right before return... jumps to cleanup and exits +@@ +.align 4 +.Lschedule_mangle_last: + @ schedule last round key from xmm0 + adr r11, .Lk_deskew @ lea .Lk_deskew(%rip),%r11 # prepare to deskew + + @ encrypting + vld1.64 {q1}, [r8] @ vmovdqa (%r8,%r10),%xmm1 + adr r11, .Lk_opt @ lea .Lk_opt(%rip), %r11 # prepare to output transform + add r2, r2, #32 @ add $32, %rdx + vmov q2, q0 + vtbl.8 d0, {q2}, d2 @ vpshufb %xmm1, %xmm0, %xmm0 # output permute + vtbl.8 d1, {q2}, d3 + +.Lschedule_mangle_last_dec: + sub r2, r2, #16 @ add $-16, %rdx + veor q0, q0, q12 @ vpxor .Lk_s63(%rip), %xmm0, %xmm0 + bl _vpaes_schedule_transform @ output transform + vst1.64 {q0}, [r2] @ vmovdqu %xmm0, (%rdx) # save last key + + @ cleanup + veor q0, q0, q0 @ vpxor %xmm0, %xmm0, %xmm0 + veor q1, q1, q1 @ vpxor %xmm1, %xmm1, %xmm1 + veor q2, q2, q2 @ vpxor %xmm2, %xmm2, %xmm2 + veor q3, q3, q3 @ vpxor %xmm3, %xmm3, %xmm3 + veor q4, q4, q4 @ vpxor %xmm4, %xmm4, %xmm4 + veor q5, q5, q5 @ vpxor %xmm5, %xmm5, %xmm5 + veor q6, q6, q6 @ vpxor %xmm6, %xmm6, %xmm6 + veor q7, q7, q7 @ vpxor %xmm7, %xmm7, %xmm7 + ldmia sp!, {r3,pc} @ return +.size _vpaes_schedule_core,.-_vpaes_schedule_core + +@@ +@@ .aes_schedule_round +@@ +@@ Runs one main round of the key schedule on q0, q7 +@@ +@@ Specifically, runs subbytes on the high dword of q0 +@@ then rotates it by one byte and xors into the low dword of +@@ q7. +@@ +@@ Adds rcon from low byte of q8, then rotates q8 for +@@ next rcon. +@@ +@@ Smears the dwords of q7 by xoring the low into the +@@ second low, result into third, result into highest. +@@ +@@ Returns results in q7 = q0. +@@ Clobbers q1-q4, r11. +@@ +.type _vpaes_schedule_round,%function +.align 4 +_vpaes_schedule_round: + @ extract rcon from xmm8 + vmov.i8 q4, #0 @ vpxor %xmm4, %xmm4, %xmm4 + vext.8 q1, q8, q4, #15 @ vpalignr $15, %xmm8, %xmm4, %xmm1 + vext.8 q8, q8, q8, #15 @ vpalignr $15, %xmm8, %xmm8, %xmm8 + veor q7, q7, q1 @ vpxor %xmm1, %xmm7, %xmm7 + + @ rotate + vdup.32 q0, d1[1] @ vpshufd $0xFF, %xmm0, %xmm0 + vext.8 q0, q0, q0, #1 @ vpalignr $1, %xmm0, %xmm0, %xmm0 + + @ fall through... + + @ low round: same as high round, but no rotation and no rcon. +_vpaes_schedule_low_round: + @ The x86_64 version pins .Lk_sb1 in %xmm13 and .Lk_sb1+16 in %xmm12. + @ We pin other values in _vpaes_key_preheat, so load them now. 
+ adr r11, .Lk_sb1 + vld1.64 {q14,q15}, [r11] + + @ smear xmm7 + vext.8 q1, q4, q7, #12 @ vpslldq $4, %xmm7, %xmm1 + veor q7, q7, q1 @ vpxor %xmm1, %xmm7, %xmm7 + vext.8 q4, q4, q7, #8 @ vpslldq $8, %xmm7, %xmm4 + + @ subbytes + vand q1, q0, q9 @ vpand %xmm9, %xmm0, %xmm1 # 0 = k + vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0 # 1 = i + veor q7, q7, q4 @ vpxor %xmm4, %xmm7, %xmm7 + vtbl.8 d4, {q11}, d2 @ vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k + vtbl.8 d5, {q11}, d3 + veor q1, q1, q0 @ vpxor %xmm0, %xmm1, %xmm1 # 0 = j + vtbl.8 d6, {q10}, d0 @ vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + vtbl.8 d7, {q10}, d1 + veor q3, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + vtbl.8 d8, {q10}, d2 @ vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + vtbl.8 d9, {q10}, d3 + veor q7, q7, q12 @ vpxor .Lk_s63(%rip), %xmm7, %xmm7 + vtbl.8 d6, {q10}, d6 @ vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak + vtbl.8 d7, {q10}, d7 + veor q4, q4, q2 @ vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + vtbl.8 d4, {q10}, d8 @ vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak + vtbl.8 d5, {q10}, d9 + veor q3, q3, q1 @ vpxor %xmm1, %xmm3, %xmm3 # 2 = io + veor q2, q2, q0 @ vpxor %xmm0, %xmm2, %xmm2 # 3 = jo + vtbl.8 d8, {q15}, d6 @ vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou + vtbl.8 d9, {q15}, d7 + vtbl.8 d2, {q14}, d4 @ vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t + vtbl.8 d3, {q14}, d5 + veor q1, q1, q4 @ vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output + + @ add in smeared stuff + veor q0, q1, q7 @ vpxor %xmm7, %xmm1, %xmm0 + veor q7, q1, q7 @ vmovdqa %xmm0, %xmm7 + bx lr +.size _vpaes_schedule_round,.-_vpaes_schedule_round + +@@ +@@ .aes_schedule_transform +@@ +@@ Linear-transform q0 according to tables at [r11] +@@ +@@ Requires that q9 = 0x0F0F... as in preheat +@@ Output in q0 +@@ Clobbers q1, q2, q14, q15 +@@ +.type _vpaes_schedule_transform,%function +.align 4 +_vpaes_schedule_transform: + vld1.64 {q14,q15}, [r11] @ vmovdqa (%r11), %xmm2 # lo + @ vmovdqa 16(%r11), %xmm1 # hi + vand q1, q0, q9 @ vpand %xmm9, %xmm0, %xmm1 + vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0 + vtbl.8 d4, {q14}, d2 @ vpshufb %xmm1, %xmm2, %xmm2 + vtbl.8 d5, {q14}, d3 + vtbl.8 d0, {q15}, d0 @ vpshufb %xmm0, %xmm1, %xmm0 + vtbl.8 d1, {q15}, d1 + veor q0, q0, q2 @ vpxor %xmm2, %xmm0, %xmm0 + bx lr +.size _vpaes_schedule_transform,.-_vpaes_schedule_transform + +@@ +@@ .aes_schedule_mangle +@@ +@@ Mangles q0 from (basis-transformed) standard version +@@ to our version. +@@ +@@ On encrypt, +@@ xor with 0x63 +@@ multiply by circulant 0,1,1,1 +@@ apply shiftrows transform +@@ +@@ On decrypt, +@@ xor with 0x63 +@@ multiply by "inverse mixcolumns" circulant E,B,D,9 +@@ deskew +@@ apply shiftrows transform +@@ +@@ +@@ Writes out to [r2], and increments or decrements it +@@ Keeps track of round number mod 4 in r8 +@@ Preserves q0 +@@ Clobbers q1-q5 +@@ +.type _vpaes_schedule_mangle,%function +.align 4 +_vpaes_schedule_mangle: + tst r3, r3 + vmov q4, q0 @ vmovdqa %xmm0, %xmm4 # save xmm0 for later + adr r11, .Lk_mc_forward @ Must be aligned to 8 mod 16. + vld1.64 {q5}, [r11] @ vmovdqa .Lk_mc_forward(%rip),%xmm5 + + @ encrypting + @ Write to q2 so we do not overlap table and destination below. 
+ veor q2, q0, q12 @ vpxor .Lk_s63(%rip), %xmm0, %xmm4 + add r2, r2, #16 @ add $16, %rdx + vtbl.8 d8, {q2}, d10 @ vpshufb %xmm5, %xmm4, %xmm4 + vtbl.8 d9, {q2}, d11 + vtbl.8 d2, {q4}, d10 @ vpshufb %xmm5, %xmm4, %xmm1 + vtbl.8 d3, {q4}, d11 + vtbl.8 d6, {q1}, d10 @ vpshufb %xmm5, %xmm1, %xmm3 + vtbl.8 d7, {q1}, d11 + veor q4, q4, q1 @ vpxor %xmm1, %xmm4, %xmm4 + vld1.64 {q1}, [r8] @ vmovdqa (%r8,%r10), %xmm1 + veor q3, q3, q4 @ vpxor %xmm4, %xmm3, %xmm3 + +.Lschedule_mangle_both: + @ Write to q2 so table and destination do not overlap. + vtbl.8 d4, {q3}, d2 @ vpshufb %xmm1, %xmm3, %xmm3 + vtbl.8 d5, {q3}, d3 + add r8, r8, #64-16 @ add $-16, %r8 + and r8, r8, #~(1<<6) @ and $0x30, %r8 + vst1.64 {q2}, [r2] @ vmovdqu %xmm3, (%rdx) + bx lr +.size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle + +.globl vpaes_set_encrypt_key +.hidden vpaes_set_encrypt_key +.type vpaes_set_encrypt_key,%function +.align 4 +vpaes_set_encrypt_key: + stmdb sp!, {r7,r8,r9,r10,r11, lr} + vstmdb sp!, {d8,d9,d10,d11,d12,d13,d14,d15} + + lsr r9, r1, #5 @ shr $5,%eax + add r9, r9, #5 @ $5,%eax + str r9, [r2,#240] @ mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; + + mov r3, #0 @ mov $0,%ecx + mov r8, #0x30 @ mov $0x30,%r8d + bl _vpaes_schedule_core + eor r0, r0, r0 + + vldmia sp!, {d8,d9,d10,d11,d12,d13,d14,d15} + ldmia sp!, {r7,r8,r9,r10,r11, pc} @ return +.size vpaes_set_encrypt_key,.-vpaes_set_encrypt_key + +@ Additional constants for converting to bsaes. +.type _vpaes_convert_consts,%object +.align 4 +_vpaes_convert_consts: +@ .Lk_opt_then_skew applies skew(opt(x)) XOR 0x63, where skew is the linear +@ transform in the AES S-box. 0x63 is incorporated into the low half of the +@ table. This was computed with the following script: +@ +@ def u64s_to_u128(x, y): +@ return x | (y << 64) +@ def u128_to_u64s(w): +@ return w & ((1<<64)-1), w >> 64 +@ def get_byte(w, i): +@ return (w >> (i*8)) & 0xff +@ def apply_table(table, b): +@ lo = b & 0xf +@ hi = b >> 4 +@ return get_byte(table[0], lo) ^ get_byte(table[1], hi) +@ def opt(b): +@ table = [ +@ u64s_to_u128(0xFF9F4929D6B66000, 0xF7974121DEBE6808), +@ u64s_to_u128(0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0), +@ ] +@ return apply_table(table, b) +@ def rot_byte(b, n): +@ return 0xff & ((b << n) | (b >> (8-n))) +@ def skew(x): +@ return (x ^ rot_byte(x, 1) ^ rot_byte(x, 2) ^ rot_byte(x, 3) ^ +@ rot_byte(x, 4)) +@ table = [0, 0] +@ for i in range(16): +@ table[0] |= (skew(opt(i)) ^ 0x63) << (i*8) +@ table[1] |= skew(opt(i<<4)) << (i*8) +@ print(" .quad 0x%016x, 0x%016x" % u128_to_u64s(table[0])) +@ print(" .quad 0x%016x, 0x%016x" % u128_to_u64s(table[1])) +.Lk_opt_then_skew: +.quad 0x9cb8436798bc4763, 0x6440bb9f6044bf9b +.quad 0x1f30062936192f00, 0xb49bad829db284ab + +@ void vpaes_encrypt_key_to_bsaes(AES_KEY *bsaes, const AES_KEY *vpaes); +.globl vpaes_encrypt_key_to_bsaes +.hidden vpaes_encrypt_key_to_bsaes +.type vpaes_encrypt_key_to_bsaes,%function +.align 4 +vpaes_encrypt_key_to_bsaes: + stmdb sp!, {r11, lr} + + @ See _vpaes_schedule_core for the key schedule logic. In particular, + @ _vpaes_schedule_transform(.Lk_ipt) (section 2.2 of the paper), + @ _vpaes_schedule_mangle (section 4.3), and .Lschedule_mangle_last + @ contain the transformations not in the bsaes representation. This + @ function inverts those transforms. + @ + @ Note also that bsaes-armv7.pl expects aes-armv4.pl's key + @ representation, which does not match the other aes_nohw_* + @ implementations. 
The ARM aes_nohw_* stores each 32-bit word + @ byteswapped, as a convenience for (unsupported) big-endian ARM, at the + @ cost of extra REV and VREV32 operations in little-endian ARM. + + vmov.i8 q9, #0x0f @ Required by _vpaes_schedule_transform + adr r2, .Lk_mc_forward @ Must be aligned to 8 mod 16. + add r3, r2, 0x90 @ .Lk_sr+0x10-.Lk_mc_forward = 0x90 (Apple's toolchain doesn't support the expression) + + vld1.64 {q12}, [r2] + vmov.i8 q10, #0x5b @ .Lk_s63 from vpaes-x86_64 + adr r11, .Lk_opt @ Must be aligned to 8 mod 16. + vmov.i8 q11, #0x63 @ .LK_s63 without .Lk_ipt applied + + @ vpaes stores one fewer round count than bsaes, but the number of keys + @ is the same. + ldr r2, [r1,#240] + add r2, r2, #1 + str r2, [r0,#240] + + @ The first key is transformed with _vpaes_schedule_transform(.Lk_ipt). + @ Invert this with .Lk_opt. + vld1.64 {q0}, [r1]! + bl _vpaes_schedule_transform + vrev32.8 q0, q0 + vst1.64 {q0}, [r0]! + + @ The middle keys have _vpaes_schedule_transform(.Lk_ipt) applied, + @ followed by _vpaes_schedule_mangle. _vpaes_schedule_mangle XORs 0x63, + @ multiplies by the circulant 0,1,1,1, then applies ShiftRows. +.Loop_enc_key_to_bsaes: + vld1.64 {q0}, [r1]! + + @ Invert the ShiftRows step (see .Lschedule_mangle_both). Note we cycle + @ r3 in the opposite direction and start at .Lk_sr+0x10 instead of 0x30. + @ We use r3 rather than r8 to avoid a callee-saved register. + vld1.64 {q1}, [r3] + vtbl.8 d4, {q0}, d2 + vtbl.8 d5, {q0}, d3 + add r3, r3, #16 + and r3, r3, #~(1<<6) + vmov q0, q2 + + @ Handle the last key differently. + subs r2, r2, #1 + beq .Loop_enc_key_to_bsaes_last + + @ Multiply by the circulant. This is its own inverse. + vtbl.8 d2, {q0}, d24 + vtbl.8 d3, {q0}, d25 + vmov q0, q1 + vtbl.8 d4, {q1}, d24 + vtbl.8 d5, {q1}, d25 + veor q0, q0, q2 + vtbl.8 d2, {q2}, d24 + vtbl.8 d3, {q2}, d25 + veor q0, q0, q1 + + @ XOR and finish. + veor q0, q0, q10 + bl _vpaes_schedule_transform + vrev32.8 q0, q0 + vst1.64 {q0}, [r0]! + b .Loop_enc_key_to_bsaes + +.Loop_enc_key_to_bsaes_last: + @ The final key does not have a basis transform (note + @ .Lschedule_mangle_last inverts the original transform). It only XORs + @ 0x63 and applies ShiftRows. The latter was already inverted in the + @ loop. Note that, because we act on the original representation, we use + @ q11, not q10. + veor q0, q0, q11 + vrev32.8 q0, q0 + vst1.64 {q0}, [r0] + + @ Wipe registers which contained key material. + veor q0, q0, q0 + veor q1, q1, q1 + veor q2, q2, q2 + + ldmia sp!, {r11, pc} @ return +.size vpaes_encrypt_key_to_bsaes,.-vpaes_encrypt_key_to_bsaes +.globl vpaes_ctr32_encrypt_blocks +.hidden vpaes_ctr32_encrypt_blocks +.type vpaes_ctr32_encrypt_blocks,%function +.align 4 +vpaes_ctr32_encrypt_blocks: + mov ip, sp + stmdb sp!, {r7,r8,r9,r10,r11, lr} + @ This function uses q4-q7 (d8-d15), which are callee-saved. + vstmdb sp!, {d8,d9,d10,d11,d12,d13,d14,d15} + + cmp r2, #0 + @ r8 is passed on the stack. + ldr r8, [ip] + beq .Lctr32_done + + @ _vpaes_encrypt_core expects the key in r2, so swap r2 and r3. + mov r9, r3 + mov r3, r2 + mov r2, r9 + + @ Load the IV and counter portion. + ldr r7, [r8, #12] + vld1.8 {q7}, [r8] + + bl _vpaes_preheat + rev r7, r7 @ The counter is big-endian. + +.Lctr32_loop: + vmov q0, q7 + vld1.8 {q6}, [r0]! @ .Load input ahead of time + bl _vpaes_encrypt_core + veor q0, q0, q6 @ XOR input and result + vst1.8 {q0}, [r1]! + subs r3, r3, #1 + @ Update the counter. 
+ add r7, r7, #1 + rev r9, r7 + vmov.32 d15[1], r9 + bne .Lctr32_loop + +.Lctr32_done: + vldmia sp!, {d8,d9,d10,d11,d12,d13,d14,d15} + ldmia sp!, {r7,r8,r9,r10,r11, pc} @ return +.size vpaes_ctr32_encrypt_blocks,.-vpaes_ctr32_encrypt_blocks +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__ELF__) diff --git a/ring-0.17.14/pregenerated/vpaes-armv8-ios64.S b/ring-0.17.14/pregenerated/vpaes-armv8-ios64.S new file mode 100644 index 0000000000..b4e29dcc17 --- /dev/null +++ b/ring-0.17.14/pregenerated/vpaes-armv8-ios64.S @@ -0,0 +1,744 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__) +.section __TEXT,__const + + +.align 7 // totally strategic alignment +_vpaes_consts: +Lk_mc_forward: // mc_forward +.quad 0x0407060500030201, 0x0C0F0E0D080B0A09 +.quad 0x080B0A0904070605, 0x000302010C0F0E0D +.quad 0x0C0F0E0D080B0A09, 0x0407060500030201 +.quad 0x000302010C0F0E0D, 0x080B0A0904070605 +Lk_mc_backward: // mc_backward +.quad 0x0605040702010003, 0x0E0D0C0F0A09080B +.quad 0x020100030E0D0C0F, 0x0A09080B06050407 +.quad 0x0E0D0C0F0A09080B, 0x0605040702010003 +.quad 0x0A09080B06050407, 0x020100030E0D0C0F +Lk_sr: // sr +.quad 0x0706050403020100, 0x0F0E0D0C0B0A0908 +.quad 0x030E09040F0A0500, 0x0B06010C07020D08 +.quad 0x0F060D040B020900, 0x070E050C030A0108 +.quad 0x0B0E0104070A0D00, 0x0306090C0F020508 + +// +// "Hot" constants +// +Lk_inv: // inv, inva +.quad 0x0E05060F0D080180, 0x040703090A0B0C02 +.quad 0x01040A060F0B0780, 0x030D0E0C02050809 +Lk_ipt: // input transform (lo, hi) +.quad 0xC2B2E8985A2A7000, 0xCABAE09052227808 +.quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81 +Lk_sbo: // sbou, sbot +.quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878 +.quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA +Lk_sb1: // sb1u, sb1t +.quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF +.quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544 +Lk_sb2: // sb2u, sb2t +.quad 0x69EB88400AE12900, 0xC2A163C8AB82234A +.quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD + +// +// Key schedule constants +// +Lk_dksd: // decryption key schedule: invskew x*D +.quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9 +.quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E +Lk_dksb: // decryption key schedule: invskew x*B +.quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99 +.quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8 +Lk_dkse: // decryption key schedule: invskew x*E + 0x63 +.quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086 +.quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487 +Lk_dks9: // decryption key schedule: invskew x*9 +.quad 0xB6116FC87ED9A700, 0x4AED933482255BFC +.quad 0x4576516227143300, 0x8BB89FACE9DAFDCE + +Lk_rcon: // rcon +.quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81 + +Lk_opt: // output transform +.quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808 +.quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0 +Lk_deskew: // deskew tables: inverts the sbox's "skew" +.quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A +.quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77 + +.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,65,82,77,118,56,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0 +.align 2 + +.align 6 + +.text +## +## _aes_preheat +## +## Fills register %r10 -> .aes_consts (so you can -fPIC) +## and %xmm9-%xmm15 as specified below. 
+## + +.align 4 +_vpaes_encrypt_preheat: + adrp x10, Lk_inv@PAGE + add x10, x10, Lk_inv@PAGEOFF + movi v17.16b, #0x0f + ld1 {v18.2d,v19.2d}, [x10],#32 // Lk_inv + ld1 {v20.2d,v21.2d,v22.2d,v23.2d}, [x10],#64 // Lk_ipt, Lk_sbo + ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x10] // Lk_sb1, Lk_sb2 + ret + + +## +## _aes_encrypt_core +## +## AES-encrypt %xmm0. +## +## Inputs: +## %xmm0 = input +## %xmm9-%xmm15 as in _vpaes_preheat +## (%rdx) = scheduled keys +## +## Output in %xmm0 +## Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax +## Preserves %xmm6 - %xmm8 so you get some local vectors +## +## + +.align 4 +_vpaes_encrypt_core: + mov x9, x2 + ldr w8, [x2,#240] // pull rounds + adrp x11, Lk_mc_forward@PAGE+16 + add x11, x11, Lk_mc_forward@PAGEOFF+16 + // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo + ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key + and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 + ushr v0.16b, v7.16b, #4 // vpsrlb $4, %xmm0, %xmm0 + tbl v1.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1 + // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi + tbl v2.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2 + eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0 + eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 + b Lenc_entry + +.align 4 +Lenc_loop: + // middle of middle round + add x10, x11, #0x40 + tbl v4.16b, {v25.16b}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u + ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # Lk_mc_forward[] + tbl v0.16b, {v24.16b}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t + eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + tbl v5.16b, {v27.16b}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A + tbl v2.16b, {v26.16b}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t + ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # Lk_mc_backward[] + tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B + eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A + tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B + tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C + eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D + and x11, x11, #~(1<<6) // and $0x30, %r11 # ... 
mod 4 + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D + sub w8, w8, #1 // nr-- + +Lenc_entry: + // top of round + and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k + ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i + tbl v5.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k + eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j + tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + tbl v2.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak + tbl v3.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak + eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io + eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo + ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5 + cbnz w8, Lenc_loop + + // middle of last round + add x10, x11, #0x80 + // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo + // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 + tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou + ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # Lk_sr[] + tbl v0.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t + eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A + tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0 + ret + + + +.align 4 +_vpaes_encrypt_2x: + mov x9, x2 + ldr w8, [x2,#240] // pull rounds + adrp x11, Lk_mc_forward@PAGE+16 + add x11, x11, Lk_mc_forward@PAGEOFF+16 + // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo + ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key + and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 + ushr v0.16b, v14.16b, #4 // vpsrlb $4, %xmm0, %xmm0 + and v9.16b, v15.16b, v17.16b + ushr v8.16b, v15.16b, #4 + tbl v1.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1 + tbl v9.16b, {v20.16b}, v9.16b + // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi + tbl v2.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2 + tbl v10.16b, {v21.16b}, v8.16b + eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0 + eor v8.16b, v9.16b, v16.16b + eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 + eor v8.16b, v8.16b, v10.16b + b Lenc_2x_entry + +.align 4 +Lenc_2x_loop: + // middle of middle round + add x10, x11, #0x40 + tbl v4.16b, {v25.16b}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u + tbl v12.16b, {v25.16b}, v10.16b + ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # Lk_mc_forward[] + tbl v0.16b, {v24.16b}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t + tbl v8.16b, {v24.16b}, v11.16b + eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + eor v12.16b, v12.16b, v16.16b + tbl v5.16b, {v27.16b}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u + tbl v13.16b, {v27.16b}, v10.16b + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A + eor v8.16b, v8.16b, v12.16b + tbl v2.16b, {v26.16b}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t + tbl v10.16b, {v26.16b}, v11.16b + ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # Lk_mc_backward[] + tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B + tbl v11.16b, {v8.16b}, v1.16b + eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A + eor 
v10.16b, v10.16b, v13.16b + tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D + tbl v8.16b, {v8.16b}, v4.16b + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B + eor v11.16b, v11.16b, v10.16b + tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C + tbl v12.16b, {v11.16b},v1.16b + eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D + eor v8.16b, v8.16b, v11.16b + and x11, x11, #~(1<<6) // and $0x30, %r11 # ... mod 4 + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D + eor v8.16b, v8.16b, v12.16b + sub w8, w8, #1 // nr-- + +Lenc_2x_entry: + // top of round + and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k + ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i + and v9.16b, v8.16b, v17.16b + ushr v8.16b, v8.16b, #4 + tbl v5.16b, {v19.16b},v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k + tbl v13.16b, {v19.16b},v9.16b + eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j + eor v9.16b, v9.16b, v8.16b + tbl v3.16b, {v18.16b},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + tbl v11.16b, {v18.16b},v8.16b + tbl v4.16b, {v18.16b},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + tbl v12.16b, {v18.16b},v9.16b + eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + eor v11.16b, v11.16b, v13.16b + eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + eor v12.16b, v12.16b, v13.16b + tbl v2.16b, {v18.16b},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak + tbl v10.16b, {v18.16b},v11.16b + tbl v3.16b, {v18.16b},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak + tbl v11.16b, {v18.16b},v12.16b + eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io + eor v10.16b, v10.16b, v9.16b + eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo + eor v11.16b, v11.16b, v8.16b + ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5 + cbnz w8, Lenc_2x_loop + + // middle of last round + add x10, x11, #0x80 + // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo + // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 + tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou + tbl v12.16b, {v22.16b}, v10.16b + ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # Lk_sr[] + tbl v0.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t + tbl v8.16b, {v23.16b}, v11.16b + eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + eor v12.16b, v12.16b, v16.16b + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A + eor v8.16b, v8.16b, v12.16b + tbl v0.16b, {v0.16b},v1.16b // vpshufb %xmm1, %xmm0, %xmm0 + tbl v1.16b, {v8.16b},v1.16b + ret + +######################################################## +## ## +## AES key schedule ## +## ## +######################################################## + +.align 4 +_vpaes_key_preheat: + adrp x10, Lk_inv@PAGE + add x10, x10, Lk_inv@PAGEOFF + movi v16.16b, #0x5b // Lk_s63 + adrp x11, Lk_sb1@PAGE + add x11, x11, Lk_sb1@PAGEOFF + movi v17.16b, #0x0f // Lk_s0F + ld1 {v18.2d,v19.2d,v20.2d,v21.2d}, [x10] // Lk_inv, Lk_ipt + adrp x10, Lk_dksd@PAGE + add x10, x10, Lk_dksd@PAGEOFF + ld1 {v22.2d,v23.2d}, [x11] // Lk_sb1 + adrp x11, Lk_mc_forward@PAGE + add x11, x11, Lk_mc_forward@PAGEOFF + ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x10],#64 // Lk_dksd, Lk_dksb + ld1 {v28.2d,v29.2d,v30.2d,v31.2d}, [x10],#64 // Lk_dkse, Lk_dks9 + ld1 {v8.2d}, [x10] // Lk_rcon + ld1 {v9.2d}, [x11] // Lk_mc_forward[0] + ret + + + +.align 4 +_vpaes_schedule_core: + 
AARCH64_SIGN_LINK_REGISTER + stp x29, x30, [sp,#-16]! + add x29,sp,#0 + + bl _vpaes_key_preheat // load the tables + + ld1 {v0.16b}, [x0],#16 // vmovdqu (%rdi), %xmm0 # load key (unaligned) + + // input transform + mov v3.16b, v0.16b // vmovdqa %xmm0, %xmm3 + bl _vpaes_schedule_transform + mov v7.16b, v0.16b // vmovdqa %xmm0, %xmm7 + + adrp x10, Lk_sr@PAGE // lea Lk_sr(%rip),%r10 + add x10, x10, Lk_sr@PAGEOFF + + add x8, x8, x10 + + // encrypting, output zeroth round key after transform + st1 {v0.2d}, [x2] // vmovdqu %xmm0, (%rdx) + + cmp w1, #192 // cmp $192, %esi + b.hi Lschedule_256 + b.eq Lschedule_192 + // 128: fall though + +## +## .schedule_128 +## +## 128-bit specific part of key schedule. +## +## This schedule is really simple, because all its parts +## are accomplished by the subroutines. +## +Lschedule_128: + mov x0, #10 // mov $10, %esi + +Loop_schedule_128: + sub x0, x0, #1 // dec %esi + bl _vpaes_schedule_round + cbz x0, Lschedule_mangle_last + bl _vpaes_schedule_mangle // write output + b Loop_schedule_128 + +## +## .aes_schedule_192 +## +## 192-bit specific part of key schedule. +## +## The main body of this schedule is the same as the 128-bit +## schedule, but with more smearing. The long, high side is +## stored in %xmm7 as before, and the short, low side is in +## the high bits of %xmm6. +## +## This schedule is somewhat nastier, however, because each +## round produces 192 bits of key material, or 1.5 round keys. +## Therefore, on each cycle we do 2 rounds and produce 3 round +## keys. +## +.align 4 +Lschedule_192: + sub x0, x0, #8 + ld1 {v0.16b}, [x0] // vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned) + bl _vpaes_schedule_transform // input transform + mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save short part + eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4 # clear 4 + ins v6.d[0], v4.d[0] // vmovhlps %xmm4, %xmm6, %xmm6 # clobber low side with zeros + mov x0, #4 // mov $4, %esi + +Loop_schedule_192: + sub x0, x0, #1 // dec %esi + bl _vpaes_schedule_round + ext v0.16b, v6.16b, v0.16b, #8 // vpalignr $8,%xmm6,%xmm0,%xmm0 + bl _vpaes_schedule_mangle // save key n + bl _vpaes_schedule_192_smear + bl _vpaes_schedule_mangle // save key n+1 + bl _vpaes_schedule_round + cbz x0, Lschedule_mangle_last + bl _vpaes_schedule_mangle // save key n+2 + bl _vpaes_schedule_192_smear + b Loop_schedule_192 + +## +## .aes_schedule_256 +## +## 256-bit specific part of key schedule. +## +## The structure here is very similar to the 128-bit +## schedule, but with an additional "low side" in +## %xmm6. The low side's rounds are the same as the +## high side's, except no rcon and no rotation. +## +.align 4 +Lschedule_256: + ld1 {v0.16b}, [x0] // vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned) + bl _vpaes_schedule_transform // input transform + mov x0, #7 // mov $7, %esi + +Loop_schedule_256: + sub x0, x0, #1 // dec %esi + bl _vpaes_schedule_mangle // output low result + mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6 + + // high round + bl _vpaes_schedule_round + cbz x0, Lschedule_mangle_last + bl _vpaes_schedule_mangle + + // low round. 
swap xmm7 and xmm6 + dup v0.4s, v0.s[3] // vpshufd $0xFF, %xmm0, %xmm0 + movi v4.16b, #0 + mov v5.16b, v7.16b // vmovdqa %xmm7, %xmm5 + mov v7.16b, v6.16b // vmovdqa %xmm6, %xmm7 + bl _vpaes_schedule_low_round + mov v7.16b, v5.16b // vmovdqa %xmm5, %xmm7 + + b Loop_schedule_256 + +## +## .aes_schedule_mangle_last +## +## Mangler for last round of key schedule +## Mangles %xmm0 +## when encrypting, outputs out(%xmm0) ^ 63 +## when decrypting, outputs unskew(%xmm0) +## +## Always called right before return... jumps to cleanup and exits +## +.align 4 +Lschedule_mangle_last: + // schedule last round key from xmm0 + adrp x11, Lk_deskew@PAGE // lea Lk_deskew(%rip),%r11 # prepare to deskew + add x11, x11, Lk_deskew@PAGEOFF + + cbnz w3, Lschedule_mangle_last_dec + + // encrypting + ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10),%xmm1 + adrp x11, Lk_opt@PAGE // lea Lk_opt(%rip), %r11 # prepare to output transform + add x11, x11, Lk_opt@PAGEOFF + add x2, x2, #32 // add $32, %rdx + tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0 # output permute + +Lschedule_mangle_last_dec: + ld1 {v20.2d,v21.2d}, [x11] // reload constants + sub x2, x2, #16 // add $-16, %rdx + eor v0.16b, v0.16b, v16.16b // vpxor Lk_s63(%rip), %xmm0, %xmm0 + bl _vpaes_schedule_transform // output transform + st1 {v0.2d}, [x2] // vmovdqu %xmm0, (%rdx) # save last key + + // cleanup + eor v0.16b, v0.16b, v0.16b // vpxor %xmm0, %xmm0, %xmm0 + eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1 + eor v2.16b, v2.16b, v2.16b // vpxor %xmm2, %xmm2, %xmm2 + eor v3.16b, v3.16b, v3.16b // vpxor %xmm3, %xmm3, %xmm3 + eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4 + eor v5.16b, v5.16b, v5.16b // vpxor %xmm5, %xmm5, %xmm5 + eor v6.16b, v6.16b, v6.16b // vpxor %xmm6, %xmm6, %xmm6 + eor v7.16b, v7.16b, v7.16b // vpxor %xmm7, %xmm7, %xmm7 + ldp x29, x30, [sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret + + +## +## .aes_schedule_192_smear +## +## Smear the short, low side in the 192-bit key schedule. +## +## Inputs: +## %xmm7: high side, b a x y +## %xmm6: low side, d c 0 0 +## %xmm13: 0 +## +## Outputs: +## %xmm6: b+c+d b+c 0 0 +## %xmm0: b+c+d b+c b a +## + +.align 4 +_vpaes_schedule_192_smear: + movi v1.16b, #0 + dup v0.4s, v7.s[3] + ins v1.s[3], v6.s[2] // vpshufd $0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0 + ins v0.s[0], v7.s[2] // vpshufd $0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a + eor v6.16b, v6.16b, v1.16b // vpxor %xmm1, %xmm6, %xmm6 # -> c+d c 0 0 + eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1 + eor v6.16b, v6.16b, v0.16b // vpxor %xmm0, %xmm6, %xmm6 # -> b+c+d b+c b a + mov v0.16b, v6.16b // vmovdqa %xmm6, %xmm0 + ins v6.d[0], v1.d[0] // vmovhlps %xmm1, %xmm6, %xmm6 # clobber low side with zeros + ret + + +## +## .aes_schedule_round +## +## Runs one main round of the key schedule on %xmm0, %xmm7 +## +## Specifically, runs subbytes on the high dword of %xmm0 +## then rotates it by one byte and xors into the low dword of +## %xmm7. +## +## Adds rcon from low byte of %xmm8, then rotates %xmm8 for +## next rcon. +## +## Smears the dwords of %xmm7 by xoring the low into the +## second low, result into third, result into highest. +## +## Returns results in %xmm7 = %xmm0. +## Clobbers %xmm1-%xmm4, %r11. 
+## + +.align 4 +_vpaes_schedule_round: + // extract rcon from xmm8 + movi v4.16b, #0 // vpxor %xmm4, %xmm4, %xmm4 + ext v1.16b, v8.16b, v4.16b, #15 // vpalignr $15, %xmm8, %xmm4, %xmm1 + ext v8.16b, v8.16b, v8.16b, #15 // vpalignr $15, %xmm8, %xmm8, %xmm8 + eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7 + + // rotate + dup v0.4s, v0.s[3] // vpshufd $0xFF, %xmm0, %xmm0 + ext v0.16b, v0.16b, v0.16b, #1 // vpalignr $1, %xmm0, %xmm0, %xmm0 + + // fall through... + + // low round: same as high round, but no rotation and no rcon. +_vpaes_schedule_low_round: + // smear xmm7 + ext v1.16b, v4.16b, v7.16b, #12 // vpslldq $4, %xmm7, %xmm1 + eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7 + ext v4.16b, v4.16b, v7.16b, #8 // vpslldq $8, %xmm7, %xmm4 + + // subbytes + and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k + ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i + eor v7.16b, v7.16b, v4.16b // vpxor %xmm4, %xmm7, %xmm7 + tbl v2.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k + eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j + tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + eor v7.16b, v7.16b, v16.16b // vpxor Lk_s63(%rip), %xmm7, %xmm7 + tbl v3.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak + eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + tbl v2.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak + eor v3.16b, v3.16b, v1.16b // vpxor %xmm1, %xmm3, %xmm3 # 2 = io + eor v2.16b, v2.16b, v0.16b // vpxor %xmm0, %xmm2, %xmm2 # 3 = jo + tbl v4.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou + tbl v1.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t + eor v1.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output + + // add in smeared stuff + eor v0.16b, v1.16b, v7.16b // vpxor %xmm7, %xmm1, %xmm0 + eor v7.16b, v1.16b, v7.16b // vmovdqa %xmm0, %xmm7 + ret + + +## +## .aes_schedule_transform +## +## Linear-transform %xmm0 according to tables at (%r11) +## +## Requires that %xmm9 = 0x0F0F... as in preheat +## Output in %xmm0 +## Clobbers %xmm1, %xmm2 +## + +.align 4 +_vpaes_schedule_transform: + and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 + ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 + // vmovdqa (%r11), %xmm2 # lo + tbl v2.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2 + // vmovdqa 16(%r11), %xmm1 # hi + tbl v0.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0 + eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 + ret + + +## +## .aes_schedule_mangle +## +## Mangle xmm0 from (basis-transformed) standard version +## to our version. 
+## +## On encrypt, +## xor with 0x63 +## multiply by circulant 0,1,1,1 +## apply shiftrows transform +## +## On decrypt, +## xor with 0x63 +## multiply by "inverse mixcolumns" circulant E,B,D,9 +## deskew +## apply shiftrows transform +## +## +## Writes out to (%rdx), and increments or decrements it +## Keeps track of round number mod 4 in %r8 +## Preserves xmm0 +## Clobbers xmm1-xmm5 +## + +.align 4 +_vpaes_schedule_mangle: + mov v4.16b, v0.16b // vmovdqa %xmm0, %xmm4 # save xmm0 for later + // vmovdqa .Lk_mc_forward(%rip),%xmm5 + + // encrypting + eor v4.16b, v0.16b, v16.16b // vpxor Lk_s63(%rip), %xmm0, %xmm4 + add x2, x2, #16 // add $16, %rdx + tbl v4.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm4 + tbl v1.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm1 + tbl v3.16b, {v1.16b}, v9.16b // vpshufb %xmm5, %xmm1, %xmm3 + eor v4.16b, v4.16b, v1.16b // vpxor %xmm1, %xmm4, %xmm4 + ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1 + eor v3.16b, v3.16b, v4.16b // vpxor %xmm4, %xmm3, %xmm3 + +Lschedule_mangle_both: + tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 + add x8, x8, #48 // add $-16, %r8 + and x8, x8, #~(1<<6) // and $0x30, %r8 + st1 {v3.2d}, [x2] // vmovdqu %xmm3, (%rdx) + ret + + +.globl _vpaes_set_encrypt_key +.private_extern _vpaes_set_encrypt_key + +.align 4 +_vpaes_set_encrypt_key: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + stp d8,d9,[sp,#-16]! // ABI spec says so + + lsr w9, w1, #5 // shr $5,%eax + add w9, w9, #5 // $5,%eax + str w9, [x2,#240] // mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; + + mov w3, #0 // mov $0,%ecx + mov x8, #0x30 // mov $0x30,%r8d + bl _vpaes_schedule_core + eor x0, x0, x0 + + ldp d8,d9,[sp],#16 + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret + +.globl _vpaes_ctr32_encrypt_blocks +.private_extern _vpaes_ctr32_encrypt_blocks + +.align 4 +_vpaes_ctr32_encrypt_blocks: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + stp d8,d9,[sp,#-16]! // ABI spec says so + stp d10,d11,[sp,#-16]! + stp d12,d13,[sp,#-16]! + stp d14,d15,[sp,#-16]! + + cbz x2, Lctr32_done + + // Note, unlike the other functions, x2 here is measured in blocks, + // not bytes. + mov x17, x2 + mov x2, x3 + + // Load the IV and counter portion. + ldr w6, [x4, #12] + ld1 {v7.16b}, [x4] + + bl _vpaes_encrypt_preheat + tst x17, #1 + rev w6, w6 // The counter is big-endian. + b.eq Lctr32_prep_loop + + // Handle one block so the remaining block count is even for + // _vpaes_encrypt_2x. + ld1 {v6.16b}, [x0], #16 // Load input ahead of time + bl _vpaes_encrypt_core + eor v0.16b, v0.16b, v6.16b // XOR input and result + st1 {v0.16b}, [x1], #16 + subs x17, x17, #1 + // Update the counter. + add w6, w6, #1 + rev w7, w6 + mov v7.s[3], w7 + b.ls Lctr32_done + +Lctr32_prep_loop: + // _vpaes_encrypt_core takes its input from v7, while _vpaes_encrypt_2x + // uses v14 and v15. + mov v15.16b, v7.16b + mov v14.16b, v7.16b + add w6, w6, #1 + rev w7, w6 + mov v15.s[3], w7 + +Lctr32_loop: + ld1 {v6.16b,v7.16b}, [x0], #32 // Load input ahead of time + bl _vpaes_encrypt_2x + eor v0.16b, v0.16b, v6.16b // XOR input and result + eor v1.16b, v1.16b, v7.16b // XOR input and result (#2) + st1 {v0.16b,v1.16b}, [x1], #32 + subs x17, x17, #2 + // Update the counter. 
+ add w7, w6, #1 + add w6, w6, #2 + rev w7, w7 + mov v14.s[3], w7 + rev w7, w6 + mov v15.s[3], w7 + b.hi Lctr32_loop + +Lctr32_done: + ldp d14,d15,[sp],#16 + ldp d12,d13,[sp],#16 + ldp d10,d11,[sp],#16 + ldp d8,d9,[sp],#16 + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret + +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__) diff --git a/ring-0.17.14/pregenerated/vpaes-armv8-linux64.S b/ring-0.17.14/pregenerated/vpaes-armv8-linux64.S new file mode 100644 index 0000000000..c3c60f605f --- /dev/null +++ b/ring-0.17.14/pregenerated/vpaes-armv8-linux64.S @@ -0,0 +1,744 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__) +.section .rodata + +.type _vpaes_consts,%object +.align 7 // totally strategic alignment +_vpaes_consts: +.Lk_mc_forward: // mc_forward +.quad 0x0407060500030201, 0x0C0F0E0D080B0A09 +.quad 0x080B0A0904070605, 0x000302010C0F0E0D +.quad 0x0C0F0E0D080B0A09, 0x0407060500030201 +.quad 0x000302010C0F0E0D, 0x080B0A0904070605 +.Lk_mc_backward: // mc_backward +.quad 0x0605040702010003, 0x0E0D0C0F0A09080B +.quad 0x020100030E0D0C0F, 0x0A09080B06050407 +.quad 0x0E0D0C0F0A09080B, 0x0605040702010003 +.quad 0x0A09080B06050407, 0x020100030E0D0C0F +.Lk_sr: // sr +.quad 0x0706050403020100, 0x0F0E0D0C0B0A0908 +.quad 0x030E09040F0A0500, 0x0B06010C07020D08 +.quad 0x0F060D040B020900, 0x070E050C030A0108 +.quad 0x0B0E0104070A0D00, 0x0306090C0F020508 + +// +// "Hot" constants +// +.Lk_inv: // inv, inva +.quad 0x0E05060F0D080180, 0x040703090A0B0C02 +.quad 0x01040A060F0B0780, 0x030D0E0C02050809 +.Lk_ipt: // input transform (lo, hi) +.quad 0xC2B2E8985A2A7000, 0xCABAE09052227808 +.quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81 +.Lk_sbo: // sbou, sbot +.quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878 +.quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA +.Lk_sb1: // sb1u, sb1t +.quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF +.quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544 +.Lk_sb2: // sb2u, sb2t +.quad 0x69EB88400AE12900, 0xC2A163C8AB82234A +.quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD + +// +// Key schedule constants +// +.Lk_dksd: // decryption key schedule: invskew x*D +.quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9 +.quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E +.Lk_dksb: // decryption key schedule: invskew x*B +.quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99 +.quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8 +.Lk_dkse: // decryption key schedule: invskew x*E + 0x63 +.quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086 +.quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487 +.Lk_dks9: // decryption key schedule: invskew x*9 +.quad 0xB6116FC87ED9A700, 0x4AED933482255BFC +.quad 0x4576516227143300, 0x8BB89FACE9DAFDCE + +.Lk_rcon: // rcon +.quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81 + +.Lk_opt: // output transform +.quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808 +.quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0 +.Lk_deskew: // deskew tables: inverts the sbox's "skew" +.quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A +.quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77 + +.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,65,82,77,118,56,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0 +.align 2 +.size _vpaes_consts,.-_vpaes_consts +.align 6 + +.text +## +## _aes_preheat +## +## Fills register %r10 -> .aes_consts (so you can -fPIC) +## and 
%xmm9-%xmm15 as specified below. +## +.type _vpaes_encrypt_preheat,%function +.align 4 +_vpaes_encrypt_preheat: + adrp x10, .Lk_inv + add x10, x10, :lo12:.Lk_inv + movi v17.16b, #0x0f + ld1 {v18.2d,v19.2d}, [x10],#32 // .Lk_inv + ld1 {v20.2d,v21.2d,v22.2d,v23.2d}, [x10],#64 // .Lk_ipt, .Lk_sbo + ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x10] // .Lk_sb1, .Lk_sb2 + ret +.size _vpaes_encrypt_preheat,.-_vpaes_encrypt_preheat + +## +## _aes_encrypt_core +## +## AES-encrypt %xmm0. +## +## Inputs: +## %xmm0 = input +## %xmm9-%xmm15 as in _vpaes_preheat +## (%rdx) = scheduled keys +## +## Output in %xmm0 +## Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax +## Preserves %xmm6 - %xmm8 so you get some local vectors +## +## +.type _vpaes_encrypt_core,%function +.align 4 +_vpaes_encrypt_core: + mov x9, x2 + ldr w8, [x2,#240] // pull rounds + adrp x11, .Lk_mc_forward+16 + add x11, x11, :lo12:.Lk_mc_forward+16 + // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo + ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key + and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 + ushr v0.16b, v7.16b, #4 // vpsrlb $4, %xmm0, %xmm0 + tbl v1.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1 + // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi + tbl v2.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2 + eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0 + eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 + b .Lenc_entry + +.align 4 +.Lenc_loop: + // middle of middle round + add x10, x11, #0x40 + tbl v4.16b, {v25.16b}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u + ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[] + tbl v0.16b, {v24.16b}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t + eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + tbl v5.16b, {v27.16b}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A + tbl v2.16b, {v26.16b}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t + ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[] + tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B + eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A + tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B + tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C + eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D + and x11, x11, #~(1<<6) // and $0x30, %r11 # ... 
mod 4 + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D + sub w8, w8, #1 // nr-- + +.Lenc_entry: + // top of round + and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k + ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i + tbl v5.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k + eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j + tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + tbl v2.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak + tbl v3.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak + eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io + eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo + ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5 + cbnz w8, .Lenc_loop + + // middle of last round + add x10, x11, #0x80 + // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo + // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 + tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou + ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[] + tbl v0.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t + eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A + tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0 + ret +.size _vpaes_encrypt_core,.-_vpaes_encrypt_core + +.type _vpaes_encrypt_2x,%function +.align 4 +_vpaes_encrypt_2x: + mov x9, x2 + ldr w8, [x2,#240] // pull rounds + adrp x11, .Lk_mc_forward+16 + add x11, x11, :lo12:.Lk_mc_forward+16 + // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo + ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key + and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 + ushr v0.16b, v14.16b, #4 // vpsrlb $4, %xmm0, %xmm0 + and v9.16b, v15.16b, v17.16b + ushr v8.16b, v15.16b, #4 + tbl v1.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1 + tbl v9.16b, {v20.16b}, v9.16b + // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi + tbl v2.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2 + tbl v10.16b, {v21.16b}, v8.16b + eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0 + eor v8.16b, v9.16b, v16.16b + eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 + eor v8.16b, v8.16b, v10.16b + b .Lenc_2x_entry + +.align 4 +.Lenc_2x_loop: + // middle of middle round + add x10, x11, #0x40 + tbl v4.16b, {v25.16b}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u + tbl v12.16b, {v25.16b}, v10.16b + ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[] + tbl v0.16b, {v24.16b}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t + tbl v8.16b, {v24.16b}, v11.16b + eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + eor v12.16b, v12.16b, v16.16b + tbl v5.16b, {v27.16b}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u + tbl v13.16b, {v27.16b}, v10.16b + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A + eor v8.16b, v8.16b, v12.16b + tbl v2.16b, {v26.16b}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t + tbl v10.16b, {v26.16b}, v11.16b + ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[] + tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B + tbl v11.16b, {v8.16b}, 
v1.16b + eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A + eor v10.16b, v10.16b, v13.16b + tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D + tbl v8.16b, {v8.16b}, v4.16b + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B + eor v11.16b, v11.16b, v10.16b + tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C + tbl v12.16b, {v11.16b},v1.16b + eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D + eor v8.16b, v8.16b, v11.16b + and x11, x11, #~(1<<6) // and $0x30, %r11 # ... mod 4 + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D + eor v8.16b, v8.16b, v12.16b + sub w8, w8, #1 // nr-- + +.Lenc_2x_entry: + // top of round + and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k + ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i + and v9.16b, v8.16b, v17.16b + ushr v8.16b, v8.16b, #4 + tbl v5.16b, {v19.16b},v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k + tbl v13.16b, {v19.16b},v9.16b + eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j + eor v9.16b, v9.16b, v8.16b + tbl v3.16b, {v18.16b},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + tbl v11.16b, {v18.16b},v8.16b + tbl v4.16b, {v18.16b},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + tbl v12.16b, {v18.16b},v9.16b + eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + eor v11.16b, v11.16b, v13.16b + eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + eor v12.16b, v12.16b, v13.16b + tbl v2.16b, {v18.16b},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak + tbl v10.16b, {v18.16b},v11.16b + tbl v3.16b, {v18.16b},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak + tbl v11.16b, {v18.16b},v12.16b + eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io + eor v10.16b, v10.16b, v9.16b + eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo + eor v11.16b, v11.16b, v8.16b + ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5 + cbnz w8, .Lenc_2x_loop + + // middle of last round + add x10, x11, #0x80 + // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo + // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 + tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou + tbl v12.16b, {v22.16b}, v10.16b + ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[] + tbl v0.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t + tbl v8.16b, {v23.16b}, v11.16b + eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + eor v12.16b, v12.16b, v16.16b + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A + eor v8.16b, v8.16b, v12.16b + tbl v0.16b, {v0.16b},v1.16b // vpshufb %xmm1, %xmm0, %xmm0 + tbl v1.16b, {v8.16b},v1.16b + ret +.size _vpaes_encrypt_2x,.-_vpaes_encrypt_2x +######################################################## +## ## +## AES key schedule ## +## ## +######################################################## +.type _vpaes_key_preheat,%function +.align 4 +_vpaes_key_preheat: + adrp x10, .Lk_inv + add x10, x10, :lo12:.Lk_inv + movi v16.16b, #0x5b // .Lk_s63 + adrp x11, .Lk_sb1 + add x11, x11, :lo12:.Lk_sb1 + movi v17.16b, #0x0f // .Lk_s0F + ld1 {v18.2d,v19.2d,v20.2d,v21.2d}, [x10] // .Lk_inv, .Lk_ipt + adrp x10, .Lk_dksd + add x10, x10, :lo12:.Lk_dksd + ld1 {v22.2d,v23.2d}, [x11] // .Lk_sb1 + adrp x11, .Lk_mc_forward + add x11, x11, :lo12:.Lk_mc_forward + ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x10],#64 // .Lk_dksd, .Lk_dksb + ld1 {v28.2d,v29.2d,v30.2d,v31.2d}, [x10],#64 // .Lk_dkse, 
.Lk_dks9 + ld1 {v8.2d}, [x10] // .Lk_rcon + ld1 {v9.2d}, [x11] // .Lk_mc_forward[0] + ret +.size _vpaes_key_preheat,.-_vpaes_key_preheat + +.type _vpaes_schedule_core,%function +.align 4 +_vpaes_schedule_core: + AARCH64_SIGN_LINK_REGISTER + stp x29, x30, [sp,#-16]! + add x29,sp,#0 + + bl _vpaes_key_preheat // load the tables + + ld1 {v0.16b}, [x0],#16 // vmovdqu (%rdi), %xmm0 # load key (unaligned) + + // input transform + mov v3.16b, v0.16b // vmovdqa %xmm0, %xmm3 + bl _vpaes_schedule_transform + mov v7.16b, v0.16b // vmovdqa %xmm0, %xmm7 + + adrp x10, .Lk_sr // lea .Lk_sr(%rip),%r10 + add x10, x10, :lo12:.Lk_sr + + add x8, x8, x10 + + // encrypting, output zeroth round key after transform + st1 {v0.2d}, [x2] // vmovdqu %xmm0, (%rdx) + + cmp w1, #192 // cmp $192, %esi + b.hi .Lschedule_256 + b.eq .Lschedule_192 + // 128: fall though + +## +## .schedule_128 +## +## 128-bit specific part of key schedule. +## +## This schedule is really simple, because all its parts +## are accomplished by the subroutines. +## +.Lschedule_128: + mov x0, #10 // mov $10, %esi + +.Loop_schedule_128: + sub x0, x0, #1 // dec %esi + bl _vpaes_schedule_round + cbz x0, .Lschedule_mangle_last + bl _vpaes_schedule_mangle // write output + b .Loop_schedule_128 + +## +## .aes_schedule_192 +## +## 192-bit specific part of key schedule. +## +## The main body of this schedule is the same as the 128-bit +## schedule, but with more smearing. The long, high side is +## stored in %xmm7 as before, and the short, low side is in +## the high bits of %xmm6. +## +## This schedule is somewhat nastier, however, because each +## round produces 192 bits of key material, or 1.5 round keys. +## Therefore, on each cycle we do 2 rounds and produce 3 round +## keys. +## +.align 4 +.Lschedule_192: + sub x0, x0, #8 + ld1 {v0.16b}, [x0] // vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned) + bl _vpaes_schedule_transform // input transform + mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save short part + eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4 # clear 4 + ins v6.d[0], v4.d[0] // vmovhlps %xmm4, %xmm6, %xmm6 # clobber low side with zeros + mov x0, #4 // mov $4, %esi + +.Loop_schedule_192: + sub x0, x0, #1 // dec %esi + bl _vpaes_schedule_round + ext v0.16b, v6.16b, v0.16b, #8 // vpalignr $8,%xmm6,%xmm0,%xmm0 + bl _vpaes_schedule_mangle // save key n + bl _vpaes_schedule_192_smear + bl _vpaes_schedule_mangle // save key n+1 + bl _vpaes_schedule_round + cbz x0, .Lschedule_mangle_last + bl _vpaes_schedule_mangle // save key n+2 + bl _vpaes_schedule_192_smear + b .Loop_schedule_192 + +## +## .aes_schedule_256 +## +## 256-bit specific part of key schedule. +## +## The structure here is very similar to the 128-bit +## schedule, but with an additional "low side" in +## %xmm6. The low side's rounds are the same as the +## high side's, except no rcon and no rotation. +## +.align 4 +.Lschedule_256: + ld1 {v0.16b}, [x0] // vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned) + bl _vpaes_schedule_transform // input transform + mov x0, #7 // mov $7, %esi + +.Loop_schedule_256: + sub x0, x0, #1 // dec %esi + bl _vpaes_schedule_mangle // output low result + mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6 + + // high round + bl _vpaes_schedule_round + cbz x0, .Lschedule_mangle_last + bl _vpaes_schedule_mangle + + // low round. 
swap xmm7 and xmm6 + dup v0.4s, v0.s[3] // vpshufd $0xFF, %xmm0, %xmm0 + movi v4.16b, #0 + mov v5.16b, v7.16b // vmovdqa %xmm7, %xmm5 + mov v7.16b, v6.16b // vmovdqa %xmm6, %xmm7 + bl _vpaes_schedule_low_round + mov v7.16b, v5.16b // vmovdqa %xmm5, %xmm7 + + b .Loop_schedule_256 + +## +## .aes_schedule_mangle_last +## +## Mangler for last round of key schedule +## Mangles %xmm0 +## when encrypting, outputs out(%xmm0) ^ 63 +## when decrypting, outputs unskew(%xmm0) +## +## Always called right before return... jumps to cleanup and exits +## +.align 4 +.Lschedule_mangle_last: + // schedule last round key from xmm0 + adrp x11, .Lk_deskew // lea .Lk_deskew(%rip),%r11 # prepare to deskew + add x11, x11, :lo12:.Lk_deskew + + cbnz w3, .Lschedule_mangle_last_dec + + // encrypting + ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10),%xmm1 + adrp x11, .Lk_opt // lea .Lk_opt(%rip), %r11 # prepare to output transform + add x11, x11, :lo12:.Lk_opt + add x2, x2, #32 // add $32, %rdx + tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0 # output permute + +.Lschedule_mangle_last_dec: + ld1 {v20.2d,v21.2d}, [x11] // reload constants + sub x2, x2, #16 // add $-16, %rdx + eor v0.16b, v0.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm0, %xmm0 + bl _vpaes_schedule_transform // output transform + st1 {v0.2d}, [x2] // vmovdqu %xmm0, (%rdx) # save last key + + // cleanup + eor v0.16b, v0.16b, v0.16b // vpxor %xmm0, %xmm0, %xmm0 + eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1 + eor v2.16b, v2.16b, v2.16b // vpxor %xmm2, %xmm2, %xmm2 + eor v3.16b, v3.16b, v3.16b // vpxor %xmm3, %xmm3, %xmm3 + eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4 + eor v5.16b, v5.16b, v5.16b // vpxor %xmm5, %xmm5, %xmm5 + eor v6.16b, v6.16b, v6.16b // vpxor %xmm6, %xmm6, %xmm6 + eor v7.16b, v7.16b, v7.16b // vpxor %xmm7, %xmm7, %xmm7 + ldp x29, x30, [sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size _vpaes_schedule_core,.-_vpaes_schedule_core + +## +## .aes_schedule_192_smear +## +## Smear the short, low side in the 192-bit key schedule. +## +## Inputs: +## %xmm7: high side, b a x y +## %xmm6: low side, d c 0 0 +## %xmm13: 0 +## +## Outputs: +## %xmm6: b+c+d b+c 0 0 +## %xmm0: b+c+d b+c b a +## +.type _vpaes_schedule_192_smear,%function +.align 4 +_vpaes_schedule_192_smear: + movi v1.16b, #0 + dup v0.4s, v7.s[3] + ins v1.s[3], v6.s[2] // vpshufd $0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0 + ins v0.s[0], v7.s[2] // vpshufd $0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a + eor v6.16b, v6.16b, v1.16b // vpxor %xmm1, %xmm6, %xmm6 # -> c+d c 0 0 + eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1 + eor v6.16b, v6.16b, v0.16b // vpxor %xmm0, %xmm6, %xmm6 # -> b+c+d b+c b a + mov v0.16b, v6.16b // vmovdqa %xmm6, %xmm0 + ins v6.d[0], v1.d[0] // vmovhlps %xmm1, %xmm6, %xmm6 # clobber low side with zeros + ret +.size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear + +## +## .aes_schedule_round +## +## Runs one main round of the key schedule on %xmm0, %xmm7 +## +## Specifically, runs subbytes on the high dword of %xmm0 +## then rotates it by one byte and xors into the low dword of +## %xmm7. +## +## Adds rcon from low byte of %xmm8, then rotates %xmm8 for +## next rcon. +## +## Smears the dwords of %xmm7 by xoring the low into the +## second low, result into third, result into highest. +## +## Returns results in %xmm7 = %xmm0. +## Clobbers %xmm1-%xmm4, %r11. 
+## +.type _vpaes_schedule_round,%function +.align 4 +_vpaes_schedule_round: + // extract rcon from xmm8 + movi v4.16b, #0 // vpxor %xmm4, %xmm4, %xmm4 + ext v1.16b, v8.16b, v4.16b, #15 // vpalignr $15, %xmm8, %xmm4, %xmm1 + ext v8.16b, v8.16b, v8.16b, #15 // vpalignr $15, %xmm8, %xmm8, %xmm8 + eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7 + + // rotate + dup v0.4s, v0.s[3] // vpshufd $0xFF, %xmm0, %xmm0 + ext v0.16b, v0.16b, v0.16b, #1 // vpalignr $1, %xmm0, %xmm0, %xmm0 + + // fall through... + + // low round: same as high round, but no rotation and no rcon. +_vpaes_schedule_low_round: + // smear xmm7 + ext v1.16b, v4.16b, v7.16b, #12 // vpslldq $4, %xmm7, %xmm1 + eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7 + ext v4.16b, v4.16b, v7.16b, #8 // vpslldq $8, %xmm7, %xmm4 + + // subbytes + and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k + ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i + eor v7.16b, v7.16b, v4.16b // vpxor %xmm4, %xmm7, %xmm7 + tbl v2.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k + eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j + tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + eor v7.16b, v7.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm7, %xmm7 + tbl v3.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak + eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + tbl v2.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak + eor v3.16b, v3.16b, v1.16b // vpxor %xmm1, %xmm3, %xmm3 # 2 = io + eor v2.16b, v2.16b, v0.16b // vpxor %xmm0, %xmm2, %xmm2 # 3 = jo + tbl v4.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou + tbl v1.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t + eor v1.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output + + // add in smeared stuff + eor v0.16b, v1.16b, v7.16b // vpxor %xmm7, %xmm1, %xmm0 + eor v7.16b, v1.16b, v7.16b // vmovdqa %xmm0, %xmm7 + ret +.size _vpaes_schedule_round,.-_vpaes_schedule_round + +## +## .aes_schedule_transform +## +## Linear-transform %xmm0 according to tables at (%r11) +## +## Requires that %xmm9 = 0x0F0F... as in preheat +## Output in %xmm0 +## Clobbers %xmm1, %xmm2 +## +.type _vpaes_schedule_transform,%function +.align 4 +_vpaes_schedule_transform: + and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 + ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 + // vmovdqa (%r11), %xmm2 # lo + tbl v2.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2 + // vmovdqa 16(%r11), %xmm1 # hi + tbl v0.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0 + eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 + ret +.size _vpaes_schedule_transform,.-_vpaes_schedule_transform + +## +## .aes_schedule_mangle +## +## Mangle xmm0 from (basis-transformed) standard version +## to our version. 
+## +## On encrypt, +## xor with 0x63 +## multiply by circulant 0,1,1,1 +## apply shiftrows transform +## +## On decrypt, +## xor with 0x63 +## multiply by "inverse mixcolumns" circulant E,B,D,9 +## deskew +## apply shiftrows transform +## +## +## Writes out to (%rdx), and increments or decrements it +## Keeps track of round number mod 4 in %r8 +## Preserves xmm0 +## Clobbers xmm1-xmm5 +## +.type _vpaes_schedule_mangle,%function +.align 4 +_vpaes_schedule_mangle: + mov v4.16b, v0.16b // vmovdqa %xmm0, %xmm4 # save xmm0 for later + // vmovdqa .Lk_mc_forward(%rip),%xmm5 + + // encrypting + eor v4.16b, v0.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm0, %xmm4 + add x2, x2, #16 // add $16, %rdx + tbl v4.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm4 + tbl v1.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm1 + tbl v3.16b, {v1.16b}, v9.16b // vpshufb %xmm5, %xmm1, %xmm3 + eor v4.16b, v4.16b, v1.16b // vpxor %xmm1, %xmm4, %xmm4 + ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1 + eor v3.16b, v3.16b, v4.16b // vpxor %xmm4, %xmm3, %xmm3 + +.Lschedule_mangle_both: + tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 + add x8, x8, #48 // add $-16, %r8 + and x8, x8, #~(1<<6) // and $0x30, %r8 + st1 {v3.2d}, [x2] // vmovdqu %xmm3, (%rdx) + ret +.size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle + +.globl vpaes_set_encrypt_key +.hidden vpaes_set_encrypt_key +.type vpaes_set_encrypt_key,%function +.align 4 +vpaes_set_encrypt_key: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + stp d8,d9,[sp,#-16]! // ABI spec says so + + lsr w9, w1, #5 // shr $5,%eax + add w9, w9, #5 // $5,%eax + str w9, [x2,#240] // mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; + + mov w3, #0 // mov $0,%ecx + mov x8, #0x30 // mov $0x30,%r8d + bl _vpaes_schedule_core + eor x0, x0, x0 + + ldp d8,d9,[sp],#16 + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size vpaes_set_encrypt_key,.-vpaes_set_encrypt_key +.globl vpaes_ctr32_encrypt_blocks +.hidden vpaes_ctr32_encrypt_blocks +.type vpaes_ctr32_encrypt_blocks,%function +.align 4 +vpaes_ctr32_encrypt_blocks: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + stp d8,d9,[sp,#-16]! // ABI spec says so + stp d10,d11,[sp,#-16]! + stp d12,d13,[sp,#-16]! + stp d14,d15,[sp,#-16]! + + cbz x2, .Lctr32_done + + // Note, unlike the other functions, x2 here is measured in blocks, + // not bytes. + mov x17, x2 + mov x2, x3 + + // Load the IV and counter portion. + ldr w6, [x4, #12] + ld1 {v7.16b}, [x4] + + bl _vpaes_encrypt_preheat + tst x17, #1 + rev w6, w6 // The counter is big-endian. + b.eq .Lctr32_prep_loop + + // Handle one block so the remaining block count is even for + // _vpaes_encrypt_2x. + ld1 {v6.16b}, [x0], #16 // .Load input ahead of time + bl _vpaes_encrypt_core + eor v0.16b, v0.16b, v6.16b // XOR input and result + st1 {v0.16b}, [x1], #16 + subs x17, x17, #1 + // Update the counter. + add w6, w6, #1 + rev w7, w6 + mov v7.s[3], w7 + b.ls .Lctr32_done + +.Lctr32_prep_loop: + // _vpaes_encrypt_core takes its input from v7, while _vpaes_encrypt_2x + // uses v14 and v15. + mov v15.16b, v7.16b + mov v14.16b, v7.16b + add w6, w6, #1 + rev w7, w6 + mov v15.s[3], w7 + +.Lctr32_loop: + ld1 {v6.16b,v7.16b}, [x0], #32 // .Load input ahead of time + bl _vpaes_encrypt_2x + eor v0.16b, v0.16b, v6.16b // XOR input and result + eor v1.16b, v1.16b, v7.16b // XOR input and result (#2) + st1 {v0.16b,v1.16b}, [x1], #32 + subs x17, x17, #2 + // Update the counter. 
+ add w7, w6, #1 + add w6, w6, #2 + rev w7, w7 + mov v14.s[3], w7 + rev w7, w6 + mov v15.s[3], w7 + b.hi .Lctr32_loop + +.Lctr32_done: + ldp d14,d15,[sp],#16 + ldp d12,d13,[sp],#16 + ldp d10,d11,[sp],#16 + ldp d8,d9,[sp],#16 + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size vpaes_ctr32_encrypt_blocks,.-vpaes_ctr32_encrypt_blocks +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__) diff --git a/ring-0.17.14/pregenerated/vpaes-armv8-win64.S b/ring-0.17.14/pregenerated/vpaes-armv8-win64.S new file mode 100644 index 0000000000..6a524370c0 --- /dev/null +++ b/ring-0.17.14/pregenerated/vpaes-armv8-win64.S @@ -0,0 +1,766 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32) +.section .rodata + + +.align 7 // totally strategic alignment +_vpaes_consts: +Lk_mc_forward: // mc_forward +.quad 0x0407060500030201, 0x0C0F0E0D080B0A09 +.quad 0x080B0A0904070605, 0x000302010C0F0E0D +.quad 0x0C0F0E0D080B0A09, 0x0407060500030201 +.quad 0x000302010C0F0E0D, 0x080B0A0904070605 +Lk_mc_backward: // mc_backward +.quad 0x0605040702010003, 0x0E0D0C0F0A09080B +.quad 0x020100030E0D0C0F, 0x0A09080B06050407 +.quad 0x0E0D0C0F0A09080B, 0x0605040702010003 +.quad 0x0A09080B06050407, 0x020100030E0D0C0F +Lk_sr: // sr +.quad 0x0706050403020100, 0x0F0E0D0C0B0A0908 +.quad 0x030E09040F0A0500, 0x0B06010C07020D08 +.quad 0x0F060D040B020900, 0x070E050C030A0108 +.quad 0x0B0E0104070A0D00, 0x0306090C0F020508 + +// +// "Hot" constants +// +Lk_inv: // inv, inva +.quad 0x0E05060F0D080180, 0x040703090A0B0C02 +.quad 0x01040A060F0B0780, 0x030D0E0C02050809 +Lk_ipt: // input transform (lo, hi) +.quad 0xC2B2E8985A2A7000, 0xCABAE09052227808 +.quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81 +Lk_sbo: // sbou, sbot +.quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878 +.quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA +Lk_sb1: // sb1u, sb1t +.quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF +.quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544 +Lk_sb2: // sb2u, sb2t +.quad 0x69EB88400AE12900, 0xC2A163C8AB82234A +.quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD + +// +// Key schedule constants +// +Lk_dksd: // decryption key schedule: invskew x*D +.quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9 +.quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E +Lk_dksb: // decryption key schedule: invskew x*B +.quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99 +.quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8 +Lk_dkse: // decryption key schedule: invskew x*E + 0x63 +.quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086 +.quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487 +Lk_dks9: // decryption key schedule: invskew x*9 +.quad 0xB6116FC87ED9A700, 0x4AED933482255BFC +.quad 0x4576516227143300, 0x8BB89FACE9DAFDCE + +Lk_rcon: // rcon +.quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81 + +Lk_opt: // output transform +.quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808 +.quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0 +Lk_deskew: // deskew tables: inverts the sbox's "skew" +.quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A +.quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77 + +.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,65,82,77,118,56,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0 +.align 2 + +.align 6 + +.text +## +## _aes_preheat +## +## Fills register %r10 -> .aes_consts (so you can -fPIC) +## and %xmm9-%xmm15 as specified 
below. +## +.def _vpaes_encrypt_preheat + .type 32 +.endef +.align 4 +_vpaes_encrypt_preheat: + adrp x10, Lk_inv + add x10, x10, :lo12:Lk_inv + movi v17.16b, #0x0f + ld1 {v18.2d,v19.2d}, [x10],#32 // Lk_inv + ld1 {v20.2d,v21.2d,v22.2d,v23.2d}, [x10],#64 // Lk_ipt, Lk_sbo + ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x10] // Lk_sb1, Lk_sb2 + ret + + +## +## _aes_encrypt_core +## +## AES-encrypt %xmm0. +## +## Inputs: +## %xmm0 = input +## %xmm9-%xmm15 as in _vpaes_preheat +## (%rdx) = scheduled keys +## +## Output in %xmm0 +## Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax +## Preserves %xmm6 - %xmm8 so you get some local vectors +## +## +.def _vpaes_encrypt_core + .type 32 +.endef +.align 4 +_vpaes_encrypt_core: + mov x9, x2 + ldr w8, [x2,#240] // pull rounds + adrp x11, Lk_mc_forward+16 + add x11, x11, :lo12:Lk_mc_forward+16 + // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo + ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key + and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 + ushr v0.16b, v7.16b, #4 // vpsrlb $4, %xmm0, %xmm0 + tbl v1.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1 + // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi + tbl v2.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2 + eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0 + eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 + b Lenc_entry + +.align 4 +Lenc_loop: + // middle of middle round + add x10, x11, #0x40 + tbl v4.16b, {v25.16b}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u + ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # Lk_mc_forward[] + tbl v0.16b, {v24.16b}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t + eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + tbl v5.16b, {v27.16b}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A + tbl v2.16b, {v26.16b}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t + ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # Lk_mc_backward[] + tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B + eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A + tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B + tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C + eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D + and x11, x11, #~(1<<6) // and $0x30, %r11 # ... 
mod 4 + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D + sub w8, w8, #1 // nr-- + +Lenc_entry: + // top of round + and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k + ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i + tbl v5.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k + eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j + tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + tbl v2.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak + tbl v3.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak + eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io + eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo + ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5 + cbnz w8, Lenc_loop + + // middle of last round + add x10, x11, #0x80 + // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo + // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 + tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou + ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # Lk_sr[] + tbl v0.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t + eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A + tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0 + ret + + +.def _vpaes_encrypt_2x + .type 32 +.endef +.align 4 +_vpaes_encrypt_2x: + mov x9, x2 + ldr w8, [x2,#240] // pull rounds + adrp x11, Lk_mc_forward+16 + add x11, x11, :lo12:Lk_mc_forward+16 + // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo + ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key + and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 + ushr v0.16b, v14.16b, #4 // vpsrlb $4, %xmm0, %xmm0 + and v9.16b, v15.16b, v17.16b + ushr v8.16b, v15.16b, #4 + tbl v1.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1 + tbl v9.16b, {v20.16b}, v9.16b + // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi + tbl v2.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2 + tbl v10.16b, {v21.16b}, v8.16b + eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0 + eor v8.16b, v9.16b, v16.16b + eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 + eor v8.16b, v8.16b, v10.16b + b Lenc_2x_entry + +.align 4 +Lenc_2x_loop: + // middle of middle round + add x10, x11, #0x40 + tbl v4.16b, {v25.16b}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u + tbl v12.16b, {v25.16b}, v10.16b + ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # Lk_mc_forward[] + tbl v0.16b, {v24.16b}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t + tbl v8.16b, {v24.16b}, v11.16b + eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + eor v12.16b, v12.16b, v16.16b + tbl v5.16b, {v27.16b}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u + tbl v13.16b, {v27.16b}, v10.16b + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A + eor v8.16b, v8.16b, v12.16b + tbl v2.16b, {v26.16b}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t + tbl v10.16b, {v26.16b}, v11.16b + ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # Lk_mc_backward[] + tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B + tbl v11.16b, {v8.16b}, v1.16b + eor v2.16b, v2.16b, v5.16b // vpxor 
%xmm5, %xmm2, %xmm2 # 2 = 2A + eor v10.16b, v10.16b, v13.16b + tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D + tbl v8.16b, {v8.16b}, v4.16b + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B + eor v11.16b, v11.16b, v10.16b + tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C + tbl v12.16b, {v11.16b},v1.16b + eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D + eor v8.16b, v8.16b, v11.16b + and x11, x11, #~(1<<6) // and $0x30, %r11 # ... mod 4 + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D + eor v8.16b, v8.16b, v12.16b + sub w8, w8, #1 // nr-- + +Lenc_2x_entry: + // top of round + and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k + ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i + and v9.16b, v8.16b, v17.16b + ushr v8.16b, v8.16b, #4 + tbl v5.16b, {v19.16b},v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k + tbl v13.16b, {v19.16b},v9.16b + eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j + eor v9.16b, v9.16b, v8.16b + tbl v3.16b, {v18.16b},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + tbl v11.16b, {v18.16b},v8.16b + tbl v4.16b, {v18.16b},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + tbl v12.16b, {v18.16b},v9.16b + eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + eor v11.16b, v11.16b, v13.16b + eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + eor v12.16b, v12.16b, v13.16b + tbl v2.16b, {v18.16b},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak + tbl v10.16b, {v18.16b},v11.16b + tbl v3.16b, {v18.16b},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak + tbl v11.16b, {v18.16b},v12.16b + eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io + eor v10.16b, v10.16b, v9.16b + eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo + eor v11.16b, v11.16b, v8.16b + ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5 + cbnz w8, Lenc_2x_loop + + // middle of last round + add x10, x11, #0x80 + // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo + // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 + tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou + tbl v12.16b, {v22.16b}, v10.16b + ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # Lk_sr[] + tbl v0.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t + tbl v8.16b, {v23.16b}, v11.16b + eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + eor v12.16b, v12.16b, v16.16b + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A + eor v8.16b, v8.16b, v12.16b + tbl v0.16b, {v0.16b},v1.16b // vpshufb %xmm1, %xmm0, %xmm0 + tbl v1.16b, {v8.16b},v1.16b + ret + +######################################################## +## ## +## AES key schedule ## +## ## +######################################################## +.def _vpaes_key_preheat + .type 32 +.endef +.align 4 +_vpaes_key_preheat: + adrp x10, Lk_inv + add x10, x10, :lo12:Lk_inv + movi v16.16b, #0x5b // Lk_s63 + adrp x11, Lk_sb1 + add x11, x11, :lo12:Lk_sb1 + movi v17.16b, #0x0f // Lk_s0F + ld1 {v18.2d,v19.2d,v20.2d,v21.2d}, [x10] // Lk_inv, Lk_ipt + adrp x10, Lk_dksd + add x10, x10, :lo12:Lk_dksd + ld1 {v22.2d,v23.2d}, [x11] // Lk_sb1 + adrp x11, Lk_mc_forward + add x11, x11, :lo12:Lk_mc_forward + ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x10],#64 // Lk_dksd, Lk_dksb + ld1 {v28.2d,v29.2d,v30.2d,v31.2d}, [x10],#64 // Lk_dkse, Lk_dks9 + ld1 {v8.2d}, [x10] // Lk_rcon + ld1 {v9.2d}, [x11] // Lk_mc_forward[0] + ret + + +.def 
_vpaes_schedule_core + .type 32 +.endef +.align 4 +_vpaes_schedule_core: + AARCH64_SIGN_LINK_REGISTER + stp x29, x30, [sp,#-16]! + add x29,sp,#0 + + bl _vpaes_key_preheat // load the tables + + ld1 {v0.16b}, [x0],#16 // vmovdqu (%rdi), %xmm0 # load key (unaligned) + + // input transform + mov v3.16b, v0.16b // vmovdqa %xmm0, %xmm3 + bl _vpaes_schedule_transform + mov v7.16b, v0.16b // vmovdqa %xmm0, %xmm7 + + adrp x10, Lk_sr // lea Lk_sr(%rip),%r10 + add x10, x10, :lo12:Lk_sr + + add x8, x8, x10 + + // encrypting, output zeroth round key after transform + st1 {v0.2d}, [x2] // vmovdqu %xmm0, (%rdx) + + cmp w1, #192 // cmp $192, %esi + b.hi Lschedule_256 + b.eq Lschedule_192 + // 128: fall though + +## +## .schedule_128 +## +## 128-bit specific part of key schedule. +## +## This schedule is really simple, because all its parts +## are accomplished by the subroutines. +## +Lschedule_128: + mov x0, #10 // mov $10, %esi + +Loop_schedule_128: + sub x0, x0, #1 // dec %esi + bl _vpaes_schedule_round + cbz x0, Lschedule_mangle_last + bl _vpaes_schedule_mangle // write output + b Loop_schedule_128 + +## +## .aes_schedule_192 +## +## 192-bit specific part of key schedule. +## +## The main body of this schedule is the same as the 128-bit +## schedule, but with more smearing. The long, high side is +## stored in %xmm7 as before, and the short, low side is in +## the high bits of %xmm6. +## +## This schedule is somewhat nastier, however, because each +## round produces 192 bits of key material, or 1.5 round keys. +## Therefore, on each cycle we do 2 rounds and produce 3 round +## keys. +## +.align 4 +Lschedule_192: + sub x0, x0, #8 + ld1 {v0.16b}, [x0] // vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned) + bl _vpaes_schedule_transform // input transform + mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save short part + eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4 # clear 4 + ins v6.d[0], v4.d[0] // vmovhlps %xmm4, %xmm6, %xmm6 # clobber low side with zeros + mov x0, #4 // mov $4, %esi + +Loop_schedule_192: + sub x0, x0, #1 // dec %esi + bl _vpaes_schedule_round + ext v0.16b, v6.16b, v0.16b, #8 // vpalignr $8,%xmm6,%xmm0,%xmm0 + bl _vpaes_schedule_mangle // save key n + bl _vpaes_schedule_192_smear + bl _vpaes_schedule_mangle // save key n+1 + bl _vpaes_schedule_round + cbz x0, Lschedule_mangle_last + bl _vpaes_schedule_mangle // save key n+2 + bl _vpaes_schedule_192_smear + b Loop_schedule_192 + +## +## .aes_schedule_256 +## +## 256-bit specific part of key schedule. +## +## The structure here is very similar to the 128-bit +## schedule, but with an additional "low side" in +## %xmm6. The low side's rounds are the same as the +## high side's, except no rcon and no rotation. +## +.align 4 +Lschedule_256: + ld1 {v0.16b}, [x0] // vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned) + bl _vpaes_schedule_transform // input transform + mov x0, #7 // mov $7, %esi + +Loop_schedule_256: + sub x0, x0, #1 // dec %esi + bl _vpaes_schedule_mangle // output low result + mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6 + + // high round + bl _vpaes_schedule_round + cbz x0, Lschedule_mangle_last + bl _vpaes_schedule_mangle + + // low round. 
swap xmm7 and xmm6 + dup v0.4s, v0.s[3] // vpshufd $0xFF, %xmm0, %xmm0 + movi v4.16b, #0 + mov v5.16b, v7.16b // vmovdqa %xmm7, %xmm5 + mov v7.16b, v6.16b // vmovdqa %xmm6, %xmm7 + bl _vpaes_schedule_low_round + mov v7.16b, v5.16b // vmovdqa %xmm5, %xmm7 + + b Loop_schedule_256 + +## +## .aes_schedule_mangle_last +## +## Mangler for last round of key schedule +## Mangles %xmm0 +## when encrypting, outputs out(%xmm0) ^ 63 +## when decrypting, outputs unskew(%xmm0) +## +## Always called right before return... jumps to cleanup and exits +## +.align 4 +Lschedule_mangle_last: + // schedule last round key from xmm0 + adrp x11, Lk_deskew // lea Lk_deskew(%rip),%r11 # prepare to deskew + add x11, x11, :lo12:Lk_deskew + + cbnz w3, Lschedule_mangle_last_dec + + // encrypting + ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10),%xmm1 + adrp x11, Lk_opt // lea Lk_opt(%rip), %r11 # prepare to output transform + add x11, x11, :lo12:Lk_opt + add x2, x2, #32 // add $32, %rdx + tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0 # output permute + +Lschedule_mangle_last_dec: + ld1 {v20.2d,v21.2d}, [x11] // reload constants + sub x2, x2, #16 // add $-16, %rdx + eor v0.16b, v0.16b, v16.16b // vpxor Lk_s63(%rip), %xmm0, %xmm0 + bl _vpaes_schedule_transform // output transform + st1 {v0.2d}, [x2] // vmovdqu %xmm0, (%rdx) # save last key + + // cleanup + eor v0.16b, v0.16b, v0.16b // vpxor %xmm0, %xmm0, %xmm0 + eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1 + eor v2.16b, v2.16b, v2.16b // vpxor %xmm2, %xmm2, %xmm2 + eor v3.16b, v3.16b, v3.16b // vpxor %xmm3, %xmm3, %xmm3 + eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4 + eor v5.16b, v5.16b, v5.16b // vpxor %xmm5, %xmm5, %xmm5 + eor v6.16b, v6.16b, v6.16b // vpxor %xmm6, %xmm6, %xmm6 + eor v7.16b, v7.16b, v7.16b // vpxor %xmm7, %xmm7, %xmm7 + ldp x29, x30, [sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret + + +## +## .aes_schedule_192_smear +## +## Smear the short, low side in the 192-bit key schedule. +## +## Inputs: +## %xmm7: high side, b a x y +## %xmm6: low side, d c 0 0 +## %xmm13: 0 +## +## Outputs: +## %xmm6: b+c+d b+c 0 0 +## %xmm0: b+c+d b+c b a +## +.def _vpaes_schedule_192_smear + .type 32 +.endef +.align 4 +_vpaes_schedule_192_smear: + movi v1.16b, #0 + dup v0.4s, v7.s[3] + ins v1.s[3], v6.s[2] // vpshufd $0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0 + ins v0.s[0], v7.s[2] // vpshufd $0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a + eor v6.16b, v6.16b, v1.16b // vpxor %xmm1, %xmm6, %xmm6 # -> c+d c 0 0 + eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1 + eor v6.16b, v6.16b, v0.16b // vpxor %xmm0, %xmm6, %xmm6 # -> b+c+d b+c b a + mov v0.16b, v6.16b // vmovdqa %xmm6, %xmm0 + ins v6.d[0], v1.d[0] // vmovhlps %xmm1, %xmm6, %xmm6 # clobber low side with zeros + ret + + +## +## .aes_schedule_round +## +## Runs one main round of the key schedule on %xmm0, %xmm7 +## +## Specifically, runs subbytes on the high dword of %xmm0 +## then rotates it by one byte and xors into the low dword of +## %xmm7. +## +## Adds rcon from low byte of %xmm8, then rotates %xmm8 for +## next rcon. +## +## Smears the dwords of %xmm7 by xoring the low into the +## second low, result into third, result into highest. +## +## Returns results in %xmm7 = %xmm0. +## Clobbers %xmm1-%xmm4, %r11. 
+## +.def _vpaes_schedule_round + .type 32 +.endef +.align 4 +_vpaes_schedule_round: + // extract rcon from xmm8 + movi v4.16b, #0 // vpxor %xmm4, %xmm4, %xmm4 + ext v1.16b, v8.16b, v4.16b, #15 // vpalignr $15, %xmm8, %xmm4, %xmm1 + ext v8.16b, v8.16b, v8.16b, #15 // vpalignr $15, %xmm8, %xmm8, %xmm8 + eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7 + + // rotate + dup v0.4s, v0.s[3] // vpshufd $0xFF, %xmm0, %xmm0 + ext v0.16b, v0.16b, v0.16b, #1 // vpalignr $1, %xmm0, %xmm0, %xmm0 + + // fall through... + + // low round: same as high round, but no rotation and no rcon. +_vpaes_schedule_low_round: + // smear xmm7 + ext v1.16b, v4.16b, v7.16b, #12 // vpslldq $4, %xmm7, %xmm1 + eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7 + ext v4.16b, v4.16b, v7.16b, #8 // vpslldq $8, %xmm7, %xmm4 + + // subbytes + and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k + ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i + eor v7.16b, v7.16b, v4.16b // vpxor %xmm4, %xmm7, %xmm7 + tbl v2.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k + eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j + tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + eor v7.16b, v7.16b, v16.16b // vpxor Lk_s63(%rip), %xmm7, %xmm7 + tbl v3.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak + eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + tbl v2.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak + eor v3.16b, v3.16b, v1.16b // vpxor %xmm1, %xmm3, %xmm3 # 2 = io + eor v2.16b, v2.16b, v0.16b // vpxor %xmm0, %xmm2, %xmm2 # 3 = jo + tbl v4.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou + tbl v1.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t + eor v1.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output + + // add in smeared stuff + eor v0.16b, v1.16b, v7.16b // vpxor %xmm7, %xmm1, %xmm0 + eor v7.16b, v1.16b, v7.16b // vmovdqa %xmm0, %xmm7 + ret + + +## +## .aes_schedule_transform +## +## Linear-transform %xmm0 according to tables at (%r11) +## +## Requires that %xmm9 = 0x0F0F... as in preheat +## Output in %xmm0 +## Clobbers %xmm1, %xmm2 +## +.def _vpaes_schedule_transform + .type 32 +.endef +.align 4 +_vpaes_schedule_transform: + and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 + ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 + // vmovdqa (%r11), %xmm2 # lo + tbl v2.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2 + // vmovdqa 16(%r11), %xmm1 # hi + tbl v0.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0 + eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 + ret + + +## +## .aes_schedule_mangle +## +## Mangle xmm0 from (basis-transformed) standard version +## to our version. 
+## +## On encrypt, +## xor with 0x63 +## multiply by circulant 0,1,1,1 +## apply shiftrows transform +## +## On decrypt, +## xor with 0x63 +## multiply by "inverse mixcolumns" circulant E,B,D,9 +## deskew +## apply shiftrows transform +## +## +## Writes out to (%rdx), and increments or decrements it +## Keeps track of round number mod 4 in %r8 +## Preserves xmm0 +## Clobbers xmm1-xmm5 +## +.def _vpaes_schedule_mangle + .type 32 +.endef +.align 4 +_vpaes_schedule_mangle: + mov v4.16b, v0.16b // vmovdqa %xmm0, %xmm4 # save xmm0 for later + // vmovdqa .Lk_mc_forward(%rip),%xmm5 + + // encrypting + eor v4.16b, v0.16b, v16.16b // vpxor Lk_s63(%rip), %xmm0, %xmm4 + add x2, x2, #16 // add $16, %rdx + tbl v4.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm4 + tbl v1.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm1 + tbl v3.16b, {v1.16b}, v9.16b // vpshufb %xmm5, %xmm1, %xmm3 + eor v4.16b, v4.16b, v1.16b // vpxor %xmm1, %xmm4, %xmm4 + ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1 + eor v3.16b, v3.16b, v4.16b // vpxor %xmm4, %xmm3, %xmm3 + +Lschedule_mangle_both: + tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 + add x8, x8, #48 // add $-16, %r8 + and x8, x8, #~(1<<6) // and $0x30, %r8 + st1 {v3.2d}, [x2] // vmovdqu %xmm3, (%rdx) + ret + + +.globl vpaes_set_encrypt_key + +.def vpaes_set_encrypt_key + .type 32 +.endef +.align 4 +vpaes_set_encrypt_key: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + stp d8,d9,[sp,#-16]! // ABI spec says so + + lsr w9, w1, #5 // shr $5,%eax + add w9, w9, #5 // $5,%eax + str w9, [x2,#240] // mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; + + mov w3, #0 // mov $0,%ecx + mov x8, #0x30 // mov $0x30,%r8d + bl _vpaes_schedule_core + eor x0, x0, x0 + + ldp d8,d9,[sp],#16 + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret + +.globl vpaes_ctr32_encrypt_blocks + +.def vpaes_ctr32_encrypt_blocks + .type 32 +.endef +.align 4 +vpaes_ctr32_encrypt_blocks: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + stp d8,d9,[sp,#-16]! // ABI spec says so + stp d10,d11,[sp,#-16]! + stp d12,d13,[sp,#-16]! + stp d14,d15,[sp,#-16]! + + cbz x2, Lctr32_done + + // Note, unlike the other functions, x2 here is measured in blocks, + // not bytes. + mov x17, x2 + mov x2, x3 + + // Load the IV and counter portion. + ldr w6, [x4, #12] + ld1 {v7.16b}, [x4] + + bl _vpaes_encrypt_preheat + tst x17, #1 + rev w6, w6 // The counter is big-endian. + b.eq Lctr32_prep_loop + + // Handle one block so the remaining block count is even for + // _vpaes_encrypt_2x. + ld1 {v6.16b}, [x0], #16 // Load input ahead of time + bl _vpaes_encrypt_core + eor v0.16b, v0.16b, v6.16b // XOR input and result + st1 {v0.16b}, [x1], #16 + subs x17, x17, #1 + // Update the counter. + add w6, w6, #1 + rev w7, w6 + mov v7.s[3], w7 + b.ls Lctr32_done + +Lctr32_prep_loop: + // _vpaes_encrypt_core takes its input from v7, while _vpaes_encrypt_2x + // uses v14 and v15. + mov v15.16b, v7.16b + mov v14.16b, v7.16b + add w6, w6, #1 + rev w7, w6 + mov v15.s[3], w7 + +Lctr32_loop: + ld1 {v6.16b,v7.16b}, [x0], #32 // Load input ahead of time + bl _vpaes_encrypt_2x + eor v0.16b, v0.16b, v6.16b // XOR input and result + eor v1.16b, v1.16b, v7.16b // XOR input and result (#2) + st1 {v0.16b,v1.16b}, [x1], #32 + subs x17, x17, #2 + // Update the counter. 
+ add w7, w6, #1 + add w6, w6, #2 + rev w7, w7 + mov v14.s[3], w7 + rev w7, w6 + mov v15.s[3], w7 + b.hi Lctr32_loop + +Lctr32_done: + ldp d14,d15,[sp],#16 + ldp d12,d13,[sp],#16 + ldp d10,d11,[sp],#16 + ldp d8,d9,[sp],#16 + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret + +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32) diff --git a/ring-0.17.14/pregenerated/vpaes-x86-elf.S b/ring-0.17.14/pregenerated/vpaes-x86-elf.S new file mode 100644 index 0000000000..e4077ae72d --- /dev/null +++ b/ring-0.17.14/pregenerated/vpaes-x86-elf.S @@ -0,0 +1,422 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__) +.text +#ifdef BORINGSSL_DISPATCH_TEST +#endif +.align 64 +.L_vpaes_consts: +.long 218628480,235210255,168496130,67568393 +.long 252381056,17041926,33884169,51187212 +.long 252645135,252645135,252645135,252645135 +.long 1512730624,3266504856,1377990664,3401244816 +.long 830229760,1275146365,2969422977,3447763452 +.long 3411033600,2979783055,338359620,2782886510 +.long 4209124096,907596821,221174255,1006095553 +.long 191964160,3799684038,3164090317,1589111125 +.long 182528256,1777043520,2877432650,3265356744 +.long 1874708224,3503451415,3305285752,363511674 +.long 1606117888,3487855781,1093350906,2384367825 +.long 197121,67569157,134941193,202313229 +.long 67569157,134941193,202313229,197121 +.long 134941193,202313229,197121,67569157 +.long 202313229,197121,67569157,134941193 +.long 33619971,100992007,168364043,235736079 +.long 235736079,33619971,100992007,168364043 +.long 168364043,235736079,33619971,100992007 +.long 100992007,168364043,235736079,33619971 +.long 50462976,117835012,185207048,252579084 +.long 252314880,51251460,117574920,184942860 +.long 184682752,252054788,50987272,118359308 +.long 118099200,185467140,251790600,50727180 +.long 2946363062,528716217,1300004225,1881839624 +.long 1532713819,1532713819,1532713819,1532713819 +.long 3602276352,4288629033,3737020424,4153884961 +.long 1354558464,32357713,2958822624,3775749553 +.long 1201988352,132424512,1572796698,503232858 +.long 2213177600,1597421020,4103937655,675398315 +.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105 +.byte 111,110,32,65,69,83,32,102,111,114,32,120,56,54,47,83 +.byte 83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117 +.byte 114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105 +.byte 118,101,114,115,105,116,121,41,0 +.align 64 +.hidden _vpaes_preheat +.type _vpaes_preheat,@function +.align 16 +_vpaes_preheat: + addl (%esp),%ebp + movdqa -48(%ebp),%xmm7 + movdqa -16(%ebp),%xmm6 + ret +.size _vpaes_preheat,.-_vpaes_preheat +.hidden _vpaes_encrypt_core +.type _vpaes_encrypt_core,@function +.align 16 +_vpaes_encrypt_core: + movl $16,%ecx + movl 240(%edx),%eax + movdqa %xmm6,%xmm1 + movdqa (%ebp),%xmm2 + pandn %xmm0,%xmm1 + pand %xmm6,%xmm0 + movdqu (%edx),%xmm5 +.byte 102,15,56,0,208 + movdqa 16(%ebp),%xmm0 + pxor %xmm5,%xmm2 + psrld $4,%xmm1 + addl $16,%edx +.byte 102,15,56,0,193 + leal 192(%ebp),%ebx + pxor %xmm2,%xmm0 + jmp .L000enc_entry +.align 16 +.L001enc_loop: + movdqa 32(%ebp),%xmm4 + movdqa 48(%ebp),%xmm0 +.byte 102,15,56,0,226 +.byte 102,15,56,0,195 + pxor %xmm5,%xmm4 + movdqa 64(%ebp),%xmm5 + pxor %xmm4,%xmm0 + movdqa -64(%ebx,%ecx,1),%xmm1 +.byte 102,15,56,0,234 + movdqa 80(%ebp),%xmm2 + movdqa (%ebx,%ecx,1),%xmm4 +.byte 102,15,56,0,211 + movdqa %xmm0,%xmm3 + pxor %xmm5,%xmm2 +.byte 102,15,56,0,193 + 
addl $16,%edx + pxor %xmm2,%xmm0 +.byte 102,15,56,0,220 + addl $16,%ecx + pxor %xmm0,%xmm3 +.byte 102,15,56,0,193 + andl $48,%ecx + subl $1,%eax + pxor %xmm3,%xmm0 +.L000enc_entry: + movdqa %xmm6,%xmm1 + movdqa -32(%ebp),%xmm5 + pandn %xmm0,%xmm1 + psrld $4,%xmm1 + pand %xmm6,%xmm0 +.byte 102,15,56,0,232 + movdqa %xmm7,%xmm3 + pxor %xmm1,%xmm0 +.byte 102,15,56,0,217 + movdqa %xmm7,%xmm4 + pxor %xmm5,%xmm3 +.byte 102,15,56,0,224 + movdqa %xmm7,%xmm2 + pxor %xmm5,%xmm4 +.byte 102,15,56,0,211 + movdqa %xmm7,%xmm3 + pxor %xmm0,%xmm2 +.byte 102,15,56,0,220 + movdqu (%edx),%xmm5 + pxor %xmm1,%xmm3 + jnz .L001enc_loop + movdqa 96(%ebp),%xmm4 + movdqa 112(%ebp),%xmm0 +.byte 102,15,56,0,226 + pxor %xmm5,%xmm4 +.byte 102,15,56,0,195 + movdqa 64(%ebx,%ecx,1),%xmm1 + pxor %xmm4,%xmm0 +.byte 102,15,56,0,193 + ret +.size _vpaes_encrypt_core,.-_vpaes_encrypt_core +.hidden _vpaes_schedule_core +.type _vpaes_schedule_core,@function +.align 16 +_vpaes_schedule_core: + addl (%esp),%ebp + movdqu (%esi),%xmm0 + movdqa 320(%ebp),%xmm2 + movdqa %xmm0,%xmm3 + leal (%ebp),%ebx + movdqa %xmm2,4(%esp) + call _vpaes_schedule_transform + movdqa %xmm0,%xmm7 + testl %edi,%edi + jnz .L002schedule_am_decrypting + movdqu %xmm0,(%edx) + jmp .L003schedule_go +.L002schedule_am_decrypting: + movdqa 256(%ebp,%ecx,1),%xmm1 +.byte 102,15,56,0,217 + movdqu %xmm3,(%edx) + xorl $48,%ecx +.L003schedule_go: + cmpl $192,%eax + ja .L004schedule_256 +.L005schedule_128: + movl $10,%eax +.L006loop_schedule_128: + call _vpaes_schedule_round + decl %eax + jz .L007schedule_mangle_last + call _vpaes_schedule_mangle + jmp .L006loop_schedule_128 +.align 16 +.L004schedule_256: + movdqu 16(%esi),%xmm0 + call _vpaes_schedule_transform + movl $7,%eax +.L008loop_schedule_256: + call _vpaes_schedule_mangle + movdqa %xmm0,%xmm6 + call _vpaes_schedule_round + decl %eax + jz .L007schedule_mangle_last + call _vpaes_schedule_mangle + pshufd $255,%xmm0,%xmm0 + movdqa %xmm7,20(%esp) + movdqa %xmm6,%xmm7 + call .L_vpaes_schedule_low_round + movdqa 20(%esp),%xmm7 + jmp .L008loop_schedule_256 +.align 16 +.L007schedule_mangle_last: + leal 384(%ebp),%ebx + testl %edi,%edi + jnz .L009schedule_mangle_last_dec + movdqa 256(%ebp,%ecx,1),%xmm1 +.byte 102,15,56,0,193 + leal 352(%ebp),%ebx + addl $32,%edx +.L009schedule_mangle_last_dec: + addl $-16,%edx + pxor 336(%ebp),%xmm0 + call _vpaes_schedule_transform + movdqu %xmm0,(%edx) + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + ret +.size _vpaes_schedule_core,.-_vpaes_schedule_core +.hidden _vpaes_schedule_round +.type _vpaes_schedule_round,@function +.align 16 +_vpaes_schedule_round: + movdqa 8(%esp),%xmm2 + pxor %xmm1,%xmm1 +.byte 102,15,58,15,202,15 +.byte 102,15,58,15,210,15 + pxor %xmm1,%xmm7 + pshufd $255,%xmm0,%xmm0 +.byte 102,15,58,15,192,1 + movdqa %xmm2,8(%esp) +.L_vpaes_schedule_low_round: + movdqa %xmm7,%xmm1 + pslldq $4,%xmm7 + pxor %xmm1,%xmm7 + movdqa %xmm7,%xmm1 + pslldq $8,%xmm7 + pxor %xmm1,%xmm7 + pxor 336(%ebp),%xmm7 + movdqa -16(%ebp),%xmm4 + movdqa -48(%ebp),%xmm5 + movdqa %xmm4,%xmm1 + pandn %xmm0,%xmm1 + psrld $4,%xmm1 + pand %xmm4,%xmm0 + movdqa -32(%ebp),%xmm2 +.byte 102,15,56,0,208 + pxor %xmm1,%xmm0 + movdqa %xmm5,%xmm3 +.byte 102,15,56,0,217 + pxor %xmm2,%xmm3 + movdqa %xmm5,%xmm4 +.byte 102,15,56,0,224 + pxor %xmm2,%xmm4 + movdqa %xmm5,%xmm2 +.byte 102,15,56,0,211 + pxor %xmm0,%xmm2 + movdqa %xmm5,%xmm3 +.byte 102,15,56,0,220 + pxor %xmm1,%xmm3 + movdqa 32(%ebp),%xmm4 +.byte 
102,15,56,0,226 + movdqa 48(%ebp),%xmm0 +.byte 102,15,56,0,195 + pxor %xmm4,%xmm0 + pxor %xmm7,%xmm0 + movdqa %xmm0,%xmm7 + ret +.size _vpaes_schedule_round,.-_vpaes_schedule_round +.hidden _vpaes_schedule_transform +.type _vpaes_schedule_transform,@function +.align 16 +_vpaes_schedule_transform: + movdqa -16(%ebp),%xmm2 + movdqa %xmm2,%xmm1 + pandn %xmm0,%xmm1 + psrld $4,%xmm1 + pand %xmm2,%xmm0 + movdqa (%ebx),%xmm2 +.byte 102,15,56,0,208 + movdqa 16(%ebx),%xmm0 +.byte 102,15,56,0,193 + pxor %xmm2,%xmm0 + ret +.size _vpaes_schedule_transform,.-_vpaes_schedule_transform +.hidden _vpaes_schedule_mangle +.type _vpaes_schedule_mangle,@function +.align 16 +_vpaes_schedule_mangle: + movdqa %xmm0,%xmm4 + movdqa 128(%ebp),%xmm5 + testl %edi,%edi + jnz .L010schedule_mangle_dec + addl $16,%edx + pxor 336(%ebp),%xmm4 +.byte 102,15,56,0,229 + movdqa %xmm4,%xmm3 +.byte 102,15,56,0,229 + pxor %xmm4,%xmm3 +.byte 102,15,56,0,229 + pxor %xmm4,%xmm3 + jmp .L011schedule_mangle_both +.align 16 +.L010schedule_mangle_dec: + movdqa -16(%ebp),%xmm2 + leal (%ebp),%esi + movdqa %xmm2,%xmm1 + pandn %xmm4,%xmm1 + psrld $4,%xmm1 + pand %xmm2,%xmm4 + movdqa (%esi),%xmm2 +.byte 102,15,56,0,212 + movdqa 16(%esi),%xmm3 +.byte 102,15,56,0,217 + pxor %xmm2,%xmm3 +.byte 102,15,56,0,221 + movdqa 32(%esi),%xmm2 +.byte 102,15,56,0,212 + pxor %xmm3,%xmm2 + movdqa 48(%esi),%xmm3 +.byte 102,15,56,0,217 + pxor %xmm2,%xmm3 +.byte 102,15,56,0,221 + movdqa 64(%esi),%xmm2 +.byte 102,15,56,0,212 + pxor %xmm3,%xmm2 + movdqa 80(%esi),%xmm3 +.byte 102,15,56,0,217 + pxor %xmm2,%xmm3 +.byte 102,15,56,0,221 + movdqa 96(%esi),%xmm2 +.byte 102,15,56,0,212 + pxor %xmm3,%xmm2 + movdqa 112(%esi),%xmm3 +.byte 102,15,56,0,217 + pxor %xmm2,%xmm3 + addl $-16,%edx +.L011schedule_mangle_both: + movdqa 256(%ebp,%ecx,1),%xmm1 +.byte 102,15,56,0,217 + addl $-16,%ecx + andl $48,%ecx + movdqu %xmm3,(%edx) + ret +.size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle +.globl vpaes_set_encrypt_key +.hidden vpaes_set_encrypt_key +.type vpaes_set_encrypt_key,@function +.align 16 +vpaes_set_encrypt_key: +.L_vpaes_set_encrypt_key_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi +#ifdef BORINGSSL_DISPATCH_TEST + pushl %ebx + pushl %edx + call .L012pic_for_function_hit +.L012pic_for_function_hit: + popl %ebx + leal BORINGSSL_function_hit+5-.L012pic_for_function_hit(%ebx),%ebx + movl $1,%edx + movb %dl,(%ebx) + popl %edx + popl %ebx +#endif + movl 20(%esp),%esi + leal -56(%esp),%ebx + movl 24(%esp),%eax + andl $-16,%ebx + movl 28(%esp),%edx + xchgl %esp,%ebx + movl %ebx,48(%esp) + movl %eax,%ebx + shrl $5,%ebx + addl $5,%ebx + movl %ebx,240(%edx) + movl $48,%ecx + movl $0,%edi + leal .L_vpaes_consts+0x30-.L013pic_point,%ebp + call _vpaes_schedule_core +.L013pic_point: + movl 48(%esp),%esp + xorl %eax,%eax + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size vpaes_set_encrypt_key,.-.L_vpaes_set_encrypt_key_begin +.globl vpaes_encrypt +.hidden vpaes_encrypt +.type vpaes_encrypt,@function +.align 16 +vpaes_encrypt: +.L_vpaes_encrypt_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi +#ifdef BORINGSSL_DISPATCH_TEST + pushl %ebx + pushl %edx + call .L014pic_for_function_hit +.L014pic_for_function_hit: + popl %ebx + leal BORINGSSL_function_hit+4-.L014pic_for_function_hit(%ebx),%ebx + movl $1,%edx + movb %dl,(%ebx) + popl %edx + popl %ebx +#endif + leal .L_vpaes_consts+0x30-.L015pic_point,%ebp + call _vpaes_preheat +.L015pic_point: + movl 20(%esp),%esi + leal -56(%esp),%ebx + movl 24(%esp),%edi + andl $-16,%ebx + movl 28(%esp),%edx + xchgl 
%esp,%ebx + movl %ebx,48(%esp) + movdqu (%esi),%xmm0 + call _vpaes_encrypt_core + movdqu %xmm0,(%edi) + movl 48(%esp),%esp + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size vpaes_encrypt,.-.L_vpaes_encrypt_begin +#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__) diff --git a/ring-0.17.14/pregenerated/vpaes-x86-win32n.asm b/ring-0.17.14/pregenerated/vpaes-x86-win32n.asm new file mode 100644 index 0000000000..f52f7ca210 --- /dev/null +++ b/ring-0.17.14/pregenerated/vpaes-x86-win32n.asm @@ -0,0 +1,408 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. + +%include "ring_core_generated/prefix_symbols_nasm.inc" +%ifidn __OUTPUT_FORMAT__, win32 +%ifidn __OUTPUT_FORMAT__,obj +section code use32 class=code align=64 +%elifidn __OUTPUT_FORMAT__,win32 +$@feat.00 equ 1 +section .text code align=64 +%else +section .text code +%endif +%ifdef BORINGSSL_DISPATCH_TEST +extern _BORINGSSL_function_hit +%endif +align 64 +L$_vpaes_consts: +dd 218628480,235210255,168496130,67568393 +dd 252381056,17041926,33884169,51187212 +dd 252645135,252645135,252645135,252645135 +dd 1512730624,3266504856,1377990664,3401244816 +dd 830229760,1275146365,2969422977,3447763452 +dd 3411033600,2979783055,338359620,2782886510 +dd 4209124096,907596821,221174255,1006095553 +dd 191964160,3799684038,3164090317,1589111125 +dd 182528256,1777043520,2877432650,3265356744 +dd 1874708224,3503451415,3305285752,363511674 +dd 1606117888,3487855781,1093350906,2384367825 +dd 197121,67569157,134941193,202313229 +dd 67569157,134941193,202313229,197121 +dd 134941193,202313229,197121,67569157 +dd 202313229,197121,67569157,134941193 +dd 33619971,100992007,168364043,235736079 +dd 235736079,33619971,100992007,168364043 +dd 168364043,235736079,33619971,100992007 +dd 100992007,168364043,235736079,33619971 +dd 50462976,117835012,185207048,252579084 +dd 252314880,51251460,117574920,184942860 +dd 184682752,252054788,50987272,118359308 +dd 118099200,185467140,251790600,50727180 +dd 2946363062,528716217,1300004225,1881839624 +dd 1532713819,1532713819,1532713819,1532713819 +dd 3602276352,4288629033,3737020424,4153884961 +dd 1354558464,32357713,2958822624,3775749553 +dd 1201988352,132424512,1572796698,503232858 +dd 2213177600,1597421020,4103937655,675398315 +db 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105 +db 111,110,32,65,69,83,32,102,111,114,32,120,56,54,47,83 +db 83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117 +db 114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105 +db 118,101,114,115,105,116,121,41,0 +align 64 +align 16 +__vpaes_preheat: + add ebp,DWORD [esp] + movdqa xmm7,[ebp-48] + movdqa xmm6,[ebp-16] + ret +align 16 +__vpaes_encrypt_core: + mov ecx,16 + mov eax,DWORD [240+edx] + movdqa xmm1,xmm6 + movdqa xmm2,[ebp] + pandn xmm1,xmm0 + pand xmm0,xmm6 + movdqu xmm5,[edx] +db 102,15,56,0,208 + movdqa xmm0,[16+ebp] + pxor xmm2,xmm5 + psrld xmm1,4 + add edx,16 +db 102,15,56,0,193 + lea ebx,[192+ebp] + pxor xmm0,xmm2 + jmp NEAR L$000enc_entry +align 16 +L$001enc_loop: + movdqa xmm4,[32+ebp] + movdqa xmm0,[48+ebp] +db 102,15,56,0,226 +db 102,15,56,0,195 + pxor xmm4,xmm5 + movdqa xmm5,[64+ebp] + pxor xmm0,xmm4 + movdqa xmm1,[ecx*1+ebx-64] +db 102,15,56,0,234 + movdqa xmm2,[80+ebp] + movdqa xmm4,[ecx*1+ebx] +db 102,15,56,0,211 + movdqa xmm3,xmm0 + pxor xmm2,xmm5 +db 102,15,56,0,193 + add edx,16 + pxor xmm0,xmm2 +db 102,15,56,0,220 + add ecx,16 + pxor xmm3,xmm0 +db 102,15,56,0,193 + and ecx,48 + sub eax,1 + pxor xmm0,xmm3 
+L$000enc_entry: + movdqa xmm1,xmm6 + movdqa xmm5,[ebp-32] + pandn xmm1,xmm0 + psrld xmm1,4 + pand xmm0,xmm6 +db 102,15,56,0,232 + movdqa xmm3,xmm7 + pxor xmm0,xmm1 +db 102,15,56,0,217 + movdqa xmm4,xmm7 + pxor xmm3,xmm5 +db 102,15,56,0,224 + movdqa xmm2,xmm7 + pxor xmm4,xmm5 +db 102,15,56,0,211 + movdqa xmm3,xmm7 + pxor xmm2,xmm0 +db 102,15,56,0,220 + movdqu xmm5,[edx] + pxor xmm3,xmm1 + jnz NEAR L$001enc_loop + movdqa xmm4,[96+ebp] + movdqa xmm0,[112+ebp] +db 102,15,56,0,226 + pxor xmm4,xmm5 +db 102,15,56,0,195 + movdqa xmm1,[64+ecx*1+ebx] + pxor xmm0,xmm4 +db 102,15,56,0,193 + ret +align 16 +__vpaes_schedule_core: + add ebp,DWORD [esp] + movdqu xmm0,[esi] + movdqa xmm2,[320+ebp] + movdqa xmm3,xmm0 + lea ebx,[ebp] + movdqa [4+esp],xmm2 + call __vpaes_schedule_transform + movdqa xmm7,xmm0 + test edi,edi + jnz NEAR L$002schedule_am_decrypting + movdqu [edx],xmm0 + jmp NEAR L$003schedule_go +L$002schedule_am_decrypting: + movdqa xmm1,[256+ecx*1+ebp] +db 102,15,56,0,217 + movdqu [edx],xmm3 + xor ecx,48 +L$003schedule_go: + cmp eax,192 + ja NEAR L$004schedule_256 +L$005schedule_128: + mov eax,10 +L$006loop_schedule_128: + call __vpaes_schedule_round + dec eax + jz NEAR L$007schedule_mangle_last + call __vpaes_schedule_mangle + jmp NEAR L$006loop_schedule_128 +align 16 +L$004schedule_256: + movdqu xmm0,[16+esi] + call __vpaes_schedule_transform + mov eax,7 +L$008loop_schedule_256: + call __vpaes_schedule_mangle + movdqa xmm6,xmm0 + call __vpaes_schedule_round + dec eax + jz NEAR L$007schedule_mangle_last + call __vpaes_schedule_mangle + pshufd xmm0,xmm0,255 + movdqa [20+esp],xmm7 + movdqa xmm7,xmm6 + call L$_vpaes_schedule_low_round + movdqa xmm7,[20+esp] + jmp NEAR L$008loop_schedule_256 +align 16 +L$007schedule_mangle_last: + lea ebx,[384+ebp] + test edi,edi + jnz NEAR L$009schedule_mangle_last_dec + movdqa xmm1,[256+ecx*1+ebp] +db 102,15,56,0,193 + lea ebx,[352+ebp] + add edx,32 +L$009schedule_mangle_last_dec: + add edx,-16 + pxor xmm0,[336+ebp] + call __vpaes_schedule_transform + movdqu [edx],xmm0 + pxor xmm0,xmm0 + pxor xmm1,xmm1 + pxor xmm2,xmm2 + pxor xmm3,xmm3 + pxor xmm4,xmm4 + pxor xmm5,xmm5 + pxor xmm6,xmm6 + pxor xmm7,xmm7 + ret +align 16 +__vpaes_schedule_round: + movdqa xmm2,[8+esp] + pxor xmm1,xmm1 +db 102,15,58,15,202,15 +db 102,15,58,15,210,15 + pxor xmm7,xmm1 + pshufd xmm0,xmm0,255 +db 102,15,58,15,192,1 + movdqa [8+esp],xmm2 +L$_vpaes_schedule_low_round: + movdqa xmm1,xmm7 + pslldq xmm7,4 + pxor xmm7,xmm1 + movdqa xmm1,xmm7 + pslldq xmm7,8 + pxor xmm7,xmm1 + pxor xmm7,[336+ebp] + movdqa xmm4,[ebp-16] + movdqa xmm5,[ebp-48] + movdqa xmm1,xmm4 + pandn xmm1,xmm0 + psrld xmm1,4 + pand xmm0,xmm4 + movdqa xmm2,[ebp-32] +db 102,15,56,0,208 + pxor xmm0,xmm1 + movdqa xmm3,xmm5 +db 102,15,56,0,217 + pxor xmm3,xmm2 + movdqa xmm4,xmm5 +db 102,15,56,0,224 + pxor xmm4,xmm2 + movdqa xmm2,xmm5 +db 102,15,56,0,211 + pxor xmm2,xmm0 + movdqa xmm3,xmm5 +db 102,15,56,0,220 + pxor xmm3,xmm1 + movdqa xmm4,[32+ebp] +db 102,15,56,0,226 + movdqa xmm0,[48+ebp] +db 102,15,56,0,195 + pxor xmm0,xmm4 + pxor xmm0,xmm7 + movdqa xmm7,xmm0 + ret +align 16 +__vpaes_schedule_transform: + movdqa xmm2,[ebp-16] + movdqa xmm1,xmm2 + pandn xmm1,xmm0 + psrld xmm1,4 + pand xmm0,xmm2 + movdqa xmm2,[ebx] +db 102,15,56,0,208 + movdqa xmm0,[16+ebx] +db 102,15,56,0,193 + pxor xmm0,xmm2 + ret +align 16 +__vpaes_schedule_mangle: + movdqa xmm4,xmm0 + movdqa xmm5,[128+ebp] + test edi,edi + jnz NEAR L$010schedule_mangle_dec + add edx,16 + pxor xmm4,[336+ebp] +db 102,15,56,0,229 + movdqa xmm3,xmm4 +db 102,15,56,0,229 + pxor 
xmm3,xmm4 +db 102,15,56,0,229 + pxor xmm3,xmm4 + jmp NEAR L$011schedule_mangle_both +align 16 +L$010schedule_mangle_dec: + movdqa xmm2,[ebp-16] + lea esi,[ebp] + movdqa xmm1,xmm2 + pandn xmm1,xmm4 + psrld xmm1,4 + pand xmm4,xmm2 + movdqa xmm2,[esi] +db 102,15,56,0,212 + movdqa xmm3,[16+esi] +db 102,15,56,0,217 + pxor xmm3,xmm2 +db 102,15,56,0,221 + movdqa xmm2,[32+esi] +db 102,15,56,0,212 + pxor xmm2,xmm3 + movdqa xmm3,[48+esi] +db 102,15,56,0,217 + pxor xmm3,xmm2 +db 102,15,56,0,221 + movdqa xmm2,[64+esi] +db 102,15,56,0,212 + pxor xmm2,xmm3 + movdqa xmm3,[80+esi] +db 102,15,56,0,217 + pxor xmm3,xmm2 +db 102,15,56,0,221 + movdqa xmm2,[96+esi] +db 102,15,56,0,212 + pxor xmm2,xmm3 + movdqa xmm3,[112+esi] +db 102,15,56,0,217 + pxor xmm3,xmm2 + add edx,-16 +L$011schedule_mangle_both: + movdqa xmm1,[256+ecx*1+ebp] +db 102,15,56,0,217 + add ecx,-16 + and ecx,48 + movdqu [edx],xmm3 + ret +global _vpaes_set_encrypt_key +align 16 +_vpaes_set_encrypt_key: +L$_vpaes_set_encrypt_key_begin: + push ebp + push ebx + push esi + push edi +%ifdef BORINGSSL_DISPATCH_TEST + push ebx + push edx + call L$012pic_for_function_hit +L$012pic_for_function_hit: + pop ebx + lea ebx,[(_BORINGSSL_function_hit+5-L$012pic_for_function_hit)+ebx] + mov edx,1 + mov BYTE [ebx],dl + pop edx + pop ebx +%endif + mov esi,DWORD [20+esp] + lea ebx,[esp-56] + mov eax,DWORD [24+esp] + and ebx,-16 + mov edx,DWORD [28+esp] + xchg ebx,esp + mov DWORD [48+esp],ebx + mov ebx,eax + shr ebx,5 + add ebx,5 + mov DWORD [240+edx],ebx + mov ecx,48 + mov edi,0 + lea ebp,[(L$_vpaes_consts+0x30-L$013pic_point)] + call __vpaes_schedule_core +L$013pic_point: + mov esp,DWORD [48+esp] + xor eax,eax + pop edi + pop esi + pop ebx + pop ebp + ret +global _vpaes_encrypt +align 16 +_vpaes_encrypt: +L$_vpaes_encrypt_begin: + push ebp + push ebx + push esi + push edi +%ifdef BORINGSSL_DISPATCH_TEST + push ebx + push edx + call L$014pic_for_function_hit +L$014pic_for_function_hit: + pop ebx + lea ebx,[(_BORINGSSL_function_hit+4-L$014pic_for_function_hit)+ebx] + mov edx,1 + mov BYTE [ebx],dl + pop edx + pop ebx +%endif + lea ebp,[(L$_vpaes_consts+0x30-L$015pic_point)] + call __vpaes_preheat +L$015pic_point: + mov esi,DWORD [20+esp] + lea ebx,[esp-56] + mov edi,DWORD [24+esp] + and ebx,-16 + mov edx,DWORD [28+esp] + xchg ebx,esp + mov DWORD [48+esp],ebx + movdqu xmm0,[esi] + call __vpaes_encrypt_core + movdqu [edi],xmm0 + mov esp,DWORD [48+esp] + pop edi + pop esi + pop ebx + pop ebp + ret +%else +; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 +ret +%endif diff --git a/ring-0.17.14/pregenerated/vpaes-x86-win32n.o b/ring-0.17.14/pregenerated/vpaes-x86-win32n.o new file mode 100644 index 0000000000..b630709091 Binary files /dev/null and b/ring-0.17.14/pregenerated/vpaes-x86-win32n.o differ diff --git a/ring-0.17.14/pregenerated/vpaes-x86_64-elf.S b/ring-0.17.14/pregenerated/vpaes-x86_64-elf.S new file mode 100644 index 0000000000..146439f6b8 --- /dev/null +++ b/ring-0.17.14/pregenerated/vpaes-x86_64-elf.S @@ -0,0 +1,758 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__) +.text + + + + + + + + + + + + + + + + +.type _vpaes_encrypt_core,@function +.align 16 +_vpaes_encrypt_core: +.cfi_startproc + movq %rdx,%r9 + movq $16,%r11 + movl 240(%rdx),%eax + movdqa %xmm9,%xmm1 + movdqa .Lk_ipt(%rip),%xmm2 + pandn %xmm0,%xmm1 + movdqu (%r9),%xmm5 + psrld $4,%xmm1 + pand %xmm9,%xmm0 +.byte 102,15,56,0,208 + movdqa .Lk_ipt+16(%rip),%xmm0 +.byte 102,15,56,0,193 + pxor %xmm5,%xmm2 + addq $16,%r9 + pxor %xmm2,%xmm0 + leaq .Lk_mc_backward(%rip),%r10 + jmp .Lenc_entry + +.align 16 +.Lenc_loop: + + movdqa %xmm13,%xmm4 + movdqa %xmm12,%xmm0 +.byte 102,15,56,0,226 +.byte 102,15,56,0,195 + pxor %xmm5,%xmm4 + movdqa %xmm15,%xmm5 + pxor %xmm4,%xmm0 + movdqa -64(%r11,%r10,1),%xmm1 +.byte 102,15,56,0,234 + movdqa (%r11,%r10,1),%xmm4 + movdqa %xmm14,%xmm2 +.byte 102,15,56,0,211 + movdqa %xmm0,%xmm3 + pxor %xmm5,%xmm2 +.byte 102,15,56,0,193 + addq $16,%r9 + pxor %xmm2,%xmm0 +.byte 102,15,56,0,220 + addq $16,%r11 + pxor %xmm0,%xmm3 +.byte 102,15,56,0,193 + andq $0x30,%r11 + subq $1,%rax + pxor %xmm3,%xmm0 + +.Lenc_entry: + + movdqa %xmm9,%xmm1 + movdqa %xmm11,%xmm5 + pandn %xmm0,%xmm1 + psrld $4,%xmm1 + pand %xmm9,%xmm0 +.byte 102,15,56,0,232 + movdqa %xmm10,%xmm3 + pxor %xmm1,%xmm0 +.byte 102,15,56,0,217 + movdqa %xmm10,%xmm4 + pxor %xmm5,%xmm3 +.byte 102,15,56,0,224 + movdqa %xmm10,%xmm2 + pxor %xmm5,%xmm4 +.byte 102,15,56,0,211 + movdqa %xmm10,%xmm3 + pxor %xmm0,%xmm2 +.byte 102,15,56,0,220 + movdqu (%r9),%xmm5 + pxor %xmm1,%xmm3 + jnz .Lenc_loop + + + movdqa -96(%r10),%xmm4 + movdqa -80(%r10),%xmm0 +.byte 102,15,56,0,226 + pxor %xmm5,%xmm4 +.byte 102,15,56,0,195 + movdqa 64(%r11,%r10,1),%xmm1 + pxor %xmm4,%xmm0 +.byte 102,15,56,0,193 + ret +.cfi_endproc +.size _vpaes_encrypt_core,.-_vpaes_encrypt_core + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +.type _vpaes_encrypt_core_2x,@function +.align 16 +_vpaes_encrypt_core_2x: +.cfi_startproc + movq %rdx,%r9 + movq $16,%r11 + movl 240(%rdx),%eax + movdqa %xmm9,%xmm1 + movdqa %xmm9,%xmm7 + movdqa .Lk_ipt(%rip),%xmm2 + movdqa %xmm2,%xmm8 + pandn %xmm0,%xmm1 + pandn %xmm6,%xmm7 + movdqu (%r9),%xmm5 + + psrld $4,%xmm1 + psrld $4,%xmm7 + pand %xmm9,%xmm0 + pand %xmm9,%xmm6 +.byte 102,15,56,0,208 +.byte 102,68,15,56,0,198 + movdqa .Lk_ipt+16(%rip),%xmm0 + movdqa %xmm0,%xmm6 +.byte 102,15,56,0,193 +.byte 102,15,56,0,247 + pxor %xmm5,%xmm2 + pxor %xmm5,%xmm8 + addq $16,%r9 + pxor %xmm2,%xmm0 + pxor %xmm8,%xmm6 + leaq .Lk_mc_backward(%rip),%r10 + jmp .Lenc2x_entry + +.align 16 +.Lenc2x_loop: + + movdqa .Lk_sb1(%rip),%xmm4 + movdqa .Lk_sb1+16(%rip),%xmm0 + movdqa %xmm4,%xmm12 + movdqa %xmm0,%xmm6 +.byte 102,15,56,0,226 +.byte 102,69,15,56,0,224 +.byte 102,15,56,0,195 +.byte 102,65,15,56,0,243 + pxor %xmm5,%xmm4 + pxor %xmm5,%xmm12 + movdqa .Lk_sb2(%rip),%xmm5 + movdqa %xmm5,%xmm13 + pxor %xmm4,%xmm0 + pxor %xmm12,%xmm6 + movdqa -64(%r11,%r10,1),%xmm1 + +.byte 102,15,56,0,234 +.byte 102,69,15,56,0,232 + movdqa (%r11,%r10,1),%xmm4 + + movdqa .Lk_sb2+16(%rip),%xmm2 + movdqa %xmm2,%xmm8 +.byte 102,15,56,0,211 +.byte 102,69,15,56,0,195 + movdqa %xmm0,%xmm3 + movdqa %xmm6,%xmm11 + pxor %xmm5,%xmm2 + pxor %xmm13,%xmm8 +.byte 102,15,56,0,193 +.byte 102,15,56,0,241 + addq $16,%r9 + pxor %xmm2,%xmm0 + pxor %xmm8,%xmm6 +.byte 102,15,56,0,220 +.byte 102,68,15,56,0,220 + addq $16,%r11 + pxor %xmm0,%xmm3 + pxor %xmm6,%xmm11 +.byte 102,15,56,0,193 +.byte 102,15,56,0,241 + andq $0x30,%r11 + subq $1,%rax + pxor %xmm3,%xmm0 + pxor %xmm11,%xmm6 + +.Lenc2x_entry: 
+ + movdqa %xmm9,%xmm1 + movdqa %xmm9,%xmm7 + movdqa .Lk_inv+16(%rip),%xmm5 + movdqa %xmm5,%xmm13 + pandn %xmm0,%xmm1 + pandn %xmm6,%xmm7 + psrld $4,%xmm1 + psrld $4,%xmm7 + pand %xmm9,%xmm0 + pand %xmm9,%xmm6 +.byte 102,15,56,0,232 +.byte 102,68,15,56,0,238 + movdqa %xmm10,%xmm3 + movdqa %xmm10,%xmm11 + pxor %xmm1,%xmm0 + pxor %xmm7,%xmm6 +.byte 102,15,56,0,217 +.byte 102,68,15,56,0,223 + movdqa %xmm10,%xmm4 + movdqa %xmm10,%xmm12 + pxor %xmm5,%xmm3 + pxor %xmm13,%xmm11 +.byte 102,15,56,0,224 +.byte 102,68,15,56,0,230 + movdqa %xmm10,%xmm2 + movdqa %xmm10,%xmm8 + pxor %xmm5,%xmm4 + pxor %xmm13,%xmm12 +.byte 102,15,56,0,211 +.byte 102,69,15,56,0,195 + movdqa %xmm10,%xmm3 + movdqa %xmm10,%xmm11 + pxor %xmm0,%xmm2 + pxor %xmm6,%xmm8 +.byte 102,15,56,0,220 +.byte 102,69,15,56,0,220 + movdqu (%r9),%xmm5 + + pxor %xmm1,%xmm3 + pxor %xmm7,%xmm11 + jnz .Lenc2x_loop + + + movdqa -96(%r10),%xmm4 + movdqa -80(%r10),%xmm0 + movdqa %xmm4,%xmm12 + movdqa %xmm0,%xmm6 +.byte 102,15,56,0,226 +.byte 102,69,15,56,0,224 + pxor %xmm5,%xmm4 + pxor %xmm5,%xmm12 +.byte 102,15,56,0,195 +.byte 102,65,15,56,0,243 + movdqa 64(%r11,%r10,1),%xmm1 + + pxor %xmm4,%xmm0 + pxor %xmm12,%xmm6 +.byte 102,15,56,0,193 +.byte 102,15,56,0,241 + ret +.cfi_endproc +.size _vpaes_encrypt_core_2x,.-_vpaes_encrypt_core_2x + + + + + + +.type _vpaes_schedule_core,@function +.align 16 +_vpaes_schedule_core: +.cfi_startproc + + + + + + call _vpaes_preheat + movdqa .Lk_rcon(%rip),%xmm8 + movdqu (%rdi),%xmm0 + + + movdqa %xmm0,%xmm3 + leaq .Lk_ipt(%rip),%r11 + call _vpaes_schedule_transform + movdqa %xmm0,%xmm7 + + leaq .Lk_sr(%rip),%r10 + + + movdqu %xmm0,(%rdx) + +.Lschedule_go: + cmpl $192,%esi + ja .Lschedule_256 + + + + + + + + + + + +.Lschedule_128: + movl $10,%esi + +.Loop_schedule_128: + call _vpaes_schedule_round + decq %rsi + jz .Lschedule_mangle_last + call _vpaes_schedule_mangle + jmp .Loop_schedule_128 + + + + + + + + + + + +.align 16 +.Lschedule_256: + movdqu 16(%rdi),%xmm0 + call _vpaes_schedule_transform + movl $7,%esi + +.Loop_schedule_256: + call _vpaes_schedule_mangle + movdqa %xmm0,%xmm6 + + + call _vpaes_schedule_round + decq %rsi + jz .Lschedule_mangle_last + call _vpaes_schedule_mangle + + + pshufd $0xFF,%xmm0,%xmm0 + movdqa %xmm7,%xmm5 + movdqa %xmm6,%xmm7 + call _vpaes_schedule_low_round + movdqa %xmm5,%xmm7 + + jmp .Loop_schedule_256 + + + + + + + + + + + + +.align 16 +.Lschedule_mangle_last: + + leaq .Lk_deskew(%rip),%r11 + + + movdqa (%r8,%r10,1),%xmm1 +.byte 102,15,56,0,193 + leaq .Lk_opt(%rip),%r11 + addq $32,%rdx + +.Lschedule_mangle_last_dec: + addq $-16,%rdx + pxor .Lk_s63(%rip),%xmm0 + call _vpaes_schedule_transform + movdqu %xmm0,(%rdx) + + + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + ret +.cfi_endproc +.size _vpaes_schedule_core,.-_vpaes_schedule_core + + + + + + + + + + + + + + + + + + + +.type _vpaes_schedule_round,@function +.align 16 +_vpaes_schedule_round: +.cfi_startproc + + pxor %xmm1,%xmm1 +.byte 102,65,15,58,15,200,15 +.byte 102,69,15,58,15,192,15 + pxor %xmm1,%xmm7 + + + pshufd $0xFF,%xmm0,%xmm0 +.byte 102,15,58,15,192,1 + + + + +_vpaes_schedule_low_round: + + movdqa %xmm7,%xmm1 + pslldq $4,%xmm7 + pxor %xmm1,%xmm7 + movdqa %xmm7,%xmm1 + pslldq $8,%xmm7 + pxor %xmm1,%xmm7 + pxor .Lk_s63(%rip),%xmm7 + + + movdqa %xmm9,%xmm1 + pandn %xmm0,%xmm1 + psrld $4,%xmm1 + pand %xmm9,%xmm0 + movdqa %xmm11,%xmm2 +.byte 102,15,56,0,208 + pxor %xmm1,%xmm0 + movdqa %xmm10,%xmm3 +.byte 102,15,56,0,217 + 
pxor %xmm2,%xmm3 + movdqa %xmm10,%xmm4 +.byte 102,15,56,0,224 + pxor %xmm2,%xmm4 + movdqa %xmm10,%xmm2 +.byte 102,15,56,0,211 + pxor %xmm0,%xmm2 + movdqa %xmm10,%xmm3 +.byte 102,15,56,0,220 + pxor %xmm1,%xmm3 + movdqa %xmm13,%xmm4 +.byte 102,15,56,0,226 + movdqa %xmm12,%xmm0 +.byte 102,15,56,0,195 + pxor %xmm4,%xmm0 + + + pxor %xmm7,%xmm0 + movdqa %xmm0,%xmm7 + ret +.cfi_endproc +.size _vpaes_schedule_round,.-_vpaes_schedule_round + + + + + + + + + + +.type _vpaes_schedule_transform,@function +.align 16 +_vpaes_schedule_transform: +.cfi_startproc + movdqa %xmm9,%xmm1 + pandn %xmm0,%xmm1 + psrld $4,%xmm1 + pand %xmm9,%xmm0 + movdqa (%r11),%xmm2 +.byte 102,15,56,0,208 + movdqa 16(%r11),%xmm0 +.byte 102,15,56,0,193 + pxor %xmm2,%xmm0 + ret +.cfi_endproc +.size _vpaes_schedule_transform,.-_vpaes_schedule_transform + + + + + + + + + + + + + + + + + + + + + + + + +.type _vpaes_schedule_mangle,@function +.align 16 +_vpaes_schedule_mangle: +.cfi_startproc + movdqa %xmm0,%xmm4 + movdqa .Lk_mc_forward(%rip),%xmm5 + + + addq $16,%rdx + pxor .Lk_s63(%rip),%xmm4 +.byte 102,15,56,0,229 + movdqa %xmm4,%xmm3 +.byte 102,15,56,0,229 + pxor %xmm4,%xmm3 +.byte 102,15,56,0,229 + pxor %xmm4,%xmm3 + +.Lschedule_mangle_both: + movdqa (%r8,%r10,1),%xmm1 +.byte 102,15,56,0,217 + addq $-16,%r8 + andq $0x30,%r8 + movdqu %xmm3,(%rdx) + ret +.cfi_endproc +.size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle + + + + +.globl vpaes_set_encrypt_key +.hidden vpaes_set_encrypt_key +.type vpaes_set_encrypt_key,@function +.align 16 +vpaes_set_encrypt_key: +.cfi_startproc +_CET_ENDBR +#ifdef BORINGSSL_DISPATCH_TEST +.extern BORINGSSL_function_hit +.hidden BORINGSSL_function_hit + movb $1,BORINGSSL_function_hit+5(%rip) +#endif + + movl %esi,%eax + shrl $5,%eax + addl $5,%eax + movl %eax,240(%rdx) + + movl $0,%ecx + movl $0x30,%r8d + call _vpaes_schedule_core + xorl %eax,%eax + ret +.cfi_endproc +.size vpaes_set_encrypt_key,.-vpaes_set_encrypt_key +.globl vpaes_ctr32_encrypt_blocks +.hidden vpaes_ctr32_encrypt_blocks +.type vpaes_ctr32_encrypt_blocks,@function +.align 16 +vpaes_ctr32_encrypt_blocks: +.cfi_startproc +_CET_ENDBR + + xchgq %rcx,%rdx + testq %rcx,%rcx + jz .Lctr32_abort + movdqu (%r8),%xmm0 + movdqa .Lctr_add_one(%rip),%xmm8 + subq %rdi,%rsi + call _vpaes_preheat + movdqa %xmm0,%xmm6 + pshufb .Lrev_ctr(%rip),%xmm6 + + testq $1,%rcx + jz .Lctr32_prep_loop + + + + movdqu (%rdi),%xmm7 + call _vpaes_encrypt_core + pxor %xmm7,%xmm0 + paddd %xmm8,%xmm6 + movdqu %xmm0,(%rsi,%rdi,1) + subq $1,%rcx + leaq 16(%rdi),%rdi + jz .Lctr32_done + +.Lctr32_prep_loop: + + + movdqa %xmm6,%xmm14 + movdqa %xmm6,%xmm15 + paddd %xmm8,%xmm15 + +.Lctr32_loop: + movdqa .Lrev_ctr(%rip),%xmm1 + movdqa %xmm14,%xmm0 + movdqa %xmm15,%xmm6 +.byte 102,15,56,0,193 +.byte 102,15,56,0,241 + call _vpaes_encrypt_core_2x + movdqu (%rdi),%xmm1 + movdqu 16(%rdi),%xmm2 + movdqa .Lctr_add_two(%rip),%xmm3 + pxor %xmm1,%xmm0 + pxor %xmm2,%xmm6 + paddd %xmm3,%xmm14 + paddd %xmm3,%xmm15 + movdqu %xmm0,(%rsi,%rdi,1) + movdqu %xmm6,16(%rsi,%rdi,1) + subq $2,%rcx + leaq 32(%rdi),%rdi + jnz .Lctr32_loop + +.Lctr32_done: +.Lctr32_abort: + ret +.cfi_endproc +.size vpaes_ctr32_encrypt_blocks,.-vpaes_ctr32_encrypt_blocks + + + + + + +.type _vpaes_preheat,@function +.align 16 +_vpaes_preheat: +.cfi_startproc + leaq .Lk_s0F(%rip),%r10 + movdqa -32(%r10),%xmm10 + movdqa -16(%r10),%xmm11 + movdqa 0(%r10),%xmm9 + movdqa 48(%r10),%xmm13 + movdqa 64(%r10),%xmm12 + movdqa 80(%r10),%xmm15 + movdqa 96(%r10),%xmm14 + ret +.cfi_endproc +.size _vpaes_preheat,.-_vpaes_preheat + + + + 
+ +.type _vpaes_consts,@object +.section .rodata +.align 64 +_vpaes_consts: +.Lk_inv: +.quad 0x0E05060F0D080180, 0x040703090A0B0C02 +.quad 0x01040A060F0B0780, 0x030D0E0C02050809 + +.Lk_s0F: +.quad 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F + +.Lk_ipt: +.quad 0xC2B2E8985A2A7000, 0xCABAE09052227808 +.quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81 + +.Lk_sb1: +.quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544 +.quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF +.Lk_sb2: +.quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD +.quad 0x69EB88400AE12900, 0xC2A163C8AB82234A +.Lk_sbo: +.quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878 +.quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA + +.Lk_mc_forward: +.quad 0x0407060500030201, 0x0C0F0E0D080B0A09 +.quad 0x080B0A0904070605, 0x000302010C0F0E0D +.quad 0x0C0F0E0D080B0A09, 0x0407060500030201 +.quad 0x000302010C0F0E0D, 0x080B0A0904070605 + +.Lk_mc_backward: +.quad 0x0605040702010003, 0x0E0D0C0F0A09080B +.quad 0x020100030E0D0C0F, 0x0A09080B06050407 +.quad 0x0E0D0C0F0A09080B, 0x0605040702010003 +.quad 0x0A09080B06050407, 0x020100030E0D0C0F + +.Lk_sr: +.quad 0x0706050403020100, 0x0F0E0D0C0B0A0908 +.quad 0x030E09040F0A0500, 0x0B06010C07020D08 +.quad 0x0F060D040B020900, 0x070E050C030A0108 +.quad 0x0B0E0104070A0D00, 0x0306090C0F020508 + +.Lk_rcon: +.quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81 + +.Lk_s63: +.quad 0x5B5B5B5B5B5B5B5B, 0x5B5B5B5B5B5B5B5B + +.Lk_opt: +.quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808 +.quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0 + +.Lk_deskew: +.quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A +.quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77 + + +.Lrev_ctr: +.quad 0x0706050403020100, 0x0c0d0e0f0b0a0908 + + +.Lctr_add_one: +.quad 0x0000000000000000, 0x0000000100000000 +.Lctr_add_two: +.quad 0x0000000000000000, 0x0000000200000000 + +.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,120,56,54,95,54,52,47,83,83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0 +.align 64 +.size _vpaes_consts,.-_vpaes_consts +.text +#endif diff --git a/ring-0.17.14/pregenerated/vpaes-x86_64-macosx.S b/ring-0.17.14/pregenerated/vpaes-x86_64-macosx.S new file mode 100644 index 0000000000..b113e79f51 --- /dev/null +++ b/ring-0.17.14/pregenerated/vpaes-x86_64-macosx.S @@ -0,0 +1,757 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__) +.text + + + + + + + + + + + + + + + + + +.p2align 4 +_vpaes_encrypt_core: + + movq %rdx,%r9 + movq $16,%r11 + movl 240(%rdx),%eax + movdqa %xmm9,%xmm1 + movdqa L$k_ipt(%rip),%xmm2 + pandn %xmm0,%xmm1 + movdqu (%r9),%xmm5 + psrld $4,%xmm1 + pand %xmm9,%xmm0 +.byte 102,15,56,0,208 + movdqa L$k_ipt+16(%rip),%xmm0 +.byte 102,15,56,0,193 + pxor %xmm5,%xmm2 + addq $16,%r9 + pxor %xmm2,%xmm0 + leaq L$k_mc_backward(%rip),%r10 + jmp L$enc_entry + +.p2align 4 +L$enc_loop: + + movdqa %xmm13,%xmm4 + movdqa %xmm12,%xmm0 +.byte 102,15,56,0,226 +.byte 102,15,56,0,195 + pxor %xmm5,%xmm4 + movdqa %xmm15,%xmm5 + pxor %xmm4,%xmm0 + movdqa -64(%r11,%r10,1),%xmm1 +.byte 102,15,56,0,234 + movdqa (%r11,%r10,1),%xmm4 + movdqa %xmm14,%xmm2 +.byte 102,15,56,0,211 + movdqa %xmm0,%xmm3 + pxor %xmm5,%xmm2 +.byte 102,15,56,0,193 + addq $16,%r9 + pxor %xmm2,%xmm0 +.byte 102,15,56,0,220 + addq $16,%r11 + pxor %xmm0,%xmm3 +.byte 102,15,56,0,193 + andq $0x30,%r11 + subq $1,%rax + pxor %xmm3,%xmm0 + +L$enc_entry: + + movdqa %xmm9,%xmm1 + movdqa %xmm11,%xmm5 + pandn %xmm0,%xmm1 + psrld $4,%xmm1 + pand %xmm9,%xmm0 +.byte 102,15,56,0,232 + movdqa %xmm10,%xmm3 + pxor %xmm1,%xmm0 +.byte 102,15,56,0,217 + movdqa %xmm10,%xmm4 + pxor %xmm5,%xmm3 +.byte 102,15,56,0,224 + movdqa %xmm10,%xmm2 + pxor %xmm5,%xmm4 +.byte 102,15,56,0,211 + movdqa %xmm10,%xmm3 + pxor %xmm0,%xmm2 +.byte 102,15,56,0,220 + movdqu (%r9),%xmm5 + pxor %xmm1,%xmm3 + jnz L$enc_loop + + + movdqa -96(%r10),%xmm4 + movdqa -80(%r10),%xmm0 +.byte 102,15,56,0,226 + pxor %xmm5,%xmm4 +.byte 102,15,56,0,195 + movdqa 64(%r11,%r10,1),%xmm1 + pxor %xmm4,%xmm0 +.byte 102,15,56,0,193 + ret + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +.p2align 4 +_vpaes_encrypt_core_2x: + + movq %rdx,%r9 + movq $16,%r11 + movl 240(%rdx),%eax + movdqa %xmm9,%xmm1 + movdqa %xmm9,%xmm7 + movdqa L$k_ipt(%rip),%xmm2 + movdqa %xmm2,%xmm8 + pandn %xmm0,%xmm1 + pandn %xmm6,%xmm7 + movdqu (%r9),%xmm5 + + psrld $4,%xmm1 + psrld $4,%xmm7 + pand %xmm9,%xmm0 + pand %xmm9,%xmm6 +.byte 102,15,56,0,208 +.byte 102,68,15,56,0,198 + movdqa L$k_ipt+16(%rip),%xmm0 + movdqa %xmm0,%xmm6 +.byte 102,15,56,0,193 +.byte 102,15,56,0,247 + pxor %xmm5,%xmm2 + pxor %xmm5,%xmm8 + addq $16,%r9 + pxor %xmm2,%xmm0 + pxor %xmm8,%xmm6 + leaq L$k_mc_backward(%rip),%r10 + jmp L$enc2x_entry + +.p2align 4 +L$enc2x_loop: + + movdqa L$k_sb1(%rip),%xmm4 + movdqa L$k_sb1+16(%rip),%xmm0 + movdqa %xmm4,%xmm12 + movdqa %xmm0,%xmm6 +.byte 102,15,56,0,226 +.byte 102,69,15,56,0,224 +.byte 102,15,56,0,195 +.byte 102,65,15,56,0,243 + pxor %xmm5,%xmm4 + pxor %xmm5,%xmm12 + movdqa L$k_sb2(%rip),%xmm5 + movdqa %xmm5,%xmm13 + pxor %xmm4,%xmm0 + pxor %xmm12,%xmm6 + movdqa -64(%r11,%r10,1),%xmm1 + +.byte 102,15,56,0,234 +.byte 102,69,15,56,0,232 + movdqa (%r11,%r10,1),%xmm4 + + movdqa L$k_sb2+16(%rip),%xmm2 + movdqa %xmm2,%xmm8 +.byte 102,15,56,0,211 +.byte 102,69,15,56,0,195 + movdqa %xmm0,%xmm3 + movdqa %xmm6,%xmm11 + pxor %xmm5,%xmm2 + pxor %xmm13,%xmm8 +.byte 102,15,56,0,193 +.byte 102,15,56,0,241 + addq $16,%r9 + pxor %xmm2,%xmm0 + pxor %xmm8,%xmm6 +.byte 102,15,56,0,220 +.byte 102,68,15,56,0,220 + addq $16,%r11 + pxor %xmm0,%xmm3 + pxor %xmm6,%xmm11 +.byte 102,15,56,0,193 +.byte 102,15,56,0,241 + andq $0x30,%r11 + subq $1,%rax + pxor %xmm3,%xmm0 + pxor %xmm11,%xmm6 + +L$enc2x_entry: + + movdqa %xmm9,%xmm1 + movdqa %xmm9,%xmm7 + movdqa L$k_inv+16(%rip),%xmm5 + movdqa %xmm5,%xmm13 + pandn %xmm0,%xmm1 + pandn %xmm6,%xmm7 + psrld 
$4,%xmm1 + psrld $4,%xmm7 + pand %xmm9,%xmm0 + pand %xmm9,%xmm6 +.byte 102,15,56,0,232 +.byte 102,68,15,56,0,238 + movdqa %xmm10,%xmm3 + movdqa %xmm10,%xmm11 + pxor %xmm1,%xmm0 + pxor %xmm7,%xmm6 +.byte 102,15,56,0,217 +.byte 102,68,15,56,0,223 + movdqa %xmm10,%xmm4 + movdqa %xmm10,%xmm12 + pxor %xmm5,%xmm3 + pxor %xmm13,%xmm11 +.byte 102,15,56,0,224 +.byte 102,68,15,56,0,230 + movdqa %xmm10,%xmm2 + movdqa %xmm10,%xmm8 + pxor %xmm5,%xmm4 + pxor %xmm13,%xmm12 +.byte 102,15,56,0,211 +.byte 102,69,15,56,0,195 + movdqa %xmm10,%xmm3 + movdqa %xmm10,%xmm11 + pxor %xmm0,%xmm2 + pxor %xmm6,%xmm8 +.byte 102,15,56,0,220 +.byte 102,69,15,56,0,220 + movdqu (%r9),%xmm5 + + pxor %xmm1,%xmm3 + pxor %xmm7,%xmm11 + jnz L$enc2x_loop + + + movdqa -96(%r10),%xmm4 + movdqa -80(%r10),%xmm0 + movdqa %xmm4,%xmm12 + movdqa %xmm0,%xmm6 +.byte 102,15,56,0,226 +.byte 102,69,15,56,0,224 + pxor %xmm5,%xmm4 + pxor %xmm5,%xmm12 +.byte 102,15,56,0,195 +.byte 102,65,15,56,0,243 + movdqa 64(%r11,%r10,1),%xmm1 + + pxor %xmm4,%xmm0 + pxor %xmm12,%xmm6 +.byte 102,15,56,0,193 +.byte 102,15,56,0,241 + ret + + + + + + + + + +.p2align 4 +_vpaes_schedule_core: + + + + + + + call _vpaes_preheat + movdqa L$k_rcon(%rip),%xmm8 + movdqu (%rdi),%xmm0 + + + movdqa %xmm0,%xmm3 + leaq L$k_ipt(%rip),%r11 + call _vpaes_schedule_transform + movdqa %xmm0,%xmm7 + + leaq L$k_sr(%rip),%r10 + + + movdqu %xmm0,(%rdx) + +L$schedule_go: + cmpl $192,%esi + ja L$schedule_256 + + + + + + + + + + + +L$schedule_128: + movl $10,%esi + +L$oop_schedule_128: + call _vpaes_schedule_round + decq %rsi + jz L$schedule_mangle_last + call _vpaes_schedule_mangle + jmp L$oop_schedule_128 + + + + + + + + + + + +.p2align 4 +L$schedule_256: + movdqu 16(%rdi),%xmm0 + call _vpaes_schedule_transform + movl $7,%esi + +L$oop_schedule_256: + call _vpaes_schedule_mangle + movdqa %xmm0,%xmm6 + + + call _vpaes_schedule_round + decq %rsi + jz L$schedule_mangle_last + call _vpaes_schedule_mangle + + + pshufd $0xFF,%xmm0,%xmm0 + movdqa %xmm7,%xmm5 + movdqa %xmm6,%xmm7 + call _vpaes_schedule_low_round + movdqa %xmm5,%xmm7 + + jmp L$oop_schedule_256 + + + + + + + + + + + + +.p2align 4 +L$schedule_mangle_last: + + leaq L$k_deskew(%rip),%r11 + + + movdqa (%r8,%r10,1),%xmm1 +.byte 102,15,56,0,193 + leaq L$k_opt(%rip),%r11 + addq $32,%rdx + +L$schedule_mangle_last_dec: + addq $-16,%rdx + pxor L$k_s63(%rip),%xmm0 + call _vpaes_schedule_transform + movdqu %xmm0,(%rdx) + + + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + ret + + + + + + + + + + + + + + + + + + + + + + +.p2align 4 +_vpaes_schedule_round: + + + pxor %xmm1,%xmm1 +.byte 102,65,15,58,15,200,15 +.byte 102,69,15,58,15,192,15 + pxor %xmm1,%xmm7 + + + pshufd $0xFF,%xmm0,%xmm0 +.byte 102,15,58,15,192,1 + + + + +_vpaes_schedule_low_round: + + movdqa %xmm7,%xmm1 + pslldq $4,%xmm7 + pxor %xmm1,%xmm7 + movdqa %xmm7,%xmm1 + pslldq $8,%xmm7 + pxor %xmm1,%xmm7 + pxor L$k_s63(%rip),%xmm7 + + + movdqa %xmm9,%xmm1 + pandn %xmm0,%xmm1 + psrld $4,%xmm1 + pand %xmm9,%xmm0 + movdqa %xmm11,%xmm2 +.byte 102,15,56,0,208 + pxor %xmm1,%xmm0 + movdqa %xmm10,%xmm3 +.byte 102,15,56,0,217 + pxor %xmm2,%xmm3 + movdqa %xmm10,%xmm4 +.byte 102,15,56,0,224 + pxor %xmm2,%xmm4 + movdqa %xmm10,%xmm2 +.byte 102,15,56,0,211 + pxor %xmm0,%xmm2 + movdqa %xmm10,%xmm3 +.byte 102,15,56,0,220 + pxor %xmm1,%xmm3 + movdqa %xmm13,%xmm4 +.byte 102,15,56,0,226 + movdqa %xmm12,%xmm0 +.byte 102,15,56,0,195 + pxor %xmm4,%xmm0 + + + pxor %xmm7,%xmm0 + movdqa %xmm0,%xmm7 + ret + 
+ + + + + + + + + + + + +.p2align 4 +_vpaes_schedule_transform: + + movdqa %xmm9,%xmm1 + pandn %xmm0,%xmm1 + psrld $4,%xmm1 + pand %xmm9,%xmm0 + movdqa (%r11),%xmm2 +.byte 102,15,56,0,208 + movdqa 16(%r11),%xmm0 +.byte 102,15,56,0,193 + pxor %xmm2,%xmm0 + ret + + + + + + + + + + + + + + + + + + + + + + + + + + + +.p2align 4 +_vpaes_schedule_mangle: + + movdqa %xmm0,%xmm4 + movdqa L$k_mc_forward(%rip),%xmm5 + + + addq $16,%rdx + pxor L$k_s63(%rip),%xmm4 +.byte 102,15,56,0,229 + movdqa %xmm4,%xmm3 +.byte 102,15,56,0,229 + pxor %xmm4,%xmm3 +.byte 102,15,56,0,229 + pxor %xmm4,%xmm3 + +L$schedule_mangle_both: + movdqa (%r8,%r10,1),%xmm1 +.byte 102,15,56,0,217 + addq $-16,%r8 + andq $0x30,%r8 + movdqu %xmm3,(%rdx) + ret + + + + + + +.globl _vpaes_set_encrypt_key +.private_extern _vpaes_set_encrypt_key + +.p2align 4 +_vpaes_set_encrypt_key: + +_CET_ENDBR +#ifdef BORINGSSL_DISPATCH_TEST + + movb $1,_BORINGSSL_function_hit+5(%rip) +#endif + + movl %esi,%eax + shrl $5,%eax + addl $5,%eax + movl %eax,240(%rdx) + + movl $0,%ecx + movl $0x30,%r8d + call _vpaes_schedule_core + xorl %eax,%eax + ret + + +.globl _vpaes_ctr32_encrypt_blocks +.private_extern _vpaes_ctr32_encrypt_blocks + +.p2align 4 +_vpaes_ctr32_encrypt_blocks: + +_CET_ENDBR + + xchgq %rcx,%rdx + testq %rcx,%rcx + jz L$ctr32_abort + movdqu (%r8),%xmm0 + movdqa L$ctr_add_one(%rip),%xmm8 + subq %rdi,%rsi + call _vpaes_preheat + movdqa %xmm0,%xmm6 + pshufb L$rev_ctr(%rip),%xmm6 + + testq $1,%rcx + jz L$ctr32_prep_loop + + + + movdqu (%rdi),%xmm7 + call _vpaes_encrypt_core + pxor %xmm7,%xmm0 + paddd %xmm8,%xmm6 + movdqu %xmm0,(%rsi,%rdi,1) + subq $1,%rcx + leaq 16(%rdi),%rdi + jz L$ctr32_done + +L$ctr32_prep_loop: + + + movdqa %xmm6,%xmm14 + movdqa %xmm6,%xmm15 + paddd %xmm8,%xmm15 + +L$ctr32_loop: + movdqa L$rev_ctr(%rip),%xmm1 + movdqa %xmm14,%xmm0 + movdqa %xmm15,%xmm6 +.byte 102,15,56,0,193 +.byte 102,15,56,0,241 + call _vpaes_encrypt_core_2x + movdqu (%rdi),%xmm1 + movdqu 16(%rdi),%xmm2 + movdqa L$ctr_add_two(%rip),%xmm3 + pxor %xmm1,%xmm0 + pxor %xmm2,%xmm6 + paddd %xmm3,%xmm14 + paddd %xmm3,%xmm15 + movdqu %xmm0,(%rsi,%rdi,1) + movdqu %xmm6,16(%rsi,%rdi,1) + subq $2,%rcx + leaq 32(%rdi),%rdi + jnz L$ctr32_loop + +L$ctr32_done: +L$ctr32_abort: + ret + + + + + + + + + +.p2align 4 +_vpaes_preheat: + + leaq L$k_s0F(%rip),%r10 + movdqa -32(%r10),%xmm10 + movdqa -16(%r10),%xmm11 + movdqa 0(%r10),%xmm9 + movdqa 48(%r10),%xmm13 + movdqa 64(%r10),%xmm12 + movdqa 80(%r10),%xmm15 + movdqa 96(%r10),%xmm14 + ret + + + + + + + + +.section __DATA,__const +.p2align 6 +_vpaes_consts: +L$k_inv: +.quad 0x0E05060F0D080180, 0x040703090A0B0C02 +.quad 0x01040A060F0B0780, 0x030D0E0C02050809 + +L$k_s0F: +.quad 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F + +L$k_ipt: +.quad 0xC2B2E8985A2A7000, 0xCABAE09052227808 +.quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81 + +L$k_sb1: +.quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544 +.quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF +L$k_sb2: +.quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD +.quad 0x69EB88400AE12900, 0xC2A163C8AB82234A +L$k_sbo: +.quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878 +.quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA + +L$k_mc_forward: +.quad 0x0407060500030201, 0x0C0F0E0D080B0A09 +.quad 0x080B0A0904070605, 0x000302010C0F0E0D +.quad 0x0C0F0E0D080B0A09, 0x0407060500030201 +.quad 0x000302010C0F0E0D, 0x080B0A0904070605 + +L$k_mc_backward: +.quad 0x0605040702010003, 0x0E0D0C0F0A09080B +.quad 0x020100030E0D0C0F, 0x0A09080B06050407 +.quad 0x0E0D0C0F0A09080B, 0x0605040702010003 +.quad 0x0A09080B06050407, 0x020100030E0D0C0F + +L$k_sr: 
+.quad 0x0706050403020100, 0x0F0E0D0C0B0A0908 +.quad 0x030E09040F0A0500, 0x0B06010C07020D08 +.quad 0x0F060D040B020900, 0x070E050C030A0108 +.quad 0x0B0E0104070A0D00, 0x0306090C0F020508 + +L$k_rcon: +.quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81 + +L$k_s63: +.quad 0x5B5B5B5B5B5B5B5B, 0x5B5B5B5B5B5B5B5B + +L$k_opt: +.quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808 +.quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0 + +L$k_deskew: +.quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A +.quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77 + + +L$rev_ctr: +.quad 0x0706050403020100, 0x0c0d0e0f0b0a0908 + + +L$ctr_add_one: +.quad 0x0000000000000000, 0x0000000100000000 +L$ctr_add_two: +.quad 0x0000000000000000, 0x0000000200000000 + +.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,120,56,54,95,54,52,47,83,83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0 +.p2align 6 + +.text +#endif diff --git a/ring-0.17.14/pregenerated/vpaes-x86_64-nasm.asm b/ring-0.17.14/pregenerated/vpaes-x86_64-nasm.asm new file mode 100644 index 0000000000..c7e6eed496 --- /dev/null +++ b/ring-0.17.14/pregenerated/vpaes-x86_64-nasm.asm @@ -0,0 +1,940 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. + +%ifidn __OUTPUT_FORMAT__, win64 +default rel +%define XMMWORD +%define YMMWORD +%define ZMMWORD +%define _CET_ENDBR + +%include "ring_core_generated/prefix_symbols_nasm.inc" +section .text code align=64 + + + + + + + + + + + + + + + + + + +ALIGN 16 +_vpaes_encrypt_core: + + mov r9,rdx + mov r11,16 + mov eax,DWORD[240+rdx] + movdqa xmm1,xmm9 + movdqa xmm2,XMMWORD[$L$k_ipt] + pandn xmm1,xmm0 + movdqu xmm5,XMMWORD[r9] + psrld xmm1,4 + pand xmm0,xmm9 +DB 102,15,56,0,208 + movdqa xmm0,XMMWORD[(($L$k_ipt+16))] +DB 102,15,56,0,193 + pxor xmm2,xmm5 + add r9,16 + pxor xmm0,xmm2 + lea r10,[$L$k_mc_backward] + jmp NEAR $L$enc_entry + +ALIGN 16 +$L$enc_loop: + + movdqa xmm4,xmm13 + movdqa xmm0,xmm12 +DB 102,15,56,0,226 +DB 102,15,56,0,195 + pxor xmm4,xmm5 + movdqa xmm5,xmm15 + pxor xmm0,xmm4 + movdqa xmm1,XMMWORD[((-64))+r10*1+r11] +DB 102,15,56,0,234 + movdqa xmm4,XMMWORD[r10*1+r11] + movdqa xmm2,xmm14 +DB 102,15,56,0,211 + movdqa xmm3,xmm0 + pxor xmm2,xmm5 +DB 102,15,56,0,193 + add r9,16 + pxor xmm0,xmm2 +DB 102,15,56,0,220 + add r11,16 + pxor xmm3,xmm0 +DB 102,15,56,0,193 + and r11,0x30 + sub rax,1 + pxor xmm0,xmm3 + +$L$enc_entry: + + movdqa xmm1,xmm9 + movdqa xmm5,xmm11 + pandn xmm1,xmm0 + psrld xmm1,4 + pand xmm0,xmm9 +DB 102,15,56,0,232 + movdqa xmm3,xmm10 + pxor xmm0,xmm1 +DB 102,15,56,0,217 + movdqa xmm4,xmm10 + pxor xmm3,xmm5 +DB 102,15,56,0,224 + movdqa xmm2,xmm10 + pxor xmm4,xmm5 +DB 102,15,56,0,211 + movdqa xmm3,xmm10 + pxor xmm2,xmm0 +DB 102,15,56,0,220 + movdqu xmm5,XMMWORD[r9] + pxor xmm3,xmm1 + jnz NEAR $L$enc_loop + + + movdqa xmm4,XMMWORD[((-96))+r10] + movdqa xmm0,XMMWORD[((-80))+r10] +DB 102,15,56,0,226 + pxor xmm4,xmm5 +DB 102,15,56,0,195 + movdqa xmm1,XMMWORD[64+r10*1+r11] + pxor xmm0,xmm4 +DB 102,15,56,0,193 + ret + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +ALIGN 16 +_vpaes_encrypt_core_2x: + + mov r9,rdx + mov r11,16 + mov eax,DWORD[240+rdx] + movdqa xmm1,xmm9 + movdqa xmm7,xmm9 + movdqa xmm2,XMMWORD[$L$k_ipt] + movdqa xmm8,xmm2 + pandn xmm1,xmm0 + pandn xmm7,xmm6 + movdqu xmm5,XMMWORD[r9] + + psrld xmm1,4 + psrld xmm7,4 + pand xmm0,xmm9 + pand xmm6,xmm9 +DB 102,15,56,0,208 +DB 102,68,15,56,0,198 + movdqa 
xmm0,XMMWORD[(($L$k_ipt+16))] + movdqa xmm6,xmm0 +DB 102,15,56,0,193 +DB 102,15,56,0,247 + pxor xmm2,xmm5 + pxor xmm8,xmm5 + add r9,16 + pxor xmm0,xmm2 + pxor xmm6,xmm8 + lea r10,[$L$k_mc_backward] + jmp NEAR $L$enc2x_entry + +ALIGN 16 +$L$enc2x_loop: + + movdqa xmm4,XMMWORD[$L$k_sb1] + movdqa xmm0,XMMWORD[(($L$k_sb1+16))] + movdqa xmm12,xmm4 + movdqa xmm6,xmm0 +DB 102,15,56,0,226 +DB 102,69,15,56,0,224 +DB 102,15,56,0,195 +DB 102,65,15,56,0,243 + pxor xmm4,xmm5 + pxor xmm12,xmm5 + movdqa xmm5,XMMWORD[$L$k_sb2] + movdqa xmm13,xmm5 + pxor xmm0,xmm4 + pxor xmm6,xmm12 + movdqa xmm1,XMMWORD[((-64))+r10*1+r11] + +DB 102,15,56,0,234 +DB 102,69,15,56,0,232 + movdqa xmm4,XMMWORD[r10*1+r11] + + movdqa xmm2,XMMWORD[(($L$k_sb2+16))] + movdqa xmm8,xmm2 +DB 102,15,56,0,211 +DB 102,69,15,56,0,195 + movdqa xmm3,xmm0 + movdqa xmm11,xmm6 + pxor xmm2,xmm5 + pxor xmm8,xmm13 +DB 102,15,56,0,193 +DB 102,15,56,0,241 + add r9,16 + pxor xmm0,xmm2 + pxor xmm6,xmm8 +DB 102,15,56,0,220 +DB 102,68,15,56,0,220 + add r11,16 + pxor xmm3,xmm0 + pxor xmm11,xmm6 +DB 102,15,56,0,193 +DB 102,15,56,0,241 + and r11,0x30 + sub rax,1 + pxor xmm0,xmm3 + pxor xmm6,xmm11 + +$L$enc2x_entry: + + movdqa xmm1,xmm9 + movdqa xmm7,xmm9 + movdqa xmm5,XMMWORD[(($L$k_inv+16))] + movdqa xmm13,xmm5 + pandn xmm1,xmm0 + pandn xmm7,xmm6 + psrld xmm1,4 + psrld xmm7,4 + pand xmm0,xmm9 + pand xmm6,xmm9 +DB 102,15,56,0,232 +DB 102,68,15,56,0,238 + movdqa xmm3,xmm10 + movdqa xmm11,xmm10 + pxor xmm0,xmm1 + pxor xmm6,xmm7 +DB 102,15,56,0,217 +DB 102,68,15,56,0,223 + movdqa xmm4,xmm10 + movdqa xmm12,xmm10 + pxor xmm3,xmm5 + pxor xmm11,xmm13 +DB 102,15,56,0,224 +DB 102,68,15,56,0,230 + movdqa xmm2,xmm10 + movdqa xmm8,xmm10 + pxor xmm4,xmm5 + pxor xmm12,xmm13 +DB 102,15,56,0,211 +DB 102,69,15,56,0,195 + movdqa xmm3,xmm10 + movdqa xmm11,xmm10 + pxor xmm2,xmm0 + pxor xmm8,xmm6 +DB 102,15,56,0,220 +DB 102,69,15,56,0,220 + movdqu xmm5,XMMWORD[r9] + + pxor xmm3,xmm1 + pxor xmm11,xmm7 + jnz NEAR $L$enc2x_loop + + + movdqa xmm4,XMMWORD[((-96))+r10] + movdqa xmm0,XMMWORD[((-80))+r10] + movdqa xmm12,xmm4 + movdqa xmm6,xmm0 +DB 102,15,56,0,226 +DB 102,69,15,56,0,224 + pxor xmm4,xmm5 + pxor xmm12,xmm5 +DB 102,15,56,0,195 +DB 102,65,15,56,0,243 + movdqa xmm1,XMMWORD[64+r10*1+r11] + + pxor xmm0,xmm4 + pxor xmm6,xmm12 +DB 102,15,56,0,193 +DB 102,15,56,0,241 + ret + + + + + + + + + +ALIGN 16 +_vpaes_schedule_core: + + + + + + + call _vpaes_preheat + movdqa xmm8,XMMWORD[$L$k_rcon] + movdqu xmm0,XMMWORD[rdi] + + + movdqa xmm3,xmm0 + lea r11,[$L$k_ipt] + call _vpaes_schedule_transform + movdqa xmm7,xmm0 + + lea r10,[$L$k_sr] + + + movdqu XMMWORD[rdx],xmm0 + +$L$schedule_go: + cmp esi,192 + ja NEAR $L$schedule_256 + + + + + + + + + + + +$L$schedule_128: + mov esi,10 + +$L$oop_schedule_128: + call _vpaes_schedule_round + dec rsi + jz NEAR $L$schedule_mangle_last + call _vpaes_schedule_mangle + jmp NEAR $L$oop_schedule_128 + + + + + + + + + + + +ALIGN 16 +$L$schedule_256: + movdqu xmm0,XMMWORD[16+rdi] + call _vpaes_schedule_transform + mov esi,7 + +$L$oop_schedule_256: + call _vpaes_schedule_mangle + movdqa xmm6,xmm0 + + + call _vpaes_schedule_round + dec rsi + jz NEAR $L$schedule_mangle_last + call _vpaes_schedule_mangle + + + pshufd xmm0,xmm0,0xFF + movdqa xmm5,xmm7 + movdqa xmm7,xmm6 + call _vpaes_schedule_low_round + movdqa xmm7,xmm5 + + jmp NEAR $L$oop_schedule_256 + + + + + + + + + + + + +ALIGN 16 +$L$schedule_mangle_last: + + lea r11,[$L$k_deskew] + + + movdqa xmm1,XMMWORD[r10*1+r8] +DB 102,15,56,0,193 + lea r11,[$L$k_opt] + add rdx,32 + +$L$schedule_mangle_last_dec: + 
add rdx,-16 + pxor xmm0,XMMWORD[$L$k_s63] + call _vpaes_schedule_transform + movdqu XMMWORD[rdx],xmm0 + + + pxor xmm0,xmm0 + pxor xmm1,xmm1 + pxor xmm2,xmm2 + pxor xmm3,xmm3 + pxor xmm4,xmm4 + pxor xmm5,xmm5 + pxor xmm6,xmm6 + pxor xmm7,xmm7 + ret + + + + + + + + + + + + + + + + + + + + + + +ALIGN 16 +_vpaes_schedule_round: + + + pxor xmm1,xmm1 +DB 102,65,15,58,15,200,15 +DB 102,69,15,58,15,192,15 + pxor xmm7,xmm1 + + + pshufd xmm0,xmm0,0xFF +DB 102,15,58,15,192,1 + + + + +_vpaes_schedule_low_round: + + movdqa xmm1,xmm7 + pslldq xmm7,4 + pxor xmm7,xmm1 + movdqa xmm1,xmm7 + pslldq xmm7,8 + pxor xmm7,xmm1 + pxor xmm7,XMMWORD[$L$k_s63] + + + movdqa xmm1,xmm9 + pandn xmm1,xmm0 + psrld xmm1,4 + pand xmm0,xmm9 + movdqa xmm2,xmm11 +DB 102,15,56,0,208 + pxor xmm0,xmm1 + movdqa xmm3,xmm10 +DB 102,15,56,0,217 + pxor xmm3,xmm2 + movdqa xmm4,xmm10 +DB 102,15,56,0,224 + pxor xmm4,xmm2 + movdqa xmm2,xmm10 +DB 102,15,56,0,211 + pxor xmm2,xmm0 + movdqa xmm3,xmm10 +DB 102,15,56,0,220 + pxor xmm3,xmm1 + movdqa xmm4,xmm13 +DB 102,15,56,0,226 + movdqa xmm0,xmm12 +DB 102,15,56,0,195 + pxor xmm0,xmm4 + + + pxor xmm0,xmm7 + movdqa xmm7,xmm0 + ret + + + + + + + + + + + + + +ALIGN 16 +_vpaes_schedule_transform: + + movdqa xmm1,xmm9 + pandn xmm1,xmm0 + psrld xmm1,4 + pand xmm0,xmm9 + movdqa xmm2,XMMWORD[r11] +DB 102,15,56,0,208 + movdqa xmm0,XMMWORD[16+r11] +DB 102,15,56,0,193 + pxor xmm0,xmm2 + ret + + + + + + + + + + + + + + + + + + + + + + + + + + + +ALIGN 16 +_vpaes_schedule_mangle: + + movdqa xmm4,xmm0 + movdqa xmm5,XMMWORD[$L$k_mc_forward] + + + add rdx,16 + pxor xmm4,XMMWORD[$L$k_s63] +DB 102,15,56,0,229 + movdqa xmm3,xmm4 +DB 102,15,56,0,229 + pxor xmm3,xmm4 +DB 102,15,56,0,229 + pxor xmm3,xmm4 + +$L$schedule_mangle_both: + movdqa xmm1,XMMWORD[r10*1+r8] +DB 102,15,56,0,217 + add r8,-16 + and r8,0x30 + movdqu XMMWORD[rdx],xmm3 + ret + + + + + + +global vpaes_set_encrypt_key + +ALIGN 16 +vpaes_set_encrypt_key: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_vpaes_set_encrypt_key: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + +_CET_ENDBR +%ifdef BORINGSSL_DISPATCH_TEST +EXTERN BORINGSSL_function_hit + mov BYTE[((BORINGSSL_function_hit+5))],1 +%endif + + lea rsp,[((-184))+rsp] + movaps XMMWORD[16+rsp],xmm6 + movaps XMMWORD[32+rsp],xmm7 + movaps XMMWORD[48+rsp],xmm8 + movaps XMMWORD[64+rsp],xmm9 + movaps XMMWORD[80+rsp],xmm10 + movaps XMMWORD[96+rsp],xmm11 + movaps XMMWORD[112+rsp],xmm12 + movaps XMMWORD[128+rsp],xmm13 + movaps XMMWORD[144+rsp],xmm14 + movaps XMMWORD[160+rsp],xmm15 +$L$enc_key_body: + mov eax,esi + shr eax,5 + add eax,5 + mov DWORD[240+rdx],eax + + mov ecx,0 + mov r8d,0x30 + call _vpaes_schedule_core + movaps xmm6,XMMWORD[16+rsp] + movaps xmm7,XMMWORD[32+rsp] + movaps xmm8,XMMWORD[48+rsp] + movaps xmm9,XMMWORD[64+rsp] + movaps xmm10,XMMWORD[80+rsp] + movaps xmm11,XMMWORD[96+rsp] + movaps xmm12,XMMWORD[112+rsp] + movaps xmm13,XMMWORD[128+rsp] + movaps xmm14,XMMWORD[144+rsp] + movaps xmm15,XMMWORD[160+rsp] + lea rsp,[184+rsp] +$L$enc_key_epilogue: + xor eax,eax + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_vpaes_set_encrypt_key: +global vpaes_ctr32_encrypt_blocks + +ALIGN 16 +vpaes_ctr32_encrypt_blocks: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_vpaes_ctr32_encrypt_blocks: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + + + +_CET_ENDBR + + xchg rdx,rcx + test rcx,rcx + jz NEAR $L$ctr32_abort + lea rsp,[((-184))+rsp] + movaps 
XMMWORD[16+rsp],xmm6 + movaps XMMWORD[32+rsp],xmm7 + movaps XMMWORD[48+rsp],xmm8 + movaps XMMWORD[64+rsp],xmm9 + movaps XMMWORD[80+rsp],xmm10 + movaps XMMWORD[96+rsp],xmm11 + movaps XMMWORD[112+rsp],xmm12 + movaps XMMWORD[128+rsp],xmm13 + movaps XMMWORD[144+rsp],xmm14 + movaps XMMWORD[160+rsp],xmm15 +$L$ctr32_body: + movdqu xmm0,XMMWORD[r8] + movdqa xmm8,XMMWORD[$L$ctr_add_one] + sub rsi,rdi + call _vpaes_preheat + movdqa xmm6,xmm0 + pshufb xmm6,XMMWORD[$L$rev_ctr] + + test rcx,1 + jz NEAR $L$ctr32_prep_loop + + + + movdqu xmm7,XMMWORD[rdi] + call _vpaes_encrypt_core + pxor xmm0,xmm7 + paddd xmm6,xmm8 + movdqu XMMWORD[rdi*1+rsi],xmm0 + sub rcx,1 + lea rdi,[16+rdi] + jz NEAR $L$ctr32_done + +$L$ctr32_prep_loop: + + + movdqa xmm14,xmm6 + movdqa xmm15,xmm6 + paddd xmm15,xmm8 + +$L$ctr32_loop: + movdqa xmm1,XMMWORD[$L$rev_ctr] + movdqa xmm0,xmm14 + movdqa xmm6,xmm15 +DB 102,15,56,0,193 +DB 102,15,56,0,241 + call _vpaes_encrypt_core_2x + movdqu xmm1,XMMWORD[rdi] + movdqu xmm2,XMMWORD[16+rdi] + movdqa xmm3,XMMWORD[$L$ctr_add_two] + pxor xmm0,xmm1 + pxor xmm6,xmm2 + paddd xmm14,xmm3 + paddd xmm15,xmm3 + movdqu XMMWORD[rdi*1+rsi],xmm0 + movdqu XMMWORD[16+rdi*1+rsi],xmm6 + sub rcx,2 + lea rdi,[32+rdi] + jnz NEAR $L$ctr32_loop + +$L$ctr32_done: + movaps xmm6,XMMWORD[16+rsp] + movaps xmm7,XMMWORD[32+rsp] + movaps xmm8,XMMWORD[48+rsp] + movaps xmm9,XMMWORD[64+rsp] + movaps xmm10,XMMWORD[80+rsp] + movaps xmm11,XMMWORD[96+rsp] + movaps xmm12,XMMWORD[112+rsp] + movaps xmm13,XMMWORD[128+rsp] + movaps xmm14,XMMWORD[144+rsp] + movaps xmm15,XMMWORD[160+rsp] + lea rsp,[184+rsp] +$L$ctr32_epilogue: +$L$ctr32_abort: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_vpaes_ctr32_encrypt_blocks: + + + + + + + +ALIGN 16 +_vpaes_preheat: + + lea r10,[$L$k_s0F] + movdqa xmm10,XMMWORD[((-32))+r10] + movdqa xmm11,XMMWORD[((-16))+r10] + movdqa xmm9,XMMWORD[r10] + movdqa xmm13,XMMWORD[48+r10] + movdqa xmm12,XMMWORD[64+r10] + movdqa xmm15,XMMWORD[80+r10] + movdqa xmm14,XMMWORD[96+r10] + ret + + + + + + + + +section .rdata rdata align=8 +ALIGN 64 +_vpaes_consts: +$L$k_inv: + DQ 0x0E05060F0D080180,0x040703090A0B0C02 + DQ 0x01040A060F0B0780,0x030D0E0C02050809 + +$L$k_s0F: + DQ 0x0F0F0F0F0F0F0F0F,0x0F0F0F0F0F0F0F0F + +$L$k_ipt: + DQ 0xC2B2E8985A2A7000,0xCABAE09052227808 + DQ 0x4C01307D317C4D00,0xCD80B1FCB0FDCC81 + +$L$k_sb1: + DQ 0xB19BE18FCB503E00,0xA5DF7A6E142AF544 + DQ 0x3618D415FAE22300,0x3BF7CCC10D2ED9EF +$L$k_sb2: + DQ 0xE27A93C60B712400,0x5EB7E955BC982FCD + DQ 0x69EB88400AE12900,0xC2A163C8AB82234A +$L$k_sbo: + DQ 0xD0D26D176FBDC700,0x15AABF7AC502A878 + DQ 0xCFE474A55FBB6A00,0x8E1E90D1412B35FA + +$L$k_mc_forward: + DQ 0x0407060500030201,0x0C0F0E0D080B0A09 + DQ 0x080B0A0904070605,0x000302010C0F0E0D + DQ 0x0C0F0E0D080B0A09,0x0407060500030201 + DQ 0x000302010C0F0E0D,0x080B0A0904070605 + +$L$k_mc_backward: + DQ 0x0605040702010003,0x0E0D0C0F0A09080B + DQ 0x020100030E0D0C0F,0x0A09080B06050407 + DQ 0x0E0D0C0F0A09080B,0x0605040702010003 + DQ 0x0A09080B06050407,0x020100030E0D0C0F + +$L$k_sr: + DQ 0x0706050403020100,0x0F0E0D0C0B0A0908 + DQ 0x030E09040F0A0500,0x0B06010C07020D08 + DQ 0x0F060D040B020900,0x070E050C030A0108 + DQ 0x0B0E0104070A0D00,0x0306090C0F020508 + +$L$k_rcon: + DQ 0x1F8391B9AF9DEEB6,0x702A98084D7C7D81 + +$L$k_s63: + DQ 0x5B5B5B5B5B5B5B5B,0x5B5B5B5B5B5B5B5B + +$L$k_opt: + DQ 0xFF9F4929D6B66000,0xF7974121DEBE6808 + DQ 0x01EDBD5150BCEC00,0xE10D5DB1B05C0CE0 + +$L$k_deskew: + DQ 0x07E4A34047A4E300,0x1DFEB95A5DBEF91A + DQ 0x5F36B5DC83EA6900,0x2841C2ABF49D1E77 + + +$L$rev_ctr: + DQ 
0x0706050403020100,0x0c0d0e0f0b0a0908 + + +$L$ctr_add_one: + DQ 0x0000000000000000,0x0000000100000000 +$L$ctr_add_two: + DQ 0x0000000000000000,0x0000000200000000 + + DB 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105 + DB 111,110,32,65,69,83,32,102,111,114,32,120,56,54,95,54 + DB 52,47,83,83,83,69,51,44,32,77,105,107,101,32,72,97 + DB 109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32 + DB 85,110,105,118,101,114,115,105,116,121,41,0 +ALIGN 64 + +section .text + +EXTERN __imp_RtlVirtualUnwind + +ALIGN 16 +se_handler: + push rsi + push rdi + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + pushfq + sub rsp,64 + + mov rax,QWORD[120+r8] + mov rbx,QWORD[248+r8] + + mov rsi,QWORD[8+r9] + mov r11,QWORD[56+r9] + + mov r10d,DWORD[r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jb NEAR $L$in_prologue + + mov rax,QWORD[152+r8] + + mov r10d,DWORD[4+r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jae NEAR $L$in_prologue + + lea rsi,[16+rax] + lea rdi,[512+r8] + mov ecx,20 + DD 0xa548f3fc + lea rax,[184+rax] + +$L$in_prologue: + mov rdi,QWORD[8+rax] + mov rsi,QWORD[16+rax] + mov QWORD[152+r8],rax + mov QWORD[168+r8],rsi + mov QWORD[176+r8],rdi + + mov rdi,QWORD[40+r9] + mov rsi,r8 + mov ecx,154 + DD 0xa548f3fc + + mov rsi,r9 + xor rcx,rcx + mov rdx,QWORD[8+rsi] + mov r8,QWORD[rsi] + mov r9,QWORD[16+rsi] + mov r10,QWORD[40+rsi] + lea r11,[56+rsi] + lea r12,[24+rsi] + mov QWORD[32+rsp],r10 + mov QWORD[40+rsp],r11 + mov QWORD[48+rsp],r12 + mov QWORD[56+rsp],rcx + call QWORD[__imp_RtlVirtualUnwind] + + mov eax,1 + add rsp,64 + popfq + pop r15 + pop r14 + pop r13 + pop r12 + pop rbp + pop rbx + pop rdi + pop rsi + ret + + +section .pdata rdata align=4 +ALIGN 4 + DD $L$SEH_begin_vpaes_set_encrypt_key wrt ..imagebase + DD $L$SEH_end_vpaes_set_encrypt_key wrt ..imagebase + DD $L$SEH_info_vpaes_set_encrypt_key wrt ..imagebase + + DD $L$SEH_begin_vpaes_ctr32_encrypt_blocks wrt ..imagebase + DD $L$SEH_end_vpaes_ctr32_encrypt_blocks wrt ..imagebase + DD $L$SEH_info_vpaes_ctr32_encrypt_blocks wrt ..imagebase + +section .xdata rdata align=8 +ALIGN 8 +$L$SEH_info_vpaes_set_encrypt_key: + DB 9,0,0,0 + DD se_handler wrt ..imagebase + DD $L$enc_key_body wrt ..imagebase,$L$enc_key_epilogue wrt ..imagebase +$L$SEH_info_vpaes_ctr32_encrypt_blocks: + DB 9,0,0,0 + DD se_handler wrt ..imagebase + DD $L$ctr32_body wrt ..imagebase,$L$ctr32_epilogue wrt ..imagebase +%else +; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 +ret +%endif diff --git a/ring-0.17.14/pregenerated/vpaes-x86_64-nasm.o b/ring-0.17.14/pregenerated/vpaes-x86_64-nasm.o new file mode 100644 index 0000000000..4b04ba3f84 Binary files /dev/null and b/ring-0.17.14/pregenerated/vpaes-x86_64-nasm.o differ diff --git a/ring-0.17.14/pregenerated/x86-mont-elf.S b/ring-0.17.14/pregenerated/x86-mont-elf.S new file mode 100644 index 0000000000..e1923ea008 --- /dev/null +++ b/ring-0.17.14/pregenerated/x86-mont-elf.S @@ -0,0 +1,220 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__) +.text +.globl bn_mul_mont +.hidden bn_mul_mont +.type bn_mul_mont,@function +.align 16 +bn_mul_mont: +.L_bn_mul_mont_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + xorl %eax,%eax + movl 40(%esp),%edi + leal 20(%esp),%esi + leal 24(%esp),%edx + addl $2,%edi + negl %edi + leal -32(%esp,%edi,4),%ebp + negl %edi + movl %ebp,%eax + subl %edx,%eax + andl $2047,%eax + subl %eax,%ebp + xorl %ebp,%edx + andl $2048,%edx + xorl $2048,%edx + subl %edx,%ebp + andl $-64,%ebp + movl %esp,%eax + subl %ebp,%eax + andl $-4096,%eax + movl %esp,%edx + leal (%ebp,%eax,1),%esp + movl (%esp),%eax + cmpl %ebp,%esp + ja .L000page_walk + jmp .L001page_walk_done +.align 16 +.L000page_walk: + leal -4096(%esp),%esp + movl (%esp),%eax + cmpl %ebp,%esp + ja .L000page_walk +.L001page_walk_done: + movl (%esi),%eax + movl 4(%esi),%ebx + movl 8(%esi),%ecx + movl 12(%esi),%ebp + movl 16(%esi),%esi + movl (%esi),%esi + movl %eax,4(%esp) + movl %ebx,8(%esp) + movl %ecx,12(%esp) + movl %ebp,16(%esp) + movl %esi,20(%esp) + leal -3(%edi),%ebx + movl %edx,24(%esp) + movl $-1,%eax + movd %eax,%mm7 + movl 8(%esp),%esi + movl 12(%esp),%edi + movl 16(%esp),%ebp + xorl %edx,%edx + xorl %ecx,%ecx + movd (%edi),%mm4 + movd (%esi),%mm5 + movd (%ebp),%mm3 + pmuludq %mm4,%mm5 + movq %mm5,%mm2 + movq %mm5,%mm0 + pand %mm7,%mm0 + pmuludq 20(%esp),%mm5 + pmuludq %mm5,%mm3 + paddq %mm0,%mm3 + movd 4(%ebp),%mm1 + movd 4(%esi),%mm0 + psrlq $32,%mm2 + psrlq $32,%mm3 + incl %ecx +.align 16 +.L0021st: + pmuludq %mm4,%mm0 + pmuludq %mm5,%mm1 + paddq %mm0,%mm2 + paddq %mm1,%mm3 + movq %mm2,%mm0 + pand %mm7,%mm0 + movd 4(%ebp,%ecx,4),%mm1 + paddq %mm0,%mm3 + movd 4(%esi,%ecx,4),%mm0 + psrlq $32,%mm2 + movd %mm3,28(%esp,%ecx,4) + psrlq $32,%mm3 + leal 1(%ecx),%ecx + cmpl %ebx,%ecx + jl .L0021st + pmuludq %mm4,%mm0 + pmuludq %mm5,%mm1 + paddq %mm0,%mm2 + paddq %mm1,%mm3 + movq %mm2,%mm0 + pand %mm7,%mm0 + paddq %mm0,%mm3 + movd %mm3,28(%esp,%ecx,4) + psrlq $32,%mm2 + psrlq $32,%mm3 + paddq %mm2,%mm3 + movq %mm3,32(%esp,%ebx,4) + incl %edx +.L003outer: + xorl %ecx,%ecx + movd (%edi,%edx,4),%mm4 + movd (%esi),%mm5 + movd 32(%esp),%mm6 + movd (%ebp),%mm3 + pmuludq %mm4,%mm5 + paddq %mm6,%mm5 + movq %mm5,%mm0 + movq %mm5,%mm2 + pand %mm7,%mm0 + pmuludq 20(%esp),%mm5 + pmuludq %mm5,%mm3 + paddq %mm0,%mm3 + movd 36(%esp),%mm6 + movd 4(%ebp),%mm1 + movd 4(%esi),%mm0 + psrlq $32,%mm2 + psrlq $32,%mm3 + paddq %mm6,%mm2 + incl %ecx + decl %ebx +.L004inner: + pmuludq %mm4,%mm0 + pmuludq %mm5,%mm1 + paddq %mm0,%mm2 + paddq %mm1,%mm3 + movq %mm2,%mm0 + movd 36(%esp,%ecx,4),%mm6 + pand %mm7,%mm0 + movd 4(%ebp,%ecx,4),%mm1 + paddq %mm0,%mm3 + movd 4(%esi,%ecx,4),%mm0 + psrlq $32,%mm2 + movd %mm3,28(%esp,%ecx,4) + psrlq $32,%mm3 + paddq %mm6,%mm2 + decl %ebx + leal 1(%ecx),%ecx + jnz .L004inner + movl %ecx,%ebx + pmuludq %mm4,%mm0 + pmuludq %mm5,%mm1 + paddq %mm0,%mm2 + paddq %mm1,%mm3 + movq %mm2,%mm0 + pand %mm7,%mm0 + paddq %mm0,%mm3 + movd %mm3,28(%esp,%ecx,4) + psrlq $32,%mm2 + psrlq $32,%mm3 + movd 36(%esp,%ebx,4),%mm6 + paddq %mm2,%mm3 + paddq %mm6,%mm3 + movq %mm3,32(%esp,%ebx,4) + leal 1(%edx),%edx + cmpl %ebx,%edx + jle .L003outer + emms + jmp .L005common_tail +.align 16 +.L005common_tail: + movl 16(%esp),%ebp + movl 4(%esp),%edi + leal 32(%esp),%esi + movl (%esi),%eax + movl %ebx,%ecx + xorl %edx,%edx +.align 16 +.L006sub: + sbbl (%ebp,%edx,4),%eax + movl %eax,(%edi,%edx,4) + decl %ecx + movl 4(%esi,%edx,4),%eax + leal 1(%edx),%edx + jge .L006sub + sbbl 
$0,%eax + movl $-1,%edx + xorl %eax,%edx + jmp .L007copy +.align 16 +.L007copy: + movl 32(%esp,%ebx,4),%esi + movl (%edi,%ebx,4),%ebp + movl %ecx,32(%esp,%ebx,4) + andl %eax,%esi + andl %edx,%ebp + orl %esi,%ebp + movl %ebp,(%edi,%ebx,4) + decl %ebx + jge .L007copy + movl 24(%esp),%esp + movl $1,%eax + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size bn_mul_mont,.-.L_bn_mul_mont_begin +.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105 +.byte 112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56 +.byte 54,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121 +.byte 32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46 +.byte 111,114,103,62,0 +#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__) diff --git a/ring-0.17.14/pregenerated/x86-mont-win32n.asm b/ring-0.17.14/pregenerated/x86-mont-win32n.asm new file mode 100644 index 0000000000..fe0cc973e1 --- /dev/null +++ b/ring-0.17.14/pregenerated/x86-mont-win32n.asm @@ -0,0 +1,226 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. + +%include "ring_core_generated/prefix_symbols_nasm.inc" +%ifidn __OUTPUT_FORMAT__, win32 +%ifidn __OUTPUT_FORMAT__,obj +section code use32 class=code align=64 +%elifidn __OUTPUT_FORMAT__,win32 +$@feat.00 equ 1 +section .text code align=64 +%else +section .text code +%endif +global _bn_mul_mont +align 16 +_bn_mul_mont: +L$_bn_mul_mont_begin: + push ebp + push ebx + push esi + push edi + xor eax,eax + mov edi,DWORD [40+esp] + lea esi,[20+esp] + lea edx,[24+esp] + add edi,2 + neg edi + lea ebp,[edi*4+esp-32] + neg edi + mov eax,ebp + sub eax,edx + and eax,2047 + sub ebp,eax + xor edx,ebp + and edx,2048 + xor edx,2048 + sub ebp,edx + and ebp,-64 + mov eax,esp + sub eax,ebp + and eax,-4096 + mov edx,esp + lea esp,[eax*1+ebp] + mov eax,DWORD [esp] + cmp esp,ebp + ja NEAR L$000page_walk + jmp NEAR L$001page_walk_done +align 16 +L$000page_walk: + lea esp,[esp-4096] + mov eax,DWORD [esp] + cmp esp,ebp + ja NEAR L$000page_walk +L$001page_walk_done: + mov eax,DWORD [esi] + mov ebx,DWORD [4+esi] + mov ecx,DWORD [8+esi] + mov ebp,DWORD [12+esi] + mov esi,DWORD [16+esi] + mov esi,DWORD [esi] + mov DWORD [4+esp],eax + mov DWORD [8+esp],ebx + mov DWORD [12+esp],ecx + mov DWORD [16+esp],ebp + mov DWORD [20+esp],esi + lea ebx,[edi-3] + mov DWORD [24+esp],edx + mov eax,-1 + movd mm7,eax + mov esi,DWORD [8+esp] + mov edi,DWORD [12+esp] + mov ebp,DWORD [16+esp] + xor edx,edx + xor ecx,ecx + movd mm4,DWORD [edi] + movd mm5,DWORD [esi] + movd mm3,DWORD [ebp] + pmuludq mm5,mm4 + movq mm2,mm5 + movq mm0,mm5 + pand mm0,mm7 + pmuludq mm5,[20+esp] + pmuludq mm3,mm5 + paddq mm3,mm0 + movd mm1,DWORD [4+ebp] + movd mm0,DWORD [4+esi] + psrlq mm2,32 + psrlq mm3,32 + inc ecx +align 16 +L$0021st: + pmuludq mm0,mm4 + pmuludq mm1,mm5 + paddq mm2,mm0 + paddq mm3,mm1 + movq mm0,mm2 + pand mm0,mm7 + movd mm1,DWORD [4+ecx*4+ebp] + paddq mm3,mm0 + movd mm0,DWORD [4+ecx*4+esi] + psrlq mm2,32 + movd DWORD [28+ecx*4+esp],mm3 + psrlq mm3,32 + lea ecx,[1+ecx] + cmp ecx,ebx + jl NEAR L$0021st + pmuludq mm0,mm4 + pmuludq mm1,mm5 + paddq mm2,mm0 + paddq mm3,mm1 + movq mm0,mm2 + pand mm0,mm7 + paddq mm3,mm0 + movd DWORD [28+ecx*4+esp],mm3 + psrlq mm2,32 + psrlq mm3,32 + paddq mm3,mm2 + movq [32+ebx*4+esp],mm3 + inc edx +L$003outer: + xor ecx,ecx + movd mm4,DWORD [edx*4+edi] + movd mm5,DWORD [esi] + movd mm6,DWORD [32+esp] + movd mm3,DWORD [ebp] + pmuludq mm5,mm4 + paddq mm5,mm6 + movq mm0,mm5 + movq mm2,mm5 + pand mm0,mm7 + pmuludq mm5,[20+esp] 
+ pmuludq mm3,mm5 + paddq mm3,mm0 + movd mm6,DWORD [36+esp] + movd mm1,DWORD [4+ebp] + movd mm0,DWORD [4+esi] + psrlq mm2,32 + psrlq mm3,32 + paddq mm2,mm6 + inc ecx + dec ebx +L$004inner: + pmuludq mm0,mm4 + pmuludq mm1,mm5 + paddq mm2,mm0 + paddq mm3,mm1 + movq mm0,mm2 + movd mm6,DWORD [36+ecx*4+esp] + pand mm0,mm7 + movd mm1,DWORD [4+ecx*4+ebp] + paddq mm3,mm0 + movd mm0,DWORD [4+ecx*4+esi] + psrlq mm2,32 + movd DWORD [28+ecx*4+esp],mm3 + psrlq mm3,32 + paddq mm2,mm6 + dec ebx + lea ecx,[1+ecx] + jnz NEAR L$004inner + mov ebx,ecx + pmuludq mm0,mm4 + pmuludq mm1,mm5 + paddq mm2,mm0 + paddq mm3,mm1 + movq mm0,mm2 + pand mm0,mm7 + paddq mm3,mm0 + movd DWORD [28+ecx*4+esp],mm3 + psrlq mm2,32 + psrlq mm3,32 + movd mm6,DWORD [36+ebx*4+esp] + paddq mm3,mm2 + paddq mm3,mm6 + movq [32+ebx*4+esp],mm3 + lea edx,[1+edx] + cmp edx,ebx + jle NEAR L$003outer + emms + jmp NEAR L$005common_tail +align 16 +L$005common_tail: + mov ebp,DWORD [16+esp] + mov edi,DWORD [4+esp] + lea esi,[32+esp] + mov eax,DWORD [esi] + mov ecx,ebx + xor edx,edx +align 16 +L$006sub: + sbb eax,DWORD [edx*4+ebp] + mov DWORD [edx*4+edi],eax + dec ecx + mov eax,DWORD [4+edx*4+esi] + lea edx,[1+edx] + jge NEAR L$006sub + sbb eax,0 + mov edx,-1 + xor edx,eax + jmp NEAR L$007copy +align 16 +L$007copy: + mov esi,DWORD [32+ebx*4+esp] + mov ebp,DWORD [ebx*4+edi] + mov DWORD [32+ebx*4+esp],ecx + and esi,eax + and ebp,edx + or ebp,esi + mov DWORD [ebx*4+edi],ebp + dec ebx + jge NEAR L$007copy + mov esp,DWORD [24+esp] + mov eax,1 + pop edi + pop esi + pop ebx + pop ebp + ret +db 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105 +db 112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56 +db 54,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121 +db 32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46 +db 111,114,103,62,0 +%else +; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 +ret +%endif diff --git a/ring-0.17.14/pregenerated/x86-mont-win32n.o b/ring-0.17.14/pregenerated/x86-mont-win32n.o new file mode 100644 index 0000000000..9d1b72b85d Binary files /dev/null and b/ring-0.17.14/pregenerated/x86-mont-win32n.o differ diff --git a/ring-0.17.14/pregenerated/x86_64-mont-elf.S b/ring-0.17.14/pregenerated/x86_64-mont-elf.S new file mode 100644 index 0000000000..01888e72bc --- /dev/null +++ b/ring-0.17.14/pregenerated/x86_64-mont-elf.S @@ -0,0 +1,1237 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__) +.text + +.globl bn_mul_mont_nohw +.hidden bn_mul_mont_nohw +.type bn_mul_mont_nohw,@function +.align 16 +bn_mul_mont_nohw: +.cfi_startproc +_CET_ENDBR + movl %r9d,%r9d + movq %rsp,%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 + + negq %r9 + movq %rsp,%r11 + leaq -16(%rsp,%r9,8),%r10 + negq %r9 + andq $-1024,%r10 + + + + + + + + + + subq %r10,%r11 + andq $-4096,%r11 + leaq (%r10,%r11,1),%rsp + movq (%rsp),%r11 + cmpq %r10,%rsp + ja .Lmul_page_walk + jmp .Lmul_page_walk_done + +.align 16 +.Lmul_page_walk: + leaq -4096(%rsp),%rsp + movq (%rsp),%r11 + cmpq %r10,%rsp + ja .Lmul_page_walk +.Lmul_page_walk_done: + + movq %rax,8(%rsp,%r9,8) +.cfi_escape 0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08 +.Lmul_body: + movq %rdx,%r12 + movq (%r8),%r8 + movq (%r12),%rbx + movq (%rsi),%rax + + xorq %r14,%r14 + xorq %r15,%r15 + + movq %r8,%rbp + mulq %rbx + movq %rax,%r10 + movq (%rcx),%rax + + imulq %r10,%rbp + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r10 + movq 8(%rsi),%rax + adcq $0,%rdx + movq %rdx,%r13 + + leaq 1(%r15),%r15 + jmp .L1st_enter + +.align 16 +.L1st: + addq %rax,%r13 + movq (%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r11,%r13 + movq %r10,%r11 + adcq $0,%rdx + movq %r13,-16(%rsp,%r15,8) + movq %rdx,%r13 + +.L1st_enter: + mulq %rbx + addq %rax,%r11 + movq (%rcx,%r15,8),%rax + adcq $0,%rdx + leaq 1(%r15),%r15 + movq %rdx,%r10 + + mulq %rbp + cmpq %r9,%r15 + jne .L1st + + addq %rax,%r13 + movq (%rsi),%rax + adcq $0,%rdx + addq %r11,%r13 + adcq $0,%rdx + movq %r13,-16(%rsp,%r15,8) + movq %rdx,%r13 + movq %r10,%r11 + + xorq %rdx,%rdx + addq %r11,%r13 + adcq $0,%rdx + movq %r13,-8(%rsp,%r9,8) + movq %rdx,(%rsp,%r9,8) + + leaq 1(%r14),%r14 + jmp .Louter +.align 16 +.Louter: + movq (%r12,%r14,8),%rbx + xorq %r15,%r15 + movq %r8,%rbp + movq (%rsp),%r10 + mulq %rbx + addq %rax,%r10 + movq (%rcx),%rax + adcq $0,%rdx + + imulq %r10,%rbp + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r10 + movq 8(%rsi),%rax + adcq $0,%rdx + movq 8(%rsp),%r10 + movq %rdx,%r13 + + leaq 1(%r15),%r15 + jmp .Linner_enter + +.align 16 +.Linner: + addq %rax,%r13 + movq (%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r10,%r13 + movq (%rsp,%r15,8),%r10 + adcq $0,%rdx + movq %r13,-16(%rsp,%r15,8) + movq %rdx,%r13 + +.Linner_enter: + mulq %rbx + addq %rax,%r11 + movq (%rcx,%r15,8),%rax + adcq $0,%rdx + addq %r11,%r10 + movq %rdx,%r11 + adcq $0,%r11 + leaq 1(%r15),%r15 + + mulq %rbp + cmpq %r9,%r15 + jne .Linner + + addq %rax,%r13 + movq (%rsi),%rax + adcq $0,%rdx + addq %r10,%r13 + movq (%rsp,%r15,8),%r10 + adcq $0,%rdx + movq %r13,-16(%rsp,%r15,8) + movq %rdx,%r13 + + xorq %rdx,%rdx + addq %r11,%r13 + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %r13,-8(%rsp,%r9,8) + movq %rdx,(%rsp,%r9,8) + + leaq 1(%r14),%r14 + cmpq %r9,%r14 + jb .Louter + + xorq %r14,%r14 + movq (%rsp),%rax + movq %r9,%r15 + +.align 16 +.Lsub: sbbq (%rcx,%r14,8),%rax + movq %rax,(%rdi,%r14,8) + movq 8(%rsp,%r14,8),%rax + leaq 1(%r14),%r14 + decq %r15 + jnz .Lsub + + sbbq $0,%rax + movq $-1,%rbx + xorq %rax,%rbx + xorq %r14,%r14 + movq %r9,%r15 + +.Lcopy: + movq (%rdi,%r14,8),%rcx + movq (%rsp,%r14,8),%rdx + andq %rbx,%rcx + andq %rax,%rdx + movq %r9,(%rsp,%r14,8) + orq %rcx,%rdx + movq %rdx,(%rdi,%r14,8) + leaq 1(%r14),%r14 + subq $1,%r15 + jnz .Lcopy + + movq 
8(%rsp,%r9,8),%rsi +.cfi_def_cfa %rsi,8 + movq $1,%rax + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lmul_epilogue: + ret +.cfi_endproc +.size bn_mul_mont_nohw,.-bn_mul_mont_nohw +.globl bn_mul4x_mont +.hidden bn_mul4x_mont +.type bn_mul4x_mont,@function +.align 16 +bn_mul4x_mont: +.cfi_startproc +_CET_ENDBR + movl %r9d,%r9d + movq %rsp,%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 + + negq %r9 + movq %rsp,%r11 + leaq -32(%rsp,%r9,8),%r10 + negq %r9 + andq $-1024,%r10 + + subq %r10,%r11 + andq $-4096,%r11 + leaq (%r10,%r11,1),%rsp + movq (%rsp),%r11 + cmpq %r10,%rsp + ja .Lmul4x_page_walk + jmp .Lmul4x_page_walk_done + +.Lmul4x_page_walk: + leaq -4096(%rsp),%rsp + movq (%rsp),%r11 + cmpq %r10,%rsp + ja .Lmul4x_page_walk +.Lmul4x_page_walk_done: + + movq %rax,8(%rsp,%r9,8) +.cfi_escape 0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08 +.Lmul4x_body: + movq %rdi,16(%rsp,%r9,8) + movq %rdx,%r12 + movq (%r8),%r8 + movq (%r12),%rbx + movq (%rsi),%rax + + xorq %r14,%r14 + xorq %r15,%r15 + + movq %r8,%rbp + mulq %rbx + movq %rax,%r10 + movq (%rcx),%rax + + imulq %r10,%rbp + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r10 + movq 8(%rsi),%rax + adcq $0,%rdx + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq 8(%rcx),%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq 16(%rsi),%rax + adcq $0,%rdx + addq %r11,%rdi + leaq 4(%r15),%r15 + adcq $0,%rdx + movq %rdi,(%rsp) + movq %rdx,%r13 + jmp .L1st4x +.align 16 +.L1st4x: + mulq %rbx + addq %rax,%r10 + movq -16(%rcx,%r15,8),%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq -8(%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %r13,-24(%rsp,%r15,8) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq -8(%rcx,%r15,8),%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq (%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r11,%rdi + adcq $0,%rdx + movq %rdi,-16(%rsp,%r15,8) + movq %rdx,%r13 + + mulq %rbx + addq %rax,%r10 + movq (%rcx,%r15,8),%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq 8(%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %r13,-8(%rsp,%r15,8) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq 8(%rcx,%r15,8),%rax + adcq $0,%rdx + leaq 4(%r15),%r15 + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq -16(%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r11,%rdi + adcq $0,%rdx + movq %rdi,-32(%rsp,%r15,8) + movq %rdx,%r13 + cmpq %r9,%r15 + jb .L1st4x + + mulq %rbx + addq %rax,%r10 + movq -16(%rcx,%r15,8),%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq -8(%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %r13,-24(%rsp,%r15,8) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq -8(%rcx,%r15,8),%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq (%rsi),%rax + adcq $0,%rdx + addq %r11,%rdi + adcq $0,%rdx + movq %rdi,-16(%rsp,%r15,8) + movq %rdx,%r13 + + xorq %rdi,%rdi + addq %r10,%r13 + adcq $0,%rdi + movq %r13,-8(%rsp,%r15,8) + movq %rdi,(%rsp,%r15,8) + + leaq 1(%r14),%r14 +.align 4 +.Louter4x: + 
movq (%r12,%r14,8),%rbx + xorq %r15,%r15 + movq (%rsp),%r10 + movq %r8,%rbp + mulq %rbx + addq %rax,%r10 + movq (%rcx),%rax + adcq $0,%rdx + + imulq %r10,%rbp + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r10 + movq 8(%rsi),%rax + adcq $0,%rdx + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq 8(%rcx),%rax + adcq $0,%rdx + addq 8(%rsp),%r11 + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq 16(%rsi),%rax + adcq $0,%rdx + addq %r11,%rdi + leaq 4(%r15),%r15 + adcq $0,%rdx + movq %rdi,(%rsp) + movq %rdx,%r13 + jmp .Linner4x +.align 16 +.Linner4x: + mulq %rbx + addq %rax,%r10 + movq -16(%rcx,%r15,8),%rax + adcq $0,%rdx + addq -16(%rsp,%r15,8),%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq -8(%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %r13,-24(%rsp,%r15,8) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq -8(%rcx,%r15,8),%rax + adcq $0,%rdx + addq -8(%rsp,%r15,8),%r11 + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq (%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r11,%rdi + adcq $0,%rdx + movq %rdi,-16(%rsp,%r15,8) + movq %rdx,%r13 + + mulq %rbx + addq %rax,%r10 + movq (%rcx,%r15,8),%rax + adcq $0,%rdx + addq (%rsp,%r15,8),%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq 8(%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %r13,-8(%rsp,%r15,8) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq 8(%rcx,%r15,8),%rax + adcq $0,%rdx + addq 8(%rsp,%r15,8),%r11 + adcq $0,%rdx + leaq 4(%r15),%r15 + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq -16(%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r11,%rdi + adcq $0,%rdx + movq %rdi,-32(%rsp,%r15,8) + movq %rdx,%r13 + cmpq %r9,%r15 + jb .Linner4x + + mulq %rbx + addq %rax,%r10 + movq -16(%rcx,%r15,8),%rax + adcq $0,%rdx + addq -16(%rsp,%r15,8),%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq -8(%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %r13,-24(%rsp,%r15,8) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq -8(%rcx,%r15,8),%rax + adcq $0,%rdx + addq -8(%rsp,%r15,8),%r11 + adcq $0,%rdx + leaq 1(%r14),%r14 + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq (%rsi),%rax + adcq $0,%rdx + addq %r11,%rdi + adcq $0,%rdx + movq %rdi,-16(%rsp,%r15,8) + movq %rdx,%r13 + + xorq %rdi,%rdi + addq %r10,%r13 + adcq $0,%rdi + addq (%rsp,%r9,8),%r13 + adcq $0,%rdi + movq %r13,-8(%rsp,%r15,8) + movq %rdi,(%rsp,%r15,8) + + cmpq %r9,%r14 + jb .Louter4x + movq 16(%rsp,%r9,8),%rdi + leaq -4(%r9),%r15 + movq 0(%rsp),%rax + movq 8(%rsp),%rdx + shrq $2,%r15 + leaq (%rsp),%rsi + xorq %r14,%r14 + + subq 0(%rcx),%rax + movq 16(%rsi),%rbx + movq 24(%rsi),%rbp + sbbq 8(%rcx),%rdx + +.Lsub4x: + movq %rax,0(%rdi,%r14,8) + movq %rdx,8(%rdi,%r14,8) + sbbq 16(%rcx,%r14,8),%rbx + movq 32(%rsi,%r14,8),%rax + movq 40(%rsi,%r14,8),%rdx + sbbq 24(%rcx,%r14,8),%rbp + movq %rbx,16(%rdi,%r14,8) + movq %rbp,24(%rdi,%r14,8) + sbbq 32(%rcx,%r14,8),%rax + movq 48(%rsi,%r14,8),%rbx + movq 56(%rsi,%r14,8),%rbp + sbbq 40(%rcx,%r14,8),%rdx + leaq 4(%r14),%r14 + decq %r15 + jnz .Lsub4x + + movq %rax,0(%rdi,%r14,8) + movq 32(%rsi,%r14,8),%rax + sbbq 16(%rcx,%r14,8),%rbx + movq %rdx,8(%rdi,%r14,8) + sbbq 24(%rcx,%r14,8),%rbp + movq %rbx,16(%rdi,%r14,8) + + sbbq $0,%rax + movq %rbp,24(%rdi,%r14,8) + pxor %xmm0,%xmm0 +.byte 102,72,15,110,224 + pcmpeqd %xmm5,%xmm5 + pshufd $0,%xmm4,%xmm4 + movq %r9,%r15 + pxor %xmm4,%xmm5 + shrq $2,%r15 + xorl %eax,%eax + + jmp .Lcopy4x +.align 16 +.Lcopy4x: + movdqa 
(%rsp,%rax,1),%xmm1 + movdqu (%rdi,%rax,1),%xmm2 + pand %xmm4,%xmm1 + pand %xmm5,%xmm2 + movdqa 16(%rsp,%rax,1),%xmm3 + movdqa %xmm0,(%rsp,%rax,1) + por %xmm2,%xmm1 + movdqu 16(%rdi,%rax,1),%xmm2 + movdqu %xmm1,(%rdi,%rax,1) + pand %xmm4,%xmm3 + pand %xmm5,%xmm2 + movdqa %xmm0,16(%rsp,%rax,1) + por %xmm2,%xmm3 + movdqu %xmm3,16(%rdi,%rax,1) + leaq 32(%rax),%rax + decq %r15 + jnz .Lcopy4x + movq 8(%rsp,%r9,8),%rsi +.cfi_def_cfa %rsi, 8 + movq $1,%rax + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lmul4x_epilogue: + ret +.cfi_endproc +.size bn_mul4x_mont,.-bn_mul4x_mont +.extern bn_sqrx8x_internal +.hidden bn_sqrx8x_internal +.extern bn_sqr8x_internal +.hidden bn_sqr8x_internal + +.globl bn_sqr8x_mont +.hidden bn_sqr8x_mont +.type bn_sqr8x_mont,@function +.align 32 +bn_sqr8x_mont: +.cfi_startproc +_CET_ENDBR + movl %r9d,%r9d + movq %rsp,%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 +.Lsqr8x_prologue: + + movl %r9d,%r10d + shll $3,%r9d + shlq $3+2,%r10 + negq %r9 + + + + + + + leaq -64(%rsp,%r9,2),%r11 + movq %rsp,%rbp + movq (%r8),%r8 + subq %rsi,%r11 + andq $4095,%r11 + cmpq %r11,%r10 + jb .Lsqr8x_sp_alt + subq %r11,%rbp + leaq -64(%rbp,%r9,2),%rbp + jmp .Lsqr8x_sp_done + +.align 32 +.Lsqr8x_sp_alt: + leaq 4096-64(,%r9,2),%r10 + leaq -64(%rbp,%r9,2),%rbp + subq %r10,%r11 + movq $0,%r10 + cmovcq %r10,%r11 + subq %r11,%rbp +.Lsqr8x_sp_done: + andq $-64,%rbp + movq %rsp,%r11 + subq %rbp,%r11 + andq $-4096,%r11 + leaq (%r11,%rbp,1),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja .Lsqr8x_page_walk + jmp .Lsqr8x_page_walk_done + +.align 16 +.Lsqr8x_page_walk: + leaq -4096(%rsp),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja .Lsqr8x_page_walk +.Lsqr8x_page_walk_done: + + movq %r9,%r10 + negq %r9 + + movq %r8,32(%rsp) + movq %rax,40(%rsp) +.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08 +.Lsqr8x_body: + +.byte 102,72,15,110,209 + pxor %xmm0,%xmm0 +.byte 102,72,15,110,207 +.byte 102,73,15,110,218 + testq %rdx,%rdx + jz .Lsqr8x_nox + + call bn_sqrx8x_internal + + + + + leaq (%r8,%rcx,1),%rbx + movq %rcx,%r9 + movq %rcx,%rdx +.byte 102,72,15,126,207 + sarq $3+2,%rcx + jmp .Lsqr8x_sub + +.align 32 +.Lsqr8x_nox: + call bn_sqr8x_internal + + + + + leaq (%rdi,%r9,1),%rbx + movq %r9,%rcx + movq %r9,%rdx +.byte 102,72,15,126,207 + sarq $3+2,%rcx + jmp .Lsqr8x_sub + +.align 32 +.Lsqr8x_sub: + movq 0(%rbx),%r12 + movq 8(%rbx),%r13 + movq 16(%rbx),%r14 + movq 24(%rbx),%r15 + leaq 32(%rbx),%rbx + sbbq 0(%rbp),%r12 + sbbq 8(%rbp),%r13 + sbbq 16(%rbp),%r14 + sbbq 24(%rbp),%r15 + leaq 32(%rbp),%rbp + movq %r12,0(%rdi) + movq %r13,8(%rdi) + movq %r14,16(%rdi) + movq %r15,24(%rdi) + leaq 32(%rdi),%rdi + incq %rcx + jnz .Lsqr8x_sub + + sbbq $0,%rax + leaq (%rbx,%r9,1),%rbx + leaq (%rdi,%r9,1),%rdi + +.byte 102,72,15,110,200 + pxor %xmm0,%xmm0 + pshufd $0,%xmm1,%xmm1 + movq 40(%rsp),%rsi +.cfi_def_cfa %rsi,8 + jmp .Lsqr8x_cond_copy + +.align 32 +.Lsqr8x_cond_copy: + movdqa 0(%rbx),%xmm2 + movdqa 16(%rbx),%xmm3 + leaq 32(%rbx),%rbx + movdqu 0(%rdi),%xmm4 + movdqu 16(%rdi),%xmm5 + leaq 32(%rdi),%rdi + movdqa %xmm0,-32(%rbx) + movdqa %xmm0,-16(%rbx) + movdqa %xmm0,-32(%rbx,%rdx,1) + movdqa 
%xmm0,-16(%rbx,%rdx,1) + pcmpeqd %xmm1,%xmm0 + pand %xmm1,%xmm2 + pand %xmm1,%xmm3 + pand %xmm0,%xmm4 + pand %xmm0,%xmm5 + pxor %xmm0,%xmm0 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqu %xmm4,-32(%rdi) + movdqu %xmm5,-16(%rdi) + addq $32,%r9 + jnz .Lsqr8x_cond_copy + + movq $1,%rax + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lsqr8x_epilogue: + ret +.cfi_endproc +.size bn_sqr8x_mont,.-bn_sqr8x_mont +.globl bn_mulx4x_mont +.hidden bn_mulx4x_mont +.type bn_mulx4x_mont,@function +.align 32 +bn_mulx4x_mont: +.cfi_startproc +_CET_ENDBR + movq %rsp,%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 +.Lmulx4x_prologue: + + shll $3,%r9d + xorq %r10,%r10 + subq %r9,%r10 + movq (%r8),%r8 + leaq -72(%rsp,%r10,1),%rbp + andq $-128,%rbp + movq %rsp,%r11 + subq %rbp,%r11 + andq $-4096,%r11 + leaq (%r11,%rbp,1),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja .Lmulx4x_page_walk + jmp .Lmulx4x_page_walk_done + +.align 16 +.Lmulx4x_page_walk: + leaq -4096(%rsp),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja .Lmulx4x_page_walk +.Lmulx4x_page_walk_done: + + leaq (%rdx,%r9,1),%r10 + + + + + + + + + + + + + movq %r9,0(%rsp) + shrq $5,%r9 + movq %r10,16(%rsp) + subq $1,%r9 + movq %r8,24(%rsp) + movq %rdi,32(%rsp) + movq %rax,40(%rsp) +.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08 + movq %r9,48(%rsp) + jmp .Lmulx4x_body + +.align 32 +.Lmulx4x_body: + leaq 8(%rdx),%rdi + movq (%rdx),%rdx + leaq 64+32(%rsp),%rbx + movq %rdx,%r9 + + mulxq 0(%rsi),%r8,%rax + mulxq 8(%rsi),%r11,%r14 + addq %rax,%r11 + movq %rdi,8(%rsp) + mulxq 16(%rsi),%r12,%r13 + adcq %r14,%r12 + adcq $0,%r13 + + movq %r8,%rdi + imulq 24(%rsp),%r8 + xorq %rbp,%rbp + + mulxq 24(%rsi),%rax,%r14 + movq %r8,%rdx + leaq 32(%rsi),%rsi + adcxq %rax,%r13 + adcxq %rbp,%r14 + + mulxq 0(%rcx),%rax,%r10 + adcxq %rax,%rdi + adoxq %r11,%r10 + mulxq 8(%rcx),%rax,%r11 + adcxq %rax,%r10 + adoxq %r12,%r11 +.byte 0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00 + movq 48(%rsp),%rdi + movq %r10,-32(%rbx) + adcxq %rax,%r11 + adoxq %r13,%r12 + mulxq 24(%rcx),%rax,%r15 + movq %r9,%rdx + movq %r11,-24(%rbx) + adcxq %rax,%r12 + adoxq %rbp,%r15 + leaq 32(%rcx),%rcx + movq %r12,-16(%rbx) + + jmp .Lmulx4x_1st + +.align 32 +.Lmulx4x_1st: + adcxq %rbp,%r15 + mulxq 0(%rsi),%r10,%rax + adcxq %r14,%r10 + mulxq 8(%rsi),%r11,%r14 + adcxq %rax,%r11 + mulxq 16(%rsi),%r12,%rax + adcxq %r14,%r12 + mulxq 24(%rsi),%r13,%r14 +.byte 0x67,0x67 + movq %r8,%rdx + adcxq %rax,%r13 + adcxq %rbp,%r14 + leaq 32(%rsi),%rsi + leaq 32(%rbx),%rbx + + adoxq %r15,%r10 + mulxq 0(%rcx),%rax,%r15 + adcxq %rax,%r10 + adoxq %r15,%r11 + mulxq 8(%rcx),%rax,%r15 + adcxq %rax,%r11 + adoxq %r15,%r12 + mulxq 16(%rcx),%rax,%r15 + movq %r10,-40(%rbx) + adcxq %rax,%r12 + movq %r11,-32(%rbx) + adoxq %r15,%r13 + mulxq 24(%rcx),%rax,%r15 + movq %r9,%rdx + movq %r12,-24(%rbx) + adcxq %rax,%r13 + adoxq %rbp,%r15 + leaq 32(%rcx),%rcx + movq %r13,-16(%rbx) + + decq %rdi + jnz .Lmulx4x_1st + + movq 0(%rsp),%rax + movq 8(%rsp),%rdi + adcq %rbp,%r15 + addq %r15,%r14 + sbbq %r15,%r15 + movq %r14,-8(%rbx) + jmp .Lmulx4x_outer + +.align 32 +.Lmulx4x_outer: + movq (%rdi),%rdx + leaq 8(%rdi),%rdi + subq 
%rax,%rsi + movq %r15,(%rbx) + leaq 64+32(%rsp),%rbx + subq %rax,%rcx + + mulxq 0(%rsi),%r8,%r11 + xorl %ebp,%ebp + movq %rdx,%r9 + mulxq 8(%rsi),%r14,%r12 + adoxq -32(%rbx),%r8 + adcxq %r14,%r11 + mulxq 16(%rsi),%r15,%r13 + adoxq -24(%rbx),%r11 + adcxq %r15,%r12 + adoxq -16(%rbx),%r12 + adcxq %rbp,%r13 + adoxq %rbp,%r13 + + movq %rdi,8(%rsp) + movq %r8,%r15 + imulq 24(%rsp),%r8 + xorl %ebp,%ebp + + mulxq 24(%rsi),%rax,%r14 + movq %r8,%rdx + adcxq %rax,%r13 + adoxq -8(%rbx),%r13 + adcxq %rbp,%r14 + leaq 32(%rsi),%rsi + adoxq %rbp,%r14 + + mulxq 0(%rcx),%rax,%r10 + adcxq %rax,%r15 + adoxq %r11,%r10 + mulxq 8(%rcx),%rax,%r11 + adcxq %rax,%r10 + adoxq %r12,%r11 + mulxq 16(%rcx),%rax,%r12 + movq %r10,-32(%rbx) + adcxq %rax,%r11 + adoxq %r13,%r12 + mulxq 24(%rcx),%rax,%r15 + movq %r9,%rdx + movq %r11,-24(%rbx) + leaq 32(%rcx),%rcx + adcxq %rax,%r12 + adoxq %rbp,%r15 + movq 48(%rsp),%rdi + movq %r12,-16(%rbx) + + jmp .Lmulx4x_inner + +.align 32 +.Lmulx4x_inner: + mulxq 0(%rsi),%r10,%rax + adcxq %rbp,%r15 + adoxq %r14,%r10 + mulxq 8(%rsi),%r11,%r14 + adcxq 0(%rbx),%r10 + adoxq %rax,%r11 + mulxq 16(%rsi),%r12,%rax + adcxq 8(%rbx),%r11 + adoxq %r14,%r12 + mulxq 24(%rsi),%r13,%r14 + movq %r8,%rdx + adcxq 16(%rbx),%r12 + adoxq %rax,%r13 + adcxq 24(%rbx),%r13 + adoxq %rbp,%r14 + leaq 32(%rsi),%rsi + leaq 32(%rbx),%rbx + adcxq %rbp,%r14 + + adoxq %r15,%r10 + mulxq 0(%rcx),%rax,%r15 + adcxq %rax,%r10 + adoxq %r15,%r11 + mulxq 8(%rcx),%rax,%r15 + adcxq %rax,%r11 + adoxq %r15,%r12 + mulxq 16(%rcx),%rax,%r15 + movq %r10,-40(%rbx) + adcxq %rax,%r12 + adoxq %r15,%r13 + mulxq 24(%rcx),%rax,%r15 + movq %r9,%rdx + movq %r11,-32(%rbx) + movq %r12,-24(%rbx) + adcxq %rax,%r13 + adoxq %rbp,%r15 + leaq 32(%rcx),%rcx + movq %r13,-16(%rbx) + + decq %rdi + jnz .Lmulx4x_inner + + movq 0(%rsp),%rax + movq 8(%rsp),%rdi + adcq %rbp,%r15 + subq 0(%rbx),%rbp + adcq %r15,%r14 + sbbq %r15,%r15 + movq %r14,-8(%rbx) + + cmpq 16(%rsp),%rdi + jne .Lmulx4x_outer + + leaq 64(%rsp),%rbx + subq %rax,%rcx + negq %r15 + movq %rax,%rdx + shrq $3+2,%rax + movq 32(%rsp),%rdi + jmp .Lmulx4x_sub + +.align 32 +.Lmulx4x_sub: + movq 0(%rbx),%r11 + movq 8(%rbx),%r12 + movq 16(%rbx),%r13 + movq 24(%rbx),%r14 + leaq 32(%rbx),%rbx + sbbq 0(%rcx),%r11 + sbbq 8(%rcx),%r12 + sbbq 16(%rcx),%r13 + sbbq 24(%rcx),%r14 + leaq 32(%rcx),%rcx + movq %r11,0(%rdi) + movq %r12,8(%rdi) + movq %r13,16(%rdi) + movq %r14,24(%rdi) + leaq 32(%rdi),%rdi + decq %rax + jnz .Lmulx4x_sub + + sbbq $0,%r15 + leaq 64(%rsp),%rbx + subq %rdx,%rdi + +.byte 102,73,15,110,207 + pxor %xmm0,%xmm0 + pshufd $0,%xmm1,%xmm1 + movq 40(%rsp),%rsi +.cfi_def_cfa %rsi,8 + jmp .Lmulx4x_cond_copy + +.align 32 +.Lmulx4x_cond_copy: + movdqa 0(%rbx),%xmm2 + movdqa 16(%rbx),%xmm3 + leaq 32(%rbx),%rbx + movdqu 0(%rdi),%xmm4 + movdqu 16(%rdi),%xmm5 + leaq 32(%rdi),%rdi + movdqa %xmm0,-32(%rbx) + movdqa %xmm0,-16(%rbx) + pcmpeqd %xmm1,%xmm0 + pand %xmm1,%xmm2 + pand %xmm1,%xmm3 + pand %xmm0,%xmm4 + pand %xmm0,%xmm5 + pxor %xmm0,%xmm0 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqu %xmm4,-32(%rdi) + movdqu %xmm5,-16(%rdi) + subq $32,%rdx + jnz .Lmulx4x_cond_copy + + movq %rdx,(%rbx) + + movq $1,%rax + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lmulx4x_epilogue: + ret +.cfi_endproc +.size bn_mulx4x_mont,.-bn_mulx4x_mont +.byte 
77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 16 +#endif diff --git a/ring-0.17.14/pregenerated/x86_64-mont-macosx.S b/ring-0.17.14/pregenerated/x86_64-mont-macosx.S new file mode 100644 index 0000000000..8b8185534c --- /dev/null +++ b/ring-0.17.14/pregenerated/x86_64-mont-macosx.S @@ -0,0 +1,1235 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__) +.text + +.globl _bn_mul_mont_nohw +.private_extern _bn_mul_mont_nohw + +.p2align 4 +_bn_mul_mont_nohw: + +_CET_ENDBR + movl %r9d,%r9d + movq %rsp,%rax + + pushq %rbx + + pushq %rbp + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + + negq %r9 + movq %rsp,%r11 + leaq -16(%rsp,%r9,8),%r10 + negq %r9 + andq $-1024,%r10 + + + + + + + + + + subq %r10,%r11 + andq $-4096,%r11 + leaq (%r10,%r11,1),%rsp + movq (%rsp),%r11 + cmpq %r10,%rsp + ja L$mul_page_walk + jmp L$mul_page_walk_done + +.p2align 4 +L$mul_page_walk: + leaq -4096(%rsp),%rsp + movq (%rsp),%r11 + cmpq %r10,%rsp + ja L$mul_page_walk +L$mul_page_walk_done: + + movq %rax,8(%rsp,%r9,8) + +L$mul_body: + movq %rdx,%r12 + movq (%r8),%r8 + movq (%r12),%rbx + movq (%rsi),%rax + + xorq %r14,%r14 + xorq %r15,%r15 + + movq %r8,%rbp + mulq %rbx + movq %rax,%r10 + movq (%rcx),%rax + + imulq %r10,%rbp + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r10 + movq 8(%rsi),%rax + adcq $0,%rdx + movq %rdx,%r13 + + leaq 1(%r15),%r15 + jmp L$1st_enter + +.p2align 4 +L$1st: + addq %rax,%r13 + movq (%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r11,%r13 + movq %r10,%r11 + adcq $0,%rdx + movq %r13,-16(%rsp,%r15,8) + movq %rdx,%r13 + +L$1st_enter: + mulq %rbx + addq %rax,%r11 + movq (%rcx,%r15,8),%rax + adcq $0,%rdx + leaq 1(%r15),%r15 + movq %rdx,%r10 + + mulq %rbp + cmpq %r9,%r15 + jne L$1st + + addq %rax,%r13 + movq (%rsi),%rax + adcq $0,%rdx + addq %r11,%r13 + adcq $0,%rdx + movq %r13,-16(%rsp,%r15,8) + movq %rdx,%r13 + movq %r10,%r11 + + xorq %rdx,%rdx + addq %r11,%r13 + adcq $0,%rdx + movq %r13,-8(%rsp,%r9,8) + movq %rdx,(%rsp,%r9,8) + + leaq 1(%r14),%r14 + jmp L$outer +.p2align 4 +L$outer: + movq (%r12,%r14,8),%rbx + xorq %r15,%r15 + movq %r8,%rbp + movq (%rsp),%r10 + mulq %rbx + addq %rax,%r10 + movq (%rcx),%rax + adcq $0,%rdx + + imulq %r10,%rbp + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r10 + movq 8(%rsi),%rax + adcq $0,%rdx + movq 8(%rsp),%r10 + movq %rdx,%r13 + + leaq 1(%r15),%r15 + jmp L$inner_enter + +.p2align 4 +L$inner: + addq %rax,%r13 + movq (%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r10,%r13 + movq (%rsp,%r15,8),%r10 + adcq $0,%rdx + movq %r13,-16(%rsp,%r15,8) + movq %rdx,%r13 + +L$inner_enter: + mulq %rbx + addq %rax,%r11 + movq (%rcx,%r15,8),%rax + adcq $0,%rdx + addq %r11,%r10 + movq %rdx,%r11 + adcq $0,%r11 + leaq 1(%r15),%r15 + + mulq %rbp + cmpq %r9,%r15 + jne L$inner + + addq %rax,%r13 + movq (%rsi),%rax + adcq $0,%rdx + addq %r10,%r13 + movq (%rsp,%r15,8),%r10 + adcq $0,%rdx + movq %r13,-16(%rsp,%r15,8) + movq %rdx,%r13 + + xorq %rdx,%rdx + addq %r11,%r13 + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %r13,-8(%rsp,%r9,8) + movq %rdx,(%rsp,%r9,8) + + leaq 1(%r14),%r14 + cmpq %r9,%r14 + jb L$outer + + xorq %r14,%r14 + movq (%rsp),%rax + movq %r9,%r15 + +.p2align 4 +L$sub: sbbq (%rcx,%r14,8),%rax + movq 
%rax,(%rdi,%r14,8) + movq 8(%rsp,%r14,8),%rax + leaq 1(%r14),%r14 + decq %r15 + jnz L$sub + + sbbq $0,%rax + movq $-1,%rbx + xorq %rax,%rbx + xorq %r14,%r14 + movq %r9,%r15 + +L$copy: + movq (%rdi,%r14,8),%rcx + movq (%rsp,%r14,8),%rdx + andq %rbx,%rcx + andq %rax,%rdx + movq %r9,(%rsp,%r14,8) + orq %rcx,%rdx + movq %rdx,(%rdi,%r14,8) + leaq 1(%r14),%r14 + subq $1,%r15 + jnz L$copy + + movq 8(%rsp,%r9,8),%rsi + + movq $1,%rax + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbp + + movq -8(%rsi),%rbx + + leaq (%rsi),%rsp + +L$mul_epilogue: + ret + + +.globl _bn_mul4x_mont +.private_extern _bn_mul4x_mont + +.p2align 4 +_bn_mul4x_mont: + +_CET_ENDBR + movl %r9d,%r9d + movq %rsp,%rax + + pushq %rbx + + pushq %rbp + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + + negq %r9 + movq %rsp,%r11 + leaq -32(%rsp,%r9,8),%r10 + negq %r9 + andq $-1024,%r10 + + subq %r10,%r11 + andq $-4096,%r11 + leaq (%r10,%r11,1),%rsp + movq (%rsp),%r11 + cmpq %r10,%rsp + ja L$mul4x_page_walk + jmp L$mul4x_page_walk_done + +L$mul4x_page_walk: + leaq -4096(%rsp),%rsp + movq (%rsp),%r11 + cmpq %r10,%rsp + ja L$mul4x_page_walk +L$mul4x_page_walk_done: + + movq %rax,8(%rsp,%r9,8) + +L$mul4x_body: + movq %rdi,16(%rsp,%r9,8) + movq %rdx,%r12 + movq (%r8),%r8 + movq (%r12),%rbx + movq (%rsi),%rax + + xorq %r14,%r14 + xorq %r15,%r15 + + movq %r8,%rbp + mulq %rbx + movq %rax,%r10 + movq (%rcx),%rax + + imulq %r10,%rbp + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r10 + movq 8(%rsi),%rax + adcq $0,%rdx + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq 8(%rcx),%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq 16(%rsi),%rax + adcq $0,%rdx + addq %r11,%rdi + leaq 4(%r15),%r15 + adcq $0,%rdx + movq %rdi,(%rsp) + movq %rdx,%r13 + jmp L$1st4x +.p2align 4 +L$1st4x: + mulq %rbx + addq %rax,%r10 + movq -16(%rcx,%r15,8),%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq -8(%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %r13,-24(%rsp,%r15,8) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq -8(%rcx,%r15,8),%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq (%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r11,%rdi + adcq $0,%rdx + movq %rdi,-16(%rsp,%r15,8) + movq %rdx,%r13 + + mulq %rbx + addq %rax,%r10 + movq (%rcx,%r15,8),%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq 8(%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %r13,-8(%rsp,%r15,8) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq 8(%rcx,%r15,8),%rax + adcq $0,%rdx + leaq 4(%r15),%r15 + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq -16(%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r11,%rdi + adcq $0,%rdx + movq %rdi,-32(%rsp,%r15,8) + movq %rdx,%r13 + cmpq %r9,%r15 + jb L$1st4x + + mulq %rbx + addq %rax,%r10 + movq -16(%rcx,%r15,8),%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq -8(%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %r13,-24(%rsp,%r15,8) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq -8(%rcx,%r15,8),%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq (%rsi),%rax + adcq $0,%rdx + addq %r11,%rdi + adcq $0,%rdx + movq %rdi,-16(%rsp,%r15,8) + movq %rdx,%r13 + + xorq %rdi,%rdi + addq %r10,%r13 + adcq $0,%rdi + movq %r13,-8(%rsp,%r15,8) + movq %rdi,(%rsp,%r15,8) + + leaq 1(%r14),%r14 +.p2align 2 +L$outer4x: + movq (%r12,%r14,8),%rbx + xorq %r15,%r15 + movq 
(%rsp),%r10 + movq %r8,%rbp + mulq %rbx + addq %rax,%r10 + movq (%rcx),%rax + adcq $0,%rdx + + imulq %r10,%rbp + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r10 + movq 8(%rsi),%rax + adcq $0,%rdx + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq 8(%rcx),%rax + adcq $0,%rdx + addq 8(%rsp),%r11 + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq 16(%rsi),%rax + adcq $0,%rdx + addq %r11,%rdi + leaq 4(%r15),%r15 + adcq $0,%rdx + movq %rdi,(%rsp) + movq %rdx,%r13 + jmp L$inner4x +.p2align 4 +L$inner4x: + mulq %rbx + addq %rax,%r10 + movq -16(%rcx,%r15,8),%rax + adcq $0,%rdx + addq -16(%rsp,%r15,8),%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq -8(%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %r13,-24(%rsp,%r15,8) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq -8(%rcx,%r15,8),%rax + adcq $0,%rdx + addq -8(%rsp,%r15,8),%r11 + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq (%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r11,%rdi + adcq $0,%rdx + movq %rdi,-16(%rsp,%r15,8) + movq %rdx,%r13 + + mulq %rbx + addq %rax,%r10 + movq (%rcx,%r15,8),%rax + adcq $0,%rdx + addq (%rsp,%r15,8),%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq 8(%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %r13,-8(%rsp,%r15,8) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq 8(%rcx,%r15,8),%rax + adcq $0,%rdx + addq 8(%rsp,%r15,8),%r11 + adcq $0,%rdx + leaq 4(%r15),%r15 + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq -16(%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r11,%rdi + adcq $0,%rdx + movq %rdi,-32(%rsp,%r15,8) + movq %rdx,%r13 + cmpq %r9,%r15 + jb L$inner4x + + mulq %rbx + addq %rax,%r10 + movq -16(%rcx,%r15,8),%rax + adcq $0,%rdx + addq -16(%rsp,%r15,8),%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq -8(%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %r13,-24(%rsp,%r15,8) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq -8(%rcx,%r15,8),%rax + adcq $0,%rdx + addq -8(%rsp,%r15,8),%r11 + adcq $0,%rdx + leaq 1(%r14),%r14 + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq (%rsi),%rax + adcq $0,%rdx + addq %r11,%rdi + adcq $0,%rdx + movq %rdi,-16(%rsp,%r15,8) + movq %rdx,%r13 + + xorq %rdi,%rdi + addq %r10,%r13 + adcq $0,%rdi + addq (%rsp,%r9,8),%r13 + adcq $0,%rdi + movq %r13,-8(%rsp,%r15,8) + movq %rdi,(%rsp,%r15,8) + + cmpq %r9,%r14 + jb L$outer4x + movq 16(%rsp,%r9,8),%rdi + leaq -4(%r9),%r15 + movq 0(%rsp),%rax + movq 8(%rsp),%rdx + shrq $2,%r15 + leaq (%rsp),%rsi + xorq %r14,%r14 + + subq 0(%rcx),%rax + movq 16(%rsi),%rbx + movq 24(%rsi),%rbp + sbbq 8(%rcx),%rdx + +L$sub4x: + movq %rax,0(%rdi,%r14,8) + movq %rdx,8(%rdi,%r14,8) + sbbq 16(%rcx,%r14,8),%rbx + movq 32(%rsi,%r14,8),%rax + movq 40(%rsi,%r14,8),%rdx + sbbq 24(%rcx,%r14,8),%rbp + movq %rbx,16(%rdi,%r14,8) + movq %rbp,24(%rdi,%r14,8) + sbbq 32(%rcx,%r14,8),%rax + movq 48(%rsi,%r14,8),%rbx + movq 56(%rsi,%r14,8),%rbp + sbbq 40(%rcx,%r14,8),%rdx + leaq 4(%r14),%r14 + decq %r15 + jnz L$sub4x + + movq %rax,0(%rdi,%r14,8) + movq 32(%rsi,%r14,8),%rax + sbbq 16(%rcx,%r14,8),%rbx + movq %rdx,8(%rdi,%r14,8) + sbbq 24(%rcx,%r14,8),%rbp + movq %rbx,16(%rdi,%r14,8) + + sbbq $0,%rax + movq %rbp,24(%rdi,%r14,8) + pxor %xmm0,%xmm0 +.byte 102,72,15,110,224 + pcmpeqd %xmm5,%xmm5 + pshufd $0,%xmm4,%xmm4 + movq %r9,%r15 + pxor %xmm4,%xmm5 + shrq $2,%r15 + xorl %eax,%eax + + jmp L$copy4x +.p2align 4 +L$copy4x: + movdqa (%rsp,%rax,1),%xmm1 + movdqu (%rdi,%rax,1),%xmm2 
+ pand %xmm4,%xmm1 + pand %xmm5,%xmm2 + movdqa 16(%rsp,%rax,1),%xmm3 + movdqa %xmm0,(%rsp,%rax,1) + por %xmm2,%xmm1 + movdqu 16(%rdi,%rax,1),%xmm2 + movdqu %xmm1,(%rdi,%rax,1) + pand %xmm4,%xmm3 + pand %xmm5,%xmm2 + movdqa %xmm0,16(%rsp,%rax,1) + por %xmm2,%xmm3 + movdqu %xmm3,16(%rdi,%rax,1) + leaq 32(%rax),%rax + decq %r15 + jnz L$copy4x + movq 8(%rsp,%r9,8),%rsi + + movq $1,%rax + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbp + + movq -8(%rsi),%rbx + + leaq (%rsi),%rsp + +L$mul4x_epilogue: + ret + + + + + +.globl _bn_sqr8x_mont +.private_extern _bn_sqr8x_mont + +.p2align 5 +_bn_sqr8x_mont: + +_CET_ENDBR + movl %r9d,%r9d + movq %rsp,%rax + + pushq %rbx + + pushq %rbp + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + +L$sqr8x_prologue: + + movl %r9d,%r10d + shll $3,%r9d + shlq $3+2,%r10 + negq %r9 + + + + + + + leaq -64(%rsp,%r9,2),%r11 + movq %rsp,%rbp + movq (%r8),%r8 + subq %rsi,%r11 + andq $4095,%r11 + cmpq %r11,%r10 + jb L$sqr8x_sp_alt + subq %r11,%rbp + leaq -64(%rbp,%r9,2),%rbp + jmp L$sqr8x_sp_done + +.p2align 5 +L$sqr8x_sp_alt: + leaq 4096-64(,%r9,2),%r10 + leaq -64(%rbp,%r9,2),%rbp + subq %r10,%r11 + movq $0,%r10 + cmovcq %r10,%r11 + subq %r11,%rbp +L$sqr8x_sp_done: + andq $-64,%rbp + movq %rsp,%r11 + subq %rbp,%r11 + andq $-4096,%r11 + leaq (%r11,%rbp,1),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja L$sqr8x_page_walk + jmp L$sqr8x_page_walk_done + +.p2align 4 +L$sqr8x_page_walk: + leaq -4096(%rsp),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja L$sqr8x_page_walk +L$sqr8x_page_walk_done: + + movq %r9,%r10 + negq %r9 + + movq %r8,32(%rsp) + movq %rax,40(%rsp) + +L$sqr8x_body: + +.byte 102,72,15,110,209 + pxor %xmm0,%xmm0 +.byte 102,72,15,110,207 +.byte 102,73,15,110,218 + testq %rdx,%rdx + jz L$sqr8x_nox + + call _bn_sqrx8x_internal + + + + + leaq (%r8,%rcx,1),%rbx + movq %rcx,%r9 + movq %rcx,%rdx +.byte 102,72,15,126,207 + sarq $3+2,%rcx + jmp L$sqr8x_sub + +.p2align 5 +L$sqr8x_nox: + call _bn_sqr8x_internal + + + + + leaq (%rdi,%r9,1),%rbx + movq %r9,%rcx + movq %r9,%rdx +.byte 102,72,15,126,207 + sarq $3+2,%rcx + jmp L$sqr8x_sub + +.p2align 5 +L$sqr8x_sub: + movq 0(%rbx),%r12 + movq 8(%rbx),%r13 + movq 16(%rbx),%r14 + movq 24(%rbx),%r15 + leaq 32(%rbx),%rbx + sbbq 0(%rbp),%r12 + sbbq 8(%rbp),%r13 + sbbq 16(%rbp),%r14 + sbbq 24(%rbp),%r15 + leaq 32(%rbp),%rbp + movq %r12,0(%rdi) + movq %r13,8(%rdi) + movq %r14,16(%rdi) + movq %r15,24(%rdi) + leaq 32(%rdi),%rdi + incq %rcx + jnz L$sqr8x_sub + + sbbq $0,%rax + leaq (%rbx,%r9,1),%rbx + leaq (%rdi,%r9,1),%rdi + +.byte 102,72,15,110,200 + pxor %xmm0,%xmm0 + pshufd $0,%xmm1,%xmm1 + movq 40(%rsp),%rsi + + jmp L$sqr8x_cond_copy + +.p2align 5 +L$sqr8x_cond_copy: + movdqa 0(%rbx),%xmm2 + movdqa 16(%rbx),%xmm3 + leaq 32(%rbx),%rbx + movdqu 0(%rdi),%xmm4 + movdqu 16(%rdi),%xmm5 + leaq 32(%rdi),%rdi + movdqa %xmm0,-32(%rbx) + movdqa %xmm0,-16(%rbx) + movdqa %xmm0,-32(%rbx,%rdx,1) + movdqa %xmm0,-16(%rbx,%rdx,1) + pcmpeqd %xmm1,%xmm0 + pand %xmm1,%xmm2 + pand %xmm1,%xmm3 + pand %xmm0,%xmm4 + pand %xmm0,%xmm5 + pxor %xmm0,%xmm0 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqu %xmm4,-32(%rdi) + movdqu %xmm5,-16(%rdi) + addq $32,%r9 + jnz L$sqr8x_cond_copy + + movq $1,%rax + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbp + + movq -8(%rsi),%rbx + + leaq (%rsi),%rsp + +L$sqr8x_epilogue: + ret + + +.globl _bn_mulx4x_mont +.private_extern _bn_mulx4x_mont + +.p2align 5 +_bn_mulx4x_mont: + +_CET_ENDBR + movq 
%rsp,%rax + + pushq %rbx + + pushq %rbp + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + +L$mulx4x_prologue: + + shll $3,%r9d + xorq %r10,%r10 + subq %r9,%r10 + movq (%r8),%r8 + leaq -72(%rsp,%r10,1),%rbp + andq $-128,%rbp + movq %rsp,%r11 + subq %rbp,%r11 + andq $-4096,%r11 + leaq (%r11,%rbp,1),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja L$mulx4x_page_walk + jmp L$mulx4x_page_walk_done + +.p2align 4 +L$mulx4x_page_walk: + leaq -4096(%rsp),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja L$mulx4x_page_walk +L$mulx4x_page_walk_done: + + leaq (%rdx,%r9,1),%r10 + + + + + + + + + + + + + movq %r9,0(%rsp) + shrq $5,%r9 + movq %r10,16(%rsp) + subq $1,%r9 + movq %r8,24(%rsp) + movq %rdi,32(%rsp) + movq %rax,40(%rsp) + + movq %r9,48(%rsp) + jmp L$mulx4x_body + +.p2align 5 +L$mulx4x_body: + leaq 8(%rdx),%rdi + movq (%rdx),%rdx + leaq 64+32(%rsp),%rbx + movq %rdx,%r9 + + mulxq 0(%rsi),%r8,%rax + mulxq 8(%rsi),%r11,%r14 + addq %rax,%r11 + movq %rdi,8(%rsp) + mulxq 16(%rsi),%r12,%r13 + adcq %r14,%r12 + adcq $0,%r13 + + movq %r8,%rdi + imulq 24(%rsp),%r8 + xorq %rbp,%rbp + + mulxq 24(%rsi),%rax,%r14 + movq %r8,%rdx + leaq 32(%rsi),%rsi + adcxq %rax,%r13 + adcxq %rbp,%r14 + + mulxq 0(%rcx),%rax,%r10 + adcxq %rax,%rdi + adoxq %r11,%r10 + mulxq 8(%rcx),%rax,%r11 + adcxq %rax,%r10 + adoxq %r12,%r11 +.byte 0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00 + movq 48(%rsp),%rdi + movq %r10,-32(%rbx) + adcxq %rax,%r11 + adoxq %r13,%r12 + mulxq 24(%rcx),%rax,%r15 + movq %r9,%rdx + movq %r11,-24(%rbx) + adcxq %rax,%r12 + adoxq %rbp,%r15 + leaq 32(%rcx),%rcx + movq %r12,-16(%rbx) + + jmp L$mulx4x_1st + +.p2align 5 +L$mulx4x_1st: + adcxq %rbp,%r15 + mulxq 0(%rsi),%r10,%rax + adcxq %r14,%r10 + mulxq 8(%rsi),%r11,%r14 + adcxq %rax,%r11 + mulxq 16(%rsi),%r12,%rax + adcxq %r14,%r12 + mulxq 24(%rsi),%r13,%r14 +.byte 0x67,0x67 + movq %r8,%rdx + adcxq %rax,%r13 + adcxq %rbp,%r14 + leaq 32(%rsi),%rsi + leaq 32(%rbx),%rbx + + adoxq %r15,%r10 + mulxq 0(%rcx),%rax,%r15 + adcxq %rax,%r10 + adoxq %r15,%r11 + mulxq 8(%rcx),%rax,%r15 + adcxq %rax,%r11 + adoxq %r15,%r12 + mulxq 16(%rcx),%rax,%r15 + movq %r10,-40(%rbx) + adcxq %rax,%r12 + movq %r11,-32(%rbx) + adoxq %r15,%r13 + mulxq 24(%rcx),%rax,%r15 + movq %r9,%rdx + movq %r12,-24(%rbx) + adcxq %rax,%r13 + adoxq %rbp,%r15 + leaq 32(%rcx),%rcx + movq %r13,-16(%rbx) + + decq %rdi + jnz L$mulx4x_1st + + movq 0(%rsp),%rax + movq 8(%rsp),%rdi + adcq %rbp,%r15 + addq %r15,%r14 + sbbq %r15,%r15 + movq %r14,-8(%rbx) + jmp L$mulx4x_outer + +.p2align 5 +L$mulx4x_outer: + movq (%rdi),%rdx + leaq 8(%rdi),%rdi + subq %rax,%rsi + movq %r15,(%rbx) + leaq 64+32(%rsp),%rbx + subq %rax,%rcx + + mulxq 0(%rsi),%r8,%r11 + xorl %ebp,%ebp + movq %rdx,%r9 + mulxq 8(%rsi),%r14,%r12 + adoxq -32(%rbx),%r8 + adcxq %r14,%r11 + mulxq 16(%rsi),%r15,%r13 + adoxq -24(%rbx),%r11 + adcxq %r15,%r12 + adoxq -16(%rbx),%r12 + adcxq %rbp,%r13 + adoxq %rbp,%r13 + + movq %rdi,8(%rsp) + movq %r8,%r15 + imulq 24(%rsp),%r8 + xorl %ebp,%ebp + + mulxq 24(%rsi),%rax,%r14 + movq %r8,%rdx + adcxq %rax,%r13 + adoxq -8(%rbx),%r13 + adcxq %rbp,%r14 + leaq 32(%rsi),%rsi + adoxq %rbp,%r14 + + mulxq 0(%rcx),%rax,%r10 + adcxq %rax,%r15 + adoxq %r11,%r10 + mulxq 8(%rcx),%rax,%r11 + adcxq %rax,%r10 + adoxq %r12,%r11 + mulxq 16(%rcx),%rax,%r12 + movq %r10,-32(%rbx) + adcxq %rax,%r11 + adoxq %r13,%r12 + mulxq 24(%rcx),%rax,%r15 + movq %r9,%rdx + movq %r11,-24(%rbx) + leaq 32(%rcx),%rcx + adcxq %rax,%r12 + adoxq %rbp,%r15 + movq 48(%rsp),%rdi + movq %r12,-16(%rbx) + + jmp L$mulx4x_inner + +.p2align 5 +L$mulx4x_inner: + mulxq 
0(%rsi),%r10,%rax + adcxq %rbp,%r15 + adoxq %r14,%r10 + mulxq 8(%rsi),%r11,%r14 + adcxq 0(%rbx),%r10 + adoxq %rax,%r11 + mulxq 16(%rsi),%r12,%rax + adcxq 8(%rbx),%r11 + adoxq %r14,%r12 + mulxq 24(%rsi),%r13,%r14 + movq %r8,%rdx + adcxq 16(%rbx),%r12 + adoxq %rax,%r13 + adcxq 24(%rbx),%r13 + adoxq %rbp,%r14 + leaq 32(%rsi),%rsi + leaq 32(%rbx),%rbx + adcxq %rbp,%r14 + + adoxq %r15,%r10 + mulxq 0(%rcx),%rax,%r15 + adcxq %rax,%r10 + adoxq %r15,%r11 + mulxq 8(%rcx),%rax,%r15 + adcxq %rax,%r11 + adoxq %r15,%r12 + mulxq 16(%rcx),%rax,%r15 + movq %r10,-40(%rbx) + adcxq %rax,%r12 + adoxq %r15,%r13 + mulxq 24(%rcx),%rax,%r15 + movq %r9,%rdx + movq %r11,-32(%rbx) + movq %r12,-24(%rbx) + adcxq %rax,%r13 + adoxq %rbp,%r15 + leaq 32(%rcx),%rcx + movq %r13,-16(%rbx) + + decq %rdi + jnz L$mulx4x_inner + + movq 0(%rsp),%rax + movq 8(%rsp),%rdi + adcq %rbp,%r15 + subq 0(%rbx),%rbp + adcq %r15,%r14 + sbbq %r15,%r15 + movq %r14,-8(%rbx) + + cmpq 16(%rsp),%rdi + jne L$mulx4x_outer + + leaq 64(%rsp),%rbx + subq %rax,%rcx + negq %r15 + movq %rax,%rdx + shrq $3+2,%rax + movq 32(%rsp),%rdi + jmp L$mulx4x_sub + +.p2align 5 +L$mulx4x_sub: + movq 0(%rbx),%r11 + movq 8(%rbx),%r12 + movq 16(%rbx),%r13 + movq 24(%rbx),%r14 + leaq 32(%rbx),%rbx + sbbq 0(%rcx),%r11 + sbbq 8(%rcx),%r12 + sbbq 16(%rcx),%r13 + sbbq 24(%rcx),%r14 + leaq 32(%rcx),%rcx + movq %r11,0(%rdi) + movq %r12,8(%rdi) + movq %r13,16(%rdi) + movq %r14,24(%rdi) + leaq 32(%rdi),%rdi + decq %rax + jnz L$mulx4x_sub + + sbbq $0,%r15 + leaq 64(%rsp),%rbx + subq %rdx,%rdi + +.byte 102,73,15,110,207 + pxor %xmm0,%xmm0 + pshufd $0,%xmm1,%xmm1 + movq 40(%rsp),%rsi + + jmp L$mulx4x_cond_copy + +.p2align 5 +L$mulx4x_cond_copy: + movdqa 0(%rbx),%xmm2 + movdqa 16(%rbx),%xmm3 + leaq 32(%rbx),%rbx + movdqu 0(%rdi),%xmm4 + movdqu 16(%rdi),%xmm5 + leaq 32(%rdi),%rdi + movdqa %xmm0,-32(%rbx) + movdqa %xmm0,-16(%rbx) + pcmpeqd %xmm1,%xmm0 + pand %xmm1,%xmm2 + pand %xmm1,%xmm3 + pand %xmm0,%xmm4 + pand %xmm0,%xmm5 + pxor %xmm0,%xmm0 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqu %xmm4,-32(%rdi) + movdqu %xmm5,-16(%rdi) + subq $32,%rdx + jnz L$mulx4x_cond_copy + + movq %rdx,(%rbx) + + movq $1,%rax + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbp + + movq -8(%rsi),%rbx + + leaq (%rsi),%rsp + +L$mulx4x_epilogue: + ret + + +.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.p2align 4 +#endif diff --git a/ring-0.17.14/pregenerated/x86_64-mont-nasm.asm b/ring-0.17.14/pregenerated/x86_64-mont-nasm.asm new file mode 100644 index 0000000000..361eaceeab --- /dev/null +++ b/ring-0.17.14/pregenerated/x86_64-mont-nasm.asm @@ -0,0 +1,1468 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. 
+ +%ifidn __OUTPUT_FORMAT__, win64 +default rel +%define XMMWORD +%define YMMWORD +%define ZMMWORD +%define _CET_ENDBR + +%include "ring_core_generated/prefix_symbols_nasm.inc" +section .text code align=64 + + +global bn_mul_mont_nohw + +ALIGN 16 +bn_mul_mont_nohw: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_bn_mul_mont_nohw: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + mov r9,QWORD[48+rsp] + + + +_CET_ENDBR + mov r9d,r9d + mov rax,rsp + + push rbx + + push rbp + + push r12 + + push r13 + + push r14 + + push r15 + + + neg r9 + mov r11,rsp + lea r10,[((-16))+r9*8+rsp] + neg r9 + and r10,-1024 + + + + + + + + + + sub r11,r10 + and r11,-4096 + lea rsp,[r11*1+r10] + mov r11,QWORD[rsp] + cmp rsp,r10 + ja NEAR $L$mul_page_walk + jmp NEAR $L$mul_page_walk_done + +ALIGN 16 +$L$mul_page_walk: + lea rsp,[((-4096))+rsp] + mov r11,QWORD[rsp] + cmp rsp,r10 + ja NEAR $L$mul_page_walk +$L$mul_page_walk_done: + + mov QWORD[8+r9*8+rsp],rax + +$L$mul_body: + mov r12,rdx + mov r8,QWORD[r8] + mov rbx,QWORD[r12] + mov rax,QWORD[rsi] + + xor r14,r14 + xor r15,r15 + + mov rbp,r8 + mul rbx + mov r10,rax + mov rax,QWORD[rcx] + + imul rbp,r10 + mov r11,rdx + + mul rbp + add r10,rax + mov rax,QWORD[8+rsi] + adc rdx,0 + mov r13,rdx + + lea r15,[1+r15] + jmp NEAR $L$1st_enter + +ALIGN 16 +$L$1st: + add r13,rax + mov rax,QWORD[r15*8+rsi] + adc rdx,0 + add r13,r11 + mov r11,r10 + adc rdx,0 + mov QWORD[((-16))+r15*8+rsp],r13 + mov r13,rdx + +$L$1st_enter: + mul rbx + add r11,rax + mov rax,QWORD[r15*8+rcx] + adc rdx,0 + lea r15,[1+r15] + mov r10,rdx + + mul rbp + cmp r15,r9 + jne NEAR $L$1st + + add r13,rax + mov rax,QWORD[rsi] + adc rdx,0 + add r13,r11 + adc rdx,0 + mov QWORD[((-16))+r15*8+rsp],r13 + mov r13,rdx + mov r11,r10 + + xor rdx,rdx + add r13,r11 + adc rdx,0 + mov QWORD[((-8))+r9*8+rsp],r13 + mov QWORD[r9*8+rsp],rdx + + lea r14,[1+r14] + jmp NEAR $L$outer +ALIGN 16 +$L$outer: + mov rbx,QWORD[r14*8+r12] + xor r15,r15 + mov rbp,r8 + mov r10,QWORD[rsp] + mul rbx + add r10,rax + mov rax,QWORD[rcx] + adc rdx,0 + + imul rbp,r10 + mov r11,rdx + + mul rbp + add r10,rax + mov rax,QWORD[8+rsi] + adc rdx,0 + mov r10,QWORD[8+rsp] + mov r13,rdx + + lea r15,[1+r15] + jmp NEAR $L$inner_enter + +ALIGN 16 +$L$inner: + add r13,rax + mov rax,QWORD[r15*8+rsi] + adc rdx,0 + add r13,r10 + mov r10,QWORD[r15*8+rsp] + adc rdx,0 + mov QWORD[((-16))+r15*8+rsp],r13 + mov r13,rdx + +$L$inner_enter: + mul rbx + add r11,rax + mov rax,QWORD[r15*8+rcx] + adc rdx,0 + add r10,r11 + mov r11,rdx + adc r11,0 + lea r15,[1+r15] + + mul rbp + cmp r15,r9 + jne NEAR $L$inner + + add r13,rax + mov rax,QWORD[rsi] + adc rdx,0 + add r13,r10 + mov r10,QWORD[r15*8+rsp] + adc rdx,0 + mov QWORD[((-16))+r15*8+rsp],r13 + mov r13,rdx + + xor rdx,rdx + add r13,r11 + adc rdx,0 + add r13,r10 + adc rdx,0 + mov QWORD[((-8))+r9*8+rsp],r13 + mov QWORD[r9*8+rsp],rdx + + lea r14,[1+r14] + cmp r14,r9 + jb NEAR $L$outer + + xor r14,r14 + mov rax,QWORD[rsp] + mov r15,r9 + +ALIGN 16 +$L$sub: sbb rax,QWORD[r14*8+rcx] + mov QWORD[r14*8+rdi],rax + mov rax,QWORD[8+r14*8+rsp] + lea r14,[1+r14] + dec r15 + jnz NEAR $L$sub + + sbb rax,0 + mov rbx,-1 + xor rbx,rax + xor r14,r14 + mov r15,r9 + +$L$copy: + mov rcx,QWORD[r14*8+rdi] + mov rdx,QWORD[r14*8+rsp] + and rcx,rbx + and rdx,rax + mov QWORD[r14*8+rsp],r9 + or rdx,rcx + mov QWORD[r14*8+rdi],rdx + lea r14,[1+r14] + sub r15,1 + jnz NEAR $L$copy + + mov rsi,QWORD[8+r9*8+rsp] + + mov rax,1 + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov 
r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbp,QWORD[((-16))+rsi] + + mov rbx,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$mul_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_bn_mul_mont_nohw: +global bn_mul4x_mont + +ALIGN 16 +bn_mul4x_mont: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_bn_mul4x_mont: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + mov r9,QWORD[48+rsp] + + + +_CET_ENDBR + mov r9d,r9d + mov rax,rsp + + push rbx + + push rbp + + push r12 + + push r13 + + push r14 + + push r15 + + + neg r9 + mov r11,rsp + lea r10,[((-32))+r9*8+rsp] + neg r9 + and r10,-1024 + + sub r11,r10 + and r11,-4096 + lea rsp,[r11*1+r10] + mov r11,QWORD[rsp] + cmp rsp,r10 + ja NEAR $L$mul4x_page_walk + jmp NEAR $L$mul4x_page_walk_done + +$L$mul4x_page_walk: + lea rsp,[((-4096))+rsp] + mov r11,QWORD[rsp] + cmp rsp,r10 + ja NEAR $L$mul4x_page_walk +$L$mul4x_page_walk_done: + + mov QWORD[8+r9*8+rsp],rax + +$L$mul4x_body: + mov QWORD[16+r9*8+rsp],rdi + mov r12,rdx + mov r8,QWORD[r8] + mov rbx,QWORD[r12] + mov rax,QWORD[rsi] + + xor r14,r14 + xor r15,r15 + + mov rbp,r8 + mul rbx + mov r10,rax + mov rax,QWORD[rcx] + + imul rbp,r10 + mov r11,rdx + + mul rbp + add r10,rax + mov rax,QWORD[8+rsi] + adc rdx,0 + mov rdi,rdx + + mul rbx + add r11,rax + mov rax,QWORD[8+rcx] + adc rdx,0 + mov r10,rdx + + mul rbp + add rdi,rax + mov rax,QWORD[16+rsi] + adc rdx,0 + add rdi,r11 + lea r15,[4+r15] + adc rdx,0 + mov QWORD[rsp],rdi + mov r13,rdx + jmp NEAR $L$1st4x +ALIGN 16 +$L$1st4x: + mul rbx + add r10,rax + mov rax,QWORD[((-16))+r15*8+rcx] + adc rdx,0 + mov r11,rdx + + mul rbp + add r13,rax + mov rax,QWORD[((-8))+r15*8+rsi] + adc rdx,0 + add r13,r10 + adc rdx,0 + mov QWORD[((-24))+r15*8+rsp],r13 + mov rdi,rdx + + mul rbx + add r11,rax + mov rax,QWORD[((-8))+r15*8+rcx] + adc rdx,0 + mov r10,rdx + + mul rbp + add rdi,rax + mov rax,QWORD[r15*8+rsi] + adc rdx,0 + add rdi,r11 + adc rdx,0 + mov QWORD[((-16))+r15*8+rsp],rdi + mov r13,rdx + + mul rbx + add r10,rax + mov rax,QWORD[r15*8+rcx] + adc rdx,0 + mov r11,rdx + + mul rbp + add r13,rax + mov rax,QWORD[8+r15*8+rsi] + adc rdx,0 + add r13,r10 + adc rdx,0 + mov QWORD[((-8))+r15*8+rsp],r13 + mov rdi,rdx + + mul rbx + add r11,rax + mov rax,QWORD[8+r15*8+rcx] + adc rdx,0 + lea r15,[4+r15] + mov r10,rdx + + mul rbp + add rdi,rax + mov rax,QWORD[((-16))+r15*8+rsi] + adc rdx,0 + add rdi,r11 + adc rdx,0 + mov QWORD[((-32))+r15*8+rsp],rdi + mov r13,rdx + cmp r15,r9 + jb NEAR $L$1st4x + + mul rbx + add r10,rax + mov rax,QWORD[((-16))+r15*8+rcx] + adc rdx,0 + mov r11,rdx + + mul rbp + add r13,rax + mov rax,QWORD[((-8))+r15*8+rsi] + adc rdx,0 + add r13,r10 + adc rdx,0 + mov QWORD[((-24))+r15*8+rsp],r13 + mov rdi,rdx + + mul rbx + add r11,rax + mov rax,QWORD[((-8))+r15*8+rcx] + adc rdx,0 + mov r10,rdx + + mul rbp + add rdi,rax + mov rax,QWORD[rsi] + adc rdx,0 + add rdi,r11 + adc rdx,0 + mov QWORD[((-16))+r15*8+rsp],rdi + mov r13,rdx + + xor rdi,rdi + add r13,r10 + adc rdi,0 + mov QWORD[((-8))+r15*8+rsp],r13 + mov QWORD[r15*8+rsp],rdi + + lea r14,[1+r14] +ALIGN 4 +$L$outer4x: + mov rbx,QWORD[r14*8+r12] + xor r15,r15 + mov r10,QWORD[rsp] + mov rbp,r8 + mul rbx + add r10,rax + mov rax,QWORD[rcx] + adc rdx,0 + + imul rbp,r10 + mov r11,rdx + + mul rbp + add r10,rax + mov rax,QWORD[8+rsi] + adc rdx,0 + mov rdi,rdx + + mul rbx + add r11,rax + mov rax,QWORD[8+rcx] + adc rdx,0 + add r11,QWORD[8+rsp] + adc rdx,0 + mov r10,rdx + + mul rbp + add rdi,rax + mov 
rax,QWORD[16+rsi] + adc rdx,0 + add rdi,r11 + lea r15,[4+r15] + adc rdx,0 + mov QWORD[rsp],rdi + mov r13,rdx + jmp NEAR $L$inner4x +ALIGN 16 +$L$inner4x: + mul rbx + add r10,rax + mov rax,QWORD[((-16))+r15*8+rcx] + adc rdx,0 + add r10,QWORD[((-16))+r15*8+rsp] + adc rdx,0 + mov r11,rdx + + mul rbp + add r13,rax + mov rax,QWORD[((-8))+r15*8+rsi] + adc rdx,0 + add r13,r10 + adc rdx,0 + mov QWORD[((-24))+r15*8+rsp],r13 + mov rdi,rdx + + mul rbx + add r11,rax + mov rax,QWORD[((-8))+r15*8+rcx] + adc rdx,0 + add r11,QWORD[((-8))+r15*8+rsp] + adc rdx,0 + mov r10,rdx + + mul rbp + add rdi,rax + mov rax,QWORD[r15*8+rsi] + adc rdx,0 + add rdi,r11 + adc rdx,0 + mov QWORD[((-16))+r15*8+rsp],rdi + mov r13,rdx + + mul rbx + add r10,rax + mov rax,QWORD[r15*8+rcx] + adc rdx,0 + add r10,QWORD[r15*8+rsp] + adc rdx,0 + mov r11,rdx + + mul rbp + add r13,rax + mov rax,QWORD[8+r15*8+rsi] + adc rdx,0 + add r13,r10 + adc rdx,0 + mov QWORD[((-8))+r15*8+rsp],r13 + mov rdi,rdx + + mul rbx + add r11,rax + mov rax,QWORD[8+r15*8+rcx] + adc rdx,0 + add r11,QWORD[8+r15*8+rsp] + adc rdx,0 + lea r15,[4+r15] + mov r10,rdx + + mul rbp + add rdi,rax + mov rax,QWORD[((-16))+r15*8+rsi] + adc rdx,0 + add rdi,r11 + adc rdx,0 + mov QWORD[((-32))+r15*8+rsp],rdi + mov r13,rdx + cmp r15,r9 + jb NEAR $L$inner4x + + mul rbx + add r10,rax + mov rax,QWORD[((-16))+r15*8+rcx] + adc rdx,0 + add r10,QWORD[((-16))+r15*8+rsp] + adc rdx,0 + mov r11,rdx + + mul rbp + add r13,rax + mov rax,QWORD[((-8))+r15*8+rsi] + adc rdx,0 + add r13,r10 + adc rdx,0 + mov QWORD[((-24))+r15*8+rsp],r13 + mov rdi,rdx + + mul rbx + add r11,rax + mov rax,QWORD[((-8))+r15*8+rcx] + adc rdx,0 + add r11,QWORD[((-8))+r15*8+rsp] + adc rdx,0 + lea r14,[1+r14] + mov r10,rdx + + mul rbp + add rdi,rax + mov rax,QWORD[rsi] + adc rdx,0 + add rdi,r11 + adc rdx,0 + mov QWORD[((-16))+r15*8+rsp],rdi + mov r13,rdx + + xor rdi,rdi + add r13,r10 + adc rdi,0 + add r13,QWORD[r9*8+rsp] + adc rdi,0 + mov QWORD[((-8))+r15*8+rsp],r13 + mov QWORD[r15*8+rsp],rdi + + cmp r14,r9 + jb NEAR $L$outer4x + mov rdi,QWORD[16+r9*8+rsp] + lea r15,[((-4))+r9] + mov rax,QWORD[rsp] + mov rdx,QWORD[8+rsp] + shr r15,2 + lea rsi,[rsp] + xor r14,r14 + + sub rax,QWORD[rcx] + mov rbx,QWORD[16+rsi] + mov rbp,QWORD[24+rsi] + sbb rdx,QWORD[8+rcx] + +$L$sub4x: + mov QWORD[r14*8+rdi],rax + mov QWORD[8+r14*8+rdi],rdx + sbb rbx,QWORD[16+r14*8+rcx] + mov rax,QWORD[32+r14*8+rsi] + mov rdx,QWORD[40+r14*8+rsi] + sbb rbp,QWORD[24+r14*8+rcx] + mov QWORD[16+r14*8+rdi],rbx + mov QWORD[24+r14*8+rdi],rbp + sbb rax,QWORD[32+r14*8+rcx] + mov rbx,QWORD[48+r14*8+rsi] + mov rbp,QWORD[56+r14*8+rsi] + sbb rdx,QWORD[40+r14*8+rcx] + lea r14,[4+r14] + dec r15 + jnz NEAR $L$sub4x + + mov QWORD[r14*8+rdi],rax + mov rax,QWORD[32+r14*8+rsi] + sbb rbx,QWORD[16+r14*8+rcx] + mov QWORD[8+r14*8+rdi],rdx + sbb rbp,QWORD[24+r14*8+rcx] + mov QWORD[16+r14*8+rdi],rbx + + sbb rax,0 + mov QWORD[24+r14*8+rdi],rbp + pxor xmm0,xmm0 +DB 102,72,15,110,224 + pcmpeqd xmm5,xmm5 + pshufd xmm4,xmm4,0 + mov r15,r9 + pxor xmm5,xmm4 + shr r15,2 + xor eax,eax + + jmp NEAR $L$copy4x +ALIGN 16 +$L$copy4x: + movdqa xmm1,XMMWORD[rax*1+rsp] + movdqu xmm2,XMMWORD[rax*1+rdi] + pand xmm1,xmm4 + pand xmm2,xmm5 + movdqa xmm3,XMMWORD[16+rax*1+rsp] + movdqa XMMWORD[rax*1+rsp],xmm0 + por xmm1,xmm2 + movdqu xmm2,XMMWORD[16+rax*1+rdi] + movdqu XMMWORD[rax*1+rdi],xmm1 + pand xmm3,xmm4 + pand xmm2,xmm5 + movdqa XMMWORD[16+rax*1+rsp],xmm0 + por xmm3,xmm2 + movdqu XMMWORD[16+rax*1+rdi],xmm3 + lea rax,[32+rax] + dec r15 + jnz NEAR $L$copy4x + mov rsi,QWORD[8+r9*8+rsp] + + mov rax,1 + mov 
r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbp,QWORD[((-16))+rsi] + + mov rbx,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$mul4x_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_bn_mul4x_mont: +EXTERN bn_sqrx8x_internal +EXTERN bn_sqr8x_internal + +global bn_sqr8x_mont + +ALIGN 32 +bn_sqr8x_mont: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_bn_sqr8x_mont: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + mov r9,QWORD[48+rsp] + + + +_CET_ENDBR + mov r9d,r9d + mov rax,rsp + + push rbx + + push rbp + + push r12 + + push r13 + + push r14 + + push r15 + +$L$sqr8x_prologue: + + mov r10d,r9d + shl r9d,3 + shl r10,3+2 + neg r9 + + + + + + + lea r11,[((-64))+r9*2+rsp] + mov rbp,rsp + mov r8,QWORD[r8] + sub r11,rsi + and r11,4095 + cmp r10,r11 + jb NEAR $L$sqr8x_sp_alt + sub rbp,r11 + lea rbp,[((-64))+r9*2+rbp] + jmp NEAR $L$sqr8x_sp_done + +ALIGN 32 +$L$sqr8x_sp_alt: + lea r10,[((4096-64))+r9*2] + lea rbp,[((-64))+r9*2+rbp] + sub r11,r10 + mov r10,0 + cmovc r11,r10 + sub rbp,r11 +$L$sqr8x_sp_done: + and rbp,-64 + mov r11,rsp + sub r11,rbp + and r11,-4096 + lea rsp,[rbp*1+r11] + mov r10,QWORD[rsp] + cmp rsp,rbp + ja NEAR $L$sqr8x_page_walk + jmp NEAR $L$sqr8x_page_walk_done + +ALIGN 16 +$L$sqr8x_page_walk: + lea rsp,[((-4096))+rsp] + mov r10,QWORD[rsp] + cmp rsp,rbp + ja NEAR $L$sqr8x_page_walk +$L$sqr8x_page_walk_done: + + mov r10,r9 + neg r9 + + mov QWORD[32+rsp],r8 + mov QWORD[40+rsp],rax + +$L$sqr8x_body: + +DB 102,72,15,110,209 + pxor xmm0,xmm0 +DB 102,72,15,110,207 +DB 102,73,15,110,218 + test rdx,rdx + jz NEAR $L$sqr8x_nox + + call bn_sqrx8x_internal + + + + + lea rbx,[rcx*1+r8] + mov r9,rcx + mov rdx,rcx +DB 102,72,15,126,207 + sar rcx,3+2 + jmp NEAR $L$sqr8x_sub + +ALIGN 32 +$L$sqr8x_nox: + call bn_sqr8x_internal + + + + + lea rbx,[r9*1+rdi] + mov rcx,r9 + mov rdx,r9 +DB 102,72,15,126,207 + sar rcx,3+2 + jmp NEAR $L$sqr8x_sub + +ALIGN 32 +$L$sqr8x_sub: + mov r12,QWORD[rbx] + mov r13,QWORD[8+rbx] + mov r14,QWORD[16+rbx] + mov r15,QWORD[24+rbx] + lea rbx,[32+rbx] + sbb r12,QWORD[rbp] + sbb r13,QWORD[8+rbp] + sbb r14,QWORD[16+rbp] + sbb r15,QWORD[24+rbp] + lea rbp,[32+rbp] + mov QWORD[rdi],r12 + mov QWORD[8+rdi],r13 + mov QWORD[16+rdi],r14 + mov QWORD[24+rdi],r15 + lea rdi,[32+rdi] + inc rcx + jnz NEAR $L$sqr8x_sub + + sbb rax,0 + lea rbx,[r9*1+rbx] + lea rdi,[r9*1+rdi] + +DB 102,72,15,110,200 + pxor xmm0,xmm0 + pshufd xmm1,xmm1,0 + mov rsi,QWORD[40+rsp] + + jmp NEAR $L$sqr8x_cond_copy + +ALIGN 32 +$L$sqr8x_cond_copy: + movdqa xmm2,XMMWORD[rbx] + movdqa xmm3,XMMWORD[16+rbx] + lea rbx,[32+rbx] + movdqu xmm4,XMMWORD[rdi] + movdqu xmm5,XMMWORD[16+rdi] + lea rdi,[32+rdi] + movdqa XMMWORD[(-32)+rbx],xmm0 + movdqa XMMWORD[(-16)+rbx],xmm0 + movdqa XMMWORD[(-32)+rdx*1+rbx],xmm0 + movdqa XMMWORD[(-16)+rdx*1+rbx],xmm0 + pcmpeqd xmm0,xmm1 + pand xmm2,xmm1 + pand xmm3,xmm1 + pand xmm4,xmm0 + pand xmm5,xmm0 + pxor xmm0,xmm0 + por xmm4,xmm2 + por xmm5,xmm3 + movdqu XMMWORD[(-32)+rdi],xmm4 + movdqu XMMWORD[(-16)+rdi],xmm5 + add r9,32 + jnz NEAR $L$sqr8x_cond_copy + + mov rax,1 + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbp,QWORD[((-16))+rsi] + + mov rbx,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$sqr8x_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_bn_sqr8x_mont: +global bn_mulx4x_mont 
+ +ALIGN 32 +bn_mulx4x_mont: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_bn_mulx4x_mont: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + mov r9,QWORD[48+rsp] + + + +_CET_ENDBR + mov rax,rsp + + push rbx + + push rbp + + push r12 + + push r13 + + push r14 + + push r15 + +$L$mulx4x_prologue: + + shl r9d,3 + xor r10,r10 + sub r10,r9 + mov r8,QWORD[r8] + lea rbp,[((-72))+r10*1+rsp] + and rbp,-128 + mov r11,rsp + sub r11,rbp + and r11,-4096 + lea rsp,[rbp*1+r11] + mov r10,QWORD[rsp] + cmp rsp,rbp + ja NEAR $L$mulx4x_page_walk + jmp NEAR $L$mulx4x_page_walk_done + +ALIGN 16 +$L$mulx4x_page_walk: + lea rsp,[((-4096))+rsp] + mov r10,QWORD[rsp] + cmp rsp,rbp + ja NEAR $L$mulx4x_page_walk +$L$mulx4x_page_walk_done: + + lea r10,[r9*1+rdx] + + + + + + + + + + + + + mov QWORD[rsp],r9 + shr r9,5 + mov QWORD[16+rsp],r10 + sub r9,1 + mov QWORD[24+rsp],r8 + mov QWORD[32+rsp],rdi + mov QWORD[40+rsp],rax + + mov QWORD[48+rsp],r9 + jmp NEAR $L$mulx4x_body + +ALIGN 32 +$L$mulx4x_body: + lea rdi,[8+rdx] + mov rdx,QWORD[rdx] + lea rbx,[((64+32))+rsp] + mov r9,rdx + + mulx rax,r8,QWORD[rsi] + mulx r14,r11,QWORD[8+rsi] + add r11,rax + mov QWORD[8+rsp],rdi + mulx r13,r12,QWORD[16+rsi] + adc r12,r14 + adc r13,0 + + mov rdi,r8 + imul r8,QWORD[24+rsp] + xor rbp,rbp + + mulx r14,rax,QWORD[24+rsi] + mov rdx,r8 + lea rsi,[32+rsi] + adcx r13,rax + adcx r14,rbp + + mulx r10,rax,QWORD[rcx] + adcx rdi,rax + adox r10,r11 + mulx r11,rax,QWORD[8+rcx] + adcx r10,rax + adox r11,r12 + DB 0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00 + mov rdi,QWORD[48+rsp] + mov QWORD[((-32))+rbx],r10 + adcx r11,rax + adox r12,r13 + mulx r15,rax,QWORD[24+rcx] + mov rdx,r9 + mov QWORD[((-24))+rbx],r11 + adcx r12,rax + adox r15,rbp + lea rcx,[32+rcx] + mov QWORD[((-16))+rbx],r12 + + jmp NEAR $L$mulx4x_1st + +ALIGN 32 +$L$mulx4x_1st: + adcx r15,rbp + mulx rax,r10,QWORD[rsi] + adcx r10,r14 + mulx r14,r11,QWORD[8+rsi] + adcx r11,rax + mulx rax,r12,QWORD[16+rsi] + adcx r12,r14 + mulx r14,r13,QWORD[24+rsi] + DB 0x67,0x67 + mov rdx,r8 + adcx r13,rax + adcx r14,rbp + lea rsi,[32+rsi] + lea rbx,[32+rbx] + + adox r10,r15 + mulx r15,rax,QWORD[rcx] + adcx r10,rax + adox r11,r15 + mulx r15,rax,QWORD[8+rcx] + adcx r11,rax + adox r12,r15 + mulx r15,rax,QWORD[16+rcx] + mov QWORD[((-40))+rbx],r10 + adcx r12,rax + mov QWORD[((-32))+rbx],r11 + adox r13,r15 + mulx r15,rax,QWORD[24+rcx] + mov rdx,r9 + mov QWORD[((-24))+rbx],r12 + adcx r13,rax + adox r15,rbp + lea rcx,[32+rcx] + mov QWORD[((-16))+rbx],r13 + + dec rdi + jnz NEAR $L$mulx4x_1st + + mov rax,QWORD[rsp] + mov rdi,QWORD[8+rsp] + adc r15,rbp + add r14,r15 + sbb r15,r15 + mov QWORD[((-8))+rbx],r14 + jmp NEAR $L$mulx4x_outer + +ALIGN 32 +$L$mulx4x_outer: + mov rdx,QWORD[rdi] + lea rdi,[8+rdi] + sub rsi,rax + mov QWORD[rbx],r15 + lea rbx,[((64+32))+rsp] + sub rcx,rax + + mulx r11,r8,QWORD[rsi] + xor ebp,ebp + mov r9,rdx + mulx r12,r14,QWORD[8+rsi] + adox r8,QWORD[((-32))+rbx] + adcx r11,r14 + mulx r13,r15,QWORD[16+rsi] + adox r11,QWORD[((-24))+rbx] + adcx r12,r15 + adox r12,QWORD[((-16))+rbx] + adcx r13,rbp + adox r13,rbp + + mov QWORD[8+rsp],rdi + mov r15,r8 + imul r8,QWORD[24+rsp] + xor ebp,ebp + + mulx r14,rax,QWORD[24+rsi] + mov rdx,r8 + adcx r13,rax + adox r13,QWORD[((-8))+rbx] + adcx r14,rbp + lea rsi,[32+rsi] + adox r14,rbp + + mulx r10,rax,QWORD[rcx] + adcx r15,rax + adox r10,r11 + mulx r11,rax,QWORD[8+rcx] + adcx r10,rax + adox r11,r12 + mulx r12,rax,QWORD[16+rcx] + mov QWORD[((-32))+rbx],r10 + adcx r11,rax + adox r12,r13 + mulx 
r15,rax,QWORD[24+rcx] + mov rdx,r9 + mov QWORD[((-24))+rbx],r11 + lea rcx,[32+rcx] + adcx r12,rax + adox r15,rbp + mov rdi,QWORD[48+rsp] + mov QWORD[((-16))+rbx],r12 + + jmp NEAR $L$mulx4x_inner + +ALIGN 32 +$L$mulx4x_inner: + mulx rax,r10,QWORD[rsi] + adcx r15,rbp + adox r10,r14 + mulx r14,r11,QWORD[8+rsi] + adcx r10,QWORD[rbx] + adox r11,rax + mulx rax,r12,QWORD[16+rsi] + adcx r11,QWORD[8+rbx] + adox r12,r14 + mulx r14,r13,QWORD[24+rsi] + mov rdx,r8 + adcx r12,QWORD[16+rbx] + adox r13,rax + adcx r13,QWORD[24+rbx] + adox r14,rbp + lea rsi,[32+rsi] + lea rbx,[32+rbx] + adcx r14,rbp + + adox r10,r15 + mulx r15,rax,QWORD[rcx] + adcx r10,rax + adox r11,r15 + mulx r15,rax,QWORD[8+rcx] + adcx r11,rax + adox r12,r15 + mulx r15,rax,QWORD[16+rcx] + mov QWORD[((-40))+rbx],r10 + adcx r12,rax + adox r13,r15 + mulx r15,rax,QWORD[24+rcx] + mov rdx,r9 + mov QWORD[((-32))+rbx],r11 + mov QWORD[((-24))+rbx],r12 + adcx r13,rax + adox r15,rbp + lea rcx,[32+rcx] + mov QWORD[((-16))+rbx],r13 + + dec rdi + jnz NEAR $L$mulx4x_inner + + mov rax,QWORD[rsp] + mov rdi,QWORD[8+rsp] + adc r15,rbp + sub rbp,QWORD[rbx] + adc r14,r15 + sbb r15,r15 + mov QWORD[((-8))+rbx],r14 + + cmp rdi,QWORD[16+rsp] + jne NEAR $L$mulx4x_outer + + lea rbx,[64+rsp] + sub rcx,rax + neg r15 + mov rdx,rax + shr rax,3+2 + mov rdi,QWORD[32+rsp] + jmp NEAR $L$mulx4x_sub + +ALIGN 32 +$L$mulx4x_sub: + mov r11,QWORD[rbx] + mov r12,QWORD[8+rbx] + mov r13,QWORD[16+rbx] + mov r14,QWORD[24+rbx] + lea rbx,[32+rbx] + sbb r11,QWORD[rcx] + sbb r12,QWORD[8+rcx] + sbb r13,QWORD[16+rcx] + sbb r14,QWORD[24+rcx] + lea rcx,[32+rcx] + mov QWORD[rdi],r11 + mov QWORD[8+rdi],r12 + mov QWORD[16+rdi],r13 + mov QWORD[24+rdi],r14 + lea rdi,[32+rdi] + dec rax + jnz NEAR $L$mulx4x_sub + + sbb r15,0 + lea rbx,[64+rsp] + sub rdi,rdx + +DB 102,73,15,110,207 + pxor xmm0,xmm0 + pshufd xmm1,xmm1,0 + mov rsi,QWORD[40+rsp] + + jmp NEAR $L$mulx4x_cond_copy + +ALIGN 32 +$L$mulx4x_cond_copy: + movdqa xmm2,XMMWORD[rbx] + movdqa xmm3,XMMWORD[16+rbx] + lea rbx,[32+rbx] + movdqu xmm4,XMMWORD[rdi] + movdqu xmm5,XMMWORD[16+rdi] + lea rdi,[32+rdi] + movdqa XMMWORD[(-32)+rbx],xmm0 + movdqa XMMWORD[(-16)+rbx],xmm0 + pcmpeqd xmm0,xmm1 + pand xmm2,xmm1 + pand xmm3,xmm1 + pand xmm4,xmm0 + pand xmm5,xmm0 + pxor xmm0,xmm0 + por xmm4,xmm2 + por xmm5,xmm3 + movdqu XMMWORD[(-32)+rdi],xmm4 + movdqu XMMWORD[(-16)+rdi],xmm5 + sub rdx,32 + jnz NEAR $L$mulx4x_cond_copy + + mov QWORD[rbx],rdx + + mov rax,1 + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbp,QWORD[((-16))+rsi] + + mov rbx,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$mulx4x_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_bn_mulx4x_mont: + DB 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105 + DB 112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56 + DB 54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83 + DB 32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115 + DB 115,108,46,111,114,103,62,0 +ALIGN 16 +EXTERN __imp_RtlVirtualUnwind + +ALIGN 16 +mul_handler: + push rsi + push rdi + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + pushfq + sub rsp,64 + + mov rax,QWORD[120+r8] + mov rbx,QWORD[248+r8] + + mov rsi,QWORD[8+r9] + mov r11,QWORD[56+r9] + + mov r10d,DWORD[r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jb NEAR $L$common_seh_tail + + mov rax,QWORD[152+r8] + + mov r10d,DWORD[4+r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jae NEAR $L$common_seh_tail + + mov r10,QWORD[192+r8] + mov 
rax,QWORD[8+r10*8+rax] + + jmp NEAR $L$common_pop_regs + + + +ALIGN 16 +sqr_handler: + push rsi + push rdi + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + pushfq + sub rsp,64 + + mov rax,QWORD[120+r8] + mov rbx,QWORD[248+r8] + + mov rsi,QWORD[8+r9] + mov r11,QWORD[56+r9] + + mov r10d,DWORD[r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jb NEAR $L$common_seh_tail + + mov r10d,DWORD[4+r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jb NEAR $L$common_pop_regs + + mov rax,QWORD[152+r8] + + mov r10d,DWORD[8+r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jae NEAR $L$common_seh_tail + + mov rax,QWORD[40+rax] + +$L$common_pop_regs: + mov rbx,QWORD[((-8))+rax] + mov rbp,QWORD[((-16))+rax] + mov r12,QWORD[((-24))+rax] + mov r13,QWORD[((-32))+rax] + mov r14,QWORD[((-40))+rax] + mov r15,QWORD[((-48))+rax] + mov QWORD[144+r8],rbx + mov QWORD[160+r8],rbp + mov QWORD[216+r8],r12 + mov QWORD[224+r8],r13 + mov QWORD[232+r8],r14 + mov QWORD[240+r8],r15 + +$L$common_seh_tail: + mov rdi,QWORD[8+rax] + mov rsi,QWORD[16+rax] + mov QWORD[152+r8],rax + mov QWORD[168+r8],rsi + mov QWORD[176+r8],rdi + + mov rdi,QWORD[40+r9] + mov rsi,r8 + mov ecx,154 + DD 0xa548f3fc + + mov rsi,r9 + xor rcx,rcx + mov rdx,QWORD[8+rsi] + mov r8,QWORD[rsi] + mov r9,QWORD[16+rsi] + mov r10,QWORD[40+rsi] + lea r11,[56+rsi] + lea r12,[24+rsi] + mov QWORD[32+rsp],r10 + mov QWORD[40+rsp],r11 + mov QWORD[48+rsp],r12 + mov QWORD[56+rsp],rcx + call QWORD[__imp_RtlVirtualUnwind] + + mov eax,1 + add rsp,64 + popfq + pop r15 + pop r14 + pop r13 + pop r12 + pop rbp + pop rbx + pop rdi + pop rsi + ret + + +section .pdata rdata align=4 +ALIGN 4 + DD $L$SEH_begin_bn_mul_mont_nohw wrt ..imagebase + DD $L$SEH_end_bn_mul_mont_nohw wrt ..imagebase + DD $L$SEH_info_bn_mul_mont_nohw wrt ..imagebase + + DD $L$SEH_begin_bn_mul4x_mont wrt ..imagebase + DD $L$SEH_end_bn_mul4x_mont wrt ..imagebase + DD $L$SEH_info_bn_mul4x_mont wrt ..imagebase + + DD $L$SEH_begin_bn_sqr8x_mont wrt ..imagebase + DD $L$SEH_end_bn_sqr8x_mont wrt ..imagebase + DD $L$SEH_info_bn_sqr8x_mont wrt ..imagebase + DD $L$SEH_begin_bn_mulx4x_mont wrt ..imagebase + DD $L$SEH_end_bn_mulx4x_mont wrt ..imagebase + DD $L$SEH_info_bn_mulx4x_mont wrt ..imagebase +section .xdata rdata align=8 +ALIGN 8 +$L$SEH_info_bn_mul_mont_nohw: + DB 9,0,0,0 + DD mul_handler wrt ..imagebase + DD $L$mul_body wrt ..imagebase,$L$mul_epilogue wrt ..imagebase +$L$SEH_info_bn_mul4x_mont: + DB 9,0,0,0 + DD mul_handler wrt ..imagebase + DD $L$mul4x_body wrt ..imagebase,$L$mul4x_epilogue wrt ..imagebase +$L$SEH_info_bn_sqr8x_mont: + DB 9,0,0,0 + DD sqr_handler wrt ..imagebase + DD $L$sqr8x_prologue wrt ..imagebase,$L$sqr8x_body wrt ..imagebase,$L$sqr8x_epilogue wrt ..imagebase +ALIGN 8 +$L$SEH_info_bn_mulx4x_mont: + DB 9,0,0,0 + DD sqr_handler wrt ..imagebase + DD $L$mulx4x_prologue wrt ..imagebase,$L$mulx4x_body wrt ..imagebase,$L$mulx4x_epilogue wrt ..imagebase +ALIGN 8 +%else +; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 +ret +%endif diff --git a/ring-0.17.14/pregenerated/x86_64-mont-nasm.o b/ring-0.17.14/pregenerated/x86_64-mont-nasm.o new file mode 100644 index 0000000000..211ade6370 Binary files /dev/null and b/ring-0.17.14/pregenerated/x86_64-mont-nasm.o differ diff --git a/ring-0.17.14/pregenerated/x86_64-mont5-elf.S b/ring-0.17.14/pregenerated/x86_64-mont5-elf.S new file mode 100644 index 0000000000..f202ea3fc3 --- /dev/null +++ b/ring-0.17.14/pregenerated/x86_64-mont5-elf.S @@ -0,0 +1,3188 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. 
Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__) +.text + +.globl bn_mul4x_mont_gather5 +.hidden bn_mul4x_mont_gather5 +.type bn_mul4x_mont_gather5,@function +.align 32 +bn_mul4x_mont_gather5: +.cfi_startproc +_CET_ENDBR +.byte 0x67 + movq %rsp,%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 +.Lmul4x_prologue: + +.byte 0x67 + + + + shll $3,%r9d + leaq (%r9,%r9,2),%r10 + negq %r9 + + + + + + + + + + + leaq -320(%rsp,%r9,2),%r11 + movq %rsp,%rbp + subq %rdi,%r11 + andq $4095,%r11 + cmpq %r11,%r10 + jb .Lmul4xsp_alt + subq %r11,%rbp + leaq -320(%rbp,%r9,2),%rbp + jmp .Lmul4xsp_done + +.align 32 +.Lmul4xsp_alt: + leaq 4096-320(,%r9,2),%r10 + leaq -320(%rbp,%r9,2),%rbp + subq %r10,%r11 + movq $0,%r10 + cmovcq %r10,%r11 + subq %r11,%rbp +.Lmul4xsp_done: + andq $-64,%rbp + movq %rsp,%r11 + subq %rbp,%r11 + andq $-4096,%r11 + leaq (%r11,%rbp,1),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja .Lmul4x_page_walk + jmp .Lmul4x_page_walk_done + +.Lmul4x_page_walk: + leaq -4096(%rsp),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja .Lmul4x_page_walk +.Lmul4x_page_walk_done: + + negq %r9 + + movq %rax,40(%rsp) +.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08 +.Lmul4x_body: + + call mul4x_internal + + movq 40(%rsp),%rsi +.cfi_def_cfa %rsi,8 + movq $1,%rax + + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lmul4x_epilogue: + ret +.cfi_endproc +.size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5 + +.type mul4x_internal,@function +.align 32 +mul4x_internal: +.cfi_startproc + shlq $5,%r9 + movd 8(%rax),%xmm5 + leaq .Linc(%rip),%rax + leaq 128(%rdx,%r9,1),%r13 + shrq $5,%r9 + movdqa 0(%rax),%xmm0 + movdqa 16(%rax),%xmm1 + leaq 88-112(%rsp,%r9,1),%r10 + leaq 128(%rdx),%r12 + + pshufd $0,%xmm5,%xmm5 + movdqa %xmm1,%xmm4 +.byte 0x67,0x67 + movdqa %xmm1,%xmm2 + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 +.byte 0x67 + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,112(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,128(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,144(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,160(%r10) + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,176(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,192(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,208(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,224(%r10) + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,240(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,256(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,272(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,288(%r10) + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,304(%r10) + + paddd %xmm2,%xmm3 +.byte 0x67 + pcmpeqd %xmm5,%xmm2 + movdqa 
%xmm1,320(%r10) + + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,336(%r10) + pand 64(%r12),%xmm0 + + pand 80(%r12),%xmm1 + pand 96(%r12),%xmm2 + movdqa %xmm3,352(%r10) + pand 112(%r12),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + movdqa -128(%r12),%xmm4 + movdqa -112(%r12),%xmm5 + movdqa -96(%r12),%xmm2 + pand 112(%r10),%xmm4 + movdqa -80(%r12),%xmm3 + pand 128(%r10),%xmm5 + por %xmm4,%xmm0 + pand 144(%r10),%xmm2 + por %xmm5,%xmm1 + pand 160(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + movdqa -64(%r12),%xmm4 + movdqa -48(%r12),%xmm5 + movdqa -32(%r12),%xmm2 + pand 176(%r10),%xmm4 + movdqa -16(%r12),%xmm3 + pand 192(%r10),%xmm5 + por %xmm4,%xmm0 + pand 208(%r10),%xmm2 + por %xmm5,%xmm1 + pand 224(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + movdqa 0(%r12),%xmm4 + movdqa 16(%r12),%xmm5 + movdqa 32(%r12),%xmm2 + pand 240(%r10),%xmm4 + movdqa 48(%r12),%xmm3 + pand 256(%r10),%xmm5 + por %xmm4,%xmm0 + pand 272(%r10),%xmm2 + por %xmm5,%xmm1 + pand 288(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + por %xmm1,%xmm0 + + pshufd $0x4e,%xmm0,%xmm1 + por %xmm1,%xmm0 + leaq 256(%r12),%r12 +.byte 102,72,15,126,195 + + movq %r13,16+8(%rsp) + movq %rdi,56+8(%rsp) + + movq (%r8),%r8 + movq (%rsi),%rax + leaq (%rsi,%r9,1),%rsi + negq %r9 + + movq %r8,%rbp + mulq %rbx + movq %rax,%r10 + movq (%rcx),%rax + + imulq %r10,%rbp + leaq 64+8(%rsp),%r14 + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r10 + movq 8(%rsi,%r9,1),%rax + adcq $0,%rdx + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq 8(%rcx),%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq 16(%rsi,%r9,1),%rax + adcq $0,%rdx + addq %r11,%rdi + leaq 32(%r9),%r15 + leaq 32(%rcx),%rcx + adcq $0,%rdx + movq %rdi,(%r14) + movq %rdx,%r13 + jmp .L1st4x + +.align 32 +.L1st4x: + mulq %rbx + addq %rax,%r10 + movq -16(%rcx),%rax + leaq 32(%r14),%r14 + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq -8(%rsi,%r15,1),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %r13,-24(%r14) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq -8(%rcx),%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq (%rsi,%r15,1),%rax + adcq $0,%rdx + addq %r11,%rdi + adcq $0,%rdx + movq %rdi,-16(%r14) + movq %rdx,%r13 + + mulq %rbx + addq %rax,%r10 + movq 0(%rcx),%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq 8(%rsi,%r15,1),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %r13,-8(%r14) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq 8(%rcx),%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq 16(%rsi,%r15,1),%rax + adcq $0,%rdx + addq %r11,%rdi + leaq 32(%rcx),%rcx + adcq $0,%rdx + movq %rdi,(%r14) + movq %rdx,%r13 + + addq $32,%r15 + jnz .L1st4x + + mulq %rbx + addq %rax,%r10 + movq -16(%rcx),%rax + leaq 32(%r14),%r14 + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq -8(%rsi),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %r13,-24(%r14) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq -8(%rcx),%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq (%rsi,%r9,1),%rax + adcq $0,%rdx + addq %r11,%rdi + adcq $0,%rdx + movq %rdi,-16(%r14) + movq %rdx,%r13 + + leaq (%rcx,%r9,1),%rcx + + xorq %rdi,%rdi + addq %r10,%r13 + adcq $0,%rdi + movq %r13,-8(%r14) + + jmp .Louter4x + +.align 32 +.Louter4x: + leaq 16+128(%r14),%rdx + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + movdqa -128(%r12),%xmm0 + movdqa -112(%r12),%xmm1 + movdqa -96(%r12),%xmm2 + movdqa -80(%r12),%xmm3 + pand -128(%rdx),%xmm0 + pand 
-112(%rdx),%xmm1 + por %xmm0,%xmm4 + pand -96(%rdx),%xmm2 + por %xmm1,%xmm5 + pand -80(%rdx),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa -64(%r12),%xmm0 + movdqa -48(%r12),%xmm1 + movdqa -32(%r12),%xmm2 + movdqa -16(%r12),%xmm3 + pand -64(%rdx),%xmm0 + pand -48(%rdx),%xmm1 + por %xmm0,%xmm4 + pand -32(%rdx),%xmm2 + por %xmm1,%xmm5 + pand -16(%rdx),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa 0(%r12),%xmm0 + movdqa 16(%r12),%xmm1 + movdqa 32(%r12),%xmm2 + movdqa 48(%r12),%xmm3 + pand 0(%rdx),%xmm0 + pand 16(%rdx),%xmm1 + por %xmm0,%xmm4 + pand 32(%rdx),%xmm2 + por %xmm1,%xmm5 + pand 48(%rdx),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa 64(%r12),%xmm0 + movdqa 80(%r12),%xmm1 + movdqa 96(%r12),%xmm2 + movdqa 112(%r12),%xmm3 + pand 64(%rdx),%xmm0 + pand 80(%rdx),%xmm1 + por %xmm0,%xmm4 + pand 96(%rdx),%xmm2 + por %xmm1,%xmm5 + pand 112(%rdx),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + por %xmm5,%xmm4 + + pshufd $0x4e,%xmm4,%xmm0 + por %xmm4,%xmm0 + leaq 256(%r12),%r12 +.byte 102,72,15,126,195 + + movq (%r14,%r9,1),%r10 + movq %r8,%rbp + mulq %rbx + addq %rax,%r10 + movq (%rcx),%rax + adcq $0,%rdx + + imulq %r10,%rbp + movq %rdx,%r11 + movq %rdi,(%r14) + + leaq (%r14,%r9,1),%r14 + + mulq %rbp + addq %rax,%r10 + movq 8(%rsi,%r9,1),%rax + adcq $0,%rdx + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq 8(%rcx),%rax + adcq $0,%rdx + addq 8(%r14),%r11 + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq 16(%rsi,%r9,1),%rax + adcq $0,%rdx + addq %r11,%rdi + leaq 32(%r9),%r15 + leaq 32(%rcx),%rcx + adcq $0,%rdx + movq %rdx,%r13 + jmp .Linner4x + +.align 32 +.Linner4x: + mulq %rbx + addq %rax,%r10 + movq -16(%rcx),%rax + adcq $0,%rdx + addq 16(%r14),%r10 + leaq 32(%r14),%r14 + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq -8(%rsi,%r15,1),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdi,-32(%r14) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq -8(%rcx),%rax + adcq $0,%rdx + addq -8(%r14),%r11 + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq (%rsi,%r15,1),%rax + adcq $0,%rdx + addq %r11,%rdi + adcq $0,%rdx + movq %r13,-24(%r14) + movq %rdx,%r13 + + mulq %rbx + addq %rax,%r10 + movq 0(%rcx),%rax + adcq $0,%rdx + addq (%r14),%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq 8(%rsi,%r15,1),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdi,-16(%r14) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq 8(%rcx),%rax + adcq $0,%rdx + addq 8(%r14),%r11 + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq 16(%rsi,%r15,1),%rax + adcq $0,%rdx + addq %r11,%rdi + leaq 32(%rcx),%rcx + adcq $0,%rdx + movq %r13,-8(%r14) + movq %rdx,%r13 + + addq $32,%r15 + jnz .Linner4x + + mulq %rbx + addq %rax,%r10 + movq -16(%rcx),%rax + adcq $0,%rdx + addq 16(%r14),%r10 + leaq 32(%r14),%r14 + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq -8(%rsi),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdi,-32(%r14) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq %rbp,%rax + movq -8(%rcx),%rbp + adcq $0,%rdx + addq -8(%r14),%r11 + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq (%rsi,%r9,1),%rax + adcq $0,%rdx + addq %r11,%rdi + adcq $0,%rdx + movq %r13,-24(%r14) + movq %rdx,%r13 + + movq %rdi,-16(%r14) + leaq (%rcx,%r9,1),%rcx + + xorq %rdi,%rdi + addq %r10,%r13 + adcq $0,%rdi + addq (%r14),%r13 + adcq $0,%rdi + movq %r13,-8(%r14) + + cmpq 16+8(%rsp),%r12 + jb .Louter4x + xorq %rax,%rax + subq %r13,%rbp + adcq %r15,%r15 
+ orq %r15,%rdi + subq %rdi,%rax + leaq (%r14,%r9,1),%rbx + movq (%rcx),%r12 + leaq (%rcx),%rbp + movq %r9,%rcx + sarq $3+2,%rcx + movq 56+8(%rsp),%rdi + decq %r12 + xorq %r10,%r10 + movq 8(%rbp),%r13 + movq 16(%rbp),%r14 + movq 24(%rbp),%r15 + jmp .Lsqr4x_sub_entry +.cfi_endproc +.size mul4x_internal,.-mul4x_internal +.globl bn_power5_nohw +.hidden bn_power5_nohw +.type bn_power5_nohw,@function +.align 32 +bn_power5_nohw: +.cfi_startproc +_CET_ENDBR + movq %rsp,%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 +.Lpower5_prologue: + + + + + shll $3,%r9d + leal (%r9,%r9,2),%r10d + negq %r9 + movq (%r8),%r8 + + + + + + + + + leaq -320(%rsp,%r9,2),%r11 + movq %rsp,%rbp + subq %rdi,%r11 + andq $4095,%r11 + cmpq %r11,%r10 + jb .Lpwr_sp_alt + subq %r11,%rbp + leaq -320(%rbp,%r9,2),%rbp + jmp .Lpwr_sp_done + +.align 32 +.Lpwr_sp_alt: + leaq 4096-320(,%r9,2),%r10 + leaq -320(%rbp,%r9,2),%rbp + subq %r10,%r11 + movq $0,%r10 + cmovcq %r10,%r11 + subq %r11,%rbp +.Lpwr_sp_done: + andq $-64,%rbp + movq %rsp,%r11 + subq %rbp,%r11 + andq $-4096,%r11 + leaq (%r11,%rbp,1),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja .Lpwr_page_walk + jmp .Lpwr_page_walk_done + +.Lpwr_page_walk: + leaq -4096(%rsp),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja .Lpwr_page_walk +.Lpwr_page_walk_done: + + movq %r9,%r10 + negq %r9 + + + + + + + + + + + movq %r8,32(%rsp) + movq %rax,40(%rsp) +.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08 +.Lpower5_body: +.byte 102,72,15,110,207 +.byte 102,72,15,110,209 +.byte 102,73,15,110,218 +.byte 102,72,15,110,226 + + call __bn_sqr8x_internal + call __bn_post4x_internal + call __bn_sqr8x_internal + call __bn_post4x_internal + call __bn_sqr8x_internal + call __bn_post4x_internal + call __bn_sqr8x_internal + call __bn_post4x_internal + call __bn_sqr8x_internal + call __bn_post4x_internal + +.byte 102,72,15,126,209 +.byte 102,72,15,126,226 + movq %rsi,%rdi + movq 40(%rsp),%rax + leaq 32(%rsp),%r8 + + call mul4x_internal + + movq 40(%rsp),%rsi +.cfi_def_cfa %rsi,8 + movq $1,%rax + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lpower5_epilogue: + ret +.cfi_endproc +.size bn_power5_nohw,.-bn_power5_nohw + +.globl bn_sqr8x_internal +.hidden bn_sqr8x_internal +.hidden bn_sqr8x_internal +.type bn_sqr8x_internal,@function +.align 32 +bn_sqr8x_internal: +__bn_sqr8x_internal: +.cfi_startproc +_CET_ENDBR + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + leaq 32(%r10),%rbp + leaq (%rsi,%r9,1),%rsi + + movq %r9,%rcx + + + movq -32(%rsi,%rbp,1),%r14 + leaq 48+8(%rsp,%r9,2),%rdi + movq -24(%rsi,%rbp,1),%rax + leaq -32(%rdi,%rbp,1),%rdi + movq -16(%rsi,%rbp,1),%rbx + movq %rax,%r15 + + mulq %r14 + movq %rax,%r10 + movq %rbx,%rax + movq %rdx,%r11 + movq %r10,-24(%rdi,%rbp,1) + + mulq %r14 + addq %rax,%r11 + movq %rbx,%rax + adcq $0,%rdx + movq %r11,-16(%rdi,%rbp,1) + movq %rdx,%r10 + + + movq -8(%rsi,%rbp,1),%rbx + mulq %r15 + movq %rax,%r12 + movq %rbx,%rax + movq %rdx,%r13 + + leaq (%rbp),%rcx + mulq %r14 + addq %rax,%r10 + movq %rbx,%rax + movq %rdx,%r11 + adcq $0,%r11 + addq 
%r12,%r10 + adcq $0,%r11 + movq %r10,-8(%rdi,%rcx,1) + jmp .Lsqr4x_1st + +.align 32 +.Lsqr4x_1st: + movq (%rsi,%rcx,1),%rbx + mulq %r15 + addq %rax,%r13 + movq %rbx,%rax + movq %rdx,%r12 + adcq $0,%r12 + + mulq %r14 + addq %rax,%r11 + movq %rbx,%rax + movq 8(%rsi,%rcx,1),%rbx + movq %rdx,%r10 + adcq $0,%r10 + addq %r13,%r11 + adcq $0,%r10 + + + mulq %r15 + addq %rax,%r12 + movq %rbx,%rax + movq %r11,(%rdi,%rcx,1) + movq %rdx,%r13 + adcq $0,%r13 + + mulq %r14 + addq %rax,%r10 + movq %rbx,%rax + movq 16(%rsi,%rcx,1),%rbx + movq %rdx,%r11 + adcq $0,%r11 + addq %r12,%r10 + adcq $0,%r11 + + mulq %r15 + addq %rax,%r13 + movq %rbx,%rax + movq %r10,8(%rdi,%rcx,1) + movq %rdx,%r12 + adcq $0,%r12 + + mulq %r14 + addq %rax,%r11 + movq %rbx,%rax + movq 24(%rsi,%rcx,1),%rbx + movq %rdx,%r10 + adcq $0,%r10 + addq %r13,%r11 + adcq $0,%r10 + + + mulq %r15 + addq %rax,%r12 + movq %rbx,%rax + movq %r11,16(%rdi,%rcx,1) + movq %rdx,%r13 + adcq $0,%r13 + leaq 32(%rcx),%rcx + + mulq %r14 + addq %rax,%r10 + movq %rbx,%rax + movq %rdx,%r11 + adcq $0,%r11 + addq %r12,%r10 + adcq $0,%r11 + movq %r10,-8(%rdi,%rcx,1) + + cmpq $0,%rcx + jne .Lsqr4x_1st + + mulq %r15 + addq %rax,%r13 + leaq 16(%rbp),%rbp + adcq $0,%rdx + addq %r11,%r13 + adcq $0,%rdx + + movq %r13,(%rdi) + movq %rdx,%r12 + movq %rdx,8(%rdi) + jmp .Lsqr4x_outer + +.align 32 +.Lsqr4x_outer: + movq -32(%rsi,%rbp,1),%r14 + leaq 48+8(%rsp,%r9,2),%rdi + movq -24(%rsi,%rbp,1),%rax + leaq -32(%rdi,%rbp,1),%rdi + movq -16(%rsi,%rbp,1),%rbx + movq %rax,%r15 + + mulq %r14 + movq -24(%rdi,%rbp,1),%r10 + addq %rax,%r10 + movq %rbx,%rax + adcq $0,%rdx + movq %r10,-24(%rdi,%rbp,1) + movq %rdx,%r11 + + mulq %r14 + addq %rax,%r11 + movq %rbx,%rax + adcq $0,%rdx + addq -16(%rdi,%rbp,1),%r11 + movq %rdx,%r10 + adcq $0,%r10 + movq %r11,-16(%rdi,%rbp,1) + + xorq %r12,%r12 + + movq -8(%rsi,%rbp,1),%rbx + mulq %r15 + addq %rax,%r12 + movq %rbx,%rax + adcq $0,%rdx + addq -8(%rdi,%rbp,1),%r12 + movq %rdx,%r13 + adcq $0,%r13 + + mulq %r14 + addq %rax,%r10 + movq %rbx,%rax + adcq $0,%rdx + addq %r12,%r10 + movq %rdx,%r11 + adcq $0,%r11 + movq %r10,-8(%rdi,%rbp,1) + + leaq (%rbp),%rcx + jmp .Lsqr4x_inner + +.align 32 +.Lsqr4x_inner: + movq (%rsi,%rcx,1),%rbx + mulq %r15 + addq %rax,%r13 + movq %rbx,%rax + movq %rdx,%r12 + adcq $0,%r12 + addq (%rdi,%rcx,1),%r13 + adcq $0,%r12 + +.byte 0x67 + mulq %r14 + addq %rax,%r11 + movq %rbx,%rax + movq 8(%rsi,%rcx,1),%rbx + movq %rdx,%r10 + adcq $0,%r10 + addq %r13,%r11 + adcq $0,%r10 + + mulq %r15 + addq %rax,%r12 + movq %r11,(%rdi,%rcx,1) + movq %rbx,%rax + movq %rdx,%r13 + adcq $0,%r13 + addq 8(%rdi,%rcx,1),%r12 + leaq 16(%rcx),%rcx + adcq $0,%r13 + + mulq %r14 + addq %rax,%r10 + movq %rbx,%rax + adcq $0,%rdx + addq %r12,%r10 + movq %rdx,%r11 + adcq $0,%r11 + movq %r10,-8(%rdi,%rcx,1) + + cmpq $0,%rcx + jne .Lsqr4x_inner + +.byte 0x67 + mulq %r15 + addq %rax,%r13 + adcq $0,%rdx + addq %r11,%r13 + adcq $0,%rdx + + movq %r13,(%rdi) + movq %rdx,%r12 + movq %rdx,8(%rdi) + + addq $16,%rbp + jnz .Lsqr4x_outer + + + movq -32(%rsi),%r14 + leaq 48+8(%rsp,%r9,2),%rdi + movq -24(%rsi),%rax + leaq -32(%rdi,%rbp,1),%rdi + movq -16(%rsi),%rbx + movq %rax,%r15 + + mulq %r14 + addq %rax,%r10 + movq %rbx,%rax + movq %rdx,%r11 + adcq $0,%r11 + + mulq %r14 + addq %rax,%r11 + movq %rbx,%rax + movq %r10,-24(%rdi) + movq %rdx,%r10 + adcq $0,%r10 + addq %r13,%r11 + movq -8(%rsi),%rbx + adcq $0,%r10 + + mulq %r15 + addq %rax,%r12 + movq %rbx,%rax + movq %r11,-16(%rdi) + movq %rdx,%r13 + adcq $0,%r13 + + mulq %r14 + addq %rax,%r10 + movq %rbx,%rax + movq 
%rdx,%r11 + adcq $0,%r11 + addq %r12,%r10 + adcq $0,%r11 + movq %r10,-8(%rdi) + + mulq %r15 + addq %rax,%r13 + movq -16(%rsi),%rax + adcq $0,%rdx + addq %r11,%r13 + adcq $0,%rdx + + movq %r13,(%rdi) + movq %rdx,%r12 + movq %rdx,8(%rdi) + + mulq %rbx + addq $16,%rbp + xorq %r14,%r14 + subq %r9,%rbp + xorq %r15,%r15 + + addq %r12,%rax + adcq $0,%rdx + movq %rax,8(%rdi) + movq %rdx,16(%rdi) + movq %r15,24(%rdi) + + movq -16(%rsi,%rbp,1),%rax + leaq 48+8(%rsp),%rdi + xorq %r10,%r10 + movq 8(%rdi),%r11 + + leaq (%r14,%r10,2),%r12 + shrq $63,%r10 + leaq (%rcx,%r11,2),%r13 + shrq $63,%r11 + orq %r10,%r13 + movq 16(%rdi),%r10 + movq %r11,%r14 + mulq %rax + negq %r15 + movq 24(%rdi),%r11 + adcq %rax,%r12 + movq -8(%rsi,%rbp,1),%rax + movq %r12,(%rdi) + adcq %rdx,%r13 + + leaq (%r14,%r10,2),%rbx + movq %r13,8(%rdi) + sbbq %r15,%r15 + shrq $63,%r10 + leaq (%rcx,%r11,2),%r8 + shrq $63,%r11 + orq %r10,%r8 + movq 32(%rdi),%r10 + movq %r11,%r14 + mulq %rax + negq %r15 + movq 40(%rdi),%r11 + adcq %rax,%rbx + movq 0(%rsi,%rbp,1),%rax + movq %rbx,16(%rdi) + adcq %rdx,%r8 + leaq 16(%rbp),%rbp + movq %r8,24(%rdi) + sbbq %r15,%r15 + leaq 64(%rdi),%rdi + jmp .Lsqr4x_shift_n_add + +.align 32 +.Lsqr4x_shift_n_add: + leaq (%r14,%r10,2),%r12 + shrq $63,%r10 + leaq (%rcx,%r11,2),%r13 + shrq $63,%r11 + orq %r10,%r13 + movq -16(%rdi),%r10 + movq %r11,%r14 + mulq %rax + negq %r15 + movq -8(%rdi),%r11 + adcq %rax,%r12 + movq -8(%rsi,%rbp,1),%rax + movq %r12,-32(%rdi) + adcq %rdx,%r13 + + leaq (%r14,%r10,2),%rbx + movq %r13,-24(%rdi) + sbbq %r15,%r15 + shrq $63,%r10 + leaq (%rcx,%r11,2),%r8 + shrq $63,%r11 + orq %r10,%r8 + movq 0(%rdi),%r10 + movq %r11,%r14 + mulq %rax + negq %r15 + movq 8(%rdi),%r11 + adcq %rax,%rbx + movq 0(%rsi,%rbp,1),%rax + movq %rbx,-16(%rdi) + adcq %rdx,%r8 + + leaq (%r14,%r10,2),%r12 + movq %r8,-8(%rdi) + sbbq %r15,%r15 + shrq $63,%r10 + leaq (%rcx,%r11,2),%r13 + shrq $63,%r11 + orq %r10,%r13 + movq 16(%rdi),%r10 + movq %r11,%r14 + mulq %rax + negq %r15 + movq 24(%rdi),%r11 + adcq %rax,%r12 + movq 8(%rsi,%rbp,1),%rax + movq %r12,0(%rdi) + adcq %rdx,%r13 + + leaq (%r14,%r10,2),%rbx + movq %r13,8(%rdi) + sbbq %r15,%r15 + shrq $63,%r10 + leaq (%rcx,%r11,2),%r8 + shrq $63,%r11 + orq %r10,%r8 + movq 32(%rdi),%r10 + movq %r11,%r14 + mulq %rax + negq %r15 + movq 40(%rdi),%r11 + adcq %rax,%rbx + movq 16(%rsi,%rbp,1),%rax + movq %rbx,16(%rdi) + adcq %rdx,%r8 + movq %r8,24(%rdi) + sbbq %r15,%r15 + leaq 64(%rdi),%rdi + addq $32,%rbp + jnz .Lsqr4x_shift_n_add + + leaq (%r14,%r10,2),%r12 +.byte 0x67 + shrq $63,%r10 + leaq (%rcx,%r11,2),%r13 + shrq $63,%r11 + orq %r10,%r13 + movq -16(%rdi),%r10 + movq %r11,%r14 + mulq %rax + negq %r15 + movq -8(%rdi),%r11 + adcq %rax,%r12 + movq -8(%rsi),%rax + movq %r12,-32(%rdi) + adcq %rdx,%r13 + + leaq (%r14,%r10,2),%rbx + movq %r13,-24(%rdi) + sbbq %r15,%r15 + shrq $63,%r10 + leaq (%rcx,%r11,2),%r8 + shrq $63,%r11 + orq %r10,%r8 + mulq %rax + negq %r15 + adcq %rax,%rbx + adcq %rdx,%r8 + movq %rbx,-16(%rdi) + movq %r8,-8(%rdi) +.byte 102,72,15,126,213 +__bn_sqr8x_reduction: + xorq %rax,%rax + leaq (%r9,%rbp,1),%rcx + leaq 48+8(%rsp,%r9,2),%rdx + movq %rcx,0+8(%rsp) + leaq 48+8(%rsp,%r9,1),%rdi + movq %rdx,8+8(%rsp) + negq %r9 + jmp .L8x_reduction_loop + +.align 32 +.L8x_reduction_loop: + leaq (%rdi,%r9,1),%rdi +.byte 0x66 + movq 0(%rdi),%rbx + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + movq 32(%rdi),%r12 + movq 40(%rdi),%r13 + movq 48(%rdi),%r14 + movq 56(%rdi),%r15 + movq %rax,(%rdx) + leaq 64(%rdi),%rdi + +.byte 0x67 + movq %rbx,%r8 + imulq 
32+8(%rsp),%rbx + movq 0(%rbp),%rax + movl $8,%ecx + jmp .L8x_reduce + +.align 32 +.L8x_reduce: + mulq %rbx + movq 8(%rbp),%rax + negq %r8 + movq %rdx,%r8 + adcq $0,%r8 + + mulq %rbx + addq %rax,%r9 + movq 16(%rbp),%rax + adcq $0,%rdx + addq %r9,%r8 + movq %rbx,48-8+8(%rsp,%rcx,8) + movq %rdx,%r9 + adcq $0,%r9 + + mulq %rbx + addq %rax,%r10 + movq 24(%rbp),%rax + adcq $0,%rdx + addq %r10,%r9 + movq 32+8(%rsp),%rsi + movq %rdx,%r10 + adcq $0,%r10 + + mulq %rbx + addq %rax,%r11 + movq 32(%rbp),%rax + adcq $0,%rdx + imulq %r8,%rsi + addq %r11,%r10 + movq %rdx,%r11 + adcq $0,%r11 + + mulq %rbx + addq %rax,%r12 + movq 40(%rbp),%rax + adcq $0,%rdx + addq %r12,%r11 + movq %rdx,%r12 + adcq $0,%r12 + + mulq %rbx + addq %rax,%r13 + movq 48(%rbp),%rax + adcq $0,%rdx + addq %r13,%r12 + movq %rdx,%r13 + adcq $0,%r13 + + mulq %rbx + addq %rax,%r14 + movq 56(%rbp),%rax + adcq $0,%rdx + addq %r14,%r13 + movq %rdx,%r14 + adcq $0,%r14 + + mulq %rbx + movq %rsi,%rbx + addq %rax,%r15 + movq 0(%rbp),%rax + adcq $0,%rdx + addq %r15,%r14 + movq %rdx,%r15 + adcq $0,%r15 + + decl %ecx + jnz .L8x_reduce + + leaq 64(%rbp),%rbp + xorq %rax,%rax + movq 8+8(%rsp),%rdx + cmpq 0+8(%rsp),%rbp + jae .L8x_no_tail + +.byte 0x66 + addq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq 48(%rdi),%r14 + adcq 56(%rdi),%r15 + sbbq %rsi,%rsi + + movq 48+56+8(%rsp),%rbx + movl $8,%ecx + movq 0(%rbp),%rax + jmp .L8x_tail + +.align 32 +.L8x_tail: + mulq %rbx + addq %rax,%r8 + movq 8(%rbp),%rax + movq %r8,(%rdi) + movq %rdx,%r8 + adcq $0,%r8 + + mulq %rbx + addq %rax,%r9 + movq 16(%rbp),%rax + adcq $0,%rdx + addq %r9,%r8 + leaq 8(%rdi),%rdi + movq %rdx,%r9 + adcq $0,%r9 + + mulq %rbx + addq %rax,%r10 + movq 24(%rbp),%rax + adcq $0,%rdx + addq %r10,%r9 + movq %rdx,%r10 + adcq $0,%r10 + + mulq %rbx + addq %rax,%r11 + movq 32(%rbp),%rax + adcq $0,%rdx + addq %r11,%r10 + movq %rdx,%r11 + adcq $0,%r11 + + mulq %rbx + addq %rax,%r12 + movq 40(%rbp),%rax + adcq $0,%rdx + addq %r12,%r11 + movq %rdx,%r12 + adcq $0,%r12 + + mulq %rbx + addq %rax,%r13 + movq 48(%rbp),%rax + adcq $0,%rdx + addq %r13,%r12 + movq %rdx,%r13 + adcq $0,%r13 + + mulq %rbx + addq %rax,%r14 + movq 56(%rbp),%rax + adcq $0,%rdx + addq %r14,%r13 + movq %rdx,%r14 + adcq $0,%r14 + + mulq %rbx + movq 48-16+8(%rsp,%rcx,8),%rbx + addq %rax,%r15 + adcq $0,%rdx + addq %r15,%r14 + movq 0(%rbp),%rax + movq %rdx,%r15 + adcq $0,%r15 + + decl %ecx + jnz .L8x_tail + + leaq 64(%rbp),%rbp + movq 8+8(%rsp),%rdx + cmpq 0+8(%rsp),%rbp + jae .L8x_tail_done + + movq 48+56+8(%rsp),%rbx + negq %rsi + movq 0(%rbp),%rax + adcq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq 48(%rdi),%r14 + adcq 56(%rdi),%r15 + sbbq %rsi,%rsi + + movl $8,%ecx + jmp .L8x_tail + +.align 32 +.L8x_tail_done: + xorq %rax,%rax + addq (%rdx),%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + adcq $0,%rax + + negq %rsi +.L8x_no_tail: + adcq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq 48(%rdi),%r14 + adcq 56(%rdi),%r15 + adcq $0,%rax + movq -8(%rbp),%rcx + xorq %rsi,%rsi + +.byte 102,72,15,126,213 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) +.byte 102,73,15,126,217 + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq %r14,48(%rdi) + movq %r15,56(%rdi) + leaq 64(%rdi),%rdi + + cmpq %rdx,%rdi + jb 
.L8x_reduction_loop + ret +.cfi_endproc +.size bn_sqr8x_internal,.-bn_sqr8x_internal +.type __bn_post4x_internal,@function +.align 32 +__bn_post4x_internal: +.cfi_startproc + movq 0(%rbp),%r12 + leaq (%rdi,%r9,1),%rbx + movq %r9,%rcx +.byte 102,72,15,126,207 + negq %rax +.byte 102,72,15,126,206 + sarq $3+2,%rcx + decq %r12 + xorq %r10,%r10 + movq 8(%rbp),%r13 + movq 16(%rbp),%r14 + movq 24(%rbp),%r15 + jmp .Lsqr4x_sub_entry + +.align 16 +.Lsqr4x_sub: + movq 0(%rbp),%r12 + movq 8(%rbp),%r13 + movq 16(%rbp),%r14 + movq 24(%rbp),%r15 +.Lsqr4x_sub_entry: + leaq 32(%rbp),%rbp + notq %r12 + notq %r13 + notq %r14 + notq %r15 + andq %rax,%r12 + andq %rax,%r13 + andq %rax,%r14 + andq %rax,%r15 + + negq %r10 + adcq 0(%rbx),%r12 + adcq 8(%rbx),%r13 + adcq 16(%rbx),%r14 + adcq 24(%rbx),%r15 + movq %r12,0(%rdi) + leaq 32(%rbx),%rbx + movq %r13,8(%rdi) + sbbq %r10,%r10 + movq %r14,16(%rdi) + movq %r15,24(%rdi) + leaq 32(%rdi),%rdi + + incq %rcx + jnz .Lsqr4x_sub + + movq %r9,%r10 + negq %r9 + ret +.cfi_endproc +.size __bn_post4x_internal,.-__bn_post4x_internal +.globl bn_mulx4x_mont_gather5 +.hidden bn_mulx4x_mont_gather5 +.type bn_mulx4x_mont_gather5,@function +.align 32 +bn_mulx4x_mont_gather5: +.cfi_startproc +_CET_ENDBR + movq %rsp,%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 +.Lmulx4x_prologue: + + + + + shll $3,%r9d + leaq (%r9,%r9,2),%r10 + negq %r9 + movq (%r8),%r8 + + + + + + + + + + + leaq -320(%rsp,%r9,2),%r11 + movq %rsp,%rbp + subq %rdi,%r11 + andq $4095,%r11 + cmpq %r11,%r10 + jb .Lmulx4xsp_alt + subq %r11,%rbp + leaq -320(%rbp,%r9,2),%rbp + jmp .Lmulx4xsp_done + +.Lmulx4xsp_alt: + leaq 4096-320(,%r9,2),%r10 + leaq -320(%rbp,%r9,2),%rbp + subq %r10,%r11 + movq $0,%r10 + cmovcq %r10,%r11 + subq %r11,%rbp +.Lmulx4xsp_done: + andq $-64,%rbp + movq %rsp,%r11 + subq %rbp,%r11 + andq $-4096,%r11 + leaq (%r11,%rbp,1),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja .Lmulx4x_page_walk + jmp .Lmulx4x_page_walk_done + +.Lmulx4x_page_walk: + leaq -4096(%rsp),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja .Lmulx4x_page_walk +.Lmulx4x_page_walk_done: + + + + + + + + + + + + + + movq %r8,32(%rsp) + movq %rax,40(%rsp) +.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08 +.Lmulx4x_body: + call mulx4x_internal + + movq 40(%rsp),%rsi +.cfi_def_cfa %rsi,8 + movq $1,%rax + + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lmulx4x_epilogue: + ret +.cfi_endproc +.size bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5 + +.type mulx4x_internal,@function +.align 32 +mulx4x_internal: +.cfi_startproc + movq %r9,8(%rsp) + movq %r9,%r10 + negq %r9 + shlq $5,%r9 + negq %r10 + leaq 128(%rdx,%r9,1),%r13 + shrq $5+5,%r9 + movd 8(%rax),%xmm5 + subq $1,%r9 + leaq .Linc(%rip),%rax + movq %r13,16+8(%rsp) + movq %r9,24+8(%rsp) + movq %rdi,56+8(%rsp) + movdqa 0(%rax),%xmm0 + movdqa 16(%rax),%xmm1 + leaq 88-112(%rsp,%r10,1),%r10 + leaq 128(%rdx),%rdi + + pshufd $0,%xmm5,%xmm5 + movdqa %xmm1,%xmm4 +.byte 0x67 + movdqa %xmm1,%xmm2 +.byte 0x67 + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,112(%r10) + movdqa %xmm4,%xmm0 + + paddd 
%xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,128(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,144(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,160(%r10) + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,176(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,192(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,208(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,224(%r10) + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,240(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,256(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,272(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,288(%r10) + movdqa %xmm4,%xmm3 +.byte 0x67 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,304(%r10) + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,320(%r10) + + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,336(%r10) + + pand 64(%rdi),%xmm0 + pand 80(%rdi),%xmm1 + pand 96(%rdi),%xmm2 + movdqa %xmm3,352(%r10) + pand 112(%rdi),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + movdqa -128(%rdi),%xmm4 + movdqa -112(%rdi),%xmm5 + movdqa -96(%rdi),%xmm2 + pand 112(%r10),%xmm4 + movdqa -80(%rdi),%xmm3 + pand 128(%r10),%xmm5 + por %xmm4,%xmm0 + pand 144(%r10),%xmm2 + por %xmm5,%xmm1 + pand 160(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + movdqa -64(%rdi),%xmm4 + movdqa -48(%rdi),%xmm5 + movdqa -32(%rdi),%xmm2 + pand 176(%r10),%xmm4 + movdqa -16(%rdi),%xmm3 + pand 192(%r10),%xmm5 + por %xmm4,%xmm0 + pand 208(%r10),%xmm2 + por %xmm5,%xmm1 + pand 224(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + movdqa 0(%rdi),%xmm4 + movdqa 16(%rdi),%xmm5 + movdqa 32(%rdi),%xmm2 + pand 240(%r10),%xmm4 + movdqa 48(%rdi),%xmm3 + pand 256(%r10),%xmm5 + por %xmm4,%xmm0 + pand 272(%r10),%xmm2 + por %xmm5,%xmm1 + pand 288(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + pxor %xmm1,%xmm0 + + pshufd $0x4e,%xmm0,%xmm1 + por %xmm1,%xmm0 + leaq 256(%rdi),%rdi +.byte 102,72,15,126,194 + leaq 64+32+8(%rsp),%rbx + + movq %rdx,%r9 + mulxq 0(%rsi),%r8,%rax + mulxq 8(%rsi),%r11,%r12 + addq %rax,%r11 + mulxq 16(%rsi),%rax,%r13 + adcq %rax,%r12 + adcq $0,%r13 + mulxq 24(%rsi),%rax,%r14 + + movq %r8,%r15 + imulq 32+8(%rsp),%r8 + xorq %rbp,%rbp + movq %r8,%rdx + + movq %rdi,8+8(%rsp) + + leaq 32(%rsi),%rsi + adcxq %rax,%r13 + adcxq %rbp,%r14 + + mulxq 0(%rcx),%rax,%r10 + adcxq %rax,%r15 + adoxq %r11,%r10 + mulxq 8(%rcx),%rax,%r11 + adcxq %rax,%r10 + adoxq %r12,%r11 + mulxq 16(%rcx),%rax,%r12 + movq 24+8(%rsp),%rdi + movq %r10,-32(%rbx) + adcxq %rax,%r11 + adoxq %r13,%r12 + mulxq 24(%rcx),%rax,%r15 + movq %r9,%rdx + movq %r11,-24(%rbx) + adcxq %rax,%r12 + adoxq %rbp,%r15 + leaq 32(%rcx),%rcx + movq %r12,-16(%rbx) + jmp .Lmulx4x_1st + +.align 32 +.Lmulx4x_1st: + adcxq %rbp,%r15 + mulxq 0(%rsi),%r10,%rax + adcxq %r14,%r10 + mulxq 8(%rsi),%r11,%r14 + adcxq %rax,%r11 + mulxq 16(%rsi),%r12,%rax + adcxq %r14,%r12 + mulxq 24(%rsi),%r13,%r14 +.byte 0x67,0x67 + movq %r8,%rdx + adcxq %rax,%r13 + adcxq %rbp,%r14 + leaq 32(%rsi),%rsi + leaq 32(%rbx),%rbx + + adoxq %r15,%r10 + mulxq 0(%rcx),%rax,%r15 + adcxq %rax,%r10 + adoxq %r15,%r11 + mulxq 8(%rcx),%rax,%r15 + adcxq %rax,%r11 + adoxq %r15,%r12 + mulxq 16(%rcx),%rax,%r15 + movq %r10,-40(%rbx) + adcxq %rax,%r12 + movq %r11,-32(%rbx) + adoxq %r15,%r13 + mulxq 
24(%rcx),%rax,%r15 + movq %r9,%rdx + movq %r12,-24(%rbx) + adcxq %rax,%r13 + adoxq %rbp,%r15 + leaq 32(%rcx),%rcx + movq %r13,-16(%rbx) + + decq %rdi + jnz .Lmulx4x_1st + + movq 8(%rsp),%rax + adcq %rbp,%r15 + leaq (%rsi,%rax,1),%rsi + addq %r15,%r14 + movq 8+8(%rsp),%rdi + adcq %rbp,%rbp + movq %r14,-8(%rbx) + jmp .Lmulx4x_outer + +.align 32 +.Lmulx4x_outer: + leaq 16-256(%rbx),%r10 + pxor %xmm4,%xmm4 +.byte 0x67,0x67 + pxor %xmm5,%xmm5 + movdqa -128(%rdi),%xmm0 + movdqa -112(%rdi),%xmm1 + movdqa -96(%rdi),%xmm2 + pand 256(%r10),%xmm0 + movdqa -80(%rdi),%xmm3 + pand 272(%r10),%xmm1 + por %xmm0,%xmm4 + pand 288(%r10),%xmm2 + por %xmm1,%xmm5 + pand 304(%r10),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa -64(%rdi),%xmm0 + movdqa -48(%rdi),%xmm1 + movdqa -32(%rdi),%xmm2 + pand 320(%r10),%xmm0 + movdqa -16(%rdi),%xmm3 + pand 336(%r10),%xmm1 + por %xmm0,%xmm4 + pand 352(%r10),%xmm2 + por %xmm1,%xmm5 + pand 368(%r10),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa 0(%rdi),%xmm0 + movdqa 16(%rdi),%xmm1 + movdqa 32(%rdi),%xmm2 + pand 384(%r10),%xmm0 + movdqa 48(%rdi),%xmm3 + pand 400(%r10),%xmm1 + por %xmm0,%xmm4 + pand 416(%r10),%xmm2 + por %xmm1,%xmm5 + pand 432(%r10),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa 64(%rdi),%xmm0 + movdqa 80(%rdi),%xmm1 + movdqa 96(%rdi),%xmm2 + pand 448(%r10),%xmm0 + movdqa 112(%rdi),%xmm3 + pand 464(%r10),%xmm1 + por %xmm0,%xmm4 + pand 480(%r10),%xmm2 + por %xmm1,%xmm5 + pand 496(%r10),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + por %xmm5,%xmm4 + + pshufd $0x4e,%xmm4,%xmm0 + por %xmm4,%xmm0 + leaq 256(%rdi),%rdi +.byte 102,72,15,126,194 + + movq %rbp,(%rbx) + leaq 32(%rbx,%rax,1),%rbx + mulxq 0(%rsi),%r8,%r11 + xorq %rbp,%rbp + movq %rdx,%r9 + mulxq 8(%rsi),%r14,%r12 + adoxq -32(%rbx),%r8 + adcxq %r14,%r11 + mulxq 16(%rsi),%r15,%r13 + adoxq -24(%rbx),%r11 + adcxq %r15,%r12 + mulxq 24(%rsi),%rdx,%r14 + adoxq -16(%rbx),%r12 + adcxq %rdx,%r13 + leaq (%rcx,%rax,1),%rcx + leaq 32(%rsi),%rsi + adoxq -8(%rbx),%r13 + adcxq %rbp,%r14 + adoxq %rbp,%r14 + + movq %r8,%r15 + imulq 32+8(%rsp),%r8 + + movq %r8,%rdx + xorq %rbp,%rbp + movq %rdi,8+8(%rsp) + + mulxq 0(%rcx),%rax,%r10 + adcxq %rax,%r15 + adoxq %r11,%r10 + mulxq 8(%rcx),%rax,%r11 + adcxq %rax,%r10 + adoxq %r12,%r11 + mulxq 16(%rcx),%rax,%r12 + adcxq %rax,%r11 + adoxq %r13,%r12 + mulxq 24(%rcx),%rax,%r15 + movq %r9,%rdx + movq 24+8(%rsp),%rdi + movq %r10,-32(%rbx) + adcxq %rax,%r12 + movq %r11,-24(%rbx) + adoxq %rbp,%r15 + movq %r12,-16(%rbx) + leaq 32(%rcx),%rcx + jmp .Lmulx4x_inner + +.align 32 +.Lmulx4x_inner: + mulxq 0(%rsi),%r10,%rax + adcxq %rbp,%r15 + adoxq %r14,%r10 + mulxq 8(%rsi),%r11,%r14 + adcxq 0(%rbx),%r10 + adoxq %rax,%r11 + mulxq 16(%rsi),%r12,%rax + adcxq 8(%rbx),%r11 + adoxq %r14,%r12 + mulxq 24(%rsi),%r13,%r14 + movq %r8,%rdx + adcxq 16(%rbx),%r12 + adoxq %rax,%r13 + adcxq 24(%rbx),%r13 + adoxq %rbp,%r14 + leaq 32(%rsi),%rsi + leaq 32(%rbx),%rbx + adcxq %rbp,%r14 + + adoxq %r15,%r10 + mulxq 0(%rcx),%rax,%r15 + adcxq %rax,%r10 + adoxq %r15,%r11 + mulxq 8(%rcx),%rax,%r15 + adcxq %rax,%r11 + adoxq %r15,%r12 + mulxq 16(%rcx),%rax,%r15 + movq %r10,-40(%rbx) + adcxq %rax,%r12 + adoxq %r15,%r13 + movq %r11,-32(%rbx) + mulxq 24(%rcx),%rax,%r15 + movq %r9,%rdx + leaq 32(%rcx),%rcx + movq %r12,-24(%rbx) + adcxq %rax,%r13 + adoxq %rbp,%r15 + movq %r13,-16(%rbx) + + decq %rdi + jnz .Lmulx4x_inner + + movq 0+8(%rsp),%rax + adcq %rbp,%r15 + subq 0(%rbx),%rdi + movq 8+8(%rsp),%rdi + movq 16+8(%rsp),%r10 + adcq %r15,%r14 + leaq (%rsi,%rax,1),%rsi + adcq %rbp,%rbp + movq %r14,-8(%rbx) + + cmpq 
%r10,%rdi + jb .Lmulx4x_outer + + movq -8(%rcx),%r10 + movq %rbp,%r8 + movq (%rcx,%rax,1),%r12 + leaq (%rcx,%rax,1),%rbp + movq %rax,%rcx + leaq (%rbx,%rax,1),%rdi + xorl %eax,%eax + xorq %r15,%r15 + subq %r14,%r10 + adcq %r15,%r15 + orq %r15,%r8 + sarq $3+2,%rcx + subq %r8,%rax + movq 56+8(%rsp),%rdx + decq %r12 + movq 8(%rbp),%r13 + xorq %r8,%r8 + movq 16(%rbp),%r14 + movq 24(%rbp),%r15 + jmp .Lsqrx4x_sub_entry +.cfi_endproc +.size mulx4x_internal,.-mulx4x_internal +.globl bn_powerx5 +.hidden bn_powerx5 +.type bn_powerx5,@function +.align 32 +bn_powerx5: +.cfi_startproc +_CET_ENDBR + movq %rsp,%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 +.Lpowerx5_prologue: + + + + + shll $3,%r9d + leaq (%r9,%r9,2),%r10 + negq %r9 + movq (%r8),%r8 + + + + + + + + + leaq -320(%rsp,%r9,2),%r11 + movq %rsp,%rbp + subq %rdi,%r11 + andq $4095,%r11 + cmpq %r11,%r10 + jb .Lpwrx_sp_alt + subq %r11,%rbp + leaq -320(%rbp,%r9,2),%rbp + jmp .Lpwrx_sp_done + +.align 32 +.Lpwrx_sp_alt: + leaq 4096-320(,%r9,2),%r10 + leaq -320(%rbp,%r9,2),%rbp + subq %r10,%r11 + movq $0,%r10 + cmovcq %r10,%r11 + subq %r11,%rbp +.Lpwrx_sp_done: + andq $-64,%rbp + movq %rsp,%r11 + subq %rbp,%r11 + andq $-4096,%r11 + leaq (%r11,%rbp,1),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja .Lpwrx_page_walk + jmp .Lpwrx_page_walk_done + +.Lpwrx_page_walk: + leaq -4096(%rsp),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja .Lpwrx_page_walk +.Lpwrx_page_walk_done: + + movq %r9,%r10 + negq %r9 + + + + + + + + + + + + + pxor %xmm0,%xmm0 +.byte 102,72,15,110,207 +.byte 102,72,15,110,209 +.byte 102,73,15,110,218 +.byte 102,72,15,110,226 + movq %r8,32(%rsp) + movq %rax,40(%rsp) +.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08 +.Lpowerx5_body: + + call __bn_sqrx8x_internal + call __bn_postx4x_internal + call __bn_sqrx8x_internal + call __bn_postx4x_internal + call __bn_sqrx8x_internal + call __bn_postx4x_internal + call __bn_sqrx8x_internal + call __bn_postx4x_internal + call __bn_sqrx8x_internal + call __bn_postx4x_internal + + movq %r10,%r9 + movq %rsi,%rdi +.byte 102,72,15,126,209 +.byte 102,72,15,126,226 + movq 40(%rsp),%rax + + call mulx4x_internal + + movq 40(%rsp),%rsi +.cfi_def_cfa %rsi,8 + movq $1,%rax + + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lpowerx5_epilogue: + ret +.cfi_endproc +.size bn_powerx5,.-bn_powerx5 + +.globl bn_sqrx8x_internal +.hidden bn_sqrx8x_internal +.hidden bn_sqrx8x_internal +.type bn_sqrx8x_internal,@function +.align 32 +bn_sqrx8x_internal: +__bn_sqrx8x_internal: +.cfi_startproc +_CET_ENDBR + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + leaq 48+8(%rsp),%rdi + leaq (%rsi,%r9,1),%rbp + movq %r9,0+8(%rsp) + movq %rbp,8+8(%rsp) + jmp .Lsqr8x_zero_start + +.align 32 +.byte 0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00 +.Lsqrx8x_zero: +.byte 0x3e + movdqa %xmm0,0(%rdi) + movdqa %xmm0,16(%rdi) + movdqa %xmm0,32(%rdi) + movdqa %xmm0,48(%rdi) +.Lsqr8x_zero_start: + movdqa %xmm0,64(%rdi) + movdqa %xmm0,80(%rdi) + movdqa %xmm0,96(%rdi) + movdqa %xmm0,112(%rdi) + leaq 128(%rdi),%rdi + subq $64,%r9 + jnz .Lsqrx8x_zero + + movq 0(%rsi),%rdx + + xorq 
%r10,%r10 + xorq %r11,%r11 + xorq %r12,%r12 + xorq %r13,%r13 + xorq %r14,%r14 + xorq %r15,%r15 + leaq 48+8(%rsp),%rdi + xorq %rbp,%rbp + jmp .Lsqrx8x_outer_loop + +.align 32 +.Lsqrx8x_outer_loop: + mulxq 8(%rsi),%r8,%rax + adcxq %r9,%r8 + adoxq %rax,%r10 + mulxq 16(%rsi),%r9,%rax + adcxq %r10,%r9 + adoxq %rax,%r11 +.byte 0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00 + adcxq %r11,%r10 + adoxq %rax,%r12 +.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00 + adcxq %r12,%r11 + adoxq %rax,%r13 + mulxq 40(%rsi),%r12,%rax + adcxq %r13,%r12 + adoxq %rax,%r14 + mulxq 48(%rsi),%r13,%rax + adcxq %r14,%r13 + adoxq %r15,%rax + mulxq 56(%rsi),%r14,%r15 + movq 8(%rsi),%rdx + adcxq %rax,%r14 + adoxq %rbp,%r15 + adcq 64(%rdi),%r15 + movq %r8,8(%rdi) + movq %r9,16(%rdi) + sbbq %rcx,%rcx + xorq %rbp,%rbp + + + mulxq 16(%rsi),%r8,%rbx + mulxq 24(%rsi),%r9,%rax + adcxq %r10,%r8 + adoxq %rbx,%r9 + mulxq 32(%rsi),%r10,%rbx + adcxq %r11,%r9 + adoxq %rax,%r10 +.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00 + adcxq %r12,%r10 + adoxq %rbx,%r11 +.byte 0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00 + adcxq %r13,%r11 + adoxq %r14,%r12 +.byte 0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00 + movq 16(%rsi),%rdx + adcxq %rax,%r12 + adoxq %rbx,%r13 + adcxq %r15,%r13 + adoxq %rbp,%r14 + adcxq %rbp,%r14 + + movq %r8,24(%rdi) + movq %r9,32(%rdi) + + mulxq 24(%rsi),%r8,%rbx + mulxq 32(%rsi),%r9,%rax + adcxq %r10,%r8 + adoxq %rbx,%r9 + mulxq 40(%rsi),%r10,%rbx + adcxq %r11,%r9 + adoxq %rax,%r10 +.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00 + adcxq %r12,%r10 + adoxq %r13,%r11 +.byte 0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00 +.byte 0x3e + movq 24(%rsi),%rdx + adcxq %rbx,%r11 + adoxq %rax,%r12 + adcxq %r14,%r12 + movq %r8,40(%rdi) + movq %r9,48(%rdi) + mulxq 32(%rsi),%r8,%rax + adoxq %rbp,%r13 + adcxq %rbp,%r13 + + mulxq 40(%rsi),%r9,%rbx + adcxq %r10,%r8 + adoxq %rax,%r9 + mulxq 48(%rsi),%r10,%rax + adcxq %r11,%r9 + adoxq %r12,%r10 + mulxq 56(%rsi),%r11,%r12 + movq 32(%rsi),%rdx + movq 40(%rsi),%r14 + adcxq %rbx,%r10 + adoxq %rax,%r11 + movq 48(%rsi),%r15 + adcxq %r13,%r11 + adoxq %rbp,%r12 + adcxq %rbp,%r12 + + movq %r8,56(%rdi) + movq %r9,64(%rdi) + + mulxq %r14,%r9,%rax + movq 56(%rsi),%r8 + adcxq %r10,%r9 + mulxq %r15,%r10,%rbx + adoxq %rax,%r10 + adcxq %r11,%r10 + mulxq %r8,%r11,%rax + movq %r14,%rdx + adoxq %rbx,%r11 + adcxq %r12,%r11 + + adcxq %rbp,%rax + + mulxq %r15,%r14,%rbx + mulxq %r8,%r12,%r13 + movq %r15,%rdx + leaq 64(%rsi),%rsi + adcxq %r14,%r11 + adoxq %rbx,%r12 + adcxq %rax,%r12 + adoxq %rbp,%r13 + +.byte 0x67,0x67 + mulxq %r8,%r8,%r14 + adcxq %r8,%r13 + adcxq %rbp,%r14 + + cmpq 8+8(%rsp),%rsi + je .Lsqrx8x_outer_break + + negq %rcx + movq $-8,%rcx + movq %rbp,%r15 + movq 64(%rdi),%r8 + adcxq 72(%rdi),%r9 + adcxq 80(%rdi),%r10 + adcxq 88(%rdi),%r11 + adcq 96(%rdi),%r12 + adcq 104(%rdi),%r13 + adcq 112(%rdi),%r14 + adcq 120(%rdi),%r15 + leaq (%rsi),%rbp + leaq 128(%rdi),%rdi + sbbq %rax,%rax + + movq -64(%rsi),%rdx + movq %rax,16+8(%rsp) + movq %rdi,24+8(%rsp) + + + xorl %eax,%eax + jmp .Lsqrx8x_loop + +.align 32 +.Lsqrx8x_loop: + movq %r8,%rbx + mulxq 0(%rbp),%rax,%r8 + adcxq %rax,%rbx + adoxq %r9,%r8 + + mulxq 8(%rbp),%rax,%r9 + adcxq %rax,%r8 + adoxq %r10,%r9 + + mulxq 16(%rbp),%rax,%r10 + adcxq %rax,%r9 + adoxq %r11,%r10 + + mulxq 24(%rbp),%rax,%r11 + adcxq %rax,%r10 + adoxq %r12,%r11 + +.byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 + adcxq %rax,%r11 + adoxq %r13,%r12 + + mulxq 40(%rbp),%rax,%r13 + adcxq %rax,%r12 + adoxq %r14,%r13 + + mulxq 48(%rbp),%rax,%r14 + movq %rbx,(%rdi,%rcx,8) + movl 
$0,%ebx + adcxq %rax,%r13 + adoxq %r15,%r14 + +.byte 0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00 + movq 8(%rsi,%rcx,8),%rdx + adcxq %rax,%r14 + adoxq %rbx,%r15 + adcxq %rbx,%r15 + +.byte 0x67 + incq %rcx + jnz .Lsqrx8x_loop + + leaq 64(%rbp),%rbp + movq $-8,%rcx + cmpq 8+8(%rsp),%rbp + je .Lsqrx8x_break + + subq 16+8(%rsp),%rbx +.byte 0x66 + movq -64(%rsi),%rdx + adcxq 0(%rdi),%r8 + adcxq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq 48(%rdi),%r14 + adcq 56(%rdi),%r15 + leaq 64(%rdi),%rdi +.byte 0x67 + sbbq %rax,%rax + xorl %ebx,%ebx + movq %rax,16+8(%rsp) + jmp .Lsqrx8x_loop + +.align 32 +.Lsqrx8x_break: + xorq %rbp,%rbp + subq 16+8(%rsp),%rbx + adcxq %rbp,%r8 + movq 24+8(%rsp),%rcx + adcxq %rbp,%r9 + movq 0(%rsi),%rdx + adcq $0,%r10 + movq %r8,0(%rdi) + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + cmpq %rcx,%rdi + je .Lsqrx8x_outer_loop + + movq %r9,8(%rdi) + movq 8(%rcx),%r9 + movq %r10,16(%rdi) + movq 16(%rcx),%r10 + movq %r11,24(%rdi) + movq 24(%rcx),%r11 + movq %r12,32(%rdi) + movq 32(%rcx),%r12 + movq %r13,40(%rdi) + movq 40(%rcx),%r13 + movq %r14,48(%rdi) + movq 48(%rcx),%r14 + movq %r15,56(%rdi) + movq 56(%rcx),%r15 + movq %rcx,%rdi + jmp .Lsqrx8x_outer_loop + +.align 32 +.Lsqrx8x_outer_break: + movq %r9,72(%rdi) +.byte 102,72,15,126,217 + movq %r10,80(%rdi) + movq %r11,88(%rdi) + movq %r12,96(%rdi) + movq %r13,104(%rdi) + movq %r14,112(%rdi) + leaq 48+8(%rsp),%rdi + movq (%rsi,%rcx,1),%rdx + + movq 8(%rdi),%r11 + xorq %r10,%r10 + movq 0+8(%rsp),%r9 + adoxq %r11,%r11 + movq 16(%rdi),%r12 + movq 24(%rdi),%r13 + + +.align 32 +.Lsqrx4x_shift_n_add: + mulxq %rdx,%rax,%rbx + adoxq %r12,%r12 + adcxq %r10,%rax +.byte 0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00 +.byte 0x4c,0x8b,0x97,0x20,0x00,0x00,0x00 + adoxq %r13,%r13 + adcxq %r11,%rbx + movq 40(%rdi),%r11 + movq %rax,0(%rdi) + movq %rbx,8(%rdi) + + mulxq %rdx,%rax,%rbx + adoxq %r10,%r10 + adcxq %r12,%rax + movq 16(%rsi,%rcx,1),%rdx + movq 48(%rdi),%r12 + adoxq %r11,%r11 + adcxq %r13,%rbx + movq 56(%rdi),%r13 + movq %rax,16(%rdi) + movq %rbx,24(%rdi) + + mulxq %rdx,%rax,%rbx + adoxq %r12,%r12 + adcxq %r10,%rax + movq 24(%rsi,%rcx,1),%rdx + leaq 32(%rcx),%rcx + movq 64(%rdi),%r10 + adoxq %r13,%r13 + adcxq %r11,%rbx + movq 72(%rdi),%r11 + movq %rax,32(%rdi) + movq %rbx,40(%rdi) + + mulxq %rdx,%rax,%rbx + adoxq %r10,%r10 + adcxq %r12,%rax + jrcxz .Lsqrx4x_shift_n_add_break +.byte 0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00 + adoxq %r11,%r11 + adcxq %r13,%rbx + movq 80(%rdi),%r12 + movq 88(%rdi),%r13 + movq %rax,48(%rdi) + movq %rbx,56(%rdi) + leaq 64(%rdi),%rdi + nop + jmp .Lsqrx4x_shift_n_add + +.align 32 +.Lsqrx4x_shift_n_add_break: + adcxq %r13,%rbx + movq %rax,48(%rdi) + movq %rbx,56(%rdi) + leaq 64(%rdi),%rdi +.byte 102,72,15,126,213 +__bn_sqrx8x_reduction: + xorl %eax,%eax + movq 32+8(%rsp),%rbx + movq 48+8(%rsp),%rdx + leaq -64(%rbp,%r9,1),%rcx + + movq %rcx,0+8(%rsp) + movq %rdi,8+8(%rsp) + + leaq 48+8(%rsp),%rdi + jmp .Lsqrx8x_reduction_loop + +.align 32 +.Lsqrx8x_reduction_loop: + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + movq 32(%rdi),%r12 + movq %rdx,%r8 + imulq %rbx,%rdx + movq 40(%rdi),%r13 + movq 48(%rdi),%r14 + movq 56(%rdi),%r15 + movq %rax,24+8(%rsp) + + leaq 64(%rdi),%rdi + xorq %rsi,%rsi + movq $-8,%rcx + jmp .Lsqrx8x_reduce + +.align 32 +.Lsqrx8x_reduce: + movq %r8,%rbx + mulxq 0(%rbp),%rax,%r8 + adcxq %rbx,%rax + adoxq %r9,%r8 + + mulxq 8(%rbp),%rbx,%r9 + adcxq %rbx,%r8 + adoxq %r10,%r9 + + mulxq 16(%rbp),%rbx,%r10 + 
adcxq %rbx,%r9 + adoxq %r11,%r10 + + mulxq 24(%rbp),%rbx,%r11 + adcxq %rbx,%r10 + adoxq %r12,%r11 + +.byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 + movq %rdx,%rax + movq %r8,%rdx + adcxq %rbx,%r11 + adoxq %r13,%r12 + + mulxq 32+8(%rsp),%rbx,%rdx + movq %rax,%rdx + movq %rax,64+48+8(%rsp,%rcx,8) + + mulxq 40(%rbp),%rax,%r13 + adcxq %rax,%r12 + adoxq %r14,%r13 + + mulxq 48(%rbp),%rax,%r14 + adcxq %rax,%r13 + adoxq %r15,%r14 + + mulxq 56(%rbp),%rax,%r15 + movq %rbx,%rdx + adcxq %rax,%r14 + adoxq %rsi,%r15 + adcxq %rsi,%r15 + +.byte 0x67,0x67,0x67 + incq %rcx + jnz .Lsqrx8x_reduce + + movq %rsi,%rax + cmpq 0+8(%rsp),%rbp + jae .Lsqrx8x_no_tail + + movq 48+8(%rsp),%rdx + addq 0(%rdi),%r8 + leaq 64(%rbp),%rbp + movq $-8,%rcx + adcxq 8(%rdi),%r9 + adcxq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq 48(%rdi),%r14 + adcq 56(%rdi),%r15 + leaq 64(%rdi),%rdi + sbbq %rax,%rax + + xorq %rsi,%rsi + movq %rax,16+8(%rsp) + jmp .Lsqrx8x_tail + +.align 32 +.Lsqrx8x_tail: + movq %r8,%rbx + mulxq 0(%rbp),%rax,%r8 + adcxq %rax,%rbx + adoxq %r9,%r8 + + mulxq 8(%rbp),%rax,%r9 + adcxq %rax,%r8 + adoxq %r10,%r9 + + mulxq 16(%rbp),%rax,%r10 + adcxq %rax,%r9 + adoxq %r11,%r10 + + mulxq 24(%rbp),%rax,%r11 + adcxq %rax,%r10 + adoxq %r12,%r11 + +.byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 + adcxq %rax,%r11 + adoxq %r13,%r12 + + mulxq 40(%rbp),%rax,%r13 + adcxq %rax,%r12 + adoxq %r14,%r13 + + mulxq 48(%rbp),%rax,%r14 + adcxq %rax,%r13 + adoxq %r15,%r14 + + mulxq 56(%rbp),%rax,%r15 + movq 72+48+8(%rsp,%rcx,8),%rdx + adcxq %rax,%r14 + adoxq %rsi,%r15 + movq %rbx,(%rdi,%rcx,8) + movq %r8,%rbx + adcxq %rsi,%r15 + + incq %rcx + jnz .Lsqrx8x_tail + + cmpq 0+8(%rsp),%rbp + jae .Lsqrx8x_tail_done + + subq 16+8(%rsp),%rsi + movq 48+8(%rsp),%rdx + leaq 64(%rbp),%rbp + adcq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq 48(%rdi),%r14 + adcq 56(%rdi),%r15 + leaq 64(%rdi),%rdi + sbbq %rax,%rax + subq $8,%rcx + + xorq %rsi,%rsi + movq %rax,16+8(%rsp) + jmp .Lsqrx8x_tail + +.align 32 +.Lsqrx8x_tail_done: + xorq %rax,%rax + addq 24+8(%rsp),%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + adcq $0,%rax + + subq 16+8(%rsp),%rsi +.Lsqrx8x_no_tail: + adcq 0(%rdi),%r8 +.byte 102,72,15,126,217 + adcq 8(%rdi),%r9 + movq 56(%rbp),%rsi +.byte 102,72,15,126,213 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq 48(%rdi),%r14 + adcq 56(%rdi),%r15 + adcq $0,%rax + + movq 32+8(%rsp),%rbx + movq 64(%rdi,%rcx,1),%rdx + + movq %r8,0(%rdi) + leaq 64(%rdi),%r8 + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq %r14,48(%rdi) + movq %r15,56(%rdi) + + leaq 64(%rdi,%rcx,1),%rdi + cmpq 8+8(%rsp),%r8 + jb .Lsqrx8x_reduction_loop + ret +.cfi_endproc +.size bn_sqrx8x_internal,.-bn_sqrx8x_internal +.align 32 +.type __bn_postx4x_internal,@function +__bn_postx4x_internal: +.cfi_startproc + movq 0(%rbp),%r12 + movq %rcx,%r10 + movq %rcx,%r9 + negq %rax + sarq $3+2,%rcx + +.byte 102,72,15,126,202 +.byte 102,72,15,126,206 + decq %r12 + movq 8(%rbp),%r13 + xorq %r8,%r8 + movq 16(%rbp),%r14 + movq 24(%rbp),%r15 + jmp .Lsqrx4x_sub_entry + +.align 16 +.Lsqrx4x_sub: + movq 0(%rbp),%r12 + movq 8(%rbp),%r13 + movq 16(%rbp),%r14 + movq 24(%rbp),%r15 +.Lsqrx4x_sub_entry: + andnq %rax,%r12,%r12 + leaq 32(%rbp),%rbp + andnq %rax,%r13,%r13 + andnq %rax,%r14,%r14 + andnq %rax,%r15,%r15 + + negq %r8 + 
adcq 0(%rdi),%r12 + adcq 8(%rdi),%r13 + adcq 16(%rdi),%r14 + adcq 24(%rdi),%r15 + movq %r12,0(%rdx) + leaq 32(%rdi),%rdi + movq %r13,8(%rdx) + sbbq %r8,%r8 + movq %r14,16(%rdx) + movq %r15,24(%rdx) + leaq 32(%rdx),%rdx + + incq %rcx + jnz .Lsqrx4x_sub + + negq %r9 + + ret +.cfi_endproc +.size __bn_postx4x_internal,.-__bn_postx4x_internal +.globl bn_scatter5 +.hidden bn_scatter5 +.type bn_scatter5,@function +.align 16 +bn_scatter5: +.cfi_startproc +_CET_ENDBR + cmpl $0,%esi + jz .Lscatter_epilogue + + + + + + + + + + leaq (%rdx,%rcx,8),%rdx +.Lscatter: + movq (%rdi),%rax + leaq 8(%rdi),%rdi + movq %rax,(%rdx) + leaq 256(%rdx),%rdx + subl $1,%esi + jnz .Lscatter +.Lscatter_epilogue: + ret +.cfi_endproc +.size bn_scatter5,.-bn_scatter5 + +.globl bn_gather5 +.hidden bn_gather5 +.type bn_gather5,@function +.align 32 +bn_gather5: +.cfi_startproc +.LSEH_begin_bn_gather5: +_CET_ENDBR + +.byte 0x4c,0x8d,0x14,0x24 +.cfi_def_cfa_register %r10 +.byte 0x48,0x81,0xec,0x08,0x01,0x00,0x00 + leaq .Linc(%rip),%rax + andq $-16,%rsp + + movd %ecx,%xmm5 + movdqa 0(%rax),%xmm0 + movdqa 16(%rax),%xmm1 + leaq 128(%rdx),%r11 + leaq 128(%rsp),%rax + + pshufd $0,%xmm5,%xmm5 + movdqa %xmm1,%xmm4 + movdqa %xmm1,%xmm2 + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm4,%xmm3 + + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,-128(%rax) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,-112(%rax) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,-96(%rax) + movdqa %xmm4,%xmm2 + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,-80(%rax) + movdqa %xmm4,%xmm3 + + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,-64(%rax) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,-48(%rax) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,-32(%rax) + movdqa %xmm4,%xmm2 + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,-16(%rax) + movdqa %xmm4,%xmm3 + + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,0(%rax) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,16(%rax) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,32(%rax) + movdqa %xmm4,%xmm2 + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,48(%rax) + movdqa %xmm4,%xmm3 + + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,64(%rax) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,80(%rax) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,96(%rax) + movdqa %xmm4,%xmm2 + movdqa %xmm3,112(%rax) + jmp .Lgather + +.align 32 +.Lgather: + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + movdqa -128(%r11),%xmm0 + movdqa -112(%r11),%xmm1 + movdqa -96(%r11),%xmm2 + pand -128(%rax),%xmm0 + movdqa -80(%r11),%xmm3 + pand -112(%rax),%xmm1 + por %xmm0,%xmm4 + pand -96(%rax),%xmm2 + por %xmm1,%xmm5 + pand -80(%rax),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa -64(%r11),%xmm0 + movdqa -48(%r11),%xmm1 + movdqa -32(%r11),%xmm2 + pand -64(%rax),%xmm0 + movdqa -16(%r11),%xmm3 + pand -48(%rax),%xmm1 + por %xmm0,%xmm4 + pand -32(%rax),%xmm2 + por %xmm1,%xmm5 + pand -16(%rax),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa 0(%r11),%xmm0 + movdqa 16(%r11),%xmm1 + movdqa 32(%r11),%xmm2 + pand 0(%rax),%xmm0 + movdqa 48(%r11),%xmm3 + pand 16(%rax),%xmm1 + por %xmm0,%xmm4 + pand 32(%rax),%xmm2 + por %xmm1,%xmm5 + pand 48(%rax),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa 64(%r11),%xmm0 + movdqa 80(%r11),%xmm1 + movdqa 
96(%r11),%xmm2 + pand 64(%rax),%xmm0 + movdqa 112(%r11),%xmm3 + pand 80(%rax),%xmm1 + por %xmm0,%xmm4 + pand 96(%rax),%xmm2 + por %xmm1,%xmm5 + pand 112(%rax),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + por %xmm5,%xmm4 + leaq 256(%r11),%r11 + + pshufd $0x4e,%xmm4,%xmm0 + por %xmm4,%xmm0 + movq %xmm0,(%rdi) + leaq 8(%rdi),%rdi + subl $1,%esi + jnz .Lgather + + leaq (%r10),%rsp +.cfi_def_cfa_register %rsp + ret +.LSEH_end_bn_gather5: +.cfi_endproc +.size bn_gather5,.-bn_gather5 +.section .rodata +.align 64 +.Linc: +.long 0,0, 1,1 +.long 2,2, 2,2 +.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.text +#endif diff --git a/ring-0.17.14/pregenerated/x86_64-mont5-macosx.S b/ring-0.17.14/pregenerated/x86_64-mont5-macosx.S new file mode 100644 index 0000000000..dd17dc90c2 --- /dev/null +++ b/ring-0.17.14/pregenerated/x86_64-mont5-macosx.S @@ -0,0 +1,3188 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__) +.text + +.globl _bn_mul4x_mont_gather5 +.private_extern _bn_mul4x_mont_gather5 + +.p2align 5 +_bn_mul4x_mont_gather5: + +_CET_ENDBR +.byte 0x67 + movq %rsp,%rax + + pushq %rbx + + pushq %rbp + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + +L$mul4x_prologue: + +.byte 0x67 + + + + shll $3,%r9d + leaq (%r9,%r9,2),%r10 + negq %r9 + + + + + + + + + + + leaq -320(%rsp,%r9,2),%r11 + movq %rsp,%rbp + subq %rdi,%r11 + andq $4095,%r11 + cmpq %r11,%r10 + jb L$mul4xsp_alt + subq %r11,%rbp + leaq -320(%rbp,%r9,2),%rbp + jmp L$mul4xsp_done + +.p2align 5 +L$mul4xsp_alt: + leaq 4096-320(,%r9,2),%r10 + leaq -320(%rbp,%r9,2),%rbp + subq %r10,%r11 + movq $0,%r10 + cmovcq %r10,%r11 + subq %r11,%rbp +L$mul4xsp_done: + andq $-64,%rbp + movq %rsp,%r11 + subq %rbp,%r11 + andq $-4096,%r11 + leaq (%r11,%rbp,1),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja L$mul4x_page_walk + jmp L$mul4x_page_walk_done + +L$mul4x_page_walk: + leaq -4096(%rsp),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja L$mul4x_page_walk +L$mul4x_page_walk_done: + + negq %r9 + + movq %rax,40(%rsp) + +L$mul4x_body: + + call mul4x_internal + + movq 40(%rsp),%rsi + + movq $1,%rax + + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbp + + movq -8(%rsi),%rbx + + leaq (%rsi),%rsp + +L$mul4x_epilogue: + ret + + + + +.p2align 5 +mul4x_internal: + + shlq $5,%r9 + movd 8(%rax),%xmm5 + leaq L$inc(%rip),%rax + leaq 128(%rdx,%r9,1),%r13 + shrq $5,%r9 + movdqa 0(%rax),%xmm0 + movdqa 16(%rax),%xmm1 + leaq 88-112(%rsp,%r9,1),%r10 + leaq 128(%rdx),%r12 + + pshufd $0,%xmm5,%xmm5 + movdqa %xmm1,%xmm4 +.byte 0x67,0x67 + movdqa %xmm1,%xmm2 + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 +.byte 0x67 + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,112(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,128(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,144(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,160(%r10) + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,176(%r10) + movdqa %xmm4,%xmm0 + + 
paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,192(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,208(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,224(%r10) + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,240(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,256(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,272(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,288(%r10) + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,304(%r10) + + paddd %xmm2,%xmm3 +.byte 0x67 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,320(%r10) + + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,336(%r10) + pand 64(%r12),%xmm0 + + pand 80(%r12),%xmm1 + pand 96(%r12),%xmm2 + movdqa %xmm3,352(%r10) + pand 112(%r12),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + movdqa -128(%r12),%xmm4 + movdqa -112(%r12),%xmm5 + movdqa -96(%r12),%xmm2 + pand 112(%r10),%xmm4 + movdqa -80(%r12),%xmm3 + pand 128(%r10),%xmm5 + por %xmm4,%xmm0 + pand 144(%r10),%xmm2 + por %xmm5,%xmm1 + pand 160(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + movdqa -64(%r12),%xmm4 + movdqa -48(%r12),%xmm5 + movdqa -32(%r12),%xmm2 + pand 176(%r10),%xmm4 + movdqa -16(%r12),%xmm3 + pand 192(%r10),%xmm5 + por %xmm4,%xmm0 + pand 208(%r10),%xmm2 + por %xmm5,%xmm1 + pand 224(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + movdqa 0(%r12),%xmm4 + movdqa 16(%r12),%xmm5 + movdqa 32(%r12),%xmm2 + pand 240(%r10),%xmm4 + movdqa 48(%r12),%xmm3 + pand 256(%r10),%xmm5 + por %xmm4,%xmm0 + pand 272(%r10),%xmm2 + por %xmm5,%xmm1 + pand 288(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + por %xmm1,%xmm0 + + pshufd $0x4e,%xmm0,%xmm1 + por %xmm1,%xmm0 + leaq 256(%r12),%r12 +.byte 102,72,15,126,195 + + movq %r13,16+8(%rsp) + movq %rdi,56+8(%rsp) + + movq (%r8),%r8 + movq (%rsi),%rax + leaq (%rsi,%r9,1),%rsi + negq %r9 + + movq %r8,%rbp + mulq %rbx + movq %rax,%r10 + movq (%rcx),%rax + + imulq %r10,%rbp + leaq 64+8(%rsp),%r14 + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r10 + movq 8(%rsi,%r9,1),%rax + adcq $0,%rdx + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq 8(%rcx),%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq 16(%rsi,%r9,1),%rax + adcq $0,%rdx + addq %r11,%rdi + leaq 32(%r9),%r15 + leaq 32(%rcx),%rcx + adcq $0,%rdx + movq %rdi,(%r14) + movq %rdx,%r13 + jmp L$1st4x + +.p2align 5 +L$1st4x: + mulq %rbx + addq %rax,%r10 + movq -16(%rcx),%rax + leaq 32(%r14),%r14 + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq -8(%rsi,%r15,1),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %r13,-24(%r14) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq -8(%rcx),%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq (%rsi,%r15,1),%rax + adcq $0,%rdx + addq %r11,%rdi + adcq $0,%rdx + movq %rdi,-16(%r14) + movq %rdx,%r13 + + mulq %rbx + addq %rax,%r10 + movq 0(%rcx),%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq 8(%rsi,%r15,1),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %r13,-8(%r14) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq 8(%rcx),%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq 16(%rsi,%r15,1),%rax + adcq $0,%rdx + addq %r11,%rdi + leaq 32(%rcx),%rcx + adcq $0,%rdx + movq %rdi,(%r14) + movq %rdx,%r13 + + addq $32,%r15 + jnz L$1st4x + + mulq %rbx + addq %rax,%r10 + movq -16(%rcx),%rax + leaq 
32(%r14),%r14 + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq -8(%rsi),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %r13,-24(%r14) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq -8(%rcx),%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq (%rsi,%r9,1),%rax + adcq $0,%rdx + addq %r11,%rdi + adcq $0,%rdx + movq %rdi,-16(%r14) + movq %rdx,%r13 + + leaq (%rcx,%r9,1),%rcx + + xorq %rdi,%rdi + addq %r10,%r13 + adcq $0,%rdi + movq %r13,-8(%r14) + + jmp L$outer4x + +.p2align 5 +L$outer4x: + leaq 16+128(%r14),%rdx + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + movdqa -128(%r12),%xmm0 + movdqa -112(%r12),%xmm1 + movdqa -96(%r12),%xmm2 + movdqa -80(%r12),%xmm3 + pand -128(%rdx),%xmm0 + pand -112(%rdx),%xmm1 + por %xmm0,%xmm4 + pand -96(%rdx),%xmm2 + por %xmm1,%xmm5 + pand -80(%rdx),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa -64(%r12),%xmm0 + movdqa -48(%r12),%xmm1 + movdqa -32(%r12),%xmm2 + movdqa -16(%r12),%xmm3 + pand -64(%rdx),%xmm0 + pand -48(%rdx),%xmm1 + por %xmm0,%xmm4 + pand -32(%rdx),%xmm2 + por %xmm1,%xmm5 + pand -16(%rdx),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa 0(%r12),%xmm0 + movdqa 16(%r12),%xmm1 + movdqa 32(%r12),%xmm2 + movdqa 48(%r12),%xmm3 + pand 0(%rdx),%xmm0 + pand 16(%rdx),%xmm1 + por %xmm0,%xmm4 + pand 32(%rdx),%xmm2 + por %xmm1,%xmm5 + pand 48(%rdx),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa 64(%r12),%xmm0 + movdqa 80(%r12),%xmm1 + movdqa 96(%r12),%xmm2 + movdqa 112(%r12),%xmm3 + pand 64(%rdx),%xmm0 + pand 80(%rdx),%xmm1 + por %xmm0,%xmm4 + pand 96(%rdx),%xmm2 + por %xmm1,%xmm5 + pand 112(%rdx),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + por %xmm5,%xmm4 + + pshufd $0x4e,%xmm4,%xmm0 + por %xmm4,%xmm0 + leaq 256(%r12),%r12 +.byte 102,72,15,126,195 + + movq (%r14,%r9,1),%r10 + movq %r8,%rbp + mulq %rbx + addq %rax,%r10 + movq (%rcx),%rax + adcq $0,%rdx + + imulq %r10,%rbp + movq %rdx,%r11 + movq %rdi,(%r14) + + leaq (%r14,%r9,1),%r14 + + mulq %rbp + addq %rax,%r10 + movq 8(%rsi,%r9,1),%rax + adcq $0,%rdx + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq 8(%rcx),%rax + adcq $0,%rdx + addq 8(%r14),%r11 + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq 16(%rsi,%r9,1),%rax + adcq $0,%rdx + addq %r11,%rdi + leaq 32(%r9),%r15 + leaq 32(%rcx),%rcx + adcq $0,%rdx + movq %rdx,%r13 + jmp L$inner4x + +.p2align 5 +L$inner4x: + mulq %rbx + addq %rax,%r10 + movq -16(%rcx),%rax + adcq $0,%rdx + addq 16(%r14),%r10 + leaq 32(%r14),%r14 + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq -8(%rsi,%r15,1),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdi,-32(%r14) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq -8(%rcx),%rax + adcq $0,%rdx + addq -8(%r14),%r11 + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq (%rsi,%r15,1),%rax + adcq $0,%rdx + addq %r11,%rdi + adcq $0,%rdx + movq %r13,-24(%r14) + movq %rdx,%r13 + + mulq %rbx + addq %rax,%r10 + movq 0(%rcx),%rax + adcq $0,%rdx + addq (%r14),%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq 8(%rsi,%r15,1),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdi,-16(%r14) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq 8(%rcx),%rax + adcq $0,%rdx + addq 8(%r14),%r11 + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq 16(%rsi,%r15,1),%rax + adcq $0,%rdx + addq %r11,%rdi + leaq 32(%rcx),%rcx + adcq $0,%rdx + movq %r13,-8(%r14) + movq %rdx,%r13 + + addq $32,%r15 + jnz L$inner4x + + mulq %rbx + addq %rax,%r10 + movq -16(%rcx),%rax + 
adcq $0,%rdx + addq 16(%r14),%r10 + leaq 32(%r14),%r14 + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq -8(%rsi),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdi,-32(%r14) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq %rbp,%rax + movq -8(%rcx),%rbp + adcq $0,%rdx + addq -8(%r14),%r11 + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq (%rsi,%r9,1),%rax + adcq $0,%rdx + addq %r11,%rdi + adcq $0,%rdx + movq %r13,-24(%r14) + movq %rdx,%r13 + + movq %rdi,-16(%r14) + leaq (%rcx,%r9,1),%rcx + + xorq %rdi,%rdi + addq %r10,%r13 + adcq $0,%rdi + addq (%r14),%r13 + adcq $0,%rdi + movq %r13,-8(%r14) + + cmpq 16+8(%rsp),%r12 + jb L$outer4x + xorq %rax,%rax + subq %r13,%rbp + adcq %r15,%r15 + orq %r15,%rdi + subq %rdi,%rax + leaq (%r14,%r9,1),%rbx + movq (%rcx),%r12 + leaq (%rcx),%rbp + movq %r9,%rcx + sarq $3+2,%rcx + movq 56+8(%rsp),%rdi + decq %r12 + xorq %r10,%r10 + movq 8(%rbp),%r13 + movq 16(%rbp),%r14 + movq 24(%rbp),%r15 + jmp L$sqr4x_sub_entry + + +.globl _bn_power5_nohw +.private_extern _bn_power5_nohw + +.p2align 5 +_bn_power5_nohw: + +_CET_ENDBR + movq %rsp,%rax + + pushq %rbx + + pushq %rbp + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + +L$power5_prologue: + + + + + shll $3,%r9d + leal (%r9,%r9,2),%r10d + negq %r9 + movq (%r8),%r8 + + + + + + + + + leaq -320(%rsp,%r9,2),%r11 + movq %rsp,%rbp + subq %rdi,%r11 + andq $4095,%r11 + cmpq %r11,%r10 + jb L$pwr_sp_alt + subq %r11,%rbp + leaq -320(%rbp,%r9,2),%rbp + jmp L$pwr_sp_done + +.p2align 5 +L$pwr_sp_alt: + leaq 4096-320(,%r9,2),%r10 + leaq -320(%rbp,%r9,2),%rbp + subq %r10,%r11 + movq $0,%r10 + cmovcq %r10,%r11 + subq %r11,%rbp +L$pwr_sp_done: + andq $-64,%rbp + movq %rsp,%r11 + subq %rbp,%r11 + andq $-4096,%r11 + leaq (%r11,%rbp,1),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja L$pwr_page_walk + jmp L$pwr_page_walk_done + +L$pwr_page_walk: + leaq -4096(%rsp),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja L$pwr_page_walk +L$pwr_page_walk_done: + + movq %r9,%r10 + negq %r9 + + + + + + + + + + + movq %r8,32(%rsp) + movq %rax,40(%rsp) + +L$power5_body: +.byte 102,72,15,110,207 +.byte 102,72,15,110,209 +.byte 102,73,15,110,218 +.byte 102,72,15,110,226 + + call __bn_sqr8x_internal + call __bn_post4x_internal + call __bn_sqr8x_internal + call __bn_post4x_internal + call __bn_sqr8x_internal + call __bn_post4x_internal + call __bn_sqr8x_internal + call __bn_post4x_internal + call __bn_sqr8x_internal + call __bn_post4x_internal + +.byte 102,72,15,126,209 +.byte 102,72,15,126,226 + movq %rsi,%rdi + movq 40(%rsp),%rax + leaq 32(%rsp),%r8 + + call mul4x_internal + + movq 40(%rsp),%rsi + + movq $1,%rax + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbp + + movq -8(%rsi),%rbx + + leaq (%rsi),%rsp + +L$power5_epilogue: + ret + + + +.globl _bn_sqr8x_internal +.private_extern _bn_sqr8x_internal +.private_extern _bn_sqr8x_internal + +.p2align 5 +_bn_sqr8x_internal: +__bn_sqr8x_internal: + +_CET_ENDBR + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + leaq 32(%r10),%rbp + leaq (%rsi,%r9,1),%rsi + + movq %r9,%rcx + + + movq -32(%rsi,%rbp,1),%r14 + leaq 48+8(%rsp,%r9,2),%rdi + movq -24(%rsi,%rbp,1),%rax + leaq -32(%rdi,%rbp,1),%rdi + movq -16(%rsi,%rbp,1),%rbx + movq %rax,%r15 + + mulq %r14 + movq %rax,%r10 + movq %rbx,%rax + movq %rdx,%r11 + movq %r10,-24(%rdi,%rbp,1) + + mulq %r14 + addq %rax,%r11 + movq %rbx,%rax + adcq 
$0,%rdx + movq %r11,-16(%rdi,%rbp,1) + movq %rdx,%r10 + + + movq -8(%rsi,%rbp,1),%rbx + mulq %r15 + movq %rax,%r12 + movq %rbx,%rax + movq %rdx,%r13 + + leaq (%rbp),%rcx + mulq %r14 + addq %rax,%r10 + movq %rbx,%rax + movq %rdx,%r11 + adcq $0,%r11 + addq %r12,%r10 + adcq $0,%r11 + movq %r10,-8(%rdi,%rcx,1) + jmp L$sqr4x_1st + +.p2align 5 +L$sqr4x_1st: + movq (%rsi,%rcx,1),%rbx + mulq %r15 + addq %rax,%r13 + movq %rbx,%rax + movq %rdx,%r12 + adcq $0,%r12 + + mulq %r14 + addq %rax,%r11 + movq %rbx,%rax + movq 8(%rsi,%rcx,1),%rbx + movq %rdx,%r10 + adcq $0,%r10 + addq %r13,%r11 + adcq $0,%r10 + + + mulq %r15 + addq %rax,%r12 + movq %rbx,%rax + movq %r11,(%rdi,%rcx,1) + movq %rdx,%r13 + adcq $0,%r13 + + mulq %r14 + addq %rax,%r10 + movq %rbx,%rax + movq 16(%rsi,%rcx,1),%rbx + movq %rdx,%r11 + adcq $0,%r11 + addq %r12,%r10 + adcq $0,%r11 + + mulq %r15 + addq %rax,%r13 + movq %rbx,%rax + movq %r10,8(%rdi,%rcx,1) + movq %rdx,%r12 + adcq $0,%r12 + + mulq %r14 + addq %rax,%r11 + movq %rbx,%rax + movq 24(%rsi,%rcx,1),%rbx + movq %rdx,%r10 + adcq $0,%r10 + addq %r13,%r11 + adcq $0,%r10 + + + mulq %r15 + addq %rax,%r12 + movq %rbx,%rax + movq %r11,16(%rdi,%rcx,1) + movq %rdx,%r13 + adcq $0,%r13 + leaq 32(%rcx),%rcx + + mulq %r14 + addq %rax,%r10 + movq %rbx,%rax + movq %rdx,%r11 + adcq $0,%r11 + addq %r12,%r10 + adcq $0,%r11 + movq %r10,-8(%rdi,%rcx,1) + + cmpq $0,%rcx + jne L$sqr4x_1st + + mulq %r15 + addq %rax,%r13 + leaq 16(%rbp),%rbp + adcq $0,%rdx + addq %r11,%r13 + adcq $0,%rdx + + movq %r13,(%rdi) + movq %rdx,%r12 + movq %rdx,8(%rdi) + jmp L$sqr4x_outer + +.p2align 5 +L$sqr4x_outer: + movq -32(%rsi,%rbp,1),%r14 + leaq 48+8(%rsp,%r9,2),%rdi + movq -24(%rsi,%rbp,1),%rax + leaq -32(%rdi,%rbp,1),%rdi + movq -16(%rsi,%rbp,1),%rbx + movq %rax,%r15 + + mulq %r14 + movq -24(%rdi,%rbp,1),%r10 + addq %rax,%r10 + movq %rbx,%rax + adcq $0,%rdx + movq %r10,-24(%rdi,%rbp,1) + movq %rdx,%r11 + + mulq %r14 + addq %rax,%r11 + movq %rbx,%rax + adcq $0,%rdx + addq -16(%rdi,%rbp,1),%r11 + movq %rdx,%r10 + adcq $0,%r10 + movq %r11,-16(%rdi,%rbp,1) + + xorq %r12,%r12 + + movq -8(%rsi,%rbp,1),%rbx + mulq %r15 + addq %rax,%r12 + movq %rbx,%rax + adcq $0,%rdx + addq -8(%rdi,%rbp,1),%r12 + movq %rdx,%r13 + adcq $0,%r13 + + mulq %r14 + addq %rax,%r10 + movq %rbx,%rax + adcq $0,%rdx + addq %r12,%r10 + movq %rdx,%r11 + adcq $0,%r11 + movq %r10,-8(%rdi,%rbp,1) + + leaq (%rbp),%rcx + jmp L$sqr4x_inner + +.p2align 5 +L$sqr4x_inner: + movq (%rsi,%rcx,1),%rbx + mulq %r15 + addq %rax,%r13 + movq %rbx,%rax + movq %rdx,%r12 + adcq $0,%r12 + addq (%rdi,%rcx,1),%r13 + adcq $0,%r12 + +.byte 0x67 + mulq %r14 + addq %rax,%r11 + movq %rbx,%rax + movq 8(%rsi,%rcx,1),%rbx + movq %rdx,%r10 + adcq $0,%r10 + addq %r13,%r11 + adcq $0,%r10 + + mulq %r15 + addq %rax,%r12 + movq %r11,(%rdi,%rcx,1) + movq %rbx,%rax + movq %rdx,%r13 + adcq $0,%r13 + addq 8(%rdi,%rcx,1),%r12 + leaq 16(%rcx),%rcx + adcq $0,%r13 + + mulq %r14 + addq %rax,%r10 + movq %rbx,%rax + adcq $0,%rdx + addq %r12,%r10 + movq %rdx,%r11 + adcq $0,%r11 + movq %r10,-8(%rdi,%rcx,1) + + cmpq $0,%rcx + jne L$sqr4x_inner + +.byte 0x67 + mulq %r15 + addq %rax,%r13 + adcq $0,%rdx + addq %r11,%r13 + adcq $0,%rdx + + movq %r13,(%rdi) + movq %rdx,%r12 + movq %rdx,8(%rdi) + + addq $16,%rbp + jnz L$sqr4x_outer + + + movq -32(%rsi),%r14 + leaq 48+8(%rsp,%r9,2),%rdi + movq -24(%rsi),%rax + leaq -32(%rdi,%rbp,1),%rdi + movq -16(%rsi),%rbx + movq %rax,%r15 + + mulq %r14 + addq %rax,%r10 + movq %rbx,%rax + movq %rdx,%r11 + adcq $0,%r11 + + mulq %r14 + addq %rax,%r11 + movq %rbx,%rax + movq 
%r10,-24(%rdi) + movq %rdx,%r10 + adcq $0,%r10 + addq %r13,%r11 + movq -8(%rsi),%rbx + adcq $0,%r10 + + mulq %r15 + addq %rax,%r12 + movq %rbx,%rax + movq %r11,-16(%rdi) + movq %rdx,%r13 + adcq $0,%r13 + + mulq %r14 + addq %rax,%r10 + movq %rbx,%rax + movq %rdx,%r11 + adcq $0,%r11 + addq %r12,%r10 + adcq $0,%r11 + movq %r10,-8(%rdi) + + mulq %r15 + addq %rax,%r13 + movq -16(%rsi),%rax + adcq $0,%rdx + addq %r11,%r13 + adcq $0,%rdx + + movq %r13,(%rdi) + movq %rdx,%r12 + movq %rdx,8(%rdi) + + mulq %rbx + addq $16,%rbp + xorq %r14,%r14 + subq %r9,%rbp + xorq %r15,%r15 + + addq %r12,%rax + adcq $0,%rdx + movq %rax,8(%rdi) + movq %rdx,16(%rdi) + movq %r15,24(%rdi) + + movq -16(%rsi,%rbp,1),%rax + leaq 48+8(%rsp),%rdi + xorq %r10,%r10 + movq 8(%rdi),%r11 + + leaq (%r14,%r10,2),%r12 + shrq $63,%r10 + leaq (%rcx,%r11,2),%r13 + shrq $63,%r11 + orq %r10,%r13 + movq 16(%rdi),%r10 + movq %r11,%r14 + mulq %rax + negq %r15 + movq 24(%rdi),%r11 + adcq %rax,%r12 + movq -8(%rsi,%rbp,1),%rax + movq %r12,(%rdi) + adcq %rdx,%r13 + + leaq (%r14,%r10,2),%rbx + movq %r13,8(%rdi) + sbbq %r15,%r15 + shrq $63,%r10 + leaq (%rcx,%r11,2),%r8 + shrq $63,%r11 + orq %r10,%r8 + movq 32(%rdi),%r10 + movq %r11,%r14 + mulq %rax + negq %r15 + movq 40(%rdi),%r11 + adcq %rax,%rbx + movq 0(%rsi,%rbp,1),%rax + movq %rbx,16(%rdi) + adcq %rdx,%r8 + leaq 16(%rbp),%rbp + movq %r8,24(%rdi) + sbbq %r15,%r15 + leaq 64(%rdi),%rdi + jmp L$sqr4x_shift_n_add + +.p2align 5 +L$sqr4x_shift_n_add: + leaq (%r14,%r10,2),%r12 + shrq $63,%r10 + leaq (%rcx,%r11,2),%r13 + shrq $63,%r11 + orq %r10,%r13 + movq -16(%rdi),%r10 + movq %r11,%r14 + mulq %rax + negq %r15 + movq -8(%rdi),%r11 + adcq %rax,%r12 + movq -8(%rsi,%rbp,1),%rax + movq %r12,-32(%rdi) + adcq %rdx,%r13 + + leaq (%r14,%r10,2),%rbx + movq %r13,-24(%rdi) + sbbq %r15,%r15 + shrq $63,%r10 + leaq (%rcx,%r11,2),%r8 + shrq $63,%r11 + orq %r10,%r8 + movq 0(%rdi),%r10 + movq %r11,%r14 + mulq %rax + negq %r15 + movq 8(%rdi),%r11 + adcq %rax,%rbx + movq 0(%rsi,%rbp,1),%rax + movq %rbx,-16(%rdi) + adcq %rdx,%r8 + + leaq (%r14,%r10,2),%r12 + movq %r8,-8(%rdi) + sbbq %r15,%r15 + shrq $63,%r10 + leaq (%rcx,%r11,2),%r13 + shrq $63,%r11 + orq %r10,%r13 + movq 16(%rdi),%r10 + movq %r11,%r14 + mulq %rax + negq %r15 + movq 24(%rdi),%r11 + adcq %rax,%r12 + movq 8(%rsi,%rbp,1),%rax + movq %r12,0(%rdi) + adcq %rdx,%r13 + + leaq (%r14,%r10,2),%rbx + movq %r13,8(%rdi) + sbbq %r15,%r15 + shrq $63,%r10 + leaq (%rcx,%r11,2),%r8 + shrq $63,%r11 + orq %r10,%r8 + movq 32(%rdi),%r10 + movq %r11,%r14 + mulq %rax + negq %r15 + movq 40(%rdi),%r11 + adcq %rax,%rbx + movq 16(%rsi,%rbp,1),%rax + movq %rbx,16(%rdi) + adcq %rdx,%r8 + movq %r8,24(%rdi) + sbbq %r15,%r15 + leaq 64(%rdi),%rdi + addq $32,%rbp + jnz L$sqr4x_shift_n_add + + leaq (%r14,%r10,2),%r12 +.byte 0x67 + shrq $63,%r10 + leaq (%rcx,%r11,2),%r13 + shrq $63,%r11 + orq %r10,%r13 + movq -16(%rdi),%r10 + movq %r11,%r14 + mulq %rax + negq %r15 + movq -8(%rdi),%r11 + adcq %rax,%r12 + movq -8(%rsi),%rax + movq %r12,-32(%rdi) + adcq %rdx,%r13 + + leaq (%r14,%r10,2),%rbx + movq %r13,-24(%rdi) + sbbq %r15,%r15 + shrq $63,%r10 + leaq (%rcx,%r11,2),%r8 + shrq $63,%r11 + orq %r10,%r8 + mulq %rax + negq %r15 + adcq %rax,%rbx + adcq %rdx,%r8 + movq %rbx,-16(%rdi) + movq %r8,-8(%rdi) +.byte 102,72,15,126,213 +__bn_sqr8x_reduction: + xorq %rax,%rax + leaq (%r9,%rbp,1),%rcx + leaq 48+8(%rsp,%r9,2),%rdx + movq %rcx,0+8(%rsp) + leaq 48+8(%rsp,%r9,1),%rdi + movq %rdx,8+8(%rsp) + negq %r9 + jmp L$8x_reduction_loop + +.p2align 5 +L$8x_reduction_loop: + leaq (%rdi,%r9,1),%rdi +.byte 
0x66 + movq 0(%rdi),%rbx + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + movq 32(%rdi),%r12 + movq 40(%rdi),%r13 + movq 48(%rdi),%r14 + movq 56(%rdi),%r15 + movq %rax,(%rdx) + leaq 64(%rdi),%rdi + +.byte 0x67 + movq %rbx,%r8 + imulq 32+8(%rsp),%rbx + movq 0(%rbp),%rax + movl $8,%ecx + jmp L$8x_reduce + +.p2align 5 +L$8x_reduce: + mulq %rbx + movq 8(%rbp),%rax + negq %r8 + movq %rdx,%r8 + adcq $0,%r8 + + mulq %rbx + addq %rax,%r9 + movq 16(%rbp),%rax + adcq $0,%rdx + addq %r9,%r8 + movq %rbx,48-8+8(%rsp,%rcx,8) + movq %rdx,%r9 + adcq $0,%r9 + + mulq %rbx + addq %rax,%r10 + movq 24(%rbp),%rax + adcq $0,%rdx + addq %r10,%r9 + movq 32+8(%rsp),%rsi + movq %rdx,%r10 + adcq $0,%r10 + + mulq %rbx + addq %rax,%r11 + movq 32(%rbp),%rax + adcq $0,%rdx + imulq %r8,%rsi + addq %r11,%r10 + movq %rdx,%r11 + adcq $0,%r11 + + mulq %rbx + addq %rax,%r12 + movq 40(%rbp),%rax + adcq $0,%rdx + addq %r12,%r11 + movq %rdx,%r12 + adcq $0,%r12 + + mulq %rbx + addq %rax,%r13 + movq 48(%rbp),%rax + adcq $0,%rdx + addq %r13,%r12 + movq %rdx,%r13 + adcq $0,%r13 + + mulq %rbx + addq %rax,%r14 + movq 56(%rbp),%rax + adcq $0,%rdx + addq %r14,%r13 + movq %rdx,%r14 + adcq $0,%r14 + + mulq %rbx + movq %rsi,%rbx + addq %rax,%r15 + movq 0(%rbp),%rax + adcq $0,%rdx + addq %r15,%r14 + movq %rdx,%r15 + adcq $0,%r15 + + decl %ecx + jnz L$8x_reduce + + leaq 64(%rbp),%rbp + xorq %rax,%rax + movq 8+8(%rsp),%rdx + cmpq 0+8(%rsp),%rbp + jae L$8x_no_tail + +.byte 0x66 + addq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq 48(%rdi),%r14 + adcq 56(%rdi),%r15 + sbbq %rsi,%rsi + + movq 48+56+8(%rsp),%rbx + movl $8,%ecx + movq 0(%rbp),%rax + jmp L$8x_tail + +.p2align 5 +L$8x_tail: + mulq %rbx + addq %rax,%r8 + movq 8(%rbp),%rax + movq %r8,(%rdi) + movq %rdx,%r8 + adcq $0,%r8 + + mulq %rbx + addq %rax,%r9 + movq 16(%rbp),%rax + adcq $0,%rdx + addq %r9,%r8 + leaq 8(%rdi),%rdi + movq %rdx,%r9 + adcq $0,%r9 + + mulq %rbx + addq %rax,%r10 + movq 24(%rbp),%rax + adcq $0,%rdx + addq %r10,%r9 + movq %rdx,%r10 + adcq $0,%r10 + + mulq %rbx + addq %rax,%r11 + movq 32(%rbp),%rax + adcq $0,%rdx + addq %r11,%r10 + movq %rdx,%r11 + adcq $0,%r11 + + mulq %rbx + addq %rax,%r12 + movq 40(%rbp),%rax + adcq $0,%rdx + addq %r12,%r11 + movq %rdx,%r12 + adcq $0,%r12 + + mulq %rbx + addq %rax,%r13 + movq 48(%rbp),%rax + adcq $0,%rdx + addq %r13,%r12 + movq %rdx,%r13 + adcq $0,%r13 + + mulq %rbx + addq %rax,%r14 + movq 56(%rbp),%rax + adcq $0,%rdx + addq %r14,%r13 + movq %rdx,%r14 + adcq $0,%r14 + + mulq %rbx + movq 48-16+8(%rsp,%rcx,8),%rbx + addq %rax,%r15 + adcq $0,%rdx + addq %r15,%r14 + movq 0(%rbp),%rax + movq %rdx,%r15 + adcq $0,%r15 + + decl %ecx + jnz L$8x_tail + + leaq 64(%rbp),%rbp + movq 8+8(%rsp),%rdx + cmpq 0+8(%rsp),%rbp + jae L$8x_tail_done + + movq 48+56+8(%rsp),%rbx + negq %rsi + movq 0(%rbp),%rax + adcq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq 48(%rdi),%r14 + adcq 56(%rdi),%r15 + sbbq %rsi,%rsi + + movl $8,%ecx + jmp L$8x_tail + +.p2align 5 +L$8x_tail_done: + xorq %rax,%rax + addq (%rdx),%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + adcq $0,%rax + + negq %rsi +L$8x_no_tail: + adcq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq 48(%rdi),%r14 + adcq 56(%rdi),%r15 + adcq $0,%rax + movq -8(%rbp),%rcx + xorq %rsi,%rsi + +.byte 102,72,15,126,213 + + 
movq %r8,0(%rdi) + movq %r9,8(%rdi) +.byte 102,73,15,126,217 + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq %r14,48(%rdi) + movq %r15,56(%rdi) + leaq 64(%rdi),%rdi + + cmpq %rdx,%rdi + jb L$8x_reduction_loop + ret + + + +.p2align 5 +__bn_post4x_internal: + + movq 0(%rbp),%r12 + leaq (%rdi,%r9,1),%rbx + movq %r9,%rcx +.byte 102,72,15,126,207 + negq %rax +.byte 102,72,15,126,206 + sarq $3+2,%rcx + decq %r12 + xorq %r10,%r10 + movq 8(%rbp),%r13 + movq 16(%rbp),%r14 + movq 24(%rbp),%r15 + jmp L$sqr4x_sub_entry + +.p2align 4 +L$sqr4x_sub: + movq 0(%rbp),%r12 + movq 8(%rbp),%r13 + movq 16(%rbp),%r14 + movq 24(%rbp),%r15 +L$sqr4x_sub_entry: + leaq 32(%rbp),%rbp + notq %r12 + notq %r13 + notq %r14 + notq %r15 + andq %rax,%r12 + andq %rax,%r13 + andq %rax,%r14 + andq %rax,%r15 + + negq %r10 + adcq 0(%rbx),%r12 + adcq 8(%rbx),%r13 + adcq 16(%rbx),%r14 + adcq 24(%rbx),%r15 + movq %r12,0(%rdi) + leaq 32(%rbx),%rbx + movq %r13,8(%rdi) + sbbq %r10,%r10 + movq %r14,16(%rdi) + movq %r15,24(%rdi) + leaq 32(%rdi),%rdi + + incq %rcx + jnz L$sqr4x_sub + + movq %r9,%r10 + negq %r9 + ret + + +.globl _bn_mulx4x_mont_gather5 +.private_extern _bn_mulx4x_mont_gather5 + +.p2align 5 +_bn_mulx4x_mont_gather5: + +_CET_ENDBR + movq %rsp,%rax + + pushq %rbx + + pushq %rbp + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + +L$mulx4x_prologue: + + + + + shll $3,%r9d + leaq (%r9,%r9,2),%r10 + negq %r9 + movq (%r8),%r8 + + + + + + + + + + + leaq -320(%rsp,%r9,2),%r11 + movq %rsp,%rbp + subq %rdi,%r11 + andq $4095,%r11 + cmpq %r11,%r10 + jb L$mulx4xsp_alt + subq %r11,%rbp + leaq -320(%rbp,%r9,2),%rbp + jmp L$mulx4xsp_done + +L$mulx4xsp_alt: + leaq 4096-320(,%r9,2),%r10 + leaq -320(%rbp,%r9,2),%rbp + subq %r10,%r11 + movq $0,%r10 + cmovcq %r10,%r11 + subq %r11,%rbp +L$mulx4xsp_done: + andq $-64,%rbp + movq %rsp,%r11 + subq %rbp,%r11 + andq $-4096,%r11 + leaq (%r11,%rbp,1),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja L$mulx4x_page_walk + jmp L$mulx4x_page_walk_done + +L$mulx4x_page_walk: + leaq -4096(%rsp),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja L$mulx4x_page_walk +L$mulx4x_page_walk_done: + + + + + + + + + + + + + + movq %r8,32(%rsp) + movq %rax,40(%rsp) + +L$mulx4x_body: + call mulx4x_internal + + movq 40(%rsp),%rsi + + movq $1,%rax + + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbp + + movq -8(%rsi),%rbx + + leaq (%rsi),%rsp + +L$mulx4x_epilogue: + ret + + + + +.p2align 5 +mulx4x_internal: + + movq %r9,8(%rsp) + movq %r9,%r10 + negq %r9 + shlq $5,%r9 + negq %r10 + leaq 128(%rdx,%r9,1),%r13 + shrq $5+5,%r9 + movd 8(%rax),%xmm5 + subq $1,%r9 + leaq L$inc(%rip),%rax + movq %r13,16+8(%rsp) + movq %r9,24+8(%rsp) + movq %rdi,56+8(%rsp) + movdqa 0(%rax),%xmm0 + movdqa 16(%rax),%xmm1 + leaq 88-112(%rsp,%r10,1),%r10 + leaq 128(%rdx),%rdi + + pshufd $0,%xmm5,%xmm5 + movdqa %xmm1,%xmm4 +.byte 0x67 + movdqa %xmm1,%xmm2 +.byte 0x67 + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,112(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,128(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,144(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,160(%r10) + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,176(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,192(%r10) + movdqa 
%xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,208(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,224(%r10) + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,240(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,256(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,272(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,288(%r10) + movdqa %xmm4,%xmm3 +.byte 0x67 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,304(%r10) + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,320(%r10) + + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,336(%r10) + + pand 64(%rdi),%xmm0 + pand 80(%rdi),%xmm1 + pand 96(%rdi),%xmm2 + movdqa %xmm3,352(%r10) + pand 112(%rdi),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + movdqa -128(%rdi),%xmm4 + movdqa -112(%rdi),%xmm5 + movdqa -96(%rdi),%xmm2 + pand 112(%r10),%xmm4 + movdqa -80(%rdi),%xmm3 + pand 128(%r10),%xmm5 + por %xmm4,%xmm0 + pand 144(%r10),%xmm2 + por %xmm5,%xmm1 + pand 160(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + movdqa -64(%rdi),%xmm4 + movdqa -48(%rdi),%xmm5 + movdqa -32(%rdi),%xmm2 + pand 176(%r10),%xmm4 + movdqa -16(%rdi),%xmm3 + pand 192(%r10),%xmm5 + por %xmm4,%xmm0 + pand 208(%r10),%xmm2 + por %xmm5,%xmm1 + pand 224(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + movdqa 0(%rdi),%xmm4 + movdqa 16(%rdi),%xmm5 + movdqa 32(%rdi),%xmm2 + pand 240(%r10),%xmm4 + movdqa 48(%rdi),%xmm3 + pand 256(%r10),%xmm5 + por %xmm4,%xmm0 + pand 272(%r10),%xmm2 + por %xmm5,%xmm1 + pand 288(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + pxor %xmm1,%xmm0 + + pshufd $0x4e,%xmm0,%xmm1 + por %xmm1,%xmm0 + leaq 256(%rdi),%rdi +.byte 102,72,15,126,194 + leaq 64+32+8(%rsp),%rbx + + movq %rdx,%r9 + mulxq 0(%rsi),%r8,%rax + mulxq 8(%rsi),%r11,%r12 + addq %rax,%r11 + mulxq 16(%rsi),%rax,%r13 + adcq %rax,%r12 + adcq $0,%r13 + mulxq 24(%rsi),%rax,%r14 + + movq %r8,%r15 + imulq 32+8(%rsp),%r8 + xorq %rbp,%rbp + movq %r8,%rdx + + movq %rdi,8+8(%rsp) + + leaq 32(%rsi),%rsi + adcxq %rax,%r13 + adcxq %rbp,%r14 + + mulxq 0(%rcx),%rax,%r10 + adcxq %rax,%r15 + adoxq %r11,%r10 + mulxq 8(%rcx),%rax,%r11 + adcxq %rax,%r10 + adoxq %r12,%r11 + mulxq 16(%rcx),%rax,%r12 + movq 24+8(%rsp),%rdi + movq %r10,-32(%rbx) + adcxq %rax,%r11 + adoxq %r13,%r12 + mulxq 24(%rcx),%rax,%r15 + movq %r9,%rdx + movq %r11,-24(%rbx) + adcxq %rax,%r12 + adoxq %rbp,%r15 + leaq 32(%rcx),%rcx + movq %r12,-16(%rbx) + jmp L$mulx4x_1st + +.p2align 5 +L$mulx4x_1st: + adcxq %rbp,%r15 + mulxq 0(%rsi),%r10,%rax + adcxq %r14,%r10 + mulxq 8(%rsi),%r11,%r14 + adcxq %rax,%r11 + mulxq 16(%rsi),%r12,%rax + adcxq %r14,%r12 + mulxq 24(%rsi),%r13,%r14 +.byte 0x67,0x67 + movq %r8,%rdx + adcxq %rax,%r13 + adcxq %rbp,%r14 + leaq 32(%rsi),%rsi + leaq 32(%rbx),%rbx + + adoxq %r15,%r10 + mulxq 0(%rcx),%rax,%r15 + adcxq %rax,%r10 + adoxq %r15,%r11 + mulxq 8(%rcx),%rax,%r15 + adcxq %rax,%r11 + adoxq %r15,%r12 + mulxq 16(%rcx),%rax,%r15 + movq %r10,-40(%rbx) + adcxq %rax,%r12 + movq %r11,-32(%rbx) + adoxq %r15,%r13 + mulxq 24(%rcx),%rax,%r15 + movq %r9,%rdx + movq %r12,-24(%rbx) + adcxq %rax,%r13 + adoxq %rbp,%r15 + leaq 32(%rcx),%rcx + movq %r13,-16(%rbx) + + decq %rdi + jnz L$mulx4x_1st + + movq 8(%rsp),%rax + adcq %rbp,%r15 + leaq (%rsi,%rax,1),%rsi + addq %r15,%r14 + movq 8+8(%rsp),%rdi + adcq %rbp,%rbp + movq %r14,-8(%rbx) + jmp L$mulx4x_outer + +.p2align 5 +L$mulx4x_outer: + leaq 16-256(%rbx),%r10 + pxor %xmm4,%xmm4 +.byte 0x67,0x67 
+ pxor %xmm5,%xmm5 + movdqa -128(%rdi),%xmm0 + movdqa -112(%rdi),%xmm1 + movdqa -96(%rdi),%xmm2 + pand 256(%r10),%xmm0 + movdqa -80(%rdi),%xmm3 + pand 272(%r10),%xmm1 + por %xmm0,%xmm4 + pand 288(%r10),%xmm2 + por %xmm1,%xmm5 + pand 304(%r10),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa -64(%rdi),%xmm0 + movdqa -48(%rdi),%xmm1 + movdqa -32(%rdi),%xmm2 + pand 320(%r10),%xmm0 + movdqa -16(%rdi),%xmm3 + pand 336(%r10),%xmm1 + por %xmm0,%xmm4 + pand 352(%r10),%xmm2 + por %xmm1,%xmm5 + pand 368(%r10),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa 0(%rdi),%xmm0 + movdqa 16(%rdi),%xmm1 + movdqa 32(%rdi),%xmm2 + pand 384(%r10),%xmm0 + movdqa 48(%rdi),%xmm3 + pand 400(%r10),%xmm1 + por %xmm0,%xmm4 + pand 416(%r10),%xmm2 + por %xmm1,%xmm5 + pand 432(%r10),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa 64(%rdi),%xmm0 + movdqa 80(%rdi),%xmm1 + movdqa 96(%rdi),%xmm2 + pand 448(%r10),%xmm0 + movdqa 112(%rdi),%xmm3 + pand 464(%r10),%xmm1 + por %xmm0,%xmm4 + pand 480(%r10),%xmm2 + por %xmm1,%xmm5 + pand 496(%r10),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + por %xmm5,%xmm4 + + pshufd $0x4e,%xmm4,%xmm0 + por %xmm4,%xmm0 + leaq 256(%rdi),%rdi +.byte 102,72,15,126,194 + + movq %rbp,(%rbx) + leaq 32(%rbx,%rax,1),%rbx + mulxq 0(%rsi),%r8,%r11 + xorq %rbp,%rbp + movq %rdx,%r9 + mulxq 8(%rsi),%r14,%r12 + adoxq -32(%rbx),%r8 + adcxq %r14,%r11 + mulxq 16(%rsi),%r15,%r13 + adoxq -24(%rbx),%r11 + adcxq %r15,%r12 + mulxq 24(%rsi),%rdx,%r14 + adoxq -16(%rbx),%r12 + adcxq %rdx,%r13 + leaq (%rcx,%rax,1),%rcx + leaq 32(%rsi),%rsi + adoxq -8(%rbx),%r13 + adcxq %rbp,%r14 + adoxq %rbp,%r14 + + movq %r8,%r15 + imulq 32+8(%rsp),%r8 + + movq %r8,%rdx + xorq %rbp,%rbp + movq %rdi,8+8(%rsp) + + mulxq 0(%rcx),%rax,%r10 + adcxq %rax,%r15 + adoxq %r11,%r10 + mulxq 8(%rcx),%rax,%r11 + adcxq %rax,%r10 + adoxq %r12,%r11 + mulxq 16(%rcx),%rax,%r12 + adcxq %rax,%r11 + adoxq %r13,%r12 + mulxq 24(%rcx),%rax,%r15 + movq %r9,%rdx + movq 24+8(%rsp),%rdi + movq %r10,-32(%rbx) + adcxq %rax,%r12 + movq %r11,-24(%rbx) + adoxq %rbp,%r15 + movq %r12,-16(%rbx) + leaq 32(%rcx),%rcx + jmp L$mulx4x_inner + +.p2align 5 +L$mulx4x_inner: + mulxq 0(%rsi),%r10,%rax + adcxq %rbp,%r15 + adoxq %r14,%r10 + mulxq 8(%rsi),%r11,%r14 + adcxq 0(%rbx),%r10 + adoxq %rax,%r11 + mulxq 16(%rsi),%r12,%rax + adcxq 8(%rbx),%r11 + adoxq %r14,%r12 + mulxq 24(%rsi),%r13,%r14 + movq %r8,%rdx + adcxq 16(%rbx),%r12 + adoxq %rax,%r13 + adcxq 24(%rbx),%r13 + adoxq %rbp,%r14 + leaq 32(%rsi),%rsi + leaq 32(%rbx),%rbx + adcxq %rbp,%r14 + + adoxq %r15,%r10 + mulxq 0(%rcx),%rax,%r15 + adcxq %rax,%r10 + adoxq %r15,%r11 + mulxq 8(%rcx),%rax,%r15 + adcxq %rax,%r11 + adoxq %r15,%r12 + mulxq 16(%rcx),%rax,%r15 + movq %r10,-40(%rbx) + adcxq %rax,%r12 + adoxq %r15,%r13 + movq %r11,-32(%rbx) + mulxq 24(%rcx),%rax,%r15 + movq %r9,%rdx + leaq 32(%rcx),%rcx + movq %r12,-24(%rbx) + adcxq %rax,%r13 + adoxq %rbp,%r15 + movq %r13,-16(%rbx) + + decq %rdi + jnz L$mulx4x_inner + + movq 0+8(%rsp),%rax + adcq %rbp,%r15 + subq 0(%rbx),%rdi + movq 8+8(%rsp),%rdi + movq 16+8(%rsp),%r10 + adcq %r15,%r14 + leaq (%rsi,%rax,1),%rsi + adcq %rbp,%rbp + movq %r14,-8(%rbx) + + cmpq %r10,%rdi + jb L$mulx4x_outer + + movq -8(%rcx),%r10 + movq %rbp,%r8 + movq (%rcx,%rax,1),%r12 + leaq (%rcx,%rax,1),%rbp + movq %rax,%rcx + leaq (%rbx,%rax,1),%rdi + xorl %eax,%eax + xorq %r15,%r15 + subq %r14,%r10 + adcq %r15,%r15 + orq %r15,%r8 + sarq $3+2,%rcx + subq %r8,%rax + movq 56+8(%rsp),%rdx + decq %r12 + movq 8(%rbp),%r13 + xorq %r8,%r8 + movq 16(%rbp),%r14 + movq 24(%rbp),%r15 + jmp L$sqrx4x_sub_entry + + +.globl 
_bn_powerx5 +.private_extern _bn_powerx5 + +.p2align 5 +_bn_powerx5: + +_CET_ENDBR + movq %rsp,%rax + + pushq %rbx + + pushq %rbp + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + +L$powerx5_prologue: + + + + + shll $3,%r9d + leaq (%r9,%r9,2),%r10 + negq %r9 + movq (%r8),%r8 + + + + + + + + + leaq -320(%rsp,%r9,2),%r11 + movq %rsp,%rbp + subq %rdi,%r11 + andq $4095,%r11 + cmpq %r11,%r10 + jb L$pwrx_sp_alt + subq %r11,%rbp + leaq -320(%rbp,%r9,2),%rbp + jmp L$pwrx_sp_done + +.p2align 5 +L$pwrx_sp_alt: + leaq 4096-320(,%r9,2),%r10 + leaq -320(%rbp,%r9,2),%rbp + subq %r10,%r11 + movq $0,%r10 + cmovcq %r10,%r11 + subq %r11,%rbp +L$pwrx_sp_done: + andq $-64,%rbp + movq %rsp,%r11 + subq %rbp,%r11 + andq $-4096,%r11 + leaq (%r11,%rbp,1),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja L$pwrx_page_walk + jmp L$pwrx_page_walk_done + +L$pwrx_page_walk: + leaq -4096(%rsp),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja L$pwrx_page_walk +L$pwrx_page_walk_done: + + movq %r9,%r10 + negq %r9 + + + + + + + + + + + + + pxor %xmm0,%xmm0 +.byte 102,72,15,110,207 +.byte 102,72,15,110,209 +.byte 102,73,15,110,218 +.byte 102,72,15,110,226 + movq %r8,32(%rsp) + movq %rax,40(%rsp) + +L$powerx5_body: + + call __bn_sqrx8x_internal + call __bn_postx4x_internal + call __bn_sqrx8x_internal + call __bn_postx4x_internal + call __bn_sqrx8x_internal + call __bn_postx4x_internal + call __bn_sqrx8x_internal + call __bn_postx4x_internal + call __bn_sqrx8x_internal + call __bn_postx4x_internal + + movq %r10,%r9 + movq %rsi,%rdi +.byte 102,72,15,126,209 +.byte 102,72,15,126,226 + movq 40(%rsp),%rax + + call mulx4x_internal + + movq 40(%rsp),%rsi + + movq $1,%rax + + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbp + + movq -8(%rsi),%rbx + + leaq (%rsi),%rsp + +L$powerx5_epilogue: + ret + + + +.globl _bn_sqrx8x_internal +.private_extern _bn_sqrx8x_internal +.private_extern _bn_sqrx8x_internal + +.p2align 5 +_bn_sqrx8x_internal: +__bn_sqrx8x_internal: + +_CET_ENDBR + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + leaq 48+8(%rsp),%rdi + leaq (%rsi,%r9,1),%rbp + movq %r9,0+8(%rsp) + movq %rbp,8+8(%rsp) + jmp L$sqr8x_zero_start + +.p2align 5 +.byte 0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00 +L$sqrx8x_zero: +.byte 0x3e + movdqa %xmm0,0(%rdi) + movdqa %xmm0,16(%rdi) + movdqa %xmm0,32(%rdi) + movdqa %xmm0,48(%rdi) +L$sqr8x_zero_start: + movdqa %xmm0,64(%rdi) + movdqa %xmm0,80(%rdi) + movdqa %xmm0,96(%rdi) + movdqa %xmm0,112(%rdi) + leaq 128(%rdi),%rdi + subq $64,%r9 + jnz L$sqrx8x_zero + + movq 0(%rsi),%rdx + + xorq %r10,%r10 + xorq %r11,%r11 + xorq %r12,%r12 + xorq %r13,%r13 + xorq %r14,%r14 + xorq %r15,%r15 + leaq 48+8(%rsp),%rdi + xorq %rbp,%rbp + jmp L$sqrx8x_outer_loop + +.p2align 5 +L$sqrx8x_outer_loop: + mulxq 8(%rsi),%r8,%rax + adcxq %r9,%r8 + adoxq %rax,%r10 + mulxq 16(%rsi),%r9,%rax + adcxq %r10,%r9 + adoxq %rax,%r11 +.byte 0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00 + adcxq %r11,%r10 + adoxq %rax,%r12 +.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00 + adcxq %r12,%r11 + adoxq %rax,%r13 + mulxq 40(%rsi),%r12,%rax + adcxq %r13,%r12 + adoxq %rax,%r14 + mulxq 48(%rsi),%r13,%rax + adcxq %r14,%r13 + adoxq %r15,%rax + mulxq 56(%rsi),%r14,%r15 + movq 8(%rsi),%rdx + adcxq %rax,%r14 + adoxq %rbp,%r15 + adcq 64(%rdi),%r15 + movq %r8,8(%rdi) + movq %r9,16(%rdi) + sbbq %rcx,%rcx + xorq %rbp,%rbp + + + mulxq 16(%rsi),%r8,%rbx + mulxq 24(%rsi),%r9,%rax + adcxq %r10,%r8 + adoxq %rbx,%r9 + mulxq 32(%rsi),%r10,%rbx + 
adcxq %r11,%r9 + adoxq %rax,%r10 +.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00 + adcxq %r12,%r10 + adoxq %rbx,%r11 +.byte 0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00 + adcxq %r13,%r11 + adoxq %r14,%r12 +.byte 0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00 + movq 16(%rsi),%rdx + adcxq %rax,%r12 + adoxq %rbx,%r13 + adcxq %r15,%r13 + adoxq %rbp,%r14 + adcxq %rbp,%r14 + + movq %r8,24(%rdi) + movq %r9,32(%rdi) + + mulxq 24(%rsi),%r8,%rbx + mulxq 32(%rsi),%r9,%rax + adcxq %r10,%r8 + adoxq %rbx,%r9 + mulxq 40(%rsi),%r10,%rbx + adcxq %r11,%r9 + adoxq %rax,%r10 +.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00 + adcxq %r12,%r10 + adoxq %r13,%r11 +.byte 0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00 +.byte 0x3e + movq 24(%rsi),%rdx + adcxq %rbx,%r11 + adoxq %rax,%r12 + adcxq %r14,%r12 + movq %r8,40(%rdi) + movq %r9,48(%rdi) + mulxq 32(%rsi),%r8,%rax + adoxq %rbp,%r13 + adcxq %rbp,%r13 + + mulxq 40(%rsi),%r9,%rbx + adcxq %r10,%r8 + adoxq %rax,%r9 + mulxq 48(%rsi),%r10,%rax + adcxq %r11,%r9 + adoxq %r12,%r10 + mulxq 56(%rsi),%r11,%r12 + movq 32(%rsi),%rdx + movq 40(%rsi),%r14 + adcxq %rbx,%r10 + adoxq %rax,%r11 + movq 48(%rsi),%r15 + adcxq %r13,%r11 + adoxq %rbp,%r12 + adcxq %rbp,%r12 + + movq %r8,56(%rdi) + movq %r9,64(%rdi) + + mulxq %r14,%r9,%rax + movq 56(%rsi),%r8 + adcxq %r10,%r9 + mulxq %r15,%r10,%rbx + adoxq %rax,%r10 + adcxq %r11,%r10 + mulxq %r8,%r11,%rax + movq %r14,%rdx + adoxq %rbx,%r11 + adcxq %r12,%r11 + + adcxq %rbp,%rax + + mulxq %r15,%r14,%rbx + mulxq %r8,%r12,%r13 + movq %r15,%rdx + leaq 64(%rsi),%rsi + adcxq %r14,%r11 + adoxq %rbx,%r12 + adcxq %rax,%r12 + adoxq %rbp,%r13 + +.byte 0x67,0x67 + mulxq %r8,%r8,%r14 + adcxq %r8,%r13 + adcxq %rbp,%r14 + + cmpq 8+8(%rsp),%rsi + je L$sqrx8x_outer_break + + negq %rcx + movq $-8,%rcx + movq %rbp,%r15 + movq 64(%rdi),%r8 + adcxq 72(%rdi),%r9 + adcxq 80(%rdi),%r10 + adcxq 88(%rdi),%r11 + adcq 96(%rdi),%r12 + adcq 104(%rdi),%r13 + adcq 112(%rdi),%r14 + adcq 120(%rdi),%r15 + leaq (%rsi),%rbp + leaq 128(%rdi),%rdi + sbbq %rax,%rax + + movq -64(%rsi),%rdx + movq %rax,16+8(%rsp) + movq %rdi,24+8(%rsp) + + + xorl %eax,%eax + jmp L$sqrx8x_loop + +.p2align 5 +L$sqrx8x_loop: + movq %r8,%rbx + mulxq 0(%rbp),%rax,%r8 + adcxq %rax,%rbx + adoxq %r9,%r8 + + mulxq 8(%rbp),%rax,%r9 + adcxq %rax,%r8 + adoxq %r10,%r9 + + mulxq 16(%rbp),%rax,%r10 + adcxq %rax,%r9 + adoxq %r11,%r10 + + mulxq 24(%rbp),%rax,%r11 + adcxq %rax,%r10 + adoxq %r12,%r11 + +.byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 + adcxq %rax,%r11 + adoxq %r13,%r12 + + mulxq 40(%rbp),%rax,%r13 + adcxq %rax,%r12 + adoxq %r14,%r13 + + mulxq 48(%rbp),%rax,%r14 + movq %rbx,(%rdi,%rcx,8) + movl $0,%ebx + adcxq %rax,%r13 + adoxq %r15,%r14 + +.byte 0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00 + movq 8(%rsi,%rcx,8),%rdx + adcxq %rax,%r14 + adoxq %rbx,%r15 + adcxq %rbx,%r15 + +.byte 0x67 + incq %rcx + jnz L$sqrx8x_loop + + leaq 64(%rbp),%rbp + movq $-8,%rcx + cmpq 8+8(%rsp),%rbp + je L$sqrx8x_break + + subq 16+8(%rsp),%rbx +.byte 0x66 + movq -64(%rsi),%rdx + adcxq 0(%rdi),%r8 + adcxq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq 48(%rdi),%r14 + adcq 56(%rdi),%r15 + leaq 64(%rdi),%rdi +.byte 0x67 + sbbq %rax,%rax + xorl %ebx,%ebx + movq %rax,16+8(%rsp) + jmp L$sqrx8x_loop + +.p2align 5 +L$sqrx8x_break: + xorq %rbp,%rbp + subq 16+8(%rsp),%rbx + adcxq %rbp,%r8 + movq 24+8(%rsp),%rcx + adcxq %rbp,%r9 + movq 0(%rsi),%rdx + adcq $0,%r10 + movq %r8,0(%rdi) + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + cmpq %rcx,%rdi + 
je L$sqrx8x_outer_loop + + movq %r9,8(%rdi) + movq 8(%rcx),%r9 + movq %r10,16(%rdi) + movq 16(%rcx),%r10 + movq %r11,24(%rdi) + movq 24(%rcx),%r11 + movq %r12,32(%rdi) + movq 32(%rcx),%r12 + movq %r13,40(%rdi) + movq 40(%rcx),%r13 + movq %r14,48(%rdi) + movq 48(%rcx),%r14 + movq %r15,56(%rdi) + movq 56(%rcx),%r15 + movq %rcx,%rdi + jmp L$sqrx8x_outer_loop + +.p2align 5 +L$sqrx8x_outer_break: + movq %r9,72(%rdi) +.byte 102,72,15,126,217 + movq %r10,80(%rdi) + movq %r11,88(%rdi) + movq %r12,96(%rdi) + movq %r13,104(%rdi) + movq %r14,112(%rdi) + leaq 48+8(%rsp),%rdi + movq (%rsi,%rcx,1),%rdx + + movq 8(%rdi),%r11 + xorq %r10,%r10 + movq 0+8(%rsp),%r9 + adoxq %r11,%r11 + movq 16(%rdi),%r12 + movq 24(%rdi),%r13 + + +.p2align 5 +L$sqrx4x_shift_n_add: + mulxq %rdx,%rax,%rbx + adoxq %r12,%r12 + adcxq %r10,%rax +.byte 0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00 +.byte 0x4c,0x8b,0x97,0x20,0x00,0x00,0x00 + adoxq %r13,%r13 + adcxq %r11,%rbx + movq 40(%rdi),%r11 + movq %rax,0(%rdi) + movq %rbx,8(%rdi) + + mulxq %rdx,%rax,%rbx + adoxq %r10,%r10 + adcxq %r12,%rax + movq 16(%rsi,%rcx,1),%rdx + movq 48(%rdi),%r12 + adoxq %r11,%r11 + adcxq %r13,%rbx + movq 56(%rdi),%r13 + movq %rax,16(%rdi) + movq %rbx,24(%rdi) + + mulxq %rdx,%rax,%rbx + adoxq %r12,%r12 + adcxq %r10,%rax + movq 24(%rsi,%rcx,1),%rdx + leaq 32(%rcx),%rcx + movq 64(%rdi),%r10 + adoxq %r13,%r13 + adcxq %r11,%rbx + movq 72(%rdi),%r11 + movq %rax,32(%rdi) + movq %rbx,40(%rdi) + + mulxq %rdx,%rax,%rbx + adoxq %r10,%r10 + adcxq %r12,%rax + jrcxz L$sqrx4x_shift_n_add_break +.byte 0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00 + adoxq %r11,%r11 + adcxq %r13,%rbx + movq 80(%rdi),%r12 + movq 88(%rdi),%r13 + movq %rax,48(%rdi) + movq %rbx,56(%rdi) + leaq 64(%rdi),%rdi + nop + jmp L$sqrx4x_shift_n_add + +.p2align 5 +L$sqrx4x_shift_n_add_break: + adcxq %r13,%rbx + movq %rax,48(%rdi) + movq %rbx,56(%rdi) + leaq 64(%rdi),%rdi +.byte 102,72,15,126,213 +__bn_sqrx8x_reduction: + xorl %eax,%eax + movq 32+8(%rsp),%rbx + movq 48+8(%rsp),%rdx + leaq -64(%rbp,%r9,1),%rcx + + movq %rcx,0+8(%rsp) + movq %rdi,8+8(%rsp) + + leaq 48+8(%rsp),%rdi + jmp L$sqrx8x_reduction_loop + +.p2align 5 +L$sqrx8x_reduction_loop: + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + movq 32(%rdi),%r12 + movq %rdx,%r8 + imulq %rbx,%rdx + movq 40(%rdi),%r13 + movq 48(%rdi),%r14 + movq 56(%rdi),%r15 + movq %rax,24+8(%rsp) + + leaq 64(%rdi),%rdi + xorq %rsi,%rsi + movq $-8,%rcx + jmp L$sqrx8x_reduce + +.p2align 5 +L$sqrx8x_reduce: + movq %r8,%rbx + mulxq 0(%rbp),%rax,%r8 + adcxq %rbx,%rax + adoxq %r9,%r8 + + mulxq 8(%rbp),%rbx,%r9 + adcxq %rbx,%r8 + adoxq %r10,%r9 + + mulxq 16(%rbp),%rbx,%r10 + adcxq %rbx,%r9 + adoxq %r11,%r10 + + mulxq 24(%rbp),%rbx,%r11 + adcxq %rbx,%r10 + adoxq %r12,%r11 + +.byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 + movq %rdx,%rax + movq %r8,%rdx + adcxq %rbx,%r11 + adoxq %r13,%r12 + + mulxq 32+8(%rsp),%rbx,%rdx + movq %rax,%rdx + movq %rax,64+48+8(%rsp,%rcx,8) + + mulxq 40(%rbp),%rax,%r13 + adcxq %rax,%r12 + adoxq %r14,%r13 + + mulxq 48(%rbp),%rax,%r14 + adcxq %rax,%r13 + adoxq %r15,%r14 + + mulxq 56(%rbp),%rax,%r15 + movq %rbx,%rdx + adcxq %rax,%r14 + adoxq %rsi,%r15 + adcxq %rsi,%r15 + +.byte 0x67,0x67,0x67 + incq %rcx + jnz L$sqrx8x_reduce + + movq %rsi,%rax + cmpq 0+8(%rsp),%rbp + jae L$sqrx8x_no_tail + + movq 48+8(%rsp),%rdx + addq 0(%rdi),%r8 + leaq 64(%rbp),%rbp + movq $-8,%rcx + adcxq 8(%rdi),%r9 + adcxq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq 48(%rdi),%r14 + adcq 56(%rdi),%r15 + leaq 64(%rdi),%rdi + sbbq 
%rax,%rax + + xorq %rsi,%rsi + movq %rax,16+8(%rsp) + jmp L$sqrx8x_tail + +.p2align 5 +L$sqrx8x_tail: + movq %r8,%rbx + mulxq 0(%rbp),%rax,%r8 + adcxq %rax,%rbx + adoxq %r9,%r8 + + mulxq 8(%rbp),%rax,%r9 + adcxq %rax,%r8 + adoxq %r10,%r9 + + mulxq 16(%rbp),%rax,%r10 + adcxq %rax,%r9 + adoxq %r11,%r10 + + mulxq 24(%rbp),%rax,%r11 + adcxq %rax,%r10 + adoxq %r12,%r11 + +.byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 + adcxq %rax,%r11 + adoxq %r13,%r12 + + mulxq 40(%rbp),%rax,%r13 + adcxq %rax,%r12 + adoxq %r14,%r13 + + mulxq 48(%rbp),%rax,%r14 + adcxq %rax,%r13 + adoxq %r15,%r14 + + mulxq 56(%rbp),%rax,%r15 + movq 72+48+8(%rsp,%rcx,8),%rdx + adcxq %rax,%r14 + adoxq %rsi,%r15 + movq %rbx,(%rdi,%rcx,8) + movq %r8,%rbx + adcxq %rsi,%r15 + + incq %rcx + jnz L$sqrx8x_tail + + cmpq 0+8(%rsp),%rbp + jae L$sqrx8x_tail_done + + subq 16+8(%rsp),%rsi + movq 48+8(%rsp),%rdx + leaq 64(%rbp),%rbp + adcq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq 48(%rdi),%r14 + adcq 56(%rdi),%r15 + leaq 64(%rdi),%rdi + sbbq %rax,%rax + subq $8,%rcx + + xorq %rsi,%rsi + movq %rax,16+8(%rsp) + jmp L$sqrx8x_tail + +.p2align 5 +L$sqrx8x_tail_done: + xorq %rax,%rax + addq 24+8(%rsp),%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + adcq $0,%rax + + subq 16+8(%rsp),%rsi +L$sqrx8x_no_tail: + adcq 0(%rdi),%r8 +.byte 102,72,15,126,217 + adcq 8(%rdi),%r9 + movq 56(%rbp),%rsi +.byte 102,72,15,126,213 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq 48(%rdi),%r14 + adcq 56(%rdi),%r15 + adcq $0,%rax + + movq 32+8(%rsp),%rbx + movq 64(%rdi,%rcx,1),%rdx + + movq %r8,0(%rdi) + leaq 64(%rdi),%r8 + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq %r14,48(%rdi) + movq %r15,56(%rdi) + + leaq 64(%rdi,%rcx,1),%rdi + cmpq 8+8(%rsp),%r8 + jb L$sqrx8x_reduction_loop + ret + + +.p2align 5 + +__bn_postx4x_internal: + + movq 0(%rbp),%r12 + movq %rcx,%r10 + movq %rcx,%r9 + negq %rax + sarq $3+2,%rcx + +.byte 102,72,15,126,202 +.byte 102,72,15,126,206 + decq %r12 + movq 8(%rbp),%r13 + xorq %r8,%r8 + movq 16(%rbp),%r14 + movq 24(%rbp),%r15 + jmp L$sqrx4x_sub_entry + +.p2align 4 +L$sqrx4x_sub: + movq 0(%rbp),%r12 + movq 8(%rbp),%r13 + movq 16(%rbp),%r14 + movq 24(%rbp),%r15 +L$sqrx4x_sub_entry: + andnq %rax,%r12,%r12 + leaq 32(%rbp),%rbp + andnq %rax,%r13,%r13 + andnq %rax,%r14,%r14 + andnq %rax,%r15,%r15 + + negq %r8 + adcq 0(%rdi),%r12 + adcq 8(%rdi),%r13 + adcq 16(%rdi),%r14 + adcq 24(%rdi),%r15 + movq %r12,0(%rdx) + leaq 32(%rdi),%rdi + movq %r13,8(%rdx) + sbbq %r8,%r8 + movq %r14,16(%rdx) + movq %r15,24(%rdx) + leaq 32(%rdx),%rdx + + incq %rcx + jnz L$sqrx4x_sub + + negq %r9 + + ret + + +.globl _bn_scatter5 +.private_extern _bn_scatter5 + +.p2align 4 +_bn_scatter5: + +_CET_ENDBR + cmpl $0,%esi + jz L$scatter_epilogue + + + + + + + + + + leaq (%rdx,%rcx,8),%rdx +L$scatter: + movq (%rdi),%rax + leaq 8(%rdi),%rdi + movq %rax,(%rdx) + leaq 256(%rdx),%rdx + subl $1,%esi + jnz L$scatter +L$scatter_epilogue: + ret + + + +.globl _bn_gather5 +.private_extern _bn_gather5 + +.p2align 5 +_bn_gather5: + +L$SEH_begin_bn_gather5: +_CET_ENDBR + +.byte 0x4c,0x8d,0x14,0x24 + +.byte 0x48,0x81,0xec,0x08,0x01,0x00,0x00 + leaq L$inc(%rip),%rax + andq $-16,%rsp + + movd %ecx,%xmm5 + movdqa 0(%rax),%xmm0 + movdqa 16(%rax),%xmm1 + leaq 128(%rdx),%r11 + leaq 128(%rsp),%rax + + pshufd $0,%xmm5,%xmm5 + movdqa %xmm1,%xmm4 + movdqa 
%xmm1,%xmm2 + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm4,%xmm3 + + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,-128(%rax) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,-112(%rax) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,-96(%rax) + movdqa %xmm4,%xmm2 + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,-80(%rax) + movdqa %xmm4,%xmm3 + + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,-64(%rax) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,-48(%rax) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,-32(%rax) + movdqa %xmm4,%xmm2 + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,-16(%rax) + movdqa %xmm4,%xmm3 + + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,0(%rax) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,16(%rax) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,32(%rax) + movdqa %xmm4,%xmm2 + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,48(%rax) + movdqa %xmm4,%xmm3 + + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,64(%rax) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,80(%rax) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,96(%rax) + movdqa %xmm4,%xmm2 + movdqa %xmm3,112(%rax) + jmp L$gather + +.p2align 5 +L$gather: + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + movdqa -128(%r11),%xmm0 + movdqa -112(%r11),%xmm1 + movdqa -96(%r11),%xmm2 + pand -128(%rax),%xmm0 + movdqa -80(%r11),%xmm3 + pand -112(%rax),%xmm1 + por %xmm0,%xmm4 + pand -96(%rax),%xmm2 + por %xmm1,%xmm5 + pand -80(%rax),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa -64(%r11),%xmm0 + movdqa -48(%r11),%xmm1 + movdqa -32(%r11),%xmm2 + pand -64(%rax),%xmm0 + movdqa -16(%r11),%xmm3 + pand -48(%rax),%xmm1 + por %xmm0,%xmm4 + pand -32(%rax),%xmm2 + por %xmm1,%xmm5 + pand -16(%rax),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa 0(%r11),%xmm0 + movdqa 16(%r11),%xmm1 + movdqa 32(%r11),%xmm2 + pand 0(%rax),%xmm0 + movdqa 48(%r11),%xmm3 + pand 16(%rax),%xmm1 + por %xmm0,%xmm4 + pand 32(%rax),%xmm2 + por %xmm1,%xmm5 + pand 48(%rax),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa 64(%r11),%xmm0 + movdqa 80(%r11),%xmm1 + movdqa 96(%r11),%xmm2 + pand 64(%rax),%xmm0 + movdqa 112(%r11),%xmm3 + pand 80(%rax),%xmm1 + por %xmm0,%xmm4 + pand 96(%rax),%xmm2 + por %xmm1,%xmm5 + pand 112(%rax),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + por %xmm5,%xmm4 + leaq 256(%r11),%r11 + + pshufd $0x4e,%xmm4,%xmm0 + por %xmm4,%xmm0 + movq %xmm0,(%rdi) + leaq 8(%rdi),%rdi + subl $1,%esi + jnz L$gather + + leaq (%r10),%rsp + + ret +L$SEH_end_bn_gather5: + + +.section __DATA,__const +.p2align 6 +L$inc: +.long 0,0, 1,1 +.long 2,2, 2,2 +.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.text +#endif diff --git a/ring-0.17.14/pregenerated/x86_64-mont5-nasm.asm b/ring-0.17.14/pregenerated/x86_64-mont5-nasm.asm new file mode 100644 index 0000000000..eeb354322d --- /dev/null +++ b/ring-0.17.14/pregenerated/x86_64-mont5-nasm.asm @@ -0,0 +1,3401 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. 
Do not edit by hand. + +%ifidn __OUTPUT_FORMAT__, win64 +default rel +%define XMMWORD +%define YMMWORD +%define ZMMWORD +%define _CET_ENDBR + +%include "ring_core_generated/prefix_symbols_nasm.inc" +section .text code align=64 + + +global bn_mul4x_mont_gather5 + +ALIGN 32 +bn_mul4x_mont_gather5: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_bn_mul4x_mont_gather5: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + mov r9,QWORD[48+rsp] + + + +_CET_ENDBR + DB 0x67 + mov rax,rsp + + push rbx + + push rbp + + push r12 + + push r13 + + push r14 + + push r15 + +$L$mul4x_prologue: + + DB 0x67 + + + + shl r9d,3 + lea r10,[r9*2+r9] + neg r9 + + + + + + + + + + + lea r11,[((-320))+r9*2+rsp] + mov rbp,rsp + sub r11,rdi + and r11,4095 + cmp r10,r11 + jb NEAR $L$mul4xsp_alt + sub rbp,r11 + lea rbp,[((-320))+r9*2+rbp] + jmp NEAR $L$mul4xsp_done + +ALIGN 32 +$L$mul4xsp_alt: + lea r10,[((4096-320))+r9*2] + lea rbp,[((-320))+r9*2+rbp] + sub r11,r10 + mov r10,0 + cmovc r11,r10 + sub rbp,r11 +$L$mul4xsp_done: + and rbp,-64 + mov r11,rsp + sub r11,rbp + and r11,-4096 + lea rsp,[rbp*1+r11] + mov r10,QWORD[rsp] + cmp rsp,rbp + ja NEAR $L$mul4x_page_walk + jmp NEAR $L$mul4x_page_walk_done + +$L$mul4x_page_walk: + lea rsp,[((-4096))+rsp] + mov r10,QWORD[rsp] + cmp rsp,rbp + ja NEAR $L$mul4x_page_walk +$L$mul4x_page_walk_done: + + neg r9 + + mov QWORD[40+rsp],rax + +$L$mul4x_body: + + call mul4x_internal + + mov rsi,QWORD[40+rsp] + + mov rax,1 + + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbp,QWORD[((-16))+rsi] + + mov rbx,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$mul4x_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_bn_mul4x_mont_gather5: + + +ALIGN 32 +mul4x_internal: + + shl r9,5 + movd xmm5,DWORD[56+rax] + lea rax,[$L$inc] + lea r13,[128+r9*1+rdx] + shr r9,5 + movdqa xmm0,XMMWORD[rax] + movdqa xmm1,XMMWORD[16+rax] + lea r10,[((88-112))+r9*1+rsp] + lea r12,[128+rdx] + + pshufd xmm5,xmm5,0 + movdqa xmm4,xmm1 + DB 0x67,0x67 + movdqa xmm2,xmm1 + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 + DB 0x67 + movdqa xmm3,xmm4 + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm5 + movdqa XMMWORD[112+r10],xmm0 + movdqa xmm0,xmm4 + + paddd xmm3,xmm2 + pcmpeqd xmm2,xmm5 + movdqa XMMWORD[128+r10],xmm1 + movdqa xmm1,xmm4 + + paddd xmm0,xmm3 + pcmpeqd xmm3,xmm5 + movdqa XMMWORD[144+r10],xmm2 + movdqa xmm2,xmm4 + + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 + movdqa XMMWORD[160+r10],xmm3 + movdqa xmm3,xmm4 + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm5 + movdqa XMMWORD[176+r10],xmm0 + movdqa xmm0,xmm4 + + paddd xmm3,xmm2 + pcmpeqd xmm2,xmm5 + movdqa XMMWORD[192+r10],xmm1 + movdqa xmm1,xmm4 + + paddd xmm0,xmm3 + pcmpeqd xmm3,xmm5 + movdqa XMMWORD[208+r10],xmm2 + movdqa xmm2,xmm4 + + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 + movdqa XMMWORD[224+r10],xmm3 + movdqa xmm3,xmm4 + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm5 + movdqa XMMWORD[240+r10],xmm0 + movdqa xmm0,xmm4 + + paddd xmm3,xmm2 + pcmpeqd xmm2,xmm5 + movdqa XMMWORD[256+r10],xmm1 + movdqa xmm1,xmm4 + + paddd xmm0,xmm3 + pcmpeqd xmm3,xmm5 + movdqa XMMWORD[272+r10],xmm2 + movdqa xmm2,xmm4 + + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 + movdqa XMMWORD[288+r10],xmm3 + movdqa xmm3,xmm4 + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm5 + movdqa XMMWORD[304+r10],xmm0 + + paddd xmm3,xmm2 + DB 0x67 + pcmpeqd xmm2,xmm5 + movdqa XMMWORD[320+r10],xmm1 + + pcmpeqd xmm3,xmm5 + movdqa XMMWORD[336+r10],xmm2 + pand xmm0,XMMWORD[64+r12] + + pand 
xmm1,XMMWORD[80+r12] + pand xmm2,XMMWORD[96+r12] + movdqa XMMWORD[352+r10],xmm3 + pand xmm3,XMMWORD[112+r12] + por xmm0,xmm2 + por xmm1,xmm3 + movdqa xmm4,XMMWORD[((-128))+r12] + movdqa xmm5,XMMWORD[((-112))+r12] + movdqa xmm2,XMMWORD[((-96))+r12] + pand xmm4,XMMWORD[112+r10] + movdqa xmm3,XMMWORD[((-80))+r12] + pand xmm5,XMMWORD[128+r10] + por xmm0,xmm4 + pand xmm2,XMMWORD[144+r10] + por xmm1,xmm5 + pand xmm3,XMMWORD[160+r10] + por xmm0,xmm2 + por xmm1,xmm3 + movdqa xmm4,XMMWORD[((-64))+r12] + movdqa xmm5,XMMWORD[((-48))+r12] + movdqa xmm2,XMMWORD[((-32))+r12] + pand xmm4,XMMWORD[176+r10] + movdqa xmm3,XMMWORD[((-16))+r12] + pand xmm5,XMMWORD[192+r10] + por xmm0,xmm4 + pand xmm2,XMMWORD[208+r10] + por xmm1,xmm5 + pand xmm3,XMMWORD[224+r10] + por xmm0,xmm2 + por xmm1,xmm3 + movdqa xmm4,XMMWORD[r12] + movdqa xmm5,XMMWORD[16+r12] + movdqa xmm2,XMMWORD[32+r12] + pand xmm4,XMMWORD[240+r10] + movdqa xmm3,XMMWORD[48+r12] + pand xmm5,XMMWORD[256+r10] + por xmm0,xmm4 + pand xmm2,XMMWORD[272+r10] + por xmm1,xmm5 + pand xmm3,XMMWORD[288+r10] + por xmm0,xmm2 + por xmm1,xmm3 + por xmm0,xmm1 + + pshufd xmm1,xmm0,0x4e + por xmm0,xmm1 + lea r12,[256+r12] +DB 102,72,15,126,195 + + mov QWORD[((16+8))+rsp],r13 + mov QWORD[((56+8))+rsp],rdi + + mov r8,QWORD[r8] + mov rax,QWORD[rsi] + lea rsi,[r9*1+rsi] + neg r9 + + mov rbp,r8 + mul rbx + mov r10,rax + mov rax,QWORD[rcx] + + imul rbp,r10 + lea r14,[((64+8))+rsp] + mov r11,rdx + + mul rbp + add r10,rax + mov rax,QWORD[8+r9*1+rsi] + adc rdx,0 + mov rdi,rdx + + mul rbx + add r11,rax + mov rax,QWORD[8+rcx] + adc rdx,0 + mov r10,rdx + + mul rbp + add rdi,rax + mov rax,QWORD[16+r9*1+rsi] + adc rdx,0 + add rdi,r11 + lea r15,[32+r9] + lea rcx,[32+rcx] + adc rdx,0 + mov QWORD[r14],rdi + mov r13,rdx + jmp NEAR $L$1st4x + +ALIGN 32 +$L$1st4x: + mul rbx + add r10,rax + mov rax,QWORD[((-16))+rcx] + lea r14,[32+r14] + adc rdx,0 + mov r11,rdx + + mul rbp + add r13,rax + mov rax,QWORD[((-8))+r15*1+rsi] + adc rdx,0 + add r13,r10 + adc rdx,0 + mov QWORD[((-24))+r14],r13 + mov rdi,rdx + + mul rbx + add r11,rax + mov rax,QWORD[((-8))+rcx] + adc rdx,0 + mov r10,rdx + + mul rbp + add rdi,rax + mov rax,QWORD[r15*1+rsi] + adc rdx,0 + add rdi,r11 + adc rdx,0 + mov QWORD[((-16))+r14],rdi + mov r13,rdx + + mul rbx + add r10,rax + mov rax,QWORD[rcx] + adc rdx,0 + mov r11,rdx + + mul rbp + add r13,rax + mov rax,QWORD[8+r15*1+rsi] + adc rdx,0 + add r13,r10 + adc rdx,0 + mov QWORD[((-8))+r14],r13 + mov rdi,rdx + + mul rbx + add r11,rax + mov rax,QWORD[8+rcx] + adc rdx,0 + mov r10,rdx + + mul rbp + add rdi,rax + mov rax,QWORD[16+r15*1+rsi] + adc rdx,0 + add rdi,r11 + lea rcx,[32+rcx] + adc rdx,0 + mov QWORD[r14],rdi + mov r13,rdx + + add r15,32 + jnz NEAR $L$1st4x + + mul rbx + add r10,rax + mov rax,QWORD[((-16))+rcx] + lea r14,[32+r14] + adc rdx,0 + mov r11,rdx + + mul rbp + add r13,rax + mov rax,QWORD[((-8))+rsi] + adc rdx,0 + add r13,r10 + adc rdx,0 + mov QWORD[((-24))+r14],r13 + mov rdi,rdx + + mul rbx + add r11,rax + mov rax,QWORD[((-8))+rcx] + adc rdx,0 + mov r10,rdx + + mul rbp + add rdi,rax + mov rax,QWORD[r9*1+rsi] + adc rdx,0 + add rdi,r11 + adc rdx,0 + mov QWORD[((-16))+r14],rdi + mov r13,rdx + + lea rcx,[r9*1+rcx] + + xor rdi,rdi + add r13,r10 + adc rdi,0 + mov QWORD[((-8))+r14],r13 + + jmp NEAR $L$outer4x + +ALIGN 32 +$L$outer4x: + lea rdx,[((16+128))+r14] + pxor xmm4,xmm4 + pxor xmm5,xmm5 + movdqa xmm0,XMMWORD[((-128))+r12] + movdqa xmm1,XMMWORD[((-112))+r12] + movdqa xmm2,XMMWORD[((-96))+r12] + movdqa xmm3,XMMWORD[((-80))+r12] + pand xmm0,XMMWORD[((-128))+rdx] + pand 
xmm1,XMMWORD[((-112))+rdx] + por xmm4,xmm0 + pand xmm2,XMMWORD[((-96))+rdx] + por xmm5,xmm1 + pand xmm3,XMMWORD[((-80))+rdx] + por xmm4,xmm2 + por xmm5,xmm3 + movdqa xmm0,XMMWORD[((-64))+r12] + movdqa xmm1,XMMWORD[((-48))+r12] + movdqa xmm2,XMMWORD[((-32))+r12] + movdqa xmm3,XMMWORD[((-16))+r12] + pand xmm0,XMMWORD[((-64))+rdx] + pand xmm1,XMMWORD[((-48))+rdx] + por xmm4,xmm0 + pand xmm2,XMMWORD[((-32))+rdx] + por xmm5,xmm1 + pand xmm3,XMMWORD[((-16))+rdx] + por xmm4,xmm2 + por xmm5,xmm3 + movdqa xmm0,XMMWORD[r12] + movdqa xmm1,XMMWORD[16+r12] + movdqa xmm2,XMMWORD[32+r12] + movdqa xmm3,XMMWORD[48+r12] + pand xmm0,XMMWORD[rdx] + pand xmm1,XMMWORD[16+rdx] + por xmm4,xmm0 + pand xmm2,XMMWORD[32+rdx] + por xmm5,xmm1 + pand xmm3,XMMWORD[48+rdx] + por xmm4,xmm2 + por xmm5,xmm3 + movdqa xmm0,XMMWORD[64+r12] + movdqa xmm1,XMMWORD[80+r12] + movdqa xmm2,XMMWORD[96+r12] + movdqa xmm3,XMMWORD[112+r12] + pand xmm0,XMMWORD[64+rdx] + pand xmm1,XMMWORD[80+rdx] + por xmm4,xmm0 + pand xmm2,XMMWORD[96+rdx] + por xmm5,xmm1 + pand xmm3,XMMWORD[112+rdx] + por xmm4,xmm2 + por xmm5,xmm3 + por xmm4,xmm5 + + pshufd xmm0,xmm4,0x4e + por xmm0,xmm4 + lea r12,[256+r12] +DB 102,72,15,126,195 + + mov r10,QWORD[r9*1+r14] + mov rbp,r8 + mul rbx + add r10,rax + mov rax,QWORD[rcx] + adc rdx,0 + + imul rbp,r10 + mov r11,rdx + mov QWORD[r14],rdi + + lea r14,[r9*1+r14] + + mul rbp + add r10,rax + mov rax,QWORD[8+r9*1+rsi] + adc rdx,0 + mov rdi,rdx + + mul rbx + add r11,rax + mov rax,QWORD[8+rcx] + adc rdx,0 + add r11,QWORD[8+r14] + adc rdx,0 + mov r10,rdx + + mul rbp + add rdi,rax + mov rax,QWORD[16+r9*1+rsi] + adc rdx,0 + add rdi,r11 + lea r15,[32+r9] + lea rcx,[32+rcx] + adc rdx,0 + mov r13,rdx + jmp NEAR $L$inner4x + +ALIGN 32 +$L$inner4x: + mul rbx + add r10,rax + mov rax,QWORD[((-16))+rcx] + adc rdx,0 + add r10,QWORD[16+r14] + lea r14,[32+r14] + adc rdx,0 + mov r11,rdx + + mul rbp + add r13,rax + mov rax,QWORD[((-8))+r15*1+rsi] + adc rdx,0 + add r13,r10 + adc rdx,0 + mov QWORD[((-32))+r14],rdi + mov rdi,rdx + + mul rbx + add r11,rax + mov rax,QWORD[((-8))+rcx] + adc rdx,0 + add r11,QWORD[((-8))+r14] + adc rdx,0 + mov r10,rdx + + mul rbp + add rdi,rax + mov rax,QWORD[r15*1+rsi] + adc rdx,0 + add rdi,r11 + adc rdx,0 + mov QWORD[((-24))+r14],r13 + mov r13,rdx + + mul rbx + add r10,rax + mov rax,QWORD[rcx] + adc rdx,0 + add r10,QWORD[r14] + adc rdx,0 + mov r11,rdx + + mul rbp + add r13,rax + mov rax,QWORD[8+r15*1+rsi] + adc rdx,0 + add r13,r10 + adc rdx,0 + mov QWORD[((-16))+r14],rdi + mov rdi,rdx + + mul rbx + add r11,rax + mov rax,QWORD[8+rcx] + adc rdx,0 + add r11,QWORD[8+r14] + adc rdx,0 + mov r10,rdx + + mul rbp + add rdi,rax + mov rax,QWORD[16+r15*1+rsi] + adc rdx,0 + add rdi,r11 + lea rcx,[32+rcx] + adc rdx,0 + mov QWORD[((-8))+r14],r13 + mov r13,rdx + + add r15,32 + jnz NEAR $L$inner4x + + mul rbx + add r10,rax + mov rax,QWORD[((-16))+rcx] + adc rdx,0 + add r10,QWORD[16+r14] + lea r14,[32+r14] + adc rdx,0 + mov r11,rdx + + mul rbp + add r13,rax + mov rax,QWORD[((-8))+rsi] + adc rdx,0 + add r13,r10 + adc rdx,0 + mov QWORD[((-32))+r14],rdi + mov rdi,rdx + + mul rbx + add r11,rax + mov rax,rbp + mov rbp,QWORD[((-8))+rcx] + adc rdx,0 + add r11,QWORD[((-8))+r14] + adc rdx,0 + mov r10,rdx + + mul rbp + add rdi,rax + mov rax,QWORD[r9*1+rsi] + adc rdx,0 + add rdi,r11 + adc rdx,0 + mov QWORD[((-24))+r14],r13 + mov r13,rdx + + mov QWORD[((-16))+r14],rdi + lea rcx,[r9*1+rcx] + + xor rdi,rdi + add r13,r10 + adc rdi,0 + add r13,QWORD[r14] + adc rdi,0 + mov QWORD[((-8))+r14],r13 + + cmp r12,QWORD[((16+8))+rsp] + jb NEAR $L$outer4x 
+ xor rax,rax + sub rbp,r13 + adc r15,r15 + or rdi,r15 + sub rax,rdi + lea rbx,[r9*1+r14] + mov r12,QWORD[rcx] + lea rbp,[rcx] + mov rcx,r9 + sar rcx,3+2 + mov rdi,QWORD[((56+8))+rsp] + dec r12 + xor r10,r10 + mov r13,QWORD[8+rbp] + mov r14,QWORD[16+rbp] + mov r15,QWORD[24+rbp] + jmp NEAR $L$sqr4x_sub_entry + + +global bn_power5_nohw + +ALIGN 32 +bn_power5_nohw: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_bn_power5_nohw: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + mov r9,QWORD[48+rsp] + + + +_CET_ENDBR + mov rax,rsp + + push rbx + + push rbp + + push r12 + + push r13 + + push r14 + + push r15 + +$L$power5_prologue: + + + + + shl r9d,3 + lea r10d,[r9*2+r9] + neg r9 + mov r8,QWORD[r8] + + + + + + + + + lea r11,[((-320))+r9*2+rsp] + mov rbp,rsp + sub r11,rdi + and r11,4095 + cmp r10,r11 + jb NEAR $L$pwr_sp_alt + sub rbp,r11 + lea rbp,[((-320))+r9*2+rbp] + jmp NEAR $L$pwr_sp_done + +ALIGN 32 +$L$pwr_sp_alt: + lea r10,[((4096-320))+r9*2] + lea rbp,[((-320))+r9*2+rbp] + sub r11,r10 + mov r10,0 + cmovc r11,r10 + sub rbp,r11 +$L$pwr_sp_done: + and rbp,-64 + mov r11,rsp + sub r11,rbp + and r11,-4096 + lea rsp,[rbp*1+r11] + mov r10,QWORD[rsp] + cmp rsp,rbp + ja NEAR $L$pwr_page_walk + jmp NEAR $L$pwr_page_walk_done + +$L$pwr_page_walk: + lea rsp,[((-4096))+rsp] + mov r10,QWORD[rsp] + cmp rsp,rbp + ja NEAR $L$pwr_page_walk +$L$pwr_page_walk_done: + + mov r10,r9 + neg r9 + + + + + + + + + + + mov QWORD[32+rsp],r8 + mov QWORD[40+rsp],rax + +$L$power5_body: +DB 102,72,15,110,207 +DB 102,72,15,110,209 +DB 102,73,15,110,218 +DB 102,72,15,110,226 + + call __bn_sqr8x_internal + call __bn_post4x_internal + call __bn_sqr8x_internal + call __bn_post4x_internal + call __bn_sqr8x_internal + call __bn_post4x_internal + call __bn_sqr8x_internal + call __bn_post4x_internal + call __bn_sqr8x_internal + call __bn_post4x_internal + +DB 102,72,15,126,209 +DB 102,72,15,126,226 + mov rdi,rsi + mov rax,QWORD[40+rsp] + lea r8,[32+rsp] + + call mul4x_internal + + mov rsi,QWORD[40+rsp] + + mov rax,1 + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbp,QWORD[((-16))+rsi] + + mov rbx,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$power5_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_bn_power5_nohw: + +global bn_sqr8x_internal + + +ALIGN 32 +bn_sqr8x_internal: +__bn_sqr8x_internal: + +_CET_ENDBR + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + lea rbp,[32+r10] + lea rsi,[r9*1+rsi] + + mov rcx,r9 + + + mov r14,QWORD[((-32))+rbp*1+rsi] + lea rdi,[((48+8))+r9*2+rsp] + mov rax,QWORD[((-24))+rbp*1+rsi] + lea rdi,[((-32))+rbp*1+rdi] + mov rbx,QWORD[((-16))+rbp*1+rsi] + mov r15,rax + + mul r14 + mov r10,rax + mov rax,rbx + mov r11,rdx + mov QWORD[((-24))+rbp*1+rdi],r10 + + mul r14 + add r11,rax + mov rax,rbx + adc rdx,0 + mov QWORD[((-16))+rbp*1+rdi],r11 + mov r10,rdx + + + mov rbx,QWORD[((-8))+rbp*1+rsi] + mul r15 + mov r12,rax + mov rax,rbx + mov r13,rdx + + lea rcx,[rbp] + mul r14 + add r10,rax + mov rax,rbx + mov r11,rdx + adc r11,0 + add r10,r12 + adc r11,0 + mov QWORD[((-8))+rcx*1+rdi],r10 + jmp NEAR $L$sqr4x_1st + +ALIGN 32 +$L$sqr4x_1st: + mov rbx,QWORD[rcx*1+rsi] + mul r15 + add r13,rax + mov rax,rbx + mov r12,rdx + adc r12,0 + + mul r14 + add r11,rax + mov rax,rbx + mov rbx,QWORD[8+rcx*1+rsi] + mov r10,rdx + adc r10,0 + add 
r11,r13 + adc r10,0 + + + mul r15 + add r12,rax + mov rax,rbx + mov QWORD[rcx*1+rdi],r11 + mov r13,rdx + adc r13,0 + + mul r14 + add r10,rax + mov rax,rbx + mov rbx,QWORD[16+rcx*1+rsi] + mov r11,rdx + adc r11,0 + add r10,r12 + adc r11,0 + + mul r15 + add r13,rax + mov rax,rbx + mov QWORD[8+rcx*1+rdi],r10 + mov r12,rdx + adc r12,0 + + mul r14 + add r11,rax + mov rax,rbx + mov rbx,QWORD[24+rcx*1+rsi] + mov r10,rdx + adc r10,0 + add r11,r13 + adc r10,0 + + + mul r15 + add r12,rax + mov rax,rbx + mov QWORD[16+rcx*1+rdi],r11 + mov r13,rdx + adc r13,0 + lea rcx,[32+rcx] + + mul r14 + add r10,rax + mov rax,rbx + mov r11,rdx + adc r11,0 + add r10,r12 + adc r11,0 + mov QWORD[((-8))+rcx*1+rdi],r10 + + cmp rcx,0 + jne NEAR $L$sqr4x_1st + + mul r15 + add r13,rax + lea rbp,[16+rbp] + adc rdx,0 + add r13,r11 + adc rdx,0 + + mov QWORD[rdi],r13 + mov r12,rdx + mov QWORD[8+rdi],rdx + jmp NEAR $L$sqr4x_outer + +ALIGN 32 +$L$sqr4x_outer: + mov r14,QWORD[((-32))+rbp*1+rsi] + lea rdi,[((48+8))+r9*2+rsp] + mov rax,QWORD[((-24))+rbp*1+rsi] + lea rdi,[((-32))+rbp*1+rdi] + mov rbx,QWORD[((-16))+rbp*1+rsi] + mov r15,rax + + mul r14 + mov r10,QWORD[((-24))+rbp*1+rdi] + add r10,rax + mov rax,rbx + adc rdx,0 + mov QWORD[((-24))+rbp*1+rdi],r10 + mov r11,rdx + + mul r14 + add r11,rax + mov rax,rbx + adc rdx,0 + add r11,QWORD[((-16))+rbp*1+rdi] + mov r10,rdx + adc r10,0 + mov QWORD[((-16))+rbp*1+rdi],r11 + + xor r12,r12 + + mov rbx,QWORD[((-8))+rbp*1+rsi] + mul r15 + add r12,rax + mov rax,rbx + adc rdx,0 + add r12,QWORD[((-8))+rbp*1+rdi] + mov r13,rdx + adc r13,0 + + mul r14 + add r10,rax + mov rax,rbx + adc rdx,0 + add r10,r12 + mov r11,rdx + adc r11,0 + mov QWORD[((-8))+rbp*1+rdi],r10 + + lea rcx,[rbp] + jmp NEAR $L$sqr4x_inner + +ALIGN 32 +$L$sqr4x_inner: + mov rbx,QWORD[rcx*1+rsi] + mul r15 + add r13,rax + mov rax,rbx + mov r12,rdx + adc r12,0 + add r13,QWORD[rcx*1+rdi] + adc r12,0 + + DB 0x67 + mul r14 + add r11,rax + mov rax,rbx + mov rbx,QWORD[8+rcx*1+rsi] + mov r10,rdx + adc r10,0 + add r11,r13 + adc r10,0 + + mul r15 + add r12,rax + mov QWORD[rcx*1+rdi],r11 + mov rax,rbx + mov r13,rdx + adc r13,0 + add r12,QWORD[8+rcx*1+rdi] + lea rcx,[16+rcx] + adc r13,0 + + mul r14 + add r10,rax + mov rax,rbx + adc rdx,0 + add r10,r12 + mov r11,rdx + adc r11,0 + mov QWORD[((-8))+rcx*1+rdi],r10 + + cmp rcx,0 + jne NEAR $L$sqr4x_inner + + DB 0x67 + mul r15 + add r13,rax + adc rdx,0 + add r13,r11 + adc rdx,0 + + mov QWORD[rdi],r13 + mov r12,rdx + mov QWORD[8+rdi],rdx + + add rbp,16 + jnz NEAR $L$sqr4x_outer + + + mov r14,QWORD[((-32))+rsi] + lea rdi,[((48+8))+r9*2+rsp] + mov rax,QWORD[((-24))+rsi] + lea rdi,[((-32))+rbp*1+rdi] + mov rbx,QWORD[((-16))+rsi] + mov r15,rax + + mul r14 + add r10,rax + mov rax,rbx + mov r11,rdx + adc r11,0 + + mul r14 + add r11,rax + mov rax,rbx + mov QWORD[((-24))+rdi],r10 + mov r10,rdx + adc r10,0 + add r11,r13 + mov rbx,QWORD[((-8))+rsi] + adc r10,0 + + mul r15 + add r12,rax + mov rax,rbx + mov QWORD[((-16))+rdi],r11 + mov r13,rdx + adc r13,0 + + mul r14 + add r10,rax + mov rax,rbx + mov r11,rdx + adc r11,0 + add r10,r12 + adc r11,0 + mov QWORD[((-8))+rdi],r10 + + mul r15 + add r13,rax + mov rax,QWORD[((-16))+rsi] + adc rdx,0 + add r13,r11 + adc rdx,0 + + mov QWORD[rdi],r13 + mov r12,rdx + mov QWORD[8+rdi],rdx + + mul rbx + add rbp,16 + xor r14,r14 + sub rbp,r9 + xor r15,r15 + + add rax,r12 + adc rdx,0 + mov QWORD[8+rdi],rax + mov QWORD[16+rdi],rdx + mov QWORD[24+rdi],r15 + + mov rax,QWORD[((-16))+rbp*1+rsi] + lea rdi,[((48+8))+rsp] + xor r10,r10 + mov r11,QWORD[8+rdi] + + lea r12,[r10*2+r14] + shr 
r10,63 + lea r13,[r11*2+rcx] + shr r11,63 + or r13,r10 + mov r10,QWORD[16+rdi] + mov r14,r11 + mul rax + neg r15 + mov r11,QWORD[24+rdi] + adc r12,rax + mov rax,QWORD[((-8))+rbp*1+rsi] + mov QWORD[rdi],r12 + adc r13,rdx + + lea rbx,[r10*2+r14] + mov QWORD[8+rdi],r13 + sbb r15,r15 + shr r10,63 + lea r8,[r11*2+rcx] + shr r11,63 + or r8,r10 + mov r10,QWORD[32+rdi] + mov r14,r11 + mul rax + neg r15 + mov r11,QWORD[40+rdi] + adc rbx,rax + mov rax,QWORD[rbp*1+rsi] + mov QWORD[16+rdi],rbx + adc r8,rdx + lea rbp,[16+rbp] + mov QWORD[24+rdi],r8 + sbb r15,r15 + lea rdi,[64+rdi] + jmp NEAR $L$sqr4x_shift_n_add + +ALIGN 32 +$L$sqr4x_shift_n_add: + lea r12,[r10*2+r14] + shr r10,63 + lea r13,[r11*2+rcx] + shr r11,63 + or r13,r10 + mov r10,QWORD[((-16))+rdi] + mov r14,r11 + mul rax + neg r15 + mov r11,QWORD[((-8))+rdi] + adc r12,rax + mov rax,QWORD[((-8))+rbp*1+rsi] + mov QWORD[((-32))+rdi],r12 + adc r13,rdx + + lea rbx,[r10*2+r14] + mov QWORD[((-24))+rdi],r13 + sbb r15,r15 + shr r10,63 + lea r8,[r11*2+rcx] + shr r11,63 + or r8,r10 + mov r10,QWORD[rdi] + mov r14,r11 + mul rax + neg r15 + mov r11,QWORD[8+rdi] + adc rbx,rax + mov rax,QWORD[rbp*1+rsi] + mov QWORD[((-16))+rdi],rbx + adc r8,rdx + + lea r12,[r10*2+r14] + mov QWORD[((-8))+rdi],r8 + sbb r15,r15 + shr r10,63 + lea r13,[r11*2+rcx] + shr r11,63 + or r13,r10 + mov r10,QWORD[16+rdi] + mov r14,r11 + mul rax + neg r15 + mov r11,QWORD[24+rdi] + adc r12,rax + mov rax,QWORD[8+rbp*1+rsi] + mov QWORD[rdi],r12 + adc r13,rdx + + lea rbx,[r10*2+r14] + mov QWORD[8+rdi],r13 + sbb r15,r15 + shr r10,63 + lea r8,[r11*2+rcx] + shr r11,63 + or r8,r10 + mov r10,QWORD[32+rdi] + mov r14,r11 + mul rax + neg r15 + mov r11,QWORD[40+rdi] + adc rbx,rax + mov rax,QWORD[16+rbp*1+rsi] + mov QWORD[16+rdi],rbx + adc r8,rdx + mov QWORD[24+rdi],r8 + sbb r15,r15 + lea rdi,[64+rdi] + add rbp,32 + jnz NEAR $L$sqr4x_shift_n_add + + lea r12,[r10*2+r14] + DB 0x67 + shr r10,63 + lea r13,[r11*2+rcx] + shr r11,63 + or r13,r10 + mov r10,QWORD[((-16))+rdi] + mov r14,r11 + mul rax + neg r15 + mov r11,QWORD[((-8))+rdi] + adc r12,rax + mov rax,QWORD[((-8))+rsi] + mov QWORD[((-32))+rdi],r12 + adc r13,rdx + + lea rbx,[r10*2+r14] + mov QWORD[((-24))+rdi],r13 + sbb r15,r15 + shr r10,63 + lea r8,[r11*2+rcx] + shr r11,63 + or r8,r10 + mul rax + neg r15 + adc rbx,rax + adc r8,rdx + mov QWORD[((-16))+rdi],rbx + mov QWORD[((-8))+rdi],r8 +DB 102,72,15,126,213 +__bn_sqr8x_reduction: + xor rax,rax + lea rcx,[rbp*1+r9] + lea rdx,[((48+8))+r9*2+rsp] + mov QWORD[((0+8))+rsp],rcx + lea rdi,[((48+8))+r9*1+rsp] + mov QWORD[((8+8))+rsp],rdx + neg r9 + jmp NEAR $L$8x_reduction_loop + +ALIGN 32 +$L$8x_reduction_loop: + lea rdi,[r9*1+rdi] + DB 0x66 + mov rbx,QWORD[rdi] + mov r9,QWORD[8+rdi] + mov r10,QWORD[16+rdi] + mov r11,QWORD[24+rdi] + mov r12,QWORD[32+rdi] + mov r13,QWORD[40+rdi] + mov r14,QWORD[48+rdi] + mov r15,QWORD[56+rdi] + mov QWORD[rdx],rax + lea rdi,[64+rdi] + + DB 0x67 + mov r8,rbx + imul rbx,QWORD[((32+8))+rsp] + mov rax,QWORD[rbp] + mov ecx,8 + jmp NEAR $L$8x_reduce + +ALIGN 32 +$L$8x_reduce: + mul rbx + mov rax,QWORD[8+rbp] + neg r8 + mov r8,rdx + adc r8,0 + + mul rbx + add r9,rax + mov rax,QWORD[16+rbp] + adc rdx,0 + add r8,r9 + mov QWORD[((48-8+8))+rcx*8+rsp],rbx + mov r9,rdx + adc r9,0 + + mul rbx + add r10,rax + mov rax,QWORD[24+rbp] + adc rdx,0 + add r9,r10 + mov rsi,QWORD[((32+8))+rsp] + mov r10,rdx + adc r10,0 + + mul rbx + add r11,rax + mov rax,QWORD[32+rbp] + adc rdx,0 + imul rsi,r8 + add r10,r11 + mov r11,rdx + adc r11,0 + + mul rbx + add r12,rax + mov rax,QWORD[40+rbp] + adc rdx,0 + add 
r11,r12 + mov r12,rdx + adc r12,0 + + mul rbx + add r13,rax + mov rax,QWORD[48+rbp] + adc rdx,0 + add r12,r13 + mov r13,rdx + adc r13,0 + + mul rbx + add r14,rax + mov rax,QWORD[56+rbp] + adc rdx,0 + add r13,r14 + mov r14,rdx + adc r14,0 + + mul rbx + mov rbx,rsi + add r15,rax + mov rax,QWORD[rbp] + adc rdx,0 + add r14,r15 + mov r15,rdx + adc r15,0 + + dec ecx + jnz NEAR $L$8x_reduce + + lea rbp,[64+rbp] + xor rax,rax + mov rdx,QWORD[((8+8))+rsp] + cmp rbp,QWORD[((0+8))+rsp] + jae NEAR $L$8x_no_tail + + DB 0x66 + add r8,QWORD[rdi] + adc r9,QWORD[8+rdi] + adc r10,QWORD[16+rdi] + adc r11,QWORD[24+rdi] + adc r12,QWORD[32+rdi] + adc r13,QWORD[40+rdi] + adc r14,QWORD[48+rdi] + adc r15,QWORD[56+rdi] + sbb rsi,rsi + + mov rbx,QWORD[((48+56+8))+rsp] + mov ecx,8 + mov rax,QWORD[rbp] + jmp NEAR $L$8x_tail + +ALIGN 32 +$L$8x_tail: + mul rbx + add r8,rax + mov rax,QWORD[8+rbp] + mov QWORD[rdi],r8 + mov r8,rdx + adc r8,0 + + mul rbx + add r9,rax + mov rax,QWORD[16+rbp] + adc rdx,0 + add r8,r9 + lea rdi,[8+rdi] + mov r9,rdx + adc r9,0 + + mul rbx + add r10,rax + mov rax,QWORD[24+rbp] + adc rdx,0 + add r9,r10 + mov r10,rdx + adc r10,0 + + mul rbx + add r11,rax + mov rax,QWORD[32+rbp] + adc rdx,0 + add r10,r11 + mov r11,rdx + adc r11,0 + + mul rbx + add r12,rax + mov rax,QWORD[40+rbp] + adc rdx,0 + add r11,r12 + mov r12,rdx + adc r12,0 + + mul rbx + add r13,rax + mov rax,QWORD[48+rbp] + adc rdx,0 + add r12,r13 + mov r13,rdx + adc r13,0 + + mul rbx + add r14,rax + mov rax,QWORD[56+rbp] + adc rdx,0 + add r13,r14 + mov r14,rdx + adc r14,0 + + mul rbx + mov rbx,QWORD[((48-16+8))+rcx*8+rsp] + add r15,rax + adc rdx,0 + add r14,r15 + mov rax,QWORD[rbp] + mov r15,rdx + adc r15,0 + + dec ecx + jnz NEAR $L$8x_tail + + lea rbp,[64+rbp] + mov rdx,QWORD[((8+8))+rsp] + cmp rbp,QWORD[((0+8))+rsp] + jae NEAR $L$8x_tail_done + + mov rbx,QWORD[((48+56+8))+rsp] + neg rsi + mov rax,QWORD[rbp] + adc r8,QWORD[rdi] + adc r9,QWORD[8+rdi] + adc r10,QWORD[16+rdi] + adc r11,QWORD[24+rdi] + adc r12,QWORD[32+rdi] + adc r13,QWORD[40+rdi] + adc r14,QWORD[48+rdi] + adc r15,QWORD[56+rdi] + sbb rsi,rsi + + mov ecx,8 + jmp NEAR $L$8x_tail + +ALIGN 32 +$L$8x_tail_done: + xor rax,rax + add r8,QWORD[rdx] + adc r9,0 + adc r10,0 + adc r11,0 + adc r12,0 + adc r13,0 + adc r14,0 + adc r15,0 + adc rax,0 + + neg rsi +$L$8x_no_tail: + adc r8,QWORD[rdi] + adc r9,QWORD[8+rdi] + adc r10,QWORD[16+rdi] + adc r11,QWORD[24+rdi] + adc r12,QWORD[32+rdi] + adc r13,QWORD[40+rdi] + adc r14,QWORD[48+rdi] + adc r15,QWORD[56+rdi] + adc rax,0 + mov rcx,QWORD[((-8))+rbp] + xor rsi,rsi + +DB 102,72,15,126,213 + + mov QWORD[rdi],r8 + mov QWORD[8+rdi],r9 +DB 102,73,15,126,217 + mov QWORD[16+rdi],r10 + mov QWORD[24+rdi],r11 + mov QWORD[32+rdi],r12 + mov QWORD[40+rdi],r13 + mov QWORD[48+rdi],r14 + mov QWORD[56+rdi],r15 + lea rdi,[64+rdi] + + cmp rdi,rdx + jb NEAR $L$8x_reduction_loop + ret + + + +ALIGN 32 +__bn_post4x_internal: + + mov r12,QWORD[rbp] + lea rbx,[r9*1+rdi] + mov rcx,r9 +DB 102,72,15,126,207 + neg rax +DB 102,72,15,126,206 + sar rcx,3+2 + dec r12 + xor r10,r10 + mov r13,QWORD[8+rbp] + mov r14,QWORD[16+rbp] + mov r15,QWORD[24+rbp] + jmp NEAR $L$sqr4x_sub_entry + +ALIGN 16 +$L$sqr4x_sub: + mov r12,QWORD[rbp] + mov r13,QWORD[8+rbp] + mov r14,QWORD[16+rbp] + mov r15,QWORD[24+rbp] +$L$sqr4x_sub_entry: + lea rbp,[32+rbp] + not r12 + not r13 + not r14 + not r15 + and r12,rax + and r13,rax + and r14,rax + and r15,rax + + neg r10 + adc r12,QWORD[rbx] + adc r13,QWORD[8+rbx] + adc r14,QWORD[16+rbx] + adc r15,QWORD[24+rbx] + mov QWORD[rdi],r12 + lea rbx,[32+rbx] + mov 
QWORD[8+rdi],r13 + sbb r10,r10 + mov QWORD[16+rdi],r14 + mov QWORD[24+rdi],r15 + lea rdi,[32+rdi] + + inc rcx + jnz NEAR $L$sqr4x_sub + + mov r10,r9 + neg r9 + ret + + +global bn_mulx4x_mont_gather5 + +ALIGN 32 +bn_mulx4x_mont_gather5: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_bn_mulx4x_mont_gather5: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + mov r9,QWORD[48+rsp] + + + +_CET_ENDBR + mov rax,rsp + + push rbx + + push rbp + + push r12 + + push r13 + + push r14 + + push r15 + +$L$mulx4x_prologue: + + + + + shl r9d,3 + lea r10,[r9*2+r9] + neg r9 + mov r8,QWORD[r8] + + + + + + + + + + + lea r11,[((-320))+r9*2+rsp] + mov rbp,rsp + sub r11,rdi + and r11,4095 + cmp r10,r11 + jb NEAR $L$mulx4xsp_alt + sub rbp,r11 + lea rbp,[((-320))+r9*2+rbp] + jmp NEAR $L$mulx4xsp_done + +$L$mulx4xsp_alt: + lea r10,[((4096-320))+r9*2] + lea rbp,[((-320))+r9*2+rbp] + sub r11,r10 + mov r10,0 + cmovc r11,r10 + sub rbp,r11 +$L$mulx4xsp_done: + and rbp,-64 + mov r11,rsp + sub r11,rbp + and r11,-4096 + lea rsp,[rbp*1+r11] + mov r10,QWORD[rsp] + cmp rsp,rbp + ja NEAR $L$mulx4x_page_walk + jmp NEAR $L$mulx4x_page_walk_done + +$L$mulx4x_page_walk: + lea rsp,[((-4096))+rsp] + mov r10,QWORD[rsp] + cmp rsp,rbp + ja NEAR $L$mulx4x_page_walk +$L$mulx4x_page_walk_done: + + + + + + + + + + + + + + mov QWORD[32+rsp],r8 + mov QWORD[40+rsp],rax + +$L$mulx4x_body: + call mulx4x_internal + + mov rsi,QWORD[40+rsp] + + mov rax,1 + + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbp,QWORD[((-16))+rsi] + + mov rbx,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$mulx4x_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_bn_mulx4x_mont_gather5: + + +ALIGN 32 +mulx4x_internal: + + mov QWORD[8+rsp],r9 + mov r10,r9 + neg r9 + shl r9,5 + neg r10 + lea r13,[128+r9*1+rdx] + shr r9,5+5 + movd xmm5,DWORD[56+rax] + sub r9,1 + lea rax,[$L$inc] + mov QWORD[((16+8))+rsp],r13 + mov QWORD[((24+8))+rsp],r9 + mov QWORD[((56+8))+rsp],rdi + movdqa xmm0,XMMWORD[rax] + movdqa xmm1,XMMWORD[16+rax] + lea r10,[((88-112))+r10*1+rsp] + lea rdi,[128+rdx] + + pshufd xmm5,xmm5,0 + movdqa xmm4,xmm1 + DB 0x67 + movdqa xmm2,xmm1 + DB 0x67 + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 + movdqa xmm3,xmm4 + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm5 + movdqa XMMWORD[112+r10],xmm0 + movdqa xmm0,xmm4 + + paddd xmm3,xmm2 + pcmpeqd xmm2,xmm5 + movdqa XMMWORD[128+r10],xmm1 + movdqa xmm1,xmm4 + + paddd xmm0,xmm3 + pcmpeqd xmm3,xmm5 + movdqa XMMWORD[144+r10],xmm2 + movdqa xmm2,xmm4 + + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 + movdqa XMMWORD[160+r10],xmm3 + movdqa xmm3,xmm4 + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm5 + movdqa XMMWORD[176+r10],xmm0 + movdqa xmm0,xmm4 + + paddd xmm3,xmm2 + pcmpeqd xmm2,xmm5 + movdqa XMMWORD[192+r10],xmm1 + movdqa xmm1,xmm4 + + paddd xmm0,xmm3 + pcmpeqd xmm3,xmm5 + movdqa XMMWORD[208+r10],xmm2 + movdqa xmm2,xmm4 + + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 + movdqa XMMWORD[224+r10],xmm3 + movdqa xmm3,xmm4 + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm5 + movdqa XMMWORD[240+r10],xmm0 + movdqa xmm0,xmm4 + + paddd xmm3,xmm2 + pcmpeqd xmm2,xmm5 + movdqa XMMWORD[256+r10],xmm1 + movdqa xmm1,xmm4 + + paddd xmm0,xmm3 + pcmpeqd xmm3,xmm5 + movdqa XMMWORD[272+r10],xmm2 + movdqa xmm2,xmm4 + + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 + movdqa XMMWORD[288+r10],xmm3 + movdqa xmm3,xmm4 + DB 0x67 + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm5 + movdqa XMMWORD[304+r10],xmm0 + + paddd xmm3,xmm2 + pcmpeqd xmm2,xmm5 + movdqa 
XMMWORD[320+r10],xmm1 + + pcmpeqd xmm3,xmm5 + movdqa XMMWORD[336+r10],xmm2 + + pand xmm0,XMMWORD[64+rdi] + pand xmm1,XMMWORD[80+rdi] + pand xmm2,XMMWORD[96+rdi] + movdqa XMMWORD[352+r10],xmm3 + pand xmm3,XMMWORD[112+rdi] + por xmm0,xmm2 + por xmm1,xmm3 + movdqa xmm4,XMMWORD[((-128))+rdi] + movdqa xmm5,XMMWORD[((-112))+rdi] + movdqa xmm2,XMMWORD[((-96))+rdi] + pand xmm4,XMMWORD[112+r10] + movdqa xmm3,XMMWORD[((-80))+rdi] + pand xmm5,XMMWORD[128+r10] + por xmm0,xmm4 + pand xmm2,XMMWORD[144+r10] + por xmm1,xmm5 + pand xmm3,XMMWORD[160+r10] + por xmm0,xmm2 + por xmm1,xmm3 + movdqa xmm4,XMMWORD[((-64))+rdi] + movdqa xmm5,XMMWORD[((-48))+rdi] + movdqa xmm2,XMMWORD[((-32))+rdi] + pand xmm4,XMMWORD[176+r10] + movdqa xmm3,XMMWORD[((-16))+rdi] + pand xmm5,XMMWORD[192+r10] + por xmm0,xmm4 + pand xmm2,XMMWORD[208+r10] + por xmm1,xmm5 + pand xmm3,XMMWORD[224+r10] + por xmm0,xmm2 + por xmm1,xmm3 + movdqa xmm4,XMMWORD[rdi] + movdqa xmm5,XMMWORD[16+rdi] + movdqa xmm2,XMMWORD[32+rdi] + pand xmm4,XMMWORD[240+r10] + movdqa xmm3,XMMWORD[48+rdi] + pand xmm5,XMMWORD[256+r10] + por xmm0,xmm4 + pand xmm2,XMMWORD[272+r10] + por xmm1,xmm5 + pand xmm3,XMMWORD[288+r10] + por xmm0,xmm2 + por xmm1,xmm3 + pxor xmm0,xmm1 + + pshufd xmm1,xmm0,0x4e + por xmm0,xmm1 + lea rdi,[256+rdi] +DB 102,72,15,126,194 + lea rbx,[((64+32+8))+rsp] + + mov r9,rdx + mulx rax,r8,QWORD[rsi] + mulx r12,r11,QWORD[8+rsi] + add r11,rax + mulx r13,rax,QWORD[16+rsi] + adc r12,rax + adc r13,0 + mulx r14,rax,QWORD[24+rsi] + + mov r15,r8 + imul r8,QWORD[((32+8))+rsp] + xor rbp,rbp + mov rdx,r8 + + mov QWORD[((8+8))+rsp],rdi + + lea rsi,[32+rsi] + adcx r13,rax + adcx r14,rbp + + mulx r10,rax,QWORD[rcx] + adcx r15,rax + adox r10,r11 + mulx r11,rax,QWORD[8+rcx] + adcx r10,rax + adox r11,r12 + mulx r12,rax,QWORD[16+rcx] + mov rdi,QWORD[((24+8))+rsp] + mov QWORD[((-32))+rbx],r10 + adcx r11,rax + adox r12,r13 + mulx r15,rax,QWORD[24+rcx] + mov rdx,r9 + mov QWORD[((-24))+rbx],r11 + adcx r12,rax + adox r15,rbp + lea rcx,[32+rcx] + mov QWORD[((-16))+rbx],r12 + jmp NEAR $L$mulx4x_1st + +ALIGN 32 +$L$mulx4x_1st: + adcx r15,rbp + mulx rax,r10,QWORD[rsi] + adcx r10,r14 + mulx r14,r11,QWORD[8+rsi] + adcx r11,rax + mulx rax,r12,QWORD[16+rsi] + adcx r12,r14 + mulx r14,r13,QWORD[24+rsi] + DB 0x67,0x67 + mov rdx,r8 + adcx r13,rax + adcx r14,rbp + lea rsi,[32+rsi] + lea rbx,[32+rbx] + + adox r10,r15 + mulx r15,rax,QWORD[rcx] + adcx r10,rax + adox r11,r15 + mulx r15,rax,QWORD[8+rcx] + adcx r11,rax + adox r12,r15 + mulx r15,rax,QWORD[16+rcx] + mov QWORD[((-40))+rbx],r10 + adcx r12,rax + mov QWORD[((-32))+rbx],r11 + adox r13,r15 + mulx r15,rax,QWORD[24+rcx] + mov rdx,r9 + mov QWORD[((-24))+rbx],r12 + adcx r13,rax + adox r15,rbp + lea rcx,[32+rcx] + mov QWORD[((-16))+rbx],r13 + + dec rdi + jnz NEAR $L$mulx4x_1st + + mov rax,QWORD[8+rsp] + adc r15,rbp + lea rsi,[rax*1+rsi] + add r14,r15 + mov rdi,QWORD[((8+8))+rsp] + adc rbp,rbp + mov QWORD[((-8))+rbx],r14 + jmp NEAR $L$mulx4x_outer + +ALIGN 32 +$L$mulx4x_outer: + lea r10,[((16-256))+rbx] + pxor xmm4,xmm4 + DB 0x67,0x67 + pxor xmm5,xmm5 + movdqa xmm0,XMMWORD[((-128))+rdi] + movdqa xmm1,XMMWORD[((-112))+rdi] + movdqa xmm2,XMMWORD[((-96))+rdi] + pand xmm0,XMMWORD[256+r10] + movdqa xmm3,XMMWORD[((-80))+rdi] + pand xmm1,XMMWORD[272+r10] + por xmm4,xmm0 + pand xmm2,XMMWORD[288+r10] + por xmm5,xmm1 + pand xmm3,XMMWORD[304+r10] + por xmm4,xmm2 + por xmm5,xmm3 + movdqa xmm0,XMMWORD[((-64))+rdi] + movdqa xmm1,XMMWORD[((-48))+rdi] + movdqa xmm2,XMMWORD[((-32))+rdi] + pand xmm0,XMMWORD[320+r10] + movdqa xmm3,XMMWORD[((-16))+rdi] + 
pand xmm1,XMMWORD[336+r10] + por xmm4,xmm0 + pand xmm2,XMMWORD[352+r10] + por xmm5,xmm1 + pand xmm3,XMMWORD[368+r10] + por xmm4,xmm2 + por xmm5,xmm3 + movdqa xmm0,XMMWORD[rdi] + movdqa xmm1,XMMWORD[16+rdi] + movdqa xmm2,XMMWORD[32+rdi] + pand xmm0,XMMWORD[384+r10] + movdqa xmm3,XMMWORD[48+rdi] + pand xmm1,XMMWORD[400+r10] + por xmm4,xmm0 + pand xmm2,XMMWORD[416+r10] + por xmm5,xmm1 + pand xmm3,XMMWORD[432+r10] + por xmm4,xmm2 + por xmm5,xmm3 + movdqa xmm0,XMMWORD[64+rdi] + movdqa xmm1,XMMWORD[80+rdi] + movdqa xmm2,XMMWORD[96+rdi] + pand xmm0,XMMWORD[448+r10] + movdqa xmm3,XMMWORD[112+rdi] + pand xmm1,XMMWORD[464+r10] + por xmm4,xmm0 + pand xmm2,XMMWORD[480+r10] + por xmm5,xmm1 + pand xmm3,XMMWORD[496+r10] + por xmm4,xmm2 + por xmm5,xmm3 + por xmm4,xmm5 + + pshufd xmm0,xmm4,0x4e + por xmm0,xmm4 + lea rdi,[256+rdi] +DB 102,72,15,126,194 + + mov QWORD[rbx],rbp + lea rbx,[32+rax*1+rbx] + mulx r11,r8,QWORD[rsi] + xor rbp,rbp + mov r9,rdx + mulx r12,r14,QWORD[8+rsi] + adox r8,QWORD[((-32))+rbx] + adcx r11,r14 + mulx r13,r15,QWORD[16+rsi] + adox r11,QWORD[((-24))+rbx] + adcx r12,r15 + mulx r14,rdx,QWORD[24+rsi] + adox r12,QWORD[((-16))+rbx] + adcx r13,rdx + lea rcx,[rax*1+rcx] + lea rsi,[32+rsi] + adox r13,QWORD[((-8))+rbx] + adcx r14,rbp + adox r14,rbp + + mov r15,r8 + imul r8,QWORD[((32+8))+rsp] + + mov rdx,r8 + xor rbp,rbp + mov QWORD[((8+8))+rsp],rdi + + mulx r10,rax,QWORD[rcx] + adcx r15,rax + adox r10,r11 + mulx r11,rax,QWORD[8+rcx] + adcx r10,rax + adox r11,r12 + mulx r12,rax,QWORD[16+rcx] + adcx r11,rax + adox r12,r13 + mulx r15,rax,QWORD[24+rcx] + mov rdx,r9 + mov rdi,QWORD[((24+8))+rsp] + mov QWORD[((-32))+rbx],r10 + adcx r12,rax + mov QWORD[((-24))+rbx],r11 + adox r15,rbp + mov QWORD[((-16))+rbx],r12 + lea rcx,[32+rcx] + jmp NEAR $L$mulx4x_inner + +ALIGN 32 +$L$mulx4x_inner: + mulx rax,r10,QWORD[rsi] + adcx r15,rbp + adox r10,r14 + mulx r14,r11,QWORD[8+rsi] + adcx r10,QWORD[rbx] + adox r11,rax + mulx rax,r12,QWORD[16+rsi] + adcx r11,QWORD[8+rbx] + adox r12,r14 + mulx r14,r13,QWORD[24+rsi] + mov rdx,r8 + adcx r12,QWORD[16+rbx] + adox r13,rax + adcx r13,QWORD[24+rbx] + adox r14,rbp + lea rsi,[32+rsi] + lea rbx,[32+rbx] + adcx r14,rbp + + adox r10,r15 + mulx r15,rax,QWORD[rcx] + adcx r10,rax + adox r11,r15 + mulx r15,rax,QWORD[8+rcx] + adcx r11,rax + adox r12,r15 + mulx r15,rax,QWORD[16+rcx] + mov QWORD[((-40))+rbx],r10 + adcx r12,rax + adox r13,r15 + mov QWORD[((-32))+rbx],r11 + mulx r15,rax,QWORD[24+rcx] + mov rdx,r9 + lea rcx,[32+rcx] + mov QWORD[((-24))+rbx],r12 + adcx r13,rax + adox r15,rbp + mov QWORD[((-16))+rbx],r13 + + dec rdi + jnz NEAR $L$mulx4x_inner + + mov rax,QWORD[((0+8))+rsp] + adc r15,rbp + sub rdi,QWORD[rbx] + mov rdi,QWORD[((8+8))+rsp] + mov r10,QWORD[((16+8))+rsp] + adc r14,r15 + lea rsi,[rax*1+rsi] + adc rbp,rbp + mov QWORD[((-8))+rbx],r14 + + cmp rdi,r10 + jb NEAR $L$mulx4x_outer + + mov r10,QWORD[((-8))+rcx] + mov r8,rbp + mov r12,QWORD[rax*1+rcx] + lea rbp,[rax*1+rcx] + mov rcx,rax + lea rdi,[rax*1+rbx] + xor eax,eax + xor r15,r15 + sub r10,r14 + adc r15,r15 + or r8,r15 + sar rcx,3+2 + sub rax,r8 + mov rdx,QWORD[((56+8))+rsp] + dec r12 + mov r13,QWORD[8+rbp] + xor r8,r8 + mov r14,QWORD[16+rbp] + mov r15,QWORD[24+rbp] + jmp NEAR $L$sqrx4x_sub_entry + + +global bn_powerx5 + +ALIGN 32 +bn_powerx5: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_bn_powerx5: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + mov r9,QWORD[48+rsp] + + + +_CET_ENDBR + mov rax,rsp + + push rbx + + push rbp + + push r12 
+ + push r13 + + push r14 + + push r15 + +$L$powerx5_prologue: + + + + + shl r9d,3 + lea r10,[r9*2+r9] + neg r9 + mov r8,QWORD[r8] + + + + + + + + + lea r11,[((-320))+r9*2+rsp] + mov rbp,rsp + sub r11,rdi + and r11,4095 + cmp r10,r11 + jb NEAR $L$pwrx_sp_alt + sub rbp,r11 + lea rbp,[((-320))+r9*2+rbp] + jmp NEAR $L$pwrx_sp_done + +ALIGN 32 +$L$pwrx_sp_alt: + lea r10,[((4096-320))+r9*2] + lea rbp,[((-320))+r9*2+rbp] + sub r11,r10 + mov r10,0 + cmovc r11,r10 + sub rbp,r11 +$L$pwrx_sp_done: + and rbp,-64 + mov r11,rsp + sub r11,rbp + and r11,-4096 + lea rsp,[rbp*1+r11] + mov r10,QWORD[rsp] + cmp rsp,rbp + ja NEAR $L$pwrx_page_walk + jmp NEAR $L$pwrx_page_walk_done + +$L$pwrx_page_walk: + lea rsp,[((-4096))+rsp] + mov r10,QWORD[rsp] + cmp rsp,rbp + ja NEAR $L$pwrx_page_walk +$L$pwrx_page_walk_done: + + mov r10,r9 + neg r9 + + + + + + + + + + + + + pxor xmm0,xmm0 +DB 102,72,15,110,207 +DB 102,72,15,110,209 +DB 102,73,15,110,218 +DB 102,72,15,110,226 + mov QWORD[32+rsp],r8 + mov QWORD[40+rsp],rax + +$L$powerx5_body: + + call __bn_sqrx8x_internal + call __bn_postx4x_internal + call __bn_sqrx8x_internal + call __bn_postx4x_internal + call __bn_sqrx8x_internal + call __bn_postx4x_internal + call __bn_sqrx8x_internal + call __bn_postx4x_internal + call __bn_sqrx8x_internal + call __bn_postx4x_internal + + mov r9,r10 + mov rdi,rsi +DB 102,72,15,126,209 +DB 102,72,15,126,226 + mov rax,QWORD[40+rsp] + + call mulx4x_internal + + mov rsi,QWORD[40+rsp] + + mov rax,1 + + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbp,QWORD[((-16))+rsi] + + mov rbx,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$powerx5_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_bn_powerx5: + +global bn_sqrx8x_internal + + +ALIGN 32 +bn_sqrx8x_internal: +__bn_sqrx8x_internal: + +_CET_ENDBR + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + lea rdi,[((48+8))+rsp] + lea rbp,[r9*1+rsi] + mov QWORD[((0+8))+rsp],r9 + mov QWORD[((8+8))+rsp],rbp + jmp NEAR $L$sqr8x_zero_start + +ALIGN 32 + DB 0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00 +$L$sqrx8x_zero: + DB 0x3e + movdqa XMMWORD[rdi],xmm0 + movdqa XMMWORD[16+rdi],xmm0 + movdqa XMMWORD[32+rdi],xmm0 + movdqa XMMWORD[48+rdi],xmm0 +$L$sqr8x_zero_start: + movdqa XMMWORD[64+rdi],xmm0 + movdqa XMMWORD[80+rdi],xmm0 + movdqa XMMWORD[96+rdi],xmm0 + movdqa XMMWORD[112+rdi],xmm0 + lea rdi,[128+rdi] + sub r9,64 + jnz NEAR $L$sqrx8x_zero + + mov rdx,QWORD[rsi] + + xor r10,r10 + xor r11,r11 + xor r12,r12 + xor r13,r13 + xor r14,r14 + xor r15,r15 + lea rdi,[((48+8))+rsp] + xor rbp,rbp + jmp NEAR $L$sqrx8x_outer_loop + +ALIGN 32 +$L$sqrx8x_outer_loop: + mulx rax,r8,QWORD[8+rsi] + adcx r8,r9 + adox r10,rax + mulx rax,r9,QWORD[16+rsi] + adcx r9,r10 + adox r11,rax + DB 0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00 + adcx r10,r11 + adox r12,rax + DB 0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00 + adcx r11,r12 + adox r13,rax + mulx rax,r12,QWORD[40+rsi] + adcx r12,r13 + adox r14,rax + mulx rax,r13,QWORD[48+rsi] + adcx r13,r14 + adox rax,r15 + mulx r15,r14,QWORD[56+rsi] + mov rdx,QWORD[8+rsi] + adcx r14,rax + adox r15,rbp + adc r15,QWORD[64+rdi] + mov QWORD[8+rdi],r8 + mov QWORD[16+rdi],r9 + sbb rcx,rcx + xor rbp,rbp + + + mulx rbx,r8,QWORD[16+rsi] + mulx rax,r9,QWORD[24+rsi] + adcx r8,r10 + adox r9,rbx + mulx rbx,r10,QWORD[32+rsi] + adcx r9,r11 + adox r10,rax + DB 0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00 + adcx r10,r12 + adox r11,rbx + DB 
0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00 + adcx r11,r13 + adox r12,r14 + DB 0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00 + mov rdx,QWORD[16+rsi] + adcx r12,rax + adox r13,rbx + adcx r13,r15 + adox r14,rbp + adcx r14,rbp + + mov QWORD[24+rdi],r8 + mov QWORD[32+rdi],r9 + + mulx rbx,r8,QWORD[24+rsi] + mulx rax,r9,QWORD[32+rsi] + adcx r8,r10 + adox r9,rbx + mulx rbx,r10,QWORD[40+rsi] + adcx r9,r11 + adox r10,rax + DB 0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00 + adcx r10,r12 + adox r11,r13 + DB 0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00 + DB 0x3e + mov rdx,QWORD[24+rsi] + adcx r11,rbx + adox r12,rax + adcx r12,r14 + mov QWORD[40+rdi],r8 + mov QWORD[48+rdi],r9 + mulx rax,r8,QWORD[32+rsi] + adox r13,rbp + adcx r13,rbp + + mulx rbx,r9,QWORD[40+rsi] + adcx r8,r10 + adox r9,rax + mulx rax,r10,QWORD[48+rsi] + adcx r9,r11 + adox r10,r12 + mulx r12,r11,QWORD[56+rsi] + mov rdx,QWORD[32+rsi] + mov r14,QWORD[40+rsi] + adcx r10,rbx + adox r11,rax + mov r15,QWORD[48+rsi] + adcx r11,r13 + adox r12,rbp + adcx r12,rbp + + mov QWORD[56+rdi],r8 + mov QWORD[64+rdi],r9 + + mulx rax,r9,r14 + mov r8,QWORD[56+rsi] + adcx r9,r10 + mulx rbx,r10,r15 + adox r10,rax + adcx r10,r11 + mulx rax,r11,r8 + mov rdx,r14 + adox r11,rbx + adcx r11,r12 + + adcx rax,rbp + + mulx rbx,r14,r15 + mulx r13,r12,r8 + mov rdx,r15 + lea rsi,[64+rsi] + adcx r11,r14 + adox r12,rbx + adcx r12,rax + adox r13,rbp + + DB 0x67,0x67 + mulx r14,r8,r8 + adcx r13,r8 + adcx r14,rbp + + cmp rsi,QWORD[((8+8))+rsp] + je NEAR $L$sqrx8x_outer_break + + neg rcx + mov rcx,-8 + mov r15,rbp + mov r8,QWORD[64+rdi] + adcx r9,QWORD[72+rdi] + adcx r10,QWORD[80+rdi] + adcx r11,QWORD[88+rdi] + adc r12,QWORD[96+rdi] + adc r13,QWORD[104+rdi] + adc r14,QWORD[112+rdi] + adc r15,QWORD[120+rdi] + lea rbp,[rsi] + lea rdi,[128+rdi] + sbb rax,rax + + mov rdx,QWORD[((-64))+rsi] + mov QWORD[((16+8))+rsp],rax + mov QWORD[((24+8))+rsp],rdi + + + xor eax,eax + jmp NEAR $L$sqrx8x_loop + +ALIGN 32 +$L$sqrx8x_loop: + mov rbx,r8 + mulx r8,rax,QWORD[rbp] + adcx rbx,rax + adox r8,r9 + + mulx r9,rax,QWORD[8+rbp] + adcx r8,rax + adox r9,r10 + + mulx r10,rax,QWORD[16+rbp] + adcx r9,rax + adox r10,r11 + + mulx r11,rax,QWORD[24+rbp] + adcx r10,rax + adox r11,r12 + + DB 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 + adcx r11,rax + adox r12,r13 + + mulx r13,rax,QWORD[40+rbp] + adcx r12,rax + adox r13,r14 + + mulx r14,rax,QWORD[48+rbp] + mov QWORD[rcx*8+rdi],rbx + mov ebx,0 + adcx r13,rax + adox r14,r15 + + DB 0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00 + mov rdx,QWORD[8+rcx*8+rsi] + adcx r14,rax + adox r15,rbx + adcx r15,rbx + + DB 0x67 + inc rcx + jnz NEAR $L$sqrx8x_loop + + lea rbp,[64+rbp] + mov rcx,-8 + cmp rbp,QWORD[((8+8))+rsp] + je NEAR $L$sqrx8x_break + + sub rbx,QWORD[((16+8))+rsp] + DB 0x66 + mov rdx,QWORD[((-64))+rsi] + adcx r8,QWORD[rdi] + adcx r9,QWORD[8+rdi] + adc r10,QWORD[16+rdi] + adc r11,QWORD[24+rdi] + adc r12,QWORD[32+rdi] + adc r13,QWORD[40+rdi] + adc r14,QWORD[48+rdi] + adc r15,QWORD[56+rdi] + lea rdi,[64+rdi] + DB 0x67 + sbb rax,rax + xor ebx,ebx + mov QWORD[((16+8))+rsp],rax + jmp NEAR $L$sqrx8x_loop + +ALIGN 32 +$L$sqrx8x_break: + xor rbp,rbp + sub rbx,QWORD[((16+8))+rsp] + adcx r8,rbp + mov rcx,QWORD[((24+8))+rsp] + adcx r9,rbp + mov rdx,QWORD[rsi] + adc r10,0 + mov QWORD[rdi],r8 + adc r11,0 + adc r12,0 + adc r13,0 + adc r14,0 + adc r15,0 + cmp rdi,rcx + je NEAR $L$sqrx8x_outer_loop + + mov QWORD[8+rdi],r9 + mov r9,QWORD[8+rcx] + mov QWORD[16+rdi],r10 + mov r10,QWORD[16+rcx] + mov QWORD[24+rdi],r11 + mov r11,QWORD[24+rcx] + mov QWORD[32+rdi],r12 + mov 
r12,QWORD[32+rcx] + mov QWORD[40+rdi],r13 + mov r13,QWORD[40+rcx] + mov QWORD[48+rdi],r14 + mov r14,QWORD[48+rcx] + mov QWORD[56+rdi],r15 + mov r15,QWORD[56+rcx] + mov rdi,rcx + jmp NEAR $L$sqrx8x_outer_loop + +ALIGN 32 +$L$sqrx8x_outer_break: + mov QWORD[72+rdi],r9 +DB 102,72,15,126,217 + mov QWORD[80+rdi],r10 + mov QWORD[88+rdi],r11 + mov QWORD[96+rdi],r12 + mov QWORD[104+rdi],r13 + mov QWORD[112+rdi],r14 + lea rdi,[((48+8))+rsp] + mov rdx,QWORD[rcx*1+rsi] + + mov r11,QWORD[8+rdi] + xor r10,r10 + mov r9,QWORD[((0+8))+rsp] + adox r11,r11 + mov r12,QWORD[16+rdi] + mov r13,QWORD[24+rdi] + + +ALIGN 32 +$L$sqrx4x_shift_n_add: + mulx rbx,rax,rdx + adox r12,r12 + adcx rax,r10 + DB 0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00 + DB 0x4c,0x8b,0x97,0x20,0x00,0x00,0x00 + adox r13,r13 + adcx rbx,r11 + mov r11,QWORD[40+rdi] + mov QWORD[rdi],rax + mov QWORD[8+rdi],rbx + + mulx rbx,rax,rdx + adox r10,r10 + adcx rax,r12 + mov rdx,QWORD[16+rcx*1+rsi] + mov r12,QWORD[48+rdi] + adox r11,r11 + adcx rbx,r13 + mov r13,QWORD[56+rdi] + mov QWORD[16+rdi],rax + mov QWORD[24+rdi],rbx + + mulx rbx,rax,rdx + adox r12,r12 + adcx rax,r10 + mov rdx,QWORD[24+rcx*1+rsi] + lea rcx,[32+rcx] + mov r10,QWORD[64+rdi] + adox r13,r13 + adcx rbx,r11 + mov r11,QWORD[72+rdi] + mov QWORD[32+rdi],rax + mov QWORD[40+rdi],rbx + + mulx rbx,rax,rdx + adox r10,r10 + adcx rax,r12 + jrcxz $L$sqrx4x_shift_n_add_break + DB 0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00 + adox r11,r11 + adcx rbx,r13 + mov r12,QWORD[80+rdi] + mov r13,QWORD[88+rdi] + mov QWORD[48+rdi],rax + mov QWORD[56+rdi],rbx + lea rdi,[64+rdi] + nop + jmp NEAR $L$sqrx4x_shift_n_add + +ALIGN 32 +$L$sqrx4x_shift_n_add_break: + adcx rbx,r13 + mov QWORD[48+rdi],rax + mov QWORD[56+rdi],rbx + lea rdi,[64+rdi] +DB 102,72,15,126,213 +__bn_sqrx8x_reduction: + xor eax,eax + mov rbx,QWORD[((32+8))+rsp] + mov rdx,QWORD[((48+8))+rsp] + lea rcx,[((-64))+r9*1+rbp] + + mov QWORD[((0+8))+rsp],rcx + mov QWORD[((8+8))+rsp],rdi + + lea rdi,[((48+8))+rsp] + jmp NEAR $L$sqrx8x_reduction_loop + +ALIGN 32 +$L$sqrx8x_reduction_loop: + mov r9,QWORD[8+rdi] + mov r10,QWORD[16+rdi] + mov r11,QWORD[24+rdi] + mov r12,QWORD[32+rdi] + mov r8,rdx + imul rdx,rbx + mov r13,QWORD[40+rdi] + mov r14,QWORD[48+rdi] + mov r15,QWORD[56+rdi] + mov QWORD[((24+8))+rsp],rax + + lea rdi,[64+rdi] + xor rsi,rsi + mov rcx,-8 + jmp NEAR $L$sqrx8x_reduce + +ALIGN 32 +$L$sqrx8x_reduce: + mov rbx,r8 + mulx r8,rax,QWORD[rbp] + adcx rax,rbx + adox r8,r9 + + mulx r9,rbx,QWORD[8+rbp] + adcx r8,rbx + adox r9,r10 + + mulx r10,rbx,QWORD[16+rbp] + adcx r9,rbx + adox r10,r11 + + mulx r11,rbx,QWORD[24+rbp] + adcx r10,rbx + adox r11,r12 + + DB 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 + mov rax,rdx + mov rdx,r8 + adcx r11,rbx + adox r12,r13 + + mulx rdx,rbx,QWORD[((32+8))+rsp] + mov rdx,rax + mov QWORD[((64+48+8))+rcx*8+rsp],rax + + mulx r13,rax,QWORD[40+rbp] + adcx r12,rax + adox r13,r14 + + mulx r14,rax,QWORD[48+rbp] + adcx r13,rax + adox r14,r15 + + mulx r15,rax,QWORD[56+rbp] + mov rdx,rbx + adcx r14,rax + adox r15,rsi + adcx r15,rsi + + DB 0x67,0x67,0x67 + inc rcx + jnz NEAR $L$sqrx8x_reduce + + mov rax,rsi + cmp rbp,QWORD[((0+8))+rsp] + jae NEAR $L$sqrx8x_no_tail + + mov rdx,QWORD[((48+8))+rsp] + add r8,QWORD[rdi] + lea rbp,[64+rbp] + mov rcx,-8 + adcx r9,QWORD[8+rdi] + adcx r10,QWORD[16+rdi] + adc r11,QWORD[24+rdi] + adc r12,QWORD[32+rdi] + adc r13,QWORD[40+rdi] + adc r14,QWORD[48+rdi] + adc r15,QWORD[56+rdi] + lea rdi,[64+rdi] + sbb rax,rax + + xor rsi,rsi + mov QWORD[((16+8))+rsp],rax + jmp NEAR $L$sqrx8x_tail + +ALIGN 32 +$L$sqrx8x_tail: + 
mov rbx,r8 + mulx r8,rax,QWORD[rbp] + adcx rbx,rax + adox r8,r9 + + mulx r9,rax,QWORD[8+rbp] + adcx r8,rax + adox r9,r10 + + mulx r10,rax,QWORD[16+rbp] + adcx r9,rax + adox r10,r11 + + mulx r11,rax,QWORD[24+rbp] + adcx r10,rax + adox r11,r12 + + DB 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 + adcx r11,rax + adox r12,r13 + + mulx r13,rax,QWORD[40+rbp] + adcx r12,rax + adox r13,r14 + + mulx r14,rax,QWORD[48+rbp] + adcx r13,rax + adox r14,r15 + + mulx r15,rax,QWORD[56+rbp] + mov rdx,QWORD[((72+48+8))+rcx*8+rsp] + adcx r14,rax + adox r15,rsi + mov QWORD[rcx*8+rdi],rbx + mov rbx,r8 + adcx r15,rsi + + inc rcx + jnz NEAR $L$sqrx8x_tail + + cmp rbp,QWORD[((0+8))+rsp] + jae NEAR $L$sqrx8x_tail_done + + sub rsi,QWORD[((16+8))+rsp] + mov rdx,QWORD[((48+8))+rsp] + lea rbp,[64+rbp] + adc r8,QWORD[rdi] + adc r9,QWORD[8+rdi] + adc r10,QWORD[16+rdi] + adc r11,QWORD[24+rdi] + adc r12,QWORD[32+rdi] + adc r13,QWORD[40+rdi] + adc r14,QWORD[48+rdi] + adc r15,QWORD[56+rdi] + lea rdi,[64+rdi] + sbb rax,rax + sub rcx,8 + + xor rsi,rsi + mov QWORD[((16+8))+rsp],rax + jmp NEAR $L$sqrx8x_tail + +ALIGN 32 +$L$sqrx8x_tail_done: + xor rax,rax + add r8,QWORD[((24+8))+rsp] + adc r9,0 + adc r10,0 + adc r11,0 + adc r12,0 + adc r13,0 + adc r14,0 + adc r15,0 + adc rax,0 + + sub rsi,QWORD[((16+8))+rsp] +$L$sqrx8x_no_tail: + adc r8,QWORD[rdi] +DB 102,72,15,126,217 + adc r9,QWORD[8+rdi] + mov rsi,QWORD[56+rbp] +DB 102,72,15,126,213 + adc r10,QWORD[16+rdi] + adc r11,QWORD[24+rdi] + adc r12,QWORD[32+rdi] + adc r13,QWORD[40+rdi] + adc r14,QWORD[48+rdi] + adc r15,QWORD[56+rdi] + adc rax,0 + + mov rbx,QWORD[((32+8))+rsp] + mov rdx,QWORD[64+rcx*1+rdi] + + mov QWORD[rdi],r8 + lea r8,[64+rdi] + mov QWORD[8+rdi],r9 + mov QWORD[16+rdi],r10 + mov QWORD[24+rdi],r11 + mov QWORD[32+rdi],r12 + mov QWORD[40+rdi],r13 + mov QWORD[48+rdi],r14 + mov QWORD[56+rdi],r15 + + lea rdi,[64+rcx*1+rdi] + cmp r8,QWORD[((8+8))+rsp] + jb NEAR $L$sqrx8x_reduction_loop + ret + + +ALIGN 32 + +__bn_postx4x_internal: + + mov r12,QWORD[rbp] + mov r10,rcx + mov r9,rcx + neg rax + sar rcx,3+2 + +DB 102,72,15,126,202 +DB 102,72,15,126,206 + dec r12 + mov r13,QWORD[8+rbp] + xor r8,r8 + mov r14,QWORD[16+rbp] + mov r15,QWORD[24+rbp] + jmp NEAR $L$sqrx4x_sub_entry + +ALIGN 16 +$L$sqrx4x_sub: + mov r12,QWORD[rbp] + mov r13,QWORD[8+rbp] + mov r14,QWORD[16+rbp] + mov r15,QWORD[24+rbp] +$L$sqrx4x_sub_entry: + andn r12,r12,rax + lea rbp,[32+rbp] + andn r13,r13,rax + andn r14,r14,rax + andn r15,r15,rax + + neg r8 + adc r12,QWORD[rdi] + adc r13,QWORD[8+rdi] + adc r14,QWORD[16+rdi] + adc r15,QWORD[24+rdi] + mov QWORD[rdx],r12 + lea rdi,[32+rdi] + mov QWORD[8+rdx],r13 + sbb r8,r8 + mov QWORD[16+rdx],r14 + mov QWORD[24+rdx],r15 + lea rdx,[32+rdx] + + inc rcx + jnz NEAR $L$sqrx4x_sub + + neg r9 + + ret + + +global bn_scatter5 + +ALIGN 16 +bn_scatter5: + +_CET_ENDBR + cmp edx,0 + jz NEAR $L$scatter_epilogue + + + + + + + + + + lea r8,[r9*8+r8] +$L$scatter: + mov rax,QWORD[rcx] + lea rcx,[8+rcx] + mov QWORD[r8],rax + lea r8,[256+r8] + sub edx,1 + jnz NEAR $L$scatter +$L$scatter_epilogue: + ret + + + +global bn_gather5 + +ALIGN 32 +bn_gather5: + +$L$SEH_begin_bn_gather5: +_CET_ENDBR + + DB 0x4c,0x8d,0x14,0x24 + + DB 0x48,0x81,0xec,0x08,0x01,0x00,0x00 + lea rax,[$L$inc] + and rsp,-16 + + movd xmm5,r9d + movdqa xmm0,XMMWORD[rax] + movdqa xmm1,XMMWORD[16+rax] + lea r11,[128+r8] + lea rax,[128+rsp] + + pshufd xmm5,xmm5,0 + movdqa xmm4,xmm1 + movdqa xmm2,xmm1 + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 + movdqa xmm3,xmm4 + + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm5 + movdqa XMMWORD[(-128)+rax],xmm0 
+ movdqa xmm0,xmm4 + + paddd xmm3,xmm2 + pcmpeqd xmm2,xmm5 + movdqa XMMWORD[(-112)+rax],xmm1 + movdqa xmm1,xmm4 + + paddd xmm0,xmm3 + pcmpeqd xmm3,xmm5 + movdqa XMMWORD[(-96)+rax],xmm2 + movdqa xmm2,xmm4 + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 + movdqa XMMWORD[(-80)+rax],xmm3 + movdqa xmm3,xmm4 + + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm5 + movdqa XMMWORD[(-64)+rax],xmm0 + movdqa xmm0,xmm4 + + paddd xmm3,xmm2 + pcmpeqd xmm2,xmm5 + movdqa XMMWORD[(-48)+rax],xmm1 + movdqa xmm1,xmm4 + + paddd xmm0,xmm3 + pcmpeqd xmm3,xmm5 + movdqa XMMWORD[(-32)+rax],xmm2 + movdqa xmm2,xmm4 + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 + movdqa XMMWORD[(-16)+rax],xmm3 + movdqa xmm3,xmm4 + + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm5 + movdqa XMMWORD[rax],xmm0 + movdqa xmm0,xmm4 + + paddd xmm3,xmm2 + pcmpeqd xmm2,xmm5 + movdqa XMMWORD[16+rax],xmm1 + movdqa xmm1,xmm4 + + paddd xmm0,xmm3 + pcmpeqd xmm3,xmm5 + movdqa XMMWORD[32+rax],xmm2 + movdqa xmm2,xmm4 + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 + movdqa XMMWORD[48+rax],xmm3 + movdqa xmm3,xmm4 + + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm5 + movdqa XMMWORD[64+rax],xmm0 + movdqa xmm0,xmm4 + + paddd xmm3,xmm2 + pcmpeqd xmm2,xmm5 + movdqa XMMWORD[80+rax],xmm1 + movdqa xmm1,xmm4 + + paddd xmm0,xmm3 + pcmpeqd xmm3,xmm5 + movdqa XMMWORD[96+rax],xmm2 + movdqa xmm2,xmm4 + movdqa XMMWORD[112+rax],xmm3 + jmp NEAR $L$gather + +ALIGN 32 +$L$gather: + pxor xmm4,xmm4 + pxor xmm5,xmm5 + movdqa xmm0,XMMWORD[((-128))+r11] + movdqa xmm1,XMMWORD[((-112))+r11] + movdqa xmm2,XMMWORD[((-96))+r11] + pand xmm0,XMMWORD[((-128))+rax] + movdqa xmm3,XMMWORD[((-80))+r11] + pand xmm1,XMMWORD[((-112))+rax] + por xmm4,xmm0 + pand xmm2,XMMWORD[((-96))+rax] + por xmm5,xmm1 + pand xmm3,XMMWORD[((-80))+rax] + por xmm4,xmm2 + por xmm5,xmm3 + movdqa xmm0,XMMWORD[((-64))+r11] + movdqa xmm1,XMMWORD[((-48))+r11] + movdqa xmm2,XMMWORD[((-32))+r11] + pand xmm0,XMMWORD[((-64))+rax] + movdqa xmm3,XMMWORD[((-16))+r11] + pand xmm1,XMMWORD[((-48))+rax] + por xmm4,xmm0 + pand xmm2,XMMWORD[((-32))+rax] + por xmm5,xmm1 + pand xmm3,XMMWORD[((-16))+rax] + por xmm4,xmm2 + por xmm5,xmm3 + movdqa xmm0,XMMWORD[r11] + movdqa xmm1,XMMWORD[16+r11] + movdqa xmm2,XMMWORD[32+r11] + pand xmm0,XMMWORD[rax] + movdqa xmm3,XMMWORD[48+r11] + pand xmm1,XMMWORD[16+rax] + por xmm4,xmm0 + pand xmm2,XMMWORD[32+rax] + por xmm5,xmm1 + pand xmm3,XMMWORD[48+rax] + por xmm4,xmm2 + por xmm5,xmm3 + movdqa xmm0,XMMWORD[64+r11] + movdqa xmm1,XMMWORD[80+r11] + movdqa xmm2,XMMWORD[96+r11] + pand xmm0,XMMWORD[64+rax] + movdqa xmm3,XMMWORD[112+r11] + pand xmm1,XMMWORD[80+rax] + por xmm4,xmm0 + pand xmm2,XMMWORD[96+rax] + por xmm5,xmm1 + pand xmm3,XMMWORD[112+rax] + por xmm4,xmm2 + por xmm5,xmm3 + por xmm4,xmm5 + lea r11,[256+r11] + + pshufd xmm0,xmm4,0x4e + por xmm0,xmm4 + movq QWORD[rcx],xmm0 + lea rcx,[8+rcx] + sub edx,1 + jnz NEAR $L$gather + + lea rsp,[r10] + + ret +$L$SEH_end_bn_gather5: + + +section .rdata rdata align=8 +ALIGN 64 +$L$inc: + DD 0,0,1,1 + DD 2,2,2,2 + DB 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105 + DB 112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115 + DB 99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111 + DB 114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79 + DB 71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111 + DB 112,101,110,115,115,108,46,111,114,103,62,0 +section .text + +EXTERN __imp_RtlVirtualUnwind + +ALIGN 16 +mul_handler: + push rsi + push rdi + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + pushfq + sub rsp,64 + + mov rax,QWORD[120+r8] + mov rbx,QWORD[248+r8] + + mov rsi,QWORD[8+r9] + mov 
r11,QWORD[56+r9] + + mov r10d,DWORD[r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jb NEAR $L$common_seh_tail + + mov r10d,DWORD[4+r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jb NEAR $L$common_pop_regs + + mov rax,QWORD[152+r8] + + mov r10d,DWORD[8+r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jae NEAR $L$common_seh_tail + + lea r10,[$L$mul4x_epilogue] + cmp rbx,r10 + ja NEAR $L$body_40 + + mov r10,QWORD[192+r8] + mov rax,QWORD[8+r10*8+rax] + + jmp NEAR $L$common_pop_regs + +$L$body_40: + mov rax,QWORD[40+rax] +$L$common_pop_regs: + mov rbx,QWORD[((-8))+rax] + mov rbp,QWORD[((-16))+rax] + mov r12,QWORD[((-24))+rax] + mov r13,QWORD[((-32))+rax] + mov r14,QWORD[((-40))+rax] + mov r15,QWORD[((-48))+rax] + mov QWORD[144+r8],rbx + mov QWORD[160+r8],rbp + mov QWORD[216+r8],r12 + mov QWORD[224+r8],r13 + mov QWORD[232+r8],r14 + mov QWORD[240+r8],r15 + +$L$common_seh_tail: + mov rdi,QWORD[8+rax] + mov rsi,QWORD[16+rax] + mov QWORD[152+r8],rax + mov QWORD[168+r8],rsi + mov QWORD[176+r8],rdi + + mov rdi,QWORD[40+r9] + mov rsi,r8 + mov ecx,154 + DD 0xa548f3fc + + mov rsi,r9 + xor rcx,rcx + mov rdx,QWORD[8+rsi] + mov r8,QWORD[rsi] + mov r9,QWORD[16+rsi] + mov r10,QWORD[40+rsi] + lea r11,[56+rsi] + lea r12,[24+rsi] + mov QWORD[32+rsp],r10 + mov QWORD[40+rsp],r11 + mov QWORD[48+rsp],r12 + mov QWORD[56+rsp],rcx + call QWORD[__imp_RtlVirtualUnwind] + + mov eax,1 + add rsp,64 + popfq + pop r15 + pop r14 + pop r13 + pop r12 + pop rbp + pop rbx + pop rdi + pop rsi + ret + + +section .pdata rdata align=4 +ALIGN 4 + DD $L$SEH_begin_bn_mul4x_mont_gather5 wrt ..imagebase + DD $L$SEH_end_bn_mul4x_mont_gather5 wrt ..imagebase + DD $L$SEH_info_bn_mul4x_mont_gather5 wrt ..imagebase + + DD $L$SEH_begin_bn_power5_nohw wrt ..imagebase + DD $L$SEH_end_bn_power5_nohw wrt ..imagebase + DD $L$SEH_info_bn_power5_nohw wrt ..imagebase + DD $L$SEH_begin_bn_mulx4x_mont_gather5 wrt ..imagebase + DD $L$SEH_end_bn_mulx4x_mont_gather5 wrt ..imagebase + DD $L$SEH_info_bn_mulx4x_mont_gather5 wrt ..imagebase + + DD $L$SEH_begin_bn_powerx5 wrt ..imagebase + DD $L$SEH_end_bn_powerx5 wrt ..imagebase + DD $L$SEH_info_bn_powerx5 wrt ..imagebase + DD $L$SEH_begin_bn_gather5 wrt ..imagebase + DD $L$SEH_end_bn_gather5 wrt ..imagebase + DD $L$SEH_info_bn_gather5 wrt ..imagebase + +section .xdata rdata align=8 +ALIGN 8 +$L$SEH_info_bn_mul4x_mont_gather5: + DB 9,0,0,0 + DD mul_handler wrt ..imagebase + DD $L$mul4x_prologue wrt ..imagebase,$L$mul4x_body wrt ..imagebase,$L$mul4x_epilogue wrt ..imagebase +ALIGN 8 +$L$SEH_info_bn_power5_nohw: + DB 9,0,0,0 + DD mul_handler wrt ..imagebase + DD $L$power5_prologue wrt ..imagebase,$L$power5_body wrt ..imagebase,$L$power5_epilogue wrt ..imagebase +ALIGN 8 +$L$SEH_info_bn_mulx4x_mont_gather5: + DB 9,0,0,0 + DD mul_handler wrt ..imagebase + DD $L$mulx4x_prologue wrt ..imagebase,$L$mulx4x_body wrt ..imagebase,$L$mulx4x_epilogue wrt ..imagebase +ALIGN 8 +$L$SEH_info_bn_powerx5: + DB 9,0,0,0 + DD mul_handler wrt ..imagebase + DD $L$powerx5_prologue wrt ..imagebase,$L$powerx5_body wrt ..imagebase,$L$powerx5_epilogue wrt ..imagebase +ALIGN 8 +$L$SEH_info_bn_gather5: + DB 0x01,0x0b,0x03,0x0a + DB 0x0b,0x01,0x21,0x00 + DB 0x04,0xa3,0x00,0x00 +ALIGN 8 +%else +; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 +ret +%endif diff --git a/ring-0.17.14/pregenerated/x86_64-mont5-nasm.o b/ring-0.17.14/pregenerated/x86_64-mont5-nasm.o new file mode 100644 index 0000000000..2d5a63e2f1 Binary files /dev/null and b/ring-0.17.14/pregenerated/x86_64-mont5-nasm.o differ diff --git a/ring-0.17.14/src/aead.rs 
b/ring-0.17.14/src/aead.rs new file mode 100644 index 0000000000..16a8aba4b6 --- /dev/null +++ b/ring-0.17.14/src/aead.rs @@ -0,0 +1,186 @@ +// Copyright 2015-2024 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +//! Authenticated Encryption with Associated Data (AEAD). +//! +//! See [Authenticated encryption: relations among notions and analysis of the +//! generic composition paradigm][AEAD] for an introduction to the concept of +//! AEADs. +//! +//! [AEAD]: https://eprint.iacr.org/2000/025.pdf +//! [`crypto.cipher.AEAD`]: https://golang.org/pkg/crypto/cipher/#AEAD + +use crate::{ + cpu, error, + polyfill::{u64_from_usize, usize_from_u64_saturated}, +}; + +pub use self::{ + algorithm::{Algorithm, AES_128_GCM, AES_256_GCM, CHACHA20_POLY1305}, + less_safe_key::LessSafeKey, + nonce::{Nonce, NONCE_LEN}, + opening_key::OpeningKey, + sealing_key::SealingKey, + unbound_key::UnboundKey, +}; + +/// A sequences of unique nonces. +/// +/// A given `NonceSequence` must never return the same `Nonce` twice from +/// `advance()`. +/// +/// A simple counter is a reasonable (but probably not ideal) `NonceSequence`. +/// +/// Intentionally not `Clone` or `Copy` since cloning would allow duplication +/// of the sequence. +pub trait NonceSequence { + /// Returns the next nonce in the sequence. + /// + /// This may fail if "too many" nonces have been requested, where how many + /// is too many is up to the implementation of `NonceSequence`. An + /// implementation may that enforce a maximum number of records are + /// sent/received under a key this way. Once `advance()` fails, it must + /// fail for all subsequent calls. + fn advance(&mut self) -> Result; +} + +/// An AEAD key bound to a nonce sequence. +pub trait BoundKey: core::fmt::Debug { + /// Constructs a new key from the given `UnboundKey` and `NonceSequence`. + fn new(key: UnboundKey, nonce_sequence: N) -> Self; + + /// The key's AEAD algorithm. + fn algorithm(&self) -> &'static Algorithm; +} + +/// The additionally authenticated data (AAD) for an opening or sealing +/// operation. This data is authenticated but is **not** encrypted. +/// +/// The type `A` could be a byte slice `&[u8]`, a byte array `[u8; N]` +/// for some constant `N`, `Vec`, etc. +#[derive(Clone, Copy)] +pub struct Aad(A); + +impl> Aad { + /// Construct the `Aad` from the given bytes. + #[inline] + pub fn from(aad: A) -> Self { + Self(aad) + } +} + +impl AsRef<[u8]> for Aad +where + A: AsRef<[u8]>, +{ + fn as_ref(&self) -> &[u8] { + self.0.as_ref() + } +} + +impl Aad<[u8; 0]> { + /// Construct an empty `Aad`. 
+ pub fn empty() -> Self { + Self::from([]) + } +} + +impl core::fmt::Debug for Aad +where + A: core::fmt::Debug, +{ + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + f.debug_tuple("Aad").field(&self.0).finish() + } +} + +impl PartialEq for Aad +where + A: PartialEq, +{ + #[inline] + fn eq(&self, other: &Self) -> bool { + self.0.eq(&other.0) + } +} + +impl Eq for Aad where A: Eq {} + +#[allow(clippy::large_enum_variant, variant_size_differences)] +#[derive(Clone)] +enum KeyInner { + AesGcm(aes_gcm::Key), + ChaCha20Poly1305(chacha20_poly1305::Key), +} + +const fn max_input_len(block_len: usize, overhead_blocks_per_nonce: usize) -> usize { + // Each of our AEADs use a 32-bit block counter so the maximum is the + // largest input that will not overflow the counter. + usize_from_u64_saturated( + ((1u64 << 32) - u64_from_usize(overhead_blocks_per_nonce)) * u64_from_usize(block_len), + ) +} + +/// A possibly valid authentication tag. +#[must_use] +#[repr(C)] +#[derive(Clone, Copy)] +pub struct Tag([u8; TAG_LEN]); + +impl AsRef<[u8]> for Tag { + fn as_ref(&self) -> &[u8] { + self.0.as_ref() + } +} + +impl TryFrom<&[u8]> for Tag { + type Error = error::Unspecified; + + fn try_from(value: &[u8]) -> Result { + let raw_tag: [u8; TAG_LEN] = value.try_into().map_err(|_| error::Unspecified)?; + Ok(Self::from(raw_tag)) + } +} + +impl From<[u8; TAG_LEN]> for Tag { + #[inline] + fn from(value: [u8; TAG_LEN]) -> Self { + Self(value) + } +} + +const MAX_KEY_LEN: usize = 32; + +// All the AEADs we support use 128-bit tags. +const TAG_LEN: usize = 16; + +/// The maximum length of a tag for the algorithms in this module. +pub const MAX_TAG_LEN: usize = TAG_LEN; + +mod aes; +mod aes_gcm; +mod algorithm; +mod chacha; +mod chacha20_poly1305; +pub mod chacha20_poly1305_openssh; +mod gcm; +mod less_safe_key; +mod nonce; +mod opening_key; +mod overlapping; +mod poly1305; +pub mod quic; +mod sealing_key; +mod shift; +mod unbound_key; diff --git a/ring-0.17.14/src/aead/aes.rs b/ring-0.17.14/src/aead/aes.rs new file mode 100644 index 0000000000..d9832aeb21 --- /dev/null +++ b/ring-0.17.14/src/aead/aes.rs @@ -0,0 +1,284 @@ +// Copyright 2018-2024 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +use super::{nonce::Nonce, overlapping, quic::Sample, NONCE_LEN}; +use crate::{ + bb, + cpu::{self, GetFeature as _}, + error, + polyfill::unwrap_const, +}; +use cfg_if::cfg_if; +use core::num::NonZeroU32; + +pub(super) use ffi::Counter; + +#[macro_use] +mod ffi; + +mod bs; +pub(super) mod fallback; +pub(super) mod hw; +pub(super) mod vp; + +pub type Overlapping<'o> = overlapping::Overlapping<'o, u8>; +pub type OverlappingPartialBlock<'o> = overlapping::PartialBlock<'o, u8, BLOCK_LEN>; + +cfg_if! 
{ + if #[cfg(any(all(target_arch = "aarch64", target_endian = "little"), target_arch = "x86_64"))] { + pub(super) use ffi::AES_KEY; + } else { + use ffi::AES_KEY; + } +} + +#[derive(Clone)] +pub(super) enum Key { + #[cfg(any( + all(target_arch = "aarch64", target_endian = "little"), + target_arch = "x86_64", + target_arch = "x86" + ))] + Hw(hw::Key), + + #[cfg(any( + all(target_arch = "aarch64", target_endian = "little"), + all(target_arch = "arm", target_endian = "little"), + target_arch = "x86", + target_arch = "x86_64" + ))] + Vp(vp::Key), + + Fallback(fallback::Key), +} + +impl Key { + #[inline] + pub fn new( + bytes: KeyBytes<'_>, + cpu_features: cpu::Features, + ) -> Result { + #[cfg(any( + all(target_arch = "aarch64", target_endian = "little"), + target_arch = "x86", + target_arch = "x86_64" + ))] + if let Some(hw_features) = cpu_features.get_feature() { + return Ok(Self::Hw(hw::Key::new( + bytes, + hw_features, + cpu_features.get_feature(), + )?)); + } + + #[cfg(any( + all(target_arch = "aarch64", target_endian = "little"), + all(target_arch = "arm", target_endian = "little"), + target_arch = "x86_64", + target_arch = "x86" + ))] + if let Some(vp_features) = cpu_features.get_feature() { + return Ok(Self::Vp(vp::Key::new(bytes, vp_features)?)); + } + + let _ = cpu_features; + + Ok(Self::Fallback(fallback::Key::new(bytes)?)) + } + + #[inline] + fn encrypt_block(&self, a: Block) -> Block { + match self { + #[cfg(any( + all(target_arch = "aarch64", target_endian = "little"), + target_arch = "x86_64", + target_arch = "x86" + ))] + Key::Hw(inner) => inner.encrypt_block(a), + + #[cfg(any( + all(target_arch = "aarch64", target_endian = "little"), + all(target_arch = "arm", target_endian = "little"), + target_arch = "x86", + target_arch = "x86_64" + ))] + Key::Vp(inner) => inner.encrypt_block(a), + + Key::Fallback(inner) => inner.encrypt_block(a), + } + } + + pub fn new_mask(&self, sample: Sample) -> [u8; 5] { + let [b0, b1, b2, b3, b4, ..] = self.encrypt_block(sample); + [b0, b1, b2, b3, b4] + } +} + +pub const AES_128_KEY_LEN: usize = 128 / 8; +pub const AES_256_KEY_LEN: usize = 256 / 8; + +pub enum KeyBytes<'a> { + AES_128(&'a [u8; AES_128_KEY_LEN]), + AES_256(&'a [u8; AES_256_KEY_LEN]), +} + +// `Counter` is `ffi::Counter` as its representation is dictated by its use in +// the FFI. +impl Counter { + pub fn one(nonce: Nonce) -> Self { + let mut value = [0u8; BLOCK_LEN]; + value[..NONCE_LEN].copy_from_slice(nonce.as_ref()); + value[BLOCK_LEN - 1] = 1; + Self(value) + } + + pub fn increment(&mut self) -> Iv { + const ONE: NonZeroU32 = unwrap_const(NonZeroU32::new(1)); + + let iv = Iv(self.0); + self.increment_by_less_safe(ONE); + iv + } + + pub(super) fn increment_by_less_safe(&mut self, increment_by: NonZeroU32) { + let [.., c0, c1, c2, c3] = &mut self.0; + let old_value: u32 = u32::from_be_bytes([*c0, *c1, *c2, *c3]); + let new_value = old_value.wrapping_add(increment_by.get()); + [*c0, *c1, *c2, *c3] = u32::to_be_bytes(new_value); + } +} + +/// The IV for a single block encryption. +/// +/// Intentionally not `Clone` to ensure each is used only once. 
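// Illustrative note (not part of the vendored ring sources): the counter
// block built by `Counter::one` above is the 96-bit nonce followed by a
// 32-bit big-endian block counter in the last four bytes, started at 1.
// For example, with nonce bytes [1; 12] the block is
//
//     [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1]
//
// In the AES-GCM code later in this diff, the first `increment()` reserves
// that value-1 block as the tag IV and leaves the counter at 2 for the
// payload blocks.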
+pub struct Iv(Block); + +impl From for Iv { + fn from(counter: Counter) -> Self { + Self(counter.0) + } +} + +pub(super) type Block = [u8; BLOCK_LEN]; +pub(super) const BLOCK_LEN: usize = 16; +pub(super) const ZERO_BLOCK: Block = [0u8; BLOCK_LEN]; + +pub(super) trait EncryptBlock { + fn encrypt_block(&self, block: Block) -> Block; + fn encrypt_iv_xor_block(&self, iv: Iv, block: Block) -> Block; +} + +pub(super) trait EncryptCtr32 { + fn ctr32_encrypt_within(&self, in_out: Overlapping<'_>, ctr: &mut Counter); +} + +#[allow(dead_code)] +fn encrypt_block_using_encrypt_iv_xor_block(key: &impl EncryptBlock, block: Block) -> Block { + key.encrypt_iv_xor_block(Iv(block), ZERO_BLOCK) +} + +fn encrypt_iv_xor_block_using_encrypt_block( + key: &impl EncryptBlock, + iv: Iv, + block: Block, +) -> Block { + let encrypted_iv = key.encrypt_block(iv.0); + bb::xor_16(encrypted_iv, block) +} + +#[allow(dead_code)] +fn encrypt_iv_xor_block_using_ctr32(key: &impl EncryptCtr32, iv: Iv, mut block: Block) -> Block { + let mut ctr = Counter(iv.0); // This is OK because we're only encrypting one block. + key.ctr32_encrypt_within(block.as_mut().into(), &mut ctr); + block +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::testutil as test; + + #[test] + pub fn test_aes() { + test::run(test_vector_file!("aes_tests.txt"), |section, test_case| { + assert_eq!(section, ""); + let key = consume_key(test_case, "Key"); + let input = test_case.consume_bytes("Input"); + let block: Block = input.as_slice().try_into()?; + let expected_output = test_case.consume_bytes("Output"); + + let output = key.encrypt_block(block); + assert_eq!(output.as_ref(), &expected_output[..]); + + Ok(()) + }) + } + + fn consume_key(test_case: &mut test::TestCase, name: &str) -> Key { + let key = test_case.consume_bytes(name); + let key = &key[..]; + let key = match key.len() { + 16 => KeyBytes::AES_128(key.try_into().unwrap()), + 32 => KeyBytes::AES_256(key.try_into().unwrap()), + _ => unreachable!(), + }; + Key::new(key, cpu::features()).unwrap() + } +} + +// These AES-GCM-specific tests are here instead of in `aes_gcm` because +// `Counter`'s API isn't visible (enough) to aes_gcm. +#[cfg(test)] +mod aes_gcm_tests { + use super::{super::aes_gcm::MAX_IN_OUT_LEN, *}; + use core::num::NonZeroU32; + + #[test] + fn test_aes_gcm_counter_blocks_max() { + test_aes_gcm_counter_blocks(MAX_IN_OUT_LEN, &[0, 0, 0, 0]); + } + + #[test] + fn test_aes_gcm_counter_blocks_max_minus_one() { + test_aes_gcm_counter_blocks(MAX_IN_OUT_LEN - BLOCK_LEN, &[0xff, 0xff, 0xff, 0xff]); + } + + fn test_aes_gcm_counter_blocks(in_out_len: usize, expected_final_counter: &[u8; 4]) { + fn ctr32(ctr: &Counter) -> &[u8; 4] { + (&ctr.0[12..]).try_into().unwrap() + } + + let rounded_down = in_out_len / BLOCK_LEN; + let blocks = rounded_down + (if in_out_len % BLOCK_LEN == 0 { 0 } else { 1 }); + let blocks = u32::try_from(blocks) + .ok() + .and_then(NonZeroU32::new) + .unwrap(); + + let nonce = Nonce::assume_unique_for_key([1; 12]); + let mut ctr = Counter::one(nonce); + assert_eq!(ctr32(&ctr), &[0, 0, 0, 1]); + let _tag_iv = ctr.increment(); + assert_eq!(ctr32(&ctr), &[0, 0, 0, 2]); + ctr.increment_by_less_safe(blocks); + + // `MAX_IN_OUT_LEN` is less on 32-bit targets, so we don't even get + // close to wrapping, but run the tests on them anyway. 
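// Worked arithmetic behind the expected counters (illustrative, not part of
// the vendored ring sources), following the definitions above:
//
//     MAX_IN_OUT_LEN = max_input_len(16, 2) = (2^32 - 2) * 16 bytes
//                    = 2^36 - 32 = (2^39 - 256) / 8   (the NIST bound)
//
// so the "max" case is exactly 2^32 - 2 whole blocks. The counter starts at
// 1, the tag IV consumes value 1 (leaving 2), and adding 2^32 - 2 blocks
// gives 2 + (2^32 - 2) = 2^32, which wraps to [0, 0, 0, 0]. The "max minus
// one block" case adds 2^32 - 3 instead and ends at 0xffff_ffff.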
+ #[cfg(target_pointer_width = "64")] + assert_eq!(ctr32(&ctr), expected_final_counter); + + #[cfg(target_pointer_width = "32")] + let _ = expected_final_counter; + } +} diff --git a/ring-0.17.14/src/aead/aes/bs.rs b/ring-0.17.14/src/aead/aes/bs.rs new file mode 100644 index 0000000000..2e418bbf10 --- /dev/null +++ b/ring-0.17.14/src/aead/aes/bs.rs @@ -0,0 +1,60 @@ +// Copyright 2018-2024 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +#![cfg(all(target_arch = "arm", target_endian = "little"))] + +use super::{Counter, Overlapping, AES_KEY}; + +/// SAFETY: +/// * The caller must ensure that if blocks > 0 then either `input` and +/// `output` do not overlap at all, or input == output.add(n) for some +/// (nonnegative) n. +/// * if blocks > 0, The caller must ensure `input` points to `blocks` blocks +/// and that `output` points to writable space for `blocks` blocks. +/// * The caller must ensure that `vpaes_key` was initialized with +/// `vpaes_set_encrypt_key`. +/// * Upon returning, `blocks` blocks will have been read from `input` and +/// written to `output`. +pub(super) unsafe fn ctr32_encrypt_blocks_with_vpaes_key( + in_out: Overlapping<'_>, + vpaes_key: &AES_KEY, + ctr: &mut Counter, +) { + prefixed_extern! { + // bsaes_ctr32_encrypt_blocks requires transformation of an existing + // VPAES key; there is no `bsaes_set_encrypt_key`. + fn vpaes_encrypt_key_to_bsaes(bsaes_key: *mut AES_KEY, vpaes_key: &AES_KEY); + } + + // SAFETY: + // * The caller ensures `vpaes_key` was initialized by + // `vpaes_set_encrypt_key`. + // * `bsaes_key was zeroed above, and `vpaes_encrypt_key_to_bsaes` + // is assumed to initialize `bsaes_key`. + let bsaes_key = unsafe { AES_KEY::derive(vpaes_encrypt_key_to_bsaes, vpaes_key) }; + + // The code for `vpaes_encrypt_key_to_bsaes` notes "vpaes stores one + // fewer round count than bsaes, but the number of keys is the same," + // so use this as a sanity check. + debug_assert_eq!(bsaes_key.rounds(), vpaes_key.rounds() + 1); + + // SAFETY: + // * `bsaes_key` is in bsaes format after calling + // `vpaes_encrypt_key_to_bsaes`. + // * `bsaes_ctr32_encrypt_blocks` satisfies the contract for + // `ctr32_encrypt_blocks`. + unsafe { + ctr32_encrypt_blocks!(bsaes_ctr32_encrypt_blocks, in_out, &bsaes_key, ctr); + } +} diff --git a/ring-0.17.14/src/aead/aes/fallback.rs b/ring-0.17.14/src/aead/aes/fallback.rs new file mode 100644 index 0000000000..40dee0d757 --- /dev/null +++ b/ring-0.17.14/src/aead/aes/fallback.rs @@ -0,0 +1,44 @@ +// Copyright 2018-2024 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. 
+// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +use super::{Block, Counter, EncryptBlock, EncryptCtr32, Iv, KeyBytes, Overlapping, AES_KEY}; +use crate::error; + +#[derive(Clone)] +pub struct Key { + inner: AES_KEY, +} + +impl Key { + pub(in super::super) fn new(bytes: KeyBytes<'_>) -> Result { + let inner = unsafe { set_encrypt_key!(aes_nohw_set_encrypt_key, bytes) }?; + Ok(Self { inner }) + } +} + +impl EncryptBlock for Key { + fn encrypt_block(&self, block: Block) -> Block { + unsafe { encrypt_block!(aes_nohw_encrypt, block, &self.inner) } + } + + fn encrypt_iv_xor_block(&self, iv: Iv, block: Block) -> Block { + super::encrypt_iv_xor_block_using_encrypt_block(self, iv, block) + } +} + +impl EncryptCtr32 for Key { + fn ctr32_encrypt_within(&self, in_out: Overlapping<'_>, ctr: &mut Counter) { + unsafe { ctr32_encrypt_blocks!(aes_nohw_ctr32_encrypt_blocks, in_out, &self.inner, ctr) } + } +} diff --git a/ring-0.17.14/src/aead/aes/ffi.rs b/ring-0.17.14/src/aead/aes/ffi.rs new file mode 100644 index 0000000000..c2286e1b87 --- /dev/null +++ b/ring-0.17.14/src/aead/aes/ffi.rs @@ -0,0 +1,206 @@ +// Copyright 2018-2024 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +use super::{Block, KeyBytes, Overlapping, BLOCK_LEN}; +use crate::{bits::BitLength, c, error}; +use core::{ + ffi::{c_int, c_uint}, + num::{NonZeroU32, NonZeroUsize}, +}; + +/// nonce || big-endian counter. +#[repr(transparent)] +pub(in super::super) struct Counter(pub(super) [u8; BLOCK_LEN]); + +// Keep this in sync with AES_KEY in aes.h. +#[repr(C)] +#[derive(Clone)] +pub(in super::super) struct AES_KEY { + pub rd_key: [u32; 4 * (MAX_ROUNDS + 1)], + pub rounds: c_uint, +} + +// Keep this in sync with `AES_MAXNR` in aes.h. +const MAX_ROUNDS: usize = 14; + +impl AES_KEY { + #[inline] + pub(super) unsafe fn new( + f: unsafe extern "C" fn(*const u8, BitLength, *mut AES_KEY) -> c_int, + bytes: KeyBytes<'_>, + ) -> Result { + let mut key = Self { + rd_key: [0; 4 * (MAX_ROUNDS + 1)], + rounds: 0, + }; + + let (bytes, key_bits) = match bytes { + KeyBytes::AES_128(bytes) => (&bytes[..], BitLength::from_bits(128)), + KeyBytes::AES_256(bytes) => (&bytes[..], BitLength::from_bits(256)), + }; + + // Unusually, in this case zero means success and non-zero means failure. 
+ if 0 == unsafe { f(bytes.as_ptr(), key_bits, &mut key) } { + debug_assert_ne!(key.rounds, 0); // Sanity check initialization. + Ok(key) + } else { + Err(error::Unspecified) + } + } +} + +#[cfg(all(target_arch = "arm", target_endian = "little"))] +impl AES_KEY { + pub(super) unsafe fn derive( + f: for<'a> unsafe extern "C" fn(*mut AES_KEY, &'a AES_KEY), + src: &Self, + ) -> Self { + let mut r = AES_KEY { + rd_key: [0u32; 4 * (MAX_ROUNDS + 1)], + rounds: 0, + }; + unsafe { f(&mut r, src) }; + r + } + + pub(super) fn rounds(&self) -> u32 { + self.rounds + } +} + +// SAFETY: +// * The function `$name` must read `bits` bits from `user_key`; `bits` will +// always be a valid AES key length, i.e. a whole number of bytes. +// * `$name` must set `key.rounds` to the value expected by the corresponding +// encryption/decryption functions and return 0, or otherwise must return +// non-zero to indicate failure. +// * `$name` may inspect CPU features. +// +// In BoringSSL, the C prototypes for these are in +// crypto/fipsmodule/aes/internal.h. +macro_rules! set_encrypt_key { + ( $name:ident, $key_bytes:expr $(,)? ) => {{ + use crate::bits::BitLength; + use core::ffi::c_int; + prefixed_extern! { + fn $name(user_key: *const u8, bits: BitLength, key: *mut AES_KEY) -> c_int; + } + $crate::aead::aes::ffi::AES_KEY::new($name, $key_bytes) + }}; +} + +macro_rules! encrypt_block { + ($name:ident, $block:expr, $key:expr) => {{ + use crate::aead::aes::{ffi::AES_KEY, Block}; + prefixed_extern! { + fn $name(a: &Block, r: *mut Block, key: &AES_KEY); + } + $key.encrypt_block($name, $block) + }}; +} + +impl AES_KEY { + #[inline] + pub(super) unsafe fn encrypt_block( + &self, + f: unsafe extern "C" fn(&Block, *mut Block, &AES_KEY), + a: Block, + ) -> Block { + let mut result = core::mem::MaybeUninit::uninit(); + unsafe { + f(&a, result.as_mut_ptr(), self); + result.assume_init() + } + } +} + +/// SAFETY: +/// * The caller must ensure that `$key` was initialized with the +/// `set_encrypt_key!` invocation that `$name` requires. +/// * The caller must ensure that fhe function `$name` satisfies the conditions +/// for the `f` parameter to `ctr32_encrypt_blocks`. +macro_rules! ctr32_encrypt_blocks { + ($name:ident, $in_out:expr, $key:expr, $ctr:expr $(,)? ) => {{ + use crate::{ + aead::aes::{ffi::AES_KEY, Counter, BLOCK_LEN}, + c, + }; + prefixed_extern! { + fn $name( + input: *const [u8; BLOCK_LEN], + output: *mut [u8; BLOCK_LEN], + blocks: c::NonZero_size_t, + key: &AES_KEY, + ivec: &Counter, + ); + } + $key.ctr32_encrypt_blocks($name, $in_out, $ctr) + }}; +} + +impl AES_KEY { + /// SAFETY: + /// * `f` must not read more than `blocks` blocks from `input`. + /// * `f` must write exactly `block` blocks to `output`. + /// * In particular, `f` must handle blocks == 0 without reading from `input` + /// or writing to `output`. + /// * `f` must support the input overlapping with the output exactly or + /// with any nonnegative offset `n` (i.e. `input == output.add(n)`); + /// `f` does NOT need to support the cases where input < output. + /// * `key` must have been initialized with the `set_encrypt_key!` invocation + /// that corresponds to `f`. + /// * `f` may inspect CPU features. 
+ #[inline] + pub(super) unsafe fn ctr32_encrypt_blocks( + &self, + f: unsafe extern "C" fn( + input: *const [u8; BLOCK_LEN], + output: *mut [u8; BLOCK_LEN], + blocks: c::NonZero_size_t, + key: &AES_KEY, + ivec: &Counter, + ), + in_out: Overlapping<'_>, + ctr: &mut Counter, + ) { + in_out.with_input_output_len(|input, output, len| { + debug_assert_eq!(len % BLOCK_LEN, 0); + + let blocks = match NonZeroUsize::new(len / BLOCK_LEN) { + Some(blocks) => blocks, + None => { + return; + } + }; + + let input: *const [u8; BLOCK_LEN] = input.cast(); + let output: *mut [u8; BLOCK_LEN] = output.cast(); + let blocks_u32: NonZeroU32 = blocks.try_into().unwrap(); + + // SAFETY: + // * `input` points to `blocks` blocks. + // * `output` points to space for `blocks` blocks to be written. + // * input == output.add(n), where n == src.start, and the caller is + // responsible for ensuing this sufficient for `f` to work correctly. + // * `blocks` is non-zero so `f` doesn't have to work for empty slices. + // * The caller is responsible for ensuring `key` was initialized by the + // `set_encrypt_key!` invocation required by `f`. + unsafe { + f(input, output, blocks, self, ctr); + } + + ctr.increment_by_less_safe(blocks_u32); + }); + } +} diff --git a/ring-0.17.14/src/aead/aes/hw.rs b/ring-0.17.14/src/aead/aes/hw.rs new file mode 100644 index 0000000000..6310f22dab --- /dev/null +++ b/ring-0.17.14/src/aead/aes/hw.rs @@ -0,0 +1,95 @@ +// Copyright 2018-2024 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +#![cfg(any( + all(target_arch = "aarch64", target_endian = "little"), + target_arch = "x86", + target_arch = "x86_64" +))] + +use super::{Block, Counter, EncryptBlock, EncryptCtr32, Iv, KeyBytes, Overlapping, AES_KEY}; +use crate::{cpu, error}; +use cfg_if::cfg_if; + +cfg_if! { + if #[cfg(all(target_arch = "aarch64", target_endian = "little"))] { + pub(in super::super) type RequiredCpuFeatures = cpu::arm::Aes; + pub(in super::super) type OptionalCpuFeatures = (); + } else if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { + use cpu::intel::{Aes, Avx, Ssse3}; + // Some functions seem to have been written to require only SSE/SSE2 + // but there seem to be no SSSE3-less CPUs with AES-NI, and we don't + // have feature detection for SSE2. 
+ pub(in super::super) type RequiredCpuFeatures = (Aes, Ssse3); + pub(in super::super) type OptionalCpuFeatures = Avx; + } +} + +#[derive(Clone)] +pub struct Key { + inner: AES_KEY, +} + +impl Key { + #[cfg(all(target_arch = "aarch64", target_endian = "little"))] + pub(in super::super) fn new( + bytes: KeyBytes<'_>, + _required_cpu_features: RequiredCpuFeatures, + _optional_cpu_features: Option, + ) -> Result { + let inner = unsafe { set_encrypt_key!(aes_hw_set_encrypt_key, bytes) }?; + Ok(Self { inner }) + } + + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + pub(in super::super) fn new( + bytes: KeyBytes<'_>, + (Aes { .. }, Ssse3 { .. }): RequiredCpuFeatures, + optional_cpu_features: Option, + ) -> Result { + // Ssse3 is required, but upstream only uses this if there is also Avx; + // presumably the base version is faster on pre-AVX CPUs. + let inner = if let Some(Avx { .. }) = optional_cpu_features { + unsafe { set_encrypt_key!(aes_hw_set_encrypt_key_alt, bytes) }? + } else { + unsafe { set_encrypt_key!(aes_hw_set_encrypt_key_base, bytes) }? + }; + Ok(Self { inner }) + } + + #[cfg(any( + all(target_arch = "aarch64", target_endian = "little"), + target_arch = "x86_64" + ))] + #[must_use] + pub(in super::super) fn inner_less_safe(&self) -> &AES_KEY { + &self.inner + } +} + +impl EncryptBlock for Key { + fn encrypt_block(&self, block: Block) -> Block { + super::encrypt_block_using_encrypt_iv_xor_block(self, block) + } + + fn encrypt_iv_xor_block(&self, iv: Iv, block: Block) -> Block { + super::encrypt_iv_xor_block_using_ctr32(self, iv, block) + } +} + +impl EncryptCtr32 for Key { + fn ctr32_encrypt_within(&self, in_out: Overlapping<'_>, ctr: &mut Counter) { + unsafe { ctr32_encrypt_blocks!(aes_hw_ctr32_encrypt_blocks, in_out, &self.inner, ctr) } + } +} diff --git a/ring-0.17.14/src/aead/aes/vp.rs b/ring-0.17.14/src/aead/aes/vp.rs new file mode 100644 index 0000000000..283cd7d14b --- /dev/null +++ b/ring-0.17.14/src/aead/aes/vp.rs @@ -0,0 +1,139 @@ +// Copyright 2018-2024 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
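// Illustrative note (not part of the vendored ring sources): the
// `EncryptBlock`/`EncryptCtr32` helpers in aes.rs rely on the usual CTR-mode
// identity, writing `E_k` for raw AES block encryption:
//
//     encrypt_iv_xor_block(iv, block) = E_k(iv) ^ block
//
// so a raw block encryption can be recovered as
//
//     E_k(block) = encrypt_iv_xor_block(block, ZERO_BLOCK)
//
// and a one-block CTR32 encryption with the counter set to `iv` computes the
// same XOR. That is why the `hw::Key` impls just above, and the `vp::Key`
// impls below on most targets, can route `encrypt_block` through
// `encrypt_block_using_encrypt_iv_xor_block` and `encrypt_iv_xor_block`
// through `encrypt_iv_xor_block_using_ctr32`.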
+ +#![cfg(any( + all(target_arch = "aarch64", target_endian = "little"), + all(target_arch = "arm", target_endian = "little"), + target_arch = "x86", + target_arch = "x86_64" +))] + +use super::{Block, Counter, EncryptBlock, EncryptCtr32, Iv, KeyBytes, Overlapping, AES_KEY}; +use crate::{cpu, error}; + +#[cfg(any( + all(target_arch = "aarch64", target_endian = "little"), + all(target_arch = "arm", target_endian = "little") +))] +type RequiredCpuFeatures = cpu::arm::Neon; + +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +pub(in super::super) type RequiredCpuFeatures = cpu::intel::Ssse3; + +#[derive(Clone)] +pub(in super::super) struct Key { + inner: AES_KEY, +} + +impl Key { + pub(in super::super) fn new( + bytes: KeyBytes<'_>, + _cpu: RequiredCpuFeatures, + ) -> Result { + let inner = unsafe { set_encrypt_key!(vpaes_set_encrypt_key, bytes) }?; + Ok(Self { inner }) + } +} + +#[cfg(any( + all(target_arch = "aarch64", target_endian = "little"), + all(target_arch = "arm", target_endian = "little"), + target_arch = "x86_64" +))] +impl EncryptBlock for Key { + fn encrypt_block(&self, block: Block) -> Block { + super::encrypt_block_using_encrypt_iv_xor_block(self, block) + } + + fn encrypt_iv_xor_block(&self, iv: Iv, block: Block) -> Block { + super::encrypt_iv_xor_block_using_ctr32(self, iv, block) + } +} + +#[cfg(any( + all(target_arch = "aarch64", target_endian = "little"), + target_arch = "x86_64" +))] +impl EncryptCtr32 for Key { + fn ctr32_encrypt_within(&self, in_out: Overlapping<'_>, ctr: &mut Counter) { + unsafe { ctr32_encrypt_blocks!(vpaes_ctr32_encrypt_blocks, in_out, &self.inner, ctr) } + } +} + +#[cfg(all(target_arch = "arm", target_endian = "little"))] +impl EncryptCtr32 for Key { + fn ctr32_encrypt_within(&self, in_out: Overlapping<'_>, ctr: &mut Counter) { + use super::{super::overlapping::IndexError, bs, BLOCK_LEN}; + + let in_out = { + let (in_out, src) = in_out.into_slice_src_mut(); + let blocks = in_out[src.clone()].len() / BLOCK_LEN; + + // bsaes operates in batches of 8 blocks. + let bsaes_blocks = if blocks >= 8 && (blocks % 8) < 6 { + // It's faster to use bsaes for all the full batches and then + // switch to vpaes for the last partial batch (if any). + blocks - (blocks % 8) + } else if blocks >= 8 { + // It's faster to let bsaes handle everything including + // the last partial batch. + blocks + } else { + // It's faster to let vpaes handle everything. + 0 + }; + let bsaes_in_out_len = bsaes_blocks * BLOCK_LEN; + let bs_in_out = + Overlapping::new(&mut in_out[..(src.start + bsaes_in_out_len)], src.clone()) + .unwrap_or_else(|IndexError { .. }| unreachable!()); + + // SAFETY: + // * self.inner was initialized with `vpaes_set_encrypt_key` above, + // as required by `bsaes_ctr32_encrypt_blocks_with_vpaes_key`. + unsafe { + bs::ctr32_encrypt_blocks_with_vpaes_key(bs_in_out, &self.inner, ctr); + } + + Overlapping::new(&mut in_out[bsaes_in_out_len..], src) + .unwrap_or_else(|IndexError { .. }| unreachable!()) + }; + + // SAFETY: + // * self.inner was initialized with `vpaes_set_encrypt_key` above, + // as required by `vpaes_ctr32_encrypt_blocks`. + // * `vpaes_ctr32_encrypt_blocks` satisfies the contract for + // `ctr32_encrypt_blocks`. 
+ unsafe { ctr32_encrypt_blocks!(vpaes_ctr32_encrypt_blocks, in_out, &self.inner, ctr) } + } +} + +#[cfg(target_arch = "x86")] +impl EncryptBlock for Key { + fn encrypt_block(&self, block: Block) -> Block { + unsafe { encrypt_block!(vpaes_encrypt, block, &self.inner) } + } + + fn encrypt_iv_xor_block(&self, iv: Iv, block: Block) -> Block { + super::encrypt_iv_xor_block_using_encrypt_block(self, iv, block) + } +} + +#[cfg(target_arch = "x86")] +impl EncryptCtr32 for Key { + fn ctr32_encrypt_within(&self, in_out: Overlapping<'_>, ctr: &mut Counter) { + super::super::shift::shift_full_blocks(in_out, |input| { + self.encrypt_iv_xor_block(ctr.increment(), *input) + }); + } +} diff --git a/ring-0.17.14/src/aead/aes_gcm.rs b/ring-0.17.14/src/aead/aes_gcm.rs new file mode 100644 index 0000000000..d9e08a3116 --- /dev/null +++ b/ring-0.17.14/src/aead/aes_gcm.rs @@ -0,0 +1,505 @@ +// Copyright 2015-2025 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +use super::{ + aes::{self, Counter, Overlapping, OverlappingPartialBlock, BLOCK_LEN, ZERO_BLOCK}, + gcm, + overlapping::IndexError, + Aad, Nonce, Tag, +}; +use crate::{ + cpu, + error::{self, InputTooLongError}, + polyfill::{slice, sliceutil::overwrite_at_start, usize_from_u64_saturated}, +}; +use core::ops::RangeFrom; + +#[cfg(any( + all(target_arch = "aarch64", target_endian = "little"), + all(target_arch = "arm", target_endian = "little"), + target_arch = "x86", + target_arch = "x86_64" +))] +use cpu::GetFeature as _; + +mod aarch64; +mod aeshwclmulmovbe; +mod vaesclmulavx2; + +#[derive(Clone)] +pub(super) struct Key(DynKey); + +impl Key { + pub(super) fn new( + key: aes::KeyBytes, + cpu_features: cpu::Features, + ) -> Result { + Ok(Self(DynKey::new(key, cpu_features)?)) + } +} + +#[derive(Clone)] +enum DynKey { + #[cfg(target_arch = "x86_64")] + VAesClMulAvx2(Combo), + + #[cfg(target_arch = "x86_64")] + AesHwClMulAvxMovbe(Combo), + + #[cfg(any( + all(target_arch = "aarch64", target_endian = "little"), + target_arch = "x86", + target_arch = "x86_64" + ))] + AesHwClMul(Combo), + + #[cfg(any( + all(target_arch = "aarch64", target_endian = "little"), + all(target_arch = "arm", target_endian = "little") + ))] + Simd(Combo), + + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + Simd(Combo), + + Fallback(Combo), +} + +impl DynKey { + fn new(key: aes::KeyBytes, cpu: cpu::Features) -> Result { + let cpu = cpu.values(); + + #[cfg(target_arch = "x86_64")] + if let Some((aes, gcm)) = cpu.get_feature() { + // 14.3.1 Detection of VEX-Encoded AES and VPCLMULQDQ + let aes_key = aes::hw::Key::new(key, aes, cpu.get_feature())?; + let gcm_key_value = derive_gcm_key_value(&aes_key); + let combo = if let Some(cpu) = cpu.get_feature() { + let gcm_key = gcm::vclmulavx2::Key::new(gcm_key_value, cpu); + Self::VAesClMulAvx2(Combo { aes_key, gcm_key }) + } else if let 
Some(cpu) = cpu.get_feature() { + let gcm_key = gcm::clmulavxmovbe::Key::new(gcm_key_value, cpu); + Self::AesHwClMulAvxMovbe(Combo { aes_key, gcm_key }) + } else { + let gcm_key = gcm::clmul::Key::new(gcm_key_value, gcm); + Self::AesHwClMul(Combo { aes_key, gcm_key }) + }; + return Ok(combo); + } + + // x86_64 is handled above. + #[cfg(any( + all(target_arch = "aarch64", target_endian = "little"), + target_arch = "x86" + ))] + if let (Some(aes), Some(gcm)) = (cpu.get_feature(), cpu.get_feature()) { + let aes_key = aes::hw::Key::new(key, aes, cpu.get_feature())?; + let gcm_key_value = derive_gcm_key_value(&aes_key); + let gcm_key = gcm::clmul::Key::new(gcm_key_value, gcm); + return Ok(Self::AesHwClMul(Combo { aes_key, gcm_key })); + } + + #[cfg(any( + all(target_arch = "aarch64", target_endian = "little"), + all(target_arch = "arm", target_endian = "little") + ))] + if let Some(cpu) = cpu.get_feature() { + return Self::new_neon(key, cpu); + } + + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + if let Some(cpu) = cpu.get_feature() { + return Self::new_ssse3(key, cpu); + } + + let _ = cpu; + Self::new_fallback(key) + } + + #[cfg(any( + all(target_arch = "aarch64", target_endian = "little"), + all(target_arch = "arm", target_endian = "little") + ))] + #[cfg_attr(target_arch = "aarch64", inline(never))] + fn new_neon(key: aes::KeyBytes, cpu: cpu::arm::Neon) -> Result { + let aes_key = aes::vp::Key::new(key, cpu)?; + let gcm_key_value = derive_gcm_key_value(&aes_key); + let gcm_key = gcm::neon::Key::new(gcm_key_value, cpu); + Ok(Self::Simd(Combo { aes_key, gcm_key })) + } + + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + #[inline(never)] + fn new_ssse3( + key: aes::KeyBytes, + cpu: aes::vp::RequiredCpuFeatures, + ) -> Result { + let aes_key = aes::vp::Key::new(key, cpu)?; + let gcm_key_value = derive_gcm_key_value(&aes_key); + let gcm_key = gcm::fallback::Key::new(gcm_key_value); + Ok(Self::Simd(Combo { aes_key, gcm_key })) + } + + #[cfg_attr( + any( + all(target_arch = "aarch64", target_endian = "little"), + all(target_arch = "arm", target_endian = "little"), + target_arch = "x86", + target_arch = "x86_64", + ), + inline(never) + )] + fn new_fallback(key: aes::KeyBytes) -> Result { + let aes_key = aes::fallback::Key::new(key)?; + let gcm_key_value = derive_gcm_key_value(&aes_key); + let gcm_key = gcm::fallback::Key::new(gcm_key_value); + Ok(Self::Fallback(Combo { aes_key, gcm_key })) + } +} + +fn derive_gcm_key_value(aes_key: &impl aes::EncryptBlock) -> gcm::KeyValue { + gcm::KeyValue::new(aes_key.encrypt_block(ZERO_BLOCK)) +} + +const CHUNK_BLOCKS: usize = 3 * 1024 / 16; + +#[inline(never)] +pub(super) fn seal( + Key(key): &Key, + nonce: Nonce, + aad: Aad<&[u8]>, + in_out: &mut [u8], +) -> Result { + let mut ctr = Counter::one(nonce); + let tag_iv = ctr.increment(); + + match key { + #[cfg(all(target_arch = "aarch64", target_endian = "little"))] + DynKey::AesHwClMul(c) => { + seal_whole_partial(c, aad, in_out, ctr, tag_iv, aarch64::seal_whole) + } + + #[cfg(target_arch = "x86_64")] + DynKey::VAesClMulAvx2(c) => seal_whole_partial( + c, + aad, + in_out, + ctr, + tag_iv, + vaesclmulavx2::seal_whole_vaes_clmul_avx2, + ), + + #[cfg(target_arch = "x86_64")] + DynKey::AesHwClMulAvxMovbe(Combo { aes_key, gcm_key }) => { + aeshwclmulmovbe::seal(aes_key, gcm_key, ctr, tag_iv, aad, in_out) + } + + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + DynKey::AesHwClMul(c) => seal_strided(c, aad, in_out, ctr, tag_iv), + + #[cfg(any( + all(target_arch = "aarch64", 
target_endian = "little"), + all(target_arch = "arm", target_endian = "little"), + target_arch = "x86_64", + target_arch = "x86" + ))] + DynKey::Simd(c) => seal_strided(c, aad, in_out, ctr, tag_iv), + + DynKey::Fallback(c) => seal_strided(c, aad, in_out, ctr, tag_iv), + } +} + +#[cfg(any( + all(target_arch = "aarch64", target_endian = "little"), + target_arch = "x86_64" +))] +fn seal_whole_partial( + Combo { aes_key, gcm_key }: &Combo, + aad: Aad<&[u8]>, + in_out: &mut [u8], + mut ctr: Counter, + tag_iv: aes::Iv, + seal_whole: impl FnOnce(&A, &mut gcm::Context, &mut Counter, slice::AsChunksMut), +) -> Result { + let mut auth = gcm::Context::new(gcm_key, aad, in_out.len())?; + let (whole, remainder) = slice::as_chunks_mut(in_out); + seal_whole(aes_key, &mut auth, &mut ctr, whole); + let remainder = OverlappingPartialBlock::new(remainder.into()) + .unwrap_or_else(|InputTooLongError { .. }| unreachable!()); + seal_finish(aes_key, auth, remainder, ctr, tag_iv) +} + +#[cfg_attr( + any( + all(target_arch = "aarch64", target_endian = "little"), + all(target_arch = "arm", target_endian = "little"), + target_arch = "x86", + target_arch = "x86_64" + ), + inline(never) +)] +#[cfg_attr( + any( + all(target_arch = "aarch64", target_endian = "little"), + target_arch = "x86_64" + ), + cold +)] +fn seal_strided< + A: aes::EncryptBlock + aes::EncryptCtr32, + G: gcm::UpdateBlock + gcm::UpdateBlocks, +>( + Combo { aes_key, gcm_key }: &Combo, + aad: Aad<&[u8]>, + in_out: &mut [u8], + mut ctr: Counter, + tag_iv: aes::Iv, +) -> Result { + let mut auth = gcm::Context::new(gcm_key, aad, in_out.len())?; + + let (mut whole, remainder) = slice::as_chunks_mut(in_out); + + for mut chunk in whole.chunks_mut::() { + aes_key.ctr32_encrypt_within(chunk.as_flattened_mut().into(), &mut ctr); + auth.update_blocks(chunk.as_ref()); + } + + let remainder = OverlappingPartialBlock::new(remainder.into()) + .unwrap_or_else(|InputTooLongError { .. 
}| unreachable!()); + seal_finish(aes_key, auth, remainder, ctr, tag_iv) +} + +fn seal_finish( + aes_key: &A, + mut auth: gcm::Context, + remainder: OverlappingPartialBlock<'_>, + ctr: Counter, + tag_iv: aes::Iv, +) -> Result { + let remainder_len = remainder.len(); + if remainder_len > 0 { + let mut input = ZERO_BLOCK; + overwrite_at_start(&mut input, remainder.input()); + let mut output = aes_key.encrypt_iv_xor_block(ctr.into(), input); + output[remainder_len..].fill(0); + auth.update_block(output); + remainder.overwrite_at_start(output); + } + + Ok(finish(aes_key, auth, tag_iv)) +} + +#[inline(never)] +pub(super) fn open( + Key(key): &Key, + nonce: Nonce, + aad: Aad<&[u8]>, + in_out_slice: &mut [u8], + src: RangeFrom, +) -> Result { + let mut ctr = Counter::one(nonce); + let tag_iv = ctr.increment(); + + match key { + #[cfg(all(target_arch = "aarch64", target_endian = "little"))] + DynKey::AesHwClMul(c) => { + open_whole_partial(c, aad, in_out_slice, src, ctr, tag_iv, aarch64::open_whole) + } + + #[cfg(target_arch = "x86_64")] + DynKey::VAesClMulAvx2(c) => open_whole_partial( + c, + aad, + in_out_slice, + src, + ctr, + tag_iv, + vaesclmulavx2::open_whole_vaes_clmul_avx2, + ), + + #[cfg(target_arch = "x86_64")] + DynKey::AesHwClMulAvxMovbe(Combo { aes_key, gcm_key }) => { + aeshwclmulmovbe::open(aes_key, gcm_key, ctr, tag_iv, aad, in_out_slice, src) + } + + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + DynKey::AesHwClMul(c) => open_strided(c, aad, in_out_slice, src, ctr, tag_iv), + + #[cfg(any( + all(target_arch = "aarch64", target_endian = "little"), + all(target_arch = "arm", target_endian = "little"), + target_arch = "x86_64", + target_arch = "x86" + ))] + DynKey::Simd(c) => open_strided(c, aad, in_out_slice, src, ctr, tag_iv), + + DynKey::Fallback(c) => open_strided(c, aad, in_out_slice, src, ctr, tag_iv), + } +} + +#[cfg(any( + all(target_arch = "aarch64", target_endian = "little"), + target_arch = "x86_64" +))] +fn open_whole_partial( + Combo { aes_key, gcm_key }: &Combo, + aad: Aad<&[u8]>, + in_out_slice: &mut [u8], + src: RangeFrom, + mut ctr: Counter, + tag_iv: aes::Iv, + open_whole: impl FnOnce(&A, &mut gcm::Context, Overlapping, &mut Counter), +) -> Result { + let in_out = Overlapping::new(in_out_slice, src.clone()).map_err(error::erase::)?; + let mut auth = gcm::Context::new(gcm_key, aad, in_out.len())?; + + let remainder_len = in_out.len() % BLOCK_LEN; + + let in_out_slice_len = in_out_slice.len(); + let whole_in_out_slice = &mut in_out_slice[..(in_out_slice_len - remainder_len)]; + let whole = Overlapping::new(whole_in_out_slice, src.clone()) + .unwrap_or_else(|IndexError { .. }| unreachable!()); + let whole_len = whole.len(); + open_whole(aes_key, &mut auth, whole, &mut ctr); + + let remainder = &mut in_out_slice[whole_len..]; + let remainder = + Overlapping::new(remainder, src).unwrap_or_else(|IndexError { .. }| unreachable!()); + let remainder = OverlappingPartialBlock::new(remainder) + .unwrap_or_else(|InputTooLongError { .. 
}| unreachable!()); + open_finish(aes_key, auth, remainder, ctr, tag_iv) +} + +#[cfg_attr( + any( + all( + any( + all(target_arch = "aarch64", target_endian = "little"), + all(target_arch = "arm", target_endian = "little") + ), + target_feature = "neon" + ), + all( + any(target_arch = "x86", target_arch = "x86_64"), + target_feature = "sse" + ) + ), + inline(never) +)] +#[cfg_attr( + any( + all(target_arch = "aarch64", target_endian = "little"), + target_arch = "x86_64" + ), + cold +)] +fn open_strided< + A: aes::EncryptBlock + aes::EncryptCtr32, + G: gcm::UpdateBlock + gcm::UpdateBlocks, +>( + Combo { aes_key, gcm_key }: &Combo, + aad: Aad<&[u8]>, + in_out_slice: &mut [u8], + src: RangeFrom, + mut ctr: Counter, + tag_iv: aes::Iv, +) -> Result { + let in_out = Overlapping::new(in_out_slice, src.clone()).map_err(error::erase::)?; + let input = in_out.input(); + let input_len = input.len(); + + let mut auth = gcm::Context::new(gcm_key, aad, input_len)?; + + let remainder_len = input_len % BLOCK_LEN; + let whole_len = input_len - remainder_len; + let in_prefix_len = src.start; + + { + let mut chunk_len = CHUNK_BLOCKS * BLOCK_LEN; + let mut output = 0; + let mut input = in_prefix_len; + loop { + if whole_len - output < chunk_len { + chunk_len = whole_len - output; + } + + let ciphertext = &in_out_slice[input..][..chunk_len]; + let (ciphertext, leftover) = slice::as_chunks(ciphertext); + debug_assert_eq!(leftover.len(), 0); + if ciphertext.is_empty() { + break; + } + auth.update_blocks(ciphertext); + + let chunk = Overlapping::new( + &mut in_out_slice[output..][..(chunk_len + in_prefix_len)], + in_prefix_len.., + ) + .map_err(error::erase::)?; + aes_key.ctr32_encrypt_within(chunk, &mut ctr); + output += chunk_len; + input += chunk_len; + } + } + + let in_out = Overlapping::new(&mut in_out_slice[whole_len..], src) + .unwrap_or_else(|IndexError { .. }| unreachable!()); + let in_out = OverlappingPartialBlock::new(in_out) + .unwrap_or_else(|InputTooLongError { .. }| unreachable!()); + + open_finish(aes_key, auth, in_out, ctr, tag_iv) +} + +fn open_finish( + aes_key: &A, + mut auth: gcm::Context, + remainder: OverlappingPartialBlock<'_>, + ctr: Counter, + tag_iv: aes::Iv, +) -> Result { + if remainder.len() > 0 { + let mut input = ZERO_BLOCK; + overwrite_at_start(&mut input, remainder.input()); + auth.update_block(input); + remainder.overwrite_at_start(aes_key.encrypt_iv_xor_block(ctr.into(), input)); + } + Ok(finish(aes_key, auth, tag_iv)) +} + +fn finish( + aes_key: &A, + gcm_ctx: gcm::Context, + tag_iv: aes::Iv, +) -> Tag { + // Finalize the tag and return it. + gcm_ctx.pre_finish(|pre_tag| Tag(aes_key.encrypt_iv_xor_block(tag_iv, pre_tag))) +} + +pub(super) const MAX_IN_OUT_LEN: usize = super::max_input_len(BLOCK_LEN, 2); + +// [NIST SP800-38D] Section 5.2.1.1. Note that [RFC 5116 Section 5.1] and +// [RFC 5116 Section 5.2] have an off-by-one error in `P_MAX`. 
+// +// [NIST SP800-38D]: +// http://nvlpubs.nist.gov/nistpubs/Legacy/SP/nistspecialpublication800-38d.pdf +// [RFC 5116 Section 5.1]: https://tools.ietf.org/html/rfc5116#section-5.1 +// [RFC 5116 Section 5.2]: https://tools.ietf.org/html/rfc5116#section-5.2 +const _MAX_INPUT_LEN_BOUNDED_BY_NIST: () = + assert!(MAX_IN_OUT_LEN == usize_from_u64_saturated(((1u64 << 39) - 256) / 8)); + +#[derive(Copy, Clone)] +pub(super) struct Combo { + pub(super) aes_key: Aes, + pub(super) gcm_key: Gcm, +} diff --git a/ring-0.17.14/src/aead/aes_gcm/aarch64.rs b/ring-0.17.14/src/aead/aes_gcm/aarch64.rs new file mode 100644 index 0000000000..15aa5b134f --- /dev/null +++ b/ring-0.17.14/src/aead/aes_gcm/aarch64.rs @@ -0,0 +1,95 @@ +// Copyright 2015-2025 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +#![cfg(all(target_arch = "aarch64", target_endian = "little"))] + +use super::{aes, gcm, Counter, BLOCK_LEN}; +use crate::{aead::aes::Overlapping, bits::BitLength, polyfill::slice::AsChunksMut}; +use core::num::NonZeroU64; + +pub(super) fn seal_whole( + aes_key: &aes::hw::Key, + auth: &mut gcm::Context, + ctr: &mut Counter, + mut in_out: AsChunksMut, +) { + let whole_block_bits = auth.in_out_whole_block_bits(); + let whole_block_bits_u64: BitLength = whole_block_bits.into(); + if let Ok(whole_block_bits) = whole_block_bits_u64.try_into() { + let (htable, xi) = auth.inner(); + + prefixed_extern! { + fn aes_gcm_enc_kernel( + input: *const [u8; BLOCK_LEN], + in_bits: BitLength, + output: *mut [u8; BLOCK_LEN], + Xi: &mut gcm::Xi, + ivec: &mut Counter, + key: &aes::AES_KEY, + Htable: &gcm::HTable); + } + + unsafe { + aes_gcm_enc_kernel( + in_out.as_ptr(), + whole_block_bits, + in_out.as_mut_ptr(), + xi, + ctr, + aes_key.inner_less_safe(), + htable, + ) + } + } +} + +pub(super) fn open_whole( + aes_key: &aes::hw::Key, + auth: &mut gcm::Context, + in_out: Overlapping, + ctr: &mut Counter, +) { + // Precondition. TODO: Create an overlapping::AsChunks for this. + assert_eq!(in_out.len() % BLOCK_LEN, 0); + + in_out.with_input_output_len(|input, output, _len| { + let whole_block_bits = auth.in_out_whole_block_bits(); + let whole_block_bits_u64: BitLength = whole_block_bits.into(); + if let Ok(whole_block_bits) = whole_block_bits_u64.try_into() { + let (htable, xi) = auth.inner(); + prefixed_extern! 
{ + fn aes_gcm_dec_kernel( + input: *const u8, + in_bits: BitLength, + output: *mut u8, + Xi: &mut gcm::Xi, + ivec: &mut Counter, + key: &aes::AES_KEY, + Htable: &gcm::HTable); + } + + unsafe { + aes_gcm_dec_kernel( + input, + whole_block_bits, + output, + xi, + ctr, + aes_key.inner_less_safe(), + htable, + ) + } + } + }) +} diff --git a/ring-0.17.14/src/aead/aes_gcm/aeshwclmulmovbe.rs b/ring-0.17.14/src/aead/aes_gcm/aeshwclmulmovbe.rs new file mode 100644 index 0000000000..e6d49ee3bb --- /dev/null +++ b/ring-0.17.14/src/aead/aes_gcm/aeshwclmulmovbe.rs @@ -0,0 +1,154 @@ +// Copyright 2015-2025 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +#![cfg(target_arch = "x86_64")] + +use super::{ + super::overlapping::IndexError, + aes::{self, Counter, EncryptCtr32, Overlapping, OverlappingPartialBlock}, + gcm, Aad, Tag, +}; +use crate::{ + c, + error::{self, InputTooLongError}, + polyfill::slice, +}; +use core::ops::RangeFrom; + +#[inline(never)] +pub(super) fn seal( + aes_key: &aes::hw::Key, + gcm_key: &gcm::clmulavxmovbe::Key, + mut ctr: Counter, + tag_iv: aes::Iv, + aad: Aad<&[u8]>, + in_out: &mut [u8], +) -> Result { + prefixed_extern! { + // `HTable` and `Xi` should be 128-bit aligned. TODO: Can we shrink `HTable`? The + // assembly says it needs just nine values in that array. + fn aesni_gcm_encrypt( + input: *const u8, + output: *mut u8, + len: c::size_t, + key: &aes::AES_KEY, + ivec: &mut Counter, + Htable: &gcm::HTable, + Xi: &mut gcm::Xi) -> c::size_t; + } + + let mut auth = gcm::Context::new(gcm_key, aad, in_out.len())?; + let (htable, xi) = auth.inner(); + + let processed = unsafe { + aesni_gcm_encrypt( + in_out.as_ptr(), + in_out.as_mut_ptr(), + in_out.len(), + aes_key.inner_less_safe(), + &mut ctr, + htable, + xi, + ) + }; + + let ramaining = match in_out.get_mut(processed..) { + Some(remaining) => remaining, + None => { + // This can't happen. If it did, then the assembly already + // caused a buffer overflow. + unreachable!() + } + }; + let (mut whole, remainder) = slice::as_chunks_mut(ramaining); + aes_key.ctr32_encrypt_within(whole.as_flattened_mut().into(), &mut ctr); + auth.update_blocks(whole.as_ref()); + let remainder = OverlappingPartialBlock::new(remainder.into()) + .unwrap_or_else(|InputTooLongError { .. }| unreachable!()); + + super::seal_finish(aes_key, auth, remainder, ctr, tag_iv) +} + +#[inline(never)] +pub(super) fn open( + aes_key: &aes::hw::Key, + gcm_key: &gcm::clmulavxmovbe::Key, + mut ctr: Counter, + tag_iv: aes::Iv, + aad: Aad<&[u8]>, + in_out_slice: &mut [u8], + src: RangeFrom, +) -> Result { + prefixed_extern! { + // `HTable` and `Xi` should be 128-bit aligned. TODO: Can we shrink `HTable`? The + // assembly says it needs just nine values in that array. 
+ fn aesni_gcm_decrypt( + input: *const u8, + output: *mut u8, + len: c::size_t, + key: &aes::AES_KEY, + ivec: &mut Counter, + Htable: &gcm::HTable, + Xi: &mut gcm::Xi) -> c::size_t; + } + + let in_out = Overlapping::new(in_out_slice, src.clone()).map_err(error::erase::)?; + let mut auth = gcm::Context::new(gcm_key, aad, in_out.len())?; + let processed = in_out.with_input_output_len(|input, output, len| { + let (htable, xi) = auth.inner(); + unsafe { + aesni_gcm_decrypt( + input, + output, + len, + aes_key.inner_less_safe(), + &mut ctr, + htable, + xi, + ) + } + }); + let in_out_slice = in_out_slice.get_mut(processed..).unwrap_or_else(|| { + // This can't happen. If it did, then the assembly already + // caused a buffer overflow. + unreachable!() + }); + // Authenticate any remaining whole blocks. + let in_out = + Overlapping::new(in_out_slice, src.clone()).unwrap_or_else(|IndexError { .. }| { + // This can't happen. If it did, then the assembly already + // overwrote part of the remaining input. + unreachable!() + }); + let (whole, _) = slice::as_chunks(in_out.input()); + auth.update_blocks(whole); + + let whole_len = whole.as_flattened().len(); + + // Decrypt any remaining whole blocks. + let whole = Overlapping::new(&mut in_out_slice[..(src.start + whole_len)], src.clone()) + .map_err(error::erase::)?; + aes_key.ctr32_encrypt_within(whole, &mut ctr); + + let in_out_slice = match in_out_slice.get_mut(whole_len..) { + Some(partial) => partial, + None => unreachable!(), + }; + let in_out = + Overlapping::new(in_out_slice, src).unwrap_or_else(|IndexError { .. }| unreachable!()); + let in_out = OverlappingPartialBlock::new(in_out) + .unwrap_or_else(|InputTooLongError { .. }| unreachable!()); + + super::open_finish(aes_key, auth, in_out, ctr, tag_iv) +} diff --git a/ring-0.17.14/src/aead/aes_gcm/vaesclmulavx2.rs b/ring-0.17.14/src/aead/aes_gcm/vaesclmulavx2.rs new file mode 100644 index 0000000000..8a2a68f238 --- /dev/null +++ b/ring-0.17.14/src/aead/aes_gcm/vaesclmulavx2.rs @@ -0,0 +1,86 @@ +// Copyright 2015-2025 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +#![cfg(target_arch = "x86_64")] + +use super::{aes, gcm, Counter, BLOCK_LEN}; +use crate::{aead::aes::Overlapping, c, polyfill::slice::AsChunksMut}; +use core::num::NonZeroU32; + +pub(super) fn seal_whole_vaes_clmul_avx2( + aes_key: &aes::hw::Key, + auth: &mut gcm::Context, + ctr: &mut Counter, + mut in_out: AsChunksMut, +) { + prefixed_extern! { + fn aes_gcm_enc_update_vaes_avx2( + input: *const u8, + output: *mut u8, + len: c::size_t, + key: &aes::AES_KEY, + ivec: &Counter, + Htable: &gcm::HTable, + Xi: &mut gcm::Xi); + } + + let in_out = in_out.as_flattened_mut(); + + // Precondition: Since we have a `gcm::Context` then the number of blocks + // must fit in `u32`. 
+ let blocks = u32::try_from(in_out.len() / BLOCK_LEN).unwrap(); + + if let Some(blocks) = NonZeroU32::new(blocks) { + let aes_key = aes_key.inner_less_safe(); + let (htable, xi) = auth.inner(); + let input = in_out.as_ptr(); + let output = in_out.as_mut_ptr(); + let len = in_out.len(); + unsafe { aes_gcm_enc_update_vaes_avx2(input, output, len, aes_key, ctr, htable, xi) }; + ctr.increment_by_less_safe(blocks); + } +} + +pub(super) fn open_whole_vaes_clmul_avx2( + aes_key: &aes::hw::Key, + auth: &mut gcm::Context, + in_out: Overlapping, + ctr: &mut Counter, +) { + prefixed_extern! { + fn aes_gcm_dec_update_vaes_avx2( + input: *const u8, + output: *mut u8, + len: c::size_t, + key: &aes::AES_KEY, + ivec: &mut Counter, + Htable: &gcm::HTable, + Xi: &mut gcm::Xi); + } + + // Precondition. TODO: Create an overlapping::AsChunks for this. + assert_eq!(in_out.len() % BLOCK_LEN, 0); + // Precondition: Since we have a `gcm::Context` then the number of blocks + // must fit in `u32`. + let blocks = u32::try_from(in_out.len() / BLOCK_LEN).unwrap(); + + if let Some(blocks) = NonZeroU32::new(blocks) { + let aes_key = aes_key.inner_less_safe(); + let (htable, xi) = auth.inner(); + in_out.with_input_output_len(|input, output, len| unsafe { + aes_gcm_dec_update_vaes_avx2(input, output, len, aes_key, ctr, htable, xi) + }); + ctr.increment_by_less_safe(blocks); + } +} diff --git a/ring-0.17.14/src/aead/algorithm.rs b/ring-0.17.14/src/aead/algorithm.rs new file mode 100644 index 0000000000..24d4fb89b6 --- /dev/null +++ b/ring-0.17.14/src/aead/algorithm.rs @@ -0,0 +1,269 @@ +// Copyright 2015-2021 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +use crate::{ + bb, cpu, + error::{self, InputTooLongError}, + hkdf, +}; +use core::ops::RangeFrom; + +use super::{ + aes, aes_gcm, chacha20_poly1305, + nonce::{Nonce, NONCE_LEN}, + overlapping::{IndexError, Overlapping}, + Aad, KeyInner, Tag, TAG_LEN, +}; + +impl hkdf::KeyType for &'static Algorithm { + #[inline] + fn len(&self) -> usize { + self.key_len() + } +} + +/// An AEAD Algorithm. +pub struct Algorithm { + init: fn(key: &[u8], cpu_features: cpu::Features) -> Result, + + seal: fn( + key: &KeyInner, + nonce: Nonce, + aad: Aad<&[u8]>, + in_out: &mut [u8], + cpu_features: cpu::Features, + ) -> Result, + open: fn( + key: &KeyInner, + nonce: Nonce, + aad: Aad<&[u8]>, + in_out: &mut [u8], + src: RangeFrom, + cpu_features: cpu::Features, + ) -> Result, + + key_len: usize, + id: AlgorithmID, +} + +impl Algorithm { + /// The length of the key. + #[inline(always)] + pub fn key_len(&self) -> usize { + self.key_len + } + + /// The length of a tag. + /// + /// See also `MAX_TAG_LEN`. + #[inline(always)] + pub fn tag_len(&self) -> usize { + TAG_LEN + } + + /// The length of the nonces. 
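// Illustrative values (not part of the vendored ring sources), following the
// constants defined in this module:
//
//     assert_eq!(AES_128_GCM.key_len(), 16);
//     assert_eq!(AES_256_GCM.key_len(), 32);
//     assert_eq!(CHACHA20_POLY1305.key_len(), 32);
//     assert_eq!(AES_128_GCM.tag_len(), 16);    // TAG_LEN / MAX_TAG_LEN
//     assert_eq!(AES_128_GCM.nonce_len(), 12);  // NONCE_LEN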
+ #[inline(always)] + pub fn nonce_len(&self) -> usize { + NONCE_LEN + } + + pub(super) fn new_key( + &self, + key_bytes: &[u8], + cpu_features: cpu::Features, + ) -> Result { + (self.init)(key_bytes, cpu_features) + } + + pub(super) fn open_within<'io>( + &self, + key: &KeyInner, + nonce: Nonce, + aad: Aad<&[u8]>, + received_tag: Tag, + in_out: &'io mut [u8], + src: RangeFrom, + cpu_features: cpu::Features, + ) -> Result<&'io mut [u8], error::Unspecified> { + let ciphertext_len = in_out.get(src.clone()).ok_or(error::Unspecified)?.len(); + + let Tag(calculated_tag) = (self.open)(key, nonce, aad, in_out, src, cpu_features)?; + + if bb::verify_slices_are_equal(calculated_tag.as_ref(), received_tag.as_ref()).is_err() { + // Zero out the plaintext so that it isn't accidentally leaked or used + // after verification fails. It would be safest if we could check the + // tag before decrypting, but some `open` implementations interleave + // authentication with decryption for performance. + for b in &mut in_out[..ciphertext_len] { + *b = 0; + } + return Err(error::Unspecified); + } + + // `ciphertext_len` is also the plaintext length. + Ok(&mut in_out[..ciphertext_len]) + } + + #[inline] + pub(super) fn seal( + &self, + key: &KeyInner, + nonce: Nonce, + aad: Aad<&[u8]>, + in_out: &mut [u8], + cpu_features: cpu::Features, + ) -> Result { + (self.seal)(key, nonce, aad, in_out, cpu_features) + } +} + +derive_debug_via_id!(Algorithm); + +#[derive(Debug, Eq, PartialEq)] +pub(super) enum AlgorithmID { + AES_128_GCM, + AES_256_GCM, + CHACHA20_POLY1305, +} + +impl PartialEq for Algorithm { + fn eq(&self, other: &Self) -> bool { + self.id == other.id + } +} + +impl Eq for Algorithm {} + +/// AES-128 in GCM mode with 128-bit tags and 96 bit nonces. +pub static AES_128_GCM: Algorithm = Algorithm { + key_len: aes::AES_128_KEY_LEN, + init: aes_gcm_init_128, + seal: aes_gcm_seal, + open: aes_gcm_open, + id: AlgorithmID::AES_128_GCM, +}; + +/// AES-256 in GCM mode with 128-bit tags and 96 bit nonces. +pub static AES_256_GCM: Algorithm = Algorithm { + key_len: aes::AES_256_KEY_LEN, + init: aes_gcm_init_256, + seal: aes_gcm_seal, + open: aes_gcm_open, + id: AlgorithmID::AES_256_GCM, +}; + +fn aes_gcm_init_128( + key: &[u8], + cpu_features: cpu::Features, +) -> Result { + let key = key.try_into().map_err(|_| error::Unspecified)?; + Ok(KeyInner::AesGcm(aes_gcm::Key::new( + aes::KeyBytes::AES_128(key), + cpu_features, + )?)) +} + +fn aes_gcm_init_256( + key: &[u8], + cpu_features: cpu::Features, +) -> Result { + let key = key.try_into().map_err(|_| error::Unspecified)?; + Ok(KeyInner::AesGcm(aes_gcm::Key::new( + aes::KeyBytes::AES_256(key), + cpu_features, + )?)) +} + +fn aes_gcm_seal( + key: &KeyInner, + nonce: Nonce, + aad: Aad<&[u8]>, + in_out: &mut [u8], + _cpu_features: cpu::Features, +) -> Result { + let key = match key { + KeyInner::AesGcm(key) => key, + _ => unreachable!(), + }; + aes_gcm::seal(key, nonce, aad, in_out) +} + +pub(super) fn aes_gcm_open( + key: &KeyInner, + nonce: Nonce, + aad: Aad<&[u8]>, + in_out: &mut [u8], + src: RangeFrom, + _cpu_features: cpu::Features, +) -> Result { + let key = match key { + KeyInner::AesGcm(key) => key, + _ => unreachable!(), + }; + aes_gcm::open(key, nonce, aad, in_out, src) +} + +/// ChaCha20-Poly1305 as described in [RFC 8439]. +/// +/// The keys are 256 bits long and the nonces are 96 bits long. 
+/// +/// [RFC 8439]: https://tools.ietf.org/html/rfc8439 +pub static CHACHA20_POLY1305: Algorithm = Algorithm { + key_len: chacha20_poly1305::KEY_LEN, + init: chacha20_poly1305_init, + seal: chacha20_poly1305_seal, + open: chacha20_poly1305_open, + id: AlgorithmID::CHACHA20_POLY1305, +}; + +/// Copies |key| into |ctx_buf|. +fn chacha20_poly1305_init( + key: &[u8], + _cpu_features: cpu::Features, +) -> Result { + let key: [u8; chacha20_poly1305::KEY_LEN] = key.try_into()?; + Ok(KeyInner::ChaCha20Poly1305(chacha20_poly1305::Key::new(key))) +} + +fn chacha20_poly1305_seal( + key: &KeyInner, + nonce: Nonce, + aad: Aad<&[u8]>, + in_out: &mut [u8], + cpu_features: cpu::Features, +) -> Result { + let key = match key { + KeyInner::ChaCha20Poly1305(key) => key, + _ => unreachable!(), + }; + chacha20_poly1305::seal(key, nonce, aad, in_out, cpu_features) + .map_err(error::erase::) +} + +fn chacha20_poly1305_open( + key: &KeyInner, + nonce: Nonce, + aad: Aad<&[u8]>, + in_out: &mut [u8], + src: RangeFrom, + cpu_features: cpu::Features, +) -> Result { + let key = match key { + KeyInner::ChaCha20Poly1305(key) => key, + _ => unreachable!(), + }; + let in_out = Overlapping::new(in_out, src).map_err(error::erase::)?; + chacha20_poly1305::open(key, nonce, aad, in_out, cpu_features) + .map_err(error::erase::) +} diff --git a/ring-0.17.14/src/aead/chacha.rs b/ring-0.17.14/src/aead/chacha.rs new file mode 100644 index 0000000000..8d7230b58c --- /dev/null +++ b/ring-0.17.14/src/aead/chacha.rs @@ -0,0 +1,327 @@ +// Copyright 2016 Brian Smith. +// Portions Copyright (c) 2016, Google Inc. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +use super::{overlapping, quic::Sample, Nonce}; +use crate::cpu; +use cfg_if::cfg_if; + +cfg_if! { + if #[cfg(any( + all(target_arch = "aarch64", target_endian = "little"), + all(target_arch = "arm", target_endian = "little"), + target_arch = "x86", + target_arch = "x86_64" + ))] { + #[macro_use] + mod ffi; + #[cfg(any(target_arch = "x86", test))] + mod fallback; + } else { + mod fallback; + } +} + +use crate::polyfill::ArraySplitMap; + +pub type Overlapping<'o> = overlapping::Overlapping<'o, u8>; + +#[derive(Clone)] +pub struct Key { + words: [u32; KEY_LEN / 4], +} + +impl Key { + pub(super) fn new(value: [u8; KEY_LEN]) -> Self { + Self { + words: value.array_split_map(u32::from_le_bytes), + } + } +} + +impl Key { + // Encrypts `in_out` with the counter 0 and returns counter 1, + // where the counter is derived from the nonce `nonce`. 
+ #[inline] + pub(super) fn encrypt_single_block_with_ctr_0( + &self, + nonce: Nonce, + in_out: &mut [u8; N], + cpu: cpu::Features, + ) -> Counter { + assert!(N <= BLOCK_LEN); + let (zero, one) = Counter::zero_one_less_safe(nonce); + self.encrypt(zero, in_out.as_mut().into(), cpu); + one + } + + #[inline] + pub fn new_mask(&self, sample: Sample) -> [u8; 5] { + let cpu = cpu::features(); // TODO: Remove this. + let (ctr, nonce) = sample.split_at(4); + let ctr = u32::from_le_bytes(ctr.try_into().unwrap()); + let nonce = Nonce::assume_unique_for_key(nonce.try_into().unwrap()); + let ctr = Counter::from_nonce_and_ctr(nonce, ctr); + + let mut out: [u8; 5] = [0; 5]; + self.encrypt(ctr, out.as_mut().into(), cpu); + out + } + + #[inline(always)] + pub(super) fn encrypt(&self, counter: Counter, in_out: Overlapping<'_>, cpu: cpu::Features) { + cfg_if! { + if #[cfg(all(target_arch = "aarch64", target_endian = "little"))] { + use cpu::{GetFeature as _, arm::Neon}; + const NEON_MIN_LEN: usize = 192 + 1; + if in_out.len() >= NEON_MIN_LEN { + if let Some(cpu) = cpu.get_feature() { + return chacha20_ctr32_ffi!( + unsafe { (NEON_MIN_LEN, Neon, Overlapping<'_>) => ChaCha20_ctr32_neon }, + self, counter, in_out, cpu); + } + } + if in_out.len() >= 1 { + chacha20_ctr32_ffi!( + unsafe { (1, (), Overlapping<'_>) => ChaCha20_ctr32_nohw }, + self, counter, in_out, ()) + } + } else if #[cfg(all(target_arch = "arm", target_endian = "little"))] { + use cpu::{GetFeature as _, arm::Neon}; + const NEON_MIN_LEN: usize = 192 + 1; + if in_out.len() >= NEON_MIN_LEN { + if let Some(cpu) = cpu.get_feature() { + return chacha20_ctr32_ffi!( + unsafe { (NEON_MIN_LEN, Neon, &mut [u8]) => ChaCha20_ctr32_neon }, + self, counter, in_out.copy_within(), cpu); + } + } + if in_out.len() >= 1 { + chacha20_ctr32_ffi!( + unsafe { (1, (), &mut [u8]) => ChaCha20_ctr32_nohw }, + self, counter, in_out.copy_within(), ()) + } + } else if #[cfg(target_arch = "x86")] { + use cpu::{GetFeature as _, intel::Ssse3}; + if in_out.len() >= 1 { + if let Some(cpu) = cpu.get_feature() { + chacha20_ctr32_ffi!( + unsafe { (1, Ssse3, &mut [u8]) => ChaCha20_ctr32_ssse3 }, + self, counter, in_out.copy_within(), cpu) + } else { + let _: cpu::Features = cpu; + fallback::ChaCha20_ctr32(self, counter, in_out) + } + } + } else if #[cfg(target_arch = "x86_64")] { + use cpu::{GetFeature, intel::{Avx2, Ssse3}}; + const SSE_MIN_LEN: usize = 128 + 1; // Also AVX2, SSSE3_4X, SSSE3 + if in_out.len() >= SSE_MIN_LEN { + let values = cpu.values(); + if let Some(cpu) = values.get_feature() { + return chacha20_ctr32_ffi!( + unsafe { (SSE_MIN_LEN, Avx2, Overlapping<'_>) => ChaCha20_ctr32_avx2 }, + self, counter, in_out, cpu); + } + if let Some(cpu) = values.get_feature() { + return chacha20_ctr32_ffi!( + unsafe { (SSE_MIN_LEN, Ssse3, Overlapping<'_>) => + ChaCha20_ctr32_ssse3_4x }, + self, counter, in_out, cpu); + } + } + if in_out.len() >= 1 { + chacha20_ctr32_ffi!( + unsafe { (1, (), Overlapping<'_>) => ChaCha20_ctr32_nohw }, + self, counter, in_out, ()) + } + } else { + let _: cpu::Features = cpu; + fallback::ChaCha20_ctr32(self, counter, in_out) + } + } + } + + #[inline] + pub(super) fn words_less_safe(&self) -> &[u32; KEY_LEN / 4] { + &self.words + } +} + +/// Counter || Nonce, all native endian. +#[repr(transparent)] +pub struct Counter([u32; 4]); + +impl Counter { + // Nonce-reuse: the caller must only use the first counter (0) for at most + // a single block. 
+ fn zero_one_less_safe(nonce: Nonce) -> (Self, Self) { + let ctr0 @ Self([_, n0, n1, n2]) = Self::from_nonce_and_ctr(nonce, 0); + let ctr1 = Self([1, n0, n1, n2]); + (ctr0, ctr1) + } + + fn from_nonce_and_ctr(nonce: Nonce, ctr: u32) -> Self { + let [n0, n1, n2] = nonce.as_ref().array_split_map(u32::from_le_bytes); + Self([ctr, n0, n1, n2]) + } + + /// This is "less safe" because it hands off management of the counter to + /// the caller. + #[cfg(any( + test, + not(any( + all(target_arch = "aarch64", target_endian = "little"), + all(target_arch = "arm", target_endian = "little"), + target_arch = "x86_64" + )) + ))] + fn into_words_less_safe(self) -> [u32; 4] { + self.0 + } +} + +pub const KEY_LEN: usize = 32; + +const BLOCK_LEN: usize = 64; + +#[cfg(test)] +mod tests { + extern crate alloc; + + use super::{super::overlapping::IndexError, *}; + use crate::error; + use crate::testutil as test; + use alloc::vec; + + const MAX_ALIGNMENT_AND_OFFSET: (usize, usize) = (15, 259); + const MAX_ALIGNMENT_AND_OFFSET_SUBSET: (usize, usize) = + if cfg!(any(not(debug_assertions), feature = "slow_tests")) { + MAX_ALIGNMENT_AND_OFFSET + } else { + (0, 0) + }; + + #[test] + fn chacha20_test_default() { + // Always use `MAX_OFFSET` if we hav assembly code. + let max_offset = if cfg!(any( + all(target_arch = "aarch64", target_endian = "little"), + all(target_arch = "arm", target_endian = "little"), + target_arch = "x86", + target_arch = "x86_64" + )) { + MAX_ALIGNMENT_AND_OFFSET + } else { + MAX_ALIGNMENT_AND_OFFSET_SUBSET + }; + chacha20_test(max_offset, Key::encrypt); + } + + // Smoketest the fallback implementation. + #[test] + fn chacha20_test_fallback() { + chacha20_test(MAX_ALIGNMENT_AND_OFFSET_SUBSET, |key, ctr, in_out, _cpu| { + fallback::ChaCha20_ctr32(key, ctr, in_out) + }); + } + + // Verifies the encryption is successful when done on overlapping buffers. + // + // On some branches of the 32-bit x86 and ARM assembly code the in-place + // operation fails in some situations where the input/output buffers are + // not exactly overlapping. Such failures are dependent not only on the + // degree of overlapping but also the length of the data. `encrypt_within` + // works around that. + fn chacha20_test( + max_alignment_and_offset: (usize, usize), + f: impl for<'k, 'o> Fn(&'k Key, Counter, Overlapping<'o>, cpu::Features), + ) { + let cpu = cpu::features(); + + // Reuse a buffer to avoid slowing down the tests with allocations. + let mut buf = vec![0u8; 1300]; + + test::run( + test_vector_file!("chacha_tests.txt"), + move |section, test_case| { + assert_eq!(section, ""); + + let key = test_case.consume_bytes("Key"); + let key: &[u8; KEY_LEN] = key.as_slice().try_into()?; + let key = Key::new(*key); + + let ctr = test_case.consume_usize("Ctr"); + let nonce = test_case.consume_bytes("Nonce"); + let input = test_case.consume_bytes("Input"); + let output = test_case.consume_bytes("Output"); + + // Run the test case over all prefixes of the input because the + // behavior of ChaCha20 implementation changes dependent on the + // length of the input. 
+ for len in 0..=input.len() { + #[allow(clippy::cast_possible_truncation)] + chacha20_test_case_inner( + &key, + &nonce, + ctr as u32, + &input[..len], + &output[..len], + &mut buf, + max_alignment_and_offset, + cpu, + &f, + ); + } + + Ok(()) + }, + ); + } + + fn chacha20_test_case_inner( + key: &Key, + nonce: &[u8], + ctr: u32, + input: &[u8], + expected: &[u8], + buf: &mut [u8], + (max_alignment, max_offset): (usize, usize), + cpu: cpu::Features, + f: &impl for<'k, 'o> Fn(&'k Key, Counter, Overlapping<'o>, cpu::Features), + ) { + const ARBITRARY: u8 = 123; + + for alignment in 0..=max_alignment { + buf[..alignment].fill(ARBITRARY); + let buf = &mut buf[alignment..]; + for offset in 0..=max_offset { + let buf = &mut buf[..(offset + input.len())]; + buf[..offset].fill(ARBITRARY); + let src = offset..; + buf[src.clone()].copy_from_slice(input); + + let ctr = Counter::from_nonce_and_ctr( + Nonce::try_assume_unique_for_key(nonce).unwrap(), + ctr, + ); + let in_out = Overlapping::new(buf, src) + .map_err(error::erase::) + .unwrap(); + f(key, ctr, in_out, cpu); + assert_eq!(&buf[..input.len()], expected) + } + } + } +} diff --git a/ring-0.17.14/src/aead/chacha/fallback.rs b/ring-0.17.14/src/aead/chacha/fallback.rs new file mode 100644 index 0000000000..38f5430fd7 --- /dev/null +++ b/ring-0.17.14/src/aead/chacha/fallback.rs @@ -0,0 +1,108 @@ +// Copyright 2021 Brian Smith. +// Portions Copyright (c) 2014, Google Inc. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ +// Adapted from the public domain, estream code by D. Bernstein. +// Adapted from the BoringSSL crypto/chacha/chacha.c. + +use super::{super::overlapping::IndexError, Counter, Key, Overlapping, BLOCK_LEN}; +use crate::{bb, polyfill::sliceutil}; +use core::mem::size_of; + +pub(super) fn ChaCha20_ctr32(key: &Key, counter: Counter, mut in_out: Overlapping<'_>) { + const SIGMA: [u32; 4] = [ + u32::from_le_bytes(*b"expa"), + u32::from_le_bytes(*b"nd 3"), + u32::from_le_bytes(*b"2-by"), + u32::from_le_bytes(*b"te k"), + ]; + + let key = key.words_less_safe(); + let counter = counter.into_words_less_safe(); + + let mut state = [ + SIGMA[0], SIGMA[1], SIGMA[2], SIGMA[3], key[0], key[1], key[2], key[3], key[4], key[5], + key[6], key[7], counter[0], counter[1], counter[2], counter[3], + ]; + + let mut in_out_len = in_out.len(); + + let mut buf = [0u8; BLOCK_LEN]; + while in_out_len > 0 { + chacha_core(&mut buf, &state); + state[12] += 1; + + debug_assert_eq!(in_out_len, in_out.len()); + + // Both branches do the same thing, but the duplication helps the + // compiler optimize (vectorize) the `BLOCK_LEN` case. + if in_out_len >= BLOCK_LEN { + in_out = in_out + .split_first_chunk::(|in_out| { + bb::xor_assign_at_start(&mut buf, in_out.input()); + sliceutil::overwrite_at_start(in_out.into_unwritten_output(), &buf); + }) + .unwrap_or_else(|IndexError { .. 
}| { + // Since `in_out_len == in_out.len() && in_out_len >= BLOCK_LEN`. + unreachable!() + }); + } else { + bb::xor_assign_at_start(&mut buf, in_out.input()); + sliceutil::overwrite_at_start(in_out.into_unwritten_output(), &buf); + break; + } + + in_out_len -= BLOCK_LEN; + } +} + +// Performs 20 rounds of ChaCha on `input`, storing the result in `output`. +#[inline(always)] +fn chacha_core(output: &mut [u8; BLOCK_LEN], input: &State) { + let mut x = *input; + + for _ in (0..20).step_by(2) { + quarterround(&mut x, 0, 4, 8, 12); + quarterround(&mut x, 1, 5, 9, 13); + quarterround(&mut x, 2, 6, 10, 14); + quarterround(&mut x, 3, 7, 11, 15); + quarterround(&mut x, 0, 5, 10, 15); + quarterround(&mut x, 1, 6, 11, 12); + quarterround(&mut x, 2, 7, 8, 13); + quarterround(&mut x, 3, 4, 9, 14); + } + + for (x, input) in x.iter_mut().zip(input.iter()) { + *x = x.wrapping_add(*input); + } + + output + .chunks_exact_mut(size_of::()) + .zip(x.iter()) + .for_each(|(output, &x)| output.copy_from_slice(&x.to_le_bytes())); +} + +#[inline(always)] +fn quarterround(x: &mut State, a: usize, b: usize, c: usize, d: usize) { + #[inline(always)] + fn step(x: &mut State, a: usize, b: usize, c: usize, rotation: u32) { + x[a] = x[a].wrapping_add(x[b]); + x[c] = (x[c] ^ x[a]).rotate_left(rotation); + } + step(x, a, b, d, 16); + step(x, c, d, b, 12); + step(x, a, b, d, 8); + step(x, c, d, b, 7); +} + +type State = [u32; BLOCK_LEN / 4]; diff --git a/ring-0.17.14/src/aead/chacha/ffi.rs b/ring-0.17.14/src/aead/chacha/ffi.rs new file mode 100644 index 0000000000..bd570133fe --- /dev/null +++ b/ring-0.17.14/src/aead/chacha/ffi.rs @@ -0,0 +1,66 @@ +// Copyright 2016-2025 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +use super::{super::overlapping::Overlapping, Counter, Key}; + +// `unsafe { (N, C, InOut) => f }` means that the function `f` is safe to call +// iff the in/out length is at least `N`, the CPU features `C` are available, +// and the input type is `InOut`. If `f` supports overlapping input/output then +// `InOut` should be `Overlapping<'_, u8>`; otherwise it should be `&mut [u8]`. +macro_rules! chacha20_ctr32_ffi { + ( unsafe { ($MIN_LEN:expr, $Cpu:ty, $InOut:ty) => $f:ident }, + $key:expr, $counter:expr, $in_out:expr, $cpu:expr ) => {{ + prefixed_extern! { + fn $f( + out: *mut u8, + in_: *const u8, + in_len: crate::c::size_t, + key: &[u32; 8], + counter: &crate::aead::chacha::Counter, + ); + } + // SAFETY: The user asserts that $f has the signature above and is safe + // to call if additionally we have a value of type `$Cpu` and an in/out + // value of the indicated type, which we do. + unsafe { + crate::aead::chacha::ffi::chacha20_ctr32_ffi::<$InOut, $Cpu, $MIN_LEN>( + $key, $counter, $in_out, $cpu, $f, + ) + } + }}; +} + +// Panics if `in_out.len() < MIN_LEN`. 
The caller should have guarded against +// that so that the assertion gets optimized away. +pub(super) unsafe fn chacha20_ctr32_ffi< + 'o, + InOut: 'o + Into>, + Cpu, + const MIN_LEN: usize, +>( + key: &Key, + counter: Counter, + in_out: InOut, + cpu: Cpu, + f: unsafe extern "C" fn(*mut u8, *const u8, crate::c::size_t, &[u32; 8], &Counter), +) { + assert!(MIN_LEN > 0); + let in_out: Overlapping<'_, u8> = in_out.into(); + in_out.with_input_output_len(|input, output, len| { + assert!(len >= MIN_LEN); + let key = key.words_less_safe(); + let _: Cpu = cpu; + unsafe { f(output, input, len, key, &counter) } + }); +} diff --git a/ring-0.17.14/src/aead/chacha20_poly1305/integrated.rs b/ring-0.17.14/src/aead/chacha20_poly1305/integrated.rs new file mode 100644 index 0000000000..bb7ec28419 --- /dev/null +++ b/ring-0.17.14/src/aead/chacha20_poly1305/integrated.rs @@ -0,0 +1,230 @@ +// Copyright 2015-2025 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +use super::{ + super::{NONCE_LEN, TAG_LEN}, + chacha::Overlapping, + check_input_lengths, Aad, InputTooLongError, Key, Nonce, Tag, KEY_LEN, +}; +use cfg_if::cfg_if; + +macro_rules! declare_open { + ( $name:ident ) => { + prefixed_extern! { + fn $name( + out_plaintext: *mut u8, + ciphertext: *const u8, + plaintext_len: usize, + ad: *const u8, + ad_len: usize, + data: &mut InOut, + ); + } + }; +} + +macro_rules! declare_seal { + ( $name:ident ) => { + prefixed_extern! { + fn $name( + out_ciphertext: *mut u8, + plaintext: *const u8, + plaintext_len: usize, + ad: *const u8, + ad_len: usize, + data: &mut InOut, + ); + } + }; +} + +cfg_if! { + if #[cfg(all(target_arch = "aarch64", target_endian = "little"))] { + use crate::cpu::arm::Neon; + type RequiredCpuFeatures = Neon; + type OptionalCpuFeatures = (); + } else { + use crate::cpu::intel::{Avx2, Bmi2, Sse41}; + type RequiredCpuFeatures = Sse41; + type OptionalCpuFeatures = (Avx2, Bmi2); + } +} + +pub(super) fn seal( + Key(key): &Key, + nonce: Nonce, + aad: Aad<&[u8]>, + in_out: &mut [u8], + required_cpu_features: RequiredCpuFeatures, + optional_cpu_features: Option, +) -> Result { + check_input_lengths(aad, in_out)?; + + // XXX: BoringSSL uses `alignas(16)` on `key` instead of on the + // structure, but Rust can't do that yet; see + // https://github.com/rust-lang/rust/issues/73557. + // + // Keep in sync with the anonymous struct of BoringSSL's + // `chacha20_poly1305_seal_data`. 
+ #[repr(align(16), C)] + #[derive(Clone, Copy)] + struct seal_data_in { + key: [u32; KEY_LEN / 4], + counter: u32, + nonce: [u8; NONCE_LEN], + extra_ciphertext: *const u8, + extra_ciphertext_len: usize, + } + + let mut data = InOut { + input: seal_data_in { + key: *key.words_less_safe(), + counter: 0, + nonce: *nonce.as_ref(), + extra_ciphertext: core::ptr::null(), + extra_ciphertext_len: 0, + }, + }; + + // Encrypts `plaintext_len` bytes from `plaintext` and writes them to `out_ciphertext`. + + let output = in_out.as_mut_ptr(); + let input = in_out.as_ptr(); + let len = in_out.len(); + let ad = aad.as_ref().as_ptr(); + let ad_len = aad.as_ref().len(); + + #[allow(clippy::needless_late_init)] + let tag; + + cfg_if! { + if #[cfg(all(target_arch = "aarch64", target_endian = "little"))] { + declare_seal! { chacha20_poly1305_seal } + let _: Neon = required_cpu_features; + let _: Option<()> = optional_cpu_features; + tag = unsafe { + chacha20_poly1305_seal(output, input, len, ad, ad_len, &mut data); + &data.out.tag + }; + } else { + let _: Sse41 = required_cpu_features; + if matches!(optional_cpu_features, Some((Avx2 { .. }, Bmi2 { .. }))) { + declare_seal! { chacha20_poly1305_seal_avx2 } + tag = unsafe { + chacha20_poly1305_seal_avx2(output, input, len, ad, ad_len, &mut data); + &data.out.tag + }; + } else { + declare_seal! { chacha20_poly1305_seal_sse41 } + tag = unsafe { + chacha20_poly1305_seal_sse41(output, input, len, ad, ad_len, &mut data); + &data.out.tag + }; + } + } + } + + Ok(Tag(*tag)) +} + +pub(super) fn open( + Key(key): &Key, + nonce: Nonce, + aad: Aad<&[u8]>, + in_out: Overlapping<'_>, + required_cpu_features: RequiredCpuFeatures, + optional_cpu_features: Option, +) -> Result { + check_input_lengths(aad, in_out.input())?; + + // XXX: BoringSSL uses `alignas(16)` on `key` instead of on the + // structure, but Rust can't do that yet; see + // https://github.com/rust-lang/rust/issues/73557. + // + // Keep in sync with the anonymous struct of BoringSSL's + // `chacha20_poly1305_open_data`. + #[derive(Copy, Clone)] + #[repr(align(16), C)] + struct open_data_in { + key: [u32; KEY_LEN / 4], + counter: u32, + nonce: [u8; NONCE_LEN], + } + + let mut data = InOut { + input: open_data_in { + key: *key.words_less_safe(), + counter: 0, + nonce: *nonce.as_ref(), + }, + }; + + in_out.with_input_output_len(|input, output, len| { + let ad = aad.as_ref().as_ptr(); + let ad_len = aad.as_ref().len(); + + #[allow(clippy::needless_late_init)] + let tag; + + cfg_if! { + if #[cfg(all(target_arch = "aarch64", target_endian = "little"))] { + declare_open! { chacha20_poly1305_open } + let _: Neon = required_cpu_features; + let _: Option<()> = optional_cpu_features; + tag = unsafe { + chacha20_poly1305_open(output, input, len, ad, ad_len, &mut data); + &data.out.tag + }; + } else { + let _: Sse41 = required_cpu_features; + if matches!(optional_cpu_features, Some((Avx2 { .. }, Bmi2 { .. }))) { + declare_open! { chacha20_poly1305_open_avx2 } + tag = unsafe { + chacha20_poly1305_open_avx2(output, input, len, ad, ad_len, &mut data); + &data.out.tag + }; + } else { + declare_open! { chacha20_poly1305_open_sse41 } + tag = unsafe { + chacha20_poly1305_open_sse41(output, input, len, ad, ad_len, &mut data); + &data.out.tag + }; + } + } + } + + Ok(Tag(*tag)) + }) +} + +// Keep in sync with BoringSSL's `chacha20_poly1305_open_data` and +// `chacha20_poly1305_seal_data`. 
+#[repr(C)] +pub(super) union InOut +where + T: Copy, +{ + pub(super) input: T, + pub(super) out: Out, +} + +// It isn't obvious whether the assembly code works for tags that aren't +// 16-byte aligned. In practice it will always be 16-byte aligned because it +// is embedded in a union where the other member of the union is 16-byte +// aligned. +#[derive(Clone, Copy)] +#[repr(align(16), C)] +pub(super) struct Out { + pub(super) tag: [u8; TAG_LEN], +} diff --git a/ring-0.17.14/src/aead/chacha20_poly1305/mod.rs b/ring-0.17.14/src/aead/chacha20_poly1305/mod.rs new file mode 100644 index 0000000000..881749081a --- /dev/null +++ b/ring-0.17.14/src/aead/chacha20_poly1305/mod.rs @@ -0,0 +1,167 @@ +// Copyright 2015-2025 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +use super::{ + chacha::{self, Counter, Overlapping}, + poly1305, Aad, Nonce, Tag, +}; +use crate::{ + cpu, + error::InputTooLongError, + polyfill::{slice, sliceutil, u64_from_usize, usize_from_u64_saturated}, +}; +use cfg_if::cfg_if; + +cfg_if! { + if #[cfg(any( + all(target_arch = "aarch64", target_endian = "little"), + target_arch = "x86_64"))] { + use cpu::GetFeature as _; + mod integrated; + } +} + +pub(super) const KEY_LEN: usize = chacha::KEY_LEN; + +const MAX_IN_OUT_LEN: usize = super::max_input_len(64, 1); +// https://tools.ietf.org/html/rfc8439#section-2.8 +const _MAX_IN_OUT_LEN_BOUNDED_BY_RFC: () = + assert!(MAX_IN_OUT_LEN == usize_from_u64_saturated(274_877_906_880u64)); + +#[derive(Clone)] +pub(super) struct Key(chacha::Key); + +impl Key { + pub(super) fn new(value: [u8; KEY_LEN]) -> Self { + Self(chacha::Key::new(value)) + } +} + +pub(super) fn seal( + key: &Key, + nonce: Nonce, + aad: Aad<&[u8]>, + in_out: &mut [u8], + cpu: cpu::Features, +) -> Result { + #[cfg(any( + all(target_arch = "aarch64", target_endian = "little"), + target_arch = "x86_64" + ))] + if let Some(required) = cpu.get_feature() { + return integrated::seal(key, nonce, aad, in_out, required, cpu.get_feature()); + } + + seal_fallback(key, nonce, aad, in_out, cpu) +} + +pub(super) fn seal_fallback( + Key(chacha20_key): &Key, + nonce: Nonce, + aad: Aad<&[u8]>, + in_out: &mut [u8], + cpu: cpu::Features, +) -> Result { + let (counter, poly1305_key) = begin(chacha20_key, nonce, aad, in_out, cpu)?; + let mut auth = poly1305::Context::from_key(poly1305_key, cpu); + + poly1305_update_padded_16(&mut auth, aad.as_ref()); + chacha20_key.encrypt(counter, in_out.into(), cpu); + poly1305_update_padded_16(&mut auth, in_out); + Ok(finish(auth, aad.as_ref().len(), in_out.len())) +} + +pub(super) fn open( + key: &Key, + nonce: Nonce, + aad: Aad<&[u8]>, + in_out: Overlapping<'_>, + cpu: cpu::Features, +) -> Result { + #[cfg(any( + all(target_arch = "aarch64", target_endian = "little"), + target_arch = "x86_64" + ))] + if let Some(required) = cpu.get_feature() { + return 
integrated::open(key, nonce, aad, in_out, required, cpu.get_feature()); + } + + open_fallback(key, nonce, aad, in_out, cpu) +} + +pub(super) fn open_fallback( + Key(chacha20_key): &Key, + nonce: Nonce, + aad: Aad<&[u8]>, + in_out: Overlapping<'_>, + cpu: cpu::Features, +) -> Result { + let (counter, poly1305_key) = begin(chacha20_key, nonce, aad, in_out.input(), cpu)?; + let mut auth = poly1305::Context::from_key(poly1305_key, cpu); + + poly1305_update_padded_16(&mut auth, aad.as_ref()); + poly1305_update_padded_16(&mut auth, in_out.input()); + let in_out_len = in_out.len(); + chacha20_key.encrypt(counter, in_out, cpu); + Ok(finish(auth, aad.as_ref().len(), in_out_len)) +} + +fn check_input_lengths(aad: Aad<&[u8]>, input: &[u8]) -> Result<(), InputTooLongError> { + if input.len() > MAX_IN_OUT_LEN { + return Err(InputTooLongError::new(input.len())); + } + + // RFC 8439 Section 2.8 says the maximum AAD length is 2**64 - 1, which is + // never larger than usize::MAX, so we don't need an explicit length + // check. + const _USIZE_BOUNDED_BY_U64: u64 = u64_from_usize(usize::MAX); + let _ = aad; + + Ok(()) +} + +// Also used by chacha20_poly1305_openssh. +pub(super) fn begin( + key: &chacha::Key, + nonce: Nonce, + aad: Aad<&[u8]>, + input: &[u8], + cpu: cpu::Features, +) -> Result<(Counter, poly1305::Key), InputTooLongError> { + check_input_lengths(aad, input)?; + + let mut key_bytes = [0u8; poly1305::KEY_LEN]; + let counter = key.encrypt_single_block_with_ctr_0(nonce, &mut key_bytes, cpu); + let poly1305_key = poly1305::Key::new(key_bytes); + Ok((counter, poly1305_key)) +} + +fn finish(auth: poly1305::Context, aad_len: usize, in_out_len: usize) -> Tag { + let mut block = [0u8; poly1305::BLOCK_LEN]; + let (alen, clen) = block.split_at_mut(poly1305::BLOCK_LEN / 2); + alen.copy_from_slice(&u64::to_le_bytes(u64_from_usize(aad_len))); + clen.copy_from_slice(&u64::to_le_bytes(u64_from_usize(in_out_len))); + auth.finish(&block) +} + +#[inline] +fn poly1305_update_padded_16(ctx: &mut poly1305::Context, input: &[u8]) { + let (whole, remainder) = slice::as_chunks(input); + ctx.update(whole); + if !remainder.is_empty() { + let mut block = [0u8; poly1305::BLOCK_LEN]; + sliceutil::overwrite_at_start(&mut block, remainder); + ctx.update_block(block); + } +} diff --git a/ring-0.17.14/src/aead/chacha20_poly1305_openssh.rs b/ring-0.17.14/src/aead/chacha20_poly1305_openssh.rs new file mode 100644 index 0000000000..cf3950ba6b --- /dev/null +++ b/ring-0.17.14/src/aead/chacha20_poly1305_openssh.rs @@ -0,0 +1,212 @@ +// Copyright 2016 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +//! The [chacha20-poly1305@openssh.com] AEAD-ish construct. +//! +//! This should only be used by SSH implementations. It has a similar, but +//! different API from `ring::aead` because the construct cannot use the same +//! 
API as `ring::aead` due to the way the construct handles the encrypted +//! packet length. +//! +//! The concatenation of a and b is denoted `a||b`. `K_1` and `K_2` are defined +//! in the [chacha20-poly1305@openssh.com] specification. `packet_length`, +//! `padding_length`, `payload`, and `random padding` are defined in +//! [RFC 4253]. The term `plaintext` is used as a shorthand for +//! `padding_length||payload||random padding`. +//! +//! [chacha20-poly1305@openssh.com]: +//! http://cvsweb.openbsd.org/cgi-bin/cvsweb/src/usr.bin/ssh/PROTOCOL.chacha20poly1305?annotate=HEAD +//! [RFC 4253]: https://tools.ietf.org/html/rfc4253 + +use super::{ + chacha::{self, *}, + chacha20_poly1305, cpu, poly1305, Aad, Nonce, Tag, +}; +use crate::{ + bb, + error::{self, InputTooLongError}, + polyfill::slice, +}; + +/// A key for sealing packets. +pub struct SealingKey { + key: Key, +} + +impl SealingKey { + /// Constructs a new `SealingKey`. + pub fn new(key_material: &[u8; KEY_LEN]) -> Self { + Self { + key: Key::new(key_material), + } + } + + /// Seals (encrypts and signs) a packet. + /// + /// On input, `plaintext_in_ciphertext_out` must contain the unencrypted + /// `packet_length||plaintext` where `plaintext` is the + /// `padding_length||payload||random padding`. It will be overwritten by + /// `encrypted_packet_length||ciphertext`, where `encrypted_packet_length` + /// is encrypted with `K_1` and `ciphertext` is encrypted by `K_2`. + /// + /// # Panics + /// + /// Panics if `plaintext_in_ciphertext_out.len() < PACKET_LENGTH_LEN`. + /// + /// Panics if `plaintext_in_ciphertext_out` is longer than the maximum + /// input size for ChaCha20-Poly1305. Note that this limit is much, + /// much larger than SSH's 256KB maximum record size. + pub fn seal_in_place( + &self, + sequence_number: u32, + plaintext_in_ciphertext_out: &mut [u8], + tag_out: &mut [u8; TAG_LEN], + ) { + // XXX/TODO(SemVer): Refactor API to return an error. + let (len_in_out, data_and_padding_in_out): (&mut [u8; PACKET_LENGTH_LEN], _) = + slice::split_first_chunk_mut(plaintext_in_ciphertext_out).unwrap(); + + let cpu = cpu::features(); + // XXX/TODO(SemVer): Refactor API to return an error. + let (counter, poly_key) = chacha20_poly1305::begin( + &self.key.k_2, + make_nonce(sequence_number), + Aad::from(len_in_out), + data_and_padding_in_out, + cpu, + ) + .map_err(error::erase::) + .unwrap(); + + let _: Counter = self.key.k_1.encrypt_single_block_with_ctr_0( + make_nonce(sequence_number), + len_in_out, + cpu, + ); + self.key + .k_2 + .encrypt(counter, data_and_padding_in_out.into(), cpu); + + let Tag(tag) = poly1305::sign(poly_key, plaintext_in_ciphertext_out, cpu); + *tag_out = tag; + } +} + +/// A key for opening packets. +pub struct OpeningKey { + key: Key, +} + +impl OpeningKey { + /// Constructs a new `OpeningKey`. + pub fn new(key_material: &[u8; KEY_LEN]) -> Self { + Self { + key: Key::new(key_material), + } + } + + /// Returns the decrypted, but unauthenticated, packet length. + /// + /// Importantly, the result won't be authenticated until `open_in_place` is + /// called. + pub fn decrypt_packet_length( + &self, + sequence_number: u32, + encrypted_packet_length: [u8; PACKET_LENGTH_LEN], + ) -> [u8; PACKET_LENGTH_LEN] { + let cpu = cpu::features(); + let mut packet_length = encrypted_packet_length; + let _: Counter = self.key.k_1.encrypt_single_block_with_ctr_0( + make_nonce(sequence_number), + &mut packet_length, + cpu, + ); + packet_length + } + + /// Opens (authenticates and decrypts) a packet. 
+ /// + /// `ciphertext_in_plaintext_out` must be of the form + /// `encrypted_packet_length||ciphertext` where `ciphertext` is the + /// encrypted `plaintext`. When the function succeeds the ciphertext is + /// replaced by the plaintext and the result is `Ok(plaintext)`, where + /// `plaintext` is `&ciphertext_in_plaintext_out[PACKET_LENGTH_LEN..]`; + /// otherwise the contents of `ciphertext_in_plaintext_out` are unspecified + /// and must not be used. + pub fn open_in_place<'a>( + &self, + sequence_number: u32, + ciphertext_in_plaintext_out: &'a mut [u8], + tag: &[u8; TAG_LEN], + ) -> Result<&'a [u8], error::Unspecified> { + let (packet_length, after_packet_length): (&mut [u8; PACKET_LENGTH_LEN], _) = + slice::split_first_chunk_mut(ciphertext_in_plaintext_out).ok_or(error::Unspecified)?; + + let cpu = cpu::features(); + let (counter, poly_key) = chacha20_poly1305::begin( + &self.key.k_2, + make_nonce(sequence_number), + Aad::from(packet_length), + after_packet_length, + cpu, + ) + .map_err(error::erase::)?; + + // We must verify the tag before decrypting so that + // `ciphertext_in_plaintext_out` is unmodified if verification fails. + // This is beyond what we guarantee. + let calculated_tag = poly1305::sign(poly_key, ciphertext_in_plaintext_out, cpu); + bb::verify_slices_are_equal(calculated_tag.as_ref(), tag)?; + + // Won't panic because the length was checked above. + let after_packet_length = &mut ciphertext_in_plaintext_out[PACKET_LENGTH_LEN..]; + + self.key + .k_2 + .encrypt(counter, after_packet_length.into(), cpu); + + Ok(after_packet_length) + } +} + +struct Key { + k_1: chacha::Key, + k_2: chacha::Key, +} + +impl Key { + fn new(key_material: &[u8; KEY_LEN]) -> Self { + // The first half becomes K_2 and the second half becomes K_1. + let (k_2, k_1) = key_material.split_at(chacha::KEY_LEN); + Self { + k_1: chacha::Key::new(k_1.try_into().unwrap()), + k_2: chacha::Key::new(k_2.try_into().unwrap()), + } + } +} + +fn make_nonce(sequence_number: u32) -> Nonce { + let [s0, s1, s2, s3] = sequence_number.to_be_bytes(); + let nonce = [0, 0, 0, 0, 0, 0, 0, 0, s0, s1, s2, s3]; + Nonce::assume_unique_for_key(nonce) +} + +/// The length of key. +pub const KEY_LEN: usize = chacha::KEY_LEN * 2; + +/// The length in bytes of the `packet_length` field in a SSH packet. +pub const PACKET_LENGTH_LEN: usize = 4; // 32 bits + +/// The length in bytes of an authentication tag. +pub const TAG_LEN: usize = super::TAG_LEN; diff --git a/ring-0.17.14/src/aead/gcm.rs b/ring-0.17.14/src/aead/gcm.rs new file mode 100644 index 0000000000..443c19e16b --- /dev/null +++ b/ring-0.17.14/src/aead/gcm.rs @@ -0,0 +1,163 @@ +// Copyright 2018-2024 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
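// Illustration only (not part of the vendored ring sources): a minimal round
// trip through the `chacha20_poly1305_openssh` module shown above. The key
// material, sequence number, and packet bytes are made-up placeholder values;
// a real SSH implementation derives them from the key exchange and the
// connection state.
fn openssh_aead_round_trip() -> Result<(), ring::error::Unspecified> {
    use ring::aead::chacha20_poly1305_openssh::{
        OpeningKey, SealingKey, KEY_LEN, PACKET_LENGTH_LEN, TAG_LEN,
    };

    let key_material = [0x42u8; KEY_LEN]; // placeholder key material
    let sequence_number = 7u32;

    // packet_length || padding_length || payload || random padding
    let body = *b"\x04example payload\x00\x00\x00\x00";
    let mut packet = Vec::with_capacity(PACKET_LENGTH_LEN + body.len());
    packet.extend_from_slice(&(body.len() as u32).to_be_bytes());
    packet.extend_from_slice(&body);

    // Seal: the 4-byte length field is encrypted with K_1, the rest with K_2,
    // and the Poly1305 tag covers the whole encrypted packet.
    let mut tag = [0u8; TAG_LEN];
    SealingKey::new(&key_material).seal_in_place(sequence_number, &mut packet, &mut tag);

    // Open: decrypt the (still unauthenticated) length field first, then
    // authenticate and decrypt the rest of the packet.
    let opening_key = OpeningKey::new(&key_material);
    let len_bytes: [u8; PACKET_LENGTH_LEN] = packet[..PACKET_LENGTH_LEN].try_into().unwrap();
    let decrypted_len = opening_key.decrypt_packet_length(sequence_number, len_bytes);
    assert_eq!(u32::from_be_bytes(decrypted_len), body.len() as u32);
    let plaintext = opening_key.open_in_place(sequence_number, &mut packet, &tag)?;
    assert_eq!(plaintext, &body);
    Ok(())
}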
+ +use self::ffi::{Block, BLOCK_LEN, ZERO_BLOCK}; +use super::{aes_gcm, Aad}; +use crate::{ + bits::{BitLength, FromByteLen as _}, + error::{self, InputTooLongError}, + polyfill::{slice::AsChunks, sliceutil::overwrite_at_start, NotSend}, +}; +use cfg_if::cfg_if; + +pub(super) use ffi::KeyValue; + +cfg_if! { + if #[cfg(any(all(target_arch = "aarch64", target_endian = "little"), target_arch = "x86_64"))] { + pub(super) use self::ffi::{HTable, Xi}; + } else { + use self::ffi::{HTable, Xi}; + } +} + +#[macro_use] +mod ffi; + +pub(super) mod clmul; +pub(super) mod clmulavxmovbe; +pub(super) mod fallback; +pub(super) mod neon; +pub(super) mod vclmulavx2; + +pub(super) struct Context<'key, K> { + Xi: Xi, + key: &'key K, + aad_len: BitLength, + in_out_len: BitLength, + _not_send: NotSend, +} + +impl<'key, K: UpdateBlock> Context<'key, K> { + #[inline(always)] + pub(crate) fn new( + key: &'key K, + aad: Aad<&[u8]>, + in_out_len: usize, + ) -> Result { + if in_out_len > aes_gcm::MAX_IN_OUT_LEN { + return Err(error::Unspecified); + } + let in_out_len = + BitLength::from_byte_len(in_out_len).map_err(error::erase::)?; + let aad_len = BitLength::from_byte_len(aad.as_ref().len()) + .map_err(error::erase::)?; + + // NIST SP800-38D Section 5.2.1.1 says that the maximum AAD length is + // 2**64 - 1 bits, i.e. BitLength::MAX, so we don't need to do an + // explicit check here. + + let mut ctx = Self { + Xi: Xi(ZERO_BLOCK), + key, + aad_len, + in_out_len, + _not_send: NotSend::VALUE, + }; + + for ad in aad.0.chunks(BLOCK_LEN) { + let mut block = ZERO_BLOCK; + overwrite_at_start(&mut block, ad); + ctx.update_block(block); + } + + Ok(ctx) + } +} + +#[cfg(all( + target_arch = "aarch64", + target_endian = "little", + target_pointer_width = "64" +))] +impl Context<'_, K> { + pub(super) fn in_out_whole_block_bits(&self) -> BitLength { + use crate::polyfill::usize_from_u64; + const WHOLE_BLOCK_BITS_MASK: usize = !0b111_1111; + #[allow(clippy::assertions_on_constants)] + const _WHOLE_BLOCK_BITS_MASK_CORRECT: () = + assert!(WHOLE_BLOCK_BITS_MASK == !((BLOCK_LEN * 8) - 1)); + BitLength::from_bits(usize_from_u64(self.in_out_len.as_bits()) & WHOLE_BLOCK_BITS_MASK) + } +} + +#[cfg(all(target_arch = "aarch64", target_endian = "little"))] +/// Access to `inner` for the integrated AES-GCM implementations only. +impl Context<'_, clmul::Key> { + #[inline] + pub(super) fn inner(&mut self) -> (&HTable, &mut Xi) { + (&self.key.inner(), &mut self.Xi) + } +} + +#[cfg(target_arch = "x86_64")] +impl Context<'_, clmulavxmovbe::Key> { + /// Access to `inner` for the integrated AES-GCM implementations only. + #[inline] + pub(super) fn inner(&mut self) -> (&HTable, &mut Xi) { + (self.key.inner(), &mut self.Xi) + } +} + +#[cfg(target_arch = "x86_64")] +impl Context<'_, vclmulavx2::Key> { + /// Access to `inner` for the integrated AES-GCM implementations only. 
+ #[inline] + pub(super) fn inner(&mut self) -> (&HTable, &mut Xi) { + (self.key.inner(), &mut self.Xi) + } +} + +impl Context<'_, K> { + #[inline(always)] + pub fn update_blocks(&mut self, input: AsChunks) { + self.key.update_blocks(&mut self.Xi, input); + } +} + +impl Context<'_, K> { + pub fn update_block(&mut self, a: Block) { + self.key.update_block(&mut self.Xi, a); + } + + #[inline(always)] + pub(super) fn pre_finish(mut self, f: F) -> super::Tag + where + F: FnOnce(Block) -> super::Tag, + { + let mut block = [0u8; BLOCK_LEN]; + let (alen, clen) = block.split_at_mut(BLOCK_LEN / 2); + alen.copy_from_slice(&BitLength::::to_be_bytes(self.aad_len)); + clen.copy_from_slice(&BitLength::::to_be_bytes(self.in_out_len)); + self.update_block(block); + f(self.Xi.0) + } +} + +pub(super) trait UpdateBlock { + fn update_block(&self, xi: &mut Xi, a: Block); +} + +pub(super) trait UpdateBlocks { + fn update_blocks(&self, xi: &mut Xi, input: AsChunks); +} diff --git a/ring-0.17.14/src/aead/gcm/clmul.rs b/ring-0.17.14/src/aead/gcm/clmul.rs new file mode 100644 index 0000000000..8cd55a4eeb --- /dev/null +++ b/ring-0.17.14/src/aead/gcm/clmul.rs @@ -0,0 +1,73 @@ +// Copyright 2018-2024 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +#![cfg(any( + all(target_arch = "aarch64", target_endian = "little"), + target_arch = "x86", + target_arch = "x86_64" +))] + +use super::{ffi::KeyValue, HTable, UpdateBlock, Xi}; +use crate::aead::gcm::ffi::BLOCK_LEN; +use crate::cpu; +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +use {super::UpdateBlocks, crate::polyfill::slice::AsChunks}; + +#[cfg(all(target_arch = "aarch64", target_endian = "little"))] +pub(in super::super) type RequiredCpuFeatures = cpu::arm::PMull; + +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +pub(in super::super) type RequiredCpuFeatures = (cpu::intel::ClMul, cpu::intel::Ssse3); + +#[derive(Clone)] +pub struct Key { + h_table: HTable, +} + +impl Key { + #[cfg_attr(target_arch = "x86_64", inline(never))] + pub(in super::super) fn new(value: KeyValue, _cpu: RequiredCpuFeatures) -> Self { + Self { + h_table: unsafe { htable_new!(gcm_init_clmul, value) }, + } + } + + #[cfg(target_arch = "aarch64")] + pub(super) fn inner(&self) -> &HTable { + &self.h_table + } +} + +impl UpdateBlock for Key { + #[cfg(target_arch = "aarch64")] + fn update_block(&self, xi: &mut Xi, a: [u8; BLOCK_LEN]) { + prefixed_extern! 
{ + fn gcm_gmult_clmul(xi: &mut Xi, Htable: &HTable); + } + xi.bitxor_assign(a); + unsafe { self.h_table.gmult(gcm_gmult_clmul, xi) }; + } + + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + fn update_block(&self, xi: &mut Xi, a: [u8; BLOCK_LEN]) { + self.update_blocks(xi, (&a).into()) + } +} + +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +impl UpdateBlocks for Key { + fn update_blocks(&self, xi: &mut Xi, input: AsChunks) { + unsafe { ghash!(gcm_ghash_clmul, xi, &self.h_table, input) } + } +} diff --git a/ring-0.17.14/src/aead/gcm/clmulavxmovbe.rs b/ring-0.17.14/src/aead/gcm/clmulavxmovbe.rs new file mode 100644 index 0000000000..c53e2410cc --- /dev/null +++ b/ring-0.17.14/src/aead/gcm/clmulavxmovbe.rs @@ -0,0 +1,51 @@ +// Copyright 2018-2024 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +#![cfg(target_arch = "x86_64")] + +use super::{HTable, KeyValue, UpdateBlock, UpdateBlocks, Xi, BLOCK_LEN}; +use crate::{cpu::intel, polyfill::slice::AsChunks}; + +#[derive(Clone)] +pub struct Key { + h_table: HTable, +} + +impl Key { + #[inline(never)] + pub(in super::super) fn new( + value: KeyValue, + _required_cpu_features: (intel::ClMul, intel::Avx, intel::Movbe), + ) -> Self { + Self { + h_table: unsafe { htable_new!(gcm_init_avx, value) }, + } + } + + pub(super) fn inner(&self) -> &HTable { + &self.h_table + } +} + +impl UpdateBlock for Key { + fn update_block(&self, xi: &mut Xi, a: [u8; BLOCK_LEN]) { + self.update_blocks(xi, (&a).into()) + } +} + +impl UpdateBlocks for Key { + fn update_blocks(&self, xi: &mut Xi, input: AsChunks) { + unsafe { ghash!(gcm_ghash_avx, xi, self.inner(), input) } + } +} diff --git a/ring-0.17.14/src/aead/gcm/fallback.rs b/ring-0.17.14/src/aead/gcm/fallback.rs new file mode 100644 index 0000000000..4d5403bde8 --- /dev/null +++ b/ring-0.17.14/src/aead/gcm/fallback.rs @@ -0,0 +1,271 @@ +// Copyright (c) 2019, Google Inc. +// Portions Copyright 2020-2024 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +// This file is based on BoringSSL's gcm_nohw.c. 
+ +// This file contains a implementation of GHASH based on the notes +// in https://bearssl.org/constanttime.html#ghash-for-gcm and the reduction +// algorithm described in +// https://crypto.stanford.edu/RealWorldCrypto/slides/gueron.pdf. +// +// Unlike the BearSSL notes, we use u128 in the 64-bit implementation. + +use super::{ffi::U128, KeyValue, UpdateBlock, UpdateBlocks, Xi, BLOCK_LEN}; +use crate::polyfill::{slice::AsChunks, ArraySplitMap as _}; + +#[derive(Clone)] +pub struct Key { + h: U128, +} + +impl Key { + pub(in super::super) fn new(value: KeyValue) -> Self { + Self { h: init(value) } + } +} + +impl UpdateBlock for Key { + fn update_block(&self, xi: &mut Xi, a: [u8; BLOCK_LEN]) { + xi.bitxor_assign(a); + gmult(xi, self.h); + } +} + +impl UpdateBlocks for Key { + fn update_blocks(&self, xi: &mut Xi, input: AsChunks) { + ghash(xi, self.h, input); + } +} + +#[cfg(target_pointer_width = "64")] +fn gcm_mul64_nohw(a: u64, b: u64) -> (u64, u64) { + #[allow(clippy::cast_possible_truncation)] + #[inline(always)] + fn lo(a: u128) -> u64 { + a as u64 + } + + #[inline(always)] + fn hi(a: u128) -> u64 { + lo(a >> 64) + } + + #[inline(always)] + fn mul(a: u64, b: u64) -> u128 { + u128::from(a) * u128::from(b) + } + + // One term every four bits means the largest term is 64/4 = 16, which barely + // overflows into the next term. Using one term every five bits would cost 25 + // multiplications instead of 16. It is faster to mask off the bottom four + // bits of |a|, giving a largest term of 60/4 = 15, and apply the bottom bits + // separately. + let a0 = a & 0x1111111111111110; + let a1 = a & 0x2222222222222220; + let a2 = a & 0x4444444444444440; + let a3 = a & 0x8888888888888880; + + let b0 = b & 0x1111111111111111; + let b1 = b & 0x2222222222222222; + let b2 = b & 0x4444444444444444; + let b3 = b & 0x8888888888888888; + + let c0 = mul(a0, b0) ^ mul(a1, b3) ^ mul(a2, b2) ^ mul(a3, b1); + let c1 = mul(a0, b1) ^ mul(a1, b0) ^ mul(a2, b3) ^ mul(a3, b2); + let c2 = mul(a0, b2) ^ mul(a1, b1) ^ mul(a2, b0) ^ mul(a3, b3); + let c3 = mul(a0, b3) ^ mul(a1, b2) ^ mul(a2, b1) ^ mul(a3, b0); + + // Multiply the bottom four bits of |a| with |b|. + let a0_mask = 0u64.wrapping_sub(a & 1); + let a1_mask = 0u64.wrapping_sub((a >> 1) & 1); + let a2_mask = 0u64.wrapping_sub((a >> 2) & 1); + let a3_mask = 0u64.wrapping_sub((a >> 3) & 1); + let extra = u128::from(a0_mask & b) + ^ (u128::from(a1_mask & b) << 1) + ^ (u128::from(a2_mask & b) << 2) + ^ (u128::from(a3_mask & b) << 3); + + let lo = (lo(c0) & 0x1111111111111111) + ^ (lo(c1) & 0x2222222222222222) + ^ (lo(c2) & 0x4444444444444444) + ^ (lo(c3) & 0x8888888888888888) + ^ lo(extra); + let hi = (hi(c0) & 0x1111111111111111) + ^ (hi(c1) & 0x2222222222222222) + ^ (hi(c2) & 0x4444444444444444) + ^ (hi(c3) & 0x8888888888888888) + ^ hi(extra); + (lo, hi) +} + +#[cfg(not(target_pointer_width = "64"))] +fn gcm_mul32_nohw(a: u32, b: u32) -> u64 { + #[inline(always)] + fn mul(a: u32, b: u32) -> u64 { + u64::from(a) * u64::from(b) + } + + // One term every four bits means the largest term is 32/4 = 8, which does not + // overflow into the next term. 
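    // Illustration (not part of the vendored file): the same masking trick at
    // byte width. With the operand bits spaced four apart, each column of the
    // ordinary integer product collects at most two coincident partial
    // products here (at most eight in the `u32` code below, fifteen in the
    // masked `u64` code above), so no carry ever spills into the next column
    // and the low bit of each column is exactly the GF(2) sum being computed.
    fn clmul8(a: u8, b: u8) -> u16 {
        let mul8 = |x: u8, y: u8| u16::from(x) * u16::from(y);
        let (a0, a1, a2, a3) = (a & 0x11, a & 0x22, a & 0x44, a & 0x88);
        let (b0, b1, b2, b3) = (b & 0x11, b & 0x22, b & 0x44, b & 0x88);
        let c0 = mul8(a0, b0) ^ mul8(a1, b3) ^ mul8(a2, b2) ^ mul8(a3, b1);
        let c1 = mul8(a0, b1) ^ mul8(a1, b0) ^ mul8(a2, b3) ^ mul8(a3, b2);
        let c2 = mul8(a0, b2) ^ mul8(a1, b1) ^ mul8(a2, b0) ^ mul8(a3, b3);
        let c3 = mul8(a0, b3) ^ mul8(a1, b2) ^ mul8(a2, b1) ^ mul8(a3, b0);
        (c0 & 0x1111) | (c1 & 0x2222) | (c2 & 0x4444) | (c3 & 0x8888)
    }
    // For example, clmul8(0b11, 0b11) == 0b101: (x + 1)^2 = x^2 + 1 over GF(2).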
+ let a0 = a & 0x11111111; + let a1 = a & 0x22222222; + let a2 = a & 0x44444444; + let a3 = a & 0x88888888; + + let b0 = b & 0x11111111; + let b1 = b & 0x22222222; + let b2 = b & 0x44444444; + let b3 = b & 0x88888888; + + let c0 = mul(a0, b0) ^ mul(a1, b3) ^ mul(a2, b2) ^ mul(a3, b1); + let c1 = mul(a0, b1) ^ mul(a1, b0) ^ mul(a2, b3) ^ mul(a3, b2); + let c2 = mul(a0, b2) ^ mul(a1, b1) ^ mul(a2, b0) ^ mul(a3, b3); + let c3 = mul(a0, b3) ^ mul(a1, b2) ^ mul(a2, b1) ^ mul(a3, b0); + + (c0 & 0x1111111111111111) + | (c1 & 0x2222222222222222) + | (c2 & 0x4444444444444444) + | (c3 & 0x8888888888888888) +} + +#[cfg(not(target_pointer_width = "64"))] +fn gcm_mul64_nohw(a: u64, b: u64) -> (u64, u64) { + #[inline(always)] + fn lo(a: u64) -> u32 { + a as u32 + } + #[inline(always)] + fn hi(a: u64) -> u32 { + lo(a >> 32) + } + + let a0 = lo(a); + let a1 = hi(a); + let b0 = lo(b); + let b1 = hi(b); + // Karatsuba multiplication. + let lo = gcm_mul32_nohw(a0, b0); + let hi = gcm_mul32_nohw(a1, b1); + let mid = gcm_mul32_nohw(a0 ^ a1, b0 ^ b1) ^ lo ^ hi; + (lo ^ (mid << 32), hi ^ (mid >> 32)) +} + +fn init(value: KeyValue) -> U128 { + let xi = value.into_inner(); + + // We implement GHASH in terms of POLYVAL, as described in RFC 8452. This + // avoids a shift by 1 in the multiplication, needed to account for bit + // reversal losing a bit after multiplication, that is, + // rev128(X) * rev128(Y) = rev255(X*Y). + // + // Per Appendix A, we run mulX_POLYVAL. Note this is the same transformation + // applied by |gcm_init_clmul|, etc. Note |Xi| has already been byteswapped. + // + // See also slide 16 of + // https://crypto.stanford.edu/RealWorldCrypto/slides/gueron.pdf + let mut lo = xi[1]; + let mut hi = xi[0]; + + let mut carry = hi >> 63; + carry = 0u64.wrapping_sub(carry); + + hi <<= 1; + hi |= lo >> 63; + lo <<= 1; + + // The irreducible polynomial is 1 + x^121 + x^126 + x^127 + x^128, so we + // conditionally add 0xc200...0001. + lo ^= carry & 1; + hi ^= carry & 0xc200000000000000; + + // This implementation does not use the rest of |Htable|. + U128 { hi, lo } +} + +fn gcm_polyval_nohw(xi: &mut [u64; 2], h: U128) { + // Karatsuba multiplication. The product of |Xi| and |H| is stored in |r0| + // through |r3|. Note there is no byte or bit reversal because we are + // evaluating POLYVAL. + let (r0, mut r1) = gcm_mul64_nohw(xi[0], h.lo); + let (mut r2, mut r3) = gcm_mul64_nohw(xi[1], h.hi); + let (mut mid0, mut mid1) = gcm_mul64_nohw(xi[0] ^ xi[1], h.hi ^ h.lo); + mid0 ^= r0 ^ r2; + mid1 ^= r1 ^ r3; + r2 ^= mid1; + r1 ^= mid0; + + // Now we multiply our 256-bit result by x^-128 and reduce. |r2| and + // |r3| shifts into position and we must multiply |r0| and |r1| by x^-128. We + // have: + // + // 1 = x^121 + x^126 + x^127 + x^128 + // x^-128 = x^-7 + x^-2 + x^-1 + 1 + // + // This is the GHASH reduction step, but with bits flowing in reverse. + + // The x^-7, x^-2, and x^-1 terms shift bits past x^0, which would require + // another reduction steps. Instead, we gather the excess bits, incorporate + // them into |r0| and |r1| and reduce once. See slides 17-19 + // of https://crypto.stanford.edu/RealWorldCrypto/slides/gueron.pdf. 
+ r1 ^= (r0 << 63) ^ (r0 << 62) ^ (r0 << 57); + + // 1 + r2 ^= r0; + r3 ^= r1; + + // x^-1 + r2 ^= r0 >> 1; + r2 ^= r1 << 63; + r3 ^= r1 >> 1; + + // x^-2 + r2 ^= r0 >> 2; + r2 ^= r1 << 62; + r3 ^= r1 >> 2; + + // x^-7 + r2 ^= r0 >> 7; + r2 ^= r1 << 57; + r3 ^= r1 >> 7; + + *xi = [r2, r3]; +} + +fn gmult(xi: &mut Xi, h: U128) { + with_swapped_xi(xi, |swapped| { + gcm_polyval_nohw(swapped, h); + }) +} + +fn ghash(xi: &mut Xi, h: U128, input: AsChunks) { + with_swapped_xi(xi, |swapped| { + input.into_iter().for_each(|&input| { + let input = input.array_split_map(u64::from_be_bytes); + swapped[0] ^= input[1]; + swapped[1] ^= input[0]; + gcm_polyval_nohw(swapped, h); + }); + }); +} + +#[inline] +fn with_swapped_xi(Xi(xi): &mut Xi, f: impl FnOnce(&mut [u64; 2])) { + let unswapped: [u64; 2] = xi.array_split_map(u64::from_be_bytes); + let mut swapped: [u64; 2] = [unswapped[1], unswapped[0]]; + f(&mut swapped); + let (xi_0, xi_1) = xi.split_at_mut(BLOCK_LEN / 2); + xi_0.copy_from_slice(&u64::to_be_bytes(swapped[1])); + xi_1.copy_from_slice(&u64::to_be_bytes(swapped[0])); +} diff --git a/ring-0.17.14/src/aead/gcm/ffi.rs b/ring-0.17.14/src/aead/gcm/ffi.rs new file mode 100644 index 0000000000..c655a0dd39 --- /dev/null +++ b/ring-0.17.14/src/aead/gcm/ffi.rs @@ -0,0 +1,165 @@ +// Copyright 2018 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +use crate::{ + bb, + polyfill::{slice::AsChunks, ArraySplitMap}, +}; + +pub(in super::super) const BLOCK_LEN: usize = 16; +pub(in super::super) type Block = [u8; BLOCK_LEN]; +pub(super) const ZERO_BLOCK: Block = [0u8; BLOCK_LEN]; + +#[cfg(any( + all(target_arch = "aarch64", target_endian = "little"), + all(target_arch = "arm", target_endian = "little"), + target_arch = "x86", + target_arch = "x86_64" +))] +macro_rules! htable_new { + ( $name:ident, $value:expr $(,)? ) => {{ + use crate::aead::gcm::ffi::HTable; + prefixed_extern! { + fn $name(HTable: &mut HTable, h: &[u64; 2]); + } + HTable::new($name, $value) + }}; +} + +/// SAFETY: +/// * The function `$name` must meet the contract of the `f` paramweter of +/// `ghash()`. +#[cfg(any( + all(target_arch = "aarch64", target_endian = "little"), + all(target_arch = "arm", target_endian = "little"), + target_arch = "x86", + target_arch = "x86_64" +))] +macro_rules! ghash { + ( $name:ident, $xi:expr, $h_table:expr, $input:expr $(,)? ) => {{ + use crate::aead::gcm::ffi::{HTable, Xi}; + prefixed_extern! 
{ + fn $name( + xi: &mut Xi, + Htable: &HTable, + inp: *const u8, + len: crate::c::NonZero_size_t, + ); + } + $h_table.ghash($name, $xi, $input) + }}; +} + +pub(in super::super) struct KeyValue([u64; 2]); + +impl KeyValue { + pub(in super::super) fn new(value: Block) -> Self { + Self(value.array_split_map(u64::from_be_bytes)) + } + + pub(super) fn into_inner(self) -> [u64; 2] { + self.0 + } +} + +/// SAFETY: +/// * `f` must read `len` bytes from `inp`; it may assume +/// that `len` is a (non-zero) multiple of `BLOCK_LEN`. +/// * `f` may inspect CPU features. +#[cfg(any( + all(target_arch = "aarch64", target_endian = "little"), + all(target_arch = "arm", target_endian = "little"), + target_arch = "x86", + target_arch = "x86_64" +))] +impl HTable { + pub(super) unsafe fn new( + init: unsafe extern "C" fn(HTable: &mut HTable, &[u64; 2]), + value: KeyValue, + ) -> Self { + let mut r = Self { + Htable: [U128 { hi: 0, lo: 0 }; HTABLE_LEN], + }; + unsafe { init(&mut r, &value.0) }; + r + } + + #[cfg(any( + all(target_arch = "aarch64", target_endian = "little"), + all(target_arch = "arm", target_endian = "little") + ))] + pub(super) unsafe fn gmult( + &self, + f: unsafe extern "C" fn(xi: &mut Xi, h_table: &HTable), + xi: &mut Xi, + ) { + unsafe { f(xi, self) } + } + + pub(super) unsafe fn ghash( + &self, + f: unsafe extern "C" fn( + xi: &mut Xi, + Htable: &HTable, + inp: *const u8, + len: crate::c::NonZero_size_t, + ), + xi: &mut Xi, + input: AsChunks, + ) { + use core::num::NonZeroUsize; + + let input = input.as_flattened(); + + let input_len = match NonZeroUsize::new(input.len()) { + Some(len) => len, + None => { + return; + } + }; + + // SAFETY: + // * There are `input_len: NonZeroUsize` bytes available at `input` for + // `f` to read. + unsafe { + f(xi, self, input.as_ptr(), input_len); + } + } +} + +// The alignment is required by some assembly code, such as `ghash-ssse3-*`. +#[derive(Clone)] +#[repr(C, align(16))] +pub(in super::super) struct HTable { + Htable: [U128; HTABLE_LEN], +} + +#[derive(Clone, Copy)] +#[repr(C)] +pub(super) struct U128 { + pub(super) hi: u64, + pub(super) lo: u64, +} + +const HTABLE_LEN: usize = 16; + +#[repr(transparent)] +pub(in super::super) struct Xi(pub(super) Block); + +impl Xi { + #[inline] + pub(super) fn bitxor_assign(&mut self, a: Block) { + self.0 = bb::xor_16(self.0, a) + } +} diff --git a/ring-0.17.14/src/aead/gcm/neon.rs b/ring-0.17.14/src/aead/gcm/neon.rs new file mode 100644 index 0000000000..814cd52187 --- /dev/null +++ b/ring-0.17.14/src/aead/gcm/neon.rs @@ -0,0 +1,52 @@ +// Copyright 2018-2024 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+ +#![cfg(any( + all(target_arch = "aarch64", target_endian = "little"), + all(target_arch = "arm", target_endian = "little") +))] + +use super::{HTable, KeyValue, UpdateBlock, UpdateBlocks, Xi, BLOCK_LEN}; +use crate::{cpu, polyfill::slice::AsChunks}; + +pub(in super::super) type RequiredCpuFeatures = cpu::arm::Neon; + +#[derive(Clone)] +pub struct Key { + h_table: HTable, +} + +impl Key { + pub(in super::super) fn new(value: KeyValue, _cpu: RequiredCpuFeatures) -> Self { + Self { + h_table: unsafe { htable_new!(gcm_init_neon, value) }, + } + } +} + +impl UpdateBlock for Key { + fn update_block(&self, xi: &mut Xi, a: [u8; BLOCK_LEN]) { + prefixed_extern! { + fn gcm_gmult_neon(xi: &mut Xi, Htable: &HTable); + } + xi.bitxor_assign(a); + unsafe { self.h_table.gmult(gcm_gmult_neon, xi) }; + } +} + +impl UpdateBlocks for Key { + fn update_blocks(&self, xi: &mut Xi, input: AsChunks) { + unsafe { ghash!(gcm_ghash_neon, xi, &self.h_table, input) } + } +} diff --git a/ring-0.17.14/src/aead/gcm/vclmulavx2.rs b/ring-0.17.14/src/aead/gcm/vclmulavx2.rs new file mode 100644 index 0000000000..ebf4e76ad4 --- /dev/null +++ b/ring-0.17.14/src/aead/gcm/vclmulavx2.rs @@ -0,0 +1,46 @@ +// Copyright 2018-2025 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +#![cfg(target_arch = "x86_64")] + +use super::{ffi::KeyValue, HTable, UpdateBlock, Xi}; +use crate::{ + aead::gcm::ffi::BLOCK_LEN, + cpu::intel::{Avx2, VAesClmul}, + polyfill::slice::AsChunks, +}; + +#[derive(Clone)] +pub struct Key { + h_table: HTable, +} + +impl Key { + pub(in super::super) fn new(value: KeyValue, _cpu: (Avx2, VAesClmul)) -> Self { + Self { + h_table: unsafe { htable_new!(gcm_init_vpclmulqdq_avx2, value) }, + } + } + + pub(super) fn inner(&self) -> &HTable { + &self.h_table + } +} + +impl UpdateBlock for Key { + fn update_block(&self, xi: &mut Xi, a: [u8; BLOCK_LEN]) { + let input: AsChunks = (&a).into(); + unsafe { ghash!(gcm_ghash_vpclmulqdq_avx2_1, xi, &self.h_table, input) } + } +} diff --git a/ring-0.17.14/src/aead/less_safe_key.rs b/ring-0.17.14/src/aead/less_safe_key.rs new file mode 100644 index 0000000000..10483fc0d3 --- /dev/null +++ b/ring-0.17.14/src/aead/less_safe_key.rs @@ -0,0 +1,181 @@ +// Copyright 2015-2021 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +use super::{Aad, Algorithm, KeyInner, Nonce, Tag, UnboundKey, TAG_LEN}; +use crate::{cpu, error}; +use core::ops::RangeFrom; + +/// Immutable keys for use in situations where `OpeningKey`/`SealingKey` and +/// `NonceSequence` cannot reasonably be used. +/// +/// Prefer to use `OpeningKey`/`SealingKey` and `NonceSequence` when practical. +#[derive(Clone)] +pub struct LessSafeKey { + inner: KeyInner, + algorithm: &'static Algorithm, +} + +impl LessSafeKey { + /// Constructs a `LessSafeKey`. + #[inline] + pub fn new(key: UnboundKey) -> Self { + key.into_inner() + } + + pub(super) fn new_( + algorithm: &'static Algorithm, + key_bytes: &[u8], + cpu_features: cpu::Features, + ) -> Result { + Ok(Self { + inner: algorithm.new_key(key_bytes, cpu_features)?, + algorithm, + }) + } + + /// Like [open_in_place](Self::open_in_place), except the authentication tag is + /// passed separately. + #[inline] + pub fn open_in_place_separate_tag<'in_out, A>( + &self, + nonce: Nonce, + aad: Aad, + tag: Tag, + in_out: &'in_out mut [u8], + ciphertext: RangeFrom, + ) -> Result<&'in_out mut [u8], error::Unspecified> + where + A: AsRef<[u8]>, + { + let aad = Aad::from(aad.as_ref()); + self.algorithm.open_within( + &self.inner, + nonce, + aad, + tag, + in_out, + ciphertext, + cpu::features(), + ) + } + + /// Like [`super::OpeningKey::open_in_place()`], except it accepts an + /// arbitrary nonce. + /// + /// `nonce` must be unique for every use of the key to open data. + #[inline] + pub fn open_in_place<'in_out, A>( + &self, + nonce: Nonce, + aad: Aad, + in_out: &'in_out mut [u8], + ) -> Result<&'in_out mut [u8], error::Unspecified> + where + A: AsRef<[u8]>, + { + self.open_within(nonce, aad, in_out, 0..) + } + + /// Like [`super::OpeningKey::open_within()`], except it accepts an + /// arbitrary nonce. + /// + /// `nonce` must be unique for every use of the key to open data. + #[inline] + pub fn open_within<'in_out, A>( + &self, + nonce: Nonce, + aad: Aad, + in_out: &'in_out mut [u8], + ciphertext_and_tag: RangeFrom, + ) -> Result<&'in_out mut [u8], error::Unspecified> + where + A: AsRef<[u8]>, + { + let tag_offset = in_out + .len() + .checked_sub(TAG_LEN) + .ok_or(error::Unspecified)?; + + // Split the tag off the end of `in_out`. + let (in_out, received_tag) = in_out.split_at_mut(tag_offset); + let received_tag = (*received_tag).try_into()?; + let ciphertext = ciphertext_and_tag; + + self.open_in_place_separate_tag(nonce, aad, received_tag, in_out, ciphertext) + } + + /// Like [`super::SealingKey::seal_in_place_append_tag()`], except it + /// accepts an arbitrary nonce. + /// + /// `nonce` must be unique for every use of the key to seal data. + #[inline] + pub fn seal_in_place_append_tag( + &self, + nonce: Nonce, + aad: Aad, + in_out: &mut InOut, + ) -> Result<(), error::Unspecified> + where + A: AsRef<[u8]>, + InOut: AsMut<[u8]> + for<'in_out> Extend<&'in_out u8>, + { + self.seal_in_place_separate_tag(nonce, aad, in_out.as_mut()) + .map(|tag| in_out.extend(tag.as_ref())) + } + + /// Like `super::SealingKey::seal_in_place_separate_tag()`, except it + /// accepts an arbitrary nonce. + /// + /// `nonce` must be unique for every use of the key to seal data. 
+ #[inline] + pub fn seal_in_place_separate_tag( + &self, + nonce: Nonce, + aad: Aad, + in_out: &mut [u8], + ) -> Result + where + A: AsRef<[u8]>, + { + self.algorithm.seal( + &self.inner, + nonce, + Aad::from(aad.as_ref()), + in_out, + cpu::features(), + ) + } + + /// The key's AEAD algorithm. + #[inline] + pub fn algorithm(&self) -> &'static Algorithm { + self.algorithm + } + + pub(super) fn fmt_debug( + &self, + type_name: &'static str, + f: &mut core::fmt::Formatter, + ) -> Result<(), core::fmt::Error> { + f.debug_struct(type_name) + .field("algorithm", &self.algorithm()) + .finish() + } +} + +impl core::fmt::Debug for LessSafeKey { + fn fmt(&self, f: &mut core::fmt::Formatter) -> Result<(), core::fmt::Error> { + self.fmt_debug("LessSafeKey", f) + } +} diff --git a/ring-0.17.14/src/aead/nonce.rs b/ring-0.17.14/src/aead/nonce.rs new file mode 100644 index 0000000000..e314fae8d2 --- /dev/null +++ b/ring-0.17.14/src/aead/nonce.rs @@ -0,0 +1,51 @@ +// Copyright 2018 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +use crate::error; + +/// A nonce for a single AEAD opening or sealing operation. +/// +/// The user must ensure, for a particular key, that each nonce is unique. +/// +/// `Nonce` intentionally doesn't implement `Clone` to ensure that each one is +/// consumed at most once. +pub struct Nonce([u8; NONCE_LEN]); + +impl Nonce { + /// Constructs a `Nonce` with the given value, assuming that the value is + /// unique for the lifetime of the key it is being used with. + /// + /// Fails if `value` isn't `NONCE_LEN` bytes long. + #[inline] + pub fn try_assume_unique_for_key(value: &[u8]) -> Result { + let value: &[u8; NONCE_LEN] = value.try_into()?; + Ok(Self::assume_unique_for_key(*value)) + } + + /// Constructs a `Nonce` with the given value, assuming that the value is + /// unique for the lifetime of the key it is being used with. + #[inline] + pub fn assume_unique_for_key(value: [u8; NONCE_LEN]) -> Self { + Self(value) + } +} + +impl AsRef<[u8; NONCE_LEN]> for Nonce { + fn as_ref(&self) -> &[u8; NONCE_LEN] { + &self.0 + } +} + +/// All the AEADs we support use 96-bit nonces. +pub const NONCE_LEN: usize = 96 / 8; diff --git a/ring-0.17.14/src/aead/opening_key.rs b/ring-0.17.14/src/aead/opening_key.rs new file mode 100644 index 0000000000..9e10fc9a05 --- /dev/null +++ b/ring-0.17.14/src/aead/opening_key.rs @@ -0,0 +1,143 @@ +// Copyright 2015-2021 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. 
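For context on how the `LessSafeKey` API above is typically driven by callers, a hedged usage sketch (assuming the crate is consumed as `ring` with its public `aead` module; key and nonce values are placeholders):

```rust
// Illustrative use of LessSafeKey::seal_in_place_append_tag / open_in_place;
// not part of the vendored sources. Key/nonce values are dummies.
use ring::aead::{Aad, LessSafeKey, Nonce, UnboundKey, AES_256_GCM};

fn main() -> Result<(), ring::error::Unspecified> {
    let key_bytes = [0u8; 32];
    let nonce_bytes = [0u8; 12]; // must never repeat for a given key
    let key = LessSafeKey::new(UnboundKey::new(&AES_256_GCM, &key_bytes)?);

    // Seal: the ciphertext overwrites the plaintext and the tag is appended.
    let mut in_out = b"hello".to_vec();
    key.seal_in_place_append_tag(
        Nonce::assume_unique_for_key(nonce_bytes),
        Aad::empty(),
        &mut in_out,
    )?;

    // Open: the caller supplies the same nonce; on success the returned slice
    // is the plaintext, without the tag.
    let plaintext = key.open_in_place(
        Nonce::assume_unique_for_key(nonce_bytes),
        Aad::empty(),
        &mut in_out,
    )?;
    assert_eq!(plaintext, &b"hello"[..]);
    Ok(())
}
```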
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +//! Authenticated Encryption with Associated Data (AEAD). +//! +//! See [Authenticated encryption: relations among notions and analysis of the +//! generic composition paradigm][AEAD] for an introduction to the concept of +//! AEADs. +//! +//! [AEAD]: https://eprint.iacr.org/2000/025.pdf +//! [`crypto.cipher.AEAD`]: https://golang.org/pkg/crypto/cipher/#AEAD + +use super::{Aad, Algorithm, BoundKey, LessSafeKey, NonceSequence, UnboundKey}; +use crate::error; +use core::ops::RangeFrom; + +/// An AEAD key for authenticating and decrypting ("opening"), bound to a nonce +/// sequence. +/// +/// Intentionally not `Clone` or `Copy` since cloning would allow duplication +/// of the nonce sequence. +pub struct OpeningKey { + key: LessSafeKey, + nonce_sequence: N, +} + +impl BoundKey for OpeningKey { + fn new(key: UnboundKey, nonce_sequence: N) -> Self { + Self { + key: key.into_inner(), + nonce_sequence, + } + } + + #[inline] + fn algorithm(&self) -> &'static Algorithm { + self.key.algorithm() + } +} + +impl core::fmt::Debug for OpeningKey { + fn fmt(&self, f: &mut core::fmt::Formatter) -> Result<(), core::fmt::Error> { + self.key.fmt_debug("OpeningKey", f) + } +} + +impl OpeningKey { + /// Authenticates and decrypts (“opens”) data in place. + /// + /// `aad` is the additional authenticated data (AAD), if any. + /// + /// On input, `in_out` must be the ciphertext followed by the tag. When + /// `open_in_place()` returns `Ok(plaintext)`, the input ciphertext + /// has been overwritten by the plaintext; `plaintext` will refer to the + /// plaintext without the tag. + /// + /// When `open_in_place()` returns `Err(..)`, `in_out` may have been + /// overwritten in an unspecified way. + #[inline] + pub fn open_in_place<'in_out, A>( + &mut self, + aad: Aad, + in_out: &'in_out mut [u8], + ) -> Result<&'in_out mut [u8], error::Unspecified> + where + A: AsRef<[u8]>, + { + self.key + .open_in_place(self.nonce_sequence.advance()?, aad, in_out) + } + + /// Authenticates and decrypts (“opens”) data in place, with a shift. + /// + /// `aad` is the additional authenticated data (AAD), if any. + /// + /// On input, `in_out[ciphertext_and_tag]` must be the ciphertext followed + /// by the tag. When `open_within()` returns `Ok(plaintext)`, the plaintext + /// will be at `in_out[0..plaintext.len()]`. In other words, the following + /// two code fragments are equivalent for valid values of + /// `ciphertext_and_tag`, except `open_within` will often be more efficient: + /// + /// + /// ```skip + /// let plaintext = key.open_within(aad, in_out, cipertext_and_tag)?; + /// ``` + /// + /// ```skip + /// let ciphertext_and_tag_len = in_out[ciphertext_and_tag].len(); + /// in_out.copy_within(ciphertext_and_tag, 0); + /// let plaintext = key.open_in_place(aad, &mut in_out[..ciphertext_and_tag_len])?; + /// ``` + /// + /// Similarly, `key.open_within(aad, in_out, 0..)` is equivalent to + /// `key.open_in_place(aad, in_out)`. + /// + /// When `open_in_place()` returns `Err(..)`, `in_out` may have been + /// overwritten in an unspecified way. + /// + /// The shifting feature is useful in the case where multiple packets are + /// being reassembled in place. 
Consider this example where the peer has + /// sent the message “Split stream reassembled in place” split into + /// three sealed packets: + /// + /// ```ascii-art + /// Packet 1 Packet 2 Packet 3 + /// Input: [Header][Ciphertext][Tag][Header][Ciphertext][Tag][Header][Ciphertext][Tag] + /// | +--------------+ | + /// +------+ +-----+ +----------------------------------+ + /// v v v + /// Output: [Plaintext][Plaintext][Plaintext] + /// “Split stream reassembled in place” + /// ``` + /// + /// This reassembly can be accomplished with three calls to `open_within()`. + #[inline] + pub fn open_within<'in_out, A>( + &mut self, + aad: Aad, + in_out: &'in_out mut [u8], + ciphertext_and_tag: RangeFrom, + ) -> Result<&'in_out mut [u8], error::Unspecified> + where + A: AsRef<[u8]>, + { + self.key.open_within( + self.nonce_sequence.advance()?, + aad, + in_out, + ciphertext_and_tag, + ) + } +} diff --git a/ring-0.17.14/src/aead/overlapping/array.rs b/ring-0.17.14/src/aead/overlapping/array.rs new file mode 100644 index 0000000000..9c797da187 --- /dev/null +++ b/ring-0.17.14/src/aead/overlapping/array.rs @@ -0,0 +1,60 @@ +// Copyright 2024 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +#![cfg_attr(not(test), allow(dead_code))] + +use super::Overlapping; +use crate::error::LenMismatchError; +use core::array::TryFromSliceError; + +pub struct Array<'o, T, const N: usize> { + // Invariant: N != 0. + // Invariant: `self.in_out.len() == N`. + in_out: Overlapping<'o, T>, +} + +impl<'o, T, const N: usize> Array<'o, T, N> { + pub(super) fn new(in_out: Overlapping<'o, T>) -> Result { + if N == 0 || in_out.len() != N { + return Err(LenMismatchError::new(N)); + } + Ok(Self { in_out }) + } + + pub fn into_unwritten_output(self) -> &'o mut [T; N] + where + &'o mut [T]: TryInto<&'o mut [T; N], Error = TryFromSliceError>, + { + self.in_out + .into_unwritten_output() + .try_into() + .unwrap_or_else(|TryFromSliceError { .. }| { + unreachable!() // Due to invariant + }) + } +} + +impl Array<'_, T, N> { + pub fn input<'s>(&'s self) -> &'s [T; N] + where + &'s [T]: TryInto<&'s [T; N], Error = TryFromSliceError>, + { + self.in_out + .input() + .try_into() + .unwrap_or_else(|TryFromSliceError { .. }| { + unreachable!() // Due to invariant + }) + } +} diff --git a/ring-0.17.14/src/aead/overlapping/base.rs b/ring-0.17.14/src/aead/overlapping/base.rs new file mode 100644 index 0000000000..6923587223 --- /dev/null +++ b/ring-0.17.14/src/aead/overlapping/base.rs @@ -0,0 +1,152 @@ +// Copyright 2024 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. 
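The shifted-decryption behaviour documented above (plaintext written back at the front of the buffer) can also be exercised without a `NonceSequence` via `LessSafeKey::open_within`; a hedged sketch, where `header_len` is a made-up parameter for the bytes to be skipped:

```rust
// Illustrative only (not part of the vendored sources): opening one sealed
// record in place when it is preceded by `header_len` bytes that should be
// skipped, mirroring the OpeningKey::open_within docs above.
use ring::aead::{Aad, LessSafeKey, Nonce};

fn open_shifted<'a>(
    key: &LessSafeKey,
    nonce: [u8; 12],
    buf: &'a mut [u8],
    header_len: usize,
) -> Result<&'a mut [u8], ring::error::Unspecified> {
    // buf[header_len..] is ciphertext || tag; on success the plaintext has
    // been shifted down to buf[..plaintext.len()].
    key.open_within(
        Nonce::assume_unique_for_key(nonce),
        Aad::empty(),
        buf,
        header_len..,
    )
}
```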
+// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +pub use self::index_error::IndexError; +use super::Array; +use crate::error::LenMismatchError; +use core::{mem, ops::RangeFrom}; + +pub struct Overlapping<'o, T> { + // Invariant: self.src.start <= in_out.len(). + in_out: &'o mut [T], + src: RangeFrom, +} + +impl<'o, T> From<&'o mut [T]> for Overlapping<'o, T> { + fn from(in_out: &'o mut [T]) -> Self { + Self { in_out, src: 0.. } + } +} + +impl<'o, T> Overlapping<'o, T> { + pub fn new(in_out: &'o mut [T], src: RangeFrom) -> Result { + match in_out.get(src.clone()) { + Some(_) => Ok(Self { in_out, src }), + None => Err(IndexError::new(src.start)), + } + } + + #[cfg(any( + all(target_arch = "arm", target_endian = "little"), + target_arch = "x86" + ))] + pub fn copy_within(self) -> &'o mut [T] + where + T: Copy, + { + if self.src.start == 0 { + self.in_out + } else { + let len = self.len(); + self.in_out.copy_within(self.src, 0); + &mut self.in_out[..len] + } + } + + #[cfg(any( + all(target_arch = "arm", target_endian = "little"), + target_arch = "x86" + ))] + pub fn into_slice_src_mut(self) -> (&'o mut [T], RangeFrom) { + (self.in_out, self.src) + } + + pub fn into_unwritten_output(self) -> &'o mut [T] { + let len = self.len(); + self.in_out.get_mut(..len).unwrap_or_else(|| { + // The invariant ensures this succeeds. + unreachable!() + }) + } +} + +impl Overlapping<'_, T> { + pub fn len(&self) -> usize { + self.input().len() + } + + pub fn input(&self) -> &[T] { + self.in_out.get(self.src.clone()).unwrap_or_else(|| { + // Ensured by invariant. + unreachable!() + }) + } + + pub fn with_input_output_len(self, f: impl FnOnce(*const T, *mut T, usize) -> R) -> R { + let len = self.len(); + let output = self.in_out.as_mut_ptr(); + // TODO: MSRV(1.65): use `output.cast_const()` + let output_const: *const T = output; + // SAFETY: The constructor ensures that `src` is a valid range. + // Equivalent to `self.in_out[src.clone()].as_ptr()` but without + // worries about compatibility with the stacked borrows model. + // TODO(MSRV-1.80, probably): Avoid special casing 0; see + // https://github.com/rust-lang/rust/pull/117329 + // https://github.com/rust-lang/rustc_codegen_gcc/issues/516 + let input = if self.src.start == 0 { + output_const + } else { + unsafe { output_const.add(self.src.start) } + }; + f(input, output, len) + } + + // Perhaps unlike `slice::split_first_chunk_mut`, this is biased, + // performance-wise, against the case where `N > self.len()`, so callers + // should be structured to avoid that. + // + // If the result is `Err` then nothing was written to `self`; if anything + // was written then the result will not be `Err`. 
+ #[cfg_attr(not(test), allow(dead_code))] + pub fn split_first_chunk( + mut self, + f: impl for<'a> FnOnce(Array<'a, T, N>), + ) -> Result { + let src = self.src.clone(); + let end = self + .src + .start + .checked_add(N) + .ok_or_else(|| IndexError::new(N))?; + let first = self + .in_out + .get_mut(..end) + .ok_or_else(|| IndexError::new(N))?; + let first = Overlapping::new(first, src).unwrap_or_else(|IndexError { .. }| { + // Since `end == src.start + N`. + unreachable!() + }); + let first = Array::new(first).unwrap_or_else(|LenMismatchError { .. }| { + // Since `end == src.start + N`. + unreachable!() + }); + // Once we call `f`, we must return `Ok` because `f` may have written + // over (part of) the input. + Ok({ + f(first); + let tail = mem::take(&mut self.in_out).get_mut(N..).unwrap_or_else(|| { + // There are at least `N` elements since `end == src.start + N`. + unreachable!() + }); + Self::new(tail, self.src).unwrap_or_else(|IndexError { .. }| { + // Follows from `end == src.start + N`. + unreachable!() + }) + }) + } +} + +cold_exhaustive_error! { + struct index_error::IndexError { index: usize } +} diff --git a/ring-0.17.14/src/aead/overlapping/mod.rs b/ring-0.17.14/src/aead/overlapping/mod.rs new file mode 100644 index 0000000000..77086cda43 --- /dev/null +++ b/ring-0.17.14/src/aead/overlapping/mod.rs @@ -0,0 +1,23 @@ +// Copyright 2024 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +pub use self::{ + array::Array, + base::{IndexError, Overlapping}, + partial_block::PartialBlock, +}; + +mod array; +mod base; +mod partial_block; diff --git a/ring-0.17.14/src/aead/overlapping/partial_block.rs b/ring-0.17.14/src/aead/overlapping/partial_block.rs new file mode 100644 index 0000000000..9b7864a1e1 --- /dev/null +++ b/ring-0.17.14/src/aead/overlapping/partial_block.rs @@ -0,0 +1,59 @@ +// Copyright 2024 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +use super::Overlapping; +use crate::error::InputTooLongError; + +pub struct PartialBlock<'i, T, const BLOCK_LEN: usize> { + // invariant: `self.in_out.len() < BLOCK_LEN`. 
+ in_out: Overlapping<'i, T>, +} + +impl<'i, T, const BLOCK_LEN: usize> PartialBlock<'i, T, BLOCK_LEN> { + pub fn new(in_out: Overlapping<'i, T>) -> Result { + let len = in_out.len(); + if len >= BLOCK_LEN { + return Err(InputTooLongError::new(len)); + } + Ok(Self { in_out }) + } + + pub fn overwrite_at_start(self, padded: [T; BLOCK_LEN]) + where + T: Copy, + { + let len = self.len(); + let output = self.in_out.into_unwritten_output(); + assert!(output.len() <= padded.len()); + output.copy_from_slice(&padded[..len]); + } +} + +impl PartialBlock<'_, T, BLOCK_LEN> { + #[inline(always)] + pub fn input(&self) -> &[T] { + let r = self.in_out.input(); + // Help the optimizer optimize the caller using the invariant. + // TODO: Does this actually help? + if r.len() >= BLOCK_LEN { + unreachable!() + } + r + } + + #[inline(always)] + pub fn len(&self) -> usize { + self.input().len() + } +} diff --git a/ring-0.17.14/src/aead/poly1305.rs b/ring-0.17.14/src/aead/poly1305.rs new file mode 100644 index 0000000000..70dcda3126 --- /dev/null +++ b/ring-0.17.14/src/aead/poly1305.rs @@ -0,0 +1,117 @@ +// Copyright 2015-2025 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +// TODO: enforce maximum input length. + +use super::{Tag, TAG_LEN}; +#[cfg(all(target_arch = "arm", target_endian = "little"))] +use crate::cpu::GetFeature as _; +use crate::{cpu, polyfill::slice::AsChunks}; + +mod ffi_arm_neon; +mod ffi_fallback; + +/// A Poly1305 key. 
+pub(super) struct Key { + key_and_nonce: [u8; KEY_LEN], +} + +pub(super) const BLOCK_LEN: usize = 16; +pub(super) const KEY_LEN: usize = 2 * BLOCK_LEN; + +impl Key { + #[inline] + pub(super) fn new(key_and_nonce: [u8; KEY_LEN]) -> Self { + Self { key_and_nonce } + } +} + +pub(super) enum Context { + #[cfg(all(target_arch = "arm", target_endian = "little"))] + ArmNeon(ffi_arm_neon::State), + Fallback(ffi_fallback::State), +} + +impl Context { + #[inline] + pub(super) fn from_key(key: Key, cpu: cpu::Features) -> Self { + #[cfg(all(target_arch = "arm", target_endian = "little"))] + if let Some(cpu) = cpu.get_feature() { + return ffi_arm_neon::State::new_context(key, cpu); + } + let _: cpu::Features = cpu; + ffi_fallback::State::new_context(key) + } + + pub fn update_block(&mut self, input: [u8; BLOCK_LEN]) { + self.update(AsChunks::from_ref(&input)) + } + + pub fn update(&mut self, input: AsChunks) { + self.update_internal(input.as_flattened()); + } + + fn update_internal(&mut self, input: &[u8]) { + match self { + #[cfg(all(target_arch = "arm", target_endian = "little"))] + Self::ArmNeon(state) => state.update_internal(input), + Self::Fallback(state) => state.update_internal(input), + } + } + + pub(super) fn finish(mut self, input: &[u8]) -> Tag { + self.update_internal(input); + match self { + #[cfg(all(target_arch = "arm", target_endian = "little"))] + Self::ArmNeon(state) => state.finish(), + Self::Fallback(state) => state.finish(), + } + } +} + +/// Implements the original, non-IETF padding semantics. +/// +/// This is used by chacha20_poly1305_openssh and the standalone +/// poly1305 test vectors. +pub(super) fn sign(key: Key, input: &[u8], cpu_features: cpu::Features) -> Tag { + let ctx = Context::from_key(key, cpu_features); + ctx.finish(input) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::testutil as test; + + // Adapted from BoringSSL's crypto/poly1305/poly1305_test.cc. + #[test] + pub fn test_poly1305() { + let cpu_features = cpu::features(); + test::run( + test_vector_file!("poly1305_test.txt"), + |section, test_case| { + assert_eq!(section, ""); + let key = test_case.consume_bytes("Key"); + let key: &[u8; KEY_LEN] = key.as_slice().try_into().unwrap(); + let input = test_case.consume_bytes("Input"); + let expected_mac = test_case.consume_bytes("MAC"); + let key = Key::new(*key); + let Tag(actual_mac) = sign(key, &input, cpu_features); + assert_eq!(expected_mac, actual_mac.as_ref()); + + Ok(()) + }, + ) + } +} diff --git a/ring-0.17.14/src/aead/poly1305/ffi_arm_neon.rs b/ring-0.17.14/src/aead/poly1305/ffi_arm_neon.rs new file mode 100644 index 0000000000..5c1542d191 --- /dev/null +++ b/ring-0.17.14/src/aead/poly1305/ffi_arm_neon.rs @@ -0,0 +1,98 @@ +// Copyright 2015-2025 Brian Smith. +// Portions Copyright (c) 2014, 2015, Google Inc. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
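The test-vector file consumed by the vendored Poly1305 test above (`poly1305_test.txt`, included later in this patch) is a simple line-oriented `Key = ... / Input = ... / MAC = ...` format. A rough standalone sketch of decoding one hex-valued record follows; it ignores the quoted-string `Input` form used by the RFC 8439 example, and the vendored `testutil` harness is what actually parses the file:

```rust
// Illustrative sketch only (not vendored code): decoding one hex-valued
// record of poly1305_test.txt.
fn hex_to_bytes(s: &str) -> Vec<u8> {
    (0..s.len())
        .step_by(2)
        .map(|i| u8::from_str_radix(&s[i..i + 2], 16).expect("invalid hex"))
        .collect()
}

fn parse_record(record: &str) -> (Vec<u8>, Vec<u8>, Vec<u8>) {
    let (mut key, mut input, mut mac) = (Vec::new(), Vec::new(), Vec::new());
    for line in record.lines() {
        if let Some(v) = line.strip_prefix("Key = ") {
            key = hex_to_bytes(v.trim());
        } else if let Some(v) = line.strip_prefix("Input = ") {
            input = hex_to_bytes(v.trim());
        } else if let Some(v) = line.strip_prefix("MAC = ") {
            mac = hex_to_bytes(v.trim());
        }
    }
    (key, input, mac)
}

fn main() {
    // One of the RFC 8439 A.3 vectors from the file below.
    let (key, input, mac) = parse_record(
        "Key = 0200000000000000000000000000000000000000000000000000000000000000\n\
         Input = ffffffffffffffffffffffffffffffff\n\
         MAC = 03000000000000000000000000000000\n",
    );
    assert_eq!(key.len(), 32);
    assert_eq!(input.len(), 16);
    assert_eq!(mac.len(), 16);
    // A Poly1305 implementation would now check that MAC(Key, Input) == MAC.
}
```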
+ +#![cfg(all(target_arch = "arm", target_endian = "little"))] + +use super::{Key, Tag, KEY_LEN, TAG_LEN}; +use crate::{c, cpu::arm::Neon}; +use core::num::NonZeroUsize; + +// XXX/TODO(MSRV): change to `pub(super)`. +pub(in super::super) struct State { + state: poly1305_state_st, + neon: Neon, +} + +// TODO: Is 16 enough? +#[repr(C, align(16))] +struct poly1305_state_st { + r: fe1305x2, + h: fe1305x2, + c: fe1305x2, + precomp: [fe1305x2; 2], + + data: [u8; data_len()], + + buf: [u8; 32], + buf_used: c::size_t, + key: [u8; 16], +} + +const fn data_len() -> usize { + 128 +} + +#[derive(Clone, Copy)] +#[repr(C)] +struct fe1305x2 { + v: [u32; 12], // for alignment; only using 10 +} + +impl State { + pub(super) fn new_context(Key { key_and_nonce }: Key, neon: Neon) -> super::Context { + prefixed_extern! { + fn CRYPTO_poly1305_init_neon(state: &mut poly1305_state_st, key: &[u8; KEY_LEN]); + } + let mut r = Self { + state: poly1305_state_st { + r: fe1305x2 { v: [0; 12] }, + h: fe1305x2 { v: [0; 12] }, + c: fe1305x2 { v: [0; 12] }, + precomp: [fe1305x2 { v: [0; 12] }; 2], + + data: [0u8; data_len()], + buf: Default::default(), + buf_used: 0, + key: [0u8; 16], + }, + neon, + }; + unsafe { CRYPTO_poly1305_init_neon(&mut r.state, &key_and_nonce) } + super::Context::ArmNeon(r) + } + + pub(super) fn update_internal(&mut self, input: &[u8]) { + prefixed_extern! { + fn CRYPTO_poly1305_update_neon( + st: &mut poly1305_state_st, + input: *const u8, + in_len: c::NonZero_size_t); + } + if let Some(len) = NonZeroUsize::new(input.len()) { + let _: Neon = self.neon; + let input = input.as_ptr(); + unsafe { CRYPTO_poly1305_update_neon(&mut self.state, input, len) } + } + } + + pub(super) fn finish(mut self) -> Tag { + prefixed_extern! { + fn CRYPTO_poly1305_finish_neon(st: &mut poly1305_state_st, mac: &mut [u8; TAG_LEN]); + } + let mut tag = Tag([0u8; TAG_LEN]); + unsafe { CRYPTO_poly1305_finish_neon(&mut self.state, &mut tag.0) } + tag + } +} diff --git a/ring-0.17.14/src/aead/poly1305/ffi_fallback.rs b/ring-0.17.14/src/aead/poly1305/ffi_fallback.rs new file mode 100644 index 0000000000..f689cfcb33 --- /dev/null +++ b/ring-0.17.14/src/aead/poly1305/ffi_fallback.rs @@ -0,0 +1,96 @@ +// Copyright 2015-2025 Brian Smith. +// Portions Copyright (c) 2014, 2015, Google Inc. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +use super::{Key, Tag, KEY_LEN, TAG_LEN}; +use crate::c; +use core::num::NonZeroUsize; + +// XXX/TODO(MSRV): change to `pub(super)`. 
+pub(in super::super) struct State { + state: poly1305_state_st, +} + +// Keep in sync with `poly1305_state_st` in poly1305.c +#[repr(C, align(64))] +struct poly1305_state_st { + r0: u32, + r1: u32, + r2: u32, + r3: u32, + r4: u32, + s1: u32, + s2: u32, + s3: u32, + s4: u32, + h0: u32, + h1: u32, + h2: u32, + h3: u32, + h4: u32, + key: [u8; 16], +} + +impl State { + pub(super) fn new_context(Key { key_and_nonce }: Key) -> super::Context { + prefixed_extern! { + fn CRYPTO_poly1305_init(state: &mut poly1305_state_st, key: &[u8; KEY_LEN]); + } + let mut r = Self { + state: poly1305_state_st { + r0: 0, + r1: 0, + r2: 0, + r3: 0, + r4: 0, + s1: 0, + s2: 0, + s3: 0, + s4: 0, + h0: 0, + h1: 0, + h2: 0, + h3: 0, + h4: 0, + key: [0u8; 16], + }, + }; + unsafe { CRYPTO_poly1305_init(&mut r.state, &key_and_nonce) } + super::Context::Fallback(r) + } + + // `input.len % BLOCK_LEN == 0` must be true for every call except the + // final one. + pub(super) fn update_internal(&mut self, input: &[u8]) { + prefixed_extern! { + fn CRYPTO_poly1305_update( + state: &mut poly1305_state_st, + input: *const u8, + in_len: c::NonZero_size_t); + } + if let Some(len) = NonZeroUsize::new(input.len()) { + let input = input.as_ptr(); + unsafe { CRYPTO_poly1305_update(&mut self.state, input, len) } + } + } + + pub(super) fn finish(mut self) -> Tag { + prefixed_extern! { + fn CRYPTO_poly1305_finish(statep: &mut poly1305_state_st, mac: &mut [u8; TAG_LEN]); + } + let mut tag = Tag([0u8; TAG_LEN]); + unsafe { CRYPTO_poly1305_finish(&mut self.state, &mut tag.0) } + tag + } +} diff --git a/ring-0.17.14/src/aead/poly1305_test.txt b/ring-0.17.14/src/aead/poly1305_test.txt new file mode 100644 index 0000000000..586477df8a --- /dev/null +++ b/ring-0.17.14/src/aead/poly1305_test.txt @@ -0,0 +1,170 @@ +# Test Vectors from OpenSSL commit bbe9769ba66ab2512678a87b0d9b266ba970db05. + +Key = 2d773be37adb1e4d683bf0075e79c4ee037918535a7f99ccb7040fb5f5f43aea +Input = 89dab80b7717c1db5db437860a3f70218e93e1b8f461fb677f16f35f6f87e2a91c99bc3a47ace47640cc95c345be5ecca5a3523c35cc01893af0b64a620334270372ec12482d1b1e363561698a578b359803495bb4e2ef1930b17a5190b580f141300df30adbeca28f6427a8bc1a999fd51c554a017d095d8c3e3127daf9f595 +MAC = c85d15ed44c378d6b00e23064c7bcd51 + +Key = 99e5822dd4173c995e3dae0ddefb97743fde3b080134b39f76e9bf8d0e88d546 +Input = 000000000000000b170303020000000006db1f1f368d696a810a349c0c714c9a5e7850c2407d721acded95e018d7a85266a6e1289cdb4aeb18da5ac8a2b0026d24a59ad485227f3eaedbb2e7e35e1c66cd60f9abf716dcc9ac42682dd7dab287a7024c4eefc321cc0574e16793e37cec03c5bda42b54c114a80b57af26416c7be742005e20855c73e21dc8e2edc9d435cb6f6059280011c270b71570051c1c9b3052126620bc1e2730fa066c7a509d53c60e5ae1b40aa6e39e49669228c90eecb4a50db32a50bc49e90b4f4b359a1dfd11749cd3867fcf2fb7bb6cd4738f6a4ad6f7ca5058f7618845af9f020f6c3b967b8f4cd4a91e2813b507ae66f2d35c18284f7292186062e10fd5510d18775351ef334e7634ab4743f5b68f49adcab384d3fd75f7390f4006ef2a295c8c7a076ad54546cd25d2107fbe1436c840924aaebe5b370893cd63d1325b8616fc4810886bc152c53221b6df373119393255ee72bcaa880174f1717f9184fa91646f17a24ac55d16bfddca9581a92eda479201f0edbf633600d6066d1ab36d5d2415d71351bbcd608a25108d25641992c1f26c531cf9f90203bc4cc19f5927d834b0a47116d3884bbb164b8ec883d1ac832e56b3918a98601a08d171881541d594db399c6ae6151221745aec814c45b0b05b565436fd6f137aa10a0c0b643761dbd6f9a9dcb99b1a6e690854ce0769cde39761d82fcdec15f0d92d7d8e94ade8eb83fbe0 +MAC = 2637408fe13086ea73f971e3425e2820 + + +# RFC 8439, section 2.5.2. 
+ +Key = 85d6be7857556d337f4452fe42d506a80103808afb0db2fd4abff6af4149f51b +Input = "Cryptographic Forum Research Group" +MAC = a8061dc1305136c6c22b8baf0c0127a9 + + +# RFC 8439, section A.3. + +Key = 0000000000000000000000000000000000000000000000000000000000000000 +Input = 00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 +MAC = 00000000000000000000000000000000 + +Key = 0000000000000000000000000000000036e5f6b5c5e06070f0efca96227a863e +Input = 416e79207375626d697373696f6e20746f20746865204945544620696e74656e6465642062792074686520436f6e7472696275746f7220666f72207075626c69636174696f6e20617320616c6c206f722070617274206f6620616e204945544620496e7465726e65742d4472616674206f722052464320616e6420616e792073746174656d656e74206d6164652077697468696e2074686520636f6e74657874206f6620616e204945544620616374697669747920697320636f6e7369646572656420616e20224945544620436f6e747269627574696f6e222e20537563682073746174656d656e747320696e636c756465206f72616c2073746174656d656e747320696e20494554462073657373696f6e732c2061732077656c6c206173207772697474656e20616e6420656c656374726f6e696320636f6d6d756e69636174696f6e73206d61646520617420616e792074696d65206f7220706c6163652c207768696368206172652061646472657373656420746f +MAC = 36e5f6b5c5e06070f0efca96227a863e + +Key = 36e5f6b5c5e06070f0efca96227a863e00000000000000000000000000000000 +Input = 416e79207375626d697373696f6e20746f20746865204945544620696e74656e6465642062792074686520436f6e7472696275746f7220666f72207075626c69636174696f6e20617320616c6c206f722070617274206f6620616e204945544620496e7465726e65742d4472616674206f722052464320616e6420616e792073746174656d656e74206d6164652077697468696e2074686520636f6e74657874206f6620616e204945544620616374697669747920697320636f6e7369646572656420616e20224945544620436f6e747269627574696f6e222e20537563682073746174656d656e747320696e636c756465206f72616c2073746174656d656e747320696e20494554462073657373696f6e732c2061732077656c6c206173207772697474656e20616e6420656c656374726f6e696320636f6d6d756e69636174696f6e73206d61646520617420616e792074696d65206f7220706c6163652c207768696368206172652061646472657373656420746f +MAC = f3477e7cd95417af89a6b8794c310cf0 + +Key = 1c9240a5eb55d38af333888604f6b5f0473917c1402b80099dca5cbc207075c0 +Input = 2754776173206272696c6c69672c20616e642074686520736c6974687920746f7665730a446964206779726520616e642067696d626c6520696e2074686520776162653a0a416c6c206d696d737920776572652074686520626f726f676f7665732c0a416e6420746865206d6f6d65207261746873206f757467726162652e +MAC = 4541669a7eaaee61e708dc7cbcc5eb62 + +Key = 0200000000000000000000000000000000000000000000000000000000000000 +Input = ffffffffffffffffffffffffffffffff +MAC = 03000000000000000000000000000000 + +Key = 02000000000000000000000000000000ffffffffffffffffffffffffffffffff +Input = 02000000000000000000000000000000 +MAC = 03000000000000000000000000000000 + +Key = 0100000000000000000000000000000000000000000000000000000000000000 +Input = fffffffffffffffffffffffffffffffff0ffffffffffffffffffffffffffffff11000000000000000000000000000000 +MAC = 05000000000000000000000000000000 + +Key = 0100000000000000000000000000000000000000000000000000000000000000 +Input = fffffffffffffffffffffffffffffffffbfefefefefefefefefefefefefefefe01010101010101010101010101010101 +MAC = 00000000000000000000000000000000 + +Key = 0200000000000000000000000000000000000000000000000000000000000000 +Input = fdffffffffffffffffffffffffffffff +MAC = faffffffffffffffffffffffffffffff + +Key = 0100000000000000040000000000000000000000000000000000000000000000 
+Input = e33594d7505e43b900000000000000003394d7505e4379cd01000000000000000000000000000000000000000000000001000000000000000000000000000000 +MAC = 14000000000000005500000000000000 + +Key = 0100000000000000040000000000000000000000000000000000000000000000 +Input = e33594d7505e43b900000000000000003394d7505e4379cd010000000000000000000000000000000000000000000000 +MAC = 13000000000000000000000000000000 + + +# Additional test vectors that are long enough to ensure OpenSSL's SIMD +# assembly is fully tested. + +# Length 2048. +Key = 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f +Input = 248ac31085b6c2adaaa38259a0d7192c5c35d1bb4ef39ad94c38d1c82479e2dd2159a077024b0589bc8a20101b506f0a1ad0bbab76e83a83f1b94be6beae74e874cab692c5963a75436b776121ec9f62399a3e66b2d22707dae81933b6277f3c8516bcbe26dbbd86f373103d7cf4cad1888c952118fbfbd0d7b4bedc4ae4936aff91157e7aa47c54442ea78d6ac251d324a0fbe49d89cc3521b66d16e9c66a3709894e4eb0a4eedc4ae19468e66b81f271351b1d921ea551047abcc6b87a901fde7db79fa1818c11336dbc07244a40eb14cf77bde35e78ae9ad7d3f57ed7e7f23926c9172f82d77684ea5ed7d74ebc6f142b997036bcb7cce8df1bbc0d5b35a46509c954fc9469d214d6238f166cbf872156b4c41d7aac5942cffb175023078252a3f36e315c5d4ce0e39928a018252862becacef96a19f03bdcf46d75584299d1f8b03c0169e9e407d937145b5e5024139e7022a1978f114f24cdfa23780a119735c41da8fb759bbb3f025c6ec30e6c6e9bce8615be68e392fce59fd26a8e6a6cc5c606e3848116e4d01d29565a1facfb524b6d29643b826eee1e42869fc76df229dd79b39a2b1df28bb335c3a5f15a855d0121e4a6da34b5e4d5b7b5d5746a03ecff70811e1516fcec1bf7462e8876a2d21710aa168c78f45a6a15015950e221da85d3ec822ad6d0a6931b25a06b7bb5f3c10bb36cd4d647f9561982fde9818de5d4bf8db7f86c53b4ff14928ac15f79023b61861e73e44216540bb302153770da2533de9795252ab5fb77ad924c9338c8144c23d4c90dab9a18feac1a1574d4545e1435eb405e6c4c439fc724fce992ae85badf345bad16d85fbd338f04433703614754d0e7e54c4ccde2670587d52ecfb5a70a14a501bacc727722649931d8515b13d020a78e511fe136d45fbf97f9c7f689fcc677cfb3683723878350ffe9d08130cc6e567b6179e01b7eb2b3bbcf0873e1308eec018edeb8cce946338e15d5bf68c71916a83a99358039ef071e009546a2df936879dffbba397a93925d229a469fd17d71b7f524e03a30da6ee927542f8b369bed4734fe25dbd63d24ffd2a222f5f84f75d858ab989be925af570ad6d45bd28ce61b5139e1dd2f0b7795fe072e6e83acbb5e7b777a70c641e4cab2af40eed69abc334cd2703c3273204fac580c6a3d6680427e5f7d051e8380a53f93a180f4556ecea4530b9a2d5948dad63d415b6874f6b90e767d6d265be86351b53ba690780bb57c21b57418c5b97559e840c68257f839e7583a4bf7c7645c5987d40cc1ba79a218c35edfacdabe581d950e4bb7a481ebe64d61d00e75b1f25f1ce5f5462334a5b9038a697aa0937a3f8017e05d2c9c05dcb05c0b02508dea619b137f5444b6f088eb3cb2c66788f88afdfbba8faa1c490485624c88ae11e57347a676902e7553f056188493209bdbb30acc63c9e41e16a9d6c009416b520a76ba38f57628170c43626b5cb46179dc5bf65de865085f84bf741c223fbe474d2d19d8f43914fbd6586351089e73babf344f988b7963fe44528457d7aad3c564f6bcbd0d772a4c9fd328e6022d1c7c9f86726f8d5a23797d309c0f653ab1ac687833eb2700f156296062a8b377078f45f6b68c3d07cae1913ba8d5a6f9bf7525a3439eb932d4cefc4bf8e1b07b48ca13ece366cbc3e0388915915d1757475103a9e9454e7e6355de2d6acbf4710f9a63e4f6d3cd70c2d6fca88dd8a14448fdb63ce9350fdaafbe0b8bd1c5d307dae76dfed799aef2d8f23d5608d37d1330dd38b94860905dbeebf78d7b7318b7d42aed40d3f9899e9f420cbd92a6eeae3026f7725694e0e4bee016ba346fed2c21172bdb4a461cebe0cfe38e76645226ac127a259c193264d735ce8c8a57e17dd3f0579e2e86dc295ad1f45ba2d85db35044da61f7d401274b31eefbeb34e8d2ae596e9b4541aae117bdac5ed0b324c20539c27c07a411d5288b0b5f6fa16e9a7df85dc319fa6b71cd08a859c06a3f7b0289e1750adbf182f9750fea96fea5ab7aa3473340607cd7ed2c626f5382491c26d5d5bea61401dee7319c94d418f297e61ceac8f258ee
8c23831bda081591f5a918e96855774ddedffc51e5b180f1971806d42fc333020b734aeb45adb0bc47325d0cea5f6713a786558022afc39d573892aa3635efbfd8bcb11c57f306c72146afe8b45388125cb7bf9ecf965a7ba4f768c77be366470dcdcf214b7f6a5a9460ed4fe44ae559d85e2fdc2094de83fff12ea8804db1215c4ca865871bdd7f8ef32ab799bf923ffb02c1ded7d129beadad46c5eda31ab1a6f43da05ea08bff7ffa88d8966353d01830558c39b930b01d175e437124d8edd0d2698fd8932f2b2c9b14746e52879c57a395538150f390264f00e60d470711202f4194499ff79037ca9885dc8d695f7d917a3086ca88e8f8d0243efee09302cf39e039eb7cc8dd19d28120d5fe533b5727cd39133181c729ca6f90a015ed30be7668d5cb5ecc33a53ee69bf7d1a5ecbdb153803743c6adaaabd36bf84e5be38d3f04a5d5dbfd67bdcd3b176e65bd1391ade775cc32ce43a847fb6c672a3fe97a5d4081c4986959ec5fb898f42a9397ba2b3ec2c1018f8d76d057f2366bd0e4465514ad6560c599664fb85621fe771e00f43d39b591b2a6a321100f4d1ef23a376d5ae3eeedbfe23da73dff0ee4d16b34ebddd8f5f053db9824105fc7300dbee7ea6af56b112319e3e215a0fc79ae946f6b5227453ec7fcaf17cf7651f71499a50d81221404d5f129ac50ea7528ff0e0069ec4ab8acb7919d81749ab37a870c5ef2cc5a15cf96709d3c65b4addc77e7416847160bcabb94ea36377e0ef71be80b5cc53effd5444888044a353574c72c924bba2a8b4e8354188ebfed +MAC = 69d28f73dd09d39a92aa179da354b7ea + +# Length 2049. +Key = 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f +Input = 248ac31085b6c2adaaa38259a0d7192c5c35d1bb4ef39ad94c38d1c82479e2dd2159a077024b0589bc8a20101b506f0a1ad0bbab76e83a83f1b94be6beae74e874cab692c5963a75436b776121ec9f62399a3e66b2d22707dae81933b6277f3c8516bcbe26dbbd86f373103d7cf4cad1888c952118fbfbd0d7b4bedc4ae4936aff91157e7aa47c54442ea78d6ac251d324a0fbe49d89cc3521b66d16e9c66a3709894e4eb0a4eedc4ae19468e66b81f271351b1d921ea551047abcc6b87a901fde7db79fa1818c11336dbc07244a40eb14cf77bde35e78ae9ad7d3f57ed7e7f23926c9172f82d77684ea5ed7d74ebc6f142b997036bcb7cce8df1bbc0d5b35a46509c954fc9469d214d6238f166cbf872156b4c41d7aac5942cffb175023078252a3f36e315c5d4ce0e39928a018252862becacef96a19f03bdcf46d75584299d1f8b03c0169e9e407d937145b5e5024139e7022a1978f114f24cdfa23780a119735c41da8fb759bbb3f025c6ec30e6c6e9bce8615be68e392fce59fd26a8e6a6cc5c606e3848116e4d01d29565a1facfb524b6d29643b826eee1e42869fc76df229dd79b39a2b1df28bb335c3a5f15a855d0121e4a6da34b5e4d5b7b5d5746a03ecff70811e1516fcec1bf7462e8876a2d21710aa168c78f45a6a15015950e221da85d3ec822ad6d0a6931b25a06b7bb5f3c10bb36cd4d647f9561982fde9818de5d4bf8db7f86c53b4ff14928ac15f79023b61861e73e44216540bb302153770da2533de9795252ab5fb77ad924c9338c8144c23d4c90dab9a18feac1a1574d4545e1435eb405e6c4c439fc724fce992ae85badf345bad16d85fbd338f04433703614754d0e7e54c4ccde2670587d52ecfb5a70a14a501bacc727722649931d8515b13d020a78e511fe136d45fbf97f9c7f689fcc677cfb3683723878350ffe9d08130cc6e567b6179e01b7eb2b3bbcf0873e1308eec018edeb8cce946338e15d5bf68c71916a83a99358039ef071e009546a2df936879dffbba397a93925d229a469fd17d71b7f524e03a30da6ee927542f8b369bed4734fe25dbd63d24ffd2a222f5f84f75d858ab989be925af570ad6d45bd28ce61b5139e1dd2f0b7795fe072e6e83acbb5e7b777a70c641e4cab2af40eed69abc334cd2703c3273204fac580c6a3d6680427e5f7d051e8380a53f93a180f4556ecea4530b9a2d5948dad63d415b6874f6b90e767d6d265be86351b53ba690780bb57c21b57418c5b97559e840c68257f839e7583a4bf7c7645c5987d40cc1ba79a218c35edfacdabe581d950e4bb7a481ebe64d61d00e75b1f25f1ce5f5462334a5b9038a697aa0937a3f8017e05d2c9c05dcb05c0b02508dea619b137f5444b6f088eb3cb2c66788f88afdfbba8faa1c490485624c88ae11e57347a676902e7553f056188493209bdbb30acc63c9e41e16a9d6c009416b520a76ba38f57628170c43626b5cb46179dc5bf65de865085f84bf741c223fbe474d2d19d8f43914fbd6586351089e73babf344f988b7963fe44528457d7aad3c564f6bcbd0d772a4c9fd328e6022d1c7c9f86726f8d5a23797d309c0f653ab1ac6
87833eb2700f156296062a8b377078f45f6b68c3d07cae1913ba8d5a6f9bf7525a3439eb932d4cefc4bf8e1b07b48ca13ece366cbc3e0388915915d1757475103a9e9454e7e6355de2d6acbf4710f9a63e4f6d3cd70c2d6fca88dd8a14448fdb63ce9350fdaafbe0b8bd1c5d307dae76dfed799aef2d8f23d5608d37d1330dd38b94860905dbeebf78d7b7318b7d42aed40d3f9899e9f420cbd92a6eeae3026f7725694e0e4bee016ba346fed2c21172bdb4a461cebe0cfe38e76645226ac127a259c193264d735ce8c8a57e17dd3f0579e2e86dc295ad1f45ba2d85db35044da61f7d401274b31eefbeb34e8d2ae596e9b4541aae117bdac5ed0b324c20539c27c07a411d5288b0b5f6fa16e9a7df85dc319fa6b71cd08a859c06a3f7b0289e1750adbf182f9750fea96fea5ab7aa3473340607cd7ed2c626f5382491c26d5d5bea61401dee7319c94d418f297e61ceac8f258ee8c23831bda081591f5a918e96855774ddedffc51e5b180f1971806d42fc333020b734aeb45adb0bc47325d0cea5f6713a786558022afc39d573892aa3635efbfd8bcb11c57f306c72146afe8b45388125cb7bf9ecf965a7ba4f768c77be366470dcdcf214b7f6a5a9460ed4fe44ae559d85e2fdc2094de83fff12ea8804db1215c4ca865871bdd7f8ef32ab799bf923ffb02c1ded7d129beadad46c5eda31ab1a6f43da05ea08bff7ffa88d8966353d01830558c39b930b01d175e437124d8edd0d2698fd8932f2b2c9b14746e52879c57a395538150f390264f00e60d470711202f4194499ff79037ca9885dc8d695f7d917a3086ca88e8f8d0243efee09302cf39e039eb7cc8dd19d28120d5fe533b5727cd39133181c729ca6f90a015ed30be7668d5cb5ecc33a53ee69bf7d1a5ecbdb153803743c6adaaabd36bf84e5be38d3f04a5d5dbfd67bdcd3b176e65bd1391ade775cc32ce43a847fb6c672a3fe97a5d4081c4986959ec5fb898f42a9397ba2b3ec2c1018f8d76d057f2366bd0e4465514ad6560c599664fb85621fe771e00f43d39b591b2a6a321100f4d1ef23a376d5ae3eeedbfe23da73dff0ee4d16b34ebddd8f5f053db9824105fc7300dbee7ea6af56b112319e3e215a0fc79ae946f6b5227453ec7fcaf17cf7651f71499a50d81221404d5f129ac50ea7528ff0e0069ec4ab8acb7919d81749ab37a870c5ef2cc5a15cf96709d3c65b4addc77e7416847160bcabb94ea36377e0ef71be80b5cc53effd5444888044a353574c72c924bba2a8b4e8354188ebfedc8 +MAC = d6a26654b88572e875d9661c83471c1b + +# Length 2050. 
+Key = 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f +Input = 248ac31085b6c2adaaa38259a0d7192c5c35d1bb4ef39ad94c38d1c82479e2dd2159a077024b0589bc8a20101b506f0a1ad0bbab76e83a83f1b94be6beae74e874cab692c5963a75436b776121ec9f62399a3e66b2d22707dae81933b6277f3c8516bcbe26dbbd86f373103d7cf4cad1888c952118fbfbd0d7b4bedc4ae4936aff91157e7aa47c54442ea78d6ac251d324a0fbe49d89cc3521b66d16e9c66a3709894e4eb0a4eedc4ae19468e66b81f271351b1d921ea551047abcc6b87a901fde7db79fa1818c11336dbc07244a40eb14cf77bde35e78ae9ad7d3f57ed7e7f23926c9172f82d77684ea5ed7d74ebc6f142b997036bcb7cce8df1bbc0d5b35a46509c954fc9469d214d6238f166cbf872156b4c41d7aac5942cffb175023078252a3f36e315c5d4ce0e39928a018252862becacef96a19f03bdcf46d75584299d1f8b03c0169e9e407d937145b5e5024139e7022a1978f114f24cdfa23780a119735c41da8fb759bbb3f025c6ec30e6c6e9bce8615be68e392fce59fd26a8e6a6cc5c606e3848116e4d01d29565a1facfb524b6d29643b826eee1e42869fc76df229dd79b39a2b1df28bb335c3a5f15a855d0121e4a6da34b5e4d5b7b5d5746a03ecff70811e1516fcec1bf7462e8876a2d21710aa168c78f45a6a15015950e221da85d3ec822ad6d0a6931b25a06b7bb5f3c10bb36cd4d647f9561982fde9818de5d4bf8db7f86c53b4ff14928ac15f79023b61861e73e44216540bb302153770da2533de9795252ab5fb77ad924c9338c8144c23d4c90dab9a18feac1a1574d4545e1435eb405e6c4c439fc724fce992ae85badf345bad16d85fbd338f04433703614754d0e7e54c4ccde2670587d52ecfb5a70a14a501bacc727722649931d8515b13d020a78e511fe136d45fbf97f9c7f689fcc677cfb3683723878350ffe9d08130cc6e567b6179e01b7eb2b3bbcf0873e1308eec018edeb8cce946338e15d5bf68c71916a83a99358039ef071e009546a2df936879dffbba397a93925d229a469fd17d71b7f524e03a30da6ee927542f8b369bed4734fe25dbd63d24ffd2a222f5f84f75d858ab989be925af570ad6d45bd28ce61b5139e1dd2f0b7795fe072e6e83acbb5e7b777a70c641e4cab2af40eed69abc334cd2703c3273204fac580c6a3d6680427e5f7d051e8380a53f93a180f4556ecea4530b9a2d5948dad63d415b6874f6b90e767d6d265be86351b53ba690780bb57c21b57418c5b97559e840c68257f839e7583a4bf7c7645c5987d40cc1ba79a218c35edfacdabe581d950e4bb7a481ebe64d61d00e75b1f25f1ce5f5462334a5b9038a697aa0937a3f8017e05d2c9c05dcb05c0b02508dea619b137f5444b6f088eb3cb2c66788f88afdfbba8faa1c490485624c88ae11e57347a676902e7553f056188493209bdbb30acc63c9e41e16a9d6c009416b520a76ba38f57628170c43626b5cb46179dc5bf65de865085f84bf741c223fbe474d2d19d8f43914fbd6586351089e73babf344f988b7963fe44528457d7aad3c564f6bcbd0d772a4c9fd328e6022d1c7c9f86726f8d5a23797d309c0f653ab1ac687833eb2700f156296062a8b377078f45f6b68c3d07cae1913ba8d5a6f9bf7525a3439eb932d4cefc4bf8e1b07b48ca13ece366cbc3e0388915915d1757475103a9e9454e7e6355de2d6acbf4710f9a63e4f6d3cd70c2d6fca88dd8a14448fdb63ce9350fdaafbe0b8bd1c5d307dae76dfed799aef2d8f23d5608d37d1330dd38b94860905dbeebf78d7b7318b7d42aed40d3f9899e9f420cbd92a6eeae3026f7725694e0e4bee016ba346fed2c21172bdb4a461cebe0cfe38e76645226ac127a259c193264d735ce8c8a57e17dd3f0579e2e86dc295ad1f45ba2d85db35044da61f7d401274b31eefbeb34e8d2ae596e9b4541aae117bdac5ed0b324c20539c27c07a411d5288b0b5f6fa16e9a7df85dc319fa6b71cd08a859c06a3f7b0289e1750adbf182f9750fea96fea5ab7aa3473340607cd7ed2c626f5382491c26d5d5bea61401dee7319c94d418f297e61ceac8f258ee8c23831bda081591f5a918e96855774ddedffc51e5b180f1971806d42fc333020b734aeb45adb0bc47325d0cea5f6713a786558022afc39d573892aa3635efbfd8bcb11c57f306c72146afe8b45388125cb7bf9ecf965a7ba4f768c77be366470dcdcf214b7f6a5a9460ed4fe44ae559d85e2fdc2094de83fff12ea8804db1215c4ca865871bdd7f8ef32ab799bf923ffb02c1ded7d129beadad46c5eda31ab1a6f43da05ea08bff7ffa88d8966353d01830558c39b930b01d175e437124d8edd0d2698fd8932f2b2c9b14746e52879c57a395538150f390264f00e60d470711202f4194499ff79037ca9885dc8d695f7d917a3086ca88e8f8d0243efee09302cf39e039eb
7cc8dd19d28120d5fe533b5727cd39133181c729ca6f90a015ed30be7668d5cb5ecc33a53ee69bf7d1a5ecbdb153803743c6adaaabd36bf84e5be38d3f04a5d5dbfd67bdcd3b176e65bd1391ade775cc32ce43a847fb6c672a3fe97a5d4081c4986959ec5fb898f42a9397ba2b3ec2c1018f8d76d057f2366bd0e4465514ad6560c599664fb85621fe771e00f43d39b591b2a6a321100f4d1ef23a376d5ae3eeedbfe23da73dff0ee4d16b34ebddd8f5f053db9824105fc7300dbee7ea6af56b112319e3e215a0fc79ae946f6b5227453ec7fcaf17cf7651f71499a50d81221404d5f129ac50ea7528ff0e0069ec4ab8acb7919d81749ab37a870c5ef2cc5a15cf96709d3c65b4addc77e7416847160bcabb94ea36377e0ef71be80b5cc53effd5444888044a353574c72c924bba2a8b4e8354188ebfedc852 +MAC = 9fbbb7f7adcd0cd5b46a4a520b22499a + +# Length 2051. +Key = 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f +Input = 248ac31085b6c2adaaa38259a0d7192c5c35d1bb4ef39ad94c38d1c82479e2dd2159a077024b0589bc8a20101b506f0a1ad0bbab76e83a83f1b94be6beae74e874cab692c5963a75436b776121ec9f62399a3e66b2d22707dae81933b6277f3c8516bcbe26dbbd86f373103d7cf4cad1888c952118fbfbd0d7b4bedc4ae4936aff91157e7aa47c54442ea78d6ac251d324a0fbe49d89cc3521b66d16e9c66a3709894e4eb0a4eedc4ae19468e66b81f271351b1d921ea551047abcc6b87a901fde7db79fa1818c11336dbc07244a40eb14cf77bde35e78ae9ad7d3f57ed7e7f23926c9172f82d77684ea5ed7d74ebc6f142b997036bcb7cce8df1bbc0d5b35a46509c954fc9469d214d6238f166cbf872156b4c41d7aac5942cffb175023078252a3f36e315c5d4ce0e39928a018252862becacef96a19f03bdcf46d75584299d1f8b03c0169e9e407d937145b5e5024139e7022a1978f114f24cdfa23780a119735c41da8fb759bbb3f025c6ec30e6c6e9bce8615be68e392fce59fd26a8e6a6cc5c606e3848116e4d01d29565a1facfb524b6d29643b826eee1e42869fc76df229dd79b39a2b1df28bb335c3a5f15a855d0121e4a6da34b5e4d5b7b5d5746a03ecff70811e1516fcec1bf7462e8876a2d21710aa168c78f45a6a15015950e221da85d3ec822ad6d0a6931b25a06b7bb5f3c10bb36cd4d647f9561982fde9818de5d4bf8db7f86c53b4ff14928ac15f79023b61861e73e44216540bb302153770da2533de9795252ab5fb77ad924c9338c8144c23d4c90dab9a18feac1a1574d4545e1435eb405e6c4c439fc724fce992ae85badf345bad16d85fbd338f04433703614754d0e7e54c4ccde2670587d52ecfb5a70a14a501bacc727722649931d8515b13d020a78e511fe136d45fbf97f9c7f689fcc677cfb3683723878350ffe9d08130cc6e567b6179e01b7eb2b3bbcf0873e1308eec018edeb8cce946338e15d5bf68c71916a83a99358039ef071e009546a2df936879dffbba397a93925d229a469fd17d71b7f524e03a30da6ee927542f8b369bed4734fe25dbd63d24ffd2a222f5f84f75d858ab989be925af570ad6d45bd28ce61b5139e1dd2f0b7795fe072e6e83acbb5e7b777a70c641e4cab2af40eed69abc334cd2703c3273204fac580c6a3d6680427e5f7d051e8380a53f93a180f4556ecea4530b9a2d5948dad63d415b6874f6b90e767d6d265be86351b53ba690780bb57c21b57418c5b97559e840c68257f839e7583a4bf7c7645c5987d40cc1ba79a218c35edfacdabe581d950e4bb7a481ebe64d61d00e75b1f25f1ce5f5462334a5b9038a697aa0937a3f8017e05d2c9c05dcb05c0b02508dea619b137f5444b6f088eb3cb2c66788f88afdfbba8faa1c490485624c88ae11e57347a676902e7553f056188493209bdbb30acc63c9e41e16a9d6c009416b520a76ba38f57628170c43626b5cb46179dc5bf65de865085f84bf741c223fbe474d2d19d8f43914fbd6586351089e73babf344f988b7963fe44528457d7aad3c564f6bcbd0d772a4c9fd328e6022d1c7c9f86726f8d5a23797d309c0f653ab1ac687833eb2700f156296062a8b377078f45f6b68c3d07cae1913ba8d5a6f9bf7525a3439eb932d4cefc4bf8e1b07b48ca13ece366cbc3e0388915915d1757475103a9e9454e7e6355de2d6acbf4710f9a63e4f6d3cd70c2d6fca88dd8a14448fdb63ce9350fdaafbe0b8bd1c5d307dae76dfed799aef2d8f23d5608d37d1330dd38b94860905dbeebf78d7b7318b7d42aed40d3f9899e9f420cbd92a6eeae3026f7725694e0e4bee016ba346fed2c21172bdb4a461cebe0cfe38e76645226ac127a259c193264d735ce8c8a57e17dd3f0579e2e86dc295ad1f45ba2d85db35044da61f7d401274b31eefbeb34e8d2ae596e9b4541aae117bdac5ed0b324c20539c27c07a
411d5288b0b5f6fa16e9a7df85dc319fa6b71cd08a859c06a3f7b0289e1750adbf182f9750fea96fea5ab7aa3473340607cd7ed2c626f5382491c26d5d5bea61401dee7319c94d418f297e61ceac8f258ee8c23831bda081591f5a918e96855774ddedffc51e5b180f1971806d42fc333020b734aeb45adb0bc47325d0cea5f6713a786558022afc39d573892aa3635efbfd8bcb11c57f306c72146afe8b45388125cb7bf9ecf965a7ba4f768c77be366470dcdcf214b7f6a5a9460ed4fe44ae559d85e2fdc2094de83fff12ea8804db1215c4ca865871bdd7f8ef32ab799bf923ffb02c1ded7d129beadad46c5eda31ab1a6f43da05ea08bff7ffa88d8966353d01830558c39b930b01d175e437124d8edd0d2698fd8932f2b2c9b14746e52879c57a395538150f390264f00e60d470711202f4194499ff79037ca9885dc8d695f7d917a3086ca88e8f8d0243efee09302cf39e039eb7cc8dd19d28120d5fe533b5727cd39133181c729ca6f90a015ed30be7668d5cb5ecc33a53ee69bf7d1a5ecbdb153803743c6adaaabd36bf84e5be38d3f04a5d5dbfd67bdcd3b176e65bd1391ade775cc32ce43a847fb6c672a3fe97a5d4081c4986959ec5fb898f42a9397ba2b3ec2c1018f8d76d057f2366bd0e4465514ad6560c599664fb85621fe771e00f43d39b591b2a6a321100f4d1ef23a376d5ae3eeedbfe23da73dff0ee4d16b34ebddd8f5f053db9824105fc7300dbee7ea6af56b112319e3e215a0fc79ae946f6b5227453ec7fcaf17cf7651f71499a50d81221404d5f129ac50ea7528ff0e0069ec4ab8acb7919d81749ab37a870c5ef2cc5a15cf96709d3c65b4addc77e7416847160bcabb94ea36377e0ef71be80b5cc53effd5444888044a353574c72c924bba2a8b4e8354188ebfedc852f5 +MAC = eb7cdceb97ade2a07622f8f5a4b1ce15 + +# Length 2052. +Key = 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f +Input = 248ac31085b6c2adaaa38259a0d7192c5c35d1bb4ef39ad94c38d1c82479e2dd2159a077024b0589bc8a20101b506f0a1ad0bbab76e83a83f1b94be6beae74e874cab692c5963a75436b776121ec9f62399a3e66b2d22707dae81933b6277f3c8516bcbe26dbbd86f373103d7cf4cad1888c952118fbfbd0d7b4bedc4ae4936aff91157e7aa47c54442ea78d6ac251d324a0fbe49d89cc3521b66d16e9c66a3709894e4eb0a4eedc4ae19468e66b81f271351b1d921ea551047abcc6b87a901fde7db79fa1818c11336dbc07244a40eb14cf77bde35e78ae9ad7d3f57ed7e7f23926c9172f82d77684ea5ed7d74ebc6f142b997036bcb7cce8df1bbc0d5b35a46509c954fc9469d214d6238f166cbf872156b4c41d7aac5942cffb175023078252a3f36e315c5d4ce0e39928a018252862becacef96a19f03bdcf46d75584299d1f8b03c0169e9e407d937145b5e5024139e7022a1978f114f24cdfa23780a119735c41da8fb759bbb3f025c6ec30e6c6e9bce8615be68e392fce59fd26a8e6a6cc5c606e3848116e4d01d29565a1facfb524b6d29643b826eee1e42869fc76df229dd79b39a2b1df28bb335c3a5f15a855d0121e4a6da34b5e4d5b7b5d5746a03ecff70811e1516fcec1bf7462e8876a2d21710aa168c78f45a6a15015950e221da85d3ec822ad6d0a6931b25a06b7bb5f3c10bb36cd4d647f9561982fde9818de5d4bf8db7f86c53b4ff14928ac15f79023b61861e73e44216540bb302153770da2533de9795252ab5fb77ad924c9338c8144c23d4c90dab9a18feac1a1574d4545e1435eb405e6c4c439fc724fce992ae85badf345bad16d85fbd338f04433703614754d0e7e54c4ccde2670587d52ecfb5a70a14a501bacc727722649931d8515b13d020a78e511fe136d45fbf97f9c7f689fcc677cfb3683723878350ffe9d08130cc6e567b6179e01b7eb2b3bbcf0873e1308eec018edeb8cce946338e15d5bf68c71916a83a99358039ef071e009546a2df936879dffbba397a93925d229a469fd17d71b7f524e03a30da6ee927542f8b369bed4734fe25dbd63d24ffd2a222f5f84f75d858ab989be925af570ad6d45bd28ce61b5139e1dd2f0b7795fe072e6e83acbb5e7b777a70c641e4cab2af40eed69abc334cd2703c3273204fac580c6a3d6680427e5f7d051e8380a53f93a180f4556ecea4530b9a2d5948dad63d415b6874f6b90e767d6d265be86351b53ba690780bb57c21b57418c5b97559e840c68257f839e7583a4bf7c7645c5987d40cc1ba79a218c35edfacdabe581d950e4bb7a481ebe64d61d00e75b1f25f1ce5f5462334a5b9038a697aa0937a3f8017e05d2c9c05dcb05c0b02508dea619b137f5444b6f088eb3cb2c66788f88afdfbba8faa1c490485624c88ae11e57347a676902e7553f056188493209bdbb30acc63c9e41e16a9d6c009416b520a76ba38f57628170c43626b
5cb46179dc5bf65de865085f84bf741c223fbe474d2d19d8f43914fbd6586351089e73babf344f988b7963fe44528457d7aad3c564f6bcbd0d772a4c9fd328e6022d1c7c9f86726f8d5a23797d309c0f653ab1ac687833eb2700f156296062a8b377078f45f6b68c3d07cae1913ba8d5a6f9bf7525a3439eb932d4cefc4bf8e1b07b48ca13ece366cbc3e0388915915d1757475103a9e9454e7e6355de2d6acbf4710f9a63e4f6d3cd70c2d6fca88dd8a14448fdb63ce9350fdaafbe0b8bd1c5d307dae76dfed799aef2d8f23d5608d37d1330dd38b94860905dbeebf78d7b7318b7d42aed40d3f9899e9f420cbd92a6eeae3026f7725694e0e4bee016ba346fed2c21172bdb4a461cebe0cfe38e76645226ac127a259c193264d735ce8c8a57e17dd3f0579e2e86dc295ad1f45ba2d85db35044da61f7d401274b31eefbeb34e8d2ae596e9b4541aae117bdac5ed0b324c20539c27c07a411d5288b0b5f6fa16e9a7df85dc319fa6b71cd08a859c06a3f7b0289e1750adbf182f9750fea96fea5ab7aa3473340607cd7ed2c626f5382491c26d5d5bea61401dee7319c94d418f297e61ceac8f258ee8c23831bda081591f5a918e96855774ddedffc51e5b180f1971806d42fc333020b734aeb45adb0bc47325d0cea5f6713a786558022afc39d573892aa3635efbfd8bcb11c57f306c72146afe8b45388125cb7bf9ecf965a7ba4f768c77be366470dcdcf214b7f6a5a9460ed4fe44ae559d85e2fdc2094de83fff12ea8804db1215c4ca865871bdd7f8ef32ab799bf923ffb02c1ded7d129beadad46c5eda31ab1a6f43da05ea08bff7ffa88d8966353d01830558c39b930b01d175e437124d8edd0d2698fd8932f2b2c9b14746e52879c57a395538150f390264f00e60d470711202f4194499ff79037ca9885dc8d695f7d917a3086ca88e8f8d0243efee09302cf39e039eb7cc8dd19d28120d5fe533b5727cd39133181c729ca6f90a015ed30be7668d5cb5ecc33a53ee69bf7d1a5ecbdb153803743c6adaaabd36bf84e5be38d3f04a5d5dbfd67bdcd3b176e65bd1391ade775cc32ce43a847fb6c672a3fe97a5d4081c4986959ec5fb898f42a9397ba2b3ec2c1018f8d76d057f2366bd0e4465514ad6560c599664fb85621fe771e00f43d39b591b2a6a321100f4d1ef23a376d5ae3eeedbfe23da73dff0ee4d16b34ebddd8f5f053db9824105fc7300dbee7ea6af56b112319e3e215a0fc79ae946f6b5227453ec7fcaf17cf7651f71499a50d81221404d5f129ac50ea7528ff0e0069ec4ab8acb7919d81749ab37a870c5ef2cc5a15cf96709d3c65b4addc77e7416847160bcabb94ea36377e0ef71be80b5cc53effd5444888044a353574c72c924bba2a8b4e8354188ebfedc852f590 +MAC = d41c310927cd92e14784ea78b85503db + +# Length 2053. 
+Key = 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f +Input = 248ac31085b6c2adaaa38259a0d7192c5c35d1bb4ef39ad94c38d1c82479e2dd2159a077024b0589bc8a20101b506f0a1ad0bbab76e83a83f1b94be6beae74e874cab692c5963a75436b776121ec9f62399a3e66b2d22707dae81933b6277f3c8516bcbe26dbbd86f373103d7cf4cad1888c952118fbfbd0d7b4bedc4ae4936aff91157e7aa47c54442ea78d6ac251d324a0fbe49d89cc3521b66d16e9c66a3709894e4eb0a4eedc4ae19468e66b81f271351b1d921ea551047abcc6b87a901fde7db79fa1818c11336dbc07244a40eb14cf77bde35e78ae9ad7d3f57ed7e7f23926c9172f82d77684ea5ed7d74ebc6f142b997036bcb7cce8df1bbc0d5b35a46509c954fc9469d214d6238f166cbf872156b4c41d7aac5942cffb175023078252a3f36e315c5d4ce0e39928a018252862becacef96a19f03bdcf46d75584299d1f8b03c0169e9e407d937145b5e5024139e7022a1978f114f24cdfa23780a119735c41da8fb759bbb3f025c6ec30e6c6e9bce8615be68e392fce59fd26a8e6a6cc5c606e3848116e4d01d29565a1facfb524b6d29643b826eee1e42869fc76df229dd79b39a2b1df28bb335c3a5f15a855d0121e4a6da34b5e4d5b7b5d5746a03ecff70811e1516fcec1bf7462e8876a2d21710aa168c78f45a6a15015950e221da85d3ec822ad6d0a6931b25a06b7bb5f3c10bb36cd4d647f9561982fde9818de5d4bf8db7f86c53b4ff14928ac15f79023b61861e73e44216540bb302153770da2533de9795252ab5fb77ad924c9338c8144c23d4c90dab9a18feac1a1574d4545e1435eb405e6c4c439fc724fce992ae85badf345bad16d85fbd338f04433703614754d0e7e54c4ccde2670587d52ecfb5a70a14a501bacc727722649931d8515b13d020a78e511fe136d45fbf97f9c7f689fcc677cfb3683723878350ffe9d08130cc6e567b6179e01b7eb2b3bbcf0873e1308eec018edeb8cce946338e15d5bf68c71916a83a99358039ef071e009546a2df936879dffbba397a93925d229a469fd17d71b7f524e03a30da6ee927542f8b369bed4734fe25dbd63d24ffd2a222f5f84f75d858ab989be925af570ad6d45bd28ce61b5139e1dd2f0b7795fe072e6e83acbb5e7b777a70c641e4cab2af40eed69abc334cd2703c3273204fac580c6a3d6680427e5f7d051e8380a53f93a180f4556ecea4530b9a2d5948dad63d415b6874f6b90e767d6d265be86351b53ba690780bb57c21b57418c5b97559e840c68257f839e7583a4bf7c7645c5987d40cc1ba79a218c35edfacdabe581d950e4bb7a481ebe64d61d00e75b1f25f1ce5f5462334a5b9038a697aa0937a3f8017e05d2c9c05dcb05c0b02508dea619b137f5444b6f088eb3cb2c66788f88afdfbba8faa1c490485624c88ae11e57347a676902e7553f056188493209bdbb30acc63c9e41e16a9d6c009416b520a76ba38f57628170c43626b5cb46179dc5bf65de865085f84bf741c223fbe474d2d19d8f43914fbd6586351089e73babf344f988b7963fe44528457d7aad3c564f6bcbd0d772a4c9fd328e6022d1c7c9f86726f8d5a23797d309c0f653ab1ac687833eb2700f156296062a8b377078f45f6b68c3d07cae1913ba8d5a6f9bf7525a3439eb932d4cefc4bf8e1b07b48ca13ece366cbc3e0388915915d1757475103a9e9454e7e6355de2d6acbf4710f9a63e4f6d3cd70c2d6fca88dd8a14448fdb63ce9350fdaafbe0b8bd1c5d307dae76dfed799aef2d8f23d5608d37d1330dd38b94860905dbeebf78d7b7318b7d42aed40d3f9899e9f420cbd92a6eeae3026f7725694e0e4bee016ba346fed2c21172bdb4a461cebe0cfe38e76645226ac127a259c193264d735ce8c8a57e17dd3f0579e2e86dc295ad1f45ba2d85db35044da61f7d401274b31eefbeb34e8d2ae596e9b4541aae117bdac5ed0b324c20539c27c07a411d5288b0b5f6fa16e9a7df85dc319fa6b71cd08a859c06a3f7b0289e1750adbf182f9750fea96fea5ab7aa3473340607cd7ed2c626f5382491c26d5d5bea61401dee7319c94d418f297e61ceac8f258ee8c23831bda081591f5a918e96855774ddedffc51e5b180f1971806d42fc333020b734aeb45adb0bc47325d0cea5f6713a786558022afc39d573892aa3635efbfd8bcb11c57f306c72146afe8b45388125cb7bf9ecf965a7ba4f768c77be366470dcdcf214b7f6a5a9460ed4fe44ae559d85e2fdc2094de83fff12ea8804db1215c4ca865871bdd7f8ef32ab799bf923ffb02c1ded7d129beadad46c5eda31ab1a6f43da05ea08bff7ffa88d8966353d01830558c39b930b01d175e437124d8edd0d2698fd8932f2b2c9b14746e52879c57a395538150f390264f00e60d470711202f4194499ff79037ca9885dc8d695f7d917a3086ca88e8f8d0243efee09302cf39e039eb
7cc8dd19d28120d5fe533b5727cd39133181c729ca6f90a015ed30be7668d5cb5ecc33a53ee69bf7d1a5ecbdb153803743c6adaaabd36bf84e5be38d3f04a5d5dbfd67bdcd3b176e65bd1391ade775cc32ce43a847fb6c672a3fe97a5d4081c4986959ec5fb898f42a9397ba2b3ec2c1018f8d76d057f2366bd0e4465514ad6560c599664fb85621fe771e00f43d39b591b2a6a321100f4d1ef23a376d5ae3eeedbfe23da73dff0ee4d16b34ebddd8f5f053db9824105fc7300dbee7ea6af56b112319e3e215a0fc79ae946f6b5227453ec7fcaf17cf7651f71499a50d81221404d5f129ac50ea7528ff0e0069ec4ab8acb7919d81749ab37a870c5ef2cc5a15cf96709d3c65b4addc77e7416847160bcabb94ea36377e0ef71be80b5cc53effd5444888044a353574c72c924bba2a8b4e8354188ebfedc852f59073 +MAC = 16af133c423f783a14c49d9f526384cf + +# Length 2054. +Key = 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f +Input = 248ac31085b6c2adaaa38259a0d7192c5c35d1bb4ef39ad94c38d1c82479e2dd2159a077024b0589bc8a20101b506f0a1ad0bbab76e83a83f1b94be6beae74e874cab692c5963a75436b776121ec9f62399a3e66b2d22707dae81933b6277f3c8516bcbe26dbbd86f373103d7cf4cad1888c952118fbfbd0d7b4bedc4ae4936aff91157e7aa47c54442ea78d6ac251d324a0fbe49d89cc3521b66d16e9c66a3709894e4eb0a4eedc4ae19468e66b81f271351b1d921ea551047abcc6b87a901fde7db79fa1818c11336dbc07244a40eb14cf77bde35e78ae9ad7d3f57ed7e7f23926c9172f82d77684ea5ed7d74ebc6f142b997036bcb7cce8df1bbc0d5b35a46509c954fc9469d214d6238f166cbf872156b4c41d7aac5942cffb175023078252a3f36e315c5d4ce0e39928a018252862becacef96a19f03bdcf46d75584299d1f8b03c0169e9e407d937145b5e5024139e7022a1978f114f24cdfa23780a119735c41da8fb759bbb3f025c6ec30e6c6e9bce8615be68e392fce59fd26a8e6a6cc5c606e3848116e4d01d29565a1facfb524b6d29643b826eee1e42869fc76df229dd79b39a2b1df28bb335c3a5f15a855d0121e4a6da34b5e4d5b7b5d5746a03ecff70811e1516fcec1bf7462e8876a2d21710aa168c78f45a6a15015950e221da85d3ec822ad6d0a6931b25a06b7bb5f3c10bb36cd4d647f9561982fde9818de5d4bf8db7f86c53b4ff14928ac15f79023b61861e73e44216540bb302153770da2533de9795252ab5fb77ad924c9338c8144c23d4c90dab9a18feac1a1574d4545e1435eb405e6c4c439fc724fce992ae85badf345bad16d85fbd338f04433703614754d0e7e54c4ccde2670587d52ecfb5a70a14a501bacc727722649931d8515b13d020a78e511fe136d45fbf97f9c7f689fcc677cfb3683723878350ffe9d08130cc6e567b6179e01b7eb2b3bbcf0873e1308eec018edeb8cce946338e15d5bf68c71916a83a99358039ef071e009546a2df936879dffbba397a93925d229a469fd17d71b7f524e03a30da6ee927542f8b369bed4734fe25dbd63d24ffd2a222f5f84f75d858ab989be925af570ad6d45bd28ce61b5139e1dd2f0b7795fe072e6e83acbb5e7b777a70c641e4cab2af40eed69abc334cd2703c3273204fac580c6a3d6680427e5f7d051e8380a53f93a180f4556ecea4530b9a2d5948dad63d415b6874f6b90e767d6d265be86351b53ba690780bb57c21b57418c5b97559e840c68257f839e7583a4bf7c7645c5987d40cc1ba79a218c35edfacdabe581d950e4bb7a481ebe64d61d00e75b1f25f1ce5f5462334a5b9038a697aa0937a3f8017e05d2c9c05dcb05c0b02508dea619b137f5444b6f088eb3cb2c66788f88afdfbba8faa1c490485624c88ae11e57347a676902e7553f056188493209bdbb30acc63c9e41e16a9d6c009416b520a76ba38f57628170c43626b5cb46179dc5bf65de865085f84bf741c223fbe474d2d19d8f43914fbd6586351089e73babf344f988b7963fe44528457d7aad3c564f6bcbd0d772a4c9fd328e6022d1c7c9f86726f8d5a23797d309c0f653ab1ac687833eb2700f156296062a8b377078f45f6b68c3d07cae1913ba8d5a6f9bf7525a3439eb932d4cefc4bf8e1b07b48ca13ece366cbc3e0388915915d1757475103a9e9454e7e6355de2d6acbf4710f9a63e4f6d3cd70c2d6fca88dd8a14448fdb63ce9350fdaafbe0b8bd1c5d307dae76dfed799aef2d8f23d5608d37d1330dd38b94860905dbeebf78d7b7318b7d42aed40d3f9899e9f420cbd92a6eeae3026f7725694e0e4bee016ba346fed2c21172bdb4a461cebe0cfe38e76645226ac127a259c193264d735ce8c8a57e17dd3f0579e2e86dc295ad1f45ba2d85db35044da61f7d401274b31eefbeb34e8d2ae596e9b4541aae117bdac5ed0b324c20539c
27c07a411d5288b0b5f6fa16e9a7df85dc319fa6b71cd08a859c06a3f7b0289e1750adbf182f9750fea96fea5ab7aa3473340607cd7ed2c626f5382491c26d5d5bea61401dee7319c94d418f297e61ceac8f258ee8c23831bda081591f5a918e96855774ddedffc51e5b180f1971806d42fc333020b734aeb45adb0bc47325d0cea5f6713a786558022afc39d573892aa3635efbfd8bcb11c57f306c72146afe8b45388125cb7bf9ecf965a7ba4f768c77be366470dcdcf214b7f6a5a9460ed4fe44ae559d85e2fdc2094de83fff12ea8804db1215c4ca865871bdd7f8ef32ab799bf923ffb02c1ded7d129beadad46c5eda31ab1a6f43da05ea08bff7ffa88d8966353d01830558c39b930b01d175e437124d8edd0d2698fd8932f2b2c9b14746e52879c57a395538150f390264f00e60d470711202f4194499ff79037ca9885dc8d695f7d917a3086ca88e8f8d0243efee09302cf39e039eb7cc8dd19d28120d5fe533b5727cd39133181c729ca6f90a015ed30be7668d5cb5ecc33a53ee69bf7d1a5ecbdb153803743c6adaaabd36bf84e5be38d3f04a5d5dbfd67bdcd3b176e65bd1391ade775cc32ce43a847fb6c672a3fe97a5d4081c4986959ec5fb898f42a9397ba2b3ec2c1018f8d76d057f2366bd0e4465514ad6560c599664fb85621fe771e00f43d39b591b2a6a321100f4d1ef23a376d5ae3eeedbfe23da73dff0ee4d16b34ebddd8f5f053db9824105fc7300dbee7ea6af56b112319e3e215a0fc79ae946f6b5227453ec7fcaf17cf7651f71499a50d81221404d5f129ac50ea7528ff0e0069ec4ab8acb7919d81749ab37a870c5ef2cc5a15cf96709d3c65b4addc77e7416847160bcabb94ea36377e0ef71be80b5cc53effd5444888044a353574c72c924bba2a8b4e8354188ebfedc852f59073f4 +MAC = 00c75db8f0636b22f195645b03091f5f + +# Length 2055. +Key = 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f +Input = 248ac31085b6c2adaaa38259a0d7192c5c35d1bb4ef39ad94c38d1c82479e2dd2159a077024b0589bc8a20101b506f0a1ad0bbab76e83a83f1b94be6beae74e874cab692c5963a75436b776121ec9f62399a3e66b2d22707dae81933b6277f3c8516bcbe26dbbd86f373103d7cf4cad1888c952118fbfbd0d7b4bedc4ae4936aff91157e7aa47c54442ea78d6ac251d324a0fbe49d89cc3521b66d16e9c66a3709894e4eb0a4eedc4ae19468e66b81f271351b1d921ea551047abcc6b87a901fde7db79fa1818c11336dbc07244a40eb14cf77bde35e78ae9ad7d3f57ed7e7f23926c9172f82d77684ea5ed7d74ebc6f142b997036bcb7cce8df1bbc0d5b35a46509c954fc9469d214d6238f166cbf872156b4c41d7aac5942cffb175023078252a3f36e315c5d4ce0e39928a018252862becacef96a19f03bdcf46d75584299d1f8b03c0169e9e407d937145b5e5024139e7022a1978f114f24cdfa23780a119735c41da8fb759bbb3f025c6ec30e6c6e9bce8615be68e392fce59fd26a8e6a6cc5c606e3848116e4d01d29565a1facfb524b6d29643b826eee1e42869fc76df229dd79b39a2b1df28bb335c3a5f15a855d0121e4a6da34b5e4d5b7b5d5746a03ecff70811e1516fcec1bf7462e8876a2d21710aa168c78f45a6a15015950e221da85d3ec822ad6d0a6931b25a06b7bb5f3c10bb36cd4d647f9561982fde9818de5d4bf8db7f86c53b4ff14928ac15f79023b61861e73e44216540bb302153770da2533de9795252ab5fb77ad924c9338c8144c23d4c90dab9a18feac1a1574d4545e1435eb405e6c4c439fc724fce992ae85badf345bad16d85fbd338f04433703614754d0e7e54c4ccde2670587d52ecfb5a70a14a501bacc727722649931d8515b13d020a78e511fe136d45fbf97f9c7f689fcc677cfb3683723878350ffe9d08130cc6e567b6179e01b7eb2b3bbcf0873e1308eec018edeb8cce946338e15d5bf68c71916a83a99358039ef071e009546a2df936879dffbba397a93925d229a469fd17d71b7f524e03a30da6ee927542f8b369bed4734fe25dbd63d24ffd2a222f5f84f75d858ab989be925af570ad6d45bd28ce61b5139e1dd2f0b7795fe072e6e83acbb5e7b777a70c641e4cab2af40eed69abc334cd2703c3273204fac580c6a3d6680427e5f7d051e8380a53f93a180f4556ecea4530b9a2d5948dad63d415b6874f6b90e767d6d265be86351b53ba690780bb57c21b57418c5b97559e840c68257f839e7583a4bf7c7645c5987d40cc1ba79a218c35edfacdabe581d950e4bb7a481ebe64d61d00e75b1f25f1ce5f5462334a5b9038a697aa0937a3f8017e05d2c9c05dcb05c0b02508dea619b137f5444b6f088eb3cb2c66788f88afdfbba8faa1c490485624c88ae11e57347a676902e7553f056188493209bdbb30acc63c9e41e16a9d6c009416b520a76ba38f576
28170c43626b5cb46179dc5bf65de865085f84bf741c223fbe474d2d19d8f43914fbd6586351089e73babf344f988b7963fe44528457d7aad3c564f6bcbd0d772a4c9fd328e6022d1c7c9f86726f8d5a23797d309c0f653ab1ac687833eb2700f156296062a8b377078f45f6b68c3d07cae1913ba8d5a6f9bf7525a3439eb932d4cefc4bf8e1b07b48ca13ece366cbc3e0388915915d1757475103a9e9454e7e6355de2d6acbf4710f9a63e4f6d3cd70c2d6fca88dd8a14448fdb63ce9350fdaafbe0b8bd1c5d307dae76dfed799aef2d8f23d5608d37d1330dd38b94860905dbeebf78d7b7318b7d42aed40d3f9899e9f420cbd92a6eeae3026f7725694e0e4bee016ba346fed2c21172bdb4a461cebe0cfe38e76645226ac127a259c193264d735ce8c8a57e17dd3f0579e2e86dc295ad1f45ba2d85db35044da61f7d401274b31eefbeb34e8d2ae596e9b4541aae117bdac5ed0b324c20539c27c07a411d5288b0b5f6fa16e9a7df85dc319fa6b71cd08a859c06a3f7b0289e1750adbf182f9750fea96fea5ab7aa3473340607cd7ed2c626f5382491c26d5d5bea61401dee7319c94d418f297e61ceac8f258ee8c23831bda081591f5a918e96855774ddedffc51e5b180f1971806d42fc333020b734aeb45adb0bc47325d0cea5f6713a786558022afc39d573892aa3635efbfd8bcb11c57f306c72146afe8b45388125cb7bf9ecf965a7ba4f768c77be366470dcdcf214b7f6a5a9460ed4fe44ae559d85e2fdc2094de83fff12ea8804db1215c4ca865871bdd7f8ef32ab799bf923ffb02c1ded7d129beadad46c5eda31ab1a6f43da05ea08bff7ffa88d8966353d01830558c39b930b01d175e437124d8edd0d2698fd8932f2b2c9b14746e52879c57a395538150f390264f00e60d470711202f4194499ff79037ca9885dc8d695f7d917a3086ca88e8f8d0243efee09302cf39e039eb7cc8dd19d28120d5fe533b5727cd39133181c729ca6f90a015ed30be7668d5cb5ecc33a53ee69bf7d1a5ecbdb153803743c6adaaabd36bf84e5be38d3f04a5d5dbfd67bdcd3b176e65bd1391ade775cc32ce43a847fb6c672a3fe97a5d4081c4986959ec5fb898f42a9397ba2b3ec2c1018f8d76d057f2366bd0e4465514ad6560c599664fb85621fe771e00f43d39b591b2a6a321100f4d1ef23a376d5ae3eeedbfe23da73dff0ee4d16b34ebddd8f5f053db9824105fc7300dbee7ea6af56b112319e3e215a0fc79ae946f6b5227453ec7fcaf17cf7651f71499a50d81221404d5f129ac50ea7528ff0e0069ec4ab8acb7919d81749ab37a870c5ef2cc5a15cf96709d3c65b4addc77e7416847160bcabb94ea36377e0ef71be80b5cc53effd5444888044a353574c72c924bba2a8b4e8354188ebfedc852f59073f434 +MAC = 4a532bc740f581555831345f3b75bf33 + +# Length 2056. 
+Key = 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f +Input = 248ac31085b6c2adaaa38259a0d7192c5c35d1bb4ef39ad94c38d1c82479e2dd2159a077024b0589bc8a20101b506f0a1ad0bbab76e83a83f1b94be6beae74e874cab692c5963a75436b776121ec9f62399a3e66b2d22707dae81933b6277f3c8516bcbe26dbbd86f373103d7cf4cad1888c952118fbfbd0d7b4bedc4ae4936aff91157e7aa47c54442ea78d6ac251d324a0fbe49d89cc3521b66d16e9c66a3709894e4eb0a4eedc4ae19468e66b81f271351b1d921ea551047abcc6b87a901fde7db79fa1818c11336dbc07244a40eb14cf77bde35e78ae9ad7d3f57ed7e7f23926c9172f82d77684ea5ed7d74ebc6f142b997036bcb7cce8df1bbc0d5b35a46509c954fc9469d214d6238f166cbf872156b4c41d7aac5942cffb175023078252a3f36e315c5d4ce0e39928a018252862becacef96a19f03bdcf46d75584299d1f8b03c0169e9e407d937145b5e5024139e7022a1978f114f24cdfa23780a119735c41da8fb759bbb3f025c6ec30e6c6e9bce8615be68e392fce59fd26a8e6a6cc5c606e3848116e4d01d29565a1facfb524b6d29643b826eee1e42869fc76df229dd79b39a2b1df28bb335c3a5f15a855d0121e4a6da34b5e4d5b7b5d5746a03ecff70811e1516fcec1bf7462e8876a2d21710aa168c78f45a6a15015950e221da85d3ec822ad6d0a6931b25a06b7bb5f3c10bb36cd4d647f9561982fde9818de5d4bf8db7f86c53b4ff14928ac15f79023b61861e73e44216540bb302153770da2533de9795252ab5fb77ad924c9338c8144c23d4c90dab9a18feac1a1574d4545e1435eb405e6c4c439fc724fce992ae85badf345bad16d85fbd338f04433703614754d0e7e54c4ccde2670587d52ecfb5a70a14a501bacc727722649931d8515b13d020a78e511fe136d45fbf97f9c7f689fcc677cfb3683723878350ffe9d08130cc6e567b6179e01b7eb2b3bbcf0873e1308eec018edeb8cce946338e15d5bf68c71916a83a99358039ef071e009546a2df936879dffbba397a93925d229a469fd17d71b7f524e03a30da6ee927542f8b369bed4734fe25dbd63d24ffd2a222f5f84f75d858ab989be925af570ad6d45bd28ce61b5139e1dd2f0b7795fe072e6e83acbb5e7b777a70c641e4cab2af40eed69abc334cd2703c3273204fac580c6a3d6680427e5f7d051e8380a53f93a180f4556ecea4530b9a2d5948dad63d415b6874f6b90e767d6d265be86351b53ba690780bb57c21b57418c5b97559e840c68257f839e7583a4bf7c7645c5987d40cc1ba79a218c35edfacdabe581d950e4bb7a481ebe64d61d00e75b1f25f1ce5f5462334a5b9038a697aa0937a3f8017e05d2c9c05dcb05c0b02508dea619b137f5444b6f088eb3cb2c66788f88afdfbba8faa1c490485624c88ae11e57347a676902e7553f056188493209bdbb30acc63c9e41e16a9d6c009416b520a76ba38f57628170c43626b5cb46179dc5bf65de865085f84bf741c223fbe474d2d19d8f43914fbd6586351089e73babf344f988b7963fe44528457d7aad3c564f6bcbd0d772a4c9fd328e6022d1c7c9f86726f8d5a23797d309c0f653ab1ac687833eb2700f156296062a8b377078f45f6b68c3d07cae1913ba8d5a6f9bf7525a3439eb932d4cefc4bf8e1b07b48ca13ece366cbc3e0388915915d1757475103a9e9454e7e6355de2d6acbf4710f9a63e4f6d3cd70c2d6fca88dd8a14448fdb63ce9350fdaafbe0b8bd1c5d307dae76dfed799aef2d8f23d5608d37d1330dd38b94860905dbeebf78d7b7318b7d42aed40d3f9899e9f420cbd92a6eeae3026f7725694e0e4bee016ba346fed2c21172bdb4a461cebe0cfe38e76645226ac127a259c193264d735ce8c8a57e17dd3f0579e2e86dc295ad1f45ba2d85db35044da61f7d401274b31eefbeb34e8d2ae596e9b4541aae117bdac5ed0b324c20539c27c07a411d5288b0b5f6fa16e9a7df85dc319fa6b71cd08a859c06a3f7b0289e1750adbf182f9750fea96fea5ab7aa3473340607cd7ed2c626f5382491c26d5d5bea61401dee7319c94d418f297e61ceac8f258ee8c23831bda081591f5a918e96855774ddedffc51e5b180f1971806d42fc333020b734aeb45adb0bc47325d0cea5f6713a786558022afc39d573892aa3635efbfd8bcb11c57f306c72146afe8b45388125cb7bf9ecf965a7ba4f768c77be366470dcdcf214b7f6a5a9460ed4fe44ae559d85e2fdc2094de83fff12ea8804db1215c4ca865871bdd7f8ef32ab799bf923ffb02c1ded7d129beadad46c5eda31ab1a6f43da05ea08bff7ffa88d8966353d01830558c39b930b01d175e437124d8edd0d2698fd8932f2b2c9b14746e52879c57a395538150f390264f00e60d470711202f4194499ff79037ca9885dc8d695f7d917a3086ca88e8f8d0243efee09302cf39e039eb
7cc8dd19d28120d5fe533b5727cd39133181c729ca6f90a015ed30be7668d5cb5ecc33a53ee69bf7d1a5ecbdb153803743c6adaaabd36bf84e5be38d3f04a5d5dbfd67bdcd3b176e65bd1391ade775cc32ce43a847fb6c672a3fe97a5d4081c4986959ec5fb898f42a9397ba2b3ec2c1018f8d76d057f2366bd0e4465514ad6560c599664fb85621fe771e00f43d39b591b2a6a321100f4d1ef23a376d5ae3eeedbfe23da73dff0ee4d16b34ebddd8f5f053db9824105fc7300dbee7ea6af56b112319e3e215a0fc79ae946f6b5227453ec7fcaf17cf7651f71499a50d81221404d5f129ac50ea7528ff0e0069ec4ab8acb7919d81749ab37a870c5ef2cc5a15cf96709d3c65b4addc77e7416847160bcabb94ea36377e0ef71be80b5cc53effd5444888044a353574c72c924bba2a8b4e8354188ebfedc852f59073f4347a +MAC = 698c7d32c5923871d124a2479e521706 + +# Length 2057. +Key = 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f +Input = 248ac31085b6c2adaaa38259a0d7192c5c35d1bb4ef39ad94c38d1c82479e2dd2159a077024b0589bc8a20101b506f0a1ad0bbab76e83a83f1b94be6beae74e874cab692c5963a75436b776121ec9f62399a3e66b2d22707dae81933b6277f3c8516bcbe26dbbd86f373103d7cf4cad1888c952118fbfbd0d7b4bedc4ae4936aff91157e7aa47c54442ea78d6ac251d324a0fbe49d89cc3521b66d16e9c66a3709894e4eb0a4eedc4ae19468e66b81f271351b1d921ea551047abcc6b87a901fde7db79fa1818c11336dbc07244a40eb14cf77bde35e78ae9ad7d3f57ed7e7f23926c9172f82d77684ea5ed7d74ebc6f142b997036bcb7cce8df1bbc0d5b35a46509c954fc9469d214d6238f166cbf872156b4c41d7aac5942cffb175023078252a3f36e315c5d4ce0e39928a018252862becacef96a19f03bdcf46d75584299d1f8b03c0169e9e407d937145b5e5024139e7022a1978f114f24cdfa23780a119735c41da8fb759bbb3f025c6ec30e6c6e9bce8615be68e392fce59fd26a8e6a6cc5c606e3848116e4d01d29565a1facfb524b6d29643b826eee1e42869fc76df229dd79b39a2b1df28bb335c3a5f15a855d0121e4a6da34b5e4d5b7b5d5746a03ecff70811e1516fcec1bf7462e8876a2d21710aa168c78f45a6a15015950e221da85d3ec822ad6d0a6931b25a06b7bb5f3c10bb36cd4d647f9561982fde9818de5d4bf8db7f86c53b4ff14928ac15f79023b61861e73e44216540bb302153770da2533de9795252ab5fb77ad924c9338c8144c23d4c90dab9a18feac1a1574d4545e1435eb405e6c4c439fc724fce992ae85badf345bad16d85fbd338f04433703614754d0e7e54c4ccde2670587d52ecfb5a70a14a501bacc727722649931d8515b13d020a78e511fe136d45fbf97f9c7f689fcc677cfb3683723878350ffe9d08130cc6e567b6179e01b7eb2b3bbcf0873e1308eec018edeb8cce946338e15d5bf68c71916a83a99358039ef071e009546a2df936879dffbba397a93925d229a469fd17d71b7f524e03a30da6ee927542f8b369bed4734fe25dbd63d24ffd2a222f5f84f75d858ab989be925af570ad6d45bd28ce61b5139e1dd2f0b7795fe072e6e83acbb5e7b777a70c641e4cab2af40eed69abc334cd2703c3273204fac580c6a3d6680427e5f7d051e8380a53f93a180f4556ecea4530b9a2d5948dad63d415b6874f6b90e767d6d265be86351b53ba690780bb57c21b57418c5b97559e840c68257f839e7583a4bf7c7645c5987d40cc1ba79a218c35edfacdabe581d950e4bb7a481ebe64d61d00e75b1f25f1ce5f5462334a5b9038a697aa0937a3f8017e05d2c9c05dcb05c0b02508dea619b137f5444b6f088eb3cb2c66788f88afdfbba8faa1c490485624c88ae11e57347a676902e7553f056188493209bdbb30acc63c9e41e16a9d6c009416b520a76ba38f57628170c43626b5cb46179dc5bf65de865085f84bf741c223fbe474d2d19d8f43914fbd6586351089e73babf344f988b7963fe44528457d7aad3c564f6bcbd0d772a4c9fd328e6022d1c7c9f86726f8d5a23797d309c0f653ab1ac687833eb2700f156296062a8b377078f45f6b68c3d07cae1913ba8d5a6f9bf7525a3439eb932d4cefc4bf8e1b07b48ca13ece366cbc3e0388915915d1757475103a9e9454e7e6355de2d6acbf4710f9a63e4f6d3cd70c2d6fca88dd8a14448fdb63ce9350fdaafbe0b8bd1c5d307dae76dfed799aef2d8f23d5608d37d1330dd38b94860905dbeebf78d7b7318b7d42aed40d3f9899e9f420cbd92a6eeae3026f7725694e0e4bee016ba346fed2c21172bdb4a461cebe0cfe38e76645226ac127a259c193264d735ce8c8a57e17dd3f0579e2e86dc295ad1f45ba2d85db35044da61f7d401274b31eefbeb34e8d2ae596e9b4541aae117bdac5ed0b324c
20539c27c07a411d5288b0b5f6fa16e9a7df85dc319fa6b71cd08a859c06a3f7b0289e1750adbf182f9750fea96fea5ab7aa3473340607cd7ed2c626f5382491c26d5d5bea61401dee7319c94d418f297e61ceac8f258ee8c23831bda081591f5a918e96855774ddedffc51e5b180f1971806d42fc333020b734aeb45adb0bc47325d0cea5f6713a786558022afc39d573892aa3635efbfd8bcb11c57f306c72146afe8b45388125cb7bf9ecf965a7ba4f768c77be366470dcdcf214b7f6a5a9460ed4fe44ae559d85e2fdc2094de83fff12ea8804db1215c4ca865871bdd7f8ef32ab799bf923ffb02c1ded7d129beadad46c5eda31ab1a6f43da05ea08bff7ffa88d8966353d01830558c39b930b01d175e437124d8edd0d2698fd8932f2b2c9b14746e52879c57a395538150f390264f00e60d470711202f4194499ff79037ca9885dc8d695f7d917a3086ca88e8f8d0243efee09302cf39e039eb7cc8dd19d28120d5fe533b5727cd39133181c729ca6f90a015ed30be7668d5cb5ecc33a53ee69bf7d1a5ecbdb153803743c6adaaabd36bf84e5be38d3f04a5d5dbfd67bdcd3b176e65bd1391ade775cc32ce43a847fb6c672a3fe97a5d4081c4986959ec5fb898f42a9397ba2b3ec2c1018f8d76d057f2366bd0e4465514ad6560c599664fb85621fe771e00f43d39b591b2a6a321100f4d1ef23a376d5ae3eeedbfe23da73dff0ee4d16b34ebddd8f5f053db9824105fc7300dbee7ea6af56b112319e3e215a0fc79ae946f6b5227453ec7fcaf17cf7651f71499a50d81221404d5f129ac50ea7528ff0e0069ec4ab8acb7919d81749ab37a870c5ef2cc5a15cf96709d3c65b4addc77e7416847160bcabb94ea36377e0ef71be80b5cc53effd5444888044a353574c72c924bba2a8b4e8354188ebfedc852f59073f4347a8c +MAC = a677187dbf3c927aeeafb9ebce0f61dc + +# Length 2058. +Key = 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f +Input = 248ac31085b6c2adaaa38259a0d7192c5c35d1bb4ef39ad94c38d1c82479e2dd2159a077024b0589bc8a20101b506f0a1ad0bbab76e83a83f1b94be6beae74e874cab692c5963a75436b776121ec9f62399a3e66b2d22707dae81933b6277f3c8516bcbe26dbbd86f373103d7cf4cad1888c952118fbfbd0d7b4bedc4ae4936aff91157e7aa47c54442ea78d6ac251d324a0fbe49d89cc3521b66d16e9c66a3709894e4eb0a4eedc4ae19468e66b81f271351b1d921ea551047abcc6b87a901fde7db79fa1818c11336dbc07244a40eb14cf77bde35e78ae9ad7d3f57ed7e7f23926c9172f82d77684ea5ed7d74ebc6f142b997036bcb7cce8df1bbc0d5b35a46509c954fc9469d214d6238f166cbf872156b4c41d7aac5942cffb175023078252a3f36e315c5d4ce0e39928a018252862becacef96a19f03bdcf46d75584299d1f8b03c0169e9e407d937145b5e5024139e7022a1978f114f24cdfa23780a119735c41da8fb759bbb3f025c6ec30e6c6e9bce8615be68e392fce59fd26a8e6a6cc5c606e3848116e4d01d29565a1facfb524b6d29643b826eee1e42869fc76df229dd79b39a2b1df28bb335c3a5f15a855d0121e4a6da34b5e4d5b7b5d5746a03ecff70811e1516fcec1bf7462e8876a2d21710aa168c78f45a6a15015950e221da85d3ec822ad6d0a6931b25a06b7bb5f3c10bb36cd4d647f9561982fde9818de5d4bf8db7f86c53b4ff14928ac15f79023b61861e73e44216540bb302153770da2533de9795252ab5fb77ad924c9338c8144c23d4c90dab9a18feac1a1574d4545e1435eb405e6c4c439fc724fce992ae85badf345bad16d85fbd338f04433703614754d0e7e54c4ccde2670587d52ecfb5a70a14a501bacc727722649931d8515b13d020a78e511fe136d45fbf97f9c7f689fcc677cfb3683723878350ffe9d08130cc6e567b6179e01b7eb2b3bbcf0873e1308eec018edeb8cce946338e15d5bf68c71916a83a99358039ef071e009546a2df936879dffbba397a93925d229a469fd17d71b7f524e03a30da6ee927542f8b369bed4734fe25dbd63d24ffd2a222f5f84f75d858ab989be925af570ad6d45bd28ce61b5139e1dd2f0b7795fe072e6e83acbb5e7b777a70c641e4cab2af40eed69abc334cd2703c3273204fac580c6a3d6680427e5f7d051e8380a53f93a180f4556ecea4530b9a2d5948dad63d415b6874f6b90e767d6d265be86351b53ba690780bb57c21b57418c5b97559e840c68257f839e7583a4bf7c7645c5987d40cc1ba79a218c35edfacdabe581d950e4bb7a481ebe64d61d00e75b1f25f1ce5f5462334a5b9038a697aa0937a3f8017e05d2c9c05dcb05c0b02508dea619b137f5444b6f088eb3cb2c66788f88afdfbba8faa1c490485624c88ae11e57347a676902e7553f056188493209bdbb30acc63c9e41e16a9d6c009416b52
0a76ba38f57628170c43626b5cb46179dc5bf65de865085f84bf741c223fbe474d2d19d8f43914fbd6586351089e73babf344f988b7963fe44528457d7aad3c564f6bcbd0d772a4c9fd328e6022d1c7c9f86726f8d5a23797d309c0f653ab1ac687833eb2700f156296062a8b377078f45f6b68c3d07cae1913ba8d5a6f9bf7525a3439eb932d4cefc4bf8e1b07b48ca13ece366cbc3e0388915915d1757475103a9e9454e7e6355de2d6acbf4710f9a63e4f6d3cd70c2d6fca88dd8a14448fdb63ce9350fdaafbe0b8bd1c5d307dae76dfed799aef2d8f23d5608d37d1330dd38b94860905dbeebf78d7b7318b7d42aed40d3f9899e9f420cbd92a6eeae3026f7725694e0e4bee016ba346fed2c21172bdb4a461cebe0cfe38e76645226ac127a259c193264d735ce8c8a57e17dd3f0579e2e86dc295ad1f45ba2d85db35044da61f7d401274b31eefbeb34e8d2ae596e9b4541aae117bdac5ed0b324c20539c27c07a411d5288b0b5f6fa16e9a7df85dc319fa6b71cd08a859c06a3f7b0289e1750adbf182f9750fea96fea5ab7aa3473340607cd7ed2c626f5382491c26d5d5bea61401dee7319c94d418f297e61ceac8f258ee8c23831bda081591f5a918e96855774ddedffc51e5b180f1971806d42fc333020b734aeb45adb0bc47325d0cea5f6713a786558022afc39d573892aa3635efbfd8bcb11c57f306c72146afe8b45388125cb7bf9ecf965a7ba4f768c77be366470dcdcf214b7f6a5a9460ed4fe44ae559d85e2fdc2094de83fff12ea8804db1215c4ca865871bdd7f8ef32ab799bf923ffb02c1ded7d129beadad46c5eda31ab1a6f43da05ea08bff7ffa88d8966353d01830558c39b930b01d175e437124d8edd0d2698fd8932f2b2c9b14746e52879c57a395538150f390264f00e60d470711202f4194499ff79037ca9885dc8d695f7d917a3086ca88e8f8d0243efee09302cf39e039eb7cc8dd19d28120d5fe533b5727cd39133181c729ca6f90a015ed30be7668d5cb5ecc33a53ee69bf7d1a5ecbdb153803743c6adaaabd36bf84e5be38d3f04a5d5dbfd67bdcd3b176e65bd1391ade775cc32ce43a847fb6c672a3fe97a5d4081c4986959ec5fb898f42a9397ba2b3ec2c1018f8d76d057f2366bd0e4465514ad6560c599664fb85621fe771e00f43d39b591b2a6a321100f4d1ef23a376d5ae3eeedbfe23da73dff0ee4d16b34ebddd8f5f053db9824105fc7300dbee7ea6af56b112319e3e215a0fc79ae946f6b5227453ec7fcaf17cf7651f71499a50d81221404d5f129ac50ea7528ff0e0069ec4ab8acb7919d81749ab37a870c5ef2cc5a15cf96709d3c65b4addc77e7416847160bcabb94ea36377e0ef71be80b5cc53effd5444888044a353574c72c924bba2a8b4e8354188ebfedc852f59073f4347a8c8a +MAC = 201fed7eee981b31d2cc42ff6c38141a + +# Length 2059. 
+Key = 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f +Input = 248ac31085b6c2adaaa38259a0d7192c5c35d1bb4ef39ad94c38d1c82479e2dd2159a077024b0589bc8a20101b506f0a1ad0bbab76e83a83f1b94be6beae74e874cab692c5963a75436b776121ec9f62399a3e66b2d22707dae81933b6277f3c8516bcbe26dbbd86f373103d7cf4cad1888c952118fbfbd0d7b4bedc4ae4936aff91157e7aa47c54442ea78d6ac251d324a0fbe49d89cc3521b66d16e9c66a3709894e4eb0a4eedc4ae19468e66b81f271351b1d921ea551047abcc6b87a901fde7db79fa1818c11336dbc07244a40eb14cf77bde35e78ae9ad7d3f57ed7e7f23926c9172f82d77684ea5ed7d74ebc6f142b997036bcb7cce8df1bbc0d5b35a46509c954fc9469d214d6238f166cbf872156b4c41d7aac5942cffb175023078252a3f36e315c5d4ce0e39928a018252862becacef96a19f03bdcf46d75584299d1f8b03c0169e9e407d937145b5e5024139e7022a1978f114f24cdfa23780a119735c41da8fb759bbb3f025c6ec30e6c6e9bce8615be68e392fce59fd26a8e6a6cc5c606e3848116e4d01d29565a1facfb524b6d29643b826eee1e42869fc76df229dd79b39a2b1df28bb335c3a5f15a855d0121e4a6da34b5e4d5b7b5d5746a03ecff70811e1516fcec1bf7462e8876a2d21710aa168c78f45a6a15015950e221da85d3ec822ad6d0a6931b25a06b7bb5f3c10bb36cd4d647f9561982fde9818de5d4bf8db7f86c53b4ff14928ac15f79023b61861e73e44216540bb302153770da2533de9795252ab5fb77ad924c9338c8144c23d4c90dab9a18feac1a1574d4545e1435eb405e6c4c439fc724fce992ae85badf345bad16d85fbd338f04433703614754d0e7e54c4ccde2670587d52ecfb5a70a14a501bacc727722649931d8515b13d020a78e511fe136d45fbf97f9c7f689fcc677cfb3683723878350ffe9d08130cc6e567b6179e01b7eb2b3bbcf0873e1308eec018edeb8cce946338e15d5bf68c71916a83a99358039ef071e009546a2df936879dffbba397a93925d229a469fd17d71b7f524e03a30da6ee927542f8b369bed4734fe25dbd63d24ffd2a222f5f84f75d858ab989be925af570ad6d45bd28ce61b5139e1dd2f0b7795fe072e6e83acbb5e7b777a70c641e4cab2af40eed69abc334cd2703c3273204fac580c6a3d6680427e5f7d051e8380a53f93a180f4556ecea4530b9a2d5948dad63d415b6874f6b90e767d6d265be86351b53ba690780bb57c21b57418c5b97559e840c68257f839e7583a4bf7c7645c5987d40cc1ba79a218c35edfacdabe581d950e4bb7a481ebe64d61d00e75b1f25f1ce5f5462334a5b9038a697aa0937a3f8017e05d2c9c05dcb05c0b02508dea619b137f5444b6f088eb3cb2c66788f88afdfbba8faa1c490485624c88ae11e57347a676902e7553f056188493209bdbb30acc63c9e41e16a9d6c009416b520a76ba38f57628170c43626b5cb46179dc5bf65de865085f84bf741c223fbe474d2d19d8f43914fbd6586351089e73babf344f988b7963fe44528457d7aad3c564f6bcbd0d772a4c9fd328e6022d1c7c9f86726f8d5a23797d309c0f653ab1ac687833eb2700f156296062a8b377078f45f6b68c3d07cae1913ba8d5a6f9bf7525a3439eb932d4cefc4bf8e1b07b48ca13ece366cbc3e0388915915d1757475103a9e9454e7e6355de2d6acbf4710f9a63e4f6d3cd70c2d6fca88dd8a14448fdb63ce9350fdaafbe0b8bd1c5d307dae76dfed799aef2d8f23d5608d37d1330dd38b94860905dbeebf78d7b7318b7d42aed40d3f9899e9f420cbd92a6eeae3026f7725694e0e4bee016ba346fed2c21172bdb4a461cebe0cfe38e76645226ac127a259c193264d735ce8c8a57e17dd3f0579e2e86dc295ad1f45ba2d85db35044da61f7d401274b31eefbeb34e8d2ae596e9b4541aae117bdac5ed0b324c20539c27c07a411d5288b0b5f6fa16e9a7df85dc319fa6b71cd08a859c06a3f7b0289e1750adbf182f9750fea96fea5ab7aa3473340607cd7ed2c626f5382491c26d5d5bea61401dee7319c94d418f297e61ceac8f258ee8c23831bda081591f5a918e96855774ddedffc51e5b180f1971806d42fc333020b734aeb45adb0bc47325d0cea5f6713a786558022afc39d573892aa3635efbfd8bcb11c57f306c72146afe8b45388125cb7bf9ecf965a7ba4f768c77be366470dcdcf214b7f6a5a9460ed4fe44ae559d85e2fdc2094de83fff12ea8804db1215c4ca865871bdd7f8ef32ab799bf923ffb02c1ded7d129beadad46c5eda31ab1a6f43da05ea08bff7ffa88d8966353d01830558c39b930b01d175e437124d8edd0d2698fd8932f2b2c9b14746e52879c57a395538150f390264f00e60d470711202f4194499ff79037ca9885dc8d695f7d917a3086ca88e8f8d0243efee09302cf39e039eb
7cc8dd19d28120d5fe533b5727cd39133181c729ca6f90a015ed30be7668d5cb5ecc33a53ee69bf7d1a5ecbdb153803743c6adaaabd36bf84e5be38d3f04a5d5dbfd67bdcd3b176e65bd1391ade775cc32ce43a847fb6c672a3fe97a5d4081c4986959ec5fb898f42a9397ba2b3ec2c1018f8d76d057f2366bd0e4465514ad6560c599664fb85621fe771e00f43d39b591b2a6a321100f4d1ef23a376d5ae3eeedbfe23da73dff0ee4d16b34ebddd8f5f053db9824105fc7300dbee7ea6af56b112319e3e215a0fc79ae946f6b5227453ec7fcaf17cf7651f71499a50d81221404d5f129ac50ea7528ff0e0069ec4ab8acb7919d81749ab37a870c5ef2cc5a15cf96709d3c65b4addc77e7416847160bcabb94ea36377e0ef71be80b5cc53effd5444888044a353574c72c924bba2a8b4e8354188ebfedc852f59073f4347a8c8a28 +MAC = 0c3d3d01a37f347c4f7c5826bcafb3e1 + +# Length 2060. +Key = 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f +Input = 248ac31085b6c2adaaa38259a0d7192c5c35d1bb4ef39ad94c38d1c82479e2dd2159a077024b0589bc8a20101b506f0a1ad0bbab76e83a83f1b94be6beae74e874cab692c5963a75436b776121ec9f62399a3e66b2d22707dae81933b6277f3c8516bcbe26dbbd86f373103d7cf4cad1888c952118fbfbd0d7b4bedc4ae4936aff91157e7aa47c54442ea78d6ac251d324a0fbe49d89cc3521b66d16e9c66a3709894e4eb0a4eedc4ae19468e66b81f271351b1d921ea551047abcc6b87a901fde7db79fa1818c11336dbc07244a40eb14cf77bde35e78ae9ad7d3f57ed7e7f23926c9172f82d77684ea5ed7d74ebc6f142b997036bcb7cce8df1bbc0d5b35a46509c954fc9469d214d6238f166cbf872156b4c41d7aac5942cffb175023078252a3f36e315c5d4ce0e39928a018252862becacef96a19f03bdcf46d75584299d1f8b03c0169e9e407d937145b5e5024139e7022a1978f114f24cdfa23780a119735c41da8fb759bbb3f025c6ec30e6c6e9bce8615be68e392fce59fd26a8e6a6cc5c606e3848116e4d01d29565a1facfb524b6d29643b826eee1e42869fc76df229dd79b39a2b1df28bb335c3a5f15a855d0121e4a6da34b5e4d5b7b5d5746a03ecff70811e1516fcec1bf7462e8876a2d21710aa168c78f45a6a15015950e221da85d3ec822ad6d0a6931b25a06b7bb5f3c10bb36cd4d647f9561982fde9818de5d4bf8db7f86c53b4ff14928ac15f79023b61861e73e44216540bb302153770da2533de9795252ab5fb77ad924c9338c8144c23d4c90dab9a18feac1a1574d4545e1435eb405e6c4c439fc724fce992ae85badf345bad16d85fbd338f04433703614754d0e7e54c4ccde2670587d52ecfb5a70a14a501bacc727722649931d8515b13d020a78e511fe136d45fbf97f9c7f689fcc677cfb3683723878350ffe9d08130cc6e567b6179e01b7eb2b3bbcf0873e1308eec018edeb8cce946338e15d5bf68c71916a83a99358039ef071e009546a2df936879dffbba397a93925d229a469fd17d71b7f524e03a30da6ee927542f8b369bed4734fe25dbd63d24ffd2a222f5f84f75d858ab989be925af570ad6d45bd28ce61b5139e1dd2f0b7795fe072e6e83acbb5e7b777a70c641e4cab2af40eed69abc334cd2703c3273204fac580c6a3d6680427e5f7d051e8380a53f93a180f4556ecea4530b9a2d5948dad63d415b6874f6b90e767d6d265be86351b53ba690780bb57c21b57418c5b97559e840c68257f839e7583a4bf7c7645c5987d40cc1ba79a218c35edfacdabe581d950e4bb7a481ebe64d61d00e75b1f25f1ce5f5462334a5b9038a697aa0937a3f8017e05d2c9c05dcb05c0b02508dea619b137f5444b6f088eb3cb2c66788f88afdfbba8faa1c490485624c88ae11e57347a676902e7553f056188493209bdbb30acc63c9e41e16a9d6c009416b520a76ba38f57628170c43626b5cb46179dc5bf65de865085f84bf741c223fbe474d2d19d8f43914fbd6586351089e73babf344f988b7963fe44528457d7aad3c564f6bcbd0d772a4c9fd328e6022d1c7c9f86726f8d5a23797d309c0f653ab1ac687833eb2700f156296062a8b377078f45f6b68c3d07cae1913ba8d5a6f9bf7525a3439eb932d4cefc4bf8e1b07b48ca13ece366cbc3e0388915915d1757475103a9e9454e7e6355de2d6acbf4710f9a63e4f6d3cd70c2d6fca88dd8a14448fdb63ce9350fdaafbe0b8bd1c5d307dae76dfed799aef2d8f23d5608d37d1330dd38b94860905dbeebf78d7b7318b7d42aed40d3f9899e9f420cbd92a6eeae3026f7725694e0e4bee016ba346fed2c21172bdb4a461cebe0cfe38e76645226ac127a259c193264d735ce8c8a57e17dd3f0579e2e86dc295ad1f45ba2d85db35044da61f7d401274b31eefbeb34e8d2ae596e9b4541aae117bdac5ed
0b324c20539c27c07a411d5288b0b5f6fa16e9a7df85dc319fa6b71cd08a859c06a3f7b0289e1750adbf182f9750fea96fea5ab7aa3473340607cd7ed2c626f5382491c26d5d5bea61401dee7319c94d418f297e61ceac8f258ee8c23831bda081591f5a918e96855774ddedffc51e5b180f1971806d42fc333020b734aeb45adb0bc47325d0cea5f6713a786558022afc39d573892aa3635efbfd8bcb11c57f306c72146afe8b45388125cb7bf9ecf965a7ba4f768c77be366470dcdcf214b7f6a5a9460ed4fe44ae559d85e2fdc2094de83fff12ea8804db1215c4ca865871bdd7f8ef32ab799bf923ffb02c1ded7d129beadad46c5eda31ab1a6f43da05ea08bff7ffa88d8966353d01830558c39b930b01d175e437124d8edd0d2698fd8932f2b2c9b14746e52879c57a395538150f390264f00e60d470711202f4194499ff79037ca9885dc8d695f7d917a3086ca88e8f8d0243efee09302cf39e039eb7cc8dd19d28120d5fe533b5727cd39133181c729ca6f90a015ed30be7668d5cb5ecc33a53ee69bf7d1a5ecbdb153803743c6adaaabd36bf84e5be38d3f04a5d5dbfd67bdcd3b176e65bd1391ade775cc32ce43a847fb6c672a3fe97a5d4081c4986959ec5fb898f42a9397ba2b3ec2c1018f8d76d057f2366bd0e4465514ad6560c599664fb85621fe771e00f43d39b591b2a6a321100f4d1ef23a376d5ae3eeedbfe23da73dff0ee4d16b34ebddd8f5f053db9824105fc7300dbee7ea6af56b112319e3e215a0fc79ae946f6b5227453ec7fcaf17cf7651f71499a50d81221404d5f129ac50ea7528ff0e0069ec4ab8acb7919d81749ab37a870c5ef2cc5a15cf96709d3c65b4addc77e7416847160bcabb94ea36377e0ef71be80b5cc53effd5444888044a353574c72c924bba2a8b4e8354188ebfedc852f59073f4347a8c8a28c9 +MAC = 33a4e0e0bed7c84c5cc5dd4784410f07 + +# Length 2061. +Key = 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f +Input = 248ac31085b6c2adaaa38259a0d7192c5c35d1bb4ef39ad94c38d1c82479e2dd2159a077024b0589bc8a20101b506f0a1ad0bbab76e83a83f1b94be6beae74e874cab692c5963a75436b776121ec9f62399a3e66b2d22707dae81933b6277f3c8516bcbe26dbbd86f373103d7cf4cad1888c952118fbfbd0d7b4bedc4ae4936aff91157e7aa47c54442ea78d6ac251d324a0fbe49d89cc3521b66d16e9c66a3709894e4eb0a4eedc4ae19468e66b81f271351b1d921ea551047abcc6b87a901fde7db79fa1818c11336dbc07244a40eb14cf77bde35e78ae9ad7d3f57ed7e7f23926c9172f82d77684ea5ed7d74ebc6f142b997036bcb7cce8df1bbc0d5b35a46509c954fc9469d214d6238f166cbf872156b4c41d7aac5942cffb175023078252a3f36e315c5d4ce0e39928a018252862becacef96a19f03bdcf46d75584299d1f8b03c0169e9e407d937145b5e5024139e7022a1978f114f24cdfa23780a119735c41da8fb759bbb3f025c6ec30e6c6e9bce8615be68e392fce59fd26a8e6a6cc5c606e3848116e4d01d29565a1facfb524b6d29643b826eee1e42869fc76df229dd79b39a2b1df28bb335c3a5f15a855d0121e4a6da34b5e4d5b7b5d5746a03ecff70811e1516fcec1bf7462e8876a2d21710aa168c78f45a6a15015950e221da85d3ec822ad6d0a6931b25a06b7bb5f3c10bb36cd4d647f9561982fde9818de5d4bf8db7f86c53b4ff14928ac15f79023b61861e73e44216540bb302153770da2533de9795252ab5fb77ad924c9338c8144c23d4c90dab9a18feac1a1574d4545e1435eb405e6c4c439fc724fce992ae85badf345bad16d85fbd338f04433703614754d0e7e54c4ccde2670587d52ecfb5a70a14a501bacc727722649931d8515b13d020a78e511fe136d45fbf97f9c7f689fcc677cfb3683723878350ffe9d08130cc6e567b6179e01b7eb2b3bbcf0873e1308eec018edeb8cce946338e15d5bf68c71916a83a99358039ef071e009546a2df936879dffbba397a93925d229a469fd17d71b7f524e03a30da6ee927542f8b369bed4734fe25dbd63d24ffd2a222f5f84f75d858ab989be925af570ad6d45bd28ce61b5139e1dd2f0b7795fe072e6e83acbb5e7b777a70c641e4cab2af40eed69abc334cd2703c3273204fac580c6a3d6680427e5f7d051e8380a53f93a180f4556ecea4530b9a2d5948dad63d415b6874f6b90e767d6d265be86351b53ba690780bb57c21b57418c5b97559e840c68257f839e7583a4bf7c7645c5987d40cc1ba79a218c35edfacdabe581d950e4bb7a481ebe64d61d00e75b1f25f1ce5f5462334a5b9038a697aa0937a3f8017e05d2c9c05dcb05c0b02508dea619b137f5444b6f088eb3cb2c66788f88afdfbba8faa1c490485624c88ae11e57347a676902e7553f056188493209bdbb30acc63c9e41e16a9
d6c009416b520a76ba38f57628170c43626b5cb46179dc5bf65de865085f84bf741c223fbe474d2d19d8f43914fbd6586351089e73babf344f988b7963fe44528457d7aad3c564f6bcbd0d772a4c9fd328e6022d1c7c9f86726f8d5a23797d309c0f653ab1ac687833eb2700f156296062a8b377078f45f6b68c3d07cae1913ba8d5a6f9bf7525a3439eb932d4cefc4bf8e1b07b48ca13ece366cbc3e0388915915d1757475103a9e9454e7e6355de2d6acbf4710f9a63e4f6d3cd70c2d6fca88dd8a14448fdb63ce9350fdaafbe0b8bd1c5d307dae76dfed799aef2d8f23d5608d37d1330dd38b94860905dbeebf78d7b7318b7d42aed40d3f9899e9f420cbd92a6eeae3026f7725694e0e4bee016ba346fed2c21172bdb4a461cebe0cfe38e76645226ac127a259c193264d735ce8c8a57e17dd3f0579e2e86dc295ad1f45ba2d85db35044da61f7d401274b31eefbeb34e8d2ae596e9b4541aae117bdac5ed0b324c20539c27c07a411d5288b0b5f6fa16e9a7df85dc319fa6b71cd08a859c06a3f7b0289e1750adbf182f9750fea96fea5ab7aa3473340607cd7ed2c626f5382491c26d5d5bea61401dee7319c94d418f297e61ceac8f258ee8c23831bda081591f5a918e96855774ddedffc51e5b180f1971806d42fc333020b734aeb45adb0bc47325d0cea5f6713a786558022afc39d573892aa3635efbfd8bcb11c57f306c72146afe8b45388125cb7bf9ecf965a7ba4f768c77be366470dcdcf214b7f6a5a9460ed4fe44ae559d85e2fdc2094de83fff12ea8804db1215c4ca865871bdd7f8ef32ab799bf923ffb02c1ded7d129beadad46c5eda31ab1a6f43da05ea08bff7ffa88d8966353d01830558c39b930b01d175e437124d8edd0d2698fd8932f2b2c9b14746e52879c57a395538150f390264f00e60d470711202f4194499ff79037ca9885dc8d695f7d917a3086ca88e8f8d0243efee09302cf39e039eb7cc8dd19d28120d5fe533b5727cd39133181c729ca6f90a015ed30be7668d5cb5ecc33a53ee69bf7d1a5ecbdb153803743c6adaaabd36bf84e5be38d3f04a5d5dbfd67bdcd3b176e65bd1391ade775cc32ce43a847fb6c672a3fe97a5d4081c4986959ec5fb898f42a9397ba2b3ec2c1018f8d76d057f2366bd0e4465514ad6560c599664fb85621fe771e00f43d39b591b2a6a321100f4d1ef23a376d5ae3eeedbfe23da73dff0ee4d16b34ebddd8f5f053db9824105fc7300dbee7ea6af56b112319e3e215a0fc79ae946f6b5227453ec7fcaf17cf7651f71499a50d81221404d5f129ac50ea7528ff0e0069ec4ab8acb7919d81749ab37a870c5ef2cc5a15cf96709d3c65b4addc77e7416847160bcabb94ea36377e0ef71be80b5cc53effd5444888044a353574c72c924bba2a8b4e8354188ebfedc852f59073f4347a8c8a28c99e +MAC = 8e41c40a2f8ec58fe594f3a3a2de4ae1 + +# Length 2062. 
+Key = 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f +Input = 248ac31085b6c2adaaa38259a0d7192c5c35d1bb4ef39ad94c38d1c82479e2dd2159a077024b0589bc8a20101b506f0a1ad0bbab76e83a83f1b94be6beae74e874cab692c5963a75436b776121ec9f62399a3e66b2d22707dae81933b6277f3c8516bcbe26dbbd86f373103d7cf4cad1888c952118fbfbd0d7b4bedc4ae4936aff91157e7aa47c54442ea78d6ac251d324a0fbe49d89cc3521b66d16e9c66a3709894e4eb0a4eedc4ae19468e66b81f271351b1d921ea551047abcc6b87a901fde7db79fa1818c11336dbc07244a40eb14cf77bde35e78ae9ad7d3f57ed7e7f23926c9172f82d77684ea5ed7d74ebc6f142b997036bcb7cce8df1bbc0d5b35a46509c954fc9469d214d6238f166cbf872156b4c41d7aac5942cffb175023078252a3f36e315c5d4ce0e39928a018252862becacef96a19f03bdcf46d75584299d1f8b03c0169e9e407d937145b5e5024139e7022a1978f114f24cdfa23780a119735c41da8fb759bbb3f025c6ec30e6c6e9bce8615be68e392fce59fd26a8e6a6cc5c606e3848116e4d01d29565a1facfb524b6d29643b826eee1e42869fc76df229dd79b39a2b1df28bb335c3a5f15a855d0121e4a6da34b5e4d5b7b5d5746a03ecff70811e1516fcec1bf7462e8876a2d21710aa168c78f45a6a15015950e221da85d3ec822ad6d0a6931b25a06b7bb5f3c10bb36cd4d647f9561982fde9818de5d4bf8db7f86c53b4ff14928ac15f79023b61861e73e44216540bb302153770da2533de9795252ab5fb77ad924c9338c8144c23d4c90dab9a18feac1a1574d4545e1435eb405e6c4c439fc724fce992ae85badf345bad16d85fbd338f04433703614754d0e7e54c4ccde2670587d52ecfb5a70a14a501bacc727722649931d8515b13d020a78e511fe136d45fbf97f9c7f689fcc677cfb3683723878350ffe9d08130cc6e567b6179e01b7eb2b3bbcf0873e1308eec018edeb8cce946338e15d5bf68c71916a83a99358039ef071e009546a2df936879dffbba397a93925d229a469fd17d71b7f524e03a30da6ee927542f8b369bed4734fe25dbd63d24ffd2a222f5f84f75d858ab989be925af570ad6d45bd28ce61b5139e1dd2f0b7795fe072e6e83acbb5e7b777a70c641e4cab2af40eed69abc334cd2703c3273204fac580c6a3d6680427e5f7d051e8380a53f93a180f4556ecea4530b9a2d5948dad63d415b6874f6b90e767d6d265be86351b53ba690780bb57c21b57418c5b97559e840c68257f839e7583a4bf7c7645c5987d40cc1ba79a218c35edfacdabe581d950e4bb7a481ebe64d61d00e75b1f25f1ce5f5462334a5b9038a697aa0937a3f8017e05d2c9c05dcb05c0b02508dea619b137f5444b6f088eb3cb2c66788f88afdfbba8faa1c490485624c88ae11e57347a676902e7553f056188493209bdbb30acc63c9e41e16a9d6c009416b520a76ba38f57628170c43626b5cb46179dc5bf65de865085f84bf741c223fbe474d2d19d8f43914fbd6586351089e73babf344f988b7963fe44528457d7aad3c564f6bcbd0d772a4c9fd328e6022d1c7c9f86726f8d5a23797d309c0f653ab1ac687833eb2700f156296062a8b377078f45f6b68c3d07cae1913ba8d5a6f9bf7525a3439eb932d4cefc4bf8e1b07b48ca13ece366cbc3e0388915915d1757475103a9e9454e7e6355de2d6acbf4710f9a63e4f6d3cd70c2d6fca88dd8a14448fdb63ce9350fdaafbe0b8bd1c5d307dae76dfed799aef2d8f23d5608d37d1330dd38b94860905dbeebf78d7b7318b7d42aed40d3f9899e9f420cbd92a6eeae3026f7725694e0e4bee016ba346fed2c21172bdb4a461cebe0cfe38e76645226ac127a259c193264d735ce8c8a57e17dd3f0579e2e86dc295ad1f45ba2d85db35044da61f7d401274b31eefbeb34e8d2ae596e9b4541aae117bdac5ed0b324c20539c27c07a411d5288b0b5f6fa16e9a7df85dc319fa6b71cd08a859c06a3f7b0289e1750adbf182f9750fea96fea5ab7aa3473340607cd7ed2c626f5382491c26d5d5bea61401dee7319c94d418f297e61ceac8f258ee8c23831bda081591f5a918e96855774ddedffc51e5b180f1971806d42fc333020b734aeb45adb0bc47325d0cea5f6713a786558022afc39d573892aa3635efbfd8bcb11c57f306c72146afe8b45388125cb7bf9ecf965a7ba4f768c77be366470dcdcf214b7f6a5a9460ed4fe44ae559d85e2fdc2094de83fff12ea8804db1215c4ca865871bdd7f8ef32ab799bf923ffb02c1ded7d129beadad46c5eda31ab1a6f43da05ea08bff7ffa88d8966353d01830558c39b930b01d175e437124d8edd0d2698fd8932f2b2c9b14746e52879c57a395538150f390264f00e60d470711202f4194499ff79037ca9885dc8d695f7d917a3086ca88e8f8d0243efee09302cf39e039eb
7cc8dd19d28120d5fe533b5727cd39133181c729ca6f90a015ed30be7668d5cb5ecc33a53ee69bf7d1a5ecbdb153803743c6adaaabd36bf84e5be38d3f04a5d5dbfd67bdcd3b176e65bd1391ade775cc32ce43a847fb6c672a3fe97a5d4081c4986959ec5fb898f42a9397ba2b3ec2c1018f8d76d057f2366bd0e4465514ad6560c599664fb85621fe771e00f43d39b591b2a6a321100f4d1ef23a376d5ae3eeedbfe23da73dff0ee4d16b34ebddd8f5f053db9824105fc7300dbee7ea6af56b112319e3e215a0fc79ae946f6b5227453ec7fcaf17cf7651f71499a50d81221404d5f129ac50ea7528ff0e0069ec4ab8acb7919d81749ab37a870c5ef2cc5a15cf96709d3c65b4addc77e7416847160bcabb94ea36377e0ef71be80b5cc53effd5444888044a353574c72c924bba2a8b4e8354188ebfedc852f59073f4347a8c8a28c99e21 +MAC = c6e5d1810fd878ac6b844c66cef36a22 + +# Length 2063. +Key = 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f +Input = 248ac31085b6c2adaaa38259a0d7192c5c35d1bb4ef39ad94c38d1c82479e2dd2159a077024b0589bc8a20101b506f0a1ad0bbab76e83a83f1b94be6beae74e874cab692c5963a75436b776121ec9f62399a3e66b2d22707dae81933b6277f3c8516bcbe26dbbd86f373103d7cf4cad1888c952118fbfbd0d7b4bedc4ae4936aff91157e7aa47c54442ea78d6ac251d324a0fbe49d89cc3521b66d16e9c66a3709894e4eb0a4eedc4ae19468e66b81f271351b1d921ea551047abcc6b87a901fde7db79fa1818c11336dbc07244a40eb14cf77bde35e78ae9ad7d3f57ed7e7f23926c9172f82d77684ea5ed7d74ebc6f142b997036bcb7cce8df1bbc0d5b35a46509c954fc9469d214d6238f166cbf872156b4c41d7aac5942cffb175023078252a3f36e315c5d4ce0e39928a018252862becacef96a19f03bdcf46d75584299d1f8b03c0169e9e407d937145b5e5024139e7022a1978f114f24cdfa23780a119735c41da8fb759bbb3f025c6ec30e6c6e9bce8615be68e392fce59fd26a8e6a6cc5c606e3848116e4d01d29565a1facfb524b6d29643b826eee1e42869fc76df229dd79b39a2b1df28bb335c3a5f15a855d0121e4a6da34b5e4d5b7b5d5746a03ecff70811e1516fcec1bf7462e8876a2d21710aa168c78f45a6a15015950e221da85d3ec822ad6d0a6931b25a06b7bb5f3c10bb36cd4d647f9561982fde9818de5d4bf8db7f86c53b4ff14928ac15f79023b61861e73e44216540bb302153770da2533de9795252ab5fb77ad924c9338c8144c23d4c90dab9a18feac1a1574d4545e1435eb405e6c4c439fc724fce992ae85badf345bad16d85fbd338f04433703614754d0e7e54c4ccde2670587d52ecfb5a70a14a501bacc727722649931d8515b13d020a78e511fe136d45fbf97f9c7f689fcc677cfb3683723878350ffe9d08130cc6e567b6179e01b7eb2b3bbcf0873e1308eec018edeb8cce946338e15d5bf68c71916a83a99358039ef071e009546a2df936879dffbba397a93925d229a469fd17d71b7f524e03a30da6ee927542f8b369bed4734fe25dbd63d24ffd2a222f5f84f75d858ab989be925af570ad6d45bd28ce61b5139e1dd2f0b7795fe072e6e83acbb5e7b777a70c641e4cab2af40eed69abc334cd2703c3273204fac580c6a3d6680427e5f7d051e8380a53f93a180f4556ecea4530b9a2d5948dad63d415b6874f6b90e767d6d265be86351b53ba690780bb57c21b57418c5b97559e840c68257f839e7583a4bf7c7645c5987d40cc1ba79a218c35edfacdabe581d950e4bb7a481ebe64d61d00e75b1f25f1ce5f5462334a5b9038a697aa0937a3f8017e05d2c9c05dcb05c0b02508dea619b137f5444b6f088eb3cb2c66788f88afdfbba8faa1c490485624c88ae11e57347a676902e7553f056188493209bdbb30acc63c9e41e16a9d6c009416b520a76ba38f57628170c43626b5cb46179dc5bf65de865085f84bf741c223fbe474d2d19d8f43914fbd6586351089e73babf344f988b7963fe44528457d7aad3c564f6bcbd0d772a4c9fd328e6022d1c7c9f86726f8d5a23797d309c0f653ab1ac687833eb2700f156296062a8b377078f45f6b68c3d07cae1913ba8d5a6f9bf7525a3439eb932d4cefc4bf8e1b07b48ca13ece366cbc3e0388915915d1757475103a9e9454e7e6355de2d6acbf4710f9a63e4f6d3cd70c2d6fca88dd8a14448fdb63ce9350fdaafbe0b8bd1c5d307dae76dfed799aef2d8f23d5608d37d1330dd38b94860905dbeebf78d7b7318b7d42aed40d3f9899e9f420cbd92a6eeae3026f7725694e0e4bee016ba346fed2c21172bdb4a461cebe0cfe38e76645226ac127a259c193264d735ce8c8a57e17dd3f0579e2e86dc295ad1f45ba2d85db35044da61f7d401274b31eefbeb34e8d2ae596e9b4541aae117b
dac5ed0b324c20539c27c07a411d5288b0b5f6fa16e9a7df85dc319fa6b71cd08a859c06a3f7b0289e1750adbf182f9750fea96fea5ab7aa3473340607cd7ed2c626f5382491c26d5d5bea61401dee7319c94d418f297e61ceac8f258ee8c23831bda081591f5a918e96855774ddedffc51e5b180f1971806d42fc333020b734aeb45adb0bc47325d0cea5f6713a786558022afc39d573892aa3635efbfd8bcb11c57f306c72146afe8b45388125cb7bf9ecf965a7ba4f768c77be366470dcdcf214b7f6a5a9460ed4fe44ae559d85e2fdc2094de83fff12ea8804db1215c4ca865871bdd7f8ef32ab799bf923ffb02c1ded7d129beadad46c5eda31ab1a6f43da05ea08bff7ffa88d8966353d01830558c39b930b01d175e437124d8edd0d2698fd8932f2b2c9b14746e52879c57a395538150f390264f00e60d470711202f4194499ff79037ca9885dc8d695f7d917a3086ca88e8f8d0243efee09302cf39e039eb7cc8dd19d28120d5fe533b5727cd39133181c729ca6f90a015ed30be7668d5cb5ecc33a53ee69bf7d1a5ecbdb153803743c6adaaabd36bf84e5be38d3f04a5d5dbfd67bdcd3b176e65bd1391ade775cc32ce43a847fb6c672a3fe97a5d4081c4986959ec5fb898f42a9397ba2b3ec2c1018f8d76d057f2366bd0e4465514ad6560c599664fb85621fe771e00f43d39b591b2a6a321100f4d1ef23a376d5ae3eeedbfe23da73dff0ee4d16b34ebddd8f5f053db9824105fc7300dbee7ea6af56b112319e3e215a0fc79ae946f6b5227453ec7fcaf17cf7651f71499a50d81221404d5f129ac50ea7528ff0e0069ec4ab8acb7919d81749ab37a870c5ef2cc5a15cf96709d3c65b4addc77e7416847160bcabb94ea36377e0ef71be80b5cc53effd5444888044a353574c72c924bba2a8b4e8354188ebfedc852f59073f4347a8c8a28c99e21df +MAC = f6eaae369c3cb5c05748e8d919178e00 + +# Regression test for https://rt.openssl.org/Ticket/Display.html?id=4439 +Key = 2d773be37adb1e4d683bf0075e79c4ee037918535a7f99ccb7040fb5f5f43aea +Input = 89dab80b7717c1db5db437860a3f70218e93e1b8f461fb677f16f35f6f87e2a91c99bc3a47ace47640cc95c345be5ecca5a3523c35cc01893af0b64a620334270372ec12482d1b1e363561698a578b359803495bb4e2ef1930b17a5190b580f141300df30adbeca28f6427a8bc1a999fd51c554a017d095d8c3e3127daf9f595 +MAC = c85d15ed44c378d6b00e23064c7bcd51 + +# Regression tests for https://rt.openssl.org/Ticket/Display.html?id=4483 + +Key = 7f1b02640000000000000000000000000000000000000000cccccccccccccccc +Input = cccccccccccccccccccccccccccccccccccccccccccccccccc80ccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccceccccccccccccccccccccccccccccccccccccc5cccccccccccccccccccccccccccccccccccccccccce3ccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccaccccccccccccccccccccce6cccccccccc000000afccccccccccccccccccfffffff5000000000000000000000000000000000000000000000000000000ffffffe70000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000719205a8521dfc +MAC = 8559b876eceed66eb37798c0457baff9 + +Key = e00016000000000000000000000000000000aaaaaaaaaaaaaaaaaaaaaaaaaaaa +Input = aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa000000000000000000800264 +MAC = 00bd1258978e205444c9aaaa82006fed + +Key = 0c0c0c0c0c0c0c0c0c0c0c0c0c0c0c0c0c0c0c0c0c0c0c0c0c0c0c0c0c0c0c0c +Input = 02fc +MAC = 06120c0c0c0c0c0c0c0c0c0c0c0c0c0c + +Key = 00ff000000000000000000000000000000000000001e00000000000000007b7b +Input = 
7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7a7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b5c7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b6e7b007b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7a7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b5c7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b6e7b001300000000b300000000000000000000000000000000000000000000f20000000000000000000000000000000000002000efff0009000000000000000000000000100000000009000000640000000000000000000000001300000000b300000000000000000000000000000000000000000000f20000000000000000000000000000000000002000efff00090000000000000000007a000010000000000900000064000000000000000000000000000000000000000000000000fc +MAC = 33205bbf9e9f8f7212ab9e2ab9b7e4a5 diff --git a/ring-0.17.14/src/aead/quic.rs b/ring-0.17.14/src/aead/quic.rs new file mode 100644 index 0000000000..a0cbe08d7a --- /dev/null +++ b/ring-0.17.14/src/aead/quic.rs @@ -0,0 +1,187 @@ +// Copyright 2018 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +//! QUIC Header Protection. +//! +//! See draft-ietf-quic-tls. + +use crate::{ + aead::{aes, chacha}, + cpu, error, hkdf, +}; + +/// A key for generating QUIC Header Protection masks. +pub struct HeaderProtectionKey { + inner: KeyInner, + algorithm: &'static Algorithm, +} + +#[allow(clippy::large_enum_variant, variant_size_differences)] +enum KeyInner { + Aes(aes::Key), + ChaCha20(chacha::Key), +} + +impl From> for HeaderProtectionKey { + fn from(okm: hkdf::Okm<&'static Algorithm>) -> Self { + let mut key_bytes = [0; super::MAX_KEY_LEN]; + let algorithm = *okm.len(); + let key_bytes = &mut key_bytes[..algorithm.key_len()]; + okm.fill(key_bytes).unwrap(); + Self::new(algorithm, key_bytes).unwrap() + } +} + +impl HeaderProtectionKey { + /// Create a new header protection key. + /// + /// `key_bytes` must be exactly `algorithm.key_len` bytes long. + pub fn new( + algorithm: &'static Algorithm, + key_bytes: &[u8], + ) -> Result { + Ok(Self { + inner: (algorithm.init)(key_bytes, cpu::features())?, + algorithm, + }) + } + + /// Generate a new QUIC Header Protection mask. + /// + /// `sample` must be exactly `self.algorithm().sample_len()` bytes long. + pub fn new_mask(&self, sample: &[u8]) -> Result<[u8; 5], error::Unspecified> { + let sample = <&[u8; SAMPLE_LEN]>::try_from(sample)?; + + let out = (self.algorithm.new_mask)(&self.inner, *sample); + Ok(out) + } + + /// The key's algorithm. + #[inline(always)] + pub fn algorithm(&self) -> &'static Algorithm { + self.algorithm + } +} + +const SAMPLE_LEN: usize = super::TAG_LEN; + +/// QUIC sample for new key masks +pub type Sample = [u8; SAMPLE_LEN]; + +/// A QUIC Header Protection Algorithm. 
+pub struct Algorithm {
+    init: fn(key: &[u8], cpu_features: cpu::Features) -> Result<KeyInner, error::Unspecified>,
+
+    new_mask: fn(key: &KeyInner, sample: Sample) -> [u8; 5],
+
+    key_len: usize,
+    id: AlgorithmID,
+}
+
+impl hkdf::KeyType for &'static Algorithm {
+    #[inline]
+    fn len(&self) -> usize {
+        self.key_len()
+    }
+}
+
+impl Algorithm {
+    /// The length of the key.
+    #[inline(always)]
+    pub fn key_len(&self) -> usize {
+        self.key_len
+    }
+
+    /// The required sample length.
+    #[inline(always)]
+    pub fn sample_len(&self) -> usize {
+        SAMPLE_LEN
+    }
+}
+
+derive_debug_via_id!(Algorithm);
+
+#[derive(Debug, Eq, PartialEq)]
+enum AlgorithmID {
+    AES_128,
+    AES_256,
+    CHACHA20,
+}
+
+impl PartialEq for Algorithm {
+    fn eq(&self, other: &Self) -> bool {
+        self.id == other.id
+    }
+}
+
+impl Eq for Algorithm {}
+
+/// AES-128.
+pub static AES_128: Algorithm = Algorithm {
+    key_len: 16,
+    init: aes_init_128,
+    new_mask: aes_new_mask,
+    id: AlgorithmID::AES_128,
+};
+
+/// AES-256.
+pub static AES_256: Algorithm = Algorithm {
+    key_len: 32,
+    init: aes_init_256,
+    new_mask: aes_new_mask,
+    id: AlgorithmID::AES_256,
+};
+
+fn aes_init_128(key: &[u8], cpu_features: cpu::Features) -> Result<KeyInner, error::Unspecified> {
+    let key = key.try_into().map_err(|_| error::Unspecified)?;
+    let aes_key = aes::Key::new(aes::KeyBytes::AES_128(key), cpu_features)?;
+    Ok(KeyInner::Aes(aes_key))
+}
+
+fn aes_init_256(key: &[u8], cpu_features: cpu::Features) -> Result<KeyInner, error::Unspecified> {
+    let key = key.try_into().map_err(|_| error::Unspecified)?;
+    let aes_key = aes::Key::new(aes::KeyBytes::AES_256(key), cpu_features)?;
+    Ok(KeyInner::Aes(aes_key))
+}
+
+fn aes_new_mask(key: &KeyInner, sample: Sample) -> [u8; 5] {
+    let aes_key = match key {
+        KeyInner::Aes(key) => key,
+        _ => unreachable!(),
+    };
+
+    aes_key.new_mask(sample)
+}
+
+/// ChaCha20.
+pub static CHACHA20: Algorithm = Algorithm {
+    key_len: chacha::KEY_LEN,
+    init: chacha20_init,
+    new_mask: chacha20_new_mask,
+    id: AlgorithmID::CHACHA20,
+};
+
+fn chacha20_init(key: &[u8], _cpu_features: cpu::Features) -> Result<KeyInner, error::Unspecified> {
+    let chacha20_key: [u8; chacha::KEY_LEN] = key.try_into()?;
+    Ok(KeyInner::ChaCha20(chacha::Key::new(chacha20_key)))
+}
+
+fn chacha20_new_mask(key: &KeyInner, sample: Sample) -> [u8; 5] {
+    let chacha20_key = match key {
+        KeyInner::ChaCha20(key) => key,
+        _ => unreachable!(),
+    };
+
+    chacha20_key.new_mask(sample)
+}
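The quic.rs module above exposes header protection through `HeaderProtectionKey::new` and `new_mask`. As a quick orientation for reviewers, the sketch below shows how that API is typically driven; it is commentary, not part of the patch, the key and sample bytes are placeholders, and it assumes the vendored crate is consumed under its usual `ring::aead::quic` path.

```rust
// Sketch only (not part of the vendored sources): deriving a QUIC
// header-protection mask with the API added in quic.rs. The key and sample
// values are placeholders; real ones come from the TLS key schedule and the
// packet ciphertext.
use ring::aead::quic;
use ring::error::Unspecified;

fn header_mask_sketch() -> Result<[u8; 5], Unspecified> {
    // ChaCha20 header protection uses a 32-byte key (quic::CHACHA20.key_len()).
    let key_bytes = [0u8; 32];
    let hp_key = quic::HeaderProtectionKey::new(&quic::CHACHA20, &key_bytes)?;

    // The sample must be exactly hp_key.algorithm().sample_len() bytes
    // (the AEAD tag length, 16), taken from the packet ciphertext.
    let sample = [0u8; 16];
    hp_key.new_mask(&sample)
}
```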
generic composition paradigm][AEAD] for an introduction to the concept of +//! AEADs. +//! +//! [AEAD]: https://eprint.iacr.org/2000/025.pdf +//! [`crypto.cipher.AEAD`]: https://golang.org/pkg/crypto/cipher/#AEAD + +use super::{Aad, Algorithm, BoundKey, LessSafeKey, NonceSequence, Tag, UnboundKey}; +use crate::error; + +/// An AEAD key for encrypting and signing ("sealing"), bound to a nonce +/// sequence. +/// +/// Intentionally not `Clone` or `Copy` since cloning would allow duplication +/// of the nonce sequence. +pub struct SealingKey { + key: LessSafeKey, + nonce_sequence: N, +} + +impl BoundKey for SealingKey { + fn new(key: UnboundKey, nonce_sequence: N) -> Self { + Self { + key: key.into_inner(), + nonce_sequence, + } + } + + #[inline] + fn algorithm(&self) -> &'static Algorithm { + self.key.algorithm() + } +} + +impl core::fmt::Debug for SealingKey { + fn fmt(&self, f: &mut core::fmt::Formatter) -> Result<(), core::fmt::Error> { + self.key.fmt_debug("SealingKey", f) + } +} + +impl SealingKey { + /// Encrypts and signs (“seals”) data in place, appending the tag to the + /// resulting ciphertext. + /// + /// `key.seal_in_place_append_tag(aad, in_out)` is equivalent to: + /// + /// ```skip + /// key.seal_in_place_separate_tag(aad, in_out.as_mut()) + /// .map(|tag| in_out.extend(tag.as_ref())) + /// ``` + #[inline] + pub fn seal_in_place_append_tag( + &mut self, + aad: Aad, + in_out: &mut InOut, + ) -> Result<(), error::Unspecified> + where + A: AsRef<[u8]>, + InOut: AsMut<[u8]> + for<'in_out> Extend<&'in_out u8>, + { + self.key + .seal_in_place_append_tag(self.nonce_sequence.advance()?, aad, in_out) + } + + /// Encrypts and signs (“seals”) data in place. + /// + /// `aad` is the additional authenticated data (AAD), if any. This is + /// authenticated but not encrypted. The type `A` could be a byte slice + /// `&[u8]`, a byte array `[u8; N]` for some constant `N`, `Vec`, etc. + /// If there is no AAD then use `Aad::empty()`. + /// + /// The plaintext is given as the input value of `in_out`. `seal_in_place()` + /// will overwrite the plaintext with the ciphertext and return the tag. + /// For most protocols, the caller must append the tag to the ciphertext. + /// The tag will be `self.algorithm.tag_len()` bytes long. + #[inline] + pub fn seal_in_place_separate_tag( + &mut self, + aad: Aad, + in_out: &mut [u8], + ) -> Result + where + A: AsRef<[u8]>, + { + self.key + .seal_in_place_separate_tag(self.nonce_sequence.advance()?, aad, in_out) + } +} diff --git a/ring-0.17.14/src/aead/shift.rs b/ring-0.17.14/src/aead/shift.rs new file mode 100644 index 0000000000..3bd7f46ee4 --- /dev/null +++ b/ring-0.17.14/src/aead/shift.rs @@ -0,0 +1,32 @@ +// Copyright 2018 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
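
The `SealingKey`/`NonceSequence` API added in sealing_key.rs above is easiest to follow with a concrete nonce sequence. The following is a minimal usage sketch against the crate's public `ring::aead` API; it is not part of the vendored sources, and the `CounterNonce` type and its 96-bit counter layout are assumptions made only for illustration.

```rust
use ring::aead::{
    Aad, BoundKey, Nonce, NonceSequence, SealingKey, UnboundKey, AES_256_GCM, NONCE_LEN,
};
use ring::error::Unspecified;

/// Illustrative nonce sequence: a big-endian counter in the last 8 bytes of
/// the 96-bit nonce. Each `advance()` yields a fresh, never-repeating nonce,
/// which is the property `SealingKey` exists to enforce.
struct CounterNonce(u64);

impl NonceSequence for CounterNonce {
    fn advance(&mut self) -> Result<Nonce, Unspecified> {
        let mut bytes = [0u8; NONCE_LEN];
        bytes[4..].copy_from_slice(&self.0.to_be_bytes());
        self.0 = self.0.checked_add(1).ok_or(Unspecified)?;
        Ok(Nonce::assume_unique_for_key(bytes))
    }
}

fn seal_demo(key_bytes: &[u8; 32], plaintext: &[u8]) -> Result<Vec<u8>, Unspecified> {
    let unbound = UnboundKey::new(&AES_256_GCM, key_bytes)?;
    let mut key: SealingKey<CounterNonce> = SealingKey::new(unbound, CounterNonce(0));

    // Encrypts the buffer in place and appends the 16-byte authentication tag,
    // exactly as documented for `seal_in_place_append_tag` above.
    let mut in_out = plaintext.to_vec();
    key.seal_in_place_append_tag(Aad::empty(), &mut in_out)?;
    Ok(in_out)
}
```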
+ +#[cfg(target_arch = "x86")] +pub fn shift_full_blocks( + in_out: super::overlapping::Overlapping<'_, u8>, + mut transform: impl FnMut(&[u8; BLOCK_LEN]) -> [u8; BLOCK_LEN], +) { + let (in_out, src) = in_out.into_slice_src_mut(); + let in_out_len = in_out[src.clone()].len(); + + for i in (0..in_out_len).step_by(BLOCK_LEN) { + let block = { + let input = + <&[u8; BLOCK_LEN]>::try_from(&in_out[(src.start + i)..][..BLOCK_LEN]).unwrap(); + transform(input) + }; + let output = <&mut [u8; BLOCK_LEN]>::try_from(&mut in_out[i..][..BLOCK_LEN]).unwrap(); + *output = block; + } +} diff --git a/ring-0.17.14/src/aead/unbound_key.rs b/ring-0.17.14/src/aead/unbound_key.rs new file mode 100644 index 0000000000..23f9df9eb0 --- /dev/null +++ b/ring-0.17.14/src/aead/unbound_key.rs @@ -0,0 +1,74 @@ +// Copyright 2015-2021 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +//! Authenticated Encryption with Associated Data (AEAD). +//! +//! See [Authenticated encryption: relations among notions and analysis of the +//! generic composition paradigm][AEAD] for an introduction to the concept of +//! AEADs. +//! +//! [AEAD]: https://eprint.iacr.org/2000/025.pdf +//! [`crypto.cipher.AEAD`]: https://golang.org/pkg/crypto/cipher/#AEAD + +use super::{Algorithm, LessSafeKey, MAX_KEY_LEN}; +use crate::{cpu, error, hkdf}; + +/// An AEAD key without a designated role or nonce sequence. +pub struct UnboundKey { + inner: LessSafeKey, +} + +impl UnboundKey { + /// Constructs a `UnboundKey`. + /// + /// Fails if `key_bytes.len() != algorithm.key_len()`. + #[inline] + pub fn new( + algorithm: &'static Algorithm, + key_bytes: &[u8], + ) -> Result { + Ok(Self { + inner: LessSafeKey::new_(algorithm, key_bytes, cpu::features())?, + }) + } + + /// The key's AEAD algorithm. + #[inline] + pub fn algorithm(&self) -> &'static Algorithm { + self.inner.algorithm() + } + + #[inline] + pub(super) fn into_inner(self) -> LessSafeKey { + self.inner + } +} + +impl core::fmt::Debug for UnboundKey { + fn fmt(&self, f: &mut core::fmt::Formatter) -> Result<(), core::fmt::Error> { + self.inner.fmt_debug("UnboundKey", f) + } +} + +impl From> for UnboundKey { + fn from(okm: hkdf::Okm<&'static Algorithm>) -> Self { + let mut key_bytes = [0; MAX_KEY_LEN]; + let key_bytes = &mut key_bytes[..okm.len().key_len()]; + let algorithm = *okm.len(); + okm.fill(key_bytes).unwrap(); + Self { + inner: LessSafeKey::new_(algorithm, key_bytes, cpu::features()).unwrap(), + } + } +} diff --git a/ring-0.17.14/src/agreement.rs b/ring-0.17.14/src/agreement.rs new file mode 100644 index 0000000000..770423f2be --- /dev/null +++ b/ring-0.17.14/src/agreement.rs @@ -0,0 +1,311 @@ +// Copyright 2015-2017 Brian Smith. 
+// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +//! Key Agreement: ECDH, including X25519. +//! +//! # Example +//! +//! Note that this example uses X25519, but ECDH using NIST P-256/P-384 is done +//! exactly the same way, just substituting +//! `agreement::ECDH_P256`/`agreement::ECDH_P384` for `agreement::X25519`. +//! +//! ``` +//! use ring::{agreement, rand}; +//! +//! let rng = rand::SystemRandom::new(); +//! +//! let my_private_key = agreement::EphemeralPrivateKey::generate(&agreement::X25519, &rng)?; +//! +//! // Make `my_public_key` a byte slice containing my public key. In a real +//! // application, this would be sent to the peer in an encoded protocol +//! // message. +//! let my_public_key = my_private_key.compute_public_key()?; +//! +//! let peer_public_key_bytes = { +//! // In a real application, the peer public key would be parsed out of a +//! // protocol message. Here we just generate one. +//! let peer_private_key = +//! agreement::EphemeralPrivateKey::generate(&agreement::X25519, &rng)?; +//! peer_private_key.compute_public_key()? +//! }; +//! +//! let peer_public_key = agreement::UnparsedPublicKey::new( +//! &agreement::X25519, +//! peer_public_key_bytes); +//! +//! agreement::agree_ephemeral( +//! my_private_key, +//! &peer_public_key, +//! |_key_material| { +//! // In a real application, we'd apply a KDF to the key material and the +//! // public keys (as recommended in RFC 7748) and then derive session +//! // keys from the result. We omit all that here. +//! }, +//! )?; +//! +//! # Ok::<(), ring::error::Unspecified>(()) +//! ``` + +// The "NSA Guide" steps here are from from section 3.1, "Ephemeral Unified +// Model." + +use crate::{cpu, debug, ec, error, rand}; + +pub use crate::ec::{ + curve25519::x25519::X25519, + suite_b::ecdh::{ECDH_P256, ECDH_P384}, +}; + +/// A key agreement algorithm. +pub struct Algorithm { + pub(crate) curve: &'static ec::Curve, + pub(crate) ecdh: fn( + out: &mut [u8], + private_key: &ec::Seed, + peer_public_key: untrusted::Input, + cpu: cpu::Features, + ) -> Result<(), error::Unspecified>, +} + +derive_debug_via_field!(Algorithm, curve); + +impl Eq for Algorithm {} +impl PartialEq for Algorithm { + fn eq(&self, other: &Self) -> bool { + self.curve.id == other.curve.id + } +} + +/// An ephemeral private key for use (only) with `agree_ephemeral`. The +/// signature of `agree_ephemeral` ensures that an `EphemeralPrivateKey` can be +/// used for at most one key agreement. +pub struct EphemeralPrivateKey { + private_key: ec::Seed, + algorithm: &'static Algorithm, +} + +derive_debug_via_field!( + EphemeralPrivateKey, + stringify!(EphemeralPrivateKey), + algorithm +); + +impl EphemeralPrivateKey { + /// Generate a new ephemeral private key for the given algorithm. 
+ pub fn generate( + alg: &'static Algorithm, + rng: &dyn rand::SecureRandom, + ) -> Result { + let cpu_features = cpu::features(); + + // NSA Guide Step 1. + // + // This only handles the key generation part of step 1. The rest of + // step one is done by `compute_public_key()`. + let private_key = ec::Seed::generate(alg.curve, rng, cpu_features)?; + Ok(Self { + private_key, + algorithm: alg, + }) + } + + /// Computes the public key from the private key. + #[inline(always)] + pub fn compute_public_key(&self) -> Result { + // NSA Guide Step 1. + // + // Obviously, this only handles the part of Step 1 between the private + // key generation and the sending of the public key to the peer. `out` + // is what should be sent to the peer. + self.private_key + .compute_public_key(cpu::features()) + .map(|public_key| PublicKey { + algorithm: self.algorithm, + bytes: public_key, + }) + } + + /// The algorithm for the private key. + #[inline] + pub fn algorithm(&self) -> &'static Algorithm { + self.algorithm + } + + /// Do not use. + #[deprecated] + #[cfg(test)] + pub fn bytes(&self) -> &[u8] { + self.bytes_for_test() + } + + #[cfg(test)] + pub(super) fn bytes_for_test(&self) -> &[u8] { + self.private_key.bytes_less_safe() + } +} + +/// A public key for key agreement. +#[derive(Clone)] +pub struct PublicKey { + algorithm: &'static Algorithm, + bytes: ec::PublicKey, +} + +impl AsRef<[u8]> for PublicKey { + fn as_ref(&self) -> &[u8] { + self.bytes.as_ref() + } +} + +impl core::fmt::Debug for PublicKey { + fn fmt(&self, f: &mut core::fmt::Formatter) -> Result<(), core::fmt::Error> { + f.debug_struct("PublicKey") + .field("algorithm", &self.algorithm) + .field("bytes", &debug::HexStr(self.as_ref())) + .finish() + } +} + +impl PublicKey { + /// The algorithm for the public key. + #[inline] + pub fn algorithm(&self) -> &'static Algorithm { + self.algorithm + } +} + +/// An unparsed, possibly malformed, public key for key agreement. +#[derive(Clone, Copy)] +pub struct UnparsedPublicKey { + algorithm: &'static Algorithm, + bytes: B, +} + +impl AsRef<[u8]> for UnparsedPublicKey +where + B: AsRef<[u8]>, +{ + fn as_ref(&self) -> &[u8] { + self.bytes.as_ref() + } +} + +impl core::fmt::Debug for UnparsedPublicKey +where + B: AsRef<[u8]>, +{ + fn fmt(&self, f: &mut core::fmt::Formatter) -> Result<(), core::fmt::Error> { + f.debug_struct("UnparsedPublicKey") + .field("algorithm", &self.algorithm) + .field("bytes", &debug::HexStr(self.bytes.as_ref())) + .finish() + } +} + +impl UnparsedPublicKey { + /// Constructs a new `UnparsedPublicKey`. + pub fn new(algorithm: &'static Algorithm, bytes: B) -> Self { + Self { algorithm, bytes } + } + + /// The algorithm for the public key. + #[inline] + pub fn algorithm(&self) -> &'static Algorithm { + self.algorithm + } + + /// TODO: doc + #[inline] + pub fn bytes(&self) -> &B { + &self.bytes + } +} + +/// Performs a key agreement with an ephemeral private key and the given public +/// key. +/// +/// `my_private_key` is the ephemeral private key to use. Since it is moved, it +/// will not be usable after calling `agree_ephemeral`, thus guaranteeing that +/// the key is used for only one key agreement. +/// +/// `peer_public_key` is the peer's public key. `agree_ephemeral` will return +/// `Err(error_value)` if it does not match `my_private_key's` algorithm/curve. 
+/// `agree_ephemeral` verifies that it is encoded in the standard form for the +/// algorithm and that the key is *valid*; see the algorithm's documentation for +/// details on how keys are to be encoded and what constitutes a valid key for +/// that algorithm. +/// +/// After the key agreement is done, `agree_ephemeral` calls `kdf` with the raw +/// key material from the key agreement operation and then returns what `kdf` +/// returns. +#[inline] +pub fn agree_ephemeral, R>( + my_private_key: EphemeralPrivateKey, + peer_public_key: &UnparsedPublicKey, + kdf: impl FnOnce(&[u8]) -> R, +) -> Result { + let peer_public_key = UnparsedPublicKey { + algorithm: peer_public_key.algorithm, + bytes: peer_public_key.bytes.as_ref(), + }; + agree_ephemeral_(my_private_key, peer_public_key, kdf, cpu::features()) +} + +fn agree_ephemeral_( + my_private_key: EphemeralPrivateKey, + peer_public_key: UnparsedPublicKey<&[u8]>, + kdf: impl FnOnce(&[u8]) -> R, + cpu: cpu::Features, +) -> Result { + // NSA Guide Prerequisite 1. + // + // The domain parameters are hard-coded. This check verifies that the + // peer's public key's domain parameters match the domain parameters of + // this private key. + if peer_public_key.algorithm != my_private_key.algorithm { + return Err(error::Unspecified); + } + + let alg = &my_private_key.algorithm; + + // NSA Guide Prerequisite 2, regarding which KDFs are allowed, is delegated + // to the caller. + + // NSA Guide Prerequisite 3, "Prior to or during the key-agreement process, + // each party shall obtain the identifier associated with the other party + // during the key-agreement scheme," is delegated to the caller. + + // NSA Guide Step 1 is handled by `EphemeralPrivateKey::generate()` and + // `EphemeralPrivateKey::compute_public_key()`. + + let mut shared_key = [0u8; ec::ELEM_MAX_BYTES]; + let shared_key = &mut shared_key[..alg.curve.elem_scalar_seed_len]; + + // NSA Guide Steps 2, 3, and 4. + // + // We have a pretty liberal interpretation of the NIST's spec's "Destroy" + // that doesn't meet the NSA requirement to "zeroize." + (alg.ecdh)( + shared_key, + &my_private_key.private_key, + untrusted::Input::from(peer_public_key.bytes), + cpu, + )?; + + // NSA Guide Steps 5 and 6. + // + // Again, we have a pretty liberal interpretation of the NIST's spec's + // "Destroy" that doesn't meet the NSA requirement to "zeroize." + Ok(kdf(shared_key)) +} diff --git a/ring-0.17.14/src/arithmetic.rs b/ring-0.17.14/src/arithmetic.rs new file mode 100644 index 0000000000..94df7e461b --- /dev/null +++ b/ring-0.17.14/src/arithmetic.rs @@ -0,0 +1,47 @@ +// Copyright 2017-2023 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
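
The module-level example in agreement.rs above shows one party's side of the exchange. As a complementary sketch, again against the crate's public API and not part of the vendored sources (function name is illustrative), the code below runs both sides of an X25519 agreement and checks that the two `agree_ephemeral` calls hand the same raw key material to their closures:

```rust
use ring::{agreement, error, rand};

fn x25519_round_trip() -> Result<(), error::Unspecified> {
    let rng = rand::SystemRandom::new();

    // Each party generates an ephemeral private key and publishes its public key.
    let alice_private = agreement::EphemeralPrivateKey::generate(&agreement::X25519, &rng)?;
    let bob_private = agreement::EphemeralPrivateKey::generate(&agreement::X25519, &rng)?;
    let alice_public = alice_private.compute_public_key()?;
    let bob_public = bob_private.compute_public_key()?;

    // Each side agrees using its own private key and the peer's public key.
    let alice_secret = agreement::agree_ephemeral(
        alice_private,
        &agreement::UnparsedPublicKey::new(&agreement::X25519, bob_public.as_ref()),
        |key_material| key_material.to_vec(),
    )?;
    let bob_secret = agreement::agree_ephemeral(
        bob_private,
        &agreement::UnparsedPublicKey::new(&agreement::X25519, alice_public.as_ref()),
        |key_material| key_material.to_vec(),
    )?;
    assert_eq!(alice_secret, bob_secret);

    // In real use, feed the key material and both public keys into a KDF
    // (e.g. HKDF) rather than comparing or storing it directly.
    Ok(())
}
```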
+ +pub(crate) use self::{constant::limbs_from_hex, limb_slice_error::LimbSliceError}; +use crate::{error::LenMismatchError, limb::LIMB_BITS}; + +#[macro_use] +mod ffi; + +mod constant; + +#[cfg(feature = "alloc")] +pub mod bigint; + +pub(crate) mod inout; +mod limbs; +mod limbs512; +pub mod montgomery; + +mod n0; + +// The minimum number of limbs allowed for any `&[Limb]` operation. +// +// TODO: Use `256 / LIMB_BITS` so that the limit is independent of limb size. +pub const MIN_LIMBS: usize = 4; + +// The maximum number of limbs allowed for any `&[Limb]` operation. +pub const MAX_LIMBS: usize = 8192 / LIMB_BITS; + +cold_exhaustive_error! { + enum limb_slice_error::LimbSliceError { + len_mismatch => LenMismatch(LenMismatchError), + too_short => TooShort(usize), + too_long => TooLong(usize), + } +} diff --git a/ring-0.17.14/src/arithmetic/bigint.rs b/ring-0.17.14/src/arithmetic/bigint.rs new file mode 100644 index 0000000000..7251a64ad8 --- /dev/null +++ b/ring-0.17.14/src/arithmetic/bigint.rs @@ -0,0 +1,1068 @@ +// Copyright 2015-2023 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +//! Multi-precision integers. +//! +//! # Modular Arithmetic. +//! +//! Modular arithmetic is done in finite commutative rings ℤ/mℤ for some +//! modulus *m*. We work in finite commutative rings instead of finite fields +//! because the RSA public modulus *n* is not prime, which means ℤ/nℤ contains +//! nonzero elements that have no multiplicative inverse, so ℤ/nℤ is not a +//! finite field. +//! +//! In some calculations we need to deal with multiple rings at once. For +//! example, RSA private key operations operate in the rings ℤ/nℤ, ℤ/pℤ, and +//! ℤ/qℤ. Types and functions dealing with such rings are all parameterized +//! over a type `M` to ensure that we don't wrongly mix up the math, e.g. by +//! multiplying an element of ℤ/pℤ by an element of ℤ/qℤ modulo q. This follows +//! the "unit" pattern described in [Static checking of units in Servo]. +//! +//! `Elem` also uses the static unit checking pattern to statically track the +//! Montgomery factors that need to be canceled out in each value using it's +//! `E` parameter. +//! +//! [Static checking of units in Servo]: +//! 
https://blog.mozilla.org/research/2014/06/23/static-checking-of-units-in-servo/ + +use self::boxed_limbs::BoxedLimbs; +pub(crate) use self::{ + modulus::{Modulus, OwnedModulus}, + modulusvalue::OwnedModulusValue, + private_exponent::PrivateExponent, +}; +use super::{inout::AliasingSlices3, limbs512, montgomery::*, LimbSliceError, MAX_LIMBS}; +use crate::{ + bits::BitLength, + c, + error::{self, LenMismatchError}, + limb::{self, Limb, LIMB_BITS}, + polyfill::slice::{self, AsChunks}, +}; +use core::{ + marker::PhantomData, + num::{NonZeroU64, NonZeroUsize}, +}; + +mod boxed_limbs; +mod modulus; +mod modulusvalue; +mod private_exponent; + +pub trait PublicModulus {} + +// When we need to create a new `Elem`, first we create a `Storage` and then +// move its `limbs` into the new element. When we want to recylce an `Elem`'s +// memory allocation, we convert it back into a `Storage`. +pub struct Storage { + limbs: BoxedLimbs, +} + +impl From> for Storage { + fn from(elem: Elem) -> Self { + Self { limbs: elem.limbs } + } +} + +/// Elements of ℤ/mℤ for some modulus *m*. +// +// Defaulting `E` to `Unencoded` is a convenience for callers from outside this +// submodule. However, for maximum clarity, we always explicitly use +// `Unencoded` within the `bigint` submodule. +pub struct Elem { + limbs: BoxedLimbs, + + /// The number of Montgomery factors that need to be canceled out from + /// `value` to get the actual value. + encoding: PhantomData, +} + +impl Elem { + pub fn clone_into(&self, mut out: Storage) -> Self { + out.limbs.copy_from_slice(&self.limbs); + Self { + limbs: out.limbs, + encoding: self.encoding, + } + } +} + +impl Elem { + #[inline] + pub fn is_zero(&self) -> bool { + limb::limbs_are_zero(&self.limbs).leak() + } +} + +/// Does a Montgomery reduction on `limbs` assuming they are Montgomery-encoded ('R') and assuming +/// they are the same size as `m`, but perhaps not reduced mod `m`. The result will be +/// fully reduced mod `m`. +/// +/// WARNING: Takes a `Storage` as an in/out value. +fn from_montgomery_amm(mut in_out: Storage, m: &Modulus) -> Elem { + let mut one = [0; MAX_LIMBS]; + one[0] = 1; + let one = &one[..m.limbs().len()]; + limbs_mul_mont( + (&mut in_out.limbs[..], one), + m.limbs(), + m.n0(), + m.cpu_features(), + ) + .unwrap_or_else(unwrap_impossible_limb_slice_error); + Elem { + limbs: in_out.limbs, + encoding: PhantomData, + } +} + +#[cfg(any(test, not(target_arch = "x86_64")))] +impl Elem { + #[inline] + pub fn into_unencoded(self, m: &Modulus) -> Elem { + from_montgomery_amm(Storage::from(self), m) + } +} + +impl Elem { + pub fn from_be_bytes_padded( + input: untrusted::Input, + m: &Modulus, + ) -> Result { + Ok(Self { + limbs: BoxedLimbs::from_be_bytes_padded_less_than(input, m)?, + encoding: PhantomData, + }) + } + + #[inline] + pub fn fill_be_bytes(&self, out: &mut [u8]) { + // See Falko Strenzke, "Manger's Attack revisited", ICICS 2010. 
+ limb::big_endian_from_limbs(&self.limbs, out) + } +} + +pub fn elem_mul_into( + mut out: Storage, + a: &Elem, + b: &Elem, + m: &Modulus, +) -> Elem::Output> +where + (AF, BF): ProductEncoding, +{ + limbs_mul_mont( + (out.limbs.as_mut(), b.limbs.as_ref(), a.limbs.as_ref()), + m.limbs(), + m.n0(), + m.cpu_features(), + ) + .unwrap_or_else(unwrap_impossible_limb_slice_error); + Elem { + limbs: out.limbs, + encoding: PhantomData, + } +} + +pub fn elem_mul( + a: &Elem, + mut b: Elem, + m: &Modulus, +) -> Elem::Output> +where + (AF, BF): ProductEncoding, +{ + limbs_mul_mont( + (&mut b.limbs[..], &a.limbs[..]), + m.limbs(), + m.n0(), + m.cpu_features(), + ) + .unwrap_or_else(unwrap_impossible_limb_slice_error); + Elem { + limbs: b.limbs, + encoding: PhantomData, + } +} + +// r *= 2. +fn elem_double(r: &mut Elem, m: &Modulus) { + limb::limbs_double_mod(&mut r.limbs, m.limbs()) + .unwrap_or_else(unwrap_impossible_len_mismatch_error) +} + +// TODO: This is currently unused, but we intend to eventually use this to +// reduce elements (x mod q) mod p in the RSA CRT. If/when we do so, we +// should update the testing so it is reflective of that usage, instead of +// the old usage. +pub fn elem_reduced_once( + mut r: Storage, + a: &Elem, + m: &Modulus, + other_modulus_len_bits: BitLength, +) -> Elem { + assert_eq!(m.len_bits(), other_modulus_len_bits); + r.limbs.copy_from_slice(&a.limbs); + limb::limbs_reduce_once(&mut r.limbs, m.limbs()) + .unwrap_or_else(unwrap_impossible_len_mismatch_error); + Elem { + limbs: r.limbs, + encoding: PhantomData, + } +} + +#[inline] +pub fn elem_reduced( + mut r: Storage, + a: &Elem, + m: &Modulus, + other_prime_len_bits: BitLength, +) -> Elem { + // This is stricter than required mathematically but this is what we + // guarantee and this is easier to check. The real requirement is that + // that `a < m*R` where `R` is the Montgomery `R` for `m`. + assert_eq!(other_prime_len_bits, m.len_bits()); + + // `limbs_from_mont_in_place` requires this. + assert_eq!(a.limbs.len(), m.limbs().len() * 2); + + let mut tmp = [0; MAX_LIMBS]; + let tmp = &mut tmp[..a.limbs.len()]; + tmp.copy_from_slice(&a.limbs); + + limbs_from_mont_in_place(&mut r.limbs, tmp, m.limbs(), m.n0()); + Elem { + limbs: r.limbs, + encoding: PhantomData, + } +} + +#[inline] +fn elem_squared( + mut a: Elem, + m: &Modulus, +) -> Elem::Output> +where + (E, E): ProductEncoding, +{ + limbs_square_mont(&mut a.limbs, m.limbs(), m.n0(), m.cpu_features()) + .unwrap_or_else(unwrap_impossible_limb_slice_error); + Elem { + limbs: a.limbs, + encoding: PhantomData, + } +} + +pub fn elem_widen( + mut r: Storage, + a: Elem, + m: &Modulus, + smaller_modulus_bits: BitLength, +) -> Result, error::Unspecified> { + if smaller_modulus_bits >= m.len_bits() { + return Err(error::Unspecified); + } + let (to_copy, to_zero) = r.limbs.split_at_mut(a.limbs.len()); + to_copy.copy_from_slice(&a.limbs); + to_zero.fill(0); + Ok(Elem { + limbs: r.limbs, + encoding: PhantomData, + }) +} + +// TODO: Document why this works for all Montgomery factors. +pub fn elem_add(mut a: Elem, b: Elem, m: &Modulus) -> Elem { + limb::limbs_add_assign_mod(&mut a.limbs, &b.limbs, m.limbs()) + .unwrap_or_else(unwrap_impossible_len_mismatch_error); + a +} + +// TODO: Document why this works for all Montgomery factors. +pub fn elem_sub(mut a: Elem, b: &Elem, m: &Modulus) -> Elem { + prefixed_extern! { + // `r` and `a` may alias. 
+ fn LIMBS_sub_mod( + r: *mut Limb, + a: *const Limb, + b: *const Limb, + m: *const Limb, + num_limbs: c::NonZero_size_t, + ); + } + let num_limbs = NonZeroUsize::new(m.limbs().len()).unwrap(); + (a.limbs.as_mut(), b.limbs.as_ref()) + .with_non_dangling_non_null_pointers_rab(num_limbs, |r, a, b| { + let m = m.limbs().as_ptr(); // Also non-dangling because num_limbs is non-zero. + unsafe { LIMBS_sub_mod(r, a, b, m, num_limbs) } + }) + .unwrap_or_else(unwrap_impossible_len_mismatch_error); + a +} + +// The value 1, Montgomery-encoded some number of times. +pub struct One(Elem); + +impl One { + // Returns RR = = R**2 (mod n) where R = 2**r is the smallest power of + // 2**LIMB_BITS such that R > m. + // + // Even though the assembly on some 32-bit platforms works with 64-bit + // values, using `LIMB_BITS` here, rather than `N0::LIMBS_USED * LIMB_BITS`, + // is correct because R**2 will still be a multiple of the latter as + // `N0::LIMBS_USED` is either one or two. + pub(crate) fn newRR(mut out: Storage, m: &Modulus) -> Self { + // The number of limbs in the numbers involved. + let w = m.limbs().len(); + + // The length of the numbers involved, in bits. R = 2**r. + let r = w * LIMB_BITS; + + m.oneR(&mut out.limbs); + let mut acc: Elem = Elem { + limbs: out.limbs, + encoding: PhantomData, + }; + + // 2**t * R can be calculated by t doublings starting with R. + // + // Choose a t that divides r and where t doublings are cheaper than 1 squaring. + // + // We could choose other values of t than w. But if t < d then the exponentiation that + // follows would require multiplications. Normally d is 1 (i.e. the modulus length is a + // power of two: RSA 1024, 2048, 4097, 8192) or 3 (RSA 1536, 3072). + // + // XXX(perf): Currently t = w / 2 is slightly faster. TODO(perf): Optimize `elem_double` + // and re-run benchmarks to rebalance this. + let t = w; + let z = w.trailing_zeros(); + let d = w >> z; + debug_assert_eq!(w, d * (1 << z)); + debug_assert!(d <= t); + debug_assert!(t < r); + for _ in 0..t { + elem_double(&mut acc, m); + } + + // Because t | r: + // + // MontExp(2**t * R, r / t) + // = (2**t)**(r / t) * R (mod m) by definition of MontExp. + // = (2**t)**(1/t * r) * R (mod m) + // = (2**(t * 1/t))**r * R (mod m) + // = (2**1)**r * R (mod m) + // = 2**r * R (mod m) + // = R * R (mod m) + // = RR + // + // Like BoringSSL, use t = w (`m.limbs.len()`) which ensures that the exponent is a power + // of two. Consequently, there will be no multiplications in the Montgomery exponentiation; + // there will only be lg(r / t) squarings. + // + // lg(r / t) + // = lg((w * 2**b) / t) + // = lg((t * 2**b) / t) + // = lg(2**b) + // = b + // TODO(MSRV:1.67): const B: u32 = LIMB_BITS.ilog2(); + const B: u32 = if cfg!(target_pointer_width = "64") { + 6 + } else if cfg!(target_pointer_width = "32") { + 5 + } else { + panic!("unsupported target_pointer_width") + }; + #[allow(clippy::assertions_on_constants)] + const _LIMB_BITS_IS_2_POW_B: () = assert!(LIMB_BITS == 1 << B); + debug_assert_eq!(r, t * (1 << B)); + for _ in 0..B { + acc = elem_squared(acc, m); + } + + Self(Elem { + limbs: acc.limbs, + encoding: PhantomData, // PhantomData + }) + } +} + +impl One { + pub(crate) fn newRRR(One(oneRR): One, m: &Modulus) -> Self { + Self(elem_squared(oneRR, m)) + } +} + +impl AsRef> for One { + fn as_ref(&self) -> &Elem { + &self.0 + } +} + +impl One { + pub fn clone_into(&self, out: Storage) -> Self { + Self(self.0.clone_into(out)) + } +} + +/// Calculates base**exponent (mod m). 
+/// +/// The run time is a function of the number of limbs in `m` and the bit +/// length and Hamming Weight of `exponent`. The bounds on `m` are pretty +/// obvious but the bounds on `exponent` are less obvious. Callers should +/// document the bounds they place on the maximum value and maximum Hamming +/// weight of `exponent`. +// TODO: The test coverage needs to be expanded, e.g. test with the largest +// accepted exponent and with the most common values of 65537 and 3. +pub(crate) fn elem_exp_vartime( + out: Storage, + base: Elem, + exponent: NonZeroU64, + m: &Modulus, +) -> Elem { + // Use what [Knuth] calls the "S-and-X binary method", i.e. variable-time + // square-and-multiply that scans the exponent from the most significant + // bit to the least significant bit (left-to-right). Left-to-right requires + // less storage compared to right-to-left scanning, at the cost of needing + // to compute `exponent.leading_zeros()`, which we assume to be cheap. + // + // As explained in [Knuth], exponentiation by squaring is the most + // efficient algorithm when the Hamming weight is 2 or less. It isn't the + // most efficient for all other, uncommon, exponent values but any + // suboptimality is bounded at least by the small bit length of `exponent` + // as enforced by its type. + // + // This implementation is slightly simplified by taking advantage of the + // fact that we require the exponent to be a positive integer. + // + // [Knuth]: The Art of Computer Programming, Volume 2: Seminumerical + // Algorithms (3rd Edition), Section 4.6.3. + let exponent = exponent.get(); + let mut acc = base.clone_into(out); + let mut bit = 1 << (64 - 1 - exponent.leading_zeros()); + debug_assert!((exponent & bit) != 0); + while bit > 1 { + bit >>= 1; + acc = elem_squared(acc, m); + if (exponent & bit) != 0 { + acc = elem_mul(&base, acc, m); + } + } + acc +} + +pub fn elem_exp_consttime( + out: Storage

, + base: &Elem, + oneRRR: &One, + exponent: &PrivateExponent, + p: &Modulus

, + q: PrivateCrtPrime, + qInv: bigint::Elem, + public: PublicKey, +} + +derive_debug_via_field!(KeyPair, stringify!(RsaKeyPair), public); + +impl KeyPair { + /// Parses an unencrypted PKCS#8-encoded RSA private key. + /// + /// This will generate a 2048-bit RSA private key of the correct form using + /// OpenSSL's command line tool: + /// + /// ```sh + /// openssl genpkey -algorithm RSA \ + /// -pkeyopt rsa_keygen_bits:2048 \ + /// -pkeyopt rsa_keygen_pubexp:65537 | \ + /// openssl pkcs8 -topk8 -nocrypt -outform der > rsa-2048-private-key.pk8 + /// ``` + /// + /// This will generate a 3072-bit RSA private key of the correct form: + /// + /// ```sh + /// openssl genpkey -algorithm RSA \ + /// -pkeyopt rsa_keygen_bits:3072 \ + /// -pkeyopt rsa_keygen_pubexp:65537 | \ + /// openssl pkcs8 -topk8 -nocrypt -outform der > rsa-3072-private-key.pk8 + /// ``` + /// + /// Often, keys generated for use in OpenSSL-based software are stored in + /// the Base64 “PEM” format without the PKCS#8 wrapper. Such keys can be + /// converted to binary PKCS#8 form using the OpenSSL command line tool like + /// this: + /// + /// ```sh + /// openssl pkcs8 -topk8 -nocrypt -outform der \ + /// -in rsa-2048-private-key.pem > rsa-2048-private-key.pk8 + /// ``` + /// + /// Base64 (“PEM”) PKCS#8-encoded keys can be converted to the binary PKCS#8 + /// form like this: + /// + /// ```sh + /// openssl pkcs8 -nocrypt -outform der \ + /// -in rsa-2048-private-key.pem > rsa-2048-private-key.pk8 + /// ``` + /// + /// See [`Self::from_components`] for more details on how the input is + /// validated. + /// + /// See [RFC 5958] and [RFC 3447 Appendix A.1.2] for more details of the + /// encoding of the key. + /// + /// [NIST SP-800-56B rev. 1]: + /// http://nvlpubs.nist.gov/nistpubs/SpecialPublications/NIST.SP.800-56Br1.pdf + /// + /// [RFC 3447 Appendix A.1.2]: + /// https://tools.ietf.org/html/rfc3447#appendix-A.1.2 + /// + /// [RFC 5958]: + /// https://tools.ietf.org/html/rfc5958 + pub fn from_pkcs8(pkcs8: &[u8]) -> Result { + const RSA_ENCRYPTION: &[u8] = include_bytes!("../data/alg-rsa-encryption.der"); + let (der, _) = pkcs8::unwrap_key_( + untrusted::Input::from(RSA_ENCRYPTION), + pkcs8::Version::V1Only, + untrusted::Input::from(pkcs8), + )?; + Self::from_der(der.as_slice_less_safe()) + } + + /// Parses an RSA private key that is not inside a PKCS#8 wrapper. + /// + /// The private key must be encoded as a binary DER-encoded ASN.1 + /// `RSAPrivateKey` as described in [RFC 3447 Appendix A.1.2]). In all other + /// respects, this is just like `from_pkcs8()`. See the documentation for + /// `from_pkcs8()` for more details. + /// + /// It is recommended to use `from_pkcs8()` (with a PKCS#8-encoded key) + /// instead. + /// + /// See [`Self::from_components()`] for more details on how the input is + /// validated. + /// + /// [RFC 3447 Appendix A.1.2]: + /// https://tools.ietf.org/html/rfc3447#appendix-A.1.2 + /// + /// [NIST SP-800-56B rev. 
1]: + /// http://nvlpubs.nist.gov/nistpubs/SpecialPublications/NIST.SP.800-56Br1.pdf + pub fn from_der(input: &[u8]) -> Result { + untrusted::Input::from(input).read_all(KeyRejected::invalid_encoding(), |input| { + der::nested( + input, + der::Tag::Sequence, + KeyRejected::invalid_encoding(), + Self::from_der_reader, + ) + }) + } + + fn from_der_reader(input: &mut untrusted::Reader) -> Result { + let version = der::small_nonnegative_integer(input) + .map_err(|error::Unspecified| KeyRejected::invalid_encoding())?; + if version != 0 { + return Err(KeyRejected::version_not_supported()); + } + + fn nonnegative_integer<'a>( + input: &mut untrusted::Reader<'a>, + ) -> Result<&'a [u8], KeyRejected> { + der::nonnegative_integer(input) + .map(|input| input.as_slice_less_safe()) + .map_err(|error::Unspecified| KeyRejected::invalid_encoding()) + } + + let n = nonnegative_integer(input)?; + let e = nonnegative_integer(input)?; + let d = nonnegative_integer(input)?; + let p = nonnegative_integer(input)?; + let q = nonnegative_integer(input)?; + let dP = nonnegative_integer(input)?; + let dQ = nonnegative_integer(input)?; + let qInv = nonnegative_integer(input)?; + + let components = KeyPairComponents { + public_key: PublicKeyComponents { n, e }, + d, + p, + q, + dP, + dQ, + qInv, + }; + + Self::from_components(&components) + } + + /// Constructs an RSA private key from its big-endian-encoded components. + /// + /// Only two-prime (not multi-prime) keys are supported. The public modulus + /// (n) must be at least 2047 bits. The public modulus must be no larger + /// than 4096 bits. It is recommended that the public modulus be exactly + /// 2048 or 3072 bits. The public exponent must be at least 65537 and must + /// be no more than 33 bits long. + /// + /// The private key is validated according to [NIST SP-800-56B rev. 1] + /// section 6.4.1.4.3, crt_pkv (Intended Exponent-Creation Method Unknown), + /// with the following exceptions: + /// + /// * Section 6.4.1.2.1, Step 1: Neither a target security level nor an + /// expected modulus length is provided as a parameter, so checks + /// regarding these expectations are not done. + /// * Section 6.4.1.2.1, Step 3: Since neither the public key nor the + /// expected modulus length is provided as a parameter, the consistency + /// check between these values and the private key's value of n isn't + /// done. + /// * Section 6.4.1.2.1, Step 5: No primality tests are done, both for + /// performance reasons and to avoid any side channels that such tests + /// would provide. + /// * Section 6.4.1.2.1, Step 6, and 6.4.1.4.3, Step 7: + /// * *ring* has a slightly looser lower bound for the values of `p` + /// and `q` than what the NIST document specifies. This looser lower + /// bound matches what most other crypto libraries do. The check might + /// be tightened to meet NIST's requirements in the future. Similarly, + /// the check that `p` and `q` are not too close together is skipped + /// currently, but may be added in the future. + /// * The validity of the mathematical relationship of `dP`, `dQ`, `e` + /// and `n` is verified only during signing. Some size checks of `d`, + /// `dP` and `dQ` are performed at construction, but some NIST checks + /// are skipped because they would be expensive and/or they would leak + /// information through side channels. If a preemptive check of the + /// consistency of `dP`, `dQ`, `e` and `n` with each other is + /// necessary, that can be done by signing any message with the key + /// pair. 
+ /// + /// * `d` is not fully validated, neither at construction nor during + /// signing. This is OK as far as *ring*'s usage of the key is + /// concerned because *ring* never uses the value of `d` (*ring* always + /// uses `p`, `q`, `dP` and `dQ` via the Chinese Remainder Theorem, + /// instead). However, *ring*'s checks would not be sufficient for + /// validating a key pair for use by some other system; that other + /// system must check the value of `d` itself if `d` is to be used. + pub fn from_components( + components: &KeyPairComponents, + ) -> Result + where + Public: AsRef<[u8]>, + Private: AsRef<[u8]>, + { + let components = KeyPairComponents { + public_key: PublicKeyComponents { + n: components.public_key.n.as_ref(), + e: components.public_key.e.as_ref(), + }, + d: components.d.as_ref(), + p: components.p.as_ref(), + q: components.q.as_ref(), + dP: components.dP.as_ref(), + dQ: components.dQ.as_ref(), + qInv: components.qInv.as_ref(), + }; + Self::from_components_(&components, cpu::features()) + } + + fn from_components_( + &KeyPairComponents { + public_key, + d, + p, + q, + dP, + dQ, + qInv, + }: &KeyPairComponents<&[u8]>, + cpu_features: cpu::Features, + ) -> Result { + let d = untrusted::Input::from(d); + let p = untrusted::Input::from(p); + let q = untrusted::Input::from(q); + let dP = untrusted::Input::from(dP); + let dQ = untrusted::Input::from(dQ); + let qInv = untrusted::Input::from(qInv); + + // XXX: Some steps are done out of order, but the NIST steps are worded + // in such a way that it is clear that NIST intends for them to be done + // in order. TODO: Does this matter at all? + + // 6.4.1.4.3/6.4.1.2.1 - Step 1. + + // Step 1.a is omitted, as explained above. + + // Step 1.b is omitted per above. Instead, we check that the public + // modulus is 2048 to `PRIVATE_KEY_PUBLIC_MODULUS_MAX_BITS` bits. + // XXX: The maximum limit of 4096 bits is primarily due to lack of + // testing of larger key sizes; see, in particular, + // https://www.mail-archive.com/openssl-dev@openssl.org/msg44586.html + // and + // https://www.mail-archive.com/openssl-dev@openssl.org/msg44759.html. + // Also, this limit might help with memory management decisions later. + + // Step 1.c. We validate e >= 65537. + let n = untrusted::Input::from(public_key.n); + let e = untrusted::Input::from(public_key.e); + let public_key = PublicKey::from_modulus_and_exponent( + n, + e, + BitLength::from_bits(2048), + super::PRIVATE_KEY_PUBLIC_MODULUS_MAX_BITS, + PublicExponent::_65537, + cpu_features, + )?; + + let n_one = public_key.inner().n().oneRR(); + let n = &public_key.inner().n().value(cpu_features); + + // 6.4.1.4.3 says to skip 6.4.1.2.1 Step 2. + + // 6.4.1.4.3 Step 3. + + // Step 3.a is done below, out of order. + // Step 3.b is unneeded since `n_bits` is derived here from `n`. + + // 6.4.1.4.3 says to skip 6.4.1.2.1 Step 4. (We don't need to recover + // the prime factors since they are already given.) + + // 6.4.1.4.3 - Step 5. + + // Steps 5.a and 5.b are omitted, as explained above. + + let n_bits = public_key.inner().n().len_bits(); + + let p = PrivatePrime::new(p, n_bits, cpu_features)?; + let q = PrivatePrime::new(q, n_bits, cpu_features)?; + + // TODO: Step 5.i + // + // 3.b is unneeded since `n_bits` is derived here from `n`. + + // 6.4.1.4.3 - Step 3.a (out of order). + // + // Verify that p * q == n. We restrict ourselves to modular + // multiplication. We rely on the fact that we've verified + // 0 < q < p < n. 
We check that q and p are close to sqrt(n) and then + // assume that these preconditions are enough to let us assume that + // checking p * q == 0 (mod n) is equivalent to checking p * q == n. + let q_mod_n = q + .modulus + .to_elem(n) + .map_err(|error::Unspecified| KeyRejected::inconsistent_components())?; + let p_mod_n = p + .modulus + .to_elem(n) + .map_err(|error::Unspecified| KeyRejected::inconsistent_components())?; + let p_mod_n = bigint::elem_mul(n_one, p_mod_n, n); + let pq_mod_n = bigint::elem_mul(&q_mod_n, p_mod_n, n); + if !pq_mod_n.is_zero() { + return Err(KeyRejected::inconsistent_components()); + } + + // 6.4.1.4.3/6.4.1.2.1 - Step 6. + + // Step 6.a, partial. + // + // First, validate `2**half_n_bits < d`. Since 2**half_n_bits has a bit + // length of half_n_bits + 1, this check gives us 2**half_n_bits <= d, + // and knowing d is odd makes the inequality strict. + let d = bigint::OwnedModulusValue::::from_be_bytes(d) + .map_err(|_| KeyRejected::invalid_component())?; + if !(n_bits.half_rounded_up() < d.len_bits()) { + return Err(KeyRejected::inconsistent_components()); + } + // XXX: This check should be `d < LCM(p - 1, q - 1)`, but we don't have + // a good way of calculating LCM, so it is omitted, as explained above. + d.verify_less_than(n) + .map_err(|error::Unspecified| KeyRejected::inconsistent_components())?; + + // Step 6.b is omitted as explained above. + + let pm = &p.modulus.modulus(cpu_features); + + // 6.4.1.4.3 - Step 7. + + // Step 7.c. + let qInv = bigint::Elem::from_be_bytes_padded(qInv, pm) + .map_err(|error::Unspecified| KeyRejected::invalid_component())?; + + // Steps 7.d and 7.e are omitted per the documentation above, and + // because we don't (in the long term) have a good way to do modulo + // with an even modulus. + + // Step 7.f. + let qInv = bigint::elem_mul(p.oneRR.as_ref(), qInv, pm); + let q_mod_p = bigint::elem_reduced(pm.alloc_zero(), &q_mod_n, pm, q.modulus.len_bits()); + let q_mod_p = bigint::elem_mul(p.oneRR.as_ref(), q_mod_p, pm); + bigint::verify_inverses_consttime(&qInv, q_mod_p, pm) + .map_err(|error::Unspecified| KeyRejected::inconsistent_components())?; + + // This should never fail since `n` and `e` were validated above. + + let p = PrivateCrtPrime::new(p, dP, cpu_features)?; + let q = PrivateCrtPrime::new(q, dQ, cpu_features)?; + + Ok(Self { + p, + q, + qInv, + public: public_key, + }) + } + + /// Returns a reference to the public key. + pub fn public(&self) -> &PublicKey { + &self.public + } + + /// Returns the length in bytes of the key pair's public modulus. + /// + /// A signature has the same length as the public modulus. + #[deprecated = "Use `public().modulus_len()`"] + #[inline] + pub fn public_modulus_len(&self) -> usize { + self.public().modulus_len() + } +} + +impl signature::KeyPair for KeyPair { + type PublicKey = PublicKey; + + fn public_key(&self) -> &Self::PublicKey { + self.public() + } +} + +struct PrivatePrime { + modulus: bigint::OwnedModulus, + oneRR: bigint::One, +} + +impl PrivatePrime { + fn new( + p: untrusted::Input, + n_bits: BitLength, + cpu_features: cpu::Features, + ) -> Result { + let p = bigint::OwnedModulusValue::from_be_bytes(p)?; + + // 5.c / 5.g: + // + // TODO: First, stop if `p < (√2) * 2**((nBits/2) - 1)`. + // TODO: First, stop if `q < (√2) * 2**((nBits/2) - 1)`. + // + // Second, stop if `p > 2**(nBits/2) - 1`. + // Second, stop if `q > 2**(nBits/2) - 1`. 
+ if p.len_bits() != n_bits.half_rounded_up() { + return Err(KeyRejected::inconsistent_components()); + } + + if p.len_bits().as_bits() % 512 != 0 { + return Err(KeyRejected::private_modulus_len_not_multiple_of_512_bits()); + } + + // TODO: Step 5.d: Verify GCD(p - 1, e) == 1. + // TODO: Step 5.h: Verify GCD(q - 1, e) == 1. + + // Steps 5.e and 5.f are omitted as explained above. + let p = bigint::OwnedModulus::from(p); + let pm = p.modulus(cpu_features); + let oneRR = bigint::One::newRR(pm.alloc_zero(), &pm); + + Ok(Self { modulus: p, oneRR }) + } +} + +struct PrivateCrtPrime { + modulus: bigint::OwnedModulus, + oneRRR: bigint::One, + exponent: bigint::PrivateExponent, +} + +impl PrivateCrtPrime { + /// Constructs a `PrivateCrtPrime` from the private prime `p` and `dP` where + /// dP == d % (p - 1). + fn new( + p: PrivatePrime, + dP: untrusted::Input, + cpu_features: cpu::Features, + ) -> Result { + let m = &p.modulus.modulus(cpu_features); + // [NIST SP-800-56B rev. 1] 6.4.1.4.3 - Steps 7.a & 7.b. + let dP = bigint::PrivateExponent::from_be_bytes_padded(dP, m) + .map_err(|error::Unspecified| KeyRejected::inconsistent_components())?; + + // XXX: Steps 7.d and 7.e are omitted. We don't check that + // `dP == d % (p - 1)` because we don't (in the long term) have a good + // way to do modulo with an even modulus. Instead we just check that + // `1 <= dP < p - 1`. We'll check it, to some unknown extent, when we + // do the private key operation, since we verify that the result of the + // private key operation using the CRT parameters is consistent with `n` + // and `e`. TODO: Either prove that what we do is sufficient, or make + // it so. + + let oneRRR = bigint::One::newRRR(p.oneRR, m); + + Ok(Self { + modulus: p.modulus, + oneRRR, + exponent: dP, + }) + } +} + +fn elem_exp_consttime( + c: &bigint::Elem, + p: &PrivateCrtPrime, + other_prime_len_bits: BitLength, + cpu_features: cpu::Features, +) -> Result, error::Unspecified> { + let m = &p.modulus.modulus(cpu_features); + bigint::elem_exp_consttime( + m.alloc_zero(), + c, + &p.oneRRR, + &p.exponent, + m, + other_prime_len_bits, + ) + .map_err(error::erase::) +} + +// Type-level representations of the different moduli used in RSA signing, in +// addition to `super::N`. See `super::bigint`'s modulue-level documentation. + +enum P {} + +enum Q {} + +enum D {} + +impl KeyPair { + /// Computes the signature of `msg` and writes it into `signature`. + /// + /// `msg` is digested using the digest algorithm from `padding_alg` and the + /// digest is then padded using the padding algorithm from `padding_alg`. + /// + /// The signature it written into `signature`; `signature`'s length must be + /// exactly the length returned by `self::public().modulus_len()` or else + /// an error will be returned. On failure, `signature` may contain + /// intermediate results, but won't contain anything that would endanger the + /// private key. + /// + /// `rng` may be used to randomize the padding (e.g. for PSS). + /// + /// Many other crypto libraries have signing functions that takes a + /// precomputed digest as input, instead of the message to digest. This + /// function does *not* take a precomputed digest; instead, `sign` + /// calculates the digest itself. 
+ pub fn sign( + &self, + padding_alg: &'static dyn RsaEncoding, + rng: &dyn rand::SecureRandom, + msg: &[u8], + signature: &mut [u8], + ) -> Result<(), error::Unspecified> { + let cpu_features = cpu::features(); + + if signature.len() != self.public().modulus_len() { + return Err(error::Unspecified); + } + + let m_hash = digest::digest(padding_alg.digest_alg(), msg); + + // Use the output buffer as the scratch space for the signature to + // reduce the required stack space. + padding::encode( + padding_alg, + m_hash, + signature, + self.public().inner().n().len_bits(), + rng, + )?; + + // RFC 8017 Section 5.1.2: RSADP, using the Chinese Remainder Theorem + // with Garner's algorithm. + + // Steps 1 and 2. + let m = self.private_exponentiate(signature, cpu_features)?; + + // Step 3. + m.fill_be_bytes(signature); + + Ok(()) + } + + /// Returns base**d (mod n). + /// + /// This does not return or write any intermediate results into any buffers + /// that are provided by the caller so that no intermediate state will be + /// leaked that would endanger the private key. + /// + /// Panics if `in_out` is not `self.public().modulus_len()`. + fn private_exponentiate( + &self, + base: &[u8], + cpu_features: cpu::Features, + ) -> Result, error::Unspecified> { + assert_eq!(base.len(), self.public().modulus_len()); + + // RFC 8017 Section 5.1.2: RSADP, using the Chinese Remainder Theorem + // with Garner's algorithm. + + let n = &self.public.inner().n().value(cpu_features); + let n_one = self.public.inner().n().oneRR(); + + // Step 1. The value zero is also rejected. + let base = bigint::Elem::from_be_bytes_padded(untrusted::Input::from(base), n)?; + + // Step 2 + let c = base; + + // Step 2.b.i. + let q_bits = self.q.modulus.len_bits(); + let m_1 = elem_exp_consttime(&c, &self.p, q_bits, cpu_features)?; + let m_2 = elem_exp_consttime(&c, &self.q, self.p.modulus.len_bits(), cpu_features)?; + + // Step 2.b.ii isn't needed since there are only two primes. + + // Step 2.b.iii. + let h = { + let p = &self.p.modulus.modulus(cpu_features); + let m_2 = bigint::elem_reduced_once(p.alloc_zero(), &m_2, p, q_bits); + let m_1_minus_m_2 = bigint::elem_sub(m_1, &m_2, p); + bigint::elem_mul(&self.qInv, m_1_minus_m_2, p) + }; + + // Step 2.b.iv. The reduction in the modular multiplication isn't + // necessary because `h < p` and `p * q == n` implies `h * q < n`. + // Modular arithmetic is used simply to avoid implementing + // non-modular arithmetic. + let p_bits = self.p.modulus.len_bits(); + let h = bigint::elem_widen(n.alloc_zero(), h, n, p_bits)?; + let q_mod_n = self.q.modulus.to_elem(n)?; + let q_mod_n = bigint::elem_mul(n_one, q_mod_n, n); + let q_times_h = bigint::elem_mul(&q_mod_n, h, n); + let m_2 = bigint::elem_widen(n.alloc_zero(), m_2, n, q_bits)?; + let m = bigint::elem_add(m_2, q_times_h, n); + + // Step 2.b.v isn't needed since there are only two primes. + + // Verify the result to protect against fault attacks as described + // in "On the Importance of Checking Cryptographic Protocols for + // Faults" by Dan Boneh, Richard A. DeMillo, and Richard J. Lipton. + // This check is cheap assuming `e` is small, which is ensured during + // `KeyPair` construction. Note that this is the only validation of `e` + // that is done other than basic checks on its size, oddness, and + // minimum value, since the relationship of `e` to `d`, `p`, and `q` is + // not verified during `KeyPair` construction. 
+ { + let verify = n.alloc_zero(); + let verify = self + .public + .inner() + .exponentiate_elem(verify, &m, cpu_features); + bigint::elem_verify_equal_consttime(&verify, &c)?; + } + + // Step 3 will be done by the caller. + + Ok(m) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::testutil as test; + use alloc::vec; + + #[test] + fn test_rsakeypair_private_exponentiate() { + let cpu = cpu::features(); + test::run( + test_vector_file!("keypair_private_exponentiate_tests.txt"), + |section, test_case| { + assert_eq!(section, ""); + + let key = test_case.consume_bytes("Key"); + let key = KeyPair::from_pkcs8(&key).unwrap(); + let test_cases = &[ + test_case.consume_bytes("p"), + test_case.consume_bytes("p_plus_1"), + test_case.consume_bytes("p_minus_1"), + test_case.consume_bytes("q"), + test_case.consume_bytes("q_plus_1"), + test_case.consume_bytes("q_minus_1"), + ]; + for test_case in test_cases { + // THe call to `elem_verify_equal_consttime` will cause + // `private_exponentiate` to fail if the computation is + // incorrect. + let mut padded = vec![0; key.public.modulus_len()]; + let zeroes = padded.len() - test_case.len(); + padded[zeroes..].copy_from_slice(test_case); + let _: bigint::Elem<_> = key.private_exponentiate(&padded, cpu).unwrap(); + } + Ok(()) + }, + ); + } +} diff --git a/ring-0.17.14/src/rsa/keypair_components.rs b/ring-0.17.14/src/rsa/keypair_components.rs new file mode 100644 index 0000000000..66e76792ae --- /dev/null +++ b/ring-0.17.14/src/rsa/keypair_components.rs @@ -0,0 +1,38 @@ +use super::PublicKeyComponents; + +/// RSA key pair components. +#[derive(Clone, Copy)] +pub struct KeyPairComponents { + /// The public key components. + pub public_key: PublicKeyComponents, + + /// The private exponent. + pub d: Private, + + /// The first prime factor of `d`. + pub p: Private, + + /// The second prime factor of `d`. + pub q: Private, + + /// `p`'s public Chinese Remainder Theorem exponent. + pub dP: Private, + + /// `q`'s public Chinese Remainder Theorem exponent. + pub dQ: Private, + + /// `q**-1 mod p`. + pub qInv: Private, +} + +impl core::fmt::Debug for KeyPairComponents +where + PublicKeyComponents: core::fmt::Debug, +{ + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> Result<(), core::fmt::Error> { + // Non-public components are intentionally skipped + f.debug_struct("KeyPairComponents") + .field("public_key", &self.public_key) + .finish() + } +} diff --git a/ring-0.17.14/src/rsa/padding.rs b/ring-0.17.14/src/rsa/padding.rs new file mode 100644 index 0000000000..98a45fc3c1 --- /dev/null +++ b/ring-0.17.14/src/rsa/padding.rs @@ -0,0 +1,177 @@ +// Copyright 2015-2016 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
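
The signing path added in keypair.rs above (`sign` → padding → `private_exponentiate` via the CRT, plus the fault-attack verification step) is driven from the crate's public `ring::signature` API. A minimal caller-side sketch, not part of the vendored sources; the function name and the error mapping are illustrative:

```rust
use ring::{rand, signature};

fn sign_message(pkcs8_der: &[u8], msg: &[u8]) -> Result<Vec<u8>, ring::error::Unspecified> {
    // `from_pkcs8` performs the `KeyPair::from_components()` validation described above.
    let key_pair = signature::RsaKeyPair::from_pkcs8(pkcs8_der)
        .map_err(|_| ring::error::Unspecified)?;

    // The output buffer must be exactly modulus_len() bytes; `sign()` digests
    // `msg`, applies PKCS#1 v1.5 padding, and exponentiates via the CRT path.
    let rng = rand::SystemRandom::new();
    let mut sig = vec![0u8; key_pair.public().modulus_len()];
    key_pair.sign(&signature::RSA_PKCS1_SHA256, &rng, msg, &mut sig)?;
    Ok(sig)
}
```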
+ +use crate::{bb, bits, digest, error, rand}; + +mod pkcs1; +mod pss; + +pub use self::{ + pkcs1::{RSA_PKCS1_SHA256, RSA_PKCS1_SHA384, RSA_PKCS1_SHA512}, + pss::{RSA_PSS_SHA256, RSA_PSS_SHA384, RSA_PSS_SHA512}, +}; +pub(super) use pkcs1::RSA_PKCS1_SHA1_FOR_LEGACY_USE_ONLY; + +/// Common features of both RSA padding encoding and RSA padding verification. +pub trait Padding: 'static + Sync + crate::sealed::Sealed + core::fmt::Debug { + // The digest algorithm used for digesting the message (and maybe for + // other things). + fn digest_alg(&self) -> &'static digest::Algorithm; +} + +pub(super) fn encode( + encoding: &dyn RsaEncoding, + m_hash: digest::Digest, + m_out: &mut [u8], + mod_bits: bits::BitLength, + rng: &dyn rand::SecureRandom, +) -> Result<(), error::Unspecified> { + #[allow(deprecated)] + encoding.encode(m_hash, m_out, mod_bits, rng) +} + +/// An RSA signature encoding as described in [RFC 3447 Section 8]. +/// +/// [RFC 3447 Section 8]: https://tools.ietf.org/html/rfc3447#section-8 +#[cfg(feature = "alloc")] +pub trait RsaEncoding: Padding { + #[deprecated(note = "internal API that will be removed")] + #[doc(hidden)] + fn encode( + &self, + m_hash: digest::Digest, + m_out: &mut [u8], + mod_bits: bits::BitLength, + rng: &dyn rand::SecureRandom, + ) -> Result<(), error::Unspecified>; +} + +/// Verification of an RSA signature encoding as described in +/// [RFC 3447 Section 8]. +/// +/// [RFC 3447 Section 8]: https://tools.ietf.org/html/rfc3447#section-8 +pub trait Verification: Padding { + fn verify( + &self, + m_hash: digest::Digest, + m: &mut untrusted::Reader, + mod_bits: bits::BitLength, + ) -> Result<(), error::Unspecified>; +} + +// Masks `out` with the output of the mask-generating function MGF1 as +// described in https://tools.ietf.org/html/rfc3447#appendix-B.2.1. +fn mgf1(digest_alg: &'static digest::Algorithm, seed: &[u8], out: &mut [u8]) { + let digest_len = digest_alg.output_len(); + + // Maximum counter value is the value of (mask_len / digest_len) rounded up. + for (i, out) in out.chunks_mut(digest_len).enumerate() { + let mut ctx = digest::Context::new(digest_alg); + ctx.update(seed); + // The counter will always fit in a `u32` because we reject absurdly + // long inputs very early. + ctx.update(&u32::to_be_bytes(i.try_into().unwrap())); + let digest = ctx.finish(); + + // The last chunk may legitimately be shorter than `digest`, but + // `digest` will never be shorter than `out`. + bb::xor_assign_at_start(out, digest.as_ref()); + } +} + +#[cfg(test)] +mod test { + use super::*; + use crate::testutil as test; + use crate::{digest, error}; + use alloc::vec; + + #[test] + fn test_pss_padding_verify() { + test::run( + test_vector_file!("rsa_pss_padding_tests.txt"), + |section, test_case| { + assert_eq!(section, ""); + + let digest_name = test_case.consume_string("Digest"); + let alg = match digest_name.as_ref() { + "SHA256" => &RSA_PSS_SHA256, + "SHA384" => &RSA_PSS_SHA384, + "SHA512" => &RSA_PSS_SHA512, + _ => panic!("Unsupported digest: {}", digest_name), + }; + + let msg = test_case.consume_bytes("Msg"); + let msg = untrusted::Input::from(&msg); + let m_hash = digest::digest(alg.digest_alg(), msg.as_slice_less_safe()); + + let encoded = test_case.consume_bytes("EM"); + let encoded = untrusted::Input::from(&encoded); + + // Salt is recomputed in verification algorithm. 
+ let _ = test_case.consume_bytes("Salt"); + + let bit_len = test_case.consume_usize_bits("Len"); + let is_valid = test_case.consume_string("Result") == "P"; + + let actual_result = + encoded.read_all(error::Unspecified, |m| alg.verify(m_hash, m, bit_len)); + assert_eq!(actual_result.is_ok(), is_valid); + + Ok(()) + }, + ); + } + + // Tests PSS encoding for various public modulus lengths. + #[cfg(feature = "alloc")] + #[test] + fn test_pss_padding_encode() { + test::run( + test_vector_file!("rsa_pss_padding_tests.txt"), + |section, test_case| { + assert_eq!(section, ""); + + let digest_name = test_case.consume_string("Digest"); + let alg = match digest_name.as_ref() { + "SHA256" => &RSA_PSS_SHA256, + "SHA384" => &RSA_PSS_SHA384, + "SHA512" => &RSA_PSS_SHA512, + _ => panic!("Unsupported digest: {}", digest_name), + }; + + let msg = test_case.consume_bytes("Msg"); + let salt = test_case.consume_bytes("Salt"); + let encoded = test_case.consume_bytes("EM"); + let bit_len = test_case.consume_usize_bits("Len"); + let expected_result = test_case.consume_string("Result"); + + // Only test the valid outputs + if expected_result != "P" { + return Ok(()); + } + + let rng = test::rand::FixedSliceRandom { bytes: &salt }; + + let mut m_out = vec![0u8; bit_len.as_usize_bytes_rounded_up()]; + let digest = digest::digest(alg.digest_alg(), &msg); + #[allow(deprecated)] + alg.encode(digest, &mut m_out, bit_len, &rng).unwrap(); + assert_eq!(m_out, encoded); + + Ok(()) + }, + ); + } +} diff --git a/ring-0.17.14/src/rsa/padding/pkcs1.rs b/ring-0.17.14/src/rsa/padding/pkcs1.rs new file mode 100644 index 0000000000..880b1dc115 --- /dev/null +++ b/ring-0.17.14/src/rsa/padding/pkcs1.rs @@ -0,0 +1,177 @@ +// Copyright 2015-2016 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +use super::{super::PUBLIC_KEY_PUBLIC_MODULUS_MAX_LEN, Padding, RsaEncoding, Verification}; +use crate::{bits, digest, error, io::der, rand}; + +/// PKCS#1 1.5 padding as described in [RFC 3447 Section 8.2]. +/// +/// See "`RSA_PSS_*` Details\" in `ring::signature`'s module-level +/// documentation for more details. 
+/// +/// [RFC 3447 Section 8.2]: https://tools.ietf.org/html/rfc3447#section-8.2 +#[derive(Debug)] +pub struct PKCS1 { + digest_alg: &'static digest::Algorithm, + digestinfo_prefix: &'static [u8], +} + +impl crate::sealed::Sealed for PKCS1 {} + +impl Padding for PKCS1 { + fn digest_alg(&self) -> &'static digest::Algorithm { + self.digest_alg + } +} + +impl RsaEncoding for PKCS1 { + fn encode( + &self, + m_hash: digest::Digest, + m_out: &mut [u8], + _mod_bits: bits::BitLength, + _rng: &dyn rand::SecureRandom, + ) -> Result<(), error::Unspecified> { + pkcs1_encode(self, m_hash, m_out); + Ok(()) + } +} + +impl Verification for PKCS1 { + fn verify( + &self, + m_hash: digest::Digest, + m: &mut untrusted::Reader, + mod_bits: bits::BitLength, + ) -> Result<(), error::Unspecified> { + // `mod_bits.as_usize_bytes_rounded_up() <= + // PUBLIC_KEY_PUBLIC_MODULUS_MAX_LEN` is ensured by `verify_rsa_()`. + let mut calculated = [0u8; PUBLIC_KEY_PUBLIC_MODULUS_MAX_LEN]; + let calculated = &mut calculated[..mod_bits.as_usize_bytes_rounded_up()]; + pkcs1_encode(self, m_hash, calculated); + if m.read_bytes_to_end().as_slice_less_safe() != calculated { + return Err(error::Unspecified); + } + Ok(()) + } +} + +// Implement padding procedure per EMSA-PKCS1-v1_5, +// https://tools.ietf.org/html/rfc3447#section-9.2. This is used by both +// verification and signing so it needs to be able to handle moduli of the +// minimum and maximum sizes for both operations. +fn pkcs1_encode(pkcs1: &PKCS1, m_hash: digest::Digest, m_out: &mut [u8]) { + let em = m_out; + + let digest_len = pkcs1.digestinfo_prefix.len() + pkcs1.digest_alg.output_len(); + + // The specification requires at least 8 bytes of padding. Since we + // disallow keys smaller than 1024 bits, this should always be true. + assert!(em.len() >= digest_len + 11); + let pad_len = em.len() - digest_len - 3; + em[0] = 0; + em[1] = 1; + for i in 0..pad_len { + em[2 + i] = 0xff; + } + em[2 + pad_len] = 0; + + let (digest_prefix, digest_dst) = em[3 + pad_len..].split_at_mut(pkcs1.digestinfo_prefix.len()); + digest_prefix.copy_from_slice(pkcs1.digestinfo_prefix); + digest_dst.copy_from_slice(m_hash.as_ref()); +} + +macro_rules! rsa_pkcs1_padding { + ( $vis:vis $PADDING_ALGORITHM:ident, $digest_alg:expr, $digestinfo_prefix:expr, + $doc_str:expr ) => { + #[doc=$doc_str] + $vis static $PADDING_ALGORITHM: PKCS1 = PKCS1 { + digest_alg: $digest_alg, + digestinfo_prefix: $digestinfo_prefix, + }; + }; +} + +// Intentionally not exposed except internally for signature verification. At a +// minimum, we'd need to create test vectors for signing with it, which we +// don't currently have. But, it's a bad idea to use SHA-1 anyway, so perhaps +// we just won't ever expose it. +rsa_pkcs1_padding!( + pub(in super::super) RSA_PKCS1_SHA1_FOR_LEGACY_USE_ONLY, + &digest::SHA1_FOR_LEGACY_USE_ONLY, + &SHA1_PKCS1_DIGESTINFO_PREFIX, + "PKCS#1 1.5 padding using SHA-1 for RSA signatures." +); + +rsa_pkcs1_padding!( + pub RSA_PKCS1_SHA256, + &digest::SHA256, + &SHA256_PKCS1_DIGESTINFO_PREFIX, + "PKCS#1 1.5 padding using SHA-256 for RSA signatures." +); + +rsa_pkcs1_padding!( + pub RSA_PKCS1_SHA384, + &digest::SHA384, + &SHA384_PKCS1_DIGESTINFO_PREFIX, + "PKCS#1 1.5 padding using SHA-384 for RSA signatures." +); + +rsa_pkcs1_padding!( + pub RSA_PKCS1_SHA512, + &digest::SHA512, + &SHA512_PKCS1_DIGESTINFO_PREFIX, + "PKCS#1 1.5 padding using SHA-512 for RSA signatures." +); + +macro_rules! 
pkcs1_digestinfo_prefix { + ( $name:ident, $digest_len:expr, $digest_oid_len:expr, + [ $( $digest_oid:expr ),* ] ) => { + static $name: [u8; 2 + 8 + $digest_oid_len] = [ + der::Tag::Sequence.into(), 8 + $digest_oid_len + $digest_len, + der::Tag::Sequence.into(), 2 + $digest_oid_len + 2, + der::Tag::OID.into(), $digest_oid_len, $( $digest_oid ),*, + der::Tag::Null.into(), 0, + der::Tag::OctetString.into(), $digest_len, + ]; + } +} + +pkcs1_digestinfo_prefix!( + SHA1_PKCS1_DIGESTINFO_PREFIX, + 20, + 5, + [0x2b, 0x0e, 0x03, 0x02, 0x1a] +); + +pkcs1_digestinfo_prefix!( + SHA256_PKCS1_DIGESTINFO_PREFIX, + 32, + 9, + [0x60, 0x86, 0x48, 0x01, 0x65, 0x03, 0x04, 0x02, 0x01] +); + +pkcs1_digestinfo_prefix!( + SHA384_PKCS1_DIGESTINFO_PREFIX, + 48, + 9, + [0x60, 0x86, 0x48, 0x01, 0x65, 0x03, 0x04, 0x02, 0x02] +); + +pkcs1_digestinfo_prefix!( + SHA512_PKCS1_DIGESTINFO_PREFIX, + 64, + 9, + [0x60, 0x86, 0x48, 0x01, 0x65, 0x03, 0x04, 0x02, 0x03] +); diff --git a/ring-0.17.14/src/rsa/padding/pss.rs b/ring-0.17.14/src/rsa/padding/pss.rs new file mode 100644 index 0000000000..8dd126ccc0 --- /dev/null +++ b/ring-0.17.14/src/rsa/padding/pss.rs @@ -0,0 +1,289 @@ +// Copyright 2015-2016 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +use super::{super::PUBLIC_KEY_PUBLIC_MODULUS_MAX_LEN, mgf1, Padding, RsaEncoding, Verification}; +use crate::{bb, bits, digest, error, rand}; + +/// RSA PSS padding as described in [RFC 3447 Section 8.1]. +/// +/// See "`RSA_PSS_*` Details\" in `ring::signature`'s module-level +/// documentation for more details. +/// +/// [RFC 3447 Section 8.1]: https://tools.ietf.org/html/rfc3447#section-8.1 +#[allow(clippy::upper_case_acronyms)] // TODO: Until we implement cargo-semver-checks +#[derive(Debug)] +pub struct PSS { + digest_alg: &'static digest::Algorithm, +} + +impl crate::sealed::Sealed for PSS {} + +impl Padding for PSS { + fn digest_alg(&self) -> &'static digest::Algorithm { + self.digest_alg + } +} + +impl RsaEncoding for PSS { + // Implement padding procedure per EMSA-PSS, + // https://tools.ietf.org/html/rfc3447#section-9.1. + fn encode( + &self, + m_hash: digest::Digest, + m_out: &mut [u8], + mod_bits: bits::BitLength, + rng: &dyn rand::SecureRandom, + ) -> Result<(), error::Unspecified> { + let metrics = PSSMetrics::new(self.digest_alg, mod_bits)?; + + // The `m_out` this function fills is the big-endian-encoded value of `m` + // from the specification, padded to `k` bytes, where `k` is the length + // in bytes of the public modulus. The spec says "Note that emLen will + // be one less than k if modBits - 1 is divisible by 8 and equal to k + // otherwise." In other words we might need to prefix `em` with a + // leading zero byte to form a correct value of `m`. + let em = if metrics.top_byte_mask == 0xff { + m_out[0] = 0; + &mut m_out[1..] 
+ } else { + m_out + }; + assert_eq!(em.len(), metrics.em_len); + + // Steps 1 and 2 are done by the caller to produce `m_hash`. + + // Step 3 is done by `PSSMetrics::new()` above. + + let (db, digest_terminator) = em.split_at_mut(metrics.db_len); + + let separator_pos = db.len() - 1 - metrics.s_len; + + // Step 4. + let salt: &[u8] = { + let salt = &mut db[(separator_pos + 1)..]; + rng.fill(salt)?; // salt + salt + }; + + // Steps 5 and 6. + let h = pss_digest(self.digest_alg, m_hash, salt); + + // Step 7. + db[..separator_pos].fill(0); // ps + + // Step 8. + db[separator_pos] = 0x01; + + // Steps 9 and 10. + mgf1(self.digest_alg, h.as_ref(), db); + + // Step 11. + db[0] &= metrics.top_byte_mask; + + // Step 12. + digest_terminator[..metrics.h_len].copy_from_slice(h.as_ref()); + digest_terminator[metrics.h_len] = 0xbc; + + Ok(()) + } +} + +impl Verification for PSS { + // RSASSA-PSS-VERIFY from https://tools.ietf.org/html/rfc3447#section-8.1.2 + // where steps 1, 2(a), and 2(b) have been done for us. + fn verify( + &self, + m_hash: digest::Digest, + m: &mut untrusted::Reader, + mod_bits: bits::BitLength, + ) -> Result<(), error::Unspecified> { + let metrics = PSSMetrics::new(self.digest_alg, mod_bits)?; + + // RSASSA-PSS-VERIFY Step 2(c). The `m` this function is given is the + // big-endian-encoded value of `m` from the specification, padded to + // `k` bytes, where `k` is the length in bytes of the public modulus. + // The spec. says "Note that emLen will be one less than k if + // modBits - 1 is divisible by 8 and equal to k otherwise," where `k` + // is the length in octets of the RSA public modulus `n`. In other + // words, `em` might have an extra leading zero byte that we need to + // strip before we start the PSS decoding steps which is an artifact of + // the `Verification` interface. + if metrics.top_byte_mask == 0xff { + if m.read_byte()? != 0 { + return Err(error::Unspecified); + } + }; + let em = m; + + // The rest of this function is EMSA-PSS-VERIFY from + // https://tools.ietf.org/html/rfc3447#section-9.1.2. + + // Steps 1 and 2 are done by the caller to produce `m_hash`. + + // Step 3 is done by `PSSMetrics::new()` above. + + // Step 5, out of order. + let masked_db = em.read_bytes(metrics.db_len)?; + let h_hash = em.read_bytes(metrics.h_len)?; + + // Step 4. + if em.read_byte()? != 0xbc { + return Err(error::Unspecified); + } + + // Step 7. + let mut db = [0u8; PUBLIC_KEY_PUBLIC_MODULUS_MAX_LEN]; + let db = &mut db[..metrics.db_len]; + + mgf1(self.digest_alg, h_hash.as_slice_less_safe(), db); + + masked_db.read_all(error::Unspecified, |masked_bytes| { + // Step 6. Check the top bits of first byte are zero. + let b = masked_bytes.read_byte()?; + if b & !metrics.top_byte_mask != 0 { + return Err(error::Unspecified); + } + db[0] ^= b; + + // Step 8. + let db_rest = &mut db[1..]; + let masked_bytes = masked_bytes.read_bytes(db_rest.len())?; + bb::xor_assign_at_start(db_rest, masked_bytes.as_slice_less_safe()); + Ok(()) + })?; + + // Step 9. + db[0] &= metrics.top_byte_mask; + + // Step 10. + let ps_len = metrics.ps_len; + if db[0..ps_len].iter().any(|&db| db != 0) { + return Err(error::Unspecified); + } + if db[metrics.ps_len] != 1 { + return Err(error::Unspecified); + } + + // Step 11. + let salt = &db[(db.len() - metrics.s_len)..]; + + // Step 12 and 13. + let h_prime = pss_digest(self.digest_alg, m_hash, salt); + + // Step 14. 
+ if h_hash.as_slice_less_safe() != h_prime.as_ref() { + return Err(error::Unspecified); + } + + Ok(()) + } +} + +struct PSSMetrics { + #[cfg_attr(not(feature = "alloc"), allow(dead_code))] + em_len: usize, + db_len: usize, + ps_len: usize, + s_len: usize, + h_len: usize, + top_byte_mask: u8, +} + +impl PSSMetrics { + fn new( + digest_alg: &'static digest::Algorithm, + mod_bits: bits::BitLength, + ) -> Result { + let em_bits = mod_bits.try_sub_1()?; + let em_len = em_bits.as_usize_bytes_rounded_up(); + let leading_zero_bits = (8 * em_len) - em_bits.as_bits(); + debug_assert!(leading_zero_bits < 8); + let top_byte_mask = 0xffu8 >> leading_zero_bits; + + let h_len = digest_alg.output_len(); + + // We require the salt length to be equal to the digest length. + let s_len = h_len; + + // Step 3 of both `EMSA-PSS-ENCODE` is `EMSA-PSS-VERIFY` requires that + // we reject inputs where "emLen < hLen + sLen + 2". The definition of + // `emBits` in RFC 3447 Sections 9.1.1 and 9.1.2 says `emBits` must be + // "at least 8hLen + 8sLen + 9". Since 9 bits requires two bytes, these + // two conditions are equivalent. 9 bits are required as the 0x01 + // before the salt requires 1 bit and the 0xbc after the digest + // requires 8 bits. + let db_len = em_len.checked_sub(1 + s_len).ok_or(error::Unspecified)?; + let ps_len = db_len.checked_sub(h_len + 1).ok_or(error::Unspecified)?; + + debug_assert!(em_bits.as_bits() >= (8 * h_len) + (8 * s_len) + 9); + + Ok(Self { + em_len, + db_len, + ps_len, + s_len, + h_len, + top_byte_mask, + }) + } +} + +fn pss_digest( + digest_alg: &'static digest::Algorithm, + m_hash: digest::Digest, + salt: &[u8], +) -> digest::Digest { + // Fixed prefix. + const PREFIX_ZEROS: [u8; 8] = [0u8; 8]; + + // Encoding step 5 and 6, Verification step 12 and 13. + let mut ctx = digest::Context::new(digest_alg); + ctx.update(&PREFIX_ZEROS); + ctx.update(m_hash.as_ref()); + ctx.update(salt); + ctx.finish() +} + +macro_rules! rsa_pss_padding { + ( $vis:vis $PADDING_ALGORITHM:ident, $digest_alg:expr, $doc_str:expr ) => { + #[doc=$doc_str] + $vis static $PADDING_ALGORITHM: PSS = PSS { + digest_alg: $digest_alg, + }; + }; +} + +rsa_pss_padding!( + pub RSA_PSS_SHA256, + &digest::SHA256, + "RSA PSS padding using SHA-256 for RSA signatures.\n\nSee + \"`RSA_PSS_*` Details\" in `ring::signature`'s module-level + documentation for more details." +); + +rsa_pss_padding!( + pub RSA_PSS_SHA384, + &digest::SHA384, + "RSA PSS padding using SHA-384 for RSA signatures.\n\nSee + \"`RSA_PSS_*` Details\" in `ring::signature`'s module-level + documentation for more details." +); + +rsa_pss_padding!( + pub RSA_PSS_SHA512, + &digest::SHA512, + "RSA PSS padding using SHA-512 for RSA signatures.\n\nSee + \"`RSA_PSS_*` Details\" in `ring::signature`'s module-level + documentation for more details." +); diff --git a/ring-0.17.14/src/rsa/public_exponent.rs b/ring-0.17.14/src/rsa/public_exponent.rs new file mode 100644 index 0000000000..b87b30b217 --- /dev/null +++ b/ring-0.17.14/src/rsa/public_exponent.rs @@ -0,0 +1,104 @@ +use crate::error; +use crate::polyfill::{unwrap_const, ArrayFlatMap, LeadingZerosStripped}; +use core::num::NonZeroU64; + +/// The exponent `e` of an RSA public key. 
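+///
+/// Exponents are restricted to odd values of at most 33 bits (see `MAX`
+/// below); `from_be_bytes` additionally enforces a caller-supplied minimum,
+/// e.g. `PublicExponent::_3` for verification or `PublicExponent::_65537`
+/// for signing.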
+#[derive(Clone, Copy)] +pub struct PublicExponent(NonZeroU64); + +impl PublicExponent { + #[cfg(test)] + const ALL_CONSTANTS: [Self; 3] = [Self::_3, Self::_65537, Self::MAX]; + + pub(super) const _3: Self = Self(unwrap_const(NonZeroU64::new(3))); + pub(super) const _65537: Self = Self(unwrap_const(NonZeroU64::new(65537))); + + // This limit was chosen to bound the performance of the simple + // exponentiation-by-squaring implementation in `elem_exp_vartime`. In + // particular, it helps mitigate theoretical resource exhaustion attacks. 33 + // bits was chosen as the limit based on the recommendations in [1] and + // [2]. Windows CryptoAPI (at least older versions) doesn't support values + // larger than 32 bits [3], so it is unlikely that exponents larger than 32 + // bits are being used for anything Windows commonly does. + // + // [1] https://www.imperialviolet.org/2012/03/16/rsae.html + // [2] https://www.imperialviolet.org/2012/03/17/rsados.html + // [3] https://msdn.microsoft.com/en-us/library/aa387685(VS.85).aspx + const MAX: Self = Self(unwrap_const(NonZeroU64::new((1u64 << 33) - 1))); + + pub(super) fn from_be_bytes( + input: untrusted::Input, + min_value: Self, + ) -> Result { + // See `PublicKey::from_modulus_and_exponent` for background on the step + // numbering. + + if input.len() > 5 { + return Err(error::KeyRejected::too_large()); + } + let value = input.read_all(error::KeyRejected::invalid_encoding(), |input| { + // The exponent can't be zero and it can't be prefixed with + // zero-valued bytes. + if input.peek(0) { + return Err(error::KeyRejected::invalid_encoding()); + } + let mut value = 0u64; + loop { + let byte = input + .read_byte() + .map_err(|untrusted::EndOfInput| error::KeyRejected::invalid_encoding())?; + value = (value << 8) | u64::from(byte); + if input.at_end() { + return Ok(value); + } + } + })?; + + // Step 2 / Step b. NIST SP800-89 defers to FIPS 186-3, which requires + // `e >= 65537`. We enforce this when signing, but are more flexible in + // verification, for compatibility. Only small public exponents are + // supported. + let value = NonZeroU64::new(value).ok_or_else(error::KeyRejected::too_small)?; + if value < min_value.0 { + return Err(error::KeyRejected::too_small()); + } + if value > Self::MAX.0 { + return Err(error::KeyRejected::too_large()); + } + + // Step 3 / Step c. + if value.get() & 1 != 1 { + return Err(error::KeyRejected::invalid_component()); + } + + Ok(Self(value)) + } + + /// The big-endian encoding of the exponent. + /// + /// There are no leading zeros. + pub fn be_bytes(&self) -> impl ExactSizeIterator + Clone + '_ { + // The `unwrap()` won't fail as `self.0` is only a few bytes long. + let bytes = ArrayFlatMap::new(core::iter::once(self.0.get()), u64::to_be_bytes).unwrap(); + LeadingZerosStripped::new(bytes) + } + + pub(super) fn value(self) -> NonZeroU64 { + self.0 + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_public_exponent_constants() { + for value in PublicExponent::ALL_CONSTANTS.iter() { + let value: u64 = value.0.into(); + assert_eq!(value & 1, 1); + assert!(value >= PublicExponent::_3.0.into()); // The absolute minimum. + assert!(value <= PublicExponent::MAX.0.into()); + } + } +} diff --git a/ring-0.17.14/src/rsa/public_key.rs b/ring-0.17.14/src/rsa/public_key.rs new file mode 100644 index 0000000000..87e1a30d6e --- /dev/null +++ b/ring-0.17.14/src/rsa/public_key.rs @@ -0,0 +1,228 @@ +// Copyright 2015-2021 Brian Smith. 
+// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +use super::{PublicExponent, PublicModulus, N, PUBLIC_KEY_PUBLIC_MODULUS_MAX_LEN}; +use crate::{ + arithmetic::bigint, + bits, cpu, error, + io::{self, der, der_writer}, + limb::LIMB_BYTES, +}; +use alloc::boxed::Box; +use core::num::NonZeroU64; + +/// An RSA Public Key. +#[derive(Clone)] +pub struct PublicKey { + inner: Inner, + serialized: Box<[u8]>, +} + +derive_debug_self_as_ref_hex_bytes!(PublicKey); + +impl PublicKey { + pub(super) fn from_modulus_and_exponent( + n: untrusted::Input, + e: untrusted::Input, + n_min_bits: bits::BitLength, + n_max_bits: bits::BitLength, + e_min_value: PublicExponent, + cpu_features: cpu::Features, + ) -> Result { + let inner = Inner::from_modulus_and_exponent( + n, + e, + n_min_bits, + n_max_bits, + e_min_value, + cpu_features, + )?; + + let n_bytes = n; + let e_bytes = e; + + // TODO: Remove this re-parsing, and stop allocating this here. + // Instead we should serialize on demand without allocation, from + // `Modulus::be_bytes()` and `Exponent::be_bytes()`. Once this is + // fixed, merge `Inner` back into `PublicKey`. + let n_bytes = io::Positive::from_be_bytes(n_bytes) + .map_err(|_: error::Unspecified| error::KeyRejected::unexpected_error())?; + let e_bytes = io::Positive::from_be_bytes(e_bytes) + .map_err(|_: error::Unspecified| error::KeyRejected::unexpected_error())?; + let serialized = der_writer::write_all(der::Tag::Sequence, &|output| { + der_writer::write_positive_integer(output, &n_bytes)?; + der_writer::write_positive_integer(output, &e_bytes) + }) + .map_err(|_: io::TooLongError| error::KeyRejected::unexpected_error())?; + + Ok(Self { inner, serialized }) + } + + /// The length, in bytes, of the public modulus. + /// + /// The modulus length is rounded up to a whole number of bytes if its + /// bit length isn't a multiple of 8. + pub fn modulus_len(&self) -> usize { + self.inner.n().len_bits().as_usize_bytes_rounded_up() + } + + pub(super) fn inner(&self) -> &Inner { + &self.inner + } +} + +/// `PublicKey` but without any superfluous allocations, optimized for one-shot +/// RSA signature verification. +#[derive(Clone)] +pub(crate) struct Inner { + n: PublicModulus, + e: PublicExponent, +} + +impl Inner { + pub(super) fn from_modulus_and_exponent( + n: untrusted::Input, + e: untrusted::Input, + n_min_bits: bits::BitLength, + n_max_bits: bits::BitLength, + e_min_value: PublicExponent, + cpu_features: cpu::Features, + ) -> Result { + // This is an incomplete implementation of NIST SP800-56Br1 Section + // 6.4.2.2, "Partial Public-Key Validation for RSA." That spec defers + // to NIST SP800-89 Section 5.3.3, "(Explicit) Partial Public Key + // Validation for RSA," "with the caveat that the length of the modulus + // shall be a length that is specified in this Recommendation." 
In + // SP800-89, two different sets of steps are given, one set numbered, + // and one set lettered. TODO: Document this in the end-user + // documentation for RSA keys. + + let n = PublicModulus::from_be_bytes(n, n_min_bits..=n_max_bits, cpu_features)?; + + let e = PublicExponent::from_be_bytes(e, e_min_value)?; + + // If `n` is less than `e` then somebody has probably accidentally swapped + // them. The largest acceptable `e` is smaller than the smallest acceptable + // `n`, so no additional checks need to be done. + + // XXX: Steps 4 & 5 / Steps d, e, & f are not implemented. This is also the + // case in most other commonly-used crypto libraries. + + Ok(Self { n, e }) + } + + /// The public modulus. + #[inline] + pub(super) fn n(&self) -> &PublicModulus { + &self.n + } + + /// The public exponent. + #[inline] + pub(super) fn e(&self) -> PublicExponent { + self.e + } + + /// Calculates base**e (mod n), filling the first part of `out_buffer` with + /// the result. + /// + /// This is constant-time with respect to the value in `base` (only). + /// + /// The result will be a slice of the encoded bytes of the result within + /// `out_buffer`, if successful. + pub(super) fn exponentiate<'out>( + &self, + base: untrusted::Input, + out_buffer: &'out mut [u8; PUBLIC_KEY_PUBLIC_MODULUS_MAX_LEN], + cpu_features: cpu::Features, + ) -> Result<&'out [u8], error::Unspecified> { + let n = &self.n.value(cpu_features); + + // The encoded value of the base must be the same length as the modulus, + // in bytes. + if base.len() != self.n.len_bits().as_usize_bytes_rounded_up() { + return Err(error::Unspecified); + } + + // RFC 8017 Section 5.2.2: RSAVP1. + + // Step 1. + let s = bigint::Elem::from_be_bytes_padded(base, n)?; + if s.is_zero() { + return Err(error::Unspecified); + } + + // Step 2. + let m = n.alloc_zero(); + let m = self.exponentiate_elem(m, &s, cpu_features); + + // Step 3. + Ok(fill_be_bytes_n(m, self.n.len_bits(), out_buffer)) + } + + /// Calculates base**e (mod n). + /// + /// This is constant-time with respect to `base` only. + pub(super) fn exponentiate_elem( + &self, + out: bigint::Storage, + base: &bigint::Elem, + cpu_features: cpu::Features, + ) -> bigint::Elem { + // The exponent was already checked to be at least 3. + let exponent_without_low_bit = NonZeroU64::try_from(self.e.value().get() & !1).unwrap(); + // The exponent was already checked to be odd. + debug_assert_ne!(exponent_without_low_bit, self.e.value()); + + let n = &self.n.value(cpu_features); + + let tmp = n.alloc_zero(); + let base_r = bigint::elem_mul_into(tmp, self.n.oneRR(), base, n); + + // During RSA public key operations the exponent is almost always either + // 65537 (0b10000000000000001) or 3 (0b11), both of which have a Hamming + // weight of 2. The maximum bit length and maximum Hamming weight of the + // exponent is bounded by the value of `PublicExponent::MAX`. + let acc = bigint::elem_exp_vartime(out, base_r, exponent_without_low_bit, n); + + // Now do the multiplication for the low bit and convert out of the Montgomery domain. + bigint::elem_mul(base, acc, n) + } +} + +// XXX: Refactor `signature::KeyPair` to get rid of this. +impl AsRef<[u8]> for PublicKey { + fn as_ref(&self) -> &[u8] { + &self.serialized + } +} + +/// Returns the big-endian representation of `elem` that is +/// the same length as the minimal-length big-endian representation of +/// the modulus `n`. +/// +/// `n_bits` must be the bit length of the public modulus `n`. 
+fn fill_be_bytes_n( + elem: bigint::Elem, + n_bits: bits::BitLength, + out: &mut [u8; PUBLIC_KEY_PUBLIC_MODULUS_MAX_LEN], +) -> &[u8] { + let n_bytes = n_bits.as_usize_bytes_rounded_up(); + let n_bytes_padded = ((n_bytes + (LIMB_BYTES - 1)) / LIMB_BYTES) * LIMB_BYTES; + let out = &mut out[..n_bytes_padded]; + elem.fill_be_bytes(out); + let (padding, out) = out.split_at(n_bytes_padded - n_bytes); + assert!(padding.iter().all(|&b| b == 0)); + out +} diff --git a/ring-0.17.14/src/rsa/public_key_components.rs b/ring-0.17.14/src/rsa/public_key_components.rs new file mode 100644 index 0000000000..3220d75175 --- /dev/null +++ b/ring-0.17.14/src/rsa/public_key_components.rs @@ -0,0 +1,52 @@ +// Copyright 2015-2021 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +use super::PublicKey; +use core::iter::FromIterator; + +/// RSA public key components. +/// +/// `B` must implement `AsRef<[u8]>` like `&[u8]` or `Vec`. +#[derive(Clone, Copy)] +pub struct PublicKeyComponents { + /// The public modulus, encoded in big-endian bytes without leading zeros. + pub n: B, + + /// The public exponent, encoded in big-endian bytes without leading zeros. + pub e: B, +} + +impl core::fmt::Debug for PublicKeyComponents +where + B: core::fmt::Debug, +{ + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> Result<(), core::fmt::Error> { + f.debug_struct("PublicKeyComponents") + .field("n", &self.n) + .field("e", &self.e) + .finish() + } +} + +impl From<&PublicKey> for PublicKeyComponents +where + B: FromIterator, +{ + fn from(public_key: &PublicKey) -> Self { + Self { + n: public_key.inner().n().be_bytes().collect(), + e: public_key.inner().e().be_bytes().collect(), + } + } +} diff --git a/ring-0.17.14/src/rsa/public_modulus.rs b/ring-0.17.14/src/rsa/public_modulus.rs new file mode 100644 index 0000000000..c40e49edb0 --- /dev/null +++ b/ring-0.17.14/src/rsa/public_modulus.rs @@ -0,0 +1,98 @@ +use crate::{ + arithmetic::{bigint, montgomery::RR}, + bits::{self, FromByteLen as _}, + cpu, + error::{self, InputTooLongError}, + rsa::N, +}; +use core::ops::RangeInclusive; + +/// The modulus (n) of an RSA public key. +pub struct PublicModulus { + value: bigint::OwnedModulus, + oneRR: bigint::One, +} + +impl Clone for PublicModulus { + fn clone(&self) -> Self { + let PublicModulus { value, oneRR } = self; + let value = value.clone(); + + // XXX: Shouldn't really be needed just to call `alloc_zero()`, + // but not worth optimizing away. 
+ let cpu = cpu::features(); + let n = value.modulus(cpu); + let oneRR = oneRR.clone_into(n.alloc_zero()); + + Self { value, oneRR } + } +} + +/* +impl core::fmt::Debug for PublicModulus { + fn fmt(&self, fmt: &mut ::core::fmt::Formatter) -> Result<(), ::core::fmt::Error> { + self.value.fmt(fmt) + } +}*/ + +impl PublicModulus { + pub(super) fn from_be_bytes( + n: untrusted::Input, + allowed_bit_lengths: RangeInclusive, + cpu_features: cpu::Features, + ) -> Result { + // See `PublicKey::from_modulus_and_exponent` for background on the step + // numbering. + + let min_bits = *allowed_bit_lengths.start(); + let max_bits = *allowed_bit_lengths.end(); + + // `pkcs1_encode` depends on this not being small. Otherwise, + // `pkcs1_encode` would generate padding that is invalid (too few 0xFF + // bytes) for very small keys. + const MIN_BITS: bits::BitLength = bits::BitLength::from_bits(1024); + + // Step 3 / Step c for `n` (out of order). + let value = bigint::OwnedModulusValue::from_be_bytes(n)?; + let bits = value.len_bits(); + + // Step 1 / Step a. XXX: SP800-56Br1 and SP800-89 require the length of + // the public modulus to be exactly 2048 or 3072 bits, but we are more + // flexible to be compatible with other commonly-used crypto libraries. + assert!(min_bits >= MIN_BITS); + let bits_rounded_up = bits::BitLength::from_byte_len(bits.as_usize_bytes_rounded_up()) + .map_err(error::erase::) + .unwrap(); // TODO: safe? + if bits_rounded_up < min_bits { + return Err(error::KeyRejected::too_small()); + } + if bits > max_bits { + return Err(error::KeyRejected::too_large()); + } + let value = bigint::OwnedModulus::from(value); + let m = value.modulus(cpu_features); + let oneRR = bigint::One::newRR(m.alloc_zero(), &m); + + Ok(Self { value, oneRR }) + } + + /// The big-endian encoding of the modulus. + /// + /// There are no leading zeros. + pub fn be_bytes(&self) -> impl ExactSizeIterator + Clone + '_ { + self.value.be_bytes() + } + + /// The length of the modulus in bits. + pub fn len_bits(&self) -> bits::BitLength { + self.value.len_bits() + } + + pub(super) fn value(&self, cpu_features: cpu::Features) -> bigint::Modulus { + self.value.modulus(cpu_features) + } + + pub(super) fn oneRR(&self) -> &bigint::Elem { + self.oneRR.as_ref() + } +} diff --git a/ring-0.17.14/src/rsa/signature_rsa_example_private_key.der b/ring-0.17.14/src/rsa/signature_rsa_example_private_key.der new file mode 100644 index 0000000000..47f08a48d7 Binary files /dev/null and b/ring-0.17.14/src/rsa/signature_rsa_example_private_key.der differ diff --git a/ring-0.17.14/src/rsa/signature_rsa_example_public_key.der b/ring-0.17.14/src/rsa/signature_rsa_example_public_key.der new file mode 100644 index 0000000000..19e944fc3e Binary files /dev/null and b/ring-0.17.14/src/rsa/signature_rsa_example_public_key.der differ diff --git a/ring-0.17.14/src/rsa/verification.rs b/ring-0.17.14/src/rsa/verification.rs new file mode 100644 index 0000000000..4ffbaec640 --- /dev/null +++ b/ring-0.17.14/src/rsa/verification.rs @@ -0,0 +1,231 @@ +// Copyright 2015-2016 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +//! Verification of RSA signatures. + +use super::{ + parse_public_key, public_key, PublicExponent, RsaParameters, PUBLIC_KEY_PUBLIC_MODULUS_MAX_LEN, +}; +use crate::{ + bits::{self, FromByteLen as _}, + cpu, digest, + error::{self, InputTooLongError}, + sealed, signature, +}; + +impl signature::VerificationAlgorithm for RsaParameters { + fn verify( + &self, + public_key: untrusted::Input, + msg: untrusted::Input, + signature: untrusted::Input, + ) -> Result<(), error::Unspecified> { + let (n, e) = parse_public_key(public_key)?; + verify_rsa_( + self, + ( + n.big_endian_without_leading_zero_as_input(), + e.big_endian_without_leading_zero_as_input(), + ), + msg, + signature, + cpu::features(), + ) + } +} + +impl sealed::Sealed for RsaParameters {} + +macro_rules! rsa_params { + ( $VERIFY_ALGORITHM:ident, $min_bits:expr, $PADDING_ALGORITHM:expr, + $doc_str:expr ) => { + #[doc=$doc_str] + /// + /// Only available in `alloc` mode. + pub static $VERIFY_ALGORITHM: RsaParameters = RsaParameters { + padding_alg: $PADDING_ALGORITHM, + min_bits: bits::BitLength::from_bits($min_bits), + }; + }; +} + +rsa_params!( + RSA_PKCS1_1024_8192_SHA1_FOR_LEGACY_USE_ONLY, + 1024, + &super::padding::RSA_PKCS1_SHA1_FOR_LEGACY_USE_ONLY, + "Verification of signatures using RSA keys of 1024-8192 bits, + PKCS#1.5 padding, and SHA-1.\n\nSee \"`RSA_PKCS1_*` Details\" in + `ring::signature`'s module-level documentation for more details." +); +rsa_params!( + RSA_PKCS1_2048_8192_SHA1_FOR_LEGACY_USE_ONLY, + 2048, + &super::padding::RSA_PKCS1_SHA1_FOR_LEGACY_USE_ONLY, + "Verification of signatures using RSA keys of 2048-8192 bits, + PKCS#1.5 padding, and SHA-1.\n\nSee \"`RSA_PKCS1_*` Details\" in + `ring::signature`'s module-level documentation for more details." +); +rsa_params!( + RSA_PKCS1_1024_8192_SHA256_FOR_LEGACY_USE_ONLY, + 1024, + &super::padding::RSA_PKCS1_SHA256, + "Verification of signatures using RSA keys of 1024-8192 bits, + PKCS#1.5 padding, and SHA-256.\n\nSee \"`RSA_PKCS1_*` Details\" in + `ring::signature`'s module-level documentation for more details." +); +rsa_params!( + RSA_PKCS1_2048_8192_SHA256, + 2048, + &super::padding::RSA_PKCS1_SHA256, + "Verification of signatures using RSA keys of 2048-8192 bits, + PKCS#1.5 padding, and SHA-256.\n\nSee \"`RSA_PKCS1_*` Details\" in + `ring::signature`'s module-level documentation for more details." +); +rsa_params!( + RSA_PKCS1_2048_8192_SHA384, + 2048, + &super::padding::RSA_PKCS1_SHA384, + "Verification of signatures using RSA keys of 2048-8192 bits, + PKCS#1.5 padding, and SHA-384.\n\nSee \"`RSA_PKCS1_*` Details\" in + `ring::signature`'s module-level documentation for more details." +); +rsa_params!( + RSA_PKCS1_2048_8192_SHA512, + 2048, + &super::padding::RSA_PKCS1_SHA512, + "Verification of signatures using RSA keys of 2048-8192 bits, + PKCS#1.5 padding, and SHA-512.\n\nSee \"`RSA_PKCS1_*` Details\" in + `ring::signature`'s module-level documentation for more details." 
+); +rsa_params!( + RSA_PKCS1_1024_8192_SHA512_FOR_LEGACY_USE_ONLY, + 1024, + &super::padding::RSA_PKCS1_SHA512, + "Verification of signatures using RSA keys of 1024-8192 bits, + PKCS#1.5 padding, and SHA-512.\n\nSee \"`RSA_PKCS1_*` Details\" in + `ring::signature`'s module-level documentation for more details." +); +rsa_params!( + RSA_PKCS1_3072_8192_SHA384, + 3072, + &super::padding::RSA_PKCS1_SHA384, + "Verification of signatures using RSA keys of 3072-8192 bits, + PKCS#1.5 padding, and SHA-384.\n\nSee \"`RSA_PKCS1_*` Details\" in + `ring::signature`'s module-level documentation for more details." +); + +rsa_params!( + RSA_PSS_2048_8192_SHA256, + 2048, + &super::padding::RSA_PSS_SHA256, + "Verification of signatures using RSA keys of 2048-8192 bits, + PSS padding, and SHA-256.\n\nSee \"`RSA_PSS_*` Details\" in + `ring::signature`'s module-level documentation for more details." +); +rsa_params!( + RSA_PSS_2048_8192_SHA384, + 2048, + &super::padding::RSA_PSS_SHA384, + "Verification of signatures using RSA keys of 2048-8192 bits, + PSS padding, and SHA-384.\n\nSee \"`RSA_PSS_*` Details\" in + `ring::signature`'s module-level documentation for more details." +); +rsa_params!( + RSA_PSS_2048_8192_SHA512, + 2048, + &super::padding::RSA_PSS_SHA512, + "Verification of signatures using RSA keys of 2048-8192 bits, + PSS padding, and SHA-512.\n\nSee \"`RSA_PSS_*` Details\" in + `ring::signature`'s module-level documentation for more details." +); + +pub use super::PublicKeyComponents as RsaPublicKeyComponents; + +impl super::PublicKeyComponents +where + B: AsRef<[u8]>, +{ + /// Verifies that `signature` is a valid signature of `message` using `self` + /// as the public key. `params` determine what algorithm parameters + /// (padding, digest algorithm, key length range, etc.) are used in the + /// verification. + /// + /// When the public key is in DER-encoded PKCS#1 ASN.1 format, it is + /// recommended to use `ring::signature::verify()` with + /// `ring::signature::RSA_PKCS1_*`, because `ring::signature::verify()` + /// will handle the parsing in that case. Otherwise, this function can be used + /// to pass in the raw bytes for the public key components as + /// `untrusted::Input` arguments. + // + // There are a small number of tests that test this directly, but the + // test coverage for this function mostly depends on the test coverage for the + // `signature::VerificationAlgorithm` implementation for `RsaParameters`. If we + // change that, test coverage for `verify_rsa()` will need to be reconsidered. + // (The NIST test vectors were originally in a form that was optimized for + // testing `verify_rsa` directly, but the testing work for RSA PKCS#1 + // verification was done during the implementation of + // `signature::VerificationAlgorithm`, before `verify_rsa` was factored out). 
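+    //
+    // A minimal usage sketch (`n_bytes`, `e_bytes`, `msg`, and `sig` stand in
+    // for caller-supplied byte slices):
+    //
+    //     let key = RsaPublicKeyComponents { n: n_bytes, e: e_bytes };
+    //     key.verify(&RSA_PKCS1_2048_8192_SHA256, msg, sig)?;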
+ pub fn verify( + &self, + params: &RsaParameters, + message: &[u8], + signature: &[u8], + ) -> Result<(), error::Unspecified> { + verify_rsa_( + params, + ( + untrusted::Input::from(self.n.as_ref()), + untrusted::Input::from(self.e.as_ref()), + ), + untrusted::Input::from(message), + untrusted::Input::from(signature), + cpu::features(), + ) + } +} + +pub(crate) fn verify_rsa_( + params: &RsaParameters, + (n, e): (untrusted::Input, untrusted::Input), + msg: untrusted::Input, + signature: untrusted::Input, + cpu_features: cpu::Features, +) -> Result<(), error::Unspecified> { + let max_bits: bits::BitLength = + bits::BitLength::from_byte_len(PUBLIC_KEY_PUBLIC_MODULUS_MAX_LEN) + .map_err(error::erase::)?; + + // XXX: FIPS 186-4 seems to indicate that the minimum + // exponent value is 2**16 + 1, but it isn't clear if this is just for + // signing or also for verification. We support exponents of 3 and larger + // for compatibility with other commonly-used crypto libraries. + let key = public_key::Inner::from_modulus_and_exponent( + n, + e, + params.min_bits, + max_bits, + PublicExponent::_3, + cpu_features, + )?; + + // RFC 8017 Section 5.2.2: RSAVP1. + let mut decoded = [0u8; PUBLIC_KEY_PUBLIC_MODULUS_MAX_LEN]; + let decoded = key.exponentiate(signature, &mut decoded, cpu_features)?; + + // Verify the padded message is correct. + let m_hash = digest::digest(params.padding_alg.digest_alg(), msg.as_slice_less_safe()); + untrusted::Input::from(decoded).read_all(error::Unspecified, |m| { + params.padding_alg.verify(m_hash, m, key.n().len_bits()) + }) +} diff --git a/ring-0.17.14/src/signature.rs b/ring-0.17.14/src/signature.rs new file mode 100644 index 0000000000..70806cac2e --- /dev/null +++ b/ring-0.17.14/src/signature.rs @@ -0,0 +1,409 @@ +// Copyright 2015-2017 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +//! Public key signatures: signing and verification. +//! +//! Use the `verify` function to verify signatures, passing a reference to the +//! algorithm that identifies the algorithm. See the documentation for `verify` +//! for examples. +//! +//! For signature verification, this API treats each combination of parameters +//! as a separate algorithm. For example, instead of having a single "RSA" +//! algorithm with a verification function that takes a bunch of parameters, +//! there are `RSA_PKCS1_2048_8192_SHA256`, `RSA_PKCS1_2048_8192_SHA384`, etc., +//! which encode sets of parameter choices into objects. This is designed to +//! reduce the risks of algorithm agility and to provide consistency with ECDSA +//! and EdDSA. +//! +//! Currently this module does not support digesting the message to be signed +//! separately from the public key operation, as it is currently being +//! optimized for Ed25519 and for the implementation of protocols that do not +//! 
requiring signing large messages. An interface for efficiently supporting +//! larger messages may be added later. +//! +//! +//! # Algorithm Details +//! +//! ## `ECDSA_*_ASN1` Details: ASN.1-encoded ECDSA Signatures +//! +//! The signature is a ASN.1 DER-encoded `Ecdsa-Sig-Value` as described in +//! [RFC 3279 Section 2.2.3]. This is the form of ECDSA signature used in +//! X.509-related structures and in TLS's `ServerKeyExchange` messages. +//! +//! The public key is encoding in uncompressed form using the +//! Octet-String-to-Elliptic-Curve-Point algorithm in +//! [SEC 1: Elliptic Curve Cryptography, Version 2.0]. +//! +//! During verification, the public key is validated using the ECC Partial +//! Public-Key Validation Routine from Section 5.6.2.3.3 of +//! [NIST Special Publication 800-56A, revision 2] and Appendix A.3 of the +//! NSA's [Suite B implementer's guide to FIPS 186-3]. Note that, as explained +//! in the NSA guide, ECC Partial Public-Key Validation is equivalent to ECC +//! Full Public-Key Validation for prime-order curves like this one. +//! +//! ## `ECDSA_*_FIXED` Details: Fixed-length (PKCS#11-style) ECDSA Signatures +//! +//! The signature is *r*||*s*, where || denotes concatenation, and where both +//! *r* and *s* are both big-endian-encoded values that are left-padded to the +//! maximum length. A P-256 signature will be 64 bytes long (two 32-byte +//! components) and a P-384 signature will be 96 bytes long (two 48-byte +//! components). This is the form of ECDSA signature used PKCS#11 and DNSSEC. +//! +//! The public key is encoding in uncompressed form using the +//! Octet-String-to-Elliptic-Curve-Point algorithm in +//! [SEC 1: Elliptic Curve Cryptography, Version 2.0]. +//! +//! During verification, the public key is validated using the ECC Partial +//! Public-Key Validation Routine from Section 5.6.2.3.3 of +//! [NIST Special Publication 800-56A, revision 2] and Appendix A.3 of the +//! NSA's [Suite B implementer's guide to FIPS 186-3]. Note that, as explained +//! in the NSA guide, ECC Partial Public-Key Validation is equivalent to ECC +//! Full Public-Key Validation for prime-order curves like this one. +//! +//! ## `RSA_PKCS1_*` Details: RSA PKCS#1 1.5 Signatures +//! +//! The signature is an RSASSA-PKCS1-v1_5 signature as described in +//! [RFC 3447 Section 8.2]. +//! +//! The public key is encoded as an ASN.1 `RSAPublicKey` as described in +//! [RFC 3447 Appendix-A.1.1]. The public key modulus length, rounded *up* to +//! the nearest (larger) multiple of 8 bits, must be in the range given in the +//! name of the algorithm. The public exponent must be an odd integer of 2-33 +//! bits, inclusive. +//! +//! +//! ## `RSA_PSS_*` Details: RSA PSS Signatures +//! +//! The signature is an RSASSA-PSS signature as described in +//! [RFC 3447 Section 8.1]. +//! +//! The public key is encoded as an ASN.1 `RSAPublicKey` as described in +//! [RFC 3447 Appendix-A.1.1]. The public key modulus length, rounded *up* to +//! the nearest (larger) multiple of 8 bits, must be in the range given in the +//! name of the algorithm. The public exponent must be an odd integer of 2-33 +//! bits, inclusive. +//! +//! During verification, signatures will only be accepted if the MGF1 digest +//! algorithm is the same as the message digest algorithm and if the salt +//! length is the same length as the message digest. This matches the +//! requirements in TLS 1.3 and other recent specifications. +//! +//! During signing, the message digest algorithm will be used as the MGF1 +//! 
digest algorithm. The salt will be the same length as the message digest. +//! This matches the requirements in TLS 1.3 and other recent specifications. +//! Additionally, the entire salt is randomly generated separately for each +//! signature using the secure random number generator passed to `sign()`. +//! +//! +//! [SEC 1: Elliptic Curve Cryptography, Version 2.0]: +//! http://www.secg.org/sec1-v2.pdf +//! [NIST Special Publication 800-56A, revision 2]: +//! http://nvlpubs.nist.gov/nistpubs/SpecialPublications/NIST.SP.800-56Ar2.pdf +//! [Suite B implementer's guide to FIPS 186-3]: +//! https://github.com/briansmith/ring/blob/main/doc/ecdsa.pdf +//! [RFC 3279 Section 2.2.3]: +//! https://tools.ietf.org/html/rfc3279#section-2.2.3 +//! [RFC 3447 Section 8.2]: +//! https://tools.ietf.org/html/rfc3447#section-7.2 +//! [RFC 3447 Section 8.1]: +//! https://tools.ietf.org/html/rfc3447#section-8.1 +//! [RFC 3447 Appendix-A.1.1]: +//! https://tools.ietf.org/html/rfc3447#appendix-A.1.1 +//! +//! +//! # Examples +//! +//! ## Signing and verifying with Ed25519 +//! +//! ``` +//! use ring::{ +//! rand, +//! signature::{self, KeyPair}, +//! }; +//! +//! # fn main() -> Result<(), ring::error::Unspecified> { +//! // Generate a key pair in PKCS#8 (v2) format. +//! let rng = rand::SystemRandom::new(); +//! let pkcs8_bytes = signature::Ed25519KeyPair::generate_pkcs8(&rng)?; +//! +//! // Normally the application would store the PKCS#8 file persistently. Later +//! // it would read the PKCS#8 file from persistent storage to use it. +//! +//! let key_pair = signature::Ed25519KeyPair::from_pkcs8(pkcs8_bytes.as_ref())?; +//! +//! // Sign the message "hello, world". +//! const MESSAGE: &[u8] = b"hello, world"; +//! let sig = key_pair.sign(MESSAGE); +//! +//! // Normally an application would extract the bytes of the signature and +//! // send them in a protocol message to the peer(s). Here we just get the +//! // public key key directly from the key pair. +//! let peer_public_key_bytes = key_pair.public_key().as_ref(); +//! +//! // Verify the signature of the message using the public key. Normally the +//! // verifier of the message would parse the inputs to this code out of the +//! // protocol message(s) sent by the signer. +//! let peer_public_key = +//! signature::UnparsedPublicKey::new(&signature::ED25519, peer_public_key_bytes); +//! peer_public_key.verify(MESSAGE, sig.as_ref())?; +//! +//! # Ok(()) +//! # } +//! ``` +//! +//! ## Signing and verifying with RSA (PKCS#1 1.5 padding) +//! +//! By default OpenSSL writes RSA public keys in SubjectPublicKeyInfo format, +//! not RSAPublicKey format, and Base64-encodes them (“PEM” format). +//! +//! To convert the PEM SubjectPublicKeyInfo format (“BEGIN PUBLIC KEY”) to the +//! binary RSAPublicKey format needed by `verify()`, use: +//! +//! ```sh +//! openssl rsa -pubin \ +//! -in public_key.pem \ +//! -inform PEM \ +//! -RSAPublicKey_out \ +//! -outform DER \ +//! -out public_key.der +//! ``` +//! +//! To extract the RSAPublicKey-formatted public key from an ASN.1 (binary) +//! DER-encoded RSAPrivateKey format private key file, use: +//! +//! ```sh +//! openssl rsa -in private_key.der \ +//! -inform DER \ +//! -RSAPublicKey_out \ +//! -outform DER \ +//! -out public_key.der +//! ``` +//! +//! ``` +//! # #[cfg(feature = "std")] +//! use ring::{rand, rsa, signature}; +//! +//! # #[cfg(feature = "std")] +//! fn sign_and_verify_rsa(private_key_path: &std::path::Path, +//! public_key_path: &std::path::Path) +//! -> Result<(), MyError> { +//! 
// Create an RSA keypair from the DER-encoded bytes. This example uses +//! // a 2048-bit key, but larger keys are also supported. +//! let private_key_der = read_file(private_key_path)?; +//! let key_pair = rsa::KeyPair::from_der(&private_key_der) +//! .map_err(|_| MyError::BadPrivateKey)?; +//! +//! // Sign the message "hello, world", using PKCS#1 v1.5 padding and the +//! // SHA256 digest algorithm. +//! const MESSAGE: &'static [u8] = b"hello, world"; +//! let rng = rand::SystemRandom::new(); +//! let mut signature = vec![0; key_pair.public().modulus_len()]; +//! key_pair.sign(&signature::RSA_PKCS1_SHA256, &rng, MESSAGE, &mut signature) +//! .map_err(|_| MyError::OOM)?; +//! +//! // Verify the signature. +//! let public_key = +//! signature::UnparsedPublicKey::new(&signature::RSA_PKCS1_2048_8192_SHA256, +//! read_file(public_key_path)?); +//! public_key.verify(MESSAGE, &signature) +//! .map_err(|_| MyError::BadSignature) +//! } +//! +//! #[derive(Debug)] +//! enum MyError { +//! # #[cfg(feature = "std")] +//! IO(std::io::Error), +//! BadPrivateKey, +//! OOM, +//! BadSignature, +//! } +//! +//! # #[cfg(feature = "std")] +//! fn read_file(path: &std::path::Path) -> Result, MyError> { +//! use std::io::Read; +//! +//! let mut file = std::fs::File::open(path).map_err(|e| MyError::IO(e))?; +//! let mut contents: Vec = Vec::new(); +//! file.read_to_end(&mut contents).map_err(|e| MyError::IO(e))?; +//! Ok(contents) +//! } +//! # +//! # #[cfg(not(feature = "std"))] +//! # fn sign_and_verify_rsa(_private_key_path: &std::path::Path, +//! # _public_key_path: &std::path::Path) +//! # -> Result<(), ()> { +//! # Ok(()) +//! # } +//! # +//! # fn main() { +//! # let private_key_path = +//! # std::path::Path::new("src/rsa/signature_rsa_example_private_key.der"); +//! # let public_key_path = +//! # std::path::Path::new("src/rsa/signature_rsa_example_public_key.der"); +//! # sign_and_verify_rsa(&private_key_path, &public_key_path).unwrap() +//! # } +//! ``` + +use crate::{cpu, debug, ec, error, sealed}; + +pub use crate::ec::{ + curve25519::ed25519::{ + signing::Ed25519KeyPair, + verification::{EdDSAParameters, ED25519}, + ED25519_PUBLIC_KEY_LEN, + }, + suite_b::ecdsa::{ + signing::{ + EcdsaKeyPair, EcdsaSigningAlgorithm, ECDSA_P256_SHA256_ASN1_SIGNING, + ECDSA_P256_SHA256_FIXED_SIGNING, ECDSA_P384_SHA384_ASN1_SIGNING, + ECDSA_P384_SHA384_FIXED_SIGNING, + }, + verification::{ + EcdsaVerificationAlgorithm, ECDSA_P256_SHA256_ASN1, ECDSA_P256_SHA256_FIXED, + ECDSA_P256_SHA384_ASN1, ECDSA_P384_SHA256_ASN1, ECDSA_P384_SHA384_ASN1, + ECDSA_P384_SHA384_FIXED, + }, + }, +}; + +#[cfg(feature = "alloc")] +pub use crate::rsa::{ + padding::{ + RsaEncoding, RSA_PKCS1_SHA256, RSA_PKCS1_SHA384, RSA_PKCS1_SHA512, RSA_PSS_SHA256, + RSA_PSS_SHA384, RSA_PSS_SHA512, + }, + verification::{ + RsaPublicKeyComponents, RSA_PKCS1_1024_8192_SHA1_FOR_LEGACY_USE_ONLY, + RSA_PKCS1_1024_8192_SHA256_FOR_LEGACY_USE_ONLY, + RSA_PKCS1_1024_8192_SHA512_FOR_LEGACY_USE_ONLY, + RSA_PKCS1_2048_8192_SHA1_FOR_LEGACY_USE_ONLY, RSA_PKCS1_2048_8192_SHA256, + RSA_PKCS1_2048_8192_SHA384, RSA_PKCS1_2048_8192_SHA512, RSA_PKCS1_3072_8192_SHA384, + RSA_PSS_2048_8192_SHA256, RSA_PSS_2048_8192_SHA384, RSA_PSS_2048_8192_SHA512, + }, + RsaParameters, +}; + +/// An RSA key pair, used for signing. +#[cfg(feature = "alloc")] +pub type RsaKeyPair = crate::rsa::KeyPair; + +/// A public key signature returned from a signing operation. 
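+///
+/// The signature bytes are available via `AsRef<[u8]>` (e.g. `sig.as_ref()`),
+/// as in the Ed25519 example above.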
+#[derive(Clone, Copy)] +pub struct Signature { + value: [u8; MAX_LEN], + len: usize, +} + +impl Signature { + // Panics if `value` is too long. + pub(crate) fn new(fill: F) -> Self + where + F: FnOnce(&mut [u8; MAX_LEN]) -> usize, + { + let mut r = Self { + value: [0; MAX_LEN], + len: 0, + }; + r.len = fill(&mut r.value); + r + } +} + +impl AsRef<[u8]> for Signature { + fn as_ref(&self) -> &[u8] { + &self.value[..self.len] + } +} + +/// Key pairs for signing messages (private key and public key). +pub trait KeyPair: core::fmt::Debug + Send + Sized + Sync { + /// The type of the public key. + type PublicKey: AsRef<[u8]> + core::fmt::Debug + Clone + Send + Sized + Sync; + + /// The public key for the key pair. + fn public_key(&self) -> &Self::PublicKey; +} + +/// The longest signature is an ASN.1 P-384 signature where *r* and *s* are of +/// maximum length with the leading high bit set on each. Then each component +/// will have a tag, a one-byte length, and a one-byte “I'm not negative” +/// prefix, and the outer sequence will have a two-byte length. +pub(crate) const MAX_LEN: usize = 1/*tag:SEQUENCE*/ + 2/*len*/ + + (2 * (1/*tag:INTEGER*/ + 1/*len*/ + 1/*zero*/ + ec::SCALAR_MAX_BYTES)); + +/// A signature verification algorithm. +pub trait VerificationAlgorithm: core::fmt::Debug + Sync + sealed::Sealed { + /// Verify the signature `signature` of message `msg` with the public key + /// `public_key`. + fn verify( + &self, + public_key: untrusted::Input, + msg: untrusted::Input, + signature: untrusted::Input, + ) -> Result<(), error::Unspecified>; +} + +/// An unparsed, possibly malformed, public key for signature verification. +#[derive(Clone, Copy)] +pub struct UnparsedPublicKey { + algorithm: &'static dyn VerificationAlgorithm, + bytes: B, +} + +impl AsRef<[u8]> for UnparsedPublicKey +where + B: AsRef<[u8]>, +{ + fn as_ref(&self) -> &[u8] { + self.bytes.as_ref() + } +} + +impl core::fmt::Debug for UnparsedPublicKey +where + B: AsRef<[u8]>, +{ + fn fmt(&self, f: &mut core::fmt::Formatter) -> Result<(), core::fmt::Error> { + f.debug_struct("UnparsedPublicKey") + .field("algorithm", &self.algorithm) + .field("bytes", &debug::HexStr(self.bytes.as_ref())) + .finish() + } +} + +impl UnparsedPublicKey { + /// Construct a new `UnparsedPublicKey`. + /// + /// No validation of `bytes` is done until `verify()` is called. + #[inline] + pub fn new(algorithm: &'static dyn VerificationAlgorithm, bytes: B) -> Self { + Self { algorithm, bytes } + } + + /// Parses the public key and verifies `signature` is a valid signature of + /// `message` using it. + /// + /// See the [crate::signature] module-level documentation for examples. + pub fn verify(&self, message: &[u8], signature: &[u8]) -> Result<(), error::Unspecified> + where + B: AsRef<[u8]>, + { + let _ = cpu::features(); + self.algorithm.verify( + untrusted::Input::from(self.bytes.as_ref()), + untrusted::Input::from(message), + untrusted::Input::from(signature), + ) + } +} diff --git a/ring-0.17.14/src/tests/bits_tests.rs b/ring-0.17.14/src/tests/bits_tests.rs new file mode 100644 index 0000000000..f8252026c9 --- /dev/null +++ b/ring-0.17.14/src/tests/bits_tests.rs @@ -0,0 +1,64 @@ +// Copyright 2024 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. 
+// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +use crate::{ + bits::{BitLength, FromByteLen as _}, + polyfill::u64_from_usize, +}; + +#[test] +fn test_from_byte_len_overflow() { + const USIZE_MAX_VALID_BYTES: usize = usize::MAX / 8; + + // Maximum valid input for BitLength. + match BitLength::::from_byte_len(USIZE_MAX_VALID_BYTES) { + Ok(bits) => { + assert_eq!(bits.as_usize_bytes_rounded_up(), USIZE_MAX_VALID_BYTES); + assert_eq!(bits.as_bits(), usize::MAX & !0b111); + } + Err(_) => unreachable!(), + } + + // Minimum invalid usize input for BitLength. + assert!(BitLength::::from_byte_len(USIZE_MAX_VALID_BYTES + 1).is_err()); + + // Minimum invalid usize input for BitLength on 64-bit targets. + { + let r = BitLength::::from_byte_len(USIZE_MAX_VALID_BYTES + 1); + if cfg!(target_pointer_width = "64") { + assert!(r.is_err()); + } else { + match r { + Ok(bits) => { + assert_eq!( + bits.as_bits(), + (u64_from_usize(USIZE_MAX_VALID_BYTES) + 1) * 8 + ); + } + Err(_) => unreachable!(), + } + } + } + + const U64_MAX_VALID_BYTES: u64 = u64::MAX / 8; + + // Maximum valid u64 input for BitLength. + match BitLength::::from_byte_len(U64_MAX_VALID_BYTES) { + Ok(bits) => assert_eq!(bits.as_bits(), u64::MAX & !0b111), + Err(_) => unreachable!(), + }; + + // Minimum invalid usize input for BitLength on 64-bit targets. + assert!(BitLength::::from_byte_len(U64_MAX_VALID_BYTES + 1).is_err()); +} diff --git a/ring-0.17.14/src/tests/mod.rs b/ring-0.17.14/src/tests/mod.rs new file mode 100644 index 0000000000..b1cabd60c6 --- /dev/null +++ b/ring-0.17.14/src/tests/mod.rs @@ -0,0 +1,17 @@ +// Copyright 2024 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +//! Integration tests for non-public APIs. + +mod bits_tests; diff --git a/ring-0.17.14/src/testutil.rs b/ring-0.17.14/src/testutil.rs new file mode 100644 index 0000000000..31cfb98f9a --- /dev/null +++ b/ring-0.17.14/src/testutil.rs @@ -0,0 +1,640 @@ +// Copyright 2015-2016 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +//! Testing framework. +//! +//! Unlike the rest of *ring*, this testing framework uses panics pretty +//! liberally. It was originally designed for internal use--it drives most of +//! *ring*'s internal tests, and so it is optimized for getting *ring*'s tests +//! written quickly at the expense of some usability. The documentation is +//! lacking. The best way to learn it is to look at some examples. The digest +//! tests are the most complicated because they use named sections. Other tests +//! avoid named sections and so are easier to understand. +//! +//! # Examples +//! +//! ## Writing Tests +//! +//! Input files look like this: +//! +//! ```text +//! # This is a comment. +//! +//! HMAC = SHA1 +//! Input = "My test data" +//! Key = "" +//! Output = 61afdecb95429ef494d61fdee15990cabf0826fc +//! +//! HMAC = SHA256 +//! Input = "Sample message for keylen +//! at C:\Users\Example\example\:4 +//! 9: 0x7ff65496d49c - example_test +//! at C:\Users\Example\example\src\example.rs:652 +//! 10: 0x7ff6549d192a - test::stats::Summary::new::ha139494ed2e4e01f +//! 11: 0x7ff6549d51a2 - test::stats::Summary::new::ha139494ed2e4e01f +//! 12: 0x7ff654a0a911 - _rust_maybe_catch_panic +//! 13: 0x7ff6549d56dd - test::stats::Summary::new::ha139494ed2e4e01f +//! 14: 0x7ff654a03783 - std::sys::thread::Thread::new::h2b08da6cd2517f79 +//! 15: 0x7ff968518101 - BaseThreadInitThunk +//! ``` +//! +//! Notice that the output shows the name of the data file +//! (`src/example_tests.txt`), the test inputs that led to the failure, and the +//! stack trace to the line in the test code that panicked: entry 9 in the +//! stack trace pointing to line 652 of the file `example.rs`. + +extern crate alloc; + +use alloc::{format, string::String, vec::Vec}; + +use crate::{bits, digest, error}; + +#[cfg(any(feature = "std", feature = "test_logging"))] +extern crate std; + +/// `compile_time_assert_clone::();` fails to compile if `T` doesn't +/// implement `Clone`. +pub const fn compile_time_assert_clone() {} + +/// `compile_time_assert_copy::();` fails to compile if `T` doesn't +/// implement `Copy`. +pub const fn compile_time_assert_copy() {} + +/// `compile_time_assert_eq::();` fails to compile if `T` doesn't +/// implement `Eq`. +pub const fn compile_time_assert_eq() {} + +/// `compile_time_assert_send::();` fails to compile if `T` doesn't +/// implement `Send`. +pub const fn compile_time_assert_send() {} + +/// `compile_time_assert_sync::();` fails to compile if `T` doesn't +/// implement `Sync`. +pub const fn compile_time_assert_sync() {} + +/// `compile_time_assert_std_error_error::();` fails to compile if `T` +/// doesn't implement `std::error::Error`. +#[cfg(feature = "std")] +pub const fn compile_time_assert_std_error_error() {} + +/// A test case. A test case consists of a set of named attributes. Every +/// attribute in the test case must be consumed exactly once; this helps catch +/// typos and omissions. +/// +/// Requires the `alloc` default feature to be enabled. +#[derive(Debug)] +pub struct TestCase { + attributes: Vec<(String, String, bool)>, +} + +impl TestCase { + /// Maps the string "true" to true and the string "false" to false. 
+    pub fn consume_bool(&mut self, key: &str) -> bool {
+        match self.consume_string(key).as_ref() {
+            "true" => true,
+            "false" => false,
+            s => panic!("Invalid bool value: {}", s),
+        }
+    }
+
+    /// Maps the strings "SHA1", "SHA256", "SHA384", and "SHA512" to digest
+    /// algorithms, maps "SHA224" to `None`, and panics on other (erroneous)
+    /// inputs. "SHA224" is mapped to None because *ring* intentionally does
+    /// not support SHA224, but we need to consume test vectors from NIST that
+    /// have SHA224 vectors in them.
+    pub fn consume_digest_alg(&mut self, key: &str) -> Option<&'static digest::Algorithm> {
+        let name = self.consume_string(key);
+        match name.as_ref() {
+            "SHA1" => Some(&digest::SHA1_FOR_LEGACY_USE_ONLY),
+            "SHA224" => None, // We actively skip SHA-224 support.
+            "SHA256" => Some(&digest::SHA256),
+            "SHA384" => Some(&digest::SHA384),
+            "SHA512" => Some(&digest::SHA512),
+            "SHA512_256" => Some(&digest::SHA512_256),
+            _ => panic!("Unsupported digest algorithm: {}", name),
+        }
+    }
+
+    /// Returns the value of an attribute that is encoded as a sequence of an
+    /// even number of hex digits, or as a double-quoted UTF-8 string. The
+    /// empty (zero-length) value is represented as "".
+    pub fn consume_bytes(&mut self, key: &str) -> Vec<u8> {
+        self.consume_optional_bytes(key)
+            .unwrap_or_else(|| panic!("No attribute named \"{}\"", key))
+    }
+
+    /// Like `consume_bytes()` except it returns `None` if the test case
+    /// doesn't have the attribute.
+    pub fn consume_optional_bytes(&mut self, key: &str) -> Option<Vec<u8>> {
+        let s = self.consume_optional_string(key)?;
+        let result = if let [b'\"', s @ ..] = s.as_bytes() {
+            // The value is a quoted UTF-8 string.
+            let mut s = s.iter();
+            let mut bytes = Vec::with_capacity(s.len() - 1);
+            loop {
+                let b = match s.next() {
+                    Some(b'\\') => {
+                        match s.next() {
+                            // We don't allow all octal escape sequences, only "\0" for null.
+                            Some(b'0') => 0u8,
+                            Some(b't') => b'\t',
+                            Some(b'n') => b'\n',
+                            // "\xHH"
+                            Some(b'x') => {
+                                let hi = s.next().expect("Invalid hex escape sequence in string.");
+                                let lo = s.next().expect("Invalid hex escape sequence in string.");
+                                if let (Ok(hi), Ok(lo)) = (from_hex_digit(*hi), from_hex_digit(*lo))
+                                {
+                                    (hi << 4) | lo
+                                } else {
+                                    panic!("Invalid hex escape sequence in string.");
+                                }
+                            }
+                            _ => {
+                                panic!("Invalid hex escape sequence in string.");
+                            }
+                        }
+                    }
+                    Some(b'"') => {
+                        if s.next().is_some() {
+                            panic!("characters after the closing quote of a quoted string.");
+                        }
+                        break;
+                    }
+                    Some(b) => *b,
+                    None => panic!("Missing terminating '\"' in string literal."),
+                };
+                bytes.push(b);
+            }
+            bytes
+        } else {
+            // The value is hex encoded.
+            match from_hex(&s) {
+                Ok(s) => s,
+                Err(err_str) => {
+                    panic!("{} in {}", err_str, s);
+                }
+            }
+        };
+        Some(result)
+    }
+
+    /// Returns the value of an attribute that is an integer, in decimal
+    /// notation.
+    pub fn consume_usize(&mut self, key: &str) -> usize {
+        let s = self.consume_string(key);
+        s.parse::<usize>().unwrap()
+    }
+
+    /// Returns the value of an attribute that is an integer, in decimal
+    /// notation, as a bit length.
+    pub fn consume_usize_bits(&mut self, key: &str) -> bits::BitLength {
+        let s = self.consume_string(key);
+        let bits = s.parse::<usize>().unwrap();
+        bits::BitLength::from_bits(bits)
+    }
+
+    /// Returns the raw value of an attribute, without any unquoting or
+    /// other interpretation.
+    pub fn consume_string(&mut self, key: &str) -> String {
+        self.consume_optional_string(key)
+            .unwrap_or_else(|| panic!("No attribute named \"{}\"", key))
+    }
+
+    /// Like `consume_string()` except it returns `None` if the test case
+    /// doesn't have the attribute.
+    pub fn consume_optional_string(&mut self, key: &str) -> Option<String> {
+        for (name, value, consumed) in &mut self.attributes {
+            if key == name {
+                if *consumed {
+                    panic!("Attribute {} was already consumed", key);
+                }
+                *consumed = true;
+                return Some(value.clone());
+            }
+        }
+        None
+    }
+}
+
+/// References a test input file.
+#[cfg(test)]
+macro_rules! test_vector_file {
+    ($file_name:expr) => {
+        $crate::testutil::File {
+            file_name: $file_name,
+            contents: include_str!($file_name),
+        }
+    };
+}
+
+/// A test input file.
+pub struct File<'a> {
+    /// The name (path) of the file.
+    pub file_name: &'a str,
+
+    /// The contents of the file.
+    pub contents: &'a str,
+}
+
+/// Parses test cases out of the given file, calling `f` on each vector until
+/// `f` fails or until all the test vectors have been read. `f` can indicate
+/// failure either by returning `Err()` or by panicking.
+pub fn run<F>(test_file: File, mut f: F)
+where
+    F: FnMut(&str, &mut TestCase) -> Result<(), error::Unspecified>,
+{
+    let lines = &mut test_file.contents.lines();
+
+    let mut current_section = String::from("");
+    let mut failed = false;
+
+    while let Some(mut test_case) = parse_test_case(&mut current_section, lines) {
+        let result = match f(&current_section, &mut test_case) {
+            Ok(()) => {
+                if !test_case
+                    .attributes
+                    .iter()
+                    .any(|&(_, _, consumed)| !consumed)
+                {
+                    Ok(())
+                } else {
+                    failed = true;
+                    Err("Test didn't consume all attributes.")
+                }
+            }
+            Err(error::Unspecified) => Err("Test returned Err(error::Unspecified)."),
+        };
+
+        if result.is_err() {
+            failed = true;
+        }
+
+        #[cfg(feature = "test_logging")]
+        if let Err(msg) = result {
+            std::println!("{}: {}", test_file.file_name, msg);
+
+            for (name, value, consumed) in test_case.attributes {
+                let consumed_str = if consumed { "" } else { " (unconsumed)" };
+                std::println!("{}{} = {}", name, consumed_str, value);
+            }
+        };
+    }
+
+    if failed {
+        panic!("Test failed.")
+    }
+}
+
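For orientation, here is a minimal sketch (editorial, not part of the vendored file) of a data-driven test built on `run` and the `test_vector_file!` macro. The vector file name `my_alg_tests.txt`, its `Input`/`Output` attributes, and the `my_alg` function are all hypothetical stand-ins.

```rust
// Stand-in for the real function under test.
fn my_alg(input: &[u8]) -> Vec<u8> {
    input.to_vec()
}

#[test]
fn my_alg_known_answer() {
    use crate::testutil as test;

    // `test_vector_file!` bundles the vector file's path and contents.
    test::run(test_vector_file!("my_alg_tests.txt"), |section, test_case| {
        assert_eq!(section, "");
        let input = test_case.consume_bytes("Input");
        let expected = test_case.consume_bytes("Output");
        // Every attribute must be consumed, or `run` reports the test as failed.
        assert_eq!(expected, my_alg(&input));
        Ok(())
    });
}
```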
+/// Decode a string of hex digits into a sequence of bytes. The input must
+/// have an even number of digits.
+pub fn from_hex(hex_str: &str) -> Result<Vec<u8>, String> {
+    if hex_str.len() % 2 != 0 {
+        return Err(String::from(
+            "Hex string does not have an even number of digits",
+        ));
+    }
+
+    let mut result = Vec::with_capacity(hex_str.len() / 2);
+    for digits in hex_str.as_bytes().chunks(2) {
+        let hi = from_hex_digit(digits[0])?;
+        let lo = from_hex_digit(digits[1])?;
+        result.push((hi * 0x10) | lo);
+    }
+    Ok(result)
+}
+
+fn from_hex_digit(d: u8) -> Result<u8, String> {
+    use core::ops::RangeInclusive;
+    const DECIMAL: (u8, RangeInclusive<u8>) = (0, b'0'..=b'9');
+    const HEX_LOWER: (u8, RangeInclusive<u8>) = (10, b'a'..=b'f');
+    const HEX_UPPER: (u8, RangeInclusive<u8>) = (10, b'A'..=b'F');
+    for (offset, range) in &[DECIMAL, HEX_LOWER, HEX_UPPER] {
+        if range.contains(&d) {
+            return Ok(d - range.start() + offset);
+        }
+    }
+    Err(format!("Invalid hex digit '{}'", d as char))
+}
+
+fn parse_test_case(
+    current_section: &mut String,
+    lines: &mut dyn Iterator<Item = &str>,
+) -> Option<TestCase> {
+    let mut attributes = Vec::new();
+
+    let mut is_first_line = true;
+    loop {
+        let line = lines.next();
+
+        #[cfg(feature = "test_logging")]
+        if let Some(text) = &line {
+            std::println!("Line: {}", text);
+        }
+
+        match line {
+            // If we get to EOF when we're not in the middle of a test case,
+            // then we're done.
+            None if is_first_line => {
+                return None;
+            }
+
+            // End of the file on a non-empty test case ends the test case.
+            None => {
+                return Some(TestCase { attributes });
+            }
+
+            // A blank line ends a test case if the test case isn't empty.
+            Some("") => {
+                if !is_first_line {
+                    return Some(TestCase { attributes });
+                }
+                // Ignore leading blank lines.
+            }
+
+            // Comments start with '#'; ignore them.
+            Some(line) if line.starts_with('#') => (),
+
+            Some(line) if line.starts_with('[') => {
+                assert!(is_first_line);
+                assert!(line.ends_with(']'));
+                current_section.truncate(0);
+                current_section.push_str(line);
+                let _ = current_section.pop();
+                let _ = current_section.remove(0);
+            }
+
+            Some(line) => {
+                is_first_line = false;
+
+                let parts: Vec<&str> = line.splitn(2, " = ").collect();
+                if parts.len() != 2 {
+                    panic!("Syntax error: Expected Key = Value.");
+                };
+
+                let key = parts[0].trim();
+                let value = parts[1].trim();
+
+                // Don't allow the value to be omitted. An empty value can be
+                // represented as an empty quoted string.
+                assert_ne!(value.len(), 0);
+
+                // Checking is_none() ensures we don't accept duplicate keys.
+                attributes.push((String::from(key), String::from(value), false));
+            }
+        }
+    }
+}
+
+/// Deterministic implementations of `ring::rand::SecureRandom`.
+///
+/// These implementations are particularly useful for testing implementations
+/// of randomized algorithms & protocols using known-answer-tests where the
+/// test vectors contain the random seed to use. They are also especially
+/// useful for some types of fuzzing.
+#[doc(hidden)]
+pub mod rand {
+    use crate::{error, rand};
+
+    /// An implementation of `SecureRandom` that always fills the output slice
+    /// with the given byte.
+    #[derive(Debug)]
+    pub struct FixedByteRandom {
+        pub byte: u8,
+    }
+
+    impl rand::sealed::SecureRandom for FixedByteRandom {
+        fn fill_impl(&self, dest: &mut [u8]) -> Result<(), error::Unspecified> {
+            dest.fill(self.byte);
+            Ok(())
+        }
+    }
+
+    /// An implementation of `SecureRandom` that always fills the output slice
+    /// with the slice in `bytes`. The length of the slice given to `slice`
+    /// must match exactly.
+ #[derive(Debug)] + pub struct FixedSliceRandom<'a> { + pub bytes: &'a [u8], + } + + impl rand::sealed::SecureRandom for FixedSliceRandom<'_> { + fn fill_impl(&self, dest: &mut [u8]) -> Result<(), error::Unspecified> { + dest.copy_from_slice(self.bytes); + Ok(()) + } + } + + /// An implementation of `SecureRandom` where each slice in `bytes` is a + /// test vector for one call to `fill()`. *Not thread-safe.* + /// + /// The first slice in `bytes` is the output for the first call to + /// `fill()`, the second slice is the output for the second call to + /// `fill()`, etc. The output slice passed to `fill()` must have exactly + /// the length of the corresponding entry in `bytes`. `current` must be + /// initialized to zero. `fill()` must be called exactly once for each + /// entry in `bytes`. + #[derive(Debug)] + pub struct FixedSliceSequenceRandom<'a> { + /// The value. + pub bytes: &'a [&'a [u8]], + pub current: core::cell::UnsafeCell, + } + + impl rand::sealed::SecureRandom for FixedSliceSequenceRandom<'_> { + fn fill_impl(&self, dest: &mut [u8]) -> Result<(), error::Unspecified> { + let current = unsafe { *self.current.get() }; + let bytes = self.bytes[current]; + dest.copy_from_slice(bytes); + // Remember that we returned this slice and prepare to return + // the next one, if any. + unsafe { *self.current.get() += 1 }; + Ok(()) + } + } + + impl Drop for FixedSliceSequenceRandom<'_> { + fn drop(&mut self) { + // Ensure that `fill()` was called exactly the right number of + // times. + assert_eq!(unsafe { *self.current.get() }, self.bytes.len()); + } + } +} + +#[cfg(test)] +mod tests { + use crate::error; + use crate::testutil as test; + + #[test] + fn one_ok() { + test::run(test_vector_file!("test_1_tests.txt"), |_, test_case| { + let _ = test_case.consume_string("Key"); + Ok(()) + }); + } + + #[test] + #[should_panic(expected = "Test failed.")] + fn one_err() { + test::run(test_vector_file!("test_1_tests.txt"), |_, test_case| { + let _ = test_case.consume_string("Key"); + Err(error::Unspecified) + }); + } + + #[test] + #[should_panic(expected = "Oh noes!")] + fn one_panics() { + test::run(test_vector_file!("test_1_tests.txt"), |_, test_case| { + let _ = test_case.consume_string("Key"); + panic!("Oh noes!"); + }); + } + + #[test] + #[should_panic(expected = "Test failed.")] + fn first_err() { + err_one(0) + } + + #[test] + #[should_panic(expected = "Test failed.")] + fn middle_err() { + err_one(1) + } + + #[test] + #[should_panic(expected = "Test failed.")] + fn last_err() { + err_one(2) + } + + fn err_one(test_to_fail: usize) { + let mut n = 0; + test::run(test_vector_file!("test_3_tests.txt"), |_, test_case| { + let _ = test_case.consume_string("Key"); + let result = if n != test_to_fail { + Ok(()) + } else { + Err(error::Unspecified) + }; + n += 1; + result + }); + } + + #[test] + #[should_panic(expected = "Oh Noes!")] + fn first_panic() { + panic_one(0) + } + + #[test] + #[should_panic(expected = "Oh Noes!")] + fn middle_panic() { + panic_one(1) + } + + #[test] + #[should_panic(expected = "Oh Noes!")] + fn last_panic() { + panic_one(2) + } + + fn panic_one(test_to_fail: usize) { + let mut n = 0; + test::run(test_vector_file!("test_3_tests.txt"), |_, test_case| { + let _ = test_case.consume_string("Key"); + if n == test_to_fail { + panic!("Oh Noes!"); + }; + n += 1; + Ok(()) + }); + } + + #[test] + #[should_panic(expected = "Syntax error: Expected Key = Value.")] + fn syntax_error() { + test::run( + test_vector_file!("test_1_syntax_error_tests.txt"), + |_, _| Ok(()), + ); + } +} 
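The deterministic `SecureRandom` implementations above are what the public test suites later in this diff use to feed known private keys into key generation. A minimal sketch of that pattern, mirroring the agreement tests below (editorial, not part of the vendored sources):

```rust
#[allow(deprecated)]
use ring::{agreement, error, test};

// Build an X25519 ephemeral key whose private scalar comes from a test
// vector rather than from the system RNG.
fn private_key_from_vector(
    private_scalar: &[u8],
) -> Result<agreement::EphemeralPrivateKey, error::Unspecified> {
    let rng = test::rand::FixedSliceRandom {
        bytes: private_scalar,
    };
    agreement::EphemeralPrivateKey::generate(&agreement::X25519, &rng)
}
```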
diff --git a/ring-0.17.14/tests/aead_tests.rs b/ring-0.17.14/tests/aead_tests.rs new file mode 100644 index 0000000000..261822f5ad --- /dev/null +++ b/ring-0.17.14/tests/aead_tests.rs @@ -0,0 +1,653 @@ +// Copyright 2015-2021 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +#![allow(missing_docs)] + +#[cfg(all(target_arch = "wasm32", target_os = "unknown"))] +use wasm_bindgen_test::{wasm_bindgen_test as test, wasm_bindgen_test_configure}; + +#[cfg(all(target_arch = "wasm32", target_os = "unknown"))] +wasm_bindgen_test_configure!(run_in_browser); + +use core::ops::RangeFrom; +use ring::{aead, error}; +#[allow(deprecated)] +use ring::{test, test_file}; + +/// Generate the known answer test functions for the given algorithm and test +/// case input file, where each test is implemented by a test in `$test`. +/// +/// All of these tests can be run in parallel. +macro_rules! test_known_answer { + ( $alg:ident, $test_file:expr, [ $( $test:ident ),+, ] ) => { + $( + #[test] + fn $test() { + test_aead( + &aead::$alg, + super::super::$test, + test_file!($test_file)); + } + )+ + } +} + +/// Generate the tests for a given algorithm. +/// +/// All of these tests can be run in parallel. +macro_rules! test_aead { + { $( { $alg:ident, $test_file:expr } ),+, } => { + mod aead_test { // Make `cargo test aead` include these files. + $( + #[allow(non_snake_case)] + mod $alg { // Provide a separate namespace for each algorithm's test. + use super::super::*; + + #[cfg(all(target_arch = "wasm32", target_os = "unknown"))] + use wasm_bindgen_test::wasm_bindgen_test as test; + + test_known_answer!( + $alg, + $test_file, + [ + less_safe_key_open_in_place, + less_safe_key_open_within, + less_safe_key_seal_in_place_append_tag, + less_safe_key_seal_in_place_separate_tag, + opening_key_open_in_place, + opening_key_open_within, + sealing_key_seal_in_place_append_tag, + sealing_key_seal_in_place_separate_tag, + test_open_in_place_seperate_tag, + ]); + + #[test] + fn key_sizes() { + super::super::key_sizes(&aead::$alg); + } + } + )+ + } + } +} + +test_aead! 
{ + { AES_128_GCM, "aead_aes_128_gcm_tests.txt" }, + { AES_256_GCM, "aead_aes_256_gcm_tests.txt" }, + { CHACHA20_POLY1305, "aead_chacha20_poly1305_tests.txt" }, +} + +struct KnownAnswerTestCase<'a> { + key: &'a [u8], + nonce: [u8; aead::NONCE_LEN], + plaintext: &'a [u8], + aad: aead::Aad<&'a [u8]>, + ciphertext: &'a [u8], + tag: &'a [u8], +} + +fn test_aead( + aead_alg: &'static aead::Algorithm, + f: impl Fn(&'static aead::Algorithm, KnownAnswerTestCase) -> Result<(), error::Unspecified>, + test_file: test::File, +) { + test::run(test_file, |section, test_case| { + assert_eq!(section, ""); + let key = test_case.consume_bytes("KEY"); + let nonce = test_case.consume_bytes("NONCE"); + let plaintext = test_case.consume_bytes("IN"); + let aad = test_case.consume_bytes("AD"); + let ct = test_case.consume_bytes("CT"); + let tag = test_case.consume_bytes("TAG"); + let error = test_case.consume_optional_string("FAILS"); + + match error.as_deref() { + Some("WRONG_NONCE_LENGTH") => { + assert!(matches!( + aead::Nonce::try_assume_unique_for_key(&nonce), + Err(error::Unspecified) + )); + return Ok(()); + } + Some(unexpected) => { + unreachable!("unexpected error in test data: {}", unexpected); + } + None => {} + }; + + let test_case = KnownAnswerTestCase { + key: &key, + nonce: nonce.as_slice().try_into().unwrap(), + plaintext: &plaintext, + aad: aead::Aad::from(&aad), + ciphertext: &ct, + tag: &tag, + }; + + f(aead_alg, test_case) + }) +} + +fn test_seal_append_tag( + tc: &KnownAnswerTestCase, + seal: Seal, +) -> Result<(), error::Unspecified> +where + Seal: FnOnce(aead::Nonce, &mut Vec) -> Result<(), error::Unspecified>, +{ + let mut in_out = Vec::from(tc.plaintext); + seal(aead::Nonce::assume_unique_for_key(tc.nonce), &mut in_out)?; + + let mut expected_ciphertext_and_tag = Vec::from(tc.ciphertext); + expected_ciphertext_and_tag.extend_from_slice(tc.tag); + + assert_eq!(in_out, expected_ciphertext_and_tag); + + Ok(()) +} + +fn test_seal_separate_tag( + tc: &KnownAnswerTestCase, + seal: Seal, +) -> Result<(), error::Unspecified> +where + Seal: Fn(aead::Nonce, &mut [u8]) -> Result, +{ + let mut in_out = Vec::from(tc.plaintext); + let actual_tag = seal(aead::Nonce::assume_unique_for_key(tc.nonce), &mut in_out)?; + assert_eq!(actual_tag.as_ref(), tc.tag); + assert_eq!(in_out, tc.ciphertext); + + Ok(()) +} + +fn test_open_in_place( + tc: &KnownAnswerTestCase<'_>, + open_in_place: OpenInPlace, +) -> Result<(), error::Unspecified> +where + OpenInPlace: + for<'a> FnOnce(aead::Nonce, &'a mut [u8]) -> Result<&'a mut [u8], error::Unspecified>, +{ + let nonce = aead::Nonce::assume_unique_for_key(tc.nonce); + + let mut in_out = Vec::from(tc.ciphertext); + in_out.extend_from_slice(tc.tag); + + let actual_plaintext = open_in_place(nonce, &mut in_out)?; + + assert_eq!(actual_plaintext, tc.plaintext); + assert_eq!(&in_out[..tc.plaintext.len()], tc.plaintext); + Ok(()) +} + +fn test_open_in_place_seperate_tag( + alg: &'static aead::Algorithm, + tc: KnownAnswerTestCase, +) -> Result<(), error::Unspecified> { + let key = make_less_safe_key(alg, tc.key); + + let mut in_out = Vec::from(tc.ciphertext); + let tag = tc.tag.try_into().unwrap(); + + // Test the simplest behavior. + { + let nonce = aead::Nonce::assume_unique_for_key(tc.nonce); + let actual_plaintext = + key.open_in_place_separate_tag(nonce, tc.aad, tag, &mut in_out, 0..)?; + + assert_eq!(actual_plaintext, tc.plaintext); + assert_eq!(&in_out[..tc.plaintext.len()], tc.plaintext); + } + + // Test that ciphertext range shifting works as expected. 
+ { + let range = in_out.len()..; + in_out.extend_from_slice(tc.ciphertext); + + let nonce = aead::Nonce::assume_unique_for_key(tc.nonce); + let actual_plaintext = + key.open_in_place_separate_tag(nonce, tc.aad, tag, &mut in_out, range)?; + + assert_eq!(actual_plaintext, tc.plaintext); + assert_eq!(&in_out[..tc.plaintext.len()], tc.plaintext); + } + + Ok(()) +} + +fn test_open_within( + tc: &KnownAnswerTestCase<'_>, + open_within: OpenWithin, +) -> Result<(), error::Unspecified> +where + OpenWithin: for<'a> Fn( + aead::Nonce, + &'a mut [u8], + RangeFrom, + ) -> Result<&'a mut [u8], error::Unspecified>, +{ + // In release builds, test all prefix lengths from 0 to 4096 bytes. + // Debug builds are too slow for this, so for those builds, only + // test a smaller subset. + + // TLS record headers are 5 bytes long. + // TLS explicit nonces for AES-GCM are 8 bytes long. + static MINIMAL_IN_PREFIX_LENS: [usize; 36] = [ + // No input prefix to overwrite; i.e. the opening is exactly + // "in place." + 0, + 1, + 2, + // Proposed TLS 1.3 header (no explicit nonce). + 5, + 8, + // Probably the most common use of a non-zero `in_prefix_len` + // would be to write a decrypted TLS record over the top of the + // TLS header and nonce. + 5 /* record header */ + 8, /* explicit nonce */ + // The stitched AES-GCM x86-64 code works on 6-block (96 byte) + // units. Some of the ChaCha20 code is even weirder. + 15, // The maximum partial AES block. + 16, // One AES block. + 17, // One byte more than a full AES block. + 31, // 2 AES blocks or 1 ChaCha20 block, minus 1. + 32, // Two AES blocks, one ChaCha20 block. + 33, // 2 AES blocks or 1 ChaCha20 block, plus 1. + 47, // Three AES blocks - 1. + 48, // Three AES blocks. + 49, // Three AES blocks + 1. + 63, // Four AES blocks or two ChaCha20 blocks, minus 1. + 64, // Four AES blocks or two ChaCha20 blocks. + 65, // Four AES blocks or two ChaCha20 blocks, plus 1. + 79, // Five AES blocks, minus 1. + 80, // Five AES blocks. + 81, // Five AES blocks, plus 1. + 95, // Six AES blocks or three ChaCha20 blocks, minus 1. + 96, // Six AES blocks or three ChaCha20 blocks. + 97, // Six AES blocks or three ChaCha20 blocks, plus 1. + 111, // Seven AES blocks, minus 1. + 112, // Seven AES blocks. + 113, // Seven AES blocks, plus 1. + 127, // Eight AES blocks or four ChaCha20 blocks, minus 1. + 128, // Eight AES blocks or four ChaCha20 blocks. + 129, // Eight AES blocks or four ChaCha20 blocks, plus 1. + 143, // Nine AES blocks, minus 1. + 144, // Nine AES blocks. + 145, // Nine AES blocks, plus 1. + 255, // 16 AES blocks or 8 ChaCha20 blocks, minus 1. + 256, // 16 AES blocks or 8 ChaCha20 blocks. + 257, // 16 AES blocks or 8 ChaCha20 blocks, plus 1. + ]; + + let mut more_comprehensive_in_prefix_lengths = [0; 4096]; + let in_prefix_lengths = if cfg!(debug_assertions) { + &MINIMAL_IN_PREFIX_LENS[..] + } else { + #[allow(clippy::needless_range_loop)] + for b in 0..more_comprehensive_in_prefix_lengths.len() { + more_comprehensive_in_prefix_lengths[b] = b; + } + &more_comprehensive_in_prefix_lengths[..] 
+ }; + let mut in_out = vec![123u8; 4096]; + + for &in_prefix_len in in_prefix_lengths.iter() { + in_out.truncate(0); + in_out.resize(in_prefix_len, 123); + in_out.extend_from_slice(tc.ciphertext); + in_out.extend_from_slice(tc.tag); + + let actual_plaintext = open_within( + aead::Nonce::assume_unique_for_key(tc.nonce), + &mut in_out, + in_prefix_len.., + )?; + assert_eq!(actual_plaintext, tc.plaintext); + assert_eq!(&in_out[..tc.plaintext.len()], tc.plaintext); + } + + Ok(()) +} + +fn sealing_key_seal_in_place_append_tag( + alg: &'static aead::Algorithm, + tc: KnownAnswerTestCase, +) -> Result<(), error::Unspecified> { + test_seal_append_tag(&tc, |nonce, in_out| { + let mut key: aead::SealingKey = make_key(alg, tc.key, nonce); + key.seal_in_place_append_tag(tc.aad, in_out) + }) +} + +fn sealing_key_seal_in_place_separate_tag( + alg: &'static aead::Algorithm, + tc: KnownAnswerTestCase, +) -> Result<(), error::Unspecified> { + test_seal_separate_tag(&tc, |nonce, in_out| { + let mut key: aead::SealingKey<_> = make_key(alg, tc.key, nonce); + key.seal_in_place_separate_tag(tc.aad, in_out) + }) +} + +fn opening_key_open_in_place( + alg: &'static aead::Algorithm, + tc: KnownAnswerTestCase, +) -> Result<(), error::Unspecified> { + test_open_in_place(&tc, |nonce, in_out| { + let mut key: aead::OpeningKey<_> = make_key(alg, tc.key, nonce); + key.open_in_place(tc.aad, in_out) + }) +} + +fn opening_key_open_within( + alg: &'static aead::Algorithm, + tc: KnownAnswerTestCase, +) -> Result<(), error::Unspecified> { + test_open_within(&tc, |nonce, in_out, ciphertext_and_tag| { + let mut key: aead::OpeningKey = make_key(alg, tc.key, nonce); + key.open_within(tc.aad, in_out, ciphertext_and_tag) + }) +} + +fn less_safe_key_seal_in_place_append_tag( + alg: &'static aead::Algorithm, + tc: KnownAnswerTestCase, +) -> Result<(), error::Unspecified> { + test_seal_append_tag(&tc, |nonce, in_out| { + let key = make_less_safe_key(alg, tc.key); + key.seal_in_place_append_tag(nonce, tc.aad, in_out) + }) +} + +fn less_safe_key_open_in_place( + alg: &'static aead::Algorithm, + tc: KnownAnswerTestCase, +) -> Result<(), error::Unspecified> { + test_open_in_place(&tc, |nonce, in_out| { + let key = make_less_safe_key(alg, tc.key); + key.open_in_place(nonce, tc.aad, in_out) + }) +} + +fn less_safe_key_seal_in_place_separate_tag( + alg: &'static aead::Algorithm, + tc: KnownAnswerTestCase, +) -> Result<(), error::Unspecified> { + test_seal_separate_tag(&tc, |nonce, in_out| { + let key = make_less_safe_key(alg, tc.key); + key.seal_in_place_separate_tag(nonce, tc.aad, in_out) + }) +} + +fn less_safe_key_open_within( + alg: &'static aead::Algorithm, + tc: KnownAnswerTestCase, +) -> Result<(), error::Unspecified> { + test_open_within(&tc, |nonce, in_out, ciphertext_and_tag| { + let key = make_less_safe_key(alg, tc.key); + key.open_within(nonce, tc.aad, in_out, ciphertext_and_tag) + }) +} + +#[allow(clippy::range_plus_one)] +fn key_sizes(aead_alg: &'static aead::Algorithm) { + let key_len = aead_alg.key_len(); + let key_data = vec![0u8; key_len * 2]; + + // Key is the right size. + assert!(aead::UnboundKey::new(aead_alg, &key_data[..key_len]).is_ok()); + + // Key is one byte too small. + assert!(aead::UnboundKey::new(aead_alg, &key_data[..(key_len - 1)]).is_err()); + + // Key is one byte too large. + assert!(aead::UnboundKey::new(aead_alg, &key_data[..(key_len + 1)]).is_err()); + + // Key is half the required size. 
+ assert!(aead::UnboundKey::new(aead_alg, &key_data[..(key_len / 2)]).is_err()); + + // Key is twice the required size. + assert!(aead::UnboundKey::new(aead_alg, &key_data[..(key_len * 2)]).is_err()); + + // Key is empty. + assert!(aead::UnboundKey::new(aead_alg, &[]).is_err()); + + // Key is one byte. + assert!(aead::UnboundKey::new(aead_alg, &[0]).is_err()); +} + +// Test that we reject non-standard nonce sizes. +#[allow(clippy::range_plus_one)] +#[test] +fn test_aead_nonce_sizes() { + let nonce_len = aead::NONCE_LEN; + let nonce = vec![0u8; nonce_len * 2]; + + assert!(aead::Nonce::try_assume_unique_for_key(&nonce[..nonce_len]).is_ok()); + assert!(aead::Nonce::try_assume_unique_for_key(&nonce[..(nonce_len - 1)]).is_err()); + assert!(aead::Nonce::try_assume_unique_for_key(&nonce[..(nonce_len + 1)]).is_err()); + assert!(aead::Nonce::try_assume_unique_for_key(&nonce[..(nonce_len / 2)]).is_err()); + assert!(aead::Nonce::try_assume_unique_for_key(&nonce[..(nonce_len * 2)]).is_err()); + assert!(aead::Nonce::try_assume_unique_for_key(&[]).is_err()); + assert!(aead::Nonce::try_assume_unique_for_key(&nonce[..1]).is_err()); + assert!(aead::Nonce::try_assume_unique_for_key(&nonce[..16]).is_err()); // 128 bits. +} + +#[allow(clippy::range_plus_one)] +#[test] +fn aead_chacha20_poly1305_openssh() { + // TODO: test_aead_key_sizes(...); + + test::run( + test_file!("aead_chacha20_poly1305_openssh_tests.txt"), + |section, test_case| { + assert_eq!(section, ""); + + // XXX: `polyfill::convert` isn't available here. + let key_bytes = { + let as_vec = test_case.consume_bytes("KEY"); + let mut as_array = [0u8; aead::chacha20_poly1305_openssh::KEY_LEN]; + as_array.copy_from_slice(&as_vec); + as_array + }; + + let sequence_num: u32 = test_case + .consume_usize("SEQUENCE_NUMBER") + .try_into() + .unwrap(); + let plaintext = test_case.consume_bytes("IN"); + let ct = test_case.consume_bytes("CT"); + let expected_tag = test_case.consume_bytes("TAG"); + + // TODO: Add some tests for when things fail. 
+ //let error = test_case.consume_optional_string("FAILS"); + + let mut tag = [0u8; aead::chacha20_poly1305_openssh::TAG_LEN]; + let mut s_in_out = plaintext.clone(); + let s_key = aead::chacha20_poly1305_openssh::SealingKey::new(&key_bytes); + s_key.seal_in_place(sequence_num, &mut s_in_out[..], &mut tag); + assert_eq!(&ct, &s_in_out); + assert_eq!(&expected_tag, &tag); + let o_key = aead::chacha20_poly1305_openssh::OpeningKey::new(&key_bytes); + + { + let o_result = o_key.open_in_place(sequence_num, &mut s_in_out[..], &tag); + assert_eq!(o_result, Ok(&plaintext[4..])); + } + assert_eq!(&s_in_out[..4], &ct[..4]); + assert_eq!(&s_in_out[4..], &plaintext[4..]); + + Ok(()) + }, + ); +} + +#[test] +fn aead_test_aad_traits() { + test::compile_time_assert_send::>(); + test::compile_time_assert_sync::>(); + test::compile_time_assert_copy::>(); + test::compile_time_assert_eq::>>(); // `!Copy` + + let aad_123 = aead::Aad::from(vec![1, 2, 3]); // `!Copy` + assert_eq!(aad_123, aad_123.clone()); // Cover `Clone` and `PartialEq` + assert_eq!( + format!("{:?}", aead::Aad::from(&[1, 2, 3])), + "Aad([1, 2, 3])" + ); +} + +#[test] +fn test_nonce_traits() { + test::compile_time_assert_send::(); + test::compile_time_assert_sync::(); +} + +#[test] +fn test_tag_traits() { + test::compile_time_assert_send::(); + test::compile_time_assert_sync::(); + + test::compile_time_assert_copy::(); + test::compile_time_assert_clone::(); + + let tag = aead::Tag::from([4u8; 16]); + let _tag_2 = tag; // Cover `Copy` + assert_eq!(tag.as_ref(), tag.clone().as_ref()); // Cover `Clone` +} + +fn test_aead_key_traits() {} + +#[test] +fn test_aead_key_traits_all() { + test_aead_key_traits::>(); + test_aead_key_traits::>(); + test_aead_key_traits::(); +} + +#[test] +fn test_aead_key_debug() { + let key_bytes = [0; 32]; + let nonce = [0; aead::NONCE_LEN]; + + let key = aead::UnboundKey::new(&aead::AES_256_GCM, &key_bytes).unwrap(); + assert_eq!( + "UnboundKey { algorithm: AES_256_GCM }", + format!("{:?}", key) + ); + + let sealing_key: aead::SealingKey = make_key( + &aead::AES_256_GCM, + &key_bytes, + aead::Nonce::try_assume_unique_for_key(&nonce).unwrap(), + ); + assert_eq!( + "SealingKey { algorithm: AES_256_GCM }", + format!("{:?}", sealing_key) + ); + + let opening_key: aead::OpeningKey = make_key( + &aead::AES_256_GCM, + &key_bytes, + aead::Nonce::try_assume_unique_for_key(&nonce).unwrap(), + ); + assert_eq!( + "OpeningKey { algorithm: AES_256_GCM }", + format!("{:?}", opening_key) + ); + + let key: aead::LessSafeKey = make_less_safe_key(&aead::AES_256_GCM, &key_bytes); + assert_eq!( + "LessSafeKey { algorithm: AES_256_GCM }", + format!("{:?}", key) + ); +} + +fn test_aead_lesssafekey_clone_for_algorithm(algorithm: &'static aead::Algorithm) { + let test_bytes: Vec = (0..32).collect(); + let key_bytes = &test_bytes[..algorithm.key_len()]; + let nonce_bytes = &test_bytes[..algorithm.nonce_len()]; + + let key1: aead::LessSafeKey = + aead::LessSafeKey::new(aead::UnboundKey::new(algorithm, key_bytes).unwrap()); + let key2 = key1.clone(); + + // LessSafeKey doesn't support AsRef or PartialEq, so instead just check that both keys produce + // the same encrypted output. 
+ let mut buf1: Vec = (0..100).collect(); + let mut buf2 = buf1.clone(); + let tag1 = key1 + .seal_in_place_separate_tag( + aead::Nonce::try_assume_unique_for_key(nonce_bytes).unwrap(), + aead::Aad::empty(), + &mut buf1, + ) + .unwrap(); + let tag2 = key2 + .seal_in_place_separate_tag( + aead::Nonce::try_assume_unique_for_key(nonce_bytes).unwrap(), + aead::Aad::empty(), + &mut buf2, + ) + .unwrap(); + assert_eq!(tag1.as_ref(), tag2.as_ref()); + assert_eq!(buf1, buf2); +} + +#[test] +fn test_aead_lesssafekey_clone_aes_128_gcm() { + test_aead_lesssafekey_clone_for_algorithm(&aead::AES_128_GCM); +} + +#[test] +fn test_aead_lesssafekey_clone_aes_256_gcm() { + test_aead_lesssafekey_clone_for_algorithm(&aead::AES_256_GCM); +} + +#[test] +fn test_aead_lesssafekey_clone_chacha20_poly1305() { + test_aead_lesssafekey_clone_for_algorithm(&aead::CHACHA20_POLY1305); +} + +fn make_key>( + algorithm: &'static aead::Algorithm, + key: &[u8], + nonce: aead::Nonce, +) -> K { + let key = aead::UnboundKey::new(algorithm, key).unwrap(); + let nonce_sequence = OneNonceSequence::new(nonce); + K::new(key, nonce_sequence) +} + +fn make_less_safe_key(algorithm: &'static aead::Algorithm, key: &[u8]) -> aead::LessSafeKey { + let key = aead::UnboundKey::new(algorithm, key).unwrap(); + aead::LessSafeKey::new(key) +} + +struct OneNonceSequence(Option); + +impl OneNonceSequence { + /// Constructs the sequence allowing `advance()` to be called + /// `allowed_invocations` times. + fn new(nonce: aead::Nonce) -> Self { + Self(Some(nonce)) + } +} + +impl aead::NonceSequence for OneNonceSequence { + fn advance(&mut self) -> Result { + self.0.take().ok_or(error::Unspecified) + } +} diff --git a/ring-0.17.14/tests/agreement_tests.rs b/ring-0.17.14/tests/agreement_tests.rs new file mode 100644 index 0000000000..edbe04fcb5 --- /dev/null +++ b/ring-0.17.14/tests/agreement_tests.rs @@ -0,0 +1,218 @@ +// Copyright 2015-2017 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
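The AEAD tests above all reduce to the same seal/open round trip. A minimal sketch of that flow with `LessSafeKey` (editorial, not part of the vendored tests; key and nonce handling is simplified, and a real caller must never reuse a nonce with the same key):

```rust
use ring::aead::{self, Aad, LessSafeKey, Nonce, UnboundKey};
use ring::error;

// Seal `msg` with AES-256-GCM, then open it again and return the recovered
// plaintext.
fn seal_then_open(
    key_bytes: &[u8; 32],
    nonce_bytes: [u8; aead::NONCE_LEN],
    msg: &[u8],
) -> Result<Vec<u8>, error::Unspecified> {
    let key = LessSafeKey::new(UnboundKey::new(&aead::AES_256_GCM, key_bytes)?);

    // Encrypt in place; the authentication tag is appended to `in_out`.
    let mut in_out = msg.to_vec();
    key.seal_in_place_append_tag(
        Nonce::assume_unique_for_key(nonce_bytes),
        Aad::empty(),
        &mut in_out,
    )?;

    // Decrypt in place; on success the returned slice is the plaintext.
    let plaintext = key.open_in_place(
        Nonce::assume_unique_for_key(nonce_bytes),
        Aad::empty(),
        &mut in_out,
    )?;
    Ok(plaintext.to_vec())
}
```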
+ +#![allow(missing_docs)] + +#[cfg(all(target_arch = "wasm32", target_os = "unknown"))] +use wasm_bindgen_test::{wasm_bindgen_test as test, wasm_bindgen_test_configure}; + +#[cfg(all(target_arch = "wasm32", target_os = "unknown"))] +wasm_bindgen_test_configure!(run_in_browser); + +extern crate alloc; + +use ring::{agreement, error, rand}; +#[allow(deprecated)] +use ring::{test, test_file}; + +#[test] +fn agreement_traits() { + use alloc::vec::Vec; + + let rng = rand::SystemRandom::new(); + let private_key = + agreement::EphemeralPrivateKey::generate(&agreement::ECDH_P256, &rng).unwrap(); + + test::compile_time_assert_send::(); + test::compile_time_assert_sync::(); + + assert_eq!( + format!("{:?}", &private_key), + "EphemeralPrivateKey { algorithm: Algorithm { curve: P256 } }" + ); + + let public_key = private_key.compute_public_key().unwrap(); + + test::compile_time_assert_clone::(); + test::compile_time_assert_send::(); + test::compile_time_assert_sync::(); + + // Verify `PublicKey` implements `Debug`. + // + // TODO: Test the actual output. + let _: &dyn core::fmt::Debug = &public_key; + + test::compile_time_assert_clone::>(); + test::compile_time_assert_copy::>(); + test::compile_time_assert_sync::>(); + + test::compile_time_assert_clone::>>(); + test::compile_time_assert_sync::>>(); + + let unparsed_public_key = + agreement::UnparsedPublicKey::new(&agreement::X25519, &[0x01, 0x02, 0x03]); + + assert_eq!( + format!("{:?}", unparsed_public_key), + r#"UnparsedPublicKey { algorithm: Algorithm { curve: Curve25519 }, bytes: "010203" }"# + ); + + // Test `AsRef<[u8]>` + assert_eq!(unparsed_public_key.as_ref(), &[0x01, 0x02, 0x03]); +} + +#[test] +fn agreement_agree_ephemeral() { + let rng = rand::SystemRandom::new(); + + test::run(test_file!("agreement_tests.txt"), |section, test_case| { + assert_eq!(section, ""); + + let curve_name = test_case.consume_string("Curve"); + let alg = alg_from_curve_name(&curve_name); + let peer_public = agreement::UnparsedPublicKey::new(alg, test_case.consume_bytes("PeerQ")); + + match test_case.consume_optional_string("Error") { + None => { + let my_private = test_case.consume_bytes("D"); + let my_private = { + #[allow(deprecated)] + let rng = test::rand::FixedSliceRandom { bytes: &my_private }; + agreement::EphemeralPrivateKey::generate(alg, &rng)? + }; + let my_public = test_case.consume_bytes("MyQ"); + let output = test_case.consume_bytes("Output"); + + assert_eq!(my_private.algorithm(), alg); + + let computed_public = my_private.compute_public_key().unwrap(); + assert_eq!(computed_public.as_ref(), &my_public[..]); + + assert_eq!(my_private.algorithm(), alg); + + let result = agreement::agree_ephemeral(my_private, &peer_public, |key_material| { + assert_eq!(key_material, &output[..]); + }); + assert_eq!(result, Ok(())); + } + + Some(_) => { + // In the no-heap mode, some algorithms aren't supported so + // we have to skip those algorithms' test cases. + let dummy_private_key = agreement::EphemeralPrivateKey::generate(alg, &rng)?; + fn kdf_not_called(_: &[u8]) -> Result<(), ()> { + panic!( + "The KDF was called during ECDH when the peer's \ + public key is invalid." 
+ ); + } + assert!(agreement::agree_ephemeral( + dummy_private_key, + &peer_public, + kdf_not_called + ) + .is_err()); + } + } + + Ok(()) + }); +} + +#[test] +fn test_agreement_ecdh_x25519_rfc_iterated() { + let mut k = h("0900000000000000000000000000000000000000000000000000000000000000"); + let mut u = k.clone(); + + fn expect_iterated_x25519( + expected_result: &str, + range: core::ops::Range, + k: &mut Vec, + u: &mut Vec, + ) { + for _ in range { + let new_k = x25519(k, u); + u.clone_from(k); + *k = new_k; + } + assert_eq!(&h(expected_result), k); + } + + expect_iterated_x25519( + "422c8e7a6227d7bca1350b3e2bb7279f7897b87bb6854b783c60e80311ae3079", + 0..1, + &mut k, + &mut u, + ); + expect_iterated_x25519( + "684cf59ba83309552800ef566f2f4d3c1c3887c49360e3875f2eb94d99532c51", + 1..1_000, + &mut k, + &mut u, + ); + + // The spec gives a test vector for 1,000,000 iterations but it takes + // too long to do 1,000,000 iterations by default right now. This + // 10,000 iteration vector is self-computed. + expect_iterated_x25519( + "2c125a20f639d504a7703d2e223c79a79de48c4ee8c23379aa19a62ecd211815", + 1_000..10_000, + &mut k, + &mut u, + ); + + if cfg!(feature = "slow_tests") { + expect_iterated_x25519( + "7c3911e0ab2586fd864497297e575e6f3bc601c0883c30df5f4dd2d24f665424", + 10_000..1_000_000, + &mut k, + &mut u, + ); + } +} + +fn x25519(private_key: &[u8], public_key: &[u8]) -> Vec { + x25519_(private_key, public_key).unwrap() +} + +fn x25519_(private_key: &[u8], public_key: &[u8]) -> Result, error::Unspecified> { + #[allow(deprecated)] + let rng = test::rand::FixedSliceRandom { bytes: private_key }; + let private_key = agreement::EphemeralPrivateKey::generate(&agreement::X25519, &rng)?; + let public_key = agreement::UnparsedPublicKey::new(&agreement::X25519, public_key); + agreement::agree_ephemeral(private_key, &public_key, |agreed_value| { + Vec::from(agreed_value) + }) +} + +fn h(s: &str) -> Vec { + match test::from_hex(s) { + Ok(v) => v, + Err(msg) => { + panic!("{} in {}", msg, s); + } + } +} + +fn alg_from_curve_name(curve_name: &str) -> &'static agreement::Algorithm { + if curve_name == "P-256" { + &agreement::ECDH_P256 + } else if curve_name == "P-384" { + &agreement::ECDH_P384 + } else if curve_name == "X25519" { + &agreement::X25519 + } else { + panic!("Unsupported curve: {}", curve_name); + } +} diff --git a/ring-0.17.14/tests/constant_time_tests.rs b/ring-0.17.14/tests/constant_time_tests.rs new file mode 100644 index 0000000000..b8d9ac38ff --- /dev/null +++ b/ring-0.17.14/tests/constant_time_tests.rs @@ -0,0 +1,78 @@ +// Copyright 2020 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
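The agreement tests above exercise the same three-step flow a caller would use. A minimal sketch from one party's side (editorial; `peer_public_bytes` would arrive from the other party):

```rust
use ring::{agreement, error, rand};

// Derive a shared secret over X25519. In real use the key material should be
// fed into a KDF inside the closure rather than copied out.
fn derive_shared_secret(peer_public_bytes: &[u8]) -> Result<Vec<u8>, error::Unspecified> {
    let rng = rand::SystemRandom::new();
    let my_private = agreement::EphemeralPrivateKey::generate(&agreement::X25519, &rng)?;
    let _my_public = my_private.compute_public_key()?; // Send this to the peer.

    let peer_public = agreement::UnparsedPublicKey::new(&agreement::X25519, peer_public_bytes);
    agreement::agree_ephemeral(my_private, &peer_public, |key_material| {
        key_material.to_vec()
    })
}
```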
+ +#![allow(missing_docs)] + +#[allow(deprecated)] +use constant_time::verify_slices_are_equal; +#[allow(deprecated)] +use ring::constant_time; +use ring::{error, rand}; + +#[cfg(all(target_arch = "wasm32", target_os = "unknown"))] +use wasm_bindgen_test::{wasm_bindgen_test as test, wasm_bindgen_test_configure}; + +#[cfg(all(target_arch = "wasm32", target_os = "unknown"))] +wasm_bindgen_test_configure!(run_in_browser); + +// This logic is loosely based on BoringSSL's `TEST(ConstantTimeTest, MemCmp)`. +#[allow(deprecated)] +#[test] +fn test_verify_slices_are_equal() { + let initial: [u8; 256] = rand::generate(&rand::SystemRandom::new()).unwrap().expose(); + + { + let copy = initial; + for len in 0..copy.len() { + // Not equal because the lengths do not match. + assert_eq!( + verify_slices_are_equal(&initial, ©[..len]), + Err(error::Unspecified) + ); + // Equal lengths and equal contents. + assert_eq!( + verify_slices_are_equal(&initial[..len], ©[..len]), + Ok(()) + ); + } + // Equal lengths and equal contents. + assert_eq!(verify_slices_are_equal(&initial, ©), Ok(())); + } + + for i in 0..initial.len() { + for bit in 0..8 { + let mut copy = initial; + copy[i] ^= 1u8 << bit; + + for len in 0..=initial.len() { + // We flipped at least one bit in `copy`. + assert_ne!(&initial[..], ©[..]); + + let a = &initial[..len]; + let b = ©[..len]; + + let expected_result = if i < len { + // The flipped bit is within `b` so `a` and `b` are not equal. + Err(error::Unspecified) + } else { + // The flipped bit is outside of `b` so `a` and `b` are equal. + Ok(()) + }; + assert_eq!(a == b, expected_result.is_ok()); // Sanity check. + assert_eq!(verify_slices_are_equal(a, b), expected_result); + assert_eq!(verify_slices_are_equal(b, a), expected_result); + } + } + } +} diff --git a/ring-0.17.14/tests/digest_tests.rs b/ring-0.17.14/tests/digest_tests.rs new file mode 100644 index 0000000000..0a3a824826 --- /dev/null +++ b/ring-0.17.14/tests/digest_tests.rs @@ -0,0 +1,137 @@ +// Copyright 2015-2017 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +#![allow(missing_docs)] + +use ring::digest; +#[allow(deprecated)] +use ring::{test, test_file}; + +#[cfg(all(target_arch = "wasm32", target_os = "unknown"))] +use wasm_bindgen_test::{wasm_bindgen_test as test, wasm_bindgen_test_configure}; + +#[cfg(all(target_arch = "wasm32", target_os = "unknown"))] +wasm_bindgen_test_configure!(run_in_browser); + +/// Test vectors from BoringSSL, Go, and other sources. 
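The digest tests that follow check that the streaming (`Context`) and one-shot (`digest::digest`) interfaces agree. A minimal sketch of that equivalence (editorial, not part of the vendored tests):

```rust
use ring::digest;

// Hash the same data incrementally and in one shot; both paths must agree.
fn sha256_two_ways(chunks: &[&[u8]]) -> bool {
    let mut ctx = digest::Context::new(&digest::SHA256);
    let mut all = Vec::new();
    for chunk in chunks {
        ctx.update(chunk);
        all.extend_from_slice(chunk);
    }
    ctx.finish().as_ref() == digest::digest(&digest::SHA256, &all).as_ref()
}
```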
+#[test] +fn digest_misc() { + test::run(test_file!("digest_tests.txt"), |section, test_case| { + assert_eq!(section, ""); + let digest_alg = test_case.consume_digest_alg("Hash").unwrap(); + let input = test_case.consume_bytes("Input"); + let repeat = test_case.consume_usize("Repeat"); + let expected = test_case.consume_bytes("Output"); + + let mut ctx = digest::Context::new(digest_alg); + let mut data = Vec::new(); + for _ in 0..repeat { + ctx.update(&input); + data.extend(&input); + } + let actual_from_chunks = ctx.finish(); + assert_eq!(&expected, &actual_from_chunks.as_ref()); + + let actual_from_one_shot = digest::digest(digest_alg, &data); + assert_eq!(&expected, &actual_from_one_shot.as_ref()); + + Ok(()) + }); +} + +/// Test some ways in which `Context::update` and/or `Context::finish` +/// could go wrong by testing every combination of updating three inputs +/// that vary from zero bytes to one byte larger than the block length. +/// +/// These are not run in dev (debug) builds because they are too slow. +macro_rules! test_i_u_f { + ( $test_name:ident, $alg:expr) => { + #[cfg(not(debug_assertions))] + #[test] + fn $test_name() { + let mut input = [0; (digest::MAX_BLOCK_LEN + 1) * 3]; + let max = $alg.block_len() + 1; + for i in 0..(max * 3) { + input[i] = (i & 0xff) as u8; + } + + for i in 0..max { + for j in 0..max { + for k in 0..max { + let part1 = &input[..i]; + let part2 = &input[i..(i + j)]; + let part3 = &input[(i + j)..(i + j + k)]; + + let mut ctx = digest::Context::new(&$alg); + ctx.update(part1); + ctx.update(part2); + ctx.update(part3); + let i_u_f = ctx.finish(); + + let one_shot = digest::digest(&$alg, &input[..(i + j + k)]); + + assert_eq!(i_u_f.as_ref(), one_shot.as_ref()); + } + } + } + } + }; +} +test_i_u_f!(digest_test_i_u_f_sha1, digest::SHA1_FOR_LEGACY_USE_ONLY); +test_i_u_f!(digest_test_i_u_f_sha256, digest::SHA256); +test_i_u_f!(digest_test_i_u_f_sha384, digest::SHA384); +test_i_u_f!(digest_test_i_u_f_sha512, digest::SHA512); + +#[test] +fn test_fmt_algorithm() { + assert_eq!("SHA1", &format!("{:?}", digest::SHA1_FOR_LEGACY_USE_ONLY)); + assert_eq!("SHA256", &format!("{:?}", digest::SHA256)); + assert_eq!("SHA384", &format!("{:?}", digest::SHA384)); + assert_eq!("SHA512", &format!("{:?}", digest::SHA512)); + assert_eq!("SHA512_256", &format!("{:?}", digest::SHA512_256)); +} + +#[test] +fn digest_test_fmt() { + assert_eq!( + "SHA1:b7e23ec29af22b0b4e41da31e868d57226121c84", + &format!( + "{:?}", + digest::digest(&digest::SHA1_FOR_LEGACY_USE_ONLY, b"hello, world") + ) + ); + assert_eq!( + "SHA256:09ca7e4eaa6e8ae9c7d261167129184883644d\ + 07dfba7cbfbc4c8a2e08360d5b", + &format!("{:?}", digest::digest(&digest::SHA256, b"hello, world")) + ); + assert_eq!( + "SHA384:1fcdb6059ce05172a26bbe2a3ccc88ed5a8cd5\ + fc53edfd9053304d429296a6da23b1cd9e5c9ed3bb34f0\ + 0418a70cdb7e", + &format!("{:?}", digest::digest(&digest::SHA384, b"hello, world")) + ); + assert_eq!( + "SHA512:8710339dcb6814d0d9d2290ef422285c9322b7\ + 163951f9a0ca8f883d3305286f44139aa374848e4174f5\ + aada663027e4548637b6d19894aec4fb6c46a139fbf9", + &format!("{:?}", digest::digest(&digest::SHA512, b"hello, world")) + ); + + assert_eq!( + "SHA512_256:11f2c88c04f0a9c3d0970894ad2472505e\ + 0bc6e8c7ec46b5211cd1fa3e253e62", + &format!("{:?}", digest::digest(&digest::SHA512_256, b"hello, world")) + ); +} diff --git a/ring-0.17.14/tests/ecdsa_test_private_key_p256.p8 b/ring-0.17.14/tests/ecdsa_test_private_key_p256.p8 new file mode 100644 index 0000000000..bc118842bb Binary files /dev/null and 
b/ring-0.17.14/tests/ecdsa_test_private_key_p256.p8 differ diff --git a/ring-0.17.14/tests/ecdsa_test_public_key_p256.der b/ring-0.17.14/tests/ecdsa_test_public_key_p256.der new file mode 100644 index 0000000000..ce3e6d7e97 --- /dev/null +++ b/ring-0.17.14/tests/ecdsa_test_public_key_p256.der @@ -0,0 +1 @@ +f#eP墳M`*eғP3M?5Y$^[Zzܧk3Y5 . \ No newline at end of file diff --git a/ring-0.17.14/tests/ecdsa_test_public_key_p256_debug.txt b/ring-0.17.14/tests/ecdsa_test_public_key_p256_debug.txt new file mode 100644 index 0000000000..b590fc8490 --- /dev/null +++ b/ring-0.17.14/tests/ecdsa_test_public_key_p256_debug.txt @@ -0,0 +1 @@ +PublicKey("04fc116698a3e3236550c4c9efa9bd4d0619602a65d2930e9150ab33e84dbc83f8a6a6b9933f35ab59245e5b5a7af5dca76b33cbe7aeee5981b3ca350bebf52ecd") \ No newline at end of file diff --git a/ring-0.17.14/tests/ecdsa_tests.rs b/ring-0.17.14/tests/ecdsa_tests.rs new file mode 100644 index 0000000000..83e5abf405 --- /dev/null +++ b/ring-0.17.14/tests/ecdsa_tests.rs @@ -0,0 +1,321 @@ +// Copyright 2015-2016 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +#![allow(missing_docs)] + +use ring::{ + rand, + signature::{self, KeyPair}, +}; +#[allow(deprecated)] +use ring::{test, test_file}; + +// ECDSA *signing* tests are in src/ec/ecdsa/signing.rs. 
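For context on what the tests below exercise, here is a minimal sketch of the ECDSA P-256 sign/verify round trip (editorial; `pkcs8_bytes` is assumed to hold a key in the format produced by `EcdsaKeyPair::generate_pkcs8`):

```rust
use ring::rand;
use ring::signature::{self, KeyPair};

// Load a P-256 key from PKCS#8, sign `msg`, and verify the signature with
// the corresponding public key.
fn ecdsa_sign_and_verify(pkcs8_bytes: &[u8], msg: &[u8]) -> Result<(), ring::error::Unspecified> {
    let rng = rand::SystemRandom::new();
    let key_pair = signature::EcdsaKeyPair::from_pkcs8(
        &signature::ECDSA_P256_SHA256_FIXED_SIGNING,
        pkcs8_bytes,
        &rng,
    )
    .map_err(|_| ring::error::Unspecified)?;

    let sig = key_pair.sign(&rng, msg)?;

    let public_key = signature::UnparsedPublicKey::new(
        &signature::ECDSA_P256_SHA256_FIXED,
        key_pair.public_key().as_ref(),
    );
    public_key.verify(msg, sig.as_ref())
}
```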
+ +#[test] +fn ecdsa_from_pkcs8_test() { + let rng = rand::SystemRandom::new(); + + test::run( + test_file!("ecdsa_from_pkcs8_tests.txt"), + |section, test_case| { + assert_eq!(section, ""); + + let curve_name = test_case.consume_string("Curve"); + let ((this_fixed, this_asn1), (other_fixed, other_asn1)) = match curve_name.as_str() { + "P-256" => ( + ( + &signature::ECDSA_P256_SHA256_FIXED_SIGNING, + &signature::ECDSA_P256_SHA256_ASN1_SIGNING, + ), + ( + &signature::ECDSA_P384_SHA384_FIXED_SIGNING, + &signature::ECDSA_P384_SHA384_ASN1_SIGNING, + ), + ), + "P-384" => ( + ( + &signature::ECDSA_P384_SHA384_FIXED_SIGNING, + &signature::ECDSA_P384_SHA384_ASN1_SIGNING, + ), + ( + &signature::ECDSA_P256_SHA256_FIXED_SIGNING, + &signature::ECDSA_P256_SHA256_ASN1_SIGNING, + ), + ), + _ => unreachable!(), + }; + + let input = test_case.consume_bytes("Input"); + + let error = test_case.consume_optional_string("Error"); + + match ( + signature::EcdsaKeyPair::from_pkcs8(this_fixed, &input, &rng), + error.clone(), + ) { + (Ok(_), None) => (), + (Err(e), None) => panic!("Failed with error \"{}\", but expected to succeed", e), + (Ok(_), Some(e)) => panic!("Succeeded, but expected error \"{}\"", e), + (Err(actual), Some(expected)) => assert_eq!(format!("{}", actual), expected), + }; + + match ( + signature::EcdsaKeyPair::from_pkcs8(this_asn1, &input, &rng), + error, + ) { + (Ok(_), None) => (), + (Err(e), None) => panic!("Failed with error \"{}\", but expected to succeed", e), + (Ok(_), Some(e)) => panic!("Succeeded, but expected error \"{}\"", e), + (Err(actual), Some(expected)) => assert_eq!(format!("{}", actual), expected), + }; + + assert!(signature::EcdsaKeyPair::from_pkcs8(other_fixed, &input, &rng).is_err()); + assert!(signature::EcdsaKeyPair::from_pkcs8(other_asn1, &input, &rng).is_err()); + + Ok(()) + }, + ); +} + +// Verify that, at least, we generate PKCS#8 documents that we can read. 
+#[test] +fn ecdsa_generate_pkcs8_test() { + let rng = rand::SystemRandom::new(); + + for alg in &[ + &signature::ECDSA_P256_SHA256_ASN1_SIGNING, + &signature::ECDSA_P256_SHA256_FIXED_SIGNING, + &signature::ECDSA_P384_SHA384_ASN1_SIGNING, + &signature::ECDSA_P384_SHA384_FIXED_SIGNING, + ] { + let pkcs8 = signature::EcdsaKeyPair::generate_pkcs8(alg, &rng).unwrap(); + println!(); + for b in pkcs8.as_ref() { + print!("{:02x}", *b); + } + println!(); + println!(); + + #[cfg(feature = "alloc")] + let _ = signature::EcdsaKeyPair::from_pkcs8(alg, pkcs8.as_ref(), &rng).unwrap(); + } +} + +#[test] +fn signature_ecdsa_verify_asn1_test() { + test::run( + test_file!("ecdsa_verify_asn1_tests.txt"), + |section, test_case| { + assert_eq!(section, ""); + + let curve_name = test_case.consume_string("Curve"); + let digest_name = test_case.consume_string("Digest"); + let msg = test_case.consume_bytes("Msg"); + let public_key = test_case.consume_bytes("Q"); + let sig = test_case.consume_bytes("Sig"); + let is_valid = test_case.consume_string("Result") == "P (0 )"; + + let alg = match (curve_name.as_str(), digest_name.as_str()) { + ("P-256", "SHA256") => &signature::ECDSA_P256_SHA256_ASN1, + ("P-256", "SHA384") => &signature::ECDSA_P256_SHA384_ASN1, + ("P-384", "SHA256") => &signature::ECDSA_P384_SHA256_ASN1, + ("P-384", "SHA384") => &signature::ECDSA_P384_SHA384_ASN1, + _ => { + panic!("Unsupported curve+digest: {}+{}", curve_name, digest_name); + } + }; + + let actual_result = + signature::UnparsedPublicKey::new(alg, &public_key).verify(&msg, &sig); + assert_eq!(actual_result.is_ok(), is_valid); + + Ok(()) + }, + ); +} + +#[test] +fn signature_ecdsa_verify_fixed_test() { + test::run( + test_file!("ecdsa_verify_fixed_tests.txt"), + |section, test_case| { + assert_eq!(section, ""); + + let curve_name = test_case.consume_string("Curve"); + let digest_name = test_case.consume_string("Digest"); + + let msg = test_case.consume_bytes("Msg"); + let public_key = test_case.consume_bytes("Q"); + let sig = test_case.consume_bytes("Sig"); + let expected_result = test_case.consume_string("Result"); + + let alg = match (curve_name.as_str(), digest_name.as_str()) { + ("P-256", "SHA256") => &signature::ECDSA_P256_SHA256_FIXED, + ("P-384", "SHA384") => &signature::ECDSA_P384_SHA384_FIXED, + _ => { + panic!("Unsupported curve+digest: {}+{}", curve_name, digest_name); + } + }; + + let is_valid = expected_result == "P (0 )"; + + let actual_result = + signature::UnparsedPublicKey::new(alg, &public_key).verify(&msg, &sig); + assert_eq!(actual_result.is_ok(), is_valid); + + Ok(()) + }, + ); +} + +#[test] +fn ecdsa_test_public_key_coverage() { + const PRIVATE_KEY: &[u8] = include_bytes!("ecdsa_test_private_key_p256.p8"); + const PUBLIC_KEY: &[u8] = include_bytes!("ecdsa_test_public_key_p256.der"); + const PUBLIC_KEY_DEBUG: &str = include_str!("ecdsa_test_public_key_p256_debug.txt"); + + let rng = rand::SystemRandom::new(); + let key_pair = signature::EcdsaKeyPair::from_pkcs8( + &signature::ECDSA_P256_SHA256_FIXED_SIGNING, + PRIVATE_KEY, + &rng, + ) + .unwrap(); + + // Test `AsRef<[u8]>` + assert_eq!(key_pair.public_key().as_ref(), PUBLIC_KEY); + + // Test `Clone`. + #[allow(clippy::clone_on_copy, clippy::redundant_clone)] + let _: ::PublicKey = key_pair.public_key().clone(); + + // Test `Copy`. + let _: ::PublicKey = *key_pair.public_key(); + + // Test `Debug`. 
+ assert_eq!(PUBLIC_KEY_DEBUG, format!("{:?}", key_pair.public_key())); + assert_eq!( + format!("EcdsaKeyPair {{ public_key: {:?} }}", key_pair.public_key()), + format!("{:?}", key_pair) + ); +} + +// This test is not a known-answer test, though it re-uses the known-answer +// test vectors. Because the nonce is randomized, the signature will be +// different each time. Because of that, here we simply verify that the +// signature verifies correctly. The known-answer tests themselves are in +// ecsda/signing.rs. +#[test] +fn signature_ecdsa_sign_fixed_sign_and_verify_test() { + let rng = rand::SystemRandom::new(); + + test::run( + test_file!("../src/ec/suite_b/ecdsa/ecdsa_sign_fixed_tests.txt"), + |section, test_case| { + assert_eq!(section, ""); + + let curve_name = test_case.consume_string("Curve"); + let digest_name = test_case.consume_string("Digest"); + + let msg = test_case.consume_bytes("Msg"); + let d = test_case.consume_bytes("d"); + let q = test_case.consume_bytes("Q"); + + // Ignored since the actual signature will use a randomized nonce. + let _k = test_case.consume_bytes("k"); + let _expected_result = test_case.consume_bytes("Sig"); + + let (signing_alg, verification_alg) = match (curve_name.as_str(), digest_name.as_str()) + { + ("P-256", "SHA256") => ( + &signature::ECDSA_P256_SHA256_FIXED_SIGNING, + &signature::ECDSA_P256_SHA256_FIXED, + ), + ("P-384", "SHA384") => ( + &signature::ECDSA_P384_SHA384_FIXED_SIGNING, + &signature::ECDSA_P384_SHA384_FIXED, + ), + _ => { + panic!("Unsupported curve+digest: {}+{}", curve_name, digest_name); + } + }; + + let private_key = + signature::EcdsaKeyPair::from_private_key_and_public_key(signing_alg, &d, &q, &rng) + .unwrap(); + + let signature = private_key.sign(&rng, &msg).unwrap(); + + let public_key = signature::UnparsedPublicKey::new(verification_alg, q); + assert_eq!(public_key.verify(&msg, signature.as_ref()), Ok(())); + + Ok(()) + }, + ); +} + +// This test is not a known-answer test, though it re-uses the known-answer +// test vectors. Because the nonce is randomized, the signature will be +// different each time. Because of that, here we simply verify that the +// signature verifies correctly. The known-answer tests themselves are in +// ecsda/signing.rs. +#[test] +fn signature_ecdsa_sign_asn1_test() { + let rng = rand::SystemRandom::new(); + + test::run( + test_file!("../src/ec/suite_b/ecdsa/ecdsa_sign_asn1_tests.txt"), + |section, test_case| { + assert_eq!(section, ""); + + let curve_name = test_case.consume_string("Curve"); + let digest_name = test_case.consume_string("Digest"); + + let msg = test_case.consume_bytes("Msg"); + let d = test_case.consume_bytes("d"); + let q = test_case.consume_bytes("Q"); + + // Ignored since the actual signature will use a randomized nonce. 
+ let _k = test_case.consume_bytes("k"); + let _expected_result = test_case.consume_bytes("Sig"); + + let (signing_alg, verification_alg) = match (curve_name.as_str(), digest_name.as_str()) + { + ("P-256", "SHA256") => ( + &signature::ECDSA_P256_SHA256_ASN1_SIGNING, + &signature::ECDSA_P256_SHA256_ASN1, + ), + ("P-384", "SHA384") => ( + &signature::ECDSA_P384_SHA384_ASN1_SIGNING, + &signature::ECDSA_P384_SHA384_ASN1, + ), + _ => { + panic!("Unsupported curve+digest: {}+{}", curve_name, digest_name); + } + }; + + let private_key = + signature::EcdsaKeyPair::from_private_key_and_public_key(signing_alg, &d, &q, &rng) + .unwrap(); + + let signature = private_key.sign(&rng, &msg).unwrap(); + + let public_key = signature::UnparsedPublicKey::new(verification_alg, q); + assert_eq!(public_key.verify(&msg, signature.as_ref()), Ok(())); + + Ok(()) + }, + ); +} diff --git a/ring-0.17.14/tests/ed25519_test_private_key.bin b/ring-0.17.14/tests/ed25519_test_private_key.bin new file mode 100644 index 0000000000..afd7169069 --- /dev/null +++ b/ring-0.17.14/tests/ed25519_test_private_key.bin @@ -0,0 +1 @@ +aZ`J,DIi{2ip;` \ No newline at end of file diff --git a/ring-0.17.14/tests/ed25519_test_private_key.p8 b/ring-0.17.14/tests/ed25519_test_private_key.p8 new file mode 100644 index 0000000000..fc90176662 Binary files /dev/null and b/ring-0.17.14/tests/ed25519_test_private_key.p8 differ diff --git a/ring-0.17.14/tests/ed25519_test_public_key.bin b/ring-0.17.14/tests/ed25519_test_public_key.bin new file mode 100644 index 0000000000..9ed91630c4 --- /dev/null +++ b/ring-0.17.14/tests/ed25519_test_public_key.bin @@ -0,0 +1,2 @@ +Z +Kd:rڦ#%hQ \ No newline at end of file diff --git a/ring-0.17.14/tests/ed25519_test_public_key.der b/ring-0.17.14/tests/ed25519_test_public_key.der new file mode 100644 index 0000000000..0ac2bd5c41 --- /dev/null +++ b/ring-0.17.14/tests/ed25519_test_public_key.der @@ -0,0 +1 @@ +X X~WX5ök} \ No newline at end of file diff --git a/ring-0.17.14/tests/ed25519_tests.rs b/ring-0.17.14/tests/ed25519_tests.rs new file mode 100644 index 0000000000..65afc94897 --- /dev/null +++ b/ring-0.17.14/tests/ed25519_tests.rs @@ -0,0 +1,236 @@ +// Copyright 2015-2017 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +#![allow(missing_docs)] + +use ring::{ + error, rand, + signature::{self, Ed25519KeyPair, KeyPair}, +}; +#[allow(deprecated)] +use ring::{test, test_file}; + +#[cfg(all(target_arch = "wasm32", target_os = "unknown"))] +use wasm_bindgen_test::{wasm_bindgen_test as test, wasm_bindgen_test_configure}; + +#[cfg(all(target_arch = "wasm32", target_os = "unknown"))] +wasm_bindgen_test_configure!(run_in_browser); + +/// Test vectors from BoringSSL. 
+#[test] +fn test_signature_ed25519() { + test::run(test_file!("ed25519_tests.txt"), |section, test_case| { + assert_eq!(section, ""); + let seed = test_case.consume_bytes("SEED"); + assert_eq!(32, seed.len()); + + let public_key = test_case.consume_bytes("PUB"); + assert_eq!(32, public_key.len()); + + let msg = test_case.consume_bytes("MESSAGE"); + + let expected_sig = test_case.consume_bytes("SIG"); + + { + let key_pair = Ed25519KeyPair::from_seed_and_public_key(&seed, &public_key).unwrap(); + let actual_sig = key_pair.sign(&msg); + assert_eq!(&expected_sig[..], actual_sig.as_ref()); + } + + // Test PKCS#8 generation, parsing, and private-to-public calculations. + #[allow(deprecated)] + let rng = test::rand::FixedSliceRandom { bytes: &seed }; + let pkcs8 = Ed25519KeyPair::generate_pkcs8(&rng).unwrap(); + let key_pair = Ed25519KeyPair::from_pkcs8(pkcs8.as_ref()).unwrap(); + assert_eq!(public_key, key_pair.public_key().as_ref()); + + // Test Signature generation. + let actual_sig = key_pair.sign(&msg); + assert_eq!(&expected_sig[..], actual_sig.as_ref()); + + // Test Signature verification. + test_signature_verification(&public_key, &msg, &expected_sig, Ok(())); + + let mut tampered_sig = expected_sig; + tampered_sig[0] ^= 1; + + test_signature_verification(&public_key, &msg, &tampered_sig, Err(error::Unspecified)); + + Ok(()) + }); +} + +/// Test vectors from BoringSSL. +#[test] +fn test_signature_ed25519_verify() { + test::run( + test_file!("ed25519_verify_tests.txt"), + |section, test_case| { + assert_eq!(section, ""); + + let public_key = test_case.consume_bytes("PUB"); + let msg = test_case.consume_bytes("MESSAGE"); + let sig = test_case.consume_bytes("SIG"); + let expected_result = match test_case.consume_string("Result").as_str() { + "P" => Ok(()), + "F" => Err(error::Unspecified), + s => panic!("{:?} is not a valid result", s), + }; + test_signature_verification(&public_key, &msg, &sig, expected_result); + Ok(()) + }, + ); +} + +fn test_signature_verification( + public_key: &[u8], + msg: &[u8], + sig: &[u8], + expected_result: Result<(), error::Unspecified>, +) { + assert_eq!( + expected_result, + signature::UnparsedPublicKey::new(&signature::ED25519, public_key).verify(msg, sig) + ); +} + +#[test] +fn test_ed25519_from_seed_and_public_key_misuse() { + const PRIVATE_KEY: &[u8] = include_bytes!("ed25519_test_private_key.bin"); + const PUBLIC_KEY: &[u8] = include_bytes!("ed25519_test_public_key.bin"); + + assert!(Ed25519KeyPair::from_seed_and_public_key(PRIVATE_KEY, PUBLIC_KEY).is_ok()); + + // Truncated private key. + assert!(Ed25519KeyPair::from_seed_and_public_key(&PRIVATE_KEY[..31], PUBLIC_KEY).is_err()); + + // Truncated public key. + assert!(Ed25519KeyPair::from_seed_and_public_key(PRIVATE_KEY, &PUBLIC_KEY[..31]).is_err()); + + // Swapped public and private key. + assert!(Ed25519KeyPair::from_seed_and_public_key(PUBLIC_KEY, PRIVATE_KEY).is_err()); +} + +enum FromPkcs8Variant { + Checked, + MaybeUnchecked, +} + +#[test] +fn test_ed25519_from_pkcs8_unchecked() { + test_ed25519_from_pkcs8_( + FromPkcs8Variant::MaybeUnchecked, + Ed25519KeyPair::from_pkcs8_maybe_unchecked, + ) +} + +#[test] +fn test_ed25519_from_pkcs8() { + test_ed25519_from_pkcs8_(FromPkcs8Variant::Checked, Ed25519KeyPair::from_pkcs8) +} + +fn test_ed25519_from_pkcs8_( + variant: FromPkcs8Variant, + f: impl Fn(&[u8]) -> Result, +) { + // Just test that we can parse the input. 
+ test::run( + test_file!("ed25519_from_pkcs8_tests.txt"), + |section, test_case| { + assert_eq!(section, ""); + let input = test_case.consume_bytes("Input"); + let expected_error = { + let expected_checked = test_case.consume_string("Result-Checked"); + let expected_maybe_unchecked = test_case.consume_string("Result-Maybe-Unchecked"); + let expected_result = match variant { + FromPkcs8Variant::Checked => expected_checked, + FromPkcs8Variant::MaybeUnchecked => expected_maybe_unchecked, + }; + if expected_result == "OK" { + None + } else { + Some(expected_result) + } + }; + let expected_public = { + let expected_if_no_error = test_case.consume_optional_bytes("Public"); + if expected_error.is_none() { + Some(expected_if_no_error.unwrap()) + } else { + None + } + }; + + match f(&input) { + Ok(keypair) => { + assert_eq!(expected_error, None); + assert_eq!( + expected_public.as_deref(), + Some(keypair.public_key().as_ref()) + ); + } + Err(actual_error) => { + assert_eq!(expected_error, Some(format!("{}", actual_error))); + assert_eq!(expected_public, None); + } + } + + Ok(()) + }, + ); +} + +#[test] +fn ed25519_test_generate_pkcs8() { + let rng = rand::SystemRandom::new(); + let generated = Ed25519KeyPair::generate_pkcs8(&rng).unwrap(); + let generated = generated.as_ref(); + + let _ronudtripped = Ed25519KeyPair::from_pkcs8(generated).unwrap(); + + // Regression test: Verify we're generating the correct encoding, as + // `Ed25519KeyPair::from_pkcs8` also accepts our old wrong encoding. + assert_eq!(generated.len(), 19 + 32 + 32); + assert_eq!(&generated[..2], &[0x30, 0x51]); +} + +#[test] +fn ed25519_test_public_key_coverage() { + const PRIVATE_KEY: &[u8] = include_bytes!("ed25519_test_private_key.p8"); + const PUBLIC_KEY: &[u8] = include_bytes!("ed25519_test_public_key.der"); + const PUBLIC_KEY_DEBUG: &str = + "PublicKey(\"5809e9fef6dcec58f0f2e3b0d67e9880a11957e083ace85835c3b6c8fbaf6b7d\")"; + + let key_pair = Ed25519KeyPair::from_pkcs8(PRIVATE_KEY).unwrap(); + + // Test `AsRef<[u8]>` + assert_eq!(key_pair.public_key().as_ref(), PUBLIC_KEY); + + // Test `Clone`. + #[allow(clippy::clone_on_copy)] + let _: ::PublicKey = key_pair.public_key().clone(); + + // Test `Copy`. + let _: ::PublicKey = *key_pair.public_key(); + + // Test `Debug`. + assert_eq!(PUBLIC_KEY_DEBUG, format!("{:?}", key_pair.public_key())); + assert_eq!( + format!( + "Ed25519KeyPair {{ public_key: {:?} }}", + key_pair.public_key() + ), + format!("{:?}", key_pair) + ); +} diff --git a/ring-0.17.14/tests/error_tests.rs b/ring-0.17.14/tests/error_tests.rs new file mode 100644 index 0000000000..0152b703e8 --- /dev/null +++ b/ring-0.17.14/tests/error_tests.rs @@ -0,0 +1,12 @@ +#![allow(missing_docs)] + +#[cfg(feature = "std")] +#[test] +fn error_impl_std_error_error_test() { + use ring::error; + #[allow(deprecated)] + use ring::test; + + test::compile_time_assert_std_error_error::(); + test::compile_time_assert_std_error_error::(); +} diff --git a/ring-0.17.14/tests/hkdf_tests.rs b/ring-0.17.14/tests/hkdf_tests.rs new file mode 100644 index 0000000000..64f664bc1d --- /dev/null +++ b/ring-0.17.14/tests/hkdf_tests.rs @@ -0,0 +1,131 @@ +// Copyright 2015 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. 
+// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +#![allow(missing_docs)] + +use ring::{digest, error, hkdf}; +#[allow(deprecated)] +use ring::{test, test_file}; + +#[cfg(all(target_arch = "wasm32", target_os = "unknown"))] +use wasm_bindgen_test::{wasm_bindgen_test as test, wasm_bindgen_test_configure}; + +#[cfg(all(target_arch = "wasm32", target_os = "unknown"))] +wasm_bindgen_test_configure!(run_in_browser); + +#[test] +fn hkdf_tests() { + test::run(test_file!("hkdf_tests.txt"), |section, test_case| { + assert_eq!(section, ""); + let alg = { + let digest_alg = test_case + .consume_digest_alg("Hash") + .ok_or(error::Unspecified)?; + if digest_alg == &digest::SHA256 { + hkdf::HKDF_SHA256 + } else { + // TODO: add test vectors for other algorithms + panic!("unsupported algorithm: {:?}", digest_alg); + } + }; + let secret = test_case.consume_bytes("IKM"); + let salt = test_case.consume_bytes("salt"); + let info = test_case.consume_bytes("info"); + let _ = test_case.consume_bytes("PRK"); + let expected_out = test_case.consume_bytes("OKM"); + + let salt = hkdf::Salt::new(alg, &salt); + + // TODO: test multi-part info, especially with empty parts. + let My(out) = salt + .extract(&secret) + .expand(&[&info], My(expected_out.len())) + .unwrap() + .into(); + assert_eq!(out, expected_out); + + Ok(()) + }); +} + +#[test] +fn hkdf_output_len_tests() { + for &alg in &[hkdf::HKDF_SHA256, hkdf::HKDF_SHA384, hkdf::HKDF_SHA512] { + const MAX_BLOCKS: usize = 255; + + let salt = hkdf::Salt::new(alg, &[]); + let prk = salt.extract(&[]); // TODO: enforce minimum length. + + { + // Test zero length. + let okm = prk.expand(&[b"info"], My(0)).unwrap(); + let result: My> = okm.into(); + assert_eq!(&result.0, &[]); + } + + let max_out_len = MAX_BLOCKS * alg.hmac_algorithm().digest_algorithm().output_len(); + + { + // Test maximum length output succeeds. + let okm = prk.expand(&[b"info"], My(max_out_len)).unwrap(); + let result: My> = okm.into(); + assert_eq!(result.0.len(), max_out_len); + } + + { + // Test too-large output fails. + assert!(prk.expand(&[b"info"], My(max_out_len + 1)).is_err()); + } + + { + // Test length mismatch (smaller). + let okm = prk.expand(&[b"info"], My(2)).unwrap(); + let mut buf = [0u8; 1]; + assert_eq!(okm.fill(&mut buf), Err(error::Unspecified)); + } + + { + // Test length mismatch (larger). + let okm = prk.expand(&[b"info"], My(2)).unwrap(); + let mut buf = [0u8; 3]; + assert_eq!(okm.fill(&mut buf), Err(error::Unspecified)); + } + + { + // Control for above two tests. + let okm = prk.expand(&[b"info"], My(2)).unwrap(); + let mut buf = [0u8; 2]; + assert_eq!(okm.fill(&mut buf), Ok(())); + } + } +} + +/// Generic newtype wrapper that lets us implement traits for externally-defined +/// types. 
+#[derive(Debug, PartialEq)] +struct My(T); + +impl hkdf::KeyType for My { + fn len(&self) -> usize { + self.0 + } +} + +impl From>> for My> { + fn from(okm: hkdf::Okm>) -> Self { + let mut r = vec![0u8; okm.len().0]; + okm.fill(&mut r).unwrap(); + Self(r) + } +} diff --git a/ring-0.17.14/tests/hmac_tests.rs b/ring-0.17.14/tests/hmac_tests.rs new file mode 100644 index 0000000000..4a56f30f85 --- /dev/null +++ b/ring-0.17.14/tests/hmac_tests.rs @@ -0,0 +1,113 @@ +// Copyright 2015-2016 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +#![allow(missing_docs)] + +use ring::{digest, hmac}; +#[allow(deprecated)] +use ring::{test, test_file}; + +#[cfg(all(target_arch = "wasm32", target_os = "unknown"))] +use wasm_bindgen_test::{wasm_bindgen_test as test, wasm_bindgen_test_configure}; + +#[cfg(all(target_arch = "wasm32", target_os = "unknown"))] +wasm_bindgen_test_configure!(run_in_browser); + +#[test] +fn hmac_tests() { + test::run(test_file!("hmac_tests.txt"), |section, test_case| { + assert_eq!(section, ""); + let digest_alg = test_case.consume_digest_alg("HMAC"); + let key_value = test_case.consume_bytes("Key"); + let mut input = test_case.consume_bytes("Input"); + let output = test_case.consume_bytes("Output"); + + let algorithm = { + let digest_alg = match digest_alg { + Some(digest_alg) => digest_alg, + None => { + return Ok(()); + } // Unsupported digest algorithm + }; + if digest_alg == &digest::SHA1_FOR_LEGACY_USE_ONLY { + hmac::HMAC_SHA1_FOR_LEGACY_USE_ONLY + } else if digest_alg == &digest::SHA256 { + hmac::HMAC_SHA256 + } else if digest_alg == &digest::SHA384 { + hmac::HMAC_SHA384 + } else if digest_alg == &digest::SHA512 { + hmac::HMAC_SHA512 + } else { + unreachable!() + } + }; + + hmac_test_case_inner(algorithm, &key_value[..], &input[..], &output[..], true); + + // Tamper with the input and check that verification fails. + if input.is_empty() { + input.push(0); + } else { + input[0] ^= 1; + } + + hmac_test_case_inner(algorithm, &key_value[..], &input[..], &output[..], false); + + Ok(()) + }); +} + +fn hmac_test_case_inner( + algorithm: hmac::Algorithm, + key_value: &[u8], + input: &[u8], + output: &[u8], + is_ok: bool, +) { + let key = hmac::Key::new(algorithm, key_value); + + // One-shot API. + { + let signature = hmac::sign(&key, input); + assert_eq!(is_ok, signature.as_ref() == output); + assert_eq!(is_ok, hmac::verify(&key, input, output).is_ok()); + } + + // Multi-part API, one single part. + { + let mut s_ctx = hmac::Context::with_key(&key); + s_ctx.update(input); + let signature = s_ctx.sign(); + assert_eq!(is_ok, signature.as_ref() == output); + } + + // Multi-part API, byte by byte. 
+ { + let mut ctx = hmac::Context::with_key(&key); + for b in input { + ctx.update(&[*b]); + } + let signature = ctx.sign(); + assert_eq!(is_ok, signature.as_ref() == output); + } +} + +#[test] +fn hmac_debug() { + let key = hmac::Key::new(hmac::HMAC_SHA256, &[0; 32]); + assert_eq!("Key { algorithm: SHA256 }", format!("{:?}", &key)); + + let ctx = hmac::Context::with_key(&key); + assert_eq!("Context { algorithm: SHA256 }", format!("{:?}", &ctx)); +} diff --git a/ring-0.17.14/tests/pbkdf2_tests.rs b/ring-0.17.14/tests/pbkdf2_tests.rs new file mode 100644 index 0000000000..a78506e60f --- /dev/null +++ b/ring-0.17.14/tests/pbkdf2_tests.rs @@ -0,0 +1,72 @@ +// Copyright 2015-2017 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +#![allow(missing_docs)] + +use core::num::NonZeroU32; +use ring::{digest, error, pbkdf2}; +#[allow(deprecated)] +use ring::{test, test_file}; + +#[cfg(all(target_arch = "wasm32", target_os = "unknown"))] +use wasm_bindgen_test::{wasm_bindgen_test as test, wasm_bindgen_test_configure}; + +#[cfg(all(target_arch = "wasm32", target_os = "unknown"))] +wasm_bindgen_test_configure!(run_in_browser); + +/// Test vectors from BoringSSL, Go, and other sources. +#[test] +pub fn pbkdf2_tests() { + test::run(test_file!("pbkdf2_tests.txt"), |section, test_case| { + assert_eq!(section, ""); + let algorithm = { + let digest_alg = test_case.consume_digest_alg("Hash").unwrap(); + if digest_alg == &digest::SHA1_FOR_LEGACY_USE_ONLY { + pbkdf2::PBKDF2_HMAC_SHA1 + } else if digest_alg == &digest::SHA256 { + pbkdf2::PBKDF2_HMAC_SHA256 + } else if digest_alg == &digest::SHA384 { + pbkdf2::PBKDF2_HMAC_SHA384 + } else if digest_alg == &digest::SHA512 { + pbkdf2::PBKDF2_HMAC_SHA512 + } else { + unreachable!() + } + }; + let iterations: u32 = test_case.consume_usize("c").try_into().unwrap(); + let iterations: NonZeroU32 = iterations.try_into().unwrap(); + let secret = test_case.consume_bytes("P"); + let salt = test_case.consume_bytes("S"); + let dk = test_case.consume_bytes("DK"); + let verify_expected_result = test_case.consume_string("Verify"); + let verify_expected_result = match verify_expected_result.as_str() { + "OK" => Ok(()), + "Err" => Err(error::Unspecified), + _ => panic!("Unsupported value of \"Verify\""), + }; + + { + let mut out = vec![0u8; dk.len()]; + pbkdf2::derive(algorithm, iterations, &salt, &secret, &mut out); + assert_eq!(dk == out, verify_expected_result.is_ok() || dk.is_empty()); + } + + assert_eq!( + pbkdf2::verify(algorithm, iterations, &salt, &secret, &dk), + verify_expected_result + ); + + Ok(()) + }); +} diff --git a/ring-0.17.14/tests/quic_tests.rs b/ring-0.17.14/tests/quic_tests.rs new file mode 100644 index 0000000000..dae30330a3 --- /dev/null +++ b/ring-0.17.14/tests/quic_tests.rs @@ -0,0 +1,87 @@ +// Copyright 2018 Brian Smith. 
+// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +#![allow(missing_docs)] + +use ring::aead::quic; +#[allow(deprecated)] +use ring::{test, test_file}; + +#[test] +fn quic_aes_128() { + test_quic(&quic::AES_128, test_file!("quic_aes_128_tests.txt")); +} + +#[test] +fn quic_aes_256() { + test_quic(&quic::AES_256, test_file!("quic_aes_256_tests.txt")); +} + +#[test] +fn quic_chacha20() { + test_quic(&quic::CHACHA20, test_file!("quic_chacha20_tests.txt")); +} + +fn test_quic(alg: &'static quic::Algorithm, test_file: test::File) { + test_key_len(alg); + test_sample_len(alg); + + test::run(test_file, |section, test_case| { + assert_eq!(section, ""); + let key_bytes = test_case.consume_bytes("KEY"); + let sample = test_case.consume_bytes("SAMPLE"); + let mask = test_case.consume_bytes("MASK"); + + let key = quic::HeaderProtectionKey::new(alg, &key_bytes)?; + + assert_eq!(mask.as_ref(), key.new_mask(&sample)?); + + Ok(()) + }); +} + +#[allow(clippy::range_plus_one)] +fn test_key_len(alg: &'static quic::Algorithm) { + let key_len = alg.key_len(); + let key_data = vec![0u8; key_len + 1]; + + assert!(quic::HeaderProtectionKey::new(alg, &[]).is_err()); + assert!(quic::HeaderProtectionKey::new(alg, &key_data[..key_len]).is_ok()); + assert!(quic::HeaderProtectionKey::new(alg, &key_data[..(key_len + 1)]).is_err()); + assert!(quic::HeaderProtectionKey::new(alg, &key_data[..(key_len - 1)]).is_err()); +} + +#[allow(clippy::range_plus_one)] +fn test_sample_len(alg: &'static quic::Algorithm) { + let key_len = alg.key_len(); + let key_data = vec![0u8; key_len]; + + let key = quic::HeaderProtectionKey::new(alg, &key_data).unwrap(); + + let sample_len = alg.sample_len(); + assert_eq!(sample_len, 16); // For all currently-implemented algorithms + let sample_data = vec![0u8; sample_len + 2]; + + // Sample is the right size. + assert!(key.new_mask(&sample_data[..sample_len]).is_ok()); + + // Sample is one byte too small. + assert!(key.new_mask(&sample_data[..(sample_len - 1)]).is_err()); + + // Sample is one byte too big. + assert!(key.new_mask(&sample_data[..(sample_len + 1)]).is_err()); + + // Sample is empty. + assert!(key.new_mask(&[]).is_err()); +} diff --git a/ring-0.17.14/tests/rand_tests.rs b/ring-0.17.14/tests/rand_tests.rs new file mode 100644 index 0000000000..91965362d9 --- /dev/null +++ b/ring-0.17.14/tests/rand_tests.rs @@ -0,0 +1,79 @@ +// Copyright 2015-2019 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +#![allow(missing_docs)] + +use ring::rand::{self, SecureRandom as _}; +#[allow(deprecated)] +use ring::test; + +#[cfg(all(target_arch = "wasm32", target_os = "unknown"))] +use wasm_bindgen_test::{wasm_bindgen_test as test, wasm_bindgen_test_configure}; + +#[cfg(all(target_arch = "wasm32", target_os = "unknown"))] +wasm_bindgen_test_configure!(run_in_browser); + +#[test] +fn test_system_random_lengths() { + const LINUX_LIMIT: usize = 256; + const WEB_LIMIT: usize = 65536; + + // Test that `fill` succeeds for various interesting lengths. `256` and + // multiples thereof are interesting because that's an edge case for + // `getrandom` on Linux. + let lengths = [ + 0, + 1, + 2, + 3, + 96, + LINUX_LIMIT - 1, + LINUX_LIMIT, + LINUX_LIMIT + 1, + LINUX_LIMIT * 2, + 511, + 512, + 513, + 4096, + WEB_LIMIT - 1, + WEB_LIMIT, + WEB_LIMIT + 1, + WEB_LIMIT * 2, + ]; + + for len in lengths.iter() { + let mut buf = vec![0; *len]; + + let rng = rand::SystemRandom::new(); + assert!(rng.fill(&mut buf).is_ok()); + + // If `len` < 96 then there's a big chance of false positives, but + // otherwise the likelihood of a false positive is so too low to + // worry about. + if *len >= 96 { + assert!(buf.iter().any(|x| *x != 0)); + } + } +} + +#[test] +fn test_system_random_traits() { + test::compile_time_assert_clone::(); + test::compile_time_assert_send::(); + + assert_eq!( + "SystemRandom(())", + format!("{:?}", rand::SystemRandom::new()) + ); +} diff --git a/ring-0.17.14/tests/rsa_test_private_key_2048.p8 b/ring-0.17.14/tests/rsa_test_private_key_2048.p8 new file mode 100644 index 0000000000..26c480b150 Binary files /dev/null and b/ring-0.17.14/tests/rsa_test_private_key_2048.p8 differ diff --git a/ring-0.17.14/tests/rsa_test_public_key_2048.der b/ring-0.17.14/tests/rsa_test_public_key_2048.der new file mode 100644 index 0000000000..47f18a7f5a Binary files /dev/null and b/ring-0.17.14/tests/rsa_test_public_key_2048.der differ diff --git a/ring-0.17.14/tests/rsa_test_public_key_2048_debug.txt b/ring-0.17.14/tests/rsa_test_public_key_2048_debug.txt new file mode 100644 index 0000000000..659d6ee495 --- /dev/null +++ b/ring-0.17.14/tests/rsa_test_public_key_2048_debug.txt @@ -0,0 +1 @@ +PublicKey("3082010a0282010100c8a78500a5a250db8ed36c85b8dcf83c4be1953114faaac7616e0ea24922fa6b7ab01f85582c815cc3bdeb5ed46762bc536accaa8b72705b00cef316b2ec508fb9697241b9e34238419cccf7339eeb8b062147af4f5932f613d9bc0ae70bf6d56d4432e83e13767587531bfa9dd56531741244be75e8bc9226b9fa44b4b8a101358d7e8bb75d0c724a4f11ece77776263faefe79612eb1d71646e77e8982866be1400eafc3580d3139b41aaa7380187372f22e35bd55b288496165c881ed154d5811245c52d56cc09d4916d4f2a50bcf5ae0a2637f4cfa6bf9daafc113dba8383b6dd7da6dd8db22d8510a8d3115983308909a1a0332517aa55e896e154249b30203010001") \ No newline at end of file diff --git a/ring-0.17.14/tests/rsa_test_public_modulus.bin b/ring-0.17.14/tests/rsa_test_public_modulus.bin new file mode 100644 index 0000000000..1f473ee603 Binary files /dev/null and b/ring-0.17.14/tests/rsa_test_public_modulus.bin differ diff --git a/ring-0.17.14/tests/rsa_tests.rs b/ring-0.17.14/tests/rsa_tests.rs new file mode 100644 index 0000000000..835fc85a0c --- /dev/null +++ b/ring-0.17.14/tests/rsa_tests.rs @@ -0,0 +1,348 
@@ +// Copyright 2017 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +#![allow(missing_docs)] +#![cfg(feature = "alloc")] + +use ring::{ + error, + io::der, + rand, rsa, + signature::{self, KeyPair}, +}; +#[allow(deprecated)] +use ring::{test, test_file}; + +#[cfg(all(target_arch = "wasm32", target_os = "unknown"))] +use wasm_bindgen_test::{wasm_bindgen_test as test, wasm_bindgen_test_configure}; + +#[cfg(all(target_arch = "wasm32", target_os = "unknown"))] +wasm_bindgen_test_configure!(run_in_browser); + +#[test] +fn rsa_from_pkcs8_test() { + test::run( + test_file!("rsa_from_pkcs8_tests.txt"), + |section, test_case| { + assert_eq!(section, ""); + + let input = test_case.consume_bytes("Input"); + let error = test_case.consume_optional_string("Error"); + + match (rsa::KeyPair::from_pkcs8(&input), error) { + (Ok(_), None) => {} + (Err(e), None) => panic!("Failed with error \"{}\", but expected to succeed", e), + (Ok(_), Some(e)) => panic!("Succeeded, but expected error \"{}\"", e), + (Err(actual), Some(expected)) => assert_eq!(format!("{}", actual), expected), + }; + + Ok(()) + }, + ); +} + +#[cfg(feature = "alloc")] +#[test] +fn test_signature_rsa_pkcs1_sign() { + let rng = rand::SystemRandom::new(); + test::run( + test_file!("rsa_pkcs1_sign_tests.txt"), + |section, test_case| { + assert_eq!(section, ""); + + let digest_name = test_case.consume_string("Digest"); + let alg = match digest_name.as_ref() { + "SHA256" => &signature::RSA_PKCS1_SHA256, + "SHA384" => &signature::RSA_PKCS1_SHA384, + "SHA512" => &signature::RSA_PKCS1_SHA512, + _ => panic!("Unsupported digest: {}", digest_name), + }; + + let private_key = test_case.consume_bytes("Key"); + let msg = test_case.consume_bytes("Msg"); + let expected = test_case.consume_bytes("Sig"); + let result = test_case.consume_string("Result"); + + let key_pair = rsa::KeyPair::from_der(&private_key); + if result == "Fail-Invalid-Key" { + assert!(key_pair.is_err()); + return Ok(()); + } + let key_pair = key_pair.unwrap(); + + // XXX: This test is too slow on Android ARM Travis CI builds. + // TODO: re-enable these tests on Android ARM. 
+ let mut actual = vec![0u8; key_pair.public().modulus_len()]; + key_pair + .sign(alg, &rng, &msg, actual.as_mut_slice()) + .unwrap(); + assert_eq!(actual.as_slice() == &expected[..], result == "Pass"); + Ok(()) + }, + ); +} + +#[cfg(feature = "alloc")] +#[test] +fn test_signature_rsa_pss_sign() { + test::run( + test_file!("rsa_pss_sign_tests.txt"), + |section, test_case| { + assert_eq!(section, ""); + + let digest_name = test_case.consume_string("Digest"); + let alg = match digest_name.as_ref() { + "SHA256" => &signature::RSA_PSS_SHA256, + "SHA384" => &signature::RSA_PSS_SHA384, + "SHA512" => &signature::RSA_PSS_SHA512, + _ => panic!("Unsupported digest: {}", digest_name), + }; + + let result = test_case.consume_string("Result"); + let private_key = test_case.consume_bytes("Key"); + let key_pair = rsa::KeyPair::from_der(&private_key); + if key_pair.is_err() && result == "Fail-Invalid-Key" { + return Ok(()); + } + let key_pair = key_pair.unwrap(); + let msg = test_case.consume_bytes("Msg"); + let salt = test_case.consume_bytes("Salt"); + let expected = test_case.consume_bytes("Sig"); + + #[allow(deprecated)] + let rng = test::rand::FixedSliceRandom { bytes: &salt }; + + let mut actual = vec![0u8; key_pair.public().modulus_len()]; + key_pair.sign(alg, &rng, &msg, actual.as_mut_slice())?; + assert_eq!(actual.as_slice() == &expected[..], result == "Pass"); + Ok(()) + }, + ); +} + +// `KeyPair::sign` requires that the output buffer is the same length as +// the public key modulus. Test what happens when it isn't the same length. +#[test] +fn test_signature_rsa_pkcs1_sign_output_buffer_len() { + // Sign the message "hello, world", using PKCS#1 v1.5 padding and the + // SHA256 digest algorithm. + const MESSAGE: &[u8] = b"hello, world"; + let rng = rand::SystemRandom::new(); + + const PRIVATE_KEY_DER: &[u8] = + include_bytes!("../src/rsa/signature_rsa_example_private_key.der"); + let key_pair = rsa::KeyPair::from_der(PRIVATE_KEY_DER).unwrap(); + + // When the output buffer is not exactly the right length, `sign()` returns + // an error (and does not panic or invoke UB). if `sign` doesn't check that + // the length is correct at the beginning then there are various possible + // failure points when the output buffer is too small. 
+ for len in 0..key_pair.public().modulus_len() + 1 { + let mut signature = vec![0; len]; + assert_eq!( + len == key_pair.public().modulus_len(), + key_pair + .sign(&signature::RSA_PKCS1_SHA256, &rng, MESSAGE, &mut signature) + .is_ok() + ); + } +} + +#[cfg(feature = "alloc")] +#[test] +fn test_signature_rsa_pkcs1_verify() { + let sha1_params = &[ + ( + &signature::RSA_PKCS1_1024_8192_SHA1_FOR_LEGACY_USE_ONLY, + 1024, + ), + ( + &signature::RSA_PKCS1_2048_8192_SHA1_FOR_LEGACY_USE_ONLY, + 2048, + ), + ]; + let sha256_params = &[ + ( + &signature::RSA_PKCS1_1024_8192_SHA256_FOR_LEGACY_USE_ONLY, + 1024, + ), + (&signature::RSA_PKCS1_2048_8192_SHA256, 2048), + ]; + let sha384_params = &[ + (&signature::RSA_PKCS1_2048_8192_SHA384, 2048), + (&signature::RSA_PKCS1_3072_8192_SHA384, 3072), + ]; + let sha512_params = &[ + ( + &signature::RSA_PKCS1_1024_8192_SHA512_FOR_LEGACY_USE_ONLY, + 1024, + ), + (&signature::RSA_PKCS1_2048_8192_SHA512, 2048), + ]; + test::run( + test_file!("rsa_pkcs1_verify_tests.txt"), + |section, test_case| { + assert_eq!(section, ""); + + let digest_name = test_case.consume_string("Digest"); + let params: &[_] = match digest_name.as_ref() { + "SHA1" => sha1_params, + "SHA256" => sha256_params, + "SHA384" => sha384_params, + "SHA512" => sha512_params, + _ => panic!("Unsupported digest: {}", digest_name), + }; + + let public_key = test_case.consume_bytes("Key"); + + // Sanity check that we correctly DER-encoded the originally- + // provided separate (n, e) components. When we add test vectors + // for improperly-encoded signatures, we'll have to revisit this. + let key_bits = untrusted::Input::from(&public_key) + .read_all(error::Unspecified, |input| { + der::nested(input, der::Tag::Sequence, error::Unspecified, |input| { + let n_bytes = + der::positive_integer(input)?.big_endian_without_leading_zero(); + let _e = der::positive_integer(input)?; + + // Because `n_bytes` has the leading zeros stripped and is big-endian, there + // must be less than 8 leading zero bits. + let n_leading_zeros = usize::try_from(n_bytes[0].leading_zeros()).unwrap(); + assert!(n_leading_zeros < 8); + Ok((n_bytes.len() * 8) - n_leading_zeros) + }) + }) + .expect("invalid DER"); + + let msg = test_case.consume_bytes("Msg"); + let sig = test_case.consume_bytes("Sig"); + let is_valid = test_case.consume_string("Result") == "P"; + for &(alg, min_bits) in params { + let width_ok = key_bits >= min_bits; + let actual_result = + signature::UnparsedPublicKey::new(alg, &public_key).verify(&msg, &sig); + assert_eq!(actual_result.is_ok(), is_valid && width_ok); + } + + Ok(()) + }, + ); +} + +#[cfg(feature = "alloc")] +#[test] +fn test_signature_rsa_pss_verify() { + test::run( + test_file!("rsa_pss_verify_tests.txt"), + |section, test_case| { + assert_eq!(section, ""); + + let digest_name = test_case.consume_string("Digest"); + let alg = match digest_name.as_ref() { + "SHA256" => &signature::RSA_PSS_2048_8192_SHA256, + "SHA384" => &signature::RSA_PSS_2048_8192_SHA384, + "SHA512" => &signature::RSA_PSS_2048_8192_SHA512, + _ => panic!("Unsupported digest: {}", digest_name), + }; + + let public_key = test_case.consume_bytes("Key"); + + // Sanity check that we correctly DER-encoded the originally- + // provided separate (n, e) components. When we add test vectors + // for improperly-encoded signatures, we'll have to revisit this. 
+ assert!(untrusted::Input::from(&public_key) + .read_all(error::Unspecified, |input| der::nested( + input, + der::Tag::Sequence, + error::Unspecified, + |input| { + let _ = der::positive_integer(input)?; + let _ = der::positive_integer(input)?; + Ok(()) + } + )) + .is_ok()); + + let msg = test_case.consume_bytes("Msg"); + let sig = test_case.consume_bytes("Sig"); + let is_valid = test_case.consume_string("Result") == "P"; + + let actual_result = + signature::UnparsedPublicKey::new(alg, &public_key).verify(&msg, &sig); + assert_eq!(actual_result.is_ok(), is_valid); + + Ok(()) + }, + ); +} + +// Test for `primitive::verify()`. Read public key parts from a file +// and use them to verify a signature. +#[cfg(feature = "alloc")] +#[test] +fn test_signature_rsa_primitive_verification() { + test::run( + test_file!("rsa_primitive_verify_tests.txt"), + |section, test_case| { + assert_eq!(section, ""); + let n = test_case.consume_bytes("n"); + let e = test_case.consume_bytes("e"); + let msg = test_case.consume_bytes("Msg"); + let sig = test_case.consume_bytes("Sig"); + let expected = test_case.consume_string("Result"); + let public_key = signature::RsaPublicKeyComponents { n: &n, e: &e }; + let result = public_key.verify(&signature::RSA_PKCS1_2048_8192_SHA256, &msg, &sig); + assert_eq!(result.is_ok(), expected == "Pass"); + Ok(()) + }, + ) +} + +#[cfg(feature = "alloc")] +#[test] +fn rsa_test_keypair_coverage() { + const PRIVATE_KEY: &[u8] = include_bytes!("rsa_test_private_key_2048.p8"); + + let key_pair = rsa::KeyPair::from_pkcs8(PRIVATE_KEY).unwrap(); + + // Test that `signature::KeyPair::PublicKey` is `rsa::PublicKey`; if it + // were a separate type then it would need to be tested separately. + let _: &rsa::PublicKey = key_pair.public_key(); + + test_public_key_coverage(key_pair.public()); + // Test clones. + test_public_key_coverage(&key_pair.public().clone()); + + // Test `Debug` + assert_eq!( + format!("RsaKeyPair {{ public: {:?} }}", key_pair.public_key()), + format!("{:?}", key_pair) + ); +} + +fn test_public_key_coverage(key: &rsa::PublicKey) { + // Test `AsRef<[u8]>` + const PUBLIC_KEY: &[u8] = include_bytes!("rsa_test_public_key_2048.der"); + assert_eq!(key.as_ref(), PUBLIC_KEY); + + // Test `Debug`. 
+ const PUBLIC_KEY_DEBUG: &str = include_str!("rsa_test_public_key_2048_debug.txt"); + assert_eq!(PUBLIC_KEY_DEBUG, format!("{:?}", key)); + + let components = rsa::PublicKeyComponents::>::from(key); + const PUBLIC_KEY_MODULUS_BE_BYTES: &[u8] = include_bytes!("rsa_test_public_modulus.bin"); + assert_eq!(PUBLIC_KEY_MODULUS_BE_BYTES, &components.n); + const _65537: &[u8] = &[0x01, 0x00, 0x01]; + assert_eq!(_65537, &components.e); +} diff --git a/ring-0.17.14/tests/signature_tests.rs b/ring-0.17.14/tests/signature_tests.rs new file mode 100644 index 0000000000..dd340fab65 --- /dev/null +++ b/ring-0.17.14/tests/signature_tests.rs @@ -0,0 +1,30 @@ +#![allow(missing_docs)] + +use ring::signature; +#[allow(deprecated)] +use ring::test; + +#[cfg(all(target_arch = "wasm32", target_os = "unknown"))] +use wasm_bindgen_test::{wasm_bindgen_test as test, wasm_bindgen_test_configure}; + +#[cfg(all(target_arch = "wasm32", target_os = "unknown"))] +wasm_bindgen_test_configure!(run_in_browser); + +#[test] +fn signature_impl_test() { + test::compile_time_assert_clone::(); + test::compile_time_assert_copy::(); + test::compile_time_assert_send::(); + test::compile_time_assert_sync::(); + + let unparsed_public_key = + signature::UnparsedPublicKey::new(&signature::ED25519, &[0x01, 0x02, 0x03]); + + assert_eq!( + format!("{:?}", unparsed_public_key), + r#"UnparsedPublicKey { algorithm: ring::signature::ED25519, bytes: "010203" }"# + ); + + // Test `AsRef<[u8]>` + assert_eq!(unparsed_public_key.as_ref(), &[0x01, 0x02, 0x03]); +} diff --git a/ring-0.17.14/third_party/fiat/LICENSE b/ring-0.17.14/third_party/fiat/LICENSE new file mode 100644 index 0000000000..3bc4b882eb --- /dev/null +++ b/ring-0.17.14/third_party/fiat/LICENSE @@ -0,0 +1,15 @@ +The Apache License, Version 2.0 (Apache-2.0) + +Copyright 2015-2020 the fiat-crypto authors (see the AUTHORS file) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
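Editorial note on the third_party/fiat files that follow: they implement field arithmetic in GF(2^255 - 19) for curve25519, with hand-written ADX assembly for the 64-bit x86 multiply and square, and fiat-crypto-generated C (curve25519_32.h) for the portable 32-bit path. The constants 0x13 (19) and 0x26 (38) scattered through the generated carry_mul come from the identity 2^255 ≡ 19 (mod 2^255 - 19): partial products whose limb weights reach 2^255 are folded back in multiplied by 19, and 38 = 2*19 absorbs the extra factor of two introduced by the mixed 26/25-bit limb widths. Below is a minimal, variable-time sketch of the same folding idea, using a hypothetical toy modulus p = 2^30 - 19 so it fits in u64; this is an editorial illustration, not the fiat-generated code itself.

    // Multiply a*b mod p for the toy prime p = 2^30 - 19, folding the high
    // part of the product back with the factor 19, since 2^30 == 19 (mod p).
    fn mul_mod_p(a: u64, b: u64) -> u64 {
        const K: u32 = 30;
        const C: u64 = 19;                        // p = 2^K - C
        const P: u64 = (1u64 << K) - C;
        let wide = a as u128 * b as u128;         // full product, at most ~60 bits
        let lo = (wide as u64) & ((1u64 << K) - 1);
        let hi = (wide >> K) as u64;              // the part weighted by 2^K
        let mut r = hi * C + lo;                  // fold: 2^K == C (mod p)
        while r >= P {                            // toy, variable-time reduction;
            r -= P;                               // the real code uses carry chains
        }
        r
    }

    fn main() {
        const P: u64 = (1u64 << 30) - 19;
        let (a, b) = (305_419_896 % P, 1_073_741_823 % P);
        assert_eq!(mul_mod_p(a, b), ((a as u128 * b as u128) % P as u128) as u64);
    }

Unlike this sketch, the assembly and the generated C below avoid data-dependent branches and work on multi-limb representations: four 64-bit limbs on the ADX path, and ten limbs in radix 2^25.5 on the 32-bit path.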
diff --git a/ring-0.17.14/third_party/fiat/asm/fiat_curve25519_adx_mul.S b/ring-0.17.14/third_party/fiat/asm/fiat_curve25519_adx_mul.S new file mode 100644 index 0000000000..7e7be03036 --- /dev/null +++ b/ring-0.17.14/third_party/fiat/asm/fiat_curve25519_adx_mul.S @@ -0,0 +1,178 @@ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && \ + (defined(__APPLE__) || defined(__ELF__)) + +.intel_syntax noprefix +.text +#if defined(__APPLE__) +.private_extern _fiat_curve25519_adx_mul +.global _fiat_curve25519_adx_mul +_fiat_curve25519_adx_mul: +#else +.type fiat_curve25519_adx_mul, @function +.hidden fiat_curve25519_adx_mul +.global fiat_curve25519_adx_mul +fiat_curve25519_adx_mul: +#endif + +.cfi_startproc +_CET_ENDBR +push rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset rbp, -16 +mov rbp, rsp + +mov rax, rdx +mov rdx, [ rsi + 0x18 ] +mulx r11, r10, [ rax + 0x8 ] +mov rdx, [ rax + 0x0 ] +mov [ rsp - 0x58 ], r15 +.cfi_offset r15, -16-0x58 +mulx r8, rcx, [ rsi + 0x18 ] +mov rdx, [ rsi + 0x8 ] +mov [ rsp - 0x80 ], rbx +.cfi_offset rbx, -16-0x80 +mulx rbx, r9, [ rax + 0x18 ] +mov rdx, [ rsi + 0x8 ] +mov [ rsp - 0x70 ], r12 +.cfi_offset r12, -16-0x70 +mulx r15, r12, [ rax + 0x8 ] +mov rdx, [ rsi + 0x0 ] +mov [ rsp - 0x68 ], r13 +.cfi_offset r13, -16-0x68 +mov [ rsp - 0x60 ], r14 +.cfi_offset r14, -16-0x60 +mulx r14, r13, [ rax + 0x0 ] +mov rdx, [ rax + 0x10 ] +mov [ rsp - 0x18 ], r15 +mov [ rsp - 0x50 ], rdi +mulx rdi, r15, [ rsi + 0x0 ] +mov rdx, [ rax + 0x18 ] +mov [ rsp - 0x48 ], r13 +mov [ rsp - 0x40 ], r9 +mulx r9, r13, [ rsi + 0x0 ] +test al, al +adox rcx, rdi +mov rdx, [ rsi + 0x10 ] +mov [ rsp - 0x38 ], r13 +mulx r13, rdi, [ rax + 0x8 ] +adox r10, r9 +mov rdx, 0x0 +adox rbx, rdx +adcx rdi, rcx +adcx r8, r10 +mov r9, rdx +adcx r9, rbx +mov rdx, [ rsi + 0x10 ] +mulx r10, rcx, [ rax + 0x0 ] +mov rdx, [ rsi + 0x0 ] +mov [ rsp - 0x30 ], r15 +mulx r15, rbx, [ rax + 0x8 ] +mov rdx, -0x2 +inc rdx +adox rcx, r15 +setc r15b +clc +adcx rcx, r12 +adox r10, rdi +mov rdx, [ rax + 0x10 ] +mov [ rsp - 0x78 ], rcx +mulx rcx, rdi, [ rsi + 0x10 ] +adox rdi, r8 +mov rdx, [ rax + 0x18 ] +mov [ rsp - 0x28 ], rcx +mulx rcx, r8, [ rsi + 0x10 ] +mov rdx, [ rax + 0x10 ] +mov [ rsp - 0x20 ], r8 +mulx r12, r8, [ rsi + 0x18 ] +adox r8, r9 +mov rdx, [ rsi + 0x8 ] +mov [ rsp - 0x10 ], r12 +mulx r12, r9, [ rax + 0x10 ] +movzx rdx, r15b +lea rdx, [ rdx + rcx ] +adcx r9, r10 +adcx r13, rdi +mov r15, 0x0 +mov r10, r15 +adox r10, rdx +mov rdx, [ rax + 0x18 ] +mulx rcx, rdi, [ rsi + 0x18 ] +adox rcx, r15 +adcx r11, r8 +mov rdx, r15 +adcx rdx, r10 +adcx rcx, r15 +mov r8, rdx +mov rdx, [ rax + 0x0 ] +mulx r15, r10, [ rsi + 0x8 ] +test al, al +adox r10, r14 +adcx rbx, r10 +adox r15, [ rsp - 0x78 ] +adcx r15, [ rsp - 0x30 ] +adox r9, [ rsp - 0x18 ] +adcx r9, [ rsp - 0x38 ] +adox r13, [ rsp - 0x40 ] +adcx r12, r13 +adox r11, [ rsp - 0x20 ] +adcx r11, [ rsp - 0x28 ] +mov rdx, 0x26 +mulx rsi, r14, r12 +adox rdi, r8 +adcx rdi, [ rsp - 0x10 ] +mulx r10, r8, r11 +mov r13, 0x0 +adox rcx, r13 +adcx rcx, r13 +mulx r11, r12, rdi +xor rdi, rdi +adox r8, rbx +adox r12, r15 +mulx rbx, r13, rcx +adcx r14, [ rsp - 0x48 ] +adox r13, r9 +adox rbx, rdi +adcx rsi, r8 +adcx r10, r12 +adcx r11, r13 +adc rbx, 0x0 +mulx r9, r15, rbx +xor r9, r9 +adox r15, r14 +mov rdi, r9 +adox rdi, rsi +mov rcx, r9 +adox rcx, r10 +mov r8, [ rsp - 0x50 ] +mov [ r8 + 0x8 ], rdi +mov r12, r9 +adox r12, r11 +mov r14, r9 +cmovo r14, rdx +mov [ r8 + 0x18 ], r12 +adcx r15, r14 +mov [ r8 + 0x0 ], r15 +mov [ r8 + 0x10 ], rcx +mov rbx, [ rsp - 0x80 ] +.cfi_restore rbx 
+mov r12, [ rsp - 0x70 ] +.cfi_restore r12 +mov r13, [ rsp - 0x68 ] +.cfi_restore r13 +mov r14, [ rsp - 0x60 ] +.cfi_restore r14 +mov r15, [ rsp - 0x58 ] +.cfi_restore r15 + +pop rbp +.cfi_restore rbp +.cfi_adjust_cfa_offset -8 +ret +.cfi_endproc +#if defined(__ELF__) +.size fiat_curve25519_adx_mul, .-fiat_curve25519_adx_mul +#endif + +#endif diff --git a/ring-0.17.14/third_party/fiat/asm/fiat_curve25519_adx_square.S b/ring-0.17.14/third_party/fiat/asm/fiat_curve25519_adx_square.S new file mode 100644 index 0000000000..ccf7b76e58 --- /dev/null +++ b/ring-0.17.14/third_party/fiat/asm/fiat_curve25519_adx_square.S @@ -0,0 +1,146 @@ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && \ + (defined(__APPLE__) || defined(__ELF__)) + +.intel_syntax noprefix +.text +#if defined(__APPLE__) +.private_extern _fiat_curve25519_adx_square +.global _fiat_curve25519_adx_square +_fiat_curve25519_adx_square: +#else +.type fiat_curve25519_adx_square, @function +.hidden fiat_curve25519_adx_square +.global fiat_curve25519_adx_square +fiat_curve25519_adx_square: +#endif + +.cfi_startproc +_CET_ENDBR +push rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset rbp, -16 +mov rbp, rsp + +mov rdx, [ rsi + 0x0 ] +mulx r10, rax, [ rsi + 0x8 ] +mov rdx, [ rsi + 0x0 ] +mulx rcx, r11, [ rsi + 0x10 ] +xor rdx, rdx +adox r11, r10 +mov rdx, [ rsi + 0x0 ] +mulx r9, r8, [ rsi + 0x18 ] +mov rdx, [ rsi + 0x8 ] +mov [ rsp - 0x80 ], rbx +.cfi_offset rbx, -16-0x80 +mulx rbx, r10, [ rsi + 0x18 ] +adox r8, rcx +mov [rsp - 0x48 ], rdi +adox r10, r9 +adcx rax, rax +mov rdx, [ rsi + 0x10 ] +mulx r9, rcx, [ rsi + 0x18 ] +adox rcx, rbx +mov rdx, [ rsi + 0x10 ] +mulx rdi, rbx, [ rsi + 0x8 ] +mov rdx, 0x0 +adox r9, rdx +mov [ rsp - 0x70 ], r12 +.cfi_offset r12, -16-0x70 +mov r12, -0x3 +inc r12 +adox rbx, r8 +adox rdi, r10 +adcx r11, r11 +mov r8, rdx +adox r8, rcx +mov r10, rdx +adox r10, r9 +adcx rbx, rbx +mov rdx, [ rsi + 0x0 ] +mulx r9, rcx, rdx +mov rdx, [ rsi + 0x8 ] +mov [ rsp - 0x68 ], r13 +.cfi_offset r13, -16-0x68 +mov [ rsp - 0x60 ], r14 +.cfi_offset r14, -16-0x60 +mulx r14, r13, rdx +seto dl +inc r12 +adox r9, rax +adox r13, r11 +adox r14, rbx +adcx rdi, rdi +mov al, dl +mov rdx, [ rsi + 0x10 ] +mulx rbx, r11, rdx +adox r11, rdi +adcx r8, r8 +adox rbx, r8 +adcx r10, r10 +movzx rdx, al +mov rdi, 0x0 +adcx rdx, rdi +movzx r8, al +lea r8, [ r8 + rdx ] +mov rdx, [ rsi + 0x18 ] +mulx rdi, rax, rdx +adox rax, r10 +mov rdx, 0x26 +mov [ rsp - 0x58 ], r15 +.cfi_offset r15, -16-0x58 +mulx r15, r10, r11 +clc +adcx r10, rcx +mulx r11, rcx, rbx +adox r8, rdi +mulx rdi, rbx, r8 +inc r12 +adox rcx, r9 +mulx r8, r9, rax +adcx r15, rcx +adox r9, r13 +adcx r11, r9 +adox rbx, r14 +adox rdi, r12 +adcx r8, rbx +adc rdi, 0x0 +mulx r14, r13, rdi +test al, al +mov rdi, [ rsp - 0x48 ] +adox r13, r10 +mov r14, r12 +adox r14, r15 +mov [ rdi + 0x8 ], r14 +mov rax, r12 +adox rax, r11 +mov r10, r12 +adox r10, r8 +mov [ rdi + 0x10 ], rax +mov rcx, r12 +cmovo rcx, rdx +adcx r13, rcx +mov [ rdi + 0x0 ], r13 +mov [ rdi + 0x18 ], r10 +mov rbx, [ rsp - 0x80 ] +.cfi_restore rbx +mov r12, [ rsp - 0x70 ] +.cfi_restore r12 +mov r13, [ rsp - 0x68 ] +.cfi_restore r13 +mov r14, [ rsp - 0x60 ] +.cfi_restore r14 +mov r15, [ rsp - 0x58 ] +.cfi_restore r15 + +pop rbp +.cfi_restore rbp +.cfi_adjust_cfa_offset -8 +ret +.cfi_endproc +#if defined(__ELF__) +.size fiat_curve25519_adx_square, .-fiat_curve25519_adx_square +#endif + +#endif diff --git a/ring-0.17.14/third_party/fiat/curve25519_32.h b/ring-0.17.14/third_party/fiat/curve25519_32.h new file mode 100644 index 
0000000000..56417ce635 --- /dev/null +++ b/ring-0.17.14/third_party/fiat/curve25519_32.h @@ -0,0 +1,1479 @@ +/* Autogenerated: 'src/ExtractionOCaml/unsaturated_solinas' --inline --static --use-value-barrier 25519 32 '(auto)' '2^255 - 19' carry_mul carry_square carry add sub opp selectznz to_bytes from_bytes relax carry_scmul121666 */ +/* curve description: 25519 */ +/* machine_wordsize = 32 (from "32") */ +/* requested operations: carry_mul, carry_square, carry, add, sub, opp, selectznz, to_bytes, from_bytes, relax, carry_scmul121666 */ +/* n = 10 (from "(auto)") */ +/* s-c = 2^255 - [(1, 19)] (from "2^255 - 19") */ +/* tight_bounds_multiplier = 1 (from "") */ +/* */ +/* Computed values: */ +/* carry_chain = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1] */ +/* eval z = z[0] + (z[1] << 26) + (z[2] << 51) + (z[3] << 77) + (z[4] << 102) + (z[5] << 128) + (z[6] << 153) + (z[7] << 179) + (z[8] << 204) + (z[9] << 230) */ +/* bytes_eval z = z[0] + (z[1] << 8) + (z[2] << 16) + (z[3] << 24) + (z[4] << 32) + (z[5] << 40) + (z[6] << 48) + (z[7] << 56) + (z[8] << 64) + (z[9] << 72) + (z[10] << 80) + (z[11] << 88) + (z[12] << 96) + (z[13] << 104) + (z[14] << 112) + (z[15] << 120) + (z[16] << 128) + (z[17] << 136) + (z[18] << 144) + (z[19] << 152) + (z[20] << 160) + (z[21] << 168) + (z[22] << 176) + (z[23] << 184) + (z[24] << 192) + (z[25] << 200) + (z[26] << 208) + (z[27] << 216) + (z[28] << 224) + (z[29] << 232) + (z[30] << 240) + (z[31] << 248) */ +/* balance = [0x7ffffda, 0x3fffffe, 0x7fffffe, 0x3fffffe, 0x7fffffe, 0x3fffffe, 0x7fffffe, 0x3fffffe, 0x7fffffe, 0x3fffffe] */ + +#include +typedef unsigned char fiat_25519_uint1; +typedef signed char fiat_25519_int1; +#if defined(__GNUC__) || defined(__clang__) +# define FIAT_25519_FIAT_INLINE __inline__ +#else +# define FIAT_25519_FIAT_INLINE +#endif + +/* The type fiat_25519_loose_field_element is a field element with loose bounds. */ +/* Bounds: [[0x0 ~> 0xc000000], [0x0 ~> 0x6000000], [0x0 ~> 0xc000000], [0x0 ~> 0x6000000], [0x0 ~> 0xc000000], [0x0 ~> 0x6000000], [0x0 ~> 0xc000000], [0x0 ~> 0x6000000], [0x0 ~> 0xc000000], [0x0 ~> 0x6000000]] */ +typedef uint32_t fiat_25519_loose_field_element[10]; + +/* The type fiat_25519_tight_field_element is a field element with tight bounds. */ +/* Bounds: [[0x0 ~> 0x4000000], [0x0 ~> 0x2000000], [0x0 ~> 0x4000000], [0x0 ~> 0x2000000], [0x0 ~> 0x4000000], [0x0 ~> 0x2000000], [0x0 ~> 0x4000000], [0x0 ~> 0x2000000], [0x0 ~> 0x4000000], [0x0 ~> 0x2000000]] */ +typedef uint32_t fiat_25519_tight_field_element[10]; + +#if (-1 & 3) != 3 +#error "This code only works on a two's complement system" +#endif + +#if !defined(FIAT_25519_NO_ASM) && (defined(__GNUC__) || defined(__clang__)) +static __inline__ uint32_t fiat_25519_value_barrier_u32(uint32_t a) { + __asm__("" : "+r"(a) : /* no inputs */); + return a; +} +#else +# define fiat_25519_value_barrier_u32(x) (x) +#endif + + +/* + * The function fiat_25519_addcarryx_u26 is an addition with carry. 
+ * + * Postconditions: + * out1 = (arg1 + arg2 + arg3) mod 2^26 + * out2 = ⌊(arg1 + arg2 + arg3) / 2^26⌋ + * + * Input Bounds: + * arg1: [0x0 ~> 0x1] + * arg2: [0x0 ~> 0x3ffffff] + * arg3: [0x0 ~> 0x3ffffff] + * Output Bounds: + * out1: [0x0 ~> 0x3ffffff] + * out2: [0x0 ~> 0x1] + */ +static FIAT_25519_FIAT_INLINE void fiat_25519_addcarryx_u26(uint32_t* out1, fiat_25519_uint1* out2, fiat_25519_uint1 arg1, uint32_t arg2, uint32_t arg3) { + uint32_t x1; + uint32_t x2; + fiat_25519_uint1 x3; + x1 = ((arg1 + arg2) + arg3); + x2 = (x1 & UINT32_C(0x3ffffff)); + x3 = (fiat_25519_uint1)(x1 >> 26); + *out1 = x2; + *out2 = x3; +} + +/* + * The function fiat_25519_subborrowx_u26 is a subtraction with borrow. + * + * Postconditions: + * out1 = (-arg1 + arg2 + -arg3) mod 2^26 + * out2 = -⌊(-arg1 + arg2 + -arg3) / 2^26⌋ + * + * Input Bounds: + * arg1: [0x0 ~> 0x1] + * arg2: [0x0 ~> 0x3ffffff] + * arg3: [0x0 ~> 0x3ffffff] + * Output Bounds: + * out1: [0x0 ~> 0x3ffffff] + * out2: [0x0 ~> 0x1] + */ +static FIAT_25519_FIAT_INLINE void fiat_25519_subborrowx_u26(uint32_t* out1, fiat_25519_uint1* out2, fiat_25519_uint1 arg1, uint32_t arg2, uint32_t arg3) { + int32_t x1; + fiat_25519_int1 x2; + uint32_t x3; + x1 = ((int32_t)(arg2 - arg1) - (int32_t)arg3); + x2 = (fiat_25519_int1)(x1 >> 26); + x3 = (x1 & UINT32_C(0x3ffffff)); + *out1 = x3; + *out2 = (fiat_25519_uint1)(0x0 - x2); +} + +/* + * The function fiat_25519_addcarryx_u25 is an addition with carry. + * + * Postconditions: + * out1 = (arg1 + arg2 + arg3) mod 2^25 + * out2 = ⌊(arg1 + arg2 + arg3) / 2^25⌋ + * + * Input Bounds: + * arg1: [0x0 ~> 0x1] + * arg2: [0x0 ~> 0x1ffffff] + * arg3: [0x0 ~> 0x1ffffff] + * Output Bounds: + * out1: [0x0 ~> 0x1ffffff] + * out2: [0x0 ~> 0x1] + */ +static FIAT_25519_FIAT_INLINE void fiat_25519_addcarryx_u25(uint32_t* out1, fiat_25519_uint1* out2, fiat_25519_uint1 arg1, uint32_t arg2, uint32_t arg3) { + uint32_t x1; + uint32_t x2; + fiat_25519_uint1 x3; + x1 = ((arg1 + arg2) + arg3); + x2 = (x1 & UINT32_C(0x1ffffff)); + x3 = (fiat_25519_uint1)(x1 >> 25); + *out1 = x2; + *out2 = x3; +} + +/* + * The function fiat_25519_subborrowx_u25 is a subtraction with borrow. + * + * Postconditions: + * out1 = (-arg1 + arg2 + -arg3) mod 2^25 + * out2 = -⌊(-arg1 + arg2 + -arg3) / 2^25⌋ + * + * Input Bounds: + * arg1: [0x0 ~> 0x1] + * arg2: [0x0 ~> 0x1ffffff] + * arg3: [0x0 ~> 0x1ffffff] + * Output Bounds: + * out1: [0x0 ~> 0x1ffffff] + * out2: [0x0 ~> 0x1] + */ +static FIAT_25519_FIAT_INLINE void fiat_25519_subborrowx_u25(uint32_t* out1, fiat_25519_uint1* out2, fiat_25519_uint1 arg1, uint32_t arg2, uint32_t arg3) { + int32_t x1; + fiat_25519_int1 x2; + uint32_t x3; + x1 = ((int32_t)(arg2 - arg1) - (int32_t)arg3); + x2 = (fiat_25519_int1)(x1 >> 25); + x3 = (x1 & UINT32_C(0x1ffffff)); + *out1 = x3; + *out2 = (fiat_25519_uint1)(0x0 - x2); +} + +/* + * The function fiat_25519_cmovznz_u32 is a single-word conditional move. 
+ * + * Postconditions: + * out1 = (if arg1 = 0 then arg2 else arg3) + * + * Input Bounds: + * arg1: [0x0 ~> 0x1] + * arg2: [0x0 ~> 0xffffffff] + * arg3: [0x0 ~> 0xffffffff] + * Output Bounds: + * out1: [0x0 ~> 0xffffffff] + */ +static FIAT_25519_FIAT_INLINE void fiat_25519_cmovznz_u32(uint32_t* out1, fiat_25519_uint1 arg1, uint32_t arg2, uint32_t arg3) { + fiat_25519_uint1 x1; + uint32_t x2; + uint32_t x3; + x1 = (!(!arg1)); + x2 = ((fiat_25519_int1)(0x0 - x1) & UINT32_C(0xffffffff)); + x3 = ((fiat_25519_value_barrier_u32(x2) & arg3) | (fiat_25519_value_barrier_u32((~x2)) & arg2)); + *out1 = x3; +} + +/* + * The function fiat_25519_carry_mul multiplies two field elements and reduces the result. + * + * Postconditions: + * eval out1 mod m = (eval arg1 * eval arg2) mod m + * + */ +static FIAT_25519_FIAT_INLINE void fiat_25519_carry_mul(fiat_25519_tight_field_element out1, const fiat_25519_loose_field_element arg1, const fiat_25519_loose_field_element arg2) { + uint64_t x1; + uint64_t x2; + uint64_t x3; + uint64_t x4; + uint64_t x5; + uint64_t x6; + uint64_t x7; + uint64_t x8; + uint64_t x9; + uint64_t x10; + uint64_t x11; + uint64_t x12; + uint64_t x13; + uint64_t x14; + uint64_t x15; + uint64_t x16; + uint64_t x17; + uint64_t x18; + uint64_t x19; + uint64_t x20; + uint64_t x21; + uint64_t x22; + uint64_t x23; + uint64_t x24; + uint64_t x25; + uint64_t x26; + uint64_t x27; + uint64_t x28; + uint64_t x29; + uint64_t x30; + uint64_t x31; + uint64_t x32; + uint64_t x33; + uint64_t x34; + uint64_t x35; + uint64_t x36; + uint64_t x37; + uint64_t x38; + uint64_t x39; + uint64_t x40; + uint64_t x41; + uint64_t x42; + uint64_t x43; + uint64_t x44; + uint64_t x45; + uint64_t x46; + uint64_t x47; + uint64_t x48; + uint64_t x49; + uint64_t x50; + uint64_t x51; + uint64_t x52; + uint64_t x53; + uint64_t x54; + uint64_t x55; + uint64_t x56; + uint64_t x57; + uint64_t x58; + uint64_t x59; + uint64_t x60; + uint64_t x61; + uint64_t x62; + uint64_t x63; + uint64_t x64; + uint64_t x65; + uint64_t x66; + uint64_t x67; + uint64_t x68; + uint64_t x69; + uint64_t x70; + uint64_t x71; + uint64_t x72; + uint64_t x73; + uint64_t x74; + uint64_t x75; + uint64_t x76; + uint64_t x77; + uint64_t x78; + uint64_t x79; + uint64_t x80; + uint64_t x81; + uint64_t x82; + uint64_t x83; + uint64_t x84; + uint64_t x85; + uint64_t x86; + uint64_t x87; + uint64_t x88; + uint64_t x89; + uint64_t x90; + uint64_t x91; + uint64_t x92; + uint64_t x93; + uint64_t x94; + uint64_t x95; + uint64_t x96; + uint64_t x97; + uint64_t x98; + uint64_t x99; + uint64_t x100; + uint64_t x101; + uint64_t x102; + uint32_t x103; + uint64_t x104; + uint64_t x105; + uint64_t x106; + uint64_t x107; + uint64_t x108; + uint64_t x109; + uint64_t x110; + uint64_t x111; + uint64_t x112; + uint64_t x113; + uint64_t x114; + uint32_t x115; + uint64_t x116; + uint64_t x117; + uint32_t x118; + uint64_t x119; + uint64_t x120; + uint32_t x121; + uint64_t x122; + uint64_t x123; + uint32_t x124; + uint64_t x125; + uint64_t x126; + uint32_t x127; + uint64_t x128; + uint64_t x129; + uint32_t x130; + uint64_t x131; + uint64_t x132; + uint32_t x133; + uint64_t x134; + uint64_t x135; + uint32_t x136; + uint64_t x137; + uint64_t x138; + uint32_t x139; + uint64_t x140; + uint64_t x141; + uint32_t x142; + uint32_t x143; + uint32_t x144; + fiat_25519_uint1 x145; + uint32_t x146; + uint32_t x147; + x1 = ((uint64_t)(arg1[9]) * ((arg2[9]) * UINT8_C(0x26))); + x2 = ((uint64_t)(arg1[9]) * ((arg2[8]) * UINT8_C(0x13))); + x3 = ((uint64_t)(arg1[9]) * ((arg2[7]) * UINT8_C(0x26))); + 
x4 = ((uint64_t)(arg1[9]) * ((arg2[6]) * UINT8_C(0x13))); + x5 = ((uint64_t)(arg1[9]) * ((arg2[5]) * UINT8_C(0x26))); + x6 = ((uint64_t)(arg1[9]) * ((arg2[4]) * UINT8_C(0x13))); + x7 = ((uint64_t)(arg1[9]) * ((arg2[3]) * UINT8_C(0x26))); + x8 = ((uint64_t)(arg1[9]) * ((arg2[2]) * UINT8_C(0x13))); + x9 = ((uint64_t)(arg1[9]) * ((arg2[1]) * UINT8_C(0x26))); + x10 = ((uint64_t)(arg1[8]) * ((arg2[9]) * UINT8_C(0x13))); + x11 = ((uint64_t)(arg1[8]) * ((arg2[8]) * UINT8_C(0x13))); + x12 = ((uint64_t)(arg1[8]) * ((arg2[7]) * UINT8_C(0x13))); + x13 = ((uint64_t)(arg1[8]) * ((arg2[6]) * UINT8_C(0x13))); + x14 = ((uint64_t)(arg1[8]) * ((arg2[5]) * UINT8_C(0x13))); + x15 = ((uint64_t)(arg1[8]) * ((arg2[4]) * UINT8_C(0x13))); + x16 = ((uint64_t)(arg1[8]) * ((arg2[3]) * UINT8_C(0x13))); + x17 = ((uint64_t)(arg1[8]) * ((arg2[2]) * UINT8_C(0x13))); + x18 = ((uint64_t)(arg1[7]) * ((arg2[9]) * UINT8_C(0x26))); + x19 = ((uint64_t)(arg1[7]) * ((arg2[8]) * UINT8_C(0x13))); + x20 = ((uint64_t)(arg1[7]) * ((arg2[7]) * UINT8_C(0x26))); + x21 = ((uint64_t)(arg1[7]) * ((arg2[6]) * UINT8_C(0x13))); + x22 = ((uint64_t)(arg1[7]) * ((arg2[5]) * UINT8_C(0x26))); + x23 = ((uint64_t)(arg1[7]) * ((arg2[4]) * UINT8_C(0x13))); + x24 = ((uint64_t)(arg1[7]) * ((arg2[3]) * UINT8_C(0x26))); + x25 = ((uint64_t)(arg1[6]) * ((arg2[9]) * UINT8_C(0x13))); + x26 = ((uint64_t)(arg1[6]) * ((arg2[8]) * UINT8_C(0x13))); + x27 = ((uint64_t)(arg1[6]) * ((arg2[7]) * UINT8_C(0x13))); + x28 = ((uint64_t)(arg1[6]) * ((arg2[6]) * UINT8_C(0x13))); + x29 = ((uint64_t)(arg1[6]) * ((arg2[5]) * UINT8_C(0x13))); + x30 = ((uint64_t)(arg1[6]) * ((arg2[4]) * UINT8_C(0x13))); + x31 = ((uint64_t)(arg1[5]) * ((arg2[9]) * UINT8_C(0x26))); + x32 = ((uint64_t)(arg1[5]) * ((arg2[8]) * UINT8_C(0x13))); + x33 = ((uint64_t)(arg1[5]) * ((arg2[7]) * UINT8_C(0x26))); + x34 = ((uint64_t)(arg1[5]) * ((arg2[6]) * UINT8_C(0x13))); + x35 = ((uint64_t)(arg1[5]) * ((arg2[5]) * UINT8_C(0x26))); + x36 = ((uint64_t)(arg1[4]) * ((arg2[9]) * UINT8_C(0x13))); + x37 = ((uint64_t)(arg1[4]) * ((arg2[8]) * UINT8_C(0x13))); + x38 = ((uint64_t)(arg1[4]) * ((arg2[7]) * UINT8_C(0x13))); + x39 = ((uint64_t)(arg1[4]) * ((arg2[6]) * UINT8_C(0x13))); + x40 = ((uint64_t)(arg1[3]) * ((arg2[9]) * UINT8_C(0x26))); + x41 = ((uint64_t)(arg1[3]) * ((arg2[8]) * UINT8_C(0x13))); + x42 = ((uint64_t)(arg1[3]) * ((arg2[7]) * UINT8_C(0x26))); + x43 = ((uint64_t)(arg1[2]) * ((arg2[9]) * UINT8_C(0x13))); + x44 = ((uint64_t)(arg1[2]) * ((arg2[8]) * UINT8_C(0x13))); + x45 = ((uint64_t)(arg1[1]) * ((arg2[9]) * UINT8_C(0x26))); + x46 = ((uint64_t)(arg1[9]) * (arg2[0])); + x47 = ((uint64_t)(arg1[8]) * (arg2[1])); + x48 = ((uint64_t)(arg1[8]) * (arg2[0])); + x49 = ((uint64_t)(arg1[7]) * (arg2[2])); + x50 = ((uint64_t)(arg1[7]) * ((arg2[1]) * 0x2)); + x51 = ((uint64_t)(arg1[7]) * (arg2[0])); + x52 = ((uint64_t)(arg1[6]) * (arg2[3])); + x53 = ((uint64_t)(arg1[6]) * (arg2[2])); + x54 = ((uint64_t)(arg1[6]) * (arg2[1])); + x55 = ((uint64_t)(arg1[6]) * (arg2[0])); + x56 = ((uint64_t)(arg1[5]) * (arg2[4])); + x57 = ((uint64_t)(arg1[5]) * ((arg2[3]) * 0x2)); + x58 = ((uint64_t)(arg1[5]) * (arg2[2])); + x59 = ((uint64_t)(arg1[5]) * ((arg2[1]) * 0x2)); + x60 = ((uint64_t)(arg1[5]) * (arg2[0])); + x61 = ((uint64_t)(arg1[4]) * (arg2[5])); + x62 = ((uint64_t)(arg1[4]) * (arg2[4])); + x63 = ((uint64_t)(arg1[4]) * (arg2[3])); + x64 = ((uint64_t)(arg1[4]) * (arg2[2])); + x65 = ((uint64_t)(arg1[4]) * (arg2[1])); + x66 = ((uint64_t)(arg1[4]) * (arg2[0])); + x67 = ((uint64_t)(arg1[3]) * (arg2[6])); + x68 = ((uint64_t)(arg1[3]) * 
((arg2[5]) * 0x2)); + x69 = ((uint64_t)(arg1[3]) * (arg2[4])); + x70 = ((uint64_t)(arg1[3]) * ((arg2[3]) * 0x2)); + x71 = ((uint64_t)(arg1[3]) * (arg2[2])); + x72 = ((uint64_t)(arg1[3]) * ((arg2[1]) * 0x2)); + x73 = ((uint64_t)(arg1[3]) * (arg2[0])); + x74 = ((uint64_t)(arg1[2]) * (arg2[7])); + x75 = ((uint64_t)(arg1[2]) * (arg2[6])); + x76 = ((uint64_t)(arg1[2]) * (arg2[5])); + x77 = ((uint64_t)(arg1[2]) * (arg2[4])); + x78 = ((uint64_t)(arg1[2]) * (arg2[3])); + x79 = ((uint64_t)(arg1[2]) * (arg2[2])); + x80 = ((uint64_t)(arg1[2]) * (arg2[1])); + x81 = ((uint64_t)(arg1[2]) * (arg2[0])); + x82 = ((uint64_t)(arg1[1]) * (arg2[8])); + x83 = ((uint64_t)(arg1[1]) * ((arg2[7]) * 0x2)); + x84 = ((uint64_t)(arg1[1]) * (arg2[6])); + x85 = ((uint64_t)(arg1[1]) * ((arg2[5]) * 0x2)); + x86 = ((uint64_t)(arg1[1]) * (arg2[4])); + x87 = ((uint64_t)(arg1[1]) * ((arg2[3]) * 0x2)); + x88 = ((uint64_t)(arg1[1]) * (arg2[2])); + x89 = ((uint64_t)(arg1[1]) * ((arg2[1]) * 0x2)); + x90 = ((uint64_t)(arg1[1]) * (arg2[0])); + x91 = ((uint64_t)(arg1[0]) * (arg2[9])); + x92 = ((uint64_t)(arg1[0]) * (arg2[8])); + x93 = ((uint64_t)(arg1[0]) * (arg2[7])); + x94 = ((uint64_t)(arg1[0]) * (arg2[6])); + x95 = ((uint64_t)(arg1[0]) * (arg2[5])); + x96 = ((uint64_t)(arg1[0]) * (arg2[4])); + x97 = ((uint64_t)(arg1[0]) * (arg2[3])); + x98 = ((uint64_t)(arg1[0]) * (arg2[2])); + x99 = ((uint64_t)(arg1[0]) * (arg2[1])); + x100 = ((uint64_t)(arg1[0]) * (arg2[0])); + x101 = (x100 + (x45 + (x44 + (x42 + (x39 + (x35 + (x30 + (x24 + (x17 + x9))))))))); + x102 = (x101 >> 26); + x103 = (uint32_t)(x101 & UINT32_C(0x3ffffff)); + x104 = (x91 + (x82 + (x74 + (x67 + (x61 + (x56 + (x52 + (x49 + (x47 + x46))))))))); + x105 = (x92 + (x83 + (x75 + (x68 + (x62 + (x57 + (x53 + (x50 + (x48 + x1))))))))); + x106 = (x93 + (x84 + (x76 + (x69 + (x63 + (x58 + (x54 + (x51 + (x10 + x2))))))))); + x107 = (x94 + (x85 + (x77 + (x70 + (x64 + (x59 + (x55 + (x18 + (x11 + x3))))))))); + x108 = (x95 + (x86 + (x78 + (x71 + (x65 + (x60 + (x25 + (x19 + (x12 + x4))))))))); + x109 = (x96 + (x87 + (x79 + (x72 + (x66 + (x31 + (x26 + (x20 + (x13 + x5))))))))); + x110 = (x97 + (x88 + (x80 + (x73 + (x36 + (x32 + (x27 + (x21 + (x14 + x6))))))))); + x111 = (x98 + (x89 + (x81 + (x40 + (x37 + (x33 + (x28 + (x22 + (x15 + x7))))))))); + x112 = (x99 + (x90 + (x43 + (x41 + (x38 + (x34 + (x29 + (x23 + (x16 + x8))))))))); + x113 = (x102 + x112); + x114 = (x113 >> 25); + x115 = (uint32_t)(x113 & UINT32_C(0x1ffffff)); + x116 = (x114 + x111); + x117 = (x116 >> 26); + x118 = (uint32_t)(x116 & UINT32_C(0x3ffffff)); + x119 = (x117 + x110); + x120 = (x119 >> 25); + x121 = (uint32_t)(x119 & UINT32_C(0x1ffffff)); + x122 = (x120 + x109); + x123 = (x122 >> 26); + x124 = (uint32_t)(x122 & UINT32_C(0x3ffffff)); + x125 = (x123 + x108); + x126 = (x125 >> 25); + x127 = (uint32_t)(x125 & UINT32_C(0x1ffffff)); + x128 = (x126 + x107); + x129 = (x128 >> 26); + x130 = (uint32_t)(x128 & UINT32_C(0x3ffffff)); + x131 = (x129 + x106); + x132 = (x131 >> 25); + x133 = (uint32_t)(x131 & UINT32_C(0x1ffffff)); + x134 = (x132 + x105); + x135 = (x134 >> 26); + x136 = (uint32_t)(x134 & UINT32_C(0x3ffffff)); + x137 = (x135 + x104); + x138 = (x137 >> 25); + x139 = (uint32_t)(x137 & UINT32_C(0x1ffffff)); + x140 = (x138 * UINT8_C(0x13)); + x141 = (x103 + x140); + x142 = (uint32_t)(x141 >> 26); + x143 = (uint32_t)(x141 & UINT32_C(0x3ffffff)); + x144 = (x142 + x115); + x145 = (fiat_25519_uint1)(x144 >> 25); + x146 = (x144 & UINT32_C(0x1ffffff)); + x147 = (x145 + x118); + out1[0] = x143; + out1[1] = x146; + out1[2] = x147; 
+ out1[3] = x121; + out1[4] = x124; + out1[5] = x127; + out1[6] = x130; + out1[7] = x133; + out1[8] = x136; + out1[9] = x139; +} + +/* + * The function fiat_25519_carry_square squares a field element and reduces the result. + * + * Postconditions: + * eval out1 mod m = (eval arg1 * eval arg1) mod m + * + */ +static FIAT_25519_FIAT_INLINE void fiat_25519_carry_square(fiat_25519_tight_field_element out1, const fiat_25519_loose_field_element arg1) { + uint32_t x1; + uint32_t x2; + uint32_t x3; + uint32_t x4; + uint64_t x5; + uint32_t x6; + uint32_t x7; + uint32_t x8; + uint32_t x9; + uint32_t x10; + uint64_t x11; + uint32_t x12; + uint32_t x13; + uint32_t x14; + uint32_t x15; + uint32_t x16; + uint32_t x17; + uint32_t x18; + uint64_t x19; + uint64_t x20; + uint64_t x21; + uint64_t x22; + uint64_t x23; + uint64_t x24; + uint64_t x25; + uint64_t x26; + uint64_t x27; + uint64_t x28; + uint64_t x29; + uint64_t x30; + uint64_t x31; + uint64_t x32; + uint64_t x33; + uint64_t x34; + uint64_t x35; + uint64_t x36; + uint64_t x37; + uint64_t x38; + uint64_t x39; + uint64_t x40; + uint64_t x41; + uint64_t x42; + uint64_t x43; + uint64_t x44; + uint64_t x45; + uint64_t x46; + uint64_t x47; + uint64_t x48; + uint64_t x49; + uint64_t x50; + uint64_t x51; + uint64_t x52; + uint64_t x53; + uint64_t x54; + uint64_t x55; + uint64_t x56; + uint64_t x57; + uint64_t x58; + uint64_t x59; + uint64_t x60; + uint64_t x61; + uint64_t x62; + uint64_t x63; + uint64_t x64; + uint64_t x65; + uint64_t x66; + uint64_t x67; + uint64_t x68; + uint64_t x69; + uint64_t x70; + uint64_t x71; + uint64_t x72; + uint64_t x73; + uint64_t x74; + uint64_t x75; + uint32_t x76; + uint64_t x77; + uint64_t x78; + uint64_t x79; + uint64_t x80; + uint64_t x81; + uint64_t x82; + uint64_t x83; + uint64_t x84; + uint64_t x85; + uint64_t x86; + uint64_t x87; + uint32_t x88; + uint64_t x89; + uint64_t x90; + uint32_t x91; + uint64_t x92; + uint64_t x93; + uint32_t x94; + uint64_t x95; + uint64_t x96; + uint32_t x97; + uint64_t x98; + uint64_t x99; + uint32_t x100; + uint64_t x101; + uint64_t x102; + uint32_t x103; + uint64_t x104; + uint64_t x105; + uint32_t x106; + uint64_t x107; + uint64_t x108; + uint32_t x109; + uint64_t x110; + uint64_t x111; + uint32_t x112; + uint64_t x113; + uint64_t x114; + uint32_t x115; + uint32_t x116; + uint32_t x117; + fiat_25519_uint1 x118; + uint32_t x119; + uint32_t x120; + x1 = ((arg1[9]) * UINT8_C(0x13)); + x2 = (x1 * 0x2); + x3 = ((arg1[9]) * 0x2); + x4 = ((arg1[8]) * UINT8_C(0x13)); + x5 = ((uint64_t)x4 * 0x2); + x6 = ((arg1[8]) * 0x2); + x7 = ((arg1[7]) * UINT8_C(0x13)); + x8 = (x7 * 0x2); + x9 = ((arg1[7]) * 0x2); + x10 = ((arg1[6]) * UINT8_C(0x13)); + x11 = ((uint64_t)x10 * 0x2); + x12 = ((arg1[6]) * 0x2); + x13 = ((arg1[5]) * UINT8_C(0x13)); + x14 = ((arg1[5]) * 0x2); + x15 = ((arg1[4]) * 0x2); + x16 = ((arg1[3]) * 0x2); + x17 = ((arg1[2]) * 0x2); + x18 = ((arg1[1]) * 0x2); + x19 = ((uint64_t)(arg1[9]) * (x1 * 0x2)); + x20 = ((uint64_t)(arg1[8]) * x2); + x21 = ((uint64_t)(arg1[8]) * x4); + x22 = ((arg1[7]) * ((uint64_t)x2 * 0x2)); + x23 = ((arg1[7]) * x5); + x24 = ((uint64_t)(arg1[7]) * (x7 * 0x2)); + x25 = ((uint64_t)(arg1[6]) * x2); + x26 = ((arg1[6]) * x5); + x27 = ((uint64_t)(arg1[6]) * x8); + x28 = ((uint64_t)(arg1[6]) * x10); + x29 = ((arg1[5]) * ((uint64_t)x2 * 0x2)); + x30 = ((arg1[5]) * x5); + x31 = ((arg1[5]) * ((uint64_t)x8 * 0x2)); + x32 = ((arg1[5]) * x11); + x33 = ((uint64_t)(arg1[5]) * (x13 * 0x2)); + x34 = ((uint64_t)(arg1[4]) * x2); + x35 = ((arg1[4]) * x5); + x36 = ((uint64_t)(arg1[4]) * 
x8); + x37 = ((arg1[4]) * x11); + x38 = ((uint64_t)(arg1[4]) * x14); + x39 = ((uint64_t)(arg1[4]) * (arg1[4])); + x40 = ((arg1[3]) * ((uint64_t)x2 * 0x2)); + x41 = ((arg1[3]) * x5); + x42 = ((arg1[3]) * ((uint64_t)x8 * 0x2)); + x43 = ((uint64_t)(arg1[3]) * x12); + x44 = ((uint64_t)(arg1[3]) * (x14 * 0x2)); + x45 = ((uint64_t)(arg1[3]) * x15); + x46 = ((uint64_t)(arg1[3]) * ((arg1[3]) * 0x2)); + x47 = ((uint64_t)(arg1[2]) * x2); + x48 = ((arg1[2]) * x5); + x49 = ((uint64_t)(arg1[2]) * x9); + x50 = ((uint64_t)(arg1[2]) * x12); + x51 = ((uint64_t)(arg1[2]) * x14); + x52 = ((uint64_t)(arg1[2]) * x15); + x53 = ((uint64_t)(arg1[2]) * x16); + x54 = ((uint64_t)(arg1[2]) * (arg1[2])); + x55 = ((arg1[1]) * ((uint64_t)x2 * 0x2)); + x56 = ((uint64_t)(arg1[1]) * x6); + x57 = ((uint64_t)(arg1[1]) * (x9 * 0x2)); + x58 = ((uint64_t)(arg1[1]) * x12); + x59 = ((uint64_t)(arg1[1]) * (x14 * 0x2)); + x60 = ((uint64_t)(arg1[1]) * x15); + x61 = ((uint64_t)(arg1[1]) * (x16 * 0x2)); + x62 = ((uint64_t)(arg1[1]) * x17); + x63 = ((uint64_t)(arg1[1]) * ((arg1[1]) * 0x2)); + x64 = ((uint64_t)(arg1[0]) * x3); + x65 = ((uint64_t)(arg1[0]) * x6); + x66 = ((uint64_t)(arg1[0]) * x9); + x67 = ((uint64_t)(arg1[0]) * x12); + x68 = ((uint64_t)(arg1[0]) * x14); + x69 = ((uint64_t)(arg1[0]) * x15); + x70 = ((uint64_t)(arg1[0]) * x16); + x71 = ((uint64_t)(arg1[0]) * x17); + x72 = ((uint64_t)(arg1[0]) * x18); + x73 = ((uint64_t)(arg1[0]) * (arg1[0])); + x74 = (x73 + (x55 + (x48 + (x42 + (x37 + x33))))); + x75 = (x74 >> 26); + x76 = (uint32_t)(x74 & UINT32_C(0x3ffffff)); + x77 = (x64 + (x56 + (x49 + (x43 + x38)))); + x78 = (x65 + (x57 + (x50 + (x44 + (x39 + x19))))); + x79 = (x66 + (x58 + (x51 + (x45 + x20)))); + x80 = (x67 + (x59 + (x52 + (x46 + (x22 + x21))))); + x81 = (x68 + (x60 + (x53 + (x25 + x23)))); + x82 = (x69 + (x61 + (x54 + (x29 + (x26 + x24))))); + x83 = (x70 + (x62 + (x34 + (x30 + x27)))); + x84 = (x71 + (x63 + (x40 + (x35 + (x31 + x28))))); + x85 = (x72 + (x47 + (x41 + (x36 + x32)))); + x86 = (x75 + x85); + x87 = (x86 >> 25); + x88 = (uint32_t)(x86 & UINT32_C(0x1ffffff)); + x89 = (x87 + x84); + x90 = (x89 >> 26); + x91 = (uint32_t)(x89 & UINT32_C(0x3ffffff)); + x92 = (x90 + x83); + x93 = (x92 >> 25); + x94 = (uint32_t)(x92 & UINT32_C(0x1ffffff)); + x95 = (x93 + x82); + x96 = (x95 >> 26); + x97 = (uint32_t)(x95 & UINT32_C(0x3ffffff)); + x98 = (x96 + x81); + x99 = (x98 >> 25); + x100 = (uint32_t)(x98 & UINT32_C(0x1ffffff)); + x101 = (x99 + x80); + x102 = (x101 >> 26); + x103 = (uint32_t)(x101 & UINT32_C(0x3ffffff)); + x104 = (x102 + x79); + x105 = (x104 >> 25); + x106 = (uint32_t)(x104 & UINT32_C(0x1ffffff)); + x107 = (x105 + x78); + x108 = (x107 >> 26); + x109 = (uint32_t)(x107 & UINT32_C(0x3ffffff)); + x110 = (x108 + x77); + x111 = (x110 >> 25); + x112 = (uint32_t)(x110 & UINT32_C(0x1ffffff)); + x113 = (x111 * UINT8_C(0x13)); + x114 = (x76 + x113); + x115 = (uint32_t)(x114 >> 26); + x116 = (uint32_t)(x114 & UINT32_C(0x3ffffff)); + x117 = (x115 + x88); + x118 = (fiat_25519_uint1)(x117 >> 25); + x119 = (x117 & UINT32_C(0x1ffffff)); + x120 = (x118 + x91); + out1[0] = x116; + out1[1] = x119; + out1[2] = x120; + out1[3] = x94; + out1[4] = x97; + out1[5] = x100; + out1[6] = x103; + out1[7] = x106; + out1[8] = x109; + out1[9] = x112; +} + +/* + * The function fiat_25519_carry reduces a field element. 
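(For context on fiat_25519_carry_mul and fiat_25519_carry_square above, and not part of the diff itself: multiplication consumes two loose field elements and produces a tight, carried result, so small constants can be fed in directly as limb arrays. A hedged, illustrative driver, assuming the header can be included as "curve25519_32.h":)

/* Illustrative only: computes 3 * 5 in GF(2^255 - 19) with the 32-bit backend. */
#include <stdint.h>
#include <stdio.h>
#include "curve25519_32.h"

int main(void) {
  fiat_25519_loose_field_element a = {3, 0, 0, 0, 0, 0, 0, 0, 0, 0};
  fiat_25519_loose_field_element b = {5, 0, 0, 0, 0, 0, 0, 0, 0, 0};
  fiat_25519_tight_field_element r;
  fiat_25519_carry_mul(r, a, b);
  printf("r[0] = %u\n", (unsigned)r[0]); /* prints 15; remaining limbs are 0 */
  return 0;
}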
+ * + * Postconditions: + * eval out1 mod m = eval arg1 mod m + * + */ +static FIAT_25519_FIAT_INLINE void fiat_25519_carry(fiat_25519_tight_field_element out1, const fiat_25519_loose_field_element arg1) { + uint32_t x1; + uint32_t x2; + uint32_t x3; + uint32_t x4; + uint32_t x5; + uint32_t x6; + uint32_t x7; + uint32_t x8; + uint32_t x9; + uint32_t x10; + uint32_t x11; + uint32_t x12; + uint32_t x13; + uint32_t x14; + uint32_t x15; + uint32_t x16; + uint32_t x17; + uint32_t x18; + uint32_t x19; + uint32_t x20; + uint32_t x21; + uint32_t x22; + x1 = (arg1[0]); + x2 = ((x1 >> 26) + (arg1[1])); + x3 = ((x2 >> 25) + (arg1[2])); + x4 = ((x3 >> 26) + (arg1[3])); + x5 = ((x4 >> 25) + (arg1[4])); + x6 = ((x5 >> 26) + (arg1[5])); + x7 = ((x6 >> 25) + (arg1[6])); + x8 = ((x7 >> 26) + (arg1[7])); + x9 = ((x8 >> 25) + (arg1[8])); + x10 = ((x9 >> 26) + (arg1[9])); + x11 = ((x1 & UINT32_C(0x3ffffff)) + ((x10 >> 25) * UINT8_C(0x13))); + x12 = ((fiat_25519_uint1)(x11 >> 26) + (x2 & UINT32_C(0x1ffffff))); + x13 = (x11 & UINT32_C(0x3ffffff)); + x14 = (x12 & UINT32_C(0x1ffffff)); + x15 = ((fiat_25519_uint1)(x12 >> 25) + (x3 & UINT32_C(0x3ffffff))); + x16 = (x4 & UINT32_C(0x1ffffff)); + x17 = (x5 & UINT32_C(0x3ffffff)); + x18 = (x6 & UINT32_C(0x1ffffff)); + x19 = (x7 & UINT32_C(0x3ffffff)); + x20 = (x8 & UINT32_C(0x1ffffff)); + x21 = (x9 & UINT32_C(0x3ffffff)); + x22 = (x10 & UINT32_C(0x1ffffff)); + out1[0] = x13; + out1[1] = x14; + out1[2] = x15; + out1[3] = x16; + out1[4] = x17; + out1[5] = x18; + out1[6] = x19; + out1[7] = x20; + out1[8] = x21; + out1[9] = x22; +} + +/* + * The function fiat_25519_add adds two field elements. + * + * Postconditions: + * eval out1 mod m = (eval arg1 + eval arg2) mod m + * + */ +static FIAT_25519_FIAT_INLINE void fiat_25519_add(fiat_25519_loose_field_element out1, const fiat_25519_tight_field_element arg1, const fiat_25519_tight_field_element arg2) { + uint32_t x1; + uint32_t x2; + uint32_t x3; + uint32_t x4; + uint32_t x5; + uint32_t x6; + uint32_t x7; + uint32_t x8; + uint32_t x9; + uint32_t x10; + x1 = ((arg1[0]) + (arg2[0])); + x2 = ((arg1[1]) + (arg2[1])); + x3 = ((arg1[2]) + (arg2[2])); + x4 = ((arg1[3]) + (arg2[3])); + x5 = ((arg1[4]) + (arg2[4])); + x6 = ((arg1[5]) + (arg2[5])); + x7 = ((arg1[6]) + (arg2[6])); + x8 = ((arg1[7]) + (arg2[7])); + x9 = ((arg1[8]) + (arg2[8])); + x10 = ((arg1[9]) + (arg2[9])); + out1[0] = x1; + out1[1] = x2; + out1[2] = x3; + out1[3] = x4; + out1[4] = x5; + out1[5] = x6; + out1[6] = x7; + out1[7] = x8; + out1[8] = x9; + out1[9] = x10; +} + +/* + * The function fiat_25519_sub subtracts two field elements. 
+ * + * Postconditions: + * eval out1 mod m = (eval arg1 - eval arg2) mod m + * + */ +static FIAT_25519_FIAT_INLINE void fiat_25519_sub(fiat_25519_loose_field_element out1, const fiat_25519_tight_field_element arg1, const fiat_25519_tight_field_element arg2) { + uint32_t x1; + uint32_t x2; + uint32_t x3; + uint32_t x4; + uint32_t x5; + uint32_t x6; + uint32_t x7; + uint32_t x8; + uint32_t x9; + uint32_t x10; + x1 = ((UINT32_C(0x7ffffda) + (arg1[0])) - (arg2[0])); + x2 = ((UINT32_C(0x3fffffe) + (arg1[1])) - (arg2[1])); + x3 = ((UINT32_C(0x7fffffe) + (arg1[2])) - (arg2[2])); + x4 = ((UINT32_C(0x3fffffe) + (arg1[3])) - (arg2[3])); + x5 = ((UINT32_C(0x7fffffe) + (arg1[4])) - (arg2[4])); + x6 = ((UINT32_C(0x3fffffe) + (arg1[5])) - (arg2[5])); + x7 = ((UINT32_C(0x7fffffe) + (arg1[6])) - (arg2[6])); + x8 = ((UINT32_C(0x3fffffe) + (arg1[7])) - (arg2[7])); + x9 = ((UINT32_C(0x7fffffe) + (arg1[8])) - (arg2[8])); + x10 = ((UINT32_C(0x3fffffe) + (arg1[9])) - (arg2[9])); + out1[0] = x1; + out1[1] = x2; + out1[2] = x3; + out1[3] = x4; + out1[4] = x5; + out1[5] = x6; + out1[6] = x7; + out1[7] = x8; + out1[8] = x9; + out1[9] = x10; +} + +/* + * The function fiat_25519_opp negates a field element. + * + * Postconditions: + * eval out1 mod m = -eval arg1 mod m + * + */ +static FIAT_25519_FIAT_INLINE void fiat_25519_opp(fiat_25519_loose_field_element out1, const fiat_25519_tight_field_element arg1) { + uint32_t x1; + uint32_t x2; + uint32_t x3; + uint32_t x4; + uint32_t x5; + uint32_t x6; + uint32_t x7; + uint32_t x8; + uint32_t x9; + uint32_t x10; + x1 = (UINT32_C(0x7ffffda) - (arg1[0])); + x2 = (UINT32_C(0x3fffffe) - (arg1[1])); + x3 = (UINT32_C(0x7fffffe) - (arg1[2])); + x4 = (UINT32_C(0x3fffffe) - (arg1[3])); + x5 = (UINT32_C(0x7fffffe) - (arg1[4])); + x6 = (UINT32_C(0x3fffffe) - (arg1[5])); + x7 = (UINT32_C(0x7fffffe) - (arg1[6])); + x8 = (UINT32_C(0x3fffffe) - (arg1[7])); + x9 = (UINT32_C(0x7fffffe) - (arg1[8])); + x10 = (UINT32_C(0x3fffffe) - (arg1[9])); + out1[0] = x1; + out1[1] = x2; + out1[2] = x3; + out1[3] = x4; + out1[4] = x5; + out1[5] = x6; + out1[6] = x7; + out1[7] = x8; + out1[8] = x9; + out1[9] = x10; +} + +/* + * The function fiat_25519_to_bytes serializes a field element to bytes in little-endian order. 
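(A note on the add/sub/opp functions above, separate from the diff: they only combine limbs, so their outputs are loose elements, and callers are expected to run fiat_25519_carry before handing the result to a tight-input function such as to_bytes. The wrapper below is a hypothetical illustration of that pattern; add_and_reduce is an invented name, not part of ring or fiat-crypto.)

/* Illustrative sketch only. */
#include <stdint.h>
#include "curve25519_32.h"

static void add_and_reduce(fiat_25519_tight_field_element out,
                           const fiat_25519_tight_field_element a,
                           const fiat_25519_tight_field_element b) {
  fiat_25519_loose_field_element t;
  fiat_25519_add(t, a, b);   /* limb-wise add; bounds grow to "loose" */
  fiat_25519_carry(out, t);  /* propagate carries; restore tight bounds */
}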
+ * + * Postconditions: + * out1 = map (λ x, ⌊((eval arg1 mod m) mod 2^(8 * (x + 1))) / 2^(8 * x)⌋) [0..31] + * + * Output Bounds: + * out1: [[0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0x7f]] + */ +static FIAT_25519_FIAT_INLINE void fiat_25519_to_bytes(uint8_t out1[32], const fiat_25519_tight_field_element arg1) { + uint32_t x1; + fiat_25519_uint1 x2; + uint32_t x3; + fiat_25519_uint1 x4; + uint32_t x5; + fiat_25519_uint1 x6; + uint32_t x7; + fiat_25519_uint1 x8; + uint32_t x9; + fiat_25519_uint1 x10; + uint32_t x11; + fiat_25519_uint1 x12; + uint32_t x13; + fiat_25519_uint1 x14; + uint32_t x15; + fiat_25519_uint1 x16; + uint32_t x17; + fiat_25519_uint1 x18; + uint32_t x19; + fiat_25519_uint1 x20; + uint32_t x21; + uint32_t x22; + fiat_25519_uint1 x23; + uint32_t x24; + fiat_25519_uint1 x25; + uint32_t x26; + fiat_25519_uint1 x27; + uint32_t x28; + fiat_25519_uint1 x29; + uint32_t x30; + fiat_25519_uint1 x31; + uint32_t x32; + fiat_25519_uint1 x33; + uint32_t x34; + fiat_25519_uint1 x35; + uint32_t x36; + fiat_25519_uint1 x37; + uint32_t x38; + fiat_25519_uint1 x39; + uint32_t x40; + fiat_25519_uint1 x41; + uint32_t x42; + uint32_t x43; + uint32_t x44; + uint32_t x45; + uint32_t x46; + uint32_t x47; + uint32_t x48; + uint32_t x49; + uint8_t x50; + uint32_t x51; + uint8_t x52; + uint32_t x53; + uint8_t x54; + uint8_t x55; + uint32_t x56; + uint8_t x57; + uint32_t x58; + uint8_t x59; + uint32_t x60; + uint8_t x61; + uint8_t x62; + uint32_t x63; + uint8_t x64; + uint32_t x65; + uint8_t x66; + uint32_t x67; + uint8_t x68; + uint8_t x69; + uint32_t x70; + uint8_t x71; + uint32_t x72; + uint8_t x73; + uint32_t x74; + uint8_t x75; + uint8_t x76; + uint32_t x77; + uint8_t x78; + uint32_t x79; + uint8_t x80; + uint32_t x81; + uint8_t x82; + uint8_t x83; + uint8_t x84; + uint32_t x85; + uint8_t x86; + uint32_t x87; + uint8_t x88; + fiat_25519_uint1 x89; + uint32_t x90; + uint8_t x91; + uint32_t x92; + uint8_t x93; + uint32_t x94; + uint8_t x95; + uint8_t x96; + uint32_t x97; + uint8_t x98; + uint32_t x99; + uint8_t x100; + uint32_t x101; + uint8_t x102; + uint8_t x103; + uint32_t x104; + uint8_t x105; + uint32_t x106; + uint8_t x107; + uint32_t x108; + uint8_t x109; + uint8_t x110; + uint32_t x111; + uint8_t x112; + uint32_t x113; + uint8_t x114; + uint32_t x115; + uint8_t x116; + uint8_t x117; + fiat_25519_subborrowx_u26(&x1, &x2, 0x0, (arg1[0]), UINT32_C(0x3ffffed)); + fiat_25519_subborrowx_u25(&x3, &x4, x2, (arg1[1]), UINT32_C(0x1ffffff)); + fiat_25519_subborrowx_u26(&x5, &x6, x4, (arg1[2]), UINT32_C(0x3ffffff)); + fiat_25519_subborrowx_u25(&x7, &x8, x6, (arg1[3]), UINT32_C(0x1ffffff)); + fiat_25519_subborrowx_u26(&x9, &x10, x8, (arg1[4]), UINT32_C(0x3ffffff)); + fiat_25519_subborrowx_u25(&x11, &x12, x10, (arg1[5]), UINT32_C(0x1ffffff)); + fiat_25519_subborrowx_u26(&x13, &x14, x12, (arg1[6]), UINT32_C(0x3ffffff)); + fiat_25519_subborrowx_u25(&x15, &x16, x14, (arg1[7]), UINT32_C(0x1ffffff)); + fiat_25519_subborrowx_u26(&x17, &x18, x16, (arg1[8]), UINT32_C(0x3ffffff)); + fiat_25519_subborrowx_u25(&x19, &x20, x18, (arg1[9]), UINT32_C(0x1ffffff)); + fiat_25519_cmovznz_u32(&x21, 
x20, 0x0, UINT32_C(0xffffffff)); + fiat_25519_addcarryx_u26(&x22, &x23, 0x0, x1, (x21 & UINT32_C(0x3ffffed))); + fiat_25519_addcarryx_u25(&x24, &x25, x23, x3, (x21 & UINT32_C(0x1ffffff))); + fiat_25519_addcarryx_u26(&x26, &x27, x25, x5, (x21 & UINT32_C(0x3ffffff))); + fiat_25519_addcarryx_u25(&x28, &x29, x27, x7, (x21 & UINT32_C(0x1ffffff))); + fiat_25519_addcarryx_u26(&x30, &x31, x29, x9, (x21 & UINT32_C(0x3ffffff))); + fiat_25519_addcarryx_u25(&x32, &x33, x31, x11, (x21 & UINT32_C(0x1ffffff))); + fiat_25519_addcarryx_u26(&x34, &x35, x33, x13, (x21 & UINT32_C(0x3ffffff))); + fiat_25519_addcarryx_u25(&x36, &x37, x35, x15, (x21 & UINT32_C(0x1ffffff))); + fiat_25519_addcarryx_u26(&x38, &x39, x37, x17, (x21 & UINT32_C(0x3ffffff))); + fiat_25519_addcarryx_u25(&x40, &x41, x39, x19, (x21 & UINT32_C(0x1ffffff))); + x42 = (x40 << 6); + x43 = (x38 << 4); + x44 = (x36 << 3); + x45 = (x34 * (uint32_t)0x2); + x46 = (x30 << 6); + x47 = (x28 << 5); + x48 = (x26 << 3); + x49 = (x24 << 2); + x50 = (uint8_t)(x22 & UINT8_C(0xff)); + x51 = (x22 >> 8); + x52 = (uint8_t)(x51 & UINT8_C(0xff)); + x53 = (x51 >> 8); + x54 = (uint8_t)(x53 & UINT8_C(0xff)); + x55 = (uint8_t)(x53 >> 8); + x56 = (x49 + (uint32_t)x55); + x57 = (uint8_t)(x56 & UINT8_C(0xff)); + x58 = (x56 >> 8); + x59 = (uint8_t)(x58 & UINT8_C(0xff)); + x60 = (x58 >> 8); + x61 = (uint8_t)(x60 & UINT8_C(0xff)); + x62 = (uint8_t)(x60 >> 8); + x63 = (x48 + (uint32_t)x62); + x64 = (uint8_t)(x63 & UINT8_C(0xff)); + x65 = (x63 >> 8); + x66 = (uint8_t)(x65 & UINT8_C(0xff)); + x67 = (x65 >> 8); + x68 = (uint8_t)(x67 & UINT8_C(0xff)); + x69 = (uint8_t)(x67 >> 8); + x70 = (x47 + (uint32_t)x69); + x71 = (uint8_t)(x70 & UINT8_C(0xff)); + x72 = (x70 >> 8); + x73 = (uint8_t)(x72 & UINT8_C(0xff)); + x74 = (x72 >> 8); + x75 = (uint8_t)(x74 & UINT8_C(0xff)); + x76 = (uint8_t)(x74 >> 8); + x77 = (x46 + (uint32_t)x76); + x78 = (uint8_t)(x77 & UINT8_C(0xff)); + x79 = (x77 >> 8); + x80 = (uint8_t)(x79 & UINT8_C(0xff)); + x81 = (x79 >> 8); + x82 = (uint8_t)(x81 & UINT8_C(0xff)); + x83 = (uint8_t)(x81 >> 8); + x84 = (uint8_t)(x32 & UINT8_C(0xff)); + x85 = (x32 >> 8); + x86 = (uint8_t)(x85 & UINT8_C(0xff)); + x87 = (x85 >> 8); + x88 = (uint8_t)(x87 & UINT8_C(0xff)); + x89 = (fiat_25519_uint1)(x87 >> 8); + x90 = (x45 + (uint32_t)x89); + x91 = (uint8_t)(x90 & UINT8_C(0xff)); + x92 = (x90 >> 8); + x93 = (uint8_t)(x92 & UINT8_C(0xff)); + x94 = (x92 >> 8); + x95 = (uint8_t)(x94 & UINT8_C(0xff)); + x96 = (uint8_t)(x94 >> 8); + x97 = (x44 + (uint32_t)x96); + x98 = (uint8_t)(x97 & UINT8_C(0xff)); + x99 = (x97 >> 8); + x100 = (uint8_t)(x99 & UINT8_C(0xff)); + x101 = (x99 >> 8); + x102 = (uint8_t)(x101 & UINT8_C(0xff)); + x103 = (uint8_t)(x101 >> 8); + x104 = (x43 + (uint32_t)x103); + x105 = (uint8_t)(x104 & UINT8_C(0xff)); + x106 = (x104 >> 8); + x107 = (uint8_t)(x106 & UINT8_C(0xff)); + x108 = (x106 >> 8); + x109 = (uint8_t)(x108 & UINT8_C(0xff)); + x110 = (uint8_t)(x108 >> 8); + x111 = (x42 + (uint32_t)x110); + x112 = (uint8_t)(x111 & UINT8_C(0xff)); + x113 = (x111 >> 8); + x114 = (uint8_t)(x113 & UINT8_C(0xff)); + x115 = (x113 >> 8); + x116 = (uint8_t)(x115 & UINT8_C(0xff)); + x117 = (uint8_t)(x115 >> 8); + out1[0] = x50; + out1[1] = x52; + out1[2] = x54; + out1[3] = x57; + out1[4] = x59; + out1[5] = x61; + out1[6] = x64; + out1[7] = x66; + out1[8] = x68; + out1[9] = x71; + out1[10] = x73; + out1[11] = x75; + out1[12] = x78; + out1[13] = x80; + out1[14] = x82; + out1[15] = x83; + out1[16] = x84; + out1[17] = x86; + out1[18] = x88; + out1[19] = x91; + out1[20] = x93; + out1[21] = 
x95; + out1[22] = x98; + out1[23] = x100; + out1[24] = x102; + out1[25] = x105; + out1[26] = x107; + out1[27] = x109; + out1[28] = x112; + out1[29] = x114; + out1[30] = x116; + out1[31] = x117; +} + +/* + * The function fiat_25519_from_bytes deserializes a field element from bytes in little-endian order. + * + * Postconditions: + * eval out1 mod m = bytes_eval arg1 mod m + * + * Input Bounds: + * arg1: [[0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0x7f]] + */ +static FIAT_25519_FIAT_INLINE void fiat_25519_from_bytes(fiat_25519_tight_field_element out1, const uint8_t arg1[32]) { + uint32_t x1; + uint32_t x2; + uint32_t x3; + uint32_t x4; + uint32_t x5; + uint32_t x6; + uint32_t x7; + uint32_t x8; + uint32_t x9; + uint32_t x10; + uint32_t x11; + uint32_t x12; + uint32_t x13; + uint32_t x14; + uint32_t x15; + uint8_t x16; + uint32_t x17; + uint32_t x18; + uint32_t x19; + uint32_t x20; + uint32_t x21; + uint32_t x22; + uint32_t x23; + uint32_t x24; + uint32_t x25; + uint32_t x26; + uint32_t x27; + uint32_t x28; + uint32_t x29; + uint32_t x30; + uint32_t x31; + uint8_t x32; + uint32_t x33; + uint32_t x34; + uint32_t x35; + uint32_t x36; + uint8_t x37; + uint32_t x38; + uint32_t x39; + uint32_t x40; + uint32_t x41; + uint8_t x42; + uint32_t x43; + uint32_t x44; + uint32_t x45; + uint32_t x46; + uint8_t x47; + uint32_t x48; + uint32_t x49; + uint32_t x50; + uint32_t x51; + uint8_t x52; + uint32_t x53; + uint32_t x54; + uint32_t x55; + uint32_t x56; + uint32_t x57; + uint32_t x58; + uint32_t x59; + uint8_t x60; + uint32_t x61; + uint32_t x62; + uint32_t x63; + uint32_t x64; + uint8_t x65; + uint32_t x66; + uint32_t x67; + uint32_t x68; + uint32_t x69; + uint8_t x70; + uint32_t x71; + uint32_t x72; + uint32_t x73; + uint32_t x74; + uint8_t x75; + uint32_t x76; + uint32_t x77; + uint32_t x78; + x1 = ((uint32_t)(arg1[31]) << 18); + x2 = ((uint32_t)(arg1[30]) << 10); + x3 = ((uint32_t)(arg1[29]) << 2); + x4 = ((uint32_t)(arg1[28]) << 20); + x5 = ((uint32_t)(arg1[27]) << 12); + x6 = ((uint32_t)(arg1[26]) << 4); + x7 = ((uint32_t)(arg1[25]) << 21); + x8 = ((uint32_t)(arg1[24]) << 13); + x9 = ((uint32_t)(arg1[23]) << 5); + x10 = ((uint32_t)(arg1[22]) << 23); + x11 = ((uint32_t)(arg1[21]) << 15); + x12 = ((uint32_t)(arg1[20]) << 7); + x13 = ((uint32_t)(arg1[19]) << 24); + x14 = ((uint32_t)(arg1[18]) << 16); + x15 = ((uint32_t)(arg1[17]) << 8); + x16 = (arg1[16]); + x17 = ((uint32_t)(arg1[15]) << 18); + x18 = ((uint32_t)(arg1[14]) << 10); + x19 = ((uint32_t)(arg1[13]) << 2); + x20 = ((uint32_t)(arg1[12]) << 19); + x21 = ((uint32_t)(arg1[11]) << 11); + x22 = ((uint32_t)(arg1[10]) << 3); + x23 = ((uint32_t)(arg1[9]) << 21); + x24 = ((uint32_t)(arg1[8]) << 13); + x25 = ((uint32_t)(arg1[7]) << 5); + x26 = ((uint32_t)(arg1[6]) << 22); + x27 = ((uint32_t)(arg1[5]) << 14); + x28 = ((uint32_t)(arg1[4]) << 6); + x29 = ((uint32_t)(arg1[3]) << 24); + x30 = ((uint32_t)(arg1[2]) << 16); + x31 = ((uint32_t)(arg1[1]) << 8); + x32 = (arg1[0]); + x33 = (x31 + (uint32_t)x32); + x34 = (x30 + x33); + x35 = (x29 + x34); + x36 = (x35 & UINT32_C(0x3ffffff)); + x37 = (uint8_t)(x35 >> 26); + x38 = (x28 + 
(uint32_t)x37); + x39 = (x27 + x38); + x40 = (x26 + x39); + x41 = (x40 & UINT32_C(0x1ffffff)); + x42 = (uint8_t)(x40 >> 25); + x43 = (x25 + (uint32_t)x42); + x44 = (x24 + x43); + x45 = (x23 + x44); + x46 = (x45 & UINT32_C(0x3ffffff)); + x47 = (uint8_t)(x45 >> 26); + x48 = (x22 + (uint32_t)x47); + x49 = (x21 + x48); + x50 = (x20 + x49); + x51 = (x50 & UINT32_C(0x1ffffff)); + x52 = (uint8_t)(x50 >> 25); + x53 = (x19 + (uint32_t)x52); + x54 = (x18 + x53); + x55 = (x17 + x54); + x56 = (x15 + (uint32_t)x16); + x57 = (x14 + x56); + x58 = (x13 + x57); + x59 = (x58 & UINT32_C(0x1ffffff)); + x60 = (uint8_t)(x58 >> 25); + x61 = (x12 + (uint32_t)x60); + x62 = (x11 + x61); + x63 = (x10 + x62); + x64 = (x63 & UINT32_C(0x3ffffff)); + x65 = (uint8_t)(x63 >> 26); + x66 = (x9 + (uint32_t)x65); + x67 = (x8 + x66); + x68 = (x7 + x67); + x69 = (x68 & UINT32_C(0x1ffffff)); + x70 = (uint8_t)(x68 >> 25); + x71 = (x6 + (uint32_t)x70); + x72 = (x5 + x71); + x73 = (x4 + x72); + x74 = (x73 & UINT32_C(0x3ffffff)); + x75 = (uint8_t)(x73 >> 26); + x76 = (x3 + (uint32_t)x75); + x77 = (x2 + x76); + x78 = (x1 + x77); + out1[0] = x36; + out1[1] = x41; + out1[2] = x46; + out1[3] = x51; + out1[4] = x55; + out1[5] = x59; + out1[6] = x64; + out1[7] = x69; + out1[8] = x74; + out1[9] = x78; +} + +/* + * The function fiat_25519_carry_scmul_121666 multiplies a field element by 121666 and reduces the result. + * + * Postconditions: + * eval out1 mod m = (121666 * eval arg1) mod m + * + */ +static FIAT_25519_FIAT_INLINE void fiat_25519_carry_scmul_121666(fiat_25519_tight_field_element out1, const fiat_25519_loose_field_element arg1) { + uint64_t x1; + uint64_t x2; + uint64_t x3; + uint64_t x4; + uint64_t x5; + uint64_t x6; + uint64_t x7; + uint64_t x8; + uint64_t x9; + uint64_t x10; + uint32_t x11; + uint32_t x12; + uint64_t x13; + uint32_t x14; + uint32_t x15; + uint64_t x16; + uint32_t x17; + uint32_t x18; + uint64_t x19; + uint32_t x20; + uint32_t x21; + uint64_t x22; + uint32_t x23; + uint32_t x24; + uint64_t x25; + uint32_t x26; + uint32_t x27; + uint64_t x28; + uint32_t x29; + uint32_t x30; + uint64_t x31; + uint32_t x32; + uint32_t x33; + uint64_t x34; + uint32_t x35; + uint32_t x36; + uint64_t x37; + uint32_t x38; + uint32_t x39; + uint32_t x40; + uint32_t x41; + fiat_25519_uint1 x42; + uint32_t x43; + uint32_t x44; + fiat_25519_uint1 x45; + uint32_t x46; + uint32_t x47; + x1 = ((uint64_t)UINT32_C(0x1db42) * (arg1[9])); + x2 = ((uint64_t)UINT32_C(0x1db42) * (arg1[8])); + x3 = ((uint64_t)UINT32_C(0x1db42) * (arg1[7])); + x4 = ((uint64_t)UINT32_C(0x1db42) * (arg1[6])); + x5 = ((uint64_t)UINT32_C(0x1db42) * (arg1[5])); + x6 = ((uint64_t)UINT32_C(0x1db42) * (arg1[4])); + x7 = ((uint64_t)UINT32_C(0x1db42) * (arg1[3])); + x8 = ((uint64_t)UINT32_C(0x1db42) * (arg1[2])); + x9 = ((uint64_t)UINT32_C(0x1db42) * (arg1[1])); + x10 = ((uint64_t)UINT32_C(0x1db42) * (arg1[0])); + x11 = (uint32_t)(x10 >> 26); + x12 = (uint32_t)(x10 & UINT32_C(0x3ffffff)); + x13 = (x11 + x9); + x14 = (uint32_t)(x13 >> 25); + x15 = (uint32_t)(x13 & UINT32_C(0x1ffffff)); + x16 = (x14 + x8); + x17 = (uint32_t)(x16 >> 26); + x18 = (uint32_t)(x16 & UINT32_C(0x3ffffff)); + x19 = (x17 + x7); + x20 = (uint32_t)(x19 >> 25); + x21 = (uint32_t)(x19 & UINT32_C(0x1ffffff)); + x22 = (x20 + x6); + x23 = (uint32_t)(x22 >> 26); + x24 = (uint32_t)(x22 & UINT32_C(0x3ffffff)); + x25 = (x23 + x5); + x26 = (uint32_t)(x25 >> 25); + x27 = (uint32_t)(x25 & UINT32_C(0x1ffffff)); + x28 = (x26 + x4); + x29 = (uint32_t)(x28 >> 26); + x30 = (uint32_t)(x28 & UINT32_C(0x3ffffff)); + x31 = 
(x29 + x3); + x32 = (uint32_t)(x31 >> 25); + x33 = (uint32_t)(x31 & UINT32_C(0x1ffffff)); + x34 = (x32 + x2); + x35 = (uint32_t)(x34 >> 26); + x36 = (uint32_t)(x34 & UINT32_C(0x3ffffff)); + x37 = (x35 + x1); + x38 = (uint32_t)(x37 >> 25); + x39 = (uint32_t)(x37 & UINT32_C(0x1ffffff)); + x40 = (x38 * UINT8_C(0x13)); + x41 = (x12 + x40); + x42 = (fiat_25519_uint1)(x41 >> 26); + x43 = (x41 & UINT32_C(0x3ffffff)); + x44 = (x42 + x15); + x45 = (fiat_25519_uint1)(x44 >> 25); + x46 = (x44 & UINT32_C(0x1ffffff)); + x47 = (x45 + x18); + out1[0] = x43; + out1[1] = x46; + out1[2] = x47; + out1[3] = x21; + out1[4] = x24; + out1[5] = x27; + out1[6] = x30; + out1[7] = x33; + out1[8] = x36; + out1[9] = x39; +} diff --git a/ring-0.17.14/third_party/fiat/curve25519_64.h b/ring-0.17.14/third_party/fiat/curve25519_64.h new file mode 100644 index 0000000000..ea2c23ca27 --- /dev/null +++ b/ring-0.17.14/third_party/fiat/curve25519_64.h @@ -0,0 +1,916 @@ +/* Autogenerated: 'src/ExtractionOCaml/unsaturated_solinas' --inline --static --use-value-barrier 25519 64 '(auto)' '2^255 - 19' carry_mul carry_square carry add sub opp selectznz to_bytes from_bytes relax carry_scmul121666 */ +/* curve description: 25519 */ +/* machine_wordsize = 64 (from "64") */ +/* requested operations: carry_mul, carry_square, carry, add, sub, opp, selectznz, to_bytes, from_bytes, relax, carry_scmul121666 */ +/* n = 5 (from "(auto)") */ +/* s-c = 2^255 - [(1, 19)] (from "2^255 - 19") */ +/* tight_bounds_multiplier = 1 (from "") */ +/* */ +/* Computed values: */ +/* carry_chain = [0, 1, 2, 3, 4, 0, 1] */ +/* eval z = z[0] + (z[1] << 51) + (z[2] << 102) + (z[3] << 153) + (z[4] << 204) */ +/* bytes_eval z = z[0] + (z[1] << 8) + (z[2] << 16) + (z[3] << 24) + (z[4] << 32) + (z[5] << 40) + (z[6] << 48) + (z[7] << 56) + (z[8] << 64) + (z[9] << 72) + (z[10] << 80) + (z[11] << 88) + (z[12] << 96) + (z[13] << 104) + (z[14] << 112) + (z[15] << 120) + (z[16] << 128) + (z[17] << 136) + (z[18] << 144) + (z[19] << 152) + (z[20] << 160) + (z[21] << 168) + (z[22] << 176) + (z[23] << 184) + (z[24] << 192) + (z[25] << 200) + (z[26] << 208) + (z[27] << 216) + (z[28] << 224) + (z[29] << 232) + (z[30] << 240) + (z[31] << 248) */ +/* balance = [0xfffffffffffda, 0xffffffffffffe, 0xffffffffffffe, 0xffffffffffffe, 0xffffffffffffe] */ + +#include +typedef unsigned char fiat_25519_uint1; +typedef signed char fiat_25519_int1; +#if defined(__GNUC__) || defined(__clang__) +# define FIAT_25519_FIAT_EXTENSION __extension__ +# define FIAT_25519_FIAT_INLINE __inline__ +#else +# define FIAT_25519_FIAT_EXTENSION +# define FIAT_25519_FIAT_INLINE +#endif + +FIAT_25519_FIAT_EXTENSION typedef signed __int128 fiat_25519_int128; +FIAT_25519_FIAT_EXTENSION typedef unsigned __int128 fiat_25519_uint128; + +/* The type fiat_25519_loose_field_element is a field element with loose bounds. */ +/* Bounds: [[0x0 ~> 0x18000000000000], [0x0 ~> 0x18000000000000], [0x0 ~> 0x18000000000000], [0x0 ~> 0x18000000000000], [0x0 ~> 0x18000000000000]] */ +typedef uint64_t fiat_25519_loose_field_element[5]; + +/* The type fiat_25519_tight_field_element is a field element with tight bounds. 
*/ +/* Bounds: [[0x0 ~> 0x8000000000000], [0x0 ~> 0x8000000000000], [0x0 ~> 0x8000000000000], [0x0 ~> 0x8000000000000], [0x0 ~> 0x8000000000000]] */ +typedef uint64_t fiat_25519_tight_field_element[5]; + +#if (-1 & 3) != 3 +#error "This code only works on a two's complement system" +#endif + +#if !defined(FIAT_25519_NO_ASM) && (defined(__GNUC__) || defined(__clang__)) +static __inline__ uint64_t fiat_25519_value_barrier_u64(uint64_t a) { + __asm__("" : "+r"(a) : /* no inputs */); + return a; +} +#else +# define fiat_25519_value_barrier_u64(x) (x) +#endif + + +/* + * The function fiat_25519_addcarryx_u51 is an addition with carry. + * + * Postconditions: + * out1 = (arg1 + arg2 + arg3) mod 2^51 + * out2 = ⌊(arg1 + arg2 + arg3) / 2^51⌋ + * + * Input Bounds: + * arg1: [0x0 ~> 0x1] + * arg2: [0x0 ~> 0x7ffffffffffff] + * arg3: [0x0 ~> 0x7ffffffffffff] + * Output Bounds: + * out1: [0x0 ~> 0x7ffffffffffff] + * out2: [0x0 ~> 0x1] + */ +static FIAT_25519_FIAT_INLINE void fiat_25519_addcarryx_u51(uint64_t* out1, fiat_25519_uint1* out2, fiat_25519_uint1 arg1, uint64_t arg2, uint64_t arg3) { + uint64_t x1; + uint64_t x2; + fiat_25519_uint1 x3; + x1 = ((arg1 + arg2) + arg3); + x2 = (x1 & UINT64_C(0x7ffffffffffff)); + x3 = (fiat_25519_uint1)(x1 >> 51); + *out1 = x2; + *out2 = x3; +} + +/* + * The function fiat_25519_subborrowx_u51 is a subtraction with borrow. + * + * Postconditions: + * out1 = (-arg1 + arg2 + -arg3) mod 2^51 + * out2 = -⌊(-arg1 + arg2 + -arg3) / 2^51⌋ + * + * Input Bounds: + * arg1: [0x0 ~> 0x1] + * arg2: [0x0 ~> 0x7ffffffffffff] + * arg3: [0x0 ~> 0x7ffffffffffff] + * Output Bounds: + * out1: [0x0 ~> 0x7ffffffffffff] + * out2: [0x0 ~> 0x1] + */ +static FIAT_25519_FIAT_INLINE void fiat_25519_subborrowx_u51(uint64_t* out1, fiat_25519_uint1* out2, fiat_25519_uint1 arg1, uint64_t arg2, uint64_t arg3) { + int64_t x1; + fiat_25519_int1 x2; + uint64_t x3; + x1 = ((int64_t)(arg2 - (int64_t)arg1) - (int64_t)arg3); + x2 = (fiat_25519_int1)(x1 >> 51); + x3 = (x1 & UINT64_C(0x7ffffffffffff)); + *out1 = x3; + *out2 = (fiat_25519_uint1)(0x0 - x2); +} + +/* + * The function fiat_25519_cmovznz_u64 is a single-word conditional move. + * + * Postconditions: + * out1 = (if arg1 = 0 then arg2 else arg3) + * + * Input Bounds: + * arg1: [0x0 ~> 0x1] + * arg2: [0x0 ~> 0xffffffffffffffff] + * arg3: [0x0 ~> 0xffffffffffffffff] + * Output Bounds: + * out1: [0x0 ~> 0xffffffffffffffff] + */ +static FIAT_25519_FIAT_INLINE void fiat_25519_cmovznz_u64(uint64_t* out1, fiat_25519_uint1 arg1, uint64_t arg2, uint64_t arg3) { + fiat_25519_uint1 x1; + uint64_t x2; + uint64_t x3; + x1 = (!(!arg1)); + x2 = ((fiat_25519_int1)(0x0 - x1) & UINT64_C(0xffffffffffffffff)); + x3 = ((fiat_25519_value_barrier_u64(x2) & arg3) | (fiat_25519_value_barrier_u64((~x2)) & arg2)); + *out1 = x3; +} + +/* + * The function fiat_25519_carry_mul multiplies two field elements and reduces the result. 
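(Editorial aside, not part of the diff: fiat_25519_cmovznz_u64 above is the building block for branch-free selection; the inline-asm value barrier keeps the compiler from rewriting the masked select back into a data-dependent branch. A hedged sketch of selecting one of two 5-limb vectors in constant time follows; fe_select is an invented helper name and the include path is an assumption.)

/* Illustrative only. */
#include <stdint.h>
#include "curve25519_64.h"

static void fe_select(uint64_t out[5], fiat_25519_uint1 cond,
                      const uint64_t if_zero[5], const uint64_t if_nonzero[5]) {
  for (int i = 0; i < 5; i++) {
    /* out[i] = cond ? if_nonzero[i] : if_zero[i], with no secret-dependent branch */
    fiat_25519_cmovznz_u64(&out[i], cond, if_zero[i], if_nonzero[i]);
  }
}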
+ * + * Postconditions: + * eval out1 mod m = (eval arg1 * eval arg2) mod m + * + */ +static FIAT_25519_FIAT_INLINE void fiat_25519_carry_mul(fiat_25519_tight_field_element out1, const fiat_25519_loose_field_element arg1, const fiat_25519_loose_field_element arg2) { + fiat_25519_uint128 x1; + fiat_25519_uint128 x2; + fiat_25519_uint128 x3; + fiat_25519_uint128 x4; + fiat_25519_uint128 x5; + fiat_25519_uint128 x6; + fiat_25519_uint128 x7; + fiat_25519_uint128 x8; + fiat_25519_uint128 x9; + fiat_25519_uint128 x10; + fiat_25519_uint128 x11; + fiat_25519_uint128 x12; + fiat_25519_uint128 x13; + fiat_25519_uint128 x14; + fiat_25519_uint128 x15; + fiat_25519_uint128 x16; + fiat_25519_uint128 x17; + fiat_25519_uint128 x18; + fiat_25519_uint128 x19; + fiat_25519_uint128 x20; + fiat_25519_uint128 x21; + fiat_25519_uint128 x22; + fiat_25519_uint128 x23; + fiat_25519_uint128 x24; + fiat_25519_uint128 x25; + fiat_25519_uint128 x26; + uint64_t x27; + uint64_t x28; + fiat_25519_uint128 x29; + fiat_25519_uint128 x30; + fiat_25519_uint128 x31; + fiat_25519_uint128 x32; + fiat_25519_uint128 x33; + uint64_t x34; + uint64_t x35; + fiat_25519_uint128 x36; + uint64_t x37; + uint64_t x38; + fiat_25519_uint128 x39; + uint64_t x40; + uint64_t x41; + fiat_25519_uint128 x42; + uint64_t x43; + uint64_t x44; + uint64_t x45; + uint64_t x46; + uint64_t x47; + uint64_t x48; + uint64_t x49; + fiat_25519_uint1 x50; + uint64_t x51; + uint64_t x52; + x1 = ((fiat_25519_uint128)(arg1[4]) * ((arg2[4]) * UINT8_C(0x13))); + x2 = ((fiat_25519_uint128)(arg1[4]) * ((arg2[3]) * UINT8_C(0x13))); + x3 = ((fiat_25519_uint128)(arg1[4]) * ((arg2[2]) * UINT8_C(0x13))); + x4 = ((fiat_25519_uint128)(arg1[4]) * ((arg2[1]) * UINT8_C(0x13))); + x5 = ((fiat_25519_uint128)(arg1[3]) * ((arg2[4]) * UINT8_C(0x13))); + x6 = ((fiat_25519_uint128)(arg1[3]) * ((arg2[3]) * UINT8_C(0x13))); + x7 = ((fiat_25519_uint128)(arg1[3]) * ((arg2[2]) * UINT8_C(0x13))); + x8 = ((fiat_25519_uint128)(arg1[2]) * ((arg2[4]) * UINT8_C(0x13))); + x9 = ((fiat_25519_uint128)(arg1[2]) * ((arg2[3]) * UINT8_C(0x13))); + x10 = ((fiat_25519_uint128)(arg1[1]) * ((arg2[4]) * UINT8_C(0x13))); + x11 = ((fiat_25519_uint128)(arg1[4]) * (arg2[0])); + x12 = ((fiat_25519_uint128)(arg1[3]) * (arg2[1])); + x13 = ((fiat_25519_uint128)(arg1[3]) * (arg2[0])); + x14 = ((fiat_25519_uint128)(arg1[2]) * (arg2[2])); + x15 = ((fiat_25519_uint128)(arg1[2]) * (arg2[1])); + x16 = ((fiat_25519_uint128)(arg1[2]) * (arg2[0])); + x17 = ((fiat_25519_uint128)(arg1[1]) * (arg2[3])); + x18 = ((fiat_25519_uint128)(arg1[1]) * (arg2[2])); + x19 = ((fiat_25519_uint128)(arg1[1]) * (arg2[1])); + x20 = ((fiat_25519_uint128)(arg1[1]) * (arg2[0])); + x21 = ((fiat_25519_uint128)(arg1[0]) * (arg2[4])); + x22 = ((fiat_25519_uint128)(arg1[0]) * (arg2[3])); + x23 = ((fiat_25519_uint128)(arg1[0]) * (arg2[2])); + x24 = ((fiat_25519_uint128)(arg1[0]) * (arg2[1])); + x25 = ((fiat_25519_uint128)(arg1[0]) * (arg2[0])); + x26 = (x25 + (x10 + (x9 + (x7 + x4)))); + x27 = (uint64_t)(x26 >> 51); + x28 = (uint64_t)(x26 & UINT64_C(0x7ffffffffffff)); + x29 = (x21 + (x17 + (x14 + (x12 + x11)))); + x30 = (x22 + (x18 + (x15 + (x13 + x1)))); + x31 = (x23 + (x19 + (x16 + (x5 + x2)))); + x32 = (x24 + (x20 + (x8 + (x6 + x3)))); + x33 = (x27 + x32); + x34 = (uint64_t)(x33 >> 51); + x35 = (uint64_t)(x33 & UINT64_C(0x7ffffffffffff)); + x36 = (x34 + x31); + x37 = (uint64_t)(x36 >> 51); + x38 = (uint64_t)(x36 & UINT64_C(0x7ffffffffffff)); + x39 = (x37 + x30); + x40 = (uint64_t)(x39 >> 51); + x41 = (uint64_t)(x39 & UINT64_C(0x7ffffffffffff)); + x42 
= (x40 + x29); + x43 = (uint64_t)(x42 >> 51); + x44 = (uint64_t)(x42 & UINT64_C(0x7ffffffffffff)); + x45 = (x43 * UINT8_C(0x13)); + x46 = (x28 + x45); + x47 = (x46 >> 51); + x48 = (x46 & UINT64_C(0x7ffffffffffff)); + x49 = (x47 + x35); + x50 = (fiat_25519_uint1)(x49 >> 51); + x51 = (x49 & UINT64_C(0x7ffffffffffff)); + x52 = (x50 + x38); + out1[0] = x48; + out1[1] = x51; + out1[2] = x52; + out1[3] = x41; + out1[4] = x44; +} + +/* + * The function fiat_25519_carry_square squares a field element and reduces the result. + * + * Postconditions: + * eval out1 mod m = (eval arg1 * eval arg1) mod m + * + */ +static FIAT_25519_FIAT_INLINE void fiat_25519_carry_square(fiat_25519_tight_field_element out1, const fiat_25519_loose_field_element arg1) { + uint64_t x1; + uint64_t x2; + uint64_t x3; + uint64_t x4; + uint64_t x5; + uint64_t x6; + uint64_t x7; + uint64_t x8; + fiat_25519_uint128 x9; + fiat_25519_uint128 x10; + fiat_25519_uint128 x11; + fiat_25519_uint128 x12; + fiat_25519_uint128 x13; + fiat_25519_uint128 x14; + fiat_25519_uint128 x15; + fiat_25519_uint128 x16; + fiat_25519_uint128 x17; + fiat_25519_uint128 x18; + fiat_25519_uint128 x19; + fiat_25519_uint128 x20; + fiat_25519_uint128 x21; + fiat_25519_uint128 x22; + fiat_25519_uint128 x23; + fiat_25519_uint128 x24; + uint64_t x25; + uint64_t x26; + fiat_25519_uint128 x27; + fiat_25519_uint128 x28; + fiat_25519_uint128 x29; + fiat_25519_uint128 x30; + fiat_25519_uint128 x31; + uint64_t x32; + uint64_t x33; + fiat_25519_uint128 x34; + uint64_t x35; + uint64_t x36; + fiat_25519_uint128 x37; + uint64_t x38; + uint64_t x39; + fiat_25519_uint128 x40; + uint64_t x41; + uint64_t x42; + uint64_t x43; + uint64_t x44; + uint64_t x45; + uint64_t x46; + uint64_t x47; + fiat_25519_uint1 x48; + uint64_t x49; + uint64_t x50; + x1 = ((arg1[4]) * UINT8_C(0x13)); + x2 = (x1 * 0x2); + x3 = ((arg1[4]) * 0x2); + x4 = ((arg1[3]) * UINT8_C(0x13)); + x5 = (x4 * 0x2); + x6 = ((arg1[3]) * 0x2); + x7 = ((arg1[2]) * 0x2); + x8 = ((arg1[1]) * 0x2); + x9 = ((fiat_25519_uint128)(arg1[4]) * x1); + x10 = ((fiat_25519_uint128)(arg1[3]) * x2); + x11 = ((fiat_25519_uint128)(arg1[3]) * x4); + x12 = ((fiat_25519_uint128)(arg1[2]) * x2); + x13 = ((fiat_25519_uint128)(arg1[2]) * x5); + x14 = ((fiat_25519_uint128)(arg1[2]) * (arg1[2])); + x15 = ((fiat_25519_uint128)(arg1[1]) * x2); + x16 = ((fiat_25519_uint128)(arg1[1]) * x6); + x17 = ((fiat_25519_uint128)(arg1[1]) * x7); + x18 = ((fiat_25519_uint128)(arg1[1]) * (arg1[1])); + x19 = ((fiat_25519_uint128)(arg1[0]) * x3); + x20 = ((fiat_25519_uint128)(arg1[0]) * x6); + x21 = ((fiat_25519_uint128)(arg1[0]) * x7); + x22 = ((fiat_25519_uint128)(arg1[0]) * x8); + x23 = ((fiat_25519_uint128)(arg1[0]) * (arg1[0])); + x24 = (x23 + (x15 + x13)); + x25 = (uint64_t)(x24 >> 51); + x26 = (uint64_t)(x24 & UINT64_C(0x7ffffffffffff)); + x27 = (x19 + (x16 + x14)); + x28 = (x20 + (x17 + x9)); + x29 = (x21 + (x18 + x10)); + x30 = (x22 + (x12 + x11)); + x31 = (x25 + x30); + x32 = (uint64_t)(x31 >> 51); + x33 = (uint64_t)(x31 & UINT64_C(0x7ffffffffffff)); + x34 = (x32 + x29); + x35 = (uint64_t)(x34 >> 51); + x36 = (uint64_t)(x34 & UINT64_C(0x7ffffffffffff)); + x37 = (x35 + x28); + x38 = (uint64_t)(x37 >> 51); + x39 = (uint64_t)(x37 & UINT64_C(0x7ffffffffffff)); + x40 = (x38 + x27); + x41 = (uint64_t)(x40 >> 51); + x42 = (uint64_t)(x40 & UINT64_C(0x7ffffffffffff)); + x43 = (x41 * UINT8_C(0x13)); + x44 = (x26 + x43); + x45 = (x44 >> 51); + x46 = (x44 & UINT64_C(0x7ffffffffffff)); + x47 = (x45 + x33); + x48 = (fiat_25519_uint1)(x47 >> 51); + x49 = (x47 & 
UINT64_C(0x7ffffffffffff)); + x50 = (x48 + x36); + out1[0] = x46; + out1[1] = x49; + out1[2] = x50; + out1[3] = x39; + out1[4] = x42; +} + +/* + * The function fiat_25519_carry reduces a field element. + * + * Postconditions: + * eval out1 mod m = eval arg1 mod m + * + */ +static FIAT_25519_FIAT_INLINE void fiat_25519_carry(fiat_25519_tight_field_element out1, const fiat_25519_loose_field_element arg1) { + uint64_t x1; + uint64_t x2; + uint64_t x3; + uint64_t x4; + uint64_t x5; + uint64_t x6; + uint64_t x7; + uint64_t x8; + uint64_t x9; + uint64_t x10; + uint64_t x11; + uint64_t x12; + x1 = (arg1[0]); + x2 = ((x1 >> 51) + (arg1[1])); + x3 = ((x2 >> 51) + (arg1[2])); + x4 = ((x3 >> 51) + (arg1[3])); + x5 = ((x4 >> 51) + (arg1[4])); + x6 = ((x1 & UINT64_C(0x7ffffffffffff)) + ((x5 >> 51) * UINT8_C(0x13))); + x7 = ((fiat_25519_uint1)(x6 >> 51) + (x2 & UINT64_C(0x7ffffffffffff))); + x8 = (x6 & UINT64_C(0x7ffffffffffff)); + x9 = (x7 & UINT64_C(0x7ffffffffffff)); + x10 = ((fiat_25519_uint1)(x7 >> 51) + (x3 & UINT64_C(0x7ffffffffffff))); + x11 = (x4 & UINT64_C(0x7ffffffffffff)); + x12 = (x5 & UINT64_C(0x7ffffffffffff)); + out1[0] = x8; + out1[1] = x9; + out1[2] = x10; + out1[3] = x11; + out1[4] = x12; +} + +/* + * The function fiat_25519_add adds two field elements. + * + * Postconditions: + * eval out1 mod m = (eval arg1 + eval arg2) mod m + * + */ +static FIAT_25519_FIAT_INLINE void fiat_25519_add(fiat_25519_loose_field_element out1, const fiat_25519_tight_field_element arg1, const fiat_25519_tight_field_element arg2) { + uint64_t x1; + uint64_t x2; + uint64_t x3; + uint64_t x4; + uint64_t x5; + x1 = ((arg1[0]) + (arg2[0])); + x2 = ((arg1[1]) + (arg2[1])); + x3 = ((arg1[2]) + (arg2[2])); + x4 = ((arg1[3]) + (arg2[3])); + x5 = ((arg1[4]) + (arg2[4])); + out1[0] = x1; + out1[1] = x2; + out1[2] = x3; + out1[3] = x4; + out1[4] = x5; +} + +/* + * The function fiat_25519_sub subtracts two field elements. + * + * Postconditions: + * eval out1 mod m = (eval arg1 - eval arg2) mod m + * + */ +static FIAT_25519_FIAT_INLINE void fiat_25519_sub(fiat_25519_loose_field_element out1, const fiat_25519_tight_field_element arg1, const fiat_25519_tight_field_element arg2) { + uint64_t x1; + uint64_t x2; + uint64_t x3; + uint64_t x4; + uint64_t x5; + x1 = ((UINT64_C(0xfffffffffffda) + (arg1[0])) - (arg2[0])); + x2 = ((UINT64_C(0xffffffffffffe) + (arg1[1])) - (arg2[1])); + x3 = ((UINT64_C(0xffffffffffffe) + (arg1[2])) - (arg2[2])); + x4 = ((UINT64_C(0xffffffffffffe) + (arg1[3])) - (arg2[3])); + x5 = ((UINT64_C(0xffffffffffffe) + (arg1[4])) - (arg2[4])); + out1[0] = x1; + out1[1] = x2; + out1[2] = x3; + out1[3] = x4; + out1[4] = x5; +} + +/* + * The function fiat_25519_opp negates a field element. + * + * Postconditions: + * eval out1 mod m = -eval arg1 mod m + * + */ +static FIAT_25519_FIAT_INLINE void fiat_25519_opp(fiat_25519_loose_field_element out1, const fiat_25519_tight_field_element arg1) { + uint64_t x1; + uint64_t x2; + uint64_t x3; + uint64_t x4; + uint64_t x5; + x1 = (UINT64_C(0xfffffffffffda) - (arg1[0])); + x2 = (UINT64_C(0xffffffffffffe) - (arg1[1])); + x3 = (UINT64_C(0xffffffffffffe) - (arg1[2])); + x4 = (UINT64_C(0xffffffffffffe) - (arg1[3])); + x5 = (UINT64_C(0xffffffffffffe) - (arg1[4])); + out1[0] = x1; + out1[1] = x2; + out1[2] = x3; + out1[3] = x4; + out1[4] = x5; +} + +/* + * The function fiat_25519_to_bytes serializes a field element to bytes in little-endian order. 
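(Regarding the 64-bit sub/opp above, and again outside the diff proper: the constants they add are the "balance" values from the file header, a limb representation of 2*p, so no limb can underflow; a subsequent carry pass renormalizes the result. The wrapper below is an invented illustration, not part of the vendored code.)

/* Illustrative sketch only. */
#include <stdint.h>
#include "curve25519_64.h"

static void fe_sub_reduced(fiat_25519_tight_field_element out,
                           const fiat_25519_tight_field_element a,
                           const fiat_25519_tight_field_element b) {
  fiat_25519_loose_field_element t;
  fiat_25519_sub(t, a, b);   /* computes (2*p + a) - b limb-wise; never underflows */
  fiat_25519_carry(out, t);  /* reduce back to tight bounds */
}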
+ * + * Postconditions: + * out1 = map (λ x, ⌊((eval arg1 mod m) mod 2^(8 * (x + 1))) / 2^(8 * x)⌋) [0..31] + * + * Output Bounds: + * out1: [[0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0x7f]] + */ +static FIAT_25519_FIAT_INLINE void fiat_25519_to_bytes(uint8_t out1[32], const fiat_25519_tight_field_element arg1) { + uint64_t x1; + fiat_25519_uint1 x2; + uint64_t x3; + fiat_25519_uint1 x4; + uint64_t x5; + fiat_25519_uint1 x6; + uint64_t x7; + fiat_25519_uint1 x8; + uint64_t x9; + fiat_25519_uint1 x10; + uint64_t x11; + uint64_t x12; + fiat_25519_uint1 x13; + uint64_t x14; + fiat_25519_uint1 x15; + uint64_t x16; + fiat_25519_uint1 x17; + uint64_t x18; + fiat_25519_uint1 x19; + uint64_t x20; + fiat_25519_uint1 x21; + uint64_t x22; + uint64_t x23; + uint64_t x24; + uint64_t x25; + uint8_t x26; + uint64_t x27; + uint8_t x28; + uint64_t x29; + uint8_t x30; + uint64_t x31; + uint8_t x32; + uint64_t x33; + uint8_t x34; + uint64_t x35; + uint8_t x36; + uint8_t x37; + uint64_t x38; + uint8_t x39; + uint64_t x40; + uint8_t x41; + uint64_t x42; + uint8_t x43; + uint64_t x44; + uint8_t x45; + uint64_t x46; + uint8_t x47; + uint64_t x48; + uint8_t x49; + uint8_t x50; + uint64_t x51; + uint8_t x52; + uint64_t x53; + uint8_t x54; + uint64_t x55; + uint8_t x56; + uint64_t x57; + uint8_t x58; + uint64_t x59; + uint8_t x60; + uint64_t x61; + uint8_t x62; + uint64_t x63; + uint8_t x64; + fiat_25519_uint1 x65; + uint64_t x66; + uint8_t x67; + uint64_t x68; + uint8_t x69; + uint64_t x70; + uint8_t x71; + uint64_t x72; + uint8_t x73; + uint64_t x74; + uint8_t x75; + uint64_t x76; + uint8_t x77; + uint8_t x78; + uint64_t x79; + uint8_t x80; + uint64_t x81; + uint8_t x82; + uint64_t x83; + uint8_t x84; + uint64_t x85; + uint8_t x86; + uint64_t x87; + uint8_t x88; + uint64_t x89; + uint8_t x90; + uint8_t x91; + fiat_25519_subborrowx_u51(&x1, &x2, 0x0, (arg1[0]), UINT64_C(0x7ffffffffffed)); + fiat_25519_subborrowx_u51(&x3, &x4, x2, (arg1[1]), UINT64_C(0x7ffffffffffff)); + fiat_25519_subborrowx_u51(&x5, &x6, x4, (arg1[2]), UINT64_C(0x7ffffffffffff)); + fiat_25519_subborrowx_u51(&x7, &x8, x6, (arg1[3]), UINT64_C(0x7ffffffffffff)); + fiat_25519_subborrowx_u51(&x9, &x10, x8, (arg1[4]), UINT64_C(0x7ffffffffffff)); + fiat_25519_cmovznz_u64(&x11, x10, 0x0, UINT64_C(0xffffffffffffffff)); + fiat_25519_addcarryx_u51(&x12, &x13, 0x0, x1, (x11 & UINT64_C(0x7ffffffffffed))); + fiat_25519_addcarryx_u51(&x14, &x15, x13, x3, (x11 & UINT64_C(0x7ffffffffffff))); + fiat_25519_addcarryx_u51(&x16, &x17, x15, x5, (x11 & UINT64_C(0x7ffffffffffff))); + fiat_25519_addcarryx_u51(&x18, &x19, x17, x7, (x11 & UINT64_C(0x7ffffffffffff))); + fiat_25519_addcarryx_u51(&x20, &x21, x19, x9, (x11 & UINT64_C(0x7ffffffffffff))); + x22 = (x20 << 4); + x23 = (x18 * (uint64_t)0x2); + x24 = (x16 << 6); + x25 = (x14 << 3); + x26 = (uint8_t)(x12 & UINT8_C(0xff)); + x27 = (x12 >> 8); + x28 = (uint8_t)(x27 & UINT8_C(0xff)); + x29 = (x27 >> 8); + x30 = (uint8_t)(x29 & UINT8_C(0xff)); + x31 = (x29 >> 8); + x32 = (uint8_t)(x31 & UINT8_C(0xff)); + x33 = (x31 >> 8); + x34 = (uint8_t)(x33 & UINT8_C(0xff)); + x35 = (x33 >> 8); + x36 = 
(uint8_t)(x35 & UINT8_C(0xff)); + x37 = (uint8_t)(x35 >> 8); + x38 = (x25 + (uint64_t)x37); + x39 = (uint8_t)(x38 & UINT8_C(0xff)); + x40 = (x38 >> 8); + x41 = (uint8_t)(x40 & UINT8_C(0xff)); + x42 = (x40 >> 8); + x43 = (uint8_t)(x42 & UINT8_C(0xff)); + x44 = (x42 >> 8); + x45 = (uint8_t)(x44 & UINT8_C(0xff)); + x46 = (x44 >> 8); + x47 = (uint8_t)(x46 & UINT8_C(0xff)); + x48 = (x46 >> 8); + x49 = (uint8_t)(x48 & UINT8_C(0xff)); + x50 = (uint8_t)(x48 >> 8); + x51 = (x24 + (uint64_t)x50); + x52 = (uint8_t)(x51 & UINT8_C(0xff)); + x53 = (x51 >> 8); + x54 = (uint8_t)(x53 & UINT8_C(0xff)); + x55 = (x53 >> 8); + x56 = (uint8_t)(x55 & UINT8_C(0xff)); + x57 = (x55 >> 8); + x58 = (uint8_t)(x57 & UINT8_C(0xff)); + x59 = (x57 >> 8); + x60 = (uint8_t)(x59 & UINT8_C(0xff)); + x61 = (x59 >> 8); + x62 = (uint8_t)(x61 & UINT8_C(0xff)); + x63 = (x61 >> 8); + x64 = (uint8_t)(x63 & UINT8_C(0xff)); + x65 = (fiat_25519_uint1)(x63 >> 8); + x66 = (x23 + (uint64_t)x65); + x67 = (uint8_t)(x66 & UINT8_C(0xff)); + x68 = (x66 >> 8); + x69 = (uint8_t)(x68 & UINT8_C(0xff)); + x70 = (x68 >> 8); + x71 = (uint8_t)(x70 & UINT8_C(0xff)); + x72 = (x70 >> 8); + x73 = (uint8_t)(x72 & UINT8_C(0xff)); + x74 = (x72 >> 8); + x75 = (uint8_t)(x74 & UINT8_C(0xff)); + x76 = (x74 >> 8); + x77 = (uint8_t)(x76 & UINT8_C(0xff)); + x78 = (uint8_t)(x76 >> 8); + x79 = (x22 + (uint64_t)x78); + x80 = (uint8_t)(x79 & UINT8_C(0xff)); + x81 = (x79 >> 8); + x82 = (uint8_t)(x81 & UINT8_C(0xff)); + x83 = (x81 >> 8); + x84 = (uint8_t)(x83 & UINT8_C(0xff)); + x85 = (x83 >> 8); + x86 = (uint8_t)(x85 & UINT8_C(0xff)); + x87 = (x85 >> 8); + x88 = (uint8_t)(x87 & UINT8_C(0xff)); + x89 = (x87 >> 8); + x90 = (uint8_t)(x89 & UINT8_C(0xff)); + x91 = (uint8_t)(x89 >> 8); + out1[0] = x26; + out1[1] = x28; + out1[2] = x30; + out1[3] = x32; + out1[4] = x34; + out1[5] = x36; + out1[6] = x39; + out1[7] = x41; + out1[8] = x43; + out1[9] = x45; + out1[10] = x47; + out1[11] = x49; + out1[12] = x52; + out1[13] = x54; + out1[14] = x56; + out1[15] = x58; + out1[16] = x60; + out1[17] = x62; + out1[18] = x64; + out1[19] = x67; + out1[20] = x69; + out1[21] = x71; + out1[22] = x73; + out1[23] = x75; + out1[24] = x77; + out1[25] = x80; + out1[26] = x82; + out1[27] = x84; + out1[28] = x86; + out1[29] = x88; + out1[30] = x90; + out1[31] = x91; +} + +/* + * The function fiat_25519_from_bytes deserializes a field element from bytes in little-endian order. 
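(Note on fiat_25519_to_bytes above, not part of the diff: the subborrow/cmovznz/addcarry prologue conditionally subtracts p, so the output is the canonical, fully reduced 32-byte little-endian encoding. A tiny, illustrative check, assuming the header is available as "curve25519_64.h":)

/* Illustrative only: serialize the field element 2. */
#include <stdint.h>
#include <stdio.h>
#include "curve25519_64.h"

int main(void) {
  fiat_25519_tight_field_element two = {2, 0, 0, 0, 0};
  uint8_t bytes[32];
  fiat_25519_to_bytes(bytes, two);
  printf("%02x %02x ... %02x\n", bytes[0], bytes[1], bytes[31]); /* 02 00 ... 00 */
  return 0;
}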
+ * + * Postconditions: + * eval out1 mod m = bytes_eval arg1 mod m + * + * Input Bounds: + * arg1: [[0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0x7f]] + */ +static FIAT_25519_FIAT_INLINE void fiat_25519_from_bytes(fiat_25519_tight_field_element out1, const uint8_t arg1[32]) { + uint64_t x1; + uint64_t x2; + uint64_t x3; + uint64_t x4; + uint64_t x5; + uint64_t x6; + uint64_t x7; + uint64_t x8; + uint64_t x9; + uint64_t x10; + uint64_t x11; + uint64_t x12; + uint64_t x13; + uint64_t x14; + uint64_t x15; + uint64_t x16; + uint64_t x17; + uint64_t x18; + uint64_t x19; + uint64_t x20; + uint64_t x21; + uint64_t x22; + uint64_t x23; + uint64_t x24; + uint64_t x25; + uint64_t x26; + uint64_t x27; + uint64_t x28; + uint64_t x29; + uint64_t x30; + uint64_t x31; + uint8_t x32; + uint64_t x33; + uint64_t x34; + uint64_t x35; + uint64_t x36; + uint64_t x37; + uint64_t x38; + uint64_t x39; + uint8_t x40; + uint64_t x41; + uint64_t x42; + uint64_t x43; + uint64_t x44; + uint64_t x45; + uint64_t x46; + uint64_t x47; + uint8_t x48; + uint64_t x49; + uint64_t x50; + uint64_t x51; + uint64_t x52; + uint64_t x53; + uint64_t x54; + uint64_t x55; + uint64_t x56; + uint8_t x57; + uint64_t x58; + uint64_t x59; + uint64_t x60; + uint64_t x61; + uint64_t x62; + uint64_t x63; + uint64_t x64; + uint8_t x65; + uint64_t x66; + uint64_t x67; + uint64_t x68; + uint64_t x69; + uint64_t x70; + uint64_t x71; + x1 = ((uint64_t)(arg1[31]) << 44); + x2 = ((uint64_t)(arg1[30]) << 36); + x3 = ((uint64_t)(arg1[29]) << 28); + x4 = ((uint64_t)(arg1[28]) << 20); + x5 = ((uint64_t)(arg1[27]) << 12); + x6 = ((uint64_t)(arg1[26]) << 4); + x7 = ((uint64_t)(arg1[25]) << 47); + x8 = ((uint64_t)(arg1[24]) << 39); + x9 = ((uint64_t)(arg1[23]) << 31); + x10 = ((uint64_t)(arg1[22]) << 23); + x11 = ((uint64_t)(arg1[21]) << 15); + x12 = ((uint64_t)(arg1[20]) << 7); + x13 = ((uint64_t)(arg1[19]) << 50); + x14 = ((uint64_t)(arg1[18]) << 42); + x15 = ((uint64_t)(arg1[17]) << 34); + x16 = ((uint64_t)(arg1[16]) << 26); + x17 = ((uint64_t)(arg1[15]) << 18); + x18 = ((uint64_t)(arg1[14]) << 10); + x19 = ((uint64_t)(arg1[13]) << 2); + x20 = ((uint64_t)(arg1[12]) << 45); + x21 = ((uint64_t)(arg1[11]) << 37); + x22 = ((uint64_t)(arg1[10]) << 29); + x23 = ((uint64_t)(arg1[9]) << 21); + x24 = ((uint64_t)(arg1[8]) << 13); + x25 = ((uint64_t)(arg1[7]) << 5); + x26 = ((uint64_t)(arg1[6]) << 48); + x27 = ((uint64_t)(arg1[5]) << 40); + x28 = ((uint64_t)(arg1[4]) << 32); + x29 = ((uint64_t)(arg1[3]) << 24); + x30 = ((uint64_t)(arg1[2]) << 16); + x31 = ((uint64_t)(arg1[1]) << 8); + x32 = (arg1[0]); + x33 = (x31 + (uint64_t)x32); + x34 = (x30 + x33); + x35 = (x29 + x34); + x36 = (x28 + x35); + x37 = (x27 + x36); + x38 = (x26 + x37); + x39 = (x38 & UINT64_C(0x7ffffffffffff)); + x40 = (uint8_t)(x38 >> 51); + x41 = (x25 + (uint64_t)x40); + x42 = (x24 + x41); + x43 = (x23 + x42); + x44 = (x22 + x43); + x45 = (x21 + x44); + x46 = (x20 + x45); + x47 = (x46 & UINT64_C(0x7ffffffffffff)); + x48 = (uint8_t)(x46 >> 51); + x49 = (x19 + (uint64_t)x48); + x50 = (x18 + x49); + x51 = (x17 + x50); + x52 = (x16 + x51); + x53 = (x15 + x52); + x54 = (x14 + 
x53); + x55 = (x13 + x54); + x56 = (x55 & UINT64_C(0x7ffffffffffff)); + x57 = (uint8_t)(x55 >> 51); + x58 = (x12 + (uint64_t)x57); + x59 = (x11 + x58); + x60 = (x10 + x59); + x61 = (x9 + x60); + x62 = (x8 + x61); + x63 = (x7 + x62); + x64 = (x63 & UINT64_C(0x7ffffffffffff)); + x65 = (uint8_t)(x63 >> 51); + x66 = (x6 + (uint64_t)x65); + x67 = (x5 + x66); + x68 = (x4 + x67); + x69 = (x3 + x68); + x70 = (x2 + x69); + x71 = (x1 + x70); + out1[0] = x39; + out1[1] = x47; + out1[2] = x56; + out1[3] = x64; + out1[4] = x71; +} + +/* + * The function fiat_25519_carry_scmul_121666 multiplies a field element by 121666 and reduces the result. + * + * Postconditions: + * eval out1 mod m = (121666 * eval arg1) mod m + * + */ +static FIAT_25519_FIAT_INLINE void fiat_25519_carry_scmul_121666(fiat_25519_tight_field_element out1, const fiat_25519_loose_field_element arg1) { + fiat_25519_uint128 x1; + fiat_25519_uint128 x2; + fiat_25519_uint128 x3; + fiat_25519_uint128 x4; + fiat_25519_uint128 x5; + uint64_t x6; + uint64_t x7; + fiat_25519_uint128 x8; + uint64_t x9; + uint64_t x10; + fiat_25519_uint128 x11; + uint64_t x12; + uint64_t x13; + fiat_25519_uint128 x14; + uint64_t x15; + uint64_t x16; + fiat_25519_uint128 x17; + uint64_t x18; + uint64_t x19; + uint64_t x20; + uint64_t x21; + fiat_25519_uint1 x22; + uint64_t x23; + uint64_t x24; + fiat_25519_uint1 x25; + uint64_t x26; + uint64_t x27; + x1 = ((fiat_25519_uint128)UINT32_C(0x1db42) * (arg1[4])); + x2 = ((fiat_25519_uint128)UINT32_C(0x1db42) * (arg1[3])); + x3 = ((fiat_25519_uint128)UINT32_C(0x1db42) * (arg1[2])); + x4 = ((fiat_25519_uint128)UINT32_C(0x1db42) * (arg1[1])); + x5 = ((fiat_25519_uint128)UINT32_C(0x1db42) * (arg1[0])); + x6 = (uint64_t)(x5 >> 51); + x7 = (uint64_t)(x5 & UINT64_C(0x7ffffffffffff)); + x8 = (x6 + x4); + x9 = (uint64_t)(x8 >> 51); + x10 = (uint64_t)(x8 & UINT64_C(0x7ffffffffffff)); + x11 = (x9 + x3); + x12 = (uint64_t)(x11 >> 51); + x13 = (uint64_t)(x11 & UINT64_C(0x7ffffffffffff)); + x14 = (x12 + x2); + x15 = (uint64_t)(x14 >> 51); + x16 = (uint64_t)(x14 & UINT64_C(0x7ffffffffffff)); + x17 = (x15 + x1); + x18 = (uint64_t)(x17 >> 51); + x19 = (uint64_t)(x17 & UINT64_C(0x7ffffffffffff)); + x20 = (x18 * UINT8_C(0x13)); + x21 = (x7 + x20); + x22 = (fiat_25519_uint1)(x21 >> 51); + x23 = (x21 & UINT64_C(0x7ffffffffffff)); + x24 = (x22 + x10); + x25 = (fiat_25519_uint1)(x24 >> 51); + x26 = (x24 & UINT64_C(0x7ffffffffffff)); + x27 = (x25 + x13); + out1[0] = x23; + out1[1] = x26; + out1[2] = x27; + out1[3] = x16; + out1[4] = x19; +} diff --git a/ring-0.17.14/third_party/fiat/curve25519_64_adx.h b/ring-0.17.14/third_party/fiat/curve25519_64_adx.h new file mode 100644 index 0000000000..02e8ad9114 --- /dev/null +++ b/ring-0.17.14/third_party/fiat/curve25519_64_adx.h @@ -0,0 +1,695 @@ +#include +#include "../../crypto/internal.h" + +#include +#include +#include + +typedef uint64_t fe4[4]; +typedef uint8_t fiat_uint1; +typedef int8_t fiat_int1; + +static __inline__ uint64_t fiat_value_barrier_u64(uint64_t a) { + __asm__("" : "+r"(a) : /* no inputs */); + return a; +} + +__attribute__((target("adx,bmi2"))) +static inline void fe4_mul(fe4 out, const fe4 x, const fe4 y) { fiat_curve25519_adx_mul(out, x, y); } + +__attribute__((target("adx,bmi2"))) +static inline void fe4_sq(fe4 out, const fe4 x) { fiat_curve25519_adx_square(out, x); } + +/* + * The function fiat_mulx_u64 is a multiplication, returning the full double-width result. 
+ * + * Postconditions: + * out1 = (arg1 * arg2) mod 2^64 + * out2 = ⌊arg1 * arg2 / 2^64⌋ + * + * Input Bounds: + * arg1: [0x0 ~> 0xffffffffffffffff] + * arg2: [0x0 ~> 0xffffffffffffffff] + * Output Bounds: + * out1: [0x0 ~> 0xffffffffffffffff] + * out2: [0x0 ~> 0xffffffffffffffff] + */ +__attribute__((target("adx,bmi2"))) +static inline void fiat_mulx_u64(uint64_t* out1, uint64_t* out2, uint64_t arg1, uint64_t arg2) { +// NOTE: edited after generation +#if defined(_M_X64) + unsigned long long t; + *out1 = _umul128(arg1, arg2, &t); + *out2 = t; +#elif defined(_M_ARM64) + *out1 = arg1 * arg2; + *out2 = __umulh(arg1, arg2); +#else + unsigned __int128 t = (unsigned __int128)arg1 * arg2; + *out1 = t; + *out2 = (t >> 64); +#endif +} + +/* + * The function fiat_addcarryx_u64 is an addition with carry. + * + * Postconditions: + * out1 = (arg1 + arg2 + arg3) mod 2^64 + * out2 = ⌊(arg1 + arg2 + arg3) / 2^64⌋ + * + * Input Bounds: + * arg1: [0x0 ~> 0x1] + * arg2: [0x0 ~> 0xffffffffffffffff] + * arg3: [0x0 ~> 0xffffffffffffffff] + * Output Bounds: + * out1: [0x0 ~> 0xffffffffffffffff] + * out2: [0x0 ~> 0x1] + */ +__attribute__((target("adx,bmi2"))) +static inline void fiat_addcarryx_u64(uint64_t* out1, fiat_uint1* out2, fiat_uint1 arg1, uint64_t arg2, uint64_t arg3) { +// NOTE: edited after generation +#if defined(__has_builtin) +# if __has_builtin(__builtin_ia32_addcarryx_u64) +# define addcarry64 __builtin_ia32_addcarryx_u64 +# endif +#endif +#if defined(addcarry64) + long long unsigned int t; + *out2 = addcarry64(arg1, arg2, arg3, &t); + *out1 = t; +#elif defined(_M_X64) + long long unsigned int t; + *out2 = _addcarry_u64(arg1, arg2, arg3, out1); + *out1 = t; +#else + arg2 += arg1; + arg1 = arg2 < arg1; + uint64_t ret = arg2 + arg3; + arg1 += ret < arg2; + *out1 = ret; + *out2 = arg1; +#endif +#undef addcarry64 +} + +/* + * The function fiat_subborrowx_u64 is a subtraction with borrow. + * + * Postconditions: + * out1 = (-arg1 + arg2 + -arg3) mod 2^64 + * out2 = -⌊(-arg1 + arg2 + -arg3) / 2^64⌋ + * + * Input Bounds: + * arg1: [0x0 ~> 0x1] + * arg2: [0x0 ~> 0xffffffffffffffff] + * arg3: [0x0 ~> 0xffffffffffffffff] + * Output Bounds: + * out1: [0x0 ~> 0xffffffffffffffff] + * out2: [0x0 ~> 0x1] + */ +__attribute__((target("adx,bmi2"))) +static inline void fiat_subborrowx_u64(uint64_t* out1, fiat_uint1* out2, fiat_uint1 arg1, uint64_t arg2, uint64_t arg3) { +#if defined(__has_builtin) +# if __has_builtin(__builtin_ia32_subborrow_u64) +# define subborrow64 __builtin_ia32_subborrow_u64 +# endif +#endif +#if defined(subborrow64) + long long unsigned int t; + *out2 = subborrow64(arg1, arg2, arg3, &t); + *out1 = t; +#elif defined(_M_X64) + long long unsigned int t; + *out2 = _subborrow_u64(arg1, arg2, arg3, &t); // NOTE: edited after generation + *out1 = t; +#else + *out1 = arg2 - arg3 - arg1; + *out2 = (arg2 < arg3) | ((arg2 == arg3) & arg1); +#endif +#undef subborrow64 +} + +/* + * The function fiat_cmovznz_u64 is a single-word conditional move. 
+ * + * Postconditions: + * out1 = (if arg1 = 0 then arg2 else arg3) + * + * Input Bounds: + * arg1: [0x0 ~> 0x1] + * arg2: [0x0 ~> 0xffffffffffffffff] + * arg3: [0x0 ~> 0xffffffffffffffff] + * Output Bounds: + * out1: [0x0 ~> 0xffffffffffffffff] + */ +__attribute__((target("adx,bmi2"))) +static inline void fiat_cmovznz_u64(uint64_t* out1, fiat_uint1 arg1, uint64_t arg2, uint64_t arg3) { + fiat_uint1 x1; + uint64_t x2; + uint64_t x3; + x1 = (!(!arg1)); + x2 = ((fiat_int1)(0x0 - x1) & UINT64_C(0xffffffffffffffff)); + x3 = ((fiat_value_barrier_u64(x2) & arg3) | (fiat_value_barrier_u64((~x2)) & arg2)); + *out1 = x3; +} + +/* + * Input Bounds: + * arg1: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] + * arg2: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] + * Output Bounds: + * out1: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] + */ +__attribute__((target("adx,bmi2"))) +static void fe4_add(uint64_t out1[4], const uint64_t arg1[4], const uint64_t arg2[4]) { + uint64_t x1; + fiat_uint1 x2; + uint64_t x3; + fiat_uint1 x4; + uint64_t x5; + fiat_uint1 x6; + uint64_t x7; + fiat_uint1 x8; + uint64_t x9; + uint64_t x10; + fiat_uint1 x11; + uint64_t x12; + fiat_uint1 x13; + uint64_t x14; + fiat_uint1 x15; + uint64_t x16; + fiat_uint1 x17; + uint64_t x18; + uint64_t x19; + fiat_uint1 x20; + fiat_addcarryx_u64(&x1, &x2, 0x0, (arg1[0]), (arg2[0])); + fiat_addcarryx_u64(&x3, &x4, x2, (arg1[1]), (arg2[1])); + fiat_addcarryx_u64(&x5, &x6, x4, (arg1[2]), (arg2[2])); + fiat_addcarryx_u64(&x7, &x8, x6, (arg1[3]), (arg2[3])); + fiat_cmovznz_u64(&x9, x8, 0x0, UINT8_C(0x26)); // NOTE: clang 14 for Zen 2 uses sbb, and + fiat_addcarryx_u64(&x10, &x11, 0x0, x1, x9); + fiat_addcarryx_u64(&x12, &x13, x11, x3, 0x0); + fiat_addcarryx_u64(&x14, &x15, x13, x5, 0x0); + fiat_addcarryx_u64(&x16, &x17, x15, x7, 0x0); + fiat_cmovznz_u64(&x18, x17, 0x0, UINT8_C(0x26)); // NOTE: clang 14 for Zen 2 uses sbb, and + fiat_addcarryx_u64(&x19, &x20, 0x0, x10, x18); + out1[0] = x19; + out1[1] = x12; + out1[2] = x14; + out1[3] = x16; +} + +/* + * Input Bounds: + * arg1: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] + * arg2: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] + * Output Bounds: + * out1: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] + */ +__attribute__((target("adx,bmi2"))) +static void fe4_sub(uint64_t out1[4], const uint64_t arg1[4], const uint64_t arg2[4]) { + uint64_t x1; + uint64_t x2; + fiat_uint1 x3; + uint64_t x4; + uint64_t x5; + fiat_uint1 x6; + uint64_t x7; + uint64_t x8; + fiat_uint1 x9; + uint64_t x10; + uint64_t x11; + fiat_uint1 x12; + uint64_t x13; + uint64_t x14; + fiat_uint1 x15; + uint64_t x16; + fiat_uint1 x17; + uint64_t x18; + fiat_uint1 x19; + uint64_t x20; + fiat_uint1 x21; + uint64_t x22; + uint64_t x23; + fiat_uint1 x24; + x1 = (arg2[0]); + fiat_subborrowx_u64(&x2, &x3, 0x0, (arg1[0]), x1); + x4 = (arg2[1]); + fiat_subborrowx_u64(&x5, &x6, x3, (arg1[1]), x4); + x7 = (arg2[2]); + fiat_subborrowx_u64(&x8, &x9, x6, (arg1[2]), x7); + x10 = (arg2[3]); + fiat_subborrowx_u64(&x11, &x12, x9, (arg1[3]), x10); + fiat_cmovznz_u64(&x13, x12, 0x0, UINT8_C(0x26)); // NOTE: clang 14 for Zen 2 uses 
sbb, and + fiat_subborrowx_u64(&x14, &x15, 0x0, x2, x13); + fiat_subborrowx_u64(&x16, &x17, x15, x5, 0x0); + fiat_subborrowx_u64(&x18, &x19, x17, x8, 0x0); + fiat_subborrowx_u64(&x20, &x21, x19, x11, 0x0); + fiat_cmovznz_u64(&x22, x21, 0x0, UINT8_C(0x26)); // NOTE: clang 14 for Zen 2 uses sbb, and + fiat_subborrowx_u64(&x23, &x24, 0x0, x14, x22); + out1[0] = x23; + out1[1] = x16; + out1[2] = x18; + out1[3] = x20; +} + +/* + * Input Bounds: + * arg1: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] + * arg2: [0x0 ~> 0x3ffffffffffffff] // NOTE: this is not any uint64! + * Output Bounds: + * out1: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] + */ +__attribute__((target("adx,bmi2"))) +static void fe4_scmul(uint64_t out1[4], const uint64_t arg1[4], uint64_t arg2) { + uint64_t x1; + uint64_t x2; + uint64_t x3; + uint64_t x4; + uint64_t x5; + fiat_uint1 x6; + uint64_t x7; + uint64_t x8; + uint64_t x9; + fiat_uint1 x10; + uint64_t x11; + uint64_t x12; + uint64_t x13; + fiat_uint1 x14; + uint64_t x15; + uint64_t x16; + uint64_t x17; + fiat_uint1 x18; + uint64_t x19; + fiat_uint1 x20; + uint64_t x21; + fiat_uint1 x22; + uint64_t x23; + fiat_uint1 x24; + uint64_t x25; + uint64_t x26; + fiat_uint1 x27; + fiat_mulx_u64(&x1, &x2, (arg1[0]), arg2); + fiat_mulx_u64(&x3, &x4, (arg1[1]), arg2); + fiat_addcarryx_u64(&x5, &x6, 0x0, x2, x3); + fiat_mulx_u64(&x7, &x8, (arg1[2]), arg2); + fiat_addcarryx_u64(&x9, &x10, x6, x4, x7); + fiat_mulx_u64(&x11, &x12, (arg1[3]), arg2); + fiat_addcarryx_u64(&x13, &x14, x10, x8, x11); + fiat_mulx_u64(&x15, &x16, (x12 + (uint64_t)x14), UINT8_C(0x26)); + fiat_addcarryx_u64(&x17, &x18, 0x0, x1, x15); + fiat_addcarryx_u64(&x19, &x20, x18, x5, 0x0); + fiat_addcarryx_u64(&x21, &x22, x20, x9, 0x0); + fiat_addcarryx_u64(&x23, &x24, x22, x13, 0x0); + fiat_cmovznz_u64(&x25, x24, 0x0, UINT8_C(0x26)); // NOTE: clang 14 for Zen 2 uses sbb, and + fiat_addcarryx_u64(&x26, &x27, 0x0, x17, x25); + out1[0] = x26; + out1[1] = x19; + out1[2] = x21; + out1[3] = x23; +} + +/* + * Input Bounds: + * arg1: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] + * Output Bounds: + * out1: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] + */ +__attribute__((target("adx,bmi2"))) +static void fe4_canon(uint64_t out1[4], const uint64_t arg1[4]) { + uint64_t x1; + fiat_uint1 x2; + uint64_t x3; + fiat_uint1 x4; + uint64_t x5; + fiat_uint1 x6; + uint64_t x7; + fiat_uint1 x8; + uint64_t x9; + uint64_t x10; + uint64_t x11; + uint64_t x12; + uint64_t x13; + fiat_uint1 x14; + uint64_t x15; + fiat_uint1 x16; + uint64_t x17; + fiat_uint1 x18; + uint64_t x19; + fiat_uint1 x20; + uint64_t x21; + uint64_t x22; + uint64_t x23; + uint64_t x24; + fiat_subborrowx_u64(&x1, &x2, 0x0, (arg1[0]), UINT64_C(0xffffffffffffffed)); + fiat_subborrowx_u64(&x3, &x4, x2, (arg1[1]), UINT64_C(0xffffffffffffffff)); + fiat_subborrowx_u64(&x5, &x6, x4, (arg1[2]), UINT64_C(0xffffffffffffffff)); + fiat_subborrowx_u64(&x7, &x8, x6, (arg1[3]), UINT64_C(0x7fffffffffffffff)); + fiat_cmovznz_u64(&x9, x8, x1, (arg1[0])); + fiat_cmovznz_u64(&x10, x8, x3, (arg1[1])); + fiat_cmovznz_u64(&x11, x8, x5, (arg1[2])); + fiat_cmovznz_u64(&x12, x8, x7, (arg1[3])); + fiat_subborrowx_u64(&x13, &x14, 0x0, x9, UINT64_C(0xffffffffffffffed)); + fiat_subborrowx_u64(&x15, &x16, x14, x10, 
UINT64_C(0xffffffffffffffff)); + fiat_subborrowx_u64(&x17, &x18, x16, x11, UINT64_C(0xffffffffffffffff)); + fiat_subborrowx_u64(&x19, &x20, x18, x12, UINT64_C(0x7fffffffffffffff)); + fiat_cmovznz_u64(&x21, x20, x13, x9); + fiat_cmovznz_u64(&x22, x20, x15, x10); + fiat_cmovznz_u64(&x23, x20, x17, x11); + fiat_cmovznz_u64(&x24, x20, x19, x12); + out1[0] = x21; + out1[1] = x22; + out1[2] = x23; + out1[3] = x24; +} + +/* + * Input Bounds: + * arg1: [0x0 ~> 0x1] + * arg2: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] + * arg3: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] + * Output Bounds: + * out1: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] + * out2: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] + */ +__attribute__((target("adx,bmi2"))) +static void fe4_cswap(uint64_t out1[4], uint64_t out2[4], fiat_uint1 arg1, const uint64_t arg2[4], const uint64_t arg3[4]) { + uint64_t x1; + uint64_t x2; + uint64_t x3; + uint64_t x4; + uint64_t x5; + uint64_t x6; + uint64_t x7; + uint64_t x8; + // NOTE: clang 14 for Zen 2 uses YMM registers + fiat_cmovznz_u64(&x1, arg1, (arg2[0]), (arg3[0])); + fiat_cmovznz_u64(&x2, arg1, (arg2[1]), (arg3[1])); + fiat_cmovznz_u64(&x3, arg1, (arg2[2]), (arg3[2])); + fiat_cmovznz_u64(&x4, arg1, (arg2[3]), (arg3[3])); + fiat_cmovznz_u64(&x5, arg1, (arg3[0]), (arg2[0])); + fiat_cmovznz_u64(&x6, arg1, (arg3[1]), (arg2[1])); + fiat_cmovznz_u64(&x7, arg1, (arg3[2]), (arg2[2])); + fiat_cmovznz_u64(&x8, arg1, (arg3[3]), (arg2[3])); + out1[0] = x1; + out1[1] = x2; + out1[2] = x3; + out1[3] = x4; + out2[0] = x5; + out2[1] = x6; + out2[2] = x7; + out2[3] = x8; +} + +// The following functions are adaped from crypto/curve25519/curve25519.c +// It would be desirable to share the code, but with the current field +// implementations both 4-limb and 5-limb versions of the curve-level code need +// to be included in builds targetting an unknown variant of x86_64. 
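For review context, a hedged, standalone sketch (not part of this patch) of the two patterns the fe4 helpers above rely on: the branchless masked select behind fiat_cmovznz_u64 and fe4_cswap, and the reduction constant 0x26 = 38 = 2*19 that fe4_add, fe4_sub and fe4_scmul fold back into the low limb whenever a carry escapes the top word, since 2^255 ≡ 19 and hence 2^256 ≡ 38 (mod 2^255 - 19). The names select_u64 and cswap4 are illustrative only, and the sketch deliberately omits the inline-asm value barrier the real code uses to stop the compiler from turning the select back into a branch.

    /* Illustrative only; compiles and runs on its own, independent of this header. */
    #include <stdint.h>
    #include <stdio.h>

    /* Branchless single-word select: returns if_zero when cond == 0, else if_nonzero.
     * Same shape as fiat_cmovznz_u64, minus the value barrier. */
    static uint64_t select_u64(uint64_t cond, uint64_t if_zero, uint64_t if_nonzero) {
      uint64_t mask = (uint64_t)0 - (uint64_t)(cond != 0);  /* 0 or all-ones */
      return (mask & if_nonzero) | (~mask & if_zero);
    }

    /* Branchless conditional swap of two 4-limb values, mirroring fe4_cswap:
     * when swap == 0 the arrays are unchanged, otherwise their contents trade places,
     * with no data-dependent branches. */
    static void cswap4(uint64_t a[4], uint64_t b[4], uint64_t swap) {
      for (int i = 0; i < 4; i++) {
        uint64_t x = select_u64(swap, a[i], b[i]);
        uint64_t y = select_u64(swap, b[i], a[i]);
        a[i] = x;
        b[i] = y;
      }
    }

    int main(void) {
      uint64_t a[4] = {1, 2, 3, 4}, b[4] = {5, 6, 7, 8};
      cswap4(a, b, 1);
      printf("%llu %llu\n", (unsigned long long)select_u64(0, 11, 22),
             (unsigned long long)a[0]);  /* prints: 11 5 */
      return 0;
    }

In the patch itself the same select is what makes the Montgomery-ladder cswap calls in x25519_scalar_mult_adx constant time, and the carry-times-38 folding is why the fe4 functions can keep results in four saturated 64-bit limbs without a full modular reduction on every operation.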
+ +__attribute__((target("adx,bmi2"))) +static void fe4_invert(fe4 out, const fe4 z) { + fe4 t0; + fe4 t1; + fe4 t2; + fe4 t3; + int i; + + fe4_sq(t0, z); + fe4_sq(t1, t0); + for (i = 1; i < 2; ++i) { + fe4_sq(t1, t1); + } + fe4_mul(t1, z, t1); + fe4_mul(t0, t0, t1); + fe4_sq(t2, t0); + fe4_mul(t1, t1, t2); + fe4_sq(t2, t1); + for (i = 1; i < 5; ++i) { + fe4_sq(t2, t2); + } + fe4_mul(t1, t2, t1); + fe4_sq(t2, t1); + for (i = 1; i < 10; ++i) { + fe4_sq(t2, t2); + } + fe4_mul(t2, t2, t1); + fe4_sq(t3, t2); + for (i = 1; i < 20; ++i) { + fe4_sq(t3, t3); + } + fe4_mul(t2, t3, t2); + fe4_sq(t2, t2); + for (i = 1; i < 10; ++i) { + fe4_sq(t2, t2); + } + fe4_mul(t1, t2, t1); + fe4_sq(t2, t1); + for (i = 1; i < 50; ++i) { + fe4_sq(t2, t2); + } + fe4_mul(t2, t2, t1); + fe4_sq(t3, t2); + for (i = 1; i < 100; ++i) { + fe4_sq(t3, t3); + } + fe4_mul(t2, t3, t2); + fe4_sq(t2, t2); + for (i = 1; i < 50; ++i) { + fe4_sq(t2, t2); + } + fe4_mul(t1, t2, t1); + fe4_sq(t1, t1); + for (i = 1; i < 5; ++i) { + fe4_sq(t1, t1); + } + fe4_mul(out, t1, t0); +} + +RING_NOINLINE // https://github.com/rust-lang/rust/issues/116573 +__attribute__((target("adx,bmi2"))) +void x25519_scalar_mult_adx(uint8_t out[32], const uint8_t scalar[32], + const uint8_t point[32]) { + uint8_t e[32]; + OPENSSL_memcpy(e, scalar, 32); + e[0] &= 248; + e[31] &= 127; + e[31] |= 64; + + // The following implementation was transcribed to Coq and proven to + // correspond to unary scalar multiplication in affine coordinates given that + // x1 != 0 is the x coordinate of some point on the curve. It was also checked + // in Coq that doing a ladderstep with x1 = x3 = 0 gives z2' = z3' = 0, and z2 + // = z3 = 0 gives z2' = z3' = 0. The statement was quantified over the + // underlying field, so it applies to Curve25519 itself and the quadratic + // twist of Curve25519. It was not proven in Coq that prime-field arithmetic + // correctly simulates extension-field arithmetic on prime-field values. + // The decoding of the byte array representation of e was not considered. 
+ // Specification of Montgomery curves in affine coordinates: + // + // Proof that these form a group that is isomorphic to a Weierstrass curve: + // + // Coq transcription and correctness proof of the loop (where scalarbits=255): + // + // + // preconditions: 0 <= e < 2^255 (not necessarily e < order), fe_invert(0) = 0 + fe4 x1, x2 = {1}, z2 = {0}, x3, z3 = {1}, tmp0, tmp1; + OPENSSL_memcpy(x1, point, sizeof(fe4)); + x1[3] &= (uint64_t)(-1)>>1; + OPENSSL_memcpy(x3, x1, sizeof(fe4)); + + unsigned swap = 0; + int pos; + for (pos = 254; pos >= 0; --pos) { + // loop invariant as of right before the test, for the case where x1 != 0: + // pos >= -1; if z2 = 0 then x2 is nonzero; if z3 = 0 then x3 is nonzero + // let r := e >> (pos+1) in the following equalities of projective points: + // to_xz (r*P) === if swap then (x3, z3) else (x2, z2) + // to_xz ((r+1)*P) === if swap then (x2, z2) else (x3, z3) + // x1 is the nonzero x coordinate of the nonzero point (r*P-(r+1)*P) + unsigned b = 1 & (e[pos / 8] >> (pos & 7)); + swap ^= b; + fe4_cswap(x2, x3, swap, x2, x3); + fe4_cswap(z2, z3, swap, z2, z3); + swap = b; + // Coq transcription of ladderstep formula (called from transcribed loop): + // + // + // x1 != 0 + // x1 = 0 + fe4_sub(tmp0, x3, z3); + fe4_sub(tmp1, x2, z2); + fe4_add(x2, x2, z2); + fe4_add(z2, x3, z3); + fe4_mul(z3, tmp0, x2); + fe4_mul(z2, z2, tmp1); + fe4_sq(tmp0, tmp1); + fe4_sq(tmp1, x2); + fe4_add(x3, z3, z2); + fe4_sub(z2, z3, z2); + fe4_mul(x2, tmp1, tmp0); + fe4_sub(tmp1, tmp1, tmp0); + fe4_sq(z2, z2); + fe4_scmul(z3, tmp1, 121666); + fe4_sq(x3, x3); + fe4_add(tmp0, tmp0, z3); + fe4_mul(z3, x1, z2); + fe4_mul(z2, tmp1, tmp0); + } + // here pos=-1, so r=e, so to_xz (e*P) === if swap then (x3, z3) else (x2, z2) + fe4_cswap(x2, x3, swap, x2, x3); + fe4_cswap(z2, z3, swap, z2, z3); + + fe4_invert(z2, z2); + fe4_mul(x2, x2, z2); + fe4_canon(x2, x2); + OPENSSL_memcpy(out, x2, sizeof(fe4)); +} + +typedef struct { + fe4 X; + fe4 Y; + fe4 Z; + fe4 T; +} ge_p3_4; + +typedef struct { + fe4 yplusx; + fe4 yminusx; + fe4 xy2d; +} ge_precomp_4; + +__attribute__((target("adx,bmi2"))) +static void inline_x25519_ge_dbl_4(ge_p3_4 *r, const ge_p3_4 *p, bool skip_t) { + // Transcribed from a Coq function proven against affine coordinates. + // https://github.com/mit-plv/fiat-crypto/blob/9943ba9e7d8f3e1c0054b2c94a5edca46ea73ef8/src/Curves/Edwards/XYZT/Basic.v#L136-L165 + fe4 trX, trZ, trT, t0, cX, cY, cZ, cT; + fe4_sq(trX, p->X); + fe4_sq(trZ, p->Y); + fe4_sq(trT, p->Z); + fe4_add(trT, trT, trT); + fe4_add(cY, p->X, p->Y); + fe4_sq(t0, cY); + fe4_add(cY, trZ, trX); + fe4_sub(cZ, trZ, trX); + fe4_sub(cX, t0, cY); + fe4_sub(cT, trT, cZ); + fe4_mul(r->X, cX, cT); + fe4_mul(r->Y, cY, cZ); + fe4_mul(r->Z, cZ, cT); + if (!skip_t) { + fe4_mul(r->T, cX, cY); + } +} + +__attribute__((target("adx,bmi2"))) +__attribute__((always_inline)) // 4% speedup with clang14 and zen2 +static inline void +ge_p3_add_p3_precomp_4(ge_p3_4 *r, const ge_p3_4 *p, const ge_precomp_4 *q) { + fe4 A, B, C, YplusX, YminusX, D, X3, Y3, Z3, T3; + // Transcribed from a Coq function proven against affine coordinates. 
+ // https://github.com/mit-plv/fiat-crypto/blob/a36568d1d73aff5d7accc79fd28be672882f9c17/src/Curves/Edwards/XYZT/Precomputed.v#L38-L56 + fe4_add(YplusX, p->Y, p->X); + fe4_sub(YminusX, p->Y, p->X); + fe4_mul(A, YplusX, q->yplusx); + fe4_mul(B, YminusX, q->yminusx); + fe4_mul(C, q->xy2d, p->T); + fe4_add(D, p->Z, p->Z); + fe4_sub(X3, A, B); + fe4_add(Y3, A, B); + fe4_add(Z3, D, C); + fe4_sub(T3, D, C); + fe4_mul(r->X, X3, T3); + fe4_mul(r->Y, Y3, Z3); + fe4_mul(r->Z, Z3, T3); + fe4_mul(r->T, X3, Y3); +} + +__attribute__((always_inline)) // 25% speedup with clang14 and zen2 +static inline void table_select_4(ge_precomp_4 *t, const int pos, + const signed char b) { + uint8_t bnegative = constant_time_msb_w(b); + uint8_t babs = b - ((bnegative & b) << 1); + + uint8_t t_bytes[3][32] = { + {constant_time_is_zero_w(b) & 1}, {constant_time_is_zero_w(b) & 1}, {0}}; +#if defined(__clang__) + __asm__("" : "+m" (t_bytes) : /*no inputs*/); +#endif + OPENSSL_STATIC_ASSERT(sizeof(t_bytes) == sizeof(k25519Precomp[pos][0]), ""); + for (int i = 0; i < 8; i++) { + constant_time_conditional_memxor(t_bytes, k25519Precomp[pos][i], + sizeof(t_bytes), + constant_time_eq_w(babs, 1 + i)); + } + + OPENSSL_STATIC_ASSERT(sizeof(t_bytes) == sizeof(ge_precomp_4), ""); + + // fe4 uses saturated 64-bit limbs, so converting from bytes is just a copy. + OPENSSL_memcpy(t, t_bytes, sizeof(ge_precomp_4)); + + fe4 xy2d_neg = {0}; + fe4_sub(xy2d_neg, xy2d_neg, t->xy2d); + constant_time_conditional_memcpy(t->yplusx, t_bytes[1], sizeof(fe4), + bnegative); + constant_time_conditional_memcpy(t->yminusx, t_bytes[0], sizeof(fe4), + bnegative); + constant_time_conditional_memcpy(t->xy2d, xy2d_neg, sizeof(fe4), bnegative); +} + +// h = a * B +// where a = a[0]+256*a[1]+...+256^31 a[31] +// B is the Ed25519 base point (x,4/5) with x positive. +// +// Preconditions: +// a[31] <= 127 +RING_NOINLINE // https://github.com/rust-lang/rust/issues/116573 +__attribute__((target("adx,bmi2"))) +void x25519_ge_scalarmult_base_adx(uint8_t h[4][32], const uint8_t a[32]) { + signed char e[64]; + signed char carry; + + for (unsigned i = 0; i < 32; ++i) { + e[2 * i + 0] = (a[i] >> 0) & 15; + e[2 * i + 1] = (a[i] >> 4) & 15; + } + // each e[i] is between 0 and 15 + // e[63] is between 0 and 7 + + carry = 0; + for (unsigned i = 0; i < 63; ++i) { + e[i] += carry; + carry = e[i] + 8; + carry >>= 4; + e[i] -= carry << 4; + } + e[63] += carry; + // each e[i] is between -8 and 8 + + ge_p3_4 r = {{0}, {1}, {1}, {0}}; + for (unsigned i = 1; i < 64; i += 2) { + ge_precomp_4 t; + table_select_4(&t, i / 2, e[i]); + ge_p3_add_p3_precomp_4(&r, &r, &t); + } + + inline_x25519_ge_dbl_4(&r, &r, /*skip_t=*/true); + inline_x25519_ge_dbl_4(&r, &r, /*skip_t=*/true); + inline_x25519_ge_dbl_4(&r, &r, /*skip_t=*/true); + inline_x25519_ge_dbl_4(&r, &r, /*skip_t=*/false); + + for (unsigned i = 0; i < 64; i += 2) { + ge_precomp_4 t; + table_select_4(&t, i / 2, e[i]); + ge_p3_add_p3_precomp_4(&r, &r, &t); + } + + // fe4 uses saturated 64-bit limbs, so converting to bytes is just a copy. 
+ // Satisfy stated precondition of fiat_25519_from_bytes; tests pass either way + fe4_canon(r.X, r.X); + fe4_canon(r.Y, r.Y); + fe4_canon(r.Z, r.Z); + fe4_canon(r.T, r.T); + OPENSSL_STATIC_ASSERT(sizeof(ge_p3_4) == sizeof(uint8_t[4][32]), ""); + OPENSSL_memcpy(h, &r, sizeof(ge_p3_4)); +} diff --git a/ring-0.17.14/third_party/fiat/curve25519_64_msvc.h b/ring-0.17.14/third_party/fiat/curve25519_64_msvc.h new file mode 100644 index 0000000000..4b916e42e1 --- /dev/null +++ b/ring-0.17.14/third_party/fiat/curve25519_64_msvc.h @@ -0,0 +1,1225 @@ +/* Autogenerated: 'src/ExtractionOCaml/unsaturated_solinas' --inline --static --use-value-barrier --no-wide-int 25519 64 '(auto)' '2^255 - 19' carry_mul carry_square carry add sub opp selectznz to_bytes from_bytes relax carry_scmul121666 */ +/* curve description: 25519 */ +/* machine_wordsize = 64 (from "64") */ +/* requested operations: carry_mul, carry_square, carry, add, sub, opp, selectznz, to_bytes, from_bytes, relax, carry_scmul121666 */ +/* n = 5 (from "(auto)") */ +/* s-c = 2^255 - [(1, 19)] (from "2^255 - 19") */ +/* tight_bounds_multiplier = 1 (from "") */ +/* */ +/* Computed values: */ +/* carry_chain = [0, 1, 2, 3, 4, 0, 1] */ +/* eval z = z[0] + (z[1] << 51) + (z[2] << 102) + (z[3] << 153) + (z[4] << 204) */ +/* bytes_eval z = z[0] + (z[1] << 8) + (z[2] << 16) + (z[3] << 24) + (z[4] << 32) + (z[5] << 40) + (z[6] << 48) + (z[7] << 56) + (z[8] << 64) + (z[9] << 72) + (z[10] << 80) + (z[11] << 88) + (z[12] << 96) + (z[13] << 104) + (z[14] << 112) + (z[15] << 120) + (z[16] << 128) + (z[17] << 136) + (z[18] << 144) + (z[19] << 152) + (z[20] << 160) + (z[21] << 168) + (z[22] << 176) + (z[23] << 184) + (z[24] << 192) + (z[25] << 200) + (z[26] << 208) + (z[27] << 216) + (z[28] << 224) + (z[29] << 232) + (z[30] << 240) + (z[31] << 248) */ +/* balance = [0xfffffffffffda, 0xffffffffffffe, 0xffffffffffffe, 0xffffffffffffe, 0xffffffffffffe] */ + +#include +#include +#if defined(_M_X64) +#include +#endif + +typedef unsigned char fiat_25519_uint1; +typedef signed char fiat_25519_int1; + +#define FIAT_25519_FIAT_INLINE inline + +/* The type fiat_25519_loose_field_element is a field element with loose bounds. */ +/* Bounds: [[0x0 ~> 0x18000000000000], [0x0 ~> 0x18000000000000], [0x0 ~> 0x18000000000000], [0x0 ~> 0x18000000000000], [0x0 ~> 0x18000000000000]] */ +typedef uint64_t fiat_25519_loose_field_element[5]; + +/* The type fiat_25519_tight_field_element is a field element with tight bounds. */ +/* Bounds: [[0x0 ~> 0x8000000000000], [0x0 ~> 0x8000000000000], [0x0 ~> 0x8000000000000], [0x0 ~> 0x8000000000000], [0x0 ~> 0x8000000000000]] */ +typedef uint64_t fiat_25519_tight_field_element[5]; + +#if (-1 & 3) != 3 +#error "This code only works on a two's complement system" +#endif + +#define fiat_25519_value_barrier_u64(x) (x) + +/* + * The function fiat_25519_addcarryx_u64 is an addition with carry. 
+ * + * Postconditions: + * out1 = (arg1 + arg2 + arg3) mod 2^64 + * out2 = ⌊(arg1 + arg2 + arg3) / 2^64⌋ + * + * Input Bounds: + * arg1: [0x0 ~> 0x1] + * arg2: [0x0 ~> 0xffffffffffffffff] + * arg3: [0x0 ~> 0xffffffffffffffff] + * Output Bounds: + * out1: [0x0 ~> 0xffffffffffffffff] + * out2: [0x0 ~> 0x1] + */ +static FIAT_25519_FIAT_INLINE void fiat_25519_addcarryx_u64(uint64_t* out1, fiat_25519_uint1* out2, fiat_25519_uint1 arg1, uint64_t arg2, uint64_t arg3) { +// NOTE: edited after generation +#if defined(_M_X64) + *out2 = _addcarry_u64(arg1, arg2, arg3, out1); +#else + arg2 += arg1; + arg1 = arg2 < arg1; + arg3 += arg2; + arg1 += arg3 < arg2; + *out1 = arg3; + *out2 = arg1; +#endif +} + +/* + * The function fiat_25519_subborrowx_u64 is a subtraction with borrow. + * + * Postconditions: + * out1 = (-arg1 + arg2 + -arg3) mod 2^64 + * out2 = -⌊(-arg1 + arg2 + -arg3) / 2^64⌋ + * + * Input Bounds: + * arg1: [0x0 ~> 0x1] + * arg2: [0x0 ~> 0xffffffffffffffff] + * arg3: [0x0 ~> 0xffffffffffffffff] + * Output Bounds: + * out1: [0x0 ~> 0xffffffffffffffff] + * out2: [0x0 ~> 0x1] + */ +static FIAT_25519_FIAT_INLINE void fiat_25519_subborrowx_u64(uint64_t* out1, fiat_25519_uint1* out2, fiat_25519_uint1 arg1, uint64_t arg2, uint64_t arg3) { +#if defined(_M_X64) + *out2 = _subborrow_u64(arg1, arg2, arg3, out1); // NOTE: edited after generation +#else + *out1 = arg2 - arg3 - arg1; + *out2 = (arg2 < arg3) | ((arg2 == arg3) & arg1); +#endif +} + +/* + * The function fiat_25519_addcarryx_u51 is an addition with carry. + * + * Postconditions: + * out1 = (arg1 + arg2 + arg3) mod 2^51 + * out2 = ⌊(arg1 + arg2 + arg3) / 2^51⌋ + * + * Input Bounds: + * arg1: [0x0 ~> 0x1] + * arg2: [0x0 ~> 0x7ffffffffffff] + * arg3: [0x0 ~> 0x7ffffffffffff] + * Output Bounds: + * out1: [0x0 ~> 0x7ffffffffffff] + * out2: [0x0 ~> 0x1] + */ +static FIAT_25519_FIAT_INLINE void fiat_25519_addcarryx_u51(uint64_t* out1, fiat_25519_uint1* out2, fiat_25519_uint1 arg1, uint64_t arg2, uint64_t arg3) { + uint64_t x1; + uint64_t x2; + fiat_25519_uint1 x3; + x1 = ((arg1 + arg2) + arg3); + x2 = (x1 & UINT64_C(0x7ffffffffffff)); + x3 = (fiat_25519_uint1)(x1 >> 51); + *out1 = x2; + *out2 = x3; +} + +/* + * The function fiat_25519_subborrowx_u51 is a subtraction with borrow. + * + * Postconditions: + * out1 = (-arg1 + arg2 + -arg3) mod 2^51 + * out2 = -⌊(-arg1 + arg2 + -arg3) / 2^51⌋ + * + * Input Bounds: + * arg1: [0x0 ~> 0x1] + * arg2: [0x0 ~> 0x7ffffffffffff] + * arg3: [0x0 ~> 0x7ffffffffffff] + * Output Bounds: + * out1: [0x0 ~> 0x7ffffffffffff] + * out2: [0x0 ~> 0x1] + */ +static FIAT_25519_FIAT_INLINE void fiat_25519_subborrowx_u51(uint64_t* out1, fiat_25519_uint1* out2, fiat_25519_uint1 arg1, uint64_t arg2, uint64_t arg3) { + int64_t x1; + fiat_25519_int1 x2; + uint64_t x3; + x1 = ((int64_t)(arg2 - (int64_t)arg1) - (int64_t)arg3); + x2 = (fiat_25519_int1)(x1 >> 51); + x3 = (x1 & UINT64_C(0x7ffffffffffff)); + *out1 = x3; + *out2 = (fiat_25519_uint1)(0x0 - x2); +} + +/* + * The function fiat_25519_mulx_u64 is a multiplication, returning the full double-width result. 
+ * + * Postconditions: + * out1 = (arg1 * arg2) mod 2^64 + * out2 = ⌊arg1 * arg2 / 2^64⌋ + * + * Input Bounds: + * arg1: [0x0 ~> 0xffffffffffffffff] + * arg2: [0x0 ~> 0xffffffffffffffff] + * Output Bounds: + * out1: [0x0 ~> 0xffffffffffffffff] + * out2: [0x0 ~> 0xffffffffffffffff] + */ +static FIAT_25519_FIAT_INLINE void fiat_25519_mulx_u64(uint64_t* out1, uint64_t* out2, uint64_t arg1, uint64_t arg2) { +// NOTE: edited after generation +#if defined(_M_X64) + *out1 = _umul128(arg1, arg2, out2); +#elif defined(_M_ARM64) + *out1 = arg1 * arg2; + *out2 = __umulh(arg1, arg2); +#else +#error "This file is intended for MSVC on X64 or ARM64" +#endif +} + +/* + * The function fiat_25519_cmovznz_u64 is a single-word conditional move. + * + * Postconditions: + * out1 = (if arg1 = 0 then arg2 else arg3) + * + * Input Bounds: + * arg1: [0x0 ~> 0x1] + * arg2: [0x0 ~> 0xffffffffffffffff] + * arg3: [0x0 ~> 0xffffffffffffffff] + * Output Bounds: + * out1: [0x0 ~> 0xffffffffffffffff] + */ +static FIAT_25519_FIAT_INLINE void fiat_25519_cmovznz_u64(uint64_t* out1, fiat_25519_uint1 arg1, uint64_t arg2, uint64_t arg3) { + fiat_25519_uint1 x1; + uint64_t x2; + uint64_t x3; + x1 = (!(!arg1)); + x2 = ((fiat_25519_int1)(0x0 - x1) & UINT64_C(0xffffffffffffffff)); + x3 = ((fiat_25519_value_barrier_u64(x2) & arg3) | (fiat_25519_value_barrier_u64((~x2)) & arg2)); + *out1 = x3; +} + +/* + * The function fiat_25519_carry_mul multiplies two field elements and reduces the result. + * + * Postconditions: + * eval out1 mod m = (eval arg1 * eval arg2) mod m + * + */ +static FIAT_25519_FIAT_INLINE void fiat_25519_carry_mul(fiat_25519_tight_field_element out1, const fiat_25519_loose_field_element arg1, const fiat_25519_loose_field_element arg2) { + uint64_t x1; + uint64_t x2; + uint64_t x3; + uint64_t x4; + uint64_t x5; + uint64_t x6; + uint64_t x7; + uint64_t x8; + uint64_t x9; + uint64_t x10; + uint64_t x11; + uint64_t x12; + uint64_t x13; + uint64_t x14; + uint64_t x15; + uint64_t x16; + uint64_t x17; + uint64_t x18; + uint64_t x19; + uint64_t x20; + uint64_t x21; + uint64_t x22; + uint64_t x23; + uint64_t x24; + uint64_t x25; + uint64_t x26; + uint64_t x27; + uint64_t x28; + uint64_t x29; + uint64_t x30; + uint64_t x31; + uint64_t x32; + uint64_t x33; + uint64_t x34; + uint64_t x35; + uint64_t x36; + uint64_t x37; + uint64_t x38; + uint64_t x39; + uint64_t x40; + uint64_t x41; + uint64_t x42; + uint64_t x43; + uint64_t x44; + uint64_t x45; + uint64_t x46; + uint64_t x47; + uint64_t x48; + uint64_t x49; + uint64_t x50; + uint64_t x51; + fiat_25519_uint1 x52; + uint64_t x53; + fiat_25519_uint1 x54; + uint64_t x55; + fiat_25519_uint1 x56; + uint64_t x57; + fiat_25519_uint1 x58; + uint64_t x59; + fiat_25519_uint1 x60; + uint64_t x61; + fiat_25519_uint1 x62; + uint64_t x63; + fiat_25519_uint1 x64; + uint64_t x65; + fiat_25519_uint1 x66; + uint64_t x67; + uint64_t x68; + uint64_t x69; + fiat_25519_uint1 x70; + uint64_t x71; + fiat_25519_uint1 x72; + uint64_t x73; + fiat_25519_uint1 x74; + uint64_t x75; + fiat_25519_uint1 x76; + uint64_t x77; + fiat_25519_uint1 x78; + uint64_t x79; + fiat_25519_uint1 x80; + uint64_t x81; + fiat_25519_uint1 x82; + uint64_t x83; + fiat_25519_uint1 x84; + uint64_t x85; + fiat_25519_uint1 x86; + uint64_t x87; + fiat_25519_uint1 x88; + uint64_t x89; + fiat_25519_uint1 x90; + uint64_t x91; + fiat_25519_uint1 x92; + uint64_t x93; + fiat_25519_uint1 x94; + uint64_t x95; + fiat_25519_uint1 x96; + uint64_t x97; + fiat_25519_uint1 x98; + uint64_t x99; + fiat_25519_uint1 x100; + uint64_t x101; + 
fiat_25519_uint1 x102; + uint64_t x103; + fiat_25519_uint1 x104; + uint64_t x105; + fiat_25519_uint1 x106; + uint64_t x107; + fiat_25519_uint1 x108; + uint64_t x109; + fiat_25519_uint1 x110; + uint64_t x111; + fiat_25519_uint1 x112; + uint64_t x113; + fiat_25519_uint1 x114; + uint64_t x115; + fiat_25519_uint1 x116; + uint64_t x117; + fiat_25519_uint1 x118; + uint64_t x119; + fiat_25519_uint1 x120; + uint64_t x121; + fiat_25519_uint1 x122; + uint64_t x123; + fiat_25519_uint1 x124; + uint64_t x125; + fiat_25519_uint1 x126; + uint64_t x127; + fiat_25519_uint1 x128; + uint64_t x129; + fiat_25519_uint1 x130; + uint64_t x131; + fiat_25519_uint1 x132; + uint64_t x133; + fiat_25519_uint1 x134; + uint64_t x135; + uint64_t x136; + uint64_t x137; + uint64_t x138; + fiat_25519_uint1 x139; + uint64_t x140; + uint64_t x141; + uint64_t x142; + uint64_t x143; + fiat_25519_uint1 x144; + uint64_t x145; + uint64_t x146; + uint64_t x147; + uint64_t x148; + fiat_25519_uint1 x149; + uint64_t x150; + uint64_t x151; + uint64_t x152; + uint64_t x153; + uint64_t x154; + uint64_t x155; + uint64_t x156; + uint64_t x157; + fiat_25519_uint1 x158; + uint64_t x159; + uint64_t x160; + fiat_25519_mulx_u64(&x1, &x2, (arg1[4]), ((arg2[4]) * UINT8_C(0x13))); + fiat_25519_mulx_u64(&x3, &x4, (arg1[4]), ((arg2[3]) * UINT8_C(0x13))); + fiat_25519_mulx_u64(&x5, &x6, (arg1[4]), ((arg2[2]) * UINT8_C(0x13))); + fiat_25519_mulx_u64(&x7, &x8, (arg1[4]), ((arg2[1]) * UINT8_C(0x13))); + fiat_25519_mulx_u64(&x9, &x10, (arg1[3]), ((arg2[4]) * UINT8_C(0x13))); + fiat_25519_mulx_u64(&x11, &x12, (arg1[3]), ((arg2[3]) * UINT8_C(0x13))); + fiat_25519_mulx_u64(&x13, &x14, (arg1[3]), ((arg2[2]) * UINT8_C(0x13))); + fiat_25519_mulx_u64(&x15, &x16, (arg1[2]), ((arg2[4]) * UINT8_C(0x13))); + fiat_25519_mulx_u64(&x17, &x18, (arg1[2]), ((arg2[3]) * UINT8_C(0x13))); + fiat_25519_mulx_u64(&x19, &x20, (arg1[1]), ((arg2[4]) * UINT8_C(0x13))); + fiat_25519_mulx_u64(&x21, &x22, (arg1[4]), (arg2[0])); + fiat_25519_mulx_u64(&x23, &x24, (arg1[3]), (arg2[1])); + fiat_25519_mulx_u64(&x25, &x26, (arg1[3]), (arg2[0])); + fiat_25519_mulx_u64(&x27, &x28, (arg1[2]), (arg2[2])); + fiat_25519_mulx_u64(&x29, &x30, (arg1[2]), (arg2[1])); + fiat_25519_mulx_u64(&x31, &x32, (arg1[2]), (arg2[0])); + fiat_25519_mulx_u64(&x33, &x34, (arg1[1]), (arg2[3])); + fiat_25519_mulx_u64(&x35, &x36, (arg1[1]), (arg2[2])); + fiat_25519_mulx_u64(&x37, &x38, (arg1[1]), (arg2[1])); + fiat_25519_mulx_u64(&x39, &x40, (arg1[1]), (arg2[0])); + fiat_25519_mulx_u64(&x41, &x42, (arg1[0]), (arg2[4])); + fiat_25519_mulx_u64(&x43, &x44, (arg1[0]), (arg2[3])); + fiat_25519_mulx_u64(&x45, &x46, (arg1[0]), (arg2[2])); + fiat_25519_mulx_u64(&x47, &x48, (arg1[0]), (arg2[1])); + fiat_25519_mulx_u64(&x49, &x50, (arg1[0]), (arg2[0])); + fiat_25519_addcarryx_u64(&x51, &x52, 0x0, x13, x7); + fiat_25519_addcarryx_u64(&x53, &x54, x52, x14, x8); + fiat_25519_addcarryx_u64(&x55, &x56, 0x0, x17, x51); + fiat_25519_addcarryx_u64(&x57, &x58, x56, x18, x53); + fiat_25519_addcarryx_u64(&x59, &x60, 0x0, x19, x55); + fiat_25519_addcarryx_u64(&x61, &x62, x60, x20, x57); + fiat_25519_addcarryx_u64(&x63, &x64, 0x0, x49, x59); + fiat_25519_addcarryx_u64(&x65, &x66, x64, x50, x61); + x67 = ((x63 >> 51) | ((x65 << 13) & UINT64_C(0xffffffffffffffff))); + x68 = (x63 & UINT64_C(0x7ffffffffffff)); + fiat_25519_addcarryx_u64(&x69, &x70, 0x0, x23, x21); + fiat_25519_addcarryx_u64(&x71, &x72, x70, x24, x22); + fiat_25519_addcarryx_u64(&x73, &x74, 0x0, x27, x69); + fiat_25519_addcarryx_u64(&x75, &x76, x74, x28, x71); + 
fiat_25519_addcarryx_u64(&x77, &x78, 0x0, x33, x73); + fiat_25519_addcarryx_u64(&x79, &x80, x78, x34, x75); + fiat_25519_addcarryx_u64(&x81, &x82, 0x0, x41, x77); + fiat_25519_addcarryx_u64(&x83, &x84, x82, x42, x79); + fiat_25519_addcarryx_u64(&x85, &x86, 0x0, x25, x1); + fiat_25519_addcarryx_u64(&x87, &x88, x86, x26, x2); + fiat_25519_addcarryx_u64(&x89, &x90, 0x0, x29, x85); + fiat_25519_addcarryx_u64(&x91, &x92, x90, x30, x87); + fiat_25519_addcarryx_u64(&x93, &x94, 0x0, x35, x89); + fiat_25519_addcarryx_u64(&x95, &x96, x94, x36, x91); + fiat_25519_addcarryx_u64(&x97, &x98, 0x0, x43, x93); + fiat_25519_addcarryx_u64(&x99, &x100, x98, x44, x95); + fiat_25519_addcarryx_u64(&x101, &x102, 0x0, x9, x3); + fiat_25519_addcarryx_u64(&x103, &x104, x102, x10, x4); + fiat_25519_addcarryx_u64(&x105, &x106, 0x0, x31, x101); + fiat_25519_addcarryx_u64(&x107, &x108, x106, x32, x103); + fiat_25519_addcarryx_u64(&x109, &x110, 0x0, x37, x105); + fiat_25519_addcarryx_u64(&x111, &x112, x110, x38, x107); + fiat_25519_addcarryx_u64(&x113, &x114, 0x0, x45, x109); + fiat_25519_addcarryx_u64(&x115, &x116, x114, x46, x111); + fiat_25519_addcarryx_u64(&x117, &x118, 0x0, x11, x5); + fiat_25519_addcarryx_u64(&x119, &x120, x118, x12, x6); + fiat_25519_addcarryx_u64(&x121, &x122, 0x0, x15, x117); + fiat_25519_addcarryx_u64(&x123, &x124, x122, x16, x119); + fiat_25519_addcarryx_u64(&x125, &x126, 0x0, x39, x121); + fiat_25519_addcarryx_u64(&x127, &x128, x126, x40, x123); + fiat_25519_addcarryx_u64(&x129, &x130, 0x0, x47, x125); + fiat_25519_addcarryx_u64(&x131, &x132, x130, x48, x127); + fiat_25519_addcarryx_u64(&x133, &x134, 0x0, x67, x129); + x135 = (x134 + x131); + x136 = ((x133 >> 51) | ((x135 << 13) & UINT64_C(0xffffffffffffffff))); + x137 = (x133 & UINT64_C(0x7ffffffffffff)); + fiat_25519_addcarryx_u64(&x138, &x139, 0x0, x136, x113); + x140 = (x139 + x115); + x141 = ((x138 >> 51) | ((x140 << 13) & UINT64_C(0xffffffffffffffff))); + x142 = (x138 & UINT64_C(0x7ffffffffffff)); + fiat_25519_addcarryx_u64(&x143, &x144, 0x0, x141, x97); + x145 = (x144 + x99); + x146 = ((x143 >> 51) | ((x145 << 13) & UINT64_C(0xffffffffffffffff))); + x147 = (x143 & UINT64_C(0x7ffffffffffff)); + fiat_25519_addcarryx_u64(&x148, &x149, 0x0, x146, x81); + x150 = (x149 + x83); + x151 = ((x148 >> 51) | ((x150 << 13) & UINT64_C(0xffffffffffffffff))); + x152 = (x148 & UINT64_C(0x7ffffffffffff)); + x153 = (x151 * UINT8_C(0x13)); + x154 = (x68 + x153); + x155 = (x154 >> 51); + x156 = (x154 & UINT64_C(0x7ffffffffffff)); + x157 = (x155 + x137); + x158 = (fiat_25519_uint1)(x157 >> 51); + x159 = (x157 & UINT64_C(0x7ffffffffffff)); + x160 = (x158 + x142); + out1[0] = x156; + out1[1] = x159; + out1[2] = x160; + out1[3] = x147; + out1[4] = x152; +} + +/* + * The function fiat_25519_carry_square squares a field element and reduces the result. 
+ * + * Postconditions: + * eval out1 mod m = (eval arg1 * eval arg1) mod m + * + */ +static FIAT_25519_FIAT_INLINE void fiat_25519_carry_square(fiat_25519_tight_field_element out1, const fiat_25519_loose_field_element arg1) { + uint64_t x1; + uint64_t x2; + uint64_t x3; + uint64_t x4; + uint64_t x5; + uint64_t x6; + uint64_t x7; + uint64_t x8; + uint64_t x9; + uint64_t x10; + uint64_t x11; + uint64_t x12; + uint64_t x13; + uint64_t x14; + uint64_t x15; + uint64_t x16; + uint64_t x17; + uint64_t x18; + uint64_t x19; + uint64_t x20; + uint64_t x21; + uint64_t x22; + uint64_t x23; + uint64_t x24; + uint64_t x25; + uint64_t x26; + uint64_t x27; + uint64_t x28; + uint64_t x29; + uint64_t x30; + uint64_t x31; + uint64_t x32; + uint64_t x33; + uint64_t x34; + uint64_t x35; + uint64_t x36; + uint64_t x37; + uint64_t x38; + uint64_t x39; + fiat_25519_uint1 x40; + uint64_t x41; + fiat_25519_uint1 x42; + uint64_t x43; + fiat_25519_uint1 x44; + uint64_t x45; + fiat_25519_uint1 x46; + uint64_t x47; + uint64_t x48; + uint64_t x49; + fiat_25519_uint1 x50; + uint64_t x51; + fiat_25519_uint1 x52; + uint64_t x53; + fiat_25519_uint1 x54; + uint64_t x55; + fiat_25519_uint1 x56; + uint64_t x57; + fiat_25519_uint1 x58; + uint64_t x59; + fiat_25519_uint1 x60; + uint64_t x61; + fiat_25519_uint1 x62; + uint64_t x63; + fiat_25519_uint1 x64; + uint64_t x65; + fiat_25519_uint1 x66; + uint64_t x67; + fiat_25519_uint1 x68; + uint64_t x69; + fiat_25519_uint1 x70; + uint64_t x71; + fiat_25519_uint1 x72; + uint64_t x73; + fiat_25519_uint1 x74; + uint64_t x75; + fiat_25519_uint1 x76; + uint64_t x77; + fiat_25519_uint1 x78; + uint64_t x79; + fiat_25519_uint1 x80; + uint64_t x81; + fiat_25519_uint1 x82; + uint64_t x83; + uint64_t x84; + uint64_t x85; + uint64_t x86; + fiat_25519_uint1 x87; + uint64_t x88; + uint64_t x89; + uint64_t x90; + uint64_t x91; + fiat_25519_uint1 x92; + uint64_t x93; + uint64_t x94; + uint64_t x95; + uint64_t x96; + fiat_25519_uint1 x97; + uint64_t x98; + uint64_t x99; + uint64_t x100; + uint64_t x101; + uint64_t x102; + uint64_t x103; + uint64_t x104; + uint64_t x105; + fiat_25519_uint1 x106; + uint64_t x107; + uint64_t x108; + x1 = ((arg1[4]) * UINT8_C(0x13)); + x2 = (x1 * 0x2); + x3 = ((arg1[4]) * 0x2); + x4 = ((arg1[3]) * UINT8_C(0x13)); + x5 = (x4 * 0x2); + x6 = ((arg1[3]) * 0x2); + x7 = ((arg1[2]) * 0x2); + x8 = ((arg1[1]) * 0x2); + fiat_25519_mulx_u64(&x9, &x10, (arg1[4]), x1); + fiat_25519_mulx_u64(&x11, &x12, (arg1[3]), x2); + fiat_25519_mulx_u64(&x13, &x14, (arg1[3]), x4); + fiat_25519_mulx_u64(&x15, &x16, (arg1[2]), x2); + fiat_25519_mulx_u64(&x17, &x18, (arg1[2]), x5); + fiat_25519_mulx_u64(&x19, &x20, (arg1[2]), (arg1[2])); + fiat_25519_mulx_u64(&x21, &x22, (arg1[1]), x2); + fiat_25519_mulx_u64(&x23, &x24, (arg1[1]), x6); + fiat_25519_mulx_u64(&x25, &x26, (arg1[1]), x7); + fiat_25519_mulx_u64(&x27, &x28, (arg1[1]), (arg1[1])); + fiat_25519_mulx_u64(&x29, &x30, (arg1[0]), x3); + fiat_25519_mulx_u64(&x31, &x32, (arg1[0]), x6); + fiat_25519_mulx_u64(&x33, &x34, (arg1[0]), x7); + fiat_25519_mulx_u64(&x35, &x36, (arg1[0]), x8); + fiat_25519_mulx_u64(&x37, &x38, (arg1[0]), (arg1[0])); + fiat_25519_addcarryx_u64(&x39, &x40, 0x0, x21, x17); + fiat_25519_addcarryx_u64(&x41, &x42, x40, x22, x18); + fiat_25519_addcarryx_u64(&x43, &x44, 0x0, x37, x39); + fiat_25519_addcarryx_u64(&x45, &x46, x44, x38, x41); + x47 = ((x43 >> 51) | ((x45 << 13) & UINT64_C(0xffffffffffffffff))); + x48 = (x43 & UINT64_C(0x7ffffffffffff)); + fiat_25519_addcarryx_u64(&x49, &x50, 0x0, x23, x19); + 
fiat_25519_addcarryx_u64(&x51, &x52, x50, x24, x20); + fiat_25519_addcarryx_u64(&x53, &x54, 0x0, x29, x49); + fiat_25519_addcarryx_u64(&x55, &x56, x54, x30, x51); + fiat_25519_addcarryx_u64(&x57, &x58, 0x0, x25, x9); + fiat_25519_addcarryx_u64(&x59, &x60, x58, x26, x10); + fiat_25519_addcarryx_u64(&x61, &x62, 0x0, x31, x57); + fiat_25519_addcarryx_u64(&x63, &x64, x62, x32, x59); + fiat_25519_addcarryx_u64(&x65, &x66, 0x0, x27, x11); + fiat_25519_addcarryx_u64(&x67, &x68, x66, x28, x12); + fiat_25519_addcarryx_u64(&x69, &x70, 0x0, x33, x65); + fiat_25519_addcarryx_u64(&x71, &x72, x70, x34, x67); + fiat_25519_addcarryx_u64(&x73, &x74, 0x0, x15, x13); + fiat_25519_addcarryx_u64(&x75, &x76, x74, x16, x14); + fiat_25519_addcarryx_u64(&x77, &x78, 0x0, x35, x73); + fiat_25519_addcarryx_u64(&x79, &x80, x78, x36, x75); + fiat_25519_addcarryx_u64(&x81, &x82, 0x0, x47, x77); + x83 = (x82 + x79); + x84 = ((x81 >> 51) | ((x83 << 13) & UINT64_C(0xffffffffffffffff))); + x85 = (x81 & UINT64_C(0x7ffffffffffff)); + fiat_25519_addcarryx_u64(&x86, &x87, 0x0, x84, x69); + x88 = (x87 + x71); + x89 = ((x86 >> 51) | ((x88 << 13) & UINT64_C(0xffffffffffffffff))); + x90 = (x86 & UINT64_C(0x7ffffffffffff)); + fiat_25519_addcarryx_u64(&x91, &x92, 0x0, x89, x61); + x93 = (x92 + x63); + x94 = ((x91 >> 51) | ((x93 << 13) & UINT64_C(0xffffffffffffffff))); + x95 = (x91 & UINT64_C(0x7ffffffffffff)); + fiat_25519_addcarryx_u64(&x96, &x97, 0x0, x94, x53); + x98 = (x97 + x55); + x99 = ((x96 >> 51) | ((x98 << 13) & UINT64_C(0xffffffffffffffff))); + x100 = (x96 & UINT64_C(0x7ffffffffffff)); + x101 = (x99 * UINT8_C(0x13)); + x102 = (x48 + x101); + x103 = (x102 >> 51); + x104 = (x102 & UINT64_C(0x7ffffffffffff)); + x105 = (x103 + x85); + x106 = (fiat_25519_uint1)(x105 >> 51); + x107 = (x105 & UINT64_C(0x7ffffffffffff)); + x108 = (x106 + x90); + out1[0] = x104; + out1[1] = x107; + out1[2] = x108; + out1[3] = x95; + out1[4] = x100; +} + +/* + * The function fiat_25519_carry reduces a field element. + * + * Postconditions: + * eval out1 mod m = eval arg1 mod m + * + */ +static FIAT_25519_FIAT_INLINE void fiat_25519_carry(fiat_25519_tight_field_element out1, const fiat_25519_loose_field_element arg1) { + uint64_t x1; + uint64_t x2; + uint64_t x3; + uint64_t x4; + uint64_t x5; + uint64_t x6; + uint64_t x7; + uint64_t x8; + uint64_t x9; + uint64_t x10; + uint64_t x11; + uint64_t x12; + x1 = (arg1[0]); + x2 = ((x1 >> 51) + (arg1[1])); + x3 = ((x2 >> 51) + (arg1[2])); + x4 = ((x3 >> 51) + (arg1[3])); + x5 = ((x4 >> 51) + (arg1[4])); + x6 = ((x1 & UINT64_C(0x7ffffffffffff)) + ((x5 >> 51) * UINT8_C(0x13))); + x7 = ((fiat_25519_uint1)(x6 >> 51) + (x2 & UINT64_C(0x7ffffffffffff))); + x8 = (x6 & UINT64_C(0x7ffffffffffff)); + x9 = (x7 & UINT64_C(0x7ffffffffffff)); + x10 = ((fiat_25519_uint1)(x7 >> 51) + (x3 & UINT64_C(0x7ffffffffffff))); + x11 = (x4 & UINT64_C(0x7ffffffffffff)); + x12 = (x5 & UINT64_C(0x7ffffffffffff)); + out1[0] = x8; + out1[1] = x9; + out1[2] = x10; + out1[3] = x11; + out1[4] = x12; +} + +/* + * The function fiat_25519_add adds two field elements. 
+ * + * Postconditions: + * eval out1 mod m = (eval arg1 + eval arg2) mod m + * + */ +static FIAT_25519_FIAT_INLINE void fiat_25519_add(fiat_25519_loose_field_element out1, const fiat_25519_tight_field_element arg1, const fiat_25519_tight_field_element arg2) { + uint64_t x1; + uint64_t x2; + uint64_t x3; + uint64_t x4; + uint64_t x5; + x1 = ((arg1[0]) + (arg2[0])); + x2 = ((arg1[1]) + (arg2[1])); + x3 = ((arg1[2]) + (arg2[2])); + x4 = ((arg1[3]) + (arg2[3])); + x5 = ((arg1[4]) + (arg2[4])); + out1[0] = x1; + out1[1] = x2; + out1[2] = x3; + out1[3] = x4; + out1[4] = x5; +} + +/* + * The function fiat_25519_sub subtracts two field elements. + * + * Postconditions: + * eval out1 mod m = (eval arg1 - eval arg2) mod m + * + */ +static FIAT_25519_FIAT_INLINE void fiat_25519_sub(fiat_25519_loose_field_element out1, const fiat_25519_tight_field_element arg1, const fiat_25519_tight_field_element arg2) { + uint64_t x1; + uint64_t x2; + uint64_t x3; + uint64_t x4; + uint64_t x5; + x1 = ((UINT64_C(0xfffffffffffda) + (arg1[0])) - (arg2[0])); + x2 = ((UINT64_C(0xffffffffffffe) + (arg1[1])) - (arg2[1])); + x3 = ((UINT64_C(0xffffffffffffe) + (arg1[2])) - (arg2[2])); + x4 = ((UINT64_C(0xffffffffffffe) + (arg1[3])) - (arg2[3])); + x5 = ((UINT64_C(0xffffffffffffe) + (arg1[4])) - (arg2[4])); + out1[0] = x1; + out1[1] = x2; + out1[2] = x3; + out1[3] = x4; + out1[4] = x5; +} + +/* + * The function fiat_25519_opp negates a field element. + * + * Postconditions: + * eval out1 mod m = -eval arg1 mod m + * + */ +static FIAT_25519_FIAT_INLINE void fiat_25519_opp(fiat_25519_loose_field_element out1, const fiat_25519_tight_field_element arg1) { + uint64_t x1; + uint64_t x2; + uint64_t x3; + uint64_t x4; + uint64_t x5; + x1 = (UINT64_C(0xfffffffffffda) - (arg1[0])); + x2 = (UINT64_C(0xffffffffffffe) - (arg1[1])); + x3 = (UINT64_C(0xffffffffffffe) - (arg1[2])); + x4 = (UINT64_C(0xffffffffffffe) - (arg1[3])); + x5 = (UINT64_C(0xffffffffffffe) - (arg1[4])); + out1[0] = x1; + out1[1] = x2; + out1[2] = x3; + out1[3] = x4; + out1[4] = x5; +} + +/* + * The function fiat_25519_to_bytes serializes a field element to bytes in little-endian order. 
+ * + * Postconditions: + * out1 = map (λ x, ⌊((eval arg1 mod m) mod 2^(8 * (x + 1))) / 2^(8 * x)⌋) [0..31] + * + * Output Bounds: + * out1: [[0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0x7f]] + */ +static FIAT_25519_FIAT_INLINE void fiat_25519_to_bytes(uint8_t out1[32], const fiat_25519_tight_field_element arg1) { + uint64_t x1; + fiat_25519_uint1 x2; + uint64_t x3; + fiat_25519_uint1 x4; + uint64_t x5; + fiat_25519_uint1 x6; + uint64_t x7; + fiat_25519_uint1 x8; + uint64_t x9; + fiat_25519_uint1 x10; + uint64_t x11; + uint64_t x12; + fiat_25519_uint1 x13; + uint64_t x14; + fiat_25519_uint1 x15; + uint64_t x16; + fiat_25519_uint1 x17; + uint64_t x18; + fiat_25519_uint1 x19; + uint64_t x20; + fiat_25519_uint1 x21; + uint64_t x22; + uint64_t x23; + uint64_t x24; + uint64_t x25; + uint8_t x26; + uint64_t x27; + uint8_t x28; + uint64_t x29; + uint8_t x30; + uint64_t x31; + uint8_t x32; + uint64_t x33; + uint8_t x34; + uint64_t x35; + uint8_t x36; + uint8_t x37; + uint64_t x38; + uint8_t x39; + uint64_t x40; + uint8_t x41; + uint64_t x42; + uint8_t x43; + uint64_t x44; + uint8_t x45; + uint64_t x46; + uint8_t x47; + uint64_t x48; + uint8_t x49; + uint8_t x50; + uint64_t x51; + uint8_t x52; + uint64_t x53; + uint8_t x54; + uint64_t x55; + uint8_t x56; + uint64_t x57; + uint8_t x58; + uint64_t x59; + uint8_t x60; + uint64_t x61; + uint8_t x62; + uint64_t x63; + uint8_t x64; + fiat_25519_uint1 x65; + uint64_t x66; + uint8_t x67; + uint64_t x68; + uint8_t x69; + uint64_t x70; + uint8_t x71; + uint64_t x72; + uint8_t x73; + uint64_t x74; + uint8_t x75; + uint64_t x76; + uint8_t x77; + uint8_t x78; + uint64_t x79; + uint8_t x80; + uint64_t x81; + uint8_t x82; + uint64_t x83; + uint8_t x84; + uint64_t x85; + uint8_t x86; + uint64_t x87; + uint8_t x88; + uint64_t x89; + uint8_t x90; + uint8_t x91; + fiat_25519_subborrowx_u51(&x1, &x2, 0x0, (arg1[0]), UINT64_C(0x7ffffffffffed)); + fiat_25519_subborrowx_u51(&x3, &x4, x2, (arg1[1]), UINT64_C(0x7ffffffffffff)); + fiat_25519_subborrowx_u51(&x5, &x6, x4, (arg1[2]), UINT64_C(0x7ffffffffffff)); + fiat_25519_subborrowx_u51(&x7, &x8, x6, (arg1[3]), UINT64_C(0x7ffffffffffff)); + fiat_25519_subborrowx_u51(&x9, &x10, x8, (arg1[4]), UINT64_C(0x7ffffffffffff)); + fiat_25519_cmovznz_u64(&x11, x10, 0x0, UINT64_C(0xffffffffffffffff)); + fiat_25519_addcarryx_u51(&x12, &x13, 0x0, x1, (x11 & UINT64_C(0x7ffffffffffed))); + fiat_25519_addcarryx_u51(&x14, &x15, x13, x3, (x11 & UINT64_C(0x7ffffffffffff))); + fiat_25519_addcarryx_u51(&x16, &x17, x15, x5, (x11 & UINT64_C(0x7ffffffffffff))); + fiat_25519_addcarryx_u51(&x18, &x19, x17, x7, (x11 & UINT64_C(0x7ffffffffffff))); + fiat_25519_addcarryx_u51(&x20, &x21, x19, x9, (x11 & UINT64_C(0x7ffffffffffff))); + x22 = (x20 << 4); + x23 = (x18 * (uint64_t)0x2); + x24 = (x16 << 6); + x25 = (x14 << 3); + x26 = (uint8_t)(x12 & UINT8_C(0xff)); + x27 = (x12 >> 8); + x28 = (uint8_t)(x27 & UINT8_C(0xff)); + x29 = (x27 >> 8); + x30 = (uint8_t)(x29 & UINT8_C(0xff)); + x31 = (x29 >> 8); + x32 = (uint8_t)(x31 & UINT8_C(0xff)); + x33 = (x31 >> 8); + x34 = (uint8_t)(x33 & UINT8_C(0xff)); + x35 = (x33 >> 8); + x36 = 
(uint8_t)(x35 & UINT8_C(0xff)); + x37 = (uint8_t)(x35 >> 8); + x38 = (x25 + (uint64_t)x37); + x39 = (uint8_t)(x38 & UINT8_C(0xff)); + x40 = (x38 >> 8); + x41 = (uint8_t)(x40 & UINT8_C(0xff)); + x42 = (x40 >> 8); + x43 = (uint8_t)(x42 & UINT8_C(0xff)); + x44 = (x42 >> 8); + x45 = (uint8_t)(x44 & UINT8_C(0xff)); + x46 = (x44 >> 8); + x47 = (uint8_t)(x46 & UINT8_C(0xff)); + x48 = (x46 >> 8); + x49 = (uint8_t)(x48 & UINT8_C(0xff)); + x50 = (uint8_t)(x48 >> 8); + x51 = (x24 + (uint64_t)x50); + x52 = (uint8_t)(x51 & UINT8_C(0xff)); + x53 = (x51 >> 8); + x54 = (uint8_t)(x53 & UINT8_C(0xff)); + x55 = (x53 >> 8); + x56 = (uint8_t)(x55 & UINT8_C(0xff)); + x57 = (x55 >> 8); + x58 = (uint8_t)(x57 & UINT8_C(0xff)); + x59 = (x57 >> 8); + x60 = (uint8_t)(x59 & UINT8_C(0xff)); + x61 = (x59 >> 8); + x62 = (uint8_t)(x61 & UINT8_C(0xff)); + x63 = (x61 >> 8); + x64 = (uint8_t)(x63 & UINT8_C(0xff)); + x65 = (fiat_25519_uint1)(x63 >> 8); + x66 = (x23 + (uint64_t)x65); + x67 = (uint8_t)(x66 & UINT8_C(0xff)); + x68 = (x66 >> 8); + x69 = (uint8_t)(x68 & UINT8_C(0xff)); + x70 = (x68 >> 8); + x71 = (uint8_t)(x70 & UINT8_C(0xff)); + x72 = (x70 >> 8); + x73 = (uint8_t)(x72 & UINT8_C(0xff)); + x74 = (x72 >> 8); + x75 = (uint8_t)(x74 & UINT8_C(0xff)); + x76 = (x74 >> 8); + x77 = (uint8_t)(x76 & UINT8_C(0xff)); + x78 = (uint8_t)(x76 >> 8); + x79 = (x22 + (uint64_t)x78); + x80 = (uint8_t)(x79 & UINT8_C(0xff)); + x81 = (x79 >> 8); + x82 = (uint8_t)(x81 & UINT8_C(0xff)); + x83 = (x81 >> 8); + x84 = (uint8_t)(x83 & UINT8_C(0xff)); + x85 = (x83 >> 8); + x86 = (uint8_t)(x85 & UINT8_C(0xff)); + x87 = (x85 >> 8); + x88 = (uint8_t)(x87 & UINT8_C(0xff)); + x89 = (x87 >> 8); + x90 = (uint8_t)(x89 & UINT8_C(0xff)); + x91 = (uint8_t)(x89 >> 8); + out1[0] = x26; + out1[1] = x28; + out1[2] = x30; + out1[3] = x32; + out1[4] = x34; + out1[5] = x36; + out1[6] = x39; + out1[7] = x41; + out1[8] = x43; + out1[9] = x45; + out1[10] = x47; + out1[11] = x49; + out1[12] = x52; + out1[13] = x54; + out1[14] = x56; + out1[15] = x58; + out1[16] = x60; + out1[17] = x62; + out1[18] = x64; + out1[19] = x67; + out1[20] = x69; + out1[21] = x71; + out1[22] = x73; + out1[23] = x75; + out1[24] = x77; + out1[25] = x80; + out1[26] = x82; + out1[27] = x84; + out1[28] = x86; + out1[29] = x88; + out1[30] = x90; + out1[31] = x91; +} + +/* + * The function fiat_25519_from_bytes deserializes a field element from bytes in little-endian order. 
+ * + * Postconditions: + * eval out1 mod m = bytes_eval arg1 mod m + * + * Input Bounds: + * arg1: [[0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0x7f]] + */ +static FIAT_25519_FIAT_INLINE void fiat_25519_from_bytes(fiat_25519_tight_field_element out1, const uint8_t arg1[32]) { + uint64_t x1; + uint64_t x2; + uint64_t x3; + uint64_t x4; + uint64_t x5; + uint64_t x6; + uint64_t x7; + uint64_t x8; + uint64_t x9; + uint64_t x10; + uint64_t x11; + uint64_t x12; + uint64_t x13; + uint64_t x14; + uint64_t x15; + uint64_t x16; + uint64_t x17; + uint64_t x18; + uint64_t x19; + uint64_t x20; + uint64_t x21; + uint64_t x22; + uint64_t x23; + uint64_t x24; + uint64_t x25; + uint64_t x26; + uint64_t x27; + uint64_t x28; + uint64_t x29; + uint64_t x30; + uint64_t x31; + uint8_t x32; + uint64_t x33; + uint64_t x34; + uint64_t x35; + uint64_t x36; + uint64_t x37; + uint64_t x38; + uint64_t x39; + uint8_t x40; + uint64_t x41; + uint64_t x42; + uint64_t x43; + uint64_t x44; + uint64_t x45; + uint64_t x46; + uint64_t x47; + uint8_t x48; + uint64_t x49; + uint64_t x50; + uint64_t x51; + uint64_t x52; + uint64_t x53; + uint64_t x54; + uint64_t x55; + uint64_t x56; + uint8_t x57; + uint64_t x58; + uint64_t x59; + uint64_t x60; + uint64_t x61; + uint64_t x62; + uint64_t x63; + uint64_t x64; + uint8_t x65; + uint64_t x66; + uint64_t x67; + uint64_t x68; + uint64_t x69; + uint64_t x70; + uint64_t x71; + x1 = ((uint64_t)(arg1[31]) << 44); + x2 = ((uint64_t)(arg1[30]) << 36); + x3 = ((uint64_t)(arg1[29]) << 28); + x4 = ((uint64_t)(arg1[28]) << 20); + x5 = ((uint64_t)(arg1[27]) << 12); + x6 = ((uint64_t)(arg1[26]) << 4); + x7 = ((uint64_t)(arg1[25]) << 47); + x8 = ((uint64_t)(arg1[24]) << 39); + x9 = ((uint64_t)(arg1[23]) << 31); + x10 = ((uint64_t)(arg1[22]) << 23); + x11 = ((uint64_t)(arg1[21]) << 15); + x12 = ((uint64_t)(arg1[20]) << 7); + x13 = ((uint64_t)(arg1[19]) << 50); + x14 = ((uint64_t)(arg1[18]) << 42); + x15 = ((uint64_t)(arg1[17]) << 34); + x16 = ((uint64_t)(arg1[16]) << 26); + x17 = ((uint64_t)(arg1[15]) << 18); + x18 = ((uint64_t)(arg1[14]) << 10); + x19 = ((uint64_t)(arg1[13]) << 2); + x20 = ((uint64_t)(arg1[12]) << 45); + x21 = ((uint64_t)(arg1[11]) << 37); + x22 = ((uint64_t)(arg1[10]) << 29); + x23 = ((uint64_t)(arg1[9]) << 21); + x24 = ((uint64_t)(arg1[8]) << 13); + x25 = ((uint64_t)(arg1[7]) << 5); + x26 = ((uint64_t)(arg1[6]) << 48); + x27 = ((uint64_t)(arg1[5]) << 40); + x28 = ((uint64_t)(arg1[4]) << 32); + x29 = ((uint64_t)(arg1[3]) << 24); + x30 = ((uint64_t)(arg1[2]) << 16); + x31 = ((uint64_t)(arg1[1]) << 8); + x32 = (arg1[0]); + x33 = (x31 + (uint64_t)x32); + x34 = (x30 + x33); + x35 = (x29 + x34); + x36 = (x28 + x35); + x37 = (x27 + x36); + x38 = (x26 + x37); + x39 = (x38 & UINT64_C(0x7ffffffffffff)); + x40 = (uint8_t)(x38 >> 51); + x41 = (x25 + (uint64_t)x40); + x42 = (x24 + x41); + x43 = (x23 + x42); + x44 = (x22 + x43); + x45 = (x21 + x44); + x46 = (x20 + x45); + x47 = (x46 & UINT64_C(0x7ffffffffffff)); + x48 = (uint8_t)(x46 >> 51); + x49 = (x19 + (uint64_t)x48); + x50 = (x18 + x49); + x51 = (x17 + x50); + x52 = (x16 + x51); + x53 = (x15 + x52); + x54 = (x14 + 
x53); + x55 = (x13 + x54); + x56 = (x55 & UINT64_C(0x7ffffffffffff)); + x57 = (uint8_t)(x55 >> 51); + x58 = (x12 + (uint64_t)x57); + x59 = (x11 + x58); + x60 = (x10 + x59); + x61 = (x9 + x60); + x62 = (x8 + x61); + x63 = (x7 + x62); + x64 = (x63 & UINT64_C(0x7ffffffffffff)); + x65 = (uint8_t)(x63 >> 51); + x66 = (x6 + (uint64_t)x65); + x67 = (x5 + x66); + x68 = (x4 + x67); + x69 = (x3 + x68); + x70 = (x2 + x69); + x71 = (x1 + x70); + out1[0] = x39; + out1[1] = x47; + out1[2] = x56; + out1[3] = x64; + out1[4] = x71; +} + +/* + * The function fiat_25519_carry_scmul_121666 multiplies a field element by 121666 and reduces the result. + * + * Postconditions: + * eval out1 mod m = (121666 * eval arg1) mod m + * + */ +static FIAT_25519_FIAT_INLINE void fiat_25519_carry_scmul_121666(fiat_25519_tight_field_element out1, const fiat_25519_loose_field_element arg1) { + uint64_t x1; + uint64_t x2; + uint64_t x3; + uint64_t x4; + uint64_t x5; + uint64_t x6; + uint64_t x7; + uint64_t x8; + uint64_t x9; + uint64_t x10; + uint64_t x11; + uint64_t x12; + uint64_t x13; + fiat_25519_uint1 x14; + uint64_t x15; + uint64_t x16; + uint64_t x17; + uint64_t x18; + fiat_25519_uint1 x19; + uint64_t x20; + uint64_t x21; + uint64_t x22; + uint64_t x23; + fiat_25519_uint1 x24; + uint64_t x25; + uint64_t x26; + uint64_t x27; + uint64_t x28; + fiat_25519_uint1 x29; + uint64_t x30; + uint64_t x31; + uint64_t x32; + uint64_t x33; + uint64_t x34; + fiat_25519_uint1 x35; + uint64_t x36; + uint64_t x37; + fiat_25519_uint1 x38; + uint64_t x39; + uint64_t x40; + fiat_25519_mulx_u64(&x1, &x2, UINT32_C(0x1db42), (arg1[4])); + fiat_25519_mulx_u64(&x3, &x4, UINT32_C(0x1db42), (arg1[3])); + fiat_25519_mulx_u64(&x5, &x6, UINT32_C(0x1db42), (arg1[2])); + fiat_25519_mulx_u64(&x7, &x8, UINT32_C(0x1db42), (arg1[1])); + fiat_25519_mulx_u64(&x9, &x10, UINT32_C(0x1db42), (arg1[0])); + x11 = ((x9 >> 51) | ((x10 << 13) & UINT64_C(0xffffffffffffffff))); + x12 = (x9 & UINT64_C(0x7ffffffffffff)); + fiat_25519_addcarryx_u64(&x13, &x14, 0x0, x11, x7); + x15 = (x14 + x8); + x16 = ((x13 >> 51) | ((x15 << 13) & UINT64_C(0xffffffffffffffff))); + x17 = (x13 & UINT64_C(0x7ffffffffffff)); + fiat_25519_addcarryx_u64(&x18, &x19, 0x0, x16, x5); + x20 = (x19 + x6); + x21 = ((x18 >> 51) | ((x20 << 13) & UINT64_C(0xffffffffffffffff))); + x22 = (x18 & UINT64_C(0x7ffffffffffff)); + fiat_25519_addcarryx_u64(&x23, &x24, 0x0, x21, x3); + x25 = (x24 + x4); + x26 = ((x23 >> 51) | ((x25 << 13) & UINT64_C(0xffffffffffffffff))); + x27 = (x23 & UINT64_C(0x7ffffffffffff)); + fiat_25519_addcarryx_u64(&x28, &x29, 0x0, x26, x1); + x30 = (x29 + x2); + x31 = ((x28 >> 51) | ((x30 << 13) & UINT64_C(0xffffffffffffffff))); + x32 = (x28 & UINT64_C(0x7ffffffffffff)); + x33 = (x31 * UINT8_C(0x13)); + x34 = (x12 + x33); + x35 = (fiat_25519_uint1)(x34 >> 51); + x36 = (x34 & UINT64_C(0x7ffffffffffff)); + x37 = (x35 + x17); + x38 = (fiat_25519_uint1)(x37 >> 51); + x39 = (x37 & UINT64_C(0x7ffffffffffff)); + x40 = (x38 + x22); + out1[0] = x36; + out1[1] = x39; + out1[2] = x40; + out1[3] = x27; + out1[4] = x32; +} diff --git a/ring-0.17.14/third_party/fiat/p256_32.h b/ring-0.17.14/third_party/fiat/p256_32.h new file mode 100644 index 0000000000..83289a1d9a --- /dev/null +++ b/ring-0.17.14/third_party/fiat/p256_32.h @@ -0,0 +1,2511 @@ +/* Autogenerated: 'src/ExtractionOCaml/word_by_word_montgomery' --inline --static --use-value-barrier p256 32 '2^256 - 2^224 + 2^192 + 2^96 - 1' mul square add sub opp from_montgomery to_montgomery nonzero selectznz to_bytes from_bytes one msat divstep 
divstep_precomp */ +/* curve description: p256 */ +/* machine_wordsize = 32 (from "32") */ +/* requested operations: mul, square, add, sub, opp, from_montgomery, to_montgomery, nonzero, selectznz, to_bytes, from_bytes, one, msat, divstep, divstep_precomp */ +/* m = 0xffffffff00000001000000000000000000000000ffffffffffffffffffffffff (from "2^256 - 2^224 + 2^192 + 2^96 - 1") */ +/* */ +/* NOTE: In addition to the bounds specified above each function, all */ +/* functions synthesized for this Montgomery arithmetic require the */ +/* input to be strictly less than the prime modulus (m), and also */ +/* require the input to be in the unique saturated representation. */ +/* All functions also ensure that these two properties are true of */ +/* return values. */ +/* */ +/* Computed values: */ +/* eval z = z[0] + (z[1] << 32) + (z[2] << 64) + (z[3] << 96) + (z[4] << 128) + (z[5] << 160) + (z[6] << 192) + (z[7] << 224) */ +/* bytes_eval z = z[0] + (z[1] << 8) + (z[2] << 16) + (z[3] << 24) + (z[4] << 32) + (z[5] << 40) + (z[6] << 48) + (z[7] << 56) + (z[8] << 64) + (z[9] << 72) + (z[10] << 80) + (z[11] << 88) + (z[12] << 96) + (z[13] << 104) + (z[14] << 112) + (z[15] << 120) + (z[16] << 128) + (z[17] << 136) + (z[18] << 144) + (z[19] << 152) + (z[20] << 160) + (z[21] << 168) + (z[22] << 176) + (z[23] << 184) + (z[24] << 192) + (z[25] << 200) + (z[26] << 208) + (z[27] << 216) + (z[28] << 224) + (z[29] << 232) + (z[30] << 240) + (z[31] << 248) */ +/* twos_complement_eval z = let x1 := z[0] + (z[1] << 32) + (z[2] << 64) + (z[3] << 96) + (z[4] << 128) + (z[5] << 160) + (z[6] << 192) + (z[7] << 224) in */ +/* if x1 & (2^256-1) < 2^255 then x1 & (2^256-1) else (x1 & (2^256-1)) - 2^256 */ + +#include <stdint.h> +typedef unsigned char fiat_p256_uint1; +typedef signed char fiat_p256_int1; +#if defined(__GNUC__) || defined(__clang__) +# define FIAT_P256_FIAT_INLINE __inline__ +#else +# define FIAT_P256_FIAT_INLINE +#endif + +/* The type fiat_p256_montgomery_domain_field_element is a field element in the Montgomery domain. */ +/* Bounds: [[0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff]] */ +typedef uint32_t fiat_p256_montgomery_domain_field_element[8]; + +/* The type fiat_p256_non_montgomery_domain_field_element is a field element NOT in the Montgomery domain. */ +/* Bounds: [[0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff]] */ +typedef uint32_t fiat_p256_non_montgomery_domain_field_element[8]; + +#if (-1 & 3) != 3 +#error "This code only works on a two's complement system" +#endif + +#if !defined(FIAT_P256_NO_ASM) && (defined(__GNUC__) || defined(__clang__)) +static __inline__ uint32_t fiat_p256_value_barrier_u32(uint32_t a) { + __asm__("" : "+r"(a) : /* no inputs */); + return a; +} +#else +# define fiat_p256_value_barrier_u32(x) (x) +#endif + + +/* + * The function fiat_p256_addcarryx_u32 is an addition with carry. 
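+ *
+ * Illustrative usage (not part of the generated output): this primitive chains
+ * limb-wise additions, e.g. adding two 64-bit values held as hypothetical
+ * 32-bit limb pairs a_lo/a_hi and b_lo/b_hi:
+ *
+ *   uint32_t lo, hi;
+ *   fiat_p256_uint1 c;
+ *   fiat_p256_addcarryx_u32(&lo, &c, 0x0, a_lo, b_lo);   - low limbs, carry in is 0
+ *   fiat_p256_addcarryx_u32(&hi, &c, c, a_hi, b_hi);     - high limbs consume the carry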
+ * + * Postconditions: + * out1 = (arg1 + arg2 + arg3) mod 2^32 + * out2 = ⌊(arg1 + arg2 + arg3) / 2^32⌋ + * + * Input Bounds: + * arg1: [0x0 ~> 0x1] + * arg2: [0x0 ~> 0xffffffff] + * arg3: [0x0 ~> 0xffffffff] + * Output Bounds: + * out1: [0x0 ~> 0xffffffff] + * out2: [0x0 ~> 0x1] + */ +static FIAT_P256_FIAT_INLINE void fiat_p256_addcarryx_u32(uint32_t* out1, fiat_p256_uint1* out2, fiat_p256_uint1 arg1, uint32_t arg2, uint32_t arg3) { + uint64_t x1; + uint32_t x2; + fiat_p256_uint1 x3; + x1 = ((arg1 + (uint64_t)arg2) + arg3); + x2 = (uint32_t)(x1 & UINT32_C(0xffffffff)); + x3 = (fiat_p256_uint1)(x1 >> 32); + *out1 = x2; + *out2 = x3; +} + +/* + * The function fiat_p256_subborrowx_u32 is a subtraction with borrow. + * + * Postconditions: + * out1 = (-arg1 + arg2 + -arg3) mod 2^32 + * out2 = -⌊(-arg1 + arg2 + -arg3) / 2^32⌋ + * + * Input Bounds: + * arg1: [0x0 ~> 0x1] + * arg2: [0x0 ~> 0xffffffff] + * arg3: [0x0 ~> 0xffffffff] + * Output Bounds: + * out1: [0x0 ~> 0xffffffff] + * out2: [0x0 ~> 0x1] + */ +static FIAT_P256_FIAT_INLINE void fiat_p256_subborrowx_u32(uint32_t* out1, fiat_p256_uint1* out2, fiat_p256_uint1 arg1, uint32_t arg2, uint32_t arg3) { + int64_t x1; + fiat_p256_int1 x2; + uint32_t x3; + x1 = ((arg2 - (int64_t)arg1) - arg3); + x2 = (fiat_p256_int1)(x1 >> 32); + x3 = (uint32_t)(x1 & UINT32_C(0xffffffff)); + *out1 = x3; + *out2 = (fiat_p256_uint1)(0x0 - x2); +} + +/* + * The function fiat_p256_mulx_u32 is a multiplication, returning the full double-width result. + * + * Postconditions: + * out1 = (arg1 * arg2) mod 2^32 + * out2 = ⌊arg1 * arg2 / 2^32⌋ + * + * Input Bounds: + * arg1: [0x0 ~> 0xffffffff] + * arg2: [0x0 ~> 0xffffffff] + * Output Bounds: + * out1: [0x0 ~> 0xffffffff] + * out2: [0x0 ~> 0xffffffff] + */ +static FIAT_P256_FIAT_INLINE void fiat_p256_mulx_u32(uint32_t* out1, uint32_t* out2, uint32_t arg1, uint32_t arg2) { + uint64_t x1; + uint32_t x2; + uint32_t x3; + x1 = ((uint64_t)arg1 * arg2); + x2 = (uint32_t)(x1 & UINT32_C(0xffffffff)); + x3 = (uint32_t)(x1 >> 32); + *out1 = x2; + *out2 = x3; +} + +/* + * The function fiat_p256_cmovznz_u32 is a single-word conditional move. + * + * Postconditions: + * out1 = (if arg1 = 0 then arg2 else arg3) + * + * Input Bounds: + * arg1: [0x0 ~> 0x1] + * arg2: [0x0 ~> 0xffffffff] + * arg3: [0x0 ~> 0xffffffff] + * Output Bounds: + * out1: [0x0 ~> 0xffffffff] + */ +static FIAT_P256_FIAT_INLINE void fiat_p256_cmovznz_u32(uint32_t* out1, fiat_p256_uint1 arg1, uint32_t arg2, uint32_t arg3) { + fiat_p256_uint1 x1; + uint32_t x2; + uint32_t x3; + x1 = (!(!arg1)); + x2 = ((fiat_p256_int1)(0x0 - x1) & UINT32_C(0xffffffff)); + x3 = ((fiat_p256_value_barrier_u32(x2) & arg3) | (fiat_p256_value_barrier_u32((~x2)) & arg2)); + *out1 = x3; +} + +/* + * The function fiat_p256_mul multiplies two field elements in the Montgomery domain. 
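+ *
+ * Illustrative note (not part of the generated output): the body below is a
+ * word-by-word Montgomery multiplication with R = 2^256. The low 32-bit word of
+ * m is 0xffffffff, so m = -1 (mod 2^32) and the per-word Montgomery factor
+ * (-m^-1 mod 2^32) is simply 1: each reduction round reuses the running
+ * accumulator's low word directly, multiplies it by the nonzero words of m
+ * (four 0xffffffff words and a single 0x1 word), adds that multiple of m, and
+ * lets the now-zero low word drop out. The final subborrowx/cmovznz sequence is
+ * a constant-time conditional subtraction that brings the result back below m.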
+ * + * Preconditions: + * 0 ≤ eval arg1 < m + * 0 ≤ eval arg2 < m + * Postconditions: + * eval (from_montgomery out1) mod m = (eval (from_montgomery arg1) * eval (from_montgomery arg2)) mod m + * 0 ≤ eval out1 < m + * + */ +static FIAT_P256_FIAT_INLINE void fiat_p256_mul(fiat_p256_montgomery_domain_field_element out1, const fiat_p256_montgomery_domain_field_element arg1, const fiat_p256_montgomery_domain_field_element arg2) { + uint32_t x1; + uint32_t x2; + uint32_t x3; + uint32_t x4; + uint32_t x5; + uint32_t x6; + uint32_t x7; + uint32_t x8; + uint32_t x9; + uint32_t x10; + uint32_t x11; + uint32_t x12; + uint32_t x13; + uint32_t x14; + uint32_t x15; + uint32_t x16; + uint32_t x17; + uint32_t x18; + uint32_t x19; + uint32_t x20; + uint32_t x21; + uint32_t x22; + uint32_t x23; + uint32_t x24; + uint32_t x25; + fiat_p256_uint1 x26; + uint32_t x27; + fiat_p256_uint1 x28; + uint32_t x29; + fiat_p256_uint1 x30; + uint32_t x31; + fiat_p256_uint1 x32; + uint32_t x33; + fiat_p256_uint1 x34; + uint32_t x35; + fiat_p256_uint1 x36; + uint32_t x37; + fiat_p256_uint1 x38; + uint32_t x39; + uint32_t x40; + uint32_t x41; + uint32_t x42; + uint32_t x43; + uint32_t x44; + uint32_t x45; + uint32_t x46; + uint32_t x47; + uint32_t x48; + fiat_p256_uint1 x49; + uint32_t x50; + fiat_p256_uint1 x51; + uint32_t x52; + uint32_t x53; + fiat_p256_uint1 x54; + uint32_t x55; + fiat_p256_uint1 x56; + uint32_t x57; + fiat_p256_uint1 x58; + uint32_t x59; + fiat_p256_uint1 x60; + uint32_t x61; + fiat_p256_uint1 x62; + uint32_t x63; + fiat_p256_uint1 x64; + uint32_t x65; + fiat_p256_uint1 x66; + uint32_t x67; + fiat_p256_uint1 x68; + uint32_t x69; + fiat_p256_uint1 x70; + uint32_t x71; + uint32_t x72; + uint32_t x73; + uint32_t x74; + uint32_t x75; + uint32_t x76; + uint32_t x77; + uint32_t x78; + uint32_t x79; + uint32_t x80; + uint32_t x81; + uint32_t x82; + uint32_t x83; + uint32_t x84; + uint32_t x85; + uint32_t x86; + uint32_t x87; + fiat_p256_uint1 x88; + uint32_t x89; + fiat_p256_uint1 x90; + uint32_t x91; + fiat_p256_uint1 x92; + uint32_t x93; + fiat_p256_uint1 x94; + uint32_t x95; + fiat_p256_uint1 x96; + uint32_t x97; + fiat_p256_uint1 x98; + uint32_t x99; + fiat_p256_uint1 x100; + uint32_t x101; + uint32_t x102; + fiat_p256_uint1 x103; + uint32_t x104; + fiat_p256_uint1 x105; + uint32_t x106; + fiat_p256_uint1 x107; + uint32_t x108; + fiat_p256_uint1 x109; + uint32_t x110; + fiat_p256_uint1 x111; + uint32_t x112; + fiat_p256_uint1 x113; + uint32_t x114; + fiat_p256_uint1 x115; + uint32_t x116; + fiat_p256_uint1 x117; + uint32_t x118; + fiat_p256_uint1 x119; + uint32_t x120; + uint32_t x121; + uint32_t x122; + uint32_t x123; + uint32_t x124; + uint32_t x125; + uint32_t x126; + uint32_t x127; + uint32_t x128; + fiat_p256_uint1 x129; + uint32_t x130; + fiat_p256_uint1 x131; + uint32_t x132; + uint32_t x133; + fiat_p256_uint1 x134; + uint32_t x135; + fiat_p256_uint1 x136; + uint32_t x137; + fiat_p256_uint1 x138; + uint32_t x139; + fiat_p256_uint1 x140; + uint32_t x141; + fiat_p256_uint1 x142; + uint32_t x143; + fiat_p256_uint1 x144; + uint32_t x145; + fiat_p256_uint1 x146; + uint32_t x147; + fiat_p256_uint1 x148; + uint32_t x149; + fiat_p256_uint1 x150; + uint32_t x151; + uint32_t x152; + uint32_t x153; + uint32_t x154; + uint32_t x155; + uint32_t x156; + uint32_t x157; + uint32_t x158; + uint32_t x159; + uint32_t x160; + uint32_t x161; + uint32_t x162; + uint32_t x163; + uint32_t x164; + uint32_t x165; + uint32_t x166; + uint32_t x167; + uint32_t x168; + fiat_p256_uint1 x169; + uint32_t x170; + fiat_p256_uint1 
x171; + uint32_t x172; + fiat_p256_uint1 x173; + uint32_t x174; + fiat_p256_uint1 x175; + uint32_t x176; + fiat_p256_uint1 x177; + uint32_t x178; + fiat_p256_uint1 x179; + uint32_t x180; + fiat_p256_uint1 x181; + uint32_t x182; + uint32_t x183; + fiat_p256_uint1 x184; + uint32_t x185; + fiat_p256_uint1 x186; + uint32_t x187; + fiat_p256_uint1 x188; + uint32_t x189; + fiat_p256_uint1 x190; + uint32_t x191; + fiat_p256_uint1 x192; + uint32_t x193; + fiat_p256_uint1 x194; + uint32_t x195; + fiat_p256_uint1 x196; + uint32_t x197; + fiat_p256_uint1 x198; + uint32_t x199; + fiat_p256_uint1 x200; + uint32_t x201; + uint32_t x202; + uint32_t x203; + uint32_t x204; + uint32_t x205; + uint32_t x206; + uint32_t x207; + uint32_t x208; + uint32_t x209; + fiat_p256_uint1 x210; + uint32_t x211; + fiat_p256_uint1 x212; + uint32_t x213; + uint32_t x214; + fiat_p256_uint1 x215; + uint32_t x216; + fiat_p256_uint1 x217; + uint32_t x218; + fiat_p256_uint1 x219; + uint32_t x220; + fiat_p256_uint1 x221; + uint32_t x222; + fiat_p256_uint1 x223; + uint32_t x224; + fiat_p256_uint1 x225; + uint32_t x226; + fiat_p256_uint1 x227; + uint32_t x228; + fiat_p256_uint1 x229; + uint32_t x230; + fiat_p256_uint1 x231; + uint32_t x232; + uint32_t x233; + uint32_t x234; + uint32_t x235; + uint32_t x236; + uint32_t x237; + uint32_t x238; + uint32_t x239; + uint32_t x240; + uint32_t x241; + uint32_t x242; + uint32_t x243; + uint32_t x244; + uint32_t x245; + uint32_t x246; + uint32_t x247; + uint32_t x248; + uint32_t x249; + fiat_p256_uint1 x250; + uint32_t x251; + fiat_p256_uint1 x252; + uint32_t x253; + fiat_p256_uint1 x254; + uint32_t x255; + fiat_p256_uint1 x256; + uint32_t x257; + fiat_p256_uint1 x258; + uint32_t x259; + fiat_p256_uint1 x260; + uint32_t x261; + fiat_p256_uint1 x262; + uint32_t x263; + uint32_t x264; + fiat_p256_uint1 x265; + uint32_t x266; + fiat_p256_uint1 x267; + uint32_t x268; + fiat_p256_uint1 x269; + uint32_t x270; + fiat_p256_uint1 x271; + uint32_t x272; + fiat_p256_uint1 x273; + uint32_t x274; + fiat_p256_uint1 x275; + uint32_t x276; + fiat_p256_uint1 x277; + uint32_t x278; + fiat_p256_uint1 x279; + uint32_t x280; + fiat_p256_uint1 x281; + uint32_t x282; + uint32_t x283; + uint32_t x284; + uint32_t x285; + uint32_t x286; + uint32_t x287; + uint32_t x288; + uint32_t x289; + uint32_t x290; + fiat_p256_uint1 x291; + uint32_t x292; + fiat_p256_uint1 x293; + uint32_t x294; + uint32_t x295; + fiat_p256_uint1 x296; + uint32_t x297; + fiat_p256_uint1 x298; + uint32_t x299; + fiat_p256_uint1 x300; + uint32_t x301; + fiat_p256_uint1 x302; + uint32_t x303; + fiat_p256_uint1 x304; + uint32_t x305; + fiat_p256_uint1 x306; + uint32_t x307; + fiat_p256_uint1 x308; + uint32_t x309; + fiat_p256_uint1 x310; + uint32_t x311; + fiat_p256_uint1 x312; + uint32_t x313; + uint32_t x314; + uint32_t x315; + uint32_t x316; + uint32_t x317; + uint32_t x318; + uint32_t x319; + uint32_t x320; + uint32_t x321; + uint32_t x322; + uint32_t x323; + uint32_t x324; + uint32_t x325; + uint32_t x326; + uint32_t x327; + uint32_t x328; + uint32_t x329; + uint32_t x330; + fiat_p256_uint1 x331; + uint32_t x332; + fiat_p256_uint1 x333; + uint32_t x334; + fiat_p256_uint1 x335; + uint32_t x336; + fiat_p256_uint1 x337; + uint32_t x338; + fiat_p256_uint1 x339; + uint32_t x340; + fiat_p256_uint1 x341; + uint32_t x342; + fiat_p256_uint1 x343; + uint32_t x344; + uint32_t x345; + fiat_p256_uint1 x346; + uint32_t x347; + fiat_p256_uint1 x348; + uint32_t x349; + fiat_p256_uint1 x350; + uint32_t x351; + fiat_p256_uint1 x352; + uint32_t x353; + 
fiat_p256_uint1 x354; + uint32_t x355; + fiat_p256_uint1 x356; + uint32_t x357; + fiat_p256_uint1 x358; + uint32_t x359; + fiat_p256_uint1 x360; + uint32_t x361; + fiat_p256_uint1 x362; + uint32_t x363; + uint32_t x364; + uint32_t x365; + uint32_t x366; + uint32_t x367; + uint32_t x368; + uint32_t x369; + uint32_t x370; + uint32_t x371; + fiat_p256_uint1 x372; + uint32_t x373; + fiat_p256_uint1 x374; + uint32_t x375; + uint32_t x376; + fiat_p256_uint1 x377; + uint32_t x378; + fiat_p256_uint1 x379; + uint32_t x380; + fiat_p256_uint1 x381; + uint32_t x382; + fiat_p256_uint1 x383; + uint32_t x384; + fiat_p256_uint1 x385; + uint32_t x386; + fiat_p256_uint1 x387; + uint32_t x388; + fiat_p256_uint1 x389; + uint32_t x390; + fiat_p256_uint1 x391; + uint32_t x392; + fiat_p256_uint1 x393; + uint32_t x394; + uint32_t x395; + uint32_t x396; + uint32_t x397; + uint32_t x398; + uint32_t x399; + uint32_t x400; + uint32_t x401; + uint32_t x402; + uint32_t x403; + uint32_t x404; + uint32_t x405; + uint32_t x406; + uint32_t x407; + uint32_t x408; + uint32_t x409; + uint32_t x410; + uint32_t x411; + fiat_p256_uint1 x412; + uint32_t x413; + fiat_p256_uint1 x414; + uint32_t x415; + fiat_p256_uint1 x416; + uint32_t x417; + fiat_p256_uint1 x418; + uint32_t x419; + fiat_p256_uint1 x420; + uint32_t x421; + fiat_p256_uint1 x422; + uint32_t x423; + fiat_p256_uint1 x424; + uint32_t x425; + uint32_t x426; + fiat_p256_uint1 x427; + uint32_t x428; + fiat_p256_uint1 x429; + uint32_t x430; + fiat_p256_uint1 x431; + uint32_t x432; + fiat_p256_uint1 x433; + uint32_t x434; + fiat_p256_uint1 x435; + uint32_t x436; + fiat_p256_uint1 x437; + uint32_t x438; + fiat_p256_uint1 x439; + uint32_t x440; + fiat_p256_uint1 x441; + uint32_t x442; + fiat_p256_uint1 x443; + uint32_t x444; + uint32_t x445; + uint32_t x446; + uint32_t x447; + uint32_t x448; + uint32_t x449; + uint32_t x450; + uint32_t x451; + uint32_t x452; + fiat_p256_uint1 x453; + uint32_t x454; + fiat_p256_uint1 x455; + uint32_t x456; + uint32_t x457; + fiat_p256_uint1 x458; + uint32_t x459; + fiat_p256_uint1 x460; + uint32_t x461; + fiat_p256_uint1 x462; + uint32_t x463; + fiat_p256_uint1 x464; + uint32_t x465; + fiat_p256_uint1 x466; + uint32_t x467; + fiat_p256_uint1 x468; + uint32_t x469; + fiat_p256_uint1 x470; + uint32_t x471; + fiat_p256_uint1 x472; + uint32_t x473; + fiat_p256_uint1 x474; + uint32_t x475; + uint32_t x476; + uint32_t x477; + uint32_t x478; + uint32_t x479; + uint32_t x480; + uint32_t x481; + uint32_t x482; + uint32_t x483; + uint32_t x484; + uint32_t x485; + uint32_t x486; + uint32_t x487; + uint32_t x488; + uint32_t x489; + uint32_t x490; + uint32_t x491; + uint32_t x492; + fiat_p256_uint1 x493; + uint32_t x494; + fiat_p256_uint1 x495; + uint32_t x496; + fiat_p256_uint1 x497; + uint32_t x498; + fiat_p256_uint1 x499; + uint32_t x500; + fiat_p256_uint1 x501; + uint32_t x502; + fiat_p256_uint1 x503; + uint32_t x504; + fiat_p256_uint1 x505; + uint32_t x506; + uint32_t x507; + fiat_p256_uint1 x508; + uint32_t x509; + fiat_p256_uint1 x510; + uint32_t x511; + fiat_p256_uint1 x512; + uint32_t x513; + fiat_p256_uint1 x514; + uint32_t x515; + fiat_p256_uint1 x516; + uint32_t x517; + fiat_p256_uint1 x518; + uint32_t x519; + fiat_p256_uint1 x520; + uint32_t x521; + fiat_p256_uint1 x522; + uint32_t x523; + fiat_p256_uint1 x524; + uint32_t x525; + uint32_t x526; + uint32_t x527; + uint32_t x528; + uint32_t x529; + uint32_t x530; + uint32_t x531; + uint32_t x532; + uint32_t x533; + fiat_p256_uint1 x534; + uint32_t x535; + fiat_p256_uint1 x536; + uint32_t x537; + 
uint32_t x538; + fiat_p256_uint1 x539; + uint32_t x540; + fiat_p256_uint1 x541; + uint32_t x542; + fiat_p256_uint1 x543; + uint32_t x544; + fiat_p256_uint1 x545; + uint32_t x546; + fiat_p256_uint1 x547; + uint32_t x548; + fiat_p256_uint1 x549; + uint32_t x550; + fiat_p256_uint1 x551; + uint32_t x552; + fiat_p256_uint1 x553; + uint32_t x554; + fiat_p256_uint1 x555; + uint32_t x556; + uint32_t x557; + uint32_t x558; + uint32_t x559; + uint32_t x560; + uint32_t x561; + uint32_t x562; + uint32_t x563; + uint32_t x564; + uint32_t x565; + uint32_t x566; + uint32_t x567; + uint32_t x568; + uint32_t x569; + uint32_t x570; + uint32_t x571; + uint32_t x572; + uint32_t x573; + fiat_p256_uint1 x574; + uint32_t x575; + fiat_p256_uint1 x576; + uint32_t x577; + fiat_p256_uint1 x578; + uint32_t x579; + fiat_p256_uint1 x580; + uint32_t x581; + fiat_p256_uint1 x582; + uint32_t x583; + fiat_p256_uint1 x584; + uint32_t x585; + fiat_p256_uint1 x586; + uint32_t x587; + uint32_t x588; + fiat_p256_uint1 x589; + uint32_t x590; + fiat_p256_uint1 x591; + uint32_t x592; + fiat_p256_uint1 x593; + uint32_t x594; + fiat_p256_uint1 x595; + uint32_t x596; + fiat_p256_uint1 x597; + uint32_t x598; + fiat_p256_uint1 x599; + uint32_t x600; + fiat_p256_uint1 x601; + uint32_t x602; + fiat_p256_uint1 x603; + uint32_t x604; + fiat_p256_uint1 x605; + uint32_t x606; + uint32_t x607; + uint32_t x608; + uint32_t x609; + uint32_t x610; + uint32_t x611; + uint32_t x612; + uint32_t x613; + uint32_t x614; + fiat_p256_uint1 x615; + uint32_t x616; + fiat_p256_uint1 x617; + uint32_t x618; + uint32_t x619; + fiat_p256_uint1 x620; + uint32_t x621; + fiat_p256_uint1 x622; + uint32_t x623; + fiat_p256_uint1 x624; + uint32_t x625; + fiat_p256_uint1 x626; + uint32_t x627; + fiat_p256_uint1 x628; + uint32_t x629; + fiat_p256_uint1 x630; + uint32_t x631; + fiat_p256_uint1 x632; + uint32_t x633; + fiat_p256_uint1 x634; + uint32_t x635; + fiat_p256_uint1 x636; + uint32_t x637; + uint32_t x638; + fiat_p256_uint1 x639; + uint32_t x640; + fiat_p256_uint1 x641; + uint32_t x642; + fiat_p256_uint1 x643; + uint32_t x644; + fiat_p256_uint1 x645; + uint32_t x646; + fiat_p256_uint1 x647; + uint32_t x648; + fiat_p256_uint1 x649; + uint32_t x650; + fiat_p256_uint1 x651; + uint32_t x652; + fiat_p256_uint1 x653; + uint32_t x654; + fiat_p256_uint1 x655; + uint32_t x656; + uint32_t x657; + uint32_t x658; + uint32_t x659; + uint32_t x660; + uint32_t x661; + uint32_t x662; + uint32_t x663; + x1 = (arg1[1]); + x2 = (arg1[2]); + x3 = (arg1[3]); + x4 = (arg1[4]); + x5 = (arg1[5]); + x6 = (arg1[6]); + x7 = (arg1[7]); + x8 = (arg1[0]); + fiat_p256_mulx_u32(&x9, &x10, x8, (arg2[7])); + fiat_p256_mulx_u32(&x11, &x12, x8, (arg2[6])); + fiat_p256_mulx_u32(&x13, &x14, x8, (arg2[5])); + fiat_p256_mulx_u32(&x15, &x16, x8, (arg2[4])); + fiat_p256_mulx_u32(&x17, &x18, x8, (arg2[3])); + fiat_p256_mulx_u32(&x19, &x20, x8, (arg2[2])); + fiat_p256_mulx_u32(&x21, &x22, x8, (arg2[1])); + fiat_p256_mulx_u32(&x23, &x24, x8, (arg2[0])); + fiat_p256_addcarryx_u32(&x25, &x26, 0x0, x24, x21); + fiat_p256_addcarryx_u32(&x27, &x28, x26, x22, x19); + fiat_p256_addcarryx_u32(&x29, &x30, x28, x20, x17); + fiat_p256_addcarryx_u32(&x31, &x32, x30, x18, x15); + fiat_p256_addcarryx_u32(&x33, &x34, x32, x16, x13); + fiat_p256_addcarryx_u32(&x35, &x36, x34, x14, x11); + fiat_p256_addcarryx_u32(&x37, &x38, x36, x12, x9); + x39 = (x38 + x10); + fiat_p256_mulx_u32(&x40, &x41, x23, UINT32_C(0xffffffff)); + fiat_p256_mulx_u32(&x42, &x43, x23, UINT32_C(0xffffffff)); + fiat_p256_mulx_u32(&x44, &x45, x23, 
UINT32_C(0xffffffff)); + fiat_p256_mulx_u32(&x46, &x47, x23, UINT32_C(0xffffffff)); + fiat_p256_addcarryx_u32(&x48, &x49, 0x0, x47, x44); + fiat_p256_addcarryx_u32(&x50, &x51, x49, x45, x42); + x52 = (x51 + x43); + fiat_p256_addcarryx_u32(&x53, &x54, 0x0, x23, x46); + fiat_p256_addcarryx_u32(&x55, &x56, x54, x25, x48); + fiat_p256_addcarryx_u32(&x57, &x58, x56, x27, x50); + fiat_p256_addcarryx_u32(&x59, &x60, x58, x29, x52); + fiat_p256_addcarryx_u32(&x61, &x62, x60, x31, 0x0); + fiat_p256_addcarryx_u32(&x63, &x64, x62, x33, 0x0); + fiat_p256_addcarryx_u32(&x65, &x66, x64, x35, x23); + fiat_p256_addcarryx_u32(&x67, &x68, x66, x37, x40); + fiat_p256_addcarryx_u32(&x69, &x70, x68, x39, x41); + fiat_p256_mulx_u32(&x71, &x72, x1, (arg2[7])); + fiat_p256_mulx_u32(&x73, &x74, x1, (arg2[6])); + fiat_p256_mulx_u32(&x75, &x76, x1, (arg2[5])); + fiat_p256_mulx_u32(&x77, &x78, x1, (arg2[4])); + fiat_p256_mulx_u32(&x79, &x80, x1, (arg2[3])); + fiat_p256_mulx_u32(&x81, &x82, x1, (arg2[2])); + fiat_p256_mulx_u32(&x83, &x84, x1, (arg2[1])); + fiat_p256_mulx_u32(&x85, &x86, x1, (arg2[0])); + fiat_p256_addcarryx_u32(&x87, &x88, 0x0, x86, x83); + fiat_p256_addcarryx_u32(&x89, &x90, x88, x84, x81); + fiat_p256_addcarryx_u32(&x91, &x92, x90, x82, x79); + fiat_p256_addcarryx_u32(&x93, &x94, x92, x80, x77); + fiat_p256_addcarryx_u32(&x95, &x96, x94, x78, x75); + fiat_p256_addcarryx_u32(&x97, &x98, x96, x76, x73); + fiat_p256_addcarryx_u32(&x99, &x100, x98, x74, x71); + x101 = (x100 + x72); + fiat_p256_addcarryx_u32(&x102, &x103, 0x0, x55, x85); + fiat_p256_addcarryx_u32(&x104, &x105, x103, x57, x87); + fiat_p256_addcarryx_u32(&x106, &x107, x105, x59, x89); + fiat_p256_addcarryx_u32(&x108, &x109, x107, x61, x91); + fiat_p256_addcarryx_u32(&x110, &x111, x109, x63, x93); + fiat_p256_addcarryx_u32(&x112, &x113, x111, x65, x95); + fiat_p256_addcarryx_u32(&x114, &x115, x113, x67, x97); + fiat_p256_addcarryx_u32(&x116, &x117, x115, x69, x99); + fiat_p256_addcarryx_u32(&x118, &x119, x117, x70, x101); + fiat_p256_mulx_u32(&x120, &x121, x102, UINT32_C(0xffffffff)); + fiat_p256_mulx_u32(&x122, &x123, x102, UINT32_C(0xffffffff)); + fiat_p256_mulx_u32(&x124, &x125, x102, UINT32_C(0xffffffff)); + fiat_p256_mulx_u32(&x126, &x127, x102, UINT32_C(0xffffffff)); + fiat_p256_addcarryx_u32(&x128, &x129, 0x0, x127, x124); + fiat_p256_addcarryx_u32(&x130, &x131, x129, x125, x122); + x132 = (x131 + x123); + fiat_p256_addcarryx_u32(&x133, &x134, 0x0, x102, x126); + fiat_p256_addcarryx_u32(&x135, &x136, x134, x104, x128); + fiat_p256_addcarryx_u32(&x137, &x138, x136, x106, x130); + fiat_p256_addcarryx_u32(&x139, &x140, x138, x108, x132); + fiat_p256_addcarryx_u32(&x141, &x142, x140, x110, 0x0); + fiat_p256_addcarryx_u32(&x143, &x144, x142, x112, 0x0); + fiat_p256_addcarryx_u32(&x145, &x146, x144, x114, x102); + fiat_p256_addcarryx_u32(&x147, &x148, x146, x116, x120); + fiat_p256_addcarryx_u32(&x149, &x150, x148, x118, x121); + x151 = ((uint32_t)x150 + x119); + fiat_p256_mulx_u32(&x152, &x153, x2, (arg2[7])); + fiat_p256_mulx_u32(&x154, &x155, x2, (arg2[6])); + fiat_p256_mulx_u32(&x156, &x157, x2, (arg2[5])); + fiat_p256_mulx_u32(&x158, &x159, x2, (arg2[4])); + fiat_p256_mulx_u32(&x160, &x161, x2, (arg2[3])); + fiat_p256_mulx_u32(&x162, &x163, x2, (arg2[2])); + fiat_p256_mulx_u32(&x164, &x165, x2, (arg2[1])); + fiat_p256_mulx_u32(&x166, &x167, x2, (arg2[0])); + fiat_p256_addcarryx_u32(&x168, &x169, 0x0, x167, x164); + fiat_p256_addcarryx_u32(&x170, &x171, x169, x165, x162); + fiat_p256_addcarryx_u32(&x172, &x173, x171, x163, x160); + 
fiat_p256_addcarryx_u32(&x174, &x175, x173, x161, x158); + fiat_p256_addcarryx_u32(&x176, &x177, x175, x159, x156); + fiat_p256_addcarryx_u32(&x178, &x179, x177, x157, x154); + fiat_p256_addcarryx_u32(&x180, &x181, x179, x155, x152); + x182 = (x181 + x153); + fiat_p256_addcarryx_u32(&x183, &x184, 0x0, x135, x166); + fiat_p256_addcarryx_u32(&x185, &x186, x184, x137, x168); + fiat_p256_addcarryx_u32(&x187, &x188, x186, x139, x170); + fiat_p256_addcarryx_u32(&x189, &x190, x188, x141, x172); + fiat_p256_addcarryx_u32(&x191, &x192, x190, x143, x174); + fiat_p256_addcarryx_u32(&x193, &x194, x192, x145, x176); + fiat_p256_addcarryx_u32(&x195, &x196, x194, x147, x178); + fiat_p256_addcarryx_u32(&x197, &x198, x196, x149, x180); + fiat_p256_addcarryx_u32(&x199, &x200, x198, x151, x182); + fiat_p256_mulx_u32(&x201, &x202, x183, UINT32_C(0xffffffff)); + fiat_p256_mulx_u32(&x203, &x204, x183, UINT32_C(0xffffffff)); + fiat_p256_mulx_u32(&x205, &x206, x183, UINT32_C(0xffffffff)); + fiat_p256_mulx_u32(&x207, &x208, x183, UINT32_C(0xffffffff)); + fiat_p256_addcarryx_u32(&x209, &x210, 0x0, x208, x205); + fiat_p256_addcarryx_u32(&x211, &x212, x210, x206, x203); + x213 = (x212 + x204); + fiat_p256_addcarryx_u32(&x214, &x215, 0x0, x183, x207); + fiat_p256_addcarryx_u32(&x216, &x217, x215, x185, x209); + fiat_p256_addcarryx_u32(&x218, &x219, x217, x187, x211); + fiat_p256_addcarryx_u32(&x220, &x221, x219, x189, x213); + fiat_p256_addcarryx_u32(&x222, &x223, x221, x191, 0x0); + fiat_p256_addcarryx_u32(&x224, &x225, x223, x193, 0x0); + fiat_p256_addcarryx_u32(&x226, &x227, x225, x195, x183); + fiat_p256_addcarryx_u32(&x228, &x229, x227, x197, x201); + fiat_p256_addcarryx_u32(&x230, &x231, x229, x199, x202); + x232 = ((uint32_t)x231 + x200); + fiat_p256_mulx_u32(&x233, &x234, x3, (arg2[7])); + fiat_p256_mulx_u32(&x235, &x236, x3, (arg2[6])); + fiat_p256_mulx_u32(&x237, &x238, x3, (arg2[5])); + fiat_p256_mulx_u32(&x239, &x240, x3, (arg2[4])); + fiat_p256_mulx_u32(&x241, &x242, x3, (arg2[3])); + fiat_p256_mulx_u32(&x243, &x244, x3, (arg2[2])); + fiat_p256_mulx_u32(&x245, &x246, x3, (arg2[1])); + fiat_p256_mulx_u32(&x247, &x248, x3, (arg2[0])); + fiat_p256_addcarryx_u32(&x249, &x250, 0x0, x248, x245); + fiat_p256_addcarryx_u32(&x251, &x252, x250, x246, x243); + fiat_p256_addcarryx_u32(&x253, &x254, x252, x244, x241); + fiat_p256_addcarryx_u32(&x255, &x256, x254, x242, x239); + fiat_p256_addcarryx_u32(&x257, &x258, x256, x240, x237); + fiat_p256_addcarryx_u32(&x259, &x260, x258, x238, x235); + fiat_p256_addcarryx_u32(&x261, &x262, x260, x236, x233); + x263 = (x262 + x234); + fiat_p256_addcarryx_u32(&x264, &x265, 0x0, x216, x247); + fiat_p256_addcarryx_u32(&x266, &x267, x265, x218, x249); + fiat_p256_addcarryx_u32(&x268, &x269, x267, x220, x251); + fiat_p256_addcarryx_u32(&x270, &x271, x269, x222, x253); + fiat_p256_addcarryx_u32(&x272, &x273, x271, x224, x255); + fiat_p256_addcarryx_u32(&x274, &x275, x273, x226, x257); + fiat_p256_addcarryx_u32(&x276, &x277, x275, x228, x259); + fiat_p256_addcarryx_u32(&x278, &x279, x277, x230, x261); + fiat_p256_addcarryx_u32(&x280, &x281, x279, x232, x263); + fiat_p256_mulx_u32(&x282, &x283, x264, UINT32_C(0xffffffff)); + fiat_p256_mulx_u32(&x284, &x285, x264, UINT32_C(0xffffffff)); + fiat_p256_mulx_u32(&x286, &x287, x264, UINT32_C(0xffffffff)); + fiat_p256_mulx_u32(&x288, &x289, x264, UINT32_C(0xffffffff)); + fiat_p256_addcarryx_u32(&x290, &x291, 0x0, x289, x286); + fiat_p256_addcarryx_u32(&x292, &x293, x291, x287, x284); + x294 = (x293 + x285); + fiat_p256_addcarryx_u32(&x295, 
&x296, 0x0, x264, x288); + fiat_p256_addcarryx_u32(&x297, &x298, x296, x266, x290); + fiat_p256_addcarryx_u32(&x299, &x300, x298, x268, x292); + fiat_p256_addcarryx_u32(&x301, &x302, x300, x270, x294); + fiat_p256_addcarryx_u32(&x303, &x304, x302, x272, 0x0); + fiat_p256_addcarryx_u32(&x305, &x306, x304, x274, 0x0); + fiat_p256_addcarryx_u32(&x307, &x308, x306, x276, x264); + fiat_p256_addcarryx_u32(&x309, &x310, x308, x278, x282); + fiat_p256_addcarryx_u32(&x311, &x312, x310, x280, x283); + x313 = ((uint32_t)x312 + x281); + fiat_p256_mulx_u32(&x314, &x315, x4, (arg2[7])); + fiat_p256_mulx_u32(&x316, &x317, x4, (arg2[6])); + fiat_p256_mulx_u32(&x318, &x319, x4, (arg2[5])); + fiat_p256_mulx_u32(&x320, &x321, x4, (arg2[4])); + fiat_p256_mulx_u32(&x322, &x323, x4, (arg2[3])); + fiat_p256_mulx_u32(&x324, &x325, x4, (arg2[2])); + fiat_p256_mulx_u32(&x326, &x327, x4, (arg2[1])); + fiat_p256_mulx_u32(&x328, &x329, x4, (arg2[0])); + fiat_p256_addcarryx_u32(&x330, &x331, 0x0, x329, x326); + fiat_p256_addcarryx_u32(&x332, &x333, x331, x327, x324); + fiat_p256_addcarryx_u32(&x334, &x335, x333, x325, x322); + fiat_p256_addcarryx_u32(&x336, &x337, x335, x323, x320); + fiat_p256_addcarryx_u32(&x338, &x339, x337, x321, x318); + fiat_p256_addcarryx_u32(&x340, &x341, x339, x319, x316); + fiat_p256_addcarryx_u32(&x342, &x343, x341, x317, x314); + x344 = (x343 + x315); + fiat_p256_addcarryx_u32(&x345, &x346, 0x0, x297, x328); + fiat_p256_addcarryx_u32(&x347, &x348, x346, x299, x330); + fiat_p256_addcarryx_u32(&x349, &x350, x348, x301, x332); + fiat_p256_addcarryx_u32(&x351, &x352, x350, x303, x334); + fiat_p256_addcarryx_u32(&x353, &x354, x352, x305, x336); + fiat_p256_addcarryx_u32(&x355, &x356, x354, x307, x338); + fiat_p256_addcarryx_u32(&x357, &x358, x356, x309, x340); + fiat_p256_addcarryx_u32(&x359, &x360, x358, x311, x342); + fiat_p256_addcarryx_u32(&x361, &x362, x360, x313, x344); + fiat_p256_mulx_u32(&x363, &x364, x345, UINT32_C(0xffffffff)); + fiat_p256_mulx_u32(&x365, &x366, x345, UINT32_C(0xffffffff)); + fiat_p256_mulx_u32(&x367, &x368, x345, UINT32_C(0xffffffff)); + fiat_p256_mulx_u32(&x369, &x370, x345, UINT32_C(0xffffffff)); + fiat_p256_addcarryx_u32(&x371, &x372, 0x0, x370, x367); + fiat_p256_addcarryx_u32(&x373, &x374, x372, x368, x365); + x375 = (x374 + x366); + fiat_p256_addcarryx_u32(&x376, &x377, 0x0, x345, x369); + fiat_p256_addcarryx_u32(&x378, &x379, x377, x347, x371); + fiat_p256_addcarryx_u32(&x380, &x381, x379, x349, x373); + fiat_p256_addcarryx_u32(&x382, &x383, x381, x351, x375); + fiat_p256_addcarryx_u32(&x384, &x385, x383, x353, 0x0); + fiat_p256_addcarryx_u32(&x386, &x387, x385, x355, 0x0); + fiat_p256_addcarryx_u32(&x388, &x389, x387, x357, x345); + fiat_p256_addcarryx_u32(&x390, &x391, x389, x359, x363); + fiat_p256_addcarryx_u32(&x392, &x393, x391, x361, x364); + x394 = ((uint32_t)x393 + x362); + fiat_p256_mulx_u32(&x395, &x396, x5, (arg2[7])); + fiat_p256_mulx_u32(&x397, &x398, x5, (arg2[6])); + fiat_p256_mulx_u32(&x399, &x400, x5, (arg2[5])); + fiat_p256_mulx_u32(&x401, &x402, x5, (arg2[4])); + fiat_p256_mulx_u32(&x403, &x404, x5, (arg2[3])); + fiat_p256_mulx_u32(&x405, &x406, x5, (arg2[2])); + fiat_p256_mulx_u32(&x407, &x408, x5, (arg2[1])); + fiat_p256_mulx_u32(&x409, &x410, x5, (arg2[0])); + fiat_p256_addcarryx_u32(&x411, &x412, 0x0, x410, x407); + fiat_p256_addcarryx_u32(&x413, &x414, x412, x408, x405); + fiat_p256_addcarryx_u32(&x415, &x416, x414, x406, x403); + fiat_p256_addcarryx_u32(&x417, &x418, x416, x404, x401); + fiat_p256_addcarryx_u32(&x419, &x420, x418, 
x402, x399); + fiat_p256_addcarryx_u32(&x421, &x422, x420, x400, x397); + fiat_p256_addcarryx_u32(&x423, &x424, x422, x398, x395); + x425 = (x424 + x396); + fiat_p256_addcarryx_u32(&x426, &x427, 0x0, x378, x409); + fiat_p256_addcarryx_u32(&x428, &x429, x427, x380, x411); + fiat_p256_addcarryx_u32(&x430, &x431, x429, x382, x413); + fiat_p256_addcarryx_u32(&x432, &x433, x431, x384, x415); + fiat_p256_addcarryx_u32(&x434, &x435, x433, x386, x417); + fiat_p256_addcarryx_u32(&x436, &x437, x435, x388, x419); + fiat_p256_addcarryx_u32(&x438, &x439, x437, x390, x421); + fiat_p256_addcarryx_u32(&x440, &x441, x439, x392, x423); + fiat_p256_addcarryx_u32(&x442, &x443, x441, x394, x425); + fiat_p256_mulx_u32(&x444, &x445, x426, UINT32_C(0xffffffff)); + fiat_p256_mulx_u32(&x446, &x447, x426, UINT32_C(0xffffffff)); + fiat_p256_mulx_u32(&x448, &x449, x426, UINT32_C(0xffffffff)); + fiat_p256_mulx_u32(&x450, &x451, x426, UINT32_C(0xffffffff)); + fiat_p256_addcarryx_u32(&x452, &x453, 0x0, x451, x448); + fiat_p256_addcarryx_u32(&x454, &x455, x453, x449, x446); + x456 = (x455 + x447); + fiat_p256_addcarryx_u32(&x457, &x458, 0x0, x426, x450); + fiat_p256_addcarryx_u32(&x459, &x460, x458, x428, x452); + fiat_p256_addcarryx_u32(&x461, &x462, x460, x430, x454); + fiat_p256_addcarryx_u32(&x463, &x464, x462, x432, x456); + fiat_p256_addcarryx_u32(&x465, &x466, x464, x434, 0x0); + fiat_p256_addcarryx_u32(&x467, &x468, x466, x436, 0x0); + fiat_p256_addcarryx_u32(&x469, &x470, x468, x438, x426); + fiat_p256_addcarryx_u32(&x471, &x472, x470, x440, x444); + fiat_p256_addcarryx_u32(&x473, &x474, x472, x442, x445); + x475 = ((uint32_t)x474 + x443); + fiat_p256_mulx_u32(&x476, &x477, x6, (arg2[7])); + fiat_p256_mulx_u32(&x478, &x479, x6, (arg2[6])); + fiat_p256_mulx_u32(&x480, &x481, x6, (arg2[5])); + fiat_p256_mulx_u32(&x482, &x483, x6, (arg2[4])); + fiat_p256_mulx_u32(&x484, &x485, x6, (arg2[3])); + fiat_p256_mulx_u32(&x486, &x487, x6, (arg2[2])); + fiat_p256_mulx_u32(&x488, &x489, x6, (arg2[1])); + fiat_p256_mulx_u32(&x490, &x491, x6, (arg2[0])); + fiat_p256_addcarryx_u32(&x492, &x493, 0x0, x491, x488); + fiat_p256_addcarryx_u32(&x494, &x495, x493, x489, x486); + fiat_p256_addcarryx_u32(&x496, &x497, x495, x487, x484); + fiat_p256_addcarryx_u32(&x498, &x499, x497, x485, x482); + fiat_p256_addcarryx_u32(&x500, &x501, x499, x483, x480); + fiat_p256_addcarryx_u32(&x502, &x503, x501, x481, x478); + fiat_p256_addcarryx_u32(&x504, &x505, x503, x479, x476); + x506 = (x505 + x477); + fiat_p256_addcarryx_u32(&x507, &x508, 0x0, x459, x490); + fiat_p256_addcarryx_u32(&x509, &x510, x508, x461, x492); + fiat_p256_addcarryx_u32(&x511, &x512, x510, x463, x494); + fiat_p256_addcarryx_u32(&x513, &x514, x512, x465, x496); + fiat_p256_addcarryx_u32(&x515, &x516, x514, x467, x498); + fiat_p256_addcarryx_u32(&x517, &x518, x516, x469, x500); + fiat_p256_addcarryx_u32(&x519, &x520, x518, x471, x502); + fiat_p256_addcarryx_u32(&x521, &x522, x520, x473, x504); + fiat_p256_addcarryx_u32(&x523, &x524, x522, x475, x506); + fiat_p256_mulx_u32(&x525, &x526, x507, UINT32_C(0xffffffff)); + fiat_p256_mulx_u32(&x527, &x528, x507, UINT32_C(0xffffffff)); + fiat_p256_mulx_u32(&x529, &x530, x507, UINT32_C(0xffffffff)); + fiat_p256_mulx_u32(&x531, &x532, x507, UINT32_C(0xffffffff)); + fiat_p256_addcarryx_u32(&x533, &x534, 0x0, x532, x529); + fiat_p256_addcarryx_u32(&x535, &x536, x534, x530, x527); + x537 = (x536 + x528); + fiat_p256_addcarryx_u32(&x538, &x539, 0x0, x507, x531); + fiat_p256_addcarryx_u32(&x540, &x541, x539, x509, x533); + 
fiat_p256_addcarryx_u32(&x542, &x543, x541, x511, x535); + fiat_p256_addcarryx_u32(&x544, &x545, x543, x513, x537); + fiat_p256_addcarryx_u32(&x546, &x547, x545, x515, 0x0); + fiat_p256_addcarryx_u32(&x548, &x549, x547, x517, 0x0); + fiat_p256_addcarryx_u32(&x550, &x551, x549, x519, x507); + fiat_p256_addcarryx_u32(&x552, &x553, x551, x521, x525); + fiat_p256_addcarryx_u32(&x554, &x555, x553, x523, x526); + x556 = ((uint32_t)x555 + x524); + fiat_p256_mulx_u32(&x557, &x558, x7, (arg2[7])); + fiat_p256_mulx_u32(&x559, &x560, x7, (arg2[6])); + fiat_p256_mulx_u32(&x561, &x562, x7, (arg2[5])); + fiat_p256_mulx_u32(&x563, &x564, x7, (arg2[4])); + fiat_p256_mulx_u32(&x565, &x566, x7, (arg2[3])); + fiat_p256_mulx_u32(&x567, &x568, x7, (arg2[2])); + fiat_p256_mulx_u32(&x569, &x570, x7, (arg2[1])); + fiat_p256_mulx_u32(&x571, &x572, x7, (arg2[0])); + fiat_p256_addcarryx_u32(&x573, &x574, 0x0, x572, x569); + fiat_p256_addcarryx_u32(&x575, &x576, x574, x570, x567); + fiat_p256_addcarryx_u32(&x577, &x578, x576, x568, x565); + fiat_p256_addcarryx_u32(&x579, &x580, x578, x566, x563); + fiat_p256_addcarryx_u32(&x581, &x582, x580, x564, x561); + fiat_p256_addcarryx_u32(&x583, &x584, x582, x562, x559); + fiat_p256_addcarryx_u32(&x585, &x586, x584, x560, x557); + x587 = (x586 + x558); + fiat_p256_addcarryx_u32(&x588, &x589, 0x0, x540, x571); + fiat_p256_addcarryx_u32(&x590, &x591, x589, x542, x573); + fiat_p256_addcarryx_u32(&x592, &x593, x591, x544, x575); + fiat_p256_addcarryx_u32(&x594, &x595, x593, x546, x577); + fiat_p256_addcarryx_u32(&x596, &x597, x595, x548, x579); + fiat_p256_addcarryx_u32(&x598, &x599, x597, x550, x581); + fiat_p256_addcarryx_u32(&x600, &x601, x599, x552, x583); + fiat_p256_addcarryx_u32(&x602, &x603, x601, x554, x585); + fiat_p256_addcarryx_u32(&x604, &x605, x603, x556, x587); + fiat_p256_mulx_u32(&x606, &x607, x588, UINT32_C(0xffffffff)); + fiat_p256_mulx_u32(&x608, &x609, x588, UINT32_C(0xffffffff)); + fiat_p256_mulx_u32(&x610, &x611, x588, UINT32_C(0xffffffff)); + fiat_p256_mulx_u32(&x612, &x613, x588, UINT32_C(0xffffffff)); + fiat_p256_addcarryx_u32(&x614, &x615, 0x0, x613, x610); + fiat_p256_addcarryx_u32(&x616, &x617, x615, x611, x608); + x618 = (x617 + x609); + fiat_p256_addcarryx_u32(&x619, &x620, 0x0, x588, x612); + fiat_p256_addcarryx_u32(&x621, &x622, x620, x590, x614); + fiat_p256_addcarryx_u32(&x623, &x624, x622, x592, x616); + fiat_p256_addcarryx_u32(&x625, &x626, x624, x594, x618); + fiat_p256_addcarryx_u32(&x627, &x628, x626, x596, 0x0); + fiat_p256_addcarryx_u32(&x629, &x630, x628, x598, 0x0); + fiat_p256_addcarryx_u32(&x631, &x632, x630, x600, x588); + fiat_p256_addcarryx_u32(&x633, &x634, x632, x602, x606); + fiat_p256_addcarryx_u32(&x635, &x636, x634, x604, x607); + x637 = ((uint32_t)x636 + x605); + fiat_p256_subborrowx_u32(&x638, &x639, 0x0, x621, UINT32_C(0xffffffff)); + fiat_p256_subborrowx_u32(&x640, &x641, x639, x623, UINT32_C(0xffffffff)); + fiat_p256_subborrowx_u32(&x642, &x643, x641, x625, UINT32_C(0xffffffff)); + fiat_p256_subborrowx_u32(&x644, &x645, x643, x627, 0x0); + fiat_p256_subborrowx_u32(&x646, &x647, x645, x629, 0x0); + fiat_p256_subborrowx_u32(&x648, &x649, x647, x631, 0x0); + fiat_p256_subborrowx_u32(&x650, &x651, x649, x633, 0x1); + fiat_p256_subborrowx_u32(&x652, &x653, x651, x635, UINT32_C(0xffffffff)); + fiat_p256_subborrowx_u32(&x654, &x655, x653, x637, 0x0); + fiat_p256_cmovznz_u32(&x656, x655, x638, x621); + fiat_p256_cmovznz_u32(&x657, x655, x640, x623); + fiat_p256_cmovznz_u32(&x658, x655, x642, x625); + fiat_p256_cmovznz_u32(&x659, 
x655, x644, x627); + fiat_p256_cmovznz_u32(&x660, x655, x646, x629); + fiat_p256_cmovznz_u32(&x661, x655, x648, x631); + fiat_p256_cmovznz_u32(&x662, x655, x650, x633); + fiat_p256_cmovznz_u32(&x663, x655, x652, x635); + out1[0] = x656; + out1[1] = x657; + out1[2] = x658; + out1[3] = x659; + out1[4] = x660; + out1[5] = x661; + out1[6] = x662; + out1[7] = x663; +} + +/* + * The function fiat_p256_square squares a field element in the Montgomery domain. + * + * Preconditions: + * 0 ≤ eval arg1 < m + * Postconditions: + * eval (from_montgomery out1) mod m = (eval (from_montgomery arg1) * eval (from_montgomery arg1)) mod m + * 0 ≤ eval out1 < m + * + */ +static FIAT_P256_FIAT_INLINE void fiat_p256_square(fiat_p256_montgomery_domain_field_element out1, const fiat_p256_montgomery_domain_field_element arg1) { + uint32_t x1; + uint32_t x2; + uint32_t x3; + uint32_t x4; + uint32_t x5; + uint32_t x6; + uint32_t x7; + uint32_t x8; + uint32_t x9; + uint32_t x10; + uint32_t x11; + uint32_t x12; + uint32_t x13; + uint32_t x14; + uint32_t x15; + uint32_t x16; + uint32_t x17; + uint32_t x18; + uint32_t x19; + uint32_t x20; + uint32_t x21; + uint32_t x22; + uint32_t x23; + uint32_t x24; + uint32_t x25; + fiat_p256_uint1 x26; + uint32_t x27; + fiat_p256_uint1 x28; + uint32_t x29; + fiat_p256_uint1 x30; + uint32_t x31; + fiat_p256_uint1 x32; + uint32_t x33; + fiat_p256_uint1 x34; + uint32_t x35; + fiat_p256_uint1 x36; + uint32_t x37; + fiat_p256_uint1 x38; + uint32_t x39; + uint32_t x40; + uint32_t x41; + uint32_t x42; + uint32_t x43; + uint32_t x44; + uint32_t x45; + uint32_t x46; + uint32_t x47; + uint32_t x48; + fiat_p256_uint1 x49; + uint32_t x50; + fiat_p256_uint1 x51; + uint32_t x52; + uint32_t x53; + fiat_p256_uint1 x54; + uint32_t x55; + fiat_p256_uint1 x56; + uint32_t x57; + fiat_p256_uint1 x58; + uint32_t x59; + fiat_p256_uint1 x60; + uint32_t x61; + fiat_p256_uint1 x62; + uint32_t x63; + fiat_p256_uint1 x64; + uint32_t x65; + fiat_p256_uint1 x66; + uint32_t x67; + fiat_p256_uint1 x68; + uint32_t x69; + fiat_p256_uint1 x70; + uint32_t x71; + uint32_t x72; + uint32_t x73; + uint32_t x74; + uint32_t x75; + uint32_t x76; + uint32_t x77; + uint32_t x78; + uint32_t x79; + uint32_t x80; + uint32_t x81; + uint32_t x82; + uint32_t x83; + uint32_t x84; + uint32_t x85; + uint32_t x86; + uint32_t x87; + fiat_p256_uint1 x88; + uint32_t x89; + fiat_p256_uint1 x90; + uint32_t x91; + fiat_p256_uint1 x92; + uint32_t x93; + fiat_p256_uint1 x94; + uint32_t x95; + fiat_p256_uint1 x96; + uint32_t x97; + fiat_p256_uint1 x98; + uint32_t x99; + fiat_p256_uint1 x100; + uint32_t x101; + uint32_t x102; + fiat_p256_uint1 x103; + uint32_t x104; + fiat_p256_uint1 x105; + uint32_t x106; + fiat_p256_uint1 x107; + uint32_t x108; + fiat_p256_uint1 x109; + uint32_t x110; + fiat_p256_uint1 x111; + uint32_t x112; + fiat_p256_uint1 x113; + uint32_t x114; + fiat_p256_uint1 x115; + uint32_t x116; + fiat_p256_uint1 x117; + uint32_t x118; + fiat_p256_uint1 x119; + uint32_t x120; + uint32_t x121; + uint32_t x122; + uint32_t x123; + uint32_t x124; + uint32_t x125; + uint32_t x126; + uint32_t x127; + uint32_t x128; + fiat_p256_uint1 x129; + uint32_t x130; + fiat_p256_uint1 x131; + uint32_t x132; + uint32_t x133; + fiat_p256_uint1 x134; + uint32_t x135; + fiat_p256_uint1 x136; + uint32_t x137; + fiat_p256_uint1 x138; + uint32_t x139; + fiat_p256_uint1 x140; + uint32_t x141; + fiat_p256_uint1 x142; + uint32_t x143; + fiat_p256_uint1 x144; + uint32_t x145; + fiat_p256_uint1 x146; + uint32_t x147; + fiat_p256_uint1 x148; + uint32_t x149; + 
fiat_p256_uint1 x150; + uint32_t x151; + uint32_t x152; + uint32_t x153; + uint32_t x154; + uint32_t x155; + uint32_t x156; + uint32_t x157; + uint32_t x158; + uint32_t x159; + uint32_t x160; + uint32_t x161; + uint32_t x162; + uint32_t x163; + uint32_t x164; + uint32_t x165; + uint32_t x166; + uint32_t x167; + uint32_t x168; + fiat_p256_uint1 x169; + uint32_t x170; + fiat_p256_uint1 x171; + uint32_t x172; + fiat_p256_uint1 x173; + uint32_t x174; + fiat_p256_uint1 x175; + uint32_t x176; + fiat_p256_uint1 x177; + uint32_t x178; + fiat_p256_uint1 x179; + uint32_t x180; + fiat_p256_uint1 x181; + uint32_t x182; + uint32_t x183; + fiat_p256_uint1 x184; + uint32_t x185; + fiat_p256_uint1 x186; + uint32_t x187; + fiat_p256_uint1 x188; + uint32_t x189; + fiat_p256_uint1 x190; + uint32_t x191; + fiat_p256_uint1 x192; + uint32_t x193; + fiat_p256_uint1 x194; + uint32_t x195; + fiat_p256_uint1 x196; + uint32_t x197; + fiat_p256_uint1 x198; + uint32_t x199; + fiat_p256_uint1 x200; + uint32_t x201; + uint32_t x202; + uint32_t x203; + uint32_t x204; + uint32_t x205; + uint32_t x206; + uint32_t x207; + uint32_t x208; + uint32_t x209; + fiat_p256_uint1 x210; + uint32_t x211; + fiat_p256_uint1 x212; + uint32_t x213; + uint32_t x214; + fiat_p256_uint1 x215; + uint32_t x216; + fiat_p256_uint1 x217; + uint32_t x218; + fiat_p256_uint1 x219; + uint32_t x220; + fiat_p256_uint1 x221; + uint32_t x222; + fiat_p256_uint1 x223; + uint32_t x224; + fiat_p256_uint1 x225; + uint32_t x226; + fiat_p256_uint1 x227; + uint32_t x228; + fiat_p256_uint1 x229; + uint32_t x230; + fiat_p256_uint1 x231; + uint32_t x232; + uint32_t x233; + uint32_t x234; + uint32_t x235; + uint32_t x236; + uint32_t x237; + uint32_t x238; + uint32_t x239; + uint32_t x240; + uint32_t x241; + uint32_t x242; + uint32_t x243; + uint32_t x244; + uint32_t x245; + uint32_t x246; + uint32_t x247; + uint32_t x248; + uint32_t x249; + fiat_p256_uint1 x250; + uint32_t x251; + fiat_p256_uint1 x252; + uint32_t x253; + fiat_p256_uint1 x254; + uint32_t x255; + fiat_p256_uint1 x256; + uint32_t x257; + fiat_p256_uint1 x258; + uint32_t x259; + fiat_p256_uint1 x260; + uint32_t x261; + fiat_p256_uint1 x262; + uint32_t x263; + uint32_t x264; + fiat_p256_uint1 x265; + uint32_t x266; + fiat_p256_uint1 x267; + uint32_t x268; + fiat_p256_uint1 x269; + uint32_t x270; + fiat_p256_uint1 x271; + uint32_t x272; + fiat_p256_uint1 x273; + uint32_t x274; + fiat_p256_uint1 x275; + uint32_t x276; + fiat_p256_uint1 x277; + uint32_t x278; + fiat_p256_uint1 x279; + uint32_t x280; + fiat_p256_uint1 x281; + uint32_t x282; + uint32_t x283; + uint32_t x284; + uint32_t x285; + uint32_t x286; + uint32_t x287; + uint32_t x288; + uint32_t x289; + uint32_t x290; + fiat_p256_uint1 x291; + uint32_t x292; + fiat_p256_uint1 x293; + uint32_t x294; + uint32_t x295; + fiat_p256_uint1 x296; + uint32_t x297; + fiat_p256_uint1 x298; + uint32_t x299; + fiat_p256_uint1 x300; + uint32_t x301; + fiat_p256_uint1 x302; + uint32_t x303; + fiat_p256_uint1 x304; + uint32_t x305; + fiat_p256_uint1 x306; + uint32_t x307; + fiat_p256_uint1 x308; + uint32_t x309; + fiat_p256_uint1 x310; + uint32_t x311; + fiat_p256_uint1 x312; + uint32_t x313; + uint32_t x314; + uint32_t x315; + uint32_t x316; + uint32_t x317; + uint32_t x318; + uint32_t x319; + uint32_t x320; + uint32_t x321; + uint32_t x322; + uint32_t x323; + uint32_t x324; + uint32_t x325; + uint32_t x326; + uint32_t x327; + uint32_t x328; + uint32_t x329; + uint32_t x330; + fiat_p256_uint1 x331; + uint32_t x332; + fiat_p256_uint1 x333; + uint32_t x334; + 
fiat_p256_uint1 x335; + uint32_t x336; + fiat_p256_uint1 x337; + uint32_t x338; + fiat_p256_uint1 x339; + uint32_t x340; + fiat_p256_uint1 x341; + uint32_t x342; + fiat_p256_uint1 x343; + uint32_t x344; + uint32_t x345; + fiat_p256_uint1 x346; + uint32_t x347; + fiat_p256_uint1 x348; + uint32_t x349; + fiat_p256_uint1 x350; + uint32_t x351; + fiat_p256_uint1 x352; + uint32_t x353; + fiat_p256_uint1 x354; + uint32_t x355; + fiat_p256_uint1 x356; + uint32_t x357; + fiat_p256_uint1 x358; + uint32_t x359; + fiat_p256_uint1 x360; + uint32_t x361; + fiat_p256_uint1 x362; + uint32_t x363; + uint32_t x364; + uint32_t x365; + uint32_t x366; + uint32_t x367; + uint32_t x368; + uint32_t x369; + uint32_t x370; + uint32_t x371; + fiat_p256_uint1 x372; + uint32_t x373; + fiat_p256_uint1 x374; + uint32_t x375; + uint32_t x376; + fiat_p256_uint1 x377; + uint32_t x378; + fiat_p256_uint1 x379; + uint32_t x380; + fiat_p256_uint1 x381; + uint32_t x382; + fiat_p256_uint1 x383; + uint32_t x384; + fiat_p256_uint1 x385; + uint32_t x386; + fiat_p256_uint1 x387; + uint32_t x388; + fiat_p256_uint1 x389; + uint32_t x390; + fiat_p256_uint1 x391; + uint32_t x392; + fiat_p256_uint1 x393; + uint32_t x394; + uint32_t x395; + uint32_t x396; + uint32_t x397; + uint32_t x398; + uint32_t x399; + uint32_t x400; + uint32_t x401; + uint32_t x402; + uint32_t x403; + uint32_t x404; + uint32_t x405; + uint32_t x406; + uint32_t x407; + uint32_t x408; + uint32_t x409; + uint32_t x410; + uint32_t x411; + fiat_p256_uint1 x412; + uint32_t x413; + fiat_p256_uint1 x414; + uint32_t x415; + fiat_p256_uint1 x416; + uint32_t x417; + fiat_p256_uint1 x418; + uint32_t x419; + fiat_p256_uint1 x420; + uint32_t x421; + fiat_p256_uint1 x422; + uint32_t x423; + fiat_p256_uint1 x424; + uint32_t x425; + uint32_t x426; + fiat_p256_uint1 x427; + uint32_t x428; + fiat_p256_uint1 x429; + uint32_t x430; + fiat_p256_uint1 x431; + uint32_t x432; + fiat_p256_uint1 x433; + uint32_t x434; + fiat_p256_uint1 x435; + uint32_t x436; + fiat_p256_uint1 x437; + uint32_t x438; + fiat_p256_uint1 x439; + uint32_t x440; + fiat_p256_uint1 x441; + uint32_t x442; + fiat_p256_uint1 x443; + uint32_t x444; + uint32_t x445; + uint32_t x446; + uint32_t x447; + uint32_t x448; + uint32_t x449; + uint32_t x450; + uint32_t x451; + uint32_t x452; + fiat_p256_uint1 x453; + uint32_t x454; + fiat_p256_uint1 x455; + uint32_t x456; + uint32_t x457; + fiat_p256_uint1 x458; + uint32_t x459; + fiat_p256_uint1 x460; + uint32_t x461; + fiat_p256_uint1 x462; + uint32_t x463; + fiat_p256_uint1 x464; + uint32_t x465; + fiat_p256_uint1 x466; + uint32_t x467; + fiat_p256_uint1 x468; + uint32_t x469; + fiat_p256_uint1 x470; + uint32_t x471; + fiat_p256_uint1 x472; + uint32_t x473; + fiat_p256_uint1 x474; + uint32_t x475; + uint32_t x476; + uint32_t x477; + uint32_t x478; + uint32_t x479; + uint32_t x480; + uint32_t x481; + uint32_t x482; + uint32_t x483; + uint32_t x484; + uint32_t x485; + uint32_t x486; + uint32_t x487; + uint32_t x488; + uint32_t x489; + uint32_t x490; + uint32_t x491; + uint32_t x492; + fiat_p256_uint1 x493; + uint32_t x494; + fiat_p256_uint1 x495; + uint32_t x496; + fiat_p256_uint1 x497; + uint32_t x498; + fiat_p256_uint1 x499; + uint32_t x500; + fiat_p256_uint1 x501; + uint32_t x502; + fiat_p256_uint1 x503; + uint32_t x504; + fiat_p256_uint1 x505; + uint32_t x506; + uint32_t x507; + fiat_p256_uint1 x508; + uint32_t x509; + fiat_p256_uint1 x510; + uint32_t x511; + fiat_p256_uint1 x512; + uint32_t x513; + fiat_p256_uint1 x514; + uint32_t x515; + fiat_p256_uint1 x516; + uint32_t 
x517; + fiat_p256_uint1 x518; + uint32_t x519; + fiat_p256_uint1 x520; + uint32_t x521; + fiat_p256_uint1 x522; + uint32_t x523; + fiat_p256_uint1 x524; + uint32_t x525; + uint32_t x526; + uint32_t x527; + uint32_t x528; + uint32_t x529; + uint32_t x530; + uint32_t x531; + uint32_t x532; + uint32_t x533; + fiat_p256_uint1 x534; + uint32_t x535; + fiat_p256_uint1 x536; + uint32_t x537; + uint32_t x538; + fiat_p256_uint1 x539; + uint32_t x540; + fiat_p256_uint1 x541; + uint32_t x542; + fiat_p256_uint1 x543; + uint32_t x544; + fiat_p256_uint1 x545; + uint32_t x546; + fiat_p256_uint1 x547; + uint32_t x548; + fiat_p256_uint1 x549; + uint32_t x550; + fiat_p256_uint1 x551; + uint32_t x552; + fiat_p256_uint1 x553; + uint32_t x554; + fiat_p256_uint1 x555; + uint32_t x556; + uint32_t x557; + uint32_t x558; + uint32_t x559; + uint32_t x560; + uint32_t x561; + uint32_t x562; + uint32_t x563; + uint32_t x564; + uint32_t x565; + uint32_t x566; + uint32_t x567; + uint32_t x568; + uint32_t x569; + uint32_t x570; + uint32_t x571; + uint32_t x572; + uint32_t x573; + fiat_p256_uint1 x574; + uint32_t x575; + fiat_p256_uint1 x576; + uint32_t x577; + fiat_p256_uint1 x578; + uint32_t x579; + fiat_p256_uint1 x580; + uint32_t x581; + fiat_p256_uint1 x582; + uint32_t x583; + fiat_p256_uint1 x584; + uint32_t x585; + fiat_p256_uint1 x586; + uint32_t x587; + uint32_t x588; + fiat_p256_uint1 x589; + uint32_t x590; + fiat_p256_uint1 x591; + uint32_t x592; + fiat_p256_uint1 x593; + uint32_t x594; + fiat_p256_uint1 x595; + uint32_t x596; + fiat_p256_uint1 x597; + uint32_t x598; + fiat_p256_uint1 x599; + uint32_t x600; + fiat_p256_uint1 x601; + uint32_t x602; + fiat_p256_uint1 x603; + uint32_t x604; + fiat_p256_uint1 x605; + uint32_t x606; + uint32_t x607; + uint32_t x608; + uint32_t x609; + uint32_t x610; + uint32_t x611; + uint32_t x612; + uint32_t x613; + uint32_t x614; + fiat_p256_uint1 x615; + uint32_t x616; + fiat_p256_uint1 x617; + uint32_t x618; + uint32_t x619; + fiat_p256_uint1 x620; + uint32_t x621; + fiat_p256_uint1 x622; + uint32_t x623; + fiat_p256_uint1 x624; + uint32_t x625; + fiat_p256_uint1 x626; + uint32_t x627; + fiat_p256_uint1 x628; + uint32_t x629; + fiat_p256_uint1 x630; + uint32_t x631; + fiat_p256_uint1 x632; + uint32_t x633; + fiat_p256_uint1 x634; + uint32_t x635; + fiat_p256_uint1 x636; + uint32_t x637; + uint32_t x638; + fiat_p256_uint1 x639; + uint32_t x640; + fiat_p256_uint1 x641; + uint32_t x642; + fiat_p256_uint1 x643; + uint32_t x644; + fiat_p256_uint1 x645; + uint32_t x646; + fiat_p256_uint1 x647; + uint32_t x648; + fiat_p256_uint1 x649; + uint32_t x650; + fiat_p256_uint1 x651; + uint32_t x652; + fiat_p256_uint1 x653; + uint32_t x654; + fiat_p256_uint1 x655; + uint32_t x656; + uint32_t x657; + uint32_t x658; + uint32_t x659; + uint32_t x660; + uint32_t x661; + uint32_t x662; + uint32_t x663; + x1 = (arg1[1]); + x2 = (arg1[2]); + x3 = (arg1[3]); + x4 = (arg1[4]); + x5 = (arg1[5]); + x6 = (arg1[6]); + x7 = (arg1[7]); + x8 = (arg1[0]); + fiat_p256_mulx_u32(&x9, &x10, x8, (arg1[7])); + fiat_p256_mulx_u32(&x11, &x12, x8, (arg1[6])); + fiat_p256_mulx_u32(&x13, &x14, x8, (arg1[5])); + fiat_p256_mulx_u32(&x15, &x16, x8, (arg1[4])); + fiat_p256_mulx_u32(&x17, &x18, x8, (arg1[3])); + fiat_p256_mulx_u32(&x19, &x20, x8, (arg1[2])); + fiat_p256_mulx_u32(&x21, &x22, x8, (arg1[1])); + fiat_p256_mulx_u32(&x23, &x24, x8, (arg1[0])); + fiat_p256_addcarryx_u32(&x25, &x26, 0x0, x24, x21); + fiat_p256_addcarryx_u32(&x27, &x28, x26, x22, x19); + fiat_p256_addcarryx_u32(&x29, &x30, x28, x20, x17); + 
fiat_p256_addcarryx_u32(&x31, &x32, x30, x18, x15); + fiat_p256_addcarryx_u32(&x33, &x34, x32, x16, x13); + fiat_p256_addcarryx_u32(&x35, &x36, x34, x14, x11); + fiat_p256_addcarryx_u32(&x37, &x38, x36, x12, x9); + x39 = (x38 + x10); + fiat_p256_mulx_u32(&x40, &x41, x23, UINT32_C(0xffffffff)); + fiat_p256_mulx_u32(&x42, &x43, x23, UINT32_C(0xffffffff)); + fiat_p256_mulx_u32(&x44, &x45, x23, UINT32_C(0xffffffff)); + fiat_p256_mulx_u32(&x46, &x47, x23, UINT32_C(0xffffffff)); + fiat_p256_addcarryx_u32(&x48, &x49, 0x0, x47, x44); + fiat_p256_addcarryx_u32(&x50, &x51, x49, x45, x42); + x52 = (x51 + x43); + fiat_p256_addcarryx_u32(&x53, &x54, 0x0, x23, x46); + fiat_p256_addcarryx_u32(&x55, &x56, x54, x25, x48); + fiat_p256_addcarryx_u32(&x57, &x58, x56, x27, x50); + fiat_p256_addcarryx_u32(&x59, &x60, x58, x29, x52); + fiat_p256_addcarryx_u32(&x61, &x62, x60, x31, 0x0); + fiat_p256_addcarryx_u32(&x63, &x64, x62, x33, 0x0); + fiat_p256_addcarryx_u32(&x65, &x66, x64, x35, x23); + fiat_p256_addcarryx_u32(&x67, &x68, x66, x37, x40); + fiat_p256_addcarryx_u32(&x69, &x70, x68, x39, x41); + fiat_p256_mulx_u32(&x71, &x72, x1, (arg1[7])); + fiat_p256_mulx_u32(&x73, &x74, x1, (arg1[6])); + fiat_p256_mulx_u32(&x75, &x76, x1, (arg1[5])); + fiat_p256_mulx_u32(&x77, &x78, x1, (arg1[4])); + fiat_p256_mulx_u32(&x79, &x80, x1, (arg1[3])); + fiat_p256_mulx_u32(&x81, &x82, x1, (arg1[2])); + fiat_p256_mulx_u32(&x83, &x84, x1, (arg1[1])); + fiat_p256_mulx_u32(&x85, &x86, x1, (arg1[0])); + fiat_p256_addcarryx_u32(&x87, &x88, 0x0, x86, x83); + fiat_p256_addcarryx_u32(&x89, &x90, x88, x84, x81); + fiat_p256_addcarryx_u32(&x91, &x92, x90, x82, x79); + fiat_p256_addcarryx_u32(&x93, &x94, x92, x80, x77); + fiat_p256_addcarryx_u32(&x95, &x96, x94, x78, x75); + fiat_p256_addcarryx_u32(&x97, &x98, x96, x76, x73); + fiat_p256_addcarryx_u32(&x99, &x100, x98, x74, x71); + x101 = (x100 + x72); + fiat_p256_addcarryx_u32(&x102, &x103, 0x0, x55, x85); + fiat_p256_addcarryx_u32(&x104, &x105, x103, x57, x87); + fiat_p256_addcarryx_u32(&x106, &x107, x105, x59, x89); + fiat_p256_addcarryx_u32(&x108, &x109, x107, x61, x91); + fiat_p256_addcarryx_u32(&x110, &x111, x109, x63, x93); + fiat_p256_addcarryx_u32(&x112, &x113, x111, x65, x95); + fiat_p256_addcarryx_u32(&x114, &x115, x113, x67, x97); + fiat_p256_addcarryx_u32(&x116, &x117, x115, x69, x99); + fiat_p256_addcarryx_u32(&x118, &x119, x117, x70, x101); + fiat_p256_mulx_u32(&x120, &x121, x102, UINT32_C(0xffffffff)); + fiat_p256_mulx_u32(&x122, &x123, x102, UINT32_C(0xffffffff)); + fiat_p256_mulx_u32(&x124, &x125, x102, UINT32_C(0xffffffff)); + fiat_p256_mulx_u32(&x126, &x127, x102, UINT32_C(0xffffffff)); + fiat_p256_addcarryx_u32(&x128, &x129, 0x0, x127, x124); + fiat_p256_addcarryx_u32(&x130, &x131, x129, x125, x122); + x132 = (x131 + x123); + fiat_p256_addcarryx_u32(&x133, &x134, 0x0, x102, x126); + fiat_p256_addcarryx_u32(&x135, &x136, x134, x104, x128); + fiat_p256_addcarryx_u32(&x137, &x138, x136, x106, x130); + fiat_p256_addcarryx_u32(&x139, &x140, x138, x108, x132); + fiat_p256_addcarryx_u32(&x141, &x142, x140, x110, 0x0); + fiat_p256_addcarryx_u32(&x143, &x144, x142, x112, 0x0); + fiat_p256_addcarryx_u32(&x145, &x146, x144, x114, x102); + fiat_p256_addcarryx_u32(&x147, &x148, x146, x116, x120); + fiat_p256_addcarryx_u32(&x149, &x150, x148, x118, x121); + x151 = ((uint32_t)x150 + x119); + fiat_p256_mulx_u32(&x152, &x153, x2, (arg1[7])); + fiat_p256_mulx_u32(&x154, &x155, x2, (arg1[6])); + fiat_p256_mulx_u32(&x156, &x157, x2, (arg1[5])); + fiat_p256_mulx_u32(&x158, &x159, x2, 
(arg1[4])); + fiat_p256_mulx_u32(&x160, &x161, x2, (arg1[3])); + fiat_p256_mulx_u32(&x162, &x163, x2, (arg1[2])); + fiat_p256_mulx_u32(&x164, &x165, x2, (arg1[1])); + fiat_p256_mulx_u32(&x166, &x167, x2, (arg1[0])); + fiat_p256_addcarryx_u32(&x168, &x169, 0x0, x167, x164); + fiat_p256_addcarryx_u32(&x170, &x171, x169, x165, x162); + fiat_p256_addcarryx_u32(&x172, &x173, x171, x163, x160); + fiat_p256_addcarryx_u32(&x174, &x175, x173, x161, x158); + fiat_p256_addcarryx_u32(&x176, &x177, x175, x159, x156); + fiat_p256_addcarryx_u32(&x178, &x179, x177, x157, x154); + fiat_p256_addcarryx_u32(&x180, &x181, x179, x155, x152); + x182 = (x181 + x153); + fiat_p256_addcarryx_u32(&x183, &x184, 0x0, x135, x166); + fiat_p256_addcarryx_u32(&x185, &x186, x184, x137, x168); + fiat_p256_addcarryx_u32(&x187, &x188, x186, x139, x170); + fiat_p256_addcarryx_u32(&x189, &x190, x188, x141, x172); + fiat_p256_addcarryx_u32(&x191, &x192, x190, x143, x174); + fiat_p256_addcarryx_u32(&x193, &x194, x192, x145, x176); + fiat_p256_addcarryx_u32(&x195, &x196, x194, x147, x178); + fiat_p256_addcarryx_u32(&x197, &x198, x196, x149, x180); + fiat_p256_addcarryx_u32(&x199, &x200, x198, x151, x182); + fiat_p256_mulx_u32(&x201, &x202, x183, UINT32_C(0xffffffff)); + fiat_p256_mulx_u32(&x203, &x204, x183, UINT32_C(0xffffffff)); + fiat_p256_mulx_u32(&x205, &x206, x183, UINT32_C(0xffffffff)); + fiat_p256_mulx_u32(&x207, &x208, x183, UINT32_C(0xffffffff)); + fiat_p256_addcarryx_u32(&x209, &x210, 0x0, x208, x205); + fiat_p256_addcarryx_u32(&x211, &x212, x210, x206, x203); + x213 = (x212 + x204); + fiat_p256_addcarryx_u32(&x214, &x215, 0x0, x183, x207); + fiat_p256_addcarryx_u32(&x216, &x217, x215, x185, x209); + fiat_p256_addcarryx_u32(&x218, &x219, x217, x187, x211); + fiat_p256_addcarryx_u32(&x220, &x221, x219, x189, x213); + fiat_p256_addcarryx_u32(&x222, &x223, x221, x191, 0x0); + fiat_p256_addcarryx_u32(&x224, &x225, x223, x193, 0x0); + fiat_p256_addcarryx_u32(&x226, &x227, x225, x195, x183); + fiat_p256_addcarryx_u32(&x228, &x229, x227, x197, x201); + fiat_p256_addcarryx_u32(&x230, &x231, x229, x199, x202); + x232 = ((uint32_t)x231 + x200); + fiat_p256_mulx_u32(&x233, &x234, x3, (arg1[7])); + fiat_p256_mulx_u32(&x235, &x236, x3, (arg1[6])); + fiat_p256_mulx_u32(&x237, &x238, x3, (arg1[5])); + fiat_p256_mulx_u32(&x239, &x240, x3, (arg1[4])); + fiat_p256_mulx_u32(&x241, &x242, x3, (arg1[3])); + fiat_p256_mulx_u32(&x243, &x244, x3, (arg1[2])); + fiat_p256_mulx_u32(&x245, &x246, x3, (arg1[1])); + fiat_p256_mulx_u32(&x247, &x248, x3, (arg1[0])); + fiat_p256_addcarryx_u32(&x249, &x250, 0x0, x248, x245); + fiat_p256_addcarryx_u32(&x251, &x252, x250, x246, x243); + fiat_p256_addcarryx_u32(&x253, &x254, x252, x244, x241); + fiat_p256_addcarryx_u32(&x255, &x256, x254, x242, x239); + fiat_p256_addcarryx_u32(&x257, &x258, x256, x240, x237); + fiat_p256_addcarryx_u32(&x259, &x260, x258, x238, x235); + fiat_p256_addcarryx_u32(&x261, &x262, x260, x236, x233); + x263 = (x262 + x234); + fiat_p256_addcarryx_u32(&x264, &x265, 0x0, x216, x247); + fiat_p256_addcarryx_u32(&x266, &x267, x265, x218, x249); + fiat_p256_addcarryx_u32(&x268, &x269, x267, x220, x251); + fiat_p256_addcarryx_u32(&x270, &x271, x269, x222, x253); + fiat_p256_addcarryx_u32(&x272, &x273, x271, x224, x255); + fiat_p256_addcarryx_u32(&x274, &x275, x273, x226, x257); + fiat_p256_addcarryx_u32(&x276, &x277, x275, x228, x259); + fiat_p256_addcarryx_u32(&x278, &x279, x277, x230, x261); + fiat_p256_addcarryx_u32(&x280, &x281, x279, x232, x263); + fiat_p256_mulx_u32(&x282, &x283, 
x264, UINT32_C(0xffffffff)); + fiat_p256_mulx_u32(&x284, &x285, x264, UINT32_C(0xffffffff)); + fiat_p256_mulx_u32(&x286, &x287, x264, UINT32_C(0xffffffff)); + fiat_p256_mulx_u32(&x288, &x289, x264, UINT32_C(0xffffffff)); + fiat_p256_addcarryx_u32(&x290, &x291, 0x0, x289, x286); + fiat_p256_addcarryx_u32(&x292, &x293, x291, x287, x284); + x294 = (x293 + x285); + fiat_p256_addcarryx_u32(&x295, &x296, 0x0, x264, x288); + fiat_p256_addcarryx_u32(&x297, &x298, x296, x266, x290); + fiat_p256_addcarryx_u32(&x299, &x300, x298, x268, x292); + fiat_p256_addcarryx_u32(&x301, &x302, x300, x270, x294); + fiat_p256_addcarryx_u32(&x303, &x304, x302, x272, 0x0); + fiat_p256_addcarryx_u32(&x305, &x306, x304, x274, 0x0); + fiat_p256_addcarryx_u32(&x307, &x308, x306, x276, x264); + fiat_p256_addcarryx_u32(&x309, &x310, x308, x278, x282); + fiat_p256_addcarryx_u32(&x311, &x312, x310, x280, x283); + x313 = ((uint32_t)x312 + x281); + fiat_p256_mulx_u32(&x314, &x315, x4, (arg1[7])); + fiat_p256_mulx_u32(&x316, &x317, x4, (arg1[6])); + fiat_p256_mulx_u32(&x318, &x319, x4, (arg1[5])); + fiat_p256_mulx_u32(&x320, &x321, x4, (arg1[4])); + fiat_p256_mulx_u32(&x322, &x323, x4, (arg1[3])); + fiat_p256_mulx_u32(&x324, &x325, x4, (arg1[2])); + fiat_p256_mulx_u32(&x326, &x327, x4, (arg1[1])); + fiat_p256_mulx_u32(&x328, &x329, x4, (arg1[0])); + fiat_p256_addcarryx_u32(&x330, &x331, 0x0, x329, x326); + fiat_p256_addcarryx_u32(&x332, &x333, x331, x327, x324); + fiat_p256_addcarryx_u32(&x334, &x335, x333, x325, x322); + fiat_p256_addcarryx_u32(&x336, &x337, x335, x323, x320); + fiat_p256_addcarryx_u32(&x338, &x339, x337, x321, x318); + fiat_p256_addcarryx_u32(&x340, &x341, x339, x319, x316); + fiat_p256_addcarryx_u32(&x342, &x343, x341, x317, x314); + x344 = (x343 + x315); + fiat_p256_addcarryx_u32(&x345, &x346, 0x0, x297, x328); + fiat_p256_addcarryx_u32(&x347, &x348, x346, x299, x330); + fiat_p256_addcarryx_u32(&x349, &x350, x348, x301, x332); + fiat_p256_addcarryx_u32(&x351, &x352, x350, x303, x334); + fiat_p256_addcarryx_u32(&x353, &x354, x352, x305, x336); + fiat_p256_addcarryx_u32(&x355, &x356, x354, x307, x338); + fiat_p256_addcarryx_u32(&x357, &x358, x356, x309, x340); + fiat_p256_addcarryx_u32(&x359, &x360, x358, x311, x342); + fiat_p256_addcarryx_u32(&x361, &x362, x360, x313, x344); + fiat_p256_mulx_u32(&x363, &x364, x345, UINT32_C(0xffffffff)); + fiat_p256_mulx_u32(&x365, &x366, x345, UINT32_C(0xffffffff)); + fiat_p256_mulx_u32(&x367, &x368, x345, UINT32_C(0xffffffff)); + fiat_p256_mulx_u32(&x369, &x370, x345, UINT32_C(0xffffffff)); + fiat_p256_addcarryx_u32(&x371, &x372, 0x0, x370, x367); + fiat_p256_addcarryx_u32(&x373, &x374, x372, x368, x365); + x375 = (x374 + x366); + fiat_p256_addcarryx_u32(&x376, &x377, 0x0, x345, x369); + fiat_p256_addcarryx_u32(&x378, &x379, x377, x347, x371); + fiat_p256_addcarryx_u32(&x380, &x381, x379, x349, x373); + fiat_p256_addcarryx_u32(&x382, &x383, x381, x351, x375); + fiat_p256_addcarryx_u32(&x384, &x385, x383, x353, 0x0); + fiat_p256_addcarryx_u32(&x386, &x387, x385, x355, 0x0); + fiat_p256_addcarryx_u32(&x388, &x389, x387, x357, x345); + fiat_p256_addcarryx_u32(&x390, &x391, x389, x359, x363); + fiat_p256_addcarryx_u32(&x392, &x393, x391, x361, x364); + x394 = ((uint32_t)x393 + x362); + fiat_p256_mulx_u32(&x395, &x396, x5, (arg1[7])); + fiat_p256_mulx_u32(&x397, &x398, x5, (arg1[6])); + fiat_p256_mulx_u32(&x399, &x400, x5, (arg1[5])); + fiat_p256_mulx_u32(&x401, &x402, x5, (arg1[4])); + fiat_p256_mulx_u32(&x403, &x404, x5, (arg1[3])); + fiat_p256_mulx_u32(&x405, &x406, x5, 
(arg1[2])); + fiat_p256_mulx_u32(&x407, &x408, x5, (arg1[1])); + fiat_p256_mulx_u32(&x409, &x410, x5, (arg1[0])); + fiat_p256_addcarryx_u32(&x411, &x412, 0x0, x410, x407); + fiat_p256_addcarryx_u32(&x413, &x414, x412, x408, x405); + fiat_p256_addcarryx_u32(&x415, &x416, x414, x406, x403); + fiat_p256_addcarryx_u32(&x417, &x418, x416, x404, x401); + fiat_p256_addcarryx_u32(&x419, &x420, x418, x402, x399); + fiat_p256_addcarryx_u32(&x421, &x422, x420, x400, x397); + fiat_p256_addcarryx_u32(&x423, &x424, x422, x398, x395); + x425 = (x424 + x396); + fiat_p256_addcarryx_u32(&x426, &x427, 0x0, x378, x409); + fiat_p256_addcarryx_u32(&x428, &x429, x427, x380, x411); + fiat_p256_addcarryx_u32(&x430, &x431, x429, x382, x413); + fiat_p256_addcarryx_u32(&x432, &x433, x431, x384, x415); + fiat_p256_addcarryx_u32(&x434, &x435, x433, x386, x417); + fiat_p256_addcarryx_u32(&x436, &x437, x435, x388, x419); + fiat_p256_addcarryx_u32(&x438, &x439, x437, x390, x421); + fiat_p256_addcarryx_u32(&x440, &x441, x439, x392, x423); + fiat_p256_addcarryx_u32(&x442, &x443, x441, x394, x425); + fiat_p256_mulx_u32(&x444, &x445, x426, UINT32_C(0xffffffff)); + fiat_p256_mulx_u32(&x446, &x447, x426, UINT32_C(0xffffffff)); + fiat_p256_mulx_u32(&x448, &x449, x426, UINT32_C(0xffffffff)); + fiat_p256_mulx_u32(&x450, &x451, x426, UINT32_C(0xffffffff)); + fiat_p256_addcarryx_u32(&x452, &x453, 0x0, x451, x448); + fiat_p256_addcarryx_u32(&x454, &x455, x453, x449, x446); + x456 = (x455 + x447); + fiat_p256_addcarryx_u32(&x457, &x458, 0x0, x426, x450); + fiat_p256_addcarryx_u32(&x459, &x460, x458, x428, x452); + fiat_p256_addcarryx_u32(&x461, &x462, x460, x430, x454); + fiat_p256_addcarryx_u32(&x463, &x464, x462, x432, x456); + fiat_p256_addcarryx_u32(&x465, &x466, x464, x434, 0x0); + fiat_p256_addcarryx_u32(&x467, &x468, x466, x436, 0x0); + fiat_p256_addcarryx_u32(&x469, &x470, x468, x438, x426); + fiat_p256_addcarryx_u32(&x471, &x472, x470, x440, x444); + fiat_p256_addcarryx_u32(&x473, &x474, x472, x442, x445); + x475 = ((uint32_t)x474 + x443); + fiat_p256_mulx_u32(&x476, &x477, x6, (arg1[7])); + fiat_p256_mulx_u32(&x478, &x479, x6, (arg1[6])); + fiat_p256_mulx_u32(&x480, &x481, x6, (arg1[5])); + fiat_p256_mulx_u32(&x482, &x483, x6, (arg1[4])); + fiat_p256_mulx_u32(&x484, &x485, x6, (arg1[3])); + fiat_p256_mulx_u32(&x486, &x487, x6, (arg1[2])); + fiat_p256_mulx_u32(&x488, &x489, x6, (arg1[1])); + fiat_p256_mulx_u32(&x490, &x491, x6, (arg1[0])); + fiat_p256_addcarryx_u32(&x492, &x493, 0x0, x491, x488); + fiat_p256_addcarryx_u32(&x494, &x495, x493, x489, x486); + fiat_p256_addcarryx_u32(&x496, &x497, x495, x487, x484); + fiat_p256_addcarryx_u32(&x498, &x499, x497, x485, x482); + fiat_p256_addcarryx_u32(&x500, &x501, x499, x483, x480); + fiat_p256_addcarryx_u32(&x502, &x503, x501, x481, x478); + fiat_p256_addcarryx_u32(&x504, &x505, x503, x479, x476); + x506 = (x505 + x477); + fiat_p256_addcarryx_u32(&x507, &x508, 0x0, x459, x490); + fiat_p256_addcarryx_u32(&x509, &x510, x508, x461, x492); + fiat_p256_addcarryx_u32(&x511, &x512, x510, x463, x494); + fiat_p256_addcarryx_u32(&x513, &x514, x512, x465, x496); + fiat_p256_addcarryx_u32(&x515, &x516, x514, x467, x498); + fiat_p256_addcarryx_u32(&x517, &x518, x516, x469, x500); + fiat_p256_addcarryx_u32(&x519, &x520, x518, x471, x502); + fiat_p256_addcarryx_u32(&x521, &x522, x520, x473, x504); + fiat_p256_addcarryx_u32(&x523, &x524, x522, x475, x506); + fiat_p256_mulx_u32(&x525, &x526, x507, UINT32_C(0xffffffff)); + fiat_p256_mulx_u32(&x527, &x528, x507, UINT32_C(0xffffffff)); + 
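+ /* (Editorial note.) One more row, for arg1[7], and its reduction step
+  * follow; the trailing subborrowx/cmovznz block then conditionally
+  * subtracts p once, so the stored result lies in [0, p). */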
fiat_p256_mulx_u32(&x529, &x530, x507, UINT32_C(0xffffffff)); + fiat_p256_mulx_u32(&x531, &x532, x507, UINT32_C(0xffffffff)); + fiat_p256_addcarryx_u32(&x533, &x534, 0x0, x532, x529); + fiat_p256_addcarryx_u32(&x535, &x536, x534, x530, x527); + x537 = (x536 + x528); + fiat_p256_addcarryx_u32(&x538, &x539, 0x0, x507, x531); + fiat_p256_addcarryx_u32(&x540, &x541, x539, x509, x533); + fiat_p256_addcarryx_u32(&x542, &x543, x541, x511, x535); + fiat_p256_addcarryx_u32(&x544, &x545, x543, x513, x537); + fiat_p256_addcarryx_u32(&x546, &x547, x545, x515, 0x0); + fiat_p256_addcarryx_u32(&x548, &x549, x547, x517, 0x0); + fiat_p256_addcarryx_u32(&x550, &x551, x549, x519, x507); + fiat_p256_addcarryx_u32(&x552, &x553, x551, x521, x525); + fiat_p256_addcarryx_u32(&x554, &x555, x553, x523, x526); + x556 = ((uint32_t)x555 + x524); + fiat_p256_mulx_u32(&x557, &x558, x7, (arg1[7])); + fiat_p256_mulx_u32(&x559, &x560, x7, (arg1[6])); + fiat_p256_mulx_u32(&x561, &x562, x7, (arg1[5])); + fiat_p256_mulx_u32(&x563, &x564, x7, (arg1[4])); + fiat_p256_mulx_u32(&x565, &x566, x7, (arg1[3])); + fiat_p256_mulx_u32(&x567, &x568, x7, (arg1[2])); + fiat_p256_mulx_u32(&x569, &x570, x7, (arg1[1])); + fiat_p256_mulx_u32(&x571, &x572, x7, (arg1[0])); + fiat_p256_addcarryx_u32(&x573, &x574, 0x0, x572, x569); + fiat_p256_addcarryx_u32(&x575, &x576, x574, x570, x567); + fiat_p256_addcarryx_u32(&x577, &x578, x576, x568, x565); + fiat_p256_addcarryx_u32(&x579, &x580, x578, x566, x563); + fiat_p256_addcarryx_u32(&x581, &x582, x580, x564, x561); + fiat_p256_addcarryx_u32(&x583, &x584, x582, x562, x559); + fiat_p256_addcarryx_u32(&x585, &x586, x584, x560, x557); + x587 = (x586 + x558); + fiat_p256_addcarryx_u32(&x588, &x589, 0x0, x540, x571); + fiat_p256_addcarryx_u32(&x590, &x591, x589, x542, x573); + fiat_p256_addcarryx_u32(&x592, &x593, x591, x544, x575); + fiat_p256_addcarryx_u32(&x594, &x595, x593, x546, x577); + fiat_p256_addcarryx_u32(&x596, &x597, x595, x548, x579); + fiat_p256_addcarryx_u32(&x598, &x599, x597, x550, x581); + fiat_p256_addcarryx_u32(&x600, &x601, x599, x552, x583); + fiat_p256_addcarryx_u32(&x602, &x603, x601, x554, x585); + fiat_p256_addcarryx_u32(&x604, &x605, x603, x556, x587); + fiat_p256_mulx_u32(&x606, &x607, x588, UINT32_C(0xffffffff)); + fiat_p256_mulx_u32(&x608, &x609, x588, UINT32_C(0xffffffff)); + fiat_p256_mulx_u32(&x610, &x611, x588, UINT32_C(0xffffffff)); + fiat_p256_mulx_u32(&x612, &x613, x588, UINT32_C(0xffffffff)); + fiat_p256_addcarryx_u32(&x614, &x615, 0x0, x613, x610); + fiat_p256_addcarryx_u32(&x616, &x617, x615, x611, x608); + x618 = (x617 + x609); + fiat_p256_addcarryx_u32(&x619, &x620, 0x0, x588, x612); + fiat_p256_addcarryx_u32(&x621, &x622, x620, x590, x614); + fiat_p256_addcarryx_u32(&x623, &x624, x622, x592, x616); + fiat_p256_addcarryx_u32(&x625, &x626, x624, x594, x618); + fiat_p256_addcarryx_u32(&x627, &x628, x626, x596, 0x0); + fiat_p256_addcarryx_u32(&x629, &x630, x628, x598, 0x0); + fiat_p256_addcarryx_u32(&x631, &x632, x630, x600, x588); + fiat_p256_addcarryx_u32(&x633, &x634, x632, x602, x606); + fiat_p256_addcarryx_u32(&x635, &x636, x634, x604, x607); + x637 = ((uint32_t)x636 + x605); + fiat_p256_subborrowx_u32(&x638, &x639, 0x0, x621, UINT32_C(0xffffffff)); + fiat_p256_subborrowx_u32(&x640, &x641, x639, x623, UINT32_C(0xffffffff)); + fiat_p256_subborrowx_u32(&x642, &x643, x641, x625, UINT32_C(0xffffffff)); + fiat_p256_subborrowx_u32(&x644, &x645, x643, x627, 0x0); + fiat_p256_subborrowx_u32(&x646, &x647, x645, x629, 0x0); + fiat_p256_subborrowx_u32(&x648, &x649, x647, 
x631, 0x0); + fiat_p256_subborrowx_u32(&x650, &x651, x649, x633, 0x1); + fiat_p256_subborrowx_u32(&x652, &x653, x651, x635, UINT32_C(0xffffffff)); + fiat_p256_subborrowx_u32(&x654, &x655, x653, x637, 0x0); + fiat_p256_cmovznz_u32(&x656, x655, x638, x621); + fiat_p256_cmovznz_u32(&x657, x655, x640, x623); + fiat_p256_cmovznz_u32(&x658, x655, x642, x625); + fiat_p256_cmovznz_u32(&x659, x655, x644, x627); + fiat_p256_cmovznz_u32(&x660, x655, x646, x629); + fiat_p256_cmovznz_u32(&x661, x655, x648, x631); + fiat_p256_cmovznz_u32(&x662, x655, x650, x633); + fiat_p256_cmovznz_u32(&x663, x655, x652, x635); + out1[0] = x656; + out1[1] = x657; + out1[2] = x658; + out1[3] = x659; + out1[4] = x660; + out1[5] = x661; + out1[6] = x662; + out1[7] = x663; +} + +/* + * The function fiat_p256_add adds two field elements in the Montgomery domain. + * + * Preconditions: + * 0 ≤ eval arg1 < m + * 0 ≤ eval arg2 < m + * Postconditions: + * eval (from_montgomery out1) mod m = (eval (from_montgomery arg1) + eval (from_montgomery arg2)) mod m + * 0 ≤ eval out1 < m + * + */ +static FIAT_P256_FIAT_INLINE void fiat_p256_add(fiat_p256_montgomery_domain_field_element out1, const fiat_p256_montgomery_domain_field_element arg1, const fiat_p256_montgomery_domain_field_element arg2) { + uint32_t x1; + fiat_p256_uint1 x2; + uint32_t x3; + fiat_p256_uint1 x4; + uint32_t x5; + fiat_p256_uint1 x6; + uint32_t x7; + fiat_p256_uint1 x8; + uint32_t x9; + fiat_p256_uint1 x10; + uint32_t x11; + fiat_p256_uint1 x12; + uint32_t x13; + fiat_p256_uint1 x14; + uint32_t x15; + fiat_p256_uint1 x16; + uint32_t x17; + fiat_p256_uint1 x18; + uint32_t x19; + fiat_p256_uint1 x20; + uint32_t x21; + fiat_p256_uint1 x22; + uint32_t x23; + fiat_p256_uint1 x24; + uint32_t x25; + fiat_p256_uint1 x26; + uint32_t x27; + fiat_p256_uint1 x28; + uint32_t x29; + fiat_p256_uint1 x30; + uint32_t x31; + fiat_p256_uint1 x32; + uint32_t x33; + fiat_p256_uint1 x34; + uint32_t x35; + uint32_t x36; + uint32_t x37; + uint32_t x38; + uint32_t x39; + uint32_t x40; + uint32_t x41; + uint32_t x42; + fiat_p256_addcarryx_u32(&x1, &x2, 0x0, (arg1[0]), (arg2[0])); + fiat_p256_addcarryx_u32(&x3, &x4, x2, (arg1[1]), (arg2[1])); + fiat_p256_addcarryx_u32(&x5, &x6, x4, (arg1[2]), (arg2[2])); + fiat_p256_addcarryx_u32(&x7, &x8, x6, (arg1[3]), (arg2[3])); + fiat_p256_addcarryx_u32(&x9, &x10, x8, (arg1[4]), (arg2[4])); + fiat_p256_addcarryx_u32(&x11, &x12, x10, (arg1[5]), (arg2[5])); + fiat_p256_addcarryx_u32(&x13, &x14, x12, (arg1[6]), (arg2[6])); + fiat_p256_addcarryx_u32(&x15, &x16, x14, (arg1[7]), (arg2[7])); + fiat_p256_subborrowx_u32(&x17, &x18, 0x0, x1, UINT32_C(0xffffffff)); + fiat_p256_subborrowx_u32(&x19, &x20, x18, x3, UINT32_C(0xffffffff)); + fiat_p256_subborrowx_u32(&x21, &x22, x20, x5, UINT32_C(0xffffffff)); + fiat_p256_subborrowx_u32(&x23, &x24, x22, x7, 0x0); + fiat_p256_subborrowx_u32(&x25, &x26, x24, x9, 0x0); + fiat_p256_subborrowx_u32(&x27, &x28, x26, x11, 0x0); + fiat_p256_subborrowx_u32(&x29, &x30, x28, x13, 0x1); + fiat_p256_subborrowx_u32(&x31, &x32, x30, x15, UINT32_C(0xffffffff)); + fiat_p256_subborrowx_u32(&x33, &x34, x32, x16, 0x0); + fiat_p256_cmovznz_u32(&x35, x34, x17, x1); + fiat_p256_cmovznz_u32(&x36, x34, x19, x3); + fiat_p256_cmovznz_u32(&x37, x34, x21, x5); + fiat_p256_cmovznz_u32(&x38, x34, x23, x7); + fiat_p256_cmovznz_u32(&x39, x34, x25, x9); + fiat_p256_cmovznz_u32(&x40, x34, x27, x11); + fiat_p256_cmovznz_u32(&x41, x34, x29, x13); + fiat_p256_cmovznz_u32(&x42, x34, x31, x15); + out1[0] = x35; + out1[1] = x36; + out1[2] = x37; + out1[3] = 
x38; + out1[4] = x39; + out1[5] = x40; + out1[6] = x41; + out1[7] = x42; +} + +/* + * The function fiat_p256_sub subtracts two field elements in the Montgomery domain. + * + * Preconditions: + * 0 ≤ eval arg1 < m + * 0 ≤ eval arg2 < m + * Postconditions: + * eval (from_montgomery out1) mod m = (eval (from_montgomery arg1) - eval (from_montgomery arg2)) mod m + * 0 ≤ eval out1 < m + * + */ +static FIAT_P256_FIAT_INLINE void fiat_p256_sub(fiat_p256_montgomery_domain_field_element out1, const fiat_p256_montgomery_domain_field_element arg1, const fiat_p256_montgomery_domain_field_element arg2) { + uint32_t x1; + fiat_p256_uint1 x2; + uint32_t x3; + fiat_p256_uint1 x4; + uint32_t x5; + fiat_p256_uint1 x6; + uint32_t x7; + fiat_p256_uint1 x8; + uint32_t x9; + fiat_p256_uint1 x10; + uint32_t x11; + fiat_p256_uint1 x12; + uint32_t x13; + fiat_p256_uint1 x14; + uint32_t x15; + fiat_p256_uint1 x16; + uint32_t x17; + uint32_t x18; + fiat_p256_uint1 x19; + uint32_t x20; + fiat_p256_uint1 x21; + uint32_t x22; + fiat_p256_uint1 x23; + uint32_t x24; + fiat_p256_uint1 x25; + uint32_t x26; + fiat_p256_uint1 x27; + uint32_t x28; + fiat_p256_uint1 x29; + uint32_t x30; + fiat_p256_uint1 x31; + uint32_t x32; + fiat_p256_uint1 x33; + fiat_p256_subborrowx_u32(&x1, &x2, 0x0, (arg1[0]), (arg2[0])); + fiat_p256_subborrowx_u32(&x3, &x4, x2, (arg1[1]), (arg2[1])); + fiat_p256_subborrowx_u32(&x5, &x6, x4, (arg1[2]), (arg2[2])); + fiat_p256_subborrowx_u32(&x7, &x8, x6, (arg1[3]), (arg2[3])); + fiat_p256_subborrowx_u32(&x9, &x10, x8, (arg1[4]), (arg2[4])); + fiat_p256_subborrowx_u32(&x11, &x12, x10, (arg1[5]), (arg2[5])); + fiat_p256_subborrowx_u32(&x13, &x14, x12, (arg1[6]), (arg2[6])); + fiat_p256_subborrowx_u32(&x15, &x16, x14, (arg1[7]), (arg2[7])); + fiat_p256_cmovznz_u32(&x17, x16, 0x0, UINT32_C(0xffffffff)); + fiat_p256_addcarryx_u32(&x18, &x19, 0x0, x1, x17); + fiat_p256_addcarryx_u32(&x20, &x21, x19, x3, x17); + fiat_p256_addcarryx_u32(&x22, &x23, x21, x5, x17); + fiat_p256_addcarryx_u32(&x24, &x25, x23, x7, 0x0); + fiat_p256_addcarryx_u32(&x26, &x27, x25, x9, 0x0); + fiat_p256_addcarryx_u32(&x28, &x29, x27, x11, 0x0); + fiat_p256_addcarryx_u32(&x30, &x31, x29, x13, (fiat_p256_uint1)(x17 & 0x1)); + fiat_p256_addcarryx_u32(&x32, &x33, x31, x15, x17); + out1[0] = x18; + out1[1] = x20; + out1[2] = x22; + out1[3] = x24; + out1[4] = x26; + out1[5] = x28; + out1[6] = x30; + out1[7] = x32; +} + +/* + * The function fiat_p256_opp negates a field element in the Montgomery domain. 
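+ * (Editorial note, not part of the generated file: the body computes 0 - arg1
+ *  with a borrow chain and, whenever arg1 was non-zero, adds p back through
+ *  the all-ones mask x17, so the output stays in [0, p).)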
+ * + * Preconditions: + * 0 ≤ eval arg1 < m + * Postconditions: + * eval (from_montgomery out1) mod m = -eval (from_montgomery arg1) mod m + * 0 ≤ eval out1 < m + * + */ +static FIAT_P256_FIAT_INLINE void fiat_p256_opp(fiat_p256_montgomery_domain_field_element out1, const fiat_p256_montgomery_domain_field_element arg1) { + uint32_t x1; + fiat_p256_uint1 x2; + uint32_t x3; + fiat_p256_uint1 x4; + uint32_t x5; + fiat_p256_uint1 x6; + uint32_t x7; + fiat_p256_uint1 x8; + uint32_t x9; + fiat_p256_uint1 x10; + uint32_t x11; + fiat_p256_uint1 x12; + uint32_t x13; + fiat_p256_uint1 x14; + uint32_t x15; + fiat_p256_uint1 x16; + uint32_t x17; + uint32_t x18; + fiat_p256_uint1 x19; + uint32_t x20; + fiat_p256_uint1 x21; + uint32_t x22; + fiat_p256_uint1 x23; + uint32_t x24; + fiat_p256_uint1 x25; + uint32_t x26; + fiat_p256_uint1 x27; + uint32_t x28; + fiat_p256_uint1 x29; + uint32_t x30; + fiat_p256_uint1 x31; + uint32_t x32; + fiat_p256_uint1 x33; + fiat_p256_subborrowx_u32(&x1, &x2, 0x0, 0x0, (arg1[0])); + fiat_p256_subborrowx_u32(&x3, &x4, x2, 0x0, (arg1[1])); + fiat_p256_subborrowx_u32(&x5, &x6, x4, 0x0, (arg1[2])); + fiat_p256_subborrowx_u32(&x7, &x8, x6, 0x0, (arg1[3])); + fiat_p256_subborrowx_u32(&x9, &x10, x8, 0x0, (arg1[4])); + fiat_p256_subborrowx_u32(&x11, &x12, x10, 0x0, (arg1[5])); + fiat_p256_subborrowx_u32(&x13, &x14, x12, 0x0, (arg1[6])); + fiat_p256_subborrowx_u32(&x15, &x16, x14, 0x0, (arg1[7])); + fiat_p256_cmovznz_u32(&x17, x16, 0x0, UINT32_C(0xffffffff)); + fiat_p256_addcarryx_u32(&x18, &x19, 0x0, x1, x17); + fiat_p256_addcarryx_u32(&x20, &x21, x19, x3, x17); + fiat_p256_addcarryx_u32(&x22, &x23, x21, x5, x17); + fiat_p256_addcarryx_u32(&x24, &x25, x23, x7, 0x0); + fiat_p256_addcarryx_u32(&x26, &x27, x25, x9, 0x0); + fiat_p256_addcarryx_u32(&x28, &x29, x27, x11, 0x0); + fiat_p256_addcarryx_u32(&x30, &x31, x29, x13, (fiat_p256_uint1)(x17 & 0x1)); + fiat_p256_addcarryx_u32(&x32, &x33, x31, x15, x17); + out1[0] = x18; + out1[1] = x20; + out1[2] = x22; + out1[3] = x24; + out1[4] = x26; + out1[5] = x28; + out1[6] = x30; + out1[7] = x32; +} + +/* + * The function fiat_p256_nonzero outputs a single non-zero word if the input is non-zero and zero otherwise. + * + * Preconditions: + * 0 ≤ eval arg1 < m + * Postconditions: + * out1 = 0 ↔ eval (from_montgomery arg1) mod m = 0 + * + * Input Bounds: + * arg1: [[0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff]] + * Output Bounds: + * out1: [0x0 ~> 0xffffffff] + */ +static FIAT_P256_FIAT_INLINE void fiat_p256_nonzero(uint32_t* out1, const uint32_t arg1[8]) { + uint32_t x1; + x1 = ((arg1[0]) | ((arg1[1]) | ((arg1[2]) | ((arg1[3]) | ((arg1[4]) | ((arg1[5]) | ((arg1[6]) | (arg1[7])))))))); + *out1 = x1; +} + +/* + * The function fiat_p256_selectznz is a multi-limb conditional select. 
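+ * (Editorial note: the select is performed limb by limb with
+ *  fiat_p256_cmovznz_u32, i.e. with masks rather than a data-dependent
+ *  branch, which is what is intended to keep it constant-time.)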
+ * + * Postconditions: + * eval out1 = (if arg1 = 0 then eval arg2 else eval arg3) + * + * Input Bounds: + * arg1: [0x0 ~> 0x1] + * arg2: [[0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff]] + * arg3: [[0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff]] + * Output Bounds: + * out1: [[0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff]] + */ +static FIAT_P256_FIAT_INLINE void fiat_p256_selectznz(uint32_t out1[8], fiat_p256_uint1 arg1, const uint32_t arg2[8], const uint32_t arg3[8]) { + uint32_t x1; + uint32_t x2; + uint32_t x3; + uint32_t x4; + uint32_t x5; + uint32_t x6; + uint32_t x7; + uint32_t x8; + fiat_p256_cmovznz_u32(&x1, arg1, (arg2[0]), (arg3[0])); + fiat_p256_cmovznz_u32(&x2, arg1, (arg2[1]), (arg3[1])); + fiat_p256_cmovznz_u32(&x3, arg1, (arg2[2]), (arg3[2])); + fiat_p256_cmovznz_u32(&x4, arg1, (arg2[3]), (arg3[3])); + fiat_p256_cmovznz_u32(&x5, arg1, (arg2[4]), (arg3[4])); + fiat_p256_cmovznz_u32(&x6, arg1, (arg2[5]), (arg3[5])); + fiat_p256_cmovznz_u32(&x7, arg1, (arg2[6]), (arg3[6])); + fiat_p256_cmovznz_u32(&x8, arg1, (arg2[7]), (arg3[7])); + out1[0] = x1; + out1[1] = x2; + out1[2] = x3; + out1[3] = x4; + out1[4] = x5; + out1[5] = x6; + out1[6] = x7; + out1[7] = x8; +} diff --git a/ring-0.17.14/third_party/fiat/p256_64.h b/ring-0.17.14/third_party/fiat/p256_64.h new file mode 100644 index 0000000000..a06dd225e4 --- /dev/null +++ b/ring-0.17.14/third_party/fiat/p256_64.h @@ -0,0 +1,957 @@ +/* Autogenerated: 'src/ExtractionOCaml/word_by_word_montgomery' --inline --static --use-value-barrier p256 64 '2^256 - 2^224 + 2^192 + 2^96 - 1' mul square add sub opp from_montgomery to_montgomery nonzero selectznz to_bytes from_bytes one msat divstep divstep_precomp */ +/* curve description: p256 */ +/* machine_wordsize = 64 (from "64") */ +/* requested operations: mul, square, add, sub, opp, from_montgomery, to_montgomery, nonzero, selectznz, to_bytes, from_bytes, one, msat, divstep, divstep_precomp */ +/* m = 0xffffffff00000001000000000000000000000000ffffffffffffffffffffffff (from "2^256 - 2^224 + 2^192 + 2^96 - 1") */ +/* */ +/* NOTE: In addition to the bounds specified above each function, all */ +/* functions synthesized for this Montgomery arithmetic require the */ +/* input to be strictly less than the prime modulus (m), and also */ +/* require the input to be in the unique saturated representation. */ +/* All functions also ensure that these two properties are true of */ +/* return values. 
*/ +/* */ +/* Computed values: */ +/* eval z = z[0] + (z[1] << 64) + (z[2] << 128) + (z[3] << 192) */ +/* bytes_eval z = z[0] + (z[1] << 8) + (z[2] << 16) + (z[3] << 24) + (z[4] << 32) + (z[5] << 40) + (z[6] << 48) + (z[7] << 56) + (z[8] << 64) + (z[9] << 72) + (z[10] << 80) + (z[11] << 88) + (z[12] << 96) + (z[13] << 104) + (z[14] << 112) + (z[15] << 120) + (z[16] << 128) + (z[17] << 136) + (z[18] << 144) + (z[19] << 152) + (z[20] << 160) + (z[21] << 168) + (z[22] << 176) + (z[23] << 184) + (z[24] << 192) + (z[25] << 200) + (z[26] << 208) + (z[27] << 216) + (z[28] << 224) + (z[29] << 232) + (z[30] << 240) + (z[31] << 248) */ +/* twos_complement_eval z = let x1 := z[0] + (z[1] << 64) + (z[2] << 128) + (z[3] << 192) in */ +/* if x1 & (2^256-1) < 2^255 then x1 & (2^256-1) else (x1 & (2^256-1)) - 2^256 */ + +#include +typedef unsigned char fiat_p256_uint1; +typedef signed char fiat_p256_int1; +#if defined(__GNUC__) || defined(__clang__) +# define FIAT_P256_FIAT_EXTENSION __extension__ +# define FIAT_P256_FIAT_INLINE __inline__ +#else +# define FIAT_P256_FIAT_EXTENSION +# define FIAT_P256_FIAT_INLINE +#endif + +FIAT_P256_FIAT_EXTENSION typedef signed __int128 fiat_p256_int128; +FIAT_P256_FIAT_EXTENSION typedef unsigned __int128 fiat_p256_uint128; + +/* The type fiat_p256_montgomery_domain_field_element is a field element in the Montgomery domain. */ +/* Bounds: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] */ +typedef uint64_t fiat_p256_montgomery_domain_field_element[4]; + +/* The type fiat_p256_non_montgomery_domain_field_element is a field element NOT in the Montgomery domain. */ +/* Bounds: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] */ +typedef uint64_t fiat_p256_non_montgomery_domain_field_element[4]; + +#if (-1 & 3) != 3 +#error "This code only works on a two's complement system" +#endif + +#if !defined(FIAT_P256_NO_ASM) && (defined(__GNUC__) || defined(__clang__)) +static __inline__ uint64_t fiat_p256_value_barrier_u64(uint64_t a) { + __asm__("" : "+r"(a) : /* no inputs */); + return a; +} +#else +# define fiat_p256_value_barrier_u64(x) (x) +#endif + + +/* + * The function fiat_p256_addcarryx_u64 is an addition with carry. + * + * Postconditions: + * out1 = (arg1 + arg2 + arg3) mod 2^64 + * out2 = ⌊(arg1 + arg2 + arg3) / 2^64⌋ + * + * Input Bounds: + * arg1: [0x0 ~> 0x1] + * arg2: [0x0 ~> 0xffffffffffffffff] + * arg3: [0x0 ~> 0xffffffffffffffff] + * Output Bounds: + * out1: [0x0 ~> 0xffffffffffffffff] + * out2: [0x0 ~> 0x1] + */ +static FIAT_P256_FIAT_INLINE void fiat_p256_addcarryx_u64(uint64_t* out1, fiat_p256_uint1* out2, fiat_p256_uint1 arg1, uint64_t arg2, uint64_t arg3) { + fiat_p256_uint128 x1; + uint64_t x2; + fiat_p256_uint1 x3; + x1 = ((arg1 + (fiat_p256_uint128)arg2) + arg3); + x2 = (uint64_t)(x1 & UINT64_C(0xffffffffffffffff)); + x3 = (fiat_p256_uint1)(x1 >> 64); + *out1 = x2; + *out2 = x3; +} + +/* + * The function fiat_p256_subborrowx_u64 is a subtraction with borrow. 
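+ * (Editorial note: the implementation below derives the borrow by
+ *  arithmetic-shifting the signed 128-bit difference, which yields 0 or -1,
+ *  and then negating that value to obtain a 0/1 flag.)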
+ * + * Postconditions: + * out1 = (-arg1 + arg2 + -arg3) mod 2^64 + * out2 = -⌊(-arg1 + arg2 + -arg3) / 2^64⌋ + * + * Input Bounds: + * arg1: [0x0 ~> 0x1] + * arg2: [0x0 ~> 0xffffffffffffffff] + * arg3: [0x0 ~> 0xffffffffffffffff] + * Output Bounds: + * out1: [0x0 ~> 0xffffffffffffffff] + * out2: [0x0 ~> 0x1] + */ +static FIAT_P256_FIAT_INLINE void fiat_p256_subborrowx_u64(uint64_t* out1, fiat_p256_uint1* out2, fiat_p256_uint1 arg1, uint64_t arg2, uint64_t arg3) { + fiat_p256_int128 x1; + fiat_p256_int1 x2; + uint64_t x3; + x1 = ((arg2 - (fiat_p256_int128)arg1) - arg3); + x2 = (fiat_p256_int1)(x1 >> 64); + x3 = (uint64_t)(x1 & UINT64_C(0xffffffffffffffff)); + *out1 = x3; + *out2 = (fiat_p256_uint1)(0x0 - x2); +} + +/* + * The function fiat_p256_mulx_u64 is a multiplication, returning the full double-width result. + * + * Postconditions: + * out1 = (arg1 * arg2) mod 2^64 + * out2 = ⌊arg1 * arg2 / 2^64⌋ + * + * Input Bounds: + * arg1: [0x0 ~> 0xffffffffffffffff] + * arg2: [0x0 ~> 0xffffffffffffffff] + * Output Bounds: + * out1: [0x0 ~> 0xffffffffffffffff] + * out2: [0x0 ~> 0xffffffffffffffff] + */ +static FIAT_P256_FIAT_INLINE void fiat_p256_mulx_u64(uint64_t* out1, uint64_t* out2, uint64_t arg1, uint64_t arg2) { + fiat_p256_uint128 x1; + uint64_t x2; + uint64_t x3; + x1 = ((fiat_p256_uint128)arg1 * arg2); + x2 = (uint64_t)(x1 & UINT64_C(0xffffffffffffffff)); + x3 = (uint64_t)(x1 >> 64); + *out1 = x2; + *out2 = x3; +} + +/* + * The function fiat_p256_cmovznz_u64 is a single-word conditional move. + * + * Postconditions: + * out1 = (if arg1 = 0 then arg2 else arg3) + * + * Input Bounds: + * arg1: [0x0 ~> 0x1] + * arg2: [0x0 ~> 0xffffffffffffffff] + * arg3: [0x0 ~> 0xffffffffffffffff] + * Output Bounds: + * out1: [0x0 ~> 0xffffffffffffffff] + */ +static FIAT_P256_FIAT_INLINE void fiat_p256_cmovznz_u64(uint64_t* out1, fiat_p256_uint1 arg1, uint64_t arg2, uint64_t arg3) { + fiat_p256_uint1 x1; + uint64_t x2; + uint64_t x3; + x1 = (!(!arg1)); + x2 = ((fiat_p256_int1)(0x0 - x1) & UINT64_C(0xffffffffffffffff)); + x3 = ((fiat_p256_value_barrier_u64(x2) & arg3) | (fiat_p256_value_barrier_u64((~x2)) & arg2)); + *out1 = x3; +} + +/* + * The function fiat_p256_mul multiplies two field elements in the Montgomery domain. 
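+ * (Editorial note, not part of the generated header: this is a word-by-word
+ *  Montgomery multiplication over four 64-bit limbs. Each round multiplies one
+ *  limb of arg1 by arg2, accumulates the row, and folds in a reduction step;
+ *  since the low limb of p is 2^64 - 1, -p^-1 mod 2^64 = 1, so the running low
+ *  limb (x11, x54, x99, x144) serves directly as the reduction multiplier, and
+ *  the constants 0xffffffffffffffff, 0xffffffff and 0xffffffff00000001 are the
+ *  non-zero 64-bit limbs of p. A final conditional subtraction keeps the
+ *  result below p.)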
+ * + * Preconditions: + * 0 ≤ eval arg1 < m + * 0 ≤ eval arg2 < m + * Postconditions: + * eval (from_montgomery out1) mod m = (eval (from_montgomery arg1) * eval (from_montgomery arg2)) mod m + * 0 ≤ eval out1 < m + * + */ +static FIAT_P256_FIAT_INLINE void fiat_p256_mul(fiat_p256_montgomery_domain_field_element out1, const fiat_p256_montgomery_domain_field_element arg1, const fiat_p256_montgomery_domain_field_element arg2) { + uint64_t x1; + uint64_t x2; + uint64_t x3; + uint64_t x4; + uint64_t x5; + uint64_t x6; + uint64_t x7; + uint64_t x8; + uint64_t x9; + uint64_t x10; + uint64_t x11; + uint64_t x12; + uint64_t x13; + fiat_p256_uint1 x14; + uint64_t x15; + fiat_p256_uint1 x16; + uint64_t x17; + fiat_p256_uint1 x18; + uint64_t x19; + uint64_t x20; + uint64_t x21; + uint64_t x22; + uint64_t x23; + uint64_t x24; + uint64_t x25; + uint64_t x26; + fiat_p256_uint1 x27; + uint64_t x28; + uint64_t x29; + fiat_p256_uint1 x30; + uint64_t x31; + fiat_p256_uint1 x32; + uint64_t x33; + fiat_p256_uint1 x34; + uint64_t x35; + fiat_p256_uint1 x36; + uint64_t x37; + fiat_p256_uint1 x38; + uint64_t x39; + uint64_t x40; + uint64_t x41; + uint64_t x42; + uint64_t x43; + uint64_t x44; + uint64_t x45; + uint64_t x46; + uint64_t x47; + fiat_p256_uint1 x48; + uint64_t x49; + fiat_p256_uint1 x50; + uint64_t x51; + fiat_p256_uint1 x52; + uint64_t x53; + uint64_t x54; + fiat_p256_uint1 x55; + uint64_t x56; + fiat_p256_uint1 x57; + uint64_t x58; + fiat_p256_uint1 x59; + uint64_t x60; + fiat_p256_uint1 x61; + uint64_t x62; + fiat_p256_uint1 x63; + uint64_t x64; + uint64_t x65; + uint64_t x66; + uint64_t x67; + uint64_t x68; + uint64_t x69; + uint64_t x70; + fiat_p256_uint1 x71; + uint64_t x72; + uint64_t x73; + fiat_p256_uint1 x74; + uint64_t x75; + fiat_p256_uint1 x76; + uint64_t x77; + fiat_p256_uint1 x78; + uint64_t x79; + fiat_p256_uint1 x80; + uint64_t x81; + fiat_p256_uint1 x82; + uint64_t x83; + uint64_t x84; + uint64_t x85; + uint64_t x86; + uint64_t x87; + uint64_t x88; + uint64_t x89; + uint64_t x90; + uint64_t x91; + uint64_t x92; + fiat_p256_uint1 x93; + uint64_t x94; + fiat_p256_uint1 x95; + uint64_t x96; + fiat_p256_uint1 x97; + uint64_t x98; + uint64_t x99; + fiat_p256_uint1 x100; + uint64_t x101; + fiat_p256_uint1 x102; + uint64_t x103; + fiat_p256_uint1 x104; + uint64_t x105; + fiat_p256_uint1 x106; + uint64_t x107; + fiat_p256_uint1 x108; + uint64_t x109; + uint64_t x110; + uint64_t x111; + uint64_t x112; + uint64_t x113; + uint64_t x114; + uint64_t x115; + fiat_p256_uint1 x116; + uint64_t x117; + uint64_t x118; + fiat_p256_uint1 x119; + uint64_t x120; + fiat_p256_uint1 x121; + uint64_t x122; + fiat_p256_uint1 x123; + uint64_t x124; + fiat_p256_uint1 x125; + uint64_t x126; + fiat_p256_uint1 x127; + uint64_t x128; + uint64_t x129; + uint64_t x130; + uint64_t x131; + uint64_t x132; + uint64_t x133; + uint64_t x134; + uint64_t x135; + uint64_t x136; + uint64_t x137; + fiat_p256_uint1 x138; + uint64_t x139; + fiat_p256_uint1 x140; + uint64_t x141; + fiat_p256_uint1 x142; + uint64_t x143; + uint64_t x144; + fiat_p256_uint1 x145; + uint64_t x146; + fiat_p256_uint1 x147; + uint64_t x148; + fiat_p256_uint1 x149; + uint64_t x150; + fiat_p256_uint1 x151; + uint64_t x152; + fiat_p256_uint1 x153; + uint64_t x154; + uint64_t x155; + uint64_t x156; + uint64_t x157; + uint64_t x158; + uint64_t x159; + uint64_t x160; + fiat_p256_uint1 x161; + uint64_t x162; + uint64_t x163; + fiat_p256_uint1 x164; + uint64_t x165; + fiat_p256_uint1 x166; + uint64_t x167; + fiat_p256_uint1 x168; + uint64_t x169; + 
fiat_p256_uint1 x170; + uint64_t x171; + fiat_p256_uint1 x172; + uint64_t x173; + uint64_t x174; + fiat_p256_uint1 x175; + uint64_t x176; + fiat_p256_uint1 x177; + uint64_t x178; + fiat_p256_uint1 x179; + uint64_t x180; + fiat_p256_uint1 x181; + uint64_t x182; + fiat_p256_uint1 x183; + uint64_t x184; + uint64_t x185; + uint64_t x186; + uint64_t x187; + x1 = (arg1[1]); + x2 = (arg1[2]); + x3 = (arg1[3]); + x4 = (arg1[0]); + fiat_p256_mulx_u64(&x5, &x6, x4, (arg2[3])); + fiat_p256_mulx_u64(&x7, &x8, x4, (arg2[2])); + fiat_p256_mulx_u64(&x9, &x10, x4, (arg2[1])); + fiat_p256_mulx_u64(&x11, &x12, x4, (arg2[0])); + fiat_p256_addcarryx_u64(&x13, &x14, 0x0, x12, x9); + fiat_p256_addcarryx_u64(&x15, &x16, x14, x10, x7); + fiat_p256_addcarryx_u64(&x17, &x18, x16, x8, x5); + x19 = (x18 + x6); + fiat_p256_mulx_u64(&x20, &x21, x11, UINT64_C(0xffffffff00000001)); + fiat_p256_mulx_u64(&x22, &x23, x11, UINT32_C(0xffffffff)); + fiat_p256_mulx_u64(&x24, &x25, x11, UINT64_C(0xffffffffffffffff)); + fiat_p256_addcarryx_u64(&x26, &x27, 0x0, x25, x22); + x28 = (x27 + x23); + fiat_p256_addcarryx_u64(&x29, &x30, 0x0, x11, x24); + fiat_p256_addcarryx_u64(&x31, &x32, x30, x13, x26); + fiat_p256_addcarryx_u64(&x33, &x34, x32, x15, x28); + fiat_p256_addcarryx_u64(&x35, &x36, x34, x17, x20); + fiat_p256_addcarryx_u64(&x37, &x38, x36, x19, x21); + fiat_p256_mulx_u64(&x39, &x40, x1, (arg2[3])); + fiat_p256_mulx_u64(&x41, &x42, x1, (arg2[2])); + fiat_p256_mulx_u64(&x43, &x44, x1, (arg2[1])); + fiat_p256_mulx_u64(&x45, &x46, x1, (arg2[0])); + fiat_p256_addcarryx_u64(&x47, &x48, 0x0, x46, x43); + fiat_p256_addcarryx_u64(&x49, &x50, x48, x44, x41); + fiat_p256_addcarryx_u64(&x51, &x52, x50, x42, x39); + x53 = (x52 + x40); + fiat_p256_addcarryx_u64(&x54, &x55, 0x0, x31, x45); + fiat_p256_addcarryx_u64(&x56, &x57, x55, x33, x47); + fiat_p256_addcarryx_u64(&x58, &x59, x57, x35, x49); + fiat_p256_addcarryx_u64(&x60, &x61, x59, x37, x51); + fiat_p256_addcarryx_u64(&x62, &x63, x61, x38, x53); + fiat_p256_mulx_u64(&x64, &x65, x54, UINT64_C(0xffffffff00000001)); + fiat_p256_mulx_u64(&x66, &x67, x54, UINT32_C(0xffffffff)); + fiat_p256_mulx_u64(&x68, &x69, x54, UINT64_C(0xffffffffffffffff)); + fiat_p256_addcarryx_u64(&x70, &x71, 0x0, x69, x66); + x72 = (x71 + x67); + fiat_p256_addcarryx_u64(&x73, &x74, 0x0, x54, x68); + fiat_p256_addcarryx_u64(&x75, &x76, x74, x56, x70); + fiat_p256_addcarryx_u64(&x77, &x78, x76, x58, x72); + fiat_p256_addcarryx_u64(&x79, &x80, x78, x60, x64); + fiat_p256_addcarryx_u64(&x81, &x82, x80, x62, x65); + x83 = ((uint64_t)x82 + x63); + fiat_p256_mulx_u64(&x84, &x85, x2, (arg2[3])); + fiat_p256_mulx_u64(&x86, &x87, x2, (arg2[2])); + fiat_p256_mulx_u64(&x88, &x89, x2, (arg2[1])); + fiat_p256_mulx_u64(&x90, &x91, x2, (arg2[0])); + fiat_p256_addcarryx_u64(&x92, &x93, 0x0, x91, x88); + fiat_p256_addcarryx_u64(&x94, &x95, x93, x89, x86); + fiat_p256_addcarryx_u64(&x96, &x97, x95, x87, x84); + x98 = (x97 + x85); + fiat_p256_addcarryx_u64(&x99, &x100, 0x0, x75, x90); + fiat_p256_addcarryx_u64(&x101, &x102, x100, x77, x92); + fiat_p256_addcarryx_u64(&x103, &x104, x102, x79, x94); + fiat_p256_addcarryx_u64(&x105, &x106, x104, x81, x96); + fiat_p256_addcarryx_u64(&x107, &x108, x106, x83, x98); + fiat_p256_mulx_u64(&x109, &x110, x99, UINT64_C(0xffffffff00000001)); + fiat_p256_mulx_u64(&x111, &x112, x99, UINT32_C(0xffffffff)); + fiat_p256_mulx_u64(&x113, &x114, x99, UINT64_C(0xffffffffffffffff)); + fiat_p256_addcarryx_u64(&x115, &x116, 0x0, x114, x111); + x117 = (x116 + x112); + fiat_p256_addcarryx_u64(&x118, &x119, 
0x0, x99, x113); + fiat_p256_addcarryx_u64(&x120, &x121, x119, x101, x115); + fiat_p256_addcarryx_u64(&x122, &x123, x121, x103, x117); + fiat_p256_addcarryx_u64(&x124, &x125, x123, x105, x109); + fiat_p256_addcarryx_u64(&x126, &x127, x125, x107, x110); + x128 = ((uint64_t)x127 + x108); + fiat_p256_mulx_u64(&x129, &x130, x3, (arg2[3])); + fiat_p256_mulx_u64(&x131, &x132, x3, (arg2[2])); + fiat_p256_mulx_u64(&x133, &x134, x3, (arg2[1])); + fiat_p256_mulx_u64(&x135, &x136, x3, (arg2[0])); + fiat_p256_addcarryx_u64(&x137, &x138, 0x0, x136, x133); + fiat_p256_addcarryx_u64(&x139, &x140, x138, x134, x131); + fiat_p256_addcarryx_u64(&x141, &x142, x140, x132, x129); + x143 = (x142 + x130); + fiat_p256_addcarryx_u64(&x144, &x145, 0x0, x120, x135); + fiat_p256_addcarryx_u64(&x146, &x147, x145, x122, x137); + fiat_p256_addcarryx_u64(&x148, &x149, x147, x124, x139); + fiat_p256_addcarryx_u64(&x150, &x151, x149, x126, x141); + fiat_p256_addcarryx_u64(&x152, &x153, x151, x128, x143); + fiat_p256_mulx_u64(&x154, &x155, x144, UINT64_C(0xffffffff00000001)); + fiat_p256_mulx_u64(&x156, &x157, x144, UINT32_C(0xffffffff)); + fiat_p256_mulx_u64(&x158, &x159, x144, UINT64_C(0xffffffffffffffff)); + fiat_p256_addcarryx_u64(&x160, &x161, 0x0, x159, x156); + x162 = (x161 + x157); + fiat_p256_addcarryx_u64(&x163, &x164, 0x0, x144, x158); + fiat_p256_addcarryx_u64(&x165, &x166, x164, x146, x160); + fiat_p256_addcarryx_u64(&x167, &x168, x166, x148, x162); + fiat_p256_addcarryx_u64(&x169, &x170, x168, x150, x154); + fiat_p256_addcarryx_u64(&x171, &x172, x170, x152, x155); + x173 = ((uint64_t)x172 + x153); + fiat_p256_subborrowx_u64(&x174, &x175, 0x0, x165, UINT64_C(0xffffffffffffffff)); + fiat_p256_subborrowx_u64(&x176, &x177, x175, x167, UINT32_C(0xffffffff)); + fiat_p256_subborrowx_u64(&x178, &x179, x177, x169, 0x0); + fiat_p256_subborrowx_u64(&x180, &x181, x179, x171, UINT64_C(0xffffffff00000001)); + fiat_p256_subborrowx_u64(&x182, &x183, x181, x173, 0x0); + fiat_p256_cmovznz_u64(&x184, x183, x174, x165); + fiat_p256_cmovznz_u64(&x185, x183, x176, x167); + fiat_p256_cmovznz_u64(&x186, x183, x178, x169); + fiat_p256_cmovznz_u64(&x187, x183, x180, x171); + out1[0] = x184; + out1[1] = x185; + out1[2] = x186; + out1[3] = x187; +} + +/* + * The function fiat_p256_square squares a field element in the Montgomery domain. 
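+ * (Editorial note: the body is the same word-by-word Montgomery multiplication
+ *  as fiat_p256_mul above, specialized to arg2 = arg1. A minimal, hypothetical
+ *  caller sketch, assuming a and b already hold Montgomery-domain values and
+ *  with R = 2^256:
+ *      fiat_p256_montgomery_domain_field_element a, b, t;
+ *      fiat_p256_mul(t, a, b);      t gets a*b*R^-1 mod p
+ *      fiat_p256_square(t, t);      t gets t*t*R^-1 mod p
+ *  Conversions into and out of the Montgomery domain are provided by the
+ *  to_montgomery and from_montgomery operations listed in the file header.)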
+ * + * Preconditions: + * 0 ≤ eval arg1 < m + * Postconditions: + * eval (from_montgomery out1) mod m = (eval (from_montgomery arg1) * eval (from_montgomery arg1)) mod m + * 0 ≤ eval out1 < m + * + */ +static FIAT_P256_FIAT_INLINE void fiat_p256_square(fiat_p256_montgomery_domain_field_element out1, const fiat_p256_montgomery_domain_field_element arg1) { + uint64_t x1; + uint64_t x2; + uint64_t x3; + uint64_t x4; + uint64_t x5; + uint64_t x6; + uint64_t x7; + uint64_t x8; + uint64_t x9; + uint64_t x10; + uint64_t x11; + uint64_t x12; + uint64_t x13; + fiat_p256_uint1 x14; + uint64_t x15; + fiat_p256_uint1 x16; + uint64_t x17; + fiat_p256_uint1 x18; + uint64_t x19; + uint64_t x20; + uint64_t x21; + uint64_t x22; + uint64_t x23; + uint64_t x24; + uint64_t x25; + uint64_t x26; + fiat_p256_uint1 x27; + uint64_t x28; + uint64_t x29; + fiat_p256_uint1 x30; + uint64_t x31; + fiat_p256_uint1 x32; + uint64_t x33; + fiat_p256_uint1 x34; + uint64_t x35; + fiat_p256_uint1 x36; + uint64_t x37; + fiat_p256_uint1 x38; + uint64_t x39; + uint64_t x40; + uint64_t x41; + uint64_t x42; + uint64_t x43; + uint64_t x44; + uint64_t x45; + uint64_t x46; + uint64_t x47; + fiat_p256_uint1 x48; + uint64_t x49; + fiat_p256_uint1 x50; + uint64_t x51; + fiat_p256_uint1 x52; + uint64_t x53; + uint64_t x54; + fiat_p256_uint1 x55; + uint64_t x56; + fiat_p256_uint1 x57; + uint64_t x58; + fiat_p256_uint1 x59; + uint64_t x60; + fiat_p256_uint1 x61; + uint64_t x62; + fiat_p256_uint1 x63; + uint64_t x64; + uint64_t x65; + uint64_t x66; + uint64_t x67; + uint64_t x68; + uint64_t x69; + uint64_t x70; + fiat_p256_uint1 x71; + uint64_t x72; + uint64_t x73; + fiat_p256_uint1 x74; + uint64_t x75; + fiat_p256_uint1 x76; + uint64_t x77; + fiat_p256_uint1 x78; + uint64_t x79; + fiat_p256_uint1 x80; + uint64_t x81; + fiat_p256_uint1 x82; + uint64_t x83; + uint64_t x84; + uint64_t x85; + uint64_t x86; + uint64_t x87; + uint64_t x88; + uint64_t x89; + uint64_t x90; + uint64_t x91; + uint64_t x92; + fiat_p256_uint1 x93; + uint64_t x94; + fiat_p256_uint1 x95; + uint64_t x96; + fiat_p256_uint1 x97; + uint64_t x98; + uint64_t x99; + fiat_p256_uint1 x100; + uint64_t x101; + fiat_p256_uint1 x102; + uint64_t x103; + fiat_p256_uint1 x104; + uint64_t x105; + fiat_p256_uint1 x106; + uint64_t x107; + fiat_p256_uint1 x108; + uint64_t x109; + uint64_t x110; + uint64_t x111; + uint64_t x112; + uint64_t x113; + uint64_t x114; + uint64_t x115; + fiat_p256_uint1 x116; + uint64_t x117; + uint64_t x118; + fiat_p256_uint1 x119; + uint64_t x120; + fiat_p256_uint1 x121; + uint64_t x122; + fiat_p256_uint1 x123; + uint64_t x124; + fiat_p256_uint1 x125; + uint64_t x126; + fiat_p256_uint1 x127; + uint64_t x128; + uint64_t x129; + uint64_t x130; + uint64_t x131; + uint64_t x132; + uint64_t x133; + uint64_t x134; + uint64_t x135; + uint64_t x136; + uint64_t x137; + fiat_p256_uint1 x138; + uint64_t x139; + fiat_p256_uint1 x140; + uint64_t x141; + fiat_p256_uint1 x142; + uint64_t x143; + uint64_t x144; + fiat_p256_uint1 x145; + uint64_t x146; + fiat_p256_uint1 x147; + uint64_t x148; + fiat_p256_uint1 x149; + uint64_t x150; + fiat_p256_uint1 x151; + uint64_t x152; + fiat_p256_uint1 x153; + uint64_t x154; + uint64_t x155; + uint64_t x156; + uint64_t x157; + uint64_t x158; + uint64_t x159; + uint64_t x160; + fiat_p256_uint1 x161; + uint64_t x162; + uint64_t x163; + fiat_p256_uint1 x164; + uint64_t x165; + fiat_p256_uint1 x166; + uint64_t x167; + fiat_p256_uint1 x168; + uint64_t x169; + fiat_p256_uint1 x170; + uint64_t x171; + fiat_p256_uint1 x172; + uint64_t x173; + 
uint64_t x174; + fiat_p256_uint1 x175; + uint64_t x176; + fiat_p256_uint1 x177; + uint64_t x178; + fiat_p256_uint1 x179; + uint64_t x180; + fiat_p256_uint1 x181; + uint64_t x182; + fiat_p256_uint1 x183; + uint64_t x184; + uint64_t x185; + uint64_t x186; + uint64_t x187; + x1 = (arg1[1]); + x2 = (arg1[2]); + x3 = (arg1[3]); + x4 = (arg1[0]); + fiat_p256_mulx_u64(&x5, &x6, x4, (arg1[3])); + fiat_p256_mulx_u64(&x7, &x8, x4, (arg1[2])); + fiat_p256_mulx_u64(&x9, &x10, x4, (arg1[1])); + fiat_p256_mulx_u64(&x11, &x12, x4, (arg1[0])); + fiat_p256_addcarryx_u64(&x13, &x14, 0x0, x12, x9); + fiat_p256_addcarryx_u64(&x15, &x16, x14, x10, x7); + fiat_p256_addcarryx_u64(&x17, &x18, x16, x8, x5); + x19 = (x18 + x6); + fiat_p256_mulx_u64(&x20, &x21, x11, UINT64_C(0xffffffff00000001)); + fiat_p256_mulx_u64(&x22, &x23, x11, UINT32_C(0xffffffff)); + fiat_p256_mulx_u64(&x24, &x25, x11, UINT64_C(0xffffffffffffffff)); + fiat_p256_addcarryx_u64(&x26, &x27, 0x0, x25, x22); + x28 = (x27 + x23); + fiat_p256_addcarryx_u64(&x29, &x30, 0x0, x11, x24); + fiat_p256_addcarryx_u64(&x31, &x32, x30, x13, x26); + fiat_p256_addcarryx_u64(&x33, &x34, x32, x15, x28); + fiat_p256_addcarryx_u64(&x35, &x36, x34, x17, x20); + fiat_p256_addcarryx_u64(&x37, &x38, x36, x19, x21); + fiat_p256_mulx_u64(&x39, &x40, x1, (arg1[3])); + fiat_p256_mulx_u64(&x41, &x42, x1, (arg1[2])); + fiat_p256_mulx_u64(&x43, &x44, x1, (arg1[1])); + fiat_p256_mulx_u64(&x45, &x46, x1, (arg1[0])); + fiat_p256_addcarryx_u64(&x47, &x48, 0x0, x46, x43); + fiat_p256_addcarryx_u64(&x49, &x50, x48, x44, x41); + fiat_p256_addcarryx_u64(&x51, &x52, x50, x42, x39); + x53 = (x52 + x40); + fiat_p256_addcarryx_u64(&x54, &x55, 0x0, x31, x45); + fiat_p256_addcarryx_u64(&x56, &x57, x55, x33, x47); + fiat_p256_addcarryx_u64(&x58, &x59, x57, x35, x49); + fiat_p256_addcarryx_u64(&x60, &x61, x59, x37, x51); + fiat_p256_addcarryx_u64(&x62, &x63, x61, x38, x53); + fiat_p256_mulx_u64(&x64, &x65, x54, UINT64_C(0xffffffff00000001)); + fiat_p256_mulx_u64(&x66, &x67, x54, UINT32_C(0xffffffff)); + fiat_p256_mulx_u64(&x68, &x69, x54, UINT64_C(0xffffffffffffffff)); + fiat_p256_addcarryx_u64(&x70, &x71, 0x0, x69, x66); + x72 = (x71 + x67); + fiat_p256_addcarryx_u64(&x73, &x74, 0x0, x54, x68); + fiat_p256_addcarryx_u64(&x75, &x76, x74, x56, x70); + fiat_p256_addcarryx_u64(&x77, &x78, x76, x58, x72); + fiat_p256_addcarryx_u64(&x79, &x80, x78, x60, x64); + fiat_p256_addcarryx_u64(&x81, &x82, x80, x62, x65); + x83 = ((uint64_t)x82 + x63); + fiat_p256_mulx_u64(&x84, &x85, x2, (arg1[3])); + fiat_p256_mulx_u64(&x86, &x87, x2, (arg1[2])); + fiat_p256_mulx_u64(&x88, &x89, x2, (arg1[1])); + fiat_p256_mulx_u64(&x90, &x91, x2, (arg1[0])); + fiat_p256_addcarryx_u64(&x92, &x93, 0x0, x91, x88); + fiat_p256_addcarryx_u64(&x94, &x95, x93, x89, x86); + fiat_p256_addcarryx_u64(&x96, &x97, x95, x87, x84); + x98 = (x97 + x85); + fiat_p256_addcarryx_u64(&x99, &x100, 0x0, x75, x90); + fiat_p256_addcarryx_u64(&x101, &x102, x100, x77, x92); + fiat_p256_addcarryx_u64(&x103, &x104, x102, x79, x94); + fiat_p256_addcarryx_u64(&x105, &x106, x104, x81, x96); + fiat_p256_addcarryx_u64(&x107, &x108, x106, x83, x98); + fiat_p256_mulx_u64(&x109, &x110, x99, UINT64_C(0xffffffff00000001)); + fiat_p256_mulx_u64(&x111, &x112, x99, UINT32_C(0xffffffff)); + fiat_p256_mulx_u64(&x113, &x114, x99, UINT64_C(0xffffffffffffffff)); + fiat_p256_addcarryx_u64(&x115, &x116, 0x0, x114, x111); + x117 = (x116 + x112); + fiat_p256_addcarryx_u64(&x118, &x119, 0x0, x99, x113); + fiat_p256_addcarryx_u64(&x120, &x121, x119, x101, x115); + 
fiat_p256_addcarryx_u64(&x122, &x123, x121, x103, x117); + fiat_p256_addcarryx_u64(&x124, &x125, x123, x105, x109); + fiat_p256_addcarryx_u64(&x126, &x127, x125, x107, x110); + x128 = ((uint64_t)x127 + x108); + fiat_p256_mulx_u64(&x129, &x130, x3, (arg1[3])); + fiat_p256_mulx_u64(&x131, &x132, x3, (arg1[2])); + fiat_p256_mulx_u64(&x133, &x134, x3, (arg1[1])); + fiat_p256_mulx_u64(&x135, &x136, x3, (arg1[0])); + fiat_p256_addcarryx_u64(&x137, &x138, 0x0, x136, x133); + fiat_p256_addcarryx_u64(&x139, &x140, x138, x134, x131); + fiat_p256_addcarryx_u64(&x141, &x142, x140, x132, x129); + x143 = (x142 + x130); + fiat_p256_addcarryx_u64(&x144, &x145, 0x0, x120, x135); + fiat_p256_addcarryx_u64(&x146, &x147, x145, x122, x137); + fiat_p256_addcarryx_u64(&x148, &x149, x147, x124, x139); + fiat_p256_addcarryx_u64(&x150, &x151, x149, x126, x141); + fiat_p256_addcarryx_u64(&x152, &x153, x151, x128, x143); + fiat_p256_mulx_u64(&x154, &x155, x144, UINT64_C(0xffffffff00000001)); + fiat_p256_mulx_u64(&x156, &x157, x144, UINT32_C(0xffffffff)); + fiat_p256_mulx_u64(&x158, &x159, x144, UINT64_C(0xffffffffffffffff)); + fiat_p256_addcarryx_u64(&x160, &x161, 0x0, x159, x156); + x162 = (x161 + x157); + fiat_p256_addcarryx_u64(&x163, &x164, 0x0, x144, x158); + fiat_p256_addcarryx_u64(&x165, &x166, x164, x146, x160); + fiat_p256_addcarryx_u64(&x167, &x168, x166, x148, x162); + fiat_p256_addcarryx_u64(&x169, &x170, x168, x150, x154); + fiat_p256_addcarryx_u64(&x171, &x172, x170, x152, x155); + x173 = ((uint64_t)x172 + x153); + fiat_p256_subborrowx_u64(&x174, &x175, 0x0, x165, UINT64_C(0xffffffffffffffff)); + fiat_p256_subborrowx_u64(&x176, &x177, x175, x167, UINT32_C(0xffffffff)); + fiat_p256_subborrowx_u64(&x178, &x179, x177, x169, 0x0); + fiat_p256_subborrowx_u64(&x180, &x181, x179, x171, UINT64_C(0xffffffff00000001)); + fiat_p256_subborrowx_u64(&x182, &x183, x181, x173, 0x0); + fiat_p256_cmovznz_u64(&x184, x183, x174, x165); + fiat_p256_cmovznz_u64(&x185, x183, x176, x167); + fiat_p256_cmovznz_u64(&x186, x183, x178, x169); + fiat_p256_cmovznz_u64(&x187, x183, x180, x171); + out1[0] = x184; + out1[1] = x185; + out1[2] = x186; + out1[3] = x187; +} + +/* + * The function fiat_p256_add adds two field elements in the Montgomery domain. 
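+ * (Editorial note: the body is a plain 256-bit addition followed by a
+ *  conditional subtraction of p selected with fiat_p256_cmovznz_u64; because
+ *  both inputs are below p, a single subtraction suffices to keep the result
+ *  in [0, p).)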
+ * + * Preconditions: + * 0 ≤ eval arg1 < m + * 0 ≤ eval arg2 < m + * Postconditions: + * eval (from_montgomery out1) mod m = (eval (from_montgomery arg1) + eval (from_montgomery arg2)) mod m + * 0 ≤ eval out1 < m + * + */ +static FIAT_P256_FIAT_INLINE void fiat_p256_add(fiat_p256_montgomery_domain_field_element out1, const fiat_p256_montgomery_domain_field_element arg1, const fiat_p256_montgomery_domain_field_element arg2) { + uint64_t x1; + fiat_p256_uint1 x2; + uint64_t x3; + fiat_p256_uint1 x4; + uint64_t x5; + fiat_p256_uint1 x6; + uint64_t x7; + fiat_p256_uint1 x8; + uint64_t x9; + fiat_p256_uint1 x10; + uint64_t x11; + fiat_p256_uint1 x12; + uint64_t x13; + fiat_p256_uint1 x14; + uint64_t x15; + fiat_p256_uint1 x16; + uint64_t x17; + fiat_p256_uint1 x18; + uint64_t x19; + uint64_t x20; + uint64_t x21; + uint64_t x22; + fiat_p256_addcarryx_u64(&x1, &x2, 0x0, (arg1[0]), (arg2[0])); + fiat_p256_addcarryx_u64(&x3, &x4, x2, (arg1[1]), (arg2[1])); + fiat_p256_addcarryx_u64(&x5, &x6, x4, (arg1[2]), (arg2[2])); + fiat_p256_addcarryx_u64(&x7, &x8, x6, (arg1[3]), (arg2[3])); + fiat_p256_subborrowx_u64(&x9, &x10, 0x0, x1, UINT64_C(0xffffffffffffffff)); + fiat_p256_subborrowx_u64(&x11, &x12, x10, x3, UINT32_C(0xffffffff)); + fiat_p256_subborrowx_u64(&x13, &x14, x12, x5, 0x0); + fiat_p256_subborrowx_u64(&x15, &x16, x14, x7, UINT64_C(0xffffffff00000001)); + fiat_p256_subborrowx_u64(&x17, &x18, x16, x8, 0x0); + fiat_p256_cmovznz_u64(&x19, x18, x9, x1); + fiat_p256_cmovznz_u64(&x20, x18, x11, x3); + fiat_p256_cmovznz_u64(&x21, x18, x13, x5); + fiat_p256_cmovznz_u64(&x22, x18, x15, x7); + out1[0] = x19; + out1[1] = x20; + out1[2] = x21; + out1[3] = x22; +} + +/* + * The function fiat_p256_sub subtracts two field elements in the Montgomery domain. + * + * Preconditions: + * 0 ≤ eval arg1 < m + * 0 ≤ eval arg2 < m + * Postconditions: + * eval (from_montgomery out1) mod m = (eval (from_montgomery arg1) - eval (from_montgomery arg2)) mod m + * 0 ≤ eval out1 < m + * + */ +static FIAT_P256_FIAT_INLINE void fiat_p256_sub(fiat_p256_montgomery_domain_field_element out1, const fiat_p256_montgomery_domain_field_element arg1, const fiat_p256_montgomery_domain_field_element arg2) { + uint64_t x1; + fiat_p256_uint1 x2; + uint64_t x3; + fiat_p256_uint1 x4; + uint64_t x5; + fiat_p256_uint1 x6; + uint64_t x7; + fiat_p256_uint1 x8; + uint64_t x9; + uint64_t x10; + fiat_p256_uint1 x11; + uint64_t x12; + fiat_p256_uint1 x13; + uint64_t x14; + fiat_p256_uint1 x15; + uint64_t x16; + fiat_p256_uint1 x17; + fiat_p256_subborrowx_u64(&x1, &x2, 0x0, (arg1[0]), (arg2[0])); + fiat_p256_subborrowx_u64(&x3, &x4, x2, (arg1[1]), (arg2[1])); + fiat_p256_subborrowx_u64(&x5, &x6, x4, (arg1[2]), (arg2[2])); + fiat_p256_subborrowx_u64(&x7, &x8, x6, (arg1[3]), (arg2[3])); + fiat_p256_cmovznz_u64(&x9, x8, 0x0, UINT64_C(0xffffffffffffffff)); + fiat_p256_addcarryx_u64(&x10, &x11, 0x0, x1, x9); + fiat_p256_addcarryx_u64(&x12, &x13, x11, x3, (x9 & UINT32_C(0xffffffff))); + fiat_p256_addcarryx_u64(&x14, &x15, x13, x5, 0x0); + fiat_p256_addcarryx_u64(&x16, &x17, x15, x7, (x9 & UINT64_C(0xffffffff00000001))); + out1[0] = x10; + out1[1] = x12; + out1[2] = x14; + out1[3] = x16; +} + +/* + * The function fiat_p256_opp negates a field element in the Montgomery domain. 
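+ * (Editorial note: as in the 32-bit variant earlier in this change, the body
+ *  computes 0 - arg1 with a borrow chain and adds p back through the all-ones
+ *  mask x9 whenever arg1 was non-zero.)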
+ * + * Preconditions: + * 0 ≤ eval arg1 < m + * Postconditions: + * eval (from_montgomery out1) mod m = -eval (from_montgomery arg1) mod m + * 0 ≤ eval out1 < m + * + */ +static FIAT_P256_FIAT_INLINE void fiat_p256_opp(fiat_p256_montgomery_domain_field_element out1, const fiat_p256_montgomery_domain_field_element arg1) { + uint64_t x1; + fiat_p256_uint1 x2; + uint64_t x3; + fiat_p256_uint1 x4; + uint64_t x5; + fiat_p256_uint1 x6; + uint64_t x7; + fiat_p256_uint1 x8; + uint64_t x9; + uint64_t x10; + fiat_p256_uint1 x11; + uint64_t x12; + fiat_p256_uint1 x13; + uint64_t x14; + fiat_p256_uint1 x15; + uint64_t x16; + fiat_p256_uint1 x17; + fiat_p256_subborrowx_u64(&x1, &x2, 0x0, 0x0, (arg1[0])); + fiat_p256_subborrowx_u64(&x3, &x4, x2, 0x0, (arg1[1])); + fiat_p256_subborrowx_u64(&x5, &x6, x4, 0x0, (arg1[2])); + fiat_p256_subborrowx_u64(&x7, &x8, x6, 0x0, (arg1[3])); + fiat_p256_cmovznz_u64(&x9, x8, 0x0, UINT64_C(0xffffffffffffffff)); + fiat_p256_addcarryx_u64(&x10, &x11, 0x0, x1, x9); + fiat_p256_addcarryx_u64(&x12, &x13, x11, x3, (x9 & UINT32_C(0xffffffff))); + fiat_p256_addcarryx_u64(&x14, &x15, x13, x5, 0x0); + fiat_p256_addcarryx_u64(&x16, &x17, x15, x7, (x9 & UINT64_C(0xffffffff00000001))); + out1[0] = x10; + out1[1] = x12; + out1[2] = x14; + out1[3] = x16; +} + +/* + * The function fiat_p256_nonzero outputs a single non-zero word if the input is non-zero and zero otherwise. + * + * Preconditions: + * 0 ≤ eval arg1 < m + * Postconditions: + * out1 = 0 ↔ eval (from_montgomery arg1) mod m = 0 + * + * Input Bounds: + * arg1: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] + * Output Bounds: + * out1: [0x0 ~> 0xffffffffffffffff] + */ +static FIAT_P256_FIAT_INLINE void fiat_p256_nonzero(uint64_t* out1, const uint64_t arg1[4]) { + uint64_t x1; + x1 = ((arg1[0]) | ((arg1[1]) | ((arg1[2]) | (arg1[3])))); + *out1 = x1; +} + +/* + * The function fiat_p256_selectznz is a multi-limb conditional select. 
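+ * (Editorial note: selection happens limb by limb through
+ *  fiat_p256_cmovznz_u64, i.e. with value-barrier-protected masks rather than
+ *  a data-dependent branch.)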
+ *
+ * Postconditions:
+ *   eval out1 = (if arg1 = 0 then eval arg2 else eval arg3)
+ *
+ * Input Bounds:
+ *   arg1: [0x0 ~> 0x1]
+ *   arg2: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]]
+ *   arg3: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]]
+ * Output Bounds:
+ *   out1: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]]
+ */
+static FIAT_P256_FIAT_INLINE void fiat_p256_selectznz(uint64_t out1[4], fiat_p256_uint1 arg1, const uint64_t arg2[4], const uint64_t arg3[4]) {
+  uint64_t x1;
+  uint64_t x2;
+  uint64_t x3;
+  uint64_t x4;
+  fiat_p256_cmovznz_u64(&x1, arg1, (arg2[0]), (arg3[0]));
+  fiat_p256_cmovznz_u64(&x2, arg1, (arg2[1]), (arg3[1]));
+  fiat_p256_cmovznz_u64(&x3, arg1, (arg2[2]), (arg3[2]));
+  fiat_p256_cmovznz_u64(&x4, arg1, (arg2[3]), (arg3[3]));
+  out1[0] = x1;
+  out1[1] = x2;
+  out1[2] = x3;
+  out1[3] = x4;
+}
+
diff --git a/ring-0.17.14/third_party/fiat/p256_64_msvc.h b/ring-0.17.14/third_party/fiat/p256_64_msvc.h
new file mode 100644
index 0000000000..8b65a37342
--- /dev/null
+++ b/ring-0.17.14/third_party/fiat/p256_64_msvc.h
@@ -0,0 +1,2002 @@
+/* Autogenerated: 'src/ExtractionOCaml/word_by_word_montgomery' --inline --static --use-value-barrier --no-wide-int p256 64 '2^256 - 2^224 + 2^192 + 2^96 - 1' mul square add sub opp from_montgomery to_montgomery nonzero selectznz to_bytes from_bytes one msat divstep divstep_precomp */
+/* curve description: p256 */
+/* machine_wordsize = 64 (from "64") */
+/* requested operations: mul, square, add, sub, opp, from_montgomery, to_montgomery, nonzero, selectznz, to_bytes, from_bytes, one, msat, divstep, divstep_precomp */
+/* m = 0xffffffff00000001000000000000000000000000ffffffffffffffffffffffff (from "2^256 - 2^224 + 2^192 + 2^96 - 1") */
+/* */
+/* NOTE: In addition to the bounds specified above each function, all */
+/* functions synthesized for this Montgomery arithmetic require the */
+/* input to be strictly less than the prime modulus (m), and also */
+/* require the input to be in the unique saturated representation. */
+/* All functions also ensure that these two properties are true of */
+/* return values. */
+/* */
+/* Computed values: */
+/* eval z = z[0] + (z[1] << 64) + (z[2] << 128) + (z[3] << 192) */
+/* bytes_eval z = z[0] + (z[1] << 8) + (z[2] << 16) + (z[3] << 24) + (z[4] << 32) + (z[5] << 40) + (z[6] << 48) + (z[7] << 56) + (z[8] << 64) + (z[9] << 72) + (z[10] << 80) + (z[11] << 88) + (z[12] << 96) + (z[13] << 104) + (z[14] << 112) + (z[15] << 120) + (z[16] << 128) + (z[17] << 136) + (z[18] << 144) + (z[19] << 152) + (z[20] << 160) + (z[21] << 168) + (z[22] << 176) + (z[23] << 184) + (z[24] << 192) + (z[25] << 200) + (z[26] << 208) + (z[27] << 216) + (z[28] << 224) + (z[29] << 232) + (z[30] << 240) + (z[31] << 248) */
+/* twos_complement_eval z = let x1 := z[0] + (z[1] << 64) + (z[2] << 128) + (z[3] << 192) in */
+/* if x1 & (2^256-1) < 2^255 then x1 & (2^256-1) else (x1 & (2^256-1)) - 2^256 */
+
+#include <stdint.h>
+#include <string.h>
+#if defined(_M_X64)
+#include <intrin.h>
+#endif
+
+typedef unsigned char fiat_p256_uint1;
+typedef signed char fiat_p256_int1;
+
+#define FIAT_P256_FIAT_INLINE inline
+
+/* The type fiat_p256_montgomery_domain_field_element is a field element in the Montgomery domain.
*/ +/* Bounds: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] */ +typedef uint64_t fiat_p256_montgomery_domain_field_element[4]; + +/* The type fiat_p256_non_montgomery_domain_field_element is a field element NOT in the Montgomery domain. */ +/* Bounds: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] */ +typedef uint64_t fiat_p256_non_montgomery_domain_field_element[4]; + +#if (-1 & 3) != 3 +#error "This code only works on a two's complement system" +#endif + +#define fiat_p256_value_barrier_u64(x) (x) + + +/* + * The function fiat_p256_addcarryx_u64 is an addition with carry. + * + * Postconditions: + * out1 = (arg1 + arg2 + arg3) mod 2^64 + * out2 = ⌊(arg1 + arg2 + arg3) / 2^64⌋ + * + * Input Bounds: + * arg1: [0x0 ~> 0x1] + * arg2: [0x0 ~> 0xffffffffffffffff] + * arg3: [0x0 ~> 0xffffffffffffffff] + * Output Bounds: + * out1: [0x0 ~> 0xffffffffffffffff] + * out2: [0x0 ~> 0x1] + */ +static FIAT_P256_FIAT_INLINE void fiat_p256_addcarryx_u64(uint64_t* out1, fiat_p256_uint1* out2, fiat_p256_uint1 arg1, uint64_t arg2, uint64_t arg3) { +#if defined(_M_X64) + *out2 = _addcarry_u64(arg1, arg2, arg3, out1); +#else + arg2 += arg1; + arg1 = arg2 < arg1; + arg3 += arg2; + arg1 += arg3 < arg2; + *out1 = arg3; + *out2 = arg1; +#endif +} + +/* + * The function fiat_p256_subborrowx_u64 is a subtraction with borrow. + * + * Postconditions: + * out1 = (-arg1 + arg2 + -arg3) mod 2^64 + * out2 = -⌊(-arg1 + arg2 + -arg3) / 2^64⌋ + * + * Input Bounds: + * arg1: [0x0 ~> 0x1] + * arg2: [0x0 ~> 0xffffffffffffffff] + * arg3: [0x0 ~> 0xffffffffffffffff] + * Output Bounds: + * out1: [0x0 ~> 0xffffffffffffffff] + * out2: [0x0 ~> 0x1] + */ +static FIAT_P256_FIAT_INLINE void fiat_p256_subborrowx_u64(uint64_t* out1, fiat_p256_uint1* out2, fiat_p256_uint1 arg1, uint64_t arg2, uint64_t arg3) { +#if defined(_M_X64) + *out2 = _subborrow_u64(arg1, arg2, arg3, out1); // NOTE: edited after generation +#else + *out1 = arg2 - arg3 - arg1; + *out2 = (arg2 < arg3) | ((arg2 == arg3) & arg1); +#endif +} + +/* + * The function fiat_p256_mulx_u64 is a multiplication, returning the full double-width result. + * + * Postconditions: + * out1 = (arg1 * arg2) mod 2^64 + * out2 = ⌊arg1 * arg2 / 2^64⌋ + * + * Input Bounds: + * arg1: [0x0 ~> 0xffffffffffffffff] + * arg2: [0x0 ~> 0xffffffffffffffff] + * Output Bounds: + * out1: [0x0 ~> 0xffffffffffffffff] + * out2: [0x0 ~> 0xffffffffffffffff] + */ +static FIAT_P256_FIAT_INLINE void fiat_p256_mulx_u64(uint64_t* out1, uint64_t* out2, uint64_t arg1, uint64_t arg2) { +// NOTE: edited after generation +#if defined(_M_X64) + *out1 = _umul128(arg1, arg2, out2); +#elif defined(_M_ARM64) + *out1 = arg1 * arg2; + *out2 = __umulh(arg1, arg2); +#else +#error "This file is intended for MSVC on X64 or ARM64" +#endif +} + +/* + * The function fiat_p256_cmovznz_u64 is a single-word conditional move. 
+ * + * Postconditions: + * out1 = (if arg1 = 0 then arg2 else arg3) + * + * Input Bounds: + * arg1: [0x0 ~> 0x1] + * arg2: [0x0 ~> 0xffffffffffffffff] + * arg3: [0x0 ~> 0xffffffffffffffff] + * Output Bounds: + * out1: [0x0 ~> 0xffffffffffffffff] + */ +static FIAT_P256_FIAT_INLINE void fiat_p256_cmovznz_u64(uint64_t* out1, fiat_p256_uint1 arg1, uint64_t arg2, uint64_t arg3) { + fiat_p256_uint1 x1; + uint64_t x2; + uint64_t x3; + x1 = (!(!arg1)); + x2 = ((fiat_p256_int1)(0x0 - x1) & UINT64_C(0xffffffffffffffff)); + x3 = ((fiat_p256_value_barrier_u64(x2) & arg3) | (fiat_p256_value_barrier_u64((~x2)) & arg2)); + *out1 = x3; +} + +/* + * The function fiat_p256_mul multiplies two field elements in the Montgomery domain. + * + * Preconditions: + * 0 ≤ eval arg1 < m + * 0 ≤ eval arg2 < m + * Postconditions: + * eval (from_montgomery out1) mod m = (eval (from_montgomery arg1) * eval (from_montgomery arg2)) mod m + * 0 ≤ eval out1 < m + * + */ +static FIAT_P256_FIAT_INLINE void fiat_p256_mul(fiat_p256_montgomery_domain_field_element out1, const fiat_p256_montgomery_domain_field_element arg1, const fiat_p256_montgomery_domain_field_element arg2) { + uint64_t x1; + uint64_t x2; + uint64_t x3; + uint64_t x4; + uint64_t x5; + uint64_t x6; + uint64_t x7; + uint64_t x8; + uint64_t x9; + uint64_t x10; + uint64_t x11; + uint64_t x12; + uint64_t x13; + fiat_p256_uint1 x14; + uint64_t x15; + fiat_p256_uint1 x16; + uint64_t x17; + fiat_p256_uint1 x18; + uint64_t x19; + uint64_t x20; + uint64_t x21; + uint64_t x22; + uint64_t x23; + uint64_t x24; + uint64_t x25; + uint64_t x26; + fiat_p256_uint1 x27; + uint64_t x28; + uint64_t x29; + fiat_p256_uint1 x30; + uint64_t x31; + fiat_p256_uint1 x32; + uint64_t x33; + fiat_p256_uint1 x34; + uint64_t x35; + fiat_p256_uint1 x36; + uint64_t x37; + fiat_p256_uint1 x38; + uint64_t x39; + uint64_t x40; + uint64_t x41; + uint64_t x42; + uint64_t x43; + uint64_t x44; + uint64_t x45; + uint64_t x46; + uint64_t x47; + fiat_p256_uint1 x48; + uint64_t x49; + fiat_p256_uint1 x50; + uint64_t x51; + fiat_p256_uint1 x52; + uint64_t x53; + uint64_t x54; + fiat_p256_uint1 x55; + uint64_t x56; + fiat_p256_uint1 x57; + uint64_t x58; + fiat_p256_uint1 x59; + uint64_t x60; + fiat_p256_uint1 x61; + uint64_t x62; + fiat_p256_uint1 x63; + uint64_t x64; + uint64_t x65; + uint64_t x66; + uint64_t x67; + uint64_t x68; + uint64_t x69; + uint64_t x70; + fiat_p256_uint1 x71; + uint64_t x72; + uint64_t x73; + fiat_p256_uint1 x74; + uint64_t x75; + fiat_p256_uint1 x76; + uint64_t x77; + fiat_p256_uint1 x78; + uint64_t x79; + fiat_p256_uint1 x80; + uint64_t x81; + fiat_p256_uint1 x82; + uint64_t x83; + uint64_t x84; + uint64_t x85; + uint64_t x86; + uint64_t x87; + uint64_t x88; + uint64_t x89; + uint64_t x90; + uint64_t x91; + uint64_t x92; + fiat_p256_uint1 x93; + uint64_t x94; + fiat_p256_uint1 x95; + uint64_t x96; + fiat_p256_uint1 x97; + uint64_t x98; + uint64_t x99; + fiat_p256_uint1 x100; + uint64_t x101; + fiat_p256_uint1 x102; + uint64_t x103; + fiat_p256_uint1 x104; + uint64_t x105; + fiat_p256_uint1 x106; + uint64_t x107; + fiat_p256_uint1 x108; + uint64_t x109; + uint64_t x110; + uint64_t x111; + uint64_t x112; + uint64_t x113; + uint64_t x114; + uint64_t x115; + fiat_p256_uint1 x116; + uint64_t x117; + uint64_t x118; + fiat_p256_uint1 x119; + uint64_t x120; + fiat_p256_uint1 x121; + uint64_t x122; + fiat_p256_uint1 x123; + uint64_t x124; + fiat_p256_uint1 x125; + uint64_t x126; + fiat_p256_uint1 x127; + uint64_t x128; + uint64_t x129; + uint64_t x130; + uint64_t x131; + uint64_t x132; + 
uint64_t x133; + uint64_t x134; + uint64_t x135; + uint64_t x136; + uint64_t x137; + fiat_p256_uint1 x138; + uint64_t x139; + fiat_p256_uint1 x140; + uint64_t x141; + fiat_p256_uint1 x142; + uint64_t x143; + uint64_t x144; + fiat_p256_uint1 x145; + uint64_t x146; + fiat_p256_uint1 x147; + uint64_t x148; + fiat_p256_uint1 x149; + uint64_t x150; + fiat_p256_uint1 x151; + uint64_t x152; + fiat_p256_uint1 x153; + uint64_t x154; + uint64_t x155; + uint64_t x156; + uint64_t x157; + uint64_t x158; + uint64_t x159; + uint64_t x160; + fiat_p256_uint1 x161; + uint64_t x162; + uint64_t x163; + fiat_p256_uint1 x164; + uint64_t x165; + fiat_p256_uint1 x166; + uint64_t x167; + fiat_p256_uint1 x168; + uint64_t x169; + fiat_p256_uint1 x170; + uint64_t x171; + fiat_p256_uint1 x172; + uint64_t x173; + uint64_t x174; + fiat_p256_uint1 x175; + uint64_t x176; + fiat_p256_uint1 x177; + uint64_t x178; + fiat_p256_uint1 x179; + uint64_t x180; + fiat_p256_uint1 x181; + uint64_t x182; + fiat_p256_uint1 x183; + uint64_t x184; + uint64_t x185; + uint64_t x186; + uint64_t x187; + x1 = (arg1[1]); + x2 = (arg1[2]); + x3 = (arg1[3]); + x4 = (arg1[0]); + fiat_p256_mulx_u64(&x5, &x6, x4, (arg2[3])); + fiat_p256_mulx_u64(&x7, &x8, x4, (arg2[2])); + fiat_p256_mulx_u64(&x9, &x10, x4, (arg2[1])); + fiat_p256_mulx_u64(&x11, &x12, x4, (arg2[0])); + fiat_p256_addcarryx_u64(&x13, &x14, 0x0, x12, x9); + fiat_p256_addcarryx_u64(&x15, &x16, x14, x10, x7); + fiat_p256_addcarryx_u64(&x17, &x18, x16, x8, x5); + x19 = (x18 + x6); + fiat_p256_mulx_u64(&x20, &x21, x11, UINT64_C(0xffffffff00000001)); + fiat_p256_mulx_u64(&x22, &x23, x11, UINT32_C(0xffffffff)); + fiat_p256_mulx_u64(&x24, &x25, x11, UINT64_C(0xffffffffffffffff)); + fiat_p256_addcarryx_u64(&x26, &x27, 0x0, x25, x22); + x28 = (x27 + x23); + fiat_p256_addcarryx_u64(&x29, &x30, 0x0, x11, x24); + fiat_p256_addcarryx_u64(&x31, &x32, x30, x13, x26); + fiat_p256_addcarryx_u64(&x33, &x34, x32, x15, x28); + fiat_p256_addcarryx_u64(&x35, &x36, x34, x17, x20); + fiat_p256_addcarryx_u64(&x37, &x38, x36, x19, x21); + fiat_p256_mulx_u64(&x39, &x40, x1, (arg2[3])); + fiat_p256_mulx_u64(&x41, &x42, x1, (arg2[2])); + fiat_p256_mulx_u64(&x43, &x44, x1, (arg2[1])); + fiat_p256_mulx_u64(&x45, &x46, x1, (arg2[0])); + fiat_p256_addcarryx_u64(&x47, &x48, 0x0, x46, x43); + fiat_p256_addcarryx_u64(&x49, &x50, x48, x44, x41); + fiat_p256_addcarryx_u64(&x51, &x52, x50, x42, x39); + x53 = (x52 + x40); + fiat_p256_addcarryx_u64(&x54, &x55, 0x0, x31, x45); + fiat_p256_addcarryx_u64(&x56, &x57, x55, x33, x47); + fiat_p256_addcarryx_u64(&x58, &x59, x57, x35, x49); + fiat_p256_addcarryx_u64(&x60, &x61, x59, x37, x51); + fiat_p256_addcarryx_u64(&x62, &x63, x61, x38, x53); + fiat_p256_mulx_u64(&x64, &x65, x54, UINT64_C(0xffffffff00000001)); + fiat_p256_mulx_u64(&x66, &x67, x54, UINT32_C(0xffffffff)); + fiat_p256_mulx_u64(&x68, &x69, x54, UINT64_C(0xffffffffffffffff)); + fiat_p256_addcarryx_u64(&x70, &x71, 0x0, x69, x66); + x72 = (x71 + x67); + fiat_p256_addcarryx_u64(&x73, &x74, 0x0, x54, x68); + fiat_p256_addcarryx_u64(&x75, &x76, x74, x56, x70); + fiat_p256_addcarryx_u64(&x77, &x78, x76, x58, x72); + fiat_p256_addcarryx_u64(&x79, &x80, x78, x60, x64); + fiat_p256_addcarryx_u64(&x81, &x82, x80, x62, x65); + x83 = ((uint64_t)x82 + x63); + fiat_p256_mulx_u64(&x84, &x85, x2, (arg2[3])); + fiat_p256_mulx_u64(&x86, &x87, x2, (arg2[2])); + fiat_p256_mulx_u64(&x88, &x89, x2, (arg2[1])); + fiat_p256_mulx_u64(&x90, &x91, x2, (arg2[0])); + fiat_p256_addcarryx_u64(&x92, &x93, 0x0, x91, x88); + 
fiat_p256_addcarryx_u64(&x94, &x95, x93, x89, x86); + fiat_p256_addcarryx_u64(&x96, &x97, x95, x87, x84); + x98 = (x97 + x85); + fiat_p256_addcarryx_u64(&x99, &x100, 0x0, x75, x90); + fiat_p256_addcarryx_u64(&x101, &x102, x100, x77, x92); + fiat_p256_addcarryx_u64(&x103, &x104, x102, x79, x94); + fiat_p256_addcarryx_u64(&x105, &x106, x104, x81, x96); + fiat_p256_addcarryx_u64(&x107, &x108, x106, x83, x98); + fiat_p256_mulx_u64(&x109, &x110, x99, UINT64_C(0xffffffff00000001)); + fiat_p256_mulx_u64(&x111, &x112, x99, UINT32_C(0xffffffff)); + fiat_p256_mulx_u64(&x113, &x114, x99, UINT64_C(0xffffffffffffffff)); + fiat_p256_addcarryx_u64(&x115, &x116, 0x0, x114, x111); + x117 = (x116 + x112); + fiat_p256_addcarryx_u64(&x118, &x119, 0x0, x99, x113); + fiat_p256_addcarryx_u64(&x120, &x121, x119, x101, x115); + fiat_p256_addcarryx_u64(&x122, &x123, x121, x103, x117); + fiat_p256_addcarryx_u64(&x124, &x125, x123, x105, x109); + fiat_p256_addcarryx_u64(&x126, &x127, x125, x107, x110); + x128 = ((uint64_t)x127 + x108); + fiat_p256_mulx_u64(&x129, &x130, x3, (arg2[3])); + fiat_p256_mulx_u64(&x131, &x132, x3, (arg2[2])); + fiat_p256_mulx_u64(&x133, &x134, x3, (arg2[1])); + fiat_p256_mulx_u64(&x135, &x136, x3, (arg2[0])); + fiat_p256_addcarryx_u64(&x137, &x138, 0x0, x136, x133); + fiat_p256_addcarryx_u64(&x139, &x140, x138, x134, x131); + fiat_p256_addcarryx_u64(&x141, &x142, x140, x132, x129); + x143 = (x142 + x130); + fiat_p256_addcarryx_u64(&x144, &x145, 0x0, x120, x135); + fiat_p256_addcarryx_u64(&x146, &x147, x145, x122, x137); + fiat_p256_addcarryx_u64(&x148, &x149, x147, x124, x139); + fiat_p256_addcarryx_u64(&x150, &x151, x149, x126, x141); + fiat_p256_addcarryx_u64(&x152, &x153, x151, x128, x143); + fiat_p256_mulx_u64(&x154, &x155, x144, UINT64_C(0xffffffff00000001)); + fiat_p256_mulx_u64(&x156, &x157, x144, UINT32_C(0xffffffff)); + fiat_p256_mulx_u64(&x158, &x159, x144, UINT64_C(0xffffffffffffffff)); + fiat_p256_addcarryx_u64(&x160, &x161, 0x0, x159, x156); + x162 = (x161 + x157); + fiat_p256_addcarryx_u64(&x163, &x164, 0x0, x144, x158); + fiat_p256_addcarryx_u64(&x165, &x166, x164, x146, x160); + fiat_p256_addcarryx_u64(&x167, &x168, x166, x148, x162); + fiat_p256_addcarryx_u64(&x169, &x170, x168, x150, x154); + fiat_p256_addcarryx_u64(&x171, &x172, x170, x152, x155); + x173 = ((uint64_t)x172 + x153); + fiat_p256_subborrowx_u64(&x174, &x175, 0x0, x165, UINT64_C(0xffffffffffffffff)); + fiat_p256_subborrowx_u64(&x176, &x177, x175, x167, UINT32_C(0xffffffff)); + fiat_p256_subborrowx_u64(&x178, &x179, x177, x169, 0x0); + fiat_p256_subborrowx_u64(&x180, &x181, x179, x171, UINT64_C(0xffffffff00000001)); + fiat_p256_subborrowx_u64(&x182, &x183, x181, x173, 0x0); + fiat_p256_cmovznz_u64(&x184, x183, x174, x165); + fiat_p256_cmovznz_u64(&x185, x183, x176, x167); + fiat_p256_cmovznz_u64(&x186, x183, x178, x169); + fiat_p256_cmovznz_u64(&x187, x183, x180, x171); + out1[0] = x184; + out1[1] = x185; + out1[2] = x186; + out1[3] = x187; +} + +/* + * The function fiat_p256_square squares a field element in the Montgomery domain. 
+ * + * Preconditions: + * 0 ≤ eval arg1 < m + * Postconditions: + * eval (from_montgomery out1) mod m = (eval (from_montgomery arg1) * eval (from_montgomery arg1)) mod m + * 0 ≤ eval out1 < m + * + */ +static FIAT_P256_FIAT_INLINE void fiat_p256_square(fiat_p256_montgomery_domain_field_element out1, const fiat_p256_montgomery_domain_field_element arg1) { + uint64_t x1; + uint64_t x2; + uint64_t x3; + uint64_t x4; + uint64_t x5; + uint64_t x6; + uint64_t x7; + uint64_t x8; + uint64_t x9; + uint64_t x10; + uint64_t x11; + uint64_t x12; + uint64_t x13; + fiat_p256_uint1 x14; + uint64_t x15; + fiat_p256_uint1 x16; + uint64_t x17; + fiat_p256_uint1 x18; + uint64_t x19; + uint64_t x20; + uint64_t x21; + uint64_t x22; + uint64_t x23; + uint64_t x24; + uint64_t x25; + uint64_t x26; + fiat_p256_uint1 x27; + uint64_t x28; + uint64_t x29; + fiat_p256_uint1 x30; + uint64_t x31; + fiat_p256_uint1 x32; + uint64_t x33; + fiat_p256_uint1 x34; + uint64_t x35; + fiat_p256_uint1 x36; + uint64_t x37; + fiat_p256_uint1 x38; + uint64_t x39; + uint64_t x40; + uint64_t x41; + uint64_t x42; + uint64_t x43; + uint64_t x44; + uint64_t x45; + uint64_t x46; + uint64_t x47; + fiat_p256_uint1 x48; + uint64_t x49; + fiat_p256_uint1 x50; + uint64_t x51; + fiat_p256_uint1 x52; + uint64_t x53; + uint64_t x54; + fiat_p256_uint1 x55; + uint64_t x56; + fiat_p256_uint1 x57; + uint64_t x58; + fiat_p256_uint1 x59; + uint64_t x60; + fiat_p256_uint1 x61; + uint64_t x62; + fiat_p256_uint1 x63; + uint64_t x64; + uint64_t x65; + uint64_t x66; + uint64_t x67; + uint64_t x68; + uint64_t x69; + uint64_t x70; + fiat_p256_uint1 x71; + uint64_t x72; + uint64_t x73; + fiat_p256_uint1 x74; + uint64_t x75; + fiat_p256_uint1 x76; + uint64_t x77; + fiat_p256_uint1 x78; + uint64_t x79; + fiat_p256_uint1 x80; + uint64_t x81; + fiat_p256_uint1 x82; + uint64_t x83; + uint64_t x84; + uint64_t x85; + uint64_t x86; + uint64_t x87; + uint64_t x88; + uint64_t x89; + uint64_t x90; + uint64_t x91; + uint64_t x92; + fiat_p256_uint1 x93; + uint64_t x94; + fiat_p256_uint1 x95; + uint64_t x96; + fiat_p256_uint1 x97; + uint64_t x98; + uint64_t x99; + fiat_p256_uint1 x100; + uint64_t x101; + fiat_p256_uint1 x102; + uint64_t x103; + fiat_p256_uint1 x104; + uint64_t x105; + fiat_p256_uint1 x106; + uint64_t x107; + fiat_p256_uint1 x108; + uint64_t x109; + uint64_t x110; + uint64_t x111; + uint64_t x112; + uint64_t x113; + uint64_t x114; + uint64_t x115; + fiat_p256_uint1 x116; + uint64_t x117; + uint64_t x118; + fiat_p256_uint1 x119; + uint64_t x120; + fiat_p256_uint1 x121; + uint64_t x122; + fiat_p256_uint1 x123; + uint64_t x124; + fiat_p256_uint1 x125; + uint64_t x126; + fiat_p256_uint1 x127; + uint64_t x128; + uint64_t x129; + uint64_t x130; + uint64_t x131; + uint64_t x132; + uint64_t x133; + uint64_t x134; + uint64_t x135; + uint64_t x136; + uint64_t x137; + fiat_p256_uint1 x138; + uint64_t x139; + fiat_p256_uint1 x140; + uint64_t x141; + fiat_p256_uint1 x142; + uint64_t x143; + uint64_t x144; + fiat_p256_uint1 x145; + uint64_t x146; + fiat_p256_uint1 x147; + uint64_t x148; + fiat_p256_uint1 x149; + uint64_t x150; + fiat_p256_uint1 x151; + uint64_t x152; + fiat_p256_uint1 x153; + uint64_t x154; + uint64_t x155; + uint64_t x156; + uint64_t x157; + uint64_t x158; + uint64_t x159; + uint64_t x160; + fiat_p256_uint1 x161; + uint64_t x162; + uint64_t x163; + fiat_p256_uint1 x164; + uint64_t x165; + fiat_p256_uint1 x166; + uint64_t x167; + fiat_p256_uint1 x168; + uint64_t x169; + fiat_p256_uint1 x170; + uint64_t x171; + fiat_p256_uint1 x172; + uint64_t x173; + 
uint64_t x174; + fiat_p256_uint1 x175; + uint64_t x176; + fiat_p256_uint1 x177; + uint64_t x178; + fiat_p256_uint1 x179; + uint64_t x180; + fiat_p256_uint1 x181; + uint64_t x182; + fiat_p256_uint1 x183; + uint64_t x184; + uint64_t x185; + uint64_t x186; + uint64_t x187; + x1 = (arg1[1]); + x2 = (arg1[2]); + x3 = (arg1[3]); + x4 = (arg1[0]); + fiat_p256_mulx_u64(&x5, &x6, x4, (arg1[3])); + fiat_p256_mulx_u64(&x7, &x8, x4, (arg1[2])); + fiat_p256_mulx_u64(&x9, &x10, x4, (arg1[1])); + fiat_p256_mulx_u64(&x11, &x12, x4, (arg1[0])); + fiat_p256_addcarryx_u64(&x13, &x14, 0x0, x12, x9); + fiat_p256_addcarryx_u64(&x15, &x16, x14, x10, x7); + fiat_p256_addcarryx_u64(&x17, &x18, x16, x8, x5); + x19 = (x18 + x6); + fiat_p256_mulx_u64(&x20, &x21, x11, UINT64_C(0xffffffff00000001)); + fiat_p256_mulx_u64(&x22, &x23, x11, UINT32_C(0xffffffff)); + fiat_p256_mulx_u64(&x24, &x25, x11, UINT64_C(0xffffffffffffffff)); + fiat_p256_addcarryx_u64(&x26, &x27, 0x0, x25, x22); + x28 = (x27 + x23); + fiat_p256_addcarryx_u64(&x29, &x30, 0x0, x11, x24); + fiat_p256_addcarryx_u64(&x31, &x32, x30, x13, x26); + fiat_p256_addcarryx_u64(&x33, &x34, x32, x15, x28); + fiat_p256_addcarryx_u64(&x35, &x36, x34, x17, x20); + fiat_p256_addcarryx_u64(&x37, &x38, x36, x19, x21); + fiat_p256_mulx_u64(&x39, &x40, x1, (arg1[3])); + fiat_p256_mulx_u64(&x41, &x42, x1, (arg1[2])); + fiat_p256_mulx_u64(&x43, &x44, x1, (arg1[1])); + fiat_p256_mulx_u64(&x45, &x46, x1, (arg1[0])); + fiat_p256_addcarryx_u64(&x47, &x48, 0x0, x46, x43); + fiat_p256_addcarryx_u64(&x49, &x50, x48, x44, x41); + fiat_p256_addcarryx_u64(&x51, &x52, x50, x42, x39); + x53 = (x52 + x40); + fiat_p256_addcarryx_u64(&x54, &x55, 0x0, x31, x45); + fiat_p256_addcarryx_u64(&x56, &x57, x55, x33, x47); + fiat_p256_addcarryx_u64(&x58, &x59, x57, x35, x49); + fiat_p256_addcarryx_u64(&x60, &x61, x59, x37, x51); + fiat_p256_addcarryx_u64(&x62, &x63, x61, x38, x53); + fiat_p256_mulx_u64(&x64, &x65, x54, UINT64_C(0xffffffff00000001)); + fiat_p256_mulx_u64(&x66, &x67, x54, UINT32_C(0xffffffff)); + fiat_p256_mulx_u64(&x68, &x69, x54, UINT64_C(0xffffffffffffffff)); + fiat_p256_addcarryx_u64(&x70, &x71, 0x0, x69, x66); + x72 = (x71 + x67); + fiat_p256_addcarryx_u64(&x73, &x74, 0x0, x54, x68); + fiat_p256_addcarryx_u64(&x75, &x76, x74, x56, x70); + fiat_p256_addcarryx_u64(&x77, &x78, x76, x58, x72); + fiat_p256_addcarryx_u64(&x79, &x80, x78, x60, x64); + fiat_p256_addcarryx_u64(&x81, &x82, x80, x62, x65); + x83 = ((uint64_t)x82 + x63); + fiat_p256_mulx_u64(&x84, &x85, x2, (arg1[3])); + fiat_p256_mulx_u64(&x86, &x87, x2, (arg1[2])); + fiat_p256_mulx_u64(&x88, &x89, x2, (arg1[1])); + fiat_p256_mulx_u64(&x90, &x91, x2, (arg1[0])); + fiat_p256_addcarryx_u64(&x92, &x93, 0x0, x91, x88); + fiat_p256_addcarryx_u64(&x94, &x95, x93, x89, x86); + fiat_p256_addcarryx_u64(&x96, &x97, x95, x87, x84); + x98 = (x97 + x85); + fiat_p256_addcarryx_u64(&x99, &x100, 0x0, x75, x90); + fiat_p256_addcarryx_u64(&x101, &x102, x100, x77, x92); + fiat_p256_addcarryx_u64(&x103, &x104, x102, x79, x94); + fiat_p256_addcarryx_u64(&x105, &x106, x104, x81, x96); + fiat_p256_addcarryx_u64(&x107, &x108, x106, x83, x98); + fiat_p256_mulx_u64(&x109, &x110, x99, UINT64_C(0xffffffff00000001)); + fiat_p256_mulx_u64(&x111, &x112, x99, UINT32_C(0xffffffff)); + fiat_p256_mulx_u64(&x113, &x114, x99, UINT64_C(0xffffffffffffffff)); + fiat_p256_addcarryx_u64(&x115, &x116, 0x0, x114, x111); + x117 = (x116 + x112); + fiat_p256_addcarryx_u64(&x118, &x119, 0x0, x99, x113); + fiat_p256_addcarryx_u64(&x120, &x121, x119, x101, x115); + 
fiat_p256_addcarryx_u64(&x122, &x123, x121, x103, x117); + fiat_p256_addcarryx_u64(&x124, &x125, x123, x105, x109); + fiat_p256_addcarryx_u64(&x126, &x127, x125, x107, x110); + x128 = ((uint64_t)x127 + x108); + fiat_p256_mulx_u64(&x129, &x130, x3, (arg1[3])); + fiat_p256_mulx_u64(&x131, &x132, x3, (arg1[2])); + fiat_p256_mulx_u64(&x133, &x134, x3, (arg1[1])); + fiat_p256_mulx_u64(&x135, &x136, x3, (arg1[0])); + fiat_p256_addcarryx_u64(&x137, &x138, 0x0, x136, x133); + fiat_p256_addcarryx_u64(&x139, &x140, x138, x134, x131); + fiat_p256_addcarryx_u64(&x141, &x142, x140, x132, x129); + x143 = (x142 + x130); + fiat_p256_addcarryx_u64(&x144, &x145, 0x0, x120, x135); + fiat_p256_addcarryx_u64(&x146, &x147, x145, x122, x137); + fiat_p256_addcarryx_u64(&x148, &x149, x147, x124, x139); + fiat_p256_addcarryx_u64(&x150, &x151, x149, x126, x141); + fiat_p256_addcarryx_u64(&x152, &x153, x151, x128, x143); + fiat_p256_mulx_u64(&x154, &x155, x144, UINT64_C(0xffffffff00000001)); + fiat_p256_mulx_u64(&x156, &x157, x144, UINT32_C(0xffffffff)); + fiat_p256_mulx_u64(&x158, &x159, x144, UINT64_C(0xffffffffffffffff)); + fiat_p256_addcarryx_u64(&x160, &x161, 0x0, x159, x156); + x162 = (x161 + x157); + fiat_p256_addcarryx_u64(&x163, &x164, 0x0, x144, x158); + fiat_p256_addcarryx_u64(&x165, &x166, x164, x146, x160); + fiat_p256_addcarryx_u64(&x167, &x168, x166, x148, x162); + fiat_p256_addcarryx_u64(&x169, &x170, x168, x150, x154); + fiat_p256_addcarryx_u64(&x171, &x172, x170, x152, x155); + x173 = ((uint64_t)x172 + x153); + fiat_p256_subborrowx_u64(&x174, &x175, 0x0, x165, UINT64_C(0xffffffffffffffff)); + fiat_p256_subborrowx_u64(&x176, &x177, x175, x167, UINT32_C(0xffffffff)); + fiat_p256_subborrowx_u64(&x178, &x179, x177, x169, 0x0); + fiat_p256_subborrowx_u64(&x180, &x181, x179, x171, UINT64_C(0xffffffff00000001)); + fiat_p256_subborrowx_u64(&x182, &x183, x181, x173, 0x0); + fiat_p256_cmovznz_u64(&x184, x183, x174, x165); + fiat_p256_cmovznz_u64(&x185, x183, x176, x167); + fiat_p256_cmovznz_u64(&x186, x183, x178, x169); + fiat_p256_cmovznz_u64(&x187, x183, x180, x171); + out1[0] = x184; + out1[1] = x185; + out1[2] = x186; + out1[3] = x187; +} + +/* + * The function fiat_p256_add adds two field elements in the Montgomery domain. 
+ * + * Preconditions: + * 0 ≤ eval arg1 < m + * 0 ≤ eval arg2 < m + * Postconditions: + * eval (from_montgomery out1) mod m = (eval (from_montgomery arg1) + eval (from_montgomery arg2)) mod m + * 0 ≤ eval out1 < m + * + */ +static FIAT_P256_FIAT_INLINE void fiat_p256_add(fiat_p256_montgomery_domain_field_element out1, const fiat_p256_montgomery_domain_field_element arg1, const fiat_p256_montgomery_domain_field_element arg2) { + uint64_t x1; + fiat_p256_uint1 x2; + uint64_t x3; + fiat_p256_uint1 x4; + uint64_t x5; + fiat_p256_uint1 x6; + uint64_t x7; + fiat_p256_uint1 x8; + uint64_t x9; + fiat_p256_uint1 x10; + uint64_t x11; + fiat_p256_uint1 x12; + uint64_t x13; + fiat_p256_uint1 x14; + uint64_t x15; + fiat_p256_uint1 x16; + uint64_t x17; + fiat_p256_uint1 x18; + uint64_t x19; + uint64_t x20; + uint64_t x21; + uint64_t x22; + fiat_p256_addcarryx_u64(&x1, &x2, 0x0, (arg1[0]), (arg2[0])); + fiat_p256_addcarryx_u64(&x3, &x4, x2, (arg1[1]), (arg2[1])); + fiat_p256_addcarryx_u64(&x5, &x6, x4, (arg1[2]), (arg2[2])); + fiat_p256_addcarryx_u64(&x7, &x8, x6, (arg1[3]), (arg2[3])); + fiat_p256_subborrowx_u64(&x9, &x10, 0x0, x1, UINT64_C(0xffffffffffffffff)); + fiat_p256_subborrowx_u64(&x11, &x12, x10, x3, UINT32_C(0xffffffff)); + fiat_p256_subborrowx_u64(&x13, &x14, x12, x5, 0x0); + fiat_p256_subborrowx_u64(&x15, &x16, x14, x7, UINT64_C(0xffffffff00000001)); + fiat_p256_subborrowx_u64(&x17, &x18, x16, x8, 0x0); + fiat_p256_cmovznz_u64(&x19, x18, x9, x1); + fiat_p256_cmovznz_u64(&x20, x18, x11, x3); + fiat_p256_cmovznz_u64(&x21, x18, x13, x5); + fiat_p256_cmovznz_u64(&x22, x18, x15, x7); + out1[0] = x19; + out1[1] = x20; + out1[2] = x21; + out1[3] = x22; +} + +/* + * The function fiat_p256_sub subtracts two field elements in the Montgomery domain. + * + * Preconditions: + * 0 ≤ eval arg1 < m + * 0 ≤ eval arg2 < m + * Postconditions: + * eval (from_montgomery out1) mod m = (eval (from_montgomery arg1) - eval (from_montgomery arg2)) mod m + * 0 ≤ eval out1 < m + * + */ +static FIAT_P256_FIAT_INLINE void fiat_p256_sub(fiat_p256_montgomery_domain_field_element out1, const fiat_p256_montgomery_domain_field_element arg1, const fiat_p256_montgomery_domain_field_element arg2) { + uint64_t x1; + fiat_p256_uint1 x2; + uint64_t x3; + fiat_p256_uint1 x4; + uint64_t x5; + fiat_p256_uint1 x6; + uint64_t x7; + fiat_p256_uint1 x8; + uint64_t x9; + uint64_t x10; + fiat_p256_uint1 x11; + uint64_t x12; + fiat_p256_uint1 x13; + uint64_t x14; + fiat_p256_uint1 x15; + uint64_t x16; + fiat_p256_uint1 x17; + fiat_p256_subborrowx_u64(&x1, &x2, 0x0, (arg1[0]), (arg2[0])); + fiat_p256_subborrowx_u64(&x3, &x4, x2, (arg1[1]), (arg2[1])); + fiat_p256_subborrowx_u64(&x5, &x6, x4, (arg1[2]), (arg2[2])); + fiat_p256_subborrowx_u64(&x7, &x8, x6, (arg1[3]), (arg2[3])); + fiat_p256_cmovznz_u64(&x9, x8, 0x0, UINT64_C(0xffffffffffffffff)); + fiat_p256_addcarryx_u64(&x10, &x11, 0x0, x1, x9); + fiat_p256_addcarryx_u64(&x12, &x13, x11, x3, (x9 & UINT32_C(0xffffffff))); + fiat_p256_addcarryx_u64(&x14, &x15, x13, x5, 0x0); + fiat_p256_addcarryx_u64(&x16, &x17, x15, x7, (x9 & UINT64_C(0xffffffff00000001))); + out1[0] = x10; + out1[1] = x12; + out1[2] = x14; + out1[3] = x16; +} + +/* + * The function fiat_p256_opp negates a field element in the Montgomery domain. 
+ * + * Preconditions: + * 0 ≤ eval arg1 < m + * Postconditions: + * eval (from_montgomery out1) mod m = -eval (from_montgomery arg1) mod m + * 0 ≤ eval out1 < m + * + */ +static FIAT_P256_FIAT_INLINE void fiat_p256_opp(fiat_p256_montgomery_domain_field_element out1, const fiat_p256_montgomery_domain_field_element arg1) { + uint64_t x1; + fiat_p256_uint1 x2; + uint64_t x3; + fiat_p256_uint1 x4; + uint64_t x5; + fiat_p256_uint1 x6; + uint64_t x7; + fiat_p256_uint1 x8; + uint64_t x9; + uint64_t x10; + fiat_p256_uint1 x11; + uint64_t x12; + fiat_p256_uint1 x13; + uint64_t x14; + fiat_p256_uint1 x15; + uint64_t x16; + fiat_p256_uint1 x17; + fiat_p256_subborrowx_u64(&x1, &x2, 0x0, 0x0, (arg1[0])); + fiat_p256_subborrowx_u64(&x3, &x4, x2, 0x0, (arg1[1])); + fiat_p256_subborrowx_u64(&x5, &x6, x4, 0x0, (arg1[2])); + fiat_p256_subborrowx_u64(&x7, &x8, x6, 0x0, (arg1[3])); + fiat_p256_cmovznz_u64(&x9, x8, 0x0, UINT64_C(0xffffffffffffffff)); + fiat_p256_addcarryx_u64(&x10, &x11, 0x0, x1, x9); + fiat_p256_addcarryx_u64(&x12, &x13, x11, x3, (x9 & UINT32_C(0xffffffff))); + fiat_p256_addcarryx_u64(&x14, &x15, x13, x5, 0x0); + fiat_p256_addcarryx_u64(&x16, &x17, x15, x7, (x9 & UINT64_C(0xffffffff00000001))); + out1[0] = x10; + out1[1] = x12; + out1[2] = x14; + out1[3] = x16; +} + +/* + * The function fiat_p256_from_montgomery translates a field element out of the Montgomery domain. + * + * Preconditions: + * 0 ≤ eval arg1 < m + * Postconditions: + * eval out1 mod m = (eval arg1 * ((2^64)⁻¹ mod m)^4) mod m + * 0 ≤ eval out1 < m + * + */ +static FIAT_P256_FIAT_INLINE void fiat_p256_from_montgomery(fiat_p256_non_montgomery_domain_field_element out1, const fiat_p256_montgomery_domain_field_element arg1) { + uint64_t x1; + uint64_t x2; + uint64_t x3; + uint64_t x4; + uint64_t x5; + uint64_t x6; + uint64_t x7; + uint64_t x8; + fiat_p256_uint1 x9; + uint64_t x10; + fiat_p256_uint1 x11; + uint64_t x12; + fiat_p256_uint1 x13; + uint64_t x14; + fiat_p256_uint1 x15; + uint64_t x16; + uint64_t x17; + uint64_t x18; + uint64_t x19; + uint64_t x20; + uint64_t x21; + uint64_t x22; + fiat_p256_uint1 x23; + uint64_t x24; + fiat_p256_uint1 x25; + uint64_t x26; + fiat_p256_uint1 x27; + uint64_t x28; + fiat_p256_uint1 x29; + uint64_t x30; + fiat_p256_uint1 x31; + uint64_t x32; + fiat_p256_uint1 x33; + uint64_t x34; + fiat_p256_uint1 x35; + uint64_t x36; + fiat_p256_uint1 x37; + uint64_t x38; + uint64_t x39; + uint64_t x40; + uint64_t x41; + uint64_t x42; + uint64_t x43; + uint64_t x44; + fiat_p256_uint1 x45; + uint64_t x46; + fiat_p256_uint1 x47; + uint64_t x48; + fiat_p256_uint1 x49; + uint64_t x50; + fiat_p256_uint1 x51; + uint64_t x52; + fiat_p256_uint1 x53; + uint64_t x54; + fiat_p256_uint1 x55; + uint64_t x56; + fiat_p256_uint1 x57; + uint64_t x58; + fiat_p256_uint1 x59; + uint64_t x60; + uint64_t x61; + uint64_t x62; + uint64_t x63; + uint64_t x64; + uint64_t x65; + uint64_t x66; + fiat_p256_uint1 x67; + uint64_t x68; + fiat_p256_uint1 x69; + uint64_t x70; + fiat_p256_uint1 x71; + uint64_t x72; + fiat_p256_uint1 x73; + uint64_t x74; + fiat_p256_uint1 x75; + uint64_t x76; + uint64_t x77; + fiat_p256_uint1 x78; + uint64_t x79; + fiat_p256_uint1 x80; + uint64_t x81; + fiat_p256_uint1 x82; + uint64_t x83; + fiat_p256_uint1 x84; + uint64_t x85; + fiat_p256_uint1 x86; + uint64_t x87; + uint64_t x88; + uint64_t x89; + uint64_t x90; + x1 = (arg1[0]); + fiat_p256_mulx_u64(&x2, &x3, x1, UINT64_C(0xffffffff00000001)); + fiat_p256_mulx_u64(&x4, &x5, x1, UINT32_C(0xffffffff)); + fiat_p256_mulx_u64(&x6, &x7, x1, 
UINT64_C(0xffffffffffffffff)); + fiat_p256_addcarryx_u64(&x8, &x9, 0x0, x7, x4); + fiat_p256_addcarryx_u64(&x10, &x11, 0x0, x1, x6); + fiat_p256_addcarryx_u64(&x12, &x13, x11, 0x0, x8); + fiat_p256_addcarryx_u64(&x14, &x15, 0x0, x12, (arg1[1])); + fiat_p256_mulx_u64(&x16, &x17, x14, UINT64_C(0xffffffff00000001)); + fiat_p256_mulx_u64(&x18, &x19, x14, UINT32_C(0xffffffff)); + fiat_p256_mulx_u64(&x20, &x21, x14, UINT64_C(0xffffffffffffffff)); + fiat_p256_addcarryx_u64(&x22, &x23, 0x0, x21, x18); + fiat_p256_addcarryx_u64(&x24, &x25, 0x0, x14, x20); + fiat_p256_addcarryx_u64(&x26, &x27, x25, (x15 + (x13 + (x9 + x5))), x22); + fiat_p256_addcarryx_u64(&x28, &x29, x27, x2, (x23 + x19)); + fiat_p256_addcarryx_u64(&x30, &x31, x29, x3, x16); + fiat_p256_addcarryx_u64(&x32, &x33, 0x0, x26, (arg1[2])); + fiat_p256_addcarryx_u64(&x34, &x35, x33, x28, 0x0); + fiat_p256_addcarryx_u64(&x36, &x37, x35, x30, 0x0); + fiat_p256_mulx_u64(&x38, &x39, x32, UINT64_C(0xffffffff00000001)); + fiat_p256_mulx_u64(&x40, &x41, x32, UINT32_C(0xffffffff)); + fiat_p256_mulx_u64(&x42, &x43, x32, UINT64_C(0xffffffffffffffff)); + fiat_p256_addcarryx_u64(&x44, &x45, 0x0, x43, x40); + fiat_p256_addcarryx_u64(&x46, &x47, 0x0, x32, x42); + fiat_p256_addcarryx_u64(&x48, &x49, x47, x34, x44); + fiat_p256_addcarryx_u64(&x50, &x51, x49, x36, (x45 + x41)); + fiat_p256_addcarryx_u64(&x52, &x53, x51, (x37 + (x31 + x17)), x38); + fiat_p256_addcarryx_u64(&x54, &x55, 0x0, x48, (arg1[3])); + fiat_p256_addcarryx_u64(&x56, &x57, x55, x50, 0x0); + fiat_p256_addcarryx_u64(&x58, &x59, x57, x52, 0x0); + fiat_p256_mulx_u64(&x60, &x61, x54, UINT64_C(0xffffffff00000001)); + fiat_p256_mulx_u64(&x62, &x63, x54, UINT32_C(0xffffffff)); + fiat_p256_mulx_u64(&x64, &x65, x54, UINT64_C(0xffffffffffffffff)); + fiat_p256_addcarryx_u64(&x66, &x67, 0x0, x65, x62); + fiat_p256_addcarryx_u64(&x68, &x69, 0x0, x54, x64); + fiat_p256_addcarryx_u64(&x70, &x71, x69, x56, x66); + fiat_p256_addcarryx_u64(&x72, &x73, x71, x58, (x67 + x63)); + fiat_p256_addcarryx_u64(&x74, &x75, x73, (x59 + (x53 + x39)), x60); + x76 = (x75 + x61); + fiat_p256_subborrowx_u64(&x77, &x78, 0x0, x70, UINT64_C(0xffffffffffffffff)); + fiat_p256_subborrowx_u64(&x79, &x80, x78, x72, UINT32_C(0xffffffff)); + fiat_p256_subborrowx_u64(&x81, &x82, x80, x74, 0x0); + fiat_p256_subborrowx_u64(&x83, &x84, x82, x76, UINT64_C(0xffffffff00000001)); + fiat_p256_subborrowx_u64(&x85, &x86, x84, 0x0, 0x0); + fiat_p256_cmovznz_u64(&x87, x86, x77, x70); + fiat_p256_cmovznz_u64(&x88, x86, x79, x72); + fiat_p256_cmovznz_u64(&x89, x86, x81, x74); + fiat_p256_cmovznz_u64(&x90, x86, x83, x76); + out1[0] = x87; + out1[1] = x88; + out1[2] = x89; + out1[3] = x90; +} + +/* + * The function fiat_p256_to_montgomery translates a field element into the Montgomery domain. 
+ * + * Preconditions: + * 0 ≤ eval arg1 < m + * Postconditions: + * eval (from_montgomery out1) mod m = eval arg1 mod m + * 0 ≤ eval out1 < m + * + */ +static FIAT_P256_FIAT_INLINE void fiat_p256_to_montgomery(fiat_p256_montgomery_domain_field_element out1, const fiat_p256_non_montgomery_domain_field_element arg1) { + uint64_t x1; + uint64_t x2; + uint64_t x3; + uint64_t x4; + uint64_t x5; + uint64_t x6; + uint64_t x7; + uint64_t x8; + uint64_t x9; + uint64_t x10; + uint64_t x11; + uint64_t x12; + uint64_t x13; + fiat_p256_uint1 x14; + uint64_t x15; + fiat_p256_uint1 x16; + uint64_t x17; + fiat_p256_uint1 x18; + uint64_t x19; + uint64_t x20; + uint64_t x21; + uint64_t x22; + uint64_t x23; + uint64_t x24; + uint64_t x25; + fiat_p256_uint1 x26; + uint64_t x27; + fiat_p256_uint1 x28; + uint64_t x29; + fiat_p256_uint1 x30; + uint64_t x31; + fiat_p256_uint1 x32; + uint64_t x33; + fiat_p256_uint1 x34; + uint64_t x35; + fiat_p256_uint1 x36; + uint64_t x37; + uint64_t x38; + uint64_t x39; + uint64_t x40; + uint64_t x41; + uint64_t x42; + uint64_t x43; + uint64_t x44; + uint64_t x45; + fiat_p256_uint1 x46; + uint64_t x47; + fiat_p256_uint1 x48; + uint64_t x49; + fiat_p256_uint1 x50; + uint64_t x51; + fiat_p256_uint1 x52; + uint64_t x53; + fiat_p256_uint1 x54; + uint64_t x55; + fiat_p256_uint1 x56; + uint64_t x57; + fiat_p256_uint1 x58; + uint64_t x59; + uint64_t x60; + uint64_t x61; + uint64_t x62; + uint64_t x63; + uint64_t x64; + uint64_t x65; + fiat_p256_uint1 x66; + uint64_t x67; + fiat_p256_uint1 x68; + uint64_t x69; + fiat_p256_uint1 x70; + uint64_t x71; + fiat_p256_uint1 x72; + uint64_t x73; + fiat_p256_uint1 x74; + uint64_t x75; + fiat_p256_uint1 x76; + uint64_t x77; + uint64_t x78; + uint64_t x79; + uint64_t x80; + uint64_t x81; + uint64_t x82; + uint64_t x83; + uint64_t x84; + uint64_t x85; + fiat_p256_uint1 x86; + uint64_t x87; + fiat_p256_uint1 x88; + uint64_t x89; + fiat_p256_uint1 x90; + uint64_t x91; + fiat_p256_uint1 x92; + uint64_t x93; + fiat_p256_uint1 x94; + uint64_t x95; + fiat_p256_uint1 x96; + uint64_t x97; + fiat_p256_uint1 x98; + uint64_t x99; + uint64_t x100; + uint64_t x101; + uint64_t x102; + uint64_t x103; + uint64_t x104; + uint64_t x105; + fiat_p256_uint1 x106; + uint64_t x107; + fiat_p256_uint1 x108; + uint64_t x109; + fiat_p256_uint1 x110; + uint64_t x111; + fiat_p256_uint1 x112; + uint64_t x113; + fiat_p256_uint1 x114; + uint64_t x115; + fiat_p256_uint1 x116; + uint64_t x117; + uint64_t x118; + uint64_t x119; + uint64_t x120; + uint64_t x121; + uint64_t x122; + uint64_t x123; + uint64_t x124; + uint64_t x125; + fiat_p256_uint1 x126; + uint64_t x127; + fiat_p256_uint1 x128; + uint64_t x129; + fiat_p256_uint1 x130; + uint64_t x131; + fiat_p256_uint1 x132; + uint64_t x133; + fiat_p256_uint1 x134; + uint64_t x135; + fiat_p256_uint1 x136; + uint64_t x137; + fiat_p256_uint1 x138; + uint64_t x139; + uint64_t x140; + uint64_t x141; + uint64_t x142; + uint64_t x143; + uint64_t x144; + uint64_t x145; + fiat_p256_uint1 x146; + uint64_t x147; + fiat_p256_uint1 x148; + uint64_t x149; + fiat_p256_uint1 x150; + uint64_t x151; + fiat_p256_uint1 x152; + uint64_t x153; + fiat_p256_uint1 x154; + uint64_t x155; + fiat_p256_uint1 x156; + uint64_t x157; + fiat_p256_uint1 x158; + uint64_t x159; + fiat_p256_uint1 x160; + uint64_t x161; + fiat_p256_uint1 x162; + uint64_t x163; + fiat_p256_uint1 x164; + uint64_t x165; + fiat_p256_uint1 x166; + uint64_t x167; + uint64_t x168; + uint64_t x169; + uint64_t x170; + x1 = (arg1[1]); + x2 = (arg1[2]); + x3 = (arg1[3]); + x4 = (arg1[0]); + 
fiat_p256_mulx_u64(&x5, &x6, x4, UINT64_C(0x4fffffffd)); + fiat_p256_mulx_u64(&x7, &x8, x4, UINT64_C(0xfffffffffffffffe)); + fiat_p256_mulx_u64(&x9, &x10, x4, UINT64_C(0xfffffffbffffffff)); + fiat_p256_mulx_u64(&x11, &x12, x4, 0x3); + fiat_p256_addcarryx_u64(&x13, &x14, 0x0, x12, x9); + fiat_p256_addcarryx_u64(&x15, &x16, x14, x10, x7); + fiat_p256_addcarryx_u64(&x17, &x18, x16, x8, x5); + fiat_p256_mulx_u64(&x19, &x20, x11, UINT64_C(0xffffffff00000001)); + fiat_p256_mulx_u64(&x21, &x22, x11, UINT32_C(0xffffffff)); + fiat_p256_mulx_u64(&x23, &x24, x11, UINT64_C(0xffffffffffffffff)); + fiat_p256_addcarryx_u64(&x25, &x26, 0x0, x24, x21); + fiat_p256_addcarryx_u64(&x27, &x28, 0x0, x11, x23); + fiat_p256_addcarryx_u64(&x29, &x30, x28, x13, x25); + fiat_p256_addcarryx_u64(&x31, &x32, x30, x15, (x26 + x22)); + fiat_p256_addcarryx_u64(&x33, &x34, x32, x17, x19); + fiat_p256_addcarryx_u64(&x35, &x36, x34, (x18 + x6), x20); + fiat_p256_mulx_u64(&x37, &x38, x1, UINT64_C(0x4fffffffd)); + fiat_p256_mulx_u64(&x39, &x40, x1, UINT64_C(0xfffffffffffffffe)); + fiat_p256_mulx_u64(&x41, &x42, x1, UINT64_C(0xfffffffbffffffff)); + fiat_p256_mulx_u64(&x43, &x44, x1, 0x3); + fiat_p256_addcarryx_u64(&x45, &x46, 0x0, x44, x41); + fiat_p256_addcarryx_u64(&x47, &x48, x46, x42, x39); + fiat_p256_addcarryx_u64(&x49, &x50, x48, x40, x37); + fiat_p256_addcarryx_u64(&x51, &x52, 0x0, x29, x43); + fiat_p256_addcarryx_u64(&x53, &x54, x52, x31, x45); + fiat_p256_addcarryx_u64(&x55, &x56, x54, x33, x47); + fiat_p256_addcarryx_u64(&x57, &x58, x56, x35, x49); + fiat_p256_mulx_u64(&x59, &x60, x51, UINT64_C(0xffffffff00000001)); + fiat_p256_mulx_u64(&x61, &x62, x51, UINT32_C(0xffffffff)); + fiat_p256_mulx_u64(&x63, &x64, x51, UINT64_C(0xffffffffffffffff)); + fiat_p256_addcarryx_u64(&x65, &x66, 0x0, x64, x61); + fiat_p256_addcarryx_u64(&x67, &x68, 0x0, x51, x63); + fiat_p256_addcarryx_u64(&x69, &x70, x68, x53, x65); + fiat_p256_addcarryx_u64(&x71, &x72, x70, x55, (x66 + x62)); + fiat_p256_addcarryx_u64(&x73, &x74, x72, x57, x59); + fiat_p256_addcarryx_u64(&x75, &x76, x74, (((uint64_t)x58 + x36) + (x50 + x38)), x60); + fiat_p256_mulx_u64(&x77, &x78, x2, UINT64_C(0x4fffffffd)); + fiat_p256_mulx_u64(&x79, &x80, x2, UINT64_C(0xfffffffffffffffe)); + fiat_p256_mulx_u64(&x81, &x82, x2, UINT64_C(0xfffffffbffffffff)); + fiat_p256_mulx_u64(&x83, &x84, x2, 0x3); + fiat_p256_addcarryx_u64(&x85, &x86, 0x0, x84, x81); + fiat_p256_addcarryx_u64(&x87, &x88, x86, x82, x79); + fiat_p256_addcarryx_u64(&x89, &x90, x88, x80, x77); + fiat_p256_addcarryx_u64(&x91, &x92, 0x0, x69, x83); + fiat_p256_addcarryx_u64(&x93, &x94, x92, x71, x85); + fiat_p256_addcarryx_u64(&x95, &x96, x94, x73, x87); + fiat_p256_addcarryx_u64(&x97, &x98, x96, x75, x89); + fiat_p256_mulx_u64(&x99, &x100, x91, UINT64_C(0xffffffff00000001)); + fiat_p256_mulx_u64(&x101, &x102, x91, UINT32_C(0xffffffff)); + fiat_p256_mulx_u64(&x103, &x104, x91, UINT64_C(0xffffffffffffffff)); + fiat_p256_addcarryx_u64(&x105, &x106, 0x0, x104, x101); + fiat_p256_addcarryx_u64(&x107, &x108, 0x0, x91, x103); + fiat_p256_addcarryx_u64(&x109, &x110, x108, x93, x105); + fiat_p256_addcarryx_u64(&x111, &x112, x110, x95, (x106 + x102)); + fiat_p256_addcarryx_u64(&x113, &x114, x112, x97, x99); + fiat_p256_addcarryx_u64(&x115, &x116, x114, (((uint64_t)x98 + x76) + (x90 + x78)), x100); + fiat_p256_mulx_u64(&x117, &x118, x3, UINT64_C(0x4fffffffd)); + fiat_p256_mulx_u64(&x119, &x120, x3, UINT64_C(0xfffffffffffffffe)); + fiat_p256_mulx_u64(&x121, &x122, x3, UINT64_C(0xfffffffbffffffff)); + fiat_p256_mulx_u64(&x123, 
&x124, x3, 0x3); + fiat_p256_addcarryx_u64(&x125, &x126, 0x0, x124, x121); + fiat_p256_addcarryx_u64(&x127, &x128, x126, x122, x119); + fiat_p256_addcarryx_u64(&x129, &x130, x128, x120, x117); + fiat_p256_addcarryx_u64(&x131, &x132, 0x0, x109, x123); + fiat_p256_addcarryx_u64(&x133, &x134, x132, x111, x125); + fiat_p256_addcarryx_u64(&x135, &x136, x134, x113, x127); + fiat_p256_addcarryx_u64(&x137, &x138, x136, x115, x129); + fiat_p256_mulx_u64(&x139, &x140, x131, UINT64_C(0xffffffff00000001)); + fiat_p256_mulx_u64(&x141, &x142, x131, UINT32_C(0xffffffff)); + fiat_p256_mulx_u64(&x143, &x144, x131, UINT64_C(0xffffffffffffffff)); + fiat_p256_addcarryx_u64(&x145, &x146, 0x0, x144, x141); + fiat_p256_addcarryx_u64(&x147, &x148, 0x0, x131, x143); + fiat_p256_addcarryx_u64(&x149, &x150, x148, x133, x145); + fiat_p256_addcarryx_u64(&x151, &x152, x150, x135, (x146 + x142)); + fiat_p256_addcarryx_u64(&x153, &x154, x152, x137, x139); + fiat_p256_addcarryx_u64(&x155, &x156, x154, (((uint64_t)x138 + x116) + (x130 + x118)), x140); + fiat_p256_subborrowx_u64(&x157, &x158, 0x0, x149, UINT64_C(0xffffffffffffffff)); + fiat_p256_subborrowx_u64(&x159, &x160, x158, x151, UINT32_C(0xffffffff)); + fiat_p256_subborrowx_u64(&x161, &x162, x160, x153, 0x0); + fiat_p256_subborrowx_u64(&x163, &x164, x162, x155, UINT64_C(0xffffffff00000001)); + fiat_p256_subborrowx_u64(&x165, &x166, x164, x156, 0x0); + fiat_p256_cmovznz_u64(&x167, x166, x157, x149); + fiat_p256_cmovznz_u64(&x168, x166, x159, x151); + fiat_p256_cmovznz_u64(&x169, x166, x161, x153); + fiat_p256_cmovznz_u64(&x170, x166, x163, x155); + out1[0] = x167; + out1[1] = x168; + out1[2] = x169; + out1[3] = x170; +} + +/* + * The function fiat_p256_nonzero outputs a single non-zero word if the input is non-zero and zero otherwise. + * + * Preconditions: + * 0 ≤ eval arg1 < m + * Postconditions: + * out1 = 0 ↔ eval (from_montgomery arg1) mod m = 0 + * + * Input Bounds: + * arg1: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] + * Output Bounds: + * out1: [0x0 ~> 0xffffffffffffffff] + */ +static FIAT_P256_FIAT_INLINE void fiat_p256_nonzero(uint64_t* out1, const uint64_t arg1[4]) { + uint64_t x1; + x1 = ((arg1[0]) | ((arg1[1]) | ((arg1[2]) | (arg1[3])))); + *out1 = x1; +} + +/* + * The function fiat_p256_selectznz is a multi-limb conditional select. + * + * Postconditions: + * out1 = (if arg1 = 0 then arg2 else arg3) + * + * Input Bounds: + * arg1: [0x0 ~> 0x1] + * arg2: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] + * arg3: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] + * Output Bounds: + * out1: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] + */ +static FIAT_P256_FIAT_INLINE void fiat_p256_selectznz(uint64_t out1[4], fiat_p256_uint1 arg1, const uint64_t arg2[4], const uint64_t arg3[4]) { + uint64_t x1; + uint64_t x2; + uint64_t x3; + uint64_t x4; + fiat_p256_cmovznz_u64(&x1, arg1, (arg2[0]), (arg3[0])); + fiat_p256_cmovznz_u64(&x2, arg1, (arg2[1]), (arg3[1])); + fiat_p256_cmovznz_u64(&x3, arg1, (arg2[2]), (arg3[2])); + fiat_p256_cmovznz_u64(&x4, arg1, (arg2[3]), (arg3[3])); + out1[0] = x1; + out1[1] = x2; + out1[2] = x3; + out1[3] = x4; +} + +/* + * The function fiat_p256_to_bytes serializes a field element NOT in the Montgomery domain to bytes in little-endian order. 
+ * + * Preconditions: + * 0 ≤ eval arg1 < m + * Postconditions: + * out1 = map (λ x, ⌊((eval arg1 mod m) mod 2^(8 * (x + 1))) / 2^(8 * x)⌋) [0..31] + * + * Input Bounds: + * arg1: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] + * Output Bounds: + * out1: [[0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff]] + */ +static FIAT_P256_FIAT_INLINE void fiat_p256_to_bytes(uint8_t out1[32], const uint64_t arg1[4]) { + uint64_t x1; + uint64_t x2; + uint64_t x3; + uint64_t x4; + uint8_t x5; + uint64_t x6; + uint8_t x7; + uint64_t x8; + uint8_t x9; + uint64_t x10; + uint8_t x11; + uint64_t x12; + uint8_t x13; + uint64_t x14; + uint8_t x15; + uint64_t x16; + uint8_t x17; + uint8_t x18; + uint8_t x19; + uint64_t x20; + uint8_t x21; + uint64_t x22; + uint8_t x23; + uint64_t x24; + uint8_t x25; + uint64_t x26; + uint8_t x27; + uint64_t x28; + uint8_t x29; + uint64_t x30; + uint8_t x31; + uint8_t x32; + uint8_t x33; + uint64_t x34; + uint8_t x35; + uint64_t x36; + uint8_t x37; + uint64_t x38; + uint8_t x39; + uint64_t x40; + uint8_t x41; + uint64_t x42; + uint8_t x43; + uint64_t x44; + uint8_t x45; + uint8_t x46; + uint8_t x47; + uint64_t x48; + uint8_t x49; + uint64_t x50; + uint8_t x51; + uint64_t x52; + uint8_t x53; + uint64_t x54; + uint8_t x55; + uint64_t x56; + uint8_t x57; + uint64_t x58; + uint8_t x59; + uint8_t x60; + x1 = (arg1[3]); + x2 = (arg1[2]); + x3 = (arg1[1]); + x4 = (arg1[0]); + x5 = (uint8_t)(x4 & UINT8_C(0xff)); + x6 = (x4 >> 8); + x7 = (uint8_t)(x6 & UINT8_C(0xff)); + x8 = (x6 >> 8); + x9 = (uint8_t)(x8 & UINT8_C(0xff)); + x10 = (x8 >> 8); + x11 = (uint8_t)(x10 & UINT8_C(0xff)); + x12 = (x10 >> 8); + x13 = (uint8_t)(x12 & UINT8_C(0xff)); + x14 = (x12 >> 8); + x15 = (uint8_t)(x14 & UINT8_C(0xff)); + x16 = (x14 >> 8); + x17 = (uint8_t)(x16 & UINT8_C(0xff)); + x18 = (uint8_t)(x16 >> 8); + x19 = (uint8_t)(x3 & UINT8_C(0xff)); + x20 = (x3 >> 8); + x21 = (uint8_t)(x20 & UINT8_C(0xff)); + x22 = (x20 >> 8); + x23 = (uint8_t)(x22 & UINT8_C(0xff)); + x24 = (x22 >> 8); + x25 = (uint8_t)(x24 & UINT8_C(0xff)); + x26 = (x24 >> 8); + x27 = (uint8_t)(x26 & UINT8_C(0xff)); + x28 = (x26 >> 8); + x29 = (uint8_t)(x28 & UINT8_C(0xff)); + x30 = (x28 >> 8); + x31 = (uint8_t)(x30 & UINT8_C(0xff)); + x32 = (uint8_t)(x30 >> 8); + x33 = (uint8_t)(x2 & UINT8_C(0xff)); + x34 = (x2 >> 8); + x35 = (uint8_t)(x34 & UINT8_C(0xff)); + x36 = (x34 >> 8); + x37 = (uint8_t)(x36 & UINT8_C(0xff)); + x38 = (x36 >> 8); + x39 = (uint8_t)(x38 & UINT8_C(0xff)); + x40 = (x38 >> 8); + x41 = (uint8_t)(x40 & UINT8_C(0xff)); + x42 = (x40 >> 8); + x43 = (uint8_t)(x42 & UINT8_C(0xff)); + x44 = (x42 >> 8); + x45 = (uint8_t)(x44 & UINT8_C(0xff)); + x46 = (uint8_t)(x44 >> 8); + x47 = (uint8_t)(x1 & UINT8_C(0xff)); + x48 = (x1 >> 8); + x49 = (uint8_t)(x48 & UINT8_C(0xff)); + x50 = (x48 >> 8); + x51 = (uint8_t)(x50 & UINT8_C(0xff)); + x52 = (x50 >> 8); + x53 = (uint8_t)(x52 & UINT8_C(0xff)); + x54 = (x52 >> 8); + x55 = (uint8_t)(x54 & UINT8_C(0xff)); + x56 = (x54 >> 8); + x57 = (uint8_t)(x56 & UINT8_C(0xff)); + x58 = (x56 >> 8); + x59 = (uint8_t)(x58 
& UINT8_C(0xff)); + x60 = (uint8_t)(x58 >> 8); + out1[0] = x5; + out1[1] = x7; + out1[2] = x9; + out1[3] = x11; + out1[4] = x13; + out1[5] = x15; + out1[6] = x17; + out1[7] = x18; + out1[8] = x19; + out1[9] = x21; + out1[10] = x23; + out1[11] = x25; + out1[12] = x27; + out1[13] = x29; + out1[14] = x31; + out1[15] = x32; + out1[16] = x33; + out1[17] = x35; + out1[18] = x37; + out1[19] = x39; + out1[20] = x41; + out1[21] = x43; + out1[22] = x45; + out1[23] = x46; + out1[24] = x47; + out1[25] = x49; + out1[26] = x51; + out1[27] = x53; + out1[28] = x55; + out1[29] = x57; + out1[30] = x59; + out1[31] = x60; +} + +/* + * The function fiat_p256_from_bytes deserializes a field element NOT in the Montgomery domain from bytes in little-endian order. + * + * Preconditions: + * 0 ≤ bytes_eval arg1 < m + * Postconditions: + * eval out1 mod m = bytes_eval arg1 mod m + * 0 ≤ eval out1 < m + * + * Input Bounds: + * arg1: [[0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff]] + * Output Bounds: + * out1: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] + */ +static FIAT_P256_FIAT_INLINE void fiat_p256_from_bytes(uint64_t out1[4], const uint8_t arg1[32]) { + uint64_t x1; + uint64_t x2; + uint64_t x3; + uint64_t x4; + uint64_t x5; + uint64_t x6; + uint64_t x7; + uint8_t x8; + uint64_t x9; + uint64_t x10; + uint64_t x11; + uint64_t x12; + uint64_t x13; + uint64_t x14; + uint64_t x15; + uint8_t x16; + uint64_t x17; + uint64_t x18; + uint64_t x19; + uint64_t x20; + uint64_t x21; + uint64_t x22; + uint64_t x23; + uint8_t x24; + uint64_t x25; + uint64_t x26; + uint64_t x27; + uint64_t x28; + uint64_t x29; + uint64_t x30; + uint64_t x31; + uint8_t x32; + uint64_t x33; + uint64_t x34; + uint64_t x35; + uint64_t x36; + uint64_t x37; + uint64_t x38; + uint64_t x39; + uint64_t x40; + uint64_t x41; + uint64_t x42; + uint64_t x43; + uint64_t x44; + uint64_t x45; + uint64_t x46; + uint64_t x47; + uint64_t x48; + uint64_t x49; + uint64_t x50; + uint64_t x51; + uint64_t x52; + uint64_t x53; + uint64_t x54; + uint64_t x55; + uint64_t x56; + uint64_t x57; + uint64_t x58; + uint64_t x59; + uint64_t x60; + x1 = ((uint64_t)(arg1[31]) << 56); + x2 = ((uint64_t)(arg1[30]) << 48); + x3 = ((uint64_t)(arg1[29]) << 40); + x4 = ((uint64_t)(arg1[28]) << 32); + x5 = ((uint64_t)(arg1[27]) << 24); + x6 = ((uint64_t)(arg1[26]) << 16); + x7 = ((uint64_t)(arg1[25]) << 8); + x8 = (arg1[24]); + x9 = ((uint64_t)(arg1[23]) << 56); + x10 = ((uint64_t)(arg1[22]) << 48); + x11 = ((uint64_t)(arg1[21]) << 40); + x12 = ((uint64_t)(arg1[20]) << 32); + x13 = ((uint64_t)(arg1[19]) << 24); + x14 = ((uint64_t)(arg1[18]) << 16); + x15 = ((uint64_t)(arg1[17]) << 8); + x16 = (arg1[16]); + x17 = ((uint64_t)(arg1[15]) << 56); + x18 = ((uint64_t)(arg1[14]) << 48); + x19 = ((uint64_t)(arg1[13]) << 40); + x20 = ((uint64_t)(arg1[12]) << 32); + x21 = ((uint64_t)(arg1[11]) << 24); + x22 = ((uint64_t)(arg1[10]) << 16); + x23 = ((uint64_t)(arg1[9]) << 8); + x24 = (arg1[8]); + x25 = ((uint64_t)(arg1[7]) << 56); + x26 = ((uint64_t)(arg1[6]) << 48); + x27 = ((uint64_t)(arg1[5]) << 40); + 
x28 = ((uint64_t)(arg1[4]) << 32); + x29 = ((uint64_t)(arg1[3]) << 24); + x30 = ((uint64_t)(arg1[2]) << 16); + x31 = ((uint64_t)(arg1[1]) << 8); + x32 = (arg1[0]); + x33 = (x31 + (uint64_t)x32); + x34 = (x30 + x33); + x35 = (x29 + x34); + x36 = (x28 + x35); + x37 = (x27 + x36); + x38 = (x26 + x37); + x39 = (x25 + x38); + x40 = (x23 + (uint64_t)x24); + x41 = (x22 + x40); + x42 = (x21 + x41); + x43 = (x20 + x42); + x44 = (x19 + x43); + x45 = (x18 + x44); + x46 = (x17 + x45); + x47 = (x15 + (uint64_t)x16); + x48 = (x14 + x47); + x49 = (x13 + x48); + x50 = (x12 + x49); + x51 = (x11 + x50); + x52 = (x10 + x51); + x53 = (x9 + x52); + x54 = (x7 + (uint64_t)x8); + x55 = (x6 + x54); + x56 = (x5 + x55); + x57 = (x4 + x56); + x58 = (x3 + x57); + x59 = (x2 + x58); + x60 = (x1 + x59); + out1[0] = x39; + out1[1] = x46; + out1[2] = x53; + out1[3] = x60; +} + +/* + * The function fiat_p256_set_one returns the field element one in the Montgomery domain. + * + * Postconditions: + * eval (from_montgomery out1) mod m = 1 mod m + * 0 ≤ eval out1 < m + * + */ +static FIAT_P256_FIAT_INLINE void fiat_p256_set_one(fiat_p256_montgomery_domain_field_element out1) { + out1[0] = 0x1; + out1[1] = UINT64_C(0xffffffff00000000); + out1[2] = UINT64_C(0xffffffffffffffff); + out1[3] = UINT32_C(0xfffffffe); +} + +/* + * The function fiat_p256_msat returns the saturated representation of the prime modulus. + * + * Postconditions: + * twos_complement_eval out1 = m + * 0 ≤ eval out1 < m + * + * Output Bounds: + * out1: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] + */ +static FIAT_P256_FIAT_INLINE void fiat_p256_msat(uint64_t out1[5]) { + out1[0] = UINT64_C(0xffffffffffffffff); + out1[1] = UINT32_C(0xffffffff); + out1[2] = 0x0; + out1[3] = UINT64_C(0xffffffff00000001); + out1[4] = 0x0; +} + +/* + * The function fiat_p256_divstep computes a divstep. 
+ * + * Preconditions: + * 0 ≤ eval arg4 < m + * 0 ≤ eval arg5 < m + * Postconditions: + * out1 = (if 0 < arg1 ∧ (twos_complement_eval arg3) is odd then 1 - arg1 else 1 + arg1) + * twos_complement_eval out2 = (if 0 < arg1 ∧ (twos_complement_eval arg3) is odd then twos_complement_eval arg3 else twos_complement_eval arg2) + * twos_complement_eval out3 = (if 0 < arg1 ∧ (twos_complement_eval arg3) is odd then ⌊(twos_complement_eval arg3 - twos_complement_eval arg2) / 2⌋ else ⌊(twos_complement_eval arg3 + (twos_complement_eval arg3 mod 2) * twos_complement_eval arg2) / 2⌋) + * eval (from_montgomery out4) mod m = (if 0 < arg1 ∧ (twos_complement_eval arg3) is odd then (2 * eval (from_montgomery arg5)) mod m else (2 * eval (from_montgomery arg4)) mod m) + * eval (from_montgomery out5) mod m = (if 0 < arg1 ∧ (twos_complement_eval arg3) is odd then (eval (from_montgomery arg4) - eval (from_montgomery arg4)) mod m else (eval (from_montgomery arg5) + (twos_complement_eval arg3 mod 2) * eval (from_montgomery arg4)) mod m) + * 0 ≤ eval out5 < m + * 0 ≤ eval out5 < m + * 0 ≤ eval out2 < m + * 0 ≤ eval out3 < m + * + * Input Bounds: + * arg1: [0x0 ~> 0xffffffffffffffff] + * arg2: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] + * arg3: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] + * arg4: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] + * arg5: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] + * Output Bounds: + * out1: [0x0 ~> 0xffffffffffffffff] + * out2: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] + * out3: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] + * out4: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] + * out5: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] + */ +static FIAT_P256_FIAT_INLINE void fiat_p256_divstep(uint64_t* out1, uint64_t out2[5], uint64_t out3[5], uint64_t out4[4], uint64_t out5[4], uint64_t arg1, const uint64_t arg2[5], const uint64_t arg3[5], const uint64_t arg4[4], const uint64_t arg5[4]) { + uint64_t x1; + fiat_p256_uint1 x2; + fiat_p256_uint1 x3; + uint64_t x4; + fiat_p256_uint1 x5; + uint64_t x6; + uint64_t x7; + uint64_t x8; + uint64_t x9; + uint64_t x10; + uint64_t x11; + uint64_t x12; + fiat_p256_uint1 x13; + uint64_t x14; + fiat_p256_uint1 x15; + uint64_t x16; + fiat_p256_uint1 x17; + uint64_t x18; + fiat_p256_uint1 x19; + uint64_t x20; + fiat_p256_uint1 x21; + uint64_t x22; + uint64_t x23; + uint64_t x24; + uint64_t x25; + uint64_t x26; + uint64_t x27; + uint64_t x28; + uint64_t x29; + uint64_t x30; + uint64_t x31; + fiat_p256_uint1 x32; + uint64_t x33; + fiat_p256_uint1 x34; + uint64_t x35; + fiat_p256_uint1 x36; + uint64_t x37; + fiat_p256_uint1 x38; + uint64_t x39; + fiat_p256_uint1 x40; + uint64_t x41; + fiat_p256_uint1 x42; + uint64_t x43; + fiat_p256_uint1 x44; + uint64_t x45; + fiat_p256_uint1 x46; + uint64_t x47; + fiat_p256_uint1 x48; + uint64_t x49; + uint64_t x50; + uint64_t x51; + 
uint64_t x52; + uint64_t x53; + fiat_p256_uint1 x54; + uint64_t x55; + fiat_p256_uint1 x56; + uint64_t x57; + fiat_p256_uint1 x58; + uint64_t x59; + fiat_p256_uint1 x60; + uint64_t x61; + uint64_t x62; + fiat_p256_uint1 x63; + uint64_t x64; + fiat_p256_uint1 x65; + uint64_t x66; + fiat_p256_uint1 x67; + uint64_t x68; + fiat_p256_uint1 x69; + uint64_t x70; + uint64_t x71; + uint64_t x72; + uint64_t x73; + fiat_p256_uint1 x74; + uint64_t x75; + uint64_t x76; + uint64_t x77; + uint64_t x78; + uint64_t x79; + uint64_t x80; + fiat_p256_uint1 x81; + uint64_t x82; + fiat_p256_uint1 x83; + uint64_t x84; + fiat_p256_uint1 x85; + uint64_t x86; + fiat_p256_uint1 x87; + uint64_t x88; + fiat_p256_uint1 x89; + uint64_t x90; + uint64_t x91; + uint64_t x92; + uint64_t x93; + uint64_t x94; + fiat_p256_uint1 x95; + uint64_t x96; + fiat_p256_uint1 x97; + uint64_t x98; + fiat_p256_uint1 x99; + uint64_t x100; + fiat_p256_uint1 x101; + uint64_t x102; + fiat_p256_uint1 x103; + uint64_t x104; + fiat_p256_uint1 x105; + uint64_t x106; + fiat_p256_uint1 x107; + uint64_t x108; + fiat_p256_uint1 x109; + uint64_t x110; + fiat_p256_uint1 x111; + uint64_t x112; + fiat_p256_uint1 x113; + uint64_t x114; + uint64_t x115; + uint64_t x116; + uint64_t x117; + uint64_t x118; + uint64_t x119; + uint64_t x120; + uint64_t x121; + uint64_t x122; + uint64_t x123; + uint64_t x124; + uint64_t x125; + uint64_t x126; + fiat_p256_addcarryx_u64(&x1, &x2, 0x0, (~arg1), 0x1); + x3 = (fiat_p256_uint1)((fiat_p256_uint1)(x1 >> 63) & (fiat_p256_uint1)((arg3[0]) & 0x1)); + fiat_p256_addcarryx_u64(&x4, &x5, 0x0, (~arg1), 0x1); + fiat_p256_cmovznz_u64(&x6, x3, arg1, x4); + fiat_p256_cmovznz_u64(&x7, x3, (arg2[0]), (arg3[0])); + fiat_p256_cmovznz_u64(&x8, x3, (arg2[1]), (arg3[1])); + fiat_p256_cmovznz_u64(&x9, x3, (arg2[2]), (arg3[2])); + fiat_p256_cmovznz_u64(&x10, x3, (arg2[3]), (arg3[3])); + fiat_p256_cmovznz_u64(&x11, x3, (arg2[4]), (arg3[4])); + fiat_p256_addcarryx_u64(&x12, &x13, 0x0, 0x1, (~(arg2[0]))); + fiat_p256_addcarryx_u64(&x14, &x15, x13, 0x0, (~(arg2[1]))); + fiat_p256_addcarryx_u64(&x16, &x17, x15, 0x0, (~(arg2[2]))); + fiat_p256_addcarryx_u64(&x18, &x19, x17, 0x0, (~(arg2[3]))); + fiat_p256_addcarryx_u64(&x20, &x21, x19, 0x0, (~(arg2[4]))); + fiat_p256_cmovznz_u64(&x22, x3, (arg3[0]), x12); + fiat_p256_cmovznz_u64(&x23, x3, (arg3[1]), x14); + fiat_p256_cmovznz_u64(&x24, x3, (arg3[2]), x16); + fiat_p256_cmovznz_u64(&x25, x3, (arg3[3]), x18); + fiat_p256_cmovznz_u64(&x26, x3, (arg3[4]), x20); + fiat_p256_cmovznz_u64(&x27, x3, (arg4[0]), (arg5[0])); + fiat_p256_cmovznz_u64(&x28, x3, (arg4[1]), (arg5[1])); + fiat_p256_cmovznz_u64(&x29, x3, (arg4[2]), (arg5[2])); + fiat_p256_cmovznz_u64(&x30, x3, (arg4[3]), (arg5[3])); + fiat_p256_addcarryx_u64(&x31, &x32, 0x0, x27, x27); + fiat_p256_addcarryx_u64(&x33, &x34, x32, x28, x28); + fiat_p256_addcarryx_u64(&x35, &x36, x34, x29, x29); + fiat_p256_addcarryx_u64(&x37, &x38, x36, x30, x30); + fiat_p256_subborrowx_u64(&x39, &x40, 0x0, x31, UINT64_C(0xffffffffffffffff)); + fiat_p256_subborrowx_u64(&x41, &x42, x40, x33, UINT32_C(0xffffffff)); + fiat_p256_subborrowx_u64(&x43, &x44, x42, x35, 0x0); + fiat_p256_subborrowx_u64(&x45, &x46, x44, x37, UINT64_C(0xffffffff00000001)); + fiat_p256_subborrowx_u64(&x47, &x48, x46, x38, 0x0); + x49 = (arg4[3]); + x50 = (arg4[2]); + x51 = (arg4[1]); + x52 = (arg4[0]); + fiat_p256_subborrowx_u64(&x53, &x54, 0x0, 0x0, x52); + fiat_p256_subborrowx_u64(&x55, &x56, x54, 0x0, x51); + fiat_p256_subborrowx_u64(&x57, &x58, x56, 0x0, x50); + fiat_p256_subborrowx_u64(&x59, 
&x60, x58, 0x0, x49); + fiat_p256_cmovznz_u64(&x61, x60, 0x0, UINT64_C(0xffffffffffffffff)); + fiat_p256_addcarryx_u64(&x62, &x63, 0x0, x53, x61); + fiat_p256_addcarryx_u64(&x64, &x65, x63, x55, (x61 & UINT32_C(0xffffffff))); + fiat_p256_addcarryx_u64(&x66, &x67, x65, x57, 0x0); + fiat_p256_addcarryx_u64(&x68, &x69, x67, x59, (x61 & UINT64_C(0xffffffff00000001))); + fiat_p256_cmovznz_u64(&x70, x3, (arg5[0]), x62); + fiat_p256_cmovznz_u64(&x71, x3, (arg5[1]), x64); + fiat_p256_cmovznz_u64(&x72, x3, (arg5[2]), x66); + fiat_p256_cmovznz_u64(&x73, x3, (arg5[3]), x68); + x74 = (fiat_p256_uint1)(x22 & 0x1); + fiat_p256_cmovznz_u64(&x75, x74, 0x0, x7); + fiat_p256_cmovznz_u64(&x76, x74, 0x0, x8); + fiat_p256_cmovznz_u64(&x77, x74, 0x0, x9); + fiat_p256_cmovznz_u64(&x78, x74, 0x0, x10); + fiat_p256_cmovznz_u64(&x79, x74, 0x0, x11); + fiat_p256_addcarryx_u64(&x80, &x81, 0x0, x22, x75); + fiat_p256_addcarryx_u64(&x82, &x83, x81, x23, x76); + fiat_p256_addcarryx_u64(&x84, &x85, x83, x24, x77); + fiat_p256_addcarryx_u64(&x86, &x87, x85, x25, x78); + fiat_p256_addcarryx_u64(&x88, &x89, x87, x26, x79); + fiat_p256_cmovznz_u64(&x90, x74, 0x0, x27); + fiat_p256_cmovznz_u64(&x91, x74, 0x0, x28); + fiat_p256_cmovznz_u64(&x92, x74, 0x0, x29); + fiat_p256_cmovznz_u64(&x93, x74, 0x0, x30); + fiat_p256_addcarryx_u64(&x94, &x95, 0x0, x70, x90); + fiat_p256_addcarryx_u64(&x96, &x97, x95, x71, x91); + fiat_p256_addcarryx_u64(&x98, &x99, x97, x72, x92); + fiat_p256_addcarryx_u64(&x100, &x101, x99, x73, x93); + fiat_p256_subborrowx_u64(&x102, &x103, 0x0, x94, UINT64_C(0xffffffffffffffff)); + fiat_p256_subborrowx_u64(&x104, &x105, x103, x96, UINT32_C(0xffffffff)); + fiat_p256_subborrowx_u64(&x106, &x107, x105, x98, 0x0); + fiat_p256_subborrowx_u64(&x108, &x109, x107, x100, UINT64_C(0xffffffff00000001)); + fiat_p256_subborrowx_u64(&x110, &x111, x109, x101, 0x0); + fiat_p256_addcarryx_u64(&x112, &x113, 0x0, x6, 0x1); + x114 = ((x80 >> 1) | ((x82 << 63) & UINT64_C(0xffffffffffffffff))); + x115 = ((x82 >> 1) | ((x84 << 63) & UINT64_C(0xffffffffffffffff))); + x116 = ((x84 >> 1) | ((x86 << 63) & UINT64_C(0xffffffffffffffff))); + x117 = ((x86 >> 1) | ((x88 << 63) & UINT64_C(0xffffffffffffffff))); + x118 = ((x88 & UINT64_C(0x8000000000000000)) | (x88 >> 1)); + fiat_p256_cmovznz_u64(&x119, x48, x39, x31); + fiat_p256_cmovznz_u64(&x120, x48, x41, x33); + fiat_p256_cmovznz_u64(&x121, x48, x43, x35); + fiat_p256_cmovznz_u64(&x122, x48, x45, x37); + fiat_p256_cmovznz_u64(&x123, x111, x102, x94); + fiat_p256_cmovznz_u64(&x124, x111, x104, x96); + fiat_p256_cmovznz_u64(&x125, x111, x106, x98); + fiat_p256_cmovznz_u64(&x126, x111, x108, x100); + *out1 = x112; + out2[0] = x7; + out2[1] = x8; + out2[2] = x9; + out2[3] = x10; + out2[4] = x11; + out3[0] = x114; + out3[1] = x115; + out3[2] = x116; + out3[3] = x117; + out3[4] = x118; + out4[0] = x119; + out4[1] = x120; + out4[2] = x121; + out4[3] = x122; + out5[0] = x123; + out5[1] = x124; + out5[2] = x125; + out5[3] = x126; +} + +/* + * The function fiat_p256_divstep_precomp returns the precomputed value for Bernstein-Yang-inversion (in montgomery form). 
+ * + * Postconditions: + * eval (from_montgomery out1) = ⌊(m - 1) / 2⌋^(if ⌊log2 m⌋ + 1 < 46 then ⌊(49 * (⌊log2 m⌋ + 1) + 80) / 17⌋ else ⌊(49 * (⌊log2 m⌋ + 1) + 57) / 17⌋) + * 0 ≤ eval out1 < m + * + * Output Bounds: + * out1: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] + */ +static FIAT_P256_FIAT_INLINE void fiat_p256_divstep_precomp(uint64_t out1[4]) { + out1[0] = UINT64_C(0x67ffffffb8000000); + out1[1] = UINT64_C(0xc000000038000000); + out1[2] = UINT64_C(0xd80000007fffffff); + out1[3] = UINT64_C(0x2fffffffffffffff); +}
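// The constants written by fiat_p256_set_one above are the Montgomery
// encoding of 1 for P-256: R mod p = 2^256 - p, where p is the prime whose
// saturated limbs fiat_p256_msat lists. A minimal standalone Rust sketch
// (illustrative only, not part of the vendored sources) checking this by
// taking the 256-bit two's complement of p:
fn main() {
    // P-256 prime, little-endian 64-bit limbs (the low four limbs of fiat_p256_msat).
    let p: [u64; 4] = [
        0xffff_ffff_ffff_ffff,
        0x0000_0000_ffff_ffff,
        0x0000_0000_0000_0000,
        0xffff_ffff_0000_0001,
    ];
    // 2^256 - p == (!p) + 1 over 256 bits, since p < 2^256.
    let mut one_r = [0u64; 4];
    let mut carry = 1u64;
    for i in 0..4 {
        let (limb, overflow) = (!p[i]).overflowing_add(carry);
        one_r[i] = limb;
        carry = u64::from(overflow);
    }
    // Exactly the limbs stored by fiat_p256_set_one.
    assert_eq!(
        one_r,
        [0x1, 0xffff_ffff_0000_0000, 0xffff_ffff_ffff_ffff, 0xffff_fffe]
    );
}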

, + other_prime_len_bits: BitLength, +) -> Result, LimbSliceError> { + // `elem_exp_consttime_inner` is parameterized on `STORAGE_LIMBS` only so + // we can run tests with larger-than-supported-in-operation test vectors. + elem_exp_consttime_inner::( + out, + base, + oneRRR, + exponent, + p, + other_prime_len_bits, + ) +} + +// The maximum modulus size supported for `elem_exp_consttime` in normal +// operation. +const ELEM_EXP_CONSTTIME_MAX_MODULUS_LIMBS: usize = 2048 / LIMB_BITS; +const _LIMBS_PER_CHUNK_DIVIDES_ELEM_EXP_CONSTTIME_MAX_MODULUS_LIMBS: () = + assert!(ELEM_EXP_CONSTTIME_MAX_MODULUS_LIMBS % limbs512::LIMBS_PER_CHUNK == 0); +const WINDOW_BITS: u32 = 5; +const TABLE_ENTRIES: usize = 1 << WINDOW_BITS; +const STORAGE_ENTRIES: usize = TABLE_ENTRIES + if cfg!(target_arch = "x86_64") { 3 } else { 0 }; + +#[cfg(not(target_arch = "x86_64"))] +fn elem_exp_consttime_inner( + out: Storage, + base_mod_n: &Elem, + oneRRR: &One, + exponent: &PrivateExponent, + m: &Modulus, + other_prime_len_bits: BitLength, +) -> Result, LimbSliceError> { + use crate::{bssl, limb::Window}; + + let base_rinverse: Elem = elem_reduced(out, base_mod_n, m, other_prime_len_bits); + + let num_limbs = m.limbs().len(); + let m_chunked: AsChunks = match slice::as_chunks(m.limbs()) + { + (m, []) => m, + _ => { + return Err(LimbSliceError::len_mismatch(LenMismatchError::new( + num_limbs, + ))) + } + }; + let cpe = m_chunked.len(); // 512-bit chunks per entry. + + // This code doesn't have the strict alignment requirements that the x86_64 + // version does, but uses the same aligned storage for convenience. + assert!(STORAGE_LIMBS % (STORAGE_ENTRIES * limbs512::LIMBS_PER_CHUNK) == 0); // TODO: `const` + let mut table = limbs512::AlignedStorage::::zeroed(); + let mut table = table + .aligned_chunks_mut(TABLE_ENTRIES, cpe) + .map_err(LimbSliceError::len_mismatch)?; + + // TODO: Rewrite the below in terms of `AsChunks`. + let table = table.as_flattened_mut(); + + fn gather(table: &[Limb], acc: &mut Elem, i: Window) { + prefixed_extern! { + fn LIMBS_select_512_32( + r: *mut Limb, + table: *const Limb, + num_limbs: c::size_t, + i: Window, + ) -> bssl::Result; + } + Result::from(unsafe { + LIMBS_select_512_32(acc.limbs.as_mut_ptr(), table.as_ptr(), acc.limbs.len(), i) + }) + .unwrap(); + } + + fn power( + table: &[Limb], + mut acc: Elem, + m: &Modulus, + i: Window, + mut tmp: Elem, + ) -> (Elem, Elem) { + for _ in 0..WINDOW_BITS { + acc = elem_squared(acc, m); + } + gather(table, &mut tmp, i); + let acc = elem_mul(&tmp, acc, m); + (acc, tmp) + } + + fn entry(table: &[Limb], i: usize, num_limbs: usize) -> &[Limb] { + &table[(i * num_limbs)..][..num_limbs] + } + fn entry_mut(table: &mut [Limb], i: usize, num_limbs: usize) -> &mut [Limb] { + &mut table[(i * num_limbs)..][..num_limbs] + } + + // table[0] = base**0 (i.e. 1). 
+ m.oneR(entry_mut(table, 0, num_limbs)); + + // table[1] = base*R == (base/R * RRR)/R + limbs_mul_mont( + ( + entry_mut(table, 1, num_limbs), + base_rinverse.limbs.as_ref(), + oneRRR.as_ref().limbs.as_ref(), + ), + m.limbs(), + m.n0(), + m.cpu_features(), + )?; + for i in 2..TABLE_ENTRIES { + let (src1, src2) = if i % 2 == 0 { + (i / 2, i / 2) + } else { + (i - 1, 1) + }; + let (previous, rest) = table.split_at_mut(num_limbs * i); + let src1 = entry(previous, src1, num_limbs); + let src2 = entry(previous, src2, num_limbs); + let dst = entry_mut(rest, 0, num_limbs); + limbs_mul_mont((dst, src1, src2), m.limbs(), m.n0(), m.cpu_features())?; + } + + let mut acc = Elem { + limbs: base_rinverse.limbs, + encoding: PhantomData, + }; + let tmp = m.alloc_zero(); + let tmp = Elem { + limbs: tmp.limbs, + encoding: PhantomData, + }; + let (acc, _) = limb::fold_5_bit_windows( + exponent.limbs(), + |initial_window| { + gather(&table, &mut acc, initial_window); + (acc, tmp) + }, + |(acc, tmp), window| power(&table, acc, m, window, tmp), + ); + + Ok(acc.into_unencoded(m)) +} + +#[cfg(target_arch = "x86_64")] +fn elem_exp_consttime_inner( + out: Storage, + base_mod_n: &Elem, + oneRRR: &One, + exponent: &PrivateExponent, + m: &Modulus, + other_prime_len_bits: BitLength, +) -> Result, LimbSliceError> { + use super::limbs::x86_64::mont::{ + gather5, mul_mont5, mul_mont_gather5_amm, power5_amm, scatter5, sqr_mont5, + }; + use crate::{ + cpu::{ + intel::{Adx, Bmi2}, + GetFeature as _, + }, + limb::{LeakyWindow, Window}, + polyfill::slice::AsChunksMut, + }; + + let n0 = m.n0(); + + let cpu2 = m.cpu_features().get_feature(); + let cpu3 = m.cpu_features().get_feature(); + + if base_mod_n.limbs.len() != m.limbs().len() * 2 { + return Err(LimbSliceError::len_mismatch(LenMismatchError::new( + base_mod_n.limbs.len(), + ))); + } + + let m_original: AsChunks = match slice::as_chunks(m.limbs()) { + (m, []) => m, + _ => return Err(LimbSliceError::len_mismatch(LenMismatchError::new(8))), + }; + let cpe = m_original.len(); // 512-bit chunks per entry + + let oneRRR = &oneRRR.as_ref().limbs; + let oneRRR = match slice::as_chunks(oneRRR) { + (c, []) => c, + _ => { + return Err(LimbSliceError::len_mismatch(LenMismatchError::new( + oneRRR.len(), + ))) + } + }; + + // The x86_64 assembly was written under the assumption that the input data + // is aligned to `MOD_EXP_CTIME_ALIGN` bytes, which was/is 64 in OpenSSL. + // Subsequently, it was changed such that, according to BoringSSL, they + // only require 16 byte alignment. We enforce the old, stronger, alignment + // unless/until we can see a benefit to reducing it. + // + // Similarly, OpenSSL uses the x86_64 assembly functions by giving it only + // inputs `tmp`, `am`, and `np` that immediately follow the table. + // According to BoringSSL, in older versions of the OpenSSL code, this + // extra space was required for memory safety because the assembly code + // would over-read the table; according to BoringSSL, this is no longer the + // case. Regardless, the upstream code also contained comments implying + // that this was also important for performance. For now, we do as OpenSSL + // did/does. 
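// Rough, illustrative storage arithmetic (not part of the vendored code):
// the table set up below holds TABLE_ENTRIES = 32 window values plus the
// three scratch entries (`tmp`, `am`, `np`) that the assembly expects to
// follow it. Assuming 64-bit limbs and 8-limb (512-bit) chunks, as in the
// "512-bit chunks per entry" comments above, the footprint for the largest
// normally supported modulus (2048 bits) works out as follows:
fn main() {
    const LIMB_BITS: usize = 64;
    const LIMBS_PER_CHUNK: usize = 512 / LIMB_BITS; // 8 limbs per 512-bit chunk
    const TABLE_ENTRIES: usize = 1 << 5; // 32 possible 5-bit windows
    const STORAGE_ENTRIES: usize = TABLE_ENTRIES + 3; // + tmp, am, np on x86_64

    let modulus_limbs = 2048 / LIMB_BITS; // 32 limbs
    let chunks_per_entry = modulus_limbs / LIMBS_PER_CHUNK; // "cpe" above: 4
    let storage_limbs = STORAGE_ENTRIES * chunks_per_entry * LIMBS_PER_CHUNK;
    assert_eq!(storage_limbs, 1120); // 8960 bytes, kept 64-byte aligned
}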
+ const MOD_EXP_CTIME_ALIGN: usize = 64; + // Required by + const _TABLE_ENTRIES_IS_32: () = assert!(TABLE_ENTRIES == 32); + const _STORAGE_ENTRIES_HAS_3_EXTRA: () = assert!(STORAGE_ENTRIES == TABLE_ENTRIES + 3); + + assert!(STORAGE_LIMBS % (STORAGE_ENTRIES * limbs512::LIMBS_PER_CHUNK) == 0); // TODO: `const` + let mut table = limbs512::AlignedStorage::::zeroed(); + let mut table = table + .aligned_chunks_mut(STORAGE_ENTRIES, cpe) + .map_err(LimbSliceError::len_mismatch)?; + let (mut table, mut state) = table.split_at_mut(TABLE_ENTRIES * cpe); + assert_eq!((table.as_ptr() as usize) % MOD_EXP_CTIME_ALIGN, 0); + + // These are named `(tmp, am, np)` in BoringSSL. + let (mut acc, mut rest) = state.split_at_mut(cpe); + let (mut base_cached, mut m_cached) = rest.split_at_mut(cpe); + + // "To improve cache locality" according to upstream. + m_cached + .as_flattened_mut() + .copy_from_slice(m_original.as_flattened()); + let m_cached = m_cached.as_ref(); + + let out: Elem = elem_reduced(out, base_mod_n, m, other_prime_len_bits); + let base_rinverse = match slice::as_chunks(&out.limbs) { + (c, []) => c, + _ => { + return Err(LimbSliceError::len_mismatch(LenMismatchError::new( + out.limbs.len(), + ))) + } + }; + + // base_cached = base*R == (base/R * RRR)/R + mul_mont5( + base_cached.as_mut(), + base_rinverse, + oneRRR, + m_cached, + n0, + cpu2, + )?; + let base_cached = base_cached.as_ref(); + let mut out = Storage::from(out); // recycle. + + // Fill in all the powers of 2 of `acc` into the table using only squaring and without any + // gathering, storing the last calculated power into `acc`. + fn scatter_powers_of_2( + mut table: AsChunksMut, + mut acc: AsChunksMut, + m_cached: AsChunks, + n0: &N0, + mut i: LeakyWindow, + cpu: Option<(Adx, Bmi2)>, + ) -> Result<(), LimbSliceError> { + loop { + scatter5(acc.as_ref(), table.as_mut(), i)?; + i *= 2; + if i >= TABLE_ENTRIES as LeakyWindow { + break; + } + sqr_mont5(acc.as_mut(), m_cached, n0, cpu)?; + } + Ok(()) + } + + // All entries in `table` will be Montgomery encoded. + + // acc = table[0] = base**0 (i.e. 1). + m.oneR(acc.as_flattened_mut()); + scatter5(acc.as_ref(), table.as_mut(), 0)?; + + // acc = base**1 (i.e. base). + acc.as_flattened_mut() + .copy_from_slice(base_cached.as_flattened()); + + // Fill in entries 1, 2, 4, 8, 16. + scatter_powers_of_2(table.as_mut(), acc.as_mut(), m_cached, n0, 1, cpu2)?; + // Fill in entries 3, 6, 12, 24; 5, 10, 20, 30; 7, 14, 28; 9, 18; 11, 22; 13, 26; 15, 30; + // 17; 19; 21; 23; 25; 27; 29; 31. + for i in (3..(TABLE_ENTRIES as LeakyWindow)).step_by(2) { + let power = Window::from(i - 1); + assert!(power < 32); // Not secret, + unsafe { + mul_mont_gather5_amm( + acc.as_mut(), + base_cached, + table.as_ref(), + m_cached, + n0, + power, + cpu3, + ) + }?; + scatter_powers_of_2(table.as_mut(), acc.as_mut(), m_cached, n0, i, cpu2)?; + } + + let table = table.as_ref(); + + let acc = limb::fold_5_bit_windows( + exponent.limbs(), + |initial_window| { + unsafe { gather5(acc.as_mut(), table, initial_window) } + .unwrap_or_else(unwrap_impossible_limb_slice_error); + acc + }, + |mut acc, window| { + unsafe { power5_amm(acc.as_mut(), table, m_cached, n0, window, cpu3) } + .unwrap_or_else(unwrap_impossible_limb_slice_error); + acc + }, + ); + + // Reuse `base_rinverse`'s limbs to save an allocation. + out.limbs.copy_from_slice(acc.as_flattened()); + Ok(from_montgomery_amm(out, m)) +} + +/// Verified a == b**-1 (mod m), i.e. a**-1 == b (mod m). 
+pub fn verify_inverses_consttime( + a: &Elem, + b: Elem, + m: &Modulus, +) -> Result<(), error::Unspecified> { + let r = elem_mul(a, b, m); + limb::verify_limbs_equal_1_leak_bit(&r.limbs) +} + +#[inline] +pub fn elem_verify_equal_consttime( + a: &Elem, + b: &Elem, +) -> Result<(), error::Unspecified> { + let equal = limb::limbs_equal_limbs_consttime(&a.limbs, &b.limbs) + .unwrap_or_else(unwrap_impossible_len_mismatch_error); + if !equal.leak() { + return Err(error::Unspecified); + } + Ok(()) +} + +#[cold] +#[inline(never)] +fn unwrap_impossible_len_mismatch_error(LenMismatchError { .. }: LenMismatchError) -> T { + unreachable!() +} + +#[cold] +#[inline(never)] +fn unwrap_impossible_limb_slice_error(err: LimbSliceError) { + match err { + LimbSliceError::LenMismatch(_) => unreachable!(), + LimbSliceError::TooShort(_) => unreachable!(), + LimbSliceError::TooLong(_) => unreachable!(), + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::cpu; + use crate::testutil as test; + + // Type-level representation of an arbitrary modulus. + struct M {} + + impl PublicModulus for M {} + + #[test] + fn test_elem_exp_consttime() { + let cpu_features = cpu::features(); + test::run( + test_vector_file!("../../crypto/fipsmodule/bn/test/mod_exp_tests.txt"), + |section, test_case| { + assert_eq!(section, ""); + + let m = consume_modulus::(test_case, "M"); + let m = m.modulus(cpu_features); + let expected_result = consume_elem(test_case, "ModExp", &m); + let base = consume_elem(test_case, "A", &m); + let e = { + let bytes = test_case.consume_bytes("E"); + PrivateExponent::from_be_bytes_for_test_only(untrusted::Input::from(&bytes), &m) + .expect("valid exponent") + }; + + let oneRR = One::newRR(m.alloc_zero(), &m); + let oneRRR = One::newRRR(oneRR, &m); + + // `base` in the test vectors is reduced (mod M) already but + // the API expects the bsae to be (mod N) where N = M * P for + // some other prime of the same length. Fake that here. + // Pretend there's another prime of equal length. + struct N {} + let other_modulus_len_bits = m.len_bits(); + let base: Elem = { + let mut limbs = BoxedLimbs::zero(base.limbs.len() * 2); + limbs[..base.limbs.len()].copy_from_slice(&base.limbs); + Elem { + limbs, + encoding: PhantomData, + } + }; + + let too_big = m.limbs().len() > ELEM_EXP_CONSTTIME_MAX_MODULUS_LIMBS; + let actual_result = if !too_big { + elem_exp_consttime( + m.alloc_zero(), + &base, + &oneRRR, + &e, + &m, + other_modulus_len_bits, + ) + } else { + let actual_result = elem_exp_consttime( + m.alloc_zero(), + &base, + &oneRRR, + &e, + &m, + other_modulus_len_bits, + ); + // TODO: Be more specific with which error we expect? + assert!(actual_result.is_err()); + // Try again with a larger-than-normally-supported limit + elem_exp_consttime_inner::<_, _, { (4096 / LIMB_BITS) * STORAGE_ENTRIES }>( + m.alloc_zero(), + &base, + &oneRRR, + &e, + &m, + other_modulus_len_bits, + ) + }; + match actual_result { + Ok(r) => assert_elem_eq(&r, &expected_result), + Err(LimbSliceError::LenMismatch { .. }) => panic!(), + Err(LimbSliceError::TooLong { .. }) => panic!(), + Err(LimbSliceError::TooShort { .. }) => panic!(), + }; + + Ok(()) + }, + ) + } + + // TODO: fn test_elem_exp_vartime() using + // "src/rsa/bigint_elem_exp_vartime_tests.txt". See that file for details. + // In the meantime, the function is tested indirectly via the RSA + // verification and signing tests. 
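// Stripped of Montgomery encoding and all constant-time concerns, the
// exponentiation exercised by the test above is ordinary 5-bit fixed-window
// modular exponentiation: precompute table[i] = base^i (mod m) for i in
// 0..32, then for each 5-bit window of the exponent (most significant first)
// do acc = acc^32 * table[window] (mod m). An illustrative plain-integer
// model (a sketch, not the vendored implementation):
fn windowed_mod_exp(base: u64, exponent: u64, m: u64) -> u64 {
    let mul = |a: u64, b: u64| ((u128::from(a) * u128::from(b)) % u128::from(m)) as u64;
    // table[i] = base^i mod m for every possible 5-bit window value.
    let mut table = [1 % m; 32];
    for i in 1..32 {
        table[i] = mul(table[i - 1], base);
    }
    // 13 windows of 5 bits cover all 64 exponent bits (the top window is short).
    let mut acc = 1 % m;
    for w in (0..13).rev() {
        for _ in 0..5 {
            acc = mul(acc, acc); // acc = acc^32
        }
        let window = ((exponent >> (w * 5)) & 0x1f) as usize;
        acc = mul(acc, table[window]); // fold in base^window
    }
    acc
}
// For example, windowed_mod_exp(7, 560, 561) == 1 because 561 = 3 * 11 * 17 is
// a Carmichael number; the vendored code computes the same quantity in the
// Montgomery domain, with table lookups that do not leak the window value.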
+ #[test] + fn test_elem_mul() { + let cpu_features = cpu::features(); + test::run( + test_vector_file!("../../crypto/fipsmodule/bn/test/mod_mul_tests.txt"), + |section, test_case| { + assert_eq!(section, ""); + + let m = consume_modulus::(test_case, "M"); + let m = m.modulus(cpu_features); + let expected_result = consume_elem(test_case, "ModMul", &m); + let a = consume_elem(test_case, "A", &m); + let b = consume_elem(test_case, "B", &m); + + let b = into_encoded(m.alloc_zero(), b, &m); + let a = into_encoded(m.alloc_zero(), a, &m); + let actual_result = elem_mul(&a, b, &m); + let actual_result = actual_result.into_unencoded(&m); + assert_elem_eq(&actual_result, &expected_result); + + Ok(()) + }, + ) + } + + #[test] + fn test_elem_squared() { + let cpu_features = cpu::features(); + test::run( + test_vector_file!("bigint_elem_squared_tests.txt"), + |section, test_case| { + assert_eq!(section, ""); + + let m = consume_modulus::(test_case, "M"); + let m = m.modulus(cpu_features); + let expected_result = consume_elem(test_case, "ModSquare", &m); + let a = consume_elem(test_case, "A", &m); + + let a = into_encoded(m.alloc_zero(), a, &m); + let actual_result = elem_squared(a, &m); + let actual_result = actual_result.into_unencoded(&m); + assert_elem_eq(&actual_result, &expected_result); + + Ok(()) + }, + ) + } + + #[test] + fn test_elem_reduced() { + let cpu_features = cpu::features(); + test::run( + test_vector_file!("bigint_elem_reduced_tests.txt"), + |section, test_case| { + assert_eq!(section, ""); + + struct M {} + + let m_ = consume_modulus::(test_case, "M"); + let m = m_.modulus(cpu_features); + let expected_result = consume_elem(test_case, "R", &m); + let a = + consume_elem_unchecked::(test_case, "A", expected_result.limbs.len() * 2); + let other_modulus_len_bits = m_.len_bits(); + + let actual_result = elem_reduced(m.alloc_zero(), &a, &m, other_modulus_len_bits); + let oneRR = One::newRR(m.alloc_zero(), &m); + let actual_result = elem_mul(oneRR.as_ref(), actual_result, &m); + assert_elem_eq(&actual_result, &expected_result); + + Ok(()) + }, + ) + } + + #[test] + fn test_elem_reduced_once() { + let cpu_features = cpu::features(); + test::run( + test_vector_file!("bigint_elem_reduced_once_tests.txt"), + |section, test_case| { + assert_eq!(section, ""); + + struct M {} + struct O {} + let m = consume_modulus::(test_case, "m"); + let m = m.modulus(cpu_features); + let a = consume_elem_unchecked::(test_case, "a", m.limbs().len()); + let expected_result = consume_elem::(test_case, "r", &m); + let other_modulus_len_bits = m.len_bits(); + + let actual_result = + elem_reduced_once(m.alloc_zero(), &a, &m, other_modulus_len_bits); + assert_elem_eq(&actual_result, &expected_result); + + Ok(()) + }, + ) + } + + fn consume_elem( + test_case: &mut test::TestCase, + name: &str, + m: &Modulus, + ) -> Elem { + let value = test_case.consume_bytes(name); + Elem::from_be_bytes_padded(untrusted::Input::from(&value), m).unwrap() + } + + fn consume_elem_unchecked( + test_case: &mut test::TestCase, + name: &str, + num_limbs: usize, + ) -> Elem { + let bytes = test_case.consume_bytes(name); + let mut limbs = BoxedLimbs::zero(num_limbs); + limb::parse_big_endian_and_pad_consttime(untrusted::Input::from(&bytes), &mut limbs) + .unwrap(); + Elem { + limbs, + encoding: PhantomData, + } + } + + fn consume_modulus(test_case: &mut test::TestCase, name: &str) -> OwnedModulus { + let value = test_case.consume_bytes(name); + OwnedModulus::from( + OwnedModulusValue::from_be_bytes(untrusted::Input::from(&value)).unwrap(), + 
) + } + + fn assert_elem_eq(a: &Elem, b: &Elem) { + if elem_verify_equal_consttime(a, b).is_err() { + panic!("{:x?} != {:x?}", &*a.limbs, &*b.limbs); + } + } + + fn into_encoded(out: Storage, a: Elem, m: &Modulus) -> Elem { + let oneRR = One::newRR(out, m); + elem_mul(oneRR.as_ref(), a, m) + } +} diff --git a/ring-0.17.14/src/arithmetic/bigint/boxed_limbs.rs b/ring-0.17.14/src/arithmetic/bigint/boxed_limbs.rs new file mode 100644 index 0000000000..6eb465c91d --- /dev/null +++ b/ring-0.17.14/src/arithmetic/bigint/boxed_limbs.rs @@ -0,0 +1,81 @@ +// Copyright 2015-2023 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +use super::Modulus; +use crate::{ + error, + limb::{self, Limb}, +}; +use alloc::{boxed::Box, vec}; +use core::{ + marker::PhantomData, + ops::{Deref, DerefMut}, +}; + +/// All `BoxedLimbs` are stored in the same number of limbs. +pub(super) struct BoxedLimbs { + limbs: Box<[Limb]>, + + /// The modulus *m* that determines the size of `limbx`. + m: PhantomData, +} + +impl Deref for BoxedLimbs { + type Target = [Limb]; + #[inline] + fn deref(&self) -> &Self::Target { + &self.limbs + } +} + +impl DerefMut for BoxedLimbs { + #[inline] + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.limbs + } +} + +// TODO: `derive(Clone)` after https://github.com/rust-lang/rust/issues/26925 +// is resolved or restrict `M: Clone`. +impl Clone for BoxedLimbs { + fn clone(&self) -> Self { + Self { + limbs: self.limbs.clone(), + m: self.m, + } + } +} + +impl BoxedLimbs { + pub(super) fn from_be_bytes_padded_less_than( + input: untrusted::Input, + m: &Modulus, + ) -> Result { + let mut r = Self::zero(m.limbs().len()); + limb::parse_big_endian_and_pad_consttime(input, &mut r)?; + limb::verify_limbs_less_than_limbs_leak_bit(&r, m.limbs())?; + Ok(r) + } + + pub(super) fn zero(len: usize) -> Self { + Self { + limbs: vec![0; len].into_boxed_slice(), + m: PhantomData, + } + } + + pub(super) fn into_limbs(self) -> Box<[Limb]> { + self.limbs + } +} diff --git a/ring-0.17.14/src/arithmetic/bigint/modulus.rs b/ring-0.17.14/src/arithmetic/bigint/modulus.rs new file mode 100644 index 0000000000..b0cd6d6519 --- /dev/null +++ b/ring-0.17.14/src/arithmetic/bigint/modulus.rs @@ -0,0 +1,202 @@ +// Copyright 2015-2024 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +use super::{ + super::montgomery::Unencoded, unwrap_impossible_len_mismatch_error, BoxedLimbs, Elem, + OwnedModulusValue, PublicModulus, Storage, N0, +}; +use crate::{ + bits::BitLength, + cpu, error, + limb::{self, Limb, LIMB_BITS}, + polyfill::LeadingZerosStripped, +}; +use core::marker::PhantomData; + +/// The modulus *m* for a ring ℤ/mℤ, along with the precomputed values needed +/// for efficient Montgomery multiplication modulo *m*. The value must be odd +/// and larger than 2. The larger-than-1 requirement is imposed, at least, by +/// the modular inversion code. +pub struct OwnedModulus { + inner: OwnedModulusValue, + + // n0 * N == -1 (mod r). + // + // r == 2**(N0::LIMBS_USED * LIMB_BITS) and LG_LITTLE_R == lg(r). This + // ensures that we can do integer division by |r| by simply ignoring + // `N0::LIMBS_USED` limbs. Similarly, we can calculate values modulo `r` by + // just looking at the lowest `N0::LIMBS_USED` limbs. This is what makes + // Montgomery multiplication efficient. + // + // As shown in Algorithm 1 of "Fast Prime Field Elliptic Curve Cryptography + // with 256 Bit Primes" by Shay Gueron and Vlad Krasnov, in the loop of a + // multi-limb Montgomery multiplication of a * b (mod n), given the + // unreduced product t == a * b, we repeatedly calculate: + // + // t1 := t % r |t1| is |t|'s lowest limb (see previous paragraph). + // t2 := t1*n0*n + // t3 := t + t2 + // t := t3 / r copy all limbs of |t3| except the lowest to |t|. + // + // In the last step, it would only make sense to ignore the lowest limb of + // |t3| if it were zero. The middle steps ensure that this is the case: + // + // t3 == 0 (mod r) + // t + t2 == 0 (mod r) + // t + t1*n0*n == 0 (mod r) + // t1*n0*n == -t (mod r) + // t*n0*n == -t (mod r) + // n0*n == -1 (mod r) + // n0 == -1/n (mod r) + // + // Thus, in each iteration of the loop, we multiply by the constant factor + // n0, the negative inverse of n (mod r). + // + // TODO(perf): Not all 32-bit platforms actually make use of n0[1]. For the + // ones that don't, we could use a shorter `R` value and use faster `Limb` + // calculations instead of double-precision `u64` calculations. + n0: N0, +} + +impl Clone for OwnedModulus { + fn clone(&self) -> Self { + Self { + inner: self.inner.clone(), + n0: self.n0, + } + } +} + +impl OwnedModulus { + pub(crate) fn from(n: OwnedModulusValue) -> Self { + // n_mod_r = n % r. As explained in the documentation for `n0`, this is + // done by taking the lowest `N0::LIMBS_USED` limbs of `n`. + #[allow(clippy::useless_conversion)] + let n0 = { + prefixed_extern! { + fn bn_neg_inv_mod_r_u64(n: u64) -> u64; + } + + // XXX: u64::from isn't guaranteed to be constant time. + let mut n_mod_r: u64 = u64::from(n.limbs()[0]); + + if N0::LIMBS_USED == 2 { + // XXX: If we use `<< LIMB_BITS` here then 64-bit builds + // fail to compile because of `deny(exceeding_bitshifts)`. 
+ debug_assert_eq!(LIMB_BITS, 32); + n_mod_r |= u64::from(n.limbs()[1]) << 32; + } + N0::precalculated(unsafe { bn_neg_inv_mod_r_u64(n_mod_r) }) + }; + + Self { inner: n, n0 } + } + + pub fn to_elem(&self, l: &Modulus) -> Result, error::Unspecified> { + self.inner.verify_less_than(l)?; + let mut limbs = BoxedLimbs::zero(l.limbs().len()); + limbs[..self.inner.limbs().len()].copy_from_slice(self.inner.limbs()); + Ok(Elem { + limbs, + encoding: PhantomData, + }) + } + + pub(crate) fn modulus(&self, cpu_features: cpu::Features) -> Modulus { + Modulus { + limbs: self.inner.limbs(), + n0: self.n0, + len_bits: self.len_bits(), + m: PhantomData, + cpu_features, + } + } + + pub fn len_bits(&self) -> BitLength { + self.inner.len_bits() + } +} + +impl OwnedModulus { + pub fn be_bytes(&self) -> LeadingZerosStripped + Clone + '_> { + LeadingZerosStripped::new(limb::unstripped_be_bytes(self.inner.limbs())) + } +} + +pub struct Modulus<'a, M> { + limbs: &'a [Limb], + n0: N0, + len_bits: BitLength, + m: PhantomData, + cpu_features: cpu::Features, +} + +impl Modulus<'_, M> { + pub(super) fn oneR(&self, out: &mut [Limb]) { + assert_eq!(self.limbs.len(), out.len()); + + let r = self.limbs.len() * LIMB_BITS; + + // out = 2**r - m where m = self. + limb::limbs_negative_odd(out, self.limbs); + + let lg_m = self.len_bits().as_bits(); + let leading_zero_bits_in_m = r - lg_m; + + // When m's length is a multiple of LIMB_BITS, which is the case we + // most want to optimize for, then we already have + // out == 2**r - m == 2**r (mod m). + if leading_zero_bits_in_m != 0 { + debug_assert!(leading_zero_bits_in_m < LIMB_BITS); + // Correct out to 2**(lg m) (mod m). `limbs_negative_odd` flipped + // all the leading zero bits to ones. Flip them back. + *out.last_mut().unwrap() &= (!0) >> leading_zero_bits_in_m; + + // Now we have out == 2**(lg m) (mod m). Keep doubling until we get + // to 2**r (mod m). + for _ in 0..leading_zero_bits_in_m { + limb::limbs_double_mod(out, self.limbs) + .unwrap_or_else(unwrap_impossible_len_mismatch_error); + } + } + + // Now out == 2**r (mod m) == 1*R. + } + + // TODO: XXX Avoid duplication with `Modulus`. + pub fn alloc_zero(&self) -> Storage { + Storage { + limbs: BoxedLimbs::zero(self.limbs.len()), + } + } + + #[inline] + pub(super) fn limbs(&self) -> &[Limb] { + self.limbs + } + + #[inline] + pub(super) fn n0(&self) -> &N0 { + &self.n0 + } + + pub fn len_bits(&self) -> BitLength { + self.len_bits + } + + #[inline] + pub(crate) fn cpu_features(&self) -> cpu::Features { + self.cpu_features + } +} diff --git a/ring-0.17.14/src/arithmetic/bigint/modulusvalue.rs b/ring-0.17.14/src/arithmetic/bigint/modulusvalue.rs new file mode 100644 index 0000000000..84f5884902 --- /dev/null +++ b/ring-0.17.14/src/arithmetic/bigint/modulusvalue.rs @@ -0,0 +1,88 @@ +// Copyright 2015-2024 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +use super::{ + super::{MAX_LIMBS, MIN_LIMBS}, + BoxedLimbs, Modulus, PublicModulus, +}; +use crate::{ + bits::BitLength, + error, + limb::{self, Limb, LIMB_BYTES}, +}; + +/// `OwnedModulus`, without the overhead of Montgomery multiplication support. +pub(crate) struct OwnedModulusValue { + limbs: BoxedLimbs, // Also `value >= 3`. + + len_bits: BitLength, +} + +impl Clone for OwnedModulusValue { + fn clone(&self) -> Self { + Self { + limbs: self.limbs.clone(), + len_bits: self.len_bits, + } + } +} + +impl OwnedModulusValue { + pub(crate) fn from_be_bytes(input: untrusted::Input) -> Result { + let num_limbs = (input.len() + LIMB_BYTES - 1) / LIMB_BYTES; + const _MODULUS_MIN_LIMBS_AT_LEAST_2: () = assert!(MIN_LIMBS >= 2); + if num_limbs < MIN_LIMBS { + return Err(error::KeyRejected::unexpected_error()); + } + if num_limbs > MAX_LIMBS { + return Err(error::KeyRejected::too_large()); + } + // The above implies n >= 3, so we don't need to check that. + + // Reject leading zeros. Also reject the value zero ([0]) because zero + // isn't positive. + if untrusted::Reader::new(input).peek(0) { + return Err(error::KeyRejected::invalid_encoding()); + } + + let mut limbs = BoxedLimbs::zero(num_limbs); + limb::parse_big_endian_and_pad_consttime(input, &mut limbs) + .map_err(|error::Unspecified| error::KeyRejected::unexpected_error())?; + limb::limbs_reject_even_leak_bit(&limbs) + .map_err(|_: error::Unspecified| error::KeyRejected::invalid_component())?; + + let len_bits = limb::limbs_minimal_bits(&limbs); + + Ok(Self { limbs, len_bits }) + } + + pub fn verify_less_than(&self, l: &Modulus) -> Result<(), error::Unspecified> { + if self.len_bits() > l.len_bits() { + return Err(error::Unspecified); + } + if self.limbs.len() == l.limbs().len() { + limb::verify_limbs_less_than_limbs_leak_bit(&self.limbs, l.limbs())?; + } + Ok(()) + } + + pub fn len_bits(&self) -> BitLength { + self.len_bits + } + + #[inline] + pub(super) fn limbs(&self) -> &[Limb] { + &self.limbs + } +} diff --git a/ring-0.17.14/src/arithmetic/bigint/private_exponent.rs b/ring-0.17.14/src/arithmetic/bigint/private_exponent.rs new file mode 100644 index 0000000000..82e75bf888 --- /dev/null +++ b/ring-0.17.14/src/arithmetic/bigint/private_exponent.rs @@ -0,0 +1,77 @@ +// Copyright 2015-2023 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+ +use super::{limb, BoxedLimbs, Limb, Modulus}; +use crate::error; +use alloc::boxed::Box; + +pub struct PrivateExponent { + // Unlike most `[Limb]` we deal with, these are stored most significant + // word first. + limbs: Box<[Limb]>, +} + +impl PrivateExponent { + // `p` is the modulus for which the exponent is in the interval [1, `p` - 1). + pub fn from_be_bytes_padded( + input: untrusted::Input, + p: &Modulus, + ) -> Result { + let mut dP = BoxedLimbs::from_be_bytes_padded_less_than(input, p)?; + + // Proof that `dP < p - 1`: + // + // If `dP < p` then either `dP == p - 1` or `dP < p - 1`. Since `p` is + // odd, `p - 1` is even. `d` is odd, and an odd number modulo an even + // number is odd. Therefore `dP` must be odd. But then it cannot be + // `p - 1` and so we know `dP < p - 1`. + // + // Further we know `dP != 0` because `dP` is not even. + limb::limbs_reject_even_leak_bit(&dP)?; + dP.reverse(); + + Ok(Self { + limbs: dP.into_limbs(), + }) + } + + // Create a `PrivateExponent` with a value that we do not support in + // production use, to allow testing with additional test vectors. + #[cfg(test)] + pub fn from_be_bytes_for_test_only( + input: untrusted::Input, + p: &Modulus, + ) -> Result { + use crate::limb::LIMB_BYTES; + + // Do exactly what `from_be_bytes_padded` does for any inputs it accepts. + if let r @ Ok(_) = Self::from_be_bytes_padded(input, p) { + return r; + } + + let num_limbs = (input.len() + LIMB_BYTES - 1) / LIMB_BYTES; + let mut limbs = BoxedLimbs::::zero(num_limbs); + limb::parse_big_endian_and_pad_consttime(input, &mut limbs) + .map_err(|error::Unspecified| error::KeyRejected::unexpected_error())?; + limbs.reverse(); + Ok(Self { + limbs: limbs.into_limbs(), + }) + } + + #[inline] + pub(super) fn limbs(&self) -> &[Limb] { + &self.limbs + } +} diff --git a/ring-0.17.14/src/arithmetic/constant.rs b/ring-0.17.14/src/arithmetic/constant.rs new file mode 100644 index 0000000000..81d61ea9a7 --- /dev/null +++ b/ring-0.17.14/src/arithmetic/constant.rs @@ -0,0 +1,27 @@ +use crate::limb::LeakyLimb; +use core::mem::size_of; + +const fn parse_digit(d: u8) -> u8 { + match d.to_ascii_lowercase() { + b'0'..=b'9' => d - b'0', + b'a'..=b'f' => d - b'a' + 10, + _ => panic!(), + } +} + +// TODO: this would be nicer as a trait, but currently traits don't support const functions +pub const fn limbs_from_hex(hex: &str) -> [LeakyLimb; LIMBS] { + let hex = hex.as_bytes(); + let mut limbs = [0; LIMBS]; + let limb_nibbles = size_of::() * 2; + let mut i = 0; + + while i < hex.len() { + let char = hex[hex.len() - 1 - i]; + let val = parse_digit(char); + limbs[i / limb_nibbles] |= (val as LeakyLimb) << ((i % limb_nibbles) * 4); + i += 1; + } + + limbs +} diff --git a/ring-0.17.14/src/arithmetic/ffi.rs b/ring-0.17.14/src/arithmetic/ffi.rs new file mode 100644 index 0000000000..336fb68de8 --- /dev/null +++ b/ring-0.17.14/src/arithmetic/ffi.rs @@ -0,0 +1,97 @@ +// Copyright 2024-2025 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +use super::{inout::AliasingSlices3, n0::N0, LimbSliceError, MAX_LIMBS, MIN_LIMBS}; +use crate::{c, limb::Limb, polyfill::usize_from_u32}; +use core::{mem::size_of, num::NonZeroUsize}; + +const _MAX_LIMBS_ADDRESSES_MEMORY_SAFETY_ISSUES: () = { + // BoringSSL's limit: 8 kiloBYTES. + const BN_MONTGOMERY_MAX_WORDS: usize = (8 * 1092) / size_of::(); + assert!(MAX_LIMBS <= BN_MONTGOMERY_MAX_WORDS); + + // Some 64-bit assembly implementations were written to take `len` as a + // `c_int`, so they zero out the undefined top half of `len` to convert it + // to a `usize`. But, others don't. + assert!(MAX_LIMBS <= usize_from_u32(u32::MAX)); +}; + +macro_rules! bn_mul_mont_ffi { + ( $in_out:expr, $n:expr, $n0:expr, $cpu:expr, + unsafe { ($MIN_LEN:expr, $MOD_LEN:expr, $Cpu:ty) => $f:ident }) => {{ + use crate::{c, limb::Limb}; + prefixed_extern! { + // `r` and/or 'a' and/or 'b' may alias. + // XXX: BoringSSL declares these functions to return `int`. + fn $f( + r: *mut Limb, + a: *const Limb, + b: *const Limb, + n: *const Limb, + n0: &N0, + len: c::NonZero_size_t, + ); + } + unsafe { + crate::arithmetic::ffi::bn_mul_mont_ffi::<$Cpu, { $MIN_LEN }, { $MOD_LEN }>( + $in_out, $n, $n0, $cpu, $f, + ) + } + }}; +} + +#[inline] +pub(super) unsafe fn bn_mul_mont_ffi( + in_out: impl AliasingSlices3, + n: &[Limb], + n0: &N0, + cpu: Cpu, + f: unsafe extern "C" fn( + r: *mut Limb, + a: *const Limb, + b: *const Limb, + n: *const Limb, + n0: &N0, + len: c::NonZero_size_t, + ), +) -> Result<(), LimbSliceError> { + assert_eq!(n.len() % LEN_MOD, 0); // The caller should guard against this. + + /// The x86 implementation of `bn_mul_mont`, at least, requires at least 4 + /// limbs. For a long time we have required 4 limbs for all targets, though + /// this may be unnecessary. + const _MIN_LIMBS_AT_LEAST_4: () = assert!(MIN_LIMBS >= 4); + // We haven't tested shorter lengths. + assert!(LEN_MIN >= MIN_LIMBS); + if n.len() < LEN_MIN { + return Err(LimbSliceError::too_short(n.len())); + } + let len = NonZeroUsize::new(n.len()).unwrap_or_else(|| { + // Unreachable because we checked against `LEN_MIN`, and we checked + // `LEN_MIN` is nonzero. + unreachable!() + }); + + // Avoid stack overflow from the alloca inside. + if len.get() > MAX_LIMBS { + return Err(LimbSliceError::too_long(n.len())); + } + in_out + .with_non_dangling_non_null_pointers_rab(len, |r, a, b| { + let n = n.as_ptr(); + let _: Cpu = cpu; + unsafe { f(r, a, b, n, n0, len) }; + }) + .map_err(LimbSliceError::len_mismatch) +} diff --git a/ring-0.17.14/src/arithmetic/inout.rs b/ring-0.17.14/src/arithmetic/inout.rs new file mode 100644 index 0000000000..0c9c4a1293 --- /dev/null +++ b/ring-0.17.14/src/arithmetic/inout.rs @@ -0,0 +1,177 @@ +// Copyright 2025 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +pub(crate) use crate::error::LenMismatchError; +use core::num::NonZeroUsize; + +pub(crate) trait AliasingSlices2 { + /// The pointers passed to `f` will be valid and non-null, and will not + /// be dangling, so they can be passed to C functions. + /// + /// The first pointer, `r`, may be pointing to uninitialized memory for + /// `expected_len` elements of type `T`, properly aligned and writable. + /// `f` must not read from `r` before writing to it. + /// + /// The second & third pointers, `a` and `b`, point to `expected_len` + /// values of type `T`, properly aligned. + /// + /// `r`, `a`, and/or `b` may alias each other only in the following ways: + /// `ptr::eq(r, a)`, `ptr::eq(r, b)`, and/or `ptr::eq(a, b)`; i.e. they + /// will not be "overlapping." + /// + /// Implementations of this trait shouldn't override this default + /// implementation. + #[inline(always)] + fn with_non_dangling_non_null_pointers_ra( + self, + expected_len: NonZeroUsize, + f: impl FnOnce(*mut T, *const T) -> R, + ) -> Result + where + Self: Sized, + { + self.with_potentially_dangling_non_null_pointers_ra(expected_len.get(), f) + } + + /// If `expected_len == 0` then the pointers passed to `f` may be + /// dangling pointers, which should not be passed to C functions. In all + /// other respects, this works like + /// `Self::with_non_dangling_non_null_pointers_rab`. + /// + /// Implementations of this trait should implement this method and not + /// `with_non_dangling_non_null_pointers_rab`. Users of this trait should + /// use `with_non_dangling_non_null_pointers_rab` and not this. + fn with_potentially_dangling_non_null_pointers_ra( + self, + expected_len: usize, + f: impl FnOnce(*mut T, *const T) -> R, + ) -> Result; +} + +impl AliasingSlices2 for &mut [T] { + fn with_potentially_dangling_non_null_pointers_ra( + self, + expected_len: usize, + f: impl FnOnce(*mut T, *const T) -> R, + ) -> Result { + let r = self; + if r.len() != expected_len { + return Err(LenMismatchError::new(r.len())); + } + Ok(f(r.as_mut_ptr(), r.as_ptr())) + } +} + +impl AliasingSlices2 for (&mut [T], &[T]) { + fn with_potentially_dangling_non_null_pointers_ra( + self, + expected_len: usize, + f: impl FnOnce(*mut T, *const T) -> R, + ) -> Result { + let (r, a) = self; + if r.len() != expected_len { + return Err(LenMismatchError::new(r.len())); + } + if a.len() != expected_len { + return Err(LenMismatchError::new(a.len())); + } + Ok(f(r.as_mut_ptr(), a.as_ptr())) + } +} + +pub(crate) trait AliasingSlices3 { + /// The pointers passed to `f` will all be non-null and properly aligned, + /// and will not be dangling. + /// + /// The first pointer, `r` points to potentially-uninitialized writable + /// space for `expected_len` elements of type `T`. Accordingly, `f` must + /// not read from `r` before writing to it. + /// + /// The second & third pointers, `a` and `b`, point to `expected_len` + /// initialized values of type `T`. + /// + /// `r`, `a`, and/or `b` may alias each other, but only in the following + /// ways: `ptr::eq(r, a)`, `ptr::eq(r, b)`, and/or `ptr::eq(a, b)`; they + /// will not be "overlapping." + /// + /// Implementations of this trait shouldn't override this default + /// implementation. 
+ #[inline(always)] + fn with_non_dangling_non_null_pointers_rab( + self, + expected_len: NonZeroUsize, + f: impl FnOnce(*mut T, *const T, *const T) -> R, + ) -> Result + where + Self: Sized, + { + self.with_potentially_dangling_non_null_pointers_rab(expected_len.get(), f) + } + + /// If `expected_len == 0` then the pointers passed to `f` may be + /// dangling pointers, which should not be passed to C functions. In all + /// other respects, this works like + /// `Self::with_non_dangling_non_null_pointers_rab`. + /// + /// Implementations of this trait should implement this method and not + /// `with_non_dangling_non_null_pointers_rab`. Users of this trait should + /// use `with_non_dangling_non_null_pointers_rab` and not this. + fn with_potentially_dangling_non_null_pointers_rab( + self, + expected_len: usize, + f: impl FnOnce(*mut T, *const T, *const T) -> R, + ) -> Result; +} + +impl AliasingSlices3 for &mut [T] { + fn with_potentially_dangling_non_null_pointers_rab( + self, + expected_len: usize, + f: impl FnOnce(*mut T, *const T, *const T) -> R, + ) -> Result { + >::with_potentially_dangling_non_null_pointers_ra( + self, + expected_len, + |r, a| f(r, r, a), + ) + } +} + +impl AliasingSlices3 for (&mut [T], &[T], &[T]) { + fn with_potentially_dangling_non_null_pointers_rab( + self, + expected_len: usize, + f: impl FnOnce(*mut T, *const T, *const T) -> R, + ) -> Result { + let (r, a, b) = self; + ((r, a), b).with_potentially_dangling_non_null_pointers_rab(expected_len, f) + } +} + +impl AliasingSlices3 for (RA, &[T]) +where + RA: AliasingSlices2, +{ + fn with_potentially_dangling_non_null_pointers_rab( + self, + expected_len: usize, + f: impl FnOnce(*mut T, *const T, *const T) -> R, + ) -> Result { + let (ra, b) = self; + if b.len() != expected_len { + return Err(LenMismatchError::new(b.len())); + } + ra.with_potentially_dangling_non_null_pointers_ra(expected_len, |r, a| f(r, a, b.as_ptr())) + } +} diff --git a/ring-0.17.14/src/arithmetic/limbs/aarch64/mod.rs b/ring-0.17.14/src/arithmetic/limbs/aarch64/mod.rs new file mode 100644 index 0000000000..fecce47f33 --- /dev/null +++ b/ring-0.17.14/src/arithmetic/limbs/aarch64/mod.rs @@ -0,0 +1,17 @@ +// Copyright 2025 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +#![cfg(all(target_arch = "aarch64", target_endian = "little"))] + +pub(in super::super) mod mont; diff --git a/ring-0.17.14/src/arithmetic/limbs/aarch64/mont.rs b/ring-0.17.14/src/arithmetic/limbs/aarch64/mont.rs new file mode 100644 index 0000000000..edd289f7c0 --- /dev/null +++ b/ring-0.17.14/src/arithmetic/limbs/aarch64/mont.rs @@ -0,0 +1,59 @@ +// Copyright 2025 Brian Smith. 
+// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +#![cfg(all(target_arch = "aarch64", target_endian = "little"))] + +use super::super::super::{inout::AliasingSlices3 as _, n0::N0, LimbSliceError, MAX_LIMBS}; +use crate::{ + c, + limb::Limb, + polyfill::slice::{AsChunks, AsChunksMut}, +}; +use core::num::NonZeroUsize; + +#[inline] +pub(in super::super::super) fn sqr_mont5( + mut in_out: AsChunksMut, + n: AsChunks, + n0: &N0, +) -> Result<(), LimbSliceError> { + prefixed_extern! { + // `r` and/or 'a' may alias. + // XXX: BoringSSL (kinda, implicitly) declares this to return `int`. + // `num` must be a non-zero multiple of 8. + fn bn_sqr8x_mont( + rp: *mut Limb, + ap: *const Limb, + ap_again: *const Limb, + np: *const Limb, + n0: &N0, + num: c::NonZero_size_t); + } + + let in_out = in_out.as_flattened_mut(); + let n = n.as_flattened(); + let num_limbs = NonZeroUsize::new(n.len()).ok_or_else(|| LimbSliceError::too_short(n.len()))?; + + // Avoid stack overflow from the alloca inside. + if num_limbs.get() > MAX_LIMBS { + return Err(LimbSliceError::too_long(num_limbs.get())); + } + + in_out + .with_non_dangling_non_null_pointers_rab(num_limbs, |r, a, a_again| { + let n = n.as_ptr(); // Non-dangling because num_limbs > 0. + unsafe { bn_sqr8x_mont(r, a, a_again, n, n0, num_limbs) }; + }) + .map_err(LimbSliceError::len_mismatch) +} diff --git a/ring-0.17.14/src/arithmetic/limbs/mod.rs b/ring-0.17.14/src/arithmetic/limbs/mod.rs new file mode 100644 index 0000000000..0dc6af36c3 --- /dev/null +++ b/ring-0.17.14/src/arithmetic/limbs/mod.rs @@ -0,0 +1,16 @@ +// Copyright 2025 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +pub(super) mod aarch64; +pub(super) mod x86_64; diff --git a/ring-0.17.14/src/arithmetic/limbs/x86_64/mod.rs b/ring-0.17.14/src/arithmetic/limbs/x86_64/mod.rs new file mode 100644 index 0000000000..d7dc08cf7b --- /dev/null +++ b/ring-0.17.14/src/arithmetic/limbs/x86_64/mod.rs @@ -0,0 +1,17 @@ +// Copyright 2025 Brian Smith. 
+// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +#![cfg(target_arch = "x86_64")] + +pub(in super::super::super) mod mont; diff --git a/ring-0.17.14/src/arithmetic/limbs/x86_64/mont.rs b/ring-0.17.14/src/arithmetic/limbs/x86_64/mont.rs new file mode 100644 index 0000000000..bf92c0c56f --- /dev/null +++ b/ring-0.17.14/src/arithmetic/limbs/x86_64/mont.rs @@ -0,0 +1,312 @@ +// Copyright 2015-2025 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +#![cfg(target_arch = "x86_64")] + +use super::super::super::{ + inout::{AliasingSlices2, AliasingSlices3}, + n0::N0, + LimbSliceError, MAX_LIMBS, +}; +use crate::{ + c, + cpu::intel::{Adx, Bmi1, Bmi2}, + error::LenMismatchError, + limb::{LeakyWindow, Limb, Window}, + polyfill::slice::{AsChunks, AsChunksMut}, +}; +use core::num::NonZeroUsize; + +const _512_IS_LIMB_BITS_TIMES_8: () = assert!(8 * Limb::BITS == 512); + +#[inline] +pub(in super::super::super) fn mul_mont5( + mut r: AsChunksMut, + a: AsChunks, + b: AsChunks, + m: AsChunks, + n0: &N0, + maybe_adx_bmi2: Option<(Adx, Bmi2)>, +) -> Result<(), LimbSliceError> { + mul_mont5_4x( + (r.as_flattened_mut(), a.as_flattened(), b.as_flattened()), + m.into(), + n0, + maybe_adx_bmi2, + ) +} + +pub const MIN_4X: usize = 8; + +#[inline] +pub(in super::super::super) fn mul_mont5_4x( + in_out: impl AliasingSlices3, + n: AsChunks, + n0: &N0, + maybe_adx_bmi2: Option<(Adx, Bmi2)>, +) -> Result<(), LimbSliceError> { + const MOD_4X: usize = 4; + let n = n.as_flattened(); + if let Some(cpu) = maybe_adx_bmi2 { + bn_mul_mont_ffi!(in_out, n, n0, cpu, unsafe { + (MIN_4X, MOD_4X, (Adx, Bmi2)) => bn_mulx4x_mont + }) + } else { + bn_mul_mont_ffi!(in_out, n, n0, (), unsafe { + (MIN_4X, MOD_4X, ()) => bn_mul4x_mont + }) + } +} + +#[inline] +pub(in super::super::super) fn sqr_mont5( + mut in_out: AsChunksMut, + n: AsChunks, + n0: &N0, + maybe_adx_bmi2: Option<(Adx, Bmi2)>, +) -> Result<(), LimbSliceError> { + prefixed_extern! { + // `r` and/or 'a' may alias. + // XXX: BoringSSL declares this to return `int`. + // `num` must be a non-zero multiple of 8. 
+ fn bn_sqr8x_mont( + rp: *mut Limb, + ap: *const Limb, + mulx_adx_capable: Limb, + np: *const Limb, + n0: &N0, + num: c::NonZero_size_t); + } + + let in_out = in_out.as_flattened_mut(); + let n = n.as_flattened(); + let num_limbs = NonZeroUsize::new(n.len()).ok_or_else(|| LimbSliceError::too_short(n.len()))?; + + // Avoid stack overflow from the alloca inside. + if num_limbs.get() > MAX_LIMBS { + return Err(LimbSliceError::too_long(num_limbs.get())); + } + + // `Limb::from(mulx_adx.is_some())`, but intentionally branchy. + let mulx_adx_capable = match maybe_adx_bmi2 { + Some(_) => Limb::from(true), + None => Limb::from(false), + }; + + in_out + .with_non_dangling_non_null_pointers_ra(num_limbs, |r, a| { + let n = n.as_ptr(); // Non-dangling because num_limbs > 0. + unsafe { bn_sqr8x_mont(r, a, mulx_adx_capable, n, n0, num_limbs) }; + }) + .map_err(LimbSliceError::len_mismatch) +} + +#[inline(always)] +pub(in super::super::super) fn scatter5( + a: AsChunks, + mut table: AsChunksMut, + power: LeakyWindow, +) -> Result<(), LimbSliceError> { + prefixed_extern! { + // Upstream uses `num: c::size_t` too, and `power: c::size_t`; see + // `_MAX_LIMBS_ADDRESSES_MEMORY_SAFETY_ISSUES`. + fn bn_scatter5( + inp: *const Limb, + num: c::NonZero_size_t, + table: *mut Limb, + power: LeakyWindow, + ); + } + let num_limbs = check_common(a, table.as_ref())?; + let a = a.as_flattened(); + let table = table.as_flattened_mut(); + assert!(power < 32); + unsafe { bn_scatter5(a.as_ptr(), num_limbs, table.as_mut_ptr(), power) }; + Ok(()) +} + +// SAFETY: `power` must be less than 32. +#[inline(always)] +pub(in super::super::super) unsafe fn gather5( + mut r: AsChunksMut, + table: AsChunks, + power: Window, +) -> Result<(), LimbSliceError> { + prefixed_extern! { + // Upstream uses `num: c::size_t` too, and `power: c::size_t`; see + // `_MAX_LIMBS_ADDRESSES_MEMORY_SAFETY_ISSUES`. + fn bn_gather5( + out: *mut Limb, + num: c::NonZero_size_t, + table: *const Limb, + power: Window); + } + let num_limbs = check_common(r.as_ref(), table)?; + let r = r.as_flattened_mut(); + let table = table.as_flattened(); + // SAFETY: We cannot assert that `power` is in range because it is secret. + // TODO: Create a `Window5` type that is guaranteed to be in range. + unsafe { bn_gather5(r.as_mut_ptr(), num_limbs, table.as_ptr(), power) }; + Ok(()) +} + +// SAFETY: `power` must be less than 32. +#[inline(always)] +pub(in super::super::super) unsafe fn mul_mont_gather5_amm( + mut r: AsChunksMut, + a: AsChunks, + table: AsChunks, + n: AsChunks, + n0: &N0, + power: Window, + maybe_adx_bmi1_bmi2: Option<(Adx, Bmi1, Bmi2)>, +) -> Result<(), LimbSliceError> { + prefixed_extern! { + // Upstream has `num: c_int` and `power: c_int`; see + // `_MAX_LIMBS_ADDRESSES_MEMORY_SAFETY_ISSUES`. + fn bn_mul4x_mont_gather5( + rp: *mut Limb, + ap: *const Limb, + table: *const Limb, + np: *const Limb, + n0: &N0, + num: c::NonZero_size_t, + power: Window, + ); + // Upstream has `num: c_int` and `power: c_int`; see + // `_MAX_LIMBS_ADDRESSES_MEMORY_SAFETY_ISSUES`. 
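+        // The `x`-suffixed variant is the MULX/ADX code path; it is only
+        // invoked below when `maybe_adx_bmi1_bmi2` is `Some(..)`.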
+ fn bn_mulx4x_mont_gather5( + rp: *mut Limb, + ap: *const Limb, + table: *const Limb, + np: *const Limb, + n0: &N0, + num: c::NonZero_size_t, + power: Window, + ); + } + let num_limbs = check_common_with_n(r.as_ref(), table, n)?; + let a = a.as_flattened(); + if a.len() != num_limbs.get() { + return Err(LimbSliceError::len_mismatch(LenMismatchError::new(a.len()))); + } + let r = r.as_flattened_mut(); + let r = r.as_mut_ptr(); + let a = a.as_ptr(); + let table = table.as_flattened(); + let table = table.as_ptr(); + let n = n.as_flattened(); + let n = n.as_ptr(); + // SAFETY: We cannot assert that `power` is in range because it is secret. + // TODO: Create a `Window5` type that is guaranteed to be in range. + if maybe_adx_bmi1_bmi2.is_some() { + unsafe { bn_mulx4x_mont_gather5(r, a, table, n, n0, num_limbs, power) } + } else { + unsafe { bn_mul4x_mont_gather5(r, a, table, n, n0, num_limbs, power) } + }; + Ok(()) +} + +// SAFETY: `power` must be less than 32. +#[inline(always)] +pub(in super::super::super) unsafe fn power5_amm( + mut in_out: AsChunksMut, + table: AsChunks, + n: AsChunks, + n0: &N0, + power: Window, + maybe_adx_bmi1_bmi2: Option<(Adx, Bmi1, Bmi2)>, +) -> Result<(), LimbSliceError> { + prefixed_extern! { + // Upstream has `num: c_int` and `power: c_int`; see + // `_MAX_LIMBS_ADDRESSES_MEMORY_SAFETY_ISSUES`. + fn bn_power5_nohw( + rp: *mut Limb, + ap: *const Limb, + table: *const Limb, + np: *const Limb, + n0: &N0, + num: c::NonZero_size_t, + power: Window, + ); + // Upstream has `num: c_int` and `power: c_int`; see + // `_MAX_LIMBS_ADDRESSES_MEMORY_SAFETY_ISSUES`. + fn bn_powerx5( + rp: *mut Limb, + ap: *const Limb, + table: *const Limb, + np: *const Limb, + n0: &N0, + num: c::NonZero_size_t, + power: Window, + ); + } + let num_limbs = check_common_with_n(in_out.as_ref(), table, n)?; + let in_out = in_out.as_flattened_mut(); + let r = in_out.as_mut_ptr(); + let a = in_out.as_ptr(); + let table = table.as_flattened(); + let table = table.as_ptr(); + let n = n.as_flattened(); + let n = n.as_ptr(); + // SAFETY: We cannot assert that `power` is in range because it is secret. + // TODO: Create a `Window5` type that is guaranteed to be in range. + if maybe_adx_bmi1_bmi2.is_some() { + unsafe { bn_powerx5(r, a, table, n, n0, num_limbs, power) } + } else { + unsafe { bn_power5_nohw(r, a, table, n, n0, num_limbs, power) } + }; + Ok(()) +} + +// Helps the compiler will be able to hoist all of these checks out of the +// loops in the caller. Try to help the compiler by doing the checks +// consistently in each function and also by inlining this function and all the +// callers. +#[inline(always)] +fn check_common( + a: AsChunks, + table: AsChunks, +) -> Result { + assert_eq!((table.as_ptr() as usize) % 16, 0); // According to BoringSSL. + let a = a.as_flattened(); + let table = table.as_flattened(); + let num_limbs = NonZeroUsize::new(a.len()).ok_or_else(|| LimbSliceError::too_short(a.len()))?; + if num_limbs.get() > MAX_LIMBS { + return Err(LimbSliceError::too_long(a.len())); + } + if num_limbs.get() * 32 != table.len() { + return Err(LimbSliceError::len_mismatch(LenMismatchError::new( + table.len(), + ))); + }; + Ok(num_limbs) +} + +#[inline(always)] +fn check_common_with_n( + a: AsChunks, + table: AsChunks, + n: AsChunks, +) -> Result { + // Choose `a` instead of `n` so that every function starts with + // `check_common` passing the exact same arguments, so that the compiler + // can easily de-dupe the checks. 
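+    // `check_common` also enforces that the table is 16-byte aligned and
+    // holds exactly `num_limbs * 32` limbs, i.e. one entry per possible
+    // 5-bit window value used by the scatter5/gather5/power5 functions.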
+ let num_limbs = check_common(a, table)?; + let n = n.as_flattened(); + if n.len() != num_limbs.get() { + return Err(LimbSliceError::len_mismatch(LenMismatchError::new(n.len()))); + } + Ok(num_limbs) +} diff --git a/ring-0.17.14/src/arithmetic/limbs512/mod.rs b/ring-0.17.14/src/arithmetic/limbs512/mod.rs new file mode 100644 index 0000000000..8b77ecf475 --- /dev/null +++ b/ring-0.17.14/src/arithmetic/limbs512/mod.rs @@ -0,0 +1,17 @@ +// Copyright 2025 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +mod storage; + +pub(super) use self::storage::{AlignedStorage, LIMBS_PER_CHUNK}; diff --git a/ring-0.17.14/src/arithmetic/limbs512/storage.rs b/ring-0.17.14/src/arithmetic/limbs512/storage.rs new file mode 100644 index 0000000000..210192b4f6 --- /dev/null +++ b/ring-0.17.14/src/arithmetic/limbs512/storage.rs @@ -0,0 +1,60 @@ +// Copyright 2025 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +use crate::{ + error::LenMismatchError, + limb::{Limb, LIMB_BITS}, + polyfill::slice::{self, AsChunksMut}, +}; +use core::mem::{align_of, size_of}; + +// Some x86_64 assembly is written under the assumption that some of its +// input data and/or temporary storage is aligned to `MOD_EXP_CTIME_ALIGN` +// bytes, which was/is 64 in OpenSSL. +// +// We use this in the non-X86-64 implementation of exponentiation as well, +// with the hope of converging th two implementations into one. + +#[repr(C, align(64))] +pub struct AlignedStorage([Limb; N]); + +const _LIMB_SIZE_DIVIDES_ALIGNMENT: () = + assert!(align_of::>() % size_of::() == 0); + +pub const LIMBS_PER_CHUNK: usize = 512 / LIMB_BITS; + +impl AlignedStorage { + pub fn zeroed() -> Self { + assert_eq!(N % LIMBS_PER_CHUNK, 0); // TODO: const. + Self([0; N]) + } + + // The result will have every chunk aligned on a 64 byte boundary. 
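+    // Why this holds: each chunk is `LIMBS_PER_CHUNK` limbs = 512 bits =
+    // 64 bytes, and the storage itself is 64-byte aligned via
+    // `repr(align(64))`, so chunk `k` begins at byte offset `64 * k` from
+    // an aligned base address.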
+ pub fn aligned_chunks_mut( + &mut self, + num_entries: usize, + chunks_per_entry: usize, + ) -> Result, LenMismatchError> { + let total_limbs = num_entries * chunks_per_entry * LIMBS_PER_CHUNK; + let len = self.0.len(); + let flattened = self + .0 + .get_mut(..total_limbs) + .ok_or_else(|| LenMismatchError::new(len))?; + match slice::as_chunks_mut(flattened) { + (chunks, []) => Ok(chunks), + (_, r) => Err(LenMismatchError::new(r.len())), + } + } +} diff --git a/ring-0.17.14/src/arithmetic/montgomery.rs b/ring-0.17.14/src/arithmetic/montgomery.rs new file mode 100644 index 0000000000..4098eced31 --- /dev/null +++ b/ring-0.17.14/src/arithmetic/montgomery.rs @@ -0,0 +1,384 @@ +// Copyright 2017-2025 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +pub use super::n0::N0; +use super::{inout::AliasingSlices3, LimbSliceError, MIN_LIMBS}; +use crate::cpu; +use cfg_if::cfg_if; + +// Indicates that the element is not encoded; there is no *R* factor +// that needs to be canceled out. +#[derive(Copy, Clone)] +pub enum Unencoded {} + +// Indicates that the element is encoded; the value has one *R* +// factor that needs to be canceled out. +#[derive(Copy, Clone)] +pub enum R {} + +// Indicates the element is encoded three times; the value has three +// *R* factors that need to be canceled out. +#[allow(clippy::upper_case_acronyms)] +#[derive(Copy, Clone)] +pub enum RRR {} + +// Indicates the element is encoded twice; the value has two *R* +// factors that need to be canceled out. +#[derive(Copy, Clone)] +pub enum RR {} + +// Indicates the element is inversely encoded; the value has one +// 1/*R* factor that needs to be canceled out. +#[derive(Copy, Clone)] +pub enum RInverse {} + +pub trait Encoding {} + +impl Encoding for RRR {} +impl Encoding for RR {} +impl Encoding for R {} +impl Encoding for Unencoded {} +impl Encoding for RInverse {} + +/// The encoding of the result of a reduction. +pub trait ReductionEncoding { + type Output: Encoding; +} + +impl ReductionEncoding for RRR { + type Output = RR; +} + +impl ReductionEncoding for RR { + type Output = R; +} +impl ReductionEncoding for R { + type Output = Unencoded; +} +impl ReductionEncoding for Unencoded { + type Output = RInverse; +} + +/// The encoding of the result of a multiplication. 
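+///
+/// Montgomery multiplication computes `a * b * R^-1 (mod n)`, so, counting
+/// `R` factors (`Unencoded` = 0, `R` = 1, `RR` = 2, `RRR` = 3,
+/// `RInverse` = -1), the product's encoding has the sum of the inputs'
+/// counts minus one. For example: (R, R) => R, (RR, RR) => RRR,
+/// (Unencoded, R) => Unencoded, and (RInverse, R) => RInverse.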
+pub trait ProductEncoding { + type Output: Encoding; +} + +impl ProductEncoding for (Unencoded, E) { + type Output = E::Output; +} + +impl ProductEncoding for (R, E) { + type Output = E; +} + +impl ProductEncoding for (RR, RR) { + type Output = RRR; +} + +impl ProductEncoding for (RInverse, E) +where + E::Output: ReductionEncoding, +{ + type Output = <::Output as ReductionEncoding>::Output; +} + +// XXX: Rust doesn't allow overlapping impls, +// TODO (if/when Rust allows it): +// impl ProductEncoding for +// (E1, E2) { +// type Output = <(E2, E1) as ProductEncoding>::Output; +// } +impl ProductEncoding for (RR, Unencoded) { + type Output = <(Unencoded, RR) as ProductEncoding>::Output; +} +impl ProductEncoding for (RR, RInverse) { + type Output = <(RInverse, RR) as ProductEncoding>::Output; +} + +impl ProductEncoding for (RRR, RInverse) { + type Output = <(RInverse, RRR) as ProductEncoding>::Output; +} + +#[allow(unused_imports)] +use crate::{bssl, c, limb::Limb}; + +#[inline(always)] +pub(super) fn limbs_mul_mont( + in_out: impl AliasingSlices3, + n: &[Limb], + n0: &N0, + cpu: cpu::Features, +) -> Result<(), LimbSliceError> { + const MOD_FALLBACK: usize = 1; // No restriction. + cfg_if! { + if #[cfg(all(target_arch = "aarch64", target_endian = "little"))] { + let _: cpu::Features = cpu; + const MIN_4X: usize = 4; + const MOD_4X: usize = 4; + if n.len() >= MIN_4X && n.len() % MOD_4X == 0 { + bn_mul_mont_ffi!(in_out, n, n0, (), unsafe { + (MIN_4X, MOD_4X, ()) => bn_mul4x_mont + }) + } else { + bn_mul_mont_ffi!(in_out, n, n0, (), unsafe { + (MIN_LIMBS, MOD_FALLBACK, ()) => bn_mul_mont_nohw + }) + } + } else if #[cfg(all(target_arch = "arm", target_endian = "little"))] { + const MIN_8X: usize = 8; + const MOD_8X: usize = 8; + if n.len() >= MIN_8X && n.len() % MOD_8X == 0 { + use crate::cpu::{GetFeature as _, arm::Neon}; + if let Some(cpu) = cpu.get_feature() { + return bn_mul_mont_ffi!(in_out, n, n0, cpu, unsafe { + (MIN_8X, MOD_8X, Neon) => bn_mul8x_mont_neon + }); + } + } + // The ARM version of `bn_mul_mont_nohw` has a minimum of 2. + const _MIN_LIMBS_AT_LEAST_2: () = assert!(MIN_LIMBS >= 2); + bn_mul_mont_ffi!(in_out, n, n0, (), unsafe { + (MIN_LIMBS, MOD_FALLBACK, ()) => bn_mul_mont_nohw + }) + } else if #[cfg(target_arch = "x86")] { + use crate::{cpu::GetFeature as _, cpu::intel::Sse2}; + // The X86 implementation of `bn_mul_mont` has a minimum of 4. + const _MIN_LIMBS_AT_LEAST_4: () = assert!(MIN_LIMBS >= 4); + if let Some(cpu) = cpu.get_feature() { + bn_mul_mont_ffi!(in_out, n, n0, cpu, unsafe { + (MIN_LIMBS, MOD_FALLBACK, Sse2) => bn_mul_mont + }) + } else { + // This isn't really an FFI call; it's defined below. + unsafe { + super::ffi::bn_mul_mont_ffi::<(), {MIN_LIMBS}, 1>(in_out, n, n0, (), + bn_mul_mont_fallback) + } + } + } else if #[cfg(target_arch = "x86_64")] { + use crate::{cpu::GetFeature as _, polyfill::slice}; + use super::limbs::x86_64; + if n.len() >= x86_64::mont::MIN_4X { + if let (n, []) = slice::as_chunks(n) { + return x86_64::mont::mul_mont5_4x(in_out, n, n0, cpu.get_feature()); + } + } + bn_mul_mont_ffi!(in_out, n, n0, (), unsafe { + (MIN_LIMBS, MOD_FALLBACK, ()) => bn_mul_mont_nohw + }) + } else { + // Use the fallback implementation implemented below through the + // FFI wrapper defined below, so that Rust and C code both go + // through `bn_mul_mont`. + bn_mul_mont_ffi!(in_out, n, n0, cpu, unsafe { + (MIN_LIMBS, MOD_FALLBACK, cpu::Features) => bn_mul_mont + }) + } + } +} + +cfg_if! 
{ + if #[cfg(not(any( + all(target_arch = "aarch64", target_endian = "little"), + all(target_arch = "arm", target_endian = "little"), + target_arch = "x86_64")))] { + + // TODO: Stop calling this from C and un-export it. + #[cfg(not(target_arch = "x86"))] + prefixed_export! { + unsafe extern "C" fn bn_mul_mont( + r: *mut Limb, + a: *const Limb, + b: *const Limb, + n: *const Limb, + n0: &N0, + num_limbs: c::NonZero_size_t, + ) { + unsafe { bn_mul_mont_fallback(r, a, b, n, n0, num_limbs) } + } + } + + #[cfg_attr(target_arch = "x86", cold)] + #[cfg_attr(target_arch = "x86", inline(never))] + unsafe extern "C" fn bn_mul_mont_fallback( + r: *mut Limb, + a: *const Limb, + b: *const Limb, + n: *const Limb, + n0: &N0, + num_limbs: c::NonZero_size_t, + ) { + use super::MAX_LIMBS; + + let num_limbs = num_limbs.get(); + + // The mutable pointer `r` may alias `a` and/or `b`, so the lifetimes of + // any slices for `a` or `b` must not overlap with the lifetime of any + // mutable for `r`. + + // Nothing aliases `n` + let n = unsafe { core::slice::from_raw_parts(n, num_limbs) }; + + let mut tmp = [0; 2 * MAX_LIMBS]; + let tmp = &mut tmp[..(2 * num_limbs)]; + { + let a: &[Limb] = unsafe { core::slice::from_raw_parts(a, num_limbs) }; + let b: &[Limb] = unsafe { core::slice::from_raw_parts(b, num_limbs) }; + limbs_mul(tmp, a, b); + } + let r: &mut [Limb] = unsafe { core::slice::from_raw_parts_mut(r, num_limbs) }; + limbs_from_mont_in_place(r, tmp, n, n0); + } + } +} + +// `bigint` needs then when the `alloc` feature is enabled. `bn_mul_mont` above needs this when +// we are using the platforms for which we don't have `bn_mul_mont` in assembly. +#[cfg(any( + feature = "alloc", + not(any( + all(target_arch = "aarch64", target_endian = "little"), + all(target_arch = "arm", target_endian = "little"), + target_arch = "x86_64" + )) +))] +pub(super) fn limbs_from_mont_in_place(r: &mut [Limb], tmp: &mut [Limb], m: &[Limb], n0: &N0) { + prefixed_extern! { + fn bn_from_montgomery_in_place( + r: *mut Limb, + num_r: c::size_t, + a: *mut Limb, + num_a: c::size_t, + n: *const Limb, + num_n: c::size_t, + n0: &N0, + ) -> bssl::Result; + } + Result::from(unsafe { + bn_from_montgomery_in_place( + r.as_mut_ptr(), + r.len(), + tmp.as_mut_ptr(), + tmp.len(), + m.as_ptr(), + m.len(), + n0, + ) + }) + .unwrap() +} + +#[cfg(not(any( + all(target_arch = "aarch64", target_endian = "little"), + all(target_arch = "arm", target_endian = "little"), + target_arch = "x86_64" +)))] +fn limbs_mul(r: &mut [Limb], a: &[Limb], b: &[Limb]) { + debug_assert_eq!(r.len(), 2 * a.len()); + debug_assert_eq!(a.len(), b.len()); + let ab_len = a.len(); + + r[..ab_len].fill(0); + for (i, &b_limb) in b.iter().enumerate() { + r[ab_len + i] = unsafe { + limbs_mul_add_limb(r[i..][..ab_len].as_mut_ptr(), a.as_ptr(), b_limb, ab_len) + }; + } +} + +#[cfg(any( + test, + not(any( + all(target_arch = "aarch64", target_endian = "little"), + all(target_arch = "arm", target_endian = "little"), + target_arch = "x86_64", + )) +))] +prefixed_extern! 
{ + // `r` must not alias `a` + #[must_use] + fn limbs_mul_add_limb(r: *mut Limb, a: *const Limb, b: Limb, num_limbs: c::size_t) -> Limb; +} + +/// r = r**2 +pub(super) fn limbs_square_mont( + r: &mut [Limb], + n: &[Limb], + n0: &N0, + cpu: cpu::Features, +) -> Result<(), LimbSliceError> { + #[cfg(all(target_arch = "aarch64", target_endian = "little"))] + { + use super::limbs::aarch64; + use crate::polyfill::slice; + if let ((r, []), (n, [])) = (slice::as_chunks_mut(r), slice::as_chunks(n)) { + return aarch64::mont::sqr_mont5(r, n, n0); + } + } + + #[cfg(target_arch = "x86_64")] + { + use super::limbs::x86_64; + use crate::{cpu::GetFeature as _, polyfill::slice}; + if let ((r, []), (n, [])) = (slice::as_chunks_mut(r), slice::as_chunks(n)) { + return x86_64::mont::sqr_mont5(r, n, n0, cpu.get_feature()); + } + } + + limbs_mul_mont(r, n, n0, cpu) +} + +#[cfg(test)] +mod tests { + use super::super::MAX_LIMBS; + use super::*; + use crate::limb::Limb; + + #[test] + // TODO: wasm + fn test_mul_add_words() { + const ZERO: Limb = 0; + const MAX: Limb = ZERO.wrapping_sub(1); + static TEST_CASES: &[(&[Limb], &[Limb], Limb, Limb, &[Limb])] = &[ + (&[0], &[0], 0, 0, &[0]), + (&[MAX], &[0], MAX, 0, &[MAX]), + (&[0], &[MAX], MAX, MAX - 1, &[1]), + (&[MAX], &[MAX], MAX, MAX, &[0]), + (&[0, 0], &[MAX, MAX], MAX, MAX - 1, &[1, MAX]), + (&[1, 0], &[MAX, MAX], MAX, MAX - 1, &[2, MAX]), + (&[MAX, 0], &[MAX, MAX], MAX, MAX, &[0, 0]), + (&[0, 1], &[MAX, MAX], MAX, MAX, &[1, 0]), + (&[MAX, MAX], &[MAX, MAX], MAX, MAX, &[0, MAX]), + ]; + + for (i, (r_input, a, w, expected_retval, expected_r)) in TEST_CASES.iter().enumerate() { + let mut r = [0; MAX_LIMBS]; + let r = { + let r = &mut r[..r_input.len()]; + r.copy_from_slice(r_input); + r + }; + assert_eq!(r.len(), a.len()); // Sanity check + let actual_retval = + unsafe { limbs_mul_add_limb(r.as_mut_ptr(), a.as_ptr(), *w, a.len()) }; + assert_eq!(&r, expected_r, "{}: {:x?} != {:x?}", i, r, expected_r); + assert_eq!( + actual_retval, *expected_retval, + "{}: {:x?} != {:x?}", + i, actual_retval, *expected_retval + ); + } + } +} diff --git a/ring-0.17.14/src/arithmetic/n0.rs b/ring-0.17.14/src/arithmetic/n0.rs new file mode 100644 index 0000000000..f0c77ddecc --- /dev/null +++ b/ring-0.17.14/src/arithmetic/n0.rs @@ -0,0 +1,37 @@ +// Copyright 2015-2022 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
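+
+// `N0` carries the Montgomery reduction constant `n0 = -n^-1 (mod b)`,
+// where `b` is the radix the assembly reduces by (conventionally 2^64
+// here, which is why the constant is supplied as a `u64`). Roughly, each
+// reduction step computes `m = t0 * n0 (mod b)` and then adds `m * n` so
+// that the lowest limb(s) of the accumulator become zero and the exact
+// division by `b` is just a shift. The two-limb array lets 32-bit targets
+// store all 64 bits.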
+ +use crate::limb::Limb; + +#[derive(Clone, Copy)] +#[repr(transparent)] +pub struct N0([Limb; 2]); + +impl N0 { + #[cfg(feature = "alloc")] + pub(super) const LIMBS_USED: usize = 64 / crate::limb::LIMB_BITS; + + #[inline] + pub const fn precalculated(n0: u64) -> Self { + #[cfg(target_pointer_width = "64")] + { + Self([n0, 0]) + } + + #[cfg(target_pointer_width = "32")] + { + Self([n0 as Limb, (n0 >> crate::limb::LIMB_BITS) as Limb]) + } + } +} diff --git a/ring-0.17.14/src/bb/boolmask.rs b/ring-0.17.14/src/bb/boolmask.rs new file mode 100644 index 0000000000..d504df15aa --- /dev/null +++ b/ring-0.17.14/src/bb/boolmask.rs @@ -0,0 +1,41 @@ +// Copyright 2024 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +use super::Word; +use core::ops; + +// BoolMask is either `BoolMask::TRUE` or `BoolMask::FALSE`. +#[repr(transparent)] +pub struct BoolMask(Word); + +impl BoolMask { + #[cfg(test)] + pub(super) const TRUE: Self = Self(Word::MAX); + #[cfg(test)] + pub(super) const FALSE: Self = Self(0); + + /// Returns true if `self` is `BoolMask::TRUE`; otherwise, returns false + /// (`self` is `BoolMask::FALSE`). + pub(crate) fn leak(self) -> bool { + self.0 != 0 + } +} + +impl ops::BitAnd for BoolMask { + type Output = Self; + + fn bitand(self, rhs: Self) -> Self { + Self(self.0 & rhs.0) + } +} diff --git a/ring-0.17.14/src/bb/leaky.rs b/ring-0.17.14/src/bb/leaky.rs new file mode 100644 index 0000000000..4ff5b6c914 --- /dev/null +++ b/ring-0.17.14/src/bb/leaky.rs @@ -0,0 +1,27 @@ +// Copyright 2015-2024 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +#[cfg(target_pointer_width = "64")] +type CompilerWord = u64; + +#[cfg(target_pointer_width = "32")] +type CompilerWord = u32; + +/// A native word that isn't secret. +/// +/// `LeakyWord` supports `as` conversions to/from native types. +/// +/// XXX: This isn't the native word size on targets where a pointer isn't the +/// same size as a native word. TODO: Fix this. 
+pub(crate) type LeakyWord = CompilerWord; diff --git a/ring-0.17.14/src/bb/mod.rs b/ring-0.17.14/src/bb/mod.rs new file mode 100644 index 0000000000..3920a5f0cf --- /dev/null +++ b/ring-0.17.14/src/bb/mod.rs @@ -0,0 +1,162 @@ +// Copyright 2015-2025 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +//! Building blocks. + +use crate::{c, error}; +use core::{ffi::c_int, num::NonZeroUsize}; + +mod boolmask; +mod leaky; +mod word; + +pub(crate) use self::{boolmask::BoolMask, leaky::LeakyWord, word::Word}; + +/// Returns `Ok(())` if `a == b` and `Err(error::Unspecified)` otherwise. +pub fn verify_slices_are_equal(a: &[u8], b: &[u8]) -> Result<(), error::Unspecified> { + let len = a.len(); // Arbitrary choice. + if b.len() != len { + return Err(error::Unspecified); + } + match NonZeroUsize::new(len) { + Some(len) => { + let a = a.as_ptr(); + let b = b.as_ptr(); + // SAFETY: `a` and `b` are valid non-null non-dangling pointers to `len` + // bytes. + let result = unsafe { CRYPTO_memcmp(a, b, len) }; + match result { + 0 => Ok(()), + _ => Err(error::Unspecified), + } + } + None => Ok(()), // Empty slices are equal. + } +} + +prefixed_extern! { + fn CRYPTO_memcmp(a: *const u8, b: *const u8, len: c::NonZero_size_t) -> c_int; +} + +pub(crate) fn xor_16(a: [u8; 16], b: [u8; 16]) -> [u8; 16] { + let a = u128::from_ne_bytes(a); + let b = u128::from_ne_bytes(b); + let r = a ^ b; + r.to_ne_bytes() +} + +#[inline(always)] +pub(crate) fn xor_assign<'a>(a: impl IntoIterator, b: u8) { + a.into_iter().for_each(|a| *a ^= b); +} + +/// XORs the first N bytes of `b` into `a`, where N is +/// `core::cmp::min(a.len(), b.len())`. +#[inline(always)] +pub(crate) fn xor_assign_at_start<'a>( + a: impl IntoIterator, + b: impl IntoIterator, +) { + a.into_iter().zip(b).for_each(|(a, b)| *a ^= *b); +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::{bssl, rand}; + + fn leak_in_test(a: BoolMask) -> bool { + a.leak() + } + + #[test] + fn test_constant_time() -> Result<(), error::Unspecified> { + prefixed_extern! { + fn bssl_constant_time_test_main() -> bssl::Result; + } + Result::from(unsafe { bssl_constant_time_test_main() }) + } + + #[test] + fn constant_time_conditional_memcpy() -> Result<(), error::Unspecified> { + let rng = rand::SystemRandom::new(); + for _ in 0..100 { + let mut out = rand::generate::<[u8; 256]>(&rng)?.expose(); + let input = rand::generate::<[u8; 256]>(&rng)?.expose(); + + // Mask to 16 bits to make zero more likely than it would otherwise be. + let b = (rand::generate::<[u8; 1]>(&rng)?.expose()[0] & 0x0f) == 0; + + let ref_in = input; + let ref_out = if b { input } else { out }; + + prefixed_extern! 
{ + fn bssl_constant_time_test_conditional_memcpy(dst: &mut [u8; 256], src: &[u8; 256], b: BoolMask); + } + unsafe { + bssl_constant_time_test_conditional_memcpy( + &mut out, + &input, + if b { BoolMask::TRUE } else { BoolMask::FALSE }, + ) + } + assert_eq!(ref_in, input); + assert_eq!(ref_out, out); + } + + Ok(()) + } + + #[test] + fn constant_time_conditional_memxor() -> Result<(), error::Unspecified> { + let rng = rand::SystemRandom::new(); + for _ in 0..256 { + let mut out = rand::generate::<[u8; 256]>(&rng)?.expose(); + let input = rand::generate::<[u8; 256]>(&rng)?.expose(); + + // Mask to 16 bits to make zero more likely than it would otherwise be. + let b = (rand::generate::<[u8; 1]>(&rng)?.expose()[0] & 0x0f) != 0; + + let ref_in = input; + let mut ref_out = out; + if b { + xor_assign_at_start(&mut ref_out, &ref_in) + }; + + prefixed_extern! { + fn bssl_constant_time_test_conditional_memxor(dst: &mut [u8; 256], src: &[u8; 256], b: BoolMask); + } + unsafe { + bssl_constant_time_test_conditional_memxor( + &mut out, + &input, + if b { BoolMask::TRUE } else { BoolMask::FALSE }, + ); + } + + assert_eq!(ref_in, input); + assert_eq!(ref_out, out); + } + + Ok(()) + } + + #[test] + fn test_bool_mask_bitwise_and_is_logical_and() { + assert!(leak_in_test(BoolMask::TRUE & BoolMask::TRUE)); + assert!(!leak_in_test(BoolMask::TRUE & BoolMask::FALSE)); + assert!(!leak_in_test(BoolMask::FALSE & BoolMask::TRUE)); + assert!(!leak_in_test(BoolMask::FALSE & BoolMask::FALSE)); + } +} diff --git a/ring-0.17.14/src/bb/word.rs b/ring-0.17.14/src/bb/word.rs new file mode 100644 index 0000000000..a799df334d --- /dev/null +++ b/ring-0.17.14/src/bb/word.rs @@ -0,0 +1,44 @@ +// Copyright 2015-2024 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +use super::LeakyWord; + +/// A native word that may hold a secret. +/// +/// XXX: Currently this is a type alias of `LeakyWord` so it doesn't enforce, +/// except by convention, the prevention of leaks. This is a temporary state to +/// support the refactorings that will +/// +/// XXX: This isn't the native word size on targets where a pointer isn't the +/// same size as a native word. TODO: Fix this. +/// +/// XXX: Over time, we'll evolve Word into a newtype with an API that minimizes +/// leaks and makes all leaks explicit, like so: +pub(crate) type Word = LeakyWord; + +/* TODO: +#[repr(transparent)] +pub(crate) struct Word(LeakyWord); + +impl Word { + pub fn leak_word(self) -> LeakyWord { self.0 } +} + +impl From for Word { + fn from(w: LeakyWord) -> Self { + // TODO: Use a stronger `black_box`. + Self(core::hint::black_box(w)) + } +} +*/ diff --git a/ring-0.17.14/src/bits.rs b/ring-0.17.14/src/bits.rs new file mode 100644 index 0000000000..01b5dc53e4 --- /dev/null +++ b/ring-0.17.14/src/bits.rs @@ -0,0 +1,135 @@ +// Copyright 2016 Brian Smith. 
+// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +//! Bit lengths. + +use crate::{error::InputTooLongError, polyfill}; + +/// The length of something, in bits. +/// +/// This can represent a bit length that isn't a whole number of bytes. +#[derive(Clone, Copy, Debug, Eq, PartialEq, PartialOrd)] +#[repr(transparent)] +pub struct BitLength(T); + +pub(crate) trait FromByteLen: Sized { + /// Constructs a `BitLength` from the given length in bytes. + /// + /// Fails if `bytes * 8` is too large for a `T`. + fn from_byte_len(bytes: T) -> Result>; +} + +impl FromByteLen for BitLength { + #[inline] + fn from_byte_len(bytes: usize) -> Result { + match bytes.checked_mul(8) { + Some(bits) => Ok(Self(bits)), + None => Err(InputTooLongError::new(bytes)), + } + } +} + +impl FromByteLen for BitLength { + #[inline] + fn from_byte_len(bytes: u64) -> Result> { + match bytes.checked_mul(8) { + Some(bits) => Ok(Self(bits)), + None => Err(InputTooLongError::new(bytes)), + } + } +} + +impl FromByteLen for BitLength { + #[inline] + fn from_byte_len(bytes: usize) -> Result> { + match polyfill::u64_from_usize(bytes).checked_mul(8) { + Some(bits) => Ok(Self(bits)), + None => Err(InputTooLongError::new(bytes)), + } + } +} + +impl BitLength { + /// Constructs a `BitLength` from the given length in bits. + #[inline] + pub const fn from_bits(bits: T) -> Self { + Self(bits) + } +} + +impl BitLength { + /// The number of bits this bit length represents, as the underlying type. + #[inline] + pub fn as_bits(self) -> T { + self.0 + } +} + +// Lengths measured in bits, where all arithmetic is guaranteed not to +// overflow. +impl BitLength { + #[cfg(feature = "alloc")] + #[inline] + pub(crate) fn half_rounded_up(&self) -> Self { + let round_up = self.0 & 1; + Self((self.0 / 2) + round_up) + } + + /// The bit length, rounded up to a whole number of bytes. + #[inline] + pub const fn as_usize_bytes_rounded_up(&self) -> usize { + // Equivalent to (self.0 + 7) / 8, except with no potential for + // overflow and without branches. 
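+        // For example, 13 bits: 13 & 0b111 = 5 != 0, so we round up and get
+        // 13/8 + 1 = 2 bytes; 16 bits: 16 & 0b111 = 0, so the result is
+        // exactly 16/8 = 2 bytes.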
+ + // Branchless round_up = if self.0 & 0b111 != 0 { 1 } else { 0 }; + let round_up = ((self.0 >> 2) | (self.0 >> 1) | self.0) & 1; + + (self.0 / 8) + round_up + } + + #[cfg(feature = "alloc")] + #[inline] + pub(crate) fn try_sub_1(self) -> Result { + let sum = self.0.checked_sub(1).ok_or(crate::error::Unspecified)?; + Ok(Self(sum)) + } +} + +impl BitLength { + pub fn to_be_bytes(self) -> [u8; 8] { + self.0.to_be_bytes() + } +} + +#[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))] +impl From> for BitLength { + fn from(BitLength(value): BitLength) -> Self { + BitLength(polyfill::u64_from_usize(value)) + } +} + +impl TryFrom> for BitLength { + type Error = >::Error; + + fn try_from(BitLength(value): BitLength) -> Result { + value.try_into().map(BitLength) + } +} + +const _TEST_AS_USIZE_BYTES_ROUNDED_UP_EVEN: () = + assert!(BitLength::from_bits(8192).as_usize_bytes_rounded_up() == 8192 / 8); +const _TEST_AS_USIZE_BYTES_ROUNDED_UP_ONE_BIT_HIGH: () = + assert!(BitLength::from_bits(8192 + 1).as_usize_bytes_rounded_up() == (8192 / 8) + 1); +const _TEST_AS_USIZE_BYTES_ROUNDED_UP_SEVEN_BITS_HIGH: () = + assert!(BitLength::from_bits(8192 + 7).as_usize_bytes_rounded_up() == (8192 / 8) + 1); diff --git a/ring-0.17.14/src/bssl.rs b/ring-0.17.14/src/bssl.rs new file mode 100644 index 0000000000..1b1c753784 --- /dev/null +++ b/ring-0.17.14/src/bssl.rs @@ -0,0 +1,59 @@ +// Copyright 2015 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +use crate::error; +use core::ffi::c_int; + +/// An `int` returned from a foreign function containing **1** if the function +/// was successful or **0** if an error occurred. This is the convention used by +/// C code in `ring`. +#[must_use] +#[repr(transparent)] +pub struct Result(c_int); + +impl From for core::result::Result<(), error::Unspecified> { + fn from(ret: Result) -> Self { + match ret.0 { + 1 => Ok(()), + c => { + debug_assert_eq!(c, 0, "`bssl::Result` value must be 0 or 1"); + Err(error::Unspecified) + } + } + } +} + +#[cfg(test)] +mod tests { + mod result { + use crate::bssl; + use core::{ + ffi::c_int, + mem::{align_of, size_of}, + }; + + #[test] + fn size_and_alignment() { + type Underlying = c_int; + assert_eq!(size_of::(), size_of::()); + assert_eq!(align_of::(), align_of::()); + } + + #[test] + fn semantics() { + assert!(Result::from(bssl::Result(0)).is_err()); + assert!(Result::from(bssl::Result(1)).is_ok()); + } + } +} diff --git a/ring-0.17.14/src/c.rs b/ring-0.17.14/src/c.rs new file mode 100644 index 0000000000..f5822ebccf --- /dev/null +++ b/ring-0.17.14/src/c.rs @@ -0,0 +1,33 @@ +// Copyright 2016-2019 Brian Smith. 
+// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +//! C types. +//! +//! Avoid using the `libc` crate to get C types since `libc` doesn't support +//! all the targets we need to support. It turns out that the few types we need +//! are all uniformly defined on the platforms we care about. This will +//! probably change if/when we support 16-bit platforms or platforms where +//! `usize` and `uintptr_t` are different sizes. +//! +//! TODO(MSRV, feature(c_size_t)): Use `core::{ffi::c_size_t}`. +//! TODO(MSRV-1.79): Use `NonZero`. + +// Keep in sync with the checks in base.h that verify these assumptions. + +#![allow(dead_code)] + +use core::num::NonZeroUsize; + +pub(crate) type size_t = usize; +pub(crate) type NonZero_size_t = NonZeroUsize; diff --git a/ring-0.17.14/src/cpu.rs b/ring-0.17.14/src/cpu.rs new file mode 100644 index 0000000000..293e1b5355 --- /dev/null +++ b/ring-0.17.14/src/cpu.rs @@ -0,0 +1,231 @@ +// Copyright 2016 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +pub(crate) use self::features::Features; +use core::mem::size_of; + +macro_rules! 
impl_get_feature { + { + features: [ + $( { ( $( $arch:expr ),+ ) => $Name:ident }, )+ + ], + } => { + $( + #[cfg(any( $( target_arch = $arch ),+ ))] + #[derive(Clone, Copy)] + pub(crate) struct $Name(crate::cpu::Features); + + #[cfg(any( $( target_arch = $arch ),+ ))] + impl $Name { + const fn mask() -> u32 { + 1 << (Shift::$Name as u32) + } + } + + #[cfg(any( $( target_arch = $arch ),+ ))] + impl crate::cpu::GetFeature<$Name> for super::features::Values { + #[inline(always)] + fn get_feature(&self) -> Option<$Name> { + const MASK: u32 = $Name::mask(); + const STATICALLY_DETECTED: bool = (crate::cpu::CAPS_STATIC & MASK) == MASK; + if STATICALLY_DETECTED { // TODO: `const` + return Some($Name(self.cpu())); + } + + if (self.values() & MASK) == MASK { + Some($Name(self.cpu())) + } else { + None + } + } + } + )+ + + #[repr(u32)] + enum Shift { + $( + #[cfg(any( $( target_arch = $arch ),+ ))] + $Name, + )+ + + #[cfg(target_arch = "x86_64")] + IntelCpu, + + #[cfg(any(all(target_arch = "aarch64", target_endian = "little"), + all(target_arch = "arm", target_endian = "little"), + target_arch = "x86", target_arch = "x86_64"))] + // Synthesized to ensure the dynamic flag set is always non-zero. + // + // Keep this at the end as it is never checked except during init. + Initialized, + } + } +} + +pub(crate) trait GetFeature { + fn get_feature(&self) -> Option; +} + +impl GetFeature<()> for features::Values { + #[inline(always)] + fn get_feature(&self) -> Option<()> { + Some(()) + } +} + +impl GetFeature<(A, B)> for features::Values +where + features::Values: GetFeature, + features::Values: GetFeature, +{ + #[inline(always)] + fn get_feature(&self) -> Option<(A, B)> { + match (self.get_feature(), self.get_feature()) { + (Some(a), Some(b)) => Some((a, b)), + _ => None, + } + } +} + +impl GetFeature<(A, B, C)> for features::Values +where + features::Values: GetFeature, + features::Values: GetFeature, + features::Values: GetFeature, +{ + #[inline(always)] + fn get_feature(&self) -> Option<(A, B, C)> { + match (self.get_feature(), self.get_feature(), self.get_feature()) { + (Some(a), Some(b), Some(c)) => Some((a, b, c)), + _ => None, + } + } +} + +impl GetFeature for Features +where + features::Values: GetFeature, +{ + #[inline(always)] + fn get_feature(&self) -> Option { + self.values().get_feature() + } +} + +#[inline(always)] +pub(crate) fn features() -> Features { + featureflags::get_or_init() +} + +mod features { + use crate::polyfill::NotSend; + + /// A witness indicating that CPU features have been detected and cached. + /// + /// This is a zero-sized type so that it can be "stored" wherever convenient. + #[derive(Copy, Clone)] + pub(crate) struct Features(NotSend); + + impl Features { + pub fn values(self) -> Values { + Values { + values: super::featureflags::get(self), + cpu: self, + } + } + } + + cfg_if::cfg_if! { + if #[cfg(any(all(target_arch = "aarch64", target_endian = "little"), all(target_arch = "arm", target_endian = "little"), + target_arch = "x86", target_arch = "x86_64"))] { + impl Features { + // SAFETY: This must only be called after CPU features have been written + // and synchronized. 
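+                // For example, the ARM `featureflags::get_or_init()` calls
+                // this only after `FEATURES.get_or_init(..)` has completed,
+                // which provides the required synchronization.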
+ pub(super) unsafe fn new_after_feature_flags_written_and_synced_unchecked() -> Self { + Self(NotSend::VALUE) + } + } + } else { + impl Features { + pub(super) fn new_no_features_to_detect() -> Self { + Self(NotSend::VALUE) + } + } + } + } + + pub struct Values { + values: u32, + cpu: Features, + } + + impl Values { + #[inline(always)] + pub(super) fn values(&self) -> u32 { + self.values + } + + #[inline(always)] + pub(super) fn cpu(&self) -> Features { + self.cpu + } + } +} + +const _: () = assert!(size_of::() == 0); + +cfg_if::cfg_if! { + if #[cfg(any(all(target_arch = "aarch64", target_endian = "little"), all(target_arch = "arm", target_endian = "little")))] { + pub mod arm; + use arm::featureflags; + } else if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { + pub mod intel; + use intel::featureflags; + } else { + mod featureflags { + use super::Features; + + #[inline(always)] + pub(super) fn get_or_init() -> Features { + Features::new_no_features_to_detect() + } + + #[inline(always)] + pub(super) fn get(_cpu_features: Features) -> u32 { + STATIC_DETECTED + } + + pub(super) const STATIC_DETECTED: u32 = 0; + pub(super) const FORCE_DYNAMIC_DETECTION: u32 = 0; + } + } +} + +const CAPS_STATIC: u32 = featureflags::STATIC_DETECTED & !featureflags::FORCE_DYNAMIC_DETECTION; + +#[allow(clippy::assertions_on_constants, clippy::bad_bit_mask)] +const _FORCE_DYNAMIC_DETECTION_HONORED: () = + assert!((CAPS_STATIC & featureflags::FORCE_DYNAMIC_DETECTION) == 0); + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_static_is_subset_of_dynamic() { + let cpu = features(); + let dynamic = featureflags::get(cpu); + assert_eq!(dynamic & CAPS_STATIC, CAPS_STATIC); + } +} diff --git a/ring-0.17.14/src/cpu/arm.rs b/ring-0.17.14/src/cpu/arm.rs new file mode 100644 index 0000000000..8da973a1a3 --- /dev/null +++ b/ring-0.17.14/src/cpu/arm.rs @@ -0,0 +1,192 @@ +// Copyright 2016-2024 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +use super::CAPS_STATIC; + +mod abi_assumptions { + use core::mem::size_of; + + // TODO: Support ARM64_32; see + // https://github.com/briansmith/ring/issues/1832#issuecomment-1892928147. This also requires + // replacing all `cfg(target_pointer_width)` logic for non-pointer/reference things + // (`N0`, `Limb`, `LimbMask`, `crypto_word_t` etc.). + #[cfg(target_arch = "aarch64")] + const _ASSUMED_POINTER_SIZE: usize = 8; + #[cfg(target_arch = "arm")] + const _ASSUMED_POINTER_SIZE: usize = 4; + const _ASSUMED_USIZE_SIZE: () = assert!(size_of::() == _ASSUMED_POINTER_SIZE); + const _ASSUMED_REF_SIZE: () = assert!(size_of::<&'static u8>() == _ASSUMED_POINTER_SIZE); + + // To support big-endian, we'd need to make several changes as described in + // https://github.com/briansmith/ring/issues/1832. 
+ const _ASSUMED_ENDIANNESS: () = assert!(cfg!(target_endian = "little")); +} + +// uclibc: When linked statically, uclibc doesn't provide getauxval. +// When linked dynamically, recent versions do provide it, but we +// want to support older versions too. Assume that if uclibc is being +// used, this is an embedded target where the user cares a lot about +// minimizing code size and also that they know in advance exactly +// what target features are supported, so rely only on static feature +// detection. + +cfg_if::cfg_if! { + if #[cfg(all(all(target_arch = "aarch64", target_endian = "little"), + any(target_os = "ios", target_os = "macos", target_os = "tvos", target_os = "visionos", target_os = "watchos")))] { + mod darwin; + use darwin as detect; + } else if #[cfg(all(all(target_arch = "aarch64", target_endian = "little"), target_os = "fuchsia"))] { + mod fuchsia; + use fuchsia as detect; + } else if #[cfg(any(target_os = "android", target_os = "linux"))] { + mod linux; + use linux as detect; + } else if #[cfg(all(all(target_arch = "aarch64", target_endian = "little"), target_os = "windows"))] { + mod windows; + use windows as detect; + } else { + mod detect { + pub const FORCE_DYNAMIC_DETECTION: u32 = 0; + pub fn detect_features() -> u32 { 0 } + } + } +} + +impl_get_feature! { + features: [ + // TODO(MSRV): 32-bit ARM doesn't have `target_feature = "neon"` yet. + { ("aarch64", "arm") => Neon }, + + // TODO(MSRV): There is no "pmull" feature listed from + // `rustc --print cfg --target=aarch64-apple-darwin`. Originally ARMv8 tied + // PMULL detection into AES detection, but later versions split it; see + // https://developer.arm.com/downloads/-/exploration-tools/feature-names-for-a-profile + // "Features introduced prior to 2020." Change this to use "pmull" when + // that is supported. + { ("aarch64") => PMull }, + + { ("aarch64") => Aes }, + + { ("aarch64") => Sha256 }, + + // Keep in sync with `ARMV8_SHA512`. + + // "sha3" is overloaded for both SHA-3 and SHA-512. + { ("aarch64") => Sha512 }, + ], +} + +pub(super) mod featureflags { + pub(in super::super) use super::detect::FORCE_DYNAMIC_DETECTION; + use super::*; + use crate::{ + cpu, + polyfill::{once_cell::race, usize_from_u32}, + }; + use core::num::NonZeroUsize; + #[cfg(all(target_arch = "arm", target_endian = "little"))] + use core::sync::atomic::{AtomicU32, Ordering}; + + pub(in super::super) fn get_or_init() -> cpu::Features { + fn init() -> NonZeroUsize { + let detected = detect::detect_features(); + let filtered = (if cfg!(feature = "unstable-testing-arm-no-hw") { + !Neon::mask() + } else { + 0 + }) | (if cfg!(feature = "unstable-testing-arm-no-neon") { + Neon::mask() + } else { + 0 + }); + let detected = detected & !filtered; + let merged = CAPS_STATIC | detected; + + #[cfg(all( + target_arch = "arm", + target_endian = "little", + target_has_atomic = "32" + ))] + if (merged & Neon::mask()) == Neon::mask() { + // `neon_available` is declared as `alignas(4) uint32_t` in the C code. + // AtomicU32 is `#[repr(C, align(4))]`. + prefixed_extern! { + static neon_available: AtomicU32; + } + // SAFETY: The C code only reads `neon_available`, and its + // reads are synchronized through the `OnceNonZeroUsize` + // Acquire/Release semantics as we ensure we have a + // `cpu::Features` instance before calling into the C code. 
+ let p = unsafe { &neon_available }; + p.store(1, Ordering::Relaxed); + } + + let merged = usize_from_u32(merged) | (1 << (Shift::Initialized as u32)); + NonZeroUsize::new(merged).unwrap() // Can't fail because we just set a bit. + } + + // SAFETY: This is the only caller. Any concurrent reading doesn't + // affect the safety of the writing. + let _: NonZeroUsize = FEATURES.get_or_init(init); + + // SAFETY: We initialized the CPU features as required. + unsafe { cpu::Features::new_after_feature_flags_written_and_synced_unchecked() } + } + + pub(in super::super) fn get(_cpu_features: cpu::Features) -> u32 { + // SAFETY: Since only `get_or_init()` could have created + // `_cpu_features`, and it only does so after `FEATURES.get_or_init()`, + // we know we are reading from `FEATURES` after initializing it. + // + // Also, 0 means "no features detected" to users, which is designed to + // be a safe configuration. + let features = FEATURES.get().map(NonZeroUsize::get).unwrap_or(0); + + // The truncation is lossless, as we set the value with a u32. + #[allow(clippy::cast_possible_truncation)] + let features = features as u32; + + features + } + + static FEATURES: race::OnceNonZeroUsize = race::OnceNonZeroUsize::new(); + + // TODO(MSRV): There is no "pmull" feature listed from + // `rustc --print cfg --target=aarch64-apple-darwin`. Originally ARMv8 tied + // PMULL detection into AES detection, but later versions split it; see + // https://developer.arm.com/downloads/-/exploration-tools/feature-names-for-a-profile + // "Features introduced prior to 2020." Change this to use "pmull" when + // that is supported. + // + // "sha3" is overloaded for both SHA-3 and SHA-512. + #[cfg(all(target_arch = "aarch64", target_endian = "little"))] + #[rustfmt::skip] + pub(in super::super) const STATIC_DETECTED: u32 = 0 + | (if cfg!(target_feature = "neon") { Neon::mask() } else { 0 }) + | (if cfg!(target_feature = "aes") { Aes::mask() } else { 0 }) + | (if cfg!(target_feature = "aes") { PMull::mask() } else { 0 }) + | (if cfg!(target_feature = "sha2") { Sha256::mask() } else { 0 }) + | (if cfg!(target_feature = "sha3") { Sha512::mask() } else { 0 }) + ; + + // TODO(MSRV): 32-bit ARM doesn't support any static feature detection yet. + #[cfg(all(target_arch = "arm", target_endian = "little"))] + pub(in super::super) const STATIC_DETECTED: u32 = 0; +} + +#[allow(clippy::assertions_on_constants)] +const _AARCH64_HAS_NEON: () = assert!( + ((CAPS_STATIC & Neon::mask()) == Neon::mask()) + || !cfg!(all(target_arch = "aarch64", target_endian = "little")) +); diff --git a/ring-0.17.14/src/cpu/arm/darwin.rs b/ring-0.17.14/src/cpu/arm/darwin.rs new file mode 100644 index 0000000000..6d40bc7eca --- /dev/null +++ b/ring-0.17.14/src/cpu/arm/darwin.rs @@ -0,0 +1,113 @@ +// Copyright 2016-2024 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +use super::{Aes, Neon, PMull, Sha256, Sha512, CAPS_STATIC}; +use crate::polyfill::cstr; + +// ``` +// $ rustc +1.61.0 --print cfg --target=aarch64-apple-ios | grep -E "neon|aes|sha|pmull" +// target_feature="aes" +// target_feature="neon" +// target_feature="sha2" +// $ rustc +1.61.0 --print cfg --target=aarch64-apple-darwin | grep -E "neon|aes|sha|pmull" +// target_feature="aes" +// target_feature="neon" +// target_feature="sha2" +// target_feature="sha3" +// ``` +// +// XXX/TODO(coverage)/TODO(size): aarch64-apple-darwin is statically guaranteed to have "sha3" but +// other aarch64-apple-* targets require dynamic detection. Since we don't have test coverage for +// the other targets yet, we wouldn't have a way of testing the dynamic detection if we statically +// enabled `Sha512` for -darwin. So instead, temporarily, we statically ignore the static +// availability of the feature on -darwin so that it runs the dynamic detection. +pub const MIN_STATIC_FEATURES: u32 = Neon::mask() | Aes::mask() | Sha256::mask() | PMull::mask(); +pub const FORCE_DYNAMIC_DETECTION: u32 = !MIN_STATIC_FEATURES; + +// MSRV: Enforce 1.61.0 onaarch64-apple-*, in particular) prior to. Earlier +// versions of Rust before did not report the AAarch64 CPU features correctly +// for these targets. Cargo.toml specifies `rust-version` but versions before +// Rust 1.56 don't know about it. +#[allow(clippy::assertions_on_constants)] +const _AARCH64_APPLE_TARGETS_EXPECTED_FEATURES: () = + assert!((CAPS_STATIC & MIN_STATIC_FEATURES) == MIN_STATIC_FEATURES); + +// Ensure we don't accidentally allow features statically beyond +// `MIN_STATIC_FEATURES` so that dynamic detection is done uniformly for +// all of these targets. +#[allow(clippy::assertions_on_constants)] +const _AARCH64_APPLE_DARWIN_TARGETS_EXPECTED_FEATURES: () = + assert!(CAPS_STATIC == MIN_STATIC_FEATURES); + +pub fn detect_features() -> u32 { + fn detect_feature(name: cstr::Ref) -> bool { + use crate::polyfill; + use core::mem; + use libc::{c_int, c_void}; + + let mut value: c_int = 0; + let mut len = mem::size_of_val(&value); + let value_ptr = polyfill::ptr::from_mut(&mut value).cast::(); + // SAFETY: `value_ptr` is a valid pointer to `value` and `len` is the size of `value`. + let rc = unsafe { + libc::sysctlbyname(name.as_ptr(), value_ptr, &mut len, core::ptr::null_mut(), 0) + }; + // All the conditions are separated so we can observe them in code coverage. + if rc != 0 { + return false; + } + debug_assert_eq!(len, mem::size_of_val(&value)); + if len != mem::size_of_val(&value) { + return false; + } + value != 0 + } + + // We do not need to check for the presence of NEON, as Armv8-A always has it + const _ASSERT_NEON_DETECTED: () = assert!((CAPS_STATIC & Neon::mask()) == Neon::mask()); + + let mut features = 0; + + // TODO(MSRV 1.77): Use c"..." literal. + const SHA512_NAME: cstr::Ref = + cstr::unwrap_const_from_bytes_with_nul(b"hw.optional.armv8_2_sha512\0"); + if detect_feature(SHA512_NAME) { + features |= Sha512::mask(); + } + + features +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::cpu; + + #[test] + fn sha512_detection() { + // We intentionally disable static feature detection for SHA-512. 
+ const _SHA512_NOT_STATICALLY_DETECTED: () = assert!((CAPS_STATIC & Sha512::mask()) == 0); + + if cfg!(target_os = "macos") { + use crate::cpu::{arm::Sha512, GetFeature as _}; + + // All aarch64-apple-darwin targets have SHA3 enabled statically... + assert!(cfg!(target_feature = "sha3")); + + // ...so we should detect it. + let cpu = cpu::features(); + assert!(matches!(cpu.get_feature(), Some(Sha512 { .. }))); + } + } +} diff --git a/ring-0.17.14/src/cpu/arm/fuchsia.rs b/ring-0.17.14/src/cpu/arm/fuchsia.rs new file mode 100644 index 0000000000..bb1232b3a8 --- /dev/null +++ b/ring-0.17.14/src/cpu/arm/fuchsia.rs @@ -0,0 +1,58 @@ +// Copyright 2016-2024 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +use super::{Aes, Neon, PMull, Sha256, Sha512, CAPS_STATIC}; + +pub const FORCE_DYNAMIC_DETECTION: u32 = 0; + +pub fn detect_features() -> u32 { + type zx_status_t = i32; + + #[link(name = "zircon")] + extern "C" { + fn zx_system_get_features(kind: u32, features: *mut u32) -> zx_status_t; + } + + const ZX_OK: i32 = 0; + const ZX_FEATURE_KIND_CPU: u32 = 0; + const ZX_ARM64_FEATURE_ISA_AES: u32 = 1 << 3; + const ZX_ARM64_FEATURE_ISA_PMULL: u32 = 1 << 4; + const ZX_ARM64_FEATURE_ISA_SHA256: u32 = 1 << 6; + const ZX_ARM64_FEATURE_ISA_SHA512: u32 = 1 << 18; + + let mut caps = 0; + let rc = unsafe { zx_system_get_features(ZX_FEATURE_KIND_CPU, &mut caps) }; + + let mut features = 0; + + // We do not need to check for the presence of NEON, as Armv8-A always has it + const _ASSERT_NEON_DETECTED: () = assert!((CAPS_STATIC & Neon::mask()) == Neon::mask()); + + if rc == ZX_OK { + if caps & ZX_ARM64_FEATURE_ISA_AES == ZX_ARM64_FEATURE_ISA_AES { + features |= Aes::mask(); + } + if caps & ZX_ARM64_FEATURE_ISA_PMULL == ZX_ARM64_FEATURE_ISA_PMULL { + features |= PMull::mask(); + } + if caps & ZX_ARM64_FEATURE_ISA_SHA256 == ZX_ARM64_FEATURE_ISA_SHA256 { + features |= Sha256::mask(); + } + if caps & ZX_ARM64_FEATURE_ISA_SHA512 == ZX_ARM64_FEATURE_ISA_SHA512 { + features |= Sha512::mask(); + } + } + + features +} diff --git a/ring-0.17.14/src/cpu/arm/linux.rs b/ring-0.17.14/src/cpu/arm/linux.rs new file mode 100644 index 0000000000..db9549c7b9 --- /dev/null +++ b/ring-0.17.14/src/cpu/arm/linux.rs @@ -0,0 +1,107 @@ +// Copyright 2016-2024 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +use super::Neon; + +// Work around a bug in LLVM/rustc where `-C target_cpu=cortex-a72`-- +// and `-C target_cpu=native` on Cortex-A72 Raspberry PI devices in +// particular--enables crypto features even though not all Cortex-A72 +// CPUs have crypto features: +// +// ``` +// $ rustc --print cfg --target=aarch64-unknown-linux-gnu | grep feature +// target_feature="neon" +// $ rustc --print cfg --target=aarch64-unknown-linux-gnu -C target_cpu=cortex-a72 | grep feature +// target_feature="aes" +// target_feature="crc" +// target_feature="neon" +// target_feature="pmuv3" +// target_feature="sha2" +// ``` +// +// XXX/TODO(MSRV https://github.com/llvm/llvm-project/issues/90365): This +// workaround is heavy-handed since it forces extra branches for devices that +// have correctly-modeled feature sets, so it should be removed. +pub const FORCE_DYNAMIC_DETECTION: u32 = !Neon::mask(); + +// `uclibc` does not provide `getauxval` so just use static feature detection +// for it. +#[cfg(target_env = "uclibc")] +pub fn detect_features() -> u32 { + 0 +} + +#[cfg(all( + not(target_env = "uclibc"), + all(target_arch = "aarch64", target_endian = "little") +))] +pub fn detect_features() -> u32 { + use super::{Aes, PMull, Sha256, Sha512, CAPS_STATIC}; + use libc::{getauxval, AT_HWCAP, HWCAP_AES, HWCAP_PMULL, HWCAP_SHA2, HWCAP_SHA512}; + + let mut features = 0; + + // We do not need to check for the presence of NEON, as Armv8-A always has it + const _ASSERT_NEON_DETECTED: () = assert!((CAPS_STATIC & Neon::mask()) == Neon::mask()); + + let caps = unsafe { getauxval(AT_HWCAP) }; + + if caps & HWCAP_AES == HWCAP_AES { + features |= Aes::mask(); + } + if caps & HWCAP_PMULL == HWCAP_PMULL { + features |= PMull::mask(); + } + if caps & HWCAP_SHA2 == HWCAP_SHA2 { + features |= Sha256::mask(); + } + if caps & HWCAP_SHA512 == HWCAP_SHA512 { + features |= Sha512::mask(); + } + + features +} + +#[cfg(all( + not(target_env = "uclibc"), + all(target_arch = "arm", target_endian = "little") +))] +pub fn detect_features() -> u32 { + use super::CAPS_STATIC; + + // The `libc` crate doesn't provide this functionality on all + // 32-bit Linux targets, like Android or -musl. Use this polyfill + // for all 32-bit ARM targets so that testing on one of them will + // be more meaningful to the others. + use libc::c_ulong; + extern "C" { + pub fn getauxval(type_: c_ulong) -> c_ulong; + } + const AT_HWCAP: c_ulong = 16; + const HWCAP_NEON: c_ulong = 1 << 12; + + let mut features = 0; + + if CAPS_STATIC & Neon::mask() != Neon::mask() { + let caps = unsafe { getauxval(AT_HWCAP) }; + + // OpenSSL and BoringSSL don't enable any other features if NEON isn't + // available. We don't enable any hardware implementations for 32-bit ARM. + if caps & HWCAP_NEON == HWCAP_NEON { + features |= Neon::mask(); + } + } + + features +} diff --git a/ring-0.17.14/src/cpu/arm/windows.rs b/ring-0.17.14/src/cpu/arm/windows.rs new file mode 100644 index 0000000000..a753706935 --- /dev/null +++ b/ring-0.17.14/src/cpu/arm/windows.rs @@ -0,0 +1,38 @@ +// Copyright 2016-2024 Brian Smith. 
+// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +use super::{Aes, Neon, PMull, Sha256, CAPS_STATIC}; +use windows_sys::Win32::System::Threading::{ + IsProcessorFeaturePresent, PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE, +}; + +pub const FORCE_DYNAMIC_DETECTION: u32 = 0; + +pub fn detect_features() -> u32 { + // We do not need to check for the presence of NEON, as Armv8-A always has it + const _ASSERT_NEON_DETECTED: () = assert!((CAPS_STATIC & Neon::mask()) == Neon::mask()); + + let mut features = 0; + + let result = unsafe { IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE) }; + + if result != 0 { + // These are all covered by one call in Windows + features |= Aes::mask(); + features |= PMull::mask(); + features |= Sha256::mask(); + } + + features +} diff --git a/ring-0.17.14/src/cpu/intel.rs b/ring-0.17.14/src/cpu/intel.rs new file mode 100644 index 0000000000..f45052fe7f --- /dev/null +++ b/ring-0.17.14/src/cpu/intel.rs @@ -0,0 +1,382 @@ +// Copyright 2016-2021 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +use cfg_if::cfg_if; + +mod abi_assumptions { + use core::mem::size_of; + + // TOOD: Support targets that do not have SSE and SSE2 enabled, such as + // x86_64-unknown-linux-none. See + // https://github.com/briansmith/ring/issues/1793#issuecomment-1793243725, + // https://github.com/briansmith/ring/issues/1832, + // https://github.com/briansmith/ring/issues/1833. 
+ const _ASSUMES_SSE2: () = + assert!(cfg!(target_feature = "sse") && cfg!(target_feature = "sse2")); + + #[cfg(target_arch = "x86_64")] + const _ASSUMED_POINTER_SIZE: usize = 8; + #[cfg(target_arch = "x86")] + const _ASSUMED_POINTER_SIZE: usize = 4; + const _ASSUMED_USIZE_SIZE: () = assert!(size_of::() == _ASSUMED_POINTER_SIZE); + const _ASSUMED_REF_SIZE: () = assert!(size_of::<&'static u8>() == _ASSUMED_POINTER_SIZE); + + const _ASSUMED_ENDIANNESS: () = assert!(cfg!(target_endian = "little")); +} + +pub(super) mod featureflags { + use super::super::CAPS_STATIC; + use crate::{ + cpu, + polyfill::{once_cell::race, usize_from_u32}, + }; + use core::num::NonZeroUsize; + + pub(in super::super) fn get_or_init() -> cpu::Features { + // SAFETY: `OPENSSL_cpuid_setup` must be called only in + // `INIT.call_once()` below. + prefixed_extern! { + fn OPENSSL_cpuid_setup(out: &mut [u32; 4]); + } + + let _: NonZeroUsize = FEATURES.get_or_init(|| { + let mut cpuid = [0; 4]; + // SAFETY: We assume that it is safe to execute CPUID and XGETBV. + unsafe { + OPENSSL_cpuid_setup(&mut cpuid); + } + let detected = super::cpuid_to_caps_and_set_c_flags(&cpuid); + let merged = CAPS_STATIC | detected; + + let merged = usize_from_u32(merged) | (1 << (super::Shift::Initialized as u32)); + NonZeroUsize::new(merged).unwrap() // Can't fail because we just set a bit. + }); + + // SAFETY: We initialized the CPU features as required. + // `INIT.call_once` has `happens-before` semantics. + unsafe { cpu::Features::new_after_feature_flags_written_and_synced_unchecked() } + } + + pub(in super::super) fn get(_cpu_features: cpu::Features) -> u32 { + // SAFETY: Since only `get_or_init()` could have created + // `_cpu_features`, and it only does so after `FEATURES.get_or_init()`, + // we know we are reading from `FEATURES` after initializing it. + // + // Also, 0 means "no features detected" to users, which is designed to + // be a safe configuration. + let features = FEATURES.get().map(NonZeroUsize::get).unwrap_or(0); + + // The truncation is lossless, as we set the value with a u32. + #[allow(clippy::cast_possible_truncation)] + let features = features as u32; + + features + } + + static FEATURES: race::OnceNonZeroUsize = race::OnceNonZeroUsize::new(); + + #[cfg(target_arch = "x86")] + #[rustfmt::skip] + pub const STATIC_DETECTED: u32 = 0 + | (if cfg!(target_feature = "sse2") { super::Sse2::mask() } else { 0 }) + ; + + // Limited to x86_64-v2 features. + // TODO: Add missing x86-64-v3 features if we find real-world use of x86-64-v3. + // TODO: Add all features we use. + #[cfg(target_arch = "x86_64")] + #[rustfmt::skip] + pub const STATIC_DETECTED: u32 = 0 + | if cfg!(target_feature = "sse4.1") { super::Sse41::mask() } else { 0 } + | if cfg!(target_feature = "ssse3") { super::Ssse3::mask() } else { 0 } + ; + + pub const FORCE_DYNAMIC_DETECTION: u32 = 0; +} + +fn cpuid_to_caps_and_set_c_flags(cpuid: &[u32; 4]) -> u32 { + // "Intel" citations are for "Intel 64 and IA-32 Architectures Software + // Developer’s Manual", Combined Volumes, December 2024. + // "AMD" citations are for "AMD64 Technology AMD64 Architecture + // Programmer’s Manual, Volumes 1-5" Revision 4.08 April 2024. 
+ + // The `prefixed_extern!` uses below assume this + #[cfg(target_arch = "x86_64")] + use core::{mem::align_of, sync::atomic::AtomicU32}; + #[cfg(target_arch = "x86_64")] + const _ATOMIC32_ALIGNMENT_EQUALS_U32_ALIGNMENT: () = + assert!(align_of::() == align_of::()); + + fn check(leaf: u32, bit: u32) -> bool { + let shifted = 1 << bit; + (leaf & shifted) == shifted + } + fn set(out: &mut u32, shift: Shift) { + let shifted = 1 << (shift as u32); + debug_assert_eq!(*out & shifted, 0); + *out |= shifted; + debug_assert_eq!(*out & shifted, shifted); + } + + #[cfg(target_arch = "x86_64")] + let is_intel = check(cpuid[0], 30); // Synthesized by `OPENSSL_cpuid_setup` + + // CPUID leaf 1. + let leaf1_ecx = cpuid[1]; + + // Intel: "Structured Extended Feature Flags Enumeration Leaf" + #[cfg(target_arch = "x86_64")] + let (extended_features_ebx, extended_features_ecx) = (cpuid[2], cpuid[3]); + + let mut caps = 0; + + // AMD: "Collectively the SSE1, [...] are referred to as the legacy SSE + // instructions. All legacy SSE instructions support 128-bit vector + // operands." + + // Intel: "11.6.2 Checking for Intel SSE and SSE2 Support" + // We have to assume the prerequisites for SSE/SSE2 are met since we're + // already almost definitely using SSE registers if these target features + // are enabled. + // + // These also seem to help ensure CMOV support; There doesn't seem to be + // a `cfg!(target_feature = "cmov")`. It is likely that removing these + // assertions will remove the requirement for CMOV. With our without + // CMOV, it is likely that some of our timing side channel prevention does + // not work. Presumably the people who delete these are verifying that it + // all works fine. + const _SSE_REQUIRED: () = assert!(cfg!(target_feature = "sse")); + const _SSE2_REQUIRED: () = assert!(cfg!(target_feature = "sse2")); + + #[cfg(all(target_arch = "x86", not(target_feature = "sse2")))] + { + // If somebody is trying to compile for an x86 target without SSE2 + // and they deleted the `_SSE2_REQUIRED` const assertion above then + // they're probably trying to support a Linux/BSD/etc. distro that + // tries to support ancient x86 systems without SSE/SSE2. Try to + // reduce the harm caused, by implementing dynamic feature detection + // for them so that most systems will work like normal. + // + // Note that usually an x86-64 target with SSE2 disabled by default, + // usually `-none-` targets, will not support dynamically-detected use + // of SIMD registers via CPUID. A whole different mechanism is needed + // to support them. Same for i*86-*-none targets. + let leaf1_edx = cpuid[0]; + let sse1_available = check(leaf1_edx, 25); + let sse2_available = check(leaf1_edx, 26); + if sse1_available && sse2_available { + set(&mut caps, Shift::Sse2); + } + } + + // Sometimes people delete the `_SSE_REQUIRED`/`_SSE2_REQUIRED` const + // assertions in an attempt to support pre-SSE2 32-bit x86 systems. If they + // do, hopefully they won't delete these redundant assertions, so that + // x86_64 isn't affected. + #[cfg(target_arch = "x86_64")] + const _SSE2_REQUIRED_X86_64: () = assert!(cfg!(target_feature = "sse2")); + #[cfg(target_arch = "x86_64")] + const _SSE_REQUIRED_X86_64: () = assert!(cfg!(target_feature = "sse2")); + + // Intel: "12.7.2 Checking for SSSE3 Support" + // If/when we support dynamic detection of SSE/SSE2, make this conditional + // on SSE/SSE2. 
+ if check(leaf1_ecx, 9) { + set(&mut caps, Shift::Ssse3); + } + + // Intel: "12.12.2 Checking for Intel SSE4.1 Support" + // If/when we support dynamic detection of SSE/SSE2, make this conditional + // on SSE/SSE2. + // XXX: We don't check for SSE3 and we're not sure if it is compatible for + // us to do so; does AMD advertise SSE3? TODO: address this. + // XXX: We don't condition this on SSSE3 being available. TODO: address + // this. + #[cfg(target_arch = "x86_64")] + if check(leaf1_ecx, 19) { + set(&mut caps, Shift::Sse41); + } + + // AMD: "The extended SSE instructions include [...]." + + // Intel: "14.3 DETECTION OF INTEL AVX INSTRUCTIONS" + // `OPENSSL_cpuid_setup` clears this bit when it detects the OS doesn't + // support AVX state. + let avx_available = check(leaf1_ecx, 28); + if avx_available { + set(&mut caps, Shift::Avx); + } + + #[cfg(target_arch = "x86_64")] + if avx_available { + // The Intel docs don't seem to document the detection. The instruction + // definitions of the VEX.256 instructions reference the + // VAES/VPCLMULQDQ features and the documentation for the extended + // features gives the values. We combine these into one feature because + // we never use them independently. + let vaes_available = check(extended_features_ecx, 9); + let vclmul_available = check(extended_features_ecx, 10); + if vaes_available && vclmul_available { + set(&mut caps, Shift::VAesClmul); + } + } + + // "14.7.1 Detection of Intel AVX2 Hardware support" + // XXX: We don't condition AVX2 on AVX. TODO: Address this. + // `OPENSSL_cpuid_setup` clears this bit when it detects the OS doesn't + // support AVX state. + #[cfg(target_arch = "x86_64")] + if check(extended_features_ebx, 5) { + set(&mut caps, Shift::Avx2); + + // Declared as `uint32_t` in the C code. + prefixed_extern! { + static avx2_available: AtomicU32; + } + // SAFETY: The C code only reads `avx2_available`, and its reads are + // synchronized through the `OnceNonZeroUsize` Acquire/Release + // semantics as we ensure we have a `cpu::Features` instance before + // calling into the C code. + let flag = unsafe { &avx2_available }; + flag.store(1, core::sync::atomic::Ordering::Relaxed); + } + + // Intel: "12.13.4 Checking for Intel AES-NI Support" + // If/when we support dynamic detection of SSE/SSE2, revisit this. + // TODO: Clarify "interesting" states like (!SSE && AVX && AES-NI) + // and AES-NI & !AVX. + // Each check of `ClMul`, `Aes`, and `Sha` must be paired with a check for + // an AVX feature (e.g. `Avx`) or an SSE feature (e.g. `Ssse3`), as every + // use will either be supported by SSE* or AVX* instructions. We then + // assume that those supporting instructions' prerequisites (e.g. OS + // support for AVX or SSE state, respectively) are the only prerequisites + // for these features. + if check(leaf1_ecx, 1) { + set(&mut caps, Shift::ClMul); + } + if check(leaf1_ecx, 25) { + set(&mut caps, Shift::Aes); + } + // See BoringSSL 69c26de93c82ad98daecaec6e0c8644cdf74b03f before enabling + // static feature detection for this. + #[cfg(target_arch = "x86_64")] + if check(extended_features_ebx, 29) { + set(&mut caps, Shift::Sha); + } + + #[cfg(target_arch = "x86_64")] + { + if is_intel { + set(&mut caps, Shift::IntelCpu); + } + + if check(leaf1_ecx, 22) { + set(&mut caps, Shift::Movbe); + } + + let adx_available = check(extended_features_ebx, 19); + if adx_available { + set(&mut caps, Shift::Adx); + } + + // Some 6th Generation (Skylake) CPUs claim to support BMI1 and BMI2 + // when they don't; see erratum "SKD052". 
The Intel document at + // https://www.intel.com/content/dam/www/public/us/en/documents/specification-updates/6th-gen-core-u-y-spec-update.pdf + // contains the footnote "Affects 6th Generation Intel Pentium processor + // family and Intel Celeron processor family". Further research indicates + // that Skylake Pentium/Celeron do not implement AVX or ADX. It turns + // out that we only use BMI1 and BMI2 in combination with ADX and/or + // AVX. + // + // rust `std::arch::is_x86_feature_detected` does a very similar thing + // but only looks at AVX, not ADX. Note that they reference an older + // version of the erratum labeled SKL052. + let believe_bmi_bits = !is_intel || (adx_available || avx_available); + + if check(extended_features_ebx, 3) && believe_bmi_bits { + set(&mut caps, Shift::Bmi1); + } + + let bmi2_available = check(extended_features_ebx, 8) && believe_bmi_bits; + if bmi2_available { + set(&mut caps, Shift::Bmi2); + } + + if adx_available && bmi2_available { + // Declared as `uint32_t` in the C code. + prefixed_extern! { + static adx_bmi2_available: AtomicU32; + } + // SAFETY: The C code only reads `adx_bmi2_available`, and its + // reads are synchronized through the `OnceNonZeroUsize` + // Acquire/Release semantics as we ensure we have a + // `cpu::Features` instance before calling into the C code. + let flag = unsafe { &adx_bmi2_available }; + flag.store(1, core::sync::atomic::Ordering::Relaxed); + } + } + + caps +} + +impl_get_feature! { + features: [ + { ("x86_64") => VAesClmul }, + { ("x86", "x86_64") => ClMul }, + { ("x86", "x86_64") => Ssse3 }, + { ("x86_64") => Sse41 }, + { ("x86_64") => Movbe }, + { ("x86", "x86_64") => Aes }, + { ("x86", "x86_64") => Avx }, + { ("x86_64") => Bmi1 }, + { ("x86_64") => Avx2 }, + { ("x86_64") => Bmi2 }, + { ("x86_64") => Adx }, + // See BoringSSL 69c26de93c82ad98daecaec6e0c8644cdf74b03f before enabling + // static feature detection for this. + { ("x86_64") => Sha }, + // x86_64 can just assume SSE2 is available. + { ("x86") => Sse2 }, + ], +} + +cfg_if! { + if #[cfg(target_arch = "x86_64")] { + #[derive(Clone, Copy)] + pub(crate) struct IntelCpu(super::Features); + + impl super::GetFeature for super::features::Values { + fn get_feature(&self) -> Option { + const MASK: u32 = 1 << (Shift::IntelCpu as u32); + if (self.values() & MASK) == MASK { + Some(IntelCpu(self.cpu())) + } else { + None + } + } + } + } +} + +#[cfg(test)] +mod tests { + // This should always pass on any x86 system except very, very, old ones. + #[cfg(target_arch = "x86")] + #[test] + fn x86_has_sse2() { + use super::*; + use crate::cpu::{self, GetFeature as _}; + assert!(matches!(cpu::features().get_feature(), Some(Sse2 { .. }))) + } +} diff --git a/ring-0.17.14/src/data/alg-rsa-encryption.der b/ring-0.17.14/src/data/alg-rsa-encryption.der new file mode 100644 index 0000000000..77d159a1c6 Binary files /dev/null and b/ring-0.17.14/src/data/alg-rsa-encryption.der differ diff --git a/ring-0.17.14/src/debug.rs b/ring-0.17.14/src/debug.rs new file mode 100644 index 0000000000..5de58be461 --- /dev/null +++ b/ring-0.17.14/src/debug.rs @@ -0,0 +1,84 @@ +// Copyright 2018 Trent Clarke. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. 
+// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +// Generates an implementation of the Debug trait for a type that defers to the +// Debug implementation for a given field. +macro_rules! derive_debug_via_id { + ($typename:ident) => { + impl ::core::fmt::Debug for $typename { + fn fmt(&self, f: &mut ::core::fmt::Formatter) -> Result<(), ::core::fmt::Error> { + ::core::fmt::Debug::fmt(&self.id, f) + } + } + }; +} + +macro_rules! derive_debug_via_field { + ($type:ty, $field:ident) => { + derive_debug_via_field!($type, stringify!($type), $field); + }; + + ($type:ty, $typename:expr, $field:ident) => { + impl ::core::fmt::Debug for $type { + fn fmt(&self, f: &mut ::core::fmt::Formatter) -> Result<(), ::core::fmt::Error> { + f.debug_struct($typename) + .field(stringify!($field), &self.$field) + .finish() + } + } + }; +} + +// Generates an implementation of the Debug trait for a type that outputs the +// hex encoding of the byte slice representation of the value. +macro_rules! derive_debug_self_as_ref_hex_bytes { + ($typename:ident) => { + impl ::core::fmt::Debug for $typename { + fn fmt(&self, f: &mut ::core::fmt::Formatter) -> Result<(), ::core::fmt::Error> { + crate::debug::write_hex_tuple(f, stringify!($typename), self) + } + } + }; +} + +pub(crate) fn write_hex_tuple( + fmt: &mut core::fmt::Formatter, + type_name: &str, + value: &dyn AsRef<[u8]>, +) -> Result<(), ::core::fmt::Error> { + fmt.debug_tuple(type_name) + .field(&HexStr(value.as_ref())) + .finish() +} + +pub struct HexStr<'a>(pub &'a [u8]); + +impl core::fmt::Debug for HexStr<'_> { + fn fmt(&self, fmt: &mut core::fmt::Formatter) -> Result<(), core::fmt::Error> { + fmt.write_str("\"")?; + write_hex_bytes(fmt, self.0)?; + fmt.write_str("\"")?; + Ok(()) + } +} + +pub(crate) fn write_hex_bytes( + fmt: &mut core::fmt::Formatter, + bytes: &[u8], +) -> Result<(), ::core::fmt::Error> { + for byte in bytes { + write!(fmt, "{:02x}", byte)?; + } + Ok(()) +} diff --git a/ring-0.17.14/src/deprecated_constant_time.rs b/ring-0.17.14/src/deprecated_constant_time.rs new file mode 100644 index 0000000000..5703c6f08e --- /dev/null +++ b/ring-0.17.14/src/deprecated_constant_time.rs @@ -0,0 +1,22 @@ +// Copyright 2015-2025 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +use crate::{bb, error}; + +#[deprecated( + note = "To be removed. 
Internal function not intended for external use with no promises regarding side channels." +)] +pub fn verify_slices_are_equal(a: &[u8], b: &[u8]) -> Result<(), error::Unspecified> { + bb::verify_slices_are_equal(a, b) +} diff --git a/ring-0.17.14/src/deprecated_test.rs b/ring-0.17.14/src/deprecated_test.rs new file mode 100644 index 0000000000..3758aa2b0b --- /dev/null +++ b/ring-0.17.14/src/deprecated_test.rs @@ -0,0 +1,45 @@ +// Copyright 2025 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +#![doc(hidden)] + +/// References a test input file. +#[macro_export] +macro_rules! test_file { + ($file_name:expr) => { + $crate::test::File { + file_name: $file_name, + contents: include_str!($file_name), + } + }; +} + +pub use crate::testutil::{ + compile_time_assert_clone, compile_time_assert_copy, compile_time_assert_eq, + compile_time_assert_send, compile_time_assert_sync, from_hex, run, File, TestCase, +}; + +#[cfg(feature = "std")] +pub use crate::testutil::compile_time_assert_std_error_error; + +#[deprecated(note = "internal API that will be removed")] +#[doc(hidden)] +pub mod rand { + #[deprecated(note = "internal API that will be removed")] + pub type FixedByteRandom = crate::testutil::rand::FixedByteRandom; + #[deprecated(note = "internal API that will be removed")] + pub type FixedSliceRandom<'a> = crate::testutil::rand::FixedSliceRandom<'a>; + #[deprecated(note = "internal API that will be removed")] + pub type FixedSliceSequenceRandom<'a> = crate::testutil::rand::FixedSliceSequenceRandom<'a>; +} diff --git a/ring-0.17.14/src/digest.rs b/ring-0.17.14/src/digest.rs new file mode 100644 index 0000000000..e771237786 --- /dev/null +++ b/ring-0.17.14/src/digest.rs @@ -0,0 +1,680 @@ +// Copyright 2015-2019 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +//! SHA-2 and the legacy SHA-1 digest algorithm. +//! +//! If all the data is available in a single contiguous slice then the `digest` +//! function should be used. Otherwise, the digest can be calculated in +//! multiple steps using `Context`. 
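The module documentation above describes two usage patterns: the one-shot `digest` function and the incremental `Context`. As a minimal sketch of both, assuming this vendored source is consumed through the published `ring::digest` API:

```
// Sketch only; assumes the vendored crate is used as the published `ring`
// crate, so `ring::digest` exposes `digest`, `Context`, and `SHA256`.
use ring::digest::{self, Context, SHA256};

fn main() {
    // One-shot: all of the data is available in a single contiguous slice.
    let one_shot = digest::digest(&SHA256, b"hello, world");

    // Incremental: the digest is calculated in multiple steps.
    let mut ctx = Context::new(&SHA256);
    ctx.update(b"hello");
    ctx.update(b", world");
    let multi_part = ctx.finish();

    assert_eq!(one_shot.as_ref(), multi_part.as_ref());
}
```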
+ +use self::{ + dynstate::DynState, + sha2::{SHA256_BLOCK_LEN, SHA512_BLOCK_LEN}, +}; +use crate::{ + bits::{BitLength, FromByteLen as _}, + cpu, debug, error, + polyfill::{self, slice, sliceutil}, +}; +use core::num::Wrapping; + +pub(crate) use self::finish_error::FinishError; + +mod dynstate; +mod sha1; +mod sha2; + +#[derive(Clone)] +pub(crate) struct BlockContext { + state: DynState, + + // Note that SHA-512 has a 128-bit input bit counter, but this + // implementation only supports up to 2^64-1 input bits for all algorithms, + // so a 64-bit counter is more than sufficient. + completed_bytes: u64, + + /// The context's algorithm. + pub algorithm: &'static Algorithm, +} + +impl BlockContext { + pub(crate) fn new(algorithm: &'static Algorithm) -> Self { + Self { + state: algorithm.initial_state.clone(), + completed_bytes: 0, + algorithm, + } + } + + /// Processes all the full blocks in `input`, returning the partial block + /// at the end, which may be empty. + pub(crate) fn update<'i>(&mut self, input: &'i [u8], cpu_features: cpu::Features) -> &'i [u8] { + let (completed_bytes, leftover) = self.block_data_order(input, cpu_features); + // Using saturated addition here allows `update` to be infallible and + // panic-free. If we were to reach the maximum value here then `finish` + // will detect that we processed too much data when it converts this to + // a bit length. + self.completed_bytes = self + .completed_bytes + .saturating_add(polyfill::u64_from_usize(completed_bytes)); + leftover + } + + // On input, `block[..num_pending]` is the (possibly-empty) last *partial* + // chunk of input. It *must* be partial; that is, it is required that + // `num_pending < self.algorithm.block_len`. + // + // `block` may be arbitrarily overwritten. + pub(crate) fn try_finish( + mut self, + block: &mut [u8; MAX_BLOCK_LEN], + num_pending: usize, + cpu_features: cpu::Features, + ) -> Result { + let completed_bits = self + .completed_bytes + .checked_add(polyfill::u64_from_usize(num_pending)) + .ok_or_else(|| { + // Choosing self.completed_bytes here is lossy & somewhat arbitrary. + InputTooLongError::new(self.completed_bytes) + }) + .and_then(BitLength::from_byte_len) + .map_err(FinishError::input_too_long)?; + + let block_len = self.algorithm.block_len(); + let block = &mut block[..block_len]; + + let padding = match block.get_mut(num_pending..) { + Some([separator, padding @ ..]) => { + *separator = 0x80; + padding + } + // Precondition violated. + unreachable => { + return Err(FinishError::pending_not_a_partial_block( + unreachable.as_deref(), + )); + } + }; + + let padding = match padding + .len() + .checked_sub(self.algorithm.block_len.len_len()) + { + Some(_) => padding, + None => { + padding.fill(0); + let (completed_bytes, leftover) = self.block_data_order(block, cpu_features); + debug_assert_eq!((completed_bytes, leftover.len()), (block_len, 0)); + // We don't increase |self.completed_bytes| because the padding + // isn't data, and so it isn't included in the data length. + &mut block[..] 
+ } + }; + + let (to_zero, len) = padding.split_at_mut(padding.len() - 8); + to_zero.fill(0); + len.copy_from_slice(&completed_bits.to_be_bytes()); + + let (completed_bytes, leftover) = self.block_data_order(block, cpu_features); + debug_assert_eq!((completed_bytes, leftover.len()), (block_len, 0)); + + Ok(Digest { + algorithm: self.algorithm, + value: self.state.format_output(), + }) + } + + #[must_use] + fn block_data_order<'d>( + &mut self, + data: &'d [u8], + cpu_features: cpu::Features, + ) -> (usize, &'d [u8]) { + (self.algorithm.block_data_order)(&mut self.state, data, cpu_features) + } +} + +pub(crate) type InputTooLongError = error::InputTooLongError; + +cold_exhaustive_error! { + enum finish_error::FinishError { + input_too_long => InputTooLong(InputTooLongError), + pending_not_a_partial_block_inner => PendingNotAPartialBlock(usize), + } +} + +impl FinishError { + #[cold] + #[inline(never)] + fn pending_not_a_partial_block(padding: Option<&[u8]>) -> Self { + match padding { + None => Self::pending_not_a_partial_block_inner(0), + Some(padding) => Self::pending_not_a_partial_block_inner(padding.len()), + } + } +} + +/// A context for multi-step (Init-Update-Finish) digest calculations. +/// +/// # Examples +/// +/// ``` +/// use ring::digest; +/// +/// let one_shot = digest::digest(&digest::SHA384, b"hello, world"); +/// +/// let mut ctx = digest::Context::new(&digest::SHA384); +/// ctx.update(b"hello"); +/// ctx.update(b", "); +/// ctx.update(b"world"); +/// let multi_part = ctx.finish(); +/// +/// assert_eq!(&one_shot.as_ref(), &multi_part.as_ref()); +/// ``` +#[derive(Clone)] +pub struct Context { + block: BlockContext, + // TODO: More explicitly force 64-bit alignment for |pending|. + pending: [u8; MAX_BLOCK_LEN], + + // Invariant: `self.num_pending < self.block.algorithm.block_len`. + num_pending: usize, +} + +impl Context { + /// Constructs a new context. + pub fn new(algorithm: &'static Algorithm) -> Self { + Self { + block: BlockContext::new(algorithm), + pending: [0u8; MAX_BLOCK_LEN], + num_pending: 0, + } + } + + pub(crate) fn clone_from(block: &BlockContext) -> Self { + Self { + block: block.clone(), + pending: [0u8; MAX_BLOCK_LEN], + num_pending: 0, + } + } + + /// Updates the digest with all the data in `data`. + pub fn update(&mut self, data: &[u8]) { + let cpu_features = cpu::features(); + + let block_len = self.block.algorithm.block_len(); + let buffer = &mut self.pending[..block_len]; + + let to_digest = if self.num_pending == 0 { + data + } else { + let buffer_to_fill = match buffer.get_mut(self.num_pending..) { + Some(buffer_to_fill) => buffer_to_fill, + None => { + // Impossible because of the invariant. + unreachable!(); + } + }; + sliceutil::overwrite_at_start(buffer_to_fill, data); + match slice::split_at_checked(data, buffer_to_fill.len()) { + Some((just_copied, to_digest)) => { + debug_assert_eq!(buffer_to_fill.len(), just_copied.len()); + debug_assert_eq!(self.num_pending + just_copied.len(), block_len); + let leftover = self.block.update(buffer, cpu_features); + debug_assert_eq!(leftover.len(), 0); + self.num_pending = 0; + to_digest + } + None => { + self.num_pending += data.len(); + // If `data` isn't enough to complete a block, buffer it and stop. 
+ debug_assert!(self.num_pending < block_len); + return; + } + } + }; + + let leftover = self.block.update(to_digest, cpu_features); + sliceutil::overwrite_at_start(buffer, leftover); + self.num_pending = leftover.len(); + debug_assert!(self.num_pending < block_len); + } + + /// Finalizes the digest calculation and returns the digest value. + /// + /// `finish` consumes the context so it cannot be (mis-)used after `finish` + /// has been called. + pub fn finish(self) -> Digest { + let cpu = cpu::features(); + self.try_finish(cpu) + .map_err(error::erase::) + .unwrap() + } + + pub(crate) fn try_finish( + mut self, + cpu_features: cpu::Features, + ) -> Result { + self.block + .try_finish(&mut self.pending, self.num_pending, cpu_features) + .map_err(|err| match err { + FinishError::InputTooLong(i) => i, + FinishError::PendingNotAPartialBlock(_) => { + // Due to invariant. + unreachable!() + } + }) + } + + /// The algorithm that this context is using. + #[inline(always)] + pub fn algorithm(&self) -> &'static Algorithm { + self.block.algorithm + } +} + +/// Returns the digest of `data` using the given digest algorithm. +pub fn digest(algorithm: &'static Algorithm, data: &[u8]) -> Digest { + let cpu = cpu::features(); + Digest::compute_from(algorithm, data, cpu) + .map_err(error::erase::) + .unwrap() +} + +/// A calculated digest value. +/// +/// Use [`Self::as_ref`] to get the value as a `&[u8]`. +#[derive(Clone, Copy)] +pub struct Digest { + value: Output, + algorithm: &'static Algorithm, +} + +impl Digest { + pub(crate) fn compute_from( + algorithm: &'static Algorithm, + data: &[u8], + cpu: cpu::Features, + ) -> Result { + let mut ctx = Context::new(algorithm); + ctx.update(data); + ctx.try_finish(cpu) + } + + /// The algorithm that was used to calculate the digest value. + #[inline(always)] + pub fn algorithm(&self) -> &'static Algorithm { + self.algorithm + } +} + +impl AsRef<[u8]> for Digest { + #[inline(always)] + fn as_ref(&self) -> &[u8] { + &self.value.0[..self.algorithm.output_len()] + } +} + +impl core::fmt::Debug for Digest { + fn fmt(&self, fmt: &mut core::fmt::Formatter) -> core::fmt::Result { + write!(fmt, "{:?}:", self.algorithm)?; + debug::write_hex_bytes(fmt, self.as_ref()) + } +} + +/// A digest algorithm. +pub struct Algorithm { + output_len: OutputLen, + chaining_len: usize, + block_len: BlockLen, + + /// `block_data_order` processes all the full blocks of data in `data`. It + /// returns the number of bytes processed and the unprocessed data, which + /// is guaranteed to be less than `block_len` bytes long. + block_data_order: for<'d> fn( + state: &mut DynState, + data: &'d [u8], + cpu_features: cpu::Features, + ) -> (usize, &'d [u8]), + + initial_state: DynState, + + id: AlgorithmID, +} + +#[derive(Debug, Eq, PartialEq)] +enum AlgorithmID { + SHA1, + SHA256, + SHA384, + SHA512, + SHA512_256, +} + +impl PartialEq for Algorithm { + fn eq(&self, other: &Self) -> bool { + self.id == other.id + } +} + +impl Eq for Algorithm {} + +derive_debug_via_id!(Algorithm); + +impl Algorithm { + /// The internal block length. + pub fn block_len(&self) -> usize { + self.block_len.into() + } + + /// The size of the chaining value of the digest function, in bytes. + /// + /// For non-truncated algorithms (SHA-1, SHA-256, SHA-512), this is equal + /// to [`Self::output_len()`]. For truncated algorithms (e.g. SHA-384, + /// SHA-512/256), this is equal to the length before truncation. 
This is + /// mostly helpful for determining the size of an HMAC key that is + /// appropriate for the digest algorithm. + pub fn chaining_len(&self) -> usize { + self.chaining_len + } + + /// The length of a finalized digest. + pub fn output_len(&self) -> usize { + self.output_len.into() + } +} + +/// SHA-1 as specified in [FIPS 180-4]. Deprecated. +/// +/// [FIPS 180-4]: http://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.180-4.pdf +pub static SHA1_FOR_LEGACY_USE_ONLY: Algorithm = Algorithm { + output_len: sha1::OUTPUT_LEN, + chaining_len: sha1::CHAINING_LEN, + block_len: sha1::BLOCK_LEN, + block_data_order: dynstate::sha1_block_data_order, + initial_state: DynState::new32([ + Wrapping(0x67452301u32), + Wrapping(0xefcdab89u32), + Wrapping(0x98badcfeu32), + Wrapping(0x10325476u32), + Wrapping(0xc3d2e1f0u32), + Wrapping(0), + Wrapping(0), + Wrapping(0), + ]), + id: AlgorithmID::SHA1, +}; + +/// SHA-256 as specified in [FIPS 180-4]. +/// +/// [FIPS 180-4]: http://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.180-4.pdf +pub static SHA256: Algorithm = Algorithm { + output_len: OutputLen::_256, + chaining_len: SHA256_OUTPUT_LEN, + block_len: SHA256_BLOCK_LEN, + block_data_order: dynstate::sha256_block_data_order, + initial_state: DynState::new32([ + Wrapping(0x6a09e667u32), + Wrapping(0xbb67ae85u32), + Wrapping(0x3c6ef372u32), + Wrapping(0xa54ff53au32), + Wrapping(0x510e527fu32), + Wrapping(0x9b05688cu32), + Wrapping(0x1f83d9abu32), + Wrapping(0x5be0cd19u32), + ]), + id: AlgorithmID::SHA256, +}; + +/// SHA-384 as specified in [FIPS 180-4]. +/// +/// [FIPS 180-4]: http://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.180-4.pdf +pub static SHA384: Algorithm = Algorithm { + output_len: OutputLen::_384, + chaining_len: SHA512_OUTPUT_LEN, + block_len: SHA512_BLOCK_LEN, + block_data_order: dynstate::sha512_block_data_order, + initial_state: DynState::new64([ + Wrapping(0xcbbb9d5dc1059ed8), + Wrapping(0x629a292a367cd507), + Wrapping(0x9159015a3070dd17), + Wrapping(0x152fecd8f70e5939), + Wrapping(0x67332667ffc00b31), + Wrapping(0x8eb44a8768581511), + Wrapping(0xdb0c2e0d64f98fa7), + Wrapping(0x47b5481dbefa4fa4), + ]), + id: AlgorithmID::SHA384, +}; + +/// SHA-512 as specified in [FIPS 180-4]. +/// +/// [FIPS 180-4]: http://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.180-4.pdf +pub static SHA512: Algorithm = Algorithm { + output_len: OutputLen::_512, + chaining_len: SHA512_OUTPUT_LEN, + block_len: SHA512_BLOCK_LEN, + block_data_order: dynstate::sha512_block_data_order, + initial_state: DynState::new64([ + Wrapping(0x6a09e667f3bcc908), + Wrapping(0xbb67ae8584caa73b), + Wrapping(0x3c6ef372fe94f82b), + Wrapping(0xa54ff53a5f1d36f1), + Wrapping(0x510e527fade682d1), + Wrapping(0x9b05688c2b3e6c1f), + Wrapping(0x1f83d9abfb41bd6b), + Wrapping(0x5be0cd19137e2179), + ]), + id: AlgorithmID::SHA512, +}; + +/// SHA-512/256 as specified in [FIPS 180-4]. +/// +/// This is *not* the same as just truncating the output of SHA-512, as +/// SHA-512/256 has its own initial state distinct from SHA-512's initial +/// state. 
+/// +/// [FIPS 180-4]: http://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.180-4.pdf +pub static SHA512_256: Algorithm = Algorithm { + output_len: OutputLen::_256, + chaining_len: SHA512_OUTPUT_LEN, + block_len: SHA512_BLOCK_LEN, + block_data_order: dynstate::sha512_block_data_order, + initial_state: DynState::new64([ + Wrapping(0x22312194fc2bf72c), + Wrapping(0x9f555fa3c84c64c2), + Wrapping(0x2393b86b6f53b151), + Wrapping(0x963877195940eabd), + Wrapping(0x96283ee2a88effe3), + Wrapping(0xbe5e1e2553863992), + Wrapping(0x2b0199fc2c85b8aa), + Wrapping(0x0eb72ddc81c52ca2), + ]), + id: AlgorithmID::SHA512_256, +}; + +#[derive(Clone, Copy)] +struct Output([u8; MAX_OUTPUT_LEN]); + +/// The maximum block length ([`Algorithm::block_len()`]) of all the algorithms +/// in this module. +pub const MAX_BLOCK_LEN: usize = BlockLen::MAX.into(); + +/// The maximum output length ([`Algorithm::output_len()`]) of all the +/// algorithms in this module. +pub const MAX_OUTPUT_LEN: usize = OutputLen::MAX.into(); + +/// The maximum chaining length ([`Algorithm::chaining_len()`]) of all the +/// algorithms in this module. +pub const MAX_CHAINING_LEN: usize = MAX_OUTPUT_LEN; + +#[inline] +fn format_output(input: [Wrapping; sha2::CHAINING_WORDS], f: F) -> Output +where + F: Fn(T) -> [u8; N], + T: Copy, +{ + let mut output = Output([0; MAX_OUTPUT_LEN]); + output + .0 + .chunks_mut(N) + .zip(input.iter().copied().map(|Wrapping(w)| f(w))) + .for_each(|(o, i)| { + o.copy_from_slice(&i); + }); + output +} + +/// The length of the output of SHA-1, in bytes. +pub const SHA1_OUTPUT_LEN: usize = sha1::OUTPUT_LEN.into(); + +/// The length of the output of SHA-256, in bytes. +pub const SHA256_OUTPUT_LEN: usize = OutputLen::_256.into(); + +/// The length of the output of SHA-384, in bytes. +pub const SHA384_OUTPUT_LEN: usize = OutputLen::_384.into(); + +/// The length of the output of SHA-512, in bytes. +pub const SHA512_OUTPUT_LEN: usize = OutputLen::_512.into(); + +/// The length of the output of SHA-512/256, in bytes. +pub const SHA512_256_OUTPUT_LEN: usize = OutputLen::_256.into(); + +#[derive(Clone, Copy)] +enum BlockLen { + _512 = 512 / 8, + _1024 = 1024 / 8, // MAX +} + +impl BlockLen { + const MAX: Self = Self::_1024; + #[inline(always)] + const fn into(self) -> usize { + self as usize + } + + #[inline(always)] + const fn len_len(self) -> usize { + let len_len = match self { + BlockLen::_512 => LenLen::_64, + BlockLen::_1024 => LenLen::_128, + }; + len_len as usize + } +} + +#[derive(Clone, Copy)] +enum LenLen { + _64 = 64 / 8, + _128 = 128 / 8, +} + +#[derive(Clone, Copy)] +enum OutputLen { + _160 = 160 / 8, + _256 = 256 / 8, + _384 = 384 / 8, + _512 = 512 / 8, // MAX +} + +impl OutputLen { + const MAX: Self = Self::_512; + + #[inline(always)] + const fn into(self) -> usize { + self as usize + } +} + +#[cfg(test)] +mod tests { + mod max_input { + extern crate alloc; + use super::super::super::digest; + use crate::polyfill::u64_from_usize; + use alloc::vec; + + macro_rules! 
max_input_tests { + ( $algorithm_name:ident ) => { + mod $algorithm_name { + use super::super::super::super::digest; + + #[test] + fn max_input_test() { + super::max_input_test(&digest::$algorithm_name); + } + + #[test] + #[should_panic] + fn too_long_input_test_block() { + super::too_long_input_test_block(&digest::$algorithm_name); + } + + #[test] + #[should_panic] + fn too_long_input_test_byte() { + super::too_long_input_test_byte(&digest::$algorithm_name); + } + } + }; + } + + fn max_input_test(alg: &'static digest::Algorithm) { + let mut context = nearly_full_context(alg); + let next_input = vec![0u8; alg.block_len() - 1]; + context.update(&next_input); + let _ = context.finish(); // no panic + } + + fn too_long_input_test_block(alg: &'static digest::Algorithm) { + let mut context = nearly_full_context(alg); + let next_input = vec![0u8; alg.block_len()]; + context.update(&next_input); + let _ = context.finish(); // should panic + } + + fn too_long_input_test_byte(alg: &'static digest::Algorithm) { + let mut context = nearly_full_context(alg); + let next_input = vec![0u8; alg.block_len() - 1]; + context.update(&next_input); + context.update(&[0]); + let _ = context.finish(); // should panic + } + + fn nearly_full_context(alg: &'static digest::Algorithm) -> digest::Context { + // All implementations currently support up to 2^64-1 bits + // of input; according to the spec, SHA-384 and SHA-512 + // support up to 2^128-1, but that's not implemented yet. + let max_bytes = 1u64 << (64 - 3); + let max_blocks = max_bytes / u64_from_usize(alg.block_len()); + let completed_bytes = (max_blocks - 1) * u64_from_usize(alg.block_len()); + digest::Context { + block: digest::BlockContext { + state: alg.initial_state.clone(), + completed_bytes, + algorithm: alg, + }, + pending: [0u8; digest::MAX_BLOCK_LEN], + num_pending: 0, + } + } + + max_input_tests!(SHA1_FOR_LEGACY_USE_ONLY); + max_input_tests!(SHA256); + max_input_tests!(SHA384); + max_input_tests!(SHA512); + } +} diff --git a/ring-0.17.14/src/digest/dynstate.rs b/ring-0.17.14/src/digest/dynstate.rs new file mode 100644 index 0000000000..c74f69f906 --- /dev/null +++ b/ring-0.17.14/src/digest/dynstate.rs @@ -0,0 +1,98 @@ +// Copyright 2015-2019 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +use super::{format_output, sha1, sha2, Output}; +use crate::{cpu, polyfill::slice}; +use core::mem::size_of; + +// Invariant: When constructed with `new32` (resp. `new64`), `As32` (resp. +// `As64`) is the active variant. +// Invariant: The active variant never changes after initialization. 
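The invariant stated above is what lets the `sha*_block_data_order` dispatchers later in this file match on a single variant of the enum that follows and treat the other arm as `unreachable!()`. A minimal sketch of that constructor-fixes-the-variant pattern, with illustrative names that are not part of the vendored crate:

```
// Illustrative only: the variant is fixed by the constructor and never
// changes, so downstream code may reject the other arm outright.
enum State {
    As32([u32; 8]),
    As64([u64; 8]),
}

impl State {
    const fn new32(words: [u32; 8]) -> Self {
        Self::As32(words)
    }

    const fn new64(words: [u64; 8]) -> Self {
        Self::As64(words)
    }
}

fn compress32(state: &mut State, _block: &[u8; 64]) {
    let words = match state {
        State::As32(words) => words,
        // Safe to treat as unreachable: 32-bit callers only ever hold
        // states built with `new32`, and the variant never changes.
        State::As64(_) => unreachable!(),
    };
    words[0] = words[0].wrapping_add(1); // stand-in for the real compression
}
```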
+#[derive(Clone)] +pub(super) enum DynState { + As64(sha2::State64), + As32(sha2::State32), +} + +impl DynState { + pub const fn new32(initial_state: sha2::State32) -> Self { + Self::As32(initial_state) + } + + pub const fn new64(initial_state: sha2::State64) -> Self { + Self::As64(initial_state) + } + + pub fn format_output(self) -> Output { + match self { + Self::As64(state) => { + format_output::<_, _, { size_of::() }>(state, u64::to_be_bytes) + } + Self::As32(state) => { + format_output::<_, _, { size_of::() }>(state, u32::to_be_bytes) + } + } + } +} + +pub(super) fn sha1_block_data_order<'d>( + state: &mut DynState, + data: &'d [u8], + _cpu_features: cpu::Features, +) -> (usize, &'d [u8]) { + let state = match state { + DynState::As32(state) => state, + _ => { + unreachable!(); + } + }; + + let (full_blocks, leftover) = slice::as_chunks(data); + sha1::sha1_block_data_order(state, full_blocks); + (full_blocks.as_flattened().len(), leftover) +} + +pub(super) fn sha256_block_data_order<'d>( + state: &mut DynState, + data: &'d [u8], + cpu_features: cpu::Features, +) -> (usize, &'d [u8]) { + let state = match state { + DynState::As32(state) => state, + _ => { + unreachable!(); + } + }; + + let (full_blocks, leftover) = slice::as_chunks(data); + sha2::block_data_order_32(state, full_blocks, cpu_features); + (full_blocks.len() * sha2::SHA256_BLOCK_LEN.into(), leftover) +} + +pub(super) fn sha512_block_data_order<'d>( + state: &mut DynState, + data: &'d [u8], + cpu_features: cpu::Features, +) -> (usize, &'d [u8]) { + let state = match state { + DynState::As64(state) => state, + _ => { + unreachable!(); + } + }; + + let (full_blocks, leftover) = slice::as_chunks(data); + sha2::block_data_order_64(state, full_blocks, cpu_features); + (full_blocks.len() * sha2::SHA512_BLOCK_LEN.into(), leftover) +} diff --git a/ring-0.17.14/src/digest/sha1.rs b/ring-0.17.14/src/digest/sha1.rs new file mode 100644 index 0000000000..e8a4616d15 --- /dev/null +++ b/ring-0.17.14/src/digest/sha1.rs @@ -0,0 +1,119 @@ +// Copyright 2015-2025 Brian Smith. +// Copyright 2016 Simon Sapin. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
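Before the implementation that follows, a compact sketch of the FIPS 180-4 round schedule it realizes: SHA-1's eighty rounds use `ch`, `parity`, `maj`, and `parity` again in blocks of twenty, each with a fixed constant, which is exactly the pairing the four `step3` calls below spell out. The helper is illustrative only, not part of the vendored file:

```
// Illustrative only: maps a SHA-1 round index t (0..80) to the (K_t, f_t)
// pair from FIPS 180-4 sections 4.1.1 and 4.2.1.
fn sha1_round_params(t: usize) -> (u32, fn(u32, u32, u32) -> u32) {
    fn ch(x: u32, y: u32, z: u32) -> u32 { (x & y) | (!x & z) }
    fn parity(x: u32, y: u32, z: u32) -> u32 { x ^ y ^ z }
    fn maj(x: u32, y: u32, z: u32) -> u32 { (x & y) | (x & z) | (y & z) }

    match t {
        0..=19 => (0x5a82_7999, ch),
        20..=39 => (0x6ed9_eba1, parity),
        40..=59 => (0x8f1b_bcdc, maj),
        60..=79 => (0xca62_c1d6, parity),
        _ => panic!("SHA-1 has exactly 80 rounds"),
    }
}
```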
+ +use super::{ + sha2::{ + fallback::{ch, maj, Word}, + State32, + }, + BlockLen, OutputLen, +}; +use crate::polyfill::slice::{self, AsChunks}; +use core::{mem::size_of, num::Wrapping}; + +pub(super) const BLOCK_LEN: BlockLen = BlockLen::_512; +pub const CHAINING_LEN: usize = 160 / 8; +pub(super) const OUTPUT_LEN: OutputLen = OutputLen::_160; +const CHAINING_WORDS: usize = CHAINING_LEN / 4; + +type W32 = Wrapping; + +// FIPS 180-4 4.1.1 +#[inline] +fn parity(x: W32, y: W32, z: W32) -> W32 { + x ^ y ^ z +} + +type State = [W32; CHAINING_WORDS]; +const ROUNDS: usize = 80; + +pub fn sha1_block_data_order(state: &mut State32, data: AsChunks) { + // The unwrap won't fail because `CHAINING_WORDS` is smaller than the + // length. + let state: &mut State = (&mut state[..CHAINING_WORDS]).try_into().unwrap(); + // SAFETY: The caller guarantees that this is called with data pointing to `num` + // `BLOCK_LEN`-long blocks. + *state = block_data_order(*state, data) +} + +#[inline] +#[rustfmt::skip] +fn block_data_order( + mut H: [W32; CHAINING_WORDS], + M: AsChunks, +) -> [W32; CHAINING_WORDS] +{ + for M in M { + let (M, remainder): (AsChunks()}>, &[u8]) = slice::as_chunks(M); + debug_assert!(remainder.is_empty()); + + // FIPS 180-4 6.1.2 Step 1 + let mut W: [W32; ROUNDS] = [W32::ZERO; ROUNDS]; + W.iter_mut().zip(M).for_each(|(Wt, Mt)| { + *Wt = W32::from_be_bytes(*Mt); + }); + for t in 16..ROUNDS { + let wt = W[t - 3] ^ W[t - 8] ^ W[t - 14] ^ W[t - 16]; + W[t] = rotl(wt, 1); + } + + // FIPS 180-4 6.1.2 Step 2 + let [a, b, c, d, e] = H; + + // FIPS 180-4 6.1.2 Step 3 with constants and functions from FIPS 180-4 {4.1.1, 4.2.1} + let (a, b, c, d, e) = step3(a, b, c, d, e, &W, 0, Wrapping(0x5a827999), ch); + let (a, b, c, d, e) = step3(a, b, c, d, e, &W, 20, Wrapping(0x6ed9eba1), parity); + let (a, b, c, d, e) = step3(a, b, c, d, e, &W, 40, Wrapping(0x8f1bbcdc), maj); + let (a, b, c, d, e) = step3(a, b, c, d, e, &W, 60, Wrapping(0xca62c1d6), parity); + + // FIPS 180-4 6.1.2 Step 4 + H[0] += a; + H[1] += b; + H[2] += c; + H[3] += d; + H[4] += e; + } + + H +} + +#[inline(always)] +fn step3( + mut a: W32, + mut b: W32, + mut c: W32, + mut d: W32, + mut e: W32, + W: &[W32; 80], + t: usize, + k: W32, + f: impl Fn(W32, W32, W32) -> W32, +) -> (W32, W32, W32, W32, W32) { + let W = &W[t..(t + 20)]; + for W_t in W.iter() { + let T = rotl(a, 5) + f(b, c, d) + e + k + W_t; + e = d; + d = c; + c = rotl(b, 30); + b = a; + a = T; + } + (a, b, c, d, e) +} + +#[inline(always)] +fn rotl(x: W32, n: u32) -> W32 { + Wrapping(x.0.rotate_left(n)) +} diff --git a/ring-0.17.14/src/digest/sha2/fallback.rs b/ring-0.17.14/src/digest/sha2/fallback.rs new file mode 100644 index 0000000000..556f57db51 --- /dev/null +++ b/ring-0.17.14/src/digest/sha2/fallback.rs @@ -0,0 +1,372 @@ +// Copyright 2019-2025 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +use super::CHAINING_WORDS; +use crate::polyfill::slice::{self, AsChunks}; +use core::{ + num::Wrapping, + ops::{Add, AddAssign, BitAnd, BitOr, BitXor, Not, Shr}, +}; + +#[cfg_attr( + any( + all(target_arch = "aarch64", target_endian = "little"), + all(target_arch = "arm", target_endian = "little"), + target_arch = "x86_64" + ), + allow(dead_code) +)] +#[inline] +pub(super) fn block_data_order( + mut H: [S; CHAINING_WORDS], + M: AsChunks, +) -> [S; CHAINING_WORDS] +where + for<'a> &'a S::InputBytes: From<&'a [u8; BYTES_LEN]>, +{ + for M in M { + let (M, remainder): (AsChunks, &[u8]) = slice::as_chunks(M); + debug_assert!(remainder.is_empty()); + + // FIPS 180-4 {6.2.2, 6.4.2} Step 1 + // + // TODO(MSRV): Use `let W: [S::from(0); S::ROUNDS]` instead; depends on + // https://github.com/rust-lang/rust/issues/43408. + let mut W = S::zero_w(); + let W = W.as_mut(); + W.iter_mut().zip(M).for_each(|(Wt, Mt)| { + let Mt: &S::InputBytes = Mt.into(); + *Wt = S::from_be_bytes(*Mt); + }); + for t in 16..S::ROUNDS { + W[t] = sigma_1(W[t - 2]) + W[t - 7] + sigma_0(W[t - 15]) + W[t - 16] + } + + // FIPS 180-4 {6.2.2, 6.4.2} Step 2 + let [mut a, mut b, mut c, mut d, mut e, mut f, mut g, mut h] = H; + + // FIPS 180-4 {6.2.2, 6.4.2} Step 3 + for (Kt, Wt) in S::K.as_ref().iter().zip(W.iter()) { + let T1 = h + SIGMA_1(e) + ch(e, f, g) + *Kt + *Wt; + let T2 = SIGMA_0(a) + maj(a, b, c); + h = g; + g = f; + f = e; + e = d + T1; + d = c; + c = b; + b = a; + a = T1 + T2; + } + + // FIPS 180-4 {6.2.2, 6.4.2} Step 4 + H[0] += a; + H[1] += b; + H[2] += c; + H[3] += d; + H[4] += e; + H[5] += f; + H[6] += g; + H[7] += h; + } + + H +} + +// FIPS 180-4 {4.1.1, 4.1.2, 4.1.3} +#[inline(always)] +pub(in super::super) fn ch(x: W, y: W, z: W) -> W { + (x & y) | (!x & z) +} + +// FIPS 180-4 {4.1.1, 4.1.2, 4.1.3} +#[inline(always)] +pub(in super::super) fn maj(x: W, y: W, z: W) -> W { + (x & y) | (x & z) | (y & z) +} + +// FIPS 180-4 {4.1.2, 4.1.3} +#[inline(always)] +fn SIGMA_0(x: S) -> S { + x.rotr(S::BIG_SIGMA_0.0) ^ x.rotr(S::BIG_SIGMA_0.1) ^ x.rotr(S::BIG_SIGMA_0.2) +} + +// FIPS 180-4 {4.1.2, 4.1.3} +#[inline(always)] +fn SIGMA_1(x: S) -> S { + x.rotr(S::BIG_SIGMA_1.0) ^ x.rotr(S::BIG_SIGMA_1.1) ^ x.rotr(S::BIG_SIGMA_1.2) +} + +// FIPS 180-4 {4.1.2, 4.1.3} +#[inline(always)] +fn sigma_0(x: S) -> S { + x.rotr(S::SMALL_SIGMA_0.0) ^ x.rotr(S::SMALL_SIGMA_0.1) ^ (x >> S::SMALL_SIGMA_0.2) +} + +// FIPS 180-4 {4.1.2, 4.1.3} +#[inline(always)] +fn sigma_1(x: S) -> S { + x.rotr(S::SMALL_SIGMA_1.0) ^ x.rotr(S::SMALL_SIGMA_1.1) ^ (x >> S::SMALL_SIGMA_1.2) +} + +// Commonality between SHA-1 and SHA-2 words. +pub(in super::super) trait Word: + 'static + + Sized + + Copy + + Add + + AddAssign + + BitAnd + + BitOr + + Not +{ + const ZERO: Self; + + type InputBytes: Copy; + + fn from_be_bytes(input: Self::InputBytes) -> Self; + + fn rotr(self, count: u32) -> Self; +} + +/// A SHA-2 input word. 
+pub(super) trait Sha2: Word + BitXor + Shr { + const BIG_SIGMA_0: (u32, u32, u32); + const BIG_SIGMA_1: (u32, u32, u32); + const SMALL_SIGMA_0: (u32, u32, usize); + const SMALL_SIGMA_1: (u32, u32, usize); + + const ROUNDS: usize; + + type W: AsRef<[Self]> + AsMut<[Self]>; + fn zero_w() -> Self::W; + + const K: &'static Self::W; +} + +impl Word for Wrapping { + const ZERO: Self = Self(0); + type InputBytes = [u8; 4]; + + #[inline(always)] + fn from_be_bytes(input: Self::InputBytes) -> Self { + Self(u32::from_be_bytes(input)) + } + + #[inline(always)] + fn rotr(self, count: u32) -> Self { + Self(self.0.rotate_right(count)) + } +} + +// SHA-256 +impl Sha2 for Wrapping { + // FIPS 180-4 4.1.2 + const BIG_SIGMA_0: (u32, u32, u32) = (2, 13, 22); + const BIG_SIGMA_1: (u32, u32, u32) = (6, 11, 25); + const SMALL_SIGMA_0: (u32, u32, usize) = (7, 18, 3); + const SMALL_SIGMA_1: (u32, u32, usize) = (17, 19, 10); + + // FIPS 180-4 {6.2.2} Step 1 + const ROUNDS: usize = 64; + + type W = [Self; Self::ROUNDS]; + fn zero_w() -> Self::W { + [Self::ZERO; Self::ROUNDS] + } + + // FIPS 180-4 4.2.2 + const K: &'static Self::W = &[ + Self(0x428a2f98), + Self(0x71374491), + Self(0xb5c0fbcf), + Self(0xe9b5dba5), + Self(0x3956c25b), + Self(0x59f111f1), + Self(0x923f82a4), + Self(0xab1c5ed5), + Self(0xd807aa98), + Self(0x12835b01), + Self(0x243185be), + Self(0x550c7dc3), + Self(0x72be5d74), + Self(0x80deb1fe), + Self(0x9bdc06a7), + Self(0xc19bf174), + Self(0xe49b69c1), + Self(0xefbe4786), + Self(0x0fc19dc6), + Self(0x240ca1cc), + Self(0x2de92c6f), + Self(0x4a7484aa), + Self(0x5cb0a9dc), + Self(0x76f988da), + Self(0x983e5152), + Self(0xa831c66d), + Self(0xb00327c8), + Self(0xbf597fc7), + Self(0xc6e00bf3), + Self(0xd5a79147), + Self(0x06ca6351), + Self(0x14292967), + Self(0x27b70a85), + Self(0x2e1b2138), + Self(0x4d2c6dfc), + Self(0x53380d13), + Self(0x650a7354), + Self(0x766a0abb), + Self(0x81c2c92e), + Self(0x92722c85), + Self(0xa2bfe8a1), + Self(0xa81a664b), + Self(0xc24b8b70), + Self(0xc76c51a3), + Self(0xd192e819), + Self(0xd6990624), + Self(0xf40e3585), + Self(0x106aa070), + Self(0x19a4c116), + Self(0x1e376c08), + Self(0x2748774c), + Self(0x34b0bcb5), + Self(0x391c0cb3), + Self(0x4ed8aa4a), + Self(0x5b9cca4f), + Self(0x682e6ff3), + Self(0x748f82ee), + Self(0x78a5636f), + Self(0x84c87814), + Self(0x8cc70208), + Self(0x90befffa), + Self(0xa4506ceb), + Self(0xbef9a3f7), + Self(0xc67178f2), + ]; +} + +impl Word for Wrapping { + const ZERO: Self = Self(0); + type InputBytes = [u8; 8]; + + #[inline(always)] + fn from_be_bytes(input: Self::InputBytes) -> Self { + Self(u64::from_be_bytes(input)) + } + + #[inline(always)] + fn rotr(self, count: u32) -> Self { + Self(self.0.rotate_right(count)) + } +} + +// SHA-384 and SHA-512 +impl Sha2 for Wrapping { + // FIPS 180-4 4.1.3 + const BIG_SIGMA_0: (u32, u32, u32) = (28, 34, 39); + const BIG_SIGMA_1: (u32, u32, u32) = (14, 18, 41); + const SMALL_SIGMA_0: (u32, u32, usize) = (1, 8, 7); + const SMALL_SIGMA_1: (u32, u32, usize) = (19, 61, 6); + + // FIPS 180-4 {6.4.2} Step 1 + const ROUNDS: usize = 80; + + type W = [Self; Self::ROUNDS]; + fn zero_w() -> Self::W { + [Self::ZERO; Self::ROUNDS] + } + + // FIPS 180-4 4.2.3 + const K: &'static Self::W = &[ + Self(0x428a2f98d728ae22), + Self(0x7137449123ef65cd), + Self(0xb5c0fbcfec4d3b2f), + Self(0xe9b5dba58189dbbc), + Self(0x3956c25bf348b538), + Self(0x59f111f1b605d019), + Self(0x923f82a4af194f9b), + Self(0xab1c5ed5da6d8118), + Self(0xd807aa98a3030242), + Self(0x12835b0145706fbe), + Self(0x243185be4ee4b28c), + 
Self(0x550c7dc3d5ffb4e2), + Self(0x72be5d74f27b896f), + Self(0x80deb1fe3b1696b1), + Self(0x9bdc06a725c71235), + Self(0xc19bf174cf692694), + Self(0xe49b69c19ef14ad2), + Self(0xefbe4786384f25e3), + Self(0x0fc19dc68b8cd5b5), + Self(0x240ca1cc77ac9c65), + Self(0x2de92c6f592b0275), + Self(0x4a7484aa6ea6e483), + Self(0x5cb0a9dcbd41fbd4), + Self(0x76f988da831153b5), + Self(0x983e5152ee66dfab), + Self(0xa831c66d2db43210), + Self(0xb00327c898fb213f), + Self(0xbf597fc7beef0ee4), + Self(0xc6e00bf33da88fc2), + Self(0xd5a79147930aa725), + Self(0x06ca6351e003826f), + Self(0x142929670a0e6e70), + Self(0x27b70a8546d22ffc), + Self(0x2e1b21385c26c926), + Self(0x4d2c6dfc5ac42aed), + Self(0x53380d139d95b3df), + Self(0x650a73548baf63de), + Self(0x766a0abb3c77b2a8), + Self(0x81c2c92e47edaee6), + Self(0x92722c851482353b), + Self(0xa2bfe8a14cf10364), + Self(0xa81a664bbc423001), + Self(0xc24b8b70d0f89791), + Self(0xc76c51a30654be30), + Self(0xd192e819d6ef5218), + Self(0xd69906245565a910), + Self(0xf40e35855771202a), + Self(0x106aa07032bbd1b8), + Self(0x19a4c116b8d2d0c8), + Self(0x1e376c085141ab53), + Self(0x2748774cdf8eeb99), + Self(0x34b0bcb5e19b48a8), + Self(0x391c0cb3c5c95a63), + Self(0x4ed8aa4ae3418acb), + Self(0x5b9cca4f7763e373), + Self(0x682e6ff3d6b2b8a3), + Self(0x748f82ee5defb2fc), + Self(0x78a5636f43172f60), + Self(0x84c87814a1f0ab72), + Self(0x8cc702081a6439ec), + Self(0x90befffa23631e28), + Self(0xa4506cebde82bde9), + Self(0xbef9a3f7b2c67915), + Self(0xc67178f2e372532b), + Self(0xca273eceea26619c), + Self(0xd186b8c721c0c207), + Self(0xeada7dd6cde0eb1e), + Self(0xf57d4f7fee6ed178), + Self(0x06f067aa72176fba), + Self(0x0a637dc5a2c898a6), + Self(0x113f9804bef90dae), + Self(0x1b710b35131c471b), + Self(0x28db77f523047d84), + Self(0x32caab7b40c72493), + Self(0x3c9ebe0a15c9bebc), + Self(0x431d67c49c100d4c), + Self(0x4cc5d4becb3e42b6), + Self(0x597f299cfc657e2a), + Self(0x5fcb6fab3ad6faec), + Self(0x6c44198c4a475817), + ]; +} diff --git a/ring-0.17.14/src/digest/sha2/ffi.rs b/ring-0.17.14/src/digest/sha2/ffi.rs new file mode 100644 index 0000000000..3ee8044e3b --- /dev/null +++ b/ring-0.17.14/src/digest/sha2/ffi.rs @@ -0,0 +1,71 @@ +// Copyright 2024 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +use super::CHAINING_WORDS; +use crate::polyfill::slice::AsChunks; +use core::num::{NonZeroUsize, Wrapping}; + +/// `unsafe { T => f }` means it is safe to call `f` iff we can construct +/// a value of type `T`. +macro_rules! sha2_ffi { + ( $U:ty, $BLOCK_LEN:expr, unsafe { $Cpu:ty => $f:ident }, + $state:expr, $data:expr, $cpu:expr $(,)? ) => {{ + prefixed_extern! 
{ + fn $f( + state: *mut [core::num::Wrapping<$U>; crate::digest::sha2::CHAINING_WORDS], + data: *const [u8; $BLOCK_LEN], + num: crate::c::NonZero_size_t); + } + // SAFETY: The user asserts that $f has the signature above and is safe + // to call if additionally we have a value of type `$Cpu`, which we do. + unsafe { + crate::digest::sha2::ffi::sha2_ffi::<$U, $Cpu, { $BLOCK_LEN }>($state, $data, $cpu, $f) + } + }}; +} + +macro_rules! sha2_32_ffi { + ( unsafe { $Cpu:ty => $f:ident }, $state:expr, $data:expr, $cpu:expr $(,)? ) => { + sha2_ffi!(u32, crate::digest::sha2::SHA256_BLOCK_LEN.into(), + unsafe { $Cpu => $f }, $state, $data, $cpu) + } +} + +macro_rules! sha2_64_ffi { + ( unsafe { $Cpu:ty => $f:ident }, $state:expr, $data:expr, $cpu:expr $(,)? ) => { + sha2_ffi!(u64, SHA512_BLOCK_LEN.into(), unsafe { $Cpu => $f }, $state, $data, $cpu) + } +} + +pub(super) unsafe fn sha2_ffi( + state: &mut [Wrapping; CHAINING_WORDS], + data: AsChunks, + cpu: Cpu, + f: unsafe extern "C" fn( + *mut [Wrapping; CHAINING_WORDS], + *const [u8; BLOCK_LEN], + crate::c::NonZero_size_t, + ), +) { + if let Some(blocks) = NonZeroUsize::new(data.len()) { + let data = data.as_ptr(); + let _: Cpu = cpu; + // SAFETY: + // * `blocks` is non-zero. + // * `data` is non-NULL and points to `blocks` blocks. + // * The caller asserted that `f` meets this contract if we have + // an instance of `Cpu`. + unsafe { f(state, data, blocks) } + } +} diff --git a/ring-0.17.14/src/digest/sha2/mod.rs b/ring-0.17.14/src/digest/sha2/mod.rs new file mode 100644 index 0000000000..78ca27b6da --- /dev/null +++ b/ring-0.17.14/src/digest/sha2/mod.rs @@ -0,0 +1,34 @@ +// Copyright 2019-2024 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +use super::BlockLen; + +pub(super) use self::{ + sha2_32::{block_data_order_32, State32, SHA256_BLOCK_LEN}, + sha2_64::{block_data_order_64, State64, SHA512_BLOCK_LEN}, +}; + +pub(super) const CHAINING_WORDS: usize = 8; + +#[cfg(any( + all(target_arch = "aarch64", target_endian = "little"), + all(target_arch = "arm", target_endian = "little"), + target_arch = "x86_64" +))] +#[macro_use] +mod ffi; + +pub(super) mod fallback; +mod sha2_32; +mod sha2_64; diff --git a/ring-0.17.14/src/digest/sha2/sha2_32.rs b/ring-0.17.14/src/digest/sha2/sha2_32.rs new file mode 100644 index 0000000000..4f762d1587 --- /dev/null +++ b/ring-0.17.14/src/digest/sha2/sha2_32.rs @@ -0,0 +1,63 @@ +// Copyright 2024 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +use super::{BlockLen, CHAINING_WORDS}; +use crate::{cpu, polyfill::slice::AsChunks}; +use cfg_if::cfg_if; +use core::num::Wrapping; + +pub(in super::super) const SHA256_BLOCK_LEN: BlockLen = BlockLen::_512; + +pub type State32 = [Wrapping; CHAINING_WORDS]; + +pub(crate) fn block_data_order_32( + state: &mut State32, + data: AsChunks, + cpu: cpu::Features, +) { + cfg_if! { + if #[cfg(all(target_arch = "aarch64", target_endian = "little"))] { + use cpu::{GetFeature as _, arm::Sha256}; + if let Some(cpu) = cpu.get_feature() { + sha2_32_ffi!(unsafe { Sha256 => sha256_block_data_order_hw }, state, data, cpu) + } else { + sha2_32_ffi!(unsafe { () => sha256_block_data_order_nohw }, state, data, ()) + } + } else if #[cfg(all(target_arch = "arm", target_endian = "little"))] { + use cpu::{GetFeature as _, arm::Neon}; + if let Some(cpu) = cpu.get_feature() { + sha2_32_ffi!(unsafe { Neon => sha256_block_data_order_neon }, state, data, cpu) + } else { + sha2_32_ffi!(unsafe { () => sha256_block_data_order_nohw }, state, data, ()) + } + } else if #[cfg(target_arch = "x86_64")] { + use cpu::{GetFeature as _, intel::{Avx, IntelCpu, Sha, Ssse3 }}; + let cpu = cpu.values(); + if let Some(cpu) = cpu.get_feature() { + sha2_32_ffi!(unsafe { (Sha, Ssse3) => sha256_block_data_order_hw }, state, data, cpu) + } else if let Some(cpu) = cpu.get_feature() { + // Pre-Zen AMD CPUs had slow SHLD/SHRD; Zen added the SHA + // extension; see the discussion in upstream's sha1-586.pl. + sha2_32_ffi!(unsafe { (Avx, IntelCpu) => sha256_block_data_order_avx }, state, data, cpu) + } else if let Some(cpu) = cpu.get_feature() { + sha2_32_ffi!(unsafe { Ssse3 => sha256_block_data_order_ssse3 }, state, data, cpu) + } else { + sha2_32_ffi!(unsafe { () => sha256_block_data_order_nohw }, state, data, ()) + } + } else { + let _ = cpu; // Unneeded. + *state = super::fallback::block_data_order(*state, data) + } + } +} diff --git a/ring-0.17.14/src/digest/sha2/sha2_64.rs b/ring-0.17.14/src/digest/sha2/sha2_64.rs new file mode 100644 index 0000000000..63c254e1d9 --- /dev/null +++ b/ring-0.17.14/src/digest/sha2/sha2_64.rs @@ -0,0 +1,60 @@ +// Copyright 2024 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
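+
+// As in sha2_32.rs, dispatch below goes through the `sha2_64_ffi!` macro from
+// ffi.rs: it expands to a `prefixed_extern!` declaration of the named assembly
+// entry point plus a call to `sha2_ffi`, and the `unsafe { Cpu => f }` syntax
+// records which CPU-feature value must be in hand before `f` may be called.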
+ +use super::{BlockLen, CHAINING_WORDS}; +use crate::{cpu, polyfill::slice::AsChunks}; +use cfg_if::cfg_if; +use core::num::Wrapping; + +pub(in super::super) const SHA512_BLOCK_LEN: BlockLen = BlockLen::_1024; + +pub type State64 = [Wrapping; CHAINING_WORDS]; + +pub(crate) fn block_data_order_64( + state: &mut State64, + data: AsChunks, + cpu: cpu::Features, +) { + cfg_if! { + if #[cfg(all(target_arch = "aarch64", target_endian = "little"))] { + use cpu::{GetFeature as _, arm::Sha512}; + if let Some(cpu) = cpu.get_feature() { + sha2_64_ffi!(unsafe { Sha512 => sha512_block_data_order_hw }, state, data, cpu) + } else { + sha2_64_ffi!(unsafe { () => sha512_block_data_order_nohw }, state, data, ()) + } + } else if #[cfg(all(target_arch = "arm", target_endian = "little"))] { + use cpu::{GetFeature as _, arm::Neon}; + if let Some(cpu) = cpu.get_feature() { + sha2_64_ffi!(unsafe { Neon => sha512_block_data_order_neon }, state, data, cpu) + } else { + sha2_64_ffi!(unsafe { () => sha512_block_data_order_nohw }, state, data, ()) + } + } else if #[cfg(target_arch = "x86_64")] { + use cpu::{GetFeature as _, intel::{Avx, IntelCpu}}; + if let Some(cpu) = cpu.get_feature() { + // Pre-Zen AMD CPUs had microcoded SHLD/SHRD which makes the + // AVX version slow. We're also unsure of the side channel + // ramifications of those microcoded instructions. + sha2_64_ffi!(unsafe { (Avx, IntelCpu) => sha512_block_data_order_avx }, + state, data, cpu); + } else { + sha2_64_ffi!(unsafe { () => sha512_block_data_order_nohw }, state, data, ()) + } + } else { + let _ = cpu; // Unneeded. + *state = super::fallback::block_data_order(*state, data) + } + } +} diff --git a/ring-0.17.14/src/ec.rs b/ring-0.17.14/src/ec.rs new file mode 100644 index 0000000000..33e7fc2feb --- /dev/null +++ b/ring-0.17.14/src/ec.rs @@ -0,0 +1,68 @@ +// Copyright 2015-2017 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +use crate::{cpu, error, rand}; + +pub use self::keys::{KeyPair, PublicKey, Seed}; + +pub struct Curve { + pub public_key_len: usize, + pub elem_scalar_seed_len: usize, + + pub id: CurveID, + + // Precondition: `bytes` is the correct length. 
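+    // For every curve defined here that means `elem_scalar_seed_len` bytes; the
+    // caller is responsible for having checked the length already.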
+ check_private_key_bytes: fn(bytes: &[u8], cpu: cpu::Features) -> Result<(), error::Unspecified>, + + generate_private_key: fn( + rng: &dyn rand::SecureRandom, + &mut [u8], + cpu: cpu::Features, + ) -> Result<(), error::Unspecified>, + + public_from_private: fn( + public_out: &mut [u8], + private_key: &Seed, + cpu: cpu::Features, + ) -> Result<(), error::Unspecified>, +} + +derive_debug_via_id!(Curve); + +#[derive(Clone, Copy, Debug, PartialEq)] +pub enum CurveID { + Curve25519, + P256, + P384, +} + +const ELEM_MAX_BITS: usize = 384; +pub const ELEM_MAX_BYTES: usize = (ELEM_MAX_BITS + 7) / 8; + +pub const SCALAR_MAX_BYTES: usize = ELEM_MAX_BYTES; +const SEED_MAX_BYTES: usize = ELEM_MAX_BYTES; + +/// The maximum length of a PKCS#8 documents generated by *ring* for ECC keys. +/// +/// This is NOT the maximum length of a PKCS#8 document that can be consumed by +/// `pkcs8::unwrap_key()`. +/// +/// `40` is the length of the P-384 template. It is actually one byte shorter +/// than the P-256 template, but the private key and the public key are much +/// longer. +pub const PKCS8_DOCUMENT_MAX_LEN: usize = 40 + SCALAR_MAX_BYTES + keys::PUBLIC_KEY_MAX_LEN; + +pub mod curve25519; +mod keys; +pub mod suite_b; diff --git a/ring-0.17.14/src/ec/curve25519.rs b/ring-0.17.14/src/ec/curve25519.rs new file mode 100644 index 0000000000..4f4596d083 --- /dev/null +++ b/ring-0.17.14/src/ec/curve25519.rs @@ -0,0 +1,22 @@ +// Copyright 2016 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +//! Elliptic curve operations and schemes using Curve25519. + +pub mod ed25519; + +pub mod x25519; + +mod ops; +mod scalar; diff --git a/ring-0.17.14/src/ec/curve25519/ed25519.rs b/ring-0.17.14/src/ec/curve25519/ed25519.rs new file mode 100644 index 0000000000..fe1a9ff607 --- /dev/null +++ b/ring-0.17.14/src/ec/curve25519/ed25519.rs @@ -0,0 +1,32 @@ +// Copyright 2015-2016 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +//! EdDSA Signatures. + +use super::ops::ELEM_LEN; +use crate::digest; + +pub mod signing; +pub mod verification; + +/// The length of an Ed25519 public key. 
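+///
+/// This is the 32-byte encoding of the point RFC 8032 Section 5.1.5 calls *A*,
+/// i.e. the same length as an encoded curve element (`ELEM_LEN`).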
+pub const ED25519_PUBLIC_KEY_LEN: usize = ELEM_LEN; + +pub fn eddsa_digest(signature_r: &[u8], public_key: &[u8], msg: &[u8]) -> digest::Digest { + let mut ctx = digest::Context::new(&digest::SHA512); + ctx.update(signature_r); + ctx.update(public_key); + ctx.update(msg); + ctx.finish() +} diff --git a/ring-0.17.14/src/ec/curve25519/ed25519/ed25519_pkcs8_v2_template.der b/ring-0.17.14/src/ec/curve25519/ed25519/ed25519_pkcs8_v2_template.der new file mode 100644 index 0000000000..7230086e9c Binary files /dev/null and b/ring-0.17.14/src/ec/curve25519/ed25519/ed25519_pkcs8_v2_template.der differ diff --git a/ring-0.17.14/src/ec/curve25519/ed25519/signing.rs b/ring-0.17.14/src/ec/curve25519/ed25519/signing.rs new file mode 100644 index 0000000000..d6d249d782 --- /dev/null +++ b/ring-0.17.14/src/ec/curve25519/ed25519/signing.rs @@ -0,0 +1,275 @@ +// Copyright 2015-2016 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +//! EdDSA Signatures. + +use super::{super::ops::*, eddsa_digest, ED25519_PUBLIC_KEY_LEN}; +use crate::{ + cpu, digest, error, + io::der, + pkcs8, rand, + signature::{self, KeyPair as SigningKeyPair}, +}; + +/// An Ed25519 key pair, for signing. +pub struct Ed25519KeyPair { + // RFC 8032 Section 5.1.6 calls this *s*. + private_scalar: Scalar, + + // RFC 8032 Section 5.1.6 calls this *prefix*. + private_prefix: Prefix, + + // RFC 8032 Section 5.1.5 calls this *A*. + public_key: PublicKey, +} + +derive_debug_via_field!(Ed25519KeyPair, stringify!(Ed25519KeyPair), public_key); + +impl Ed25519KeyPair { + /// Generates a new key pair and returns the key pair serialized as a + /// PKCS#8 document. + /// + /// The PKCS#8 document will be a v2 `OneAsymmetricKey` with the public key, + /// as described in [RFC 5958 Section 2]; see [RFC 8410 Section 10.3] for an + /// example. + /// + /// [RFC 5958 Section 2]: https://tools.ietf.org/html/rfc5958#section-2 + /// [RFC 8410 Section 10.3]: https://tools.ietf.org/html/rfc8410#section-10.3 + pub fn generate_pkcs8( + rng: &dyn rand::SecureRandom, + ) -> Result { + let cpu_features = cpu::features(); + let seed: [u8; SEED_LEN] = rand::generate(rng)?.expose(); + let key_pair = Self::from_seed_(&seed, cpu_features); + Ok(pkcs8::wrap_key( + &PKCS8_TEMPLATE, + &seed[..], + key_pair.public_key().as_ref(), + )) + } + + /// Constructs an Ed25519 key pair by parsing an unencrypted PKCS#8 v2 + /// Ed25519 private key. + /// + /// `openssl genpkey -algorithm ED25519` generates PKCS# v1 keys, which + /// require the use of `Ed25519KeyPair::from_pkcs8_maybe_unchecked()` + /// instead of `Ed25519KeyPair::from_pkcs8()`. + /// + /// The input must be in PKCS#8 v2 format, and in particular it must contain + /// the public key in addition to the private key. `from_pkcs8()` will + /// verify that the public key and the private key are consistent with each + /// other. 
+ /// + /// Some early implementations of PKCS#8 v2, including earlier versions of + /// *ring* and other implementations, wrapped the public key in the wrong + /// ASN.1 tags. Both that incorrect form and the standardized form are + /// accepted. + /// + /// If you need to parse PKCS#8 v1 files (without the public key) then use + /// `Ed25519KeyPair::from_pkcs8_maybe_unchecked()` instead. + pub fn from_pkcs8(pkcs8: &[u8]) -> Result { + let version = pkcs8::Version::V2Only(pkcs8::PublicKeyOptions { + accept_legacy_ed25519_public_key_tag: true, + }); + let (seed, public_key) = unwrap_pkcs8(version, untrusted::Input::from(pkcs8))?; + Self::from_seed_and_public_key( + seed.as_slice_less_safe(), + public_key.unwrap().as_slice_less_safe(), + ) + } + + /// Constructs an Ed25519 key pair by parsing an unencrypted PKCS#8 v1 or v2 + /// Ed25519 private key. + /// + /// `openssl genpkey -algorithm ED25519` generates PKCS# v1 keys. + /// + /// It is recommended to use `Ed25519KeyPair::from_pkcs8()`, which accepts + /// only PKCS#8 v2 files that contain the public key. + /// `from_pkcs8_maybe_unchecked()` parses PKCS#2 files exactly like + /// `from_pkcs8()`. It also accepts v1 files. PKCS#8 v1 files do not contain + /// the public key, so when a v1 file is parsed the public key will be + /// computed from the private key, and there will be no consistency check + /// between the public key and the private key. + /// + /// Some early implementations of PKCS#8 v2, including earlier versions of + /// *ring* and other implementations, wrapped the public key in the wrong + /// ASN.1 tags. Both that incorrect form and the standardized form are + /// accepted. + /// + /// PKCS#8 v2 files are parsed exactly like `Ed25519KeyPair::from_pkcs8()`. + pub fn from_pkcs8_maybe_unchecked(pkcs8: &[u8]) -> Result { + let version = pkcs8::Version::V1OrV2(pkcs8::PublicKeyOptions { + accept_legacy_ed25519_public_key_tag: true, + }); + let (seed, public_key) = unwrap_pkcs8(version, untrusted::Input::from(pkcs8))?; + if let Some(public_key) = public_key { + Self::from_seed_and_public_key( + seed.as_slice_less_safe(), + public_key.as_slice_less_safe(), + ) + } else { + Self::from_seed_unchecked(seed.as_slice_less_safe()) + } + } + + /// Constructs an Ed25519 key pair from the private key seed `seed` and its + /// public key `public_key`. + /// + /// It is recommended to use `Ed25519KeyPair::from_pkcs8()` instead. + /// + /// The private and public keys will be verified to be consistent with each + /// other. This helps avoid misuse of the key (e.g. accidentally swapping + /// the private key and public key, or using the wrong private key for the + /// public key). This also detects any corruption of the public or private + /// key. + pub fn from_seed_and_public_key( + seed: &[u8], + public_key: &[u8], + ) -> Result { + let pair = Self::from_seed_unchecked(seed)?; + + // This implicitly verifies that `public_key` is the right length. + // XXX: This rejects ~18 keys when they are partially reduced, though + // those keys are virtually impossible to find. + if public_key != pair.public_key.as_ref() { + let err = if public_key.len() != pair.public_key.as_ref().len() { + error::KeyRejected::invalid_encoding() + } else { + error::KeyRejected::inconsistent_components() + }; + return Err(err); + } + + Ok(pair) + } + + /// Constructs a Ed25519 key pair from the private key seed `seed`. + /// + /// It is recommended to use `Ed25519KeyPair::from_pkcs8()` instead. 
When + /// that is not practical, it is recommended to use + /// `Ed25519KeyPair::from_seed_and_public_key()` instead. + /// + /// Since the public key is not given, the public key will be computed from + /// the private key. It is not possible to detect misuse or corruption of + /// the private key since the public key isn't given as input. + pub fn from_seed_unchecked(seed: &[u8]) -> Result { + let seed = seed + .try_into() + .map_err(|_| error::KeyRejected::invalid_encoding())?; + Ok(Self::from_seed_(seed, cpu::features())) + } + + fn from_seed_(seed: &Seed, cpu_features: cpu::Features) -> Self { + let h = digest::digest(&digest::SHA512, seed); + let (private_scalar, private_prefix) = h.as_ref().split_at(SCALAR_LEN); + + let private_scalar = + MaskedScalar::from_bytes_masked(private_scalar.try_into().unwrap()).into(); + + let a = ExtPoint::from_scalarmult_base(&private_scalar, cpu_features); + + Self { + private_scalar, + private_prefix: private_prefix.try_into().unwrap(), + public_key: PublicKey(a.into_encoded_point(cpu_features)), + } + } + + /// Returns the signature of the message `msg`. + pub fn sign(&self, msg: &[u8]) -> signature::Signature { + let cpu_features = cpu::features(); + signature::Signature::new(|signature_bytes| { + prefixed_extern! { + fn x25519_sc_muladd( + s: &mut [u8; SCALAR_LEN], + a: &Scalar, + b: &Scalar, + c: &Scalar, + ); + } + + let (signature_bytes, _unused) = signature_bytes.split_at_mut(ELEM_LEN + SCALAR_LEN); + let (signature_r, signature_s) = signature_bytes.split_at_mut(ELEM_LEN); + let nonce = { + let mut ctx = digest::Context::new(&digest::SHA512); + ctx.update(&self.private_prefix); + ctx.update(msg); + ctx.finish() + }; + let nonce = Scalar::from_sha512_digest_reduced(nonce); + + let r = ExtPoint::from_scalarmult_base(&nonce, cpu_features); + signature_r.copy_from_slice(&r.into_encoded_point(cpu_features)); + let hram_digest = eddsa_digest(signature_r, self.public_key.as_ref(), msg); + let hram = Scalar::from_sha512_digest_reduced(hram_digest); + unsafe { + x25519_sc_muladd( + signature_s.try_into().unwrap(), + &hram, + &self.private_scalar, + &nonce, + ); + } + + SIGNATURE_LEN + }) + } +} + +impl signature::KeyPair for Ed25519KeyPair { + type PublicKey = PublicKey; + + fn public_key(&self) -> &Self::PublicKey { + &self.public_key + } +} + +#[derive(Clone, Copy)] +pub struct PublicKey([u8; ED25519_PUBLIC_KEY_LEN]); + +impl AsRef<[u8]> for PublicKey { + fn as_ref(&self) -> &[u8] { + self.0.as_ref() + } +} + +derive_debug_self_as_ref_hex_bytes!(PublicKey); + +fn unwrap_pkcs8( + version: pkcs8::Version, + input: untrusted::Input, +) -> Result<(untrusted::Input, Option), error::KeyRejected> { + let (private_key, public_key) = pkcs8::unwrap_key(&PKCS8_TEMPLATE, version, input)?; + let private_key = private_key + .read_all(error::Unspecified, |input| { + der::expect_tag_and_get_value(input, der::Tag::OctetString) + }) + .map_err(|error::Unspecified| error::KeyRejected::invalid_encoding())?; + Ok((private_key, public_key)) +} + +type Prefix = [u8; PREFIX_LEN]; +const PREFIX_LEN: usize = digest::SHA512_OUTPUT_LEN - SCALAR_LEN; + +const SIGNATURE_LEN: usize = ELEM_LEN + SCALAR_LEN; + +type Seed = [u8; SEED_LEN]; +const SEED_LEN: usize = 32; + +static PKCS8_TEMPLATE: pkcs8::Template = pkcs8::Template { + bytes: include_bytes!("ed25519_pkcs8_v2_template.der"), + alg_id_range: core::ops::Range { start: 7, end: 12 }, + curve_id_index: 0, + private_key_index: 0x10, +}; diff --git a/ring-0.17.14/src/ec/curve25519/ed25519/verification.rs 
b/ring-0.17.14/src/ec/curve25519/ed25519/verification.rs new file mode 100644 index 0000000000..8a81f42f51 --- /dev/null +++ b/ring-0.17.14/src/ec/curve25519/ed25519/verification.rs @@ -0,0 +1,85 @@ +// Copyright 2015-2016 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +//! EdDSA Signatures. + +use super::{super::ops::*, eddsa_digest}; +use crate::{cpu, error, sealed, signature}; + +/// Parameters for EdDSA signing and verification. +pub struct EdDSAParameters; + +impl core::fmt::Debug for EdDSAParameters { + fn fmt(&self, f: &mut core::fmt::Formatter) -> Result<(), core::fmt::Error> { + write!(f, "ring::signature::ED25519") + } +} + +/// Verification of [Ed25519] signatures. +/// +/// Ed25519 uses SHA-512 as the digest algorithm. +/// +/// [Ed25519]: https://ed25519.cr.yp.to/ +pub static ED25519: EdDSAParameters = EdDSAParameters {}; + +impl signature::VerificationAlgorithm for EdDSAParameters { + fn verify( + &self, + public_key: untrusted::Input, + msg: untrusted::Input, + signature: untrusted::Input, + ) -> Result<(), error::Unspecified> { + let cpu_features = cpu::features(); + + let public_key: &[u8; ELEM_LEN] = public_key.as_slice_less_safe().try_into()?; + let (signature_r, signature_s) = signature.read_all(error::Unspecified, |input| { + let signature_r: &[u8; ELEM_LEN] = input + .read_bytes(ELEM_LEN)? + .as_slice_less_safe() + .try_into()?; + let signature_s: &[u8; SCALAR_LEN] = input + .read_bytes(SCALAR_LEN)? + .as_slice_less_safe() + .try_into()?; + Ok((signature_r, signature_s)) + })?; + + let signature_s = Scalar::from_bytes_checked(*signature_s)?; + + let mut a = ExtPoint::from_encoded_point_vartime(public_key)?; + a.invert_vartime(); + + let h_digest = eddsa_digest(signature_r, public_key, msg.as_slice_less_safe()); + let h = Scalar::from_sha512_digest_reduced(h_digest); + + let mut r = Point::new_at_infinity(); + unsafe { x25519_ge_double_scalarmult_vartime(&mut r, &h, &a, &signature_s) }; + let r_check = r.into_encoded_point(cpu_features); + if *signature_r != r_check { + return Err(error::Unspecified); + } + Ok(()) + } +} + +impl sealed::Sealed for EdDSAParameters {} + +prefixed_extern! { + fn x25519_ge_double_scalarmult_vartime( + r: &mut Point, + a_coeff: &Scalar, + a: &ExtPoint, + b_coeff: &Scalar, + ); +} diff --git a/ring-0.17.14/src/ec/curve25519/ops.rs b/ring-0.17.14/src/ec/curve25519/ops.rs new file mode 100644 index 0000000000..b34d68c8f2 --- /dev/null +++ b/ring-0.17.14/src/ec/curve25519/ops.rs @@ -0,0 +1,180 @@ +// Copyright 2015-2017 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. 
+// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +//! Elliptic curve operations on the birationally equivalent curves Curve25519 +//! and Edwards25519. + +pub use super::scalar::{MaskedScalar, Scalar, SCALAR_LEN}; +use crate::{ + bssl, cpu, error, + limb::{Limb, LIMB_BITS}, +}; +use core::{ffi::c_int, marker::PhantomData}; + +// Elem` is `fe` in curve25519/internal.h. +// Elem is `fe_loose` in curve25519/internal.h. +// Keep this in sync with curve25519/internal.h. +#[repr(C)] +pub struct Elem { + limbs: [Limb; ELEM_LIMBS], // This is called `v` in the C code. + encoding: PhantomData, +} + +pub trait Encoding {} +pub struct T; +impl Encoding for T {} + +const ELEM_LIMBS: usize = 5 * 64 / LIMB_BITS; + +impl Elem { + fn zero() -> Self { + Self { + limbs: Default::default(), + encoding: PhantomData, + } + } +} + +impl Elem { + fn negate(&mut self) { + unsafe { + x25519_fe_neg(self); + } + } +} + +// An encoding of a curve point. If on Curve25519, it should be encoded as +// described in Section 5 of [RFC 7748]. If on Edwards25519, it should be +// encoded as described in section 5.1.2 of [RFC 8032]. +// +// [RFC 7748] https://tools.ietf.org/html/rfc7748#section-5 +// [RFC 8032] https://tools.ietf.org/html/rfc8032#section-5.1.2 +pub type EncodedPoint = [u8; ELEM_LEN]; +pub const ELEM_LEN: usize = 32; + +// Keep this in sync with `ge_p3` in curve25519/internal.h. +#[repr(C)] +pub struct ExtPoint { + x: Elem, + y: Elem, + z: Elem, + t: Elem, +} + +impl ExtPoint { + // Returns the result of multiplying the base point by the scalar in constant time. + pub(super) fn from_scalarmult_base(scalar: &Scalar, cpu: cpu::Features) -> Self { + let mut r = Self { + x: Elem::zero(), + y: Elem::zero(), + z: Elem::zero(), + t: Elem::zero(), + }; + prefixed_extern! { + fn x25519_ge_scalarmult_base(h: &mut ExtPoint, a: &Scalar, has_fe25519_adx: c_int); + } + unsafe { + x25519_ge_scalarmult_base(&mut r, scalar, has_fe25519_adx(cpu).into()); + } + r + } + + pub fn from_encoded_point_vartime(encoded: &EncodedPoint) -> Result { + let mut point = Self { + x: Elem::zero(), + y: Elem::zero(), + z: Elem::zero(), + t: Elem::zero(), + }; + + Result::from(unsafe { x25519_ge_frombytes_vartime(&mut point, encoded) }).map(|()| point) + } + + pub(super) fn into_encoded_point(self, cpu_features: cpu::Features) -> EncodedPoint { + encode_point(self.x, self.y, self.z, cpu_features) + } + + pub(super) fn invert_vartime(&mut self) { + self.x.negate(); + self.t.negate(); + } +} + +// Keep this in sync with `ge_p2` in curve25519/internal.h. 
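+// `ge_p2` is the plain projective representation (X : Y : Z), in which the affine
+// point is (X/Z, Y/Z); unlike `ge_p3`/`ExtPoint` above it does not carry the
+// extended coordinate `t`.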
+#[repr(C)] +pub struct Point { + x: Elem, + y: Elem, + z: Elem, +} + +impl Point { + pub fn new_at_infinity() -> Self { + Self { + x: Elem::zero(), + y: Elem::zero(), + z: Elem::zero(), + } + } + + pub(super) fn into_encoded_point(self, cpu_features: cpu::Features) -> EncodedPoint { + encode_point(self.x, self.y, self.z, cpu_features) + } +} + +fn encode_point(x: Elem, y: Elem, z: Elem, _cpu_features: cpu::Features) -> EncodedPoint { + let mut bytes = [0; ELEM_LEN]; + + let sign_bit: u8 = unsafe { + let mut recip = Elem::zero(); + x25519_fe_invert(&mut recip, &z); + + let mut x_over_z = Elem::zero(); + x25519_fe_mul_ttt(&mut x_over_z, &x, &recip); + + let mut y_over_z = Elem::zero(); + x25519_fe_mul_ttt(&mut y_over_z, &y, &recip); + x25519_fe_tobytes(&mut bytes, &y_over_z); + + x25519_fe_isnegative(&x_over_z) + }; + + // The preceding computations must execute in constant time, but this + // doesn't need to. + bytes[ELEM_LEN - 1] ^= sign_bit << 7; + + bytes +} + +#[inline(always)] +pub(super) fn has_fe25519_adx(cpu: cpu::Features) -> bool { + cfg_if::cfg_if! { + if #[cfg(all(target_arch = "x86_64", not(target_os = "windows")))] { + use cpu::{intel::{Adx, Bmi1, Bmi2}, GetFeature as _}; + matches!(cpu.get_feature(), Some((Adx { .. }, Bmi1 { .. }, Bmi2 { .. }))) + } else { + let _ = cpu; + false + } + } +} + +prefixed_extern! { + fn x25519_fe_invert(out: &mut Elem, z: &Elem); + fn x25519_fe_isnegative(elem: &Elem) -> u8; + fn x25519_fe_mul_ttt(h: &mut Elem, f: &Elem, g: &Elem); + fn x25519_fe_neg(f: &mut Elem); + fn x25519_fe_tobytes(bytes: &mut EncodedPoint, elem: &Elem); + fn x25519_ge_frombytes_vartime(h: &mut ExtPoint, s: &EncodedPoint) -> bssl::Result; +} diff --git a/ring-0.17.14/src/ec/curve25519/scalar.rs b/ring-0.17.14/src/ec/curve25519/scalar.rs new file mode 100644 index 0000000000..a56e281d52 --- /dev/null +++ b/ring-0.17.14/src/ec/curve25519/scalar.rs @@ -0,0 +1,78 @@ +// Copyright 2015-2019 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +use crate::{ + arithmetic::limbs_from_hex, + digest, error, limb, + polyfill::slice::{self, AsChunks}, +}; +use core::array; + +#[repr(transparent)] +pub struct Scalar([u8; SCALAR_LEN]); + +pub const SCALAR_LEN: usize = 32; + +impl Scalar { + // Constructs a `Scalar` from `bytes`, failing if `bytes` encodes a scalar + // that is not in the range [0, n). 
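+    // Here n is the Ed25519 group order,
+    // n = 2^252 + 27742317777372353535851937790883648493 (the `ORDER` constant below).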
+ pub fn from_bytes_checked(bytes: [u8; SCALAR_LEN]) -> Result { + const ORDER: [limb::Limb; SCALAR_LEN / limb::LIMB_BYTES] = + limbs_from_hex("1000000000000000000000000000000014def9dea2f79cd65812631a5cf5d3ed"); + let order = ORDER.map(limb::Limb::from); + + let (limbs_as_bytes, _empty): (AsChunks, _) = + slice::as_chunks(&bytes); + debug_assert!(_empty.is_empty()); + let limbs: [limb::Limb; SCALAR_LEN / limb::LIMB_BYTES] = + array::from_fn(|i| limb::Limb::from_le_bytes(limbs_as_bytes[i])); + limb::verify_limbs_less_than_limbs_leak_bit(&limbs, &order)?; + + Ok(Self(bytes)) + } + + // Constructs a `Scalar` from `digest` reduced modulo n. + pub fn from_sha512_digest_reduced(digest: digest::Digest) -> Self { + prefixed_extern! { + fn x25519_sc_reduce(s: &mut UnreducedScalar); + } + let mut unreduced = [0u8; digest::SHA512_OUTPUT_LEN]; + unreduced.copy_from_slice(digest.as_ref()); + unsafe { x25519_sc_reduce(&mut unreduced) }; + Self((&unreduced[..SCALAR_LEN]).try_into().unwrap()) + } +} + +#[repr(transparent)] +pub struct MaskedScalar([u8; SCALAR_LEN]); + +impl MaskedScalar { + pub fn from_bytes_masked(bytes: [u8; SCALAR_LEN]) -> Self { + prefixed_extern! { + fn x25519_sc_mask(a: &mut [u8; SCALAR_LEN]); + } + let mut r = Self(bytes); + unsafe { x25519_sc_mask(&mut r.0) }; + r + } +} + +impl From for Scalar { + fn from(MaskedScalar(scalar): MaskedScalar) -> Self { + Self(scalar) + } +} + +type UnreducedScalar = [u8; UNREDUCED_SCALAR_LEN]; +const UNREDUCED_SCALAR_LEN: usize = SCALAR_LEN * 2; diff --git a/ring-0.17.14/src/ec/curve25519/x25519.rs b/ring-0.17.14/src/ec/curve25519/x25519.rs new file mode 100644 index 0000000000..216648a5bd --- /dev/null +++ b/ring-0.17.14/src/ec/curve25519/x25519.rs @@ -0,0 +1,249 @@ +// Copyright 2015-2016 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +//! X25519 Key agreement. + +use super::{ops, scalar::SCALAR_LEN}; +use crate::{agreement, bb, cpu, ec, error, rand}; +use core::ffi::c_int; + +static CURVE25519: ec::Curve = ec::Curve { + public_key_len: PUBLIC_KEY_LEN, + elem_scalar_seed_len: ELEM_AND_SCALAR_LEN, + id: ec::CurveID::Curve25519, + check_private_key_bytes: x25519_check_private_key_bytes, + generate_private_key: x25519_generate_private_key, + public_from_private: x25519_public_from_private, +}; + +/// X25519 (ECDH using Curve25519) as described in [RFC 7748]. +/// +/// Everything is as described in RFC 7748. Key agreement will fail if the +/// result of the X25519 operation is zero; see the notes on the +/// "all-zero value" in [RFC 7748 section 6.1]. 
+/// +/// [RFC 7748]: https://tools.ietf.org/html/rfc7748 +/// [RFC 7748 section 6.1]: https://tools.ietf.org/html/rfc7748#section-6.1 +pub static X25519: agreement::Algorithm = agreement::Algorithm { + curve: &CURVE25519, + ecdh: x25519_ecdh, +}; + +#[allow(clippy::unnecessary_wraps)] +fn x25519_check_private_key_bytes( + bytes: &[u8], + _: cpu::Features, +) -> Result<(), error::Unspecified> { + debug_assert_eq!(bytes.len(), PRIVATE_KEY_LEN); + Ok(()) +} + +fn x25519_generate_private_key( + rng: &dyn rand::SecureRandom, + out: &mut [u8], + _: cpu::Features, +) -> Result<(), error::Unspecified> { + rng.fill(out) +} + +fn x25519_public_from_private( + public_out: &mut [u8], + private_key: &ec::Seed, + cpu_features: cpu::Features, +) -> Result<(), error::Unspecified> { + let public_out = public_out.try_into()?; + + let private_key: &[u8; SCALAR_LEN] = private_key.bytes_less_safe().try_into()?; + let private_key = ops::MaskedScalar::from_bytes_masked(*private_key); + + #[cfg(all( + all(target_arch = "arm", target_endian = "little"), + any(target_os = "android", target_os = "linux") + ))] + if let Some(cpu) = >::get_feature(&cpu_features) { + static MONTGOMERY_BASE_POINT: [u8; 32] = [ + 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, + ]; + x25519_neon(public_out, &private_key, &MONTGOMERY_BASE_POINT, cpu); + return Ok(()); + } + + prefixed_extern! { + fn x25519_public_from_private_generic_masked( + public_key_out: &mut PublicKey, + private_key: &PrivateKey, + use_adx: c_int, + ); + } + unsafe { + x25519_public_from_private_generic_masked( + public_out, + &private_key, + ops::has_fe25519_adx(cpu_features).into(), + ); + } + + Ok(()) +} + +fn x25519_ecdh( + out: &mut [u8], + my_private_key: &ec::Seed, + peer_public_key: untrusted::Input, + cpu_features: cpu::Features, +) -> Result<(), error::Unspecified> { + let my_private_key: &[u8; SCALAR_LEN] = my_private_key.bytes_less_safe().try_into()?; + let my_private_key = ops::MaskedScalar::from_bytes_masked(*my_private_key); + let peer_public_key: &[u8; PUBLIC_KEY_LEN] = peer_public_key.as_slice_less_safe().try_into()?; + + fn scalar_mult( + out: &mut ops::EncodedPoint, + scalar: &ops::MaskedScalar, + point: &ops::EncodedPoint, + #[allow(unused_variables)] cpu_features: cpu::Features, + ) { + #[cfg(all( + all(target_arch = "arm", target_endian = "little"), + any(target_os = "android", target_os = "linux") + ))] + if let Some(cpu) = >::get_feature(&cpu_features) { + return x25519_neon(out, scalar, point, cpu); + } + + #[cfg(all(target_arch = "x86_64", not(target_os = "windows")))] + { + if ops::has_fe25519_adx(cpu_features) { + prefixed_extern! { + fn x25519_scalar_mult_adx( + out: &mut ops::EncodedPoint, + scalar: &ops::MaskedScalar, + point: &ops::EncodedPoint, + ); + } + return unsafe { x25519_scalar_mult_adx(out, scalar, point) }; + } + } + + prefixed_extern! { + fn x25519_scalar_mult_generic_masked( + out: &mut ops::EncodedPoint, + scalar: &ops::MaskedScalar, + point: &ops::EncodedPoint, + ); + } + unsafe { + x25519_scalar_mult_generic_masked(out, scalar, point); + } + } + + scalar_mult( + out.try_into()?, + &my_private_key, + peer_public_key, + cpu_features, + ); + + let zeros: SharedSecret = [0; SHARED_SECRET_LEN]; + if bb::verify_slices_are_equal(out, &zeros).is_ok() { + // All-zero output results when the input is a point of small order. + return Err(error::Unspecified); + } + + Ok(()) +} + +// BoringSSL uses `!defined(OPENSSL_APPLE)`. 
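+// Here the NEON path is instead gated to little-endian ARM on Android and Linux,
+// as expressed by the `cfg` attribute below.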
+#[cfg(all( + all(target_arch = "arm", target_endian = "little"), + any(target_os = "android", target_os = "linux") +))] +fn x25519_neon( + out: &mut ops::EncodedPoint, + scalar: &ops::MaskedScalar, + point: &ops::EncodedPoint, + _cpu: cpu::arm::Neon, +) { + prefixed_extern! { + fn x25519_NEON( + out: &mut ops::EncodedPoint, + scalar: &ops::MaskedScalar, + point: &ops::EncodedPoint, + ); + } + unsafe { x25519_NEON(out, scalar, point) } +} + +const ELEM_AND_SCALAR_LEN: usize = ops::ELEM_LEN; + +type PrivateKey = ops::MaskedScalar; +const PRIVATE_KEY_LEN: usize = ELEM_AND_SCALAR_LEN; + +// An X25519 public key as an encoded Curve25519 point. +type PublicKey = [u8; PUBLIC_KEY_LEN]; +const PUBLIC_KEY_LEN: usize = ELEM_AND_SCALAR_LEN; + +// An X25519 shared secret as an encoded Curve25519 point. +type SharedSecret = [u8; SHARED_SECRET_LEN]; +const SHARED_SECRET_LEN: usize = ELEM_AND_SCALAR_LEN; + +#[cfg(test)] +mod tests { + use super::*; + use crate::ec; + use untrusted::Input; + + #[test] + fn test_x25519_public_from_private() { + struct TestVector { + private: [u8; 32], + public: [u8; 32], + } + static TEST_CASES: &[TestVector] = &[ + TestVector { + private: [ + 0x77, 0x07, 0x6d, 0x0a, 0x73, 0x18, 0xa5, 0x7d, 0x3c, 0x16, 0xc1, 0x72, 0x51, + 0xb2, 0x66, 0x45, 0xdf, 0x4c, 0x2f, 0x87, 0xeb, 0xc0, 0x99, 0x2a, 0xb1, 0x77, + 0xfb, 0xa5, 0x1d, 0xb9, 0x2c, 0x2a, + ], + public: [ + 0x85, 0x20, 0xf0, 0x09, 0x89, 0x30, 0xa7, 0x54, 0x74, 0x8b, 0x7d, 0xdc, 0xb4, + 0x3e, 0xf7, 0x5a, 0x0d, 0xbf, 0x3a, 0x0d, 0x26, 0x38, 0x1a, 0xf4, 0xeb, 0xa4, + 0xa9, 0x8e, 0xaa, 0x9b, 0x4e, 0x6a, + ], + }, + TestVector { + private: [ + 0x5d, 0xab, 0x08, 0x7e, 0x62, 0x4a, 0x8a, 0x4b, 0x79, 0xe1, 0x7f, 0x8b, 0x83, + 0x80, 0x0e, 0xe6, 0x6f, 0x3b, 0xb1, 0x29, 0x26, 0x18, 0xb6, 0xfd, 0x1c, 0x2f, + 0x8b, 0x27, 0xff, 0x88, 0xe0, 0xeb, + ], + public: [ + 0xde, 0x9e, 0xdb, 0x7d, 0x7b, 0x7d, 0xc1, 0xb4, 0xd3, 0x5b, 0x61, 0xc2, 0xec, + 0xe4, 0x35, 0x37, 0x3f, 0x83, 0x43, 0xc8, 0x5b, 0x78, 0x67, 0x4d, 0xad, 0xfc, + 0x7e, 0x14, 0x6f, 0x88, 0x2b, 0x4f, + ], + }, + ]; + let cpu_features = cpu::features(); + for test_case in TEST_CASES { + let seed = + ec::Seed::from_bytes(&CURVE25519, Input::from(&test_case.private), cpu_features) + .unwrap(); + let mut output = [0u8; 32]; + x25519_public_from_private(&mut output, &seed, cpu_features).unwrap(); + assert_eq!(output, test_case.public); + } + } +} diff --git a/ring-0.17.14/src/ec/keys.rs b/ring-0.17.14/src/ec/keys.rs new file mode 100644 index 0000000000..6cf925035b --- /dev/null +++ b/ring-0.17.14/src/ec/keys.rs @@ -0,0 +1,97 @@ +use super::{Curve, ELEM_MAX_BYTES, SEED_MAX_BYTES}; +use crate::{cpu, error, rand}; + +pub struct KeyPair { + seed: Seed, + public_key: PublicKey, +} + +impl KeyPair { + pub(super) fn derive( + seed: Seed, + cpu_features: cpu::Features, + ) -> Result { + let public_key = seed.compute_public_key(cpu_features)?; + Ok(Self { seed, public_key }) + } + + pub fn public_key(&self) -> &PublicKey { + &self.public_key + } + pub fn split(self) -> (Seed, PublicKey) { + (self.seed, self.public_key) + } +} + +pub struct Seed { + bytes: [u8; SEED_MAX_BYTES], + curve: &'static Curve, +} + +impl Seed { + pub(crate) fn generate( + curve: &'static Curve, + rng: &dyn rand::SecureRandom, + cpu: cpu::Features, + ) -> Result { + let mut r = Self { + bytes: [0u8; SEED_MAX_BYTES], + curve, + }; + (curve.generate_private_key)(rng, &mut r.bytes[..curve.elem_scalar_seed_len], cpu)?; + Ok(r) + } + + pub(crate) fn from_bytes( + curve: &'static Curve, + bytes: untrusted::Input, + cpu: 
cpu::Features, + ) -> Result { + let bytes = bytes.as_slice_less_safe(); + if curve.elem_scalar_seed_len != bytes.len() { + return Err(error::Unspecified); + } + (curve.check_private_key_bytes)(bytes, cpu)?; + let mut r = Self { + bytes: [0; SEED_MAX_BYTES], + curve, + }; + r.bytes[..curve.elem_scalar_seed_len].copy_from_slice(bytes); + Ok(r) + } + + pub fn bytes_less_safe(&self) -> &[u8] { + &self.bytes[..self.curve.elem_scalar_seed_len] + } + + pub(crate) fn compute_public_key( + &self, + cpu_features: cpu::Features, + ) -> Result { + let mut public_key = PublicKey { + bytes: [0u8; PUBLIC_KEY_MAX_LEN], + len: self.curve.public_key_len, + }; + (self.curve.public_from_private)( + &mut public_key.bytes[..public_key.len], + self, + cpu_features, + )?; + Ok(public_key) + } +} + +#[derive(Copy, Clone)] +pub struct PublicKey { + bytes: [u8; PUBLIC_KEY_MAX_LEN], + len: usize, +} + +impl AsRef<[u8]> for PublicKey { + fn as_ref(&self) -> &[u8] { + &self.bytes[..self.len] + } +} + +/// The maximum length, in bytes, of an encoded public key. +pub const PUBLIC_KEY_MAX_LEN: usize = 1 + (2 * ELEM_MAX_BYTES); diff --git a/ring-0.17.14/src/ec/suite_b.rs b/ring-0.17.14/src/ec/suite_b.rs new file mode 100644 index 0000000000..41adcfdf0b --- /dev/null +++ b/ring-0.17.14/src/ec/suite_b.rs @@ -0,0 +1,239 @@ +// Copyright 2016 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +//! Elliptic curve operations on P-256 & P-384. + +use self::ops::*; +use crate::{arithmetic::montgomery::*, cpu, ec, error, io::der, pkcs8}; + +// NIST SP 800-56A Step 3: "If q is an odd prime p, verify that +// yQ**2 = xQ**3 + axQ + b in GF(p), where the arithmetic is performed modulo +// p." +// +// That is, verify that (x, y) is on the curve, which is true iif: +// +// y**2 == x**3 + a*x + b (mod q) +// +// Or, equivalently, but more efficiently: +// +// y**2 == (x**2 + a)*x + b (mod q) +// +fn verify_affine_point_is_on_the_curve( + q: &Modulus, + (x, y): (&Elem, &Elem), +) -> Result<(), error::Unspecified> { + verify_affine_point_is_on_the_curve_scaled(q, (x, y), &Elem::from(q.a()), &Elem::from(q.b())) +} + +// Use `verify_affine_point_is_on_the_curve` instead of this function whenever +// the affine coordinates are available or will become available. This function +// should only be used then the affine coordinates are never calculated. See +// the notes for `verify_affine_point_is_on_the_curve_scaled`. +// +// The value `z**2` is returned on success because it is useful for ECDSA +// verification. +// +// This function also verifies that the point is not at infinity. +fn verify_jacobian_point_is_on_the_curve( + q: &Modulus, + p: &Point, +) -> Result, error::Unspecified> { + let z = q.point_z(p); + + // Verify that the point is not at infinity. 
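+    // (In Jacobian coordinates the point at infinity is exactly the class of
+    // representatives with z == 0, so a non-zero z rules it out.)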
+ q.elem_verify_is_not_zero(&z)?; + + let x = q.point_x(p); + let y = q.point_y(p); + + // We are given Jacobian coordinates (x, y, z). So, we have: + // + // (x/z**2, y/z**3) == (x', y'), + // + // where (x', y') are the affine coordinates. The curve equation is: + // + // y'**2 == x'**3 + a*x' + b == (x'**2 + a)*x' + b + // + // Substituting our Jacobian coordinates, we get: + // + // / y \**2 / / x \**2 \ / x \ + // | ---- | == | | ---- | + a | * | ---- | + b + // \ z**3 / \ \ z**2 / / \ z**2 / + // + // Simplify: + // + // y**2 / x**2 \ x + // ---- == | ---- + a | * ---- + b + // z**6 \ z**4 / z**2 + // + // Multiply both sides by z**6: + // + // z**6 / x**2 \ z**6 + // ---- * y**2 == | ---- + a | * ---- * x + (z**6) * b + // z**6 \ z**4 / z**2 + // + // Simplify: + // + // / x**2 \ + // y**2 == | ---- + a | * z**4 * x + (z**6) * b + // \ z**4 / + // + // Distribute z**4: + // + // / z**4 \ + // y**2 == | ---- * x**2 + z**4 * a | * x + (z**6) * b + // \ z**4 / + // + // Simplify: + // + // y**2 == (x**2 + z**4 * a) * x + (z**6) * b + // + let z2 = q.elem_squared(&z); + let z4 = q.elem_squared(&z2); + let z4_a = q.elem_product(&z4, &Elem::from(q.a())); + let z6 = q.elem_product(&z4, &z2); + let z6_b = q.elem_product(&z6, &Elem::from(q.b())); + verify_affine_point_is_on_the_curve_scaled(q, (&x, &y), &z4_a, &z6_b)?; + Ok(z2) +} + +// Handles the common logic of point-is-on-the-curve checks for both affine and +// Jacobian cases. +// +// When doing the check that the point is on the curve after a computation, +// to avoid fault attacks or mitigate potential bugs, it is better for security +// to use `verify_affine_point_is_on_the_curve` on the affine coordinates, +// because it provides some protection against faults that occur in the +// computation of the inverse of `z`. See the paper and presentation "Fault +// Attacks on Projective-to-Affine Coordinates Conversion" by Diana Maimuţ, +// Cédric Murdica, David Naccache, Mehdi Tibouchi. That presentation concluded +// simply "Check the validity of the result after conversion to affine +// coordinates." (It seems like a good idea to verify that +// z_inv * z == 1 mod q too). +// +// In the case of affine coordinates (x, y), `a_scaled` and `b_scaled` are +// `a` and `b`, respectively. In the case of Jacobian coordinates (x, y, z), +// the computation and comparison is the same, except `a_scaled` and `b_scaled` +// are (z**4 * a) and (z**6 * b), respectively. Thus, performance is another +// reason to prefer doing the check on the affine coordinates, as Jacobian +// computation requires 3 extra multiplications and 2 extra squarings. +// +// An example of a fault attack that isn't mitigated by a point-on-the-curve +// check after multiplication is given in "Sign Change Fault Attacks On +// Elliptic Curve Cryptosystems" by Johannes Blömer, Martin Otto, and +// Jean-Pierre Seifert. 
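+//
+// As a toy illustration of the scaled check (tiny made-up parameters, not one of
+// the supported curves): over GF(17) with a = 2 and b = 2, the affine point (5, 1)
+// satisfies y**2 == (x**2 + a)*x + b, since (25 + 2)*5 + 2 = 137 == 1 (mod 17).
+// Writing the same point in Jacobian coordinates with z = 3 gives x = 11, y = 10,
+// and scaled coefficients a_scaled = a*z**4 == 9 and b_scaled = b*z**6 == 13
+// (mod 17); the check then computes (121 + 9)*11 + 13 == 15 == y**2 (mod 17).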
+fn verify_affine_point_is_on_the_curve_scaled( + q: &Modulus, + (x, y): (&Elem, &Elem), + a_scaled: &Elem, + b_scaled: &Elem, +) -> Result<(), error::Unspecified> { + let lhs = q.elem_squared(y); + + let mut rhs = q.elem_squared(x); + q.add_assign(&mut rhs, a_scaled); + q.elem_mul(&mut rhs, x); + q.add_assign(&mut rhs, b_scaled); + + if !q.elems_are_equal(&lhs, &rhs).leak() { + return Err(error::Unspecified); + } + + Ok(()) +} + +pub(crate) fn key_pair_from_pkcs8( + curve: &'static ec::Curve, + template: &pkcs8::Template, + input: untrusted::Input, + cpu_features: cpu::Features, +) -> Result { + let (ec_private_key, _) = pkcs8::unwrap_key(template, pkcs8::Version::V1Only, input)?; + let (private_key, public_key) = + ec_private_key.read_all(error::KeyRejected::invalid_encoding(), |input| { + // https://tools.ietf.org/html/rfc5915#section-3 + der::nested( + input, + der::Tag::Sequence, + error::KeyRejected::invalid_encoding(), + |input| key_pair_from_pkcs8_(template, input), + ) + })?; + key_pair_from_bytes(curve, private_key, public_key, cpu_features) +} + +fn key_pair_from_pkcs8_<'a>( + template: &pkcs8::Template, + input: &mut untrusted::Reader<'a>, +) -> Result<(untrusted::Input<'a>, untrusted::Input<'a>), error::KeyRejected> { + let version = der::small_nonnegative_integer(input) + .map_err(|error::Unspecified| error::KeyRejected::invalid_encoding())?; + if version != 1 { + return Err(error::KeyRejected::version_not_supported()); + } + + let private_key = der::expect_tag_and_get_value(input, der::Tag::OctetString) + .map_err(|error::Unspecified| error::KeyRejected::invalid_encoding())?; + + // [0] parameters (optional). + if input.peek(u8::from(der::Tag::ContextSpecificConstructed0)) { + let actual_alg_id = + der::expect_tag_and_get_value(input, der::Tag::ContextSpecificConstructed0) + .map_err(|error::Unspecified| error::KeyRejected::invalid_encoding())?; + if actual_alg_id.as_slice_less_safe() != template.curve_oid().as_slice_less_safe() { + return Err(error::KeyRejected::wrong_algorithm()); + } + } + + // [1] publicKey. The RFC says it is optional, but we require it + // to be present. + let public_key = der::nested( + input, + der::Tag::ContextSpecificConstructed1, + error::Unspecified, + der::bit_string_with_no_unused_bits, + ) + .map_err(|error::Unspecified| error::KeyRejected::invalid_encoding())?; + + Ok((private_key, public_key)) +} + +pub(crate) fn key_pair_from_bytes( + curve: &'static ec::Curve, + private_key_bytes: untrusted::Input, + public_key_bytes: untrusted::Input, + cpu_features: cpu::Features, +) -> Result { + let seed = ec::Seed::from_bytes(curve, private_key_bytes, cpu_features) + .map_err(|error::Unspecified| error::KeyRejected::invalid_component())?; + + let r = ec::KeyPair::derive(seed, cpu_features) + .map_err(|error::Unspecified| error::KeyRejected::unexpected_error())?; + if public_key_bytes.as_slice_less_safe() != r.public_key().as_ref() { + return Err(error::KeyRejected::inconsistent_components()); + } + + Ok(r) +} + +pub mod curve; + +pub mod ecdh; + +pub mod ecdsa; + +mod ops; + +mod private_key; +mod public_key; diff --git a/ring-0.17.14/src/ec/suite_b/curve.rs b/ring-0.17.14/src/ec/suite_b/curve.rs new file mode 100644 index 0000000000..0e9f1a9e62 --- /dev/null +++ b/ring-0.17.14/src/ec/suite_b/curve.rs @@ -0,0 +1,93 @@ +// Copyright 2015-2017 Brian Smith. 
+// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +use crate::{cpu, ec, error, rand}; + +/// A key agreement algorithm. +macro_rules! suite_b_curve { + ( $NAME:ident, $bits:expr, $private_key_ops:expr, $id:expr, + $check_private_key_bytes:ident, $generate_private_key:ident, + $public_from_private:ident) => { + /// Public keys are encoding in uncompressed form using the + /// Octet-String-to-Elliptic-Curve-Point algorithm in + /// [SEC 1: Elliptic Curve Cryptography, Version 2.0]. Public keys are + /// validated during key agreement according to + /// [NIST Special Publication 800-56A, revision 2] and Appendix B.3 of + /// the NSA's [Suite B Implementer's Guide to NIST SP 800-56A]. + /// + /// [SEC 1: Elliptic Curve Cryptography, Version 2.0]: + /// http://www.secg.org/sec1-v2.pdf + /// [NIST Special Publication 800-56A, revision 2]: + /// http://nvlpubs.nist.gov/nistpubs/SpecialPublications/NIST.SP.800-56Ar2.pdf + /// [Suite B Implementer's Guide to NIST SP 800-56A]: + /// https://github.com/briansmith/ring/blob/main/doc/ecdh.pdf + pub static $NAME: ec::Curve = ec::Curve { + public_key_len: 1 + (2 * (($bits + 7) / 8)), + elem_scalar_seed_len: ($bits + 7) / 8, + id: $id, + check_private_key_bytes: $check_private_key_bytes, + generate_private_key: $generate_private_key, + public_from_private: $public_from_private, + }; + + fn $check_private_key_bytes( + bytes: &[u8], + cpu: cpu::Features, + ) -> Result<(), error::Unspecified> { + debug_assert_eq!(bytes.len(), $bits / 8); + ec::suite_b::private_key::check_scalar_big_endian_bytes($private_key_ops, bytes, cpu) + } + + fn $generate_private_key( + rng: &dyn rand::SecureRandom, + out: &mut [u8], + cpu: cpu::Features, + ) -> Result<(), error::Unspecified> { + ec::suite_b::private_key::generate_private_scalar_bytes($private_key_ops, rng, out, cpu) + } + + fn $public_from_private( + public_out: &mut [u8], + private_key: &ec::Seed, + cpu: cpu::Features, + ) -> Result<(), error::Unspecified> { + ec::suite_b::private_key::public_from_private( + $private_key_ops, + public_out, + private_key, + cpu, + ) + } + }; +} + +suite_b_curve!( + P256, + 256, + &ec::suite_b::ops::p256::PRIVATE_KEY_OPS, + ec::CurveID::P256, + p256_check_private_key_bytes, + p256_generate_private_key, + p256_public_from_private +); + +suite_b_curve!( + P384, + 384, + &ec::suite_b::ops::p384::PRIVATE_KEY_OPS, + ec::CurveID::P384, + p384_check_private_key_bytes, + p384_generate_private_key, + p384_public_from_private +); diff --git a/ring-0.17.14/src/ec/suite_b/ecdh.rs b/ring-0.17.14/src/ec/suite_b/ecdh.rs new file mode 100644 index 0000000000..59b4374b8f --- /dev/null +++ b/ring-0.17.14/src/ec/suite_b/ecdh.rs @@ -0,0 +1,243 @@ +// Copyright 2015-2017 Brian Smith. 
+// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +//! ECDH key agreement using the P-256 and P-384 curves. + +use super::{ops::*, private_key::*, public_key::*}; +use crate::{agreement, cpu, ec, error}; + +/// A key agreement algorithm. +macro_rules! ecdh { + ( $NAME:ident, $curve:expr, $name_str:expr, $private_key_ops:expr, + $public_key_ops:expr, $ecdh:ident ) => { + #[doc = "ECDH using the NSA Suite B"] + #[doc=$name_str] + #[doc = "curve."] + /// + /// Public keys are encoding in uncompressed form using the + /// Octet-String-to-Elliptic-Curve-Point algorithm in + /// [SEC 1: Elliptic Curve Cryptography, Version 2.0]. Public keys are + /// validated during key agreement according to + /// [NIST Special Publication 800-56A, revision 2] and Appendix B.3 of + /// the NSA's [Suite B Implementer's Guide to NIST SP 800-56A]. + /// + /// [SEC 1: Elliptic Curve Cryptography, Version 2.0]: + /// http://www.secg.org/sec1-v2.pdf + /// [NIST Special Publication 800-56A, revision 2]: + /// http://nvlpubs.nist.gov/nistpubs/SpecialPublications/NIST.SP.800-56Ar2.pdf + /// [Suite B Implementer's Guide to NIST SP 800-56A]: + /// https://github.com/briansmith/ring/blob/main/doc/ecdh.pdf + pub static $NAME: agreement::Algorithm = agreement::Algorithm { + curve: $curve, + ecdh: $ecdh, + }; + + fn $ecdh( + out: &mut [u8], + my_private_key: &ec::Seed, + peer_public_key: untrusted::Input, + cpu: cpu::Features, + ) -> Result<(), error::Unspecified> { + ecdh( + $private_key_ops, + $public_key_ops, + out, + my_private_key, + peer_public_key, + cpu, + ) + } + }; +} + +ecdh!( + ECDH_P256, + &ec::suite_b::curve::P256, + "P-256 (secp256r1)", + &p256::PRIVATE_KEY_OPS, + &p256::PUBLIC_KEY_OPS, + p256_ecdh +); + +ecdh!( + ECDH_P384, + &ec::suite_b::curve::P384, + "P-384 (secp384r1)", + &p384::PRIVATE_KEY_OPS, + &p384::PUBLIC_KEY_OPS, + p384_ecdh +); + +fn ecdh( + private_key_ops: &PrivateKeyOps, + public_key_ops: &PublicKeyOps, + out: &mut [u8], + my_private_key: &ec::Seed, + peer_public_key: untrusted::Input, + cpu: cpu::Features, +) -> Result<(), error::Unspecified> { + // The NIST SP 800-56Ar2 steps are from section 5.7.1.2 Elliptic Curve + // Cryptography Cofactor Diffie-Hellman (ECC CDH) Primitive. + // + // The "NSA Guide" steps are from section 3.1 of the NSA guide, "Ephemeral + // Unified Model." + + let q = &public_key_ops.common.elem_modulus(cpu); + + // NSA Guide Step 1 is handled separately. + + // NIST SP 800-56Ar2 5.6.2.2.2. + // NSA Guide Step 2. + // + // `parse_uncompressed_point` verifies that the point is not at infinity + // and that it is on the curve, using the Partial Public-Key Validation + // Routine. + let peer_public_key = parse_uncompressed_point(public_key_ops, q, peer_public_key)?; + + // NIST SP 800-56Ar2 Step 1. + // NSA Guide Step 3 (except point at infinity check). 
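+ //
+ // (The ECC CDH primitive computes P = h * d * Q_peer, where d is our
+ // private scalar, and outputs the affine x-coordinate of P as the shared
+ // secret Z.)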
+ // + // Note that the cofactor (h) is one since we only support prime-order + // curves, so we can safely ignore the cofactor. + // + // It is impossible for the result to be the point at infinity because our + // private key is in the range [1, n) and the curve has prime order and + // `parse_uncompressed_point` verified that the peer public key is on the + // curve and not at infinity. However, since the standards require the + // check, we do it using `assert!`. + // + // NIST SP 800-56Ar2 defines "Destroy" thusly: "In this Recommendation, to + // destroy is an action applied to a key or a piece of secret data. After + // a key or a piece of secret data is destroyed, no information about its + // value can be recovered." We interpret "destroy" somewhat liberally: we + // assume that since we throw away the values to be destroyed, no + // information about their values can be recovered. This doesn't meet the + // NSA guide's explicit requirement to "zeroize" them though. + // TODO: this only needs common scalar ops + let n = &private_key_ops.common.scalar_modulus(cpu); + let my_private_key = private_key_as_scalar(n, my_private_key); + let product = private_key_ops.point_mul(&my_private_key, &peer_public_key, cpu); + + // NIST SP 800-56Ar2 Steps 2, 3, 4, and 5. + // NSA Guide Steps 3 (point at infinity check) and 4. + // + // Again, we have a pretty liberal interpretation of the NIST's spec's + // "Destroy" that doesn't meet the NSA requirement to "zeroize." + // `big_endian_affine_from_jacobian` verifies that the result is not at + // infinity and also does an extra check to verify that the point is on + // the curve. + big_endian_affine_from_jacobian(private_key_ops, q, out, None, &product) + + // NSA Guide Step 5 & 6 are deferred to the caller. Again, we have a + // pretty liberal interpretation of the NIST's spec's "Destroy" that + // doesn't meet the NSA requirement to "zeroize." +} + +#[cfg(test)] +mod tests { + use super::super::ops; + use crate::testutil as test; + use crate::{agreement, ec, limb}; + + static SUPPORTED_SUITE_B_ALGS: [(&str, &agreement::Algorithm, &ec::Curve, &ops::CommonOps); 2] = [ + ( + "P-256", + &agreement::ECDH_P256, + &super::super::curve::P256, + &ops::p256::COMMON_OPS, + ), + ( + "P-384", + &agreement::ECDH_P384, + &super::super::curve::P384, + &ops::p384::COMMON_OPS, + ), + ]; + + #[test] + fn test_agreement_suite_b_ecdh_generate() { + // Generates a string of bytes 0x00...00, which will always result in + // a scalar value of zero. + let random_00 = test::rand::FixedByteRandom { byte: 0x00 }; + + // Generates a string of bytes 0xFF...FF, which will be larger than the + // group order of any curve that is supported. + let random_ff = test::rand::FixedByteRandom { byte: 0xff }; + + for &(_, alg, curve, ops) in SUPPORTED_SUITE_B_ALGS.iter() { + // Test that the private key value zero is rejected and that + // `generate` gives up after a while of only getting zeros. + assert!(agreement::EphemeralPrivateKey::generate(alg, &random_00).is_err()); + + // Test that the private key value larger than the group order is + // rejected and that `generate` gives up after a while of only + // getting values larger than the group order. + assert!(agreement::EphemeralPrivateKey::generate(alg, &random_ff).is_err()); + + // Test that a private key value exactly equal to the group order + // is rejected and that `generate` gives up after a while of only + // getting that value from the PRNG. 
+ let mut n_bytes = [0u8; ec::SCALAR_MAX_BYTES]; + let num_bytes = curve.elem_scalar_seed_len; + limb::big_endian_from_limbs(ops.n_limbs(), &mut n_bytes[..num_bytes]); + { + let n_bytes = &mut n_bytes[..num_bytes]; + let rng = test::rand::FixedSliceRandom { bytes: n_bytes }; + assert!(agreement::EphemeralPrivateKey::generate(alg, &rng).is_err()); + } + + // Test that a private key value exactly equal to the group order + // minus 1 is accepted. + let mut n_minus_1_bytes = n_bytes; + { + let n_minus_1_bytes = &mut n_minus_1_bytes[..num_bytes]; + n_minus_1_bytes[num_bytes - 1] -= 1; + let rng = test::rand::FixedSliceRandom { + bytes: n_minus_1_bytes, + }; + let key = agreement::EphemeralPrivateKey::generate(alg, &rng).unwrap(); + assert_eq!(n_minus_1_bytes, key.bytes_for_test()); + } + + // Test that n + 1 also fails. + let mut n_plus_1_bytes = n_bytes; + { + let n_plus_1_bytes = &mut n_plus_1_bytes[..num_bytes]; + n_plus_1_bytes[num_bytes - 1] += 1; + let rng = test::rand::FixedSliceRandom { + bytes: n_plus_1_bytes, + }; + assert!(agreement::EphemeralPrivateKey::generate(alg, &rng).is_err()); + } + + // Test recovery from initial RNG failure. The first value will be + // n, then n + 1, then zero, the next value will be n - 1, which + // will be accepted. + { + let bytes = [ + &n_bytes[..num_bytes], + &n_plus_1_bytes[..num_bytes], + &[0u8; ec::SCALAR_MAX_BYTES][..num_bytes], + &n_minus_1_bytes[..num_bytes], + ]; + let rng = test::rand::FixedSliceSequenceRandom { + bytes: &bytes, + current: core::cell::UnsafeCell::new(0), + }; + let key = agreement::EphemeralPrivateKey::generate(alg, &rng).unwrap(); + assert_eq!(&n_minus_1_bytes[..num_bytes], key.bytes_for_test()); + } + } + } +} diff --git a/ring-0.17.14/src/ec/suite_b/ecdsa.rs b/ring-0.17.14/src/ec/suite_b/ecdsa.rs new file mode 100644 index 0000000000..274c991626 --- /dev/null +++ b/ring-0.17.14/src/ec/suite_b/ecdsa.rs @@ -0,0 +1,3 @@ +mod digest_scalar; +pub mod signing; +pub mod verification; diff --git a/ring-0.17.14/src/ec/suite_b/ecdsa/digest_scalar.rs b/ring-0.17.14/src/ec/suite_b/ecdsa/digest_scalar.rs new file mode 100644 index 0000000000..6479c9490c --- /dev/null +++ b/ring-0.17.14/src/ec/suite_b/ecdsa/digest_scalar.rs @@ -0,0 +1,119 @@ +// Copyright 2015-2016 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +//! ECDSA Signatures using the P-256 and P-384 curves. + +use crate::{digest, ec::suite_b::ops::*}; + +/// Calculate the digest of `msg` using the digest algorithm `digest_alg`. Then +/// convert the digest to a scalar in the range [0, n) as described in +/// NIST's FIPS 186-4 Section 4.2. Note that this is one of the few cases where +/// a `Scalar` is allowed to have the value zero. 
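+///
+/// (For example, with P-256 and SHA-384 the 48-byte digest is truncated to
+/// its leftmost 32 bytes, per the FIPS rule quoted below, and the result is
+/// then reduced once mod `n` if necessary.)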
+/// +/// NIST's FIPS 186-4 4.2 says "When the length of the output of the hash +/// function is greater than N (i.e., the bit length of q), then the leftmost N +/// bits of the hash function output block shall be used in any calculation +/// using the hash function output during the generation or verification of a +/// digital signature." +/// +/// "Leftmost N bits" means "N most significant bits" because we interpret the +/// digest as a bit-endian encoded integer. +/// +/// The NSA guide instead vaguely suggests that we should convert the digest +/// value to an integer and then reduce it mod `n`. However, real-world +/// implementations (e.g. `digest_to_bn` in OpenSSL and `hashToInt` in Go) do +/// what FIPS 186-4 says to do, not what the NSA guide suggests. +/// +/// Why shifting the value right by at most one bit is sufficient: P-256's `n` +/// has its 256th bit set; i.e. 2**255 < n < 2**256. Once we've truncated the +/// digest to 256 bits and converted it to an integer, it will have a value +/// less than 2**256. If the value is larger than `n` then shifting it one bit +/// right will give a value less than 2**255, which is less than `n`. The +/// analogous argument applies for P-384. However, it does *not* apply in +/// general; for example, it doesn't apply to P-521. +pub(super) fn digest_scalar(n: &Modulus, msg: digest::Digest) -> Scalar { + digest_scalar_(n, msg.as_ref()) +} + +#[cfg(test)] +pub(super) fn digest_bytes_scalar(n: &Modulus, digest: &[u8]) -> Scalar { + digest_scalar_(n, digest) +} + +// This is a separate function solely so that we can test specific digest +// values like all-zero values and values larger than `n`. +fn digest_scalar_(n: &Modulus, digest: &[u8]) -> Scalar { + let len = n.bytes_len(); + let digest = if digest.len() > len { + &digest[..len] + } else { + digest + }; + + scalar_parse_big_endian_partially_reduced_variable_consttime(n, untrusted::Input::from(digest)) + .unwrap() +} + +#[cfg(test)] +mod tests { + use super::digest_bytes_scalar; + use crate::testutil as test; + use crate::{cpu, digest, ec::suite_b::ops::*, limb}; + + #[test] + fn test() { + let cpu = cpu::features(); + test::run( + test_vector_file!("ecdsa_digest_scalar_tests.txt"), + |section, test_case| { + assert_eq!(section, ""); + + let curve_name = test_case.consume_string("Curve"); + let digest_name = test_case.consume_string("Digest"); + let input = test_case.consume_bytes("Input"); + let output = test_case.consume_bytes("Output"); + + let (ops, digest_alg) = match (curve_name.as_str(), digest_name.as_str()) { + ("P-256", "SHA256") => (&p256::PUBLIC_SCALAR_OPS, &digest::SHA256), + ("P-256", "SHA384") => (&p256::PUBLIC_SCALAR_OPS, &digest::SHA384), + ("P-384", "SHA256") => (&p384::PUBLIC_SCALAR_OPS, &digest::SHA256), + ("P-384", "SHA384") => (&p384::PUBLIC_SCALAR_OPS, &digest::SHA384), + _ => { + panic!("Unsupported curve+digest: {}+{}", curve_name, digest_name); + } + }; + let n = &ops.scalar_ops.scalar_modulus(cpu); + + assert_eq!(input.len(), digest_alg.output_len()); + assert_eq!(output.len(), ops.scalar_ops.scalar_bytes_len()); + assert_eq!(output.len(), n.bytes_len()); + + let expected = scalar_parse_big_endian_variable( + n, + limb::AllowZero::Yes, + untrusted::Input::from(&output), + ) + .unwrap(); + + let actual = digest_bytes_scalar(n, &input); + assert_eq!( + ops.scalar_ops.leak_limbs(&actual), + ops.scalar_ops.leak_limbs(&expected) + ); + + Ok(()) + }, + ); + } +} diff --git a/ring-0.17.14/src/ec/suite_b/ecdsa/ecPublicKey_p256_pkcs8_v1_template.der 
b/ring-0.17.14/src/ec/suite_b/ecdsa/ecPublicKey_p256_pkcs8_v1_template.der new file mode 100644 index 0000000000..d579082387 Binary files /dev/null and b/ring-0.17.14/src/ec/suite_b/ecdsa/ecPublicKey_p256_pkcs8_v1_template.der differ diff --git a/ring-0.17.14/src/ec/suite_b/ecdsa/ecPublicKey_p384_pkcs8_v1_template.der b/ring-0.17.14/src/ec/suite_b/ecdsa/ecPublicKey_p384_pkcs8_v1_template.der new file mode 100644 index 0000000000..76cc36d403 Binary files /dev/null and b/ring-0.17.14/src/ec/suite_b/ecdsa/ecPublicKey_p384_pkcs8_v1_template.der differ diff --git a/ring-0.17.14/src/ec/suite_b/ecdsa/signing.rs b/ring-0.17.14/src/ec/suite_b/ecdsa/signing.rs new file mode 100644 index 0000000000..c2bc733c8a --- /dev/null +++ b/ring-0.17.14/src/ec/suite_b/ecdsa/signing.rs @@ -0,0 +1,615 @@ +// Copyright 2015-2016 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +//! ECDSA Signatures using the P-256 and P-384 curves. + +use super::digest_scalar::digest_scalar; +use crate::{ + arithmetic::montgomery::*, + cpu, digest, + ec::{ + self, + suite_b::{ops::*, private_key}, + }, + error, + io::der, + limb, pkcs8, rand, sealed, signature, +}; +/// An ECDSA signing algorithm. +pub struct EcdsaSigningAlgorithm { + curve: &'static ec::Curve, + private_scalar_ops: &'static PrivateScalarOps, + private_key_ops: &'static PrivateKeyOps, + digest_alg: &'static digest::Algorithm, + pkcs8_template: &'static pkcs8::Template, + format_rs: fn(ops: &'static ScalarOps, r: &Scalar, s: &Scalar, out: &mut [u8]) -> usize, + id: AlgorithmID, +} + +#[derive(Debug, Eq, PartialEq)] +enum AlgorithmID { + ECDSA_P256_SHA256_FIXED_SIGNING, + ECDSA_P384_SHA384_FIXED_SIGNING, + ECDSA_P256_SHA256_ASN1_SIGNING, + ECDSA_P384_SHA384_ASN1_SIGNING, +} + +derive_debug_via_id!(EcdsaSigningAlgorithm); + +impl PartialEq for EcdsaSigningAlgorithm { + fn eq(&self, other: &Self) -> bool { + self.id == other.id + } +} + +impl Eq for EcdsaSigningAlgorithm {} + +impl sealed::Sealed for EcdsaSigningAlgorithm {} + +/// An ECDSA key pair, used for signing. +pub struct EcdsaKeyPair { + d: Scalar, + nonce_key: NonceRandomKey, + alg: &'static EcdsaSigningAlgorithm, + public_key: PublicKey, +} + +derive_debug_via_field!(EcdsaKeyPair, stringify!(EcdsaKeyPair), public_key); + +impl EcdsaKeyPair { + /// Generates a new key pair and returns the key pair serialized as a + /// PKCS#8 document. + /// + /// The PKCS#8 document will be a v1 `OneAsymmetricKey` with the public key + /// included in the `ECPrivateKey` structure, as described in + /// [RFC 5958 Section 2] and [RFC 5915]. The `ECPrivateKey` structure will + /// not have a `parameters` field so the generated key is compatible with + /// PKCS#11. 
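+ ///
+ /// A typical round trip, as an illustrative sketch only (error handling
+ /// elided):
+ ///
+ /// ```ignore
+ /// let rng = rand::SystemRandom::new();
+ /// let pkcs8 =
+ ///     EcdsaKeyPair::generate_pkcs8(&ECDSA_P256_SHA256_ASN1_SIGNING, &rng)?;
+ /// let key_pair =
+ ///     EcdsaKeyPair::from_pkcs8(&ECDSA_P256_SHA256_ASN1_SIGNING, pkcs8.as_ref(), &rng)?;
+ /// let sig = key_pair.sign(&rng, b"example message")?;
+ /// ```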
+ /// + /// [RFC 5915]: https://tools.ietf.org/html/rfc5915 + /// [RFC 5958 Section 2]: https://tools.ietf.org/html/rfc5958#section-2 + pub fn generate_pkcs8( + alg: &'static EcdsaSigningAlgorithm, + rng: &dyn rand::SecureRandom, + ) -> Result { + let cpu = cpu::features(); + let private_key = ec::Seed::generate(alg.curve, rng, cpu)?; + let public_key = private_key.compute_public_key(cpu)?; + Ok(pkcs8::wrap_key( + alg.pkcs8_template, + private_key.bytes_less_safe(), + public_key.as_ref(), + )) + } + + /// Constructs an ECDSA key pair by parsing an unencrypted PKCS#8 v1 + /// id-ecPublicKey `ECPrivateKey` key. + /// + /// The input must be in PKCS#8 v1 format. It must contain the public key in + /// the `ECPrivateKey` structure; `from_pkcs8()` will verify that the public + /// key and the private key are consistent with each other. The algorithm + /// identifier must identify the curve by name; it must not use an + /// "explicit" encoding of the curve. The `parameters` field of the + /// `ECPrivateKey`, if present, must be the same named curve that is in the + /// algorithm identifier in the PKCS#8 header. + pub fn from_pkcs8( + alg: &'static EcdsaSigningAlgorithm, + pkcs8: &[u8], + rng: &dyn rand::SecureRandom, + ) -> Result { + let key_pair = ec::suite_b::key_pair_from_pkcs8( + alg.curve, + alg.pkcs8_template, + untrusted::Input::from(pkcs8), + cpu::features(), + )?; + Self::new(alg, key_pair, rng) + } + + /// Constructs an ECDSA key pair from the private key and public key bytes + /// + /// The private key must encoded as a big-endian fixed-length integer. For + /// example, a P-256 private key must be 32 bytes prefixed with leading + /// zeros as needed. + /// + /// The public key is encoding in uncompressed form using the + /// Octet-String-to-Elliptic-Curve-Point algorithm in + /// [SEC 1: Elliptic Curve Cryptography, Version 2.0]. + /// + /// This is intended for use by code that deserializes key pairs. It is + /// recommended to use `EcdsaKeyPair::from_pkcs8()` (with a PKCS#8-encoded + /// key) instead. + /// + /// [SEC 1: Elliptic Curve Cryptography, Version 2.0]: + /// http://www.secg.org/sec1-v2.pdf + pub fn from_private_key_and_public_key( + alg: &'static EcdsaSigningAlgorithm, + private_key: &[u8], + public_key: &[u8], + rng: &dyn rand::SecureRandom, + ) -> Result { + let key_pair = ec::suite_b::key_pair_from_bytes( + alg.curve, + untrusted::Input::from(private_key), + untrusted::Input::from(public_key), + cpu::features(), + )?; + Self::new(alg, key_pair, rng) + } + + fn new( + alg: &'static EcdsaSigningAlgorithm, + key_pair: ec::KeyPair, + rng: &dyn rand::SecureRandom, + ) -> Result { + let cpu = cpu::features(); + + let (seed, public_key) = key_pair.split(); + let n = &alg.private_scalar_ops.scalar_ops.scalar_modulus(cpu); + let d = private_key::private_key_as_scalar(n, &seed); + let d = alg.private_scalar_ops.to_mont(&d, cpu); + + let nonce_key = NonceRandomKey::new(alg, &seed, rng)?; + Ok(Self { + d, + nonce_key, + alg, + public_key: PublicKey(public_key), + }) + } + + /// Returns the signature of the `message` using a random nonce generated by `rng`. + pub fn sign( + &self, + rng: &dyn rand::SecureRandom, + message: &[u8], + ) -> Result { + let cpu = cpu::features(); + + // Step 4 (out of order). + let h = digest::digest(self.alg.digest_alg, message); + + // Incorporate `h` into the nonce to hedge against faulty RNGs. (This + // is not an approved random number generator that is mandated in + // the spec.) 
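+ //
+ // Concretely, `NonceRandom` below derives each nonce candidate as
+ // Digest(nonce_key || rng_output || Digest(message)), so even a weak or
+ // repeating RNG output is mixed with per-key and per-message data.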
+ let nonce_rng = NonceRandom { + key: &self.nonce_key, + message_digest: &h, + rng, + }; + + self.sign_digest(h, &nonce_rng, cpu) + } + + #[cfg(test)] + fn sign_with_fixed_nonce_during_test( + &self, + rng: &dyn rand::SecureRandom, + message: &[u8], + ) -> Result { + // Step 4 (out of order). + let h = digest::digest(self.alg.digest_alg, message); + + self.sign_digest(h, rng, cpu::features()) + } + + /// Returns the signature of message digest `h` using a "random" nonce + /// generated by `rng`. + fn sign_digest( + &self, + h: digest::Digest, + rng: &dyn rand::SecureRandom, + cpu: cpu::Features, + ) -> Result { + // NSA Suite B Implementer's Guide to ECDSA Section 3.4.1: ECDSA + // Signature Generation. + + // NSA Guide Prerequisites: + // + // Prior to generating an ECDSA signature, the signatory shall + // obtain: + // + // 1. an authentic copy of the domain parameters, + // 2. a digital signature key pair (d,Q), either generated by a + // method from Appendix A.1, or obtained from a trusted third + // party, + // 3. assurance of the validity of the public key Q (see Appendix + // A.3), and + // 4. assurance that he/she/it actually possesses the associated + // private key d (see [SP800-89] Section 6). + // + // The domain parameters are hard-coded into the source code. + // `EcdsaKeyPair::generate_pkcs8()` can be used to meet the second + // requirement; otherwise, it is up to the user to ensure the key pair + // was obtained from a trusted private key. The constructors for + // `EcdsaKeyPair` ensure that #3 and #4 are met subject to the caveats + // in SP800-89 Section 6. + + let ops = self.alg.private_scalar_ops; + let scalar_ops = ops.scalar_ops; + let cops = scalar_ops.common; + let private_key_ops = self.alg.private_key_ops; + let q = &cops.elem_modulus(cpu); + let n = &scalar_ops.scalar_modulus(cpu); + + for _ in 0..100 { + // XXX: iteration conut? + // Step 1. + let k = private_key::random_scalar(self.alg.private_key_ops, n, rng)?; + let k_inv = ops.scalar_inv_to_mont(&k, cpu); + + // Step 2. + let r = private_key_ops.point_mul_base(&k, cpu); + + // Step 3. + let r = { + let (x, _) = private_key::affine_from_jacobian(private_key_ops, q, &r)?; + let x = q.elem_unencoded(&x); + n.elem_reduced_to_scalar(&x) + }; + if n.is_zero(&r) { + continue; + } + + // Step 4 is done by the caller. + + // Step 5. + let e = digest_scalar(n, h); + + // Step 6. + let s = { + let mut e_plus_dr = scalar_ops.scalar_product(&self.d, &r, cpu); + n.add_assign(&mut e_plus_dr, &e); + scalar_ops.scalar_product(&k_inv, &e_plus_dr, cpu) + }; + if n.is_zero(&s) { + continue; + } + + // Step 7 with encoding. + return Ok(signature::Signature::new(|sig_bytes| { + (self.alg.format_rs)(scalar_ops, &r, &s, sig_bytes) + })); + } + + Err(error::Unspecified) + } +} + +/// Generates an ECDSA nonce in a way that attempts to protect against a faulty +/// `SecureRandom`. +struct NonceRandom<'a> { + key: &'a NonceRandomKey, + message_digest: &'a digest::Digest, + rng: &'a dyn rand::SecureRandom, +} + +impl core::fmt::Debug for NonceRandom<'_> { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + f.debug_struct("NonceRandom").finish() + } +} + +impl rand::sealed::SecureRandom for NonceRandom<'_> { + fn fill_impl(&self, dest: &mut [u8]) -> Result<(), error::Unspecified> { + // Use the same digest algorithm that will be used to digest the + // message. The digest algorithm's output is exactly the right size; + // this is checked below. 
+ // + // XXX(perf): The single iteration will require two digest block + // operations because the amount of data digested is larger than one + // block. + let digest_alg = self.key.0.algorithm(); + let mut ctx = digest::Context::new(digest_alg); + + // Digest the randomized digest of the private key. + let key = self.key.0.as_ref(); + ctx.update(key); + + // The random value is digested between the key and the message so that + // the key and the message are not directly digested in the same digest + // block. + assert!(key.len() <= digest_alg.block_len() / 2); + { + let mut rand = [0u8; digest::MAX_BLOCK_LEN]; + let rand = &mut rand[..digest_alg.block_len() - key.len()]; + assert!(rand.len() >= dest.len()); + self.rng.fill(rand)?; + ctx.update(rand); + } + + ctx.update(self.message_digest.as_ref()); + + let nonce = ctx.finish(); + + // `copy_from_slice()` panics if the lengths differ, so we don't have + // to separately assert that the lengths are the same. + dest.copy_from_slice(nonce.as_ref()); + + Ok(()) + } +} + +impl sealed::Sealed for NonceRandom<'_> {} + +struct NonceRandomKey(digest::Digest); + +impl NonceRandomKey { + fn new( + alg: &EcdsaSigningAlgorithm, + seed: &ec::Seed, + rng: &dyn rand::SecureRandom, + ) -> Result { + let mut rand = [0; digest::MAX_OUTPUT_LEN]; + let rand = &mut rand[0..alg.curve.elem_scalar_seed_len]; + + // XXX: `KeyRejected` isn't the right way to model failure of the RNG, + // but to fix that we'd need to break the API by changing the result type. + // TODO: Fix the API in the next breaking release. + rng.fill(rand) + .map_err(|error::Unspecified| error::KeyRejected::rng_failed())?; + + let mut ctx = digest::Context::new(alg.digest_alg); + ctx.update(rand); + ctx.update(seed.bytes_less_safe()); + Ok(Self(ctx.finish())) + } +} + +impl signature::KeyPair for EcdsaKeyPair { + type PublicKey = PublicKey; + + fn public_key(&self) -> &Self::PublicKey { + &self.public_key + } +} + +#[derive(Clone, Copy)] +pub struct PublicKey(ec::PublicKey); + +derive_debug_self_as_ref_hex_bytes!(PublicKey); + +impl AsRef<[u8]> for PublicKey { + fn as_ref(&self) -> &[u8] { + self.0.as_ref() + } +} + +fn format_rs_fixed(ops: &'static ScalarOps, r: &Scalar, s: &Scalar, out: &mut [u8]) -> usize { + let scalar_len = ops.scalar_bytes_len(); + + let (r_out, rest) = out.split_at_mut(scalar_len); + limb::big_endian_from_limbs(ops.leak_limbs(r), r_out); + + let (s_out, _) = rest.split_at_mut(scalar_len); + limb::big_endian_from_limbs(ops.leak_limbs(s), s_out); + + 2 * scalar_len +} + +fn format_rs_asn1(ops: &'static ScalarOps, r: &Scalar, s: &Scalar, out: &mut [u8]) -> usize { + // This assumes `a` is not zero since neither `r` or `s` is allowed to be + // zero. + fn format_integer_tlv(ops: &ScalarOps, a: &Scalar, out: &mut [u8]) -> usize { + let mut fixed = [0u8; ec::SCALAR_MAX_BYTES + 1]; + let fixed = &mut fixed[..(ops.scalar_bytes_len() + 1)]; + limb::big_endian_from_limbs(ops.leak_limbs(a), &mut fixed[1..]); + + // Since `a_fixed_out` is an extra byte long, it is guaranteed to start + // with a zero. + debug_assert_eq!(fixed[0], 0); + + // There must be at least one non-zero byte since `a` isn't zero. + let first_index = fixed.iter().position(|b| *b != 0).unwrap(); + + // If the first byte has its high bit set, it needs to be prefixed with 0x00. + let first_index = if fixed[first_index] & 0x80 != 0 { + first_index - 1 + } else { + first_index + }; + let value = &fixed[first_index..]; + + out[0] = der::Tag::Integer.into(); + + // Lengths less than 128 are encoded in one byte. 
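+ // For the supported curves this always holds: a scalar is at most 48
+ // bytes (P-384), or 49 with a leading 0x00, so the length of `value`
+ // always fits in a single short-form DER length byte.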
+ assert!(value.len() < 128); + #[allow(clippy::cast_possible_truncation)] + { + out[1] = value.len() as u8; + } + + out[2..][..value.len()].copy_from_slice(value); + + 2 + value.len() + } + + out[0] = der::Tag::Sequence.into(); + let r_tlv_len = format_integer_tlv(ops, r, &mut out[2..]); + let s_tlv_len = format_integer_tlv(ops, s, &mut out[2..][r_tlv_len..]); + + // Lengths less than 128 are encoded in one byte. + let value_len = r_tlv_len + s_tlv_len; + assert!(value_len < 128); + #[allow(clippy::cast_possible_truncation)] + { + out[1] = value_len as u8; + } + + 2 + value_len +} + +/// Signing of fixed-length (PKCS#11 style) ECDSA signatures using the +/// P-256 curve and SHA-256. +/// +/// See "`ECDSA_*_FIXED` Details" in `ring::signature`'s module-level +/// documentation for more details. +pub static ECDSA_P256_SHA256_FIXED_SIGNING: EcdsaSigningAlgorithm = EcdsaSigningAlgorithm { + curve: &ec::suite_b::curve::P256, + private_scalar_ops: &p256::PRIVATE_SCALAR_OPS, + private_key_ops: &p256::PRIVATE_KEY_OPS, + digest_alg: &digest::SHA256, + pkcs8_template: &EC_PUBLIC_KEY_P256_PKCS8_V1_TEMPLATE, + format_rs: format_rs_fixed, + id: AlgorithmID::ECDSA_P256_SHA256_FIXED_SIGNING, +}; + +/// Signing of fixed-length (PKCS#11 style) ECDSA signatures using the +/// P-384 curve and SHA-384. +/// +/// See "`ECDSA_*_FIXED` Details" in `ring::signature`'s module-level +/// documentation for more details. +pub static ECDSA_P384_SHA384_FIXED_SIGNING: EcdsaSigningAlgorithm = EcdsaSigningAlgorithm { + curve: &ec::suite_b::curve::P384, + private_scalar_ops: &p384::PRIVATE_SCALAR_OPS, + private_key_ops: &p384::PRIVATE_KEY_OPS, + digest_alg: &digest::SHA384, + pkcs8_template: &EC_PUBLIC_KEY_P384_PKCS8_V1_TEMPLATE, + format_rs: format_rs_fixed, + id: AlgorithmID::ECDSA_P384_SHA384_FIXED_SIGNING, +}; + +/// Signing of ASN.1 DER-encoded ECDSA signatures using the P-256 curve and +/// SHA-256. +/// +/// See "`ECDSA_*_ASN1` Details" in `ring::signature`'s module-level +/// documentation for more details. +pub static ECDSA_P256_SHA256_ASN1_SIGNING: EcdsaSigningAlgorithm = EcdsaSigningAlgorithm { + curve: &ec::suite_b::curve::P256, + private_scalar_ops: &p256::PRIVATE_SCALAR_OPS, + private_key_ops: &p256::PRIVATE_KEY_OPS, + digest_alg: &digest::SHA256, + pkcs8_template: &EC_PUBLIC_KEY_P256_PKCS8_V1_TEMPLATE, + format_rs: format_rs_asn1, + id: AlgorithmID::ECDSA_P256_SHA256_ASN1_SIGNING, +}; + +/// Signing of ASN.1 DER-encoded ECDSA signatures using the P-384 curve and +/// SHA-384. +/// +/// See "`ECDSA_*_ASN1` Details" in `ring::signature`'s module-level +/// documentation for more details. 
+pub static ECDSA_P384_SHA384_ASN1_SIGNING: EcdsaSigningAlgorithm = EcdsaSigningAlgorithm { + curve: &ec::suite_b::curve::P384, + private_scalar_ops: &p384::PRIVATE_SCALAR_OPS, + private_key_ops: &p384::PRIVATE_KEY_OPS, + digest_alg: &digest::SHA384, + pkcs8_template: &EC_PUBLIC_KEY_P384_PKCS8_V1_TEMPLATE, + format_rs: format_rs_asn1, + id: AlgorithmID::ECDSA_P384_SHA384_ASN1_SIGNING, +}; + +static EC_PUBLIC_KEY_P256_PKCS8_V1_TEMPLATE: pkcs8::Template = pkcs8::Template { + bytes: include_bytes!("ecPublicKey_p256_pkcs8_v1_template.der"), + alg_id_range: core::ops::Range { start: 8, end: 27 }, + curve_id_index: 9, + private_key_index: 0x24, +}; + +static EC_PUBLIC_KEY_P384_PKCS8_V1_TEMPLATE: pkcs8::Template = pkcs8::Template { + bytes: include_bytes!("ecPublicKey_p384_pkcs8_v1_template.der"), + alg_id_range: core::ops::Range { start: 8, end: 24 }, + curve_id_index: 9, + private_key_index: 0x23, +}; + +#[cfg(test)] +mod tests { + use crate::testutil as test; + use crate::{rand, signature}; + + #[test] + fn signature_ecdsa_sign_fixed_test() { + let rng = rand::SystemRandom::new(); + + test::run( + test_vector_file!("ecdsa_sign_fixed_tests.txt"), + |section, test_case| { + assert_eq!(section, ""); + + let curve_name = test_case.consume_string("Curve"); + let digest_name = test_case.consume_string("Digest"); + let msg = test_case.consume_bytes("Msg"); + let d = test_case.consume_bytes("d"); + let q = test_case.consume_bytes("Q"); + let k = test_case.consume_bytes("k"); + + let expected_result = test_case.consume_bytes("Sig"); + + let alg = match (curve_name.as_str(), digest_name.as_str()) { + ("P-256", "SHA256") => &signature::ECDSA_P256_SHA256_FIXED_SIGNING, + ("P-384", "SHA384") => &signature::ECDSA_P384_SHA384_FIXED_SIGNING, + _ => { + panic!("Unsupported curve+digest: {}+{}", curve_name, digest_name); + } + }; + + let private_key = + signature::EcdsaKeyPair::from_private_key_and_public_key(alg, &d, &q, &rng) + .unwrap(); + let rng = test::rand::FixedSliceRandom { bytes: &k }; + + let actual_result = private_key + .sign_with_fixed_nonce_during_test(&rng, &msg) + .unwrap(); + + assert_eq!(actual_result.as_ref(), &expected_result[..]); + + Ok(()) + }, + ); + } + + #[test] + fn signature_ecdsa_sign_asn1_test() { + let rng = rand::SystemRandom::new(); + + test::run( + test_vector_file!("ecdsa_sign_asn1_tests.txt"), + |section, test_case| { + assert_eq!(section, ""); + + let curve_name = test_case.consume_string("Curve"); + let digest_name = test_case.consume_string("Digest"); + let msg = test_case.consume_bytes("Msg"); + let d = test_case.consume_bytes("d"); + let q = test_case.consume_bytes("Q"); + let k = test_case.consume_bytes("k"); + + let expected_result = test_case.consume_bytes("Sig"); + + let alg = match (curve_name.as_str(), digest_name.as_str()) { + ("P-256", "SHA256") => &signature::ECDSA_P256_SHA256_ASN1_SIGNING, + ("P-384", "SHA384") => &signature::ECDSA_P384_SHA384_ASN1_SIGNING, + _ => { + panic!("Unsupported curve+digest: {}+{}", curve_name, digest_name); + } + }; + + let private_key = + signature::EcdsaKeyPair::from_private_key_and_public_key(alg, &d, &q, &rng) + .unwrap(); + let rng = test::rand::FixedSliceRandom { bytes: &k }; + + let actual_result = private_key + .sign_with_fixed_nonce_during_test(&rng, &msg) + .unwrap(); + + assert_eq!(actual_result.as_ref(), &expected_result[..]); + + Ok(()) + }, + ); + } +} diff --git a/ring-0.17.14/src/ec/suite_b/ecdsa/verification.rs b/ring-0.17.14/src/ec/suite_b/ecdsa/verification.rs new file mode 100644 index 0000000000..e500831aa6 --- 
/dev/null +++ b/ring-0.17.14/src/ec/suite_b/ecdsa/verification.rs @@ -0,0 +1,332 @@ +// Copyright 2015-2016 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +//! ECDSA Signatures using the P-256 and P-384 curves. + +use super::digest_scalar::digest_scalar; +use crate::{ + arithmetic::montgomery::*, + cpu, digest, + ec::suite_b::{ops::*, public_key::*, verify_jacobian_point_is_on_the_curve}, + error, + io::der, + limb, sealed, signature, +}; + +/// An ECDSA verification algorithm. +pub struct EcdsaVerificationAlgorithm { + ops: &'static PublicScalarOps, + digest_alg: &'static digest::Algorithm, + split_rs: + for<'a> fn( + ops: &'static ScalarOps, + input: &mut untrusted::Reader<'a>, + ) + -> Result<(untrusted::Input<'a>, untrusted::Input<'a>), error::Unspecified>, + id: AlgorithmID, +} + +#[derive(Debug)] +enum AlgorithmID { + ECDSA_P256_SHA256_ASN1, + ECDSA_P256_SHA256_FIXED, + ECDSA_P256_SHA384_ASN1, + ECDSA_P384_SHA256_ASN1, + ECDSA_P384_SHA384_ASN1, + ECDSA_P384_SHA384_FIXED, +} + +derive_debug_via_id!(EcdsaVerificationAlgorithm); + +impl signature::VerificationAlgorithm for EcdsaVerificationAlgorithm { + fn verify( + &self, + public_key: untrusted::Input, + msg: untrusted::Input, + signature: untrusted::Input, + ) -> Result<(), error::Unspecified> { + let cpu = cpu::features(); + let e = { + // NSA Guide Step 2: "Use the selected hash function to compute H = + // Hash(M)." + let h = digest::digest(self.digest_alg, msg.as_slice_less_safe()); + + // NSA Guide Step 3: "Convert the bit string H to an integer e as + // described in Appendix B.2." + let n = &self.ops.scalar_ops.scalar_modulus(cpu); + digest_scalar(n, h) + }; + + self.verify_digest(public_key, e, signature) + } +} + +impl EcdsaVerificationAlgorithm { + /// This is intentionally not public. + fn verify_digest( + &self, + public_key: untrusted::Input, + e: Scalar, + signature: untrusted::Input, + ) -> Result<(), error::Unspecified> { + let cpu = cpu::features(); + + // NSA Suite B Implementer's Guide to ECDSA Section 3.4.2. + + let public_key_ops = self.ops.public_key_ops; + let scalar_ops = self.ops.scalar_ops; + let q = &public_key_ops.common.elem_modulus(cpu); + let n = &scalar_ops.scalar_modulus(cpu); + + // NSA Guide Prerequisites: + // + // Prior to accepting a verified digital signature as valid the + // verifier shall have: + // + // 1. assurance of the signatory’s claimed identity, + // 2. an authentic copy of the domain parameters, (q, FR, a, b, SEED, + // G, n, h), + // 3. assurance of the validity of the public key, and + // 4. assurance that the claimed signatory actually possessed the + // private key that was used to generate the digital signature at + // the time that the signature was generated. + // + // Prerequisites #1 and #4 are outside the scope of what this function + // can do. 
Prerequisite #2 is handled implicitly as the domain + // parameters are hard-coded into the source. Prerequisite #3 is + // handled by `parse_uncompressed_point`. + let peer_pub_key = parse_uncompressed_point(public_key_ops, q, public_key)?; + + let (r, s) = signature.read_all(error::Unspecified, |input| { + (self.split_rs)(scalar_ops, input) + })?; + + // NSA Guide Step 1: "If r and s are not both integers in the interval + // [1, n − 1], output INVALID." + let r = scalar_parse_big_endian_variable(n, limb::AllowZero::No, r)?; + let s = scalar_parse_big_endian_variable(n, limb::AllowZero::No, s)?; + + // NSA Guide Step 4: "Compute w = s**−1 mod n, using the routine in + // Appendix B.1." + let w = self.ops.scalar_inv_to_mont_vartime(&s, cpu); + + // NSA Guide Step 5: "Compute u1 = (e * w) mod n, and compute + // u2 = (r * w) mod n." + let u1 = scalar_ops.scalar_product(&e, &w, cpu); + let u2 = scalar_ops.scalar_product(&r, &w, cpu); + + // NSA Guide Step 6: "Compute the elliptic curve point + // R = (xR, yR) = u1*G + u2*Q, using EC scalar multiplication and EC + // addition. If R is equal to the point at infinity, output INVALID." + let product = (self.ops.twin_mul)(&u1, &u2, &peer_pub_key, cpu); + + // Verify that the point we computed is on the curve; see + // `verify_affine_point_is_on_the_curve_scaled` for details on why. It + // would be more secure to do the check on the affine coordinates if we + // were going to convert to affine form (again, see + // `verify_affine_point_is_on_the_curve_scaled` for details on why). + // But, we're going to avoid converting to affine for performance + // reasons, so we do the verification using the Jacobian coordinates. + let z2 = verify_jacobian_point_is_on_the_curve(q, &product)?; + + // NSA Guide Step 7: "Compute v = xR mod n." + // NSA Guide Step 8: "Compare v and r0. If v = r0, output VALID; + // otherwise, output INVALID." + // + // Instead, we use Greg Maxwell's trick to avoid the inversion mod `q` + // that would be necessary to compute the affine X coordinate. + let x = q.point_x(&product); + fn sig_r_equals_x(q: &Modulus, r: &Elem, x: &Elem, z2: &Elem) -> bool { + let r_jacobian = q.elem_product(z2, r); + let x = q.elem_unencoded(x); + q.elems_are_equal(&r_jacobian, &x).leak() + } + let mut r = self.ops.scalar_as_elem(&r); + if sig_r_equals_x(q, &r, &x, &z2) { + return Ok(()); + } + if q.elem_less_than_vartime(&r, &self.ops.q_minus_n) { + let n = Elem::from(self.ops.n()); + q.add_assign(&mut r, &n); + if sig_r_equals_x(q, &r, &x, &z2) { + return Ok(()); + } + } + + Err(error::Unspecified) + } +} + +impl sealed::Sealed for EcdsaVerificationAlgorithm {} + +fn split_rs_fixed<'a>( + ops: &'static ScalarOps, + input: &mut untrusted::Reader<'a>, +) -> Result<(untrusted::Input<'a>, untrusted::Input<'a>), error::Unspecified> { + let scalar_len = ops.scalar_bytes_len(); + let r = input.read_bytes(scalar_len)?; + let s = input.read_bytes(scalar_len)?; + Ok((r, s)) +} + +fn split_rs_asn1<'a>( + _ops: &'static ScalarOps, + input: &mut untrusted::Reader<'a>, +) -> Result<(untrusted::Input<'a>, untrusted::Input<'a>), error::Unspecified> { + der::nested(input, der::Tag::Sequence, error::Unspecified, |input| { + let r = der::positive_integer(input)?.big_endian_without_leading_zero_as_input(); + let s = der::positive_integer(input)?.big_endian_without_leading_zero_as_input(); + Ok((r, s)) + }) +} + +/// Verification of fixed-length (PKCS#11 style) ECDSA signatures using the +/// P-256 curve and SHA-256. 
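+///
+/// A fixed-form P-256 signature is exactly 64 bytes: `r || s`, each encoded
+/// as a 32-byte zero-padded big-endian integer. (The ASN.1 form, by contrast,
+/// is a DER `SEQUENCE` of two `INTEGER`s and is variable-length, at most
+/// 72 bytes for P-256.)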
+/// +/// See "`ECDSA_*_FIXED` Details" in `ring::signature`'s module-level +/// documentation for more details. +pub static ECDSA_P256_SHA256_FIXED: EcdsaVerificationAlgorithm = EcdsaVerificationAlgorithm { + ops: &p256::PUBLIC_SCALAR_OPS, + digest_alg: &digest::SHA256, + split_rs: split_rs_fixed, + id: AlgorithmID::ECDSA_P256_SHA256_FIXED, +}; + +/// Verification of fixed-length (PKCS#11 style) ECDSA signatures using the +/// P-384 curve and SHA-384. +/// +/// See "`ECDSA_*_FIXED` Details" in `ring::signature`'s module-level +/// documentation for more details. +pub static ECDSA_P384_SHA384_FIXED: EcdsaVerificationAlgorithm = EcdsaVerificationAlgorithm { + ops: &p384::PUBLIC_SCALAR_OPS, + digest_alg: &digest::SHA384, + split_rs: split_rs_fixed, + id: AlgorithmID::ECDSA_P384_SHA384_FIXED, +}; + +/// Verification of ASN.1 DER-encoded ECDSA signatures using the P-256 curve +/// and SHA-256. +/// +/// See "`ECDSA_*_ASN1` Details" in `ring::signature`'s module-level +/// documentation for more details. +pub static ECDSA_P256_SHA256_ASN1: EcdsaVerificationAlgorithm = EcdsaVerificationAlgorithm { + ops: &p256::PUBLIC_SCALAR_OPS, + digest_alg: &digest::SHA256, + split_rs: split_rs_asn1, + id: AlgorithmID::ECDSA_P256_SHA256_ASN1, +}; + +/// *Not recommended*. Verification of ASN.1 DER-encoded ECDSA signatures using +/// the P-256 curve and SHA-384. +/// +/// In most situations, P-256 should be used only with SHA-256 and P-384 +/// should be used only with SHA-384. However, in some cases, particularly TLS +/// on the web, it is necessary to support P-256 with SHA-384 for compatibility +/// with widely-deployed implementations that do not follow these guidelines. +/// +/// See "`ECDSA_*_ASN1` Details" in `ring::signature`'s module-level +/// documentation for more details. +pub static ECDSA_P256_SHA384_ASN1: EcdsaVerificationAlgorithm = EcdsaVerificationAlgorithm { + ops: &p256::PUBLIC_SCALAR_OPS, + digest_alg: &digest::SHA384, + split_rs: split_rs_asn1, + id: AlgorithmID::ECDSA_P256_SHA384_ASN1, +}; + +/// *Not recommended*. Verification of ASN.1 DER-encoded ECDSA signatures using +/// the P-384 curve and SHA-256. +/// +/// In most situations, P-256 should be used only with SHA-256 and P-384 +/// should be used only with SHA-384. However, in some cases, particularly TLS +/// on the web, it is necessary to support P-256 with SHA-384 for compatibility +/// with widely-deployed implementations that do not follow these guidelines. +/// +/// See "`ECDSA_*_ASN1` Details" in `ring::signature`'s module-level +/// documentation for more details. +pub static ECDSA_P384_SHA256_ASN1: EcdsaVerificationAlgorithm = EcdsaVerificationAlgorithm { + ops: &p384::PUBLIC_SCALAR_OPS, + digest_alg: &digest::SHA256, + split_rs: split_rs_asn1, + id: AlgorithmID::ECDSA_P384_SHA256_ASN1, +}; + +/// Verification of ASN.1 DER-encoded ECDSA signatures using the P-384 curve +/// and SHA-384. +/// +/// See "`ECDSA_*_ASN1` Details" in `ring::signature`'s module-level +/// documentation for more details. 
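+//
+// Illustrative sketch (placeholder names) of driving any of these algorithms
+// through the `signature::VerificationAlgorithm` trait implemented above,
+// where `public_key_bytes` is the SEC 1 uncompressed point and
+// `signature_bytes` is in the format the algorithm expects:
+//
+//     alg.verify(
+//         untrusted::Input::from(public_key_bytes),
+//         untrusted::Input::from(message),
+//         untrusted::Input::from(signature_bytes),
+//     )?;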
+pub static ECDSA_P384_SHA384_ASN1: EcdsaVerificationAlgorithm = EcdsaVerificationAlgorithm { + ops: &p384::PUBLIC_SCALAR_OPS, + digest_alg: &digest::SHA384, + split_rs: split_rs_asn1, + id: AlgorithmID::ECDSA_P384_SHA384_ASN1, +}; + +#[cfg(test)] +mod tests { + extern crate alloc; + use super::*; + use crate::testutil as test; + use alloc::{vec, vec::Vec}; + + #[test] + fn test_digest_based_test_vectors() { + let cpu = cpu::features(); + test::run( + test_vector_file!("../../../../crypto/fipsmodule/ecdsa/ecdsa_verify_tests.txt"), + |section, test_case| { + assert_eq!(section, ""); + + let curve_name = test_case.consume_string("Curve"); + + let public_key = { + let mut public_key = vec![0x04]; + public_key.extend(&test_case.consume_bytes("X")); + public_key.extend(&test_case.consume_bytes("Y")); + public_key + }; + + let digest = test_case.consume_bytes("Digest"); + + let sig = { + let mut sig = Vec::new(); + sig.extend(&test_case.consume_bytes("R")); + sig.extend(&test_case.consume_bytes("S")); + sig + }; + + let invalid = test_case.consume_optional_string("Invalid"); + + let alg = match curve_name.as_str() { + "P-256" => &ECDSA_P256_SHA256_FIXED, + "P-384" => &ECDSA_P384_SHA384_FIXED, + _ => { + panic!("Unsupported curve: {}", curve_name); + } + }; + let n = &alg.ops.scalar_ops.scalar_modulus(cpu); + + let digest = super::super::digest_scalar::digest_bytes_scalar(n, &digest[..]); + let actual_result = alg.verify_digest( + untrusted::Input::from(&public_key[..]), + digest, + untrusted::Input::from(&sig[..]), + ); + assert_eq!(actual_result.is_ok(), invalid.is_none()); + + Ok(()) + }, + ); + } +} diff --git a/ring-0.17.14/src/ec/suite_b/ops.rs b/ring-0.17.14/src/ec/suite_b/ops.rs new file mode 100644 index 0000000000..7c2ed7f208 --- /dev/null +++ b/ring-0.17.14/src/ec/suite_b/ops.rs @@ -0,0 +1,1406 @@ +// Copyright 2016 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +use crate::{ + arithmetic::limbs_from_hex, + arithmetic::montgomery::*, + bb::LeakyWord, + cpu, + error::{self, LenMismatchError}, + limb::*, +}; +use core::marker::PhantomData; + +use elem::{mul_mont, unary_op, unary_op_assign, unary_op_from_binary_op_assign}; + +/// A field element, i.e. an element of ℤ/qℤ for the curve's field modulus +/// *q*. +pub type Elem = elem::Elem; +type PublicElem = elem::PublicElem; + +/// Represents the (prime) order *q* of the curve's prime field. +#[derive(Clone, Copy)] +pub enum Q {} + +/// A scalar. Its value is in [0, n). Zero-valued scalars are forbidden in most +/// contexts. +pub type Scalar = elem::Elem; +type PublicScalar = elem::PublicElem; + +/// Represents the prime order *n* of the curve's group. 
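+//
+// For both supported curves the field prime q and the group order n satisfy
+// n < q < 2*n; the reductions below (e.g. `elem_reduced_to_scalar`) rely on
+// this, since it means a fully reduced field element can be brought into
+// [0, n) with at most one conditional subtraction of n.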
+#[derive(Clone, Copy)] +pub enum N {} + +pub(super) struct Modulus { + // TODO: [Limb; elem::NumLimbs::MAX] + limbs: &'static [Limb; elem::NumLimbs::MAX], + num_limbs: elem::NumLimbs, + cops: &'static CommonOps, + m: PhantomData, + cpu: cpu::Features, +} + +pub struct Point { + // The coordinates are stored in a contiguous array, where the first + // `ops.num_limbs` elements are the X coordinate, the next + // `ops.num_limbs` elements are the Y coordinate, and the next + // `ops.num_limbs` elements are the Z coordinate. This layout is dictated + // by the requirements of the nistz256 code. + xyz: [Limb; 3 * elem::NumLimbs::MAX], +} + +impl Point { + pub fn new_at_infinity() -> Self { + Self { + xyz: [0; 3 * elem::NumLimbs::MAX], + } + } +} + +/// Operations and values needed by all curve operations. +pub struct CommonOps { + num_limbs: elem::NumLimbs, + q: PublicModulus, + n: PublicElem, + + pub a: PublicElem, // Must be -3 mod q + pub b: PublicElem, + + // In all cases, `r`, `a`, and `b` may all alias each other. + elem_mul_mont: unsafe extern "C" fn(r: *mut Limb, a: *const Limb, b: *const Limb), + elem_sqr_mont: unsafe extern "C" fn(r: *mut Limb, a: *const Limb), +} + +impl CommonOps { + pub(super) fn elem_modulus(&'static self, cpu_features: cpu::Features) -> Modulus { + Modulus { + // TODO: limbs: self.q.p.map(Limb::from), + limbs: &self.q.p, + num_limbs: self.num_limbs, + cops: self, + m: PhantomData, + cpu: cpu_features, + } + } + + pub(super) fn scalar_modulus(&'static self, cpu_features: cpu::Features) -> Modulus { + Modulus { + // TODO: limbs: self.n.limbs.map(Limb::from), + limbs: &self.n.limbs, + num_limbs: self.num_limbs, + cops: self, + m: PhantomData, + cpu: cpu_features, + } + } + + // The length of a field element, which is the same as the length of a + // scalar, in bytes. + pub fn len(&self) -> usize { + // Keep in sync with `Modulus::len()` + self.num_limbs.into() * LIMB_BYTES + } + + #[cfg(test)] + pub(super) fn n_limbs(&self) -> &[Limb] { + &self.n.limbs[..self.num_limbs.into()] + } +} + +impl Modulus { + pub fn cpu(&self) -> cpu::Features { + self.cpu + } + + // Keep in sync with `CommonOps::len()`. 
+ pub fn bytes_len(&self) -> usize { + self.num_limbs.into() * LIMB_BYTES + } +} + +impl Modulus { + #[inline] + pub fn add_assign(&self, a: &mut elem::Elem, b: &elem::Elem) { + let num_limbs = self.num_limbs.into(); + limbs_add_assign_mod( + &mut a.limbs[..num_limbs], + &b.limbs[..num_limbs], + &self.limbs[..num_limbs], + ) + .unwrap_or_else(unwrap_impossible_len_mismatch_error) + } +} + +impl Modulus { + #[inline] + pub fn elems_are_equal(&self, a: &Elem, b: &Elem) -> LimbMask { + let num_limbs = self.num_limbs.into(); + limbs_equal_limbs_consttime(&a.limbs[..num_limbs], &b.limbs[..num_limbs]) + .unwrap_or_else(unwrap_impossible_len_mismatch_error) + } + + #[inline] + pub fn elem_unencoded(&self, a: &Elem) -> Elem { + self.elem_product(a, &Elem::one()) + } +} + +impl CommonOps { + #[inline] + fn is_zero(&self, a: &elem::Elem) -> bool { + let num_limbs = self.num_limbs.into(); + limbs_are_zero(&a.limbs[..num_limbs]).leak() + } + + #[inline] + fn elem_mul(&self, a: &mut Elem, b: &Elem, _cpu: cpu::Features) { + elem::binary_op_assign(self.elem_mul_mont, a, b) + } + + #[inline] + fn elem_product( + &self, + a: &Elem, + b: &Elem, + _cpu: cpu::Features, + ) -> Elem<<(EA, EB) as ProductEncoding>::Output> + where + (EA, EB): ProductEncoding, + { + mul_mont(self.elem_mul_mont, a, b) + } + + #[inline] + fn elem_square(&self, a: &mut Elem, _cpu: cpu::Features) { + unary_op_assign(self.elem_sqr_mont, a); + } + + #[inline] + fn elem_squared(&self, a: &Elem, _cpu: cpu::Features) -> Elem { + unary_op(self.elem_sqr_mont, a) + } +} + +impl Modulus { + #[inline] + pub fn elem_mul(&self, a: &mut Elem, b: &Elem) { + self.cops.elem_mul(a, b, self.cpu) + } + + #[inline] + pub fn elem_product( + &self, + a: &Elem, + b: &Elem, + ) -> Elem<<(EA, EB) as ProductEncoding>::Output> + where + (EA, EB): ProductEncoding, + { + self.cops.elem_product(a, b, self.cpu) + } + + #[inline] + pub fn elem_square(&self, a: &mut Elem) { + self.cops.elem_square(a, self.cpu) + } + + #[inline] + pub fn elem_squared(&self, a: &Elem) -> Elem { + self.cops.elem_squared(a, self.cpu) + } +} + +impl Modulus { + #[inline] + pub fn is_zero(&self, a: &elem::Elem) -> bool { + self.cops.is_zero(a) + } +} + +impl Modulus { + pub fn elem_verify_is_not_zero(&self, a: &Elem) -> Result<(), error::Unspecified> { + if self.is_zero(a) { + Err(error::Unspecified) + } else { + Ok(()) + } + } + + pub(super) fn a(&self) -> &'static PublicElem { + &self.cops.a + } + pub(super) fn b(&self) -> &'static PublicElem { + &self.cops.b + } +} + +impl PrivateKeyOps { + pub(super) fn point_sum(&self, a: &Point, b: &Point, _cpu: cpu::Features) -> Point { + let mut r = Point::new_at_infinity(); + unsafe { + (self.point_add_jacobian_impl)(r.xyz.as_mut_ptr(), a.xyz.as_ptr(), b.xyz.as_ptr()) + } + r + } +} + +impl Modulus { + pub fn point_x(&self, p: &Point) -> Elem { + let num_limbs = self.num_limbs.into(); + let mut r = Elem::zero(); + r.limbs[..num_limbs].copy_from_slice(&p.xyz[0..num_limbs]); + r + } + + pub fn point_y(&self, p: &Point) -> Elem { + let num_limbs = self.num_limbs.into(); + let mut r = Elem::zero(); + r.limbs[..num_limbs].copy_from_slice(&p.xyz[num_limbs..(2 * num_limbs)]); + r + } + + pub fn point_z(&self, p: &Point) -> Elem { + let num_limbs = self.num_limbs.into(); + let mut r = Elem::zero(); + r.limbs[..num_limbs].copy_from_slice(&p.xyz[(2 * num_limbs)..(3 * num_limbs)]); + r + } +} + +struct PublicModulus { + p: [LeakyLimb; elem::NumLimbs::MAX], + rr: PublicElem, +} + +/// Operations on private keys, for ECDH and ECDSA signing. 
+pub struct PrivateKeyOps { + pub common: &'static CommonOps, + elem_inv_squared: fn(q: &Modulus, a: &Elem) -> Elem, + point_mul_base_impl: fn(a: &Scalar, cpu: cpu::Features) -> Point, + point_mul_impl: unsafe extern "C" fn( + r: *mut Limb, // [3][num_limbs] + p_scalar: *const Limb, // [num_limbs] + p_x: *const Limb, // [num_limbs] + p_y: *const Limb, // [num_limbs] + ), + point_add_jacobian_impl: unsafe extern "C" fn(r: *mut Limb, a: *const Limb, b: *const Limb), +} + +impl PrivateKeyOps { + pub fn leak_limbs<'a>(&self, a: &'a Elem) -> &'a [Limb] { + &a.limbs[..self.common.num_limbs.into()] + } + + #[inline(always)] + pub(super) fn point_mul_base(&self, a: &Scalar, cpu: cpu::Features) -> Point { + (self.point_mul_base_impl)(a, cpu) + } + + #[inline(always)] + pub(super) fn point_mul( + &self, + p_scalar: &Scalar, + (p_x, p_y): &(Elem, Elem), + _cpu: cpu::Features, + ) -> Point { + let mut r = Point::new_at_infinity(); + unsafe { + (self.point_mul_impl)( + r.xyz.as_mut_ptr(), + p_scalar.limbs.as_ptr(), + p_x.limbs.as_ptr(), + p_y.limbs.as_ptr(), + ); + } + r + } + + #[inline] + pub(super) fn elem_inverse_squared(&self, q: &Modulus, a: &Elem) -> Elem { + (self.elem_inv_squared)(q, a) + } +} + +/// Operations and values needed by all operations on public keys (ECDH +/// agreement and ECDSA verification). +pub struct PublicKeyOps { + pub common: &'static CommonOps, +} + +impl PublicKeyOps { + // The serialized bytes are in big-endian order, zero-padded. The limbs + // of `Elem` are in the native endianness, least significant limb to + // most significant limb. Besides the parsing, conversion, this also + // implements NIST SP 800-56A Step 2: "Verify that xQ and yQ are integers + // in the interval [0, p-1] in the case that q is an odd prime p[.]" + pub(super) fn elem_parse( + &self, + q: &Modulus, + input: &mut untrusted::Reader, + ) -> Result, error::Unspecified> { + let _cpu = cpu::features(); + let encoded_value = input.read_bytes(self.common.len())?; + let parsed = elem_parse_big_endian_fixed_consttime(q, encoded_value)?; + let mut r = Elem::zero(); + let rr = Elem::from(&self.common.q.rr); + // Montgomery encode (elem_to_mont). + // TODO: do something about this. + unsafe { + (self.common.elem_mul_mont)( + r.limbs.as_mut_ptr(), + parsed.limbs.as_ptr(), + rr.limbs.as_ptr(), + ) + } + Ok(r) + } +} + +// Operations used by both ECDSA signing and ECDSA verification. In general +// these must be side-channel resistant. +pub struct ScalarOps { + pub common: &'static CommonOps, + + scalar_mul_mont: unsafe extern "C" fn(r: *mut Limb, a: *const Limb, b: *const Limb), +} + +impl ScalarOps { + pub(super) fn scalar_modulus(&'static self, cpu_features: cpu::Features) -> Modulus { + self.common.scalar_modulus(cpu_features) + } + + // The (maximum) length of a scalar, not including any padding. + pub fn scalar_bytes_len(&self) -> usize { + self.common.len() + } +} + +impl ScalarOps { + pub fn leak_limbs<'s>(&self, s: &'s Scalar) -> &'s [Limb] { + &s.limbs[..self.common.num_limbs.into()] + } + + #[inline] + pub(super) fn scalar_product( + &self, + a: &Scalar, + b: &Scalar, + _cpu: cpu::Features, + ) -> Scalar<<(EA, EB) as ProductEncoding>::Output> + where + (EA, EB): ProductEncoding, + { + mul_mont(self.scalar_mul_mont, a, b) + } +} + +/// Operations on public scalars needed by ECDSA signature verification. 
+pub struct PublicScalarOps { + pub scalar_ops: &'static ScalarOps, + pub public_key_ops: &'static PublicKeyOps, + + pub(super) twin_mul: fn( + g_scalar: &Scalar, + p_scalar: &Scalar, + p_xy: &(Elem, Elem), + cpu: cpu::Features, + ) -> Point, + scalar_inv_to_mont_vartime: fn(s: &Scalar, cpu: cpu::Features) -> Scalar, + pub(super) q_minus_n: PublicElem, +} + +impl PublicScalarOps { + pub fn n(&self) -> &PublicElem { + &self.scalar_ops.common.n + } + + #[inline] + pub fn scalar_as_elem(&self, a: &Scalar) -> Elem { + Elem { + limbs: a.limbs, + m: PhantomData, + encoding: PhantomData, + } + } +} + +impl Modulus { + pub fn elem_less_than_vartime(&self, a: &Elem, b: &PublicElem) -> bool { + let num_limbs = self.num_limbs.into(); + limbs_less_than_limbs_vartime(&a.limbs[..num_limbs], &b.limbs[..num_limbs]) + .unwrap_or_else(|LenMismatchError { .. }| unreachable!()) + } +} + +impl PublicScalarOps { + pub(super) fn scalar_inv_to_mont_vartime( + &self, + s: &Scalar, + cpu: cpu::Features, + ) -> Scalar { + (self.scalar_inv_to_mont_vartime)(s, cpu) + } +} + +#[allow(non_snake_case)] +pub struct PrivateScalarOps { + pub scalar_ops: &'static ScalarOps, + + oneRR_mod_n: PublicScalar, // 1 * R**2 (mod n). TOOD: Use One. + scalar_inv_to_mont: fn(a: Scalar, cpu: cpu::Features) -> Scalar, +} + +impl PrivateScalarOps { + pub(super) fn to_mont(&self, s: &Scalar, cpu: cpu::Features) -> Scalar { + self.scalar_ops + .scalar_product(s, &Scalar::from(&self.oneRR_mod_n), cpu) + } + + /// Returns the modular inverse of `a` (mod `n`). Panics if `a` is zero. + pub(super) fn scalar_inv_to_mont(&self, a: &Scalar, cpu: cpu::Features) -> Scalar { + assert!(!self.scalar_ops.common.is_zero(a)); + let a = self.to_mont(a, cpu); + (self.scalar_inv_to_mont)(a, cpu) + } +} + +// XXX: Inefficient and unnecessarily depends on `PrivateKeyOps`. TODO: implement interleaved wNAF +// multiplication. +fn twin_mul_inefficient( + ops: &PrivateKeyOps, + g_scalar: &Scalar, + p_scalar: &Scalar, + p_xy: &(Elem, Elem), + cpu: cpu::Features, +) -> Point { + let scaled_g = ops.point_mul_base(g_scalar, cpu); + let scaled_p = ops.point_mul(p_scalar, p_xy, cpu); + ops.point_sum(&scaled_g, &scaled_p, cpu) +} + +// This assumes n < q < 2*n. +impl Modulus { + pub fn elem_reduced_to_scalar(&self, elem: &Elem) -> Scalar { + let num_limbs = self.num_limbs.into(); + let mut r_limbs = elem.limbs; + limbs_reduce_once(&mut r_limbs[..num_limbs], &self.limbs[..num_limbs]) + .unwrap_or_else(unwrap_impossible_len_mismatch_error); + Scalar { + limbs: r_limbs, + m: PhantomData, + encoding: PhantomData, + } + } +} + +// Returns (`a` squared `squarings` times) * `b`. +fn elem_sqr_mul( + ops: &CommonOps, + a: &Elem, + squarings: LeakyWord, + b: &Elem, + cpu: cpu::Features, +) -> Elem { + debug_assert!(squarings >= 1); + let mut tmp = ops.elem_squared(a, cpu); + for _ in 1..squarings { + ops.elem_square(&mut tmp, cpu); + } + ops.elem_product(&tmp, b, cpu) +} + +// Sets `acc` = (`acc` squared `squarings` times) * `b`. 
+fn elem_sqr_mul_acc( + ops: &CommonOps, + acc: &mut Elem, + squarings: LeakyWord, + b: &Elem, + cpu: cpu::Features, +) { + debug_assert!(squarings >= 1); + for _ in 0..squarings { + ops.elem_square(acc, cpu); + } + ops.elem_mul(acc, b, cpu) +} + +#[inline] +pub(super) fn elem_parse_big_endian_fixed_consttime( + q: &Modulus, + bytes: untrusted::Input, +) -> Result, error::Unspecified> { + parse_big_endian_fixed_consttime(q, bytes, AllowZero::Yes) +} + +#[inline] +pub(super) fn scalar_parse_big_endian_fixed_consttime( + n: &Modulus, + bytes: untrusted::Input, +) -> Result { + parse_big_endian_fixed_consttime(n, bytes, AllowZero::No) +} + +#[inline] +pub(super) fn scalar_parse_big_endian_variable( + n: &Modulus, + allow_zero: AllowZero, + bytes: untrusted::Input, +) -> Result { + let num_limbs = n.num_limbs.into(); + let mut r = Scalar::zero(); + parse_big_endian_in_range_and_pad_consttime( + bytes, + allow_zero, + &n.limbs[..num_limbs], + &mut r.limbs[..num_limbs], + )?; + Ok(r) +} + +pub(super) fn scalar_parse_big_endian_partially_reduced_variable_consttime( + n: &Modulus, + bytes: untrusted::Input, +) -> Result { + let num_limbs = n.num_limbs.into(); + let mut r = Scalar::zero(); + { + let r = &mut r.limbs[..num_limbs]; + parse_big_endian_and_pad_consttime(bytes, r)?; + limbs_reduce_once(r, &n.limbs[..num_limbs]) + .unwrap_or_else(unwrap_impossible_len_mismatch_error); + } + + Ok(r) +} + +fn parse_big_endian_fixed_consttime( + m: &Modulus, + bytes: untrusted::Input, + allow_zero: AllowZero, +) -> Result, error::Unspecified> { + let num_limbs = m.num_limbs.into(); + if bytes.len() != m.bytes_len() { + return Err(error::Unspecified); + } + let mut r = elem::Elem::zero(); + parse_big_endian_in_range_and_pad_consttime( + bytes, + allow_zero, + &m.limbs[..num_limbs], + &mut r.limbs[..num_limbs], + )?; + Ok(r) +} + +#[cold] +#[inline(never)] +fn unwrap_impossible_len_mismatch_error(LenMismatchError { .. 
}: LenMismatchError) -> T { + unreachable!() +} + +#[cfg(test)] +mod tests { + extern crate alloc; + use super::*; + use crate::testutil as test; + use alloc::{format, vec, vec::Vec}; + + const ZERO_SCALAR: Scalar = Scalar { + limbs: [0; elem::NumLimbs::MAX], + m: PhantomData, + encoding: PhantomData, + }; + + trait Convert { + fn convert(self, q: &Modulus) -> Elem; + } + + impl Convert for Elem { + fn convert(self, _q: &Modulus) -> Elem { + self + } + } + + impl Convert for Elem { + fn convert(self, q: &Modulus) -> Elem { + q.elem_unencoded(&self) + } + } + + fn q_minus_n_plus_n_equals_0_test(ops: &PublicScalarOps) { + let cops = ops.scalar_ops.common; + let q = &cops.elem_modulus(cpu::features()); + let mut x = Elem::from(&ops.q_minus_n); + q.add_assign(&mut x, &Elem::from(&cops.n)); + assert!(q.is_zero(&x)); + } + + #[test] + fn p256_q_minus_n_plus_n_equals_0_test() { + q_minus_n_plus_n_equals_0_test(&p256::PUBLIC_SCALAR_OPS); + } + + #[test] + fn p384_q_minus_n_plus_n_equals_0_test() { + q_minus_n_plus_n_equals_0_test(&p384::PUBLIC_SCALAR_OPS); + } + + #[test] + fn p256_elem_add_test() { + elem_add_test( + &p256::PUBLIC_SCALAR_OPS, + test_vector_file!("ops/p256_elem_sum_tests.txt"), + ); + } + + #[test] + fn p384_elem_add_test() { + elem_add_test( + &p384::PUBLIC_SCALAR_OPS, + test_vector_file!("ops/p384_elem_sum_tests.txt"), + ); + } + + fn elem_add_test(ops: &PublicScalarOps, test_file: test::File) { + let cops = ops.public_key_ops.common; + let q = &cops.elem_modulus(cpu::features()); + test::run(test_file, |section, test_case| { + assert_eq!(section, ""); + + let a = consume_elem(q, test_case, "a"); + let b = consume_elem(q, test_case, "b"); + let expected_sum = consume_elem(q, test_case, "r"); + + let mut actual_sum = a; + q.add_assign(&mut actual_sum, &b); + assert_limbs_are_equal(cops, &actual_sum.limbs, &expected_sum.limbs); + + let mut actual_sum = b; + q.add_assign(&mut actual_sum, &a); + assert_limbs_are_equal(cops, &actual_sum.limbs, &expected_sum.limbs); + + Ok(()) + }) + } + + // XXX: There's no `p256_sub` in *ring*; it's logic is inlined into + // the point arithmetic functions. Thus, we can't test it. + + #[test] + fn p384_elem_sub_test() { + prefixed_extern! { + fn p384_elem_sub(r: *mut Limb, a: *const Limb, b: *const Limb); + } + elem_sub_test( + &p384::COMMON_OPS, + p384_elem_sub, + test_vector_file!("ops/p384_elem_sum_tests.txt"), + ); + } + + fn elem_sub_test( + ops: &'static CommonOps, + elem_sub: unsafe extern "C" fn(r: *mut Limb, a: *const Limb, b: *const Limb), + test_file: test::File, + ) { + let q = &ops.elem_modulus(cpu::features()); + test::run(test_file, |section, test_case| { + assert_eq!(section, ""); + + let a = consume_elem(q, test_case, "a"); + let b = consume_elem(q, test_case, "b"); + let r = consume_elem(q, test_case, "r"); + + let mut actual_difference = Elem::::zero(); + unsafe { + elem_sub( + actual_difference.limbs.as_mut_ptr(), + r.limbs.as_ptr(), + b.limbs.as_ptr(), + ); + } + assert_limbs_are_equal(ops, &actual_difference.limbs, &a.limbs); + + let mut actual_difference = Elem::::zero(); + unsafe { + elem_sub( + actual_difference.limbs.as_mut_ptr(), + r.limbs.as_ptr(), + a.limbs.as_ptr(), + ); + } + assert_limbs_are_equal(ops, &actual_difference.limbs, &b.limbs); + + Ok(()) + }) + } + + // XXX: There's no `p256_div_by_2` in *ring*; it's logic is inlined + // into the point arithmetic functions. Thus, we can't test it. + + #[test] + fn p384_elem_div_by_2_test() { + prefixed_extern! 
{ + fn p384_elem_div_by_2(r: *mut Limb, a: *const Limb); + } + elem_div_by_2_test( + &p384::COMMON_OPS, + p384_elem_div_by_2, + test_vector_file!("ops/p384_elem_div_by_2_tests.txt"), + ); + } + + fn elem_div_by_2_test( + ops: &'static CommonOps, + elem_div_by_2: unsafe extern "C" fn(r: *mut Limb, a: *const Limb), + test_file: test::File, + ) { + let q = &ops.elem_modulus(cpu::features()); + test::run(test_file, |section, test_case| { + assert_eq!(section, ""); + + let a = consume_elem(q, test_case, "a"); + let r = consume_elem(q, test_case, "r"); + + let mut actual_result = Elem::::zero(); + unsafe { + elem_div_by_2(actual_result.limbs.as_mut_ptr(), a.limbs.as_ptr()); + } + assert_limbs_are_equal(ops, &actual_result.limbs, &r.limbs); + + Ok(()) + }) + } + + // There is no `ecp_nistz256_neg` on other targets. + #[cfg(target_arch = "x86_64")] + #[test] + fn p256_elem_neg_test() { + prefixed_extern! { + fn ecp_nistz256_neg(r: *mut Limb, a: *const Limb); + } + elem_neg_test( + &p256::COMMON_OPS, + ecp_nistz256_neg, + test_vector_file!("ops/p256_elem_neg_tests.txt"), + ); + } + + #[test] + fn p384_elem_neg_test() { + prefixed_extern! { + fn p384_elem_neg(r: *mut Limb, a: *const Limb); + } + elem_neg_test( + &p384::COMMON_OPS, + p384_elem_neg, + test_vector_file!("ops/p384_elem_neg_tests.txt"), + ); + } + + fn elem_neg_test( + ops: &'static CommonOps, + elem_neg: unsafe extern "C" fn(r: *mut Limb, a: *const Limb), + test_file: test::File, + ) { + let q = &ops.elem_modulus(cpu::features()); + test::run(test_file, |section, test_case| { + assert_eq!(section, ""); + + let a = consume_elem(q, test_case, "a"); + let b = consume_elem(q, test_case, "b"); + + // Verify -a == b. + { + let mut actual_result = Elem::::zero(); + unsafe { + elem_neg(actual_result.limbs.as_mut_ptr(), a.limbs.as_ptr()); + } + assert_limbs_are_equal(ops, &actual_result.limbs, &b.limbs); + } + + // Verify -b == a. 
+ { + let mut actual_result = Elem::::zero(); + unsafe { + elem_neg(actual_result.limbs.as_mut_ptr(), b.limbs.as_ptr()); + } + assert_limbs_are_equal(ops, &actual_result.limbs, &a.limbs); + } + + Ok(()) + }) + } + + #[test] + fn p256_elem_mul_test() { + elem_mul_test( + &p256::COMMON_OPS, + test_vector_file!("ops/p256_elem_mul_tests.txt"), + ); + } + + #[test] + fn p384_elem_mul_test() { + elem_mul_test( + &p384::COMMON_OPS, + test_vector_file!("ops/p384_elem_mul_tests.txt"), + ); + } + + fn elem_mul_test(ops: &'static CommonOps, test_file: test::File) { + let q = &ops.elem_modulus(cpu::features()); + test::run(test_file, |section, test_case| { + assert_eq!(section, ""); + + let mut a = consume_elem(q, test_case, "a"); + let b = consume_elem(q, test_case, "b"); + let r = consume_elem(q, test_case, "r"); + q.elem_mul(&mut a, &b); + assert_limbs_are_equal(ops, &a.limbs, &r.limbs); + + Ok(()) + }) + } + + #[test] + fn p256_scalar_mul_test() { + scalar_mul_test( + &p256::SCALAR_OPS, + test_vector_file!("ops/p256_scalar_mul_tests.txt"), + ); + } + + #[test] + fn p384_scalar_mul_test() { + scalar_mul_test( + &p384::SCALAR_OPS, + test_vector_file!("ops/p384_scalar_mul_tests.txt"), + ); + } + + fn scalar_mul_test(ops: &ScalarOps, test_file: test::File) { + let cpu = cpu::features(); + let cops = ops.common; + let n = &cops.scalar_modulus(cpu); + test::run(test_file, |section, test_case| { + assert_eq!(section, ""); + let a = consume_scalar(n, test_case, "a"); + let b = consume_scalar_mont(n, test_case, "b"); + let expected_result = consume_scalar(n, test_case, "r"); + let actual_result = ops.scalar_product(&a, &b, cpu); + assert_limbs_are_equal(cops, &actual_result.limbs, &expected_result.limbs); + + Ok(()) + }) + } + + #[test] + fn p256_scalar_square_test() { + prefixed_extern! { + fn p256_scalar_sqr_rep_mont(r: *mut Limb, a: *const Limb, rep: LeakyWord); + } + scalar_square_test( + &p256::SCALAR_OPS, + p256_scalar_sqr_rep_mont, + test_vector_file!("ops/p256_scalar_square_tests.txt"), + ); + } + + // XXX: There's no `p384_scalar_square_test()` because there's no dedicated + // `p384_scalar_sqr_rep_mont()`. 
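+
+    // Illustrative sketch, not part of the upstream *ring* sources: the
+    // `*_sqr_rep_mont` routines compute `a` squared `rep` times, i.e.
+    // a**(2**rep), which is the identity `sqr_mul`/`sqr_mul_acc` rely on
+    // before the final multiplication by `b`. A minimal check of that
+    // identity using plain `u64` arithmetic and a hypothetical small odd
+    // modulus (not a real curve order):
+    #[test]
+    fn sqr_rep_equals_power_of_two_exponent_sketch() {
+        fn mul_mod(a: u64, b: u64, n: u64) -> u64 {
+            // Operands are reduced first, so the product fits in a u64 for
+            // this toy modulus.
+            (a % n) * (b % n) % n
+        }
+        fn pow_mod(mut a: u64, mut e: u64, n: u64) -> u64 {
+            let mut r = 1;
+            while e > 0 {
+                if e & 1 == 1 {
+                    r = mul_mod(r, a, n);
+                }
+                a = mul_mod(a, a, n);
+                e >>= 1;
+            }
+            r
+        }
+        let n = 1_000_003u64; // hypothetical small odd modulus
+        let a = 123_456u64;
+        // Squaring five times computes a**(2**5) == a**32 (mod n).
+        let mut squared = a % n;
+        for _ in 0..5 {
+            squared = mul_mod(squared, squared, n);
+        }
+        assert_eq!(squared, pow_mod(a, 32, n));
+    }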
+ + fn scalar_square_test( + ops: &ScalarOps, + sqr_rep: unsafe extern "C" fn(r: *mut Limb, a: *const Limb, rep: LeakyWord), + test_file: test::File, + ) { + let cpu = cpu::features(); + let cops = ops.common; + let n = &cops.scalar_modulus(cpu); + test::run(test_file, |section, test_case| { + assert_eq!(section, ""); + let cpu = cpu::features(); + let a = consume_scalar(n, test_case, "a"); + let expected_result = consume_scalar(n, test_case, "r"); + + { + let mut actual_result: Scalar = Scalar { + limbs: [0; elem::NumLimbs::MAX], + m: PhantomData, + encoding: PhantomData, + }; + unsafe { + sqr_rep(actual_result.limbs.as_mut_ptr(), a.limbs.as_ptr(), 1); + } + assert_limbs_are_equal(cops, &actual_result.limbs, &expected_result.limbs); + } + + { + let actual_result = ops.scalar_product(&a, &a, cpu); + assert_limbs_are_equal(cops, &actual_result.limbs, &expected_result.limbs); + } + + Ok(()) + }) + } + + #[test] + #[should_panic(expected = "!self.scalar_ops.common.is_zero(a)")] + fn p256_scalar_inv_to_mont_zero_panic_test() { + let _ = p256::PRIVATE_SCALAR_OPS.scalar_inv_to_mont(&ZERO_SCALAR, cpu::features()); + } + + #[test] + #[should_panic(expected = "!self.scalar_ops.common.is_zero(a)")] + fn p384_scalar_inv_to_mont_zero_panic_test() { + let _ = p384::PRIVATE_SCALAR_OPS.scalar_inv_to_mont(&ZERO_SCALAR, cpu::features()); + } + + #[test] + fn p256_point_sum_test() { + point_sum_test( + &p256::PRIVATE_KEY_OPS, + test_vector_file!("ops/p256_point_sum_tests.txt"), + ); + } + + #[test] + fn p384_point_sum_test() { + point_sum_test( + &p384::PRIVATE_KEY_OPS, + test_vector_file!("ops/p384_point_sum_tests.txt"), + ); + } + + fn point_sum_test(ops: &PrivateKeyOps, test_file: test::File) { + let cpu = cpu::features(); + + test::run(test_file, |section, test_case| { + assert_eq!(section, ""); + + let a = consume_jacobian_point(ops, test_case, "a"); + let b = consume_jacobian_point(ops, test_case, "b"); + let r_expected: TestPoint = consume_point(ops, test_case, "r"); + + let r_actual = ops.point_sum(&a, &b, cpu); + assert_point_actual_equals_expected(ops, &r_actual, &r_expected); + + Ok(()) + }); + } + + #[test] + fn p256_point_sum_mixed_test() { + prefixed_extern! { + fn p256_point_add_affine( + r: *mut Limb, // [p256::COMMON_OPS.num_limbs*3] + a: *const Limb, // [p256::COMMON_OPS.num_limbs*3] + b: *const Limb, // [p256::COMMON_OPS.num_limbs*2] + ); + } + point_sum_mixed_test( + &p256::PRIVATE_KEY_OPS, + p256_point_add_affine, + test_vector_file!("ops/p256_point_sum_mixed_tests.txt"), + ); + } + + // XXX: There is no `nistz384_point_add_affine()`. + + fn point_sum_mixed_test( + ops: &PrivateKeyOps, + point_add_affine: unsafe extern "C" fn( + r: *mut Limb, // [ops.num_limbs*3] + a: *const Limb, // [ops.num_limbs*3] + b: *const Limb, // [ops.num_limbs*2] + ), + test_file: test::File, + ) { + test::run(test_file, |section, test_case| { + assert_eq!(section, ""); + + let a = consume_jacobian_point(ops, test_case, "a"); + let b = consume_affine_point(ops, test_case, "b"); + let r_expected: TestPoint = consume_point(ops, test_case, "r"); + + let mut r_actual = Point::new_at_infinity(); + unsafe { + point_add_affine(r_actual.xyz.as_mut_ptr(), a.xyz.as_ptr(), b.xy.as_ptr()); + } + + assert_point_actual_equals_expected(ops, &r_actual, &r_expected); + + Ok(()) + }); + } + + #[test] + fn p256_point_double_test() { + prefixed_extern! 
{ + fn p256_point_double( + r: *mut Limb, // [p256::COMMON_OPS.num_limbs*3] + a: *const Limb, // [p256::COMMON_OPS.num_limbs*3] + ); + } + point_double_test( + &p256::PRIVATE_KEY_OPS, + p256_point_double, + test_vector_file!("ops/p256_point_double_tests.txt"), + ); + } + + #[test] + fn p384_point_double_test() { + prefixed_extern! { + fn p384_point_double( + r: *mut Limb, // [p384::COMMON_OPS.num_limbs*3] + a: *const Limb, // [p384::COMMON_OPS.num_limbs*3] + ); + } + point_double_test( + &p384::PRIVATE_KEY_OPS, + p384_point_double, + test_vector_file!("ops/p384_point_double_tests.txt"), + ); + } + + fn point_double_test( + ops: &PrivateKeyOps, + point_double: unsafe extern "C" fn( + r: *mut Limb, // [ops.num_limbs*3] + a: *const Limb, // [ops.num_limbs*3] + ), + test_file: test::File, + ) { + test::run(test_file, |section, test_case| { + assert_eq!(section, ""); + + let a = consume_jacobian_point(ops, test_case, "a"); + let r_expected: TestPoint = consume_point(ops, test_case, "r"); + + let mut r_actual = Point::new_at_infinity(); + unsafe { + point_double(r_actual.xyz.as_mut_ptr(), a.xyz.as_ptr()); + } + + assert_point_actual_equals_expected(ops, &r_actual, &r_expected); + + Ok(()) + }); + } + + /// TODO: We should be testing `point_mul` with points other than the generator. + #[test] + fn p256_point_mul_test() { + let generator = ( + Elem::from(&p256::GENERATOR.0), + Elem::from(&p256::GENERATOR.1), + ); + point_mul_base_tests( + &p256::PRIVATE_KEY_OPS, + |s, cpu| p256::PRIVATE_KEY_OPS.point_mul(s, &generator, cpu), + test_vector_file!("ops/p256_point_mul_base_tests.txt"), + ); + } + + /// TODO: We should be testing `point_mul` with points other than the generator. + #[test] + fn p384_point_mul_test() { + let generator = ( + Elem::from(&p384::GENERATOR.0), + Elem::from(&p384::GENERATOR.1), + ); + + point_mul_base_tests( + &p384::PRIVATE_KEY_OPS, + |s, cpu| p384::PRIVATE_KEY_OPS.point_mul(s, &generator, cpu), + test_vector_file!("ops/p384_point_mul_base_tests.txt"), + ); + } + + #[test] + fn p256_point_mul_serialized_test() { + point_mul_serialized_test( + &p256::PRIVATE_KEY_OPS, + &p256::PUBLIC_KEY_OPS, + test_vector_file!("ops/p256_point_mul_serialized_tests.txt"), + ); + } + + fn point_mul_serialized_test( + priv_ops: &PrivateKeyOps, + pub_ops: &PublicKeyOps, + test_file: test::File, + ) { + let cpu = cpu::features(); + let cops = pub_ops.common; + let q = &cops.elem_modulus(cpu); + let n = &cops.scalar_modulus(cpu); + test::run(test_file, |section, test_case| { + assert_eq!(section, ""); + let p_scalar = consume_scalar(n, test_case, "p_scalar"); + + let p = test_case.consume_bytes("p"); + let p = super::super::public_key::parse_uncompressed_point( + pub_ops, + q, + untrusted::Input::from(&p), + ) + .expect("valid point"); + + let expected_result = test_case.consume_bytes("r"); + + let product = priv_ops.point_mul(&p_scalar, &p, cpu::features()); + + let mut actual_result = vec![4u8; 1 + (2 * cops.len())]; + { + let (x, y) = actual_result[1..].split_at_mut(cops.len()); + super::super::private_key::big_endian_affine_from_jacobian( + priv_ops, + q, + x, + Some(y), + &product, + ) + .expect("successful encoding"); + } + + assert_eq!(expected_result, actual_result); + + Ok(()) + }) + } + + #[test] + fn p256_point_mul_base_test() { + point_mul_base_tests( + &p256::PRIVATE_KEY_OPS, + |s, cpu| p256::PRIVATE_KEY_OPS.point_mul_base(s, cpu), + test_vector_file!("ops/p256_point_mul_base_tests.txt"), + ); + } + + #[test] + fn p384_point_mul_base_test() { + point_mul_base_tests( + 
&p384::PRIVATE_KEY_OPS, + |s, cpu| p384::PRIVATE_KEY_OPS.point_mul_base(s, cpu), + test_vector_file!("ops/p384_point_mul_base_tests.txt"), + ); + } + + pub(super) fn point_mul_base_tests( + ops: &PrivateKeyOps, + f: impl Fn(&Scalar, cpu::Features) -> Point, + test_file: test::File, + ) { + let cpu = cpu::features(); + let n = &ops.common.scalar_modulus(cpu); + test::run(test_file, |section, test_case| { + assert_eq!(section, ""); + let g_scalar = consume_scalar(n, test_case, "g_scalar"); + let expected_result: TestPoint = consume_point(ops, test_case, "r"); + let actual_result = f(&g_scalar, cpu); + assert_point_actual_equals_expected(ops, &actual_result, &expected_result); + Ok(()) + }) + } + + fn assert_point_actual_equals_expected( + ops: &PrivateKeyOps, + actual_point: &Point, + expected_point: &TestPoint, + ) where + Elem: Convert, + { + let cpu = cpu::features(); + + let cops = ops.common; + let q = &cops.elem_modulus(cpu); + let actual_x = &q.point_x(actual_point); + let actual_y = &q.point_y(actual_point); + let actual_z = &q.point_z(actual_point); + match expected_point { + TestPoint::Infinity => { + let zero = Elem::zero(); + assert_elems_are_equal(q, actual_z, &zero); + } + TestPoint::Affine(expected_x, expected_y) => { + let zz_inv = ops.elem_inverse_squared(q, actual_z); + let x_aff = q.elem_product(actual_x, &zz_inv); + let y_aff = { + let zzzz_inv = q.elem_squared(&zz_inv); + let zzz_inv = q.elem_product(actual_z, &zzzz_inv); + q.elem_product(actual_y, &zzz_inv) + }; + + let x_aff = x_aff.convert(q); + let y_aff = y_aff.convert(q); + + assert_elems_are_equal(q, &x_aff, expected_x); + assert_elems_are_equal(q, &y_aff, expected_y); + } + } + } + + fn consume_jacobian_point( + ops: &PrivateKeyOps, + test_case: &mut test::TestCase, + name: &str, + ) -> Point { + let q = &ops.common.elem_modulus(cpu::features()); + let input = test_case.consume_string(name); + let elems = input.split(", ").collect::>(); + assert_eq!(elems.len(), 3); + let mut p = Point::new_at_infinity(); + consume_point_elem(q, &mut p.xyz, &elems, 0); + consume_point_elem(q, &mut p.xyz, &elems, 1); + consume_point_elem(q, &mut p.xyz, &elems, 2); + p + } + + struct AffinePoint { + xy: [Limb; 2 * elem::NumLimbs::MAX], + } + + fn consume_affine_point( + ops: &PrivateKeyOps, + test_case: &mut test::TestCase, + name: &str, + ) -> AffinePoint { + let q = &ops.common.elem_modulus(cpu::features()); + let input = test_case.consume_string(name); + let elems = input.split(", ").collect::>(); + assert_eq!(elems.len(), 2); + let mut p = AffinePoint { + xy: [0; 2 * elem::NumLimbs::MAX], + }; + consume_point_elem(q, &mut p.xy, &elems, 0); + consume_point_elem(q, &mut p.xy, &elems, 1); + p + } + + fn consume_point_elem(q: &Modulus, limbs_out: &mut [Limb], elems: &[&str], i: usize) { + let num_limbs = q.num_limbs.into(); + let bytes = test::from_hex(elems[i]).unwrap(); + let bytes = untrusted::Input::from(&bytes); + let r: Elem = elem_parse_big_endian_fixed_consttime(q, bytes).unwrap(); + // XXX: “Transmute” this to `Elem` limbs. 
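+        // The element types differ only in their `PhantomData` markers, so the
+        // "transmute" is just a verbatim limb copy that the caller reinterprets
+        // under a different `Encoding` parameter; no arithmetic conversion
+        // happens here.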
+ limbs_out[(i * num_limbs)..((i + 1) * num_limbs)].copy_from_slice(&r.limbs[..num_limbs]); + } + + enum TestPoint { + Infinity, + Affine(Elem, Elem), + } + + fn consume_point( + ops: &PrivateKeyOps, + test_case: &mut test::TestCase, + name: &str, + ) -> TestPoint { + let q = &ops.common.elem_modulus(cpu::features()); + fn consume_point_elem(q: &Modulus, elems: &[&str], i: usize) -> Elem { + let bytes = test::from_hex(elems[i]).unwrap(); + let bytes = untrusted::Input::from(&bytes); + let unencoded: Elem = + elem_parse_big_endian_fixed_consttime(q, bytes).unwrap(); + // XXX: “Transmute” this to `Elem` limbs. + Elem { + limbs: unencoded.limbs, + m: PhantomData, + encoding: PhantomData, + } + } + + let input = test_case.consume_string(name); + if input == "inf" { + return TestPoint::Infinity; + } + let elems = input.split(", ").collect::>(); + assert_eq!(elems.len(), 2); + let x = consume_point_elem(q, &elems, 0); + let y = consume_point_elem(q, &elems, 1); + TestPoint::Affine(x, y) + } + + fn assert_elems_are_equal(q: &Modulus, a: &Elem, b: &Elem) { + assert_limbs_are_equal(q.cops, &a.limbs, &b.limbs) + } + + fn assert_limbs_are_equal( + ops: &CommonOps, + actual: &[Limb; elem::NumLimbs::MAX], + expected: &[Limb; elem::NumLimbs::MAX], + ) { + let num_limbs = ops.num_limbs.into(); + if actual[..num_limbs] != expected[..num_limbs] { + let mut actual_s = alloc::string::String::new(); + let mut expected_s = alloc::string::String::new(); + for j in 0..num_limbs { + let width = LIMB_BITS / 4; + let formatted = format!("{:0width$x}", actual[num_limbs - j - 1]); + actual_s.push_str(&formatted); + let formatted = format!("{:0width$x}", expected[num_limbs - j - 1]); + expected_s.push_str(&formatted); + } + panic!( + "Actual != Expected,\nActual = {}, Expected = {}", + actual_s, expected_s + ); + } + } + + fn consume_elem(q: &Modulus, test_case: &mut test::TestCase, name: &str) -> Elem { + let unpadded_bytes = test_case.consume_bytes(name); + let mut bytes = vec![0; q.bytes_len() - unpadded_bytes.len()]; + bytes.extend(&unpadded_bytes); + + let bytes = untrusted::Input::from(&bytes); + let r: Elem = elem_parse_big_endian_fixed_consttime(q, bytes).unwrap(); + // XXX: “Transmute” this to an `Elem`. + Elem { + limbs: r.limbs, + m: PhantomData, + encoding: PhantomData, + } + } + + fn consume_scalar(n: &Modulus, test_case: &mut test::TestCase, name: &str) -> Scalar { + let bytes = test_case.consume_bytes(name); + let bytes = untrusted::Input::from(&bytes); + scalar_parse_big_endian_variable(n, AllowZero::Yes, bytes).unwrap() + } + + fn consume_scalar_mont( + n: &Modulus, + test_case: &mut test::TestCase, + name: &str, + ) -> Scalar { + let bytes = test_case.consume_bytes(name); + let bytes = untrusted::Input::from(&bytes); + let s = scalar_parse_big_endian_variable(n, AllowZero::Yes, bytes).unwrap(); + // “Transmute” it to a `Scalar`. + Scalar { + limbs: s.limbs, + m: PhantomData, + encoding: PhantomData, + } + } +} + +mod elem; +pub mod p256; +pub mod p384; diff --git a/ring-0.17.14/src/ec/suite_b/ops/elem.rs b/ring-0.17.14/src/ec/suite_b/ops/elem.rs new file mode 100644 index 0000000000..6c308d959e --- /dev/null +++ b/ring-0.17.14/src/ec/suite_b/ops/elem.rs @@ -0,0 +1,167 @@ +// Copyright 2017 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. 
+// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +use crate::ec::suite_b::ops::{ + p256::NUM_LIMBS as P256_NUM_LIMBS, p384::NUM_LIMBS as P384_NUM_LIMBS, +}; +use crate::{ + arithmetic::{ + limbs_from_hex, + montgomery::{Encoding, ProductEncoding, Unencoded}, + }, + limb::{LeakyLimb, Limb}, +}; +use core::marker::PhantomData; + +#[derive(Clone, Copy)] +pub(super) enum NumLimbs { + P256, + P384, +} + +impl NumLimbs { + pub(super) const MAX: usize = Self::P384.into(); + + pub(super) const fn into(self) -> usize { + match self { + NumLimbs::P256 => P256_NUM_LIMBS, + NumLimbs::P384 => P384_NUM_LIMBS, + } + } +} + +/// Elements of ℤ/mℤ for some modulus *m*. Elements are always fully reduced +/// with respect to *m*; i.e. the 0 <= x < m for every value x. +#[derive(Clone, Copy)] +pub struct Elem { + // XXX: pub + pub(super) limbs: [Limb; NumLimbs::MAX], + + /// The modulus *m* for the ring ℤ/mℤ for which this element is a value. + pub(super) m: PhantomData, + + /// The number of Montgomery factors that need to be canceled out from + /// `value` to get the actual value. + pub(super) encoding: PhantomData, +} + +pub struct PublicElem { + pub(super) limbs: [LeakyLimb; NumLimbs::MAX], + pub(super) m: PhantomData, + pub(super) encoding: PhantomData, +} + +impl From<&PublicElem> for Elem { + fn from(value: &PublicElem) -> Self { + Self { + limbs: core::array::from_fn(|i| Limb::from(value.limbs[i])), + m: value.m, + encoding: value.encoding, + } + } +} + +impl Elem { + // There's no need to convert `value` to the Montgomery domain since + // 0 * R**2 (mod m) == 0, so neither the modulus nor the encoding are needed + // as inputs for constructing a zero-valued element. 
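+    // (As a concrete illustration with toy numbers, not taken from the code:
+    // with m = 97 and R = 2**8 = 256, the Montgomery encoding of 5 is
+    // 5 * 256 mod 97 = 19, whereas the encoding of 0 is 0 * 256 mod 97 = 0,
+    // so a zero-valued element looks the same in every encoding.)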
+ pub fn zero() -> Self { + Self { + limbs: [0; NumLimbs::MAX], + m: PhantomData, + encoding: PhantomData, + } + } +} + +impl Elem { + pub fn one() -> Self { + let mut r = Self::zero(); + r.limbs[0] = 1; + r + } +} + +impl PublicElem { + pub const fn from_hex(hex: &str) -> Self { + Self { + limbs: limbs_from_hex(hex), + m: PhantomData, + encoding: PhantomData, + } + } +} + +#[inline] +pub fn mul_mont( + f: unsafe extern "C" fn(r: *mut Limb, a: *const Limb, b: *const Limb), + a: &Elem, + b: &Elem, +) -> Elem::Output> +where + (EA, EB): ProductEncoding, +{ + binary_op(f, a, b) +} + +// let r = f(a, b); return r; +#[inline] +pub fn binary_op( + f: unsafe extern "C" fn(r: *mut Limb, a: *const Limb, b: *const Limb), + a: &Elem, + b: &Elem, +) -> Elem { + let mut r = Elem::zero(); + unsafe { f(r.limbs.as_mut_ptr(), a.limbs.as_ptr(), b.limbs.as_ptr()) } + r +} + +// a := f(a, b); +#[inline] +pub fn binary_op_assign( + f: unsafe extern "C" fn(r: *mut Limb, a: *const Limb, b: *const Limb), + a: &mut Elem, + b: &Elem, +) { + unsafe { f(a.limbs.as_mut_ptr(), a.limbs.as_ptr(), b.limbs.as_ptr()) } +} + +// let r = f(a); return r; +#[inline] +pub fn unary_op( + f: unsafe extern "C" fn(r: *mut Limb, a: *const Limb), + a: &Elem, +) -> Elem { + let mut r = Elem::zero(); + unsafe { f(r.limbs.as_mut_ptr(), a.limbs.as_ptr()) } + r +} + +// a := f(a); +#[inline] +pub fn unary_op_assign( + f: unsafe extern "C" fn(r: *mut Limb, a: *const Limb), + a: &mut Elem, +) { + unsafe { f(a.limbs.as_mut_ptr(), a.limbs.as_ptr()) } +} + +// a := f(a, a); +#[inline] +pub fn unary_op_from_binary_op_assign( + f: unsafe extern "C" fn(r: *mut Limb, a: *const Limb, b: *const Limb), + a: &mut Elem, +) { + unsafe { f(a.limbs.as_mut_ptr(), a.limbs.as_ptr(), a.limbs.as_ptr()) } +} diff --git a/ring-0.17.14/src/ec/suite_b/ops/p256.rs b/ring-0.17.14/src/ec/suite_b/ops/p256.rs new file mode 100644 index 0000000000..6f3cd02398 --- /dev/null +++ b/ring-0.17.14/src/ec/suite_b/ops/p256.rs @@ -0,0 +1,334 @@ +// Copyright 2016-2023 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+ +use super::{ + elem::{binary_op, binary_op_assign}, + elem_sqr_mul, elem_sqr_mul_acc, PublicModulus, *, +}; + +pub(super) const NUM_LIMBS: usize = 256 / LIMB_BITS; + +pub static COMMON_OPS: CommonOps = CommonOps { + num_limbs: elem::NumLimbs::P256, + + q: PublicModulus { + p: limbs_from_hex("ffffffff00000001000000000000000000000000ffffffffffffffffffffffff"), + rr: PublicElem::from_hex("4fffffffdfffffffffffffffefffffffbffffffff0000000000000003"), + }, + n: PublicElem::from_hex("ffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2fc632551"), + + a: PublicElem::from_hex("fffffffc00000004000000000000000000000003fffffffffffffffffffffffc"), + b: PublicElem::from_hex("dc30061d04874834e5a220abf7212ed6acf005cd78843090d89cdf6229c4bddf"), + + elem_mul_mont: p256_mul_mont, + elem_sqr_mont: p256_sqr_mont, +}; + +#[cfg(test)] +pub(super) static GENERATOR: (PublicElem, PublicElem) = ( + PublicElem::from_hex("18905f76a53755c679fb732b7762251075ba95fc5fedb60179e730d418a9143c"), + PublicElem::from_hex("8571ff1825885d85d2e88688dd21f3258b4ab8e4ba19e45cddf25357ce95560a"), +); + +pub static PRIVATE_KEY_OPS: PrivateKeyOps = PrivateKeyOps { + common: &COMMON_OPS, + elem_inv_squared: p256_elem_inv_squared, + point_mul_base_impl: p256_point_mul_base_impl, + point_mul_impl: p256_point_mul, + point_add_jacobian_impl: p256_point_add, +}; + +fn p256_elem_inv_squared(q: &Modulus, a: &Elem) -> Elem { + // Calculate a**-2 (mod q) == a**(q - 3) (mod q) + // + // The exponent (q - 3) is: + // + // 0xffffffff00000001000000000000000000000000fffffffffffffffffffffffc + + #[inline] + fn sqr_mul(q: &Modulus, a: &Elem, squarings: LeakyWord, b: &Elem) -> Elem { + elem_sqr_mul(&COMMON_OPS, a, squarings, b, q.cpu()) + } + + #[inline] + fn sqr_mul_acc(q: &Modulus, a: &mut Elem, squarings: LeakyWord, b: &Elem) { + elem_sqr_mul_acc(&COMMON_OPS, a, squarings, b, q.cpu()) + } + + let b_1 = &a; + let b_11 = sqr_mul(q, b_1, 1, b_1); + let b_111 = sqr_mul(q, &b_11, 1, b_1); + let f_11 = sqr_mul(q, &b_111, 3, &b_111); + let fff = sqr_mul(q, &f_11, 6, &f_11); + let fff_111 = sqr_mul(q, &fff, 3, &b_111); + let fffffff_11 = sqr_mul(q, &fff_111, 15, &fff_111); + let ffffffff = sqr_mul(q, &fffffff_11, 2, &b_11); + + // ffffffff00000001 + let mut acc = sqr_mul(q, &ffffffff, 31 + 1, b_1); + + // ffffffff00000001000000000000000000000000ffffffff + sqr_mul_acc(q, &mut acc, 96 + 32, &ffffffff); + + // ffffffff00000001000000000000000000000000ffffffffffffffff + sqr_mul_acc(q, &mut acc, 32, &ffffffff); + + // ffffffff00000001000000000000000000000000fffffffffffffffffffffff_11 + sqr_mul_acc(q, &mut acc, 30, &fffffff_11); + + // ffffffff00000001000000000000000000000000fffffffffffffffffffffffc + q.elem_square(&mut acc); + q.elem_square(&mut acc); + + acc +} + +fn p256_point_mul_base_impl(g_scalar: &Scalar, _cpu: cpu::Features) -> Point { + prefixed_extern! 
{ + fn p256_point_mul_base( + r: *mut Limb, // [3][COMMON_OPS.num_limbs] + g_scalar: *const Limb, // [COMMON_OPS.num_limbs] + ); + } + + let mut r = Point::new_at_infinity(); + unsafe { + p256_point_mul_base(r.xyz.as_mut_ptr(), g_scalar.limbs.as_ptr()); + } + r +} + +pub static PUBLIC_KEY_OPS: PublicKeyOps = PublicKeyOps { + common: &COMMON_OPS, +}; + +pub static SCALAR_OPS: ScalarOps = ScalarOps { + common: &COMMON_OPS, + scalar_mul_mont: p256_scalar_mul_mont, +}; + +pub static PUBLIC_SCALAR_OPS: PublicScalarOps = PublicScalarOps { + scalar_ops: &SCALAR_OPS, + public_key_ops: &PUBLIC_KEY_OPS, + + #[cfg(any( + all(target_arch = "aarch64", target_endian = "little"), + target_arch = "x86_64" + ))] + twin_mul: twin_mul_nistz256, + + #[cfg(not(any( + all(target_arch = "aarch64", target_endian = "little"), + target_arch = "x86_64" + )))] + twin_mul: |g_scalar, p_scalar, p_xy, cpu| { + twin_mul_inefficient(&PRIVATE_KEY_OPS, g_scalar, p_scalar, p_xy, cpu) + }, + + q_minus_n: PublicElem::from_hex("4319055358e8617b0c46353d039cdaae"), + + // TODO: Use an optimized variable-time implementation. + scalar_inv_to_mont_vartime: |s, cpu| PRIVATE_SCALAR_OPS.scalar_inv_to_mont(s, cpu), +}; + +#[cfg(any( + all(target_arch = "aarch64", target_endian = "little"), + target_arch = "x86_64" +))] +fn twin_mul_nistz256( + g_scalar: &Scalar, + p_scalar: &Scalar, + p_xy: &(Elem, Elem), + cpu: cpu::Features, +) -> Point { + let scaled_g = point_mul_base_vartime(g_scalar, cpu); + let scaled_p = PRIVATE_KEY_OPS.point_mul(p_scalar, p_xy, cpu::features()); + PRIVATE_KEY_OPS.point_sum(&scaled_g, &scaled_p, cpu) +} + +#[cfg(any( + all(target_arch = "aarch64", target_endian = "little"), + target_arch = "x86_64" +))] +fn point_mul_base_vartime(g_scalar: &Scalar, _cpu: cpu::Features) -> Point { + prefixed_extern! { + fn p256_point_mul_base_vartime(r: *mut Limb, // [3][COMMON_OPS.num_limbs] + g_scalar: *const Limb, // [COMMON_OPS.num_limbs] + ); + } + let mut scaled_g = Point::new_at_infinity(); + unsafe { + p256_point_mul_base_vartime(scaled_g.xyz.as_mut_ptr(), g_scalar.limbs.as_ptr()); + } + scaled_g +} + +pub static PRIVATE_SCALAR_OPS: PrivateScalarOps = PrivateScalarOps { + scalar_ops: &SCALAR_OPS, + + oneRR_mod_n: PublicScalar::from_hex( + "66e12d94f3d956202845b2392b6bec594699799c49bd6fa683244c95be79eea2", + ), + scalar_inv_to_mont: p256_scalar_inv_to_mont, +}; + +#[allow(clippy::just_underscores_and_digits)] +fn p256_scalar_inv_to_mont(a: Scalar, _cpu: cpu::Features) -> Scalar { + // Calculate the modular inverse of scalar |a| using Fermat's Little + // Theorem: + // + // a**-1 (mod n) == a**(n - 2) (mod n) + // + // The exponent (n - 2) is: + // + // 0xffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2fc63254f + + #[inline] + fn mul(a: &Scalar, b: &Scalar) -> Scalar { + binary_op(p256_scalar_mul_mont, a, b) + } + + #[inline] + fn sqr(a: &Scalar) -> Scalar { + let mut tmp = Scalar::zero(); + unsafe { p256_scalar_sqr_rep_mont(tmp.limbs.as_mut_ptr(), a.limbs.as_ptr(), 1) } + tmp + } + + // Returns (`a` squared `squarings` times) * `b`. + fn sqr_mul(a: &Scalar, squarings: LeakyWord, b: &Scalar) -> Scalar { + debug_assert!(squarings >= 1); + let mut tmp = Scalar::zero(); + unsafe { p256_scalar_sqr_rep_mont(tmp.limbs.as_mut_ptr(), a.limbs.as_ptr(), squarings) } + mul(&tmp, b) + } + + // Sets `acc` = (`acc` squared `squarings` times) * `b`. 
+ fn sqr_mul_acc(acc: &mut Scalar, squarings: LeakyWord, b: &Scalar) { + debug_assert!(squarings >= 1); + unsafe { p256_scalar_sqr_rep_mont(acc.limbs.as_mut_ptr(), acc.limbs.as_ptr(), squarings) } + binary_op_assign(p256_scalar_mul_mont, acc, b); + } + + let _1 = &a; + + let _10 = sqr(_1); // 2 + let _100 = sqr(&_10); // 4 + let _101 = mul(&_100, _1); // 5 + let _111 = mul(&_101, &_10); // 7 + + let _1000 = sqr(&_100); // 8 + let _10000 = sqr(&_1000); // 16 + let _100000 = sqr(&_10000); // 32 + + let _100111 = mul(&_111, &_100000); // 39 = 7 + 32 + let _101011 = mul(&_100, &_100111); // 43 = 4 + 39 + let _101111 = mul(&_100, &_101011); // 47 = 4 + 39 + let _1001111 = mul(&_100000, &_101111); // 79 = 32 + 47 + let _86 = sqr(&_101011); // 86 = 43 * 2 + let _1011011 = mul(&_101, &_86); // 91 = 5 + 86 + let _92 = mul(_1, &_1011011); // 92 = 1 + 91 + let _1100011 = mul(&_111, &_92); // 99 = 7 + 92 + let _10111111 = mul(&_92, &_1100011); // 191 = 92 + 99 + let _11011111 = mul(&_100000, &_10111111); // 223 = 32 + 191 + + let ff = mul(&_100000, &_11011111); // 255 = 32 + 223 + let ffff = sqr_mul(&ff, 0 + 8, &ff); + let ffffffff = sqr_mul(&ffff, 0 + 16, &ffff); + + // ffffffff00000000ffffffff + let mut acc = sqr_mul(&ffffffff, 32 + 32, &ffffffff); + + // ffffffff00000000ffffffffffffffff + sqr_mul_acc(&mut acc, 0 + 32, &ffffffff); + + // The rest of the exponent, in binary, is: + // + // 1011110011100110111110101010110110100111000101111001111010000100 + // 1111001110111001110010101100001011111100011000110010010101001111 + + sqr_mul_acc(&mut acc, 6, &_101111); + sqr_mul_acc(&mut acc, 2 + 3, &_111); + sqr_mul_acc(&mut acc, 2 + 8, &_11011111); + sqr_mul_acc(&mut acc, 1 + 3, &_101); + sqr_mul_acc(&mut acc, 1 + 7, &_1011011); + sqr_mul_acc(&mut acc, 1 + 6, &_100111); + sqr_mul_acc(&mut acc, 3 + 6, &_101111); + sqr_mul_acc(&mut acc, 2 + 3, &_111); + sqr_mul_acc(&mut acc, 3, &_101); + sqr_mul_acc(&mut acc, 4 + 7, &_1001111); + sqr_mul_acc(&mut acc, 2 + 3, &_111); + sqr_mul_acc(&mut acc, 1 + 3, &_111); + sqr_mul_acc(&mut acc, 2 + 3, &_111); + sqr_mul_acc(&mut acc, 2 + 6, &_101011); + sqr_mul_acc(&mut acc, 4 + 8, &_10111111); + sqr_mul_acc(&mut acc, 3 + 7, &_1100011); + sqr_mul_acc(&mut acc, 2 + 1, _1); + sqr_mul_acc(&mut acc, 2 + 3, &_101); + sqr_mul_acc(&mut acc, 1 + 7, &_1001111); + + acc +} + +prefixed_extern! 
{ + pub(super) fn p256_mul_mont( + r: *mut Limb, // [COMMON_OPS.num_limbs] + a: *const Limb, // [COMMON_OPS.num_limbs] + b: *const Limb, // [COMMON_OPS.num_limbs] + ); + pub(super) fn p256_sqr_mont( + r: *mut Limb, // [COMMON_OPS.num_limbs] + a: *const Limb, // [COMMON_OPS.num_limbs] + ); + + fn p256_point_add( + r: *mut Limb, // [3][COMMON_OPS.num_limbs] + a: *const Limb, // [3][COMMON_OPS.num_limbs] + b: *const Limb, // [3][COMMON_OPS.num_limbs] + ); + fn p256_point_mul( + r: *mut Limb, // [3][COMMON_OPS.num_limbs] + p_scalar: *const Limb, // [COMMON_OPS.num_limbs] + p_x: *const Limb, // [COMMON_OPS.num_limbs] + p_y: *const Limb, // [COMMON_OPS.num_limbs] + ); + + fn p256_scalar_mul_mont( + r: *mut Limb, // [COMMON_OPS.num_limbs] + a: *const Limb, // [COMMON_OPS.num_limbs] + b: *const Limb, // [COMMON_OPS.num_limbs] + ); + fn p256_scalar_sqr_rep_mont( + r: *mut Limb, // [COMMON_OPS.num_limbs] + a: *const Limb, // [COMMON_OPS.num_limbs] + rep: LeakyWord, + ); +} + +#[cfg(test)] +mod tests { + #[cfg(any( + all(target_arch = "aarch64", target_endian = "little"), + target_arch = "x86_64" + ))] + #[test] + fn p256_point_mul_base_vartime_test() { + use super::{super::tests::point_mul_base_tests, *}; + point_mul_base_tests( + &PRIVATE_KEY_OPS, + point_mul_base_vartime, + test_vector_file!("p256_point_mul_base_tests.txt"), + ); + } +} diff --git a/ring-0.17.14/src/ec/suite_b/ops/p384.rs b/ring-0.17.14/src/ec/suite_b/ops/p384.rs new file mode 100644 index 0000000000..f1bde6d848 --- /dev/null +++ b/ring-0.17.14/src/ec/suite_b/ops/p384.rs @@ -0,0 +1,304 @@ +// Copyright 2016-2023 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+ +use super::{ + elem::{binary_op, binary_op_assign}, + elem_sqr_mul, elem_sqr_mul_acc, PublicModulus, *, +}; + +pub(super) const NUM_LIMBS: usize = 384 / LIMB_BITS; + +pub static COMMON_OPS: CommonOps = CommonOps { + num_limbs: elem::NumLimbs::P384, + + q: PublicModulus { + p: limbs_from_hex("fffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffeffffffff0000000000000000ffffffff"), + rr: PublicElem::from_hex("10000000200000000fffffffe000000000000000200000000fffffffe00000001"), + }, + n: PublicElem::from_hex("ffffffffffffffffffffffffffffffffffffffffffffffffc7634d81f4372ddf581a0db248b0a77aecec196accc52973"), + + a: PublicElem::from_hex("fffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffbfffffffc0000000000000003fffffffc"), + b: PublicElem::from_hex("cd08114b604fbff9b62b21f41f022094e3374bee94938ae277f2209b1920022ef729add87a4c32ec081188719d412dcc"), + + elem_mul_mont: p384_elem_mul_mont, + elem_sqr_mont: p384_elem_sqr_mont, +}; + +pub(super) static GENERATOR: (PublicElem, PublicElem) = ( + PublicElem::from_hex("4d3aadc2299e1513812ff723614ede2b6454868459a30eff879c3afc541b4d6e20e378e2a0d6ce383dd0756649c0b528"), + PublicElem::from_hex("2b78abc25a15c5e9dd8002263969a840c6c3521968f4ffd98bade7562e83b050a1bfa8bf7bb4a9ac23043dad4b03a4fe"), +); + +pub static PRIVATE_KEY_OPS: PrivateKeyOps = PrivateKeyOps { + common: &COMMON_OPS, + elem_inv_squared: p384_elem_inv_squared, + point_mul_base_impl: p384_point_mul_base_impl, + point_mul_impl: p384_point_mul, + point_add_jacobian_impl: p384_point_add, +}; + +fn p384_elem_inv_squared(q: &Modulus, a: &Elem) -> Elem { + // Calculate a**-2 (mod q) == a**(q - 3) (mod q) + // + // The exponent (q - 3) is: + // + // 0xfffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffe\ + // ffffffff0000000000000000fffffffc + + #[inline] + fn sqr_mul(q: &Modulus, a: &Elem, squarings: LeakyWord, b: &Elem) -> Elem { + elem_sqr_mul(&COMMON_OPS, a, squarings, b, q.cpu()) + } + + #[inline] + fn sqr_mul_acc(q: &Modulus, a: &mut Elem, squarings: LeakyWord, b: &Elem) { + elem_sqr_mul_acc(&COMMON_OPS, a, squarings, b, q.cpu()) + } + + let b_1 = &a; + let b_11 = sqr_mul(q, b_1, 1, b_1); + let b_111 = sqr_mul(q, &b_11, 1, b_1); + let f_11 = sqr_mul(q, &b_111, 3, &b_111); + let fff = sqr_mul(q, &f_11, 6, &f_11); + let fff_111 = sqr_mul(q, &fff, 3, &b_111); + let fffffff_11 = sqr_mul(q, &fff_111, 15, &fff_111); + + let fffffffffffffff = sqr_mul(q, &fffffff_11, 30, &fffffff_11); + + let ffffffffffffffffffffffffffffff = sqr_mul(q, &fffffffffffffff, 60, &fffffffffffffff); + + // ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff + let mut acc = sqr_mul( + q, + &ffffffffffffffffffffffffffffff, + 120, + &ffffffffffffffffffffffffffffff, + ); + + // fffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff_111 + sqr_mul_acc(q, &mut acc, 15, &fff_111); + + // fffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffeffffffff + sqr_mul_acc(q, &mut acc, 1 + 30, &fffffff_11); + sqr_mul_acc(q, &mut acc, 2, &b_11); + + // fffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffeffffffff + // 0000000000000000fffffff_11 + sqr_mul_acc(q, &mut acc, 64 + 30, &fffffff_11); + + // fffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffeffffffff + // 0000000000000000fffffffc + q.elem_square(&mut acc); + q.elem_square(&mut acc); + + acc +} + +fn p384_point_mul_base_impl(a: &Scalar, cpu: cpu::Features) -> Point { + // XXX: Not efficient. TODO: Precompute multiples of the generator. 
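+    // Unlike P-256, which calls a dedicated `p256_point_mul_base` routine,
+    // P-384 falls back to the generic variable-point multiplication applied
+    // to the generator's coordinates, as the two lines below show.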
+ let generator = (Elem::from(&GENERATOR.0), Elem::from(&GENERATOR.1)); + PRIVATE_KEY_OPS.point_mul(a, &generator, cpu) +} + +pub static PUBLIC_KEY_OPS: PublicKeyOps = PublicKeyOps { + common: &COMMON_OPS, +}; + +pub static SCALAR_OPS: ScalarOps = ScalarOps { + common: &COMMON_OPS, + scalar_mul_mont: p384_scalar_mul_mont, +}; + +pub static PUBLIC_SCALAR_OPS: PublicScalarOps = PublicScalarOps { + scalar_ops: &SCALAR_OPS, + public_key_ops: &PUBLIC_KEY_OPS, + twin_mul: |g_scalar, p_scalar, p_xy, cpu| { + twin_mul_inefficient(&PRIVATE_KEY_OPS, g_scalar, p_scalar, p_xy, cpu) + }, + + q_minus_n: PublicElem::from_hex("389cb27e0bc8d21fa7e5f24cb74f58851313e696333ad68c"), + + // TODO: Use an optimized variable-time implementation. + scalar_inv_to_mont_vartime: |s, cpu| PRIVATE_SCALAR_OPS.scalar_inv_to_mont(s, cpu), +}; + +pub static PRIVATE_SCALAR_OPS: PrivateScalarOps = PrivateScalarOps { + scalar_ops: &SCALAR_OPS, + + oneRR_mod_n: PublicScalar::from_hex("c84ee012b39bf213fb05b7a28266895d40d49174aab1cc5bc3e483afcb82947ff3d81e5df1aa4192d319b2419b409a9"), + scalar_inv_to_mont: p384_scalar_inv_to_mont, +}; + +fn p384_scalar_inv_to_mont(a: Scalar, _cpu: cpu::Features) -> Scalar { + // Calculate the modular inverse of scalar |a| using Fermat's Little + // Theorem: + // + // a**-1 (mod n) == a**(n - 2) (mod n) + // + // The exponent (n - 2) is: + // + // 0xffffffffffffffffffffffffffffffffffffffffffffffffc7634d81f4372ddf\ + // 581a0db248b0a77aecec196accc52971 + + fn mul(a: &Scalar, b: &Scalar) -> Scalar { + binary_op(p384_scalar_mul_mont, a, b) + } + + fn sqr(a: &Scalar) -> Scalar { + binary_op(p384_scalar_mul_mont, a, a) + } + + fn sqr_mut(a: &mut Scalar) { + unary_op_from_binary_op_assign(p384_scalar_mul_mont, a); + } + + // Returns (`a` squared `squarings` times) * `b`. + fn sqr_mul(a: &Scalar, squarings: LeakyWord, b: &Scalar) -> Scalar { + debug_assert!(squarings >= 1); + let mut tmp = sqr(a); + for _ in 1..squarings { + sqr_mut(&mut tmp); + } + mul(&tmp, b) + } + + // Sets `acc` = (`acc` squared `squarings` times) * `b`. + fn sqr_mul_acc(acc: &mut Scalar, squarings: LeakyWord, b: &Scalar) { + debug_assert!(squarings >= 1); + for _ in 0..squarings { + sqr_mut(acc); + } + binary_op_assign(p384_scalar_mul_mont, acc, b) + } + + // Indexes into `d`. 
+ const B_1: usize = 0; + const B_11: usize = 1; + const B_101: usize = 2; + const B_111: usize = 3; + const B_1001: usize = 4; + const B_1011: usize = 5; + const B_1101: usize = 6; + const B_1111: usize = 7; + const DIGIT_COUNT: usize = 8; + + let mut d = [Scalar::zero(); DIGIT_COUNT]; + d[B_1] = a; + let b_10 = sqr(&d[B_1]); + for i in B_11..DIGIT_COUNT { + d[i] = mul(&d[i - 1], &b_10); + } + + let ff = sqr_mul(&d[B_1111], 0 + 4, &d[B_1111]); + let ffff = sqr_mul(&ff, 0 + 8, &ff); + let ffffffff = sqr_mul(&ffff, 0 + 16, &ffff); + + let ffffffffffffffff = sqr_mul(&ffffffff, 0 + 32, &ffffffff); + + let ffffffffffffffffffffffff = sqr_mul(&ffffffffffffffff, 0 + 32, &ffffffff); + + // ffffffffffffffffffffffffffffffffffffffffffffffff + let mut acc = sqr_mul(&ffffffffffffffffffffffff, 0 + 96, &ffffffffffffffffffffffff); + + // The rest of the exponent, in binary, is: + // + // 1100011101100011010011011000000111110100001101110010110111011111 + // 0101100000011010000011011011001001001000101100001010011101111010 + // 1110110011101100000110010110101011001100110001010010100101110001 + + #[allow(clippy::cast_possible_truncation)] + static REMAINING_WINDOWS: [(u8, u8); 39] = [ + (2, B_11 as u8), + (3 + 3, B_111 as u8), + (1 + 2, B_11 as u8), + (3 + 2, B_11 as u8), + (1 + 4, B_1001 as u8), + (4, B_1011 as u8), + (6 + 4, B_1111 as u8), + (3, B_101 as u8), + (4 + 1, B_1 as u8), + (4, B_1011 as u8), + (4, B_1001 as u8), + (1 + 4, B_1101 as u8), + (4, B_1101 as u8), + (4, B_1111 as u8), + (1 + 4, B_1011 as u8), + (6 + 4, B_1101 as u8), + (5 + 4, B_1101 as u8), + (4, B_1011 as u8), + (2 + 4, B_1001 as u8), + (2 + 1, B_1 as u8), + (3 + 4, B_1011 as u8), + (4 + 3, B_101 as u8), + (2 + 3, B_111 as u8), + (1 + 4, B_1111 as u8), + (1 + 4, B_1011 as u8), + (4, B_1011 as u8), + (2 + 3, B_111 as u8), + (1 + 2, B_11 as u8), + (5 + 2, B_11 as u8), + (2 + 4, B_1011 as u8), + (1 + 3, B_101 as u8), + (1 + 2, B_11 as u8), + (2 + 2, B_11 as u8), + (2 + 2, B_11 as u8), + (3 + 3, B_101 as u8), + (2 + 3, B_101 as u8), + (2 + 3, B_101 as u8), + (2, B_11 as u8), + (3 + 1, B_1 as u8), + ]; + + for &(squarings, digit) in &REMAINING_WINDOWS[..] { + sqr_mul_acc(&mut acc, LeakyWord::from(squarings), &d[usize::from(digit)]); + } + + acc +} + +unsafe extern "C" fn p384_elem_sqr_mont( + r: *mut Limb, // [COMMON_OPS.num_limbs] + a: *const Limb, // [COMMON_OPS.num_limbs] +) { + // XXX: Inefficient. TODO: Make a dedicated squaring routine. + unsafe { + p384_elem_mul_mont(r, a, a); + } +} + +prefixed_extern! { + fn p384_elem_mul_mont( + r: *mut Limb, // [COMMON_OPS.num_limbs] + a: *const Limb, // [COMMON_OPS.num_limbs] + b: *const Limb, // [COMMON_OPS.num_limbs] + ); + + fn p384_point_add( + r: *mut Limb, // [3][COMMON_OPS.num_limbs] + a: *const Limb, // [3][COMMON_OPS.num_limbs] + b: *const Limb, // [3][COMMON_OPS.num_limbs] + ); + fn p384_point_mul( + r: *mut Limb, // [3][COMMON_OPS.num_limbs] + p_scalar: *const Limb, // [COMMON_OPS.num_limbs] + p_x: *const Limb, // [COMMON_OPS.num_limbs] + p_y: *const Limb, // [COMMON_OPS.num_limbs] + ); + + fn p384_scalar_mul_mont( + r: *mut Limb, // [COMMON_OPS.num_limbs] + a: *const Limb, // [COMMON_OPS.num_limbs] + b: *const Limb, // [COMMON_OPS.num_limbs] + ); +} diff --git a/ring-0.17.14/src/ec/suite_b/private_key.rs b/ring-0.17.14/src/ec/suite_b/private_key.rs new file mode 100644 index 0000000000..19129296c3 --- /dev/null +++ b/ring-0.17.14/src/ec/suite_b/private_key.rs @@ -0,0 +1,203 @@ +// Copyright 2016 Brian Smith. 
+// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +//! Functionality shared by operations on private keys (ECC keygen and +//! ECDSA signing). + +use super::{ops::*, verify_affine_point_is_on_the_curve}; +use crate::{arithmetic::montgomery::R, cpu, ec, error, limb, rand}; + +/// Generates a random scalar in the range [1, n). +pub(super) fn random_scalar( + ops: &PrivateKeyOps, + n: &Modulus, + rng: &dyn rand::SecureRandom, +) -> Result { + let mut bytes = [0; ec::SCALAR_MAX_BYTES]; + let bytes = &mut bytes[..ops.common.len()]; + generate_private_scalar_bytes(ops, rng, bytes, n.cpu())?; + scalar_from_big_endian_bytes(n, bytes) +} + +pub(super) fn generate_private_scalar_bytes( + ops: &PrivateKeyOps, + rng: &dyn rand::SecureRandom, + out: &mut [u8], + cpu: cpu::Features, +) -> Result<(), error::Unspecified> { + // [NSA Suite B Implementer's Guide to ECDSA] Appendix A.1.2, and + // [NSA Suite B Implementer's Guide to NIST SP 800-56A] Appendix B.2, + // "Key Pair Generation by Testing Candidates". + // + // [NSA Suite B Implementer's Guide to ECDSA]: doc/ecdsa.pdf + // [NSA Suite B Implementer's Guide to NIST SP 800-56A]: doc/ecdh.pdf + + // TODO: The NSA guide also suggests, in appendix B.1, another mechanism + // that would avoid the need to use `rng.fill()` more than once. It works + // by generating an extra 64 bits of random bytes and then reducing the + // output (mod n). Supposedly, this removes enough of the bias towards + // small values from the modular reduction, but it isn't obvious that it is + // sufficient. TODO: Figure out what we can do to mitigate the bias issue + // and switch to the other mechanism. + + let candidate = out; + + // XXX: The value 100 was chosen to match OpenSSL due to uncertainty of + // what specific value would be better, but it seems bad to try 100 times. + for _ in 0..100 { + // NSA Guide Steps 1, 2, and 3. + // + // Since we calculate the length ourselves, it is pointless to check + // it, since we can only check it by doing the same calculation. + + // NSA Guide Step 4. + // + // The requirement that the random number generator has the + // requested security strength is delegated to `rng`. + rng.fill(candidate)?; + + // NSA Guide Steps 5, 6, and 7. + if check_scalar_big_endian_bytes(ops, candidate, cpu).is_err() { + continue; + } + + // NSA Guide Step 8 is done in `public_from_private()`. + + // NSA Guide Step 9. + return Ok(()); + } + + Err(error::Unspecified) +} + +// The underlying X25519 and Ed25519 code uses an [u8; 32] to store the private +// key. To make the ECDH and ECDSA code similar to that, we also store the +// private key that way, which means we have to convert it to a Scalar whenever +// we need to use it. 
+#[inline] +pub(super) fn private_key_as_scalar(n: &Modulus, private_key: &ec::Seed) -> Scalar { + // This cannot fail because we know the private key is valid. + scalar_from_big_endian_bytes(n, private_key.bytes_less_safe()).unwrap() +} + +pub(super) fn check_scalar_big_endian_bytes( + ops: &PrivateKeyOps, + bytes: &[u8], + cpu: cpu::Features, +) -> Result<(), error::Unspecified> { + debug_assert_eq!(bytes.len(), ops.common.len()); + let n = &ops.common.scalar_modulus(cpu); + scalar_from_big_endian_bytes(n, bytes).map(|_| ()) +} + +// Parses a fixed-length (zero-padded) big-endian-encoded scalar in the range +// [1, n). This is intended to be constant-time with respect to the actual +// value *only if* the value is actually in range. In other words, this won't +// leak anything about a valid value, but it might leak small amounts of +// information about an invalid value (which constraint it failed). +pub(super) fn scalar_from_big_endian_bytes( + n: &Modulus, + bytes: &[u8], +) -> Result { + // [NSA Suite B Implementer's Guide to ECDSA] Appendix A.1.2, and + // [NSA Suite B Implementer's Guide to NIST SP 800-56A] Appendix B.2, + // "Key Pair Generation by Testing Candidates". + // + // [NSA Suite B Implementer's Guide to ECDSA]: doc/ecdsa.pdf + // [NSA Suite B Implementer's Guide to NIST SP 800-56A]: doc/ecdh.pdf + // + // Steps 5, 6, and 7. + // + // XXX: The NSA guide says that we should verify that the random scalar is + // in the range [0, n - 1) and then add one to it so that it is in the range + // [1, n). Instead, we verify that the scalar is in the range [1, n). This + // way, we avoid needing to compute or store the value (n - 1), we avoid the + // need to implement a function to add one to a scalar, and we avoid needing + // to convert the scalar back into an array of bytes. + scalar_parse_big_endian_fixed_consttime(n, untrusted::Input::from(bytes)) +} + +pub(super) fn public_from_private( + ops: &PrivateKeyOps, + public_out: &mut [u8], + my_private_key: &ec::Seed, + cpu: cpu::Features, +) -> Result<(), error::Unspecified> { + let q = &ops.common.elem_modulus(cpu); + let elem_and_scalar_bytes = ops.common.len(); + debug_assert_eq!(public_out.len(), 1 + (2 * elem_and_scalar_bytes)); + let n = &ops.common.scalar_modulus(cpu); + let my_private_key = private_key_as_scalar(n, my_private_key); + let my_public_key = ops.point_mul_base(&my_private_key, cpu); + public_out[0] = 4; // Uncompressed encoding. + let (x_out, y_out) = public_out[1..].split_at_mut(elem_and_scalar_bytes); + + // `big_endian_affine_from_jacobian` verifies that the point is not at + // infinity and is on the curve. + big_endian_affine_from_jacobian(ops, q, x_out, Some(y_out), &my_public_key) +} + +pub(super) fn affine_from_jacobian( + ops: &PrivateKeyOps, + q: &Modulus, + p: &Point, +) -> Result<(Elem, Elem), error::Unspecified> { + let z = q.point_z(p); + + // Since we restrict our private key to the range [1, n), the curve has + // prime order, and we verify that the peer's point is on the curve, + // there's no way that the result can be at infinity. But, use `assert!` + // instead of `debug_assert!` anyway + assert!(q.elem_verify_is_not_zero(&z).is_ok()); + + let x = q.point_x(p); + let y = q.point_y(p); + + let zz_inv = ops.elem_inverse_squared(q, &z); + + let x_aff = q.elem_product(&x, &zz_inv); + + // `y_aff` is needed to validate the point is on the curve. It is also + // needed in the non-ECDH case where we need to output it. 
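+    // For a Jacobian point (X, Y, Z) the affine coordinates are
+    // (X / Z**2, Y / Z**3). `zz_inv` is Z**-2, so `x_aff` above is X * Z**-2;
+    // squaring `zz_inv` gives Z**-4, and multiplying that by Z recovers the
+    // Z**-3 needed for `y_aff` below.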
+ let y_aff = { + let zzzz_inv = q.elem_squared(&zz_inv); + let zzz_inv = q.elem_product(&z, &zzzz_inv); + q.elem_product(&y, &zzz_inv) + }; + + // If we validated our inputs correctly and then computed (x, y, z), then + // (x, y, z) will be on the curve. See + // `verify_affine_point_is_on_the_curve_scaled` for the motivation. + verify_affine_point_is_on_the_curve(q, (&x_aff, &y_aff))?; + + Ok((x_aff, y_aff)) +} + +pub(super) fn big_endian_affine_from_jacobian( + ops: &PrivateKeyOps, + q: &Modulus, + x_out: &mut [u8], + y_out: Option<&mut [u8]>, + p: &Point, +) -> Result<(), error::Unspecified> { + let (x_aff, y_aff) = affine_from_jacobian(ops, q, p)?; + let x = q.elem_unencoded(&x_aff); + limb::big_endian_from_limbs(ops.leak_limbs(&x), x_out); + if let Some(y_out) = y_out { + let y = q.elem_unencoded(&y_aff); + limb::big_endian_from_limbs(ops.leak_limbs(&y), y_out); + } + + Ok(()) +} diff --git a/ring-0.17.14/src/ec/suite_b/public_key.rs b/ring-0.17.14/src/ec/suite_b/public_key.rs new file mode 100644 index 0000000000..328bb371a4 --- /dev/null +++ b/ring-0.17.14/src/ec/suite_b/public_key.rs @@ -0,0 +1,111 @@ +// Copyright 2016 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +//! Functionality shared by operations on public keys (ECDSA verification and +//! ECDH agreement). + +use super::{ops::*, verify_affine_point_is_on_the_curve}; +use crate::{arithmetic::montgomery::*, error}; + +/// Parses a public key encoded in uncompressed form. The key is validated +/// using the ECC Partial Public-Key Validation Routine from +/// [NIST SP 800-56A, revision 2] Section 5.6.2.3.3, the NSA's +/// "Suite B Implementer's Guide to NIST SP 800-56A," Appendix B.3, and the +/// NSA's "Suite B Implementer's Guide to FIPS 186-3 (ECDSA)," Appendix A.3. +/// +/// [NIST SP 800-56A, revision 2]: +/// http://nvlpubs.nist.gov/nistpubs/SpecialPublications/NIST.SP.800-56Ar2.pdf +pub(super) fn parse_uncompressed_point( + ops: &PublicKeyOps, + q: &Modulus, + input: untrusted::Input, +) -> Result<(Elem, Elem), error::Unspecified> { + // NIST SP 800-56A Step 1: "Verify that Q is not the point at infinity. + // This can be done by inspection if the point is entered in the standard + // affine representation." (We do it by inspection since we only accept + // the affine representation.) + let (x, y) = input.read_all(error::Unspecified, |input| { + // The encoding must be 4, which is the encoding for "uncompressed". 
+ let encoding = input.read_byte()?; + if encoding != 4 { + return Err(error::Unspecified); + } + + // NIST SP 800-56A Step 2: "Verify that xQ and yQ are integers in the + // interval [0, p-1] in the case that q is an odd prime p[.]" + let x = ops.elem_parse(q, input)?; + let y = ops.elem_parse(q, input)?; + Ok((x, y)) + })?; + + // NIST SP 800-56A Step 3: "If q is an odd prime p, verify that + // yQ**2 = xQ**3 + axQ + b in GF(p), where the arithmetic is performed + // modulo p." + verify_affine_point_is_on_the_curve(q, (&x, &y))?; + + // NIST SP 800-56A Note: "Since its order is not verified, there is no + // check that the public key is in the correct EC subgroup." + // + // NSA Suite B Implementer's Guide Note: "ECC Full Public-Key Validation + // includes an additional check to ensure that the point has the correct + // order. This check is not necessary for curves having prime order (and + // cofactor h = 1), such as P-256 and P-384." + + Ok((x, y)) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::cpu; + use crate::testutil as test; + + #[test] + fn parse_uncompressed_point_test() { + let cpu = cpu::features(); + test::run( + test_vector_file!("suite_b_public_key_tests.txt"), + |section, test_case| { + assert_eq!(section, ""); + + let curve_name = test_case.consume_string("Curve"); + + let public_key = test_case.consume_bytes("Q"); + let public_key = untrusted::Input::from(&public_key); + let is_valid = test_case.consume_string("Result") == "P"; + + let curve_ops = public_key_ops_from_curve_name(&curve_name); + let q = &curve_ops.common.elem_modulus(cpu); + + let result = parse_uncompressed_point(curve_ops, q, public_key); + assert_eq!(is_valid, result.is_ok()); + + // TODO: Verify that we when we re-serialize the parsed (x, y), the + // output is equal to the input. + + Ok(()) + }, + ); + } + + fn public_key_ops_from_curve_name(curve_name: &str) -> &'static PublicKeyOps { + if curve_name == "P-256" { + &p256::PUBLIC_KEY_OPS + } else if curve_name == "P-384" { + &p384::PUBLIC_KEY_OPS + } else { + panic!("Unsupported curve: {}", curve_name); + } + } +} diff --git a/ring-0.17.14/src/error/input_too_long.rs b/ring-0.17.14/src/error/input_too_long.rs new file mode 100644 index 0000000000..6000077469 --- /dev/null +++ b/ring-0.17.14/src/error/input_too_long.rs @@ -0,0 +1,39 @@ +// Copyright 2024 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +pub struct InputTooLongError { + /// Note that this might not actually be the (exact) length of the input, + /// and its units might be lost. For example, it could be any of the + /// following: + /// + /// * The length in bytes of the entire input. + /// * The length in bytes of some *part* of the input. + /// * A bit length. + /// * A length in terms of "blocks" or other grouping of input values. 
+ /// * Some intermediate quantity that was used when checking the input + /// length. + /// * Some arbitrary value. + #[allow(dead_code)] + imprecise_input_length: T, +} + +impl InputTooLongError { + #[cold] + #[inline(never)] + pub(crate) fn new(imprecise_input_length: T) -> Self { + Self { + imprecise_input_length, + } + } +} diff --git a/ring-0.17.14/src/error/into_unspecified.rs b/ring-0.17.14/src/error/into_unspecified.rs new file mode 100644 index 0000000000..dc0bd94ae5 --- /dev/null +++ b/ring-0.17.14/src/error/into_unspecified.rs @@ -0,0 +1,33 @@ +// Copyright 2016-2024 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +use crate::error::{KeyRejected, Unspecified}; + +impl From for Unspecified { + fn from(source: untrusted::EndOfInput) -> Self { + super::erase(source) + } +} + +impl From for Unspecified { + fn from(source: core::array::TryFromSliceError) -> Self { + super::erase(source) + } +} + +impl From for Unspecified { + fn from(source: KeyRejected) -> Self { + super::erase(source) + } +} diff --git a/ring-0.17.14/src/error/key_rejected.rs b/ring-0.17.14/src/error/key_rejected.rs new file mode 100644 index 0000000000..c12d973ebc --- /dev/null +++ b/ring-0.17.14/src/error/key_rejected.rs @@ -0,0 +1,111 @@ +// Copyright 2016-2024 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +//! Error reporting. + +#[cfg(feature = "std")] +extern crate std; + +/// An error parsing or validating a key. +/// +/// The `Display` implementation will return a string that will help you better +/// understand why a key was rejected change which errors are reported in which +/// situations while minimizing the likelihood that any applications will be +/// broken. +/// +/// Here is an incomplete list of reasons a key may be unsupported: +/// +/// * Invalid or Inconsistent Components: A component of the key has an invalid +/// value, or the mathematical relationship between two (or more) components +/// required for a valid key does not hold. +/// +/// * The encoding of the key is invalid. Perhaps the key isn't in the correct +/// format; e.g. 
it may be Base64 ("PEM") encoded, in which case the Base64 +/// encoding needs to be undone first. +/// +/// * The encoding includes a versioning mechanism and that mechanism indicates +/// that the key is encoded in a version of the encoding that isn't supported. +/// This might happen for multi-prime RSA keys (keys with more than two +/// private prime factors), which aren't supported, for example. +/// +/// * Too small or too Large: One of the primary components of the key is too +/// small or two large. Too-small keys are rejected for security reasons. Some +/// unnecessarily large keys are rejected for performance reasons. +/// +/// * Wrong algorithm: The key is not valid for the algorithm in which it was +/// being used. +/// +/// * Unexpected errors: Report this as a bug. +#[derive(Copy, Clone, Debug)] +pub struct KeyRejected(&'static str); + +impl KeyRejected { + pub(crate) fn inconsistent_components() -> Self { + Self("InconsistentComponents") + } + + pub(crate) fn invalid_component() -> Self { + Self("InvalidComponent") + } + + #[inline] + pub(crate) fn invalid_encoding() -> Self { + Self("InvalidEncoding") + } + + // XXX: See the comment at the call site. + pub(crate) fn rng_failed() -> Self { + Self("RNG failed") + } + + pub(crate) fn public_key_is_missing() -> Self { + Self("PublicKeyIsMissing") + } + + #[cfg(feature = "alloc")] + pub(crate) fn too_small() -> Self { + Self("TooSmall") + } + + #[cfg(feature = "alloc")] + pub(crate) fn too_large() -> Self { + Self("TooLarge") + } + + pub(crate) fn version_not_supported() -> Self { + Self("VersionNotSupported") + } + + pub(crate) fn wrong_algorithm() -> Self { + Self("WrongAlgorithm") + } + + #[cfg(feature = "alloc")] + pub(crate) fn private_modulus_len_not_multiple_of_512_bits() -> Self { + Self("PrivateModulusLenNotMultipleOf512Bits") + } + + pub(crate) fn unexpected_error() -> Self { + Self("UnexpectedError") + } +} + +#[cfg(feature = "std")] +impl std::error::Error for KeyRejected {} + +impl core::fmt::Display for KeyRejected { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + f.write_str(self.0) + } +} diff --git a/ring-0.17.14/src/error/mod.rs b/ring-0.17.14/src/error/mod.rs new file mode 100644 index 0000000000..ead174fd45 --- /dev/null +++ b/ring-0.17.14/src/error/mod.rs @@ -0,0 +1,58 @@ +// Copyright 2016-2024 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +//! Error reporting. 
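As an illustrative aside (not part of the vendored sources): since `KeyRejected` deliberately exposes nothing beyond its `Display` string, a caller typically just logs that string or folds it into its own error type. A small sketch against ring's public API (the PKCS#8 input is a placeholder):

```rust
use ring::{error::KeyRejected, signature::Ed25519KeyPair};

/// Parse a PKCS#8-encoded Ed25519 key, mapping the opaque rejection reason
/// ("InvalidEncoding", "WrongAlgorithm", ...) into a plain string.
fn load_key(pkcs8: &[u8]) -> Result<Ed25519KeyPair, String> {
    Ed25519KeyPair::from_pkcs8(pkcs8).map_err(|e: KeyRejected| e.to_string())
}
```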
+ +pub use self::{key_rejected::KeyRejected, unspecified::Unspecified}; + +pub(crate) use self::{ + input_too_long::InputTooLongError, len_mismatch_error::LenMismatchError, + too_much_output_requested::TooMuchOutputRequestedError, +}; + +mod input_too_long; +mod into_unspecified; +mod key_rejected; +mod unspecified; + +#[cold] +#[inline(never)] +pub(crate) fn erase(_: T) -> Unspecified { + Unspecified +} + +cold_exhaustive_error! { + struct too_much_output_requested::TooMuchOutputRequestedError + with pub(crate) constructor { + // Note that this might not actually be the (exact) output length + // requested, and its units might be lost. For example, it could be any of + // the following: + // + // * The length in bytes of the entire output. + // * The length in bytes of some *part* of the output. + // * A bit length. + // * A length in terms of "blocks" or other grouping of output values. + // * Some intermediate quantity that was used when checking the output + // length. + // * Some arbitrary value. + imprecise_output_length: usize + } +} + +cold_exhaustive_error! { + struct len_mismatch_error::LenMismatchError + with pub(crate) constructor { + len: usize + } +} diff --git a/ring-0.17.14/src/error/unspecified.rs b/ring-0.17.14/src/error/unspecified.rs new file mode 100644 index 0000000000..22a3e02950 --- /dev/null +++ b/ring-0.17.14/src/error/unspecified.rs @@ -0,0 +1,85 @@ +// Copyright 2016-2024 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +#[cfg(feature = "std")] +extern crate std; + +/// An error with absolutely no details. +/// +/// *ring* uses this unit type as the error type in most of its results +/// because (a) usually the specific reasons for a failure are obvious or are +/// not useful to know, and/or (b) providing more details about a failure might +/// provide a dangerous side channel, and/or (c) it greatly simplifies the +/// error handling logic. +/// +/// `Result` is mostly equivalent to +/// `Result`. However, `ring::error::Unspecified` implements +/// [`std::error::Error`] and users of *ring* can implement +/// `From` to map this to their own error types, as +/// described in [“Error Handling” in the Rust Book]: +/// +/// ``` +/// use ring::rand::{self, SecureRandom}; +/// +/// enum Error { +/// CryptoError, +/// +/// # #[cfg(feature = "alloc")] +/// IOError(std::io::Error), +/// // [...] +/// } +/// +/// impl From for Error { +/// fn from(_: ring::error::Unspecified) -> Self { Error::CryptoError } +/// } +/// +/// fn eight_random_bytes() -> Result<[u8; 8], Error> { +/// let rng = rand::SystemRandom::new(); +/// let mut bytes = [0; 8]; +/// +/// // The `From` implementation above makes this +/// // equivalent to +/// // `rng.fill(&mut bytes).map_err(|_| Error::CryptoError)?`. 
+/// rng.fill(&mut bytes)?; +/// +/// Ok(bytes) +/// } +/// +/// assert!(eight_random_bytes().is_ok()); +/// ``` +/// +/// Experience with using and implementing other crypto libraries like has +/// shown that sophisticated error reporting facilities often cause significant +/// bugs themselves, both within the crypto library and within users of the +/// crypto library. This approach attempts to minimize complexity in the hopes +/// of avoiding such problems. In some cases, this approach may be too extreme, +/// and it may be important for an operation to provide some details about the +/// cause of a failure. Users of *ring* are encouraged to report such cases so +/// that they can be addressed individually. +/// +/// [`std::error::Error`]: https://doc.rust-lang.org/std/error/trait.Error.html +/// [“Error Handling” in the Rust Book]: +/// https://doc.rust-lang.org/book/first-edition/error-handling.html#the-from-trait +#[derive(Clone, Copy, Debug, PartialEq)] +pub struct Unspecified; + +// This is required for the implementation of `std::error::Error`. +impl core::fmt::Display for Unspecified { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + f.write_str("ring::error::Unspecified") + } +} + +#[cfg(feature = "std")] +impl std::error::Error for Unspecified {} diff --git a/ring-0.17.14/src/hkdf.rs b/ring-0.17.14/src/hkdf.rs new file mode 100644 index 0000000000..6a28a4e91a --- /dev/null +++ b/ring-0.17.14/src/hkdf.rs @@ -0,0 +1,233 @@ +// Copyright 2015 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +//! HMAC-based Extract-and-Expand Key Derivation Function. +//! +//! HKDF is specified in [RFC 5869]. +//! +//! [RFC 5869]: https://tools.ietf.org/html/rfc5869 + +use crate::{error, hmac}; + +/// An HKDF algorithm. +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub struct Algorithm(hmac::Algorithm); + +impl Algorithm { + /// The underlying HMAC algorithm. + #[inline] + pub fn hmac_algorithm(&self) -> hmac::Algorithm { + self.0 + } +} + +/// HKDF using HMAC-SHA-1. Obsolete. +pub static HKDF_SHA1_FOR_LEGACY_USE_ONLY: Algorithm = + Algorithm(hmac::HMAC_SHA1_FOR_LEGACY_USE_ONLY); + +/// HKDF using HMAC-SHA-256. +pub static HKDF_SHA256: Algorithm = Algorithm(hmac::HMAC_SHA256); + +/// HKDF using HMAC-SHA-384. +pub static HKDF_SHA384: Algorithm = Algorithm(hmac::HMAC_SHA384); + +/// HKDF using HMAC-SHA-512. +pub static HKDF_SHA512: Algorithm = Algorithm(hmac::HMAC_SHA512); + +impl KeyType for Algorithm { + fn len(&self) -> usize { + self.0.digest_algorithm().output_len() + } +} + +/// A salt for HKDF operations. +#[derive(Debug)] +pub struct Salt(hmac::Key); + +impl Salt { + /// Constructs a new `Salt` with the given value based on the given digest + /// algorithm. 
+ /// + /// Constructing a `Salt` is relatively expensive so it is good to reuse a + /// `Salt` object instead of re-constructing `Salt`s with the same value. + pub fn new(algorithm: Algorithm, value: &[u8]) -> Self { + Self(hmac::Key::new(algorithm.0, value)) + } + + /// The [HKDF-Extract] operation. + /// + /// [HKDF-Extract]: https://tools.ietf.org/html/rfc5869#section-2.2 + pub fn extract(&self, secret: &[u8]) -> Prk { + // The spec says that if no salt is provided then a key of + // `digest_alg.output_len` bytes of zeros is used. But, HMAC keys are + // already zero-padded to the block length, which is larger than the output + // length of the extract step (the length of the digest). Consequently the + // `Key` constructor will automatically do the right thing for a + // zero-length string. + let salt = &self.0; + let prk = hmac::sign(salt, secret); + Prk(hmac::Key::new(salt.algorithm(), prk.as_ref())) + } + + /// The algorithm used to derive this salt. + #[inline] + pub fn algorithm(&self) -> Algorithm { + Algorithm(self.0.algorithm()) + } +} + +impl From> for Salt { + fn from(okm: Okm<'_, Algorithm>) -> Self { + Self(hmac::Key::from(Okm { + prk: okm.prk, + info: okm.info, + len: okm.len().0, + len_cached: okm.len_cached, + })) + } +} + +/// The length of the OKM (Output Keying Material) for a `Prk::expand()` call. +pub trait KeyType { + /// The length that `Prk::expand()` should expand its input to. + fn len(&self) -> usize; +} + +/// A HKDF PRK (pseudorandom key). +#[derive(Clone, Debug)] +pub struct Prk(hmac::Key); + +impl Prk { + /// Construct a new `Prk` directly with the given value. + /// + /// Usually one can avoid using this. It is useful when the application + /// intentionally wants to leak the PRK secret, e.g. to implement + /// `SSLKEYLOGFILE` functionality. + pub fn new_less_safe(algorithm: Algorithm, value: &[u8]) -> Self { + Self(hmac::Key::new(algorithm.hmac_algorithm(), value)) + } + + /// The [HKDF-Expand] operation. + /// + /// [HKDF-Expand]: https://tools.ietf.org/html/rfc5869#section-2.3 + /// + /// Fails if (and only if) `len` is too large. + #[inline] + pub fn expand<'a, L: KeyType>( + &'a self, + info: &'a [&'a [u8]], + len: L, + ) -> Result, error::Unspecified> { + let len_cached = len.len(); + if len_cached > 255 * self.0.algorithm().digest_algorithm().output_len() { + return Err(error::Unspecified); + } + Ok(Okm { + prk: self, + info, + len, + len_cached, + }) + } +} + +impl From> for Prk { + fn from(okm: Okm) -> Self { + Self(hmac::Key::from(Okm { + prk: okm.prk, + info: okm.info, + len: okm.len().0, + len_cached: okm.len_cached, + })) + } +} + +/// An HKDF OKM (Output Keying Material) +/// +/// Intentionally not `Clone` or `Copy` as an OKM is generally only safe to +/// use once. +#[derive(Debug)] +pub struct Okm<'a, L: KeyType> { + prk: &'a Prk, + info: &'a [&'a [u8]], + len: L, + len_cached: usize, +} + +impl Okm<'_, L> { + /// The `OkmLength` given to `Prk::expand()`. + #[inline] + pub fn len(&self) -> &L { + &self.len + } + + /// Fills `out` with the output of the HKDF-Expand operation for the given + /// inputs. + /// + /// Fails if (and only if) the requested output length is larger than 255 + /// times the size of the digest algorithm's output. (This is the limit + /// imposed by the HKDF specification due to the way HKDF's counter is + /// constructed.) 
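As an illustrative aside (not part of the vendored sources): a minimal extract-then-expand sketch against the `Salt`/`Prk`/`Okm` API above; the salt and info values are placeholders chosen only for the example:

```rust
use ring::hkdf;

/// Derive a 32-byte key: HKDF-Extract(salt, ikm) -> PRK, then
/// HKDF-Expand(PRK, info) -> OKM. `HKDF_SHA256` doubles as the `KeyType`,
/// so the output buffer must be exactly its output length (32 bytes).
fn derive_key(ikm: &[u8]) -> Result<[u8; 32], ring::error::Unspecified> {
    let salt = hkdf::Salt::new(hkdf::HKDF_SHA256, b"example salt");
    let prk = salt.extract(ikm);

    let info: &[&[u8]] = &[b"example info"];
    let okm = prk.expand(info, hkdf::HKDF_SHA256)?;

    let mut out = [0u8; 32];
    okm.fill(&mut out)?;
    Ok(out)
}
```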
+ #[inline] + pub fn fill(self, out: &mut [u8]) -> Result<(), error::Unspecified> { + fill_okm(self.prk, self.info, out, self.len_cached) + } +} + +fn fill_okm( + prk: &Prk, + info: &[&[u8]], + out: &mut [u8], + len: usize, +) -> Result<(), error::Unspecified> { + if out.len() != len { + return Err(error::Unspecified); + } + + let digest_alg = prk.0.algorithm().digest_algorithm(); + assert!(digest_alg.block_len() >= digest_alg.output_len()); + + let mut ctx = hmac::Context::with_key(&prk.0); + + let mut n = 1u8; + let mut out = out; + loop { + for info in info { + ctx.update(info); + } + ctx.update(&[n]); + + let t = ctx.sign(); + let t = t.as_ref(); + + // Append `t` to the output. + out = if out.len() < digest_alg.output_len() { + let len = out.len(); + out.copy_from_slice(&t[..len]); + &mut [] + } else { + let (this_chunk, rest) = out.split_at_mut(digest_alg.output_len()); + this_chunk.copy_from_slice(t); + rest + }; + + if out.is_empty() { + return Ok(()); + } + + ctx = hmac::Context::with_key(&prk.0); + ctx.update(t); + n = n.checked_add(1).unwrap(); + } +} diff --git a/ring-0.17.14/src/hmac.rs b/ring-0.17.14/src/hmac.rs new file mode 100644 index 0000000000..f0b23809c4 --- /dev/null +++ b/ring-0.17.14/src/hmac.rs @@ -0,0 +1,440 @@ +// Copyright 2015-2016 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +//! HMAC is specified in [RFC 2104]. +//! +//! After a `Key` is constructed, it can be used for multiple signing or +//! verification operations. Separating the construction of the key from the +//! rest of the HMAC operation allows the per-key precomputation to be done +//! only once, instead of it being done in every HMAC operation. +//! +//! Frequently all the data to be signed in a message is available in a single +//! contiguous piece. In that case, the module-level `sign` function can be +//! used. Otherwise, if the input is in multiple parts, `Context` should be +//! used. +//! +//! # Examples: +//! +//! ## Signing a value and verifying it wasn't tampered with +//! +//! ``` +//! use ring::{hmac, rand}; +//! +//! let rng = rand::SystemRandom::new(); +//! let key = hmac::Key::generate(hmac::HMAC_SHA256, &rng)?; +//! +//! let msg = "hello, world"; +//! +//! let tag = hmac::sign(&key, msg.as_bytes()); +//! +//! // [We give access to the message to an untrusted party, and they give it +//! // back to us. We need to verify they didn't tamper with it.] +//! +//! hmac::verify(&key, msg.as_bytes(), tag.as_ref())?; +//! +//! # Ok::<(), ring::error::Unspecified>(()) +//! ``` +//! +//! ## Using the one-shot API: +//! +//! ``` +//! use ring::{digest, hmac, rand}; +//! use ring::rand::SecureRandom; +//! +//! let msg = "hello, world"; +//! +//! // The sender generates a secure key value and signs the message with it. +//! 
// Note that in a real protocol, a key agreement protocol would be used to +//! // derive `key_value`. +//! let rng = rand::SystemRandom::new(); +//! let key_value: [u8; digest::SHA256_OUTPUT_LEN] = rand::generate(&rng)?.expose(); +//! +//! let s_key = hmac::Key::new(hmac::HMAC_SHA256, key_value.as_ref()); +//! let tag = hmac::sign(&s_key, msg.as_bytes()); +//! +//! // The receiver (somehow!) knows the key value, and uses it to verify the +//! // integrity of the message. +//! let v_key = hmac::Key::new(hmac::HMAC_SHA256, key_value.as_ref()); +//! hmac::verify(&v_key, msg.as_bytes(), tag.as_ref())?; +//! +//! # Ok::<(), ring::error::Unspecified>(()) +//! ``` +//! +//! ## Using the multi-part API: +//! ``` +//! use ring::{digest, hmac, rand}; +//! use ring::rand::SecureRandom; +//! +//! let parts = ["hello", ", ", "world"]; +//! +//! // The sender generates a secure key value and signs the message with it. +//! // Note that in a real protocol, a key agreement protocol would be used to +//! // derive `key_value`. +//! let rng = rand::SystemRandom::new(); +//! let mut key_value: [u8; digest::SHA384_OUTPUT_LEN] = rand::generate(&rng)?.expose(); +//! +//! let s_key = hmac::Key::new(hmac::HMAC_SHA384, key_value.as_ref()); +//! let mut s_ctx = hmac::Context::with_key(&s_key); +//! for part in &parts { +//! s_ctx.update(part.as_bytes()); +//! } +//! let tag = s_ctx.sign(); +//! +//! // The receiver (somehow!) knows the key value, and uses it to verify the +//! // integrity of the message. +//! let v_key = hmac::Key::new(hmac::HMAC_SHA384, key_value.as_ref()); +//! let mut msg = Vec::::new(); +//! for part in &parts { +//! msg.extend(part.as_bytes()); +//! } +//! hmac::verify(&v_key, &msg.as_ref(), tag.as_ref())?; +//! +//! # Ok::<(), ring::error::Unspecified>(()) +//! ``` +//! +//! [RFC 2104]: https://tools.ietf.org/html/rfc2104 + +use crate::{ + bb, cpu, + digest::{self, Digest, FinishError}, + error, hkdf, rand, +}; + +pub(crate) use crate::digest::InputTooLongError; + +/// An HMAC algorithm. +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub struct Algorithm(&'static digest::Algorithm); + +impl Algorithm { + /// The digest algorithm this HMAC algorithm is based on. + #[inline] + pub fn digest_algorithm(&self) -> &'static digest::Algorithm { + self.0 + } +} + +/// HMAC using SHA-1. Obsolete. +pub static HMAC_SHA1_FOR_LEGACY_USE_ONLY: Algorithm = Algorithm(&digest::SHA1_FOR_LEGACY_USE_ONLY); + +/// HMAC using SHA-256. +pub static HMAC_SHA256: Algorithm = Algorithm(&digest::SHA256); + +/// HMAC using SHA-384. +pub static HMAC_SHA384: Algorithm = Algorithm(&digest::SHA384); + +/// HMAC using SHA-512. +pub static HMAC_SHA512: Algorithm = Algorithm(&digest::SHA512); + +/// An HMAC tag. +/// +/// For a given tag `t`, use `t.as_ref()` to get the tag value as a byte slice. +#[derive(Clone, Copy, Debug)] +pub struct Tag(Digest); + +impl AsRef<[u8]> for Tag { + #[inline] + fn as_ref(&self) -> &[u8] { + self.0.as_ref() + } +} + +/// A key to use for HMAC signing. +#[derive(Clone)] +pub struct Key { + inner: digest::BlockContext, + outer: digest::BlockContext, +} + +impl core::fmt::Debug for Key { + fn fmt(&self, f: &mut core::fmt::Formatter) -> Result<(), core::fmt::Error> { + f.debug_struct("Key") + .field("algorithm", self.algorithm().digest_algorithm()) + .finish() + } +} + +impl Key { + /// Generate an HMAC signing key using the given digest algorithm with a + /// random value generated from `rng`. 
+ /// + /// The key will be `digest_alg.output_len` bytes long, based on the + /// recommendation in [RFC 2104 Section 3]. + /// + /// [RFC 2104 Section 3]: https://tools.ietf.org/html/rfc2104#section-3 + pub fn generate( + algorithm: Algorithm, + rng: &dyn rand::SecureRandom, + ) -> Result { + Self::construct(algorithm, |buf| rng.fill(buf), cpu::features()) + } + + fn construct( + algorithm: Algorithm, + fill: F, + cpu: cpu::Features, + ) -> Result + where + F: FnOnce(&mut [u8]) -> Result<(), error::Unspecified>, + { + let mut key_bytes = [0; digest::MAX_OUTPUT_LEN]; + let key_bytes = &mut key_bytes[..algorithm.0.output_len()]; + fill(key_bytes)?; + Self::try_new(algorithm, key_bytes, cpu).map_err(error::erase::) + } + + /// Construct an HMAC signing key using the given digest algorithm and key + /// value. + /// + /// `key_value` should be a value generated using a secure random number + /// generator (e.g. the `key_value` output by + /// `SealingKey::generate_serializable()`) or derived from a random key by + /// a key derivation function (e.g. `ring::hkdf`). In particular, + /// `key_value` shouldn't be a password. + /// + /// As specified in RFC 2104, if `key_value` is shorter than the digest + /// algorithm's block length (as returned by `digest::Algorithm::block_len()`, + /// not the digest length returned by `digest::Algorithm::output_len()`) then + /// it will be padded with zeros. Similarly, if it is longer than the block + /// length then it will be compressed using the digest algorithm. + /// + /// You should not use keys larger than the `digest_alg.block_len` because + /// the truncation described above reduces their strength to only + /// `digest_alg.output_len * 8` bits. Support for such keys is likely to be + /// removed in a future version of *ring*. + pub fn new(algorithm: Algorithm, key_value: &[u8]) -> Self { + Self::try_new(algorithm, key_value, cpu::features()) + .map_err(error::erase::) + .unwrap() + } + + pub(crate) fn try_new( + algorithm: Algorithm, + key_value: &[u8], + cpu_features: cpu::Features, + ) -> Result { + let digest_alg = algorithm.0; + let mut key = Self { + inner: digest::BlockContext::new(digest_alg), + outer: digest::BlockContext::new(digest_alg), + }; + + let block_len = digest_alg.block_len(); + + let key_hash; + let key_value = if key_value.len() <= block_len { + key_value + } else { + key_hash = Digest::compute_from(digest_alg, key_value, cpu_features)?; + key_hash.as_ref() + }; + + const IPAD: u8 = 0x36; + + let mut padded_key = [IPAD; digest::MAX_BLOCK_LEN]; + let padded_key = &mut padded_key[..block_len]; + + // If the key is shorter than one block then we're supposed to act like + // it is padded with zero bytes up to the block length. `x ^ 0 == x` so + // we can just leave the trailing bytes of `padded_key` untouched. + bb::xor_assign_at_start(&mut padded_key[..], key_value); + + let leftover = key.inner.update(padded_key, cpu_features); + debug_assert_eq!(leftover.len(), 0); + + const OPAD: u8 = 0x5C; + + // Remove the `IPAD` masking, leaving the unmasked padded key, then + // mask with `OPAD`, all in one step. + bb::xor_assign(&mut padded_key[..], IPAD ^ OPAD); + let leftover = key.outer.update(padded_key, cpu_features); + debug_assert_eq!(leftover.len(), 0); + + Ok(key) + } + + /// The digest algorithm for the key. 
+ #[inline] + pub fn algorithm(&self) -> Algorithm { + Algorithm(self.inner.algorithm) + } + + pub(crate) fn sign(&self, data: &[u8], cpu: cpu::Features) -> Result { + let mut ctx = Context::with_key(self); + ctx.update(data); + ctx.try_sign(cpu) + } + + fn verify(&self, data: &[u8], tag: &[u8], cpu: cpu::Features) -> Result<(), VerifyError> { + let computed = self + .sign(data, cpu) + .map_err(VerifyError::InputTooLongError)?; + bb::verify_slices_are_equal(computed.as_ref(), tag) + .map_err(|_: error::Unspecified| VerifyError::Mismatch) + } +} + +impl hkdf::KeyType for Algorithm { + fn len(&self) -> usize { + self.digest_algorithm().output_len() + } +} + +impl From> for Key { + fn from(okm: hkdf::Okm) -> Self { + Self::construct(*okm.len(), |buf| okm.fill(buf), cpu::features()).unwrap() + } +} + +/// A context for multi-step (Init-Update-Finish) HMAC signing. +/// +/// Use `sign` for single-step HMAC signing. +#[derive(Clone)] +pub struct Context { + inner: digest::Context, + outer: digest::BlockContext, +} + +impl core::fmt::Debug for Context { + fn fmt(&self, f: &mut core::fmt::Formatter) -> Result<(), core::fmt::Error> { + f.debug_struct("Context") + .field("algorithm", self.inner.algorithm()) + .finish() + } +} + +impl Context { + /// Constructs a new HMAC signing context using the given digest algorithm + /// and key. + pub fn with_key(signing_key: &Key) -> Self { + Self { + inner: digest::Context::clone_from(&signing_key.inner), + outer: signing_key.outer.clone(), + } + } + + /// Updates the HMAC with all the data in `data`. `update` may be called + /// zero or more times until `finish` is called. + pub fn update(&mut self, data: &[u8]) { + self.inner.update(data); + } + + /// Finalizes the HMAC calculation and returns the HMAC value. `sign` + /// consumes the context so it cannot be (mis-)used after `sign` has been + /// called. + /// + /// It is generally not safe to implement HMAC verification by comparing + /// the return value of `sign` to a tag. Use `verify` for verification + /// instead. + pub fn sign(self) -> Tag { + self.try_sign(cpu::features()) + .map_err(error::erase::) + .unwrap() + } + + pub(crate) fn try_sign(self, cpu_features: cpu::Features) -> Result { + // Consequently, `num_pending` is valid. + debug_assert_eq!(self.inner.algorithm(), self.outer.algorithm); + debug_assert!(self.inner.algorithm().output_len() < self.outer.algorithm.block_len()); + + let inner = self.inner.try_finish(cpu_features)?; + let inner = inner.as_ref(); + let num_pending = inner.len(); + let buffer = &mut [0u8; digest::MAX_BLOCK_LEN]; + const _BUFFER_IS_LARGE_ENOUGH_TO_HOLD_INNER: () = + assert!(digest::MAX_OUTPUT_LEN < digest::MAX_BLOCK_LEN); + buffer[..num_pending].copy_from_slice(inner); + + self.outer + .try_finish(buffer, num_pending, cpu_features) + .map(Tag) + .map_err(|err| match err { + FinishError::InputTooLong(i) => { + // Unreachable, as we gave the inner context exactly the + // same input we gave the outer context, and + // `inner.try_finish` already succeeded. However, it is + // quite difficult to prove this, and we already return + // `InputTooLongError`, so just forward it along. + i + } + FinishError::PendingNotAPartialBlock(_) => { + // Follows from the assertions above. + unreachable!() + } + }) + } +} + +/// Calculates the HMAC of `data` using the key `key` in one step. +/// +/// Use `Context` to calculate HMACs where the input is in multiple parts. 
+/// +/// It is generally not safe to implement HMAC verification by comparing the +/// return value of `sign` to a tag. Use `verify` for verification instead. +pub fn sign(key: &Key, data: &[u8]) -> Tag { + key.sign(data, cpu::features()) + .map_err(error::erase::) + .unwrap() +} + +/// Calculates the HMAC of `data` using the signing key `key`, and verifies +/// whether the resultant value equals `tag`, in one step. +/// +/// This is logically equivalent to, but more efficient than, constructing a +/// `Key` with the same value as `key` and then using `verify`. +/// +/// The verification will be done in constant time to prevent timing attacks. +pub fn verify(key: &Key, data: &[u8], tag: &[u8]) -> Result<(), error::Unspecified> { + key.verify(data, tag, cpu::features()) + .map_err(|_: VerifyError| error::Unspecified) +} + +enum VerifyError { + // Theoretically somebody could have calculated a valid tag with a gigantic + // input that we do not support. If we were to support every theoretically + // valid input length, for *every* digest algorithm, then we could argue + // that hitting the input length limit implies a mismatch since nobody + // could have calculated such a tag with the given input. + #[allow(dead_code)] + InputTooLongError(InputTooLongError), + + Mismatch, +} + +#[cfg(test)] +mod tests { + use crate::{hmac, rand}; + + // Make sure that `Key::generate` and `verify_with_own_key` aren't + // completely wacky. + #[test] + pub fn hmac_signing_key_coverage() { + let rng = rand::SystemRandom::new(); + + const HELLO_WORLD_GOOD: &[u8] = b"hello, world"; + const HELLO_WORLD_BAD: &[u8] = b"hello, worle"; + + for algorithm in &[ + hmac::HMAC_SHA1_FOR_LEGACY_USE_ONLY, + hmac::HMAC_SHA256, + hmac::HMAC_SHA384, + hmac::HMAC_SHA512, + ] { + let key = hmac::Key::generate(*algorithm, &rng).unwrap(); + let tag = hmac::sign(&key, HELLO_WORLD_GOOD); + assert!(hmac::verify(&key, HELLO_WORLD_GOOD, tag.as_ref()).is_ok()); + assert!(hmac::verify(&key, HELLO_WORLD_BAD, tag.as_ref()).is_err()) + } + } +} diff --git a/ring-0.17.14/src/io.rs b/ring-0.17.14/src/io.rs new file mode 100644 index 0000000000..232d5ff39a --- /dev/null +++ b/ring-0.17.14/src/io.rs @@ -0,0 +1,31 @@ +// Copyright 2018 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +//! Serialization and deserialization. + +#[doc(hidden)] +pub mod der; + +#[cfg(feature = "alloc")] +mod writer; + +#[cfg(feature = "alloc")] +pub(crate) mod der_writer; + +pub(crate) mod positive; + +pub use self::positive::Positive; + +#[cfg(feature = "alloc")] +pub(crate) use self::writer::TooLongError; diff --git a/ring-0.17.14/src/io/der.rs b/ring-0.17.14/src/io/der.rs new file mode 100644 index 0000000000..52aed92b9b --- /dev/null +++ b/ring-0.17.14/src/io/der.rs @@ -0,0 +1,301 @@ +// Copyright 2015 Brian Smith. 
+// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +//! Building blocks for parsing DER-encoded ASN.1 structures. +//! +//! This module contains the foundational parts of an ASN.1 DER parser. + +use super::Positive; +use crate::error; + +pub const CONSTRUCTED: u8 = 1 << 5; +pub const CONTEXT_SPECIFIC: u8 = 2 << 6; + +#[derive(Clone, Copy, PartialEq)] +#[repr(u8)] +pub enum Tag { + Boolean = 0x01, + Integer = 0x02, + BitString = 0x03, + OctetString = 0x04, + Null = 0x05, + OID = 0x06, + Sequence = CONSTRUCTED | 0x10, // 0x30 + UTCTime = 0x17, + GeneralizedTime = 0x18, + + ContextSpecific1 = CONTEXT_SPECIFIC | 1, + + ContextSpecificConstructed0 = CONTEXT_SPECIFIC | CONSTRUCTED | 0, + ContextSpecificConstructed1 = CONTEXT_SPECIFIC | CONSTRUCTED | 1, + ContextSpecificConstructed3 = CONTEXT_SPECIFIC | CONSTRUCTED | 3, +} + +impl From for usize { + fn from(tag: Tag) -> Self { + Self::from(Tag::into(tag)) + } +} + +impl From for u8 { + fn from(tag: Tag) -> Self { + Tag::into(tag) + } +} + +// `impl From for u8` but as a `const fn`. +impl Tag { + pub const fn into(self) -> u8 { + self as u8 + } +} + +pub fn expect_tag_and_get_value<'a>( + input: &mut untrusted::Reader<'a>, + tag: Tag, +) -> Result, error::Unspecified> { + let (actual_tag, inner) = read_tag_and_get_value(input)?; + if usize::from(tag) != usize::from(actual_tag) { + return Err(error::Unspecified); + } + Ok(inner) +} + +pub fn read_tag_and_get_value<'a>( + input: &mut untrusted::Reader<'a>, +) -> Result<(u8, untrusted::Input<'a>), error::Unspecified> { + let tag = input.read_byte()?; + if (tag & 0x1F) == 0x1F { + return Err(error::Unspecified); // High tag number form is not allowed. + } + + // If the high order bit of the first byte is set to zero then the length + // is encoded in the seven remaining bits of that byte. Otherwise, those + // seven bits represent the number of bytes used to encode the length. + let length = match input.read_byte()? { + n if (n & 0x80) == 0 => usize::from(n), + 0x81 => { + let second_byte = input.read_byte()?; + if second_byte < 128 { + return Err(error::Unspecified); // Not the canonical encoding. + } + usize::from(second_byte) + } + 0x82 => { + let second_byte = usize::from(input.read_byte()?); + let third_byte = usize::from(input.read_byte()?); + let combined = (second_byte << 8) | third_byte; + if combined < 256 { + return Err(error::Unspecified); // Not the canonical encoding. + } + combined + } + _ => { + return Err(error::Unspecified); // We don't support longer lengths. 
+ } + }; + + let inner = input.read_bytes(length)?; + Ok((tag, inner)) +} + +#[inline] +pub fn bit_string_with_no_unused_bits<'a>( + input: &mut untrusted::Reader<'a>, +) -> Result, error::Unspecified> { + bit_string_tagged_with_no_unused_bits(Tag::BitString, input) +} + +pub(crate) fn bit_string_tagged_with_no_unused_bits<'a>( + tag: Tag, + input: &mut untrusted::Reader<'a>, +) -> Result, error::Unspecified> { + nested(input, tag, error::Unspecified, |value| { + let unused_bits_at_end = value.read_byte().map_err(|_| error::Unspecified)?; + if unused_bits_at_end != 0 { + return Err(error::Unspecified); + } + Ok(value.read_bytes_to_end()) + }) +} + +// TODO: investigate taking decoder as a reference to reduce generated code +// size. +pub fn nested<'a, F, R, E: Copy>( + input: &mut untrusted::Reader<'a>, + tag: Tag, + error: E, + decoder: F, +) -> Result +where + F: FnOnce(&mut untrusted::Reader<'a>) -> Result, +{ + let inner = expect_tag_and_get_value(input, tag).map_err(|_| error)?; + inner.read_all(error, decoder) +} + +pub(crate) fn nonnegative_integer<'a>( + input: &mut untrusted::Reader<'a>, +) -> Result, error::Unspecified> { + let value = expect_tag_and_get_value(input, Tag::Integer)?; + match value + .as_slice_less_safe() + .split_first() + .ok_or(error::Unspecified)? + { + // Zero or leading zero. + (0, rest) => { + match rest.first() { + // Zero. + None => Ok(value), + // Necessary leading zero. + Some(&second) if second & 0x80 == 0x80 => Ok(untrusted::Input::from(rest)), + // Unnecessary leading zero. + _ => Err(error::Unspecified), + } + } + // Positive value with no leading zero. + (first, _) if first & 0x80 == 0 => Ok(value), + // Negative value. + (_, _) => Err(error::Unspecified), + } +} + +/// Parse as integer with a value in the in the range [0, 255], returning its +/// numeric value. This is typically used for parsing version numbers. +#[inline] +pub fn small_nonnegative_integer(input: &mut untrusted::Reader) -> Result { + let value = nonnegative_integer(input)?; + match *value.as_slice_less_safe() { + [b] => Ok(b), + _ => Err(error::Unspecified), + } +} + +/// Parses a positive DER integer, returning the big-endian-encoded value, +/// sans any leading zero byte. +pub fn positive_integer<'a>( + input: &mut untrusted::Reader<'a>, +) -> Result, error::Unspecified> { + let value = nonnegative_integer(input)?; + Positive::from_be_bytes(value) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::error; + + fn with_i<'a, F, R>(value: &'a [u8], f: F) -> Result + where + F: FnOnce(&mut untrusted::Reader<'a>) -> Result, + { + untrusted::Input::from(value).read_all(error::Unspecified, f) + } + + static ZERO_INTEGER: &[u8] = &[0x02, 0x01, 0x00]; + + static GOOD_POSITIVE_INTEGERS_SMALL: &[(&[u8], u8)] = &[ + (&[0x02, 0x01, 0x01], 0x01), + (&[0x02, 0x01, 0x02], 0x02), + (&[0x02, 0x01, 0x7e], 0x7e), + (&[0x02, 0x01, 0x7f], 0x7f), + // Values that need to have an 0x00 prefix to disambiguate them from + // them from negative values. + (&[0x02, 0x02, 0x00, 0x80], 0x80), + (&[0x02, 0x02, 0x00, 0x81], 0x81), + (&[0x02, 0x02, 0x00, 0xfe], 0xfe), + (&[0x02, 0x02, 0x00, 0xff], 0xff), + ]; + + static GOOD_POSITIVE_INTEGERS_LARGE: &[(&[u8], &[u8])] = &[ + (&[0x02, 0x02, 0x01, 0x00], &[0x01, 0x00]), + (&[0x02, 0x02, 0x02, 0x01], &[0x02, 0x01]), + (&[0x02, 0x02, 0x7e, 0xfe], &[0x7e, 0xfe]), + (&[0x02, 0x02, 0x7f, 0xff], &[0x7f, 0xff]), + // Values that need to have an 0x00 prefix to disambiguate them from + // them from negative values. 
+ (&[0x02, 0x03, 0x00, 0x80, 0x00], &[0x80, 0x00]), + (&[0x02, 0x03, 0x00, 0x81, 0x01], &[0x81, 0x01]), + (&[0x02, 0x03, 0x00, 0xfe, 0xfe], &[0xfe, 0xfe]), + (&[0x02, 0x03, 0x00, 0xff, 0xff], &[0xff, 0xff]), + ]; + + static BAD_NONNEGATIVE_INTEGERS: &[&[u8]] = &[ + &[], // At end of input + &[0x02], // Tag only + &[0x02, 0x00], // Empty value + // Length mismatch + &[0x02, 0x00, 0x01], + &[0x02, 0x01], + // Would be valid if leading zero is ignored when comparing length. + &[0x02, 0x01, 0x00, 0x01], + &[0x02, 0x01, 0x01, 0x00], // Would be valid if last byte is ignored. + &[0x02, 0x02, 0x01], + // Values that are missing a necessary leading 0x00 + &[0x02, 0x01, 0x80], + &[0x02, 0x01, 0x81], + &[0x02, 0x01, 0xfe], + &[0x02, 0x01, 0xff], + // Values that have an unnecessary leading 0x00 + &[0x02, 0x02, 0x00, 0x00], + &[0x02, 0x02, 0x00, 0x01], + &[0x02, 0x02, 0x00, 0x02], + &[0x02, 0x02, 0x00, 0x7e], + &[0x02, 0x02, 0x00, 0x7f], + ]; + + #[test] + fn test_small_nonnegative_integer() { + let zero = (ZERO_INTEGER, 0x00); + for &(test_in, test_out) in + core::iter::once(&zero).chain(GOOD_POSITIVE_INTEGERS_SMALL.iter()) + { + let result = with_i(test_in, |input| { + assert_eq!(small_nonnegative_integer(input)?, test_out); + Ok(()) + }); + assert_eq!(result, Ok(())); + } + for &test_in in BAD_NONNEGATIVE_INTEGERS + .iter() + .chain(GOOD_POSITIVE_INTEGERS_LARGE.iter().map(|(input, _)| input)) + { + let result = with_i(test_in, small_nonnegative_integer); + assert_eq!(result, Err(error::Unspecified)); + } + } + + #[test] + fn test_positive_integer() { + for (test_in, test_out) in GOOD_POSITIVE_INTEGERS_SMALL + .iter() + .map(|(test_in, test_out)| (*test_in, core::slice::from_ref(test_out))) + .chain(GOOD_POSITIVE_INTEGERS_LARGE.iter().copied()) + { + let result = with_i(test_in, |input| { + assert_eq!( + positive_integer(input)?.big_endian_without_leading_zero(), + test_out + ); + Ok(()) + }); + assert_eq!(result, Ok(())) + } + for &test_in in core::iter::once(&ZERO_INTEGER).chain(BAD_NONNEGATIVE_INTEGERS.iter()) { + let result = with_i(test_in, positive_integer); + assert!(matches!(result, Err(error::Unspecified))); + } + } +} diff --git a/ring-0.17.14/src/io/der_writer.rs b/ring-0.17.14/src/io/der_writer.rs new file mode 100644 index 0000000000..d6f9717d6c --- /dev/null +++ b/ring-0.17.14/src/io/der_writer.rs @@ -0,0 +1,71 @@ +// Copyright 2018 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
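As a quick reference (not part of the vendored sources) for the definite-length forms accepted by the parser above and emitted by the writer below, here is a small sketch with example encodings; the OCTET STRING tag 0x04 is arbitrary:

```rust
fn der_length_examples() {
    // Short form: header for a 3-byte OCTET STRING (tag 0x04), value "ace".
    let _short: [u8; 5] = [0x04, 0x03, b'a', b'c', b'e'];
    // 0x81 form: header for a 128-byte value (the 128 content bytes follow);
    // the second length byte must be >= 0x80.
    let _len_128: [u8; 3] = [0x04, 0x81, 0x80];
    // 0x82 form: header for a 256-byte value (0x0100 big-endian); the
    // combined value must be >= 256.
    let _len_256: [u8; 4] = [0x04, 0x82, 0x01, 0x00];
    // Rejected as non-canonical: [0x04, 0x81, 0x05] (5 fits the short form)
    // and [0x04, 0x82, 0x00, 0xff] (255 fits the 0x81 form).
}
```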
+ +use super::{der::*, writer::*, *}; +use alloc::boxed::Box; + +pub(crate) fn write_positive_integer( + output: &mut dyn Accumulator, + value: &Positive, +) -> Result<(), TooLongError> { + let first_byte = value.first_byte(); + let value = value.big_endian_without_leading_zero_as_input(); + write_tlv(output, Tag::Integer, |output| { + if (first_byte & 0x80) != 0 { + output.write_byte(0)?; // Disambiguate negative number. + } + write_copy(output, value) + }) +} + +pub(crate) fn write_all( + tag: Tag, + write_value: &dyn Fn(&mut dyn Accumulator) -> Result<(), TooLongError>, +) -> Result, TooLongError> { + let length = { + let mut length = LengthMeasurement::zero(); + write_tlv(&mut length, tag, write_value)?; + length + }; + + let mut output = Writer::with_capacity(length); + write_tlv(&mut output, tag, write_value)?; + + Ok(output.into()) +} + +fn write_tlv(output: &mut dyn Accumulator, tag: Tag, write_value: F) -> Result<(), TooLongError> +where + F: Fn(&mut dyn Accumulator) -> Result<(), TooLongError>, +{ + let length: usize = { + let mut length = LengthMeasurement::zero(); + write_value(&mut length)?; + length.into() + }; + let length: u16 = length.try_into().map_err(|_| TooLongError::new())?; + + output.write_byte(tag.into())?; + + let [lo, hi] = length.to_le_bytes(); + if length >= 0x1_00 { + output.write_byte(0x82)?; + output.write_byte(hi)?; + } else if length >= 0x80 { + output.write_byte(0x81)?; + } + output.write_byte(lo)?; + + write_value(output) +} diff --git a/ring-0.17.14/src/io/positive.rs b/ring-0.17.14/src/io/positive.rs new file mode 100644 index 0000000000..33f2fefc57 --- /dev/null +++ b/ring-0.17.14/src/io/positive.rs @@ -0,0 +1,98 @@ +// Copyright 2018 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +//! Serialization and deserialization. + +use crate::error; + +/// A serialized positive integer. +#[derive(Copy, Clone)] +pub struct Positive<'a>(untrusted::Input<'a>); + +impl<'a> Positive<'a> { + #[inline] + pub(crate) fn from_be_bytes(input: untrusted::Input<'a>) -> Result { + // Empty inputs are not allowed. + let &first_byte = input + .as_slice_less_safe() + .first() + .ok_or(error::Unspecified)?; + // Zero isn't allowed and leading zeros aren't allowed. + if first_byte == 0 { + return Err(error::Unspecified); + } + Ok(Self(input)) + } + + /// Returns the value, ordered from significant byte to least significant + /// byte, without any leading zeros. The result is guaranteed to be + /// non-empty. + #[inline] + pub fn big_endian_without_leading_zero(&self) -> &'a [u8] { + self.big_endian_without_leading_zero_as_input() + .as_slice_less_safe() + } + + #[inline] + pub(crate) fn big_endian_without_leading_zero_as_input(&self) -> untrusted::Input<'a> { + self.0 + } +} + +impl Positive<'_> { + /// Returns the first byte. 
+ /// + /// Will not panic because the value is guaranteed to have at least one + /// byte. + pub fn first_byte(&self) -> u8 { + // This won't panic because + self.0.as_slice_less_safe()[0] + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_from_be_bytes() { + static TEST_CASES: &[(&[u8], Result<&[u8], error::Unspecified>)] = &[ + // An empty input isn't a number. + (&[], Err(error::Unspecified)), + // Zero is not positive. + (&[0x00], Err(error::Unspecified)), + // Minimum value. No leading zero required or allowed. + (&[0x00, 0x01], Err(error::Unspecified)), + (&[0x01], Ok(&[0x01])), + // Maximum first byte. No leading zero required or allowed. + (&[0xff], Ok(&[0xff])), + (&[0x00, 0xff], Err(error::Unspecified)), + // The last byte can be zero. + (&[0x01, 0x00], Ok(&[0x01, 0x00])), + (&[0x01, 0x00, 0x00], Ok(&[0x01, 0x00, 0x00])), + // Having no zero bytes are also allowed. + (&[0x01, 0x01], Ok(&[0x01, 0x01])), + // A middle byte can be zero. + (&[0x01, 0x00, 0x01], Ok(&[0x01, 0x00, 0x01])), + (&[0x01, 0x01, 0x01], Ok(&[0x01, 0x01, 0x01])), + ]; + for &(input, result) in TEST_CASES { + let input = untrusted::Input::from(input); + assert_eq!( + Positive::from_be_bytes(input).map(|p| p.big_endian_without_leading_zero()), + result + ); + } + } +} diff --git a/ring-0.17.14/src/io/writer.rs b/ring-0.17.14/src/io/writer.rs new file mode 100644 index 0000000000..9ebe0d32a3 --- /dev/null +++ b/ring-0.17.14/src/io/writer.rs @@ -0,0 +1,97 @@ +// Copyright 2018 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
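As an illustrative aside (not part of the vendored sources): `write_all`/`write_tlv` above run the same serializer twice — once against a byte counter to learn the exact length (needed both for the DER length prefix and to size the buffer), then against a real buffer. A condensed, generic sketch of that measure-then-write pattern, mirroring the `LengthMeasurement` and `Writer` accumulators defined just below:

```rust
trait Sink {
    fn put(&mut self, bytes: &[u8]);
}

/// First pass: only count bytes.
struct Counter(usize);
impl Sink for Counter {
    fn put(&mut self, bytes: &[u8]) {
        self.0 += bytes.len();
    }
}

/// Second pass: write into a buffer pre-sized to the measured length.
struct Buffer(Vec<u8>);
impl Sink for Buffer {
    fn put(&mut self, bytes: &[u8]) {
        self.0.extend_from_slice(bytes);
    }
}

fn measure_then_write(write: impl Fn(&mut dyn Sink)) -> Vec<u8> {
    let mut counter = Counter(0);
    write(&mut counter);
    let mut buf = Buffer(Vec::with_capacity(counter.0));
    write(&mut buf);
    // Both passes must agree, exactly as Writer asserts against its
    // requested capacity.
    debug_assert_eq!(buf.0.len(), counter.0);
    buf.0
}
```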
+ +use alloc::{boxed::Box, vec::Vec}; + +pub trait Accumulator { + fn write_byte(&mut self, value: u8) -> Result<(), TooLongError>; + fn write_bytes(&mut self, value: &[u8]) -> Result<(), TooLongError>; +} + +pub(super) struct LengthMeasurement { + len: usize, +} + +impl From for usize { + fn from(len: LengthMeasurement) -> usize { + len.len + } +} + +impl LengthMeasurement { + pub fn zero() -> Self { + Self { len: 0 } + } +} + +impl Accumulator for LengthMeasurement { + fn write_byte(&mut self, _value: u8) -> Result<(), TooLongError> { + self.len = self.len.checked_add(1).ok_or_else(TooLongError::new)?; + Ok(()) + } + fn write_bytes(&mut self, value: &[u8]) -> Result<(), TooLongError> { + self.len = self + .len + .checked_add(value.len()) + .ok_or_else(TooLongError::new)?; + Ok(()) + } +} + +pub(super) struct Writer { + bytes: Vec, + requested_capacity: usize, +} + +impl Writer { + pub(super) fn with_capacity(capacity: LengthMeasurement) -> Self { + Self { + bytes: Vec::with_capacity(capacity.len), + requested_capacity: capacity.len, + } + } +} + +impl From for Box<[u8]> { + fn from(writer: Writer) -> Self { + assert_eq!(writer.requested_capacity, writer.bytes.len()); + writer.bytes.into_boxed_slice() + } +} + +impl Accumulator for Writer { + fn write_byte(&mut self, value: u8) -> Result<(), TooLongError> { + self.bytes.push(value); + Ok(()) + } + fn write_bytes(&mut self, value: &[u8]) -> Result<(), TooLongError> { + self.bytes.extend(value); + Ok(()) + } +} + +pub fn write_copy( + accumulator: &mut dyn Accumulator, + to_copy: untrusted::Input, +) -> Result<(), TooLongError> { + accumulator.write_bytes(to_copy.as_slice_less_safe()) +} + +pub struct TooLongError(()); + +impl TooLongError { + pub fn new() -> Self { + Self(()) + } +} diff --git a/ring-0.17.14/src/lib.rs b/ring-0.17.14/src/lib.rs new file mode 100644 index 0000000000..5d33f39026 --- /dev/null +++ b/ring-0.17.14/src/lib.rs @@ -0,0 +1,180 @@ +// Copyright 2015-2016 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +//! # Feature Flags +//! +//! +//!
+//! <table>
+//! <tr><th>Feature
+//!     <th>Description
+//! <tr><td>alloc (default)
+//!     <td>Enable features that require use of the heap, RSA in particular.
+//! <tr><td>less-safe-getrandom-custom-or-rdrand
+//!     <td>Treat user-provided ("custom") and RDRAND-based getrandom
+//!         implementations as secure random number generators (see
+//!         SecureRandom). This feature only works with os = "none" targets.
+//!         See register_custom_getrandom and RDRAND on x86 in the getrandom
+//!         documentation for additional details.
+//! <tr><td>less-safe-getrandom-espidf
+//!     <td>Treat getrandom as a secure random number generator (see
+//!         SecureRandom) on the esp-idf target. While the esp-idf target does
+//!         have hardware RNG, it is beyond the scope of ring to ensure its
+//!         configuration. This feature allows ring to build on esp-idf despite
+//!         the likelihood that RNG is not secure. This feature only works with
+//!         os = espidf targets.
+//! <tr><td>std
+//!     <td>Enable features that use libstd, in particular std::error::Error
+//!         integration. Implies `alloc`.
+//! <tr><td>wasm32_unknown_unknown_js
+//!     <td>When this feature is enabled, for the wasm32-unknown-unknown target,
+//!         Web APIs will be used to implement features like `ring::rand` that
+//!         require an operating environment of some kind. This has no effect
+//!         for any other target. This enables the `getrandom` crate's `js`
+//!         feature.
+//! </table>
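+//!
+//! (Note added in this vendored copy, not present in upstream ring: the sketch
+//! below illustrates how the less-safe-getrandom-custom-or-rdrand feature is
+//! typically wired up on an `os = "none"` target. It assumes the `getrandom`
+//! crate's `custom` feature; `fill_from_hw_rng` is a hypothetical platform
+//! entropy source, not an API of ring or getrandom.)
+//!
+//! ```ignore
+//! use getrandom::{register_custom_getrandom, Error};
+//!
+//! // Hypothetical platform-specific entropy source.
+//! fn fill_from_hw_rng(buf: &mut [u8]) -> Result<(), Error> {
+//!     // A real implementation must fill `buf` from a hardware RNG;
+//!     // this placeholder is NOT secure.
+//!     buf.fill(0);
+//!     Ok(())
+//! }
+//!
+//! // Once registered, and with less-safe-getrandom-custom-or-rdrand enabled,
+//! // `ring::rand::SystemRandom` treats this implementation as secure.
+//! register_custom_getrandom!(fill_from_hw_rng);
+//! ```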
+ +// When running mk/package.sh, don't actually build any code. +#![allow( + clippy::collapsible_if, + clippy::identity_op, + clippy::len_without_is_empty, + clippy::let_unit_value, + clippy::new_without_default, + clippy::neg_cmp_op_on_partial_ord, + clippy::too_many_arguments, + clippy::type_complexity, + non_camel_case_types, + non_snake_case, + unsafe_code +)] +#![deny(variant_size_differences)] +#![forbid( + unused_results, + unsafe_op_in_unsafe_fn, + clippy::char_lit_as_u8, + clippy::fn_to_numeric_cast, + clippy::fn_to_numeric_cast_with_truncation, + clippy::ptr_as_ptr +)] +#![warn( + clippy::unnecessary_cast, + clippy::cast_lossless, + clippy::cast_possible_truncation, + clippy::cast_possible_wrap, + clippy::cast_precision_loss, + clippy::cast_sign_loss +)] +#![cfg_attr( + not(any( + all(target_arch = "aarch64", target_endian = "little"), + all(target_arch = "arm", target_endian = "little"), + target_arch = "x86", + target_arch = "x86_64", + feature = "alloc" + )), + allow(dead_code, unused_imports, unused_macros) +)] +#![no_std] + +#[cfg(all(target_arch = "aarch64", not(target_os="optee")))] +extern crate libc; + +#[cfg(feature = "alloc")] +extern crate alloc; + +#[macro_use] +mod debug; + +#[macro_use] +mod prefixed; + +#[doc(hidden)] +#[macro_use] +mod testutil; + +#[macro_use] +mod bssl; + +#[macro_use] +mod polyfill; + +pub mod aead; + +pub mod agreement; +mod arithmetic; +mod bits; + +pub(crate) mod bb; +pub(crate) mod c; + +#[doc(hidden)] +#[deprecated( + note = "Will be removed. Internal module not intended for external use, with no promises regarding side channels." +)] +pub mod deprecated_constant_time; + +#[doc(hidden)] +#[allow(deprecated)] +#[deprecated( + note = "Will be removed. Internal module not intended for external use, with no promises regarding side channels." +)] +pub use deprecated_constant_time as constant_time; + +pub mod io; + +mod cpu; +pub mod digest; +mod ec; +pub mod error; +pub mod hkdf; +pub mod hmac; +mod limb; +pub mod pbkdf2; +pub mod pkcs8; +pub mod rand; + +#[cfg(feature = "alloc")] +pub mod rsa; + +pub mod signature; + +#[cfg(test)] +mod tests; + +mod sealed { + /// Traits that are designed to only be implemented internally in *ring*. + // + // Usage: + // ``` + // use crate::sealed; + // + // pub trait MyType: sealed::Sealed { + // // [...] + // } + // + // impl sealed::Sealed for MyType {} + // ``` + pub trait Sealed {} +} + +#[deprecated(note = "internal API that will be removed")] +pub mod deprecated_test; + +#[allow(deprecated)] +#[deprecated(note = "internal API that will be removed")] +pub use deprecated_test as test; diff --git a/ring-0.17.14/src/limb.rs b/ring-0.17.14/src/limb.rs new file mode 100644 index 0000000000..fef4bc058b --- /dev/null +++ b/ring-0.17.14/src/limb.rs @@ -0,0 +1,662 @@ +// Copyright 2016 David Judd. +// Copyright 2016 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +//! Unsigned multi-precision integer arithmetic. +//! +//! Limbs ordered least-significant-limb to most-significant-limb. The bits +//! limbs use the native endianness. + +use crate::{ + arithmetic::inout::{AliasingSlices2, AliasingSlices3}, + bb, c, + error::{self, LenMismatchError}, + polyfill::{sliceutil, usize_from_u32, ArrayFlatMap}, +}; +use core::{iter, num::NonZeroUsize}; + +#[cfg(any(test, feature = "alloc"))] +use crate::bits; + +#[cfg(feature = "alloc")] +use core::num::Wrapping; + +// XXX: Not correct for x32 ABIs. +pub type Limb = bb::Word; +pub type LeakyLimb = bb::LeakyWord; +pub const LIMB_BITS: usize = usize_from_u32(Limb::BITS); +pub const LIMB_BYTES: usize = (LIMB_BITS + 7) / 8; + +pub type LimbMask = bb::BoolMask; + +#[inline] +pub fn limbs_equal_limbs_consttime(a: &[Limb], b: &[Limb]) -> Result { + if a.len() != b.len() { + return Err(LenMismatchError::new(a.len())); + } + let all = a.iter().zip(b).fold(0, |running, (a, b)| running | (a ^ b)); + Ok(limb_is_zero(all)) +} + +#[inline] +fn limbs_less_than_limbs(a: &[Limb], b: &[Limb]) -> Result { + prefixed_extern! { + fn LIMBS_less_than(a: *const Limb, b: *const Limb, num_limbs: c::NonZero_size_t) + -> LimbMask; + } + // Use `b.len` because usually `b` will be the modulus which is likely to + // have had its length checked already so this can be elided by the + // optimizer. + // XXX: Questionable whether `LenMismatchError` is appropriate. + let len = NonZeroUsize::new(b.len()).ok_or_else(|| LenMismatchError::new(a.len()))?; + if a.len() != len.get() { + return Err(LenMismatchError::new(a.len())); + } + Ok(unsafe { LIMBS_less_than(a.as_ptr(), b.as_ptr(), len) }) +} + +#[inline] +pub(crate) fn verify_limbs_less_than_limbs_leak_bit( + a: &[Limb], + b: &[Limb], +) -> Result<(), error::Unspecified> { + let r = limbs_less_than_limbs(a, b).map_err(error::erase::)?; + if r.leak() { + Ok(()) + } else { + Err(error::Unspecified) + } +} + +#[inline] +pub fn limbs_less_than_limbs_vartime(a: &[Limb], b: &[Limb]) -> Result { + let r = limbs_less_than_limbs(a, b)?; + Ok(r.leak()) +} + +#[inline] +fn limb_is_zero(limb: Limb) -> LimbMask { + prefixed_extern! { + fn LIMB_is_zero(limb: Limb) -> LimbMask; + } + unsafe { LIMB_is_zero(limb) } +} + +#[inline] +pub fn limbs_are_zero(limbs: &[Limb]) -> LimbMask { + limb_is_zero(limbs.iter().fold(0, |a, b| a | b)) +} + +/// Leaks one bit of information (other than the lengths of the inputs): +/// Whether the given limbs are even. +#[cfg(any(test, feature = "alloc"))] +#[inline] +pub fn limbs_reject_even_leak_bit(limbs: &[Limb]) -> Result<(), error::Unspecified> { + let bottom = *limbs.first().ok_or(error::Unspecified)?; + if limb_is_zero(bottom & 1).leak() { + return Err(error::Unspecified); + } + Ok(()) +} + +#[cfg(any(test, feature = "alloc"))] +#[inline] +pub fn verify_limbs_equal_1_leak_bit(a: &[Limb]) -> Result<(), error::Unspecified> { + if let [bottom, ref rest @ ..] = *a { + let equal = limb_is_zero(bottom ^ 1) & limbs_are_zero(rest); + if equal.leak() { + return Ok(()); + } + } + Err(error::Unspecified) +} + +/// Returns the number of bits in `a`. 
+// +// This strives to be constant-time with respect to the values of all bits +// except the most significant bit. This does not attempt to be constant-time +// with respect to `a.len()` or the value of the result or the value of the +// most significant bit (It's 1, unless the input is zero, in which case it's +// zero.) +#[cfg(any(test, feature = "alloc"))] +pub fn limbs_minimal_bits(a: &[Limb]) -> bits::BitLength { + for num_limbs in (1..=a.len()).rev() { + let high_limb = a[num_limbs - 1]; + + // Find the number of set bits in |high_limb| by a linear scan from the + // most significant bit to the least significant bit. This works great + // for the most common inputs because usually the most significant bit + // it set. + for high_limb_num_bits in (1..=LIMB_BITS).rev() { + let shifted = unsafe { LIMB_shr(high_limb, high_limb_num_bits - 1) }; + if shifted != 0 { + return bits::BitLength::from_bits( + ((num_limbs - 1) * LIMB_BITS) + high_limb_num_bits, + ); + } + } + } + + // No bits were set. + bits::BitLength::from_bits(0) +} + +/// Equivalent to `if (r >= m) { r -= m; }` +#[inline] +pub fn limbs_reduce_once(r: &mut [Limb], m: &[Limb]) -> Result<(), LenMismatchError> { + prefixed_extern! { + fn LIMBS_reduce_once(r: *mut Limb, m: *const Limb, num_limbs: c::NonZero_size_t); + } + let num_limbs = NonZeroUsize::new(r.len()).ok_or_else(|| LenMismatchError::new(m.len()))?; + let r = r.as_mut_ptr(); // Non-dangling because num_limbs is non-zero. + let m = m.as_ptr(); // Non-dangling because num_limbs is non-zero. + unsafe { LIMBS_reduce_once(r, m, num_limbs) }; + Ok(()) +} + +#[derive(Clone, Copy, PartialEq)] +pub enum AllowZero { + No, + Yes, +} + +/// Parses `input` into `result`, verifies that the value is less than +/// `max_exclusive`, and pads `result` with zeros to its length. If `allow_zero` +/// is not `AllowZero::Yes`, zero values are rejected. +/// +/// This attempts to be constant-time with respect to the actual value *only if* +/// the value is actually in range. In other words, this won't leak anything +/// about a valid value, but it might leak small amounts of information about an +/// invalid value (which constraint it failed). +pub fn parse_big_endian_in_range_and_pad_consttime( + input: untrusted::Input, + allow_zero: AllowZero, + max_exclusive: &[Limb], + result: &mut [Limb], +) -> Result<(), error::Unspecified> { + parse_big_endian_and_pad_consttime(input, result)?; + verify_limbs_less_than_limbs_leak_bit(result, max_exclusive)?; + if allow_zero != AllowZero::Yes { + if limbs_are_zero(result).leak() { + return Err(error::Unspecified); + } + } + Ok(()) +} + +/// Parses `input` into `result`, padding `result` with zeros to its length. +/// This attempts to be constant-time with respect to the value but not with +/// respect to the length; it is assumed that the length is public knowledge. 
+pub fn parse_big_endian_and_pad_consttime( + input: untrusted::Input, + result: &mut [Limb], +) -> Result<(), error::Unspecified> { + if input.is_empty() { + return Err(error::Unspecified); + } + let input_limbs = input.as_slice_less_safe().rchunks(LIMB_BYTES).map(|chunk| { + let mut padded = [0; LIMB_BYTES]; + sliceutil::overwrite_at_start(&mut padded[(LIMB_BYTES - chunk.len())..], chunk); + Limb::from_be_bytes(padded) + }); + if input_limbs.len() > result.len() { + return Err(error::Unspecified); + } + + result + .iter_mut() + .zip(input_limbs.chain(iter::repeat(0))) + .for_each(|(r, i)| *r = i); + + Ok(()) +} + +pub fn big_endian_from_limbs(limbs: &[Limb], out: &mut [u8]) { + let be_bytes = unstripped_be_bytes(limbs); + assert_eq!(out.len(), be_bytes.len()); + out.iter_mut().zip(be_bytes).for_each(|(o, i)| { + *o = i; + }); +} + +/// Returns an iterator of the big-endian encoding of `limbs`. +/// +/// The number of bytes returned will be a multiple of `LIMB_BYTES` +/// and thus may be padded with leading zeros. +pub fn unstripped_be_bytes(limbs: &[Limb]) -> impl ExactSizeIterator + Clone + '_ { + // The unwrap is safe because a slice can never be larger than `usize` bytes. + ArrayFlatMap::new(limbs.iter().rev().copied(), Limb::to_be_bytes).unwrap() +} + +// Used in FFI +pub type Window = bb::Word; + +// Used in FFI +pub type LeakyWindow = bb::LeakyWord; + +/// Processes `limbs` as a sequence of 5-bit windows, folding the windows from +/// most significant to least significant and returning the accumulated result. +/// The first window will be mapped by `init` to produce the initial value for +/// the accumulator. Then `f` will be called to fold the accumulator and the +/// next window until all windows are processed. When the input's bit length +/// isn't divisible by 5, the window passed to `init` will be partial; all +/// windows passed to `fold` will be full. +/// +/// This is designed to avoid leaking the contents of `limbs` through side +/// channels as long as `init` and `fold` are side-channel free. +/// +/// Panics if `limbs` is empty. +#[cfg(feature = "alloc")] +pub fn fold_5_bit_windows R, F: Fn(R, Window) -> R>( + limbs: &[Limb], + init: I, + fold: F, +) -> R { + #[derive(Clone, Copy)] + #[repr(transparent)] + struct BitIndex(Wrapping); + + const WINDOW_BITS: Wrapping = Wrapping(5); + + prefixed_extern! 
{ + fn LIMBS_window5_split_window( + lower_limb: Limb, + higher_limb: Limb, + index_within_word: BitIndex, + ) -> Window; + fn LIMBS_window5_unsplit_window(limb: Limb, index_within_word: BitIndex) -> Window; + } + + let num_limbs = limbs.len(); + let mut window_low_bit = { + let num_whole_windows = (num_limbs * LIMB_BITS) / 5; + let mut leading_bits = (num_limbs * LIMB_BITS) - (num_whole_windows * 5); + if leading_bits == 0 { + leading_bits = WINDOW_BITS.0; + } + BitIndex(Wrapping(LIMB_BITS - leading_bits)) + }; + + let initial_value = { + let leading_partial_window = + unsafe { LIMBS_window5_split_window(*limbs.first().unwrap(), 0, window_low_bit) }; + window_low_bit.0 -= WINDOW_BITS; + init(leading_partial_window) + }; + + let mut low_limb = Limb::from(0 as LeakyWindow); + limbs.iter().fold(initial_value, |mut acc, current_limb| { + let higher_limb = low_limb; + low_limb = *current_limb; + + if window_low_bit.0 > Wrapping(LIMB_BITS) - WINDOW_BITS { + let window = + unsafe { LIMBS_window5_split_window(low_limb, higher_limb, window_low_bit) }; + window_low_bit.0 -= WINDOW_BITS; + acc = fold(acc, window); + }; + while window_low_bit.0 < Wrapping(LIMB_BITS) { + let window = unsafe { LIMBS_window5_unsplit_window(low_limb, window_low_bit) }; + // The loop exits when this subtraction underflows, causing `window_low_bit` to + // wrap around to a very large value. + window_low_bit.0 -= WINDOW_BITS; + acc = fold(acc, window); + } + window_low_bit.0 += Wrapping(LIMB_BITS); // "Fix" the underflow. + + acc + }) +} + +#[inline] +pub(crate) fn limbs_add_assign_mod( + a: &mut [Limb], + b: &[Limb], + m: &[Limb], +) -> Result<(), LenMismatchError> { + prefixed_extern! { + // `r` and `a` may alias. + fn LIMBS_add_mod( + r: *mut Limb, + a: *const Limb, + b: *const Limb, + m: *const Limb, + num_limbs: c::NonZero_size_t, + ); + } + let num_limbs = NonZeroUsize::new(m.len()).ok_or_else(|| LenMismatchError::new(m.len()))?; + (a, b).with_non_dangling_non_null_pointers_rab(num_limbs, |r, a, b| { + let m = m.as_ptr(); // Also non-dangling because `num_limbs` is non-zero. + unsafe { LIMBS_add_mod(r, a, b, m, num_limbs) } + }) +} + +// r *= 2 (mod m). +pub(crate) fn limbs_double_mod(r: &mut [Limb], m: &[Limb]) -> Result<(), LenMismatchError> { + prefixed_extern! { + // `r` and `a` may alias. + fn LIMBS_shl_mod( + r: *mut Limb, + a: *const Limb, + m: *const Limb, + num_limbs: c::NonZero_size_t); + } + let num_limbs = NonZeroUsize::new(m.len()).ok_or_else(|| LenMismatchError::new(m.len()))?; + r.with_non_dangling_non_null_pointers_ra(num_limbs, |r, a| { + let m = m.as_ptr(); // Also non-dangling because num_limbs > 0. + unsafe { + LIMBS_shl_mod(r, a, m, num_limbs); + } + }) +} + +// *r = -a, assuming a is odd. +pub(crate) fn limbs_negative_odd(r: &mut [Limb], a: &[Limb]) { + debug_assert_eq!(r.len(), a.len()); + // Two's complement step 1: flip all the bits. + // The compiler should optimize this to vectorized (a ^ !0). + r.iter_mut().zip(a.iter()).for_each(|(r, &a)| { + *r = !a; + }); + // Two's complement step 2: Add one. Since `a` is odd, `r` is even. Thus we + // can use a bitwise or for addition. + r[0] |= 1; +} + +#[cfg(any(test, feature = "alloc"))] +prefixed_extern! 
{ + fn LIMB_shr(a: Limb, shift: c::size_t) -> Limb; +} + +#[allow(clippy::useless_conversion)] +#[cfg(test)] +mod tests { + use super::*; + use alloc::vec::Vec; + use cfg_if::cfg_if; + + const MAX: LeakyLimb = LeakyLimb::MAX; + + fn leak_in_test(a: LimbMask) -> bool { + a.leak() + } + + #[test] + fn test_limbs_are_even() { + static EVENS: &[&[LeakyLimb]] = &[ + &[], + &[0], + &[2], + &[0, 0], + &[2, 0], + &[0, 1], + &[0, 2], + &[0, 3], + &[0, 0, 0, 0, MAX], + ]; + for even in EVENS { + let even = &Vec::from_iter(even.iter().copied().map(Limb::from)); + assert!(matches!( + limbs_reject_even_leak_bit(even), + Err(error::Unspecified) + )); + } + static ODDS: &[&[LeakyLimb]] = &[ + &[1], + &[3], + &[1, 0], + &[3, 0], + &[1, 1], + &[1, 2], + &[1, 3], + &[1, 0, 0, 0, MAX], + ]; + for odd in ODDS { + let odd = &Vec::from_iter(odd.iter().copied().map(Limb::from)); + assert!(matches!(limbs_reject_even_leak_bit(odd), Ok(()))); + } + } + + static ZEROES: &[&[LeakyLimb]] = &[ + &[], + &[0], + &[0, 0], + &[0, 0, 0], + &[0, 0, 0, 0], + &[0, 0, 0, 0, 0], + &[0, 0, 0, 0, 0, 0, 0], + &[0, 0, 0, 0, 0, 0, 0, 0], + &[0, 0, 0, 0, 0, 0, 0, 0, 0], + ]; + + static NONZEROES: &[&[LeakyLimb]] = &[ + &[1], + &[0, 1], + &[1, 1], + &[1, 0, 0, 0], + &[0, 1, 0, 0], + &[0, 0, 1, 0], + &[0, 0, 0, 1], + ]; + + #[test] + fn test_limbs_are_zero() { + for zero in ZEROES { + let zero = &Vec::from_iter(zero.iter().copied().map(Limb::from)); + assert!(leak_in_test(limbs_are_zero(zero))); + } + for nonzero in NONZEROES { + let nonzero = &Vec::from_iter(nonzero.iter().copied().map(Limb::from)); + assert!(!leak_in_test(limbs_are_zero(nonzero))); + } + } + + #[test] + fn test_limbs_equal_limb() { + // Equal + static EQUAL: &[&[LeakyLimb]] = &[&[1], &[1, 0], &[1, 0, 0], &[1, 0, 0, 0, 0, 0, 0]]; + for a in EQUAL { + let a = &Vec::from_iter(a.iter().copied().map(Limb::from)); + assert!(matches!(verify_limbs_equal_1_leak_bit(a), Ok(()))); + } + + // Unequal + static UNEQUAL: &[&[LeakyLimb]] = &[ + &[0], + &[2], + &[3], + &[MAX], + &[0, 1], + &[1, 1], + &[0, 0, 0, 0, 0, 0, 0, 1], + &[0, 0, 0, 0, 1, 0, 0, 0], + &[0, 0, 0, 0, 1, 0, 0, 1], + &[MAX, 1], + ]; + for a in UNEQUAL { + let a = &Vec::from_iter(a.iter().copied().map(Limb::from)); + assert!(matches!( + verify_limbs_equal_1_leak_bit(a), + Err(error::Unspecified) + )); + } + } + + #[test] + fn test_parse_big_endian_and_pad_consttime() { + const LIMBS: usize = 4; + + { + // Empty input. + let inp = untrusted::Input::from(&[]); + let mut result = [0; LIMBS].map(From::::from); + assert!(parse_big_endian_and_pad_consttime(inp, &mut result).is_err()); + } + + // The input is longer than will fit in the given number of limbs. + { + let inp = [1, 2, 3, 4, 5, 6, 7, 8, 9]; + let inp = untrusted::Input::from(&inp); + let mut result = [0; 8 / LIMB_BYTES].map(From::::from); + assert!(parse_big_endian_and_pad_consttime(inp, &mut result[..]).is_err()); + } + + // Less than a full limb. + { + let inp = [0xfe]; + let inp = untrusted::Input::from(&inp); + let mut result = [0; LIMBS].map(From::::from); + assert_eq!( + Ok(()), + parse_big_endian_and_pad_consttime(inp, &mut result[..]) + ); + assert_eq!(&[0xfe, 0, 0, 0], &result); + } + + // A whole limb for 32-bit, half a limb for 64-bit. + { + let inp = [0xbe, 0xef, 0xf0, 0x0d]; + let inp = untrusted::Input::from(&inp); + let mut result = [0; LIMBS].map(From::::from); + assert_eq!(Ok(()), parse_big_endian_and_pad_consttime(inp, &mut result)); + assert_eq!(&[0xbeeff00d, 0, 0, 0], &result); + } + + cfg_if! 
{ + if #[cfg(target_pointer_width = "64")] { + static TEST_CASES: &[(&[u8], &[Limb])] = &[ + (&[1], &[1, 0]), + (&[1, 2], &[0x102, 0]), + (&[1, 2, 3], &[0x10203, 0]), + (&[1, 2, 3, 4], &[0x102_0304, 0]), + (&[1, 2, 3, 4, 5], &[0x1_0203_0405, 0]), + (&[1, 2, 3, 4, 5, 6], &[0x102_0304_0506, 0]), + (&[1, 2, 3, 4, 5, 6, 7], &[0x1_0203_0405_0607, 0]), + (&[1, 2, 3, 4, 5, 6, 7, 8], &[0x102_0304_0506_0708, 0]), + (&[1, 2, 3, 4, 5, 6, 7, 8, 9], &[0x0203_0405_0607_0809, 0x1]), + (&[1, 2, 3, 4, 5, 6, 7, 8, 9, 0xa], &[0x0304_0506_0708_090a, 0x102]), + (&[1, 2, 3, 4, 5, 6, 7, 8, 9, 0xa, 0xb], &[0x0405_0607_0809_0a0b, 0x1_0203]), + ]; + for (be_bytes, limbs) in TEST_CASES { + let mut buf = [0; 2]; + parse_big_endian_and_pad_consttime(untrusted::Input::from(be_bytes), &mut buf) + .unwrap(); + assert_eq!(limbs, &buf, "({be_bytes:x?}, {limbs:x?}"); + } + } else if #[cfg(target_pointer_width = "32")] { + static TEST_CASES: &[(&[u8], &[Limb])] = &[ + (&[1], &[1, 0, 0]), + (&[1, 2], &[0x102, 0, 0]), + (&[1, 2, 3], &[0x10203, 0, 0]), + (&[1, 2, 3, 4], &[0x102_0304, 0, 0]), + (&[1, 2, 3, 4, 5], &[0x0203_0405, 0x1, 0]), + (&[1, 2, 3, 4, 5, 6], &[0x0304_0506, 0x102, 0]), + (&[1, 2, 3, 4, 5, 6, 7], &[0x0405_0607, 0x1_0203, 0]), + (&[1, 2, 3, 4, 5, 6, 7, 8], &[0x0506_0708, 0x102_0304, 0]), + (&[1, 2, 3, 4, 5, 6, 7, 8, 9], &[0x0607_0809, 0x0203_0405, 0x1]), + (&[1, 2, 3, 4, 5, 6, 7, 8, 9, 0xa], &[0x0708_090a, 0x0304_0506, 0x102]), + (&[1, 2, 3, 4, 5, 6, 7, 8, 9, 0xa, 0xb], &[0x0809_0a0b, 0x0405_0607, 0x1_0203]), + ]; + for (be_bytes, limbs) in TEST_CASES { + let mut buf = [0; 3]; + parse_big_endian_and_pad_consttime(untrusted::Input::from(be_bytes), &mut buf) + .unwrap(); + assert_eq!(limbs, &buf, "({be_bytes:x?}, {limbs:x?}"); + } + } else { + panic!("Unsupported target_pointer_width"); + } + + // XXX: This is a weak set of tests. TODO: expand it. + } + } + + #[test] + fn test_big_endian_from_limbs_same_length() { + #[cfg(target_pointer_width = "32")] + let limbs = [ + 0xbccddeef, 0x89900aab, 0x45566778, 0x01122334, 0xddeeff00, 0x99aabbcc, 0x55667788, + 0x11223344, + ]; + + #[cfg(target_pointer_width = "64")] + let limbs = [ + 0x8990_0aab_bccd_deef, + 0x0112_2334_4556_6778, + 0x99aa_bbcc_ddee_ff00, + 0x1122_3344_5566_7788, + ]; + + let limbs = limbs.map(From::::from); + + let expected = [ + 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, + 0xff, 0x00, 0x01, 0x12, 0x23, 0x34, 0x45, 0x56, 0x67, 0x78, 0x89, 0x90, 0x0a, 0xab, + 0xbc, 0xcd, 0xde, 0xef, + ]; + + let mut out = [0xabu8; 32]; + big_endian_from_limbs(&limbs[..], &mut out); + assert_eq!(&out[..], &expected[..]); + } + + #[should_panic] + #[test] + fn test_big_endian_from_limbs_fewer_limbs() { + #[cfg(target_pointer_width = "32")] + // Two fewer limbs. + let limbs = [ + 0xbccddeef, 0x89900aab, 0x45566778, 0x01122334, 0xddeeff00, 0x99aabbcc, + ]; + + // One fewer limb. 
+ #[cfg(target_pointer_width = "64")] + let limbs = [ + 0x8990_0aab_bccd_deef, + 0x0112_2334_4556_6778, + 0x99aa_bbcc_ddee_ff00, + ]; + + let limbs = limbs.map(From::::from); + + let mut out = [0xabu8; 32]; + + big_endian_from_limbs(&limbs[..], &mut out); + } + + #[test] + fn test_limbs_minimal_bits() { + const ALL_ONES: LeakyLimb = LeakyLimb::MAX; + static CASES: &[(&[LeakyLimb], usize)] = &[ + (&[], 0), + (&[0], 0), + (&[ALL_ONES], LIMB_BITS), + (&[ALL_ONES, 0], LIMB_BITS), + (&[ALL_ONES, 1], LIMB_BITS + 1), + (&[0, 0], 0), + (&[1, 0], 1), + (&[0, 1], LIMB_BITS + 1), + (&[0, ALL_ONES], 2 * LIMB_BITS), + (&[ALL_ONES, ALL_ONES], 2 * LIMB_BITS), + (&[ALL_ONES, ALL_ONES >> 1], 2 * LIMB_BITS - 1), + (&[ALL_ONES, 0b100_0000], LIMB_BITS + 7), + (&[ALL_ONES, 0b101_0000], LIMB_BITS + 7), + (&[ALL_ONES, ALL_ONES >> 1], LIMB_BITS + (LIMB_BITS) - 1), + ]; + for (limbs, bits) in CASES { + let limbs = &Vec::from_iter(limbs.iter().copied().map(Limb::from)); + assert_eq!(limbs_minimal_bits(limbs).as_bits(), *bits); + } + } +} diff --git a/ring-0.17.14/src/pbkdf2.rs b/ring-0.17.14/src/pbkdf2.rs new file mode 100644 index 0000000000..7baacdbdc9 --- /dev/null +++ b/ring-0.17.14/src/pbkdf2.rs @@ -0,0 +1,344 @@ +// Copyright 2015 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +//! PBKDF2 derivation and verification. +//! +//! Use `derive` to derive PBKDF2 outputs. Use `verify` to verify secret +//! against previously-derived outputs. +//! +//! PBKDF2 is specified in [RFC 2898 Section 5.2] with test vectors given in +//! [RFC 6070]. See also [NIST Special Publication 800-132]. +//! +//! [RFC 2898 Section 5.2]: https://tools.ietf.org/html/rfc2898#section-5.2 +//! [RFC 6070]: https://tools.ietf.org/html/rfc6070 +//! [NIST Special Publication 800-132]: +//! http://nvlpubs.nist.gov/nistpubs/Legacy/SP/nistspecialpublication800-132.pdf +//! +//! # Examples +//! +//! ## Password Database Example +//! +//! ``` +//! use ring::{digest, pbkdf2}; +//! use std::{collections::HashMap, num::NonZeroU32}; +//! +//! static PBKDF2_ALG: pbkdf2::Algorithm = pbkdf2::PBKDF2_HMAC_SHA256; +//! const CREDENTIAL_LEN: usize = digest::SHA256_OUTPUT_LEN; +//! pub type Credential = [u8; CREDENTIAL_LEN]; +//! +//! enum Error { +//! WrongUsernameOrPassword +//! } +//! +//! struct PasswordDatabase { +//! pbkdf2_iterations: NonZeroU32, +//! db_salt_component: [u8; 16], +//! +//! // Normally this would be a persistent database. +//! storage: HashMap, +//! } +//! +//! impl PasswordDatabase { +//! pub fn store_password(&mut self, username: &str, password: &str) { +//! let salt = self.salt(username); +//! let mut to_store: Credential = [0u8; CREDENTIAL_LEN]; +//! pbkdf2::derive(PBKDF2_ALG, self.pbkdf2_iterations, &salt, +//! password.as_bytes(), &mut to_store); +//! self.storage.insert(String::from(username), to_store); +//! } +//! +//! 
pub fn verify_password(&self, username: &str, attempted_password: &str) +//! -> Result<(), Error> { +//! match self.storage.get(username) { +//! Some(actual_password) => { +//! let salt = self.salt(username); +//! pbkdf2::verify(PBKDF2_ALG, self.pbkdf2_iterations, &salt, +//! attempted_password.as_bytes(), +//! actual_password) +//! .map_err(|_| Error::WrongUsernameOrPassword) +//! }, +//! +//! None => Err(Error::WrongUsernameOrPassword) +//! } +//! } +//! +//! // The salt should have a user-specific component so that an attacker +//! // cannot crack one password for multiple users in the database. It +//! // should have a database-unique component so that an attacker cannot +//! // crack the same user's password across databases in the unfortunate +//! // but common case that the user has used the same password for +//! // multiple systems. +//! fn salt(&self, username: &str) -> Vec { +//! let mut salt = Vec::with_capacity(self.db_salt_component.len() + +//! username.as_bytes().len()); +//! salt.extend(self.db_salt_component.as_ref()); +//! salt.extend(username.as_bytes()); +//! salt +//! } +//! } +//! +//! fn main() { +//! // Normally these parameters would be loaded from a configuration file. +//! let mut db = PasswordDatabase { +//! pbkdf2_iterations: NonZeroU32::new(100_000).unwrap(), +//! db_salt_component: [ +//! // This value was generated from a secure PRNG. +//! 0xd6, 0x26, 0x98, 0xda, 0xf4, 0xdc, 0x50, 0x52, +//! 0x24, 0xf2, 0x27, 0xd1, 0xfe, 0x39, 0x01, 0x8a +//! ], +//! storage: HashMap::new(), +//! }; +//! +//! db.store_password("alice", "@74d7]404j|W}6u"); +//! +//! // An attempt to log in with the wrong password fails. +//! assert!(db.verify_password("alice", "wrong password").is_err()); +//! +//! // Normally there should be an expoentially-increasing delay between +//! // attempts to further protect against online attacks. +//! +//! // An attempt to log in with the right password succeeds. +//! assert!(db.verify_password("alice", "@74d7]404j|W}6u").is_ok()); +//! } + +use self::{derive_error::DeriveError, verify_error::VerifyError}; +use crate::{ + bb, cpu, digest, + error::{self, TooMuchOutputRequestedError}, + hmac::{self, InputTooLongError}, +}; +use core::num::NonZeroU32; + +/// A PBKDF2 algorithm. +#[derive(Clone, Copy, PartialEq, Eq)] +pub struct Algorithm(hmac::Algorithm); + +/// PBKDF2 using HMAC-SHA1. +pub static PBKDF2_HMAC_SHA1: Algorithm = Algorithm(hmac::HMAC_SHA1_FOR_LEGACY_USE_ONLY); + +/// PBKDF2 using HMAC-SHA256. +pub static PBKDF2_HMAC_SHA256: Algorithm = Algorithm(hmac::HMAC_SHA256); + +/// PBKDF2 using HMAC-SHA384. +pub static PBKDF2_HMAC_SHA384: Algorithm = Algorithm(hmac::HMAC_SHA384); + +/// PBKDF2 using HMAC-SHA512. +pub static PBKDF2_HMAC_SHA512: Algorithm = Algorithm(hmac::HMAC_SHA512); + +/// Fills `out` with the key derived using PBKDF2 with the given inputs. +/// +/// Do not use `derive` as part of verifying a secret; use `verify` instead, to +/// minimize the effectiveness of timing attacks. +/// +/// `out.len()` must be no larger than the digest length * (2**32 - 1), per the +/// PBKDF2 specification. 
+/// +/// | Parameter | RFC 2898 Section 5.2 Term +/// |-------------|------------------------------------------- +/// | digest_alg | PRF (HMAC with the given digest algorithm) +/// | iterations | c (iteration count) +/// | salt | S (salt) +/// | secret | P (password) +/// | out | dk (derived key) +/// | out.len() | dkLen (derived key length) +/// +/// # Panics +/// +/// Panics if `out.len() > u32::MAX * digest_alg.output_len()`, where +/// `digest_alg` is the underlying HMAC/digest algorithm. +/// +/// Panics if `salt` is so astronomically gigantic that it isn't a valid input +/// to the underlying digest function. +/// +/// Panics if `secret` is so astronomically gigantic that it isn't a valid +/// input to the underlying digest function. +pub fn derive( + algorithm: Algorithm, + iterations: NonZeroU32, + salt: &[u8], + secret: &[u8], + out: &mut [u8], +) { + let cpu = cpu::features(); + try_derive(algorithm, iterations, salt, secret, out, cpu) + .map_err(error::erase::) + .unwrap() +} + +fn try_derive( + algorithm: Algorithm, + iterations: NonZeroU32, + salt: &[u8], + secret: &[u8], + out: &mut [u8], + cpu: cpu::Features, +) -> Result<(), DeriveError> { + let digest_alg = algorithm.0.digest_algorithm(); + let output_len = digest_alg.output_len(); + + // This implementation's performance is asymptotically optimal as described + // in https://jbp.io/2015/08/11/pbkdf2-performance-matters/. However, it + // hasn't been optimized to the same extent as fastpbkdf2. In particular, + // this implementation is probably doing a lot of unnecessary copying. + + let secret = + hmac::Key::try_new(algorithm.0, secret, cpu).map_err(DeriveError::secret_too_long)?; + + // Clear |out|. + out.fill(0); + + let mut idx: u32 = 0; + + let out_len = out.len(); + for chunk in out.chunks_mut(output_len) { + idx = idx.checked_add(1).ok_or_else(|| { + DeriveError::too_much_output_requested(TooMuchOutputRequestedError::new(out_len)) + })?; + // If the salt is too long, then we'll detect this on the first + // iteration before we've written any output. + derive_block(&secret, iterations, salt, idx, chunk, cpu) + .map_err(DeriveError::salt_too_long)?; + } + Ok(()) +} + +fn derive_block( + secret: &hmac::Key, + iterations: NonZeroU32, + salt: &[u8], + idx: u32, + out: &mut [u8], + cpu: cpu::Features, +) -> Result<(), InputTooLongError> { + let mut ctx = hmac::Context::with_key(secret); + ctx.update(salt); + ctx.update(&u32::to_be_bytes(idx)); + + let mut u = ctx.try_sign(cpu)?; + + let mut remaining: u32 = iterations.into(); + loop { + bb::xor_assign_at_start(&mut out[..], u.as_ref()); + + if remaining == 1 { + break; + } + remaining -= 1; + + // This will not fail, because the output of HMAC is never too long to + // be an input for the same algorithm, but we can't prove that with + // only locally-available information. + u = secret.sign(u.as_ref(), cpu)? + } + Ok(()) +} + +cold_exhaustive_error! { + enum derive_error::DeriveError { + secret_too_long => SecretTooLong(InputTooLongError), + salt_too_long => SaltTooLong(InputTooLongError), + too_much_output_requested => TooMuchOutputRequested(TooMuchOutputRequestedError), + } +} + +cold_exhaustive_error! 
{ + enum verify_error::VerifyError { + mismatch => Mismatch(()), + secret_too_long => SecretTooLong(InputTooLongError), + salt_too_long => SaltTooLong(InputTooLongError), + previously_derived_empty => PreviouslyDerivedEmpty(usize), + } +} + +/// Verifies that a previously-derived (e.g., using `derive`) PBKDF2 value +/// matches the PBKDF2 value derived from the other inputs. +/// +/// The comparison is done in constant time to prevent timing attacks. The +/// comparison will fail if `previously_derived` is empty (has a length of +/// zero). +/// +/// | Parameter | RFC 2898 Section 5.2 Term +/// |----------------------------|-------------------------------------------- +/// | digest_alg | PRF (HMAC with the given digest algorithm). +/// | `iterations` | c (iteration count) +/// | `salt` | S (salt) +/// | `secret` | P (password) +/// | `previously_derived` | dk (derived key) +/// | `previously_derived.len()` | dkLen (derived key length) +pub fn verify( + algorithm: Algorithm, + iterations: NonZeroU32, + salt: &[u8], + secret: &[u8], + previously_derived: &[u8], +) -> Result<(), error::Unspecified> { + let cpu = cpu::features(); + try_verify(algorithm, iterations, salt, secret, previously_derived, cpu) + .map_err(error::erase::) +} + +fn try_verify( + algorithm: Algorithm, + iterations: NonZeroU32, + salt: &[u8], + secret: &[u8], + previously_derived: &[u8], + cpu: cpu::Features, +) -> Result<(), VerifyError> { + let digest_alg = algorithm.0.digest_algorithm(); + + if previously_derived.is_empty() { + return Err(VerifyError::previously_derived_empty(0)); + } + + let mut derived_buf = [0u8; digest::MAX_OUTPUT_LEN]; + + let output_len = digest_alg.output_len(); + let secret = + hmac::Key::try_new(algorithm.0, secret, cpu).map_err(VerifyError::secret_too_long)?; + let mut idx: u32 = 0; + + let mut matches = 1; + + for previously_derived_chunk in previously_derived.chunks(output_len) { + idx = idx.checked_add(1).ok_or_else(|| { + // `previously_derived` is so gigantic that PBKDF2 couldn't + // have been used to compute it. + VerifyError::mismatch(()) + })?; + + let derived_chunk = &mut derived_buf[..previously_derived_chunk.len()]; + derived_chunk.fill(0); + + derive_block(&secret, iterations, salt, idx, derived_chunk, cpu) + .map_err(VerifyError::salt_too_long)?; + + // XXX: This isn't fully constant-time-safe. TODO: Fix that. + #[allow(clippy::bool_to_int_with_if)] + let current_block_matches = + if bb::verify_slices_are_equal(derived_chunk, previously_derived_chunk).is_ok() { + 1 + } else { + 0 + }; + + matches &= current_block_matches; + } + + if matches == 0 { + return Err(VerifyError::mismatch(())); + } + + Ok(()) +} diff --git a/ring-0.17.14/src/pkcs8.rs b/ring-0.17.14/src/pkcs8.rs new file mode 100644 index 0000000000..5ef66fd18e --- /dev/null +++ b/ring-0.17.14/src/pkcs8.rs @@ -0,0 +1,216 @@ +// Copyright 2017 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +//! PKCS#8 is specified in [RFC 5958]. +//! +//! [RFC 5958]: https://tools.ietf.org/html/rfc5958 + +use crate::{ec, error, io::der}; + +pub(crate) struct PublicKeyOptions { + /// Should the wrong public key ASN.1 tagging used by early implementations + /// of PKCS#8 v2 (including earlier versions of *ring*) be accepted? + pub accept_legacy_ed25519_public_key_tag: bool, +} + +pub(crate) enum Version { + V1Only, + V1OrV2(PublicKeyOptions), + V2Only(PublicKeyOptions), +} + +/// A template for constructing PKCS#8 documents. +/// +/// Note that this only works for ECC. +pub(crate) struct Template { + pub bytes: &'static [u8], + + // The range within `bytes` that holds the value (not including the tag and + // length) for use in the PKCS#8 document's privateKeyAlgorithm field. + pub alg_id_range: core::ops::Range, + + // `bytes[alg_id_range][curve_id_index..]` contains the OID identifying the, + // curve, including the tag and length. + pub curve_id_index: usize, + + // `bytes` will be split into two parts at `private_key_index`, where the + // first part is written before the private key and the second part is + // written after the private key. The public key is written after the second + // part. + pub private_key_index: usize, +} + +impl Template { + #[inline] + fn alg_id_value(&self) -> untrusted::Input { + untrusted::Input::from(self.alg_id_value_()) + } + + fn alg_id_value_(&self) -> &[u8] { + &self.bytes[self.alg_id_range.start..self.alg_id_range.end] + } + + #[inline] + pub fn curve_oid(&self) -> untrusted::Input { + untrusted::Input::from(&self.alg_id_value_()[self.curve_id_index..]) + } +} + +/// Parses an unencrypted PKCS#8 private key, verifies that it is the right type +/// of key, and returns the key value. +/// +/// PKCS#8 is specified in [RFC 5958]. +/// +/// [RFC 5958]: https://tools.ietf.org/html/rfc5958 +pub(crate) fn unwrap_key<'a>( + template: &Template, + version: Version, + input: untrusted::Input<'a>, +) -> Result<(untrusted::Input<'a>, Option>), error::KeyRejected> { + unwrap_key_(template.alg_id_value(), version, input) +} + +/// Parses an unencrypted PKCS#8 private key, verifies that it is the right type +/// of key, and returns the key value. +/// +/// `alg_id` must be the encoded value (not including the outermost `SEQUENCE` +/// tag and length) of the `AlgorithmIdentifier` that identifies the key type. +/// The result will be an encoded `RSAPrivateKey` or `ECPrivateKey` or similar. +/// +/// PKCS#8 is specified in [RFC 5958]. 
+/// +/// [RFC 5958]: https://tools.ietf.org/html/rfc5958 +pub(crate) fn unwrap_key_<'a>( + alg_id: untrusted::Input, + version: Version, + input: untrusted::Input<'a>, +) -> Result<(untrusted::Input<'a>, Option>), error::KeyRejected> { + input.read_all(error::KeyRejected::invalid_encoding(), |input| { + der::nested( + input, + der::Tag::Sequence, + error::KeyRejected::invalid_encoding(), + |input| unwrap_key__(alg_id, version, input), + ) + }) +} + +fn unwrap_key__<'a>( + alg_id: untrusted::Input, + version: Version, + input: &mut untrusted::Reader<'a>, +) -> Result<(untrusted::Input<'a>, Option>), error::KeyRejected> { + let actual_version = der::small_nonnegative_integer(input) + .map_err(|error::Unspecified| error::KeyRejected::invalid_encoding())?; + + // Do things in a specific order to return more useful errors: + // 1. Check for completely unsupported version. + // 2. Check for algorithm mismatch. + // 3. Check for algorithm-specific version mismatch. + + if actual_version > 1 { + return Err(error::KeyRejected::version_not_supported()); + }; + + let actual_alg_id = der::expect_tag_and_get_value(input, der::Tag::Sequence) + .map_err(|error::Unspecified| error::KeyRejected::invalid_encoding())?; + if actual_alg_id.as_slice_less_safe() != alg_id.as_slice_less_safe() { + return Err(error::KeyRejected::wrong_algorithm()); + } + + let public_key_options = match (actual_version, version) { + (0, Version::V1Only) => None, + (0, Version::V1OrV2(_)) => None, + (1, Version::V1OrV2(options)) | (1, Version::V2Only(options)) => Some(options), + _ => { + return Err(error::KeyRejected::version_not_supported()); + } + }; + + let private_key = der::expect_tag_and_get_value(input, der::Tag::OctetString) + .map_err(|error::Unspecified| error::KeyRejected::invalid_encoding())?; + + // Ignore any attributes that are present. + if input.peek(der::Tag::ContextSpecificConstructed0.into()) { + let _ = der::expect_tag_and_get_value(input, der::Tag::ContextSpecificConstructed0) + .map_err(|error::Unspecified| error::KeyRejected::invalid_encoding())?; + } + + let public_key = if let Some(options) = public_key_options { + if input.at_end() { + return Err(error::KeyRejected::public_key_is_missing()); + } + + const INCORRECT_LEGACY: der::Tag = der::Tag::ContextSpecificConstructed1; + let result = if options.accept_legacy_ed25519_public_key_tag + && input.peek(INCORRECT_LEGACY.into()) + { + der::nested( + input, + INCORRECT_LEGACY, + error::Unspecified, + der::bit_string_with_no_unused_bits, + ) + } else { + der::bit_string_tagged_with_no_unused_bits(der::Tag::ContextSpecific1, input) + }; + let public_key = + result.map_err(|error::Unspecified| error::KeyRejected::invalid_encoding())?; + Some(public_key) + } else { + None + }; + + Ok((private_key, public_key)) +} + +/// A generated PKCS#8 document. +pub struct Document { + bytes: [u8; ec::PKCS8_DOCUMENT_MAX_LEN], + len: usize, +} + +impl AsRef<[u8]> for Document { + #[inline] + fn as_ref(&self) -> &[u8] { + &self.bytes[..self.len] + } +} + +pub(crate) fn wrap_key(template: &Template, private_key: &[u8], public_key: &[u8]) -> Document { + let mut result = Document { + bytes: [0; ec::PKCS8_DOCUMENT_MAX_LEN], + len: template.bytes.len() + private_key.len() + public_key.len(), + }; + wrap_key_( + template, + private_key, + public_key, + &mut result.bytes[..result.len], + ); + result +} + +/// Formats a private key "prefix||private_key||middle||public_key" where +/// `template` is "prefix||middle" split at position `private_key_index`. 
+fn wrap_key_(template: &Template, private_key: &[u8], public_key: &[u8], bytes: &mut [u8]) { + let (before_private_key, after_private_key) = + template.bytes.split_at(template.private_key_index); + let private_key_end_index = template.private_key_index + private_key.len(); + bytes[..template.private_key_index].copy_from_slice(before_private_key); + bytes[template.private_key_index..private_key_end_index].copy_from_slice(private_key); + bytes[private_key_end_index..(private_key_end_index + after_private_key.len())] + .copy_from_slice(after_private_key); + bytes[(private_key_end_index + after_private_key.len())..].copy_from_slice(public_key); +} diff --git a/ring-0.17.14/src/polyfill.rs b/ring-0.17.14/src/polyfill.rs new file mode 100644 index 0000000000..cf513208aa --- /dev/null +++ b/ring-0.17.14/src/polyfill.rs @@ -0,0 +1,107 @@ +// Copyright 2015-2016 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +//! Polyfills for functionality that will (hopefully) be added to Rust's +//! standard library soon. + +#[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))] +#[inline(always)] +pub const fn u64_from_usize(x: usize) -> u64 { + x as u64 +} + +#[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))] +pub const fn usize_from_u32(x: u32) -> usize { + x as usize +} + +#[cfg(all( + target_arch = "aarch64", + target_endian = "little", + target_pointer_width = "64" +))] +#[allow(clippy::cast_possible_truncation)] +pub fn usize_from_u64(x: u64) -> usize { + x as usize +} + +/// const-capable `x.try_into().unwrap_or(usize::MAX)` +#[allow(clippy::cast_possible_truncation)] +#[inline(always)] +pub const fn usize_from_u64_saturated(x: u64) -> usize { + const USIZE_MAX: u64 = u64_from_usize(usize::MAX); + if x < USIZE_MAX { + x as usize + } else { + usize::MAX + } +} + +#[macro_use] +mod cold_error; + +mod array_flat_map; +mod array_split_map; + +pub mod cstr; + +pub mod sliceutil; + +#[cfg(feature = "alloc")] +mod leading_zeros_skipped; + +#[cfg(any( + all(target_arch = "aarch64", target_endian = "little"), + all(target_arch = "arm", target_endian = "little"), + target_arch = "x86", + target_arch = "x86_64" +))] +pub mod once_cell { + pub mod race; +} + +mod notsend; +pub mod ptr; + +pub mod slice; + +#[cfg(test)] +mod test; + +mod unwrap_const; + +pub use self::{ + array_flat_map::ArrayFlatMap, array_split_map::ArraySplitMap, notsend::NotSend, + unwrap_const::unwrap_const, +}; + +#[cfg(feature = "alloc")] +pub use leading_zeros_skipped::LeadingZerosStripped; + +#[cfg(test)] +mod tests { + use super::*; + #[test] + fn test_usize_from_u64_saturated() { + const USIZE_MAX: u64 = u64_from_usize(usize::MAX); + assert_eq!(usize_from_u64_saturated(u64::MIN), usize::MIN); + assert_eq!(usize_from_u64_saturated(USIZE_MAX), usize::MAX); + assert_eq!(usize_from_u64_saturated(USIZE_MAX - 1), 
usize::MAX - 1); + + #[cfg(not(target_pointer_width = "64"))] + { + assert_eq!(usize_from_u64_saturated(USIZE_MAX + 1), usize::MAX); + } + } +} diff --git a/ring-0.17.14/src/polyfill/array_flat_map.rs b/ring-0.17.14/src/polyfill/array_flat_map.rs new file mode 100644 index 0000000000..8b3abbefe1 --- /dev/null +++ b/ring-0.17.14/src/polyfill/array_flat_map.rs @@ -0,0 +1,127 @@ +use core::iter::FlatMap; + +/// A specialized version of `core::iter::FlatMap` for mapping over exact-sized +/// iterators with a function that returns an array. +/// +/// `ArrayFlatMap` differs from `FlatMap` in that `ArrayFlatMap` implements +/// `ExactSizeIterator`. Since the result of `F` always has `LEN` elements, if +/// `I` is an exact-sized iterator of length `inner_len` then we know the +/// length of the flat-mapped result is `inner_len * LEN`. (The constructor +/// verifies that this multiplication doesn't overflow `usize`.) +#[derive(Clone)] +pub struct ArrayFlatMap { + inner: FlatMap, + remaining: usize, +} + +impl ArrayFlatMap +where + I: ExactSizeIterator, + F: FnMut(I::Item) -> [Item; LEN], +{ + /// Constructs an `ArrayFlatMap` wrapping the given iterator, using the + /// given function + pub fn new(inner: I, f: F) -> Option { + let remaining = inner.len().checked_mul(LEN)?; + let inner = inner.flat_map(f); + Some(Self { inner, remaining }) + } +} + +impl Iterator for ArrayFlatMap +where + I: Iterator, + F: FnMut(I::Item) -> [Item; LEN], +{ + type Item = Item; + + fn next(&mut self) -> Option { + let result = self.inner.next(); + if result.is_some() { + self.remaining -= 1; + } + result + } + + /// Required for implementing `ExactSizeIterator`. + fn size_hint(&self) -> (usize, Option) { + (self.remaining, Some(self.remaining)) + } +} + +impl ExactSizeIterator for ArrayFlatMap +where + I: Iterator, + F: FnMut(I::Item) -> [Item; LEN], +{ +} + +#[cfg(test)] +mod tests { + use super::*; + use core::mem::size_of; + + #[test] + fn test_array_flat_map() { + static TEST_CASES: &[(&[u16], fn(u16) -> [u8; 2], &[u8])] = &[ + // Empty input + (&[], u16::to_be_bytes, &[]), + // Non-empty input. + ( + &[0x0102, 0x0304, 0x0506], + u16::to_be_bytes, + &[1, 2, 3, 4, 5, 6], + ), + // Test with a different mapping function. + ( + &[0x0102, 0x0304, 0x0506], + u16::to_le_bytes, + &[2, 1, 4, 3, 6, 5], + ), + ]; + TEST_CASES.iter().copied().for_each(|(input, f, expected)| { + let mapped = ArrayFlatMap::new(input.iter().copied(), f).unwrap(); + super::super::test::assert_iterator(mapped, expected); + }); + } + + // Does ArrayFlatMap::new() handle overflow correctly? 
+ #[test] + fn test_array_flat_map_len_overflow() { + struct DownwardCounter { + remaining: usize, + } + impl Iterator for DownwardCounter { + type Item = usize; + + fn next(&mut self) -> Option { + if self.remaining > 0 { + let result = self.remaining; + self.remaining -= 1; + Some(result) + } else { + None + } + } + + fn size_hint(&self) -> (usize, Option) { + (self.remaining, Some(self.remaining)) + } + } + impl ExactSizeIterator for DownwardCounter {} + + const MAX: usize = usize::MAX / size_of::(); + + static TEST_CASES: &[(usize, bool)] = &[(MAX, true), (MAX + 1, false)]; + TEST_CASES.iter().copied().for_each(|(input_len, is_some)| { + let inner = DownwardCounter { + remaining: input_len, + }; + let mapped = ArrayFlatMap::new(inner, usize::to_be_bytes); + assert_eq!(mapped.is_some(), is_some); + if let Some(mapped) = mapped { + assert_eq!(mapped.len(), input_len * size_of::()); + } + }); + } +} diff --git a/ring-0.17.14/src/polyfill/array_split_map.rs b/ring-0.17.14/src/polyfill/array_split_map.rs new file mode 100644 index 0000000000..c754330d23 --- /dev/null +++ b/ring-0.17.14/src/polyfill/array_split_map.rs @@ -0,0 +1,71 @@ +// Copyright 2023 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
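+
+// (Illustrative note added in this vendored copy, not part of upstream ring:
+// a minimal sketch of how the `ArraySplitMap` trait defined below is used,
+// e.g. turning a fixed-size big-endian byte array into words. The module name
+// and test are illustrative only.)
+#[cfg(test)]
+mod vendored_usage_sketch {
+    use super::ArraySplitMap;
+
+    #[test]
+    fn split_twelve_bytes_into_three_words() {
+        let bytes: [u8; 12] = [0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 3];
+        // Each group of four big-endian bytes becomes one u32.
+        let words: [u32; 3] = bytes.array_split_map(u32::from_be_bytes);
+        assert_eq!(words, [1, 2, 3]);
+    }
+}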
+ +pub trait ArraySplitMap { + fn array_split_map(self, f: impl Fn([I; CN]) -> O) -> [O; ON]; +} + +impl ArraySplitMap for [I; 12] { + #[inline] + fn array_split_map(self, f: impl Fn([I; 4]) -> O) -> [O; 3] { + let [a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3] = self; + [ + f([a0, a1, a2, a3]), + f([b0, b1, b2, b3]), + f([c0, c1, c2, c3]), + ] + } +} + +impl ArraySplitMap for [I; 16] { + #[inline] + fn array_split_map(self, f: impl Fn([I; 4]) -> O) -> [O; 4] { + let [a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3] = self; + [ + f([a0, a1, a2, a3]), + f([b0, b1, b2, b3]), + f([c0, c1, c2, c3]), + f([d0, d1, d2, d3]), + ] + } +} + +impl ArraySplitMap for [I; 32] { + #[inline] + fn array_split_map(self, f: impl Fn([I; 4]) -> O) -> [O; 8] { + let [a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3, e0, e1, e2, e3, f0, f1, f2, f3, g0, g1, g2, g3, h0, h1, h2, h3] = + self; + [ + f([a0, a1, a2, a3]), + f([b0, b1, b2, b3]), + f([c0, c1, c2, c3]), + f([d0, d1, d2, d3]), + f([e0, e1, e2, e3]), + f([f0, f1, f2, f3]), + f([g0, g1, g2, g3]), + f([h0, h1, h2, h3]), + ] + } +} + +impl ArraySplitMap for [I; 16] { + #[inline] + fn array_split_map(self, f: impl Fn([I; 8]) -> O) -> [O; 2] { + let [a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7] = self; + [ + f([a0, a1, a2, a3, a4, a5, a6, a7]), + f([b0, b1, b2, b3, b4, b5, b6, b7]), + ] + } +} diff --git a/ring-0.17.14/src/polyfill/cold_error.rs b/ring-0.17.14/src/polyfill/cold_error.rs new file mode 100644 index 0000000000..13fc9f6baa --- /dev/null +++ b/ring-0.17.14/src/polyfill/cold_error.rs @@ -0,0 +1,101 @@ +// Copyright 2024 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +/// Reduces boilerplate for defining error types where we want the compiler to +/// optimize for the non-error path by assuming constructing an error is +/// unlikely/cold code. +/// +/// WARNING: Every struct/variant must contain some *non-constant* value so +/// that the "invariant code" pass of the compiler doesn't recognize the +/// constructor as being "invariant code" and optimizing it away; +/// although such optimization would be nice to take advantage of, it +/// seems to lose the `#[cold]` attribute. +/// +/// Constructor functions ar marked `pub(super)` to ensure that instances can +/// only be constructed from within the enclosing module (and its submodules). +/// +/// XXX: #[inline(never)] is required to avoid the (MIR?) optimizer inlining +/// away the function call and losing the `#[cold]` attribute in the process. +/// We'd otherwise maybe prefer all constructors to be inline. +/// +/// The type is defined in its own submodule `#mod_name` to hide the +/// variant/struct constructor, ensuring instances are only constructed +/// through the generated `$constructor` functions. 
The constructor methods +/// work around the lack of the ability to mark an enum variant `#[cold]` and +/// `#[inline(never)]`. +macro_rules! cold_exhaustive_error { + // struct + { + struct $mod_name:ident::$Error:ident with $vis:vis constructor { + $field:ident: $ValueType:ty + } + } => { + mod $mod_name { + #[allow(unused_imports)] + use super::*; // So `$ValueType` is in scope. + + pub struct $Error { #[allow(dead_code)] $field: $ValueType } + + impl $Error { + #[cold] + #[inline(never)] + $vis fn new($field: $ValueType) -> Self { + Self { $field } + } + } + } + }; + // struct with default constructor visibility. + { + struct $mod_name:ident::$Error:ident { + $field:ident: $ValueType:ty + } + } => { + cold_exhaustive_error! { + struct $mod_name::$Error with pub(super) constructor { + $field: $ValueType + } + } + }; + + // enum + { + enum $mod_name:ident::$Error:ident { + $( + $constructor:ident => $Variant:ident($ValueType:ty), + )+ + } + } => { + mod $mod_name { + #[allow(unused_imports)] + use super::*; // So `$ValueType` is in scope. + + pub enum $Error { + $( + $Variant(#[allow(dead_code)] $ValueType) + ),+ + } + + impl $Error { + $( + #[cold] + #[inline(never)] + pub(super) fn $constructor(value: $ValueType) -> Self { + Self::$Variant(value) + } + )+ + } + } + }; +} diff --git a/ring-0.17.14/src/polyfill/cstr.rs b/ring-0.17.14/src/polyfill/cstr.rs new file mode 100644 index 0000000000..10739f9b2a --- /dev/null +++ b/ring-0.17.14/src/polyfill/cstr.rs @@ -0,0 +1,107 @@ +// Copyright 2024 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +#![cfg(all( + target_vendor = "apple", + any( + target_os = "ios", + target_os = "macos", + target_os = "tvos", + target_os = "visionos", + target_os = "watchos" + ) +))] + +//! Work around lack of `core::ffi::CStr` prior to Rust 1.64, and the lack of +//! `const fn` support for `CStr` in later versions. + +#![cfg(all( + all(target_arch = "aarch64", target_endian = "little"), + target_vendor = "apple" +))] + +use core::mem::{align_of, size_of}; + +// TODO(MSRV 1.64): Use `core::ffi::c_char`. +use libc::c_char; + +// TODO(MSRV 1.64): Replace with `&core::ffi::CStr`. +pub struct Ref(&'static [u8]); + +impl Ref { + #[inline(always)] + pub fn as_ptr(&self) -> *const c_char { + const _SAME_ALIGNMENT: () = assert!(align_of::() == align_of::()); + const _SAME_SIZE: () = assert!(size_of::() == size_of::()); + + // It is safe to cast a `*const u8` to a `const c_char` as they are the + // same size and alignment. + self.0.as_ptr().cast() + } + + // SAFETY: Same as `CStr::from_bytes_with_nul_unchecked`. + const unsafe fn from_bytes_with_nul_unchecked(value: &'static [u8]) -> Self { + Self(value) + } +} + +pub const fn unwrap_const_from_bytes_with_nul(value: &'static [u8]) -> Ref { + // XXX: We cannot use `unwrap_const` since `Ref`/`CStr` is not `Copy`. 
+ match const_from_bytes_with_nul(value) { + Some(r) => r, + None => panic!("const_from_bytes_with_nul failed"), + } +} + +// TODO(MSRV 1.72): Replace with `CStr::from_bytes_with_nul`. +#[inline(always)] +const fn const_from_bytes_with_nul(value: &'static [u8]) -> Option { + const fn const_contains(mut value: &[u8], needle: &u8) -> bool { + while let [head, tail @ ..] = value { + if *head == *needle { + return true; + } + value = tail; + } + false + } + + // TODO(MSRV 1.69): Use `core::ffi::CStr::from_bytes_until_nul` + match value { + [before_nul @ .., 0] if !const_contains(before_nul, &0) => { + // SAFETY: + // * `value` is nul-terminated according to the slice pattern. + // * `value` doesn't contain any interior null, by the guard. + // TODO(MSRV 1.64): Use `CStr::from_bytes_with_nul_unchecked` + Some(unsafe { Ref::from_bytes_with_nul_unchecked(value) }) + } + _ => None, + } +} + +mod tests { + use super::const_from_bytes_with_nul; + + // Bad. + const _EMPTY_UNTERMINATED: () = assert!(const_from_bytes_with_nul(b"").is_none()); + const _EMPTY_DOUBLE_TERMINATED: () = assert!(const_from_bytes_with_nul(b"\0\0").is_none()); + const _DOUBLE_NUL: () = assert!(const_from_bytes_with_nul(b"\0\0").is_none()); + const _LEADINGL_NUL: () = assert!(const_from_bytes_with_nul(b"\0a\0").is_none()); + const _INTERNAL_NUL_UNTERMINATED: () = assert!(const_from_bytes_with_nul(b"\0a").is_none()); + + // Good. + const _EMPTY_TERMINATED: () = assert!(const_from_bytes_with_nul(b"\0").is_some()); + const _NONEMPTY: () = assert!(const_from_bytes_with_nul(b"asdf\0").is_some()); + const _1_CHAR: () = assert!(const_from_bytes_with_nul(b"a\0").is_some()); +} diff --git a/ring-0.17.14/src/polyfill/leading_zeros_skipped.rs b/ring-0.17.14/src/polyfill/leading_zeros_skipped.rs new file mode 100644 index 0000000000..36053f31f0 --- /dev/null +++ b/ring-0.17.14/src/polyfill/leading_zeros_skipped.rs @@ -0,0 +1,78 @@ +use core::iter::Peekable; + +/// An iterator that skips all leading zeros. +/// +/// When the wrapped iterator is all zeros, then the last item is retained. +pub struct LeadingZerosStripped +where + I: Iterator, +{ + inner: Peekable, +} + +impl Clone for LeadingZerosStripped +where + I: Iterator, + Peekable: Clone, +{ + fn clone(&self) -> Self { + Self { + inner: self.inner.clone(), + } + } +} + +impl LeadingZerosStripped +where + I: ExactSizeIterator, +{ + pub fn new(inner: I) -> Self { + let mut len = inner.len(); + let mut inner = inner.peekable(); + // Strip all leading zeroes, but don't strip the last byte if all bytes + // were zero. 
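        // For illustration (hypothetical inputs): wrapping [0, 0, 7, 1] yields
        // the sequence 7, 1, while the all-zero input [0, 0] keeps a single
        // trailing 0.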
+ while len > 1 && inner.next_if_eq(&0).is_some() { + len -= 1; + } + Self { inner } + } +} + +impl Iterator for LeadingZerosStripped +where + I: Iterator, +{ + type Item = I::Item; + + fn next(&mut self) -> Option { + self.inner.next() + } + + fn size_hint(&self) -> (usize, Option) { + self.inner.size_hint() + } +} + +impl ExactSizeIterator for LeadingZerosStripped where I: ExactSizeIterator {} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_leading_zeroes_stripped() { + static TEST_CASES: &[(&[u8], &[u8])] = &[ + (&[], &[]), + (&[0], &[0]), + (&[0, 1], &[1]), + (&[0, 0, 1], &[1]), + (&[0, 0, 0, 1], &[1]), + (&[1, 0], &[1, 0]), + (&[0, 1, 0], &[1, 0]), + ]; + TEST_CASES.iter().copied().for_each(|(input, expected)| { + let stripped = LeadingZerosStripped::new(input.iter().copied()); + super::super::test::assert_iterator(stripped, expected); + }); + } +} diff --git a/ring-0.17.14/src/polyfill/notsend.rs b/ring-0.17.14/src/polyfill/notsend.rs new file mode 100644 index 0000000000..9c14006f57 --- /dev/null +++ b/ring-0.17.14/src/polyfill/notsend.rs @@ -0,0 +1,30 @@ +// Copyright 2024 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +use crate::testutil; +use core::{marker::PhantomData, mem::size_of}; + +/// A ZST that can be added to any type to make the type `!Send`. +#[derive(Clone, Copy)] +pub struct NotSend(PhantomData<*mut ()>); + +impl NotSend { + pub const VALUE: Self = Self(PhantomData); +} + +#[allow(deprecated)] +const _: () = testutil::compile_time_assert_clone::(); +#[allow(deprecated)] +const _: () = testutil::compile_time_assert_copy::(); +const _: () = assert!(size_of::() == 0); diff --git a/ring-0.17.14/src/polyfill/once_cell/LICENSE-APACHE b/ring-0.17.14/src/polyfill/once_cell/LICENSE-APACHE new file mode 100644 index 0000000000..16fe87b06e --- /dev/null +++ b/ring-0.17.14/src/polyfill/once_cell/LICENSE-APACHE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. 
+ + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + +2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + +3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + +4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + +5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + +6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + +8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + +END OF TERMS AND CONDITIONS + +APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + +Copyright [yyyy] [name of copyright owner] + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
diff --git a/ring-0.17.14/src/polyfill/once_cell/LICENSE-MIT b/ring-0.17.14/src/polyfill/once_cell/LICENSE-MIT new file mode 100644 index 0000000000..51feadc5aa --- /dev/null +++ b/ring-0.17.14/src/polyfill/once_cell/LICENSE-MIT @@ -0,0 +1,23 @@ +Permission is hereby granted, free of charge, to any +person obtaining a copy of this software and associated +documentation files (the "Software"), to deal in the +Software without restriction, including without +limitation the rights to use, copy, modify, merge, +publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software +is furnished to do so, subject to the following +conditions: + +The above copyright notice and this permission notice +shall be included in all copies or substantial portions +of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF +ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED +TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A +PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT +SHALL THE AUTHOR OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. diff --git a/ring-0.17.14/src/polyfill/once_cell/race.rs b/ring-0.17.14/src/polyfill/once_cell/race.rs new file mode 100644 index 0000000000..6fc4056913 --- /dev/null +++ b/ring-0.17.14/src/polyfill/once_cell/race.rs @@ -0,0 +1,77 @@ +//! Thread-safe, non-blocking, "first one wins" flavor of `OnceCell`. +//! +//! If two threads race to initialize a type from the `race` module, they +//! don't block, execute initialization function together, but only one of +//! them stores the result. +//! +//! This module does not require `std` feature. +//! +//! # Atomic orderings +//! +//! All types in this module use `Acquire` and `Release` +//! [atomic orderings](Ordering) for all their operations. While this is not +//! strictly necessary for types other than `OnceBox`, it is useful for users as +//! it allows them to be certain that after `get` or `get_or_init` returns on +//! one thread, any side-effects caused by the setter thread prior to them +//! calling `set` or `get_or_init` will be made visible to that thread; without +//! it, it's possible for it to appear as if they haven't happened yet from the +//! getter thread's perspective. This is an acceptable tradeoff to make since +//! `Acquire` and `Release` have very little performance overhead on most +//! architectures versus `Relaxed`. + +use core::sync::atomic; + +use atomic::{AtomicUsize, Ordering}; +use core::num::NonZeroUsize; + +/// A thread-safe cell which can be written to only once. +pub struct OnceNonZeroUsize { + inner: AtomicUsize, +} + +impl OnceNonZeroUsize { + /// Creates a new empty cell. + #[inline] + pub const fn new() -> OnceNonZeroUsize { + OnceNonZeroUsize { + inner: AtomicUsize::new(0), + } + } + + /// Gets the underlying value. + #[inline] + pub fn get(&self) -> Option { + let val = self.inner.load(Ordering::Acquire); + NonZeroUsize::new(val) + } + + /// Gets the contents of the cell, initializing it with `f` if the cell was + /// empty. + /// + /// If several threads concurrently run `get_or_init`, more than one `f` can + /// be called. However, all threads will return the same value, produced by + /// some `f`. 
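    ///
    /// For illustration (hypothetical usage):
    ///
    /// ```ignore
    /// static CACHED: OnceNonZeroUsize = OnceNonZeroUsize::new();
    /// // The closure may run on several racing threads, but every caller
    /// // observes the same stored value.
    /// let n = CACHED.get_or_init(|| core::num::NonZeroUsize::new(4096).unwrap());
    /// ```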
+ pub fn get_or_init(&self, f: F) -> NonZeroUsize + where + F: FnOnce() -> NonZeroUsize, + { + let val = self.inner.load(Ordering::Acquire); + match NonZeroUsize::new(val) { + Some(it) => it, + None => self.init(f), + } + } + + #[cold] + #[inline(never)] + fn init(&self, f: impl FnOnce() -> NonZeroUsize) -> NonZeroUsize { + let mut val = f().get(); + let exchange = self + .inner + .compare_exchange(0, val, Ordering::AcqRel, Ordering::Acquire); + if let Err(old) = exchange { + val = old; + } + unsafe { NonZeroUsize::new_unchecked(val) } + } +} diff --git a/ring-0.17.14/src/polyfill/ptr.rs b/ring-0.17.14/src/polyfill/ptr.rs new file mode 100644 index 0000000000..58df760c99 --- /dev/null +++ b/ring-0.17.14/src/polyfill/ptr.rs @@ -0,0 +1,27 @@ +// Copyright 2024 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +// TODO(MSRV 1.76): Replace with `core::ptr::from_mut`. +#[allow(dead_code)] +#[inline(always)] +pub fn from_mut(r: &mut T) -> *mut T { + r +} + +// TODO(MSRV 1.76): Replace with `core::ptr::from_ref`. +#[allow(dead_code)] +#[inline(always)] +pub const fn from_ref(r: &T) -> *const T { + r +} diff --git a/ring-0.17.14/src/polyfill/slice.rs b/ring-0.17.14/src/polyfill/slice.rs new file mode 100644 index 0000000000..8299cc62d5 --- /dev/null +++ b/ring-0.17.14/src/polyfill/slice.rs @@ -0,0 +1,57 @@ +// Permission is hereby granted, free of charge, to any +// person obtaining a copy of this software and associated +// documentation files (the "Software"), to deal in the +// Software without restriction, including without +// limitation the rights to use, copy, modify, merge, +// publish, distribute, sublicense, and/or sell copies of +// the Software, and to permit persons to whom the Software +// is furnished to do so, subject to the following +// conditions: +// +// The above copyright notice and this permission notice +// shall be included in all copies or substantial portions +// of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF +// ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED +// TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A +// PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT +// SHALL THE AUTHOR OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +// CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +// OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS IN THE SOFTWARE. + +mod as_chunks; +mod as_chunks_mut; + +pub use as_chunks::{as_chunks, AsChunks}; +pub use as_chunks_mut::{as_chunks_mut, AsChunksMut}; + +// TODO(MSRV feature(split_at_checked)): Use `slice::split_at_checked`. +// +// Note that the libcore version is implemented in terms of +// `slice::split_at_unchecked()`, and `slice::split_at()` was changed to be +// implemented in terms of `split_at_checked`. 
For now, we implement this in +// terms of `split_at` and rely on the optimizer to eliminate the panic. +#[inline(always)] +pub fn split_at_checked(slice: &[T], i: usize) -> Option<(&[T], &[T])> { + if slice.len() >= i { + Some(slice.split_at(i)) + } else { + None + } +} + +// TODO(MSRV-1.77): Use `slice::split_first_chunk_mut`. +#[inline(always)] +pub fn split_first_chunk_mut( + slice: &mut [T], +) -> Option<(&mut [T; N], &mut [T])> { + if slice.len() >= N { + let (head, tail) = slice.split_at_mut(N); + head.try_into().ok().map(|head| (head, tail)) + } else { + None + } +} diff --git a/ring-0.17.14/src/polyfill/slice/as_chunks.rs b/ring-0.17.14/src/polyfill/slice/as_chunks.rs new file mode 100644 index 0000000000..bfe5ab5709 --- /dev/null +++ b/ring-0.17.14/src/polyfill/slice/as_chunks.rs @@ -0,0 +1,114 @@ +// Copyright 2025 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +use super::AsChunksMut; +use core::ops; + +#[inline(always)] +pub fn as_chunks(slice: &[T]) -> (AsChunks, &[T]) { + assert!(N != 0, "chunk size must be non-zero"); + let len = slice.len() / N; + let (multiple_of_n, remainder) = slice.split_at(len * N); + (AsChunks(multiple_of_n), remainder) +} + +#[derive(Clone, Copy)] +pub struct AsChunks<'a, T, const N: usize>(&'a [T]); + +impl<'a, T, const N: usize> AsChunks<'a, T, N> { + #[inline(always)] + pub fn from_ref(value: &'a [T; N]) -> Self { + Self(value) + } + + #[inline(always)] + pub fn as_flattened(&self) -> &[T] { + self.0 + } + + #[cfg(any(target_arch = "aarch64", target_arch = "arm", target_arch = "x86_64"))] + #[inline(always)] + pub fn as_ptr(&self) -> *const [T; N] { + self.0.as_ptr().cast() + } + + #[inline(always)] + pub fn is_empty(&self) -> bool { + self.0.is_empty() + } + + #[inline(always)] + pub fn len(&self) -> usize { + self.0.len() / N + } +} + +impl ops::Index for AsChunks<'_, T, N> +where + [T]: ops::Index, Output = [T]>, +{ + type Output = [T; N]; + + #[inline(always)] + fn index(&self, index: usize) -> &Self::Output { + let start = N * index; + let slice = &self.0[start..(start + N)]; + slice.try_into().unwrap() + } +} + +impl<'a, T, const N: usize> IntoIterator for AsChunks<'a, T, N> { + type IntoIter = AsChunksIter<'a, T, N>; + type Item = &'a [T; N]; + + #[inline(always)] + fn into_iter(self) -> Self::IntoIter { + AsChunksIter(self.0.chunks_exact(N)) + } +} + +pub struct AsChunksIter<'a, T, const N: usize>(core::slice::ChunksExact<'a, T>); + +impl<'a, T, const N: usize> Iterator for AsChunksIter<'a, T, N> { + type Item = &'a [T; N]; + + #[inline(always)] + fn next(&mut self) -> Option { + self.0.next().map(|x| x.try_into().unwrap()) + } +} + +// `&mut [[T; N]]` is implicitly convertable to `&[[T; N]]` but our types can't +// do that. 
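A sketch of how `as_chunks` is typically used, assuming its generic parameters are the element type and chunk size (`as_chunks::<T, N>`), as in upstream *ring*; the function below is hypothetical.

// Illustrative sketch (hypothetical helper): split into whole 4-byte chunks;
// `rest` holds the leftover tail bytes.
fn checksum(bytes: &[u8]) -> (u32, usize) {
    let (chunks, rest) = as_chunks::<u8, 4>(bytes);
    let sum = chunks
        .into_iter()
        .map(|chunk| u32::from_le_bytes(*chunk))
        .fold(0u32, u32::wrapping_add);
    (sum, rest.len())
}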
+impl<'a, T, const N: usize> From<&'a AsChunksMut<'_, T, N>> for AsChunks<'a, T, N> { + #[inline(always)] + fn from(as_mut: &'a AsChunksMut<'_, T, N>) -> Self { + Self(as_mut.as_flattened()) + } +} + +impl<'a, T, const N: usize> From<&'a [T; N]> for AsChunks<'a, T, N> { + #[inline(always)] + fn from(array: &'a [T; N]) -> Self { + Self(array) + } +} + +// TODO: `impl From for AsChunks<'a, T, N>`. +impl<'a, T> From> for AsChunks<'a, T, 4> { + #[inline(always)] + fn from(as_2x: AsChunks<'a, T, 8>) -> Self { + Self(as_2x.0) + } +} diff --git a/ring-0.17.14/src/polyfill/slice/as_chunks_mut.rs b/ring-0.17.14/src/polyfill/slice/as_chunks_mut.rs new file mode 100644 index 0000000000..ae32f04e8b --- /dev/null +++ b/ring-0.17.14/src/polyfill/slice/as_chunks_mut.rs @@ -0,0 +1,88 @@ +// Copyright 2025 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +use super::AsChunks; + +#[inline(always)] +pub fn as_chunks_mut(slice: &mut [T]) -> (AsChunksMut, &mut [T]) { + assert!(N != 0, "chunk size must be non-zero"); + let len = slice.len() / N; + let (multiple_of_n, remainder) = slice.split_at_mut(len * N); + (AsChunksMut(multiple_of_n), remainder) +} + +pub struct AsChunksMut<'a, T, const N: usize>(&'a mut [T]); + +impl AsChunksMut<'_, T, N> { + #[inline(always)] + pub fn as_flattened(&self) -> &[T] { + self.0 + } + + #[inline(always)] + pub fn as_flattened_mut(&mut self) -> &mut [T] { + self.0 + } + + #[cfg(target_arch = "aarch64")] + pub fn as_ptr(&self) -> *const [T; N] { + self.0.as_ptr().cast() + } + + #[cfg(target_arch = "x86_64")] + pub fn as_ptr(&self) -> *const [T; N] { + self.0.as_ptr().cast() + } + + #[cfg(target_arch = "aarch64")] + pub fn as_mut_ptr(&mut self) -> *mut [T; N] { + self.0.as_mut_ptr().cast() + } + + #[cfg(target_arch = "x86_64")] + #[inline(always)] + pub fn as_mut(&mut self) -> AsChunksMut { + AsChunksMut(self.0) + } + + #[inline(always)] + pub fn as_ref(&self) -> AsChunks { + AsChunks::::from(self) + } + + // Argument moved from runtime argument to `const` argument so that + // `CHUNK_LEN * N` is checked at compile time for overflow. 
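    // For illustration (hypothetical usage): with N = 4 byte blocks, calling
    // `chunks_mut::<8>()` yields `AsChunksMut` windows covering 8 * 4 = 32
    // bytes each (the final window may be shorter); because the chunk count is
    // a const argument, the `8 * 4` multiplication is evaluated at compile
    // time, so an overflowing chunk size fails the build rather than panicking
    // at runtime.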
+ #[inline(always)] + pub fn chunks_mut(&mut self) -> AsChunksMutChunksMutIter { + AsChunksMutChunksMutIter(self.0.chunks_mut(CHUNK_LEN * N)) + } + + #[cfg(target_arch = "x86_64")] + #[inline(always)] + pub fn split_at_mut(&mut self, mid: usize) -> (AsChunksMut, AsChunksMut) { + let (before, after) = self.0.split_at_mut(mid * N); + (AsChunksMut(before), AsChunksMut(after)) + } +} + +pub struct AsChunksMutChunksMutIter<'a, T, const N: usize>(core::slice::ChunksMut<'a, T>); + +impl<'a, T, const N: usize> Iterator for AsChunksMutChunksMutIter<'a, T, N> { + type Item = AsChunksMut<'a, T, N>; + + #[inline(always)] + fn next(&mut self) -> Option { + self.0.next().map(AsChunksMut) + } +} diff --git a/ring-0.17.14/src/polyfill/sliceutil.rs b/ring-0.17.14/src/polyfill/sliceutil.rs new file mode 100644 index 0000000000..a6765f17b4 --- /dev/null +++ b/ring-0.17.14/src/polyfill/sliceutil.rs @@ -0,0 +1,23 @@ +// Copyright 2024 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +//! Utilities to make dealing with slices less tediuous. + +/// Replaces the first N elements of `a` with the first N elements of `b`, where +/// N is `core::cmp::min(a.len(), b.len())`, leaving the rest unchanged. +pub fn overwrite_at_start(a: &mut [T], b: &[T]) { + a.iter_mut().zip(b).for_each(|(a, b)| { + *a = *b; + }); +} diff --git a/ring-0.17.14/src/polyfill/test.rs b/ring-0.17.14/src/polyfill/test.rs new file mode 100644 index 0000000000..ea90a2437a --- /dev/null +++ b/ring-0.17.14/src/polyfill/test.rs @@ -0,0 +1,29 @@ +pub fn assert_iterator(it: impl ExactSizeIterator + Clone, expected: &[T]) +where + T: Copy + core::fmt::Debug + PartialEq, +{ + // Assert that the cloned iterator is correct. + assert_exact_size_iterator(it.clone(), expected); + // Assert that the original iterator is correct. + assert_exact_size_iterator(it, expected); +} + +/// Asserts that `it` adheres to the `ExactSizeIterator` contract. +fn assert_exact_size_iterator(mut it: impl ExactSizeIterator, expected: &[T]) +where + T: Copy + core::fmt::Debug + PartialEq, +{ + assert_eq!(it.len(), expected.len()); + assert_eq!(it.size_hint(), expected.iter().size_hint()); + + for i in 0..expected.len() { + let len = it.len(); + assert_eq!(len, expected.len() - i); + assert_eq!(it.size_hint(), (len, Some(len))); + assert_eq!(it.next(), Some(expected[i])); + } + + assert_eq!(it.len(), 0); + assert_eq!(it.size_hint(), (0, Some(0))); + assert_eq!(it.next(), None); +} diff --git a/ring-0.17.14/src/polyfill/unwrap_const.rs b/ring-0.17.14/src/polyfill/unwrap_const.rs new file mode 100644 index 0000000000..a1c982f97c --- /dev/null +++ b/ring-0.17.14/src/polyfill/unwrap_const.rs @@ -0,0 +1,29 @@ +// Copyright 2022 Brian Smith. 
+// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +/// Polyfill for `Option::unwrap()` as a const fn; feature `const_option`. +/// https://github.com/rust-lang/rust/issues/67441. +/// TODO(MSRV): Replace this with `x.unwrap()`. +/// +/// `T: Copy` avoids "constant functions cannot evaluate destructors." +pub const fn unwrap_const(x: Option) -> T +where + T: Copy, +{ + if let Some(x) = x { + x + } else { + panic!("unwrap_const on `None`"); + } +} diff --git a/ring-0.17.14/src/prefixed.rs b/ring-0.17.14/src/prefixed.rs new file mode 100644 index 0000000000..39f3f9fa06 --- /dev/null +++ b/ring-0.17.14/src/prefixed.rs @@ -0,0 +1,118 @@ +// Keep in sync with `core_name_and_version` in build.rs. +macro_rules! core_name_and_version { + () => { + concat!( + env!("CARGO_PKG_NAME"), + "_core_", + env!("CARGO_PKG_VERSION_MAJOR"), + "_", + env!("CARGO_PKG_VERSION_MINOR"), + "_", + env!("CARGO_PKG_VERSION_PATCH"), + "_", + env!("CARGO_PKG_VERSION_PRE"), // Often empty + ) + }; +} + +// Keep in sync with `prefix` in build.rs. +macro_rules! prefix { + ( ) => { + concat!(core_name_and_version!(), "_") + }; +} + +macro_rules! prefixed_extern { + // Functions. + { + $( + $( #[$meta:meta] )* + $vis:vis fn $name:ident ( $( $arg_pat:ident : $arg_ty:ty ),* $(,)? ) + $( -> $ret_ty:ty )?; + )+ + } => { + extern "C" { + $( + prefixed_item! { + link_name + $name + { + $( #[$meta] )* + $vis fn $name ( $( $arg_pat : $arg_ty ),* ) $( -> $ret_ty )?; + } + + } + )+ + } + }; + + // A `static` global variable. + { + $( #[$meta:meta] )* + $vis:vis static $name:ident: $typ:ty; + } => { + extern "C" { + prefixed_item! { + link_name + $name + { + $( #[$meta] )* + $vis static $name: $typ; + } + } + } + }; + + // A `static mut` global variable. + { + $( #[$meta:meta] )* + $vis:vis static mut $name:ident: $typ:ty; + } => { + extern "C" { + prefixed_item! { + link_name + $name + { + $( #[$meta] )* + $vis static mut $name: $typ; + } + } + } + }; +} + +#[deprecated = "`#[export_name]` creates problems and we will stop doing it."] +#[cfg(not(any( + all(target_arch = "aarch64", target_endian = "little"), + all(target_arch = "arm", target_endian = "little"), + target_arch = "x86", + target_arch = "x86_64" +)))] +macro_rules! prefixed_export { + // A function. + { + $( #[$meta:meta] )* + $vis:vis unsafe extern "C" + fn $name:ident ( $( $arg_pat:ident : $arg_ty:ty ),* $(,)? ) $body:block + } => { + prefixed_item! { + export_name + $name + { + $( #[$meta] )* + $vis unsafe extern "C" fn $name ( $( $arg_pat : $arg_ty ),* ) $body + } + } + }; +} + +macro_rules! 
prefixed_item { + { + $attr:ident + $name:ident + { $item:item } + } => { + #[$attr = concat!(prefix!(), stringify!($name))] + $item + }; +} diff --git a/ring-0.17.14/src/rand.rs b/ring-0.17.14/src/rand.rs new file mode 100644 index 0000000000..dc1c2a24e5 --- /dev/null +++ b/ring-0.17.14/src/rand.rs @@ -0,0 +1,176 @@ +// Copyright 2015-2016 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +//! Cryptographic pseudo-random number generation. +//! +//! *ring* functions that generate random bytes take a `&dyn SecureRandom` +//! parameter to make it clear which functions are non-deterministic. + +use crate::error; + +/// A secure random number generator. +pub trait SecureRandom: sealed::SecureRandom { + /// Fills `dest` with random bytes. + fn fill(&self, dest: &mut [u8]) -> Result<(), error::Unspecified>; +} + +impl SecureRandom for T +where + T: sealed::SecureRandom, +{ + #[inline(always)] + fn fill(&self, dest: &mut [u8]) -> Result<(), error::Unspecified> { + self.fill_impl(dest) + } +} + +/// A random value constructed from a `SecureRandom` that hasn't been exposed +/// through any safe Rust interface. +/// +/// Intentionally does not implement any traits other than `Sized`. +pub struct Random(T); + +impl Random { + /// Expose the random value. + #[inline] + pub fn expose(self) -> T { + self.0 + } +} + +/// Generate the new random value using `rng`. +#[inline] +pub fn generate( + rng: &dyn SecureRandom, +) -> Result, error::Unspecified> { + let mut r = T::zero(); + rng.fill(r.as_mut_bytes())?; + Ok(Random(r)) +} + +pub(crate) mod sealed { + use crate::error; + + pub trait SecureRandom: core::fmt::Debug { + /// Fills `dest` with random bytes. + fn fill_impl(&self, dest: &mut [u8]) -> Result<(), error::Unspecified>; + } + + pub trait RandomlyConstructable: Sized { + fn zero() -> Self; // `Default::default()` + fn as_mut_bytes(&mut self) -> &mut [u8]; // `AsMut<[u8]>::as_mut` + } + + impl RandomlyConstructable for [u8; N] { + #[inline] + fn zero() -> Self { + [0; N] + } + + #[inline] + fn as_mut_bytes(&mut self) -> &mut [u8] { + &mut self[..] + } + } +} + +/// A type that can be returned by `ring::rand::generate()`. +pub trait RandomlyConstructable: sealed::RandomlyConstructable {} +impl RandomlyConstructable for T where T: sealed::RandomlyConstructable {} + +/// A secure random number generator where the random values come directly +/// from the operating system. +/// +/// "Directly from the operating system" here presently means "whatever the +/// `getrandom` crate does" but that may change in the future. That roughly +/// means calling libc's `getrandom` function or whatever is analogous to that; +/// see the `getrandom` crate's documentation for more info. +/// +/// A single `SystemRandom` may be shared across multiple threads safely. 
+/// +/// `new()` is guaranteed to always succeed and to have low latency; it won't +/// try to open or read from a file or do similar things. The first call to +/// `fill()` may block a substantial amount of time since any and all +/// initialization is deferred to it. Therefore, it may be a good idea to call +/// `fill()` once at a non-latency-sensitive time to minimize latency for +/// future calls. +#[derive(Clone, Debug)] +pub struct SystemRandom(()); + +impl SystemRandom { + /// Constructs a new `SystemRandom`. + #[inline(always)] + pub fn new() -> Self { + Self(()) + } +} + +impl crate::sealed::Sealed for SystemRandom {} + +// Use the `getrandom` crate whenever it is using the environment's (operating +// system's) CSPRNG. Avoid using it on targets where it uses the `rdrand` +// implementation. +#[cfg(all(not(target_os = "optee"), any( + all(feature = "less-safe-getrandom-custom-or-rdrand", target_os = "none"), + all(feature = "less-safe-getrandom-espidf", target_os = "espidf"), + target_os = "aix", + target_os = "android", + target_os = "dragonfly", + target_os = "freebsd", + target_os = "fuchsia", + target_os = "haiku", + target_os = "hermit", + target_os = "hurd", + target_os = "horizon", + target_os = "illumos", + target_os = "linux", + target_os = "netbsd", + target_os = "openbsd", + target_os = "redox", + target_os = "solaris", + target_os = "vita", + target_os = "windows", + all( + target_vendor = "apple", + any( + target_os = "ios", + target_os = "macos", + target_os = "tvos", + target_os = "visionos", + target_os = "watchos", + ) + ), + all( + target_arch = "wasm32", + any( + target_os = "wasi", + all(target_os = "unknown", feature = "wasm32_unknown_unknown_js") + ) + ), +)))] +impl sealed::SecureRandom for SystemRandom { + #[inline(always)] + fn fill_impl(&self, dest: &mut [u8]) -> Result<(), error::Unspecified> { + getrandom::getrandom(dest).map_err(|_| error::Unspecified) + } +} + +#[cfg(target_os = "optee")] +impl sealed::SecureRandom for SystemRandom { + #[inline(always)] + fn fill_impl(&self, dest: &mut [u8]) -> Result<(), error::Unspecified> { + optee_utee::Random::generate(dest); + Ok(()) + } +} \ No newline at end of file diff --git a/ring-0.17.14/src/rsa.rs b/ring-0.17.14/src/rsa.rs new file mode 100644 index 0000000000..e1fb2e7fc6 --- /dev/null +++ b/ring-0.17.14/src/rsa.rs @@ -0,0 +1,76 @@ +// Copyright 2015-2016 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +// *R* and *r* in Montgomery math refer to different things, so we always use +// `R` to refer to *R* to avoid confusion, even when that's against the normal +// naming conventions. Also the standard camelCase names are used for `KeyPair` +// components. + +//! RSA. 
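For orientation, a sketch of how this module is exercised through *ring*'s public `signature` API when verifying an RSA PKCS#1 v1.5 signature over a DER-encoded `RSAPublicKey`; the function name is hypothetical.

use ring::signature::{self, UnparsedPublicKey};

// Illustrative sketch (hypothetical helper): `RSA_PKCS1_2048_8192_SHA256` is
// an `RsaParameters` value bundling the padding scheme and the accepted
// modulus sizes.
fn verify_rsa_pkcs1_sha256(public_key_der: &[u8], msg: &[u8], sig: &[u8]) -> bool {
    UnparsedPublicKey::new(&signature::RSA_PKCS1_2048_8192_SHA256, public_key_der)
        .verify(msg, sig)
        .is_ok()
}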
+ +use crate::{ + arithmetic::bigint, + bits, error, + io::{self, der}, +}; + +pub(crate) mod padding; + +// Maximum RSA modulus size supported for signature verification (in bytes). +const PUBLIC_KEY_PUBLIC_MODULUS_MAX_LEN: usize = + bits::BitLength::from_bits(8192).as_usize_bytes_rounded_up(); + +// Keep in sync with the documentation comment for `KeyPair`. +const PRIVATE_KEY_PUBLIC_MODULUS_MAX_BITS: bits::BitLength = bits::BitLength::from_bits(4096); + +/// Parameters for RSA verification. +#[derive(Debug)] +pub struct RsaParameters { + padding_alg: &'static dyn padding::Verification, + min_bits: bits::BitLength, +} + +fn parse_public_key( + input: untrusted::Input, +) -> Result<(io::Positive, io::Positive), error::Unspecified> { + input.read_all(error::Unspecified, |input| { + der::nested(input, der::Tag::Sequence, error::Unspecified, |input| { + let n = der::positive_integer(input)?; + let e = der::positive_integer(input)?; + Ok((n, e)) + }) + }) +} + +// Type-level representation of an RSA public modulus *n*. See +// `super::bigint`'s modulue-level documentation. +enum N {} + +impl bigint::PublicModulus for N {} + +mod keypair; +mod keypair_components; +mod public_exponent; +mod public_key; +mod public_key_components; +mod public_modulus; + +pub(crate) mod verification; + +use self::{public_exponent::PublicExponent, public_modulus::PublicModulus}; + +pub use self::{ + keypair::KeyPair, keypair_components::KeyPairComponents, public_key::PublicKey, + public_key_components::PublicKeyComponents, +}; diff --git a/ring-0.17.14/src/rsa/keypair.rs b/ring-0.17.14/src/rsa/keypair.rs new file mode 100644 index 0000000000..5994e55882 --- /dev/null +++ b/ring-0.17.14/src/rsa/keypair.rs @@ -0,0 +1,685 @@ +// Copyright 2015-2016 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +use super::{ + padding::{self, RsaEncoding}, + KeyPairComponents, PublicExponent, PublicKey, PublicKeyComponents, N, +}; + +/// RSA PKCS#1 1.5 signatures. +use crate::{ + arithmetic::{ + bigint, + montgomery::{R, RR, RRR}, + LimbSliceError, + }, + bits::BitLength, + cpu, digest, + error::{self, KeyRejected}, + io::der, + pkcs8, rand, signature, +}; + +/// An RSA key pair, used for signing. +pub struct KeyPair { + p: PrivateCrtPrime