diff --git a/.clang-format b/.clang-format
index 43032d44e2..3ccecaac1f 100644
--- a/.clang-format
+++ b/.clang-format
@@ -2,3 +2,10 @@ BasedOnStyle: Google
 MaxEmptyLinesToKeep: 3
 AllowShortIfStatementsOnASingleLine: false
 AllowShortLoopsOnASingleLine: false
+DerivePointerAlignment: false
+PointerAlignment: Right
+# TODO(davidben): The default for Google style is now Regroup, but the default
+# IncludeCategories does not recognize <openssl/header.h>. We should
+# reconfigure IncludeCategories to match. For now, keep it at Preserve.
+IncludeBlocks: Preserve
+
diff --git a/.coveralls.yml b/.coveralls.yml
new file mode 100644
index 0000000000..cf27a37024
--- /dev/null
+++ b/.coveralls.yml
@@ -0,0 +1 @@
+service_name: travis-pro
diff --git a/.gitattributes b/.gitattributes
index bf4e88576e..0271bc950c 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -1,10 +1,7 @@
 * text=auto !eol
-*.sln eol=crlf
-*.vcxproj eol=crlf
-*.vcxproj.filters eol=crlf
-*.props eol=crlf
-*.bat eol=crlf
-*.rc eol=crlf
-*.pl linguist-language=Assembly
+crypto/**/*.pl linguist-language=Assembly
+crypto/perlasm/*.pl linguist-language=Perl
 *.bin binary
 *.der binary
+**/*.h linguist-language=C
+**/*.inl linguist-language=C
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 0000000000..a9261f441f
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,422 @@
+name: ci
+on:
+  pull_request:
+  push:
+jobs:
+  rustfmt:
+    # Don't run duplicate `push` jobs for the repo owner's PRs.
+    if: github.event_name == 'push' || github.event.pull_request.head.repo.full_name != github.repository
+
+    runs-on: ubuntu-18.04
+
+    steps:
+      - uses: actions-rs/toolchain@v1
+        with:
+          toolchain: stable
+          profile: minimal
+          components: rustfmt
+      - uses: actions/checkout@v2
+      - run: cargo fmt --all -- --check
+
+  clippy:
+    # Don't run duplicate `push` jobs for the repo owner's PRs.
+    if: github.event_name == 'push' || github.event.pull_request.head.repo.full_name != github.repository
+
+    runs-on: ubuntu-18.04
+
+    steps:
+      - uses: actions-rs/toolchain@v1
+        with:
+          toolchain: stable
+          profile: minimal
+          components: clippy
+
+      - uses: actions/checkout@v2
+
+      - run: cargo clippy --all-features --all-targets -- --deny warnings
+
+  audit:
+    # Don't run duplicate `push` jobs for the repo owner's PRs.
+    if: github.event_name == 'push' || github.event.pull_request.head.repo.full_name != github.repository
+
+    runs-on: ubuntu-18.04
+
+    steps:
+      - uses: actions-rs/toolchain@v1
+        with:
+          toolchain: stable
+          profile: minimal
+
+      - uses: actions/cache@v2
+        with:
+          path: |
+            ~/.cargo/bin/cargo-audit
+            ~/.cargo/.crates.toml
+            ~/.cargo/.crates2.json
+          key: ${{ runner.os }}-v2-cargo-audit-0.13.1
+
+      - run: cargo install cargo-audit --vers "0.13.1"
+
+      - uses: actions/checkout@v2
+
+      - run: cargo generate-lockfile
+
+      - run: cargo audit --deny warnings
+
+  deny:
+    # Don't run duplicate `push` jobs for the repo owner's PRs.
+    if: github.event_name == 'push' || github.event.pull_request.head.repo.full_name != github.repository
+
+    runs-on: ubuntu-18.04
+
+    steps:
+      - uses: actions-rs/toolchain@v1
+        with:
+          toolchain: stable
+          profile: minimal
+
+      - uses: actions/cache@v2
+        with:
+          path: |
+            ~/.cargo/bin/cargo-deny
+            ~/.cargo/.crates.toml
+            ~/.cargo/.crates2.json
+          key: ${{ runner.os }}-v2-cargo-deny-0.8.4
+
+      - run: cargo install cargo-deny --vers "0.8.4"
+
+      - uses: actions/checkout@v2
+
+      - run: cargo deny check
+
+  # Verify that documentation builds.
+  rustdoc:
+    # Don't run duplicate `push` jobs for the repo owner's PRs.
+ if: github.event_name == 'push' || github.event.pull_request.head.repo.full_name != github.repository + + runs-on: ubuntu-18.04 + + strategy: + matrix: + rust_channel: + - stable + - beta + - nightly + + include: + - target: x86_64-unknown-linux-gnu + + steps: + - uses: actions-rs/toolchain@v1 + with: + override: true + target: ${{ matrix.target }} + toolchain: ${{ matrix.rust_channel }} + + - uses: actions/checkout@v2 + + - run: | + cargo doc --all-features + + package: + # Don't run duplicate `push` jobs for the repo owner's PRs. + if: github.event_name == 'push' || github.event.pull_request.head.repo.full_name != github.repository + + runs-on: windows-latest + + steps: + - uses: actions/checkout@v2 + + - run: powershell -ExecutionPolicy Bypass ./mk/install-build-tools.ps1 + + - uses: actions-rs/toolchain@v1 + with: + toolchain: stable + profile: minimal + + - run: sh mk/package.sh + shell: bash + + test: + # Don't run duplicate `push` jobs for the repo owner's PRs. + if: github.event_name == 'push' || github.event.pull_request.head.repo.full_name != github.repository + + runs-on: ${{ matrix.host_os }} + + strategy: + matrix: + features: + - # Default + + target: + - aarch64-apple-ios + - aarch64-apple-darwin + - aarch64-linux-android + - aarch64-unknown-linux-gnu + - aarch64-unknown-linux-musl + - arm-unknown-linux-gnueabihf + - armv7-linux-androideabi + - armv7-unknown-linux-musleabihf + - i686-pc-windows-msvc + - i686-unknown-linux-gnu + - i686-unknown-linux-musl + - x86_64-pc-windows-gnu + - x86_64-pc-windows-msvc + - x86_64-apple-darwin + - x86_64-unknown-linux-musl + - x86_64-unknown-linux-gnu + + mode: + - # debug + - --release + + rust_channel: + - stable + - nightly + - 1.37.0 # MSRV + - beta + + exclude: + # The stable channel doesn't have aarch64-apple-darwin support yet. + - target: aarch64-apple-darwin + rust_channel: stable + + # The MSRV channel doesn't have aarch64-apple-darwin support yet. + - target: aarch64-apple-darwin + rust_channel: 1.37.0 + + # Only do MSRV testing on release builds. + - mode: # debug + rust_channel: 1.37.0 + + # 1.37.0 doesn't support `-Clink-self-contained`. + - target: aarch64-unknown-linux-musl + rust_channel: 1.37.0 + + # 1.37.0 doesn't support `-Clink-self-contained`. + - target: armv7-unknown-linux-musleabihf + rust_channel: 1.37.0 + + # 1.37.0 doesn't support `-Clink-self-contained`. + - target: i686-unknown-linux-musl + rust_channel: 1.37.0 + + # 1.37.0 doesn't support `-Clink-self-contained`. + - target: x86_64-unknown-linux-musl + rust_channel: 1.37.0 + + # https://github.com/rust-lang/rust/pull/67429 + - target: x86_64-pc-windows-gnu + rust_channel: 1.37.0 + + include: + - target: aarch64-apple-darwin + # macos-latest didn't work. + host_os: macos-11.0 + # GitHub Actions doesn't have a way to run this target yet. + cargo_options: --no-run + + - target: aarch64-apple-ios + host_os: macos-latest + # GitHub Actions doesn't have a way to run this target yet. 
+ cargo_options: --no-run + + - target: aarch64-linux-android + host_os: ubuntu-18.04 + # TODO: https://github.com/briansmith/ring/issues/486 + cargo_options: --no-run + + - target: aarch64-unknown-linux-gnu + host_os: ubuntu-18.04 + + - target: aarch64-unknown-linux-musl + host_os: ubuntu-18.04 + + - target: arm-unknown-linux-gnueabihf + host_os: ubuntu-18.04 + + - target: armv7-linux-androideabi + host_os: ubuntu-18.04 + # TODO: https://github.com/briansmith/ring/issues/838 + cargo_options: --no-run + + - target: armv7-unknown-linux-musleabihf + host_os: ubuntu-18.04 + # TODO: https://github.com/briansmith/ring/issues/1115 + cargo_options: --no-run + + - target: i686-pc-windows-msvc + host_os: windows-latest + + - target: i686-unknown-linux-gnu + host_os: ubuntu-18.04 + + - target: i686-unknown-linux-musl + host_os: ubuntu-18.04 + + - target: x86_64-pc-windows-gnu + host_os: windows-latest + + - target: x86_64-pc-windows-msvc + host_os: windows-latest + + - target: x86_64-apple-darwin + host_os: macos-latest + + - target: x86_64-unknown-linux-musl + host_os: ubuntu-18.04 + + - target: x86_64-unknown-linux-gnu + host_os: ubuntu-18.04 + + steps: + - if: ${{ contains(matrix.host_os, 'ubuntu') }} + run: sudo apt-get update -y + + - uses: actions/checkout@v2 + + - if: ${{ !contains(matrix.host_os, 'windows') }} + run: mk/install-build-tools.sh --target=${{ matrix.target }} ${{ matrix.features }} + + - if: ${{ contains(matrix.host_os, 'windows') }} + run: > + (powershell -ExecutionPolicy Bypass ./mk/install-build-tools.ps1) -and + ("$pwd\target\tools" >> $env:GITHUB_PATH) + + - uses: actions-rs/toolchain@v1 + with: + override: true + target: ${{ matrix.target }} + toolchain: ${{ matrix.rust_channel }} + + - if: ${{ matrix.target == 'aarch64-apple-darwin' }} + run: echo "DEVELOPER_DIR=/Applications/Xcode_12.2.app/Contents/Developer" >> $GITHUB_ENV + + - if: ${{ !contains(matrix.host_os, 'windows') }} + run: | + mk/cargo.sh test -vv --target=${{ matrix.target }} ${{ matrix.cargo_options }} ${{ matrix.features }} ${{ matrix.mode }} + + - if: ${{ contains(matrix.host_os, 'windows') }} + run: | + cargo test -vv --target=${{ matrix.target }} ${{ matrix.cargo_options }} ${{ matrix.features }} ${{ matrix.mode }} + + # The wasm32-unknown-unknown targets have a different set of feature sets and + # an additional `webdriver` dimension. + test-wasm32: + # Don't run duplicate `push` jobs for the repo owner's PRs. + if: github.event_name == 'push' || github.event.pull_request.head.repo.full_name != github.repository + + runs-on: ${{ matrix.host_os }} + + strategy: + matrix: + features: + - # Default + - --features=wasm32_c + host_os: + - ubuntu-18.04 + mode: + - # debug + - --release + rust_channel: + - stable + - beta + - nightly + target: + - wasm32-unknown-unknown + webdriver: + - GECKODRIVER=geckodriver + - CHROMEDRIVER=chromedriver + + steps: + - if: ${{ contains(matrix.host_os, 'ubuntu') }} + run: sudo apt-get update -y + + - uses: actions/checkout@v2 + + - run: cargo generate-lockfile + + - run: mk/install-build-tools.sh --target=${{ matrix.target }} ${{ matrix.features }} + + - uses: actions-rs/toolchain@v1 + with: + override: true + target: ${{ matrix.target }} + toolchain: ${{ matrix.rust_channel }} + + - run: | + ${{ matrix.webdriver }} mk/cargo.sh test -vv --target=${{ matrix.target }} ${{ matrix.features }} ${{ matrix.mode }} + + coverage: + # Don't run duplicate `push` jobs for the repo owner's PRs. 
+ if: github.event_name == 'push' || github.event.pull_request.head.repo.full_name != github.repository + + runs-on: ${{ matrix.host_os }} + + strategy: + matrix: + features: + - --all-features + + # TODO: targets + target: + - aarch64-unknown-linux-gnu + - i686-unknown-linux-gnu + - x86_64-unknown-linux-musl + + mode: + - # debug + + # Coverage collection is Nightly-only + rust_channel: + - nightly + + # TODO: targets + include: + # TODO: Use the -musl target after + # https://github.com/rust-lang/rust/issues/79556 and + # https://github.com/rust-lang/rust/issues/79555 are fixed. + - target: aarch64-unknown-linux-gnu + host_os: ubuntu-18.04 + + # TODO: Use the -musl target after + # https://github.com/rust-lang/rust/issues/79556 and + # https://github.com/rust-lang/rust/issues/79555 are fixed. + - target: i686-unknown-linux-gnu + host_os: ubuntu-18.04 + + - target: x86_64-unknown-linux-musl + host_os: ubuntu-18.04 + + # TODO: Add an ARM target after + # https://github.com/rust-lang/rust/issues/79555 is fixed. This may + # require https://github.com/rust-lang/rust/issues/79555 to be fixed + # too. + + steps: + - if: ${{ contains(matrix.host_os, 'ubuntu') }} + run: sudo apt-get update -y + + - uses: actions/checkout@v2 + + - if: ${{ !contains(matrix.host_os, 'windows') }} + run: RING_COVERAGE=1 mk/install-build-tools.sh --target=${{ matrix.target }} ${{ matrix.features }} + + - uses: actions-rs/toolchain@v1 + with: + override: true + target: ${{ matrix.target }} + toolchain: ${{ matrix.rust_channel }} + + - if: ${{ matrix.target == 'aarch64-apple-darwin' }} + run: echo "DEVELOPER_DIR=/Applications/Xcode_12.2.app/Contents/Developer" >> $GITHUB_ENV + + - if: ${{ !contains(matrix.host_os, 'windows') }} + run: | + RING_COVERAGE=1 mk/cargo.sh +${{ matrix.rust_channel }} test -vv --target=${{ matrix.target }} ${{ matrix.cargo_options }} ${{ matrix.features }} ${{ matrix.mode }} + + - uses: codecov/codecov-action@v1 + with: + directory: ./target/${{ matrix.target }}/debug/coverage/reports + fail_ci_if_error: true + verbose: true diff --git a/.gitignore b/.gitignore index 7219d6bd38..3b63d5972c 100644 --- a/.gitignore +++ b/.gitignore @@ -7,29 +7,6 @@ ssl/test/runner/runner doc/*.html doc/doc.css -util/bot/android_ndk -util/bot/android_tools -util/bot/cmake-linux64 -util/bot/cmake-linux64.tar.gz -util/bot/cmake-mac -util/bot/cmake-mac.tar.gz -util/bot/cmake-win32 -util/bot/cmake-win32.zip -util/bot/golang -util/bot/gyp -util/bot/libcxx -util/bot/libcxxabi -util/bot/llvm-build -util/bot/nasm-win32.exe -util/bot/perl-win32 -util/bot/perl-win32.zip -util/bot/sde-linux64 -util/bot/sde-linux64.tar.bz2 -util/bot/sde-win32 -util/bot/sde-win32.tar.bz2 -util/bot/win_toolchain.json -util/bot/yasm-win32.exe - *.bk *.orig *~ diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index dbbe8ac24a..0000000000 --- a/.travis.yml +++ /dev/null @@ -1,683 +0,0 @@ -language: rust -cache: - directories: - - $HOME/kcov-i686-unknown-linux-gnu - - $HOME/kcov-x86_64-unknown-linux-gnu -matrix: - fast_finish: true - allow_failures: - - rust: nightly - include: - # The lines from "# BEGIN GENERATED" through "# END GENERATED" are - # generated by running |python mk/update-travis-yml.py|. Any changes - # made to those lines will be overwritten while other lines will be left - # untouched. 
- # - # BEGIN GENERATED - - - env: TARGET_X=x86_64-apple-darwin FEATURES_X= MODE_X=DEBUG KCOV=0 RUST_X=stable - rust: stable - os: osx - osx_image: xcode10.1 - - - env: TARGET_X=x86_64-apple-darwin FEATURES_X= MODE_X=RELWITHDEBINFO KCOV=0 RUST_X=stable - rust: stable - os: osx - osx_image: xcode10.1 - - - env: TARGET_X=aarch64-linux-android CC_X=aarch64-linux-android21-clang FEATURES_X= MODE_X=DEBUG KCOV=0 RUST_X=stable - rust: stable - os: linux - language: android - android: - components: - - android-21 - - build-tools-26.0.2 - dist: trusty - - - env: TARGET_X=aarch64-linux-android CC_X=aarch64-linux-android21-clang FEATURES_X= MODE_X=RELWITHDEBINFO KCOV=0 RUST_X=stable - rust: stable - os: linux - language: android - android: - components: - - android-21 - - build-tools-26.0.2 - dist: trusty - - - env: TARGET_X=armv7-linux-androideabi CC_X=armv7a-linux-androideabi18-clang FEATURES_X= MODE_X=DEBUG KCOV=0 RUST_X=stable - rust: stable - os: linux - language: android - android: - components: - - android-18 - - build-tools-26.0.2 - - sys-img-armeabi-v7a-android-18 - dist: trusty - - - env: TARGET_X=armv7-linux-androideabi CC_X=armv7a-linux-androideabi18-clang FEATURES_X= MODE_X=RELWITHDEBINFO KCOV=0 RUST_X=stable - rust: stable - os: linux - language: android - android: - components: - - android-18 - - build-tools-26.0.2 - - sys-img-armeabi-v7a-android-18 - dist: trusty - - - env: TARGET_X=x86_64-unknown-linux-gnu FEATURES_X= MODE_X=DEBUG KCOV=0 RUST_X=stable - rust: stable - os: linux - dist: trusty - - - env: TARGET_X=x86_64-unknown-linux-gnu FEATURES_X= MODE_X=RELWITHDEBINFO KCOV=0 RUST_X=stable - rust: stable - os: linux - dist: trusty - - - env: TARGET_X=x86_64-unknown-linux-gnu CC_X=clang FEATURES_X= MODE_X=DEBUG KCOV=0 RUST_X=stable - rust: stable - os: linux - dist: trusty - - - env: TARGET_X=x86_64-unknown-linux-gnu CC_X=clang FEATURES_X= MODE_X=RELWITHDEBINFO KCOV=0 RUST_X=stable - rust: stable - os: linux - dist: trusty - - - env: TARGET_X=x86_64-unknown-linux-gnu CC_X=gcc-7 FEATURES_X= MODE_X=DEBUG KCOV=0 RUST_X=stable - rust: stable - os: linux - dist: trusty - addons: - apt: - packages: - - gcc-7 - sources: - - ubuntu-toolchain-r-test - - - env: TARGET_X=x86_64-unknown-linux-gnu CC_X=gcc-7 FEATURES_X= MODE_X=RELWITHDEBINFO KCOV=0 RUST_X=stable - rust: stable - os: linux - dist: trusty - addons: - apt: - packages: - - gcc-7 - sources: - - ubuntu-toolchain-r-test - - - env: TARGET_X=aarch64-unknown-linux-gnu CC_X=aarch64-linux-gnu-gcc FEATURES_X= MODE_X=DEBUG KCOV=0 RUST_X=stable - rust: stable - os: linux - dist: trusty - addons: - apt: - packages: - - gcc-aarch64-linux-gnu - - libc6-dev-arm64-cross - sources: - - ubuntu-toolchain-r-test - - - env: TARGET_X=aarch64-unknown-linux-gnu CC_X=aarch64-linux-gnu-gcc FEATURES_X= MODE_X=RELWITHDEBINFO KCOV=0 RUST_X=stable - rust: stable - os: linux - dist: trusty - addons: - apt: - packages: - - gcc-aarch64-linux-gnu - - libc6-dev-arm64-cross - sources: - - ubuntu-toolchain-r-test - - - env: TARGET_X=i686-unknown-linux-gnu FEATURES_X= MODE_X=DEBUG KCOV=0 RUST_X=stable - rust: stable - os: linux - dist: trusty - addons: - apt: - packages: - - gcc-multilib - - libc6-dev-i386 - sources: - - ubuntu-toolchain-r-test - - - env: TARGET_X=i686-unknown-linux-gnu FEATURES_X= MODE_X=RELWITHDEBINFO KCOV=0 RUST_X=stable - rust: stable - os: linux - dist: trusty - addons: - apt: - packages: - - gcc-multilib - - libc6-dev-i386 - sources: - - ubuntu-toolchain-r-test - - - env: TARGET_X=i686-unknown-linux-gnu CC_X=clang FEATURES_X= MODE_X=DEBUG KCOV=0 
RUST_X=stable - rust: stable - os: linux - dist: trusty - addons: - apt: - packages: - - gcc-multilib - - libc6-dev-i386 - sources: - - ubuntu-toolchain-r-test - - - env: TARGET_X=i686-unknown-linux-gnu CC_X=clang FEATURES_X= MODE_X=RELWITHDEBINFO KCOV=0 RUST_X=stable - rust: stable - os: linux - dist: trusty - addons: - apt: - packages: - - gcc-multilib - - libc6-dev-i386 - sources: - - ubuntu-toolchain-r-test - - - env: TARGET_X=i686-unknown-linux-gnu CC_X=gcc-7 FEATURES_X= MODE_X=DEBUG KCOV=0 RUST_X=stable - rust: stable - os: linux - dist: trusty - addons: - apt: - packages: - - gcc-7 - - gcc-7-multilib - - linux-libc-dev:i386 - sources: - - ubuntu-toolchain-r-test - - - env: TARGET_X=i686-unknown-linux-gnu CC_X=gcc-7 FEATURES_X= MODE_X=RELWITHDEBINFO KCOV=0 RUST_X=stable - rust: stable - os: linux - dist: trusty - addons: - apt: - packages: - - gcc-7 - - gcc-7-multilib - - linux-libc-dev:i386 - sources: - - ubuntu-toolchain-r-test - - - env: TARGET_X=arm-unknown-linux-gnueabihf CC_X=arm-linux-gnueabihf-gcc FEATURES_X= MODE_X=DEBUG KCOV=0 RUST_X=stable - rust: stable - os: linux - dist: trusty - addons: - apt: - packages: - - gcc-arm-linux-gnueabihf - - libc6-dev-armhf-cross - sources: - - ubuntu-toolchain-r-test - - - env: TARGET_X=arm-unknown-linux-gnueabihf CC_X=arm-linux-gnueabihf-gcc FEATURES_X= MODE_X=RELWITHDEBINFO KCOV=0 RUST_X=stable - rust: stable - os: linux - dist: trusty - addons: - apt: - packages: - - gcc-arm-linux-gnueabihf - - libc6-dev-armhf-cross - sources: - - ubuntu-toolchain-r-test - - - env: TARGET_X=x86_64-apple-darwin FEATURES_X= MODE_X=DEBUG KCOV=0 RUST_X=nightly - rust: nightly - os: osx - osx_image: xcode10.1 - - - env: TARGET_X=x86_64-apple-darwin FEATURES_X= MODE_X=RELWITHDEBINFO KCOV=0 RUST_X=nightly - rust: nightly - os: osx - osx_image: xcode10.1 - - - env: TARGET_X=aarch64-linux-android CC_X=aarch64-linux-android21-clang FEATURES_X= MODE_X=DEBUG KCOV=0 RUST_X=nightly - rust: nightly - os: linux - language: android - android: - components: - - android-21 - - build-tools-26.0.2 - dist: trusty - - - env: TARGET_X=aarch64-linux-android CC_X=aarch64-linux-android21-clang FEATURES_X= MODE_X=RELWITHDEBINFO KCOV=0 RUST_X=nightly - rust: nightly - os: linux - language: android - android: - components: - - android-21 - - build-tools-26.0.2 - dist: trusty - - - env: TARGET_X=armv7-linux-androideabi CC_X=armv7a-linux-androideabi18-clang FEATURES_X= MODE_X=DEBUG KCOV=0 RUST_X=nightly - rust: nightly - os: linux - language: android - android: - components: - - android-18 - - build-tools-26.0.2 - - sys-img-armeabi-v7a-android-18 - dist: trusty - - - env: TARGET_X=armv7-linux-androideabi CC_X=armv7a-linux-androideabi18-clang FEATURES_X= MODE_X=RELWITHDEBINFO KCOV=0 RUST_X=nightly - rust: nightly - os: linux - language: android - android: - components: - - android-18 - - build-tools-26.0.2 - - sys-img-armeabi-v7a-android-18 - dist: trusty - - - env: TARGET_X=x86_64-unknown-linux-gnu FEATURES_X= MODE_X=DEBUG KCOV=0 RUST_X=nightly - rust: nightly - os: linux - dist: trusty - - - env: TARGET_X=x86_64-unknown-linux-gnu FEATURES_X= MODE_X=RELWITHDEBINFO KCOV=0 RUST_X=nightly - rust: nightly - os: linux - dist: trusty - - - env: TARGET_X=x86_64-unknown-linux-gnu CC_X=clang FEATURES_X= MODE_X=DEBUG KCOV=0 RUST_X=nightly - rust: nightly - os: linux - dist: trusty - - - env: TARGET_X=x86_64-unknown-linux-gnu CC_X=clang FEATURES_X= MODE_X=RELWITHDEBINFO KCOV=0 RUST_X=nightly - rust: nightly - os: linux - dist: trusty - - - env: TARGET_X=x86_64-unknown-linux-gnu CC_X=gcc-7 
FEATURES_X= MODE_X=DEBUG KCOV=1 RUST_X=nightly - rust: nightly - os: linux - dist: trusty - addons: - apt: - packages: - - binutils-dev - - g++-7 - - gcc-7 - - libcurl4-openssl-dev - - libdw-dev - - libelf-dev - - libiberty-dev - sources: - - ubuntu-toolchain-r-test - - - env: TARGET_X=x86_64-unknown-linux-gnu CC_X=gcc-7 FEATURES_X= MODE_X=RELWITHDEBINFO KCOV=0 RUST_X=nightly - rust: nightly - os: linux - dist: trusty - addons: - apt: - packages: - - gcc-7 - sources: - - ubuntu-toolchain-r-test - - - env: TARGET_X=aarch64-unknown-linux-gnu CC_X=aarch64-linux-gnu-gcc FEATURES_X= MODE_X=DEBUG KCOV=0 RUST_X=nightly - rust: nightly - os: linux - dist: trusty - addons: - apt: - packages: - - gcc-aarch64-linux-gnu - - libc6-dev-arm64-cross - sources: - - ubuntu-toolchain-r-test - - - env: TARGET_X=aarch64-unknown-linux-gnu CC_X=aarch64-linux-gnu-gcc FEATURES_X= MODE_X=RELWITHDEBINFO KCOV=0 RUST_X=nightly - rust: nightly - os: linux - dist: trusty - addons: - apt: - packages: - - gcc-aarch64-linux-gnu - - libc6-dev-arm64-cross - sources: - - ubuntu-toolchain-r-test - - - env: TARGET_X=i686-unknown-linux-gnu FEATURES_X= MODE_X=DEBUG KCOV=0 RUST_X=nightly - rust: nightly - os: linux - dist: trusty - addons: - apt: - packages: - - gcc-multilib - - libc6-dev-i386 - sources: - - ubuntu-toolchain-r-test - - - env: TARGET_X=i686-unknown-linux-gnu FEATURES_X= MODE_X=RELWITHDEBINFO KCOV=0 RUST_X=nightly - rust: nightly - os: linux - dist: trusty - addons: - apt: - packages: - - gcc-multilib - - libc6-dev-i386 - sources: - - ubuntu-toolchain-r-test - - - env: TARGET_X=i686-unknown-linux-gnu CC_X=clang FEATURES_X= MODE_X=DEBUG KCOV=0 RUST_X=nightly - rust: nightly - os: linux - dist: trusty - addons: - apt: - packages: - - gcc-multilib - - libc6-dev-i386 - sources: - - ubuntu-toolchain-r-test - - - env: TARGET_X=i686-unknown-linux-gnu CC_X=clang FEATURES_X= MODE_X=RELWITHDEBINFO KCOV=0 RUST_X=nightly - rust: nightly - os: linux - dist: trusty - addons: - apt: - packages: - - gcc-multilib - - libc6-dev-i386 - sources: - - ubuntu-toolchain-r-test - - - env: TARGET_X=i686-unknown-linux-gnu CC_X=gcc-7 FEATURES_X= MODE_X=DEBUG KCOV=1 RUST_X=nightly - rust: nightly - os: linux - dist: trusty - addons: - apt: - packages: - - g++-7 - - g++-7-multilib - - gcc-7 - - gcc-7-multilib - - libcurl3:i386 - - libcurl4-openssl-dev:i386 - - libdw-dev:i386 - - libelf-dev:i386 - - libiberty-dev:i386 - - libkrb5-dev:i386 - - libssl-dev:i386 - - linux-libc-dev:i386 - sources: - - ubuntu-toolchain-r-test - - - env: TARGET_X=i686-unknown-linux-gnu CC_X=gcc-7 FEATURES_X= MODE_X=RELWITHDEBINFO KCOV=0 RUST_X=nightly - rust: nightly - os: linux - dist: trusty - addons: - apt: - packages: - - gcc-7 - - gcc-7-multilib - - linux-libc-dev:i386 - sources: - - ubuntu-toolchain-r-test - - - env: TARGET_X=arm-unknown-linux-gnueabihf CC_X=arm-linux-gnueabihf-gcc FEATURES_X= MODE_X=DEBUG KCOV=0 RUST_X=nightly - rust: nightly - os: linux - dist: trusty - addons: - apt: - packages: - - gcc-arm-linux-gnueabihf - - libc6-dev-armhf-cross - sources: - - ubuntu-toolchain-r-test - - - env: TARGET_X=arm-unknown-linux-gnueabihf CC_X=arm-linux-gnueabihf-gcc FEATURES_X= MODE_X=RELWITHDEBINFO KCOV=0 RUST_X=nightly - rust: nightly - os: linux - dist: trusty - addons: - apt: - packages: - - gcc-arm-linux-gnueabihf - - libc6-dev-armhf-cross - sources: - - ubuntu-toolchain-r-test - - - env: TARGET_X=x86_64-apple-darwin FEATURES_X= MODE_X=DEBUG KCOV=0 RUST_X=beta - rust: beta - os: osx - osx_image: xcode10.1 - - - env: TARGET_X=x86_64-apple-darwin FEATURES_X= 
MODE_X=RELWITHDEBINFO KCOV=0 RUST_X=beta - rust: beta - os: osx - osx_image: xcode10.1 - - - env: TARGET_X=aarch64-linux-android CC_X=aarch64-linux-android21-clang FEATURES_X= MODE_X=DEBUG KCOV=0 RUST_X=beta - rust: beta - os: linux - language: android - android: - components: - - android-21 - - build-tools-26.0.2 - dist: trusty - - - env: TARGET_X=aarch64-linux-android CC_X=aarch64-linux-android21-clang FEATURES_X= MODE_X=RELWITHDEBINFO KCOV=0 RUST_X=beta - rust: beta - os: linux - language: android - android: - components: - - android-21 - - build-tools-26.0.2 - dist: trusty - - - env: TARGET_X=armv7-linux-androideabi CC_X=armv7a-linux-androideabi18-clang FEATURES_X= MODE_X=DEBUG KCOV=0 RUST_X=beta - rust: beta - os: linux - language: android - android: - components: - - android-18 - - build-tools-26.0.2 - - sys-img-armeabi-v7a-android-18 - dist: trusty - - - env: TARGET_X=armv7-linux-androideabi CC_X=armv7a-linux-androideabi18-clang FEATURES_X= MODE_X=RELWITHDEBINFO KCOV=0 RUST_X=beta - rust: beta - os: linux - language: android - android: - components: - - android-18 - - build-tools-26.0.2 - - sys-img-armeabi-v7a-android-18 - dist: trusty - - - env: TARGET_X=x86_64-unknown-linux-gnu FEATURES_X= MODE_X=DEBUG KCOV=0 RUST_X=beta - rust: beta - os: linux - dist: trusty - - - env: TARGET_X=x86_64-unknown-linux-gnu FEATURES_X= MODE_X=RELWITHDEBINFO KCOV=0 RUST_X=beta - rust: beta - os: linux - dist: trusty - - - env: TARGET_X=x86_64-unknown-linux-gnu CC_X=clang FEATURES_X= MODE_X=DEBUG KCOV=0 RUST_X=beta - rust: beta - os: linux - dist: trusty - - - env: TARGET_X=x86_64-unknown-linux-gnu CC_X=clang FEATURES_X= MODE_X=RELWITHDEBINFO KCOV=0 RUST_X=beta - rust: beta - os: linux - dist: trusty - - - env: TARGET_X=x86_64-unknown-linux-gnu CC_X=gcc-7 FEATURES_X= MODE_X=DEBUG KCOV=0 RUST_X=beta - rust: beta - os: linux - dist: trusty - addons: - apt: - packages: - - gcc-7 - sources: - - ubuntu-toolchain-r-test - - - env: TARGET_X=x86_64-unknown-linux-gnu CC_X=gcc-7 FEATURES_X= MODE_X=RELWITHDEBINFO KCOV=0 RUST_X=beta - rust: beta - os: linux - dist: trusty - addons: - apt: - packages: - - gcc-7 - sources: - - ubuntu-toolchain-r-test - - - env: TARGET_X=aarch64-unknown-linux-gnu CC_X=aarch64-linux-gnu-gcc FEATURES_X= MODE_X=DEBUG KCOV=0 RUST_X=beta - rust: beta - os: linux - dist: trusty - addons: - apt: - packages: - - gcc-aarch64-linux-gnu - - libc6-dev-arm64-cross - sources: - - ubuntu-toolchain-r-test - - - env: TARGET_X=aarch64-unknown-linux-gnu CC_X=aarch64-linux-gnu-gcc FEATURES_X= MODE_X=RELWITHDEBINFO KCOV=0 RUST_X=beta - rust: beta - os: linux - dist: trusty - addons: - apt: - packages: - - gcc-aarch64-linux-gnu - - libc6-dev-arm64-cross - sources: - - ubuntu-toolchain-r-test - - - env: TARGET_X=i686-unknown-linux-gnu FEATURES_X= MODE_X=DEBUG KCOV=0 RUST_X=beta - rust: beta - os: linux - dist: trusty - addons: - apt: - packages: - - gcc-multilib - - libc6-dev-i386 - sources: - - ubuntu-toolchain-r-test - - - env: TARGET_X=i686-unknown-linux-gnu FEATURES_X= MODE_X=RELWITHDEBINFO KCOV=0 RUST_X=beta - rust: beta - os: linux - dist: trusty - addons: - apt: - packages: - - gcc-multilib - - libc6-dev-i386 - sources: - - ubuntu-toolchain-r-test - - - env: TARGET_X=i686-unknown-linux-gnu CC_X=clang FEATURES_X= MODE_X=DEBUG KCOV=0 RUST_X=beta - rust: beta - os: linux - dist: trusty - addons: - apt: - packages: - - gcc-multilib - - libc6-dev-i386 - sources: - - ubuntu-toolchain-r-test - - - env: TARGET_X=i686-unknown-linux-gnu CC_X=clang FEATURES_X= MODE_X=RELWITHDEBINFO KCOV=0 RUST_X=beta - rust: 
beta - os: linux - dist: trusty - addons: - apt: - packages: - - gcc-multilib - - libc6-dev-i386 - sources: - - ubuntu-toolchain-r-test - - - env: TARGET_X=i686-unknown-linux-gnu CC_X=gcc-7 FEATURES_X= MODE_X=DEBUG KCOV=0 RUST_X=beta - rust: beta - os: linux - dist: trusty - addons: - apt: - packages: - - gcc-7 - - gcc-7-multilib - - linux-libc-dev:i386 - sources: - - ubuntu-toolchain-r-test - - - env: TARGET_X=i686-unknown-linux-gnu CC_X=gcc-7 FEATURES_X= MODE_X=RELWITHDEBINFO KCOV=0 RUST_X=beta - rust: beta - os: linux - dist: trusty - addons: - apt: - packages: - - gcc-7 - - gcc-7-multilib - - linux-libc-dev:i386 - sources: - - ubuntu-toolchain-r-test - - - env: TARGET_X=arm-unknown-linux-gnueabihf CC_X=arm-linux-gnueabihf-gcc FEATURES_X= MODE_X=DEBUG KCOV=0 RUST_X=beta - rust: beta - os: linux - dist: trusty - addons: - apt: - packages: - - gcc-arm-linux-gnueabihf - - libc6-dev-armhf-cross - sources: - - ubuntu-toolchain-r-test - - - env: TARGET_X=arm-unknown-linux-gnueabihf CC_X=arm-linux-gnueabihf-gcc FEATURES_X= MODE_X=RELWITHDEBINFO KCOV=0 RUST_X=beta - rust: beta - os: linux - dist: trusty - addons: - apt: - packages: - - gcc-arm-linux-gnueabihf - - libc6-dev-armhf-cross - sources: - - ubuntu-toolchain-r-test - - # END GENERATED - -script: if [[ "$TARGET_X" =~ ^a*.*linux-.* && "$MODE_X" == "RELWITHDEBINFO" ]]; then travis_wait 60 mk/travis.sh; else mk/travis.sh; fi diff --git a/BUILDING.md b/BUILDING.md index ab6d89558c..e5f6b7e548 100644 --- a/BUILDING.md +++ b/BUILDING.md @@ -26,15 +26,16 @@ Builds directly from Git ------------------------ If you want to hack on *ring* then you need to build it directly from its Git -repository. In this case, you must also have Perl installed, because the -assembly language modules inherited from BoringSSL (inherited from OpenSSL) -use Perl as a macro assembly language. +repository. There are some additional requirements for doing this that do not +apply when building from crates.io: -When building from Git for Windows, directories containing yasm.exe and -perl.exe must be in `%PATH%`, where yasm.exe is -[Yasm](http://yasm.tortall.net/Download.html) 1.3 or later and where perl.exe -is recommended to be [Strawberry Perl](http://strawberryperl.com). +* For any target for which *ring* has assembly language implementations of + primitives (32- and 64- bit Intel, and 32- and 64-bit ARM), Perl must be + installed and in `$PATH`. +* For Windows targets, `target/tools/nasm[.exe]` is used as the assembler; + [mk/install-build-tools.ps1](mk/install-build-tools.ps1) downloads it for + Windows hosts. Cross Compiling --------------- @@ -79,11 +80,6 @@ e.g. export `CFLAGS=-D__ANDROID_API__=21`. Additional Features that are Useful for Development --------------------------------------------------- -The `internal_benches` feature enable benchmarks of internal functions. These -benchmarks are only useful for people hacking on the implementation of *ring*. -(The benchmarks for the *ring* API are in the -[crypto-bench](https://github.com/briansmith/crypto-bench) project.) - The `slow_tests` feature runs additional tests that are too slow to run during a normal edit-compile-test cycle. diff --git a/Cargo.toml b/Cargo.toml index 275d42c621..bef69f9932 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -10,7 +10,7 @@ license-file = "LICENSE" name = "ring" readme = "doc/link-to-readme.md" repository = "https://github.com/briansmith/ring" -version = "0.16.15" +version = "0.16.19" # Prevent multiple versions of *ring* from being linked into the same program. 
links = "ring-asm" @@ -72,6 +72,7 @@ include = [ "crypto/fipsmodule/modes/asm/ghash-x86.pl", "crypto/fipsmodule/modes/asm/ghash-x86_64.pl", "crypto/fipsmodule/modes/asm/ghashv8-armx.pl", + "crypto/fipsmodule/modes/gcm.c", "crypto/fipsmodule/modes/internal.h", "crypto/fipsmodule/rand/asm/rdrand-x86_64.pl", "crypto/fipsmodule/sha/asm/sha256-armv4.pl", @@ -88,10 +89,11 @@ include = [ "crypto/perlasm/x86gas.pl", "crypto/perlasm/x86nasm.pl", "crypto/perlasm/x86_64-xlate.pl", - "crypto/poly1305/asm/poly1305-armv4.pl", - "crypto/poly1305/asm/poly1305-armv8.pl", - "crypto/poly1305/asm/poly1305-x86.pl", - "crypto/poly1305/asm/poly1305-x86_64.pl", + "crypto/poly1305/internal.h", + "crypto/poly1305/poly1305.c", + "crypto/poly1305/poly1305_arm.c", + "crypto/poly1305/poly1305_arm_asm.S", + "crypto/poly1305/poly1305_vec.c", "doc/link-to-readme.md", "examples/checkdigest.rs", "include/GFp/aes.h", @@ -100,6 +102,7 @@ include = [ "include/GFp/check.h", "include/GFp/cpu.h", "include/GFp/mem.h", + "include/GFp/poly1305.h", "include/GFp/type_check.h", "src/aead.rs", "src/aead/aes.rs", @@ -299,14 +302,15 @@ name = "ring" [dependencies] untrusted = { version = "0.7.1" } -[target.'cfg(all(any(target_arch = "aarch64", target_arch = "arm", target_arch = "x86", target_arch = "x86_64"), not(target_os = "ios")))'.dependencies] +[target.'cfg(any(target_arch = "x86",target_arch = "x86_64", all(any(target_arch = "aarch64", target_arch = "arm"), any(target_os = "android", target_os = "fuchsia", target_os = "linux"))))'.dependencies] spin = { version = "0.5.2", default-features = false } [target.'cfg(any(target_os = "android", target_os = "linux"))'.dependencies] libc = { version = "0.2.69", default-features = false } +once_cell = { version = "1.5.2", default-features = false, features=["std"], optional = true } -[target.'cfg(any(target_os = "android", target_os = "freebsd", target_os = "linux", target_os = "netbsd", target_os = "openbsd", target_os = "solaris"))'.dependencies] -once_cell = { version = "1.3.1", default-features = false, features=["std"], optional = true } +[target.'cfg(any(target_os = "dragonfly", target_os = "freebsd", target_os = "illumos", target_os = "netbsd", target_os = "openbsd", target_os = "solaris"))'.dependencies] +once_cell = { version = "1.5.2", default-features = false, features=["std"] } [target.'cfg(all(target_arch = "wasm32", target_vendor = "unknown", target_os = "unknown", target_env = ""))'.dependencies] web-sys = { version = "0.3.37", default-features = false, features = ["Crypto", "Window"] } @@ -315,14 +319,14 @@ web-sys = { version = "0.3.37", default-features = false, features = ["Crypto", winapi = { version = "0.3.8", default-features = false, features = ["ntsecapi", "wtypesbase"] } [target.'cfg(target_arch = "wasm32")'.dev-dependencies] -wasm-bindgen-test = { version = "0.3.10", default-features = false } +wasm-bindgen-test = { version = "0.3.18", default-features = false } [target.'cfg(any(unix, windows))'.dev-dependencies] -libc = { version = "0.2.69", default-features = false } +libc = { version = "0.2.80", default-features = false } # Keep this in sync with `[dependencies]` in pregenerate_asm/Cargo.toml. [build-dependencies] -cc = { version = "1.0.41", default-features = false } +cc = { version = "1.0.62", default-features = false } [features] # These features are documented in the top-level module's documentation. 
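[Editor's aside: the `once_cell` dependency rearranged above exists to support thread-safe, do-once initialization with the crate's `std` feature. The following is a minimal sketch of that pattern, not *ring*'s actual code; `detect_cpu_features` is a hypothetical stand-in for whatever expensive one-time work (CPU feature probing, opening an OS randomness source) such a crate needs.]

```rust
// Hedged sketch: the one-time-initialization pattern enabled by the
// `once_cell` dependency above. `detect_cpu_features` is hypothetical.
use once_cell::sync::Lazy;

static CPU_FEATURES: Lazy<u32> = Lazy::new(detect_cpu_features);

fn detect_cpu_features() -> u32 {
    // Placeholder: a real implementation would query CPUID or the OS.
    0
}

fn main() {
    // The first access runs `detect_cpu_features` exactly once, even if
    // many threads race to get here; every later access is just a load.
    println!("features: {:#x}", *CPU_FEATURES);
}
```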
diff --git a/README.md b/README.md index 50c8b77722..47ae07be03 100644 --- a/README.md +++ b/README.md @@ -131,7 +131,7 @@ Users of *ring* should always use the latest released version, and users should upgrade to the latest released version as soon as it is released. *ring* has a linear release model that favors users of the latest released version. We have never backported fixes to earlier releases and we don't -maintain branches other than the master branch. Further, for some obscure +maintain branches other than the main branch. Further, for some obscure technical reasons it's currently not possible to link two different versions of *ring* into the same program; for policy reasons we don't bother to try to work around that. Thus it is important that libraries using *ring* update @@ -169,8 +169,8 @@ source libraries use. The idea behind *our* model is to encourage all users to work together to ensure that the latest version is good *as it is being developed*. In particular, because users know that correctness/security fixes (if any) aren't going to get backported, they have a strong incentive to help -review pull requests before they are merged and/or review commits on the master -branch after they've landed to ensure that code quality on the master branch +review pull requests before they are merged and/or review commits on the main +branch after they've landed to ensure that code quality on the main branch stays high. The more common model, where there are stable versions that have important @@ -198,41 +198,31 @@ any security vulnerability in this code privately to anybody.** Online Automated Testing ------------------------ -Travis CI is used for Android, Linux, and macOS. Appveyor is used for Windows. -The tests are run in debug and release configurations, for the current release -of each Rust channel (Stable, Beta, Nightly), for each configuration listed in -the table below. The C compilers listed are used for compiling the C portions. - - - - - - - - - - - - - - - - - - - - - - - - -
-<tr><th>OS</th><th>Arch.</th><th>Compilers</th><th>Status</th></tr>
-<tr><td>Linux</td><td>x86, x86_64</td>
-    <td>GCC 4.8, GCC 7, Clang 5</td><td>Build Status</td></tr>
-<tr><td></td><td>32&#8209;bit ARM, AAarch64</td>
-    <td>GCC (Ubuntu/Linaro 4.8.4-2ubuntu1~14.04.1), tested using
-        qemu-user-arm.</td></tr>
-<tr><td>Android</td><td>ARMv7, Aarch64</td>
-    <td>*ring* for ARMv7 Android is built in CI using SDK version 26 targeting
-        API level 18 (Android 4.3+); it is tested in the emulator using the
-        corresponding system image. *ring* for AArch64 Android is built in CI
-        using SDK version 26 targeting API level 21 (Android 5.0).</td></tr>
-<tr><td>Mac OS X</td><td>x64</td>
-    <td>Apple LLVM version 9.0.0 (clang-900.0.39.2) from Xcode 9.3</td></tr>
-<tr><td>Windows</td><td>x86, x86_64</td><td>MSVC 2015 Update 3 (14.0)</td>
-    <td>Build Status</td></tr>
- - +The following targets are tested in GitHub Actions. The tests are run in debug +and release configurations, for the current release of each Rust channel +(Stable, Beta, Nightly). A C compiler is currently required to compile some +parts of *ring*; *ring* should be compatible with GCC 4.8+, Clang 10+, and MSVC +2019+, at least. + +| Target | Notes | +| -------------------------------| ----- | +| aarch64-apple-darwin | Build-only (GitHub Actions doesn't have a way to run the tests) +| aarch64-apple-ios | Build-only (GitHub Actions doesn't have a way to run the tests) +| aarch64-unknown-linux-gnu | Tested on 64-bit Linux using QEMU user emulation +| aarch64-unknown-linux-musl | Tested on 64-bit Linux using QEMU user emulation. [Needs more work; issue 713](https://github.com/briansmith/ring/issues/713) +| aarch64-linux-android | API level 21 (Android 5.0+); [Build-only; issue 486](https://github.com/briansmith/ring/issues/486) +| arm-unknown-linux-gnueabihf | Tested on 64-bit Linux using QEMU user emulation +| armv7-linux-androideabi | API level 18 (Android 4.3+); [Build-only; issue 838](https://github.com/briansmith/ring/issues/838) +| armv7-unknown-linux-musleabihf | Tested on 64-bit Linux using QEMU user emulation. [Needs more work; issue 713](https://github.com/briansmith/ring/issues/713) +| i686-pc-windows-msvc | Tested on 64-bit Windows Server 2019 Datacenter +| i686-unknown-linux-gnu | Tested on 64-bit Linux using multilib support +| i686-unknown-linux-musl | Tested on 64-bit Linux using multilib support. [Needs more work; issue 713](https://github.com/briansmith/ring/issues/713) +| x86_64-apple-darwin | +| x86_64-pc-windows-gnu | +| x86_64-pc-windows-msvc | Tested on 64-bit Windows Server 2019 Datacenter +| x86_64-unknown-linux-gnu | +| x86_64-unknown-linux-musl | [Needs more work; issue 713](https://github.com/briansmith/ring/issues/713) +| wasm32-unknown-unknown | Tested using wasm-bindgen-test-runner on Linux in Chrome and Firefox. License ------- diff --git a/appveyor.yml b/appveyor.yml deleted file mode 100644 index 97988f1f17..0000000000 --- a/appveyor.yml +++ /dev/null @@ -1,20 +0,0 @@ -version: 1.0.{build} -os: - - Visual Studio 2019 -clone_depth: 1 -configuration: - - Debug - - Release -platform: - - Win32 - - x64 -environment: - matrix: - - TOOLCHAIN_VERSION: 14.0 - RUST: stable - - TOOLCHAIN_VERSION: 14.0 - RUST: beta - - TOOLCHAIN_VERSION: 14.0 - RUST: nightly - -build_script: mk/appveyor.bat diff --git a/build.rs b/build.rs index d0a7433e71..c09539d0a7 100644 --- a/build.rs +++ b/build.rs @@ -19,24 +19,6 @@ // another for the concrete logging implementation). Instead we use `eprintln!` // to log everything to stderr. -#![forbid( - anonymous_parameters, - box_pointers, - missing_copy_implementations, - missing_debug_implementations, - missing_docs, - trivial_casts, - trivial_numeric_casts, - unsafe_code, - unstable_features, - unused_extern_crates, - unused_import_braces, - unused_qualifications, - unused_results, - variant_size_differences, - warnings -)] - // In the `pregenerate_asm_main()` case we don't want to access (Cargo) // environment variables at all, so avoid `use std::env` here. 
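[Editor's aside: the `RING_SRCS` table in the next hunk pairs each source file with the list of architectures it applies to, where an empty list means "built for every target". A minimal sketch of how such table-driven source selection works; `srcs_for` is a hypothetical helper for illustration, not part of ring's build.rs.]

```rust
// Hedged sketch of table-driven source selection: each entry pairs the
// architectures it applies to (empty slice = all) with a source path.
const SRCS: &[(&[&str], &str)] = &[
    (&[], "crypto/mem.c"),                           // built everywhere
    (&["x86_64"], "crypto/poly1305/poly1305_vec.c"), // x86_64 only
];

// Hypothetical helper: select the sources that apply to `arch`.
fn srcs_for<'a>(arch: &str, table: &'a [(&'a [&'a str], &'a str)]) -> Vec<&'a str> {
    table
        .iter()
        .filter(|(archs, _)| archs.is_empty() || archs.contains(&arch))
        .map(|&(_, path)| path)
        .collect()
}

fn main() {
    // The x86_64-only file is filtered out for an ARM build.
    assert_eq!(srcs_for("arm", SRCS), vec!["crypto/mem.c"]);
}
```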
@@ -52,13 +34,14 @@ const X86_64: &str = "x86_64"; const AARCH64: &str = "aarch64"; const ARM: &str = "arm"; -#[cfg_attr(rustfmt, rustfmt_skip)] +#[rustfmt::skip] const RING_SRCS: &[(&[&str], &str)] = &[ (&[], "crypto/fipsmodule/aes/aes_nohw.c"), (&[], "crypto/fipsmodule/bn/montgomery.c"), (&[], "crypto/fipsmodule/bn/montgomery_inv.c"), (&[], "crypto/limbs/limbs.c"), (&[], "crypto/mem.c"), + (&[], "crypto/poly1305/poly1305.c"), (&[AARCH64, ARM, X86_64, X86], "crypto/crypto.c"), (&[AARCH64, ARM, X86_64, X86], "crypto/curve25519/curve25519.c"), @@ -75,7 +58,6 @@ const RING_SRCS: &[(&[&str], &str)] = &[ (&[X86], "crypto/chacha/asm/chacha-x86.pl"), (&[X86], "crypto/fipsmodule/ec/asm/ecp_nistz256-x86.pl"), (&[X86], "crypto/fipsmodule/modes/asm/ghash-x86.pl"), - (&[X86], "crypto/poly1305/asm/poly1305-x86.pl"), (&[X86_64], "crypto/fipsmodule/aes/asm/aesni-x86_64.pl"), (&[X86_64], "crypto/fipsmodule/aes/asm/vpaes-x86_64.pl"), @@ -85,7 +67,7 @@ const RING_SRCS: &[(&[&str], &str)] = &[ (&[X86_64], "crypto/fipsmodule/ec/asm/p256-x86_64-asm.pl"), (&[X86_64], "crypto/fipsmodule/modes/asm/aesni-gcm-x86_64.pl"), (&[X86_64], "crypto/fipsmodule/modes/asm/ghash-x86_64.pl"), - (&[X86_64], "crypto/poly1305/asm/poly1305-x86_64.pl"), + (&[X86_64], "crypto/poly1305/poly1305_vec.c"), (&[X86_64], SHA512_X86_64), (&[X86_64], "crypto/fipsmodule/rand/asm/rdrand-x86_64.pl"), @@ -99,7 +81,8 @@ const RING_SRCS: &[(&[&str], &str)] = &[ (&[ARM], "crypto/curve25519/asm/x25519-asm-arm.S"), (&[ARM], "crypto/fipsmodule/ec/asm/ecp_nistz256-armv4.pl"), (&[ARM], "crypto/fipsmodule/modes/asm/ghash-armv4.pl"), - (&[ARM], "crypto/poly1305/asm/poly1305-armv4.pl"), + (&[ARM], "crypto/poly1305/poly1305_arm.c"), + (&[ARM], "crypto/poly1305/poly1305_arm_asm.S"), (&[ARM], "crypto/fipsmodule/sha/asm/sha256-armv4.pl"), (&[ARM], "crypto/fipsmodule/sha/asm/sha512-armv4.pl"), @@ -108,7 +91,6 @@ const RING_SRCS: &[(&[&str], &str)] = &[ (&[AARCH64], "crypto/chacha/asm/chacha-armv8.pl"), (&[AARCH64], "crypto/fipsmodule/ec/asm/ecp_nistz256-armv8.pl"), (&[AARCH64], "crypto/fipsmodule/modes/asm/ghash-neon-armv8.pl"), - (&[AARCH64], "crypto/poly1305/asm/poly1305-armv8.pl"), (&[AARCH64], SHA512_ARMV8), ]; @@ -120,7 +102,7 @@ const SHA512_ARMV8: &str = "crypto/fipsmodule/sha/asm/sha512-armv8.pl"; const RING_TEST_SRCS: &[&str] = &[("crypto/constant_time_test.c")]; -#[cfg_attr(rustfmt, rustfmt_skip)] +#[rustfmt::skip] const RING_INCLUDES: &[&str] = &[ "crypto/curve25519/curve25519_tables.h", @@ -134,19 +116,20 @@ const RING_INCLUDES: &[&str] = "crypto/internal.h", "crypto/limbs/limbs.h", "crypto/limbs/limbs.inl", - "crypto/fipsmodule/modes/internal.h", + "crypto/poly1305/internal.h", "include/GFp/aes.h", "include/GFp/arm_arch.h", "include/GFp/base.h", "include/GFp/check.h", "include/GFp/cpu.h", "include/GFp/mem.h", + "include/GFp/poly1305.h", "include/GFp/type_check.h", "third_party/fiat/curve25519_32.h", "third_party/fiat/curve25519_64.h", ]; -#[cfg_attr(rustfmt, rustfmt_skip)] +#[rustfmt::skip] const RING_PERL_INCLUDES: &[&str] = &["crypto/perlasm/arm-xlate.pl", "crypto/perlasm/x86gas.pl", @@ -236,6 +219,7 @@ const ASM_TARGETS: &[(&str, Option<&str>, Option<&str>)] = &[ ("x86_64", Some(WINDOWS), Some("nasm")), ("x86_64", None, Some("elf")), ("aarch64", Some("ios"), Some("ios64")), + ("aarch64", Some("macos"), Some("ios64")), ("aarch64", None, Some("linux64")), ("x86", Some(WINDOWS), Some("win32n")), ("x86", Some("ios"), Some("macosx")), @@ -264,10 +248,6 @@ fn main() { fn ring_build_rs_main() { use std::env; - for (key, value) in env::vars() { - 
eprintln!("ENV {}={}", key, value);
-    }
-
     let out_dir = env::var("OUT_DIR").unwrap();
     let out_dir = PathBuf::from(out_dir);
 
@@ -324,9 +304,8 @@ fn pregenerate_asm_main() {
     if target_os == Some(WINDOWS) {
         let srcs = asm_srcs(perlasm_src_dsts);
         for src in srcs {
-            let src_path = PathBuf::from(src);
-            let obj_path = obj_path(&pregenerated, &src_path, MSVC_OBJ_EXT);
-            run_command(yasm(&src_path, target_arch, &obj_path));
+            let obj_path = obj_path(&pregenerated, &src, MSVC_OBJ_EXT);
+            run_command(nasm(&src, target_arch, &obj_path));
         }
     }
 }
@@ -374,7 +353,7 @@ fn build_c_code(target: &Target, pregenerated: PathBuf, out_dir: &Path) {
         .iter()
         .find(|entry| {
             let &(entry_arch, entry_os, _) = *entry;
-            entry_arch == &target.arch && is_none_or_equals(entry_os, &target.os)
+            entry_arch == target.arch && is_none_or_equals(entry_os, &target.os)
         })
         .unwrap();
@@ -405,7 +384,7 @@ fn build_c_code(target: &Target, pregenerated: PathBuf, out_dir: &Path) {
     // For Windows we also pregenerate the object files for non-Git builds so
     // the user doesn't need to install the assembler. On other platforms we
     // assume the C compiler also assembles.
-    if use_pregenerated && &target.os == WINDOWS {
+    if use_pregenerated && target.os == WINDOWS {
         // The pregenerated object files always use ".obj" as the extension,
         // even when the C/C++ compiler outputs files with the ".o" extension.
         asm_srcs = asm_srcs
@@ -462,8 +441,8 @@ fn build_library(
 ) {
     // Compile all the (dirty) source files into object files.
     let objs = additional_srcs
-        .into_iter()
-        .chain(srcs.into_iter())
+        .iter()
+        .chain(srcs.iter())
         .filter(|f| &target.env != "msvc" || f.extension().unwrap().to_str().unwrap() != "S")
         .map(|f| compile(f, target, warnings_are_errors, out_dir, includes_modified))
         .collect::<Vec<_>>();
@@ -487,7 +466,7 @@ fn build_library(
             let _ = c.flag("-Wl,-dead_strip");
         }
         _ => {
-            let _ = c.flag("-Wl,--gc-sections".into());
+            let _ = c.flag("-Wl,--gc-sections");
             enable_lvi_hardening(&mut c);
         }
     }
@@ -522,13 +501,13 @@ fn compile(
     if ext == "obj" {
         p.to_str().expect("Invalid path").into()
     } else {
-        let mut out_path = out_dir.clone().join(p.file_name().unwrap());
+        let mut out_path = out_dir.join(p.file_name().unwrap());
         assert!(out_path.set_extension(target.obj_ext));
         if need_run(&p, &out_path, includes_modified) {
-            let cmd = if &target.os != WINDOWS || ext != "asm" {
+            let cmd = if target.os != WINDOWS || ext != "asm" {
                 cc(p, ext, target, warnings_are_errors, &out_path)
             } else {
-                yasm(p, &target.arch, &out_path)
+                nasm(p, &target.arch, &out_path)
             };
 
             run_command(cmd);
@@ -538,7 +517,7 @@
 }
 
 fn obj_path(out_dir: &Path, src: &Path, obj_ext: &str) -> PathBuf {
-    let mut out_path = out_dir.clone().join(src.file_name().unwrap());
+    let mut out_path = out_dir.join(src.file_name().unwrap());
     assert!(out_path.set_extension(obj_ext));
     out_path
 }
@@ -550,6 +529,8 @@ fn cc(
     warnings_are_errors: bool,
     out_dir: &Path,
 ) -> Command {
+    let is_musl = target.env.starts_with("musl");
+
     let mut c = cc::Build::new();
     let _ = c.include("include");
     match ext {
@@ -564,9 +545,9 @@ fn cc(
     for f in cpp_flags(target) {
         let _ = c.flag(&f);
     }
-    if &target.os != "none"
-        && &target.os != "redox"
-        && &target.os != "windows"
+    if target.os != "none"
+        && target.os != "redox"
+        && target.os != "windows"
         && target.arch != "wasm32"
     {
         let _ = c.flag("-fstack-protector");
@@ -597,8 +578,19 @@ fn cc(
         }
     }
 
-    if (target.arch.as_str(), target.os.as_str()) == ("wasm32", "unknown") {
-        let _ = c.flag("--no-standard-libraries");
+    // Allow cross-compiling without a target sysroot for these targets.
+    //
+    // poly1305_vec.c requires <emmintrin.h> which requires <stdlib.h>.
+    if (target.arch == "wasm32" && target.os == "unknown")
+        || (target.os == "linux" && is_musl && target.arch != "x86_64")
+    {
+        if let Ok(compiler) = c.try_get_compiler() {
+            // TODO: Expand this to non-clang compilers in 0.17.0 if practical.
+            if compiler.is_like_clang() {
+                let _ = c.flag("-nostdlibinc");
+                let _ = c.define("GFp_NOSTDLIBINC", "1");
+            }
+        }
+    }
 
     if warnings_are_errors {
@@ -609,7 +601,7 @@ fn cc(
         };
         let _ = c.flag(flag);
     }
-    if is_musl {
+    if is_musl {
         // Some platforms enable _FORTIFY_SOURCE by default, but musl
         // libc doesn't support it yet. See
         // http://wiki.musl-libc.org/wiki/Future_Ideas#Fortify
@@ -630,21 +622,20 @@ fn cc(
     c
 }
 
-fn yasm(file: &Path, arch: &str, out_file: &Path) -> Command {
-    let (oformat, machine) = match arch {
-        "x86_64" => ("--oformat=win64", "--machine=amd64"),
-        "x86" => ("--oformat=win32", "--machine=x86"),
+fn nasm(file: &Path, arch: &str, out_file: &Path) -> Command {
+    let oformat = match arch {
+        "x86_64" => ("win64"),
+        "x86" => ("win32"),
         _ => panic!("unsupported arch: {}", arch),
     };
-    let mut c = Command::new("yasm.exe");
+    let mut c = Command::new("./target/tools/nasm");
     let _ = c
-        .arg("-X")
-        .arg("vc")
-        .arg("--dformat=cv8")
-        .arg(oformat)
-        .arg(machine)
         .arg("-o")
         .arg(out_file.to_str().expect("Invalid path"))
+        .arg("-f")
+        .arg(oformat)
+        .arg("-Xgnu")
+        .arg("-gcv8")
         .arg(file);
     c
}
@@ -783,7 +774,7 @@ fn file_modified(path: &Path) -> SystemTime {
 }
 
 fn get_command(var: &str, default: &str) -> String {
-    std::env::var(var).unwrap_or(default.into())
+    std::env::var(var).unwrap_or_else(|_| default.into())
 }
 
 fn check_all_files_tracked() {
@@ -828,11 +819,7 @@ where
 
 fn lvi_mitigation_not_supported(base_config: &cc::Build) -> bool {
     let feature_flag = base_config.is_flag_supported("-mlvi-hardening");
-    match feature_flag {
-        Ok(false) => true,
-        Err(_) => true,
-        _ => false,
-    }
+    matches!(feature_flag, Ok(false) | Err(_))
 }
 
 fn enable_lvi_hardening(c: &mut cc::Build) {
diff --git a/crypto/.gitattributes b/crypto/.gitattributes
deleted file mode 100644
index 15a5c58091..0000000000
--- a/crypto/.gitattributes
+++ /dev/null
@@ -1 +0,0 @@
-*.h linguist-language=C
diff --git a/crypto/chacha/asm/chacha-armv4.pl b/crypto/chacha/asm/chacha-armv4.pl
index 1d6f60aa69..90fa2c777b 100755
--- a/crypto/chacha/asm/chacha-armv4.pl
+++ b/crypto/chacha/asm/chacha-armv4.pl
@@ -197,6 +197,8 @@ sub ROUND {
 .Lone:
 .long 1,0,0,0
 #if __ARM_MAX_ARCH__>=7
+.extern GFp_armcap_P
+.hidden GFp_armcap_P
 .LOPENSSL_armcap:
 .word GFp_armcap_P-.LChaCha20_ctr32
 #else
@@ -1151,7 +1153,6 @@ sub NEONROUND {
 	add	sp,sp,#4*(16+3)
 	ldmia	sp!,{r4-r11,pc}
 .size	ChaCha20_neon,.-ChaCha20_neon
-.comm	GFp_armcap_P,4,4
 #endif
 ___
 }}}
diff --git a/crypto/chacha/asm/chacha-armv8.pl b/crypto/chacha/asm/chacha-armv8.pl
index 4644cfd1a8..80ec882c57 100755
--- a/crypto/chacha/asm/chacha-armv8.pl
+++ b/crypto/chacha/asm/chacha-armv8.pl
@@ -123,6 +123,7 @@ sub ROUND {
 #include <GFp/arm_arch.h>
 
 .extern GFp_armcap_P
+.hidden GFp_armcap_P
 
 .section .rodata
@@ -139,6 +140,7 @@ sub ROUND {
 .type	GFp_ChaCha20_ctr32,%function
 .align	5
 GFp_ChaCha20_ctr32:
+	AARCH64_VALID_CALL_TARGET
 	cbz	$len,.Labort
 #if __has_feature(hwaddress_sanitizer) && __clang_major__ >= 10
 	adrp	@x[0],:pg_hi21_nc:GFp_armcap_P
@@ -152,6 +154,7 @@ sub ROUND {
 	b.ne	ChaCha20_neon
 
 .Lshort:
+	AARCH64_SIGN_LINK_REGISTER
 	stp	x29,x30,[sp,#-96]!
add x29,sp,#0 @@ -272,6 +275,7 @@ sub ROUND { ldp x25,x26,[x29,#64] ldp x27,x28,[x29,#80] ldp x29,x30,[sp],#96 + AARCH64_VALIDATE_LINK_REGISTER .Labort: ret @@ -328,6 +332,7 @@ sub ROUND { ldp x25,x26,[x29,#64] ldp x27,x28,[x29,#80] ldp x29,x30,[sp],#96 + AARCH64_VALIDATE_LINK_REGISTER ret .size GFp_ChaCha20_ctr32,.-GFp_ChaCha20_ctr32 ___ @@ -373,6 +378,7 @@ sub NEONROUND { .type ChaCha20_neon,%function .align 5 ChaCha20_neon: + AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-96]! add x29,sp,#0 @@ -572,6 +578,7 @@ sub NEONROUND { ldp x25,x26,[x29,#64] ldp x27,x28,[x29,#80] ldp x29,x30,[sp],#96 + AARCH64_VALIDATE_LINK_REGISTER ret .Ltail_neon: @@ -681,6 +688,7 @@ sub NEONROUND { ldp x25,x26,[x29,#64] ldp x27,x28,[x29,#80] ldp x29,x30,[sp],#96 + AARCH64_VALIDATE_LINK_REGISTER ret .size ChaCha20_neon,.-ChaCha20_neon ___ @@ -693,6 +701,7 @@ sub NEONROUND { .type ChaCha20_512_neon,%function .align 5 ChaCha20_512_neon: + AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-96]! add x29,sp,#0 @@ -1112,6 +1121,7 @@ sub NEONROUND { ldp x25,x26,[x29,#64] ldp x27,x28,[x29,#80] ldp x29,x30,[sp],#96 + AARCH64_VALIDATE_LINK_REGISTER ret .size ChaCha20_512_neon,.-ChaCha20_512_neon ___ diff --git a/crypto/chacha/asm/chacha-x86_64.pl b/crypto/chacha/asm/chacha-x86_64.pl index 98b4ae7106..c85bfa89cf 100755 --- a/crypto/chacha/asm/chacha-x86_64.pl +++ b/crypto/chacha/asm/chacha-x86_64.pl @@ -233,10 +233,6 @@ sub ROUND { # critical path is 24 cycles per round je .Lno_data mov GFp_ia32cap_P+4(%rip),%r10 ___ -$code.=<<___ if ($avx>2); - bt \$48,%r10 # check for AVX512F - jc .LChaCha20_avx512 -___ $code.=<<___; test \$`1<<(41-32)`,%r10d jnz .LChaCha20_ssse3 @@ -1807,733 +1803,7 @@ sub AVX2_lane_ROUND { } ######################################################################## -# AVX512 code paths -if ($avx>2) { -# This one handles shorter inputs... - -my ($a,$b,$c,$d, $a_,$b_,$c_,$d_,$fourz) = map("%zmm$_",(0..3,16..20)); -my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7)); - -sub AVX512ROUND { # critical path is 14 "SIMD ticks" per round - &vpaddd ($a,$a,$b); - &vpxord ($d,$d,$a); - &vprold ($d,$d,16); - - &vpaddd ($c,$c,$d); - &vpxord ($b,$b,$c); - &vprold ($b,$b,12); - - &vpaddd ($a,$a,$b); - &vpxord ($d,$d,$a); - &vprold ($d,$d,8); - - &vpaddd ($c,$c,$d); - &vpxord ($b,$b,$c); - &vprold ($b,$b,7); -} - -my $xframe = $win64 ? 
32+8 : 8; - -$code.=<<___; -.type ChaCha20_avx512,\@function,5 -.align 32 -ChaCha20_avx512: -.LChaCha20_avx512: -.cfi_startproc - mov %rsp,%r9 # frame pointer -.cfi_def_cfa_register r9 - cmp \$512,$len - ja .LChaCha20_16x - - sub \$64+$xframe,%rsp -___ -$code.=<<___ if ($win64); - movaps %xmm6,-0x28(%r9) - movaps %xmm7,-0x18(%r9) -.Lavx512_body: -___ -$code.=<<___; - vbroadcasti32x4 .Lsigma(%rip),$a - vbroadcasti32x4 ($key),$b - vbroadcasti32x4 16($key),$c - vbroadcasti32x4 ($counter),$d - - vmovdqa32 $a,$a_ - vmovdqa32 $b,$b_ - vmovdqa32 $c,$c_ - vpaddd .Lzeroz(%rip),$d,$d - vmovdqa32 .Lfourz(%rip),$fourz - mov \$10,$counter # reuse $counter - vmovdqa32 $d,$d_ - jmp .Loop_avx512 - -.align 16 -.Loop_outer_avx512: - vmovdqa32 $a_,$a - vmovdqa32 $b_,$b - vmovdqa32 $c_,$c - vpaddd $fourz,$d_,$d - mov \$10,$counter - vmovdqa32 $d,$d_ - jmp .Loop_avx512 - -.align 32 -.Loop_avx512: -___ - &AVX512ROUND(); - &vpshufd ($c,$c,0b01001110); - &vpshufd ($b,$b,0b00111001); - &vpshufd ($d,$d,0b10010011); - - &AVX512ROUND(); - &vpshufd ($c,$c,0b01001110); - &vpshufd ($b,$b,0b10010011); - &vpshufd ($d,$d,0b00111001); - - &dec ($counter); - &jnz (".Loop_avx512"); - -$code.=<<___; - vpaddd $a_,$a,$a - vpaddd $b_,$b,$b - vpaddd $c_,$c,$c - vpaddd $d_,$d,$d - - sub \$64,$len - jb .Ltail64_avx512 - - vpxor 0x00($inp),%x#$a,$t0 # xor with input - vpxor 0x10($inp),%x#$b,$t1 - vpxor 0x20($inp),%x#$c,$t2 - vpxor 0x30($inp),%x#$d,$t3 - lea 0x40($inp),$inp # inp+=64 - - vmovdqu $t0,0x00($out) # write output - vmovdqu $t1,0x10($out) - vmovdqu $t2,0x20($out) - vmovdqu $t3,0x30($out) - lea 0x40($out),$out # out+=64 - - jz .Ldone_avx512 - - vextracti32x4 \$1,$a,$t0 - vextracti32x4 \$1,$b,$t1 - vextracti32x4 \$1,$c,$t2 - vextracti32x4 \$1,$d,$t3 - - sub \$64,$len - jb .Ltail_avx512 - - vpxor 0x00($inp),$t0,$t0 # xor with input - vpxor 0x10($inp),$t1,$t1 - vpxor 0x20($inp),$t2,$t2 - vpxor 0x30($inp),$t3,$t3 - lea 0x40($inp),$inp # inp+=64 - - vmovdqu $t0,0x00($out) # write output - vmovdqu $t1,0x10($out) - vmovdqu $t2,0x20($out) - vmovdqu $t3,0x30($out) - lea 0x40($out),$out # out+=64 - - jz .Ldone_avx512 - - vextracti32x4 \$2,$a,$t0 - vextracti32x4 \$2,$b,$t1 - vextracti32x4 \$2,$c,$t2 - vextracti32x4 \$2,$d,$t3 - - sub \$64,$len - jb .Ltail_avx512 - - vpxor 0x00($inp),$t0,$t0 # xor with input - vpxor 0x10($inp),$t1,$t1 - vpxor 0x20($inp),$t2,$t2 - vpxor 0x30($inp),$t3,$t3 - lea 0x40($inp),$inp # inp+=64 - - vmovdqu $t0,0x00($out) # write output - vmovdqu $t1,0x10($out) - vmovdqu $t2,0x20($out) - vmovdqu $t3,0x30($out) - lea 0x40($out),$out # out+=64 - - jz .Ldone_avx512 - - vextracti32x4 \$3,$a,$t0 - vextracti32x4 \$3,$b,$t1 - vextracti32x4 \$3,$c,$t2 - vextracti32x4 \$3,$d,$t3 - - sub \$64,$len - jb .Ltail_avx512 - - vpxor 0x00($inp),$t0,$t0 # xor with input - vpxor 0x10($inp),$t1,$t1 - vpxor 0x20($inp),$t2,$t2 - vpxor 0x30($inp),$t3,$t3 - lea 0x40($inp),$inp # inp+=64 - - vmovdqu $t0,0x00($out) # write output - vmovdqu $t1,0x10($out) - vmovdqu $t2,0x20($out) - vmovdqu $t3,0x30($out) - lea 0x40($out),$out # out+=64 - - jnz .Loop_outer_avx512 - - jmp .Ldone_avx512 - -.align 16 -.Ltail64_avx512: - vmovdqa %x#$a,0x00(%rsp) - vmovdqa %x#$b,0x10(%rsp) - vmovdqa %x#$c,0x20(%rsp) - vmovdqa %x#$d,0x30(%rsp) - add \$64,$len - jmp .Loop_tail_avx512 - -.align 16 -.Ltail_avx512: - vmovdqa $t0,0x00(%rsp) - vmovdqa $t1,0x10(%rsp) - vmovdqa $t2,0x20(%rsp) - vmovdqa $t3,0x30(%rsp) - add \$64,$len - -.Loop_tail_avx512: - movzb ($inp,$counter),%eax - movzb (%rsp,$counter),%ecx - lea 1($counter),$counter - xor %ecx,%eax - mov 
%al,-1($out,$counter) - dec $len - jnz .Loop_tail_avx512 - - vmovdqa32 $a_,0x00(%rsp) - -.Ldone_avx512: - vzeroall -___ -$code.=<<___ if ($win64); - movaps -0x28(%r9),%xmm6 - movaps -0x18(%r9),%xmm7 -___ -$code.=<<___; - lea (%r9),%rsp -.cfi_def_cfa_register rsp -.Lavx512_epilogue: - ret -.cfi_endproc -.size ChaCha20_avx512,.-ChaCha20_avx512 -___ -} -if ($avx>2) { -# This one handles longer inputs... - -my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, - $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3)=map("%zmm$_",(0..15)); -my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, - $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3); -my @key=map("%zmm$_",(16..31)); -my ($xt0,$xt1,$xt2,$xt3)=@key[0..3]; - -sub AVX512_lane_ROUND { -my ($a0,$b0,$c0,$d0)=@_; -my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); -my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1)); -my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); -my @x=map("\"$_\"",@xx); - - ( - "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", # Q1 - "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", # Q2 - "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", # Q3 - "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", # Q4 - "&vpxord (@x[$d0],@x[$d0],@x[$a0])", - "&vpxord (@x[$d1],@x[$d1],@x[$a1])", - "&vpxord (@x[$d2],@x[$d2],@x[$a2])", - "&vpxord (@x[$d3],@x[$d3],@x[$a3])", - "&vprold (@x[$d0],@x[$d0],16)", - "&vprold (@x[$d1],@x[$d1],16)", - "&vprold (@x[$d2],@x[$d2],16)", - "&vprold (@x[$d3],@x[$d3],16)", - - "&vpaddd (@x[$c0],@x[$c0],@x[$d0])", - "&vpaddd (@x[$c1],@x[$c1],@x[$d1])", - "&vpaddd (@x[$c2],@x[$c2],@x[$d2])", - "&vpaddd (@x[$c3],@x[$c3],@x[$d3])", - "&vpxord (@x[$b0],@x[$b0],@x[$c0])", - "&vpxord (@x[$b1],@x[$b1],@x[$c1])", - "&vpxord (@x[$b2],@x[$b2],@x[$c2])", - "&vpxord (@x[$b3],@x[$b3],@x[$c3])", - "&vprold (@x[$b0],@x[$b0],12)", - "&vprold (@x[$b1],@x[$b1],12)", - "&vprold (@x[$b2],@x[$b2],12)", - "&vprold (@x[$b3],@x[$b3],12)", - - "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", - "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", - "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", - "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", - "&vpxord (@x[$d0],@x[$d0],@x[$a0])", - "&vpxord (@x[$d1],@x[$d1],@x[$a1])", - "&vpxord (@x[$d2],@x[$d2],@x[$a2])", - "&vpxord (@x[$d3],@x[$d3],@x[$a3])", - "&vprold (@x[$d0],@x[$d0],8)", - "&vprold (@x[$d1],@x[$d1],8)", - "&vprold (@x[$d2],@x[$d2],8)", - "&vprold (@x[$d3],@x[$d3],8)", - - "&vpaddd (@x[$c0],@x[$c0],@x[$d0])", - "&vpaddd (@x[$c1],@x[$c1],@x[$d1])", - "&vpaddd (@x[$c2],@x[$c2],@x[$d2])", - "&vpaddd (@x[$c3],@x[$c3],@x[$d3])", - "&vpxord (@x[$b0],@x[$b0],@x[$c0])", - "&vpxord (@x[$b1],@x[$b1],@x[$c1])", - "&vpxord (@x[$b2],@x[$b2],@x[$c2])", - "&vpxord (@x[$b3],@x[$b3],@x[$c3])", - "&vprold (@x[$b0],@x[$b0],7)", - "&vprold (@x[$b1],@x[$b1],7)", - "&vprold (@x[$b2],@x[$b2],7)", - "&vprold (@x[$b3],@x[$b3],7)" - ); -} - -my $xframe = $win64 ? 
0xa8 : 8; - -$code.=<<___; -.type ChaCha20_16x,\@function,5 -.align 32 -ChaCha20_16x: -.LChaCha20_16x: -.cfi_startproc - mov %rsp,%r9 # frame register -.cfi_def_cfa_register r9 - sub \$64+$xframe,%rsp - and \$-64,%rsp -___ -$code.=<<___ if ($win64); - movaps %xmm6,-0xa8(%r9) - movaps %xmm7,-0x98(%r9) - movaps %xmm8,-0x88(%r9) - movaps %xmm9,-0x78(%r9) - movaps %xmm10,-0x68(%r9) - movaps %xmm11,-0x58(%r9) - movaps %xmm12,-0x48(%r9) - movaps %xmm13,-0x38(%r9) - movaps %xmm14,-0x28(%r9) - movaps %xmm15,-0x18(%r9) -.L16x_body: -___ -$code.=<<___; - vzeroupper - - lea .Lsigma(%rip),%r10 - vbroadcasti32x4 (%r10),$xa3 # key[0] - vbroadcasti32x4 ($key),$xb3 # key[1] - vbroadcasti32x4 16($key),$xc3 # key[2] - vbroadcasti32x4 ($counter),$xd3 # key[3] - - vpshufd \$0x00,$xa3,$xa0 # smash key by lanes... - vpshufd \$0x55,$xa3,$xa1 - vpshufd \$0xaa,$xa3,$xa2 - vpshufd \$0xff,$xa3,$xa3 - vmovdqa64 $xa0,@key[0] - vmovdqa64 $xa1,@key[1] - vmovdqa64 $xa2,@key[2] - vmovdqa64 $xa3,@key[3] - - vpshufd \$0x00,$xb3,$xb0 - vpshufd \$0x55,$xb3,$xb1 - vpshufd \$0xaa,$xb3,$xb2 - vpshufd \$0xff,$xb3,$xb3 - vmovdqa64 $xb0,@key[4] - vmovdqa64 $xb1,@key[5] - vmovdqa64 $xb2,@key[6] - vmovdqa64 $xb3,@key[7] - - vpshufd \$0x00,$xc3,$xc0 - vpshufd \$0x55,$xc3,$xc1 - vpshufd \$0xaa,$xc3,$xc2 - vpshufd \$0xff,$xc3,$xc3 - vmovdqa64 $xc0,@key[8] - vmovdqa64 $xc1,@key[9] - vmovdqa64 $xc2,@key[10] - vmovdqa64 $xc3,@key[11] - - vpshufd \$0x00,$xd3,$xd0 - vpshufd \$0x55,$xd3,$xd1 - vpshufd \$0xaa,$xd3,$xd2 - vpshufd \$0xff,$xd3,$xd3 - vpaddd .Lincz(%rip),$xd0,$xd0 # don't save counters yet - vmovdqa64 $xd0,@key[12] - vmovdqa64 $xd1,@key[13] - vmovdqa64 $xd2,@key[14] - vmovdqa64 $xd3,@key[15] - - mov \$10,%eax - jmp .Loop16x - -.align 32 -.Loop_outer16x: - vpbroadcastd 0(%r10),$xa0 # reload key - vpbroadcastd 4(%r10),$xa1 - vpbroadcastd 8(%r10),$xa2 - vpbroadcastd 12(%r10),$xa3 - vpaddd .Lsixteen(%rip),@key[12],@key[12] # next SIMD counters - vmovdqa64 @key[4],$xb0 - vmovdqa64 @key[5],$xb1 - vmovdqa64 @key[6],$xb2 - vmovdqa64 @key[7],$xb3 - vmovdqa64 @key[8],$xc0 - vmovdqa64 @key[9],$xc1 - vmovdqa64 @key[10],$xc2 - vmovdqa64 @key[11],$xc3 - vmovdqa64 @key[12],$xd0 - vmovdqa64 @key[13],$xd1 - vmovdqa64 @key[14],$xd2 - vmovdqa64 @key[15],$xd3 - - vmovdqa64 $xa0,@key[0] - vmovdqa64 $xa1,@key[1] - vmovdqa64 $xa2,@key[2] - vmovdqa64 $xa3,@key[3] - - mov \$10,%eax - jmp .Loop16x - -.align 32 -.Loop16x: -___ - foreach (&AVX512_lane_ROUND(0, 4, 8,12)) { eval; } - foreach (&AVX512_lane_ROUND(0, 5,10,15)) { eval; } -$code.=<<___; - dec %eax - jnz .Loop16x - - vpaddd @key[0],$xa0,$xa0 # accumulate key - vpaddd @key[1],$xa1,$xa1 - vpaddd @key[2],$xa2,$xa2 - vpaddd @key[3],$xa3,$xa3 - - vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data - vpunpckldq $xa3,$xa2,$xt3 - vpunpckhdq $xa1,$xa0,$xa0 - vpunpckhdq $xa3,$xa2,$xa2 - vpunpcklqdq $xt3,$xt2,$xa1 # "a0" - vpunpckhqdq $xt3,$xt2,$xt2 # "a1" - vpunpcklqdq $xa2,$xa0,$xa3 # "a2" - vpunpckhqdq $xa2,$xa0,$xa0 # "a3" -___ - ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2); -$code.=<<___; - vpaddd @key[4],$xb0,$xb0 - vpaddd @key[5],$xb1,$xb1 - vpaddd @key[6],$xb2,$xb2 - vpaddd @key[7],$xb3,$xb3 - - vpunpckldq $xb1,$xb0,$xt2 - vpunpckldq $xb3,$xb2,$xt3 - vpunpckhdq $xb1,$xb0,$xb0 - vpunpckhdq $xb3,$xb2,$xb2 - vpunpcklqdq $xt3,$xt2,$xb1 # "b0" - vpunpckhqdq $xt3,$xt2,$xt2 # "b1" - vpunpcklqdq $xb2,$xb0,$xb3 # "b2" - vpunpckhqdq $xb2,$xb0,$xb0 # "b3" -___ - ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2); -$code.=<<___; - vshufi32x4 \$0x44,$xb0,$xa0,$xt3 # "de-interlace" further - vshufi32x4 
\$0xee,$xb0,$xa0,$xb0 - vshufi32x4 \$0x44,$xb1,$xa1,$xa0 - vshufi32x4 \$0xee,$xb1,$xa1,$xb1 - vshufi32x4 \$0x44,$xb2,$xa2,$xa1 - vshufi32x4 \$0xee,$xb2,$xa2,$xb2 - vshufi32x4 \$0x44,$xb3,$xa3,$xa2 - vshufi32x4 \$0xee,$xb3,$xa3,$xb3 -___ - ($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3); -$code.=<<___; - vpaddd @key[8],$xc0,$xc0 - vpaddd @key[9],$xc1,$xc1 - vpaddd @key[10],$xc2,$xc2 - vpaddd @key[11],$xc3,$xc3 - - vpunpckldq $xc1,$xc0,$xt2 - vpunpckldq $xc3,$xc2,$xt3 - vpunpckhdq $xc1,$xc0,$xc0 - vpunpckhdq $xc3,$xc2,$xc2 - vpunpcklqdq $xt3,$xt2,$xc1 # "c0" - vpunpckhqdq $xt3,$xt2,$xt2 # "c1" - vpunpcklqdq $xc2,$xc0,$xc3 # "c2" - vpunpckhqdq $xc2,$xc0,$xc0 # "c3" -___ - ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2); -$code.=<<___; - vpaddd @key[12],$xd0,$xd0 - vpaddd @key[13],$xd1,$xd1 - vpaddd @key[14],$xd2,$xd2 - vpaddd @key[15],$xd3,$xd3 - - vpunpckldq $xd1,$xd0,$xt2 - vpunpckldq $xd3,$xd2,$xt3 - vpunpckhdq $xd1,$xd0,$xd0 - vpunpckhdq $xd3,$xd2,$xd2 - vpunpcklqdq $xt3,$xt2,$xd1 # "d0" - vpunpckhqdq $xt3,$xt2,$xt2 # "d1" - vpunpcklqdq $xd2,$xd0,$xd3 # "d2" - vpunpckhqdq $xd2,$xd0,$xd0 # "d3" -___ - ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2); -$code.=<<___; - vshufi32x4 \$0x44,$xd0,$xc0,$xt3 # "de-interlace" further - vshufi32x4 \$0xee,$xd0,$xc0,$xd0 - vshufi32x4 \$0x44,$xd1,$xc1,$xc0 - vshufi32x4 \$0xee,$xd1,$xc1,$xd1 - vshufi32x4 \$0x44,$xd2,$xc2,$xc1 - vshufi32x4 \$0xee,$xd2,$xc2,$xd2 - vshufi32x4 \$0x44,$xd3,$xc3,$xc2 - vshufi32x4 \$0xee,$xd3,$xc3,$xd3 -___ - ($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3); -$code.=<<___; - vshufi32x4 \$0x88,$xc0,$xa0,$xt0 # "de-interlace" further - vshufi32x4 \$0xdd,$xc0,$xa0,$xa0 - vshufi32x4 \$0x88,$xd0,$xb0,$xc0 - vshufi32x4 \$0xdd,$xd0,$xb0,$xd0 - vshufi32x4 \$0x88,$xc1,$xa1,$xt1 - vshufi32x4 \$0xdd,$xc1,$xa1,$xa1 - vshufi32x4 \$0x88,$xd1,$xb1,$xc1 - vshufi32x4 \$0xdd,$xd1,$xb1,$xd1 - vshufi32x4 \$0x88,$xc2,$xa2,$xt2 - vshufi32x4 \$0xdd,$xc2,$xa2,$xa2 - vshufi32x4 \$0x88,$xd2,$xb2,$xc2 - vshufi32x4 \$0xdd,$xd2,$xb2,$xd2 - vshufi32x4 \$0x88,$xc3,$xa3,$xt3 - vshufi32x4 \$0xdd,$xc3,$xa3,$xa3 - vshufi32x4 \$0x88,$xd3,$xb3,$xc3 - vshufi32x4 \$0xdd,$xd3,$xb3,$xd3 -___ - ($xa0,$xa1,$xa2,$xa3,$xb0,$xb1,$xb2,$xb3)= - ($xt0,$xt1,$xt2,$xt3,$xa0,$xa1,$xa2,$xa3); - - ($xa0,$xb0,$xc0,$xd0, $xa1,$xb1,$xc1,$xd1, - $xa2,$xb2,$xc2,$xd2, $xa3,$xb3,$xc3,$xd3) = - ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, - $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3); -$code.=<<___; - cmp \$64*16,$len - jb .Ltail16x - - vpxord 0x00($inp),$xa0,$xa0 # xor with input - vpxord 0x40($inp),$xb0,$xb0 - vpxord 0x80($inp),$xc0,$xc0 - vpxord 0xc0($inp),$xd0,$xd0 - vmovdqu32 $xa0,0x00($out) - vmovdqu32 $xb0,0x40($out) - vmovdqu32 $xc0,0x80($out) - vmovdqu32 $xd0,0xc0($out) - - vpxord 0x100($inp),$xa1,$xa1 - vpxord 0x140($inp),$xb1,$xb1 - vpxord 0x180($inp),$xc1,$xc1 - vpxord 0x1c0($inp),$xd1,$xd1 - vmovdqu32 $xa1,0x100($out) - vmovdqu32 $xb1,0x140($out) - vmovdqu32 $xc1,0x180($out) - vmovdqu32 $xd1,0x1c0($out) - - vpxord 0x200($inp),$xa2,$xa2 - vpxord 0x240($inp),$xb2,$xb2 - vpxord 0x280($inp),$xc2,$xc2 - vpxord 0x2c0($inp),$xd2,$xd2 - vmovdqu32 $xa2,0x200($out) - vmovdqu32 $xb2,0x240($out) - vmovdqu32 $xc2,0x280($out) - vmovdqu32 $xd2,0x2c0($out) - - vpxord 0x300($inp),$xa3,$xa3 - vpxord 0x340($inp),$xb3,$xb3 - vpxord 0x380($inp),$xc3,$xc3 - vpxord 0x3c0($inp),$xd3,$xd3 - lea 0x400($inp),$inp - vmovdqu32 $xa3,0x300($out) - vmovdqu32 $xb3,0x340($out) - vmovdqu32 $xc3,0x380($out) - vmovdqu32 $xd3,0x3c0($out) - lea 0x400($out),$out - - sub \$64*16,$len - jnz 
.Loop_outer16x - - jmp .Ldone16x - -.align 32 -.Ltail16x: - xor %r10,%r10 - sub $inp,$out - cmp \$64*1,$len - jb .Less_than_64_16x - vpxord ($inp),$xa0,$xa0 # xor with input - vmovdqu32 $xa0,($out,$inp) - je .Ldone16x - vmovdqa32 $xb0,$xa0 - lea 64($inp),$inp - - cmp \$64*2,$len - jb .Less_than_64_16x - vpxord ($inp),$xb0,$xb0 - vmovdqu32 $xb0,($out,$inp) - je .Ldone16x - vmovdqa32 $xc0,$xa0 - lea 64($inp),$inp - - cmp \$64*3,$len - jb .Less_than_64_16x - vpxord ($inp),$xc0,$xc0 - vmovdqu32 $xc0,($out,$inp) - je .Ldone16x - vmovdqa32 $xd0,$xa0 - lea 64($inp),$inp - - cmp \$64*4,$len - jb .Less_than_64_16x - vpxord ($inp),$xd0,$xd0 - vmovdqu32 $xd0,($out,$inp) - je .Ldone16x - vmovdqa32 $xa1,$xa0 - lea 64($inp),$inp - - cmp \$64*5,$len - jb .Less_than_64_16x - vpxord ($inp),$xa1,$xa1 - vmovdqu32 $xa1,($out,$inp) - je .Ldone16x - vmovdqa32 $xb1,$xa0 - lea 64($inp),$inp - - cmp \$64*6,$len - jb .Less_than_64_16x - vpxord ($inp),$xb1,$xb1 - vmovdqu32 $xb1,($out,$inp) - je .Ldone16x - vmovdqa32 $xc1,$xa0 - lea 64($inp),$inp - - cmp \$64*7,$len - jb .Less_than_64_16x - vpxord ($inp),$xc1,$xc1 - vmovdqu32 $xc1,($out,$inp) - je .Ldone16x - vmovdqa32 $xd1,$xa0 - lea 64($inp),$inp - - cmp \$64*8,$len - jb .Less_than_64_16x - vpxord ($inp),$xd1,$xd1 - vmovdqu32 $xd1,($out,$inp) - je .Ldone16x - vmovdqa32 $xa2,$xa0 - lea 64($inp),$inp - - cmp \$64*9,$len - jb .Less_than_64_16x - vpxord ($inp),$xa2,$xa2 - vmovdqu32 $xa2,($out,$inp) - je .Ldone16x - vmovdqa32 $xb2,$xa0 - lea 64($inp),$inp - - cmp \$64*10,$len - jb .Less_than_64_16x - vpxord ($inp),$xb2,$xb2 - vmovdqu32 $xb2,($out,$inp) - je .Ldone16x - vmovdqa32 $xc2,$xa0 - lea 64($inp),$inp - - cmp \$64*11,$len - jb .Less_than_64_16x - vpxord ($inp),$xc2,$xc2 - vmovdqu32 $xc2,($out,$inp) - je .Ldone16x - vmovdqa32 $xd2,$xa0 - lea 64($inp),$inp - - cmp \$64*12,$len - jb .Less_than_64_16x - vpxord ($inp),$xd2,$xd2 - vmovdqu32 $xd2,($out,$inp) - je .Ldone16x - vmovdqa32 $xa3,$xa0 - lea 64($inp),$inp - - cmp \$64*13,$len - jb .Less_than_64_16x - vpxord ($inp),$xa3,$xa3 - vmovdqu32 $xa3,($out,$inp) - je .Ldone16x - vmovdqa32 $xb3,$xa0 - lea 64($inp),$inp - - cmp \$64*14,$len - jb .Less_than_64_16x - vpxord ($inp),$xb3,$xb3 - vmovdqu32 $xb3,($out,$inp) - je .Ldone16x - vmovdqa32 $xc3,$xa0 - lea 64($inp),$inp - - cmp \$64*15,$len - jb .Less_than_64_16x - vpxord ($inp),$xc3,$xc3 - vmovdqu32 $xc3,($out,$inp) - je .Ldone16x - vmovdqa32 $xd3,$xa0 - lea 64($inp),$inp - -.Less_than_64_16x: - vmovdqa32 $xa0,0x00(%rsp) - lea ($out,$inp),$out - and \$63,$len - -.Loop_tail16x: - movzb ($inp,%r10),%eax - movzb (%rsp,%r10),%ecx - lea 1(%r10),%r10 - xor %ecx,%eax - mov %al,-1($out,%r10) - dec $len - jnz .Loop_tail16x - - vpxord $xa0,$xa0,$xa0 - vmovdqa32 $xa0,0(%rsp) - -.Ldone16x: - vzeroall -___ -$code.=<<___ if ($win64); - movaps -0xa8(%r9),%xmm6 - movaps -0x98(%r9),%xmm7 - movaps -0x88(%r9),%xmm8 - movaps -0x78(%r9),%xmm9 - movaps -0x68(%r9),%xmm10 - movaps -0x58(%r9),%xmm11 - movaps -0x48(%r9),%xmm12 - movaps -0x38(%r9),%xmm13 - movaps -0x28(%r9),%xmm14 - movaps -0x18(%r9),%xmm15 -___ -$code.=<<___; - lea (%r9),%rsp -.cfi_def_cfa_register rsp -.L16x_epilogue: - ret -.cfi_endproc -.size ChaCha20_16x,.-ChaCha20_16x -___ -} +# AVX512 code paths were removed # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, # CONTEXT *context,DISPATCHER_CONTEXT *disp) @@ -2729,15 +1999,6 @@ sub AVX512_lane_ROUND { .rva .LSEH_end_ChaCha20_8x .rva .LSEH_info_ChaCha20_8x ___ -$code.=<<___ if ($avx>2); - .rva .LSEH_begin_ChaCha20_avx512 - .rva .LSEH_end_ChaCha20_avx512 - 
.rva .LSEH_info_ChaCha20_avx512 - - .rva .LSEH_begin_ChaCha20_16x - .rva .LSEH_end_ChaCha20_16x - .rva .LSEH_info_ChaCha20_16x -___ $code.=<<___; .section .xdata .align 8 @@ -2761,17 +2022,6 @@ sub AVX512_lane_ROUND { .rva full_handler .rva .L8x_body,.L8x_epilogue # HandlerData[] ___ -$code.=<<___ if ($avx>2); -.LSEH_info_ChaCha20_avx512: - .byte 9,0,0,0 - .rva ssse3_handler - .rva .Lavx512_body,.Lavx512_epilogue # HandlerData[] - -.LSEH_info_ChaCha20_16x: - .byte 9,0,0,0 - .rva full_handler - .rva .L16x_body,.L16x_epilogue # HandlerData[] -___ } foreach (split("\n",$code)) { diff --git a/crypto/crypto.c b/crypto/crypto.c index 06000a856a..8a3d06675b 100644 --- a/crypto/crypto.c +++ b/crypto/crypto.c @@ -35,10 +35,4 @@ // initialising it to zero, it becomes a "data symbol", which isn't so // affected. HIDDEN uint32_t GFp_ia32cap_P[4] = {0}; -#elif defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64) - -#include - -HIDDEN uint32_t GFp_armcap_P = 0; - #endif diff --git a/crypto/curve25519/curve25519.c b/crypto/curve25519/curve25519.c index b4198996e8..30afff0eda 100644 --- a/crypto/curve25519/curve25519.c +++ b/crypto/curve25519/curve25519.c @@ -159,7 +159,7 @@ static void fe_frombytes_strict(fe *h, const uint8_t s[32]) { static void fe_frombytes(fe *h, const uint8_t s[32]) { uint8_t s_copy[32]; - bytes_copy(s_copy, s, 32); + GFp_memcpy(s_copy, s, 32); s_copy[31] &= 0x7f; fe_frombytes_strict(h, s_copy); } @@ -171,21 +171,21 @@ static void fe_tobytes(uint8_t s[32], const fe *f) { // h = 0 static void fe_0(fe *h) { - fe_limbs_zero(h->v); + GFp_memset(h, 0, sizeof(fe)); } static void fe_loose_0(fe_loose *h) { - fe_limbs_zero(h->v); + GFp_memset(h, 0, sizeof(fe_loose)); } // h = 1 static void fe_1(fe *h) { - fe_0(h); + GFp_memset(h, 0, sizeof(fe)); h->v[0] = 1; } static void fe_loose_1(fe_loose *h) { - fe_loose_0(h); + GFp_memset(h, 0, sizeof(fe_loose)); h->v[0] = 1; } @@ -1782,7 +1782,7 @@ void GFp_x25519_scalar_mult_generic_masked(uint8_t out[32], fe_loose x2l, z2l, x3l, tmp0l, tmp1l; uint8_t e[32]; - bytes_copy(e, scalar_masked, 32); + GFp_memcpy(e, scalar_masked, 32); // The following implementation was transcribed to Coq and proven to // correspond to unary scalar multiplication in affine coordinates given that // x1 != 0 is the x coordinate of some point on the curve. It was also checked @@ -1856,7 +1856,7 @@ void GFp_x25519_scalar_mult_generic_masked(uint8_t out[32], void GFp_x25519_public_from_private_generic_masked(uint8_t out_public_value[32], const uint8_t private_key_masked[32]) { uint8_t e[32]; - bytes_copy(e, private_key_masked, 32); + GFp_memcpy(e, private_key_masked, 32); ge_p3 A; GFp_x25519_ge_scalarmult_base(&A, e); diff --git a/crypto/curve25519/internal.h b/crypto/curve25519/internal.h index 5f87f92003..60f2f615b4 100644 --- a/crypto/curve25519/internal.h +++ b/crypto/curve25519/internal.h @@ -65,12 +65,6 @@ static inline void fe_limbs_copy(fe_limb_t r[], const fe_limb_t a[]) { } } -static inline void fe_limbs_zero(fe_limb_t r[]) { - for (size_t i = 0; i < FE_NUM_LIMBS; ++i) { - r[i] = 0; - } -} - // ge means group element. 
// // Here the group is the set of pairs (x,y) of field elements (see fe.h) diff --git a/crypto/fipsmodule/.gitattributes b/crypto/fipsmodule/.gitattributes deleted file mode 100644 index 80928d6041..0000000000 --- a/crypto/fipsmodule/.gitattributes +++ /dev/null @@ -1 +0,0 @@ -*.inl linguist-language=C diff --git a/crypto/fipsmodule/aes/aes_nohw.c b/crypto/fipsmodule/aes/aes_nohw.c index 4284ea7ec6..19b019e73f 100644 --- a/crypto/fipsmodule/aes/aes_nohw.c +++ b/crypto/fipsmodule/aes/aes_nohw.c @@ -14,13 +14,6 @@ #include -#if !defined(__wasm__) -#include -#else -void *memcpy(void *, const void*, size_t); -void *memset(void *, int, size_t); -#endif - #include "../../internal.h" #if defined(OPENSSL_SSE2) @@ -353,7 +346,7 @@ static inline uint8_t lo(uint32_t a) { static inline void aes_nohw_compact_block(aes_word_t out[AES_NOHW_BLOCK_WORDS], const uint8_t in[16]) { - memcpy(out, in, 16); + GFp_memcpy(out, in, 16); #if defined(OPENSSL_SSE2) // No conversions needed. #elif defined(OPENSSL_64_BIT) @@ -381,7 +374,7 @@ static inline void aes_nohw_compact_block(aes_word_t out[AES_NOHW_BLOCK_WORDS], static inline void aes_nohw_uncompact_block( uint8_t out[16], const aes_word_t in[AES_NOHW_BLOCK_WORDS]) { #if defined(OPENSSL_SSE2) - memcpy(out, in, 16); // No conversions needed. + GFp_memcpy(out, in, 16); // No conversions needed. #elif defined(OPENSSL_64_BIT) uint64_t a0 = in[0]; uint64_t a1 = in[1]; @@ -389,8 +382,8 @@ static inline void aes_nohw_uncompact_block( aes_nohw_uncompact_word((a0 & UINT64_C(0x00000000ffffffff)) | (a1 << 32)); uint64_t b1 = aes_nohw_uncompact_word((a1 & UINT64_C(0xffffffff00000000)) | (a0 >> 32)); - memcpy(out, &b0, 8); - memcpy(out + 8, &b1, 8); + GFp_memcpy(out, &b0, 8); + GFp_memcpy(out + 8, &b1, 8); #else uint32_t a0 = in[0]; uint32_t a1 = in[1]; @@ -411,10 +404,10 @@ static inline void aes_nohw_uncompact_block( b1 = aes_nohw_uncompact_word(b1); b2 = aes_nohw_uncompact_word(b2); b3 = aes_nohw_uncompact_word(b3); - memcpy(out, &b0, 4); - memcpy(out + 4, &b1, 4); - memcpy(out + 8, &b2, 4); - memcpy(out + 12, &b3, 4); + GFp_memcpy(out, &b0, 4); + GFp_memcpy(out + 4, &b1, 4); + GFp_memcpy(out + 8, &b2, 4); + GFp_memcpy(out + 12, &b3, 4); #endif } @@ -482,7 +475,7 @@ static void aes_nohw_transpose(AES_NOHW_BATCH *batch) { static void aes_nohw_to_batch(AES_NOHW_BATCH *out, const uint8_t *in, size_t num_blocks) { // Don't leave unused blocks uninitialized. - memset(out, 0, sizeof(AES_NOHW_BATCH)); + GFp_memset(out, 0, sizeof(AES_NOHW_BATCH)); debug_assert_nonsecret(num_blocks <= AES_NOHW_BATCH_SIZE); for (size_t i = 0; i < num_blocks; i++) { aes_word_t block[AES_NOHW_BLOCK_WORDS]; @@ -777,7 +770,7 @@ static void aes_nohw_expand_round_keys(AES_NOHW_SCHEDULE *out, // Copy the round key into each block in the batch. 
for (size_t j = 0; j < AES_NOHW_BATCH_SIZE; j++) { aes_word_t tmp[AES_NOHW_BLOCK_WORDS]; - memcpy(tmp, key->rd_key + 4 * i, 16); + GFp_memcpy(tmp, key->rd_key + 4 * i, 16); aes_nohw_batch_set(&out->keys[i], tmp, j); } aes_nohw_transpose(&out->keys[i]); @@ -801,7 +794,7 @@ static inline aes_word_t aes_nohw_rcon_slice(uint8_t rcon, size_t i) { static void aes_nohw_sub_block(aes_word_t out[AES_NOHW_BLOCK_WORDS], const aes_word_t in[AES_NOHW_BLOCK_WORDS]) { AES_NOHW_BATCH batch; - memset(&batch, 0, sizeof(batch)); + GFp_memset(&batch, 0, sizeof(batch)); aes_nohw_batch_set(&batch, in, 0); aes_nohw_transpose(&batch); aes_nohw_sub_bytes(&batch); @@ -814,7 +807,7 @@ static void aes_nohw_setup_key_128(AES_KEY *key, const uint8_t in[16]) { aes_word_t block[AES_NOHW_BLOCK_WORDS]; aes_nohw_compact_block(block, in); - memcpy(key->rd_key, block, 16); + GFp_memcpy(key->rd_key, block, 16); for (size_t i = 1; i <= 10; i++) { aes_word_t sub[AES_NOHW_BLOCK_WORDS]; @@ -833,113 +826,7 @@ static void aes_nohw_setup_key_128(AES_KEY *key, const uint8_t in[16]) { block[j] = aes_nohw_xor(block[j], aes_nohw_shift_left(v, 8)); block[j] = aes_nohw_xor(block[j], aes_nohw_shift_left(v, 12)); } - memcpy(key->rd_key + 4 * i, block, 16); - } -} - -static void aes_nohw_setup_key_192(AES_KEY *key, const uint8_t in[24]) { - key->rounds = 12; - - aes_word_t storage1[AES_NOHW_BLOCK_WORDS], storage2[AES_NOHW_BLOCK_WORDS]; - aes_word_t *block1 = storage1, *block2 = storage2; - - // AES-192's key schedule is complex because each key schedule iteration - // produces six words, but we compute on blocks and each block is four words. - // We maintain a sliding window of two blocks, filled to 1.5 blocks at a time. - // We loop below every three blocks or two key schedule iterations. - // - // On entry to the loop, |block1| and the first half of |block2| contain the - // previous key schedule iteration. |block1| has been written to |key|, but - // |block2| has not as it is incomplete. - aes_nohw_compact_block(block1, in); - memcpy(key->rd_key, block1, 16); - - uint8_t half_block[16] = {0}; - memcpy(half_block, in + 16, 8); - aes_nohw_compact_block(block2, half_block); - - for (size_t i = 0; i < 4; i++) { - aes_word_t sub[AES_NOHW_BLOCK_WORDS]; - aes_nohw_sub_block(sub, block2); - uint8_t rcon = aes_nohw_rcon[2 * i]; - for (size_t j = 0; j < AES_NOHW_BLOCK_WORDS; j++) { - // Compute the first two words of the next key schedule iteration, which - // go in the second half of |block2|. The first two words of the previous - // iteration are in the first half of |block1|. Apply |rcon| here too - // because the shifts match. - block2[j] = aes_nohw_or( - block2[j], - aes_nohw_shift_left( - aes_nohw_xor(block1[j], aes_nohw_rcon_slice(rcon, j)), 8)); - // Incorporate the transformed word and propagate. Note the last word of - // the previous iteration corresponds to the second word of |copy|. This - // is incorporated into the first word of the next iteration, or the third - // word of |block2|. - block2[j] = aes_nohw_xor( - block2[j], aes_nohw_and(aes_nohw_shift_left( - aes_nohw_rotate_rows_down(sub[j]), 4), - AES_NOHW_COL2_MASK)); - block2[j] = aes_nohw_xor( - block2[j], - aes_nohw_and(aes_nohw_shift_left(block2[j], 4), AES_NOHW_COL3_MASK)); - - // Compute the remaining four words, which fill |block1|. Begin by moving - // the corresponding words of the previous iteration: the second half of - // |block1| and the first half of |block2|. 
- block1[j] = aes_nohw_shift_right(block1[j], 8); - block1[j] = aes_nohw_or(block1[j], aes_nohw_shift_left(block2[j], 8)); - // Incorporate the second word, computed previously in |block2|, and - // propagate. - block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_right(block2[j], 12)); - aes_word_t v = block1[j]; - block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(v, 4)); - block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(v, 8)); - block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(v, 12)); - } - - // This completes two round keys. Note half of |block2| was computed in the - // previous loop iteration but was not yet output. - memcpy(key->rd_key + 4 * (3 * i + 1), block2, 16); - memcpy(key->rd_key + 4 * (3 * i + 2), block1, 16); - - aes_nohw_sub_block(sub, block1); - rcon = aes_nohw_rcon[2 * i + 1]; - for (size_t j = 0; j < AES_NOHW_BLOCK_WORDS; j++) { - // Compute the first four words of the next key schedule iteration in - // |block2|. Begin by moving the corresponding words of the previous - // iteration: the second half of |block2| and the first half of |block1|. - block2[j] = aes_nohw_shift_right(block2[j], 8); - block2[j] = aes_nohw_or(block2[j], aes_nohw_shift_left(block1[j], 8)); - // Incorporate rcon and the transformed word. Note the last word of the - // previous iteration corresponds to the last word of |copy|. - block2[j] = aes_nohw_xor(block2[j], aes_nohw_rcon_slice(rcon, j)); - block2[j] = aes_nohw_xor( - block2[j], - aes_nohw_shift_right(aes_nohw_rotate_rows_down(sub[j]), 12)); - // Propagate to the remaining words. - aes_word_t v = block2[j]; - block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_left(v, 4)); - block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_left(v, 8)); - block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_left(v, 12)); - - // Compute the last two words, which go in the first half of |block1|. The - // last two words of the previous iteration are in the second half of - // |block1|. - block1[j] = aes_nohw_shift_right(block1[j], 8); - // Propagate blocks and mask off the excess. - block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_right(block2[j], 12)); - block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(block1[j], 4)); - block1[j] = aes_nohw_and(block1[j], AES_NOHW_COL01_MASK); - } - - // |block2| has a complete round key. |block1| will be completed in the next - // iteration. - memcpy(key->rd_key + 4 * (3 * i + 3), block2, 16); - - // Swap blocks to restore the invariant. - aes_word_t *tmp = block1; - block1 = block2; - block2 = tmp; + GFp_memcpy(key->rd_key + 4 * i, block, 16); } } @@ -949,10 +836,10 @@ static void aes_nohw_setup_key_256(AES_KEY *key, const uint8_t in[32]) { // Each key schedule iteration produces two round keys. 
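As context for the bitsliced AES-256 schedule below: it implements the standard FIPS 197 word-level recurrence, four words per block at a time. A minimal scalar sketch of that recurrence (an editor's illustration, not code from this patch; sub_word and rot_word are assumed helpers for SubWord and RotWord, and words are assumed packed with the first key byte in the low byte so the rcon constant lands in the right position):

    #include <stddef.h>
    #include <stdint.h>

    uint32_t sub_word(uint32_t w);  // assumed helper: S-box applied to each byte
    uint32_t rot_word(uint32_t w);  // assumed helper: rotate word by one byte

    // Expand a 256-bit key into the 60 words (15 round keys) of the schedule.
    void aes256_expand_sketch(uint32_t w[60], const uint32_t key[8]) {
      static const uint32_t rcon[7] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40};
      for (size_t i = 0; i < 8; i++) {
        w[i] = key[i];
      }
      for (size_t i = 8; i < 60; i++) {
        uint32_t t = w[i - 1];
        if (i % 8 == 0) {
          t = sub_word(rot_word(t)) ^ rcon[i / 8 - 1];  // first round key of the pair
        } else if (i % 8 == 4) {
          t = sub_word(t);  // the extra SubWord step unique to AES-256
        }
        w[i] = w[i - 8] ^ t;
      }
    }

Each pass of the loop below produces the same two round keys per iteration, one via |block1| and one via |block2|, with the shift/xor chains performing the word-by-word propagation in bitsliced form.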
aes_word_t block1[AES_NOHW_BLOCK_WORDS], block2[AES_NOHW_BLOCK_WORDS]; aes_nohw_compact_block(block1, in); - memcpy(key->rd_key, block1, 16); + GFp_memcpy(key->rd_key, block1, 16); aes_nohw_compact_block(block2, in + 16); - memcpy(key->rd_key + 4, block2, 16); + GFp_memcpy(key->rd_key + 4, block2, 16); for (size_t i = 2; i <= 14; i += 2) { aes_word_t sub[AES_NOHW_BLOCK_WORDS]; @@ -970,7 +857,7 @@ static void aes_nohw_setup_key_256(AES_KEY *key, const uint8_t in[32]) { block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(v, 8)); block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(v, 12)); } - memcpy(key->rd_key + 4 * i, block1, 16); + GFp_memcpy(key->rd_key + 4 * i, block1, 16); if (i == 14) { break; @@ -986,7 +873,7 @@ static void aes_nohw_setup_key_256(AES_KEY *key, const uint8_t in[32]) { block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_left(v, 8)); block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_left(v, 12)); } - memcpy(key->rd_key + 4 * (i + 1), block2, 16); + GFp_memcpy(key->rd_key + 4 * (i + 1), block2, 16); } } @@ -999,9 +886,6 @@ int GFp_aes_nohw_set_encrypt_key(const uint8_t *key, unsigned bits, case 128: aes_nohw_setup_key_128(aeskey, key); return 0; - case 192: - aes_nohw_setup_key_192(aeskey, key); - return 0; case 256: aes_nohw_setup_key_256(aeskey, key); return 0; @@ -1022,10 +906,10 @@ static inline void aes_nohw_xor_block(uint8_t out[16], const uint8_t a[16], const uint8_t b[16]) { for (size_t i = 0; i < 16; i += sizeof(aes_word_t)) { aes_word_t x, y; - memcpy(&x, a + i, sizeof(aes_word_t)); - memcpy(&y, b + i, sizeof(aes_word_t)); + GFp_memcpy(&x, a + i, sizeof(aes_word_t)); + GFp_memcpy(&y, b + i, sizeof(aes_word_t)); x = aes_nohw_xor(x, y); - memcpy(out + i, &x, sizeof(aes_word_t)); + GFp_memcpy(out + i, &x, sizeof(aes_word_t)); } } @@ -1045,7 +929,7 @@ void GFp_aes_nohw_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out, uint8_t u8[AES_NOHW_BATCH_SIZE * 16]; } ivs, enc_ivs; for (size_t i = 0; i < AES_NOHW_BATCH_SIZE; i++) { - memcpy(ivs.u8 + 16 * i, ivec, 16); + GFp_memcpy(ivs.u8 + 16 * i, ivec, 16); } uint32_t ctr = CRYPTO_bswap4(ivs.u32[3]); diff --git a/crypto/fipsmodule/aes/asm/aesv8-armx.pl b/crypto/fipsmodule/aes/asm/aesv8-armx.pl index c1dcde0c4e..804df8181d 100644 --- a/crypto/fipsmodule/aes/asm/aesv8-armx.pl +++ b/crypto/fipsmodule/aes/asm/aesv8-armx.pl @@ -96,6 +96,8 @@ .Lenc_key: ___ $code.=<<___ if ($flavour =~ /64/); + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + AARCH64_VALID_CALL_TARGET stp x29,x30,[sp,#-16]! add x29,sp,#0 ___ @@ -249,6 +251,7 @@ () .type GFp_${prefix}_${dir}crypt,%function .align 5 GFp_${prefix}_${dir}crypt: + AARCH64_VALID_CALL_TARGET ldr $rounds,[$key,#240] vld1.32 {$rndkey0},[$key],#16 vld1.8 {$inout},[$inp] @@ -299,6 +302,8 @@ () GFp_${prefix}_ctr32_encrypt_blocks: ___ $code.=<<___ if ($flavour =~ /64/); + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + AARCH64_VALID_CALL_TARGET stp x29,x30,[sp,#-16]! add x29,sp,#0 ___ @@ -326,20 +331,34 @@ () add $key_,$key,#32 mov $cnt,$rounds cclr $step,lo + + // ARM Cortex-A57 and Cortex-A72 cores running in 32-bit mode are + // affected by silicon errata #1742098 [0] and #1655431 [1], + // respectively, where the second instruction of an aese/aesmc + // instruction pair may execute twice if an interrupt is taken right + // after the first instruction consumes an input register of which a + // single 32-bit lane has been updated the last time it was modified. 
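To make the hazard concrete, here is a hedged before/after sketch using this function's register names (an editor's illustration assembled from the instructions this hunk removes and adds, not new patch lines):

    vmov.32  ${dat1}[3],$tctr1   // removed pattern: one-lane write into $dat1,
    aese     $dat1,q8            // which is then consumed by an aese/aesmc pair

    vmov.32  ${ivec}[3],$tctr1   // workaround: stage the lane write in $ivec
    vorr     $dat1,$ivec,$ivec   // then copy the whole register into $dat1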
+ // + // This function uses a counter in one 32-bit lane. The vmov.32 lines + // could write to $dat1 and $dat2 directly, but that trips these bugs. + // We write to $ivec and copy to the final register as a workaround. + // + // [0] ARM-EPM-049219 v23 Cortex-A57 MPCore Software Developers Errata Notice + // [1] ARM-EPM-012079 v11.0 Cortex-A72 MPCore Software Developers Errata Notice #ifndef __ARMEB__ rev $ctr, $ctr #endif - vorr $dat1,$dat0,$dat0 add $tctr1, $ctr, #1 - vorr $dat2,$dat0,$dat0 - add $ctr, $ctr, #2 vorr $ivec,$dat0,$dat0 rev $tctr1, $tctr1 - vmov.32 ${dat1}[3],$tctr1 + vmov.32 ${ivec}[3],$tctr1 + add $ctr, $ctr, #2 + vorr $dat1,$ivec,$ivec b.ls .Lctr32_tail rev $tctr2, $ctr + vmov.32 ${ivec}[3],$tctr2 sub $len,$len,#3 // bias - vmov.32 ${dat2}[3],$tctr2 + vorr $dat2,$ivec,$ivec b .Loop3x_ctr32 .align 4 @@ -366,11 +385,11 @@ () aese $dat1,q8 aesmc $tmp1,$dat1 vld1.8 {$in0},[$inp],#16 - vorr $dat0,$ivec,$ivec + add $tctr0,$ctr,#1 aese $dat2,q8 aesmc $dat2,$dat2 vld1.8 {$in1},[$inp],#16 - vorr $dat1,$ivec,$ivec + rev $tctr0,$tctr0 aese $tmp0,q9 aesmc $tmp0,$tmp0 aese $tmp1,q9 @@ -379,8 +398,6 @@ () mov $key_,$key aese $dat2,q9 aesmc $tmp2,$dat2 - vorr $dat2,$ivec,$ivec - add $tctr0,$ctr,#1 aese $tmp0,q12 aesmc $tmp0,$tmp0 aese $tmp1,q12 @@ -395,21 +412,26 @@ () aesmc $tmp0,$tmp0 aese $tmp1,q13 aesmc $tmp1,$tmp1 + // Note the logic to update $dat0, $dat1, and $dat2 is written to work + // around a bug in ARM Cortex-A57 and Cortex-A72 cores running in + // 32-bit mode. See the comment above. veor $in2,$in2,$rndlast - rev $tctr0,$tctr0 + vmov.32 ${ivec}[3], $tctr0 aese $tmp2,q13 aesmc $tmp2,$tmp2 - vmov.32 ${dat0}[3], $tctr0 + vorr $dat0,$ivec,$ivec rev $tctr1,$tctr1 aese $tmp0,q14 aesmc $tmp0,$tmp0 + vmov.32 ${ivec}[3], $tctr1 + rev $tctr2,$ctr aese $tmp1,q14 aesmc $tmp1,$tmp1 - vmov.32 ${dat1}[3], $tctr1 - rev $tctr2,$ctr + vorr $dat1,$ivec,$ivec + vmov.32 ${ivec}[3], $tctr2 aese $tmp2,q14 aesmc $tmp2,$tmp2 - vmov.32 ${dat2}[3], $tctr2 + vorr $dat2,$ivec,$ivec subs $len,$len,#3 aese $tmp0,q15 aese $tmp1,q15 diff --git a/crypto/fipsmodule/aes/asm/vpaes-armv8.pl b/crypto/fipsmodule/aes/asm/vpaes-armv8.pl index 1ab7e0d954..b31bbb81f2 100755 --- a/crypto/fipsmodule/aes/asm/vpaes-armv8.pl +++ b/crypto/fipsmodule/aes/asm/vpaes-armv8.pl @@ -49,6 +49,8 @@ *STDOUT=*OUT; $code.=<<___; +#include + .section .rodata .type _vpaes_consts,%object @@ -237,6 +239,7 @@ .type GFp_vpaes_encrypt,%function .align 4 GFp_vpaes_encrypt: + AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-16]! add x29,sp,#0 @@ -246,6 +249,7 @@ st1 {v0.16b}, [$out] ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER ret .size GFp_vpaes_encrypt,.-GFp_vpaes_encrypt @@ -391,6 +395,7 @@ .type _vpaes_schedule_core,%function .align 4 _vpaes_schedule_core: + AARCH64_SIGN_LINK_REGISTER stp x29, x30, [sp,#-16]! add x29,sp,#0 @@ -550,6 +555,7 @@ eor v6.16b, v6.16b, v6.16b // vpxor %xmm6, %xmm6, %xmm6 eor v7.16b, v7.16b, v7.16b // vpxor %xmm7, %xmm7, %xmm7 ldp x29, x30, [sp],#16 + AARCH64_VALIDATE_LINK_REGISTER ret .size _vpaes_schedule_core,.-_vpaes_schedule_core @@ -720,6 +726,7 @@ .type GFp_vpaes_set_encrypt_key,%function .align 4 GFp_vpaes_set_encrypt_key: + AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-16]! add x29,sp,#0 stp d8,d9,[sp,#-16]!
// ABI spec says so @@ -735,6 +742,7 @@ ldp d8,d9,[sp],#16 ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER ret .size GFp_vpaes_set_encrypt_key,.-GFp_vpaes_set_encrypt_key ___ @@ -750,6 +758,7 @@ .type GFp_vpaes_ctr32_encrypt_blocks,%function .align 4 GFp_vpaes_ctr32_encrypt_blocks: + AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-16]! add x29,sp,#0 stp d8,d9,[sp,#-16]! // ABI spec says so @@ -817,6 +826,7 @@ ldp d10,d11,[sp],#16 ldp d8,d9,[sp],#16 ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER ret .size GFp_vpaes_ctr32_encrypt_blocks,.-GFp_vpaes_ctr32_encrypt_blocks ___ diff --git a/crypto/fipsmodule/bn/asm/armv4-mont.pl b/crypto/fipsmodule/bn/asm/armv4-mont.pl index 038006dc6b..dbc28b51d4 100644 --- a/crypto/fipsmodule/bn/asm/armv4-mont.pl +++ b/crypto/fipsmodule/bn/asm/armv4-mont.pl @@ -112,6 +112,8 @@ #endif #if __ARM_MAX_ARCH__>=7 +.extern GFp_armcap_P +.hidden GFp_armcap_P .align 5 .LOPENSSL_armcap: .word GFp_armcap_P-.Lbn_mul_mont @@ -744,11 +746,6 @@ } $code.=<<___; .asciz "Montgomery multiplication for ARMv4/NEON, CRYPTOGAMS by " -.align 2 -#if __ARM_MAX_ARCH__>=7 -.comm GFp_armcap_P,4,4 -.hidden GFp_armcap_P -#endif ___ foreach (split("\n",$code)) { diff --git a/crypto/fipsmodule/bn/asm/armv8-mont.pl b/crypto/fipsmodule/bn/asm/armv8-mont.pl index da93f3aa15..717ea68cf1 100644 --- a/crypto/fipsmodule/bn/asm/armv8-mont.pl +++ b/crypto/fipsmodule/bn/asm/armv8-mont.pl @@ -64,12 +64,15 @@ $num="x5"; # size_t num); $code.=<<___; +#include + .text .globl GFp_bn_mul_mont .type GFp_bn_mul_mont,%function .align 5 GFp_bn_mul_mont: + AARCH64_SIGN_LINK_REGISTER tst $num,#7 b.eq __bn_sqr8x_mont tst $num,#3 @@ -267,6 +270,7 @@ mov x0,#1 ldp x23,x24,[x29,#48] ldr x29,[sp],#64 + AARCH64_VALIDATE_LINK_REGISTER ret .size GFp_bn_mul_mont,.-GFp_bn_mul_mont ___ @@ -284,6 +288,8 @@ .type __bn_sqr8x_mont,%function .align 5 __bn_sqr8x_mont: + // Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_sqr8x_mont is jumped to + // only from bn_mul_mont which has already signed the return address. cmp $ap,$bp b.ne __bn_mul4x_mont .Lsqr8x_mont: @@ -1040,6 +1046,8 @@ ldp x25,x26,[x29,#64] ldp x27,x28,[x29,#80] ldr x29,[sp],#128 + // x30 is popped earlier + AARCH64_VALIDATE_LINK_REGISTER ret .size __bn_sqr8x_mont,.-__bn_sqr8x_mont ___ @@ -1063,6 +1071,9 @@ .type __bn_mul4x_mont,%function .align 5 __bn_mul4x_mont: + // Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_mul4x_mont is jumped to + // only from bn_mul_mont or __bn_sqr8x_mont which have already signed the + // return address. stp x29,x30,[sp,#-128]!
add x29,sp,#0 stp x19,x20,[sp,#16] @@ -1496,6 +1507,8 @@ ldp x25,x26,[x29,#64] ldp x27,x28,[x29,#80] ldr x29,[sp],#128 + // x30 is popped earlier + AARCH64_VALIDATE_LINK_REGISTER ret .size __bn_mul4x_mont,.-__bn_mul4x_mont ___ diff --git a/crypto/fipsmodule/ec/asm/p256-x86_64-asm.pl b/crypto/fipsmodule/ec/asm/p256-x86_64-asm.pl index e40aaa92d4..f30025e901 100755 --- a/crypto/fipsmodule/ec/asm/p256-x86_64-asm.pl +++ b/crypto/fipsmodule/ec/asm/p256-x86_64-asm.pl @@ -2136,7 +2136,7 @@ $code.=<<___; ################################################################################ -# void GFp_nistz256_select_w5(uint64_t *val, uint64_t *in_t, int index); +# void GFp_nistz256_select_w5(uint64_t *val, uint64_t *in_t, crypto_word index); .globl GFp_nistz256_select_w5 .type GFp_nistz256_select_w5,\@abi-omnipotent .align 32 @@ -2236,7 +2236,7 @@ .size GFp_nistz256_select_w5,.-GFp_nistz256_select_w5 ################################################################################ -# void GFp_nistz256_select_w7(uint64_t *val, uint64_t *in_t, int index); +# void GFp_nistz256_select_w7(uint64_t *val, uint64_t *in_t, crypto_word index); .globl GFp_nistz256_select_w7 .type GFp_nistz256_select_w7,\@abi-omnipotent .align 32 @@ -2333,7 +2333,7 @@ $code.=<<___; ################################################################################ -# void GFp_nistz256_avx2_select_w5(uint64_t *val, uint64_t *in_t, int index); +# void GFp_nistz256_avx2_select_w5(uint64_t *val, uint64_t *in_t, crypto_word index); .type GFp_nistz256_avx2_select_w5,\@abi-omnipotent .align 32 GFp_nistz256_avx2_select_w5: @@ -2440,7 +2440,7 @@ $code.=<<___; ################################################################################ -# void GFp_nistz256_avx2_select_w7(uint64_t *val, uint64_t *in_t, int index); +# void GFp_nistz256_avx2_select_w7(uint64_t *val, uint64_t *in_t, crypto_word index); .globl GFp_nistz256_avx2_select_w7 .type GFp_nistz256_avx2_select_w7,\@abi-omnipotent .align 32 diff --git a/crypto/fipsmodule/ec/ecp_nistz.h b/crypto/fipsmodule/ec/ecp_nistz.h index 74d31007a8..2bcf4b5d4d 100644 --- a/crypto/fipsmodule/ec/ecp_nistz.h +++ b/crypto/fipsmodule/ec/ecp_nistz.h @@ -246,16 +246,16 @@ // P-384: ...01110011; w = 2, 5, 6, 7 are okay // P-256: ...01010001; w = 5, 7 are okay // P-224: ...00111101; w = 3, 4, 5, 6 are okay -static inline void booth_recode(Limb *is_negative, unsigned *digit, - unsigned in, unsigned w) { +static inline void booth_recode(crypto_word *is_negative, crypto_word *digit, + crypto_word in, crypto_word w) { debug_assert_nonsecret(w >= 2); debug_assert_nonsecret(w <= 7); // Set all bits of `s` to MSB(in), similar to |constant_time_msb_s|, // but 'in' seen as (`w+1`)-bit value. 
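  // Worked example (editor's sketch, not part of the patch): with w = 5, the
  // raw window value in = 54 = 0b110110 has its top bit set, so
  //   s = ~((54 >> 5) - 1) = all-ones,
  //   d = (1 << 6) - 54 - 1 = 9,  d = (9 & s) | (54 & ~s) = 9,
  //   d = (9 >> 1) + (9 & 1) = 5.
  // Read as a signed 6-bit quantity, 54 is -10 = 2 * (-5), so the recoding
  // yields digit magnitude 5 with *is_negative set, as expected.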
- Limb s = ~((in >> w) - 1); - unsigned d; - d = (1 << (w + 1)) - in - 1; + crypto_word s = ~((in >> w) - 1); + crypto_word d; + d = ((crypto_word)1u << (w + 1)) - in - 1; d = (d & s) | (in & ~s); d = (d >> 1) + (d & 1); diff --git a/crypto/fipsmodule/ec/ecp_nistz256.c b/crypto/fipsmodule/ec/ecp_nistz256.c index 34602956d2..b71100cdad 100644 --- a/crypto/fipsmodule/ec/ecp_nistz256.c +++ b/crypto/fipsmodule/ec/ecp_nistz256.c @@ -193,8 +193,8 @@ void GFp_nistz256_point_add(P256_POINT *r, const P256_POINT *a, const P256_POINT void GFp_nistz256_point_mul(P256_POINT *r, const Limb p_scalar[P256_LIMBS], const Limb p_x[P256_LIMBS], const Limb p_y[P256_LIMBS]) { - static const unsigned kWindowSize = 5; - static const unsigned kMask = (1 << (5 /* kWindowSize */ + 1)) - 1; + static const size_t kWindowSize = 5; + static const crypto_word kMask = (1 << (5 /* kWindowSize */ + 1)) - 1; uint8_t p_str[(P256_LIMBS * sizeof(Limb)) + 1]; gfp_little_endian_bytes_from_scalar(p_str, sizeof(p_str) / sizeof(p_str[0]), @@ -232,23 +232,22 @@ void GFp_nistz256_point_mul(P256_POINT *r, const Limb p_scalar[P256_LIMBS], Limb tmp[P256_LIMBS]; alignas(32) P256_POINT h; - static const unsigned START_INDEX = 256 - 1; - unsigned index = START_INDEX; + static const size_t START_INDEX = 256 - 1; + size_t index = START_INDEX; - unsigned raw_wvalue; - Limb recoded_is_negative; - unsigned recoded; + crypto_word raw_wvalue; + crypto_word recoded_is_negative; + crypto_word recoded; raw_wvalue = p_str[(index - 1) / 8]; raw_wvalue = (raw_wvalue >> ((index - 1) % 8)) & kMask; - booth_recode(&recoded_is_negative, &recoded, raw_wvalue, kWindowSize); dev_assert_secret(!recoded_is_negative); GFp_nistz256_select_w5(r, table, recoded); while (index >= kWindowSize) { if (index != START_INDEX) { - unsigned off = (index - 1) / 8; + size_t off = (index - 1) / 8; raw_wvalue = p_str[off] | p_str[off + 1] << 8; raw_wvalue = (raw_wvalue >> ((index - 1) % 8)) & kMask; @@ -286,12 +285,12 @@ void GFp_nistz256_point_mul(P256_POINT *r, const Limb p_scalar[P256_LIMBS], /* Precomputed tables for the default generator */ #include "ecp_nistz256_table.inl" -static const unsigned kWindowSize = 7; +static const size_t kWindowSize = 7; static inline void select_precomputed(P256_POINT_AFFINE *p, size_t i, - unsigned raw_wvalue) { - Limb recoded_is_negative; - unsigned recoded; + crypto_word raw_wvalue) { + crypto_word recoded_is_negative; + crypto_word recoded; booth_recode(&recoded_is_negative, &recoded, raw_wvalue, kWindowSize); GFp_nistz256_select_w7(p, GFp_nistz256_precomputed[i], recoded); Limb neg_y[P256_LIMBS]; @@ -312,18 +311,18 @@ static Limb is_infinity(const Limb x[P256_LIMBS], void GFp_nistz256_point_mul_base(P256_POINT *r, const Limb g_scalar[P256_LIMBS]) { - static const unsigned kMask = (1 << (7 /* kWindowSize */ + 1)) - 1; + static const crypto_word kMask = (1 << (7 /* kWindowSize */ + 1)) - 1; uint8_t p_str[(P256_LIMBS * sizeof(Limb)) + 1]; gfp_little_endian_bytes_from_scalar(p_str, sizeof(p_str) / sizeof(p_str[0]), g_scalar, P256_LIMBS); /* First window */ - unsigned index = kWindowSize; + size_t index = kWindowSize; alignas(32) P256_POINT_AFFINE t; - unsigned raw_wvalue = (p_str[0] << 1) & kMask; + crypto_word raw_wvalue = (p_str[0] << 1) & kMask; select_precomputed(&t, 0, raw_wvalue); alignas(32) P256_POINT p; @@ -334,7 +333,7 @@ void GFp_nistz256_point_mul_base(P256_POINT *r, copy_conditional(p.Z, p.X, is_infinity(p.X, p.Y)); for (size_t i = 1; i < 37; i++) { - unsigned off = (index - 1) / 8; + size_t off = (index - 1) / 8; raw_wvalue = 
p_str[off] | p_str[off + 1] << 8; raw_wvalue = (raw_wvalue >> ((index - 1) % 8)) & kMask; index += kWindowSize; diff --git a/crypto/fipsmodule/ec/ecp_nistz256.h b/crypto/fipsmodule/ec/ecp_nistz256.h index 01ad2e148d..561d4155f7 100644 --- a/crypto/fipsmodule/ec/ecp_nistz256.h +++ b/crypto/fipsmodule/ec/ecp_nistz256.h @@ -45,10 +45,10 @@ void GFp_nistz256_sqr_mont(Limb res[P256_LIMBS], const Limb a[P256_LIMBS]); /* Functions that perform constant time access to the precomputed tables */ void GFp_nistz256_select_w5(P256_POINT *out, const P256_POINT table[16], - int index); + crypto_word index); #if defined(GFp_USE_LARGE_TABLE) -void GFp_nistz256_select_w7(P256_POINT_AFFINE *out, const PRECOMP256_ROW table, int index); +void GFp_nistz256_select_w7(P256_POINT_AFFINE *out, const PRECOMP256_ROW table, crypto_word index); #endif #endif /* OPENSSL_HEADER_EC_ECP_NISTZ256_H */ diff --git a/crypto/fipsmodule/ec/ecp_nistz384.inl b/crypto/fipsmodule/ec/ecp_nistz384.inl index 718e4a7915..12fc9d9d35 100644 --- a/crypto/fipsmodule/ec/ecp_nistz384.inl +++ b/crypto/fipsmodule/ec/ecp_nistz384.inl @@ -157,10 +157,10 @@ void GFp_nistz384_point_add(P384_POINT *r, const P384_POINT *a, limbs_copy(r->Z, res_z, P384_LIMBS); } -static void add_precomputed_w5(P384_POINT *r, unsigned wvalue, +static void add_precomputed_w5(P384_POINT *r, crypto_word wvalue, const P384_POINT table[16]) { - BN_ULONG recoded_is_negative; - unsigned int recoded; + crypto_word recoded_is_negative; + crypto_word recoded; booth_recode(&recoded_is_negative, &recoded, wvalue, 5); alignas(64) P384_POINT h; @@ -177,8 +177,8 @@ static void add_precomputed_w5(P384_POINT *r, unsigned wvalue, void GFp_nistz384_point_mul(P384_POINT *r, const BN_ULONG p_scalar[P384_LIMBS], const BN_ULONG p_x[P384_LIMBS], const BN_ULONG p_y[P384_LIMBS]) { - static const unsigned kWindowSize = 5; - static const unsigned kMask = (1 << (5 /* kWindowSize */ + 1)) - 1; + static const size_t kWindowSize = 5; + static const crypto_word kMask = (1 << (5 /* kWindowSize */ + 1)) - 1; uint8_t p_str[(P384_LIMBS * sizeof(Limb)) + 1]; gfp_little_endian_bytes_from_scalar(p_str, sizeof(p_str) / sizeof(p_str[0]), @@ -214,13 +214,13 @@ void GFp_nistz384_point_mul(P384_POINT *r, const BN_ULONG p_scalar[P384_LIMBS], GFp_nistz384_point_add(&row[11 - 1], &row[10 - 1], &row[1 - 1]); GFp_nistz384_point_double(&row[16 - 1], &row[8 - 1]); - static const unsigned START_INDEX = 384 - 4; - unsigned index = START_INDEX; + static const size_t START_INDEX = 384 - 4; + size_t index = START_INDEX; BN_ULONG recoded_is_negative; - unsigned recoded; + crypto_word recoded; - unsigned wvalue = p_str[(index - 1) / 8]; + crypto_word wvalue = p_str[(index - 1) / 8]; wvalue = (wvalue >> ((index - 1) % 8)) & kMask; booth_recode(&recoded_is_negative, &recoded, wvalue, 5); @@ -230,7 +230,7 @@ void GFp_nistz384_point_mul(P384_POINT *r, const BN_ULONG p_scalar[P384_LIMBS], while (index >= kWindowSize) { if (index != START_INDEX) { - unsigned off = (index - 1) / 8; + size_t off = (index - 1) / 8; wvalue = p_str[off] | p_str[off + 1] << 8; wvalue = (wvalue >> ((index - 1) % 8)) & kMask; diff --git a/crypto/fipsmodule/ec/gfp_p256.c b/crypto/fipsmodule/ec/gfp_p256.c index 5e9046a960..60678ec6d9 100644 --- a/crypto/fipsmodule/ec/gfp_p256.c +++ b/crypto/fipsmodule/ec/gfp_p256.c @@ -73,9 +73,8 @@ void GFp_p256_scalar_sqr_rep_mont(ScalarMont r, const ScalarMont a, Limb rep) { /* TODO(perf): Optimize these. 
*/ void GFp_nistz256_select_w5(P256_POINT *out, const P256_POINT table[16], - int index) { + crypto_word index) { dev_assert_secret(index >= 0); - size_t index_s = (size_t)index; /* XXX: constant time? */ alignas(32) Elem x; limbs_zero(x, P256_LIMBS); alignas(32) Elem y; limbs_zero(y, P256_LIMBS); @@ -83,7 +82,7 @@ void GFp_nistz256_select_w5(P256_POINT *out, const P256_POINT table[16], // TODO: Rewrite in terms of |limbs_select|. for (size_t i = 0; i < 16; ++i) { - Limb equal = constant_time_eq_w(index_s, i + 1); + crypto_word equal = constant_time_eq_w(index, (crypto_word)i + 1); for (size_t j = 0; j < P256_LIMBS; ++j) { x[j] = constant_time_select_w(equal, table[i].X[j], x[j]); y[j] = constant_time_select_w(equal, table[i].Y[j], y[j]); @@ -98,12 +97,9 @@ void GFp_nistz256_select_w5(P256_POINT *out, const P256_POINT table[16], #if defined GFp_USE_LARGE_TABLE void GFp_nistz256_select_w7(P256_POINT_AFFINE *out, - const PRECOMP256_ROW table, int index) { - dev_assert_secret(index >= 0); - size_t index_as_s = (size_t)index; /* XXX: constant time? */ - + const PRECOMP256_ROW table, crypto_word index) { alignas(32) Limb xy[P256_LIMBS * 2]; - limbs_select(xy, table, P256_LIMBS * 2, 64, index_as_s - 1); + limbs_select(xy, table, P256_LIMBS * 2, 64, index - 1); limbs_copy(out->X, &xy[0], P256_LIMBS); limbs_copy(out->Y, &xy[P256_LIMBS], P256_LIMBS); } diff --git a/crypto/fipsmodule/ec/gfp_p384.c b/crypto/fipsmodule/ec/gfp_p384.c index 641f4a70cd..820fac4a15 100644 --- a/crypto/fipsmodule/ec/gfp_p384.c +++ b/crypto/fipsmodule/ec/gfp_p384.c @@ -225,7 +225,7 @@ static void gfp_p384_point_select_w5(P384_POINT *out, // TODO: Rewrite in terms of |limbs_select|. for (size_t i = 0; i < 16; ++i) { - Limb equal = constant_time_eq_w(index, i + 1); + crypto_word equal = constant_time_eq_w(index, (crypto_word)i + 1); for (size_t j = 0; j < P384_LIMBS; ++j) { x[j] = constant_time_select_w(equal, table[i].X[j], x[j]); y[j] = constant_time_select_w(equal, table[i].Y[j], y[j]); diff --git a/crypto/fipsmodule/modes/asm/ghash-neon-armv8.pl b/crypto/fipsmodule/modes/asm/ghash-neon-armv8.pl index f90550b06c..7e52ad667f 100644 --- a/crypto/fipsmodule/modes/asm/ghash-neon-armv8.pl +++ b/crypto/fipsmodule/modes/asm/ghash-neon-armv8.pl @@ -157,12 +157,15 @@ sub clmul64x64 { } $code .= <<___; +#include + .text .global GFp_gcm_init_neon .type GFp_gcm_init_neon,%function .align 4 GFp_gcm_init_neon: + AARCH64_VALID_CALL_TARGET // This function is adapted from gcm_init_v8. xC2 is t3. 
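The 0xe1 loaded just below encodes the GHASH reduction polynomial x^128 + x^7 + x^2 + x + 1 in bit-reflected form, and the "twisted H" these init routines produce is H doubled modulo that polynomial. A hedged C sketch of one such doubling in the reflected software convention, in the spirit of the REDUCE1BIT step found in generic GCM implementations (editor's illustration; the function name is hypothetical):

    #include <stdint.h>

    // Multiply a GHASH field element h = (high, low) by x in the bit-reflected
    // convention: shift right one bit and, when a bit falls off the low end,
    // fold the reduction polynomial (0xe1 << 56) back into the high half.
    static void ghash_double_sketch(uint64_t h[2]) {
      uint64_t carry = h[1] & 1;  // h[0] = high half, h[1] = low half
      h[1] = (h[1] >> 1) | (h[0] << 63);
      h[0] = (h[0] >> 1) ^ (carry ? UINT64_C(0xe1) << 56 : 0);
    }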
ld1 {$t1.2d}, [x1] // load H movi $t3.16b, #0xe1 @@ -187,6 +190,7 @@ sub clmul64x64 { .type GFp_gcm_gmult_neon,%function .align 4 GFp_gcm_gmult_neon: + AARCH64_VALID_CALL_TARGET ld1 {$INlo.16b}, [$Xi] // load Xi ld1 {$Hlo.1d}, [$Htbl], #8 // load twisted H ld1 {$Hhi.1d}, [$Htbl] @@ -205,6 +209,7 @@ sub clmul64x64 { .type GFp_gcm_ghash_neon,%function .align 4 GFp_gcm_ghash_neon: + AARCH64_VALID_CALL_TARGET ld1 {$Xl.16b}, [$Xi] // load Xi ld1 {$Hlo.1d}, [$Htbl], #8 // load twisted H ld1 {$Hhi.1d}, [$Htbl] diff --git a/crypto/fipsmodule/modes/asm/ghashv8-armx.pl b/crypto/fipsmodule/modes/asm/ghashv8-armx.pl index a477cae8fd..3a551c2901 100644 --- a/crypto/fipsmodule/modes/asm/ghashv8-armx.pl +++ b/crypto/fipsmodule/modes/asm/ghashv8-armx.pl @@ -86,6 +86,7 @@ .type GFp_gcm_init_clmul,%function .align 4 GFp_gcm_init_clmul: + AARCH64_VALID_CALL_TARGET vld1.64 {$t1},[x1] @ load input H vmov.i8 $xC2,#0xe1 vshl.i64 $xC2,$xC2,#57 @ 0xc2.0 @@ -145,6 +146,7 @@ .type GFp_gcm_gmult_clmul,%function .align 4 GFp_gcm_gmult_clmul: + AARCH64_VALID_CALL_TARGET vld1.64 {$t1},[$Xi] @ load Xi vmov.i8 $xC2,#0xe1 vld1.64 {$H-$Hhl},[$Htbl] @ load twisted H, ... @@ -198,6 +200,7 @@ .type GFp_gcm_ghash_clmul,%function .align 4 GFp_gcm_ghash_clmul: + AARCH64_VALID_CALL_TARGET ___ $code.=<<___ if ($flavour !~ /64/); vstmdb sp!,{d8-d15} @ 32-bit ABI says so diff --git a/crypto/fipsmodule/modes/internal.h b/crypto/fipsmodule/modes/internal.h deleted file mode 100644 index efccd24cd6..0000000000 --- a/crypto/fipsmodule/modes/internal.h +++ /dev/null @@ -1,57 +0,0 @@ -/* ==================================================================== - * Copyright (c) 2008 The OpenSSL Project. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * 3. All advertising materials mentioning features or use of this - * software must display the following acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" - * - * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to - * endorse or promote products derived from this software without - * prior written permission. For written permission, please contact - * openssl-core@openssl.org. - * - * 5. Products derived from this software may not be called "OpenSSL" - * nor may "OpenSSL" appear in their names without prior written - * permission of the OpenSSL Project. - * - * 6. Redistributions of any form whatsoever must retain the following - * acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit (http://www.openssl.org/)" - * - * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY - * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE OpenSSL PROJECT OR - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - * OF THE POSSIBILITY OF SUCH DAMAGE. - * ==================================================================== */ - -#ifndef OPENSSL_HEADER_MODES_INTERNAL_H -#define OPENSSL_HEADER_MODES_INTERNAL_H - -#include "../../internal.h" - -// GCM definitions -typedef struct { uint64_t hi,lo; } u128; - -#endif // OPENSSL_HEADER_MODES_INTERNAL_H diff --git a/crypto/fipsmodule/sha/asm/sha256-armv4.pl b/crypto/fipsmodule/sha/asm/sha256-armv4.pl index d71fc82e22..d8661a0911 100644 --- a/crypto/fipsmodule/sha/asm/sha256-armv4.pl +++ b/crypto/fipsmodule/sha/asm/sha256-armv4.pl @@ -218,6 +218,8 @@ sub BODY_16_XX { .size K256,.-K256 .word 0 @ terminator #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) +.extern GFp_armcap_P +.hidden GFp_armcap_P .LOPENSSL_armcap: .word GFp_armcap_P-.Lsha256_block_data_order #endif @@ -687,11 +689,6 @@ () }}} $code.=<<___; .asciz "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by " -.align 2 -#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) -.comm GFp_armcap_P,4,4 -.hidden GFp_armcap_P -#endif ___ open SELF,$0; diff --git a/crypto/fipsmodule/sha/asm/sha512-armv4.pl b/crypto/fipsmodule/sha/asm/sha512-armv4.pl index 4543f4566c..21c7ebddba 100644 --- a/crypto/fipsmodule/sha/asm/sha512-armv4.pl +++ b/crypto/fipsmodule/sha/asm/sha512-armv4.pl @@ -278,6 +278,8 @@ () WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817) .size K512,.-K512 #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) +.extern GFp_armcap_P +.hidden GFp_armcap_P .LOPENSSL_armcap: .word GFp_armcap_P-.Lsha512_block_data_order .skip 32-4 @@ -651,11 +653,6 @@ () } $code.=<<___; .asciz "SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by " -.align 2 -#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) -.comm GFp_armcap_P,4,4 -.hidden GFp_armcap_P -#endif ___ $code =~ s/\`([^\`]*)\`/eval $1/gem; diff --git a/crypto/fipsmodule/sha/asm/sha512-armv8.pl b/crypto/fipsmodule/sha/asm/sha512-armv8.pl index d8667c8db8..bb80b7c96b 100644 --- a/crypto/fipsmodule/sha/asm/sha512-armv8.pl +++ b/crypto/fipsmodule/sha/asm/sha512-armv8.pl @@ -179,12 +179,14 @@ sub BODY_00_xx { .text .extern GFp_armcap_P +.hidden GFp_armcap_P .globl $func .type $func,%function .align 6 $func: ___ $code.=<<___ if ($SZ==4); + AARCH64_VALID_CALL_TARGET #ifndef __KERNEL__ #if __has_feature(hwaddress_sanitizer) && __clang_major__ >= 10 adrp x16,:pg_hi21_nc:GFp_armcap_P @@ -197,6 +199,7 @@ sub BODY_00_xx { #endif ___ $code.=<<___; + AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-128]! add x29,sp,#0 @@ -259,6 +262,7 @@ sub BODY_00_xx { ldp x25,x26,[x29,#64] ldp x27,x28,[x29,#80] ldp x29,x30,[sp],#128 + AARCH64_VALIDATE_LINK_REGISTER ret .size $func,.-$func @@ -350,6 +354,7 @@ sub BODY_00_xx { .align 6 sha256_block_armv8: .Lv8_entry: + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. stp x29,x30,[sp,#-16]! 
add x29,sp,#0 @@ -419,13 +424,6 @@ sub BODY_00_xx { ___ } -$code.=<<___; -#ifndef __KERNEL__ -.comm GFp_armcap_P,4,4 -.hidden GFp_armcap_P -#endif -___ - { my %opcode = ( "sha256h" => 0x5e004000, "sha256h2" => 0x5e005000, "sha256su0" => 0x5e282800, "sha256su1" => 0x5e006000 ); diff --git a/crypto/internal.h b/crypto/internal.h index 57607bfc38..1877bec7f6 100644 --- a/crypto/internal.h +++ b/crypto/internal.h @@ -259,10 +259,39 @@ static inline uint32_t CRYPTO_bswap4(uint32_t x) { } #endif -static inline void bytes_copy(uint8_t out[], const uint8_t in[], size_t len) { - for (size_t i = 0; i < len; ++i) { - out[i] = in[i]; +#if !defined(GFp_NOSTDLIBINC) +#include +#endif + +static inline void *GFp_memcpy(void *dst, const void *src, size_t n) { +#if !defined(GFp_NOSTDLIBINC) + if (n == 0) { + return dst; + } + return memcpy(dst, src, n); +#else + unsigned char *d = dst; + const unsigned char *s = src; + for (size_t i = 0; i < n; ++i) { + d[i] = s[i]; } + return dst; +#endif +} + +static inline void *GFp_memset(void *dst, int c, size_t n) { +#if !defined(GFp_NOSTDLIBINC) + if (n == 0) { + return dst; + } + return memset(dst, c, n); +#else + unsigned char *d = dst; + for (size_t i = 0; i < n; ++i) { + d[i] = (unsigned char)c; + } + return dst; +#endif } #endif // OPENSSL_HEADER_CRYPTO_INTERNAL_H diff --git a/crypto/perlasm/.gitattributes b/crypto/perlasm/.gitattributes deleted file mode 100644 index d77060900d..0000000000 --- a/crypto/perlasm/.gitattributes +++ /dev/null @@ -1,2 +0,0 @@ -*.pl linguist-language=Perl - diff --git a/crypto/poly1305/asm/poly1305-armv4.pl b/crypto/poly1305/asm/poly1305-armv4.pl deleted file mode 100755 index 1265676bbb..0000000000 --- a/crypto/poly1305/asm/poly1305-armv4.pl +++ /dev/null @@ -1,1246 +0,0 @@ -#!/usr/bin/env perl -# -# ==================================================================== -# Written by Andy Polyakov for the OpenSSL -# project. The module is, however, dual licensed under OpenSSL and -# CRYPTOGAMS licenses depending on where you obtain it. For further -# details see http://www.openssl.org/~appro/cryptogams/. -# ==================================================================== -# -# IALU(*)/gcc-4.4 NEON -# -# ARM11xx(ARMv6) 7.78/+100% - -# Cortex-A5 6.35/+130% 3.00 -# Cortex-A8 6.25/+115% 2.36 -# Cortex-A9 5.10/+95% 2.55 -# Cortex-A15 3.85/+85% 1.25(**) -# Snapdragon S4 5.70/+100% 1.48(**) -# -# (*) this is for -march=armv6, i.e. 
with bunch of ldrb loading data; -# (**) these are trade-off results, they can be improved by ~8% but at -# the cost of 15/12% regression on Cortex-A5/A7, it's even possible -# to improve Cortex-A9 result, but then A5/A7 loose more than 20%; - -$flavour = shift; -if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; } -else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} } - -if ($flavour && $flavour ne "void") { - $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; - ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or - ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or - die "can't locate arm-xlate.pl"; - - open STDOUT,"| \"$^X\" $xlate $flavour $output"; -} else { - open STDOUT,">$output"; -} - -($ctx,$inp,$len,$padbit)=map("r$_",(0..3)); - -$code.=<<___; -#include - -.text -#if defined(__thumb2__) -.syntax unified -.thumb -#else -.code 32 -#endif - -.globl GFp_poly1305_emit -.globl GFp_poly1305_blocks -.globl GFp_poly1305_init_asm - -.type GFp_poly1305_init_asm,%function -.align 5 -GFp_poly1305_init_asm: -.Lpoly1305_init: - stmdb sp!,{r4-r11} - - eor r3,r3,r3 - cmp $inp,#0 - str r3,[$ctx,#0] @ zero hash value - str r3,[$ctx,#4] - str r3,[$ctx,#8] - str r3,[$ctx,#12] - str r3,[$ctx,#16] - str r3,[$ctx,#36] @ is_base2_26 - add $ctx,$ctx,#20 - -#ifdef __thumb2__ - it eq -#endif - moveq r0,#0 - beq .Lno_key - -#if __ARM_MAX_ARCH__>=7 - adr r11,.Lpoly1305_init - ldr r12,.LOPENSSL_armcap -#endif - ldrb r4,[$inp,#0] - mov r10,#0x0fffffff - ldrb r5,[$inp,#1] - and r3,r10,#-4 @ 0x0ffffffc - ldrb r6,[$inp,#2] - ldrb r7,[$inp,#3] - orr r4,r4,r5,lsl#8 - ldrb r5,[$inp,#4] - orr r4,r4,r6,lsl#16 - ldrb r6,[$inp,#5] - orr r4,r4,r7,lsl#24 - ldrb r7,[$inp,#6] - and r4,r4,r10 - -#if __ARM_MAX_ARCH__>=7 - ldr r12,[r11,r12] @ GFp_armcap_P -# ifdef __APPLE__ - ldr r12,[r12] -# endif -#endif - ldrb r8,[$inp,#7] - orr r5,r5,r6,lsl#8 - ldrb r6,[$inp,#8] - orr r5,r5,r7,lsl#16 - ldrb r7,[$inp,#9] - orr r5,r5,r8,lsl#24 - ldrb r8,[$inp,#10] - and r5,r5,r3 - -#if __ARM_MAX_ARCH__>=7 - tst r12,#ARMV7_NEON @ check for NEON -# ifdef __APPLE__ - adr r9,poly1305_blocks_neon - adr r11,GFp_poly1305_blocks -# ifdef __thumb2__ - it ne -# endif - movne r11,r9 - adr r12,GFp_poly1305_emit - adr r10,poly1305_emit_neon -# ifdef __thumb2__ - it ne -# endif - movne r12,r10 -# else -# ifdef __thumb2__ - itete eq -# endif - addeq r12,r11,#(GFp_poly1305_emit-.Lpoly1305_init) - addne r12,r11,#(poly1305_emit_neon-.Lpoly1305_init) - addeq r11,r11,#(GFp_poly1305_blocks-.Lpoly1305_init) - addne r11,r11,#(poly1305_blocks_neon-.Lpoly1305_init) -# endif -# ifdef __thumb2__ - orr r12,r12,#1 @ thumb-ify address - orr r11,r11,#1 -# endif -#endif - ldrb r9,[$inp,#11] - orr r6,r6,r7,lsl#8 - ldrb r7,[$inp,#12] - orr r6,r6,r8,lsl#16 - ldrb r8,[$inp,#13] - orr r6,r6,r9,lsl#24 - ldrb r9,[$inp,#14] - and r6,r6,r3 - - ldrb r10,[$inp,#15] - orr r7,r7,r8,lsl#8 - str r4,[$ctx,#0] - orr r7,r7,r9,lsl#16 - str r5,[$ctx,#4] - orr r7,r7,r10,lsl#24 - str r6,[$ctx,#8] - and r7,r7,r3 - str r7,[$ctx,#12] -#if __ARM_MAX_ARCH__>=7 - stmia r2,{r11,r12} @ fill functions table - mov r0,#1 -#else - mov r0,#0 -#endif -.Lno_key: - ldmia sp!,{r4-r11} -#if __ARM_ARCH__>=5 - ret @ bx lr -#else - tst lr,#1 - moveq pc,lr @ be binary compatible with V4, yet - bx lr @ interoperable with Thumb ISA:-) -#endif -.size GFp_poly1305_init_asm,.-GFp_poly1305_init_asm -___ -{ -my ($h0,$h1,$h2,$h3,$h4,$r0,$r1,$r2,$r3)=map("r$_",(4..12)); -my ($s1,$s2,$s3)=($r1,$r2,$r3); - -$code.=<<___; -.type GFp_poly1305_blocks,%function -.align 5 -GFp_poly1305_blocks: - stmdb 
sp!,{r3-r11,lr} - - ands $len,$len,#-16 - beq .Lno_data - - cmp $padbit,#0 - add $len,$len,$inp @ end pointer - sub sp,sp,#32 - - ldmia $ctx,{$h0-$r3} @ load context - - str $ctx,[sp,#12] @ offload stuff - mov lr,$inp - str $len,[sp,#16] - str $r1,[sp,#20] - str $r2,[sp,#24] - str $r3,[sp,#28] - b .Loop - -.Loop: -#if __ARM_ARCH__<7 - ldrb r0,[lr],#16 @ load input -# ifdef __thumb2__ - it hi -# endif - addhi $h4,$h4,#1 @ 1<<128 - ldrb r1,[lr,#-15] - ldrb r2,[lr,#-14] - ldrb r3,[lr,#-13] - orr r1,r0,r1,lsl#8 - ldrb r0,[lr,#-12] - orr r2,r1,r2,lsl#16 - ldrb r1,[lr,#-11] - orr r3,r2,r3,lsl#24 - ldrb r2,[lr,#-10] - adds $h0,$h0,r3 @ accumulate input - - ldrb r3,[lr,#-9] - orr r1,r0,r1,lsl#8 - ldrb r0,[lr,#-8] - orr r2,r1,r2,lsl#16 - ldrb r1,[lr,#-7] - orr r3,r2,r3,lsl#24 - ldrb r2,[lr,#-6] - adcs $h1,$h1,r3 - - ldrb r3,[lr,#-5] - orr r1,r0,r1,lsl#8 - ldrb r0,[lr,#-4] - orr r2,r1,r2,lsl#16 - ldrb r1,[lr,#-3] - orr r3,r2,r3,lsl#24 - ldrb r2,[lr,#-2] - adcs $h2,$h2,r3 - - ldrb r3,[lr,#-1] - orr r1,r0,r1,lsl#8 - str lr,[sp,#8] @ offload input pointer - orr r2,r1,r2,lsl#16 - add $s1,$r1,$r1,lsr#2 - orr r3,r2,r3,lsl#24 -#else - ldr r0,[lr],#16 @ load input -# ifdef __thumb2__ - it hi -# endif - addhi $h4,$h4,#1 @ padbit - ldr r1,[lr,#-12] - ldr r2,[lr,#-8] - ldr r3,[lr,#-4] -# ifdef __ARMEB__ - rev r0,r0 - rev r1,r1 - rev r2,r2 - rev r3,r3 -# endif - adds $h0,$h0,r0 @ accumulate input - str lr,[sp,#8] @ offload input pointer - adcs $h1,$h1,r1 - add $s1,$r1,$r1,lsr#2 - adcs $h2,$h2,r2 -#endif - add $s2,$r2,$r2,lsr#2 - adcs $h3,$h3,r3 - add $s3,$r3,$r3,lsr#2 - - umull r2,r3,$h1,$r0 - adc $h4,$h4,#0 - umull r0,r1,$h0,$r0 - umlal r2,r3,$h4,$s1 - umlal r0,r1,$h3,$s1 - ldr $r1,[sp,#20] @ reload $r1 - umlal r2,r3,$h2,$s3 - umlal r0,r1,$h1,$s3 - umlal r2,r3,$h3,$s2 - umlal r0,r1,$h2,$s2 - umlal r2,r3,$h0,$r1 - str r0,[sp,#0] @ future $h0 - mul r0,$s2,$h4 - ldr $r2,[sp,#24] @ reload $r2 - adds r2,r2,r1 @ d1+=d0>>32 - eor r1,r1,r1 - adc lr,r3,#0 @ future $h2 - str r2,[sp,#4] @ future $h1 - - mul r2,$s3,$h4 - eor r3,r3,r3 - umlal r0,r1,$h3,$s3 - ldr $r3,[sp,#28] @ reload $r3 - umlal r2,r3,$h3,$r0 - umlal r0,r1,$h2,$r0 - umlal r2,r3,$h2,$r1 - umlal r0,r1,$h1,$r1 - umlal r2,r3,$h1,$r2 - umlal r0,r1,$h0,$r2 - umlal r2,r3,$h0,$r3 - ldr $h0,[sp,#0] - mul $h4,$r0,$h4 - ldr $h1,[sp,#4] - - adds $h2,lr,r0 @ d2+=d1>>32 - ldr lr,[sp,#8] @ reload input pointer - adc r1,r1,#0 - adds $h3,r2,r1 @ d3+=d2>>32 - ldr r0,[sp,#16] @ reload end pointer - adc r3,r3,#0 - add $h4,$h4,r3 @ h4+=d3>>32 - - and r1,$h4,#-4 - and $h4,$h4,#3 - add r1,r1,r1,lsr#2 @ *=5 - adds $h0,$h0,r1 - adcs $h1,$h1,#0 - adcs $h2,$h2,#0 - adcs $h3,$h3,#0 - adc $h4,$h4,#0 - - cmp r0,lr @ done yet? - bhi .Loop - - ldr $ctx,[sp,#12] - add sp,sp,#32 - stmia $ctx,{$h0-$h4} @ store the result - -.Lno_data: -#if __ARM_ARCH__>=5 - ldmia sp!,{r3-r11,pc} -#else - ldmia sp!,{r3-r11,lr} - tst lr,#1 - moveq pc,lr @ be binary compatible with V4, yet - bx lr @ interoperable with Thumb ISA:-) -#endif -.size GFp_poly1305_blocks,.-GFp_poly1305_blocks -___ -} -{ -my ($ctx,$mac,$nonce)=map("r$_",(0..2)); -my ($h0,$h1,$h2,$h3,$h4,$g0,$g1,$g2,$g3)=map("r$_",(3..11)); -my $g4=$h4; - -$code.=<<___; -.type GFp_poly1305_emit,%function -.align 5 -GFp_poly1305_emit: - stmdb sp!,{r4-r11} -.Lpoly1305_emit_enter: - - ldmia $ctx,{$h0-$h4} - adds $g0,$h0,#5 @ compare to modulus - adcs $g1,$h1,#0 - adcs $g2,$h2,#0 - adcs $g3,$h3,#0 - adc $g4,$h4,#0 - tst $g4,#4 @ did it carry/borrow? 
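The tst here checks whether h + 5 carried out of bit 130, i.e. whether h >= p = 2^130 - 5; the conditional moves just below then select the reduced value, and the nonce is folded in modulo 2^128. The same computation as a portable-C sketch, with illustrative names and a little-endian store assumed; this is exposition, not code from the patch:

    #include <stdint.h>
    #include <string.h>

    /* h[0..4]: accumulator limbs, base 2^32, already carried so h[4]
       holds only bits 128..130. nonce[0..3]: the encrypted nonce words. */
    static void poly1305_emit_sketch(const uint32_t h[5],
                                     const uint32_t nonce[4],
                                     uint8_t mac[16]) {
      uint32_t g[5], mask;
      uint64_t c = 5;

      for (int i = 0; i < 5; i++) {       /* g = h + 5 */
        c += h[i];
        g[i] = (uint32_t)c;
        c >>= 32;
      }
      /* bit 2 of g[4] is bit 130 of h + 5: set iff h >= p = 2^130 - 5 */
      mask = 0 - ((g[4] >> 2) & 1);
      c = 0;
      for (int i = 0; i < 4; i++) {       /* select, add nonce mod 2^128 */
        c += (uint64_t)((g[i] & mask) | (h[i] & ~mask)) + nonce[i];
        uint32_t w = (uint32_t)c;
        memcpy(mac + 4 * i, &w, 4);       /* little-endian output words */
        c >>= 32;
      }
    }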
- -#ifdef __thumb2__ - it ne -#endif - movne $h0,$g0 - ldr $g0,[$nonce,#0] -#ifdef __thumb2__ - it ne -#endif - movne $h1,$g1 - ldr $g1,[$nonce,#4] -#ifdef __thumb2__ - it ne -#endif - movne $h2,$g2 - ldr $g2,[$nonce,#8] -#ifdef __thumb2__ - it ne -#endif - movne $h3,$g3 - ldr $g3,[$nonce,#12] - - adds $h0,$h0,$g0 - adcs $h1,$h1,$g1 - adcs $h2,$h2,$g2 - adc $h3,$h3,$g3 - -#if __ARM_ARCH__>=7 -# ifdef __ARMEB__ - rev $h0,$h0 - rev $h1,$h1 - rev $h2,$h2 - rev $h3,$h3 -# endif - str $h0,[$mac,#0] - str $h1,[$mac,#4] - str $h2,[$mac,#8] - str $h3,[$mac,#12] -#else - strb $h0,[$mac,#0] - mov $h0,$h0,lsr#8 - strb $h1,[$mac,#4] - mov $h1,$h1,lsr#8 - strb $h2,[$mac,#8] - mov $h2,$h2,lsr#8 - strb $h3,[$mac,#12] - mov $h3,$h3,lsr#8 - - strb $h0,[$mac,#1] - mov $h0,$h0,lsr#8 - strb $h1,[$mac,#5] - mov $h1,$h1,lsr#8 - strb $h2,[$mac,#9] - mov $h2,$h2,lsr#8 - strb $h3,[$mac,#13] - mov $h3,$h3,lsr#8 - - strb $h0,[$mac,#2] - mov $h0,$h0,lsr#8 - strb $h1,[$mac,#6] - mov $h1,$h1,lsr#8 - strb $h2,[$mac,#10] - mov $h2,$h2,lsr#8 - strb $h3,[$mac,#14] - mov $h3,$h3,lsr#8 - - strb $h0,[$mac,#3] - strb $h1,[$mac,#7] - strb $h2,[$mac,#11] - strb $h3,[$mac,#15] -#endif - ldmia sp!,{r4-r11} -#if __ARM_ARCH__>=5 - ret @ bx lr -#else - tst lr,#1 - moveq pc,lr @ be binary compatible with V4, yet - bx lr @ interoperable with Thumb ISA:-) -#endif -.size GFp_poly1305_emit,.-GFp_poly1305_emit -___ -{ -my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("d$_",(0..9)); -my ($D0,$D1,$D2,$D3,$D4, $H0,$H1,$H2,$H3,$H4) = map("q$_",(5..14)); -my ($T0,$T1,$MASK) = map("q$_",(15,4,0)); - -my ($in2,$zeros,$tbl0,$tbl1) = map("r$_",(4..7)); - -$code.=<<___; -#if __ARM_MAX_ARCH__>=7 -.fpu neon - -.type poly1305_init_neon,%function -.align 5 -poly1305_init_neon: - ldr r4,[$ctx,#20] @ load key base 2^32 - ldr r5,[$ctx,#24] - ldr r6,[$ctx,#28] - ldr r7,[$ctx,#32] - - and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26 - mov r3,r4,lsr#26 - mov r4,r5,lsr#20 - orr r3,r3,r5,lsl#6 - mov r5,r6,lsr#14 - orr r4,r4,r6,lsl#12 - mov r6,r7,lsr#8 - orr r5,r5,r7,lsl#18 - and r3,r3,#0x03ffffff - and r4,r4,#0x03ffffff - and r5,r5,#0x03ffffff - - vdup.32 $R0,r2 @ r^1 in both lanes - add r2,r3,r3,lsl#2 @ *5 - vdup.32 $R1,r3 - add r3,r4,r4,lsl#2 - vdup.32 $S1,r2 - vdup.32 $R2,r4 - add r4,r5,r5,lsl#2 - vdup.32 $S2,r3 - vdup.32 $R3,r5 - add r5,r6,r6,lsl#2 - vdup.32 $S3,r4 - vdup.32 $R4,r6 - vdup.32 $S4,r5 - - mov $zeros,#2 @ counter - -.Lsquare_neon: - @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ - @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 - @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 - @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 - @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 - @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 - - vmull.u32 $D0,$R0,${R0}[1] - vmull.u32 $D1,$R1,${R0}[1] - vmull.u32 $D2,$R2,${R0}[1] - vmull.u32 $D3,$R3,${R0}[1] - vmull.u32 $D4,$R4,${R0}[1] - - vmlal.u32 $D0,$R4,${S1}[1] - vmlal.u32 $D1,$R0,${R1}[1] - vmlal.u32 $D2,$R1,${R1}[1] - vmlal.u32 $D3,$R2,${R1}[1] - vmlal.u32 $D4,$R3,${R1}[1] - - vmlal.u32 $D0,$R3,${S2}[1] - vmlal.u32 $D1,$R4,${S2}[1] - vmlal.u32 $D3,$R1,${R2}[1] - vmlal.u32 $D2,$R0,${R2}[1] - vmlal.u32 $D4,$R2,${R2}[1] - - vmlal.u32 $D0,$R2,${S3}[1] - vmlal.u32 $D3,$R0,${R3}[1] - vmlal.u32 $D1,$R3,${S3}[1] - vmlal.u32 $D2,$R4,${S3}[1] - vmlal.u32 $D4,$R1,${R3}[1] - - vmlal.u32 $D3,$R4,${S4}[1] - vmlal.u32 $D0,$R1,${S4}[1] - vmlal.u32 $D1,$R2,${S4}[1] - vmlal.u32 $D2,$R3,${S4}[1] - vmlal.u32 $D4,$R0,${R4}[1] - - @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ - @ lazy 
reduction as discussed in "NEON crypto" by D.J. Bernstein - @ and P. Schwabe - @ - @ H0>>+H1>>+H2>>+H3>>+H4 - @ H3>>+H4>>*5+H0>>+H1 - @ - @ Trivia. - @ - @ Result of multiplication of n-bit number by m-bit number is - @ n+m bits wide. However! Even though 2^n is a n+1-bit number, - @ m-bit number multiplied by 2^n is still n+m bits wide. - @ - @ Sum of two n-bit numbers is n+1 bits wide, sum of three - n+2, - @ and so is sum of four. Sum of 2^m n-m-bit numbers and n-bit - @ one is n+1 bits wide. - @ - @ >>+ denotes Hnext += Hn>>26, Hn &= 0x3ffffff. This means that - @ H0, H2, H3 are guaranteed to be 26 bits wide, while H1 and H4 - @ can be 27. However! In cases when their width exceeds 26 bits - @ they are limited by 2^26+2^6. This in turn means that *sum* - @ of the products with these values can still be viewed as sum - @ of 52-bit numbers as long as the amount of addends is not a - @ power of 2. For example, - @ - @ H4 = H4*R0 + H3*R1 + H2*R2 + H1*R3 + H0 * R4, - @ - @ which can't be larger than 5 * (2^26 + 2^6) * (2^26 + 2^6), or - @ 5 * (2^52 + 2*2^32 + 2^12), which in turn is smaller than - @ 8 * (2^52) or 2^55. However, the value is then multiplied by - @ by 5, so we should be looking at 5 * 5 * (2^52 + 2^33 + 2^12), - @ which is less than 32 * (2^52) or 2^57. And when processing - @ data we are looking at triple as many addends... - @ - @ In key setup procedure pre-reduced H0 is limited by 5*4+1 and - @ 5*H4 - by 5*5 52-bit addends, or 57 bits. But when hashing the - @ input H0 is limited by (5*4+1)*3 addends, or 58 bits, while - @ 5*H4 by 5*5*3, or 59[!] bits. How is this relevant? vmlal.u32 - @ instruction accepts 2x32-bit input and writes 2x64-bit result. - @ This means that result of reduction have to be compressed upon - @ loop wrap-around. This can be done in the process of reduction - @ to minimize amount of instructions [as well as amount of - @ 128-bit instructions, which benefits low-end processors], but - @ one has to watch for H2 (which is narrower than H0) and 5*H4 - @ not being wider than 58 bits, so that result of right shift - @ by 26 bits fits in 32 bits. This is also useful on x86, - @ because it allows to use paddd in place for paddq, which - @ benefits Atom, where paddq is ridiculously slow. 
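The width analysis above is easier to follow in scalar form. Here is a minimal C model of one such lazy-reduction pass over five 26-bit limbs, with the carry out of the top limb folded back into the bottom limb times 5 (2^130 is congruent to 5 mod 2^130 - 5; the vshl/vadd pairs just below compute c*5 as c + (c << 2) the same way). Illustrative name and one possible carry ordering; the NEON code runs two such chains side by side and tolerates the 27-bit limbs discussed above:

    #include <stdint.h>

    static void lazy_reduce(uint64_t d[5]) {  /* d[i] <= ~2^58 on entry */
      uint64_t c;
      c = d[3] >> 26; d[3] &= 0x3ffffff; d[4] += c;            /* h3 -> h4 */
      c = d[0] >> 26; d[0] &= 0x3ffffff; d[1] += c;            /* h0 -> h1 */
      c = d[4] >> 26; d[4] &= 0x3ffffff; d[0] += c + (c << 2); /* h4 -> h0, *5 */
      c = d[1] >> 26; d[1] &= 0x3ffffff; d[2] += c;            /* h1 -> h2 */
      c = d[0] >> 26; d[0] &= 0x3ffffff; d[1] += c;            /* h0 -> h1 */
      c = d[2] >> 26; d[2] &= 0x3ffffff; d[3] += c;            /* h2 -> h3 */
      c = d[3] >> 26; d[3] &= 0x3ffffff; d[4] += c;            /* h3 -> h4 */
      /* now d0, d2, d3 are 26 bits; d1 and d4 may be 27, as noted above */
    }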
- - vshr.u64 $T0,$D3,#26 - vmovn.i64 $D3#lo,$D3 - vshr.u64 $T1,$D0,#26 - vmovn.i64 $D0#lo,$D0 - vadd.i64 $D4,$D4,$T0 @ h3 -> h4 - vbic.i32 $D3#lo,#0xfc000000 @ &=0x03ffffff - vadd.i64 $D1,$D1,$T1 @ h0 -> h1 - vbic.i32 $D0#lo,#0xfc000000 - - vshrn.u64 $T0#lo,$D4,#26 - vmovn.i64 $D4#lo,$D4 - vshr.u64 $T1,$D1,#26 - vmovn.i64 $D1#lo,$D1 - vadd.i64 $D2,$D2,$T1 @ h1 -> h2 - vbic.i32 $D4#lo,#0xfc000000 - vbic.i32 $D1#lo,#0xfc000000 - - vadd.i32 $D0#lo,$D0#lo,$T0#lo - vshl.u32 $T0#lo,$T0#lo,#2 - vshrn.u64 $T1#lo,$D2,#26 - vmovn.i64 $D2#lo,$D2 - vadd.i32 $D0#lo,$D0#lo,$T0#lo @ h4 -> h0 - vadd.i32 $D3#lo,$D3#lo,$T1#lo @ h2 -> h3 - vbic.i32 $D2#lo,#0xfc000000 - - vshr.u32 $T0#lo,$D0#lo,#26 - vbic.i32 $D0#lo,#0xfc000000 - vshr.u32 $T1#lo,$D3#lo,#26 - vbic.i32 $D3#lo,#0xfc000000 - vadd.i32 $D1#lo,$D1#lo,$T0#lo @ h0 -> h1 - vadd.i32 $D4#lo,$D4#lo,$T1#lo @ h3 -> h4 - - subs $zeros,$zeros,#1 - beq .Lsquare_break_neon - - add $tbl0,$ctx,#(48+0*9*4) - add $tbl1,$ctx,#(48+1*9*4) - - vtrn.32 $R0,$D0#lo @ r^2:r^1 - vtrn.32 $R2,$D2#lo - vtrn.32 $R3,$D3#lo - vtrn.32 $R1,$D1#lo - vtrn.32 $R4,$D4#lo - - vshl.u32 $S2,$R2,#2 @ *5 - vshl.u32 $S3,$R3,#2 - vshl.u32 $S1,$R1,#2 - vshl.u32 $S4,$R4,#2 - vadd.i32 $S2,$S2,$R2 - vadd.i32 $S1,$S1,$R1 - vadd.i32 $S3,$S3,$R3 - vadd.i32 $S4,$S4,$R4 - - vst4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! - vst4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! - vst4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]! - vst4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]! - vst1.32 {${S4}[0]},[$tbl0,:32] - vst1.32 {${S4}[1]},[$tbl1,:32] - - b .Lsquare_neon - -.align 4 -.Lsquare_break_neon: - add $tbl0,$ctx,#(48+2*4*9) - add $tbl1,$ctx,#(48+3*4*9) - - vmov $R0,$D0#lo @ r^4:r^3 - vshl.u32 $S1,$D1#lo,#2 @ *5 - vmov $R1,$D1#lo - vshl.u32 $S2,$D2#lo,#2 - vmov $R2,$D2#lo - vshl.u32 $S3,$D3#lo,#2 - vmov $R3,$D3#lo - vshl.u32 $S4,$D4#lo,#2 - vmov $R4,$D4#lo - vadd.i32 $S1,$S1,$D1#lo - vadd.i32 $S2,$S2,$D2#lo - vadd.i32 $S3,$S3,$D3#lo - vadd.i32 $S4,$S4,$D4#lo - - vst4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! - vst4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! - vst4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]! - vst4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]! - vst1.32 {${S4}[0]},[$tbl0] - vst1.32 {${S4}[1]},[$tbl1] - - ret @ bx lr -.size poly1305_init_neon,.-poly1305_init_neon - -.type poly1305_blocks_neon,%function -.align 5 -poly1305_blocks_neon: - ldr ip,[$ctx,#36] @ is_base2_26 - ands $len,$len,#-16 - beq .Lno_data_neon - - cmp $len,#64 - bhs .Lenter_neon - tst ip,ip @ is_base2_26? - beq GFp_poly1305_blocks - -.Lenter_neon: - stmdb sp!,{r4-r7} - vstmdb sp!,{d8-d15} @ ABI specification says so - - tst ip,ip @ is_base2_26? 
- bne .Lbase2_26_neon - - stmdb sp!,{r1-r3,lr} - bl poly1305_init_neon - - ldr r4,[$ctx,#0] @ load hash value base 2^32 - ldr r5,[$ctx,#4] - ldr r6,[$ctx,#8] - ldr r7,[$ctx,#12] - ldr ip,[$ctx,#16] - - and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26 - mov r3,r4,lsr#26 - veor $D0#lo,$D0#lo,$D0#lo - mov r4,r5,lsr#20 - orr r3,r3,r5,lsl#6 - veor $D1#lo,$D1#lo,$D1#lo - mov r5,r6,lsr#14 - orr r4,r4,r6,lsl#12 - veor $D2#lo,$D2#lo,$D2#lo - mov r6,r7,lsr#8 - orr r5,r5,r7,lsl#18 - veor $D3#lo,$D3#lo,$D3#lo - and r3,r3,#0x03ffffff - orr r6,r6,ip,lsl#24 - veor $D4#lo,$D4#lo,$D4#lo - and r4,r4,#0x03ffffff - mov r1,#1 - and r5,r5,#0x03ffffff - str r1,[$ctx,#36] @ is_base2_26 - - vmov.32 $D0#lo[0],r2 - vmov.32 $D1#lo[0],r3 - vmov.32 $D2#lo[0],r4 - vmov.32 $D3#lo[0],r5 - vmov.32 $D4#lo[0],r6 - adr $zeros,.Lzeros - - ldmia sp!,{r1-r3,lr} - b .Lbase2_32_neon - -.align 4 -.Lbase2_26_neon: - @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ - @ load hash value - - veor $D0#lo,$D0#lo,$D0#lo - veor $D1#lo,$D1#lo,$D1#lo - veor $D2#lo,$D2#lo,$D2#lo - veor $D3#lo,$D3#lo,$D3#lo - veor $D4#lo,$D4#lo,$D4#lo - vld4.32 {$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]! - adr $zeros,.Lzeros - vld1.32 {$D4#lo[0]},[$ctx] - sub $ctx,$ctx,#16 @ rewind - -.Lbase2_32_neon: - add $in2,$inp,#32 - mov $padbit,$padbit,lsl#24 - tst $len,#31 - beq .Leven - - vld4.32 {$H0#lo[0],$H1#lo[0],$H2#lo[0],$H3#lo[0]},[$inp]! - vmov.32 $H4#lo[0],$padbit - sub $len,$len,#16 - add $in2,$inp,#32 - -# ifdef __ARMEB__ - vrev32.8 $H0,$H0 - vrev32.8 $H3,$H3 - vrev32.8 $H1,$H1 - vrev32.8 $H2,$H2 -# endif - vsri.u32 $H4#lo,$H3#lo,#8 @ base 2^32 -> base 2^26 - vshl.u32 $H3#lo,$H3#lo,#18 - - vsri.u32 $H3#lo,$H2#lo,#14 - vshl.u32 $H2#lo,$H2#lo,#12 - vadd.i32 $H4#hi,$H4#lo,$D4#lo @ add hash value and move to #hi - - vbic.i32 $H3#lo,#0xfc000000 - vsri.u32 $H2#lo,$H1#lo,#20 - vshl.u32 $H1#lo,$H1#lo,#6 - - vbic.i32 $H2#lo,#0xfc000000 - vsri.u32 $H1#lo,$H0#lo,#26 - vadd.i32 $H3#hi,$H3#lo,$D3#lo - - vbic.i32 $H0#lo,#0xfc000000 - vbic.i32 $H1#lo,#0xfc000000 - vadd.i32 $H2#hi,$H2#lo,$D2#lo - - vadd.i32 $H0#hi,$H0#lo,$D0#lo - vadd.i32 $H1#hi,$H1#lo,$D1#lo - - mov $tbl1,$zeros - add $tbl0,$ctx,#48 - - cmp $len,$len - b .Long_tail - -.align 4 -.Leven: - subs $len,$len,#64 - it lo - movlo $in2,$zeros - - vmov.i32 $H4,#1<<24 @ padbit, yes, always - vld4.32 {$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp] @ inp[0:1] - add $inp,$inp,#64 - vld4.32 {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2] @ inp[2:3] (or 0) - add $in2,$in2,#64 - itt hi - addhi $tbl1,$ctx,#(48+1*9*4) - addhi $tbl0,$ctx,#(48+3*9*4) - -# ifdef __ARMEB__ - vrev32.8 $H0,$H0 - vrev32.8 $H3,$H3 - vrev32.8 $H1,$H1 - vrev32.8 $H2,$H2 -# endif - vsri.u32 $H4,$H3,#8 @ base 2^32 -> base 2^26 - vshl.u32 $H3,$H3,#18 - - vsri.u32 $H3,$H2,#14 - vshl.u32 $H2,$H2,#12 - - vbic.i32 $H3,#0xfc000000 - vsri.u32 $H2,$H1,#20 - vshl.u32 $H1,$H1,#6 - - vbic.i32 $H2,#0xfc000000 - vsri.u32 $H1,$H0,#26 - - vbic.i32 $H0,#0xfc000000 - vbic.i32 $H1,#0xfc000000 - - bls .Lskip_loop - - vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^2 - vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^4 - vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]! - vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]! 
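The prologues above reslice four 32-bit words (five once the hash's high bits join in) into five 26-bit limbs with vsri/vshl pairs. The same reslicing in plain C, with illustrative names; a sketch, not code from the patch:

    #include <stdint.h>

    /* w[0..3]: value base 2^32; w[4]: bits 128 and up (zero for the key). */
    static void to_base26(const uint32_t w[5], uint32_t h[5]) {
      h[0] =   w[0]                        & 0x3ffffff;
      h[1] = ((w[0] >> 26) | (w[1] <<  6)) & 0x3ffffff;
      h[2] = ((w[1] >> 20) | (w[2] << 12)) & 0x3ffffff;
      h[3] = ((w[2] >> 14) | (w[3] << 18)) & 0x3ffffff;
      h[4] =  (w[3] >>  8) | (w[4] << 24);   /* keeps bits 104..130 */
    }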
- b .Loop_neon - -.align 5 -.Loop_neon: - @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ - @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2 - @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r - @ \___________________/ - @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2 - @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r - @ \___________________/ \____________________/ - @ - @ Note that we start with inp[2:3]*r^2. This is because it - @ doesn't depend on reduction in previous iteration. - @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ - @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 - @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 - @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 - @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 - @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 - - @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ - @ inp[2:3]*r^2 - - vadd.i32 $H2#lo,$H2#lo,$D2#lo @ accumulate inp[0:1] - vmull.u32 $D2,$H2#hi,${R0}[1] - vadd.i32 $H0#lo,$H0#lo,$D0#lo - vmull.u32 $D0,$H0#hi,${R0}[1] - vadd.i32 $H3#lo,$H3#lo,$D3#lo - vmull.u32 $D3,$H3#hi,${R0}[1] - vmlal.u32 $D2,$H1#hi,${R1}[1] - vadd.i32 $H1#lo,$H1#lo,$D1#lo - vmull.u32 $D1,$H1#hi,${R0}[1] - - vadd.i32 $H4#lo,$H4#lo,$D4#lo - vmull.u32 $D4,$H4#hi,${R0}[1] - subs $len,$len,#64 - vmlal.u32 $D0,$H4#hi,${S1}[1] - it lo - movlo $in2,$zeros - vmlal.u32 $D3,$H2#hi,${R1}[1] - vld1.32 ${S4}[1],[$tbl1,:32] - vmlal.u32 $D1,$H0#hi,${R1}[1] - vmlal.u32 $D4,$H3#hi,${R1}[1] - - vmlal.u32 $D0,$H3#hi,${S2}[1] - vmlal.u32 $D3,$H1#hi,${R2}[1] - vmlal.u32 $D4,$H2#hi,${R2}[1] - vmlal.u32 $D1,$H4#hi,${S2}[1] - vmlal.u32 $D2,$H0#hi,${R2}[1] - - vmlal.u32 $D3,$H0#hi,${R3}[1] - vmlal.u32 $D0,$H2#hi,${S3}[1] - vmlal.u32 $D4,$H1#hi,${R3}[1] - vmlal.u32 $D1,$H3#hi,${S3}[1] - vmlal.u32 $D2,$H4#hi,${S3}[1] - - vmlal.u32 $D3,$H4#hi,${S4}[1] - vmlal.u32 $D0,$H1#hi,${S4}[1] - vmlal.u32 $D4,$H0#hi,${R4}[1] - vmlal.u32 $D1,$H2#hi,${S4}[1] - vmlal.u32 $D2,$H3#hi,${S4}[1] - - vld4.32 {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2] @ inp[2:3] (or 0) - add $in2,$in2,#64 - - @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ - @ (hash+inp[0:1])*r^4 and accumulate - - vmlal.u32 $D3,$H3#lo,${R0}[0] - vmlal.u32 $D0,$H0#lo,${R0}[0] - vmlal.u32 $D4,$H4#lo,${R0}[0] - vmlal.u32 $D1,$H1#lo,${R0}[0] - vmlal.u32 $D2,$H2#lo,${R0}[0] - vld1.32 ${S4}[0],[$tbl0,:32] - - vmlal.u32 $D3,$H2#lo,${R1}[0] - vmlal.u32 $D0,$H4#lo,${S1}[0] - vmlal.u32 $D4,$H3#lo,${R1}[0] - vmlal.u32 $D1,$H0#lo,${R1}[0] - vmlal.u32 $D2,$H1#lo,${R1}[0] - - vmlal.u32 $D3,$H1#lo,${R2}[0] - vmlal.u32 $D0,$H3#lo,${S2}[0] - vmlal.u32 $D4,$H2#lo,${R2}[0] - vmlal.u32 $D1,$H4#lo,${S2}[0] - vmlal.u32 $D2,$H0#lo,${R2}[0] - - vmlal.u32 $D3,$H0#lo,${R3}[0] - vmlal.u32 $D0,$H2#lo,${S3}[0] - vmlal.u32 $D4,$H1#lo,${R3}[0] - vmlal.u32 $D1,$H3#lo,${S3}[0] - vmlal.u32 $D3,$H4#lo,${S4}[0] - - vmlal.u32 $D2,$H4#lo,${S3}[0] - vmlal.u32 $D0,$H1#lo,${S4}[0] - vmlal.u32 $D4,$H0#lo,${R4}[0] - vmov.i32 $H4,#1<<24 @ padbit, yes, always - vmlal.u32 $D1,$H2#lo,${S4}[0] - vmlal.u32 $D2,$H3#lo,${S4}[0] - - vld4.32 {$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp] @ inp[0:1] - add $inp,$inp,#64 -# ifdef __ARMEB__ - vrev32.8 $H0,$H0 - vrev32.8 $H1,$H1 - vrev32.8 $H2,$H2 - vrev32.8 $H3,$H3 -# endif - - @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ - @ lazy reduction interleaved with base 2^32 -> base 2^26 of - @ inp[0:3] previously loaded to $H0-$H3 and smashed to $H0-$H4. 
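The comment opening .Loop_neon above splits the message into even and odd blocks, runs two independent Horner chains in the two vector lanes, and recombines them at the end with weights r^2:r (or r^4:r^3 for a longer tail). The underlying identity is easy to sanity-check in plain C with a small prime standing in for 2^130 - 5; this toy is purely illustrative:

    #include <assert.h>
    #include <stdint.h>

    #define P 1000003ULL  /* toy modulus */

    int main(void) {
      uint64_t r = 12345, r2 = (r * r) % P, h = 7;
      uint64_t m[4] = {11, 22, 33, 44};

      /* serial Horner: ((((h+m0)*r + m1)*r + m2)*r + m3)*r */
      uint64_t serial = h;
      for (int i = 0; i < 4; i++) serial = ((serial + m[i]) * r) % P;

      /* two lanes over even/odd blocks, then weights r^2:r */
      uint64_t even = ((h + m[0]) * r2 + m[2]) % P;
      uint64_t odd  = (m[1] * r2 + m[3]) % P;
      assert(serial == (even * r2 + odd * r) % P);
      return 0;
    }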
- - vshr.u64 $T0,$D3,#26 - vmovn.i64 $D3#lo,$D3 - vshr.u64 $T1,$D0,#26 - vmovn.i64 $D0#lo,$D0 - vadd.i64 $D4,$D4,$T0 @ h3 -> h4 - vbic.i32 $D3#lo,#0xfc000000 - vsri.u32 $H4,$H3,#8 @ base 2^32 -> base 2^26 - vadd.i64 $D1,$D1,$T1 @ h0 -> h1 - vshl.u32 $H3,$H3,#18 - vbic.i32 $D0#lo,#0xfc000000 - - vshrn.u64 $T0#lo,$D4,#26 - vmovn.i64 $D4#lo,$D4 - vshr.u64 $T1,$D1,#26 - vmovn.i64 $D1#lo,$D1 - vadd.i64 $D2,$D2,$T1 @ h1 -> h2 - vsri.u32 $H3,$H2,#14 - vbic.i32 $D4#lo,#0xfc000000 - vshl.u32 $H2,$H2,#12 - vbic.i32 $D1#lo,#0xfc000000 - - vadd.i32 $D0#lo,$D0#lo,$T0#lo - vshl.u32 $T0#lo,$T0#lo,#2 - vbic.i32 $H3,#0xfc000000 - vshrn.u64 $T1#lo,$D2,#26 - vmovn.i64 $D2#lo,$D2 - vaddl.u32 $D0,$D0#lo,$T0#lo @ h4 -> h0 [widen for a sec] - vsri.u32 $H2,$H1,#20 - vadd.i32 $D3#lo,$D3#lo,$T1#lo @ h2 -> h3 - vshl.u32 $H1,$H1,#6 - vbic.i32 $D2#lo,#0xfc000000 - vbic.i32 $H2,#0xfc000000 - - vshrn.u64 $T0#lo,$D0,#26 @ re-narrow - vmovn.i64 $D0#lo,$D0 - vsri.u32 $H1,$H0,#26 - vbic.i32 $H0,#0xfc000000 - vshr.u32 $T1#lo,$D3#lo,#26 - vbic.i32 $D3#lo,#0xfc000000 - vbic.i32 $D0#lo,#0xfc000000 - vadd.i32 $D1#lo,$D1#lo,$T0#lo @ h0 -> h1 - vadd.i32 $D4#lo,$D4#lo,$T1#lo @ h3 -> h4 - vbic.i32 $H1,#0xfc000000 - - bhi .Loop_neon - -.Lskip_loop: - @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ - @ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1 - - add $tbl1,$ctx,#(48+0*9*4) - add $tbl0,$ctx,#(48+1*9*4) - adds $len,$len,#32 - it ne - movne $len,#0 - bne .Long_tail - - vadd.i32 $H2#hi,$H2#lo,$D2#lo @ add hash value and move to #hi - vadd.i32 $H0#hi,$H0#lo,$D0#lo - vadd.i32 $H3#hi,$H3#lo,$D3#lo - vadd.i32 $H1#hi,$H1#lo,$D1#lo - vadd.i32 $H4#hi,$H4#lo,$D4#lo - -.Long_tail: - vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^1 - vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^2 - - vadd.i32 $H2#lo,$H2#lo,$D2#lo @ can be redundant - vmull.u32 $D2,$H2#hi,$R0 - vadd.i32 $H0#lo,$H0#lo,$D0#lo - vmull.u32 $D0,$H0#hi,$R0 - vadd.i32 $H3#lo,$H3#lo,$D3#lo - vmull.u32 $D3,$H3#hi,$R0 - vadd.i32 $H1#lo,$H1#lo,$D1#lo - vmull.u32 $D1,$H1#hi,$R0 - vadd.i32 $H4#lo,$H4#lo,$D4#lo - vmull.u32 $D4,$H4#hi,$R0 - - vmlal.u32 $D0,$H4#hi,$S1 - vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]! - vmlal.u32 $D3,$H2#hi,$R1 - vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]! - vmlal.u32 $D1,$H0#hi,$R1 - vmlal.u32 $D4,$H3#hi,$R1 - vmlal.u32 $D2,$H1#hi,$R1 - - vmlal.u32 $D3,$H1#hi,$R2 - vld1.32 ${S4}[1],[$tbl1,:32] - vmlal.u32 $D0,$H3#hi,$S2 - vld1.32 ${S4}[0],[$tbl0,:32] - vmlal.u32 $D4,$H2#hi,$R2 - vmlal.u32 $D1,$H4#hi,$S2 - vmlal.u32 $D2,$H0#hi,$R2 - - vmlal.u32 $D3,$H0#hi,$R3 - it ne - addne $tbl1,$ctx,#(48+2*9*4) - vmlal.u32 $D0,$H2#hi,$S3 - it ne - addne $tbl0,$ctx,#(48+3*9*4) - vmlal.u32 $D4,$H1#hi,$R3 - vmlal.u32 $D1,$H3#hi,$S3 - vmlal.u32 $D2,$H4#hi,$S3 - - vmlal.u32 $D3,$H4#hi,$S4 - vorn $MASK,$MASK,$MASK @ all-ones, can be redundant - vmlal.u32 $D0,$H1#hi,$S4 - vshr.u64 $MASK,$MASK,#38 - vmlal.u32 $D4,$H0#hi,$R4 - vmlal.u32 $D1,$H2#hi,$S4 - vmlal.u32 $D2,$H3#hi,$S4 - - beq .Lshort_tail - - @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ - @ (hash+inp[0:1])*r^4:r^3 and accumulate - - vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^3 - vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^4 - - vmlal.u32 $D2,$H2#lo,$R0 - vmlal.u32 $D0,$H0#lo,$R0 - vmlal.u32 $D3,$H3#lo,$R0 - vmlal.u32 $D1,$H1#lo,$R0 - vmlal.u32 $D4,$H4#lo,$R0 - - vmlal.u32 $D0,$H4#lo,$S1 - vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]! 
- vmlal.u32 $D3,$H2#lo,$R1 - vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]! - vmlal.u32 $D1,$H0#lo,$R1 - vmlal.u32 $D4,$H3#lo,$R1 - vmlal.u32 $D2,$H1#lo,$R1 - - vmlal.u32 $D3,$H1#lo,$R2 - vld1.32 ${S4}[1],[$tbl1,:32] - vmlal.u32 $D0,$H3#lo,$S2 - vld1.32 ${S4}[0],[$tbl0,:32] - vmlal.u32 $D4,$H2#lo,$R2 - vmlal.u32 $D1,$H4#lo,$S2 - vmlal.u32 $D2,$H0#lo,$R2 - - vmlal.u32 $D3,$H0#lo,$R3 - vmlal.u32 $D0,$H2#lo,$S3 - vmlal.u32 $D4,$H1#lo,$R3 - vmlal.u32 $D1,$H3#lo,$S3 - vmlal.u32 $D2,$H4#lo,$S3 - - vmlal.u32 $D3,$H4#lo,$S4 - vorn $MASK,$MASK,$MASK @ all-ones - vmlal.u32 $D0,$H1#lo,$S4 - vshr.u64 $MASK,$MASK,#38 - vmlal.u32 $D4,$H0#lo,$R4 - vmlal.u32 $D1,$H2#lo,$S4 - vmlal.u32 $D2,$H3#lo,$S4 - -.Lshort_tail: - @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ - @ horizontal addition - - vadd.i64 $D3#lo,$D3#lo,$D3#hi - vadd.i64 $D0#lo,$D0#lo,$D0#hi - vadd.i64 $D4#lo,$D4#lo,$D4#hi - vadd.i64 $D1#lo,$D1#lo,$D1#hi - vadd.i64 $D2#lo,$D2#lo,$D2#hi - - @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ - @ lazy reduction, but without narrowing - - vshr.u64 $T0,$D3,#26 - vand.i64 $D3,$D3,$MASK - vshr.u64 $T1,$D0,#26 - vand.i64 $D0,$D0,$MASK - vadd.i64 $D4,$D4,$T0 @ h3 -> h4 - vadd.i64 $D1,$D1,$T1 @ h0 -> h1 - - vshr.u64 $T0,$D4,#26 - vand.i64 $D4,$D4,$MASK - vshr.u64 $T1,$D1,#26 - vand.i64 $D1,$D1,$MASK - vadd.i64 $D2,$D2,$T1 @ h1 -> h2 - - vadd.i64 $D0,$D0,$T0 - vshl.u64 $T0,$T0,#2 - vshr.u64 $T1,$D2,#26 - vand.i64 $D2,$D2,$MASK - vadd.i64 $D0,$D0,$T0 @ h4 -> h0 - vadd.i64 $D3,$D3,$T1 @ h2 -> h3 - - vshr.u64 $T0,$D0,#26 - vand.i64 $D0,$D0,$MASK - vshr.u64 $T1,$D3,#26 - vand.i64 $D3,$D3,$MASK - vadd.i64 $D1,$D1,$T0 @ h0 -> h1 - vadd.i64 $D4,$D4,$T1 @ h3 -> h4 - - cmp $len,#0 - bne .Leven - - @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ - @ store hash value - - vst4.32 {$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]! - vst1.32 {$D4#lo[0]},[$ctx] - - vldmia sp!,{d8-d15} @ epilogue - ldmia sp!,{r4-r7} -.Lno_data_neon: - ret @ bx lr -.size poly1305_blocks_neon,.-poly1305_blocks_neon - -.type poly1305_emit_neon,%function -.align 5 -poly1305_emit_neon: - ldr ip,[$ctx,#36] @ is_base2_26 - - stmdb sp!,{r4-r11} - - tst ip,ip - beq .Lpoly1305_emit_enter - - ldmia $ctx,{$h0-$h4} - eor $g0,$g0,$g0 - - adds $h0,$h0,$h1,lsl#26 @ base 2^26 -> base 2^32 - mov $h1,$h1,lsr#6 - adcs $h1,$h1,$h2,lsl#20 - mov $h2,$h2,lsr#12 - adcs $h2,$h2,$h3,lsl#14 - mov $h3,$h3,lsr#18 - adcs $h3,$h3,$h4,lsl#8 - adc $h4,$g0,$h4,lsr#24 @ can be partially reduced ... - - and $g0,$h4,#-4 @ ... so reduce - and $h4,$h3,#3 - add $g0,$g0,$g0,lsr#2 @ *= 5 - adds $h0,$h0,$g0 - adcs $h1,$h1,#0 - adcs $h2,$h2,#0 - adcs $h3,$h3,#0 - adc $h4,$h4,#0 - - adds $g0,$h0,#5 @ compare to modulus - adcs $g1,$h1,#0 - adcs $g2,$h2,#0 - adcs $g3,$h3,#0 - adc $g4,$h4,#0 - tst $g4,#4 @ did it carry/borrow? 
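poly1305_emit_neon first repacks the base 2^26 limbs back into base 2^32 (the lsl/lsr pairs above), folds the excess above bit 130 back in times 5, and only then runs the same compare-to-modulus selection as the scalar path. The repacking step in sketch form, with illustrative names; the fold and the selection are as modeled earlier:

    #include <stdint.h>

    /* h[0..4]: base 2^26 limbs, possibly only partially reduced. */
    static void from_base26(const uint32_t h[5], uint32_t w[5]) {
      uint64_t c;
      c = (uint64_t)h[0] + ((uint64_t)h[1] << 26);
      w[0] = (uint32_t)c;
      c = (c >> 32) + (h[1] >> 6)  + ((uint64_t)h[2] << 20);
      w[1] = (uint32_t)c;
      c = (c >> 32) + (h[2] >> 12) + ((uint64_t)h[3] << 14);
      w[2] = (uint32_t)c;
      c = (c >> 32) + (h[3] >> 18) + ((uint64_t)h[4] <<  8);
      w[3] = (uint32_t)c;
      w[4] = (uint32_t)(c >> 32) + (h[4] >> 24);  /* bits 128+, reduce next */
    }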
- - it ne - movne $h0,$g0 - ldr $g0,[$nonce,#0] - it ne - movne $h1,$g1 - ldr $g1,[$nonce,#4] - it ne - movne $h2,$g2 - ldr $g2,[$nonce,#8] - it ne - movne $h3,$g3 - ldr $g3,[$nonce,#12] - - adds $h0,$h0,$g0 @ accumulate nonce - adcs $h1,$h1,$g1 - adcs $h2,$h2,$g2 - adc $h3,$h3,$g3 - -# ifdef __ARMEB__ - rev $h0,$h0 - rev $h1,$h1 - rev $h2,$h2 - rev $h3,$h3 -# endif - str $h0,[$mac,#0] @ store the result - str $h1,[$mac,#4] - str $h2,[$mac,#8] - str $h3,[$mac,#12] - - ldmia sp!,{r4-r11} - ret @ bx lr -.size poly1305_emit_neon,.-poly1305_emit_neon - -.align 5 -.Lzeros: -.long 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 -.LOPENSSL_armcap: -.word GFp_armcap_P-.Lpoly1305_init -#endif -___ -} } -$code.=<<___; -.asciz "Poly1305 for ARMv4/NEON, CRYPTOGAMS by " -.align 2 -#if __ARM_MAX_ARCH__>=7 -.comm GFp_armcap_P,4,4 -#endif -___ - -foreach (split("\n",$code)) { - s/\`([^\`]*)\`/eval $1/geo; - - s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or - s/\bret\b/bx lr/go or - s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4 - - print $_,"\n"; -} -close STDOUT or die "error closing STDOUT"; diff --git a/crypto/poly1305/asm/poly1305-armv8.pl b/crypto/poly1305/asm/poly1305-armv8.pl deleted file mode 100755 index 82aee20c11..0000000000 --- a/crypto/poly1305/asm/poly1305-armv8.pl +++ /dev/null @@ -1,931 +0,0 @@ -#!/usr/bin/env perl -# -# ==================================================================== -# Written by Andy Polyakov for the OpenSSL -# project. The module is, however, dual licensed under OpenSSL and -# CRYPTOGAMS licenses depending on where you obtain it. For further -# details see http://www.openssl.org/~appro/cryptogams/. -# ==================================================================== -# -# This module implements Poly1305 hash for ARMv8. -# -# June 2015 -# -# Numbers are cycles per processed byte with GFp_poly1305_blocks alone. -# -# IALU/gcc-4.9 NEON -# -# Apple A7 1.86/+5% 0.72 -# Cortex-A53 2.69/+58% 1.47 -# Cortex-A57 2.70/+7% 1.14 -# Denver 1.64/+50% 1.18(*) -# X-Gene 2.13/+68% 2.27 -# -# (*) estimate based on resources availability is less than 1.0, -# i.e. 
measured result is worse than expected, presumably binary -# translator is not almighty; - -$flavour=shift; -$output=shift; - -$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; -( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or -( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or -die "can't locate arm-xlate.pl"; - -open OUT,"| \"$^X\" $xlate $flavour $output"; -*STDOUT=*OUT; - -my ($ctx,$inp,$len,$padbit) = map("x$_",(0..3)); -my ($mac,$nonce)=($inp,$len); - -my ($h0,$h1,$h2,$r0,$r1,$s1,$t0,$t1,$d0,$d1,$d2) = map("x$_",(4..14)); - -$code.=<<___; -#include - -.text - -// forward "declarations" are required for Apple -.extern GFp_armcap_P -.globl GFp_poly1305_blocks -.globl GFp_poly1305_emit -.globl GFp_poly1305_init_asm - -.type GFp_poly1305_init_asm,%function -.align 5 -GFp_poly1305_init_asm: - cmp $inp,xzr - stp xzr,xzr,[$ctx] // zero hash value - stp xzr,xzr,[$ctx,#16] // [along with is_base2_26] - - csel x0,xzr,x0,eq - b.eq .Lno_key - -#ifdef __ILP32__ - ldrsw $t1,.LGFp_armcap_P -#else - ldr $t1,.LGFp_armcap_P -#endif - adr $t0,.LGFp_armcap_P - - ldp $r0,$r1,[$inp] // load key - mov $s1,#0xfffffffc0fffffff - movk $s1,#0x0fff,lsl#48 - ldr w17,[$t0,$t1] -#ifdef __ARMEB__ - rev $r0,$r0 // flip bytes - rev $r1,$r1 -#endif - and $r0,$r0,$s1 // &=0ffffffc0fffffff - and $s1,$s1,#-4 - and $r1,$r1,$s1 // &=0ffffffc0ffffffc - stp $r0,$r1,[$ctx,#32] // save key value - - tst w17,#ARMV7_NEON - - adr $d0,GFp_poly1305_blocks - adr $r0,poly1305_blocks_neon - adr $d1,GFp_poly1305_emit - adr $r1,poly1305_emit_neon - - csel $d0,$d0,$r0,eq - csel $d1,$d1,$r1,eq - - stp $d0,$d1,[$len] - - mov x0,#1 -.Lno_key: - ret -.size GFp_poly1305_init_asm,.-GFp_poly1305_init_asm - -.type GFp_poly1305_blocks,%function -.align 5 -GFp_poly1305_blocks: - ands $len,$len,#-16 - b.eq .Lno_data - - ldp $h0,$h1,[$ctx] // load hash value - ldp $r0,$r1,[$ctx,#32] // load key value - ldr $h2,[$ctx,#16] - add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2) - b .Loop - -.align 5 -.Loop: - ldp $t0,$t1,[$inp],#16 // load input - sub $len,$len,#16 -#ifdef __ARMEB__ - rev $t0,$t0 - rev $t1,$t1 -#endif - adds $h0,$h0,$t0 // accumulate input - adcs $h1,$h1,$t1 - - mul $d0,$h0,$r0 // h0*r0 - adc $h2,$h2,$padbit - umulh $d1,$h0,$r0 - - mul $t0,$h1,$s1 // h1*5*r1 - umulh $t1,$h1,$s1 - - adds $d0,$d0,$t0 - mul $t0,$h0,$r1 // h0*r1 - adc $d1,$d1,$t1 - umulh $d2,$h0,$r1 - - adds $d1,$d1,$t0 - mul $t0,$h1,$r0 // h1*r0 - adc $d2,$d2,xzr - umulh $t1,$h1,$r0 - - adds $d1,$d1,$t0 - mul $t0,$h2,$s1 // h2*5*r1 - adc $d2,$d2,$t1 - mul $t1,$h2,$r0 // h2*r0 - - adds $d1,$d1,$t0 - adc $d2,$d2,$t1 - - and $t0,$d2,#-4 // final reduction - and $h2,$d2,#3 - add $t0,$t0,$d2,lsr#2 - adds $h0,$d0,$t0 - adcs $h1,$d1,xzr - adc $h2,$h2,xzr - - cbnz $len,.Loop - - stp $h0,$h1,[$ctx] // store hash value - str $h2,[$ctx,#16] - -.Lno_data: - ret -.size GFp_poly1305_blocks,.-GFp_poly1305_blocks - -.type GFp_poly1305_emit,%function -.align 5 -GFp_poly1305_emit: - ldp $h0,$h1,[$ctx] // load hash base 2^64 - ldr $h2,[$ctx,#16] - ldp $t0,$t1,[$nonce] // load nonce - - adds $d0,$h0,#5 // compare to modulus - adcs $d1,$h1,xzr - adc $d2,$h2,xzr - - tst $d2,#-4 // see if it's carried/borrowed - - csel $h0,$h0,$d0,eq - csel $h1,$h1,$d1,eq - -#ifdef __ARMEB__ - ror $t0,$t0,#32 // flip nonce words - ror $t1,$t1,#32 -#endif - adds $h0,$h0,$t0 // accumulate nonce - adc $h1,$h1,$t1 -#ifdef __ARMEB__ - rev $h0,$h0 // flip output bytes - rev $h1,$h1 -#endif - stp $h0,$h1,[$mac] // write result - - ret -.size GFp_poly1305_emit,.-GFp_poly1305_emit -___ -my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) 
= map("v$_.4s",(0..8)); -my ($IN01_0,$IN01_1,$IN01_2,$IN01_3,$IN01_4) = map("v$_.2s",(9..13)); -my ($IN23_0,$IN23_1,$IN23_2,$IN23_3,$IN23_4) = map("v$_.2s",(14..18)); -my ($ACC0,$ACC1,$ACC2,$ACC3,$ACC4) = map("v$_.2d",(19..23)); -my ($H0,$H1,$H2,$H3,$H4) = map("v$_.2s",(24..28)); -my ($T0,$T1,$MASK) = map("v$_",(29..31)); - -my ($in2,$zeros)=("x16","x17"); -my $is_base2_26 = $zeros; # borrow - -$code.=<<___; -.type poly1305_mult,%function -.align 5 -poly1305_mult: - mul $d0,$h0,$r0 // h0*r0 - umulh $d1,$h0,$r0 - - mul $t0,$h1,$s1 // h1*5*r1 - umulh $t1,$h1,$s1 - - adds $d0,$d0,$t0 - mul $t0,$h0,$r1 // h0*r1 - adc $d1,$d1,$t1 - umulh $d2,$h0,$r1 - - adds $d1,$d1,$t0 - mul $t0,$h1,$r0 // h1*r0 - adc $d2,$d2,xzr - umulh $t1,$h1,$r0 - - adds $d1,$d1,$t0 - mul $t0,$h2,$s1 // h2*5*r1 - adc $d2,$d2,$t1 - mul $t1,$h2,$r0 // h2*r0 - - adds $d1,$d1,$t0 - adc $d2,$d2,$t1 - - and $t0,$d2,#-4 // final reduction - and $h2,$d2,#3 - add $t0,$t0,$d2,lsr#2 - adds $h0,$d0,$t0 - adcs $h1,$d1,xzr - adc $h2,$h2,xzr - - ret -.size poly1305_mult,.-poly1305_mult - -.type poly1305_splat,%function -.align 5 -poly1305_splat: - and x12,$h0,#0x03ffffff // base 2^64 -> base 2^26 - ubfx x13,$h0,#26,#26 - extr x14,$h1,$h0,#52 - and x14,x14,#0x03ffffff - ubfx x15,$h1,#14,#26 - extr x16,$h2,$h1,#40 - - str w12,[$ctx,#16*0] // r0 - add w12,w13,w13,lsl#2 // r1*5 - str w13,[$ctx,#16*1] // r1 - add w13,w14,w14,lsl#2 // r2*5 - str w12,[$ctx,#16*2] // s1 - str w14,[$ctx,#16*3] // r2 - add w14,w15,w15,lsl#2 // r3*5 - str w13,[$ctx,#16*4] // s2 - str w15,[$ctx,#16*5] // r3 - add w15,w16,w16,lsl#2 // r4*5 - str w14,[$ctx,#16*6] // s3 - str w16,[$ctx,#16*7] // r4 - str w15,[$ctx,#16*8] // s4 - - ret -.size poly1305_splat,.-poly1305_splat - -.type poly1305_blocks_neon,%function -.align 5 -poly1305_blocks_neon: - ldr $is_base2_26,[$ctx,#24] - cmp $len,#128 - b.hs .Lblocks_neon - cbz $is_base2_26,GFp_poly1305_blocks - -.Lblocks_neon: - stp x29,x30,[sp,#-80]! - add x29,sp,#0 - - ands $len,$len,#-16 - b.eq .Lno_data_neon - - cbz $is_base2_26,.Lbase2_64_neon - - ldp w10,w11,[$ctx] // load hash value base 2^26 - ldp w12,w13,[$ctx,#8] - ldr w14,[$ctx,#16] - - tst $len,#31 - b.eq .Leven_neon - - ldp $r0,$r1,[$ctx,#32] // load key value - - add $h0,x10,x11,lsl#26 // base 2^26 -> base 2^64 - lsr $h1,x12,#12 - adds $h0,$h0,x12,lsl#52 - add $h1,$h1,x13,lsl#14 - adc $h1,$h1,xzr - lsr $h2,x14,#24 - adds $h1,$h1,x14,lsl#40 - adc $d2,$h2,xzr // can be partially reduced... - - ldp $d0,$d1,[$inp],#16 // load input - sub $len,$len,#16 - add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2) - - and $t0,$d2,#-4 // ... 
so reduce - and $h2,$d2,#3 - add $t0,$t0,$d2,lsr#2 - adds $h0,$h0,$t0 - adcs $h1,$h1,xzr - adc $h2,$h2,xzr - -#ifdef __ARMEB__ - rev $d0,$d0 - rev $d1,$d1 -#endif - adds $h0,$h0,$d0 // accumulate input - adcs $h1,$h1,$d1 - adc $h2,$h2,$padbit - - bl poly1305_mult - ldr x30,[sp,#8] - - cbz $padbit,.Lstore_base2_64_neon - - and x10,$h0,#0x03ffffff // base 2^64 -> base 2^26 - ubfx x11,$h0,#26,#26 - extr x12,$h1,$h0,#52 - and x12,x12,#0x03ffffff - ubfx x13,$h1,#14,#26 - extr x14,$h2,$h1,#40 - - cbnz $len,.Leven_neon - - stp w10,w11,[$ctx] // store hash value base 2^26 - stp w12,w13,[$ctx,#8] - str w14,[$ctx,#16] - b .Lno_data_neon - -.align 4 -.Lstore_base2_64_neon: - stp $h0,$h1,[$ctx] // store hash value base 2^64 - stp $h2,xzr,[$ctx,#16] // note that is_base2_26 is zeroed - b .Lno_data_neon - -.align 4 -.Lbase2_64_neon: - ldp $r0,$r1,[$ctx,#32] // load key value - - ldp $h0,$h1,[$ctx] // load hash value base 2^64 - ldr $h2,[$ctx,#16] - - tst $len,#31 - b.eq .Linit_neon - - ldp $d0,$d1,[$inp],#16 // load input - sub $len,$len,#16 - add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2) -#ifdef __ARMEB__ - rev $d0,$d0 - rev $d1,$d1 -#endif - adds $h0,$h0,$d0 // accumulate input - adcs $h1,$h1,$d1 - adc $h2,$h2,$padbit - - bl poly1305_mult - -.Linit_neon: - and x10,$h0,#0x03ffffff // base 2^64 -> base 2^26 - ubfx x11,$h0,#26,#26 - extr x12,$h1,$h0,#52 - and x12,x12,#0x03ffffff - ubfx x13,$h1,#14,#26 - extr x14,$h2,$h1,#40 - - stp d8,d9,[sp,#16] // meet ABI requirements - stp d10,d11,[sp,#32] - stp d12,d13,[sp,#48] - stp d14,d15,[sp,#64] - - fmov ${H0},x10 - fmov ${H1},x11 - fmov ${H2},x12 - fmov ${H3},x13 - fmov ${H4},x14 - - ////////////////////////////////// initialize r^n table - mov $h0,$r0 // r^1 - add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2) - mov $h1,$r1 - mov $h2,xzr - add $ctx,$ctx,#48+12 - bl poly1305_splat - - bl poly1305_mult // r^2 - sub $ctx,$ctx,#4 - bl poly1305_splat - - bl poly1305_mult // r^3 - sub $ctx,$ctx,#4 - bl poly1305_splat - - bl poly1305_mult // r^4 - sub $ctx,$ctx,#4 - bl poly1305_splat - ldr x30,[sp,#8] - - add $in2,$inp,#32 - adr $zeros,.Lzeros - subs $len,$len,#64 - csel $in2,$zeros,$in2,lo - - mov x4,#1 - str x4,[$ctx,#-24] // set is_base2_26 - sub $ctx,$ctx,#48 // restore original $ctx - b .Ldo_neon - -.align 4 -.Leven_neon: - add $in2,$inp,#32 - adr $zeros,.Lzeros - subs $len,$len,#64 - csel $in2,$zeros,$in2,lo - - stp d8,d9,[sp,#16] // meet ABI requirements - stp d10,d11,[sp,#32] - stp d12,d13,[sp,#48] - stp d14,d15,[sp,#64] - - fmov ${H0},x10 - fmov ${H1},x11 - fmov ${H2},x12 - fmov ${H3},x13 - fmov ${H4},x14 - -.Ldo_neon: - ldp x8,x12,[$in2],#16 // inp[2:3] (or zero) - ldp x9,x13,[$in2],#48 - - lsl $padbit,$padbit,#24 - add x15,$ctx,#48 - -#ifdef __ARMEB__ - rev x8,x8 - rev x12,x12 - rev x9,x9 - rev x13,x13 -#endif - and x4,x8,#0x03ffffff // base 2^64 -> base 2^26 - and x5,x9,#0x03ffffff - ubfx x6,x8,#26,#26 - ubfx x7,x9,#26,#26 - add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32 - extr x8,x12,x8,#52 - extr x9,x13,x9,#52 - add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32 - fmov $IN23_0,x4 - and x8,x8,#0x03ffffff - and x9,x9,#0x03ffffff - ubfx x10,x12,#14,#26 - ubfx x11,x13,#14,#26 - add x12,$padbit,x12,lsr#40 - add x13,$padbit,x13,lsr#40 - add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32 - fmov $IN23_1,x6 - add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32 - add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32 - fmov $IN23_2,x8 - fmov $IN23_3,x10 - fmov $IN23_4,x12 - - ldp x8,x12,[$inp],#16 // inp[0:1] - ldp x9,x13,[$inp],#48 - - ld1 {$R0,$R1,$S1,$R2},[x15],#64 - ld1 {$S2,$R3,$S3,$R4},[x15],#64 - ld1 
{$S4},[x15] - -#ifdef __ARMEB__ - rev x8,x8 - rev x12,x12 - rev x9,x9 - rev x13,x13 -#endif - and x4,x8,#0x03ffffff // base 2^64 -> base 2^26 - and x5,x9,#0x03ffffff - ubfx x6,x8,#26,#26 - ubfx x7,x9,#26,#26 - add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32 - extr x8,x12,x8,#52 - extr x9,x13,x9,#52 - add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32 - fmov $IN01_0,x4 - and x8,x8,#0x03ffffff - and x9,x9,#0x03ffffff - ubfx x10,x12,#14,#26 - ubfx x11,x13,#14,#26 - add x12,$padbit,x12,lsr#40 - add x13,$padbit,x13,lsr#40 - add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32 - fmov $IN01_1,x6 - add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32 - add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32 - movi $MASK.2d,#-1 - fmov $IN01_2,x8 - fmov $IN01_3,x10 - fmov $IN01_4,x12 - ushr $MASK.2d,$MASK.2d,#38 - - b.ls .Lskip_loop - -.align 4 -.Loop_neon: - //////////////////////////////////////////////////////////////// - // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2 - // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r - // \___________________/ - // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2 - // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r - // \___________________/ \____________________/ - // - // Note that we start with inp[2:3]*r^2. This is because it - // doesn't depend on reduction in previous iteration. - //////////////////////////////////////////////////////////////// - // d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0 - // d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*5*r4 - // d2 = h0*r2 + h1*r1 + h2*r0 + h3*5*r4 + h4*5*r3 - // d1 = h0*r1 + h1*r0 + h2*5*r4 + h3*5*r3 + h4*5*r2 - // d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1 - - subs $len,$len,#64 - umull $ACC4,$IN23_0,${R4}[2] - csel $in2,$zeros,$in2,lo - umull $ACC3,$IN23_0,${R3}[2] - umull $ACC2,$IN23_0,${R2}[2] - ldp x8,x12,[$in2],#16 // inp[2:3] (or zero) - umull $ACC1,$IN23_0,${R1}[2] - ldp x9,x13,[$in2],#48 - umull $ACC0,$IN23_0,${R0}[2] -#ifdef __ARMEB__ - rev x8,x8 - rev x12,x12 - rev x9,x9 - rev x13,x13 -#endif - - umlal $ACC4,$IN23_1,${R3}[2] - and x4,x8,#0x03ffffff // base 2^64 -> base 2^26 - umlal $ACC3,$IN23_1,${R2}[2] - and x5,x9,#0x03ffffff - umlal $ACC2,$IN23_1,${R1}[2] - ubfx x6,x8,#26,#26 - umlal $ACC1,$IN23_1,${R0}[2] - ubfx x7,x9,#26,#26 - umlal $ACC0,$IN23_1,${S4}[2] - add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32 - - umlal $ACC4,$IN23_2,${R2}[2] - extr x8,x12,x8,#52 - umlal $ACC3,$IN23_2,${R1}[2] - extr x9,x13,x9,#52 - umlal $ACC2,$IN23_2,${R0}[2] - add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32 - umlal $ACC1,$IN23_2,${S4}[2] - fmov $IN23_0,x4 - umlal $ACC0,$IN23_2,${S3}[2] - and x8,x8,#0x03ffffff - - umlal $ACC4,$IN23_3,${R1}[2] - and x9,x9,#0x03ffffff - umlal $ACC3,$IN23_3,${R0}[2] - ubfx x10,x12,#14,#26 - umlal $ACC2,$IN23_3,${S4}[2] - ubfx x11,x13,#14,#26 - umlal $ACC1,$IN23_3,${S3}[2] - add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32 - umlal $ACC0,$IN23_3,${S2}[2] - fmov $IN23_1,x6 - - add $IN01_2,$IN01_2,$H2 - add x12,$padbit,x12,lsr#40 - umlal $ACC4,$IN23_4,${R0}[2] - add x13,$padbit,x13,lsr#40 - umlal $ACC3,$IN23_4,${S4}[2] - add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32 - umlal $ACC2,$IN23_4,${S3}[2] - add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32 - umlal $ACC1,$IN23_4,${S2}[2] - fmov $IN23_2,x8 - umlal $ACC0,$IN23_4,${S1}[2] - fmov $IN23_3,x10 - - //////////////////////////////////////////////////////////////// - // (hash+inp[0:1])*r^4 and accumulate - - add $IN01_0,$IN01_0,$H0 - fmov $IN23_4,x12 - umlal $ACC3,$IN01_2,${R1}[0] - ldp x8,x12,[$inp],#16 // inp[0:1] - umlal $ACC0,$IN01_2,${S3}[0] - ldp x9,x13,[$inp],#48 - umlal 
$ACC4,$IN01_2,${R2}[0] - umlal $ACC1,$IN01_2,${S4}[0] - umlal $ACC2,$IN01_2,${R0}[0] -#ifdef __ARMEB__ - rev x8,x8 - rev x12,x12 - rev x9,x9 - rev x13,x13 -#endif - - add $IN01_1,$IN01_1,$H1 - umlal $ACC3,$IN01_0,${R3}[0] - umlal $ACC4,$IN01_0,${R4}[0] - and x4,x8,#0x03ffffff // base 2^64 -> base 2^26 - umlal $ACC2,$IN01_0,${R2}[0] - and x5,x9,#0x03ffffff - umlal $ACC0,$IN01_0,${R0}[0] - ubfx x6,x8,#26,#26 - umlal $ACC1,$IN01_0,${R1}[0] - ubfx x7,x9,#26,#26 - - add $IN01_3,$IN01_3,$H3 - add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32 - umlal $ACC3,$IN01_1,${R2}[0] - extr x8,x12,x8,#52 - umlal $ACC4,$IN01_1,${R3}[0] - extr x9,x13,x9,#52 - umlal $ACC0,$IN01_1,${S4}[0] - add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32 - umlal $ACC2,$IN01_1,${R1}[0] - fmov $IN01_0,x4 - umlal $ACC1,$IN01_1,${R0}[0] - and x8,x8,#0x03ffffff - - add $IN01_4,$IN01_4,$H4 - and x9,x9,#0x03ffffff - umlal $ACC3,$IN01_3,${R0}[0] - ubfx x10,x12,#14,#26 - umlal $ACC0,$IN01_3,${S2}[0] - ubfx x11,x13,#14,#26 - umlal $ACC4,$IN01_3,${R1}[0] - add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32 - umlal $ACC1,$IN01_3,${S3}[0] - fmov $IN01_1,x6 - umlal $ACC2,$IN01_3,${S4}[0] - add x12,$padbit,x12,lsr#40 - - umlal $ACC3,$IN01_4,${S4}[0] - add x13,$padbit,x13,lsr#40 - umlal $ACC0,$IN01_4,${S1}[0] - add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32 - umlal $ACC4,$IN01_4,${R0}[0] - add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32 - umlal $ACC1,$IN01_4,${S2}[0] - fmov $IN01_2,x8 - umlal $ACC2,$IN01_4,${S3}[0] - fmov $IN01_3,x10 - fmov $IN01_4,x12 - - ///////////////////////////////////////////////////////////////// - // lazy reduction as discussed in "NEON crypto" by D.J. Bernstein - // and P. Schwabe - // - // [see discussion in poly1305-armv4 module] - - ushr $T0.2d,$ACC3,#26 - xtn $H3,$ACC3 - ushr $T1.2d,$ACC0,#26 - and $ACC0,$ACC0,$MASK.2d - add $ACC4,$ACC4,$T0.2d // h3 -> h4 - bic $H3,#0xfc,lsl#24 // &=0x03ffffff - add $ACC1,$ACC1,$T1.2d // h0 -> h1 - - ushr $T0.2d,$ACC4,#26 - xtn $H4,$ACC4 - ushr $T1.2d,$ACC1,#26 - xtn $H1,$ACC1 - bic $H4,#0xfc,lsl#24 - add $ACC2,$ACC2,$T1.2d // h1 -> h2 - - add $ACC0,$ACC0,$T0.2d - shl $T0.2d,$T0.2d,#2 - shrn $T1.2s,$ACC2,#26 - xtn $H2,$ACC2 - add $ACC0,$ACC0,$T0.2d // h4 -> h0 - bic $H1,#0xfc,lsl#24 - add $H3,$H3,$T1.2s // h2 -> h3 - bic $H2,#0xfc,lsl#24 - - shrn $T0.2s,$ACC0,#26 - xtn $H0,$ACC0 - ushr $T1.2s,$H3,#26 - bic $H3,#0xfc,lsl#24 - bic $H0,#0xfc,lsl#24 - add $H1,$H1,$T0.2s // h0 -> h1 - add $H4,$H4,$T1.2s // h3 -> h4 - - b.hi .Loop_neon - -.Lskip_loop: - dup $IN23_2,${IN23_2}[0] - add $IN01_2,$IN01_2,$H2 - - //////////////////////////////////////////////////////////////// - // multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1 - - adds $len,$len,#32 - b.ne .Long_tail - - dup $IN23_2,${IN01_2}[0] - add $IN23_0,$IN01_0,$H0 - add $IN23_3,$IN01_3,$H3 - add $IN23_1,$IN01_1,$H1 - add $IN23_4,$IN01_4,$H4 - -.Long_tail: - dup $IN23_0,${IN23_0}[0] - umull2 $ACC0,$IN23_2,${S3} - umull2 $ACC3,$IN23_2,${R1} - umull2 $ACC4,$IN23_2,${R2} - umull2 $ACC2,$IN23_2,${R0} - umull2 $ACC1,$IN23_2,${S4} - - dup $IN23_1,${IN23_1}[0] - umlal2 $ACC0,$IN23_0,${R0} - umlal2 $ACC2,$IN23_0,${R2} - umlal2 $ACC3,$IN23_0,${R3} - umlal2 $ACC4,$IN23_0,${R4} - umlal2 $ACC1,$IN23_0,${R1} - - dup $IN23_3,${IN23_3}[0] - umlal2 $ACC0,$IN23_1,${S4} - umlal2 $ACC3,$IN23_1,${R2} - umlal2 $ACC2,$IN23_1,${R1} - umlal2 $ACC4,$IN23_1,${R3} - umlal2 $ACC1,$IN23_1,${R0} - - dup $IN23_4,${IN23_4}[0] - umlal2 $ACC3,$IN23_3,${R0} - umlal2 $ACC4,$IN23_3,${R1} - umlal2 $ACC0,$IN23_3,${S2} - umlal2 $ACC1,$IN23_3,${S3} - umlal2 $ACC2,$IN23_3,${S4} - - umlal2 
$ACC3,$IN23_4,${S4} - umlal2 $ACC0,$IN23_4,${S1} - umlal2 $ACC4,$IN23_4,${R0} - umlal2 $ACC1,$IN23_4,${S2} - umlal2 $ACC2,$IN23_4,${S3} - - b.eq .Lshort_tail - - //////////////////////////////////////////////////////////////// - // (hash+inp[0:1])*r^4:r^3 and accumulate - - add $IN01_0,$IN01_0,$H0 - umlal $ACC3,$IN01_2,${R1} - umlal $ACC0,$IN01_2,${S3} - umlal $ACC4,$IN01_2,${R2} - umlal $ACC1,$IN01_2,${S4} - umlal $ACC2,$IN01_2,${R0} - - add $IN01_1,$IN01_1,$H1 - umlal $ACC3,$IN01_0,${R3} - umlal $ACC0,$IN01_0,${R0} - umlal $ACC4,$IN01_0,${R4} - umlal $ACC1,$IN01_0,${R1} - umlal $ACC2,$IN01_0,${R2} - - add $IN01_3,$IN01_3,$H3 - umlal $ACC3,$IN01_1,${R2} - umlal $ACC0,$IN01_1,${S4} - umlal $ACC4,$IN01_1,${R3} - umlal $ACC1,$IN01_1,${R0} - umlal $ACC2,$IN01_1,${R1} - - add $IN01_4,$IN01_4,$H4 - umlal $ACC3,$IN01_3,${R0} - umlal $ACC0,$IN01_3,${S2} - umlal $ACC4,$IN01_3,${R1} - umlal $ACC1,$IN01_3,${S3} - umlal $ACC2,$IN01_3,${S4} - - umlal $ACC3,$IN01_4,${S4} - umlal $ACC0,$IN01_4,${S1} - umlal $ACC4,$IN01_4,${R0} - umlal $ACC1,$IN01_4,${S2} - umlal $ACC2,$IN01_4,${S3} - -.Lshort_tail: - //////////////////////////////////////////////////////////////// - // horizontal add - - addp $ACC3,$ACC3,$ACC3 - ldp d8,d9,[sp,#16] // meet ABI requirements - addp $ACC0,$ACC0,$ACC0 - ldp d10,d11,[sp,#32] - addp $ACC4,$ACC4,$ACC4 - ldp d12,d13,[sp,#48] - addp $ACC1,$ACC1,$ACC1 - ldp d14,d15,[sp,#64] - addp $ACC2,$ACC2,$ACC2 - - //////////////////////////////////////////////////////////////// - // lazy reduction, but without narrowing - - ushr $T0.2d,$ACC3,#26 - and $ACC3,$ACC3,$MASK.2d - ushr $T1.2d,$ACC0,#26 - and $ACC0,$ACC0,$MASK.2d - - add $ACC4,$ACC4,$T0.2d // h3 -> h4 - add $ACC1,$ACC1,$T1.2d // h0 -> h1 - - ushr $T0.2d,$ACC4,#26 - and $ACC4,$ACC4,$MASK.2d - ushr $T1.2d,$ACC1,#26 - and $ACC1,$ACC1,$MASK.2d - add $ACC2,$ACC2,$T1.2d // h1 -> h2 - - add $ACC0,$ACC0,$T0.2d - shl $T0.2d,$T0.2d,#2 - ushr $T1.2d,$ACC2,#26 - and $ACC2,$ACC2,$MASK.2d - add $ACC0,$ACC0,$T0.2d // h4 -> h0 - add $ACC3,$ACC3,$T1.2d // h2 -> h3 - - ushr $T0.2d,$ACC0,#26 - and $ACC0,$ACC0,$MASK.2d - ushr $T1.2d,$ACC3,#26 - and $ACC3,$ACC3,$MASK.2d - add $ACC1,$ACC1,$T0.2d // h0 -> h1 - add $ACC4,$ACC4,$T1.2d // h3 -> h4 - - //////////////////////////////////////////////////////////////// - // write the result, can be partially reduced - - st4 {$ACC0,$ACC1,$ACC2,$ACC3}[0],[$ctx],#16 - st1 {$ACC4}[0],[$ctx] - -.Lno_data_neon: - ldr x29,[sp],#80 - ret -.size poly1305_blocks_neon,.-poly1305_blocks_neon - -.type poly1305_emit_neon,%function -.align 5 -poly1305_emit_neon: - ldr $is_base2_26,[$ctx,#24] - cbz $is_base2_26,GFp_poly1305_emit - - ldp w10,w11,[$ctx] // load hash value base 2^26 - ldp w12,w13,[$ctx,#8] - ldr w14,[$ctx,#16] - - add $h0,x10,x11,lsl#26 // base 2^26 -> base 2^64 - lsr $h1,x12,#12 - adds $h0,$h0,x12,lsl#52 - add $h1,$h1,x13,lsl#14 - adc $h1,$h1,xzr - lsr $h2,x14,#24 - adds $h1,$h1,x14,lsl#40 - adc $h2,$h2,xzr // can be partially reduced... - - ldp $t0,$t1,[$nonce] // load nonce - - and $d0,$h2,#-4 // ... 
so reduce - add $d0,$d0,$h2,lsr#2 - and $h2,$h2,#3 - adds $h0,$h0,$d0 - adcs $h1,$h1,xzr - adc $h2,$h2,xzr - - adds $d0,$h0,#5 // compare to modulus - adcs $d1,$h1,xzr - adc $d2,$h2,xzr - - tst $d2,#-4 // see if it's carried/borrowed - - csel $h0,$h0,$d0,eq - csel $h1,$h1,$d1,eq - -#ifdef __ARMEB__ - ror $t0,$t0,#32 // flip nonce words - ror $t1,$t1,#32 -#endif - adds $h0,$h0,$t0 // accumulate nonce - adc $h1,$h1,$t1 -#ifdef __ARMEB__ - rev $h0,$h0 // flip output bytes - rev $h1,$h1 -#endif - stp $h0,$h1,[$mac] // write result - - ret -.size poly1305_emit_neon,.-poly1305_emit_neon - -.align 5 -.Lzeros: -.long 0,0,0,0,0,0,0,0 -.LGFp_armcap_P: -#ifdef __ILP32__ -.long GFp_armcap_P-. -#else -.quad GFp_armcap_P-. -#endif -.asciz "Poly1305 for ARMv8, CRYPTOGAMS by " -.align 2 -___ - -foreach (split("\n",$code)) { - s/\b(shrn\s+v[0-9]+)\.[24]d/$1.2s/ or - s/\b(fmov\s+)v([0-9]+)[^,]*,\s*x([0-9]+)/$1d$2,x$3/ or - (m/\bdup\b/ and (s/\.[24]s/.2d/g or 1)) or - (m/\b(eor|and)/ and (s/\.[248][sdh]/.16b/g or 1)) or - (m/\bum(ul|la)l\b/ and (s/\.4s/.2s/g or 1)) or - (m/\bum(ul|la)l2\b/ and (s/\.2s/.4s/g or 1)) or - (m/\bst[1-4]\s+{[^}]+}\[/ and (s/\.[24]d/.s/g or 1)); - - s/\.[124]([sd])\[/.$1\[/; - - print $_,"\n"; -} -close STDOUT or die "error closing STDOUT"; diff --git a/crypto/poly1305/asm/poly1305-x86.pl b/crypto/poly1305/asm/poly1305-x86.pl deleted file mode 100755 index 3f0e4c416b..0000000000 --- a/crypto/poly1305/asm/poly1305-x86.pl +++ /dev/null @@ -1,1223 +0,0 @@ -#!/usr/bin/env perl -# -# ==================================================================== -# Written by Andy Polyakov for the OpenSSL -# project. The module is, however, dual licensed under OpenSSL and -# CRYPTOGAMS licenses depending on where you obtain it. For further -# details see http://www.openssl.org/~appro/cryptogams/. -# ==================================================================== -# -# This module implements Poly1305 hash for x86. -# -# April 2015 -# -# Numbers are cycles per processed byte with poly1305_blocks alone, -# measured with rdtsc at fixed clock frequency. -# -# IALU/gcc-3.4(*) SSE2(**) AVX2 -# Pentium 15.7/+80% - -# PIII 6.21/+90% - -# P4 19.8/+40% 3.24 -# Core 2 4.85/+90% 1.80 -# Westmere 4.58/+100% 1.43 -# Sandy Bridge 3.90/+100% 1.36 -# Haswell 3.88/+70% 1.18 0.72 -# Silvermont 11.0/+40% 4.80 -# VIA Nano 6.71/+90% 2.47 -# Sledgehammer 3.51/+180% 4.27 -# Bulldozer 4.53/+140% 1.31 -# -# (*) gcc 4.8 for some reason generated worse code; -# (**) besides SSE2 there are floating-point and AVX options; FP -# is deemed unnecessary, because pre-SSE2 processor are too -# old to care about, while it's not the fastest option on -# SSE2-capable ones; AVX is omitted, because it doesn't give -# a lot of improvement, 5-10% depending on processor; - -$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; -push(@INC,"${dir}","${dir}../../perlasm"); -require "x86asm.pl"; - -$output=pop; -open STDOUT,">$output"; - -&asm_init($ARGV[0],"poly1305-x86.pl",$ARGV[$#ARGV] eq "386"); - -$sse2=$avx=0; -for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } - -if ($sse2) { - &static_label("const_sse2"); - &static_label("enter_blocks"); - &static_label("enter_emit"); - &external_label("GFp_ia32cap_P"); - - # This may be set to 2, but valgrind can't do AVX2 on 32-bit. Without a - # way to verify test coverage, keep it disabled. - # The AVX2 code was removed. - $avx = 0; -} - -######################################################################## -# Layout of opaque area is following. 
-# -# unsigned __int32 h[5]; # current hash value base 2^32 -# unsigned __int32 pad; # is_base2_26 in vector context -# unsigned __int32 r[4]; # key value base 2^32 - -&align(64); -&function_begin("GFp_poly1305_init_asm"); - &mov ("edi",&wparam(0)); # context - &mov ("esi",&wparam(1)); # key - &mov ("ebp",&wparam(2)); # function table - - &xor ("eax","eax"); - &mov (&DWP(4*0,"edi"),"eax"); # zero hash value - &mov (&DWP(4*1,"edi"),"eax"); - &mov (&DWP(4*2,"edi"),"eax"); - &mov (&DWP(4*3,"edi"),"eax"); - &mov (&DWP(4*4,"edi"),"eax"); - &mov (&DWP(4*5,"edi"),"eax"); # is_base2_26 - - &cmp ("esi",0); - &je (&label("nokey")); - - if ($sse2) { - &call (&label("pic_point")); - &set_label("pic_point"); - &blindpop("ebx"); - - &lea ("eax",&DWP("GFp_poly1305_blocks-".&label("pic_point"),"ebx")); - &lea ("edx",&DWP("GFp_poly1305_emit-".&label("pic_point"),"ebx")); - - &picmeup("edi","GFp_ia32cap_P","ebx",&label("pic_point")); - &mov ("ecx",&DWP(0,"edi")); - &and ("ecx",1<<26|1<<24); - &cmp ("ecx",1<<26|1<<24); # SSE2 and XMM? - # The non-SSE2 code was removed. - - &lea ("eax",&DWP("_poly1305_blocks_sse2-".&label("pic_point"),"ebx")); - &lea ("edx",&DWP("_poly1305_emit_sse2-".&label("pic_point"),"ebx")); - - # AVX2 code removed. - - # The non-SSE2 code was removed. - - &mov ("edi",&wparam(0)); # reload context - &mov (&DWP(0,"ebp"),"eax"); # fill function table - &mov (&DWP(4,"ebp"),"edx"); - } - - &mov ("eax",&DWP(4*0,"esi")); # load input key - &mov ("ebx",&DWP(4*1,"esi")); - &mov ("ecx",&DWP(4*2,"esi")); - &mov ("edx",&DWP(4*3,"esi")); - &and ("eax",0x0fffffff); - &and ("ebx",0x0ffffffc); - &and ("ecx",0x0ffffffc); - &and ("edx",0x0ffffffc); - &mov (&DWP(4*6,"edi"),"eax"); - &mov (&DWP(4*7,"edi"),"ebx"); - &mov (&DWP(4*8,"edi"),"ecx"); - &mov (&DWP(4*9,"edi"),"edx"); - - &mov ("eax",$sse2); -&set_label("nokey"); -&function_end("GFp_poly1305_init_asm"); - -($h0,$h1,$h2,$h3,$h4, - $d0,$d1,$d2,$d3, - $r0,$r1,$r2,$r3, - $s1,$s2,$s3)=map(4*$_,(0..15)); - -&function_begin("GFp_poly1305_blocks"); - &mov ("edi",&wparam(0)); # ctx - &mov ("esi",&wparam(1)); # inp - &mov ("ecx",&wparam(2)); # len -&set_label("enter_blocks"); - &and ("ecx",-15); - &jz (&label("nodata")); - - &stack_push(16); - &mov ("eax",&DWP(4*6,"edi")); # r0 - &mov ("ebx",&DWP(4*7,"edi")); # r1 - &lea ("ebp",&DWP(0,"esi","ecx")); # end of input - &mov ("ecx",&DWP(4*8,"edi")); # r2 - &mov ("edx",&DWP(4*9,"edi")); # r3 - - &mov (&wparam(2),"ebp"); - &mov ("ebp","esi"); - - &mov (&DWP($r0,"esp"),"eax"); # r0 - &mov ("eax","ebx"); - &shr ("eax",2); - &mov (&DWP($r1,"esp"),"ebx"); # r1 - &add ("eax","ebx"); # s1 - &mov ("ebx","ecx"); - &shr ("ebx",2); - &mov (&DWP($r2,"esp"),"ecx"); # r2 - &add ("ebx","ecx"); # s2 - &mov ("ecx","edx"); - &shr ("ecx",2); - &mov (&DWP($r3,"esp"),"edx"); # r3 - &add ("ecx","edx"); # s3 - &mov (&DWP($s1,"esp"),"eax"); # s1 - &mov (&DWP($s2,"esp"),"ebx"); # s2 - &mov (&DWP($s3,"esp"),"ecx"); # s3 - - &mov ("eax",&DWP(4*0,"edi")); # load hash value - &mov ("ebx",&DWP(4*1,"edi")); - &mov ("ecx",&DWP(4*2,"edi")); - &mov ("esi",&DWP(4*3,"edi")); - &mov ("edi",&DWP(4*4,"edi")); - &jmp (&label("loop")); - -&set_label("loop",32); - &add ("eax",&DWP(4*0,"ebp")); # accumulate input - &adc ("ebx",&DWP(4*1,"ebp")); - &adc ("ecx",&DWP(4*2,"ebp")); - &adc ("esi",&DWP(4*3,"ebp")); - &lea ("ebp",&DWP(4*4,"ebp")); - &adc ("edi",&wparam(3)); # padbit - - &mov (&DWP($h0,"esp"),"eax"); # put aside hash[+inp] - &mov (&DWP($h3,"esp"),"esi"); - - &mul (&DWP($r0,"esp")); # h0*r0 - &mov (&DWP($h4,"esp"),"edi"); - &mov ("edi","eax"); - 
&mov ("eax","ebx"); # h1 - &mov ("esi","edx"); - &mul (&DWP($s3,"esp")); # h1*s3 - &add ("edi","eax"); - &mov ("eax","ecx"); # h2 - &adc ("esi","edx"); - &mul (&DWP($s2,"esp")); # h2*s2 - &add ("edi","eax"); - &mov ("eax",&DWP($h3,"esp")); - &adc ("esi","edx"); - &mul (&DWP($s1,"esp")); # h3*s1 - &add ("edi","eax"); - &mov ("eax",&DWP($h0,"esp")); - &adc ("esi","edx"); - - &mul (&DWP($r1,"esp")); # h0*r1 - &mov (&DWP($d0,"esp"),"edi"); - &xor ("edi","edi"); - &add ("esi","eax"); - &mov ("eax","ebx"); # h1 - &adc ("edi","edx"); - &mul (&DWP($r0,"esp")); # h1*r0 - &add ("esi","eax"); - &mov ("eax","ecx"); # h2 - &adc ("edi","edx"); - &mul (&DWP($s3,"esp")); # h2*s3 - &add ("esi","eax"); - &mov ("eax",&DWP($h3,"esp")); - &adc ("edi","edx"); - &mul (&DWP($s2,"esp")); # h3*s2 - &add ("esi","eax"); - &mov ("eax",&DWP($h4,"esp")); - &adc ("edi","edx"); - &imul ("eax",&DWP($s1,"esp")); # h4*s1 - &add ("esi","eax"); - &mov ("eax",&DWP($h0,"esp")); - &adc ("edi",0); - - &mul (&DWP($r2,"esp")); # h0*r2 - &mov (&DWP($d1,"esp"),"esi"); - &xor ("esi","esi"); - &add ("edi","eax"); - &mov ("eax","ebx"); # h1 - &adc ("esi","edx"); - &mul (&DWP($r1,"esp")); # h1*r1 - &add ("edi","eax"); - &mov ("eax","ecx"); # h2 - &adc ("esi","edx"); - &mul (&DWP($r0,"esp")); # h2*r0 - &add ("edi","eax"); - &mov ("eax",&DWP($h3,"esp")); - &adc ("esi","edx"); - &mul (&DWP($s3,"esp")); # h3*s3 - &add ("edi","eax"); - &mov ("eax",&DWP($h4,"esp")); - &adc ("esi","edx"); - &imul ("eax",&DWP($s2,"esp")); # h4*s2 - &add ("edi","eax"); - &mov ("eax",&DWP($h0,"esp")); - &adc ("esi",0); - - &mul (&DWP($r3,"esp")); # h0*r3 - &mov (&DWP($d2,"esp"),"edi"); - &xor ("edi","edi"); - &add ("esi","eax"); - &mov ("eax","ebx"); # h1 - &adc ("edi","edx"); - &mul (&DWP($r2,"esp")); # h1*r2 - &add ("esi","eax"); - &mov ("eax","ecx"); # h2 - &adc ("edi","edx"); - &mul (&DWP($r1,"esp")); # h2*r1 - &add ("esi","eax"); - &mov ("eax",&DWP($h3,"esp")); - &adc ("edi","edx"); - &mul (&DWP($r0,"esp")); # h3*r0 - &add ("esi","eax"); - &mov ("ecx",&DWP($h4,"esp")); - &adc ("edi","edx"); - - &mov ("edx","ecx"); - &imul ("ecx",&DWP($s3,"esp")); # h4*s3 - &add ("esi","ecx"); - &mov ("eax",&DWP($d0,"esp")); - &adc ("edi",0); - - &imul ("edx",&DWP($r0,"esp")); # h4*r0 - &add ("edx","edi"); - - &mov ("ebx",&DWP($d1,"esp")); - &mov ("ecx",&DWP($d2,"esp")); - - &mov ("edi","edx"); # last reduction step - &shr ("edx",2); - &and ("edi",3); - &lea ("edx",&DWP(0,"edx","edx",4)); # *5 - &add ("eax","edx"); - &adc ("ebx",0); - &adc ("ecx",0); - &adc ("esi",0); - &adc ("edi",0); - - &cmp ("ebp",&wparam(2)); # done yet? - &jne (&label("loop")); - - &mov ("edx",&wparam(0)); # ctx - &stack_pop(16); - &mov (&DWP(4*0,"edx"),"eax"); # store hash value - &mov (&DWP(4*1,"edx"),"ebx"); - &mov (&DWP(4*2,"edx"),"ecx"); - &mov (&DWP(4*3,"edx"),"esi"); - &mov (&DWP(4*4,"edx"),"edi"); -&set_label("nodata"); -&function_end("GFp_poly1305_blocks"); - -&function_begin("GFp_poly1305_emit"); - &mov ("ebp",&wparam(0)); # context -&set_label("enter_emit"); - &mov ("edi",&wparam(1)); # output - &mov ("eax",&DWP(4*0,"ebp")); # load hash value - &mov ("ebx",&DWP(4*1,"ebp")); - &mov ("ecx",&DWP(4*2,"ebp")); - &mov ("edx",&DWP(4*3,"ebp")); - &mov ("esi",&DWP(4*4,"ebp")); - - &add ("eax",5); # compare to modulus - &adc ("ebx",0); - &adc ("ecx",0); - &adc ("edx",0); - &adc ("esi",0); - &shr ("esi",2); # did it carry/borrow? - &neg ("esi"); # do we choose hash-modulus? 
-
-	&and	("eax","esi");
-	&and	("ebx","esi");
-	&and	("ecx","esi");
-	&and	("edx","esi");
-	&mov	(&DWP(4*0,"edi"),"eax");
-	&mov	(&DWP(4*1,"edi"),"ebx");
-	&mov	(&DWP(4*2,"edi"),"ecx");
-	&mov	(&DWP(4*3,"edi"),"edx");
-
-	&not	("esi");			# or original hash value?
-	&mov	("eax",&DWP(4*0,"ebp"));
-	&mov	("ebx",&DWP(4*1,"ebp"));
-	&mov	("ecx",&DWP(4*2,"ebp"));
-	&mov	("edx",&DWP(4*3,"ebp"));
-	&mov	("ebp",&wparam(2));
-	&and	("eax","esi");
-	&and	("ebx","esi");
-	&and	("ecx","esi");
-	&and	("edx","esi");
-	&or	("eax",&DWP(4*0,"edi"));
-	&or	("ebx",&DWP(4*1,"edi"));
-	&or	("ecx",&DWP(4*2,"edi"));
-	&or	("edx",&DWP(4*3,"edi"));
-
-	&add	("eax",&DWP(4*0,"ebp"));	# accumulate key
-	&adc	("ebx",&DWP(4*1,"ebp"));
-	&adc	("ecx",&DWP(4*2,"ebp"));
-	&adc	("edx",&DWP(4*3,"ebp"));
-
-	&mov	(&DWP(4*0,"edi"),"eax");
-	&mov	(&DWP(4*1,"edi"),"ebx");
-	&mov	(&DWP(4*2,"edi"),"ecx");
-	&mov	(&DWP(4*3,"edi"),"edx");
-&function_end("GFp_poly1305_emit");
-
-if ($sse2) {
-########################################################################
-# Layout of opaque area is following.
-#
-# unsigned __int32 h[5];		# current hash value base 2^26
-# unsigned __int32 is_base2_26;
-# unsigned __int32 r[4];		# key value base 2^32
-# unsigned __int32 pad[2];
-# struct { unsigned __int32 r^4, r^3, r^2, r^1; } r[9];
-#
-# where r^n are base 2^26 digits of degrees of multiplier key. There are
-# 5 digits, but last four are interleaved with multiples of 5, totalling
-# in 9 elements: r0, r1, 5*r1, r2, 5*r2, r3, 5*r3, r4, 5*r4.
-
-my ($D0,$D1,$D2,$D3,$D4,$T0,$T1,$T2)=map("xmm$_",(0..7));
-my $MASK=$T2;				# borrow and keep in mind
-
-&align	(32);
-&function_begin_B("_poly1305_init_sse2");
-	&movdqu	($D4,&QWP(4*6,"edi"));		# key base 2^32
-	&lea	("edi",&DWP(16*3,"edi"));	# size optimization
-	&mov	("ebp","esp");
-	&sub	("esp",16*(9+5));
-	&and	("esp",-16);
-
-	#&pand	($D4,&QWP(96,"ebx"));		# magic mask
-	&movq	($MASK,&QWP(64,"ebx"));
-
-	&movdqa	($D0,$D4);
-	&movdqa	($D1,$D4);
-	&movdqa	($D2,$D4);
-
-	&pand	($D0,$MASK);			# -> base 2^26
-	&psrlq	($D1,26);
-	&psrldq	($D2,6);
-	&pand	($D1,$MASK);
-	&movdqa	($D3,$D2);
-	&psrlq	($D2,4);
-	&psrlq	($D3,30);
-	&pand	($D2,$MASK);
-	&pand	($D3,$MASK);
-	&psrldq	($D4,13);
-
-	&lea	("edx",&DWP(16*9,"esp"));	# size optimization
-	&mov	("ecx",2);
-&set_label("square");
-	&movdqa	(&QWP(16*0,"esp"),$D0);
-	&movdqa	(&QWP(16*1,"esp"),$D1);
-	&movdqa	(&QWP(16*2,"esp"),$D2);
-	&movdqa	(&QWP(16*3,"esp"),$D3);
-	&movdqa	(&QWP(16*4,"esp"),$D4);
-
-	&movdqa	($T1,$D1);
-	&movdqa	($T0,$D2);
-	&pslld	($T1,2);
-	&pslld	($T0,2);
-	&paddd	($T1,$D1);			# *5
-	&paddd	($T0,$D2);			# *5
-	&movdqa	(&QWP(16*5,"esp"),$T1);
-	&movdqa	(&QWP(16*6,"esp"),$T0);
-	&movdqa	($T1,$D3);
-	&movdqa	($T0,$D4);
-	&pslld	($T1,2);
-	&pslld	($T0,2);
-	&paddd	($T1,$D3);			# *5
-	&paddd	($T0,$D4);			# *5
-	&movdqa	(&QWP(16*7,"esp"),$T1);
-	&movdqa	(&QWP(16*8,"esp"),$T0);
-
-	&pshufd	($T1,$D0,0b01000100);
-	&movdqa	($T0,$D1);
-	&pshufd	($D1,$D1,0b01000100);
-	&pshufd	($D2,$D2,0b01000100);
-	&pshufd	($D3,$D3,0b01000100);
-	&pshufd	($D4,$D4,0b01000100);
-	&movdqa	(&QWP(16*0,"edx"),$T1);
-	&movdqa	(&QWP(16*1,"edx"),$D1);
-	&movdqa	(&QWP(16*2,"edx"),$D2);
-	&movdqa	(&QWP(16*3,"edx"),$D3);
-	&movdqa	(&QWP(16*4,"edx"),$D4);
-
-	################################################################
-	# d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
-	# d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
-	# d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
-	# d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
-	# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
-
-	&pmuludq
($D4,$D0); # h4*r0 - &pmuludq ($D3,$D0); # h3*r0 - &pmuludq ($D2,$D0); # h2*r0 - &pmuludq ($D1,$D0); # h1*r0 - &pmuludq ($D0,$T1); # h0*r0 - -sub pmuladd { -my $load = shift; -my $base = shift; $base = "esp" if (!defined($base)); - - ################################################################ - # As for choice to "rotate" $T0-$T2 in order to move paddq - # past next multiplication. While it makes code harder to read - # and doesn't have significant effect on most processors, it - # makes a lot of difference on Atom, up to 30% improvement. - - &movdqa ($T1,$T0); - &pmuludq ($T0,&QWP(16*3,$base)); # r1*h3 - &movdqa ($T2,$T1); - &pmuludq ($T1,&QWP(16*2,$base)); # r1*h2 - &paddq ($D4,$T0); - &movdqa ($T0,$T2); - &pmuludq ($T2,&QWP(16*1,$base)); # r1*h1 - &paddq ($D3,$T1); - &$load ($T1,5); # s1 - &pmuludq ($T0,&QWP(16*0,$base)); # r1*h0 - &paddq ($D2,$T2); - &pmuludq ($T1,&QWP(16*4,$base)); # s1*h4 - &$load ($T2,2); # r2^n - &paddq ($D1,$T0); - - &movdqa ($T0,$T2); - &pmuludq ($T2,&QWP(16*2,$base)); # r2*h2 - &paddq ($D0,$T1); - &movdqa ($T1,$T0); - &pmuludq ($T0,&QWP(16*1,$base)); # r2*h1 - &paddq ($D4,$T2); - &$load ($T2,6); # s2^n - &pmuludq ($T1,&QWP(16*0,$base)); # r2*h0 - &paddq ($D3,$T0); - &movdqa ($T0,$T2); - &pmuludq ($T2,&QWP(16*4,$base)); # s2*h4 - &paddq ($D2,$T1); - &pmuludq ($T0,&QWP(16*3,$base)); # s2*h3 - &$load ($T1,3); # r3^n - &paddq ($D1,$T2); - - &movdqa ($T2,$T1); - &pmuludq ($T1,&QWP(16*1,$base)); # r3*h1 - &paddq ($D0,$T0); - &$load ($T0,7); # s3^n - &pmuludq ($T2,&QWP(16*0,$base)); # r3*h0 - &paddq ($D4,$T1); - &movdqa ($T1,$T0); - &pmuludq ($T0,&QWP(16*4,$base)); # s3*h4 - &paddq ($D3,$T2); - &movdqa ($T2,$T1); - &pmuludq ($T1,&QWP(16*3,$base)); # s3*h3 - &paddq ($D2,$T0); - &pmuludq ($T2,&QWP(16*2,$base)); # s3*h2 - &$load ($T0,4); # r4^n - &paddq ($D1,$T1); - - &$load ($T1,8); # s4^n - &pmuludq ($T0,&QWP(16*0,$base)); # r4*h0 - &paddq ($D0,$T2); - &movdqa ($T2,$T1); - &pmuludq ($T1,&QWP(16*4,$base)); # s4*h4 - &paddq ($D4,$T0); - &movdqa ($T0,$T2); - &pmuludq ($T2,&QWP(16*1,$base)); # s4*h1 - &paddq ($D3,$T1); - &movdqa ($T1,$T0); - &pmuludq ($T0,&QWP(16*2,$base)); # s4*h2 - &paddq ($D0,$T2); - &pmuludq ($T1,&QWP(16*3,$base)); # s4*h3 - &movdqa ($MASK,&QWP(64,"ebx")); - &paddq ($D1,$T0); - &paddq ($D2,$T1); -} - &pmuladd (sub { my ($reg,$i)=@_; - &movdqa ($reg,&QWP(16*$i,"esp")); - },"edx"); - -sub lazy_reduction { -my $extra = shift; - - ################################################################ - # lazy reduction as discussed in "NEON crypto" by D.J. Bernstein - # and P. 
Schwabe - # - # [(*) see discussion in poly1305-armv4 module] - - &movdqa ($T0,$D3); - &pand ($D3,$MASK); - &psrlq ($T0,26); - &$extra () if (defined($extra)); - &paddq ($T0,$D4); # h3 -> h4 - &movdqa ($T1,$D0); - &pand ($D0,$MASK); - &psrlq ($T1,26); - &movdqa ($D4,$T0); - &paddq ($T1,$D1); # h0 -> h1 - &psrlq ($T0,26); - &pand ($D4,$MASK); - &movdqa ($D1,$T1); - &psrlq ($T1,26); - &paddd ($D0,$T0); # favour paddd when - # possible, because - # paddq is "broken" - # on Atom - &psllq ($T0,2); - &paddq ($T1,$D2); # h1 -> h2 - &paddq ($T0,$D0); # h4 -> h0 (*) - &pand ($D1,$MASK); - &movdqa ($D2,$T1); - &psrlq ($T1,26); - &pand ($D2,$MASK); - &paddd ($T1,$D3); # h2 -> h3 - &movdqa ($D0,$T0); - &psrlq ($T0,26); - &movdqa ($D3,$T1); - &psrlq ($T1,26); - &pand ($D0,$MASK); - &paddd ($D1,$T0); # h0 -> h1 - &pand ($D3,$MASK); - &paddd ($D4,$T1); # h3 -> h4 -} - &lazy_reduction (); - - &dec ("ecx"); - &jz (&label("square_break")); - - &punpcklqdq ($D0,&QWP(16*0,"esp")); # 0:r^1:0:r^2 - &punpcklqdq ($D1,&QWP(16*1,"esp")); - &punpcklqdq ($D2,&QWP(16*2,"esp")); - &punpcklqdq ($D3,&QWP(16*3,"esp")); - &punpcklqdq ($D4,&QWP(16*4,"esp")); - &jmp (&label("square")); - -&set_label("square_break"); - &psllq ($D0,32); # -> r^3:0:r^4:0 - &psllq ($D1,32); - &psllq ($D2,32); - &psllq ($D3,32); - &psllq ($D4,32); - &por ($D0,&QWP(16*0,"esp")); # r^3:r^1:r^4:r^2 - &por ($D1,&QWP(16*1,"esp")); - &por ($D2,&QWP(16*2,"esp")); - &por ($D3,&QWP(16*3,"esp")); - &por ($D4,&QWP(16*4,"esp")); - - &pshufd ($D0,$D0,0b10001101); # -> r^1:r^2:r^3:r^4 - &pshufd ($D1,$D1,0b10001101); - &pshufd ($D2,$D2,0b10001101); - &pshufd ($D3,$D3,0b10001101); - &pshufd ($D4,$D4,0b10001101); - - &movdqu (&QWP(16*0,"edi"),$D0); # save the table - &movdqu (&QWP(16*1,"edi"),$D1); - &movdqu (&QWP(16*2,"edi"),$D2); - &movdqu (&QWP(16*3,"edi"),$D3); - &movdqu (&QWP(16*4,"edi"),$D4); - - &movdqa ($T1,$D1); - &movdqa ($T0,$D2); - &pslld ($T1,2); - &pslld ($T0,2); - &paddd ($T1,$D1); # *5 - &paddd ($T0,$D2); # *5 - &movdqu (&QWP(16*5,"edi"),$T1); - &movdqu (&QWP(16*6,"edi"),$T0); - &movdqa ($T1,$D3); - &movdqa ($T0,$D4); - &pslld ($T1,2); - &pslld ($T0,2); - &paddd ($T1,$D3); # *5 - &paddd ($T0,$D4); # *5 - &movdqu (&QWP(16*7,"edi"),$T1); - &movdqu (&QWP(16*8,"edi"),$T0); - - &mov ("esp","ebp"); - &lea ("edi",&DWP(-16*3,"edi")); # size de-optimization - &ret (); -&function_end_B("_poly1305_init_sse2"); - -&align (32); -&function_begin("_poly1305_blocks_sse2"); - &mov ("edi",&wparam(0)); # ctx - &mov ("esi",&wparam(1)); # inp - &mov ("ecx",&wparam(2)); # len - - &mov ("eax",&DWP(4*5,"edi")); # is_base2_26 - &and ("ecx",-16); - &jz (&label("nodata")); - &cmp ("ecx",64); - &jae (&label("enter_sse2")); - &test ("eax","eax"); # is_base2_26? - &jz (&label("enter_blocks")); - -&set_label("enter_sse2",16); - &call (&label("pic_point")); -&set_label("pic_point"); - &blindpop("ebx"); - &lea ("ebx",&DWP(&label("const_sse2")."-".&label("pic_point"),"ebx")); - - &test ("eax","eax"); # is_base2_26? 
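Backing up to the `lazy_reduction` helper defined above: it never reduces h fully, it only pushes each limb's overflow into the next limb (with the h4 carry multiplied by 5 on its way back into h0, computed as c + 4c via `psllq 2`), leaving every limb at worst a bit or two above 26 bits. A scalar Rust sketch of the same carry chain, with u64s standing in for the SSE2 lanes (illustrative only, not code from this file):

    // Mirrors lazy_reduction's order: h3->h4, h0->h1, h4->h0 (*5),
    // h1->h2, h2->h3, h0->h1, h3->h4.
    fn lazy_reduce(h: &mut [u64; 5]) {
        const MASK: u64 = (1 << 26) - 1;
        let mut c;
        c = h[3] >> 26; h[3] &= MASK; h[4] += c;
        c = h[0] >> 26; h[0] &= MASK; h[1] += c;
        c = h[4] >> 26; h[4] &= MASK; h[0] += c * 5; // 2^130 = 5 (mod p)
        c = h[1] >> 26; h[1] &= MASK; h[2] += c;
        c = h[2] >> 26; h[2] &= MASK; h[3] += c;
        c = h[0] >> 26; h[0] &= MASK; h[1] += c;
        c = h[3] >> 26; h[3] &= MASK; h[4] += c;
    }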
-	&jnz	(&label("base2_26"));
-
-	&call	("_poly1305_init_sse2");
-
-	################################################# base 2^32 -> base 2^26
-	&mov	("eax",&DWP(0,"edi"));
-	&mov	("ecx",&DWP(3,"edi"));
-	&mov	("edx",&DWP(6,"edi"));
-	&mov	("esi",&DWP(9,"edi"));
-	&mov	("ebp",&DWP(13,"edi"));
-	&mov	(&DWP(4*5,"edi"),1);		# is_base2_26
-
-	&shr	("ecx",2);
-	&and	("eax",0x3ffffff);
-	&shr	("edx",4);
-	&and	("ecx",0x3ffffff);
-	&shr	("esi",6);
-	&and	("edx",0x3ffffff);
-
-	&movd	($D0,"eax");
-	&movd	($D1,"ecx");
-	&movd	($D2,"edx");
-	&movd	($D3,"esi");
-	&movd	($D4,"ebp");
-
-	&mov	("esi",&wparam(1));		# [reload] inp
-	&mov	("ecx",&wparam(2));		# [reload] len
-	&jmp	(&label("base2_32"));
-
-&set_label("base2_26",16);
-	&movd	($D0,&DWP(4*0,"edi"));		# load hash value
-	&movd	($D1,&DWP(4*1,"edi"));
-	&movd	($D2,&DWP(4*2,"edi"));
-	&movd	($D3,&DWP(4*3,"edi"));
-	&movd	($D4,&DWP(4*4,"edi"));
-	&movdqa	($MASK,&QWP(64,"ebx"));
-
-&set_label("base2_32");
-	&mov	("eax",&wparam(3));		# padbit
-	&mov	("ebp","esp");
-
-	&sub	("esp",16*(5+5+5+9+9));
-	&and	("esp",-16);
-
-	&lea	("edi",&DWP(16*3,"edi"));	# size optimization
-	&shl	("eax",24);			# padbit
-
-	&test	("ecx",31);
-	&jz	(&label("even"));
-
-	################################################################
-	# process single block, with SSE2, because it's still faster
-	# even though half of result is discarded
-
-	&movdqu	($T1,&QWP(0,"esi"));		# input
-	&lea	("esi",&DWP(16,"esi"));
-
-	&movdqa	($T0,$T1);			# -> base 2^26 ...
-	&pand	($T1,$MASK);
-	&paddd	($D0,$T1);			# ... and accumulate
-
-	&movdqa	($T1,$T0);
-	&psrlq	($T0,26);
-	&psrldq	($T1,6);
-	&pand	($T0,$MASK);
-	&paddd	($D1,$T0);
-
-	&movdqa	($T0,$T1);
-	&psrlq	($T1,4);
-	&pand	($T1,$MASK);
-	&paddd	($D2,$T1);
-
-	&movdqa	($T1,$T0);
-	&psrlq	($T0,30);
-	&pand	($T0,$MASK);
-	&psrldq	($T1,7);
-	&paddd	($D3,$T0);
-
-	&movd	($T0,"eax");			# padbit
-	&paddd	($D4,$T1);
-	&movd	($T1,&DWP(16*0+12,"edi"));	# r0
-	&paddd	($D4,$T0);
-
-	&movdqa	(&QWP(16*0,"esp"),$D0);
-	&movdqa	(&QWP(16*1,"esp"),$D1);
-	&movdqa	(&QWP(16*2,"esp"),$D2);
-	&movdqa	(&QWP(16*3,"esp"),$D3);
-	&movdqa	(&QWP(16*4,"esp"),$D4);
-
-	################################################################
-	# d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
-	# d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
-	# d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
-	# d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
-	# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
-
-	&pmuludq	($D0,$T1);		# h4*r0
-	&pmuludq	($D1,$T1);		# h3*r0
-	&pmuludq	($D2,$T1);		# h2*r0
-	&movd	($T0,&DWP(16*1+12,"edi"));	# r1
-	&pmuludq	($D3,$T1);		# h1*r0
-	&pmuludq	($D4,$T1);		# h0*r0
-
-	&pmuladd	(sub {	my ($reg,$i)=@_;
-				&movd	($reg,&DWP(16*$i+12,"edi"));
-			});
-
-	&lazy_reduction	();
-
-	&sub	("ecx",16);
-	&jz	(&label("done"));
-
-&set_label("even");
-	&lea	("edx",&DWP(16*(5+5+5+9),"esp"));# size optimization
-	&lea	("eax",&DWP(-16*2,"esi"));
-	&sub	("ecx",64);
-
-	################################################################
-	# expand and copy pre-calculated table to stack
-
-	&movdqu	($T0,&QWP(16*0,"edi"));		# r^1:r^2:r^3:r^4
-	&pshufd	($T1,$T0,0b01000100);		# duplicate r^3:r^4
-	&cmovb	("esi","eax");
-	&pshufd	($T0,$T0,0b11101110);		# duplicate r^1:r^2
-	&movdqa	(&QWP(16*0,"edx"),$T1);
-	&lea	("eax",&DWP(16*10,"esp"));
-	&movdqu	($T1,&QWP(16*1,"edi"));
-	&movdqa	(&QWP(16*(0-9),"edx"),$T0);
-	&pshufd	($T0,$T1,0b01000100);
-	&pshufd	($T1,$T1,0b11101110);
-	&movdqa	(&QWP(16*1,"edx"),$T0);
-	&movdqu	($T0,&QWP(16*2,"edi"));
-	&movdqa	(&QWP(16*(1-9),"edx"),$T1);
-	&pshufd
($T1,$T0,0b01000100); - &pshufd ($T0,$T0,0b11101110); - &movdqa (&QWP(16*2,"edx"),$T1); - &movdqu ($T1,&QWP(16*3,"edi")); - &movdqa (&QWP(16*(2-9),"edx"),$T0); - &pshufd ($T0,$T1,0b01000100); - &pshufd ($T1,$T1,0b11101110); - &movdqa (&QWP(16*3,"edx"),$T0); - &movdqu ($T0,&QWP(16*4,"edi")); - &movdqa (&QWP(16*(3-9),"edx"),$T1); - &pshufd ($T1,$T0,0b01000100); - &pshufd ($T0,$T0,0b11101110); - &movdqa (&QWP(16*4,"edx"),$T1); - &movdqu ($T1,&QWP(16*5,"edi")); - &movdqa (&QWP(16*(4-9),"edx"),$T0); - &pshufd ($T0,$T1,0b01000100); - &pshufd ($T1,$T1,0b11101110); - &movdqa (&QWP(16*5,"edx"),$T0); - &movdqu ($T0,&QWP(16*6,"edi")); - &movdqa (&QWP(16*(5-9),"edx"),$T1); - &pshufd ($T1,$T0,0b01000100); - &pshufd ($T0,$T0,0b11101110); - &movdqa (&QWP(16*6,"edx"),$T1); - &movdqu ($T1,&QWP(16*7,"edi")); - &movdqa (&QWP(16*(6-9),"edx"),$T0); - &pshufd ($T0,$T1,0b01000100); - &pshufd ($T1,$T1,0b11101110); - &movdqa (&QWP(16*7,"edx"),$T0); - &movdqu ($T0,&QWP(16*8,"edi")); - &movdqa (&QWP(16*(7-9),"edx"),$T1); - &pshufd ($T1,$T0,0b01000100); - &pshufd ($T0,$T0,0b11101110); - &movdqa (&QWP(16*8,"edx"),$T1); - &movdqa (&QWP(16*(8-9),"edx"),$T0); - -sub load_input { -my ($inpbase,$offbase)=@_; - - &movdqu ($T0,&QWP($inpbase+0,"esi")); # load input - &movdqu ($T1,&QWP($inpbase+16,"esi")); - &lea ("esi",&DWP(16*2,"esi")); - - &movdqa (&QWP($offbase+16*2,"esp"),$D2); - &movdqa (&QWP($offbase+16*3,"esp"),$D3); - &movdqa (&QWP($offbase+16*4,"esp"),$D4); - - &movdqa ($D2,$T0); # splat input - &movdqa ($D3,$T1); - &psrldq ($D2,6); - &psrldq ($D3,6); - &movdqa ($D4,$T0); - &punpcklqdq ($D2,$D3); # 2:3 - &punpckhqdq ($D4,$T1); # 4 - &punpcklqdq ($T0,$T1); # 0:1 - - &movdqa ($D3,$D2); - &psrlq ($D2,4); - &psrlq ($D3,30); - &movdqa ($T1,$T0); - &psrlq ($D4,40); # 4 - &psrlq ($T1,26); - &pand ($T0,$MASK); # 0 - &pand ($T1,$MASK); # 1 - &pand ($D2,$MASK); # 2 - &pand ($D3,$MASK); # 3 - &por ($D4,&QWP(0,"ebx")); # padbit, yes, always - - &movdqa (&QWP($offbase+16*0,"esp"),$D0) if ($offbase); - &movdqa (&QWP($offbase+16*1,"esp"),$D1) if ($offbase); -} - &load_input (16*2,16*5); - - &jbe (&label("skip_loop")); - &jmp (&label("loop")); - -&set_label("loop",32); - ################################################################ - # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2 - # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r - # \___________________/ - # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2 - # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r - # \___________________/ \____________________/ - ################################################################ - - &movdqa ($T2,&QWP(16*(0-9),"edx")); # r0^2 - &movdqa (&QWP(16*1,"eax"),$T1); - &movdqa (&QWP(16*2,"eax"),$D2); - &movdqa (&QWP(16*3,"eax"),$D3); - &movdqa (&QWP(16*4,"eax"),$D4); - - ################################################################ - # d4 = h4*r0 + h0*r4 + h1*r3 + h2*r2 + h3*r1 - # d3 = h3*r0 + h0*r3 + h1*r2 + h2*r1 + h4*5*r4 - # d2 = h2*r0 + h0*r2 + h1*r1 + h3*5*r4 + h4*5*r3 - # d1 = h1*r0 + h0*r1 + h2*5*r4 + h3*5*r3 + h4*5*r2 - # d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1 - - &movdqa ($D1,$T0); - &pmuludq ($T0,$T2); # h0*r0 - &movdqa ($D0,$T1); - &pmuludq ($T1,$T2); # h1*r0 - &pmuludq ($D2,$T2); # h2*r0 - &pmuludq ($D3,$T2); # h3*r0 - &pmuludq ($D4,$T2); # h4*r0 - -sub pmuladd_alt { -my $addr = shift; - - &pmuludq ($D0,&$addr(8)); # h1*s4 - &movdqa ($T2,$D1); - &pmuludq ($D1,&$addr(1)); # h0*r1 - &paddq ($D0,$T0); - &movdqa ($T0,$T2); - &pmuludq ($T2,&$addr(2)); # h0*r2 - &paddq ($D1,$T1); - &movdqa ($T1,$T0); - 
&pmuludq ($T0,&$addr(3)); # h0*r3 - &paddq ($D2,$T2); - &movdqa ($T2,&QWP(16*1,"eax")); # pull h1 - &pmuludq ($T1,&$addr(4)); # h0*r4 - &paddq ($D3,$T0); - - &movdqa ($T0,$T2); - &pmuludq ($T2,&$addr(1)); # h1*r1 - &paddq ($D4,$T1); - &movdqa ($T1,$T0); - &pmuludq ($T0,&$addr(2)); # h1*r2 - &paddq ($D2,$T2); - &movdqa ($T2,&QWP(16*2,"eax")); # pull h2 - &pmuludq ($T1,&$addr(3)); # h1*r3 - &paddq ($D3,$T0); - &movdqa ($T0,$T2); - &pmuludq ($T2,&$addr(7)); # h2*s3 - &paddq ($D4,$T1); - &movdqa ($T1,$T0); - &pmuludq ($T0,&$addr(8)); # h2*s4 - &paddq ($D0,$T2); - - &movdqa ($T2,$T1); - &pmuludq ($T1,&$addr(1)); # h2*r1 - &paddq ($D1,$T0); - &movdqa ($T0,&QWP(16*3,"eax")); # pull h3 - &pmuludq ($T2,&$addr(2)); # h2*r2 - &paddq ($D3,$T1); - &movdqa ($T1,$T0); - &pmuludq ($T0,&$addr(6)); # h3*s2 - &paddq ($D4,$T2); - &movdqa ($T2,$T1); - &pmuludq ($T1,&$addr(7)); # h3*s3 - &paddq ($D0,$T0); - &movdqa ($T0,$T2); - &pmuludq ($T2,&$addr(8)); # h3*s4 - &paddq ($D1,$T1); - - &movdqa ($T1,&QWP(16*4,"eax")); # pull h4 - &pmuludq ($T0,&$addr(1)); # h3*r1 - &paddq ($D2,$T2); - &movdqa ($T2,$T1); - &pmuludq ($T1,&$addr(8)); # h4*s4 - &paddq ($D4,$T0); - &movdqa ($T0,$T2); - &pmuludq ($T2,&$addr(5)); # h4*s1 - &paddq ($D3,$T1); - &movdqa ($T1,$T0); - &pmuludq ($T0,&$addr(6)); # h4*s2 - &paddq ($D0,$T2); - &movdqa ($MASK,&QWP(64,"ebx")); - &pmuludq ($T1,&$addr(7)); # h4*s3 - &paddq ($D1,$T0); - &paddq ($D2,$T1); -} - &pmuladd_alt (sub { my $i=shift; &QWP(16*($i-9),"edx"); }); - - &load_input (-16*2,0); - &lea ("eax",&DWP(-16*2,"esi")); - &sub ("ecx",64); - - &paddd ($T0,&QWP(16*(5+0),"esp")); # add hash value - &paddd ($T1,&QWP(16*(5+1),"esp")); - &paddd ($D2,&QWP(16*(5+2),"esp")); - &paddd ($D3,&QWP(16*(5+3),"esp")); - &paddd ($D4,&QWP(16*(5+4),"esp")); - - &cmovb ("esi","eax"); - &lea ("eax",&DWP(16*10,"esp")); - - &movdqa ($T2,&QWP(16*0,"edx")); # r0^4 - &movdqa (&QWP(16*1,"esp"),$D1); - &movdqa (&QWP(16*1,"eax"),$T1); - &movdqa (&QWP(16*2,"eax"),$D2); - &movdqa (&QWP(16*3,"eax"),$D3); - &movdqa (&QWP(16*4,"eax"),$D4); - - ################################################################ - # d4 += h4*r0 + h0*r4 + h1*r3 + h2*r2 + h3*r1 - # d3 += h3*r0 + h0*r3 + h1*r2 + h2*r1 + h4*5*r4 - # d2 += h2*r0 + h0*r2 + h1*r1 + h3*5*r4 + h4*5*r3 - # d1 += h1*r0 + h0*r1 + h2*5*r4 + h3*5*r3 + h4*5*r2 - # d0 += h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1 - - &movdqa ($D1,$T0); - &pmuludq ($T0,$T2); # h0*r0 - &paddq ($T0,$D0); - &movdqa ($D0,$T1); - &pmuludq ($T1,$T2); # h1*r0 - &pmuludq ($D2,$T2); # h2*r0 - &pmuludq ($D3,$T2); # h3*r0 - &pmuludq ($D4,$T2); # h4*r0 - - &paddq ($T1,&QWP(16*1,"esp")); - &paddq ($D2,&QWP(16*2,"esp")); - &paddq ($D3,&QWP(16*3,"esp")); - &paddq ($D4,&QWP(16*4,"esp")); - - &pmuladd_alt (sub { my $i=shift; &QWP(16*$i,"edx"); }); - - &lazy_reduction (); - - &load_input (16*2,16*5); - - &ja (&label("loop")); - -&set_label("skip_loop"); - ################################################################ - # multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1 - - &pshufd ($T2,&QWP(16*(0-9),"edx"),0x10);# r0^n - &add ("ecx",32); - &jnz (&label("long_tail")); - - &paddd ($T0,$D0); # add hash value - &paddd ($T1,$D1); - &paddd ($D2,&QWP(16*7,"esp")); - &paddd ($D3,&QWP(16*8,"esp")); - &paddd ($D4,&QWP(16*9,"esp")); - -&set_label("long_tail"); - - &movdqa (&QWP(16*0,"eax"),$T0); - &movdqa (&QWP(16*1,"eax"),$T1); - &movdqa (&QWP(16*2,"eax"),$D2); - &movdqa (&QWP(16*3,"eax"),$D3); - &movdqa (&QWP(16*4,"eax"),$D4); - - ################################################################ - # d4 = h4*r0 + h3*r1 + 
h2*r2 + h1*r3 + h0*r4 - # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 - # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 - # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 - # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 - - &pmuludq ($T0,$T2); # h0*r0 - &pmuludq ($T1,$T2); # h1*r0 - &pmuludq ($D2,$T2); # h2*r0 - &movdqa ($D0,$T0); - &pshufd ($T0,&QWP(16*(1-9),"edx"),0x10);# r1^n - &pmuludq ($D3,$T2); # h3*r0 - &movdqa ($D1,$T1); - &pmuludq ($D4,$T2); # h4*r0 - - &pmuladd (sub { my ($reg,$i)=@_; - &pshufd ($reg,&QWP(16*($i-9),"edx"),0x10); - },"eax"); - - &jz (&label("short_tail")); - - &load_input (-16*2,0); - - &pshufd ($T2,&QWP(16*0,"edx"),0x10); # r0^n - &paddd ($T0,&QWP(16*5,"esp")); # add hash value - &paddd ($T1,&QWP(16*6,"esp")); - &paddd ($D2,&QWP(16*7,"esp")); - &paddd ($D3,&QWP(16*8,"esp")); - &paddd ($D4,&QWP(16*9,"esp")); - - ################################################################ - # multiply inp[0:1] by r^4:r^3 and accumulate - - &movdqa (&QWP(16*0,"esp"),$T0); - &pmuludq ($T0,$T2); # h0*r0 - &movdqa (&QWP(16*1,"esp"),$T1); - &pmuludq ($T1,$T2); # h1*r0 - &paddq ($D0,$T0); - &movdqa ($T0,$D2); - &pmuludq ($D2,$T2); # h2*r0 - &paddq ($D1,$T1); - &movdqa ($T1,$D3); - &pmuludq ($D3,$T2); # h3*r0 - &paddq ($D2,&QWP(16*2,"esp")); - &movdqa (&QWP(16*2,"esp"),$T0); - &pshufd ($T0,&QWP(16*1,"edx"),0x10); # r1^n - &paddq ($D3,&QWP(16*3,"esp")); - &movdqa (&QWP(16*3,"esp"),$T1); - &movdqa ($T1,$D4); - &pmuludq ($D4,$T2); # h4*r0 - &paddq ($D4,&QWP(16*4,"esp")); - &movdqa (&QWP(16*4,"esp"),$T1); - - &pmuladd (sub { my ($reg,$i)=@_; - &pshufd ($reg,&QWP(16*$i,"edx"),0x10); - }); - -&set_label("short_tail"); - - ################################################################ - # horizontal addition - - &pshufd ($T1,$D4,0b01001110); - &pshufd ($T0,$D3,0b01001110); - &paddq ($D4,$T1); - &paddq ($D3,$T0); - &pshufd ($T1,$D0,0b01001110); - &pshufd ($T0,$D1,0b01001110); - &paddq ($D0,$T1); - &paddq ($D1,$T0); - &pshufd ($T1,$D2,0b01001110); - #&paddq ($D2,$T1); - - &lazy_reduction (sub { &paddq ($D2,$T1) }); - -&set_label("done"); - &movd (&DWP(-16*3+4*0,"edi"),$D0); # store hash value - &movd (&DWP(-16*3+4*1,"edi"),$D1); - &movd (&DWP(-16*3+4*2,"edi"),$D2); - &movd (&DWP(-16*3+4*3,"edi"),$D3); - &movd (&DWP(-16*3+4*4,"edi"),$D4); - &mov ("esp","ebp"); -&set_label("nodata"); -&function_end("_poly1305_blocks_sse2"); - -&align (32); -&function_begin("_poly1305_emit_sse2"); - &mov ("ebp",&wparam(0)); # context - - &cmp (&DWP(4*5,"ebp"),0); # is_base2_26? 
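When is_base2_26 is set, the emit path that follows first converts the base 2^26 hash back to base 2^32 (the shl/shr/adc chain), the inverse of the split done at `enter_sse2`. Both directions, sketched with plain integer arithmetic in Rust (illustrative helpers, not code from this file):

    const MASK26: u32 = (1 << 26) - 1;

    // base 2^32 (four words plus a small fifth word) -> five 26-bit limbs
    fn to_base2_26(h: [u32; 5]) -> [u32; 5] {
        let lo = (h[0] as u128)
            | (h[1] as u128) << 32
            | (h[2] as u128) << 64
            | (h[3] as u128) << 96;
        [
            (lo as u32) & MASK26,
            ((lo >> 26) as u32) & MASK26,
            ((lo >> 52) as u32) & MASK26,
            ((lo >> 78) as u32) & MASK26,
            ((lo >> 104) as u32) | (h[4] << 24),
        ]
    }

    // five 26-bit limbs -> base 2^32; the shift counts 26/20/14/8 are the
    // same ones the shl/shr pairs in the emit code use
    fn to_base2_32(h: [u32; 5]) -> ([u32; 4], u32) {
        let mut acc = h[0] as u64 + ((h[1] as u64) << 26);
        let w0 = acc as u32;
        acc = (acc >> 32) + ((h[2] as u64) << 20);
        let w1 = acc as u32;
        acc = (acc >> 32) + ((h[3] as u64) << 14);
        let w2 = acc as u32;
        acc = (acc >> 32) + ((h[4] as u64) << 8);
        let w3 = acc as u32;
        ([w0, w1, w2, w3], (acc >> 32) as u32) // top word may still need a mod-p fold
    }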
-	&je	(&label("enter_emit"));
-
-	&mov	("eax",&DWP(4*0,"ebp"));	# load hash value
-	&mov	("edi",&DWP(4*1,"ebp"));
-	&mov	("ecx",&DWP(4*2,"ebp"));
-	&mov	("edx",&DWP(4*3,"ebp"));
-	&mov	("esi",&DWP(4*4,"ebp"));
-
-	&mov	("ebx","edi");			# base 2^26 -> base 2^32
-	&shl	("edi",26);
-	&shr	("ebx",6);
-	&add	("eax","edi");
-	&mov	("edi","ecx");
-	&adc	("ebx",0);
-
-	&shl	("edi",20);
-	&shr	("ecx",12);
-	&add	("ebx","edi");
-	&mov	("edi","edx");
-	&adc	("ecx",0);
-
-	&shl	("edi",14);
-	&shr	("edx",18);
-	&add	("ecx","edi");
-	&mov	("edi","esi");
-	&adc	("edx",0);
-
-	&shl	("edi",8);
-	&shr	("esi",24);
-	&add	("edx","edi");
-	&adc	("esi",0);			# can be partially reduced
-
-	&mov	("edi","esi");			# final reduction
-	&and	("esi",3);
-	&shr	("edi",2);
-	&lea	("ebp",&DWP(0,"edi","edi",4));	# *5
-	&mov	("edi",&wparam(1));		# output
-	&add	("eax","ebp");
-	&mov	("ebp",&wparam(2));		# key
-	&adc	("ebx",0);
-	&adc	("ecx",0);
-	&adc	("edx",0);
-	&adc	("esi",0);
-
-	&movd	($D0,"eax");			# offload original hash value
-	&add	("eax",5);			# compare to modulus
-	&movd	($D1,"ebx");
-	&adc	("ebx",0);
-	&movd	($D2,"ecx");
-	&adc	("ecx",0);
-	&movd	($D3,"edx");
-	&adc	("edx",0);
-	&adc	("esi",0);
-	&shr	("esi",2);			# did it carry/borrow?
-
-	&neg	("esi");			# do we choose (hash-modulus) ...
-	&and	("eax","esi");
-	&and	("ebx","esi");
-	&and	("ecx","esi");
-	&and	("edx","esi");
-	&mov	(&DWP(4*0,"edi"),"eax");
-	&movd	("eax",$D0);
-	&mov	(&DWP(4*1,"edi"),"ebx");
-	&movd	("ebx",$D1);
-	&mov	(&DWP(4*2,"edi"),"ecx");
-	&movd	("ecx",$D2);
-	&mov	(&DWP(4*3,"edi"),"edx");
-	&movd	("edx",$D3);
-
-	&not	("esi");			# ... or original hash value?
-	&and	("eax","esi");
-	&and	("ebx","esi");
-	&or	("eax",&DWP(4*0,"edi"));
-	&and	("ecx","esi");
-	&or	("ebx",&DWP(4*1,"edi"));
-	&and	("edx","esi");
-	&or	("ecx",&DWP(4*2,"edi"));
-	&or	("edx",&DWP(4*3,"edi"));
-
-	&add	("eax",&DWP(4*0,"ebp"));	# accumulate key
-	&adc	("ebx",&DWP(4*1,"ebp"));
-	&mov	(&DWP(4*0,"edi"),"eax");
-	&adc	("ecx",&DWP(4*2,"ebp"));
-	&mov	(&DWP(4*1,"edi"),"ebx");
-	&adc	("edx",&DWP(4*3,"ebp"));
-	&mov	(&DWP(4*2,"edi"),"ecx");
-	&mov	(&DWP(4*3,"edi"),"edx");
-&function_end("_poly1305_emit_sse2");
-
-# The AVX2 code was removed.
-
-&set_label("const_sse2",64);
-	&data_word(1<<24,0,	1<<24,0,	1<<24,0,	1<<24,0);
-	&data_word(0,0,	0,0,	0,0,	0,0);
-	&data_word(0x03ffffff,0,0x03ffffff,0,	0x03ffffff,0,	0x03ffffff,0);
-	&data_word(0x0fffffff,0x0ffffffc,0x0ffffffc,0x0ffffffc);
-}
-&asciz	("Poly1305 for x86, CRYPTOGAMS by ");
-&align	(4);
-
-&asm_finish();
-
-close STDOUT or die "error closing STDOUT";
diff --git a/crypto/poly1305/asm/poly1305-x86_64.pl b/crypto/poly1305/asm/poly1305-x86_64.pl
deleted file mode 100755
index d1b547084a..0000000000
--- a/crypto/poly1305/asm/poly1305-x86_64.pl
+++ /dev/null
@@ -1,2243 +0,0 @@
-#!/usr/bin/env perl
-#
-# ====================================================================
-# Written by Andy Polyakov for the OpenSSL
-# project. The module is, however, dual licensed under OpenSSL and
-# CRYPTOGAMS licenses depending on where you obtain it. For further
-# details see http://www.openssl.org/~appro/cryptogams/.
-# ====================================================================
-#
-# This module implements Poly1305 hash for x86_64.
-#
-# March 2015
-#
-# Numbers are cycles per processed byte with poly1305_blocks alone,
-# measured with rdtsc at fixed clock frequency.
-# -# IALU/gcc-4.8(*) AVX(**) AVX2 -# P4 4.46/+120% - -# Core 2 2.41/+90% - -# Westmere 1.88/+120% - -# Sandy Bridge 1.39/+140% 1.10 -# Haswell 1.14/+175% 1.11 0.65 -# Skylake 1.13/+120% 0.96 0.51 -# Silvermont 2.83/+95% - -# VIA Nano 1.82/+150% - -# Sledgehammer 1.38/+160% - -# Bulldozer 2.30/+130% 0.97 -# -# (*) improvement coefficients relative to clang are more modest and -# are ~50% on most processors, in both cases we are comparing to -# __int128 code; -# (**) SSE2 implementation was attempted, but among non-AVX processors -# it was faster than integer-only code only on older Intel P4 and -# Core processors, 50-30%, less newer processor is, but slower on -# contemporary ones, for example almost 2x slower on Atom, and as -# former are naturally disappearing, SSE2 is deemed unnecessary; - -$flavour = shift; -$output = shift; -if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } - -$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); - -$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; -( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or -( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or -die "can't locate x86_64-xlate.pl"; - -$avx = 2; - -open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; -*STDOUT=*OUT; - -my ($ctx,$inp,$len,$padbit)=("%rdi","%rsi","%rdx","%rcx"); -my ($mac,$nonce)=($inp,$len); # *_emit arguments -my ($d1,$d2,$d3, $r0,$r1,$s1)=map("%r$_",(8..13)); -my ($h0,$h1,$h2)=("%r14","%rbx","%rbp"); - -sub poly1305_iteration { -# input: copy of $r1 in %rax, $h0-$h2, $r0-$r1 -# output: $h0-$h2 *= $r0-$r1 -$code.=<<___; - mulq $h0 # h0*r1 - mov %rax,$d2 - mov $r0,%rax - mov %rdx,$d3 - - mulq $h0 # h0*r0 - mov %rax,$h0 # future $h0 - mov $r0,%rax - mov %rdx,$d1 - - mulq $h1 # h1*r0 - add %rax,$d2 - mov $s1,%rax - adc %rdx,$d3 - - mulq $h1 # h1*s1 - mov $h2,$h1 # borrow $h1 - add %rax,$h0 - adc %rdx,$d1 - - imulq $s1,$h1 # h2*s1 - add $h1,$d2 - mov $d1,$h1 - adc \$0,$d3 - - imulq $r0,$h2 # h2*r0 - add $d2,$h1 - mov \$-4,%rax # mask value - adc $h2,$d3 - - and $d3,%rax # last reduction step - mov $d3,$h2 - shr \$2,$d3 - and \$3,$h2 - add $d3,%rax - add %rax,$h0 - adc \$0,$h1 - adc \$0,$h2 -___ -} - -######################################################################## -# Layout of opaque area is following. -# -# unsigned __int64 h[3]; # current hash value base 2^64 -# unsigned __int64 r[2]; # key value base 2^64 - -$code.=<<___; -.text - -.extern GFp_ia32cap_P - -.globl GFp_poly1305_init_asm -.hidden GFp_poly1305_init_asm -.globl GFp_poly1305_blocks -.hidden GFp_poly1305_blocks -.globl GFp_poly1305_emit -.hidden GFp_poly1305_emit - -.type GFp_poly1305_init_asm,\@function,3 -.align 32 -GFp_poly1305_init_asm: - xor %rax,%rax - mov %rax,0($ctx) # initialize hash value - mov %rax,8($ctx) - mov %rax,16($ctx) - - cmp \$0,$inp - je .Lno_key - - lea GFp_poly1305_blocks(%rip),%r10 - lea GFp_poly1305_emit(%rip),%r11 -___ -$code.=<<___ if ($avx); - mov GFp_ia32cap_P+4(%rip),%r9 - lea poly1305_blocks_avx(%rip),%rax - lea poly1305_emit_avx(%rip),%rcx - bt \$`60-32`,%r9 # AVX? - cmovc %rax,%r10 - cmovc %rcx,%r11 -___ -$code.=<<___ if ($avx>1); - lea poly1305_blocks_avx2(%rip),%rax - bt \$`5+32`,%r9 # AVX2? 
- cmovc %rax,%r10 -___ -$code.=<<___; - mov \$0x0ffffffc0fffffff,%rax - mov \$0x0ffffffc0ffffffc,%rcx - and 0($inp),%rax - and 8($inp),%rcx - mov %rax,24($ctx) - mov %rcx,32($ctx) -___ -$code.=<<___ if ($flavour !~ /elf32/); - mov %r10,0(%rdx) - mov %r11,8(%rdx) -___ -$code.=<<___ if ($flavour =~ /elf32/); - mov %r10d,0(%rdx) - mov %r11d,4(%rdx) -___ -$code.=<<___; - mov \$1,%eax -.Lno_key: - ret -.size GFp_poly1305_init_asm,.-GFp_poly1305_init_asm - -.type GFp_poly1305_blocks,\@function,4 -.align 32 -GFp_poly1305_blocks: -.Lblocks: - shr \$4,$len - jz .Lno_data # too short - - push %rbx - push %rbp - push %r12 - push %r13 - push %r14 - push %r15 -.Lblocks_body: - - mov $len,%r15 # reassign $len - - mov 24($ctx),$r0 # load r - mov 32($ctx),$s1 - - mov 0($ctx),$h0 # load hash value - mov 8($ctx),$h1 - mov 16($ctx),$h2 - - mov $s1,$r1 - shr \$2,$s1 - mov $r1,%rax - add $r1,$s1 # s1 = r1 + (r1 >> 2) - jmp .Loop - -.align 32 -.Loop: - add 0($inp),$h0 # accumulate input - adc 8($inp),$h1 - lea 16($inp),$inp - adc $padbit,$h2 -___ - &poly1305_iteration(); -$code.=<<___; - mov $r1,%rax - dec %r15 # len-=16 - jnz .Loop - - mov $h0,0($ctx) # store hash value - mov $h1,8($ctx) - mov $h2,16($ctx) - - mov 0(%rsp),%r15 - mov 8(%rsp),%r14 - mov 16(%rsp),%r13 - mov 24(%rsp),%r12 - mov 32(%rsp),%rbp - mov 40(%rsp),%rbx - lea 48(%rsp),%rsp -.Lno_data: -.Lblocks_epilogue: - ret -.size GFp_poly1305_blocks,.-GFp_poly1305_blocks - -.type GFp_poly1305_emit,\@function,3 -.align 32 -GFp_poly1305_emit: -.Lemit: - mov 0($ctx),%r8 # load hash value - mov 8($ctx),%r9 - mov 16($ctx),%r10 - - mov %r8,%rax - add \$5,%r8 # compare to modulus - mov %r9,%rcx - adc \$0,%r9 - adc \$0,%r10 - shr \$2,%r10 # did 130-bit value overfow? - cmovnz %r8,%rax - cmovnz %r9,%rcx - - add 0($nonce),%rax # accumulate nonce - adc 8($nonce),%rcx - mov %rax,0($mac) # write result - mov %rcx,8($mac) - - ret -.size GFp_poly1305_emit,.-GFp_poly1305_emit -___ -if ($avx) { - -######################################################################## -# Layout of opaque area is following. -# -# unsigned __int32 h[5]; # current hash value base 2^26 -# unsigned __int32 is_base2_26; -# unsigned __int64 r[2]; # key value base 2^64 -# unsigned __int64 pad; -# struct { unsigned __int32 r^2, r^1, r^4, r^3; } r[9]; -# -# where r^n are base 2^26 digits of degrees of multiplier key. There are -# 5 digits, but last four are interleaved with multiples of 5, totalling -# in 9 elements: r0, r1, 5*r1, r2, 5*r2, r3, 5*r3, r4, 5*r4. 
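The table described in the comment above is filled by computing r^2, r^3 and r^4 with __poly1305_block (below) and then splitting each power into base 2^26 digits. A hedged Rust model of that multiply-and-reduce on base 2^64 limbs, and of the power ladder, follows; h[2] is assumed to be only a few bits wide, as the partial reduction guarantees, and everything here is illustrative rather than code from this file.

    // h = h * r mod 2^130 - 5, with h as three 64-bit limbs (small h[2]).
    fn mul_mod_p(h: [u64; 3], r: [u64; 2]) -> [u64; 3] {
        // clamping makes r[1] a multiple of 4, so s1 = r[1] * 5/4; using s1
        // for 2^128-weighted partial products folds in 2^130 = 5 (mod p)
        let s1 = r[1] + (r[1] >> 2);

        let t0 = (h[0] as u128) * (r[0] as u128) + (h[1] as u128) * (s1 as u128);
        let t1 = (h[0] as u128) * (r[1] as u128)
            + (h[1] as u128) * (r[0] as u128)
            + (h[2] as u128) * (s1 as u128)
            + (t0 >> 64);
        let t2 = (h[2] as u128) * (r[0] as u128) + (t1 >> 64);

        // last reduction step: keep two bits at weight 2^128, multiply the
        // rest by 5 and fold it back into the low limb (the "and $-4 /
        // shr $2 / add" dance in the scalar code)
        let folded = (t0 as u64 as u128) + (t2 >> 2) * 5;
        let mid = (t1 as u64 as u128) + (folded >> 64);
        [folded as u64, mid as u64, ((t2 as u64) & 3) + (mid >> 64) as u64]
    }

    // the r^1..r^4 ladder that __poly1305_init_avx stores in base 2^26
    fn r_powers(r: [u64; 2]) -> [[u64; 3]; 4] {
        let r1 = [r[0], r[1], 0];
        let r2 = mul_mod_p(r1, r);
        let r3 = mul_mod_p(r2, r);
        let r4 = mul_mod_p(r3, r);
        [r1, r2, r3, r4]
    }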
- -my ($H0,$H1,$H2,$H3,$H4, $T0,$T1,$T2,$T3,$T4, $D0,$D1,$D2,$D3,$D4, $MASK) = - map("%xmm$_",(0..15)); - -$code.=<<___; -.type __poly1305_block,\@abi-omnipotent -.align 32 -__poly1305_block: -___ - &poly1305_iteration(); -$code.=<<___; - ret -.size __poly1305_block,.-__poly1305_block - -.type __poly1305_init_avx,\@abi-omnipotent -.align 32 -__poly1305_init_avx: - mov $r0,$h0 - mov $r1,$h1 - xor $h2,$h2 - - lea 48+64($ctx),$ctx # size optimization - - mov $r1,%rax - call __poly1305_block # r^2 - - mov \$0x3ffffff,%eax # save interleaved r^2 and r base 2^26 - mov \$0x3ffffff,%edx - mov $h0,$d1 - and $h0#d,%eax - mov $r0,$d2 - and $r0#d,%edx - mov %eax,`16*0+0-64`($ctx) - shr \$26,$d1 - mov %edx,`16*0+4-64`($ctx) - shr \$26,$d2 - - mov \$0x3ffffff,%eax - mov \$0x3ffffff,%edx - and $d1#d,%eax - and $d2#d,%edx - mov %eax,`16*1+0-64`($ctx) - lea (%rax,%rax,4),%eax # *5 - mov %edx,`16*1+4-64`($ctx) - lea (%rdx,%rdx,4),%edx # *5 - mov %eax,`16*2+0-64`($ctx) - shr \$26,$d1 - mov %edx,`16*2+4-64`($ctx) - shr \$26,$d2 - - mov $h1,%rax - mov $r1,%rdx - shl \$12,%rax - shl \$12,%rdx - or $d1,%rax - or $d2,%rdx - and \$0x3ffffff,%eax - and \$0x3ffffff,%edx - mov %eax,`16*3+0-64`($ctx) - lea (%rax,%rax,4),%eax # *5 - mov %edx,`16*3+4-64`($ctx) - lea (%rdx,%rdx,4),%edx # *5 - mov %eax,`16*4+0-64`($ctx) - mov $h1,$d1 - mov %edx,`16*4+4-64`($ctx) - mov $r1,$d2 - - mov \$0x3ffffff,%eax - mov \$0x3ffffff,%edx - shr \$14,$d1 - shr \$14,$d2 - and $d1#d,%eax - and $d2#d,%edx - mov %eax,`16*5+0-64`($ctx) - lea (%rax,%rax,4),%eax # *5 - mov %edx,`16*5+4-64`($ctx) - lea (%rdx,%rdx,4),%edx # *5 - mov %eax,`16*6+0-64`($ctx) - shr \$26,$d1 - mov %edx,`16*6+4-64`($ctx) - shr \$26,$d2 - - mov $h2,%rax - shl \$24,%rax - or %rax,$d1 - mov $d1#d,`16*7+0-64`($ctx) - lea ($d1,$d1,4),$d1 # *5 - mov $d2#d,`16*7+4-64`($ctx) - lea ($d2,$d2,4),$d2 # *5 - mov $d1#d,`16*8+0-64`($ctx) - mov $d2#d,`16*8+4-64`($ctx) - - mov $r1,%rax - call __poly1305_block # r^3 - - mov \$0x3ffffff,%eax # save r^3 base 2^26 - mov $h0,$d1 - and $h0#d,%eax - shr \$26,$d1 - mov %eax,`16*0+12-64`($ctx) - - mov \$0x3ffffff,%edx - and $d1#d,%edx - mov %edx,`16*1+12-64`($ctx) - lea (%rdx,%rdx,4),%edx # *5 - shr \$26,$d1 - mov %edx,`16*2+12-64`($ctx) - - mov $h1,%rax - shl \$12,%rax - or $d1,%rax - and \$0x3ffffff,%eax - mov %eax,`16*3+12-64`($ctx) - lea (%rax,%rax,4),%eax # *5 - mov $h1,$d1 - mov %eax,`16*4+12-64`($ctx) - - mov \$0x3ffffff,%edx - shr \$14,$d1 - and $d1#d,%edx - mov %edx,`16*5+12-64`($ctx) - lea (%rdx,%rdx,4),%edx # *5 - shr \$26,$d1 - mov %edx,`16*6+12-64`($ctx) - - mov $h2,%rax - shl \$24,%rax - or %rax,$d1 - mov $d1#d,`16*7+12-64`($ctx) - lea ($d1,$d1,4),$d1 # *5 - mov $d1#d,`16*8+12-64`($ctx) - - mov $r1,%rax - call __poly1305_block # r^4 - - mov \$0x3ffffff,%eax # save r^4 base 2^26 - mov $h0,$d1 - and $h0#d,%eax - shr \$26,$d1 - mov %eax,`16*0+8-64`($ctx) - - mov \$0x3ffffff,%edx - and $d1#d,%edx - mov %edx,`16*1+8-64`($ctx) - lea (%rdx,%rdx,4),%edx # *5 - shr \$26,$d1 - mov %edx,`16*2+8-64`($ctx) - - mov $h1,%rax - shl \$12,%rax - or $d1,%rax - and \$0x3ffffff,%eax - mov %eax,`16*3+8-64`($ctx) - lea (%rax,%rax,4),%eax # *5 - mov $h1,$d1 - mov %eax,`16*4+8-64`($ctx) - - mov \$0x3ffffff,%edx - shr \$14,$d1 - and $d1#d,%edx - mov %edx,`16*5+8-64`($ctx) - lea (%rdx,%rdx,4),%edx # *5 - shr \$26,$d1 - mov %edx,`16*6+8-64`($ctx) - - mov $h2,%rax - shl \$24,%rax - or %rax,$d1 - mov $d1#d,`16*7+8-64`($ctx) - lea ($d1,$d1,4),$d1 # *5 - mov $d1#d,`16*8+8-64`($ctx) - - lea -48-64($ctx),$ctx # size [de-]optimization - ret -.size 
__poly1305_init_avx,.-__poly1305_init_avx - -.type poly1305_blocks_avx,\@function,4 -.align 32 -poly1305_blocks_avx: - mov 20($ctx),%r8d # is_base2_26 - cmp \$128,$len - jae .Lblocks_avx - test %r8d,%r8d - jz .Lblocks - -.Lblocks_avx: - and \$-16,$len - jz .Lno_data_avx - - vzeroupper - - test %r8d,%r8d - jz .Lbase2_64_avx - - test \$31,$len - jz .Leven_avx - - push %rbx - push %rbp - push %r12 - push %r13 - push %r14 - push %r15 -.Lblocks_avx_body: - - mov $len,%r15 # reassign $len - - mov 0($ctx),$d1 # load hash value - mov 8($ctx),$d2 - mov 16($ctx),$h2#d - - mov 24($ctx),$r0 # load r - mov 32($ctx),$s1 - - ################################# base 2^26 -> base 2^64 - mov $d1#d,$h0#d - and \$`-1*(1<<31)`,$d1 - mov $d2,$r1 # borrow $r1 - mov $d2#d,$h1#d - and \$`-1*(1<<31)`,$d2 - - shr \$6,$d1 - shl \$52,$r1 - add $d1,$h0 - shr \$12,$h1 - shr \$18,$d2 - add $r1,$h0 - adc $d2,$h1 - - mov $h2,$d1 - shl \$40,$d1 - shr \$24,$h2 - add $d1,$h1 - adc \$0,$h2 # can be partially reduced... - - mov \$-4,$d2 # ... so reduce - mov $h2,$d1 - and $h2,$d2 - shr \$2,$d1 - and \$3,$h2 - add $d2,$d1 # =*5 - add $d1,$h0 - adc \$0,$h1 - adc \$0,$h2 - - mov $s1,$r1 - mov $s1,%rax - shr \$2,$s1 - add $r1,$s1 # s1 = r1 + (r1 >> 2) - - add 0($inp),$h0 # accumulate input - adc 8($inp),$h1 - lea 16($inp),$inp - adc $padbit,$h2 - - call __poly1305_block - - test $padbit,$padbit # if $padbit is zero, - jz .Lstore_base2_64_avx # store hash in base 2^64 format - - ################################# base 2^64 -> base 2^26 - mov $h0,%rax - mov $h0,%rdx - shr \$52,$h0 - mov $h1,$r0 - mov $h1,$r1 - shr \$26,%rdx - and \$0x3ffffff,%rax # h[0] - shl \$12,$r0 - and \$0x3ffffff,%rdx # h[1] - shr \$14,$h1 - or $r0,$h0 - shl \$24,$h2 - and \$0x3ffffff,$h0 # h[2] - shr \$40,$r1 - and \$0x3ffffff,$h1 # h[3] - or $r1,$h2 # h[4] - - sub \$16,%r15 - jz .Lstore_base2_26_avx - - vmovd %rax#d,$H0 - vmovd %rdx#d,$H1 - vmovd $h0#d,$H2 - vmovd $h1#d,$H3 - vmovd $h2#d,$H4 - jmp .Lproceed_avx - -.align 32 -.Lstore_base2_64_avx: - mov $h0,0($ctx) - mov $h1,8($ctx) - mov $h2,16($ctx) # note that is_base2_26 is zeroed - jmp .Ldone_avx - -.align 16 -.Lstore_base2_26_avx: - mov %rax#d,0($ctx) # store hash value base 2^26 - mov %rdx#d,4($ctx) - mov $h0#d,8($ctx) - mov $h1#d,12($ctx) - mov $h2#d,16($ctx) -.align 16 -.Ldone_avx: - mov 0(%rsp),%r15 - mov 8(%rsp),%r14 - mov 16(%rsp),%r13 - mov 24(%rsp),%r12 - mov 32(%rsp),%rbp - mov 40(%rsp),%rbx - lea 48(%rsp),%rsp -.Lno_data_avx: -.Lblocks_avx_epilogue: - ret - -.align 32 -.Lbase2_64_avx: - push %rbx - push %rbp - push %r12 - push %r13 - push %r14 - push %r15 -.Lbase2_64_avx_body: - - mov $len,%r15 # reassign $len - - mov 24($ctx),$r0 # load r - mov 32($ctx),$s1 - - mov 0($ctx),$h0 # load hash value - mov 8($ctx),$h1 - mov 16($ctx),$h2#d - - mov $s1,$r1 - mov $s1,%rax - shr \$2,$s1 - add $r1,$s1 # s1 = r1 + (r1 >> 2) - - test \$31,$len - jz .Linit_avx - - add 0($inp),$h0 # accumulate input - adc 8($inp),$h1 - lea 16($inp),$inp - adc $padbit,$h2 - sub \$16,%r15 - - call __poly1305_block - -.Linit_avx: - ################################# base 2^64 -> base 2^26 - mov $h0,%rax - mov $h0,%rdx - shr \$52,$h0 - mov $h1,$d1 - mov $h1,$d2 - shr \$26,%rdx - and \$0x3ffffff,%rax # h[0] - shl \$12,$d1 - and \$0x3ffffff,%rdx # h[1] - shr \$14,$h1 - or $d1,$h0 - shl \$24,$h2 - and \$0x3ffffff,$h0 # h[2] - shr \$40,$d2 - and \$0x3ffffff,$h1 # h[3] - or $d2,$h2 # h[4] - - vmovd %rax#d,$H0 - vmovd %rdx#d,$H1 - vmovd $h0#d,$H2 - vmovd $h1#d,$H3 - vmovd $h2#d,$H4 - movl \$1,20($ctx) # set is_base2_26 - - call 
__poly1305_init_avx - -.Lproceed_avx: - mov %r15,$len - - mov 0(%rsp),%r15 - mov 8(%rsp),%r14 - mov 16(%rsp),%r13 - mov 24(%rsp),%r12 - mov 32(%rsp),%rbp - mov 40(%rsp),%rbx - lea 48(%rsp),%rax - lea 48(%rsp),%rsp -.Lbase2_64_avx_epilogue: - jmp .Ldo_avx - -.align 32 -.Leven_avx: - vmovd 4*0($ctx),$H0 # load hash value - vmovd 4*1($ctx),$H1 - vmovd 4*2($ctx),$H2 - vmovd 4*3($ctx),$H3 - vmovd 4*4($ctx),$H4 - -.Ldo_avx: -___ -$code.=<<___ if (!$win64); - lea -0x58(%rsp),%r11 - sub \$0x178,%rsp -___ -$code.=<<___ if ($win64); - lea -0xf8(%rsp),%r11 - sub \$0x218,%rsp - vmovdqa %xmm6,0x50(%r11) - vmovdqa %xmm7,0x60(%r11) - vmovdqa %xmm8,0x70(%r11) - vmovdqa %xmm9,0x80(%r11) - vmovdqa %xmm10,0x90(%r11) - vmovdqa %xmm11,0xa0(%r11) - vmovdqa %xmm12,0xb0(%r11) - vmovdqa %xmm13,0xc0(%r11) - vmovdqa %xmm14,0xd0(%r11) - vmovdqa %xmm15,0xe0(%r11) -.Ldo_avx_body: -___ -$code.=<<___; - sub \$64,$len - lea -32($inp),%rax - cmovc %rax,$inp - - vmovdqu `16*3`($ctx),$D4 # preload r0^2 - lea `16*3+64`($ctx),$ctx # size optimization - lea .Lconst(%rip),%rcx - - ################################################################ - # load input - vmovdqu 16*2($inp),$T0 - vmovdqu 16*3($inp),$T1 - vmovdqa 64(%rcx),$MASK # .Lmask26 - - vpsrldq \$6,$T0,$T2 # splat input - vpsrldq \$6,$T1,$T3 - vpunpckhqdq $T1,$T0,$T4 # 4 - vpunpcklqdq $T1,$T0,$T0 # 0:1 - vpunpcklqdq $T3,$T2,$T3 # 2:3 - - vpsrlq \$40,$T4,$T4 # 4 - vpsrlq \$26,$T0,$T1 - vpand $MASK,$T0,$T0 # 0 - vpsrlq \$4,$T3,$T2 - vpand $MASK,$T1,$T1 # 1 - vpsrlq \$30,$T3,$T3 - vpand $MASK,$T2,$T2 # 2 - vpand $MASK,$T3,$T3 # 3 - vpor 32(%rcx),$T4,$T4 # padbit, yes, always - - jbe .Lskip_loop_avx - - # expand and copy pre-calculated table to stack - vmovdqu `16*1-64`($ctx),$D1 - vmovdqu `16*2-64`($ctx),$D2 - vpshufd \$0xEE,$D4,$D3 # 34xx -> 3434 - vpshufd \$0x44,$D4,$D0 # xx12 -> 1212 - vmovdqa $D3,-0x90(%r11) - vmovdqa $D0,0x00(%rsp) - vpshufd \$0xEE,$D1,$D4 - vmovdqu `16*3-64`($ctx),$D0 - vpshufd \$0x44,$D1,$D1 - vmovdqa $D4,-0x80(%r11) - vmovdqa $D1,0x10(%rsp) - vpshufd \$0xEE,$D2,$D3 - vmovdqu `16*4-64`($ctx),$D1 - vpshufd \$0x44,$D2,$D2 - vmovdqa $D3,-0x70(%r11) - vmovdqa $D2,0x20(%rsp) - vpshufd \$0xEE,$D0,$D4 - vmovdqu `16*5-64`($ctx),$D2 - vpshufd \$0x44,$D0,$D0 - vmovdqa $D4,-0x60(%r11) - vmovdqa $D0,0x30(%rsp) - vpshufd \$0xEE,$D1,$D3 - vmovdqu `16*6-64`($ctx),$D0 - vpshufd \$0x44,$D1,$D1 - vmovdqa $D3,-0x50(%r11) - vmovdqa $D1,0x40(%rsp) - vpshufd \$0xEE,$D2,$D4 - vmovdqu `16*7-64`($ctx),$D1 - vpshufd \$0x44,$D2,$D2 - vmovdqa $D4,-0x40(%r11) - vmovdqa $D2,0x50(%rsp) - vpshufd \$0xEE,$D0,$D3 - vmovdqu `16*8-64`($ctx),$D2 - vpshufd \$0x44,$D0,$D0 - vmovdqa $D3,-0x30(%r11) - vmovdqa $D0,0x60(%rsp) - vpshufd \$0xEE,$D1,$D4 - vpshufd \$0x44,$D1,$D1 - vmovdqa $D4,-0x20(%r11) - vmovdqa $D1,0x70(%rsp) - vpshufd \$0xEE,$D2,$D3 - vmovdqa 0x00(%rsp),$D4 # preload r0^2 - vpshufd \$0x44,$D2,$D2 - vmovdqa $D3,-0x10(%r11) - vmovdqa $D2,0x80(%rsp) - - jmp .Loop_avx - -.align 32 -.Loop_avx: - ################################################################ - # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2 - # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r - # \___________________/ - # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2 - # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r - # \___________________/ \____________________/ - # - # Note that we start with inp[2:3]*r^2. This is because it - # doesn't depend on reduction in previous iteration. 
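The regrouping in the comment above is Horner's rule split into two independent streams, which is what lets the vector code start on inp[2:3]*r^2 before the previous iteration's reduction has finished. A quick sanity check of the algebra in Rust, over a small prime so plain u128 arithmetic suffices (toy values; nothing here is from this file):

    // serial: ((((0+m1)r + m2)r + m3)r + m4)r = m1*r^4 + m2*r^3 + m3*r^2 + m4*r
    // split:  even stream (m1*r^2 + m3)*r^2, odd stream (m2*r^2 + m4)*r
    const P: u128 = (1 << 61) - 1;

    fn main() {
        let (r, m): (u128, [u128; 4]) = (0x1234_5678, [1, 2, 3, 4]);
        let serial = m.iter().fold(0u128, |h, &mi| (h + mi) % P * r % P);
        let r2 = r * r % P;
        let even = (m[0] * r2 % P + m[2]) % P * r2 % P;
        let odd = (m[1] * r2 % P + m[3]) % P * r % P;
        assert_eq!(serial, (even + odd) % P);
    }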
- ################################################################ - # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 - # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 - # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 - # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 - # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 - # - # though note that $Tx and $Hx are "reversed" in this section, - # and $D4 is preloaded with r0^2... - - vpmuludq $T0,$D4,$D0 # d0 = h0*r0 - vpmuludq $T1,$D4,$D1 # d1 = h1*r0 - vmovdqa $H2,0x20(%r11) # offload hash - vpmuludq $T2,$D4,$D2 # d3 = h2*r0 - vmovdqa 0x10(%rsp),$H2 # r1^2 - vpmuludq $T3,$D4,$D3 # d3 = h3*r0 - vpmuludq $T4,$D4,$D4 # d4 = h4*r0 - - vmovdqa $H0,0x00(%r11) # - vpmuludq 0x20(%rsp),$T4,$H0 # h4*s1 - vmovdqa $H1,0x10(%r11) # - vpmuludq $T3,$H2,$H1 # h3*r1 - vpaddq $H0,$D0,$D0 # d0 += h4*s1 - vpaddq $H1,$D4,$D4 # d4 += h3*r1 - vmovdqa $H3,0x30(%r11) # - vpmuludq $T2,$H2,$H0 # h2*r1 - vpmuludq $T1,$H2,$H1 # h1*r1 - vpaddq $H0,$D3,$D3 # d3 += h2*r1 - vmovdqa 0x30(%rsp),$H3 # r2^2 - vpaddq $H1,$D2,$D2 # d2 += h1*r1 - vmovdqa $H4,0x40(%r11) # - vpmuludq $T0,$H2,$H2 # h0*r1 - vpmuludq $T2,$H3,$H0 # h2*r2 - vpaddq $H2,$D1,$D1 # d1 += h0*r1 - - vmovdqa 0x40(%rsp),$H4 # s2^2 - vpaddq $H0,$D4,$D4 # d4 += h2*r2 - vpmuludq $T1,$H3,$H1 # h1*r2 - vpmuludq $T0,$H3,$H3 # h0*r2 - vpaddq $H1,$D3,$D3 # d3 += h1*r2 - vmovdqa 0x50(%rsp),$H2 # r3^2 - vpaddq $H3,$D2,$D2 # d2 += h0*r2 - vpmuludq $T4,$H4,$H0 # h4*s2 - vpmuludq $T3,$H4,$H4 # h3*s2 - vpaddq $H0,$D1,$D1 # d1 += h4*s2 - vmovdqa 0x60(%rsp),$H3 # s3^2 - vpaddq $H4,$D0,$D0 # d0 += h3*s2 - - vmovdqa 0x80(%rsp),$H4 # s4^2 - vpmuludq $T1,$H2,$H1 # h1*r3 - vpmuludq $T0,$H2,$H2 # h0*r3 - vpaddq $H1,$D4,$D4 # d4 += h1*r3 - vpaddq $H2,$D3,$D3 # d3 += h0*r3 - vpmuludq $T4,$H3,$H0 # h4*s3 - vpmuludq $T3,$H3,$H1 # h3*s3 - vpaddq $H0,$D2,$D2 # d2 += h4*s3 - vmovdqu 16*0($inp),$H0 # load input - vpaddq $H1,$D1,$D1 # d1 += h3*s3 - vpmuludq $T2,$H3,$H3 # h2*s3 - vpmuludq $T2,$H4,$T2 # h2*s4 - vpaddq $H3,$D0,$D0 # d0 += h2*s3 - - vmovdqu 16*1($inp),$H1 # - vpaddq $T2,$D1,$D1 # d1 += h2*s4 - vpmuludq $T3,$H4,$T3 # h3*s4 - vpmuludq $T4,$H4,$T4 # h4*s4 - vpsrldq \$6,$H0,$H2 # splat input - vpaddq $T3,$D2,$D2 # d2 += h3*s4 - vpaddq $T4,$D3,$D3 # d3 += h4*s4 - vpsrldq \$6,$H1,$H3 # - vpmuludq 0x70(%rsp),$T0,$T4 # h0*r4 - vpmuludq $T1,$H4,$T0 # h1*s4 - vpunpckhqdq $H1,$H0,$H4 # 4 - vpaddq $T4,$D4,$D4 # d4 += h0*r4 - vmovdqa -0x90(%r11),$T4 # r0^4 - vpaddq $T0,$D0,$D0 # d0 += h1*s4 - - vpunpcklqdq $H1,$H0,$H0 # 0:1 - vpunpcklqdq $H3,$H2,$H3 # 2:3 - - #vpsrlq \$40,$H4,$H4 # 4 - vpsrldq \$`40/8`,$H4,$H4 # 4 - vpsrlq \$26,$H0,$H1 - vpand $MASK,$H0,$H0 # 0 - vpsrlq \$4,$H3,$H2 - vpand $MASK,$H1,$H1 # 1 - vpand 0(%rcx),$H4,$H4 # .Lmask24 - vpsrlq \$30,$H3,$H3 - vpand $MASK,$H2,$H2 # 2 - vpand $MASK,$H3,$H3 # 3 - vpor 32(%rcx),$H4,$H4 # padbit, yes, always - - vpaddq 0x00(%r11),$H0,$H0 # add hash value - vpaddq 0x10(%r11),$H1,$H1 - vpaddq 0x20(%r11),$H2,$H2 - vpaddq 0x30(%r11),$H3,$H3 - vpaddq 0x40(%r11),$H4,$H4 - - lea 16*2($inp),%rax - lea 16*4($inp),$inp - sub \$64,$len - cmovc %rax,$inp - - ################################################################ - # Now we accumulate (inp[0:1]+hash)*r^4 - ################################################################ - # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 - # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 - # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 - # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 - # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 - - vpmuludq $H0,$T4,$T0 
# h0*r0 - vpmuludq $H1,$T4,$T1 # h1*r0 - vpaddq $T0,$D0,$D0 - vpaddq $T1,$D1,$D1 - vmovdqa -0x80(%r11),$T2 # r1^4 - vpmuludq $H2,$T4,$T0 # h2*r0 - vpmuludq $H3,$T4,$T1 # h3*r0 - vpaddq $T0,$D2,$D2 - vpaddq $T1,$D3,$D3 - vpmuludq $H4,$T4,$T4 # h4*r0 - vpmuludq -0x70(%r11),$H4,$T0 # h4*s1 - vpaddq $T4,$D4,$D4 - - vpaddq $T0,$D0,$D0 # d0 += h4*s1 - vpmuludq $H2,$T2,$T1 # h2*r1 - vpmuludq $H3,$T2,$T0 # h3*r1 - vpaddq $T1,$D3,$D3 # d3 += h2*r1 - vmovdqa -0x60(%r11),$T3 # r2^4 - vpaddq $T0,$D4,$D4 # d4 += h3*r1 - vpmuludq $H1,$T2,$T1 # h1*r1 - vpmuludq $H0,$T2,$T2 # h0*r1 - vpaddq $T1,$D2,$D2 # d2 += h1*r1 - vpaddq $T2,$D1,$D1 # d1 += h0*r1 - - vmovdqa -0x50(%r11),$T4 # s2^4 - vpmuludq $H2,$T3,$T0 # h2*r2 - vpmuludq $H1,$T3,$T1 # h1*r2 - vpaddq $T0,$D4,$D4 # d4 += h2*r2 - vpaddq $T1,$D3,$D3 # d3 += h1*r2 - vmovdqa -0x40(%r11),$T2 # r3^4 - vpmuludq $H0,$T3,$T3 # h0*r2 - vpmuludq $H4,$T4,$T0 # h4*s2 - vpaddq $T3,$D2,$D2 # d2 += h0*r2 - vpaddq $T0,$D1,$D1 # d1 += h4*s2 - vmovdqa -0x30(%r11),$T3 # s3^4 - vpmuludq $H3,$T4,$T4 # h3*s2 - vpmuludq $H1,$T2,$T1 # h1*r3 - vpaddq $T4,$D0,$D0 # d0 += h3*s2 - - vmovdqa -0x10(%r11),$T4 # s4^4 - vpaddq $T1,$D4,$D4 # d4 += h1*r3 - vpmuludq $H0,$T2,$T2 # h0*r3 - vpmuludq $H4,$T3,$T0 # h4*s3 - vpaddq $T2,$D3,$D3 # d3 += h0*r3 - vpaddq $T0,$D2,$D2 # d2 += h4*s3 - vmovdqu 16*2($inp),$T0 # load input - vpmuludq $H3,$T3,$T2 # h3*s3 - vpmuludq $H2,$T3,$T3 # h2*s3 - vpaddq $T2,$D1,$D1 # d1 += h3*s3 - vmovdqu 16*3($inp),$T1 # - vpaddq $T3,$D0,$D0 # d0 += h2*s3 - - vpmuludq $H2,$T4,$H2 # h2*s4 - vpmuludq $H3,$T4,$H3 # h3*s4 - vpsrldq \$6,$T0,$T2 # splat input - vpaddq $H2,$D1,$D1 # d1 += h2*s4 - vpmuludq $H4,$T4,$H4 # h4*s4 - vpsrldq \$6,$T1,$T3 # - vpaddq $H3,$D2,$H2 # h2 = d2 + h3*s4 - vpaddq $H4,$D3,$H3 # h3 = d3 + h4*s4 - vpmuludq -0x20(%r11),$H0,$H4 # h0*r4 - vpmuludq $H1,$T4,$H0 - vpunpckhqdq $T1,$T0,$T4 # 4 - vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4 - vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4 - - vpunpcklqdq $T1,$T0,$T0 # 0:1 - vpunpcklqdq $T3,$T2,$T3 # 2:3 - - #vpsrlq \$40,$T4,$T4 # 4 - vpsrldq \$`40/8`,$T4,$T4 # 4 - vpsrlq \$26,$T0,$T1 - vmovdqa 0x00(%rsp),$D4 # preload r0^2 - vpand $MASK,$T0,$T0 # 0 - vpsrlq \$4,$T3,$T2 - vpand $MASK,$T1,$T1 # 1 - vpand 0(%rcx),$T4,$T4 # .Lmask24 - vpsrlq \$30,$T3,$T3 - vpand $MASK,$T2,$T2 # 2 - vpand $MASK,$T3,$T3 # 3 - vpor 32(%rcx),$T4,$T4 # padbit, yes, always - - ################################################################ - # lazy reduction as discussed in "NEON crypto" by D.J. Bernstein - # and P. 
Schwabe - - vpsrlq \$26,$H3,$D3 - vpand $MASK,$H3,$H3 - vpaddq $D3,$H4,$H4 # h3 -> h4 - - vpsrlq \$26,$H0,$D0 - vpand $MASK,$H0,$H0 - vpaddq $D0,$D1,$H1 # h0 -> h1 - - vpsrlq \$26,$H4,$D0 - vpand $MASK,$H4,$H4 - - vpsrlq \$26,$H1,$D1 - vpand $MASK,$H1,$H1 - vpaddq $D1,$H2,$H2 # h1 -> h2 - - vpaddq $D0,$H0,$H0 - vpsllq \$2,$D0,$D0 - vpaddq $D0,$H0,$H0 # h4 -> h0 - - vpsrlq \$26,$H2,$D2 - vpand $MASK,$H2,$H2 - vpaddq $D2,$H3,$H3 # h2 -> h3 - - vpsrlq \$26,$H0,$D0 - vpand $MASK,$H0,$H0 - vpaddq $D0,$H1,$H1 # h0 -> h1 - - vpsrlq \$26,$H3,$D3 - vpand $MASK,$H3,$H3 - vpaddq $D3,$H4,$H4 # h3 -> h4 - - ja .Loop_avx - -.Lskip_loop_avx: - ################################################################ - # multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1 - - vpshufd \$0x10,$D4,$D4 # r0^n, xx12 -> x1x2 - add \$32,$len - jnz .Long_tail_avx - - vpaddq $H2,$T2,$T2 - vpaddq $H0,$T0,$T0 - vpaddq $H1,$T1,$T1 - vpaddq $H3,$T3,$T3 - vpaddq $H4,$T4,$T4 - -.Long_tail_avx: - vmovdqa $H2,0x20(%r11) - vmovdqa $H0,0x00(%r11) - vmovdqa $H1,0x10(%r11) - vmovdqa $H3,0x30(%r11) - vmovdqa $H4,0x40(%r11) - - # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 - # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 - # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 - # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 - # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 - - vpmuludq $T2,$D4,$D2 # d2 = h2*r0 - vpmuludq $T0,$D4,$D0 # d0 = h0*r0 - vpshufd \$0x10,`16*1-64`($ctx),$H2 # r1^n - vpmuludq $T1,$D4,$D1 # d1 = h1*r0 - vpmuludq $T3,$D4,$D3 # d3 = h3*r0 - vpmuludq $T4,$D4,$D4 # d4 = h4*r0 - - vpmuludq $T3,$H2,$H0 # h3*r1 - vpaddq $H0,$D4,$D4 # d4 += h3*r1 - vpshufd \$0x10,`16*2-64`($ctx),$H3 # s1^n - vpmuludq $T2,$H2,$H1 # h2*r1 - vpaddq $H1,$D3,$D3 # d3 += h2*r1 - vpshufd \$0x10,`16*3-64`($ctx),$H4 # r2^n - vpmuludq $T1,$H2,$H0 # h1*r1 - vpaddq $H0,$D2,$D2 # d2 += h1*r1 - vpmuludq $T0,$H2,$H2 # h0*r1 - vpaddq $H2,$D1,$D1 # d1 += h0*r1 - vpmuludq $T4,$H3,$H3 # h4*s1 - vpaddq $H3,$D0,$D0 # d0 += h4*s1 - - vpshufd \$0x10,`16*4-64`($ctx),$H2 # s2^n - vpmuludq $T2,$H4,$H1 # h2*r2 - vpaddq $H1,$D4,$D4 # d4 += h2*r2 - vpmuludq $T1,$H4,$H0 # h1*r2 - vpaddq $H0,$D3,$D3 # d3 += h1*r2 - vpshufd \$0x10,`16*5-64`($ctx),$H3 # r3^n - vpmuludq $T0,$H4,$H4 # h0*r2 - vpaddq $H4,$D2,$D2 # d2 += h0*r2 - vpmuludq $T4,$H2,$H1 # h4*s2 - vpaddq $H1,$D1,$D1 # d1 += h4*s2 - vpshufd \$0x10,`16*6-64`($ctx),$H4 # s3^n - vpmuludq $T3,$H2,$H2 # h3*s2 - vpaddq $H2,$D0,$D0 # d0 += h3*s2 - - vpmuludq $T1,$H3,$H0 # h1*r3 - vpaddq $H0,$D4,$D4 # d4 += h1*r3 - vpmuludq $T0,$H3,$H3 # h0*r3 - vpaddq $H3,$D3,$D3 # d3 += h0*r3 - vpshufd \$0x10,`16*7-64`($ctx),$H2 # r4^n - vpmuludq $T4,$H4,$H1 # h4*s3 - vpaddq $H1,$D2,$D2 # d2 += h4*s3 - vpshufd \$0x10,`16*8-64`($ctx),$H3 # s4^n - vpmuludq $T3,$H4,$H0 # h3*s3 - vpaddq $H0,$D1,$D1 # d1 += h3*s3 - vpmuludq $T2,$H4,$H4 # h2*s3 - vpaddq $H4,$D0,$D0 # d0 += h2*s3 - - vpmuludq $T0,$H2,$H2 # h0*r4 - vpaddq $H2,$D4,$D4 # h4 = d4 + h0*r4 - vpmuludq $T4,$H3,$H1 # h4*s4 - vpaddq $H1,$D3,$D3 # h3 = d3 + h4*s4 - vpmuludq $T3,$H3,$H0 # h3*s4 - vpaddq $H0,$D2,$D2 # h2 = d2 + h3*s4 - vpmuludq $T2,$H3,$H1 # h2*s4 - vpaddq $H1,$D1,$D1 # h1 = d1 + h2*s4 - vpmuludq $T1,$H3,$H3 # h1*s4 - vpaddq $H3,$D0,$D0 # h0 = d0 + h1*s4 - - jz .Lshort_tail_avx - - vmovdqu 16*0($inp),$H0 # load input - vmovdqu 16*1($inp),$H1 - - vpsrldq \$6,$H0,$H2 # splat input - vpsrldq \$6,$H1,$H3 - vpunpckhqdq $H1,$H0,$H4 # 4 - vpunpcklqdq $H1,$H0,$H0 # 0:1 - vpunpcklqdq $H3,$H2,$H3 # 2:3 - - vpsrlq \$40,$H4,$H4 # 4 - vpsrlq \$26,$H0,$H1 - vpand $MASK,$H0,$H0 # 0 - 
vpsrlq \$4,$H3,$H2 - vpand $MASK,$H1,$H1 # 1 - vpsrlq \$30,$H3,$H3 - vpand $MASK,$H2,$H2 # 2 - vpand $MASK,$H3,$H3 # 3 - vpor 32(%rcx),$H4,$H4 # padbit, yes, always - - vpshufd \$0x32,`16*0-64`($ctx),$T4 # r0^n, 34xx -> x3x4 - vpaddq 0x00(%r11),$H0,$H0 - vpaddq 0x10(%r11),$H1,$H1 - vpaddq 0x20(%r11),$H2,$H2 - vpaddq 0x30(%r11),$H3,$H3 - vpaddq 0x40(%r11),$H4,$H4 - - ################################################################ - # multiply (inp[0:1]+hash) by r^4:r^3 and accumulate - - vpmuludq $H0,$T4,$T0 # h0*r0 - vpaddq $T0,$D0,$D0 # d0 += h0*r0 - vpmuludq $H1,$T4,$T1 # h1*r0 - vpaddq $T1,$D1,$D1 # d1 += h1*r0 - vpmuludq $H2,$T4,$T0 # h2*r0 - vpaddq $T0,$D2,$D2 # d2 += h2*r0 - vpshufd \$0x32,`16*1-64`($ctx),$T2 # r1^n - vpmuludq $H3,$T4,$T1 # h3*r0 - vpaddq $T1,$D3,$D3 # d3 += h3*r0 - vpmuludq $H4,$T4,$T4 # h4*r0 - vpaddq $T4,$D4,$D4 # d4 += h4*r0 - - vpmuludq $H3,$T2,$T0 # h3*r1 - vpaddq $T0,$D4,$D4 # d4 += h3*r1 - vpshufd \$0x32,`16*2-64`($ctx),$T3 # s1 - vpmuludq $H2,$T2,$T1 # h2*r1 - vpaddq $T1,$D3,$D3 # d3 += h2*r1 - vpshufd \$0x32,`16*3-64`($ctx),$T4 # r2 - vpmuludq $H1,$T2,$T0 # h1*r1 - vpaddq $T0,$D2,$D2 # d2 += h1*r1 - vpmuludq $H0,$T2,$T2 # h0*r1 - vpaddq $T2,$D1,$D1 # d1 += h0*r1 - vpmuludq $H4,$T3,$T3 # h4*s1 - vpaddq $T3,$D0,$D0 # d0 += h4*s1 - - vpshufd \$0x32,`16*4-64`($ctx),$T2 # s2 - vpmuludq $H2,$T4,$T1 # h2*r2 - vpaddq $T1,$D4,$D4 # d4 += h2*r2 - vpmuludq $H1,$T4,$T0 # h1*r2 - vpaddq $T0,$D3,$D3 # d3 += h1*r2 - vpshufd \$0x32,`16*5-64`($ctx),$T3 # r3 - vpmuludq $H0,$T4,$T4 # h0*r2 - vpaddq $T4,$D2,$D2 # d2 += h0*r2 - vpmuludq $H4,$T2,$T1 # h4*s2 - vpaddq $T1,$D1,$D1 # d1 += h4*s2 - vpshufd \$0x32,`16*6-64`($ctx),$T4 # s3 - vpmuludq $H3,$T2,$T2 # h3*s2 - vpaddq $T2,$D0,$D0 # d0 += h3*s2 - - vpmuludq $H1,$T3,$T0 # h1*r3 - vpaddq $T0,$D4,$D4 # d4 += h1*r3 - vpmuludq $H0,$T3,$T3 # h0*r3 - vpaddq $T3,$D3,$D3 # d3 += h0*r3 - vpshufd \$0x32,`16*7-64`($ctx),$T2 # r4 - vpmuludq $H4,$T4,$T1 # h4*s3 - vpaddq $T1,$D2,$D2 # d2 += h4*s3 - vpshufd \$0x32,`16*8-64`($ctx),$T3 # s4 - vpmuludq $H3,$T4,$T0 # h3*s3 - vpaddq $T0,$D1,$D1 # d1 += h3*s3 - vpmuludq $H2,$T4,$T4 # h2*s3 - vpaddq $T4,$D0,$D0 # d0 += h2*s3 - - vpmuludq $H0,$T2,$T2 # h0*r4 - vpaddq $T2,$D4,$D4 # d4 += h0*r4 - vpmuludq $H4,$T3,$T1 # h4*s4 - vpaddq $T1,$D3,$D3 # d3 += h4*s4 - vpmuludq $H3,$T3,$T0 # h3*s4 - vpaddq $T0,$D2,$D2 # d2 += h3*s4 - vpmuludq $H2,$T3,$T1 # h2*s4 - vpaddq $T1,$D1,$D1 # d1 += h2*s4 - vpmuludq $H1,$T3,$T3 # h1*s4 - vpaddq $T3,$D0,$D0 # d0 += h1*s4 - -.Lshort_tail_avx: - ################################################################ - # horizontal addition - - vpsrldq \$8,$D4,$T4 - vpsrldq \$8,$D3,$T3 - vpsrldq \$8,$D1,$T1 - vpsrldq \$8,$D0,$T0 - vpsrldq \$8,$D2,$T2 - vpaddq $T3,$D3,$D3 - vpaddq $T4,$D4,$D4 - vpaddq $T0,$D0,$D0 - vpaddq $T1,$D1,$D1 - vpaddq $T2,$D2,$D2 - - ################################################################ - # lazy reduction - - vpsrlq \$26,$D3,$H3 - vpand $MASK,$D3,$D3 - vpaddq $H3,$D4,$D4 # h3 -> h4 - - vpsrlq \$26,$D0,$H0 - vpand $MASK,$D0,$D0 - vpaddq $H0,$D1,$D1 # h0 -> h1 - - vpsrlq \$26,$D4,$H4 - vpand $MASK,$D4,$D4 - - vpsrlq \$26,$D1,$H1 - vpand $MASK,$D1,$D1 - vpaddq $H1,$D2,$D2 # h1 -> h2 - - vpaddq $H4,$D0,$D0 - vpsllq \$2,$H4,$H4 - vpaddq $H4,$D0,$D0 # h4 -> h0 - - vpsrlq \$26,$D2,$H2 - vpand $MASK,$D2,$D2 - vpaddq $H2,$D3,$D3 # h2 -> h3 - - vpsrlq \$26,$D0,$H0 - vpand $MASK,$D0,$D0 - vpaddq $H0,$D1,$D1 # h0 -> h1 - - vpsrlq \$26,$D3,$H3 - vpand $MASK,$D3,$D3 - vpaddq $H3,$D4,$D4 # h3 -> h4 - - vmovd $D0,`4*0-48-64`($ctx) # save partially reduced - 
vmovd $D1,`4*1-48-64`($ctx) - vmovd $D2,`4*2-48-64`($ctx) - vmovd $D3,`4*3-48-64`($ctx) - vmovd $D4,`4*4-48-64`($ctx) -___ -$code.=<<___ if ($win64); - vmovdqa 0x50(%r11),%xmm6 - vmovdqa 0x60(%r11),%xmm7 - vmovdqa 0x70(%r11),%xmm8 - vmovdqa 0x80(%r11),%xmm9 - vmovdqa 0x90(%r11),%xmm10 - vmovdqa 0xa0(%r11),%xmm11 - vmovdqa 0xb0(%r11),%xmm12 - vmovdqa 0xc0(%r11),%xmm13 - vmovdqa 0xd0(%r11),%xmm14 - vmovdqa 0xe0(%r11),%xmm15 - lea 0xf8(%r11),%rsp -.Ldo_avx_epilogue: -___ -$code.=<<___ if (!$win64); - lea 0x58(%r11),%rsp -___ -$code.=<<___; - vzeroupper - ret -.size poly1305_blocks_avx,.-poly1305_blocks_avx - -.type poly1305_emit_avx,\@function,3 -.align 32 -poly1305_emit_avx: - cmpl \$0,20($ctx) # is_base2_26? - je .Lemit - - mov 0($ctx),%eax # load hash value base 2^26 - mov 4($ctx),%ecx - mov 8($ctx),%r8d - mov 12($ctx),%r11d - mov 16($ctx),%r10d - - shl \$26,%rcx # base 2^26 -> base 2^64 - mov %r8,%r9 - shl \$52,%r8 - add %rcx,%rax - shr \$12,%r9 - add %rax,%r8 # h0 - adc \$0,%r9 - - shl \$14,%r11 - mov %r10,%rax - shr \$24,%r10 - add %r11,%r9 - shl \$40,%rax - add %rax,%r9 # h1 - adc \$0,%r10 # h2 - - mov %r10,%rax # could be partially reduced, so reduce - mov %r10,%rcx - and \$3,%r10 - shr \$2,%rax - and \$-4,%rcx - add %rcx,%rax - add %rax,%r8 - adc \$0,%r9 - adc \$0,%r10 - - mov %r8,%rax - add \$5,%r8 # compare to modulus - mov %r9,%rcx - adc \$0,%r9 - adc \$0,%r10 - shr \$2,%r10 # did 130-bit value overfow? - cmovnz %r8,%rax - cmovnz %r9,%rcx - - add 0($nonce),%rax # accumulate nonce - adc 8($nonce),%rcx - mov %rax,0($mac) # write result - mov %rcx,8($mac) - - ret -.size poly1305_emit_avx,.-poly1305_emit_avx -___ - -if ($avx>1) { -my ($H0,$H1,$H2,$H3,$H4, $MASK, $T4,$T0,$T1,$T2,$T3, $D0,$D1,$D2,$D3,$D4) = - map("%ymm$_",(0..15)); -my $S4=$MASK; - -$code.=<<___; -.type poly1305_blocks_avx2,\@function,4 -.align 32 -poly1305_blocks_avx2: - mov 20($ctx),%r8d # is_base2_26 - cmp \$128,$len - jae .Lblocks_avx2 - test %r8d,%r8d - jz .Lblocks - -.Lblocks_avx2: - and \$-16,$len - jz .Lno_data_avx2 - - vzeroupper - - test %r8d,%r8d - jz .Lbase2_64_avx2 - - test \$63,$len - jz .Leven_avx2 - - push %rbx - push %rbp - push %r12 - push %r13 - push %r14 - push %r15 -.Lblocks_avx2_body: - - mov $len,%r15 # reassign $len - - mov 0($ctx),$d1 # load hash value - mov 8($ctx),$d2 - mov 16($ctx),$h2#d - - mov 24($ctx),$r0 # load r - mov 32($ctx),$s1 - - ################################# base 2^26 -> base 2^64 - mov $d1#d,$h0#d - and \$`-1*(1<<31)`,$d1 - mov $d2,$r1 # borrow $r1 - mov $d2#d,$h1#d - and \$`-1*(1<<31)`,$d2 - - shr \$6,$d1 - shl \$52,$r1 - add $d1,$h0 - shr \$12,$h1 - shr \$18,$d2 - add $r1,$h0 - adc $d2,$h1 - - mov $h2,$d1 - shl \$40,$d1 - shr \$24,$h2 - add $d1,$h1 - adc \$0,$h2 # can be partially reduced... - - mov \$-4,$d2 # ... 
so reduce - mov $h2,$d1 - and $h2,$d2 - shr \$2,$d1 - and \$3,$h2 - add $d2,$d1 # =*5 - add $d1,$h0 - adc \$0,$h1 - adc \$0,$h2 - - mov $s1,$r1 - mov $s1,%rax - shr \$2,$s1 - add $r1,$s1 # s1 = r1 + (r1 >> 2) - -.Lbase2_26_pre_avx2: - add 0($inp),$h0 # accumulate input - adc 8($inp),$h1 - lea 16($inp),$inp - adc $padbit,$h2 - sub \$16,%r15 - - call __poly1305_block - mov $r1,%rax - - test \$63,%r15 - jnz .Lbase2_26_pre_avx2 - - test $padbit,$padbit # if $padbit is zero, - jz .Lstore_base2_64_avx2 # store hash in base 2^64 format - - ################################# base 2^64 -> base 2^26 - mov $h0,%rax - mov $h0,%rdx - shr \$52,$h0 - mov $h1,$r0 - mov $h1,$r1 - shr \$26,%rdx - and \$0x3ffffff,%rax # h[0] - shl \$12,$r0 - and \$0x3ffffff,%rdx # h[1] - shr \$14,$h1 - or $r0,$h0 - shl \$24,$h2 - and \$0x3ffffff,$h0 # h[2] - shr \$40,$r1 - and \$0x3ffffff,$h1 # h[3] - or $r1,$h2 # h[4] - - test %r15,%r15 - jz .Lstore_base2_26_avx2 - - vmovd %rax#d,%x#$H0 - vmovd %rdx#d,%x#$H1 - vmovd $h0#d,%x#$H2 - vmovd $h1#d,%x#$H3 - vmovd $h2#d,%x#$H4 - jmp .Lproceed_avx2 - -.align 32 -.Lstore_base2_64_avx2: - mov $h0,0($ctx) - mov $h1,8($ctx) - mov $h2,16($ctx) # note that is_base2_26 is zeroed - jmp .Ldone_avx2 - -.align 16 -.Lstore_base2_26_avx2: - mov %rax#d,0($ctx) # store hash value base 2^26 - mov %rdx#d,4($ctx) - mov $h0#d,8($ctx) - mov $h1#d,12($ctx) - mov $h2#d,16($ctx) -.align 16 -.Ldone_avx2: - mov 0(%rsp),%r15 - mov 8(%rsp),%r14 - mov 16(%rsp),%r13 - mov 24(%rsp),%r12 - mov 32(%rsp),%rbp - mov 40(%rsp),%rbx - lea 48(%rsp),%rsp -.Lno_data_avx2: -.Lblocks_avx2_epilogue: - ret - -.align 32 -.Lbase2_64_avx2: - push %rbx - push %rbp - push %r12 - push %r13 - push %r14 - push %r15 -.Lbase2_64_avx2_body: - - mov $len,%r15 # reassign $len - - mov 24($ctx),$r0 # load r - mov 32($ctx),$s1 - - mov 0($ctx),$h0 # load hash value - mov 8($ctx),$h1 - mov 16($ctx),$h2#d - - mov $s1,$r1 - mov $s1,%rax - shr \$2,$s1 - add $r1,$s1 # s1 = r1 + (r1 >> 2) - - test \$63,$len - jz .Linit_avx2 - -.Lbase2_64_pre_avx2: - add 0($inp),$h0 # accumulate input - adc 8($inp),$h1 - lea 16($inp),$inp - adc $padbit,$h2 - sub \$16,%r15 - - call __poly1305_block - mov $r1,%rax - - test \$63,%r15 - jnz .Lbase2_64_pre_avx2 - -.Linit_avx2: - ################################# base 2^64 -> base 2^26 - mov $h0,%rax - mov $h0,%rdx - shr \$52,$h0 - mov $h1,$d1 - mov $h1,$d2 - shr \$26,%rdx - and \$0x3ffffff,%rax # h[0] - shl \$12,$d1 - and \$0x3ffffff,%rdx # h[1] - shr \$14,$h1 - or $d1,$h0 - shl \$24,$h2 - and \$0x3ffffff,$h0 # h[2] - shr \$40,$d2 - and \$0x3ffffff,$h1 # h[3] - or $d2,$h2 # h[4] - - vmovd %rax#d,%x#$H0 - vmovd %rdx#d,%x#$H1 - vmovd $h0#d,%x#$H2 - vmovd $h1#d,%x#$H3 - vmovd $h2#d,%x#$H4 - movl \$1,20($ctx) # set is_base2_26 - - call __poly1305_init_avx - -.Lproceed_avx2: - mov %r15,$len - - mov 0(%rsp),%r15 - mov 8(%rsp),%r14 - mov 16(%rsp),%r13 - mov 24(%rsp),%r12 - mov 32(%rsp),%rbp - mov 40(%rsp),%rbx - lea 48(%rsp),%rax - lea 48(%rsp),%rsp -.Lbase2_64_avx2_epilogue: - jmp .Ldo_avx2 - -.align 32 -.Leven_avx2: - vmovd 4*0($ctx),%x#$H0 # load hash value base 2^26 - vmovd 4*1($ctx),%x#$H1 - vmovd 4*2($ctx),%x#$H2 - vmovd 4*3($ctx),%x#$H3 - vmovd 4*4($ctx),%x#$H4 - -.Ldo_avx2: -___ -$code.=<<___ if (!$win64); - lea -8(%rsp),%r11 - sub \$0x128,%rsp -___ -$code.=<<___ if ($win64); - lea -0xf8(%rsp),%r11 - sub \$0x1c8,%rsp - vmovdqa %xmm6,0x50(%r11) - vmovdqa %xmm7,0x60(%r11) - vmovdqa %xmm8,0x70(%r11) - vmovdqa %xmm9,0x80(%r11) - vmovdqa %xmm10,0x90(%r11) - vmovdqa %xmm11,0xa0(%r11) - vmovdqa %xmm12,0xb0(%r11) - vmovdqa 
%xmm13,0xc0(%r11) - vmovdqa %xmm14,0xd0(%r11) - vmovdqa %xmm15,0xe0(%r11) -.Ldo_avx2_body: -___ -$code.=<<___; - lea 48+64($ctx),$ctx # size optimization - lea .Lconst(%rip),%rcx - - # expand and copy pre-calculated table to stack - vmovdqu `16*0-64`($ctx),%x#$T2 - and \$-512,%rsp - vmovdqu `16*1-64`($ctx),%x#$T3 - vmovdqu `16*2-64`($ctx),%x#$T4 - vmovdqu `16*3-64`($ctx),%x#$D0 - vmovdqu `16*4-64`($ctx),%x#$D1 - vmovdqu `16*5-64`($ctx),%x#$D2 - vmovdqu `16*6-64`($ctx),%x#$D3 - vpermq \$0x15,$T2,$T2 # 00003412 -> 12343434 - vmovdqu `16*7-64`($ctx),%x#$D4 - vpermq \$0x15,$T3,$T3 - vpshufd \$0xc8,$T2,$T2 # 12343434 -> 14243444 - vmovdqu `16*8-64`($ctx),%x#$MASK - vpermq \$0x15,$T4,$T4 - vpshufd \$0xc8,$T3,$T3 - vmovdqa $T2,0x00(%rsp) - vpermq \$0x15,$D0,$D0 - vpshufd \$0xc8,$T4,$T4 - vmovdqa $T3,0x20(%rsp) - vpermq \$0x15,$D1,$D1 - vpshufd \$0xc8,$D0,$D0 - vmovdqa $T4,0x40(%rsp) - vpermq \$0x15,$D2,$D2 - vpshufd \$0xc8,$D1,$D1 - vmovdqa $D0,0x60(%rsp) - vpermq \$0x15,$D3,$D3 - vpshufd \$0xc8,$D2,$D2 - vmovdqa $D1,0x80(%rsp) - vpermq \$0x15,$D4,$D4 - vpshufd \$0xc8,$D3,$D3 - vmovdqa $D2,0xa0(%rsp) - vpermq \$0x15,$MASK,$MASK - vpshufd \$0xc8,$D4,$D4 - vmovdqa $D3,0xc0(%rsp) - vpshufd \$0xc8,$MASK,$MASK - vmovdqa $D4,0xe0(%rsp) - vmovdqa $MASK,0x100(%rsp) - vmovdqa 64(%rcx),$MASK # .Lmask26 - - ################################################################ - # load input - vmovdqu 16*0($inp),%x#$T0 - vmovdqu 16*1($inp),%x#$T1 - vinserti128 \$1,16*2($inp),$T0,$T0 - vinserti128 \$1,16*3($inp),$T1,$T1 - lea 16*4($inp),$inp - - vpsrldq \$6,$T0,$T2 # splat input - vpsrldq \$6,$T1,$T3 - vpunpckhqdq $T1,$T0,$T4 # 4 - vpunpcklqdq $T3,$T2,$T2 # 2:3 - vpunpcklqdq $T1,$T0,$T0 # 0:1 - - vpsrlq \$30,$T2,$T3 - vpsrlq \$4,$T2,$T2 - vpsrlq \$26,$T0,$T1 - vpsrlq \$40,$T4,$T4 # 4 - vpand $MASK,$T2,$T2 # 2 - vpand $MASK,$T0,$T0 # 0 - vpand $MASK,$T1,$T1 # 1 - vpand $MASK,$T3,$T3 # 3 - vpor 32(%rcx),$T4,$T4 # padbit, yes, always - - lea 0x90(%rsp),%rax # size optimization - vpaddq $H2,$T2,$H2 # accumulate input - sub \$64,$len - jz .Ltail_avx2 - jmp .Loop_avx2 - -.align 32 -.Loop_avx2: - ################################################################ - # ((inp[0]*r^4+inp[4])*r^4+inp[8])*r^4 - # ((inp[1]*r^4+inp[5])*r^4+inp[9])*r^3 - # ((inp[2]*r^4+inp[6])*r^4+inp[10])*r^2 - # ((inp[3]*r^4+inp[7])*r^4+inp[11])*r^1 - # \________/\________/ - ################################################################ - #vpaddq $H2,$T2,$H2 # accumulate input - vpaddq $H0,$T0,$H0 - vmovdqa `32*0`(%rsp),$T0 # r0^4 - vpaddq $H1,$T1,$H1 - vmovdqa `32*1`(%rsp),$T1 # r1^4 - vpaddq $H3,$T3,$H3 - vmovdqa `32*3`(%rsp),$T2 # r2^4 - vpaddq $H4,$T4,$H4 - vmovdqa `32*6-0x90`(%rax),$T3 # s3^4 - vmovdqa `32*8-0x90`(%rax),$S4 # s4^4 - - # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 - # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 - # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 - # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 - # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 - # - # however, as h2 is "chronologically" the first one available, the - # corresponding operations are pulled up, so it's - # - # d4 = h2*r2 + h4*r0 + h3*r1 + h1*r3 + h0*r4 - # d3 = h2*r1 + h3*r0 + h1*r2 + h0*r3 + h4*5*r4 - # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 - # d1 = h2*5*r4 + h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 - # d0 = h2*5*r3 + h0*r0 + h4*5*r1 + h3*5*r2 + h1*5*r4 - - vpmuludq $H2,$T0,$D2 # d2 = h2*r0 - vpmuludq $H2,$T1,$D3 # d3 = h2*r1 - vpmuludq $H2,$T2,$D4 # d4 = h2*r2 - vpmuludq $H2,$T3,$D0 # d0 = h2*s3 - vpmuludq $H2,$S4,$D1 # d1 = h2*s4 - - vpmuludq $H0,$T1,$T4 # h0*r1
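The d0..d4 formulas in the comment block above are the whole per-lane multiplication. As a scalar C sketch (illustrative names; h[] and r[] are 26-bit limbs, s[i] = 5*r[i] absorbs the 2^130 == 5 wrap, and d[] are 64-bit accumulators wide enough for the five products):

    #include <stdint.h>

    /* Scalar form of the d0..d4 comment above: 5x5 schoolbook multiply of
     * 26-bit limbs mod 2^130-5. Products whose weight would reach 2^130 are
     * wrapped around via s[i] = 5*r[i]. */
    static void poly1305_mul(uint64_t d[5], const uint32_t h[5],
                             const uint32_t r[5], const uint32_t s[5]) {
      d[0] = (uint64_t)h[0] * r[0] + (uint64_t)h[1] * s[4] +
             (uint64_t)h[2] * s[3] + (uint64_t)h[3] * s[2] +
             (uint64_t)h[4] * s[1];
      d[1] = (uint64_t)h[0] * r[1] + (uint64_t)h[1] * r[0] +
             (uint64_t)h[2] * s[4] + (uint64_t)h[3] * s[3] +
             (uint64_t)h[4] * s[2];
      d[2] = (uint64_t)h[0] * r[2] + (uint64_t)h[1] * r[1] +
             (uint64_t)h[2] * r[0] + (uint64_t)h[3] * s[4] +
             (uint64_t)h[4] * s[3];
      d[3] = (uint64_t)h[0] * r[3] + (uint64_t)h[1] * r[2] +
             (uint64_t)h[2] * r[1] + (uint64_t)h[3] * r[0] +
             (uint64_t)h[4] * s[4];
      d[4] = (uint64_t)h[0] * r[4] + (uint64_t)h[1] * r[3] +
             (uint64_t)h[2] * r[2] + (uint64_t)h[3] * r[1] +
             (uint64_t)h[4] * r[0];
    }

The vector code computes exactly these sums, four lanes at a time, merely starting with the h2 terms for better instruction scheduling.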
- vpmuludq $H1,$T1,$H2 # h1*r1, borrow $H2 as temp - vpaddq $T4,$D1,$D1 # d1 += h0*r1 - vpaddq $H2,$D2,$D2 # d2 += h1*r1 - vpmuludq $H3,$T1,$T4 # h3*r1 - vpmuludq `32*2`(%rsp),$H4,$H2 # h4*s1 - vpaddq $T4,$D4,$D4 # d4 += h3*r1 - vpaddq $H2,$D0,$D0 # d0 += h4*s1 - vmovdqa `32*4-0x90`(%rax),$T1 # s2 - - vpmuludq $H0,$T0,$T4 # h0*r0 - vpmuludq $H1,$T0,$H2 # h1*r0 - vpaddq $T4,$D0,$D0 # d0 += h0*r0 - vpaddq $H2,$D1,$D1 # d1 += h1*r0 - vpmuludq $H3,$T0,$T4 # h3*r0 - vpmuludq $H4,$T0,$H2 # h4*r0 - vmovdqu 16*0($inp),%x#$T0 # load input - vpaddq $T4,$D3,$D3 # d3 += h3*r0 - vpaddq $H2,$D4,$D4 # d4 += h4*r0 - vinserti128 \$1,16*2($inp),$T0,$T0 - - vpmuludq $H3,$T1,$T4 # h3*s2 - vpmuludq $H4,$T1,$H2 # h4*s2 - vmovdqu 16*1($inp),%x#$T1 - vpaddq $T4,$D0,$D0 # d0 += h3*s2 - vpaddq $H2,$D1,$D1 # d1 += h4*s2 - vmovdqa `32*5-0x90`(%rax),$H2 # r3 - vpmuludq $H1,$T2,$T4 # h1*r2 - vpmuludq $H0,$T2,$T2 # h0*r2 - vpaddq $T4,$D3,$D3 # d3 += h1*r2 - vpaddq $T2,$D2,$D2 # d2 += h0*r2 - vinserti128 \$1,16*3($inp),$T1,$T1 - lea 16*4($inp),$inp - - vpmuludq $H1,$H2,$T4 # h1*r3 - vpmuludq $H0,$H2,$H2 # h0*r3 - vpsrldq \$6,$T0,$T2 # splat input - vpaddq $T4,$D4,$D4 # d4 += h1*r3 - vpaddq $H2,$D3,$D3 # d3 += h0*r3 - vpmuludq $H3,$T3,$T4 # h3*s3 - vpmuludq $H4,$T3,$H2 # h4*s3 - vpsrldq \$6,$T1,$T3 - vpaddq $T4,$D1,$D1 # d1 += h3*s3 - vpaddq $H2,$D2,$D2 # d2 += h4*s3 - vpunpckhqdq $T1,$T0,$T4 # 4 - - vpmuludq $H3,$S4,$H3 # h3*s4 - vpmuludq $H4,$S4,$H4 # h4*s4 - vpunpcklqdq $T1,$T0,$T0 # 0:1 - vpaddq $H3,$D2,$H2 # h2 = d2 + h3*r4 - vpaddq $H4,$D3,$H3 # h3 = d3 + h4*r4 - vpunpcklqdq $T3,$T2,$T3 # 2:3 - vpmuludq `32*7-0x90`(%rax),$H0,$H4 # h0*r4 - vpmuludq $H1,$S4,$H0 # h1*s4 - vmovdqa 64(%rcx),$MASK # .Lmask26 - vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4 - vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4 - - ################################################################ - # lazy reduction (interleaved with tail of input splat) - - vpsrlq \$26,$H3,$D3 - vpand $MASK,$H3,$H3 - vpaddq $D3,$H4,$H4 # h3 -> h4 - - vpsrlq \$26,$H0,$D0 - vpand $MASK,$H0,$H0 - vpaddq $D0,$D1,$H1 # h0 -> h1 - - vpsrlq \$26,$H4,$D4 - vpand $MASK,$H4,$H4 - - vpsrlq \$4,$T3,$T2 - - vpsrlq \$26,$H1,$D1 - vpand $MASK,$H1,$H1 - vpaddq $D1,$H2,$H2 # h1 -> h2 - - vpaddq $D4,$H0,$H0 - vpsllq \$2,$D4,$D4 - vpaddq $D4,$H0,$H0 # h4 -> h0 - - vpand $MASK,$T2,$T2 # 2 - vpsrlq \$26,$T0,$T1 - - vpsrlq \$26,$H2,$D2 - vpand $MASK,$H2,$H2 - vpaddq $D2,$H3,$H3 # h2 -> h3 - - vpaddq $T2,$H2,$H2 # modulo-scheduled - vpsrlq \$30,$T3,$T3 - - vpsrlq \$26,$H0,$D0 - vpand $MASK,$H0,$H0 - vpaddq $D0,$H1,$H1 # h0 -> h1 - - vpsrlq \$40,$T4,$T4 # 4 - - vpsrlq \$26,$H3,$D3 - vpand $MASK,$H3,$H3 - vpaddq $D3,$H4,$H4 # h3 -> h4 - - vpand $MASK,$T0,$T0 # 0 - vpand $MASK,$T1,$T1 # 1 - vpand $MASK,$T3,$T3 # 3 - vpor 32(%rcx),$T4,$T4 # padbit, yes, always - - sub \$64,$len - jnz .Loop_avx2 - - .byte 0x66,0x90 -.Ltail_avx2: - ################################################################ - # while above multiplications were by r^4 in all lanes, in last - # iteration we multiply least significant lane by r^4 and most - # significant one by r, so copy of above except that references - # to the precomputed table are displaced by 4... 
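Concretely, with four lanes and eight pending blocks m[0..7], the loop plus this tail compute

    sum over j = 0..3 of (m[j]*r^4 + m[4+j]) * r^(4-j)
      =   m[0]*r^8 + m[1]*r^7 + m[2]*r^6 + m[3]*r^5
        + m[4]*r^4 + m[5]*r^3 + m[6]*r^2 + m[7]*r

(all mod 2^130-5), which equals the serial Horner evaluation ((..((m[0]*r + m[1])*r + m[2])*r ..)*r + m[7])*r that block-at-a-time Poly1305 would produce. That is why the horizontal addition that follows the tail recovers the correct hash.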
- - #vpaddq $H2,$T2,$H2 # accumulate input - vpaddq $H0,$T0,$H0 - vmovdqu `32*0+4`(%rsp),$T0 # r0^4 - vpaddq $H1,$T1,$H1 - vmovdqu `32*1+4`(%rsp),$T1 # r1^4 - vpaddq $H3,$T3,$H3 - vmovdqu `32*3+4`(%rsp),$T2 # r2^4 - vpaddq $H4,$T4,$H4 - vmovdqu `32*6+4-0x90`(%rax),$T3 # s3^4 - vmovdqu `32*8+4-0x90`(%rax),$S4 # s4^4 - - vpmuludq $H2,$T0,$D2 # d2 = h2*r0 - vpmuludq $H2,$T1,$D3 # d3 = h2*r1 - vpmuludq $H2,$T2,$D4 # d4 = h2*r2 - vpmuludq $H2,$T3,$D0 # d0 = h2*s3 - vpmuludq $H2,$S4,$D1 # d1 = h2*s4 - - vpmuludq $H0,$T1,$T4 # h0*r1 - vpmuludq $H1,$T1,$H2 # h1*r1 - vpaddq $T4,$D1,$D1 # d1 += h0*r1 - vpaddq $H2,$D2,$D2 # d2 += h1*r1 - vpmuludq $H3,$T1,$T4 # h3*r1 - vpmuludq `32*2+4`(%rsp),$H4,$H2 # h4*s1 - vpaddq $T4,$D4,$D4 # d4 += h3*r1 - vpaddq $H2,$D0,$D0 # d0 += h4*s1 - - vpmuludq $H0,$T0,$T4 # h0*r0 - vpmuludq $H1,$T0,$H2 # h1*r0 - vpaddq $T4,$D0,$D0 # d0 += h0*r0 - vmovdqu `32*4+4-0x90`(%rax),$T1 # s2 - vpaddq $H2,$D1,$D1 # d1 += h1*r0 - vpmuludq $H3,$T0,$T4 # h3*r0 - vpmuludq $H4,$T0,$H2 # h4*r0 - vpaddq $T4,$D3,$D3 # d3 += h3*r0 - vpaddq $H2,$D4,$D4 # d4 += h4*r0 - - vpmuludq $H3,$T1,$T4 # h3*s2 - vpmuludq $H4,$T1,$H2 # h4*s2 - vpaddq $T4,$D0,$D0 # d0 += h3*s2 - vpaddq $H2,$D1,$D1 # d1 += h4*s2 - vmovdqu `32*5+4-0x90`(%rax),$H2 # r3 - vpmuludq $H1,$T2,$T4 # h1*r2 - vpmuludq $H0,$T2,$T2 # h0*r2 - vpaddq $T4,$D3,$D3 # d3 += h1*r2 - vpaddq $T2,$D2,$D2 # d2 += h0*r2 - - vpmuludq $H1,$H2,$T4 # h1*r3 - vpmuludq $H0,$H2,$H2 # h0*r3 - vpaddq $T4,$D4,$D4 # d4 += h1*r3 - vpaddq $H2,$D3,$D3 # d3 += h0*r3 - vpmuludq $H3,$T3,$T4 # h3*s3 - vpmuludq $H4,$T3,$H2 # h4*s3 - vpaddq $T4,$D1,$D1 # d1 += h3*s3 - vpaddq $H2,$D2,$D2 # d2 += h4*s3 - - vpmuludq $H3,$S4,$H3 # h3*s4 - vpmuludq $H4,$S4,$H4 # h4*s4 - vpaddq $H3,$D2,$H2 # h2 = d2 + h3*r4 - vpaddq $H4,$D3,$H3 # h3 = d3 + h4*r4 - vpmuludq `32*7+4-0x90`(%rax),$H0,$H4 # h0*r4 - vpmuludq $H1,$S4,$H0 # h1*s4 - vmovdqa 64(%rcx),$MASK # .Lmask26 - vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4 - vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4 - - ################################################################ - # horizontal addition - - vpsrldq \$8,$D1,$T1 - vpsrldq \$8,$H2,$T2 - vpsrldq \$8,$H3,$T3 - vpsrldq \$8,$H4,$T4 - vpsrldq \$8,$H0,$T0 - vpaddq $T1,$D1,$D1 - vpaddq $T2,$H2,$H2 - vpaddq $T3,$H3,$H3 - vpaddq $T4,$H4,$H4 - vpaddq $T0,$H0,$H0 - - vpermq \$0x2,$H3,$T3 - vpermq \$0x2,$H4,$T4 - vpermq \$0x2,$H0,$T0 - vpermq \$0x2,$D1,$T1 - vpermq \$0x2,$H2,$T2 - vpaddq $T3,$H3,$H3 - vpaddq $T4,$H4,$H4 - vpaddq $T0,$H0,$H0 - vpaddq $T1,$D1,$D1 - vpaddq $T2,$H2,$H2 - - ################################################################ - # lazy reduction - - vpsrlq \$26,$H3,$D3 - vpand $MASK,$H3,$H3 - vpaddq $D3,$H4,$H4 # h3 -> h4 - - vpsrlq \$26,$H0,$D0 - vpand $MASK,$H0,$H0 - vpaddq $D0,$D1,$H1 # h0 -> h1 - - vpsrlq \$26,$H4,$D4 - vpand $MASK,$H4,$H4 - - vpsrlq \$26,$H1,$D1 - vpand $MASK,$H1,$H1 - vpaddq $D1,$H2,$H2 # h1 -> h2 - - vpaddq $D4,$H0,$H0 - vpsllq \$2,$D4,$D4 - vpaddq $D4,$H0,$H0 # h4 -> h0 - - vpsrlq \$26,$H2,$D2 - vpand $MASK,$H2,$H2 - vpaddq $D2,$H3,$H3 # h2 -> h3 - - vpsrlq \$26,$H0,$D0 - vpand $MASK,$H0,$H0 - vpaddq $D0,$H1,$H1 # h0 -> h1 - - vpsrlq \$26,$H3,$D3 - vpand $MASK,$H3,$H3 - vpaddq $D3,$H4,$H4 # h3 -> h4 - - vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced - vmovd %x#$H1,`4*1-48-64`($ctx) - vmovd %x#$H2,`4*2-48-64`($ctx) - vmovd %x#$H3,`4*3-48-64`($ctx) - vmovd %x#$H4,`4*4-48-64`($ctx) -___ -$code.=<<___ if ($win64); - vmovdqa 0x50(%r11),%xmm6 - vmovdqa 0x60(%r11),%xmm7 - vmovdqa 0x70(%r11),%xmm8 - vmovdqa 0x80(%r11),%xmm9 - vmovdqa 0x90(%r11),%xmm10 
- vmovdqa 0xa0(%r11),%xmm11 - vmovdqa 0xb0(%r11),%xmm12 - vmovdqa 0xc0(%r11),%xmm13 - vmovdqa 0xd0(%r11),%xmm14 - vmovdqa 0xe0(%r11),%xmm15 - lea 0xf8(%r11),%rsp -.Ldo_avx2_epilogue: -___ -$code.=<<___ if (!$win64); - lea 8(%r11),%rsp -___ -$code.=<<___; - vzeroupper - ret -.size poly1305_blocks_avx2,.-poly1305_blocks_avx2 -___ -} -$code.=<<___; -.align 64 -.Lconst: -.Lmask24: -.long 0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0 -.L129: -.long `1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0 -.Lmask26: -.long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0 -.Lfive: -.long 5,0,5,0,5,0,5,0 -___ -} - -$code.=<<___; -.asciz "Poly1305 for x86_64, CRYPTOGAMS by <appro\@openssl.org>" -.align 16 -___ - -# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, -# CONTEXT *context,DISPATCHER_CONTEXT *disp) -if ($win64) { -$rec="%rcx"; -$frame="%rdx"; -$context="%r8"; -$disp="%r9"; - -$code.=<<___; -.extern __imp_RtlVirtualUnwind -.type se_handler,\@abi-omnipotent -.align 16 -se_handler: - push %rsi - push %rdi - push %rbx - push %rbp - push %r12 - push %r13 - push %r14 - push %r15 - pushfq - sub \$64,%rsp - - mov 120($context),%rax # pull context->Rax - mov 248($context),%rbx # pull context->Rip - - mov 8($disp),%rsi # disp->ImageBase - mov 56($disp),%r11 # disp->HandlerData - - mov 0(%r11),%r10d # HandlerData[0] - lea (%rsi,%r10),%r10 # prologue label - cmp %r10,%rbx # context->Rip<.Lprologue - jb .Lcommon_seh_tail - - mov 152($context),%rax # pull context->Rsp - - mov 4(%r11),%r10d # HandlerData[1] - lea (%rsi,%r10),%r10 # epilogue label - cmp %r10,%rbx # context->Rip>=.Lepilogue - jae .Lcommon_seh_tail - - lea 48(%rax),%rax - - mov -8(%rax),%rbx - mov -16(%rax),%rbp - mov -24(%rax),%r12 - mov -32(%rax),%r13 - mov -40(%rax),%r14 - mov -48(%rax),%r15 - mov %rbx,144($context) # restore context->Rbx - mov %rbp,160($context) # restore context->Rbp - mov %r12,216($context) # restore context->R12 - mov %r13,224($context) # restore context->R13 - mov %r14,232($context) # restore context->R14 - mov %r15,240($context) # restore context->R15 - - jmp .Lcommon_seh_tail -.size se_handler,.-se_handler - -.type avx_handler,\@abi-omnipotent -.align 16 -avx_handler: - push %rsi - push %rdi - push %rbx - push %rbp - push %r12 - push %r13 - push %r14 - push %r15 - pushfq - sub \$64,%rsp - - mov 120($context),%rax # pull context->Rax - mov 248($context),%rbx # pull context->Rip - - mov 8($disp),%rsi # disp->ImageBase - mov 56($disp),%r11 # disp->HandlerData - - mov 0(%r11),%r10d # HandlerData[0] - lea (%rsi,%r10),%r10 # prologue label - cmp %r10,%rbx # context->Rip<prologue label - jb .Lcommon_seh_tail - - mov 152($context),%rax # pull context->Rsp - - mov 4(%r11),%r10d # HandlerData[1] - lea (%rsi,%r10),%r10 # epilogue label - cmp %r10,%rbx # context->Rip>=epilogue label - jae .Lcommon_seh_tail - - mov 208($context),%rax # pull context->R11 - - lea 0x50(%rax),%rsi - lea 0xf8(%rax),%rax - lea 512($context),%rdi # &context.Xmm6 - mov \$20,%ecx - .long 0xa548f3fc # cld; rep movsq - -.Lcommon_seh_tail: - mov 8(%rax),%rdi - mov 16(%rax),%rsi - mov %rax,152($context) # restore context->Rsp - mov %rsi,168($context) # restore context->Rsi - mov %rdi,176($context) # restore context->Rdi - - mov 40($disp),%rdi # disp->ContextRecord - mov $context,%rsi # context - mov \$154,%ecx # sizeof(CONTEXT) - .long 0xa548f3fc # cld; rep movsq - - mov $disp,%rsi - xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER - mov 8(%rsi),%rdx # arg2, disp->ImageBase - mov 0(%rsi),%r8 # arg3, disp->ControlPc - mov 16(%rsi),%r9 # arg4, disp->FunctionEntry - mov 40(%rsi),%r10 # disp->ContextRecord - lea 56(%rsi),%r11 # &disp->HandlerData - lea
24(%rsi),%r12 # &disp->EstablisherFrame - mov %r10,32(%rsp) # arg5 - mov %r11,40(%rsp) # arg6 - mov %r12,48(%rsp) # arg7 - mov %rcx,56(%rsp) # arg8, (NULL) - call *__imp_RtlVirtualUnwind(%rip) - - mov \$1,%eax # ExceptionContinueSearch - add \$64,%rsp - popfq - pop %r15 - pop %r14 - pop %r13 - pop %r12 - pop %rbp - pop %rbx - pop %rdi - pop %rsi - ret -.size avx_handler,.-avx_handler - -.section .pdata -.align 4 - .rva .LSEH_begin_GFp_poly1305_init_asm - .rva .LSEH_end_GFp_poly1305_init_asm - .rva .LSEH_info_GFp_poly1305_init_asm - - .rva .LSEH_begin_GFp_poly1305_blocks - .rva .LSEH_end_GFp_poly1305_blocks - .rva .LSEH_info_GFp_poly1305_blocks - - .rva .LSEH_begin_GFp_poly1305_emit - .rva .LSEH_end_GFp_poly1305_emit - .rva .LSEH_info_GFp_poly1305_emit -___ -$code.=<<___ if ($avx); - .rva .LSEH_begin_poly1305_blocks_avx - .rva .Lbase2_64_avx - .rva .LSEH_info_poly1305_blocks_avx_1 - - .rva .Lbase2_64_avx - .rva .Leven_avx - .rva .LSEH_info_poly1305_blocks_avx_2 - - .rva .Leven_avx - .rva .LSEH_end_poly1305_blocks_avx - .rva .LSEH_info_poly1305_blocks_avx_3 - - .rva .LSEH_begin_poly1305_emit_avx - .rva .LSEH_end_poly1305_emit_avx - .rva .LSEH_info_poly1305_emit_avx -___ -$code.=<<___ if ($avx>1); - .rva .LSEH_begin_poly1305_blocks_avx2 - .rva .Lbase2_64_avx2 - .rva .LSEH_info_poly1305_blocks_avx2_1 - - .rva .Lbase2_64_avx2 - .rva .Leven_avx2 - .rva .LSEH_info_poly1305_blocks_avx2_2 - - .rva .Leven_avx2 - .rva .LSEH_end_poly1305_blocks_avx2 - .rva .LSEH_info_poly1305_blocks_avx2_3 -___ -$code.=<<___; -.section .xdata -.align 8 -.LSEH_info_GFp_poly1305_init_asm: - .byte 9,0,0,0 - .rva se_handler - .rva .LSEH_begin_GFp_poly1305_init_asm,.LSEH_begin_GFp_poly1305_init_asm - -.LSEH_info_GFp_poly1305_blocks: - .byte 9,0,0,0 - .rva se_handler - .rva .Lblocks_body,.Lblocks_epilogue - -.LSEH_info_GFp_poly1305_emit: - .byte 9,0,0,0 - .rva se_handler - .rva .LSEH_begin_GFp_poly1305_emit,.LSEH_begin_GFp_poly1305_emit -___ -$code.=<<___ if ($avx); -.LSEH_info_poly1305_blocks_avx_1: - .byte 9,0,0,0 - .rva se_handler - .rva .Lblocks_avx_body,.Lblocks_avx_epilogue # HandlerData[] - -.LSEH_info_poly1305_blocks_avx_2: - .byte 9,0,0,0 - .rva se_handler - .rva .Lbase2_64_avx_body,.Lbase2_64_avx_epilogue # HandlerData[] - -.LSEH_info_poly1305_blocks_avx_3: - .byte 9,0,0,0 - .rva avx_handler - .rva .Ldo_avx_body,.Ldo_avx_epilogue # HandlerData[] - -.LSEH_info_poly1305_emit_avx: - .byte 9,0,0,0 - .rva se_handler - .rva .LSEH_begin_poly1305_emit_avx,.LSEH_begin_poly1305_emit_avx -___ -$code.=<<___ if ($avx>1); -.LSEH_info_poly1305_blocks_avx2_1: - .byte 9,0,0,0 - .rva se_handler - .rva .Lblocks_avx2_body,.Lblocks_avx2_epilogue # HandlerData[] - -.LSEH_info_poly1305_blocks_avx2_2: - .byte 9,0,0,0 - .rva se_handler - .rva .Lbase2_64_avx2_body,.Lbase2_64_avx2_epilogue # HandlerData[] - -.LSEH_info_poly1305_blocks_avx2_3: - .byte 9,0,0,0 - .rva avx_handler - .rva .Ldo_avx2_body,.Ldo_avx2_epilogue # HandlerData[] -___ -} - -foreach (split('\n',$code)) { - s/\`([^\`]*)\`/eval($1)/ge; - s/%r([a-z]+)#d/%e$1/g; - s/%r([0-9]+)#d/%r$1d/g; - s/%x#%y/%x/g; - - print $_,"\n"; -} -close STDOUT or die "error closing STDOUT"; diff --git a/crypto/poly1305/internal.h b/crypto/poly1305/internal.h new file mode 100644 index 0000000000..98e7a482d1 --- /dev/null +++ b/crypto/poly1305/internal.h @@ -0,0 +1,25 @@ +/* Copyright (c) 2016, Google Inc. 
+ * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ + +#ifndef OPENSSL_HEADER_POLY1305_INTERNAL_H +#define OPENSSL_HEADER_POLY1305_INTERNAL_H + +#include +#include + +#if defined(OPENSSL_ARM) && !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_APPLE) +#define OPENSSL_POLY1305_NEON +#endif + +#endif // OPENSSL_HEADER_POLY1305_INTERNAL_H diff --git a/crypto/poly1305/poly1305.c b/crypto/poly1305/poly1305.c new file mode 100644 index 0000000000..66620580ae --- /dev/null +++ b/crypto/poly1305/poly1305.c @@ -0,0 +1,301 @@ +/* Copyright (c) 2014, Google Inc. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ + +// This implementation of poly1305 is by Andrew Moon +// (https://github.com/floodyberry/poly1305-donna) and released as public +// domain. + +#include + +#include "internal.h" +#include "../internal.h" + + +#if !defined(BORINGSSL_HAS_UINT128) || !defined(OPENSSL_X86_64) + +#if defined(__GNUC__) +#pragma GCC diagnostic ignored "-Wsign-conversion" +#pragma GCC diagnostic ignored "-Wconversion" +#endif + +// We can assume little-endian. +static uint32_t U8TO32_LE(const uint8_t *m) { + uint32_t r; + GFp_memcpy(&r, m, sizeof(r)); + return r; +} + +static void U32TO8_LE(uint8_t *m, uint32_t v) { + GFp_memcpy(m, &v, sizeof(v)); +} + +static uint64_t mul32x32_64(uint32_t a, uint32_t b) { return (uint64_t)a * b; } + +struct poly1305_state_st { + uint32_t r0, r1, r2, r3, r4; + uint32_t s1, s2, s3, s4; + uint32_t h0, h1, h2, h3, h4; + uint8_t buf[16]; + size_t buf_used; + uint8_t key[16]; +}; + +OPENSSL_STATIC_ASSERT(sizeof(struct poly1305_state_st) <= sizeof(poly1305_state), + "poly1305_state isn't large enough to hold aligned poly1305_state_st"); + +static inline struct poly1305_state_st *poly1305_aligned_state( + poly1305_state *state) { + dev_assert_secret(((uintptr_t)state & 63) == 0); + return (struct poly1305_state_st *)(((uintptr_t)state + 63) & ~63); +} + +// poly1305_blocks updates |state| given some amount of input data. This +// function may only be called with a |len| that is not a multiple of 16 at the +// end of the data. Otherwise the input must be buffered into 16 byte blocks. 
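For reference, this is the block-to-limb mapping that the update loop below performs: each 16-byte block contributes five base-2^26 limbs, and the pad bit 2^128 lands at bit 24 of the top limb, hence the 1 << 24 term. A self-contained sketch with a hypothetical helper name:

    #include <stdint.h>
    #include <string.h>

    /* Sketch: split one 16-byte block into 26-bit limbs and add it to h,
     * including the 2^128 pad bit. Mirrors the h0..h4 updates below;
     * assumes a little-endian host, as the file itself does. */
    static void add_block(uint32_t h[5], const uint8_t m[16]) {
      uint32_t t[4];
      memcpy(t, m, 16); /* same idea as U8TO32_LE below */
      h[0] += t[0] & 0x3ffffff;                                 /* bits 0..25 */
      h[1] += (uint32_t)((((uint64_t)t[1] << 32) | t[0]) >> 26) & 0x3ffffff;
      h[2] += (uint32_t)((((uint64_t)t[2] << 32) | t[1]) >> 20) & 0x3ffffff;
      h[3] += (uint32_t)((((uint64_t)t[3] << 32) | t[2]) >> 14) & 0x3ffffff;
      h[4] += (t[3] >> 8) | (1 << 24);                          /* + 2^128 */
    }

For the final, possibly short block, the code below instead appends a single 0x01 byte and zero-pads, so no 2^128 term is added for it.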
+static void poly1305_update(struct poly1305_state_st *state, const uint8_t *in, + size_t len) { + uint32_t t0, t1, t2, t3; + uint64_t t[5]; + uint32_t b; + uint64_t c; + size_t j; + uint8_t mp[16]; + + if (len < 16) { + goto poly1305_donna_atmost15bytes; + } + +poly1305_donna_16bytes: + t0 = U8TO32_LE(in); + t1 = U8TO32_LE(in + 4); + t2 = U8TO32_LE(in + 8); + t3 = U8TO32_LE(in + 12); + + in += 16; + len -= 16; + + state->h0 += t0 & 0x3ffffff; + state->h1 += ((((uint64_t)t1 << 32) | t0) >> 26) & 0x3ffffff; + state->h2 += ((((uint64_t)t2 << 32) | t1) >> 20) & 0x3ffffff; + state->h3 += ((((uint64_t)t3 << 32) | t2) >> 14) & 0x3ffffff; + state->h4 += (t3 >> 8) | (1 << 24); + +poly1305_donna_mul: + t[0] = mul32x32_64(state->h0, state->r0) + mul32x32_64(state->h1, state->s4) + + mul32x32_64(state->h2, state->s3) + mul32x32_64(state->h3, state->s2) + + mul32x32_64(state->h4, state->s1); + t[1] = mul32x32_64(state->h0, state->r1) + mul32x32_64(state->h1, state->r0) + + mul32x32_64(state->h2, state->s4) + mul32x32_64(state->h3, state->s3) + + mul32x32_64(state->h4, state->s2); + t[2] = mul32x32_64(state->h0, state->r2) + mul32x32_64(state->h1, state->r1) + + mul32x32_64(state->h2, state->r0) + mul32x32_64(state->h3, state->s4) + + mul32x32_64(state->h4, state->s3); + t[3] = mul32x32_64(state->h0, state->r3) + mul32x32_64(state->h1, state->r2) + + mul32x32_64(state->h2, state->r1) + mul32x32_64(state->h3, state->r0) + + mul32x32_64(state->h4, state->s4); + t[4] = mul32x32_64(state->h0, state->r4) + mul32x32_64(state->h1, state->r3) + + mul32x32_64(state->h2, state->r2) + mul32x32_64(state->h3, state->r1) + + mul32x32_64(state->h4, state->r0); + + state->h0 = (uint32_t)t[0] & 0x3ffffff; + c = (t[0] >> 26); + t[1] += c; + state->h1 = (uint32_t)t[1] & 0x3ffffff; + b = (uint32_t)(t[1] >> 26); + t[2] += b; + state->h2 = (uint32_t)t[2] & 0x3ffffff; + b = (uint32_t)(t[2] >> 26); + t[3] += b; + state->h3 = (uint32_t)t[3] & 0x3ffffff; + b = (uint32_t)(t[3] >> 26); + t[4] += b; + state->h4 = (uint32_t)t[4] & 0x3ffffff; + b = (uint32_t)(t[4] >> 26); + state->h0 += b * 5; + + if (len >= 16) { + goto poly1305_donna_16bytes; + } + +// final bytes +poly1305_donna_atmost15bytes: + if (!len) { + return; + } + + for (j = 0; j < len; j++) { + mp[j] = in[j]; + } + mp[j++] = 1; + for (; j < 16; j++) { + mp[j] = 0; + } + len = 0; + + t0 = U8TO32_LE(mp + 0); + t1 = U8TO32_LE(mp + 4); + t2 = U8TO32_LE(mp + 8); + t3 = U8TO32_LE(mp + 12); + + state->h0 += t0 & 0x3ffffff; + state->h1 += ((((uint64_t)t1 << 32) | t0) >> 26) & 0x3ffffff; + state->h2 += ((((uint64_t)t2 << 32) | t1) >> 20) & 0x3ffffff; + state->h3 += ((((uint64_t)t3 << 32) | t2) >> 14) & 0x3ffffff; + state->h4 += (t3 >> 8); + + goto poly1305_donna_mul; +} + +void GFp_poly1305_init(poly1305_state *statep, const uint8_t key[32]) { + struct poly1305_state_st *state = poly1305_aligned_state(statep); + uint32_t t0, t1, t2, t3; + + t0 = U8TO32_LE(key + 0); + t1 = U8TO32_LE(key + 4); + t2 = U8TO32_LE(key + 8); + t3 = U8TO32_LE(key + 12); + + // precompute multipliers + state->r0 = t0 & 0x3ffffff; + t0 >>= 26; + t0 |= t1 << 6; + state->r1 = t0 & 0x3ffff03; + t1 >>= 20; + t1 |= t2 << 12; + state->r2 = t1 & 0x3ffc0ff; + t2 >>= 14; + t2 |= t3 << 18; + state->r3 = t2 & 0x3f03fff; + t3 >>= 8; + state->r4 = t3 & 0x00fffff; + + state->s1 = state->r1 * 5; + state->s2 = state->r2 * 5; + state->s3 = state->r3 * 5; + state->s4 = state->r4 * 5; + + // init state + state->h0 = 0; + state->h1 = 0; + state->h2 = 0; + state->h3 = 0; + state->h4 = 0; + + state->buf_used = 0; + 
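The r0..r4 masks in GFp_poly1305_init above are the standard Poly1305 key clamp, r &= 0x0ffffffc0ffffffc0ffffffc0fffffff, re-expressed per 26-bit limb. A byte-wise sketch of the equivalent clamp, shown only to decode those limb masks (hypothetical helper):

    #include <stdint.h>

    /* Clear the top four bits of bytes 3, 7, 11, 15 and the low two bits of
     * bytes 4, 8, 12 of the first key half; this is the same restriction the
     * 0x3ffff03 / 0x3ffc0ff / 0x3f03fff / 0x00fffff masks apply limb-wise. */
    static void clamp_r(uint8_t r[16]) {
      r[3] &= 15;  r[7] &= 15;  r[11] &= 15;  r[15] &= 15;
      r[4] &= 252; r[8] &= 252; r[12] &= 252;
    }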
GFp_memcpy(state->key, key + 16, sizeof(state->key)); +} + +void GFp_poly1305_update(poly1305_state *statep, const uint8_t *in, + size_t in_len) { + struct poly1305_state_st *state = poly1305_aligned_state(statep); + + if (state->buf_used) { + size_t todo = 16 - state->buf_used; + if (todo > in_len) { + todo = in_len; + } + for (size_t i = 0; i < todo; i++) { + state->buf[state->buf_used + i] = in[i]; + } + state->buf_used += todo; + in_len -= todo; + in += todo; + + if (state->buf_used == 16) { + poly1305_update(state, state->buf, 16); + state->buf_used = 0; + } + } + + if (in_len >= 16) { + size_t todo = in_len & ~0xf; + poly1305_update(state, in, todo); + in += todo; + in_len &= 0xf; + } + + if (in_len) { + for (size_t i = 0; i < in_len; i++) { + state->buf[i] = in[i]; + } + state->buf_used = in_len; + } +} + +void GFp_poly1305_finish(poly1305_state *statep, uint8_t mac[16]) { + struct poly1305_state_st *state = poly1305_aligned_state(statep); + uint64_t f0, f1, f2, f3; + uint32_t g0, g1, g2, g3, g4; + uint32_t b, nb; + + if (state->buf_used) { + poly1305_update(state, state->buf, state->buf_used); + } + + b = state->h0 >> 26; + state->h0 = state->h0 & 0x3ffffff; + state->h1 += b; + b = state->h1 >> 26; + state->h1 = state->h1 & 0x3ffffff; + state->h2 += b; + b = state->h2 >> 26; + state->h2 = state->h2 & 0x3ffffff; + state->h3 += b; + b = state->h3 >> 26; + state->h3 = state->h3 & 0x3ffffff; + state->h4 += b; + b = state->h4 >> 26; + state->h4 = state->h4 & 0x3ffffff; + state->h0 += b * 5; + + g0 = state->h0 + 5; + b = g0 >> 26; + g0 &= 0x3ffffff; + g1 = state->h1 + b; + b = g1 >> 26; + g1 &= 0x3ffffff; + g2 = state->h2 + b; + b = g2 >> 26; + g2 &= 0x3ffffff; + g3 = state->h3 + b; + b = g3 >> 26; + g3 &= 0x3ffffff; + g4 = state->h4 + b - (1 << 26); + + b = (g4 >> 31) - 1; + nb = ~b; + state->h0 = (state->h0 & nb) | (g0 & b); + state->h1 = (state->h1 & nb) | (g1 & b); + state->h2 = (state->h2 & nb) | (g2 & b); + state->h3 = (state->h3 & nb) | (g3 & b); + state->h4 = (state->h4 & nb) | (g4 & b); + + f0 = ((state->h0) | (state->h1 << 26)) + (uint64_t)U8TO32_LE(&state->key[0]); + f1 = ((state->h1 >> 6) | (state->h2 << 20)) + + (uint64_t)U8TO32_LE(&state->key[4]); + f2 = ((state->h2 >> 12) | (state->h3 << 14)) + + (uint64_t)U8TO32_LE(&state->key[8]); + f3 = ((state->h3 >> 18) | (state->h4 << 8)) + + (uint64_t)U8TO32_LE(&state->key[12]); + + U32TO8_LE(&mac[0], (uint32_t)f0); + f1 += (f0 >> 32); + U32TO8_LE(&mac[4], (uint32_t)f1); + f2 += (f1 >> 32); + U32TO8_LE(&mac[8], (uint32_t)f2); + f3 += (f2 >> 32); + U32TO8_LE(&mac[12], (uint32_t)f3); +} + +#endif // !BORINGSSL_HAS_UINT128 || !OPENSSL_X86_64 diff --git a/crypto/poly1305/poly1305_arm.c b/crypto/poly1305/poly1305_arm.c new file mode 100644 index 0000000000..3b00a9f2f3 --- /dev/null +++ b/crypto/poly1305/poly1305_arm.c @@ -0,0 +1,307 @@ +/* Copyright (c) 2014, Google Inc. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ + +// This implementation was taken from the public domain, neon2 version in +// SUPERCOP by D. J. Bernstein and Peter Schwabe. + +#include + +#include "internal.h" +#include "../internal.h" + + +#if defined(OPENSSL_POLY1305_NEON) + +#pragma GCC diagnostic ignored "-Wsign-conversion" +#pragma GCC diagnostic ignored "-Wcast-align" + +typedef struct { + uint32_t v[12]; // for alignment; only using 10 +} fe1305x2; + +#define addmulmod GFp_poly1305_neon2_addmulmod +#define blocks GFp_poly1305_neon2_blocks + +extern void addmulmod(fe1305x2 *r, const fe1305x2 *x, const fe1305x2 *y, + const fe1305x2 *c); + +extern int blocks(fe1305x2 *h, const fe1305x2 *precomp, const uint8_t *in, + size_t inlen); + +static void freeze(fe1305x2 *r) { + int i; + + uint32_t x0 = r->v[0]; + uint32_t x1 = r->v[2]; + uint32_t x2 = r->v[4]; + uint32_t x3 = r->v[6]; + uint32_t x4 = r->v[8]; + uint32_t y0; + uint32_t y1; + uint32_t y2; + uint32_t y3; + uint32_t y4; + uint32_t swap; + + for (i = 0; i < 3; ++i) { + x1 += x0 >> 26; + x0 &= 0x3ffffff; + x2 += x1 >> 26; + x1 &= 0x3ffffff; + x3 += x2 >> 26; + x2 &= 0x3ffffff; + x4 += x3 >> 26; + x3 &= 0x3ffffff; + x0 += 5 * (x4 >> 26); + x4 &= 0x3ffffff; + } + + y0 = x0 + 5; + y1 = x1 + (y0 >> 26); + y0 &= 0x3ffffff; + y2 = x2 + (y1 >> 26); + y1 &= 0x3ffffff; + y3 = x3 + (y2 >> 26); + y2 &= 0x3ffffff; + y4 = x4 + (y3 >> 26); + y3 &= 0x3ffffff; + swap = -(y4 >> 26); + y4 &= 0x3ffffff; + + y0 ^= x0; + y1 ^= x1; + y2 ^= x2; + y3 ^= x3; + y4 ^= x4; + + y0 &= swap; + y1 &= swap; + y2 &= swap; + y3 &= swap; + y4 &= swap; + + y0 ^= x0; + y1 ^= x1; + y2 ^= x2; + y3 ^= x3; + y4 ^= x4; + + r->v[0] = y0; + r->v[2] = y1; + r->v[4] = y2; + r->v[6] = y3; + r->v[8] = y4; +} + +static void store32(uint8_t out[4], uint32_t v) { GFp_memcpy(out, &v, 4); } + +// load32 exists to avoid breaking strict aliasing rules in +// fe1305x2_frombytearray. 
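The fixed-size memcpy in load32 (and store32) is the portable idiom for unaligned little-endian access: compilers lower it to a single 32-bit load or store, whereas the pointer cast it replaces is undefined behavior under C strict-aliasing rules and may fault on alignment-strict targets. For comparison, a sketch:

    #include <stdint.h>
    #include <string.h>

    static uint32_t load32_portable(const uint8_t *p) {
      uint32_t v;
      memcpy(&v, p, sizeof(v)); /* compiled down to one 32-bit load */
      return v;
    }
    /* Not this: return *(const uint32_t *)p;  -- undefined behavior via
     * strict aliasing, and p may not be 4-byte aligned. */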
+static uint32_t load32(const uint8_t t[4]) { + uint32_t tmp; + GFp_memcpy(&tmp, t, sizeof(tmp)); + return tmp; +} + +static void fe1305x2_tobytearray(uint8_t r[16], fe1305x2 *x) { + uint32_t x0 = x->v[0]; + uint32_t x1 = x->v[2]; + uint32_t x2 = x->v[4]; + uint32_t x3 = x->v[6]; + uint32_t x4 = x->v[8]; + + x1 += x0 >> 26; + x0 &= 0x3ffffff; + x2 += x1 >> 26; + x1 &= 0x3ffffff; + x3 += x2 >> 26; + x2 &= 0x3ffffff; + x4 += x3 >> 26; + x3 &= 0x3ffffff; + + store32(r, x0 + (x1 << 26)); + store32(r + 4, (x1 >> 6) + (x2 << 20)); + store32(r + 8, (x2 >> 12) + (x3 << 14)); + store32(r + 12, (x3 >> 18) + (x4 << 8)); +} + +static void fe1305x2_frombytearray(fe1305x2 *r, const uint8_t *x, size_t xlen) { + size_t i; + uint8_t t[17]; + + for (i = 0; (i < 16) && (i < xlen); i++) { + t[i] = x[i]; + } + xlen -= i; + x += i; + t[i++] = 1; + for (; i < 17; i++) { + t[i] = 0; + } + + r->v[0] = 0x3ffffff & load32(t); + r->v[2] = 0x3ffffff & (load32(t + 3) >> 2); + r->v[4] = 0x3ffffff & (load32(t + 6) >> 4); + r->v[6] = 0x3ffffff & (load32(t + 9) >> 6); + r->v[8] = load32(t + 13); + + if (xlen) { + for (i = 0; (i < 16) && (i < xlen); i++) { + t[i] = x[i]; + } + t[i++] = 1; + for (; i < 17; i++) { + t[i] = 0; + } + + r->v[1] = 0x3ffffff & load32(t); + r->v[3] = 0x3ffffff & (load32(t + 3) >> 2); + r->v[5] = 0x3ffffff & (load32(t + 6) >> 4); + r->v[7] = 0x3ffffff & (load32(t + 9) >> 6); + r->v[9] = load32(t + 13); + } else { + r->v[1] = r->v[3] = r->v[5] = r->v[7] = r->v[9] = 0; + } +} + +static const alignas(16) fe1305x2 zero; + +struct poly1305_state_st { + uint8_t data[sizeof(fe1305x2[5]) + 128]; + uint8_t buf[32]; + size_t buf_used; + uint8_t key[16]; +}; + +OPENSSL_STATIC_ASSERT(sizeof(struct poly1305_state_st) <= sizeof(poly1305_state), + "poly1305_state isn't large enough to hold aligned poly1305_state_st"); + +void GFp_poly1305_init_neon(poly1305_state *state, const uint8_t key[32]) { + struct poly1305_state_st *st = (struct poly1305_state_st *)(state); + fe1305x2 *const r = (fe1305x2 *)(st->data + (15 & (-(int)st->data))); + fe1305x2 *const h = r + 1; + fe1305x2 *const c = h + 1; + fe1305x2 *const precomp = c + 1; + + r->v[1] = r->v[0] = 0x3ffffff & load32(key); + r->v[3] = r->v[2] = 0x3ffff03 & (load32(key + 3) >> 2); + r->v[5] = r->v[4] = 0x3ffc0ff & (load32(key + 6) >> 4); + r->v[7] = r->v[6] = 0x3f03fff & (load32(key + 9) >> 6); + r->v[9] = r->v[8] = 0x00fffff & (load32(key + 12) >> 8); + + for (size_t j = 0; j < 10; j++) { + h->v[j] = 0; // XXX: should fast-forward a bit + } + + addmulmod(precomp, r, r, &zero); // precompute r^2 + addmulmod(precomp + 1, precomp, precomp, &zero); // precompute r^4 + + GFp_memcpy(st->key, key + 16, 16); + st->buf_used = 0; +} + +void GFp_poly1305_update_neon(poly1305_state *state, const uint8_t *in, + size_t in_len) { + struct poly1305_state_st *st = (struct poly1305_state_st *)(state); + fe1305x2 *const r = (fe1305x2 *)(st->data + (15 & (-(int)st->data))); + fe1305x2 *const h = r + 1; + fe1305x2 *const c = h + 1; + fe1305x2 *const precomp = c + 1; + + if (st->buf_used) { + size_t todo = 32 - st->buf_used; + if (todo > in_len) { + todo = in_len; + } + for (size_t i = 0; i < todo; i++) { + st->buf[st->buf_used + i] = in[i]; + } + st->buf_used += todo; + in_len -= todo; + in += todo; + + if (st->buf_used == sizeof(st->buf) && in_len) { + addmulmod(h, h, precomp, &zero); + fe1305x2_frombytearray(c, st->buf, sizeof(st->buf)); + for (size_t i = 0; i < 10; i++) { + h->v[i] += c->v[i]; + } + st->buf_used = 0; + } + } + + while (in_len > 32) { + size_t tlen = 1048576; + if 
(in_len < tlen) { + tlen = in_len; + } + tlen -= blocks(h, precomp, in, tlen); + in_len -= tlen; + in += tlen; + } + + if (in_len) { + for (size_t i = 0; i < in_len; i++) { + st->buf[i] = in[i]; + } + st->buf_used = in_len; + } +} + +void GFp_poly1305_finish_neon(poly1305_state *state, uint8_t mac[16]) { + struct poly1305_state_st *st = (struct poly1305_state_st *)(state); + fe1305x2 *const r = (fe1305x2 *)(st->data + (15 & (-(int)st->data))); + fe1305x2 *const h = r + 1; + fe1305x2 *const c = h + 1; + fe1305x2 *const precomp = c + 1; + + addmulmod(h, h, precomp, &zero); + + if (st->buf_used > 16) { + fe1305x2_frombytearray(c, st->buf, st->buf_used); + precomp->v[1] = r->v[1]; + precomp->v[3] = r->v[3]; + precomp->v[5] = r->v[5]; + precomp->v[7] = r->v[7]; + precomp->v[9] = r->v[9]; + addmulmod(h, h, precomp, c); + } else if (st->buf_used > 0) { + fe1305x2_frombytearray(c, st->buf, st->buf_used); + r->v[1] = 1; + r->v[3] = 0; + r->v[5] = 0; + r->v[7] = 0; + r->v[9] = 0; + addmulmod(h, h, r, c); + } + + h->v[0] += h->v[1]; + h->v[2] += h->v[3]; + h->v[4] += h->v[5]; + h->v[6] += h->v[7]; + h->v[8] += h->v[9]; + freeze(h); + + fe1305x2_frombytearray(c, st->key, 16); + c->v[8] ^= (1 << 24); + + h->v[0] += c->v[0]; + h->v[2] += c->v[2]; + h->v[4] += c->v[4]; + h->v[6] += c->v[6]; + h->v[8] += c->v[8]; + fe1305x2_tobytearray(mac, h); +} + +#endif // OPENSSL_POLY1305_NEON diff --git a/crypto/poly1305/poly1305_arm_asm.S b/crypto/poly1305/poly1305_arm_asm.S new file mode 100644 index 0000000000..24ae435fdd --- /dev/null +++ b/crypto/poly1305/poly1305_arm_asm.S @@ -0,0 +1,2031 @@ +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif +#endif + +#if defined(__arm__) && !defined(OPENSSL_NO_ASM) && !defined(__APPLE__) + +#pragma GCC diagnostic ignored "-Wlanguage-extension-token" + +#if defined(BORINGSSL_PREFIX) +#include +#endif + +# This implementation was taken from the public domain, neon2 version in +# SUPERCOP by D. J. Bernstein and Peter Schwabe. 
+ +# qhasm: int32 input_0 + +# qhasm: int32 input_1 + +# qhasm: int32 input_2 + +# qhasm: int32 input_3 + +# qhasm: stack32 input_4 + +# qhasm: stack32 input_5 + +# qhasm: stack32 input_6 + +# qhasm: stack32 input_7 + +# qhasm: int32 caller_r4 + +# qhasm: int32 caller_r5 + +# qhasm: int32 caller_r6 + +# qhasm: int32 caller_r7 + +# qhasm: int32 caller_r8 + +# qhasm: int32 caller_r9 + +# qhasm: int32 caller_r10 + +# qhasm: int32 caller_r11 + +# qhasm: int32 caller_r12 + +# qhasm: int32 caller_r14 + +# qhasm: reg128 caller_q4 + +# qhasm: reg128 caller_q5 + +# qhasm: reg128 caller_q6 + +# qhasm: reg128 caller_q7 + +# qhasm: startcode +.fpu neon +.text + +# qhasm: reg128 r0 + +# qhasm: reg128 r1 + +# qhasm: reg128 r2 + +# qhasm: reg128 r3 + +# qhasm: reg128 r4 + +# qhasm: reg128 x01 + +# qhasm: reg128 x23 + +# qhasm: reg128 x4 + +# qhasm: reg128 y0 + +# qhasm: reg128 y12 + +# qhasm: reg128 y34 + +# qhasm: reg128 5y12 + +# qhasm: reg128 5y34 + +# qhasm: stack128 y0_stack + +# qhasm: stack128 y12_stack + +# qhasm: stack128 y34_stack + +# qhasm: stack128 5y12_stack + +# qhasm: stack128 5y34_stack + +# qhasm: reg128 z0 + +# qhasm: reg128 z12 + +# qhasm: reg128 z34 + +# qhasm: reg128 5z12 + +# qhasm: reg128 5z34 + +# qhasm: stack128 z0_stack + +# qhasm: stack128 z12_stack + +# qhasm: stack128 z34_stack + +# qhasm: stack128 5z12_stack + +# qhasm: stack128 5z34_stack + +# qhasm: stack128 two24 + +# qhasm: int32 ptr + +# qhasm: reg128 c01 + +# qhasm: reg128 c23 + +# qhasm: reg128 d01 + +# qhasm: reg128 d23 + +# qhasm: reg128 t0 + +# qhasm: reg128 t1 + +# qhasm: reg128 t2 + +# qhasm: reg128 t3 + +# qhasm: reg128 t4 + +# qhasm: reg128 mask + +# qhasm: reg128 u0 + +# qhasm: reg128 u1 + +# qhasm: reg128 u2 + +# qhasm: reg128 u3 + +# qhasm: reg128 u4 + +# qhasm: reg128 v01 + +# qhasm: reg128 mid + +# qhasm: reg128 v23 + +# qhasm: reg128 v4 + +# qhasm: int32 len + +# qhasm: qpushenter crypto_onetimeauth_poly1305_neon2_blocks +.align 4 +.global GFp_poly1305_neon2_blocks +.hidden GFp_poly1305_neon2_blocks +.type GFp_poly1305_neon2_blocks STT_FUNC +GFp_poly1305_neon2_blocks: +vpush {q4,q5,q6,q7} +mov r12,sp +sub sp,sp,#192 +bic sp,sp,#31 + +# qhasm: len = input_3 +# asm 1: mov >len=int32#4,len=r3,y12=reg128#2%bot->y12=reg128#2%top},[y12=d2->y12=d3},[y34=reg128#3%bot->y34=reg128#3%top},[y34=d4->y34=d5},[input_1=int32#2,input_1=r1,z12=reg128#5%bot->z12=reg128#5%top},[z12=d8->z12=d9},[z34=reg128#6%bot->z34=reg128#6%top},[z34=d10->z34=d11},[mask=reg128#7,#0xffffffff +# asm 2: vmov.i64 >mask=q6,#0xffffffff +vmov.i64 q6,#0xffffffff + +# qhasm: 2x u4 = 0xff +# asm 1: vmov.i64 >u4=reg128#8,#0xff +# asm 2: vmov.i64 >u4=q7,#0xff +vmov.i64 q7,#0xff + +# qhasm: x01 aligned= mem128[input_0];input_0+=16 +# asm 1: vld1.8 {>x01=reg128#9%bot->x01=reg128#9%top},[x01=d16->x01=d17},[x23=reg128#10%bot->x23=reg128#10%top},[x23=d18->x23=d19},[input_0=int32#1,input_0=r0,>=6 +# asm 1: vshr.u64 >mask=reg128#7,mask=q6,>= 7 +# asm 1: vshr.u64 >u4=reg128#8,u4=q7,5y12=reg128#12,5y12=q11,5y34=reg128#13,5y34=q12,5y12=reg128#12,<5y12=reg128#12,5y12=q11,<5y12=q11,5y34=reg128#13,<5y34=reg128#13,5y34=q12,<5y34=q12,u4=reg128#8,u4=q7,5z12=reg128#14,5z12=q13,5z34=reg128#15,5z34=q14,5z12=reg128#14,<5z12=reg128#14,5z12=q13,<5z12=q13,5z34=reg128#15,<5z34=reg128#15,5z34=q14,<5z34=q14,ptr=int32#2,ptr=r1,r4=reg128#16,r4=q15,r0=reg128#8,r0=q7,ptr=int32#2,ptr=r1,ptr=int32#2,ptr=r1,ptr=int32#2,ptr=r1,ptr=int32#2,ptr=r1,ptr=int32#2,ptr=r1,ptr=int32#2,ptr=r1,ptr=int32#2,<5y12_stack=stack128#5 +# asm 2: lea >ptr=r1,<5y12_stack=[sp,#64] +add r1,sp,#64 + +# 
qhasm: mem128[ptr] aligned= 5y12 +# asm 1: vst1.8 {<5y12=reg128#12%bot-<5y12=reg128#12%top},[ptr=int32#2,<5y34_stack=stack128#6 +# asm 2: lea >ptr=r1,<5y34_stack=[sp,#80] +add r1,sp,#80 + +# qhasm: mem128[ptr] aligned= 5y34 +# asm 1: vst1.8 {<5y34=reg128#13%bot-<5y34=reg128#13%top},[ptr=int32#2,<5z12_stack=stack128#10 +# asm 2: lea >ptr=r1,<5z12_stack=[sp,#144] +add r1,sp,#144 + +# qhasm: mem128[ptr] aligned= 5z12 +# asm 1: vst1.8 {<5z12=reg128#14%bot-<5z12=reg128#14%top},[ptr=int32#2,<5z34_stack=stack128#11 +# asm 2: lea >ptr=r1,<5z34_stack=[sp,#160] +add r1,sp,#160 + +# qhasm: mem128[ptr] aligned= 5z34 +# asm 1: vst1.8 {<5z34=reg128#15%bot-<5z34=reg128#15%top},[? len - 64 +# asm 1: cmp +bls ._below64bytes + +# qhasm: input_2 += 32 +# asm 1: add >input_2=int32#2,input_2=r1,c01=reg128#1%bot->c01=reg128#1%top},[c01=d0->c01=d1},[c23=reg128#2%bot->c23=reg128#2%top},[c23=d2->c23=d3},[ptr=int32#3,ptr=r2,z12=reg128#3%bot->z12=reg128#3%top},[z12=d4->z12=d5},[ptr=int32#3,ptr=r2,z0=reg128#4%bot->z0=reg128#4%top},[z0=d6->z0=d7},[r3=reg128#5,r3=q4,input_2=int32#2,input_2=r1,ptr=int32#3,<5z34_stack=stack128#11 +# asm 2: lea >ptr=r2,<5z34_stack=[sp,#160] +add r2,sp,#160 + +# qhasm: 5z34 aligned= mem128[ptr] +# asm 1: vld1.8 {>5z34=reg128#6%bot->5z34=reg128#6%top},[5z34=d10->5z34=d11},[r0=reg128#8,r0=q7,r2=reg128#14,r2=q13,d01=reg128#12%bot->d01=reg128#12%top},[d01=d22->d01=d23},[r1=reg128#15,r1=q14,ptr=int32#3,<5z12_stack=stack128#10 +# asm 2: lea >ptr=r2,<5z12_stack=[sp,#144] +add r2,sp,#144 + +# qhasm: 5z12 aligned= mem128[ptr] +# asm 1: vld1.8 {>5z12=reg128#1%bot->5z12=reg128#1%top},[5z12=d0->5z12=d1},[d23=reg128#2%bot->d23=reg128#2%top},[d23=d2->d23=d3},[input_2=int32#2,input_2=r1,> 40 +# asm 1: vshr.u64 >v4=reg128#4,v4=q3,> 14; v23[3] = d23[2,3] unsigned>> 14 +# asm 1: vshrn.u64 > 26; v01[3] = d01[2,3] unsigned>> 26 +# asm 1: vshrn.u64 > 20; v23[1] = mid[2,3] unsigned>> 20 +# asm 1: vshrn.u64 ptr=int32#3,ptr=r2,y34=reg128#3%bot->y34=reg128#3%top},[y34=d4->y34=d5},[ptr=int32#3,ptr=r2,y12=reg128#2%bot->y12=reg128#2%top},[y12=d2->y12=d3},[ptr=int32#3,ptr=r2,y0=reg128#1%bot->y0=reg128#1%top},[y0=d0->y0=d1},[ptr=int32#3,<5y34_stack=stack128#6 +# asm 2: lea >ptr=r2,<5y34_stack=[sp,#80] +add r2,sp,#80 + +# qhasm: 5y34 aligned= mem128[ptr] +# asm 1: vld1.8 {>5y34=reg128#13%bot->5y34=reg128#13%top},[5y34=d24->5y34=d25},[ptr=int32#3,<5y12_stack=stack128#5 +# asm 2: lea >ptr=r2,<5y12_stack=[sp,#64] +add r2,sp,#64 + +# qhasm: 5y12 aligned= mem128[ptr] +# asm 1: vld1.8 {>5y12=reg128#12%bot->5y12=reg128#12%top},[5y12=d22->5y12=d23},[ptr=int32#3,ptr=r2,> 26 +# asm 1: vshr.u64 >t1=reg128#4,t1=q3,len=int32#4,len=r3,r0=reg128#6,r0=q5,r1=reg128#4,r1=q3,> 26 +# asm 1: vshr.u64 >t4=reg128#8,t4=q7,r3=reg128#5,r3=q4,x4=reg128#8,x4=q7,r4=reg128#16%bot->r4=reg128#16%top},[r4=d30->r4=d31},[> 26 +# asm 1: vshr.u64 >t2=reg128#9,t2=q8,r1=reg128#4,r1=q3,> 26 +# asm 1: vshr.u64 >t0=reg128#10,t0=q9,r2=reg128#9,r2=q8,x4=reg128#11,x4=q10,x01=reg128#6,x01=q5,r0=reg128#8%bot->r0=reg128#8%top},[r0=d14->r0=d15},[ptr=int32#3,ptr=r2,t0=reg128#10,t0=q9,> 26 +# asm 1: vshr.u64 >t3=reg128#14,t3=q13,x01=reg128#15,x01=q14,z34=reg128#6%bot->z34=reg128#6%top},[z34=d10->z34=d11},[x23=reg128#10,x23=q9,r3=reg128#5,r3=q4,input_2=int32#2,input_2=r1,> 26 +# asm 1: vshr.u64 >t1=reg128#14,t1=q13,x01=reg128#9,x01=q8,r1=reg128#4,r1=q3,> 26 +# asm 1: vshr.u64 >t4=reg128#14,t4=q13,r3=reg128#5,r3=q4,x4=reg128#11,x4=q10,? len - 64 +# asm 1: cmp +bhi ._mainloop2 + +# qhasm: input_2 -= 32 +# asm 1: sub >input_2=int32#3,input_2=r2,? 
len - 32 +# asm 1: cmp +bls ._end + +# qhasm: mainloop: +._mainloop: + +# qhasm: new r0 + +# qhasm: ptr = &two24 +# asm 1: lea >ptr=int32#2,ptr=r1,r4=reg128#5%bot->r4=reg128#5%top},[r4=d8->r4=d9},[u4=reg128#6%bot->u4=reg128#6%top},[u4=d10->u4=d11},[c01=reg128#8%bot->c01=reg128#8%top},[c01=d14->c01=d15},[c23=reg128#14%bot->c23=reg128#14%top},[c23=d26->c23=d27},[r0=reg128#4,r0=q3,r3=reg128#6,r3=q5,r1=reg128#14,r1=q13,r2=reg128#8,r2=q7,> 26 +# asm 1: vshr.u64 >t1=reg128#9,t1=q8,r0=reg128#4,r0=q3,r1=reg128#9,r1=q8,> 26 +# asm 1: vshr.u64 >t4=reg128#10,t4=q9,r3=reg128#6,r3=q5,r4=reg128#5,r4=q4,> 26 +# asm 1: vshr.u64 >t2=reg128#10,t2=q9,r1=reg128#11,r1=q10,> 26 +# asm 1: vshr.u64 >t0=reg128#9,t0=q8,r2=reg128#8,r2=q7,r4=reg128#5,r4=q4,r0=reg128#4,r0=q3,t0=reg128#9,t0=q8,> 26 +# asm 1: vshr.u64 >t3=reg128#14,t3=q13,r0=reg128#4,r0=q3,x23=reg128#10,x23=q9,r3=reg128#6,r3=q5,> 26 +# asm 1: vshr.u64 >t1=reg128#8,t1=q7,x01=reg128#9,x01=q8,r1=reg128#4,r1=q3,> 26 +# asm 1: vshr.u64 >t4=reg128#8,t4=q7,r3=reg128#6,r3=q5,x4=reg128#11,x4=q10,len=int32#4,len=r3,? len - 32 +# asm 1: cmp +bhi ._mainloop + +# qhasm: end: +._end: + +# qhasm: mem128[input_0] = x01;input_0+=16 +# asm 1: vst1.8 {len=int32#1,len=r0,mask=reg128#1,#0xffffffff +# asm 2: vmov.i64 >mask=q0,#0xffffffff +vmov.i64 q0,#0xffffffff + +# qhasm: y01 aligned= mem128[input_2];input_2+=16 +# asm 1: vld1.8 {>y01=reg128#2%bot->y01=reg128#2%top},[y01=d2->y01=d3},[_5y01=reg128#3,_5y01=q2,y23=reg128#4%bot->y23=reg128#4%top},[y23=d6->y23=d7},[_5y23=reg128#9,_5y23=q8,_5y4=reg128#11,_5y4=q10,x01=reg128#12%bot->x01=reg128#12%top},[x01=d22->x01=d23},[_5y01=reg128#3,<_5y01=reg128#3,_5y01=q2,<_5y01=q2,x23=reg128#13%bot->x23=reg128#13%top},[x23=d24->x23=d25},[_5y23=reg128#9,<_5y23=reg128#9,_5y23=q8,<_5y23=q8,_5y4=reg128#11,<_5y4=reg128#11,_5y4=q10,<_5y4=q10,c01=reg128#14%bot->c01=reg128#14%top},[c01=d26->c01=d27},[x01=reg128#12,x01=q11,c23=reg128#14%bot->c23=reg128#14%top},[c23=d26->c23=d27},[x23=reg128#13,x23=q12,>=6 +# asm 1: vshr.u64 >mask=reg128#1,mask=q0,x4=reg128#14,x4=q13,r0=reg128#15,r0=q14,r1=reg128#3,r1=q2,r2=reg128#16,r2=q15,r3=reg128#9,r3=q8,r4=reg128#10,r4=q9,> 26 +# asm 1: vshr.u64 >t1=reg128#2,t1=q1,r0=reg128#4,r0=q3,r1=reg128#2,r1=q1,> 26 +# asm 1: vshr.u64 >t4=reg128#3,t4=q2,r3=reg128#9,r3=q8,r4=reg128#3,r4=q2,> 26 +# asm 1: vshr.u64 >t2=reg128#10,t2=q9,r1=reg128#2,r1=q1,> 26 +# asm 1: vshr.u64 >t0=reg128#11,t0=q10,r2=reg128#10,r2=q9,r4=reg128#3,r4=q2,r0=reg128#4,r0=q3,t0=reg128#11,t0=q10,> 26 +# asm 1: vshr.u64 >t3=reg128#12,t3=q11,r0=reg128#4,r0=q3,x23=reg128#10,x23=q9,r3=reg128#9,r3=q8,> 26 +# asm 1: vshr.u64 >t1=reg128#11,t1=q10,x01=reg128#4,x01=q3,r1=reg128#2,r1=q1,> 26 +# asm 1: vshr.u64 >t4=reg128#11,t4=q10,r3=reg128#1,r3=q0,x4=reg128#3,x4=q2, + +#include "internal.h" +#include "../internal.h" + + +#if defined(BORINGSSL_HAS_UINT128) && defined(OPENSSL_X86_64) + +#pragma GCC diagnostic ignored "-Wcast-align" +#pragma GCC diagnostic ignored "-Wsign-conversion" + +#include + +static uint32_t load_u32_le(const uint8_t in[4]) { + uint32_t ret; + GFp_memcpy(&ret, in, 4); + return ret; +} + +static uint64_t load_u64_le(const uint8_t in[8]) { + uint64_t ret; + GFp_memcpy(&ret, in, 8); + return ret; +} + +static void store_u64_le(uint8_t out[8], uint64_t v) { + GFp_memcpy(out, &v, 8); +} + +typedef __m128i xmmi; + +static const alignas(16) uint32_t poly1305_x64_sse2_message_mask[4] = { + (1 << 26) - 1, 0, (1 << 26) - 1, 0}; +static const alignas(16) uint32_t poly1305_x64_sse2_5[4] = {5, 0, 5, 0}; +static const alignas(16) uint32_t 
poly1305_x64_sse2_1shl128[4] = { + (1 << 24), 0, (1 << 24), 0}; + +static inline uint128_t add128(uint128_t a, uint128_t b) { return a + b; } + +static inline uint128_t add128_64(uint128_t a, uint64_t b) { return a + b; } + +static inline uint128_t mul64x64_128(uint64_t a, uint64_t b) { + return (uint128_t)a * b; +} + +static inline uint64_t lo128(uint128_t a) { return (uint64_t)a; } + +static inline uint64_t shr128(uint128_t v, const int shift) { + return (uint64_t)(v >> shift); +} + +static inline uint64_t shr128_pair(uint64_t hi, uint64_t lo, const int shift) { + return (uint64_t)((((uint128_t)hi << 64) | lo) >> shift); +} + +typedef struct poly1305_power_t { + union { + xmmi v; + uint64_t u[2]; + uint32_t d[4]; + } R20, R21, R22, R23, R24, S21, S22, S23, S24; +} poly1305_power; + +typedef struct poly1305_state_internal_t { + poly1305_power P[2]; /* 288 bytes, top 32 bit halves unused = 144 + bytes of free storage */ + union { + xmmi H[5]; // 80 bytes + uint64_t HH[10]; + }; + // uint64_t r0,r1,r2; [24 bytes] + // uint64_t pad0,pad1; [16 bytes] + uint64_t started; // 8 bytes + uint64_t leftover; // 8 bytes + uint8_t buffer[64]; // 64 bytes +} poly1305_state_internal; /* 448 bytes total + 63 bytes for + alignment = 511 bytes raw */ + +OPENSSL_STATIC_ASSERT(sizeof(poly1305_state_internal) <= sizeof(poly1305_state), + "poly1305_state isn't large enough to hold aligned poly1305_state_internal"); + +static inline poly1305_state_internal *poly1305_aligned_state( + poly1305_state *state) { + dev_assert_secret(((uintptr_t)state & 63) == 0); + return (poly1305_state_internal *)(((uint64_t)state + 63) & ~63); +} + +static inline size_t poly1305_min(size_t a, size_t b) { + return (a < b) ? a : b; +} + +void GFp_poly1305_init(poly1305_state *state, const uint8_t key[32]) { + poly1305_state_internal *st = poly1305_aligned_state(state); + poly1305_power *p; + uint64_t r0, r1, r2; + uint64_t t0, t1; + + // clamp key + t0 = load_u64_le(key + 0); + t1 = load_u64_le(key + 8); + r0 = t0 & 0xffc0fffffff; + t0 >>= 44; + t0 |= t1 << 20; + r1 = t0 & 0xfffffc0ffff; + t1 >>= 24; + r2 = t1 & 0x00ffffffc0f; + + // store r in un-used space of st->P[1] + p = &st->P[1]; + p->R20.d[1] = (uint32_t)(r0); + p->R20.d[3] = (uint32_t)(r0 >> 32); + p->R21.d[1] = (uint32_t)(r1); + p->R21.d[3] = (uint32_t)(r1 >> 32); + p->R22.d[1] = (uint32_t)(r2); + p->R22.d[3] = (uint32_t)(r2 >> 32); + + // store pad + p->R23.d[1] = load_u32_le(key + 16); + p->R23.d[3] = load_u32_le(key + 20); + p->R24.d[1] = load_u32_le(key + 24); + p->R24.d[3] = load_u32_le(key + 28); + + // H = 0 + st->H[0] = _mm_setzero_si128(); + st->H[1] = _mm_setzero_si128(); + st->H[2] = _mm_setzero_si128(); + st->H[3] = _mm_setzero_si128(); + st->H[4] = _mm_setzero_si128(); + + st->started = 0; + st->leftover = 0; +} + +static void poly1305_first_block(poly1305_state_internal *st, + const uint8_t *m) { + const xmmi MMASK = _mm_load_si128((const xmmi *)poly1305_x64_sse2_message_mask); + const xmmi FIVE = _mm_load_si128((const xmmi *)poly1305_x64_sse2_5); + const xmmi HIBIT = _mm_load_si128((const xmmi *)poly1305_x64_sse2_1shl128); + xmmi T5, T6; + poly1305_power *p; + uint128_t d[3]; + uint64_t r0, r1, r2; + uint64_t r20, r21, r22, s22; + uint64_t pad0, pad1; + uint64_t c; + uint64_t i; + + // pull out stored info + p = &st->P[1]; + + r0 = ((uint64_t)p->R20.d[3] << 32) | (uint64_t)p->R20.d[1]; + r1 = ((uint64_t)p->R21.d[3] << 32) | (uint64_t)p->R21.d[1]; + r2 = ((uint64_t)p->R22.d[3] << 32) | (uint64_t)p->R22.d[1]; + pad0 = ((uint64_t)p->R23.d[3] << 32) | 
(uint64_t)p->R23.d[1]; + pad1 = ((uint64_t)p->R24.d[3] << 32) | (uint64_t)p->R24.d[1]; + + // compute powers r^2,r^4 + r20 = r0; + r21 = r1; + r22 = r2; + for (i = 0; i < 2; i++) { + s22 = r22 * (5 << 2); + + d[0] = add128(mul64x64_128(r20, r20), mul64x64_128(r21 * 2, s22)); + d[1] = add128(mul64x64_128(r22, s22), mul64x64_128(r20 * 2, r21)); + d[2] = add128(mul64x64_128(r21, r21), mul64x64_128(r22 * 2, r20)); + + r20 = lo128(d[0]) & 0xfffffffffff; + c = shr128(d[0], 44); + d[1] = add128_64(d[1], c); + r21 = lo128(d[1]) & 0xfffffffffff; + c = shr128(d[1], 44); + d[2] = add128_64(d[2], c); + r22 = lo128(d[2]) & 0x3ffffffffff; + c = shr128(d[2], 42); + r20 += c * 5; + c = (r20 >> 44); + r20 = r20 & 0xfffffffffff; + r21 += c; + + p->R20.v = _mm_shuffle_epi32(_mm_cvtsi32_si128((uint32_t)(r20)&0x3ffffff), + _MM_SHUFFLE(1, 0, 1, 0)); + p->R21.v = _mm_shuffle_epi32( + _mm_cvtsi32_si128((uint32_t)((r20 >> 26) | (r21 << 18)) & 0x3ffffff), + _MM_SHUFFLE(1, 0, 1, 0)); + p->R22.v = + _mm_shuffle_epi32(_mm_cvtsi32_si128((uint32_t)((r21 >> 8)) & 0x3ffffff), + _MM_SHUFFLE(1, 0, 1, 0)); + p->R23.v = _mm_shuffle_epi32( + _mm_cvtsi32_si128((uint32_t)((r21 >> 34) | (r22 << 10)) & 0x3ffffff), + _MM_SHUFFLE(1, 0, 1, 0)); + p->R24.v = _mm_shuffle_epi32(_mm_cvtsi32_si128((uint32_t)((r22 >> 16))), + _MM_SHUFFLE(1, 0, 1, 0)); + p->S21.v = _mm_mul_epu32(p->R21.v, FIVE); + p->S22.v = _mm_mul_epu32(p->R22.v, FIVE); + p->S23.v = _mm_mul_epu32(p->R23.v, FIVE); + p->S24.v = _mm_mul_epu32(p->R24.v, FIVE); + p--; + } + + // put saved info back + p = &st->P[1]; + p->R20.d[1] = (uint32_t)(r0); + p->R20.d[3] = (uint32_t)(r0 >> 32); + p->R21.d[1] = (uint32_t)(r1); + p->R21.d[3] = (uint32_t)(r1 >> 32); + p->R22.d[1] = (uint32_t)(r2); + p->R22.d[3] = (uint32_t)(r2 >> 32); + p->R23.d[1] = (uint32_t)(pad0); + p->R23.d[3] = (uint32_t)(pad0 >> 32); + p->R24.d[1] = (uint32_t)(pad1); + p->R24.d[3] = (uint32_t)(pad1 >> 32); + + // H = [Mx,My] + T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 0)), + _mm_loadl_epi64((const xmmi *)(m + 16))); + T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 8)), + _mm_loadl_epi64((const xmmi *)(m + 24))); + st->H[0] = _mm_and_si128(MMASK, T5); + st->H[1] = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26)); + T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12)); + st->H[2] = _mm_and_si128(MMASK, T5); + st->H[3] = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26)); + st->H[4] = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT); +} + +static void poly1305_blocks(poly1305_state_internal *st, const uint8_t *m, + size_t bytes) { + const xmmi MMASK = _mm_load_si128((const xmmi *)poly1305_x64_sse2_message_mask); + const xmmi FIVE = _mm_load_si128((const xmmi *)poly1305_x64_sse2_5); + const xmmi HIBIT = _mm_load_si128((const xmmi *)poly1305_x64_sse2_1shl128); + + poly1305_power *p; + xmmi H0, H1, H2, H3, H4; + xmmi T0, T1, T2, T3, T4, T5, T6; + xmmi M0, M1, M2, M3, M4; + xmmi C1, C2; + + H0 = st->H[0]; + H1 = st->H[1]; + H2 = st->H[2]; + H3 = st->H[3]; + H4 = st->H[4]; + + while (bytes >= 64) { + // H *= [r^4,r^4] + p = &st->P[0]; + T0 = _mm_mul_epu32(H0, p->R20.v); + T1 = _mm_mul_epu32(H0, p->R21.v); + T2 = _mm_mul_epu32(H0, p->R22.v); + T3 = _mm_mul_epu32(H0, p->R23.v); + T4 = _mm_mul_epu32(H0, p->R24.v); + T5 = _mm_mul_epu32(H1, p->S24.v); + T6 = _mm_mul_epu32(H1, p->R20.v); + T0 = _mm_add_epi64(T0, T5); + T1 = _mm_add_epi64(T1, T6); + T5 = _mm_mul_epu32(H2, p->S23.v); + T6 = _mm_mul_epu32(H2, p->S24.v); + T0 = _mm_add_epi64(T0, T5); + T1 = _mm_add_epi64(T1, T6); + T5 = _mm_mul_epu32(H3, 
p->S22.v); + T6 = _mm_mul_epu32(H3, p->S23.v); + T0 = _mm_add_epi64(T0, T5); + T1 = _mm_add_epi64(T1, T6); + T5 = _mm_mul_epu32(H4, p->S21.v); + T6 = _mm_mul_epu32(H4, p->S22.v); + T0 = _mm_add_epi64(T0, T5); + T1 = _mm_add_epi64(T1, T6); + T5 = _mm_mul_epu32(H1, p->R21.v); + T6 = _mm_mul_epu32(H1, p->R22.v); + T2 = _mm_add_epi64(T2, T5); + T3 = _mm_add_epi64(T3, T6); + T5 = _mm_mul_epu32(H2, p->R20.v); + T6 = _mm_mul_epu32(H2, p->R21.v); + T2 = _mm_add_epi64(T2, T5); + T3 = _mm_add_epi64(T3, T6); + T5 = _mm_mul_epu32(H3, p->S24.v); + T6 = _mm_mul_epu32(H3, p->R20.v); + T2 = _mm_add_epi64(T2, T5); + T3 = _mm_add_epi64(T3, T6); + T5 = _mm_mul_epu32(H4, p->S23.v); + T6 = _mm_mul_epu32(H4, p->S24.v); + T2 = _mm_add_epi64(T2, T5); + T3 = _mm_add_epi64(T3, T6); + T5 = _mm_mul_epu32(H1, p->R23.v); + T4 = _mm_add_epi64(T4, T5); + T5 = _mm_mul_epu32(H2, p->R22.v); + T4 = _mm_add_epi64(T4, T5); + T5 = _mm_mul_epu32(H3, p->R21.v); + T4 = _mm_add_epi64(T4, T5); + T5 = _mm_mul_epu32(H4, p->R20.v); + T4 = _mm_add_epi64(T4, T5); + + // H += [Mx,My]*[r^2,r^2] + T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 0)), + _mm_loadl_epi64((const xmmi *)(m + 16))); + T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 8)), + _mm_loadl_epi64((const xmmi *)(m + 24))); + M0 = _mm_and_si128(MMASK, T5); + M1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26)); + T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12)); + M2 = _mm_and_si128(MMASK, T5); + M3 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26)); + M4 = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT); + + p = &st->P[1]; + T5 = _mm_mul_epu32(M0, p->R20.v); + T6 = _mm_mul_epu32(M0, p->R21.v); + T0 = _mm_add_epi64(T0, T5); + T1 = _mm_add_epi64(T1, T6); + T5 = _mm_mul_epu32(M1, p->S24.v); + T6 = _mm_mul_epu32(M1, p->R20.v); + T0 = _mm_add_epi64(T0, T5); + T1 = _mm_add_epi64(T1, T6); + T5 = _mm_mul_epu32(M2, p->S23.v); + T6 = _mm_mul_epu32(M2, p->S24.v); + T0 = _mm_add_epi64(T0, T5); + T1 = _mm_add_epi64(T1, T6); + T5 = _mm_mul_epu32(M3, p->S22.v); + T6 = _mm_mul_epu32(M3, p->S23.v); + T0 = _mm_add_epi64(T0, T5); + T1 = _mm_add_epi64(T1, T6); + T5 = _mm_mul_epu32(M4, p->S21.v); + T6 = _mm_mul_epu32(M4, p->S22.v); + T0 = _mm_add_epi64(T0, T5); + T1 = _mm_add_epi64(T1, T6); + T5 = _mm_mul_epu32(M0, p->R22.v); + T6 = _mm_mul_epu32(M0, p->R23.v); + T2 = _mm_add_epi64(T2, T5); + T3 = _mm_add_epi64(T3, T6); + T5 = _mm_mul_epu32(M1, p->R21.v); + T6 = _mm_mul_epu32(M1, p->R22.v); + T2 = _mm_add_epi64(T2, T5); + T3 = _mm_add_epi64(T3, T6); + T5 = _mm_mul_epu32(M2, p->R20.v); + T6 = _mm_mul_epu32(M2, p->R21.v); + T2 = _mm_add_epi64(T2, T5); + T3 = _mm_add_epi64(T3, T6); + T5 = _mm_mul_epu32(M3, p->S24.v); + T6 = _mm_mul_epu32(M3, p->R20.v); + T2 = _mm_add_epi64(T2, T5); + T3 = _mm_add_epi64(T3, T6); + T5 = _mm_mul_epu32(M4, p->S23.v); + T6 = _mm_mul_epu32(M4, p->S24.v); + T2 = _mm_add_epi64(T2, T5); + T3 = _mm_add_epi64(T3, T6); + T5 = _mm_mul_epu32(M0, p->R24.v); + T4 = _mm_add_epi64(T4, T5); + T5 = _mm_mul_epu32(M1, p->R23.v); + T4 = _mm_add_epi64(T4, T5); + T5 = _mm_mul_epu32(M2, p->R22.v); + T4 = _mm_add_epi64(T4, T5); + T5 = _mm_mul_epu32(M3, p->R21.v); + T4 = _mm_add_epi64(T4, T5); + T5 = _mm_mul_epu32(M4, p->R20.v); + T4 = _mm_add_epi64(T4, T5); + + // H += [Mx,My] + T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 32)), + _mm_loadl_epi64((const xmmi *)(m + 48))); + T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 40)), + _mm_loadl_epi64((const xmmi *)(m + 56))); + M0 = _mm_and_si128(MMASK, T5); + M1 = _mm_and_si128(MMASK, 
_mm_srli_epi64(T5, 26)); + T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12)); + M2 = _mm_and_si128(MMASK, T5); + M3 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26)); + M4 = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT); + + T0 = _mm_add_epi64(T0, M0); + T1 = _mm_add_epi64(T1, M1); + T2 = _mm_add_epi64(T2, M2); + T3 = _mm_add_epi64(T3, M3); + T4 = _mm_add_epi64(T4, M4); + + // reduce + C1 = _mm_srli_epi64(T0, 26); + C2 = _mm_srli_epi64(T3, 26); + T0 = _mm_and_si128(T0, MMASK); + T3 = _mm_and_si128(T3, MMASK); + T1 = _mm_add_epi64(T1, C1); + T4 = _mm_add_epi64(T4, C2); + C1 = _mm_srli_epi64(T1, 26); + C2 = _mm_srli_epi64(T4, 26); + T1 = _mm_and_si128(T1, MMASK); + T4 = _mm_and_si128(T4, MMASK); + T2 = _mm_add_epi64(T2, C1); + T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE)); + C1 = _mm_srli_epi64(T2, 26); + C2 = _mm_srli_epi64(T0, 26); + T2 = _mm_and_si128(T2, MMASK); + T0 = _mm_and_si128(T0, MMASK); + T3 = _mm_add_epi64(T3, C1); + T1 = _mm_add_epi64(T1, C2); + C1 = _mm_srli_epi64(T3, 26); + T3 = _mm_and_si128(T3, MMASK); + T4 = _mm_add_epi64(T4, C1); + + // H = (H*[r^4,r^4] + [Mx,My]*[r^2,r^2] + [Mx,My]) + H0 = T0; + H1 = T1; + H2 = T2; + H3 = T3; + H4 = T4; + + m += 64; + bytes -= 64; + } + + st->H[0] = H0; + st->H[1] = H1; + st->H[2] = H2; + st->H[3] = H3; + st->H[4] = H4; +} + +static size_t poly1305_combine(poly1305_state_internal *st, const uint8_t *m, + size_t bytes) { + const xmmi MMASK = _mm_load_si128((const xmmi *)poly1305_x64_sse2_message_mask); + const xmmi HIBIT = _mm_load_si128((const xmmi *)poly1305_x64_sse2_1shl128); + const xmmi FIVE = _mm_load_si128((const xmmi *)poly1305_x64_sse2_5); + + poly1305_power *p; + xmmi H0, H1, H2, H3, H4; + xmmi M0, M1, M2, M3, M4; + xmmi T0, T1, T2, T3, T4, T5, T6; + xmmi C1, C2; + + uint64_t r0, r1, r2; + uint64_t t0, t1, t2, t3, t4; + uint64_t c; + size_t consumed = 0; + + H0 = st->H[0]; + H1 = st->H[1]; + H2 = st->H[2]; + H3 = st->H[3]; + H4 = st->H[4]; + + // p = [r^2,r^2] + p = &st->P[1]; + + if (bytes >= 32) { + // H *= [r^2,r^2] + T0 = _mm_mul_epu32(H0, p->R20.v); + T1 = _mm_mul_epu32(H0, p->R21.v); + T2 = _mm_mul_epu32(H0, p->R22.v); + T3 = _mm_mul_epu32(H0, p->R23.v); + T4 = _mm_mul_epu32(H0, p->R24.v); + T5 = _mm_mul_epu32(H1, p->S24.v); + T6 = _mm_mul_epu32(H1, p->R20.v); + T0 = _mm_add_epi64(T0, T5); + T1 = _mm_add_epi64(T1, T6); + T5 = _mm_mul_epu32(H2, p->S23.v); + T6 = _mm_mul_epu32(H2, p->S24.v); + T0 = _mm_add_epi64(T0, T5); + T1 = _mm_add_epi64(T1, T6); + T5 = _mm_mul_epu32(H3, p->S22.v); + T6 = _mm_mul_epu32(H3, p->S23.v); + T0 = _mm_add_epi64(T0, T5); + T1 = _mm_add_epi64(T1, T6); + T5 = _mm_mul_epu32(H4, p->S21.v); + T6 = _mm_mul_epu32(H4, p->S22.v); + T0 = _mm_add_epi64(T0, T5); + T1 = _mm_add_epi64(T1, T6); + T5 = _mm_mul_epu32(H1, p->R21.v); + T6 = _mm_mul_epu32(H1, p->R22.v); + T2 = _mm_add_epi64(T2, T5); + T3 = _mm_add_epi64(T3, T6); + T5 = _mm_mul_epu32(H2, p->R20.v); + T6 = _mm_mul_epu32(H2, p->R21.v); + T2 = _mm_add_epi64(T2, T5); + T3 = _mm_add_epi64(T3, T6); + T5 = _mm_mul_epu32(H3, p->S24.v); + T6 = _mm_mul_epu32(H3, p->R20.v); + T2 = _mm_add_epi64(T2, T5); + T3 = _mm_add_epi64(T3, T6); + T5 = _mm_mul_epu32(H4, p->S23.v); + T6 = _mm_mul_epu32(H4, p->S24.v); + T2 = _mm_add_epi64(T2, T5); + T3 = _mm_add_epi64(T3, T6); + T5 = _mm_mul_epu32(H1, p->R23.v); + T4 = _mm_add_epi64(T4, T5); + T5 = _mm_mul_epu32(H2, p->R22.v); + T4 = _mm_add_epi64(T4, T5); + T5 = _mm_mul_epu32(H3, p->R21.v); + T4 = _mm_add_epi64(T4, T5); + T5 = _mm_mul_epu32(H4, p->R20.v); + T4 = _mm_add_epi64(T4, T5); + + // H += [Mx,My] + T5 = 
_mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 0)), + _mm_loadl_epi64((const xmmi *)(m + 16))); + T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 8)), + _mm_loadl_epi64((const xmmi *)(m + 24))); + M0 = _mm_and_si128(MMASK, T5); + M1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26)); + T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12)); + M2 = _mm_and_si128(MMASK, T5); + M3 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26)); + M4 = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT); + + T0 = _mm_add_epi64(T0, M0); + T1 = _mm_add_epi64(T1, M1); + T2 = _mm_add_epi64(T2, M2); + T3 = _mm_add_epi64(T3, M3); + T4 = _mm_add_epi64(T4, M4); + + // reduce + C1 = _mm_srli_epi64(T0, 26); + C2 = _mm_srli_epi64(T3, 26); + T0 = _mm_and_si128(T0, MMASK); + T3 = _mm_and_si128(T3, MMASK); + T1 = _mm_add_epi64(T1, C1); + T4 = _mm_add_epi64(T4, C2); + C1 = _mm_srli_epi64(T1, 26); + C2 = _mm_srli_epi64(T4, 26); + T1 = _mm_and_si128(T1, MMASK); + T4 = _mm_and_si128(T4, MMASK); + T2 = _mm_add_epi64(T2, C1); + T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE)); + C1 = _mm_srli_epi64(T2, 26); + C2 = _mm_srli_epi64(T0, 26); + T2 = _mm_and_si128(T2, MMASK); + T0 = _mm_and_si128(T0, MMASK); + T3 = _mm_add_epi64(T3, C1); + T1 = _mm_add_epi64(T1, C2); + C1 = _mm_srli_epi64(T3, 26); + T3 = _mm_and_si128(T3, MMASK); + T4 = _mm_add_epi64(T4, C1); + + // H = (H*[r^2,r^2] + [Mx,My]) + H0 = T0; + H1 = T1; + H2 = T2; + H3 = T3; + H4 = T4; + + consumed = 32; + } + + // finalize, H *= [r^2,r] + r0 = ((uint64_t)p->R20.d[3] << 32) | (uint64_t)p->R20.d[1]; + r1 = ((uint64_t)p->R21.d[3] << 32) | (uint64_t)p->R21.d[1]; + r2 = ((uint64_t)p->R22.d[3] << 32) | (uint64_t)p->R22.d[1]; + + p->R20.d[2] = (uint32_t)(r0)&0x3ffffff; + p->R21.d[2] = (uint32_t)((r0 >> 26) | (r1 << 18)) & 0x3ffffff; + p->R22.d[2] = (uint32_t)((r1 >> 8)) & 0x3ffffff; + p->R23.d[2] = (uint32_t)((r1 >> 34) | (r2 << 10)) & 0x3ffffff; + p->R24.d[2] = (uint32_t)((r2 >> 16)); + p->S21.d[2] = p->R21.d[2] * 5; + p->S22.d[2] = p->R22.d[2] * 5; + p->S23.d[2] = p->R23.d[2] * 5; + p->S24.d[2] = p->R24.d[2] * 5; + + // H *= [r^2,r] + T0 = _mm_mul_epu32(H0, p->R20.v); + T1 = _mm_mul_epu32(H0, p->R21.v); + T2 = _mm_mul_epu32(H0, p->R22.v); + T3 = _mm_mul_epu32(H0, p->R23.v); + T4 = _mm_mul_epu32(H0, p->R24.v); + T5 = _mm_mul_epu32(H1, p->S24.v); + T6 = _mm_mul_epu32(H1, p->R20.v); + T0 = _mm_add_epi64(T0, T5); + T1 = _mm_add_epi64(T1, T6); + T5 = _mm_mul_epu32(H2, p->S23.v); + T6 = _mm_mul_epu32(H2, p->S24.v); + T0 = _mm_add_epi64(T0, T5); + T1 = _mm_add_epi64(T1, T6); + T5 = _mm_mul_epu32(H3, p->S22.v); + T6 = _mm_mul_epu32(H3, p->S23.v); + T0 = _mm_add_epi64(T0, T5); + T1 = _mm_add_epi64(T1, T6); + T5 = _mm_mul_epu32(H4, p->S21.v); + T6 = _mm_mul_epu32(H4, p->S22.v); + T0 = _mm_add_epi64(T0, T5); + T1 = _mm_add_epi64(T1, T6); + T5 = _mm_mul_epu32(H1, p->R21.v); + T6 = _mm_mul_epu32(H1, p->R22.v); + T2 = _mm_add_epi64(T2, T5); + T3 = _mm_add_epi64(T3, T6); + T5 = _mm_mul_epu32(H2, p->R20.v); + T6 = _mm_mul_epu32(H2, p->R21.v); + T2 = _mm_add_epi64(T2, T5); + T3 = _mm_add_epi64(T3, T6); + T5 = _mm_mul_epu32(H3, p->S24.v); + T6 = _mm_mul_epu32(H3, p->R20.v); + T2 = _mm_add_epi64(T2, T5); + T3 = _mm_add_epi64(T3, T6); + T5 = _mm_mul_epu32(H4, p->S23.v); + T6 = _mm_mul_epu32(H4, p->S24.v); + T2 = _mm_add_epi64(T2, T5); + T3 = _mm_add_epi64(T3, T6); + T5 = _mm_mul_epu32(H1, p->R23.v); + T4 = _mm_add_epi64(T4, T5); + T5 = _mm_mul_epu32(H2, p->R22.v); + T4 = _mm_add_epi64(T4, T5); + T5 = _mm_mul_epu32(H3, p->R21.v); + T4 = _mm_add_epi64(T4, T5); + T5 = 
_mm_mul_epu32(H4, p->R20.v); + T4 = _mm_add_epi64(T4, T5); + + C1 = _mm_srli_epi64(T0, 26); + C2 = _mm_srli_epi64(T3, 26); + T0 = _mm_and_si128(T0, MMASK); + T3 = _mm_and_si128(T3, MMASK); + T1 = _mm_add_epi64(T1, C1); + T4 = _mm_add_epi64(T4, C2); + C1 = _mm_srli_epi64(T1, 26); + C2 = _mm_srli_epi64(T4, 26); + T1 = _mm_and_si128(T1, MMASK); + T4 = _mm_and_si128(T4, MMASK); + T2 = _mm_add_epi64(T2, C1); + T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE)); + C1 = _mm_srli_epi64(T2, 26); + C2 = _mm_srli_epi64(T0, 26); + T2 = _mm_and_si128(T2, MMASK); + T0 = _mm_and_si128(T0, MMASK); + T3 = _mm_add_epi64(T3, C1); + T1 = _mm_add_epi64(T1, C2); + C1 = _mm_srli_epi64(T3, 26); + T3 = _mm_and_si128(T3, MMASK); + T4 = _mm_add_epi64(T4, C1); + + // H = H[0]+H[1] + H0 = _mm_add_epi64(T0, _mm_srli_si128(T0, 8)); + H1 = _mm_add_epi64(T1, _mm_srli_si128(T1, 8)); + H2 = _mm_add_epi64(T2, _mm_srli_si128(T2, 8)); + H3 = _mm_add_epi64(T3, _mm_srli_si128(T3, 8)); + H4 = _mm_add_epi64(T4, _mm_srli_si128(T4, 8)); + + t0 = _mm_cvtsi128_si32(H0); + c = (t0 >> 26); + t0 &= 0x3ffffff; + t1 = _mm_cvtsi128_si32(H1) + c; + c = (t1 >> 26); + t1 &= 0x3ffffff; + t2 = _mm_cvtsi128_si32(H2) + c; + c = (t2 >> 26); + t2 &= 0x3ffffff; + t3 = _mm_cvtsi128_si32(H3) + c; + c = (t3 >> 26); + t3 &= 0x3ffffff; + t4 = _mm_cvtsi128_si32(H4) + c; + c = (t4 >> 26); + t4 &= 0x3ffffff; + t0 = t0 + (c * 5); + c = (t0 >> 26); + t0 &= 0x3ffffff; + t1 = t1 + c; + + st->HH[0] = ((t0) | (t1 << 26)) & UINT64_C(0xfffffffffff); + st->HH[1] = ((t1 >> 18) | (t2 << 8) | (t3 << 34)) & UINT64_C(0xfffffffffff); + st->HH[2] = ((t3 >> 10) | (t4 << 16)) & UINT64_C(0x3ffffffffff); + + return consumed; +} + +void GFp_poly1305_update(poly1305_state *state, const uint8_t *m, + size_t bytes) { + poly1305_state_internal *st = poly1305_aligned_state(state); + size_t want; + + // Work around a C language bug. See https://crbug.com/1019588. 
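+  // (With |bytes| == 0, |m| may not point at anything; passing such a
+  // pointer to |GFp_memcpy| is undefined behavior in C even for a
+  // zero-length copy, so return early.)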
+ if (bytes == 0) { + return; + } + + // need at least 32 initial bytes to start the accelerated branch + if (!st->started) { + if ((st->leftover == 0) && (bytes > 32)) { + poly1305_first_block(st, m); + m += 32; + bytes -= 32; + } else { + want = poly1305_min(32 - st->leftover, bytes); + GFp_memcpy(st->buffer + st->leftover, m, want); + bytes -= want; + m += want; + st->leftover += want; + if ((st->leftover < 32) || (bytes == 0)) { + return; + } + poly1305_first_block(st, st->buffer); + st->leftover = 0; + } + st->started = 1; + } + + // handle leftover + if (st->leftover) { + want = poly1305_min(64 - st->leftover, bytes); + GFp_memcpy(st->buffer + st->leftover, m, want); + bytes -= want; + m += want; + st->leftover += want; + if (st->leftover < 64) { + return; + } + poly1305_blocks(st, st->buffer, 64); + st->leftover = 0; + } + + // process 64 byte blocks + if (bytes >= 64) { + want = (bytes & ~63); + poly1305_blocks(st, m, want); + m += want; + bytes -= want; + } + + if (bytes) { + GFp_memcpy(st->buffer + st->leftover, m, bytes); + st->leftover += bytes; + } +} + +void GFp_poly1305_finish(poly1305_state *state, uint8_t mac[16]) { + poly1305_state_internal *st = poly1305_aligned_state(state); + size_t leftover = st->leftover; + uint8_t *m = st->buffer; + uint128_t d[3]; + uint64_t h0, h1, h2; + uint64_t t0, t1; + uint64_t g0, g1, g2, c, nc; + uint64_t r0, r1, r2, s1, s2; + poly1305_power *p; + + if (st->started) { + size_t consumed = poly1305_combine(st, m, leftover); + leftover -= consumed; + m += consumed; + } + + // st->HH will either be 0 or have the combined result + h0 = st->HH[0]; + h1 = st->HH[1]; + h2 = st->HH[2]; + + p = &st->P[1]; + r0 = ((uint64_t)p->R20.d[3] << 32) | (uint64_t)p->R20.d[1]; + r1 = ((uint64_t)p->R21.d[3] << 32) | (uint64_t)p->R21.d[1]; + r2 = ((uint64_t)p->R22.d[3] << 32) | (uint64_t)p->R22.d[1]; + s1 = r1 * (5 << 2); + s2 = r2 * (5 << 2); + + if (leftover < 16) { + goto poly1305_donna_atmost15bytes; + } + +poly1305_donna_atleast16bytes: + t0 = load_u64_le(m + 0); + t1 = load_u64_le(m + 8); + h0 += t0 & 0xfffffffffff; + t0 = shr128_pair(t1, t0, 44); + h1 += t0 & 0xfffffffffff; + h2 += (t1 >> 24) | ((uint64_t)1 << 40); + +poly1305_donna_mul: + d[0] = add128(add128(mul64x64_128(h0, r0), mul64x64_128(h1, s2)), + mul64x64_128(h2, s1)); + d[1] = add128(add128(mul64x64_128(h0, r1), mul64x64_128(h1, r0)), + mul64x64_128(h2, s2)); + d[2] = add128(add128(mul64x64_128(h0, r2), mul64x64_128(h1, r1)), + mul64x64_128(h2, r0)); + h0 = lo128(d[0]) & 0xfffffffffff; + c = shr128(d[0], 44); + d[1] = add128_64(d[1], c); + h1 = lo128(d[1]) & 0xfffffffffff; + c = shr128(d[1], 44); + d[2] = add128_64(d[2], c); + h2 = lo128(d[2]) & 0x3ffffffffff; + c = shr128(d[2], 42); + h0 += c * 5; + + m += 16; + leftover -= 16; + if (leftover >= 16) { + goto poly1305_donna_atleast16bytes; + } + +// final bytes +poly1305_donna_atmost15bytes: + if (!leftover) { + goto poly1305_donna_finish; + } + + m[leftover++] = 1; + GFp_memset(m + leftover, 0, 16 - leftover); + leftover = 16; + + t0 = load_u64_le(m + 0); + t1 = load_u64_le(m + 8); + h0 += t0 & 0xfffffffffff; + t0 = shr128_pair(t1, t0, 44); + h1 += t0 & 0xfffffffffff; + h2 += (t1 >> 24); + + goto poly1305_donna_mul; + +poly1305_donna_finish: + c = (h0 >> 44); + h0 &= 0xfffffffffff; + h1 += c; + c = (h1 >> 44); + h1 &= 0xfffffffffff; + h2 += c; + c = (h2 >> 42); + h2 &= 0x3ffffffffff; + h0 += c * 5; + + g0 = h0 + 5; + c = (g0 >> 44); + g0 &= 0xfffffffffff; + g1 = h1 + c; + c = (g1 >> 44); + g1 &= 0xfffffffffff; + g2 = h2 + c - ((uint64_t)1 << 
42); + + c = (g2 >> 63) - 1; + nc = ~c; + h0 = (h0 & nc) | (g0 & c); + h1 = (h1 & nc) | (g1 & c); + h2 = (h2 & nc) | (g2 & c); + + // pad + t0 = ((uint64_t)p->R23.d[3] << 32) | (uint64_t)p->R23.d[1]; + t1 = ((uint64_t)p->R24.d[3] << 32) | (uint64_t)p->R24.d[1]; + h0 += (t0 & 0xfffffffffff); + c = (h0 >> 44); + h0 &= 0xfffffffffff; + t0 = shr128_pair(t1, t0, 44); + h1 += (t0 & 0xfffffffffff) + c; + c = (h1 >> 44); + h1 &= 0xfffffffffff; + t1 = (t1 >> 24); + h2 += (t1)+c; + + store_u64_le(mac + 0, ((h0) | (h1 << 44))); + store_u64_le(mac + 8, ((h1 >> 20) | (h2 << 24))); +} + +#endif // BORINGSSL_HAS_UINT128 && OPENSSL_X86_64 diff --git a/deny.toml b/deny.toml new file mode 100644 index 0000000000..dcfb03439f --- /dev/null +++ b/deny.toml @@ -0,0 +1,30 @@ +[advisories] +unmaintained = "deny" +yanked = "deny" +notice = "deny" + +[licenses] +allow = [ + "Apache-2.0", + "ISC", + "LicenseRef-ring", + "MIT", +] +confidence-threshold = 1.0 + +[[licenses.clarify]] +name = "ring" +expression = "LicenseRef-ring" +license-files = [ + { path = "LICENSE", hash = 0xbd0eed23 }, +] + +[bans] +# We don't maintain a fixed Cargo.lock so enforcing +# `multiple-versions = "deny"` is impractical. +multiple-versions = "allow" +wildcards = "deny" + +[sources] +unknown-registry = "deny" +unknown-git = "deny" diff --git a/include/GFp/.gitattributes b/include/GFp/.gitattributes deleted file mode 100644 index 15a5c58091..0000000000 --- a/include/GFp/.gitattributes +++ /dev/null @@ -1 +0,0 @@ -*.h linguist-language=C diff --git a/include/GFp/arm_arch.h b/include/GFp/arm_arch.h index ee5e32c8f3..2e64aa9e5e 100644 --- a/include/GFp/arm_arch.h +++ b/include/GFp/arm_arch.h @@ -110,4 +110,68 @@ // ARMV8_SHA256 indicates support for hardware SHA-256 instructions. #define ARMV8_SHA256 (1 << 4) +#if defined(__ASSEMBLER__) + +#if defined(__GNUC__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wundef" +#endif + +// Support macros for +// - Armv8.3-A Pointer Authentication and +// - Armv8.5-A Branch Target Identification +// features which require emitting a .note.gnu.property section with the +// appropriate architecture-dependent feature bits set. 
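+//
+// A consumer (e.g. the static linker) ANDs the feature bits of the
+// GNU_PROPERTY_AARCH64_FEATURE_1_AND notes across all input objects, so every
+// assembly file must emit this note for BTI/PAC to stay enabled in the output.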
+// Read more: "ELF for the Arm® 64-bit Architecture"
+
+#if defined(__ARM_FEATURE_BTI_DEFAULT) && __ARM_FEATURE_BTI_DEFAULT == 1
+#define GNU_PROPERTY_AARCH64_BTI (1 << 0)  // Has Branch Target Identification
+#define AARCH64_VALID_CALL_TARGET hint #34  // BTI 'c'
+#else
+#define GNU_PROPERTY_AARCH64_BTI 0  // No Branch Target Identification
+#define AARCH64_VALID_CALL_TARGET
+#endif
+
+#if defined(__ARM_FEATURE_PAC_DEFAULT) && \
+    (__ARM_FEATURE_PAC_DEFAULT & 1) == 1  // Signed with A-key
+#define GNU_PROPERTY_AARCH64_POINTER_AUTH \
+  (1 << 1)  // Has Pointer Authentication
+#define AARCH64_SIGN_LINK_REGISTER hint #25      // PACIASP
+#define AARCH64_VALIDATE_LINK_REGISTER hint #29  // AUTIASP
+#elif defined(__ARM_FEATURE_PAC_DEFAULT) && \
+    (__ARM_FEATURE_PAC_DEFAULT & 2) == 2  // Signed with B-key
+#define GNU_PROPERTY_AARCH64_POINTER_AUTH \
+  (1 << 1)  // Has Pointer Authentication
+#define AARCH64_SIGN_LINK_REGISTER hint #27      // PACIBSP
+#define AARCH64_VALIDATE_LINK_REGISTER hint #31  // AUTIBSP
+#else
+#define GNU_PROPERTY_AARCH64_POINTER_AUTH 0  // No Pointer Authentication
+#if GNU_PROPERTY_AARCH64_BTI != 0
+#define AARCH64_SIGN_LINK_REGISTER AARCH64_VALID_CALL_TARGET
+#else
+#define AARCH64_SIGN_LINK_REGISTER
+#endif
+#define AARCH64_VALIDATE_LINK_REGISTER
+#endif
+
+#if GNU_PROPERTY_AARCH64_POINTER_AUTH != 0 || GNU_PROPERTY_AARCH64_BTI != 0
+.pushsection .note.gnu.property, "a";
+.balign 8;
+.long 4;
+.long 0x10;
+.long 0x5;
+.asciz "GNU";
+.long 0xc0000000; /* GNU_PROPERTY_AARCH64_FEATURE_1_AND */
+.long 4;
+.long (GNU_PROPERTY_AARCH64_POINTER_AUTH | GNU_PROPERTY_AARCH64_BTI);
+.long 0;
+.popsection;
+#endif
+
+#if defined(__GNUC__)
+#pragma GCC diagnostic pop
+#endif
+
+#endif /* defined __ASSEMBLER__ */
+
 #endif  // OPENSSL_HEADER_ARM_ARCH_H
diff --git a/include/GFp/check.h b/include/GFp/check.h
index cf44db834d..4bd257ca35 100644
--- a/include/GFp/check.h
+++ b/include/GFp/check.h
@@ -17,12 +17,13 @@
 // |debug_assert_nonsecret| is like |assert| and should be used (only) when the
 // assertion does not have any potential to leak a secret. |NDEBUG| controls this
-// exactly like |assert|. It is emulated for WebAssembly so that <assert.h> is
-// not required for it.
+// exactly like |assert|. It is emulated when there is no assert.h to make
+// cross-building easier.
 //
 // When reviewing uses of |debug_assert_nonsecret|, verify that the check
 // really does not have potential to leak a secret.
-#if !defined(__wasm__)
+
+#if !defined(GFp_NOSTDLIBINC)
 # include <assert.h>
 # define debug_assert_nonsecret(x) assert(x)
 #else
diff --git a/include/GFp/poly1305.h b/include/GFp/poly1305.h
new file mode 100644
index 0000000000..53c4036c86
--- /dev/null
+++ b/include/GFp/poly1305.h
@@ -0,0 +1,23 @@
+/* Copyright (c) 2014, Google Inc.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef OPENSSL_HEADER_POLY1305_H
+#define OPENSSL_HEADER_POLY1305_H
+
+#include <GFp/base.h>
+
+// Keep in sync with `poly1305_state` in poly1305.rs.
+typedef uint8_t poly1305_state[512];
+
+#endif  // OPENSSL_HEADER_POLY1305_H
diff --git a/mk/appveyor.bat b/mk/appveyor.bat
deleted file mode 100644
index ac7c2b713f..0000000000
--- a/mk/appveyor.bat
+++ /dev/null
@@ -1,60 +0,0 @@
-echo on
-SetLocal EnableDelayedExpansion
-
-set VCVARSALL="C:\Program Files (x86)\Microsoft Visual Studio %TOOLCHAIN_VERSION%\VC\vcvarsall.bat"
-
-if [%Platform%] NEQ [x64] goto win32
-set TARGET_ARCH=x86_64
-goto download
-
-:win32
-echo on
-if [%Platform%] NEQ [Win32] exit 1
-set TARGET_ARCH=i686
-goto download
-
-:download
-REM vcvarsall turns echo off
-echo on
-
-mkdir windows_build_tools
-mkdir windows_build_tools\
-echo Downloading Yasm...
-powershell -Command "(New-Object Net.WebClient).DownloadFile('https://www.tortall.net/projects/yasm/releases/yasm-1.3.0-win64.exe', 'windows_build_tools\yasm.exe')"
-if %ERRORLEVEL% NEQ 0 (
-  echo ...downloading Yasm failed.
-  exit 1
-)
-
-mkdir build
-set RUSTUP_URL=https://win.rustup.rs/%TARGET_ARCH%
-set RUSTUP_EXE=build\rustup-init-%TARGET_ARCH%.exe
-echo Downloading %RUSTUP_URL%...
-powershell -Command "(New-Object Net.WebClient).DownloadFile('%RUSTUP_URL%', '%RUSTUP_EXE%')"
-if %ERRORLEVEL% NEQ 0 (
-  echo ...downloading rustup failed.
-  exit 1
-)
-
-set TARGET=%TARGET_ARCH%-pc-windows-msvc
-%RUSTUP_EXE% -y --default-host %TARGET% --default-toolchain %RUST%
-if %ERRORLEVEL% NEQ 0 exit 1
-
-set PATH=%USERPROFILE%\.cargo\bin;%cd%\windows_build_tools;%PATH%
-
-if [%Configuration%] == [Release] set CARGO_MODE=--release
-
-set
-
-link /?
-cl /?
-rustc --version
-cargo --version
-
-cargo test -vv %CARGO_MODE%
-if %ERRORLEVEL% NEQ 0 exit 1
-
-REM Verify that `cargo build`, independent from `cargo test`, works; i.e.
-REM verify that non-test builds aren't trying to use test-only features.
-cargo build -vv %CARGO_MODE%
-if %ERRORLEVEL% NEQ 0 exit 1
diff --git a/mk/cargo.sh b/mk/cargo.sh
new file mode 100755
index 0000000000..a7b8154baf
--- /dev/null
+++ b/mk/cargo.sh
@@ -0,0 +1,153 @@
+#!/usr/bin/env bash
+#
+# Copyright 2020 Brian Smith.
+#
+# Permission to use, copy, modify, and/or distribute this software for any
+# purpose with or without fee is hereby granted, provided that the above
+# copyright notice and this permission notice appear in all copies.
+#
+# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
+# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY
+# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+set -eux -o pipefail
+IFS=$'\n\t'
+
+rustflags_self_contained="-Clink-self-contained=yes -Clinker=rust-lld"
+qemu_aarch64="qemu-aarch64 -L /usr/aarch64-linux-gnu"
+qemu_arm="qemu-arm -L /usr/arm-linux-gnueabihf"
+
+# Avoid putting the Android tools in `$PATH` because there are tools in this
+# directory like `clang` that would conflict with the same-named tools that may
+# be needed to compile the build script, or to compile for other targets.
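+# Instead, the specific tools needed for each Android target are referenced
+# below by absolute path via CC_*, AR_*, and CARGO_TARGET_*_LINKER.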
+if [ -n "${ANDROID_SDK_ROOT-}" ]; then + android_tools=$ANDROID_SDK_ROOT/ndk-bundle/toolchains/llvm/prebuilt/linux-x86_64/bin +fi + +for arg in $*; do + case $arg in + --target=*) + target=${arg#*=} + ;; + *) + ;; + esac +done + +# See comments in install-build-tools.sh. +llvm_version=10 +if [ -n "${RING_COVERAGE-}" ]; then + llvm_version=11 +fi + +case $target in + aarch64-linux-android) + export CC_aarch64_linux_android=$android_tools/aarch64-linux-android21-clang + export AR_aarch64_linux_android=$android_tools/aarch64-linux-android-ar + export CARGO_TARGET_AARCH64_LINUX_ANDROID_LINKER=$android_tools/aarch64-linux-android21-clang + ;; + aarch64-unknown-linux-gnu) + export CC_aarch64_unknown_linux_gnu=clang-$llvm_version + export AR_aarch64_unknown_linux_gnu=llvm-ar-$llvm_version + export CFLAGS_aarch64_unknown_linux_gnu="--sysroot=/usr/aarch64-linux-gnu" + export CARGO_TARGET_AARCH64_UNKNOWN_LINUX_GNU_LINKER=aarch64-linux-gnu-gcc + export CARGO_TARGET_AARCH64_UNKNOWN_LINUX_GNU_RUNNER="$qemu_aarch64" + ;; + aarch64-unknown-linux-musl) + export CC_aarch64_unknown_linux_musl=clang-$llvm_version + export AR_aarch64_unknown_linux_musl=llvm-ar-$llvm_version + export CARGO_TARGET_AARCH64_UNKNOWN_LINUX_MUSL_RUSTFLAGS="$rustflags_self_contained" + export CARGO_TARGET_AARCH64_UNKNOWN_LINUX_MUSL_RUNNER="$qemu_aarch64" + ;; + arm-unknown-linux-gnueabihf) + export CC_arm_unknown_linux_gnueabihf=arm-linux-gnueabihf-gcc + export AR_arm_unknown_linux_gnueabihf=arm-linux-gnueabihf-gcc-ar + export CARGO_TARGET_ARM_UNKNOWN_LINUX_GNUEABIHF_LINKER=arm-linux-gnueabihf-gcc + export CARGO_TARGET_ARM_UNKNOWN_LINUX_GNUEABIHF_RUNNER="$qemu_arm" + ;; + armv7-linux-androideabi) + export CC_armv7_linux_androideabi=$android_tools/armv7a-linux-androideabi18-clang + export AR_armv7_linux_androideabi=$android_tools/arm-linux-androideabi-ar + export CARGO_TARGET_ARMV7_LINUX_ANDROIDEABI_LINKER=$android_tools/armv7a-linux-androideabi18-clang + ;; + armv7-unknown-linux-musleabihf) + export CC_armv7_unknown_linux_musleabihf=clang-$llvm_version + export AR_armv7_unknown_linux_musleabihf=llvm-ar-$llvm_version + export CARGO_TARGET_ARMV7_UNKNOWN_LINUX_MUSLEABIHF_RUSTFLAGS="$rustflags_self_contained" + export CARGO_TARGET_ARMV7_UNKNOWN_LINUX_MUSLEABIHF_RUNNER="$qemu_arm" + ;; + i686-unknown-linux-gnu) + export CC_i686_unknown_linux_gnu=clang-$llvm_version + export AR_i686_unknown_linux_gnu=llvm-ar-$llvm_version + export CARGO_TARGET_I686_UNKNOWN_LINUX_GNU_LINKER=clang-$llvm_version + ;; + i686-unknown-linux-musl) + export CC_i686_unknown_linux_musl=clang-$llvm_version + export AR_i686_unknown_linux_musl=llvm-ar-$llvm_version + export CARGO_TARGET_I686_UNKNOWN_LINUX_MUSL_RUSTFLAGS="$rustflags_self_contained" + ;; + x86_64-unknown-linux-musl) + export CC_x86_64_unknown_linux_musl=clang-$llvm_version + export AR_x86_64_unknown_linux_musl=llvm-ar-$llvm_version + # XXX: Work around https://github.com/rust-lang/rust/issues/79555. + if [ -n "${RING_COVERAGE-}" ]; then + export CARGO_TARGET_X86_64_UNKNOWN_LINUX_MUSL_LINKER=clang-$llvm_version + else + export CARGO_TARGET_X86_64_UNKNOWN_LINUX_MUSL_RUSTFLAGS="$rustflags_self_contained" + fi + ;; + wasm32-unknown-unknown) + # The first two are only needed for when the "wasm_c" feature is enabled. + export CC_wasm32_unknown_unknown=clang-$llvm_version + export AR_wasm32_unknown_unknown=llvm-ar-$llvm_version + export CARGO_TARGET_WASM32_UNKNOWN_UNKNOWN_RUNNER=wasm-bindgen-test-runner + ;; + *) + ;; +esac + +if [ -n "${RING_COVERAGE-}" ]; then + # XXX: Collides between release and debug. 
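+  # (The path below is hard-coded to the `debug` profile directory, so
+  # release-mode coverage output would land in, and overwrite, the same
+  # location as debug-mode output.)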
+  coverage_dir=$PWD/target/$target/debug/coverage
+  mkdir -p "$coverage_dir"
+  rm -f "$coverage_dir"/*.profraw
+
+  export RING_BUILD_EXECUTABLE_LIST="$coverage_dir/executables"
+  truncate --size=0 "$RING_BUILD_EXECUTABLE_LIST"
+
+  # This doesn't work when profiling under QEMU. Instead mk/runner does
+  # something similar but different.
+  # export LLVM_PROFILE_FILE="$coverage_dir/%m.profraw"
+
+  # ${target} with hyphens replaced by underscores, lowercase and uppercase.
+  target_lower=${target//-/_}
+  target_upper=${target_lower^^}
+
+  cflags_var=CFLAGS_${target_lower}
+  declare -x "${cflags_var}=-fprofile-instr-generate -fcoverage-mapping ${!cflags_var-}"
+
+  runner_var=CARGO_TARGET_${target_upper}_RUNNER
+  declare -x "${runner_var}=mk/runner ${!runner_var-}"
+
+  rustflags_var=CARGO_TARGET_${target_upper}_RUSTFLAGS
+  declare -x "${rustflags_var}=-Zinstrument-coverage ${!rustflags_var-}"
+fi
+
+cargo "$@"
+
+if [ -n "${RING_COVERAGE-}" ]; then
+  while read executable; do
+    basename=$(basename "$executable")
+    llvm-profdata-$llvm_version merge -sparse "$coverage_dir/$basename.profraw" -o "$coverage_dir/$basename.profdata"
+    mkdir -p "$coverage_dir"/reports
+    llvm-cov-$llvm_version export \
+      --instr-profile "$coverage_dir/$basename.profdata" \
+      --format lcov \
+      "$executable" \
+      > "$coverage_dir"/reports/coverage-$basename.txt
+  done < "$RING_BUILD_EXECUTABLE_LIST"
+fi
diff --git a/mk/install-build-tools.ps1 b/mk/install-build-tools.ps1
new file mode 100644
index 0000000000..f1d51b981a
--- /dev/null
+++ b/mk/install-build-tools.ps1
@@ -0,0 +1,67 @@
+function Verify-Or-Delete-File {
+  param (
+    [Parameter(Mandatory)]
+    [string]$File,
+    [Parameter(Mandatory)]
+    [string]$ExpectedDigest
+  )
+  $ActualDigest = ( Get-FileHash -Algorithm SHA256 $File ).Hash
+  if ( $ActualDigest -eq $ExpectedDigest )
+  {
+    return
+  }
+  rm $File
+  echo "Digest verification failed for $File; actual $ActualDigest, expected $ExpectedDigest"
+  exit 1
+}
+
+function Download-Zip-and-Extract-File {
+  param (
+    [Parameter(Mandatory)]
+    [string]$Uri,
+    [Parameter(Mandatory)]
+    [string]$ZipExpectedDigest,
+    [Parameter(Mandatory)]
+    [string]$PathWithinZip,
+    [Parameter(Mandatory)]
+    [string]$FileExpectedDigest,
+    [Parameter(Mandatory)]
+    [string]$OutFile
+  )
+  $TmpZip = New-TemporaryFile
+  Invoke-WebRequest -Uri $Uri -OutFile $TmpZip.FullName
+  echo $TmpZip
+  Verify-Or-Delete-File -File $TmpZip.FullName -ExpectedDigest $ZipExpectedDigest
+
+  Add-Type -AssemblyName System.IO.Compression.FileSystem
+  $zip = [System.IO.Compression.ZipFile]::OpenRead($TmpZip)
+  $zip.Entries |
+    Where-Object { $_.FullName -eq $PathWithinZip } |
+    ForEach-Object {
+      $TmpFile = New-TemporaryFile
+      # extract the selected items from the ZIP archive
+      # and copy them to the out folder
+      $FileName = $_.Name
+      [System.IO.Compression.ZipFileExtensions]::ExtractToFile($_, "$TmpFile", $true)
+      Verify-Or-Delete-File -File $TmpFile -ExpectedDigest $FileExpectedDigest
+      Move-Item -Force $TmpFile $OutFile
+    }
+  $zip.Dispose()
+}
+
+$tools_dir = "target/tools"
+mkdir -Force $tools_dir
+
+# This is the file BoringSSL refers to in
+# https://boringssl.googlesource.com/boringssl/+/26f8297177ad8033cc39de84afe9c2000430a66d.
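+# Both the zip and the extracted nasm.exe are pinned to the SHA-256 digests
+# below, so a corrupted or tampered download fails closed rather than being
+# installed.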
+$nasm_version = "nasm-2.13.03" +$nasm_zip = "$nasm_version-win64.zip" +$nasm_zip_sha256 = "B3A1F896B53D07854884C2E0D6BE7DEFBA7EBD09B864BBB9E6D69ADA1C3E989F" +$nasm_exe = "nasm.exe" +$nasm_exe_sha256 = "D8A933BF5CC3597C56193135CB78B225AB225E1F611D2FDB51EF6E3F555B21E3" +Download-Zip-and-Extract-File ` + -Uri "https://www.nasm.us/pub/nasm/releasebuilds/2.13.03/win64/$nasm_zip" ` + -ZipExpectedDigest "$nasm_zip_sha256" ` + -PathWithinZip "$nasm_version/$nasm_exe" ` + -FileExpectedDigest "$nasm_exe_sha256" ` + -OutFile "$tools_dir/$nasm_exe" diff --git a/mk/install-build-tools.sh b/mk/install-build-tools.sh new file mode 100755 index 0000000000..e997bbb40e --- /dev/null +++ b/mk/install-build-tools.sh @@ -0,0 +1,95 @@ +#!/usr/bin/env bash +# +# Copyright 2020 Brian Smith. +# +# Permission to use, copy, modify, and/or distribute this software for any +# purpose with or without fee is hereby granted, provided that the above +# copyright notice and this permission notice appear in all copies. +# +# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES +# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY +# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +set -eux -o pipefail +IFS=$'\n\t' + +target=$1 +features=${2-} + +function install_packages { + sudo apt-get -yq --no-install-suggests --no-install-recommends install "$@" +} + +use_clang= +case $target in +--target*android*) + mkdir -p "${ANDROID_SDK_ROOT}/licenses" + android_license_file="${ANDROID_SDK_ROOT}/licenses/android-sdk-license" + accept_android_license=24333f8a63b6825ea9c5514f83c2829b004d1fee + grep --quiet --no-messages "$accept_android_license" "$android_license_file" \ + || echo $accept_android_license >> "$android_license_file" + sudo "${ANDROID_SDK_ROOT}/tools/bin/sdkmanager" ndk-bundle + ;; +esac + +case $target in +--target=aarch64-unknown-linux-gnu) + # Clang is needed for code coverage. + use_clang=1 + install_packages \ + qemu-user \ + gcc-aarch64-linux-gnu \ + libc6-dev-arm64-cross + ;; +--target=aarch64-unknown-linux-musl|--target=armv7-unknown-linux-musleabihf) + use_clang=1 + install_packages \ + qemu-user + ;; +--target=arm-unknown-linux-gnueabihf) + install_packages \ + qemu-user \ + gcc-arm-linux-gnueabihf \ + libc6-dev-armhf-cross + ;; +--target=i686-unknown-linux-gnu) + use_clang=1 + install_packages \ + gcc-multilib \ + libc6-dev-i386 + ;; +--target=i686-unknown-linux-musl|--target=x86_64-unknown-linux-musl) + use_clang=1 + ;; +--target=wasm32-unknown-unknown) + # The version of wasm-bindgen-cli must match the wasm-bindgen version. 
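+  # (`cargo metadata` reports the resolved dependency graph; jq extracts the
+  # `wasm-bindgen` crate's version so the CLI install below matches it.)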
+ wasm_bindgen_version=$(cargo metadata --format-version 1 | jq -r '.packages | map(select( .name == "wasm-bindgen")) | map(.version) | .[0]') + cargo install wasm-bindgen-cli --vers "$wasm_bindgen_version" --bin wasm-bindgen-test-runner + case ${features-} in + *wasm32_c*) + use_clang=1 + ;; + *) + ;; + esac + ;; +--target=*) + ;; +esac + +if [ -n "$use_clang" ]; then + llvm_version=10 + if [ -n "${RING_COVERAGE-}" ]; then + # https://github.com/rust-lang/rust/pull/79365 upgraded the coverage file + # format to one that only LLVM 11+ can use + llvm_version=11 + sudo apt-key add mk/llvm-snapshot.gpg.key + sudo add-apt-repository "deb http://apt.llvm.org/bionic/ llvm-toolchain-bionic-$llvm_version main" + sudo apt-get update + fi + install_packages clang-$llvm_version llvm-$llvm_version +fi diff --git a/mk/llvm-snapshot.gpg.key b/mk/llvm-snapshot.gpg.key new file mode 100644 index 0000000000..87a01ff889 --- /dev/null +++ b/mk/llvm-snapshot.gpg.key @@ -0,0 +1,54 @@ +-----BEGIN PGP PUBLIC KEY BLOCK----- +Version: GnuPG v1.4.12 (GNU/Linux) +Comment: See https://apt.llvm.org/. +Comment: Fingerprint: 6084 F3CF 814B 57C1 CF12 EFD5 15CF 4D18 AF4F 7421 + +mQINBFE9lCwBEADi0WUAApM/mgHJRU8lVkkw0CHsZNpqaQDNaHefD6Rw3S4LxNmM +EZaOTkhP200XZM8lVdbfUW9xSjA3oPldc1HG26NjbqqCmWpdo2fb+r7VmU2dq3NM +R18ZlKixiLDE6OUfaXWKamZsXb6ITTYmgTO6orQWYrnW6ckYHSeaAkW0wkDAryl2 +B5v8aoFnQ1rFiVEMo4NGzw4UX+MelF7rxaaregmKVTPiqCOSPJ1McC1dHFN533FY +Wh/RVLKWo6npu+owtwYFQW+zyQhKzSIMvNujFRzhIxzxR9Gn87MoLAyfgKEzrbbT +DhqqNXTxS4UMUKCQaO93TzetX/EBrRpJj+vP640yio80h4Dr5pAd7+LnKwgpTDk1 +G88bBXJAcPZnTSKu9I2c6KY4iRNbvRz4i+ZdwwZtdW4nSdl2792L7Sl7Nc44uLL/ +ZqkKDXEBF6lsX5XpABwyK89S/SbHOytXv9o4puv+65Ac5/UShspQTMSKGZgvDauU +cs8kE1U9dPOqVNCYq9Nfwinkf6RxV1k1+gwtclxQuY7UpKXP0hNAXjAiA5KS5Crq +7aaJg9q2F4bub0mNU6n7UI6vXguF2n4SEtzPRk6RP+4TiT3bZUsmr+1ktogyOJCc +Ha8G5VdL+NBIYQthOcieYCBnTeIH7D3Sp6FYQTYtVbKFzmMK+36ERreL/wARAQAB +tD1TeWx2ZXN0cmUgTGVkcnUgLSBEZWJpYW4gTExWTSBwYWNrYWdlcyA8c3lsdmVz +dHJlQGRlYmlhbi5vcmc+iQI4BBMBAgAiBQJRPZQsAhsDBgsJCAcDAgYVCAIJCgsE +FgIDAQIeAQIXgAAKCRAVz00Yr090Ibx+EADArS/hvkDF8juWMXxh17CgR0WZlHCC +9CTBWkg5a0bNN/3bb97cPQt/vIKWjQtkQpav6/5JTVCSx2riL4FHYhH0iuo4iAPR +udC7Cvg8g7bSPrKO6tenQZNvQm+tUmBHgFiMBJi92AjZ/Qn1Shg7p9ITivFxpLyX +wpmnF1OKyI2Kof2rm4BFwfSWuf8Fvh7kDMRLHv+MlnK/7j/BNpKdozXxLcwoFBmn +l0WjpAH3OFF7Pvm1LJdf1DjWKH0Dc3sc6zxtmBR/KHHg6kK4BGQNnFKujcP7TVdv +gMYv84kun14pnwjZcqOtN3UJtcx22880DOQzinoMs3Q4w4o05oIF+sSgHViFpc3W +R0v+RllnH05vKZo+LDzc83DQVrdwliV12eHxrMQ8UYg88zCbF/cHHnlzZWAJgftg +hB08v1BKPgYRUzwJ6VdVqXYcZWEaUJmQAPuAALyZESw94hSo28FAn0/gzEc5uOYx +K+xG/lFwgAGYNb3uGM5m0P6LVTfdg6vDwwOeTNIExVk3KVFXeSQef2ZMkhwA7wya +KJptkb62wBHFE+o9TUdtMCY6qONxMMdwioRE5BYNwAsS1PnRD2+jtlI0DzvKHt7B +MWd8hnoUKhMeZ9TNmo+8CpsAtXZcBho0zPGz/R8NlJhAWpdAZ1CmcPo83EW86Yq7 +BxQUKnNHcwj2ebkCDQRRPZQsARAA4jxYmbTHwmMjqSizlMJYNuGOpIidEdx9zQ5g +zOr431/VfWq4S+VhMDhs15j9lyml0y4ok215VRFwrAREDg6UPMr7ajLmBQGau0Fc +bvZJ90l4NjXp5p0NEE/qOb9UEHT7EGkEhaZ1ekkWFTWCgsy7rRXfZLxB6sk7pzLC +DshyW3zjIakWAnpQ5j5obiDy708pReAuGB94NSyb1HoW/xGsGgvvCw4r0w3xPStw +F1PhmScE6NTBIfLliea3pl8vhKPlCh54Hk7I8QGjo1ETlRP4Qll1ZxHJ8u25f/ta +RES2Aw8Hi7j0EVcZ6MT9JWTI83yUcnUlZPZS2HyeWcUj+8nUC8W4N8An+aNps9l/ +21inIl2TbGo3Yn1JQLnA1YCoGwC34g8QZTJhElEQBN0X29ayWW6OdFx8MDvllbBV +ymmKq2lK1U55mQTfDli7S3vfGz9Gp/oQwZ8bQpOeUkc5hbZszYwP4RX+68xDPfn+ +M9udl+qW9wu+LyePbW6HX90LmkhNkkY2ZzUPRPDHZANU5btaPXc2H7edX4y4maQa +xenqD0lGh9LGz/mps4HEZtCI5CY8o0uCMF3lT0XfXhuLksr7Pxv57yue8LLTItOJ +d9Hmzp9G97SRYYeqU+8lyNXtU2PdrLLq7QHkzrsloG78lCpQcalHGACJzrlUWVP/ +fN3Ht3kAEQEAAYkCHwQYAQIACQUCUT2ULAIbDAAKCRAVz00Yr090IbhWEADbr50X 
+OEXMIMGRLe+YMjeMX9NG4jxs0jZaWHc/WrGR+CCSUb9r6aPXeLo+45949uEfdSsB +pbaEdNWxF5Vr1CSjuO5siIlgDjmT655voXo67xVpEN4HhMrxugDJfCa6z97P0+ML +PdDxim57uNqkam9XIq9hKQaurxMAECDPmlEXI4QT3eu5qw5/knMzDMZj4Vi6hovL +wvvAeLHO/jsyfIdNmhBGU2RWCEZ9uo/MeerPHtRPfg74g+9PPfP6nyHD2Wes6yGd +oVQwtPNAQD6Cj7EaA2xdZYLJ7/jW6yiPu98FFWP74FN2dlyEA2uVziLsfBrgpS4l +tVOlrO2YzkkqUGrybzbLpj6eeHx+Cd7wcjI8CalsqtL6cG8cUEjtWQUHyTbQWAgG +5VPEgIAVhJ6RTZ26i/G+4J8neKyRs4vz+57UGwY6zI4AB1ZcWGEE3Bf+CDEDgmnP +LSwbnHefK9IljT9XU98PelSryUO/5UPw7leE0akXKB4DtekToO226px1VnGp3Bov +1GBGvpHvL2WizEwdk+nfk8LtrLzej+9FtIcq3uIrYnsac47Pf7p0otcFeTJTjSq3 +krCaoG4Hx0zGQG2ZFpHrSrZTVy6lxvIdfi0beMgY6h78p6M9eYZHQHc02DjFkQXN +bXb5c6gCHESH5PXwPU4jQEE7Ib9J6sbk7ZT2Mw== +=j+4q +-----END PGP PUBLIC KEY BLOCK----- diff --git a/mk/package.sh b/mk/package.sh index 43b9851272..7def4b623e 100644 --- a/mk/package.sh +++ b/mk/package.sh @@ -1,4 +1,3 @@ - # This only works on Windows, using MinGW. set -eux -o pipefail IFS=$'\n\t' diff --git a/mk/runner b/mk/runner new file mode 100755 index 0000000000..ffa1441084 --- /dev/null +++ b/mk/runner @@ -0,0 +1,21 @@ +#!/bin/bash +set -eux -o pipefail +IFS=$'\n\t' + +for arg in $*; do + # There can be some arguments prefixed in front of the executable, e.g. + # when qemu-user is used. There can be arguments after the executable, + # e.g. `cargo test` arguments like `TESTNAME`. + if [[ $arg = */deps/* ]]; then + executable=$arg + break + fi +done + +export LLVM_PROFILE_FILE=$(dirname "$RING_BUILD_EXECUTABLE_LIST")/$(basename "$executable").profraw + +if [ -n "$RING_BUILD_EXECUTABLE_LIST" ]; then + echo "$executable" >> "$RING_BUILD_EXECUTABLE_LIST" +fi + +$* diff --git a/mk/travis-install-kcov.sh b/mk/travis-install-kcov.sh deleted file mode 100755 index af1dcbf5e5..0000000000 --- a/mk/travis-install-kcov.sh +++ /dev/null @@ -1,67 +0,0 @@ -#!/usr/bin/env bash -# -# Copyright (c) 2016 Pietro Monteiro -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. -set -ex - - -# kcov 26 or newer is needed when getting coverage information for Rust. -# kcov 31 is needed so `kcov --version` doesn't exit with status 1. -KCOV_VERSION=${KCOV_VERSION:-36} - -KCOV_INSTALL_PREFIX="${HOME}/kcov-${TARGET_X}" - -# Check if kcov has been cached on travis. -if [[ -f "$KCOV_INSTALL_PREFIX/bin/kcov" ]]; then - KCOV_INSTALLED_VERSION=`$KCOV_INSTALL_PREFIX/bin/kcov --version` - # Exit if we don't need to upgrade kcov. 
- if [[ "$KCOV_INSTALLED_VERSION" == "kcov $KCOV_VERSION" ]]; then - echo "Using cached kcov version: ${KCOV_VERSION}" - exit 0 - else - rm -rf "$KCOV_INSTALL_PREFIX" - fi -fi - -curl -L https://github.com/SimonKagstrom/kcov/archive/v$KCOV_VERSION.tar.gz | tar -zxf - - -pushd kcov-$KCOV_VERSION - -mkdir build - -pushd build - -if [[ "$TARGET_X" == "i686-unknown-linux-gnu" ]]; then - # set PKG_CONFIG_PATH so the kcov build system uses the 32 bit libraries we installed. - # otherwise kcov will be linked with 64 bit libraries and won't work with 32 bit executables. - PKG_CONFIG_PATH="/usr/lib/i386-linux-gnu/pkgconfig" CFLAGS="-m32" \ - CXXFLAGS="-m32" TARGET=$TARGET_X \ - cmake -DCMAKE_INSTALL_PREFIX:PATH="${KCOV_INSTALL_PREFIX}" .. -else - TARGET=$TARGET_X cmake -DCMAKE_INSTALL_PREFIX:PATH="${KCOV_INSTALL_PREFIX}" .. -fi - -make -make install - -$KCOV_INSTALL_PREFIX/bin/kcov --version - -popd -popd diff --git a/mk/travis.sh b/mk/travis.sh deleted file mode 100755 index 6938114c46..0000000000 --- a/mk/travis.sh +++ /dev/null @@ -1,170 +0,0 @@ -#!/usr/bin/env bash -# -# Copyright 2015 Brian Smith. -# -# Permission to use, copy, modify, and/or distribute this software for any -# purpose with or without fee is hereby granted, provided that the above -# copyright notice and this permission notice appear in all copies. -# -# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES -# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF -# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY -# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES -# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION -# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN -# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - -set -eux -o pipefail -IFS=$'\n\t' - -printenv - -case $TARGET_X in -aarch64-unknown-linux-gnu) - export QEMU_LD_PREFIX=/usr/aarch64-linux-gnu - ;; -arm-unknown-linux-gnueabihf) - export QEMU_LD_PREFIX=/usr/arm-linux-gnueabihf - ;; -aarch64-linux-android) - # XXX: Tests are built but not run because we couldn't get the emulator to work; see - # https://github.com/briansmith/ring/issues/838 - export ANDROID_ABI=aarch64 - ;; -armv7-linux-androideabi) - # XXX: Tests are built but not run because we couldn't get the emulator to work; see - # https://github.com/briansmith/ring/issues/838 - # export ANDROID_SYSTEM_IMAGE="system-images;android-18;default;armeabi-v7a" - export ANDROID_ABI=armeabi-v7a - ;; -esac - -if [[ ! -z "${ANDROID_ABI-}" ]]; then - # install the android sdk/ndk - mkdir "$ANDROID_HOME/licenses" || true - echo "24333f8a63b6825ea9c5514f83c2829b004d1fee" > "$ANDROID_HOME/licenses/android-sdk-license" - sdkmanager ndk-bundle - curl -sSf https://build.travis-ci.org/files/rustup-init.sh | sh -s -- --default-toolchain=$RUST_X -y - export PATH=$HOME/.cargo/bin:$ANDROID_HOME/ndk-bundle/toolchains/llvm/prebuilt/linux-x86_64/bin:$PATH - rustup default -fi - -if [[ "$TARGET_X" =~ ^(arm|aarch64) && ! "$TARGET_X" =~ android ]]; then - # We need a newer QEMU than Travis has. - # sudo is needed until the PPA and its packages are whitelisted. - # See https://github.com/travis-ci/apt-source-whitelist/issues/271 - sudo add-apt-repository ppa:pietro-monteiro/qemu-backport -y - sudo apt-get update -qq - sudo apt-get install --no-install-recommends binfmt-support qemu-user-binfmt -y -fi - -if [[ ! 
"$TARGET_X" =~ "x86_64-" ]]; then - rustup target add "$TARGET_X" - - # By default cargo/rustc seems to use cc for linking, We installed the - # multilib support that corresponds to $CC_X but unless cc happens to match - # $CC_X, that's not the right version. The symptom is a linker error - # where it fails to find -lgcc_s. - if [[ ! -z "${CC_X-}" ]]; then - mkdir .cargo - echo "[target.$TARGET_X]" > .cargo/config - echo "linker= \"$CC_X\"" >> .cargo/config - cat .cargo/config - fi -fi - -if [[ ! -z "${CC_X-}" ]]; then - export CC=$CC_X - $CC --version -else - cc --version -fi - -# KCOV needs a C++ compiler. -if [[ "$KCOV" == "1" ]]; then - if [[ ! -z "${CC_X-}" ]]; then - CXX="${CC_X/clang/clang++}" - CXX="${CC_X/gcc/g++}" - export CXX=$CXX - $CXX --version - else - c++ --version - fi -fi - -cargo version -rustc --version - -if [[ "$MODE_X" == "RELWITHDEBINFO" ]]; then - mode=--release - target_dir=target/$TARGET_X/release -else - target_dir=target/$TARGET_X/debug -fi - -if [[ -z "${ANDROID_ABI-}" ]]; then - cargo test -vv -j2 ${mode-} ${FEATURES_X-} --target=$TARGET_X -else - cargo test -vv -j2 --no-run ${mode-} ${FEATURES_X-} --target=$TARGET_X - - if [[ ! -z "${ANDROID_SYSTEM_IMAGE-}" ]]; then - # Building the AVD is slow. Do it here, after we build the code so that any - # build breakage is reported sooner, instead of being delayed by this. - sdkmanager tools - echo no | avdmanager create avd --force --name $ANDROID_ABI -k $ANDROID_SYSTEM_IMAGE --abi $ANDROID_ABI - avdmanager list avd - - $ANDROID_HOME/emulator/emulator @$ANDROID_ABI -memory 2048 -no-skin -no-boot-anim -no-window & - adb wait-for-device - - # Run the unit tests first. The file named ring- in $target_dir is - # the test executable. - - find $target_dir -maxdepth 1 -name ring-* ! -name "*.*" \ - -exec adb push {} /data/ring-test \; - adb shell "cd /data && ./ring-test" 2>&1 | tee /tmp/ring-test-log - grep "test result: ok" /tmp/ring-test-log - - for test_exe in `find $target_dir -maxdepth 1 -name "*test*" -type f ! -name "*.*" `; do - adb push $test_exe /data/`basename $test_exe` - adb shell "cd /data && ./`basename $test_exe`" 2>&1 | \ - tee /tmp/`basename $test_exe`-log - grep "test result: ok" /tmp/`basename $test_exe`-log - done - - adb emu kill - fi -fi - -if [[ "$KCOV" == "1" ]]; then - # kcov reports coverage as a percentage of code *linked into the executable* - # (more accurately, code that has debug info linked into the executable), not - # as a percentage of source code. Thus, any code that gets discarded by the - # linker due to lack of usage isn't counted at all. Thus, we have to re-link - # with "-C link-dead-code" to get accurate code coverage reports. - # Alternatively, we could link pass "-C link-dead-code" in the "cargo test" - # step above, but then "cargo test" we wouldn't be testing the configuration - # we expect people to use in production. - # - # panic=abort is used to get accurate coverage. See - # https://github.com/rust-lang/rust/issues/43410 and - # https://github.com/mozilla/grcov/issues/427#issuecomment-623995594 and - # https://github.com/rust-lang/rust/issues/55352. 
- cargo clean - CARGO_INCREMENTAL=0 \ - RUSTDOCFLAGS="-Cpanic=abort" \ - RUSTFLAGS="-Ccodegen-units=1 -Clink-dead-code -Coverflow-checks=on -Cpanic=abort -Zpanic_abort_tests -Zprofile" \ - cargo test -vv --no-run -j2 ${mode-} ${FEATURES_X-} --target=$TARGET_X - mk/travis-install-kcov.sh - for test_exe in `find target/$TARGET_X/debug -maxdepth 1 -executable -type f`; do - ${HOME}/kcov-${TARGET_X}/bin/kcov \ - --verify \ - --coveralls-id=$TRAVIS_JOB_ID \ - --exclude-path=/usr/include \ - --include-pattern="ring/crypto,ring/src,ring/tests" \ - target/kcov \ - $test_exe - done -fi - -echo end of mk/travis.sh diff --git a/mk/update-travis-yml.py b/mk/update-travis-yml.py deleted file mode 100755 index 4468484111..0000000000 --- a/mk/update-travis-yml.py +++ /dev/null @@ -1,285 +0,0 @@ -# Run this as "python mk/update-travis-yml.py" - -# Copyright 2015 Brian Smith. -# -# Permission to use, copy, modify, and/or distribute this software for any -# purpose with or without fee is hereby granted, provided that the above -# copyright notice and this permission notice appear in all copies. -# -# THE SOFTWARE IS PROVIDED "AS IS" AND BRIAN SMITH AND THE AUTHORS DISCLAIM -# ALL WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES -# OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL BRIAN SMITH OR THE AUTHORS -# BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY -# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN -# AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF -# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - -import re -import shutil - -rusts = [ - "stable", - "nightly", - "beta", -] - -gcc = "gcc-7" -#Clang 5.0 is the default compiler on Travis CI for Ubuntu 14.04. -clang = "clang" - -linux_compilers = [ - # Assume the default compiler is GCC. - # GCC 4.8 is the default compiler on Travis CI for Ubuntu 14.04. - "", - - clang, - - gcc, -] - -osx_compilers = [ - "", # Don't set CC.' -] - -compilers = { - "aarch64-unknown-linux-gnu" : [ "aarch64-linux-gnu-gcc" ], - "aarch64-linux-android" : [ "aarch64-linux-android21-clang" ], - "armv7-linux-androideabi" : [ "armv7a-linux-androideabi18-clang" ], - "arm-unknown-linux-gnueabihf" : [ "arm-linux-gnueabihf-gcc" ], - "i686-unknown-linux-gnu" : linux_compilers, - "x86_64-unknown-linux-gnu" : linux_compilers, - "x86_64-apple-darwin" : osx_compilers, -} - -feature_sets = [ - "", -] - -modes = [ - "DEBUG", - "RELWITHDEBINFO" -] - -# Mac OS X is first because we don't want to have to wait until all the Linux -# configurations have been built to find out that there is a failure on Mac. -oss = [ - "osx", - "linux", -] - -targets = { - "osx" : [ - "x86_64-apple-darwin", - ], - "linux" : [ - "aarch64-linux-android", - "armv7-linux-androideabi", - "x86_64-unknown-linux-gnu", - "aarch64-unknown-linux-gnu", - "i686-unknown-linux-gnu", - "arm-unknown-linux-gnueabihf", - ], -} - -def format_entries(): - return "\n".join([format_entry(os, target, compiler, rust, mode, features) - for rust in rusts - for os in oss - for target in targets[os] - for compiler in compilers[target] - for mode in modes - for features in feature_sets]) - -# We use alternative names (the "_X" suffix) so that, in mk/travis.sh, we can -# ensure that we set the specific variables we want and that no relevant -# variables are unintentially inherited into the build process. 
Also, we have -# to set |CC_X| instead of |CC| since Travis sets |CC| to its Travis CI default -# value *after* processing the |env:| directive here. -entry_template = """ - - env: TARGET_X=%(target)s %(compilers)s FEATURES_X=%(features)s MODE_X=%(mode)s KCOV=%(kcov)s RUST_X=%(rust)s - rust: %(rust)s - os: %(os)s""" - -entry_indent = " " - -entry_packages_template = """ - addons: - apt: - packages: - %(packages)s""" - -entry_sources_template = """ - sources: - %(sources)s""" - -def format_entry(os, target, compiler, rust, mode, features): - target_words = target.split("-") - arch = target_words[0] - vendor = target_words[1] - sys = target_words[2] - - # Currently kcov only runs on Linux. - # - # GCC 7 was picked arbitrarily to restrict coverage report to one build for - # efficiency reasons. - # - # DEBUG mode is needed because debug symbols are needed for coverage - # tracking. - kcov = (os == "linux" and compiler == gcc and rust == "nightly" and - mode == "DEBUG") - - template = entry_template - - if sys == "darwin": - abi = sys - sys = "macos" - elif sys == "androideabi": - abi = sys - sys = "linux" - template += """ - language: android - android: - components: - - android-18 - - build-tools-26.0.2 - - sys-img-armeabi-v7a-android-18""" - elif sys == "android": - abi = sys - sys = "linux" - template += """ - language: android - android: - components: - - android-21 - - build-tools-26.0.2""" - else: - abi = target_words[3] - - def prefix_all(prefix, xs): - return [prefix + x for x in xs] - - if sys == "linux": - packages = sorted(get_linux_packages_to_install(target, compiler, arch, kcov)) - sources_with_dups = sum([get_sources_for_package(p) for p in packages],[]) - sources = sorted(list(set(sources_with_dups))) - template += """ - dist: trusty""" - - if sys == "linux": - if packages: - template += entry_packages_template - if sources: - template += entry_sources_template - else: - packages = [] - sources = [] - - cc = compiler - - if os == "osx": - os += "\n" + entry_indent + "osx_image: xcode10.1" - - compilers = [] - if cc != "": - compilers += ["CC_X=" + cc] - compilers += "" - - return template % { - "compilers": " ".join(compilers), - "features" : features, - "mode" : mode, - "kcov": "1" if kcov == True else "0", - "packages" : "\n ".join(prefix_all("- ", packages)), - "rust" : rust, - "sources" : "\n ".join(prefix_all("- ", sources)), - "target" : target, - "os" : os, - } - -def get_linux_packages_to_install(target, compiler, arch, kcov): - if compiler.startswith("clang-") or compiler.startswith("gcc-"): - packages = [compiler] - else: - packages = [] - - if kcov: - packages += [replace_cc_with_cxx(compiler)] - - if target == "aarch64-unknown-linux-gnu": - packages += ["gcc-aarch64-linux-gnu", - "libc6-dev-arm64-cross"] - if target == "arm-unknown-linux-gnueabihf": - packages += ["gcc-arm-linux-gnueabihf", - "libc6-dev-armhf-cross"] - - if arch == "i686": - if kcov == True: - packages += [replace_cc_with_cxx(compiler) + "-multilib", - "libcurl3:i386", - "libcurl4-openssl-dev:i386", - "libdw-dev:i386", - "libelf-dev:i386", - "libiberty-dev:i386", - "libkrb5-dev:i386", - "libssl-dev:i386"] - - if compiler.startswith("clang") or compiler == "": - packages += ["libc6-dev-i386", - "gcc-multilib"] - elif compiler.startswith("gcc-"): - packages += [compiler + "-multilib", - "linux-libc-dev:i386"] - else: - raise ValueError("unexpected compiler: %s" % compiler) - elif arch == "x86_64": - if kcov == True: - packages += ["libcurl4-openssl-dev", - "libelf-dev", - "libdw-dev", - 
"binutils-dev", - "libiberty-dev"] - elif arch not in ["aarch64", "arm", "armv7"]: - raise ValueError("unexpected arch: %s" % arch) - - return packages - -def get_sources_for_package(package): - ubuntu_toolchain = "ubuntu-toolchain-r-test" - if package.startswith("clang-"): - _, version = package.split("-") - llvm_toolchain = "llvm-toolchain-trusty-%s" % version - - # Stuff in llvm-toolchain-trusty depends on stuff in the toolchain - # packages. - return [llvm_toolchain, ubuntu_toolchain] - elif package.startswith("gcc-"): - return [ubuntu_toolchain] - else: - return [] - -def replace_cc_with_cxx(compiler): - return compiler \ - .replace("gcc", "g++") \ - .replace("clang", "clang++") - -def main(): - # Make a backup of the file we are about to update. - shutil.copyfile(".travis.yml", ".travis.yml~") - with open(".travis.yml", "r+b") as file: - begin = " # BEGIN GENERATED\n" - end = " # END GENERATED\n" - old_contents = file.read() - new_contents = re.sub("%s(.*?)\n[ ]*%s" % (begin, end), - "".join([begin, format_entries(), "\n\n", end]), - old_contents, flags=re.S) - if old_contents == new_contents: - print "No changes" - return - - file.seek(0) - file.write(new_contents) - file.truncate() - print new_contents - -if __name__ == '__main__': - main() diff --git a/pregenerate_asm/Cargo.toml b/pregenerate_asm/Cargo.toml index a200e0a4c2..8a64273f8a 100644 --- a/pregenerate_asm/Cargo.toml +++ b/pregenerate_asm/Cargo.toml @@ -9,4 +9,4 @@ path = "../build.rs" # Keep this in sync with `[build-dependencies]` in ../Cargo.toml. [dependencies] -cc = "1.0.26" +cc = { version = "1.0.62", default-features = false } diff --git a/src/aead.rs b/src/aead.rs index 40d7f201b6..4d6bdb7904 100644 --- a/src/aead.rs +++ b/src/aead.rs @@ -398,7 +398,7 @@ impl core::fmt::Debug for UnboundKey { } } -#[allow(variant_size_differences)] +#[allow(clippy::large_enum_variant, variant_size_differences)] enum KeyInner { AesGcm(aes_gcm::Key), ChaCha20Poly1305(chacha20_poly1305::Key), @@ -635,7 +635,7 @@ impl Eq for Algorithm {} /// An authentication tag. #[must_use] #[repr(C)] -pub struct Tag(Block); +pub struct Tag([u8; TAG_LEN]); impl AsRef<[u8]> for Tag { fn as_ref(&self) -> &[u8] { diff --git a/src/aead/aes.rs b/src/aead/aes.rs index b029668fb9..28558eb35e 100644 --- a/src/aead/aes.rs +++ b/src/aead/aes.rs @@ -12,7 +12,7 @@ // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. -use super::{counter, iv::Iv, Block, Direction, BLOCK_LEN}; +use super::{counter, iv::Iv, quic::Sample, Block, Direction, BLOCK_LEN}; use crate::{bits::BitLength, c, cpu, endian::*, error, polyfill}; pub(crate) struct Key { @@ -152,7 +152,7 @@ impl Key { set_encrypt_key!(GFp_vpaes_set_encrypt_key, bytes, key_bits, &mut key)? } - #[cfg(all(not(target_arch = "aarch64"), not(target_env = "sgx")))] + #[cfg(not(target_arch = "aarch64"))] Implementation::NOHW => { set_encrypt_key!(GFp_aes_nohw_set_encrypt_key, bytes, key_bits, &mut key)? 
} @@ -183,7 +183,7 @@ impl Key { ))] Implementation::VPAES_BSAES => encrypt_block!(GFp_vpaes_encrypt, a, self), - #[cfg(all(not(target_arch = "aarch64"), not(target_env = "sgx")))] + #[cfg(not(target_arch = "aarch64"))] Implementation::NOHW => encrypt_block!(GFp_aes_nohw_encrypt, a, self), } } @@ -280,7 +280,7 @@ impl Key { }); } - #[cfg(all(not(target_arch = "aarch64"), not(target_env = "sgx")))] + #[cfg(not(target_arch = "aarch64"))] Implementation::NOHW => ctr32_encrypt_blocks!( GFp_aes_nohw_ctr32_encrypt_blocks, in_out, @@ -291,8 +291,8 @@ impl Key { } } - pub fn new_mask(&self, sample: Block) -> [u8; 5] { - let block = self.encrypt_block(sample); + pub fn new_mask(&self, sample: Sample) -> [u8; 5] { + let block = self.encrypt_block(Block::from(&sample)); let mut out: [u8; 5] = [0; 5]; out.copy_from_slice(&block.as_ref()[..5]); @@ -300,6 +300,10 @@ impl Key { out } + // TODO: use `matches!` when MSRV increases to 1.42.0 and remove this + // `#[allow(...)]` + #[allow(clippy::unknown_clippy_lints)] + #[allow(clippy::match_like_matches_macro)] #[cfg(target_arch = "x86_64")] #[must_use] pub fn is_aes_hw(&self) -> bool { @@ -340,7 +344,7 @@ pub enum Implementation { target_arch = "aarch64", target_arch = "arm", target_arch = "x86_64", - target_arch = "x86" + target_arch = "x86", ))] HWAES = 1, @@ -353,7 +357,7 @@ pub enum Implementation { ))] VPAES_BSAES = 2, - #[cfg(all(not(target_arch = "aarch64"), not(target_env = "sgx")))] + #[cfg(not(target_arch = "aarch64"))] NOHW = 3, } @@ -398,15 +402,10 @@ fn detect_implementation(cpu_features: cpu::Features) -> Implementation { Implementation::VPAES_BSAES } - #[cfg(all(not(target_arch = "aarch64"), not(target_env = "sgx")))] + #[cfg(not(target_arch = "aarch64"))] { Implementation::NOHW } - - #[cfg(target_env = "sgx")] - { - panic!("No AES implementation available!") - } } #[must_use] diff --git a/src/aead/aes_gcm.rs b/src/aead/aes_gcm.rs index 350e397be3..b225e76821 100644 --- a/src/aead/aes_gcm.rs +++ b/src/aead/aes_gcm.rs @@ -190,7 +190,7 @@ fn aead( let bytes = tag_iv.into_bytes_less_safe(); let mut tag = aes_key.encrypt_block(Block::from(&bytes)); tag.bitxor_assign(pre_tag.into()); - Tag(tag) + Tag(*tag.as_ref()) }) } diff --git a/src/aead/block.rs b/src/aead/block.rs index 658ac22d44..157f8ad842 100644 --- a/src/aead/block.rs +++ b/src/aead/block.rs @@ -87,6 +87,7 @@ impl From<&'_ [u8; BLOCK_LEN]> for Block { } impl AsRef<[u8; BLOCK_LEN]> for Block { + #[allow(clippy::transmute_ptr_to_ptr)] #[inline] fn as_ref(&self) -> &[u8; BLOCK_LEN] { unsafe { core::mem::transmute(self) } diff --git a/src/aead/chacha.rs b/src/aead/chacha.rs index cd6eed205f..5e015083b1 100644 --- a/src/aead/chacha.rs +++ b/src/aead/chacha.rs @@ -13,16 +13,16 @@ // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
-use super::{counter, iv::Iv, Block, BLOCK_LEN}; +use super::{counter, iv::Iv, quic::Sample, BLOCK_LEN}; use crate::{c, endian::*}; -#[repr(C)] -pub struct Key([u8; KEY_LEN]); +#[repr(transparent)] +pub struct Key([LittleEndian; KEY_LEN / 4]); impl From<[u8; KEY_LEN]> for Key { #[inline] fn from(value: [u8; KEY_LEN]) -> Self { - Self(value) + Self(FromByteArray::from_byte_array(&value)) } } @@ -52,9 +52,9 @@ impl Key { } #[inline] - pub fn new_mask(&self, sample: Block) -> [u8; 5] { + pub fn new_mask(&self, sample: Sample) -> [u8; 5] { let mut out: [u8; 5] = [0; 5]; - let iv = Iv::assume_unique_for_key(*sample.as_ref()); + let iv = Iv::assume_unique_for_key(sample); unsafe { self.encrypt( diff --git a/src/aead/chacha20_poly1305.rs b/src/aead/chacha20_poly1305.rs index 26d3315b27..1890648449 100644 --- a/src/aead/chacha20_poly1305.rs +++ b/src/aead/chacha20_poly1305.rs @@ -80,7 +80,7 @@ fn aead( Aad(aad): Aad<&[u8]>, in_out: &mut [u8], direction: Direction, - _todo: cpu::Features, + cpu_features: cpu::Features, ) -> Tag { let chacha20_key = match key { aead::KeyInner::ChaCha20Poly1305(key) => key, @@ -89,7 +89,7 @@ fn aead( let mut counter = Counter::zero(nonce); let mut ctx = { - let key = derive_poly1305_key(chacha20_key, counter.increment()); + let key = derive_poly1305_key(chacha20_key, counter.increment(), cpu_features); poly1305::Context::from_key(key) }; @@ -108,12 +108,12 @@ fn aead( } }; - ctx.update_block( + ctx.update( Block::from_u64_le( LittleEndian::from(polyfill::u64_from_usize(aad.len())), LittleEndian::from(polyfill::u64_from_usize(in_out_len)), - ), - poly1305::Pad::Pad, + ) + .as_ref(), ); ctx.finish() } @@ -123,20 +123,24 @@ fn poly1305_update_padded_16(ctx: &mut poly1305::Context, input: &[u8]) { let remainder_len = input.len() % BLOCK_LEN; let whole_len = input.len() - remainder_len; if whole_len > 0 { - ctx.update_blocks(&input[..whole_len]); + ctx.update(&input[..whole_len]); } if remainder_len > 0 { let mut block = Block::zero(); block.overwrite_part_at(0, &input[whole_len..]); - ctx.update_block(block, poly1305::Pad::Pad) + ctx.update(block.as_ref()) } } // Also used by chacha20_poly1305_openssh. -pub(super) fn derive_poly1305_key(chacha_key: &chacha::Key, iv: Iv) -> poly1305::Key { +pub(super) fn derive_poly1305_key( + chacha_key: &chacha::Key, + iv: Iv, + cpu_features: cpu::Features, +) -> poly1305::Key { let mut key_bytes = [0u8; 2 * BLOCK_LEN]; chacha_key.encrypt_iv_xor_blocks_in_place(iv, &mut key_bytes); - poly1305::Key::from(key_bytes) + poly1305::Key::new(key_bytes, cpu_features) } #[cfg(test)] diff --git a/src/aead/chacha20_poly1305_openssh.rs b/src/aead/chacha20_poly1305_openssh.rs index 656ca3bf4d..cb6f6913e4 100644 --- a/src/aead/chacha20_poly1305_openssh.rs +++ b/src/aead/chacha20_poly1305_openssh.rs @@ -32,7 +32,7 @@ use super::{ chacha::{self, *}, chacha20_poly1305::derive_poly1305_key, - poly1305, Nonce, Tag, + cpu, poly1305, Nonce, Tag, }; use crate::{constant_time, endian::*, error}; use core::convert::TryInto; @@ -46,7 +46,7 @@ impl SealingKey { /// Constructs a new `SealingKey`. 
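`derive_poly1305_key` now also threads `cpu::Features` through to `poly1305::Key::new`, so implementation selection happens once per operation. The derivation itself is unchanged: per RFC 8439 §2.6, the one-time Poly1305 key is the first 32 bytes of ChaCha20 keystream block 0. A rough sketch with a placeholder keystream function (not ring's API):

```rust
const BLOCK_LEN: usize = 16;

// Placeholder: the real code calls `encrypt_iv_xor_blocks_in_place` on a
// zeroed 64-byte buffer to obtain keystream block 0.
fn chacha20_keystream_block0(_key: &[u8; 32], _nonce: &[u8; 12]) -> [u8; 64] {
    [0u8; 64]
}

// RFC 8439 2.6: take the first 2 * BLOCK_LEN = 32 bytes of keystream
// block 0 as the one-time Poly1305 key; the rest is discarded.
fn derive_poly1305_key(key: &[u8; 32], nonce: &[u8; 12]) -> [u8; 2 * BLOCK_LEN] {
    let block = chacha20_keystream_block0(key, nonce);
    let mut key_and_nonce = [0u8; 2 * BLOCK_LEN];
    key_and_nonce.copy_from_slice(&block[..2 * BLOCK_LEN]);
    key_and_nonce
}

fn main() {
    assert_eq!(derive_poly1305_key(&[0; 32], &[0; 12]).len(), 32);
}
```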
pub fn new(key_material: &[u8; KEY_LEN]) -> SealingKey { SealingKey { - key: Key::new(key_material), + key: Key::new(key_material, cpu::features()), } } @@ -64,7 +64,8 @@ impl SealingKey { tag_out: &mut [u8; TAG_LEN], ) { let mut counter = make_counter(sequence_number); - let poly_key = derive_poly1305_key(&self.key.k_2, counter.increment()); + let poly_key = + derive_poly1305_key(&self.key.k_2, counter.increment(), self.key.cpu_features); { let (len_in_out, data_and_padding_in_out) = @@ -92,7 +93,7 @@ impl OpeningKey { /// Constructs a new `OpeningKey`. pub fn new(key_material: &[u8; KEY_LEN]) -> OpeningKey { OpeningKey { - key: Key::new(key_material), + key: Key::new(key_material, cpu::features()), } } @@ -131,7 +132,8 @@ impl OpeningKey { // We must verify the tag before decrypting so that // `ciphertext_in_plaintext_out` is unmodified if verification fails. // This is beyond what we guarantee. - let poly_key = derive_poly1305_key(&self.key.k_2, counter.increment()); + let poly_key = + derive_poly1305_key(&self.key.k_2, counter.increment(), self.key.cpu_features); verify(poly_key, ciphertext_in_plaintext_out, tag)?; let plaintext_in_ciphertext_out = &mut ciphertext_in_plaintext_out[PACKET_LENGTH_LEN..]; @@ -146,10 +148,11 @@ impl OpeningKey { struct Key { k_1: chacha::Key, k_2: chacha::Key, + cpu_features: cpu::Features, } impl Key { - fn new(key_material: &[u8; KEY_LEN]) -> Key { + fn new(key_material: &[u8; KEY_LEN], cpu_features: cpu::Features) -> Key { // The first half becomes K_2 and the second half becomes K_1. let (k_2, k_1) = key_material.split_at(chacha::KEY_LEN); let k_1: [u8; chacha::KEY_LEN] = k_1.try_into().unwrap(); @@ -157,6 +160,7 @@ impl Key { Key { k_1: chacha::Key::from(k_1), k_2: chacha::Key::from(k_2), + cpu_features, } } } diff --git a/src/aead/gcm.rs b/src/aead/gcm.rs index bd777236a5..0e5c668c7e 100644 --- a/src/aead/gcm.rs +++ b/src/aead/gcm.rs @@ -65,7 +65,7 @@ impl Key { } } - #[cfg(all(not(target_arch = "aarch64"), not(target_env = "sgx")))] + #[cfg(not(target_arch = "aarch64"))] Implementation::Fallback => { h_table.Htable[0] = gcm_nohw::init(h); } @@ -168,7 +168,7 @@ impl Context { } } - #[cfg(all(not(target_arch = "aarch64"), not(target_env = "sgx")))] + #[cfg(not(target_arch = "aarch64"))] Implementation::Fallback => { gcm_nohw::ghash(xi, h_table.Htable[0], input); } @@ -210,7 +210,7 @@ impl Context { } } - #[cfg(all(not(target_arch = "aarch64"), not(target_env = "sgx")))] + #[cfg(not(target_arch = "aarch64"))] Implementation::Fallback => { gcm_nohw::gmult(xi, h_table.Htable[0]); } @@ -228,7 +228,6 @@ impl Context { pub(super) fn is_avx2(&self, cpu_features: cpu::Features) -> bool { match detect_implementation(cpu_features) { Implementation::CLMUL => has_avx_movbe(self.cpu_features), - #[cfg(not(target_env = "sgx"))] _ => false, } } @@ -289,7 +288,7 @@ enum Implementation { #[cfg(any(target_arch = "aarch64", target_arch = "arm"))] NEON, - #[cfg(all(not(target_arch = "aarch64"), not(target_env = "sgx")))] + #[cfg(not(target_arch = "aarch64"))] Fallback, } @@ -331,18 +330,13 @@ fn detect_implementation(cpu_features: cpu::Features) -> Implementation { return Implementation::NEON; } - #[cfg(all(not(target_arch = "aarch64"), not(target_env = "sgx")))] + #[cfg(not(target_arch = "aarch64"))] { return Implementation::Fallback; } - - #[cfg(target_env = "sgx")] - { - panic!("No GCM implementation available!") - } } #[cfg(target_arch = "x86_64")] fn has_avx_movbe(cpu_features: cpu::Features) -> bool { - return cpu::intel::AVX.available(cpu_features) && 
cpu::intel::MOVBE.available(cpu_features); + cpu::intel::AVX.available(cpu_features) && cpu::intel::MOVBE.available(cpu_features) } diff --git a/src/aead/poly1305.rs b/src/aead/poly1305.rs index b0df82040d..a87f709a4d 100644 --- a/src/aead/poly1305.rs +++ b/src/aead/poly1305.rs @@ -15,155 +15,105 @@ // TODO: enforce maximum input length. -use super::{ - block::{Block, BLOCK_LEN}, - Tag, -}; -use crate::{bssl, c, error}; -use core::convert::TryInto; +use super::{block::BLOCK_LEN, Tag, TAG_LEN}; +use crate::{c, cpu}; /// A Poly1305 key. -pub struct Key([u8; KEY_LEN]); +pub(super) struct Key { + key_and_nonce: [u8; KEY_LEN], + cpu_features: cpu::Features, +} const KEY_LEN: usize = 2 * BLOCK_LEN; -impl From<[u8; KEY_LEN]> for Key { +impl Key { #[inline] - fn from(value: [u8; KEY_LEN]) -> Self { - Self(value) + pub(super) fn new(key_and_nonce: [u8; KEY_LEN], cpu_features: cpu::Features) -> Self { + Self { + key_and_nonce, + cpu_features, + } } } pub struct Context { - opaque: Opaque, - nonce: Nonce, - func: Funcs, + state: poly1305_state, + #[allow(dead_code)] + cpu_features: cpu::Features, } -/// The memory manipulated by the assembly. -#[repr(C, align(8))] -struct Opaque([u8; OPAQUE_LEN]); -const OPAQUE_LEN: usize = 192; +// Keep in sync with `poly1305_state` in GFp/poly1305.h. +// +// The C code, in particular the way the `poly1305_aligned_state` functions +// are used, is only correct when the state buffer is 64-byte aligned. +#[repr(C, align(64))] +struct poly1305_state([u8; OPAQUE_LEN]); +const OPAQUE_LEN: usize = 512; + +// Abstracts the dispatching logic that chooses the NEON implementation if and +// only if it would work. +macro_rules! dispatch { + ( $features:expr => + ( $f:ident | $neon_f:ident ) + ( $( $p:ident : $t:ty ),+ ) + ( $( $a:expr ),+ ) ) => { + match () { + // Apple's 32-bit ARM ABI is incompatible with the assembly code. + #[cfg(all(target_arch = "arm", not(target_vendor = "apple")))] + () if cpu::arm::NEON.available($features) => { + extern "C" { + fn $neon_f( $( $p : $t ),+ ); + } + unsafe { $neon_f( $( $a ),+ ) } + } + () => { + extern "C" { + fn $f( $( $p : $t ),+ ); + } + unsafe { $f( $( $a ),+ ) } + } + } + } +} impl Context { #[inline] - pub fn from_key(Key(key_and_nonce): Key) -> Self { - extern "C" { - fn GFp_poly1305_blocks( - state: &mut Opaque, - input: *const u8, - len: c::size_t, - should_pad: Pad, - ); - fn GFp_poly1305_emit(state: &mut Opaque, tag: &mut Tag, nonce: &Nonce); - } - - let (key, nonce) = key_and_nonce.split_at(BLOCK_LEN); - let key: [u8; BLOCK_LEN] = key.try_into().unwrap(); - let nonce: [u8; BLOCK_LEN] = nonce.try_into().unwrap(); - - let key = DerivedKey(key); - let nonce = Nonce(nonce); - + pub(super) fn from_key( + Key { + key_and_nonce, + cpu_features, + }: Key, + ) -> Self { let mut ctx = Self { - opaque: Opaque([0u8; OPAQUE_LEN]), - nonce, - func: Funcs { - blocks_fn: GFp_poly1305_blocks, - emit_fn: GFp_poly1305_emit, - }, + state: poly1305_state([0u8; OPAQUE_LEN]), + cpu_features, }; - // On some platforms `init()` doesn't initialize `funcs`. The - // return value of `init()` indicates whether it did or not. Since - // we already gave `func` a default value above, we can ignore the - // return value assuming `init()` doesn't change `func` if it chose - // not to initialize it. Note that this is different than what - // BoringSSL does. 
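The `dispatch!` macro above centralizes the scalar-vs-NEON entry-point choice. A simplified, compilable sketch of the same shape, with safe Rust functions in place of the `extern "C"` symbols and `neon_available` standing in for `cpu::arm::NEON.available(..)`:

```rust
fn neon_available() -> bool {
    // Placeholder for runtime NEON detection; Apple's 32-bit ARM ABI is
    // excluded in the real macro because the assembly is incompatible.
    cfg!(all(target_arch = "arm", not(target_vendor = "apple")))
}

macro_rules! dispatch {
    ( ( $f:ident | $neon_f:ident ) ( $( $a:expr ),+ ) ) => {
        if neon_available() {
            $neon_f( $( $a ),+ )
        } else {
            $f( $( $a ),+ )
        }
    };
}

fn update(state: &mut u64, b: u8) { *state = state.wrapping_add(b as u64); }
fn update_neon(state: &mut u64, b: u8) { *state = state.wrapping_add(b as u64); }

fn main() {
    let mut state = 0u64;
    dispatch!((update | update_neon)(&mut state, 7));
    assert_eq!(state, 7);
}
```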
- let _ = init(&mut ctx.opaque, key, &mut ctx.func); + dispatch!( + cpu_features => + (GFp_poly1305_init | GFp_poly1305_init_neon) + (statep: &mut poly1305_state, key: &[u8; KEY_LEN]) + (&mut ctx.state, &key_and_nonce)); ctx } - pub fn update_block(&mut self, block: Block, pad: Pad) { - self.func.blocks(&mut self.opaque, block.as_ref(), pad); - } - - pub fn update_blocks(&mut self, input: &[u8]) { - debug_assert_eq!(input.len() % BLOCK_LEN, 0); - self.func.blocks(&mut self.opaque, input, Pad::Pad); + #[inline(always)] + pub fn update(&mut self, input: &[u8]) { + dispatch!( + self.cpu_features => + (GFp_poly1305_update | GFp_poly1305_update_neon) + (statep: &mut poly1305_state, input: *const u8, in_len: c::size_t) + (&mut self.state, input.as_ptr(), input.len())); } pub(super) fn finish(mut self) -> Tag { - self.func.emit(&mut self.opaque, &self.nonce) - } -} - -#[cfg(test)] -pub fn check_state_layout() { - let required_state_size = if cfg!(target_arch = "x86") { - // See comment above `_poly1305_init_sse2` in poly1305-x86.pl. - Some(4 * (5 + 1 + 4 + 2 + 4 * 9)) - } else if cfg!(target_arch = "x86_64") { - // See comment above `__poly1305_block` in poly1305-x86_64.pl. - Some(4 * (5 + 1 + 2 * 2 + 2 + 4 * 9)) - } else { - // TODO(davidben): Figure out the layout of the struct. For now, - // `OPAQUE_LEN` is taken from OpenSSL. - None - }; - - if let Some(required_state_size) = required_state_size { - assert!(core::mem::size_of::() >= required_state_size); - } -} - -#[repr(C)] -struct DerivedKey([u8; BLOCK_LEN]); - -/// This is *not* an "AEAD nonce"; it's a Poly1305-specific nonce. -#[repr(C)] -struct Nonce([u8; BLOCK_LEN]); - -#[repr(C)] -struct Funcs { - blocks_fn: - unsafe extern "C" fn(&mut Opaque, input: *const u8, input_len: c::size_t, should_pad: Pad), - emit_fn: unsafe extern "C" fn(&mut Opaque, &mut Tag, nonce: &Nonce), -} - -#[inline] -fn init(state: &mut Opaque, key: DerivedKey, func: &mut Funcs) -> Result<(), error::Unspecified> { - extern "C" { - fn GFp_poly1305_init_asm( - state: &mut Opaque, - key: &DerivedKey, - out_func: &mut Funcs, - ) -> bssl::Result; - } - Result::from(unsafe { GFp_poly1305_init_asm(state, &key, func) }) -} - -#[repr(u32)] -pub enum Pad { - AlreadyPadded = 0, - Pad = 1, -} - -impl Funcs { - #[inline] - fn blocks(&self, state: &mut Opaque, data: &[u8], should_pad: Pad) { - unsafe { - (self.blocks_fn)(state, data.as_ptr(), data.len(), should_pad); - } - } - - #[inline] - fn emit(&self, state: &mut Opaque, nonce: &Nonce) -> Tag { - let mut tag = Tag(Block::zero()); - unsafe { - (self.emit_fn)(state, &mut tag, nonce); - } + let mut tag = Tag([0u8; TAG_LEN]); + dispatch!( + self.cpu_features => + (GFp_poly1305_finish | GFp_poly1305_finish_neon) + (statep: &mut poly1305_state, mac: &mut [u8; TAG_LEN]) + (&mut self.state, &mut tag.0)); tag } } @@ -174,16 +124,7 @@ impl Funcs { /// poly1305 test vectors. 
pub(super) fn sign(key: Key, input: &[u8]) -> Tag { let mut ctx = Context::from_key(key); - let remainder_len = input.len() % BLOCK_LEN; - let full_blocks_len = input.len() - remainder_len; - let (full_blocks, remainder) = input.split_at(full_blocks_len); - ctx.update_blocks(full_blocks); - if remainder_len > 0 { - let mut bytes = [0; BLOCK_LEN]; - bytes[..remainder_len].copy_from_slice(remainder); - bytes[remainder_len] = 1; - ctx.update_block(Block::from(&bytes), Pad::AlreadyPadded); - } + ctx.update(input); ctx.finish() } @@ -193,21 +134,17 @@ mod tests { use crate::test; use core::convert::TryInto; - #[test] - pub fn test_state_layout() { - check_state_layout(); - } - // Adapted from BoringSSL's crypto/poly1305/poly1305_test.cc. #[test] pub fn test_poly1305() { + let cpu_features = cpu::features(); test::run(test_file!("poly1305_test.txt"), |section, test_case| { assert_eq!(section, ""); let key = test_case.consume_bytes("Key"); let key: &[u8; BLOCK_LEN * 2] = key.as_slice().try_into().unwrap(); let input = test_case.consume_bytes("Input"); let expected_mac = test_case.consume_bytes("MAC"); - let key = Key::from(*key); + let key = Key::new(*key, cpu_features); let Tag(actual_mac) = sign(key, &input); assert_eq!(expected_mac, actual_mac.as_ref()); diff --git a/src/aead/quic.rs b/src/aead/quic.rs index 7f0f0a795d..ac667aeda1 100644 --- a/src/aead/quic.rs +++ b/src/aead/quic.rs @@ -17,7 +17,7 @@ //! See draft-ietf-quic-tls. use crate::{ - aead::{aes, block::Block, chacha}, + aead::{aes, chacha}, cpu, error, hkdf, }; use core::convert::{TryFrom, TryInto}; @@ -28,7 +28,7 @@ pub struct HeaderProtectionKey { algorithm: &'static Algorithm, } -#[allow(variant_size_differences)] +#[allow(clippy::large_enum_variant, variant_size_differences)] enum KeyInner { Aes(aes::Key), ChaCha20(chacha::Key), @@ -63,9 +63,8 @@ impl HeaderProtectionKey { /// `sample` must be exactly `self.algorithm().sample_len()` bytes long. pub fn new_mask(&self, sample: &[u8]) -> Result<[u8; 5], error::Unspecified> { let sample = <&[u8; SAMPLE_LEN]>::try_from(sample)?; - let sample = Block::from(sample); - let out = (self.algorithm.new_mask)(&self.inner, sample); + let out = (self.algorithm.new_mask)(&self.inner, *sample); Ok(out) } @@ -78,11 +77,14 @@ impl HeaderProtectionKey { const SAMPLE_LEN: usize = super::TAG_LEN; +/// QUIC sample for new key masks +pub type Sample = [u8; SAMPLE_LEN]; + /// A QUIC Header Protection Algorithm. pub struct Algorithm { init: fn(key: &[u8], cpu_features: cpu::Features) -> Result, - new_mask: fn(key: &KeyInner, sample: Block) -> [u8; 5], + new_mask: fn(key: &KeyInner, sample: Sample) -> [u8; 5], key_len: usize, id: AlgorithmID, @@ -152,7 +154,7 @@ fn aes_init_256(key: &[u8], cpu_features: cpu::Features) -> Result [u8; 5] { +fn aes_new_mask(key: &KeyInner, sample: Sample) -> [u8; 5] { let aes_key = match key { KeyInner::Aes(key) => key, _ => unreachable!(), @@ -174,7 +176,7 @@ fn chacha20_init(key: &[u8], _todo: cpu::Features) -> Result [u8; 5] { +fn chacha20_new_mask(key: &KeyInner, sample: Sample) -> [u8; 5] { let chacha20_key = match key { KeyInner::ChaCha20(key) => key, _ => unreachable!(), diff --git a/src/agreement.rs b/src/agreement.rs index 4c1e803430..d116c8faab 100644 --- a/src/agreement.rs +++ b/src/agreement.rs @@ -63,7 +63,6 @@ // Model." 
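The quic.rs hunks above expose the sample as a plain `[u8; 16]` (`Sample`) instead of a `Block`. A usage sketch against the public API shown in this diff (key and sample bytes are illustrative, not test vectors):

```rust
use ring::aead::quic;

fn main() -> Result<(), ring::error::Unspecified> {
    // AES-128 header protection takes a 16-byte key.
    let key = quic::HeaderProtectionKey::new(&quic::AES_128, &[0u8; 16])?;

    // `new_mask` requires exactly `algorithm().sample_len()` bytes
    // (TAG_LEN == 16 for every supported algorithm).
    let sample = [0u8; 16];
    let mask: [u8; 5] = key.new_mask(&sample)?;
    assert_eq!(mask.len(), 5);
    Ok(())
}
```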
use crate::{cpu, debug, ec, error, rand}; -use untrusted; pub use crate::ec::{ curve25519::x25519::X25519, diff --git a/src/arithmetic/bigint.rs b/src/arithmetic/bigint.rs index f563b66c51..620595effd 100644 --- a/src/arithmetic/bigint.rs +++ b/src/arithmetic/bigint.rs @@ -46,7 +46,6 @@ use core::{ marker::PhantomData, ops::{Deref, DerefMut}, }; -use untrusted; pub unsafe trait Prime {} @@ -86,7 +85,7 @@ impl Clone for BoxedLimbs { fn clone(&self) -> Self { Self { limbs: self.limbs.clone(), - m: self.m.clone(), + m: self.m, } } } @@ -137,7 +136,7 @@ impl BoxedLimbs { fn zero(width: Width) -> Self { Self { - limbs: vec![0; width.num_limbs].to_owned().into_boxed_slice(), + limbs: vec![0; width.num_limbs].into_boxed_slice(), m: PhantomData, } } @@ -264,6 +263,7 @@ impl Modulus { // n_mod_r = n % r. As explained in the documentation for `n0`, this is // done by taking the lowest `N0_LIMBS_USED` limbs of `n`. + #[allow(clippy::useless_conversion)] let n0 = { extern "C" { fn GFp_bn_neg_inv_mod_r_u64(n: u64) -> u64; @@ -389,7 +389,7 @@ impl Clone for Elem { fn clone(&self) -> Self { Self { limbs: self.limbs.clone(), - encoding: self.encoding.clone(), + encoding: self.encoding, } } } @@ -1169,7 +1169,7 @@ impl Nonnegative { return Err(error::Unspecified); } } - return Ok(()); + Ok(()) } } @@ -1397,7 +1397,6 @@ mod tests { use super::*; use crate::test; use alloc::format; - use untrusted; // Type-level representation of an arbitrary modulus. struct M {} @@ -1530,7 +1529,7 @@ mod tests { #[test] fn test_modulus_debug() { let (modulus, _) = Modulus::::from_be_bytes_with_bit_length(untrusted::Input::from( - &vec![0xff; LIMB_BYTES * MODULUS_MIN_LIMBS], + &[0xff; LIMB_BYTES * MODULUS_MIN_LIMBS], )) .unwrap(); assert_eq!("Modulus", format!("{:?}", modulus)); diff --git a/src/cpu.rs b/src/cpu.rs index 6ce9ef1b06..f81cb8b4bc 100644 --- a/src/cpu.rs +++ b/src/cpu.rs @@ -24,24 +24,23 @@ pub(crate) struct Features(()); #[inline(always)] pub(crate) fn features() -> Features { - // We don't do runtime feature detection on iOS. instead some features are - // assumed to be present; see `arm::Feature`. - #[cfg(all( - any( - target_arch = "aarch64", - target_arch = "arm", - target_arch = "x86", - target_arch = "x86_64", - target_env = "sgx" - ), - not(target_os = "ios") + // We don't do runtime feature detection on aarch64-apple-* as all AAarch64 + // features we use are available on every device since the first devices. 
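The bigint `Clone` fixes above (`m: self.m`, `encoding: self.encoding`) work because `PhantomData<T>` is `Copy` for every `T`, which is exactly what clippy's `clone_on_copy` lint points out. A minimal illustration with invented names:

```rust
use core::marker::PhantomData;

struct BoxedLimbsLike<M> {
    limbs: Box<[u64]>,
    m: PhantomData<M>,
}

impl<M> Clone for BoxedLimbsLike<M> {
    fn clone(&self) -> Self {
        Self {
            limbs: self.limbs.clone(),
            m: self.m, // `PhantomData` is `Copy`; `.clone()` would be redundant.
        }
    }
}

fn main() {
    struct Marker;
    let a = BoxedLimbsLike::<Marker> { limbs: vec![1, 2].into_boxed_slice(), m: PhantomData };
    let _b = a.clone();
}
```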
+ #[cfg(any( + target_arch = "x86", + target_arch = "x86_64", + target_env = "sgx", + all( + any(target_arch = "aarch64", target_arch = "arm"), + any(target_os = "android", target_os = "fuchsia", target_os = "linux") + ) ))] { static INIT: spin::Once<()> = spin::Once::new(); let () = INIT.call_once(|| { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { - #[cfg(all(target_env = "sgx", feature = "sgx"))] + #[cfg(target_env = "sgx")] { extern crate std; use std::is_x86_feature_detected; @@ -52,97 +51,97 @@ pub(crate) fn features() -> Features { let [l1edx, l1ecx, l7ebx, l7ecx] = unsafe { &mut GFp_ia32cap_P }; if is_x86_feature_detected!("aes") { - *l1ecx |= 1<<25; + *l1ecx |= 1 << 25; } if is_x86_feature_detected!("pclmulqdq") { - *l1ecx |= 1<<1; + *l1ecx |= 1 << 1; } if is_x86_feature_detected!("rdrand") { - *l1ecx |= 1<<30; + *l1ecx |= 1 << 30; } if is_x86_feature_detected!("rdseed") { - *l7ebx |= 1<<18; + *l7ebx |= 1 << 18; } if is_x86_feature_detected!("tsc") { - *l1edx |= 1<<4; + *l1edx |= 1 << 4; } if is_x86_feature_detected!("mmx") { - *l1edx |= 1<<23; + *l1edx |= 1 << 23; } if is_x86_feature_detected!("sse") { - *l1edx |= 1<<25; + *l1edx |= 1 << 25; } if is_x86_feature_detected!("sse2") { - *l1edx |= 1<<26; + *l1edx |= 1 << 26; } if is_x86_feature_detected!("sse3") { - *l1ecx |= 1<<0; + *l1ecx |= 1 << 0; } if is_x86_feature_detected!("ssse3") { - *l1ecx |= 1<<9; + *l1ecx |= 1 << 9; } if is_x86_feature_detected!("sse4.1") { - *l1ecx |= 1<<19; + *l1ecx |= 1 << 19; } if is_x86_feature_detected!("sse4.2") { - *l1ecx |= 1<<20; + *l1ecx |= 1 << 20; } if is_x86_feature_detected!("sha") { - *l7ebx |= 1<<29; + *l7ebx |= 1 << 29; } if is_x86_feature_detected!("avx") { - *l1ecx |= 1<<28; + *l1ecx |= 1 << 28; } if is_x86_feature_detected!("avx2") { - *l7ebx |= 1<<5; + *l7ebx |= 1 << 5; } if is_x86_feature_detected!("avx512f") { - *l7ebx |= 1<<16; + *l7ebx |= 1 << 16; } if is_x86_feature_detected!("avx512cd") { - *l7ebx |= 1<<28; + *l7ebx |= 1 << 28; } if is_x86_feature_detected!("avx512er") { - *l7ebx |= 1<<27; + *l7ebx |= 1 << 27; } if is_x86_feature_detected!("avx512pf") { - *l7ebx |= 1<<26; + *l7ebx |= 1 << 26; } if is_x86_feature_detected!("avx512bw") { - *l7ebx |= 1<<30; + *l7ebx |= 1 << 30; } if is_x86_feature_detected!("avx512dq") { - *l7ebx |= 1<<17; + *l7ebx |= 1 << 17; } if is_x86_feature_detected!("avx512vl") { - *l7ebx |= 1<<31; + *l7ebx |= 1 << 31; } if is_x86_feature_detected!("avx512ifma") { - *l7ebx |= 1<<21; + *l7ebx |= 1 << 21; } if is_x86_feature_detected!("avx512vbmi") { - *l7ecx |= 1<<1; + *l7ecx |= 1 << 1; } if is_x86_feature_detected!("avx512vpopcntdq") { - *l7ecx |= 1<<14; + *l7ecx |= 1 << 14; } if is_x86_feature_detected!("fma") { - *l1ecx |= 1<<12; + *l1ecx |= 1 << 12; } if is_x86_feature_detected!("bmi1") { - *l7ebx |= 1<<3; + *l7ebx |= 1 << 3; } if is_x86_feature_detected!("bmi2") { - *l7ebx |= 1<<8; + *l7ebx |= 1 << 8; } if is_x86_feature_detected!("popcnt") { - *l1ecx |= 1<<23; + *l1ecx |= 1 << 23; } if is_x86_feature_detected!("fxsr") { - *l1edx |= 1<<24; + *l1edx |= 1 << 24; } if is_x86_feature_detected!("xsave") { - *l1ecx |= 1<<26; + *l1ecx |= 1 << 26; } /* will be stable on 1.33.0 if is_x86_feature_detected!("cmpxchg16b") { @@ -155,14 +154,14 @@ pub(crate) fn features() -> Features { // Rust can't detect the MOVBE feature yet, but it's widely // available. - *l1ecx |= 1<<22; + *l1ecx |= 1 << 22; // This bit is reserved in the CPUID specification, but the // BoringSSL detection code uses it to represent that this // is an Intel CPU. 
However, this bit is only used in // conjunction with the AVX bit to test for presence of // AVX, thus serving no purpose. Always set it. - *l1edx |= 1<<30; + *l1edx |= 1 << 30; // Features that don't map to leaf 1 or leaf 7: // Leaf 0xd: @@ -175,7 +174,6 @@ pub(crate) fn features() -> Features { // * lzcnt // * tbm } - #[cfg(not(target_env = "sgx"))] { @@ -189,16 +187,11 @@ pub(crate) fn features() -> Features { } #[cfg(all( - any(target_os = "android", target_os = "linux"), - any(target_arch = "aarch64", target_arch = "arm") + any(target_arch = "aarch64", target_arch = "arm"), + any(target_os = "android", target_os = "fuchsia", target_os = "linux") ))] { - arm::linux_setup(); - } - - #[cfg(all(target_os = "fuchsia", any(target_arch = "aarch64")))] - { - arm::fuchsia_setup(); + arm::setup(); } }); } @@ -211,7 +204,7 @@ pub(crate) mod arm { any(target_os = "android", target_os = "linux"), any(target_arch = "aarch64", target_arch = "arm") ))] - pub fn linux_setup() { + pub fn setup() { use libc::c_ulong; // XXX: The `libc` crate doesn't provide `libc::getauxval` consistently @@ -263,15 +256,15 @@ pub(crate) mod arm { features |= PMULL.mask; } if caps & HWCAP_SHA2 == HWCAP_SHA2 { - features |= 1 << 4; + features |= SHA256.mask; } unsafe { GFp_armcap_P = features }; } } - #[cfg(all(target_os = "fuchsia", any(target_arch = "aarch64")))] - pub fn fuchsia_setup() { + #[cfg(all(target_os = "fuchsia", target_arch = "aarch64"))] + pub fn setup() { type zx_status_t = i32; #[link(name = "zircon")] @@ -308,82 +301,139 @@ pub(crate) mod arm { } } - #[cfg(not(target_arch = "wasm32"))] + macro_rules! features { + { + $( + $name:ident { + mask: $mask:expr, + + /// Should we assume that the feature is always available + /// for aarch64-apple-* targets? The first AArch64 iOS + /// device used the Apple A7 chip. + // TODO: When we can use `if` in const expressions: + // ``` + // aarch64_apple: $aarch64_apple, + // ``` + aarch64_apple: true, + } + ),+ + , // trailing comma is required. + } => { + $( + #[allow(dead_code)] + pub(crate) const $name: Feature = Feature { + mask: $mask, + }; + )+ + + // TODO: When we can use `if` in const expressions, do this: + // ``` + // const ARMCAP_STATIC: u32 = 0 + // $( + // | ( if $aarch64_apple && + // cfg!(all(target_arch = "aarch64", + // target_vendor = "apple")) { + // $name.mask + // } else { + // 0 + // } + // ) + // )+; + // ``` + // + // TODO: Add static feature detection to other targets. + // TODO: Combine static feature detection with runtime feature + // detection. 
+ #[cfg(all(target_arch = "aarch64", target_vendor = "apple"))] + const ARMCAP_STATIC: u32 = 0 + $( | $name.mask + )+; + #[cfg(not(all(target_arch = "aarch64", target_vendor = "apple")))] + const ARMCAP_STATIC: u32 = 0; + + #[cfg(all(target_arch = "aarch64", target_vendor = "apple"))] + #[test] + fn test_armcap_static_available() { + let features = crate::cpu::features(); + $( + assert!($name.available(features)); + )+ + } + } + } + + #[allow(dead_code)] pub(crate) struct Feature { - #[cfg_attr( - any( - target_os = "ios", - not(any(target_arch = "arm", target_arch = "aarch64")) - ), - allow(dead_code) - )] mask: u32, - - #[cfg_attr(not(target_os = "ios"), allow(dead_code))] - ios: bool, } - #[cfg(not(target_arch = "wasm32"))] impl Feature { + #[allow(dead_code)] #[inline(always)] pub fn available(&self, _: super::Features) -> bool { - #[cfg(all(target_os = "ios", any(target_arch = "arm", target_arch = "aarch64")))] - { - return self.ios; + if self.mask == self.mask & ARMCAP_STATIC { + return true; } #[cfg(all( - any(target_os = "android", target_os = "linux", target_os = "fuchsia"), + any(target_os = "android", target_os = "fuchsia", target_os = "linux"), any(target_arch = "arm", target_arch = "aarch64") ))] { - return self.mask == self.mask & unsafe { GFp_armcap_P }; + if self.mask == self.mask & unsafe { GFp_armcap_P } { + return true; + } } - #[cfg(not(any(target_arch = "arm", target_arch = "aarch64")))] - { - return false; - } + false } } - // Keep in sync with `ARMV7_NEON`. - #[cfg(any(target_arch = "aarch64", target_arch = "arm"))] - pub(crate) const NEON: Feature = Feature { - mask: 1 << 0, - ios: true, - }; - - // Keep in sync with `ARMV8_AES`. - #[cfg(any( - target_arch = "aarch64", - target_arch = "arm", - target_arch = "x86", - target_arch = "x86_64" - ))] - pub(crate) const AES: Feature = Feature { - mask: 1 << 2, - ios: true, - }; + features! { + // Keep in sync with `ARMV7_NEON`. + NEON { + mask: 1 << 0, + aarch64_apple: true, + }, + + // Keep in sync with `ARMV8_AES`. + AES { + mask: 1 << 2, + aarch64_apple: true, + }, + + // Keep in sync with `ARMV8_SHA256`. + SHA256 { + mask: 1 << 4, + aarch64_apple: true, + }, + + // Keep in sync with `ARMV8_PMULL`. + PMULL { + mask: 1 << 5, + aarch64_apple: true, + }, + } - // Keep in sync with `ARMV8_PMULL`. - #[cfg(any( - target_arch = "aarch64", - target_arch = "arm", - target_arch = "x86", - target_arch = "x86_64" - ))] - pub(crate) const PMULL: Feature = Feature { - mask: 1 << 5, - ios: true, - }; + // Some non-Rust code still checks this even when it is statically known + // the given feature is available, so we have to ensure that this is + // initialized properly. Keep this in sync with the initialization in + // BoringSSL's crypto.c. + // + // TODO: This should have "hidden" visibility but we don't have a way of + // controlling that yet: https://github.com/rust-lang/rust/issues/73958. 
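`ARMCAP_STATIC` folds statically known target features into a compile-time mask so the runtime check can short-circuit. The TODO above about `if` in const expressions is resolvable on Rust 1.46+; a sketch of that future shape (masks and detection are illustrative placeholders):

```rust
const NEON_MASK: u32 = 1 << 0;

// Compile-time mask: NEON is always present on aarch64-apple-* targets.
// (`if` in const items is stable since Rust 1.46, above this crate's MSRV.)
const ARMCAP_STATIC: u32 =
    if cfg!(all(target_arch = "aarch64", target_vendor = "apple")) {
        NEON_MASK
    } else {
        0
    };

// Placeholder for the runtime detection that populates `GFp_armcap_P`.
fn armcap_dynamic() -> u32 {
    0
}

// A feature is available if its bits are covered statically or dynamically.
fn available(mask: u32) -> bool {
    mask == mask & ARMCAP_STATIC || mask == mask & armcap_dynamic()
}

fn main() {
    let _ = available(NEON_MASK);
}
```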
+ #[cfg(any(target_arch = "arm", target_arch = "aarch64"))] + #[no_mangle] + static mut GFp_armcap_P: u32 = ARMCAP_STATIC; #[cfg(all( - any(target_os = "android", target_os = "linux", target_os = "fuchsia"), - any(target_arch = "arm", target_arch = "aarch64") + any(target_arch = "arm", target_arch = "aarch64"), + target_vendor = "apple" ))] - extern "C" { - static mut GFp_armcap_P: u32; + #[test] + fn test_armcap_static_matches_armcap_dynamic() { + assert_eq!(ARMCAP_STATIC, 1 | 4 | 16 | 32); + assert_eq!(ARMCAP_STATIC, unsafe { GFp_armcap_P }); } } @@ -398,6 +448,7 @@ pub(crate) mod intel { } impl Feature { + #[allow(clippy::needless_return)] #[inline(always)] pub fn available(&self, _: super::Features) -> bool { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] diff --git a/src/ec/curve25519/ed25519/signing.rs b/src/ec/curve25519/ed25519/signing.rs index b89e0db66a..3b522e8191 100644 --- a/src/ec/curve25519/ed25519/signing.rs +++ b/src/ec/curve25519/ed25519/signing.rs @@ -22,7 +22,6 @@ use crate::{ signature::{self, KeyPair as SigningKeyPair}, }; use core::convert::TryInto; -use untrusted; /// An Ed25519 key pair, for signing. pub struct Ed25519KeyPair { @@ -43,10 +42,11 @@ impl Ed25519KeyPair { /// PKCS#8 document. /// /// The PKCS#8 document will be a v2 `OneAsymmetricKey` with the public key, - /// as described in [RFC 5958 Section 2]. See also - /// https://tools.ietf.org/html/draft-ietf-curdle-pkix-04. + /// as described in [RFC 5958 Section 2]; see [RFC 8410 Section 10.3] for an + /// example. /// /// [RFC 5958 Section 2]: https://tools.ietf.org/html/rfc5958#section-2 + /// [RFC 8410 Section 10.3]: https://tools.ietf.org/html/rfc8410#section-10.3 pub fn generate_pkcs8( rng: &dyn rand::SecureRandom, ) -> Result { diff --git a/src/ec/curve25519/ed25519/verification.rs b/src/ec/curve25519/ed25519/verification.rs index 6d082e093f..e0c1b652fa 100644 --- a/src/ec/curve25519/ed25519/verification.rs +++ b/src/ec/curve25519/ed25519/verification.rs @@ -17,7 +17,6 @@ use super::{super::ops::*, eddsa_digest}; use crate::{error, sealed, signature}; use core::convert::TryInto; -use untrusted; /// Parameters for EdDSA signing and verification. pub struct EdDSAParameters; diff --git a/src/ec/curve25519/x25519.rs b/src/ec/curve25519/x25519.rs index 44ee17f4a9..53a2a5cf84 100644 --- a/src/ec/curve25519/x25519.rs +++ b/src/ec/curve25519/x25519.rs @@ -17,7 +17,6 @@ use super::{ops, scalar::SCALAR_LEN}; use crate::{agreement, constant_time, cpu, ec, error, rand}; use core::convert::TryInto; -use untrusted; static CURVE25519: ec::Curve = ec::Curve { public_key_len: PUBLIC_KEY_LEN, diff --git a/src/ec/suite_b.rs b/src/ec/suite_b.rs index caa5b3f1df..9e363563b8 100644 --- a/src/ec/suite_b.rs +++ b/src/ec/suite_b.rs @@ -16,7 +16,6 @@ use self::ops::*; use crate::{arithmetic::montgomery::*, cpu, ec, error, io::der, limb::LimbMask, pkcs8}; -use untrusted; // NIST SP 800-56A Step 3: "If q is an odd prime p, verify that // yQ**2 = xQ**3 + axQ + b in GF(p), where the arithmetic is performed modulo diff --git a/src/ec/suite_b/curve.rs b/src/ec/suite_b/curve.rs index 0788e10712..e0ff4f4617 100644 --- a/src/ec/suite_b/curve.rs +++ b/src/ec/suite_b/curve.rs @@ -31,7 +31,7 @@ macro_rules! 
suite_b_curve { /// [NIST Special Publication 800-56A, revision 2]: /// http://nvlpubs.nist.gov/nistpubs/SpecialPublications/NIST.SP.800-56Ar2.pdf /// [Suite B Implementer's Guide to NIST SP 800-56A]: - /// https://github.com/briansmith/ring/blob/master/doc/ecdh.pdf + /// https://github.com/briansmith/ring/blob/main/doc/ecdh.pdf pub static $NAME: ec::Curve = ec::Curve { public_key_len: 1 + (2 * (($bits + 7) / 8)), elem_scalar_seed_len: ($bits + 7) / 8, diff --git a/src/ec/suite_b/ecdh.rs b/src/ec/suite_b/ecdh.rs index f8680ccad6..aae31d2369 100644 --- a/src/ec/suite_b/ecdh.rs +++ b/src/ec/suite_b/ecdh.rs @@ -16,7 +16,6 @@ use super::{ops::*, private_key::*, public_key::*}; use crate::{agreement, ec, error}; -use untrusted; /// A key agreement algorithm. macro_rules! ecdh { @@ -38,7 +37,7 @@ macro_rules! ecdh { /// [NIST Special Publication 800-56A, revision 2]: /// http://nvlpubs.nist.gov/nistpubs/SpecialPublications/NIST.SP.800-56Ar2.pdf /// [Suite B Implementer's Guide to NIST SP 800-56A]: - /// https://github.com/briansmith/ring/blob/master/doc/ecdh.pdf + /// https://github.com/briansmith/ring/blob/main/doc/ecdh.pdf pub static $NAME: agreement::Algorithm = agreement::Algorithm { curve: $curve, ecdh: $ecdh, diff --git a/src/ec/suite_b/ecdsa/digest_scalar.rs b/src/ec/suite_b/ecdsa/digest_scalar.rs index 1f885c1a92..133e7da5df 100644 --- a/src/ec/suite_b/ecdsa/digest_scalar.rs +++ b/src/ec/suite_b/ecdsa/digest_scalar.rs @@ -19,7 +19,6 @@ use crate::{ ec::suite_b::ops::*, limb::{self, LIMB_BYTES}, }; -use untrusted; /// Calculate the digest of `msg` using the digest algorithm `digest_alg`. Then /// convert the digest to a scalar in the range [0, n) as described in @@ -84,7 +83,6 @@ mod tests { limb::{self, LIMB_BYTES}, test, }; - use untrusted; #[test] fn test() { diff --git a/src/ec/suite_b/ecdsa/signing.rs b/src/ec/suite_b/ecdsa/signing.rs index ca2bb39b05..5422e06b16 100644 --- a/src/ec/suite_b/ecdsa/signing.rs +++ b/src/ec/suite_b/ecdsa/signing.rs @@ -26,8 +26,6 @@ use crate::{ io::der, limb, pkcs8, rand, sealed, signature, }; -use untrusted; - /// An ECDSA signing algorithm. pub struct EcdsaSigningAlgorithm { curve: &'static ec::Curve, diff --git a/src/ec/suite_b/ecdsa/verification.rs b/src/ec/suite_b/ecdsa/verification.rs index 2b4ccbed69..be551e695d 100644 --- a/src/ec/suite_b/ecdsa/verification.rs +++ b/src/ec/suite_b/ecdsa/verification.rs @@ -23,7 +23,6 @@ use crate::{ io::der, limb, sealed, signature, }; -use untrusted; /// An ECDSA verification algorithm. 
pub struct EcdsaVerificationAlgorithm { diff --git a/src/ec/suite_b/ops.rs b/src/ec/suite_b/ops.rs index 6bcb8a4bb5..13d80c0d2e 100644 --- a/src/ec/suite_b/ops.rs +++ b/src/ec/suite_b/ops.rs @@ -14,7 +14,6 @@ use crate::{arithmetic::montgomery::*, c, error, limb::*}; use core::marker::PhantomData; -use untrusted; pub use self::elem::*; @@ -441,7 +440,6 @@ mod tests { use super::*; use crate::test; use alloc::{format, vec, vec::Vec}; - use untrusted; const ZERO_SCALAR: Scalar = Scalar { limbs: [0; MAX_LIMBS], @@ -491,11 +489,11 @@ mod tests { let b = consume_elem(cops, test_case, "b"); let expected_sum = consume_elem(cops, test_case, "r"); - let mut actual_sum = a.clone(); + let mut actual_sum = a; ops.public_key_ops.common.elem_add(&mut actual_sum, &b); assert_limbs_are_equal(cops, &actual_sum.limbs, &expected_sum.limbs); - let mut actual_sum = b.clone(); + let mut actual_sum = b; ops.public_key_ops.common.elem_add(&mut actual_sum, &a); assert_limbs_are_equal(cops, &actual_sum.limbs, &expected_sum.limbs); @@ -692,10 +690,10 @@ mod tests { test::run(test_file, |section, test_case| { assert_eq!(section, ""); let cops = ops.common; - let mut a = consume_scalar(cops, test_case, "a"); + let a = consume_scalar(cops, test_case, "a"); let b = consume_scalar_mont(cops, test_case, "b"); let expected_result = consume_scalar(cops, test_case, "r"); - let actual_result = ops.scalar_product(&mut a, &b); + let actual_result = ops.scalar_product(&a, &b); assert_limbs_are_equal(cops, &actual_result.limbs, &expected_result.limbs); Ok(()) @@ -1072,7 +1070,7 @@ mod tests { p } - fn consume_point_elem(ops: &CommonOps, limbs_out: &mut [Limb], elems: &Vec<&str>, i: usize) { + fn consume_point_elem(ops: &CommonOps, limbs_out: &mut [Limb], elems: &[&str], i: usize) { let bytes = test::from_hex(elems[i]).unwrap(); let bytes = untrusted::Input::from(&bytes); let r: Elem = elem_parse_big_endian_fixed_consttime(ops, bytes).unwrap(); @@ -1087,7 +1085,7 @@ mod tests { } fn consume_point(ops: &PrivateKeyOps, test_case: &mut test::TestCase, name: &str) -> TestPoint { - fn consume_point_elem(ops: &CommonOps, elems: &Vec<&str>, i: usize) -> Elem { + fn consume_point_elem(ops: &CommonOps, elems: &[&str], i: usize) -> Elem { let bytes = test::from_hex(elems[i]).unwrap(); let bytes = untrusted::Input::from(&bytes); let unencoded: Elem = @@ -1178,84 +1176,6 @@ mod tests { } } -#[cfg(feature = "internal_benches")] -mod internal_benches { - use super::{Limb, MAX_LIMBS}; - - pub const LIMBS_1: [Limb; MAX_LIMBS] = limbs![1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]; - - pub const LIMBS_ALTERNATING_10: [Limb; MAX_LIMBS] = limbs![ - 0b10101010_10101010_10101010_10101010, - 0b10101010_10101010_10101010_10101010, - 0b10101010_10101010_10101010_10101010, - 0b10101010_10101010_10101010_10101010, - 0b10101010_10101010_10101010_10101010, - 0b10101010_10101010_10101010_10101010, - 0b10101010_10101010_10101010_10101010, - 0b10101010_10101010_10101010_10101010, - 0b10101010_10101010_10101010_10101010, - 0b10101010_10101010_10101010_10101010, - 0b10101010_10101010_10101010_10101010, - 0b10101010_10101010_10101010_10101010 - ]; -} - -#[cfg(feature = "internal_benches")] -macro_rules! bench_curve { - ( $vectors:expr ) => { - use super::super::{Elem, Scalar}; - extern crate test; - - #[bench] - fn elem_inverse_squared_bench(bench: &mut test::Bencher) { - // This benchmark assumes that `elem_inverse_squared()` is - // constant-time so inverting 1 mod q is as good of a choice as - // anything. 
- let mut a = Elem::zero(); - a.limbs[0] = 1; - bench.iter(|| { - let _ = PRIVATE_KEY_OPS.elem_inverse_squared(&a); - }); - } - - #[bench] - fn elem_product_bench(bench: &mut test::Bencher) { - // This benchmark assumes that the multiplication is constant-time - // so 0 * 0 is as good of a choice as anything. - let a: Elem = Elem::zero(); - let b: Elem = Elem::zero(); - bench.iter(|| { - let _ = COMMON_OPS.elem_product(&a, &b); - }); - } - - #[bench] - fn elem_squared_bench(bench: &mut test::Bencher) { - // This benchmark assumes that the squaring is constant-time so - // 0**2 * 0 is as good of a choice as anything. - let a = Elem::zero(); - bench.iter(|| { - let _ = COMMON_OPS.elem_squared(&a); - }); - } - - #[bench] - fn scalar_inv_to_mont_bench(bench: &mut test::Bencher) { - const VECTORS: &[Scalar] = $vectors; - let vectors_len = VECTORS.len(); - let mut i = 0; - bench.iter(|| { - let _ = SCALAR_OPS.scalar_inv_to_mont(&VECTORS[i]); - - i += 1; - if i == vectors_len { - i = 0; - } - }); - } - }; -} - mod elem; pub mod p256; pub mod p384; diff --git a/src/ec/suite_b/ops/p256.rs b/src/ec/suite_b/ops/p256.rs index 69c89eb893..4c54f5c00e 100644 --- a/src/ec/suite_b/ops/p256.rs +++ b/src/ec/suite_b/ops/p256.rs @@ -380,36 +380,3 @@ extern "C" { rep: Limb, ); } - -#[cfg(feature = "internal_benches")] -mod internal_benches { - use super::{super::internal_benches::*, *}; - - bench_curve!(&[ - Scalar { - limbs: LIMBS_1, - m: PhantomData, - encoding: PhantomData, - }, - Scalar { - limbs: LIMBS_ALTERNATING_10, - m: PhantomData, - encoding: PhantomData, - }, - Scalar { - // n - 1 - limbs: p256_limbs![ - 0xfc632551 - 1, - 0xf3b9cac2, - 0xa7179e84, - 0xbce6faad, - 0xffffffff, - 0xffffffff, - 0x00000000, - 0xffffffff - ], - m: PhantomData, - encoding: PhantomData, - }, - ]); -} diff --git a/src/ec/suite_b/ops/p384.rs b/src/ec/suite_b/ops/p384.rs index 4b2ecb8300..7ecba1f18f 100644 --- a/src/ec/suite_b/ops/p384.rs +++ b/src/ec/suite_b/ops/p384.rs @@ -368,40 +368,3 @@ extern "C" { b: *const Limb, // [COMMON_OPS.num_limbs] ); } - -#[cfg(feature = "internal_benches")] -mod internal_benches { - use super::{super::internal_benches::*, *}; - - bench_curve!(&[ - Scalar { - limbs: LIMBS_1, - encoding: PhantomData, - m: PhantomData - }, - Scalar { - limbs: LIMBS_ALTERNATING_10, - encoding: PhantomData, - m: PhantomData - }, - Scalar { - // n - 1 - limbs: p384_limbs![ - 0xccc52973 - 1, - 0xecec196a, - 0x48b0a77a, - 0x581a0db2, - 0xf4372ddf, - 0xc7634d81, - 0xffffffff, - 0xffffffff, - 0xffffffff, - 0xffffffff, - 0xffffffff, - 0xffffffff - ], - encoding: PhantomData, - m: PhantomData, - }, - ]); -} diff --git a/src/ec/suite_b/private_key.rs b/src/ec/suite_b/private_key.rs index fb16b245e9..31c35664f6 100644 --- a/src/ec/suite_b/private_key.rs +++ b/src/ec/suite_b/private_key.rs @@ -22,7 +22,6 @@ use crate::{ limb::{self, LIMB_BYTES}, rand, }; -use untrusted; /// Generates a random scalar in the range [1, n). pub fn random_scalar( diff --git a/src/ec/suite_b/public_key.rs b/src/ec/suite_b/public_key.rs index 4521af339e..5bafa36039 100644 --- a/src/ec/suite_b/public_key.rs +++ b/src/ec/suite_b/public_key.rs @@ -17,7 +17,6 @@ use super::{ops::*, verify_affine_point_is_on_the_curve}; use crate::{arithmetic::montgomery::*, error}; -use untrusted; /// Parses a public key encoded in uncompressed form. 
The key is validated /// using the ECC Partial Public-Key Validation Routine from @@ -69,7 +68,6 @@ pub fn parse_uncompressed_point( mod tests { use super::{super::ops, *}; use crate::test; - use untrusted; #[test] fn parse_uncompressed_point_test() { diff --git a/src/endian.rs b/src/endian.rs index 77ecd91e2d..962397c561 100644 --- a/src/endian.rs +++ b/src/endian.rs @@ -2,7 +2,12 @@ use core::{convert::TryInto, num::Wrapping}; /// An `Encoding` of a type `T` can be converted to/from its byte /// representation without any byte swapping or other computation. -pub trait Encoding: From + Into { +/// +/// The `Self: Copy` constraint addresses `clippy::declare_interior_mutable_const`. +pub trait Encoding: From + Into +where + Self: Copy, +{ const ZERO: Self; } @@ -19,6 +24,12 @@ pub trait ArrayEncoding { fn as_byte_array(&self) -> &T; } +/// Work around the inability to implement `from` for arrays of `Encoding`s +/// due to the coherence rules. +pub trait FromByteArray { + fn from_byte_array(a: &T) -> Self; +} + macro_rules! define_endian { ($endian:ident) => { #[repr(transparent)] @@ -44,15 +55,29 @@ macro_rules! define_endian { }; } -macro_rules! impl_as_ref { +macro_rules! impl_from_byte_array { + ($endian:ident, $base:ident, $elems:expr) => { + impl FromByteArray<[u8; $elems * core::mem::size_of::<$base>()]> + for [$endian<$base>; $elems] + { + fn from_byte_array(a: &[u8; $elems * core::mem::size_of::<$base>()]) -> Self { + unsafe { core::mem::transmute_copy(a) } + } + } + }; +} + +macro_rules! impl_array_encoding { ($endian:ident, $base:ident, $elems:expr) => { impl ArrayEncoding<[u8; $elems * core::mem::size_of::<$base>()]> for [$endian<$base>; $elems] { - fn as_byte_array<'a>(&'a self) -> &'a [u8; $elems * core::mem::size_of::<$base>()] { + fn as_byte_array(&self) -> &[u8; $elems * core::mem::size_of::<$base>()] { as_byte_slice(self).try_into().unwrap() } } + + impl_from_byte_array!($endian, $base, $elems); }; } @@ -97,10 +122,11 @@ macro_rules! impl_endian { } } - impl_as_ref!($endian, $base, 1); - impl_as_ref!($endian, $base, 2); - impl_as_ref!($endian, $base, 3); - impl_as_ref!($endian, $base, 4); + impl_array_encoding!($endian, $base, 1); + impl_array_encoding!($endian, $base, 2); + impl_array_encoding!($endian, $base, 3); + impl_array_encoding!($endian, $base, 4); + impl_from_byte_array!($endian, $base, 8); }; } diff --git a/src/error.rs b/src/error.rs index 65d3df0489..23e2ab32a9 100644 --- a/src/error.rs +++ b/src/error.rs @@ -14,8 +14,6 @@ //! Error reporting. -use untrusted; - #[cfg(feature = "std")] extern crate std; @@ -78,15 +76,30 @@ extern crate std; #[derive(Clone, Copy, Debug, PartialEq)] pub struct Unspecified; +impl Unspecified { + fn description_() -> &'static str { + "ring::error::Unspecified" + } +} + // This is required for the implementation of `std::error::Error`. impl core::fmt::Display for Unspecified { fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { - f.write_str("ring::error::Unspecified") + f.write_str(Self::description_()) } } #[cfg(feature = "std")] -impl std::error::Error for Unspecified {} +impl std::error::Error for Unspecified { + #[inline] + fn cause(&self) -> Option<&dyn std::error::Error> { + None + } + + fn description(&self) -> &str { + Self::description_() + } +} impl From for Unspecified { fn from(_: untrusted::EndOfInput) -> Self { @@ -102,10 +115,10 @@ impl From for Unspecified { /// An error parsing or validating a key. 
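The new `FromByteArray` trait above exists because coherence rules prevent a blanket `From<[u8; N]>` for arrays of `Encoding` wrappers; the macro implements it with `transmute_copy`. A safe sketch of the same conversion, copying 4-byte chunks into a `#[repr(transparent)]` wrapper (layout-equivalent to the transmute):

```rust
use std::convert::TryInto;

#[derive(Clone, Copy, Debug, PartialEq)]
#[repr(transparent)]
struct LittleEndian([u8; 4]); // stores the little-endian byte encoding

fn from_byte_array(a: &[u8; 8]) -> [LittleEndian; 2] {
    let mut out = [LittleEndian([0; 4]); 2];
    for (word, chunk) in out.iter_mut().zip(a.chunks_exact(4)) {
        // No byte swapping or other computation, matching the `Encoding`
        // contract; the wrapper just records the encoding in its type.
        *word = LittleEndian(chunk.try_into().unwrap());
    }
    out
}

fn main() {
    let words = from_byte_array(&[1, 0, 0, 0, 2, 0, 0, 0]);
    assert_eq!(words, [LittleEndian([1, 0, 0, 0]), LittleEndian([2, 0, 0, 0])]);
}
```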
/// -/// The `Display` implementation will return a string that will help you better -/// understand why a key was rejected change which errors are reported in which -/// situations while minimizing the likelihood that any applications will be -/// broken. +/// The `Display` implementation and `::description()` +/// will return a string that will help you better understand why a key was +/// rejected change which errors are reported in which situations while +/// minimizing the likelihood that any applications will be broken. /// /// Here is an incomplete list of reasons a key may be unsupported: /// @@ -134,6 +147,11 @@ impl From for Unspecified { pub struct KeyRejected(&'static str); impl KeyRejected { + /// The value returned from ::description() + pub fn description_(&self) -> &'static str { + self.0 + } + pub(crate) fn inconsistent_components() -> Self { KeyRejected("InconsistentComponents") } @@ -185,11 +203,19 @@ impl KeyRejected { } #[cfg(feature = "std")] -impl std::error::Error for KeyRejected {} +impl std::error::Error for KeyRejected { + fn cause(&self) -> Option<&dyn std::error::Error> { + None + } + + fn description(&self) -> &str { + self.description_() + } +} impl core::fmt::Display for KeyRejected { fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { - f.write_str(self.0) + f.write_str(self.description_()) } } diff --git a/src/hmac.rs b/src/hmac.rs index 1329058f66..3e2d7e7b0f 100644 --- a/src/hmac.rs +++ b/src/hmac.rs @@ -105,9 +105,9 @@ //! //! [RFC 2104]: https://tools.ietf.org/html/rfc2104 //! [code for `ring::pbkdf2`]: -//! https://github.com/briansmith/ring/blob/master/src/pbkdf2.rs +//! https://github.com/briansmith/ring/blob/main/src/pbkdf2.rs //! [code for `ring::hkdf`]: -//! https://github.com/briansmith/ring/blob/master/src/hkdf.rs +//! https://github.com/briansmith/ring/blob/main/src/hkdf.rs use crate::{constant_time, digest, error, hkdf, rand}; @@ -182,7 +182,9 @@ impl Key { /// random value generated from `rng`. /// /// The key will be `digest_alg.output_len` bytes long, based on the - /// recommendation in https://tools.ietf.org/html/rfc2104#section-3. + /// recommendation in [RFC 2104 Section 3]. + /// + /// [RFC 2104 Section 3]: https://tools.ietf.org/html/rfc2104#section-3 pub fn generate( algorithm: Algorithm, rng: &dyn rand::SecureRandom, @@ -363,7 +365,7 @@ mod tests { // completely wacky. #[test] pub fn hmac_signing_key_coverage() { - let mut rng = rand::SystemRandom::new(); + let rng = rand::SystemRandom::new(); const HELLO_WORLD_GOOD: &[u8] = b"hello, world"; const HELLO_WORLD_BAD: &[u8] = b"hello, worle"; @@ -374,7 +376,7 @@ mod tests { hmac::HMAC_SHA384, hmac::HMAC_SHA512, ] { - let key = hmac::Key::generate(*algorithm, &mut rng).unwrap(); + let key = hmac::Key::generate(*algorithm, &rng).unwrap(); let tag = hmac::sign(&key, HELLO_WORLD_GOOD); assert!(hmac::verify(&key, HELLO_WORLD_GOOD, tag.as_ref()).is_ok()); assert!(hmac::verify(&key, HELLO_WORLD_BAD, tag.as_ref()).is_err()) diff --git a/src/io/der.rs b/src/io/der.rs index 325d6f0d8b..1a00d85999 100644 --- a/src/io/der.rs +++ b/src/io/der.rs @@ -18,7 +18,6 @@ use super::Positive; use crate::error; -use untrusted; pub const CONSTRUCTED: u8 = 1 << 5; pub const CONTEXT_SPECIFIC: u8 = 2 << 6; @@ -212,7 +211,6 @@ pub fn positive_integer<'a>( mod tests { use super::*; use crate::error; - use untrusted; fn with_good_i(value: &[u8], f: F) where diff --git a/src/lib.rs b/src/lib.rs index 486c0203f6..0e2be35ff3 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -15,8 +15,6 @@ //! 
Safe, fast, small crypto using Rust with BoringSSL's cryptography //! primitives. //! -//! git clone https://github.com/briansmith/ring -//! //! # Feature Flags //! //! @@ -33,22 +31,36 @@ //! fallbacks will not occur. See the documentation for //! rand::SystemRandom for more details. //!
std -//! Enable features that use libstd, in particular `std::error::Error` -//! integration. +//! Enable features that use libstd, in particular +//! <code>std::error::Error</code> integration. //!
wasm32_c //! Enables features that require a C compiler on wasm32 targets, such as //! the <code>constant_time</code> module, HMAC verification, and PBKDF2 //! verification. Without this feature, only a subset of functionality //! is provided to wasm32 targets so that a C compiler isn't needed. A //! typical invocation would be: -//! <code>TARGET_AR=llvm-ar cargo test --target=wasm32-unknown-unknown --features=wasm32_c</code> -//! with <code>llvm-ar</code> and <code>clang</code> in <code>$PATH</code>. +//! <code>TARGET_CC=clang-10 TARGET_AR=llvm-ar-10 cargo test --target=wasm32-unknown-unknown --features=wasm32_c</code> +//! with <code>llvm-ar-10</code> and <code>clang-10</code> in <code>$PATH</code>. //! (Going forward more functionality should be enabled by default, without //! requiring these hacks, and without requiring a C compiler.) //!
#![doc(html_root_url = "https://briansmith.org/rustdoc/")] #![allow( + clippy::collapsible_if, + clippy::identity_op, + clippy::len_without_is_empty, + clippy::len_zero, + clippy::let_unit_value, + clippy::many_single_char_names, + clippy::needless_range_loop, + clippy::new_without_default, + clippy::neg_cmp_op_on_partial_ord, + clippy::range_plus_one, + clippy::too_many_arguments, + clippy::trivially_copy_pass_by_ref, + clippy::type_complexity, + clippy::unreadable_literal, missing_copy_implementations, missing_debug_implementations, non_camel_case_types, @@ -57,24 +69,9 @@ )] // `#[derive(...)]` uses `trivial_numeric_casts` and `unused_qualifications` // internally. -#![deny( - missing_docs, - // unstable_features, // Used by `internal_benches` - unused_qualifications, - variant_size_differences, -)] -#![forbid( - anonymous_parameters, - trivial_casts, - trivial_numeric_casts, - unused_extern_crates, - unused_import_braces, - unused_results, - // warnings -)] +#![deny(missing_docs, unused_qualifications, variant_size_differences)] +#![forbid(unused_results)] #![no_std] -#![cfg_attr(feature = "internal_benches", allow(unstable_features), feature(test))] - #![cfg_attr(all(target_env = "sgx", feature = "sgx"), feature(stdsimd))] #[cfg(feature = "alloc")] diff --git a/src/limb.rs b/src/limb.rs index 65fdae5ca7..64fb536c8c 100644 --- a/src/limb.rs +++ b/src/limb.rs @@ -19,7 +19,6 @@ //! limbs use the native endianness. use crate::{c, error}; -use untrusted; #[cfg(feature = "alloc")] use crate::bits; @@ -350,7 +349,6 @@ extern "C" { #[cfg(test)] mod tests { use super::*; - use untrusted; const MAX: Limb = LimbMask::True as Limb; diff --git a/src/pbkdf2.rs b/src/pbkdf2.rs index 4f957f453d..c2bd33e3f0 100644 --- a/src/pbkdf2.rs +++ b/src/pbkdf2.rs @@ -122,7 +122,7 @@ pub struct Algorithm(hmac::Algorithm); /// PBKDF2 using HMAC-SHA1. pub static PBKDF2_HMAC_SHA1: Algorithm = Algorithm(hmac::HMAC_SHA1_FOR_LEGACY_USE_ONLY); -/// PBKDF2 using HMAC-h. +/// PBKDF2 using HMAC-SHA256. pub static PBKDF2_HMAC_SHA256: Algorithm = Algorithm(hmac::HMAC_SHA256); /// PBKDF2 using HMAC-SHA384. diff --git a/src/pkcs8.rs b/src/pkcs8.rs index 53bec5ae81..c3ca49918a 100644 --- a/src/pkcs8.rs +++ b/src/pkcs8.rs @@ -17,7 +17,6 @@ //! [RFC 5958]: https://tools.ietf.org/html/rfc5958. 
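Besides the lint reshuffle, the hunk above fixes the `PBKDF2_HMAC_SHA256` doc comment (it previously read "HMAC-h"). For context, a usage sketch of that algorithm through the public `pbkdf2` module (salt, password, and iteration count are illustrative only):

```rust
use ring::pbkdf2;
use std::num::NonZeroU32;

fn main() {
    let iterations = NonZeroU32::new(100_000).unwrap();
    let salt = b"illustrative salt";
    let password = b"illustrative password";

    // Derive a key the same length as the underlying digest output.
    let mut derived = [0u8; ring::digest::SHA256_OUTPUT_LEN];
    pbkdf2::derive(pbkdf2::PBKDF2_HMAC_SHA256, iterations, salt, password, &mut derived);

    // Verification re-derives and compares in constant time.
    assert!(
        pbkdf2::verify(pbkdf2::PBKDF2_HMAC_SHA256, iterations, salt, password, &derived).is_ok()
    );
}
```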
use crate::{ec, error, io::der}; -use untrusted; pub(crate) enum Version { V1Only, diff --git a/src/rand.rs b/src/rand.rs index d69a7a6c40..1af1571128 100644 --- a/src/rand.rs +++ b/src/rand.rs @@ -180,10 +180,12 @@ use self::sysrand::fill as fill_impl; use self::sysrand_or_urandom::fill as fill_impl; #[cfg(any( + target_os = "dragonfly", target_os = "freebsd", + target_os = "illumos", target_os = "netbsd", target_os = "openbsd", - target_os = "solaris" + target_os = "solaris", ))] use self::urandom::fill as fill_impl; @@ -354,10 +356,12 @@ mod sysrand_or_urandom { any(target_os = "android", target_os = "linux"), feature = "dev_urandom_fallback" ), + target_os = "dragonfly", target_os = "freebsd", target_os = "netbsd", target_os = "openbsd", - target_os = "solaris" + target_os = "solaris", + target_os = "illumos" ))] mod urandom { use crate::error; @@ -444,7 +448,7 @@ mod rdrandom { let mut buf = [0u8; 8]; match Result::from(unsafe { CRYPTO_rdrand(&mut buf) }) { Ok(()) => return Ok(buf), - Err(_) => continue + Err(_) => continue, } } Err(error::Unspecified) diff --git a/src/rsa.rs b/src/rsa.rs index 04a3bf87bd..9adb3285dd 100644 --- a/src/rsa.rs +++ b/src/rsa.rs @@ -24,7 +24,6 @@ use crate::{ io::{self, der}, limb, }; -use untrusted; mod padding; diff --git a/src/rsa/padding.rs b/src/rsa/padding.rs index 3ba00e10f0..f6b4cf6c74 100644 --- a/src/rsa/padding.rs +++ b/src/rsa/padding.rs @@ -14,7 +14,6 @@ use super::PUBLIC_KEY_PUBLIC_MODULUS_MAX_LEN; use crate::{bits, digest, error, io::der}; -use untrusted; #[cfg(feature = "alloc")] use crate::rand; @@ -100,7 +99,7 @@ impl Verification for PKCS1 { let mut calculated = [0u8; PUBLIC_KEY_PUBLIC_MODULUS_MAX_LEN]; let calculated = &mut calculated[..mod_bits.as_usize_bytes_rounded_up()]; pkcs1_encode(&self, m_hash, calculated); - if m.read_bytes_to_end() != *calculated.as_ref() { + if m.read_bytes_to_end() != *calculated { return Err(error::Unspecified); } Ok(()) @@ -279,7 +278,7 @@ impl RsaEncoding for PSS { { // Steps 7. - let masked_db = masked_db.into_iter(); + let masked_db = masked_db.iter_mut(); // `PS` is all zero bytes, so skipping `ps_len` bytes is equivalent // to XORing `PS` onto `db`. let mut masked_db = masked_db.skip(metrics.ps_len); @@ -522,7 +521,6 @@ mod test { use super::*; use crate::{digest, error, test}; use alloc::vec; - use untrusted; #[test] fn test_pss_padding_verify() { diff --git a/src/rsa/signing.rs b/src/rsa/signing.rs index 9ae904e4b2..52d857d302 100644 --- a/src/rsa/signing.rs +++ b/src/rsa/signing.rs @@ -25,7 +25,6 @@ use crate::{ pkcs8, rand, signature, }; use alloc::boxed::Box; -use untrusted; /// An RSA key pair, used for signing. pub struct RsaKeyPair { @@ -621,8 +620,7 @@ mod tests { const MESSAGE: &[u8] = b"hello, world"; let rng = rand::SystemRandom::new(); - const PRIVATE_KEY_DER: &'static [u8] = - include_bytes!("signature_rsa_example_private_key.der"); + const PRIVATE_KEY_DER: &[u8] = include_bytes!("signature_rsa_example_private_key.der"); let key_pair = signature::RsaKeyPair::from_der(PRIVATE_KEY_DER).unwrap(); // The output buffer is one byte too short. diff --git a/src/rsa/verification.rs b/src/rsa/verification.rs index cedf64f783..f898f211a6 100644 --- a/src/rsa/verification.rs +++ b/src/rsa/verification.rs @@ -22,8 +22,6 @@ use crate::{ sealed, signature, }; -use untrusted; - #[derive(Debug)] pub struct Key { pub n: bigint::Modulus, diff --git a/src/signature.rs b/src/signature.rs index e325dc0a7a..bef92dc4b8 100644 --- a/src/signature.rs +++ b/src/signature.rs @@ -111,7 +111,7 @@ //! 
[NIST Special Publication 800-56A, revision 2]: //! http://nvlpubs.nist.gov/nistpubs/SpecialPublications/NIST.SP.800-56Ar2.pdf //! [Suite B implementer's guide to FIPS 186-3]: -//! https://github.com/briansmith/ring/blob/master/doc/ecdsa.pdf +//! https://github.com/briansmith/ring/blob/main/doc/ecdsa.pdf //! [RFC 3279 Section 2.2.3]: //! https://tools.ietf.org/html/rfc3279#section-2.2.3 //! [RFC 3447 Section 8.2]: @@ -132,7 +132,7 @@ //! signature::{self, KeyPair}, //! }; //! -//! # fn sign_and_verify_ed25519() -> Result<(), ring::error::Unspecified> { +//! # fn main() -> Result<(), ring::error::Unspecified> { //! // Generate a key pair in PKCS#8 (v2) format. //! let rng = rand::SystemRandom::new(); //! let pkcs8_bytes = signature::Ed25519KeyPair::generate_pkcs8(&rng)?; @@ -160,8 +160,6 @@ //! //! # Ok(()) //! # } -//! -//! # fn main() { sign_and_verify_ed25519().unwrap() } //! ``` //! //! ## Signing and verifying with RSA (PKCS#1 1.5 padding) @@ -257,7 +255,6 @@ //! ``` use crate::{cpu, ec, error, sealed}; -use untrusted; pub use crate::ec::{ curve25519::ed25519::{ diff --git a/tests/aead_tests.rs b/tests/aead_tests.rs index 0fc7ff18e8..75e0e9e92d 100644 --- a/tests/aead_tests.rs +++ b/tests/aead_tests.rs @@ -13,23 +13,6 @@ // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. #![cfg(any(not(target_arch = "wasm32"), feature = "wasm32_c"))] -#![forbid( - anonymous_parameters, - box_pointers, - missing_copy_implementations, - missing_debug_implementations, - missing_docs, - trivial_casts, - trivial_numeric_casts, - unsafe_code, - unstable_features, - unused_extern_crates, - unused_import_braces, - unused_qualifications, - unused_results, - variant_size_differences, - warnings -)] #[cfg(target_arch = "wasm32")] use wasm_bindgen_test::{wasm_bindgen_test, wasm_bindgen_test_configure}; @@ -208,22 +191,20 @@ fn test_aead( ]; let mut more_comprehensive_in_prefix_lengths = [0; 4096]; - let in_prefix_lengths; - if cfg!(debug_assertions) { - in_prefix_lengths = &MINIMAL_IN_PREFIX_LENS[..]; + let in_prefix_lengths = if cfg!(debug_assertions) { + &MINIMAL_IN_PREFIX_LENS[..] } else { + #[allow(clippy::needless_range_loop)] for b in 0..more_comprehensive_in_prefix_lengths.len() { more_comprehensive_in_prefix_lengths[b] = b; } - in_prefix_lengths = &more_comprehensive_in_prefix_lengths[..]; - } + &more_comprehensive_in_prefix_lengths[..] + }; let mut o_in_out = vec![123u8; 4096]; - for in_prefix_len in in_prefix_lengths.iter() { + for &in_prefix_len in in_prefix_lengths.iter() { o_in_out.truncate(0); - for _ in 0..*in_prefix_len { - o_in_out.push(123); - } + o_in_out.resize(in_prefix_len, 123); o_in_out.extend_from_slice(&ct[..]); let nonce = aead::Nonce::try_assume_unique_for_key(&nonce_bytes).unwrap(); @@ -233,7 +214,7 @@ fn test_aead( nonce, aead::Aad::from(&aad[..]), &mut o_in_out, - *in_prefix_len.., + in_prefix_len.., ); match error { None => { @@ -300,6 +281,7 @@ fn open_with_less_safe_key<'a>( key.open_within(nonce, aad, in_out, ciphertext_and_tag) } +#[allow(clippy::range_plus_one)] fn test_aead_key_sizes(aead_alg: &'static aead::Algorithm) { let key_len = aead_alg.key_len(); let key_data = vec![0u8; key_len * 2]; @@ -327,6 +309,7 @@ fn test_aead_key_sizes(aead_alg: &'static aead::Algorithm) { } // Test that we reject non-standard nonce sizes. 
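The rewritten prefix-length loop above (see the `o_in_out.resize(..)` change) exercises `open_within` with a nonzero start index: the ciphertext sits after `in_prefix_len` junk bytes and the plaintext is shifted to the front of the buffer. A sketch of that pattern with the public API (key and nonce values are illustrative; the single nonce value is used once to seal and once to open the same message):

```rust
use ring::aead;

fn main() -> Result<(), ring::error::Unspecified> {
    let key = aead::UnboundKey::new(&aead::CHACHA20_POLY1305, &[0u8; 32])?;
    let key = aead::LessSafeKey::new(key);

    let mut in_out = b"hello, world".to_vec();
    let nonce = aead::Nonce::assume_unique_for_key([0u8; 12]);
    key.seal_in_place_append_tag(nonce, aead::Aad::empty(), &mut in_out)?;

    // Prepend junk bytes, as the test does, then open "within" the buffer;
    // the returned slice is the plaintext moved to the buffer's front.
    let prefix_len = 3;
    let mut buf = vec![123u8; prefix_len];
    buf.extend_from_slice(&in_out);
    let nonce = aead::Nonce::assume_unique_for_key([0u8; 12]);
    let plaintext = key.open_within(nonce, aead::Aad::empty(), &mut buf, prefix_len..)?;
    assert_eq!(plaintext, &b"hello, world"[..]);
    Ok(())
}
```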
+#[allow(clippy::range_plus_one)] #[test] fn test_aead_nonce_sizes() -> Result<(), error::Unspecified> { let nonce_len = aead::NONCE_LEN; @@ -350,6 +333,7 @@ fn test_aead_nonce_sizes() -> Result<(), error::Unspecified> { target_arch = "x86_64", target_arch = "x86" ))] +#[allow(clippy::range_plus_one)] #[test] fn aead_chacha20_poly1305_openssh() { // TODO: test_aead_key_sizes(...); @@ -380,7 +364,7 @@ fn aead_chacha20_poly1305_openssh() { let mut tag = [0u8; aead::chacha20_poly1305_openssh::TAG_LEN]; let mut s_in_out = plaintext.clone(); let s_key = aead::chacha20_poly1305_openssh::SealingKey::new(&key_bytes); - let () = s_key.seal_in_place(sequence_num, &mut s_in_out[..], &mut tag); + s_key.seal_in_place(sequence_num, &mut s_in_out[..], &mut tag); assert_eq!(&ct, &s_in_out); assert_eq!(&expected_tag, &tag); let o_key = aead::chacha20_poly1305_openssh::OpeningKey::new(&key_bytes); diff --git a/tests/agreement_tests.rs b/tests/agreement_tests.rs index 7e4152d139..4162015378 100644 --- a/tests/agreement_tests.rs +++ b/tests/agreement_tests.rs @@ -12,30 +12,12 @@ // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. -#![forbid( - anonymous_parameters, - box_pointers, - missing_copy_implementations, - missing_debug_implementations, - missing_docs, - trivial_casts, - trivial_numeric_casts, - unsafe_code, - unstable_features, - unused_extern_crates, - unused_import_braces, - unused_qualifications, - unused_results, - variant_size_differences, - warnings -)] - extern crate alloc; use ring::{agreement, error, rand, test, test_file}; #[test] -fn agreement_traits<'a>() { +fn agreement_traits() { use alloc::vec::Vec; let rng = rand::SystemRandom::new(); @@ -61,9 +43,9 @@ fn agreement_traits<'a>() { // TODO: Test the actual output. let _: &dyn core::fmt::Debug = &public_key; - test::compile_time_assert_clone::>(); - test::compile_time_assert_copy::>(); - test::compile_time_assert_sync::>(); + test::compile_time_assert_clone::>(); + test::compile_time_assert_copy::>(); + test::compile_time_assert_sync::>(); test::compile_time_assert_clone::>>(); test::compile_time_assert_sync::>>(); @@ -105,13 +87,12 @@ fn agreement_agree_ephemeral() { assert_eq!(my_private.algorithm(), alg); - assert!( + let result = agreement::agree_ephemeral(my_private, &peer_public, (), |key_material| { assert_eq!(key_material, &output[..]); Ok(()) - }) - .is_ok() - ); + }); + assert_eq!(result, Ok(())); } Some(_) => { @@ -134,7 +115,7 @@ fn agreement_agree_ephemeral() { } } - return Ok(()); + Ok(()) }); } diff --git a/tests/constant_time_tests.rs b/tests/constant_time_tests.rs index 37bcebd93b..422ab2c8b9 100644 --- a/tests/constant_time_tests.rs +++ b/tests/constant_time_tests.rs @@ -28,7 +28,7 @@ fn test_verify_slices_are_equal() { let initial: [u8; 256] = rand::generate(&rand::SystemRandom::new()).unwrap().expose(); { - let copy = initial.clone(); + let copy = initial; for len in 0..copy.len() { // Not equal because the lengths do not match. assert_eq!( @@ -50,7 +50,7 @@ fn test_verify_slices_are_equal() { for i in 0..initial.len() { for bit in 0..8 { - let mut copy = initial.clone(); + let mut copy = initial; copy[i] ^= 1u8 << bit; for len in 0..=initial.len() { @@ -67,7 +67,7 @@ fn test_verify_slices_are_equal() { // The flipped bit is outside of `b` so `a` and `b` are equal. Ok(()) }; - assert_eq!((&a == &b), expected_result.is_ok()); // Sanity check. + assert_eq!(a == b, expected_result.is_ok()); // Sanity check. 
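In `agreement_tests.rs` above, `assert!(... .is_ok())` becomes `let result = ...; assert_eq!(result, Ok(()));`. The equality form prints both sides on failure, so a failing run shows the actual `Err` value instead of a bare "assertion failed". A minimal sketch:

```rust
fn agree() -> Result<(), &'static str> {
    Ok(()) // pretend the key agreement succeeded
}

fn main() {
    let result = agree();
    // On failure this would print e.g. `left: Err("bad key"), right: Ok(())`
    // rather than only `assertion failed: result.is_ok()`.
    assert_eq!(result, Ok(()));
}
```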
assert_eq!( constant_time::verify_slices_are_equal(&a, &b), expected_result diff --git a/tests/digest_tests.rs b/tests/digest_tests.rs index 1b16bb66e0..c275de7054 100644 --- a/tests/digest_tests.rs +++ b/tests/digest_tests.rs @@ -12,24 +12,6 @@ // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. -#![forbid( - anonymous_parameters, - box_pointers, - missing_copy_implementations, - missing_debug_implementations, - missing_docs, - trivial_casts, - trivial_numeric_casts, - unsafe_code, - unstable_features, - unused_extern_crates, - unused_import_braces, - unused_qualifications, - unused_results, - variant_size_differences, - warnings -)] - use ring::{digest, test, test_file}; #[cfg(target_arch = "wasm32")] diff --git a/tests/ecdsa_tests.rs b/tests/ecdsa_tests.rs index d0d728d043..317fdbc938 100644 --- a/tests/ecdsa_tests.rs +++ b/tests/ecdsa_tests.rs @@ -12,24 +12,6 @@ // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. -#![forbid( - anonymous_parameters, - box_pointers, - missing_copy_implementations, - missing_debug_implementations, - missing_docs, - trivial_casts, - trivial_numeric_casts, - unsafe_code, - unstable_features, - unused_extern_crates, - unused_import_braces, - unused_qualifications, - unused_results, - variant_size_differences, - warnings -)] - use ring::{ rand, signature::{self, KeyPair}, @@ -86,7 +68,7 @@ fn ecdsa_from_pkcs8_test() { match ( signature::EcdsaKeyPair::from_pkcs8(this_asn1, &input), - error.clone(), + error, ) { (Ok(_), None) => (), (Err(e), None) => panic!("Failed with error \"{}\", but expected to succeed", e), @@ -209,9 +191,11 @@ fn ecdsa_test_public_key_coverage() { assert_eq!(key_pair.public_key().as_ref(), PUBLIC_KEY); // Test `Clone`. - { - let _ = key_pair.public_key().clone(); - } + #[allow(clippy::clone_on_copy, clippy::redundant_clone)] + let _: <signature::EcdsaKeyPair as KeyPair>::PublicKey = key_pair.public_key().clone(); + + // Test `Copy`. + let _: <signature::EcdsaKeyPair as KeyPair>::PublicKey = *key_pair.public_key(); // Test `Debug`. assert_eq!(PUBLIC_KEY_DEBUG, format!("{:?}", key_pair.public_key())); diff --git a/tests/ed25519_tests.rs b/tests/ed25519_tests.rs index 0289362162..059ecdb044 100644 --- a/tests/ed25519_tests.rs +++ b/tests/ed25519_tests.rs @@ -12,24 +12,6 @@ // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
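The `ecdsa_test_public_key_coverage` hunk above pins down `Clone` and `Copy` on the public-key type, and the agreement tests earlier call ring's `test::compile_time_assert_*` helpers for the same purpose. Such helpers are typically nothing more than empty generic functions whose bounds do the checking; a sketch with a hypothetical key type:

```rust
// Instantiating one of these with a concrete type forces the compiler to
// prove the trait bound; the call itself compiles to nothing.
fn compile_time_assert_clone<T: Clone>() {}
fn compile_time_assert_copy<T: Copy>() {}
fn compile_time_assert_sync<T: Sync>() {}

#[derive(Clone, Copy)]
struct PublicKey([u8; 32]); // hypothetical stand-in

fn main() {
    compile_time_assert_clone::<PublicKey>();
    compile_time_assert_copy::<PublicKey>();
    compile_time_assert_sync::<PublicKey>();
}
```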
-#![forbid( - anonymous_parameters, - box_pointers, - missing_copy_implementations, - missing_debug_implementations, - missing_docs, - trivial_casts, - trivial_numeric_casts, - unsafe_code, - unstable_features, - unused_extern_crates, - unused_import_braces, - unused_qualifications, - unused_results, - variant_size_differences, - warnings -)] - use ring::{ signature::{self, Ed25519KeyPair, KeyPair}, test, test_file, @@ -114,14 +96,11 @@ fn test_ed25519_from_pkcs8_unchecked() { let input = test_case.consume_bytes("Input"); let error = test_case.consume_optional_string("Error"); - match ( - Ed25519KeyPair::from_pkcs8_maybe_unchecked(&input), - error.clone(), - ) { + match (Ed25519KeyPair::from_pkcs8_maybe_unchecked(&input), error) { (Ok(_), None) => (), (Err(e), None) => panic!("Failed with error \"{}\", but expected to succeed", e), (Ok(_), Some(e)) => panic!("Succeeded, but expected error \"{}\"", e), - (Err(actual), Some(expected)) => assert_eq!(format!("{}", actual), expected), + (Err(actual), Some(expected)) => assert_eq!(actual.description_(), expected), }; Ok(()) @@ -139,11 +118,11 @@ fn test_ed25519_from_pkcs8() { let input = test_case.consume_bytes("Input"); let error = test_case.consume_optional_string("Error"); - match (Ed25519KeyPair::from_pkcs8(&input), error.clone()) { + match (Ed25519KeyPair::from_pkcs8(&input), error) { (Ok(_), None) => (), (Err(e), None) => panic!("Failed with error \"{}\", but expected to succeed", e), (Ok(_), Some(e)) => panic!("Succeeded, but expected error \"{}\"", e), - (Err(actual), Some(expected)) => assert_eq!(format!("{}", actual), expected), + (Err(actual), Some(expected)) => assert_eq!(actual.description_(), expected), }; Ok(()) @@ -155,7 +134,7 @@ fn test_ed25519_from_pkcs8() { fn ed25519_test_public_key_coverage() { const PRIVATE_KEY: &[u8] = include_bytes!("ed25519_test_private_key.p8"); const PUBLIC_KEY: &[u8] = include_bytes!("ed25519_test_public_key.der"); - const PUBLIC_KEY_DEBUG: &'static str = + const PUBLIC_KEY_DEBUG: &str = "PublicKey(\"5809e9fef6dcec58f0f2e3b0d67e9880a11957e083ace85835c3b6c8fbaf6b7d\")"; let key_pair = signature::Ed25519KeyPair::from_pkcs8(PRIVATE_KEY).unwrap(); @@ -164,7 +143,11 @@ fn ed25519_test_public_key_coverage() { assert_eq!(key_pair.public_key().as_ref(), PUBLIC_KEY); // Test `Clone`. - let _ = key_pair.public_key().clone(); + #[allow(clippy::clone_on_copy)] + let _: <Ed25519KeyPair as KeyPair>::PublicKey = key_pair.public_key().clone(); + + // Test `Copy`. + let _: <Ed25519KeyPair as KeyPair>::PublicKey = *key_pair.public_key(); // Test `Debug`. assert_eq!(PUBLIC_KEY_DEBUG, format!("{:?}", key_pair.public_key())); diff --git a/tests/hkdf_tests.rs b/tests/hkdf_tests.rs index e17968c456..88435a845e 100644 --- a/tests/hkdf_tests.rs +++ b/tests/hkdf_tests.rs @@ -12,23 +12,6 @@ // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
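Both `from_pkcs8` tests above reconcile an actual `Result` with an optional expected error from the test file using the same four-arm match. Reduced to a sketch:

```rust
fn check_case<T>(result: Result<T, String>, expected_err: Option<&str>) {
    match (result, expected_err) {
        (Ok(_), None) => (),
        (Err(e), None) => panic!("Failed with error \"{}\", but expected to succeed", e),
        (Ok(_), Some(e)) => panic!("Succeeded, but expected error \"{}\"", e),
        (Err(actual), Some(expected)) => assert_eq!(actual, expected),
    }
}

fn main() {
    check_case(Ok(42), None);
    check_case(Err::<u32, _>("bad key".to_string()), Some("bad key"));
}
```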
-#![forbid( - anonymous_parameters, - box_pointers, - missing_copy_implementations, - missing_debug_implementations, - missing_docs, - trivial_casts, - trivial_numeric_casts, - unsafe_code, - unstable_features, - unused_extern_crates, - unused_import_braces, - unused_results, - variant_size_differences, - warnings -)] - use ring::{digest, error, hkdf, test, test_file}; #[cfg(target_arch = "wasm32")] diff --git a/tests/hmac_tests.rs b/tests/hmac_tests.rs index 9e01714eb6..486a90a530 100644 --- a/tests/hmac_tests.rs +++ b/tests/hmac_tests.rs @@ -12,24 +12,6 @@ // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. -#![forbid( - anonymous_parameters, - box_pointers, - missing_copy_implementations, - missing_debug_implementations, - missing_docs, - trivial_casts, - trivial_numeric_casts, - unsafe_code, - unstable_features, - unused_extern_crates, - unused_import_braces, - unused_qualifications, - unused_results, - variant_size_differences, - warnings -)] - use ring::{digest, error, hmac, test, test_file}; #[cfg(target_arch = "wasm32")] diff --git a/tests/pbkdf2_tests.rs b/tests/pbkdf2_tests.rs index 0b0cf94b3b..13300fa46e 100644 --- a/tests/pbkdf2_tests.rs +++ b/tests/pbkdf2_tests.rs @@ -12,24 +12,6 @@ // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. -#![forbid( - anonymous_parameters, - box_pointers, - missing_copy_implementations, - missing_debug_implementations, - missing_docs, - trivial_casts, - trivial_numeric_casts, - unsafe_code, - unstable_features, - unused_extern_crates, - unused_import_braces, - unused_qualifications, - unused_results, - variant_size_differences, - warnings -)] - use core::num::NonZeroU32; use ring::{digest, error, pbkdf2, test, test_file}; diff --git a/tests/quic_tests.rs b/tests/quic_tests.rs index 472938f87d..545d7a76fb 100644 --- a/tests/quic_tests.rs +++ b/tests/quic_tests.rs @@ -12,24 +12,6 @@ // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. -#![forbid( - anonymous_parameters, - box_pointers, - missing_copy_implementations, - missing_debug_implementations, - missing_docs, - trivial_casts, - trivial_numeric_casts, - unsafe_code, - unstable_features, - unused_extern_crates, - unused_import_braces, - unused_qualifications, - unused_results, - variant_size_differences, - warnings -)] - use ring::{aead::quic, test, test_file}; #[test] @@ -64,6 +46,7 @@ fn test_quic(alg: &'static quic::Algorithm, test_file: test::File) { }); } +#[allow(clippy::range_plus_one)] fn test_sample_len(alg: &'static quic::Algorithm) { let key_len = alg.key_len(); let key_data = vec![0u8; key_len]; diff --git a/tests/rsa_tests.rs b/tests/rsa_tests.rs index 03e062e29f..2b29b26150 100644 --- a/tests/rsa_tests.rs +++ b/tests/rsa_tests.rs @@ -12,24 +12,6 @@ // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
-#![forbid( - anonymous_parameters, - box_pointers, - missing_copy_implementations, - missing_debug_implementations, - missing_docs, - trivial_casts, - trivial_numeric_casts, - unsafe_code, - unstable_features, - unused_extern_crates, - unused_import_braces, - unused_qualifications, - unused_results, - variant_size_differences, - warnings -)] - #[cfg(feature = "alloc")] use ring::{ error, diff --git a/util/ar/ar.go b/util/ar/ar.go deleted file mode 100644 index 756caf53d8..0000000000 --- a/util/ar/ar.go +++ /dev/null @@ -1,154 +0,0 @@ -// Copyright (c) 2017, Google Inc. -// -// Permission to use, copy, modify, and/or distribute this software for any -// purpose with or without fee is hereby granted, provided that the above -// copyright notice and this permission notice appear in all copies. -// -// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES -// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF -// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY -// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES -// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION -// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN -// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ - -// ar.go contains functions for parsing .a archive files. - -package ar - -import ( - "bytes" - "errors" - "fmt" - "io" - "strconv" - "strings" -) - -// ParseAR parses an archive file from r and returns a map from filename to -// contents, or else an error. -func ParseAR(r io.Reader) (map[string][]byte, error) { - // See https://en.wikipedia.org/wiki/Ar_(Unix)#File_format_details - const expectedMagic = "!<arch>\n" - var magic [len(expectedMagic)]byte - if _, err := io.ReadFull(r, magic[:]); err != nil { - return nil, err - } - if string(magic[:]) != expectedMagic { - return nil, errors.New("ar: not an archive file") - } - - const filenameTableName = "//" - const symbolTableName = "/" - var longFilenameTable []byte - ret := make(map[string][]byte) - - for { - var header [60]byte - if _, err := io.ReadFull(r, header[:]); err != nil { - if err == io.EOF { - break - } - return nil, errors.New("ar: error reading file header: " + err.Error()) - } - - name := strings.TrimRight(string(header[:16]), " ") - sizeStr := strings.TrimRight(string(header[48:58]), "\x00 ") - size, err := strconv.ParseUint(sizeStr, 10, 64) - if err != nil { - return nil, errors.New("ar: failed to parse file size: " + err.Error()) - } - - // File contents are padded to a multiple of two bytes - storedSize := size - if storedSize%2 == 1 { - storedSize++ - } - - contents := make([]byte, storedSize) - if _, err := io.ReadFull(r, contents); err != nil { - return nil, errors.New("ar: error reading file contents: " + err.Error()) - } - contents = contents[:size] - - switch { - case name == filenameTableName: - if longFilenameTable != nil { - return nil, errors.New("ar: two filename tables found") - } - longFilenameTable = contents - continue - - case name == symbolTableName: - continue - - case len(name) > 1 && name[0] == '/': - if longFilenameTable == nil { - return nil, errors.New("ar: long filename reference found before filename table") - } - - // A long filename is stored as "/" followed by a - // base-10 offset in the filename table.
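The deleted `ParseAR` above reads fixed 60-byte member headers: bytes 0-15 hold the space-padded name, bytes 48-57 a decimal size, and member contents are padded to an even length. Just that header step, as a Rust sketch of the Go code:

```rust
// Parse one 60-byte ar member header: space-padded name, then a
// decimal size field at bytes 48..58.
fn parse_member_header(header: &[u8; 60]) -> Result<(String, u64), String> {
    let name = std::str::from_utf8(&header[..16])
        .map_err(|e| e.to_string())?
        .trim_end_matches(' ')
        .to_owned();
    let size = std::str::from_utf8(&header[48..58])
        .map_err(|e| e.to_string())?
        .trim_end_matches(|c| c == ' ' || c == '\0')
        .parse::<u64>()
        .map_err(|e| e.to_string())?;
    Ok((name, size))
}

// Contents are padded to a multiple of two bytes; the padding byte is
// read from the stream but is not part of the member.
fn stored_size(size: u64) -> u64 {
    size + (size % 2)
}
```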
- offset, err := strconv.ParseUint(name[1:], 10, 64) - if err != nil { - return nil, errors.New("ar: failed to parse filename offset: " + err.Error()) - } - if offset > uint64((^uint(0))>>1) { - return nil, errors.New("ar: filename offset overflow") - } - - if int(offset) > len(longFilenameTable) { - return nil, errors.New("ar: filename offset out of bounds") - } - - filename := longFilenameTable[offset:] - // Windows terminates filenames with NUL characters, - // while sysv/GNU uses /. - if i := bytes.IndexAny(filename, "/\x00"); i < 0 { - return nil, errors.New("ar: unterminated filename in table") - } else { - filename = filename[:i] - } - - name = string(filename) - - default: - name = strings.TrimRight(name, "/") - } - - // Post-processing for BSD: - // https://en.wikipedia.org/wiki/Ar_(Unix)#BSD_variant - // - // If the name is of the form #1/XXX, XXX identifies the length of the - // name, and the name itself is stored as a prefix of the data, possibly - // null-padded. - - var namelen uint - n, err := fmt.Sscanf(name, "#1/%d", &namelen) - if err == nil && n == 1 && len(contents) >= int(namelen) { - name = string(contents[:namelen]) - contents = contents[namelen:] - - // Names can be null padded; find the first null (if any). Note that - // this also handles the case of a null followed by non-null - // characters. It's not clear whether those can ever show up in - // practice, but we might as well handle them in case they can show - // up. - var null int - for ; null < len(name); null++ { - if name[null] == 0 { - break - } - } - name = name[:null] - } - - if name == "__.SYMDEF" || name == "__.SYMDEF SORTED" { - continue - } - - ret[name] = contents - } - - return ret, nil -} diff --git a/util/ar/ar_test.go b/util/ar/ar_test.go deleted file mode 100644 index ef37d795d2..0000000000 --- a/util/ar/ar_test.go +++ /dev/null @@ -1,118 +0,0 @@ -// Copyright (c) 2018, Google Inc. -// -// Permission to use, copy, modify, and/or distribute this software for any -// purpose with or without fee is hereby granted, provided that the above -// copyright notice and this permission notice appear in all copies. -// -// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES -// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF -// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY -// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES -// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION -// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN -// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ - -package ar - -import ( - "bytes" - "flag" - "io/ioutil" - "os" - "path/filepath" - "testing" -) - -var testDataDir = flag.String("testdata", "testdata", "The path to the test data directory.") - -type arTest struct { - name string - in string - out map[string]string - // allowPadding is true if the contents may have trailing newlines at end. - // On macOS, ar calls ranlib which pads all inputs up to eight bytes with - // newlines. Unlike ar's native padding up to two bytes, this padding is - // included in the size field, so it is not removed when decoding. 
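The rest of the parser resolves the two extended-name schemes: GNU names of the form `/<offset>` index into the `//` filename table, whose entries end with `/` (sysv/GNU) or NUL (some Windows tools), while BSD names of the form `#1/<len>` store the real, possibly NUL-padded name as a prefix of the member data. Both lookups as a sketch:

```rust
// GNU: "/<decimal offset>" into the "//" table.
fn resolve_gnu_long_name(name: &str, table: &[u8]) -> Option<String> {
    let offset: usize = name.strip_prefix('/')?.parse().ok()?;
    let rest = table.get(offset..)?;
    let end = rest.iter().position(|&b| b == b'/' || b == 0)?;
    String::from_utf8(rest[..end].to_vec()).ok()
}

// BSD: "#1/<len>"; returns the name and how many data bytes it consumed.
fn resolve_bsd_name(name: &str, contents: &[u8]) -> Option<(String, usize)> {
    let len: usize = name.strip_prefix("#1/")?.parse().ok()?;
    let raw = contents.get(..len)?;
    // The stored name may be NUL-padded; keep only up to the first NUL.
    let end = raw.iter().position(|&b| b == 0).unwrap_or(len);
    Some((String::from_utf8(raw[..end].to_vec()).ok()?, len))
}
```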
- allowPadding bool } - -func (test *arTest) Path(file string) string { - return filepath.Join(*testDataDir, test.name, file) -} - -func removeTrailingNewlines(in []byte) []byte { - for len(in) > 0 && in[len(in)-1] == '\n' { - in = in[:len(in)-1] - } - return in -} - -var arTests = []arTest{ - { - "linux", - "libsample.a", - map[string]string{ - "foo.c.o": "foo.c.o", - "bar.cc.o": "bar.cc.o", - }, - false, - }, - { - "mac", - "libsample.a", - map[string]string{ - "foo.c.o": "foo.c.o", - "bar.cc.o": "bar.cc.o", - }, - true, - }, - { - "windows", - "sample.lib", - map[string]string{ - "CMakeFiles\\sample.dir\\foo.c.obj": "foo.c.obj", - "CMakeFiles\\sample.dir\\bar.cc.obj": "bar.cc.obj", - }, - false, - }, -} - -func TestAR(t *testing.T) { - for _, test := range arTests { - t.Run(test.name, func(t *testing.T) { - in, err := os.Open(test.Path(test.in)) - if err != nil { - t.Fatalf("opening input failed: %s", err) - } - defer in.Close() - - ret, err := ParseAR(in) - if err != nil { - t.Fatalf("reading input failed: %s", err) - } - - for file, contentsPath := range test.out { - expected, err := ioutil.ReadFile(test.Path(contentsPath)) - if err != nil { - t.Fatalf("error reading %s: %s", contentsPath, err) - } - got, ok := ret[file] - if test.allowPadding { - got = removeTrailingNewlines(got) - expected = removeTrailingNewlines(expected) - } - if !ok { - t.Errorf("file %s missing from output", file) - } else if !bytes.Equal(got, expected) { - t.Errorf("contents for file %s did not match", file) - } - } - - for file, _ := range ret { - if _, ok := test.out[file]; !ok { - t.Errorf("output contained unexpected file %q", file) - } - } - }) - } -} diff --git a/util/ar/testdata/linux/bar.cc.o b/util/ar/testdata/linux/bar.cc.o deleted file mode 100644 index 92e83a9a11..0000000000 Binary files a/util/ar/testdata/linux/bar.cc.o and /dev/null differ diff --git a/util/ar/testdata/linux/foo.c.o b/util/ar/testdata/linux/foo.c.o deleted file mode 100644 index 6423c1d49b..0000000000 Binary files a/util/ar/testdata/linux/foo.c.o and /dev/null differ diff --git a/util/ar/testdata/linux/libsample.a b/util/ar/testdata/linux/libsample.a deleted file mode 100644 index cae6ae70c9..0000000000 Binary files a/util/ar/testdata/linux/libsample.a and /dev/null differ diff --git a/util/ar/testdata/mac/bar.cc.o b/util/ar/testdata/mac/bar.cc.o deleted file mode 100644 index 9c60798532..0000000000 Binary files a/util/ar/testdata/mac/bar.cc.o and /dev/null differ diff --git a/util/ar/testdata/mac/foo.c.o b/util/ar/testdata/mac/foo.c.o deleted file mode 100644 index 0f96a0a018..0000000000 Binary files a/util/ar/testdata/mac/foo.c.o and /dev/null differ diff --git a/util/ar/testdata/mac/libsample.a b/util/ar/testdata/mac/libsample.a deleted file mode 100644 index b7d8eb5ce0..0000000000 Binary files a/util/ar/testdata/mac/libsample.a and /dev/null differ diff --git a/util/ar/testdata/sample/CMakeLists.txt b/util/ar/testdata/sample/CMakeLists.txt deleted file mode 100644 index 9ea2fe8ee1..0000000000 --- a/util/ar/testdata/sample/CMakeLists.txt +++ /dev/null @@ -1,3 +0,0 @@ -cmake_minimum_required(VERSION 3.0) -project(Sample) -add_library(sample STATIC foo.c bar.cc) diff --git a/util/ar/testdata/sample/bar.cc b/util/ar/testdata/sample/bar.cc deleted file mode 100644 index a0ac7e14ab..0000000000 --- a/util/ar/testdata/sample/bar.cc +++ /dev/null @@ -1,15 +0,0 @@ -extern "C" { -void foo(); -void bar() {} -} - -namespace bar_namespace { - -void SomeExternalFunction(); - -void SomeFunction() { - foo(); - SomeExternalFunction(); -} - -} //
namespace bar_namespace diff --git a/util/ar/testdata/sample/foo.c b/util/ar/testdata/sample/foo.c deleted file mode 100644 index fed596cbe2..0000000000 --- a/util/ar/testdata/sample/foo.c +++ /dev/null @@ -1,7 +0,0 @@ -extern void external_symbol(void); -extern void bar(void); - -void foo(void) { - external_symbol(); - bar(); -} diff --git a/util/ar/testdata/windows/bar.cc.obj b/util/ar/testdata/windows/bar.cc.obj deleted file mode 100644 index 4a315cdd6b..0000000000 Binary files a/util/ar/testdata/windows/bar.cc.obj and /dev/null differ diff --git a/util/ar/testdata/windows/foo.c.obj b/util/ar/testdata/windows/foo.c.obj deleted file mode 100644 index 9b4aad7a42..0000000000 Binary files a/util/ar/testdata/windows/foo.c.obj and /dev/null differ diff --git a/util/ar/testdata/windows/sample.lib b/util/ar/testdata/windows/sample.lib deleted file mode 100644 index efeebb24e5..0000000000 Binary files a/util/ar/testdata/windows/sample.lib and /dev/null differ diff --git a/util/generate-asm-lcov.py b/util/generate-asm-lcov.py deleted file mode 100755 index 257ae841c3..0000000000 --- a/util/generate-asm-lcov.py +++ /dev/null @@ -1,152 +0,0 @@ -#!/usr/bin/python -# Copyright (c) 2016, Google Inc. -# -# Permission to use, copy, modify, and/or distribute this software for any -# purpose with or without fee is hereby granted, provided that the above -# copyright notice and this permission notice appear in all copies. -# -# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES -# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF -# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY -# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES -# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION -# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN -# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. -import os -import os.path -import subprocess -import sys - -# The LCOV output format for each source file is: -# -# SF:<filename> -# DA:<line number>,<execution count> -# ... -# end_of_record -# -# The <execution count> can either be 0 for an unexecuted instruction or a -# value representing the number of executions. The DA line should be omitted -# for lines not representing an instruction. - -SECTION_SEPERATOR = '-' * 80 - -def is_asm(l): - """Returns whether a line should be considered to be an instruction.""" - l = l.strip() - # Empty lines - if l == '': - return False - # Comments - if l.startswith('#'): - return False - # Assembly Macros - if l.startswith('.'): - return False - # Label - if l.endswith(':'): - return False - return True - -def merge(callgrind_files, srcs): - """Calls callgrind_annotate over the set of callgrind output - |callgrind_files| using the sources |srcs| and merges the results - together.""" - out = '' - for file in callgrind_files: - data = subprocess.check_output(['callgrind_annotate', file] + srcs) - out += '%s\n%s\n' % (data, SECTION_SEPERATOR) - return out - -def parse(filename, data, current): - """Parses an annotated execution flow |data| from callgrind_annotate for - source |filename| and updates the current execution counts from |current|.""" - with open(filename) as f: - source = f.read().split('\n') - - out = current - if out == None: - out = [0 if is_asm(l) else None for l in source] - - # Lines are of the following formats: - # -- line: Indicates that analysis continues from a different place. - # Ir : Indicates the start of a file. - # => : Indicates a call/jump in the control flow.
- # <count> <code>: Indicates that the line has been executed that many times. - line = None - for l in data: - l = l.strip() + ' ' - if l.startswith('-- line'): - line = int(l.split(' ')[2]) - 1 - elif l.strip() == 'Ir': - line = 0 - elif line != None and l.strip() and '=>' not in l and 'unidentified lines' not in l: - count = l.split(' ')[0].replace(',', '').replace('.', '0') - instruction = l.split(' ', 1)[1].strip() - if count != '0' or is_asm(instruction): - if out[line] == None: - out[line] = 0 - out[line] += int(count) - line += 1 - - return out - - -def generate(data): - """Parses the merged callgrind_annotate output |data| and generates execution - counts for all annotated files.""" - out = {} - data = [p.strip() for p in data.split(SECTION_SEPERATOR)] - - - # Most sections are ignored, but a section with: - # User-annotated source: <file> - # precedes a listing of execution count for that <file>. - for i in range(len(data)): - if 'User-annotated source' in data[i] and i < len(data) - 1: - filename = data[i].split(':', 1)[1].strip() - res = data[i + 1] - if filename not in out: - out[filename] = None - if 'No information' in res: - res = [] - else: - res = res.split('\n') - out[filename] = parse(filename, res, out[filename]) - return out - -def output(data): - """Takes a dictionary |data| of filenames and execution counts and generates - a LCOV coverage output.""" - out = '' - for filename, counts in data.iteritems(): - out += 'SF:%s\n' % (os.path.abspath(filename)) - for line, count in enumerate(counts): - if count != None: - out += 'DA:%d,%s\n' % (line + 1, count) - out += 'end_of_record\n' - return out - -if __name__ == '__main__': - if len(sys.argv) != 3: - print '%s <callgrind-folder> <build-folder>' % (__file__) - sys.exit() - - cg_folder = sys.argv[1] - build_folder = sys.argv[2] - - cg_files = [] - for (cwd, _, files) in os.walk(cg_folder): - for f in files: - if f.startswith('callgrind.out'): - cg_files.append(os.path.abspath(os.path.join(cwd, f))) - - srcs = [] - for (cwd, _, files) in os.walk(build_folder): - for f in files: - fn = os.path.join(cwd, f) - if fn.endswith('.S'): - srcs.append(fn) - - annotated = merge(cg_files, srcs) - lcov = generate(annotated) - print output(lcov) diff --git a/util/generate-coverage.sh b/util/generate-coverage.sh deleted file mode 100755 index 2fbe6b8378..0000000000 --- a/util/generate-coverage.sh +++ /dev/null @@ -1,60 +0,0 @@ -#!/bin/sh -# Copyright (c) 2016, Google Inc. -# -# Permission to use, copy, modify, and/or distribute this software for any -# purpose with or without fee is hereby granted, provided that the above -# copyright notice and this permission notice appear in all copies. -# -# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES -# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF -# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY -# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES -# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION -# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN -# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
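`output()` above is the whole LCOV emission step: one `SF:` line per file, a `DA:<line>,<count>` line per instruction, and `end_of_record` to close. The same logic as a Rust sketch:

```rust
// Render one file's per-line counts as an LCOV record. `None` marks a
// line that is not an instruction, so no DA: line is emitted for it.
fn to_lcov(filename: &str, counts: &[Option<u64>]) -> String {
    let mut out = format!("SF:{}\n", filename);
    for (idx, count) in counts.iter().enumerate() {
        if let Some(c) = count {
            out.push_str(&format!("DA:{},{}\n", idx + 1, c));
        }
    }
    out.push_str("end_of_record\n");
    out
}
```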
- -set -xe - -SRC=$PWD - -BUILD=$(mktemp -d '/tmp/boringssl.XXXXXX') -BUILD_SRC=$(mktemp -d '/tmp/boringssl-src.XXXXXX') -LCOV=$(mktemp -d '/tmp/boringssl-lcov.XXXXXX') - -if [ -n "$1" ]; then - LCOV=$(readlink -f "$1") - mkdir -p "$LCOV" -fi - -cd "$BUILD" -cmake "$SRC" -GNinja -DCMAKE_C_FLAGS='-fprofile-arcs -ftest-coverage' \ - -DCMAKE_CXX_FLAGS='-fprofile-arcs -ftest-coverage' -DCMAKE_ASM_FLAGS='-Wa,-g' -ninja - -cp -r "$SRC/crypto" "$SRC/decrepit" "$SRC/include" "$SRC/ssl" "$SRC/tool" \ - "$BUILD_SRC" -cp -r "$BUILD"/* "$BUILD_SRC" -mkdir "$BUILD/callgrind/" - -cd "$SRC" -go run "$SRC/util/all_tests.go" -build-dir "$BUILD" -callgrind -num-workers 16 -util/generate-asm-lcov.py "$BUILD/callgrind" "$BUILD" > "$BUILD/asm.info" - -go run "util/all_tests.go" -build-dir "$BUILD" - -cd "$SRC/ssl/test/runner" -go test -shim-path "$BUILD/ssl/test/bssl_shim" -num-workers 1 - -cd "$LCOV" -lcov -c -d "$BUILD" -b "$BUILD" -o "$BUILD/lcov.info" -lcov -r "$BUILD/lcov.info" "*_test.c" -o "$BUILD/lcov-1.info" -lcov -r "$BUILD/lcov-1.info" "*_test.cc" -o "$BUILD/lcov-2.info" -cat "$BUILD/lcov-2.info" "$BUILD/asm.info" > "$BUILD/final.info" -sed -i "s;$BUILD;$BUILD_SRC;g" "$BUILD/final.info" -sed -i "s;$SRC;$BUILD_SRC;g" "$BUILD/final.info" -genhtml -p "$BUILD_SRC" "$BUILD/final.info" - -rm -rf "$BUILD" -rm -rf "$BUILD_SRC" - -xdg-open index.html diff --git a/util/make_prefix_headers.go b/util/make_prefix_headers.go deleted file mode 100644 index b536f14cea..0000000000 --- a/util/make_prefix_headers.go +++ /dev/null @@ -1,232 +0,0 @@ -// Copyright (c) 2018, Google Inc. -// -// Permission to use, copy, modify, and/or distribute this software for any -// purpose with or without fee is hereby granted, provided that the above -// copyright notice and this permission notice appear in all copies. -// -// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES -// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF -// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY -// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES -// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION -// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN -// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - -// This program takes a file containing newline-separated symbols, and generates -// boringssl_prefix_symbols.h, boringssl_prefix_symbols_asm.h, and -// boringssl_prefix_symbols_nasm.inc. These header files can be used to build -// BoringSSL with a prefix for all symbols in order to avoid symbol name -// conflicts when linking a project with multiple copies of BoringSSL; see -// BUILDING.md for more details. - -// TODO(joshlf): For platforms which support it, use '#pragma redefine_extname' -// instead of a custom macro. This avoids the need for a custom macro, but also -// ensures that our renaming won't conflict with symbols defined and used by our -// consumers (the "HMAC" problem). An example of this approach can be seen in -// IllumOS' fork of OpenSSL: -// https://github.com/joyent/illumos-extra/blob/master/openssl1x/sunw_prefix.h - -package main - -import ( - "bufio" - "flag" - "fmt" - "os" - "path/filepath" - "strings" -) - -var out = flag.String("out", ".", "Path to a directory where the outputs will be written") - -// Read newline-separated symbols from a file, ignoring any comments started -// with '#'. 
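`readSymbols` at the end of the chunk above defines the generator's input format: one symbol per line, `#` starts a comment, blank lines are ignored. Equivalent parsing, sketched:

```rust
fn read_symbols(text: &str) -> Vec<String> {
    text.lines()
        // Drop anything after a '#' comment marker.
        .map(|line| line.split('#').next().unwrap_or(""))
        .map(str::trim)
        .filter(|line| !line.is_empty())
        .map(str::to_owned)
        .collect()
}
```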
-func readSymbols(path string) ([]string, error) { - f, err := os.Open(path) - if err != nil { - return nil, err - } - defer f.Close() - scanner := bufio.NewScanner(f) - var ret []string - for scanner.Scan() { - line := scanner.Text() - if idx := strings.IndexByte(line, '#'); idx >= 0 { - line = line[:idx] - } - line = strings.TrimSpace(line) - if len(line) == 0 { - continue - } - ret = append(ret, line) - } - if err := scanner.Err(); err != nil { - return nil, err - } - return ret, nil -} - -func writeCHeader(symbols []string, path string) error { - f, err := os.Create(path) - if err != nil { - return err - } - defer f.Close() - - if _, err := f.WriteString(`// Copyright (c) 2018, Google Inc. -// -// Permission to use, copy, modify, and/or distribute this software for any -// purpose with or without fee is hereby granted, provided that the above -// copyright notice and this permission notice appear in all copies. -// -// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES -// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF -// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY -// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES -// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION -// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN -// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - -// BORINGSSL_ADD_PREFIX pastes two identifiers into one. It performs one -// iteration of macro expansion on its arguments before pasting. -#define BORINGSSL_ADD_PREFIX(a, b) BORINGSSL_ADD_PREFIX_INNER(a, b) -#define BORINGSSL_ADD_PREFIX_INNER(a, b) a ## _ ## b - -`); err != nil { - return err - } - - for _, symbol := range symbols { - if _, err := fmt.Fprintf(f, "#define %s BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, %s)\n", symbol, symbol); err != nil { - return err - } - } - - return nil -} - -func writeASMHeader(symbols []string, path string) error { - f, err := os.Create(path) - if err != nil { - return err - } - defer f.Close() - - if _, err := f.WriteString(`// Copyright (c) 2018, Google Inc. -// -// Permission to use, copy, modify, and/or distribute this software for any -// purpose with or without fee is hereby granted, provided that the above -// copyright notice and this permission notice appear in all copies. -// -// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES -// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF -// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY -// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES -// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION -// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN -// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - -#if !defined(__APPLE__) -#include <boringssl_prefix_symbols.h> -#else -// On iOS and macOS, we need to treat assembly symbols differently from other -// symbols. The linker expects symbols to be prefixed with an underscore. -// Perlasm thus generates symbols with this underscore applied. Our macros must, -// in turn, incorporate it.
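The generated C header above needs the two-level `BORINGSSL_ADD_PREFIX`/`BORINGSSL_ADD_PREFIX_INNER` indirection because `##` pastes its operands before expanding them: pasting `BORINGSSL_PREFIX ## _ ## foo` directly would embed the literal token `BORINGSSL_PREFIX` in the result, while the outer macro forces one round of expansion first. The per-symbol emission loop, sketched in Rust rather than Go:

```rust
// Emit one #define per symbol, routing through the two-level paste macro
// so BORINGSSL_PREFIX is expanded before ## runs.
fn emit_defines(symbols: &[&str]) -> String {
    let mut out = String::new();
    for sym in symbols {
        out.push_str(&format!(
            "#define {sym} BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, {sym})\n"
        ));
    }
    out
}

fn main() {
    print!("{}", emit_defines(&["RSA_sign", "SHA256_Init"]));
}
```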
-#define BORINGSSL_ADD_PREFIX_MAC_ASM(a, b) BORINGSSL_ADD_PREFIX_INNER_MAC_ASM(a, b) -#define BORINGSSL_ADD_PREFIX_INNER_MAC_ASM(a, b) _ ## a ## _ ## b - -`); err != nil { - return err - } - - for _, symbol := range symbols { - if _, err := fmt.Fprintf(f, "#define _%s BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, %s)\n", symbol, symbol); err != nil { - return err - } - } - - _, err = fmt.Fprintf(f, "#endif\n") - return nil -} - -func writeNASMHeader(symbols []string, path string) error { - f, err := os.Create(path) - if err != nil { - return err - } - defer f.Close() - - // NASM uses a different syntax from the C preprocessor. - if _, err := f.WriteString(`; Copyright (c) 2018, Google Inc. -; -; Permission to use, copy, modify, and/or distribute this software for any -; purpose with or without fee is hereby granted, provided that the above -; copyright notice and this permission notice appear in all copies. -; -; THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES -; WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF -; MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY -; SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES -; WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION -; OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN -; CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - -; 32-bit Windows adds underscores to C functions, while 64-bit Windows does not. -%ifidn __OUTPUT_FORMAT__, win32 -`); err != nil { - return err - } - - for _, symbol := range symbols { - if _, err := fmt.Fprintf(f, "%%xdefine _%s _ %%+ BORINGSSL_PREFIX %%+ _%s\n", symbol, symbol); err != nil { - return err - } - } - - if _, err := fmt.Fprintf(f, "%%else\n"); err != nil { - return err - } - - for _, symbol := range symbols { - if _, err := fmt.Fprintf(f, "%%xdefine %s BORINGSSL_PREFIX %%+ _%s\n", symbol, symbol); err != nil { - return err - } - } - - if _, err := fmt.Fprintf(f, "%%endif\n"); err != nil { - return err - } - - return nil -} - -func main() { - flag.Parse() - if flag.NArg() != 1 { - fmt.Fprintf(os.Stderr, "Usage: %s [-out OUT] SYMBOLS\n", os.Args[0]) - os.Exit(1) - } - - symbols, err := readSymbols(flag.Arg(0)) - if err != nil { - fmt.Fprintf(os.Stderr, "Error reading symbols: %s\n", err) - os.Exit(1) - } - - if err := writeCHeader(symbols, filepath.Join(*out, "boringssl_prefix_symbols.h")); err != nil { - fmt.Fprintf(os.Stderr, "Error writing boringssl_prefix_symbols.h: %s\n", err) - os.Exit(1) - } - - if err := writeASMHeader(symbols, filepath.Join(*out, "boringssl_prefix_symbols_asm.h")); err != nil { - fmt.Fprintf(os.Stderr, "Error writing boringssl_prefix_symbols_asm.h: %s\n", err) - os.Exit(1) - } - - if err := writeNASMHeader(symbols, filepath.Join(*out, "boringssl_prefix_symbols_nasm.inc")); err != nil { - fmt.Fprintf(os.Stderr, "Error writing boringssl_prefix_symbols_nasm.inc: %s\n", err) - os.Exit(1) - } - -} diff --git a/util/read_symbols.go b/util/read_symbols.go deleted file mode 100644 index 791ea5d126..0000000000 --- a/util/read_symbols.go +++ /dev/null @@ -1,262 +0,0 @@ -// Copyright (c) 2018, Google Inc. -// -// Permission to use, copy, modify, and/or distribute this software for any -// purpose with or without fee is hereby granted, provided that the above -// copyright notice and this permission notice appear in all copies. 
-// -// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES -// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF -// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY -// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES -// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION -// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN -// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - -// read_symbols scans one or more .a files and, for each object contained in -// the .a files, reads the list of symbols in that object file. -package main - -import ( - "bytes" - "debug/elf" - "debug/macho" - "debug/pe" - "flag" - "fmt" - "os" - "runtime" - "sort" - "strings" - - "boringssl.googlesource.com/boringssl/util/ar" -) - -const ( - ObjFileFormatELF = "elf" - ObjFileFormatMachO = "macho" - ObjFileFormatPE = "pe" -) - -var ( - outFlag = flag.String("out", "-", "File to write output symbols") - objFileFormat = flag.String("obj-file-format", defaultObjFileFormat(runtime.GOOS), "Object file format to expect (options are elf, macho, pe)") -) - -func defaultObjFileFormat(goos string) string { - switch goos { - case "linux": - return ObjFileFormatELF - case "darwin": - return ObjFileFormatMachO - case "windows": - return ObjFileFormatPE - default: - // By returning a value here rather than panicking, the user can still - // cross-compile from an unsupported platform to a supported platform by - // overriding this default with a flag. If the user doesn't provide the - // flag, we will panic during flag parsing. - return "unsupported" - } -} - -func printAndExit(format string, args ...interface{}) { - s := fmt.Sprintf(format, args...) - fmt.Fprintln(os.Stderr, s) - os.Exit(1) -} - -func main() { - flag.Parse() - if flag.NArg() < 1 { - printAndExit("Usage: %s [-out OUT] [-obj-file-format FORMAT] ARCHIVE_FILE [ARCHIVE_FILE [...]]", os.Args[0]) - } - archiveFiles := flag.Args() - - out := os.Stdout - if *outFlag != "-" { - var err error - out, err = os.Create(*outFlag) - if err != nil { - printAndExit("Error opening %q: %s", *outFlag, err) - } - defer out.Close() - } - - var symbols []string - // Only add first instance of any symbol; keep track of them in this map. - added := make(map[string]struct{}) - for _, archive := range archiveFiles { - f, err := os.Open(archive) - if err != nil { - printAndExit("Error opening %s: %s", archive, err) - } - objectFiles, err := ar.ParseAR(f) - f.Close() - if err != nil { - printAndExit("Error parsing %s: %s", archive, err) - } - - for name, contents := range objectFiles { - syms, err := listSymbols(contents) - if err != nil { - printAndExit("Error listing symbols from %q in %q: %s", name, archive, err) - } - for _, s := range syms { - if _, ok := added[s]; !ok { - added[s] = struct{}{} - symbols = append(symbols, s) - } - } - } - } - - sort.Strings(symbols) - for _, s := range symbols { - var skipSymbols = []string{ - // Inline functions, etc., from the compiler or language - // runtime will naturally end up in the library, to be - // deduplicated against other object files. Such symbols - // should not be prefixed. It is a limitation of this - // symbol-prefixing strategy that we cannot distinguish - // our own inline symbols (which should be prefixed) - // from the system's (which should not), so we blacklist - // known system symbols. 
- "__local_stdio_printf_options", - "__local_stdio_scanf_options", - "_vscprintf", - "_vscprintf_l", - "_vsscanf_l", - "_xmm", - "sscanf", - "vsnprintf", - // sdallocx is a weak symbol and intended to merge with - // the real one, if present. - "sdallocx", - } - var skip bool - for _, sym := range skipSymbols { - if sym == s { - skip = true - break - } - } - if skip || isCXXSymbol(s) || strings.HasPrefix(s, "__real@") || strings.HasPrefix(s, "__x86.get_pc_thunk.") { - continue - } - if _, err := fmt.Fprintln(out, s); err != nil { - printAndExit("Error writing to %s: %s", *outFlag, err) - } - } -} - -func isCXXSymbol(s string) bool { - if *objFileFormat == ObjFileFormatPE { - return strings.HasPrefix(s, "?") - } - return strings.HasPrefix(s, "_Z") -} - -// listSymbols lists the exported symbols from an object file. -func listSymbols(contents []byte) ([]string, error) { - switch *objFileFormat { - case ObjFileFormatELF: - return listSymbolsELF(contents) - case ObjFileFormatMachO: - return listSymbolsMachO(contents) - case ObjFileFormatPE: - return listSymbolsPE(contents) - default: - return nil, fmt.Errorf("unsupported object file format %q", *objFileFormat) - } -} - -func listSymbolsELF(contents []byte) ([]string, error) { - f, err := elf.NewFile(bytes.NewReader(contents)) - if err != nil { - return nil, err - } - syms, err := f.Symbols() - if err != nil { - return nil, err - } - - var names []string - for _, sym := range syms { - // Only include exported, defined symbols - if elf.ST_BIND(sym.Info) != elf.STB_LOCAL && sym.Section != elf.SHN_UNDEF { - names = append(names, sym.Name) - } - } - return names, nil -} - -func listSymbolsMachO(contents []byte) ([]string, error) { - f, err := macho.NewFile(bytes.NewReader(contents)) - if err != nil { - return nil, err - } - if f.Symtab == nil { - return nil, nil - } - var names []string - for _, sym := range f.Symtab.Syms { - // Source: https://opensource.apple.com/source/xnu/xnu-3789.51.2/EXTERNAL_HEADERS/mach-o/nlist.h.auto.html - const ( - N_PEXT uint8 = 0x10 // Private external symbol bit - N_EXT uint8 = 0x01 // External symbol bit, set for external symbols - N_TYPE uint8 = 0x0e // mask for the type bits - - N_UNDF uint8 = 0x0 // undefined, n_sect == NO_SECT - N_ABS uint8 = 0x2 // absolute, n_sect == NO_SECT - N_SECT uint8 = 0xe // defined in section number n_sect - N_PBUD uint8 = 0xc // prebound undefined (defined in a dylib) - N_INDR uint8 = 0xa // indirect - ) - - // Only include exported, defined symbols. - if sym.Type&N_EXT != 0 && sym.Type&N_TYPE != N_UNDF { - if len(sym.Name) == 0 || sym.Name[0] != '_' { - return nil, fmt.Errorf("unexpected symbol without underscore prefix: %q", sym.Name) - } - names = append(names, sym.Name[1:]) - } - } - return names, nil -} - -func listSymbolsPE(contents []byte) ([]string, error) { - f, err := pe.NewFile(bytes.NewReader(contents)) - if err != nil { - return nil, err - } - var ret []string - for _, sym := range f.Symbols { - const ( - // https://docs.microsoft.com/en-us/windows/desktop/debug/pe-format#section-number-values - IMAGE_SYM_UNDEFINED = 0 - // https://docs.microsoft.com/en-us/windows/desktop/debug/pe-format#storage-class - IMAGE_SYM_CLASS_EXTERNAL = 2 - ) - if sym.SectionNumber != IMAGE_SYM_UNDEFINED && sym.StorageClass == IMAGE_SYM_CLASS_EXTERNAL { - name := sym.Name - if f.Machine == pe.IMAGE_FILE_MACHINE_I386 { - // On 32-bit Windows, C symbols are decorated by calling - // convention. 
- // https://msdn.microsoft.com/en-us/library/56h2zst2.aspx#FormatC - if strings.HasPrefix(name, "_") || strings.HasPrefix(name, "@") { - // __cdecl, __stdcall, or __fastcall. Remove the prefix and - // suffix, if present. - name = name[1:] - if idx := strings.LastIndex(name, "@"); idx >= 0 { - name = name[:idx] - } - } else if idx := strings.LastIndex(name, "@@"); idx >= 0 { - // __vectorcall. Remove the suffix. - name = name[:idx] - } - } - ret = append(ret, name) - } - } - return ret, nil -}
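The 32-bit Windows branch above undoes calling-convention decoration: `__cdecl`, `__stdcall`, and `__fastcall` symbols carry a `_` or `@` prefix plus an optional `@<bytes>` suffix, while `__vectorcall` symbols have an `@@<bytes>` suffix and no prefix. The same transformation as a Rust sketch:

```rust
fn undecorate_i386(name: &str) -> &str {
    if let Some(rest) = name.strip_prefix('_').or_else(|| name.strip_prefix('@')) {
        // __cdecl, __stdcall, or __fastcall: drop the prefix and any
        // "@<stack bytes>" suffix.
        match rest.rfind('@') {
            Some(idx) => &rest[..idx],
            None => rest,
        }
    } else if let Some(idx) = name.find("@@") {
        // __vectorcall: drop the "@@<stack bytes>" suffix.
        &name[..idx]
    } else {
        name
    }
}

fn main() {
    assert_eq!(undecorate_i386("_RSA_sign@12"), "RSA_sign");
    assert_eq!(undecorate_i386("vector_fn@@16"), "vector_fn");
}
```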